APE codec: Speed up decoding of -c2000 and higher on ARMv4 and coldfire by fusing...
[kugel-rb.git] / apps / codecs / demac / libdemac / vector_math16_cf.h
/*

libdemac - A Monkey's Audio decoder

$Id$

Copyright (C) Dave Chapman 2007

Coldfire vector math copyright (C) 2007 Jens Arnold

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA

*/

#define FUSED_VECTOR_MATH

#define PREPARE_SCALARPRODUCT coldfire_set_macsr(0); /* signed integer mode */
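/* Usage sketch (hypothetical, for illustration): every routine below relies
 * on the EMAC being in signed integer mode, so a caller would be expected to
 * issue PREPARE_SCALARPRODUCT (i.e. coldfire_set_macsr(0)) once before
 * entering its filter loop and calling scalarproduct(), vector_sp_add() or
 * vector_sp_sub(). */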
/* Calculate scalarproduct, then add a 2nd vector (fused for performance)
 * This version fetches data as 32 bit words, and *recommends* v1 to be
 * 32 bit aligned. It also assumes that f2 and s2 are either both 32 bit
 * aligned or both unaligned. Performance will suffer if either condition
 * isn't met. It also needs EMAC in signed integer mode. */
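/* Rough plain-C sketch of what vector_sp_add() computes (illustrative only,
 * not part of the build; the loop shape is an assumption, the asm below does
 * the same work 16 samples per iteration with the EMAC):
 *
 *     int32_t res = 0;
 *     int i;
 *     for (i = 0; i < ORDER; i++) {
 *         res   += v1[i] * f2[i];     accumulate the scalar product
 *         v1[i] += s2[i];             fused in-place add of the 2nd vector
 *     }
 *     return res;
 */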
static inline int32_t vector_sp_add(int16_t* v1, int16_t* f2, int16_t* s2)
{
    int res;
#if ORDER > 16
    int cnt = ORDER>>4;
#endif
#define ADDHALFREGS(s1, s2, sum)        /* Add register halves straight.   */ \
        "move.l " #s1 ", " #sum "\n"    /* 's1' and 's2' can be A or D     */ \
        "add.l  " #s2 ", " #s1  "\n"    /* regs, 'sum' must be a D reg.    */ \
        "clr.w  " #sum          "\n"    /* 's1' is clobbered!              */ \
        "add.l  " #s2 ", " #sum "\n"                                           \
        "move.w " #s1 ", " #sum "\n"

#define ADDHALFXREGS(s1, s2, sum)       /* Add register halves across.     */ \
        "clr.w  " #sum          "\n"    /* Needs 'sum' pre-swapped, swaps  */ \
        "add.l  " #s1 ", " #sum "\n"    /* 's2', and clobbers 's1'.        */ \
        "swap   " #s2           "\n"    /* 's1' can be an A or D reg.      */ \
        "add.l  " #s2 ", " #s1  "\n"    /* 'sum' and 's2' must be D regs.  */ \
        "move.w " #s1 ", " #sum "\n"
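/* In C terms, ADDHALFREGS(s1, s2, sum) is roughly the packed halfword add
 *
 *     sum = ((s1 + s2) & 0xffffu) | ((s1 & 0xffff0000u) + (s2 & 0xffff0000u));
 *
 * i.e. both 16 bit lanes are added independently, with no carry propagating
 * between them. ADDHALFXREGS performs the same kind of lane-wise add but pairs
 * the high half of one register with the low half of the other, which is what
 * the unaligned (halfword-offset) path below needs. This description is
 * illustrative only; the exact register constraints are given in the macro
 * comments above. */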
    asm volatile (
        "move.l  %[f2], %%d0                  \n"
        "and.l   #2, %%d0                     \n"
        "jeq     20f                          \n"

    "10:                                      \n"
        "move.w  (%[f2])+, %%d0               \n"
        "move.w  (%[s2])+, %%d1               \n"
        "swap    %%d1                         \n"
    "1:                                       \n"
        ".rept   2                            \n"
        "movem.l (%[v1]), %%d6-%%d7/%%a0-%%a1 \n"
        "mac.w   %%d0l, %%d6u, (%[f2])+, %%d0, %%acc0\n"
        "mac.w   %%d0u, %%d6l, (%[s2])+, %%d2, %%acc0\n"
        ADDHALFXREGS(%%d6, %%d2, %%d1)
        "mac.w   %%d0l, %%d7u, (%[f2])+, %%d0, %%acc0\n"
        "mac.w   %%d0u, %%d7l, (%[s2])+, %%d6, %%acc0\n"
        "move.l  %%d1, (%[v1])+               \n"
        ADDHALFXREGS(%%d7, %%d6, %%d2)
        "mac.w   %%d0l, %%a0u, (%[f2])+, %%d0, %%acc0\n"
        "mac.w   %%d0u, %%a0l, (%[s2])+, %%d7, %%acc0\n"
        "move.l  %%d2, (%[v1])+               \n"
        ADDHALFXREGS(%%a0, %%d7, %%d6)
        "mac.w   %%d0l, %%a1u, (%[f2])+, %%d0, %%acc0\n"
        "mac.w   %%d0u, %%a1l, (%[s2])+, %%d1, %%acc0\n"
        "move.l  %%d6, (%[v1])+               \n"
        ADDHALFXREGS(%%a1, %%d1, %%d7)
        "move.l  %%d7, (%[v1])+               \n"
        ".endr                                \n"

#if ORDER > 16
        "subq.l  #1, %[res]                   \n"
        "bne.w   1b                           \n"
#endif
        "jra     99f                          \n"

    "20:                                      \n"
        "move.l  (%[f2])+, %%d0               \n"
    "1:                                       \n"
        "movem.l (%[v1]), %%d6-%%d7/%%a0-%%a1 \n"
        "mac.w   %%d0u, %%d6u, (%[s2])+, %%d1, %%acc0\n"
        "mac.w   %%d0l, %%d6l, (%[f2])+, %%d0, %%acc0\n"
        ADDHALFREGS(%%d6, %%d1, %%d2)
        "mac.w   %%d0u, %%d7u, (%[s2])+, %%d1, %%acc0\n"
        "mac.w   %%d0l, %%d7l, (%[f2])+, %%d0, %%acc0\n"
        "move.l  %%d2, (%[v1])+               \n"
        ADDHALFREGS(%%d7, %%d1, %%d2)
        "mac.w   %%d0u, %%a0u, (%[s2])+, %%d1, %%acc0\n"
        "mac.w   %%d0l, %%a0l, (%[f2])+, %%d0, %%acc0\n"
        "move.l  %%d2, (%[v1])+               \n"
        ADDHALFREGS(%%a0, %%d1, %%d2)
        "mac.w   %%d0u, %%a1u, (%[s2])+, %%d1, %%acc0\n"
        "mac.w   %%d0l, %%a1l, (%[f2])+, %%d0, %%acc0\n"
        "move.l  %%d2, (%[v1])+               \n"
        ADDHALFREGS(%%a1, %%d1, %%d2)
        "move.l  %%d2, (%[v1])+               \n"

        "movem.l (%[v1]), %%d6-%%d7/%%a0-%%a1 \n"
        "mac.w   %%d0u, %%d6u, (%[s2])+, %%d1, %%acc0\n"
        "mac.w   %%d0l, %%d6l, (%[f2])+, %%d0, %%acc0\n"
        ADDHALFREGS(%%d6, %%d1, %%d2)
        "mac.w   %%d0u, %%d7u, (%[s2])+, %%d1, %%acc0\n"
        "mac.w   %%d0l, %%d7l, (%[f2])+, %%d0, %%acc0\n"
        "move.l  %%d2, (%[v1])+               \n"
        ADDHALFREGS(%%d7, %%d1, %%d2)
        "mac.w   %%d0u, %%a0u, (%[s2])+, %%d1, %%acc0\n"
        "mac.w   %%d0l, %%a0l, (%[f2])+, %%d0, %%acc0\n"
        "move.l  %%d2, (%[v1])+               \n"
        ADDHALFREGS(%%a0, %%d1, %%d2)
        "mac.w   %%d0u, %%a1u, (%[s2])+, %%d1, %%acc0\n"
#if ORDER > 16
        "mac.w   %%d0l, %%a1l, (%[f2])+, %%d0, %%acc0\n"
#else
        "mac.w   %%d0l, %%a1l, %%acc0         \n"
#endif
        "move.l  %%d2, (%[v1])+               \n"
        ADDHALFREGS(%%a1, %%d1, %%d2)
        "move.l  %%d2, (%[v1])+               \n"
#if ORDER > 16
        "subq.l  #1, %[res]                   \n"
        "bne.w   1b                           \n"
#endif

    "99:                                      \n"
        "movclr.l %%acc0, %[res]              \n"
        : /* outputs */
        [v1]"+a"(v1),
        [f2]"+a"(f2),
        [s2]"+a"(s2),
        [res]"=d"(res)
        : /* inputs */
#if ORDER > 16
        [cnt]"[res]"(cnt)
#endif
        : /* clobbers */
        "d0", "d1", "d2", "d6", "d7",
        "a0", "a1", "memory"
    );
    return res;
}
/* Calculate scalarproduct, then subtract a 2nd vector (fused for performance)
 * This version fetches data as 32 bit words, and *recommends* v1 to be
 * 32 bit aligned. It also assumes that f2 and s2 are either both 32 bit
 * aligned or both unaligned. Performance will suffer if either condition
 * isn't met. It also needs EMAC in signed integer mode. */
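/* Rough plain-C sketch of what vector_sp_sub() computes (illustrative only,
 * not part of the build; the loop shape is an assumption, the asm below does
 * the same work 16 samples per iteration with the EMAC):
 *
 *     int32_t res = 0;
 *     int i;
 *     for (i = 0; i < ORDER; i++) {
 *         res   += v1[i] * f2[i];     accumulate the scalar product
 *         v1[i] -= s2[i];             fused in-place subtract of the 2nd vector
 *     }
 *     return res;
 */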
static inline int32_t vector_sp_sub(int16_t* v1, int16_t* f2, int16_t* s2)
{
    int res;
#if ORDER > 16
    int cnt = ORDER>>4;
#endif
#define SUBHALFREGS(min, sub, dif)      /* Subtract register halves straight. */ \
        "move.l " #min ", " #dif "\n"   /* 'min' can be an A or D reg         */ \
        "sub.l  " #sub ", " #min "\n"   /* 'sub' and 'dif' must be D regs     */ \
        "clr.w  " #sub           "\n"   /* 'min' and 'sub' are clobbered!     */ \
        "sub.l  " #sub ", " #dif "\n"                                            \
        "move.w " #min ", " #dif "\n"

#define SUBHALFXREGS(min, s2, s1d)      /* Subtract register halves across.   */ \
        "clr.w  " #s1d           "\n"   /* Needs 's1d' pre-swapped, swaps     */ \
        "sub.l  " #s1d ", " #min "\n"   /* 's2' and clobbers 'min'.           */ \
        "move.l " #min ", " #s1d "\n"   /* 'min' can be an A or D reg,        */ \
        "swap   " #s2            "\n"   /* 's2' and 's1d' must be D regs.     */ \
        "sub.l  " #s2 ", " #min  "\n"                                            \
        "move.w " #min ", " #s1d "\n"
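/* In C terms, SUBHALFREGS(min, sub, dif) is roughly the packed halfword subtract
 *
 *     dif = ((min - sub) & 0xffffu) | ((min & 0xffff0000u) - (sub & 0xffff0000u));
 *
 * i.e. both 16 bit lanes are subtracted independently, with no borrow crossing
 * between them. SUBHALFXREGS is the corresponding lane-wise subtract that pairs
 * the high half of one register with the low half of the other, as used on the
 * unaligned (halfword-offset) path below. Illustrative description only; the
 * register constraints are spelled out in the macro comments above. */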
    asm volatile (
        "move.l  %[f2], %%d0                  \n"
        "and.l   #2, %%d0                     \n"
        "jeq     20f                          \n"

    "10:                                      \n"
        "move.w  (%[f2])+, %%d0               \n"
        "move.w  (%[s2])+, %%d1               \n"
        "swap    %%d1                         \n"
    "1:                                       \n"
        ".rept   2                            \n"
        "movem.l (%[v1]), %%d6-%%d7/%%a0-%%a1 \n"
        "mac.w   %%d0l, %%d6u, (%[f2])+, %%d0, %%acc0\n"
        "mac.w   %%d0u, %%d6l, (%[s2])+, %%d2, %%acc0\n"
        SUBHALFXREGS(%%d6, %%d2, %%d1)
        "mac.w   %%d0l, %%d7u, (%[f2])+, %%d0, %%acc0\n"
        "mac.w   %%d0u, %%d7l, (%[s2])+, %%d6, %%acc0\n"
        "move.l  %%d1, (%[v1])+               \n"
        SUBHALFXREGS(%%d7, %%d6, %%d2)
        "mac.w   %%d0l, %%a0u, (%[f2])+, %%d0, %%acc0\n"
        "mac.w   %%d0u, %%a0l, (%[s2])+, %%d7, %%acc0\n"
        "move.l  %%d2, (%[v1])+               \n"
        SUBHALFXREGS(%%a0, %%d7, %%d6)
        "mac.w   %%d0l, %%a1u, (%[f2])+, %%d0, %%acc0\n"
        "mac.w   %%d0u, %%a1l, (%[s2])+, %%d1, %%acc0\n"
        "move.l  %%d6, (%[v1])+               \n"
        SUBHALFXREGS(%%a1, %%d1, %%d7)
        "move.l  %%d7, (%[v1])+               \n"
        ".endr                                \n"

#if ORDER > 16
        "subq.l  #1, %[res]                   \n"
        "bne.w   1b                           \n"
#endif
        "jra     99f                          \n"

    "20:                                      \n"
        "move.l  (%[f2])+, %%d0               \n"
    "1:                                       \n"
        "movem.l (%[v1]), %%d6-%%d7/%%a0-%%a1 \n"
        "mac.w   %%d0u, %%d6u, (%[s2])+, %%d1, %%acc0\n"
        "mac.w   %%d0l, %%d6l, (%[f2])+, %%d0, %%acc0\n"
        SUBHALFREGS(%%d6, %%d1, %%d2)
        "mac.w   %%d0u, %%d7u, (%[s2])+, %%d1, %%acc0\n"
        "mac.w   %%d0l, %%d7l, (%[f2])+, %%d0, %%acc0\n"
        "move.l  %%d2, (%[v1])+               \n"
        SUBHALFREGS(%%d7, %%d1, %%d2)
        "mac.w   %%d0u, %%a0u, (%[s2])+, %%d1, %%acc0\n"
        "mac.w   %%d0l, %%a0l, (%[f2])+, %%d0, %%acc0\n"
        "move.l  %%d2, (%[v1])+               \n"
        SUBHALFREGS(%%a0, %%d1, %%d2)
        "mac.w   %%d0u, %%a1u, (%[s2])+, %%d1, %%acc0\n"
        "mac.w   %%d0l, %%a1l, (%[f2])+, %%d0, %%acc0\n"
        "move.l  %%d2, (%[v1])+               \n"
        SUBHALFREGS(%%a1, %%d1, %%d2)
        "move.l  %%d2, (%[v1])+               \n"

        "movem.l (%[v1]), %%d6-%%d7/%%a0-%%a1 \n"
        "mac.w   %%d0u, %%d6u, (%[s2])+, %%d1, %%acc0\n"
        "mac.w   %%d0l, %%d6l, (%[f2])+, %%d0, %%acc0\n"
        SUBHALFREGS(%%d6, %%d1, %%d2)
        "mac.w   %%d0u, %%d7u, (%[s2])+, %%d1, %%acc0\n"
        "mac.w   %%d0l, %%d7l, (%[f2])+, %%d0, %%acc0\n"
        "move.l  %%d2, (%[v1])+               \n"
        SUBHALFREGS(%%d7, %%d1, %%d2)
        "mac.w   %%d0u, %%a0u, (%[s2])+, %%d1, %%acc0\n"
        "mac.w   %%d0l, %%a0l, (%[f2])+, %%d0, %%acc0\n"
        "move.l  %%d2, (%[v1])+               \n"
        SUBHALFREGS(%%a0, %%d1, %%d2)
        "mac.w   %%d0u, %%a1u, (%[s2])+, %%d1, %%acc0\n"
#if ORDER > 16
        "mac.w   %%d0l, %%a1l, (%[f2])+, %%d0, %%acc0\n"
#else
        "mac.w   %%d0l, %%a1l, %%acc0         \n"
#endif
        "move.l  %%d2, (%[v1])+               \n"
        SUBHALFREGS(%%a1, %%d1, %%d2)
        "move.l  %%d2, (%[v1])+               \n"
#if ORDER > 16
        "subq.l  #1, %[res]                   \n"
        "bne.w   1b                           \n"
#endif

    "99:                                      \n"
        "movclr.l %%acc0, %[res]              \n"
        : /* outputs */
        [v1]"+a"(v1),
        [f2]"+a"(f2),
        [s2]"+a"(s2),
        [res]"=d"(res)
        : /* inputs */
#if ORDER > 16
        [cnt]"[res]"(cnt)
#endif
        : /* clobbers */
        "d0", "d1", "d2", "d6", "d7",
        "a0", "a1", "memory"
    );
    return res;
}
/* This version fetches data as 32 bit words, and *recommends* v1 to be
 * 32 bit aligned, otherwise performance will suffer. It also needs EMAC
 * in signed integer mode. */
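/* Rough plain-C sketch of the value this routine returns (illustrative only;
 * the asm below computes it with the EMAC, 16 samples per loop iteration):
 *
 *     int32_t res = 0;
 *     int i;
 *     for (i = 0; i < ORDER; i++)
 *         res += v1[i] * v2[i];
 *     return res;
 */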
static inline int32_t scalarproduct(int16_t* v1, int16_t* v2)
{
    int res;
#if ORDER > 16
    int cnt = ORDER>>4;
#endif
    asm volatile (
        "move.l  %[v2], %%d0                  \n"
        "and.l   #2, %%d0                     \n"
        "jeq     20f                          \n"

    "10:                                      \n"
        "move.l  (%[v1])+, %%d0               \n"
        "move.w  (%[v2])+, %%d1               \n"
    "1:                                       \n"
        ".rept   7                            \n"
        "mac.w   %%d0u, %%d1l, (%[v2])+, %%d1, %%acc0\n"
        "mac.w   %%d0l, %%d1u, (%[v1])+, %%d0, %%acc0\n"
        ".endr                                \n"

        "mac.w   %%d0u, %%d1l, (%[v2])+, %%d1, %%acc0\n"
#if ORDER > 16
        "mac.w   %%d0l, %%d1u, (%[v1])+, %%d0, %%acc0\n"
        "subq.l  #1, %[res]                   \n"
        "bne.b   1b                           \n"
#else
        "mac.w   %%d0l, %%d1u, %%acc0         \n"
#endif
        "jra     99f                          \n"

    "20:                                      \n"
        "move.l  (%[v1])+, %%d0               \n"
        "move.l  (%[v2])+, %%d1               \n"
    "1:                                       \n"
        ".rept   3                            \n"
        "mac.w   %%d0u, %%d1u, (%[v1])+, %%d2, %%acc0\n"
        "mac.w   %%d0l, %%d1l, (%[v2])+, %%d1, %%acc0\n"
        "mac.w   %%d2u, %%d1u, (%[v1])+, %%d0, %%acc0\n"
        "mac.w   %%d2l, %%d1l, (%[v2])+, %%d1, %%acc0\n"
        ".endr                                \n"

        "mac.w   %%d0u, %%d1u, (%[v1])+, %%d2, %%acc0\n"
        "mac.w   %%d0l, %%d1l, (%[v2])+, %%d1, %%acc0\n"
#if ORDER > 16
        "mac.w   %%d2u, %%d1u, (%[v1])+, %%d0, %%acc0\n"
        "mac.w   %%d2l, %%d1l, (%[v2])+, %%d1, %%acc0\n"
        "subq.l  #1, %[res]                   \n"
        "bne.b   1b                           \n"
#else
        "mac.w   %%d2u, %%d1u, %%acc0         \n"
        "mac.w   %%d2l, %%d1l, %%acc0         \n"
#endif

    "99:                                      \n"
        "movclr.l %%acc0, %[res]              \n"
        : /* outputs */
        [v1]"+a"(v1),
        [v2]"+a"(v2),
        [res]"=d"(res)
        : /* inputs */
#if ORDER > 16
        [cnt]"[res]"(cnt)
#endif
        : /* clobbers */
        "d0", "d1", "d2"
    );
    return res;
}