3 libdemac - A Monkey's Audio decoder
7 Copyright (C) Dave Chapman 2007
9 ARMv6 vector math copyright (C) 2008 Jens Arnold
11 This program is free software; you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation; either version 2 of the License, or
14 (at your option) any later version.
16 This program is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
21 You should have received a copy of the GNU General Public License
22 along with this program; if not, write to the Free Software
23 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
/* FUSED_VECTOR_MATH tells the generic decoder code that this backend
 * provides the fused scalarproduct+add/sub entry points below. */
27 #define FUSED_VECTOR_MATH
/* NOTE(review): the two REPEAT_BLOCK definitions below are mutually
 * exclusive alternatives from a stripped #if/#else (presumably keyed on
 * the filter ORDER): triple-unrolled expansion vs. single expansion.
 * They cannot both be live at once — confirm against the full file. */
30 #define REPEAT_BLOCK(x) x x x
32 #define REPEAT_BLOCK(x) x
35 /* Calculate scalarproduct, then add a 2nd vector (fused for performance)
36 * This version fetches data as 32 bit words, and *requires* v1 to be
37 * 32 bit aligned. It also requires that f2 and s2 are either both 32 bit
38 * aligned or both unaligned. If either condition isn't met, it will either
39 * result in a data abort or incorrect results. */
/* Fused kernel: res = dot(v1, f2); v1 += s2 (elementwise, saturating-free
 * 16-bit lane adds via SADD16), using the ARMv6 SIMD media instructions.
 *
 * NOTE(review): this extracted chunk is missing the asm volatile(...)
 * wrappers, the #if/#else preprocessor alternatives (e.g. the smladx vs
 * smuadx first-MAC pair), loop labels, operand/constraint lists, braces
 * and the return statement. The comments below annotate only the visible
 * instruction stream; confirm control flow against the complete file. */
40 static inline int32_t vector_sp_add(int16_t* v1
, int16_t* f2
, int16_t* s2
)
/* ---- Unaligned path: f2/s2 are halfword- but not word-aligned. ----
 * Fetch the stray leading halfword of each, then realign the following
 * 32-bit words with pkhtb/pkhbt so every register holds a lane pair. */
55 "ldrh r3, [%[f2]], #2 \n"
56 "ldrh r6, [%[s2]], #2 \n"
57 "ldmia %[f2]!, {r2,r4} \n"
/* Move the odd leading halfwords into the top half of r3/r6. */
58 "mov r3, r3, lsl #16 \n"
59 "mov r6, r6, lsl #16 \n"
62 "ldmia %[s2]!, {r5,r7} \n"
/* Rotate + pack: rebuild properly paired 16-bit lanes for s2. */
66 "mov r5, r5, ror #16 \n"
67 "pkhtb r6, r5, r6, asr #16 \n"
68 "pkhbt r5, r5, r7, lsl #16 \n"
/* Dual 16x16 multiply-accumulate with halfword exchange (the 'x' form
 * compensates for the packed operands being swapped by the realignment).
 * smladx accumulates into %[res]; smuadx is the stripped #else branch
 * that starts the sum on the first iteration. */
70 "smladx %[res], r0, r3, %[res] \n"
72 "smuadx %[res], r0, r3 \n"
74 "smladx %[res], r1, r2, %[res] \n"
75 "ldmia %[f2]!, {r2,r3} \n"
/* Parallel 16-bit adds: v1 += s2, two lanes per register. */
76 "sadd16 r0, r0, r6 \n"
77 "sadd16 r1, r1, r5 \n"
/* Store the updated v1 pair, post-increment by 8 bytes. */
78 "strd r0, [%[v1]], #8 \n"
81 "ldmia %[s2]!, {r5,r6} \n"
85 "mov r5, r5, ror #16 \n"
86 "pkhtb r7, r5, r7, asr #16 \n"
87 "pkhbt r5, r5, r6, lsl #16 \n"
88 "smladx %[res], r0, r4, %[res] \n"
89 "smladx %[res], r1, r2, %[res] \n"
90 "ldmia %[f2]!, {r2,r4} \n"
91 "sadd16 r0, r0, r7 \n"
92 "sadd16 r1, r1, r5 \n"
93 "strd r0, [%[v1]], #8 \n"
94 "ldmia %[s2]!, {r5,r7} \n"
98 "mov r5, r5, ror #16 \n"
99 "pkhtb r6, r5, r6, asr #16 \n"
100 "pkhbt r5, r5, r7, lsl #16 \n"
101 "smladx %[res], r0, r3, %[res] \n"
102 "smladx %[res], r1, r2, %[res] \n"
103 "ldmia %[f2]!, {r2,r3} \n"
104 "sadd16 r0, r0, r6 \n"
105 "sadd16 r1, r1, r5 \n"
106 "strd r0, [%[v1]], #8 \n"
109 "ldmia %[s2]!, {r5,r6} \n"
/* Repack the f2 words as well before the final MAC pair of the group. */
110 "pkhtb r4, r4, r2 \n"
111 "pkhtb r2, r2, r3 \n"
112 "ldrd r0, [%[v1]] \n"
113 "mov r5, r5, ror #16 \n"
114 "pkhtb r7, r5, r7, asr #16 \n"
115 "pkhbt r5, r5, r6, lsl #16 \n"
116 "smladx %[res], r0, r4, %[res] \n"
117 "smladx %[res], r1, r2, %[res] \n"
/* Loop control: decrement count; the ldm executes only when another
 * iteration remains (ne), prefetching the next f2 words. */
119 "subs %[cnt], %[cnt], #1 \n"
120 "ldmneia %[f2]!, {r2,r4} \n"
121 "sadd16 r0, r0, r7 \n"
122 "sadd16 r1, r1, r5 \n"
123 "strd r0, [%[v1]], #8 \n"
/* Loop-exit tail: duplicated add/store epilogue for the final group. */
126 "sadd16 r0, r0, r7 \n"
127 "sadd16 r1, r1, r5 \n"
128 "strd r0, [%[v1]], #8 \n"
/* ---- Aligned path: all pointers word-aligned; plain ldrd loads and
 * non-exchanging smlad/smuad dual MACs, no repacking needed. ---- */
134 "ldrd r4, [%[f2]], #8 \n"
135 "ldrd r0, [%[v1]] \n"
139 "smlad %[res], r0, r4, %[res] \n"
141 "smuad %[res], r0, r4 \n"
143 "ldrd r6, [%[s2]], #8 \n"
144 "smlad %[res], r1, r5, %[res] \n"
145 "ldrd r4, [%[f2]], #8 \n"
146 "ldrd r2, [%[v1], #8] \n"
147 "sadd16 r0, r0, r6 \n"
148 "sadd16 r1, r1, r7 \n"
149 "strd r0, [%[v1]], #8 \n"
152 "smlad %[res], r2, r4, %[res] \n"
153 "ldrd r6, [%[s2]], #8 \n"
154 "smlad %[res], r3, r5, %[res] \n"
155 "ldrd r4, [%[f2]], #8 \n"
156 "ldrd r0, [%[v1], #8] \n"
157 "sadd16 r2, r2, r6 \n"
158 "sadd16 r3, r3, r7 \n"
159 "strd r2, [%[v1]], #8 \n"
160 "smlad %[res], r0, r4, %[res] \n"
161 "ldrd r6, [%[s2]], #8 \n"
162 "smlad %[res], r1, r5, %[res] \n"
163 "ldrd r4, [%[f2]], #8 \n"
164 "ldrd r2, [%[v1], #8] \n"
165 "sadd16 r0, r0, r6 \n"
166 "sadd16 r1, r1, r7 \n"
167 "strd r0, [%[v1]], #8 \n"
170 "smlad %[res], r2, r4, %[res] \n"
171 "ldrd r6, [%[s2]], #8 \n"
172 "smlad %[res], r3, r5, %[res] \n"
/* Conditional (ne) prefetch of the next group while the count holds. */
174 "subs %[cnt], %[cnt], #1 \n"
175 "ldrned r4, [%[f2]], #8 \n"
176 "ldrned r0, [%[v1], #8] \n"
177 "sadd16 r2, r2, r6 \n"
178 "sadd16 r3, r3, r7 \n"
179 "strd r2, [%[v1]], #8 \n"
/* Aligned-path loop-exit tail. */
182 "sadd16 r2, r2, r6 \n"
183 "sadd16 r3, r3, r7 \n"
184 "strd r2, [%[v1]], #8 \n"
/* Clobber list: scratch registers used above, plus flags and memory
 * (v1 is written through a pointer the compiler cannot see). */
198 "r0", "r1", "r2", "r3", "r4",
199 "r5", "r6", "r7", "cc", "memory"
204 /* Calculate scalarproduct, then subtract a 2nd vector (fused for performance)
205 * This version fetches data as 32 bit words, and *requires* v1 to be
206 * 32 bit aligned. It also requires that f2 and s2 are either both 32 bit
207 * aligned or both unaligned. If either condition isn't met, it will either
208 * result in a data abort or incorrect results. */
/* Fused kernel: res = dot(v1, f2); v1 -= s2 (elementwise 16-bit lane
 * subtracts via SSUB16). Structurally the mirror of vector_sp_add with
 * every sadd16 replaced by ssub16.
 *
 * NOTE(review): this extracted chunk is missing the asm volatile(...)
 * wrappers, the #if/#else alternatives (smladx vs smuadx first MAC),
 * loop labels, operand/constraint lists, braces and the return
 * statement. Comments annotate only the visible instruction stream. */
209 static inline int32_t vector_sp_sub(int16_t* v1
, int16_t* f2
, int16_t* s2
)
/* ---- Unaligned path: pick up the stray leading halfwords of f2/s2,
 * then realign the following words with pkhtb/pkhbt. ---- */
224 "ldrh r3, [%[f2]], #2 \n"
225 "ldrh r6, [%[s2]], #2 \n"
226 "ldmia %[f2]!, {r2,r4} \n"
/* Position the odd leading halfwords in the top half of r3/r6. */
227 "mov r3, r3, lsl #16 \n"
228 "mov r6, r6, lsl #16 \n"
231 "ldmia %[s2]!, {r5,r7} \n"
/* Repack f2 words into swapped lane pairs for the 'x' MAC forms. */
232 "pkhtb r3, r3, r2 \n"
233 "pkhtb r2, r2, r4 \n"
/* Load a v1 pair (two 16-bit lanes per word). */
234 "ldrd r0, [%[v1]] \n"
/* Rotate + pack: rebuild properly paired lanes for s2. */
235 "mov r5, r5, ror #16 \n"
236 "pkhtb r6, r5, r6, asr #16 \n"
237 "pkhbt r5, r5, r7, lsl #16 \n"
/* Dual 16x16 MAC with halfword exchange; smuadx is the stripped #else
 * branch that initializes the sum on the first iteration. */
239 "smladx %[res], r0, r3, %[res] \n"
241 "smuadx %[res], r0, r3 \n"
243 "smladx %[res], r1, r2, %[res] \n"
244 "ldmia %[f2]!, {r2,r3} \n"
/* Parallel 16-bit subtracts: v1 -= s2, two lanes per register. */
245 "ssub16 r0, r0, r6 \n"
246 "ssub16 r1, r1, r5 \n"
247 "strd r0, [%[v1]], #8 \n"
250 "ldmia %[s2]!, {r5,r6} \n"
251 "pkhtb r4, r4, r2 \n"
252 "pkhtb r2, r2, r3 \n"
253 "ldrd r0, [%[v1]] \n"
254 "mov r5, r5, ror #16 \n"
255 "pkhtb r7, r5, r7, asr #16 \n"
256 "pkhbt r5, r5, r6, lsl #16 \n"
257 "smladx %[res], r0, r4, %[res] \n"
258 "smladx %[res], r1, r2, %[res] \n"
259 "ldmia %[f2]!, {r2,r4} \n"
260 "ssub16 r0, r0, r7 \n"
261 "ssub16 r1, r1, r5 \n"
262 "strd r0, [%[v1]], #8 \n"
263 "ldmia %[s2]!, {r5,r7} \n"
264 "pkhtb r3, r3, r2 \n"
265 "pkhtb r2, r2, r4 \n"
266 "ldrd r0, [%[v1]] \n"
267 "mov r5, r5, ror #16 \n"
268 "pkhtb r6, r5, r6, asr #16 \n"
269 "pkhbt r5, r5, r7, lsl #16 \n"
270 "smladx %[res], r0, r3, %[res] \n"
271 "smladx %[res], r1, r2, %[res] \n"
272 "ldmia %[f2]!, {r2,r3} \n"
273 "ssub16 r0, r0, r6 \n"
274 "ssub16 r1, r1, r5 \n"
275 "strd r0, [%[v1]], #8 \n"
278 "ldmia %[s2]!, {r5,r6} \n"
279 "pkhtb r4, r4, r2 \n"
280 "pkhtb r2, r2, r3 \n"
281 "ldrd r0, [%[v1]] \n"
282 "mov r5, r5, ror #16 \n"
283 "pkhtb r7, r5, r7, asr #16 \n"
284 "pkhbt r5, r5, r6, lsl #16 \n"
285 "smladx %[res], r0, r4, %[res] \n"
286 "smladx %[res], r1, r2, %[res] \n"
/* Loop control: decrement count; ldm runs only when another iteration
 * remains (ne), prefetching the next f2 words. */
288 "subs %[cnt], %[cnt], #1 \n"
289 "ldmneia %[f2]!, {r2,r4} \n"
290 "ssub16 r0, r0, r7 \n"
291 "ssub16 r1, r1, r5 \n"
292 "strd r0, [%[v1]], #8 \n"
/* Loop-exit tail: duplicated subtract/store epilogue. */
295 "ssub16 r0, r0, r7 \n"
296 "ssub16 r1, r1, r5 \n"
297 "strd r0, [%[v1]], #8 \n"
/* ---- Aligned path: plain ldrd loads, non-exchanging smlad/smuad dual
 * MACs, no repacking needed. ---- */
303 "ldrd r4, [%[f2]], #8 \n"
304 "ldrd r0, [%[v1]] \n"
308 "smlad %[res], r0, r4, %[res] \n"
310 "smuad %[res], r0, r4 \n"
312 "ldrd r6, [%[s2]], #8 \n"
313 "smlad %[res], r1, r5, %[res] \n"
314 "ldrd r4, [%[f2]], #8 \n"
315 "ldrd r2, [%[v1], #8] \n"
316 "ssub16 r0, r0, r6 \n"
317 "ssub16 r1, r1, r7 \n"
318 "strd r0, [%[v1]], #8 \n"
321 "smlad %[res], r2, r4, %[res] \n"
322 "ldrd r6, [%[s2]], #8 \n"
323 "smlad %[res], r3, r5, %[res] \n"
324 "ldrd r4, [%[f2]], #8 \n"
325 "ldrd r0, [%[v1], #8] \n"
326 "ssub16 r2, r2, r6 \n"
327 "ssub16 r3, r3, r7 \n"
328 "strd r2, [%[v1]], #8 \n"
329 "smlad %[res], r0, r4, %[res] \n"
330 "ldrd r6, [%[s2]], #8 \n"
331 "smlad %[res], r1, r5, %[res] \n"
332 "ldrd r4, [%[f2]], #8 \n"
333 "ldrd r2, [%[v1], #8] \n"
334 "ssub16 r0, r0, r6 \n"
335 "ssub16 r1, r1, r7 \n"
336 "strd r0, [%[v1]], #8 \n"
339 "smlad %[res], r2, r4, %[res] \n"
340 "ldrd r6, [%[s2]], #8 \n"
341 "smlad %[res], r3, r5, %[res] \n"
/* Conditional (ne) prefetch of the next group while the count holds. */
343 "subs %[cnt], %[cnt], #1 \n"
344 "ldrned r4, [%[f2]], #8 \n"
345 "ldrned r0, [%[v1], #8] \n"
346 "ssub16 r2, r2, r6 \n"
347 "ssub16 r3, r3, r7 \n"
348 "strd r2, [%[v1]], #8 \n"
/* Aligned-path loop-exit tail. */
351 "ssub16 r2, r2, r6 \n"
352 "ssub16 r3, r3, r7 \n"
353 "strd r2, [%[v1]], #8 \n"
/* Clobbers: scratch registers, flags, and memory (v1 is written). */
367 "r0", "r1", "r2", "r3", "r4",
368 "r5", "r6", "r7", "cc", "memory"
373 /* This version fetches data as 32 bit words, and *requires* v1 to be
374 * 32 bit aligned, otherwise it will result either in a data abort, or
375 * incorrect results (if ARM aligncheck is disabled). */
/* Plain scalar product of two int16 vectors using ARMv6 dual-MAC
 * (smlad/smladx) instructions; no vector update, unlike the fused
 * variants above.
 *
 * NOTE(review): this extracted chunk is missing the asm volatile(...)
 * wrappers, the #if/#else alternatives (smladx vs smuadx / smlad vs
 * smuad first MAC), loop labels, operand lists, braces and the return
 * statement. Comments annotate only the visible instruction stream. */
376 static inline int32_t scalarproduct(int16_t* v1
, int16_t* v2
)
/* ---- Unaligned path: word-align the v2 pointer (clear bit 1), then
 * compensate for the halfword offset with exchanged-operand MACs. ---- */
391 "bic %[v2], %[v2], #2 \n"
392 "ldmia %[v2]!, {r5-r7} \n"
393 "ldrd r0, [%[v1]], #8 \n"
/* pkhtb stitches adjacent v2 words into the lane pairing that pairs
 * correctly with v1 under the 'x' (halfword-exchange) MAC forms. */
396 "pkhtb r3, r5, r6 \n"
397 "ldrd r4, [%[v2]], #8 \n"
/* smuadx (stripped #else branch) seeds the sum on the first pass. */
399 "smladx %[res], r0, r3, %[res] \n"
401 "smuadx %[res], r0, r3 \n"
404 "pkhtb r0, r6, r7 \n"
405 "ldrd r2, [%[v1]], #8 \n"
406 "smladx %[res], r1, r0, %[res] \n"
407 "pkhtb r1, r7, r4 \n"
408 "ldrd r6, [%[v2]], #8 \n"
409 "smladx %[res], r2, r1, %[res] \n"
410 "pkhtb r2, r4, r5 \n"
411 "ldrd r0, [%[v1]], #8 \n"
412 "smladx %[res], r3, r2, %[res] \n"
413 "pkhtb r3, r5, r6 \n"
414 "ldrd r4, [%[v2]], #8 \n"
415 "smladx %[res], r0, r3, %[res] \n"
418 "pkhtb r0, r6, r7 \n"
419 "ldrd r2, [%[v1]], #8 \n"
420 "smladx %[res], r1, r0, %[res] \n"
421 "pkhtb r1, r7, r4 \n"
/* Loop control: the ldrd pair executes only while iterations remain
 * (ne), prefetching the next v1/v2 data. */
423 "subs %[cnt], %[cnt], #1 \n"
424 "ldrned r6, [%[v2]], #8 \n"
425 "smladx %[res], r2, r1, %[res] \n"
426 "pkhtb r2, r4, r5 \n"
427 "ldrned r0, [%[v1]], #8 \n"
428 "smladx %[res], r3, r2, %[res] \n"
/* Loop-exit tail: finish the last two exchanged MACs. */
431 "pkhtb r4, r4, r5 \n"
432 "smladx %[res], r2, r1, %[res] \n"
433 "smladx %[res], r3, r4, %[res] \n"
/* ---- Aligned path: both pointers word-aligned; straight loads and
 * non-exchanging smlad/smuad dual MACs. ---- */
439 "ldrd r0, [%[v1]], #8 \n"
440 "ldmia %[v2]!, {r5-r7} \n"
443 "ldrd r2, [%[v1]], #8 \n"
/* smuad (stripped #else branch) seeds the sum on the first pass. */
445 "smlad %[res], r0, r5, %[res] \n"
447 "smuad %[res], r0, r5 \n"
450 "ldrd r4, [%[v2]], #8 \n"
451 "smlad %[res], r1, r6, %[res] \n"
452 "ldrd r0, [%[v1]], #8 \n"
453 "smlad %[res], r2, r7, %[res] \n"
454 "ldrd r6, [%[v2]], #8 \n"
455 "smlad %[res], r3, r4, %[res] \n"
456 "ldrd r2, [%[v1]], #8 \n"
457 "smlad %[res], r0, r5, %[res] \n"
461 "ldrd r4, [%[v2]], #8 \n"
462 "smlad %[res], r1, r6, %[res] \n"
/* Conditional (ne) prefetch while the count holds. */
463 "subs %[cnt], %[cnt], #1 \n"
464 "ldrned r0, [%[v1]], #8 \n"
465 "smlad %[res], r2, r7, %[res] \n"
466 "ldrned r6, [%[v2]], #8 \n"
467 "smlad %[res], r3, r4, %[res] \n"
/* Loop-exit tail: load the final odd word of v2 and drain the MACs. */
470 "ldr r4, [%[v2]], #4 \n"
471 "smlad %[res], r1, r6, %[res] \n"
472 "smlad %[res], r2, r7, %[res] \n"
473 "smlad %[res], r3, r4, %[res] \n"
/* Clobbers: scratch registers and flags; "memory" is listed although
 * this routine only reads — presumably kept for ordering. TODO confirm
 * against the full file's operand constraints. */
486 "r0", "r1", "r2", "r3",
487 "r4", "r5", "r6", "r7", "cc", "memory"