Correction to clobber lists of several codecs' inline assembly.
apps/codecs/demac/libdemac/vector_math16_armv6.h
/*

libdemac - A Monkey's Audio decoder

$Id$

Copyright (C) Dave Chapman 2007

ARMv6 vector math copyright (C) 2008 Jens Arnold

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA

*/

#define FUSED_VECTOR_MATH

#if ORDER > 16
#define REPEAT_BLOCK(x) x x x
#else
#define REPEAT_BLOCK(x) x
#endif
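
/* Added note (not part of the upstream file): ORDER is the filter length
 * defined by the including file, and REPEAT_BLOCK() controls how far the
 * assembly loops below are unrolled: its argument is pasted three times when
 * ORDER > 16 and once otherwise, so a single straight-line pass covers 32
 * samples (16 for ORDER <= 16), matching the "cnt = ORDER>>5" loop counters
 * used for larger orders. A hypothetical plain-C sketch of the same unrolling
 * idea, assuming ORDER is a multiple of 32 and greater than 32: */
#if 0 /* illustration only */
static inline int32_t unrolled_dot_sketch(const int16_t *a, const int16_t *b)
{
    int32_t acc = 0;
    int cnt = ORDER >> 5;                /* 32 samples per unrolled pass    */
    do {
        int i;
        for (i = 0; i < 32; i++)         /* stands in for the hand-unrolled */
            acc += (int32_t)*a++ * *b++; /* REPEAT_BLOCK() body             */
    } while (--cnt);
    return acc;
}
#endif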

/* Calculate scalarproduct, then add a 2nd vector (fused for performance)
 * This version fetches data as 32 bit words, and *requires* v1 to be
 * 32 bit aligned. It also requires that f2 and s2 are either both 32 bit
 * aligned or both unaligned. If either condition isn't met, it will either
 * result in a data abort or incorrect results. */
static inline int32_t vector_sp_add(int16_t* v1, int16_t* f2, int16_t* s2)
{
    int res;
#if ORDER > 32
    int cnt = ORDER>>5;
#endif

    asm volatile (
#if ORDER > 32
        "mov %[res], #0 \n"
#endif
        "tst %[f2], #2 \n"
        "beq 20f \n"
54 "10: \n"
55 "ldrh r3, [%[f2]], #2 \n"
56 "ldrh r6, [%[s2]], #2 \n"
57 "ldmia %[f2]!, {r2,r4} \n"
58 "mov r3, r3, lsl #16 \n"
59 "mov r6, r6, lsl #16 \n"
61 "1: \n"
62 "ldmia %[s2]!, {r5,r7} \n"
63 "pkhtb r3, r3, r2 \n"
64 "pkhtb r2, r2, r4 \n"
65 "ldrd r0, [%[v1]] \n"
66 "mov r5, r5, ror #16 \n"
67 "pkhtb r6, r5, r6, asr #16 \n"
68 "pkhbt r5, r5, r7, lsl #16 \n"
69 #if ORDER > 32
70 "smladx %[res], r0, r3, %[res] \n"
71 #else
72 "smuadx %[res], r0, r3 \n"
73 #endif
74 "smladx %[res], r1, r2, %[res] \n"
75 "ldmia %[f2]!, {r2,r3} \n"
76 "sadd16 r0, r0, r6 \n"
77 "sadd16 r1, r1, r5 \n"
78 "strd r0, [%[v1]], #8 \n"

        REPEAT_BLOCK(
        "ldmia %[s2]!, {r5,r6} \n"
        "pkhtb r4, r4, r2 \n"
        "pkhtb r2, r2, r3 \n"
        "ldrd r0, [%[v1]] \n"
        "mov r5, r5, ror #16 \n"
        "pkhtb r7, r5, r7, asr #16 \n"
        "pkhbt r5, r5, r6, lsl #16 \n"
        "smladx %[res], r0, r4, %[res] \n"
        "smladx %[res], r1, r2, %[res] \n"
        "ldmia %[f2]!, {r2,r4} \n"
        "sadd16 r0, r0, r7 \n"
        "sadd16 r1, r1, r5 \n"
        "strd r0, [%[v1]], #8 \n"
        "ldmia %[s2]!, {r5,r7} \n"
        "pkhtb r3, r3, r2 \n"
        "pkhtb r2, r2, r4 \n"
        "ldrd r0, [%[v1]] \n"
        "mov r5, r5, ror #16 \n"
        "pkhtb r6, r5, r6, asr #16 \n"
        "pkhbt r5, r5, r7, lsl #16 \n"
        "smladx %[res], r0, r3, %[res] \n"
        "smladx %[res], r1, r2, %[res] \n"
        "ldmia %[f2]!, {r2,r3} \n"
        "sadd16 r0, r0, r6 \n"
        "sadd16 r1, r1, r5 \n"
        "strd r0, [%[v1]], #8 \n"
        )
109 "ldmia %[s2]!, {r5,r6} \n"
110 "pkhtb r4, r4, r2 \n"
111 "pkhtb r2, r2, r3 \n"
112 "ldrd r0, [%[v1]] \n"
113 "mov r5, r5, ror #16 \n"
114 "pkhtb r7, r5, r7, asr #16 \n"
115 "pkhbt r5, r5, r6, lsl #16 \n"
116 "smladx %[res], r0, r4, %[res] \n"
117 "smladx %[res], r1, r2, %[res] \n"
118 #if ORDER > 32
119 "subs %[cnt], %[cnt], #1 \n"
120 "ldmneia %[f2]!, {r2,r4} \n"
121 "sadd16 r0, r0, r7 \n"
122 "sadd16 r1, r1, r5 \n"
123 "strd r0, [%[v1]], #8 \n"
124 "bne 1b \n"
125 #else
126 "sadd16 r0, r0, r7 \n"
127 "sadd16 r1, r1, r5 \n"
128 "strd r0, [%[v1]], #8 \n"
129 #endif
131 "b 99f \n"
133 "20: \n"
134 "ldrd r4, [%[f2]], #8 \n"
135 "ldrd r0, [%[v1]] \n"
137 #if ORDER > 32
138 "1: \n"
139 "smlad %[res], r0, r4, %[res] \n"
140 #else
141 "smuad %[res], r0, r4 \n"
142 #endif
143 "ldrd r6, [%[s2]], #8 \n"
144 "smlad %[res], r1, r5, %[res] \n"
145 "ldrd r4, [%[f2]], #8 \n"
146 "ldrd r2, [%[v1], #8] \n"
147 "sadd16 r0, r0, r6 \n"
148 "sadd16 r1, r1, r7 \n"
149 "strd r0, [%[v1]], #8 \n"

        REPEAT_BLOCK(
        "smlad %[res], r2, r4, %[res] \n"
        "ldrd r6, [%[s2]], #8 \n"
        "smlad %[res], r3, r5, %[res] \n"
        "ldrd r4, [%[f2]], #8 \n"
        "ldrd r0, [%[v1], #8] \n"
        "sadd16 r2, r2, r6 \n"
        "sadd16 r3, r3, r7 \n"
        "strd r2, [%[v1]], #8 \n"
        "smlad %[res], r0, r4, %[res] \n"
        "ldrd r6, [%[s2]], #8 \n"
        "smlad %[res], r1, r5, %[res] \n"
        "ldrd r4, [%[f2]], #8 \n"
        "ldrd r2, [%[v1], #8] \n"
        "sadd16 r0, r0, r6 \n"
        "sadd16 r1, r1, r7 \n"
        "strd r0, [%[v1]], #8 \n"
        )
170 "smlad %[res], r2, r4, %[res] \n"
171 "ldrd r6, [%[s2]], #8 \n"
172 "smlad %[res], r3, r5, %[res] \n"
173 #if ORDER > 32
174 "subs %[cnt], %[cnt], #1 \n"
175 "ldrned r4, [%[f2]], #8 \n"
176 "ldrned r0, [%[v1], #8] \n"
177 "sadd16 r2, r2, r6 \n"
178 "sadd16 r3, r3, r7 \n"
179 "strd r2, [%[v1]], #8 \n"
180 "bne 1b \n"
181 #else
182 "sadd16 r2, r2, r6 \n"
183 "sadd16 r3, r3, r7 \n"
184 "strd r2, [%[v1]], #8 \n"
185 #endif
187 "99: \n"
188 : /* outputs */
189 #if ORDER > 32
190 [cnt]"+r"(cnt),
191 #endif
192 [v1] "+r"(v1),
193 [f2] "+r"(f2),
194 [s2] "+r"(s2),
195 [res]"=r"(res)
196 : /* inputs */
197 : /* clobbers */
198 "r0", "r1", "r2", "r3", "r4",
199 "r5", "r6", "r7", "cc", "memory"
201 return res;
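
/* Added for illustration (not part of the upstream file): a plain-C sketch of
 * what vector_sp_add() computes, assuming ORDER is the vector length. It
 * returns the scalar product of v1 and f2, then adds s2 to v1 in place.
 * Unlike the sadd16-based assembly, the additions here rely on ordinary
 * int16_t wrap-around; the function name is hypothetical. */
#if 0 /* reference sketch only */
static inline int32_t vector_sp_add_ref(int16_t* v1, int16_t* f2, int16_t* s2)
{
    int32_t res = 0;
    int i;
    for (i = 0; i < ORDER; i++)
    {
        res += (int32_t)v1[i] * f2[i]; /* accumulate the scalar product */
        v1[i] += s2[i];                /* then add the second vector    */
    }
    return res;
}
#endif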

/* Calculate scalarproduct, then subtract a 2nd vector (fused for performance)
 * This version fetches data as 32 bit words, and *requires* v1 to be
 * 32 bit aligned. It also requires that f2 and s2 are either both 32 bit
 * aligned or both unaligned. If either condition isn't met, it will either
 * result in a data abort or incorrect results. */
static inline int32_t vector_sp_sub(int16_t* v1, int16_t* f2, int16_t* s2)
{
    int res;
#if ORDER > 32
    int cnt = ORDER>>5;
#endif

    asm volatile (
#if ORDER > 32
        "mov %[res], #0 \n"
#endif
        "tst %[f2], #2 \n"
        "beq 20f \n"
223 "10: \n"
224 "ldrh r3, [%[f2]], #2 \n"
225 "ldrh r6, [%[s2]], #2 \n"
226 "ldmia %[f2]!, {r2,r4} \n"
227 "mov r3, r3, lsl #16 \n"
228 "mov r6, r6, lsl #16 \n"
230 "1: \n"
231 "ldmia %[s2]!, {r5,r7} \n"
232 "pkhtb r3, r3, r2 \n"
233 "pkhtb r2, r2, r4 \n"
234 "ldrd r0, [%[v1]] \n"
235 "mov r5, r5, ror #16 \n"
236 "pkhtb r6, r5, r6, asr #16 \n"
237 "pkhbt r5, r5, r7, lsl #16 \n"
238 #if ORDER > 32
239 "smladx %[res], r0, r3, %[res] \n"
240 #else
241 "smuadx %[res], r0, r3 \n"
242 #endif
243 "smladx %[res], r1, r2, %[res] \n"
244 "ldmia %[f2]!, {r2,r3} \n"
245 "ssub16 r0, r0, r6 \n"
246 "ssub16 r1, r1, r5 \n"
247 "strd r0, [%[v1]], #8 \n"

        REPEAT_BLOCK(
        "ldmia %[s2]!, {r5,r6} \n"
        "pkhtb r4, r4, r2 \n"
        "pkhtb r2, r2, r3 \n"
        "ldrd r0, [%[v1]] \n"
        "mov r5, r5, ror #16 \n"
        "pkhtb r7, r5, r7, asr #16 \n"
        "pkhbt r5, r5, r6, lsl #16 \n"
        "smladx %[res], r0, r4, %[res] \n"
        "smladx %[res], r1, r2, %[res] \n"
        "ldmia %[f2]!, {r2,r4} \n"
        "ssub16 r0, r0, r7 \n"
        "ssub16 r1, r1, r5 \n"
        "strd r0, [%[v1]], #8 \n"
        "ldmia %[s2]!, {r5,r7} \n"
        "pkhtb r3, r3, r2 \n"
        "pkhtb r2, r2, r4 \n"
        "ldrd r0, [%[v1]] \n"
        "mov r5, r5, ror #16 \n"
        "pkhtb r6, r5, r6, asr #16 \n"
        "pkhbt r5, r5, r7, lsl #16 \n"
        "smladx %[res], r0, r3, %[res] \n"
        "smladx %[res], r1, r2, %[res] \n"
        "ldmia %[f2]!, {r2,r3} \n"
        "ssub16 r0, r0, r6 \n"
        "ssub16 r1, r1, r5 \n"
        "strd r0, [%[v1]], #8 \n"
        )
278 "ldmia %[s2]!, {r5,r6} \n"
279 "pkhtb r4, r4, r2 \n"
280 "pkhtb r2, r2, r3 \n"
281 "ldrd r0, [%[v1]] \n"
282 "mov r5, r5, ror #16 \n"
283 "pkhtb r7, r5, r7, asr #16 \n"
284 "pkhbt r5, r5, r6, lsl #16 \n"
285 "smladx %[res], r0, r4, %[res] \n"
286 "smladx %[res], r1, r2, %[res] \n"
287 #if ORDER > 32
288 "subs %[cnt], %[cnt], #1 \n"
289 "ldmneia %[f2]!, {r2,r4} \n"
290 "ssub16 r0, r0, r7 \n"
291 "ssub16 r1, r1, r5 \n"
292 "strd r0, [%[v1]], #8 \n"
293 "bne 1b \n"
294 #else
295 "ssub16 r0, r0, r7 \n"
296 "ssub16 r1, r1, r5 \n"
297 "strd r0, [%[v1]], #8 \n"
298 #endif
300 "b 99f \n"
302 "20: \n"
303 "ldrd r4, [%[f2]], #8 \n"
304 "ldrd r0, [%[v1]] \n"
306 #if ORDER > 32
307 "1: \n"
308 "smlad %[res], r0, r4, %[res] \n"
309 #else
310 "smuad %[res], r0, r4 \n"
311 #endif
312 "ldrd r6, [%[s2]], #8 \n"
313 "smlad %[res], r1, r5, %[res] \n"
314 "ldrd r4, [%[f2]], #8 \n"
315 "ldrd r2, [%[v1], #8] \n"
316 "ssub16 r0, r0, r6 \n"
317 "ssub16 r1, r1, r7 \n"
318 "strd r0, [%[v1]], #8 \n"

        REPEAT_BLOCK(
        "smlad %[res], r2, r4, %[res] \n"
        "ldrd r6, [%[s2]], #8 \n"
        "smlad %[res], r3, r5, %[res] \n"
        "ldrd r4, [%[f2]], #8 \n"
        "ldrd r0, [%[v1], #8] \n"
        "ssub16 r2, r2, r6 \n"
        "ssub16 r3, r3, r7 \n"
        "strd r2, [%[v1]], #8 \n"
        "smlad %[res], r0, r4, %[res] \n"
        "ldrd r6, [%[s2]], #8 \n"
        "smlad %[res], r1, r5, %[res] \n"
        "ldrd r4, [%[f2]], #8 \n"
        "ldrd r2, [%[v1], #8] \n"
        "ssub16 r0, r0, r6 \n"
        "ssub16 r1, r1, r7 \n"
        "strd r0, [%[v1]], #8 \n"
        )
339 "smlad %[res], r2, r4, %[res] \n"
340 "ldrd r6, [%[s2]], #8 \n"
341 "smlad %[res], r3, r5, %[res] \n"
342 #if ORDER > 32
343 "subs %[cnt], %[cnt], #1 \n"
344 "ldrned r4, [%[f2]], #8 \n"
345 "ldrned r0, [%[v1], #8] \n"
346 "ssub16 r2, r2, r6 \n"
347 "ssub16 r3, r3, r7 \n"
348 "strd r2, [%[v1]], #8 \n"
349 "bne 1b \n"
350 #else
351 "ssub16 r2, r2, r6 \n"
352 "ssub16 r3, r3, r7 \n"
353 "strd r2, [%[v1]], #8 \n"
354 #endif
356 "99: \n"
357 : /* outputs */
358 #if ORDER > 32
359 [cnt]"+r"(cnt),
360 #endif
361 [v1] "+r"(v1),
362 [f2] "+r"(f2),
363 [s2] "+r"(s2),
364 [res]"=r"(res)
365 : /* inputs */
366 : /* clobbers */
367 "r0", "r1", "r2", "r3", "r4",
368 "r5", "r6", "r7", "cc", "memory"
370 return res;
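
/* Added for illustration (not part of the upstream file): vector_sp_sub()
 * mirrors vector_sp_add(), using ssub16 instead of sadd16. A plain-C sketch
 * with a hypothetical name, assuming ORDER is the vector length: */
#if 0 /* reference sketch only */
static inline int32_t vector_sp_sub_ref(int16_t* v1, int16_t* f2, int16_t* s2)
{
    int32_t res = 0;
    int i;
    for (i = 0; i < ORDER; i++)
    {
        res += (int32_t)v1[i] * f2[i]; /* accumulate the scalar product   */
        v1[i] -= s2[i];                /* then subtract the second vector */
    }
    return res;
}
#endif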

/* This version fetches data as 32 bit words, and *requires* v1 to be
 * 32 bit aligned, otherwise it will result either in a data abort, or
 * incorrect results (if ARM alignment checking is disabled). */
static inline int32_t scalarproduct(int16_t* v1, int16_t* v2)
{
    int res;
#if ORDER > 32
    int cnt = ORDER>>5;
#endif

    asm volatile (
#if ORDER > 32
        "mov %[res], #0 \n"
#endif
        "tst %[v2], #2 \n"
        "beq 20f \n"
390 "10: \n"
391 "bic %[v2], %[v2], #2 \n"
392 "ldmia %[v2]!, {r5-r7} \n"
393 "ldrd r0, [%[v1]], #8 \n"
395 "1: \n"
396 "pkhtb r3, r5, r6 \n"
397 "ldrd r4, [%[v2]], #8 \n"
398 #if ORDER > 32
399 "smladx %[res], r0, r3, %[res] \n"
400 #else
401 "smuadx %[res], r0, r3 \n"
402 #endif
        REPEAT_BLOCK(
        "pkhtb r0, r6, r7 \n"
        "ldrd r2, [%[v1]], #8 \n"
        "smladx %[res], r1, r0, %[res] \n"
        "pkhtb r1, r7, r4 \n"
        "ldrd r6, [%[v2]], #8 \n"
        "smladx %[res], r2, r1, %[res] \n"
        "pkhtb r2, r4, r5 \n"
        "ldrd r0, [%[v1]], #8 \n"
        "smladx %[res], r3, r2, %[res] \n"
        "pkhtb r3, r5, r6 \n"
        "ldrd r4, [%[v2]], #8 \n"
        "smladx %[res], r0, r3, %[res] \n"
        )
418 "pkhtb r0, r6, r7 \n"
419 "ldrd r2, [%[v1]], #8 \n"
420 "smladx %[res], r1, r0, %[res] \n"
421 "pkhtb r1, r7, r4 \n"
422 #if ORDER > 32
423 "subs %[cnt], %[cnt], #1 \n"
424 "ldrned r6, [%[v2]], #8 \n"
425 "smladx %[res], r2, r1, %[res] \n"
426 "pkhtb r2, r4, r5 \n"
427 "ldrned r0, [%[v1]], #8 \n"
428 "smladx %[res], r3, r2, %[res] \n"
429 "bne 1b \n"
430 #else
431 "pkhtb r4, r4, r5 \n"
432 "smladx %[res], r2, r1, %[res] \n"
433 "smladx %[res], r3, r4, %[res] \n"
434 #endif
436 "b 99f \n"
438 "20: \n"
439 "ldrd r0, [%[v1]], #8 \n"
440 "ldmia %[v2]!, {r5-r7} \n"
442 "1: \n"
443 "ldrd r2, [%[v1]], #8 \n"
444 #if ORDER > 32
445 "smlad %[res], r0, r5, %[res] \n"
446 #else
447 "smuad %[res], r0, r5 \n"
448 #endif
        REPEAT_BLOCK(
        "ldrd r4, [%[v2]], #8 \n"
        "smlad %[res], r1, r6, %[res] \n"
        "ldrd r0, [%[v1]], #8 \n"
        "smlad %[res], r2, r7, %[res] \n"
        "ldrd r6, [%[v2]], #8 \n"
        "smlad %[res], r3, r4, %[res] \n"
        "ldrd r2, [%[v1]], #8 \n"
        "smlad %[res], r0, r5, %[res] \n"
        )

#if ORDER > 32
        "ldrd r4, [%[v2]], #8 \n"
        "smlad %[res], r1, r6, %[res] \n"
        "subs %[cnt], %[cnt], #1 \n"
        "ldrned r0, [%[v1]], #8 \n"
        "smlad %[res], r2, r7, %[res] \n"
        "ldrned r6, [%[v2]], #8 \n"
        "smlad %[res], r3, r4, %[res] \n"
        "bne 1b \n"
#else
        "ldr r4, [%[v2]], #4 \n"
        "smlad %[res], r1, r6, %[res] \n"
        "smlad %[res], r2, r7, %[res] \n"
        "smlad %[res], r3, r4, %[res] \n"
#endif
476 "99: \n"
477 : /* outputs */
478 #if ORDER > 32
479 [cnt]"+r"(cnt),
480 #endif
481 [v1] "+r"(v1),
482 [v2] "+r"(v2),
483 [res]"=r"(res)
484 : /* inputs */
485 : /* clobbers */
486 "r0", "r1", "r2", "r3",
487 "r4", "r5", "r6", "r7", "cc", "memory"
489 return res;
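
/* Added for illustration (not part of the upstream file): a plain-C sketch of
 * the scalar product the assembly above computes, assuming ORDER is the
 * vector length; the function name is hypothetical. */
#if 0 /* reference sketch only */
static inline int32_t scalarproduct_ref(int16_t* v1, int16_t* v2)
{
    int32_t res = 0;
    int i;
    for (i = 0; i < ORDER; i++)
        res += (int32_t)v1[i] * v2[i]; /* multiply-accumulate, as smlad/smuad do pairwise */
    return res;
}
#endif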