/*

libdemac - A Monkey's Audio decoder

$Id$

Copyright (C) Dave Chapman 2007

ARMv6 vector math copyright (C) 2008 Jens Arnold

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA

*/
27 /* This version fetches data as 32 bit words, and *requires* v1 to be
28 * 32 bit aligned, otherwise it will result either in a data abort, or
29 * incorrect results (if ARM aligncheck is disabled). */
30 static inline void vector_add(int16_t* v1, int16_t* v2)
32 #if ORDER > 32
33 int cnt = ORDER>>5;
34 #endif
36 #if ORDER > 16
37 #define ADD_SUB_BLOCKS "4"
38 #else
39 #define ADD_SUB_BLOCKS "2"
40 #endif
42 asm volatile (
43 "tst %[v2], #2 \n"
44 "beq 20f \n"
46 "10: \n"
47 "ldrh r4, [%[v2]], #2 \n"
48 "ldr r5, [%[v2]], #4 \n"
49 "mov r4, r4, lsl #16 \n"
50 "1: \n"
51 ".rept " ADD_SUB_BLOCKS "\n"
52 "ldmia %[v2]!, {r6-r7} \n"
53 "ldmia %[v1], {r0-r3} \n"
54 "mov r5, r5, ror #16 \n"
55 "pkhtb r4, r5, r4, asr #16 \n"
56 "sadd16 r0, r0, r4 \n"
57 "pkhbt r5, r5, r6, lsl #16 \n"
58 "sadd16 r1, r1, r5 \n"
59 "ldmia %[v2]!, {r4-r5} \n"
60 "mov r7, r7, ror #16 \n"
61 "pkhtb r6, r7, r6, asr #16 \n"
62 "sadd16 r2, r2, r6 \n"
63 "pkhbt r7, r7, r4, lsl #16 \n"
64 "sadd16 r3, r3, r7 \n"
65 "stmia %[v1]!, {r0-r3} \n"
66 ".endr \n"
67 #if ORDER > 32
68 "subs %[cnt], %[cnt], #1 \n"
69 "bne 1b \n"
70 #endif
71 "b 99f \n"
73 "20: \n"
74 "1: \n"
75 ".rept " ADD_SUB_BLOCKS "\n"
76 "ldmia %[v2]!, {r4-r7} \n"
77 "ldmia %[v1], {r0-r3} \n"
78 "sadd16 r0, r0, r4 \n"
79 "sadd16 r1, r1, r5 \n"
80 "sadd16 r2, r2, r6 \n"
81 "sadd16 r3, r3, r7 \n"
82 "stmia %[v1]!, {r0-r3} \n"
83 ".endr \n"
84 #if ORDER > 32
85 "subs %[cnt], %[cnt], #1 \n"
86 "bne 1b \n"
87 #endif
89 "99: \n"
90 : /* outputs */
91 #if ORDER > 32
92 [cnt]"+r"(cnt),
93 #endif
94 [v1] "+r"(v1),
95 [v2] "+r"(v2)
96 : /* inputs */
97 : /* clobbers */
98 "r0", "r1", "r2", "r3", "r4",
99 "r5", "r6", "r7", "memory"
103 /* This version fetches data as 32 bit words, and *requires* v1 to be
104 * 32 bit aligned, otherwise it will result either in a data abort, or
105 * incorrect results (if ARM aligncheck is disabled). */
106 static inline void vector_sub(int16_t* v1, int16_t* v2)
108 #if ORDER > 32
109 int cnt = ORDER>>5;
110 #endif
112 asm volatile (
113 "tst %[v2], #2 \n"
114 "beq 20f \n"
116 "10: \n"
117 "ldrh r4, [%[v2]], #2 \n"
118 "ldr r5, [%[v2]], #4 \n"
119 "mov r4, r4, lsl #16 \n"
120 "1: \n"
121 ".rept " ADD_SUB_BLOCKS "\n"
122 "ldmia %[v2]!, {r6-r7} \n"
123 "ldmia %[v1], {r0-r3} \n"
124 "mov r5, r5, ror #16 \n"
125 "pkhtb r4, r5, r4, asr #16 \n"
126 "ssub16 r0, r0, r4 \n"
127 "pkhbt r5, r5, r6, lsl #16 \n"
128 "ssub16 r1, r1, r5 \n"
129 "ldmia %[v2]!, {r4-r5} \n"
130 "mov r7, r7, ror #16 \n"
131 "pkhtb r6, r7, r6, asr #16 \n"
132 "ssub16 r2, r2, r6 \n"
133 "pkhbt r7, r7, r4, lsl #16 \n"
134 "ssub16 r3, r3, r7 \n"
135 "stmia %[v1]!, {r0-r3} \n"
136 ".endr \n"
137 #if ORDER > 32
138 "subs %[cnt], %[cnt], #1 \n"
139 "bne 1b \n"
140 #endif
141 "b 99f \n"
143 "20: \n"
144 "1: \n"
145 ".rept " ADD_SUB_BLOCKS "\n"
146 "ldmia %[v2]!, {r4-r7} \n"
147 "ldmia %[v1], {r0-r3} \n"
148 "ssub16 r0, r0, r4 \n"
149 "ssub16 r1, r1, r5 \n"
150 "ssub16 r2, r2, r6 \n"
151 "ssub16 r3, r3, r7 \n"
152 "stmia %[v1]!, {r0-r3} \n"
153 ".endr \n"
154 #if ORDER > 32
155 "subs %[cnt], %[cnt], #1 \n"
156 "bne 1b \n"
157 #endif
159 "99: \n"
160 : /* outputs */
161 #if ORDER > 32
162 [cnt]"+r"(cnt),
163 #endif
164 [v1] "+r"(v1),
165 [v2] "+r"(v2)
166 : /* inputs */
167 : /* clobbers */
168 "r0", "r1", "r2", "r3", "r4",
169 "r5", "r6", "r7", "memory"
173 /* This version fetches data as 32 bit words, and *requires* v1 to be
174 * 32 bit aligned, otherwise it will result either in a data abort, or
175 * incorrect results (if ARM aligncheck is disabled). */
176 static inline int32_t scalarproduct(int16_t* v1, int16_t* v2)
178 int res;
179 #if ORDER > 32
180 int cnt = ORDER>>5;
181 #endif
183 #if ORDER > 16
184 #define MLA_BLOCKS "3"
185 #else
186 #define MLA_BLOCKS "1"
187 #endif
189 asm volatile (
190 #if ORDER > 32
191 "mov %[res], #0 \n"
192 #endif
193 "tst %[v2], #2 \n"
194 "beq 20f \n"
196 "10: \n"
197 "ldrh r7, [%[v2]], #2 \n"
198 "ldmia %[v2]!, {r4-r5} \n"
199 "ldmia %[v1]!, {r0-r1} \n"
200 #if ORDER > 32
201 "mov r7, r7, lsl #16 \n"
202 "1: \n"
203 "pkhbt r8, r4, r7 \n"
204 "ldmia %[v2]!, {r6-r7} \n"
205 "smladx %[res], r0, r8, %[res] \n"
206 #else
207 "pkhbt r8, r4, r7, lsl #16 \n"
208 "ldmia %[v2]!, {r6-r7} \n"
209 "smuadx %[res], r0, r8 \n"
210 #endif
211 ".rept " MLA_BLOCKS "\n"
212 "pkhbt r8, r5, r4 \n"
213 "ldmia %[v1]!, {r2-r3} \n"
214 "smladx %[res], r1, r8, %[res] \n"
215 "pkhbt r8, r6, r5 \n"
216 "ldmia %[v2]!, {r4-r5} \n"
217 "smladx %[res], r2, r8, %[res] \n"
218 "pkhbt r8, r7, r6 \n"
219 "ldmia %[v1]!, {r0-r1} \n"
220 "smladx %[res], r3, r8, %[res] \n"
221 "pkhbt r8, r4, r7 \n"
222 "ldmia %[v2]!, {r6-r7} \n"
223 "smladx %[res], r0, r8, %[res] \n"
224 ".endr \n"
226 "pkhbt r8, r5, r4 \n"
227 "ldmia %[v1]!, {r2-r3} \n"
228 "smladx %[res], r1, r8, %[res] \n"
229 "pkhbt r8, r6, r5 \n"
230 #if ORDER > 32
231 "subs %[cnt], %[cnt], #1 \n"
232 "ldmneia %[v2]!, {r4-r5} \n"
233 "smladx %[res], r2, r8, %[res] \n"
234 "pkhbt r8, r7, r6 \n"
235 "ldmneia %[v1]!, {r0-r1} \n"
236 "smladx %[res], r3, r8, %[res] \n"
237 "bne 1b \n"
238 #else
239 "pkhbt r7, r7, r6 \n"
240 "smladx %[res], r2, r8, %[res] \n"
241 "smladx %[res], r3, r7, %[res] \n"
242 #endif
243 "b 99f \n"
245 "20: \n"
246 "ldmia %[v1]!, {r0-r1} \n"
247 "ldmia %[v2]!, {r5-r7} \n"
248 "1: \n"
249 "ldmia %[v1]!, {r2-r3} \n"
250 #if ORDER > 32
251 "smlad %[res], r0, r5, %[res] \n"
252 #else
253 "smuad %[res], r0, r5 \n"
254 #endif
255 ".rept " MLA_BLOCKS "\n"
256 "ldmia %[v2]!, {r4-r5} \n"
257 "smlad %[res], r1, r6, %[res] \n"
258 "ldmia %[v1]!, {r0-r1} \n"
259 "smlad %[res], r2, r7, %[res] \n"
260 "ldmia %[v2]!, {r6-r7} \n"
261 "smlad %[res], r3, r4, %[res] \n"
262 "ldmia %[v1]!, {r2-r3} \n"
263 "smlad %[res], r0, r5, %[res] \n"
264 ".endr \n"
266 "ldmia %[v2]!, {r4-r5} \n"
267 "smlad %[res], r1, r6, %[res] \n"
268 #if ORDER > 32
269 "subs %[cnt], %[cnt], #1 \n"
270 "ldmneia %[v1]!, {r0-r1} \n"
271 "smlad %[res], r2, r7, %[res] \n"
272 "ldmneia %[v2]!, {r6-r7} \n"
273 "smlad %[res], r3, r4, %[res] \n"
274 "bne 1b \n"
275 #else
276 "smlad %[res], r2, r7, %[res] \n"
277 "smlad %[res], r3, r4, %[res] \n"
278 #endif
280 "99: \n"
281 : /* outputs */
282 #if ORDER > 32
283 [cnt]"+r"(cnt),
284 #endif
285 [v1] "+r"(v1),
286 [v2] "+r"(v2),
287 [res]"=r"(res)
288 : /* inputs */
289 : /* clobbers */
290 "r0", "r1", "r2", "r3", "r4",
291 "r5", "r6", "r7", "r8"
293 return res;