/*

libdemac - A Monkey's Audio decoder

$Id$

Copyright (C) Dave Chapman 2007

Coldfire vector math copyright (C) 2007 Jens Arnold

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA

*/
/* This version fetches data as 32 bit words, and *recommends* v1 to be
 * 32 bit aligned, otherwise performance will suffer. */
static inline void vector_add(int16_t* v1, int16_t* v2)
{
#if ORDER > 16
    int cnt = ORDER>>4;
#endif
#define ADDHALFREGS(s1, sum)            /* Add register halves straight. */ \
        "move.l " #s1 ", %%d4    \n"    /* 's1' can be an A or D reg.    */ \
        "add.l  " #sum ", " #s1 "\n"    /* 'sum' must be a D reg.        */ \
        "clr.w    %%d4           \n"    /* 's1' and %%d4 are clobbered!  */ \
        "add.l    %%d4 , " #sum "\n"                                        \
        "move.w " #s1 ", " #sum "\n"

#define ADDHALFXREGS(s1, s2, sum)       /* Add register halves across.     */ \
        "clr.w  " #sum "         \n"    /* Needs 'sum' pre-swapped, swaps  */ \
        "add.l  " #s1 ", " #sum "\n"    /* 's2', and clobbers 's1'.        */ \
        "swap   " #s2 "          \n"    /* 's1' can be an A or D reg.      */ \
        "add.l  " #s2 ", " #s1  "\n"    /* 'sum' and 's2' must be D regs.  */ \
        "move.w " #s1 ", " #sum "\n"

    asm volatile (
        "move.l  %[v2], %%d0        \n"
        "and.l   #2, %%d0           \n"
        "jeq     20f                \n"

    "10:                            \n"
        "move.w  (%[v2])+, %%d0     \n"
        "swap    %%d0               \n"
    "1:                             \n"
        "movem.l (%[v1]), %%a0-%%a3 \n"
        "movem.l (%[v2]), %%d1-%%d4 \n"
        ADDHALFXREGS(%%a0, %%d1, %%d0)
        "move.l  %%d0, (%[v1])+     \n"
        ADDHALFXREGS(%%a1, %%d2, %%d1)
        "move.l  %%d1, (%[v1])+     \n"
        ADDHALFXREGS(%%a2, %%d3, %%d2)
        "move.l  %%d2, (%[v1])+     \n"
        ADDHALFXREGS(%%a3, %%d4, %%d3)
        "move.l  %%d3, (%[v1])+     \n"
        "lea.l   (16, %[v2]), %[v2] \n"
        "move.l  %%d4, %%d0         \n"

        "movem.l (%[v1]), %%a0-%%a3 \n"
        "movem.l (%[v2]), %%d1-%%d4 \n"
        ADDHALFXREGS(%%a0, %%d1, %%d0)
        "move.l  %%d0, (%[v1])+     \n"
        ADDHALFXREGS(%%a1, %%d2, %%d1)
        "move.l  %%d1, (%[v1])+     \n"
        ADDHALFXREGS(%%a2, %%d3, %%d2)
        "move.l  %%d2, (%[v1])+     \n"
        ADDHALFXREGS(%%a3, %%d4, %%d3)
        "move.l  %%d3, (%[v1])+     \n"
#if ORDER > 16
        "lea.l   (16, %[v2]), %[v2] \n"
        "move.l  %%d4, %%d0         \n"

        "subq.l  #1, %[cnt]         \n"
        "jne     1b                 \n"
#endif
        "jra     99f                \n"
90 "20: \n"
91 "1: \n"
92 "movem.l (%[v2]), %%a0-%%a3 \n"
93 "movem.l (%[v1]), %%d0-%%d3 \n"
94 ADDHALFREGS(%%a0, %%d0)
95 "move.l %%d0, (%[v1])+ \n"
96 ADDHALFREGS(%%a1, %%d1)
97 "move.l %%d1, (%[v1])+ \n"
98 ADDHALFREGS(%%a2, %%d2)
99 "move.l %%d2, (%[v1])+ \n"
100 ADDHALFREGS(%%a3, %%d3)
101 "move.l %%d3, (%[v1])+ \n"
102 "lea.l (16, %[v2]), %[v2] \n"
104 "movem.l (%[v2]), %%a0-%%a3 \n"
105 "movem.l (%[v1]), %%d0-%%d3 \n"
106 ADDHALFREGS(%%a0, %%d0)
107 "move.l %%d0, (%[v1])+ \n"
108 ADDHALFREGS(%%a1, %%d1)
109 "move.l %%d1, (%[v1])+ \n"
110 ADDHALFREGS(%%a2, %%d2)
111 "move.l %%d2, (%[v1])+ \n"
112 ADDHALFREGS(%%a3, %%d3)
113 "move.l %%d3, (%[v1])+ \n"
114 #if ORDER > 16
115 "lea.l (16, %[v2]), %[v2] \n"
117 "subq.l #1, %[cnt] \n"
118 "jne 1b \n"
119 #endif
120 "99: \n"
        : /* outputs */
#if ORDER > 16
        [cnt]"+d"(cnt),
#endif
        [v1] "+a"(v1),
        [v2] "+a"(v2)
        : /* inputs */
        : /* clobbers */
        "d0", "d1", "d2", "d3", "d4",
        "a0", "a1", "a2", "a3", "memory"
    );
}

/* This version fetches data as 32 bit words, and *recommends* v1 to be
 * 32 bit aligned, otherwise performance will suffer. */
static inline void vector_sub(int16_t* v1, int16_t* v2)
{
#if ORDER > 16
    int cnt = ORDER>>4;
#endif
#define SUBHALFREGS(min, sub, dif)      /* Subtract register halves straight. */ \
        "move.l " #min ", " #dif "\n"   /* 'min' can be an A or D reg      */ \
        "sub.l  " #sub ", " #min "\n"   /* 'sub' and 'dif' must be D regs  */ \
        "clr.w  " #sub "          \n"   /* 'min' and 'sub' are clobbered!  */ \
        "sub.l  " #sub ", " #dif "\n"                                         \
        "move.w " #min ", " #dif "\n"

#define SUBHALFXREGS(min, s2, s1d)      /* Subtract register halves across. */ \
        "clr.w  " #s1d "          \n"   /* Needs 's1d' pre-swapped, swaps   */ \
        "sub.l  " #s1d ", " #min "\n"   /* 's2' and clobbers 'min'.         */ \
        "move.l " #min ", " #s1d "\n"   /* 'min' can be an A or D reg,      */ \
        "swap   " #s2 "           \n"   /* 's2' and 's1d' must be D regs.   */ \
        "sub.l  " #s2 ", " #min " \n"                                          \
        "move.w " #min ", " #s1d "\n"

    asm volatile (
        "move.l  %[v2], %%d0        \n"
        "and.l   #2, %%d0           \n"
        "jeq     20f                \n"

    "10:                            \n"
        "move.w  (%[v2])+, %%d0     \n"
        "swap    %%d0               \n"
    "1:                             \n"
        "movem.l (%[v2]), %%d1-%%d4 \n"
        "movem.l (%[v1]), %%a0-%%a3 \n"
        SUBHALFXREGS(%%a0, %%d1, %%d0)
        "move.l  %%d0, (%[v1])+     \n"
        SUBHALFXREGS(%%a1, %%d2, %%d1)
        "move.l  %%d1, (%[v1])+     \n"
        SUBHALFXREGS(%%a2, %%d3, %%d2)
        "move.l  %%d2, (%[v1])+     \n"
        SUBHALFXREGS(%%a3, %%d4, %%d3)
        "move.l  %%d3, (%[v1])+     \n"
        "lea.l   (16, %[v2]), %[v2] \n"
        "move.l  %%d4, %%d0         \n"
179 "movem.l (%[v2]), %%d1-%%d4 \n"
180 "movem.l (%[v1]), %%a0-%%a3 \n"
181 SUBHALFXREGS(%%a0, %%d1, %%d0)
182 "move.l %%d0, (%[v1])+ \n"
183 SUBHALFXREGS(%%a1, %%d2, %%d1)
184 "move.l %%d1, (%[v1])+ \n"
185 SUBHALFXREGS(%%a2, %%d3, %%d2)
186 "move.l %%d2, (%[v1])+ \n"
187 SUBHALFXREGS(%%a3, %%d4, %%d3)
188 "move.l %%d3, (%[v1])+ \n"
189 #if ORDER > 16
190 "lea.l (16, %[v2]), %[v2] \n"
191 "move.l %%d4, %%d0 \n"
193 "subq.l #1, %[cnt] \n"
194 "bne.w 1b \n"
195 #endif
196 "jra 99f \n"
198 "20: \n"
199 "1: \n"
200 "movem.l (%[v2]), %%d1-%%d4 \n"
201 "movem.l (%[v1]), %%a0-%%a3 \n"
202 SUBHALFREGS(%%a0, %%d1, %%d0)
203 "move.l %%d0, (%[v1])+ \n"
204 SUBHALFREGS(%%a1, %%d2, %%d1)
205 "move.l %%d1, (%[v1])+ \n"
206 SUBHALFREGS(%%a2, %%d3, %%d2)
207 "move.l %%d2, (%[v1])+ \n"
208 SUBHALFREGS(%%a3, %%d4, %%d3)
209 "move.l %%d3, (%[v1])+ \n"
210 "lea.l (16, %[v2]), %[v2] \n"
212 "movem.l (%[v2]), %%d1-%%d4 \n"
213 "movem.l (%[v1]), %%a0-%%a3 \n"
214 SUBHALFREGS(%%a0, %%d1, %%d0)
215 "move.l %%d0, (%[v1])+ \n"
216 SUBHALFREGS(%%a1, %%d2, %%d1)
217 "move.l %%d1, (%[v1])+ \n"
218 SUBHALFREGS(%%a2, %%d3, %%d2)
219 "move.l %%d2, (%[v1])+ \n"
220 SUBHALFREGS(%%a3, %%d4, %%d3)
221 "move.l %%d3, (%[v1])+ \n"
222 #if ORDER > 16
223 "lea.l (16, %[v2]), %[v2] \n"
225 "subq.l #1, %[cnt] \n"
226 "bne.w 1b \n"
227 #endif
229 "99: \n"
        : /* outputs */
#if ORDER > 16
        [cnt]"+d"(cnt),
#endif
        [v1] "+a"(v1),
        [v2] "+a"(v2)
        : /* inputs */
        : /* clobbers */
        "d0", "d1", "d2", "d3", "d4",
        "a0", "a1", "a2", "a3", "memory"
    );
}

#define PREPARE_SCALARPRODUCT coldfire_set_macsr(0); /* signed integer mode */
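
/* Illustrative usage sketch (hypothetical caller, not part of this file):
 * the EMAC unit must be switched to signed integer mode once before
 * scalarproduct() below is used.  'history' and 'coeffs' are assumed to be
 * ORDER-element int16_t buffers. */
#if 0
    PREPARE_SCALARPRODUCT                        /* coldfire_set_macsr(0); */
    int32_t dot = scalarproduct(history, coeffs);
#endif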

/* This version fetches data as 32 bit words, and *recommends* v1 to be
 * 32 bit aligned, otherwise performance will suffer. It also needs EMAC
 * in signed integer mode - call above macro before use. */
static inline int32_t scalarproduct(int16_t* v1, int16_t* v2)
{
    int res;
#if ORDER > 32
    int cnt = ORDER>>5;
#endif

#if ORDER > 16
#define MAC_BLOCKS "7"
#else
#define MAC_BLOCKS "3"
#endif

    asm volatile (
        "move.l  %[v2], %%d0                         \n"
        "and.l   #2, %%d0                            \n"
        "jeq     20f                                 \n"

    "10:                                             \n"
        "move.l  (%[v1])+, %%d0                      \n"
        "move.w  (%[v2])+, %%d1                      \n"
    "1:                                              \n"
        ".rept " MAC_BLOCKS "                        \n"
        "mac.w   %%d0u, %%d1l, (%[v2])+, %%d1, %%acc0\n"
        "mac.w   %%d0l, %%d1u, (%[v1])+, %%d0, %%acc0\n"
        "mac.w   %%d0u, %%d1l, (%[v2])+, %%d1, %%acc0\n"
        "mac.w   %%d0l, %%d1u, (%[v1])+, %%d0, %%acc0\n"
        ".endr                                       \n"

        "mac.w   %%d0u, %%d1l, (%[v2])+, %%d1, %%acc0\n"
        "mac.w   %%d0l, %%d1u, (%[v1])+, %%d0, %%acc0\n"
        "mac.w   %%d0u, %%d1l, (%[v2])+, %%d1, %%acc0\n"
#if ORDER > 32
        "mac.w   %%d0l, %%d1u, (%[v1])+, %%d0, %%acc0\n"
        "subq.l  #1, %[res]                          \n"
        "bne.w   1b                                  \n"
#else
        "mac.w   %%d0l, %%d1u, %%acc0                \n"
#endif
        "jra     99f                                 \n"
289 "20: \n"
290 "move.l (%[v1])+, %%d0 \n"
291 "move.l (%[v2])+, %%d1 \n"
292 "1: \n"
293 ".rept " MAC_BLOCKS "\n"
294 "mac.w %%d0u, %%d1u, (%[v1])+, %%d2, %%acc0\n"
295 "mac.w %%d0l, %%d1l, (%[v2])+, %%d1, %%acc0\n"
296 "mac.w %%d2u, %%d1u, (%[v1])+, %%d0, %%acc0\n"
297 "mac.w %%d2l, %%d1l, (%[v2])+, %%d1, %%acc0\n"
298 ".endr \n"
300 "mac.w %%d0u, %%d1u, (%[v1])+, %%d2, %%acc0\n"
301 "mac.w %%d0l, %%d1l, (%[v2])+, %%d1, %%acc0\n"
302 #if ORDER > 32
303 "mac.w %%d2u, %%d1u, (%[v1])+, %%d0, %%acc0\n"
304 "mac.w %%d2l, %%d1l, (%[v2])+, %%d1, %%acc0\n"
305 "subq.l #1, %[res] \n"
306 "bne.w 1b \n"
307 #else
308 "mac.w %%d2u, %%d1u, %%acc0 \n"
309 "mac.w %%d2l, %%d1l, %%acc0 \n"
310 #endif
312 "99: \n"
313 "movclr.l %%acc0, %[res] \n"
314 : /* outputs */
315 [v1]"+a"(v1),
316 [v2]"+a"(v2),
317 [res]"=d"(res)
318 : /* inputs */
319 #if ORDER > 32
320 [cnt]"[res]"(cnt)
321 #endif
322 : /* clobbers */
323 "d0", "d1", "d2"
325 return res;
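
/* For reference only (not part of the original libdemac code): a minimal
 * plain-C sketch of the dot product computed above, assuming ORDER is a
 * multiple of 16 (32 when ORDER > 32) and ignoring the extra headroom the
 * EMAC's wider accumulator provides before the 32-bit movclr.l result is
 * read back.  The name scalarproduct_ref is illustrative only. */
#if 0
static inline int32_t scalarproduct_ref(int16_t* v1, int16_t* v2)
{
    int32_t res = 0;
    for (int i = 0; i < ORDER; i++)   /* sum of pairwise products, as acc0 */
        res += v1[i] * v2[i];
    return res;
}
#endif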