Add the new default dir to clean to the manual
[kugel-rb.git] / apps / codecs / libtremor / asm_arm.h
blob9531f2165790c3a7036725b8c7ec1f586f4a76d0
1 /********************************************************************
2 * *
3 * THIS FILE IS PART OF THE OggVorbis 'TREMOR' CODEC SOURCE CODE. *
4 * *
5 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
6 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
7 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
8 * *
9 * THE OggVorbis 'TREMOR' SOURCE CODE IS (C) COPYRIGHT 1994-2002 *
10 * BY THE Xiph.Org FOUNDATION http://www.xiph.org/ *
11 * *
12 ********************************************************************
14 function: arm7 and later wide math functions
16 ********************************************************************/
18 #ifdef _ARM_ASSEM_
20 #if !defined(_V_WIDE_MATH) && !defined(_LOW_ACCURACY_)
21 #define _V_WIDE_MATH
23 static inline ogg_int32_t MULT32(ogg_int32_t x, ogg_int32_t y) {
24 int lo,hi;
25 asm volatile("smull\t%0, %1, %2, %3"
26 : "=&r"(lo),"=&r"(hi)
27 : "%r"(x),"r"(y) );
28 return(hi);
31 static inline ogg_int32_t MULT31(ogg_int32_t x, ogg_int32_t y) {
32 return MULT32(x,y)<<1;
35 static inline ogg_int32_t MULT31_SHIFT15(ogg_int32_t x, ogg_int32_t y) {
36 int lo,hi;
37 asm volatile("smull %0, %1, %2, %3\n\t"
38 "movs %0, %0, lsr #15\n\t"
39 "adc %1, %0, %1, lsl #17\n\t"
40 : "=&r"(lo),"=&r"(hi)
41 : "%r"(x),"r"(y)
42 : "cc");
43 return(hi);
46 #ifndef _V_VECT_OPS
47 #define _V_VECT_OPS
49 /* asm versions of vector operations for block.c, window.c */
50 /* SOME IMPORTANT NOTES: this implementation of vect_mult_bw does
51 NOT do a final shift, meaning that the result of vect_mult_bw is
52 only 31 bits not 32. This is so that we can do the shift in-place
53 in vect_add_xxxx instead to save one instruction for each mult on arm */
54 static inline
55 void vect_add_right_left(ogg_int32_t *x, const ogg_int32_t *y, int n)
57 /* first arg is right subframe of previous frame and second arg
58 is left subframe of current frame. overlap left onto right overwriting
59 the right subframe */
61 do{
62 asm volatile (
63 "ldmia %[x], {r0, r1, r2, r3};"
64 "ldmia %[y]!, {r4, r5, r6, r7};"
65 "add r0, r4, r0, lsl #1;"
66 "add r1, r5, r1, lsl #1;"
67 "add r2, r6, r2, lsl #1;"
68 "add r3, r7, r3, lsl #1;"
69 "stmia %[x]!, {r0, r1, r2, r3};"
70 "ldmia %[x], {r0, r1, r2, r3};"
71 "ldmia %[y]!, {r4, r5, r6, r7};"
72 "add r0, r4, r0, lsl #1;"
73 "add r1, r5, r1, lsl #1;"
74 "add r2, r6, r2, lsl #1;"
75 "add r3, r7, r3, lsl #1;"
76 "stmia %[x]!, {r0, r1, r2, r3};"
77 : [x] "+r" (x), [y] "+r" (y)
78 : : "r0", "r1", "r2", "r3",
79 "r4", "r5", "r6", "r7",
80 "memory");
81 n -= 8;
82 } while (n);
85 static inline
86 void vect_add_left_right(ogg_int32_t *x, const ogg_int32_t *y, int n)
88 /* first arg is left subframe of current frame and second arg
89 is right subframe of previous frame. overlap right onto left overwriting
90 the LEFT subframe */
91 do{
92 asm volatile (
93 "ldmia %[x], {r0, r1, r2, r3};"
94 "ldmia %[y]!, {r4, r5, r6, r7};"
95 "add r0, r0, r4, lsl #1;"
96 "add r1, r1, r5, lsl #1;"
97 "add r2, r2, r6, lsl #1;"
98 "add r3, r3, r7, lsl #1;"
99 "stmia %[x]!, {r0, r1, r2, r3};"
100 "ldmia %[x], {r0, r1, r2, r3};"
101 "ldmia %[y]!, {r4, r5, r6, r7};"
102 "add r0, r0, r4, lsl #1;"
103 "add r1, r1, r5, lsl #1;"
104 "add r2, r2, r6, lsl #1;"
105 "add r3, r3, r7, lsl #1;"
106 "stmia %[x]!, {r0, r1, r2, r3};"
107 : [x] "+r" (x), [y] "+r" (y)
108 : : "r0", "r1", "r2", "r3",
109 "r4", "r5", "r6", "r7",
110 "memory");
111 n -= 8;
112 } while (n);
115 #if ARM_ARCH >= 6
116 static inline
117 void vect_mult_fw(ogg_int32_t *data, LOOKUP_T *window, int n)
119 /* Note, mult_fw uses MULT31 */
121 asm volatile (
122 "ldmia %[d], {r0, r1, r2, r3};"
123 "ldmia %[w]!, {r4, r5, r6, r7};"
124 "smmul r0, r4, r0;"
125 "smmul r1, r5, r1;"
126 "smmul r2, r6, r2;"
127 "smmul r3, r7, r3;"
128 "mov r0, r0, lsl #1;"
129 "mov r1, r1, lsl #1;"
130 "mov r2, r2, lsl #1;"
131 "mov r3, r3, lsl #1;"
132 "stmia %[d]!, {r0, r1, r2, r3};"
133 : [d] "+r" (data), [w] "+r" (window)
134 : : "r0", "r1", "r2", "r3",
135 "r4", "r5", "r6", "r7",
136 "memory" );
137 n -= 4;
138 } while (n);
140 #else
141 static inline
142 void vect_mult_fw(ogg_int32_t *data, LOOKUP_T *window, int n)
144 /* Note, mult_fw uses MULT31 */
146 asm volatile (
147 "ldmia %[d], {r0, r1, r2, r3};"
148 "ldmia %[w]!, {r4, r5, r6, r7};"
149 "smull r8, r0, r4, r0;"
150 "mov r0, r0, lsl #1;"
151 "smull r8, r1, r5, r1;"
152 "mov r1, r1, lsl #1;"
153 "smull r8, r2, r6, r2;"
154 "mov r2, r2, lsl #1;"
155 "smull r8, r3, r7, r3;"
156 "mov r3, r3, lsl #1;"
157 "stmia %[d]!, {r0, r1, r2, r3};"
158 : [d] "+r" (data), [w] "+r" (window)
159 : : "r0", "r1", "r2", "r3",
160 "r4", "r5", "r6", "r7", "r8",
161 "memory" );
162 n -= 4;
163 } while (n);
165 #endif
167 #if ARM_ARCH >= 6
168 static inline
169 void vect_mult_bw(ogg_int32_t *data, LOOKUP_T *window, int n)
171 /* NOTE mult_bw uses MULT_32 i.e. doesn't shift result left at end */
172 /* On ARM, we can do the shift at the same time as the overlap-add */
174 asm volatile ("ldmia %[d], {r0, r1, r2, r3};"
175 "ldmda %[w]!, {r4, r5, r6, r7};"
176 "smmul r0, r7, r0;"
177 "smmul r1, r6, r1;"
178 "smmul r2, r5, r2;"
179 "smmul r3, r4, r3;"
180 "stmia %[d]!, {r0, r1, r2, r3};"
181 : [d] "+r" (data), [w] "+r" (window)
182 : : "r0", "r1", "r2", "r3",
183 "r4", "r5", "r6", "r7",
184 "memory" );
185 n -= 4;
186 } while (n);
188 #else
189 static inline
190 void vect_mult_bw(ogg_int32_t *data, LOOKUP_T *window, int n)
192 /* NOTE mult_bw uses MULT_32 i.e. doesn't shift result left at end */
193 /* On ARM, we can do the shift at the same time as the overlap-add */
195 asm volatile ("ldmia %[d], {r0, r1, r2, r3};"
196 "ldmda %[w]!, {r4, r5, r6, r7};"
197 "smull r8, r0, r7, r0;"
198 "smull r7, r1, r6, r1;"
199 "smull r6, r2, r5, r2;"
200 "smull r5, r3, r4, r3;"
201 "stmia %[d]!, {r0, r1, r2, r3};"
202 : [d] "+r" (data), [w] "+r" (window)
203 : : "r0", "r1", "r2", "r3",
204 "r4", "r5", "r6", "r7", "r8",
205 "memory" );
206 n -= 4;
207 } while (n);
209 #endif
211 static inline void vect_copy(ogg_int32_t *x, const ogg_int32_t *y, int n)
213 memcpy(x,y,n*sizeof(ogg_int32_t));
216 #endif
218 #endif
219 /* not used anymore */
221 #ifndef _V_CLIP_MATH
222 #define _V_CLIP_MATH
224 static inline ogg_int32_t CLIP_TO_15(ogg_int32_t x) {
225 int tmp;
226 asm volatile("subs %1, %0, #32768\n\t"
227 "movpl %0, #0x7f00\n\t"
228 "orrpl %0, %0, #0xff\n"
229 "adds %1, %0, #32768\n\t"
230 "movmi %0, #0x8000"
231 : "+r"(x),"=r"(tmp)
233 : "cc");
234 return(x);
237 #endif
240 #ifndef _V_LSP_MATH_ASM
241 #define _V_LSP_MATH_ASM
243 static inline void lsp_loop_asm(ogg_uint32_t *qip,ogg_uint32_t *pip,
244 ogg_int32_t *qexpp,
245 ogg_int32_t *ilsp,ogg_int32_t wi,
246 ogg_int32_t m){
248 ogg_uint32_t qi=*qip,pi=*pip;
249 ogg_int32_t qexp=*qexpp;
251 asm("mov r0,%3;"
252 "movs r1,%5,asr#1;"
253 "add r0,r0,r1,lsl#3;"
254 "beq 2f;\n"
255 "1:"
257 "ldmdb r0!,{r1,r3};"
258 "subs r1,r1,%4;" //ilsp[j]-wi
259 "rsbmi r1,r1,#0;" //labs(ilsp[j]-wi)
260 "umull %0,r2,r1,%0;" //qi*=labs(ilsp[j]-wi)
262 "subs r1,r3,%4;" //ilsp[j+1]-wi
263 "rsbmi r1,r1,#0;" //labs(ilsp[j+1]-wi)
264 "umull %1,r3,r1,%1;" //pi*=labs(ilsp[j+1]-wi)
266 "cmn r2,r3;" // shift down 16?
267 "beq 0f;"
268 "add %2,%2,#16;"
269 "mov %0,%0,lsr #16;"
270 "orr %0,%0,r2,lsl #16;"
271 "mov %1,%1,lsr #16;"
272 "orr %1,%1,r3,lsl #16;"
273 "0:"
274 "cmp r0,%3;\n"
275 "bhi 1b;\n"
277 "2:"
278 // odd filter assymetry
279 "ands r0,%5,#1;\n"
280 "beq 3f;\n"
281 "add r0,%3,%5,lsl#2;\n"
283 "ldr r1,[r0,#-4];\n"
284 "mov r0,#0x4000;\n"
286 "subs r1,r1,%4;\n" //ilsp[j]-wi
287 "rsbmi r1,r1,#0;\n" //labs(ilsp[j]-wi)
288 "umull %0,r2,r1,%0;\n" //qi*=labs(ilsp[j]-wi)
289 "umull %1,r3,r0,%1;\n" //pi*=labs(ilsp[j+1]-wi)
291 "cmn r2,r3;\n" // shift down 16?
292 "beq 3f;\n"
293 "add %2,%2,#16;\n"
294 "mov %0,%0,lsr #16;\n"
295 "orr %0,%0,r2,lsl #16;\n"
296 "mov %1,%1,lsr #16;\n"
297 "orr %1,%1,r3,lsl #16;\n"
299 //qi=(pi>>shift)*labs(ilsp[j]-wi);
300 //pi=(qi>>shift)*labs(ilsp[j+1]-wi);
301 //qexp+=shift;
305 /* normalize to max 16 sig figs */
306 "3:"
307 "mov r2,#0;"
308 "orr r1,%0,%1;"
309 "tst r1,#0xff000000;"
310 "addne r2,r2,#8;"
311 "movne r1,r1,lsr #8;"
312 "tst r1,#0x00f00000;"
313 "addne r2,r2,#4;"
314 "movne r1,r1,lsr #4;"
315 "tst r1,#0x000c0000;"
316 "addne r2,r2,#2;"
317 "movne r1,r1,lsr #2;"
318 "tst r1,#0x00020000;"
319 "addne r2,r2,#1;"
320 "movne r1,r1,lsr #1;"
321 "tst r1,#0x00010000;"
322 "addne r2,r2,#1;"
323 "mov %0,%0,lsr r2;"
324 "mov %1,%1,lsr r2;"
325 "add %2,%2,r2;"
327 : "+r"(qi),"+r"(pi),"+r"(qexp)
328 : "r"(ilsp),"r"(wi),"r"(m)
329 : "r0","r1","r2","r3","cc");
331 *qip=qi;
332 *pip=pi;
333 *qexpp=qexp;
336 static inline void lsp_norm_asm(ogg_uint32_t *qip,ogg_int32_t *qexpp){
338 ogg_uint32_t qi=*qip;
339 ogg_int32_t qexp=*qexpp;
341 asm("tst %0,#0x0000ff00;"
342 "moveq %0,%0,lsl #8;"
343 "subeq %1,%1,#8;"
344 "tst %0,#0x0000f000;"
345 "moveq %0,%0,lsl #4;"
346 "subeq %1,%1,#4;"
347 "tst %0,#0x0000c000;"
348 "moveq %0,%0,lsl #2;"
349 "subeq %1,%1,#2;"
350 "tst %0,#0x00008000;"
351 "moveq %0,%0,lsl #1;"
352 "subeq %1,%1,#1;"
353 : "+r"(qi),"+r"(qexp)
355 : "cc");
356 *qip=qi;
357 *qexpp=qexp;
360 #endif
361 #endif