/*
 * From Rockbox git (kugel-rb), apps/codecs/libwmapro/wmapro_math.h
 * blob 30b9a987ee07153f2a53cec7d8ae7886c3031ddc
 * "Submit part of FS#11498. Major speedup for WMA Professional on ARM
 *  and Coldfire CPUs."
 */
1 #ifndef _WMAPRO_MATH_H_
2 #define _WMAPRO_MATH_H_
4 #include <inttypes.h>
/* rockbox: not used
#define fixtof16(x) (float)((float)(x) / (float)(1 << 16))
#define fixtof31(x) (float)((float)(x) / (float)(1 << 31))
#define ftofix16(x) ((int32_t)((x) * (float)(1 << 16) + ((x) < 0 ? -0.5:0.5)))
#define ftofix31(x) ((int32_t)((x) * (float)(1 << 31) + ((x) < 0 ? -0.5:0.5)))
*/
13 #if defined(CPU_ARM)
14 /* Calculates: result = (X*Y)>>Z */
15 #define fixmulshift(X,Y,Z) \
16 ({ \
17 int32_t lo; \
18 int32_t hi; \
19 asm volatile ( \
20 "smull %[lo], %[hi], %[x], %[y] \n\t" /* multiply */ \
21 "mov %[lo], %[lo], lsr %[shr] \n\t" /* lo >>= Z */ \
22 "orr %[lo], %[lo], %[hi], lsl %[shl]" /* lo |= (hi << (32-Z)) */ \
23 : [lo]"=&r"(lo), [hi]"=&r"(hi) \
24 : [x]"r"(X), [y]"r"(Y), [shr]"r"(Z), [shl]"r"(32-Z)); \
25 lo; \
/* Fixed-point multiply: result = (int32_t)(((int64_t)X * Y) >> 16) */
#define fixmul16(X,Y) \
({ \
    int32_t lo; \
    int32_t hi; \
    asm volatile ( \
        "smull %[lo], %[hi], %[x], %[y] \n\t" /* 64-bit product */ \
        "mov   %[lo], %[lo], lsr #16    \n\t" /* lo >>= 16 (logical) */ \
        "orr   %[lo], %[lo], %[hi], lsl #16"  /* lo |= hi << 16 */ \
        : [lo]"=&r"(lo), [hi]"=&r"(hi) \
        : [x]"r"(X), [y]"r"(Y)); \
    lo; \
})
/* Fixed-point multiply: result = (int32_t)(((int64_t)X * Y) >> 24) */
#define fixmul24(X,Y) \
({ \
    int32_t lo; \
    int32_t hi; \
    asm volatile ( \
        "smull %[lo], %[hi], %[x], %[y] \n\t" /* 64-bit product */ \
        "mov   %[lo], %[lo], lsr #24    \n\t" /* lo >>= 24 (logical) */ \
        "orr   %[lo], %[lo], %[hi], lsl #8"   /* lo |= hi << 8 */ \
        : [lo]"=&r"(lo), [hi]"=&r"(hi) \
        : [x]"r"(X), [y]"r"(Y)); \
    lo; \
})
/* Fixed-point multiply: result = (int32_t)(((int64_t)X * Y) >> 31) */
#define fixmul31(X,Y) \
({ \
    int32_t lo; \
    int32_t hi; \
    asm volatile ( \
        "smull %[lo], %[hi], %[x], %[y] \n\t" /* 64-bit product */ \
        "mov   %[lo], %[lo], lsr #31    \n\t" /* lo >>= 31 (logical) */ \
        "orr   %[lo], %[lo], %[hi], lsl #1"   /* lo |= hi << 1 */ \
        : [lo]"=&r"(lo), [hi]"=&r"(hi) \
        : [x]"r"(X), [y]"r"(Y)); \
    lo; \
})
69 #elif defined(CPU_COLDFIRE)
70 /* Calculates: result = (X*Y)>>Z */
71 #define fixmulshift(X,Y,Z) \
72 ({ \
73 int32_t t1; \
74 int32_t t2; \
75 asm volatile ( \
76 "mac.l %[x],%[y],%%acc0\n\t" /* multiply */ \
77 "mulu.l %[y],%[x] \n\t" /* get lower half, avoid emac stall */ \
78 "movclr.l %%acc0,%[t1] \n\t" /* get higher half */ \
79 "moveq.l #31,%[t2] \n\t" \
80 "sub.l %[sh],%[t2] \n\t" /* t2 = 31 - shift */ \
81 "ble.s 1f \n\t" \
82 "asl.l %[t2],%[t1] \n\t" /* hi <<= 31 - shift */ \
83 "lsr.l %[sh],%[x] \n\t" /* (unsigned)lo >>= shift */ \
84 "or.l %[x],%[t1] \n\t" /* combine result */ \
85 "bra.s 2f \n\t" \
86 "1: \n\t" \
87 "neg.l %[t2] \n\t" /* t2 = shift - 31 */ \
88 "asr.l %[t2],%[t1] \n\t" /* hi >>= t2 */ \
89 "2: \n" \
90 : [t1]"=&d"(t1), [t2]"=&d"(t2) \
91 : [x] "d"((X)), [y] "d"((Y)), [sh]"d"((Z))); \
92 t1; \
/* Fixed-point multiply: result = (int32_t)(((int64_t)X * Y) >> 16)
 * With the EMAC in fractional mode the accumulator already holds the
 * product >> 31, so only a 15-bit left shift is needed for the high
 * half ("plus one free" below refers to the implicit fractional shift).
 * NOTE(review): mulu.l clobbers the %[x] input operand (see
 * fixmulshift). */
#define fixmul16(X,Y) \
({ \
    int32_t t1, t2; \
    asm volatile ( \
        "mac.l    %[x],%[y],%%acc0\n\t" /* multiply into acc0 */ \
        "mulu.l   %[y],%[x]       \n\t" /* lower 32 bits, avoids emac stall */ \
        "movclr.l %%acc0,%[t1]    \n\t" /* fetch upper half */ \
        "moveq.l  #15,%[t2]       \n\t" \
        "asl.l    %[t2],%[t1]     \n\t" /* hi <<= 15, plus one free */ \
        "moveq.l  #16,%[t2]       \n\t" \
        "lsr.l    %[t2],%[x]      \n\t" /* (unsigned)lo >>= 16 */ \
        "or.l     %[x],%[t1]      \n\t" /* combine result */ \
        : [t1]"=&d"(t1), [t2]"=&d"(t2) \
        : [x]"d"((X)), [y]"d"((Y))); \
    t1; \
})
/* Fixed-point multiply: result = (int32_t)(((int64_t)X * Y) >> 24)
 * Same scheme as fixmul16: high half shifted left by 7 (fractional
 * mode supplies one extra shift), low half shifted right by 24.
 * NOTE(review): mulu.l clobbers the %[x] input operand (see
 * fixmulshift). */
#define fixmul24(X,Y) \
({ \
    int32_t t1, t2; \
    asm volatile ( \
        "mac.l    %[x],%[y],%%acc0\n\t" /* multiply into acc0 */ \
        "mulu.l   %[y],%[x]       \n\t" /* lower 32 bits, avoids emac stall */ \
        "movclr.l %%acc0,%[t1]    \n\t" /* fetch upper half */ \
        "moveq.l  #7,%[t2]        \n\t" \
        "asl.l    %[t2],%[t1]     \n\t" /* hi <<= 7, plus one free */ \
        "moveq.l  #24,%[t2]       \n\t" \
        "lsr.l    %[t2],%[x]      \n\t" /* (unsigned)lo >>= 24 */ \
        "or.l     %[x],%[t1]      \n\t" /* combine result */ \
        : [t1]"=&d"(t1), [t2]"=&d"(t2) \
        : [x]"d"((X)), [y]"d"((Y))); \
    t1; \
})
131 /* Calculates: result = (X*Y)>>32 */
132 #define fixmul31(X,Y) \
133 ({ \
134 int32_t t; \
135 asm volatile ( \
136 "mac.l %[x], %[y], %%acc0\n\t" /* multiply */ \
137 "movclr.l %%acc0, %[t]\n\t" /* get higher half as result */ \
138 : [t] "=d" (t) \
139 : [x] "r" ((X)), [y] "r" ((Y))); \
140 t; \
142 #else
/* Portable fallback: result = (int32_t)(((int64_t)x * y) >> shamt).
 * shamt must be in [0, 63]; right shift of a negative value is
 * arithmetic on the supported compilers (implementation-defined in C). */
static inline int32_t fixmulshift(int32_t x, int32_t y, int shamt)
{
    int64_t temp = (int64_t)x * y; /* full 64-bit product, no overflow */
    return (int32_t)(temp >> shamt);
}
/* Portable fallback: result = (int32_t)(((int64_t)x * y) >> 31). */
static inline int32_t fixmul31(int32_t x, int32_t y)
{
    int64_t temp = (int64_t)x * y; /* full 64-bit product, no overflow */
    return (int32_t)(temp >> 31);
}
/* Portable fallback: result = (int32_t)(((int64_t)x * y) >> 24). */
static inline int32_t fixmul24(int32_t x, int32_t y)
{
    int64_t temp = (int64_t)x * y; /* full 64-bit product, no overflow */
    return (int32_t)(temp >> 24);
}
/* Portable fallback: result = (int32_t)(((int64_t)x * y) >> 16). */
static inline int32_t fixmul16(int32_t x, int32_t y)
{
    int64_t temp = (int64_t)x * y; /* full 64-bit product, no overflow */
    return (int32_t)(temp >> 16);
}
186 #endif /* CPU_COLDFIRE, CPU_ARM */
188 #ifdef CPU_COLDFIRE
189 static inline void vector_fixmul_window(int32_t *dst, const int32_t *src0,
190 const int32_t *src1, const int32_t *win,
191 int len)
193 int i, j;
194 dst += len;
195 win += len;
196 src0+= len;
197 for(i=-len, j=len-1; i<0; i++, j--) {
198 int32_t s0 = src0[i];
199 int32_t s1 = src1[j];
200 int32_t wi = -win[i];
201 int32_t wj = -win[j];
203 asm volatile ("mac.l %[s0], %[wj], %%acc0\n\t"
204 "msac.l %[s1], %[wi], %%acc0\n\t"
205 "mac.l %[s0], %[wi], %%acc1\n\t"
206 "mac.l %[s1], %[wj], %%acc1\n\t"
207 "movclr.l %%acc0, %[s0]\n\t"
208 "move.l %[s0], (%[dst_i])\n\t"
209 "movclr.l %%acc1, %[s0]\n\t"
210 "move.l %[s0], (%[dst_j])\n\t"
211 : [s0] "+r" (s0) /* this register is clobbered so specify it as an input */
212 : [dst_i] "a" (&dst[i]), [dst_j] "a" (&dst[j]),
213 [s1] "r" (s1), [wi] "r" (wi), [wj] "r" (wj)
214 : "cc", "memory");
217 #else
218 static inline void vector_fixmul_window(int32_t *dst, const int32_t *src0,
219 const int32_t *src1, const int32_t *win,
220 int len)
222 int i, j;
223 dst += len;
224 win += len;
225 src0+= len;
226 for(i=-len, j=len-1; i<0; i++, j--) {
227 int32_t s0 = src0[i];
228 int32_t s1 = src1[j];
229 int32_t wi = -win[i];
230 int32_t wj = -win[j];
231 dst[i] = fixmul31(s0, wj) - fixmul31(s1, wi);
232 dst[j] = fixmul31(s0, wi) + fixmul31(s1, wj);
235 #endif
237 static inline void vector_fixmul_scalar(int32_t *dst, const int32_t *src,
238 int32_t mul, int len)
240 int i;
241 for(i=0; i<len; i++)
242 dst[i] = fixmul24(src[i], mul);
/* Clamp a to the inclusive range [amin, amax]. */
static inline int av_clip(int a, int amin, int amax)
{
    if (a < amin) return amin;
    else if (a > amax) return amax;
    else return a;
}
251 #endif /* _WMAPRO_MATH_H_ */