Move (small) data into DRAM on PP5020, it's ~4.5% faster that way. Closes about half...
[kugel-rb.git] / apps / codecs / demac / libdemac / filter.c
blob80550983013b4428179fee740cf5e1f86f7d07db
1 /*
3 libdemac - A Monkey's Audio decoder
5 $Id$
7 Copyright (C) Dave Chapman 2007
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation; either version 2 of the License, or
12 (at your option) any later version.
14 This program is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License for more details.
19 You should have received a copy of the GNU General Public License
20 along with this program; if not, write to the Free Software
21 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
25 #include <string.h>
26 #include <inttypes.h>
28 #include "demac.h"
29 #include "filter.h"
30 #include "demac_config.h"
32 #if FILTER_BITS == 32
34 #if defined(CPU_ARM) && (ARM_ARCH == 4)
35 #include "vector_math32_armv4.h"
36 #else
37 #include "vector_math_generic.h"
38 #endif
40 #else /* FILTER_BITS == 16 */
42 #ifdef CPU_COLDFIRE
43 #include "vector_math16_cf.h"
44 #elif defined(CPU_ARM) && (ARM_ARCH >= 6)
45 #include "vector_math16_armv6.h"
46 #elif defined(CPU_ARM) && (ARM_ARCH >= 5)
47 /* Assume all our ARMv5 targets are ARMv5te(j) */
48 #include "vector_math16_armv5te.h"
49 #elif (defined(__i386__) || defined(__i486__)) && defined(__MMX__) \
50 || defined(__x86_64__)
51 #include "vector_math16_mmx.h"
52 #else
53 #include "vector_math_generic.h"
54 #endif
56 #endif /* FILTER_BITS */
58 struct filter_t {
59 filter_int* coeffs; /* ORDER entries */
61 /* We store all the filter delays in a single buffer */
62 filter_int* history_end;
64 filter_int* delay;
65 filter_int* adaptcoeffs;
67 int avg;
70 /* We name the functions according to the ORDER and FRACBITS
71 pre-processor symbols and build multiple .o files from this .c file
72 - this increases code-size but gives the compiler more scope for
73 optimising the individual functions, as well as replacing a lot of
74 variables with constants.
77 #if FRACBITS == 11
78 #if ORDER == 16
79 #define INIT_FILTER init_filter_16_11
80 #define APPLY_FILTER apply_filter_16_11
81 #elif ORDER == 64
82 #define INIT_FILTER init_filter_64_11
83 #define APPLY_FILTER apply_filter_64_11
84 #endif
85 #elif FRACBITS == 13
86 #define INIT_FILTER init_filter_256_13
87 #define APPLY_FILTER apply_filter_256_13
88 #elif FRACBITS == 10
89 #define INIT_FILTER init_filter_32_10
90 #define APPLY_FILTER apply_filter_32_10
91 #elif FRACBITS == 15
92 #define INIT_FILTER init_filter_1280_15
93 #define APPLY_FILTER apply_filter_1280_15
94 #endif
/* Some macros to handle the fixed-point stuff */

/* 0.5 in (32-FRACBITS).FRACBITS fixed-point format. */
#define FP_HALF (1 << (FRACBITS - 1))

/* Convert x from (32-FRACBITS).FRACBITS fixed-point format to an
   integer (rounding to nearest): add 0.5, then drop the fraction bits.
   The argument is parenthesised so that an expression containing
   operators of lower precedence than '+' (e.g. a & b, c ? a : b) is
   converted as a whole. */
#define FP_TO_INT(x) (((x) + FP_HALF) >> FRACBITS)
/* SATURATE(x): clamp x to the signed 16-bit range.  A value that already
   fits is returned unchanged; anything else becomes 0x7FFF or its
   bitwise complement, depending on the sign of x.  Note that the
   non-ARMv6 variants evaluate x more than once. */
#if defined(CPU_ARM) && (ARM_ARCH >= 6)

/* ARMv6 and later: one ssat instruction. */
#define SATURATE(x) ({int __res; asm("ssat %0, #16, %1" : "=r"(__res) : "r"(x)); __res; })

#elif defined(CPU_ARM) /* ARM_ARCH < 6 */

/* Keeping the asr #31 outside of the asm allows loads to be scheduled between
   it and the rest of the block on ARM9E, with the load's result latency filled
   by the other calculations. */
#define SATURATE(x) ({ \
    int __res = (x) >> 31; \
    asm volatile ( \
        "teq %0, %1, asr #15\n\t" \
        "moveq %0, %1\n\t" \
        "eorne %0, %0, #0xff\n\t" \
        "eorne %0, %0, #0x7f00" \
        : "+r" (__res) : "r" (x) : "cc" \
    ); \
    __res; \
})

#else /* !CPU_ARM */

/* Generic C version */
#define SATURATE(x) (LIKELY((x) == (int16_t)(x)) ? (x) : ((x) >> 31) ^ 0x7FFF)

#endif /* CPU_ARM */
126 /* Apply the filter with state f to count entries in data[] */
128 static void ICODE_ATTR_DEMAC do_apply_filter_3980(struct filter_t* f,
129 int32_t* data, int count)
131 int res;
132 int absres;
134 #ifdef PREPARE_SCALARPRODUCT
135 PREPARE_SCALARPRODUCT
136 #endif
138 while(LIKELY(count--))
140 #ifdef FUSED_VECTOR_MATH
141 if (LIKELY(*data != 0)) {
142 if (*data < 0)
143 res = vector_sp_add(f->coeffs, f->delay - ORDER,
144 f->adaptcoeffs - ORDER);
145 else
146 res = vector_sp_sub(f->coeffs, f->delay - ORDER,
147 f->adaptcoeffs - ORDER);
148 } else {
149 res = scalarproduct(f->coeffs, f->delay - ORDER);
151 res = FP_TO_INT(res);
152 #else
153 res = FP_TO_INT(scalarproduct(f->coeffs, f->delay - ORDER));
155 if (LIKELY(*data != 0)) {
156 if (*data < 0)
157 vector_add(f->coeffs, f->adaptcoeffs - ORDER);
158 else
159 vector_sub(f->coeffs, f->adaptcoeffs - ORDER);
161 #endif
163 res += *data;
165 *data++ = res;
167 /* Update the output history */
168 *f->delay++ = SATURATE(res);
170 /* Version 3.98 and later files */
172 /* Update the adaption coefficients */
173 absres = (res < 0 ? -res : res);
175 if (UNLIKELY(absres > 3 * f->avg))
176 *f->adaptcoeffs = ((res >> 25) & 64) - 32;
177 else if (3 * absres > 4 * f->avg)
178 *f->adaptcoeffs = ((res >> 26) & 32) - 16;
179 else if (LIKELY(absres > 0))
180 *f->adaptcoeffs = ((res >> 27) & 16) - 8;
181 else
182 *f->adaptcoeffs = 0;
184 f->avg += (absres - f->avg) / 16;
186 f->adaptcoeffs[-1] >>= 1;
187 f->adaptcoeffs[-2] >>= 1;
188 f->adaptcoeffs[-8] >>= 1;
190 f->adaptcoeffs++;
192 /* Have we filled the history buffer? */
193 if (UNLIKELY(f->delay == f->history_end)) {
194 memmove(f->coeffs + ORDER, f->delay - (ORDER*2),
195 (ORDER*2) * sizeof(filter_int));
196 f->adaptcoeffs = f->coeffs + ORDER*2;
197 f->delay = f->coeffs + ORDER*3;
202 static void ICODE_ATTR_DEMAC do_apply_filter_3970(struct filter_t* f,
203 int32_t* data, int count)
205 int res;
207 #ifdef PREPARE_SCALARPRODUCT
208 PREPARE_SCALARPRODUCT
209 #endif
211 while(LIKELY(count--))
213 #ifdef FUSED_VECTOR_MATH
214 if (LIKELY(*data != 0)) {
215 if (*data < 0)
216 res = vector_sp_add(f->coeffs, f->delay - ORDER,
217 f->adaptcoeffs - ORDER);
218 else
219 res = vector_sp_sub(f->coeffs, f->delay - ORDER,
220 f->adaptcoeffs - ORDER);
221 } else {
222 res = scalarproduct(f->coeffs, f->delay - ORDER);
224 res = FP_TO_INT(res);
225 #else
226 res = FP_TO_INT(scalarproduct(f->coeffs, f->delay - ORDER));
228 if (LIKELY(*data != 0)) {
229 if (*data < 0)
230 vector_add(f->coeffs, f->adaptcoeffs - ORDER);
231 else
232 vector_sub(f->coeffs, f->adaptcoeffs - ORDER);
234 #endif
236 /* Convert res from (32-FRACBITS).FRACBITS fixed-point format to an
237 integer (rounding to nearest) and add the input value to
238 it */
239 res += *data;
241 *data++ = res;
243 /* Update the output history */
244 *f->delay++ = SATURATE(res);
246 /* Version ??? to < 3.98 files (untested) */
247 f->adaptcoeffs[0] = (res == 0) ? 0 : ((res >> 28) & 8) - 4;
248 f->adaptcoeffs[-4] >>= 1;
249 f->adaptcoeffs[-8] >>= 1;
251 f->adaptcoeffs++;
253 /* Have we filled the history buffer? */
254 if (UNLIKELY(f->delay == f->history_end)) {
255 memmove(f->coeffs + ORDER, f->delay - (ORDER*2),
256 (ORDER*2) * sizeof(filter_int));
257 f->adaptcoeffs = f->coeffs + ORDER*2;
258 f->delay = f->coeffs + ORDER*3;
263 static struct filter_t filter[2] IBSS_ATTR_DEMAC;
265 static void do_init_filter(struct filter_t* f, filter_int* buf)
267 f->coeffs = buf;
268 f->history_end = buf + ORDER*3 + FILTER_HISTORY_SIZE;
270 /* Init pointers */
271 f->adaptcoeffs = f->coeffs + ORDER*2;
272 f->delay = f->coeffs + ORDER*3;
274 /* Zero coefficients and history buffer */
275 memset(f->coeffs, 0, ORDER*3 * sizeof(filter_int));
277 /* Zero the running average */
278 f->avg = 0;
281 void INIT_FILTER(filter_int* buf)
283 do_init_filter(&filter[0], buf);
284 do_init_filter(&filter[1], buf + ORDER*3 + FILTER_HISTORY_SIZE);
287 void ICODE_ATTR_DEMAC APPLY_FILTER(int fileversion, int channel,
288 int32_t* data, int count)
290 if (fileversion >= 3980)
291 do_apply_filter_3980(&filter[channel], data, count);
292 else
293 do_apply_filter_3970(&filter[channel], data, count);