Bumping gaia.json for 2 gaia revision(s) a=gaia-bump
[gecko.git] / media / libspeex_resampler / simd-detect-runtime.patch
blobc8b182ddaddac9758e6cc194ac75a9d3b747e4f6
1 diff --git a/media/libspeex_resampler/src/resample.c b/media/libspeex_resampler/src/resample.c
2 --- a/media/libspeex_resampler/src/resample.c
3 +++ b/media/libspeex_resampler/src/resample.c
4 @@ -92,23 +92,17 @@ static void speex_free (void *ptr) {free
6 #define IMAX(a,b) ((a) > (b) ? (a) : (b))
7 #define IMIN(a,b) ((a) < (b) ? (a) : (b))
9 #ifndef NULL
10 #define NULL 0
11 #endif
13 -#ifdef _USE_SSE
14 -#include "resample_sse.h"
15 -#endif
17 -#ifdef _USE_NEON
18 -#include "resample_neon.h"
19 -#endif
20 +#include "simd_detect.h"
22 /* Numer of elements to allocate on the stack */
23 #ifdef VAR_ARRAYS
24 #define FIXED_STACK_ALLOC 8192
25 #else
26 #define FIXED_STACK_ALLOC 1024
27 #endif
29 @@ -344,17 +338,19 @@ static int resampler_basic_direct_single
30 const spx_uint32_t den_rate = st->den_rate;
31 spx_word32_t sum;
33 while (!(last_sample >= (spx_int32_t)*in_len || out_sample >= (spx_int32_t)*out_len))
35 const spx_word16_t *sinct = & sinc_table[samp_frac_num*N];
36 const spx_word16_t *iptr = & in[last_sample];
38 -#ifndef OVERRIDE_INNER_PRODUCT_SINGLE
39 +#ifdef OVERRIDE_INNER_PRODUCT_SINGLE
40 + if (!moz_speex_have_single_simd()) {
41 +#endif
42 int j;
43 sum = 0;
44 for(j=0;j<N;j++) sum += MULT16_16(sinct[j], iptr[j]);
46 /* This code is slower on most DSPs which have only 2 accumulators.
47 Plus this this forces truncation to 32 bits and you lose the HW guard bits.
48 I think we can trust the compiler and let it vectorize and/or unroll itself.
49 spx_word32_t accum[4] = {0,0,0,0};
50 @@ -362,18 +358,20 @@ static int resampler_basic_direct_single
51 accum[0] += MULT16_16(sinct[j], iptr[j]);
52 accum[1] += MULT16_16(sinct[j+1], iptr[j+1]);
53 accum[2] += MULT16_16(sinct[j+2], iptr[j+2]);
54 accum[3] += MULT16_16(sinct[j+3], iptr[j+3]);
56 sum = accum[0] + accum[1] + accum[2] + accum[3];
58 sum = SATURATE32PSHR(sum, 15, 32767);
59 -#else
60 +#ifdef OVERRIDE_INNER_PRODUCT_SINGLE
61 + } else {
62 sum = inner_product_single(sinct, iptr, N);
63 + }
64 #endif
66 out[out_stride * out_sample++] = sum;
67 last_sample += int_advance;
68 samp_frac_num += frac_advance;
69 if (samp_frac_num >= den_rate)
71 samp_frac_num -= den_rate;
72 @@ -402,29 +400,33 @@ static int resampler_basic_direct_double
73 const spx_uint32_t den_rate = st->den_rate;
74 double sum;
76 while (!(last_sample >= (spx_int32_t)*in_len || out_sample >= (spx_int32_t)*out_len))
78 const spx_word16_t *sinct = & sinc_table[samp_frac_num*N];
79 const spx_word16_t *iptr = & in[last_sample];
81 -#ifndef OVERRIDE_INNER_PRODUCT_DOUBLE
82 +#ifdef OVERRIDE_INNER_PRODUCT_DOUBLE
83 + if (!moz_speex_have_double_simd()) {
84 +#endif
85 int j;
86 double accum[4] = {0,0,0,0};
88 for(j=0;j<N;j+=4) {
89 accum[0] += sinct[j]*iptr[j];
90 accum[1] += sinct[j+1]*iptr[j+1];
91 accum[2] += sinct[j+2]*iptr[j+2];
92 accum[3] += sinct[j+3]*iptr[j+3];
94 sum = accum[0] + accum[1] + accum[2] + accum[3];
95 -#else
96 +#ifdef OVERRIDE_INNER_PRODUCT_DOUBLE
97 + } else {
98 sum = inner_product_double(sinct, iptr, N);
99 + }
100 #endif
102 out[out_stride * out_sample++] = PSHR32(sum, 15);
103 last_sample += int_advance;
104 samp_frac_num += frac_advance;
105 if (samp_frac_num >= den_rate)
107 samp_frac_num -= den_rate;
108 @@ -458,34 +460,38 @@ static int resampler_basic_interpolate_s
109 #ifdef FIXED_POINT
110 const spx_word16_t frac = PDIV32(SHL32((samp_frac_num*st->oversample) % st->den_rate,15),st->den_rate);
111 #else
112 const spx_word16_t frac = ((float)((samp_frac_num*st->oversample) % st->den_rate))/st->den_rate;
113 #endif
114 spx_word16_t interp[4];
117 -#ifndef OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
118 +#ifdef OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
119 + if (!moz_speex_have_single_simd()) {
120 +#endif
121 int j;
122 spx_word32_t accum[4] = {0,0,0,0};
124 for(j=0;j<N;j++) {
125 const spx_word16_t curr_in=iptr[j];
126 accum[0] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-2]);
127 accum[1] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-1]);
128 accum[2] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset]);
129 accum[3] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset+1]);
132 cubic_coef(frac, interp);
133 sum = MULT16_32_Q15(interp[0],SHR32(accum[0], 1)) + MULT16_32_Q15(interp[1],SHR32(accum[1], 1)) + MULT16_32_Q15(interp[2],SHR32(accum[2], 1)) + MULT16_32_Q15(interp[3],SHR32(accum[3], 1));
134 sum = SATURATE32PSHR(sum, 15, 32767);
135 -#else
136 +#ifdef OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
137 + } else {
138 cubic_coef(frac, interp);
139 sum = interpolate_product_single(iptr, st->sinc_table + st->oversample + 4 - offset - 2, N, st->oversample, interp);
140 + }
141 #endif
143 out[out_stride * out_sample++] = sum;
144 last_sample += int_advance;
145 samp_frac_num += frac_advance;
146 if (samp_frac_num >= den_rate)
148 samp_frac_num -= den_rate;
149 @@ -521,33 +527,37 @@ static int resampler_basic_interpolate_d
150 #ifdef FIXED_POINT
151 const spx_word16_t frac = PDIV32(SHL32((samp_frac_num*st->oversample) % st->den_rate,15),st->den_rate);
152 #else
153 const spx_word16_t frac = ((float)((samp_frac_num*st->oversample) % st->den_rate))/st->den_rate;
154 #endif
155 spx_word16_t interp[4];
158 -#ifndef OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE
159 +#ifdef OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE
160 + if (!moz_speex_have_double_simd()) {
161 +#endif
162 int j;
163 double accum[4] = {0,0,0,0};
165 for(j=0;j<N;j++) {
166 const double curr_in=iptr[j];
167 accum[0] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-2]);
168 accum[1] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-1]);
169 accum[2] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset]);
170 accum[3] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset+1]);
173 cubic_coef(frac, interp);
174 sum = MULT16_32_Q15(interp[0],accum[0]) + MULT16_32_Q15(interp[1],accum[1]) + MULT16_32_Q15(interp[2],accum[2]) + MULT16_32_Q15(interp[3],accum[3]);
175 -#else
176 +#ifdef OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE
177 + } else {
178 cubic_coef(frac, interp);
179 sum = interpolate_product_double(iptr, st->sinc_table + st->oversample + 4 - offset - 2, N, st->oversample, interp);
180 + }
181 #endif
183 out[out_stride * out_sample++] = PSHR32(sum,15);
184 last_sample += int_advance;
185 samp_frac_num += frac_advance;
186 if (samp_frac_num >= den_rate)
188 samp_frac_num -= den_rate;
189 diff --git a/media/libspeex_resampler/src/resample_neon.c b/media/libspeex_resampler/src/resample_neon.c
190 --- a/media/libspeex_resampler/src/resample_neon.c
191 +++ b/media/libspeex_resampler/src/resample_neon.c
192 @@ -31,16 +31,18 @@
193 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
194 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
195 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
196 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
197 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
198 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
201 +#include "simd_detect.h"
203 #include <arm_neon.h>
205 #ifdef FIXED_POINT
206 #ifdef __thumb2__
207 static inline int32_t saturate_32bit_to_16bit(int32_t a) {
208 int32_t ret;
209 asm ("ssat %[ret], #16, %[a]"
210 : [ret] "=&r" (ret)
211 @@ -60,17 +62,17 @@ static inline int32_t saturate_32bit_to_
212 return ret;
214 #endif
215 #undef WORD2INT
216 #define WORD2INT(x) (saturate_32bit_to_16bit(x))
218 #define OVERRIDE_INNER_PRODUCT_SINGLE
219 /* Only works when len % 4 == 0 */
220 -static inline int32_t inner_product_single(const int16_t *a, const int16_t *b, unsigned int len)
221 +int32_t inner_product_single(const int16_t *a, const int16_t *b, unsigned int len)
223 int32_t ret;
224 uint32_t remainder = len % 16;
225 len = len - remainder;
227 asm volatile (" cmp %[len], #0\n"
228 " bne 1f\n"
229 " vld1.16 {d16}, [%[b]]!\n"
230 @@ -134,17 +136,17 @@ static inline int32_t saturate_float_to_
231 : "q0");
232 return ret;
234 #undef WORD2INT
235 #define WORD2INT(x) (saturate_float_to_16bit(x))
237 #define OVERRIDE_INNER_PRODUCT_SINGLE
238 /* Only works when len % 4 == 0 */
239 -static inline float inner_product_single(const float *a, const float *b, unsigned int len)
240 +float inner_product_single(const float *a, const float *b, unsigned int len)
242 float ret;
243 uint32_t remainder = len % 16;
244 len = len - remainder;
246 asm volatile (" cmp %[len], #0\n"
247 " bne 1f\n"
248 " vld1.32 {q4}, [%[b]]!\n"
249 diff --git a/media/libspeex_resampler/src/resample_sse.c b/media/libspeex_resampler/src/resample_sse.c
250 --- a/media/libspeex_resampler/src/resample_sse.c
251 +++ b/media/libspeex_resampler/src/resample_sse.c
252 @@ -29,37 +29,39 @@
253 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
254 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
255 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
256 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
257 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
258 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
261 +#include "simd_detect.h"
263 #include <xmmintrin.h>
265 #define OVERRIDE_INNER_PRODUCT_SINGLE
266 -static inline float inner_product_single(const float *a, const float *b, unsigned int len)
267 +float inner_product_single(const float *a, const float *b, unsigned int len)
269 int i;
270 float ret;
271 __m128 sum = _mm_setzero_ps();
272 for (i=0;i<len;i+=8)
274 sum = _mm_add_ps(sum, _mm_mul_ps(_mm_loadu_ps(a+i), _mm_loadu_ps(b+i)));
275 sum = _mm_add_ps(sum, _mm_mul_ps(_mm_loadu_ps(a+i+4), _mm_loadu_ps(b+i+4)));
277 sum = _mm_add_ps(sum, _mm_movehl_ps(sum, sum));
278 sum = _mm_add_ss(sum, _mm_shuffle_ps(sum, sum, 0x55));
279 _mm_store_ss(&ret, sum);
280 return ret;
283 #define OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
284 -static inline float interpolate_product_single(const float *a, const float *b, unsigned int len, const spx_uint32_t oversample, float *frac) {
285 +float interpolate_product_single(const float *a, const float *b, unsigned int len, const spx_uint32_t oversample, float *frac) {
286 int i;
287 float ret;
288 __m128 sum = _mm_setzero_ps();
289 __m128 f = _mm_loadu_ps(frac);
290 for(i=0;i<len;i+=2)
292 sum = _mm_add_ps(sum, _mm_mul_ps(_mm_load1_ps(a+i), _mm_loadu_ps(b+i*oversample)));
293 sum = _mm_add_ps(sum, _mm_mul_ps(_mm_load1_ps(a+i+1), _mm_loadu_ps(b+(i+1)*oversample)));
294 @@ -70,17 +72,17 @@ static inline float interpolate_product_
295 _mm_store_ss(&ret, sum);
296 return ret;
299 #ifdef _USE_SSE2
300 #include <emmintrin.h>
301 #define OVERRIDE_INNER_PRODUCT_DOUBLE
303 -static inline double inner_product_double(const float *a, const float *b, unsigned int len)
304 +double inner_product_double(const float *a, const float *b, unsigned int len)
306 int i;
307 double ret;
308 __m128d sum = _mm_setzero_pd();
309 __m128 t;
310 for (i=0;i<len;i+=8)
312 t = _mm_mul_ps(_mm_loadu_ps(a+i), _mm_loadu_ps(b+i));
313 @@ -92,17 +94,17 @@ static inline double inner_product_doubl
314 sum = _mm_add_pd(sum, _mm_cvtps_pd(_mm_movehl_ps(t, t)));
316 sum = _mm_add_sd(sum, _mm_unpackhi_pd(sum, sum));
317 _mm_store_sd(&ret, sum);
318 return ret;
321 #define OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE
322 -static inline double interpolate_product_double(const float *a, const float *b, unsigned int len, const spx_uint32_t oversample, float *frac) {
323 +double interpolate_product_double(const float *a, const float *b, unsigned int len, const spx_uint32_t oversample, float *frac) {
324 int i;
325 double ret;
326 __m128d sum;
327 __m128d sum1 = _mm_setzero_pd();
328 __m128d sum2 = _mm_setzero_pd();
329 __m128 f = _mm_loadu_ps(frac);
330 __m128d f1 = _mm_cvtps_pd(f);
331 __m128d f2 = _mm_cvtps_pd(_mm_movehl_ps(f,f));