1 diff --git a/media/libspeex_resampler/src/resample.c b/media/libspeex_resampler/src/resample.c
2 --- a/media/libspeex_resampler/src/resample.c
3 +++ b/media/libspeex_resampler/src/resample.c
4 @@ -92,23 +92,17 @@ static void speex_free (void *ptr) {free
6 #define IMAX(a,b) ((a) > (b) ? (a) : (b))
7 #define IMIN(a,b) ((a) < (b) ? (a) : (b))
14 -#include "resample_sse.h"
18 -#include "resample_neon.h"
20 +#include "simd_detect.h"
22 /* Numer of elements to allocate on the stack */
24 #define FIXED_STACK_ALLOC 8192
26 #define FIXED_STACK_ALLOC 1024
29 @@ -344,17 +338,19 @@ static int resampler_basic_direct_single
30 const spx_uint32_t den_rate = st->den_rate;
33 while (!(last_sample >= (spx_int32_t)*in_len || out_sample >= (spx_int32_t)*out_len))
35 const spx_word16_t *sinct = & sinc_table[samp_frac_num*N];
36 const spx_word16_t *iptr = & in[last_sample];
38 -#ifndef OVERRIDE_INNER_PRODUCT_SINGLE
39 +#ifdef OVERRIDE_INNER_PRODUCT_SINGLE
40 + if (!moz_speex_have_single_simd()) {
44 for(j=0;j<N;j++) sum += MULT16_16(sinct[j], iptr[j]);
46 /* This code is slower on most DSPs which have only 2 accumulators.
47 Plus this this forces truncation to 32 bits and you lose the HW guard bits.
48 I think we can trust the compiler and let it vectorize and/or unroll itself.
49 spx_word32_t accum[4] = {0,0,0,0};
50 @@ -362,18 +358,20 @@ static int resampler_basic_direct_single
51 accum[0] += MULT16_16(sinct[j], iptr[j]);
52 accum[1] += MULT16_16(sinct[j+1], iptr[j+1]);
53 accum[2] += MULT16_16(sinct[j+2], iptr[j+2]);
54 accum[3] += MULT16_16(sinct[j+3], iptr[j+3]);
56 sum = accum[0] + accum[1] + accum[2] + accum[3];
58 sum = SATURATE32PSHR(sum, 15, 32767);
60 +#ifdef OVERRIDE_INNER_PRODUCT_SINGLE
62 sum = inner_product_single(sinct, iptr, N);
66 out[out_stride * out_sample++] = sum;
67 last_sample += int_advance;
68 samp_frac_num += frac_advance;
69 if (samp_frac_num >= den_rate)
71 samp_frac_num -= den_rate;
72 @@ -402,29 +400,33 @@ static int resampler_basic_direct_double
73 const spx_uint32_t den_rate = st->den_rate;
76 while (!(last_sample >= (spx_int32_t)*in_len || out_sample >= (spx_int32_t)*out_len))
78 const spx_word16_t *sinct = & sinc_table[samp_frac_num*N];
79 const spx_word16_t *iptr = & in[last_sample];
81 -#ifndef OVERRIDE_INNER_PRODUCT_DOUBLE
82 +#ifdef OVERRIDE_INNER_PRODUCT_DOUBLE
83 + if(!moz_speex_have_double_simd()) { /* take the scalar loop below only when the SIMD double kernel is unavailable */
86 double accum[4] = {0,0,0,0};
89 accum[0] += sinct[j]*iptr[j];
90 accum[1] += sinct[j+1]*iptr[j+1];
91 accum[2] += sinct[j+2]*iptr[j+2];
92 accum[3] += sinct[j+3]*iptr[j+3];
94 sum = accum[0] + accum[1] + accum[2] + accum[3];
96 +#ifdef OVERRIDE_INNER_PRODUCT_DOUBLE
98 sum = inner_product_double(sinct, iptr, N);
102 out[out_stride * out_sample++] = PSHR32(sum, 15);
103 last_sample += int_advance;
104 samp_frac_num += frac_advance;
105 if (samp_frac_num >= den_rate)
107 samp_frac_num -= den_rate;
108 @@ -458,34 +460,38 @@ static int resampler_basic_interpolate_s
110 const spx_word16_t frac = PDIV32(SHL32((samp_frac_num*st->oversample) % st->den_rate,15),st->den_rate);
112 const spx_word16_t frac = ((float)((samp_frac_num*st->oversample) % st->den_rate))/st->den_rate;
114 spx_word16_t interp[4];
117 -#ifndef OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
118 +#ifdef OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
119 + if (!moz_speex_have_single_simd()) {
122 spx_word32_t accum[4] = {0,0,0,0};
125 const spx_word16_t curr_in=iptr[j];
126 accum[0] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-2]);
127 accum[1] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-1]);
128 accum[2] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset]);
129 accum[3] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset+1]);
132 cubic_coef(frac, interp);
133 sum = MULT16_32_Q15(interp[0],SHR32(accum[0], 1)) + MULT16_32_Q15(interp[1],SHR32(accum[1], 1)) + MULT16_32_Q15(interp[2],SHR32(accum[2], 1)) + MULT16_32_Q15(interp[3],SHR32(accum[3], 1));
134 sum = SATURATE32PSHR(sum, 15, 32767);
136 +#ifdef OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
138 cubic_coef(frac, interp);
139 sum = interpolate_product_single(iptr, st->sinc_table + st->oversample + 4 - offset - 2, N, st->oversample, interp);
143 out[out_stride * out_sample++] = sum;
144 last_sample += int_advance;
145 samp_frac_num += frac_advance;
146 if (samp_frac_num >= den_rate)
148 samp_frac_num -= den_rate;
149 @@ -521,33 +527,37 @@ static int resampler_basic_interpolate_d
151 const spx_word16_t frac = PDIV32(SHL32((samp_frac_num*st->oversample) % st->den_rate,15),st->den_rate);
153 const spx_word16_t frac = ((float)((samp_frac_num*st->oversample) % st->den_rate))/st->den_rate;
155 spx_word16_t interp[4];
158 -#ifndef OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE
159 +#ifdef OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE
160 + if (!moz_speex_have_double_simd()) {
163 double accum[4] = {0,0,0,0};
166 const double curr_in=iptr[j];
167 accum[0] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-2]);
168 accum[1] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-1]);
169 accum[2] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset]);
170 accum[3] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset+1]);
173 cubic_coef(frac, interp);
174 sum = MULT16_32_Q15(interp[0],accum[0]) + MULT16_32_Q15(interp[1],accum[1]) + MULT16_32_Q15(interp[2],accum[2]) + MULT16_32_Q15(interp[3],accum[3]);
176 +#ifdef OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE
178 cubic_coef(frac, interp);
179 sum = interpolate_product_double(iptr, st->sinc_table + st->oversample + 4 - offset - 2, N, st->oversample, interp);
183 out[out_stride * out_sample++] = PSHR32(sum,15);
184 last_sample += int_advance;
185 samp_frac_num += frac_advance;
186 if (samp_frac_num >= den_rate)
188 samp_frac_num -= den_rate;
189 diff --git a/media/libspeex_resampler/src/resample_neon.c b/media/libspeex_resampler/src/resample_neon.c
190 --- a/media/libspeex_resampler/src/resample_neon.c
191 +++ b/media/libspeex_resampler/src/resample_neon.c
193 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
194 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
195 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
196 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
197 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
198 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
201 +#include "simd_detect.h"
203 #include <arm_neon.h>
207 static inline int32_t saturate_32bit_to_16bit(int32_t a) {
209 asm ("ssat %[ret], #16, %[a]"
211 @@ -60,17 +62,17 @@ static inline int32_t saturate_32bit_to_
216 #define WORD2INT(x) (saturate_32bit_to_16bit(x))
218 #define OVERRIDE_INNER_PRODUCT_SINGLE
219 /* Only works when len % 4 == 0 */
220 -static inline int32_t inner_product_single(const int16_t *a, const int16_t *b, unsigned int len)
221 +int32_t inner_product_single(const int16_t *a, const int16_t *b, unsigned int len)
224 uint32_t remainder = len % 16;
225 len = len - remainder;
227 asm volatile (" cmp %[len], #0\n"
229 " vld1.16 {d16}, [%[b]]!\n"
230 @@ -134,17 +136,17 @@ static inline int32_t saturate_float_to_
235 #define WORD2INT(x) (saturate_float_to_16bit(x))
237 #define OVERRIDE_INNER_PRODUCT_SINGLE
238 /* Only works when len % 4 == 0 */
239 -static inline float inner_product_single(const float *a, const float *b, unsigned int len)
240 +float inner_product_single(const float *a, const float *b, unsigned int len)
243 uint32_t remainder = len % 16;
244 len = len - remainder;
246 asm volatile (" cmp %[len], #0\n"
248 " vld1.32 {q4}, [%[b]]!\n"
249 diff --git a/media/libspeex_resampler/src/resample_sse.c b/media/libspeex_resampler/src/resample_sse.c
250 --- a/media/libspeex_resampler/src/resample_sse.c
251 +++ b/media/libspeex_resampler/src/resample_sse.c
253 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
254 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
255 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
256 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
257 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
258 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
261 +#include "simd_detect.h"
263 #include <xmmintrin.h>
265 #define OVERRIDE_INNER_PRODUCT_SINGLE
266 -static inline float inner_product_single(const float *a, const float *b, unsigned int len)
267 +float inner_product_single(const float *a, const float *b, unsigned int len)
271 __m128 sum = _mm_setzero_ps();
274 sum = _mm_add_ps(sum, _mm_mul_ps(_mm_loadu_ps(a+i), _mm_loadu_ps(b+i)));
275 sum = _mm_add_ps(sum, _mm_mul_ps(_mm_loadu_ps(a+i+4), _mm_loadu_ps(b+i+4)));
277 sum = _mm_add_ps(sum, _mm_movehl_ps(sum, sum));
278 sum = _mm_add_ss(sum, _mm_shuffle_ps(sum, sum, 0x55));
279 _mm_store_ss(&ret, sum);
283 #define OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
284 -static inline float interpolate_product_single(const float *a, const float *b, unsigned int len, const spx_uint32_t oversample, float *frac) {
285 +float interpolate_product_single(const float *a, const float *b, unsigned int len, const spx_uint32_t oversample, float *frac) {
288 __m128 sum = _mm_setzero_ps();
289 __m128 f = _mm_loadu_ps(frac);
292 sum = _mm_add_ps(sum, _mm_mul_ps(_mm_load1_ps(a+i), _mm_loadu_ps(b+i*oversample)));
293 sum = _mm_add_ps(sum, _mm_mul_ps(_mm_load1_ps(a+i+1), _mm_loadu_ps(b+(i+1)*oversample)));
294 @@ -70,17 +72,17 @@ static inline float interpolate_product_
295 _mm_store_ss(&ret, sum);
300 #include <emmintrin.h>
301 #define OVERRIDE_INNER_PRODUCT_DOUBLE
303 -static inline double inner_product_double(const float *a, const float *b, unsigned int len)
304 +double inner_product_double(const float *a, const float *b, unsigned int len)
308 __m128d sum = _mm_setzero_pd();
312 t = _mm_mul_ps(_mm_loadu_ps(a+i), _mm_loadu_ps(b+i));
313 @@ -92,17 +94,17 @@ static inline double inner_product_doubl
314 sum = _mm_add_pd(sum, _mm_cvtps_pd(_mm_movehl_ps(t, t)));
316 sum = _mm_add_sd(sum, _mm_unpackhi_pd(sum, sum));
317 _mm_store_sd(&ret, sum);
321 #define OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE
322 -static inline double interpolate_product_double(const float *a, const float *b, unsigned int len, const spx_uint32_t oversample, float *frac) {
323 +double interpolate_product_double(const float *a, const float *b, unsigned int len, const spx_uint32_t oversample, float *frac) {
327 __m128d sum1 = _mm_setzero_pd();
328 __m128d sum2 = _mm_setzero_pd();
329 __m128 f = _mm_loadu_ps(frac);
330 __m128d f1 = _mm_cvtps_pd(f);
331 __m128d f2 = _mm_cvtps_pd(_mm_movehl_ps(f,f));