1 diff --git a/src/resample.c b/src/resample.c
4 @@ -91,23 +91,17 @@ static void speex_free(void *ptr) {free(
10 #define UINT32_MAX 4294967295U
14 -#include "resample_sse.h"
18 -#include "resample_neon.h"
20 +#include "simd_detect.h"
22 /* Number of elements to allocate on the stack */
24 #define FIXED_STACK_ALLOC 8192
26 #define FIXED_STACK_ALLOC 1024
29 @@ -341,17 +335,19 @@ static int resampler_basic_direct_single
30 const spx_uint32_t den_rate = st->den_rate;
33 while (!(last_sample >= (spx_int32_t)*in_len || out_sample >= (spx_int32_t)*out_len))
35 const spx_word16_t *sinct = & sinc_table[samp_frac_num*N];
36 const spx_word16_t *iptr = & in[last_sample];
38 -#ifndef OVERRIDE_INNER_PRODUCT_SINGLE
39 +#ifdef OVERRIDE_INNER_PRODUCT_SINGLE
40 + if (!moz_speex_have_single_simd()) {
44 for(j=0;j<N;j++) sum += MULT16_16(sinct[j], iptr[j]);
46 /* This code is slower on most DSPs which have only 2 accumulators.
47 Plus this this forces truncation to 32 bits and you lose the HW guard bits.
48 I think we can trust the compiler and let it vectorize and/or unroll itself.
49 spx_word32_t accum[4] = {0,0,0,0};
50 @@ -359,18 +355,20 @@ static int resampler_basic_direct_single
51 accum[0] += MULT16_16(sinct[j], iptr[j]);
52 accum[1] += MULT16_16(sinct[j+1], iptr[j+1]);
53 accum[2] += MULT16_16(sinct[j+2], iptr[j+2]);
54 accum[3] += MULT16_16(sinct[j+3], iptr[j+3]);
56 sum = accum[0] + accum[1] + accum[2] + accum[3];
58 sum = SATURATE32PSHR(sum, 15, 32767);
60 +#ifdef OVERRIDE_INNER_PRODUCT_SINGLE
62 sum = inner_product_single(sinct, iptr, N);
66 out[out_stride * out_sample++] = sum;
67 last_sample += int_advance;
68 samp_frac_num += frac_advance;
69 if (samp_frac_num >= den_rate)
71 samp_frac_num -= den_rate;
72 @@ -399,29 +397,33 @@ static int resampler_basic_direct_double
73 const spx_uint32_t den_rate = st->den_rate;
76 while (!(last_sample >= (spx_int32_t)*in_len || out_sample >= (spx_int32_t)*out_len))
78 const spx_word16_t *sinct = & sinc_table[samp_frac_num*N];
79 const spx_word16_t *iptr = & in[last_sample];
81 -#ifndef OVERRIDE_INNER_PRODUCT_DOUBLE
82 +#ifdef OVERRIDE_INNER_PRODUCT_DOUBLE
83 + if(!moz_speex_have_double_simd()) {
86 double accum[4] = {0,0,0,0};
89 accum[0] += sinct[j]*iptr[j];
90 accum[1] += sinct[j+1]*iptr[j+1];
91 accum[2] += sinct[j+2]*iptr[j+2];
92 accum[3] += sinct[j+3]*iptr[j+3];
94 sum = accum[0] + accum[1] + accum[2] + accum[3];
96 +#ifdef OVERRIDE_INNER_PRODUCT_DOUBLE
98 sum = inner_product_double(sinct, iptr, N);
102 out[out_stride * out_sample++] = PSHR32(sum, 15);
103 last_sample += int_advance;
104 samp_frac_num += frac_advance;
105 if (samp_frac_num >= den_rate)
107 samp_frac_num -= den_rate;
108 @@ -455,34 +457,38 @@ static int resampler_basic_interpolate_s
110 const spx_word16_t frac = PDIV32(SHL32((samp_frac_num*st->oversample) % st->den_rate,15),st->den_rate);
112 const spx_word16_t frac = ((float)((samp_frac_num*st->oversample) % st->den_rate))/st->den_rate;
114 spx_word16_t interp[4];
117 -#ifndef OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
118 +#ifdef OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
119 + if (!moz_speex_have_single_simd()) {
122 spx_word32_t accum[4] = {0,0,0,0};
125 const spx_word16_t curr_in=iptr[j];
126 accum[0] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-2]);
127 accum[1] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-1]);
128 accum[2] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset]);
129 accum[3] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset+1]);
132 cubic_coef(frac, interp);
133 sum = MULT16_32_Q15(interp[0],accum[0]) + MULT16_32_Q15(interp[1],accum[1]) + MULT16_32_Q15(interp[2],accum[2]) + MULT16_32_Q15(interp[3],accum[3]);
134 sum = SATURATE32PSHR(sum, 15, 32767);
136 +#ifdef OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
138 cubic_coef(frac, interp);
139 sum = interpolate_product_single(iptr, st->sinc_table + st->oversample + 4 - offset - 2, N, st->oversample, interp);
143 out[out_stride * out_sample++] = sum;
144 last_sample += int_advance;
145 samp_frac_num += frac_advance;
146 if (samp_frac_num >= den_rate)
148 samp_frac_num -= den_rate;
149 @@ -518,33 +524,37 @@ static int resampler_basic_interpolate_d
151 const spx_word16_t frac = PDIV32(SHL32((samp_frac_num*st->oversample) % st->den_rate,15),st->den_rate);
153 const spx_word16_t frac = ((float)((samp_frac_num*st->oversample) % st->den_rate))/st->den_rate;
155 spx_word16_t interp[4];
158 -#ifndef OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE
159 +#ifdef OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE
160 + if (!moz_speex_have_double_simd()) {
163 double accum[4] = {0,0,0,0};
166 const double curr_in=iptr[j];
167 accum[0] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-2]);
168 accum[1] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-1]);
169 accum[2] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset]);
170 accum[3] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset+1]);
173 cubic_coef(frac, interp);
174 sum = MULT16_32_Q15(interp[0],accum[0]) + MULT16_32_Q15(interp[1],accum[1]) + MULT16_32_Q15(interp[2],accum[2]) + MULT16_32_Q15(interp[3],accum[3]);
176 +#ifdef OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE
178 cubic_coef(frac, interp);
179 sum = interpolate_product_double(iptr, st->sinc_table + st->oversample + 4 - offset - 2, N, st->oversample, interp);
183 out[out_stride * out_sample++] = PSHR32(sum,15);
184 last_sample += int_advance;
185 samp_frac_num += frac_advance;
186 if (samp_frac_num >= den_rate)
188 samp_frac_num -= den_rate;
189 diff --git a/src/resample_neon.c b/src/resample_neon.c
190 --- a/src/resample_neon.c
191 +++ b/src/resample_neon.c
193 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
194 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
195 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
196 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
197 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
201 +#include "simd_detect.h"
204 #if defined(__aarch64__)
205 static inline int32_t saturate_32bit_to_16bit(int32_t a) {
207 asm ("fmov s0, %w[a]\n"
209 "sxtl v0.4s, v0.4h\n"
214 #define WORD2INT(x) (saturate_32bit_to_16bit(x))
216 #define OVERRIDE_INNER_PRODUCT_SINGLE
217 /* Only works when len % 4 == 0 and len >= 4 */
218 #if defined(__aarch64__)
219 -static inline int32_t inner_product_single(const int16_t *a, const int16_t *b, unsigned int len)
220 +inline int32_t inner_product_single(const int16_t *a, const int16_t *b, unsigned int len)
223 uint32_t remainder = len % 16;
224 len = len - remainder;
226 asm volatile (" cmp %w[len], #0\n"
228 " ld1 {v16.4h}, [%[b]], #8\n"
229 @@ -128,17 +129,17 @@
230 : [ret] "=r" (ret), [a] "+r" (a), [b] "+r" (b),
231 [len] "+r" (len), [remainder] "+r" (remainder)
234 "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23");
238 -static inline int32_t inner_product_single(const int16_t *a, const int16_t *b, unsigned int len)
239 +inline int32_t inner_product_single(const int16_t *a, const int16_t *b, unsigned int len)
242 uint32_t remainder = len % 16;
243 len = len - remainder;
245 asm volatile (" cmp %[len], #0\n"
247 " vld1.16 {d16}, [%[b]]!\n"
248 @@ -218,17 +219,17 @@
252 #define WORD2INT(x) (saturate_float_to_16bit(x))
254 #define OVERRIDE_INNER_PRODUCT_SINGLE
255 /* Only works when len % 4 == 0 and len >= 4 */
256 #if defined(__aarch64__)
257 -static inline float inner_product_single(const float *a, const float *b, unsigned int len)
258 +inline float inner_product_single(const float *a, const float *b, unsigned int len)
261 uint32_t remainder = len % 16;
262 len = len - remainder;
264 asm volatile (" cmp %w[len], #0\n"
266 " ld1 {v16.4s}, [%[b]], #16\n"
267 @@ -273,17 +274,17 @@
268 : [ret] "=w" (ret), [a] "+r" (a), [b] "+r" (b),
269 [len] "+r" (len), [remainder] "+r" (remainder)
271 : "cc", "v1", "v2", "v3", "v4",
272 "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23");
276 -static inline float inner_product_single(const float *a, const float *b, unsigned int len)
277 +inline float inner_product_single(const float *a, const float *b, unsigned int len)
280 uint32_t remainder = len % 16;
281 len = len - remainder;
283 asm volatile (" cmp %[len], #0\n"
285 " vld1.32 {q4}, [%[b]]!\n"
286 diff --git a/src/resample_sse.c b/src/resample_sse.c
287 --- a/src/resample_sse.c
288 +++ b/src/resample_sse.c
290 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
291 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
292 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
293 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
294 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
295 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
298 +#include "simd_detect.h"
300 #include <xmmintrin.h>
302 #define OVERRIDE_INNER_PRODUCT_SINGLE
303 -static inline float inner_product_single(const float *a, const float *b, unsigned int len)
304 +float inner_product_single(const float *a, const float *b, unsigned int len)
308 __m128 sum = _mm_setzero_ps();
311 sum = _mm_add_ps(sum, _mm_mul_ps(_mm_loadu_ps(a+i), _mm_loadu_ps(b+i)));
312 sum = _mm_add_ps(sum, _mm_mul_ps(_mm_loadu_ps(a+i+4), _mm_loadu_ps(b+i+4)));
314 sum = _mm_add_ps(sum, _mm_movehl_ps(sum, sum));
315 sum = _mm_add_ss(sum, _mm_shuffle_ps(sum, sum, 0x55));
316 _mm_store_ss(&ret, sum);
320 #define OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
321 -static inline float interpolate_product_single(const float *a, const float *b, unsigned int len, const spx_uint32_t oversample, float *frac) {
322 +float interpolate_product_single(const float *a, const float *b, unsigned int len, const spx_uint32_t oversample, float *frac) {
325 __m128 sum = _mm_setzero_ps();
326 __m128 f = _mm_loadu_ps(frac);
329 sum = _mm_add_ps(sum, _mm_mul_ps(_mm_load1_ps(a+i), _mm_loadu_ps(b+i*oversample)));
330 sum = _mm_add_ps(sum, _mm_mul_ps(_mm_load1_ps(a+i+1), _mm_loadu_ps(b+(i+1)*oversample)));
331 @@ -70,17 +72,17 @@ static inline float interpolate_product_
332 _mm_store_ss(&ret, sum);
337 #include <emmintrin.h>
338 #define OVERRIDE_INNER_PRODUCT_DOUBLE
340 -static inline double inner_product_double(const float *a, const float *b, unsigned int len)
341 +double inner_product_double(const float *a, const float *b, unsigned int len)
345 __m128d sum = _mm_setzero_pd();
349 t = _mm_mul_ps(_mm_loadu_ps(a+i), _mm_loadu_ps(b+i));
350 @@ -92,17 +94,17 @@ static inline double inner_product_doubl
351 sum = _mm_add_pd(sum, _mm_cvtps_pd(_mm_movehl_ps(t, t)));
353 sum = _mm_add_sd(sum, _mm_unpackhi_pd(sum, sum));
354 _mm_store_sd(&ret, sum);
358 #define OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE
359 -static inline double interpolate_product_double(const float *a, const float *b, unsigned int len, const spx_uint32_t oversample, float *frac) {
360 +double interpolate_product_double(const float *a, const float *b, unsigned int len, const spx_uint32_t oversample, float *frac) {
364 __m128d sum1 = _mm_setzero_pd();
365 __m128d sum2 = _mm_setzero_pd();
366 __m128 f = _mm_loadu_ps(frac);
367 __m128d f1 = _mm_cvtps_pd(f);
368 __m128d f2 = _mm_cvtps_pd(_mm_movehl_ps(f,f));