Bug 1852740: add tests for the `fetchpriority` attribute in Link headers. r=necko...
[gecko.git] / media / libspeex_resampler / 02_simd-detect-runtime.patch
blobbf23c8ac8e5263e8acbdc7bdaa90cffe3a4d5f67
1 diff --git a/src/resample.c b/src/resample.c
2 --- a/src/resample.c
3 +++ b/src/resample.c
4 @@ -91,23 +91,17 @@ static void speex_free(void *ptr) {free(
5 #ifndef NULL
6 #define NULL 0
7 #endif
9 #ifndef UINT32_MAX
10 #define UINT32_MAX 4294967295U
11 #endif
13 -#ifdef USE_SSE
14 -#include "resample_sse.h"
15 -#endif
17 -#ifdef USE_NEON
18 -#include "resample_neon.h"
19 -#endif
20 +#include "simd_detect.h"
22 /* Number of elements to allocate on the stack */
23 #ifdef VAR_ARRAYS
24 #define FIXED_STACK_ALLOC 8192
25 #else
26 #define FIXED_STACK_ALLOC 1024
27 #endif
29 @@ -341,17 +335,19 @@ static int resampler_basic_direct_single
30 const spx_uint32_t den_rate = st->den_rate;
31 spx_word32_t sum;
33 while (!(last_sample >= (spx_int32_t)*in_len || out_sample >= (spx_int32_t)*out_len))
35 const spx_word16_t *sinct = & sinc_table[samp_frac_num*N];
36 const spx_word16_t *iptr = & in[last_sample];
38 -#ifndef OVERRIDE_INNER_PRODUCT_SINGLE
39 +#ifdef OVERRIDE_INNER_PRODUCT_SINGLE
40 + if (!moz_speex_have_single_simd()) {
41 +#endif
42 int j;
43 sum = 0;
44 for(j=0;j<N;j++) sum += MULT16_16(sinct[j], iptr[j]);
46 /* This code is slower on most DSPs which have only 2 accumulators.
47 Plus this this forces truncation to 32 bits and you lose the HW guard bits.
48 I think we can trust the compiler and let it vectorize and/or unroll itself.
49 spx_word32_t accum[4] = {0,0,0,0};
50 @@ -359,18 +355,20 @@ static int resampler_basic_direct_single
51 accum[0] += MULT16_16(sinct[j], iptr[j]);
52 accum[1] += MULT16_16(sinct[j+1], iptr[j+1]);
53 accum[2] += MULT16_16(sinct[j+2], iptr[j+2]);
54 accum[3] += MULT16_16(sinct[j+3], iptr[j+3]);
56 sum = accum[0] + accum[1] + accum[2] + accum[3];
58 sum = SATURATE32PSHR(sum, 15, 32767);
59 -#else
60 +#ifdef OVERRIDE_INNER_PRODUCT_SINGLE
61 + } else {
62 sum = inner_product_single(sinct, iptr, N);
63 + }
64 #endif
66 out[out_stride * out_sample++] = sum;
67 last_sample += int_advance;
68 samp_frac_num += frac_advance;
69 if (samp_frac_num >= den_rate)
71 samp_frac_num -= den_rate;
72 @@ -399,29 +397,33 @@ static int resampler_basic_direct_double
73 const spx_uint32_t den_rate = st->den_rate;
74 double sum;
76 while (!(last_sample >= (spx_int32_t)*in_len || out_sample >= (spx_int32_t)*out_len))
78 const spx_word16_t *sinct = & sinc_table[samp_frac_num*N];
79 const spx_word16_t *iptr = & in[last_sample];
81 -#ifndef OVERRIDE_INNER_PRODUCT_DOUBLE
82 +#ifdef OVERRIDE_INNER_PRODUCT_DOUBLE
83 + if(!moz_speex_have_double_simd()) { /* scalar fallback; the else branch calls inner_product_double() */
84 +#endif
85 int j;
86 double accum[4] = {0,0,0,0};
88 for(j=0;j<N;j+=4) {
89 accum[0] += sinct[j]*iptr[j];
90 accum[1] += sinct[j+1]*iptr[j+1];
91 accum[2] += sinct[j+2]*iptr[j+2];
92 accum[3] += sinct[j+3]*iptr[j+3];
94 sum = accum[0] + accum[1] + accum[2] + accum[3];
95 -#else
96 +#ifdef OVERRIDE_INNER_PRODUCT_DOUBLE
97 + } else {
98 sum = inner_product_double(sinct, iptr, N);
99 + }
100 #endif
102 out[out_stride * out_sample++] = PSHR32(sum, 15);
103 last_sample += int_advance;
104 samp_frac_num += frac_advance;
105 if (samp_frac_num >= den_rate)
107 samp_frac_num -= den_rate;
108 @@ -455,34 +457,38 @@ static int resampler_basic_interpolate_s
109 #ifdef FIXED_POINT
110 const spx_word16_t frac = PDIV32(SHL32((samp_frac_num*st->oversample) % st->den_rate,15),st->den_rate);
111 #else
112 const spx_word16_t frac = ((float)((samp_frac_num*st->oversample) % st->den_rate))/st->den_rate;
113 #endif
114 spx_word16_t interp[4];
117 -#ifndef OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
118 +#ifdef OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
119 + if (!moz_speex_have_single_simd()) {
120 +#endif
121 int j;
122 spx_word32_t accum[4] = {0,0,0,0};
124 for(j=0;j<N;j++) {
125 const spx_word16_t curr_in=iptr[j];
126 accum[0] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-2]);
127 accum[1] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-1]);
128 accum[2] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset]);
129 accum[3] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset+1]);
132 cubic_coef(frac, interp);
133 sum = MULT16_32_Q15(interp[0],accum[0]) + MULT16_32_Q15(interp[1],accum[1]) + MULT16_32_Q15(interp[2],accum[2]) + MULT16_32_Q15(interp[3],accum[3]);
134 sum = SATURATE32PSHR(sum, 15, 32767);
135 -#else
136 +#ifdef OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
137 + } else {
138 cubic_coef(frac, interp);
139 sum = interpolate_product_single(iptr, st->sinc_table + st->oversample + 4 - offset - 2, N, st->oversample, interp);
140 + }
141 #endif
143 out[out_stride * out_sample++] = sum;
144 last_sample += int_advance;
145 samp_frac_num += frac_advance;
146 if (samp_frac_num >= den_rate)
148 samp_frac_num -= den_rate;
149 @@ -518,33 +524,37 @@ static int resampler_basic_interpolate_d
150 #ifdef FIXED_POINT
151 const spx_word16_t frac = PDIV32(SHL32((samp_frac_num*st->oversample) % st->den_rate,15),st->den_rate);
152 #else
153 const spx_word16_t frac = ((float)((samp_frac_num*st->oversample) % st->den_rate))/st->den_rate;
154 #endif
155 spx_word16_t interp[4];
158 -#ifndef OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE
159 +#ifdef OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE
160 + if (!moz_speex_have_double_simd()) {
161 +#endif
162 int j;
163 double accum[4] = {0,0,0,0};
165 for(j=0;j<N;j++) {
166 const double curr_in=iptr[j];
167 accum[0] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-2]);
168 accum[1] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-1]);
169 accum[2] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset]);
170 accum[3] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset+1]);
173 cubic_coef(frac, interp);
174 sum = MULT16_32_Q15(interp[0],accum[0]) + MULT16_32_Q15(interp[1],accum[1]) + MULT16_32_Q15(interp[2],accum[2]) + MULT16_32_Q15(interp[3],accum[3]);
175 -#else
176 +#ifdef OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE
177 + } else {
178 cubic_coef(frac, interp);
179 sum = interpolate_product_double(iptr, st->sinc_table + st->oversample + 4 - offset - 2, N, st->oversample, interp);
180 + }
181 #endif
183 out[out_stride * out_sample++] = PSHR32(sum,15);
184 last_sample += int_advance;
185 samp_frac_num += frac_advance;
186 if (samp_frac_num >= den_rate)
188 samp_frac_num -= den_rate;
189 diff --git a/src/resample_neon.c b/src/resample_neon.c
190 --- a/src/resample_neon.c
191 +++ b/src/resample_neon.c
192 @@ -32,16 +32,17 @@
193 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
194 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
195 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
196 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
197 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
200 #include <stdint.h>
201 +#include "simd_detect.h"
203 #ifdef FIXED_POINT
204 #if defined(__aarch64__)
205 static inline int32_t saturate_32bit_to_16bit(int32_t a) {
206 int32_t ret;
207 asm ("fmov s0, %w[a]\n"
208 "sqxtn h0, s0\n"
209 "sxtl v0.4s, v0.4h\n"
210 @@ -73,17 +74,17 @@
212 #endif
213 #undef WORD2INT
214 #define WORD2INT(x) (saturate_32bit_to_16bit(x))
216 #define OVERRIDE_INNER_PRODUCT_SINGLE
217 /* Only works when len % 4 == 0 and len >= 4 */
218 #if defined(__aarch64__)
219 -static inline int32_t inner_product_single(const int16_t *a, const int16_t *b, unsigned int len)
220 +inline int32_t inner_product_single(const int16_t *a, const int16_t *b, unsigned int len)
222 int32_t ret;
223 uint32_t remainder = len % 16;
224 len = len - remainder;
226 asm volatile (" cmp %w[len], #0\n"
227 " b.ne 1f\n"
228 " ld1 {v16.4h}, [%[b]], #8\n"
229 @@ -128,17 +129,17 @@
230 : [ret] "=r" (ret), [a] "+r" (a), [b] "+r" (b),
231 [len] "+r" (len), [remainder] "+r" (remainder)
233 : "cc", "v0",
234 "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23");
235 return ret;
237 #else
238 -static inline int32_t inner_product_single(const int16_t *a, const int16_t *b, unsigned int len)
239 +inline int32_t inner_product_single(const int16_t *a, const int16_t *b, unsigned int len)
241 int32_t ret;
242 uint32_t remainder = len % 16;
243 len = len - remainder;
245 asm volatile (" cmp %[len], #0\n"
246 " bne 1f\n"
247 " vld1.16 {d16}, [%[b]]!\n"
248 @@ -218,17 +219,17 @@
249 #endif
251 #undef WORD2INT
252 #define WORD2INT(x) (saturate_float_to_16bit(x))
254 #define OVERRIDE_INNER_PRODUCT_SINGLE
255 /* Only works when len % 4 == 0 and len >= 4 */
256 #if defined(__aarch64__)
257 -static inline float inner_product_single(const float *a, const float *b, unsigned int len)
258 +inline float inner_product_single(const float *a, const float *b, unsigned int len)
260 float ret;
261 uint32_t remainder = len % 16;
262 len = len - remainder;
264 asm volatile (" cmp %w[len], #0\n"
265 " b.ne 1f\n"
266 " ld1 {v16.4s}, [%[b]], #16\n"
267 @@ -273,17 +274,17 @@
268 : [ret] "=w" (ret), [a] "+r" (a), [b] "+r" (b),
269 [len] "+r" (len), [remainder] "+r" (remainder)
271 : "cc", "v1", "v2", "v3", "v4",
272 "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23");
273 return ret;
275 #else
276 -static inline float inner_product_single(const float *a, const float *b, unsigned int len)
277 +inline float inner_product_single(const float *a, const float *b, unsigned int len)
279 float ret;
280 uint32_t remainder = len % 16;
281 len = len - remainder;
283 asm volatile (" cmp %[len], #0\n"
284 " bne 1f\n"
285 " vld1.32 {q4}, [%[b]]!\n"
286 diff --git a/src/resample_sse.c b/src/resample_sse.c
287 --- a/src/resample_sse.c
288 +++ b/src/resample_sse.c
289 @@ -29,37 +29,39 @@
290 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
291 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
292 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
293 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
294 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
295 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
298 +#include "simd_detect.h"
300 #include <xmmintrin.h>
302 #define OVERRIDE_INNER_PRODUCT_SINGLE
303 -static inline float inner_product_single(const float *a, const float *b, unsigned int len)
304 +float inner_product_single(const float *a, const float *b, unsigned int len)
306 int i;
307 float ret;
308 __m128 sum = _mm_setzero_ps();
309 for (i=0;i<len;i+=8)
311 sum = _mm_add_ps(sum, _mm_mul_ps(_mm_loadu_ps(a+i), _mm_loadu_ps(b+i)));
312 sum = _mm_add_ps(sum, _mm_mul_ps(_mm_loadu_ps(a+i+4), _mm_loadu_ps(b+i+4)));
314 sum = _mm_add_ps(sum, _mm_movehl_ps(sum, sum));
315 sum = _mm_add_ss(sum, _mm_shuffle_ps(sum, sum, 0x55));
316 _mm_store_ss(&ret, sum);
317 return ret;
320 #define OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
321 -static inline float interpolate_product_single(const float *a, const float *b, unsigned int len, const spx_uint32_t oversample, float *frac) {
322 +float interpolate_product_single(const float *a, const float *b, unsigned int len, const spx_uint32_t oversample, float *frac) {
323 int i;
324 float ret;
325 __m128 sum = _mm_setzero_ps();
326 __m128 f = _mm_loadu_ps(frac);
327 for(i=0;i<len;i+=2)
329 sum = _mm_add_ps(sum, _mm_mul_ps(_mm_load1_ps(a+i), _mm_loadu_ps(b+i*oversample)));
330 sum = _mm_add_ps(sum, _mm_mul_ps(_mm_load1_ps(a+i+1), _mm_loadu_ps(b+(i+1)*oversample)));
331 @@ -70,17 +72,17 @@ static inline float interpolate_product_
332 _mm_store_ss(&ret, sum);
333 return ret;
336 #ifdef USE_SSE2
337 #include <emmintrin.h>
338 #define OVERRIDE_INNER_PRODUCT_DOUBLE
340 -static inline double inner_product_double(const float *a, const float *b, unsigned int len)
341 +double inner_product_double(const float *a, const float *b, unsigned int len)
343 int i;
344 double ret;
345 __m128d sum = _mm_setzero_pd();
346 __m128 t;
347 for (i=0;i<len;i+=8)
349 t = _mm_mul_ps(_mm_loadu_ps(a+i), _mm_loadu_ps(b+i));
350 @@ -92,17 +94,17 @@ static inline double inner_product_doubl
351 sum = _mm_add_pd(sum, _mm_cvtps_pd(_mm_movehl_ps(t, t)));
353 sum = _mm_add_sd(sum, _mm_unpackhi_pd(sum, sum));
354 _mm_store_sd(&ret, sum);
355 return ret;
358 #define OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE
359 -static inline double interpolate_product_double(const float *a, const float *b, unsigned int len, const spx_uint32_t oversample, float *frac) {
360 +double interpolate_product_double(const float *a, const float *b, unsigned int len, const spx_uint32_t oversample, float *frac) {
361 int i;
362 double ret;
363 __m128d sum;
364 __m128d sum1 = _mm_setzero_pd();
365 __m128d sum2 = _mm_setzero_pd();
366 __m128 f = _mm_loadu_ps(frac);
367 __m128d f1 = _mm_cvtps_pd(f);
368 __m128d f2 = _mm_cvtps_pd(_mm_movehl_ps(f,f));