media/libspeex_resampler/simd-detect-runtime.patch

   1 diff --git a/media/libspeex_resampler/src/resample.c b/media/libspeex_resampler/src/resample.c
   2 --- a/media/libspeex_resampler/src/resample.c
   3 +++ b/media/libspeex_resampler/src/resample.c
   4 @@ -92,23 +92,17 @@ static void speex_free (void *ptr) {free
   5
   6  #define IMAX(a,b) ((a) > (b) ? (a) : (b))
   7  #define IMIN(a,b) ((a) < (b) ? (a) : (b))
   8
   9  #ifndef NULL
  10  #define NULL 0
  11  #endif
  12
  13 -#ifdef _USE_SSE
  14 -#include "resample_sse.h"
  15 -#endif
  16 -
  17 -#ifdef _USE_NEON
  18 -#include "resample_neon.h"
  19 -#endif
  20 +#include "simd_detect.h"
  21
  22  /* Numer of elements to allocate on the stack */
  23  #ifdef VAR_ARRAYS
  24  #define FIXED_STACK_ALLOC 8192
  25  #else
  26  #define FIXED_STACK_ALLOC 1024
  27  #endif
  28
  29 @@ -344,17 +338,19 @@ static int resampler_basic_direct_single
  30     const spx_uint32_t den_rate = st->den_rate;
  31     spx_word32_t sum;
  32
  33     while (!(last_sample >= (spx_int32_t)*in_len || out_sample >= (spx_int32_t)*out_len))
  34     {
  35        const spx_word16_t *sinct = & sinc_table[samp_frac_num*N];
  36        const spx_word16_t *iptr = & in[last_sample];
  37
  38 -#ifndef OVERRIDE_INNER_PRODUCT_SINGLE
  39 +#ifdef OVERRIDE_INNER_PRODUCT_SINGLE
  40 +      if (!moz_speex_have_single_simd()) {
  41 +#endif
  42        int j;
  43        sum = 0;
  44        for(j=0;j<N;j++) sum += MULT16_16(sinct[j], iptr[j]);
  45
  46  /*    This code is slower on most DSPs which have only 2 accumulators.
  47        Plus this this forces truncation to 32 bits and you lose the HW guard bits.
  48        I think we can trust the compiler and let it vectorize and/or unroll itself.
  49        spx_word32_t accum[4] = {0,0,0,0};
  50 @@ -362,18 +358,20 @@ static int resampler_basic_direct_single
  51          accum[0] += MULT16_16(sinct[j], iptr[j]);
  52          accum[1] += MULT16_16(sinct[j+1], iptr[j+1]);
  53          accum[2] += MULT16_16(sinct[j+2], iptr[j+2]);
  54          accum[3] += MULT16_16(sinct[j+3], iptr[j+3]);
  55        }
  56        sum = accum[0] + accum[1] + accum[2] + accum[3];
  57  */
  58        sum = SATURATE32PSHR(sum, 15, 32767);
  59 -#else
  60 +#ifdef OVERRIDE_INNER_PRODUCT_SINGLE
  61 +      } else {
  62        sum = inner_product_single(sinct, iptr, N);
  63 +      }
  64  #endif
  65
  66        out[out_stride * out_sample++] = sum;
  67        last_sample += int_advance;
  68        samp_frac_num += frac_advance;
  69        if (samp_frac_num >= den_rate)
  70        {
  71           samp_frac_num -= den_rate;
  72 @@ -402,29 +400,33 @@ static int resampler_basic_direct_double
  73     const spx_uint32_t den_rate = st->den_rate;
  74     double sum;
  75
  76     while (!(last_sample >= (spx_int32_t)*in_len || out_sample >= (spx_int32_t)*out_len))
  77     {
  78        const spx_word16_t *sinct = & sinc_table[samp_frac_num*N];
  79        const spx_word16_t *iptr = & in[last_sample];
  80
  81 -#ifndef OVERRIDE_INNER_PRODUCT_DOUBLE
  82 +#ifdef OVERRIDE_INNER_PRODUCT_DOUBLE
  83 +      if(!moz_speex_have_double_simd()) { /* plain C loop when moz_speex_have_double_simd() is false */
  84 +#endif
  85        int j;
  86        double accum[4] = {0,0,0,0};
  87
  88        for(j=0;j<N;j+=4) {
  89          accum[0] += sinct[j]*iptr[j];
  90          accum[1] += sinct[j+1]*iptr[j+1];
  91          accum[2] += sinct[j+2]*iptr[j+2];
  92          accum[3] += sinct[j+3]*iptr[j+3];
  93        }
  94        sum = accum[0] + accum[1] + accum[2] + accum[3];
  95 -#else
  96 +#ifdef OVERRIDE_INNER_PRODUCT_DOUBLE
  97 +      } else {
  98        sum = inner_product_double(sinct, iptr, N);
  99 +      }
 100  #endif
 101
 102        out[out_stride * out_sample++] = PSHR32(sum, 15);
 103        last_sample += int_advance;
 104        samp_frac_num += frac_advance;
 105        if (samp_frac_num >= den_rate)
 106        {
 107           samp_frac_num -= den_rate;
 108 @@ -458,34 +460,38 @@ static int resampler_basic_interpolate_s
 109  #ifdef FIXED_POINT
 110        const spx_word16_t frac = PDIV32(SHL32((samp_frac_num*st->oversample) % st->den_rate,15),st->den_rate);
 111  #else
 112        const spx_word16_t frac = ((float)((samp_frac_num*st->oversample) % st->den_rate))/st->den_rate;
 113  #endif
 114        spx_word16_t interp[4];
 115
 116
 117 -#ifndef OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
 118 +#ifdef OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
 119 +      if (!moz_speex_have_single_simd()) {
 120 +#endif
 121        int j;
 122        spx_word32_t accum[4] = {0,0,0,0};
 123
 124        for(j=0;j<N;j++) {
 125          const spx_word16_t curr_in=iptr[j];
 126          accum[0] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-2]);
 127          accum[1] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-1]);
 128          accum[2] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset]);
 129          accum[3] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset+1]);
 130        }
 131
 132        cubic_coef(frac, interp);
 133        sum = MULT16_32_Q15(interp[0],SHR32(accum[0], 1)) + MULT16_32_Q15(interp[1],SHR32(accum[1], 1)) + MULT16_32_Q15(interp[2],SHR32(accum[2], 1)) + MULT16_32_Q15(interp[3],SHR32(accum[3], 1));
 134        sum = SATURATE32PSHR(sum, 15, 32767);
 135 -#else
 136 +#ifdef OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
 137 +      } else {
 138        cubic_coef(frac, interp);
 139        sum = interpolate_product_single(iptr, st->sinc_table + st->oversample + 4 - offset - 2, N, st->oversample, interp);
 140 +      }
 141  #endif
 142
 143        out[out_stride * out_sample++] = sum;
 144        last_sample += int_advance;
 145        samp_frac_num += frac_advance;
 146        if (samp_frac_num >= den_rate)
 147        {
 148           samp_frac_num -= den_rate;
 149 @@ -521,33 +527,37 @@ static int resampler_basic_interpolate_d
 150  #ifdef FIXED_POINT
 151        const spx_word16_t frac = PDIV32(SHL32((samp_frac_num*st->oversample) % st->den_rate,15),st->den_rate);
 152  #else
 153        const spx_word16_t frac = ((float)((samp_frac_num*st->oversample) % st->den_rate))/st->den_rate;
 154  #endif
 155        spx_word16_t interp[4];
 156
 157
 158 -#ifndef OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE
 159 +#ifdef OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE
 160 +      if (!moz_speex_have_double_simd()) {
 161 +#endif
 162        int j;
 163        double accum[4] = {0,0,0,0};
 164
 165        for(j=0;j<N;j++) {
 166          const double curr_in=iptr[j];
 167          accum[0] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-2]);
 168          accum[1] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-1]);
 169          accum[2] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset]);
 170          accum[3] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset+1]);
 171        }
 172
 173        cubic_coef(frac, interp);
 174        sum = MULT16_32_Q15(interp[0],accum[0]) + MULT16_32_Q15(interp[1],accum[1]) + MULT16_32_Q15(interp[2],accum[2]) + MULT16_32_Q15(interp[3],accum[3]);
 175 -#else
 176 +#ifdef OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE
 177 +      } else {
 178        cubic_coef(frac, interp);
 179        sum = interpolate_product_double(iptr, st->sinc_table + st->oversample + 4 - offset - 2, N, st->oversample, interp);
 180 +      }
 181  #endif
 182
 183        out[out_stride * out_sample++] = PSHR32(sum,15);
 184        last_sample += int_advance;
 185        samp_frac_num += frac_advance;
 186        if (samp_frac_num >= den_rate)
 187        {
 188           samp_frac_num -= den_rate;
 189 diff --git a/media/libspeex_resampler/src/resample_neon.c b/media/libspeex_resampler/src/resample_neon.c
 190 --- a/media/libspeex_resampler/src/resample_neon.c
 191 +++ b/media/libspeex_resampler/src/resample_neon.c
 192 @@ -31,16 +31,18 @@
 193     EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 194     PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 195     PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 196     LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 197     NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 198     SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 199  */
 200
 201 +#include "simd_detect.h"
 202 +
 203  #include <arm_neon.h>
 204
 205  #ifdef FIXED_POINT
 206  #ifdef __thumb2__
 207  static inline int32_t saturate_32bit_to_16bit(int32_t a) {
 208      int32_t ret;
 209      asm ("ssat %[ret], #16, %[a]"
 210           : [ret] "=&r" (ret)
 211 @@ -60,17 +62,17 @@ static inline int32_t saturate_32bit_to_
 212      return ret;
 213  }
 214  #endif
 215  #undef WORD2INT
 216  #define WORD2INT(x) (saturate_32bit_to_16bit(x))
 217
 218  #define OVERRIDE_INNER_PRODUCT_SINGLE
 219  /* Only works when len % 4 == 0 */
 220 -static inline int32_t inner_product_single(const int16_t *a, const int16_t *b, unsigned int len)
 221 +int32_t inner_product_single(const int16_t *a, const int16_t *b, unsigned int len)
 222  {
 223      int32_t ret;
 224      uint32_t remainder = len % 16;
 225      len = len - remainder;
 226
 227      asm volatile ("     cmp %[len], #0\n"
 228                   "      bne 1f\n"
 229                   "      vld1.16 {d16}, [%[b]]!\n"
 230 @@ -134,17 +136,17 @@ static inline int32_t saturate_float_to_
 231           : "q0");
 232      return ret;
 233  }
 234  #undef WORD2INT
 235  #define WORD2INT(x) (saturate_float_to_16bit(x))
 236
 237  #define OVERRIDE_INNER_PRODUCT_SINGLE
 238  /* Only works when len % 4 == 0 */
 239 -static inline float inner_product_single(const float *a, const float *b, unsigned int len)
 240 +float inner_product_single(const float *a, const float *b, unsigned int len)
 241  {
 242      float ret;
 243      uint32_t remainder = len % 16;
 244      len = len - remainder;
 245
 246      asm volatile ("     cmp %[len], #0\n"
 247                   "      bne 1f\n"
 248                   "      vld1.32 {q4}, [%[b]]!\n"
 249 diff --git a/media/libspeex_resampler/src/resample_sse.c b/media/libspeex_resampler/src/resample_sse.c
 250 --- a/media/libspeex_resampler/src/resample_sse.c
 251 +++ b/media/libspeex_resampler/src/resample_sse.c
 252 @@ -29,37 +29,39 @@
 253     EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 254     PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 255     PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 256     LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 257     NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 258     SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 259  */
 260
 261 +#include "simd_detect.h"
 262 +
 263  #include <xmmintrin.h>
 264
 265  #define OVERRIDE_INNER_PRODUCT_SINGLE
 266 -static inline float inner_product_single(const float *a, const float *b, unsigned int len)
 267 +float inner_product_single(const float *a, const float *b, unsigned int len)
 268  {
 269     int i;
 270     float ret;
 271     __m128 sum = _mm_setzero_ps();
 272     for (i=0;i<len;i+=8)
 273     {
 274        sum = _mm_add_ps(sum, _mm_mul_ps(_mm_loadu_ps(a+i), _mm_loadu_ps(b+i)));
 275        sum = _mm_add_ps(sum, _mm_mul_ps(_mm_loadu_ps(a+i+4), _mm_loadu_ps(b+i+4)));
 276     }
 277     sum = _mm_add_ps(sum, _mm_movehl_ps(sum, sum));
 278     sum = _mm_add_ss(sum, _mm_shuffle_ps(sum, sum, 0x55));
 279     _mm_store_ss(&ret, sum);
 280     return ret;
 281  }
 282
 283  #define OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
 284 -static inline float interpolate_product_single(const float *a, const float *b, unsigned int len, const spx_uint32_t oversample, float *frac) {
 285 +float interpolate_product_single(const float *a, const float *b, unsigned int len, const spx_uint32_t oversample, float *frac) {
 286    int i;
 287    float ret;
 288    __m128 sum = _mm_setzero_ps();
 289    __m128 f = _mm_loadu_ps(frac);
 290    for(i=0;i<len;i+=2)
 291    {
 292      sum = _mm_add_ps(sum, _mm_mul_ps(_mm_load1_ps(a+i), _mm_loadu_ps(b+i*oversample)));
 293      sum = _mm_add_ps(sum, _mm_mul_ps(_mm_load1_ps(a+i+1), _mm_loadu_ps(b+(i+1)*oversample)));
 294 @@ -70,17 +72,17 @@ static inline float interpolate_product_
 295     _mm_store_ss(&ret, sum);
 296     return ret;
 297  }
 298
 299  #ifdef _USE_SSE2
 300  #include <emmintrin.h>
 301  #define OVERRIDE_INNER_PRODUCT_DOUBLE
 302
 303 -static inline double inner_product_double(const float *a, const float *b, unsigned int len)
 304 +double inner_product_double(const float *a, const float *b, unsigned int len)
 305  {
 306     int i;
 307     double ret;
 308     __m128d sum = _mm_setzero_pd();
 309     __m128 t;
 310     for (i=0;i<len;i+=8)
 311     {
 312        t = _mm_mul_ps(_mm_loadu_ps(a+i), _mm_loadu_ps(b+i));
 313 @@ -92,17 +94,17 @@ static inline double inner_product_doubl
 314        sum = _mm_add_pd(sum, _mm_cvtps_pd(_mm_movehl_ps(t, t)));
 315     }
 316     sum = _mm_add_sd(sum, _mm_unpackhi_pd(sum, sum));
 317     _mm_store_sd(&ret, sum);
 318     return ret;
 319  }
 320
 321  #define OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE
 322 -static inline double interpolate_product_double(const float *a, const float *b, unsigned int len, const spx_uint32_t oversample, float *frac) {
 323 +double interpolate_product_double(const float *a, const float *b, unsigned int len, const spx_uint32_t oversample, float *frac) {
 324    int i;
 325    double ret;
 326    __m128d sum;
 327    __m128d sum1 = _mm_setzero_pd();
 328    __m128d sum2 = _mm_setzero_pd();
 329    __m128 f = _mm_loadu_ps(frac);
 330    __m128d f1 = _mm_cvtps_pd(f);
 331    __m128d f2 = _mm_cvtps_pd(_mm_movehl_ps(f,f));