1 diff --git a/src/resample.c b/src/resample.c
4 @@ -91,23 +91,17 @@ static void speex_free(void *ptr) {free(
10 #define UINT32_MAX 4294967295U
14 -#include "resample_sse.h"
18 -#include "resample_neon.h"
20 +#include "simd_detect.h"
22 /* Number of elements to allocate on the stack */
24 #define FIXED_STACK_ALLOC 8192
26 #define FIXED_STACK_ALLOC 1024
29 @@ -341,17 +335,19 @@ static int resampler_basic_direct_single
30 const spx_uint32_t den_rate = st->den_rate;
33 while (!(last_sample >= (spx_int32_t)*in_len || out_sample >= (spx_int32_t)*out_len))
35 const spx_word16_t *sinct = & sinc_table[samp_frac_num*N];
36 const spx_word16_t *iptr = & in[last_sample];
38 -#ifndef OVERRIDE_INNER_PRODUCT_SINGLE
39 +#ifdef OVERRIDE_INNER_PRODUCT_SINGLE
40 + if (!moz_speex_have_single_simd()) {
44 for(j=0;j<N;j++) sum += MULT16_16(sinct[j], iptr[j]);
46 /* This code is slower on most DSPs which have only 2 accumulators.
47 Plus this this forces truncation to 32 bits and you lose the HW guard bits.
48 I think we can trust the compiler and let it vectorize and/or unroll itself.
49 spx_word32_t accum[4] = {0,0,0,0};
50 @@ -359,18 +355,20 @@ static int resampler_basic_direct_single
51 accum[0] += MULT16_16(sinct[j], iptr[j]);
52 accum[1] += MULT16_16(sinct[j+1], iptr[j+1]);
53 accum[2] += MULT16_16(sinct[j+2], iptr[j+2]);
54 accum[3] += MULT16_16(sinct[j+3], iptr[j+3]);
56 sum = accum[0] + accum[1] + accum[2] + accum[3];
58 sum = SATURATE32PSHR(sum, 15, 32767);
60 +#ifdef OVERRIDE_INNER_PRODUCT_SINGLE
62 sum = inner_product_single(sinct, iptr, N);
66 out[out_stride * out_sample++] = sum;
67 last_sample += int_advance;
68 samp_frac_num += frac_advance;
69 if (samp_frac_num >= den_rate)
71 samp_frac_num -= den_rate;
72 @@ -399,29 +397,33 @@ static int resampler_basic_direct_double
73 const spx_uint32_t den_rate = st->den_rate;
76 while (!(last_sample >= (spx_int32_t)*in_len || out_sample >= (spx_int32_t)*out_len))
78 const spx_word16_t *sinct = & sinc_table[samp_frac_num*N];
79 const spx_word16_t *iptr = & in[last_sample];
81 -#ifndef OVERRIDE_INNER_PRODUCT_DOUBLE
82 +#ifdef OVERRIDE_INNER_PRODUCT_DOUBLE
83 + if(!moz_speex_have_double_simd()) {
86 double accum[4] = {0,0,0,0};
89 accum[0] += sinct[j]*iptr[j];
90 accum[1] += sinct[j+1]*iptr[j+1];
91 accum[2] += sinct[j+2]*iptr[j+2];
92 accum[3] += sinct[j+3]*iptr[j+3];
94 sum = accum[0] + accum[1] + accum[2] + accum[3];
96 +#ifdef OVERRIDE_INNER_PRODUCT_DOUBLE
98 sum = inner_product_double(sinct, iptr, N);
102 out[out_stride * out_sample++] = PSHR32(sum, 15);
103 last_sample += int_advance;
104 samp_frac_num += frac_advance;
105 if (samp_frac_num >= den_rate)
107 samp_frac_num -= den_rate;
108 @@ -455,34 +457,38 @@ static int resampler_basic_interpolate_s
110 const spx_word16_t frac = PDIV32(SHL32((samp_frac_num*st->oversample) % st->den_rate,15),st->den_rate);
112 const spx_word16_t frac = ((float)((samp_frac_num*st->oversample) % st->den_rate))/st->den_rate;
114 spx_word16_t interp[4];
117 -#ifndef OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
118 +#ifdef OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
119 + if (!moz_speex_have_single_simd()) {
122 spx_word32_t accum[4] = {0,0,0,0};
125 const spx_word16_t curr_in=iptr[j];
126 accum[0] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-2]);
127 accum[1] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-1]);
128 accum[2] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset]);
129 accum[3] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset+1]);
132 cubic_coef(frac, interp);
133 sum = MULT16_32_Q15(interp[0],accum[0]) + MULT16_32_Q15(interp[1],accum[1]) + MULT16_32_Q15(interp[2],accum[2]) + MULT16_32_Q15(interp[3],accum[3]);
134 sum = SATURATE32PSHR(sum, 15, 32767);
136 +#ifdef OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
138 cubic_coef(frac, interp);
139 sum = interpolate_product_single(iptr, st->sinc_table + st->oversample + 4 - offset - 2, N, st->oversample, interp);
143 out[out_stride * out_sample++] = sum;
144 last_sample += int_advance;
145 samp_frac_num += frac_advance;
146 if (samp_frac_num >= den_rate)
148 samp_frac_num -= den_rate;
149 @@ -518,33 +524,37 @@ static int resampler_basic_interpolate_d
151 const spx_word16_t frac = PDIV32(SHL32((samp_frac_num*st->oversample) % st->den_rate,15),st->den_rate);
153 const spx_word16_t frac = ((float)((samp_frac_num*st->oversample) % st->den_rate))/st->den_rate;
155 spx_word16_t interp[4];
158 -#ifndef OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE
159 +#ifdef OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE
160 + if (!moz_speex_have_double_simd()) {
163 double accum[4] = {0,0,0,0};
166 const double curr_in=iptr[j];
167 accum[0] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-2]);
168 accum[1] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-1]);
169 accum[2] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset]);
170 accum[3] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset+1]);
173 cubic_coef(frac, interp);
174 sum = MULT16_32_Q15(interp[0],accum[0]) + MULT16_32_Q15(interp[1],accum[1]) + MULT16_32_Q15(interp[2],accum[2]) + MULT16_32_Q15(interp[3],accum[3]);
176 +#ifdef OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE
178 cubic_coef(frac, interp);
179 sum = interpolate_product_double(iptr, st->sinc_table + st->oversample + 4 - offset - 2, N, st->oversample, interp);
183 out[out_stride * out_sample++] = PSHR32(sum,15);
184 last_sample += int_advance;
185 samp_frac_num += frac_advance;
186 if (samp_frac_num >= den_rate)
188 samp_frac_num -= den_rate;
189 diff --git a/src/resample_neon.c b/src/resample_neon.c
190 --- a/src/resample_neon.c
191 +++ b/src/resample_neon.c
193 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
194 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
195 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
196 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
197 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
201 +#include "simd_detect.h"
204 #if defined(__aarch64__)
205 static inline int32_t saturate_32bit_to_16bit(int32_t a) {
207 asm ("fmov s0, %w[a]\n"
209 "sxtl v0.4s, v0.4h\n"
214 #define WORD2INT(x) (saturate_32bit_to_16bit(x))
216 #define OVERRIDE_INNER_PRODUCT_SINGLE
217 /* Only works when len % 4 == 0 and len >= 4 */
218 #if defined(__aarch64__)
219 -static inline int32_t inner_product_single(const int16_t *a, const int16_t *b, unsigned int len)
220 +inline int32_t inner_product_single(const int16_t *a, const int16_t *b, unsigned int len)
223 uint32_t remainder = len % 16;
224 len = len - remainder;
226 asm volatile (" cmp %w[len], #0\n"
228 " ld1 {v16.4h}, [%[b]], #8\n"
229 @@ -128,17 +129,17 @@
230 : [ret] "=r" (ret), [a] "+r" (a), [b] "+r" (b),
231 [len] "+r" (len), [remainder] "+r" (remainder)
234 "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23");
238 -static inline int32_t inner_product_single(const int16_t *a, const int16_t *b, unsigned int len)
239 +inline int32_t inner_product_single(const int16_t *a, const int16_t *b, unsigned int len)
242 uint32_t remainder = len % 16;
243 len = len - remainder;
245 asm volatile (" cmp %[len], #0\n"
247 " vld1.16 {d16}, [%[b]]!\n"
248 @@ -218,17 +219,17 @@
252 #define WORD2INT(x) (saturate_float_to_16bit(x))
254 #define OVERRIDE_INNER_PRODUCT_SINGLE
255 /* Only works when len % 4 == 0 and len >= 4 */
256 #if defined(__aarch64__)
257 -static inline float inner_product_single(const float *a, const float *b, unsigned int len)
258 +inline float inner_product_single(const float *a, const float *b, unsigned int len)
261 uint32_t remainder = len % 16;
262 len = len - remainder;
264 asm volatile (" cmp %w[len], #0\n"
266 " ld1 {v16.4s}, [%[b]], #16\n"
267 @@ -273,17 +274,17 @@
268 : [ret] "=w" (ret), [a] "+r" (a), [b] "+r" (b),
269 [len] "+r" (len), [remainder] "+r" (remainder)
271 : "cc", "v1", "v2", "v3", "v4",
272 "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23");
276 -static inline float inner_product_single(const float *a, const float *b, unsigned int len)
277 +inline float inner_product_single(const float *a, const float *b, unsigned int len)
280 uint32_t remainder = len % 16;
281 len = len - remainder;
283 asm volatile (" cmp %[len], #0\n"
285 " vld1.32 {q4}, [%[b]]!\n"
286 diff --git a/src/resample_sse.c b/src/resample_sse.c
287 --- a/src/resample_sse.c
288 +++ b/src/resample_sse.c
290 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
291 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
292 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
293 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
294 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
295 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
298 +#include "simd_detect.h"
300 #include <xmmintrin.h>
302 #define OVERRIDE_INNER_PRODUCT_SINGLE
303 -static inline float inner_product_single(const float *a, const float *b, unsigned int len)
304 +float inner_product_single(const float *a, const float *b, unsigned int len)
308 __m128 sum = _mm_setzero_ps();
311 sum = _mm_add_ps(sum, _mm_mul_ps(_mm_loadu_ps(a+i), _mm_loadu_ps(b+i)));
312 sum = _mm_add_ps(sum, _mm_mul_ps(_mm_loadu_ps(a+i+4), _mm_loadu_ps(b+i+4)));
314 sum = _mm_add_ps(sum, _mm_movehl_ps(sum, sum));
315 sum = _mm_add_ss(sum, _mm_shuffle_ps(sum, sum, 0x55));
316 _mm_store_ss(&ret, sum);
320 #define OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
321 -static inline float interpolate_product_single(const float *a, const float *b, unsigned int len, const spx_uint32_t oversample, float *frac) {
322 +float interpolate_product_single(const float *a, const float *b, unsigned int len, const spx_uint32_t oversample, float *frac) {
325 __m128 sum = _mm_setzero_ps();
326 __m128 f = _mm_loadu_ps(frac);
329 sum = _mm_add_ps(sum, _mm_mul_ps(_mm_load1_ps(a+i), _mm_loadu_ps(b+i*oversample)));
330 sum = _mm_add_ps(sum, _mm_mul_ps(_mm_load1_ps(a+i+1), _mm_loadu_ps(b+(i+1)*oversample)));
331 @@ -70,17 +72,17 @@ static inline float interpolate_product_
332 _mm_store_ss(&ret, sum);
337 #include <emmintrin.h>
338 #define OVERRIDE_INNER_PRODUCT_DOUBLE
340 -static inline double inner_product_double(const float *a, const float *b, unsigned int len)
341 +double inner_product_double(const float *a, const float *b, unsigned int len)
345 __m128d sum = _mm_setzero_pd();
349 t = _mm_mul_ps(_mm_loadu_ps(a+i), _mm_loadu_ps(b+i));
350 @@ -92,17 +94,17 @@ static inline double inner_product_doubl
351 sum = _mm_add_pd(sum, _mm_cvtps_pd(_mm_movehl_ps(t, t)));
353 sum = _mm_add_sd(sum, _mm_unpackhi_pd(sum, sum));
354 _mm_store_sd(&ret, sum);
358 #define OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE
359 -static inline double interpolate_product_double(const float *a, const float *b, unsigned int len, const spx_uint32_t oversample, float *frac) {
360 +double interpolate_product_double(const float *a, const float *b, unsigned int len, const spx_uint32_t oversample, float *frac) {
364 __m128d sum1 = _mm_setzero_pd();
365 __m128d sum2 = _mm_setzero_pd();
366 __m128 f = _mm_loadu_ps(frac);
367 __m128d f1 = _mm_cvtps_pd(f);
368 __m128d f2 = _mm_cvtps_pd(_mm_movehl_ps(f,f));