Alc/mixer_sse41.c

   1 /**
   2  * OpenAL cross platform audio library
   3  * Copyright (C) 1999-2014 by authors.
   4  * This library is free software; you can redistribute it and/or
   5  *  modify it under the terms of the GNU Library General Public
   6  *  License as published by the Free Software Foundation; either
   7  *  version 2 of the License, or (at your option) any later version.
   8  *
   9  * This library is distributed in the hope that it will be useful,
  10  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12  *  Library General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU Library General Public
  15  *  License along with this library; if not, write to the
  16  *  Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  17  *  Boston, MA  02111-1307, USA.
  18  * Or go to http://www.gnu.org/copyleft/lgpl.html
  19  */
  20
  21 #include "config.h"
  22
  23 #include <xmmintrin.h>
  24 #include <emmintrin.h>
  25 #include <smmintrin.h>
  26
  27 #include "alu.h"
  28 #include "mixer_defs.h"
  29
  30
  31 static inline void InitiatePositionArrays(ALuint frac, ALuint increment,
  32                                           ALuint *frac_arr, ALuint *pos_arr)
  33 {
  34     ALuint frac_tmp;
  35
  36     pos_arr[0] = 0;
  37     frac_arr[0] = frac;
  38
  39     frac_tmp = frac_arr[0] + increment;
  40     pos_arr[1] = pos_arr[0] + (frac_tmp>>FRACTIONBITS);
  41     frac_arr[1] = frac_tmp & FRACTIONMASK;
  42
  43     frac_tmp = frac_arr[1] + increment;
  44     pos_arr[2] = pos_arr[1] + (frac_tmp>>FRACTIONBITS);
  45     frac_arr[2] = frac_tmp & FRACTIONMASK;
  46
  47     frac_tmp = frac_arr[2] + increment;
  48     pos_arr[3] = pos_arr[2] + (frac_tmp>>FRACTIONBITS);
  49     frac_arr[3] = frac_tmp & FRACTIONMASK;
  50 }
  51
  52 const ALfloat *Resample_lerp32_SSE41(const ALfloat *src, ALuint frac, ALuint increment,
  53                                      ALfloat *restrict dst, ALuint numsamples)
  54 {
  55     const __m128i increment4 = _mm_set1_epi32(increment*4);
  56     const __m128 fracOne4 = _mm_set1_ps(1.0f/FRACTIONONE);
  57     const __m128i fracMask4 = _mm_set1_epi32(FRACTIONMASK);
  58     alignas(16) union { ALuint i[4]; float f[4]; } pos_;
  59     alignas(16) union { ALuint i[4]; float f[4]; } frac_;
  60     __m128i frac4, pos4;
  61     ALuint pos;
  62     ALuint i;
  63
  64     InitiatePositionArrays(frac, increment, frac_.i, pos_.i);
  65
  66     frac4 = _mm_castps_si128(_mm_load_ps(frac_.f));
  67     pos4 = _mm_castps_si128(_mm_load_ps(pos_.f));
  68
  69     for(i = 0;i < numsamples-3;i += 4)
  70     {
  71         const __m128 val1 = _mm_setr_ps(src[pos_.i[0]], src[pos_.i[1]], src[pos_.i[2]], src[pos_.i[3]]);
  72         const __m128 val2 = _mm_setr_ps(src[pos_.i[0]+1], src[pos_.i[1]+1], src[pos_.i[2]+1], src[pos_.i[3]+1]);
  73
  74         /* val1 + (val2-val1)*mu */
  75         const __m128 r0 = _mm_sub_ps(val2, val1);
  76         const __m128 mu = _mm_mul_ps(_mm_cvtepi32_ps(frac4), fracOne4);
  77         const __m128 out = _mm_add_ps(val1, _mm_mul_ps(mu, r0));
  78
  79         _mm_store_ps(&dst[i], out);
  80
  81         frac4 = _mm_add_epi32(frac4, increment4);
  82         pos4 = _mm_add_epi32(pos4, _mm_srli_epi32(frac4, FRACTIONBITS));
  83         frac4 = _mm_and_si128(frac4, fracMask4);
  84
  85         pos_.i[0] = _mm_extract_epi32(pos4, 0);
  86         pos_.i[1] = _mm_extract_epi32(pos4, 1);
  87         pos_.i[2] = _mm_extract_epi32(pos4, 2);
  88         pos_.i[3] = _mm_extract_epi32(pos4, 3);
  89     }
  90
  91     pos = pos_.i[0];
  92     frac = _mm_cvtsi128_si32(frac4);
  93
  94     for(;i < numsamples;i++)
  95     {
  96         dst[i] = lerp(src[pos], src[pos+1], frac * (1.0f/FRACTIONONE));
  97
  98         frac += increment;
  99         pos  += frac>>FRACTIONBITS;
 100         frac &= FRACTIONMASK;
 101     }
 102     return dst;
 103 }