libs/soundtouch/sse_win.cpp

   1 ////////////////////////////////////////////////////////////////////////////////
   2 ///
/// Win32 version of the SSE optimized routines for Pentium-III, Athlon-XP and
/// later. All SSE optimized functions have been gathered into this single source
/// code file, regardless of their class or original source code file, in order
/// to ease porting the library to other compiler and processor platforms.
   7 ///
   8 /// NOTICE: If using Visual Studio 6.0, you'll need to install the "Visual C++
   9 /// 6.0 processor pack" update to support SSE instruction set. The update is
  10 /// available for download at Microsoft Developers Network, see here:
  11 /// http://msdn.microsoft.com/vstudio/downloads/tools/ppack/default.aspx
  12 ///
  13 /// If the above URL is expired or removed, go to "http://msdn.microsoft.com" and
  14 /// perform a search with keywords "processor pack".
  15 ///
  16 /// This file is to be compiled in Windows platform with Microsoft Visual C++
  17 /// Compiler. Please see 'sse_gcc.cpp' for the gcc compiler version for all
  18 /// GNU platforms (if file supplied).
  19 ///
  20 /// Author        : Copyright (c) Olli Parviainen
  21 /// Author e-mail : oparviai @ iki.fi
  22 /// SoundTouch WWW: http://www.iki.fi/oparviai/soundtouch
  23 ///
  24 ////////////////////////////////////////////////////////////////////////////////
  25 //
  26 // Last changed  : $Date$
  27 // File revision : $Revision$
  28 //
  29 // $Id$
  30 //
  31 ////////////////////////////////////////////////////////////////////////////////
  32 //
  33 // License :
  34 //
  35 //  SoundTouch audio processing library
  36 //  Copyright (c) Olli Parviainen
  37 //
  38 //  This library is free software; you can redistribute it and/or
  39 //  modify it under the terms of the GNU Lesser General Public
  40 //  License as published by the Free Software Foundation; either
  41 //  version 2.1 of the License, or (at your option) any later version.
  42 //
  43 //  This library is distributed in the hope that it will be useful,
  44 //  but WITHOUT ANY WARRANTY; without even the implied warranty of
  45 //  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  46 //  Lesser General Public License for more details.
  47 //
  48 //  You should have received a copy of the GNU Lesser General Public
  49 //  License along with this library; if not, write to the Free Software
  50 //  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  51 //
  52 ////////////////////////////////////////////////////////////////////////////////
  53
#include <stddef.h>

#include "cpu_detect.h"
#include "STTypes.h"
  56
  57 #ifndef WIN32
  58 #error "wrong platform - this source code file is exclusively for Win32 platform"
  59 #endif
  60
  61 using namespace soundtouch;
  62
  63 #ifdef ALLOW_SSE
  64 // SSE routines available only with float sample type
  65
  66 //////////////////////////////////////////////////////////////////////////////
  67 //
  68 // implementation of SSE optimized functions of class 'TDStretchSSE'
  69 //
  70 //////////////////////////////////////////////////////////////////////////////
  71
  72 #include "TDStretch.h"
  73 #include <limits.h>
  74
  75 // these are declared in 'TDStretch.cpp'
  76 extern int scanOffsets[4][24];
  77
  78 // Calculates cross correlation of two buffers
  79 double TDStretchSSE::calcCrossCorrStereo(const float *pV1, const float *pV2) const
  80 {
  81     uint overlapLengthLocal = overlapLength;
  82     float corr;
  83
  84     /*
  85     double corr;
  86     uint i;
  87
  88     // Calculates the cross-correlation value between 'pV1' and 'pV2' vectors
  89     corr = 0.0;
  90     for (i = 0; i < overlapLength / 8; i ++)
  91     {
  92         corr += pV1[0] * pV2[0] +
  93                 pV1[1] * pV2[1] +
  94                 pV1[2] * pV2[2] +
  95                 pV1[3] * pV2[3] +
  96                 pV1[4] * pV2[4] +
  97                 pV1[5] * pV2[5] +
  98                 pV1[6] * pV2[6] +
  99                 pV1[7] * pV2[7] +
 100                 pV1[8] * pV2[8] +
 101                 pV1[9] * pV2[9] +
 102                 pV1[10] * pV2[10] +
 103                 pV1[11] * pV2[11] +
 104                 pV1[12] * pV2[12] +
 105                 pV1[13] * pV2[13] +
 106                 pV1[14] * pV2[14] +
 107                 pV1[15] * pV2[15];
 108
 109         pV1 += 16;
 110         pV2 += 16;
 111     }
 112     */
 113
 114     _asm
 115     {
 116         // Very important note: data in 'pV2' _must_ be aligned to
 117         // 16-byte boundary!
 118
 119         // give prefetch hints to CPU of what data are to be needed soonish
 120         // give more aggressive hints on pV1 as that changes while pV2 stays
 121         // same between runs
 122         prefetcht0 [pV1]
 123         prefetcht0 [pV2]
 124         prefetcht0 [pV1 + 32]
 125
 126         mov     eax, dword ptr pV1
 127         mov     ebx, dword ptr pV2
 128
 129         xorps   xmm0, xmm0
 130
 131         mov     ecx, overlapLengthLocal
 132         shr     ecx, 3  // div by eight
 133
 134     loop1:
 135         prefetcht0 [eax + 64]     // give a prefetch hint to CPU what data are to be needed soonish
 136         prefetcht0 [ebx + 32]     // give a prefetch hint to CPU what data are to be needed soonish
 137         movups  xmm1, [eax]
 138         mulps   xmm1, [ebx]
 139         addps   xmm0, xmm1
 140
 141         movups  xmm2, [eax + 16]
 142         mulps   xmm2, [ebx + 16]
 143         addps   xmm0, xmm2
 144
 145         prefetcht0 [eax + 96]     // give a prefetch hint to CPU what data are to be needed soonish
 146         prefetcht0 [ebx + 64]     // give a prefetch hint to CPU what data are to be needed soonish
 147
 148         movups  xmm3, [eax + 32]
 149         mulps   xmm3, [ebx + 32]
 150         addps   xmm0, xmm3
 151
 152         movups  xmm4, [eax + 48]
 153         mulps   xmm4, [ebx + 48]
 154         addps   xmm0, xmm4
 155
 156         add     eax, 64
 157         add     ebx, 64
 158
 159         dec     ecx
 160         jnz     loop1
 161
 162         // add the four floats of xmm0 together and return the result.
 163
 164         movhlps xmm1, xmm0          // move 3 & 4 of xmm0 to 1 & 2 of xmm1
 165         addps   xmm1, xmm0
 166         movaps  xmm2, xmm1
 167         shufps  xmm2, xmm2, 0x01    // move 2 of xmm2 as 1 of xmm2
 168         addss   xmm2, xmm1
 169         movss   corr, xmm2
 170     }
 171
 172     return (double)corr;
 173 }
 174
 175
 176 //////////////////////////////////////////////////////////////////////////////
 177 //
 178 // implementation of SSE optimized functions of class 'FIRFilter'
 179 //
 180 //////////////////////////////////////////////////////////////////////////////
 181
 182 #include "FIRFilter.h"
 183
 184 FIRFilterSSE::FIRFilterSSE() : FIRFilter()
 185 {
 186     filterCoeffsUnalign = NULL;
 187 }
 188
 189
 190 FIRFilterSSE::~FIRFilterSSE()
 191 {
 192     delete[] filterCoeffsUnalign;
 193 }
 194
 195
 196 // (overloaded) Calculates filter coefficients for SSE routine
 197 void FIRFilterSSE::setCoefficients(const float *coeffs, uint newLength, uint uResultDivFactor)
 198 {
 199     uint i;
 200     float fDivider;
 201
 202     FIRFilter::setCoefficients(coeffs, newLength, uResultDivFactor);
 203
 204     // Scale the filter coefficients so that it won't be necessary to scale the filtering result
 205     // also rearrange coefficients suitably for 3DNow!
 206     // Ensure that filter coeffs array is aligned to 16-byte boundary
 207     delete[] filterCoeffsUnalign;
 208     filterCoeffsUnalign = new float[2 * newLength + 4];
 209     filterCoeffsAlign = (float *)(((uint)filterCoeffsUnalign + 15) & -16);
 210
 211     fDivider = (float)resultDivider;
 212
 213     // rearrange the filter coefficients for mmx routines
 214     for (i = 0; i < newLength; i ++)
 215     {
 216         filterCoeffsAlign[2 * i + 0] =
 217         filterCoeffsAlign[2 * i + 1] = coeffs[i + 0] / fDivider;
 218     }
 219 }
 220
 221
 222
 223 // SSE-optimized version of the filter routine for stereo sound
 224 uint FIRFilterSSE::evaluateFilterStereo(float *dest, const float *src, const uint numSamples) const
 225 {
 226     int count = (numSamples - length) & -2;
 227     uint lengthLocal = length / 8;
 228     float *filterCoeffsLocal = filterCoeffsAlign;
 229
 230     assert(count % 2 == 0);
 231
 232     if (count < 2) return 0;
 233
 234     /*
 235     double suml1, suml2;
 236     double sumr1, sumr2;
 237     uint i, j;
 238
 239     for (j = 0; j < count; j += 2)
 240     {
 241         const float *ptr;
 242         const float *pFil;
 243
 244         suml1 = sumr1 = 0.0;
 245         suml2 = sumr2 = 0.0;
 246         ptr = src;
 247         pFil = filterCoeffs;
 248         for (i = 0; i < lengthLocal; i ++)
 249         {
 250             // unroll loop for efficiency.
 251
 252             suml1 += ptr[0] * pFil[0] +
 253                      ptr[2] * pFil[2] +
 254                      ptr[4] * pFil[4] +
 255                      ptr[6] * pFil[6];
 256
 257             sumr1 += ptr[1] * pFil[1] +
 258                      ptr[3] * pFil[3] +
 259                      ptr[5] * pFil[5] +
 260                      ptr[7] * pFil[7];
 261
 262             suml2 += ptr[8] * pFil[0] +
 263                      ptr[10] * pFil[2] +
 264                      ptr[12] * pFil[4] +
 265                      ptr[14] * pFil[6];
 266
 267             sumr2 += ptr[9] * pFil[1] +
 268                      ptr[11] * pFil[3] +
 269                      ptr[13] * pFil[5] +
 270                      ptr[15] * pFil[7];
 271
 272             ptr += 16;
 273             pFil += 8;
 274         }
 275         dest[0] = (float)suml1;
 276         dest[1] = (float)sumr1;
 277         dest[2] = (float)suml2;
 278         dest[3] = (float)sumr2;
 279
 280         src += 4;
 281         dest += 4;
 282     }
 283     */
 284
 285     _asm
 286     {
 287         // Very important note: data in 'src' _must_ be aligned to
 288         // 16-byte boundary!
 289         mov     edx, count
 290         mov     ebx, dword ptr src
 291         mov     eax, dword ptr dest
 292         shr     edx, 1
 293
 294     loop1:
 295         // "outer loop" : during each round 2*2 output samples are calculated
 296
 297         // give prefetch hints to CPU of what data are to be needed soonish
 298         prefetcht0 [ebx]
 299         prefetcht0 [filterCoeffsLocal]
 300
 301         mov     esi, ebx
 302         mov     edi, filterCoeffsLocal
 303         xorps   xmm0, xmm0
 304         xorps   xmm1, xmm1
 305         mov     ecx, lengthLocal
 306
 307     loop2:
 308         // "inner loop" : during each round eight FIR filter taps are evaluated for 2*2 samples
 309         prefetcht0 [esi + 32]     // give a prefetch hint to CPU what data are to be needed soonish
 310         prefetcht0 [edi + 32]     // give a prefetch hint to CPU what data are to be needed soonish
 311
 312         movups  xmm2, [esi]         // possibly unaligned load
 313         movups  xmm3, [esi + 8]     // possibly unaligned load
 314         mulps   xmm2, [edi]
 315         mulps   xmm3, [edi]
 316         addps   xmm0, xmm2
 317         addps   xmm1, xmm3
 318
 319         movups  xmm4, [esi + 16]    // possibly unaligned load
 320         movups  xmm5, [esi + 24]    // possibly unaligned load
 321         mulps   xmm4, [edi + 16]
 322         mulps   xmm5, [edi + 16]
 323         addps   xmm0, xmm4
 324         addps   xmm1, xmm5
 325
 326         prefetcht0 [esi + 64]     // give a prefetch hint to CPU what data are to be needed soonish
 327         prefetcht0 [edi + 64]     // give a prefetch hint to CPU what data are to be needed soonish
 328
 329         movups  xmm6, [esi + 32]    // possibly unaligned load
 330         movups  xmm7, [esi + 40]    // possibly unaligned load
 331         mulps   xmm6, [edi + 32]
 332         mulps   xmm7, [edi + 32]
 333         addps   xmm0, xmm6
 334         addps   xmm1, xmm7
 335
 336         movups  xmm4, [esi + 48]    // possibly unaligned load
 337         movups  xmm5, [esi + 56]    // possibly unaligned load
 338         mulps   xmm4, [edi + 48]
 339         mulps   xmm5, [edi + 48]
 340         addps   xmm0, xmm4
 341         addps   xmm1, xmm5
 342
 343         add     esi, 64
 344         add     edi, 64
 345         dec     ecx
 346         jnz     loop2
 347
 348         // Now xmm0 and xmm1 both have a filtered 2-channel sample each, but we still need
 349         // to sum the two hi- and lo-floats of these registers together.
 350
 351         movhlps xmm2, xmm0          // xmm2 = xmm2_3 xmm2_2 xmm0_3 xmm0_2
 352         movlhps xmm2, xmm1          // xmm2 = xmm1_1 xmm1_0 xmm0_3 xmm0_2
 353         shufps  xmm0, xmm1, 0xe4    // xmm0 = xmm1_3 xmm1_2 xmm0_1 xmm0_0
 354         addps   xmm0, xmm2
 355
 356         movaps  [eax], xmm0
 357         add     ebx, 16
 358         add     eax, 16
 359
 360         dec     edx
 361         jnz     loop1
 362     }
 363
 364     return (uint)count;
 365 }
 366
 367 #endif  // ALLOW_SSE