libs/soundtouch/3dnow_win.cpp

   1 ////////////////////////////////////////////////////////////////////////////////
   2 ///
   3 /// Win32 version of the AMD 3DNow! optimized routines for AMD K6-2/Athlon
   4 /// processors. All 3DNow! optimized functions have been gathered into this
   5 /// single source code file, regardless of their class or original source code
   6 /// file, in order to ease porting the library to other compilers and processor
   7 /// platforms.
   8 ///
   9 /// By the way, the performance gain depends heavily on the CPU generation: on
  10 /// K6-2 these routines provided a speed-up of as much as 2.4 times, while on
  11 /// Athlon the difference from the original routines stayed at an unremarkable
  12 /// 8%! Such a small improvement on Athlon is because 3DNow! can perform only two
  13 /// operations in parallel, and obviously the Athlon FPU also does a very good
  14 /// job with the standard C floating point routines! These routines are provided
  15 /// here anyway, although it might not be worth the effort to convert them to
  16 /// the GCC platform, for Athlon CPUs at least. The situation is different for
  17 /// the SSE optimizations though, thanks to the four parallel operations of SSE
  18 /// that already make a difference.
  19 ///
  20 /// This file is to be compiled on the Windows platform with the Microsoft Visual
  21 /// C++ compiler. Please see '3dnow_gcc.cpp' for the gcc compiler version for all
  22 /// GNU platforms (if file supplied).
  23 ///
  24 /// NOTICE: If using Visual Studio 6.0, you'll need to install the "Visual C++
  25 /// 6.0 processor pack" update to support 3DNow! instruction set. The update is
  26 /// available for download at Microsoft Developers Network, see here:
  27 /// http://msdn.microsoft.com/vstudio/downloads/tools/ppack/default.aspx
  28 ///
  29 /// If the above URL is expired or removed, go to "http://msdn.microsoft.com" and
  30 /// perform a search with keywords "processor pack".
  31 ///
  32 /// Author        : Copyright (c) Olli Parviainen
  33 /// Author e-mail : oparviai @ iki.fi
  34 /// SoundTouch WWW: http://www.iki.fi/oparviai/soundtouch
  35 ///
  36 ////////////////////////////////////////////////////////////////////////////////
  37 //
  38 // Last changed  : $Date$
  39 // File revision : $Revision$
  40 //
  41 // $Id$
  42 //
  43 ////////////////////////////////////////////////////////////////////////////////
  44 //
  45 // License :
  46 //
  47 //  SoundTouch audio processing library
  48 //  Copyright (c) Olli Parviainen
  49 //
  50 //  This library is free software; you can redistribute it and/or
  51 //  modify it under the terms of the GNU Lesser General Public
  52 //  License as published by the Free Software Foundation; either
  53 //  version 2.1 of the License, or (at your option) any later version.
  54 //
  55 //  This library is distributed in the hope that it will be useful,
  56 //  but WITHOUT ANY WARRANTY; without even the implied warranty of
  57 //  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  58 //  Lesser General Public License for more details.
  59 //
  60 //  You should have received a copy of the GNU Lesser General Public
  61 //  License along with this library; if not, write to the Free Software
  62 //  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  63 //
  64 ////////////////////////////////////////////////////////////////////////////////
  65
  66 #include "cpu_detect.h"
  67 #include "STTypes.h"
  68
  69 #ifndef WIN32
  70 #error "wrong platform - this source code file is exclusively for Win32 platform"
  71 #endif
  72
  73 using namespace soundtouch;
  74
  75 #ifdef ALLOW_3DNOW
  76 // 3DNow! routines available only with float sample type
  77
  78 //////////////////////////////////////////////////////////////////////////////
  79 //
  80 // implementation of 3DNow! optimized functions of class 'TDStretch3DNow'
  81 //
  82 //////////////////////////////////////////////////////////////////////////////
  83
  84 #include "TDStretch.h"
  85 #include <limits.h>
  86
  87 // these are declared in 'TDStretch.cpp'
  88 extern int scanOffsets[4][24];
  89
  90
  91 // Calculates the cross-correlation of two float buffers with 3DNow! instructions.
     //
     // Reads 'overlapLength / 4' rounds of eight floats (32 bytes) from each of 'pV1'
     // and 'pV2', accumulates the element-wise products as two packed partial sums,
     // adds the two halves together and returns the total widened to double.
     //
     // Returns 0 without reading either buffer when 'overlapLength' is below four:
     // the assembly loop tests its counter only after a round has run, so a round
     // count of zero must never reach it.
  92 double TDStretch3DNow::calcCrossCorrStereo(const float *pV1, const float *pV2) const
  93 {
  94     uint overlapLengthLocal = overlapLength;
  95     float corr;
  96
         // 'dec ecx / jnz' below would wrap a zero counter around to 0xFFFFFFFF and run
         // far past the end of both buffers, so bail out before entering the loop.
         if (overlapLengthLocal < 4)
         {
             return 0;
         }

  97     // Calculates the cross-correlation value between 'pV1' and 'pV2' vectors
  98     /*
  99     c-pseudocode:
 100
 101         corr = 0;
 102         for (i = 0; i < overlapLength / 4; i ++)
 103         {
 104             corr += pV1[0] * pV2[0] +
 105                     pV1[1] * pV2[1] +
 106                     pV1[2] * pV2[2] +
 107                     pV1[3] * pV2[3] +
 108                     pV1[4] * pV2[4] +
 109                     pV1[5] * pV2[5] +
 110                     pV1[6] * pV2[6] +
 111                     pV1[7] * pV2[7];
 112
 113             pV1 += 8;
 114             pV2 += 8;
 115         }
 116     */
 117
 118     _asm
 119     {
 120         // give prefetch hints to CPU of what data are to be needed soonish.
 121         // give more aggressive hints on pV1 as that changes more between different calls
 122         // while pV2 stays the same.
             // NOTE(review): a bracketed C symbol in MSVC inline assembly appears to name
             // the variable's own storage, so these three hints may target the stack slots
             // holding the pointers rather than the sample data - confirm.
 123         prefetch [pV1]
 124         prefetch [pV2]
 125         prefetch [pV1 + 32]
 126
 127         mov     eax, dword ptr pV2
 128         mov     ebx, dword ptr pV1
 129
             // mm0 holds the two packed running sums
 130         pxor    mm0, mm0
 131
 132         mov     ecx, overlapLengthLocal
 133         shr     ecx, 2  // div by four
 134
 135     loop1:
             // one round: multiply 4 x 2 packed floats and add the products into mm0
 136         movq    mm1, [eax]
 137         prefetch [eax + 32]     // give a prefetch hint to CPU what data are to be needed soonish
 138         pfmul   mm1, [ebx]
 139         prefetch [ebx + 64]     // give a prefetch hint to CPU what data are to be needed soonish
 140
 141         movq    mm2, [eax + 8]
 142         pfadd   mm0, mm1
 143         pfmul   mm2, [ebx + 8]
 144
 145         movq    mm3, [eax + 16]
 146         pfadd   mm0, mm2
 147         pfmul   mm3, [ebx + 16]
 148
 149         movq    mm4, [eax + 24]
 150         pfadd   mm0, mm3
 151         pfmul   mm4, [ebx + 24]
 152
 153         add     eax, 32
 154         pfadd   mm0, mm4
 155         add     ebx, 32
 156
 157         dec     ecx
 158         jnz     loop1
 159
 160         // add halves of mm0 together and return the result.
 161         // note: mm1 is used as a dummy parameter only, we actually don't care about its value
 162         pfacc   mm0, mm1
 163         movd    corr, mm0
 164         femms
 165     }
 166
 167     return corr;
 168 }
 169
 170
 171
 172
 173 //////////////////////////////////////////////////////////////////////////////
 174 //
 175 // implementation of 3DNow! optimized functions of class 'FIRFilter'
 176 //
 177 //////////////////////////////////////////////////////////////////////////////
 178
 179 #include "FIRFilter.h"
 180
 181 FIRFilter3DNow::FIRFilter3DNow() : FIRFilter()
 182 {
         // No coefficient storage exists until setCoefficients() is called. Both the
         // owning pointer and its aligned alias start out null, so the destructor's
         // delete[] is a no-op and the alias is never left indeterminate.
 183     filterCoeffsUnalign = NULL;
         filterCoeffsAlign = NULL;
 184 }
 185
 186
 187 FIRFilter3DNow::~FIRFilter3DNow()
 188 {
         // Release the raw coefficient allocation; delete[] on a null pointer is a no-op,
         // so this is safe even when no coefficients were ever set.
 189     delete[] this->filterCoeffsUnalign;
 190 }
 191
 192
 193 // (overloaded) Calculates filter coefficients for 3DNow! routine
     //
     // Forwards the arguments to FIRFilter::setCoefficients(), then builds a private
     // copy of the coefficients for the assembly filter: the copy starts on a 16-byte
     // boundary, every coefficient is divided by 'resultDivider' and is stored twice
     // in a row.
     //
     // coeffs           - source coefficients; 'newLength' entries are read
     // newLength        - number of coefficients
     // uResultDivFactor - passed through to the base class unchanged
 194 void FIRFilter3DNow::setCoefficients(const float *coeffs, uint newLength, uint uResultDivFactor)
 195 {
 196     uint i;
 197     float fDivider;
 198
 199     FIRFilter::setCoefficients(coeffs, newLength, uResultDivFactor);
 200
 201     // Scale the filter coefficients so that it won't be necessary to scale the filtering result
 202     // also rearrange coefficients suitably for 3DNow!
 203     // Ensure that filter coeffs array is aligned to 16-byte boundary
 204     delete[] filterCoeffsUnalign;
         // Forget the released block before allocating again: if 'new' throws, the
         // members must not keep pointing at freed memory, otherwise the destructor
         // would delete the same block a second time.
         filterCoeffsUnalign = NULL;
         filterCoeffsAlign = NULL;
         // The four extra floats leave room for rounding the start address up to the
         // next 16-byte boundary.
 205     filterCoeffsUnalign = new float[2 * newLength + 4];
         // NOTE(review): the pointer is cast through 'uint'; presumably fine because
         // this file only builds as 32-bit x86 (inline assembly) - confirm.
 206     filterCoeffsAlign = (float *)(((uint)filterCoeffsUnalign + 15) & -16);
 207
         // presumably 'resultDivider' was just updated by the base-class call above
 208     fDivider = (float)resultDivider;
 209
 210     // rearrange the filter coefficients for the 3DNow! routine: each scaled
     // coefficient is written to two consecutive slots
 211     for (i = 0; i < newLength; i ++)
 212     {
 213         filterCoeffsAlign[2 * i + 0] =
 214         filterCoeffsAlign[2 * i + 1] = coeffs[i + 0] / fDivider;
 215     }
 216 }
 217
 218
 219 // 3DNow!-optimized version of the filter routine for stereo sound
     //
     // Filters 'src' with the coefficient copy prepared by setCoefficients() and writes
     // the result to 'dest'. Each outer round stores four floats (two left/right pairs)
     // and 'count / 2' outer rounds are run, where 'count' is 'numSamples - length'
     // rounded down to an even number.
     //
     // Returns 'count', or 0 without touching either buffer when the input is shorter
     // than the filter, when fewer than two output pairs can be formed, or when the
     // filter has fewer than four taps.
 220 uint FIRFilter3DNow::evaluateFilterStereo(float *dest, const float *src, const uint numSamples) const
 221 {
 222     float *filterCoeffsLocal = filterCoeffsAlign;
 223     uint count = (numSamples - length) & -2;
 224     uint lengthLocal = length / 4;
 225
 226     assert(length != 0);
 227     assert(count % 2 == 0);
 228
         // Both assembly loops decrement their counter before testing it, so a round
         // count of zero would wrap around to 0xFFFFFFFF and overrun the buffers. The
         // first test also catches the unsigned wrap of 'numSamples - length'.
         if (numSamples < length || count == 0 || lengthLocal == 0)
         {
             return 0;
         }

         // NOTE(review): the commented-out reference below does not look like an exact
         // model of the assembly, which forms the second output from a source position
         // one stereo pair later and advances 8 floats per inner round - treat it as
         // illustrative only.
 229     /* original code:
 230
 231     double suml1, suml2;
 232     double sumr1, sumr2;
 233     uint i, j;
 234
 235     for (j = 0; j < count; j += 2)
 236     {
 237         const float *ptr;
 238
 239         suml1 = sumr1 = 0.0;
 240         suml2 = sumr2 = 0.0;
 241         ptr = src;
 242         filterCoeffsLocal = filterCoeffs;
 243         for (i = 0; i < lengthLocal; i ++)
 244         {
 245             // unroll loop for efficiency.
 246
 247             suml1 += ptr[0] * filterCoeffsLocal[0] +
 248                      ptr[2] * filterCoeffsLocal[2] +
 249                      ptr[4] * filterCoeffsLocal[4] +
 250                      ptr[6] * filterCoeffsLocal[6];
 251
 252             sumr1 += ptr[1] * filterCoeffsLocal[1] +
 253                      ptr[3] * filterCoeffsLocal[3] +
 254                      ptr[5] * filterCoeffsLocal[5] +
 255                      ptr[7] * filterCoeffsLocal[7];
 256
 257             suml2 += ptr[8] * filterCoeffsLocal[0] +
 258                      ptr[10] * filterCoeffsLocal[2] +
 259                      ptr[12] * filterCoeffsLocal[4] +
 260                      ptr[14] * filterCoeffsLocal[6];
 261
 262             sumr2 += ptr[9] * filterCoeffsLocal[1] +
 263                      ptr[11] * filterCoeffsLocal[3] +
 264                      ptr[13] * filterCoeffsLocal[5] +
 265                      ptr[15] * filterCoeffsLocal[7];
 266
 267             ptr += 16;
 268             filterCoeffsLocal += 8;
 269         }
 270         dest[0] = (float)suml1;
 271         dest[1] = (float)sumr1;
 272         dest[2] = (float)suml2;
 273         dest[3] = (float)sumr2;
 274
 275         src += 4;
 276         dest += 4;
 277     }
 278
 279     */
 280     _asm
 281     {
             // eax = output cursor, ebx = input cursor, edx = number of outer rounds
 282         mov     eax, dword ptr dest
 283         mov     ebx, dword ptr src
 284         mov     edx, count
 285         shr     edx, 1
 286
 287     loop1:
 288         // "outer loop" : during each round 2*2 output samples are calculated
 289         prefetch  [ebx]                 // give a prefetch hint to CPU what data are to be needed soonish
 290         prefetch  [filterCoeffsLocal]   // give a prefetch hint to CPU what data are to be needed soonish
 291
             // restart at the first coefficient and clear both packed accumulators
 292         mov     esi, ebx
 293         mov     edi, filterCoeffsLocal
 294         pxor    mm0, mm0
 295         pxor    mm1, mm1
 296         mov     ecx, lengthLocal
 297
 298     loop2:
 299         // "inner loop" : during each round four FIR filter taps are evaluated for 2*2 output samples
             // mm0 accumulates the first output pair, mm1 the second one, which reads the
             // source 8 bytes (one left/right pair) further on
 300         movq    mm2, [edi]
 301         movq    mm3, mm2
 302         prefetch  [edi + 32]     // give a prefetch hint to CPU what data are to be needed soonish
 303         pfmul   mm2, [esi]
 304         prefetch  [esi + 32]     // give a prefetch hint to CPU what data are to be needed soonish
 305         pfmul   mm3, [esi + 8]
 306
 307         movq    mm4, [edi + 8]
 308         movq    mm5, mm4
 309         pfadd   mm0, mm2
 310         pfmul   mm4, [esi + 8]
 311         pfadd   mm1, mm3
 312         pfmul   mm5, [esi + 16]
 313
 314         movq    mm2, [edi + 16]
 315         movq    mm6, mm2
 316         pfadd   mm0, mm4
 317         pfmul   mm2, [esi + 16]
 318         pfadd   mm1, mm5
 319         pfmul   mm6, [esi + 24]
 320
 321         movq    mm3, [edi + 24]
 322         movq    mm7, mm3
 323         pfadd   mm0, mm2
 324         pfmul   mm3, [esi + 24]
 325         pfadd   mm1, mm6
 326         pfmul   mm7, [esi + 32]
 327         add     esi, 32
 328         pfadd   mm0, mm3
 329         add     edi, 32
 330         pfadd   mm1, mm7
 331
 332         dec     ecx
 333         jnz     loop2
 334
             // store both output pairs and move input and output on by 16 bytes
 335         movq    [eax], mm0
 336         add     ebx, 16
 337         movq    [eax + 8], mm1
 338         add     eax, 16
 339
 340         dec     edx
 341         jnz     loop1
 342
 343         femms
 344     }
 345
 346     return count;
 347 }
 348
 349
 350 #endif  // ALLOW_3DNOW