remove a bunch of explicit uses of '/' as a directory separator; use Glib::build_file...
[ardour2.git] / libs / soundtouch / 3dnow_win.cpp
blob0d593214b7355d97bf178709dca7a603f66caaf9
1 ////////////////////////////////////////////////////////////////////////////////
2 ///
3 /// Win32 version of the AMD 3DNow! optimized routines for AMD K6-2/Athlon
4 /// processors. All 3DNow! optimized functions have been gathered into this
5 /// single source code file, regardless of their class or original source code
6 /// file, in order to ease porting the library to other compiler and processor
7 /// platforms.
8 ///
9 /// By the way; the performance gain depends heavily on the CPU generation: On
10 /// K6-2 these routines provided speed-up of even 2.4 times, while on Athlon the
11 /// difference to the original routines stayed at unremarkable 8%! Such a small
12 /// improvement on Athlon is because 3DNow! can perform only two operations in
13 /// parallel, and obviously also the Athlon FPU is doing a very good job with
14 /// the standard C floating point routines! Here these routines are anyway,
15 /// although it might not be worth the effort to convert these to GCC platform,
16 /// for Athlon CPU at least. The situation is different regarding the SSE
17 /// optimizations though, thanks to the four parallel operations of SSE that
18 /// already make a difference.
19 ///
20 /// This file is to be compiled on the Windows platform with Microsoft Visual C++
21 /// Compiler. Please see '3dnow_gcc.cpp' for the gcc compiler version for all
22 /// GNU platforms (if file supplied).
23 ///
24 /// NOTICE: If using Visual Studio 6.0, you'll need to install the "Visual C++
25 /// 6.0 processor pack" update to support 3DNow! instruction set. The update is
26 /// available for download at Microsoft Developers Network, see here:
27 /// http://msdn.microsoft.com/vstudio/downloads/tools/ppack/default.aspx
28 ///
29 /// If the above URL is expired or removed, go to "http://msdn.microsoft.com" and
30 /// perform a search with keywords "processor pack".
31 ///
32 /// Author : Copyright (c) Olli Parviainen
33 /// Author e-mail : oparviai @ iki.fi
34 /// SoundTouch WWW: http://www.iki.fi/oparviai/soundtouch
35 ///
36 ////////////////////////////////////////////////////////////////////////////////
38 // Last changed : $Date$
39 // File revision : $Revision$
41 // $Id$
43 ////////////////////////////////////////////////////////////////////////////////
45 // License :
47 // SoundTouch audio processing library
48 // Copyright (c) Olli Parviainen
50 // This library is free software; you can redistribute it and/or
51 // modify it under the terms of the GNU Lesser General Public
52 // License as published by the Free Software Foundation; either
53 // version 2.1 of the License, or (at your option) any later version.
55 // This library is distributed in the hope that it will be useful,
56 // but WITHOUT ANY WARRANTY; without even the implied warranty of
57 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
58 // Lesser General Public License for more details.
60 // You should have received a copy of the GNU Lesser General Public
61 // License along with this library; if not, write to the Free Software
62 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
64 ////////////////////////////////////////////////////////////////////////////////
66 #include "cpu_detect.h"
67 #include "STTypes.h"
69 #ifndef WIN32
70 #error "wrong platform - this source code file is exclusively for Win32 platform"
71 #endif
73 using namespace soundtouch;
75 #ifdef ALLOW_3DNOW
76 // 3DNow! routines available only with float sample type
78 //////////////////////////////////////////////////////////////////////////////
80 // implementation of 3DNow! optimized functions of class 'TDStretch3DNow'
82 //////////////////////////////////////////////////////////////////////////////
84 #include "TDStretch.h"
85 #include <limits.h>
87 // these are declared in 'TDStretch.cpp'
88 extern int scanOffsets[4][24];
91 // Calculates cross correlation of two buffers
92 double TDStretch3DNow::calcCrossCorrStereo(const float *pV1, const float *pV2) const
94 uint overlapLengthLocal = overlapLength;
95 float corr;
97 // Calculates the cross-correlation value between 'pV1' and 'pV2' vectors
99 c-pseudocode:
101 corr = 0;
102 for (i = 0; i < overlapLength / 4; i ++)
104 corr += pV1[0] * pV2[0];
105 pV1[1] * pV2[1];
106 pV1[2] * pV2[2];
107 pV1[3] * pV2[3];
108 pV1[4] * pV2[4];
109 pV1[5] * pV2[5];
110 pV1[6] * pV2[6];
111 pV1[7] * pV2[7];
113 pV1 += 8;
114 pV2 += 8;
118 _asm
120 // give prefetch hints to CPU of what data are to be needed soonish.
121 // give more aggressive hints on pV1 as that changes more between different calls
122 // while pV2 stays the same.
123 prefetch [pV1]
124 prefetch [pV2]
125 prefetch [pV1 + 32]
127 mov eax, dword ptr pV2
128 mov ebx, dword ptr pV1
130 pxor mm0, mm0
132 mov ecx, overlapLengthLocal
133 shr ecx, 2 // div by four
135 loop1:
136 movq mm1, [eax]
137 prefetch [eax + 32] // give a prefetch hint to CPU what data are to be needed soonish
138 pfmul mm1, [ebx]
139 prefetch [ebx + 64] // give a prefetch hint to CPU what data are to be needed soonish
141 movq mm2, [eax + 8]
142 pfadd mm0, mm1
143 pfmul mm2, [ebx + 8]
145 movq mm3, [eax + 16]
146 pfadd mm0, mm2
147 pfmul mm3, [ebx + 16]
149 movq mm4, [eax + 24]
150 pfadd mm0, mm3
151 pfmul mm4, [ebx + 24]
153 add eax, 32
154 pfadd mm0, mm4
155 add ebx, 32
157 dec ecx
158 jnz loop1
160 // add halfs of mm0 together and return the result.
161 // note: mm1 is used as a dummy parameter only, we actually don't care about it's value
162 pfacc mm0, mm1
163 movd corr, mm0
164 femms
167 return corr;
173 //////////////////////////////////////////////////////////////////////////////
175 // implementation of 3DNow! optimized functions of class 'FIRFilter'
177 //////////////////////////////////////////////////////////////////////////////
179 #include "FIRFilter.h"
181 FIRFilter3DNow::FIRFilter3DNow() : FIRFilter()
183 filterCoeffsUnalign = NULL;
187 FIRFilter3DNow::~FIRFilter3DNow()
189 delete[] filterCoeffsUnalign;
193 // (overloaded) Calculates filter coefficients for 3DNow! routine
194 void FIRFilter3DNow::setCoefficients(const float *coeffs, uint newLength, uint uResultDivFactor)
196 uint i;
197 float fDivider;
199 FIRFilter::setCoefficients(coeffs, newLength, uResultDivFactor);
201 // Scale the filter coefficients so that it won't be necessary to scale the filtering result
202 // also rearrange coefficients suitably for 3DNow!
203 // Ensure that filter coeffs array is aligned to 16-byte boundary
204 delete[] filterCoeffsUnalign;
205 filterCoeffsUnalign = new float[2 * newLength + 4];
206 filterCoeffsAlign = (float *)(((uint)filterCoeffsUnalign + 15) & -16);
208 fDivider = (float)resultDivider;
210 // rearrange the filter coefficients for mmx routines
211 for (i = 0; i < newLength; i ++)
213 filterCoeffsAlign[2 * i + 0] =
214 filterCoeffsAlign[2 * i + 1] = coeffs[i + 0] / fDivider;
219 // 3DNow!-optimized version of the filter routine for stereo sound
220 uint FIRFilter3DNow::evaluateFilterStereo(float *dest, const float *src, const uint numSamples) const
222 float *filterCoeffsLocal = filterCoeffsAlign;
223 uint count = (numSamples - length) & -2;
224 uint lengthLocal = length / 4;
226 assert(length != 0);
227 assert(count % 2 == 0);
229 /* original code:
231 double suml1, suml2;
232 double sumr1, sumr2;
233 uint i, j;
235 for (j = 0; j < count; j += 2)
237 const float *ptr;
239 suml1 = sumr1 = 0.0;
240 suml2 = sumr2 = 0.0;
241 ptr = src;
242 filterCoeffsLocal = filterCoeffs;
243 for (i = 0; i < lengthLocal; i ++)
245 // unroll loop for efficiency.
247 suml1 += ptr[0] * filterCoeffsLocal[0] +
248 ptr[2] * filterCoeffsLocal[2] +
249 ptr[4] * filterCoeffsLocal[4] +
250 ptr[6] * filterCoeffsLocal[6];
252 sumr1 += ptr[1] * filterCoeffsLocal[1] +
253 ptr[3] * filterCoeffsLocal[3] +
254 ptr[5] * filterCoeffsLocal[5] +
255 ptr[7] * filterCoeffsLocal[7];
257 suml2 += ptr[8] * filterCoeffsLocal[0] +
258 ptr[10] * filterCoeffsLocal[2] +
259 ptr[12] * filterCoeffsLocal[4] +
260 ptr[14] * filterCoeffsLocal[6];
262 sumr2 += ptr[9] * filterCoeffsLocal[1] +
263 ptr[11] * filterCoeffsLocal[3] +
264 ptr[13] * filterCoeffsLocal[5] +
265 ptr[15] * filterCoeffsLocal[7];
267 ptr += 16;
268 filterCoeffsLocal += 8;
270 dest[0] = (float)suml1;
271 dest[1] = (float)sumr1;
272 dest[2] = (float)suml2;
273 dest[3] = (float)sumr2;
275 src += 4;
276 dest += 4;
280 _asm
282 mov eax, dword ptr dest
283 mov ebx, dword ptr src
284 mov edx, count
285 shr edx, 1
287 loop1:
288 // "outer loop" : during each round 2*2 output samples are calculated
289 prefetch [ebx] // give a prefetch hint to CPU what data are to be needed soonish
290 prefetch [filterCoeffsLocal] // give a prefetch hint to CPU what data are to be needed soonish
292 mov esi, ebx
293 mov edi, filterCoeffsLocal
294 pxor mm0, mm0
295 pxor mm1, mm1
296 mov ecx, lengthLocal
298 loop2:
299 // "inner loop" : during each round four FIR filter taps are evaluated for 2*2 output samples
300 movq mm2, [edi]
301 movq mm3, mm2
302 prefetch [edi + 32] // give a prefetch hint to CPU what data are to be needed soonish
303 pfmul mm2, [esi]
304 prefetch [esi + 32] // give a prefetch hint to CPU what data are to be needed soonish
305 pfmul mm3, [esi + 8]
307 movq mm4, [edi + 8]
308 movq mm5, mm4
309 pfadd mm0, mm2
310 pfmul mm4, [esi + 8]
311 pfadd mm1, mm3
312 pfmul mm5, [esi + 16]
314 movq mm2, [edi + 16]
315 movq mm6, mm2
316 pfadd mm0, mm4
317 pfmul mm2, [esi + 16]
318 pfadd mm1, mm5
319 pfmul mm6, [esi + 24]
321 movq mm3, [edi + 24]
322 movq mm7, mm3
323 pfadd mm0, mm2
324 pfmul mm3, [esi + 24]
325 pfadd mm1, mm6
326 pfmul mm7, [esi + 32]
327 add esi, 32
328 pfadd mm0, mm3
329 add edi, 32
330 pfadd mm1, mm7
332 dec ecx
333 jnz loop2
335 movq [eax], mm0
336 add ebx, 16
337 movq [eax + 8], mm1
338 add eax, 16
340 dec edx
341 jnz loop1
343 femms
346 return count;
350 #endif // ALLOW_3DNOW