remove a bunch of explicit uses of '/' as a directory separator; use Glib::build_file...
[ardour2.git] / libs / soundtouch / mmx_gcc.cpp
blob9e92765e4ef228a6d9707e6ec1d184d54b60c1f6
1 ////////////////////////////////////////////////////////////////////////////////
2 ///
3 /// gcc version of the MMX optimized routines. All MMX optimized functions
4 /// have been gathered into this single source code file, regardless to their
5 /// class or original source code file, in order to ease porting the library
6 /// to other compiler and processor platforms.
7 ///
8 /// This file is to be compiled on any platform with the GNU C compiler.
9 /// Compiler. Please see 'mmx_win.cpp' for the x86 Windows version of this
10 /// file.
11 ///
12 /// Author : Copyright (c) Olli Parviainen
13 /// Author e-mail : oparviai @ iki.fi
14 /// SoundTouch WWW: http://www.iki.fi/oparviai/soundtouch
15 ///
16 ////////////////////////////////////////////////////////////////////////////////
18 // Last changed : $Date$
19 // File revision : $Revision$
21 // $Id$
23 ////////////////////////////////////////////////////////////////////////////////
25 // License :
27 // SoundTouch audio processing library
28 // Copyright (c) Olli Parviainen
30 // This library is free software; you can redistribute it and/or
31 // modify it under the terms of the GNU Lesser General Public
32 // License as published by the Free Software Foundation; either
33 // version 2.1 of the License, or (at your option) any later version.
35 // This library is distributed in the hope that it will be useful,
36 // but WITHOUT ANY WARRANTY; without even the implied warranty of
37 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
38 // Lesser General Public License for more details.
40 // You should have received a copy of the GNU Lesser General Public
41 // License along with this library; if not, write to the Free Software
42 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
44 ////////////////////////////////////////////////////////////////////////////////
46 #include <stdexcept>
47 #include <string>
48 #include "cpu_detect.h"
50 #ifndef __GNUC__
51 #error "wrong platform - this source code file is for the GNU C compiler."
52 #endif
54 using namespace std;
55 using namespace soundtouch;
58 #ifdef ALLOW_MMX
59 // MMX routines available only with integer sample type
61 //////////////////////////////////////////////////////////////////////////////
63 // implementation of MMX optimized functions of class 'TDStretch'
65 // NOTE: ebx in gcc 3.x is not preserved if -fPIC and -DPIC
66 // gcc-3.4 correctly flags this error and wont let you continue.
67 // gcc-2.95 preserves esi correctly
69 //////////////////////////////////////////////////////////////////////////////
71 #include "TDStretch.h"
72 #include <limits.h>
74 // these are declared in 'TDStretch.cpp'
75 extern int scanOffsets[4][24];
77 // Calculates cross correlation of two buffers
78 long TDStretchMMX::calcCrossCorrStereo(const short *pV1, const short *pV2) const
80 #ifdef __i386__
81 int corr;
82 uint local_overlapLength = overlapLength;
83 uint local_overlapDividerBits = overlapDividerBits;
85 asm volatile(
86 // Calculate cross-correlation between the tempOffset and tmpbid_buffer.
88 // Process 4 parallel batches of 2 * stereo samples each during one
89 // round to improve CPU-level parallellization.
91 // load address of sloped pV2 buffer to eax
92 // load address of mixing point of the sample data buffer to edi
93 // load counter to ecx = overlapLength / 8 - 1
94 // empty the mm0
96 // prepare to the first round by loading
97 // load mm1 = eax[0]
98 // load mm2 = eax[1];
100 "\n\tmovl %1, %%eax"
101 "\n\tmovl %2, %%edi"
103 "\n\tmovq (%%eax), %%mm1"
104 "\n\tmovl %3, %%ecx"
106 "\n\tmovq 8(%%eax), %%mm2"
107 "\n\tshr $3, %%ecx"
109 "\n\tpxor %%mm0, %%mm0"
110 "\n\tsub $1, %%ecx"
112 "\n\tmovd %4, %%mm5"
114 "\n1:"
115 // multiply-add mm1 = mm1 * edi[0]
116 // multiply-add mm2 = mm2 * edi[1]
118 // add mm2 += mm1
119 // mm2 >>= mm5 (=overlapDividerBits)
120 // add mm0 += mm2
122 // load mm3 = eax[2]
123 // multiply-add mm3 = mm3 * edi[2]
125 // load mm4 = eax[3]
126 // multiply-add mm4 = mm4 * edi[3]
128 // add mm3 += mm4
129 // mm3 >>= mm5 (=overlapDividerBits)
130 // add mm0 += mm3
132 // add eax += 4
133 // add edi += 4
134 // load mm1 = eax[0] (~eax[4])
135 // load mm2 = eax[1] (~eax[5])
137 // loop
139 "\n\tpmaddwd (%%edi), %%mm1" // qword ptr [edi]
140 "\n\tmovq 16(%%eax), %%mm3" // qword ptr [eax+16]
142 "\n\tpmaddwd 8(%%edi), %%mm2" // qword ptr [edi+8]
143 "\n\tmovq 24(%%eax), %%mm4" // qword ptr [eax+24]
145 "\n\tpmaddwd 16(%%edi), %%mm3" // qword ptr [edi+16]
146 "\n\tpaddd %%mm1, %%mm2"
148 "\n\tpmaddwd 24(%%edi), %%mm4" // qword ptr [edi+24]
149 "\n\tmovq 32(%%eax), %%mm1" // qword ptr [eax+32]
151 "\n\tpsrad %%mm5, %%mm2"
152 "\n\tadd $32, %%eax"
154 "\n\tpaddd %%mm4, %%mm3"
155 "\n\tpaddd %%mm2, %%mm0"
157 "\n\tmovq 8(%%eax), %%mm2" // qword ptr [eax+8]
158 "\n\tpsrad %%mm5, %%mm3"
160 "\n\tadd $32, %%edi"
161 "\n\tpaddd %%mm3, %%mm0"
163 "\n\tdec %%ecx"
164 "\n\tjnz 1b"
166 // Finalize the last partial loop:
168 "\n\tmovq 16(%%eax), %%mm3" // qword ptr [eax+16]
169 "\n\tpmaddwd (%%edi), %%mm1" // qword ptr [edi]
171 "\n\tmovq 24(%%eax), %%mm4" // qword ptr [eax+24]
172 "\n\tpmaddwd 8(%%edi), %%mm2" // qword ptr [edi+8]
174 "\n\tpmaddwd 16(%%edi), %%mm3" // qword ptr [edi+16]
175 "\n\tpaddd %%mm1, %%mm2"
177 "\n\tpmaddwd 24(%%edi), %%mm4" // qword ptr [edi+24]
178 "\n\tpsrad %%mm5, %%mm2"
180 "\n\tpaddd %%mm4, %%mm3"
181 "\n\tpaddd %%mm2, %%mm0"
183 "\n\tpsrad %%mm5, %%mm3"
184 "\n\tpaddd %%mm3, %%mm0"
186 // copy hi-dword of mm0 to lo-dword of mm1, then sum mmo+mm1
187 // and finally store the result into the variable "corr"
189 "\n\tmovq %%mm0, %%mm1"
190 "\n\tpsrlq $32, %%mm1"
191 "\n\tpaddd %%mm1, %%mm0"
192 "\n\tmovd %%mm0, %0"
193 : "=rm" (corr)
194 : "rim" (pV1), "rim" (pV2), "rim" (local_overlapLength),
195 "rim" (local_overlapDividerBits)
196 : "%ecx", "%eax", "%edi"
198 return corr;
200 // Note: Warning about the missing EMMS instruction is harmless
201 // as it'll be called elsewhere.
202 #else
203 throw runtime_error("MMX not supported");
204 #endif
207 void TDStretchMMX::clearCrossCorrState()
209 #ifdef __i386__
210 asm volatile("EMMS");
211 #endif
214 // MMX-optimized version of the function overlapStereo
215 void TDStretchMMX::overlapStereo(short *output, const short *input) const
217 #ifdef __i386__
218 short *local_midBuffer = pMidBuffer;
219 uint local_overlapLength = overlapLength;
220 uint local_overlapDividerBits = overlapDividerBits;
222 asm volatile(
223 "\n\t"
224 // load sliding mixing value counter to mm6 and mm7
225 // load counter value to ecx = overlapLength / 4
226 // load divider-shifter value to esi
227 // load mixing value adder to mm5
228 // load address of midBuffer to eax
229 // load address of inputBuffer added with ovlOffset to edi
230 // load address of end of the outputBuffer to edx
232 // We need to preserve esi, since gcc uses it for the
233 // stack frame.
235 "movl %0, %%eax\n\t" // ecx = 0x0000 OVL_
236 "movl $0x0002fffe, %%edi\n\t" // ecx = 0x0002 fffe
238 "movl %1, %%esi\n\t"
239 "movd %%eax, %%mm6\n\t" // mm6 = 0x0000 0000 0000 OVL_
241 "movl %%eax, %%ecx\n\t"
242 "sub $1, %%eax\n\t"
244 "punpckldq %%mm6, %%mm6\n\t" // mm6 = 0x0000 OVL_ 0000 OVL_
246 "or $0x00010000, %%eax\n\t" // eax = 0x0001 overlapLength-1
248 "movd %%edi, %%mm5\n\t" // mm5 = 0x0000 0000 0002 fffe
249 "movd %%eax, %%mm7\n\t" // mm7 = 0x0000 0000 0001 01ff
251 "movl %3, %%edi\n\t"
253 "movl %4, %%eax\n\t" // dword ptr local_midBuffer
254 "punpckldq %%mm5, %%mm5\n\t" // mm5 = 0x0002 fffe 0002 fffe
256 "shr $2, %%ecx\n\t" // ecx = overlapLength / 2
257 "punpckldq %%mm7, %%mm7\n\t" // mm7 = 0x0001 01ff 0001 01ff
259 "movl %2, %%edx\n"
261 "2:\n\t"
262 // Process two parallel batches of 2+2 stereo samples during each round
263 // to improve CPU-level parallellization.
265 // Load [eax] into mm0 and mm1
266 // Load [edi] into mm3
267 // unpack words of mm0, mm1 and mm3 into mm0 and mm1
268 // multiply-add mm0*mm6 and mm1*mm7, store results into mm0 and mm1
269 // divide mm0 and mm1 by 512 (=right-shift by overlapDividerBits)
270 // pack the result into mm0 and store into [edx]
272 // Load [eax+8] into mm2 and mm3
273 // Load [edi+8] into mm4
274 // unpack words of mm2, mm3 and mm4 into mm2 and mm3
275 // multiply-add mm2*mm6 and mm3*mm7, store results into mm2 and mm3
276 // divide mm2 and mm3 by 512 (=right-shift by overlapDividerBits)
277 // pack the result into mm2 and store into [edx+8]
280 "movq (%%eax), %%mm0\n\t" // mm0 = m1l m1r m0l m0r
281 "add $16, %%edx\n\t"
283 "movq (%%edi), %%mm3\n\t" // mm3 = i1l i1r i0l i0r
284 "movq %%mm0, %%mm1\n\t" // mm1 = m1l m1r m0l m0r
286 "movq 8(%%eax), %%mm2\n\t" // mm2 = m3l m3r m2l m2r
287 "punpcklwd %%mm3, %%mm0\n\t" // mm0 = i0l m0l i0r m0r
289 "movq 8(%%edi), %%mm4\n\t" // mm4 = i3l i3r i2l i2r
290 "punpckhwd %%mm3, %%mm1\n\t" // mm1 = i1l m1l i1r m1r
292 "movq %%mm2, %%mm3\n\t" // mm3 = m3l m3r m2l m2r
293 "punpcklwd %%mm4, %%mm2\n\t" // mm2 = i2l m2l i2r m2r
295 "pmaddwd %%mm6, %%mm0\n\t" // mm0 = i0l*m63+m0l*m62 i0r*m61+m0r*m60
296 "punpckhwd %%mm4, %%mm3\n\t" // mm3 = i3l m3l i3r m3r
298 "movd %%esi, %%mm4\n\t" // mm4 = overlapDividerBits
300 "pmaddwd %%mm7, %%mm1\n\t" // mm1 = i1l*m73+m1l*m72 i1r*m71+m1r*m70
301 "paddw %%mm5, %%mm6\n\t"
303 "paddw %%mm5, %%mm7\n\t"
304 "psrad %%mm4, %%mm0\n\t" // mmo >>= overlapDividerBits
306 "pmaddwd %%mm6, %%mm2\n\t" // mm2 = i2l*m63+m2l*m62 i2r*m61+m2r*m60
307 "psrad %%mm4, %%mm1\n\t" // mm1 >>= overlapDividerBits
309 "pmaddwd %%mm7, %%mm3\n\t" // mm3 = i3l*m73+m3l*m72 i3r*m71+m3r*m70
310 "psrad %%mm4, %%mm2\n\t" // mm2 >>= overlapDividerBits
312 "packssdw %%mm1, %%mm0\n\t" // mm0 = mm1h mm1l mm0h mm0l
313 "psrad %%mm4, %%mm3\n\t" // mm3 >>= overlapDividerBits
315 "add $16, %%eax\n\t"
316 "paddw %%mm5, %%mm6\n\t"
318 "packssdw %%mm3, %%mm2\n\t" // mm2 = mm2h mm2l mm3h mm3l
319 "paddw %%mm5, %%mm7\n\t"
321 "movq %%mm0, -16(%%edx)\n\t"
322 "add $16, %%edi\n\t"
324 "movq %%mm2, -8(%%edx)\n\t"
325 "dec %%ecx\n\t"
327 "jnz 2b\n\t"
329 "emms\n\t"
332 : "rim" (local_overlapLength),
333 "rim" (local_overlapDividerBits),
334 "rim" (output),
335 "rim" (input),
336 "rim" (local_midBuffer)
337 /* input */
338 : "%edi", "%ecx", "%edx", "%eax", "%esi" /* regs */
340 #else
341 throw runtime_error("MMX not supported");
342 #endif
346 //////////////////////////////////////////////////////////////////////////////
348 // implementation of MMX optimized functions of class 'FIRFilter'
350 //////////////////////////////////////////////////////////////////////////////
352 #include "FIRFilter.h"
354 FIRFilterMMX::FIRFilterMMX() : FIRFilter()
356 filterCoeffsUnalign = NULL;
360 FIRFilterMMX::~FIRFilterMMX()
362 delete[] filterCoeffsUnalign;
366 #if 1
367 // (overloaded) Calculates filter coefficients for MMX routine
368 void FIRFilterMMX::setCoefficients(const short *coeffs, uint newLength, uint uResultDivFactor)
370 #ifdef __i386__
371 uint i;
372 FIRFilter::setCoefficients(coeffs, newLength, uResultDivFactor);
374 // Ensure that filter coeffs array is aligned to 16-byte boundary
375 delete[] filterCoeffsUnalign;
376 filterCoeffsUnalign = new short[2 * newLength + 8];
377 filterCoeffsAlign = (short *)(((uint)filterCoeffsUnalign + 15) & -16);
379 // rearrange the filter coefficients for mmx routines
380 for (i = 0;i < length; i += 4)
382 filterCoeffsAlign[2 * i + 0] = coeffs[i + 0];
383 filterCoeffsAlign[2 * i + 1] = coeffs[i + 2];
384 filterCoeffsAlign[2 * i + 2] = coeffs[i + 0];
385 filterCoeffsAlign[2 * i + 3] = coeffs[i + 2];
387 filterCoeffsAlign[2 * i + 4] = coeffs[i + 1];
388 filterCoeffsAlign[2 * i + 5] = coeffs[i + 3];
389 filterCoeffsAlign[2 * i + 6] = coeffs[i + 1];
390 filterCoeffsAlign[2 * i + 7] = coeffs[i + 3];
392 #else
393 throw runtime_error("MMX not supported");
394 #endif
399 // mmx-optimized version of the filter routine for stereo sound
400 uint FIRFilterMMX::evaluateFilterStereo(short *dest, const short *src, const uint numSamples) const
402 #ifdef __i386__
403 // Create stack copies of the needed member variables for asm routines :
404 uint local_length = length;
405 uint local_lengthDiv8 = lengthDiv8;
406 uint local_resultDivider = resultDivFactor;
407 short *local_filterCoeffs = (short*)filterCoeffsAlign;
408 short *local_src = (short *)src;
410 asm volatile(
411 "\n\t"
412 // Load (num_samples-aa_filter_length)/2 to edi as a i
413 // Load a pointer to samples to esi
414 // Load a pointer to destination to edx
416 "movl %0, %%edi\n\t"
417 "subl %2, %%edi\n\t"
418 "movl %3, %%edx\n\t"
419 "sar $1, %%edi\n"
421 // Load filter length/8 to ecx
422 // Load pointer to samples from esi to ebx
423 // Load counter from edi to ecx
424 // Load [ebx] to mm3
425 // Load pointer to filter coefficients to eax
426 "3:\n\t"
427 "movl %1, %%esi\n\t"
428 "pxor %%mm0, %%mm0\n\t"
430 "movl %4, %%ecx\n\t"
431 "pxor %%mm7, %%mm7\n\t"
433 "movq (%%esi), %%mm1\n\t" // mm1 = l1 r1 l0 r0
434 "movl %5, %%eax\n"
435 "4:\n\t"
437 "movq 8(%%esi), %%mm2\n\t" // mm2 = l3 r3 l2 r2
438 "movq %%mm1, %%mm4\n\t" // mm4 = l1 r1 l0 r0
440 "movq 16(%%esi), %%mm3\n\t" // mm3 = l5 r5 l4 r4
441 "punpckhwd %%mm2, %%mm1\n\t" // mm1 = l3 l1 r3 r1
443 "movq %%mm2, %%mm6\n\t" // mm6 = l3 r3 l2 r2
444 "punpcklwd %%mm2, %%mm4\n\t" // mm4 = l2 l0 r2 r0
446 "movq (%%eax), %%mm2\n\t" // mm2 = f2 f0 f2 f0
447 "movq %%mm1, %%mm5\n\t" // mm5 = l3 l1 r3 r1
449 "punpcklwd %%mm3, %%mm6\n\t" // mm6 = l4 l2 r4 r2
450 "pmaddwd %%mm2, %%mm4\n\t" // mm4 = l2*f2+l0*f0 r2*f2+r0*f0
452 "pmaddwd %%mm2, %%mm5\n\t" // mm5 = l3*f2+l1*f0 r3*f2+l1*f0
453 "movq 8(%%eax), %%mm2\n\t" // mm2 = f3 f1 f3 f1
455 "paddd %%mm4, %%mm0\n\t" // mm0 += s02*f02
456 "movq %%mm3, %%mm4\n\t" // mm4 = l1 r1 l0 r0
458 "pmaddwd %%mm2, %%mm1\n\t" // mm1 = l3*f3+l1*f1 r3*f3+l1*f1
459 "paddd %%mm5, %%mm7\n\t" // mm7 += s13*f02
461 "pmaddwd %%mm2, %%mm6\n\t" // mm6 = l4*f3+l2*f1 r4*f3+f4*f1
462 "movq 24(%%esi), %%mm2\n\t" // mm2 = l3 r3 l2 r2
464 "paddd %%mm1, %%mm0\n\t" // mm0 += s31*f31
465 "movq 32(%%esi), %%mm1\n\t" // mm1 = l5 r5 l4 r4
467 "paddd %%mm6, %%mm7\n\t" // mm7 += s42*f31
468 "punpckhwd %%mm2, %%mm3\n\t" // mm3 = l3 l1 r3 r1
470 "movq %%mm2, %%mm6\n\t" // mm6 = l3 r3 l2 r2
471 "punpcklwd %%mm2, %%mm4\n\t" // mm4 = l2 l0 r2 r0
473 "movq 16(%%eax), %%mm2\n\t" // mm2 = f2 f0 f2 f0
474 "movq %%mm3, %%mm5\n\t" // mm5 = l3 l1 r3 r1
476 "punpcklwd %%mm1, %%mm6\n\t" // mm6 = l4 l2 r4 r2
477 "add $32, %%eax\n\t"
479 "pmaddwd %%mm2, %%mm4\n\t" // mm4 = l2*f2+l0*f0 r2*f2+r0*f0
480 "add $32, %%esi\n\t"
482 "pmaddwd %%mm2, %%mm5\n\t" // mm5 = l3*f2+l1*f0 r3*f2+l1*f0
483 "movq -8(%%eax), %%mm2\n\t" // mm2 = f3 f1 f3 f1
485 "paddd %%mm4, %%mm0\n\t" // mm0 += s02*f02
486 "pmaddwd %%mm2, %%mm3\n\t" // mm3 = l3*f3+l1*f1 r3*f3+l1*f1
488 "paddd %%mm5, %%mm7\n\t" // mm7 += s13*f02
489 "pmaddwd %%mm2, %%mm6\n\t" // mm6 = l4*f3+l2*f1 r4*f3+f4*f1
491 "paddd %%mm3, %%mm0\n\t" // mm0 += s31*f31
492 "paddd %%mm6, %%mm7\n\t" // mm7 += s42*f31
494 "dec %%ecx\n\t"
495 "jnz 4b\n\t"
497 // Divide mm0 and mm7 by 8192 (= right-shift by 13),
498 // pack and store to [edx]
499 "movd %6, %%mm4\n\t"
501 "psrad %%mm4, %%mm0\n\t" // divide the result
503 "add $8, %%edx\n\t"
504 "psrad %%mm4, %%mm7\n\t" // divide the result
506 "add $8, %1\n\t"
507 "packssdw %%mm7, %%mm0\n\t"
509 "movq %%mm0, -8(%%edx)\n\t"
510 "dec %%edi\n\t"
512 "jnz 3b\n\t"
514 "emms\n\t"
517 : "rim" (numSamples),
518 "rim" (local_src),
519 "rim" (local_length),
520 "rim" (dest),
521 "rim" (local_lengthDiv8),
522 "rim" (local_filterCoeffs),
523 "rim" (local_resultDivider) /* input */
524 : "%eax", "%ecx", "%edx", "%edi", "%esi" /* regs */
526 return (numSamples & 0xfffffffe) - local_length;
527 #else
528 throw runtime_error("MMX not supported");
529 return 0;
530 #endif
532 #endif
534 #endif // ALLOW_MMX