#; Copyright (C) 2005 Paul Davis
#;
#; This program is free software; you can redistribute it and/or modify
#; it under the terms of the GNU General Public License as published by
#; the Free Software Foundation; either version 2 of the License, or
#; (at your option) any later version.
#;
#; This program is distributed in the hope that it will be useful,
#; but WITHOUT ANY WARRANTY; without even the implied warranty of
#; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#; GNU General Public License for more details.
#;
#; You should have received a copy of the GNU General Public License
#; along with this program; if not, write to the Free Software
#; Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#;
#; Author: Sampo Savolainen
24 #; void x86_sse_mix_buffers_with_gain (float *dst, float *src, long nframes, float gain);
26 .globl x86_sse_mix_buffers_with_gain
27 .type x86_sse_mix_buffers_with_gain,@function
29 x86_sse_mix_buffers_with_gain
:
30 #; 8(%ebp) = float *dst = %edi
31 #; 12(%ebp) = float *src = %esi
32 #; 16(%ebp) = long nframes = %ecx
33 #; 20(%ebp) = float gain = st(0)
45 #; if nframes == 0, go to end
46 movl
16(%ebp
), %ecx
#; nframes
50 #; Check for alignment
52 movl
8(%ebp
), %edi
#; dst
53 movl
12(%ebp
), %esi
#; src
56 andl $
12, %eax
#; mask alignemnt offset
59 andl $
12, %ebx
#; mask alignment offset
62 jne
.MBWG_NONALIGN #; if not aligned, calculate manually
68 #; Pre-loop, we need to run 1-3 frames "manually" without
71 movss
20(%ebp
), %xmm1
#; xmm1
80 addl $
4, %edi
#; dst++
81 addl $
4, %esi
#; src++
82 decl
%ecx
#; nframes--
86 #; je .MBWG_END #; if we run out of frames, go to end
90 cmp $
16, %ebx
#; test if we've reached 16 byte alignment
96 cmp $
4, %ecx
#; we know it's not zero, but if it's not >=4, then
97 jnge
.MBWG_NONALIGN #; we jump straight to the "normal" code
99 #; copy gain to fill %xmm1
100 movss
20(%ebp
), %xmm1
101 shufps $
0x00, %xmm1
, %xmm1
106 movaps
(%esi
), %xmm0
#; source => xmm0
107 mulps
%xmm1
, %xmm0
#; apply gain to source
108 addps
(%edi
), %xmm0
#; mix with destination
109 movaps
%xmm0
, (%edi
) #; copy result to destination
111 addl $
16, %edi
#; dst+=4
112 addl $
16, %esi
#; src+=4
114 subl $
4, %ecx
#; nframes-=4
121 #; if there are remaining frames, the nonalign code will do nicely
122 #; for the rest 1-3 frames.
127 movss
20(%ebp
), %xmm1
#; gain => xmm1
140 jnz
.MBWG_NONALIGNLOOP
154 .size x86_sse_mix_buffers_with_gain, .-x86_sse_mix_buffers_with_gain
159 #; void x86_sse_mix_buffers_no_gain (float *dst, float *src, long nframes);
161 .globl x86_sse_mix_buffers_no_gain
162 .type x86_sse_mix_buffers_no_gain,@function
164 x86_sse_mix_buffers_no_gain
:
165 #; 8(%ebp) = float *dst = %edi
166 #; 12(%ebp) = float *src = %esi
167 #; 16(%ebp) = long nframes = %ecx
172 #; save the registers
181 #; if nframes == 0, go to end
182 movl
16(%ebp
), %ecx
#; nframes
186 #; Check for alignment
188 movl
8(%ebp
), %edi
#; dst
189 movl
12(%ebp
), %esi
#; src
192 andl $
12, %eax
#; mask alignemnt offset
195 andl $
12, %ebx
#; mask alignment offset
198 jne
.MBNG_NONALIGN #; if not aligned, calculate manually
203 #; Pre-loop, we need to run 1-3 frames "manually" without
212 addl $
4, %edi
#; dst++
213 addl $
4, %esi
#; src++
214 decl
%ecx
#; nframes--
218 cmp $
16, %ebx
#; test if we've reached 16 byte alignment
223 cmp $
4, %ecx
#; if there are frames left, but less than 4
224 jnge
.MBNG_NONALIGN #; we can't run SSE
228 movaps
(%esi
), %xmm0
#; source => xmm0
229 addps
(%edi
), %xmm0
#; mix with destination
230 movaps
%xmm0
, (%edi
) #; copy result to destination
232 addl $
16, %edi
#; dst+=4
233 addl $
16, %esi
#; src+=4
235 subl $
4, %ecx
#; nframes-=4
242 #; if there are remaining frames, the nonalign code will do nicely
243 #; for the rest 1-3 frames.
248 movss
(%esi
), %xmm0
#; src => xmm0
249 addss
(%edi
), %xmm0
#; xmm0 += dst
250 movss
%xmm0
, (%edi
) #; xmm0 => dst
270 .size x86_sse_mix_buffers_no_gain, .-x86_sse_mix_buffers_no_gain
275 #; void x86_sse_apply_gain_to_buffer (float *buf, long nframes, float gain);
277 .globl x86_sse_apply_gain_to_buffer
278 .type x86_sse_apply_gain_to_buffer,@function
280 x86_sse_apply_gain_to_buffer
:
281 #; 8(%ebp) = float *buf = %edi
282 #; 12(%ebp) = long nframes = %ecx
283 #; 16(%ebp) = float gain = st(0)
293 #; if nframes == 0, go to end
294 movl
12(%ebp
), %ecx
#; nframes
298 #; create the gain buffer in %xmm1
299 movss
16(%ebp
), %xmm1
300 shufps $
0x00, %xmm1
, %xmm1
302 #; Check for alignment
304 movl
8(%ebp
), %edi
#; buf
305 movl
%edi
, %edx
#; buf => %edx
306 andl $
12, %edx
#; mask bits 1 & 2, result = 0, 4, 8 or 12
307 jz
.AG_SSE #; if buffer IS aligned
310 #; we iterate 1-3 times, doing normal x87 float comparison
311 #; so we reach a 16 byte aligned "buf" (=%edi) value
315 #; Load next value from the buffer
320 #; increment buffer, decrement counter
321 addl $
4, %edi
#; buf++;
323 decl
%ecx
#; nframes--
324 jz
.AG_END #; if we run out of frames, we go to the end
326 addl $
4, %edx
#; one non-aligned byte less
328 jne
.AGLP_START #; if more non-aligned frames exist, we do a do-over
332 #; We have reached the 16 byte aligned "buf" ("edi") value
334 #; Figure out how many loops we should do
335 movl
%ecx
, %eax
#; copy remaining nframes to %eax for division
336 movl $
0, %edx
#; 0 the edx register
341 divl
%edi
#; %edx = remainder == 0
344 #; %eax = SSE iterations
356 #; subl $4, %ecx #; nframes-=4
361 #; Next we need to post-process all remaining frames
362 #; the remaining frame count is in %ecx
364 #; if no remaining frames, jump to the end
366 andl $
3, %ecx
#; nframes % 4
375 #; increment buffer, decrement counter
376 addl $
4, %edi
#; buf++;
378 decl
%ecx
#; nframes--
379 jnz
.AGPOST_START #; if we run out of frames, we go to the end
390 .size x86_sse_apply_gain_to_buffer, .-x86_sse_apply_gain_to_buffer
395 #; float x86_sse_compute_peak(float *buf, long nframes, float current);
397 .globl x86_sse_compute_peak
398 .type x86_sse_compute_peak,@function
400 x86_sse_compute_peak
:
401 #; 8(%ebp) = float *buf = %edi
402 #; 12(%ebp) = long nframes = %ecx
403 #; 16(%ebp) = float current = st(0)
413 #; Load "current" in xmm0
414 movss
16(%ebp
), %xmm0
416 #; if nframes == 0, go to end
417 movl
12(%ebp
), %ecx
#; nframes
421 #; create the "abs" mask in %xmm2
425 shufps $
0x00, %xmm2
, %xmm2
427 #; Check for alignment
429 movl
8(%ebp
), %edi
#; buf
430 movl
%edi
, %edx
#; buf => %edx
431 andl $
12, %edx
#; mask bits 1 & 2, result = 0, 4, 8 or 12
432 jz
.CP_SSE #; if buffer IS aligned
435 #; we iterate 1-3 times, doing normal x87 float comparison
436 #; so we reach a 16 byte aligned "buf" (=%edi) value
440 #; Load next value from the buffer
445 #; increment buffer, decrement counter
446 addl $
4, %edi
#; buf++;
448 decl
%ecx
#; nframes--
449 jz
.CP_END #; if we run out of frames, we go to the end
451 addl $
4, %edx
#; one non-aligned byte less
453 jne
.LP_START #; if more non-aligned frames exist, we do a do-over
457 #; We have reached the 16 byte aligned "buf" ("edi") value
459 #; Figure out how many loops we should do
460 movl
%ecx
, %eax
#; copy remaining nframes to %eax for division
462 shr $
2,%eax
#; unsigned divide by 4
465 #; %eax = SSE iterations
467 #; current maximum is at %xmm0, but we need to ..
468 shufps $
0x00, %xmm0
, %xmm0
#; shuffle "current" to all 4 FP's
470 #;prefetcht0 16(%edi)
483 #; Calculate the maximum value contained in the 4 FP's in %xmm0
485 shufps $
0x4e, %xmm1
, %xmm1
#; shuffle left & right pairs (1234 => 3412)
486 maxps
%xmm1
, %xmm0
#; maximums of the two pairs
488 shufps $
0xb1, %xmm1
, %xmm1
#; shuffle the floats inside the two pairs (1234 => 2143)
491 #; now every float in %xmm0 is the same value, current maximum value
493 #; Next we need to post-process all remaining frames
494 #; the remaining frame count is in %ecx
496 #; if no remaining frames, jump to the end
498 andl $
3, %ecx
#; nframes % 4
507 addl $
4, %edi
#; buf++;
509 decl
%ecx
#; nframes--;
514 #; Load the value from xmm0 to the float stack for returning
515 movss
%xmm0
, 16(%ebp
)
524 .size x86_sse_compute_peak, .-x86_sse_compute_peak
528 .section .note.GNU-stack,"",%progbits