Copyright (C) 2005-2006 Paul Davis, John Rigg

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

Author: Sampo Savolainen
64-bit conversion: John Rigg
25 #; void x86_sse_mix_buffers_with_gain (float *dst, float *src, unsigned int nframes, float gain);
27 .globl x86_sse_mix_buffers_with_gain
28 .type x86_sse_mix_buffers_with_gain,@function
30 x86_sse_mix_buffers_with_gain
:
34 #; %rdx unsigned int nframes
45 #; if nframes == 0, go to end
49 #; Check for alignment
52 andq $
12, %rax
#; mask alignment offset
55 andq $
12, %rbx
#; mask alignment offset
58 jne
.MBWG_NONALIGN #; if not aligned, calculate manually
64 #; Pre-loop, we need to run 1-3 frames "manually" without
69 #; gain is already in %xmm0
75 addq $
4, %rdi
#; dst++
76 addq $
4, %rsi
#; src++
77 decq
%rdx
#; nframes--
82 cmp $
16, %rbx
#; test if we've reached 16 byte alignment
88 cmp $
4, %rdx
#; we know it's not zero, but if it's not >=4, then
89 jnge
.MBWG_NONALIGN #; we jump straight to the "normal" code
91 #; gain is already in %xmm0
92 shufps $
0x00, %xmm0
, %xmm0
97 movaps
(%rsi
), %xmm1
#; source => xmm0
98 mulps
%xmm0
, %xmm1
#; apply gain to source
99 addps
(%rdi
), %xmm1
#; mix with destination
100 movaps
%xmm1
, (%rdi
) #; copy result to destination
102 addq $
16, %rdi
#; dst+=4
103 addq $
16, %rsi
#; src+=4
105 subq $
4, %rdx
#; nframes-=4
112 #; if there are remaining frames, the nonalign code will do nicely
113 #; for the rest 1-3 frames.
118 #; gain is already in %xmm0
131 jnz
.MBWG_NONALIGNLOOP
143 .size x86_sse_mix_buffers_with_gain, .-x86_sse_mix_buffers_with_gain
146 #; void x86_sse_mix_buffers_no_gain (float *dst, float *src, unsigned int nframes);
148 .globl x86_sse_mix_buffers_no_gain
149 .type x86_sse_mix_buffers_no_gain,@function
151 x86_sse_mix_buffers_no_gain
:
155 #; %rdx unsigned int nframes
160 #; save the registers
167 #; if nframes == 0, go to end
171 #; Check for alignment
174 andq $
12, %rax
#; mask alignment offset
177 andq $
12, %rbx
#; mask alignment offset
180 jne
.MBNG_NONALIGN #; if not aligned, calculate manually
185 #; Pre-loop, we need to run 1-3 frames "manually" without
194 addq $
4, %rdi
#; dst++
195 addq $
4, %rsi
#; src++
196 decq
%rdx
#; nframes--
200 cmp $
16, %rbx
#; test if we've reached 16 byte alignment
205 cmp $
4, %rdx
#; if there are frames left, but less than 4
206 jnge
.MBNG_NONALIGN #; we can't run SSE
210 movaps
(%rsi
), %xmm0
#; source => xmm0
211 addps
(%rdi
), %xmm0
#; mix with destination
212 movaps
%xmm0
, (%rdi
) #; copy result to destination
214 addq $
16, %rdi
#; dst+=4
215 addq $
16, %rsi
#; src+=4
217 subq $
4, %rdx
#; nframes-=4
224 #; if there are remaining frames, the nonalign code will do nicely
225 #; for the rest 1-3 frames.
230 movss
(%rsi
), %xmm0
#; src => xmm0
231 addss
(%rdi
), %xmm0
#; xmm0 += dst
232 movss
%xmm0
, (%rdi
) #; xmm0 => dst
250 .size x86_sse_mix_buffers_no_gain, .-x86_sse_mix_buffers_no_gain
253 #; void x86_sse_apply_gain_to_buffer (float *buf, unsigned int nframes, float gain);
255 .globl x86_sse_apply_gain_to_buffer
256 .type x86_sse_apply_gain_to_buffer,@function
258 x86_sse_apply_gain_to_buffer
:
260 #; %rdi float *buf 32(%rbp)
261 #; %rsi unsigned int nframes
263 #; %xmm1 float buf[0]
273 #; if nframes == 0, go to end
274 movq
%rsi
, %rcx
#; nframes
278 #; set up the gain buffer (gain is already in %xmm0)
279 shufps $
0x00, %xmm0
, %xmm0
281 #; Check for alignment
283 movq
%rdi
, %rdx
#; buf => %rdx
284 andq $
12, %rdx
#; mask bits 1 & 2, result = 0, 4, 8 or 12
285 jz
.AG_SSE #; if buffer IS aligned
288 #; we iterate 1-3 times, doing normal x87 float comparison
289 #; so we reach a 16 byte aligned "buf" (=%rdi) value
293 #; Load next value from the buffer into %xmm1
298 #; increment buffer, decrement counter
299 addq $
4, %rdi
#; buf++;
301 decq
%rcx
#; nframes--
302 jz
.AG_END #; if we run out of frames, we go to the end
304 addq $
4, %rdx
#; one non-aligned byte less
306 jne
.AGLP_START #; if more non-aligned frames exist, we do a do-over
310 #; We have reached the 16 byte aligned "buf" ("rdi") value
312 #; Figure out how many loops we should do
313 movq
%rcx
, %rax
#; copy remaining nframes to %rax for division
314 movq $
0, %rdx
#; 0 the edx register
319 divq
%rdi
#; %rdx = remainder == 0
322 #; %rax = SSE iterations
334 subq $
4, %rcx
#; nframes-=4
339 #; Next we need to post-process all remaining frames
340 #; the remaining frame count is in %rcx
342 #; if no remaining frames, jump to the end
344 andq $
3, %rcx
#; nframes % 4
353 #; increment buffer, decrement counter
354 addq $
4, %rdi
#; buf++;
356 decq
%rcx
#; nframes--
357 jnz
.AGPOST_START #; if we run out of frames, we go to the end
368 .size x86_sse_apply_gain_to_buffer, .-x86_sse_apply_gain_to_buffer
372 #; x86_sse_apply_gain_vector(float *buf, float *gain_vector, unsigned int nframes)
374 .globl x86_sse_apply_gain_vector
375 .type x86_sse_apply_gain_vector,@function
377 x86_sse_apply_gain_vector
:
380 #; %rsi float *gain_vector
381 #; %rdx unsigned int nframes
391 #; if nframes == 0 go to end
406 jz
.AGA_SSE #; if buffers are aligned, jump to the SSE loop
408 #; Buffers aren't 16 byte aligned, but they are unaligned by the same amount
411 movss
(%rdi
), %xmm0
#; buf => xmm0
412 movss
(%rsi
), %xmm1
#; gain value => xmm1
413 mulss
%xmm1
, %xmm0
#; xmm1 * xmm0 => xmm0
414 movss
%xmm0
, (%rdi
) #; signal with gain => buf
419 addq $
4, %rdi
#; buf++
420 addq $
4, %rsi
#; gab++
426 #; There are frames left for sure, as that is checked in the beginning
427 #; and within the previous loop. BUT, there might be less than 4 frames
431 movq
%rdx
, %rax
#; nframes => %rax
432 shr $
2, %rax
#; unsigned divide by 4
434 cmp $
0, %rax
#; Jos toimii ilman tätä, niin kiva
449 andq $
3, %rdx
#; Remaining frames are nframes & 3
453 #; Inside this loop, we know there are frames left to process
454 #; but because either there are < 4 frames left, or the buffers
455 #; are not aligned, we can't use the parallel SSE ops
457 movss
(%rdi
), %xmm0
#; buf => xmm0
458 movss
(%rsi
), %xmm1
#; gain value => xmm1
459 mulss
%xmm1
, %xmm0
#; xmm1 * xmm0 => xmm0
460 movss
%xmm0
, (%rdi
) #; signal with gain => buf
464 decq
%rdx
#; nframes--
476 .size x86_sse_apply_gain_vector, .-x86_sse_apply_gain_vector
480 #; float x86_sse_compute_peak(float *buf, long nframes, float current);
482 .globl x86_sse_compute_peak
483 .type x86_sse_compute_peak,@function
486 x86_sse_compute_peak
:
488 #; %rdi float *buf 32(%rbp)
489 #; %rsi unsigned int nframes
490 #; %xmm0 float current
491 #; %xmm1 float buf[0]
499 #; if nframes == 0, go to end
500 movq
%rsi
, %rcx
#; nframes
504 #; create the "abs" mask in %xmm2
508 shufps $
0x00, %xmm2
, %xmm2
510 #; Check for alignment
512 #;movq 8(%rbp), %rdi #; buf
513 movq
%rdi
, %rdx
#; buf => %rdx
514 andq $
12, %rdx
#; mask bits 1 & 2, result = 0, 4, 8 or 12
515 jz
.CP_SSE #; if buffer IS aligned
518 #; we iterate 1-3 times, doing normal x87 float comparison
519 #; so we reach a 16 byte aligned "buf" (=%rdi) value
523 #; Load next value from the buffer
528 #; increment buffer, decrement counter
529 addq $
4, %rdi
#; buf++;
531 decq
%rcx
#; nframes--
532 jz
.CP_END #; if we run out of frames, we go to the end
534 addq $
4, %rdx
#; one non-aligned byte less
536 jne
.LP_START #; if more non-aligned frames exist, we do a do-over
540 #; We have reached the 16 byte aligned "buf" ("rdi") value
542 #; Figure out how many loops we should do
543 movq
%rcx
, %rax
#; copy remaining nframes to %rax for division
545 shr $
2,%rax
#; unsigned divide by 4
548 #; %rax = SSE iterations
550 #; current maximum is at %xmm0, but we need to ..
551 shufps $
0x00, %xmm0
, %xmm0
#; shuffle "current" to all 4 FP's
553 #;prefetcht0 16(%rdi)
566 #; Calculate the maximum value contained in the 4 FP's in %xmm0
568 shufps $
0x4e, %xmm1
, %xmm1
#; shuffle left & right pairs (1234 => 3412)
569 maxps
%xmm1
, %xmm0
#; maximums of the two pairs
571 shufps $
0xb1, %xmm1
, %xmm1
#; shuffle the floats inside the two pairs (1234 => 2143)
574 #; now every float in %xmm0 is the same value, current maximum value
576 #; Next we need to post-process all remaining frames
577 #; the remaining frame count is in %rcx
579 #; if no remaining frames, jump to the end
581 andq $
3, %rcx
#; nframes % 4
590 addq $
4, %rdi
#; buf++;
592 decq
%rcx
#; nframes--;
603 .size x86_sse_compute_peak, .-x86_sse_compute_peak
607 .section .note.GNU-stack,"",%progbits