/*
    Copyright (C) 2005-2006 Paul Davis, John Rigg

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

    Author: Sampo Savolainen
    64-bit conversion: John Rigg

    $Id$
*/

#; void x86_sse_mix_buffers_with_gain (float *dst, float *src, unsigned int nframes, float gain);
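#; In C terms:
#;   for (i = 0; i < nframes; i++)
#;       dst[i] += src[i] * gain;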

.globl x86_sse_mix_buffers_with_gain
.type x86_sse_mix_buffers_with_gain,@function

x86_sse_mix_buffers_with_gain:

#; %rdi float *dst
#; %rsi float *src
#; %rdx unsigned int nframes
#; %xmm0 float gain

	pushq %rbp
	movq %rsp, %rbp

	#; save the registers
	pushq %rbx
	pushq %rdi
	pushq %rsi

	#; if nframes == 0, go to end
	cmp $0, %rdx
	je .MBWG_END

	#; Check for alignment

	movq %rdi, %rax
	andq $12, %rax #; mask alignment offset

	movq %rsi, %rbx
	andq $12, %rbx #; mask alignment offset

	cmp %rax, %rbx
	jne .MBWG_NONALIGN #; if not aligned, calculate manually
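
	#; Each float is 4 bytes, so (address & 12) is the pointer's
	#; offset within a 16 byte SSE block: 0, 4, 8 or 12. If both
	#; offsets are equal, the same number of scalar pre-loop frames
	#; brings both pointers to 16 byte alignment; if they differ,
	#; the buffers can never be aligned at the same time and the
	#; whole run is done with scalar instructions.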

	#; if we are aligned
	cmp $0, %rbx
	jz .MBWG_SSE

	#; Pre-loop, we need to run 1-3 frames "manually" without
	#; packed SSE instructions

.MBWG_PRELOOP:

	#; gain is already in %xmm0
	movss (%rsi), %xmm1
	mulss %xmm0, %xmm1
	addss (%rdi), %xmm1
	movss %xmm1, (%rdi)

	addq $4, %rdi #; dst++
	addq $4, %rsi #; src++
	decq %rdx     #; nframes--
	jz .MBWG_END

	addq $4, %rbx

	cmp $16, %rbx #; test if we've reached 16 byte alignment
	jne .MBWG_PRELOOP

.MBWG_SSE:

	cmp $4, %rdx #; we know it's not zero, but if it's not >=4, then
	jnge .MBWG_NONALIGN #; we jump straight to the "normal" code

	#; gain is already in %xmm0
	shufps $0x00, %xmm0, %xmm0
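
	#; imm 0x00 selects element 0 for all four positions, i.e. the
	#; scalar gain is broadcast to every slot of %xmm0, so a single
	#; mulps below scales four samples at once.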

.MBWG_SSELOOP:

	movaps (%rsi), %xmm1 #; source => xmm1
	mulps  %xmm0, %xmm1  #; apply gain to source
	addps  (%rdi), %xmm1 #; mix with destination
	movaps %xmm1, (%rdi) #; copy result to destination

	addq $16, %rdi #; dst+=4
	addq $16, %rsi #; src+=4

	subq $4, %rdx #; nframes-=4
	cmp $4, %rdx
	jge .MBWG_SSELOOP

	cmp $0, %rdx
	je .MBWG_END

	#; if there are remaining frames, the nonalign code will do nicely
	#; for the remaining 1-3 frames.

.MBWG_NONALIGN:
	#; not aligned!

	#; gain is already in %xmm0

.MBWG_NONALIGNLOOP:

	movss (%rsi), %xmm1
	mulss %xmm0, %xmm1
	addss (%rdi), %xmm1
	movss %xmm1, (%rdi)

	addq $4, %rdi
	addq $4, %rsi

	decq %rdx
	jnz .MBWG_NONALIGNLOOP

.MBWG_END:

	popq %rsi
	popq %rdi
	popq %rbx

	#; return
	leave
	ret

.size x86_sse_mix_buffers_with_gain, .-x86_sse_mix_buffers_with_gain

#; void x86_sse_mix_buffers_no_gain (float *dst, float *src, unsigned int nframes);
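#; In C terms:
#;   for (i = 0; i < nframes; i++)
#;       dst[i] += src[i];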

.globl x86_sse_mix_buffers_no_gain
.type x86_sse_mix_buffers_no_gain,@function

x86_sse_mix_buffers_no_gain:

#; %rdi float *dst
#; %rsi float *src
#; %rdx unsigned int nframes

	pushq %rbp
	movq %rsp, %rbp

	#; save the registers
	pushq %rbx
	pushq %rdi
	pushq %rsi

	#; the real function

	#; if nframes == 0, go to end
	cmp $0, %rdx
	je .MBNG_END

	#; Check for alignment

	movq %rdi, %rax
	andq $12, %rax #; mask alignment offset

	movq %rsi, %rbx
	andq $12, %rbx #; mask alignment offset

	cmp %rax, %rbx
	jne .MBNG_NONALIGN #; if not aligned, calculate manually
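
	#; same mutual-alignment test as in x86_sse_mix_buffers_with_gain
	#; above: equal offsets can be fixed up in the pre-loop, unequal
	#; offsets force the scalar path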

	cmp $0, %rbx
	je .MBNG_SSE

	#; Pre-loop, we need to run 1-3 frames "manually" without
	#; packed SSE instructions

.MBNG_PRELOOP:

	movss (%rsi), %xmm0
	addss (%rdi), %xmm0
	movss %xmm0, (%rdi)

	addq $4, %rdi #; dst++
	addq $4, %rsi #; src++
	decq %rdx     #; nframes--
	jz .MBNG_END
	addq $4, %rbx

	cmp $16, %rbx #; test if we've reached 16 byte alignment
	jne .MBNG_PRELOOP

.MBNG_SSE:

	cmp $4, %rdx #; if there are frames left, but less than 4
	jnge .MBNG_NONALIGN #; we can't run SSE

.MBNG_SSELOOP:

	movaps (%rsi), %xmm0 #; source => xmm0
	addps  (%rdi), %xmm0 #; mix with destination
	movaps %xmm0, (%rdi) #; copy result to destination

	addq $16, %rdi #; dst+=4
	addq $16, %rsi #; src+=4

	subq $4, %rdx #; nframes-=4
	cmp $4, %rdx
	jge .MBNG_SSELOOP

	cmp $0, %rdx
	je .MBNG_END

	#; if there are remaining frames, the nonalign code will do nicely
	#; for the remaining 1-3 frames.

.MBNG_NONALIGN:
	#; not aligned!

	movss (%rsi), %xmm0 #; src => xmm0
	addss (%rdi), %xmm0 #; xmm0 += dst
	movss %xmm0, (%rdi) #; xmm0 => dst

	addq $4, %rdi
	addq $4, %rsi

	decq %rdx
	jnz .MBNG_NONALIGN

.MBNG_END:

	popq %rsi
	popq %rdi
	popq %rbx

	#; return
	leave
	ret

.size x86_sse_mix_buffers_no_gain, .-x86_sse_mix_buffers_no_gain

#; void x86_sse_apply_gain_to_buffer (float *buf, unsigned int nframes, float gain);
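#; In C terms:
#;   for (i = 0; i < nframes; i++)
#;       buf[i] *= gain;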

.globl x86_sse_apply_gain_to_buffer
.type x86_sse_apply_gain_to_buffer,@function

x86_sse_apply_gain_to_buffer:

#; %rdi float *buf
#; %rsi unsigned int nframes
#; %xmm0 float gain
#; %xmm1 float buf[0]

	pushq %rbp
	movq %rsp, %rbp

	#; save %rdi
	pushq %rdi

	#; the real function

	#; if nframes == 0, go to end
	movq %rsi, %rcx #; nframes
	cmp $0, %rcx
	je .AG_END

	#; set up the gain buffer (gain is already in %xmm0)
	shufps $0x00, %xmm0, %xmm0

	#; Check for alignment

	movq %rdi, %rdx #; buf => %rdx
	andq $12, %rdx #; mask bits 2 & 3, result = 0, 4, 8 or 12
	jz .AG_SSE #; if buffer IS aligned

	#; PRE-LOOP
	#; we iterate 1-3 times, doing scalar single-precision multiplies
	#; until we reach a 16 byte aligned "buf" (=%rdi) value

.AGLP_START:

	#; Load next value from the buffer into %xmm1
	movss (%rdi), %xmm1
	mulss %xmm0, %xmm1
	movss %xmm1, (%rdi)

	#; increment buffer, decrement counter
	addq $4, %rdi #; buf++;

	decq %rcx #; nframes--
	jz .AG_END #; if we run out of frames, we go to the end

	addq $4, %rdx #; one non-aligned frame less
	cmp $16, %rdx
	jne .AGLP_START #; if more non-aligned frames exist, we do a do-over

.AG_SSE:

	#; We have reached the 16 byte aligned "buf" (=%rdi) value

	#; Figure out how many loops we should do
	movq %rcx, %rax #; copy remaining nframes to %rax
	shr $2, %rax    #; unsigned divide by 4

	#; %rax = SSE iterations
	cmp $0, %rax
	je .AGPOST_START

.AGLP_SSE:

	movaps (%rdi), %xmm1
	mulps %xmm0, %xmm1
	movaps %xmm1, (%rdi)

	addq $16, %rdi
	subq $4, %rcx #; nframes-=4

	decq %rax
	jnz .AGLP_SSE

	#; Next we need to post-process all remaining frames
	#; the remaining frame count is in %rcx

	#; if no remaining frames, jump to the end
	andq $3, %rcx #; nframes % 4 (sets ZF)
	je .AG_END

.AGPOST_START:

	movss (%rdi), %xmm1
	mulss %xmm0, %xmm1
	movss %xmm1, (%rdi)

	#; increment buffer, decrement counter
	addq $4, %rdi #; buf++;

	decq %rcx #; nframes--
	jnz .AGPOST_START #; loop while frames remain

.AG_END:

	popq %rdi

	#; return
	leave
	ret

.size x86_sse_apply_gain_to_buffer, .-x86_sse_apply_gain_to_buffer
#; end proc

#; x86_sse_apply_gain_vector(float *buf, float *gain_vector, unsigned int nframes)
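#; In C terms:
#;   for (i = 0; i < nframes; i++)
#;       buf[i] *= gain_vector[i];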

.globl x86_sse_apply_gain_vector
.type x86_sse_apply_gain_vector,@function

x86_sse_apply_gain_vector:

#; %rdi float *buf
#; %rsi float *gain_vector
#; %rdx unsigned int nframes

	pushq %rbp
	movq %rsp, %rbp

	#; Save registers
	pushq %rdi
	pushq %rsi
	pushq %rbx

	#; if nframes == 0 go to end
	cmp $0, %rdx
	je .AGA_END

	#; Check alignment
	movq %rdi, %rax
	andq $12, %rax

	movq %rsi, %rbx
	andq $12, %rbx

	cmp %rax, %rbx
	jne .AGA_ENDLOOP #; unequal offsets, stay scalar

	cmp $0, %rax
	jz .AGA_SSE #; if buffers are aligned, jump to the SSE loop

	#; Buffers aren't 16 byte aligned, but they are unaligned by the same amount
.AGA_ALIGNLOOP:

	movss (%rdi), %xmm0 #; buf => xmm0
	movss (%rsi), %xmm1 #; gain value => xmm1
	mulss %xmm1, %xmm0  #; xmm1 * xmm0 => xmm0
	movss %xmm0, (%rdi) #; signal with gain => buf

	decq %rdx
	jz .AGA_END

	addq $4, %rdi #; buf++
	addq $4, %rsi #; gain_vector++

	addq $4, %rax
	cmp $16, %rax
	jne .AGA_ALIGNLOOP

	#; There are frames left for sure, as that is checked in the beginning
	#; and within the previous loop. BUT, there might be less than 4 frames
	#; to process

.AGA_SSE:
	movq %rdx, %rax #; nframes => %rax
	shr $2, %rax    #; unsigned divide by 4

	cmp $0, %rax #; if fewer than 4 frames remain, skip the SSE loop
	je .AGA_ENDLOOP

.AGA_SSELOOP:
	movaps (%rdi), %xmm0
	movaps (%rsi), %xmm1
	mulps %xmm1, %xmm0
	movaps %xmm0, (%rdi)

	addq $16, %rdi
	addq $16, %rsi

	decq %rax
	jnz .AGA_SSELOOP

	andq $3, %rdx #; Remaining frames are nframes & 3
	jz .AGA_END

	#; Inside this loop, we know there are frames left to process
	#; but because either there are < 4 frames left, or the buffers
	#; are not aligned, we can't use the parallel SSE ops
.AGA_ENDLOOP:
	movss (%rdi), %xmm0 #; buf => xmm0
	movss (%rsi), %xmm1 #; gain value => xmm1
	mulss %xmm1, %xmm0  #; xmm1 * xmm0 => xmm0
	movss %xmm0, (%rdi) #; signal with gain => buf

	addq $4, %rdi
	addq $4, %rsi
	decq %rdx #; nframes--
	jnz .AGA_ENDLOOP

.AGA_END:

	popq %rbx
	popq %rsi
	popq %rdi

	leave
	ret

.size x86_sse_apply_gain_vector, .-x86_sse_apply_gain_vector
#; end proc

#; float x86_sse_compute_peak(float *buf, long nframes, float current);
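#; In C terms:
#;   for (i = 0; i < nframes; i++)
#;       current = fmaxf (current, fabsf (buf[i]));
#;   return current;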

.globl x86_sse_compute_peak
.type x86_sse_compute_peak,@function

x86_sse_compute_peak:

#; %rdi float *buf
#; %rsi unsigned int nframes
#; %xmm0 float current
#; %xmm1 float buf[0]

	pushq %rbp
	movq %rsp, %rbp

	#; save %rdi
	pushq %rdi

	#; if nframes == 0, go to end
	movq %rsi, %rcx #; nframes
	cmp $0, %rcx
	je .CP_END

	#; create the "abs" mask in %xmm2
	pushq $2147483647
	movss (%rsp), %xmm2
	addq $8, %rsp
	shufps $0x00, %xmm2, %xmm2
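
	#; 2147483647 = 0x7fffffff: every bit except the IEEE 754 sign
	#; bit. andps with this mask clears the sign, i.e. it computes
	#; fabsf() on four packed floats at once.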

	#; Check for alignment

	movq %rdi, %rdx #; buf => %rdx
	andq $12, %rdx #; mask bits 2 & 3, result = 0, 4, 8 or 12
	jz .CP_SSE #; if buffer IS aligned

	#; PRE-LOOP
	#; we iterate 1-3 times, doing scalar single-precision compares
	#; until we reach a 16 byte aligned "buf" (=%rdi) value

.LP_START:

	#; Load next value from the buffer
	movss (%rdi), %xmm1
	andps %xmm2, %xmm1
	maxss %xmm1, %xmm0

	#; increment buffer, decrement counter
	addq $4, %rdi #; buf++;

	decq %rcx #; nframes--
	jz .CP_END #; if we run out of frames, we go to the end

	addq $4, %rdx #; one non-aligned frame less
	cmp $16, %rdx
	jne .LP_START #; if more non-aligned frames exist, we do a do-over

.CP_SSE:

	#; We have reached the 16 byte aligned "buf" (=%rdi) value

	#; Figure out how many loops we should do
	movq %rcx, %rax #; copy remaining nframes to %rax for division

	shr $2, %rax #; unsigned divide by 4
	jz .POST_START

	#; %rax = SSE iterations

	#; current maximum is at %xmm0, but we need to ..
	shufps $0x00, %xmm0, %xmm0 #; shuffle "current" to all 4 FP's

	#;prefetcht0 16(%rdi)

.LP_SSE:

	movaps (%rdi), %xmm1
	andps %xmm2, %xmm1
	maxps %xmm1, %xmm0

	addq $16, %rdi

	decq %rax
	jnz .LP_SSE

	#; Calculate the maximum value contained in the 4 FP's in %xmm0
	movaps %xmm0, %xmm1
	shufps $0x4e, %xmm1, %xmm1 #; shuffle left & right pairs (1234 => 3412)
	maxps %xmm1, %xmm0         #; maximums of the two pairs
	movaps %xmm0, %xmm1
	shufps $0xb1, %xmm1, %xmm1 #; shuffle the floats inside the two pairs (1234 => 2143)
	maxps %xmm1, %xmm0
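
	#; e.g. with %xmm0 = [1 5 2 4]:
	#;   0x4e -> [2 4 1 5], maxps -> [2 5 2 5]
	#;   0xb1 -> [5 2 5 2], maxps -> [5 5 5 5]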

	#; now every float in %xmm0 is the same value, current maximum value

	#; Next we need to post-process all remaining frames
	#; the remaining frame count is in %rcx

	#; if no remaining frames, jump to the end
	andq $3, %rcx #; nframes % 4
	jz .CP_END

.POST_START:

	movss (%rdi), %xmm1
	andps %xmm2, %xmm1
	maxss %xmm1, %xmm0

	addq $4, %rdi #; buf++;

	decq %rcx #; nframes--;
	jnz .POST_START

.CP_END:

	popq %rdi

	#; return
	leave
	ret

.size x86_sse_compute_peak, .-x86_sse_compute_peak
#; end proc

#ifdef __ELF__
.section .note.GNU-stack,"",%progbits
#endif