/*
    Copyright (C) 2005 Paul Davis

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

    Author: Sampo Savolainen

    $Id$
*/
#; void x86_sse_mix_buffers_with_gain (float *dst, float *src, long nframes, float gain);
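
#; In C terms, this routine computes, roughly:
#;
#;   for (long i = 0; i < nframes; i++) {
#;       dst[i] += src[i] * gain;
#;   }
#;
#; The code below is an SSE-accelerated version of that loop, with
#; scalar pre/post loops to handle unaligned and leftover frames.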

.globl x86_sse_mix_buffers_with_gain
	.type	x86_sse_mix_buffers_with_gain,@function

x86_sse_mix_buffers_with_gain:
#; 8(%ebp)  = float *dst     = %edi
#; 12(%ebp) = float *src     = %esi
#; 16(%ebp) = long  nframes  = %ecx
#; 20(%ebp) = float gain     = st(0)

	pushl %ebp
	movl %esp, %ebp

	#; save the registers
	#; pushl %eax
	pushl %ebx
	#; pushl %ecx
	pushl %edi
	pushl %esi

	#; if nframes == 0, go to end
	movl 16(%ebp), %ecx #; nframes
	cmp $0, %ecx
	je .MBWG_END

	#; Check for alignment

	movl 8(%ebp), %edi  #; dst
	movl 12(%ebp), %esi #; src

	movl %edi, %eax
	andl $12, %eax #; mask alignment offset

	movl %esi, %ebx
	andl $12, %ebx #; mask alignment offset

	cmp %eax, %ebx
	jne .MBWG_NONALIGN #; if not aligned, calculate manually
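
	#; Note: "andl $12" keeps bits 2-3 of the address, i.e. the pointer's
	#; offset within a 16 byte block (0, 4, 8 or 12, since the float
	#; pointers are assumed to be at least 4 byte aligned). The two
	#; buffers can only reach 16 byte alignment together via the same
	#; pre-loop if these offsets are equal.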

	#; if we are aligned
	cmp $0, %ebx
	jz .MBWG_SSE

	#; Pre-loop, we need to run 1-3 frames "manually" without
	#; SSE instructions

	movss 20(%ebp), %xmm1 #; gain => xmm1

.MBWG_PRELOOP:

	movss (%esi), %xmm0
	mulss %xmm1, %xmm0
	addss (%edi), %xmm0
	movss %xmm0, (%edi)

	addl $4, %edi #; dst++
	addl $4, %esi #; src++
	decl %ecx     #; nframes--
	jz .MBWG_END

	#; cmp $0, %ecx
	#; je .MBWG_END #; if we run out of frames, go to end

	addl $4, %ebx

	cmp $16, %ebx #; test if we've reached 16 byte alignment
	jne .MBWG_PRELOOP

.MBWG_SSE:

	cmp $4, %ecx        #; we know it's not zero, but if it's not >=4, then
	jnge .MBWG_NONALIGN #; we jump straight to the "normal" code

	#; copy gain to fill %xmm1
	movss 20(%ebp), %xmm1
	shufps $0x00, %xmm1, %xmm1

.MBWG_SSELOOP:

	movaps (%esi), %xmm0 #; source => xmm0
	mulps  %xmm1,  %xmm0 #; apply gain to source
	addps  (%edi), %xmm0 #; mix with destination
	movaps %xmm0, (%edi) #; copy result to destination

	addl $16, %edi #; dst+=4
	addl $16, %esi #; src+=4

	subl $4, %ecx #; nframes-=4
	cmp $4, %ecx
	jge .MBWG_SSELOOP

	cmp $0, %ecx
	je .MBWG_END

	#; if there are remaining frames, the nonalign code will do nicely
	#; for the rest 1-3 frames.

.MBWG_NONALIGN:
	#; not aligned!

	movss 20(%ebp), %xmm1 #; gain => xmm1

.MBWG_NONALIGNLOOP:

	movss (%esi), %xmm0
	mulss %xmm1, %xmm0
	addss (%edi), %xmm0
	movss %xmm0, (%edi)

	addl $4, %edi
	addl $4, %esi

	decl %ecx
	jnz .MBWG_NONALIGNLOOP

.MBWG_END:

	popl %esi
	popl %edi
	#; popl %ecx
	popl %ebx
	#; popl %eax

	#; return
	leave
	ret

.size x86_sse_mix_buffers_with_gain, .-x86_sse_mix_buffers_with_gain


#; void x86_sse_mix_buffers_no_gain (float *dst, float *src, long nframes);
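
#; In C terms, this routine computes, roughly:
#;
#;   for (long i = 0; i < nframes; i++) {
#;       dst[i] += src[i];
#;   }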

.globl x86_sse_mix_buffers_no_gain
	.type	x86_sse_mix_buffers_no_gain,@function

x86_sse_mix_buffers_no_gain:
#; 8(%ebp)  = float *dst    = %edi
#; 12(%ebp) = float *src    = %esi
#; 16(%ebp) = long  nframes = %ecx

	pushl %ebp
	movl %esp, %ebp

	#; save the registers
	#; pushl %eax
	pushl %ebx
	#; pushl %ecx
	pushl %edi
	pushl %esi

	#; the real function

	#; if nframes == 0, go to end
	movl 16(%ebp), %ecx #; nframes
	cmp $0, %ecx
	je .MBNG_END

	#; Check for alignment

	movl 8(%ebp), %edi  #; dst
	movl 12(%ebp), %esi #; src

	movl %edi, %eax
	andl $12, %eax #; mask alignment offset

	movl %esi, %ebx
	andl $12, %ebx #; mask alignment offset

	cmp %eax, %ebx
	jne .MBNG_NONALIGN #; if not aligned, calculate manually

	cmp $0, %ebx
	je .MBNG_SSE

	#; Pre-loop, we need to run 1-3 frames "manually" without
	#; SSE instructions

.MBNG_PRELOOP:

	movss (%esi), %xmm0
	addss (%edi), %xmm0
	movss %xmm0, (%edi)

	addl $4, %edi #; dst++
	addl $4, %esi #; src++
	decl %ecx     #; nframes--
	jz .MBNG_END

	addl $4, %ebx

	cmp $16, %ebx #; test if we've reached 16 byte alignment
	jne .MBNG_PRELOOP

.MBNG_SSE:

	cmp $4, %ecx        #; if there are frames left, but less than 4
	jnge .MBNG_NONALIGN #; we can't run SSE

.MBNG_SSELOOP:

	movaps (%esi), %xmm0 #; source => xmm0
	addps  (%edi), %xmm0 #; mix with destination
	movaps %xmm0, (%edi) #; copy result to destination

	addl $16, %edi #; dst+=4
	addl $16, %esi #; src+=4

	subl $4, %ecx #; nframes-=4
	cmp $4, %ecx
	jge .MBNG_SSELOOP

	cmp $0, %ecx
	je .MBNG_END

	#; if there are remaining frames, the nonalign code will do nicely
	#; for the rest 1-3 frames.

.MBNG_NONALIGN:
	#; not aligned!

	movss (%esi), %xmm0 #; src => xmm0
	addss (%edi), %xmm0 #; xmm0 += dst
	movss %xmm0, (%edi) #; xmm0 => dst

	addl $4, %edi
	addl $4, %esi

	decl %ecx
	jnz .MBNG_NONALIGN

.MBNG_END:

	popl %esi
	popl %edi
	#; popl %ecx
	popl %ebx
	#; popl %eax

	#; return
	leave
	ret

.size x86_sse_mix_buffers_no_gain, .-x86_sse_mix_buffers_no_gain


#; void x86_sse_apply_gain_to_buffer (float *buf, long nframes, float gain);
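
#; In C terms, this routine scales the buffer in place:
#;
#;   for (long i = 0; i < nframes; i++) {
#;       buf[i] *= gain;
#;   }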

.globl x86_sse_apply_gain_to_buffer
	.type	x86_sse_apply_gain_to_buffer,@function

x86_sse_apply_gain_to_buffer:
#; 8(%ebp)  = float *buf    = %edi
#; 12(%ebp) = long  nframes = %ecx
#; 16(%ebp) = float gain    = st(0)

	pushl %ebp
	movl %esp, %ebp

	#; save %edi
	pushl %edi

	#; the real function

	#; if nframes == 0, go to end
	movl 12(%ebp), %ecx #; nframes
	cmp $0, %ecx
	je .AG_END

	#; create the gain buffer in %xmm1
	movss 16(%ebp), %xmm1
	shufps $0x00, %xmm1, %xmm1
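
	#; %xmm1 now holds the gain in all four lanes, so a single mulps
	#; below scales four samples at a time.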

	#; Check for alignment

	movl 8(%ebp), %edi #; buf
	movl %edi, %edx    #; buf => %edx
	andl $12, %edx     #; mask bits 2 & 3, result = 0, 4, 8 or 12
	jz .AG_SSE         #; if buffer IS aligned

	#; PRE-LOOP
	#; we iterate 1-3 times, processing one float at a time with scalar
	#; SSE instructions, until "buf" (=%edi) reaches a 16 byte aligned value

.AGLP_START:

	#; Load next value from the buffer
	movss (%edi), %xmm0
	mulss %xmm1, %xmm0
	movss %xmm0, (%edi)

	#; increment buffer, decrement counter
	addl $4, %edi #; buf++;

	decl %ecx     #; nframes--
	jz .AG_END    #; if we run out of frames, we go to the end

	addl $4, %edx   #; one non-aligned frame less
	cmp $16, %edx
	jne .AGLP_START #; if more non-aligned frames exist, we do a do-over

.AG_SSE:

	#; We have reached the 16 byte aligned "buf" ("edi") value

	#; Figure out how many loops we should do
	movl %ecx, %eax #; copy remaining nframes to %eax for division
	movl $0, %edx   #; zero %edx for the 64-bit dividend

	pushl %edi
	movl $4, %edi
	divl %edi       #; %eax = nframes / 4 (SSE iterations), %edx = remainder
	popl %edi
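
	#; Only the quotient is used here; the remaining 1-3 frames are
	#; recomputed below with "andl $3, %ecx" before the post-loop.
	#; (x86_sse_compute_peak further down does the same divide with a
	#; simple "shr $2" instead.)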

	#; %eax = SSE iterations
	cmp $0, %eax
	je .AGPOST_START

.AGLP_SSE:

	movaps (%edi), %xmm0
	mulps %xmm1, %xmm0
	movaps %xmm0, (%edi)

	addl $16, %edi
	#; subl $4, %ecx #; nframes-=4

	decl %eax
	jnz .AGLP_SSE

	#; Next we need to post-process all remaining frames
	#; the remaining frame count is in %ecx

	#; if no remaining frames, jump to the end
	#; cmp $0, %ecx
	andl $3, %ecx #; nframes % 4
	je .AG_END

.AGPOST_START:

	movss (%edi), %xmm0
	mulss %xmm1, %xmm0
	movss %xmm0, (%edi)

	#; increment buffer, decrement counter
	addl $4, %edi #; buf++;

	decl %ecx         #; nframes--
	jnz .AGPOST_START #; loop until no frames remain

.AG_END:

	popl %edi

	#; return
	leave
	ret

.size x86_sse_apply_gain_to_buffer, .-x86_sse_apply_gain_to_buffer
#; end proc


#; float x86_sse_compute_peak(float *buf, long nframes, float current);
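
#; In C terms, this routine returns the running peak of the buffer's
#; absolute sample values:
#;
#;   for (long i = 0; i < nframes; i++) {
#;       if (fabsf(buf[i]) > current) current = fabsf(buf[i]);
#;   }
#;   return current;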

.globl x86_sse_compute_peak
	.type	x86_sse_compute_peak,@function

x86_sse_compute_peak:
#; 8(%ebp)  = float *buf     = %edi
#; 12(%ebp) = long  nframes  = %ecx
#; 16(%ebp) = float current  = st(0)

	pushl %ebp
	movl %esp, %ebp

	#; save %edi
	pushl %edi

	#; the real function

	#; Load "current" in xmm0
	movss 16(%ebp), %xmm0

	#; if nframes == 0, go to end
	movl 12(%ebp), %ecx #; nframes
	cmp $0, %ecx
	je .CP_END

	#; create the "abs" mask in %xmm2
	pushl $2147483647
	movss (%esp), %xmm2
	addl $4, %esp
	shufps $0x00, %xmm2, %xmm2
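
	#; 2147483647 = 0x7fffffff, i.e. every bit of an IEEE-754 single
	#; except the sign bit. ANDing a float with this mask clears the
	#; sign bit, which is a branch-free fabsf(); the shufps replicates
	#; the mask to all four lanes.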

	#; Check for alignment

	movl 8(%ebp), %edi #; buf
	movl %edi, %edx    #; buf => %edx
	andl $12, %edx     #; mask bits 2 & 3, result = 0, 4, 8 or 12
	jz .CP_SSE         #; if buffer IS aligned

	#; PRE-LOOP
	#; we iterate 1-3 times, doing scalar SSE compares (maxss), until
	#; "buf" (=%edi) reaches a 16 byte aligned value

.LP_START:

	#; Load next value from the buffer
	movss (%edi), %xmm1
	andps %xmm2, %xmm1
	maxss %xmm1, %xmm0

	#; increment buffer, decrement counter
	addl $4, %edi #; buf++;

	decl %ecx     #; nframes--
	jz .CP_END    #; if we run out of frames, we go to the end

	addl $4, %edx #; one non-aligned frame less
	cmp $16, %edx
	jne .LP_START #; if more non-aligned frames exist, we do a do-over

.CP_SSE:

	#; We have reached the 16 byte aligned "buf" ("edi") value

	#; Figure out how many loops we should do
	movl %ecx, %eax #; copy remaining nframes to %eax for division

	shr $2, %eax #; unsigned divide by 4
	jz .POST_START

	#; %eax = SSE iterations

	#; current maximum is at %xmm0, but we need to ..
	shufps $0x00, %xmm0, %xmm0 #; shuffle "current" to all 4 FP's

	#; prefetcht0 16(%edi)

.LP_SSE:

	movaps (%edi), %xmm1
	andps %xmm2, %xmm1
	maxps %xmm1, %xmm0

	addl $16, %edi

	decl %eax
	jnz .LP_SSE

	#; Calculate the maximum value contained in the 4 FP's in %xmm0
	movaps %xmm0, %xmm1
	shufps $0x4e, %xmm1, %xmm1 #; shuffle left & right pairs (1234 => 3412)
	maxps %xmm1, %xmm0         #; maximums of the two pairs
	movaps %xmm0, %xmm1
	shufps $0xb1, %xmm1, %xmm1 #; shuffle the floats inside the two pairs (1234 => 2143)
	maxps %xmm1, %xmm0
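
	#; Worked example of the reduction above: if %xmm0 = [a b c d],
	#; the first shuffle+max gives [max(a,c) max(b,d) max(a,c) max(b,d)]
	#; and the second leaves max(a,b,c,d) in every lane.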

	#; now every float in %xmm0 is the same value, current maximum value

	#; Next we need to post-process all remaining frames
	#; the remaining frame count is in %ecx

	#; if no remaining frames, jump to the end
	andl $3, %ecx #; nframes % 4
	jz .CP_END

.POST_START:

	movss (%edi), %xmm1
	andps %xmm2, %xmm1
	maxss %xmm1, %xmm0

	addl $4, %edi #; buf++;

	decl %ecx     #; nframes--;
	jnz .POST_START

.CP_END:

	#; Load the value from xmm0 to the float stack for returning
	movss %xmm0, 16(%ebp)
	flds 16(%ebp)

	popl %edi

	#; return
	leave
	ret

.size x86_sse_compute_peak, .-x86_sse_compute_peak
#; end proc

#ifdef __ELF__
.section .note.GNU-stack,"",%progbits
#endif