libs/ardour/sse_functions.s

   1 /*
   2     Copyright (C) 2005 Paul Davis
   3
   4     This program is free software; you can redistribute it and/or modify
   5     it under the terms of the GNU General Public License as published by
   6     the Free Software Foundation; either version 2 of the License, or
   7     (at your option) any later version.
   8
   9     This program is distributed in the hope that it will be useful,
  10     but WITHOUT ANY WARRANTY; without even the implied warranty of
  11     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12     GNU General Public License for more details.
  13
  14     You should have received a copy of the GNU General Public License
  15     along with this program; if not, write to the Free Software
  16     Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  17
  18         Author: Sampo Savolainen
  19
  20     $Id$
  21 */
  22
  23
  24 #; void x86_sse_mix_buffers_with_gain (float *dst, float *src, long nframes, float gain);
  25
  26 .globl x86_sse_mix_buffers_with_gain
  27         .type   x86_sse_mix_buffers_with_gain,@function
  28
  29 x86_sse_mix_buffers_with_gain:
  30 #; 8(%ebp)      = float *dst    = %edi
  31 #; 12(%ebp) = float *src        = %esi
  32 #; 16(%ebp) = long      nframes = %ecx
  33 #; 20(%ebp) = float     gain    = st(0)
  34
  35         pushl %ebp
  36         movl %esp, %ebp
  37
  38         #; save the registers
  39 #;      pushl %eax
  40         pushl %ebx
  41 #;      pushl %ecx
  42         pushl %edi
  43         pushl %esi
  44
  45         #; if nframes == 0, go to end
  46         movl 16(%ebp), %ecx #; nframes
  47         cmp     $0, %ecx
  48         je      .MBWG_END
  49
  50         #; Check for alignment
  51
  52         movl 8(%ebp), %edi  #; dst
  53         movl 12(%ebp), %esi #; src
  54
  55         movl %edi, %eax
  56         andl $12, %eax #; mask alignemnt offset
  57
  58         movl %esi, %ebx
  59         andl $12, %ebx #; mask alignment offset
  60
  61         cmp %eax, %ebx
  62         jne .MBWG_NONALIGN #; if not aligned, calculate manually
  63
  64         #; if we are aligned
  65         cmp $0, %ebx
  66         jz .MBWG_SSE
  67
  68         #; Pre-loop, we need to run 1-3 frames "manually" without
  69         #; SSE instructions
  70
  71         movss 20(%ebp), %xmm1 #; xmm1
  72
  73 .MBWG_PRELOOP:
  74
  75         movss (%esi), %xmm0
  76         mulss %xmm1, %xmm0
  77         addss (%edi), %xmm0
  78         movss %xmm0, (%edi)
  79
  80         addl $4, %edi #; dst++
  81         addl $4, %esi #; src++
  82         decl %ecx         #; nframes--
  83         jz .MBWG_END
  84
  85 #;      cmp $0, %ecx
  86 #;      je .MBWG_END #; if we run out of frames, go to end
  87
  88         addl $4, %ebx
  89
  90         cmp $16, %ebx #; test if we've reached 16 byte alignment
  91         jne .MBWG_PRELOOP
  92
  93
  94 .MBWG_SSE:
  95
  96         cmp $4, %ecx #; we know it's not zero, but if it's not >=4, then
  97         jnge .MBWG_NONALIGN #; we jump straight to the "normal" code
  98
  99         #; copy gain to fill %xmm1
 100         movss   20(%ebp), %xmm1
 101     shufps  $0x00, %xmm1, %xmm1
 102
 103
 104 .MBWG_SSELOOP:
 105
 106         movaps  (%esi), %xmm0 #; source => xmm0
 107         mulps   %xmm1,  %xmm0 #; apply gain to source
 108         addps   (%edi), %xmm0 #; mix with destination
 109         movaps  %xmm0, (%edi) #; copy result to destination
 110
 111         addl $16, %edi #; dst+=4
 112         addl $16, %esi #; src+=4
 113
 114         subl $4, %ecx #; nframes-=4
 115         cmp $4, %ecx
 116         jge .MBWG_SSELOOP
 117
 118         cmp $0, %ecx
 119         je .MBWG_END
 120
 121         #; if there are remaining frames, the nonalign code will do nicely
 122         #; for the rest 1-3 frames.
 123
 124 .MBWG_NONALIGN:
 125         #; not aligned!
 126
 127         movss 20(%ebp), %xmm1 #; gain => xmm1
 128
 129 .MBWG_NONALIGNLOOP:
 130
 131         movss (%esi), %xmm0
 132         mulss %xmm1, %xmm0
 133         addss (%edi), %xmm0
 134         movss %xmm0, (%edi)
 135
 136         addl $4, %edi
 137         addl $4, %esi
 138
 139         decl %ecx
 140         jnz .MBWG_NONALIGNLOOP
 141
 142 .MBWG_END:
 143
 144         popl %esi
 145         popl %edi
 146 #;      popl %ecx
 147         popl %ebx
 148 #;      popl %eax
 149
 150         #; return
 151         leave
 152         ret
 153
 154 .size   x86_sse_mix_buffers_with_gain, .-x86_sse_mix_buffers_with_gain
 155
 156
 157
 158
 159 #; void x86_sse_mix_buffers_no_gain (float *dst, float *src, long nframes);
 160
 161 .globl x86_sse_mix_buffers_no_gain
 162         .type   x86_sse_mix_buffers_no_gain,@function
 163
 164 x86_sse_mix_buffers_no_gain:
 165 #; 8(%ebp)      = float *dst    = %edi
 166 #; 12(%ebp) = float *src        = %esi
 167 #; 16(%ebp) = long      nframes = %ecx
 168
 169         pushl %ebp
 170         movl %esp, %ebp
 171
 172         #; save the registers
 173 #;      pushl %eax
 174         pushl %ebx
 175 #;      pushl %ecx
 176         pushl %edi
 177         pushl %esi
 178
 179         #; the real function
 180
 181         #; if nframes == 0, go to end
 182         movl 16(%ebp), %ecx #; nframes
 183         cmp     $0, %ecx
 184         je      .MBNG_END
 185
 186         #; Check for alignment
 187
 188         movl 8(%ebp), %edi  #; dst
 189         movl 12(%ebp), %esi #; src
 190
 191         movl %edi, %eax
 192         andl $12, %eax #; mask alignemnt offset
 193
 194         movl %esi, %ebx
 195         andl $12, %ebx #; mask alignment offset
 196
 197         cmp %eax, %ebx
 198         jne .MBNG_NONALIGN #; if not aligned, calculate manually
 199
 200         cmp $0, %ebx
 201         je .MBNG_SSE
 202
 203         #; Pre-loop, we need to run 1-3 frames "manually" without
 204         #; SSE instructions
 205
 206 .MBNG_PRELOOP:
 207
 208         movss (%esi), %xmm0
 209         addss (%edi), %xmm0
 210         movss %xmm0, (%edi)
 211
 212         addl $4, %edi #; dst++
 213         addl $4, %esi #; src++
 214         decl %ecx         #; nframes--
 215         jz      .MBNG_END
 216         addl $4, %ebx
 217
 218         cmp $16, %ebx #; test if we've reached 16 byte alignment
 219         jne .MBNG_PRELOOP
 220
 221 .MBNG_SSE:
 222
 223         cmp $4, %ecx #; if there are frames left, but less than 4
 224         jnge .MBNG_NONALIGN #; we can't run SSE
 225
 226 .MBNG_SSELOOP:
 227
 228         movaps  (%esi), %xmm0 #; source => xmm0
 229         addps   (%edi), %xmm0 #; mix with destination
 230         movaps  %xmm0, (%edi) #; copy result to destination
 231
 232         addl $16, %edi #; dst+=4
 233         addl $16, %esi #; src+=4
 234
 235         subl $4, %ecx #; nframes-=4
 236         cmp $4, %ecx
 237         jge .MBNG_SSELOOP
 238
 239         cmp $0, %ecx
 240         je .MBNG_END
 241
 242         #; if there are remaining frames, the nonalign code will do nicely
 243         #; for the rest 1-3 frames.
 244
 245 .MBNG_NONALIGN:
 246         #; not aligned!
 247
 248         movss (%esi), %xmm0 #; src => xmm0
 249         addss (%edi), %xmm0 #; xmm0 += dst
 250         movss %xmm0, (%edi) #; xmm0 => dst
 251
 252         addl $4, %edi
 253         addl $4, %esi
 254
 255         decl %ecx
 256         jnz .MBNG_NONALIGN
 257
 258 .MBNG_END:
 259
 260         popl %esi
 261         popl %edi
 262 #;      popl %ecx
 263         popl %ebx
 264 #;      popl %eax
 265
 266         #; return
 267         leave
 268         ret
 269
 270 .size   x86_sse_mix_buffers_no_gain, .-x86_sse_mix_buffers_no_gain
 271
 272
 273
 274
 275 #; void x86_sse_apply_gain_to_buffer (float *buf, long nframes, float gain);
 276
 277 .globl x86_sse_apply_gain_to_buffer
 278         .type   x86_sse_apply_gain_to_buffer,@function
 279
 280 x86_sse_apply_gain_to_buffer:
 281 #; 8(%ebp)      = float *buf    = %edi
 282 #; 12(%ebp) = long      nframes = %ecx
 283 #; 16(%ebp) = float     gain    = st(0)
 284
 285         pushl %ebp
 286         movl %esp, %ebp
 287
 288         #; save %edi
 289         pushl %edi
 290
 291         #; the real function
 292
 293         #; if nframes == 0, go to end
 294         movl 12(%ebp), %ecx #; nframes
 295         cmp     $0, %ecx
 296         je      .AG_END
 297
 298         #; create the gain buffer in %xmm1
 299         movss   16(%ebp), %xmm1
 300         shufps  $0x00, %xmm1, %xmm1
 301
 302         #; Check for alignment
 303
 304         movl 8(%ebp), %edi #; buf
 305         movl %edi, %edx #; buf => %edx
 306         andl $12, %edx #; mask bits 1 & 2, result = 0, 4, 8 or 12
 307         jz      .AG_SSE #; if buffer IS aligned
 308
 309         #; PRE-LOOP
 310         #; we iterate 1-3 times, doing normal x87 float comparison
 311         #; so we reach a 16 byte aligned "buf" (=%edi) value
 312
 313 .AGLP_START:
 314
 315         #; Load next value from the buffer
 316         movss (%edi), %xmm0
 317         mulss %xmm1, %xmm0
 318         movss %xmm0, (%edi)
 319
 320         #; increment buffer, decrement counter
 321         addl $4, %edi #; buf++;
 322
 323         decl %ecx   #; nframes--
 324         jz      .AG_END #; if we run out of frames, we go to the end
 325
 326         addl $4, %edx #; one non-aligned byte less
 327         cmp $16, %edx
 328         jne .AGLP_START #; if more non-aligned frames exist, we do a do-over
 329
 330 .AG_SSE:
 331
 332         #; We have reached the 16 byte aligned "buf" ("edi") value
 333
 334         #; Figure out how many loops we should do
 335         movl %ecx, %eax #; copy remaining nframes to %eax for division
 336         movl $0, %edx   #; 0 the edx register
 337
 338
 339         pushl %edi
 340         movl $4, %edi
 341         divl %edi #; %edx = remainder == 0
 342         popl %edi
 343
 344         #; %eax = SSE iterations
 345         cmp $0, %eax
 346         je .AGPOST_START
 347
 348
 349 .AGLP_SSE:
 350
 351         movaps (%edi), %xmm0
 352         mulps %xmm1, %xmm0
 353         movaps %xmm0, (%edi)
 354
 355         addl $16, %edi
 356 #;      subl $4, %ecx   #; nframes-=4
 357
 358         decl %eax
 359         jnz .AGLP_SSE
 360
 361         #; Next we need to post-process all remaining frames
 362         #; the remaining frame count is in %ecx
 363
 364         #; if no remaining frames, jump to the end
 365 #;      cmp $0, %ecx
 366         andl $3, %ecx #; nframes % 4
 367         je .AG_END
 368
 369 .AGPOST_START:
 370
 371         movss (%edi), %xmm0
 372         mulss %xmm1, %xmm0
 373         movss %xmm0, (%edi)
 374
 375         #; increment buffer, decrement counter
 376         addl $4, %edi #; buf++;
 377
 378         decl %ecx   #; nframes--
 379         jnz     .AGPOST_START #; if we run out of frames, we go to the end
 380
 381 .AG_END:
 382
 383
 384         popl %edi
 385
 386         #; return
 387         leave
 388         ret
 389
 390 .size   x86_sse_apply_gain_to_buffer, .-x86_sse_apply_gain_to_buffer
 391 #; end proc
 392
 393
 394
 395 #; float x86_sse_compute_peak(float *buf, long nframes, float current);
 396
 397 .globl x86_sse_compute_peak
 398         .type   x86_sse_compute_peak,@function
 399
 400 x86_sse_compute_peak:
 401 #; 8(%ebp)      = float *buf    = %edi
 402 #; 12(%ebp) = long      nframes = %ecx
 403 #; 16(%ebp) = float     current = st(0)
 404
 405         pushl %ebp
 406         movl %esp, %ebp
 407
 408         #; save %edi
 409         pushl %edi
 410
 411         #; the real function
 412
 413         #; Load "current" in xmm0
 414         movss 16(%ebp), %xmm0
 415
 416         #; if nframes == 0, go to end
 417         movl 12(%ebp), %ecx #; nframes
 418         cmp     $0, %ecx
 419         je      .CP_END
 420
 421         #; create the "abs" mask in %xmm2
 422         pushl   $2147483647
 423         movss   (%esp), %xmm2
 424         addl    $4, %esp
 425         shufps  $0x00, %xmm2, %xmm2
 426
 427         #; Check for alignment
 428
 429         movl 8(%ebp), %edi #; buf
 430         movl %edi, %edx #; buf => %edx
 431         andl $12, %edx #; mask bits 1 & 2, result = 0, 4, 8 or 12
 432         jz      .CP_SSE #; if buffer IS aligned
 433
 434         #; PRE-LOOP
 435         #; we iterate 1-3 times, doing normal x87 float comparison
 436         #; so we reach a 16 byte aligned "buf" (=%edi) value
 437
 438 .LP_START:
 439
 440         #; Load next value from the buffer
 441         movss (%edi), %xmm1
 442         andps %xmm2, %xmm1
 443         maxss %xmm1, %xmm0
 444
 445         #; increment buffer, decrement counter
 446         addl $4, %edi #; buf++;
 447
 448         decl %ecx   #; nframes--
 449         jz      .CP_END #; if we run out of frames, we go to the end
 450
 451         addl $4, %edx #; one non-aligned byte less
 452         cmp $16, %edx
 453         jne .LP_START #; if more non-aligned frames exist, we do a do-over
 454
 455 .CP_SSE:
 456
 457         #; We have reached the 16 byte aligned "buf" ("edi") value
 458
 459         #; Figure out how many loops we should do
 460         movl %ecx, %eax #; copy remaining nframes to %eax for division
 461
 462         shr $2,%eax #; unsigned divide by 4
 463         jz .POST_START
 464
 465         #; %eax = SSE iterations
 466
 467         #; current maximum is at %xmm0, but we need to ..
 468         shufps $0x00, %xmm0, %xmm0 #; shuffle "current" to all 4 FP's
 469
 470         #;prefetcht0 16(%edi)
 471
 472 .LP_SSE:
 473
 474         movaps (%edi), %xmm1
 475         andps %xmm2, %xmm1
 476         maxps %xmm1, %xmm0
 477
 478         addl $16, %edi
 479
 480         decl %eax
 481         jnz .LP_SSE
 482
 483         #; Calculate the maximum value contained in the 4 FP's in %xmm0
 484         movaps %xmm0, %xmm1
 485         shufps $0x4e, %xmm1, %xmm1 #; shuffle left & right pairs (1234 => 3412)
 486         maxps  %xmm1, %xmm0 #; maximums of the two pairs
 487         movaps %xmm0, %xmm1
 488         shufps $0xb1, %xmm1, %xmm1 #; shuffle the floats inside the two pairs (1234 => 2143)
 489         maxps  %xmm1, %xmm0
 490
 491         #; now every float in %xmm0 is the same value, current maximum value
 492
 493         #; Next we need to post-process all remaining frames
 494         #; the remaining frame count is in %ecx
 495
 496         #; if no remaining frames, jump to the end
 497
 498         andl $3, %ecx #; nframes % 4
 499         jz .CP_END
 500
 501 .POST_START:
 502
 503         movss (%edi), %xmm1
 504         andps %xmm2, %xmm1
 505         maxss %xmm1, %xmm0
 506
 507         addl $4, %edi   #; buf++;
 508
 509         decl %ecx               #; nframes--;
 510         jnz .POST_START
 511
 512 .CP_END:
 513
 514         #; Load the value from xmm0 to the float stack for returning
 515         movss %xmm0, 16(%ebp)
 516         flds 16(%ebp)
 517
 518         popl %edi
 519
 520         #; return
 521         leave
 522         ret
 523
 524 .size   x86_sse_compute_peak, .-x86_sse_compute_peak
 525 #; end proc
 526
 527 #ifdef __ELF__
 528 .section .note.GNU-stack,"",%progbits
 529 #endif
 530
 531