Copyright (C) 2005-2006 Paul Davis, John Rigg

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

Author: Sampo Savolainen
64-bit conversion: John Rigg
25 #; void x86_sse_mix_buffers_with_gain (float *dst, float *src, unsigned int nframes, float gain);
27 .globl x86_sse_mix_buffers_with_gain
28 .type x86_sse_mix_buffers_with_gain,@function
30 x86_sse_mix_buffers_with_gain
:
34 #; %rdx unsigned int nframes
45 #; if nframes == 0, go to end
49 #; Check for alignment
52 andq $
12, %rax
#; mask alignment offset
55 andq $
12, %rbx
#; mask alignment offset
58 jne
.MBWG_NONALIGN #; if not aligned, calculate manually
64 #; Pre-loop, we need to run 1-3 frames "manually" without
69 #; gain is already in %xmm0
75 addq $
4, %rdi
#; dst++
76 addq $
4, %rsi
#; src++
77 decq
%rdx
#; nframes--
82 cmp $
16, %rbx
#; test if we've reached 16 byte alignment
88 cmp $
4, %rdx
#; we know it's not zero, but if it's not >=4, then
89 jnge
.MBWG_NONALIGN #; we jump straight to the "normal" code
91 #; gain is already in %xmm0
92 shufps $
0x00, %xmm0
, %xmm0
97 movaps
(%rsi
), %xmm1
#; source => xmm0
98 mulps
%xmm0
, %xmm1
#; apply gain to source
99 addps
(%rdi
), %xmm1
#; mix with destination
100 movaps
%xmm1
, (%rdi
) #; copy result to destination
102 addq $
16, %rdi
#; dst+=4
103 addq $
16, %rsi
#; src+=4
105 subq $
4, %rdx
#; nframes-=4
112 #; if there are remaining frames, the nonalign code will do nicely
113 #; for the rest 1-3 frames.
118 #; gain is already in %xmm0
131 jnz
.MBWG_NONALIGNLOOP
143 .size x86_sse_mix_buffers_with_gain, .-x86_sse_mix_buffers_with_gain
146 #; void x86_sse_mix_buffers_no_gain (float *dst, float *src, unsigned int nframes);
148 .globl x86_sse_mix_buffers_no_gain
149 .type x86_sse_mix_buffers_no_gain,@function
151 x86_sse_mix_buffers_no_gain
:
155 #; %rdx unsigned int nframes
160 #; save the registers
167 #; if nframes == 0, go to end
171 #; Check for alignment
174 andq $
12, %rax
#; mask alignment offset
177 andq $
12, %rbx
#; mask alignment offset
180 jne
.MBNG_NONALIGN #; if not aligned, calculate manually
185 #; Pre-loop, we need to run 1-3 frames "manually" without
194 addq $
4, %rdi
#; dst++
195 addq $
4, %rsi
#; src++
196 decq
%rdx
#; nframes--
200 cmp $
16, %rbx
#; test if we've reached 16 byte alignment
205 cmp $
4, %rdx
#; if there are frames left, but less than 4
206 jnge
.MBNG_NONALIGN #; we can't run SSE
210 movaps
(%rsi
), %xmm0
#; source => xmm0
211 addps
(%rdi
), %xmm0
#; mix with destination
212 movaps
%xmm0
, (%rdi
) #; copy result to destination
214 addq $
16, %rdi
#; dst+=4
215 addq $
16, %rsi
#; src+=4
217 subq $
4, %rdx
#; nframes-=4
224 #; if there are remaining frames, the nonalign code will do nicely
225 #; for the rest 1-3 frames.
230 movss
(%rsi
), %xmm0
#; src => xmm0
231 addss
(%rdi
), %xmm0
#; xmm0 += dst
232 movss
%xmm0
, (%rdi
) #; xmm0 => dst
250 .size x86_sse_mix_buffers_no_gain, .-x86_sse_mix_buffers_no_gain
253 #; void x86_sse_apply_gain_to_buffer (float *buf, unsigned int nframes, float gain);
255 .globl x86_sse_apply_gain_to_buffer
256 .type x86_sse_apply_gain_to_buffer,@function
258 x86_sse_apply_gain_to_buffer
:
260 #; %rdi float *buf 32(%rbp)
261 #; %rsi unsigned int nframes
263 #; %xmm1 float buf[0]
273 #; if nframes == 0, go to end
274 movq
%rsi
, %rcx
#; nframes
278 #; set up the gain buffer (gain is already in %xmm0)
279 shufps $
0x00, %xmm0
, %xmm0
281 #; Check for alignment
283 movq
%rdi
, %rdx
#; buf => %rdx
284 andq $
12, %rdx
#; mask bits 1 & 2, result = 0, 4, 8 or 12
285 jz
.AG_SSE #; if buffer IS aligned
288 #; we iterate 1-3 times, doing normal x87 float comparison
289 #; so we reach a 16 byte aligned "buf" (=%rdi) value
293 #; Load next value from the buffer into %xmm1
298 #; increment buffer, decrement counter
299 addq $
4, %rdi
#; buf++;
301 decq
%rcx
#; nframes--
302 jz
.AG_END #; if we run out of frames, we go to the end
304 addq $
4, %rdx
#; one non-aligned byte less
306 jne
.AGLP_START #; if more non-aligned frames exist, we do a do-over
310 #; We have reached the 16 byte aligned "buf" ("rdi") value
312 #; Figure out how many loops we should do
313 movq
%rcx
, %rax
#; copy remaining nframes to %rax for division
314 movq $
0, %rdx
#; 0 the edx register
319 divq
%rdi
#; %rdx = remainder == 0
322 #; %rax = SSE iterations
334 subq $
4, %rcx
#; nframes-=4
339 #; Next we need to post-process all remaining frames
340 #; the remaining frame count is in %rcx
342 #; if no remaining frames, jump to the end
344 andq $
3, %rcx
#; nframes % 4
353 #; increment buffer, decrement counter
354 addq $
4, %rdi
#; buf++;
356 decq
%rcx
#; nframes--
357 jnz
.AGPOST_START #; if we run out of frames, we go to the end
368 .size x86_sse_apply_gain_to_buffer, .-x86_sse_apply_gain_to_buffer
372 #; x86_sse_apply_gain_vector(float *buf, float *gain_vector, unsigned int nframes)
374 .globl x86_sse_apply_gain_vector
375 .type x86_sse_apply_gain_vector,@function
377 x86_sse_apply_gain_vector
:
380 #; %rsi float *gain_vector
381 #; %rdx unsigned int nframes
391 #; if nframes == 0 go to end
406 jz
.AGA_SSE #; if buffers are aligned, jump to the SSE loop
408 #; Buffers aren't 16 byte aligned, but they are unaligned by the same amount
411 movss
(%rdi
), %xmm0
#; buf => xmm0
412 movss
(%rsi
), %xmm1
#; gain value => xmm1
413 mulss
%xmm1
, %xmm0
#; xmm1 * xmm0 => xmm0
414 movss
%xmm0
, (%rdi
) #; signal with gain => buf
419 addq $
4, %rdi
#; buf++
420 addq $
4, %rsi
#; gab++
426 #; There are frames left for sure, as that is checked in the beginning
427 #; and within the previous loop. BUT, there might be less than 4 frames
431 movq
%rdx
, %rax
#; nframes => %rax
432 shr $
2, %rax
#; unsigned divide by 4
434 cmp $
0, %rax
#; Jos toimii ilman tätä, niin kiva
449 andq $
3, %rdx
#; Remaining frames are nframes & 3
453 #; Inside this loop, we know there are frames left to process
454 #; but because either there are < 4 frames left, or the buffers
455 #; are not aligned, we can't use the parallel SSE ops
457 movss
(%rdi
), %xmm0
#; buf => xmm0
458 movss
(%rsi
), %xmm1
#; gain value => xmm1
459 mulss
%xmm1
, %xmm0
#; xmm1 * xmm0 => xmm0
460 movss
%xmm0
, (%rdi
) #; signal with gain => buf
464 decq
%rdx
#; nframes--
476 .size x86_sse_apply_gain_vector, .-x86_sse_apply_gain_vector
480 #; float x86_sse_compute_peak(float *buf, long nframes, float current);
482 .globl x86_sse_compute_peak
483 .type x86_sse_compute_peak,@function
486 x86_sse_compute_peak
:
488 #; %rdi float *buf 32(%rbp)
489 #; %rsi unsigned int nframes
490 #; %xmm0 float current
491 #; %xmm1 float buf[0]
499 #; if nframes == 0, go to end
500 movq
%rsi
, %rcx
#; nframes
504 #; create the "abs" mask in %xmm2
508 shufps $
0x00, %xmm2
, %xmm2
510 #; Check for alignment
512 #;movq 8(%rbp), %rdi #; buf
513 movq
%rdi
, %rdx
#; buf => %rdx
514 andq $
12, %rdx
#; mask bits 1 & 2, result = 0, 4, 8 or 12
515 jz
.CP_SSE #; if buffer IS aligned
518 #; we iterate 1-3 times, doing normal x87 float comparison
519 #; so we reach a 16 byte aligned "buf" (=%rdi) value
523 #; Load next value from the buffer
528 #; increment buffer, decrement counter
529 addq $
4, %rdi
#; buf++;
531 decq
%rcx
#; nframes--
532 jz
.CP_END #; if we run out of frames, we go to the end
534 addq $
4, %rdx
#; one non-aligned byte less
536 jne
.LP_START #; if more non-aligned frames exist, we do a do-over
540 #; We have reached the 16 byte aligned "buf" ("rdi") value
542 #; Figure out how many loops we should do
543 movq
%rcx
, %rax
#; copy remaining nframes to %rax for division
545 shr $
2,%rax
#; unsigned divide by 4
548 #; %rax = SSE iterations
550 #; current maximum is at %xmm0, but we need to ..
551 shufps $
0x00, %xmm0
, %xmm0
#; shuffle "current" to all 4 FP's
553 #;prefetcht0 16(%rdi)
566 #; Calculate the maximum value contained in the 4 FP's in %xmm0
568 shufps $
0x4e, %xmm1
, %xmm1
#; shuffle left & right pairs (1234 => 3412)
569 maxps
%xmm1
, %xmm0
#; maximums of the two pairs
571 shufps $
0xb1, %xmm1
, %xmm1
#; shuffle the floats inside the two pairs (1234 => 2143)
574 #; now every float in %xmm0 is the same value, current maximum value
576 #; Next we need to post-process all remaining frames
577 #; the remaining frame count is in %rcx
579 #; if no remaining frames, jump to the end
581 andq $
3, %rcx
#; nframes % 4
590 addq $
4, %rdi
#; buf++;
592 decq
%rcx
#; nframes--;
603 .size x86_sse_compute_peak, .-x86_sse_compute_peak
607 .section .note.GNU-stack,"",%progbits