dnl  AMD K6 mpn_sqr_basecase -- square an mpn number.

dnl  Copyright 1999-2002 Free Software Foundation, Inc.
dnl  This file is part of the GNU MP Library.
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl  or both in parallel, as here.
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  and the GNU Lesser General Public License for more details.
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
dnl  Pull in the GMP asm support macros and build configuration.
include(`../config.m4')
C K6: approx 4.7 cycles per cross product, or 9.2 cycles per triangular
C product (measured on the speed difference between 17 and 33 limbs,
C which is roughly the Karatsuba recursing range).
dnl  SQR_TOOM2_THRESHOLD_MAX is the maximum SQR_TOOM2_THRESHOLD this
dnl  code supports.  This value is used only by the tune program to know
dnl  what it can go up to.  (An attempt to compile with a bigger value will
dnl  trigger some m4_assert()s in the code, making the build fail.)
dnl
dnl  The value is determined by requiring the displacements in the unrolled
dnl  addmul to fit in single bytes.  This means a maximum UNROLL_COUNT of
dnl  63, giving a maximum SQR_TOOM2_THRESHOLD of 66.
48 deflit(SQR_TOOM2_THRESHOLD_MAX, 66)
dnl  Allow a value from the tune program to override config.m4.
dnl  Tune-program override of the config.m4 threshold, when defined.
ifdef(`SQR_TOOM2_THRESHOLD_OVERRIDE',
`define(`SQR_TOOM2_THRESHOLD',SQR_TOOM2_THRESHOLD_OVERRIDE)')
dnl  UNROLL_COUNT is the number of code chunks in the unrolled addmul.  The
dnl  number required is determined by SQR_TOOM2_THRESHOLD, since
dnl  mpn_sqr_basecase only needs to handle sizes < SQR_TOOM2_THRESHOLD.
dnl
dnl  The first addmul is the biggest, and this takes the second least
dnl  significant limb and multiplies it by the third least significant and
dnl  up.  Hence for a maximum operand size of SQR_TOOM2_THRESHOLD-1
dnl  limbs, UNROLL_COUNT needs to be SQR_TOOM2_THRESHOLD-3.
m4_config_gmp_mparam(`SQR_TOOM2_THRESHOLD')
deflit(UNROLL_COUNT, eval(SQR_TOOM2_THRESHOLD-3))
C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
C
C The algorithm is essentially the same as mpn/generic/sqr_basecase.c, but a
C lot of function call overheads are avoided, especially when the given size
C is small.
C
C The code size might look a bit excessive, but not all of it is executed
C and so won't fill up the code cache.  The 1x1, 2x2 and 3x3 special cases
C clearly apply only to those sizes; mid sizes like 10x10 only need part of
C the unrolled addmul; and big sizes like 35x35 that do need all of it will
C at least be getting value for money, because 35x35 spends something like
C 5780 cycles here.
C
C Different values of UNROLL_COUNT give slightly different speeds, between
C 9.0 and 9.2 c/tri-prod measured on the difference between 17 and 33 limbs.
C This isn't a big difference, but it's presumably some alignment effect
C which if understood could give a simple speedup.
C Incoming stack-parameter offsets (relative to %esp on entry).
defframe(PARAM_SIZE,12)
defframe(PARAM_SRC, 8)
defframe(PARAM_DST, 4)
PROLOGUE(mpn_sqr_basecase)
C -----------------------------------------------------------------------------
C -----------------------------------------------------------------------------
133 movl %eax, %ebx C src
154 mull
%edx C src
[0]*src
[1]
C -----------------------------------------------------------------------------
C -----------------------------------------------------------------------------
184 movl %eax, %ebx C src
187 movl %edx, %ecx C dst
189 mull %eax C src[0] ^ 2
197 mull %eax C src[1] ^ 2
205 mull %eax C src[2] ^ 2
213 mull %edx C src[0] * src[1]
224 mull %edx C src[0] * src[2]
233 mull %edx C src[1] * src[2]
C -----------------------------------------------------------------------------
C Local stack slots; STACK_SPACE covers all six 4-byte slots below.
defframe(SAVE_EBX, -4)
defframe(SAVE_ESI, -8)
defframe(SAVE_EDI, -12)
defframe(SAVE_EBP, -16)
defframe(VAR_COUNTER,-20)
defframe(VAR_JMP, -24)
deflit(STACK_SPACE, 24)
C First multiply src[0]*src[1..size-1] and store at dst[1..size].
C
C A test was done calling mpn_mul_1 here to get the benefit of its unrolled
C loop, but this was only a tiny speedup; at 35 limbs it took 24 cycles off
C a 5780 cycle operation, which is not surprising since the loop here is 8
C c/l and mpn_mul_1 is 6.25 c/l.
301 subl $STACK_SPACE, %esp deflit(`FRAME',STACK_SPACE
)
322 movl
(%eax), %ebp C multiplier
323 leal
-1(%ecx), %ecx C
size-1, and pad to a
16 byte boundary
C Addmul src[n]*src[n+1..size-1] at dst[2*n-1...], for each n=1..size-2.
C
C The last two addmuls, which are the bottom right corner of the product
C triangle, are left to the end.  These are src[size-3]*src[size-2,size-1]
C and src[size-2]*src[size-1].  If size is 4 then it's only these corner
C cases that need to be done.
C
C The unrolled code is the same as mpn_addmul_1(), see that routine for some
C comments.
C
C VAR_COUNTER is the outer loop, running from -(size-4) to -1, inclusive.
C
C VAR_JMP is the computed jump into the unrolled code, stepped by one code
C chunk each outer loop.
C
C K6 doesn't do any branch prediction on indirect jumps, which is good
C actually because it's a different target each time.  The unrolled addmul
C is about 3 cycles/limb faster than a simple loop, so the 6 cycle cost of
C the indirect jump is quickly recovered.
dnl  This value is also implicitly encoded in a shift and add.
375 deflit(CODE_BYTES_PER_LIMB, 15)
dnl  With the unmodified &src[size] and &dst[size] pointers, the
dnl  displacements in the unrolled code fit in a byte for UNROLL_COUNT
dnl  values up to 31.  Above that an offset must be added to them.
382 ifelse(eval(UNROLL_COUNT>31),1,
383 eval((UNROLL_COUNT-31)*4),
394 movl PARAM_SIZE, %ecx
402 ` subl $OFFSET, %ebx')
406 ` subl $
OFFSET, %edi')
414 leal L(unroll_inner_end)-eval(2*CODE_BYTES_PER_LIMB)(%ecx,%edx), %ecx
C The calculated jump mustn't be before the start of the available
C code.  This is the limitation UNROLL_COUNT puts on the src operand
C size, but checked here using the jump address directly.
424 movl_text_address( L(unroll_inner_start), %eax)
C -----------------------------------------------------------------------------
C
C ebx  &src[size], constant
C edx  VAR_COUNTER, limbs, negative
C esi  high limb to store
C edi  dst ptr, high of last addmul
440 movl
-12+OFFSET(%ebx,%edx,4), %ebp C multiplier
441 movl
%edx, VAR_COUNTER
443 movl
-8+OFFSET(%ebx,%edx,4), %eax C first limb of multiplicand
449 movl
%edx, %esi C
high carry
450 movl
%ecx, %edx C jump
452 movl
%eax, %ecx C
low carry
453 leal CODE_BYTES_PER_LIMB
(%edx), %edx
C A branch-free version of this using some xors was found to be a
C touch slower than just a conditional jump, despite the jump
C switching between taken and not taken on every loop.
462 ifelse
(eval
(UNROLL_COUNT
%2),0,
463 jz,jnz) L
(unroll_noswap
)
464 movl
%esi, %eax C
high,low carry other way around
C Must be on an even address here so the low bit of the jump address
C will indicate which way around ecx/esi should start.
C
C An attempt was made at padding here to get the end of the unrolled
C code to come out on a good alignment, to save padding before
C L(corner).  This worked, but turned out to run slower than just an
C ALIGN(2).  The reason for this is not clear, it might be related
C to the different speeds on different UNROLL_COUNTs noted above.
484 L
(unroll_inner_start
):
C 15 code bytes each limb
C ecx/esi swapped on each chunk
496 forloop
(`i
', UNROLL_COUNT, 1, `
497 deflit(`disp_src', eval
(-i
*4 + OFFSET))
498 deflit
(`disp_dst
', eval(disp_src - 4))
500 m4_assert(`disp_src>=-128 && disp_src<128')
501 m4_assert
(`disp_dst
>=-128 && disp_dst
<128')
504 Zdisp( movl, disp_src,(%ebx), %eax)
506 Zdisp( addl, %esi, disp_dst,(%edi))
dnl  this one comes out last
512 Zdisp
( movl
, disp_src
,(%ebx), %eax)
514 Zdisp
( addl
, %ecx, disp_dst
,(%edi))
522 addl
%esi, -4+OFFSET(%edi)
524 movl VAR_COUNTER
, %edx
527 movl
%ecx, m4_empty_if_zero
(OFFSET)(%edi)
531 jnz L
(unroll_outer_top
)
C -----------------------------------------------------------------------------
C -----------------------------------------------------------------------------
C Left shift of dst[1..2*size-2], the bit shifted out becomes dst[2*size-1].
C The loop measures about 6 cycles/iteration, though it looks like it should
587 movl PARAM_SIZE, %ecx
590 subl $1, %ecx C size-1 and clear carry
595 xorl %eax, %eax C ready for adcl
C ebx  src (for later use)
C ecx  counter, decrementing
C edx  size-1 (for later use)
C edi  dst, incrementing
616 movl %eax, 4(%edi) C dst most significant limb
617 movl (%ebx), %eax C src[0]
619 leal 4(%ebx,%edx,4), %ebx C &src[size]
620 subl %edx, %ecx C -(size-1)
C -----------------------------------------------------------------------------
C Now add in the squares on the diagonal, src[0]^2, src[1]^2, ...,
C src[size-1]^2.  dst[0] hasn't yet been set at all yet, and just gets the
C low limb of src[0]^2.
631 movl
%eax, (%edi,%ecx,8) C dst
[0]
C ecx  counter, negative
644 movl
(%ebx,%ecx,4), %eax
649 addl
%esi, 4(%edi,%ecx,8)
650 adcl
%eax, 8(%edi,%ecx,8)
660 addl
%edx, 4(%edi) C dst most significant limb
C -----------------------------------------------------------------------------
C See mpn/x86/README about old gas bugs
674 addl $L(unroll_inner_end)-L(here)-eval(2*CODE_BYTES_PER_LIMB), %ecx