[luatex.git] / source / libs / gmp / gmp-src / mpn / x86 / k7 / sqr_basecase.asm

dnl  AMD K7 mpn_sqr_basecase -- square an mpn number.

dnl  Copyright 1999-2002 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C K7: approx 2.3 cycles/crossproduct, or 4.55 cycles/triangular product
C     (measured on the speed difference between 25 and 50 limbs, which is
C     roughly the Karatsuba recursing range).

dnl  These are the same as mpn/x86/k6/sqr_basecase.asm; see that code for
dnl  some comments.

deflit(SQR_TOOM2_THRESHOLD_MAX, 66)

ifdef(`SQR_TOOM2_THRESHOLD_OVERRIDE',
`define(`SQR_TOOM2_THRESHOLD',SQR_TOOM2_THRESHOLD_OVERRIDE)')

m4_config_gmp_mparam(`SQR_TOOM2_THRESHOLD')
deflit(UNROLL_COUNT, eval(SQR_TOOM2_THRESHOLD-3))

C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
C
C With a SQR_TOOM2_THRESHOLD around 50 this code is about 1500 bytes, which
C is quite a bit, but is considered good value since squares big enough to
C use most of the code will spend quite a few cycles in it.
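C
C The overall scheme, as a rough C sketch (illustrative only, written in
C terms of the generic mpn entrypoints; the code below inlines and unrolls
C all of these steps, and dst[0] is left untouched until the diagonal):
C
C	/* cross products src[i]*src[j], i<j, summed at dst[1..2*size-2] */
C	dst[size] = mpn_mul_1 (dst+1, src+1, size-1, src[0]);
C	for (i = 1; i < size-1; i++)
C	  dst[size+i] = mpn_addmul_1 (dst+2*i+1, src+i+1, size-1-i, src[i]);
C
C	/* double them, then add the squares src[i]^2 on the diagonal */
C	dst[2*size-1] = mpn_lshift (dst+1, dst+1, 2*size-2, 1);
C	/* dst[2*i..2*i+1] += src[i]^2, carrying upwards, i = 0..size-1 */
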
defframe(PARAM_SIZE,12)
defframe(PARAM_SRC, 8)
defframe(PARAM_DST, 4)

	TEXT
	ALIGN(32)
PROLOGUE(mpn_sqr_basecase)
deflit(`FRAME',0)

	movl	PARAM_SIZE, %ecx
	movl	PARAM_SRC, %eax
	cmpl	$2, %ecx

	movl	PARAM_DST, %edx
	je	L(two_limbs)
	ja	L(three_or_more)

C------------------------------------------------------------------------------
C one limb only
	C eax	src
	C ecx	size
	C edx	dst

	movl	(%eax), %eax
	movl	%edx, %ecx

	mull	%eax

	movl	%edx, 4(%ecx)
	movl	%eax, (%ecx)
	ret

C------------------------------------------------------------------------------

C Using the read/modify/write "add"s seems to be faster than saving and
C restoring registers.  Perhaps the loads for the first set hide under the
C mul latency and the second set gets store-to-load forwarding.

	ALIGN(16)
L(two_limbs):
	C eax	src
	C ebx
	C ecx	size
	C edx	dst
deflit(`FRAME',0)

	pushl	%ebx		FRAME_pushl()
	movl	%eax, %ebx	C src
	movl	(%eax), %eax

	movl	%edx, %ecx	C dst

	mull	%eax		C src[0]^2

	movl	%eax, (%ecx)	C dst[0]
	movl	4(%ebx), %eax

	movl	%edx, 4(%ecx)	C dst[1]

	mull	%eax		C src[1]^2

	movl	%eax, 8(%ecx)	C dst[2]
	movl	(%ebx), %eax

	movl	%edx, 12(%ecx)	C dst[3]

	mull	4(%ebx)		C src[0]*src[1]

	popl	%ebx
	addl	%eax, 4(%ecx)

	adcl	%edx, 8(%ecx)
	adcl	$0, 12(%ecx)
	ASSERT(nc)
	addl	%eax, 4(%ecx)

	adcl	%edx, 8(%ecx)
	adcl	$0, 12(%ecx)
	ASSERT(nc)

	ret

C------------------------------------------------------------------------------
defframe(SAVE_EBX, -4)
defframe(SAVE_ESI, -8)
defframe(SAVE_EDI, -12)
defframe(SAVE_EBP, -16)
deflit(STACK_SPACE, 16)

L(three_or_more):
	subl	$STACK_SPACE, %esp
	cmpl	$4, %ecx
	jae	L(four_or_more)
deflit(`FRAME',STACK_SPACE)

C------------------------------------------------------------------------------
C Three limbs

C Writing out the loads and stores separately at the end of this code comes
C out about 10 cycles faster than using adcls to memory.

	C eax	src
	C ecx	size
	C edx	dst

	movl	%ebx, SAVE_EBX
	movl	%eax, %ebx	C src
	movl	(%eax), %eax

	movl	%edx, %ecx	C dst
	movl	%esi, SAVE_ESI
	movl	%edi, SAVE_EDI

	mull	%eax		C src[0] ^ 2

	movl	%eax, (%ecx)
	movl	4(%ebx), %eax
	movl	%edx, 4(%ecx)

	mull	%eax		C src[1] ^ 2

	movl	%eax, 8(%ecx)
	movl	8(%ebx), %eax
	movl	%edx, 12(%ecx)

	mull	%eax		C src[2] ^ 2

	movl	%eax, 16(%ecx)
	movl	(%ebx), %eax
	movl	%edx, 20(%ecx)

	mull	4(%ebx)		C src[0] * src[1]

	movl	%eax, %esi
	movl	(%ebx), %eax
	movl	%edx, %edi

	mull	8(%ebx)		C src[0] * src[2]

	addl	%eax, %edi
	movl	%ebp, SAVE_EBP
	movl	$0, %ebp

	movl	4(%ebx), %eax
	adcl	%edx, %ebp

	mull	8(%ebx)		C src[1] * src[2]

	xorl	%ebx, %ebx
	addl	%eax, %ebp

	adcl	$0, %edx

	C eax
	C ebx	zero, will be dst[5]
	C ecx	dst
	C edx	dst[4]
	C esi	dst[1]
	C edi	dst[2]
	C ebp	dst[3]

	adcl	$0, %edx
	addl	%esi, %esi

	adcl	%edi, %edi
	movl	4(%ecx), %eax

	adcl	%ebp, %ebp

	adcl	%edx, %edx

	adcl	$0, %ebx
	addl	%eax, %esi
	movl	8(%ecx), %eax

	adcl	%eax, %edi
	movl	12(%ecx), %eax
	movl	%esi, 4(%ecx)

	adcl	%eax, %ebp
	movl	16(%ecx), %eax
	movl	%edi, 8(%ecx)

	movl	SAVE_ESI, %esi
	movl	SAVE_EDI, %edi

	adcl	%eax, %edx
	movl	20(%ecx), %eax
	movl	%ebp, 12(%ecx)

	adcl	%ebx, %eax
	ASSERT(nc)
	movl	SAVE_EBX, %ebx
	movl	SAVE_EBP, %ebp

	movl	%edx, 16(%ecx)
	movl	%eax, 20(%ecx)
	addl	$FRAME, %esp

	ret

C------------------------------------------------------------------------------
L(four_or_more):

C First multiply src[0]*src[1..size-1] and store at dst[1..size].
C Further products are added in rather than stored.

	C eax	src
	C ebx
	C ecx	size
	C edx	dst
	C esi
	C edi
	C ebp

defframe(`VAR_COUNTER',-20)
defframe(`VAR_JMP',    -24)
deflit(EXTRA_STACK_SPACE, 8)

	movl	%ebx, SAVE_EBX
	movl	%edi, SAVE_EDI
	leal	(%edx,%ecx,4), %edi	C &dst[size]

	movl	%esi, SAVE_ESI
	movl	%ebp, SAVE_EBP
	leal	(%eax,%ecx,4), %esi	C &src[size]

	movl	(%eax), %ebp	C multiplier
	movl	$0, %ebx
	decl	%ecx

	negl	%ecx
	subl	$EXTRA_STACK_SPACE, %esp
FRAME_subl_esp(EXTRA_STACK_SPACE)

L(mul_1):
	C eax	scratch
	C ebx	carry
	C ecx	counter
	C edx	scratch
	C esi	&src[size]
	C edi	&dst[size]
	C ebp	multiplier

	movl	(%esi,%ecx,4), %eax

	mull	%ebp

	addl	%ebx, %eax
	movl	%eax, (%edi,%ecx,4)
	movl	$0, %ebx

	adcl	%edx, %ebx
	incl	%ecx
	jnz	L(mul_1)

C Add products src[n]*src[n+1..size-1] at dst[2*n+1...], for each n=1..size-2.

C The last two products, which are the bottom right corner of the product
C triangle, are left to the end.  These are src[size-3]*src[size-2,size-1]
C and src[size-2]*src[size-1].  If size is 4 then it's only these corner
C cases that need to be done.
C The unrolled code is the same as in mpn_addmul_1; see that routine for
C some comments.

C VAR_COUNTER is the outer loop, running from -size+4 to -1, inclusive.

C VAR_JMP is the computed jump into the unrolled code, stepped by one code
C chunk each outer loop.
C K7 does branch prediction on indirect jumps, which is bad since it's a
C different target each time.  There seems to be no way to avoid this.

dnl  This value is also hard-coded in some shifts and adds.
deflit(CODE_BYTES_PER_LIMB, 17)
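
dnl  Note the multiply by CODE_BYTES_PER_LIMB=17 below is done without a
dnl  mul instruction: one copy of the count is shifted left by 4 (x16) and
dnl  the other copy comes back in as the index register of the leal
dnl  (16x + 1x = 17x).
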
dnl  With the unmodified &src[size] and &dst[size] pointers, the
dnl  displacements in the unrolled code fit in a byte for UNROLL_COUNT
dnl  values up to 31, but above that an offset must be added to them.
deflit(OFFSET,
ifelse(eval(UNROLL_COUNT>31),1,
eval((UNROLL_COUNT-31)*4),
0))

dnl  Because the last chunk of code is generated differently, a label placed
dnl  at the end doesn't work.  Instead calculate the implied end using the
dnl  start and how many chunks of code there are.

deflit(UNROLL_INNER_END,
`L(unroll_inner_start)+eval(UNROLL_COUNT*CODE_BYTES_PER_LIMB)')

	C eax
	C ebx	carry
	C ecx
	C edx
	C esi	&src[size]
	C edi	&dst[size]
	C ebp

	movl	PARAM_SIZE, %ecx
	movl	%ebx, (%edi)

	subl	$4, %ecx
	jz	L(corner)

	negl	%ecx
ifelse(OFFSET,0,,`subl	$OFFSET, %edi')
ifelse(OFFSET,0,,`subl	$OFFSET, %esi')

	movl	%ecx, %edx
	shll	$4, %ecx

ifdef(`PIC',`
	call	L(pic_calc)
L(here):
',`
	leal	UNROLL_INNER_END-eval(2*CODE_BYTES_PER_LIMB)(%ecx,%edx), %ecx
')

	C The calculated jump mustn't land before the start of the available
	C code.  This is the limit UNROLL_COUNT puts on the src operand
	C size, but checked here directly using the jump address.
	ASSERT(ae,
	`movl_text_address(L(unroll_inner_start), %eax)
	cmpl	%eax, %ecx')

C------------------------------------------------------------------------------
	ALIGN(16)
L(unroll_outer_top):
	C eax
	C ebx	high limb to store
	C ecx	VAR_JMP
	C edx	VAR_COUNTER, limbs, negative
	C esi	&src[size], constant
	C edi	dst ptr, high of last addmul
	C ebp

	movl	-12+OFFSET(%esi,%edx,4), %ebp	C next multiplier
	movl	-8+OFFSET(%esi,%edx,4), %eax	C first of multiplicand

	movl	%edx, VAR_COUNTER

	mull	%ebp

define(cmovX,`ifelse(eval(UNROLL_COUNT%2),0,`cmovz($@)',`cmovnz($@)')')

	testb	$1, %cl
	movl	%edx, %ebx	C high carry
	movl	%ecx, %edx	C jump

	movl	%eax, %ecx	C low carry
	cmovX(	%ebx, %ecx)	C high carry reverse
	cmovX(	%eax, %ebx)	C low carry reverse
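
	C The unrolled chunks alternate which of ecx/ebx holds the low
	C carry; the testb above reads the entry parity from the low bit of
	C the jump address, and these cmovs arrange the just-computed carry
	C pair in whichever order that entry point expects.
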
	leal	CODE_BYTES_PER_LIMB(%edx), %eax
	xorl	%edx, %edx
	leal	4(%edi), %edi

	movl	%eax, VAR_JMP

	jmp	*%eax

ifdef(`PIC',`
L(pic_calc):
	addl	(%esp), %ecx
	addl	$UNROLL_INNER_END-eval(2*CODE_BYTES_PER_LIMB)-L(here), %ecx
	addl	%edx, %ecx
	ret_internal
')

	C Must be an even address to preserve the significance of the low
	C bit of the jump address indicating which way around ecx/ebx should
	C start.
	ALIGN(2)

L(unroll_inner_start):
	C eax	next limb
	C ebx	carry high
	C ecx	carry low
	C edx	scratch
	C esi	src
	C edi	dst
	C ebp	multiplier

forloop(`i', UNROLL_COUNT, 1, `
	deflit(`disp_src', eval(-i*4 + OFFSET))
	deflit(`disp_dst', eval(disp_src - 4))

	m4_assert(`disp_src>=-128 && disp_src<128')
	m4_assert(`disp_dst>=-128 && disp_dst<128')

ifelse(eval(i%2),0,`
Zdisp(	movl,	disp_src,(%esi), %eax)
	adcl	%edx, %ebx

	mull	%ebp

Zdisp(	addl,	%ecx, disp_dst,(%edi))
	movl	$0, %ecx

	adcl	%eax, %ebx

',`
	dnl  this bit comes out last
Zdisp(	movl,	disp_src,(%esi), %eax)
	adcl	%edx, %ecx

	mull	%ebp

Zdisp(	addl,	%ebx, disp_dst,(%edi))

ifelse(forloop_last,0,
`	movl	$0, %ebx')

	adcl	%eax, %ecx
')
')

	C eax	next limb
	C ebx	carry high
	C ecx	carry low
	C edx	scratch
	C esi	src
	C edi	dst
	C ebp	multiplier

	adcl	$0, %edx
	addl	%ecx, -4+OFFSET(%edi)
	movl	VAR_JMP, %ecx

	adcl	$0, %edx

	movl	%edx, m4_empty_if_zero(OFFSET) (%edi)
	movl	VAR_COUNTER, %edx

	incl	%edx
	jnz	L(unroll_outer_top)

ifelse(OFFSET,0,,`
	addl	$OFFSET, %esi
	addl	$OFFSET, %edi
')

C------------------------------------------------------------------------------
L(corner):
	C esi	&src[size]
	C edi	&dst[2*size-5]

	movl	-12(%esi), %ebp
	movl	-8(%esi), %eax
	movl	%eax, %ecx

	mull	%ebp

	addl	%eax, -4(%edi)
	movl	-4(%esi), %eax

	adcl	$0, %edx
	movl	%edx, %ebx
	movl	%eax, %esi

	mull	%ebp

	addl	%ebx, %eax

	adcl	$0, %edx
	addl	%eax, (%edi)
	movl	%esi, %eax

	adcl	$0, %edx
	movl	%edx, %ebx

	mull	%ecx

	addl	%ebx, %eax
	movl	%eax, 4(%edi)

	adcl	$0, %edx
	movl	%edx, 8(%edi)

C Left shift of dst[1..2*size-2], high bit shifted out becomes dst[2*size-1].
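C
C In C, this phase is roughly (a sketch only, assuming 32-bit limbs):
C
C	uint32_t cy = 0, x;
C	for (i = 1; i <= 2*size-2; i++) {
C	  x = dst[i];
C	  dst[i] = (x << 1) | cy;	/* the rcll pair below handles */
C	  cy = x >> 31;			/* two limbs per iteration     */
C	}
C	dst[2*size-1] = cy;
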
L(lshift_start):
	movl	PARAM_SIZE, %eax
	movl	PARAM_DST, %edi
	xorl	%ecx, %ecx	C clear carry

	leal	(%edi,%eax,8), %edi
	notl	%eax		C -size-1, preserve carry

	leal	2(%eax), %eax	C -(size-1)

L(lshift):
	C eax	counter, negative
	C ebx
	C ecx
	C edx
	C esi
	C edi	dst, pointing just after last limb
	C ebp

	rcll	-4(%edi,%eax,8)
	rcll	(%edi,%eax,8)
	incl	%eax
	jnz	L(lshift)

	setc	%al

	movl	PARAM_SRC, %esi
	movl	%eax, -4(%edi)	C dst most significant limb

	movl	PARAM_SIZE, %ecx

C Now add in the squares on the diagonal, src[0]^2, src[1]^2, ...,
C src[size-1]^2.  dst[0] hasn't been set at all yet, and just gets the
C low limb of src[0]^2.
	movl	(%esi), %eax	C src[0]

	mull	%eax

	leal	(%esi,%ecx,4), %esi	C src points just after last limb
	negl	%ecx

	movl	%eax, (%edi,%ecx,8)	C dst[0]
	incl	%ecx

L(diag):
	C eax	scratch
	C ebx	scratch
	C ecx	counter, negative
	C edx	carry
	C esi	src just after last limb
	C edi	dst just after last limb
	C ebp

	movl	(%esi,%ecx,4), %eax
	movl	%edx, %ebx

	mull	%eax

	addl	%ebx, -4(%edi,%ecx,8)
	adcl	%eax, (%edi,%ecx,8)
	adcl	$0, %edx

	incl	%ecx
	jnz	L(diag)

	movl	SAVE_ESI, %esi
	movl	SAVE_EBX, %ebx

	addl	%edx, -4(%edi)	C dst most significant limb
	movl	SAVE_EDI, %edi

	movl	SAVE_EBP, %ebp
	addl	$FRAME, %esp

	ret

EPILOGUE()