source/libs/gmp/gmp-src/mpn/x86/p6/sqr_basecase.asm

   1 dnl  Intel P6 mpn_sqr_basecase -- square an mpn number.
   2
   3 dnl  Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
   4
   5 dnl  This file is part of the GNU MP Library.
   6 dnl
   7 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
   8 dnl  it under the terms of either:
   9 dnl
  10 dnl    * the GNU Lesser General Public License as published by the Free
  11 dnl      Software Foundation; either version 3 of the License, or (at your
  12 dnl      option) any later version.
  13 dnl
  14 dnl  or
  15 dnl
  16 dnl    * the GNU General Public License as published by the Free Software
  17 dnl      Foundation; either version 2 of the License, or (at your option) any
  18 dnl      later version.
  19 dnl
  20 dnl  or both in parallel, as here.
  21 dnl
  22 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
  23 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  24 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  25 dnl  for more details.
  26 dnl
  27 dnl  You should have received copies of the GNU General Public License and the
  28 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
  29 dnl  see https://www.gnu.org/licenses/.
  30
  31 include(`../config.m4')
  32
  33
  34 C P6: approx 4.0 cycles per cross product, or 7.75 cycles per triangular
  35 C     product (measured on the speed difference between 20 and 40 limbs,
  36 C     which is the Karatsuba recursing range).
  37
  38
  39 dnl  These are the same as in mpn/x86/k6/sqr_basecase.asm, see that file for
  40 dnl  a description.  The only difference here is that UNROLL_COUNT can go up
  41 dnl  to 64 (not 63) making SQR_TOOM2_THRESHOLD_MAX 67.
  42
  43 deflit(SQR_TOOM2_THRESHOLD_MAX, 67)
  44 C An override, when defined, sets SQR_TOOM2_THRESHOLD before the lookup below.
  45 ifdef(`SQR_TOOM2_THRESHOLD_OVERRIDE',
  46 `define(`SQR_TOOM2_THRESHOLD',SQR_TOOM2_THRESHOLD_OVERRIDE)')
  47 C NOTE(review): m4_config_gmp_mparam presumably supplies the configured value
  48 m4_config_gmp_mparam(`SQR_TOOM2_THRESHOLD')
  49 deflit(UNROLL_COUNT, eval(SQR_TOOM2_THRESHOLD-3))
  50
  51
  52 C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
  53 C
  54 C The algorithm is basically the same as mpn/generic/sqr_basecase.c, but a
  55 C lot of function call overheads are avoided, especially when the given size
  56 C is small.
  57 C
  58 C The code size might look a bit excessive, but not all of it is executed so
  59 C it won't all get into the code cache.  The 1x1, 2x2 and 3x3 special cases
  60 C clearly apply only to those sizes; mid sizes like 10x10 only need part of
  61 C the unrolled addmul; and big sizes like 40x40 that do use the full
  62 C unrolling will at least be making good use of it, because 40x40 will take
  63 C something like 7000 cycles.
  64
  65 defframe(PARAM_SIZE,12)
  66 defframe(PARAM_SRC, 8)
  67 defframe(PARAM_DST, 4)
  68 C Entry: dispatch on size.  Sizes 1, 2 and 3 each get straight-line code.
  69         TEXT
  70         ALIGN(32)
  71 PROLOGUE(mpn_sqr_basecase)
  72 deflit(`FRAME',0)
  73
  74         movl    PARAM_SIZE, %edx        C edx = size
  75
  76         movl    PARAM_SRC, %eax         C eax = src
  77
  78         cmpl    $2, %edx                C these flags feed both je and ja below
  79         movl    PARAM_DST, %ecx         C ecx = dst
  80         je      L(two_limbs)            C size == 2
  81
  82         movl    (%eax), %eax            C eax = src[0], movl keeps the cmpl flags
  83         ja      L(three_or_more)        C size > 2
  84
  85
  86 C -----------------------------------------------------------------------------
  87 C one limb only, reached by falling through when size < 2
  88         C eax   src limb
  89         C ebx
  90         C ecx   dst
  91         C edx
  92         C NOTE(review): size 0 would also reach here - presumably size >= 1, confirm
  93         mull    %eax                    C edx:eax = src[0]^2
  94
  95         movl    %eax, (%ecx)            C dst[0]
  96         movl    %edx, 4(%ecx)           C dst[1]
  97
  98         ret
  99
 100
 101 C -----------------------------------------------------------------------------
 102 L(two_limbs):
 103         C eax   src
 104         C ebx
 105         C ecx   dst
 106         C edx
 107 C Two limbs: both squares are formed, then src[0]*src[1] is added in twice.
 108 defframe(SAVE_ESI, -4)
 109 defframe(SAVE_EBX, -8)
 110 defframe(SAVE_EDI, -12)
 111 defframe(SAVE_EBP, -16)
 112 deflit(`STACK_SPACE',16)
 113
 114         subl    $STACK_SPACE, %esp      C room for the four saved registers
 115 deflit(`FRAME',STACK_SPACE)
 116
 117         movl    %esi, SAVE_ESI
 118         movl    %eax, %esi              C esi = src
 119         movl    (%eax), %eax            C src[0]
 120
 121         mull    %eax            C src[0]^2
 122
 123         movl    %eax, (%ecx)    C dst[0]
 124         movl    4(%esi), %eax           C src[1]
 125
 126         movl    %ebx, SAVE_EBX
 127         movl    %edx, %ebx      C dst[1]
 128
 129         mull    %eax            C src[1]^2
 130
 131         movl    %edi, SAVE_EDI
 132         movl    %eax, %edi      C dst[2]
 133         movl    (%esi), %eax            C src[0]
 134
 135         movl    %ebp, SAVE_EBP
 136         movl    %edx, %ebp      C dst[3]
 137
 138         mull    4(%esi)         C src[0]*src[1]
 139
 140         addl    %eax, %ebx              C add the cross product, first time
 141         movl    SAVE_ESI, %esi          C register restores use movl, carry chain is kept
 142
 143         adcl    %edx, %edi
 144
 145         adcl    $0, %ebp
 146         addl    %ebx, %eax              C add it a second time, eax = final dst[1]
 147         movl    SAVE_EBX, %ebx
 148
 149         adcl    %edi, %edx              C edx = final dst[2]
 150         movl    SAVE_EDI, %edi
 151
 152         adcl    $0, %ebp                C ebp = final dst[3]
 153
 154         movl    %eax, 4(%ecx)
 155
 156         movl    %ebp, 12(%ecx)
 157         movl    SAVE_EBP, %ebp
 158
 159         movl    %edx, 8(%ecx)
 160         addl    $FRAME, %esp            C FRAME is STACK_SPACE here
 161
 162         ret
 163
 164
 165 C -----------------------------------------------------------------------------
 166 L(three_or_more):
 167         C eax   src low limb
 168         C ebx
 169         C ecx   dst
 170         C edx   size
 171 deflit(`FRAME',0)
 172
 173         pushl   %esi    defframe_pushl(`SAVE_ESI')
 174         cmpl    $4, %edx                C size >= 4 takes the general path
 175
 176         movl    PARAM_SRC, %esi         C esi = src, cmpl flags untouched
 177         jae     L(four_or_more)
 178
 179
 180 C -----------------------------------------------------------------------------
 181 C three limbs: squares are stored to dst, cross products doubled and added in
 182
 183         C eax   src low limb
 184         C ebx
 185         C ecx   dst
 186         C edx
 187         C esi   src
 188         C edi
 189         C ebp
 190
 191         pushl   %ebp    defframe_pushl(`SAVE_EBP')
 192         pushl   %edi    defframe_pushl(`SAVE_EDI')
 193
 194         mull    %eax            C src[0] ^ 2
 195
 196         movl    %eax, (%ecx)            C dst[0]
 197         movl    %edx, 4(%ecx)           C dst[1]
 198
 199         movl    4(%esi), %eax           C src[1]
 200         xorl    %ebp, %ebp
 201
 202         mull    %eax            C src[1] ^ 2
 203
 204         movl    %eax, 8(%ecx)           C dst[2]
 205         movl    %edx, 12(%ecx)          C dst[3]
 206         movl    8(%esi), %eax           C src[2]
 207
 208         pushl   %ebx    defframe_pushl(`SAVE_EBX')
 209
 210         mull    %eax            C src[2] ^ 2
 211
 212         movl    %eax, 16(%ecx)          C dst[4]
 213         movl    %edx, 20(%ecx)          C dst[5]
 214
 215         movl    (%esi), %eax
 216
 217         mull    4(%esi)         C src[0] * src[1]
 218
 219         movl    %eax, %ebx              C ebx = low limb, destined for dst[1]
 220         movl    %edx, %edi              C edi = high limb, destined for dst[2]
 221
 222         movl    (%esi), %eax
 223
 224         mull    8(%esi)         C src[0] * src[2]
 225
 226         addl    %eax, %edi
 227         movl    %edx, %ebp
 228
 229         adcl    $0, %ebp                C ebp = high limb plus carry
 230         movl    4(%esi), %eax
 231
 232         mull    8(%esi)         C src[1] * src[2]
 233
 234         xorl    %esi, %esi              C src pointer is finished with, esi = 0
 235         addl    %eax, %ebp
 236
 237         C eax
 238         C ebx   dst[1]
 239         C ecx   dst
 240         C edx   dst[4]
 241         C esi   zero, will be dst[5]
 242         C edi   dst[2]
 243         C ebp   dst[3]
 244
 245         adcl    $0, %edx                C carry from the addl into ebp
 246         addl    %ebx, %ebx              C double the cross products, low to high
 247
 248         adcl    %edi, %edi
 249
 250         adcl    %ebp, %ebp
 251
 252         adcl    %edx, %edx
 253         movl    4(%ecx), %eax           C dst[1], movl keeps the carry for the adcl
 254
 255         adcl    $0, %esi                C esi = bit shifted out of the doubling
 256         addl    %ebx, %eax              C add doubled products onto the stored squares
 257
 258         movl    %eax, 4(%ecx)
 259         movl    8(%ecx), %eax
 260
 261         adcl    %edi, %eax
 262         movl    12(%ecx), %ebx
 263
 264         adcl    %ebp, %ebx
 265         movl    16(%ecx), %edi
 266
 267         movl    %eax, 8(%ecx)
 268         movl    SAVE_EBP, %ebp
 269
 270         movl    %ebx, 12(%ecx)
 271         movl    SAVE_EBX, %ebx
 272
 273         adcl    %edx, %edi
 274         movl    20(%ecx), %eax
 275
 276         movl    %edi, 16(%ecx)
 277         movl    SAVE_EDI, %edi
 278
 279         adcl    %esi, %eax      C no carry out of this
 280         movl    SAVE_ESI, %esi
 281
 282         movl    %eax, 20(%ecx)
 283         addl    $FRAME, %esp            C FRAME presumably counts the four pushes - confirm
 284
 285         ret
 286
 287
 288
 289 C -----------------------------------------------------------------------------
 290 defframe(VAR_COUNTER,-20)
 291 defframe(VAR_JMP,    -24)
 292 deflit(`STACK_SPACE',24)
 293
 294 L(four_or_more):
 295         C eax   src low limb
 296         C ebx
 297         C ecx
 298         C edx   size
 299         C esi   src
 300         C edi
 301         C ebp
 302 deflit(`FRAME',4)  dnl  %esi already pushed
 303
 304 C First multiply src[0]*src[1..size-1] and store at dst[1..size].
 305
 306         subl    $STACK_SPACE-FRAME, %esp        C the rest of the 24 byte frame
 307 deflit(`FRAME',STACK_SPACE)
 308         movl    $1, %ecx
 309
 310         movl    %edi, SAVE_EDI
 311         movl    PARAM_DST, %edi
 312
 313         movl    %ebx, SAVE_EBX
 314         subl    %edx, %ecx              C -(size-1)
 315
 316         movl    %ebp, SAVE_EBP
 317         movl    $0, %ebx                C initial carry
 318
 319         leal    (%esi,%edx,4), %esi     C &src[size]
 320         movl    %eax, %ebp              C multiplier
 321
 322         leal    -4(%edi,%edx,4), %edi   C &dst[size-1]
 323
 324
 325 C This loop runs at just over 6 c/l.
 326
 327 L(mul_1):
 328         C eax   scratch
 329         C ebx   carry
 330         C ecx   counter, limbs, negative, -(size-1) to -1
 331         C edx   scratch
 332         C esi   &src[size]
 333         C edi   &dst[size-1]
 334         C ebp   multiplier
 335
 336         movl    %ebp, %eax
 337
 338         mull    (%esi,%ecx,4)           C src[0] * src[size+ecx]
 339
 340         addl    %ebx, %eax              C add carry from the previous limb
 341         movl    $0, %ebx                C movl not xorl, the addl carry must survive
 342
 343         adcl    %edx, %ebx              C new carry = high limb + carry
 344         movl    %eax, 4(%edi,%ecx,4)    C dst[size+ecx]
 345
 346         incl    %ecx
 347         jnz     L(mul_1)
 348
 349
 350         movl    %ebx, 4(%edi)           C final carry to dst[size]
 351
 352
 353 C Addmul src[n]*src[n+1..size-1] at dst[2*n-1...], for each n=1..size-2.
 354 C
 355 C The last two addmuls, which are the bottom right corner of the product
 356 C triangle, are left to the end.  These are src[size-3]*src[size-2,size-1]
 357 C and src[size-2]*src[size-1].  If size is 4 then it's only these corner
 358 C cases that need to be done.
 359 C
 360 C The unrolled code is the same as mpn_addmul_1(), see that routine for some
 361 C comments.
 362 C
 363 C VAR_COUNTER is the outer loop, running from -(size-4) to -1, inclusive.
 364 C
 365 C VAR_JMP is the computed jump into the unrolled code, stepped by one code
 366 C chunk each outer loop.
 367
 368 dnl  This is also hard-coded in the address calculation below.
 369 deflit(CODE_BYTES_PER_LIMB, 15)
 370
 371 dnl  With &src[size] and &dst[size-1] pointers, the displacements in the
 372 dnl  unrolled code fit in a byte for UNROLL_COUNT values up to 32, but above
 373 dnl  that an offset must be added to them.
 374 deflit(OFFSET,
 375 ifelse(eval(UNROLL_COUNT>32),1,
 376 eval((UNROLL_COUNT-32)*4),
 377 0))
 378
 379         C eax
 380         C ebx   carry
 381         C ecx
 382         C edx
 383         C esi   &src[size]
 384         C edi   &dst[size-1]
 385         C ebp
 386
 387         movl    PARAM_SIZE, %ecx
 388
 389         subl    $4, %ecx                C size-4, the number of outer loop passes
 390         jz      L(corner)               C size 4 needs only the corner products
 391
 392         movl    %ecx, %edx              C edx = size-4
 393         negl    %ecx
 394
 395         shll    $4, %ecx                C -16*(size-4), adding edx gives -15*(size-4)
 396 ifelse(OFFSET,0,,`subl  $OFFSET, %esi')
 397
 398 ifdef(`PIC',`
 399         call    L(pic_calc)
 400 L(here):
 401 ',`
 402         leal    L(unroll_inner_end)-eval(2*CODE_BYTES_PER_LIMB)(%ecx,%edx), %ecx
 403 ')
 404         negl    %edx                    C -(size-4), the initial VAR_COUNTER
 405
 406 ifelse(OFFSET,0,,`subl  $OFFSET, %edi')
 407
 408         C The calculated jump mustn't be before the start of the available
 409         C code.  This is the limit that UNROLL_COUNT puts on the src operand
 410         C size, but checked here using the jump address directly.
 411
 412         ASSERT(ae,
 413         `movl_text_address( L(unroll_inner_start), %eax)
 414         cmpl    %eax, %ecx')
 415
 416
 417 C -----------------------------------------------------------------------------
 418         ALIGN(16)
 419 L(unroll_outer_top):
 420         C eax
 421         C ebx   high limb to store
 422         C ecx   VAR_JMP
 423         C edx   VAR_COUNTER, limbs, negative
 424         C esi   &src[size], constant
 425         C edi   dst ptr, second highest limb of last addmul
 426         C ebp
 427
 428         movl    -12+OFFSET(%esi,%edx,4), %ebp   C multiplier
 429         movl    %edx, VAR_COUNTER
 430
 431         movl    -8+OFFSET(%esi,%edx,4), %eax    C first limb of multiplicand
 432
 433         mull    %ebp
 434
 435 define(cmovX,`ifelse(eval(UNROLL_COUNT%2),1,`cmovz($@)',`cmovnz($@)')')
 436
 437         testb   $1, %cl         C low bit of the jump address, used by cmovX below
 438
 439         movl    %edx, %ebx      C high carry
 440         leal    4(%edi), %edi   C step dst by one limb, leal keeps the testb flags
 441
 442         movl    %ecx, %edx      C jump
 443
 444         movl    %eax, %ecx      C low carry
 445         leal    CODE_BYTES_PER_LIMB(%edx), %edx         C enter one chunk later each pass
 446
 447         cmovX(  %ebx, %ecx)     C high carry reverse
 448         cmovX(  %eax, %ebx)     C low carry reverse
 449         movl    %edx, VAR_JMP
 450         jmp     *%edx
 451
 452
 453         C Must be on an even address here so the low bit of the jump address
 454         C will indicate which way around ecx/ebx should start.
 455
 456         ALIGN(2)
 457
 458 L(unroll_inner_start):
 459         C eax   scratch
 460         C ebx   carry high
 461         C ecx   carry low
 462         C edx   scratch
 463         C esi   src pointer
 464         C edi   dst pointer
 465         C ebp   multiplier
 466         C
 467         C 15 code bytes each limb
 468         C ecx/ebx reversed on each chunk
 469
 470 forloop(`i', UNROLL_COUNT, 1, `
 471         deflit(`disp_src', eval(-i*4 + OFFSET))
 472         deflit(`disp_dst', eval(disp_src))
 473
 474         m4_assert(`disp_src>=-128 && disp_src<128')
 475         m4_assert(`disp_dst>=-128 && disp_dst<128')
 476
 477 ifelse(eval(i%2),0,`
 478 Zdisp(  movl,   disp_src,(%esi), %eax)
 479         mull    %ebp
 480 Zdisp(  addl,   %ebx, disp_dst,(%edi))
 481         adcl    %eax, %ecx
 482         movl    %edx, %ebx
 483         adcl    $0, %ebx
 484 ',`
 485         dnl  this one comes out last
 486 Zdisp(  movl,   disp_src,(%esi), %eax)
 487         mull    %ebp
 488 Zdisp(  addl,   %ecx, disp_dst,(%edi))
 489         adcl    %eax, %ebx
 490         movl    %edx, %ecx
 491         adcl    $0, %ecx
 492 ')
 493 ')
 494 L(unroll_inner_end):
 495
 496         addl    %ebx, m4_empty_if_zero(OFFSET)(%edi)    C last low carry into dst
 497
 498         movl    VAR_COUNTER, %edx
 499         adcl    $0, %ecx                C fold the carry into the high limb
 500
 501         movl    %ecx, m4_empty_if_zero(OFFSET+4)(%edi)  C store high limb of this addmul
 502         movl    VAR_JMP, %ecx           C jump address for the next pass
 503
 504         incl    %edx
 505         jnz     L(unroll_outer_top)
 506
 507
 508 ifelse(OFFSET,0,,`
 509         addl    $OFFSET, %esi
 510         addl    $OFFSET, %edi
 511 ')
 512
 513
 514 C -----------------------------------------------------------------------------
 515         ALIGN(16)
 516 L(corner):
 517         C eax
 518         C ebx
 519         C ecx
 520         C edx
 521         C esi   &src[size]
 522         C edi   &dst[2*size-5]
 523         C ebp
 524
 525         movl    -12(%esi), %eax         C src[size-3]
 526
 527         mull    -8(%esi)                C src[size-3] * src[size-2]
 528
 529         addl    %eax, (%edi)            C low limb into dst[2*size-5]
 530         movl    -12(%esi), %eax         C src[size-3] again
 531         movl    $0, %ebx                C movl keeps the carry from the addl
 532
 533         adcl    %edx, %ebx              C ebx = high limb + carry
 534
 535         mull    -4(%esi)                C src[size-3] * src[size-1]
 536
 537         addl    %eax, %ebx
 538         movl    -8(%esi), %eax          C src[size-2]
 539
 540         adcl    $0, %edx
 541
 542         addl    %ebx, 4(%edi)           C into dst[2*size-4]
 543         movl    $0, %ebx
 544
 545         adcl    %edx, %ebx              C ebx = carry limb for the last product
 546
 547         mull    -4(%esi)                C src[size-2] * src[size-1]
 548
 549         movl    PARAM_SIZE, %ecx
 550         addl    %ebx, %eax
 551
 552         adcl    $0, %edx
 553
 554         movl    %eax, 8(%edi)           C dst[2*size-3]
 555
 556         movl    %edx, 12(%edi)          C dst[2*size-2]
 557         movl    PARAM_DST, %edi
 558
 559
 560 C Left shift of dst[1..2*size-2], the bit shifted out becomes dst[2*size-1].
 561
 562         subl    $1, %ecx                C size-1
 563         xorl    %eax, %eax              C ready for final adcl, and clear carry
 564
 565         movl    %ecx, %edx
 566         movl    PARAM_SRC, %esi
 567
 568
 569 L(lshift):
 570         C eax
 571         C ebx
 572         C ecx   counter, size-1 to 1
 573         C edx   size-1 (for later use)
 574         C esi   src (for later use)
 575         C edi   dst, incrementing
 576         C ebp
 577
 578         rcll    4(%edi)                 C two limbs per pass, carry threads through
 579         rcll    8(%edi)
 580
 581         leal    8(%edi), %edi           C leal does not touch the carry
 582         decl    %ecx                    C decl leaves the carry flag for the next rcll
 583         jnz     L(lshift)
 584
 585
 586         adcl    %eax, %eax              C eax was zero, so this is the bit shifted out
 587
 588         movl    %eax, 4(%edi)           C dst most significant limb
 589         movl    (%esi), %eax            C src[0]
 590
 591         leal    4(%esi,%edx,4), %esi    C &src[size]
 592         subl    %edx, %ecx              C -(size-1)
 593
 594
 595 C Now add in the squares on the diagonal, src[0]^2, src[1]^2, ...,
 596 C src[size-1]^2.  dst[0] hasn't been set at all yet, and just gets the
 597 C low limb of src[0]^2.
 598
 599
 600         mull    %eax                    C src[0]^2, edx becomes the first carry
 601
 602         movl    %eax, (%edi,%ecx,8)     C dst[0]
 603
 604
 605 L(diag):
 606         C eax   scratch
 607         C ebx   scratch
 608         C ecx   counter, negative
 609         C edx   carry
 610         C esi   &src[size]
 611         C edi   dst[2*size-2]
 612         C ebp
 613
 614         movl    (%esi,%ecx,4), %eax     C src[size+ecx]
 615         movl    %edx, %ebx              C high limb of the previous square
 616
 617         mull    %eax
 618
 619         addl    %ebx, 4(%edi,%ecx,8)    C previous high limb into the odd dst limb
 620         adcl    %eax, 8(%edi,%ecx,8)    C low limb of this square into the even dst limb
 621         adcl    $0, %edx                C edx = carry for the next pass
 622
 623         incl    %ecx
 624         jnz     L(diag)
 625
 626
 627         movl    SAVE_ESI, %esi
 628         movl    SAVE_EBX, %ebx
 629
 630         addl    %edx, 4(%edi)           C dst most significant limb
 631
 632         movl    SAVE_EDI, %edi
 633         movl    SAVE_EBP, %ebp
 634         addl    $FRAME, %esp            C FRAME is the 24 byte STACK_SPACE here
 635         ret
 636
 637
 638
 639 C -----------------------------------------------------------------------------
 640 ifdef(`PIC',`
 641 L(pic_calc):
 642         addl    (%esp), %ecx            C add the return address, which is the here label
 643         addl    $L(unroll_inner_end)-L(here)-eval(2*CODE_BYTES_PER_LIMB), %ecx
 644         addl    %edx, %ecx              C edx is still size-4, same sum as the non-PIC leal
 645         ret_internal
 646 ')
 647
 648
 649 EPILOGUE()