source/libs/gmp/gmp-src/mpn/x86/k7/mul_basecase.asm

   1 dnl  AMD K7 mpn_mul_basecase -- multiply two mpn numbers.
   2
   3 dnl  Copyright 1999-2002 Free Software Foundation, Inc.
   4
   5 dnl  This file is part of the GNU MP Library.
   6 dnl
   7 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
   8 dnl  it under the terms of either:
   9 dnl
  10 dnl    * the GNU Lesser General Public License as published by the Free
  11 dnl      Software Foundation; either version 3 of the License, or (at your
  12 dnl      option) any later version.
  13 dnl
  14 dnl  or
  15 dnl
  16 dnl    * the GNU General Public License as published by the Free Software
  17 dnl      Foundation; either version 2 of the License, or (at your option) any
  18 dnl      later version.
  19 dnl
  20 dnl  or both in parallel, as here.
  21 dnl
  22 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
  23 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  24 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  25 dnl  for more details.
  26 dnl
  27 dnl  You should have received copies of the GNU General Public License and the
  28 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
  29 dnl  see https://www.gnu.org/licenses/.
  30
  31 include(`../config.m4')
  32
  33
  34 C K7: approx 4.42 cycles per cross product at around 20x20 limbs (32
  35 C     limbs/loop unrolling).
  36
  37
  38
  39 dnl  K7 UNROLL_COUNT cycles/product (at around 20x20)
  40 dnl           8           4.67
  41 dnl          16           4.59
  42 dnl          32           4.42
  43 dnl  Maximum possible with the current code is 32.
  44 dnl
  45 dnl  At 32 the typical 13-26 limb sizes from the Karatsuba code will get
  46 dnl  done with a straight run through a block of code, no inner loop.  Using
  47 dnl  32 gives 1k of code, but the K7 has a 64k L1 code cache.
     dnl
     dnl  UNROLL_COUNT is the number of limbs covered by one pass through the
     dnl  unrolled addmul loop at L(unroll_top); that loop body is generated two
     dnl  limbs at a time by a forloop.
  48
  49 deflit(UNROLL_COUNT, 32)
  50
  51
  52 C void mpn_mul_basecase (mp_ptr wp,
  53 C                        mp_srcptr xp, mp_size_t xsize,
  54 C                        mp_srcptr yp, mp_size_t ysize);
  55 C
  56 C Calculate xp,xsize multiplied by yp,ysize, storing the result in
  57 C wp,xsize+ysize.
  58 C
  59 C This routine is essentially the same as mpn/generic/mul_basecase.c, but
  60 C it's faster because it does most of the mpn_addmul_1() startup
  61 C calculations only once.  The saving is 15-25% on typical sizes coming from
  62 C the Karatsuba multiply code.
  63
     C UNROLL_THRESHOLD is the smallest xsize for which L(ysize_more_than_one)
     C branches to the unrolled addmul code at L(unroll); smaller sizes use the
     C simple addmul loop.  The PIC and non-PIC branches below currently select
     C the same value.
  64 ifdef(`PIC',`
  65 deflit(UNROLL_THRESHOLD, 5)
  66 ',`
  67 deflit(UNROLL_THRESHOLD, 5)
  68 ')
  69
     C Incoming arguments, listed in reverse of the C prototype order (wp is
     C the first argument).  The numbers are presumably byte offsets from the
     C stack pointer at entry; defframe is defined outside this file -- TODO
     C confirm.
  70 defframe(PARAM_YSIZE,20)
  71 defframe(PARAM_YP,   16)
  72 defframe(PARAM_XSIZE,12)
  73 defframe(PARAM_XP,   8)
  74 defframe(PARAM_WP,   4)
  75
  76         TEXT
  77         ALIGN(32)
  78 PROLOGUE(mpn_mul_basecase)
  79 deflit(`FRAME',0)
             C Entry point.  Loads xsize, xp and yp[0], then dispatches on
             C xsize: more than two limbs, exactly two limbs, or (falling
             C through) a single limb.
  80
  81         movl    PARAM_XSIZE, %ecx       C ecx = xsize
  82         movl    PARAM_YP, %eax          C eax = yp
  83
  84         movl    PARAM_XP, %edx          C edx = xp
  85         movl    (%eax), %eax    C yp low limb
  86
  87         cmpl    $2, %ecx
  88         ja      L(xsize_more_than_two)  C unsigned xsize > 2
  89         je      L(two_by_something)     C xsize == 2
  90
  91
  92         C one limb by one limb
             C NOTE(review): reached for any xsize below 2 without looking at
             C ysize, so this presumably relies on callers passing
             C xsize >= ysize >= 1 -- confirm against the caller contract.
  93
  94         mull    (%edx)                  C edx:eax = xp[0] * yp[0]
  95
  96         movl    PARAM_WP, %ecx          C ecx = wp (xsize no longer needed)
  97         movl    %eax, (%ecx)            C wp[0] = low limb
  98         movl    %edx, 4(%ecx)           C wp[1] = high limb
  99         ret                             C nothing was pushed on this path
 100
 101
 102 C -----------------------------------------------------------------------------
 103 L(two_by_something):
     C xsize == 2.  On entry eax = yp[0] and edx = xp.  Saves ebx and esi by
     C pushing them, then falls through to the two-by-one code when ysize is 1
     C or branches to L(two_by_two) otherwise.
 104 deflit(`FRAME',0)
 105         decl    PARAM_YSIZE             C ZF set if ysize == 1, tested by the jnz below
 106         pushl   %ebx            defframe_pushl(`SAVE_EBX')
 107         movl    %eax, %ecx      C yp low limb
 108
 109         movl    PARAM_WP, %ebx          C ebx = wp
 110         pushl   %esi            defframe_pushl(`SAVE_ESI')
 111         movl    %edx, %esi      C xp
 112
 113         movl    (%edx), %eax    C xp low limb
 114         jnz     L(two_by_two)           C ysize != 1; pushl and movl left the decl flags alone
 115
 116
 117         C two limbs by one limb
 118
 119         mull    %ecx                    C edx:eax = xp[0] * yp[0]
 120
 121         movl    %eax, (%ebx)            C wp[0]
 122         movl    4(%esi), %eax           C xp[1]
 123         movl    %edx, %esi      C carry
 124
 125         mull    %ecx                    C edx:eax = xp[1] * yp[0]
 126
 127         addl    %eax, %esi              C low half plus carry; CF goes into edx below
 128
 129         movl    %esi, 4(%ebx)           C wp[1]
 130         movl    SAVE_ESI, %esi          C restore esi (movl leaves CF alone)
 131
 132         adcl    $0, %edx                C high half plus CF
 133
 134         movl    %edx, 8(%ebx)           C wp[2]
 135         movl    SAVE_EBX, %ebx          C restore ebx
 136         addl    $FRAME, %esp            C release the slots of the two pushes above
 137
 138         ret
 139
 140
 141
 142 C -----------------------------------------------------------------------------
 143 C Could load yp earlier into another register.
 144
 145         ALIGN(16)
 146 L(two_by_two):
     C Two limbs of xp times two limbs of yp, written out as four multiplies
     C that fill wp[0] to wp[3].  Reached from L(two_by_something) whenever
     C ysize is not 1.
     C NOTE(review): only yp[0] and yp[1] are used, so this presumably relies
     C on ysize <= xsize, which is not checked here -- confirm with callers.
 147         C eax   xp low limb
 148         C ebx   wp
 149         C ecx   yp low limb
 150         C edx
 151         C esi   xp
 152         C edi
 153         C ebp
 154
 155 dnl  FRAME carries on from previous
 156
 157         mull    %ecx            C xp[0] * yp[0]
 158
 159         push    %edi            defframe_pushl(`SAVE_EDI')
 160         movl    %edx, %edi      C carry, for wp[1]
 161
 162         movl    %eax, (%ebx)            C wp[0] is final
 163         movl    4(%esi), %eax           C xp[1]
 164
 165         mull    %ecx            C xp[1] * yp[0]
 166
 167         addl    %eax, %edi              C wp[1] so far
 168         movl    PARAM_YP, %ecx          C ecx = yp, yp[0] is finished with
 169
 170         adcl    $0, %edx                C CF from the addl (the movl kept it)
 171         movl    4(%ecx), %ecx   C yp[1]
 172         movl    %edi, 4(%ebx)           C store wp[1]; xp[0] * yp[1] is added to it below
 173
 174         movl    4(%esi), %eax   C xp[1]
 175         movl    %edx, %edi      C carry, for wp[2]
 176
 177         mull    %ecx            C xp[1] * yp[1]
 178
 179         addl    %eax, %edi              C wp[2] so far
 180
 181         adcl    $0, %edx
 182         movl    (%esi), %eax    C xp[0]
 183
 184         movl    %edx, %esi      C carry, for wp[3]
 185
 186         mull    %ecx            C xp[0] * yp[1]
 187
 188         addl    %eax, 4(%ebx)           C wp[1] += low half
 189         adcl    %edx, %edi              C wp[2] += high half + CF
 190         movl    %edi, 8(%ebx)           C store wp[2]; CF survives for the adcl below
 191
 192         adcl    $0, %esi                C wp[3] += CF
 193         movl    SAVE_EDI, %edi          C restore edi
 194         movl    %esi, 12(%ebx)          C store wp[3]
 195
 196         movl    SAVE_ESI, %esi          C restore esi
 197         movl    SAVE_EBX, %ebx          C restore ebx
 198         addl    $FRAME, %esp            C release the slots of the three pushes
 199
 200         ret
 201
 202
 203 C -----------------------------------------------------------------------------
 204         ALIGN(16)
 205 L(xsize_more_than_two):
     C xsize > 2.  Allocates a four-slot register save area, forms
     C wp[0..xsize] = xp[0..xsize-1] * yp[0] in the L(mul1) loop, and returns
     C at once when ysize == 1.  Otherwise control passes to
     C L(ysize_more_than_one) for the remaining limbs of yp.
 206
 207 C The first limb of yp is processed with a simple mpn_mul_1 style loop
 208 C inline.  Unrolling this doesn't seem worthwhile since it's only run once
 209 C (whereas the addmul below is run ysize-1 many times).  A call to the
 210 C actual mpn_mul_1 will be slowed down by the call and parameter pushing and
 211 C popping, and doesn't seem likely to be worthwhile on the typical 13-26
 212 C limb operations the Karatsuba code calls here with.
 213
 214         C eax   yp[0]
 215         C ebx
 216         C ecx   xsize
 217         C edx   xp
 218         C esi
 219         C edi
 220         C ebp
 221
 222 dnl  FRAME doesn't carry on from previous, no pushes yet here
 223 defframe(`SAVE_EBX',-4)
 224 defframe(`SAVE_ESI',-8)
 225 defframe(`SAVE_EDI',-12)
 226 defframe(`SAVE_EBP',-16)
 227 deflit(`FRAME',0)
 228
 229         subl    $16, %esp               C room for the four save slots above
 230 deflit(`FRAME',16)
 231
 232         movl    %edi, SAVE_EDI
 233         movl    PARAM_WP, %edi          C edi = wp
 234
 235         movl    %ebx, SAVE_EBX
 236         movl    %ebp, SAVE_EBP
 237         movl    %eax, %ebp              C ebp = yp[0], the multiplier for mul1
 238
 239         movl    %esi, SAVE_ESI
 240         xorl    %ebx, %ebx              C initial carry limb is zero
 241         leal    (%edx,%ecx,4), %esi     C xp end
 242
 243         leal    (%edi,%ecx,4), %edi     C wp end of mul1
 244         negl    %ecx                    C ecx = -xsize, counts up to zero
 245
 246
 247 L(mul1):
 248         C eax   scratch
 249         C ebx   carry
 250         C ecx   counter, negative
 251         C edx   scratch
 252         C esi   xp end
 253         C edi   wp end of mul1
 254         C ebp   multiplier
 255
 256         movl    (%esi,%ecx,4), %eax     C next xp limb
 257
 258         mull    %ebp                    C edx:eax = xp limb * yp[0]
 259
 260         addl    %ebx, %eax              C add carry limb, CF is picked up below
 261         movl    %eax, (%edi,%ecx,4)     C store wp limb
 262         movl    $0, %ebx                C movl rather than xorl so that CF survives
 263
 264         adcl    %edx, %ebx              C new carry limb = high half + CF
 265         incl    %ecx
 266         jnz     L(mul1)                 C until the counter reaches zero
 267
 268
 269         movl    PARAM_YSIZE, %edx
 270         movl    PARAM_XSIZE, %ecx
 271
 272         movl    %ebx, (%edi)            C final carry
 273         decl    %edx                    C edx = ysize-1, ZF set if ysize == 1
 274
 275         jnz     L(ysize_more_than_one)
 276
 277
             C ysize == 1, the product is complete: restore registers, return
 278         movl    SAVE_EDI, %edi
 279         movl    SAVE_EBX, %ebx
 280
 281         movl    SAVE_EBP, %ebp
 282         movl    SAVE_ESI, %esi
 283         addl    $FRAME, %esp            C release the save area
 284
 285         ret
 286
 287
 288 L(ysize_more_than_one):
     C ysize > 1, with ecx = xsize and edx = ysize-1.  Chooses between the
     C unrolled addmul code at L(unroll) and the simple addmul loops that
     C follow directly below.
 289         cmpl    $UNROLL_THRESHOLD, %ecx
 290         movl    PARAM_YP, %eax          C eax = yp (movl keeps the cmpl flags)
 291
 292         jae     L(unroll)               C unsigned xsize >= UNROLL_THRESHOLD
 293
 294
 295 C -----------------------------------------------------------------------------
 296         C simple addmul looping
 297         C
 298         C eax   yp
 299         C ebx
 300         C ecx   xsize
 301         C edx   ysize-1
 302         C esi   xp end
 303         C edi   wp end of mul1
 304         C ebp
             C
             C PARAM_YSIZE, PARAM_XSIZE and PARAM_YP are overwritten below and
             C reused as storage for the negative ysize counter, -(xsize-1)
             C and the yp end pointer.
 305
 306         leal    4(%eax,%edx,4), %ebp    C yp end
 307         negl    %ecx                    C -xsize
 308         negl    %edx                    C -(ysize-1)
 309
 310         movl    (%esi,%ecx,4), %eax     C xp low limb
 311         movl    %edx, PARAM_YSIZE       C -(ysize-1)
 312         incl    %ecx
 313
 314         xorl    %ebx, %ebx              C initial carry
 315         movl    %ecx, PARAM_XSIZE       C -(xsize-1)
 316         movl    %ebp, PARAM_YP
 317
 318         movl    (%ebp,%edx,4), %ebp     C yp second lowest limb - multiplier
 319         jmp     L(simple_outer_entry)
 320
 321
 322         C this is offset 0x121 so close enough to aligned
 323 L(simple_outer_top):
 324         C ebp   ysize counter, negative
 325
 326         movl    PARAM_YP, %edx          C yp end
 327         movl    PARAM_XSIZE, %ecx       C -(xsize-1)
 328         xorl    %ebx, %ebx              C carry
 329
 330         movl    %ebp, PARAM_YSIZE       C save the incremented counter
 331         addl    $4, %edi                C next position in wp
 332
 333         movl    (%edx,%ebp,4), %ebp     C yp limb - multiplier
 334         movl    -4(%esi,%ecx,4), %eax   C xp low limb
 335
 336
 337 L(simple_outer_entry):
 338
 339 L(simple_inner):
 340         C eax   xp limb
 341         C ebx   carry limb
 342         C ecx   loop counter (negative)
 343         C edx   scratch
 344         C esi   xp end
 345         C edi   wp end
 346         C ebp   multiplier
 347
 348         mull    %ebp                    C edx:eax = xp limb * multiplier
 349
 350         addl    %eax, %ebx              C low half + carry limb
 351         adcl    $0, %edx
 352
 353         addl    %ebx, (%edi,%ecx,4)     C accumulate into wp
 354         movl    (%esi,%ecx,4), %eax     C fetch the next xp limb (CF untouched)
 355         adcl    $0, %edx
 356
 357         incl    %ecx
 358         movl    %edx, %ebx              C high half becomes the next carry limb
 359         jnz     L(simple_inner)         C tests the incl, movl left ZF alone
 360
 361
             C the last xp limb was fetched in the final pass above and is
             C multiplied here, outside the loop
 362         mull    %ebp
 363
 364         movl    PARAM_YSIZE, %ebp       C ysize counter, negative
 365         addl    %eax, %ebx
 366
 367         adcl    $0, %edx
 368         addl    %ebx, (%edi)            C accumulate into the wp limb at edi
 369
 370         adcl    $0, %edx
 371         incl    %ebp                    C ZF set once every yp limb is done
 372
 373         movl    %edx, 4(%edi)           C store the new top limb of wp
 374         jnz     L(simple_outer_top)
 375
 376
 377         movl    SAVE_EBX, %ebx
 378         movl    SAVE_ESI, %esi
 379
 380         movl    SAVE_EDI, %edi
 381         movl    SAVE_EBP, %ebp
 382         addl    $FRAME, %esp            C release the save area
 383
 384         ret
 385
 386
 387
 388 C -----------------------------------------------------------------------------
 389 C
 390 C The unrolled loop is the same as in mpn_addmul_1(), see that code for some
 391 C comments.
 392 C
 393 C VAR_ADJUST is the negative of how many limbs the leals in the inner loop
 394 C increment xp and wp.  This is used to adjust back xp and wp, and is
 395 C right-shifted to give an initial VAR_COUNTER at the top of the outer loop.
 396 C
 397 C VAR_COUNTER is for the unrolled loop, running from VAR_ADJUST/UNROLL_COUNT
 398 C up to -1, inclusive.
 399 C
 400 C VAR_JMP is the computed jump into the unrolled loop.
 401 C
 402 C VAR_XP_LOW is the least significant limb of xp, which is needed at the
 403 C start of the unrolled loop.
 404 C
 405 C PARAM_YSIZE is the outer loop counter, going from -(ysize-1) up to -1,
 406 C inclusive.
 407 C
 408 C PARAM_YP is offset appropriately so that the PARAM_YSIZE counter can be
 409 C added to give the location of the next limb of yp, which is the multiplier
 410 C in the unrolled loop.
 411 C
 412 C The trick with VAR_ADJUST means it's only necessary to do one fetch in the
 413 C outer loop to take care of xp, wp and the inner loop counter.
 414
 415 defframe(VAR_COUNTER,  -20)
 416 defframe(VAR_ADJUST,   -24)
 417 defframe(VAR_JMP,      -28)
 418 defframe(VAR_XP_LOW,   -32)
 419 deflit(VAR_EXTRA_SPACE, 16)
     C The four VAR slots sit directly below the SAVE slots (offsets -4 to
     C -16) and are allocated by the subl of VAR_EXTRA_SPACE in L(unroll).
 420
 421
 422 L(unroll):
     C One-time setup for the unrolled addmul code: repoints PARAM_YP and
     C PARAM_YSIZE for negative indexing, computes VAR_ADJUST, the initial
     C loop counter and the entry address VAR_JMP inside the unrolled block,
     C then joins the outer loop at L(unroll_outer_entry).
 423         C eax   yp
 424         C ebx
 425         C ecx   xsize
 426         C edx   ysize-1
 427         C esi   xp end
 428         C edi   wp end of mul1
 429         C ebp
 430
 431         movl    PARAM_XP, %esi          C esi = xp start, replacing xp end
 432         movl    4(%eax), %ebp           C multiplier (yp second limb)
 433         leal    4(%eax,%edx,4), %eax    C yp adjust for ysize indexing
 434
 435         movl    PARAM_WP, %edi          C edi = wp start
 436         movl    %eax, PARAM_YP
 437         negl    %edx                    C -(ysize-1)
 438
 439         movl    %edx, PARAM_YSIZE       C outer loop counter
 440         leal    UNROLL_COUNT-2(%ecx), %ebx      C (xsize-1)+UNROLL_COUNT-1
 441         decl    %ecx                            C xsize-1
 442
 443         movl    (%esi), %eax            C xp low limb
             C NOTE(review): UNROLL_MASK and UNROLL_LOG2 are defined outside
             C this file, presumably as UNROLL_COUNT-1 and log2(UNROLL_COUNT).
             C On that reading the andl rounds ebx down to a multiple of
             C UNROLL_COUNT -- TODO confirm.
 444         andl    $-UNROLL_MASK-1, %ebx
 445         negl    %ecx                    C -(xsize-1)
 446
 447         subl    $VAR_EXTRA_SPACE, %esp
 448 deflit(`FRAME',16+VAR_EXTRA_SPACE)
 449         negl    %ebx                    C negative limb count for VAR_ADJUST
 450         andl    $UNROLL_MASK, %ecx      C limb slots to skip at the start of the block
 451
 452         movl    %ebx, VAR_ADJUST
 453         movl    %ecx, %edx              C edx = skip count
 454         shll    $4, %ecx                C ecx = 16 * skip count
 455
 456         sarl    $UNROLL_LOG2, %ebx      C initial loop counter, stored at the entry
 457
 458         C 17 code bytes per limb
             C so the entry address is L(unroll_entry) + 16*skip + skip
 459 ifdef(`PIC',`
 460         call    L(pic_calc)
 461 L(unroll_here):
 462 ',`
 463         leal    L(unroll_entry) (%ecx,%edx,1), %ecx
 464 ')
 465         negl    %edx                    C -(skip count)
 466
 467         movl    %eax, VAR_XP_LOW
 468         movl    %ecx, VAR_JMP           C ecx keeps the address for the testb at the entry
 469         leal    4(%edi,%edx,4), %edi    C wp and xp, adjust for unrolling,
 470         leal    4(%esi,%edx,4), %esi    C  and start at second limb
 471         jmp     L(unroll_outer_entry)
 472
 473
 474 ifdef(`PIC',`
 475 L(pic_calc):
             C PIC helper, called from L(unroll).  Returns in ecx the same
             C entry address that the non-PIC leal computes, using the return
             C address on the stack as the run-time address of L(unroll_here).
             C On entry ecx is 16 times edx.
 476         C See mpn/x86/README about old gas bugs
 477         leal    (%ecx,%edx,1), %ecx     C 17 * edx
 478         addl    $L(unroll_entry)-L(unroll_here), %ecx
 479         addl    (%esp), %ecx            C plus the return address
             C ret_internal is defined outside this file, presumably a return
             C for calls made within the function -- TODO confirm
 480         ret_internal
 481 ')
 482
 483
 484 C --------------------------------------------------------------------------
 485         ALIGN(32)
 486 L(unroll_outer_top):
     C Top of the outer loop of the unrolled addmul code, run once for each
     C remaining limb of yp.  Rewinds xp and wp with VAR_ADJUST (wp ends up
     C one limb further on), fetches the next multiplier, multiplies the
     C low limb of xp and jumps to the precomputed entry VAR_JMP.
 487         C ebp   ysize counter, negative
 488
 489         movl    VAR_ADJUST, %ebx
 490         movl    PARAM_YP, %edx
 491
 492         movl    VAR_XP_LOW, %eax
 493         movl    %ebp, PARAM_YSIZE       C store incremented ysize counter
 494
 495         leal    4(%edi,%ebx,4), %edi    C wp back by the adjustment, then on one limb
 496         leal    (%esi,%ebx,4), %esi     C xp back by the adjustment
 497         sarl    $UNROLL_LOG2, %ebx      C initial loop counter
 498
 499         movl    (%edx,%ebp,4), %ebp     C yp next multiplier
 500         movl    VAR_JMP, %ecx           C needed in ecx only for the testb below
 501
 502 L(unroll_outer_entry):
             C First entry comes from L(unroll) with the same register setup:
             C eax = xp low limb, ebx = initial counter, ecx = entry address,
             C ebp = multiplier.
 503         mull    %ebp                    C edx:eax = xp low limb * multiplier
 504
             C The low bit of the entry address selects which of ecx and ebx
             C receives eax.  The movl instructions below leave the testb
             C flags alone.
 505         testb   $1, %cl         C and clear carry bit
 506         movl    %ebx, VAR_COUNTER
 507         movl    $0, %ebx
 508
 509         movl    $0, %ecx
 510         cmovz(  %eax, %ecx)     C eax into low carry, zero into high carry limb
 511         cmovnz( %eax, %ebx)     C the other way round for an odd entry address
 512
 513         C Extra fetch of VAR_JMP is bad, but registers are tight
 514         jmp     *VAR_JMP
 515
 516
 517 C -----------------------------------------------------------------------------
 518         ALIGN(32)
 519 L(unroll_top):
     C The unrolled addmul inner loop, followed by the outer-loop tail and
     C the function exit.  Each pass of the forloop below emits the code for
     C two limbs.  ebx and ecx swap roles from one limb to the next: the first
     C limb of a pair adds ecx into wp and accumulates into ebx, the second
     C adds ebx into wp and accumulates into ecx.  The carry flag and the
     C high half in edx are carried from each limb into the next.
 520         C eax   xp limb
 521         C ebx   carry high
 522         C ecx   carry low
 523         C edx   scratch
 524         C esi   xp+8
 525         C edi   wp
 526         C ebp   yp multiplier limb
 527         C
 528         C VAR_COUNTER  loop counter, negative
 529         C
 530         C 17 bytes each limb
             C
             C NOTE(review): Zdisp and UNROLL_BYTES are defined outside this
             C file.  Zdisp presumably keeps the zero displacement
             C instructions the same length as the others, so that the 17
             C bytes per limb assumed by the computed jump holds -- TODO
             C confirm.  The ifelse term biases the displacements by -128
             C only when UNROLL_BYTES is 256.
 531
 532 L(unroll_entry):
 533
 534 deflit(CHUNK_COUNT,2)
 535 forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
 536         deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
 537         deflit(`disp1', eval(disp0 + 4))
 538
 539 Zdisp(  movl,   disp0,(%esi), %eax)
 540         adcl    %edx, %ebx
 541
 542         mull    %ebp
 543
 544 Zdisp(  addl,   %ecx, disp0,(%edi))
 545         movl    $0, %ecx
 546
 547         adcl    %eax, %ebx
 548
 549
 550         movl    disp1(%esi), %eax
 551         adcl    %edx, %ecx
 552
 553         mull    %ebp
 554
 555         addl    %ebx, disp1(%edi)
 556         movl    $0, %ebx
 557
 558         adcl    %eax, %ecx
 559 ')
 560
 561
 562         incl    VAR_COUNTER             C incl leaves CF alone, the carry chain continues
 563         leal    UNROLL_BYTES(%esi), %esi        C leal changes no flags
 564         leal    UNROLL_BYTES(%edi), %edi
 565
 566         jnz     L(unroll_top)           C tests the incl of VAR_COUNTER
 567
 568
 569         C eax
 570         C ebx   zero
 571         C ecx   low
 572         C edx   high
 573         C esi
 574         C edi   wp, pointing at second last limb
 575         C ebp
 576         C
 577         C carry flag to be added to high
 578
 579 deflit(`disp0', ifelse(UNROLL_BYTES,256,-128))
 580 deflit(`disp1', eval(disp0-0 + 4))
 581
 582         movl    PARAM_YSIZE, %ebp       C ysize counter, negative
 583         adcl    $0, %edx                C pending CF into the high limb
 584         addl    %ecx, disp0(%edi)       C low limb into wp
 585
 586         adcl    $0, %edx
 587         incl    %ebp                    C ZF set once every yp limb is done
 588
 589         movl    %edx, disp1(%edi)       C store the new top limb of wp
 590         jnz     L(unroll_outer_top)     C tests the incl, movl left ZF alone
 591
 592
 593         movl    SAVE_ESI, %esi
 594         movl    SAVE_EBP, %ebp
 595
 596         movl    SAVE_EDI, %edi
 597         movl    SAVE_EBX, %ebx
 598         addl    $FRAME, %esp            C release the save area and the VAR slots
 599
 600         ret
 601
 602 EPILOGUE()