source/libs/gmp/gmp-src/mpn/x86/k6/mul_basecase.asm

   1 dnl  AMD K6 mpn_mul_basecase -- multiply two mpn numbers.
   2
   3 dnl  Copyright 1999-2003 Free Software Foundation, Inc.
   4
   5 dnl  This file is part of the GNU MP Library.
   6 dnl
   7 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
   8 dnl  it under the terms of either:
   9 dnl
  10 dnl    * the GNU Lesser General Public License as published by the Free
  11 dnl      Software Foundation; either version 3 of the License, or (at your
  12 dnl      option) any later version.
  13 dnl
  14 dnl  or
  15 dnl
  16 dnl    * the GNU General Public License as published by the Free Software
  17 dnl      Foundation; either version 2 of the License, or (at your option) any
  18 dnl      later version.
  19 dnl
  20 dnl  or both in parallel, as here.
  21 dnl
  22 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
  23 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  24 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  25 dnl  for more details.
  26 dnl
  27 dnl  You should have received copies of the GNU General Public License and the
  28 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
  29 dnl  see https://www.gnu.org/licenses/.
  30
  31 include(`../config.m4')
  32
  33
  34 C K6: approx 9.0 cycles per cross product on 30x30 limbs (with 16 limbs/loop
  35 C     unrolling).
  36
  37
  38
  39 dnl  K6: UNROLL_COUNT cycles/product (approx)
  40 dnl           8           9.75
  41 dnl          16           9.3
  42 dnl          32           9.3
  43 dnl  Maximum possible with the current code is 32.
  44 dnl
  45 dnl  With 16 the inner unrolled loop fits exactly in a 256 byte block, which
  46 dnl  might explain it's good performance.
  47
  48 deflit(UNROLL_COUNT, 16)
  49
  50
  51 C void mpn_mul_basecase (mp_ptr wp,
  52 C                        mp_srcptr xp, mp_size_t xsize,
  53 C                        mp_srcptr yp, mp_size_t ysize);
  54 C
  55 C Calculate xp,xsize multiplied by yp,ysize, storing the result in
  56 C wp,xsize+ysize.
  57 C
  58 C This routine is essentially the same as mpn/generic/mul_basecase.c, but
  59 C it's faster because it does most of the mpn_addmul_1() entry code only
  60 C once.  The saving is about 10-20% on typical sizes coming from the
  61 C Karatsuba multiply code.
  62 C
  63 C Enhancements:
  64 C
  65 C The mul_1 loop is about 8.5 c/l, which is slower than mpn_mul_1 at 6.25
  66 C c/l.  Could call mpn_mul_1 when ysize is big enough to make it worthwhile.
  67 C
  68 C The main unrolled addmul loop could be shared by mpn_addmul_1, using some
  69 C extra stack setups and maybe 2 or 3 wasted cycles at the end.  Code saving
  70 C would be 256 bytes.
  71
  72 ifdef(`PIC',`
  73 deflit(UNROLL_THRESHOLD, 8)
  74 ',`
  75 deflit(UNROLL_THRESHOLD, 8)
  76 ')
  77
  78 defframe(PARAM_YSIZE,20)
  79 defframe(PARAM_YP,   16)
  80 defframe(PARAM_XSIZE,12)
  81 defframe(PARAM_XP,   8)
  82 defframe(PARAM_WP,   4)
  83
  84         TEXT
  85         ALIGN(32)
  86 PROLOGUE(mpn_mul_basecase)
  87 deflit(`FRAME',0)
  88
  89         movl    PARAM_XSIZE, %ecx
  90         movl    PARAM_YP, %eax
  91
  92         movl    PARAM_XP, %edx
  93         movl    (%eax), %eax    C yp low limb
  94
  95         cmpl    $2, %ecx
  96         ja      L(xsize_more_than_two_limbs)
  97         je      L(two_by_something)
  98
  99
 100         C one limb by one limb
 101
 102         movl    (%edx), %edx    C xp low limb
 103         movl    PARAM_WP, %ecx
 104
 105         mull    %edx
 106
 107         movl    %eax, (%ecx)
 108         movl    %edx, 4(%ecx)
 109         ret
 110
 111
 112 C -----------------------------------------------------------------------------
 113 L(two_by_something):
 114         decl    PARAM_YSIZE
 115         pushl   %ebx
 116 deflit(`FRAME',4)
 117
 118         movl    PARAM_WP, %ebx
 119         pushl   %esi
 120 deflit(`FRAME',8)
 121
 122         movl    %eax, %ecx      C yp low limb
 123         movl    (%edx), %eax    C xp low limb
 124
 125         movl    %edx, %esi      C xp
 126         jnz     L(two_by_two)
 127
 128
 129         C two limbs by one limb
 130
 131         mull    %ecx
 132
 133         movl    %eax, (%ebx)
 134         movl    4(%esi), %eax
 135
 136         movl    %edx, %esi      C carry
 137
 138         mull    %ecx
 139
 140         addl    %eax, %esi
 141         movl    %esi, 4(%ebx)
 142
 143         adcl    $0, %edx
 144
 145         movl    %edx, 8(%ebx)
 146         popl    %esi
 147
 148         popl    %ebx
 149         ret
 150
 151
 152
 153 C -----------------------------------------------------------------------------
 154         ALIGN(16)
 155 L(two_by_two):
 156         C eax   xp low limb
 157         C ebx   wp
 158         C ecx   yp low limb
 159         C edx
 160         C esi   xp
 161         C edi
 162         C ebp
 163 deflit(`FRAME',8)
 164
 165         mull    %ecx            C xp[0] * yp[0]
 166
 167         push    %edi
 168 deflit(`FRAME',12)
 169         movl    %eax, (%ebx)
 170
 171         movl    4(%esi), %eax
 172         movl    %edx, %edi      C carry, for wp[1]
 173
 174         mull    %ecx            C xp[1] * yp[0]
 175
 176         addl    %eax, %edi
 177         movl    PARAM_YP, %ecx
 178
 179         adcl    $0, %edx
 180
 181         movl    %edi, 4(%ebx)
 182         movl    4(%ecx), %ecx   C yp[1]
 183
 184         movl    4(%esi), %eax   C xp[1]
 185         movl    %edx, %edi      C carry, for wp[2]
 186
 187         mull    %ecx            C xp[1] * yp[1]
 188
 189         addl    %eax, %edi
 190
 191         adcl    $0, %edx
 192
 193         movl    (%esi), %eax    C xp[0]
 194         movl    %edx, %esi      C carry, for wp[3]
 195
 196         mull    %ecx            C xp[0] * yp[1]
 197
 198         addl    %eax, 4(%ebx)
 199         adcl    %edx, %edi
 200         adcl    $0, %esi
 201
 202         movl    %edi, 8(%ebx)
 203         popl    %edi
 204
 205         movl    %esi, 12(%ebx)
 206         popl    %esi
 207
 208         popl    %ebx
 209         ret
 210
 211
 212 C -----------------------------------------------------------------------------
 213         ALIGN(16)
 214 L(xsize_more_than_two_limbs):
 215
 216 C The first limb of yp is processed with a simple mpn_mul_1 style loop
 217 C inline.  Unrolling this doesn't seem worthwhile since it's only run once
 218 C (whereas the addmul below is run ysize-1 many times).  A call to the
 219 C actual mpn_mul_1 will be slowed down by the call and parameter pushing and
 220 C popping, and doesn't seem likely to be worthwhile on the typical 10-20
 221 C limb operations the Karatsuba code calls here with.
 222
 223         C eax   yp[0]
 224         C ebx
 225         C ecx   xsize
 226         C edx   xp
 227         C esi
 228         C edi
 229         C ebp
 230 deflit(`FRAME',0)
 231
 232         pushl   %edi            defframe_pushl(SAVE_EDI)
 233         pushl   %ebp            defframe_pushl(SAVE_EBP)
 234
 235         movl    PARAM_WP, %edi
 236         pushl   %esi            defframe_pushl(SAVE_ESI)
 237
 238         movl    %eax, %ebp
 239         pushl   %ebx            defframe_pushl(SAVE_EBX)
 240
 241         leal    (%edx,%ecx,4), %ebx     C xp end
 242         xorl    %esi, %esi
 243
 244         leal    (%edi,%ecx,4), %edi     C wp end of mul1
 245         negl    %ecx
 246
 247
 248 L(mul1):
 249         C eax   scratch
 250         C ebx   xp end
 251         C ecx   counter, negative
 252         C edx   scratch
 253         C esi   carry
 254         C edi   wp end of mul1
 255         C ebp   multiplier
 256
 257         movl    (%ebx,%ecx,4), %eax
 258
 259         mull    %ebp
 260
 261         addl    %esi, %eax
 262         movl    $0, %esi
 263
 264         adcl    %edx, %esi
 265
 266         movl    %eax, (%edi,%ecx,4)
 267         incl    %ecx
 268
 269         jnz     L(mul1)
 270
 271
 272         movl    PARAM_YSIZE, %edx
 273         movl    %esi, (%edi)            C final carry
 274
 275         movl    PARAM_XSIZE, %ecx
 276         decl    %edx
 277
 278         jnz     L(ysize_more_than_one_limb)
 279
 280         popl    %ebx
 281         popl    %esi
 282         popl    %ebp
 283         popl    %edi
 284         ret
 285
 286
 287 L(ysize_more_than_one_limb):
 288         cmpl    $UNROLL_THRESHOLD, %ecx
 289         movl    PARAM_YP, %eax
 290
 291         jae     L(unroll)
 292
 293
 294 C -----------------------------------------------------------------------------
 295 C Simple addmul loop.
 296 C
 297 C Using ebx and edi pointing at the ends of their respective locations saves
 298 C a couple of instructions in the outer loop.  The inner loop is still 11
 299 C cycles, the same as the simple loop in aorsmul_1.asm.
 300
 301         C eax   yp
 302         C ebx   xp end
 303         C ecx   xsize
 304         C edx   ysize-1
 305         C esi
 306         C edi   wp end of mul1
 307         C ebp
 308
 309         movl    4(%eax), %ebp           C multiplier
 310         negl    %ecx
 311
 312         movl    %ecx, PARAM_XSIZE       C -xsize
 313         xorl    %esi, %esi              C initial carry
 314
 315         leal    4(%eax,%edx,4), %eax    C yp end
 316         negl    %edx
 317
 318         movl    %eax, PARAM_YP
 319         movl    %edx, PARAM_YSIZE
 320
 321         jmp     L(simple_outer_entry)
 322
 323
 324         C aligning here saves a couple of cycles
 325         ALIGN(16)
 326 L(simple_outer_top):
 327         C edx   ysize counter, negative
 328
 329         movl    PARAM_YP, %eax          C yp end
 330         xorl    %esi, %esi              C carry
 331
 332         movl    PARAM_XSIZE, %ecx       C -xsize
 333         movl    %edx, PARAM_YSIZE
 334
 335         movl    (%eax,%edx,4), %ebp     C yp limb multiplier
 336 L(simple_outer_entry):
 337         addl    $4, %edi
 338
 339
 340 L(simple_inner):
 341         C eax   scratch
 342         C ebx   xp end
 343         C ecx   counter, negative
 344         C edx   scratch
 345         C esi   carry
 346         C edi   wp end of this addmul
 347         C ebp   multiplier
 348
 349         movl    (%ebx,%ecx,4), %eax
 350
 351         mull    %ebp
 352
 353         addl    %esi, %eax
 354         movl    $0, %esi
 355
 356         adcl    $0, %edx
 357         addl    %eax, (%edi,%ecx,4)
 358         adcl    %edx, %esi
 359
 360         incl    %ecx
 361         jnz     L(simple_inner)
 362
 363
 364         movl    PARAM_YSIZE, %edx
 365         movl    %esi, (%edi)
 366
 367         incl    %edx
 368         jnz     L(simple_outer_top)
 369
 370
 371         popl    %ebx
 372         popl    %esi
 373         popl    %ebp
 374         popl    %edi
 375         ret
 376
 377
 378 C -----------------------------------------------------------------------------
 379 C Unrolled loop.
 380 C
 381 C The unrolled inner loop is the same as in aorsmul_1.asm, see that code for
 382 C some comments.
 383 C
 384 C VAR_COUNTER is for the inner loop, running from VAR_COUNTER_INIT down to
 385 C 0, inclusive.
 386 C
 387 C VAR_JMP is the computed jump into the unrolled loop.
 388 C
 389 C PARAM_XP and PARAM_WP get offset appropriately for where the unrolled loop
 390 C is entered.
 391 C
 392 C VAR_XP_LOW is the least significant limb of xp, which is needed at the
 393 C start of the unrolled loop.  This can't just be fetched through the xp
 394 C pointer because of the offset applied to it.
 395 C
 396 C PARAM_YSIZE is the outer loop counter, going from -(ysize-1) up to -1,
 397 C inclusive.
 398 C
 399 C PARAM_YP is offset appropriately so that the PARAM_YSIZE counter can be
 400 C added to give the location of the next limb of yp, which is the multiplier
 401 C in the unrolled loop.
 402 C
 403 C PARAM_WP is similarly offset so that the PARAM_YSIZE counter can be added
 404 C to give the starting point in the destination for each unrolled loop (this
 405 C point is one limb upwards for each limb of yp processed).
 406 C
 407 C Having PARAM_YSIZE count negative to zero means it's not necessary to
 408 C store new values of PARAM_YP and PARAM_WP on each loop.  Those values on
 409 C the stack remain constant and on each loop an leal adjusts them with the
 410 C PARAM_YSIZE counter value.
 411
 412
 413 defframe(VAR_COUNTER,      -20)
 414 defframe(VAR_COUNTER_INIT, -24)
 415 defframe(VAR_JMP,          -28)
 416 defframe(VAR_XP_LOW,       -32)
 417 deflit(VAR_STACK_SPACE, 16)
 418
 419 dnl  For some strange reason using (%esp) instead of 0(%esp) is a touch
 420 dnl  slower in this code, hence the defframe empty-if-zero feature is
 421 dnl  disabled.
 422 dnl
 423 dnl  If VAR_COUNTER is at (%esp), the effect is worse.  In this case the
 424 dnl  unrolled loop is 255 instead of 256 bytes, but quite how this affects
 425 dnl  anything isn't clear.
 426 dnl
 427 define(`defframe_empty_if_zero_disabled',1)
 428
 429 L(unroll):
 430         C eax   yp (not used)
 431         C ebx   xp end (not used)
 432         C ecx   xsize
 433         C edx   ysize-1
 434         C esi
 435         C edi   wp end of mul1 (not used)
 436         C ebp
 437 deflit(`FRAME', 16)
 438
 439         leal    -2(%ecx), %ebp  C one limb processed at start,
 440         decl    %ecx            C and ebp is one less
 441
 442         shrl    $UNROLL_LOG2, %ebp
 443         negl    %ecx
 444
 445         subl    $VAR_STACK_SPACE, %esp
 446 deflit(`FRAME', 16+VAR_STACK_SPACE)
 447         andl    $UNROLL_MASK, %ecx
 448
 449         movl    %ecx, %esi
 450         shll    $4, %ecx
 451
 452         movl    %ebp, VAR_COUNTER_INIT
 453         negl    %esi
 454
 455         C 15 code bytes per limb
 456 ifdef(`PIC',`
 457         call    L(pic_calc)
 458 L(unroll_here):
 459 ',`
 460         leal    L(unroll_entry) (%ecx,%esi,1), %ecx
 461 ')
 462
 463         movl    PARAM_XP, %ebx
 464         movl    %ebp, VAR_COUNTER
 465
 466         movl    PARAM_WP, %edi
 467         movl    %ecx, VAR_JMP
 468
 469         movl    (%ebx), %eax
 470         leal    4(%edi,%esi,4), %edi    C wp adjust for unrolling and mul1
 471
 472         leal    (%ebx,%esi,4), %ebx     C xp adjust for unrolling
 473
 474         movl    %eax, VAR_XP_LOW
 475
 476         movl    %ebx, PARAM_XP
 477         movl    PARAM_YP, %ebx
 478
 479         leal    (%edi,%edx,4), %ecx     C wp adjust for ysize indexing
 480         movl    4(%ebx), %ebp           C multiplier (yp second limb)
 481
 482         leal    4(%ebx,%edx,4), %ebx    C yp adjust for ysize indexing
 483
 484         movl    %ecx, PARAM_WP
 485
 486         leal    1(%esi), %ecx   C adjust parity for decl %ecx above
 487
 488         movl    %ebx, PARAM_YP
 489         negl    %edx
 490
 491         movl    %edx, PARAM_YSIZE
 492         jmp     L(unroll_outer_entry)
 493
 494
 495 ifdef(`PIC',`
 496 L(pic_calc):
 497         C See mpn/x86/README about old gas bugs
 498         leal    (%ecx,%esi,1), %ecx
 499         addl    $L(unroll_entry)-L(unroll_here), %ecx
 500         addl    (%esp), %ecx
 501         ret_internal
 502 ')
 503
 504
 505 C -----------------------------------------------------------------------------
 506         C Aligning here saves a couple of cycles per loop.  Using 32 doesn't
 507         C cost any extra space, since the inner unrolled loop below is
 508         C aligned to 32.
 509         ALIGN(32)
 510 L(unroll_outer_top):
 511         C edx   ysize
 512
 513         movl    PARAM_YP, %eax
 514         movl    %edx, PARAM_YSIZE       C incremented ysize counter
 515
 516         movl    PARAM_WP, %edi
 517
 518         movl    VAR_COUNTER_INIT, %ebx
 519         movl    (%eax,%edx,4), %ebp     C next multiplier
 520
 521         movl    PARAM_XSIZE, %ecx
 522         leal    (%edi,%edx,4), %edi     C adjust wp for where we are in yp
 523
 524         movl    VAR_XP_LOW, %eax
 525         movl    %ebx, VAR_COUNTER
 526
 527 L(unroll_outer_entry):
 528         mull    %ebp
 529
 530         C using testb is a tiny bit faster than testl
 531         testb   $1, %cl
 532
 533         movl    %eax, %ecx      C low carry
 534         movl    VAR_JMP, %eax
 535
 536         movl    %edx, %esi      C high carry
 537         movl    PARAM_XP, %ebx
 538
 539         jnz     L(unroll_noswap)
 540         movl    %ecx, %esi      C high,low carry other way around
 541
 542         movl    %edx, %ecx
 543 L(unroll_noswap):
 544
 545         jmp     *%eax
 546
 547
 548
 549 C -----------------------------------------------------------------------------
 550         ALIGN(32)
 551 L(unroll_top):
 552         C eax   scratch
 553         C ebx   xp
 554         C ecx   carry low
 555         C edx   scratch
 556         C esi   carry high
 557         C edi   wp
 558         C ebp   multiplier
 559         C VAR_COUNTER  loop counter
 560         C
 561         C 15 code bytes each limb
 562
 563         leal    UNROLL_BYTES(%edi), %edi
 564
 565 L(unroll_entry):
 566 deflit(CHUNK_COUNT,2)
 567 forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
 568         deflit(`disp0', eval(i*CHUNK_COUNT*4))
 569         deflit(`disp1', eval(disp0 + 4))
 570         deflit(`disp2', eval(disp1 + 4))
 571
 572         movl    disp1(%ebx), %eax
 573         mull    %ebp
 574 Zdisp(  addl,   %ecx, disp0,(%edi))
 575         adcl    %eax, %esi
 576         movl    %edx, %ecx
 577         jadcl0( %ecx)
 578
 579         movl    disp2(%ebx), %eax
 580         mull    %ebp
 581         addl    %esi, disp1(%edi)
 582         adcl    %eax, %ecx
 583         movl    %edx, %esi
 584         jadcl0( %esi)
 585 ')
 586
 587         decl    VAR_COUNTER
 588         leal    UNROLL_BYTES(%ebx), %ebx
 589
 590         jns     L(unroll_top)
 591
 592
 593         movl    PARAM_YSIZE, %edx
 594         addl    %ecx, UNROLL_BYTES(%edi)
 595
 596         adcl    $0, %esi
 597
 598         incl    %edx
 599         movl    %esi, UNROLL_BYTES+4(%edi)
 600
 601         jnz     L(unroll_outer_top)
 602
 603
 604         movl    SAVE_ESI, %esi
 605         movl    SAVE_EBP, %ebp
 606         movl    SAVE_EDI, %edi
 607         movl    SAVE_EBX, %ebx
 608
 609         addl    $FRAME, %esp
 610         ret
 611
 612 EPILOGUE()