source/libs/gmp/gmp-src/mpn/x86/p6/mul_basecase.asm

   1 dnl  Intel P6 mpn_mul_basecase -- multiply two mpn numbers.
   2
   3 dnl  Copyright 1999-2003 Free Software Foundation, Inc.
   4
   5 dnl  This file is part of the GNU MP Library.
   6 dnl
   7 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
   8 dnl  it under the terms of either:
   9 dnl
  10 dnl    * the GNU Lesser General Public License as published by the Free
  11 dnl      Software Foundation; either version 3 of the License, or (at your
  12 dnl      option) any later version.
  13 dnl
  14 dnl  or
  15 dnl
  16 dnl    * the GNU General Public License as published by the Free Software
  17 dnl      Foundation; either version 2 of the License, or (at your option) any
  18 dnl      later version.
  19 dnl
  20 dnl  or both in parallel, as here.
  21 dnl
  22 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
  23 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  24 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  25 dnl  for more details.
  26 dnl
  27 dnl  You should have received copies of the GNU General Public License and the
  28 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
  29 dnl  see https://www.gnu.org/licenses/.
  30
  31 include(`../config.m4')
  32
  33
  34 C P6: approx 6.5 cycles per cross product (16 limbs/loop unrolling).
  35
  36
  37 dnl  P6 UNROLL_COUNT cycles/product (approx)
  38 dnl           8           7
  39 dnl          16           6.5
  40 dnl          32           6.4
  41 dnl  Maximum possible with the current code is 32.
  42
  43 deflit(UNROLL_COUNT, 16)
  44 C UNROLL_COUNT is the number of limbs handled per pass of the unrolled loop.
  45
  46 C void mpn_mul_basecase (mp_ptr wp,
  47 C                        mp_srcptr xp, mp_size_t xsize,
  48 C                        mp_srcptr yp, mp_size_t ysize);
  49 C
  50 C This routine is essentially the same as mpn/generic/mul_basecase.c, but
  51 C it's faster because it does most of the mpn_addmul_1() startup
  52 C calculations only once.
  53 C UNROLL_THRESHOLD is the smallest xsize sent to the unrolled addmul code; PIC and non-PIC share one value.
  54 ifdef(`PIC',`
  55 deflit(UNROLL_THRESHOLD, 5)
  56 ',`
  57 deflit(UNROLL_THRESHOLD, 5)
  58 ')
  59 C Argument slots.  Some are reused below to save registers and to hold loop state.
  60 defframe(PARAM_YSIZE,20)
  61 defframe(PARAM_YP,   16)
  62 defframe(PARAM_XSIZE,12)
  63 defframe(PARAM_XP,   8)
  64 defframe(PARAM_WP,   4)
  65
  66         TEXT
  67         ALIGN(16)
  68
  69 PROLOGUE(mpn_mul_basecase)
  70 deflit(`FRAME',0)
  71         C Dispatch on xsize: below 2 falls through, 2 and above branch away.
  72         movl    PARAM_XSIZE, %ecx       C xsize
  73
  74         movl    PARAM_YP, %eax          C yp
  75
  76         movl    PARAM_XP, %edx          C xp
  77
  78         movl    (%eax), %eax            C yp[0]
  79         cmpl    $2, %ecx                C xsize against 2, unsigned
  80         ja      L(xsize_more_than_two)  C xsize > 2
  81         je      L(two_by_something)     C xsize == 2
  82
  83
  84         C one limb by one limb
  85         C (ysize is not examined here, presumably callers ensure xsize >= ysize >= 1 - confirm)
  86         mull    (%edx)                  C edx:eax = xp[0] * yp[0]
  87
  88         movl    PARAM_WP, %ecx          C wp
  89         movl    %eax, (%ecx)            C wp[0] = low limb
  90         movl    %edx, 4(%ecx)           C wp[1] = high limb
  91         ret
  92
  93
  94 C -----------------------------------------------------------------------------
  95 L(two_by_something):
  96 deflit(`FRAME',0)
  97 C Reached with xsize==2.  On entry eax is yp[0], edx is xp.
  98 dnl  re-use parameter space: the xsize and ysize slots hold the saved ebx and esi
  99 define(SAVE_EBX, `PARAM_XSIZE')
 100 define(SAVE_ESI, `PARAM_YSIZE')
 101
 102         movl    %ebx, SAVE_EBX          C save ebx
 103         cmpl    $1, PARAM_YSIZE         C ysize==1?  read before the slot takes esi; movl keeps the flags for the jne
 104         movl    %eax, %ecx              C yp[0]
 105
 106         movl    %esi, SAVE_ESI          C save esi
 107         movl    PARAM_WP, %ebx          C wp
 108         movl    %edx, %esi              C xp
 109
 110         movl    (%edx), %eax            C xp[0]
 111         jne     L(two_by_two)           C ysize != 1
 112
 113
 114         C two limbs by one limb
 115         C
 116         C eax   xp[0]
 117         C ebx   wp
 118         C ecx   yp[0]
 119         C edx
 120         C esi   xp
 121
 122         mull    %ecx                    C edx:eax = xp[0] * yp[0]
 123
 124         movl    %eax, (%ebx)            C wp[0]
 125         movl    4(%esi), %eax           C xp[1]
 126         movl    %edx, %esi              C carry
 127
 128         mull    %ecx                    C edx:eax = xp[1] * yp[0]
 129
 130         addl    %eax, %esi              C low limb plus carry
 131
 132         movl    %esi, 4(%ebx)           C wp[1]
 133         movl    SAVE_ESI, %esi          C restore esi, movl keeps the carry flag
 134
 135         adcl    $0, %edx                C fold carry into the high limb
 136
 137         movl    %edx, 8(%ebx)           C wp[2]
 138         movl    SAVE_EBX, %ebx          C restore ebx
 139
 140         ret
 141
 142
 143
 144 C -----------------------------------------------------------------------------
 145
 146         ALIGN(16)
 147 L(two_by_two):
 148         C eax   xp[0]
 149         C ebx   wp
 150         C ecx   yp[0]
 151         C edx
 152         C esi   xp
 153         C edi
 154         C ebp
 155
 156 dnl  more parameter space re-use: wp is already in ebx so its slot can hold edi
 157 define(SAVE_EDI, `PARAM_WP')
 158
 159         mull    %ecx            C xp[0] * yp[0]
 160
 161         movl    %edi, SAVE_EDI  C save edi
 162         movl    %edx, %edi      C carry, for wp[1]
 163
 164         movl    %eax, (%ebx)    C wp[0]
 165         movl    4(%esi), %eax   C xp[1]
 166
 167         mull    %ecx            C xp[1] * yp[0]
 168
 169         addl    %eax, %edi      C first partial wp[1]
 170         movl    PARAM_YP, %ecx  C yp
 171
 172         adcl    $0, %edx        C carry from the addl, kept across the movl
 173         movl    4(%ecx), %ecx   C yp[1]
 174
 175         movl    %edi, 4(%ebx)   C store partial wp[1]
 176         movl    4(%esi), %eax   C xp[1]
 177         movl    %edx, %edi      C carry, for wp[2]
 178
 179         mull    %ecx            C xp[1] * yp[1]
 180
 181         addl    %eax, %edi      C partial wp[2]
 182         movl    (%esi), %eax    C xp[0]
 183
 184         adcl    $0, %edx        C carry from the addl, kept across the movl
 185         movl    %edx, %esi      C carry, for wp[3]
 186
 187         mull    %ecx            C xp[0] * yp[1]
 188
 189         addl    %eax, 4(%ebx)   C finish wp[1]
 190         movl    %esi, %eax      C wp[3] carry moved to eax so esi can be restored
 191
 192         adcl    %edx, %edi      C finish wp[2]
 193         movl    SAVE_ESI, %esi  C restore esi
 194
 195         movl    %edi, 8(%ebx)   C wp[2]
 196
 197         adcl    $0, %eax        C finish wp[3], carry survived the movls
 198         movl    SAVE_EDI, %edi  C restore edi
 199
 200         movl    %eax, 12(%ebx)  C wp[3]
 201         movl    SAVE_EBX, %ebx  C restore ebx
 202
 203         ret
 204
 205
 206 C -----------------------------------------------------------------------------
 207         ALIGN(16)
 208 L(xsize_more_than_two):
 209
 210 C The first limb of yp is processed with a simple mpn_mul_1 loop running at
 211 C about 6.2 c/l.  Unrolling this doesn't seem worthwhile since it's only run
 212 C once (whereas the addmul_1 below is run ysize-1 many times).  A call to
 213 C mpn_mul_1 would be slowed down by the parameter pushing and popping etc,
 214 C and doesn't seem likely to be worthwhile on the typical sizes reaching
 215 C here from the Karatsuba code.
 216
 217         C eax   yp[0]
 218         C ebx
 219         C ecx   xsize
 220         C edx   xp
 221         C esi
 222         C edi
 223         C ebp
 224
 225 defframe(`SAVE_EBX',    -4)
 226 defframe(`SAVE_ESI',    -8)
 227 defframe(`SAVE_EDI',   -12)
 228 defframe(`SAVE_EBP',   -16)
 229 defframe(VAR_COUNTER,  -20)  dnl for use in the unroll case
 230 defframe(VAR_ADJUST,   -24)
 231 defframe(VAR_JMP,      -28)
 232 defframe(VAR_SWAP,     -32)
 233 defframe(VAR_XP_LOW,   -36)
 234 deflit(STACK_SPACE, 36)
 235
 236         subl    $STACK_SPACE, %esp      C frame for 4 saved registers and 5 variables
 237 deflit(`FRAME',STACK_SPACE)
 238
 239         movl    %edi, SAVE_EDI          C save edi
 240         movl    PARAM_WP, %edi          C wp
 241
 242         movl    %ebx, SAVE_EBX          C save ebx
 243
 244         movl    %ebp, SAVE_EBP          C save ebp
 245         movl    %eax, %ebp              C multiplier yp[0]
 246
 247         movl    %esi, SAVE_ESI          C save esi
 248         xorl    %ebx, %ebx              C initial carry
 249         leal    (%edx,%ecx,4), %esi     C xp end
 250
 251         leal    (%edi,%ecx,4), %edi     C wp end of mul1
 252         negl    %ecx                    C -xsize, counts up to zero
 253
 254
 255 L(mul1):
 256         C eax   scratch
 257         C ebx   carry
 258         C ecx   counter, negative
 259         C edx   scratch
 260         C esi   xp end
 261         C edi   wp end of mul1
 262         C ebp   multiplier
 263
 264         movl    (%esi,%ecx,4), %eax     C next xp limb
 265
 266         mull    %ebp                    C edx:eax = xp limb * yp[0]
 267
 268         addl    %ebx, %eax              C add carry in
 269         movl    %eax, (%edi,%ecx,4)     C store wp limb
 270         movl    $0, %ebx                C zero ebx with movl so the carry flag from the addl is kept
 271
 272         adcl    %edx, %ebx              C carry out = high limb + carry flag
 273         incl    %ecx
 274         jnz     L(mul1)
 275
 276
 277         movl    PARAM_YSIZE, %edx       C ysize
 278
 279         movl    %ebx, (%edi)            C final carry
 280         movl    PARAM_XSIZE, %ecx       C xsize
 281         decl    %edx                    C ysize-1, the yp limbs still to do
 282
 283         jz      L(done)                 C if ysize==1
 284
 285         cmpl    $UNROLL_THRESHOLD, %ecx C xsize against the unroll threshold
 286         movl    PARAM_YP, %eax          C yp, movl keeps the cmpl flags
 287         jae     L(unroll)               C xsize >= UNROLL_THRESHOLD
 288
 289
 290 C -----------------------------------------------------------------------------
 291         C simple addmul looping
 292         C
 293         C eax   yp
 294         C ebx
 295         C ecx   xsize
 296         C edx   ysize-1
 297         C esi   xp end
 298         C edi   wp end of mul1
 299         C ebp
 300
 301         leal    4(%eax,%edx,4), %ebp    C yp end
 302         negl    %ecx                    C -xsize
 303         negl    %edx                    C -(ysize-1)
 304
 305         movl    %edx, PARAM_YSIZE       C -(ysize-1)
 306         movl    (%esi,%ecx,4), %eax     C xp low limb
 307         incl    %ecx
 308
 309         movl    %ecx, PARAM_XSIZE       C -(xsize-1)
 310         xorl    %ebx, %ebx              C initial carry
 311
 312         movl    %ebp, PARAM_YP          C yp end, to be indexed by the negative counter
 313         movl    (%ebp,%edx,4), %ebp     C yp second lowest limb - multiplier
 314         jmp     L(simple_outer_entry)
 315
 316
 317 L(simple_outer_top):
 318         C ebp   ysize counter, negative
 319
 320         movl    PARAM_YP, %edx          C yp end
 321
 322         movl    PARAM_XSIZE, %ecx       C -(xsize-1)
 323         xorl    %ebx, %ebx              C carry
 324
 325         movl    %ebp, PARAM_YSIZE       C store the incremented counter
 326         addl    $4, %edi                C next position in wp
 327
 328         movl    (%edx,%ebp,4), %ebp     C yp limb - multiplier
 329
 330         movl    -4(%esi,%ecx,4), %eax   C xp low limb
 331
 332
 333 L(simple_outer_entry):
 334
 335 L(simple_inner_top):
 336         C eax   xp limb
 337         C ebx   carry limb
 338         C ecx   loop counter (negative)
 339         C edx   scratch
 340         C esi   xp end
 341         C edi   wp end
 342         C ebp   multiplier
 343
 344         mull    %ebp                    C edx:eax = xp limb * multiplier
 345
 346         addl    %eax, %ebx              C low limb plus carry in
 347         adcl    $0, %edx
 348
 349         addl    %ebx, (%edi,%ecx,4)     C accumulate into wp
 350         movl    (%esi,%ecx,4), %eax     C next xp limb, movl keeps the carry flag
 351         adcl    $0, %edx
 352
 353         incl    %ecx
 354         movl    %edx, %ebx              C new carry limb, movl keeps the incl flags for the jnz
 355         jnz     L(simple_inner_top)
 356
 357
 358         C separate code for last limb so outer loop counter handling can be
 359         C interleaved
 360
 361         mull    %ebp                    C edx:eax = last xp limb * multiplier
 362
 363         movl    PARAM_YSIZE, %ebp       C outer counter, negative
 364         addl    %eax, %ebx
 365
 366         adcl    $0, %edx
 367
 368         addl    %ebx, (%edi)            C accumulate into wp
 369
 370         adcl    $0, %edx
 371         incl    %ebp                    C one more yp limb done
 372
 373         movl    %edx, 4(%edi)           C store the final carry limb
 374         jnz     L(simple_outer_top)     C tests the incl, movl keeps the flags
 375
 376
 377 L(done):
 378         movl    SAVE_EBX, %ebx          C exit for ysize==1 and for the simple loop: restore saved registers
 379
 380         movl    SAVE_ESI, %esi
 381
 382         movl    SAVE_EDI, %edi
 383
 384         movl    SAVE_EBP, %ebp
 385         addl    $FRAME, %esp            C release the frame, FRAME is STACK_SPACE here
 386
 387         ret
 388
 389
 390
 391 C -----------------------------------------------------------------------------
 392 C
 393 C The unrolled loop is the same as in mpn_addmul_1, see that code for some
 394 C comments.
 395 C
 396 C VAR_ADJUST is the negative of how many limbs the leals in the inner loop
 397 C increment xp and wp.  This is used to adjust xp and wp, and is rshifted to
 398 C give an initial VAR_COUNTER at the top of the outer loop.
 399 C
 400 C VAR_COUNTER is for the unrolled loop, running from VAR_ADJUST/UNROLL_COUNT
 401 C up to -1, inclusive.
 402 C
 403 C VAR_JMP is the computed jump into the unrolled loop.
 404 C
 405 C VAR_SWAP is 0 if xsize odd or 0xFFFFFFFF if xsize even, used to swap the
 406 C initial ebx and ecx on entry to the unrolling.
 407 C
 408 C VAR_XP_LOW is the least significant limb of xp, which is needed at the
 409 C start of the unrolled loop.
 410 C
 411 C PARAM_YSIZE is the outer loop counter, going from -(ysize-1) up to -1,
 412 C inclusive.
 413 C
 414 C PARAM_YP is offset appropriately so that the PARAM_YSIZE counter can be
 415 C added to give the location of the next limb of yp, which is the multiplier
 416 C in the unrolled loop.
 417 C
 418 C The trick with the VAR_ADJUST value means it's only necessary to do one
 419 C fetch in the outer loop to take care of xp, wp and the inner loop counter.
 420
 421
 422 L(unroll):
 423         C eax   yp
 424         C ebx
 425         C ecx   xsize
 426         C edx   ysize-1
 427         C esi   xp end
 428         C edi   wp end of mul1
 429         C ebp
 430
 431         movl    PARAM_XP, %esi          C xp start, replacing xp end
 432
 433         movl    4(%eax), %ebp           C multiplier (yp second limb)
 434         leal    4(%eax,%edx,4), %eax    C yp adjust for ysize indexing
 435
 436         movl    %eax, PARAM_YP          C adjusted yp
 437         movl    PARAM_WP, %edi          C wp start, replacing wp end of mul1
 438         negl    %edx                    C -(ysize-1)
 439
 440         movl    %edx, PARAM_YSIZE       C outer loop counter
 441         leal    UNROLL_COUNT-2(%ecx), %ebx      C (xsize-1)+UNROLL_COUNT-1
 442         decl    %ecx                            C xsize-1
 443
 444         movl    (%esi), %eax            C xp low limb
 445         andl    $-UNROLL_MASK-1, %ebx   C presumably rounds down to a multiple of UNROLL_COUNT, UNROLL_MASK is defined elsewhere
 446         negl    %ecx                    C -(xsize-1)
 447
 448         negl    %ebx                    C negative limb adjustment
 449         andl    $UNROLL_MASK, %ecx      C n = -(xsize-1) masked with UNROLL_MASK
 450
 451         movl    %ebx, VAR_ADJUST
 452         movl    %ecx, %edx              C n
 453         shll    $4, %ecx                C 16*n
 454
 455         movl    %eax, VAR_XP_LOW
 456         sarl    $UNROLL_LOG2, %ebx      C VAR_ADJUST shifted right gives the first VAR_COUNTER
 457         negl    %edx                    C -n, so ecx+edx below is 15*n
 458
 459         C 15 code bytes per limb
 460 ifdef(`PIC',`
 461         call    L(pic_calc)
 462 L(unroll_here):
 463 ',`
 464         leal    L(unroll_inner_entry) (%ecx,%edx,1), %ecx
 465 ')
 466
 467         movl    %ecx, VAR_JMP           C computed entry address
 468         movl    %edx, %ecx              C -n
 469         shll    $31, %edx               C low bit of -n up to the sign bit
 470
 471         sarl    $31, %edx               C 0 or -1 as xsize odd or even
 472         leal    4(%edi,%ecx,4), %edi    C wp and xp, adjust for unrolling,
 473         leal    4(%esi,%ecx,4), %esi    C  and start at second limb
 474
 475         movl    %edx, VAR_SWAP
 476         jmp     L(unroll_outer_entry)
 477
 478
 479 ifdef(`PIC',`
 480 L(pic_calc):
 481         C See mpn/x86/README about old gas bugs
 482         leal    (%ecx,%edx,1), %ecx     C 16*n - n = 15*n code bytes
 483         addl    $L(unroll_inner_entry)-L(unroll_here), %ecx    C plus distance from the call return point to the entry label
 484         addl    (%esp), %ecx            C plus the return address left on the stack by the call
 485         ret_internal
 486 ')
 487
 488
 489 C --------------------------------------------------------------------------
 490         ALIGN(16)
 491 L(unroll_outer_top):
 492         C eax
 493         C ebx
 494         C ecx
 495         C edx
 496         C esi   xp + offset
 497         C edi   wp + offset
 498         C ebp   ysize counter, negative
 499
 500         movl    VAR_ADJUST, %ebx        C one fetch serves xp, wp and the inner counter
 501         movl    PARAM_YP, %edx          C adjusted yp
 502
 503         movl    VAR_XP_LOW, %eax        C xp low limb saved during setup
 504         movl    %ebp, PARAM_YSIZE       C store incremented ysize counter
 505
 506         leal    eval(UNROLL_BYTES + 4) (%edi,%ebx,4), %edi
 507         leal    (%esi,%ebx,4), %esi     C move xp by VAR_ADJUST limbs
 508         sarl    $UNROLL_LOG2, %ebx      C initial inner loop counter
 509
 510         movl    (%edx,%ebp,4), %ebp     C yp next multiplier
 511
 512 L(unroll_outer_entry):
 513         mull    %ebp                    C edx:eax = xp low limb * multiplier
 514
 515         movl    %ebx, VAR_COUNTER
 516         movl    %edx, %ebx              C carry high
 517         movl    %eax, %ecx              C carry low
 518
 519         xorl    %edx, %eax              C low xor high
 520         movl    VAR_JMP, %edx           C computed entry address
 521
 522         andl    VAR_SWAP, %eax          C zero, or low xor high when VAR_SWAP is all ones
 523
 524         xorl    %eax, %ebx              C carries other way for odd index
 525         xorl    %eax, %ecx
 526
 527         jmp     *%edx                   C enter the unrolled code at the computed point
 528
 529
 530 C -----------------------------------------------------------------------------
 531
 532 L(unroll_inner_top):
 533         C eax   xp limb
 534         C ebx   carry high
 535         C ecx   carry low
 536         C edx   scratch
 537         C esi   xp+8
 538         C edi   wp
 539         C ebp   yp multiplier limb
 540         C
 541         C VAR_COUNTER  loop counter, negative
 542         C
 543         C 15 bytes each limb
 544
 545         addl    $UNROLL_BYTES, %edi     C advance wp, skipped on the first computed entry
 546
 547 L(unroll_inner_entry):
 548
 549 deflit(CHUNK_COUNT,2)
 550 forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
 551         deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
 552         deflit(`disp1', eval(disp0 + 4))
 553
 554 Zdisp(  movl,   disp0,(%esi), %eax)
 555         mull    %ebp
 556 Zdisp(  addl,   %ecx, disp0,(%edi))
 557         adcl    %eax, %ebx              C new carry low
 558         movl    %edx, %ecx
 559         adcl    $0, %ecx                C new carry high
 560
 561         movl    disp1(%esi), %eax
 562         mull    %ebp
 563         addl    %ebx, disp1(%edi)
 564         adcl    %eax, %ecx              C new carry low
 565         movl    %edx, %ebx
 566         adcl    $0, %ebx                C new carry high
 567 ')
 568
 569
 570         incl    VAR_COUNTER             C counter runs up to zero
 571         leal    UNROLL_BYTES(%esi), %esi        C advance xp with leal so the incl flags reach the jnz
 572         jnz     L(unroll_inner_top)
 573
 574
 575         C eax
 576         C ebx   carry high
 577         C ecx   carry low
 578         C edx
 579         C esi
 580         C edi   wp, pointing at second last limb
 581         C ebp
 582
 583 deflit(`disp0', eval(UNROLL_BYTES ifelse(UNROLL_BYTES,256,-128)))
 584 deflit(`disp1', eval(disp0 + 4))
 585
 586         movl    PARAM_YSIZE, %ebp       C outer counter, negative
 587         addl    %ecx, disp0(%edi)       C carry low
 588
 589         adcl    $0, %ebx                C fold the carry flag into carry high
 590         incl    %ebp                    C one more yp limb done
 591
 592         movl    %ebx, disp1(%edi)       C carry high
 593         jnz     L(unroll_outer_top)     C tests the incl, movl keeps the flags
 594
 595
 596         movl    SAVE_ESI, %esi          C restore saved registers
 597
 598         movl    SAVE_EBP, %ebp
 599
 600         movl    SAVE_EDI, %edi
 601
 602         movl    SAVE_EBX, %ebx
 603         addl    $FRAME, %esp            C release the frame, FRAME is STACK_SPACE here
 604
 605         ret
 606
 607 EPILOGUE()