source/libs/gmp/gmp-src/mpn/x86/pentium4/sse2/divrem_1.asm

   1 dnl  Intel Pentium-4 mpn_divrem_1 -- mpn by limb division.
   2
   3 dnl  Copyright 1999-2004 Free Software Foundation, Inc.
   4
   5 dnl  This file is part of the GNU MP Library.
   6 dnl
   7 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
   8 dnl  it under the terms of either:
   9 dnl
  10 dnl    * the GNU Lesser General Public License as published by the Free
  11 dnl      Software Foundation; either version 3 of the License, or (at your
  12 dnl      option) any later version.
  13 dnl
  14 dnl  or
  15 dnl
  16 dnl    * the GNU General Public License as published by the Free Software
  17 dnl      Foundation; either version 2 of the License, or (at your option) any
  18 dnl      later version.
  19 dnl
  20 dnl  or both in parallel, as here.
  21 dnl
  22 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
  23 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  24 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  25 dnl  for more details.
  26 dnl
  27 dnl  You should have received copies of the GNU General Public License and the
  28 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
  29 dnl  see https://www.gnu.org/licenses/.
  30
  31 include(`../config.m4')
  32
  33
  34 C P4: 32 cycles/limb integer part, 30 cycles/limb fraction part.
  35
  36
  37 C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,
  38 C                         mp_srcptr src, mp_size_t size,
  39 C                         mp_limb_t divisor);
  40 C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
  41 C                          mp_srcptr src, mp_size_t size,
  42 C                          mp_limb_t divisor, mp_limb_t carry);
  43 C mp_limb_t mpn_preinv_divrem_1 (mp_ptr dst, mp_size_t xsize,
  44 C                                mp_srcptr src, mp_size_t size,
  45 C                                mp_limb_t divisor, mp_limb_t inverse,
  46 C                                unsigned shift);
  47 C
  48 C Algorithm:
  49 C
  50 C The method and nomenclature follow part 8 of "Division by Invariant
  51 C Integers using Multiplication" by Granlund and Montgomery, reference in
  52 C gmp.texi.
  53 C
  54 C "m" is written for what is m' in the paper, and "d" for d_norm, which
  55 C won't cause any confusion since it's only the normalized divisor that's of
  56 C any use in the code.  "b" is written for 2^N, the size of a limb, N being
  57 C 32 here.
  58 C
  59 C The step "sdword dr = n - 2^N*d + (2^N-1-q1) * d" is instead done as
  60 C "n-d - q1*d".  This rearrangement gives the same two-limb answer but lets
  61 C us have just a psubq on the dependent chain.
  62 C
  63 C For reference, the way the k7 code uses "n-(q1+1)*d" would not suit here,
  64 C detecting an overflow of q1+1 when q1=0xFFFFFFFF would cost too much.
  65 C
  66 C Notes:
  67 C
  68 C mpn_divrem_1 and mpn_preinv_divrem_1 avoid one division if the src high
  69 C limb is less than the divisor.  mpn_divrem_1c doesn't check for a zero
  70 C carry, since in normal circumstances that will be a very rare event.
  71 C
  72 C The test for skipping a division is branch free (once size>=1 is tested).
  73 C The store to the destination high limb is 0 when a divide is skipped, or
  74 C if it's not skipped then a copy of the src high limb is stored.  The
  75 C latter is in case src==dst.
  76 C
  77 C There's a small bias towards expecting xsize==0, by having code for
  78 C xsize==0 in a straight line and xsize!=0 under forward jumps.
  79 C
  80 C Enhancements:
  81 C
  82 C The loop measures 32 cycles, but the dependent chain would suggest it
  83 C could be done with 30.  Not sure where to start looking for the extras.
  84 C
  85 C Alternatives:
  86 C
  87 C If the divisor is normalized (high bit set) then a division step can
  88 C always be skipped, since the high destination limb is always 0 or 1 in
  89 C that case.  It doesn't seem worth checking for this though, since it
  90 C probably occurs infrequently.
  91
  92
  93 dnl  MUL_THRESHOLD is the value of xsize+size at which the multiply by
  94 dnl  inverse method is used, rather than plain "divl"s.  Minimum value 1.
  95 dnl
  96 dnl  The inverse takes about 80-90 cycles to calculate, but after that the
  97 dnl  multiply is 32 c/l versus division at about 58 c/l.
  98 dnl
  99 dnl  At 4 limbs the div is a touch faster than the mul (and of course
 100 dnl  simpler), so start the mul from 5 limbs.
 101
 102 deflit(MUL_THRESHOLD, 5)
 103
 104
 105 defframe(PARAM_PREINV_SHIFT,   28)  dnl mpn_preinv_divrem_1
 106 defframe(PARAM_PREINV_INVERSE, 24)  dnl mpn_preinv_divrem_1
 107 defframe(PARAM_CARRY,  24)          dnl mpn_divrem_1c
 108 defframe(PARAM_DIVISOR,20)
 109 defframe(PARAM_SIZE,   16)
 110 defframe(PARAM_SRC,    12)
 111 defframe(PARAM_XSIZE,  8)
 112 defframe(PARAM_DST,    4)
 113
 114 dnl  re-use parameter space
 115 define(SAVE_ESI,`PARAM_SIZE')
 116 define(SAVE_EBP,`PARAM_SRC')
 117 define(SAVE_EDI,`PARAM_DIVISOR')
 118 define(SAVE_EBX,`PARAM_DST')
 119
 120         TEXT
 121
 122         ALIGN(16)
 123 PROLOGUE(mpn_preinv_divrem_1)
 124 deflit(`FRAME',0)
 125
 126         movl    PARAM_SIZE, %ecx
 127         xorl    %edx, %edx              C carry if can't skip a div
 128
 129         movl    %esi, SAVE_ESI
 130         movl    PARAM_SRC, %esi
 131
 132         movl    %ebp, SAVE_EBP
 133         movl    PARAM_DIVISOR, %ebp
 134
 135         movl    %edi, SAVE_EDI
 136         movl    PARAM_DST, %edi
 137
 138         movl    -4(%esi,%ecx,4), %eax   C src high limb
 139
 140         movl    %ebx, SAVE_EBX
 141         movl    PARAM_XSIZE, %ebx
 142
 143         movd    PARAM_PREINV_INVERSE, %mm4
 144
 145         movd    PARAM_PREINV_SHIFT, %mm7  C l
 146         cmpl    %ebp, %eax              C high cmp divisor
 147
 148         cmovc(  %eax, %edx)             C high is carry if high<divisor
 149         movd    %edx, %mm0              C carry
 150
 151         movd    %edx, %mm1              C carry
 152         movl    $0, %edx
 153
 154         movd    %ebp, %mm5              C d
 155         cmovnc( %eax, %edx)             C 0 if skip div, src high if not
 156                                         C (the latter in case src==dst)
 157         leal    -4(%edi,%ebx,4), %edi   C &dst[xsize-1]
 158
 159         movl    %edx, (%edi,%ecx,4)     C dst high limb
 160         sbbl    $0, %ecx                C skip one division if high<divisor
 161         movl    $32, %eax
 162
 163         subl    PARAM_PREINV_SHIFT, %eax
 164         psllq   %mm7, %mm5              C d normalized
 165         leal    (%edi,%ecx,4), %edi     C &dst[xsize+size-1]
 166         leal    -4(%esi,%ecx,4), %esi   C &src[size-1]
 167
 168         movd    %eax, %mm6              C 32-l
 169         jmp     L(start_preinv)
 170
 171 EPILOGUE()
 172
 173
 174         ALIGN(16)
 175 PROLOGUE(mpn_divrem_1c)
 176 deflit(`FRAME',0)
 177
 178         movl    PARAM_CARRY, %edx
 179
 180         movl    PARAM_SIZE, %ecx
 181
 182         movl    %esi, SAVE_ESI
 183         movl    PARAM_SRC, %esi
 184
 185         movl    %ebp, SAVE_EBP
 186         movl    PARAM_DIVISOR, %ebp
 187
 188         movl    %edi, SAVE_EDI
 189         movl    PARAM_DST, %edi
 190
 191         movl    %ebx, SAVE_EBX
 192         movl    PARAM_XSIZE, %ebx
 193
 194         leal    -4(%edi,%ebx,4), %edi   C &dst[xsize-1]
 195         jmp     L(start_1c)
 196
 197 EPILOGUE()
 198
 199
 200         ALIGN(16)
 201 PROLOGUE(mpn_divrem_1)
 202 deflit(`FRAME',0)
 203
 204         movl    PARAM_SIZE, %ecx
 205         xorl    %edx, %edx              C initial carry (if can't skip a div)
 206
 207         movl    %esi, SAVE_ESI
 208         movl    PARAM_SRC, %esi
 209
 210         movl    %ebp, SAVE_EBP
 211         movl    PARAM_DIVISOR, %ebp
 212
 213         movl    %edi, SAVE_EDI
 214         movl    PARAM_DST, %edi
 215
 216         movl    %ebx, SAVE_EBX
 217         movl    PARAM_XSIZE, %ebx
 218         leal    -4(%edi,%ebx,4), %edi   C &dst[xsize-1]
 219
 220         orl     %ecx, %ecx              C size
 221         jz      L(no_skip_div)          C if size==0
 222         movl    -4(%esi,%ecx,4), %eax   C src high limb
 223
 224         cmpl    %ebp, %eax              C high cmp divisor
 225
 226         cmovnc( %eax, %edx)             C 0 if skip div, src high if not
 227         movl    %edx, (%edi,%ecx,4)     C dst high limb
 228
 229         movl    $0, %edx
 230         cmovc(  %eax, %edx)             C high is carry if high<divisor
 231
 232         sbbl    $0, %ecx                C size-1 if high<divisor
 233 L(no_skip_div):
 234
 235
 236 L(start_1c):
 237         C eax
 238         C ebx   xsize
 239         C ecx   size
 240         C edx   carry
 241         C esi   src
 242         C edi   &dst[xsize-1]
 243         C ebp   divisor
 244
 245         leal    (%ebx,%ecx), %eax       C size+xsize
 246         leal    -4(%esi,%ecx,4), %esi   C &src[size-1]
 247         leal    (%edi,%ecx,4), %edi     C &dst[size+xsize-1]
 248
 249         cmpl    $MUL_THRESHOLD, %eax
 250         jae     L(mul_by_inverse)
 251
 252
 253         orl     %ecx, %ecx
 254         jz      L(divide_no_integer)    C if size==0
 255
 256 L(divide_integer):
 257         C eax   scratch (quotient)
 258         C ebx   xsize
 259         C ecx   counter
 260         C edx   carry
 261         C esi   src, decrementing
 262         C edi   dst, decrementing
 263         C ebp   divisor
 264
 265         movl    (%esi), %eax
 266         subl    $4, %esi
 267
 268         divl    %ebp
 269
 270         movl    %eax, (%edi)
 271         subl    $4, %edi
 272
 273         subl    $1, %ecx
 274         jnz     L(divide_integer)
 275
 276
 277 L(divide_no_integer):
 278         orl     %ebx, %ebx
 279         jnz     L(divide_fraction)      C if xsize!=0
 280
 281 L(divide_done):
 282         movl    SAVE_ESI, %esi
 283         movl    SAVE_EDI, %edi
 284         movl    SAVE_EBX, %ebx
 285         movl    SAVE_EBP, %ebp
 286         movl    %edx, %eax
 287         ret
 288
 289
 290 L(divide_fraction):
 291         C eax   scratch (quotient)
 292         C ebx   counter
 293         C ecx
 294         C edx   carry
 295         C esi
 296         C edi   dst, decrementing
 297         C ebp   divisor
 298
 299         movl    $0, %eax
 300
 301         divl    %ebp
 302
 303         movl    %eax, (%edi)
 304         subl    $4, %edi
 305
 306         subl    $1, %ebx
 307         jnz     L(divide_fraction)
 308
 309         jmp     L(divide_done)
 310
 311
 312
 313 C -----------------------------------------------------------------------------
 314
 315 L(mul_by_inverse):
 316         C eax
 317         C ebx   xsize
 318         C ecx   size
 319         C edx   carry
 320         C esi   &src[size-1]
 321         C edi   &dst[size+xsize-1]
 322         C ebp   divisor
 323
 324         bsrl    %ebp, %eax              C 31-l
 325         movd    %edx, %mm0              C carry
 326         movd    %edx, %mm1              C carry
 327         movl    %ecx, %edx              C size
 328         movl    $31, %ecx
 329
 330         C
 331
 332         xorl    %eax, %ecx              C l = leading zeros on d
 333         addl    $1, %eax
 334
 335         shll    %cl, %ebp               C d normalized
 336         movd    %ecx, %mm7              C l
 337         movl    %edx, %ecx              C size
 338
 339         movd    %eax, %mm6              C 32-l
 340         movl    $-1, %edx
 341         movl    $-1, %eax
 342
 343         C
 344
 345         subl    %ebp, %edx              C (b-d)-1 so  edx:eax = b*(b-d)-1
 346
 347         divl    %ebp                    C floor (b*(b-d)-1 / d)
 348         movd    %ebp, %mm5              C d
 349
 350         C
 351
 352         movd    %eax, %mm4              C m
 353
 354
 355 L(start_preinv):
 356         C eax   inverse
 357         C ebx   xsize
 358         C ecx   size
 359         C edx
 360         C esi   &src[size-1]
 361         C edi   &dst[size+xsize-1]
 362         C ebp
 363         C
 364         C mm0   carry
 365         C mm1   carry
 366         C mm2
 367         C mm4   m
 368         C mm5   d
 369         C mm6   31-l
 370         C mm7   l
 371
 372         psllq   %mm7, %mm0              C n2 = carry << l, for size==0
 373
 374         subl    $1, %ecx
 375         jb      L(integer_none)
 376
 377         movd    (%esi), %mm0            C src high limb
 378         punpckldq %mm1, %mm0
 379         psrlq   %mm6, %mm0              C n2 = high (carry:srchigh << l)
 380         jz      L(integer_last)
 381
 382
 383 C The dependent chain here consists of
 384 C
 385 C       2   paddd    n1+n2
 386 C       8   pmuludq  m*(n1+n2)
 387 C       2   paddq    n2:nadj + m*(n1+n2)
 388 C       2   psrlq    q1
 389 C       8   pmuludq  d*q1
 390 C       2   psubq    (n-d)-q1*d
 391 C       2   psrlq    high n-(q1+1)*d mask
 392 C       2   pand     d masked
 393 C       2   paddd    n2+d addback
 394 C       --
 395 C       30
 396 C
 397 C But it seems to run at 32 cycles, so presumably there's something else
 398 C going on.
 399
 400         ALIGN(16)
 401 L(integer_top):
 402         C eax
 403         C ebx
 404         C ecx   counter, size-1 to 0
 405         C edx
 406         C esi   src, decrementing
 407         C edi   dst, decrementing
 408         C
 409         C mm0   n2
 410         C mm4   m
 411         C mm5   d
 412         C mm6   32-l
 413         C mm7   l
 414
 415         ASSERT(b,`C n2<d
 416          movd   %mm0, %eax
 417          movd   %mm5, %edx
 418          cmpl   %edx, %eax')
 419
 420         movd    -4(%esi), %mm1          C next src limbs
 421         movd    (%esi), %mm2
 422         leal    -4(%esi), %esi
 423
 424         punpckldq %mm2, %mm1
 425         psrlq   %mm6, %mm1              C n10
 426
 427         movq    %mm1, %mm2              C n10
 428         movq    %mm1, %mm3              C n10
 429         psrad   $31, %mm1               C -n1
 430         pand    %mm5, %mm1              C -n1 & d
 431         paddd   %mm2, %mm1              C nadj = n10+(-n1&d), ignore overflow
 432
 433         psrld   $31, %mm2               C n1
 434         paddd   %mm0, %mm2              C n2+n1
 435         punpckldq %mm0, %mm1            C n2:nadj
 436
 437         pmuludq %mm4, %mm2              C m*(n2+n1)
 438
 439         C
 440
 441         paddq   %mm2, %mm1              C n2:nadj + m*(n2+n1)
 442         pxor    %mm2, %mm2              C break dependency, saves 4 cycles
 443         pcmpeqd %mm2, %mm2              C FF...FF
 444         psrlq   $63, %mm2               C 1
 445
 446         psrlq   $32, %mm1               C q1 = high(n2:nadj + m*(n2+n1))
 447
 448         paddd   %mm1, %mm2              C q1+1
 449         pmuludq %mm5, %mm1              C q1*d
 450
 451         punpckldq %mm0, %mm3            C n = n2:n10
 452         pxor    %mm0, %mm0
 453
 454         psubq   %mm5, %mm3              C n - d
 455
 456         C
 457
 458         psubq   %mm1, %mm3              C n - (q1+1)*d
 459
 460         por     %mm3, %mm0              C copy remainder -> new n2
 461         psrlq   $32, %mm3               C high n - (q1+1)*d, 0 or -1
 462
 463         ASSERT(be,`C 0 or -1
 464          movd   %mm3, %eax
 465          addl   $1, %eax
 466          cmpl   $1, %eax')
 467
 468         paddd   %mm3, %mm2              C q
 469         pand    %mm5, %mm3              C mask & d
 470
 471         paddd   %mm3, %mm0              C addback if necessary
 472         movd    %mm2, (%edi)
 473         leal    -4(%edi), %edi
 474
 475         subl    $1, %ecx
 476         ja      L(integer_top)
 477
 478
 479 L(integer_last):
 480         C eax
 481         C ebx   xsize
 482         C ecx
 483         C edx
 484         C esi   &src[0]
 485         C edi   &dst[xsize]
 486         C
 487         C mm0   n2
 488         C mm4   m
 489         C mm5   d
 490         C mm6
 491         C mm7   l
 492
 493         ASSERT(b,`C n2<d
 494          movd   %mm0, %eax
 495          movd   %mm5, %edx
 496          cmpl   %edx, %eax')
 497
 498         movd    (%esi), %mm1            C src[0]
 499         psllq   %mm7, %mm1              C n10
 500
 501         movq    %mm1, %mm2              C n10
 502         movq    %mm1, %mm3              C n10
 503         psrad   $31, %mm1               C -n1
 504         pand    %mm5, %mm1              C -n1 & d
 505         paddd   %mm2, %mm1              C nadj = n10+(-n1&d), ignore overflow
 506
 507         psrld   $31, %mm2               C n1
 508         paddd   %mm0, %mm2              C n2+n1
 509         punpckldq %mm0, %mm1            C n2:nadj
 510
 511         pmuludq %mm4, %mm2              C m*(n2+n1)
 512
 513         C
 514
 515         paddq   %mm2, %mm1              C n2:nadj + m*(n2+n1)
 516         pcmpeqd %mm2, %mm2              C FF...FF
 517         psrlq   $63, %mm2               C 1
 518
 519         psrlq   $32, %mm1               C q1 = high(n2:nadj + m*(n2+n1))
 520         paddd   %mm1, %mm2              C q1
 521
 522         pmuludq %mm5, %mm1              C q1*d
 523         punpckldq %mm0, %mm3            C n
 524         psubq   %mm5, %mm3              C n - d
 525         pxor    %mm0, %mm0
 526
 527         C
 528
 529         psubq   %mm1, %mm3              C n - (q1+1)*d
 530
 531         por     %mm3, %mm0              C remainder -> n2
 532         psrlq   $32, %mm3               C high n - (q1+1)*d, 0 or -1
 533
 534         ASSERT(be,`C 0 or -1
 535          movd   %mm3, %eax
 536          addl   $1, %eax
 537          cmpl   $1, %eax')
 538
 539         paddd   %mm3, %mm2              C q
 540         pand    %mm5, %mm3              C mask & d
 541
 542         paddd   %mm3, %mm0              C addback if necessary
 543         movd    %mm2, (%edi)
 544         leal    -4(%edi), %edi
 545
 546
 547 L(integer_none):
 548         C eax
 549         C ebx   xsize
 550
 551         orl     %ebx, %ebx
 552         jnz     L(fraction_some)        C if xsize!=0
 553
 554
 555 L(fraction_done):
 556         movl    SAVE_EBP, %ebp
 557         psrld   %mm7, %mm0              C remainder
 558
 559         movl    SAVE_EDI, %edi
 560         movd    %mm0, %eax
 561
 562         movl    SAVE_ESI, %esi
 563         movl    SAVE_EBX, %ebx
 564         emms
 565         ret
 566
 567
 568
 569 C -----------------------------------------------------------------------------
 570 C
 571
 572 L(fraction_some):
 573         C eax
 574         C ebx   xsize
 575         C ecx
 576         C edx
 577         C esi
 578         C edi   &dst[xsize-1]
 579         C ebp
 580
 581
 582 L(fraction_top):
 583         C eax
 584         C ebx   counter, xsize iterations
 585         C ecx
 586         C edx
 587         C esi   src, decrementing
 588         C edi   dst, decrementing
 589         C
 590         C mm0   n2
 591         C mm4   m
 592         C mm5   d
 593         C mm6   32-l
 594         C mm7   l
 595
 596         ASSERT(b,`C n2<d
 597          movd   %mm0, %eax
 598          movd   %mm5, %edx
 599          cmpl   %edx, %eax')
 600
 601         movq    %mm0, %mm1              C n2
 602         pmuludq %mm4, %mm0              C m*n2
 603
 604         pcmpeqd %mm2, %mm2
 605         psrlq   $63, %mm2
 606
 607         C
 608
 609         psrlq   $32, %mm0               C high(m*n2)
 610
 611         paddd   %mm1, %mm0              C q1 = high(n2:0 + m*n2)
 612
 613         paddd   %mm0, %mm2              C q1+1
 614         pmuludq %mm5, %mm0              C q1*d
 615
 616         psllq   $32, %mm1               C n = n2:0
 617         psubq   %mm5, %mm1              C n - d
 618
 619         C
 620
 621         psubq   %mm0, %mm1              C r = n - (q1+1)*d
 622         pxor    %mm0, %mm0
 623
 624         por     %mm1, %mm0              C r -> n2
 625         psrlq   $32, %mm1               C high n - (q1+1)*d, 0 or -1
 626
 627         ASSERT(be,`C 0 or -1
 628          movd   %mm1, %eax
 629          addl   $1, %eax
 630          cmpl   $1, %eax')
 631
 632         paddd   %mm1, %mm2              C q
 633         pand    %mm5, %mm1              C mask & d
 634
 635         paddd   %mm1, %mm0              C addback if necessary
 636         movd    %mm2, (%edi)
 637         leal    -4(%edi), %edi
 638
 639         subl    $1, %ebx
 640         jne     L(fraction_top)
 641
 642
 643         jmp     L(fraction_done)
 644
 645 EPILOGUE()