source/libs/gmp/gmp-src/mpn/x86/pentium/sqr_basecase.asm

   1 dnl  Intel P5 mpn_sqr_basecase -- square an mpn number.
   2
   3 dnl  Copyright 1999-2002 Free Software Foundation, Inc.
   4
   5 dnl  This file is part of the GNU MP Library.
   6 dnl
   7 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
   8 dnl  it under the terms of either:
   9 dnl
  10 dnl    * the GNU Lesser General Public License as published by the Free
  11 dnl      Software Foundation; either version 3 of the License, or (at your
  12 dnl      option) any later version.
  13 dnl
  14 dnl  or
  15 dnl
  16 dnl    * the GNU General Public License as published by the Free Software
  17 dnl      Foundation; either version 2 of the License, or (at your option) any
  18 dnl      later version.
  19 dnl
  20 dnl  or both in parallel, as here.
  21 dnl
  22 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
  23 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  24 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  25 dnl  for more details.
  26 dnl
  27 dnl  You should have received copies of the GNU General Public License and the
  28 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
  29 dnl  see https://www.gnu.org/licenses/.
  30
  31 include(`../config.m4')
  32
  33
  34 C P5: approx 8 cycles per crossproduct, or 15.5 cycles per triangular
  35 C product at around 20x20 limbs.
  36
  37
  38 C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
  39 C
  40 C Calculate src,size squared, storing the result in dst,2*size.
  41 C
  42 C The algorithm is basically the same as mpn/generic/sqr_basecase.c, but a
  43 C lot of function call overheads are avoided, especially when the size is
  44 C small.
  45
  46 defframe(PARAM_SIZE,12)
  47 defframe(PARAM_SRC, 8)
  48 defframe(PARAM_DST, 4)
  49
  50         TEXT
  51         ALIGN(8)
  52 PROLOGUE(mpn_sqr_basecase)
  53 deflit(`FRAME',0)
  54
  55         movl    PARAM_SIZE, %edx
  56         movl    PARAM_SRC, %eax
  57
  58         cmpl    $2, %edx
  59         movl    PARAM_DST, %ecx
  60
  61         je      L(two_limbs)
  62
  63         movl    (%eax), %eax
  64         ja      L(three_or_more)
  65
  66 C -----------------------------------------------------------------------------
  67 C one limb only
  68         C eax   src
  69         C ebx
  70         C ecx   dst
  71         C edx
  72
  73         mull    %eax
  74
  75         movl    %eax, (%ecx)
  76         movl    %edx, 4(%ecx)
  77
  78         ret
  79
  80 C -----------------------------------------------------------------------------
  81         ALIGN(8)
  82 L(two_limbs):
  83         C eax   src
  84         C ebx
  85         C ecx   dst
  86         C edx   size
  87
  88         pushl   %ebp
  89         pushl   %edi
  90
  91         pushl   %esi
  92         pushl   %ebx
  93
  94         movl    %eax, %ebx
  95         movl    (%eax), %eax
  96
  97         mull    %eax            C src[0]^2
  98
  99         movl    %eax, (%ecx)    C dst[0]
 100         movl    %edx, %esi      C dst[1]
 101
 102         movl    4(%ebx), %eax
 103
 104         mull    %eax            C src[1]^2
 105
 106         movl    %eax, %edi      C dst[2]
 107         movl    %edx, %ebp      C dst[3]
 108
 109         movl    (%ebx), %eax
 110
 111         mull    4(%ebx)         C src[0]*src[1]
 112
 113         addl    %eax, %esi
 114         popl    %ebx
 115
 116         adcl    %edx, %edi
 117
 118         adcl    $0, %ebp
 119         addl    %esi, %eax
 120
 121         adcl    %edi, %edx
 122         movl    %eax, 4(%ecx)
 123
 124         adcl    $0, %ebp
 125         popl    %esi
 126
 127         movl    %edx, 8(%ecx)
 128         movl    %ebp, 12(%ecx)
 129
 130         popl    %edi
 131         popl    %ebp
 132
 133         ret
 134
 135
 136 C -----------------------------------------------------------------------------
 137         ALIGN(8)
 138 L(three_or_more):
 139         C eax   src low limb
 140         C ebx
 141         C ecx   dst
 142         C edx   size
 143
 144         cmpl    $4, %edx
 145         pushl   %ebx
 146 deflit(`FRAME',4)
 147
 148         movl    PARAM_SRC, %ebx
 149         jae     L(four_or_more)
 150
 151
 152 C -----------------------------------------------------------------------------
 153 C three limbs
 154         C eax   src low limb
 155         C ebx   src
 156         C ecx   dst
 157         C edx   size
 158
 159         pushl   %ebp
 160         pushl   %edi
 161
 162         mull    %eax            C src[0] ^ 2
 163
 164         movl    %eax, (%ecx)
 165         movl    %edx, 4(%ecx)
 166
 167         movl    4(%ebx), %eax
 168         xorl    %ebp, %ebp
 169
 170         mull    %eax            C src[1] ^ 2
 171
 172         movl    %eax, 8(%ecx)
 173         movl    %edx, 12(%ecx)
 174
 175         movl    8(%ebx), %eax
 176         pushl   %esi            C risk of cache bank clash
 177
 178         mull    %eax            C src[2] ^ 2
 179
 180         movl    %eax, 16(%ecx)
 181         movl    %edx, 20(%ecx)
 182
 183         movl    (%ebx), %eax
 184
 185         mull    4(%ebx)         C src[0] * src[1]
 186
 187         movl    %eax, %esi
 188         movl    %edx, %edi
 189
 190         movl    (%ebx), %eax
 191
 192         mull    8(%ebx)         C src[0] * src[2]
 193
 194         addl    %eax, %edi
 195         movl    %edx, %ebp
 196
 197         adcl    $0, %ebp
 198         movl    4(%ebx), %eax
 199
 200         mull    8(%ebx)         C src[1] * src[2]
 201
 202         xorl    %ebx, %ebx
 203         addl    %eax, %ebp
 204
 205         C eax
 206         C ebx   zero, will be dst[5]
 207         C ecx   dst
 208         C edx   dst[4]
 209         C esi   dst[1]
 210         C edi   dst[2]
 211         C ebp   dst[3]
 212
 213         adcl    $0, %edx
 214         addl    %esi, %esi
 215
 216         adcl    %edi, %edi
 217
 218         adcl    %ebp, %ebp
 219
 220         adcl    %edx, %edx
 221         movl    4(%ecx), %eax
 222
 223         adcl    $0, %ebx
 224         addl    %esi, %eax
 225
 226         movl    %eax, 4(%ecx)
 227         movl    8(%ecx), %eax
 228
 229         adcl    %edi, %eax
 230         movl    12(%ecx), %esi
 231
 232         adcl    %ebp, %esi
 233         movl    16(%ecx), %edi
 234
 235         movl    %eax, 8(%ecx)
 236         movl    %esi, 12(%ecx)
 237
 238         adcl    %edx, %edi
 239         popl    %esi
 240
 241         movl    20(%ecx), %eax
 242         movl    %edi, 16(%ecx)
 243
 244         popl    %edi
 245         popl    %ebp
 246
 247         adcl    %ebx, %eax      C no carry out of this
 248         popl    %ebx
 249
 250         movl    %eax, 20(%ecx)
 251
 252         ret
 253
 254
 255 C -----------------------------------------------------------------------------
 256         ALIGN(8)
 257 L(four_or_more):
 258         C eax   src low limb
 259         C ebx   src
 260         C ecx   dst
 261         C edx   size
 262         C esi
 263         C edi
 264         C ebp
 265         C
 266         C First multiply src[0]*src[1..size-1] and store at dst[1..size].
 267
 268 deflit(`FRAME',4)
 269
 270         pushl   %edi
 271 FRAME_pushl()
 272         pushl   %esi
 273 FRAME_pushl()
 274
 275         pushl   %ebp
 276 FRAME_pushl()
 277         leal    (%ecx,%edx,4), %edi     C dst end of this mul1
 278
 279         leal    (%ebx,%edx,4), %esi     C src end
 280         movl    %ebx, %ebp              C src
 281
 282         negl    %edx                    C -size
 283         xorl    %ebx, %ebx              C clear carry limb and carry flag
 284
 285         leal    1(%edx), %ecx           C -(size-1)
 286
 287 L(mul1):
 288         C eax   scratch
 289         C ebx   carry
 290         C ecx   counter, negative
 291         C edx   scratch
 292         C esi   &src[size]
 293         C edi   &dst[size]
 294         C ebp   src
 295
 296         adcl    $0, %ebx
 297         movl    (%esi,%ecx,4), %eax
 298
 299         mull    (%ebp)
 300
 301         addl    %eax, %ebx
 302
 303         movl    %ebx, (%edi,%ecx,4)
 304         incl    %ecx
 305
 306         movl    %edx, %ebx
 307         jnz     L(mul1)
 308
 309
 310         C Add products src[n]*src[n+1..size-1] at dst[2*n-1...], for
 311         C n=1..size-2.
 312         C
 313         C The last two products, which are the end corner of the product
 314         C triangle, are handled separately to save looping overhead.  These
 315         C are src[size-3]*src[size-2,size-1] and src[size-2]*src[size-1].
 316         C If size is 4 then it's only these that need to be done.
 317         C
 318         C In the outer loop %esi is a constant, and %edi just advances by 1
 319         C limb each time.  The size of the operation decreases by 1 limb
 320         C each time.
 321
 322         C eax
 323         C ebx   carry (needing carry flag added)
 324         C ecx
 325         C edx
 326         C esi   &src[size]
 327         C edi   &dst[size]
 328         C ebp
 329
 330         adcl    $0, %ebx
 331         movl    PARAM_SIZE, %edx
 332
 333         movl    %ebx, (%edi)
 334         subl    $4, %edx
 335
 336         negl    %edx
 337         jz      L(corner)
 338
 339
 340 L(outer):
 341         C ebx   previous carry limb to store
 342         C edx   outer loop counter (negative)
 343         C esi   &src[size]
 344         C edi   dst, pointing at stored carry limb of previous loop
 345
 346         pushl   %edx                    C new outer loop counter
 347         leal    -2(%edx), %ecx
 348
 349         movl    %ebx, (%edi)
 350         addl    $4, %edi
 351
 352         addl    $4, %ebp
 353         xorl    %ebx, %ebx              C initial carry limb, clear carry flag
 354
 355 L(inner):
 356         C eax   scratch
 357         C ebx   carry (needing carry flag added)
 358         C ecx   counter, negative
 359         C edx   scratch
 360         C esi   &src[size]
 361         C edi   dst end of this addmul
 362         C ebp   &src[j]
 363
 364         adcl    $0, %ebx
 365         movl    (%esi,%ecx,4), %eax
 366
 367         mull    (%ebp)
 368
 369         addl    %ebx, %eax
 370         movl    (%edi,%ecx,4), %ebx
 371
 372         adcl    $0, %edx
 373         addl    %eax, %ebx
 374
 375         movl    %ebx, (%edi,%ecx,4)
 376         incl    %ecx
 377
 378         movl    %edx, %ebx
 379         jnz     L(inner)
 380
 381
 382         adcl    $0, %ebx
 383         popl    %edx            C outer loop counter
 384
 385         incl    %edx
 386         jnz     L(outer)
 387
 388
 389         movl    %ebx, (%edi)
 390
 391 L(corner):
 392         C esi   &src[size]
 393         C edi   &dst[2*size-4]
 394
 395         movl    -8(%esi), %eax
 396         movl    -4(%edi), %ebx          C risk of data cache bank clash here
 397
 398         mull    -12(%esi)               C src[size-2]*src[size-3]
 399
 400         addl    %eax, %ebx
 401         movl    %edx, %ecx
 402
 403         adcl    $0, %ecx
 404         movl    -4(%esi), %eax
 405
 406         mull    -12(%esi)               C src[size-1]*src[size-3]
 407
 408         addl    %ecx, %eax
 409         movl    (%edi), %ecx
 410
 411         adcl    $0, %edx
 412         movl    %ebx, -4(%edi)
 413
 414         addl    %eax, %ecx
 415         movl    %edx, %ebx
 416
 417         adcl    $0, %ebx
 418         movl    -4(%esi), %eax
 419
 420         mull    -8(%esi)                C src[size-1]*src[size-2]
 421
 422         movl    %ecx, (%edi)
 423         addl    %eax, %ebx
 424
 425         adcl    $0, %edx
 426         movl    PARAM_SIZE, %eax
 427
 428         negl    %eax
 429         movl    %ebx, 4(%edi)
 430
 431         addl    $1, %eax                C -(size-1) and clear carry
 432         movl    %edx, 8(%edi)
 433
 434
 435 C -----------------------------------------------------------------------------
 436 C Left shift of dst[1..2*size-2], high bit shifted out becomes dst[2*size-1].
 437
 438 L(lshift):
 439         C eax   counter, negative
 440         C ebx   next limb
 441         C ecx
 442         C edx
 443         C esi
 444         C edi   &dst[2*size-4]
 445         C ebp
 446
 447         movl    12(%edi,%eax,8), %ebx
 448
 449         rcll    %ebx
 450         movl    16(%edi,%eax,8), %ecx
 451
 452         rcll    %ecx
 453         movl    %ebx, 12(%edi,%eax,8)
 454
 455         movl    %ecx, 16(%edi,%eax,8)
 456         incl    %eax
 457
 458         jnz     L(lshift)
 459
 460
 461         adcl    %eax, %eax              C high bit out
 462         movl    PARAM_SRC, %esi
 463
 464         movl    PARAM_SIZE, %ecx        C risk of cache bank clash
 465         movl    %eax, 12(%edi)          C dst most significant limb
 466
 467
 468 C -----------------------------------------------------------------------------
 469 C Now add in the squares on the diagonal, namely src[0]^2, src[1]^2, ...,
 470 C src[size-1]^2.  dst[0] hasn't yet been set at all yet, and just gets the
 471 C low limb of src[0]^2.
 472
 473         movl    (%esi), %eax            C src[0]
 474         leal    (%esi,%ecx,4), %esi     C src end
 475
 476         negl    %ecx
 477
 478         mull    %eax
 479
 480         movl    %eax, 16(%edi,%ecx,8)   C dst[0]
 481         movl    %edx, %ebx
 482
 483         addl    $1, %ecx                C size-1 and clear carry
 484
 485 L(diag):
 486         C eax   scratch (low product)
 487         C ebx   carry limb
 488         C ecx   counter, negative
 489         C edx   scratch (high product)
 490         C esi   &src[size]
 491         C edi   &dst[2*size-4]
 492         C ebp   scratch (fetched dst limbs)
 493
 494         movl    (%esi,%ecx,4), %eax
 495         adcl    $0, %ebx
 496
 497         mull    %eax
 498
 499         movl    16-4(%edi,%ecx,8), %ebp
 500
 501         addl    %ebp, %ebx
 502         movl    16(%edi,%ecx,8), %ebp
 503
 504         adcl    %eax, %ebp
 505         movl    %ebx, 16-4(%edi,%ecx,8)
 506
 507         movl    %ebp, 16(%edi,%ecx,8)
 508         incl    %ecx
 509
 510         movl    %edx, %ebx
 511         jnz     L(diag)
 512
 513
 514         adcl    $0, %edx
 515         movl    16-4(%edi), %eax        C dst most significant limb
 516
 517         addl    %eax, %edx
 518         popl    %ebp
 519
 520         movl    %edx, 16-4(%edi)
 521         popl    %esi            C risk of cache bank clash
 522
 523         popl    %edi
 524         popl    %ebx
 525
 526         ret
 527
 528 EPILOGUE()