source/libs/gmp/gmp-src/mpn/sparc64/ultrasparc1234/addmul_2.asm

   1 dnl  SPARC v9 64-bit mpn_addmul_2 -- Multiply an n-limb number by a 2-limb
   2 dnl  number and add the result to an n-limb vector.
   3
   4 dnl  Copyright 2002, 2003 Free Software Foundation, Inc.
   5
   6 dnl  This file is part of the GNU MP Library.
   7 dnl
   8 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
   9 dnl  it under the terms of either:
  10 dnl
  11 dnl    * the GNU Lesser General Public License as published by the Free
  12 dnl      Software Foundation; either version 3 of the License, or (at your
  13 dnl      option) any later version.
  14 dnl
  15 dnl  or
  16 dnl
  17 dnl    * the GNU General Public License as published by the Free Software
  18 dnl      Foundation; either version 2 of the License, or (at your option) any
  19 dnl      later version.
  20 dnl
  21 dnl  or both in parallel, as here.
  22 dnl
  23 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
  24 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  25 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  26 dnl  for more details.
  27 dnl
  28 dnl  You should have received copies of the GNU General Public License and the
  29 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
  30 dnl  see https://www.gnu.org/licenses/.
  31
  32 include(`../config.m4')
  33
  34 C                  cycles/limb
  35 C UltraSPARC 1&2:      9
  36 C UltraSPARC 3:       10
  37
  38 C Algorithm: We use 16 floating-point multiplies per limb product, with the
  39 C 2-limb v operand split into eight 16-bit pieces, and the n-limb u operand
  40 C split into 32-bit pieces.  We sum four 48-bit partial products using
  41 C floating-point add, then convert the resulting four 50-bit quantities and
  42 C transfer them to the integer unit.
  43
  44 C Possible optimizations:
  45 C   1. Align the stack area where we transfer the four 50-bit product-sums
  46 C      to a 32-byte boundary.  That would minimize the cache collision.
  47 C      (UltraSPARC-1/2 use a direct-mapped cache.)  (Perhaps even better would
  48 C      be to align the area to map to the area immediately before up?)
  49 C   2. Perform two of the fp->int conversions with integer instructions.  We
  50 C      can get almost ten free IEU slots, if we clean up bookkeeping and the
  51 C      silly carry-limb code.
  52 C   3. For an mpn_addmul_1 based on this, we need to fix the silly carry-limb
  53 C      code.
  54
  55 C OSP (Overlapping software pipeline) version of mpn_mul_basecase:
  56 C Operand swap will require 8 LDDA and 8 FXTOD, which will mean 8 cycles.
  57 C FI    = 20
  58 C L     =  9 x un * vn
  59 C WDFI  = 10 x vn / 2
  60 C WD    = 4
  61
  62 C Instruction classification (as per UltraSPARC functional units).
  63 C Assuming silly carry code is fixed.  Includes bookkeeping.
  64 C
  65 C               mpn_addmul_X     mpn_mul_X
  66 C                1       2       1       2
  67 C               ==========      ==========
  68 C      FM        8      16       8      16
  69 C      FA       10      18      10      18
  70 C     MEM       12      12      10      10
  71 C  ISHIFT        6       6       6       6
  72 C IADDLOG       11      11      10      10
  73 C  BRANCH        1       1       1       1
  74 C
  75 C TOTAL IEU     17      17      16      16
  76 C TOTAL         48      64      45      61
  77 C
  78 C IEU cycles     8.5     8.5     8       8
  79 C MEM cycles    12      12      10      10
  80 C ISSUE cycles  12      16      11.25   15.25
  81 C FPU cycles    10      18      10      18
  82 C cycles/loop   12      18      12      18
  83 C cycles/limb   12       9      12       9
  84
  85
  86 C INPUT PARAMETERS
  87 C rp[n + 1]     i0
  88 C up[n]         i1
  89 C n             i2
  90 C vp[2]         i3
  91
  92
   93 ASM_START()
   94         REGISTER(%g2,#scratch)
   95         REGISTER(%g3,#scratch)
   96
   97 C Combine registers (NOTE(review): reads like a list of names that could share one register -- confirm):
   98 C u00_hi= u32_hi
   99 C u00_lo= u32_lo
  100 C a000  = out000
  101 C a016  = out016
  102 C Free: f52 f54 (neither is assigned by any define below)
  103
  104 C Symbolic names for the fp and integer registers used by mpn_addmul_2.
  105 define(`p000', `%f8')  define(`p016',`%f10')  C pNNN: destinations of fmuld (partial products)
  106 define(`p032',`%f12')  define(`p048',`%f14')
  107 define(`p064',`%f16')  define(`p080',`%f18')
  108 define(`p096a',`%f20') define(`p112a',`%f22')  C a-copies: products of u00 (low half of up[i])
  109 define(`p096b',`%f56') define(`p112b',`%f58')  C b-copies: products of u32 (high half of up[i])
  110
  111 define(`out000',`%f0') define(`out016',`%f6')  C fdtox results, stored to the stack area with std
  112
  113 define(`v000',`%f24')  define(`v016',`%f26')  C vNNN: chunks of vp[0] and vp[1], converted with fxtod
  114 define(`v032',`%f28')  define(`v048',`%f30')
  115 define(`v064',`%f44')  define(`v080',`%f46')
  116 define(`v096',`%f48')  define(`v112',`%f50')
  117
  118 define(`u00',`%f32')   define(`u32', `%f34')  C fxtod of the low / high 32 bits of up[i]
  119
  120 define(`a000',`%f36')  define(`a016',`%f38')  C aNNN: faddd sums (first limb: direct fmuld results)
  121 define(`a032',`%f40')  define(`a048',`%f42')
  122 define(`a064',`%f60')  define(`a080',`%f62')
  123
  124 define(`u00_hi',`%f2') define(`u32_hi',`%f4')  C _hi: zeroed once; also names the pair read by fxtod
  125 define(`u00_lo',`%f3') define(`u32_lo',`%f5')  C _lo: loaded with 32 bits of up[i] by ld
  126
  127 define(`cy',`%g1')  C carry between 32-bit result words
  128 define(`rlimb',`%g3')  C running sum for the current result word
  129 define(`i00',`%l0')    define(`i16',`%l1')  C integer values reloaded from the stack area with ldx
  130 define(`r00',`%l2')    define(`r32',`%l3')  C low / high 32 bits of rp[i]
  131 define(`xffffffff',`%l7')  C mask, set by srlx of -1 by 32
  132 define(`xffff',`%o0')  C mask, set by srlx of -1 by 48 (non-VIS path only)
  133
 134
  135 PROLOGUE(mpn_addmul_2)
  136 C Per the file header: add up[0..n-1] * vp[0..1] into rp[]; the last two instructions form the return value in %o0.
  137 C Initialization.  (1) Split v operand into eight 16-bit chunks and store them
  138 C as IEEE double in fp registers.  (2) Clear upper 32 bits of fp register pairs
  139 C f2 and f4.  (3) Store masks in registers aliased to `xffff' and `xffffffff'.
  140 C This code could be better scheduled.
  141
  142         save    %sp, -256, %sp          C new register window; sp lowered by 256 bytes
  143
  144 ifdef(`HAVE_VIS',
  145 `       mov     -1, %g4
  146         wr      %g0, 0xD2, %asi         C NOTE(review): 0xD2 is presumably the VIS 16-bit fp load ASI -- confirm
  147         srlx    %g4, 32, xffffffff      C store mask in register `xffffffff'
  148         ldda    [%i3+6] %asi, v000      C byte offsets 6,4,2,0 lie within vp[0]
  149         ldda    [%i3+4] %asi, v016
  150         ldda    [%i3+2] %asi, v032
  151         ldda    [%i3+0] %asi, v048
  152         fxtod   v000, v000              C integer chunk -> double
  153         ldda    [%i3+14] %asi, v064     C byte offsets 14,12,10,8 lie within vp[1]
  154         fxtod   v016, v016
  155         ldda    [%i3+12] %asi, v080
  156         fxtod   v032, v032
  157         ldda    [%i3+10] %asi, v096
  158         fxtod   v048, v048
  159         ldda    [%i3+8] %asi, v112
  160         fxtod   v064, v064
  161         fxtod   v080, v080
  162         fxtod   v096, v096
  163         fxtod   v112, v112
  164         fzero   u00_hi                  C upper 32 bits of pair f2 = 0
  165         fzero   u32_hi                  C upper 32 bits of pair f4 = 0
  166 ',
  167 `       mov     -1, %g4
  168         ldx     [%i3+0], %l0            C vp[0]
  169         srlx    %g4, 48, xffff          C store mask in register `xffff'
  170         ldx     [%i3+8], %l1            C vp[1]
  171
  172         and     %l0, xffff, %g2
  173         stx     %g2, [%sp+2223+0]       C vp[0] bits 0-15
  174         srlx    %l0, 16, %g3
  175         and     %g3, xffff, %g3
  176         stx     %g3, [%sp+2223+8]       C vp[0] bits 16-31
  177         srlx    %l0, 32, %g2
  178         and     %g2, xffff, %g2
  179         stx     %g2, [%sp+2223+16]      C vp[0] bits 32-47
  180         srlx    %l0, 48, %g3
  181         stx     %g3, [%sp+2223+24]      C vp[0] bits 48-63
  182         and     %l1, xffff, %g2
  183         stx     %g2, [%sp+2223+32]      C vp[1] bits 0-15
  184         srlx    %l1, 16, %g3
  185         and     %g3, xffff, %g3
  186         stx     %g3, [%sp+2223+40]      C vp[1] bits 16-31
  187         srlx    %l1, 32, %g2
  188         and     %g2, xffff, %g2
  189         stx     %g2, [%sp+2223+48]      C vp[1] bits 32-47
  190         srlx    %l1, 48, %g3
  191         stx     %g3, [%sp+2223+56]      C vp[1] bits 48-63
  192
  193         srlx    %g4, 32, xffffffff      C store mask in register `xffffffff'
  194
  195         ldd     [%sp+2223+0], v000      C reload the eight chunks into fp registers
  196         ldd     [%sp+2223+8], v016
  197         ldd     [%sp+2223+16], v032
  198         ldd     [%sp+2223+24], v048
  199         fxtod   v000, v000              C integer chunk -> double
  200         ldd     [%sp+2223+32], v064
  201         fxtod   v016, v016
  202         ldd     [%sp+2223+40], v080
  203         fxtod   v032, v032
  204         ldd     [%sp+2223+48], v096
  205         fxtod   v048, v048
  206         ldd     [%sp+2223+56], v112
  207         fxtod   v064, v064
  208         ld      [%sp+2223+0], u00_hi    C zero u00_hi (first word of the 16-bit value stored at +0)
  209         fxtod   v080, v080
  210         ld      [%sp+2223+0], u32_hi    C zero u32_hi (same zero word)
  211         fxtod   v096, v096
  212         fxtod   v112, v112
  213 ')
  214 C Initialization done.
  215         mov     0, %g2                  C %g2, rlimb and %g4 feed the carry chain; start at 0
  216         mov     0, rlimb
  217         mov     0, %g4
  218         add     %i0, -8, %i0            C BOOKKEEPING
  219
  220 C Start software pipeline.
  221
  222         ld      [%i1+4], u00_lo         C read low 32 bits of up[i]
  223         fxtod   u00_hi, u00             C u00 = low 32 bits of up[i] as double
  224 C mid
  225         ld      [%i1+0], u32_lo         C read high 32 bits of up[i]
  226         fmuld   u00, v000, a000         C first products go straight into a000..a048
  227         fmuld   u00, v016, a016
  228         fmuld   u00, v032, a032
  229         fmuld   u00, v048, a048
  230         add     %i2, -1, %i2            C BOOKKEEPING
  231         fmuld   u00, v064, p064
  232         add     %i1, 8, %i1             C BOOKKEEPING
  233         fxtod   u32_hi, u32             C u32 = high 32 bits of up[i] as double
  234         fmuld   u00, v080, p080
  235         fmuld   u00, v096, p096a
  236         brnz,pt %i2, .L_2_or_more       C taken when %i2 (n-1) is nonzero
  237          fmuld  u00, v112, p112a        C (delay slot)
  238 C Single-limb case (%i2 == 0): finish the only limb, then join wind-down phase 2.
  239 .L1:    fdtox   a000, out000
  240         fmuld   u32, v000, p000
  241         fdtox   a016, out016
  242         fmuld   u32, v016, p016
  243         fmovd   p064, a064              C plain copy: no p096b/p112b exists yet to add
  244         fmuld   u32, v032, p032
  245         fmovd   p080, a080
  246         fmuld   u32, v048, p048
  247         std     out000, [%sp+2223+16]
  248         faddd   p000, a032, a000
  249         fmuld   u32, v064, p064
  250         std     out016, [%sp+2223+24]
  251         fxtod   u00_hi, u00
  252         faddd   p016, a048, a016
  253         fmuld   u32, v080, p080
  254         faddd   p032, a064, a032
  255         fmuld   u32, v096, p096b
  256         faddd   p048, a080, a048
  257         fmuld   u32, v112, p112b
  258 C mid
  259         fdtox   a000, out000
  260         fdtox   a016, out016
  261         faddd   p064, p096a, a064
  262         faddd   p080, p112a, a080
  263         std     out000, [%sp+2223+0]
  264         b       .L_wd2
  265          std    out016, [%sp+2223+8]    C (delay slot)
  266 C Two or more limbs: pipeline fill; finishes the first limb and starts the second.
  267 .L_2_or_more:
  268         ld      [%i1+4], u00_lo         C read low 32 bits of up[i]
  269         fdtox   a000, out000
  270         fmuld   u32, v000, p000
  271         fdtox   a016, out016
  272         fmuld   u32, v016, p016
  273         fmovd   p064, a064              C plain copy: no p096b/p112b exists yet to add
  274         fmuld   u32, v032, p032
  275         fmovd   p080, a080
  276         fmuld   u32, v048, p048
  277         std     out000, [%sp+2223+16]
  278         faddd   p000, a032, a000
  279         fmuld   u32, v064, p064
  280         std     out016, [%sp+2223+24]
  281         fxtod   u00_hi, u00
  282         faddd   p016, a048, a016
  283         fmuld   u32, v080, p080
  284         faddd   p032, a064, a032
  285         fmuld   u32, v096, p096b
  286         faddd   p048, a080, a048
  287         fmuld   u32, v112, p112b
  288 C mid
  289         ld      [%i1+0], u32_lo         C read high 32 bits of up[i]
  290         fdtox   a000, out000
  291         fmuld   u00, v000, p000
  292         fdtox   a016, out016
  293         fmuld   u00, v016, p016
  294         faddd   p064, p096a, a064
  295         fmuld   u00, v032, p032
  296         faddd   p080, p112a, a080
  297         fmuld   u00, v048, p048
  298         add     %i2, -1, %i2            C BOOKKEEPING
  299         std     out000, [%sp+2223+0]
  300         faddd   p000, a032, a000
  301         fmuld   u00, v064, p064
  302         add     %i1, 8, %i1             C BOOKKEEPING
  303         std     out016, [%sp+2223+8]
  304         fxtod   u32_hi, u32
  305         faddd   p016, a048, a016
  306         fmuld   u00, v080, p080
  307         faddd   p032, a064, a032
  308         fmuld   u00, v096, p096a
  309         faddd   p048, a080, a048
  310         brnz,pt %i2, .L_3_or_more       C enter the main loop while %i2 is nonzero
  311          fmuld  u00, v112, p112a        C (delay slot)
  312
  313         b       .Lend                   C %i2 == 0 here: skip the main loop
  314          nop
  315
  316 C  64      32       0
  317 C   .       .       .
  318 C   .       |__rXXX_|   32
  319 C   .      |___cy___|   34
  320 C   .  |_______i00__|   50
  321 C  |_______i16__|   .   50
  322
  323
  324 C BEGIN MAIN LOOP
  325         .align  16
  326 .L_3_or_more:
  327 .Loop:  ld      [%i1+4], u00_lo         C read low 32 bits of up[i]
  328         and     %g2, xffffffff, %g2     C keep low 32 bits of the previous (i16 << 16)
  329         fdtox   a000, out000
  330         fmuld   u32, v000, p000
  331 C
  332         lduw    [%i0+4+8], r00          C read low 32 bits of rp[i]
  333         add     %g2, rlimb, %l5         C previous word sum, redone with the masked %g2
  334         fdtox   a016, out016
  335         fmuld   u32, v016, p016
  336 C
  337         srlx    %l5, 32, cy             C carry out of the previous 32-bit word
  338         ldx     [%sp+2223+16], i00      C reload an earlier fdtox result as an integer
  339         faddd   p064, p096b, a064
  340         fmuld   u32, v032, p032
  341 C
  342         add     %g4, cy, cy             C new cy
  343         ldx     [%sp+2223+24], i16
  344         faddd   p080, p112b, a080
  345         fmuld   u32, v048, p048
  346 C
  347         nop
  348         std     out000, [%sp+2223+16]   C slot was read above; refill with the new result
  349         faddd   p000, a032, a000
  350         fmuld   u32, v064, p064
  351 C
  352         add     i00, r00, rlimb         C add in the low word of rp[i]
  353         add     %i0, 8, %i0             C BOOKKEEPING
  354         std     out016, [%sp+2223+24]
  355         fxtod   u00_hi, u00
  356 C
  357         sllx    i16, 16, %g2            C i16 is weighted 16 bits above i00 (see diagram)
  358         add     cy, rlimb, rlimb
  359         faddd   p016, a048, a016
  360         fmuld   u32, v080, p080
  361 C
  362         srlx    i16, 16, %g4            C bits of i16 that land in the next 32-bit word
  363         add     %g2, rlimb, %l5
  364         faddd   p032, a064, a032
  365         fmuld   u32, v096, p096b
  366 C
  367         stw     %l5, [%i0+4]            C store low 32 bits of rp[i]
  368         nop
  369         faddd   p048, a080, a048
  370         fmuld   u32, v112, p112b
  371 C midloop
  372         ld      [%i1+0], u32_lo         C read high 32 bits of up[i]
  373         and     %g2, xffffffff, %g2     C same carry steps as the first half, for the high word
  374         fdtox   a000, out000
  375         fmuld   u00, v000, p000
  376 C
  377         lduw    [%i0+0], r32            C read high 32 bits of rp[i]
  378         add     %g2, rlimb, %l5
  379         fdtox   a016, out016
  380         fmuld   u00, v016, p016
  381 C
  382         srlx    %l5, 32, cy
  383         ldx     [%sp+2223+0], i00
  384         faddd   p064, p096a, a064
  385         fmuld   u00, v032, p032
  386 C
  387         add     %g4, cy, cy             C new cy
  388         ldx     [%sp+2223+8], i16
  389         faddd   p080, p112a, a080
  390         fmuld   u00, v048, p048
  391 C
  392         add     %i2, -1, %i2            C BOOKKEEPING
  393         std     out000, [%sp+2223+0]
  394         faddd   p000, a032, a000
  395         fmuld   u00, v064, p064
  396 C
  397         add     i00, r32, rlimb         C add in the high word of rp[i]
  398         add     %i1, 8, %i1             C BOOKKEEPING
  399         std     out016, [%sp+2223+8]
  400         fxtod   u32_hi, u32
  401 C
  402         sllx    i16, 16, %g2
  403         add     cy, rlimb, rlimb
  404         faddd   p016, a048, a016
  405         fmuld   u00, v080, p080
  406 C
  407         srlx    i16, 16, %g4
  408         add     %g2, rlimb, %l5
  409         faddd   p032, a064, a032
  410         fmuld   u00, v096, p096a
  411 C
  412         stw     %l5, [%i0+0]            C store high 32 bits of rp[i]
  413         faddd   p048, a080, a048
  414         brnz,pt %i2, .Loop              C loop while %i2 is nonzero
  415          fmuld  u00, v112, p112a        C (delay slot)
  416 C END MAIN LOOP
  417
  418 C WIND-DOWN PHASE 1: last u32 products; nothing more is loaded from up[]
  419 .Lend:  and     %g2, xffffffff, %g2
  420         fdtox   a000, out000
  421         fmuld   u32, v000, p000
  422         lduw    [%i0+4+8], r00          C read low 32 bits of rp[i]
  423         add     %g2, rlimb, %l5
  424         fdtox   a016, out016
  425         fmuld   u32, v016, p016
  426         srlx    %l5, 32, cy
  427         ldx     [%sp+2223+16], i00
  428         faddd   p064, p096b, a064
  429         fmuld   u32, v032, p032
  430         add     %g4, cy, cy             C new cy
  431         ldx     [%sp+2223+24], i16
  432         faddd   p080, p112b, a080
  433         fmuld   u32, v048, p048
  434         std     out000, [%sp+2223+16]
  435         faddd   p000, a032, a000
  436         fmuld   u32, v064, p064
  437         add     i00, r00, rlimb
  438         add     %i0, 8, %i0             C BOOKKEEPING
  439         std     out016, [%sp+2223+24]
  440         sllx    i16, 16, %g2
  441         add     cy, rlimb, rlimb
  442         faddd   p016, a048, a016
  443         fmuld   u32, v080, p080
  444         srlx    i16, 16, %g4
  445         add     %g2, rlimb, %l5
  446         faddd   p032, a064, a032
  447         fmuld   u32, v096, p096b
  448         stw     %l5, [%i0+4]            C store low 32 bits of rp[i]
  449         faddd   p048, a080, a048
  450         fmuld   u32, v112, p112b
  451 C mid
  452         and     %g2, xffffffff, %g2
  453         fdtox   a000, out000
  454         lduw    [%i0+0], r32            C read high 32 bits of rp[i]
  455         add     %g2, rlimb, %l5
  456         fdtox   a016, out016
  457         srlx    %l5, 32, cy
  458         ldx     [%sp+2223+0], i00
  459         faddd   p064, p096a, a064
  460         add     %g4, cy, cy             C new cy
  461         ldx     [%sp+2223+8], i16
  462         faddd   p080, p112a, a080
  463         std     out000, [%sp+2223+0]
  464         add     i00, r32, rlimb
  465         std     out016, [%sp+2223+8]
  466         sllx    i16, 16, %g2
  467         add     cy, rlimb, rlimb
  468         srlx    i16, 16, %g4
  469         add     %g2, rlimb, %l5
  470         stw     %l5, [%i0+0]            C store high 32 bits of rp[i]
  471
  472 C WIND-DOWN PHASE 2: no more multiplies; convert the remaining sums a032..a080
  473 .L_wd2: and     %g2, xffffffff, %g2
  474         fdtox   a032, out000
  475         lduw    [%i0+4+8], r00          C read low 32 bits of rp[i]
  476         add     %g2, rlimb, %l5
  477         fdtox   a048, out016
  478         srlx    %l5, 32, cy
  479         ldx     [%sp+2223+16], i00
  480         add     %g4, cy, cy             C new cy
  481         ldx     [%sp+2223+24], i16
  482         std     out000, [%sp+2223+16]
  483         add     i00, r00, rlimb
  484         add     %i0, 8, %i0             C BOOKKEEPING
  485         std     out016, [%sp+2223+24]
  486         sllx    i16, 16, %g2
  487         add     cy, rlimb, rlimb
  488         srlx    i16, 16, %g4
  489         add     %g2, rlimb, %l5
  490         stw     %l5, [%i0+4]            C store low 32 bits of rp[i]
  491 C mid
  492         and     %g2, xffffffff, %g2
  493         fdtox   a064, out000
  494         lduw    [%i0+0], r32            C read high 32 bits of rp[i]
  495         add     %g2, rlimb, %l5
  496         fdtox   a080, out016
  497         srlx    %l5, 32, cy
  498         ldx     [%sp+2223+0], i00
  499         add     %g4, cy, cy             C new cy
  500         ldx     [%sp+2223+8], i16
  501         std     out000, [%sp+2223+0]
  502         add     i00, r32, rlimb
  503         std     out016, [%sp+2223+8]
  504         sllx    i16, 16, %g2
  505         add     cy, rlimb, rlimb
  506         srlx    i16, 16, %g4
  507         add     %g2, rlimb, %l5
  508         stw     %l5, [%i0+0]            C store high 32 bits of rp[i]
  509
  510 C WIND-DOWN PHASE 3: convert p096b/p112b; no rp[] words are read or added from here on
  511 .L_wd3: and     %g2, xffffffff, %g2
  512         fdtox   p096b, out000
  513         add     %g2, rlimb, %l5
  514         fdtox   p112b, out016
  515         srlx    %l5, 32, cy
  516         ldx     [%sp+2223+16], rlimb    C loaded straight into rlimb: nothing from rp[] to add
  517         add     %g4, cy, cy             C new cy
  518         ldx     [%sp+2223+24], i16
  519         std     out000, [%sp+2223+16]
  520         add     %i0, 8, %i0             C BOOKKEEPING
  521         std     out016, [%sp+2223+24]
  522         sllx    i16, 16, %g2
  523         add     cy, rlimb, rlimb
  524         srlx    i16, 16, %g4
  525         add     %g2, rlimb, %l5
  526         stw     %l5, [%i0+4]            C store low word at the advanced rp position
  527 C mid
  528         and     %g2, xffffffff, %g2
  529         add     %g2, rlimb, %l5
  530         srlx    %l5, 32, cy
  531         ldx     [%sp+2223+0], rlimb
  532         add     %g4, cy, cy             C new cy
  533         ldx     [%sp+2223+8], i16
  534         sllx    i16, 16, %g2
  535         add     cy, rlimb, rlimb
  536         srlx    i16, 16, %g4
  537         add     %g2, rlimb, %l5
  538         stw     %l5, [%i0+0]            C store high word at the same rp position
  539
  540         and     %g2, xffffffff, %g2
  541         add     %g2, rlimb, %l5
  542         srlx    %l5, 32, cy
  543         ldx     [%sp+2223+16], i00      C converted p096b, stored above
  544         add     %g4, cy, cy             C new cy
  545         ldx     [%sp+2223+24], i16      C converted p112b, stored above
  546
  547         sllx    i16, 16, %g2
  548         add     i00, cy, cy
  549         return  %i7+8                   C return to the caller (delay slot follows)
  550         add     %g2, cy, %o0            C (delay slot) return value = (i16 << 16) + i00 + cy
  551 EPILOGUE(mpn_addmul_2)