source/libs/gmp/gmp-src/mpn/sparc32/v9/sqr_diagonal.asm

   1 dnl  SPARC v9 32-bit mpn_sqr_diagonal.
   2
   3 dnl  Copyright 2001, 2003 Free Software Foundation, Inc.
   4
   5 dnl  This file is part of the GNU MP Library.
   6 dnl
   7 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
   8 dnl  it under the terms of either:
   9 dnl
  10 dnl    * the GNU Lesser General Public License as published by the Free
  11 dnl      Software Foundation; either version 3 of the License, or (at your
  12 dnl      option) any later version.
  13 dnl
  14 dnl  or
  15 dnl
  16 dnl    * the GNU General Public License as published by the Free Software
  17 dnl      Foundation; either version 2 of the License, or (at your option) any
  18 dnl      later version.
  19 dnl
  20 dnl  or both in parallel, as here.
  21 dnl
  22 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
  23 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  24 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  25 dnl  for more details.
  26 dnl
  27 dnl  You should have received copies of the GNU General Public License and the
  28 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
  29 dnl  see https://www.gnu.org/licenses/.
  30
  31
  32 include(`../config.m4')
  33
  34 C INPUT PARAMETERS
  35 C rp    i0
  36 C up    i1
  37 C n     i2
  38
  39 C This code uses a very deep software pipeline, because data must be moved
  40 C back and forth between the integer registers and floating-point registers.
  41 C
  42 C A VIS variant of this code would make the pipeline less deep, since the
  43 C masking now done in the integer unit could take place in the floating-point
  44 C unit using the FAND instruction.  It would be possible to save several cycles
  45 C too.
  46 C
  47 C On UltraSPARC 1 and 2, this code runs at 11 cycles/limb from the Dcache and
  48 C not much slower from the Ecache.  It would perhaps be possible to shave off
  49 C one cycle, but not easily.  We cannot do better than 10 cycles/limb with the
  50 C instructions used, since we have 10 memory operations per limb.  But a VIS
  51 C variant could run three cycles faster than the corresponding non-VIS code.
  52
  53 C This is non-pipelined code showing the algorithm:
  54 C
  55 C .Loop:
  56 C       lduw    [up+0],%g4              C 00000000hhhhllll
  57 C       sllx    %g4,16,%g3              C 0000hhhhllll0000
  58 C       or      %g3,%g4,%g2             C 0000hhhhXXXXllll
  59 C       andn    %g2,%g5,%g2             C 0000hhhh0000llll
  60 C       stx     %g2,[%fp+80]
  61 C       ldd     [%fp+80],%f0
  62 C       fitod   %f0,%f4                 C hi16
  63 C       fitod   %f1,%f6                 C lo16
  64 C       ld      [up+0],%f9
  65 C       fxtod   %f8,%f2
  66 C       fmuld   %f2,%f4,%f4
  67 C       fmuld   %f2,%f6,%f6
  68 C       fdtox   %f4,%f4
  69 C       fdtox   %f6,%f6
  70 C       std     %f4,[%fp-24]
  71 C       std     %f6,[%fp-16]
  72 C       ldx     [%fp-24],%g2
  73 C       ldx     [%fp-16],%g1
  74 C       sllx    %g2,16,%g2
  75 C       add     %g2,%g1,%g1
  76 C       stw     %g1,[rp+0]
  77 C       srlx    %g1,32,%l0
  78 C       stw     %l0,[rp+4]
  79 C       add     up,4,up
  80 C       subcc   n,1,n
  81 C       bne,pt  %icc,.Loop
  82 C       add     rp,8,rp
  83
  84 define(`fanop',`fitod %f12,%f10')       dnl  A quasi nop running in the FA pipe; %f10 and %f12 have no other use in the code below
  85
  86 ASM_START()
  87 C .Lnoll is a 32-bit zero kept in TEXT; the PIC code addresses it relative to .Lpc.
  88         TEXT
  89         ALIGN(4)
  90 .Lnoll:
  91         .word   0                       C loaded into %f8, the high word of every fxtod %f8 operand
  92
  93 PROLOGUE(mpn_sqr_diagonal)
  94         save    %sp,-256,%sp            C new register window and 256-byte stack frame
  95 C Fetch the zero word at .Lnoll into %f8: with a limb in %f9, the pair %f8:%f9 is that limb zero-extended.
  96 ifdef(`PIC',
  97 `.Lpc:  rd      %pc,%o7
  98         ld      [%o7+.Lnoll-.Lpc],%f8',
  99 `       sethi   %hi(.Lnoll),%g1
 100         ld      [%g1+%lo(.Lnoll)],%f8')
 101 C %g5 masks bits 16..31.  up is biased by -8: lduw [%i1+8] (integer side) runs two limbs ahead of ld [%i1] (FP side).
 102         sethi   %hi(0xffff0000),%g5     C %g5 = 0xffff0000
 103         add     %i1,-8,%i1
 104 C Split limb 0 into 16-bit halves.  n is assumed >= 1: the limb is loaded before n is first tested.
 105         lduw    [%i1+8],%g4             C limb 0, zero-extended
 106         add     %i1,4,%i1               C s1_ptr++
 107         sllx    %g4,16,%g3              C 0000hhhhllll0000
 108         or      %g3,%g4,%g2             C 0000hhhhXXXXllll
 109         subcc   %i2,1,%i2               C n--
 110         bne,pt  %icc,.L_grt_1           C more than one limb
 111         andn    %g2,%g5,%g2             C 0000hhhh0000llll
 112 C n = 1: square the only limb and write it out at .L1.
 113         add     %i1,4,%i1               C s1_ptr++
 114         stx     %g2,[%fp+80]            C spill split limb for the FPU
 115         ld      [%i1],%f9               C the full limb
 116         ldd     [%fp+80],%f0            C %f0 = hi16, %f1 = lo16
 117         fxtod   %f8,%f2                 C limb as double
 118         fitod   %f0,%f4                 C hi16
 119         fitod   %f1,%f6                 C lo16
 120         fmuld   %f2,%f4,%f4             C limb * hi16
 121         fmuld   %f2,%f6,%f6             C limb * lo16
 122         fdtox   %f4,%f4                 C products back to 64-bit integers
 123         fdtox   %f6,%f6
 124         std     %f4,[%fp-24]            C p16
 125         std     %f6,[%fp-16]            C p0
 126 C .L1 reads only [%l4]; %l3, %l5 and %l6 are set but not used on this path.
 127         add     %fp, 80, %l3
 128         add     %fp, -24, %l4
 129         add     %fp, 72, %l5
 130         b       .L1
 131         add     %fp, -40, %l6
 132 C n >= 2: spill split limb 0, then split limb 1.
 133 .L_grt_1:
 134         stx     %g2,[%fp+80]
 135         lduw    [%i1+8],%g4
 136         add     %i1,4,%i1               C s1_ptr++
 137         sllx    %g4,16,%g3              C 0000hhhhllll0000
 138         or      %g3,%g4,%g2             C 0000hhhhXXXXllll
 139         subcc   %i2,1,%i2
 140         bne,pt  %icc,.L_grt_2
 141         andn    %g2,%g5,%g2             C 0000hhhh0000llll
 142 C n = 2: run both limbs through the FPU; the last conversion and the stores happen at .L2.
 143         stx     %g2,[%fp+72]            C split limb 1 uses the second operand slot
 144         ld      [%i1],%f9               C limb 0
 145         add     %i1,4,%i1               C s1_ptr++
 146         ldd     [%fp+80],%f0            C halves of limb 0
 147         fxtod   %f8,%f2
 148         fitod   %f0,%f4
 149         fitod   %f1,%f6
 150         fmuld   %f2,%f4,%f4
 151         ld      [%i1],%f9               C limb 1
 152         fmuld   %f2,%f6,%f6
 153         ldd     [%fp+72],%f0            C halves of limb 1
 154         fdtox   %f4,%f4
 155         fdtox   %f6,%f6
 156         std     %f4,[%fp-24]            C p16 of limb 0
 157         fxtod   %f8,%f2                 C limb 1 as double
 158         std     %f6,[%fp-16]            C p0 of limb 0
 159         fitod   %f0,%f4
 160         fitod   %f1,%f6
 161         fmuld   %f2,%f4,%f4
 162         fmuld   %f2,%f6,%f6
 163         fdtox   %f4,%f4
 164 C Limb 1's products go to [%l4] = fp-40 at .L2; limb 0's are read back through %l6.
 165         add     %fp, 72, %l3
 166         add     %fp, -40, %l4
 167         add     %fp, 80, %l5
 168         b       .L2
 169         add     %fp, -24, %l6
 170 C n >= 3: start limb 0 in the FPU and split limb 2.
 171 .L_grt_2:
 172         stx     %g2,[%fp+72]
 173         lduw    [%i1+8],%g4
 174         ld      [%i1],%f9
 175         add     %i1,4,%i1               C s1_ptr++
 176         ldd     [%fp+80],%f0
 177         sllx    %g4,16,%g3              C 0000hhhhllll0000
 178         or      %g3,%g4,%g2             C 0000hhhhXXXXllll
 179         subcc   %i2,1,%i2
 180         fxtod   %f8,%f2
 181         bne,pt  %icc,.L_grt_3
 182         andn    %g2,%g5,%g2             C 0000hhhh0000llll
 183 C n = 3: remaining work is drained from .L3.
 184         stx     %g2,[%fp+80]            C split limb 2 reuses limb 0's slot (already loaded into %f0)
 185         fitod   %f0,%f4
 186         fitod   %f1,%f6
 187         fmuld   %f2,%f4,%f4
 188         ld      [%i1],%f9
 189         fmuld   %f2,%f6,%f6
 190         add     %i1,4,%i1               C s1_ptr++
 191         ldd     [%fp+72],%f0
 192         fdtox   %f4,%f4
 193         fdtox   %f6,%f6
 194         std     %f4,[%fp-24]
 195         fxtod   %f8,%f2
 196         std     %f6,[%fp-16]
 197         fitod   %f0,%f4
 198         fitod   %f1,%f6
 199         fmuld   %f2,%f4,%f4
 200         ld      [%i1],%f9
 201         add     %fp, 80, %l3
 202         fmuld   %f2,%f6,%f6
 203         add     %fp, -24, %l4
 204         ldd     [%fp+80],%f0
 205         add     %fp, 72, %l5
 206         fdtox   %f4,%f4
 207         b       .L3
 208         add     %fp, -40, %l6
 209 C n >= 4: multiply limb 0, fetch the halves of limb 1, split limb 3.
 210 .L_grt_3:
 211         stx     %g2,[%fp+80]
 212         fitod   %f0,%f4
 213         lduw    [%i1+8],%g4
 214         fitod   %f1,%f6
 215         fmuld   %f2,%f4,%f4
 216         ld      [%i1],%f9
 217         fmuld   %f2,%f6,%f6
 218         add     %i1,4,%i1               C s1_ptr++
 219         ldd     [%fp+72],%f0
 220         fdtox   %f4,%f4
 221         sllx    %g4,16,%g3              C 0000hhhhllll0000
 222         fdtox   %f6,%f6
 223         or      %g3,%g4,%g2             C 0000hhhhXXXXllll
 224         subcc   %i2,1,%i2
 225         std     %f4,[%fp-24]
 226         fxtod   %f8,%f2
 227         std     %f6,[%fp-16]
 228         bne,pt  %icc,.L_grt_4
 229         andn    %g2,%g5,%g2             C 0000hhhh0000llll
 230 C n = 4: remaining work is drained from .L4.
 231         stx     %g2,[%fp+72]
 232         fitod   %f0,%f4
 233         fitod   %f1,%f6
 234         add     %fp, 72, %l3
 235         fmuld   %f2,%f4,%f4
 236         add     %fp, -40, %l4
 237         ld      [%i1],%f9
 238         fmuld   %f2,%f6,%f6
 239         add     %i1,4,%i1               C s1_ptr++
 240         ldd     [%fp+80],%f0
 241         add     %fp, 80, %l5
 242         fdtox   %f4,%f4
 243         b       .L4
 244         add     %fp, -24, %l6
 245 C n >= 5: multiply limb 1, fetch the halves of limb 2, split limb 4.
 246 .L_grt_4:
 247         stx     %g2,[%fp+72]
 248         fitod   %f0,%f4
 249         lduw    [%i1+8],%g4
 250         fitod   %f1,%f6
 251         fmuld   %f2,%f4,%f4
 252         ld      [%i1],%f9
 253         fmuld   %f2,%f6,%f6
 254         add     %i1,4,%i1               C s1_ptr++
 255         ldd     [%fp+80],%f0
 256         fdtox   %f4,%f4
 257         sllx    %g4,16,%g3              C 0000hhhhllll0000
 258         fdtox   %f6,%f6
 259         or      %g3,%g4,%g2             C 0000hhhhXXXXllll
 260         subcc   %i2,1,%i2
 261         std     %f4,[%fp-40]            C second product slot pair (p16)
 262         fxtod   %f8,%f2
 263         std     %f6,[%fp-32]            C second product slot pair (p0)
 264         be,pn   %icc,.L5                C n = 5: straight to the tail
 265         andn    %g2,%g5,%g2             C 0000hhhh0000llll
 266 C n > 5: enter the loop.  b,a annuls its delay slot.
 267         b,a     .Loop
 268 C Each "C ---" group below holds four instructions; the nops pad the groups (presumably one dispatch group each).
 269         .align  16
 270 C --- LOOP BEGIN (unrolled twice; the halves alternate the operand slots and the product slot pairs)
 271 .Loop:  nop
 272         nop
 273         stx     %g2,[%fp+80]            C spill split limb
 274         fitod   %f0,%f4                 C hi16
 275 C ---
 276         nop
 277         nop
 278         lduw    [%i1+8],%g4             C limb two ahead of the ld [%i1] below
 279         fitod   %f1,%f6                 C lo16
 280 C ---
 281         nop
 282         nop
 283         ldx     [%fp-24],%g2            C p16
 284         fanop
 285 C ---
 286         nop
 287         nop
 288         ldx     [%fp-16],%g1            C p0
 289         fmuld   %f2,%f4,%f4             C limb * hi16
 290 C ---
 291         sllx    %g2,16,%g2              C align p16
 292         add     %i0,8,%i0               C res_ptr++
 293         ld      [%i1],%f9               C limb for the next fxtod
 294         fmuld   %f2,%f6,%f6             C limb * lo16
 295 C ---
 296         add     %g2,%g1,%g1             C add p16 to p0 (ADD1)
 297         add     %i1,4,%i1               C s1_ptr++
 298         ldd     [%fp+72],%f0            C halves spilled by the previous half (or the entry code)
 299         fanop
 300 C ---
 301         srlx    %g1,32,%l0
 302         nop
 303         stw     %g1,[%i0-8]             C low word of the square
 304         fdtox   %f4,%f4
 305 C ---
 306         sllx    %g4,16,%g3              C 0000hhhhllll0000
 307         nop
 308         stw     %l0,[%i0-4]             C high word of the square
 309         fdtox   %f6,%f6
 310 C ---
 311         or      %g3,%g4,%g2             C 0000hhhhXXXXllll
 312         subcc   %i2,1,%i2
 313         std     %f4,[%fp-24]            C new p16 replaces the one just consumed
 314         fxtod   %f8,%f2
 315 C ---
 316         std     %f6,[%fp-16]            C new p0
 317         andn    %g2,%g5,%g2             C 0000hhhh0000llll
 318         be,pn   %icc,.Lend              C count exhausted in the first half
 319         fanop                           C delay slot
 320 C ---  LOOP MIDDLE
 321         nop
 322         nop
 323         stx     %g2,[%fp+72]            C spill split limb (other operand slot)
 324         fitod   %f0,%f4
 325 C ---
 326         nop
 327         nop
 328         lduw    [%i1+8],%g4
 329         fitod   %f1,%f6
 330 C ---
 331         nop
 332         nop
 333         ldx     [%fp-40],%g2            C p16
 334         fanop
 335 C ---
 336         nop
 337         nop
 338         ldx     [%fp-32],%g1            C p0
 339         fmuld   %f2,%f4,%f4
 340 C ---
 341         sllx    %g2,16,%g2              C align p16
 342         add     %i0,8,%i0               C res_ptr++
 343         ld      [%i1],%f9
 344         fmuld   %f2,%f6,%f6
 345 C ---
 346         add     %g2,%g1,%g1             C add p16 to p0 (ADD1)
 347         add     %i1,4,%i1               C s1_ptr++
 348         ldd     [%fp+80],%f0
 349         fanop
 350 C ---
 351         srlx    %g1,32,%l0
 352         nop
 353         stw     %g1,[%i0-8]
 354         fdtox   %f4,%f4
 355 C ---
 356         sllx    %g4,16,%g3              C 0000hhhhllll0000
 357         nop
 358         stw     %l0,[%i0-4]
 359         fdtox   %f6,%f6
 360 C ---
 361         or      %g3,%g4,%g2             C 0000hhhhXXXXllll
 362         subcc   %i2,1,%i2
 363         std     %f4,[%fp-40]
 364         fxtod   %f8,%f2
 365 C ---
 366         std     %f6,[%fp-32]
 367         andn    %g2,%g5,%g2             C 0000hhhh0000llll
 368         bne,pt  %icc,.Loop
 369         fanop                           C delay slot
 370 C --- LOOP END
 371 C Exit after the second half (and for n = 5): next operand slot is fp+80, oldest products are at fp-24.
 372 .L5:    add     %fp, 80, %l3
 373         add     %fp, -24, %l4
 374         add     %fp, 72, %l5
 375         b       .Ltail
 376         add     %fp, -40, %l6
 377 C Exit after the first half: the two slot pairs swap roles.
 378 .Lend:  add     %fp, 72, %l3
 379         add     %fp, -40, %l4
 380         add     %fp, 80, %l5
 381         add     %fp, -24, %l6
 382 .Ltail: stx     %g2,[%l3]               C split form of the final limb; five squares still to store
 383         fitod   %f0,%f4
 384         fitod   %f1,%f6
 385         ldx     [%l4],%g2               C p16
 386         ldx     [%l4+8],%g1             C p0
 387         fmuld   %f2,%f4,%f4
 388         sllx    %g2,16,%g2              C align p16
 389         add     %i0,8,%i0               C res_ptr++
 390         ld      [%i1],%f9
 391         fmuld   %f2,%f6,%f6
 392         add     %g2,%g1,%g1             C add p16 to p0 (ADD1)
 393         add     %i1,4,%i1               C s1_ptr++
 394         ldd     [%l5],%f0
 395         srlx    %g1,32,%l0
 396         stw     %g1,[%i0-8]
 397         fdtox   %f4,%f4
 398         stw     %l0,[%i0-4]
 399 .L4:    fdtox   %f6,%f6                 C entry for n = 4; four squares still to store
 400         std     %f4,[%l4]
 401         fxtod   %f8,%f2
 402         std     %f6,[%l4+8]
 403 C The stages below repeat the loop step with progressively fewer loads and FP operations.
 404         fitod   %f0,%f4
 405         fitod   %f1,%f6
 406         ldx     [%l6],%g2               C p16
 407         ldx     [%l6+8],%g1             C p0
 408         fmuld   %f2,%f4,%f4
 409         sllx    %g2,16,%g2              C align p16
 410         add     %i0,8,%i0               C res_ptr++
 411         ld      [%i1],%f9               C final limb
 412         fmuld   %f2,%f6,%f6
 413         add     %g2,%g1,%g1             C add p16 to p0 (ADD1)
 414         ldd     [%l3],%f0               C halves of the final limb
 415         srlx    %g1,32,%l0
 416         stw     %g1,[%i0-8]
 417         fdtox   %f4,%f4
 418         stw     %l0,[%i0-4]
 419 .L3:    fdtox   %f6,%f6                 C entry for n = 3; three squares still to store
 420         std     %f4,[%l6]
 421         fxtod   %f8,%f2
 422         std     %f6,[%l6+8]
 423
 424         fitod   %f0,%f4
 425         fitod   %f1,%f6
 426         ldx     [%l4],%g2               C p16
 427         ldx     [%l4+8],%g1             C p0
 428         fmuld   %f2,%f4,%f4
 429         sllx    %g2,16,%g2              C align p16
 430         add     %i0,8,%i0               C res_ptr++
 431         fmuld   %f2,%f6,%f6
 432         add     %g2,%g1,%g1             C add p16 to p0 (ADD1)
 433         srlx    %g1,32,%l0
 434         stw     %g1,[%i0-8]
 435         fdtox   %f4,%f4
 436         stw     %l0,[%i0-4]
 437 .L2:    fdtox   %f6,%f6                 C entry for n = 2; two squares still to store
 438         std     %f4,[%l4]
 439         std     %f6,[%l4+8]
 440 C No FP arithmetic from here on; only the stored products remain to be combined.
 441         ldx     [%l6],%g2               C p16
 442         ldx     [%l6+8],%g1             C p0
 443         sllx    %g2,16,%g2              C align p16
 444         add     %i0,8,%i0               C res_ptr++
 445         add     %g2,%g1,%g1             C add p16 to p0 (ADD1)
 446         srlx    %g1,32,%l0
 447         stw     %g1,[%i0-8]
 448         stw     %l0,[%i0-4]
 449
 450 .L1:    ldx     [%l4],%g2               C p16
 451         ldx     [%l4+8],%g1             C p0
 452         sllx    %g2,16,%g2              C align p16
 453         add     %i0,8,%i0               C res_ptr++
 454         add     %g2,%g1,%g1             C add p16 to p0 (ADD1)
 455         srlx    %g1,32,%l0
 456         stw     %g1,[%i0-8]             C low word of the last square
 457         stw     %l0,[%i0-4]             C high word of the last square
 458
 459         ret
 460         restore %g0,%g0,%o0             C delay slot: pop the window; caller's %o0 = 0
 461
 462 EPILOGUE(mpn_sqr_diagonal)