dnl  SPARC v9 32-bit mpn_addmul_1 -- Multiply a limb vector with a limb and add
dnl  the result to a second limb vector.

dnl  Copyright 1998, 2000, 2001, 2003 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C Algorithm: We use two floating-point multiplies per limb product, with the
C invariant v operand split into two 16-bit pieces, and the u operand split
C into 32-bit pieces.  We convert the two 48-bit products and transfer them to
C the integer unit.
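C
C As a rough C sketch of one limb product (illustrative only; the variable
C names are ours, and the real code performs these multiplies on doubles in
C the FP unit after fxtod conversions):
C
C     uint64_t p0  = (uint64_t) u * (v & 0xffff);  /* fits in 48 bits */
C     uint64_t p16 = (uint64_t) u * (v >> 16);     /* fits in 48 bits */
C     uint64_t p   = p0 + (p16 << 16);             /* full u * v */
C
C Both 48-bit partial products are exactly representable in a double, which
C is what makes the floating-point route safe.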

C			cycles/limb
C UltraSPARC 1&2:	6.5
C UltraSPARC 3:		?

C Possible optimizations:
C   1. Combine 32-bit memory operations into 64-bit operations.  Since we're
C      memory bandwidth limited, this could save 1.5 cycles/limb.
C   2. Unroll the inner loop.  Since we already use alternate temporary areas,
C      it is very straightforward to unroll, using an exit branch midway.
C      Unrolling would allow deeper scheduling, which could improve speed for
C      the L2 cache case.
C   3. For mpn_mul_1: use more alternating temp areas.  The std and ldx
C      instructions aren't scheduled far enough apart with just two temp areas.
C   4. Specialize for particular v values.  If the upper 16 bits of v are
C      zero, we could save many operations.

C INPUT PARAMETERS
C rp	o0
C up	o1
C n	o2
C v	o3
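C
C In GMP's C interface this routine is (limbs are 32 bits here):
C
C     mp_limb_t mpn_addmul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t v);
C
C It returns the carry limb out of the most significant end of rp.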

define(`FSIZE',224)

ASM_START()
PROLOGUE(mpn_addmul_1)
	add	%sp, -FSIZE, %sp	C allocate stack scratch area
	sethi	%hi(0xffff), %g1
	srl	%o3, 16, %g2		C g2 = v16 = v >> 16
	or	%g1, %lo(0xffff), %g1
	and	%o3, %g1, %g1		C g1 = v0 = v & 0xffff
	stx	%g1, [%sp+104]		C park v0, v16 in memory...
	stx	%g2, [%sp+112]
	ldd	[%sp+104], %f6		C ...and load them into FP registers
	ldd	[%sp+112], %f8
	fxtod	%f6, %f6		C f6 = (double) v0
	fxtod	%f8, %f8		C f8 = (double) v16
	ld	[%sp+104], %f10		C zero f10

	mov	0, %g3			C cy = 0

define(`fanop', `fitod %f18, %f0')	C A quasi nop running in the FA pipe

	add	%sp, 160, %o5		C point in scratch area
	and	%o5, -32, %o5		C align at 0 (mod 32) in scratch area
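C (in C terms, roughly: scratch = (sp + 160) & ~(uintptr_t) 31)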

	subcc	%o2, 1, %o2
	ld	[%o1], %f11		C read up[i]
	add	%o1, 4, %o1		C up++
	bne,pt	%icc, .L_two_or_more
	fxtod	%f10, %f2		C u as double (delay slot)

	fmuld	%f2, %f8, %f16		C f16 = u * v16
	fmuld	%f2, %f6, %f4		C f4 = u * v0
	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+16]
	std	%f12, [%o5+24]
	ldx	[%o5+16], %g2		C p16
	ldx	[%o5+24], %g1		C p0
	lduw	[%o0], %g5		C read rp[i]
	b	.L1
	add	%o0, -16, %o0		C bias rp so .L1 stores to rp[0]

	.align	16
.L_two_or_more:
	subcc	%o2, 1, %o2
	ld	[%o1], %f11		C read up[i]
	fmuld	%f2, %f8, %f16
	fmuld	%f2, %f6, %f4
	add	%o1, 4, %o1		C up++
	bne,pt	%icc, .L_three_or_more
	fxtod	%f10, %f2

	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+16]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+24]
	fmuld	%f2, %f6, %f4
	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+0]
	std	%f12, [%o5+8]
	lduw	[%o0], %g5		C read rp[i]
	ldx	[%o5+16], %g2		C p16
	ldx	[%o5+24], %g1		C p0
	b	.L2
	add	%o0, -12, %o0

	.align	16
.L_three_or_more:
	subcc	%o2, 1, %o2
	ld	[%o1], %f11		C read up[i]
	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+16]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+24]
	fmuld	%f2, %f6, %f4
	add	%o1, 4, %o1		C up++
	bne,pt	%icc, .L_four_or_more
	fxtod	%f10, %f2

	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+0]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+8]
	fmuld	%f2, %f6, %f4
	fdtox	%f16, %f14
	ldx	[%o5+16], %g2		C p16
	fdtox	%f4, %f12
	ldx	[%o5+24], %g1		C p0
	std	%f14, [%o5+16]
	std	%f12, [%o5+24]
	lduw	[%o0], %g5		C read rp[i]
	b	.L3
	add	%o0, -8, %o0

	.align	16
.L_four_or_more:
	subcc	%o2, 1, %o2
	ld	[%o1], %f11		C read up[i]
	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+0]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+8]
	fmuld	%f2, %f6, %f4
	add	%o1, 4, %o1		C up++
	bne,pt	%icc, .L_five_or_more
	fxtod	%f10, %f2

	fdtox	%f16, %f14
	ldx	[%o5+16], %g2		C p16
	fdtox	%f4, %f12
	ldx	[%o5+24], %g1		C p0
	std	%f14, [%o5+16]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+24]
	fmuld	%f2, %f6, %f4
	add	%o1, 4, %o1		C up++
	lduw	[%o0], %g5		C read rp[i]
	b	.L4
	add	%o0, -4, %o0

	.align	16
.L_five_or_more:
	subcc	%o2, 1, %o2
	ld	[%o1], %f11		C read up[i]
	fdtox	%f16, %f14
	ldx	[%o5+16], %g2		C p16
	fdtox	%f4, %f12
	ldx	[%o5+24], %g1		C p0
	std	%f14, [%o5+16]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+24]
	fmuld	%f2, %f6, %f4
	add	%o1, 4, %o1		C up++
	lduw	[%o0], %g5		C read rp[i]
	bne,pt	%icc, .Loop
	fxtod	%f10, %f2
	b,a	.L5

C BEGIN MAIN LOOP
	.align	16
C --  0
.Loop:	nop
	subcc	%o2, 1, %o2
	ld	[%o1], %f11		C read up[i]
	fdtox	%f16, %f14
C --  1
	sllx	%g2, 16, %g4		C (p16 << 16)
	add	%o0, 4, %o0		C rp++
	ldx	[%o5+0], %g2		C p16
	fdtox	%f4, %f12
C --  2

	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
	ldx	[%o5+8], %g1		C p0
	fanop
C --  3

	add	%g3, %g4, %g4		C p += cy
	std	%f14, [%o5+0]
	fmuld	%f2, %f8, %f16
C --  4

	add	%g5, %g4, %g4		C p += rp[i]
	std	%f12, [%o5+8]
	fmuld	%f2, %f6, %f4
C --  5
	xor	%o5, 16, %o5		C alternate scratch variables
	add	%o1, 4, %o1		C up++
	stw	%g4, [%o0-4]
	fanop
C --  6
	srlx	%g4, 32, %g3		C new cy
	lduw	[%o0], %g5		C read rp[i]
	bne,pt	%icc, .Loop
	fxtod	%f10, %f2
C END MAIN LOOP
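C
C In C terms, each trip through the loop performs (a sketch only; the real
C code overlaps two iterations through the alternating scratch areas):
C
C     p  = p0 + (p16 << 16);	/* assemble the 64-bit product u * v */
C     p += cy;			/* add carry from the previous limb */
C     p += rp[i];		/* add the existing rp limb */
C     rp[i] = (uint32_t) p;	/* store the low 32 bits */
C     cy = p >> 32;		/* high bits become the next carry */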

.L5:	fdtox	%f16, %f14
	sllx	%g2, 16, %g4		C (p16 << 16)
	ldx	[%o5+0], %g2		C p16
	fdtox	%f4, %f12
	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
	ldx	[%o5+8], %g1		C p0
	add	%g4, %g3, %g4		C p += cy
	std	%f14, [%o5+0]
	fmuld	%f2, %f8, %f16
	add	%g5, %g4, %g4		C p += rp[i]
	std	%f12, [%o5+8]
	fmuld	%f2, %f6, %f4
	xor	%o5, 16, %o5
	stw	%g4, [%o0+0]
	srlx	%g4, 32, %g3		C new cy
	lduw	[%o0+4], %g5		C read rp[i]

.L4:	fdtox	%f16, %f14
	sllx	%g2, 16, %g4		C (p16 << 16)
	ldx	[%o5+0], %g2		C p16
	fdtox	%f4, %f12
	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
	ldx	[%o5+8], %g1		C p0
	add	%g3, %g4, %g4		C p += cy
	std	%f14, [%o5+0]
	add	%g5, %g4, %g4		C p += rp[i]
	std	%f12, [%o5+8]
	xor	%o5, 16, %o5
	stw	%g4, [%o0+4]
	srlx	%g4, 32, %g3		C new cy
	lduw	[%o0+8], %g5		C read rp[i]

.L3:	sllx	%g2, 16, %g4		C (p16 << 16)
	ldx	[%o5+0], %g2		C p16
	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
	ldx	[%o5+8], %g1		C p0
	add	%g3, %g4, %g4		C p += cy
	add	%g5, %g4, %g4		C p += rp[i]
	xor	%o5, 16, %o5
	stw	%g4, [%o0+8]
	srlx	%g4, 32, %g3		C new cy
	lduw	[%o0+12], %g5		C read rp[i]

.L2:	sllx	%g2, 16, %g4		C (p16 << 16)
	ldx	[%o5+0], %g2		C p16
	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
	ldx	[%o5+8], %g1		C p0
	add	%g3, %g4, %g4		C p += cy
	add	%g5, %g4, %g4		C p += rp[i]
	stw	%g4, [%o0+12]
	srlx	%g4, 32, %g3		C new cy
	lduw	[%o0+16], %g5		C read rp[i]

.L1:	sllx	%g2, 16, %g4		C (p16 << 16)
	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
	add	%g3, %g4, %g4		C p += cy
	add	%g5, %g4, %g4		C p += rp[i]
	stw	%g4, [%o0+16]
	srlx	%g4, 32, %g3		C new cy

	mov	%g3, %o0		C return the carry limb
	retl
	sub	%sp, -FSIZE, %sp	C deallocate scratch frame (delay slot)
EPILOGUE(mpn_addmul_1)