source/libs/gmp/gmp-src/mpn/sparc32/v9/submul_1.asm

   1 dnl  SPARC v9 32-bit mpn_submul_1 -- Multiply a limb vector with a limb and
   2 dnl  subtract the result from a second limb vector.
   3
   4 dnl  Copyright 1998, 2000, 2001, 2003 Free Software Foundation, Inc.
   5
   6 dnl  This file is part of the GNU MP Library.
   7 dnl
   8 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
   9 dnl  it under the terms of either:
  10 dnl
  11 dnl    * the GNU Lesser General Public License as published by the Free
  12 dnl      Software Foundation; either version 3 of the License, or (at your
  13 dnl      option) any later version.
  14 dnl
  15 dnl  or
  16 dnl
  17 dnl    * the GNU General Public License as published by the Free Software
  18 dnl      Foundation; either version 2 of the License, or (at your option) any
  19 dnl      later version.
  20 dnl
  21 dnl  or both in parallel, as here.
  22 dnl
  23 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
  24 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  25 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  26 dnl  for more details.
  27 dnl
  28 dnl  You should have received copies of the GNU General Public License and the
  29 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
  30 dnl  see https://www.gnu.org/licenses/.
  31
  32 include(`../config.m4')
  33
  34 C Algorithm: We use two floating-point multiplies per limb product, with the
  35 C invariant v operand split into two 16-bit pieces, and the u operand split
  36 C into 32-bit pieces.  We convert the two 48-bit products and transfer them to
  37 C the integer unit.
  38
  39 C                  cycles/limb
  40 C UltraSPARC 1&2:     6.5
  41 C UltraSPARC 3:       ?
  42
  43 C Possible optimizations:
  44 C   1. Combine 32-bit memory operations into 64-bit operations.  Since we're
  45 C      memory bandwidth limited, this could save 1.5 cycles/limb.
  46 C   2. Unroll the inner loop.  Since we already use alternate temporary areas,
  47 C      it is very straightforward to unroll, using an exit branch midway.
  48 C      Unrolling would allow deeper scheduling which could improve speed for L2
  49 C      cache case.
  50 C   3. For mpn_mul_1: Use more alternating temp areas.  The std'es and ldx'es
  51 C      aren't scheduled far enough apart with just two temp areas.
  52 C   4. Specialize for particular v values.  If its upper 16 bits are zero, we
  53 C      could save many operations.
  54
  55 C INPUT PARAMETERS (no register window is opened, so they stay in %o regs)
  56 C rp    o0
  57 C up    o1
  58 C n     o2
  59 C v     o3
  60
  61 define(`FSIZE',224)
  62
  63 ASM_START()
  64 PROLOGUE(mpn_submul_1)
     C Subtract the product of the n-limb vector at up and the limb v from the
     C n-limb vector at rp, one 32-bit limb at a time, and return the borrow
     C out of the most significant limb in %o0.
     C
     C No save instruction is executed, so the arguments are used in place:
     C   %o0 = rp, %o1 = up, %o2 = n, %o3 = v
     C
     C Register usage:
     C   %f6        (double) (v & 0xffff)
     C   %f8        (double) (v >> 16)
     C   %f10/%f11  zero / up[i]; the pair is converted to a double in %f2
     C   %f4, %f16  products up[i] * low half of v and up[i] * high half of v
     C   %f12, %f14 the same products converted back to 64-bit integers
     C   %g1, %g2   p0 and p16, the products reloaded into integer registers
     C   %g3        cy, derived from the upper half of the previous difference
     C   %g4        p, then the 64-bit difference rp[i] - p
     C   %g5        rp[i]
     C   %o5        pointer into the 32-byte aligned scratch area; xor with 16
     C              switches between its two 16-byte halves
     C
     C The code before .Loop starts the pipeline and branches into the
     C wind-down code at .L1 ... .L5 when n is 1 ... 5; rp is biased on those
     C paths so that the fixed store offsets used in the wind-down code address
     C rp[0] first.
  65         add     %sp, -FSIZE, %sp        C reserve FSIZE bytes of stack for scratch
  66         sethi   %hi(0xffff), %g1
  67         srl     %o3, 16, %g2            C g2 = v >> 16
  68         or      %g1, %lo(0xffff), %g1   C g1 = 0xffff
  69         and     %o3, %g1, %g1           C g1 = v & 0xffff
  70         stx     %g1, [%sp+104]          C move both halves of v to the FPU via memory
  71         stx     %g2, [%sp+112]
  72         ldd     [%sp+104], %f6
  73         ldd     [%sp+112], %f8
  74         fxtod   %f6, %f6                C f6 = (double) (v & 0xffff)
  75         fxtod   %f8, %f8                C f8 = (double) (v >> 16)
  76         ld      [%sp+104], %f10         C zero f10 (high word of the stored v & 0xffff)
  77
  78         mov     0, %g3                  C cy = 0
  79
  80 define(`fanop', `fitod %f18, %f0')      C  A quasi nop running in the FA pipe
  81
  82         add     %sp, 160, %o5           C point in scratch area
  83         and     %o5, -32, %o5           C align at 0 (mod 32) in scratch area
  84
  85         subcc   %o2, 1, %o2             C n--, sets flags for the branch below
  86         ld      [%o1], %f11             C read up[i]
  87         add     %o1, 4, %o1             C up++
  88         bne,pt  %icc, .L_two_or_more
  89         fxtod   %f10, %f2               C (delay slot) f2 = (double) up[i]
  90
     C n == 1: compute both products and finish the single limb at .L1.
  91         fmuld   %f2, %f8, %f16          C f16 = up[i] * (v >> 16)
  92         fmuld   %f2, %f6, %f4           C f4 = up[i] * (v & 0xffff)
  93         fdtox   %f16, %f14              C convert products to 64-bit integers
  94         fdtox   %f4, %f12
  95         std     %f14, [%o5+16]          C pass products to the integer unit via scratch
  96         std     %f12, [%o5+24]
  97         ldx     [%o5+16], %g2           C p16
  98         ldx     [%o5+24], %g1           C p0
  99         lduw    [%o0], %g5              C read rp[i]
 100         b       .L1
 101         add     %o0, -16, %o0           C (delay slot) bias rp: .L1 stores at [%o0+16]
 102
 103         .align  16
 104 .L_two_or_more:
 105         subcc   %o2, 1, %o2
 106         ld      [%o1], %f11             C read up[i]
 107         fmuld   %f2, %f8, %f16
 108         fmuld   %f2, %f6, %f4
 109         add     %o1, 4, %o1             C up++
 110         bne,pt  %icc, .L_three_or_more
 111         fxtod   %f10, %f2
 112
     C n == 2: products of limb 0 go to the upper scratch half, those of limb 1
     C to the lower half, then finish at .L2.
 113         fdtox   %f16, %f14
 114         fdtox   %f4, %f12
 115         std     %f14, [%o5+16]
 116         fmuld   %f2, %f8, %f16
 117         std     %f12, [%o5+24]
 118         fmuld   %f2, %f6, %f4
 119         fdtox   %f16, %f14
 120         fdtox   %f4, %f12
 121         std     %f14, [%o5+0]
 122         std     %f12, [%o5+8]
 123         lduw    [%o0], %g5              C read rp[i]
 124         ldx     [%o5+16], %g2           C p16
 125         ldx     [%o5+24], %g1           C p0
 126         b       .L2
 127         add     %o0, -12, %o0           C (delay slot) bias rp: .L2 stores at [%o0+12]
 128
 129         .align  16
 130 .L_three_or_more:
 131         subcc   %o2, 1, %o2
 132         ld      [%o1], %f11             C read up[i]
 133         fdtox   %f16, %f14
 134         fdtox   %f4, %f12
 135         std     %f14, [%o5+16]
 136         fmuld   %f2, %f8, %f16
 137         std     %f12, [%o5+24]
 138         fmuld   %f2, %f6, %f4
 139         add     %o1, 4, %o1             C up++
 140         bne,pt  %icc, .L_four_or_more
 141         fxtod   %f10, %f2
 142
     C n == 3: finish at .L3.
 143         fdtox   %f16, %f14
 144         fdtox   %f4, %f12
 145         std     %f14, [%o5+0]
 146         fmuld   %f2, %f8, %f16
 147         std     %f12, [%o5+8]
 148         fmuld   %f2, %f6, %f4
 149         fdtox   %f16, %f14
 150         ldx     [%o5+16], %g2           C p16
 151         fdtox   %f4, %f12
 152         ldx     [%o5+24], %g1           C p0
 153         std     %f14, [%o5+16]
 154         std     %f12, [%o5+24]
 155         lduw    [%o0], %g5              C read rp[i]
 156         b       .L3
 157         add     %o0, -8, %o0            C (delay slot) bias rp: .L3 stores at [%o0+8]
 158
 159         .align  16
 160 .L_four_or_more:
 161         subcc   %o2, 1, %o2
 162         ld      [%o1], %f11             C read up[i]
 163         fdtox   %f16, %f14
 164         fdtox   %f4, %f12
 165         std     %f14, [%o5+0]
 166         fmuld   %f2, %f8, %f16
 167         std     %f12, [%o5+8]
 168         fmuld   %f2, %f6, %f4
 169         add     %o1, 4, %o1             C up++
 170         bne,pt  %icc, .L_five_or_more
 171         fxtod   %f10, %f2
 172
     C n == 4: finish at .L4.
 173         fdtox   %f16, %f14
 174         ldx     [%o5+16], %g2           C p16
 175         fdtox   %f4, %f12
 176         ldx     [%o5+24], %g1           C p0
 177         std     %f14, [%o5+16]
 178         fmuld   %f2, %f8, %f16
 179         std     %f12, [%o5+24]
 180         fmuld   %f2, %f6, %f4
 181         add     %o1, 4, %o1             C up++
 182         lduw    [%o0], %g5              C read rp[i]
 183         b       .L4
 184         add     %o0, -4, %o0            C (delay slot) bias rp: .L4 stores at [%o0+4]
 185
 186         .align  16
 187 .L_five_or_more:
 188         subcc   %o2, 1, %o2
 189         ld      [%o1], %f11             C read up[i]
 190         fdtox   %f16, %f14
 191         ldx     [%o5+16], %g2           C p16
 192         fdtox   %f4, %f12
 193         ldx     [%o5+24], %g1           C p0
 194         std     %f14, [%o5+16]
 195         fmuld   %f2, %f8, %f16
 196         std     %f12, [%o5+24]
 197         fmuld   %f2, %f6, %f4
 198         add     %o1, 4, %o1             C up++
 199         lduw    [%o0], %g5              C read rp[i]
 200         bne,pt  %icc, .Loop             C n >= 6: enter the main loop
 201         fxtod   %f10, %f2
 202         b,a     .L5                     C n == 5: go straight to the wind-down code
 203
 204 C BEGIN MAIN LOOP
     C Each pass stores one limb of the result.  The ldx pair reads the scratch
     C half at %o5 before the std pair of the same pass overwrites it, and %o5
     C then switches to the other half.
 205         .align 16
 206 C -- 0
 207 .Loop:  sub     %g0, %g3, %g3           C negate the upper half of the previous difference
 208         subcc   %o2, 1, %o2             C n--, flags are consumed by the bne at the bottom
 209         ld      [%o1], %f11             C read up[i]
 210         fdtox   %f16, %f14
 211 C -- 1
 212         sllx    %g2, 16, %g4            C (p16 << 16)
 213         add     %o0, 4, %o0             C rp++
 214         ldx     [%o5+0], %g2            C p16
 215         fdtox   %f4, %f12
 216 C -- 2
 217         srl     %g3, 0, %g3             C zero most significant 32 bits
 218         add     %g1, %g4, %g4           C p = p0 + (p16 << 16)
 219         ldx     [%o5+8], %g1            C p0
 220         fanop
 221 C -- 3
 222         nop
 223         add     %g3, %g4, %g4           C p += cy
 224         std     %f14, [%o5+0]
 225         fmuld   %f2, %f8, %f16
 226 C -- 4
 227         nop
 228         sub     %g5, %g4, %g4           C p = rp[i] - p
 229         std     %f12, [%o5+8]
 230         fmuld   %f2, %f6, %f4
 231 C -- 5
 232         xor     %o5, 16, %o5            C alternate scratch variables
 233         add     %o1, 4, %o1             C up++
 234         stw     %g4, [%o0-4]            C store low 32 bits (rp was already incremented)
 235         fanop
 236 C -- 6
 237         srlx    %g4, 32, %g3            C new cy
 238         lduw    [%o0], %g5              C read rp[i]
 239         bne,pt  %icc, .Loop
 240         fxtod   %f10, %f2               C (delay slot) f2 = (double) up[i]
 241 C END MAIN LOOP
 242
     C Wind-down code.  Each of .L5 ... .L1 completes one limb using the same
     C integer steps as the loop body, with fewer floating-point and scratch
     C operations at every stage.  rp is no longer advanced; the store offsets
     C grow by 4 per stage instead.  The sub that negates cy sits just before
     C each of the labels .L4 ... .L1, so the short-n entry paths, which arrive
     C with cy still 0, skip it.
 243 .L5:    sub     %g0, %g3, %g3
 244         fdtox   %f16, %f14
 245         sllx    %g2, 16, %g4            C (p16 << 16)
 246         ldx     [%o5+0], %g2            C p16
 247         fdtox   %f4, %f12
 248         srl     %g3, 0, %g3             C zero most significant 32 bits
 249         add     %g1, %g4, %g4           C p = p0 + (p16 << 16)
 250         ldx     [%o5+8], %g1            C p0
 251         add     %g4, %g3, %g4           C p += cy
 252         std     %f14, [%o5+0]
 253         fmuld   %f2, %f8, %f16
 254         sub     %g5, %g4, %g4           C p = rp[i] - p
 255         std     %f12, [%o5+8]
 256         fmuld   %f2, %f6, %f4
 257         xor     %o5, 16, %o5
 258         stw     %g4, [%o0+0]
 259         srlx    %g4, 32, %g3            C new cy
 260         lduw    [%o0+4], %g5            C read rp[i]
 261
 262         sub     %g0, %g3, %g3
 263 .L4:    fdtox   %f16, %f14
 264         sllx    %g2, 16, %g4            C (p16 << 16)
 265         ldx     [%o5+0], %g2            C p16
 266         fdtox   %f4, %f12
 267         srl     %g3, 0, %g3             C zero most significant 32 bits
 268         add     %g1, %g4, %g4           C p = p0 + (p16 << 16)
 269         ldx     [%o5+8], %g1            C p0
 270         add     %g3, %g4, %g4           C p += cy
 271         std     %f14, [%o5+0]
 272         sub     %g5, %g4, %g4           C p = rp[i] - p
 273         std     %f12, [%o5+8]
 274         xor     %o5, 16, %o5
 275         stw     %g4, [%o0+4]
 276         srlx    %g4, 32, %g3            C new cy
 277         lduw    [%o0+8], %g5            C read rp[i]
 278
 279         sub     %g0, %g3, %g3
 280 .L3:    sllx    %g2, 16, %g4            C (p16 << 16)
 281         ldx     [%o5+0], %g2            C p16
 282         srl     %g3, 0, %g3             C zero most significant 32 bits
 283         add     %g1, %g4, %g4           C p = p0 + (p16 << 16)
 284         ldx     [%o5+8], %g1            C p0
 285         add     %g3, %g4, %g4           C p += cy
 286         sub     %g5, %g4, %g4           C p = rp[i] - p
 287         xor     %o5, 16, %o5
 288         stw     %g4, [%o0+8]
 289         srlx    %g4, 32, %g3            C new cy
 290         lduw    [%o0+12], %g5           C read rp[i]
 291
 292         sub     %g0, %g3, %g3
 293 .L2:    sllx    %g2, 16, %g4            C (p16 << 16)
 294         ldx     [%o5+0], %g2            C p16
 295         srl     %g3, 0, %g3             C zero most significant 32 bits
 296         add     %g1, %g4, %g4           C p = p0 + (p16 << 16)
 297         ldx     [%o5+8], %g1            C p0
 298         add     %g3, %g4, %g4           C p += cy
 299         sub     %g5, %g4, %g4           C p = rp[i] - p
 300         stw     %g4, [%o0+12]
 301         srlx    %g4, 32, %g3            C new cy
 302         lduw    [%o0+16], %g5           C read rp[i]
 303
 304         sub     %g0, %g3, %g3
 305 .L1:    sllx    %g2, 16, %g4            C (p16 << 16)
 306         srl     %g3, 0, %g3             C zero most significant 32 bits
 307         add     %g1, %g4, %g4           C p = p0 + (p16 << 16)
 308         add     %g3, %g4, %g4           C p += cy
 309         sub     %g5, %g4, %g4           C p = rp[i] - p
 310         stw     %g4, [%o0+16]           C store the most significant result limb
 311         srlx    %g4, 32, %g3            C new cy
 312
 313         sub     %g0, %g3, %o0           C return value: negated upper half = borrow out
 314         retl
 315         sub     %sp, -FSIZE, %sp        C (delay slot) release the stack scratch area
 316 EPILOGUE(mpn_submul_1)