dnl  SPARC v9 32-bit mpn_mul_1 -- Multiply a limb vector with a limb and store
dnl  the result in a second limb vector.

dnl  Copyright 1998, 2000, 2001, 2003 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C Algorithm: We use two floating-point multiplies per limb product, with the
C invariant v operand split into two 16-bit pieces, and the u operand split
C into 32-bit pieces.  We convert the two 48-bit products and transfer them to
C the integer unit.
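
C As a rough illustration, the same per-limb computation in C (a sketch for
C reference only; the function name is hypothetical and this is not part of
C the build):
C
C   #include <stdint.h>
C   uint32_t mul_1_model (uint32_t *rp, const uint32_t *up, long n, uint32_t v)
C   {
C     uint64_t v0 = v & 0xffff, v16 = v >> 16, cy = 0;
C     for (long i = 0; i < n; i++)
C       {
C         uint64_t p0  = up[i] * v0;            /* <= 48 bits, exact in a double */
C         uint64_t p16 = up[i] * v16;           /* <= 48 bits, exact in a double */
C         uint64_t p   = p0 + (p16 << 16) + cy; /* cannot overflow 64 bits */
C         rp[i] = (uint32_t) p;                 /* low 32 bits go to rp[] */
C         cy = p >> 32;                         /* high bits become the carry */
C       }
C     return (uint32_t) cy;
C   }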

C                   cycles/limb
C UltraSPARC 1&2:      6.5
C UltraSPARC 3:         ?

C Possible optimizations:
C   1. Combine 32-bit memory operations into 64-bit operations.  Since we're
C      memory bandwidth limited, this could save 1.5 cycles/limb.
C   2. Unroll the inner loop.  Since we already use alternate temporary areas,
C      it is very straightforward to unroll, using an exit branch midway.
C      Unrolling would allow deeper scheduling, which could improve speed for
C      the L2 cache case.
C   3. For mpn_mul_1: Use more alternating temp areas.  The std and ldx
C      instructions aren't scheduled sufficiently far apart with just two
C      temp areas.
C   4. Specialize for particular v values.  If v's upper 16 bits are zero, we
C      could save many operations (see the sketch below).
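
C For example, when the upper 16 bits of v are zero, p16 is identically zero
C and a single product per limb suffices.  In terms of the hypothetical C
C model above, the loop body would reduce to (again only a sketch):
C
C   if (v >> 16 == 0)
C     for (long i = 0; i < n; i++)
C       {
C         uint64_t p = (uint64_t) up[i] * v + cy;  /* one 48-bit product */
C         rp[i] = (uint32_t) p;
C         cy = p >> 32;
C       }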

C INPUT PARAMETERS
C rp	o0
C up	o1
C n	o2
C v	o3
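
C The corresponding C-level interface (GMP's documented mpn prototype; limbs
C are 32 bits in this configuration):
C
C   mp_limb_t mpn_mul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t v);
C
C The return value is the carry limb out of the top of the product.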

define(`FSIZE',224)

ASM_START()
PROLOGUE(mpn_mul_1)
	add	%sp, -FSIZE, %sp	C allocate stack scratch space
	sethi	%hi(0xffff), %g1
	srl	%o3, 16, %g2		C g2 = v >> 16
	or	%g1, %lo(0xffff), %g1
	and	%o3, %g1, %g1		C g1 = v & 0xffff
	stx	%g1, [%sp+104]
	stx	%g2, [%sp+112]
	ldd	[%sp+104], %f6
	ldd	[%sp+112], %f8
	fxtod	%f6, %f6		C f6 = (double) (v & 0xffff)
	fxtod	%f8, %f8		C f8 = (double) (v >> 16)
	ld	[%sp+104], %f10	C zero f10 (high half of the f10:f11 pair)

	mov	0, %g3		C cy = 0

	define(`fanop', `fitod %f18, %f0')	C A quasi nop running in the FA pipe

	add	%sp, 160, %o5		C point in scratch area
	and	%o5, -32, %o5		C align at 0 (mod 32) in scratch area

	subcc	%o2, 1, %o2
	ld	[%o1], %f11	C read up[i]
	add	%o1, 4, %o1	C up++
	bne,pt	%icc, .L_two_or_more
	fxtod	%f10, %f2	C f2 = (double) up[i], via the f10:f11 int pair

	fmuld	%f2, %f8, %f16
	fmuld	%f2, %f6, %f4
	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+16]
	std	%f12, [%o5+24]
	ldx	[%o5+16], %g2	C p16
	ldx	[%o5+24], %g1	C p0
	b	.L1
	add	%o0, -16, %o0

	.align	16
.L_two_or_more:
	subcc	%o2, 1, %o2
	ld	[%o1], %f11	C read up[i]
	fmuld	%f2, %f8, %f16
	fmuld	%f2, %f6, %f4
	add	%o1, 4, %o1	C up++
	bne,pt	%icc, .L_three_or_more
	fxtod	%f10, %f2

	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+16]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+24]
	fmuld	%f2, %f6, %f4
	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+0]
	std	%f12, [%o5+8]
	ldx	[%o5+16], %g2	C p16
	ldx	[%o5+24], %g1	C p0
	b	.L2
	add	%o0, -12, %o0

	.align	16
.L_three_or_more:
	subcc	%o2, 1, %o2
	ld	[%o1], %f11	C read up[i]
	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+16]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+24]
	fmuld	%f2, %f6, %f4
	add	%o1, 4, %o1	C up++
	bne,pt	%icc, .L_four_or_more
	fxtod	%f10, %f2

	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+0]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+8]
	fmuld	%f2, %f6, %f4
	fdtox	%f16, %f14
	ldx	[%o5+16], %g2	C p16
	fdtox	%f4, %f12
	ldx	[%o5+24], %g1	C p0
	std	%f14, [%o5+16]
	std	%f12, [%o5+24]
	b	.L3
	add	%o0, -8, %o0

	.align	16
.L_four_or_more:
	subcc	%o2, 1, %o2
	ld	[%o1], %f11	C read up[i]
	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+0]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+8]
	fmuld	%f2, %f6, %f4
	add	%o1, 4, %o1	C up++
	bne,pt	%icc, .L_five_or_more
	fxtod	%f10, %f2

	fdtox	%f16, %f14
	ldx	[%o5+16], %g2	C p16
	fdtox	%f4, %f12
	ldx	[%o5+24], %g1	C p0
	std	%f14, [%o5+16]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+24]
	fmuld	%f2, %f6, %f4
	add	%o1, 4, %o1	C up++
	b	.L4
	add	%o0, -4, %o0

	.align	16
.L_five_or_more:
	subcc	%o2, 1, %o2
	ld	[%o1], %f11	C read up[i]
	fdtox	%f16, %f14
	ldx	[%o5+16], %g2	C p16
	fdtox	%f4, %f12
	ldx	[%o5+24], %g1	C p0
	std	%f14, [%o5+16]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+24]
	fmuld	%f2, %f6, %f4
	add	%o1, 4, %o1	C up++
	bne,pt	%icc, .Loop
	fxtod	%f10, %f2
	b,a	.L5

C BEGIN MAIN LOOP
	.align	16
C --  0
.Loop:	nop
	subcc	%o2, 1, %o2
	ld	[%o1], %f11	C read up[i]
	fdtox	%f16, %f14
C --  1
	sllx	%g2, 16, %g4	C (p16 << 16)
	add	%o0, 4, %o0	C rp++
	ldx	[%o5+0], %g2	C p16
	fdtox	%f4, %f12
C --  2
	nop
	add	%g1, %g4, %g4	C p = p0 + (p16 << 16)
	ldx	[%o5+8], %g1	C p0
	fanop
C --  3
	nop
	add	%g3, %g4, %g4	C p += cy
	std	%f14, [%o5+0]
	fmuld	%f2, %f8, %f16
C --  4
	srlx	%g4, 32, %g3	C new cy
	add	%o1, 4, %o1	C up++
	std	%f12, [%o5+8]
	fmuld	%f2, %f6, %f4
C --  5
	xor	%o5, 16, %o5	C alternate scratch variables
	stw	%g4, [%o0-4]
	bne,pt	%icc, .Loop
	fxtod	%f10, %f2
C END MAIN LOOP

.L5:	fdtox	%f16, %f14
	sllx	%g2, 16, %g4	C (p16 << 16)
	ldx	[%o5+0], %g2	C p16
	fdtox	%f4, %f12
	add	%g1, %g4, %g4	C p = p0 + (p16 << 16)
	ldx	[%o5+8], %g1	C p0
	add	%g4, %g3, %g4	C p += cy
	std	%f14, [%o5+0]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+8]
	fmuld	%f2, %f6, %f4
	xor	%o5, 16, %o5
	stw	%g4, [%o0+0]
	srlx	%g4, 32, %g3	C new cy

.L4:	fdtox	%f16, %f14
	sllx	%g2, 16, %g4	C (p16 << 16)
	ldx	[%o5+0], %g2	C p16
	fdtox	%f4, %f12
	add	%g1, %g4, %g4	C p = p0 + (p16 << 16)
	ldx	[%o5+8], %g1	C p0
	add	%g3, %g4, %g4	C p += cy
	std	%f14, [%o5+0]
	std	%f12, [%o5+8]
	xor	%o5, 16, %o5
	stw	%g4, [%o0+4]
	srlx	%g4, 32, %g3	C new cy

.L3:	sllx	%g2, 16, %g4	C (p16 << 16)
	ldx	[%o5+0], %g2	C p16
	add	%g1, %g4, %g4	C p = p0 + (p16 << 16)
	ldx	[%o5+8], %g1	C p0
	add	%g3, %g4, %g4	C p += cy
	xor	%o5, 16, %o5
	stw	%g4, [%o0+8]
	srlx	%g4, 32, %g3	C new cy

.L2:	sllx	%g2, 16, %g4	C (p16 << 16)
	ldx	[%o5+0], %g2	C p16
	add	%g1, %g4, %g4	C p = p0 + (p16 << 16)
	ldx	[%o5+8], %g1	C p0
	add	%g3, %g4, %g4	C p += cy
	stw	%g4, [%o0+12]
	srlx	%g4, 32, %g3	C new cy

.L1:	sllx	%g2, 16, %g4	C (p16 << 16)
	add	%g1, %g4, %g4	C p = p0 + (p16 << 16)
	add	%g3, %g4, %g4	C p += cy
	stw	%g4, [%o0+16]
	srlx	%g4, 32, %g3	C new cy

	mov	%g3, %o0	C return the carry limb
	retl
	sub	%sp, -FSIZE, %sp	C restore stack pointer (delay slot)
EPILOGUE(mpn_mul_1)