dnl  Alpha mpn_modexact_1c_odd -- mpn exact remainder

dnl  Copyright 2003, 2004 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C      cycles/limb
C EV4:    47
C EV5:    30
C EV6:    15

C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size, mp_limb_t d,
C                                mp_limb_t c)

C This code follows the "alternate" code in mpn/generic/mode1o.c,
C eliminating cbit+climb from the dependent chain.  This leaves,
C
C        ev4   ev5   ev6
C         1     3     1    subq   y = x - h
C        23    13     7    mulq   q = y * inverse
C        23    14     7    umulh  h = high (q * d)
C        --    --    --
C        47    30    15
C
C In each case, the load latency, loop control, and extra carry bit handling
C hide under the multiply latencies.  Those latencies are long enough that
C we don't need to worry about alignment or pairing to squeeze out
C performance.
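
C As a rough C sketch of that chain, carry bits included (names here are
C illustrative, not taken from mode1o.c, and umul_hi() is a hypothetical
C helper returning the high 64 bits of a 64x64-bit product):
C
C      do {
C        s = *src++;
C        x = s - cbit;    cbit  = (s < cbit);   /* subq, cmpult */
C        y = x - climb;   cbit += (x < climb);  /* subq, cmpult */
C        q = y * inv;                           /* mulq, low 64 bits */
C        climb = umul_hi (q, d);                /* umulh */
C      } while (--size != 0);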

C For the first limb, some of the loop code is broken out and scheduled back
C since it can be done earlier.

C   - The first ldq src[0] is near the start of the routine, for maximum
C     time from memory.

C   - The subq y=x-climb can be done without waiting for the inverse.

C   - The mulq y*inverse is replicated after the final subq for the inverse,
C     instead of branching to the mulq in the main loop.  On ev4 a branch
C     there would cost cycles, but we can hide them under the mulq latency.

C For the last limb, high<divisor is tested and if that's true a subtract
C and addback is done, as per the main mpn/generic/mode1o.c code.  This is a
C data-dependent branch, but we're waiting for umulh so any penalty should
C hide there.  The multiplies saved would be worth the cost anyway.
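
C In C terms, the high<divisor special case is roughly (sketch only, with
C illustrative names; s is the high limb):
C
C      if (s < d)
C        {
C          c = climb + cbit;
C          r = c - s;
C          if (c < s)     /* underflow: add back the divisor */
C            r += d;
C          return r;
C        }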

C Enhancements:

C For size==1, a plain division (done bitwise, say) might be faster than
C calculating an inverse, the latter taking about 130 cycles on ev4 or 70 on
C ev5.  A call to gcc's __remqu might be a possibility.

ASM_START()
PROLOGUE(mpn_modexact_1c_odd,gp)

	C r16	src
	C r17	size
	C r18	d
	C r19	c

	LEA(r0, binvert_limb_table)
	srl	r18, 1, r20		C d >> 1

	and	r20, 127, r20		C idx = d>>1 & 0x7F

	addq	r0, r20, r21		C table + idx

ifelse(bwx_available_p,1,
`	ldbu	r20, 0(r21)		C table[idx], inverse 8 bits
',`
	ldq_u	r20, 0(r21)		C table[idx] qword
	extbl	r20, r21, r20		C table[idx], inverse 8 bits
')
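
C Illustration: if d = 7 then idx = (7>>1) & 0x7F = 3, and the entry
C loaded is the 8-bit inverse of 7, namely 183 (0xB7), since
C 7 * 183 = 1281 == 1 mod 2^8.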

	mull	r20, r20, r7		C i*i
	addq	r20, r20, r20		C 2*i

	ldq	r2, 0(r16)		C x = s = src[0]
	lda	r17, -1(r17)		C size--
	clr	r0			C initial cbit=0

	mull	r7, r18, r7		C i*i*d

	subq	r20, r7, r20		C 2*i-i*i*d, inverse 16 bits

	mull	r20, r20, r7		C i*i
	addq	r20, r20, r20		C 2*i

	mull	r7, r18, r7		C i*i*d

	subq	r20, r7, r20		C 2*i-i*i*d, inverse 32 bits

	mulq	r20, r20, r7		C i*i
	addq	r20, r20, r20		C 2*i

	mulq	r7, r18, r7		C i*i*d
	subq	r2, r19, r3		C y = x - climb

	subq	r20, r7, r20		C inv = 2*i-i*i*d, inverse 64 bits

	ASSERT(r7,			C should have d*inv==1 mod 2^64
`	mulq	r18, r20, r7
	cmpeq	r7, 1, r7')
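
C The doubling steps above are the usual Newton iteration for an inverse
C mod 2^64: if i*d == 1 mod 2^k, then i' = 2*i - i*i*d gives
C i'*d == 1 mod 2^(2k).  A loose C sketch of the whole calculation
C (illustrative only, not the GMP binvert_limb macro):
C
C      uint64_t inv = binvert_limb_table[(d >> 1) & 0x7F]; /*  8 bits */
C      inv = 2*inv - inv*inv*d;                            /* 16 bits */
C      inv = 2*inv - inv*inv*d;                            /* 32 bits */
C      inv = 2*inv - inv*inv*d;                            /* 64 bits */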

	mulq	r3, r20, r4		C first q = y * inv

	beq	r17, L(one)		C if size==1
	br	L(entry)

L(top):
	C r0	cbit
	C r16	src, incrementing
	C r17	size, decrementing
	C r18	d
	C r19	climb
	C r20	inv

	ldq	r1, 0(r16)		C s = src[i]
	subq	r1, r0, r2		C x = s - cbit
	cmpult	r1, r0, r0		C new cbit = s < cbit

	subq	r2, r19, r3		C y = x - climb

	mulq	r3, r20, r4		C q = y * inv
L(entry):
	cmpult	r2, r19, r5		C cbit2 = x < climb
	addq	r5, r0, r0		C cbit += cbit2
	lda	r16, 8(r16)		C src++
	lda	r17, -1(r17)		C size--

	umulh	r4, r18, r19		C climb = q * d
	bne	r17, L(top)		C while 2 or more limbs left

	C r0	cbit
	C r18	d
	C r19	climb
	C r20	inv

	ldq	r1, 0(r16)		C s = src[size-1] high limb

	cmpult	r1, r18, r2		C test high<divisor
	bne	r2, L(skip)		C skip if so

	C can't skip a division, repeat loop code

	subq	r1, r0, r2		C x = s - cbit
	cmpult	r1, r0, r0		C new cbit = s < cbit

	subq	r2, r19, r3		C y = x - climb

	mulq	r3, r20, r4		C q = y * inv
L(one):
	cmpult	r2, r19, r5		C cbit2 = x < climb
	addq	r5, r0, r0		C cbit += cbit2

	umulh	r4, r18, r19		C climb = q * d

	addq	r19, r0, r0		C return climb + cbit
	ret	r31, (r26), 1

	ALIGN(8)
L(skip):
	C with high<divisor, the final step can be just (cbit+climb)-s and
	C an addback of d if that underflows

	addq	r19, r0, r19		C c = climb + cbit

	subq	r19, r1, r2		C c - s
	cmpult	r19, r1, r3		C c < s

	addq	r2, r18, r0		C return c-s + divisor

	cmoveq	r3, r2, r0		C return c-s if no underflow
	ret	r31, (r26), 1

EPILOGUE()
ASM_END()