dnl  Itanium-2 mpn_modexact_1c_odd -- mpn by 1 exact remainder.

dnl  Contributed to the GNU project by Kevin Ryde.

dnl  Copyright 2003-2005 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
include(`../config.m4')


C            cycles/limb
C Itanium:      15
C Itanium 2:     8


dnl  Usage: ABI32(`code')
dnl
dnl  Emit the given code only under HAVE_ABI_32.
dnl
define(ABI32,
m4_assert_onearg()
`ifdef(`HAVE_ABI_32',`$1')')
C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size,
C                                mp_limb_t divisor, mp_limb_t carry);

C The modexact algorithm is usually conceived as a dependent chain

C       l = src[i] - c
C       q = low(l * inverse)
C       c = high(q*divisor) + (src[i]<c)

C but we can work the src[i]-c into an xma by calculating si=src[i]*inverse
C separately (off the dependent chain) and using

C       q = low(c * inverse + si)
C       c = high(q*divisor + c)

C This means the dependent chain is simply an xma.l followed by an xma.hu,
C for a total of 8 cycles/limb on Itanium 2.

C The reason xma.hu works for the new c is that the low of q*divisor is
C src[i]-c (that being the whole purpose of the q generated, and it can be
C verified algebraically).  If there is an underflow from src[i]-c, then
C there will be an overflow from (src[i]-c)+c, thereby adding 1 to the new
C c, the same as the borrow bit (src[i]<c) gives in the first style shown.
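
C Illustration (not part of the original file): a minimal C sketch of the
C rearranged recurrence, assuming a 64-bit mp_limb_t and a compiler that
C provides unsigned __int128 for the high-half multiply, with "inverse"
C satisfying divisor*inverse == 1 (mod 2^64).

C       mp_limb_t c = carry;
C       for (mp_size_t i = 0; i < size; i++)
C         {
C           mp_limb_t si = src[i] * inverse;    /* off the dependent chain */
C           mp_limb_t q  = si - c * inverse;    /* low((src[i]-c)*inverse) */
C           unsigned __int128 t = (unsigned __int128) q * divisor + c;
C           c = (mp_limb_t) (t >> 64);          /* high(q*divisor + c)     */
C         }
C       return c;

C (The actual code below keeps -inverse in a register so the subtraction
C folds into a single xma.l.)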
C Incidentally, fcmp is not an option for treating src[i]-c, since it
C apparently traps to the kernel for unnormalized operands like those used
C and generated by ldf8 and xma.  On one GNU/Linux system it took about 1200
C cycles.


C First Limb:

C The first limb uses q = (src[0]-c) * inverse, as shown in the first style.
C This lets us get the first q as soon as the inverse is ready, without
C going through si=s*inverse.  Basically at the start we have c and can use
C it while waiting for the inverse, whereas for the second and subsequent
C limbs it's the other way around, i.e. we have the inverse and are waiting
C for c.

C At .Lentry the first two instructions in the loop have been done already:
C the load of f11=src[1] at the start (predicated on size>=2), and the
C calculation of q by the initial different scheme.


C Entry Sequence:

C In the entry sequence, the critical path is the calculation of the
C inverse, so this is begun first and optimized.  Apart from that, ar.lc is
C established nice and early so the br.cloop's should predict perfectly.
C And the loads for the low limbs src[0] and src[1] can be initiated long
C ahead of where they're needed.


C Inverse Calculation:

C The initial 8-bit inverse is calculated using a table lookup.  If it hits
C L1 (which is likely if we're called several times) then it should take a
C total of 4 cycles, otherwise hopefully L2 for 9 cycles.  This is considered
C the best approach, on balance.  It could be done bitwise, but that would
C probably take about 14 cycles (2 per bit beyond the first couple).  Or it
C could be taken from 4 bits to 8 with xmpy doubling, as used beyond 8 bits,
C but that would be about 11 cycles.

C The table is not the same as binvert_limb_table; instead it's 256 bytes,
C designed to be indexed by the low byte of the divisor.  The divisor is
C always odd, so the relevant data is every second byte in the table.  The
C padding lets us use zxt1 instead of extr.u; the latter would cost an extra
C cycle because it must go down I0, and we're using the first I0 slot to get
C ip.  The extra 128 bytes of padding should be insignificant compared to
C typical ia64 code bloat.
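
C Illustration (not part of the original file): in C, the table lookup and
C the three xmpy/xma doubling steps done below would amount to the
C following, assuming a 64-bit mp_limb_t and "table" holding the 256 bytes
C given further down.

C       mp_limb_t i = table[divisor & 0xFF];    /*  8 correct low bits */
C       i = i * (2 - divisor * i);              /* 16 correct low bits */
C       i = i * (2 - divisor * i);              /* 32 correct low bits */
C       i = i * (2 - divisor * i);              /* 64 correct low bits */
C       /* now divisor * i == 1 (mod 2^64) */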
C Having the table in .text allows us to use IP-relative addressing,
C avoiding a fetch from ltoff.  .rodata is apparently not suitable for
C IP-relative use, it gets a linker relocation overflow on GNU/Linux.


C Load Scheduling:

C In the main loop, the data loads are scheduled for an L2 hit, which means
C 6 cycles until the data is ready to use.  In fact we end up 7 cycles ahead.
C In any case that scheduling is achieved simply by doing the load (and
C xmpy.l for "si") in the immediately preceding iteration.

C The main loop requires size >= 2, and we handle size==1 by an initial
C br.cloop to enter the loop only if size>1.  Since ar.lc is established
C early, this should predict perfectly.


C Not done:

C Consideration was given to using a plain "(src[0]-c) % divisor" for
C size==1, but cycle counting suggests about 50 cycles for the sort of
C approach taken by gcc __umodsi3, versus about 47 for the modexact.
C (Both assuming L1 hits for their respective fetching.)

C Consideration was given to a test for high<divisor and replacing the last
C loop iteration with c-=src[size-1] followed by c+=d on underflow instead.
C Branching on high<divisor wouldn't be good since a mispredict would cost
C more than the loop iteration saved, and the condition is of course data
C dependent.  So the theory would be to shorten the loop count if
C high<divisor, and predicate extra operations at the end.  That would mean
C a gain of 6 cycles when high<divisor, or a cost of 2 if not.

C Whether such a tradeoff is a win on average depends on assumptions about
C how many bits are in the high limb and in the divisor.  If both are
C uniformly distributed then high<divisor about 50% of the time, for an
C expected saving of roughly 0.5*6 - 0.5*2 = 2 cycles per call.  But smallish
C divisors (less chance of high<divisor) might be more likely from
C applications (mpz_divisible_ui, mpz_gcd_ui, etc), whereas biggish divisors
C would be normal internally from, say, mpn/generic/perfsqr.c.  On balance,
C for the moment, it's felt the gain is not really enough to be worth the
C trouble.
C Enhancement:

C Process two source limbs per iteration using a two-limb inverse and a
C sequence like

C       ql  = low (c * il + sil)        quotient low limb
C       qlc = high(c * il + sil)
C       qh1 = low (c * ih + sih)        quotient high, partial

C       cl  = high (ql * d + c)         carry out of low
C       qh  = low (qlc * 1 + qh1)       quotient high limb

C       new c = high (qh * d + cl)      carry out of high

C This would be 13 cycles/iteration, giving 6.5 cycles/limb.  The two-limb
C s*inverse as sih:sil = sh:sl * ih:il would be calculated off the dependent
C chain with 4 multiplies.  The bigger inverse would take extra time to
C calculate, but a one-limb iteration to handle an odd size could be done as
C soon as 64 bits of inverse were ready.
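
C Illustration (not part of the original file): a C sketch of one such
C two-limb iteration, assuming unsigned __int128, sh:sl the two source
C limbs, and a 128-bit inverse ih:il with d * (ih:il) == 1 (mod 2^128).

C       unsigned __int128 s  = ((unsigned __int128) sh << 64) | sl;
C       unsigned __int128 iv = ((unsigned __int128) ih << 64) | il;
C       unsigned __int128 si = s * iv;           /* off the dependent chain */
C       unsigned __int128 q  = si - c * iv;      /* (s - c) * iv mod 2^128  */
C       unsigned __int128 t;
C       t = (unsigned __int128) (mp_limb_t) q * d + c;
C       mp_limb_t cl = (mp_limb_t) (t >> 64);    /* carry out of low limb   */
C       t = (unsigned __int128) (mp_limb_t) (q >> 64) * d + cl;
C       c = (mp_limb_t) (t >> 64);               /* carry out of high limb  */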
C Perhaps this could even extend to a 3 limb inverse, which might promise 17
C or 18 cycles for 3 limbs, giving 5.66 or 6.0 cycles/limb.


ASM_START()
	.explicit

	.text
	.align	32
.Ltable:
	data1	0,0x01, 0,0xAB, 0,0xCD, 0,0xB7, 0,0x39, 0,0xA3, 0,0xC5, 0,0xEF
	data1	0,0xF1, 0,0x1B, 0,0x3D, 0,0xA7, 0,0x29, 0,0x13, 0,0x35, 0,0xDF
	data1	0,0xE1, 0,0x8B, 0,0xAD, 0,0x97, 0,0x19, 0,0x83, 0,0xA5, 0,0xCF
	data1	0,0xD1, 0,0xFB, 0,0x1D, 0,0x87, 0,0x09, 0,0xF3, 0,0x15, 0,0xBF
	data1	0,0xC1, 0,0x6B, 0,0x8D, 0,0x77, 0,0xF9, 0,0x63, 0,0x85, 0,0xAF
	data1	0,0xB1, 0,0xDB, 0,0xFD, 0,0x67, 0,0xE9, 0,0xD3, 0,0xF5, 0,0x9F
	data1	0,0xA1, 0,0x4B, 0,0x6D, 0,0x57, 0,0xD9, 0,0x43, 0,0x65, 0,0x8F
	data1	0,0x91, 0,0xBB, 0,0xDD, 0,0x47, 0,0xC9, 0,0xB3, 0,0xD5, 0,0x7F
	data1	0,0x81, 0,0x2B, 0,0x4D, 0,0x37, 0,0xB9, 0,0x23, 0,0x45, 0,0x6F
	data1	0,0x71, 0,0x9B, 0,0xBD, 0,0x27, 0,0xA9, 0,0x93, 0,0xB5, 0,0x5F
	data1	0,0x61, 0,0x0B, 0,0x2D, 0,0x17, 0,0x99, 0,0x03, 0,0x25, 0,0x4F
	data1	0,0x51, 0,0x7B, 0,0x9D, 0,0x07, 0,0x89, 0,0x73, 0,0x95, 0,0x3F
	data1	0,0x41, 0,0xEB, 0,0x0D, 0,0xF7, 0,0x79, 0,0xE3, 0,0x05, 0,0x2F
	data1	0,0x31, 0,0x5B, 0,0x7D, 0,0xE7, 0,0x69, 0,0x53, 0,0x75, 0,0x1F
	data1	0,0x21, 0,0xCB, 0,0xED, 0,0xD7, 0,0x59, 0,0xC3, 0,0xE5, 0,0x0F
	data1	0,0x11, 0,0x3B, 0,0x5D, 0,0xC7, 0,0x49, 0,0x33, 0,0x55, 0,0xFF
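
C Illustration (not part of the original file): the byte at each odd index
C above is the 8-bit inverse of that index, which a brute-force C check
C along these lines would confirm (table[] assumed to hold the 256 bytes
C listed above).

C       #include <assert.h>
C       extern const unsigned char table[256];
C       static void
C       check_table (void)
C       {
C         for (int d = 1; d < 256; d += 2)
C           for (int i = 1; i < 256; i += 2)
C             if (((d * i) & 0xFF) == 1)
C               assert (table[d] == i);
C       }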
PROLOGUE(mpn_modexact_1c_odd)

	C r32	src
	C r33	size
	C r34	divisor
	C r35	carry

	.prologue
.Lhere:
{ .mmi;		add	r33 = -1, r33		C M0  size-1
		mov	r14 = 2			C M1  2
		mov	r15 = ip		C I0  .Lhere
}{.mmi;		setf.sig f6 = r34		C M2  divisor
		setf.sig f9 = r35		C M3  carry
		zxt1	r3 = r34		C I1  divisor low byte
	} ;;

{ .mmi;		add	r3 = .Ltable-.Lhere, r3	C M0  table offset ip and index
		sub	r16 = 0, r34		C M1  -divisor
	.save	ar.lc, r2
		mov	r2 = ar.lc		C I0
}{.mmi;		.body
		setf.sig f13 = r14		C M2  2 in significand
		mov	r17 = -1		C M3  -1
ABI32(`		zxt4	r33 = r33')		C I1  size extend
	} ;;

{ .mmi;		add	r3 = r3, r15		C M0  table entry address
ABI32(`		addp4	r32 = 0, r32')		C M1  src extend
		mov	ar.lc = r33		C I0  size-1 loop count
}{.mmi;		setf.sig f12 = r16		C M2  -divisor
		setf.sig f8 = r17		C M3  -1
	} ;;

{ .mmi;		ld1	r3 = [r3]		C M0  inverse, 8 bits
		ldf8	f10 = [r32], 8		C M1  src[0]
		cmp.ne	p6,p0 = 0, r33		C I0  test size!=1
	} ;;
	C Wait for the table load.
	C Hope for an L1 hit of 1 cycle to ALU, but it could be more.
	setf.sig f7 = r3		C M2  inverse, 8 bits
(p6)	ldf8	f11 = [r32], 8		C M1  src[1], if size!=1
C 5 cycles

C f6	divisor
C f7	inverse, being calculated
C f8	-1, will be -inverse
C f9	carry
C f10	src[0]
C f11	src[1]
C f12	-divisor
C f13	2
C f14	scratch

	xmpy.l	f14 = f13, f7		C 2*i
	xmpy.l	f7 = f7, f7		C i*i

	xma.l	f7 = f7, f12, f14	C i*i*-d + 2*i, inverse 16 bits

	xmpy.l	f14 = f13, f7		C 2*i
	xmpy.l	f7 = f7, f7		C i*i

	xma.l	f7 = f7, f12, f14	C i*i*-d + 2*i, inverse 32 bits

	xmpy.l	f14 = f13, f7		C 2*i
	xmpy.l	f7 = f7, f7		C i*i

	xma.l	f7 = f7, f12, f14	C i*i*-d + 2*i, inverse 64 bits
	xma.l	f10 = f9, f8, f10	C sc = c * -1 + src[0]
ASSERT(p6, `
	xmpy.l	f15 = f6, f7 ;;		C divisor*inverse
	getf.sig r31 = f15 ;;
	cmp.eq	p6,p0 = 1, r31		C should == 1
')

	xmpy.l	f10 = f10, f7		C q = sc * inverse
	xmpy.l	f8 = f7, f8		C -inverse = inverse * -1
	br.cloop.sptk.few.clr .Lentry	C main loop, if size > 1
C size==1, finish up now
	xma.hu	f9 = f10, f6, f9	C c = high(q * divisor + c)
	mov	ar.lc = r2		C I0

	getf.sig r8 = f9		C M2  return c
	br.ret.sptk.many b0


.Ltop:
	C r2	saved ar.lc
	C f6	divisor
	C f7	inverse
	C f8	-inverse
	C f9	carry
	C f10	src[i] * inverse
	C f11	scratch src[i+1]

	add	r16 = 160, r32
	ldf8	f11 = [r32], 8		C src[i+1]

C 2 cycles

	lfetch	[r16]
	xma.l	f10 = f9, f8, f10	C q = c * -inverse + si

C 3 cycles

.Lentry:
	xma.hu	f9 = f10, f6, f9	C c = high(q * divisor + c)
	xmpy.l	f10 = f11, f7		C si = src[i] * inverse
	br.cloop.sptk.few.clr .Ltop


	xma.l	f10 = f9, f8, f10	C q = c * -inverse + si
	mov	ar.lc = r2		C I0

	xma.hu	f9 = f10, f6, f9	C c = high(q * divisor + c)

	getf.sig r8 = f9		C M2  return c
	br.ret.sptk.many b0

EPILOGUE()