source/libs/gmp/gmp-src/mpn/ia64/gcd_1.asm

   1 dnl  Itanium-2 mpn_gcd_1 -- mpn by 1 gcd.
   2
   3 dnl  Contributed to the GNU project by Kevin Ryde, innerloop by Torbjorn
   4 dnl  Granlund.
   5
   6 dnl  Copyright 2002-2005, 2012, 2013, 2015 Free Software Foundation, Inc.
   7
   8 dnl  This file is part of the GNU MP Library.
   9 dnl
  10 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
  11 dnl  it under the terms of either:
  12 dnl
  13 dnl    * the GNU Lesser General Public License as published by the Free
  14 dnl      Software Foundation; either version 3 of the License, or (at your
  15 dnl      option) any later version.
  16 dnl
  17 dnl  or
  18 dnl
  19 dnl    * the GNU General Public License as published by the Free Software
  20 dnl      Foundation; either version 2 of the License, or (at your option) any
  21 dnl      later version.
  22 dnl
  23 dnl  or both in parallel, as here.
  24 dnl
  25 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
  26 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  27 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  28 dnl  for more details.
  29 dnl
  30 dnl  You should have received copies of the GNU General Public License and the
  31 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
  32 dnl  see https://www.gnu.org/licenses/.
  33
  34 include(`../config.m4')
  35
  36
  37 C           cycles/bitpair (1x1 gcd)
  38 C Itanium:       ?
  39 C Itanium 2:     5.1
  40
  41
  42 C mpn_gcd_1 (mp_srcptr xp, mp_size_t xsize, mp_limb_t y);
  43 C
  44 C The entry sequence is designed to expect xsize>1 and hence a modexact
  45 C call.  This ought to be more common than a 1x1 operation.  Our critical
  46 C path is thus stripping factors of 2 from y, calling modexact, then
  47 C stripping factors of 2 from the x remainder returned.
  48 C
  49 C The common factors of 2 between x and y must be determined using the
  50 C original x, not the remainder from the modexact.  This is done with
  51 C x_orig which is xp[0].  There's plenty of time to do this while the rest
  52 C of the modexact etc is happening.
  53 C
  54 C It's possible xp[0] is zero.  In this case the trailing zeros calculation
  55 C popc((x-1)&~x) gives 63, and that's clearly no less than what y will
  56 C have, making min(x_twos,y_twos) == y_twos.
  57 C
  58 C The main loop consists of transforming x,y to abs(x-y),min(x,y), and then
  59 C stripping factors of 2 from abs(x-y).  Those factors of two are
  60 C determined from just y-x, without the abs(), since there's the same
  61 C number of trailing zeros on n or -n in twos complement.  That makes the
  62 C dependent chain 8 cycles deep.
  63 C
  64 C The selection of x-y versus y-x for abs(x-y), and the selection of the
  65 C minimum of x and y, is done in parallel with the critical path.
  66 C
  67 C The algorithm takes about 0.68 iterations per bit (two N bit operands) on
  68 C average, hence the final 5.8 cycles/bitpair.
  69 C
  70 C Not done:
  71 C
  72 C An alternate algorithm which didn't strip all twos, but instead applied
  73 C tbit and predicated extr on x, and then y, was attempted.  The loop was 6
  74 C cycles, but the algorithm is an average 1.25 iterations per bitpair for a
  75 C total 7.25 c/bp, which is slower than the current approach.
  76 C
  77 C Alternatives:
  78 C
  79 C Perhaps we could do something tricky by extracting a few high bits and a
  80 C few low bits from the operands, and looking up a table which would give a
  81 C set of predicates to control some shifts or subtracts or whatever.  That
  82 C could knock off multiple bits per iteration.
  83 C
  84 C The right shifts are a bit of a bottleneck (shr at 2 or 3 cycles, or extr
  85 C only going down I0), perhaps it'd be possible to shift left instead,
  86 C using add.  That would mean keeping track of the lowest not-yet-zeroed
  87 C bit, using some sort of mask.
  88 C
  89 C TODO:
  90 C  * Once mod_1_N exists in assembly for Itanium, add conditional calls.
  91 C  * Call bmod_1 even for n=1 when up[0] >> v0 (like other gcd_1 impls).
  92 C  * Probably avoid popcnt also outside of loop, instead use ctz_table.
  93
  94 ASM_START()
  95         .explicit                               C What does this mean?
  96
  97 C HP's assembler requires these declarations for importing mpn_modexact_1c_odd
  98         .global mpn_modexact_1c_odd
  99         .type   mpn_modexact_1c_odd,@function
 100
 101 C ctz_table[n] is the number of trailing zeros on n, or MAXSHIFT if n==0.
 102
 103 deflit(MAXSHIFT, 7)
 104 deflit(MASK, eval((m4_lshift(1,MAXSHIFT))-1))
 105
 106 C       .section        ".rodata"
 107         .rodata
 108         ALIGN(m4_lshift(1,MAXSHIFT))    C align table to allow using dep
 109 ctz_table:
 110         data1   MAXSHIFT
 111 forloop(i,1,MASK,
 112 `       data1   m4_count_trailing_zeros(i)
 113 ')
 114
 115 PROLOGUE(mpn_gcd_1)
 116
 117                 C r32   xp
 118                 C r33   xsize
 119                 C r34   y
 120
 121 define(x,           r8)
 122 define(xp_orig,     r32)
 123 define(xsize,       r33)
 124 define(y,           r34)  define(inputs, 3)
 125 define(save_rp,     r35)
 126 define(save_pfs,    r36)
 127 define(x_orig,      r37)
 128 define(x_orig_one,  r38)
 129 define(y_twos,      r39)  define(locals, 5)
 130 define(out_xp,      r40)
 131 define(out_xsize,   r41)
 132 define(out_divisor, r42)
 133 define(out_carry,   r43)  define(outputs, 4)
 134
 135         .prologue
 136  {.mmi;
 137 ifdef(`HAVE_ABI_32',
 138 `               addp4   r9 = 0, xp_orig   define(xp,r9)',       C M0
 139 `                                         define(xp,xp_orig)')
 140         .save ar.pfs, save_pfs
 141                 alloc   save_pfs = ar.pfs, inputs, locals, outputs, 0 C M2
 142         .save rp, save_rp
 143                 mov     save_rp = b0                    C I0
 144 }{.mbb; .body
 145                 add     r10 = -1, y                     C M3  y-1
 146                 nop.b   0                               C B0
 147                 nop.b   0                               C B1
 148         ;;
 149
 150 }{.mmi;         ld8     x = [xp]                        C M0  x = xp[0] if no modexact
 151                 ld8     x_orig = [xp]                   C M1  orig x for common twos
 152                 cmp.ne  p6,p0 = 1, xsize                C I0
 153 }{.mmi;         andcm   y_twos = r10, y                 C M2  (y-1)&~y
 154                 mov     out_xp = xp_orig                C M3
 155                 mov     out_xsize = xsize               C I1
 156         ;;
 157 }{.mmi;         mov     out_carry = 0                   C M0
 158                 nop.m   0                               C M1
 159                 popcnt  y_twos = y_twos                 C I0  y twos
 160         ;;
 161 }{.mmi;         add     x_orig_one = -1, x_orig         C M0  orig x-1
 162                 nop.m   0                               C M1
 163                 shr.u   out_divisor = y, y_twos         C I0  y without twos
 164 }{.mib;         nop.m   0                               C M2
 165                 shr.u   y = y, y_twos                   C I1  y without twos
 166         (p6)    br.call.sptk.many b0 = mpn_modexact_1c_odd  C if xsize>1
 167         ;;
 168 }
 169         C modexact can leave x==0
 170  {.mmi;         cmp.eq  p6,p0 = 0, x                    C M0  if {xp,xsize} % y == 0
 171                 andcm   x_orig = x_orig_one, x_orig     C M1  orig (x-1)&~x
 172                 add     r9 = -1, x                      C I0  x-1
 173         ;;
 174 }{.mmi;         andcm   r9 = r9, x                      C M0  (x-1)&~x
 175                 nop.m   0                               C M1
 176                 mov     b0 = save_rp                    C I0
 177         ;;
 178 }{.mii;         nop.m   0                               C M0
 179                 popcnt  x_orig = x_orig                 C I0  orig x twos
 180                 popcnt  r9 = r9                         C I0  x twos
 181         ;;
 182 }{.mmi;         cmp.lt  p7,p0 = x_orig, y_twos          C M0  orig x_twos < y_twos
 183                 addl    r22 = @ltoff(ctz_table), r1
 184                 shr.u   x = x, r9                       C I0  x odd
 185         ;;
 186 }{.mib;
 187         (p7)    mov     y_twos = x_orig         C M0  common twos
 188                 add     r10 = -1, y             C I0  y-1
 189         (p6)    br.dpnt.few L(done_y)           C B0  x%y==0 then result y
 190         ;;
 191 }
 192                 mov     r25 = m4_lshift(MASK, MAXSHIFT)
 193                 ld8     r22 = [r22]
 194                 br      L(ent)
 195         ;;
 196
 197         ALIGN(32)
 198 L(top):
 199         .pred.rel "mutex", p6,p7
 200  {.mmi; (p7)    mov     y = x
 201         (p6)    sub     x = x, y
 202                 dep     r21 = r19, r22, 0, MAXSHIFT     C concat(table,lowbits)
 203 }{.mmi;         and     r20 = MASK, r19
 204         (p7)    mov     x = r19
 205                 nop     0
 206         ;;
 207 }
 208 L(mid):
 209 {.mmb;          ld1     r16 = [r21]
 210                 cmp.eq  p10,p0 = 0, r20
 211         (p10)   br.spnt.few.clr  L(shift_alot)
 212         ;;
 213 }{.mmi;         nop     0
 214                 nop     0
 215                 shr.u   x = x, r16
 216         ;;
 217 }
 218 L(ent):
 219  {.mmi;         sub     r19 = y, x
 220                 cmp.gtu p6,p7 = x, y
 221                 cmp.ne  p8,p0 = x, y
 222 }{.mmb;         nop     0
 223                 nop     0
 224         (p8)    br.sptk.few.clr L(top)
 225 }
 226
 227 L(done_y):      C result is y
 228                 mov     ar.pfs = save_pfs       C I0
 229                 shl     r8 = y, y_twos          C I   common factors of 2
 230                 br.ret.sptk.many b0
 231
 232 L(shift_alot):
 233                 and     r20 = x, r25
 234                 shr.u   x = x, MAXSHIFT
 235         ;;
 236                 dep     r21 = x, r22, 0, MAXSHIFT
 237                 br      L(mid)
 238 EPILOGUE()