source/libs/gmp/gmp-src/mpn/alpha/ev67/gcd_1.asm

   1 dnl  Alpha ev67 mpn_gcd_1 -- Nx1 greatest common divisor.
   2
   3 dnl  Copyright 2003, 2004 Free Software Foundation, Inc.
   4
   5 dnl  This file is part of the GNU MP Library.
   6 dnl
   7 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
   8 dnl  it under the terms of either:
   9 dnl
  10 dnl    * the GNU Lesser General Public License as published by the Free
  11 dnl      Software Foundation; either version 3 of the License, or (at your
  12 dnl      option) any later version.
  13 dnl
  14 dnl  or
  15 dnl
  16 dnl    * the GNU General Public License as published by the Free Software
  17 dnl      Foundation; either version 2 of the License, or (at your option) any
  18 dnl      later version.
  19 dnl
  20 dnl  or both in parallel, as here.
  21 dnl
  22 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
  23 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  24 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  25 dnl  for more details.
  26 dnl
  27 dnl  You should have received copies of the GNU General Public License and the
  28 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
  29 dnl  see https://www.gnu.org/licenses/.
  30
  31 include(`../config.m4')
  32
  33
  34 C ev67: 3.4 cycles/bitpair for 1x1 part
  35
  36
  37 C mp_limb_t mpn_gcd_1 (mp_srcptr xp, mp_size_t xsize, mp_limb_t y);
  38 C
  39 C In the 1x1 part, the algorithm is to change x,y to abs(x-y),min(x,y) and
  40 C strip trailing zeros from abs(x-y) to maintain x and y both odd.
  41 C
  42 C The trailing zeros are calculated from just x-y, since in twos-complement
  43 C there's the same number of trailing zeros on d or -d.  This means the cttz
  44 C runs in parallel with abs(x-y).
  45 C
  46 C The loop takes 5 cycles, and at 0.68 iterations per bit for two N-bit
  47 C operands with this algorithm gives the measured 3.4 c/l.
  48 C
  49 C The slottings shown are for SVR4 style systems, Unicos differs in the
  50 C initial gp setup and the LEA.
  51 C
  52 C Enhancement:
  53 C
  54 C On the jsr, !lituse_jsr! (when available) would allow the linker to relax
  55 C it to a bsr, but probably only in a static binary.  Plain "jsr foo" gives
  56 C the right object code for relaxation, and ought to be available
  57 C everywhere, but we prefer to schedule the GOT ldq (LEA) back earlier, for
  58 C the usual case of running in a shared library.
  59 C
  60 C bsr could perhaps be used explicitly anyway.  We should be able to assume
  61 C modexact is in the same module as us (ie. shared library or mainline).
  62 C Would there be any worries about the size of the displacement?  Could
  63 C always put modexact and gcd_1 in the same .o to be certain.
  64
  65 ASM_START()
  66 PROLOGUE(mpn_gcd_1, gp)
  67
  68         C r16   xp
  69         C r17   size
  70         C r18   y
  71
  72         C ldah                          C l
  73         C lda                           C u
  74
  75         ldq     r0, 0(r16)              C L   x = xp[0]
  76         lda     r30, -32(r30)           C u   alloc stack
  77
  78         LEA(  r27, mpn_modexact_1c_odd) C L   modexact addr, ldq (gp)
  79         stq     r10, 16(r30)            C L   save r10
  80         cttz    r18, r10                C U0  y twos
  81         cmpeq   r17, 1, r5              C u   test size==1
  82
  83         stq     r9, 8(r30)              C L   save r9
  84         clr     r19                     C u   zero c for modexact
  85         unop
  86         unop
  87
  88         cttz    r0, r6                  C U0  x twos
  89         stq     r26, 0(r30)             C L   save ra
  90
  91         srl     r18, r10, r18           C U   y odd
  92
  93         mov     r18, r9                 C l   hold y across call
  94
  95         cmpult  r6, r10, r2             C u   test x_twos < y_twos
  96
  97         cmovne  r2, r6, r10             C l   common_twos = min(x_twos,y_twos)
  98         bne     r5, L(one)              C U   no modexact if size==1
  99         jsr     r26, (r27), mpn_modexact_1c_odd   C L0
 100
 101         LDGP(   r29, 0(r26))            C u,l ldah,lda
 102         cttz    r0, r6                  C U0  new x twos
 103         ldq     r26, 0(r30)             C L   restore ra
 104
 105 L(one):
 106         mov     r9, r1                  C u   y
 107         ldq     r9, 8(r30)              C L   restore r9
 108         mov     r10, r2                 C u   common twos
 109         ldq     r10, 16(r30)            C L   restore r10
 110
 111         lda     r30, 32(r30)            C l   free stack
 112         beq     r0, L(done)             C U   return y if x%y==0
 113
 114         srl     r0, r6, r0              C U   x odd
 115         unop
 116
 117         ALIGN(16)
 118 L(top):
 119         C r0    x
 120         C r1    y
 121         C r2    common twos, for use at end
 122
 123         subq    r0, r1, r7              C l0  d = x - y
 124         cmpult  r0, r1, r16             C u0  test x >= y
 125
 126         subq    r1, r0, r4              C l0  new_x = y - x
 127         cttz    r7, r8                  C U0  d twos
 128
 129         cmoveq  r16, r7, r4             C l0  new_x = d if x>=y
 130         cmovne  r16, r0, r1             C u0  y = x if x<y
 131         unop                            C l   \ force cmoveq into l0
 132         unop                            C u   /
 133
 134         C                               C cmoveq2 L0, cmovne2 U0
 135
 136         srl     r4, r8, r0              C U0  x = new_x >> twos
 137         bne     r7, L(top)              C U1  stop when d==0
 138
 139
 140 L(done):
 141         sll     r1, r2, r0              C U0  return y << common_twos
 142         ret     r31, (r26), 1           C L0
 143
 144 EPILOGUE()
 145 ASM_END()