sysdeps/alpha/divq.S

   1 /* Copyright (C) 2004 Free Software Foundation, Inc.
   2    This file is part of the GNU C Library.
   3
   4    The GNU C Library is free software; you can redistribute it and/or
   5    modify it under the terms of the GNU Lesser General Public
   6    License as published by the Free Software Foundation; either
   7    version 2.1 of the License, or (at your option) any later version.
   8
   9    The GNU C Library is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12    Lesser General Public License for more details.
  13
  14    You should have received a copy of the GNU Lesser General Public
  15    License along with the GNU C Library; if not, write to the Free
  16    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  17    02111-1307 USA.  */
  18
  19 #include "div_libc.h"
  20
  21
  22 /* 64-bit signed long divide.  These are not normal C functions.  Argument
  23    registers are t10 and t11, the result goes in t12.  Only t12 and AT may
  24    be clobbered.
  25
  26    Theory of operation here is that we can use the FPU divider for virtually
  27    all operands that we see: all dividend values between -2**53 and 2**53-1
  28    can be computed directly.  Note that divisor values need not be checked
  29    against that range because the rounded fp value will be close enough such
  30    that the quotient is < 1, which will properly be truncated to zero when we
  31    convert back to integer.
  32
  33    When the dividend is outside the range for which we can compute exact
  34    results, we use the fp quotient as an estimate from which we begin refining
  35    an exact integral value.  This reduces the number of iterations in the
  36    shift-and-subtract loop significantly.  */
  37
  38 /* Names used below that are not defined in this file (X, Y, RV, RA, AT,
  39    FRAME, DIVBYZERO, CALL_MCOUNT, _ITOFT2, _FTOIT, DO_DIVBYZERO and the
  40    cfi_* macros) are presumably supplied by div_libc.h -- confirm there.
  41
  42    Stack frame usage, as seen in the code below:
  43         0(sp)  $f0 save slot; reused for t3 once $f0 has been reloaded.
  44         8(sp)  $f1 save slot; later passed to _FTOIT on the slow path
  45                and finally used as the t4 save slot.
  46        16(sp)  passed to _ITOFT2, and to _FTOIT on the fast path
  47                (presumably as scratch offsets); t0 save slot on the
  48                slow path.
  49        24(sp)  passed to _ITOFT2; t1 save slot on the slow path.
  50        32(sp)  t2 save slot.
  51        40(sp)  t5 save slot.  */
  52
  53         .text
  54         .align  4
  55         .globl  __divq
  56         .type   __divq, @function
  57         .usepv  __divq, no
  58
  59         cfi_startproc
  60         cfi_return_column (RA)
  61 __divq:
  62         lda     sp, -FRAME(sp)
  63         cfi_def_cfa_offset (FRAME)
  64         CALL_MCOUNT
  65
  66         /* Get the fp divide insn issued as quickly as possible.  After
  67            that's done, we have at least 22 cycles until its results are
  68            ready -- all the time in the world to figure out how we're
  69            going to use the results.  */
  70         stt     $f0, 0(sp)
  71         stt     $f1, 8(sp)
  72         beq     Y, DIVBYZERO
  73         cfi_rel_offset ($f0, 0)
  74         cfi_rel_offset ($f1, 8)
  75
  76         _ITOFT2 X, $f0, 16, Y, $f1, 24
  77         cvtqt   $f0, $f0
  78         cvtqt   $f1, $f1
  79         divt/c  $f0, $f1, $f0
  80
  81         /* Check to see if X fit in the double as an exact value.  Shifting
  82            left and then arithmetically right by 64-53 bits reproduces X
  83            only when X is the sign extension of its low 53 bits.  $f1 is
  84            not used again below, so it is reloaded in between.  */
  85         sll     X, (64-53), AT
  86         ldt     $f1, 8(sp)
  87         sra     AT, (64-53), AT
  88         cmpeq   X, AT, AT
  89         beq     AT, $x_big
  90
  91         /* If we get here, we're expecting exact results from the division.
  92            Do nothing else besides convert and clean up.  */
  93         cvttq/c $f0, $f0
  94         _FTOIT  $f0, RV, 16
  95
  96         ldt     $f0, 0(sp)
  97         cfi_restore ($f1)
  98         cfi_remember_state
  99         cfi_restore ($f0)
 100         cfi_def_cfa_offset (0)
 101         lda     sp, FRAME(sp)
 102         ret     $31, (RA), 1
 103
 104         .align  4
 105         cfi_restore_state
 106 $x_big:
 107         /* If we get here, X is large enough that we don't expect exact
 108            results, and neither X nor Y got mis-translated for the fp
 109            division.  Our task is to take the fp result, figure out how
 110            far it's off from the correct result and compute a fixup.  */
 111         stq     t0, 16(sp)
 112         stq     t1, 24(sp)
 113         stq     t2, 32(sp)
 114         stq     t5, 40(sp)
 115         cfi_rel_offset (t0, 16)
 116         cfi_rel_offset (t1, 24)
 117         cfi_rel_offset (t2, 32)
 118         cfi_rel_offset (t5, 40)
 119
 120 #define Q       RV              /* quotient */
 121 #define R       t0              /* remainder */
 122 #define SY      t1              /* scaled Y */
 123 #define S       t2              /* scalar */
 124 #define QY      t3              /* Q*Y */
 125
 126         /* The fixup code below can only handle unsigned values.  T5 starts
 127            out as zero and is only changed by $fix_sign_in, which is
 128            entered when X|Y has its sign bit set.  */
 129         or      X, Y, AT
 130         mov     $31, t5
 131         blt     AT, $fix_sign_in
 132 $fix_sign_in_ret1:
 133         cvttq/c $f0, $f0
 134
 135         _FTOIT  $f0, Q, 8
 136         .align  3
 137 $fix_sign_in_ret2:
 138         /* Reload $f0 and save t3 into the slot it vacates BEFORE the
 139            multiply.  QY is t3, so storing t3 after the mulq would save,
 140            and the epilogue would later restore, Q*Y instead of the
 141            caller's t3.  The load must precede the store because both
 142            use 0(sp).  */
 143         ldt     $f0, 0(sp)
 144         stq     t3, 0(sp)
 145         cfi_restore ($f0)
 146         cfi_rel_offset (t3, 0)
 147
 148         mulq    Q, Y, QY
 149         unop
 150         stq     t4, 8(sp)
 151         unop
 152         cfi_rel_offset (t4, 8)
 153
 154         /* R = Q*Y - X.  A positive value means the estimate Q is too
 155            large.  */
 156         subq    QY, X, R
 157         mov     Y, SY
 158         mov     1, S
 159         bgt     R, $q_high
 160
 161 $q_high_ret:
 162         /* R = X - Q*Y.  A positive value means there is remainder left
 163            that $q_low will try to fold into Q.  */
 164         subq    X, QY, R
 165         mov     Y, SY
 166         mov     1, S
 167         bgt     R, $q_low
 168
 169 $q_low_ret:
 170         /* Q is final (up to sign).  Restore the scratch registers; t5 is
 171            nonzero only if $fix_sign_in altered X and/or Y.  */
 172         ldq     t0, 16(sp)
 173         ldq     t1, 24(sp)
 174         ldq     t2, 32(sp)
 175         bne     t5, $fix_sign_out
 176
 177 $fix_sign_out_ret:
 178         ldq     t3, 0(sp)
 179         ldq     t4, 8(sp)
 180         ldq     t5, 40(sp)
 181         lda     sp, FRAME(sp)
 182         cfi_remember_state
 183         cfi_restore (t0)
 184         cfi_restore (t1)
 185         cfi_restore (t2)
 186         cfi_restore (t3)
 187         cfi_restore (t4)
 188         cfi_restore (t5)
 189         cfi_def_cfa_offset (0)
 190         ret     $31, (RA), 1
 191
 192         .align  4
 193         cfi_restore_state
 194         /* The quotient that we computed was too large.  We need to reduce
 195            it by S such that Y*S >= R.  Obviously the closer we get to the
 196            correct value the better, but overshooting high is ok, as we'll
 197            fix that up later.  */
 198 0:
 199         addq    SY, SY, SY
 200         addq    S, S, S
 201 $q_high:
 202         cmpult  SY, R, AT
 203         bne     AT, 0b
 204
 205         /* Keep QY equal to Q*Y while lowering Q by S.  */
 206         subq    Q, S, Q
 207         unop
 208         subq    QY, SY, QY
 209         br      $q_high_ret
 210
 211         .align  4
 212         /* The quotient that we computed was too small.  Divide the current
 213            remainder (R) by Y and add that to the existing quotient (Q).
 214            The expectation, of course, is that R is much smaller than X.  */
 215         /* Begin with a shift-up loop.  Compute S such that Y*S >= R.  We
 216            already have a copy of Y in SY and the value 1 in S.  */
 217 0:
 218         addq    SY, SY, SY
 219         addq    S, S, S
 220 $q_low:
 221         cmpult  SY, R, AT
 222         bne     AT, 0b
 223
 224         /* Shift-down and subtract loop.  Each iteration compares our scaled
 225            Y (SY) with the remainder (R); if SY <= R then Y*S still fits in
 226            the remainder, so add S to the quotient (Q) and take SY off R.
 227            Both candidate results are computed into t3/t4 first and then
 228            committed with conditional moves.  QY is no longer needed here,
 229            so t3 is free.  */
 230 2:      addq    Q, S, t3
 231         srl     S, 1, S
 232         cmpule  SY, R, AT
 233         subq    R, SY, t4
 234
 235         cmovne  AT, t3, Q
 236         cmovne  AT, t4, R
 237         srl     SY, 1, SY
 238         bne     S, 2b
 239
 240         br      $q_low_ret
 241
 242         .align  4
 243 $fix_sign_in:
 244         /* If we got here, then X|Y is negative.  Need to adjust everything
 245            such that we're doing unsigned division in the fixup loop.  */
 246         /* T5 records the changes we had to make:
 247                 bit 0:  set if result should be negative.
 248                 bit 2:  set if X was negated.
 249                 bit 3:  set if Y was negated.
 250            T0 is used as scratch; it was already saved at $x_big.  */
 251         xor     X, Y, AT
 252         cmplt   AT, 0, t5
 253         cmplt   X, 0, AT
 254         negq    X, t0
 255
 256         s4addq  AT, t5, t5
 257         cmovne  AT, t0, X
 258         cmplt   Y, 0, AT
 259         negq    Y, t0
 260
 261         s8addq  AT, t5, t5
 262         cmovne  AT, t0, Y
 263         unop
 264         blbc    t5, $fix_sign_in_ret1
 265
 266         /* The result should be negative: convert the fp quotient here and
 267            negate it so that Q matches the now non-negative X and Y.  */
 268         cvttq/c $f0, $f0
 269         _FTOIT  $f0, Q, 8
 270         .align  3
 271         negq    Q, Q
 272         br      $fix_sign_in_ret2
 273
 274         .align  4
 275 $fix_sign_out:
 276         /* Now we get to undo what we did above.  T4 serves as scratch; it
 277            is reloaded from 8(sp) at $fix_sign_out_ret.  */
 278         /* ??? Is this really faster than just increasing the size of
 279            the stack frame and storing X and Y in memory?  */
 280         and     t5, 8, AT
 281         negq    Y, t4
 282         cmovne  AT, t4, Y
 283
 284         and     t5, 4, AT
 285         negq    X, t4
 286         cmovne  AT, t4, X
 287
 288         negq    RV, t4
 289         cmovlbs t5, t4, RV
 290
 291         br      $fix_sign_out_ret
 292
 293         cfi_endproc
 294         .size   __divq, .-__divq
 295
 296         DO_DIVBYZERO