sysdeps/alpha/divq.S

   1 /* Copyright (C) 2004-2015 Free Software Foundation, Inc.
   2    This file is part of the GNU C Library.
   3
   4    The GNU C Library is free software; you can redistribute it and/or
   5    modify it under the terms of the GNU Lesser General Public
   6    License as published by the Free Software Foundation; either
   7    version 2.1 of the License, or (at your option) any later version.
   8
   9    The GNU C Library is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12    Lesser General Public License for more details.
  13
  14    You should have received a copy of the GNU Lesser General Public
  15    License along with the GNU C Library.  If not, see
  16    <http://www.gnu.org/licenses/>.  */
  17
  18 #include "div_libc.h"
  19
  20
  21 /* 64-bit signed long divide.  These are not normal C functions.  Argument
  22    registers are t10 and t11, the result goes in t12.  Only t12 and AT may
  23    be clobbered.
  24
  25    Theory of operation here is that we can use the FPU divider for virtually
  26    all operands that we see: all dividend values between -2**53 and 2**53-1
  27    can be computed directly.  Note that divisor values need not be checked
  28    against that range because the rounded fp value will be close enough such
  29    that the quotient is < 1, which will properly be truncated to zero when we
  30    convert back to integer.
  31
  32    When the dividend is outside the range for which we can compute exact
  33    results, we use the fp quotent as an estimate from which we begin refining
  34    an exact integral value.  This reduces the number of iterations in the
  35    shift-and-subtract loop significantly.
  36
  37    The FPCR save/restore is due to the fact that the EV6 _will_ set FPCR_INE
  38    for cvttq/c even without /sui being set.  It will not, however, properly
  39    raise the exception, so we don't have to worry about FPCR_INED being clear
  40    and so dying by SIGFPE.  */
  41
  42         .text
  43         .align  4
  44         .globl  __divq
  45         .type   __divq, @funcnoplt
  46         .usepv  __divq, no
  47
  48         cfi_startproc
  49         cfi_return_column (RA)
  50 __divq:
  51         lda     sp, -FRAME(sp)
  52         cfi_def_cfa_offset (FRAME)
  53         CALL_MCOUNT
  54
  55         /* Get the fp divide insn issued as quickly as possible.  After
  56            that's done, we have at least 22 cycles until its results are
  57            ready -- all the time in the world to figure out how we're
  58            going to use the results.  */
  59         stt     $f0, 0(sp)
  60         excb
  61         beq     Y, DIVBYZERO
  62
  63         stt     $f1, 8(sp)
  64         stt     $f3, 48(sp)
  65         cfi_rel_offset ($f0, 0)
  66         cfi_rel_offset ($f1, 8)
  67         cfi_rel_offset ($f3, 48)
  68         mf_fpcr $f3
  69
  70         _ITOFT2 X, $f0, 16, Y, $f1, 24
  71         cvtqt   $f0, $f0
  72         cvtqt   $f1, $f1
  73         divt/c  $f0, $f1, $f0
  74
  75         /* Check to see if X fit in the double as an exact value.  */
  76         sll     X, (64-53), AT
  77         ldt     $f1, 8(sp)
  78         sra     AT, (64-53), AT
  79         cmpeq   X, AT, AT
  80         beq     AT, $x_big
  81
  82         /* If we get here, we're expecting exact results from the division.
  83            Do nothing else besides convert and clean up.  */
  84         cvttq/c $f0, $f0
  85         excb
  86         mt_fpcr $f3
  87         _FTOIT  $f0, RV, 16
  88
  89         ldt     $f0, 0(sp)
  90         ldt     $f3, 48(sp)
  91         cfi_restore ($f1)
  92         cfi_remember_state
  93         cfi_restore ($f0)
  94         cfi_restore ($f3)
  95         cfi_def_cfa_offset (0)
  96         lda     sp, FRAME(sp)
  97         ret     $31, (RA), 1
  98
  99         .align  4
 100         cfi_restore_state
 101 $x_big:
 102         /* If we get here, X is large enough that we don't expect exact
 103            results, and neither X nor Y got mis-translated for the fp
 104            division.  Our task is to take the fp result, figure out how
 105            far it's off from the correct result and compute a fixup.  */
 106         stq     t0, 16(sp)
 107         stq     t1, 24(sp)
 108         stq     t2, 32(sp)
 109         stq     t5, 40(sp)
 110         cfi_rel_offset (t0, 16)
 111         cfi_rel_offset (t1, 24)
 112         cfi_rel_offset (t2, 32)
 113         cfi_rel_offset (t5, 40)
 114
 115 #define Q       RV              /* quotient */
 116 #define R       t0              /* remainder */
 117 #define SY      t1              /* scaled Y */
 118 #define S       t2              /* scalar */
 119 #define QY      t3              /* Q*Y */
 120
 121         /* The fixup code below can only handle unsigned values.  */
 122         or      X, Y, AT
 123         mov     $31, t5
 124         blt     AT, $fix_sign_in
 125 $fix_sign_in_ret1:
 126         cvttq/c $f0, $f0
 127
 128         _FTOIT  $f0, Q, 8
 129         .align  3
 130 $fix_sign_in_ret2:
 131         ldt     $f0, 0(sp)
 132         stq     t3, 0(sp)
 133         cfi_restore ($f0)
 134         cfi_rel_offset (t3, 0)
 135
 136         mulq    Q, Y, QY
 137         excb
 138         stq     t4, 8(sp)
 139         mt_fpcr $f3
 140         cfi_rel_offset (t4, 8)
 141
 142         subq    QY, X, R
 143         mov     Y, SY
 144         mov     1, S
 145         bgt     R, $q_high
 146
 147 $q_high_ret:
 148         subq    X, QY, R
 149         mov     Y, SY
 150         mov     1, S
 151         bgt     R, $q_low
 152
 153 $q_low_ret:
 154         ldq     t0, 16(sp)
 155         ldq     t1, 24(sp)
 156         ldq     t2, 32(sp)
 157         bne     t5, $fix_sign_out
 158
 159 $fix_sign_out_ret:
 160         ldq     t3, 0(sp)
 161         ldq     t4, 8(sp)
 162         ldq     t5, 40(sp)
 163         ldt     $f3, 48(sp)
 164         lda     sp, FRAME(sp)
 165         cfi_remember_state
 166         cfi_restore (t0)
 167         cfi_restore (t1)
 168         cfi_restore (t2)
 169         cfi_restore (t3)
 170         cfi_restore (t4)
 171         cfi_restore (t5)
 172         cfi_restore ($f3)
 173         cfi_def_cfa_offset (0)
 174         ret     $31, (RA), 1
 175
 176         .align  4
 177         cfi_restore_state
 178         /* The quotient that we computed was too large.  We need to reduce
 179            it by S such that Y*S >= R.  Obviously the closer we get to the
 180            correct value the better, but overshooting high is ok, as we'll
 181            fix that up later.  */
 182 0:
 183         addq    SY, SY, SY
 184         addq    S, S, S
 185 $q_high:
 186         cmpult  SY, R, AT
 187         bne     AT, 0b
 188
 189         subq    Q, S, Q
 190         unop
 191         subq    QY, SY, QY
 192         br      $q_high_ret
 193
 194         .align  4
 195         /* The quotient that we computed was too small.  Divide Y by the
 196            current remainder (R) and add that to the existing quotient (Q).
 197            The expectation, of course, is that R is much smaller than X.  */
 198         /* Begin with a shift-up loop.  Compute S such that Y*S >= R.  We
 199            already have a copy of Y in SY and the value 1 in S.  */
 200 0:
 201         addq    SY, SY, SY
 202         addq    S, S, S
 203 $q_low:
 204         cmpult  SY, R, AT
 205         bne     AT, 0b
 206
 207         /* Shift-down and subtract loop.  Each iteration compares our scaled
 208            Y (SY) with the remainder (R); if SY <= R then X is divisible by
 209            Y's scalar (S) so add it to the quotient (Q).  */
 210 2:      addq    Q, S, t3
 211         srl     S, 1, S
 212         cmpule  SY, R, AT
 213         subq    R, SY, t4
 214
 215         cmovne  AT, t3, Q
 216         cmovne  AT, t4, R
 217         srl     SY, 1, SY
 218         bne     S, 2b
 219
 220         br      $q_low_ret
 221
 222         .align  4
 223 $fix_sign_in:
 224         /* If we got here, then X|Y is negative.  Need to adjust everything
 225            such that we're doing unsigned division in the fixup loop.  */
 226         /* T5 records the changes we had to make:
 227                 bit 0:  set if result should be negative.
 228                 bit 2:  set if X was negated.
 229                 bit 3:  set if Y was negated.
 230         */
 231         xor     X, Y, AT
 232         cmplt   AT, 0, t5
 233         cmplt   X, 0, AT
 234         negq    X, t0
 235
 236         s4addq  AT, t5, t5
 237         cmovne  AT, t0, X
 238         cmplt   Y, 0, AT
 239         negq    Y, t0
 240
 241         s8addq  AT, t5, t5
 242         cmovne  AT, t0, Y
 243         unop
 244         blbc    t5, $fix_sign_in_ret1
 245
 246         cvttq/c $f0, $f0
 247         _FTOIT  $f0, Q, 8
 248         .align  3
 249         negq    Q, Q
 250         br      $fix_sign_in_ret2
 251
 252         .align  4
 253 $fix_sign_out:
 254         /* Now we get to undo what we did above.  */
 255         /* ??? Is this really faster than just increasing the size of
 256            the stack frame and storing X and Y in memory?  */
 257         and     t5, 8, AT
 258         negq    Y, t4
 259         cmovne  AT, t4, Y
 260
 261         and     t5, 4, AT
 262         negq    X, t4
 263         cmovne  AT, t4, X
 264
 265         negq    RV, t4
 266         cmovlbs t5, t4, RV
 267
 268         br      $fix_sign_out_ret
 269
 270         cfi_endproc
 271         .size   __divq, .-__divq
 272
 273         DO_DIVBYZERO