sysdeps/alpha/remq.S

   1 /* Copyright (C) 2004 Free Software Foundation, Inc.
   2    This file is part of the GNU C Library.
   3
   4    The GNU C Library is free software; you can redistribute it and/or
   5    modify it under the terms of the GNU Lesser General Public
   6    License as published by the Free Software Foundation; either
   7    version 2.1 of the License, or (at your option) any later version.
   8
   9    The GNU C Library is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12    Lesser General Public License for more details.
  13
  14    You should have received a copy of the GNU Lesser General Public
  15    License along with the GNU C Library; if not, write to the Free
  16    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  17    02111-1307 USA.  */
  18
  19 #include "div_libc.h"
  20
  21
  22 /* 64-bit signed long remainder.  These are not normal C functions.  Argument
  23    registers are t10 and t11, the result goes in t12.  Only t12 and AT may
  24    be clobbered.
  25
  26    Theory of operation here is that we can use the FPU divider for virtually
  27    all operands that we see: all dividend values between -2**53 and 2**53-1
  28    can be computed directly.  Note that divisor values need not be checked
  29    against that range because the rounded fp value will be close enough such
  30    that the quotient is < 1, which will properly be truncated to zero when we
  31    convert back to integer.
  32
  33    When the dividend is outside the range for which we can compute exact
  34    results, we use the fp quotent as an estimate from which we begin refining
  35    an exact integral value.  This reduces the number of iterations in the
  36    shift-and-subtract loop significantly.  */
  37
  38         .text
  39         .align  4
  40         .globl  __remq
  41         .type   __remq, @function
  42         .usepv  __remq, no
  43
  44         cfi_startproc
  45         cfi_return_column (RA)
  46 __remq:
  47         lda     sp, -FRAME(sp)
  48         cfi_def_cfa_offset (FRAME)
  49         CALL_MCOUNT
  50
  51         /* Get the fp divide insn issued as quickly as possible.  After
  52            that's done, we have at least 22 cycles until its results are
  53            ready -- all the time in the world to figure out how we're
  54            going to use the results.  */
  55         stq     X, 16(sp)
  56         stq     Y, 24(sp)
  57         beq     Y, DIVBYZERO
  58
  59         stt     $f0, 0(sp)
  60         stt     $f1, 8(sp)
  61         cfi_rel_offset ($f0, 0)
  62         cfi_rel_offset ($f1, 8)
  63         ldt     $f0, 16(sp)
  64         ldt     $f1, 24(sp)
  65
  66         cvtqt   $f0, $f0
  67         cvtqt   $f1, $f1
  68         divt/c  $f0, $f1, $f0
  69
  70         /* Check to see if X fit in the double as an exact value.  */
  71         sll     X, (64-53), AT
  72         ldt     $f1, 8(sp)
  73         sra     AT, (64-53), AT
  74         cmpeq   X, AT, AT
  75         beq     AT, $x_big
  76
  77         /* If we get here, we're expecting exact results from the division.
  78            Do nothing else besides convert, compute remainder, clean up.  */
  79         cvttq/c $f0, $f0
  80         stt     $f0, 16(sp)
  81
  82         ldq     AT, 16(sp)
  83         mulq    AT, Y, AT
  84         ldt     $f0, 0(sp)
  85         cfi_restore ($f1)
  86         cfi_remember_state
  87         cfi_restore ($f0)
  88         cfi_def_cfa_offset (0)
  89         lda     sp, FRAME(sp)
  90
  91         subq    X, AT, RV
  92         ret     $31, (RA), 1
  93
  94         .align  4
  95         cfi_restore_state
  96 $x_big:
  97         /* If we get here, X is large enough that we don't expect exact
  98            results, and neither X nor Y got mis-translated for the fp
  99            division.  Our task is to take the fp result, figure out how
 100            far it's off from the correct result and compute a fixup.  */
 101         stq     t0, 16(sp)
 102         stq     t1, 24(sp)
 103         stq     t2, 32(sp)
 104         stq     t5, 40(sp)
 105         cfi_rel_offset (t0, 16)
 106         cfi_rel_offset (t1, 24)
 107         cfi_rel_offset (t2, 32)
 108         cfi_rel_offset (t5, 40)
 109
 110 #define Q       t0              /* quotient */
 111 #define R       RV              /* remainder */
 112 #define SY      t1              /* scaled Y */
 113 #define S       t2              /* scalar */
 114 #define QY      t3              /* Q*Y */
 115
 116         /* The fixup code below can only handle unsigned values.  */
 117         or      X, Y, AT
 118         mov     $31, t5
 119         blt     AT, $fix_sign_in
 120 $fix_sign_in_ret1:
 121         cvttq/c $f0, $f0
 122
 123         stt     $f0, 8(sp)
 124         ldq     Q, 8(sp)
 125 $fix_sign_in_ret2:
 126         mulq    Q, Y, QY
 127         stq     t4, 8(sp)
 128
 129         ldt     $f0, 0(sp)
 130         unop
 131         cfi_rel_offset (t4, 8)
 132         cfi_restore ($f0)
 133         stq     t3, 0(sp)
 134         unop
 135         cfi_rel_offset (t3, 0)
 136
 137         subq    QY, X, R
 138         mov     Y, SY
 139         mov     1, S
 140         bgt     R, $q_high
 141
 142 $q_high_ret:
 143         subq    X, QY, R
 144         mov     Y, SY
 145         mov     1, S
 146         bgt     R, $q_low
 147
 148 $q_low_ret:
 149         ldq     t0, 16(sp)
 150         ldq     t1, 24(sp)
 151         ldq     t2, 32(sp)
 152         bne     t5, $fix_sign_out
 153
 154 $fix_sign_out_ret:
 155         ldq     t3, 0(sp)
 156         ldq     t4, 8(sp)
 157         ldq     t5, 40(sp)
 158         lda     sp, FRAME(sp)
 159         cfi_remember_state
 160         cfi_restore (t0)
 161         cfi_restore (t1)
 162         cfi_restore (t2)
 163         cfi_restore (t3)
 164         cfi_restore (t4)
 165         cfi_restore (t5)
 166         cfi_def_cfa_offset (0)
 167         ret     $31, (RA), 1
 168
 169         .align  4
 170         cfi_restore_state
 171         /* The quotient that we computed was too large.  We need to reduce
 172            it by S such that Y*S >= R.  Obviously the closer we get to the
 173            correct value the better, but overshooting high is ok, as we'll
 174            fix that up later.  */
 175 0:
 176         addq    SY, SY, SY
 177         addq    S, S, S
 178 $q_high:
 179         cmpult  SY, R, AT
 180         bne     AT, 0b
 181
 182         subq    Q, S, Q
 183         unop
 184         subq    QY, SY, QY
 185         br      $q_high_ret
 186
 187         .align  4
 188         /* The quotient that we computed was too small.  Divide Y by the
 189            current remainder (R) and add that to the existing quotient (Q).
 190            The expectation, of course, is that R is much smaller than X.  */
 191         /* Begin with a shift-up loop.  Compute S such that Y*S >= R.  We
 192            already have a copy of Y in SY and the value 1 in S.  */
 193 0:
 194         addq    SY, SY, SY
 195         addq    S, S, S
 196 $q_low:
 197         cmpult  SY, R, AT
 198         bne     AT, 0b
 199
 200         /* Shift-down and subtract loop.  Each iteration compares our scaled
 201            Y (SY) with the remainder (R); if SY <= R then X is divisible by
 202            Y's scalar (S) so add it to the quotient (Q).  */
 203 2:      addq    Q, S, t3
 204         srl     S, 1, S
 205         cmpule  SY, R, AT
 206         subq    R, SY, t4
 207
 208         cmovne  AT, t3, Q
 209         cmovne  AT, t4, R
 210         srl     SY, 1, SY
 211         bne     S, 2b
 212
 213         br      $q_low_ret
 214
 215         .align  4
 216 $fix_sign_in:
 217         /* If we got here, then X|Y is negative.  Need to adjust everything
 218            such that we're doing unsigned division in the fixup loop.  */
 219         /* T5 records the changes we had to make:
 220                 bit 0:  set if X was negated.  Note that the sign of the
 221                         remainder follows the sign of the divisor.
 222                 bit 2:  set if Y was negated.
 223         */
 224         xor     X, Y, t1
 225         cmplt   X, 0, t5
 226         negq    X, t0
 227         cmovne  t5, t0, X
 228
 229         cmplt   Y, 0, AT
 230         negq    Y, t0
 231         s4addq  AT, t5, t5
 232         cmovne  AT, t0, Y
 233
 234         bge     t1, $fix_sign_in_ret1
 235         cvttq/c $f0, $f0
 236         stt     $f0, 8(sp)
 237         ldq     Q, 8(sp)
 238
 239         negq    Q, Q
 240         br      $fix_sign_in_ret2
 241
 242         .align  4
 243 $fix_sign_out:
 244         /* Now we get to undo what we did above.  */
 245         /* ??? Is this really faster than just increasing the size of
 246            the stack frame and storing X and Y in memory?  */
 247         and     t5, 4, AT
 248         negq    Y, t4
 249         cmovne  AT, t4, Y
 250
 251         negq    X, t4
 252         cmovlbs t5, t4, X
 253         negq    RV, t4
 254         cmovlbs t5, t4, RV
 255
 256         br      $fix_sign_out_ret
 257
 258         cfi_endproc
 259         .size   __remq, .-__remq
 260
 261         DO_DIVBYZERO