libgcc/config/arc/ieee-754/divdf3.S

   1 /* Copyright (C) 2008-2018 Free Software Foundation, Inc.
   2    Contributor: Joern Rennecke <joern.rennecke@embecosm.com>
   3                 on behalf of Synopsys Inc.
   4
   5 This file is part of GCC.
   6
   7 GCC is free software; you can redistribute it and/or modify it under
   8 the terms of the GNU General Public License as published by the Free
   9 Software Foundation; either version 3, or (at your option) any later
  10 version.
  11
  12 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  14 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  15 for more details.
  16
  17 Under Section 7 of GPL version 3, you are granted additional
  18 permissions described in the GCC Runtime Library Exception, version
  19 3.1, as published by the Free Software Foundation.
  20
  21 You should have received a copy of the GNU General Public License and
  22 a copy of the GCC Runtime Library Exception along with this program;
  23 see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
  24 <http://www.gnu.org/licenses/>.  */
  25
  26 /*
  27    to calculate a := b/x as b*y, with y := 1/x:
  28    - x is in the range [1..2)
  29    - calculate 15..18 bit inverse y0 using a table of approximating polynoms.
  30      Precision is higher for polynoms used to evaluate input with larger
  31      value.
  32    - Do one newton-raphson iteration step to double the precision,
  33      then multiply this with the divisor
  34         -> more time to decide if dividend is subnormal
  35      - the worst error propagation is on the side of the value range
  36        with the least initial defect, thus giving us about 30 bits precision.
  37       The truncation error for the either is less than 1 + x/2 ulp.
  38       A 31 bit inverse can be simply calculated by using x with implicit 1
  39       and chaining the multiplies.  For a 32 bit inverse, we multiply y0^2
  40       with the bare fraction part of x, then add in y0^2 for the implicit
  41       1 of x.
  42     - If calculating a 31 bit inverse, the systematic error is less than
  43       -1 ulp; likewise, for 32 bit, it is less than -2 ulp.
  44     - If we calculate our seed with a 32 bit fraction, we can archive a
  45       tentative result strictly better than -2 / +2.5 (1) ulp/128, i.e. we
  46       only need to take the step to calculate the 2nd stage rest and
  47       rounding adjust 1/32th of the time.  However, if we use a 20 bit
  48       fraction for the seed, the negative error can exceed -2 ulp/128, (2)
  49       thus for a simple add / tst check, we need to do the 2nd stage
  50       rest calculation/ rounding adjust 1/16th of the time.
  51       (1): The inexactness of the 32 bit inverse contributes an error in the
  52       range of (-1 .. +(1+x/2) ) ulp/128.  Leaving out the low word of the
  53       rest contributes an error < +1/x ulp/128 .  In the interval [1,2),
  54       x/2 + 1/x <= 1.5 .
  55       (2): Unless proven otherwise.  I have not actually looked for an
  56       example where -2 ulp/128 is exceeded, and my calculations indicate
  57       that the excess, if existent, is less than -1/512 ulp.
  58  */
  59 #include "arc-ieee-754.h"
  60
  61 /* N.B. fp-bit.c does double rounding on denormal numbers.  */
  62 #if 0 /* DEBUG */
  63         .global __divdf3
  64         FUNC(__divdf3)
  65         .balign 4
  66 __divdf3:
  67         push_s blink
  68         push_s r2
  69         push_s r3
  70         push_s r0
  71         bl.d __divdf3_c
  72         push_s r1
  73         ld_s r2,[sp,12]
  74         ld_s r3,[sp,8]
  75         st_s r0,[sp,12]
  76         st_s r1,[sp,8]
  77         pop_s r1
  78         bl.d __divdf3_asm
  79         pop_s r0
  80         pop_s r3
  81         pop_s r2
  82         pop_s blink
  83         cmp r0,r2
  84         cmp.eq r1,r3
  85         jeq_s [blink]
  86         and r12,DBL0H,DBL1H
  87         bic.f 0,0x7ff80000,r12 ; both NaN -> OK
  88         jeq_s [blink]
  89         bl abort
  90         ENDFUNC(__divdf3)
  91 #define __divdf3 __divdf3_asm
  92 #endif /* DEBUG */
  93
  94         FUNC(__divdf3)
  95 __divdf3_support: /* This label makes debugger output saner.  */
  96         .balign 4
  97 .Ldenorm_dbl1:
  98         brge r6, \
  99                 0x43500000,.Linf_NaN ; large number / denorm -> Inf
 100         bmsk.f r12,DBL1H,19
 101         mov.eq r12,DBL1L
 102         mov.eq DBL1L,0
 103         sub.eq r7,r7,32
 104         norm.f r11,r12 ; flag for x/0 -> Inf check
 105         beq_s .Linf_NaN
 106         mov.mi r11,0
 107         add.pl r11,r11,1
 108         add_s r12,r12,r12
 109         asl r8,r12,r11
 110         rsub r12,r11,31
 111         lsr r12,DBL1L,r12
 112         tst_s DBL1H,DBL1H
 113         or r8,r8,r12
 114         lsr r4,r8,26
 115         lsr DBL1H,r8,12
 116         ld.as r4,[r10,r4]
 117         bxor.mi DBL1H,DBL1H,31
 118         sub r11,r11,11
 119         asl DBL1L,DBL1L,r11
 120         sub r11,r11,1
 121         MPYHU r5,r4,r8
 122         sub r7,r7,r11
 123         asl r4,r4,12
 124         b.d .Lpast_denorm_dbl1
 125         asl r7,r7,20
 126         ; wb stall
 127
 128         .balign 4
 129 .Ldenorm_dbl0:
 130         bmsk.f r12,DBL0H,19
 131         ; wb stall
 132         mov.eq r12,DBL0L
 133         sub.eq r6,r6,32
 134         norm.f r11,r12 ; flag for 0/x -> 0 check
 135         brge r7, \
 136                 0x43500000, .Lret0_NaN ; denorm/large number -> 0
 137         beq_s .Lret0_NaN
 138         mov.mi r11,0
 139         add.pl r11,r11,1
 140         asl r12,r12,r11
 141         sub r6,r6,r11
 142         add.f 0,r6,31
 143         lsr r10,DBL0L,r6
 144         mov.mi r10,0
 145         add r6,r6,11+32
 146         neg.f r11,r6
 147         asl DBL0L,DBL0L,r11
 148         mov.pl DBL0L,0
 149         sub r6,r6,32-1
 150         b.d .Lpast_denorm_dbl0
 151         asl r6,r6,20
 152
 153 .Linf_NaN:
 154         tst_s DBL0L,DBL0L ; 0/0 -> NaN
 155         xor_s DBL1H,DBL1H,DBL0H
 156         bclr.eq.f DBL0H,DBL0H,31
 157         bmsk DBL0H,DBL1H,30
 158         xor_s DBL0H,DBL0H,DBL1H
 159         sub.eq DBL0H,DBL0H,1
 160         mov_s DBL0L,0
 161         j_s.d [blink]
 162         or DBL0H,DBL0H,r9
 163         .balign 4
 164 .Lret0_NaN:
 165         xor_s DBL1H,DBL1H,DBL0H
 166         cmp_s r12,r9
 167         mov_s DBL0L,0
 168         bmsk DBL0H,DBL1H,30
 169         xor_s DBL0H,DBL0H,DBL1H
 170         j_s.d [blink]
 171         sub.hi DBL0H,DBL0H,1
 172 .Linf_nan_dbl1: ; Inf/Inf -> NaN x/Inf-> 0 x/NaN -> NaN
 173         not_s DBL0L,DBL1H
 174         cmp r6,r9
 175         sub_s.ne DBL0L,DBL0L,DBL0L
 176         tst_s DBL0H,DBL0H
 177         add_s DBL0H,DBL1H,DBL0L
 178         j_s.d [blink]
 179         bxor.mi DBL0H,DBL0H,31
 180 .Linf_nan_dbl0:
 181         tst_s DBL1H,DBL1H
 182         j_s.d [blink]
 183         bxor.mi DBL0H,DBL0H,31
 184         .balign 4
 185         .global __divdf3
 186 /* N.B. the spacing between divtab and the add3 to get its address must
 187    be a multiple of 8.  */
 188 __divdf3:
 189         asl r8,DBL1H,12
 190         lsr r12,DBL1L,20
 191         lsr r4,r8,26
 192 #if defined (__ARCHS__) || defined (__ARCEM__)
 193         add3 r10,pcl,60 ; (.Ldivtab-.) >> 3
 194 #else
 195         add3 r10,pcl,59 ; (.Ldivtab-.) >> 3
 196 #endif
 197         ld.as r4,[r10,r4]
 198 #if defined (__ARCHS__) || defined (__ARCEM__)
 199         ld.as r9,[pcl,182]; [pcl,(-((.-.L7ff00000) >> 2))] ; 0x7ff00000
 200 #else
 201         ld.as r9,[pcl,180]; [pcl,(-((.-.L7ff00000) >> 2))] ; 0x7ff00000
 202 #endif
 203         or r8,r8,r12
 204         MPYHU r5,r4,r8
 205         and.f r7,DBL1H,r9
 206         asl r4,r4,12 ; having the asl here is a concession to the XMAC pipeline.
 207         beq.d .Ldenorm_dbl1
 208         and r6,DBL0H,r9
 209 .Lpast_denorm_dbl1: ; wb stall
 210         sub r4,r4,r5
 211         MPYHU r5,r4,r4
 212         breq.d r6,0,.Ldenorm_dbl0
 213         lsr r8,r8,1
 214         asl r12,DBL0H,11
 215         lsr r10,DBL0L,21
 216 .Lpast_denorm_dbl0: ; wb stall
 217         bset r8,r8,31
 218         MPYHU r11,r5,r8
 219         add_s r12,r12,r10
 220         bset r5,r12,31
 221         cmp r5,r8
 222         cmp.eq DBL0L,DBL1L
 223         ; wb stall
 224         lsr.cc r5,r5,1
 225         sub r4,r4,r11 ; u1.31 inverse, about 30 bit
 226         MPYHU r11,r5,r4 ; result fraction highpart
 227         breq r7,r9,.Linf_nan_dbl1
 228         lsr r8,r8,2 ; u3.29
 229         add r5,r6, /* wait for immediate /  XMAC wb stall */ \
 230                 0x3fe00000
 231         ; wb stall (not for XMAC)
 232         breq r6,r9,.Linf_nan_dbl0
 233         mpyu r12,r11,r8 ; u-28.31
 234         asl_s DBL1L,DBL1L,9 ; u-29.23:9
 235         sbc r6,r5,r7
 236         ; resource conflict (not for XMAC)
 237         MPYHU r5,r11,DBL1L ; u-28.23:9
 238         add.cs DBL0L,DBL0L,DBL0L
 239         asl_s DBL0L,DBL0L,6 ; u-26.25:7
 240         asl r10,r11,23
 241         sub_l DBL0L,DBL0L,r12
 242         ; wb stall (before 'and' for XMAC)
 243         lsr r7,r11,9
 244         sub r5,DBL0L,r5 ; rest msw ; u-26.31:0
 245         MPYH r12,r5,r4 ; result fraction lowpart
 246         xor.f 0,DBL0H,DBL1H
 247         and DBL0H,r6,r9
 248         add_s DBL0H,DBL0H,r7 ; (XMAC wb stall)
 249         bxor.mi DBL0H,DBL0H,31
 250         brhs r6, /*  wb stall / wait for immediate */ \
 251                 0x7fe00000,.Linf_denorm
 252         add.f r12,r12,0x11
 253         asr r9,r12,5
 254         sub.mi DBL0H,DBL0H,1
 255         add.f DBL0L,r9,r10
 256         tst r12,0x1c
 257         jne.d [blink]
 258         add.cs DBL0H,DBL0H,1
 259         /* work out exact rounding if we fall through here.  */
 260         /* We know that the exact result cannot be represented in double
 261            precision.  Find the mid-point between the two nearest
 262            representable values, multiply with the divisor, and check if
 263            the result is larger than the dividend.  Since we want to know
 264            only the sign bit, it is sufficient to calculate only the
 265            highpart of the lower 64 bits.  */
 266         sub.f DBL0L,DBL0L,1
 267         asl r12,r9,2 ; u-22.30:2
 268         mpyu r10,r11,DBL1L ; rest before considering r12 in r5 : -r10
 269         sub.cs DBL0H,DBL0H,1
 270         sub.f r12,r12,2
 271         ; resource conflict (not for XMAC)
 272         MPYHU r7,r12,DBL1L ; u-51.32
 273         asl r5,r5,25 ; s-51.7:25
 274         lsr r10,r10,7 ; u-51.30:2
 275         ; resource conflict (not for XMAC)
 276         ; resource conflict (not for XMAC)
 277         mpyu r9,r12,r8 ; u-51.31:1
 278         sub r5,r5,r10
 279         add.mi r5,r5,DBL1L ; signed multiply adjust for r12*DBL1L
 280         bset r7,r7,0 ; make sure that the result is not zero, and that
 281         ; wb stall (one earlier for XMAC)
 282         sub r5,r5,r7 ; a highpart zero appears negative
 283         sub.f r5,r5,r9 ; rest msw
 284         add.pl.f DBL0L,DBL0L,1
 285         j_s.d [blink]
 286         add.eq DBL0H,DBL0H,1
 287
 288         .balign 4
 289 .Linf_denorm:
 290         brlo r6,0xc0000000,.Linf
 291 .Ldenorm:
 292         asr r6,r6,20
 293         neg r9,r6
 294         mov_s DBL0H,0
 295         brhs.d r9,54,.Lret0
 296         bxor.mi DBL0H,DBL0H,31
 297         add_l r12,r12,1
 298         and r12,r12,-4
 299         rsub r7,r6,5
 300         asr r10,r12,28
 301         bmsk r4,r12,27
 302 #if defined (__ARCHS__) || defined (__ARCEM__)
 303         min  r7, r7, 31
 304         asr  DBL0L, r4, r7
 305 #else
 306         asrs DBL0L,r4,r7
 307 #endif
 308         add DBL1H,r11,r10
 309 #if defined (__ARCHS__) || defined (__ARCEM__)
 310         abs.f r10, r4
 311         sub.mi r10, r10, 1
 312 #endif
 313         add.f r7,r6,32-5
 314 #ifdef __ARC700__
 315         abss r10,r4
 316 #endif
 317         asl r4,r4,r7
 318         mov.mi r4,r10
 319         add.f r10,r6,23
 320         rsub r7,r6,9
 321         lsr r7,DBL1H,r7
 322         asl r10,DBL1H,r10
 323         or.pnz DBL0H,DBL0H,r7
 324         or.mi r4,r4,r10
 325         mov.mi r10,r7
 326         add.f DBL0L,r10,DBL0L
 327         add.cs.f DBL0H,DBL0H,1 ; carry clear after this point
 328         bxor.f 0,r4,31
 329         add.pnz.f DBL0L,DBL0L,1
 330         add.cs.f DBL0H,DBL0H,1
 331         jne_l [blink]
 332         /* Calculation so far was not conclusive; calculate further rest.  */
 333         mpyu r11,r11,DBL1L ; rest before considering r12 in r5 : -r11
 334         asr.f r12,r12,3
 335         asl r5,r5,25 ; s-51.7:25
 336         ; resource conflict (not for XMAC)
 337         mpyu DBL1H,r12,r8 ; u-51.31:1
 338         and r9,DBL0L,1 ; tie-breaker: round to even
 339         lsr r11,r11,7 ; u-51.30:2
 340         ; resource conflict (not for XMAC)
 341         MPYHU r8,r12,DBL1L ; u-51.32
 342         sub.mi r11,r11,DBL1L ; signed multiply adjust for r12*DBL1L
 343         add_s DBL1H,DBL1H,r11
 344         ; resource conflict (not for XMAC)
 345         ; resource conflict (not for XMAC)
 346         mpyu r12,r12,DBL1L ; u-83.30:2
 347         sub DBL1H,DBL1H,r5 ; -rest msw
 348         add_s DBL1H,DBL1H,r8 ; -rest msw
 349         add.f 0,DBL1H,DBL1H ; can't ror.f by 32 :-(
 350         ; wb stall (XMAC: Before add.f)
 351         tst_s DBL1H,DBL1H
 352         cmp.eq r12,r9
 353         add.cs.f DBL0L,DBL0L,1
 354         j_s.d [blink]
 355         add.cs DBL0H,DBL0H,1
 356
 357 .Lret0:
 358         /* return +- 0 */
 359         j_s.d [blink]
 360         mov_s DBL0L,0
 361 .Linf:
 362         mov_s DBL0H,r9
 363         mov_s DBL0L,0
 364         j_s.d [blink]
 365         bxor.mi DBL0H,DBL0H,31
 366
 367         .balign 4
 368 .Ldivtab:
 369         .long 0xfc0fffe1
 370         .long 0xf46ffdfb
 371         .long 0xed1ffa54
 372         .long 0xe61ff515
 373         .long 0xdf7fee75
 374         .long 0xd91fe680
 375         .long 0xd2ffdd52
 376         .long 0xcd1fd30c
 377         .long 0xc77fc7cd
 378         .long 0xc21fbbb6
 379         .long 0xbcefaec0
 380         .long 0xb7efa100
 381         .long 0xb32f92bf
 382         .long 0xae8f83b7
 383         .long 0xaa2f7467
 384         .long 0xa5ef6479
 385         .long 0xa1cf53fa
 386         .long 0x9ddf433e
 387         .long 0x9a0f3216
 388         .long 0x965f2091
 389         .long 0x92df0f11
 390         .long 0x8f6efd05
 391         .long 0x8c1eeacc
 392         .long 0x88eed876
 393         .long 0x85dec615
 394         .long 0x82eeb3b9
 395         .long 0x800ea10b
 396         .long 0x7d3e8e0f
 397         .long 0x7a8e7b3f
 398         .long 0x77ee6836
 399         .long 0x756e5576
 400         .long 0x72fe4293
 401         .long 0x709e2f93
 402         .long 0x6e4e1c7f
 403         .long 0x6c0e095e
 404         .long 0x69edf6c5
 405         .long 0x67cde3a5
 406         .long 0x65cdd125
 407         .long 0x63cdbe25
 408         .long 0x61ddab3f
 409         .long 0x600d991f
 410         .long 0x5e3d868c
 411         .long 0x5c6d7384
 412         .long 0x5abd615f
 413         .long 0x590d4ecd
 414         .long 0x576d3c83
 415         .long 0x55dd2a89
 416         .long 0x545d18e9
 417         .long 0x52dd06e9
 418         .long 0x516cf54e
 419         .long 0x4ffce356
 420         .long 0x4e9cd1ce
 421         .long 0x4d3cbfec
 422         .long 0x4becae86
 423         .long 0x4aac9da4
 424         .long 0x496c8c73
 425         .long 0x483c7bd3
 426         .long 0x470c6ae8
 427         .long 0x45dc59af
 428         .long 0x44bc4915
 429         .long 0x43ac3924
 430         .long 0x428c27fb
 431         .long 0x418c187a
 432         .long 0x407c07bd
 433 .L7ff00000:
 434         .long 0x7ff00000
 435         ENDFUNC(__divdf3)