gcc/config/sh/lib1funcs-Os-4-200.asm

   1 /* Copyright (C) 2006 Free Software Foundation, Inc.
   2
   3 This file is free software; you can redistribute it and/or modify it
   4 under the terms of the GNU General Public License as published by the
   5 Free Software Foundation; either version 2, or (at your option) any
   6 later version.
   7
   8 In addition to the permissions in the GNU General Public License, the
   9 Free Software Foundation gives you unlimited permission to link the
  10 compiled version of this file into combinations with other programs,
  11 and to distribute those combinations without any restriction coming
  12 from the use of this file.  (The General Public License restrictions
  13 do apply in other respects; for example, they cover modification of
  14 the file, and distribution when not linked into a combine
  15 executable.)
  16
  17 This file is distributed in the hope that it will be useful, but
  18 WITHOUT ANY WARRANTY; without even the implied warranty of
  19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  20 General Public License for more details.
  21
  22 You should have received a copy of the GNU General Public License
  23 along with this program; see the file COPYING.  If not, write to
  24 the Free Software Foundation, 51 Franklin Street, Fifth Floor,
  25 Boston, MA 02110-1301, USA.  */
  26
  27 /* Moderately Space-optimized libgcc routines for the Renesas SH /
  28    STMicroelectronics ST40 CPUs.
  29    Contributed by J"orn Rennecke joern.rennecke@st.com.  */
  30
  31 #include "lib1funcs.h"
  32
  33 #if !__SHMEDIA__
  34 #ifdef L_udivsi3_i4i
  35
  36 /* 88 bytes; sh4-200 cycle counts:
  37    divisor  >= 2G: 11 cycles
  38    dividend <  2G: 48 cycles
  39    dividend >= 2G, divisor != 1: 54 cycles
  40    dividend >= 2G, divisor == 1: 22 cycles */
  41 #if defined (__SH_FPU_DOUBLE__) || defined (__SH4_SINGLE_ONLY__)
  42 !! args in r4 and r5, result in r0, clobber r1
     /* udivsi3_i4i (FPU variant): unsigned 32-bit division, r0 = r4 / r5.
        FPSCR (via r1), FPUL, dr0 and dr2 are saved on the stack and restored
        before returning.  Operands are converted with the signed "float"
        instruction, so a dividend with bit 31 set is corrected by adding
        2^32 (the .double stored after L1), and a divisor with bit 31 set is
        handled in huge_divisor without any FPU arithmetic.
        NOTE(review): FMOVD_WORKS and DR00/DR01/DR20/DR21 are not defined in
        this file; presumably they come from lib1funcs.h and name the
        single-precision halves of dr0 and dr2 -- verify there.  */
  43
  44         .global GLOBAL(udivsi3_i4i)
  45         FUNC(GLOBAL(udivsi3_i4i))
  46 GLOBAL(udivsi3_i4i):
  47         mova L1,r0  /* r0 = address of the constant block at L1 */
  48         cmp/pz r5  /* T = divisor has bit 31 clear */
  49         sts fpscr,r1  /* r1 = caller's FPSCR */
  50         lds.l @r0+,fpscr  /* FPSCR = word at L1; r0 now points at the .double */
  51         sts.l fpul,@-r15  /* push caller's FPUL */
  52         bf LOCAL(huge_divisor)  /* divisor >= 2^31: quotient is 0 or 1 */
  53         mov.l r1,@-r15  /* push caller's FPSCR */
  54         lds r4,fpul
  55         cmp/pz r4  /* T = dividend has bit 31 clear (signed conversion needs no fix) */
  56 #ifdef FMOVD_WORKS
  57         fmov.d dr0,@-r15  /* push dr0 */
  58         float fpul,dr0  /* dr0 = dividend converted as a signed integer */
  59         fmov.d dr2,@-r15  /* push dr2 */
  60         bt LOCAL(dividend_adjusted)  /* T still from cmp/pz r4 */
  61         mov #1,r1
  62         fmov.d @r0,dr2  /* dr2 = 4294967296.0 */
  63         cmp/eq r1,r5
  64         bt LOCAL(div_by_1)  /* divisor == 1: skip both the bias and the divide */
  65         fadd dr2,dr0  /* add 2^32 so dr0 holds the unsigned dividend value */
  66 LOCAL(dividend_adjusted):
  67         lds r5,fpul
  68         float fpul,dr2  /* dr2 = divisor (bit 31 is clear on this path) */
  69         fdiv dr2,dr0  /* dr0 = dividend / divisor */
  70 LOCAL(div_by_1):
             /* When reached by the branch, dr0 still holds the dividend as a
                negative value; ftrc turns it back into the same 32-bit
                pattern.  Presumably this case is special because the true
                quotient would not fit the signed range of ftrc.  */
  71         fmov.d @r15+,dr2  /* pop dr2 */
  72         ftrc dr0,fpul  /* FPUL = quotient truncated to an integer */
  73         fmov.d @r15+,dr0  /* pop dr0 */
  74 #else /* !FMOVD_WORKS */
  75         fmov.s DR01,@-r15  /* push dr0 as two single-precision words */
  76         mov #1,r1
  77         fmov.s DR00,@-r15
  78         float fpul,dr0  /* dr0 = dividend converted as a signed integer */
  79         fmov.s DR21,@-r15  /* push dr2 as two single-precision words */
  80         bt/s LOCAL(dividend_adjusted)  /* T still from cmp/pz r4 */
  81         fmov.s DR20,@-r15  /* delay slot: executed whether or not the branch is taken */
  82         cmp/eq r1,r5
  83         bt LOCAL(div_by_1)  /* divisor == 1: skip both the bias and the divide */
  84         fmov.s @r0+,DR20  /* load the 4294967296.0 constant into dr2, one word at a time */
  85         fmov.s @r0,DR21
  86         fadd dr2,dr0  /* add 2^32 so dr0 holds the unsigned dividend value */
  87 LOCAL(dividend_adjusted):
  88         lds r5,fpul
  89         float fpul,dr2  /* dr2 = divisor (bit 31 is clear on this path) */
  90         fdiv dr2,dr0  /* dr0 = dividend / divisor */
  91 LOCAL(div_by_1):
  92         fmov.s @r15+,DR20  /* pop dr2 */
  93         fmov.s @r15+,DR21
  94         ftrc dr0,fpul  /* FPUL = quotient truncated to an integer */
  95         fmov.s @r15+,DR00  /* pop dr0 */
  96         fmov.s @r15+,DR01
  97 #endif /* !FMOVD_WORKS */
  98         lds.l @r15+,fpscr  /* restore caller's FPSCR */
  99         sts fpul,r0  /* r0 = quotient */
 100         rts
 101         lds.l @r15+,fpul  /* delay slot: restore caller's FPUL */
 102
 103 #ifdef FMOVD_WORKS
 104         .p2align 3        ! make double below 8 byte aligned.
 105 #endif
 106 LOCAL(huge_divisor):
             /* Entered with only FPUL pushed; FPUL itself has not been
                modified on this path.  */
 107         lds r1,fpscr  /* restore caller's FPSCR from r1 */
 108         add #4,r15  /* drop the saved FPUL word without reloading it */
 109         cmp/hs r5,r4  /* T = (dividend >= divisor), unsigned */
 110         rts
 111         movt r0  /* delay slot: r0 = T, i.e. quotient 0 or 1 */
 112
 113         .p2align 2  /* L1 is the target of mova, which needs a 4-byte aligned address */
 114 L1:
             /* First word: value loaded into FPSCR on entry.  0x80000 is bit 19;
                0x180000 is bits 19 and 20.  NOTE(review): in the SH-4 FPSCR these
                should be PR (double-precision arithmetic) and SZ (64-bit fmov)
                -- confirm against the SH-4 manual.  */
 115 #ifndef FMOVD_WORKS
 116         .long 0x80000
 117 #else
 118         .long 0x180000
 119 #endif
 120         .double 4294967296  /* 2^32, the bias added to a dividend with bit 31 set */
 121
 122         ENDFUNC(GLOBAL(udivsi3_i4i))
 123 #elif !defined (__sh1__)  /* !__SH_FPU_DOUBLE__ */
 124
 125 #if 0
 126 /* With 36 bytes, the following would probably be the most compact
 127    implementation, but with 139 cycles on an sh4-200, it is extremely slow.  */
     /* Disabled reference variant: a loop performing one div1 step per
        iteration.  r2 and r3 are saved on the stack and restored on exit.
        r2 doubles as loop counter: the T=1 produced by sett enters bit 31 of
        r2 on the first rotcr and moves one position right per iteration; the
        loop ends when rotcr shifts that marker bit out into T.  */
 128 GLOBAL(udivsi3_i4i):
 129         mov.l r2,@-r15  /* push r2 */
 130         mov #0,r1  /* r1 = 0: destination register of the div1 steps */
 131         div0u
 132         mov r1,r2  /* r2 = 0 */
 133         mov.l r3,@-r15  /* push r3 */
 134         mov r1,r3  /* r3 = 0: constant compared against r2 in the loop */
 135         sett  /* T = 1: becomes the marker bit in r2 */
 136         mov r4,r0  /* r0 = dividend; rotcl feeds its bits out and result bits in */
 137 LOCAL(loop):
 138         rotcr r2  /* T goes into bit 31 of r2; old bit 0 of r2 goes to T */
 139         ;
 140         bt/s LOCAL(end)  /* taken once the marker bit has been rotated out of r2 */
 141         cmp/gt r2,r3  /* delay slot: T = (0 > r2) signed, i.e. bit 31 of r2 */
 142         rotcl r0
 143         bra LOCAL(loop)
 144         div1 r5,r1  /* delay slot: one division step */
 145 LOCAL(end):
 146         rotcl r0
 147         mov.l @r15+,r3  /* pop r3 */
 148         rts
 149         mov.l @r15+,r2  /* delay slot: pop r2 */
 150 #endif /* 0 */
 151
 152 /* Size: 186 bytes jointly for udivsi3_i4i and sdivsi3_i4i
 153    sh4-200 run times:
 154    udiv small divisor: 55 cycles
 155    udiv large divisor: 52 cycles
 156    sdiv small divisor, positive result: 59 cycles
 157    sdiv large divisor, positive result: 56 cycles
 158    sdiv small divisor, negative result: 65 cycles (*)
 159    sdiv large divisor, negative result: 62 cycles (*)
 160    (*): r2 is restored in the rts delay slot and has a lingering latency
 161         of two more cycles.  */
 162         .balign 4
 163         .global GLOBAL(udivsi3_i4i)
 164         FUNC(GLOBAL(udivsi3_i4i))
 165         FUNC(GLOBAL(sdivsi3_i4i))
     /* udivsi3_i4i (integer variant): unsigned r0 = r4 / r5 built from div1
        steps.  r4 and r5 are pushed on entry and popped before returning.
        pr is copied to r1 because the bsr calls to the local helpers
        (div6, div7, divx3) overwrite pr; the return is made with jmp @r1.
        A divisor that fits in 16 bits takes the sdiv_small_divisor path
        (two groups of 16 div1 steps); any other divisor takes the
        large_divisor path (16 steps).  sdiv_small_divisor and
        sdiv_large_divisor are also branch targets of sdivsi3_i4i, which
        arrives with r4/r5 already pushed and r1 already set.  */
 166 GLOBAL(udivsi3_i4i):
 167         sts pr,r1  /* r1 = return address */
 168         mov.l r4,@-r15  /* push r4 */
 169         extu.w r5,r0
 170         cmp/eq r5,r0  /* T = divisor fits in 16 bits */
 171         swap.w r4,r0  /* r0 = dividend with its 16-bit halves swapped */
 172         shlr16 r4  /* r4 = upper 16 bits of the dividend */
 173         bf/s LOCAL(large_divisor)
 174         div0u  /* delay slot: reset the division state on both paths */
 175         mov.l r5,@-r15  /* push r5 */
 176         shll16 r5  /* move the 16-bit divisor into the upper half of r5 */
 177 LOCAL(sdiv_small_divisor):
             /* First group of 16 div1 steps: (1 + 1 + 6) twice.  */
 178         div1 r5,r4
 179         bsr LOCAL(div6)
 180         div1 r5,r4  /* delay slot */
 181         div1 r5,r4
 182         bsr LOCAL(div6)
 183         div1 r5,r4  /* delay slot */
 184         xtrct r4,r0  /* r0 = (r4 << 16) | (r0 >> 16) */
 185         xtrct r0,r4  /* r4 = (r0 << 16) | (r4 >> 16) */
             /* Second group of 16 div1 steps: 7, then 1 + 1 + 7.  */
 186         bsr LOCAL(div7)
 187         swap.w r4,r4  /* delay slot: swap the 16-bit halves of r4 */
 188         div1 r5,r4
 189         bsr LOCAL(div7)
 190         div1 r5,r4  /* delay slot */
 191         xtrct r4,r0  /* r0 = (r4 << 16) | (r0 >> 16) */
 192         mov.l @r15+,r5  /* pop r5 */
 193         swap.w r0,r0  /* swap the 16-bit halves of r0 */
 194         mov.l @r15+,r4  /* pop r4 */
 195         jmp @r1  /* return (pr was clobbered by bsr) */
 196         rotcl r0  /* delay slot: shift the T bit of the last div1 into r0 */
     /* Helpers reached with bsr: div7 runs seven div1 steps, div6 runs six;
        the last step of each sits in the rts delay slot.  */
 197 LOCAL(div7):
 198         div1 r5,r4
 199 LOCAL(div6):
 200                     div1 r5,r4; div1 r5,r4; div1 r5,r4
 201         div1 r5,r4; div1 r5,r4; rts;        div1 r5,r4
 202
     /* Helper reached with bsr: three rotcl/div1 pairs, the last div1 in the
        rts delay slot.  */
 203 LOCAL(divx3):
 204         rotcl r0
 205         div1 r5,r4
 206         rotcl r0
 207         div1 r5,r4
 208         rotcl r0
 209         rts
 210         div1 r5,r4  /* delay slot */
 211
 212 LOCAL(large_divisor):
 213         mov.l r5,@-r15  /* push r5 */
 214 LOCAL(sdiv_large_divisor):
 215         xor r4,r0  /* clears the low 16 bits of r0, which equal r4 after swap.w/shlr16 */
             /* Each repetition is rotcl+div1 followed by three more pairs in
                divx3: 4 steps, 16 in total.  */
 216         .rept 4
 217         rotcl r0
 218         bsr LOCAL(divx3)
 219         div1 r5,r4  /* delay slot */
 220         .endr
 221         mov.l @r15+,r5  /* pop r5 */
 222         mov.l @r15+,r4  /* pop r4 */
 223         jmp @r1  /* return (pr was clobbered by bsr) */
 224         rotcl r0  /* delay slot: shift the T bit of the last div1 into r0 */
 225         ENDFUNC(GLOBAL(udivsi3_i4i))
 226
 227         .global GLOBAL(sdivsi3_i4i)
     /* sdivsi3_i4i (integer variant): signed r0 = r4 / r5.  The original r4
        and r5 are pushed first, negative operands are negated, and control
        joins the unsigned code at sdiv_small_divisor / sdiv_large_divisor,
        which pops r4/r5 and leaves through jmp @r1.
        Same-sign operands: r1 = pr, so that jump returns to the caller.
        Opposite-sign operands: r1 = negate_result, which negates r0 and
        returns through r2; the caller's r2 is parked in MACL meanwhile, so
        MACL is overwritten on that path.  */
 228 GLOBAL(sdivsi3_i4i):
 229         mov.l r4,@-r15  /* push r4 */
 230         cmp/pz r5  /* T = divisor >= 0 */
 231         mov.l r5,@-r15  /* push r5 */
 232         bt/s LOCAL(pos_divisor)
 233         cmp/pz r4  /* delay slot: T = dividend >= 0, on both paths */
 234         neg r5,r5  /* divisor was negative: take its magnitude */
 235         extu.w r5,r0
 236         bt/s LOCAL(neg_result)  /* dividend >= 0, divisor < 0 */
 237         cmp/eq r5,r0  /* delay slot: T = divisor magnitude fits in 16 bits */
 238         neg r4,r4  /* both operands negative: negate dividend, result stays positive */
 239 LOCAL(pos_result):
 240         swap.w r4,r0  /* r0 = dividend with its 16-bit halves swapped */
 241         bra LOCAL(sdiv_check_divisor)
 242         sts pr,r1  /* delay slot: r1 = return address */
 243 LOCAL(pos_divisor):
 244         extu.w r5,r0
 245         bt/s LOCAL(pos_result)  /* dividend >= 0 as well */
 246         cmp/eq r5,r0  /* delay slot: T = divisor fits in 16 bits */
 247         neg r4,r4  /* dividend was negative: take its magnitude */
 248 LOCAL(neg_result):
 249         mova LOCAL(negate_result),r0
 250         ;
 251         mov r0,r1  /* r1 = address of negate_result, used by the final jmp @r1 */
 252         swap.w r4,r0  /* r0 = dividend with its 16-bit halves swapped */
 253         lds r2,macl  /* park the caller's r2 in MACL */
 254         sts pr,r2  /* r2 = return address */
 255 LOCAL(sdiv_check_divisor):
             /* T still holds the result of the cmp/eq r5,r0 executed above.  */
 256         shlr16 r4  /* r4 = upper 16 bits of the dividend */
 257         bf/s LOCAL(sdiv_large_divisor)
 258         div0u  /* delay slot: reset the division state on both paths */
 259         bra LOCAL(sdiv_small_divisor)
 260         shll16 r5  /* delay slot: move the 16-bit divisor into the upper half of r5 */
 261         .balign 4  /* negate_result is the target of mova, which needs a 4-byte aligned address */
 262 LOCAL(negate_result):
 263         neg r0,r0  /* quotient of the magnitudes -> negative result */
 264         jmp @r2  /* return to the caller */
 265         sts macl,r2  /* delay slot: restore the caller's r2 */
 266         ENDFUNC(GLOBAL(sdivsi3_i4i))
 267 #endif /* !__SH_FPU_DOUBLE__ */
 268 #endif /* L_udivsi3_i4i */
 269
 270 #ifdef L_sdivsi3_i4i
 271 #if defined (__SH_FPU_DOUBLE__) || defined (__SH4_SINGLE_ONLY__)
 272 /* 48 bytes, 45 cycles on sh4-200  */
 273 !! args in r4 and r5, result in r0, clobber r1
     /* sdivsi3_i4i (FPU variant): signed r0 = r4 / r5.  Both operands are
        converted with "float", divided with fdiv and truncated back with
        ftrc.  FPSCR, dr0 and dr2 are saved on the stack; FPUL is kept in r1.
        All of them are restored before returning.
        NOTE(review): FMOVD_WORKS and DR00/DR01/DR20/DR21 are not defined in
        this file; presumably they come from lib1funcs.h -- verify there.  */
 274
 275         .global GLOBAL(sdivsi3_i4i)
 276         FUNC(GLOBAL(sdivsi3_i4i))
 277 GLOBAL(sdivsi3_i4i):
 278         sts.l fpscr,@-r15  /* push caller's FPSCR */
 279         sts fpul,r1  /* r1 = caller's FPUL */
 280         mova L1,r0  /* r0 = address of the FPSCR constant at L1 */
 281         lds.l @r0+,fpscr  /* FPSCR = word at L1 */
 282         lds r4,fpul
 283 #ifdef FMOVD_WORKS
 284         fmov.d dr0,@-r15  /* push dr0 */
 285         float fpul,dr0  /* dr0 = dividend */
 286         lds r5,fpul
 287         fmov.d dr2,@-r15  /* push dr2 */
 288 #else
 289         fmov.s DR01,@-r15  /* push dr0 as two single-precision words */
 290         fmov.s DR00,@-r15
 291         float fpul,dr0  /* dr0 = dividend */
 292         lds r5,fpul
 293         fmov.s DR21,@-r15  /* push dr2 as two single-precision words */
 294         fmov.s DR20,@-r15
 295 #endif
 296         float fpul,dr2  /* dr2 = divisor */
 297         fdiv dr2,dr0  /* dr0 = dividend / divisor */
 298 #ifdef FMOVD_WORKS
 299         fmov.d @r15+,dr2  /* pop dr2 */
 300 #else
 301         fmov.s @r15+,DR20  /* pop dr2 */
 302         fmov.s @r15+,DR21
 303 #endif
 304         ftrc dr0,fpul  /* FPUL = quotient truncated to an integer */
 305 #ifdef FMOVD_WORKS
 306         fmov.d @r15+,dr0  /* pop dr0 */
 307 #else
 308         fmov.s @r15+,DR00  /* pop dr0 */
 309         fmov.s @r15+,DR01
 310 #endif
 311         lds.l @r15+,fpscr  /* restore caller's FPSCR */
 312         sts fpul,r0  /* r0 = quotient */
 313         rts
 314         lds r1,fpul  /* delay slot: restore caller's FPUL */
 315
 316         .p2align 2  /* L1 is the target of mova, which needs a 4-byte aligned address */
 317 L1:
             /* Value loaded into FPSCR on entry.  0x80000 is bit 19; 0x180000
                is bits 19 and 20.  NOTE(review): in the SH-4 FPSCR these should
                be PR (double-precision arithmetic) and SZ (64-bit fmov) --
                confirm against the SH-4 manual.  */
 318 #ifndef FMOVD_WORKS
 319         .long 0x80000
 320 #else
 321         .long 0x180000
 322 #endif
 323
 324         ENDFUNC(GLOBAL(sdivsi3_i4i))
 325 #endif /* __SH_FPU_DOUBLE__ */
 326 #endif /* L_sdivsi3_i4i */
 327 #endif /* !__SHMEDIA__ */