/* Repository-viewer header kept from the page this listing was copied from:
     * config/sh/lib1funcs-Os-4-200.asm: Guard entire file with [truncated]
     [official-gcc.git] / gcc / config / sh / lib1funcs-Os-4-200.asm
     blob e016bed25880ad2b5e5fddb6b457462d5d5b7eaf  */
/* Copyright (C) 2006 Free Software Foundation, Inc.

This file is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 2, or (at your option) any
later version.

In addition to the permissions in the GNU General Public License, the
Free Software Foundation gives you unlimited permission to link the
compiled version of this file into combinations with other programs,
and to distribute those combinations without any restriction coming
from the use of this file. (The General Public License restrictions
do apply in other respects; for example, they cover modification of
the file, and distribution when not linked into a combine
executable.)

This file is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; see the file COPYING. If not, write to
the Free Software Foundation, 51 Franklin Street, Fifth Floor,
Boston, MA 02110-1301, USA. */
/* Moderately Space-optimized libgcc routines for the Renesas SH /
   STMicroelectronics ST40 CPUs.
   Contributed by J"orn Rennecke joern.rennecke@st.com. */
31 #include "lib1funcs.h"
33 #if !__SHMEDIA__
34 #ifdef L_udivsi3_i4i
/* 88 bytes; sh4-200 cycle counts:
   divisor >= 2G: 11 cycles
   dividend < 2G: 48 cycles
   dividend >= 2G, divisor != 1: 54 cycles
   dividend >= 2G, divisor == 1: 22 cycles */
41 #if defined (__SH_FPU_DOUBLE__) || defined (__SH4_SINGLE_ONLY__)
!! args in r4 and r5, result in r0, clobber r1
/* udivsi3_i4i, FPU variant: r0 = r4 / r5 (unsigned), computed with one
   double-precision fdiv.

   The int-to-double conversion (float fpul,drN) interprets FPUL as a
   signed integer (see the SH-4 manual), so operands with bit 31 set are
   handled specially:
     - divisor with bit 31 set: the quotient can only be 0 or 1, so it is
       produced by an unsigned compare (huge_divisor) without touching the
       FP data registers;
     - dividend with bit 31 set: 2^32 (the .double stored after L1) is
       added to the converted value before dividing;
     - dividend with bit 31 set and divisor == 1: the adjustment and the
       divide are both skipped and the converted dividend is truncated
       straight back, presumably because the true quotient (>= 2^31) is
       outside the signed range that ftrc converts to.

   fpscr, fpul, dr0 and dr2 are saved on the stack and restored before
   returning.  The word at L1 is the fpscr value used while dividing:
   0x180000 when FMOVD_WORKS is defined, 0x80000 otherwise (per the SH-4
   manual bit 19 is PR and bit 20 is SZ -- confirm).  DR00/DR01/DR20/DR21
   are not defined in this file; they are presumably the single-precision
   halves of dr0/dr2 supplied by lib1funcs.h.

   Each rts below is followed by its delay-slot instruction, which still
   executes before control returns to the caller.  */

	.global	GLOBAL(udivsi3_i4i)
	FUNC(GLOBAL(udivsi3_i4i))
GLOBAL(udivsi3_i4i):
	mova	L1,r0			! r0 -> fpscr word and 2^32 constant at L1
	cmp/pz	r5			! T = divisor has bit 31 clear
	sts	fpscr,r1		! r1 = caller fpscr
	lds.l	@r0+,fpscr		! switch fpscr; r0 now -> the double
	sts.l	fpul,@-r15		! save caller fpul
	bf	LOCAL(huge_divisor)
	mov.l	r1,@-r15		! save caller fpscr
	lds	r4,fpul
	cmp/pz	r4			! T = dividend has bit 31 clear
#ifdef FMOVD_WORKS
	fmov.d	dr0,@-r15
	float	fpul,dr0		! dr0 = dividend, converted as signed
	fmov.d	dr2,@-r15
	bt	LOCAL(dividend_adjusted)
	mov	#1,r1
	fmov.d	@r0,dr2			! dr2 = 2^32
	cmp/eq	r1,r5
	bt	LOCAL(div_by_1)
	fadd	dr2,dr0			! dr0 = dividend as an unsigned value
LOCAL(dividend_adjusted):
	lds	r5,fpul
	float	fpul,dr2		! dr2 = divisor
	fdiv	dr2,dr0
LOCAL(div_by_1):
	fmov.d	@r15+,dr2
	ftrc	dr0,fpul		! truncate quotient to integer
	fmov.d	@r15+,dr0
#else /* !FMOVD_WORKS */
	fmov.s	DR01,@-r15
	mov	#1,r1
	fmov.s	DR00,@-r15
	float	fpul,dr0		! dr0 = dividend, converted as signed
	fmov.s	DR21,@-r15
	bt/s	LOCAL(dividend_adjusted)
	 fmov.s	DR20,@-r15		! delay slot
	cmp/eq	r1,r5
	bt	LOCAL(div_by_1)
	fmov.s	@r0+,DR20		! dr2 = 2^32, loaded one half at a time
	fmov.s	@r0,DR21
	fadd	dr2,dr0			! dr0 = dividend as an unsigned value
LOCAL(dividend_adjusted):
	lds	r5,fpul
	float	fpul,dr2		! dr2 = divisor
	fdiv	dr2,dr0
LOCAL(div_by_1):
	fmov.s	@r15+,DR20
	fmov.s	@r15+,DR21
	ftrc	dr0,fpul		! truncate quotient to integer
	fmov.s	@r15+,DR00
	fmov.s	@r15+,DR01
#endif /* !FMOVD_WORKS */
	lds.l	@r15+,fpscr		! restore caller fpscr
	sts	fpul,r0			! result
	rts
	 lds.l	@r15+,fpul		! delay slot: restore caller fpul

#ifdef FMOVD_WORKS
	.p2align 3	! make double below 8 byte aligned.
#endif
LOCAL(huge_divisor):
	lds	r1,fpscr		! restore caller fpscr
	add	#4,r15			! drop the saved fpul; fpul is unchanged here
	cmp/hs	r5,r4			! T = (dividend >= divisor), unsigned
	rts
	 movt	r0			! delay slot: quotient = T (0 or 1)

	.p2align 2
L1:
#ifndef FMOVD_WORKS
	.long	0x80000
#else
	.long	0x180000
#endif
	.double	4294967296

	ENDFUNC(GLOBAL(udivsi3_i4i))
123 #elif !defined (__sh1__) /* !__SH_FPU_DOUBLE__ */
#if 0
/* With 36 bytes, the following would probably be the most compact
   implementation, but with 139 cycles on an sh4-200, it is extremely slow.

   Disabled bit-serial variant.  r0 starts as the dividend and is rotated
   through T one bit per pass while div1 works on r5 and r1 (r1 starts at
   0).  rotcr parks T in bit 31 of r2, and the cmp/gt against r3 (always 0)
   in the bt/s delay slot reads it back, so T survives the loop-exit test.
   The bit injected by sett drops out of r2 on the 33rd rotcr and ends the
   loop.  r2 and r3 are saved on entry and restored on exit, r2 in the rts
   delay slot.

   NOTE(review): the numbered listing this text was recovered from skips
   one line between rotcr and bt/s; its content is unknown (possibly an
   empty statement) -- check the upstream file before enabling this code.  */
GLOBAL(udivsi3_i4i):
	mov.l	r2,@-r15
	mov	#0,r1
	div0u
	mov	r1,r2
	mov.l	r3,@-r15
	mov	r1,r3
	sett
	mov	r4,r0
LOCAL(loop):
	rotcr	r2
	bt/s	LOCAL(end)
	 cmp/gt	r2,r3
	rotcl	r0
	bra	LOCAL(loop)
	 div1	r5,r1
LOCAL(end):
	rotcl	r0
	mov.l	@r15+,r3
	rts
	 mov.l	@r15+,r2
#endif /* 0 */
/* Size: 186 bytes jointly for udivsi3_i4i and sdivsi3_i4i
   sh4-200 run times:
   udiv small divisor: 55 cycles
   udiv large divisor: 52 cycles
   sdiv small divisor, positive result: 59 cycles
   sdiv large divisor, positive result: 56 cycles
   sdiv small divisor, negative result: 65 cycles (*)
   sdiv large divisor, negative result: 62 cycles (*)
   (*): r2 is restored in the rts delay slot and has a lingering latency
        of two more cycles.

   udivsi3_i4i, integer-only variant: r0 = r4 / r5 (unsigned), built from
   div1 steps.

   - The return address is copied from pr to r1 on entry because the bsr
     calls to the local helpers overwrite pr; the routine leaves through
     jmp @r1.
   - r4 and r5 are pushed on entry and popped again before returning.
   - A divisor that fits in 16 bits takes the small-divisor path: r5 is
     shifted left by 16 and two runs of 16 div1 steps are performed, with
     xtrct/swap.w shuffling the halves of r0 and r4 in between.
   - Any other divisor takes the large-divisor path: 16 div1 steps, each
     paired with a rotcl of r0 (four passes of one inline pair plus three
     pairs in divx3).
   - sdiv_small_divisor and sdiv_large_divisor are also entered from the
     signed routine in this file, which has by then pushed r4 and r5 and
     loaded r1 itself.

   Instructions written with one extra leading space sit in a branch
   delay slot and execute before the branch takes effect.  */
	.balign 4
	.global	GLOBAL(udivsi3_i4i)
	FUNC(GLOBAL(udivsi3_i4i))
	FUNC(GLOBAL(sdivsi3_i4i))
GLOBAL(udivsi3_i4i):
	sts	pr,r1			! keep return address; bsr overwrites pr
	mov.l	r4,@-r15		! save dividend
	extu.w	r5,r0
	cmp/eq	r5,r0			! T = divisor fits in 16 bits
	swap.w	r4,r0			! r0 = dividend with halves swapped
	shlr16	r4			! r4 = upper half of dividend
	bf/s	LOCAL(large_divisor)
	 div0u				! delay slot: start an unsigned division
	mov.l	r5,@-r15		! save divisor
	shll16	r5
LOCAL(sdiv_small_divisor):
	/* First run of 16 div1 steps: (1 + 1 + 6) twice.  */
	div1	r5,r4
	bsr	LOCAL(div6)
	 div1	r5,r4
	div1	r5,r4
	bsr	LOCAL(div6)
	 div1	r5,r4
	xtrct	r4,r0
	xtrct	r0,r4
	/* Second run of 16 div1 steps: 7 + 1 + 1 + 7.  */
	bsr	LOCAL(div7)
	 swap.w	r4,r4
	div1	r5,r4
	bsr	LOCAL(div7)
	 div1	r5,r4
	xtrct	r4,r0
	mov.l	@r15+,r5		! restore divisor
	swap.w	r0,r0
	mov.l	@r15+,r4		! restore dividend
	jmp	@r1			! return through the saved address
	 rotcl	r0			! delay slot: rotate T into the result

/* Helpers reached with bsr: seven (div7) or six (div6) div1 steps; the
   last step sits in the rts delay slot.  */
LOCAL(div7):
	div1	r5,r4
LOCAL(div6):
	div1	r5,r4
	div1	r5,r4
	div1	r5,r4
	div1	r5,r4
	div1	r5,r4
	rts
	 div1	r5,r4

/* Helper reached with bsr: three rotcl/div1 pairs; the last div1 sits in
   the rts delay slot.  */
LOCAL(divx3):
	rotcl	r0
	div1	r5,r4
	rotcl	r0
	div1	r5,r4
	rotcl	r0
	rts
	 div1	r5,r4

LOCAL(large_divisor):
	mov.l	r5,@-r15		! save divisor
LOCAL(sdiv_large_divisor):
	xor	r4,r0
	.rept 4
	rotcl	r0
	bsr	LOCAL(divx3)
	 div1	r5,r4
	.endr
	mov.l	@r15+,r5		! restore divisor
	mov.l	@r15+,r4		! restore dividend
	jmp	@r1			! return through the saved address
	 rotcl	r0			! delay slot: rotate T into the result
	ENDFUNC(GLOBAL(udivsi3_i4i))
/* sdivsi3_i4i, integer-only variant: r0 = r4 / r5 (signed).

   The original r4 and r5 are pushed, both operands are replaced by their
   absolute values, and control joins the unsigned divide at
   sdiv_small_divisor or sdiv_large_divisor.  That shared code pops r5 and
   r4 again and leaves through jmp @r1.

   - Operands of equal sign (pos_result): r1 = pr, so the shared code
     returns straight to the caller.
   - Operands of opposite sign (neg_result): r1 = address of
     negate_result, the real return address goes to r2, and the caller's
     r2 is parked in macl.  negate_result negates r0, jumps through r2 and
     restores r2 from macl in the delay slot; macl is overwritten on this
     path.

   The cmp/eq in each bt/s delay slot leaves T = "divisor magnitude fits in
   16 bits"; nothing up to the bf/s at sdiv_check_divisor changes T.
   Instructions written with one extra leading space sit in a branch
   delay slot.

   NOTE(review): the numbered listing this text was recovered from skips
   one line right after the mova in neg_result; its content is unknown
   (possibly an empty statement) -- confirm against the upstream file.  */
	.global	GLOBAL(sdivsi3_i4i)
GLOBAL(sdivsi3_i4i):
	mov.l	r4,@-r15		! save dividend
	cmp/pz	r5			! T = divisor >= 0
	mov.l	r5,@-r15		! save divisor
	bt/s	LOCAL(pos_divisor)
	 cmp/pz	r4			! delay slot: T = dividend >= 0
	neg	r5,r5			! divisor was negative: use its magnitude
	extu.w	r5,r0
	bt/s	LOCAL(neg_result)	! dividend >= 0, divisor < 0
	 cmp/eq	r5,r0			! delay slot: T = divisor fits in 16 bits
	neg	r4,r4			! both negative: use dividend magnitude
LOCAL(pos_result):
	swap.w	r4,r0
	bra	LOCAL(sdiv_check_divisor)
	 sts	pr,r1			! delay slot: return straight to caller
LOCAL(pos_divisor):
	extu.w	r5,r0
	bt/s	LOCAL(pos_result)	! both non-negative
	 cmp/eq	r5,r0			! delay slot: T = divisor fits in 16 bits
	neg	r4,r4			! dividend < 0, divisor >= 0
LOCAL(neg_result):
	mova	LOCAL(negate_result),r0
	mov	r0,r1			! shared code will jump to negate_result
	swap.w	r4,r0
	lds	r2,macl			! park caller r2 in macl
	sts	pr,r2			! r2 = real return address
LOCAL(sdiv_check_divisor):
	shlr16	r4
	bf/s	LOCAL(sdiv_large_divisor)
	 div0u				! delay slot: start an unsigned division
	bra	LOCAL(sdiv_small_divisor)
	 shll16	r5			! delay slot
	.balign 4
LOCAL(negate_result):
	neg	r0,r0
	jmp	@r2
	 sts	macl,r2			! delay slot: restore caller r2
	ENDFUNC(GLOBAL(sdivsi3_i4i))
267 #endif /* !__SH_FPU_DOUBLE__ */
268 #endif /* L_udivsi3_i4i */
270 #ifdef L_sdivsi3_i4i
271 #if defined (__SH_FPU_DOUBLE__) || defined (__SH4_SINGLE_ONLY__)
/* 48 bytes, 45 cycles on sh4-200 */
!! args in r4 and r5, result in r0, clobber r1
/* sdivsi3_i4i, FPU variant: r0 = r4 / r5 (signed), computed with one
   double-precision fdiv.

   Both operands are moved through fpul and converted with float, divided,
   and the quotient is truncated back to an integer with ftrc.

   fpscr is saved on the stack and fpul in r1; dr0 and dr2 are saved on the
   stack, either as whole doubles (FMOVD_WORKS) or as single-precision
   halves.  Everything is restored before returning, fpul in the rts delay
   slot.  The word at L1 is the fpscr value used while dividing: 0x180000
   when FMOVD_WORKS is defined, 0x80000 otherwise (per the SH-4 manual
   bit 19 is PR and bit 20 is SZ -- confirm).  DR00/DR01/DR20/DR21 are not
   defined in this file; they are presumably supplied by lib1funcs.h.  */

	.global	GLOBAL(sdivsi3_i4i)
	FUNC(GLOBAL(sdivsi3_i4i))
GLOBAL(sdivsi3_i4i):
	sts.l	fpscr,@-r15		! save caller fpscr
	sts	fpul,r1			! keep caller fpul in r1
	mova	L1,r0			! r0 -> fpscr word at L1
	lds.l	@r0+,fpscr		! switch fpscr for the divide
	lds	r4,fpul
#ifdef FMOVD_WORKS
	fmov.d	dr0,@-r15
	float	fpul,dr0		! dr0 = dividend
	lds	r5,fpul
	fmov.d	dr2,@-r15
#else
	fmov.s	DR01,@-r15
	fmov.s	DR00,@-r15
	float	fpul,dr0		! dr0 = dividend
	lds	r5,fpul
	fmov.s	DR21,@-r15
	fmov.s	DR20,@-r15
#endif
	float	fpul,dr2		! dr2 = divisor
	fdiv	dr2,dr0
#ifdef FMOVD_WORKS
	fmov.d	@r15+,dr2
#else
	fmov.s	@r15+,DR20
	fmov.s	@r15+,DR21
#endif
	ftrc	dr0,fpul		! truncate quotient to integer
#ifdef FMOVD_WORKS
	fmov.d	@r15+,dr0
#else
	fmov.s	@r15+,DR00
	fmov.s	@r15+,DR01
#endif
	lds.l	@r15+,fpscr		! restore caller fpscr
	sts	fpul,r0			! result
	rts
	 lds	r1,fpul			! delay slot: restore caller fpul

	.p2align 2
L1:
#ifndef FMOVD_WORKS
	.long	0x80000
#else
	.long	0x180000
#endif

	ENDFUNC(GLOBAL(sdivsi3_i4i))
325 #endif /* __SH_FPU_DOUBLE__ */
326 #endif /* L_sdivsi3_i4i */
327 #endif /* !__SHMEDIA__ */