1 ; libgcc1 routines for Synopsys DesignWare ARC cpu.
3 /* Copyright (C) 1995, 1997, 2007-2013 Free Software Foundation, Inc.
4 Contributor: Joern Rennecke <joern.rennecke@embecosm.com>
5 on behalf of Synopsys Inc.
7 This file is part of GCC.
9 GCC is free software; you can redistribute it and/or modify it under
10 the terms of the GNU General Public License as published by the Free
11 Software Foundation; either version 3, or (at your option) any later
14 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
15 WARRANTY; without even the implied warranty of MERCHANTABILITY or
16 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
19 Under Section 7 of GPL version 3, you are granted additional
20 permissions described in the GCC Runtime Library Exception, version
21 3.1, as published by the Free Software Foundation.
23 You should have received a copy of the GNU General Public License and
24 a copy of the GCC Runtime Library Exception along with this program;
25 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
26 <http://www.gnu.org/licenses/>. */
28 /* As a special exception, if you link this library with other files,
29 some of which are compiled with GCC, to produce an executable,
30 this library does not by itself cause the resulting executable
31 to be covered by the GNU General Public License.
32 This exception does not however invalidate any other reasons why
33 the executable file might be covered by the GNU General Public License. */
36 /* ANSI concatenation macros.  CONCAT1 goes through CONCAT2 so that its
   arguments are macro-expanded before "##" pastes them together. */
38 #define CONCAT1(a, b) CONCAT2(a, b)
39 #define CONCAT2(a, b) a ## b
41 /* Use the right prefix for global labels: SYM(x) is x with
   __USER_LABEL_PREFIX__ pasted in front of it. */
43 #define SYM(x) CONCAT1 (__USER_LABEL_PREFIX__, x)
45 #ifndef WORKING_ASSEMBLER
/* ELF symbol bookkeeping for the routines in this file.
   FUNC(X)        - give SYM(X) the type @function.
   HIDDEN_FUNC(X) - FUNC(X), and additionally mark the symbol .hidden.
   ENDFUNC(X)     - emit the local end label .Lfe_X and set the size of
                    the symbol to the distance from its start to that label.
                    It forwards to ENDFUNC0 so that X is macro-expanded
                    before the "##" paste.
   Every directive names the symbol through SYM(), so it keeps referring to
   the label that is actually defined (SYM(X):) whatever
   __USER_LABEL_PREFIX__ expands to.
   NOTE(review): the backquote in HIDDEN_FUNC is presumably the statement
   separator of the assembler - confirm against the ARC gas documentation. */
51 #define FUNC(X) .type SYM(X),@function
52 #define HIDDEN_FUNC(X) FUNC(X)` .hidden SYM(X)
53 #define ENDFUNC0(X) .Lfe_##X: .size SYM(X),.Lfe_##X-SYM(X)
54 #define ENDFUNC(X) ENDFUNC0(X)
65 /* This is the simple version.
76 #if defined (__ARC_MUL64__)
82 #elif defined (__ARC700__)
88 #elif defined (__ARC_NORM__)
95 lpnz @.Lend ; loop is aligned
101 #elif !defined (__OPTIMIZE_SIZE__) && !defined(__ARC601__)
102 /* Up to 3.5 times faster than the simpler code below, but larger. */
122 #elif !defined (__OPTIMIZE_SIZE__) /* __ARC601__ */
140 /********************************************************/
142 mov_s r2,0 ; Accumulate result here.
145 add_s r2,r2,r1 ; r += b
147 lsr_s r0,r0 ; a >>= 1
148 asl_s r1,r1 ; b <<= 1
154 /********************************************************/
157 #endif /* L_mulsi3 */
163 .global SYM(__umulsidi3)
165 HIDDEN_FUNC(__umulsidi3)
166 /* We need ARC700 / ARC_MUL64 definitions of __umulsidi3 / __umulsi3_highpart
167 in case some code has been compiled without multiply support enabled,
168 but linked with the multiply-support enabled libraries.
169 For ARC601 (i.e. without a barrel shifter), we also use umulsidi3 as our
170 umulsi3_highpart implementation; the use of the latter label doesn't
171 actually benefit ARC601 platforms, but is useful when ARC601 code is linked
172 against other libraries. */
173 #if defined (__ARC700__) || defined (__ARC_MUL64__) || defined (__ARC601__)
174 .global SYM(__umulsi3_highpart)
175 SYM(__umulsi3_highpart):
176 HIDDEN_FUNC(__umulsi3_highpart)
179 /* This is the simple version.
189 #include "ieee-754/arc-ieee-754.h"
195 mpyhu DBL0H,r12,DBL0H
196 #elif defined (__ARC_MUL64__)
197 /* Likewise for __ARC_MUL64__ */
202 #else /* !__ARC700__ && !__ARC_MUL64__ */
203 /* Although it might look tempting to extend this to handle muldi3,
204 using mulsi3 twice with 2.25 cycles per 32 bit add is faster
205 than one loop with three or four cycles per 32 bit add. */
206 asl.f r12,0 ; Top part of b.
207 mov_s r2,0 ; Accumulate result here.
212 breq r0,0,@.Ldone ; while (a)
214 asl.f r1,r1 ; b <<= 1
215 bbit0.d r0,1,@.Llooptst
219 add.f r3,r3,r1 ; r += b
220 brne.d r0,0,@.Lloop ; while (a);
226 #endif /* !__ARC700__ && !__ARC_MUL64__ */
228 #if defined (__ARC700__) || defined (__ARC_MUL64__) || defined (__ARC601__)
229 ENDFUNC(__umulsi3_highpart)
231 #endif /* L_umulsidi3 */
233 #ifdef L_umulsi3_highpart
234 #include "ieee-754/arc-ieee-754.h"
235 /* For use without a barrel shifter, and for ARC700 / ARC_MUL64, the
236 mulsidi3 algorithms above look better, so for these, there is an
237 extra label up there. */
238 #if !defined (__ARC700__) && !defined (__ARC_MUL64__) && !defined (__ARC601__)
239 .global SYM(__umulsi3_highpart)
240 SYM(__umulsi3_highpart):
241 HIDDEN_FUNC(__umulsi3_highpart)
251 /* Make the result register peephole-compatible with mulsidi3. */
253 ENDFUNC(__umulsi3_highpart)
254 #endif /* !__ARC700__ && !__ARC_MUL64__ && !__ARC601__ */
255 #endif /* L_umulsi3_highpart */
257 #ifdef L_divmod_tools
259 ; Utilities used by all routines.
265 udivmodsi4(int modwanted, unsigned long num, unsigned long den)
267 unsigned long bit = 1;
268 unsigned long res = 0;
270 while (den < num && bit && !(den & (1L<<31)))
285 if (modwanted) return num;
290 ; inputs: r0 = numerator, r1 = denominator
291 ; outputs: r0 = quotient, r1 = remainder, r2/r3 trashed
294 .global SYM(__udivmodsi4)
298 #if defined (__ARC700__)
299 /* Normalize divisor and dividend, and then use the appropriate number of
300 divaw (the number of result bits, or one more) to produce the result.
301 There are some special conditions that need to be tested:
302 - We can only directly normalize unsigned numbers that fit in 31 bits. For
303 the divisor, we test early on that it is not 'negative'.
304 - divaw can't correctly process a dividend that is larger than the divisor.
305 We handle this by checking that the dividend prior to normalization is
306 not larger than the normalized divisor. As we already know by then
307 that the divisor fits in 31 bits, this check also makes sure that the
309 - ordinary normalization of the dividend could make it larger than the
310 normalized divisor, which again would be unsuitable for divaw.
311 Thus, we want to shift left the dividend by one less, except that we
312 want to leave it alone if it is already 31 bits. To this end, we
313 double the input to norm with adds.
314 - If the dividend has fewer bits than the divisor, that would leave us
315 with a negative number of divaw to execute. Although we could use a
316 conditional loop to avoid excess divaw, and then the quotient could
317 be extracted correctly as there'd be more than enough zero bits, the
318 remainder would be shifted left too far, requiring a conditional shift
319 right. The cost of that shift and the possible mispredict on the
320 conditional loop cost as much as putting in an early check for a zero
323 brne.d r3,r0,.Large_dividend
329 asl_l r0,r0,r3 ; not short to keep loop aligned
333 .Ldiv_end:sub_s r3,r2,1
352 .Ldiv_end2:asl r0,r3,r2
370 #elif !defined (__OPTIMIZE_SIZE__)
373 brhs.d r1,r2,.Lret0_3
384 #else /* ! __ARC_NORM__ */
386 brhs.d r1,r2,.Lret0_3
389 asl_s r1,r1 ; den <<= 1
390 brls.d r1,r2,@.Lloop1
391 sub lp_count,lp_count,1
397 #endif /* !__ARC_NORM__ */
429 #if 0 /* Slightly shorter, but slower. */
431 brhi.d r1,r0,.Loop3_end
436 rsub r0,lp_count,32-1
448 #else /* Arctangent-A5 */
449 breq_s r1,0,@.Ldivmodend
455 asl_s r1,r1 ; den <<= 1
457 asl_s r2,r2 ; bit <<= 1
459 brlo r0,r1,@.Lshiftdown
460 sub_s r0,r0,r1 ; num -= den
461 or_s r3,r3,r2 ; res |= bit
463 lsr_s r2,r2 ; bit >>= 1
464 lsr_s r1,r1 ; den >>= 1
467 mov_s r1,r0 ; r1 = mod
469 mov_s r0,r3 ; r0 = res
470 /******************************************************/
472 ENDFUNC(__udivmodsi4)
480 .global SYM(__udivsi3)
485 #if 0 /* interferes with linux loader */
486 .section .__arc_profile_forward, "a"
488 .long SYM(__udivmodsi4)
492 #endif /* L_udivsi3 */
498 .global SYM(__divsi3)
507 bl.d @SYM(__udivmodsi4)
512 #else /* !ifndef __ARC700__ */
513 ;; We can use the abs, norm, divaw and mpy instructions for ARC700
516 /* This table has been generated by divtab-arc700.c. */
517 /* 1/512 .. 1/256, normalized. There is a leading 1 in bit 31.
518 For powers of two, we list unnormalized numbers instead. The values
519 for powers of 2 are loaded, but not used. The value for 1 is actually
520 the first instruction after .Lmuldiv. */
787 ; write port allocation stall
809 .Ldivstart:divaw r12,r12,r2
810 .Ldivend:xor_s r1,r1,r0
820 sub1.f 0,r12,r2 ; special case: -2**(n+1) / 2**n
827 ; Need to handle special cases involving negative powers of two:
828 ; r12,r2 are normalized dividend / divisor;
829 ; divide anything by 0x80000000, or divide 0x80000000 by 0x40000000
841 /* This version requires that divaw works with a divisor of 0x80000000U */
854 .Ldivstart:divaw r12,r12,r2
855 .Ldivend:xor_s r1,r1,r0
871 #endif /* ifndef __ARC700__ */
875 #endif /* L_divsi3 */
881 .global SYM(__umodsi3)
885 bl.nd @SYM(__udivmodsi4)
889 #if 0 /* interferes with linux loader */
890 .section .__arc_profile_forward, "a"
892 .long SYM(__udivmodsi4)
896 #endif /* L_umodsi3 */
902 .global SYM (__modsi3)
910 bl.d @SYM(__udivmodsi4)
916 #else /* __ARC700__ */
930 .Ldivstart:divaw r12,r12,r2
936 .Lonebit:neg.pl r5,r5
940 #endif /* __ARC700__ */
943 #endif /* L_modsi3 */
948 .global SYM (__clzsi2)
951 HIDDEN_FUNC(__clzsi2)
957 #elif defined (__ARC601__)
967 sub2 r0,lp_count,lp_count
995 #endif /* L_clzsi2 */
999 ;;; MILLICODE THUNK LIB ;***************
1001 ;;; .macro push_regs from, to, offset
1002 ;;; st_s "\from", [sp, \offset]
1004 ;;; push_regs "(\from+1)", \to, "(\offset+4)"
1007 ;;; push_regs 13, 18, 0
1010 ;;;; .macro sum from, to, three
1014 ;;;; .set regno, \from+1
1016 ;;;; .set shift, shift - 1
1017 ;;;; # st_s %shift @3 lsl #shift
1019 ;;;; sum "(\from+1)", \to, "(\three)"
1026 ;; .macro push_regs from=0, to=3, offset
1027 ;; st_s r\from, [sp, \offset]
1029 ;; push_regs "\from+1 ",\to,"(\offset+4)"
1033 ;; .macro expand_to_push from=13, to
1039 ;; ; push_regs \from, \to, 0
1043 ;; expand_to_push 13,18
1047 #ifdef L_millicodethunk_st
1050 .global SYM(__st_r13_to_r15)
1051 .global SYM(__st_r13_to_r16)
1052 .global SYM(__st_r13_to_r17)
1053 .global SYM(__st_r13_to_r18)
1054 .global SYM(__st_r13_to_r19)
1055 .global SYM(__st_r13_to_r20)
1056 .global SYM(__st_r13_to_r21)
1057 .global SYM(__st_r13_to_r22)
1058 .global SYM(__st_r13_to_r23)
1059 .global SYM(__st_r13_to_r24)
1060 .global SYM(__st_r13_to_r25)
1061 HIDDEN_FUNC(__st_r13_to_r15)
1062 HIDDEN_FUNC(__st_r13_to_r16)
1063 HIDDEN_FUNC(__st_r13_to_r17)
1064 HIDDEN_FUNC(__st_r13_to_r18)
1065 HIDDEN_FUNC(__st_r13_to_r19)
1066 HIDDEN_FUNC(__st_r13_to_r20)
1067 HIDDEN_FUNC(__st_r13_to_r21)
1068 HIDDEN_FUNC(__st_r13_to_r22)
1069 HIDDEN_FUNC(__st_r13_to_r23)
1070 HIDDEN_FUNC(__st_r13_to_r24)
1071 HIDDEN_FUNC(__st_r13_to_r25)
1073 SYM(__st_r13_to_r25):
1075 SYM(__st_r13_to_r24):
1077 SYM(__st_r13_to_r23):
1079 SYM(__st_r13_to_r22):
1081 SYM(__st_r13_to_r21):
1083 SYM(__st_r13_to_r20):
1085 SYM(__st_r13_to_r19):
1087 SYM(__st_r13_to_r18):
1089 SYM(__st_r13_to_r17):
1091 SYM(__st_r13_to_r16):
1093 SYM(__st_r13_to_r15):
1095 st r15, [sp,8] ; minimum function size to avoid stall: 6 bytes.
1102 ENDFUNC(__st_r13_to_r15)
1103 ENDFUNC(__st_r13_to_r16)
1104 ENDFUNC(__st_r13_to_r17)
1105 ENDFUNC(__st_r13_to_r18)
1106 ENDFUNC(__st_r13_to_r19)
1107 ENDFUNC(__st_r13_to_r20)
1108 ENDFUNC(__st_r13_to_r21)
1109 ENDFUNC(__st_r13_to_r22)
1110 ENDFUNC(__st_r13_to_r23)
1111 ENDFUNC(__st_r13_to_r24)
1112 ENDFUNC(__st_r13_to_r25)
1113 #endif /* L_millicodethunk_st */
1116 #ifdef L_millicodethunk_ld
1119 ; ==================================
1122 .global SYM(__ld_r13_to_r15)
1123 .global SYM(__ld_r13_to_r16)
1124 .global SYM(__ld_r13_to_r17)
1125 .global SYM(__ld_r13_to_r18)
1126 .global SYM(__ld_r13_to_r19)
1127 .global SYM(__ld_r13_to_r20)
1128 .global SYM(__ld_r13_to_r21)
1129 .global SYM(__ld_r13_to_r22)
1130 .global SYM(__ld_r13_to_r23)
1131 .global SYM(__ld_r13_to_r24)
1132 .global SYM(__ld_r13_to_r25)
1133 HIDDEN_FUNC(__ld_r13_to_r15)
1134 HIDDEN_FUNC(__ld_r13_to_r16)
1135 HIDDEN_FUNC(__ld_r13_to_r17)
1136 HIDDEN_FUNC(__ld_r13_to_r18)
1137 HIDDEN_FUNC(__ld_r13_to_r19)
1138 HIDDEN_FUNC(__ld_r13_to_r20)
1139 HIDDEN_FUNC(__ld_r13_to_r21)
1140 HIDDEN_FUNC(__ld_r13_to_r22)
1141 HIDDEN_FUNC(__ld_r13_to_r23)
1142 HIDDEN_FUNC(__ld_r13_to_r24)
1143 HIDDEN_FUNC(__ld_r13_to_r25)
1144 SYM(__ld_r13_to_r25):
1146 SYM(__ld_r13_to_r24):
1148 SYM(__ld_r13_to_r23):
1150 SYM(__ld_r13_to_r22):
1152 SYM(__ld_r13_to_r21):
1154 SYM(__ld_r13_to_r20):
1156 SYM(__ld_r13_to_r19):
1158 SYM(__ld_r13_to_r18):
1160 SYM(__ld_r13_to_r17):
1162 SYM(__ld_r13_to_r16):
1164 SYM(__ld_r13_to_r15):
1166 ld r15, [sp,8] ; minimum function size to avoid stall: 6 bytes.
1173 ENDFUNC(__ld_r13_to_r15)
1174 ENDFUNC(__ld_r13_to_r16)
1175 ENDFUNC(__ld_r13_to_r17)
1176 ENDFUNC(__ld_r13_to_r18)
1177 ENDFUNC(__ld_r13_to_r19)
1178 ENDFUNC(__ld_r13_to_r20)
1179 ENDFUNC(__ld_r13_to_r21)
1180 ENDFUNC(__ld_r13_to_r22)
1181 ENDFUNC(__ld_r13_to_r23)
1182 ENDFUNC(__ld_r13_to_r24)
1183 ENDFUNC(__ld_r13_to_r25)
1185 #endif /* L_millicodethunk_ld */
1186 #ifdef L_millicodethunk_ret
1187 .global SYM(__ld_r13_to_r14_ret)
1188 .global SYM(__ld_r13_to_r15_ret)
1189 .global SYM(__ld_r13_to_r16_ret)
1190 .global SYM(__ld_r13_to_r17_ret)
1191 .global SYM(__ld_r13_to_r18_ret)
1192 .global SYM(__ld_r13_to_r19_ret)
1193 .global SYM(__ld_r13_to_r20_ret)
1194 .global SYM(__ld_r13_to_r21_ret)
1195 .global SYM(__ld_r13_to_r22_ret)
1196 .global SYM(__ld_r13_to_r23_ret)
1197 .global SYM(__ld_r13_to_r24_ret)
1198 .global SYM(__ld_r13_to_r25_ret)
1199 HIDDEN_FUNC(__ld_r13_to_r14_ret)
1200 HIDDEN_FUNC(__ld_r13_to_r15_ret)
1201 HIDDEN_FUNC(__ld_r13_to_r16_ret)
1202 HIDDEN_FUNC(__ld_r13_to_r17_ret)
1203 HIDDEN_FUNC(__ld_r13_to_r18_ret)
1204 HIDDEN_FUNC(__ld_r13_to_r19_ret)
1205 HIDDEN_FUNC(__ld_r13_to_r20_ret)
1206 HIDDEN_FUNC(__ld_r13_to_r21_ret)
1207 HIDDEN_FUNC(__ld_r13_to_r22_ret)
1208 HIDDEN_FUNC(__ld_r13_to_r23_ret)
1209 HIDDEN_FUNC(__ld_r13_to_r24_ret)
1210 HIDDEN_FUNC(__ld_r13_to_r25_ret)
1213 SYM(__ld_r13_to_r25_ret):
1215 SYM(__ld_r13_to_r24_ret):
1217 SYM(__ld_r13_to_r23_ret):
1219 SYM(__ld_r13_to_r22_ret):
1221 SYM(__ld_r13_to_r21_ret):
1223 SYM(__ld_r13_to_r20_ret):
1225 SYM(__ld_r13_to_r19_ret):
1227 SYM(__ld_r13_to_r18_ret):
1229 SYM(__ld_r13_to_r17_ret):
1231 SYM(__ld_r13_to_r16_ret):
1233 SYM(__ld_r13_to_r15_ret):
1235 SYM(__ld_r13_to_r14_ret):
1241 ENDFUNC(__ld_r13_to_r14_ret)
1242 ENDFUNC(__ld_r13_to_r15_ret)
1243 ENDFUNC(__ld_r13_to_r16_ret)
1244 ENDFUNC(__ld_r13_to_r17_ret)
1245 ENDFUNC(__ld_r13_to_r18_ret)
1246 ENDFUNC(__ld_r13_to_r19_ret)
1247 ENDFUNC(__ld_r13_to_r20_ret)
1248 ENDFUNC(__ld_r13_to_r21_ret)
1249 ENDFUNC(__ld_r13_to_r22_ret)
1250 ENDFUNC(__ld_r13_to_r23_ret)
1251 ENDFUNC(__ld_r13_to_r24_ret)
1252 ENDFUNC(__ld_r13_to_r25_ret)
1254 #endif /* L_millicodethunk_ret */
1258 #include "ieee-754/adddf3.S"
1264 #include "ieee-754/muldf3.S"
1265 #elif defined (__ARC_NORM__) && defined(__ARC_MUL64__)
1266 #include "ieee-754/arc600-mul64/muldf3.S"
1267 #elif defined (__ARC_NORM__) && defined(__ARC_MUL32BY16__)
1268 #include "ieee-754/arc600-dsp/muldf3.S"
1274 #include "ieee-754/addsf3.S"
1280 #include "ieee-754/mulsf3.S"
1281 #elif defined (__ARC_NORM__) && defined(__ARC_MUL64__)
1282 #include "ieee-754/arc600-mul64/mulsf3.S"
1283 #elif defined (__ARC_NORM__) && defined(__ARC_MUL32BY16__)
1284 #include "ieee-754/arc600-dsp/mulsf3.S"
1285 #elif defined (__ARC_NORM__)
1286 #include "ieee-754/arc600/mulsf3.S"
1292 #include "ieee-754/divdf3.S"
1293 #elif defined (__ARC_NORM__) && defined(__ARC_MUL64__)
1294 #include "ieee-754/arc600-mul64/divdf3.S"
1295 #elif defined (__ARC_NORM__) && defined(__ARC_MUL32BY16__)
1296 #include "ieee-754/arc600-dsp/divdf3.S"
1302 #include "ieee-754/divsf3-stdmul.S"
1303 #elif defined (__ARC_NORM__) && defined(__ARC_MUL64__)
1304 #include "ieee-754/arc600-mul64/divsf3.S"
1305 #elif defined (__ARC_NORM__) && defined(__ARC_MUL32BY16__)
1306 #include "ieee-754/arc600-dsp/divsf3.S"
1307 #elif defined (__ARC_NORM__)
1308 #include "ieee-754/arc600/divsf3.S"
1312 #ifdef L_extendsfdf2
1314 #include "ieee-754/extendsfdf2.S"
1320 #include "ieee-754/truncdfsf2.S"
1326 #include "ieee-754/floatsidf.S"
1332 #include "ieee-754/floatsisf.S"
1336 #ifdef L_floatunsidf
1338 #include "ieee-754/floatunsidf.S"
1344 #include "ieee-754/fixdfsi.S"
1350 #include "ieee-754/fixsfsi.S"
1356 #include "ieee-754/fixunsdfsi.S"
1362 #include "ieee-754/eqdf2.S"
1368 #include "ieee-754/eqsf2.S"
1374 #include "ieee-754/gtdf2.S"
1380 #include "ieee-754/gtsf2.S"
1386 #include "ieee-754/gedf2.S"
1392 #include "ieee-754/gesf2.S"
1398 #include "ieee-754/uneqdf2.S"
1404 #include "ieee-754/uneqsf2.S"
1410 #include "ieee-754/orddf2.S"
1416 #include "ieee-754/ordsf2.S"