1 /* libgcc routines for the Texas Instruments TMS320C[34]x
2 Copyright (C) 1997,98, 1999 Free Software Foundation, Inc.
4 Contributed by Michael Hayes (m.hayes@elec.canterbury.ac.nz)
5 and Herman Ten Brugge (Haj.Ten.Brugge@net.HCC.nl).
8 This file is part of GCC.
10 GCC is free software; you can redistribute it and/or modify it
11 under the terms of the GNU General Public License as published by the
12 Free Software Foundation; either version 2, or (at your option) any
15 In addition to the permissions in the GNU General Public License, the
16 Free Software Foundation gives you unlimited permission to link the
17 compiled version of this file into combinations with other programs,
18 and to distribute those combinations without any restriction coming
19 from the use of this file. (The General Public License restrictions
20 do apply in other respects; for example, they cover modification of
21 the file, and distribution when not linked into a combine
24 This file is distributed in the hope that it will be useful, but
25 WITHOUT ANY WARRANTY; without even the implied warranty of
26 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
27 General Public License for more details.
29 You should have received a copy of the GNU General Public License
30 along with this program; see the file COPYING. If not, write to
31 the Free Software Foundation, 59 Temple Place - Suite 330,
32 Boston, MA 02111-1307, USA. */
34 ; These routines are called using the standard TI register argument
36 ; The following registers do not have to be saved:
37 ; r0, r1, r2, r3, ar0, ar1, ar2, ir0, ir1, bk, rs, rc, re, (r9, r10, r11)
39 ; Perform floating point divqf3
41 ; This routine performs a reciprocal of the divisor using the method
42 ; described in the C30/C40 user manuals. It then multiplies that
43 ; result by the dividend.
45 ; Let r be the reciprocal of the divisor v and let the ith estimate
46 ; of r be denoted by r[i]. An iterative approach can be used to
47 ; improve the estimate of r, given an initial estimate r[0], where
49 ; r[i + 1] = r[i] * (2.0 - v * r[i])
51 ; The normalized error e[i] at the ith iteration is
53 ; e[i] = (r - r[i]) / r = (1 / v - r[i]) * v = (1 - v * r[i])
57 ; e[i + 1] = (1 - v * r[i + 1]) = 1 - 2 * v * r[i] + v^2 + (r[i])^2
58 ; = (1 - v * r[i])^2 = (e[i])^2
60 ; r2 dividend, r3 divisor, r0 quotient
73 pop ar1 ; Pop return address
75 ; r0 = estimate of r, r1 = tmp, r2 = dividend, r3 = divisor
76 rcpf r3, r0 ; Compute initial estimate r[0]
78 mpyf3 r0, r3, r1 ; r1 = r[0] * v
79 subrf 2.0, r1 ; r1 = 2.0 - r[0] * v
80 mpyf r1, r0 ; r0 = r[0] * (2.0 - r[0] * v) = r[1]
81 ; End of 1st iteration (16 bits accuracy)
83 mpyf3 r0, r3, r1 ; r1 = r[1] * v
84 subrf 2.0, r1 ; r1 = 2.0 - r[1] * v
86 bud ar1 ; Delayed branch
87 mpyf r1, r0 ; r0 = r[1] * (2.0 - r[1] * v) = r[2]
88 ; End of 2nd iteration (32 bits accuracy)
90 mpyf *-ar0(1), r0 ; Multiply by the dividend
92 mpyf r2, r0 ; Multiply by the dividend
102 pop ar1 ; Pop return address
104 ; Initial estimate r[0] = 1.0 * 2^(-e - 1)
107 ; r0 = estimate of r, r1 = tmp, r2 = dividend, r3 = divisor
109 ; Calculate initial estimate r[0]
113 ; complement exponent = -e -1
114 ; complement sign (side effect)
115 ; complement mantissa (almost 3 bit accurate)
117 popf r0 ; r0 = 1.0 * e^(-e - 1) + inverted mantissa
118 ldf -1.0, r1 ; undo complement sign bit
121 mpyf3 r0, r3, r1 ; r1 = r[0] * v
122 subrf 2.0, r1 ; r1 = 2.0 - r[0] * v
123 mpyf r1, r0 ; r0 = r[0] * (2.0 - r[0] * v) = r[1]
124 ; End of 1st iteration
126 mpyf3 r0, r3, r1 ; r1 = r[1] * v
127 subrf 2.0, r1 ; r1 = 2.0 - r[1] * v
128 mpyf r1, r0 ; r0 = r[1] * (2.0 - r[1] * v) = r[2]
129 ; End of 2nd iteration
131 mpyf3 r0, r3, r1 ; r1 = r[2] * v
132 subrf 2.0, r1 ; r1 = 2.0 - r[2] * v
133 mpyf r1, r0 ; r0 = r[2] * (2.0 - r[2] * v) = r[3]
134 ; End of 3rd iteration
136 rnd r0 ; Minimize error in x[3]'s LSBs
138 ; Use modified last iteration
139 ; r[4] = (r[3] * (1.0 - (v * r[3]))) + r[3]
140 mpyf3 r0, r3, r1 ; r1 = r[3] * v
141 subrf 1.0, r1 ; r1 = 1.0 - r[3] * v
142 mpyf r0, r1 ; r1 = r[3] * (1.0 - r[3] * v)
143 addf r1, r0 ; r0 = r[3] * (1.0 - r[3] * v) + r[3] = r[4]
145 rnd r0 ; Minimize error in x[4]'s LSBs
147 bud ar1 ; Delayed branch
150 ldfu *-ar0(1), r2 ; Dividend in mem has only 24 bits significance
152 rnd r2 ; Minimize error in reg dividend's LSBs
153 ; since this may have 32 bit significance
156 mpyf r2, r0 ; Multiply by the dividend
157 rnd r0 ; Round result to 32 bits
164 ; Integer signed division
166 ; ar2 dividend, r2 divisor, r0 quotient
167 ; clobbers r1, r3, ar0, ar1, ir0, ir1, rc, rs, re
183 xor3 ar2, r2, r3 ; Get the sign
188 cmpi ar2, r2 ; Divisor > dividend?
191 bhid zero ; If so, return 0
194 ; Normalize oeprands. Use difference exponents as shift count
195 ; for divisor, and as repeat count for "subc"
197 float ar2, r1 ; Normalize dividend
198 pushf r1 ; Get as integer
200 lsh -24, ar0 ; Get exponent
202 float r2, r1 ; Normalize divisor
203 pushf r1 ; Get as integer
205 lsh -24, ir0 ; Get exponent
207 subi ir0, ar0 ; Get difference of exponents
208 lsh ar0, r2 ; Align divisor with dividend
211 ; Do count + 1 subtracts and shifts
217 ; Mask off the lower count+1 bits of ar2
219 subri 31, ar0 ; Shift count is (32 - (ar0 + 1))
220 lsh ar0, ar2 ; Shift left
222 lsh3 ar0, ar2, r0 ; Shift right and put result in r0
225 ; Check sign and negate result if necessary
227 bud ir1 ; Delayed return
228 negi r0, r1 ; Negate result
229 ash -31, r3 ; Check sign
230 ldinz r1, r0 ; If set, use negative result
233 zero: bud ir1 ; Delayed branch
239 ; special case where ar2 = abs(ar2) = 0x80000000. We handle this by
240 ; calling unsigned divide and negating the result if necessary.
248 negi r0, r1 ; Negate result
249 ash -31, r3 ; Check sign
250 ldinz r1, r0 ; If set, use negative result
255 ; ar2 dividend, r2 divisor, r0 quotient,
256 ; clobbers r1, r3, ar0, ar1, ir0, ir1, rc, rs, re
275 cmpi ar2, r2 ; If divisor > dividend
276 bhi qzero ; return zero
277 ldi r2, ar1 ; Store divisor in ar1
279 tstb ar2, ar2 ; Check top bit, jump if set to special handler
280 bld div_32 ; Delayed branch
283 ; Get divisor exponent
285 float ar1, r1 ; Normalize the divisor
286 pushf r1 ; Get into int register
290 bzd qzero ; if (float) divisor zero, return zero
292 float ar2, r1 ; Normalize the dividend
293 pushf r1 ; Get into int register
295 lsh -24, ar0 ; Get both the exponents
298 subi rc, ar0 ; Get the difference between the exponents
299 lsh ar0, ar1 ; Normalize the divisor with the dividend
302 ; Do count_1 subtracts and shifts
308 ; mask off the lower count+1 bits
310 subri 31, ar0 ; Shift count (31 - (ar0+1))
311 bud ir1 ; Delayed return
318 ; Handle a full 32-bit dividend
320 div_32: tstb ar1, ar1
321 bld qone ; if divisor high bit is one, the result is one
324 lsh rc, ar1 ; Line up the divisor
327 ; Now divisor and dividend are aligned. Do first SUBC by hand, save
328 ; of the forst quotient digit. Then, shift divisor right rather
329 ; than shifting dividend left. This leaves a zero in the top bit of
332 ldi 1, ar0 ; Initizialize MSB of quotient
333 lsh rc, ar0 ; create a mask for MSBs
334 subi 1, ar0 ; mask is (2 << count) - 1
345 ; do the rest of the shifts and subtracts
384 pop ir1 ; return address
385 cmpi ar2, r2 ; divisor > dividend ?
386 bhi uzero ; if so, return dividend
387 ldi r2, ar1 ; load divisor
389 ; If top bit of dividend is set, handle specially.
391 tstb ar2, ar2 ; check top bit
392 bld umod_32 ; get divisor exponent, then jump.
394 ; Get divisor exponent by converting to float.
396 float ar1, r1 ; normalize divisor
397 pushf r1 ; push as float
398 pop rc ; pop as int to get exponent
399 bzd uzero ; if (float)divisor was zero, return
401 ; 31 or less bits in dividend. Get dividend exponent.
403 float ar2, r1 ; normalize dividend
404 pushf r1 ; push as float
405 pop ar0 ; pop as int to get exponent
407 ; Use difference in exponents as shift count to line up MSBs.
409 lsh -24, rc ; divisor exponent
410 lsh -24, ar0 ; dividend exponent
411 subi rc, ar0 ; difference
412 lsh ar0, ar1 ; shift divisor up
414 ; Do COUNT+1 subtract & shifts.
419 ; Remainder is in upper 31-COUNT bits.
421 bud ir1 ; delayed branch to return
422 addi 1, ar0 ; shift count is COUNT+1
423 negi ar0, ar0 ; negate for right shift
424 lsh3 ar0, ar2, r0 ; shift to get result
428 ; The following code handles cases of a full 32-bit dividend. Before
429 ; SUBC can be used, the top bit must be cleared (otherwise SUBC can
430 ; possibly shift a significant 1 out the top of the dividend). This
431 ; is accomplished by first doing a normal subtraction, then proceeding
436 ; If the top bit of the divisor is set too, the remainder is simply
437 ; the difference between the dividend and divisor. Otherwise, shift
438 ; the divisor up to line up the MSBs.
440 tstb ar1, ar1 ; check divisor
441 bld uone ; if negative, remainder is diff
443 lsh -24, rc ; divisor exponent
444 subri 31, rc ; shift count = 31 - exp
445 negi rc, ar0 ; used later as shift count
446 lsh rc, ar1 ; shift up to line up MSBs
448 ; Now MSBs are aligned. Do first SUBC by hand using a plain subtraction.
449 ; Then, shift divisor right rather than shifting dividend left. This leaves
450 ; a 0 in the top bit of the dividend.
452 subi3 ar1, ar2, r1 ; subtract
453 ldihs r1, ar2 ; if positive, replace dividend
454 subi 1, rc ; first iteration is done
455 lsh -1, ar1 ; shift divisor down
457 ; Do EXP subtract & shifts.
462 ; Quotient is in EXP+1 LSBs; shift remainder (in MSBs) down.
465 lsh3 ar0, ar2, r0 ; COUNT contains -(EXP+1)
469 ; Return (dividend - divisor).
479 ldi ar2, r0 ; set status from result
500 ; Determine sign of result. Get absolute value of operands.
502 ldi ar2, ar0 ; sign of result same as dividend
503 absi ar2, r0 ; make dividend positive
504 bvd mod_32 ; if still negative, escape
505 absi r2, r1 ; make divisor positive
506 ldi r1, ar1 ; save in ar1
507 cmpi r0, ar1 ; divisor > dividend ?
509 pop ir1 ; return address
510 bhid return ; if so, return dividend
512 ; Normalize operands. Use difference in exponents as shift count
513 ; for divisor, and as repeat count for SUBC.
515 float r1, r1 ; normalize divisor
516 pushf r1 ; push as float
518 bzd return ; if (float)divisor was zero, return
520 float r0, r1 ; normalize dividend
521 pushf r1 ; push as float
524 lsh -24, rc ; get divisor exponent
525 lsh -24, r1 ; get dividend exponent
526 subi rc, r1 ; get difference in exponents
527 lsh r1, ar1 ; align divisor with dividend
529 ; Do COUNT+1 subtract & shifts.
534 ; Remainder is in upper bits of R0
536 addi 1, r1 ; shift count is -(r1+1)
538 lsh r1, r0 ; shift right
540 ; Check sign and negate result if necessary.
543 bud ir1 ; delayed branch to return
544 negi r0, r1 ; negate result
545 cmpi 0, ar0 ; check sign
546 ldin r1, r0 ; if set, use negative result
549 ; The following code handles cases of a full 32-bit dividend. This occurs
550 ; when R0 = abs(R0) = 080000000h. Handle this by calling the unsigned mod
551 ; function, then negating the result if necessary.
554 push ar0 ; remember sign
555 call umodqi3n ; do divide
558 pop ar0 ; restore sign
559 pop ir1 ; return address
565 .global ___unsfltconst
566 ___unsfltconst: .float 4294967296.0
569 #ifdef L_unsfltcompare
571 .global ___unsfltcompare
572 ___unsfltcompare: .float 2147483648.0
575 ; Integer 32-bit signed multiplication
577 ; The TMS320C3x MPYI instruction takes two 24-bit signed integers
578 ; and produces a 48-bit signed result which is truncated to 32-bits.
580 ; A 32-bit by 32-bit multiplication thus requires a number of steps.
582 ; Consider the product of two 32-bit signed integers,
586 ; where x = (b << 16) + a, y = (d << 16) + c
588 ; This can be expressed as
590 ; z = ((b << 16) + a) * ((d << 16) + c)
592 ; = ((b * d) << 32) + ((b * c + a * d) << 16) + a * c
594 ; Let z = (f << 16) + e where f < (1 << 16).
596 ; Since we are only interested in a 32-bit result, we can ignore the
597 ; (b * d) << 32 term, and thus
599 ; f = b * c + a * d, e = a * c
601 ; We can simplify things if we have some a priori knowledge of the
602 ; operands, for example, if -32768 <= y <= 32767, then y = c and d = 0 and thus
604 ; f = b * c, e = a * c
606 ; ar2 multiplier, r2 multiplicand, r0 product
607 ; clobbers r1, r2, r3
622 pop ir1 ; return address
631 addi ar2, r2 ; c * b + a * d
632 bd ir1 ; delayed branch to return
633 lsh 16, r2 ; (c * b + a * d) << 16
635 addi r2, r0 ; a * c + (c * b + a * d) << 16
641 ; Integer 64 by 64 multiply
642 ; long1 and long2 on stack
722 ; Integer 32 by 32 multiply highpart unsigned
727 #ifdef L_umuldi3_high
729 .global ___umulhi3_high
768 ; Integer 32 by 32 multiply highpart signed
773 #ifdef L_smuldi3_high
775 .global ___smulhi3_high
821 ; Integer 64 by 64 unsigned divide
822 ; long1 and long2 on stack
825 ; routine takes a maximum of 64*8+23=535 cycles = 21.4 us @ 50Mhz
925 ; Integer 64 by 64 unsigned modulo
926 ; long1 and long2 on stack
948 ; Integer 64 by 64 signed divide
949 ; long1 and long2 on stack
985 ; Integer 64 by 64 signed modulo
986 ; long1 and long2 on stack
1024 ; double to signed long long conversion
1028 #ifdef L_fix_truncsfdi2
1030 .global ___fix_truncqfhi2
1031 .ref ufix_truncqfhi2n
1043 bge ufix_truncqfhi2n
1045 call ufix_truncqfhi2n
1052 ; double to unsigned long long conversion
1056 #ifdef L_ufix_truncsfdi2
1058 .global ___ufix_truncqfhi2
1059 .global ufix_truncqfhi2n
1098 ; signed long long to double conversion
1104 .global ___floathiqf2
1119 ; unsigned long long to double conversion
1123 #ifdef L_ufloatdisf2
1125 .global ___ufloathiqf2
1126 .global ufloathiqf2n
1135 ldpk @___unsfltconst
1140 ldf @___unsfltconst,r2
1169 ; long double to signed long long conversion
1173 #ifdef L_fix_truncdfdi2
1175 .global ___fix_trunchfhi2
1176 .ref ufix_trunchfhi2n
1189 bge ufix_trunchfhi2n
1191 call ufix_trunchfhi2n
1198 ; long double to unsigned long long conversion
1202 #ifdef L_ufix_truncdfdi2
1204 .global ___ufix_trunchfhi2
1205 .global ufix_trunchfhi2n
1245 ; signed long long to long double conversion
1251 .global ___floathihf2
1266 ; unsigned long long to double conversion
1270 #ifdef L_ufloatdidf2
1272 .global ___ufloathihf2
1273 .global ufloathihf2n
1282 ldpk @___unsfltconst
1287 ldf @___unsfltconst,r2
1340 ldpk @___unsfltconst
1345 ldflt @___unsfltconst,r1
1357 ; calculate long double * long double
1378 ldf r2,r0 ; copy lsb0
1379 ldf r3,r1 ; copy lsb1
1380 and 0ffh,r0 ; mask lsb0
1381 and 0ffh,r1 ; mask lsb1
1382 norm r0,r0 ; correct lsb0
1383 norm r1,r1 ; correct lsb1
1384 mpyf r2,r1 ; arg0*lsb1
1385 mpyf r3,r0 ; arg1*lsb0
1386 bd ar2 ; return (delayed)
1387 addf r0,r1 ; arg0*lsb1 + arg1*lsb0
1388 mpyf r2,r3,r0 ; msb0*msb1
1389 addf r1,r0 ; msb0*msb1 + arg0*lsb1 + arg1*lsb0
1393 ; calculate long double / long double
1394 ; r2 dividend, r3 divisor, r0 quotient
1433 mpyf3 r0, r3, r1 ; r1 = r[0] * v
1434 subrf 2.0, r1 ; r1 = 2.0 - r[0] * v
1435 mpyf r1, r0 ; r0 = r[0] * (2.0 - r[0] * v) = r[1]
1436 ; End of 1st iteration
1438 mpyf3 r0, r3, r1 ; r1 = r[1] * v
1439 subrf 2.0, r1 ; r1 = 2.0 - r[1] * v
1440 mpyf r1, r0 ; r0 = r[1] * (2.0 - r[1] * v) = r[2]
1441 ; End of 2nd iteration
1443 mpyf3 r0, r3, r1 ; r1 = r[2] * v
1444 subrf 2.0, r1 ; r1 = 2.0 - r[2] * v
1445 mpyf r1, r0 ; r0 = r[2] * (2.0 - r[2] * v) = r[3]
1446 ; End of 3rd iteration
1451 ; mpyf3 r0, r3, r1 ; r1 = r[3] * v
1468 subrf 2.0, r1 ; r1 = 2.0 - r[3] * v
1470 mpyf r1, r0, r3 ; r3 = r[3] * (2.0 - r[3] * v) = r[5]
1484 mpyf r2, r3, r0 ; Multiply by the dividend