1 /* Copyright
2010, 2011 Free Software Foundation
, Inc.
2 Contributed by Bernd Schmidt
<bernds
@codesourcery.com
>.
4 This file is free software
; you can redistribute it and/or modify it
5 under the terms of the GNU General
Public License as published by the
6 Free Software Foundation
; either version 3, or (at your option) any
9 This file is distributed
in the hope that it will be useful
, but
10 WITHOUT ANY WARRANTY
; without even the implied warranty of
11 MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 General
Public License for more details.
14 Under
Section 7 of GPL version
3, you are granted additional
15 permissions described
in the GCC Runtime Library Exception
, version
16 3.1, as published by the Free Software Foundation.
18 You should have received a copy of the GNU General
Public License
and
19 a copy of the GCC Runtime Library Exception along with
this program
;
20 see the files COPYING3
and COPYING.RUNTIME respectively. If
not, see
21 <http://www.gnu.
org/licenses
/>.
*/
23 ;; ABI considerations for the divide functions
24 ;; The following registers are call-used:
25 ;; __c6xabi_divi A0,A1,A2,A4,A6,B0,B1,B2,B4,B5
26 ;; __c6xabi_divu A0,A1,A2,A4,A6,B0,B1,B2,B4
27 ;; __c6xabi_remi A1,A2,A4,A5,A6,B0,B1,B2,B4
28 ;; __c6xabi_remu A1,A4,A5,A7,B0,B1,B2,B4
30 ;; In our implementation, divu and remu are leaf functions,
31 ;; while both divi and remi call into divu.
32 ;; A0 is not clobbered by any of the functions.
33 ;; divu does not clobber B2 either, which is taken advantage of
35 ;; divi uses B5 to hold the original return address during
37 ;; remi uses B2 and A5 to hold the input values during the
38 ;; call to divu. It stores B3 on the stack.
;; __c6xabi_divi (fragment): signed divide, implemented on top of
;; __c6xabi_divu.
;; NOTE(review): this chunk is a lossy extraction -- the function
;; label and several original lines (46-47, 49, 52-53, 56-57 per the
;; embedded numbering) are missing; comments describe only the
;; visible instructions.
45 .
type __c6xabi_divi
, STT_FUNC
;; Branch to the unsigned divide helper (delay slots filled below).
48 call .s2 __c6xabi_divu
;; In the delay slots: A1 = (A4 < 0), B1 = (B4 < 0) -- sign flags of
;; the two inputs (cmpgt 0,x sets the predicate when x is negative).
50 || cmpgt .l1
0, A4
, A1
51 || cmpgt .l2
0, B4
, B1
;; Negate the divisor when it was negative, and XOR the two sign
;; flags into A1 = "quotient must be negated" (the matching
;; conditional neg of A4 is on a line missing from this chunk).
54 ||
[B1
] neg .l2 B4
, B4
55 ||
xor .s1x A1
, B1
, A1
;; If the result needs negating, point the return address B3 at the
;; local label 1f -- presumably a negate-and-return epilogue, not
;; visible in this chunk -- so divu "returns" there instead.
58 [A1
] addkpc .s2
1f
, B3
, 4
71 #if defined L_modsi3 || defined L_divmodsi4
74 #define MOD_OUTPUT_REG A4
77 .
type __c6xabi_remi
, STT_FUNC
79 #define MOD_OUTPUT_REG A5
80 .
global __c6xabi_divremi
81 .hidden __c6xabi_divremi
82 .
type __c6xabi_divremi
, STT_FUNC
87 stw .d2t2 B3
, *B15
--[2]
88 || cmpgt .l1
0, A4
, A1
89 || cmpgt .l2
0, B4
, B2
91 ||
call .s2 __c6xabi_divu
94 ||
[B2
] neg .l2 B4
, B4
95 ||
xor .s2x B2
, A1
, B0
99 [B0
] addkpc .s2
1f
, B3
, 1
100 [!B0
] addkpc .s2
2f
, B3
, 1
112 ldw .d2t2
*++B15
[2], B3
114 #ifdef _TMS320C6400_PLUS
115 mpy32 .m1x A4
, B2
, A6
118 sub .l1 A5
, A6
, MOD_OUTPUT_REG
123 mpylhu .m1x A4
, B2
, A6
124 || mpylhu .m2x B2
, A4
, B2
130 sub .l1 A5
, A6
, MOD_OUTPUT_REG
136 #if defined L_udivsi3 || defined L_udivmodsi4
139 .
global __c6xabi_divu
140 .hidden __c6xabi_divu
141 .
type __c6xabi_divu
, STT_FUNC
144 .
global __c6xabi_divremu
145 .hidden __c6xabi_divremu
146 .
type __c6xabi_divremu
, STT_FUNC
149 ;; We use a series of up to 31 subc instructions. First, we find
150 ;; out how many leading zero bits there are in the divisor. This
151 ;; gives us both a shift count for aligning (shifting) the divisor
152 ;; to the dividend, and the number of times we have to execute subc.
154 ;; At the end, we have both the remainder and most of the quotient
155 ;; in A4. The top bit of the quotient is computed first and is
158 ;; Return immediately if the dividend is zero. Setting B4 to 1
159 ;; is a trick to allow us to leave the following insns in the jump
160 ;; delay slot without affecting the result.
166 [b1
] lmbd .l2
1, B4
, B1
167 ||
[!b1
] b .s2 B3
; RETURN A
169 ||
[!b1
] mvk .d2
1, B4
175 ||
shl .s2 B4
, B1
, B4
177 ;; The loop performs a maximum of 28 steps, so we do the
179 cmpltu .l1x A4
, B4
, A2
180 [!A2
] sub .l1x A4
, B4
, A4
181 || shru .s2 B4
, 1, B4
185 ||
[b1
] subc .l1x A4
,B4
,A4
186 ||
[b1
] add .s2
-1, B1
, B1
187 [b1
] subc .l1x A4
,B4
,A4
188 ||
[b1
] add .s2
-1, B1
, B1
190 ;; RETURN A may happen here (note: must happen before the next branch)
193 ||
[b1
] subc .l1x A4
,B4
,A4
194 ||
[b1
] add .s2
-1, B1
, B1
195 [b1
] subc .l1x A4
,B4
,A4
196 ||
[b1
] add .s2
-1, B1
, B1
198 [b1
] subc .l1x A4
,B4
,A4
199 ||
[b1
] add .s2
-1, B1
, B1
200 [b1
] subc .l1x A4
,B4
,A4
201 ||
[b1
] add .s2
-1, B1
, B1
202 [b1
] subc .l1x A4
,B4
,A4
203 ||
[b1
] add .s2
-1, B1
, B1
204 [b1
] subc .l1x A4
,B4
,A4
205 ||
[b1
] add .s2
-1, B1
, B1
206 [b1
] subc .l1x A4
,B4
,A4
207 ||
[b1
] add .s2
-1, B1
, B1
208 ;; loop backwards branch happens here
214 || extu .s1 A4
, A6
, A5
227 .
global __c6xabi_remu
228 .hidden __c6xabi_remu
229 .
type __c6xabi_remu
, STT_FUNC
231 ;; The ABI seems designed to prevent these functions calling each other,
232 ;; so we duplicate most of the divsi3 code here.
238 ||
[!b1
] b .s2 B3
; RETURN A
240 ||
[!b1
] mvk .d2
1, B4
244 ||
shl .s2 B4
, B1
, B4
246 cmpltu .l1x A4
, B4
, A1
247 [!a1
] sub .l1x A4
, B4
, A4
252 ||
[b1
] subc .l1x A4
,B4
,A4
253 ||
[b1
] add .s2
-1, B1
, B1
254 ;; RETURN A may happen here (note: must happen before the next branch)
255 [b1
] subc .l1x A4
,B4
,A4
256 ||
[b1
] add .s2
-1, B1
, B1
258 [b1
] subc .l1x A4
,B4
,A4
259 ||
[b1
] add .s2
-1, B1
, B1
260 [b1
] subc .l1x A4
,B4
,A4
261 ||
[b1
] add .s2
-1, B1
, B1
262 [b1
] subc .l1x A4
,B4
,A4
263 ||
[b1
] add .s2
-1, B1
, B1
264 [b1
] subc .l1x A4
,B4
,A4
265 ||
[b1
] add .s2
-1, B1
, B1
266 [b1
] subc .l1x A4
,B4
,A4
267 ||
[b1
] add .s2
-1, B1
, B1
268 ;; loop backwards branch happens here
271 [b1
] subc .l1x A4
,B4
,A4
272 ||
[b1
] add .s2
-1, B1
, B1
273 [b1
] subc .l1x A4
,B4
,A4
279 #if defined L_strasgi_64plus
&& defined _TMS320C6400_PLUS
282 .
global __c6xabi_strasgi_64plus
283 .hidden __c6xabi_strasgi_64plus
284 .
type __c6xabi_strasgi_64plus
, STT_FUNC
285 __c6xabi_strasgi_64plus:
294 ldw .d2t2
*b30
++, b31
298 || stw .d1t1 a31
, *a30
++
305 .
global __c6xabi_strasgi
306 .
type __c6xabi_strasgi
, STT_FUNC
308 ;; This is essentially memcpy, with alignment known to be at least
309 ;; 4, and the size a multiple of 4 greater than or equal to 28.
314 ||
sub .d1 A6
, 24, A6
321 || cmpltu .l2 B2
, B7
, B0
325 ||
[b0
] ldw .d2t1
*B4
++, A0
329 [b0
] sub .d2 B6
, 24, B7
331 || cmpltu .l2 B1
, B6
, B0
333 [b0
] ldw .d2t1
*B4
++, A1
334 || stw .d1t2 B5
, *A4
++
336 || cmpltu .l2
12, B6
, B0
338 [b0
] ldw .d2t1
*B4
++, A5
339 || stw .d1t2 B5
, *A4
++
341 || cmpltu .l2
8, B6
, B0
343 [b0
] ldw .d2t1
*B4
++, A7
344 || stw .d1t2 B5
, *A4
++
346 || cmpltu .l2
4, B6
, B0
348 [b0
] ldw .d2t1
*B4
++, A8
349 || stw .d1t2 B5
, *A4
++
351 || cmpltu .l2
0, B6
, B0
353 [b0
] ldw .d2t1
*B4
++, A9
354 || stw .d1t2 B5
, *A4
++
356 || cmpltu .l2 B2
, B7
, B0
358 ;; loop back branch happens here
360 cmpltu .l2 B1
, B6
, B0
363 [b0
] stw .d1t1 A1
, *A4
++
364 || cmpltu .l2
12, B6
, B0
365 [b0
] stw .d1t1 A5
, *A4
++
366 || cmpltu .l2
8, B6
, B0
367 [b0
] stw .d1t1 A7
, *A4
++
368 || cmpltu .l2
4, B6
, B0
369 [b0
] stw .d1t1 A8
, *A4
++
370 || cmpltu .l2
0, B6
, B0
371 [b0
] stw .d1t1 A9
, *A4
++
373 ;; return happens here
#ifdef _TMS320C6400_PLUS
;; __c6xabi_push_rts (fragment): spills the callee-saved register set
;; through B15 (the stack pointer in the C6x EABI) using
;; post-decrement addressing.
;; NOTE(review): the function label and the final branch/return are
;; not visible here (original lines 378-379, 383, 386 and everything
;; after 391 are missing from this chunk).
380 .
global __c6xabi_push_rts
381 .hidden __c6xabi_push_rts
382 .
type __c6xabi_push_rts
, STT_FUNC
;; Store B14 first; the [2]-word post-decrement leaves a pad word,
;; presumably keeping B15 doubleword-aligned for the stdw stores
;; below -- TODO confirm against the full source.
384 stw .d2t2 B14
, *B15
--[2]
385 stdw .d2t1
A15:A14
, *B15
--
387 stdw .d2t2
B13:B12
, *B15
--
388 stdw .d2t1
A13:A12
, *B15
--
389 stdw .d2t2
B11:B10
, *B15
--
390 stdw .d2t1
A11:A10
, *B15
--
391 stdw .d2t2
B3:B2
, *B15
--
;; __c6xabi_pop_rts (fragment): reloads, in reverse order, the register
;; pairs that __c6xabi_push_rts stored, using pre-increment loads
;; through the stack pointer B15.
;; NOTE(review): original line 406 and everything after 407 --
;; presumably the return branch and its delay slots -- are missing
;; from this chunk.
396 .
global __c6xabi_pop_rts
397 .hidden __c6xabi_pop_rts
398 .
type __c6xabi_pop_rts
, STT_FUNC
400 lddw .d2t2
*++B15
, B3:B2
401 lddw .d2t1
*++B15
, A11:A10
402 lddw .d2t2
*++B15
, B11:B10
403 lddw .d2t1
*++B15
, A13:A12
404 lddw .d2t2
*++B15
, B13:B12
405 lddw .d2t1
*++B15
, A15:A14
;; Last, reload B14; the [2]-word pre-increment mirrors push_rts's
;; stw *B15--[2], skipping the pad word it left.
407 ldw .d2t2
*++B15
[2], B14
;; __c6xabi_call_stub (fragment): saves A2 plus the register pairs
;; A7:A6, A1:A0, B7:B6, B5:B4, B1:B0 and B3:B2 on the stack (B15),
;; arranges for control to come back to the local label 1f, then
;; restores everything in reverse order.
;; NOTE(review): this chunk is missing the function label (original
;; line 415), the instruction the final "||" store/addkpc pair
;; executes with, the "1:" label itself (around original line 425),
;; and the return sequence after line 433 -- the comments below cover
;; only the visible save/restore ladder.
413 .
global __c6xabi_call_stub
414 .
type __c6xabi_call_stub
, STT_FUNC
;; Save A2 first; the [2]-word post-decrement leaves a pad word,
;; presumably to keep B15 aligned for the stdw stores -- confirm.
416 stw .d2t1 A2
, *B15
--[2]
417 stdw .d2t1
A7:A6
, *B15
--
419 stdw .d2t1
A1:A0
, *B15
--
420 stdw .d2t2
B7:B6
, *B15
--
421 stdw .d2t2
B5:B4
, *B15
--
422 stdw .d2t2
B1:B0
, *B15
--
;; Final push executes in parallel with addkpc, which loads B3 with
;; the address of 1f (0 NOPs requested) so the callee returns to the
;; restore sequence below.
423 stdw .d2t2
B3:B2
, *B15
--
424 || addkpc .s2
1f
, B3
, 0
;; Restore path (entered via B3 = 1f): pop the pairs back in reverse
;; order of the saves above.
426 lddw .d2t2
*++B15
, B3:B2
427 lddw .d2t2
*++B15
, B1:B0
428 lddw .d2t2
*++B15
, B5:B4
429 lddw .d2t2
*++B15
, B7:B6
430 lddw .d2t1
*++B15
, A1:A0
431 lddw .d2t1
*++B15
, A7:A6
;; Reload A2, skipping the pad word with the [2] pre-increment.
433 ldw .d2t1
*++B15
[2], A2