1 /* Copyright (C) 2010-2015 Free Software Foundation, Inc.
2 Contributed by Bernd Schmidt <bernds@codesourcery.com>.
4 This file is free software; you can redistribute it and/or modify it
5 under the terms of the GNU General Public License as published by the
6 Free Software Foundation; either version 3, or (at your option) any
9 This file is distributed in the hope that it will be useful, but
10 WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 General Public License for more details.
14 Under Section 7 of GPL version 3, you are granted additional
15 permissions described in the GCC Runtime Library Exception, version
16 3.1, as published by the Free Software Foundation.
18 You should have received a copy of the GNU General Public License and
19 a copy of the GCC Runtime Library Exception along with this program;
20 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
21 <http://www.gnu.org/licenses/>. */
23 ;; ABI considerations for the divide functions
24 ;; The following registers are call-used:
25 ;; __c6xabi_divi A0,A1,A2,A4,A6,B0,B1,B2,B4,B5
26 ;; __c6xabi_divu A0,A1,A2,A4,A6,B0,B1,B2,B4
27 ;; __c6xabi_remi A1,A2,A4,A5,A6,B0,B1,B2,B4
28 ;; __c6xabi_remu A1,A4,A5,A7,B0,B1,B2,B4
30 ;; In our implementation, divu and remu are leaf functions,
31 ;; while both divi and remi call into divu.
32 ;; A0 is not clobbered by any of the functions.
33 ;; divu does not clobber B2 either, which is taken advantage of
35 ;; divi uses B5 to hold the original return address during
37 ;; remi uses B2 and A5 to hold the input values during the
38 ;; call to divu. It stores B3 in on the stack.
;; __c6xabi_divi — signed 32-bit divide (A4 / B4 -> A4).
;; NOTE(review): fragment — the entry label and several interleaved
;; lines are not visible in this view.
45 .type __c6xabi_divi, STT_FUNC
;; Branch to the unsigned divide; the C6x branch has delay slots, so
;; the parallel/following instructions still execute before the call
;; is taken.
48 call .s2 __c6xabi_divu
;; Record operand signs: A1 = (A4 < 0), B1 = (B4 < 0).
50 || cmpgt .l1 0, A4, A1
51 || cmpgt .l2 0, B4, B1
;; Negate a negative divisor so divu sees a non-negative value;
;; A1 ^= B1 leaves A1 nonzero iff exactly one operand was negative
;; (i.e. the quotient must be negated).
54 || [B1] neg .l2 B4, B4
55 || xor .s1x A1, B1, A1
;; If the signs differed, redirect the return address (B3) to local
;; label 1f — presumably a stub that negates the quotient before the
;; real return; TODO confirm, the label's code is outside this view.
58 [A1] addkpc .s2 1f, B3, 4
;; __c6xabi_remi / __c6xabi_divremi — signed remainder (and combined
;; divide+remainder). MOD_OUTPUT_REG selects where the remainder is
;; delivered: A4 for plain remi, A5 for divremi (A4 then holds the
;; quotient). NOTE(review): fragment — labels and some interleaved
;; lines are not visible in this view.
#if defined L_modsi3 || defined L_divmodsi4
74 #define MOD_OUTPUT_REG A4
77 .type __c6xabi_remi, STT_FUNC
79 #define MOD_OUTPUT_REG A5
80 .global __c6xabi_divremi
81 .hidden __c6xabi_divremi
82 .type __c6xabi_divremi, STT_FUNC
;; Save the caller's return address; per the header comment this
;; routine is not a leaf (it calls divu), so B3 must survive.
87 stw .d2t2 B3, *B15--[2]
;; Record operand signs: A1 = (A4 < 0), B2 = (B4 < 0).
88 || cmpgt .l1 0, A4, A1
89 || cmpgt .l2 0, B4, B2
91 || call .s2 __c6xabi_divu
;; Negate a negative divisor; B0 = sign(divisor) ^ sign(dividend),
;; used below to pick the post-divide fixup path.
94 || [B2] neg .l2 B4, B4
95 || xor .s2x B2, A1, B0
;; Select the return point for the divu call: 1f when the signs
;; differed, 2f otherwise — presumably negating vs. non-negating
;; fixup code; TODO confirm, both labels are outside this view.
99 [B0] addkpc .s2 1f, B3, 1
100 [!B0] addkpc .s2 2f, B3, 1
;; Restore the caller's return address saved above.
112 ldw .d2t2 *++B15[2], B3
;; Remainder = dividend - quotient * divisor. C6400+ has a full
;; 32x32 multiply (mpy32); per the header comment A5/B2 hold the
;; saved dividend/divisor across the divu call.
114 #ifdef _TMS320C6400_PLUS
115 mpy32 .m1x A4, B2, A6
118 sub .l1 A5, A6, MOD_OUTPUT_REG
;; Pre-C6400+ path: build the 32-bit product from 16x16 partial
;; products (mpylhu pair; further lines of this sequence are not
;; visible in this view).
123 mpylhu .m1x A4, B2, A6
124 || mpylhu .m2x B2, A4, B2
130 sub .l1 A5, A6, MOD_OUTPUT_REG
;; __c6xabi_divu / __c6xabi_divremu — unsigned 32-bit divide, leaf
;; function (see the clobber list in the file header).
;; NOTE(review): fragment — the entry labels and parts of the subc
;; sequence are not visible in this view.
#if defined L_udivsi3 || defined L_udivmodsi4
139 .global __c6xabi_divu
140 .hidden __c6xabi_divu
141 .type __c6xabi_divu, STT_FUNC
144 .global __c6xabi_divremu
145 .hidden __c6xabi_divremu
146 .type __c6xabi_divremu, STT_FUNC
149 ;; We use a series of up to 31 subc instructions. First, we find
150 ;; out how many leading zero bits there are in the divisor. This
151 ;; gives us both a shift count for aligning (shifting) the divisor
;; to the dividend, and the number of times we have to execute subc.
154 ;; At the end, we have both the remainder and most of the quotient
155 ;; in A4. The top bit of the quotient is computed first and is
158 ;; Return immediately if the dividend is zero. Setting B4 to 1
159 ;; is a trick to allow us to leave the following insns in the jump
160 ;; delay slot without affecting the result.
;; lmbd = leftmost-bit-detect: B1 = number of leading zeros in the
;; divisor B4 (the subc iteration count).
166 [b1] lmbd .l2 1, B4, B1
167 ||[!b1] b .s2 B3 ; RETURN A
169 ||[!b1] mvk .d2 1, B4
;; Align the divisor with the dividend's top bits.
175 || shl .s2 B4, B1, B4
177 ;; The loop performs a maximum of 28 steps, so we do the
;; First quotient bit by explicit compare/subtract rather than subc.
179 cmpltu .l1x A4, B4, A2
180 [!A2] sub .l1x A4, B4, A4
181 || shru .s2 B4, 1, B4
;; Conditional-subtract steps: each subc produces one quotient bit
;; into A4 while B1 counts down the remaining iterations.
185 || [b1] subc .l1x A4,B4,A4
186 || [b1] add .s2 -1, B1, B1
187 [b1] subc .l1x A4,B4,A4
188 || [b1] add .s2 -1, B1, B1
190 ;; RETURN A may happen here (note: must happen before the next branch)
193 || [b1] subc .l1x A4,B4,A4
194 || [b1] add .s2 -1, B1, B1
195 [b1] subc .l1x A4,B4,A4
196 || [b1] add .s2 -1, B1, B1
198 [b1] subc .l1x A4,B4,A4
199 || [b1] add .s2 -1, B1, B1
200 [b1] subc .l1x A4,B4,A4
201 || [b1] add .s2 -1, B1, B1
202 [b1] subc .l1x A4,B4,A4
203 || [b1] add .s2 -1, B1, B1
204 [b1] subc .l1x A4,B4,A4
205 || [b1] add .s2 -1, B1, B1
206 [b1] subc .l1x A4,B4,A4
207 || [b1] add .s2 -1, B1, B1
208 ;; loop backwards branch happens here
;; Split A4: extract the remainder field into A5 (quotient stays in
;; A4). A6 presumably holds the field width computed earlier — TODO
;; confirm, its setup is outside this view.
214 || extu .s1 A4, A6, A5
;; __c6xabi_remu — unsigned 32-bit remainder, leaf function. The ABI
;; (see the header) prevents sharing code with divu, so the subc
;; sequence is duplicated here. NOTE(review): fragment — the entry
;; label and parts of the sequence are not visible in this view.
227 .global __c6xabi_remu
228 .hidden __c6xabi_remu
229 .type __c6xabi_remu, STT_FUNC
231 ;; The ABI seems designed to prevent these functions calling each other,
232 ;; so we duplicate most of the divsi3 code here.
;; Zero dividend: return immediately (with B4 forced to 1 so the
;; delay-slot instructions are harmless, as in divu).
238 ||[!b1] b .s2 B3 ; RETURN A
240 ||[!b1] mvk .d2 1, B4
;; Align the divisor with the dividend's top bits.
244 || shl .s2 B4, B1, B4
;; First step by explicit compare/subtract.
246 cmpltu .l1x A4, B4, A1
247 [!a1] sub .l1x A4, B4, A4
;; Conditional-subtract steps; only the running remainder in A4 is
;; of interest here, B1 counts the remaining iterations down.
252 || [b1] subc .l1x A4,B4,A4
253 || [b1] add .s2 -1, B1, B1
254 ;; RETURN A may happen here (note: must happen before the next branch)
255 [b1] subc .l1x A4,B4,A4
256 || [b1] add .s2 -1, B1, B1
258 [b1] subc .l1x A4,B4,A4
259 || [b1] add .s2 -1, B1, B1
260 [b1] subc .l1x A4,B4,A4
261 || [b1] add .s2 -1, B1, B1
262 [b1] subc .l1x A4,B4,A4
263 || [b1] add .s2 -1, B1, B1
264 [b1] subc .l1x A4,B4,A4
265 || [b1] add .s2 -1, B1, B1
266 [b1] subc .l1x A4,B4,A4
267 || [b1] add .s2 -1, B1, B1
268 ;; loop backwards branch happens here
271 [b1] subc .l1x A4,B4,A4
272 || [b1] add .s2 -1, B1, B1
273 [b1] subc .l1x A4,B4,A4
;; __c6xabi_strasgi_64plus — C6400+ variant of the block-copy helper.
;; NOTE(review): fragment — the loop setup and control flow around
;; these two instructions are not visible in this view.
#if defined L_strasgi_64plus && defined _TMS320C6400_PLUS
282 .global __c6xabi_strasgi_64plus
283 .hidden __c6xabi_strasgi_64plus
284 .type __c6xabi_strasgi_64plus, STT_FUNC
285 __c6xabi_strasgi_64plus:
;; Word-at-a-time copy: load through b30 into b31, store a31 through
;; a30, both pointers post-incremented.
294 ldw .d2t2 *b30++, b31
298 || stw .d1t1 a31, *a30++
;; __c6xabi_strasgi — structure-assignment (block copy) helper.
;; Essentially memcpy with word alignment and size >= 28, a multiple
;; of 4 (see the comment below). Software-pipelined: loads through B4
;; run ahead of stores through A4, with cmpltu recomputing the loop
;; predicate B0 each cycle against the remaining byte count.
;; NOTE(review): fragment — the entry label, prologue loads, and the
;; branch instructions are not visible in this view.
305 .global __c6xabi_strasgi
306 .type __c6xabi_strasgi, STT_FUNC
308 ;; This is essentially memcpy, with alignment known to be at least
309 ;; 4, and the size a multiple of 4 greater than or equal to 28.
;; Bias the length down by 24 (the amount handled per pipelined
;; iteration plus drain, presumably — TODO confirm against the
;; missing setup code).
314 || sub .d1 A6, 24, A6
321 || cmpltu .l2 B2, B7, B0
325 ||[b0] ldw .d2t1 *B4++, A0
329 [b0] sub .d2 B6, 24, B7
331 || cmpltu .l2 B1, B6, B0
;; Steady state: each step loads the next source word into a distinct
;; register (A1, A5, A7, A8, A9) while storing an earlier value, and
;; tests the shrinking remaining-count thresholds 12/8/4/0.
333 [b0] ldw .d2t1 *B4++, A1
334 || stw .d1t2 B5, *A4++
336 || cmpltu .l2 12, B6, B0
338 [b0] ldw .d2t1 *B4++, A5
339 || stw .d1t2 B5, *A4++
341 || cmpltu .l2 8, B6, B0
343 [b0] ldw .d2t1 *B4++, A7
344 || stw .d1t2 B5, *A4++
346 || cmpltu .l2 4, B6, B0
348 [b0] ldw .d2t1 *B4++, A8
349 || stw .d1t2 B5, *A4++
351 || cmpltu .l2 0, B6, B0
353 [b0] ldw .d2t1 *B4++, A9
354 || stw .d1t2 B5, *A4++
356 || cmpltu .l2 B2, B7, B0
358 ;; loop back branch happens here
;; Epilogue: drain the pipeline — store the words still in flight,
;; each store predicated on whether that much data remains.
360 cmpltu .l2 B1, B6, B0
363 [b0] stw .d1t1 A1, *A4++
364 || cmpltu .l2 12, B6, B0
365 [b0] stw .d1t1 A5, *A4++
366 || cmpltu .l2 8, B6, B0
367 [b0] stw .d1t1 A7, *A4++
368 || cmpltu .l2 4, B6, B0
369 [b0] stw .d1t1 A8, *A4++
370 || cmpltu .l2 0, B6, B0
371 [b0] stw .d1t1 A9, *A4++
373 ;; return happens here
;; __c6xabi_push_rts — save the callee-saved register set onto the
;; stack (B15): B14, A15:A14, B13:B12, A13:A12, B11:B10, A11:A10,
;; B3:B2. Counterpart of __c6xabi_pop_rts below; used by the compiler
;; for compact prologues. NOTE(review): fragment — the entry label
;; and the return branch are not visible in this view.
#ifdef _TMS320C6400_PLUS
380 .global __c6xabi_push_rts
381 .hidden __c6xabi_push_rts
382 .type __c6xabi_push_rts, STT_FUNC
;; Single-word store of B14 first (keeps the following stdw's
;; double-word slots aligned), then pair stores, alternating register
;; files so both data paths (.d2t1/.d2t2) are used.
384 stw .d2t2 B14, *B15--[2]
385 stdw .d2t1 A15:A14, *B15--
387 stdw .d2t2 B13:B12, *B15--
388 stdw .d2t1 A13:A12, *B15--
389 stdw .d2t2 B11:B10, *B15--
390 stdw .d2t1 A11:A10, *B15--
391 stdw .d2t2 B3:B2, *B15--
;; __c6xabi_pop_rts — restore the register set saved by
;; __c6xabi_push_rts, in exact reverse order, then (presumably)
;; return via the restored B3 — the branch is not visible in this
;; view. Used by the compiler for compact epilogues.
396 .global __c6xabi_pop_rts
397 .hidden __c6xabi_pop_rts
398 .type __c6xabi_pop_rts, STT_FUNC
400 lddw .d2t2 *++B15, B3:B2
401 lddw .d2t1 *++B15, A11:A10
402 lddw .d2t2 *++B15, B11:B10
403 lddw .d2t1 *++B15, A13:A12
404 lddw .d2t2 *++B15, B13:B12
405 lddw .d2t1 *++B15, A15:A14
;; B14 was pushed as a single word with a [2] offset; pop it the
;; same way to rebalance the stack pointer.
407 ldw .d2t2 *++B15[2], B14
;; __c6xabi_call_stub — save the caller-visible scratch registers
;; (A2, A7:A6, A1:A0, B7:B6, B5:B4, B1:B0, B3:B2) around a call,
;; then restore them all afterwards. NOTE(review): fragment — the
;; entry label, the call itself, and the final return are not
;; visible in this view.
413 .global __c6xabi_call_stub
414 .type __c6xabi_call_stub, STT_FUNC
;; A2 pushed as a single word with a [2] offset so the following
;; double-word stdw slots stay aligned.
416 stw .d2t1 A2, *B15--[2]
417 stdw .d2t1 A7:A6, *B15--
419 stdw .d2t1 A1:A0, *B15--
420 stdw .d2t2 B7:B6, *B15--
421 stdw .d2t2 B5:B4, *B15--
422 stdw .d2t2 B1:B0, *B15--
423 stdw .d2t2 B3:B2, *B15--
;; Set B3 to local label 1f so the (not-visible) callee returns to
;; the restore sequence below.
424 || addkpc .s2 1f, B3, 0
;; Restore everything in reverse order.
426 lddw .d2t2 *++B15, B3:B2
427 lddw .d2t2 *++B15, B1:B0
428 lddw .d2t2 *++B15, B5:B4
429 lddw .d2t2 *++B15, B7:B6
430 lddw .d2t1 *++B15, A1:A0
431 lddw .d2t1 *++B15, A7:A6
433 ldw .d2t1 *++B15[2], A2