beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / arm / v6t2 / divrem_1.asm
blobbe24615acbbebefa2e08c195f1bc1f1b296a529d
1 dnl ARM v6t2 mpn_divrem_1 and mpn_preinv_divrem_1.
3 dnl Contributed to the GNU project by Torbjörn Granlund.
5 dnl Copyright 2012 Free Software Foundation, Inc.
7 dnl This file is part of the GNU MP Library.
8 dnl
9 dnl The GNU MP Library is free software; you can redistribute it and/or modify
10 dnl it under the terms of either:
11 dnl
12 dnl * the GNU Lesser General Public License as published by the Free
13 dnl Software Foundation; either version 3 of the License, or (at your
14 dnl option) any later version.
15 dnl
16 dnl or
17 dnl
18 dnl * the GNU General Public License as published by the Free Software
19 dnl Foundation; either version 2 of the License, or (at your option) any
20 dnl later version.
21 dnl
22 dnl or both in parallel, as here.
23 dnl
24 dnl The GNU MP Library is distributed in the hope that it will be useful, but
25 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
27 dnl for more details.
28 dnl
29 dnl You should have received copies of the GNU General Public License and the
30 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
31 dnl see https://www.gnu.org/licenses/.
33 include(`../config.m4')
35 C norm unorm frac
36 C StrongARM - - -
37 C XScale - - -
38 C Cortex-A7 ? ? ?
39 C Cortex-A8 ? ? ?
40 C Cortex-A9 13 14 13
41 C Cortex-A15 11.4 11.8 11.1
43 C TODO
44 C * Optimise inner-loops better, they could likely run a cycle or two faster.
45 C * Decrease register usage, streamline non-loop code.
C Symbolic names used by both entry points below.
C
C Arguments that arrive in registers.
47 define(`qp_arg', `r0')
48 define(`fn', `r1')
49 define(`up_arg', `r2')
50 define(`n_arg', `r3')
C Byte offsets of the arguments that arrive on the stack.  The code adds 9*4
C to each of them to step over the nine registers pushed on entry.
51 define(`d_arg', `0')
52 define(`dinv_arg',`4')
53 define(`cnt_arg', `8')
C Working registers.  All of these except dinv are in the set pushed on entry.
55 define(`n', `r9')
56 define(`qp', `r5')
57 define(`up', `r6')
58 define(`cnt', `r7')
59 define(`tnc', `r10')
60 define(`dinv', `r0') C same register as qp_arg; qp is copied to r5 before dinv is set
61 define(`d', `r4')
63 ASM_START()
C mpn_preinv_divrem_1: entry point for callers that supply the inverse and the
C shift count themselves.  Besides d it reads dinv and cnt from the stack, sets
C up the same registers as mpn_divrem_1, and then jumps into that function at
C L(nent) or L(uent), i.e. just after its calls to mpn_invert_limb.  The value
C is returned in r0 by the shared exit code at L(rtn).
64 PROLOGUE(mpn_preinv_divrem_1)
65 stmfd sp!, {r4, r5, r6, r7, r8, r9, r10, r11, lr} C push 9 words, hence the 9*4 below
66 ldr d, [sp, #9*4+d_arg]
67 ldr cnt, [sp, #9*4+cnt_arg]
68 str r1, [sp, #9*4+d_arg] C reuse d stack slot for fn
69 sub n, r3, #1 C n = n_arg - 1
70 add r3, r1, n C r3 = fn + n
71 cmp d, #0 C flags are consumed by the blt below; nothing in between sets flags
72 add qp, qp_arg, r3, lsl #2 C put qp at Q[] end
73 add up, up_arg, n, lsl #2 C put up at U[] end
74 ldr dinv, [sp, #9*4+dinv_arg] C dinv is r0, so load it only after qp_arg has been used
75 blt L(nent) C d negative as a signed word (top bit set): normalised path
76 b L(uent) C otherwise unnormalised path, which shifts d left by cnt
77 EPILOGUE()
C mpn_divrem_1: takes qp, fn, up and n in r0-r3 and d on the stack.  It calls
C mpn_invert_limb to obtain dinv, stores n quotient limbs followed by fn
C further quotient limbs (computed with a zero low limb) downwards from
C qp[fn+n-1], and returns the final value of r, shifted back right by cnt, in
C r0.  Two paths exist: L(normalised) when the top bit of d is set, and
C L(unnorm) otherwise, where d and the U[] limbs are shifted left by cnt on
C the fly.  Both paths finish in the shared L(frac) loop.
C NOTE(review): each quotient step looks like GMP's udiv_qrnnd_preinv
C (division by an invariant limb using a precomputed inverse) - confirm.
79 PROLOGUE(mpn_divrem_1)
80 stmfd sp!, {r4, r5, r6, r7, r8, r9, r10, r11, lr} C push 9 words, hence the 9*4 below
81 sub n, r3, #1 C n = n_arg - 1
82 ldr d, [sp, #9*4+d_arg] C d
83 str r1, [sp, #9*4+d_arg] C reuse d stack slot for fn
84 add r3, r1, n C r3 = fn + n
85 cmp d, #0 C flags are consumed by the blt below; the adds in between do not set flags
86 add qp, qp_arg, r3, lsl #2 C put qp at Q[] end
87 add up, up_arg, n, lsl #2 C put up at U[] end
88 blt L(normalised) C top bit of d set: no shifting needed
C Unnormalised divisor: compute the shift count and invert the shifted divisor.
C d, cnt, n, qp and up sit in r4-r9, which this function saved itself;
C presumably chosen so that they survive the call - confirm against the ABI.
90 L(unnorm):
91 clz cnt, d C cnt = number of leading zero bits in d
92 mov r0, d, lsl cnt C pass d << cnt
93 bl mpn_invert_limb C result comes back in r0, which is dinv
C mpn_preinv_divrem_1 joins here with dinv and cnt already loaded.
94 L(uent):
95 mov d, d, lsl cnt C d <<= cnt
96 cmp n, #0 C flags are used by both the blt and the beq below
97 mov r1, #0 C r
98 blt L(frac) C n < 0, i.e. n_arg was 0: no integer limbs
100 ldr r11, [up, #0] C top limb of U[]
102 rsb tnc, cnt, #32 C tnc = 32 - cnt
103 mov r1, r11, lsr tnc C r = bits of the top limb that the left shift pushes out
104 mov r11, r11, lsl cnt C remaining bits of the top limb, shifted up
105 beq L(uend) C n == 0 (still from the cmp above): only one limb
107 ldr r3, [up, #-4]! C next lower limb, up is pre-decremented
108 orr r2, r11, r3, lsr tnc C r2 = next limb of the shifted dividend
109 b L(mid) C enter the pipelined loop in the middle
C Main unnormalised loop.  The umlal for one limb is issued at L(mid); the
C matching remainder computation and quotient store happen at the top of the
C next iteration, interleaved with forming the following shifted limb.
111 L(utop):
112 mls r1, d, r8, r11 C r1 = r11 - d * r8, candidate remainder
113 mov r11, r3, lsl cnt C high part of the next shifted limb
114 ldr r3, [up, #-4]! C fetch the limb below it
115 cmp r1, r2 C r2 still holds the low word produced by umlal
116 addhi r1, r1, d C candidate too large: add d back ...
117 subhi r8, r8, #1 C ... and lower the quotient limb by one
118 orr r2, r11, r3, lsr tnc C r2 = next limb of the shifted dividend
119 cmp r1, d
120 bcs L(ufx) C r1 >= d: second adjustment, done out of line
121 L(uok): str r8, [qp], #-4 C store quotient limb, qp moves down
122 L(mid): add r8, r1, #1 C high accumulator word starts at r + 1
123 mov r11, r2 C keep the dividend limb for the mls
124 umlal r2, r8, r1, dinv C r8:r2 += r1 * dinv; r8 is the quotient estimate
125 subs n, n, #1
126 bne L(utop)
C Loop wind-down: finish the step started at L(mid) without loading a new limb.
128 mls r1, d, r8, r11 C r1 = r11 - d * r8
129 mov r11, r3, lsl cnt C last limb shifted up; its low cnt bits are zero
130 cmp r1, r2
131 addhi r1, r1, d
132 subhi r8, r8, #1
133 cmp r1, d
134 rsbcs r1, d, r1 C r1 >= d: r1 -= d ...
135 addcs r8, r8, #1 C ... and raise the quotient limb by one
136 str r8, [qp], #-4
C Final integer step, using the shifted lowest limb held in r11.
138 L(uend):add r8, r1, #1
139 mov r2, r11
140 umlal r2, r8, r1, dinv C r8:r2 += r1 * dinv
141 mls r1, d, r8, r11 C r1 = r11 - d * r8
142 cmp r1, r2
143 addhi r1, r1, d
144 subhi r8, r8, #1
145 cmp r1, d
146 rsbcs r1, d, r1
147 addcs r8, r8, #1
148 str r8, [qp], #-4
C Fraction limbs, shared by both paths: fn further steps with a zero low limb.
C Entered with r in r1 and the shift count in cnt.
149 L(frac):
150 ldr r2, [sp, #9*4+d_arg] C fn
151 cmp r2, #0
152 beq L(fend) C no fraction limbs requested
154 L(ftop):mov r6, #0 C low accumulator word; r6 (up) is free to reuse here
155 add r3, r1, #1 C high accumulator word starts at r + 1
156 umlal r6, r3, r1, dinv C r3:r6 += r1 * dinv
157 mov r8, #0 C the low dividend limb is zero
158 mls r1, d, r3, r8 C r1 = 0 - d * r3
159 cmp r1, r6
160 addhi r1, r1, d
161 subhi r3, r3, #1
162 subs r2, r2, #1 C sets the flags for the bne; the str below leaves them alone
163 str r3, [qp], #-4
164 bne L(ftop)
166 L(fend):mov r11, r1, lsr cnt C shift r back down by cnt (cnt is 0 on the normalised path)
167 L(rtn): mov r0, r11 C return value in r0
168 ldmfd sp!, {r4, r5, r6, r7, r8, r9, r10, r11, pc} C restore registers; loading pc returns
C Normalised divisor: invert d as it is.
170 L(normalised):
171 mov r0, d
172 bl mpn_invert_limb C result comes back in r0, which is dinv
C mpn_preinv_divrem_1 joins here with dinv already loaded.
173 L(nent):
174 cmp n, #0
175 mov r11, #0 C r
176 blt L(nend) C n < 0, i.e. n_arg was 0: no integer limbs
178 ldr r11, [up, #0] C top limb of U[]
179 cmp r11, d
180 movlo r2, #0 C hi q limb
181 movhs r2, #1 C hi q limb
182 subhs r11, r11, d C top limb >= d: subtract d once so that r < d
184 str r2, [qp], #-4
185 cmp n, #0
186 beq L(nend) C that was the only limb
188 L(ntop):ldr r1, [up, #-4]! C next lower limb, up is pre-decremented
189 add r12, r11, #1 C high accumulator word starts at r + 1
190 umlal r1, r12, r11, dinv C r12:r1 += r11 * dinv; this overwrites the limb in r1
191 ldr r3, [up, #0] C so fetch the same limb again for the mls
192 mls r11, d, r12, r3 C r11 = r3 - d * r12, candidate remainder
193 cmp r11, r1
194 addhi r11, r11, d C candidate too large: add d back ...
195 subhi r12, r12, #1 C ... and lower the quotient limb by one
196 cmp d, r11
197 bls L(nfx) C d <= r11: second adjustment, done out of line
198 L(nok): str r12, [qp], #-4 C store quotient limb, qp moves down
199 subs n, n, #1
200 bne L(ntop)
202 L(nend):mov r1, r11 C r
203 mov cnt, #0 C shift cnt
204 b L(frac)
C Out-of-line second adjustments: subtract d from r and add one to the quotient.
206 L(nfx): add r12, r12, #1
207 rsb r11, d, r11 C r11 = r11 - d
208 b L(nok)
209 L(ufx): rsb r1, d, r1 C r1 = r1 - d
210 add r8, r8, #1
211 b L(uok)
212 EPILOGUE()