beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / x86_64 / k8 / div_qr_1n_pi1.asm
blob861402b222916aae8a648b4ebcab33983da3a001
1 dnl x86-64 mpn_div_qr_1n_pi1
2 dnl -- Divide an mpn number by a normalized single-limb number,
3 dnl using a single-limb inverse.
5 dnl Contributed to the GNU project by Niels Möller
7 dnl Copyright 2013 Free Software Foundation, Inc.
9 dnl This file is part of the GNU MP Library.
10 dnl
11 dnl The GNU MP Library is free software; you can redistribute it and/or modify
12 dnl it under the terms of either:
13 dnl
14 dnl * the GNU Lesser General Public License as published by the Free
15 dnl Software Foundation; either version 3 of the License, or (at your
16 dnl option) any later version.
17 dnl
18 dnl or
19 dnl
20 dnl * the GNU General Public License as published by the Free Software
21 dnl Foundation; either version 2 of the License, or (at your option) any
22 dnl later version.
23 dnl
24 dnl or both in parallel, as here.
25 dnl
26 dnl The GNU MP Library is distributed in the hope that it will be useful, but
27 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
28 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
29 dnl for more details.
30 dnl
31 dnl You should have received copies of the GNU General Public License and the
32 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
33 dnl see https://www.gnu.org/licenses/.
35 include(`../config.m4')
38 C c/l
39 C AMD K8,K9 11
40 C AMD K10 11
41 C AMD bull 16
42 C AMD pile 14.25
43 C AMD steam ?
44 C AMD bobcat 16
45 C AMD jaguar ?
46 C Intel P4 47.5 poor
47 C Intel core 28.5 very poor
48 C Intel NHM 29 very poor
49 C Intel SBR 16 poor
50 C Intel IBR 13.5
51 C Intel HWL 12
52 C Intel BWL ?
53 C Intel atom 53 very poor
54 C VIA nano 19
57 C INPUT Parameters
58 define(`QP', `%rdi') C Quotient limbs are stored through this pointer
59 define(`UP', `%rsi') C Dividend limbs below U1 are read through this pointer
60 define(`UN_INPUT', `%rdx') C Limb count at UP; decremented on entry, then copied to UN
61 define(`U1', `%rcx') C Top dividend limb on entry. Also in %rax
62 define(`D', `%r8') C Divisor limb
63 define(`DINV', `%r9') C Single-limb inverse, used as a mul operand
65 C Invariants
66 define(`B2', `%rbp') C Set once in L(first) to -(D * DINV) mod B
67 define(`B2md', `%rbx') C Set once in L(first) to B2 - D
69 C Variables
70 define(`UN', `%r8') C Loop index counting down. Overlaps D input, which is kept on the stack meanwhile
71 define(`T', `%r10') C Scratch
72 define(`U0', `%r11') C Low limb of the running remainder
73 define(`U2', `%r12') C Carry mask: 0 or all ones, produced by sbb U2, U2
74 define(`Q0', `%r13') C Low quotient accumulator limb
75 define(`Q1', `%r14') C Middle quotient accumulator limb; cleared with mov to keep flags
76 define(`Q2', `%r15') C High quotient accumulator limb
78 ABI_SUPPORT(STD64) C NOTE(review): only STD64 is declared although IFDOS loads appear below; confirm whether a DOS64 declaration is also intended
80 ASM_START()
81 TEXT
82 ALIGN(16)
83 PROLOGUE(mpn_div_qr_1n_pi1) C Divides the number with top limb U1 and UN_INPUT lower limbs at UP by the limb D using the inverse DINV; stores quotient limbs at QP and leaves the remainder in %rax
84 FUNC_ENTRY(6) C Arguments in order: QP, UP, UN_INPUT, U1, D, DINV
85 IFDOS(` mov 56(%rsp), %r8 ') C IFDOS: D (%r8) is loaded from the stack
86 IFDOS(` mov 64(%rsp), %r9 ') C IFDOS: DINV (%r9) is loaded from the stack
87 dec UN_INPUT C UN_INPUT <-- n - 1; zero means only UP[0] follows U1
88 jnz L(first)
90 C Just a single 2/1 division: {U1, UP[0]} by D, quotient limb to QP[0].
91 C T, U0 are allocated in scratch registers
92 lea 1(U1), T C T = U1 + 1
93 mov U1, %rax
94 mul DINV C {%rdx, %rax} = U1 * DINV
95 mov (UP), U0
96 add U0, %rax
97 adc T, %rdx C {%rdx, %rax} += {T, U0}; %rdx is the candidate quotient
98 mov %rdx, T
99 imul D, %rdx
100 sub %rdx, U0 C U0 = candidate remainder, U0 - quotient * D (mod B)
101 cmp U0, %rax C Carry set when the candidate remainder is above %rax
102 lea (U0, D), %rax C %rax = candidate remainder + D; lea keeps the carry
103 cmovnc U0, %rax C No carry: use the candidate remainder unchanged
104 sbb $0, T C Carry: take one off the candidate quotient
105 cmp D, %rax
106 jc L(single_div_done) C Remainder is already below D
107 sub D, %rax C Otherwise remainder -= D and quotient += 1
108 add $1, T
109 L(single_div_done):
110 mov T, (QP) C QP[0] = quotient limb; remainder stays in %rax
111 FUNC_EXIT
112 ret C Explicit return so the single-limb path cannot run on into L(first)
113 L(first):
114 C FIXME: Could delay some of these until we enter the loop.
115 push %r15 C Save the registers backing Q2, Q1, Q0, U2, B2md, B2
116 push %r14
117 push %r13
118 push %r12
119 push %rbx
120 push %rbp
122 mov D, B2
123 imul DINV, B2
124 neg B2 C B2 = -(D * DINV) mod B
125 mov B2, B2md
126 sub D, B2md C B2md = B2 - D
128 C D not needed until final reduction
129 push D
130 mov UN_INPUT, UN C Clobbers D
132 mov DINV, %rax
133 mul U1
134 mov %rax, Q0 C Q0 = low limb of DINV * U1
135 add U1, %rdx
136 mov %rdx, T C T = high limb of DINV * U1, plus U1
138 mov B2, %rax
139 mul U1 C {%rdx, %rax} = B2 * U1
140 mov -8(UP, UN, 8), U0 C U0 = UP[UN-1]
141 mov (UP, UN, 8), U1 C U1 = UP[UN]
142 mov T, (QP, UN, 8) C QP[UN] = T
143 add %rax, U0
144 adc %rdx, U1 C {U1, U0} += B2 * previous U1
145 sbb U2, U2 C U2 = 0 or all ones, mirroring the carry out
146 dec UN C dec leaves the carry flag untouched
147 mov U1, %rax
148 jz L(final) C Zero flag still comes from dec UN
149 mov $0, R32(Q1) C mov rather than xor, so the carry survives into the loop
151 ALIGN(16)
153 C Loop is 28 instructions, 30 K8/K10 decoder slots, should run
154 C in 10 cycles. At entry, %rax holds an extra copy of U1, Q1
155 C is zero, and carry holds an extra copy of U2.
156 L(loop):
157 C {Q2, Q1, Q0} <-- DINV * U1 + B (Q0 + U2 DINV) + B^2 U2
158 C Remains to add in B (U1 + c)
159 cmovc DINV, Q1 C Q1 = DINV when the carry (U2) is set; Q1 was zero
160 mov U2, Q2
161 neg Q2 C Q2 = 1 when U2 is all ones, else 0
162 mul DINV C {%rdx, %rax} = U1 * DINV, using the copy of U1 in %rax
163 add %rdx, Q1
164 adc $0, Q2
165 add Q0, Q1
166 mov %rax, Q0 C New Q0 = low limb of the product; mov keeps flags
167 mov B2, %rax
168 lea (B2md, U0), T C T = U0 + B2 - D, computed without touching flags
169 adc $0, Q2 C Picks up the carry from add Q0, Q1
171 C {U2, U1, U0} <-- (U0 + U2 B2 -c U) B + U1 B2 + u
172 mul U1 C {%rdx, %rax} = B2 * U1
173 and B2, U2 C U2 = B2 when the mask was set, else 0
174 add U2, U0
175 cmovnc U0, T C No carry: T = updated U0; carry: T stays U0 + B2 - D
177 C {QP+UN, ...} <-- {QP+UN, ...} + {Q2, Q1} + U1 + c
178 adc U1, Q1 C Q1 += U1 + c
179 mov -8(UP, UN, 8), U0 C Fetch the next lower limb, UP[UN-1]
180 adc Q2, 8(QP, UN, 8) C QP[UN+1] += Q2 + carry
181 jc L(q_incr) C Carry out goes into the higher quotient limbs
182 L(q_incr_done):
183 add %rax, U0
184 mov T, %rax
185 adc %rdx, %rax C {%rax, U0} = {T, fetched limb} + B2 * U1
186 mov Q1, (QP, UN, 8) C QP[UN] = Q1
187 mov $0, R32(Q1) C Flag-preserving clear of Q1 for the next pass
188 sbb U2, U2 C U2 = 0 or all ones from the carry
189 dec UN C Carry is kept for the cmovc at the loop head
190 mov %rax, U1
191 jnz L(loop)
193 L(final):
194 pop D C Restore the divisor saved before the loop
196 mov U2, Q1
197 and D, U2
198 sub U2, %rax C %rax -= D when the U2 mask is set
199 neg Q1 C Q1 = 1 when the mask was set, else 0
201 mov %rax, U1
202 sub D, %rax
203 cmovc U1, %rax C Undo the subtraction when it borrowed
204 sbb $-1, Q1 C Q1 += 1 - borrow
206 lea 1(%rax), T C From here: same 2/1 division steps as the single-limb path, on {%rax, U0}
207 mul DINV
208 add U0, %rax
209 adc T, %rdx C %rdx is the candidate quotient
210 mov %rdx, T
211 imul D, %rdx
212 sub %rdx, U0 C U0 = candidate remainder
213 cmp U0, %rax
214 lea (U0, D), %rax C lea keeps the carry from cmp
215 cmovnc U0, %rax
216 sbb $0, T
217 cmp D, %rax
218 jc L(div_done) C Remainder is already below D
219 sub D, %rax
220 add $1, T
221 L(div_done):
222 add T, Q0 C Fold the last quotient limb into {Q1, Q0}
223 mov Q0, (QP) C QP[0] = Q0; mov keeps the carry from add
224 adc Q1, 8(QP) C QP[1] += Q1 + carry
225 jnc L(done)
226 L(final_q_incr):
227 addq $1, 16(QP) C Carry ripples upward starting at QP[2]
228 lea 8(QP), QP C Advance with lea so the carry from addq is kept
229 jc L(final_q_incr)
231 L(done):
232 pop %rbp C Restore the saved registers in reverse push order
233 pop %rbx
234 pop %r12
235 pop %r13
236 pop %r14
237 pop %r15
238 FUNC_EXIT
239 ret C Explicit return with the remainder in %rax, so execution cannot run on into L(q_incr)
241 L(q_incr):
242 C U1 is not live, so use it for indexing
243 lea 16(QP, UN, 8), U1 C U1 = address of QP[UN+2]
244 L(q_incr_loop):
245 addq $1, (U1)
246 jnc L(q_incr_done) C Stop once an increment does not carry
247 lea 8(U1), U1
248 jmp L(q_incr_loop)
249 EPILOGUE()