beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / x86_64 / div_qr_1n_pi1.asm
blobcb072e979dc08b9960032cdd7da0daf9ad31ee1f
1 dnl x86-64 mpn_div_qr_1n_pi1
2 dnl -- Divide an mpn number by a normalized single-limb number,
3 dnl using a single-limb inverse.
5 dnl Contributed to the GNU project by Niels Möller
7 dnl Copyright 2013 Free Software Foundation, Inc.
9 dnl This file is part of the GNU MP Library.
10 dnl
11 dnl The GNU MP Library is free software; you can redistribute it and/or modify
12 dnl it under the terms of either:
13 dnl
14 dnl * the GNU Lesser General Public License as published by the Free
15 dnl Software Foundation; either version 3 of the License, or (at your
16 dnl option) any later version.
17 dnl
18 dnl or
19 dnl
20 dnl * the GNU General Public License as published by the Free Software
21 dnl Foundation; either version 2 of the License, or (at your option) any
22 dnl later version.
23 dnl
24 dnl or both in parallel, as here.
25 dnl
26 dnl The GNU MP Library is distributed in the hope that it will be useful, but
27 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
28 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
29 dnl for more details.
30 dnl
31 dnl You should have received copies of the GNU General Public License and the
32 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
33 dnl see https://www.gnu.org/licenses/.
35 include(`../config.m4')
38 C c/l
39 C AMD K8,K9 13
40 C AMD K10 13
41 C AMD bull 16.5
42 C AMD pile 15
43 C AMD steam ?
44 C AMD bobcat 16
45 C AMD jaguar ?
46 C Intel P4 47 poor
47 C Intel core 19.25
48 C Intel NHM 18
49 C Intel SBR 15 poor
50 C Intel IBR 13
51 C Intel HWL 11.7
52 C Intel BWL ?
53 C Intel atom 52 very poor
54 C VIA nano 19
C Register assignment.
C
C Arguments, in the registers the code reads them from:
C   QP        quotient limbs are stored through this pointer
C   UP        dividend limbs are loaded through this pointer
C   UN_INPUT  number of limbs at UP
C   U1        limb above the ones at UP, a second copy lives in %rax
C   D         divisor limb
C   DINV      single-limb inverse used in place of a hardware divide
define(`QP',		`%rdi')
define(`UP',		`%rsi')
define(`UN_INPUT',	`%rdx')
define(`U1',		`%rcx')
define(`D',		`%r8')
define(`DINV',		`%r9')

C Loop invariants, set up once on the multi-limb path.
define(`B2',		`%rbp')
define(`B2md',		`%rbx')

C Working registers.  UN reuses %r8, so the D argument has to be saved
C before UN is first written.
define(`UN',		`%r8')
define(`T',		`%r10')
define(`U0',		`%r11')
define(`U2',		`%r12')
define(`Q0',		`%r13')
define(`Q1',		`%r14')
define(`Q2',		`%r15')
C mpn_div_qr_1n_pi1 (QP, UP, UN_INPUT, U1, D, DINV)
C
C Divides the number made of U1 (most significant) followed by the
C UN_INPUT limbs at UP by the single limb D, using the precomputed
C single-limb inverse DINV instead of a hardware divide.  One quotient
C limb is stored at QP for every limb read from UP, and the remainder is
C left in %rax when the function returns.
C
C NOTE(review): D is expected to be normalized (see the file header) and
C U1 is presumably already reduced below D; confirm against the callers.
C
C The single-limb case uses scratch registers only.  The multi-limb case
C saves and restores %rbx, %rbp and %r12-%r15, and keeps D on the stack
C while %r8 is reused as the limb index UN.
ABI_SUPPORT(STD64)

	ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_div_qr_1n_pi1)
	FUNC_ENTRY(6)
C NOTE(review): the two loads below presumably fetch D and DINV from the
C stack on the DOS64 ABI, where they are not passed in registers.
IFDOS(`	mov	56(%rsp), %r8	')
IFDOS(`	mov	64(%rsp), %r9	')
	dec	UN_INPUT
	jnz	L(first)

	C Just a single 2/1 division.
	C T, U0 are allocated in scratch registers
	lea	1(U1), T		C T = U1 + 1
	mov	U1, %rax
	mul	DINV			C %rdx:%rax = U1 * DINV
	mov	(UP), U0
	add	U0, %rax
	adc	T, %rdx			C add {U1 + 1, U0}, %rdx is the candidate quotient
	mov	%rdx, T
	imul	D, %rdx
	sub	%rdx, U0		C U0 = candidate remainder, modulo the limb size
	cmp	U0, %rax
	lea	(U0, D), %rax
	cmovnc	U0, %rax		C keep U0 unless it exceeds the low product limb,
	sbb	$0, T			C otherwise use U0 + D and take one off the quotient
	cmp	D, %rax
	jc	L(single_div_done)
	sub	D, %rax			C remainder still >= D: one more correction step
	add	$1, T
L(single_div_done):
	mov	T, (QP)
	FUNC_EXIT()
	C Return here, otherwise execution runs on into L(first).
	C NOTE(review): FUNC_EXIT is assumed to expand to ABI exit glue only,
	C without a return instruction of its own.
	ret

L(first):
	C FIXME: Could delay some of these until we enter the loop.
	push	%r15
	push	%r14
	push	%r13
	push	%r12
	push	%rbx
	push	%rbp

	C B2 = -(D * DINV) modulo the limb size, B2md = B2 - D
	mov	D, B2
	imul	DINV, B2
	neg	B2
	mov	B2, B2md
	sub	D, B2md

	C D not needed until final reduction
	push	D
	mov	UN_INPUT, UN	C Clobbers D

	C First step, outside the loop: T = high (DINV * U1) + U1 becomes the
	C top quotient limb, Q0 keeps the low product limb for later.
	mov	DINV, %rax
	mul	U1
	mov	%rax, Q0
	add	U1, %rdx
	mov	%rdx, T

	C Fold U1 * B2 into the next two dividend limbs.
	mov	B2, %rax
	mul	U1
	mov	-8(UP, UN, 8), U0
	mov	(UP, UN, 8), U1
	mov	T, (QP, UN, 8)
	add	%rax, U0
	adc	%rdx, U1
	sbb	U2, U2			C U2 = all ones if the addition carried, else zero
	dec	UN
	mov	U1, %rax		C mov leaves the flags from dec intact for the jz
	jz	L(final)

	ALIGN(16)

	C Loop is 28 instructions, 30 decoder slots, should run in 10 cycles.
	C At entry, %rax holds an extra copy of U1
L(loop):
	C {Q2, Q1, Q0} <-- DINV * U1 + B (Q0 + U2 DINV) + B^2 U2
	C Remains to add in B (U1 + c)
	mov	DINV, Q1
	mov	U2, Q2
	and	U2, Q1			C Q1 = DINV when the U2 mask is set, else 0
	neg	Q2			C Q2 = 1 when the U2 mask is set, else 0
	mul	DINV
	add	%rdx, Q1
	adc	$0, Q2
	add	Q0, Q1
	mov	%rax, Q0
	mov	B2, %rax
	lea	(B2md, U0), T		C T = U0 + B2 - D, used if U0 + B2 overflows
	adc	$0, Q2

	C {U2, U1, U0} <-- (U0 + U2 B2 -c U) B + U1 B2 + u
	mul	U1
	and	B2, U2
	add	U2, U0
	cmovnc	U0, T

	C {QP+UN, ...} <-- {QP+UN, ...} + {Q2, Q1} + U1 + c
	adc	U1, Q1
	mov	-8(UP, UN, 8), U0
	adc	Q2, 8(QP, UN, 8)
	jc	L(q_incr)		C rare: carry has to ripple into higher quotient limbs
L(q_incr_done):
	add	%rax, U0
	mov	T, %rax
	adc	%rdx, %rax
	mov	Q1, (QP, UN, 8)
	sbb	U2, U2			C U2 = all ones if the addition carried, else zero
	dec	UN
	mov	%rax, U1
	jnz	L(loop)

L(final):
	pop	D			C get back the divisor saved before the loop

	mov	U2, Q1
	and	D, U2
	sub	U2, %rax		C subtract D from the high limb if the U2 mask is set
	neg	Q1			C Q1 = 1 if the U2 mask was set, else 0

	mov	%rax, U1
	sub	D, %rax
	cmovc	U1, %rax		C undo the subtraction when it borrowed
	sbb	$-1, Q1			C Q1 += 1 when it did not borrow

	C Final 2/1 division, same sequence as the single-limb case.
	lea	1(%rax), T
	mul	DINV
	add	U0, %rax
	adc	T, %rdx
	mov	%rdx, T
	imul	D, %rdx
	sub	%rdx, U0
	cmp	U0, %rax
	lea	(U0, D), %rax
	cmovnc	U0, %rax
	sbb	$0, T
	cmp	D, %rax
	jc	L(div_done)
	sub	D, %rax
	add	$1, T
L(div_done):
	C Add the last quotient limb to the pending Q0 and push any carry upward.
	add	T, Q0
	mov	Q0, (QP)
	adc	Q1, 8(QP)
	jnc	L(done)
L(final_q_incr):
	addq	$1, 16(QP)
	lea	8(QP), QP		C lea keeps the carry flag from addq for the jc
	jc	L(final_q_incr)

L(done):
	pop	%rbp
	pop	%rbx
	pop	%r12
	pop	%r13
	pop	%r14
	pop	%r15
	FUNC_EXIT()
	C Return here, otherwise execution runs on into L(q_incr).
	ret

L(q_incr):
	C U1 is not live, so use it for indexing
	lea	16(QP, UN, 8), U1
L(q_incr_loop):
	addq	$1, (U1)
	jnc	L(q_incr_done)
	lea	8(U1), U1
	jmp	L(q_incr_loop)
EPILOGUE()