dnl  AMD64 mpn_mod_1_1p

dnl  Contributed to the GNU project by Torbjörn Granlund and Niels Möller.

dnl  Copyright 2009-2012, 2014 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C              cycles/limb
C AMD K8,K9       6
C AMD K10         6
C Intel P4       26
C Intel core2    12.5
C Intel NHM      11.3
C Intel SBR       8.4  (slowdown, old code took 8.0)
C Intel atom     26
C VIA nano       13

define(`B2mb',   `%r10')
define(`B2modb', `%r11')
define(`ap',     `%rdi')
define(`n',      `%rsi')
define(`pre',    `%r8')
define(`b',      `%rbx')

define(`r0',     `%rbp')	C r1 kept in %rax
define(`r2',     `%rcx')	C kept negated. Also used as shift count
define(`t0',     `%r9')

C mp_limb_t
C mpn_mod_1_1p (mp_srcptr ap, mp_size_t n, mp_limb_t b, mp_limb_t bmodb[4])
C                         %rdi          %rsi         %rdx           %rcx
C The pre array contains bi, cnt, B1modb, B2modb
C Note: This implementation needs B1modb only when cnt > 0
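C
C Roughly (an informal note, not part of the original interface comment):
C bi is the mpn_invert_limb reciprocal of the normalized divisor b << cnt,
C cnt is the number of leading zero bits of b, and B1modb and B2modb are
C values congruent to B and B^2 modulo b, with B = 2^64.  A typical call
C sequence, roughly as in GMP's mpn/mod_1.c, would be
C
C	mp_limb_t pre[4], r;
C	mpn_mod_1_1p_cps (pre, b);                   /* fill the table          */
C	r = mpn_mod_1_1p (ap, n, b << pre[1], pre);  /* divisor passed shifted  */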

C The iteration is almost as follows,
C
C	r_2 B^3 + r_1 B^2 + r_0 B + u = r_1 B2modb + (r_0 + r_2 B2modb) B + u
C
C where r2 is a single bit represented as a mask.  But to make sure that the
C result fits in two limbs and a bit, carry from the addition
C
C	r_0 + r_2 B2modb
C
C is handled specially.  On carry, we subtract b to cancel the carry,
C and we use instead the value
C
C	r_0 + B2mb (mod B)
C
C This addition can be issued early since it doesn't depend on r2, and it is
C the source of the cmov in the loop.
C
C We have the invariant that r_2 B^2 + r_1 B + r_0 < B^2 + B b
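C
C A rough C rendering of one iteration of the main loop below (sketch only,
C not part of the build; assumes 64-bit limbs and umul_ppmm from longlong.h;
C r1 is the value kept in %rax, u is the next lower input limb, r2 is carried
C between iterations as a 0/~0 mask, and s, p1, p0, c0, c1 are illustrative
C locals):
C
C	t0 = r0 + B2mb;			/* issued early: r0 + B2modb - b  */
C	s  = r0 + (r2 & B2modb);	/* r0 + r2*B2modb                 */
C	if (s < r0)			/* on carry, switch to the -b form */
C	  s = t0;
C	umul_ppmm (p1, p0, r1, B2modb);	/* r1 * (B^2 mod b)               */
C	r0 = u + p0;			/* new r_0                        */
C	c0 = r0 < p0;
C	r1 = s + p1 + c0;		/* new r_1; recover its carry out */
C	c1 = (r1 < s) | (c0 & (r1 == s));
C	r2 = -c1;			/* new r_2, again as a mask       */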

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_mod_1_1p)
	FUNC_ENTRY(4)
	push	%rbp
	push	%rbx
	mov	%rdx, b
	mov	%rcx, pre

	mov	-8(ap, n, 8), %rax
	cmp	$3, n
	jnc	L(first)
	mov	-16(ap, n, 8), r0
	jmp	L(reduce_two)

L(first):
	C First iteration, no r2
	mov	24(pre), B2modb
	mul	B2modb
	mov	-24(ap, n, 8), r0
	add	%rax, r0
	mov	-16(ap, n, 8), %rax
	adc	%rdx, %rax
	sbb	r2, r2
	sub	$4, n
	jc	L(reduce_three)

	mov	B2modb, B2mb
	sub	b, B2mb

	ALIGN(16)
L(top):	and	B2modb, r2
	lea	(B2mb, r0), t0
	mul	B2modb
	add	r0, r2
	mov	(ap, n, 8), r0
	cmovc	t0, r2
	add	%rax, r0
	mov	r2, %rax
	adc	%rdx, %rax
	sbb	r2, r2
	sub	$1, n
	jnc	L(top)

L(reduce_three):
	C Eliminate r2
	and	b, r2
	sub	r2, %rax

L(reduce_two):
	mov	8(pre), R32(%rcx)
	test	R32(%rcx), R32(%rcx)
	jz	L(normalized)

	C Unnormalized, use B1modb to reduce to size < B (b+1)
	mulq	16(pre)
	xor	t0, t0
	add	%rax, r0
	adc	%rdx, t0
	mov	t0, %rax

	C Left-shift to normalize
ifdef(`SHLD_SLOW',`
	shl	R8(%rcx), %rax
	mov	r0, t0
	neg	R32(%rcx)
	shr	R8(%rcx), t0
	or	t0, %rax
	neg	R32(%rcx)
',`
	shld	R8(%rcx), r0, %rax
')
	shl	R8(%rcx), r0
	jmp	L(udiv)

L(normalized):
	mov	%rax, t0
	sub	b, t0
	cmovnc	t0, %rax

L(udiv):
	lea	1(%rax), t0
	mulq	(pre)
	add	r0, %rax
	adc	t0, %rdx
	imul	b, %rdx
	sub	%rdx, r0
	cmp	r0, %rax
	lea	(b, r0), %rax
	cmovnc	r0, %rax
	cmp	b, %rax
	jnc	L(fix)
L(ok):	shr	R8(%rcx), %rax
	pop	%rbx
	pop	%rbp
	FUNC_EXIT()
	ret
L(fix):	sub	b, %rax
	jmp	L(ok)
EPILOGUE()
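
C A rough C model of what mpn_mod_1_1p_cps below stores (sketch only, not
C part of the build; assumes 64-bit limbs and count_leading_zeros /
C invert_limb with their usual longlong.h / gmp-impl.h meanings; the
C function name is illustrative):
C
C	void
C	mod_1_1p_cps_sketch (mp_limb_t cps[4], mp_limb_t b)
C	{
C	  int cnt;
C	  mp_limb_t bi, bnorm;
C
C	  count_leading_zeros (cnt, b);
C	  bnorm = b << cnt;
C	  invert_limb (bi, bnorm);	/* reciprocal of normalized divisor */
C	  cps[0] = bi;
C	  cps[1] = cnt;
C	  cps[3] = -bnorm * bi;		/* B2modb, congruent to B^2 mod b   */
C	  if (cnt != 0)			/* B1modb needed only when cnt > 0  */
C	    cps[2] = (-bnorm * ((bi >> (64 - cnt)) | ((mp_limb_t) 1 << cnt)))
C		     >> cnt;
C	}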

	ALIGN(16)
PROLOGUE(mpn_mod_1_1p_cps)
	FUNC_ENTRY(2)
	push	%rbp
	bsr	%rsi, %rcx
	push	%rbx
	mov	%rdi, %rbx
	push	%r12
	xor	$63, R32(%rcx)
	mov	%rsi, %r12
	mov	R32(%rcx), R32(%rbp)
	sal	R8(%rcx), %r12
IFSTD(`	mov	%r12, %rdi	')	C pass parameter
IFDOS(`	mov	%r12, %rcx	')	C pass parameter
	ASSERT(nz, `test $15, %rsp')
	CALL(	mpn_invert_limb)
	neg	%r12
	mov	%r12, %r8
	mov	%rax, (%rbx)		C store bi
	mov	%rbp, 8(%rbx)		C store cnt
	imul	%rax, %r12
	mov	%r12, 24(%rbx)		C store B2modb
	mov	R32(%rbp), R32(%rcx)
	test	R32(%rcx), R32(%rcx)
	jz	L(z)

	mov	$1, R32(%rdx)
ifdef(`SHLD_SLOW',`
	C Destroys %rax, unlike shld. Otherwise, we could do B1modb
	C before B2modb, and get rid of the move %r12, %r8 above.

	shl	R8(%rcx), %rdx
	neg	R32(%rcx)
	shr	R8(%rcx), %rax
	or	%rax, %rdx
	neg	R32(%rcx)
',`
	shld	R8(%rcx), %rax, %rdx
')
	imul	%rdx, %r8
	shr	R8(%rcx), %r8
	mov	%r8, 16(%rbx)		C store B1modb
L(z):
	pop	%r12
	pop	%rbx
	pop	%rbp
	FUNC_EXIT()
	ret
EPILOGUE()
ASM_END()