dnl  AMD64 mpn_divexact_1 -- mpn by limb exact division.

dnl  Copyright 2001, 2002, 2004-2006, 2011, 2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
include(`../config.m4')


C            cycles/limb
C AMD K8,K9      10
C AMD K10        10
C Intel P4       33
C Intel core2    13.25
C Intel corei    14
C Intel atom     42
C VIA nano       43

C A quick adaptation of the 32-bit K7 code.
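C
C The method is exact division: each quotient limb is obtained by multiplying
C with the inverse of the (odd part of the) divisor modulo 2^64, propagating a
C carry limb and a borrow bit from limb to limb.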


C INPUT PARAMETERS
C rp            rdi
C up            rsi
C n             rdx
C divisor       rcx
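C
C In C terms this computes {rp,n} such that {rp,n} * divisor = {up,n}, i.e. a
C call of roughly the form mpn_divexact_1 (rp, up, n, divisor), where divisor
C is assumed to divide the n-limb operand {up,n} exactly.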

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
        TEXT
        ALIGN(16)
PROLOGUE(mpn_divexact_1)
        FUNC_ENTRY(4)
        push    %rbx

        mov     %rcx, %rax
        xor     R32(%rcx), R32(%rcx)    C shift count
        mov     %rdx, %r8

        bt      $0, R32(%rax)
        jnc     L(evn)                  C skip bsfq unless divisor is even

L(odd): mov     %rax, %rbx
        shr     R32(%rax)
        and     $127, R32(%rax)         C d/2, 7 bits

        LEA(    binvert_limb_table, %rdx)

        movzbl  (%rdx,%rax), R32(%rax)  C inv 8 bits

        mov     %rbx, %r11              C d without twos

        lea     (%rax,%rax), R32(%rdx)  C 2*inv
        imul    R32(%rax), R32(%rax)    C inv*inv
        imul    R32(%rbx), R32(%rax)    C inv*inv*d
        sub     R32(%rax), R32(%rdx)    C inv = 2*inv - inv*inv*d, 16 bits

        lea     (%rdx,%rdx), R32(%rax)  C 2*inv
        imul    R32(%rdx), R32(%rdx)    C inv*inv
        imul    R32(%rbx), R32(%rdx)    C inv*inv*d
        sub     R32(%rdx), R32(%rax)    C inv = 2*inv - inv*inv*d, 32 bits

        lea     (%rax,%rax), %r10       C 2*inv
        imul    %rax, %rax              C inv*inv
        imul    %rbx, %rax              C inv*inv*d
        sub     %rax, %r10              C inv = 2*inv - inv*inv*d, 64 bits
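
C Each inv = 2*inv - inv*inv*d step above is a Newton (Hensel) iteration for
C the inverse of the odd divisor modulo a power of 2: it doubles the number of
C correct low bits, lifting the 8-bit table value to 16, 32 and finally 64
C bits, so that afterwards d * r10 = 1 (mod 2^64).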

        lea     (%rsi,%r8,8), %rsi      C up end
        lea     -8(%rdi,%r8,8), %rdi    C rp end
        neg     %r8                     C -n
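
C up is now addressed relative to its end and rp relative to its last limb,
C with r8 as a negative index that the loop increments towards zero, so no
C separate counter register is needed.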

        mov     (%rsi,%r8,8), %rax      C up[0]

        inc     %r8
        jz      L(one)

        mov     (%rsi,%r8,8), %rdx      C up[1]

        shrd    R8(%rcx), %rdx, %rax

        xor     R32(%rbx), R32(%rbx)
        jmp     L(ent)

L(evn): bsf     %rax, %rcx
        shr     R8(%rcx), %rax
        jmp     L(odd)
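
C For an even divisor, bsf leaves the count of trailing zero bits in rcx; the
C divisor is shifted down to its odd part, and the dividend limbs are shifted
C right by the same amount on the fly (the shrd above and in the loop, shr for
C the top limb).  Nothing is lost, since those low bits of an exactly
C divisible dividend are zero.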

        ALIGN(8)
L(top):
        C rax   q
        C rbx   carry bit, 0 or 1
        C rcx   shift
        C rdx
        C rsi   up end
        C rdi   rp end
        C r8    counter, limbs, negative
        C r10   d^(-1) mod 2^64
        C r11   d, shifted down
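        C
        C Each iteration produces one quotient limb: the mul below forms q*d
        C for the previous quotient limb, whose high half (rdx) is the carry
        C limb; the next (shift-adjusted) dividend limb has the carry bit and
        C carry limb subtracted, and the difference times the inverse gives
        C the next quotient limb.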

        mul     %r11                    C carry limb in rdx       0 10
        mov     -8(%rsi,%r8,8), %rax    C
        mov     (%rsi,%r8,8), %r9       C
        shrd    R8(%rcx), %r9, %rax     C
        nop                             C
        sub     %rbx, %rax              C apply carry bit
        setc    %bl                     C
        sub     %rdx, %rax              C apply carry limb        5
        adc     $0, %rbx                C                         6
L(ent): imul    %r10, %rax              C                         6
        mov     %rax, (%rdi,%r8,8)      C
        inc     %r8                     C
        jnz     L(top)

        mul     %r11                    C carry limb in rdx
        mov     -8(%rsi), %rax          C up high limb
        shr     R8(%rcx), %rax
        sub     %rbx, %rax              C apply carry bit
        sub     %rdx, %rax              C apply carry limb
        imul    %r10, %rax
        mov     %rax, (%rdi)
        pop     %rbx
        FUNC_EXIT()
        ret

L(one): shr     R8(%rcx), %rax
        imul    %r10, %rax
        mov     %rax, (%rdi)
        pop     %rbx
        FUNC_EXIT()
        ret

EPILOGUE()