beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / x86_64 / nano / dive_1.asm
blobe9a07631c4bb13dde020678d69b12062cee7d99c
1 dnl AMD64 mpn_divexact_1 -- mpn by limb exact division.
3 dnl Copyright 2001, 2002, 2004-2006, 2010-2012 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
6 dnl
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of either:
9 dnl
10 dnl * the GNU Lesser General Public License as published by the Free
11 dnl Software Foundation; either version 3 of the License, or (at your
12 dnl option) any later version.
13 dnl
14 dnl or
15 dnl
16 dnl * the GNU General Public License as published by the Free Software
17 dnl Foundation; either version 2 of the License, or (at your option) any
18 dnl later version.
19 dnl
20 dnl or both in parallel, as here.
21 dnl
22 dnl The GNU MP Library is distributed in the hope that it will be useful, but
23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
25 dnl for more details.
26 dnl
27 dnl You should have received copies of the GNU General Public License and the
28 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
29 dnl see https://www.gnu.org/licenses/.
31 include(`../config.m4')
34 C cycles/limb
35 C norm unorm
36 C AMD K8,K9 11 11
37 C AMD K10 11 11
38 C Intel P4 ?
39 C Intel core2 13.5 13.25
40 C Intel corei 14.25
41 C Intel atom 34 36
42 C VIA nano 19.25 19.25
45 C INPUT PARAMETERS
46 C rp rdi
47 C up rsi
48 C n rdx
49 C divisor rcx
ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

C mpn_divexact_1 -- divide the n-limb number at up by a single limb and
C write the n-limb quotient at rp.
C
C Arguments after FUNC_ENTRY(4), as listed under INPUT PARAMETERS:
C   rdi = rp, rsi = up, rdx = n, rcx = divisor
C
C Method: no division instruction is used.  The trailing zero bits of the
C divisor are shifted out of both divisor and dividend, and every quotient
C limb is then obtained by multiplying with inv, the inverse of the odd part
C of the divisor modulo 2^64.  The result is therefore only the true
C quotient when the divisor divides the operand exactly.
C
C Assumptions (not checked here):
C   n >= 1        up[0] is loaded unconditionally and only n = 1 is special
C                 cased.
C   divisor != 0  bsf leaves its destination undefined for a zero source.
C
C Register roles once the setup is done:
C   rcx  shift count cnt = number of trailing zero bits of divisor (0 if odd);
C        temporarily negated in the unorm code to get the 64-cnt shift
C   r11  d   = divisor >> cnt, odd
C   r10  inv = 1/d mod 2^64
C   rsi  up + n limbs
C   rdi  rp + n-1 limbs, so one r8 value addresses up[i] and rp[i-1]
C   r8   negative limb index, counted up until it reaches zero
C   rbx  carry bit handed from one limb to the next (callee-saved, so it is
C        pushed on entry and popped on every exit path)
C   rdx  carry limb, the high half of q*d
C   r9   unorm path: the part of the current limb coming from the previous
C        source limb; at L(com): the high limb of the shifted dividend
C
C Both exit paths need their own ret.  FUNC_EXIT() is defined outside this
C file and is assumed to only undo what FUNC_ENTRY set up, not to return
C (TODO confirm against the macro definitions).  Without the ret the L(com)
C path would fall through into L(one), and L(one) would run off the end of
C the function.

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_divexact_1)
	FUNC_ENTRY(4)
	push	%rbx

	mov	%rcx, %rax		C rax = divisor
	xor	R32(%rcx), R32(%rcx)	C shift count
	mov	%rdx, %r8		C keep n in r8, mul will overwrite rdx

	bt	$0, R32(%rax)
	jc	L(odd)			C skip bsfq unless divisor is even
	bsf	%rax, %rcx		C cnt = index of lowest set bit
	shr	R8(%rcx), %rax		C strip the twos from the divisor
L(odd):	mov	%rax, %rbx
	shr	R32(%rax)
	and	$127, R32(%rax)		C d/2, 7 bits

	LEA(	binvert_limb_table, %rdx)

	movzbl	(%rdx,%rax), R32(%rax)	C inv 8 bits

	mov	%rbx, %r11		C d without twos

C Three Newton steps, each doubling the number of correct low bits of inv.
	lea	(%rax,%rax), R32(%rdx)	C 2*inv
	imul	R32(%rax), R32(%rax)	C inv*inv
	imul	R32(%rbx), R32(%rax)	C inv*inv*d
	sub	R32(%rax), R32(%rdx)	C inv = 2*inv - inv*inv*d, 16 bits

	lea	(%rdx,%rdx), R32(%rax)	C 2*inv
	imul	R32(%rdx), R32(%rdx)	C inv*inv
	imul	R32(%rbx), R32(%rdx)	C inv*inv*d
	sub	R32(%rdx), R32(%rax)	C inv = 2*inv - inv*inv*d, 32 bits

	lea	(%rax,%rax), %r10	C 2*inv
	imul	%rax, %rax		C inv*inv
	imul	%rbx, %rax		C inv*inv*d
	sub	%rax, %r10		C inv = 2*inv - inv*inv*d, 64 bits

	lea	(%rsi,%r8,8), %rsi	C up end
	lea	-8(%rdi,%r8,8), %rdi	C rp end
	neg	%r8			C -n

	mov	(%rsi,%r8,8), %rax	C up[0]

	inc	%r8
	jz	L(one)			C n = 1, single limb

	test	R32(%rcx), R32(%rcx)
	jnz	L(unorm)		C branch if count != 0
	xor	R32(%rbx), R32(%rbx)	C no carry bit yet
	jmp	L(nent)

C Odd divisor: source limbs are used as they are.
	ALIGN(8)
L(ntop):mul	%r11			C carry limb in rdx	0 10
	mov	-8(%rsi,%r8,8), %rax	C next source limb
	sub	%rbx, %rax		C apply carry bit
	setc	%bl			C
	sub	%rdx, %rax		C apply carry limb	5
	adc	$0, %rbx		C			6
L(nent):imul	%r10, %rax		C quotient limb		6
	mov	%rax, (%rdi,%r8,8)	C
	inc	%r8			C
	jnz	L(ntop)

	mov	-8(%rsi), %r9		C up high limb
	jmp	L(com)

C Even divisor: every limb of up >> cnt is assembled from two source limbs.
C Shift counts are taken mod 64, so negating rcx yields the 64-cnt shift.
L(unorm):
	mov	(%rsi,%r8,8), %r9	C up[1]
	shr	R8(%rcx), %rax		C up[0] >> cnt
	neg	R32(%rcx)
	shl	R8(%rcx), %r9		C up[1] << (64-cnt)
	neg	R32(%rcx)
	or	%r9, %rax		C low limb of up >> cnt
	xor	R32(%rbx), R32(%rbx)	C no carry bit yet
	jmp	L(uent)

	ALIGN(8)
L(utop):mul	%r11			C carry limb in rdx	0 10
	mov	(%rsi,%r8,8), %rax	C
	shl	R8(%rcx), %rax		C rcx holds -cnt here
	neg	R32(%rcx)		C back to cnt
	or	%r9, %rax		C merge with previous limb >> cnt
	sub	%rbx, %rax		C apply carry bit
	setc	%bl			C
	sub	%rdx, %rax		C apply carry limb	5
	adc	$0, %rbx		C			6
L(uent):imul	%r10, %rax		C quotient limb		6
	mov	(%rsi,%r8,8), %r9	C
	shr	R8(%rcx), %r9		C low part of the next limb
	neg	R32(%rcx)		C -cnt for the shl at L(utop)
	mov	%rax, (%rdi,%r8,8)	C
	inc	%r8			C
	jnz	L(utop)

C Last limb, shared by both loops.  r9 is the high limb of the (shifted)
C dividend and rax the quotient limb just stored.
L(com):	mul	%r11			C carry limb in rdx
	sub	%rbx, %r9		C apply carry bit
	sub	%rdx, %r9		C apply carry limb
	imul	%r10, %r9
	mov	%r9, (%rdi)
	pop	%rbx
	FUNC_EXIT()
	ret

C n = 1: rdi already points at rp[0].
L(one):	shr	R8(%rcx), %rax
	imul	%r10, %rax
	mov	%rax, (%rdi)
	pop	%rbx
	FUNC_EXIT()
	ret
EPILOGUE()