dnl  AMD64 mpn_divexact_1 -- mpn by limb exact division.
dnl  Copyright 2001, 2002, 2004-2006, 2011, 2012 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
include(`../config.m4')
C A quick adoption of the 32-bit K7 code.
58 PROLOGUE(mpn_divexact_1)
63 xor R32(%rcx), R32(%rcx) C shift count
67 jnc L(evn) C skip bsfq unless divisor is even
69 L(odd): mov %rax, %rbx
71 and $127, R32(%rax) C d/2, 7 bits
73 LEA( binvert_limb_table, %rdx)
75 movzbl (%rdx,%rax), R32(%rax) C inv 8 bits
77 mov %rbx, %r11 C d without twos
79 lea (%rax,%rax), R32(%rdx) C 2*inv
80 imul R32(%rax), R32(%rax) C inv*inv
81 imul R32(%rbx), R32(%rax) C inv*inv*d
82 sub R32(%rax), R32(%rdx) C inv = 2*inv - inv*inv*d, 16 bits
84 lea (%rdx,%rdx), R32(%rax) C 2*inv
85 imul R32(%rdx), R32(%rdx) C inv*inv
86 imul R32(%rbx), R32(%rdx) C inv*inv*d
87 sub R32(%rdx), R32(%rax) C inv = 2*inv - inv*inv*d, 32 bits
89 lea (%rax,%rax), %r10 C 2*inv
90 imul %rax, %rax C inv*inv
91 imul %rbx, %rax C inv*inv*d
92 sub %rax, %r10 C inv = 2*inv - inv*inv*d, 64 bits
94 lea (%rsi,%r8,8), %rsi C up end
95 lea -8(%rdi,%r8,8), %rdi C rp end
98 mov (%rsi,%r8,8), %rax C up[0]
103 mov (%rsi,%r8,8), %rdx C up[1]
105 shrd R8(%rcx), %rdx, %rax
107 xor R32(%rbx), R32(%rbx)
110 L(evn): bsf %rax, %rcx
117 C rbx carry bit, 0 or 1
122 C r8 counter, limbs, negative
123 C r10 d^(-1) mod 2^64
124 C r11 d, shifted down
126 mul %r11 C carry limb in rdx 0 10
127 mov -8(%rsi,%r8,8), %rax C
128 mov (%rsi,%r8,8), %r9 C
129 shrd R8(%rcx), %r9, %rax C
131 sub %rbx, %rax C apply carry bit
133 sub %rdx, %rax C apply carry limb 5
135 L(ent): imul %r10, %rax C 6
136 mov %rax, (%rdi,%r8,8) C
140 mul %r11 C carry limb in rdx
141 mov -8(%rsi), %rax C up high limb
143 sub %rbx, %rax C apply carry bit
144 sub %rdx, %rax C apply carry limb
151 L(one): shr R8(%rcx), %rax