dnl  AMD64 mpn_divexact_1 -- mpn by limb exact division.
dnl  Copyright 2001, 2002, 2004-2006, 2010-2012 Free Software Foundation, Inc.
dnl  This file is part of the GNU MP Library.
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl  or both in parallel, as here.
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
31 include(`..
/config.m4
')
39 C Intel core2 13.5 13.25
42 C VIA nano 19.25 19.25
57 PROLOGUE(mpn_divexact_1)
62 xor R32(%rcx), R32(%rcx) C shift count
66 jc L(odd) C skip bsfq unless divisor is even
69 L(odd): mov %rax, %rbx
71 and $127, R32(%rax) C d/2, 7 bits
73 LEA( binvert_limb_table, %rdx)
75 movzbl (%rdx,%rax), R32(%rax) C inv 8 bits
77 mov %rbx, %r11 C d without twos
79 lea (%rax,%rax), R32(%rdx) C 2*inv
80 imul R32(%rax), R32(%rax) C inv*inv
81 imul R32(%rbx), R32(%rax) C inv*inv*d
82 sub R32(%rax), R32(%rdx) C inv = 2*inv - inv*inv*d, 16 bits
84 lea (%rdx,%rdx), R32(%rax) C 2*inv
85 imul R32(%rdx), R32(%rdx) C inv*inv
86 imul R32(%rbx), R32(%rdx) C inv*inv*d
87 sub R32(%rdx), R32(%rax) C inv = 2*inv - inv*inv*d, 32 bits
89 lea (%rax,%rax), %r10 C 2*inv
90 imul %rax, %rax C inv*inv
91 imul %rbx, %rax C inv*inv*d
92 sub %rax, %r10 C inv = 2*inv - inv*inv*d, 64 bits
94 lea (%rsi,%r8,8), %rsi C up end
95 lea -8(%rdi,%r8,8), %rdi C rp end
98 mov (%rsi,%r8,8), %rax C up[0]
103 test R32(%rcx), R32(%rcx)
104 jnz L(unorm) C branch if count != 0
105 xor R32(%rbx), R32(%rbx)
109 L(ntop):mul %r11 C carry limb in rdx 0 10
110 mov -8(%rsi,%r8,8), %rax C
111 sub %rbx, %rax C apply carry bit
113 sub %rdx, %rax C apply carry limb 5
115 L(nent):imul %r10, %rax C 6
116 mov %rax, (%rdi,%r8,8) C
120 mov -8(%rsi), %r9 C up high limb
124 mov (%rsi,%r8,8), %r9 C up[1]
130 xor R32(%rbx), R32(%rbx)
134 L(utop):mul %r11 C carry limb in rdx 0 10
135 mov (%rsi,%r8,8), %rax C
139 sub %rbx, %rax C apply carry bit
141 sub %rdx, %rax C apply carry limb 5
143 L(uent):imul %r10, %rax C 6
144 mov (%rsi,%r8,8), %r9 C
147 mov %rax, (%rdi,%r8,8) C
151 L(com): mul %r11 C carry limb in rdx
152 sub %rbx, %r9 C apply carry bit
153 sub %rdx, %r9 C apply carry limb
160 L(one): shr R8(%rcx), %rax