1 dnl x86-64 mpn_div_qr_1n_pi1
2 dnl -- Divide an mpn number by a normalized single-limb number,
3 dnl using a single-limb inverse.
5 dnl Contributed to the GNU project by Niels Möller
7 dnl Copyright 2013 Free Software Foundation, Inc.
9 dnl This file is part of the GNU MP Library.
11 dnl The GNU MP Library is free software; you can redistribute it and/or modify
12 dnl it under the terms of either:
14 dnl * the GNU Lesser General Public License as published by the Free
15 dnl Software Foundation; either version 3 of the License, or (at your
16 dnl option) any later version.
20 dnl * the GNU General Public License as published by the Free Software
21 dnl Foundation; either version 2 of the License, or (at your option) any
24 dnl or both in parallel, as here.
26 dnl The GNU MP Library is distributed in the hope that it will be useful, but
27 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
28 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
31 dnl You should have received copies of the GNU General Public License and the
32 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
33 dnl see https://www.gnu.org/licenses/.
35 include(`../config.m4')
47 C Intel core 28.5 very poor
48 C Intel NHM 29 very poor
53 C Intel atom 53 very poor
60 define(`UN_INPUT', `%rdx') C Copied into UN by "mov UN_INPUT, UN"
61 define(`U1', `%rcx') C Also in %rax
67 define(`B2md', `%rbx') C NOTE(review): %rbx is callee-saved in both x86-64 ABIs; confirm the elided code saves/restores it
70 define(`UN', `%r8') C Overlaps D input
83 PROLOGUE(mpn_div_qr_1n_pi1)
85 IFDOS(` mov 56(%rsp), %r8 ')
86 IFDOS
(`
mov 64(%rsp
), %r9
')
90 C Just a single 2/1 division.
91 C T, U0 are allocated in scratch registers
106 jc L(single_div_done)
114 C FIXME: Could delay some of these until we enter the loop.
128 C D not needed until final reduction
130 mov UN_INPUT, UN C Clobbers D
140 mov -8(UP, UN, 8), U0
153 C Loop is 28 instructions, 30 K8/K10 decoder slots, should run
154 C in 10 cycles. At entry, %rax holds an extra copy of U1, Q1
155 C is zero, and carry holds an extra copy of U2.
157 C {Q2, Q1, Q0} <-- DINV * U1 + B (Q0 + U2 DINV) + B^2 U2
158 C Remains to add in B (U1 + c)
171 C {U2, U1, U0} <-- (U0 + U2 B2 -c U) B + U1 B2 + u
177 C {QP+UN, ...} <-- {QP+UN, ...} + {Q2, Q1} + U1 + c
179 mov -8(UP, UN, 8), U0
242 C U1 is not live, so use it for indexing
243 lea 16(QP, UN, 8), U1