1 dnl Intel Pentium
-4 mpn_divexact_1
-- mpn by limb exact division.
3 dnl Rearranged from mpn
/x86
/pentium4
/sse2
/dive_1.asm by Marco Bodrato.
5 dnl Copyright
2001, 2002, 2007, 2011 Free Software Foundation
, Inc.
7 dnl
This file is part of the GNU MP Library.
9 dnl The GNU MP Library is free software
; you can redistribute it and/or modify
10 dnl it under the terms of
either:
12 dnl
* the GNU Lesser General
Public License as published by the Free
13 dnl Software Foundation
; either version 3 of the License, or (at your
14 dnl option
) any later version.
18 dnl
* the GNU General
Public License as published by the Free Software
19 dnl Foundation
; either version 2 of the License, or (at your option) any
22 dnl
or both
in parallel
, as here.
24 dnl The GNU MP Library is distributed
in the hope that it will be useful
, but
25 dnl WITHOUT ANY WARRANTY
; without even the implied warranty of MERCHANTABILITY
26 dnl
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
Public License
29 dnl You should have received copies of the GNU General
Public License
and the
30 dnl GNU Lesser General
Public License along with the GNU MP Library. If
not,
31 dnl see
https://www.gnu.
org/licenses
/.
33 include(`..
/config.m4
')
36 C P4: 19.0 cycles/limb
38 C Pairs of movd's are used to avoid unaligned loads. Despite the loads
not
39 C being on the dependent chain
and there being plenty of cycles available
,
40 C using an unaligned movq on every second iteration measured about
23 c
/l.
43 defframe
(PARAM_SHIFT
, 24)
44 defframe
(PARAM_INVERSE
,20)
45 defframe
(PARAM_DIVISOR
,16)
46 defframe
(PARAM_SIZE
, 12)
47 defframe
(PARAM_SRC
, 8)
48 defframe
(PARAM_DST
, 4)
53 C mpn_pi1_bdiv_q_1
(mp_ptr dst
, mp_srcptr src
, mp_size_t
size, mp_limb_t divisor
,
54 C mp_limb_t inverse
, int shift
)
56 PROLOGUE
(mpn_pi1_bdiv_q_1
)
63 movl PARAM_DIVISOR, %ecx
66 movl PARAM_SHIFT, %ecx
68 movd %ecx, %mm7 C shift
72 movl PARAM_INVERSE, %ecx
76 pxor %mm1, %mm1 C initial carry limb
77 pxor %mm0, %mm0 C initial carry bit
83 psrlq $32, %mm4 C 0x00000000FFFFFFFF
85 C The dependent chain here is as follows.
88 C psubq s = (src-cbit) - climb 2
89 C pmuludq q = s*inverse 8
90 C pmuludq prod = q*divisor 8
91 C psrlq climb = high(prod) 2
95 C Yet the loop measures 19.0 c/l, so obviously there's something gained
96 C there over a straight reading of the chip documentation.
99 C
eax src
, incrementing
101 C
ecx dst
, incrementing
102 C
edx counter
, size-1 iterations
106 C mm4
0x00000000FFFFFFFF
117 pand
%mm4
, %mm2 C src
118 psubq
%mm0
, %mm2 C src
- cbit
120 psubq
%mm1
, %mm2 C src
- cbit
- climb
122 psrlq
$63, %mm0 C new cbit
124 pmuludq
%mm5
, %mm2 C s
*inverse
125 movd
%mm2
, (%ecx) C q
129 pmuludq
%mm2
, %mm1 C q
*divisor
130 psrlq
$32, %mm1 C new climb
138 psrlq
%mm7
, %mm2 C src
139 psubq
%mm0
, %mm2 C src
- cbit
141 psubq
%mm1
, %mm2 C src
- cbit
- climb
143 pmuludq
%mm5
, %mm2 C s
*inverse
144 movd
%mm2
, (%ecx) C q
152 C mp_limb_t mpn_bdiv_q_1
(mp_ptr dst
, mp_srcptr src
, mp_size_t
size,
153 C mp_limb_t divisor
);
155 PROLOGUE
(mpn_bdiv_q_1
)
158 movl PARAM_SIZE, %edx
160 movl PARAM_DIVISOR, %ecx
168 bsfl %ecx, %ecx C trailing twos
170 shrl %cl, %eax C d = divisor without twos
172 movd %ecx, %mm7 C shift
176 andl $127, %eax C d/2, 7 bits
179 LEA( binvert_limb_table
, %ecx)
180 movzbl
(%eax,%ecx), %eax C inv
8 bits
182 movzbl binvert_limb_table(%eax), %eax C inv 8 bits
187 movd
%eax, %mm5 C inv
189 movd
%eax, %mm0 C inv
191 pmuludq
%mm5
, %mm5 C inv
*inv
195 pmuludq
%mm6
, %mm5 C inv
*inv
*d
196 paddd
%mm0
, %mm0 C
2*inv
200 psubd
%mm5
, %mm0 C inv
= 2*inv
- inv
*inv
*d
204 pmuludq
%mm0
, %mm0 C inv
*inv
207 psrlq
$32, %mm4 C
0x00000000FFFFFFFF
211 pmuludq
%mm6
, %mm0 C inv
*inv
*d
212 paddd
%mm5
, %mm5 C
2*inv
216 pxor
%mm1
, %mm1 C initial carry limb
220 psubd
%mm0
, %mm5 C inv
= 2*inv
- inv
*inv
*d
222 ASSERT
(e
,` C expect d
*inv
== 1 mod 2^GMP_LIMB_BITS
223 pushl
%eax FRAME_pushl
()
228 popl
%eax FRAME_popl
()')
230 pxor %mm0, %mm0 C initial carry bit