1 dnl Intel Pentium
-4 mpn_divexact_1
-- mpn by limb exact division.
3 dnl Copyright
2001, 2002, 2007 Free Software Foundation
, Inc.
5 dnl
This file is part of the GNU MP Library.
7 dnl The GNU MP Library is free software
; you can redistribute it and/or modify
8 dnl it under the terms of
either:
10 dnl
* the GNU Lesser General
Public License as published by the Free
11 dnl Software Foundation
; either version 3 of the License, or (at your
12 dnl option
) any later version.
16 dnl
* the GNU General
Public License as published by the Free Software
17 dnl Foundation
; either version 2 of the License, or (at your option) any
20 dnl
or both
in parallel
, as here.
22 dnl The GNU MP Library is distributed
in the hope that it will be useful
, but
23 dnl WITHOUT ANY WARRANTY
; without even the implied warranty of MERCHANTABILITY
24 dnl
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
Public License
27 dnl You should have received copies of the GNU General
Public License
and the
28 dnl GNU Lesser General
Public License along with the GNU MP Library. If
not,
29 dnl see
https://www.gnu.
org/licenses
/.
31 include(`..
/config.m4
')
34 C P4: 19.0 cycles/limb
37 C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
40 C Pairs of movd's are used to avoid unaligned loads. Despite the loads
not
41 C being on the dependent chain
and there being plenty of cycles available
,
42 C using an unaligned movq on every second iteration measured about
23 c
/l.
44 C Using divl for
size==1 seems a touch quicker than
mul-by
-inverse. The
mul
45 C will be about
9+2*4+2*2+10*4+19+12 = 92 cycles latency
, though some of
46 C that might be hidden by
out-of
-order execution
, whereas divl is around
60.
47 C At
size==2 an extra
19 for the
mul versus
60 for the divl will see the
mul
50 defframe
(PARAM_DIVISOR
,16)
51 defframe
(PARAM_SIZE
, 12)
52 defframe
(PARAM_SRC
, 8)
53 defframe
(PARAM_DST
, 4)
58 PROLOGUE
(mpn_divexact_1
)
65 movl PARAM_DIVISOR, %ecx
86 bsfl %ecx, %ecx C trailing twos
88 shrl %cl, %eax C d = divisor without twos
90 movd %ecx, %mm7 C shift
94 andl $127, %eax C d/2, 7 bits
97 LEA( binvert_limb_table
, %ecx)
98 movzbl
(%eax,%ecx), %eax C inv
8 bits
100 movzbl binvert_limb_table(%eax), %eax C inv 8 bits
105 movd
%eax, %mm5 C inv
107 movd
%eax, %mm0 C inv
109 pmuludq
%mm5
, %mm5 C inv
*inv
113 pmuludq
%mm6
, %mm5 C inv
*inv
*d
114 paddd
%mm0
, %mm0 C
2*inv
118 psubd
%mm5
, %mm0 C inv
= 2*inv
- inv
*inv
*d
122 pmuludq
%mm0
, %mm0 C inv
*inv
125 psrlq
$32, %mm4 C
0x00000000FFFFFFFF
129 pmuludq
%mm6
, %mm0 C inv
*inv
*d
130 paddd
%mm5
, %mm5 C
2*inv
134 pxor
%mm1
, %mm1 C initial carry limb
138 psubd
%mm0
, %mm5 C inv
= 2*inv
- inv
*inv
*d
140 ASSERT
(e
,` C expect d
*inv
== 1 mod 2^GMP_LIMB_BITS
141 pushl
%eax FRAME_pushl
()
146 popl
%eax FRAME_popl
()')
148 pxor %mm0, %mm0 C initial carry bit
151 C The dependent chain here is as follows.
154 C psubq s = (src-cbit) - climb 2
155 C pmuludq q = s*inverse 8
156 C pmuludq prod = q*divisor 8
157 C psrlq climb = high(prod) 2
161 C Yet the loop measures 19.0 c/l, so obviously there's something gained
162 C there over a straight reading of the chip documentation.
165 C
eax src
, incrementing
167 C
ecx dst
, incrementing
168 C
edx counter
, size-1 iterations
172 C mm4
0x00000000FFFFFFFF
183 pand
%mm4
, %mm2 C src
184 psubq
%mm0
, %mm2 C src
- cbit
186 psubq
%mm1
, %mm2 C src
- cbit
- climb
188 psrlq
$63, %mm0 C new cbit
190 pmuludq
%mm5
, %mm2 C s
*inverse
191 movd
%mm2
, (%ecx) C q
195 pmuludq
%mm2
, %mm1 C q
*divisor
196 psrlq
$32, %mm1 C new climb
204 psrlq
%mm7
, %mm2 C src
205 psubq
%mm0
, %mm2 C src
- cbit
207 psubq
%mm1
, %mm2 C src
- cbit
- climb
209 pmuludq
%mm5
, %mm2 C s
*inverse
210 movd
%mm2
, (%ecx) C q