dnl  Intel Pentium-4 mpn_modexact_1_odd -- mpn by limb exact remainder.
dnl  Copyright 2001, 2002, 2007 Free Software Foundation, Inc.
dnl  This file is part of the GNU MP Library.
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
31 include(`..
/config.m4
')
34 C P4: 19.0 cycles/limb
C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size,
C                               mp_limb_t divisor);
C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size,
C                                mp_limb_t divisor, mp_limb_t carry);
C Named argument slots.  The offsets 4, 8, 12, 16 follow the order in which
C the prototypes list the parameters: src, size, divisor, carry.  In the code
C visible here PARAM_CARRY is read only at the mpn_modexact_1c_odd entry.
C NOTE(review): offsets are presumably relative to the stack pointer at
C function entry -- confirm against the defframe definition.
43 defframe(PARAM_CARRY, 16)
44 defframe(PARAM_DIVISOR,12)
45 defframe(PARAM_SIZE, 8)
46 defframe(PARAM_SRC, 4)
C mpn_modexact_1c_odd: entry point for the variant whose prototype takes an
C initial carry as its fourth argument.  The instruction below copies that
C argument into %mm1, the register that the mpn_modexact_1_odd entry zeroes
C and labels "carry limb".
C NOTE(review): no jump, label or EPILOGUE for this entry is visible in this
C excerpt -- confirm how control reaches the shared code.
51 PROLOGUE(mpn_modexact_1c_odd)
54 movd PARAM_CARRY
, %mm1
61 PROLOGUE
(mpn_modexact_1_odd
)
64 pxor %mm1, %mm1 C carry limb
66 movl PARAM_DIVISOR, %eax
68 movd PARAM_DIVISOR, %mm7
72 andl $127, %eax C d/2, 7 bits
75 LEA( binvert_limb_table
, %edx)
76 movzbl
(%eax,%edx), %eax C inv
8 bits
78 movzbl binvert_limb_table(%eax), %eax C inv 8 bits
87 pmuludq
%mm6
, %mm6 C inv
*inv
91 pmuludq
%mm7
, %mm6 C inv
*inv
*d
92 paddd
%mm0
, %mm0 C
2*inv
96 psubd
%mm6
, %mm0 C inv
= 2*inv
- inv
*inv
*d
100 pmuludq
%mm0
, %mm0 C inv
*inv
104 pmuludq
%mm7
, %mm0 C inv
*inv
*d
105 paddd
%mm6
, %mm6 C
2*inv
109 movl PARAM_SIZE
, %ecx
113 psubd
%mm0
, %mm6 C inv
= 2*inv
- inv
*inv
*d
115 ASSERT
(e
,` C expect d
*inv
== 1 mod 2^GMP_LIMB_BITS
116 pushl
%eax FRAME_pushl
()
118 imul PARAM_DIVISOR
, %eax
120 popl
%eax FRAME_popl
()')
122 pxor %mm0, %mm0 C carry bit
C The dependent chain here is as follows.
C
C                                          latency
C       psubq    s = (src-cbit) - climb       2
C       pmuludq  q = s*inverse                8
C       pmuludq  prod = q*divisor             8
C       psrlq    climb = high(prod)           2
C                                            --
C                                            20
C
C Yet the loop measures 19.0 c/l, so obviously there's something gained
C there over a straight reading of the chip documentation.
C eax   src, incrementing
152 psubq
%mm0
, %mm2 C src
- cbit
154 psubq
%mm1
, %mm2 C src
- cbit
- climb
156 psrlq
$63, %mm0 C new cbit
158 pmuludq
%mm6
, %mm2 C s
*inverse
161 pmuludq
%mm2
, %mm1 C q
*divisor
162 psrlq
$32, %mm1 C new climb