dnl  Intel Pentium mpn_modexact_1_odd -- exact division style remainder.
dnl  Copyright 2000-2002, 2014 Free Software Foundation, Inc.
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
31 include(`..
/config.m4
')
34 C P5: 23.0 cycles/limb
C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size,
C                               mp_limb_t divisor);
C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size,
C                                mp_limb_t divisor, mp_limb_t carry);
42 C There seems no way to pair up the two lone instructions in the main loop.
44 C The special case for size==1 saves about 20 cycles (non-PIC), making it
45 C the same as mpn_mod_1, and in fact making modexact faster than mod_1 at
50 C Using mmx for the multiplies might be possible, with pmullw and pmulhw
51 C having just 3 cycle latencies, but carry bit handling would probably be
54 defframe(PARAM_CARRY, 16)
55 defframe(PARAM_DIVISOR,12)
56 defframe(PARAM_SIZE, 8)
57 defframe(PARAM_SRC, 4)
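dnl  re-using parameter space: VAR_INVERSE lives in the PARAM_SIZE stack slot,
dnl  which is only overwritten after size has been loaded into %ebx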
60 define(VAR_INVERSE,`PARAM_SIZE')
65 PROLOGUE
(mpn_modexact_1c_odd
)
68 movl PARAM_DIVISOR, %eax
69 movl PARAM_CARRY, %edx
76 PROLOGUE(mpn_modexact_1_odd)
79 movl PARAM_DIVISOR
, %eax
80 xorl
%edx, %edx C carry
87 LEA( binvert_limb_table
, %ecx)
88 pushl
%ebx FRAME_pushl
()
97 call L(here) FRAME_pushl()
101 movl (%esp), %ecx C eip
103 addl $_GLOBAL_OFFSET_TABLE_+[.-L(here)], %ecx
104 movl %ebx, (%esp) C push ebx
107 movl PARAM_SIZE, %ebx
109 movl binvert_limb_table@GOT(%ecx), %ecx
112 movb (%eax,%ecx), %cl C inv 8 bits
118 pushl %ebx FRAME_pushl()
120 movl PARAM_SIZE, %ebx
126 movb binvert_limb_table(%eax), %cl C inv 8 bits
130 addl
%ecx, %ecx C
2*inv
132 imull
%eax, %eax C inv
*inv
134 imull PARAM_DIVISOR
, %eax C inv
*inv
*d
136 subl
%eax, %ecx C inv
= 2*inv
- inv
*inv
*d
139 addl
%ecx, %ecx C
2*inv
141 imull
%eax, %eax C inv
*inv
143 imull PARAM_DIVISOR
, %eax C inv
*inv
*d
145 subl
%eax, %ecx C inv
= 2*inv
- inv
*inv
*d
146 pushl
%esi FRAME_pushl
()
148 ASSERT
(e
,` C d
*inv
== 1 mod 2^GMP_LIMB_BITS
150 imull PARAM_DIVISOR
, %eax
154 movl %ecx, VAR_INVERSE
156 movl (%esi), %eax C src[0]
157 leal 4(%esi,%ebx,4), %esi C &src[size-1]
159 xorl $-1, %ebx C -(size-1)
164 C The use of VAR_INVERSE means only a store is needed for that value, rather
165 C than a push and pop of say %edi.
	C eax	scratch, low product
	C ebx	counter, limbs, negative
	C ecx	carry bit, 0 or -1
	C edx	scratch, high product
	C esi	&src[size-1]
177 mull PARAM_DIVISOR C h:dummy = q*d
179 movl (%esi,%ebx,4), %eax C src[i]
180 subl %ecx, %edx C h -= -c
183 subl %edx, %eax C s = src[i] - h
185 sbbl %ecx, %ecx C new -c (0 or -1)
187 imull VAR_INVERSE, %eax C q = s*i
195 movl (%esi), %eax C src high
196 subl %ecx, %edx C h -= -c
198 cmpl PARAM_DIVISOR, %eax
201 deflit(FRAME_LAST,FRAME)
204 subl %edx, %eax C s = src[i] - h
205 popl %esi FRAME_popl()
207 sbbl %ecx, %ecx C c (0 or -1)
208 popl %ebx FRAME_popl()
210 imull VAR_INVERSE, %eax C q = s*i
212 mull PARAM_DIVISOR C h:dummy = q*d
221 C When high<divisor can skip last step.
224 deflit(`FRAME',FRAME_LAST
)
231 subl
%eax, %edx C r
-s
232 popl
%esi FRAME_popl
()
234 sbbl
%eax, %eax C
-1 if underflow
235 movl PARAM_DIVISOR
, %ebx
237 andl
%ebx, %eax C divisor if underflow
238 popl
%ebx FRAME_popl
()
240 addl
%edx, %eax C addback if underflow
C Special case for size==1 using a division for r = c-a mod d.
C Could look for a-c<d and save a division sometimes, but that doesn't seem
C worth bothering about.
262 movl PARAM_DIVISOR
, %ecx
263 popl
%ebx FRAME_popl
()
265 subl
(%edx), %eax C c
-a
270 andl
%ecx, %edx C b
*d
+c
-a if c
<a
, or c
-a if c
>=a