1 dnl Intel P6 mpn_modexact_1_odd
-- exact division style remainder.
3 dnl Copyright
2001, 2002, 2007 Free Software Foundation
, Inc.
5 dnl
This file is part of the GNU MP Library.
7 dnl The GNU MP Library is free software
; you can redistribute it and/or modify
8 dnl it under the terms of
either:
10 dnl
* the GNU Lesser General
Public License as published by the Free
11 dnl Software Foundation
; either version 3 of the License, or (at your
12 dnl option
) any later version.
16 dnl
* the GNU General
Public License as published by the Free Software
17 dnl Foundation
; either version 2 of the License, or (at your option) any
20 dnl
or both
in parallel
, as here.
22 dnl The GNU MP Library is distributed
in the hope that it will be useful
, but
23 dnl WITHOUT ANY WARRANTY
; without even the implied warranty of MERCHANTABILITY
24 dnl
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
Public License
27 dnl You should have received copies of the GNU General
Public License
and the
28 dnl GNU Lesser General
Public License along with the GNU MP Library. If
not,
29 dnl see
https://www.gnu.
org/licenses
/.
31 include(`..
/config.m4
')
35 C P6: 10.0 12.0 cycles/limb
38 C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
41 C The odd case is basically the same as mpn_modexact_1_odd, just with an
42 C extra store, and it runs at the same 10 cycles which is the dependent
45 C The shifts for the even case aren't on the dependent chain so
in principle
46 C it could run the same too
, but nothing running at
10 has been found.
47 C Perhaps there
's too many uops (an extra 4 over the odd case).
49 defframe(PARAM_DIVISOR,16)
50 defframe(PARAM_SIZE, 12)
51 defframe(PARAM_SRC, 8)
52 defframe(PARAM_DST, 4)
54 defframe(SAVE_EBX, -4)
55 defframe(SAVE_ESI, -8)
56 defframe(SAVE_EDI, -12)
57 defframe(SAVE_EBP, -16)
58 defframe(VAR_INVERSE, -20)
59 deflit(STACK_SPACE, 20)
64 PROLOGUE(mpn_divexact_1)
67 movl PARAM_DIVISOR
, %eax
68 subl $STACK_SPACE
, %esp FRAME_subl_esp
(STACK_SPACE
)
76 bsfl
%eax, %ecx C trailing twos
80 shrl
%cl, %eax C d without twos
83 shrl
%eax C d
/2 without twos
85 movl
%edx, PARAM_DIVISOR
89 LEA( binvert_limb_table, %ebp)
90 movzbl (%eax,%ebp), %ebp C inv 8 bits
92 movzbl binvert_limb_table
(%eax), %ebp C inv
8 bits
95 leal (%ebp,%ebp), %eax C 2*inv
97 imull %ebp, %ebp C inv*inv
102 leal (%esi,%ebx,4), %esi C src end
104 imull PARAM_DIVISOR, %ebp C inv*inv*d
106 subl %ebp, %eax C inv = 2*inv - inv*inv*d
107 leal (%eax,%eax), %ebp C 2*inv
109 imull %eax, %eax C inv*inv
111 leal (%edi,%ebx,4), %edi C dst end
116 imull PARAM_DIVISOR, %eax C inv*inv*d
118 subl %eax, %ebp C inv = 2*inv - inv*inv*d
120 ASSERT(e,` C d*inv == 1 mod 2^GMP_LIMB_BITS
121 movl PARAM_DIVISOR, %eax
125 movl
%ebp, VAR_INVERSE
126 movl
(%esi,%ebx,4), %eax C src
[0]
131 C
ecx initial carry is zero
135 C The dependent chain here is
139 C mull PARAM_DIVISOR
5
143 C
and this is the measured speed. No special scheduling is necessary
, out
144 C of order execution hides the load latency.
147 C
eax scratch
(src limb
)
148 C
ebx counter
, limbs
, negative
150 C
edx carry limb
, high of last product
157 movl
(%esi,%ebx,4), %eax
166 imull VAR_INVERSE
, %eax
168 movl
%eax, (%edi,%ebx,4)
182 addl $STACK_SPACE
, %esp
189 C
ebx counter
, limbs
, negative
196 xorl
%ebp, %ebp C initial carry bit
197 xorl
%edx, %edx C initial carry limb
(for
size==1)
202 movl
(%esi,%ebx,4), %edi C src
[1]
204 shrdl
( %cl, %edi, %eax)
211 C
ebx counter
, limbs
, negative
215 C
edi &dst
[size] and scratch
218 movl
(%esi,%ebx,4), %edi
222 movl
-4(%esi,%ebx,4), %eax
223 shrdl
( %cl, %edi, %eax)
233 imull VAR_INVERSE
, %eax
238 movl
%eax, -4(%edi,%ebx,4)
258 imull VAR_INVERSE
, %eax
262 addl $STACK_SPACE
, %esp