1 dnl AMD K6 mpn_mul_1
-- mpn by limb multiply.
3 dnl Copyright
1999, 2000, 2002, 2005 Free Software Foundation
, Inc.
5 dnl
This file is part of the GNU MP Library.
7 dnl The GNU MP Library is free software
; you can redistribute it and/or modify
8 dnl it under the terms of
either:
10 dnl
* the GNU Lesser General
Public License as published by the Free
11 dnl Software Foundation
; either version 3 of the License, or (at your
12 dnl option
) any later version.
16 dnl
* the GNU General
Public License as published by the Free Software
17 dnl Foundation
; either version 2 of the License, or (at your option) any later version.
20 dnl
or both
in parallel
, as here.
22 dnl The GNU MP Library is distributed
in the hope that it will be useful
, but
23 dnl WITHOUT ANY WARRANTY
; without even the implied warranty of MERCHANTABILITY
24 dnl
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
Public License
27 dnl You should have received copies of the GNU General
Public License
and the
28 dnl GNU Lesser General
Public License along with the GNU MP Library. If
not,
29 dnl see
https://www.gnu.
org/licenses
/.
31 include(`..
/config.m4
')
36 C P6 model 0-8,10-12 5.5
38 C P6 model 13 (Dothan) 4.87
39 C P4 model 0 (Willamette)
41 C P4 model 2 (Northwood)
42 C P4 model 3 (Prescott)
49 C mp_limb_t mpn_mul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
50 C mp_limb_t multiplier);
51 C mp_limb_t mpn_mul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
52 C mp_limb_t multiplier, mp_limb_t carry);
54 C Multiply src,size by multiplier and store the result in dst,size.
55 C Return the carry limb from the top of the result.
57 C mpn_mul_1c() accepts an initial carry for the calculation; it is added
into
58 C the
low limb of the result.
60 defframe
(PARAM_CARRY
, 20)
61 defframe
(PARAM_MULTIPLIER
,16)
62 defframe
(PARAM_SIZE
, 12)
63 defframe
(PARAM_SRC
, 8)
64 defframe
(PARAM_DST
, 4)
66 dnl minimum
5 because the unrolled code can
't handle less
67 deflit(UNROLL_THRESHOLD, 5)
75 movl PARAM_CARRY
, %esi
83 xorl %esi, %esi C initial carry
98 cmpl $UNROLL_THRESHOLD, %ecx
99 movl PARAM_MULTIPLIER, %ebp
104 C code offset 0x22 here, close enough to aligned
114 C this loop 8 cycles/limb
143 C -----------------------------------------------------------------------------
144 C The code for each limb is 6 cycles, with instruction decoding being the
145 C limiting factor. At 4 limbs/loop and 1 cycle/loop of overhead it's
6.25
146 C cycles
/limb
in total.
148 C The secret ingredient to get
6.25 is to start the
loop with the
mul and
149 C have the load
/store pair at the
end. Rotating the load
/store to the top
150 C is a
0.5 c
/l slowdown.
(Probably some address-generation effect.
)
152 C The whole unrolled
loop fits nicely
in exactly
80 bytes.
155 ALIGN(16) C already aligned to
16 here actually
158 leal
-16(%ebx,%ecx,4), %ebx
160 leal
-16(%edi,%ecx,4), %edi
166 ALIGN(16) C one
byte nop for
this alignment
183 movl
%eax, (%edi,%ecx,4)
184 movl
4(%ebx,%ecx,4), %eax
194 movl
%eax, 4(%edi,%ecx,4)
195 movl
8(%ebx,%ecx,4), %eax
205 movl
%eax, 8(%edi,%ecx,4)
206 movl
12(%ebx,%ecx,4), %eax
216 movl
%eax, 12(%edi,%ecx,4)
217 movl
16(%ebx,%ecx,4), %eax
227 C
ecx 0 to
3 representing respectively
4 to
1 further limbs
233 jnz L
(finish_not_two
)
242 movl
%eax, (%edi,%ecx,4)
243 movl
4(%ebx,%ecx,4), %eax
253 movl
%eax, 4(%edi,%ecx,4)
254 movl
8(%ebx,%ecx,4), %eax
261 jnz L
(finish_not_one
)