1 dnl AMD K7 mpn_add_n
/mpn_sub_n
-- mpn
add or subtract.
3 dnl Copyright
1999-2003 Free Software Foundation
, Inc.
5 dnl
This file is part of the GNU MP Library.
7 dnl The GNU MP Library is free software
; you can redistribute it and/or modify
8 dnl it under the terms of
either:
10 dnl
* the GNU Lesser General
Public License as published by the Free
11 dnl Software Foundation
; either version 3 of the License, or (at your
12 dnl option
) any later version.
16 dnl
* the GNU General
Public License as published by the Free Software
17 dnl Foundation
; either version 2 of the License, or (at your option) any
20 dnl
or both
in parallel
, as here.
22 dnl The GNU MP Library is distributed
in the hope that it will be useful
, but
23 dnl WITHOUT ANY WARRANTY
; without even the implied warranty of MERCHANTABILITY
24 dnl
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
Public License
27 dnl You should have received copies of the GNU General
Public License
and the
28 dnl GNU Lesser General
Public License along with the GNU MP Library. If
not,
29 dnl see
https://www.gnu.
org/licenses
/.
31 include(`..
/config.m4
')
34 C K7: 1.64 cycles/limb (at 16 limbs/loop).
38 dnl K7: UNROLL_COUNT cycles/limb
43 dnl Maximum possible with the current code is 64.
dnl  Number of limbs covered by one pass of the unrolled loop.  The loop body
dnl  is generated UNROLL_COUNT/CHUNK_COUNT times by the forloop further down.
45 deflit(UNROLL_COUNT, 16)
C Pick the public entry point names and the description word used in the
C comments according to which OPERATION_ symbol is defined.  If neither
C OPERATION_add_n nor OPERATION_sub_n is defined, m4_error is invoked.
C NOTE(review): M4_inst is used by the code further down but its define is
C not among the lines visible here -- confirm it is set in each branch.
48 ifdef(`OPERATION_add_n', `
50 define
(M4_function_n
, mpn_add_n
)
51 define
(M4_function_nc
, mpn_add_nc
)
52 define
(M4_description
, add)
53 ',`ifdef(`OPERATION_sub_n', `
55 define
(M4_function_n
, mpn_sub_n
)
56 define
(M4_function_nc
, mpn_sub_nc
)
57 define
(M4_description
, subtract
)
58 ',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n
C The four entry points this source can provide: add and subtract, each
C with and without a carry-in argument.
C NOTE(review): presumably read by the build machinery to learn which
C functions this one file yields -- confirm against the macro definition.
61 MULFUNC_PROLOGUE
(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc
)
64 C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
65 C                         mp_size_t size);
66 C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
67 C                          mp_size_t size, mp_limb_t carry);
69 C Calculate src1,size M4_description src2,size, and store the result in
70 C dst,size.  The return value is the carry bit from the top of the result (1
71 C or 0).
73 C The _nc version accepts 1 or 0 for an initial carry into the low limb of
74 C the calculation.  Note values other than 1 or 0 here will lead to garbage
75 C results.
77 C This code runs at 1.64 cycles/limb, which might be the best possible with
78 C plain integer operations.  Each limb is 2 loads and 1 store, any 2 of
79 C which can be done each cycle, leading to 1.5 c/l.
81 dnl Must have UNROLL_THRESHOLD >= 2, since the unrolled loop can't handle 1.
dnl  Size cutoff: the entry code compares size against this value.
dnl  NOTE(review): presumably the comparison picks the simple loop or the
dnl  unrolled loop; the branch is not among the visible lines -- confirm.
dnl  NOTE(review): the same value is defined twice here; presumably these are
dnl  the two arms of a conditional whose test lines are not visible -- confirm.
83 deflit
(UNROLL_THRESHOLD
, 8)
85 deflit(UNROLL_THRESHOLD, 8)
C Stack frame layout.  PARAM_* name the five incoming arguments at offsets
C 4 to 20, in the order dst, src1, src2, size, carry.  SAVE_* name four
C 4-byte slots at offsets -4 to -16, and STACK_SPACE (16 bytes) is the amount
C subtracted from esp at entry to cover them.
C NOTE(review): the slot names suggest they hold the caller values of ebp,
C esi, ebx and edi; the saving instructions are not among the visible lines
C -- confirm.
88 defframe
(PARAM_CARRY
,20)
89 defframe
(PARAM_SIZE
, 16)
90 defframe
(PARAM_SRC2
, 12)
91 defframe
(PARAM_SRC1
, 8)
92 defframe
(PARAM_DST
, 4)
94 defframe
(SAVE_EBP
, -4)
95 defframe
(SAVE_ESI
, -8)
96 defframe
(SAVE_EBX
, -12)
97 defframe
(SAVE_EDI
, -16)
98 deflit
(STACK_SPACE
, 16)
C Entry points.  The _nc entry loads the caller supplied carry into eax;
C the plain entry clears eax instead.
C NOTE(review): the lines joining the two entries (jump or fall-through) are
C not visible here -- confirm how the _nc entry reaches the common code.
104 PROLOGUE(M4_function_nc)
105 movl PARAM_CARRY, %eax
109 PROLOGUE(M4_function_n)
111 xorl %eax, %eax C carry
C ecx = size, then reserve STACK_SPACE bytes and record that in FRAME.
113 movl PARAM_SIZE, %ecx
114 subl $STACK_SPACE, %esp
115 deflit(`FRAME',STACK_SPACE
)
C Compare size with the unrolling cutoff.
C NOTE(review): the branch consuming these flags is not visible here.
119 cmpl $UNROLL_THRESHOLD
, %ecx
C edx = src2, ebx = src1.
121 movl PARAM_SRC2
, %edx
122 movl PARAM_SRC1
, %ebx
C Advance each pointer by size limbs of 4 bytes, so that it addresses the
C end of its operand.
C NOTE(review): the load of dst into edi is not among the visible lines.
126 leal
(%ebx,%ecx,4), %ebx
127 leal
(%edx,%ecx,4), %edx
129 leal
(%edi,%ecx,4), %edi
133 C This loop is in a single 16 byte code block already, so no
134 C alignment necessary.
C Simple loop body, one limb per pass: load a limb via ebx, combine it with
C the matching limb via edx using M4_inst, and store the result via edi.
C All three accesses use ecx as the limb index.
C NOTE(review): the loop label, index update and branch are not visible
C here, nor is how the carry flag is set up before the first pass -- confirm.
144 movl
(%ebx,%ecx,4), %eax
145 M4_inst
(%edx,%ecx,4), %eax
146 movl
%eax, (%edi,%ecx,4)
C Release the STACK_SPACE bytes reserved at entry.
155 addl $STACK_SPACE
, %esp
160 C
-----------------------------------------------------------------------------
161 C
This is at
0x55, close enough to aligned.
C Unrolled path.  FRAME is set back to STACK_SPACE for this section.
C NOTE(review): many lines of this path (labels, register saves, branches,
C the return) are not visible here; the notes below describe only what is
C shown and mark anything inferred.
163 deflit
(`FRAME
',STACK_SPACE)
C Split size: the low bit stays in PARAM_SIZE, and ecx keeps the rest, which
C is an even limb count.
165 andl $-2, %ecx C size low bit masked out
166 andl $1, PARAM_SIZE C size low bit kept
C NOTE(review): ecx shifted right by UNROLL_LOG2, presumably giving the
C number of passes of the unrolled loop -- UNROLL_LOG2 is defined outside
C the visible lines, confirm.
172 shrl $UNROLL_LOG2, %ecx
C edi is masked with UNROLL_MASK and then used as the index for both the
C computed entry address and the pointer adjustments below.
176 andl $UNROLL_MASK, %edi
C First form of the computed entry address: L(entry) plus 9 times edi.
182 leal L(entry) (%edi,%edi,8), %esi C 9 bytes per
C Adjust the three pointers by 4 times edi bytes, plus a 128 byte bias when
C UNROLL_BYTES is 256.  The loop displacements subtract the same 128 in that
C case.  The dst pointer is taken from ebp and ends up in edi.
C NOTE(review): presumably the bias keeps every loop displacement within a
C signed byte -- confirm.
187 leal ifelse
(UNROLL_BYTES
,256,128) (%ebx,%edi,4), %ebx
188 leal ifelse
(UNROLL_BYTES
,256,128) (%edx,%edi,4), %edx
189 leal ifelse
(UNROLL_BYTES
,256,128) (%ebp,%edi,4), %edi
C Second form of the computed entry address: 9 times edi, then add the
C distance from L(here) to L(entry).
C NOTE(review): presumably the position independent variant, with esi
C receiving the address of L(here) in lines not shown -- confirm.
196 C See mpn/x86/README about old gas bugs
197 leal (%edi,%edi,8), %esi
198 addl $L(entry)-L(here), %esi
204 C
-----------------------------------------------------------------------------
211 C
esi scratch
(was computed jump
)
C Step the edx pointer on by one unrolled block.
215 leal UNROLL_BYTES
(%edx), %edx
C Each forloop iteration emits code for CHUNK_COUNT limbs.  disp0 and disp1
C are the byte offsets of the two limbs, esi and ebp are the temporaries.
C Loads come via ebx and edx, stores go via edi.
218 deflit
(CHUNK_COUNT
, 2)
219 forloop
(i
, 0, UNROLL_COUNT
/CHUNK_COUNT
-1, `
220 deflit
(`disp0
', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
221 deflit(`disp1', eval
(disp0
+ 4))
223 Zdisp
( movl
, disp0
,(%ebx), %esi)
224 movl disp1
(%ebx), %ebp
225 Zdisp
( M4_inst
,disp0
,(%edx), %esi)
226 Zdisp
( movl
, %esi, disp0
,(%edi))
227 M4_inst disp1
(%edx), %ebp
228 movl
%ebp, disp1
(%edi)
C Step the ebx and edi pointers on by one unrolled block.  leal is used so
C the carry flag is left alone.
232 leal UNROLL_BYTES(%ebx), %ebx
233 leal UNROLL_BYTES(%edi), %edi
C NOTE(review): one limb at offset UNROLL_BYTES from edx is combined into
C ecx here; presumably this handles the odd final limb recorded in
C PARAM_SIZE -- confirm against the lines not shown.
245 M4_inst UNROLL_BYTES(%edx), %ecx
C Release the STACK_SPACE bytes reserved at entry.
254 addl $STACK_SPACE, %esp