dnl  Intel P6 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple.
3 dnl Copyright
1999-2002, 2005 Free Software Foundation
, Inc.
5 dnl
This file is part of the GNU MP Library.
7 dnl The GNU MP Library is free software
; you can redistribute it and/or modify
8 dnl it under the terms of
either:
10 dnl
* the GNU Lesser General
Public License as published by the Free
11 dnl Software Foundation
; either version 3 of the License, or (at your
12 dnl option
) any later version.
16 dnl
* the GNU General
Public License as published by the Free Software
17 dnl Foundation
; either version 2 of the License, or (at your option) any
20 dnl
or both
in parallel
, as here.
22 dnl The GNU MP Library is distributed
in the hope that it will be useful
, but
23 dnl WITHOUT ANY WARRANTY
; without even the implied warranty of MERCHANTABILITY
24 dnl
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
Public License
27 dnl You should have received copies of the GNU General
Public License
and the
28 dnl GNU Lesser General
Public License along with the GNU MP Library. If
not,
29 dnl see
https://www.gnu.
org/licenses
/.
31 include(`..
/config.m4
')
36 C P6 model 0-8,10-12 6.44
37 C P6 model 9 (Banias) 6.15
38 C P6 model 13 (Dothan) 6.11
39 C P4 model 0 (Willamette)
41 C P4 model 2 (Northwood)
42 C P4 model 3 (Prescott)
49 dnl P6 UNROLL_COUNT cycles/limb
54 dnl Maximum possible with the current code is 64.
dnl  Number of limbs handled per pass of the unrolled loop.  The forloop that
dnl  generates the loop body emits UNROLL_COUNT/CHUNK_COUNT chunks.
deflit(UNROLL_COUNT, 16)
dnl  Select the instruction, the public function names and the wording used
dnl  in the comments, according to which operation is being built.  One of
dnl  OPERATION_addmul_1 or OPERATION_submul_1 must be defined, otherwise
dnl  m4_error is invoked.
dnl
dnl  M4_inst is the instruction applied to the destination limbs by the loops
dnl  further down: addl when adding, subl when subtracting.
dnl
dnl  Each macro name must be followed immediately by its open parenthesis,
dnl  or m4 does not treat it as a macro call.

ifdef(`OPERATION_addmul_1', `
	define(M4_inst,        addl)
	define(M4_function_1,  mpn_addmul_1)
	define(M4_function_1c, mpn_addmul_1c)
	define(M4_description, add it to)
	define(M4_desc_retval, carry)
',`ifdef(`OPERATION_submul_1', `
	define(M4_inst,        subl)
	define(M4_function_1,  mpn_submul_1)
	define(M4_function_1c, mpn_submul_1c)
	define(M4_description, subtract it from)
	define(M4_desc_retval, borrow)
',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1
')')')
dnl  List of the four entry points this source can be built into (two per
dnl  operation: the plain form and the form taking an initial carry).
MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c)
C mp_limb_t M4_function_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
C                            mp_limb_t mult);
C mp_limb_t M4_function_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
C                             mp_limb_t mult, mp_limb_t carry);
C
C Calculate src,size multiplied by mult and M4_description dst,size.
C Return the M4_desc_retval limb from the top of the result.
C This code is pretty much the same as the K6 code.  The unrolled loop is
C the same, but there are just a few scheduling tweaks in the setups and the
C simple loop.
C
C A number of variations have been tried for the unrolled loop, with one or
C two carries, and with loads scheduled earlier, but nothing faster than 6
C cycles/limb has been found.
dnl  The size argument is compared against UNROLL_THRESHOLD at function entry.
dnl  NOTE(review): presumably sizes below this use the simple loop and the
dnl  rest use the unrolled code -- confirm against the branch that follows
dnl  the compare.
dnl
dnl  The value was given twice, both times as 5, so one definition suffices.
deflit(UNROLL_THRESHOLD, 5)
dnl  Frame offsets of the incoming parameters, in the order dst, src, size,
dnl  multiplier, carry.  PARAM_CARRY is the extra trailing argument of the
dnl  M4_function_1c entry point.
defframe(PARAM_CARRY,     20)
defframe(PARAM_MULTIPLIER,16)
defframe(PARAM_SIZE,      12)
defframe(PARAM_SRC,       8)
defframe(PARAM_DST,       4)
C M4_function_1c entry point: the caller-supplied initial carry is loaded
C from PARAM_CARRY into %ebx.
C NOTE(review): no jump or EPILOGUE is visible between this entry and the
C next PROLOGUE -- confirm how control reaches the shared code without the
C xorl below clearing the carry.
108 PROLOGUE
(M4_function_1c
)
111 movl PARAM_CARRY, %ebx
C M4_function_1 entry point: there is no carry argument, so %ebx is zeroed.
115 PROLOGUE(M4_function_1)
118 xorl
%ebx, %ebx C initial carry
C %ecx = size, which is then compared against UNROLL_THRESHOLD.
121 movl PARAM_SIZE
, %ecx
132 cmpl $UNROLL_THRESHOLD, %ecx
C %ebp = multiplier
134 movl PARAM_MULTIPLIER, %ebp
139 C this is offset 0x22, so close enough to aligned
C Apply the add or subtract instruction selected by M4_inst, with %eax as
C the source operand, to the word at -4(%edi).
157 M4_inst %eax, -4(%edi)
178 C------------------------------------------------------------------------------
C VAR_JUMP holds the computed jump temporarily because there are not enough
C registers when doing the mul for the initial two carry limbs.
C
C The add/adc for the initial carry in %ebx is necessary only for the
C mpn_add/submul_1c entry points.  Duplicating the startup code to
C eliminate this for the plain mpn_add/submul_1 does not seem like a good
C idea.
dnl  Scratch variables, overlapping with parameters already fetched:
dnl  VAR_COUNTER reuses the PARAM_SIZE slot and VAR_JUMP reuses the
dnl  PARAM_DST slot.
define(VAR_COUNTER,`PARAM_SIZE')
define(VAR_JUMP,   `PARAM_DST')
191 C this is offset 0x43, so close enough to aligned
C Set-up for the unrolled loop.  %edx shifted right by UNROLL_LOG2 is saved
C in VAR_COUNTER as the loop counter, and %ecx is reduced with UNROLL_MASK.
C NOTE(review): how %edx and %ecx are derived from the size is not shown in
C these lines -- confirm against the preceding instructions.
207 shrl $UNROLL_LOG2, %edx
208 andl $UNROLL_MASK, %ecx
210 movl %edx, VAR_COUNTER
213 C 15 code bytes per limb
C %edx = L(entry) + %edx + %ecx, presumably the computed jump target that
C is parked in VAR_JUMP.
221 leal L(entry) (%edx,%ecx,1), %edx
223 movl
(%esi), %eax C src
low limb
C %esi and %edi are adjusted by %ecx*4.  When UNROLL_BYTES is 256 an extra
C 128 is added, matching the -128 applied to the loop displacements.
226 leal ifelse
(UNROLL_BYTES
,256,128+) 4(%esi,%ecx,4), %esi
230 addl
%ebx, %eax C initial carry
(from _1c
)
233 movl
%edx, %ebx C
high carry
234 leal ifelse
(UNROLL_BYTES
,256,128) (%edi,%ecx,4), %edi
238 movl
%eax, %ecx C
low carry
240 cmovnz
( %ebx, %ecx) C
high,low carry other way around
C Second form of the entry address calculation, adding L(entry)-L(here) to
C %edx.  NOTE(review): looks like the position independent variant, with
C %edx first holding the address of L(here) -- confirm.
251 C See mpn/x86/README about old gas bugs
252 leal (%edx,%ecx,1), %edx
253 addl $L(entry)-L(here), %edx
261 C
-----------------------------------------------------------
C Unrolled loop body.  %edi is advanced by UNROLL_BYTES here and %esi by
C the same amount further down.
C
C The forloop emits UNROLL_COUNT/CHUNK_COUNT copies of a chunk, with
C CHUNK_COUNT set to 2.  For chunk i, disp0 is i*4*CHUNK_COUNT, less 128
C when UNROLL_BYTES is 256, and disp1 is disp0+4.  Within a chunk M4_inst
C is applied to the destination at disp0 with %ecx and at disp1 with %ebx.
C NOTE(review): the instructions between the loads and the M4_inst lines,
C and the end of the forloop, are not visible in these lines.
273 C VAR_COUNTER loop counter
275 C 15 code bytes per limb
277 addl $UNROLL_BYTES, %edi
280 deflit(CHUNK_COUNT,2)
281 forloop(`i', 0, UNROLL_COUNT
/CHUNK_COUNT
-1, `
282 deflit
(`disp0
', eval(i*4*CHUNK_COUNT ifelse(UNROLL_BYTES,256,-128)))
283 deflit(`disp1', eval
(disp0
+ 4))
285 Zdisp
( movl
, disp0
,(%esi), %eax)
287 Zdisp
( M4_inst
,%ecx, disp0
,(%edi))
292 movl disp1
(%esi), %eax
294 M4_inst
%ebx, disp1
(%edi)
301 leal UNROLL_BYTES(%esi), %esi
306 deflit(`disp0', eval
(UNROLL_BYTES ifelse
(UNROLL_BYTES
,256,-128)))
308 M4_inst
%ecx, disp0
(%edi)