1 dnl AMD K6 mpn_addmul_1
/mpn_submul_1
-- add or subtract mpn multiple.
3 dnl Copyright
1999-2003, 2005 Free Software Foundation
, Inc.
5 dnl
This file is part of the GNU MP Library.
7 dnl The GNU MP Library is free software
; you can redistribute it and/or modify
8 dnl it under the terms of
either:
10 dnl
* the GNU Lesser General
Public License as published by the Free
11 dnl Software Foundation
; either version 3 of the License, or (at your
12 dnl option
) any later version.
16 dnl
* the GNU General
Public License as published by the Free Software
17 dnl Foundation
; either version 2 of the License, or (at your option) any
20 dnl
or both
in parallel
, as here.
22 dnl The GNU MP Library is distributed
in the hope that it will be useful
, but
23 dnl WITHOUT ANY WARRANTY
; without even the implied warranty of MERCHANTABILITY
24 dnl
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
Public License
27 dnl You should have received copies of the GNU General
Public License
and the
28 dnl GNU Lesser General
Public License along with the GNU MP Library. If
not,
29 dnl see
https://www.gnu.
org/licenses
/.
31 include(`..
/config.m4
')
36 C P6 model 0-8,10-12 5.94
37 C P6 model 9 (Banias) 5.51
38 C P6 model 13 (Dothan) 5.57
39 C P4 model 0 (Willamette)
41 C P4 model 2 (Northwood)
42 C P4 model 3 (Prescott)
44 C AMD K6 7.65-8.5 (data dependent)
49 dnl K6: large multipliers small multipliers
50 dnl UNROLL_COUNT cycles/limb cycles/limb
56 dnl Maximum possible unrolling with the current code is 32.
58 dnl Unrolling to 16 limbs/loop makes the unrolled loop fit exactly in a 256
59 dnl byte block, which might explain the good speed at that unrolling.
61 deflit(UNROLL_COUNT, 16)
64 ifdef(`OPERATION_addmul_1', `
66 define
(M4_function_1
, mpn_addmul_1
)
67 define
(M4_function_1c
, mpn_addmul_1c
)
68 ',`ifdef(`OPERATION_submul_1', `
70 define
(M4_function_1
, mpn_submul_1
)
71 define
(M4_function_1c
, mpn_submul_1c
)
72 ',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1
75 MULFUNC_PROLOGUE
(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c
)
78 C mp_limb_t mpn_addmul_1
(mp_ptr dst
, mp_srcptr src
, mp_size_t
size,
80 C mp_limb_t mpn_addmul_1c
(mp_ptr dst
, mp_srcptr src
, mp_size_t
size,
81 C mp_limb_t mult
, mp_limb_t carry
);
82 C mp_limb_t mpn_submul_1
(mp_ptr dst
, mp_srcptr src
, mp_size_t
size,
84 C mp_limb_t mpn_submul_1c
(mp_ptr dst
, mp_srcptr src
, mp_size_t
size,
85 C mp_limb_t mult
, mp_limb_t carry
);
87 C The jadcl0
()s
in the unrolled
loop makes the speed data dependent.
Small
88 C multipliers
(most significant few bits clear
) result
in few carry bits
and
89 C speeds up to
7.65 cycles
/limb are attained.
Large multipliers
(most
90 C significant few bits set
) make the carry bits
50/50 and lead to something
91 C more like
8.4 c
/l. With adcl
's both of these would be 9.3 c/l.
93 C It's important that the gains for jadcl0 on
small multipliers don
't come
94 C at the cost of slowing down other data. Tests on uniformly distributed
95 C random data, designed to confound branch prediction, show about a 7%
96 C speed-up using jadcl0 over adcl (8.93 versus 9.57 cycles/limb, with all
97 C overheads included).
99 C In the simple loop, jadcl0() measures slower than adcl (11.9-14.7 versus
100 C 11.0 cycles/limb), and hence isn't used.
102 C
In the simple
loop, note that running
ecx from negative to zero
and using
103 C it as an index
in the two
movs wouldn
't help. It would save one
104 C instruction (2*addl+loop becoming incl+jnz), but there's nothing unpaired
105 C that would be collapsed by
this.
107 C Attempts at a simpler main
loop, with less unrolling
, haven
't yielded much
108 C success, generally running over 9 c/l.
114 C jadcl0() being faster than adcl $0 seems to be an artifact of two things,
115 C firstly the instruction decoding and secondly the fact that there's a
116 C carry bit for the jadcl0 only on average about
1/4 of the time.
118 C The code
in the unrolled
loop decodes something like the following.
122 C M4_inst
%esi, disp
(%edi) 1
124 C movl
%edx, %esi \
1
127 C
1: movl disp
(%ebx), %eax /
131 C
In a back
-to
-back style
test this measures
7 with the
jnc not taken
, or 8
132 C with it taken
(both when correctly predicted
).
This is opposite to the
133 C measurements showing
small multipliers running faster than
large ones.
134 C Don
't really know why.
136 C It's
not clear how much branch misprediction might be costing. The K6
137 C doco says it will be
1 to
4 cycles
, but presumably it
's near the low end
138 C of that range to get the measured results.
141 C In the code the two carries are more or less the preceding mul product and
142 C the calculation is roughly
146 C where b=2^32 is the size of a limb, x*y is the two carry limbs, and u and
147 C v are the two limbs it's added to
(being the
low of the next
mul, and a
148 C limb from the destination
).
150 C To get a carry requires x
*y
+u
*b
+v
>= b^
2, which is u
*b
+v
>= b^
2-x
*y
, and
151 C there are b^
2-(b^
2-x
*y
) = x
*y many such values
, giving a probability of
152 C x
*y
/b^
2. If x
, y
, u
and v are random
and uniformly distributed between
0
153 C
and b
-1, then the total probability can be summed over x
and y
,
155 C
1 b
-1 b
-1 x
*y
1 b
*(b
-1) b
*(b
-1)
156 C
--- * sum sum
--- = --- * ------- * ------- = 1/4
157 C b^
2 x
=0 y
=1 b^
2 b^
4 2 2
159 C Actually it
's a very tiny bit less than 1/4 of course. If y is fixed,
160 C then the probability is 1/2*y/b thus varying linearly between 0 and 1/2.
164 deflit
(UNROLL_THRESHOLD
, 9)
166 deflit(UNROLL_THRESHOLD, 6)
169 defframe
(PARAM_CARRY
, 20)
170 defframe
(PARAM_MULTIPLIER
,16)
171 defframe
(PARAM_SIZE
, 12)
172 defframe
(PARAM_SRC
, 8)
173 defframe
(PARAM_DST
, 4)
178 PROLOGUE
(M4_function_1c
)
181 movl PARAM_CARRY, %esi
185 PROLOGUE(M4_function_1)
188 xorl
%esi, %esi C initial carry
191 movl PARAM_SIZE
, %ecx
199 cmpl $UNROLL_THRESHOLD
, %ecx
209 movl PARAM_MULTIPLIER, %ebp
230 M4_inst %eax, -4(%edi)
249 C -----------------------------------------------------------------------------
250 C The unrolled loop uses a "two carry limbs" scheme. At the top of the loop
251 C the carries are ecx=lo, esi=hi, then they swap for each limb processed.
252 C For the computed jump an odd size means they start one way around, an even
255 C VAR_JUMP holds the computed jump temporarily because there's
not enough
256 C registers at the point of doing the
mul for the initial two carry limbs.
258 C The
add/adc for the initial carry
in %esi is necessary only for the
259 C mpn_addmul
/submul_1c
entry points. Duplicating the startup code to
260 C eliminate
this for the plain mpn_add
/submul_1 doesn
't seem like a good
263 dnl overlapping with parameters already fetched
264 define(VAR_COUNTER, `PARAM_SIZE')
265 define
(VAR_JUMP
, `PARAM_DST
')
282 shrl $UNROLL_LOG2, %edx
283 andl $UNROLL_MASK, %ecx
285 movl %edx, VAR_COUNTER
291 C 15 code bytes per limb
296 leal L(entry) (%edx,%ecx,1), %edx
298 movl
(%ebx), %eax C src
low limb
300 movl PARAM_MULTIPLIER
, %ebp
305 addl
%esi, %eax C initial carry
(from _1c
)
309 leal
4(%ebx,%ecx,4), %ebx
310 movl
%edx, %esi C
high carry
313 leal
(%edi,%ecx,4), %edi
316 movl
%eax, %ecx C
low carry
319 movl
%esi, %ecx C
high,low carry other way around
329 C See mpn/x86/README about old gas bugs
330 leal (%edx,%ecx,1), %edx
331 addl $L(entry)-L(here), %edx
337 C
-----------------------------------------------------------
349 C 15 code bytes per limb
351 leal UNROLL_BYTES(%edi), %edi
354 forloop(`i', 0, UNROLL_COUNT
/2-1, `
355 deflit
(`disp0
', eval(2*i*4))
356 deflit(`disp1', eval
(disp0
+ 4))
358 Zdisp
( movl
, disp0
,(%ebx), %eax)
360 Zdisp
( M4_inst
,%ecx, disp0
,(%edi))
365 movl disp1
(%ebx), %eax
367 M4_inst
%esi, disp1
(%edi)
375 leal UNROLL_BYTES(%ebx), %ebx
380 M4_inst %ecx, UNROLL_BYTES(%edi)