new beta-0.90.0
[luatex.git] / source / libs / gmp / gmp-src / mpn / x86 / k6 / aorsmul_1.asm
blobeaa92ebb24b69dae457b6d27a545c9726651c48b
1 dnl AMD K6 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple.
3 dnl Copyright 1999-2003, 2005 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
6 dnl
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of either:
9 dnl
10 dnl * the GNU Lesser General Public License as published by the Free
11 dnl Software Foundation; either version 3 of the License, or (at your
12 dnl option) any later version.
13 dnl
14 dnl or
15 dnl
16 dnl * the GNU General Public License as published by the Free Software
17 dnl Foundation; either version 2 of the License, or (at your option) any
18 dnl later version.
19 dnl
20 dnl or both in parallel, as here.
21 dnl
22 dnl The GNU MP Library is distributed in the hope that it will be useful, but
23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
25 dnl for more details.
26 dnl
27 dnl You should have received copies of the GNU General Public License and the
28 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
29 dnl see https://www.gnu.org/licenses/.
31 include(`../config.m4')
34 C cycles/limb
35 C P5
36 C P6 model 0-8,10-12 5.94
37 C P6 model 9 (Banias) 5.51
38 C P6 model 13 (Dothan) 5.57
39 C P4 model 0 (Willamette)
40 C P4 model 1 (?)
41 C P4 model 2 (Northwood)
42 C P4 model 3 (Prescott)
43 C P4 model 4 (Nocona)
44 C AMD K6 7.65-8.5 (data dependent)
45 C AMD K7
46 C AMD K8
49 dnl K6: large multipliers small multipliers
50 dnl UNROLL_COUNT cycles/limb cycles/limb
51 dnl 4 9.5 7.78
52 dnl 8 9.0 7.78
53 dnl 16 8.4 7.65
54 dnl 32 8.4 8.2
55 dnl
56 dnl Maximum possible unrolling with the current code is 32.
57 dnl
58 dnl Unrolling to 16 limbs/loop makes the unrolled loop fit exactly in a 256
59 dnl byte block, which might explain the good speed at that unrolling.
61 deflit(UNROLL_COUNT, 16)
64 ifdef(`OPERATION_addmul_1', `
65 define(M4_inst, addl)
66 define(M4_function_1, mpn_addmul_1)
67 define(M4_function_1c, mpn_addmul_1c)
68 ',`ifdef(`OPERATION_submul_1', `
69 define(M4_inst, subl)
70 define(M4_function_1, mpn_submul_1)
71 define(M4_function_1c, mpn_submul_1c)
72 ',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1
73 ')')')
75 MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c)
78 C mp_limb_t mpn_addmul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
79 C mp_limb_t mult);
80 C mp_limb_t mpn_addmul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
81 C mp_limb_t mult, mp_limb_t carry);
82 C mp_limb_t mpn_submul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
83 C mp_limb_t mult);
84 C mp_limb_t mpn_submul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
85 C mp_limb_t mult, mp_limb_t carry);
87 C The jadcl0()s in the unrolled loop makes the speed data dependent. Small
88 C multipliers (most significant few bits clear) result in few carry bits and
89 C speeds up to 7.65 cycles/limb are attained. Large multipliers (most
90 C significant few bits set) make the carry bits 50/50 and lead to something
91 C more like 8.4 c/l. With adcl's both of these would be 9.3 c/l.
93 C It's important that the gains for jadcl0 on small multipliers don't come
94 C at the cost of slowing down other data. Tests on uniformly distributed
95 C random data, designed to confound branch prediction, show about a 7%
96 C speed-up using jadcl0 over adcl (8.93 versus 9.57 cycles/limb, with all
97 C overheads included).
99 C In the simple loop, jadcl0() measures slower than adcl (11.9-14.7 versus
100 C 11.0 cycles/limb), and hence isn't used.
102 C In the simple loop, note that running ecx from negative to zero and using
103 C it as an index in the two movs wouldn't help. It would save one
104 C instruction (2*addl+loop becoming incl+jnz), but there's nothing unpaired
105 C that would be collapsed by this.
107 C Attempts at a simpler main loop, with less unrolling, haven't yielded much
108 C success, generally running over 9 c/l.
111 C jadcl0
112 C ------
114 C jadcl0() being faster than adcl $0 seems to be an artifact of two things,
115 C firstly the instruction decoding and secondly the fact that there's a
116 C carry bit for the jadcl0 only on average about 1/4 of the time.
118 C The code in the unrolled loop decodes something like the following.
120 C decode cycles
121 C mull %ebp 2
122 C M4_inst %esi, disp(%edi) 1
123 C adcl %eax, %ecx 2
124 C movl %edx, %esi \ 1
125 C jnc 1f /
126 C incl %esi \ 1
127 C 1: movl disp(%ebx), %eax /
128 C ---
131 C In a back-to-back style test this measures 7 with the jnc not taken, or 8
132 C with it taken (both when correctly predicted). This is opposite to the
133 C measurements showing small multipliers running faster than large ones.
134 C Don't really know why.
136 C It's not clear how much branch misprediction might be costing. The K6
137 C doco says it will be 1 to 4 cycles, but presumably it's near the low end
138 C of that range to get the measured results.
141 C In the code the two carries are more or less the preceding mul product and
142 C the calculation is roughly
144 C x*y + u*b+v
146 C where b=2^32 is the size of a limb, x*y is the two carry limbs, and u and
147 C v are the two limbs it's added to (being the low of the next mul, and a
148 C limb from the destination).
150 C To get a carry requires x*y+u*b+v >= b^2, which is u*b+v >= b^2-x*y, and
151 C there are b^2-(b^2-x*y) = x*y many such values, giving a probability of
152 C x*y/b^2. If x, y, u and v are random and uniformly distributed between 0
153 C and b-1, then the total probability can be summed over x and y,
155 C 1 b-1 b-1 x*y 1 b*(b-1) b*(b-1)
156 C --- * sum sum --- = --- * ------- * ------- = 1/4
157 C b^2 x=0 y=1 b^2 b^4 2 2
159 C Actually it's a very tiny bit less than 1/4 of course. If y is fixed,
160 C then the probability is 1/2*y/b thus varying linearly between 0 and 1/2.
163 ifdef(`PIC',`
164 deflit(UNROLL_THRESHOLD, 9)
166 deflit(UNROLL_THRESHOLD, 6)
169 defframe(PARAM_CARRY, 20)
170 defframe(PARAM_MULTIPLIER,16)
171 defframe(PARAM_SIZE, 12)
172 defframe(PARAM_SRC, 8)
173 defframe(PARAM_DST, 4)
175 TEXT
176 ALIGN(32)
178 PROLOGUE(M4_function_1c)
179 pushl %esi
180 deflit(`FRAME',4)
181 movl PARAM_CARRY, %esi
182 jmp L(start_nc)
183 EPILOGUE()
185 PROLOGUE(M4_function_1)
186 push %esi
187 deflit(`FRAME',4)
188 xorl %esi, %esi C initial carry
190 L(start_nc):
191 movl PARAM_SIZE, %ecx
192 pushl %ebx
193 deflit(`FRAME',8)
195 movl PARAM_SRC, %ebx
196 pushl %edi
197 deflit(`FRAME',12)
199 cmpl $UNROLL_THRESHOLD, %ecx
200 movl PARAM_DST, %edi
202 pushl %ebp
203 deflit(`FRAME',16)
204 jae L(unroll)
207 C simple loop
209 movl PARAM_MULTIPLIER, %ebp
211 L(simple):
212 C eax scratch
213 C ebx src
214 C ecx counter
215 C edx scratch
216 C esi carry
217 C edi dst
218 C ebp multiplier
220 movl (%ebx), %eax
221 addl $4, %ebx
223 mull %ebp
225 addl $4, %edi
226 addl %esi, %eax
228 adcl $0, %edx
230 M4_inst %eax, -4(%edi)
232 adcl $0, %edx
234 movl %edx, %esi
235 loop L(simple)
238 popl %ebp
239 popl %edi
241 popl %ebx
242 movl %esi, %eax
244 popl %esi
249 C -----------------------------------------------------------------------------
250 C The unrolled loop uses a "two carry limbs" scheme. At the top of the loop
251 C the carries are ecx=lo, esi=hi, then they swap for each limb processed.
252 C For the computed jump an odd size means they start one way around, an even
253 C size the other.
255 C VAR_JUMP holds the computed jump temporarily because there's not enough
256 C registers at the point of doing the mul for the initial two carry limbs.
258 C The add/adc for the initial carry in %esi is necessary only for the
259 C mpn_addmul/submul_1c entry points. Duplicating the startup code to
260 C eliminate this for the plain mpn_add/submul_1 doesn't seem like a good
261 C idea.
263 dnl overlapping with parameters already fetched
264 define(VAR_COUNTER, `PARAM_SIZE')
265 define(VAR_JUMP, `PARAM_DST')
267 L(unroll):
268 C eax
269 C ebx src
270 C ecx size
271 C edx
272 C esi initial carry
273 C edi dst
274 C ebp
276 movl %ecx, %edx
277 decl %ecx
279 subl $2, %edx
280 negl %ecx
282 shrl $UNROLL_LOG2, %edx
283 andl $UNROLL_MASK, %ecx
285 movl %edx, VAR_COUNTER
286 movl %ecx, %edx
288 shll $4, %edx
289 negl %ecx
291 C 15 code bytes per limb
292 ifdef(`PIC',`
293 call L(pic_calc)
294 L(here):
296 leal L(entry) (%edx,%ecx,1), %edx
298 movl (%ebx), %eax C src low limb
300 movl PARAM_MULTIPLIER, %ebp
301 movl %edx, VAR_JUMP
303 mull %ebp
305 addl %esi, %eax C initial carry (from _1c)
306 jadcl0( %edx)
309 leal 4(%ebx,%ecx,4), %ebx
310 movl %edx, %esi C high carry
312 movl VAR_JUMP, %edx
313 leal (%edi,%ecx,4), %edi
315 testl $1, %ecx
316 movl %eax, %ecx C low carry
318 jz L(noswap)
319 movl %esi, %ecx C high,low carry other way around
321 movl %eax, %esi
322 L(noswap):
324 jmp *%edx
327 ifdef(`PIC',`
328 L(pic_calc):
329 C See mpn/x86/README about old gas bugs
330 leal (%edx,%ecx,1), %edx
331 addl $L(entry)-L(here), %edx
332 addl (%esp), %edx
333 ret_internal
337 C -----------------------------------------------------------
338 ALIGN(32)
339 L(top):
340 deflit(`FRAME',16)
341 C eax scratch
342 C ebx src
343 C ecx carry lo
344 C edx scratch
345 C esi carry hi
346 C edi dst
347 C ebp multiplier
349 C 15 code bytes per limb
351 leal UNROLL_BYTES(%edi), %edi
353 L(entry):
354 forloop(`i', 0, UNROLL_COUNT/2-1, `
355 deflit(`disp0', eval(2*i*4))
356 deflit(`disp1', eval(disp0 + 4))
358 Zdisp( movl, disp0,(%ebx), %eax)
359 mull %ebp
360 Zdisp( M4_inst,%ecx, disp0,(%edi))
361 adcl %eax, %esi
362 movl %edx, %ecx
363 jadcl0( %ecx)
365 movl disp1(%ebx), %eax
366 mull %ebp
367 M4_inst %esi, disp1(%edi)
368 adcl %eax, %ecx
369 movl %edx, %esi
370 jadcl0( %esi)
373 decl VAR_COUNTER
375 leal UNROLL_BYTES(%ebx), %ebx
376 jns L(top)
379 popl %ebp
380 M4_inst %ecx, UNROLL_BYTES(%edi)
382 popl %edi
383 movl %esi, %eax
385 popl %ebx
386 jadcl0( %eax)
388 popl %esi
391 EPILOGUE()