beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / x86 / k6 / mul_basecase.asm
blob7030001c3f21ec2d9a47dfd13d65e5e85de6f419
dnl  AMD K6 mpn_mul_basecase -- multiply two mpn numbers.

dnl  Copyright 1999-2003 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
include(`../config.m4')


C K6: approx 9.0 cycles per cross product on 30x30 limbs (with 16 limbs/loop
C     unrolling).


dnl  K6: UNROLL_COUNT  cycles/product (approx)
dnl
dnl            8            9.75
dnl           16            9.3
dnl           32            9.3
dnl
dnl  Maximum possible with the current code is 32.
dnl
dnl  With 16 the inner unrolled loop fits exactly in a 256 byte block, which
dnl  might explain its good performance.

deflit(UNROLL_COUNT, 16)


C void mpn_mul_basecase (mp_ptr wp,
C                        mp_srcptr xp, mp_size_t xsize,
C                        mp_srcptr yp, mp_size_t ysize);
C
C Calculate xp,xsize multiplied by yp,ysize, storing the result in
C wp,xsize+ysize.
C
C This routine is essentially the same as mpn/generic/mul_basecase.c, but
C it's faster because it does most of the mpn_addmul_1() entry code only
C once.  The saving is about 10-20% on typical sizes coming from the
C Karatsuba multiply code.
C
C Enhancements:
C
C The mul_1 loop is about 8.5 c/l, which is slower than mpn_mul_1 at 6.25
C c/l.  Could call mpn_mul_1 when ysize is big enough to make it worthwhile.
C
C The main unrolled addmul loop could be shared by mpn_addmul_1, using some
C extra stack setups and maybe 2 or 3 wasted cycles at the end.  Code saving
C would be 256 bytes.

ifdef(`PIC',`
deflit(UNROLL_THRESHOLD, 8)
',`
deflit(UNROLL_THRESHOLD, 8)
')

defframe(PARAM_YSIZE,20)
defframe(PARAM_YP,   16)
defframe(PARAM_XSIZE,12)
defframe(PARAM_XP,   8)
defframe(PARAM_WP,   4)

	TEXT
	ALIGN(32)
PROLOGUE(mpn_mul_basecase)
deflit(`FRAME',0)

	movl	PARAM_XSIZE, %ecx
	movl	PARAM_YP, %eax

	movl	PARAM_XP, %edx
	movl	(%eax), %eax	C yp low limb

	C Dispatch on xsize: 1, 2, or more limbs (ysize <= xsize is the
	C mpn_mul_basecase calling convention).
	cmpl	$2, %ecx
	ja	L(xsize_more_than_two_limbs)
	je	L(two_by_something)


	C one limb by one limb

	movl	(%edx), %edx	C xp low limb
	movl	PARAM_WP, %ecx

	mull	%edx

	movl	%eax, (%ecx)
	movl	%edx, 4(%ecx)
	ret


C -----------------------------------------------------------------------------
L(two_by_something):
	decl	PARAM_YSIZE
	pushl	%ebx
deflit(`FRAME',4)

	movl	PARAM_WP, %ebx
	pushl	%esi
deflit(`FRAME',8)

	movl	%eax, %ecx	C yp low limb
	movl	(%edx), %eax	C xp low limb

	movl	%edx, %esi	C xp
	jnz	L(two_by_two)	C flags still from "decl PARAM_YSIZE"


	C two limbs by one limb

	mull	%ecx

	movl	%eax, (%ebx)
	movl	4(%esi), %eax

	movl	%edx, %esi	C carry

	mull	%ecx

	addl	%eax, %esi
	movl	%esi, 4(%ebx)

	adcl	$0, %edx

	movl	%edx, 8(%ebx)
	popl	%esi

	popl	%ebx
	ret


C -----------------------------------------------------------------------------
	ALIGN(16)
L(two_by_two):
	C eax	xp low limb
	C ebx	wp
	C ecx	yp low limb
	C edx
	C esi	xp
	C edi
	C ebp
deflit(`FRAME',8)

	mull	%ecx		C xp[0] * yp[0]

	push	%edi
deflit(`FRAME',12)
	movl	%eax, (%ebx)

	movl	4(%esi), %eax
	movl	%edx, %edi	C carry, for wp[1]

	mull	%ecx		C xp[1] * yp[0]

	addl	%eax, %edi
	movl	PARAM_YP, %ecx

	adcl	$0, %edx

	movl	%edi, 4(%ebx)
	movl	4(%ecx), %ecx	C yp[1]

	movl	4(%esi), %eax	C xp[1]
	movl	%edx, %edi	C carry, for wp[2]

	mull	%ecx		C xp[1] * yp[1]

	addl	%eax, %edi

	adcl	$0, %edx

	movl	(%esi), %eax	C xp[0]
	movl	%edx, %esi	C carry, for wp[3]

	mull	%ecx		C xp[0] * yp[1]

	addl	%eax, 4(%ebx)
	adcl	%edx, %edi
	adcl	$0, %esi

	movl	%edi, 8(%ebx)
	popl	%edi

	movl	%esi, 12(%ebx)
	popl	%esi

	popl	%ebx
	ret


C -----------------------------------------------------------------------------
	ALIGN(16)
L(xsize_more_than_two_limbs):

C The first limb of yp is processed with a simple mpn_mul_1 style loop
C inline.  Unrolling this doesn't seem worthwhile since it's only run once
C (whereas the addmul below is run ysize-1 many times).  A call to the
C actual mpn_mul_1 will be slowed down by the call and parameter pushing and
C popping, and doesn't seem likely to be worthwhile on the typical 10-20
C limb operations the Karatsuba code calls here with.

	C eax	yp[0]
	C ebx
	C ecx	xsize
	C edx	xp
	C esi
	C edi
	C ebp
deflit(`FRAME',0)

	pushl	%edi		defframe_pushl(SAVE_EDI)
	pushl	%ebp		defframe_pushl(SAVE_EBP)

	movl	PARAM_WP, %edi
	pushl	%esi		defframe_pushl(SAVE_ESI)

	movl	%eax, %ebp
	pushl	%ebx		defframe_pushl(SAVE_EBX)

	leal	(%edx,%ecx,4), %ebx	C xp end
	xorl	%esi, %esi

	leal	(%edi,%ecx,4), %edi	C wp end of mul1
	negl	%ecx			C count up through zero


L(mul1):
	C eax	scratch
	C ebx	xp end
	C ecx	counter, negative
	C edx	scratch
	C esi	carry
	C edi	wp end of mul1
	C ebp	multiplier

	movl	(%ebx,%ecx,4), %eax

	mull	%ebp

	addl	%esi, %eax
	movl	$0, %esi

	adcl	%edx, %esi

	movl	%eax, (%edi,%ecx,4)
	incl	%ecx

	jnz	L(mul1)


	movl	PARAM_YSIZE, %edx
	movl	%esi, (%edi)		C final carry

	movl	PARAM_XSIZE, %ecx
	decl	%edx

	jnz	L(ysize_more_than_one_limb)

	C ysize==1, the mul1 above was the whole product
	popl	%ebx
	popl	%esi
	popl	%ebp
	popl	%edi
	ret


L(ysize_more_than_one_limb):
	cmpl	$UNROLL_THRESHOLD, %ecx
	movl	PARAM_YP, %eax

	jae	L(unroll)


C -----------------------------------------------------------------------------
C Simple addmul loop.
C
C Using ebx and edi pointing at the ends of their respective locations saves
C a couple of instructions in the outer loop.  The inner loop is still 11
C cycles, the same as the simple loop in aorsmul_1.asm.

	C eax	yp
	C ebx	xp end
	C ecx	xsize
	C edx	ysize-1
	C esi
	C edi	wp end of mul1
	C ebp

	movl	4(%eax), %ebp		C multiplier
	negl	%ecx

	movl	%ecx, PARAM_XSIZE	C -xsize
	xorl	%esi, %esi		C initial carry

	leal	4(%eax,%edx,4), %eax	C yp end
	negl	%edx

	movl	%eax, PARAM_YP
	movl	%edx, PARAM_YSIZE	C -(ysize-1)

	jmp	L(simple_outer_entry)


	C aligning here saves a couple of cycles
	ALIGN(16)
L(simple_outer_top):
	C edx	ysize counter, negative

	movl	PARAM_YP, %eax		C yp end
	xorl	%esi, %esi		C carry

	movl	PARAM_XSIZE, %ecx	C -xsize
	movl	%edx, PARAM_YSIZE

	movl	(%eax,%edx,4), %ebp	C yp limb multiplier

L(simple_outer_entry):
	addl	$4, %edi		C next limb up in the destination


L(simple_inner):
	C eax	scratch
	C ebx	xp end
	C ecx	counter, negative
	C edx	scratch
	C esi	carry
	C edi	wp end of this addmul
	C ebp	multiplier

	movl	(%ebx,%ecx,4), %eax

	mull	%ebp

	addl	%esi, %eax
	movl	$0, %esi

	adcl	$0, %edx
	addl	%eax, (%edi,%ecx,4)
	adcl	%edx, %esi

	incl	%ecx
	jnz	L(simple_inner)


	movl	PARAM_YSIZE, %edx
	movl	%esi, (%edi)		C final carry of this addmul

	incl	%edx
	jnz	L(simple_outer_top)


	popl	%ebx
	popl	%esi
	popl	%ebp
	popl	%edi
	ret


C -----------------------------------------------------------------------------
C Unrolled loop.
C
C The unrolled inner loop is the same as in aorsmul_1.asm, see that code for
C some comments.
C
C VAR_COUNTER is for the inner loop, running from VAR_COUNTER_INIT down to
C 0, inclusive.
C
C VAR_JMP is the computed jump into the unrolled loop.
C
C PARAM_XP and PARAM_WP get offset appropriately for where the unrolled loop
C is entered.
C
C VAR_XP_LOW is the least significant limb of xp, which is needed at the
C start of the unrolled loop.  This can't just be fetched through the xp
C pointer because of the offset applied to it.
C
C PARAM_YSIZE is the outer loop counter, going from -(ysize-1) up to -1,
C inclusive.
C
C PARAM_YP is offset appropriately so that the PARAM_YSIZE counter can be
C added to give the location of the next limb of yp, which is the multiplier
C in the unrolled loop.
C
C PARAM_WP is similarly offset so that the PARAM_YSIZE counter can be added
C to give the starting point in the destination for each unrolled loop (this
C point is one limb upwards for each limb of yp processed).
C
C Having PARAM_YSIZE count negative to zero means it's not necessary to
C store new values of PARAM_YP and PARAM_WP on each loop.  Those values on
C the stack remain constant and on each loop an leal adjusts them with the
C PARAM_YSIZE counter value.


defframe(VAR_COUNTER,      -20)
defframe(VAR_COUNTER_INIT, -24)
defframe(VAR_JMP,          -28)
defframe(VAR_XP_LOW,       -32)
deflit(VAR_STACK_SPACE, 16)

dnl  For some strange reason using (%esp) instead of 0(%esp) is a touch
dnl  slower in this code, hence the defframe empty-if-zero feature is
dnl  disabled.
dnl
dnl  If VAR_COUNTER is at (%esp), the effect is worse.  In this case the
dnl  unrolled loop is 255 instead of 256 bytes, but quite how this affects
dnl  anything isn't clear.
dnl
define(`defframe_empty_if_zero_disabled',1)


L(unroll):
	C eax	yp (not used)
	C ebx	xp end (not used)
	C ecx	xsize
	C edx	ysize-1
	C esi
	C edi	wp end of mul1 (not used)
	C ebp
deflit(`FRAME', 16)

	leal	-2(%ecx), %ebp	C one limb processed at start,
	decl	%ecx		C and ebp is one less

	shrl	$UNROLL_LOG2, %ebp
	negl	%ecx

	subl	$VAR_STACK_SPACE, %esp
deflit(`FRAME', 16+VAR_STACK_SPACE)
	andl	$UNROLL_MASK, %ecx

	movl	%ecx, %esi
	shll	$4, %ecx

	movl	%ebp, VAR_COUNTER_INIT
	negl	%esi

	C 15 code bytes per limb: the entry point is
	C ecx*16 - ecx = ecx*15 bytes into the unrolled loop
ifdef(`PIC',`
	call	L(pic_calc)
L(unroll_here):
',`
	leal	L(unroll_entry) (%ecx,%esi,1), %ecx
')

	movl	PARAM_XP, %ebx
	movl	%ebp, VAR_COUNTER

	movl	PARAM_WP, %edi
	movl	%ecx, VAR_JMP

	movl	(%ebx), %eax
	leal	4(%edi,%esi,4), %edi	C wp adjust for unrolling and mul1

	leal	(%ebx,%esi,4), %ebx	C xp adjust for unrolling

	movl	%eax, VAR_XP_LOW

	movl	%ebx, PARAM_XP
	movl	PARAM_YP, %ebx

	leal	(%edi,%edx,4), %ecx	C wp adjust for ysize indexing
	movl	4(%ebx), %ebp		C multiplier (yp second limb)

	leal	4(%ebx,%edx,4), %ebx	C yp adjust for ysize indexing

	movl	%ecx, PARAM_WP

	leal	1(%esi), %ecx	C adjust parity for decl %ecx above

	movl	%ebx, PARAM_YP
	negl	%edx

	movl	%edx, PARAM_YSIZE	C outer loop counter, -(ysize-1)

	jmp	L(unroll_outer_entry)


ifdef(`PIC',`
L(pic_calc):
	C See mpn/x86/README about old gas bugs
	leal	(%ecx,%esi,1), %ecx
	addl	$L(unroll_entry)-L(unroll_here), %ecx
	addl	(%esp), %ecx

	ret_internal
')


C -----------------------------------------------------------------------------
	C Aligning here saves a couple of cycles per loop.  Using 32 doesn't
	C cost any extra space, since the inner unrolled loop below is
	C aligned to 32.
	ALIGN(32)
L(unroll_outer_top):
	C edx	ysize

	movl	PARAM_YP, %eax
	movl	%edx, PARAM_YSIZE	C incremented ysize counter

	movl	PARAM_WP, %edi

	movl	VAR_COUNTER_INIT, %ebx
	movl	(%eax,%edx,4), %ebp	C next multiplier

	movl	PARAM_XSIZE, %ecx
	leal	(%edi,%edx,4), %edi	C adjust wp for where we are in yp

	movl	VAR_XP_LOW, %eax
	movl	%ebx, VAR_COUNTER

L(unroll_outer_entry):
	mull	%ebp

	C using testb is a tiny bit faster than testl
	testb	$1, %cl

	movl	%eax, %ecx	C low carry
	movl	VAR_JMP, %eax

	movl	%edx, %esi	C high carry
	movl	PARAM_XP, %ebx

	jnz	L(unroll_noswap)
	movl	%ecx, %esi	C high,low carry other way around

	movl	%edx, %ecx
L(unroll_noswap):

	jmp	*%eax		C computed entry into the unrolled loop


C -----------------------------------------------------------------------------
	ALIGN(32)
L(unroll_top):
	C eax	scratch
	C ebx	xp
	C ecx	carry low
	C edx	scratch
	C esi	carry high
	C edi	wp
	C ebp	multiplier
	C VAR_COUNTER loop counter
	C
	C 15 code bytes each limb

	leal	UNROLL_BYTES(%edi), %edi

L(unroll_entry):
deflit(CHUNK_COUNT,2)
forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
	deflit(`disp0', eval(i*CHUNK_COUNT*4))
	deflit(`disp1', eval(disp0 + 4))
	deflit(`disp2', eval(disp1 + 4))

	movl	disp1(%ebx), %eax
	mull	%ebp
Zdisp(	addl,	%ecx, disp0,(%edi))
	adcl	%eax, %esi
	movl	%edx, %ecx
	jadcl0(	%ecx)

	movl	disp2(%ebx), %eax
	mull	%ebp
	addl	%esi, disp1(%edi)
	adcl	%eax, %ecx
	movl	%edx, %esi
	jadcl0(	%esi)
')

	decl	VAR_COUNTER
	leal	UNROLL_BYTES(%ebx), %ebx

	jns	L(unroll_top)


	C store the final carry pair above the last limbs written
	movl	PARAM_YSIZE, %edx
	addl	%ecx, UNROLL_BYTES(%edi)

	adcl	$0, %esi

	incl	%edx
	movl	%esi, UNROLL_BYTES+4(%edi)

	jnz	L(unroll_outer_top)


	movl	SAVE_ESI, %esi
	movl	SAVE_EBP, %ebp

	movl	SAVE_EDI, %edi
	movl	SAVE_EBX, %ebx

	addl	$FRAME, %esp
	ret

EPILOGUE()