1 dnl AMD K7 mpn_mul_basecase -- multiply two mpn numbers.
3 dnl Copyright 1999-2002 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of either:
10 dnl * the GNU Lesser General Public License as published by the Free
11 dnl Software Foundation; either version 3 of the License, or (at your
12 dnl option) any later version.
16 dnl * the GNU General Public License as published by the Free Software
17 dnl Foundation; either version 2 of the License, or (at your option) any
20 dnl or both in parallel, as here.
22 dnl The GNU MP Library is distributed in the hope that it will be useful, but
23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
27 dnl You should have received copies of the GNU General Public License and the
28 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
29 dnl see https://www.gnu.org/licenses/.
31 include(`..
/config.m4
')
34 C K7: approx 4.42 cycles per cross product at around 20x20 limbs (16
35 C limbs/loop unrolling).
39 dnl K7 UNROLL_COUNT cycles/product (at around 20x20)
43 dnl Maximum possible with the current code is 32.
45 dnl At 32 the typical 13-26 limb sizes from the karatsuba code will get
46 dnl done with a straight run through a block of code, no inner loop. Using
47 dnl 32 gives 1k of code, but the k7 has a 64k L1 code cache.
49 deflit(UNROLL_COUNT, 32)
52 C void mpn_mul_basecase (mp_ptr wp,
53 C mp_srcptr xp, mp_size_t xsize,
54 C mp_srcptr yp, mp_size_t ysize);
56 C Calculate xp,xsize multiplied by yp,ysize, storing the result in
59 C This routine is essentially the same as mpn/generic/mul_basecase.c, but
60 C it's faster because it does most of the mpn_addmul_1() startup
61 C calculations only once. The saving is 15-25% on typical sizes coming from
62 C the Karatsuba multiply code.
65 deflit(UNROLL_THRESHOLD, 5)
67 deflit
(UNROLL_THRESHOLD
, 5)
70 defframe(PARAM_YSIZE,20)
71 defframe(PARAM_YP, 16)
72 defframe(PARAM_XSIZE,12)
78 PROLOGUE(mpn_mul_basecase)
81 movl PARAM_XSIZE
, %ecx
85 movl
(%eax), %eax C yp
low limb
88 ja L
(xsize_more_than_two
)
89 je L
(two_by_something
)
92 C one limb by one limb
102 C
-----------------------------------------------------------------------------
106 pushl %ebx defframe_pushl(`SAVE_EBX')
107 movl
%eax, %ecx C yp
low limb
110 pushl
%esi defframe_pushl
(`SAVE_ESI
')
113 movl (%edx), %eax C xp low limb
117 C two limbs by one limb
123 movl %edx, %esi C carry
142 C -----------------------------------------------------------------------------
143 C Could load yp earlier into another register.
155 dnl FRAME carries on from previous
157 mull %ecx C xp[0] * yp[0]
159 push %edi defframe_pushl(`SAVE_EDI')
160 movl
%edx, %edi C carry
, for wp
[1]
165 mull
%ecx C xp
[1] * yp
[0]
171 movl
4(%ecx), %ecx C yp
[1]
174 movl
4(%esi), %eax C xp
[1]
175 movl
%edx, %edi C carry
, for wp
[2]
177 mull
%ecx C xp
[1] * yp
[1]
182 movl
(%esi), %eax C xp
[0]
184 movl
%edx, %esi C carry
, for wp
[3]
186 mull
%ecx C xp
[0] * yp
[1]
203 C
-----------------------------------------------------------------------------
205 L
(xsize_more_than_two
):
207 C The first limb of yp is processed with a simple mpn_mul_1 style loop
208 C inline. Unrolling this doesn't seem worthwhile since it's only run once
209 C (whereas the addmul below is run ysize-1 many times). A call to the
210 C actual mpn_mul_1 will be slowed down by the call and parameter pushing and
211 C popping, and doesn't seem likely to be worthwhile on the typical 13-26
212 C limb operations the Karatsuba code calls here with.
222 dnl FRAME doesn't carry on from previous
, no pushes yet here
223 defframe
(`SAVE_EBX
',-4)
224 defframe(`SAVE_ESI',-8)
225 defframe
(`SAVE_EDI
',-12)
226 defframe(`SAVE_EBP',-16)
241 leal
(%edx,%ecx,4), %esi C xp
end
243 leal
(%edi,%ecx,4), %edi C wp
end of mul1
250 C
ecx counter
, negative
256 movl
(%esi,%ecx,4), %eax
261 movl
%eax, (%edi,%ecx,4)
269 movl PARAM_YSIZE
, %edx
270 movl PARAM_XSIZE
, %ecx
272 movl
%ebx, (%edi) C final carry
275 jnz L
(ysize_more_than_one
)
288 L
(ysize_more_than_one
):
289 cmpl $UNROLL_THRESHOLD
, %ecx
295 C
-----------------------------------------------------------------------------
296 C simple addmul looping
306 leal
4(%eax,%edx,4), %ebp C yp
end
310 movl
(%esi,%ecx,4), %eax C xp
low limb
311 movl
%edx, PARAM_YSIZE C
-(ysize
-1)
314 xorl
%ebx, %ebx C initial carry
315 movl
%ecx, PARAM_XSIZE C
-(xsize
-1)
318 movl
(%ebp,%edx,4), %ebp C yp second lowest limb
- multiplier
319 jmp L
(simple_outer_entry
)
322 C this is offset 0x121 so close enough to aligned
324 C
ebp ysize counter
, negative
327 movl PARAM_XSIZE
, %ecx C
-(xsize
-1)
328 xorl
%ebx, %ebx C carry
330 movl
%ebp, PARAM_YSIZE
331 addl
$4, %edi C next position
in wp
333 movl
(%edx,%ebp,4), %ebp C yp limb
- multiplier
334 movl
-4(%esi,%ecx,4), %eax C xp
low limb
337 L
(simple_outer_entry
):
342 C
ecx loop counter
(negative
)
353 addl
%ebx, (%edi,%ecx,4)
354 movl
(%esi,%ecx,4), %eax
364 movl PARAM_YSIZE
, %ebp
374 jnz L
(simple_outer_top
)
388 C
-----------------------------------------------------------------------------
390 C The unrolled loop is the same as in mpn_addmul_1(), see that code for some
393 C VAR_ADJUST is the negative of how many limbs the leals in the inner loop
394 C increment xp and wp. This is used to adjust back xp and wp, and rshifted
395 C to give an initial VAR_COUNTER at the top of the outer loop.
397 C VAR_COUNTER is for the unrolled loop, running from VAR_ADJUST/UNROLL_COUNT
398 C up to -1, inclusive.
400 C VAR_JMP is the computed jump into the unrolled loop.
402 C VAR_XP_LOW is the least significant limb of xp, which is needed at the
403 C start of the unrolled loop.
405 C PARAM_YSIZE is the outer loop counter, going from -(ysize-1) up to -1,
408 C PARAM_YP is offset appropriately so that the PARAM_YSIZE counter can be
409 C added to give the location of the next limb of yp, which is the multiplier
410 C in the unrolled loop.
412 C The trick with VAR_ADJUST means it's only necessary to do one fetch in the
413 C outer loop to take care of xp, wp and the inner loop counter.
415 defframe(VAR_COUNTER, -20)
416 defframe(VAR_ADJUST, -24)
417 defframe(VAR_JMP, -28)
418 defframe(VAR_XP_LOW, -32)
419 deflit(VAR_EXTRA_SPACE, 16)
432 movl 4(%eax), %ebp C multiplier (yp second limb)
433 leal 4(%eax,%edx,4), %eax C yp adjust for ysize indexing
439 movl %edx, PARAM_YSIZE
440 leal UNROLL_COUNT-2(%ecx), %ebx C (xsize-1)+UNROLL_COUNT-1
443 movl (%esi), %eax C xp low limb
444 andl $-UNROLL_MASK-1, %ebx
447 subl $VAR_EXTRA_SPACE, %esp
448 deflit(`FRAME',16+VAR_EXTRA_SPACE
)
450 andl $UNROLL_MASK
, %ecx
452 movl
%ebx, VAR_ADJUST
456 sarl $UNROLL_LOG2
, %ebx
458 C
17 code bytes per limb
463 leal L
(unroll_entry
) (%ecx,%edx,1), %ecx
467 movl %eax, VAR_XP_LOW
469 leal 4(%edi,%edx,4), %edi C wp and xp, adjust for unrolling,
470 leal 4(%esi,%edx,4), %esi C and start at second limb
471 jmp L(unroll_outer_entry)
476 C See mpn/x86/README about old gas bugs
477 leal
(%ecx,%edx,1), %ecx
478 addl $L
(unroll_entry
)-L
(unroll_here
), %ecx
484 C --------------------------------------------------------------------------
487 C ebp ysize counter, negative
489 movl VAR_ADJUST, %ebx
492 movl VAR_XP_LOW, %eax
493 movl %ebp, PARAM_YSIZE C store incremented ysize counter
495 leal 4(%edi,%ebx,4), %edi
496 leal (%esi,%ebx,4), %esi
497 sarl $UNROLL_LOG2, %ebx
499 movl (%edx,%ebp,4), %ebp C yp next multiplier
502 L(unroll_outer_entry):
505 testb $1, %cl C and clear carry bit
506 movl %ebx, VAR_COUNTER
510 cmovz( %eax, %ecx) C eax into low carry, zero into high carry limb
513 C Extra fetch of VAR_JMP is bad, but registers are tight
517 C -----------------------------------------------------------------------------
526 C ebp yp multiplier limb
528 C VAR_COUNTER loop counter, negative
534 deflit(CHUNK_COUNT,2)
535 forloop(`i', 0, UNROLL_COUNT
/CHUNK_COUNT
-1, `
536 deflit
(`disp0
', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
537 deflit(`disp1', eval
(disp0
+ 4))
539 Zdisp
( movl
, disp0
,(%esi), %eax)
544 Zdisp
( addl
, %ecx, disp0
,(%edi))
550 movl disp1
(%esi), %eax
555 addl
%ebx, disp1
(%edi)
563 leal UNROLL_BYTES(%esi), %esi
564 leal UNROLL_BYTES(%edi), %edi
574 C edi wp, pointing at second last limb
577 C carry flag to be added to high
579 deflit(`disp0', ifelse
(UNROLL_BYTES
,256,-128))
580 deflit
(`disp1
', eval(disp0-0 + 4))
582 movl PARAM_YSIZE, %ebp
584 addl %ecx, disp0(%edi)
589 movl %edx, disp1(%edi)
590 jnz L(unroll_outer_top)