1 dnl Intel P6 mpn_mul_basecase
-- multiply two mpn numbers.
3 dnl Copyright
1999-2003 Free Software Foundation
, Inc.
5 dnl
This file is part of the GNU MP Library.
7 dnl The GNU MP Library is free software
; you can redistribute it and/or modify
8 dnl it under the terms of
either:
10 dnl
* the GNU Lesser General
Public License as published by the Free
11 dnl Software Foundation
; either version 3 of the License, or (at your
12 dnl option
) any later version.
16 dnl
* the GNU General
Public License as published by the Free Software
17 dnl Foundation
; either version 2 of the License, or (at your option) any
20 dnl
or both
in parallel
, as here.
22 dnl The GNU MP Library is distributed
in the hope that it will be useful
, but
23 dnl WITHOUT ANY WARRANTY
; without even the implied warranty of MERCHANTABILITY
24 dnl
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
Public License
27 dnl You should have received copies of the GNU General
Public License
and the
28 dnl GNU Lesser General
Public License along with the GNU MP Library. If
not,
29 dnl see
https://www.gnu.
org/licenses
/.
31 include(`..
/config.m4
')
34 C P6: approx 6.5 cycles per cross product (16 limbs/loop unrolling).
37 dnl P6 UNROLL_COUNT cycles/product (approx)
41 dnl Maximum possible with the current code is 32.
43 deflit(UNROLL_COUNT, 16)
46 C void mpn_mul_basecase (mp_ptr wp,
47 C mp_srcptr xp, mp_size_t xsize,
48 C mp_srcptr yp, mp_size_t ysize);
50 C This routine is essentially the same as mpn/generic/mul_basecase.c, but
51 C it's faster because it does most of the mpn_addmul_1
() startup
52 C calculations only once.
dnl NOTE(review): UNROLL_THRESHOLD is given the same value, 5, by both
dnl deflit calls below.  Presumably they are the two arms of a conditional
dnl whose ifdef lines are not visible in this view -- confirm before merging
dnl them.  Further down, the xsize value loaded from PARAM_XSIZE is compared
dnl against this threshold just before the simple addmul loop.
55 deflit(UNROLL_THRESHOLD, 5)
57 deflit
(UNROLL_THRESHOLD
, 5)
dnl Named slots for three of the incoming parameters listed in the prototype
dnl comment above (ysize, yp, xsize).  The numeric arguments are presumably
dnl byte offsets of each parameter -- confirm against the defframe macro.
dnl PARAM_WP is referenced later in this file but its defframe is not
dnl visible in this view.
60 defframe(PARAM_YSIZE,20)
61 defframe(PARAM_YP, 16)
62 defframe(PARAM_XSIZE,12)
69 PROLOGUE(mpn_mul_basecase)
72 movl PARAM_XSIZE
, %ecx
78 movl
(%eax), %eax C yp
[0]
80 ja L
(xsize_more_than_two
)
81 je L
(two_by_something
)
84 C one limb by one limb
94 C
-----------------------------------------------------------------------------
98 dnl re-use parameter space
99 define(SAVE_EBX, `PARAM_XSIZE')
100 define
(SAVE_ESI
, `PARAM_YSIZE
')
104 movl %eax, %ecx C yp[0]
106 movl %esi, SAVE_ESI C save esi
110 movl (%edx), %eax C xp[0]
114 C two limbs by one limb
126 movl %edx, %esi C carry
144 C -----------------------------------------------------------------------------
156 dnl more parameter space re-use
157 define(SAVE_EDI, `PARAM_WP')
159 mull
%ecx C xp
[0] * yp
[0]
162 movl
%edx, %edi C carry
, for wp
[1]
167 mull
%ecx C xp
[1] * yp
[0]
173 movl
4(%ecx), %ecx C yp
[1]
176 movl
4(%esi), %eax C xp
[1]
177 movl
%edx, %edi C carry
, for wp
[2]
179 mull
%ecx C xp
[1] * yp
[1]
182 movl
(%esi), %eax C xp
[0]
185 movl
%edx, %esi C carry
, for wp
[3]
187 mull
%ecx C xp
[0] * yp
[1]
206 C
-----------------------------------------------------------------------------
208 L
(xsize_more_than_two
):
210 C The first limb of yp is processed with a simple mpn_mul_1
loop running at
211 C about
6.2 c
/l. Unrolling
this doesn
't seem worthwhile since it's only run
212 C once
(whereas the addmul_1 below is run ysize
-1 many times
). A
call to
213 C mpn_mul_1 would be slowed down by the parameter pushing
and popping etc
,
214 C
and doesn
't seem likely to be worthwhile on the typical sizes reaching
215 C here from the Karatsuba code.
225 defframe(`SAVE_EBX', -4)
226 defframe
(`SAVE_ESI
', -8)
227 defframe(`SAVE_EDI', -12)
228 defframe
(`SAVE_EBP
', -16)
229 defframe(VAR_COUNTER, -20) dnl for use in the unroll case
230 defframe(VAR_ADJUST, -24)
231 defframe(VAR_JMP, -28)
232 defframe(VAR_SWAP, -32)
233 defframe(VAR_XP_LOW, -36)
234 deflit(STACK_SPACE, 36)
236 subl $STACK_SPACE, %esp
237 deflit(`FRAME',STACK_SPACE
)
249 leal
(%edx,%ecx,4), %esi C xp
end
251 leal
(%edi,%ecx,4), %edi C wp
end of mul1
258 C
ecx counter
, negative
264 movl
(%esi,%ecx,4), %eax
269 movl
%eax, (%edi,%ecx,4)
277 movl PARAM_YSIZE
, %edx
279 movl
%ebx, (%edi) C final carry
280 movl PARAM_XSIZE
, %ecx
283 jz L
(done
) C if ysize
==1
285 cmpl $UNROLL_THRESHOLD
, %ecx
290 C
-----------------------------------------------------------------------------
291 C simple addmul looping
301 leal
4(%eax,%edx,4), %ebp C yp
end
305 movl
%edx, PARAM_YSIZE C
-(ysize
-1)
306 movl
(%esi,%ecx,4), %eax C xp
low limb
309 movl
%ecx, PARAM_XSIZE C
-(xsize
-1)
310 xorl
%ebx, %ebx C initial carry
313 movl
(%ebp,%edx,4), %ebp C yp second lowest limb
- multiplier
314 jmp L
(simple_outer_entry
)
318 C
ebp ysize counter
, negative
322 movl PARAM_XSIZE
, %ecx C
-(xsize
-1)
323 xorl
%ebx, %ebx C carry
325 movl
%ebp, PARAM_YSIZE
326 addl
$4, %edi C next position
in wp
328 movl
(%edx,%ebp,4), %ebp C yp limb
- multiplier
330 movl
-4(%esi,%ecx,4), %eax C xp
low limb
333 L
(simple_outer_entry
):
338 C
ecx loop counter
(negative
)
349 addl
%ebx, (%edi,%ecx,4)
350 movl
(%esi,%ecx,4), %eax
355 jnz L
(simple_inner_top
)
358 C separate code for last limb so outer
loop counter handling can be
363 movl PARAM_YSIZE
, %ebp
374 jnz L
(simple_outer_top
)
391 C
-----------------------------------------------------------------------------
393 C The unrolled
loop is the same as
in mpn_addmul_1
, see that code for some
396 C VAR_ADJUST is the negative of how many limbs the leals
in the inner
loop
397 C increment xp
and wp.
This is used to adjust xp
and wp
, and is rshifted to
398 C give an initial VAR_COUNTER at the top of the outer
loop.
400 C VAR_COUNTER is for the unrolled
loop, running from VAR_ADJUST
/UNROLL_COUNT
401 C up to
-1, inclusive.
403 C VAR_JMP is the computed jump
into the unrolled
loop.
405 C VAR_SWAP is
0 if xsize odd
or 0xFFFFFFFF if xsize even
, used to swap the
406 C initial
ebx and ecx on
entry to the unrolling.
408 C VAR_XP_LOW is the least significant limb of xp
, which is needed at the
409 C start of the unrolled
loop.
411 C PARAM_YSIZE is the outer
loop counter
, going from
-(ysize
-1) up to
-1,
414 C PARAM_YP is
offset appropriately so that the PARAM_YSIZE counter can be
415 C added to give the location of the next limb of yp
, which is the multiplier
416 C
in the unrolled
loop.
418 C The trick with the VAR_ADJUST value means it
's only necessary to do one
419 C fetch in the outer loop to take care of xp, wp and the inner loop counter.
433 movl 4(%eax), %ebp C multiplier (yp second limb)
434 leal 4(%eax,%edx,4), %eax C yp adjust for ysize indexing
440 movl %edx, PARAM_YSIZE
441 leal UNROLL_COUNT-2(%ecx), %ebx C (xsize-1)+UNROLL_COUNT-1
444 movl (%esi), %eax C xp low limb
445 andl $-UNROLL_MASK-1, %ebx
446 negl %ecx C -(xsize-1)
449 andl $UNROLL_MASK, %ecx
451 movl %ebx, VAR_ADJUST
455 movl %eax, VAR_XP_LOW
456 sarl $UNROLL_LOG2, %ebx
459 C 15 code bytes per limb
464 leal L(unroll_inner_entry) (%ecx,%edx,1), %ecx
471 sarl
$31, %edx C
0 or -1 as xsize odd
or even
472 leal
4(%edi,%ecx,4), %edi C wp
and xp
, adjust for unrolling
,
473 leal
4(%esi,%ecx,4), %esi C
and start at second limb
476 jmp L
(unroll_outer_entry
)
481 C See mpn/x86/README about old gas bugs
482 leal (%ecx,%edx,1), %ecx
483 addl $L(unroll_inner_entry)-L(unroll_here), %ecx
489 C
--------------------------------------------------------------------------
498 C
ebp ysize counter
, negative
500 movl VAR_ADJUST
, %ebx
503 movl VAR_XP_LOW
, %eax
504 movl
%ebp, PARAM_YSIZE C store incremented ysize counter
506 leal eval
(UNROLL_BYTES
+ 4) (%edi,%ebx,4), %edi
507 leal
(%esi,%ebx,4), %esi
508 sarl $UNROLL_LOG2
, %ebx
510 movl
(%edx,%ebp,4), %ebp C yp next multiplier
512 L
(unroll_outer_entry
):
515 movl
%ebx, VAR_COUNTER
516 movl
%edx, %ebx C carry
high
517 movl
%eax, %ecx C carry
low
524 xorl
%eax, %ebx C carries other way for odd index
530 C
-----------------------------------------------------------------------------
539 C
ebp yp multiplier limb
541 C VAR_COUNTER
loop counter
, negative
545 addl $UNROLL_BYTES
, %edi
547 L
(unroll_inner_entry
):
549 deflit
(CHUNK_COUNT
,2)
550 forloop
(`i
', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
551 deflit(`disp0', eval
(i
*CHUNK_COUNT
*4 ifelse
(UNROLL_BYTES
,256,-128)))
552 deflit
(`disp1
', eval(disp0 + 4))
554 Zdisp( movl, disp0,(%esi), %eax)
556 Zdisp( addl, %ecx, disp0,(%edi))
557 adcl %eax, %ebx C new carry low
559 adcl $0, %ecx C new carry high
561 movl disp1(%esi), %eax
563 addl %ebx, disp1(%edi)
564 adcl %eax, %ecx C new carry low
566 adcl $0, %ebx C new carry high
571 leal UNROLL_BYTES
(%esi), %esi
572 jnz L
(unroll_inner_top
)
580 C
edi wp
, pointing at second last limb
583 deflit
(`disp0
', eval(UNROLL_BYTES ifelse(UNROLL_BYTES,256,-128)))
584 deflit(`disp1', eval
(disp0
+ 4))
586 movl PARAM_YSIZE
, %ebp
587 addl
%ecx, disp0
(%edi) C carry
low
592 movl
%ebx, disp1
(%edi) C carry
high
593 jnz L
(unroll_outer_top
)