1 dnl Intel P6 mpn_sqr_basecase
-- square an mpn number.
3 dnl Copyright
1999, 2000, 2002 Free Software Foundation
, Inc.
5 dnl
This file is part of the GNU MP Library.
7 dnl The GNU MP Library is free software
; you can redistribute it and/or modify
8 dnl it under the terms of
either:
10 dnl
* the GNU Lesser General
Public License as published by the Free
11 dnl Software Foundation
; either version 3 of the License, or (at your
12 dnl option
) any later version.
16 dnl
* the GNU General
Public License as published by the Free Software
17 dnl Foundation
; either version 2 of the License, or (at your option) any
20 dnl
or both
in parallel
, as here.
22 dnl The GNU MP Library is distributed
in the hope that it will be useful
, but
23 dnl WITHOUT ANY WARRANTY
; without even the implied warranty of MERCHANTABILITY
24 dnl
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
Public License
27 dnl You should have received copies of the GNU General
Public License
and the
28 dnl GNU Lesser General
Public License along with the GNU MP Library. If
not,
29 dnl see
https://www.gnu.
org/licenses
/.
31 include(`..
/config.m4
')
34 C P6: approx 4.0 cycles per cross product, or 7.75 cycles per triangular
35 C product (measured on the speed difference between 20 and 40 limbs,
36 C which is the Karatsuba recursing range).
39 dnl These are the same as in mpn/x86/k6/sqr_basecase.asm, see that file for
40 dnl a description. The only difference here is that UNROLL_COUNT can go up
41 dnl to 64 (not 63) making SQR_TOOM2_THRESHOLD_MAX 67.
43 deflit(SQR_TOOM2_THRESHOLD_MAX, 67)
45 ifdef(`SQR_TOOM2_THRESHOLD_OVERRIDE',
46 `define
(`SQR_TOOM2_THRESHOLD
',SQR_TOOM2_THRESHOLD_OVERRIDE)')
48 m4_config_gmp_mparam
(`SQR_TOOM2_THRESHOLD
')
49 deflit(UNROLL_COUNT, eval(SQR_TOOM2_THRESHOLD-3))
52 C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
54 C The algorithm is basically the same as mpn/generic/sqr_basecase.c, but a
55 C lot of function call overheads are avoided, especially when the given size
58 C The code size might look a bit excessive, but not all of it is executed so
59 C it won't all get
into the code cache. The
1x1
, 2x2
and 3x3 special cases
60 C clearly apply only to those sizes
; mid sizes like 10x10 only need part of
61 C the unrolled addmul
; and big sizes like 40x40 that do use the full
62 C unrolling will least be making good use of it
, because
40x40 will take
63 C something like
7000 cycles.
65 defframe
(PARAM_SIZE
,12)
66 defframe
(PARAM_SRC
, 8)
67 defframe
(PARAM_DST
, 4)
71 PROLOGUE
(mpn_sqr_basecase
)
86 C -----------------------------------------------------------------------------
101 C -----------------------------------------------------------------------------
108 defframe(SAVE_ESI, -4)
109 defframe(SAVE_EBX, -8)
110 defframe(SAVE_EDI, -12)
111 defframe(SAVE_EBP, -16)
112 deflit(`STACK_SPACE',16)
114 subl $STACK_SPACE
, %esp
115 deflit
(`FRAME
',STACK_SPACE)
123 movl %eax, (%ecx) C dst[0]
127 movl %edx, %ebx C dst[1]
132 movl %eax, %edi C dst[2]
136 movl %edx, %ebp C dst[3]
138 mull 4(%esi) C src[0]*src[1]
165 C -----------------------------------------------------------------------------
173 pushl
%esi defframe_pushl
(`SAVE_ESI
')
180 C -----------------------------------------------------------------------------
191 pushl %ebp defframe_pushl(`SAVE_EBP')
192 pushl
%edi defframe_pushl
(`SAVE_EDI
')
194 mull %eax C src[0] ^ 2
202 mull %eax C src[1] ^ 2
208 pushl %ebx defframe_pushl(`SAVE_EBX')
210 mull
%eax C src
[2] ^
2
217 mull
4(%esi) C src
[0] * src
[1]
224 mull
8(%esi) C src
[0] * src
[2]
232 mull
8(%esi) C src
[1] * src
[2]
241 C
esi zero
, will be dst
[5]
279 adcl
%esi, %eax C no carry
out of
this
289 C
-----------------------------------------------------------------------------
290 defframe
(VAR_COUNTER
,-20)
291 defframe
(VAR_JMP
, -24)
292 deflit
(`STACK_SPACE
',24)
302 deflit(`FRAME',4) dnl
%esi already pushed
304 C First multiply src
[0]*src
[1..
size-1] and store at dst
[1..
size].
306 subl $STACK_SPACE
-FRAME
, %esp
307 deflit
(`FRAME
',STACK_SPACE)
314 subl %edx, %ecx C -(size-1)
317 movl $0, %ebx C initial carry
319 leal (%esi,%edx,4), %esi C &src[size]
320 movl %eax, %ebp C multiplier
322 leal -4(%edi,%edx,4), %edi C &dst[size-1]
325 C This loop runs at just over 6 c/l.
330 C ecx counter, limbs, negative, -(size-1) to -1
344 movl %eax, 4(%edi,%ecx,4)
353 C Addmul src[n]*src[n+1..size-1] at dst[2*n-1...], for each n=1..size-2.
355 C The last two addmuls, which are the bottom right corner of the product
356 C triangle, are left to the end. These are src[size-3]*src[size-2,size-1]
357 C and src[size-2]*src[size-1]. If size is 4 then it's only these corner
358 C cases that need to be done.
360 C The unrolled code is the same as mpn_addmul_1
(), see that routine for some
363 C VAR_COUNTER is the outer
loop, running from
-(size-4) to
-1, inclusive.
365 C VAR_JMP is the computed jump
into the unrolled code
, stepped by one code
366 C chunk each outer
loop.
368 dnl
This is also hard
-coded
in the address calculation below.
369 deflit
(CODE_BYTES_PER_LIMB
, 15)
371 dnl With
&src
[size] and &dst
[size-1] pointers
, the displacements
in the
372 dnl unrolled code fit
in a
byte for UNROLL_COUNT values up to
32, but above
373 dnl that an
offset must be added to them.
375 ifelse
(eval
(UNROLL_COUNT
>32),1,
376 eval
((UNROLL_COUNT
-32)*4),
387 movl PARAM_SIZE
, %ecx
396 ifelse
(OFFSET,0,,`subl $
OFFSET, %esi')
402 leal L(unroll_inner_end)-eval(2*CODE_BYTES_PER_LIMB)(%ecx,%edx), %ecx
406 ifelse
(OFFSET,0,,`subl $
OFFSET, %edi')
408 C The calculated jump mustn't be before the start of the available
409 C code.
This is the limit that UNROLL_COUNT puts on the src operand
410 C
size, but checked here using the jump address directly.
413 `movl_text_address
( L
(unroll_inner_start
), %eax)
417 C -----------------------------------------------------------------------------
421 C ebx high limb to store
423 C edx VAR_COUNTER, limbs, negative
424 C esi &src[size], constant
425 C edi dst ptr, second highest limb of last addmul
428 movl -12+OFFSET(%esi,%edx,4), %ebp C multiplier
429 movl %edx, VAR_COUNTER
431 movl -8+OFFSET(%esi,%edx,4), %eax C first limb of multiplicand
435 define(cmovX,`ifelse(eval(UNROLL_COUNT%2),1,`cmovz($@)',`cmovnz
($@
)')')
439 movl
%edx, %ebx C
high carry
442 movl
%ecx, %edx C jump
444 movl
%eax, %ecx C
low carry
445 leal CODE_BYTES_PER_LIMB
(%edx), %edx
447 cmovX
( %ebx, %ecx) C
high carry reverse
448 cmovX
( %eax, %ebx) C
low carry reverse
453 C Must be on an even address here so the
low bit of the jump address
454 C will indicate which way around
ecx/ebx should start.
458 L
(unroll_inner_start
):
467 C
15 code bytes each limb
468 C
ecx/ebx reversed on each chunk
470 forloop
(`i
', UNROLL_COUNT, 1, `
471 deflit(`disp_src', eval
(-i
*4 + OFFSET))
472 deflit
(`disp_dst
', eval(disp_src))
474 m4_assert(`disp_src>=-128 && disp_src<128')
475 m4_assert
(`disp_dst
>=-128 && disp_dst
<128')
478 Zdisp( movl, disp_src,(%esi), %eax)
480 Zdisp( addl, %ebx, disp_dst,(%edi))
485 dnl
this one comes
out last
486 Zdisp
( movl
, disp_src
,(%esi), %eax)
488 Zdisp
( addl
, %ecx, disp_dst
,(%edi))
496 addl
%ebx, m4_empty_if_zero
(OFFSET)(%edi)
498 movl VAR_COUNTER
, %edx
501 movl
%ecx, m4_empty_if_zero
(OFFSET+4)(%edi)
505 jnz L
(unroll_outer_top
)
514 C -----------------------------------------------------------------------------
549 movl PARAM_SIZE, %ecx
560 C Left shift of dst[1..2*size-2], the bit shifted out becomes dst[2*size-1].
562 subl $1, %ecx C size-1
563 xorl %eax, %eax C ready for final adcl, and clear carry
572 C ecx counter, size-1 to 1
573 C edx size-1 (for later use)
574 C esi src (for later use)
575 C edi dst, incrementing
588 movl %eax, 4(%edi) C dst most significant limb
589 movl (%esi), %eax C src[0]
591 leal 4(%esi,%edx,4), %esi C &src[size]
592 subl %edx, %ecx C -(size-1)
595 C Now add in the squares on the diagonal, src[0]^2, src[1]^2, ...,
596 C src[size-1]^2. dst[0] hasn't yet been set at all yet
, and just gets the
597 C
low limb of src
[0]^
2.
602 movl
%eax, (%edi,%ecx,8) C dst
[0]
608 C
ecx counter
, negative
614 movl
(%esi,%ecx,4), %eax
619 addl
%ebx, 4(%edi,%ecx,8)
620 adcl
%eax, 8(%edi,%ecx,8)
630 addl
%edx, 4(%edi) C dst most significant limb
639 C
-----------------------------------------------------------------------------
643 addl $L(unroll_inner_end)-L(here)-eval(2*CODE_BYTES_PER_LIMB), %ecx