dnl  AMD K7 mpn_sqr_basecase -- square an mpn number.

dnl  Copyright 1999-2002 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
include(`../config.m4')


C K7: approx 2.3 cycles/crossproduct, or 4.55 cycles/triangular product
C (measured on the speed difference between 25 and 50 limbs, which is
C roughly the Karatsuba recursing range).


dnl  These are the same as mpn/x86/k6/sqr_basecase.asm, see that code for
dnl  some comments.

dnl  Hard upper limit on the SQR_TOOM2_THRESHOLD this unrolled code can
dnl  support; the displacement/jump arithmetic below depends on it.
deflit(SQR_TOOM2_THRESHOLD_MAX, 66)

dnl  Allow the build system to force a specific threshold for testing.
ifdef(`SQR_TOOM2_THRESHOLD_OVERRIDE',
`define(`SQR_TOOM2_THRESHOLD',SQR_TOOM2_THRESHOLD_OVERRIDE)')

dnl  Pick up the tuned SQR_TOOM2_THRESHOLD from gmp-mparam.h.
m4_config_gmp_mparam(`SQR_TOOM2_THRESHOLD')
dnl  Number of unrolled addmul chunks generated below.
deflit(UNROLL_COUNT, eval(SQR_TOOM2_THRESHOLD-3))


C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
C
C With a SQR_TOOM2_THRESHOLD around 50 this code is about 1500 bytes,
C which is quite a bit, but is considered good value since squares big
C enough to use most of the code will be spending quite a few cycles in it.


C Stack frame offsets of the incoming parameters (cdecl, 32-bit x86).
defframe(PARAM_SIZE,12)
defframe(PARAM_SRC, 8)
defframe(PARAM_DST, 4)
64 PROLOGUE(mpn_sqr_basecase)
C ------------------------------------------------------------------------------

C ------------------------------------------------------------------------------

C Using the read/modify/write "add"s seems to be faster than saving and
C restoring registers.  Perhaps the loads for the first set hide under the
C mul latency and the second gets store to load forwarding.
106 pushl %ebx FRAME_pushl()
107 movl %eax, %ebx C src
110 movl %edx, %ecx C dst
114 movl %eax, (%ecx) C dst[0]
117 movl %edx, 4(%ecx) C dst[1]
121 movl %eax, 8(%ecx) C dst[2]
124 movl %edx, 12(%ecx) C dst[3]
126 mull 4(%ebx) C src[0]*src[1]
143 C------------------------------------------------------------------------------
144 defframe(SAVE_EBX, -4)
145 defframe(SAVE_ESI, -8)
146 defframe(SAVE_EDI, -12)
147 defframe(SAVE_EBP, -16)
148 deflit(STACK_SPACE, 16)
151 subl $STACK_SPACE, %esp
154 deflit(`FRAME',STACK_SPACE
)
C ------------------------------------------------------------------------------

C Writing out the loads and stores separately at the end of this code comes
C out about 10 cycles faster than using adcls to memory.
168 movl
%eax, %ebx C src
171 movl
%edx, %ecx C dst
175 mull
%eax C src
[0] ^
2
181 mull
%eax C src
[1] ^
2
187 mull
%eax C src
[2] ^
2
193 mull
4(%ebx) C src
[0] * src
[1]
199 mull
8(%ebx) C src
[0] * src
[2]
208 mull
8(%ebx) C src
[1] * src
[2]
C ebx   zero, will be dst[5]

C ------------------------------------------------------------------------------

C First multiply src[0]*src[1..size-1] and store at dst[1..size].
C Further products are added in rather than stored.
278 defframe
(`VAR_COUNTER
',-20)
279 defframe(`VAR_JMP', -24)
280 deflit
(EXTRA_STACK_SPACE
, 8)
284 leal
(%edx,%ecx,4), %edi C
&dst
[size]
288 leal
(%eax,%ecx,4), %esi C
&src
[size]
290 movl
(%eax), %ebp C multiplier
295 subl
$EXTRA_STACK_SPACE
, %esp
296 FRAME_subl_esp
(EXTRA_STACK_SPACE
)
307 movl
(%esi,%ecx,4), %eax
312 movl
%eax, (%edi,%ecx,4)
C Add products src[n]*src[n+1..size-1] at dst[2*n-1...], for each
C n=1..size-2.
C
C The last two products, which are the bottom right corner of the product
C triangle, are left to the end.  These are src[size-3]*src[size-2,size-1]
C and src[size-2]*src[size-1].  If size is 4 then it's only these corner
C cases that need to be done.
C
C The unrolled code is the same as in mpn_addmul_1, see that routine for
C some comments.
C
C VAR_COUNTER is the outer loop, running from -size+4 to -1, inclusive.
C
C VAR_JMP is the computed jump into the unrolled code, stepped by one code
C chunk each outer loop.
C
C K7 does branch prediction on indirect jumps, which is bad since it's a
C different target each time.  There seems no way to avoid this.
dnl  This value also hard coded in some shifts and adds
339 deflit
(CODE_BYTES_PER_LIMB
, 17)
dnl  With the unmodified &src[size] and &dst[size] pointers, the
dnl  displacements in the unrolled code fit in a byte for UNROLL_COUNT
dnl  values up to 31, but above that an offset must be added to them.
346 ifelse
(eval
(UNROLL_COUNT
>31),1,
347 eval
((UNROLL_COUNT
-31)*4),
dnl  Because the last chunk of code is generated differently, a label placed
dnl  at the end doesn't work.  Instead calculate the implied end using the
dnl  start and how many chunks of code there are.
354 deflit(UNROLL_INNER_END,
355 `L(unroll_inner_start)+eval(UNROLL_COUNT*CODE_BYTES_PER_LIMB)')
365 movl PARAM_SIZE
, %ecx
372 ifelse
(OFFSET,0,,`subl $
OFFSET, %edi')
373 ifelse(OFFSET,0,,`subl $OFFSET, %esi')
382 leal UNROLL_INNER_END
-eval
(2*CODE_BYTES_PER_LIMB
)(%ecx,%edx), %ecx
C The calculated jump mustn't come out to before the start of the
C code available.  This is the limit UNROLL_COUNT puts on the src
C operand size, but checked here directly using the jump address.
390 `movl_text_address
(L
(unroll_inner_start
), %eax)
394 C------------------------------------------------------------------------------
398 C ebx high limb to store
400 C edx VAR_COUNTER, limbs, negative
401 C esi &src[size], constant
402 C edi dst ptr, high of last addmul
405 movl -12+OFFSET(%esi,%edx,4), %ebp C next multiplier
406 movl -8+OFFSET(%esi,%edx,4), %eax C first of multiplicand
408 movl %edx, VAR_COUNTER
412 define(cmovX,`ifelse(eval(UNROLL_COUNT%2),0,`cmovz($@)',`cmovnz
($@
)')')
415 movl
%edx, %ebx C
high carry
416 movl
%ecx, %edx C jump
418 movl
%eax, %ecx C
low carry
419 cmovX
( %ebx, %ecx) C
high carry reverse
420 cmovX
( %eax, %ebx) C
low carry reverse
422 leal CODE_BYTES_PER_LIMB
(%edx), %eax
434 addl $UNROLL_INNER_END-eval(2*CODE_BYTES_PER_LIMB)-L(here), %ecx
440 C Must be an even address to preserve the significance of the
low
441 C bit of the jump address indicating which way around
ecx/ebx should
445 L
(unroll_inner_start
):
454 forloop
(`i
', UNROLL_COUNT, 1, `
455 deflit(`disp_src', eval
(-i
*4 + OFFSET))
456 deflit
(`disp_dst
', eval(disp_src - 4))
458 m4_assert(`disp_src>=-128 && disp_src<128')
459 m4_assert
(`disp_dst
>=-128 && disp_dst
<128')
462 Zdisp( movl, disp_src,(%esi), %eax)
467 Zdisp( addl, %ecx, disp_dst,(%edi))
473 dnl
this bit comes
out last
474 Zdisp
( movl
, disp_src
,(%esi), %eax)
479 Zdisp
( addl
, %ebx, disp_dst
,(%edi))
481 ifelse
(forloop_last
,0,
497 addl %ecx, -4+OFFSET(%edi)
502 movl %edx, m4_empty_if_zero(OFFSET) (%edi)
503 movl VAR_COUNTER, %edx
506 jnz L(unroll_outer_top)
C ------------------------------------------------------------------------------

C Left shift of dst[1..2*size-2], high bit shifted out becomes dst[2*size-1].
557 movl PARAM_SIZE
, %eax
559 xorl
%ecx, %ecx C clear carry
561 leal
(%edi,%eax,8), %edi
562 notl
%eax C
-size-1, preserve carry
564 leal
2(%eax), %eax C
-(size-1)
567 C
eax counter
, negative
572 C
edi dst
, pointing just after last limb
583 movl
%eax, -4(%edi) C dst most significant limb
585 movl PARAM_SIZE
, %ecx
C Now add in the squares on the diagonal, src[0]^2, src[1]^2, ...,
C src[size-1]^2.  dst[0] hasn't yet been set at all yet, and just gets the
C low limb of src[0]^2.
592 movl (%esi), %eax C src[0]
596 leal (%esi,%ecx,4), %esi C src point just after last limb
599 movl %eax, (%edi,%ecx,8) C dst[0]
605 C ecx counter, negative
607 C esi src just after last limb
608 C edi dst just after last limb
611 movl (%esi,%ecx,4), %eax
616 addl %ebx, -4(%edi,%ecx,8)
617 adcl %eax, (%edi,%ecx,8)
627 addl %edx, -4(%edi) C dst most significant limb