dnl  AMD K6 mpn_sqr_basecase -- square an mpn number.

dnl  Copyright 1999-2002 Free Software Foundation, Inc.
dnl  This file is part of the GNU MP Library.
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl  or both in parallel, as here.
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  and the GNU Lesser General Public License for more details.
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
dnl  Pull in the GMP asm support macros and build configuration.
include(`../config.m4')
C K6: approx 4.7 cycles per cross product, or 9.2 cycles per triangular
C product (measured on the speed difference between 17 and 33 limbs,
C which is roughly the Karatsuba recursing range).
dnl  SQR_TOOM2_THRESHOLD_MAX is the maximum SQR_TOOM2_THRESHOLD this
dnl  code supports.  This value is used only by the tune program to know
dnl  what it can go up to.  (An attempt to compile with a bigger value will
dnl  trigger some m4_assert()s in the code, making the build fail.)
dnl
dnl  The value is determined by requiring the displacements in the unrolled
dnl  addmul to fit in single bytes.  This means a maximum UNROLL_COUNT of
dnl  63, giving a maximum SQR_TOOM2_THRESHOLD of 66.
48 deflit(SQR_TOOM2_THRESHOLD_MAX, 66)
dnl  Allow a value from the tune program to override config.m4.
dnl  Tune-program override of the config.m4 threshold, when defined.
ifdef(`SQR_TOOM2_THRESHOLD_OVERRIDE',
`define(`SQR_TOOM2_THRESHOLD',SQR_TOOM2_THRESHOLD_OVERRIDE)')
dnl  UNROLL_COUNT is the number of code chunks in the unrolled addmul.  The
dnl  number required is determined by SQR_TOOM2_THRESHOLD, since
dnl  mpn_sqr_basecase only needs to handle sizes < SQR_TOOM2_THRESHOLD.
dnl
dnl  The first addmul is the biggest, and this takes the second least
dnl  significant limb and multiplies it by the third least significant and
dnl  up.  Hence for a maximum operand size of SQR_TOOM2_THRESHOLD-1
dnl  limbs, UNROLL_COUNT needs to be SQR_TOOM2_THRESHOLD-3.
m4_config_gmp_mparam(`SQR_TOOM2_THRESHOLD')
deflit(UNROLL_COUNT, eval(SQR_TOOM2_THRESHOLD-3))
C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
C
C The algorithm is essentially the same as mpn/generic/sqr_basecase.c, but a
C lot of function call overheads are avoided, especially when the given size
C is small.
C
C The code size might look a bit excessive, but not all of it is executed
C and so won't fill up the code cache.  The 1x1, 2x2 and 3x3 special cases
C clearly apply only to those sizes; mid sizes like 10x10 only need part of
C the unrolled addmul; and big sizes like 35x35 that do need all of it will
C at least be getting value for money, because 35x35 spends something like
C 5780 cycles here.
C
C Different values of UNROLL_COUNT give slightly different speeds, between
C 9.0 and 9.2 c/tri-prod measured on the difference between 17 and 33 limbs.
C This isn't a big difference, but it's presumably some alignment effect
C which if understood could give a simple speedup.
C Incoming stack-parameter offsets (relative to %esp on entry).
defframe(PARAM_SIZE,12)
defframe(PARAM_SRC, 8)
defframe(PARAM_DST, 4)
PROLOGUE(mpn_sqr_basecase)
C -----------------------------------------------------------------------------
C -----------------------------------------------------------------------------
133 movl %eax, %ebx C src
154 mull
%edx C src
[0]*src
[1]
C -----------------------------------------------------------------------------
C -----------------------------------------------------------------------------
184 movl %eax, %ebx C src
187 movl %edx, %ecx C dst
189 mull %eax C src[0] ^ 2
197 mull %eax C src[1] ^ 2
205 mull %eax C src[2] ^ 2
213 mull %edx C src[0] * src[1]
224 mull %edx C src[0] * src[2]
233 mull %edx C src[1] * src[2]
C -----------------------------------------------------------------------------
C Local stack slots; STACK_SPACE covers all six 4-byte slots below.
defframe(SAVE_EBX, -4)
defframe(SAVE_ESI, -8)
defframe(SAVE_EDI, -12)
defframe(SAVE_EBP, -16)
defframe(VAR_COUNTER,-20)
defframe(VAR_JMP, -24)
deflit(STACK_SPACE, 24)
C First multiply src[0]*src[1..size-1] and store at dst[1..size].
C
C A test was done calling mpn_mul_1 here to get the benefit of its unrolled
C loop, but this was only a tiny speedup; at 35 limbs it took 24 cycles off
C a 5780 cycle operation, which is not surprising since the loop here is 8
C c/l and mpn_mul_1 is 6.25 c/l.
301 subl $STACK_SPACE, %esp deflit(`FRAME',STACK_SPACE
)
322 movl
(%eax), %ebp C multiplier
323 leal
-1(%ecx), %ecx C
size-1, and pad to a
16 byte boundary
C Addmul src[n]*src[n+1..size-1] at dst[2*n-1...], for each n=1..size-2.
C
C The last two addmuls, which are the bottom right corner of the product
C triangle, are left to the end.  These are src[size-3]*src[size-2,size-1]
C and src[size-2]*src[size-1].  If size is 4 then it's only these corner
C cases that need to be done.
C
C The unrolled code is the same as mpn_addmul_1(), see that routine for some
C comments.
C
C VAR_COUNTER is the outer loop, running from -(size-4) to -1, inclusive.
C
C VAR_JMP is the computed jump into the unrolled code, stepped by one code
C chunk each outer loop.
C
C K6 doesn't do any branch prediction on indirect jumps, which is good
C actually because it's a different target each time.  The unrolled addmul
C is about 3 cycles/limb faster than a simple loop, so the 6 cycle cost of
C the indirect jump is quickly recovered.
dnl  This value is also implicitly encoded in a shift and add.
375 deflit(CODE_BYTES_PER_LIMB, 15)
dnl  With the unmodified &src[size] and &dst[size] pointers, the
dnl  displacements in the unrolled code fit in a byte for UNROLL_COUNT
dnl  values up to 31.  Above that an offset must be added to them.
382 ifelse(eval(UNROLL_COUNT>31),1,
383 eval((UNROLL_COUNT-31)*4),
394 movl PARAM_SIZE, %ecx
402 ` subl $OFFSET, %ebx')
406 ` subl $
OFFSET, %edi')
414 leal L(unroll_inner_end)-eval(2*CODE_BYTES_PER_LIMB)(%ecx,%edx), %ecx
C The calculated jump mustn't be before the start of the available
C code.  This is the limitation UNROLL_COUNT puts on the src operand
C size, but checked here using the jump address directly.
424 movl_text_address( L(unroll_inner_start), %eax)
C -----------------------------------------------------------------------------
C
C ebx  &src[size], constant
C edx  VAR_COUNTER, limbs, negative
C esi  high limb to store
C edi  dst ptr, high of last addmul
440 movl
-12+OFFSET(%ebx,%edx,4), %ebp C multiplier
441 movl
%edx, VAR_COUNTER
443 movl
-8+OFFSET(%ebx,%edx,4), %eax C first limb of multiplicand
449 movl
%edx, %esi C
high carry
450 movl
%ecx, %edx C jump
452 movl
%eax, %ecx C
low carry
453 leal CODE_BYTES_PER_LIMB
(%edx), %edx
C A branch-free version of this using some xors was found to be a
C touch slower than just a conditional jump, despite the jump
C switching between taken and not taken on every loop.
462 ifelse
(eval
(UNROLL_COUNT
%2),0,
463 jz,jnz) L
(unroll_noswap
)
464 movl
%esi, %eax C
high,low carry other way around
C Must be on an even address here so the low bit of the jump address
C will indicate which way around ecx/esi should start.
C
C An attempt was made at padding here to get the end of the unrolled
C code to come out on a good alignment, to save padding before
C L(corner).  This worked, but turned out to run slower than just an
C ALIGN(2).  The reason for this is not clear, it might be related
C to the different speeds on different UNROLL_COUNTs noted above.
484 L
(unroll_inner_start
):
C 15 code bytes each limb
C ecx/esi swapped on each chunk
496 forloop
(`i
', UNROLL_COUNT, 1, `
497 deflit(`disp_src', eval
(-i
*4 + OFFSET))
498 deflit
(`disp_dst
', eval(disp_src - 4))
500 m4_assert(`disp_src>=-128 && disp_src<128')
501 m4_assert
(`disp_dst
>=-128 && disp_dst
<128')
504 Zdisp( movl, disp_src,(%ebx), %eax)
506 Zdisp( addl, %esi, disp_dst,(%edi))
dnl  this one comes out last
512 Zdisp
( movl
, disp_src
,(%ebx), %eax)
514 Zdisp
( addl
, %ecx, disp_dst
,(%edi))
522 addl
%esi, -4+OFFSET(%edi)
524 movl VAR_COUNTER
, %edx
527 movl
%ecx, m4_empty_if_zero
(OFFSET)(%edi)
531 jnz L
(unroll_outer_top
)
C -----------------------------------------------------------------------------
C -----------------------------------------------------------------------------
C Left shift of dst[1..2*size-2], the bit shifted out becomes dst[2*size-1].
C The loop measures about 6 cycles/iteration, though it looks like it should
587 movl PARAM_SIZE, %ecx
590 subl $1, %ecx C size-1 and clear carry
595 xorl %eax, %eax C ready for adcl
C ebx  src (for later use)
C ecx  counter, decrementing
C edx  size-1 (for later use)
C edi  dst, incrementing
616 movl %eax, 4(%edi) C dst most significant limb
617 movl (%ebx), %eax C src[0]
619 leal 4(%ebx,%edx,4), %ebx C &src[size]
620 subl %edx, %ecx C -(size-1)
C -----------------------------------------------------------------------------
C Now add in the squares on the diagonal, src[0]^2, src[1]^2, ...,
C src[size-1]^2.  dst[0] hasn't yet been set at all yet, and just gets the
C low limb of src[0]^2.
631 movl
%eax, (%edi,%ecx,8) C dst
[0]
C ecx  counter, negative
644 movl
(%ebx,%ecx,4), %eax
649 addl
%esi, 4(%edi,%ecx,8)
650 adcl
%eax, 8(%edi,%ecx,8)
660 addl
%edx, 4(%edi) C dst most significant limb
C -----------------------------------------------------------------------------
C See mpn/x86/README about old gas bugs
674 addl $L(unroll_inner_end)-L(here)-eval(2*CODE_BYTES_PER_LIMB), %ecx