dnl  AMD64 mpn_sqr_basecase.
dnl  Contributed to the GNU project by Torbjorn Granlund.
dnl  Copyright 2008, 2009, 2011, 2012 Free Software Foundation, Inc.
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
33 include(`..
/config.m4
')
C The inner loops of this code are the result of running a code generation and
C optimization tool suite written by David Harvey and Torbjorn Granlund.
C  * There is a major stupidity in that we call mpn_mul_1 initially, for a
C    large trip count.  Instead, we should follow the generic/sqr_basecase.c
C    code which uses addmul_2s from the start, conditionally leaving a 1x1
C    multiply to the end.  (In assembly code, one would stop invoking
C    addmul_2s loops when perhaps 3x2s respectively a 2x2s remains.)
C  * Another stupidity is in the sqr_diag_addlsh1 code.  It does not need to
C    save/restore carry, instead it can propagate into the high product word.
C  * Align more labels, should shave off a few cycles.
C  * We can safely use 32-bit size operations, since operands with (2^32)
C    limbs will lead to non-termination in practice.
C  * The jump table could probably be optimized, at least for non-pic.
C  * The special code for n <= 4 was quickly written.  It is probably too
C    large and unnecessarily slow.
C  * Consider combining small cases code so that the n=k-1 code jumps into the
C    middle of the n=k code.
C  * Avoid saving registers for small cases code.
C    i   r8   work left, initially n
C    j   r9   inner loop count
76 define(`n_param', `
%rdx
')
95 PROLOGUE(mpn_sqr_basecase)
97 mov R32(n_param), R32(%rcx)
98 mov R32(n_param), R32(n) C free original n register (rdx)
114 lea L(tab)(%rip), %rax
116 ` movslq
(%rax
,%rcx
,4), %r10
124 L
(tab
): JMPENT
( L
(4), L
(tab
))
125 JMPENT
( L
(1), L
(tab
))
126 JMPENT
( L
(2), L
(tab
))
127 JMPENT
( L
(3), L
(tab
))
128 JMPENT
( L
(0m4
), L
(tab
))
129 JMPENT
( L
(1m4
), L
(tab
))
130 JMPENT
( L
(2m4
), L
(tab
))
131 JMPENT
( L
(3m4
), L
(tab
))
285 lea -16(rp
,n
,8), tp C point tp
in middle of result operand
288 lea (up
,n
,8), up C point up at
end of input operand
C Function mpn_mul_1_m3(tp, up - i, i, up[-i - 1])
321 L
(L3
): xor R32
(w1
), R32
(w1
)
337 lea eval
(2*8)(tp
), tp C tp
+= 2
343 lea 8(rp
,n
,8), tp C point tp
in middle of result operand
346 lea 8(up
,n
,8), up C point up at
end of input operand
C Function mpn_mul_2s_m0(tp, up - i, i, up - i - 1)
365 mov -24(up
,j
,8), %rax
369 mov -24(up
,j
,8), %rax
376 L
(m0
): mov -16(up
,j
,8), %rax C u2
, u6 ...
381 mov -16(up
,j
,8), %rax
399 L
(m2x
): mov (up
,j
,8), %rax
405 mov -32(up
,j
,8), %rax
416 lea eval
(3*8-24)(tp
), tp C tp
+= 3
421 lea -16(rp
,n
,8), tp C point tp
in middle of result operand
424 lea (up
,n
,8), up C point up at
end of input operand
C Function mpn_mul_1_m1(tp, up - (i - 1), i - 1, up[-i])
443 L
(L1
): xor R32
(w0
), R32
(w0
)
472 lea eval
(2*8)(tp
), tp C tp
+= 2
478 lea 8(rp
,n
,8), tp C point tp
in middle of result operand
481 lea 8(up
,n
,8), up C point up at
end of input operand
C Function mpn_mul_2s_m2(tp, up - i + 1, i - 1, up - i)
501 mov -24(up
,j
,8), %rax
505 mov -24(up
,j
,8), %rax
512 mov -16(up
,j
,8), %rax
517 mov -16(up
,j
,8), %rax
535 L
(m2
): mov (up
,j
,8), %rax
541 mov -32(up
,j
,8), %rax
C Function mpn_addmul_2s_m2(tp, up - (i - 1), i - 1, up - i)
609 L
(am2
): mov 32(up
,j
,8), %rax
617 js L
(addmul_2_m2_top
)
625 lea eval
(2*8)(tp
), tp C tp
+= 2
627 add $
-2, R32
(i
) C i
-= 2
C Function mpn_addmul_2s_m0(tp, up - (i - 1), i - 1, up - i)
660 L
(20): mov 16(up
,j
,8), %rax
691 js L
(addmul_2_m0_top
)
699 lea eval
(2*8)(tp
), tp C tp
+= 2
702 add $
-2, R32
(i
) C i
-= 2
C Function mpn_addmul_2s_2
C Function mpn_sqr_diag_addlsh1
739 L
(evn
): add %r11
, %r11
740 sbb R32
(%rbx
), R32
(%rbx
) C save CF
745 L
(odd
): add %r11
, %r11
746 sbb R32
(%rbp
), R32
(%rbp
) C save CF
753 L
(top
): mov (up
,j
,4), %rax
755 add R32
(%rbp
), R32
(%rbp
) C restore carry
759 L
(d0
): mov %r11
, 8(rp
,j
,8)
765 sbb R32
(%rbp
), R32
(%rbp
) C save CF
768 add R32
(%rbx
), R32
(%rbx
) C restore carry
772 L
(d1
): mov %r11
, 24(rp
,j
,8)
777 sbb R32
(%rbx
), R32
(%rbx
) C save CF
783 add R32
(%rbp
), R32
(%rbp
) C restore carry
790 sbb R32
(%rbp
), R32
(%rbp
) C save CF
794 add R32
(%rbx
), R32
(%rbx
) C restore carry