dnl  X86-64 mpn_redc_1 optimised for AMD K8-K10.
dnl  Contributed to the GNU project by Torbjörn Granlund.
dnl  Copyright 2004, 2008, 2013 Free Software Foundation, Inc.
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
C Pull in the shared m4 configuration; the macros used by this file that are
C not defined here (for example L, R32, IFDOS and JMPENT) come from outside
C this file.
include(`../config.m4')
C The inner loops of this code are the result of running a code generation and
C optimisation tool suite written by David Harvey and Torbjörn Granlund.
C  * Micro-optimise, none performed thus far.
C  * This looks different from other current redc_1.asm variants.  Consider
C    adapting this to the mainstream style.
C  * Is this code really faster than other approaches which compute q0 later?
C    Is the use of a jump table faster?  Or is the edge of this due to the
C    inlined addition code?
C  * Put initial m[0] x q0 computation in header.
C  * Put basecases at the file's end, single them out before the pushes.
C Symbolic names for the registers holding the function's operands.
C
C Each define must keep the macro name and its opening parenthesis on the same
C line, and the quoted name must not contain a newline; otherwise m4 does not
C create the intended macro.
C
C The trailing comment on each line names a second location (a register, or
C `stack').  NOTE(review): presumably where the same argument arrives under
C the DOS64 calling convention, which would match the IFDOS load of u0inv
C from 56(%rsp) -- confirm.
define(`rp',       `%rdi')   C rcx
define(`up',       `%rsi')   C rdx
define(`mp_param', `%rdx')   C r8
define(`n',        `%rcx')   C r9
define(`u0inv',    `%r8')    C stack

C Scratch register name used as an index in the (mp,nneg,8) and (up,nneg,8)
C addressing forms.
define(`nneg',     `%r12')
86 IFDOS
(`
mov 56(%rsp
), %r8
')
90 imul u0inv, q0 C first q0, for all execution paths
98 lea (mp_param,n,8), mp C mp += n
99 lea -16(up,n,8), up C up += n
101 mov R32(n), R32(%rax)
106 lea L(tab)(%rip), %r9
108 movslq
(%r9
,%rax
,4), %rax
C Dispatch table with eight entries, each written relative to L(tab) itself.
C The entry code loads the table address with `lea L(tab)(%rip), %r9' and
C fetches a sign-extended 4-byte entry with `movslq (%r9,%rax,4), %rax', so
C %rax is the entry index.
C
C Entries 0-3 target L(0)..L(3); entries 4-7 target L(0m4)..L(3m4).
C NOTE(review): the label names suggest dedicated small-size cases followed
C by cases selected by n mod 4 -- confirm against the index computation.
C
C JMPENT is defined outside this file.
L(tab):	JMPENT(	L(0), L(tab))
	JMPENT(	L(1), L(tab))
	JMPENT(	L(2), L(tab))
	JMPENT(	L(3), L(tab))
	JMPENT(	L(0m4), L(tab))
	JMPENT(	L(1m4), L(tab))
	JMPENT(	L(2m4), L(tab))
	JMPENT(	L(3m4), L(tab))
128 L
(1): mov (mp_param
), %rax
134 adc R32
(%rax
), R32
(%rax
)
139 L
(2): mov (mp_param
), %rax
141 xor R32
(%r14
), R32
(%r14
)
155 xor R32
(%rbx
), R32
(%rbx
)
165 xor R32
(%rax
), R32
(%rax
)
170 adc R32
(%rax
), R32
(%rax
)
174 L
(3): mov (mp_param
), %rax
180 xor R32
(%r9
), R32
(%r9
)
181 xor R32
(%r14
), R32
(%r14
)
203 xor R32
(%r9
), R32
(%r9
)
204 xor R32
(%r14
), R32
(%r14
)
226 xor R32
(%r9
), R32
(%r9
)
227 xor R32
(%r14
), R32
(%r14
)
239 xor R32
(%rax
), R32
(%rax
)
246 adc R32
(%rax
), R32
(%rax
)
252 L
(lo2
): mov (mp
,nneg
,8), %rax
254 xor R32
(%r14
), R32
(%r14
)
255 xor R32
(%rbx
), R32
(%rbx
)
257 mov 8(mp
,nneg
,8), %rax
258 mov 24(up
,nneg
,8), %r15
261 add 16(up
,nneg
,8), %r10
263 mov 16(mp
,nneg
,8), %rax
266 mov $0, R32
(%r10
) C
xor?
273 L
(li2
): add %r10
, (up
,i
,8)
277 xor R32
(%r10
), R32
(%r10
)
279 L
(e2
): add %r9
, 8(up
,i
,8)
290 mov $0, R32
(%r14
) C zero
291 mov %r14
, %rbx C zero
300 L
(le2
): add %r10
, (up
)
305 mov %rdx
, 16(up
,nneg
,8) C up
[0]
313 lea 32(up
,nneg
,8), up
328 L
(lo1
): mov (mp
,nneg
,8), %rax
330 xor R32
(%rbx
), R32
(%rbx
)
333 mov 8(mp
,nneg
,8), %rax
334 mov 24(up
,nneg
,8), %r15
336 mov $0, R32
(%r10
) C
xor?
338 add 16(up
,nneg
,8), %r9
341 mov 16(mp
,nneg
,8), %rax
349 L
(li1
): add %r10
, (up
,i
,8)
353 xor R32
(%r10
), R32
(%r10
)
360 L
(e1
): add %r14
, 16(up
,i
,8)
366 mov $0, R32
(%r14
) C zero
367 mov %r14
, %rbx C zero
376 L
(le1
): add %r10
, (up
)
381 mov %rdx
, 16(up
,nneg
,8) C up
[0]
389 lea 24(up
,nneg
,8), up
402 L
(lo0
): mov (mp
,nneg
,8), %rax
405 xor R32
(%r10
), R32
(%r10
)
408 mov 8(mp
,nneg
,8), %rax
409 mov 24(up
,nneg
,8), %r15
411 add 16(up
,nneg
,8), %r14
419 L
(li0
): add %r10
, (up
,i
,8)
423 xor R32
(%r10
), R32
(%r10
)
433 L
(e0
): mov 16(mp
,i
,8), %rax
436 mov $0, R32
(%r14
) C zero
437 mov %r14
, %rbx C zero
446 L
(le0
): add %r10
, (up
)
451 mov %rdx
, 16(up
,nneg
,8) C up
[0]
460 lea 16(up
,nneg
,8), up
467 L
(lo3
): mov (mp
,nneg
,8), %rax
471 mov 8(mp
,nneg
,8), %rax
472 mov 24(up
,nneg
,8), %r15
474 add 16(up
,nneg
,8), %rbx C result is zero
, might carry
475 mov $0, R32
(%rbx
) C zero
476 mov %rbx
, %r14 C zero
478 mov 16(mp
,nneg
,8), %rax
488 L
(li3
): add %r10
, (up
,i
,8)
492 xor R32
(%r10
), R32
(%r10
)
505 mov $0, R32
(%r14
) C zero
506 mov %r14
, %rbx C zero
515 L
(le3
): add %r10
, (up
)
520 mov %rdx
, 16(up
,nneg
,8) C up
[0]
527 C
==== Addition code
====
530 lea 40(up
,nneg
,8), up
547 L
(addy
):mov (up
), %r8
553 L
(al3
): adc (vp
), %r8
567 L
(mid
): mov 16(up
), %r10
571 L
(ae3
): adc (vp
), %r8
580 L
(ad3
): mov R32
(n
), R32
(%rax
) C zero
581 adc R32
(%rax
), R32
(%rax
)