dnl  X86-64 mpn_redc_1 optimised for AMD bobcat.
dnl  Contributed to the GNU project by Torbjörn Granlund.
dnl  Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.
dnl  This file is part of the GNU MP Library.
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
33 include(`..
/config.m4
')
C  * Micro-optimise, none performed thus far.
C  * Consider inlining mpn_add_n.
C  * Single basecases out before the pushes.
C When playing with pointers, set this to $2 to fall back to conservative
C indexing in wind-down code.
C INPUT PARAMETERS
C Each name below is bound to the register that holds the corresponding
C argument.  The trailing comment on each line presumably names where the
C same argument lives under DOS64; this matches the IFDOS sequences in this
C file, which pass parameters 1-4 in rcx, rdx, r8, r9 and load u0inv into
C r8 from the stack.
define(`rp',          `%rdi')   C rcx
define(`up',          `%rsi')   C rdx
define(`mp_param',    `%rdx')   C r8
define(`n',           `%rcx')   C r9
define(`u0inv',       `%r8')    C stack
C  rax  rbx  rcx  rdx  rdi  rsi  rbp  r8   r9   r10  r11  r12  r13  r14  r15
C ALIGNx expands to a 16-byte ALIGN request.
define(`ALIGNx', `ALIGN(16)')
89 IFDOS(` mov 56(%rsp), %r8 ')
98 mov n
, j C outer
loop induction var
99 lea (mp_param
,n
,8), mp
102 imul u0inv
, q0 C first iteration q0
107 L
(bx1
): test $2, R8
(n
)
110 L
(b1
): cmp $
-1, R32
(n
)
133 imul u0inv
, %rbx C next q limb
137 L
(tp1
): add w0
, -16(up
,i
,8)
154 L
(e1
): mov 16(mp
,i
,8), %rax
168 L
(ed1
): add w0
, I
(-16(up
),-16(up
,i
,8))
171 add w2
, I
(-8(up
),-8(up
,i
,8))
173 mov w3
, (up
,n
,8) C up
[0]
174 mov %rbx
, q0 C previously computed q limb
-> q0
180 L
(b3
): cmp $
-3, R32
(n
)
203 imul u0inv
, %rbx C next q limb
207 L
(tp3
): add w0
, -16(up
,i
,8)
210 L
(e3
): mov (mp
,i
,8), %rax
238 L
(ed3
): add w0
, I
(-16(up
),-16(up
,i
,8))
241 add w2
, I
(-8(up
),-8(up
,i
,8))
243 mov w3
, (up
,n
,8) C up
[0]
244 mov %rbx
, q0 C previously computed q limb
-> q0
251 IFSTD
(`
lea (up
,n
,8), up C param
2: up
252 lea (up
,n
,8), %rdx C param
3: up
- n
253 neg R32
(n
) ') C param 4: n
255 IFDOS(` lea (up,n,8), %rdx C param 2: up
256 lea (%rdx,n,8), %r8 C param 3: up - n
258 mov n, %r9 C param 4: n
259 mov rp, %rcx ') C param
1: rp
261 IFSTD
(`
sub $8, %rsp
')
262 IFDOS(` sub $40, %rsp ')
263 ASSERT
(nz
, `
test $15, %rsp
')
265 IFSTD(` add $8, %rsp ')
266 IFDOS
(`
add $40, %rsp
')
277 L(bx0): test $2, R8(n)
301 imul u0inv, %rbx C next q limb
305 L(tp0): add w0, -16(up,i,8)
329 L(e0): mov 24(mp,i,8), %rax
336 L(ed0): add w0, I(-16(up),-16(up,i,8))
339 add w2, I(-8(up),-8(up,i,8))
341 mov w3, (up,n,8) C up[0]
342 mov %rbx, q0 C previously computed q limb -> q0
348 L(b2): cmp $-2, R32(n)
371 imul u0inv, %rbx C next q limb
375 L(tp2): add w0, -16(up,i,8)
385 L(e2): mov 8(mp,i,8), %rax
406 L(ed2): add w0, I(-16(up),-16(up,i,8))
409 add w2, I(-8(up),-8(up,i,8))
411 mov w3, (up,n,8) C up[0]
412 mov %rbx, q0 C previously computed q limb -> q0
418 L(n1): mov (mp_param), %rax
424 adc R32(%rax), R32(%rax)
427 L(n2): mov (mp_param), %rax
442 imul u0inv, q0 C next q0
455 xor R32(%rax), R32(%rax)
460 adc R32(%rax), R32(%rax)
464 L(n3): mov -24(mp), %rax
482 imul u0inv, q0 C next q0
490 mov %r11, -24(up) C up[0]
497 xor R32(%rax), R32(%rax)
504 adc R32(%rax), R32(%rax)