1 dnl X86-64 mpn_redc_1 optimised for Intel Sandy Bridge and Ivy Bridge.
3 dnl Contributed to the GNU project by Torbjörn Granlund.
5 dnl Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.
7 dnl This file is part of the GNU MP Library.
9 dnl The GNU MP Library is free software; you can redistribute it and/or modify
10 dnl it under the terms of either:
12 dnl * the GNU Lesser General Public License as published by the Free
13 dnl Software Foundation; either version 3 of the License, or (at your
14 dnl option) any later version.
18 dnl * the GNU General Public License as published by the Free Software
19 dnl Foundation; either version 2 of the License, or (at your option) any
20 dnl later version.
22 dnl or both in parallel, as here.
24 dnl The GNU MP Library is distributed in the hope that it will be useful, but
25 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
27 dnl for more details.
29 dnl You should have received copies of the GNU General Public License and the
30 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
31 dnl see https://www.gnu.org/licenses/.
33 include(`..
/config.m4
')
53 C The inner loops of this code are the result of running a code generation and
54 C optimisation tool suite written by David Harvey and Torbjörn Granlund.
57 C * Micro-optimise, none performed thus far.
58 C * Consider inlining mpn_add_n.
59 C * Single basecases out before the pushes.
61 C When playing with pointers, set this to $2 to fall back to conservative
62 C indexing in wind-down code.
C Symbolic names for the registers that hold the incoming arguments:
C   rp = %rdi, up = %rsi, mp_param = %rdx, n = %rcx, u0inv = %r8.
C The trailing comment on each define (rcx, rdx, r8, r9, stack) appears to
C name the location of the same argument under the DOS64 calling convention;
C the IFDOS load of 56(%rsp) into %r8 elsewhere in this file is consistent
C with that for u0inv -- TODO confirm.
65 define(`rp', `
%rdi
') C rcx
66 define(`up', `
%rsi
') C rdx
67 define(`mp_param', `
%rdx
') C r8
68 define(`n', `
%rcx
') C r9
69 define(`u0inv', `
%r8
') C stack
76 C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15
81 define(`ALIGNx', `
ALIGN(16)')
88 IFDOS(` mov 56(%rsp), %r8 ')
97 mov n
, j C outer
loop induction var
98 lea 8(mp_param
,n
,8), mp
101 imul u0inv
, q0 C first iteration q0
106 L
(bx1
): test $2, R8
(n
)
109 L
(b1
): cmp $
-1, R32
(n
)
129 mov %rbx
, -8(up
,i
,8) C next
low remainder limb
131 imul u0inv
, %rbx C next q limb
136 mov -16(up
,i
,8), %r10
140 mov %rbp
, -24(up
,i
,8)
149 mov %r10
, -16(up
,i
,8)
158 L
(e1
): mov %rdx
, %r11
175 mov I
(-16(up
),-16(up
,i
,8)), %r10
178 mov %rbp
, I
(-24(up
),-24(up
,i
,8))
183 mov %r10
, I
(-16(up
),-16(up
,i
,8))
184 mov %rdx
, -8(up
,n
,8) C up
[0]
185 mov %rbx
, q0 C previously computed q limb
-> q0
191 L
(b3
): cmp $
-3, R32
(n
)
214 imul u0inv
, %rbx C next q limb
219 mov -16(up
,i
,8), %r10
223 mov %rbp
, -24(up
,i
,8)
224 L
(e3
): add %rax
, %r10
232 mov %r10
, -16(up
,i
,8)
258 mov I
(-16(up
),-16(up
,i
,8)), %r10
261 mov %rbp
, I
(-24(up
),-24(up
,i
,8))
266 mov %r10
, I
(-16(up
),-16(up
,i
,8))
267 mov %rdx
, -8(up
,n
,8) C up
[0]
268 mov %rbx
, q0 C previously computed q limb
-> q0
275 IFSTD
(`
lea -8(up
,n
,8), up C param
2: up
276 lea (up
,n
,8), %rdx C param
3: up
- n
277 neg R32
(n
) ') C param 4: n
279 IFDOS(` lea -8(up,n,8), %rdx C param 2: up
280 lea (%rdx,n,8), %r8 C param 3: up - n
282 mov n, %r9 C param 4: n
283 mov rp, %rcx ') C param
1: rp
285 IFSTD
(`
sub $8, %rsp
')
286 IFDOS(` sub $40, %rsp ')
287 ASSERT
(nz
, `
test $15, %rsp
')
289 IFSTD(` add $8, %rsp ')
290 IFDOS
(`
add $40, %rsp
')
301 L(bx0): test $2, R8(n)
325 imul u0inv, %rbx C next q limb
330 mov -16(up,i,8), %r10
334 mov %rbp, -24(up,i,8)
343 mov %r10, -16(up,i,8)
362 L(e0): add %rax, %rbp
369 mov I(-16(up),-16(up,i,8)), %r10
372 mov %rbp, I(-24(up),-24(up,i,8))
377 mov %r10, I(-16(up),-16(up,i,8))
378 mov %rdx, -8(up,n,8) C up[0]
379 mov %rbx, q0 C previously computed q limb -> q0
385 L(b2): cmp $-2, R32(n)
408 imul u0inv, %rbx C next q limb
413 mov -16(up,i,8), %r10
417 mov %rbp, -24(up,i,8)
426 mov %r10, -16(up,i,8)
427 L(e2): add %rax, %rbp
452 mov I(-16(up),-16(up,i,8)), %r10
455 mov %rbp, I(-24(up),-24(up,i,8))
460 mov %r10, I(-16(up),-16(up,i,8))
461 mov %rdx, -8(up,n,8) C up[0]
462 mov %rbx, q0 C previously computed q limb -> q0
468 L(n1): mov (mp_param), %rax
474 adc R32(%rax), R32(%rax)
477 L(n2): mov (mp_param), %rax
492 imul u0inv, q0 C next q0
505 xor R32(%rax), R32(%rax)
510 adc R32(%rax), R32(%rax)
514 L(n3): mov -32(mp), %rax
532 imul u0inv, q0 C next q0
540 mov %r11, -32(up) C up[0]