1 dnl AMD64 mpn_redc_1 optimised for Intel Haswell.
3 dnl Contributed to the GNU project by Torbjörn Granlund.
dnl  Copyright 2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
33 include(`..
/config.m4
')
53 C The inner loops of this code are the result of running a code generation and
54 C optimisation tool suite written by David Harvey and Torbjörn Granlund.
C  * Consider inlining mpn_add_n.  Tests indicate that this saves just 1-2
C    cycles, though.
C Symbolic names for the incoming parameters and for the saved u0inv value.
C Each define maps a name to the register (or stack operand) holding it.
C NOTE(review): the trailing comment on each line looks like the location of
C the same parameter under the DOS64 ABI -- TODO confirm.
61 define(`rp', `
%rdi
') C rcx
62 define(`up', `
%rsi
') C rdx
63 define(`mp_param', `
%rdx
') C r8
64 define(`n', `
%rcx
') C r9
65 define(`u0inv_param', `
%r8
') C stack
70 define(`u0inv', `
(%rsp
)') C stack
72 ABI_SUPPORT(DOS64) C FIXME: needs verification
80 IFDOS(` mov 56(%rsp), %r8 ')
88 mov mp_param
, mp C note that rp
and mp share a register
92 push %r8 C put u0inv on stack
93 imul u0inv_param
, %rdx C first iteration q0
94 mov n
, j C outer
loop induction var
99 L
(bx0
): test $2, R8
(n
)
105 C Special code for n
= 2 since general code cannot handle it
106 mov 8(%rsp
), %rbx C rp
107 lea 16(%rsp
), %rsp C deallocate two slots
108 mulx
( (mp
), %r9
, %r12
)
109 mulx
( 8,(mp
), %r11
, %r10
)
113 adc 8(up
), %r11 C r11
= up
[1]
114 adc $0, %r10 C
-> up
[0]
116 imul u0inv_param
, %rdx
117 mulx
( (mp
), %r13
, %r12
)
118 mulx
( 8,(mp
), %r14
, %r15
)
119 xor R32
(%rax
), R32
(%rax
)
123 adc 16(up
), %r14 C rp
[2]
124 adc $0, %r15 C
-> up
[1]
132 L
(o2
): lea 2(n
), i C inner
loop induction var
133 mulx
( (mp
), %r9
, %r8
)
134 mulx
( 8,(mp
), %r11
, %r10
)
140 L
(tp2
): adc %rax
, %r9
143 L
(lo2
): mulx
( 16,(mp
), %r13
, %r12
)
145 mulx
( 24,(mp
), %rbx
, %rax
)
156 mulx
( (mp
), %r9
, %r8
)
161 mulx
( 8,(mp
), %r11
, %r10
)
166 L
(ed2
): mov 56(up
,n
,8), %rdx C next iteration up
[0]
167 lea 16(mp
,n
,8), mp C mp
= (last starting mp
)
172 imul u0inv
, %rdx C next iteration q0
178 lea 56(up
,n
,8), up C up
= (last starting up
) + 1
187 L
(bx1
): test $2, R8
(n
)
190 L
(o1a
): cmp $
-1, R32
(n
)
193 C Special code for n
= 1 since general code cannot handle it
194 mov 8(%rsp
), %rbx C rp
195 lea 16(%rsp
), %rsp C deallocate two slots
196 mulx
( (mp
), %r11
, %r10
)
204 L
(o1b
): lea 24(mp
), mp
205 L
(o1
): lea 1(n
), i C inner
loop induction var
206 mulx
( -24,(mp
), %r11
, %r10
)
207 mulx
( -16,(mp
), %r13
, %r12
)
208 mulx
( -8,(mp
), %rbx
, %rax
)
220 L
(tp1
): adc %rax
, %r9
223 mulx
( 16,(mp
), %r13
, %r12
)
225 mulx
( 24,(mp
), %rbx
, %rax
)
236 L
(lo1
): mulx
( (mp
), %r9
, %r8
)
241 mulx
( 8,(mp
), %r11
, %r10
)
246 L
(ed1
): mov 48(up
,n
,8), %rdx C next iteration up
[0]
247 lea 40(mp
,n
,8), mp C mp
= (last starting mp
)
252 imul u0inv
, %rdx C next iteration q0
258 lea 48(up
,n
,8), up C up
= (last starting up
) + 1
266 L
(o3a
): cmp $
-3, R32
(n
)
269 C Special code for n
= 3 since general code cannot handle it
270 L
(n3
): mulx
( (mp
), %rbx
, %rax
)
271 mulx
( 8,(mp
), %r9
, %r14
)
273 mulx
( 16,(mp
), %r11
, %r10
)
277 mov u0inv_param
, %rdx
282 mulx
( %r14
, %rdx
, %r13
) C next iteration q0
287 lea 8(up
), up C up
= (last starting up
) + 1
293 L
(o3b
): lea 8(mp
), mp
294 L
(o3
): lea 4(n
), i C inner
loop induction var
295 mulx
( -8,(mp
), %rbx
, %rax
)
296 mulx
( (mp
), %r9
, %r8
)
298 mulx
( 8,(mp
), %r11
, %r10
)
306 L
(tp3
): adc %rax
, %r9
308 L
(lo3
): adc %r8
, %r11
309 mulx
( 16,(mp
), %r13
, %r12
)
311 mulx
( 24,(mp
), %rbx
, %rax
)
322 mulx
( (mp
), %r9
, %r8
)
327 mulx
( 8,(mp
), %r11
, %r10
)
332 L
(ed3
): mov 64(up
,n
,8), %rdx C next iteration up
[0]
333 lea 24(mp
,n
,8), mp C mp
= (last starting mp
)
338 imul u0inv
, %rdx C next iteration q0
344 lea 64(up
,n
,8), up C up
= (last starting up
) + 1
352 L
(o0b
): lea 16(mp
), mp
353 L
(o0
): mov n
, i C inner
loop induction var
354 mulx
( -16,(mp
), %r13
, %r12
)
355 mulx
( -8,(mp
), %rbx
, %rax
)
361 mulx
( (mp
), %r9
, %r8
)
366 L
(tp0
): adc %rax
, %r9
369 mulx
( 16,(mp
), %r13
, %r12
)
371 mulx
( 24,(mp
), %rbx
, %rax
)
382 mulx
( (mp
), %r9
, %r8
)
386 L
(lo0
): adc %rbx
, %rbp
387 mulx
( 8,(mp
), %r11
, %r10
)
392 L
(ed0
): mov 40(up
,n
,8), %rdx C next iteration up
[0]
393 lea 32(mp
,n
,8), mp C mp
= (last starting mp
)
398 imul u0inv
, %rdx C next iteration q0
404 lea 40(up
,n
,8), up C up
= (last starting up
) + 1
411 IFSTD
(`
mov 8(%rsp
), %rdi C param
1: rp
412 lea 16-8(%rsp
), %rsp C deallocate
2, add back for alignment
413 lea (up
,n
,8), %rdx C param
3: up
- n
414 neg R32
(n
) ') C param 4: n
416 IFDOS(` mov up, %rdx C param 2: up
417 lea (up,n,8), %r8 C param 3: up - n
419 mov n, %r9 C param 4: n
420 mov 8(%rsp), %rcx C param 1: rp
421 lea 16-32-8(%rsp), %rsp') C deallocate
2, allocate shadow
, align
423 ASSERT
(nz
, `
test $15, %rsp
')
426 IFSTD(` lea 8(%rsp), %rsp ')
427 IFDOS
(`
lea 32+8(%rsp
), %rsp
')