dnl  AMD64 mpn_mul_basecase optimised for Intel Broadwell.

dnl  Copyright 2015 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

dnl  Pull in the shared m4 configuration.  The path must be one unbroken
dnl  quoted string, otherwise the embedded newlines become part of the name.
include(`../config.m4')

C	     cycles/limb	mul_1		addmul_1
C Intel BWL		 1.69		 1.8-1.9

C The inner loops of this code are the result of running a code generation and
C optimisation tool suite written by David Harvey and Torbjorn Granlund.

C  * Do overlapped software pipelining.
C  * When changing this, make sure the code which falls into the inner loops
C    does not execute too many no-ops (for both PIC and non-PIC).

C Alias for %rdx.  Read near the top of the routine to derive the limb index
C and the jump-table selector.  Presumably the incoming un argument (3rd
C integer argument register under SysV AMD64); confirm against the prototype.
define(`un_param', `%rdx')
C Alias for %rcx.  Used as a base pointer for a limb load at offset 8.
C Presumably the incoming vp argument (4th integer argument register under
C SysV AMD64); confirm against the prototype.
define(`vp_param', `%rcx')
C Alias for %rbp.  Shifted right by 3 to form the inner-loop count.
C NOTE(review): %rbp is callee-saved under both SysV and Win64; its
C save/restore is expected elsewhere in the routine.
define(`n_save',   `%rbp')
C Alias for %rbx.  Initialised from the un register plus 1 and then used as
C an index scaled by 8 when re-basing the result pointer.
C NOTE(review): %rbx is callee-saved; its save/restore is expected elsewhere.
define(`unneg',    `%rbx')
C Alias for %rax.  Receives the code address obtained from the second
C jump table.
define(`jaddr',    `%rax')

C mpn_mul_basecase entry point and body.
C
C NOTE(review): the instructions below are not contiguous; there are gaps
C between several of them, and the two m4 open-quotes in front of the movslq
C lines have no visible partner.  The only changes made here are joining
C tokens that had been split across lines and dropping stray numeric prefixes.
C No instruction was added, removed or reordered.

PROLOGUE(mpn_mul_basecase)
	IFDOS(`	mov	56(%rsp), %r8d	')

	mulx(	(up), %rax, %r9)		C 0 1

L(s11):	mov	%rax, (rp)

	mulx(	8,(up), %r8, %r10)		C 1 2

L(s22):	add	%r8, %r9			C 1

	mov	8(vp_param), %rdx		C next multiplier limb into the implicit mulx source

	mulx(	(up), %r8, %r11)		C 1 2
	mulx(	8,(up), %rax, %rdx)		C 2 3

	lea	1(un_param), unneg

	mov	R32(un_param), R32(%rax)

	shr	$3, n_save			C loop count

	and	$7, R32(%rax)			C clear CF for adc as side-effect
						C note that rax lives very long
						C rax = low 3 bits of the count, the table index

C Dispatch into the first pass through the 8-entry table.
	lea	L(mtab)(%rip), %r10

`	movslq	(%r10,%rax,4), %r11
	lea	(%r11, %r10), %r10

C Feed-in points of the first pass, one per table entry.
L(mf0):	mulx(	(up), w2, w3)
L(mf3):	mulx(	(up), w0, w1)
L(mf4):	mulx(	(up), w2, w3)
L(mf5):	mulx(	(up), w0, w1)
L(mf6):	mulx(	(up), w2, w3)
L(mf7):	mulx(	(up), w0, w1)
L(mf1):	mulx(	(up), w0, w1)
L(mf2):	mulx(	(up), w2, w3)

C Unrolled first-pass loop body; products alternate between the two
C register pairs.
L(mb1):	mulx(	8,(up), w2, w3)
L(mb0):	mov	w2, 8(rp)
	mulx(	-48,(up), w0, w1)
L(mb7):	mulx(	-40,(up), w2, w3)
L(mb6):	mov	w2, -40(rp)
	mulx(	-32,(up), w0, w1)
L(mb5):	mulx(	-24,(up), w2, w3)
L(mb4):	mulx(	-16,(up), w0, w1)
L(mb3):	mulx(	-8,(up), w2, w3)

	adc	%rcx, w1			C relies on rcx = 0

C Select the entry point for the following passes from the second table.
	lea	L(atab)(%rip), %r10

`	movslq	(%r10,%rax,4), %rax
	lea	(%rax, %r10), jaddr

	mov	(%r10,%rax,8), jaddr		C 8-byte table entry form

C Feed-in points of the later passes; each also re-bases the result pointer.
L(f0):	mulx(	8,(up), w2, w3)
	lea	8(rp,unneg,8), rp

L(f3):	mulx(	-16,(up), w0, w1)
	lea	-56(rp,unneg,8), rp

L(f4):	mulx(	-24,(up), w2, w3)
	lea	-56(rp,unneg,8), rp

L(f5):	mulx(	-32,(up), w0, w1)
	lea	-56(rp,unneg,8), rp

L(f6):	mulx(	-40,(up), w2, w3)
	lea	-56(rp,unneg,8), rp

L(f7):	mulx(	16,(up), w0, w1)
	lea	8(rp,unneg,8), rp

L(f1):	mulx(	(up), w0, w1)
	lea	8(rp,unneg,8), rp

C Fold both carry chains into the top limb.
	adox(	%rcx, w1)			C relies on rcx = 0

	adc	%rcx, w1			C relies on rcx = 0

	dec	vn				C dec never writes CF; OF is cleared as side-effect
						C NOTE(review): CF must already be 0 on arrival
						C here, presumably from the adc above; confirm

	mulx(	-8,(up), w2, w3)
	lea	8(rp,unneg,8), rp

C Unrolled later-pass loop body.
L(b1):	mulx(	8,(up), w2, w3)
L(b0):	mulx(	16,(up), w0, w1)
L(b7):	mulx(	24,(up), w2, w3)
L(b6):	mulx(	-32,(up), w0, w1)
L(b5):	mulx(	-24,(up), w2, w3)
L(b4):	mulx(	-16,(up), w0, w1)
L(b3):	adox(	48,(rp), w0)
	mulx(	-8,(up), w2, w3)

C Jump table for the first pass: eight entries, mf0 through mf7, selected by
C the low three bits of the limb count held in %rax.  The dispatch code reads
C an entry either as a 4-byte value that is added to the table base or as an
C 8-byte address; which form JMPENT emits is decided outside this file.
L(mtab):JMPENT(	L(mf0), L(mtab))
	JMPENT(	L(mf1), L(mtab))
	JMPENT(	L(mf2), L(mtab))
	JMPENT(	L(mf3), L(mtab))
	JMPENT(	L(mf4), L(mtab))
	JMPENT(	L(mf5), L(mtab))
	JMPENT(	L(mf6), L(mtab))
	JMPENT(	L(mf7), L(mtab))
C Jump table for the later passes: eight entries, f0 through f7, indexed by
C %rax (presumably still the low three bits of the limb count; confirm).
C The selected address is kept in a register for reuse.  Entry format is the
C same as for the first table.
L(atab):JMPENT(	L(f0), L(atab))
	JMPENT(	L(f1), L(atab))
	JMPENT(	L(f2), L(atab))
	JMPENT(	L(f3), L(atab))
	JMPENT(	L(f4), L(atab))
	JMPENT(	L(f5), L(atab))
	JMPENT(	L(f6), L(atab))
	JMPENT(	L(f7), L(atab))