dnl  AMD64 mpn_mod_34lsub1 -- remainder modulo 2^48-1.
dnl  Copyright 2000-2002, 2004, 2005, 2007, 2009-2012 Free Software Foundation, Inc.
dnl  This file is part of the GNU MP Library.
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl  or both in parallel, as here.
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  and the GNU Lesser General Public License for more details.
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
include(`../config.m4')
C AMD K8,K9	 0.67	0.583 is possible with zero-reg instead of $0, 4-way
C AMD K10	 0.67	this seems hard to beat
C Intel P4	 7.35	terrible, use old code
C Intel core2	 1.25	1+epsilon with huge unrolling
C Intel NHM	 1.15	this seems hard to beat
C VIA nano	 1.25	this seems hard to beat
C
C mp_limb_t mpn_mod_34lsub1 (mp_srcptr up, mp_size_t n)
C
C TODO
C  * Review feed-in and wind-down code.
C =====================================================================
C mpn_mod_34lsub1 -- fold the limb vector {up,n} and return its
C residue modulo 2^48-1 (prototype per the header comment above:
C   mp_limb_t mpn_mod_34lsub1 (mp_srcptr up, mp_size_t n)).
C
C NOTE(review): this chunk has been mangled by extraction.  The
C leading decimal prefixes on many lines are the ORIGINAL file's
C line numbers fused into the text, instructions and m4 macro calls
C are split across physical lines, and the jumps in those numbers
C (65->76, 135->182, ...) show whole runs of the file -- including
C the EPILOGUE and final compare/subtract -- are missing from this
C view.  The code below is annotated but left byte-for-byte
C untouched; it cannot assemble as-is and must be restored from the
C upstream GMP mpn/x86_64 source before any functional change.
C =====================================================================
62 PROLOGUE(mpn_mod_34lsub1)
C r11 = 0x0000FFFFFFFFFFFF = 2^48-1, the mask used to split each
C accumulator at the 48-bit boundary during reduction.
65 mov $0x0000FFFFFFFFFFFF, %r11
C Small-operand path (fragment): src[0] is split at bit 48 and
C src[1] at bit 32 -- presumably the n<=2 wind-down; intervening
C original lines are missing here, so confirm against upstream.
76 shr $48, %rax C src[0] low
78 and %r11, %rdx C src[0] high
80 mov R32(%rsi), R32(%rdx)
82 shr $32, %rsi C src[1] high
85 shl $16, %rdx C src[1] low
91 C Don't change
this, the wind
-down code is
not able to handle greater values
C Main-loop feed-in and unrolled body (fragment): limbs are
C accumulated three at a time into rax/rcx/rdx via add/adc; the
C m4 forloop below expands UNROLL-1 copies of the 3-limb step.
C Do not insert anything inside the forloop's `...' quoting.
94 L
(gt2
): mov 8(ap
), %rcx
98 sub $eval
(UNROLL
*3+3), %rsi
106 forloop
(i
,1,UNROLL
-1,`dnl
107 add eval
(i
*24)(ap
), %rax
108 adc eval
(i
*24+8)(ap
), %rcx
109 adc eval
(i
*24+16)(ap
), %rdx
112 add $eval(UNROLL*24), ap
113 sub $eval(UNROLL*3), %rsi
117 lea L(tab)(%rip), %r8
119 ` movslq
36(%r8
,%rsi
,4), %r10
C Wind-down dispatch table: JMPENT emits one position-independent
C entry (offsets relative to L(tab), loaded RIP-relative via %r8
C above) per residual limb count 0..8.
127 L
(tab
): JMPENT
( L
(0), L
(tab
))
128 JMPENT
( L
(1), L
(tab
))
129 JMPENT
( L
(2), L
(tab
))
130 JMPENT
( L
(3), L
(tab
))
131 JMPENT
( L
(4), L
(tab
))
132 JMPENT
( L
(5), L
(tab
))
133 JMPENT
( L
(6), L
(tab
))
134 JMPENT
( L
(7), L
(tab
))
135 JMPENT
( L
(8), L
(tab
))
C Final reduction (fragment): the three accumulators (the 0mod3 /
C 1mod3 / 2mod3 limb positions) are each split into a low part
C masked by 2^48-1 and a high part shifted down, then all pieces
C are summed into rax -- keeping the total congruent mod 2^48-1.
C The concluding normalization and EPILOGUE lie past this view.
182 mov %rax
, %rdi C
0mod3
183 shr $48, %rax C
0mod3
high
185 and %r11
, %rdi C
0mod3
low
186 mov R32
(%rcx
), R32
(%r10
) C
1mod3
188 shr $32, %rcx C
1mod3
high
190 add %rdi
, %rax C apply
0mod3
low
191 movzwl
%dx, R32
(%rdi
) C
2mod3
192 shl $16, %r10 C
1mod3
low
194 add %rcx
, %rax C apply
1mod3
high
195 shr $16, %rdx C
2mod3
high
197 add %r10
, %rax C apply
1mod3
low
198 shl $32, %rdi C
2mod3
low
200 add %rdx
, %rax C apply
2mod3
high
201 add %rdi
, %rax C apply
2mod3
low