beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / x86_64 / mod_34lsub1.asm
blob62bdcfac690ebbbfde5ca07f1b994d247601f576
1 dnl AMD64 mpn_mod_34lsub1 -- remainder modulo 2^48-1.
3 dnl Copyright 2000-2002, 2004, 2005, 2007, 2009-2012 Free Software Foundation,
4 dnl Inc.
6 dnl This file is part of the GNU MP Library.
7 dnl
8 dnl The GNU MP Library is free software; you can redistribute it and/or modify
9 dnl it under the terms of either:
10 dnl
11 dnl * the GNU Lesser General Public License as published by the Free
12 dnl Software Foundation; either version 3 of the License, or (at your
13 dnl option) any later version.
14 dnl
15 dnl or
16 dnl
17 dnl * the GNU General Public License as published by the Free Software
18 dnl Foundation; either version 2 of the License, or (at your option) any
19 dnl later version.
20 dnl
21 dnl or both in parallel, as here.
22 dnl
23 dnl The GNU MP Library is distributed in the hope that it will be useful, but
24 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
25 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
26 dnl for more details.
27 dnl
28 dnl You should have received copies of the GNU General Public License and the
29 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
30 dnl see https://www.gnu.org/licenses/.
32 include(`../config.m4')
35 C cycles/limb
36 C AMD K8,K9 0.67 0.583 is possible with zero-reg instead of $0, 4-way
37 C AMD K10 0.67 this seems hard to beat
38 C AMD bd1 1
39 C AMD bobcat 1.07
40 C Intel P4 7.35 terrible, use old code
41 C Intel core2 1.25 1+epsilon with huge unrolling
42 C Intel NHM 1.15 this seems hard to beat
43 C Intel SBR 0.93
44 C Intel atom 2.5
45 C VIA nano 1.25 this seems hard to beat
C INPUT PARAMETERS
C   ap  pointer to the least significant source limb (up in the prototype)
C   n   number of limbs.  Limb 0 is loaded unconditionally, so n >= 1 is
C       assumed.  The code below names the count register %rsi directly and
C       reuses it as scratch on the n = 2 path.
define(`ap',	%rdi)
define(`n',	%rsi)

C mp_limb_t mpn_mod_34lsub1 (mp_srcptr up, mp_size_t n)
C
C Return a value congruent to {up,n} modulo 2^48-1.  The result is a plain
C sum of 48-bit (or smaller) pieces and is not reduced any further here; each
C piece is below 2^48 and there are only a handful of them, so the sum cannot
C overflow the 64-bit return register.
C
C Method: 2^48 is congruent to 1 modulo 2^48-1, hence a 64-bit limb at index
C i has weight 2^0, 2^16 or 2^32 according to i mod 3.  The limbs are summed
C into three accumulators, one per residue class, and the accumulators are
C folded into 48 bits at the end.
C
C Register roles on the n >= 3 path:
C   %rax  sum of limbs with index 0 mod 3 (weight 2^0); also the return value
C   %rcx  sum of limbs with index 1 mod 3 (weight 2^16)
C   %rdx  sum of limbs with index 2 mod 3 (weight 2^32)
C   %r9   number of carries out of %rdx (weight 2^192, congruent to 1)
C   %r11  the constant 2^48-1
C   %rsi  limb count minus 12, later stepped by -9; indexes the jump table
C   %r8, %r10, %rdi  scratch
C
C NOTE(review): FUNC_ENTRY, FUNC_EXIT, JUMPTABSECT, JMPENT, R32 and forloop
C are defined outside this file.  FUNC_ENTRY(2)/FUNC_EXIT() are assumed to
C adapt the two arguments to %rdi/%rsi (and undo that) for the DOS64 ABI --
C confirm against their definitions.

C TODO
C  * Review feed-in and wind-down code.

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(32)
PROLOGUE(mpn_mod_34lsub1)
	FUNC_ENTRY(2)

	mov	$0x0000FFFFFFFFFFFF, %r11	C 2^48-1, mask for the low 48 bits

	mov	(ap), %rax

	cmp	$2, %rsi
	ja	L(gt2)				C n >= 3: accumulate in three lanes

	jb	L(one)				C n = 1: return src[0] as it is

C n = 2: fold src[0] (weight 2^0) and src[1] (weight 2^16) directly.
	mov	8(ap), %rsi
	mov	%rax, %rdx
	shr	$48, %rax		C src[0] high 16 bits

	and	%r11, %rdx		C src[0] low 48 bits
	add	%rdx, %rax
	mov	R32(%rsi), R32(%rdx)	C low 32 bits of src[1]

	shr	$32, %rsi		C src[1] high
	add	%rsi, %rax

	shl	$16, %rdx		C src[1] low
	add	%rdx, %rax
L(one):	FUNC_EXIT()
	ret


C Don't change this, the wind-down code is not able to handle greater values
define(UNROLL,3)

C n >= 3: seed the three lanes with src[0..2], then consume UNROLL*3 limbs
C per loop pass.  %rsi is biased by UNROLL*3+3 so that a borrow from the
C subtraction means fewer than UNROLL*3 limbs are left.
L(gt2):	mov	8(ap), %rcx
	mov	16(ap), %rdx
	xor	%r9, %r9
	add	$24, ap
	sub	$eval(UNROLL*3+3), %rsi
	jc	L(end)
	ALIGN(16)
C Each group is one add/adc/adc carry chain across the three lanes; the carry
C out of the top lane is collected in %r9 before anything clobbers the flags.
L(top):
	add	(ap), %rax
	adc	8(ap), %rcx
	adc	16(ap), %rdx
	adc	$0, %r9
forloop(i,1,UNROLL-1,`dnl
	add	eval(i*24)(ap), %rax
	adc	eval(i*24+8)(ap), %rcx
	adc	eval(i*24+16)(ap), %rdx
	adc	$0, %r9
')dnl
	add	$eval(UNROLL*24), ap
	sub	$eval(UNROLL*3), %rsi
	jnc	L(top)

C Here %rsi is in -9..-1 and %rsi+9 is the number of limbs still to add
C (0..8).  Dispatch on it through the table: the PIC variant reads a signed
C 32-bit entry and adds it to the table address, the non-PIC variant jumps
C through an 8-byte entry.  The displacements 36 and 72 are 9 entries of the
C respective size.
L(end):
	lea	L(tab)(%rip), %r8
ifdef(`PIC',
`	movslq	36(%r8,%rsi,4), %r10
	add	%r10, %r8
	jmp	*%r8
',`
	jmp	*72(%r8,%rsi,8)
')
	JUMPTABSECT
	ALIGN(8)
L(tab):	JMPENT(	L(0), L(tab))
	JMPENT(	L(1), L(tab))
	JMPENT(	L(2), L(tab))
	JMPENT(	L(3), L(tab))
	JMPENT(	L(4), L(tab))
	JMPENT(	L(5), L(tab))
	JMPENT(	L(6), L(tab))
	JMPENT(	L(7), L(tab))
	JMPENT(	L(8), L(tab))
	TEXT

C Wind-down.  L(k) adds the last k limbs.  Entries 6..8 add one full group of
C three and fall into 3..5, which add another and fall into 0..2.

C 6 or 3 limbs left: whole groups only.
L(6):	add	(ap), %rax
	adc	8(ap), %rcx
	adc	16(ap), %rdx
	adc	$0, %r9
	add	$24, ap
L(3):	add	(ap), %rax
	adc	8(ap), %rcx
	adc	16(ap), %rdx
	jmp	L(cj1)			C jmp keeps CF for the adc at L(cj1)

C 7, 4 or 1 limbs left: whole groups, then one limb into the 0 mod 3 lane.
L(7):	add	(ap), %rax
	adc	8(ap), %rcx
	adc	16(ap), %rdx
	adc	$0, %r9
	add	$24, ap
L(4):	add	(ap), %rax
	adc	8(ap), %rcx
	adc	16(ap), %rdx
	adc	$0, %r9
	add	$24, ap
L(1):	add	(ap), %rax
	adc	$0, %rcx
	jmp	L(cj2)

C 8, 5 or 2 limbs left: whole groups, then two limbs into lanes 0 and 1.
L(8):	add	(ap), %rax
	adc	8(ap), %rcx
	adc	16(ap), %rdx
	adc	$0, %r9
	add	$24, ap
L(5):	add	(ap), %rax
	adc	8(ap), %rcx
	adc	16(ap), %rdx
	adc	$0, %r9
	add	$24, ap
L(2):	add	(ap), %rax
	adc	8(ap), %rcx

C Finish the pending carry chain, then feed the collected top-lane carries
C (weight congruent to 1) back into the bottom lane, wrapping around once.
L(cj2):	adc	$0, %rdx
L(cj1):	adc	$0, %r9
L(0):	add	%r9, %rax
	adc	$0, %rcx
	adc	$0, %rdx
	adc	$0, %rax

C Fold the three lanes into the result.  Each lane is split at the bit where
C its weight reaches 2^48: the part below is shifted up by the lane weight,
C the part above wraps around to weight 1.
	mov	%rax, %rdi		C 0mod3
	shr	$48, %rax		C 0mod3 high

	and	%r11, %rdi		C 0mod3 low
	mov	R32(%rcx), R32(%r10)	C 1mod3

	shr	$32, %rcx		C 1mod3 high

	add	%rdi, %rax		C apply 0mod3 low
	movzwl	%dx, R32(%rdi)		C 2mod3
	shl	$16, %r10		C 1mod3 low

	add	%rcx, %rax		C apply 1mod3 high
	shr	$16, %rdx		C 2mod3 high

	add	%r10, %rax		C apply 1mod3 low
	shl	$32, %rdi		C 2mod3 low

	add	%rdx, %rax		C apply 2mod3 high
	add	%rdi, %rax		C apply 2mod3 low

	FUNC_EXIT()
	ret
EPILOGUE()