dnl AMD64 mpn_redc_1 optimised for Intel Haswell.

dnl Contributed to the GNU project by Torbjörn Granlund.

dnl Copyright 2013 Free Software Foundation, Inc.

dnl This file is part of the GNU MP Library.
dnl
dnl The GNU MP Library is free software; you can redistribute it and/or modify
dnl it under the terms of either:
dnl
dnl   * the GNU Lesser General Public License as published by the Free
dnl     Software Foundation; either version 3 of the License, or (at your
dnl     option) any later version.
dnl
dnl or
dnl
dnl   * the GNU General Public License as published by the Free Software
dnl     Foundation; either version 2 of the License, or (at your option) any
dnl     later version.
dnl
dnl or both in parallel, as here.
dnl
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl for more details.
dnl
dnl You should have received copies of the GNU General Public License and the
dnl GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl see https://www.gnu.org/licenses/.
include(`../config.m4')

C            cycles/limb
C AMD K8,K9      n/a
C AMD K10        n/a
C AMD bull       n/a
C AMD pile       n/a
C AMD steam       ?
C AMD bobcat     n/a
C AMD jaguar      ?
C Intel P4       n/a
C Intel core     n/a
C Intel NHM      n/a
C Intel SBR      n/a
C Intel IBR      n/a
C Intel HWL      2.32
C Intel BWL       ?
C Intel atom     n/a
C VIA nano       n/a
C The inner loops of this code are the result of running a code generation and
C optimisation tool suite written by David Harvey and Torbjörn Granlund.

C TODO
C  * Micro-optimise.
C  * Consider inlining mpn_add_n.  Tests indicate that this saves just 1-2
C    cycles, though.
define(`rp',          `%rdi')   C rcx
define(`up',          `%rsi')   C rdx
define(`mp_param',    `%rdx')   C r8
define(`n',           `%rcx')   C r9
define(`u0inv_param', `%r8')    C stack

define(`i',           `%r14')
define(`j',           `%r15')
define(`mp',          `%rdi')
define(`u0inv',       `(%rsp)') C stack
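
C As a rough C-level reference for what this routine computes (illustrative
C only, not part of the build; parameter names follow the defines above, and
C u0inv is assumed to be -1/mp[0] mod 2^64 as REDC requires):
C
C   mp_limb_t
C   mpn_redc_1 (mp_ptr rp, mp_ptr up, mp_srcptr mp, mp_size_t n, mp_limb_t u0inv)
C   {
C     for (mp_size_t i = 0; i < n; i++)
C       {
C         mp_limb_t q0 = up[0] * u0inv;                 C low limb of quotient
C         mp_limb_t cy = mpn_addmul_1 (up, mp, n, q0);  C up[0] becomes zero
C         up[0] = cy;                                   C save the carry limb
C         up++;
C       }
C     return mpn_add_n (rp, up, up - n, n);             C add saved carry limbs
C   }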

ABI_SUPPORT(DOS64)      C FIXME: needs verification
ABI_SUPPORT(STD64)

ASM_START()
        TEXT
        ALIGN(16)
PROLOGUE(mpn_redc_1)
        FUNC_ENTRY(4)
IFDOS(` mov     56(%rsp), %r8   ')
        push    %rbx
        push    %rbp
        push    %r12
        push    %r13
        push    %r14
        push    %r15
        push    rp
        mov     mp_param, mp            C note that rp and mp share a register
        mov     (up), %rdx

        neg     n
        push    %r8                     C put u0inv on stack
        imul    u0inv_param, %rdx       C first iteration q0
        mov     n, j                    C outer loop induction var
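
C Dispatch on n mod 4 to one of four 4-way unrolled inner-loop variants
C (L(o0) .. L(o3)); n = 1, 2 and 3 get dedicated code below since the
C unrolled loops cannot handle such small operands.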
        test    $1, R8(n)
        jnz     L(bx1)

L(bx0): test    $2, R8(n)
        jz      L(o0b)

        cmp     $-2, R32(n)
        jnz     L(o2)

C Special code for n = 2 since general code cannot handle it
        mov     8(%rsp), %rbx           C rp
        lea     16(%rsp), %rsp          C deallocate two slots
        mulx(   (mp), %r9, %r12)
        mulx(   8,(mp), %r11, %r10)
        add     %r12, %r11
        adc     $0, %r10
        add     (up), %r9               C = 0
        adc     8(up), %r11             C r11 = up[1]
        adc     $0, %r10                C -> up[0]
        mov     %r11, %rdx
        imul    u0inv_param, %rdx
        mulx(   (mp), %r13, %r12)
        mulx(   8,(mp), %r14, %r15)
        xor     R32(%rax), R32(%rax)
        add     %r12, %r14
        adc     $0, %r15
        add     %r11, %r13              C = 0
        adc     16(up), %r14            C rp[2]
        adc     $0, %r15                C -> up[1]
        add     %r14, %r10
        adc     24(up), %r15
        mov     %r10, (%rbx)
        mov     %r15, 8(%rbx)
        setc    R8(%rax)
        jmp     L(ret)

L(o2):  lea     2(n), i                 C inner loop induction var
        mulx(   (mp), %r9, %r8)
        mulx(   8,(mp), %r11, %r10)
        sar     $2, i
        add     %r8, %r11
        jmp     L(lo2)
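
C 4-way unrolled inner loop: mulx (with q0 in %rdx) multiplies four mp limbs
C per pass, and the products are accumulated into four up limbs with a carry
C chain.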
        ALIGN(16)
L(tp2): adc     %rax, %r9
        lea     32(up), up
        adc     %r8, %r11
L(lo2): mulx(   16,(mp), %r13, %r12)
        mov     (up), %r8
        mulx(   24,(mp), %rbx, %rax)
        lea     32(mp), mp
        adc     %r10, %r13
        adc     %r12, %rbx
        adc     $0, %rax
        mov     8(up), %r10
        mov     16(up), %r12
        add     %r9, %r8
        mov     24(up), %rbp
        mov     %r8, (up)
        adc     %r11, %r10
        mulx(   (mp), %r9, %r8)
        mov     %r10, 8(up)
        adc     %r13, %r12
        mov     %r12, 16(up)
        adc     %rbx, %rbp
        mulx(   8,(mp), %r11, %r10)
        mov     %rbp, 24(up)
        inc     i
        jnz     L(tp2)

L(ed2): mov     56(up,n,8), %rdx        C next iteration up[0]
        lea     16(mp,n,8), mp          C mp = (last starting mp)
        adc     %rax, %r9
        adc     %r8, %r11
        mov     32(up), %r8
        adc     $0, %r10
        imul    u0inv, %rdx             C next iteration q0
        mov     40(up), %rax
        add     %r9, %r8
        mov     %r8, 32(up)
        adc     %r11, %rax
        mov     %rax, 40(up)
        lea     56(up,n,8), up          C up = (last starting up) + 1
        adc     $0, %r10
        mov     %r10, -8(up)
        inc     j
        jnz     L(o2)

        jmp     L(cj)

L(bx1): test    $2, R8(n)
        jz      L(o3a)

L(o1a): cmp     $-1, R32(n)
        jnz     L(o1b)

C Special code for n = 1 since general code cannot handle it
        mov     8(%rsp), %rbx           C rp
        lea     16(%rsp), %rsp          C deallocate two slots
        mulx(   (mp), %r11, %r10)
        add     (up), %r11
        adc     8(up), %r10
        mov     %r10, (%rbx)
        mov     $0, R32(%rax)
        setc    R8(%rax)
        jmp     L(ret)

L(o1b): lea     24(mp), mp
L(o1):  lea     1(n), i                 C inner loop induction var
        mulx(   -24,(mp), %r11, %r10)
        mulx(   -16,(mp), %r13, %r12)
        mulx(   -8,(mp), %rbx, %rax)
        sar     $2, i
        add     %r10, %r13
        adc     %r12, %rbx
        adc     $0, %rax
        mov     (up), %r10
        mov     8(up), %r12
        mov     16(up), %rbp
        add     %r11, %r10
        jmp     L(lo1)

        ALIGN(16)
L(tp1): adc     %rax, %r9
        lea     32(up), up
        adc     %r8, %r11
        mulx(   16,(mp), %r13, %r12)
        mov     -8(up), %r8
        mulx(   24,(mp), %rbx, %rax)
        lea     32(mp), mp
        adc     %r10, %r13
        adc     %r12, %rbx
        adc     $0, %rax
        mov     (up), %r10
        mov     8(up), %r12
        add     %r9, %r8
        mov     16(up), %rbp
        mov     %r8, -8(up)
        adc     %r11, %r10
L(lo1): mulx(   (mp), %r9, %r8)
        mov     %r10, (up)
        adc     %r13, %r12
        mov     %r12, 8(up)
        adc     %rbx, %rbp
        mulx(   8,(mp), %r11, %r10)
        mov     %rbp, 16(up)
        inc     i
        jnz     L(tp1)

L(ed1): mov     48(up,n,8), %rdx        C next iteration up[0]
        lea     40(mp,n,8), mp          C mp = (last starting mp)
        adc     %rax, %r9
        adc     %r8, %r11
        mov     24(up), %r8
        adc     $0, %r10
        imul    u0inv, %rdx             C next iteration q0
        mov     32(up), %rax
        add     %r9, %r8
        mov     %r8, 24(up)
        adc     %r11, %rax
        mov     %rax, 32(up)
        lea     48(up,n,8), up          C up = (last starting up) + 1
        adc     $0, %r10
        mov     %r10, -8(up)
        inc     j
        jnz     L(o1)

        jmp     L(cj)

L(o3a): cmp     $-3, R32(n)
        jnz     L(o3b)

C Special code for n = 3 since general code cannot handle it
L(n3):  mulx(   (mp), %rbx, %rax)
        mulx(   8,(mp), %r9, %r14)
        add     (up), %rbx
        mulx(   16,(mp), %r11, %r10)
        adc     %rax, %r9               C W 1
        adc     %r14, %r11              C W 2
        mov     8(up), %r14
        mov     u0inv_param, %rdx
        adc     $0, %r10                C W 3
        mov     16(up), %rax
        add     %r9, %r14               C W 1
        mov     %r14, 8(up)
        mulx(   %r14, %rdx, %r13)       C next iteration q0
        adc     %r11, %rax              C W 2
        mov     %rax, 16(up)
        adc     $0, %r10                C W 3
        mov     %r10, (up)
        lea     8(up), up               C up = (last starting up) + 1
        inc     j
        jnz     L(n3)

        jmp     L(cj)

L(o3b): lea     8(mp), mp
L(o3):  lea     4(n), i                 C inner loop induction var
        mulx(   -8,(mp), %rbx, %rax)
        mulx(   (mp), %r9, %r8)
        mov     (up), %rbp
        mulx(   8,(mp), %r11, %r10)
        sar     $2, i
        add     %rbx, %rbp

        adc     %rax, %r9
        jmp     L(lo3)

        ALIGN(16)
L(tp3): adc     %rax, %r9
        lea     32(up), up
L(lo3): adc     %r8, %r11
        mulx(   16,(mp), %r13, %r12)
        mov     8(up), %r8
        mulx(   24,(mp), %rbx, %rax)
        lea     32(mp), mp
        adc     %r10, %r13
        adc     %r12, %rbx
        adc     $0, %rax
        mov     16(up), %r10
        mov     24(up), %r12
        add     %r9, %r8
        mov     32(up), %rbp
        mov     %r8, 8(up)
        adc     %r11, %r10
        mulx(   (mp), %r9, %r8)
        mov     %r10, 16(up)
        adc     %r13, %r12
        mov     %r12, 24(up)
        adc     %rbx, %rbp
        mulx(   8,(mp), %r11, %r10)
        mov     %rbp, 32(up)
        inc     i
        jnz     L(tp3)

L(ed3): mov     64(up,n,8), %rdx        C next iteration up[0]
        lea     24(mp,n,8), mp          C mp = (last starting mp)
        adc     %rax, %r9
        adc     %r8, %r11
        mov     40(up), %r8
        adc     $0, %r10
        imul    u0inv, %rdx             C next iteration q0
        mov     48(up), %rax
        add     %r9, %r8
        mov     %r8, 40(up)
        adc     %r11, %rax
        mov     %rax, 48(up)
        lea     64(up,n,8), up          C up = (last starting up) + 1
        adc     $0, %r10
        mov     %r10, -8(up)
        inc     j
        jnz     L(o3)

        jmp     L(cj)

L(o0b): lea     16(mp), mp
L(o0):  mov     n, i                    C inner loop induction var
        mulx(   -16,(mp), %r13, %r12)
        mulx(   -8,(mp), %rbx, %rax)
        sar     $2, i
        add     %r12, %rbx
        adc     $0, %rax
        mov     (up), %r12
        mov     8(up), %rbp
        mulx(   (mp), %r9, %r8)
        add     %r13, %r12
        jmp     L(lo0)

        ALIGN(16)
L(tp0): adc     %rax, %r9
        lea     32(up), up
        adc     %r8, %r11
        mulx(   16,(mp), %r13, %r12)
        mov     -16(up), %r8
        mulx(   24,(mp), %rbx, %rax)
        lea     32(mp), mp
        adc     %r10, %r13
        adc     %r12, %rbx
        adc     $0, %rax
        mov     -8(up), %r10
        mov     (up), %r12
        add     %r9, %r8
        mov     8(up), %rbp
        mov     %r8, -16(up)
        adc     %r11, %r10
        mulx(   (mp), %r9, %r8)
        mov     %r10, -8(up)
        adc     %r13, %r12
        mov     %r12, (up)
L(lo0): adc     %rbx, %rbp
        mulx(   8,(mp), %r11, %r10)
        mov     %rbp, 8(up)
        inc     i
        jnz     L(tp0)

L(ed0): mov     40(up,n,8), %rdx        C next iteration up[0]
        lea     32(mp,n,8), mp          C mp = (last starting mp)
        adc     %rax, %r9
        adc     %r8, %r11
        mov     16(up), %r8
        adc     $0, %r10
        imul    u0inv, %rdx             C next iteration q0
        mov     24(up), %rax
        add     %r9, %r8
        mov     %r8, 16(up)
        adc     %r11, %rax
        mov     %rax, 24(up)
        lea     40(up,n,8), up          C up = (last starting up) + 1
        adc     $0, %r10
        mov     %r10, -8(up)
        inc     j
        jnz     L(o0)
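
C All n outer iterations are done: the low half of up[] holds the saved
C per-iteration carry limbs and the high half holds the remaining sum, so
C mpn_add_n adds the two halves into rp[]; its carry out is the return value.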
L(cj):
IFSTD(` mov     8(%rsp), %rdi           C param 1: rp
        lea     16-8(%rsp), %rsp        C deallocate 2, add back for alignment
        lea     (up,n,8), %rdx          C param 3: up - n
        neg     R32(n)          ')      C param 4: n

IFDOS(` mov     up, %rdx                C param 2: up
        lea     (up,n,8), %r8           C param 3: up - n
        neg     R32(n)
        mov     n, %r9                  C param 4: n
        mov     8(%rsp), %rcx           C param 1: rp
        lea     16-32-8(%rsp), %rsp')   C deallocate 2, allocate shadow, align

        ASSERT(nz, `test $15, %rsp')
        CALL(   mpn_add_n)

IFSTD(` lea     8(%rsp), %rsp ')
IFDOS(` lea     32+8(%rsp), %rsp')

L(ret): pop     %r15
        pop     %r14
        pop     %r13
        pop     %r12
        pop     %rbp
        pop     %rbx
        FUNC_EXIT()
        ret
EPILOGUE()