dnl  X86-64 mpn_redc_1 optimised for AMD K8-K10.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  Copyright 2004, 2008, 2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')
C	     cycles/limb
C AMD K8,K9	 ?
C AMD K10	 ?
C AMD bull	 ?
C AMD pile	 ?
C AMD steam	 ?
C AMD bobcat	 ?
C AMD jaguar	 ?
C Intel P4	 ?
C Intel core	 ?
C Intel NHM	 ?
C Intel SBR	 ?
C Intel IBR	 ?
C Intel HWL	 ?
C Intel BWL	 ?
C Intel atom	 ?
C VIA nano	 ?

C The inner loops of this code are the result of running a code generation and
C optimisation tool suite written by David Harvey and Torbjörn Granlund.
C TODO
C  * Micro-optimise, none performed thus far.
C  * This looks different from other current redc_1.asm variants.  Consider
C    adapting this to the mainstream style.
C  * Is this code really faster than approaches which compute q0 later?  Is
C    the use of a jump table faster?  Or is its edge due to the inlined
C    add_n code?
C  * Put initial m[0] x q0 computation in header.
C  * Put basecases at the file's end, single them out before the pushes.
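
C Overview (a rough reference sketch, not a drop-in replacement): the routine
C performs a Montgomery reduction of the 2n-limb value {up,2n} modulo the
C n-limb odd modulus {mp,n}, where u0inv = -1/mp[0] mod B (B = 2^64), so that
C each q0 = up[0] * u0inv makes up[0] + q0 * mp[0] divisible by B.  It is
C essentially the generic C variant (cf. mpn/generic/redc_1.c):
C
C	for (j = n - 1; j >= 0; j--)
C	  {
C	    q0 = up[0] * u0inv;			/* mod B */
C	    cy = mpn_addmul_1 (up, mp, n, q0);	/* zeroes up[0] */
C	    up[0] = cy;
C	    up++;
C	  }
C	return mpn_add_n (rp, up, up - n, n);	/* final carry in %rax */
C
C but with the next iteration's q0 computed inside the unrolled addmul loops
C and the final add_n inlined.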
define(`rp',          `%rdi')   C rcx
define(`up',          `%rsi')   C rdx
define(`mp_param',    `%rdx')   C r8
define(`n',           `%rcx')   C r9
define(`u0inv',       `%r8')    C stack

define(`i',           `%r11')
define(`nneg',        `%r12')
define(`mp',          `%r13')
define(`q0',          `%rbp')
define(`vp',          `%rdx')

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)
ASM_START()
	TEXT
	ALIGN(32)
PROLOGUE(mpn_redc_1)
	FUNC_ENTRY(4)
IFDOS(`	mov	56(%rsp), %r8	')
	push	%rbp
	mov	(up), q0		C up[0]
	push	%rbx
	imul	u0inv, q0		C first q0, for all execution paths
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	mov	n, nneg
	neg	nneg
	lea	(mp_param,n,8), mp	C mp += n
	lea	-16(up,n,8), up		C up += n
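
C Dispatch on n:  n = 1, 2, 3 get dedicated basecases, n = 4 enters the
C unrolled code at L(0) (= L(0m4)), and n > 4 picks one of the four 4-way
C unrolled outer loops according to n mod 4 (table entries 4..7).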
	mov	R32(n), R32(%rax)
	and	$3, R32(%rax)
	lea	4(%rax), %r9
	cmp	$4, R32(n)
	cmovg	%r9, %rax
	lea	L(tab)(%rip), %r9
ifdef(`PIC',`
	movslq	(%r9,%rax,4), %rax
	add	%r9, %rax
	jmp	*%rax
',`
	jmp	*(%r9,%rax,8)
')
	JUMPTABSECT
	ALIGN(8)
L(tab):	JMPENT(	L(0), L(tab))
	JMPENT(	L(1), L(tab))
	JMPENT(	L(2), L(tab))
	JMPENT(	L(3), L(tab))
	JMPENT(	L(0m4), L(tab))
	JMPENT(	L(1m4), L(tab))
	JMPENT(	L(2m4), L(tab))
	JMPENT(	L(3m4), L(tab))
	TEXT
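
C Basecase for n = 1:  q0 * mp[0] is added to {up,2}; the surviving high limb
C goes to rp[0] and the carry is returned in %rax.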
	ALIGN(16)
L(1):	mov	(mp_param), %rax
	mul	q0
	add	8(up), %rax
	adc	16(up), %rdx
	mov	%rdx, (rp)
	mov	$0, R32(%rax)
	adc	R32(%rax), R32(%rax)
	jmp	L(ret)
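
C Basecase for n = 2:  two REDC steps fully unrolled, followed by the final
C two-limb addition into {rp,2}.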
	ALIGN(16)
L(2):	mov	(mp_param), %rax
	mul	q0
	xor	R32(%r14), R32(%r14)
	mov	%rax, %r10
	mov	-8(mp), %rax
	mov	%rdx, %r9
	mul	q0
	add	(up), %r10
	adc	%rax, %r9
	adc	%rdx, %r14
	add	8(up), %r9
	adc	$0, %r14
	mov	%r9, q0
	imul	u0inv, q0
	mov	-16(mp), %rax
	mul	q0
	xor	R32(%rbx), R32(%rbx)
	mov	%rax, %r10
	mov	-8(mp), %rax
	mov	%rdx, %r11
	mul	q0
	add	%r9, %r10
	adc	%rax, %r11
	adc	%rdx, %rbx
	add	16(up), %r11
	adc	$0, %rbx
	xor	R32(%rax), R32(%rax)
	add	%r11, %r14
	adc	24(up), %rbx
	mov	%r14, (rp)
	mov	%rbx, 8(rp)
	adc	R32(%rax), R32(%rax)
	jmp	L(ret)
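
C Basecase for n = 3:  three REDC steps fully unrolled, followed by the final
C three-limb addition into {rp,3}.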
L(3):	mov	(mp_param), %rax
	mul	q0
	mov	%rax, %rbx
	mov	%rdx, %r10
	mov	-16(mp), %rax
	mul	q0
	xor	R32(%r9), R32(%r9)
	xor	R32(%r14), R32(%r14)
	add	-8(up), %rbx
	adc	%rax, %r10
	mov	-8(mp), %rax
	adc	%rdx, %r9
	mul	q0
	add	(up), %r10
	mov	%r10, (up)
	adc	%rax, %r9
	adc	%rdx, %r14
	mov	%r10, q0
	imul	u0inv, q0
	add	%r9, 8(up)
	adc	$0, %r14
	mov	%r14, -8(up)

	mov	-24(mp), %rax
	mul	q0
	mov	%rax, %rbx
	mov	%rdx, %r10
	mov	-16(mp), %rax
	mul	q0
	xor	R32(%r9), R32(%r9)
	xor	R32(%r14), R32(%r14)
	add	(up), %rbx
	adc	%rax, %r10
	mov	-8(mp), %rax
	adc	%rdx, %r9
	mul	q0
	add	8(up), %r10
	mov	%r10, 8(up)
	adc	%rax, %r9
	adc	%rdx, %r14
	mov	%r10, q0
	imul	u0inv, q0
	add	%r9, 16(up)
	adc	$0, %r14
	mov	%r14, (up)

	mov	-24(mp), %rax
	mul	q0
	mov	%rax, %rbx
	mov	%rdx, %r10
	mov	-16(mp), %rax
	mul	q0
	xor	R32(%r9), R32(%r9)
	xor	R32(%r14), R32(%r14)
	add	8(up), %rbx
	adc	%rax, %r10
	mov	-8(mp), %rax
	adc	%rdx, %r9
	mul	q0
	add	16(up), %r10
	adc	%rax, %r9
	adc	%rdx, %r14
	add	24(up), %r9
	adc	$0, %r14

	xor	R32(%rax), R32(%rax)
	add	-8(up), %r10
	adc	(up), %r9
	adc	32(up), %r14
	mov	%r10, (rp)
	mov	%r9, 8(rp)
	mov	%r14, 16(rp)
	adc	R32(%rax), R32(%rax)
	jmp	L(ret)
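
C Outer loop for n mod 4 == 2 (n > 4).  Each pass is a 4-way unrolled addmul
C of {mp,n} by q0; the next q0 is computed on the fly in %r15.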
	ALIGN(16)
L(2m4):
L(lo2):	mov	(mp,nneg,8), %rax
	mul	q0
	xor	R32(%r14), R32(%r14)
	xor	R32(%rbx), R32(%rbx)
	mov	%rax, %r10
	mov	8(mp,nneg,8), %rax
	mov	24(up,nneg,8), %r15
	mov	%rdx, %r9
	mul	q0
	add	16(up,nneg,8), %r10
	adc	%rax, %r9
	mov	16(mp,nneg,8), %rax
	adc	%rdx, %r14
	mul	q0
	mov	$0, R32(%r10)		C xor?
	lea	2(nneg), i
	add	%r9, %r15
	imul	u0inv, %r15
	jmp	L(e2)

	ALIGN(16)
L(li2):	add	%r10, (up,i,8)
	adc	%rax, %r9
	mov	(mp,i,8), %rax
	adc	%rdx, %r14
	xor	R32(%r10), R32(%r10)
	mul	q0
L(e2):	add	%r9, 8(up,i,8)
	adc	%rax, %r14
	adc	%rdx, %rbx
	mov	8(mp,i,8), %rax
	mul	q0
	add	%r14, 16(up,i,8)
	adc	%rax, %rbx
	adc	%rdx, %r10
	mov	16(mp,i,8), %rax
	mul	q0
	add	%rbx, 24(up,i,8)
	mov	$0, R32(%r14)		C zero
	mov	%r14, %rbx		C zero
	adc	%rax, %r10
	mov	24(mp,i,8), %rax
	mov	%r14, %r9		C zero
	adc	%rdx, %r9
	mul	q0
	add	$4, i
	js	L(li2)

L(le2):	add	%r10, (up)
	adc	%rax, %r9
	adc	%r14, %rdx
	add	%r9, 8(up)
	adc	$0, %rdx
	mov	%rdx, 16(up,nneg,8)	C up[0]
	add	$8, up
	mov	%r15, q0
	dec	n
	jnz	L(lo2)

	mov	nneg, n
	sar	$2, n
	lea	32(up,nneg,8), up
	lea	(up,nneg,8), vp

	mov	-16(up), %r8
	mov	-8(up), %r9
	add	-16(vp), %r8
	adc	-8(vp), %r9
	mov	%r8, (rp)
	mov	%r9, 8(rp)
	lea	16(rp), rp
	jmp	L(addx)
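
C Outer loop for n mod 4 == 1 (n > 4).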
	ALIGN(16)
L(1m4):
L(lo1):	mov	(mp,nneg,8), %rax
	xor	%r9, %r9
	xor	R32(%rbx), R32(%rbx)
	mul	q0
	mov	%rax, %r9
	mov	8(mp,nneg,8), %rax
	mov	24(up,nneg,8), %r15
	mov	%rdx, %r14
	mov	$0, R32(%r10)		C xor?
	mul	q0
	add	16(up,nneg,8), %r9
	adc	%rax, %r14
	adc	%rdx, %rbx
	mov	16(mp,nneg,8), %rax
	mul	q0
	lea	1(nneg), i
	add	%r14, %r15
	imul	u0inv, %r15
	jmp	L(e1)

	ALIGN(16)
L(li1):	add	%r10, (up,i,8)
	adc	%rax, %r9
	mov	(mp,i,8), %rax
	adc	%rdx, %r14
	xor	R32(%r10), R32(%r10)
	mul	q0
	add	%r9, 8(up,i,8)
	adc	%rax, %r14
	adc	%rdx, %rbx
	mov	8(mp,i,8), %rax
	mul	q0
L(e1):	add	%r14, 16(up,i,8)
	adc	%rax, %rbx
	adc	%rdx, %r10
	mov	16(mp,i,8), %rax
	mul	q0
	add	%rbx, 24(up,i,8)
	mov	$0, R32(%r14)		C zero
	mov	%r14, %rbx		C zero
	adc	%rax, %r10
	mov	24(mp,i,8), %rax
	mov	%r14, %r9		C zero
	adc	%rdx, %r9
	mul	q0
	add	$4, i
	js	L(li1)

L(le1):	add	%r10, (up)
	adc	%rax, %r9
	adc	%r14, %rdx
	add	%r9, 8(up)
	adc	$0, %rdx
	mov	%rdx, 16(up,nneg,8)	C up[0]
	add	$8, up
	mov	%r15, q0
	dec	n
	jnz	L(lo1)

	mov	nneg, n
	sar	$2, n
	lea	24(up,nneg,8), up
	lea	(up,nneg,8), vp

	mov	-8(up), %r8
	add	-8(vp), %r8
	mov	%r8, (rp)
	lea	8(rp), rp
	jmp	L(addx)
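
C Outer loop for n mod 4 == 0; n = 4 enters here through L(0).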
	ALIGN(16)
L(0):
L(0m4):
L(lo0):	mov	(mp,nneg,8), %rax
	mov	nneg, i
	mul	q0
	xor	R32(%r10), R32(%r10)
	mov	%rax, %r14
	mov	%rdx, %rbx
	mov	8(mp,nneg,8), %rax
	mov	24(up,nneg,8), %r15
	mul	q0
	add	16(up,nneg,8), %r14
	adc	%rax, %rbx
	adc	%rdx, %r10
	add	%rbx, %r15
	imul	u0inv, %r15
	jmp	L(e0)

	ALIGN(16)
L(li0):	add	%r10, (up,i,8)
	adc	%rax, %r9
	mov	(mp,i,8), %rax
	adc	%rdx, %r14
	xor	R32(%r10), R32(%r10)
	mul	q0
	add	%r9, 8(up,i,8)
	adc	%rax, %r14
	adc	%rdx, %rbx
	mov	8(mp,i,8), %rax
	mul	q0
	add	%r14, 16(up,i,8)
	adc	%rax, %rbx
	adc	%rdx, %r10
L(e0):	mov	16(mp,i,8), %rax
	mul	q0
	add	%rbx, 24(up,i,8)
	mov	$0, R32(%r14)		C zero
	mov	%r14, %rbx		C zero
	adc	%rax, %r10
	mov	24(mp,i,8), %rax
	mov	%r14, %r9		C zero
	adc	%rdx, %r9
	mul	q0
	add	$4, i
	js	L(li0)

L(le0):	add	%r10, (up)
	adc	%rax, %r9
	adc	%r14, %rdx
	add	%r9, 8(up)
	adc	$0, %rdx
	mov	%rdx, 16(up,nneg,8)	C up[0]
	add	$8, up
	mov	%r15, q0
	dec	n
	jnz	L(lo0)

	mov	nneg, n
	sar	$2, n

	lea	16(up,nneg,8), up
	lea	(up,nneg,8), vp
	jmp	L(addy)
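
C Outer loop for n mod 4 == 3 (n > 4).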
	ALIGN(16)
L(3m4):
L(lo3):	mov	(mp,nneg,8), %rax
	mul	q0
	mov	%rax, %rbx
	mov	%rdx, %r10
	mov	8(mp,nneg,8), %rax
	mov	24(up,nneg,8), %r15
	mul	q0
	add	16(up,nneg,8), %rbx	C result is zero, might carry
	mov	$0, R32(%rbx)		C zero
	mov	%rbx, %r14		C zero
	adc	%rax, %r10
	mov	16(mp,nneg,8), %rax
	mov	%r14, %r9		C zero
	adc	%rdx, %r9
	add	%r10, %r15
	mul	q0
	lea	3(nneg), i
	imul	u0inv, %r15
C	jmp	L(li3)

	ALIGN(16)
L(li3):	add	%r10, (up,i,8)
	adc	%rax, %r9
	mov	(mp,i,8), %rax
	adc	%rdx, %r14
	xor	R32(%r10), R32(%r10)
	mul	q0
	add	%r9, 8(up,i,8)
	adc	%rax, %r14
	adc	%rdx, %rbx
	mov	8(mp,i,8), %rax
	mul	q0
	add	%r14, 16(up,i,8)
	adc	%rax, %rbx
	adc	%rdx, %r10
	mov	16(mp,i,8), %rax
	mul	q0
	add	%rbx, 24(up,i,8)
	mov	$0, R32(%r14)		C zero
	mov	%r14, %rbx		C zero
	adc	%rax, %r10
	mov	24(mp,i,8), %rax
	mov	%r14, %r9		C zero
	adc	%rdx, %r9
	mul	q0
	add	$4, i
	js	L(li3)

L(le3):	add	%r10, (up)
	adc	%rax, %r9
	adc	%r14, %rdx
	add	%r9, 8(up)
	adc	$0, %rdx
	mov	%rdx, 16(up,nneg,8)	C up[0]
	mov	%r15, q0
	lea	8(up), up
	dec	n
	jnz	L(lo3)

C ==== Addition code ====
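C Each outer iteration left its carry limb at the low end of up (the position
C of the limb just reduced away); add those n limbs to the n high limbs of up
C and store the n-limb sum at {rp,n}, returning the final carry in %rax.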
	mov	nneg, n
	sar	$2, n
	lea	40(up,nneg,8), up
	lea	(up,nneg,8), vp

	mov	-24(up), %r8
	mov	-16(up), %r9
	mov	-8(up), %r10
	add	-24(vp), %r8
	adc	-16(vp), %r9
	adc	-8(vp), %r10
	mov	%r8, (rp)
	mov	%r9, 8(rp)
	mov	%r10, 16(rp)
	lea	24(rp), rp

L(addx):inc	n
	jz	L(ad3)

L(addy):mov	(up), %r8
	mov	8(up), %r9
	inc	n
	jmp	L(mid)

C	ALIGN(16)
L(al3):	adc	(vp), %r8
	adc	8(vp), %r9
	adc	16(vp), %r10
	adc	24(vp), %r11
	mov	%r8, (rp)
	lea	32(up), up
	mov	%r9, 8(rp)
	mov	%r10, 16(rp)
	inc	n
	mov	%r11, 24(rp)
	lea	32(vp), vp
	mov	(up), %r8
	mov	8(up), %r9
	lea	32(rp), rp
L(mid):	mov	16(up), %r10
	mov	24(up), %r11
	jnz	L(al3)

L(ae3):	adc	(vp), %r8
	adc	8(vp), %r9
	adc	16(vp), %r10
	adc	24(vp), %r11
	mov	%r8, (rp)
	mov	%r9, 8(rp)
	mov	%r10, 16(rp)
	mov	%r11, 24(rp)

L(ad3):	mov	R32(n), R32(%rax)	C zero
	adc	R32(%rax), R32(%rax)

L(ret):	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbx
	pop	%rbp
	FUNC_EXIT()
	ret
EPILOGUE()