source/libs/gmp/gmp-src/mpn/x86_64/coreisbr/redc_1.asm (luatex.git)
dnl X86-64 mpn_redc_1 optimised for Intel Sandy Bridge and Ivy Bridge.

dnl Contributed to the GNU project by Torbjörn Granlund.

dnl Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.

dnl This file is part of the GNU MP Library.
dnl
dnl The GNU MP Library is free software; you can redistribute it and/or modify
dnl it under the terms of either:
dnl
dnl   * the GNU Lesser General Public License as published by the Free
dnl     Software Foundation; either version 3 of the License, or (at your
dnl     option) any later version.
dnl
dnl or
dnl
dnl   * the GNU General Public License as published by the Free Software
dnl     Foundation; either version 2 of the License, or (at your option) any
dnl     later version.
dnl
dnl or both in parallel, as here.
dnl
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl for more details.
dnl
dnl You should have received copies of the GNU General Public License and the
dnl GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl see https://www.gnu.org/licenses/.

include(`../config.m4')

C                cycles/limb
C AMD K8,K9        ?
C AMD K10          ?
C AMD bull         ?
C AMD pile         ?
C AMD steam        ?
C AMD bobcat       ?
C AMD jaguar       ?
C Intel P4         ?
C Intel core       ?
C Intel NHM        ?
C Intel SBR        3.24
C Intel IBR        3.04
C Intel HWL        ?
C Intel BWL        ?
C Intel atom       ?
C VIA nano         ?

C The inner loops of this code are the result of running a code generation and
C optimisation tool suite written by David Harvey and Torbjörn Granlund.

C TODO
C  * Micro-optimise, none performed thus far.
C  * Consider inlining mpn_add_n.
C  * Single basecases out before the pushes.

C When playing with pointers, set this to $2 to fall back to conservative
C indexing in wind-down code.
define(`I',`$1')
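C For example, I(-16(up),-16(up,i,8)) expands to -16(up) with the definition
C above, and to the conservative -16(up,i,8) form if I is redefined as `$2'.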

define(`rp',       `%rdi')   C rcx
define(`up',       `%rsi')   C rdx
define(`mp_param', `%rdx')   C r8
define(`n',        `%rcx')   C r9
define(`u0inv',    `%r8')    C stack

define(`i',        `%r14')
define(`j',        `%r15')
define(`mp',       `%r12')
define(`q0',       `%r13')

C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15
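
C As a reference for readers, the operation implemented below is, in rough C
C (essentially what GMP's generic redc_1 does; types and helpers as in gmp.h,
C nails assumed zero, u0inv = -1/mp[0] mod 2^64 so that up[0] is cancelled):
C
C   mp_limb_t
C   mpn_redc_1 (mp_ptr rp, mp_ptr up, mp_srcptr mp, mp_size_t n, mp_limb_t u0inv)
C   {
C     mp_limb_t cy;
C     mp_size_t j;
C     for (j = n - 1; j >= 0; j--)
C       {
C         mp_limb_t q = up[0] * u0inv;        /* next quotient limb */
C         cy = mpn_addmul_1 (up, mp, n, q);   /* zeroes up[0] */
C         up[0] = cy;                         /* save the carry limb */
C         up++;
C       }
C     cy = mpn_add_n (rp, up, up - n, n);     /* add saved carries to high half */
C     return cy;
C   }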

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

define(`ALIGNx', `ALIGN(16)')

ASM_START()
        TEXT
        ALIGN(32)
PROLOGUE(mpn_redc_1)
        FUNC_ENTRY(4)
IFDOS(` mov     56(%rsp), %r8   ')
        push    %rbx
        push    %rbp
        push    %r12
        push    %r13
        push    %r14
        push    %r15

        mov     (up), q0
        mov     n, j                    C outer loop induction var
        lea     8(mp_param,n,8), mp
        lea     8(up,n,8), up
        neg     n
        imul    u0inv, q0               C first iteration q0

        test    $1, R8(n)
        jz      L(bx0)

L(bx1): test    $2, R8(n)
        jz      L(b3)

L(b1):  cmp     $-1, R32(n)
        jz      L(n1)

L(otp1):lea     1(n), i
        mov     -8(mp,n,8), %rax
        mul     q0
        mov     -8(up,n,8), %r10
        mov     %rdx, %r11
        add     %rax, %r10
        mov     (mp,n,8), %rax
        adc     $0, %r11
        mul     q0
        mov     %rdx, %r9
        mov     (up,n,8), %rbx
        add     %rax, %rbx
        adc     $0, %r9
        mov     (mp,i,8), %rax
        mul     q0
        mov     (up,i,8), %r10
        add     %r11, %rbx
        mov     %rbx, -8(up,i,8)        C next low remainder limb
        adc     $0, %r9
        imul    u0inv, %rbx             C next q limb
        jmp     L(e1)

        ALIGNx
L(tp1): mul     q0
        mov     -16(up,i,8), %r10
        add     %r11, %rbp
        mov     %rdx, %r11
        adc     $0, %r9
        mov     %rbp, -24(up,i,8)
        add     %rax, %r10
        mov     -8(mp,i,8), %rax
        adc     $0, %r11
        mul     q0
        add     %r9, %r10
        mov     %rdx, %r9
        mov     -8(up,i,8), %rbp
        adc     $0, %r11
        mov     %r10, -16(up,i,8)
        add     %rax, %rbp
        adc     $0, %r9
        mov     (mp,i,8), %rax
        mul     q0
        mov     (up,i,8), %r10
        add     %r11, %rbp
        mov     %rbp, -8(up,i,8)
        adc     $0, %r9
L(e1):  mov     %rdx, %r11
        add     %rax, %r10
        mov     8(mp,i,8), %rax
        adc     $0, %r11
        mul     q0
        mov     8(up,i,8), %rbp
        add     %r9, %r10
        mov     %rdx, %r9
        mov     %r10, (up,i,8)
        adc     $0, %r11
        add     %rax, %rbp
        adc     $0, %r9
        mov     16(mp,i,8), %rax
        add     $4, i
        jnc     L(tp1)

L(ed1): mul     q0
        mov     I(-16(up),-16(up,i,8)), %r10
        add     %r11, %rbp
        adc     $0, %r9
        mov     %rbp, I(-24(up),-24(up,i,8))
        add     %rax, %r10
        adc     $0, %rdx
        add     %r9, %r10
        adc     $0, %rdx
        mov     %r10, I(-16(up),-16(up,i,8))
        mov     %rdx, -8(up,n,8)        C up[0]
        mov     %rbx, q0                C previously computed q limb -> q0
        lea     8(up), up               C up++
        dec     j
        jnz     L(otp1)
        jmp     L(cj)

L(b3):  cmp     $-3, R32(n)
        jz      L(n3)

L(otp3):lea     3(n), i
        mov     -8(mp,n,8), %rax
        mul     q0
        mov     -8(up,n,8), %r10
        mov     %rdx, %r11
        add     %rax, %r10
        mov     (mp,n,8), %rax
        adc     $0, %r11
        mul     q0
        mov     (up,n,8), %rbx
        mov     %rdx, %r9
        add     %rax, %rbx
        adc     $0, %r9
        mov     8(mp,n,8), %rax
        mul     q0
        mov     8(up,n,8), %r10
        add     %r11, %rbx
        mov     %rdx, %r11
        adc     $0, %r9
        mov     %rbx, (up,n,8)
        imul    u0inv, %rbx             C next q limb
        jmp     L(e3)

        ALIGNx
L(tp3): mul     q0
        mov     -16(up,i,8), %r10
        add     %r11, %rbp
        mov     %rdx, %r11
        adc     $0, %r9
        mov     %rbp, -24(up,i,8)
L(e3):  add     %rax, %r10
        mov     -8(mp,i,8), %rax
        adc     $0, %r11
        mul     q0
        add     %r9, %r10
        mov     %rdx, %r9
        mov     -8(up,i,8), %rbp
        adc     $0, %r11
        mov     %r10, -16(up,i,8)
        add     %rax, %rbp
        adc     $0, %r9
        mov     (mp,i,8), %rax
        mul     q0
        mov     (up,i,8), %r10
        add     %r11, %rbp
        mov     %rbp, -8(up,i,8)
        adc     $0, %r9
        mov     %rdx, %r11
        add     %rax, %r10
        mov     8(mp,i,8), %rax
        adc     $0, %r11
        mul     q0
        mov     8(up,i,8), %rbp
        add     %r9, %r10
        mov     %rdx, %r9
        mov     %r10, (up,i,8)
        adc     $0, %r11
        add     %rax, %rbp
        adc     $0, %r9
        mov     16(mp,i,8), %rax
        add     $4, i
        jnc     L(tp3)

L(ed3): mul     q0
        mov     I(-16(up),-16(up,i,8)), %r10
        add     %r11, %rbp
        adc     $0, %r9
        mov     %rbp, I(-24(up),-24(up,i,8))
        add     %rax, %r10
        adc     $0, %rdx
        add     %r9, %r10
        adc     $0, %rdx
        mov     %r10, I(-16(up),-16(up,i,8))
        mov     %rdx, -8(up,n,8)        C up[0]
        mov     %rbx, q0                C previously computed q limb -> q0
        lea     8(up), up               C up++
        dec     j
        jnz     L(otp3)
C       jmp     L(cj)

L(cj):
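C Wind-down: the carry limbs saved at the original up[0..n-1] are added to the
C high half up[n..2n-1] with mpn_add_n, storing the n-limb result at rp; the
C carry-out from that add is the return value.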
IFSTD(` lea     -8(up,n,8), up          C param 2: up
        lea     (up,n,8), %rdx          C param 3: up - n
        neg     R32(n)          ')      C param 4: n

IFDOS(` lea     -8(up,n,8), %rdx        C param 2: up
        lea     (%rdx,n,8), %r8         C param 3: up - n
        neg     R32(n)
        mov     n, %r9                  C param 4: n
        mov     rp, %rcx        ')      C param 1: rp

IFSTD(` sub     $8, %rsp ')
IFDOS(` sub     $40, %rsp ')
        ASSERT(nz, `test $15, %rsp')
        CALL(   mpn_add_n)
IFSTD(` add     $8, %rsp ')
IFDOS(` add     $40, %rsp ')

L(ret): pop     %r15
        pop     %r14
        pop     %r13
        pop     %r12
        pop     %rbp
        pop     %rbx
        FUNC_EXIT()
        ret

L(bx0): test    $2, R8(n)
        jnz     L(b2)

L(b0):
L(otp0):lea     (n), i
        mov     -8(mp,n,8), %rax
        mul     q0
        mov     %rdx, %r9
        mov     -8(up,n,8), %rbp
        add     %rax, %rbp
        adc     $0, %r9
        mov     (mp,n,8), %rax
        mul     q0
        mov     (up,n,8), %rbx
        mov     %rdx, %r11
        add     %rax, %rbx
        mov     8(mp,n,8), %rax
        adc     $0, %r11
        mul     q0
        mov     8(up,n,8), %rbp
        add     %r9, %rbx
        mov     %rdx, %r9
        mov     %rbx, (up,n,8)
        adc     $0, %r11
        imul    u0inv, %rbx             C next q limb
        jmp     L(e0)

        ALIGNx
L(tp0): mul     q0
        mov     -16(up,i,8), %r10
        add     %r11, %rbp
        mov     %rdx, %r11
        adc     $0, %r9
        mov     %rbp, -24(up,i,8)
        add     %rax, %r10
        mov     -8(mp,i,8), %rax
        adc     $0, %r11
        mul     q0
        add     %r9, %r10
        mov     %rdx, %r9
        mov     -8(up,i,8), %rbp
        adc     $0, %r11
        mov     %r10, -16(up,i,8)
        add     %rax, %rbp
        adc     $0, %r9
        mov     (mp,i,8), %rax
        mul     q0
        mov     (up,i,8), %r10
        add     %r11, %rbp
        mov     %rbp, -8(up,i,8)
        adc     $0, %r9
        mov     %rdx, %r11
        add     %rax, %r10
        mov     8(mp,i,8), %rax
        adc     $0, %r11
        mul     q0
        mov     8(up,i,8), %rbp
        add     %r9, %r10
        mov     %rdx, %r9
        mov     %r10, (up,i,8)
        adc     $0, %r11
L(e0):  add     %rax, %rbp
        adc     $0, %r9
        mov     16(mp,i,8), %rax
        add     $4, i
        jnc     L(tp0)

L(ed0): mul     q0
        mov     I(-16(up),-16(up,i,8)), %r10
        add     %r11, %rbp
        adc     $0, %r9
        mov     %rbp, I(-24(up),-24(up,i,8))
        add     %rax, %r10
        adc     $0, %rdx
        add     %r9, %r10
        adc     $0, %rdx
        mov     %r10, I(-16(up),-16(up,i,8))
        mov     %rdx, -8(up,n,8)        C up[0]
        mov     %rbx, q0                C previously computed q limb -> q0
        lea     8(up), up               C up++
        dec     j
        jnz     L(otp0)
        jmp     L(cj)

L(b2):  cmp     $-2, R32(n)
        jz      L(n2)

L(otp2):lea     2(n), i
        mov     -8(mp,n,8), %rax
        mul     q0
        mov     -8(up,n,8), %rbp
        mov     %rdx, %r9
        add     %rax, %rbp
        adc     $0, %r9
        mov     (mp,n,8), %rax
        mul     q0
        mov     (up,n,8), %rbx
        mov     %rdx, %r11
        add     %rax, %rbx
        mov     8(mp,n,8), %rax
        adc     $0, %r11
        mul     q0
        add     %r9, %rbx
        mov     %rdx, %r9
        mov     8(up,n,8), %rbp
        adc     $0, %r11
        mov     %rbx, (up,n,8)
        imul    u0inv, %rbx             C next q limb
        jmp     L(e2)

        ALIGNx
L(tp2): mul     q0
        mov     -16(up,i,8), %r10
        add     %r11, %rbp
        mov     %rdx, %r11
        adc     $0, %r9
        mov     %rbp, -24(up,i,8)
        add     %rax, %r10
        mov     -8(mp,i,8), %rax
        adc     $0, %r11
        mul     q0
        add     %r9, %r10
        mov     %rdx, %r9
        mov     -8(up,i,8), %rbp
        adc     $0, %r11
        mov     %r10, -16(up,i,8)
L(e2):  add     %rax, %rbp
        adc     $0, %r9
        mov     (mp,i,8), %rax
        mul     q0
        mov     (up,i,8), %r10
        add     %r11, %rbp
        mov     %rbp, -8(up,i,8)
        adc     $0, %r9
        mov     %rdx, %r11
        add     %rax, %r10
        mov     8(mp,i,8), %rax
        adc     $0, %r11
        mul     q0
        mov     8(up,i,8), %rbp
        add     %r9, %r10
        mov     %rdx, %r9
        mov     %r10, (up,i,8)
        adc     $0, %r11
        add     %rax, %rbp
        adc     $0, %r9
        mov     16(mp,i,8), %rax
        add     $4, i
        jnc     L(tp2)

L(ed2): mul     q0
        mov     I(-16(up),-16(up,i,8)), %r10
        add     %r11, %rbp
        adc     $0, %r9
        mov     %rbp, I(-24(up),-24(up,i,8))
        add     %rax, %r10
        adc     $0, %rdx
        add     %r9, %r10
        adc     $0, %rdx
        mov     %r10, I(-16(up),-16(up,i,8))
        mov     %rdx, -8(up,n,8)        C up[0]
        mov     %rbx, q0                C previously computed q limb -> q0
        lea     8(up), up               C up++
        dec     j
        jnz     L(otp2)
        jmp     L(cj)

L(n1):  mov     (mp_param), %rax
        mul     q0
        add     -16(up), %rax
        adc     -8(up), %rdx
        mov     %rdx, (rp)
        mov     $0, R32(%rax)
        adc     R32(%rax), R32(%rax)
        jmp     L(ret)

L(n2):  mov     (mp_param), %rax
        mov     -24(up), %rbp
        mul     q0
        add     %rax, %rbp
        mov     %rdx, %r9
        adc     $0, %r9
        mov     -16(mp), %rax
        mov     -16(up), %r10
        mul     q0
        add     %rax, %r10
        mov     %rdx, %r11
        adc     $0, %r11
        add     %r9, %r10
        adc     $0, %r11
        mov     %r10, q0
        imul    u0inv, q0               C next q0
        mov     -24(mp), %rax
        mul     q0
        add     %rax, %r10
        mov     %rdx, %r9
        adc     $0, %r9
        mov     -16(mp), %rax
        mov     -8(up), %r14
        mul     q0
        add     %rax, %r14
        adc     $0, %rdx
        add     %r9, %r14
        adc     $0, %rdx
        xor     R32(%rax), R32(%rax)
        add     %r11, %r14
        adc     (up), %rdx
        mov     %r14, (rp)
        mov     %rdx, 8(rp)
        adc     R32(%rax), R32(%rax)
        jmp     L(ret)

        ALIGNx
L(n3):  mov     -32(mp), %rax
        mov     -32(up), %r10
        mul     q0
        add     %rax, %r10
        mov     -24(mp), %rax
        mov     %rdx, %r11
        adc     $0, %r11
        mov     -24(up), %rbp
        mul     q0
        add     %rax, %rbp
        mov     %rdx, %r9
        adc     $0, %r9
        mov     -16(mp), %rax
        add     %r11, %rbp
        mov     -16(up), %r10
        adc     $0, %r9
        mul     q0
        mov     %rbp, q0
        imul    u0inv, q0               C next q0
        add     %rax, %r10
        mov     %rdx, %r11
        adc     $0, %r11
        mov     %rbp, -24(up)
        add     %r9, %r10
        adc     $0, %r11
        mov     %r10, -16(up)
        mov     %r11, -32(up)           C up[0]
        lea     8(up), up               C up++
        dec     j
        jnz     L(n3)
        jmp     L(cj)
EPILOGUE()
ASM_END()