beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / x86_64 / bobcat / redc_1.asm
blobd55b1e5b378f69dda8a2fa8410c9ffd6a80babda
1 dnl X86-64 mpn_redc_1 optimised for AMD bobcat.
3 dnl Contributed to the GNU project by Torbjörn Granlund.
5 dnl Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.
7 dnl This file is part of the GNU MP Library.
8 dnl
9 dnl The GNU MP Library is free software; you can redistribute it and/or modify
10 dnl it under the terms of either:
11 dnl
12 dnl * the GNU Lesser General Public License as published by the Free
13 dnl Software Foundation; either version 3 of the License, or (at your
14 dnl option) any later version.
15 dnl
16 dnl or
17 dnl
18 dnl * the GNU General Public License as published by the Free Software
19 dnl Foundation; either version 2 of the License, or (at your option) any
20 dnl later version.
21 dnl
22 dnl or both in parallel, as here.
23 dnl
24 dnl The GNU MP Library is distributed in the hope that it will be useful, but
25 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
27 dnl for more details.
28 dnl
29 dnl You should have received copies of the GNU General Public License and the
30 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
31 dnl see https://www.gnu.org/licenses/.
33 include(`../config.m4')
35 C cycles/limb
36 C AMD K8,K9 ?
37 C AMD K10 ?
38 C AMD bull ?
39 C AMD pile ?
40 C AMD steam ?
41 C AMD bobcat 5.0
42 C AMD jaguar ?
43 C Intel P4 ?
44 C Intel core ?
45 C Intel NHM ?
46 C Intel SBR ?
47 C Intel IBR ?
48 C Intel HWL ?
49 C Intel BWL ?
50 C Intel atom ?
51 C VIA nano ?
53 C TODO
54 C * Micro-optimise, none performed thus far.
55 C * Consider inlining mpn_add_n.
56 C * Single basecases out before the pushes.
58 C When playing with pointers, set this to $2 to fall back to conservative
59 C indexing in wind-down code.
60 define(`I',`$1')
C Incoming arguments. The register named after each C marker appears to be
C the DOS64 location of the same argument (compare the IFDOS code at L(cj),
C which passes parameters in rcx, rdx, r8 and r9) - TODO confirm.
62 define(`rp', `%rdi') C rcx
63 define(`up', `%rsi') C rdx
64 define(`mp_param', `%rdx') C r8
65 define(`n', `%rcx') C r9
66 define(`u0inv', `%r8') C stack
C Working registers.
68 define(`i', `%r14') C inner loop index: negative, stepped by 4 until non-negative
69 define(`j', `%r15') C outer loop counter, counts down from n
70 define(`mp', `%r12') C mp_param + 8*n, used with the negated count as index
71 define(`q0', `%r13') C multiplier applied to every modulus limb in a pass
72 define(`w0', `%rbp') C w0..w3 hold low/high words of the limb products
73 define(`w1', `%r9')
74 define(`w2', `%r10')
75 define(`w3', `%r11')
77 C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15
79 ABI_SUPPORT(DOS64)
80 ABI_SUPPORT(STD64)
C ALIGNx is placed in front of the inner loop tops and L(n3) below.
82 define(`ALIGNx', `ALIGN(16)')
84 ASM_START()
85 TEXT
86 ALIGN(32)
C mpn_redc_1 entry sequence. Saves the callee-saved registers used below,
C turns mp_param and up into end-relative pointers indexed by a negated limb
C count, computes the first q0 as up[0] * u0inv, and dispatches on the low
C two bits of the negated count.
87 PROLOGUE(mpn_redc_1)
88 FUNC_ENTRY(4)
89 IFDOS(` mov 56(%rsp), %r8 ') C load the u0inv register from the stack (IFDOS text only)
90 push %rbx
91 push %rbp
92 push %r12
93 push %r13
94 push %r14
95 push %r15
97 mov (up), q0 C q0 = lowest limb at up
98 mov n, j C outer loop induction var
99 lea (mp_param,n,8), mp C mp = mp_param + 8*n
100 lea (up,n,8), up C up = up + 8*n
101 neg n C negated count: (mp,n,8) and (up,n,8) now address the first limb
102 imul u0inv, q0 C first iteration q0
104 test $1, R8(n)
105 jz L(bx0) C bit 0 of the negated count is clear
107 L(bx1): test $2, R8(n)
108 jz L(b3) C low two bits are 01; fall through to L(b1) when they are 11
C Path for a negated count whose low two bits are 11. A negated count of
C exactly -1 is diverted to the L(n1) basecase. Otherwise each outer pass at
C L(otp1) adds q0 times the modulus limbs at mp into the limbs at up, stores
C the final carry limb in the lowest slot, and advances up by one limb.
110 L(b1): cmp $-1, R32(n)
111 jz L(n1)
113 L(otp1):lea 1(n), i C i = n + 1
114 mov (mp,n,8), %rax
115 mul q0 C first modulus limb times q0
116 mov %rax, w2
117 mov %rdx, w3
118 mov 8(mp,n,8), %rax
119 mul q0 C second modulus limb times q0
120 mov %rax, %rbx
121 mov %rdx, w1
122 add (up,n,8), w2 C sum is not stored (w2 is reloaded below); only the carry is used
123 adc w3, %rbx
124 adc $0, w1
125 mov 16(mp,n,8), %rax
126 mul q0 C third modulus limb times q0
127 mov %rax, w2
128 mov %rdx, w3
129 add 8(up,n,8), %rbx
130 mov %rbx, 8(up,n,8) C store the updated second limb
131 adc w1, w2
132 adc $0, w3
133 imul u0inv, %rbx C next q limb
134 jmp L(e1) C enter the unrolled loop part-way through
C Inner loop, four modulus limbs per iteration. Each step multiplies one
C limb at mp by q0 into a w register pair, adds the pending low word into
C memory at up, and carries into the new pair.
136 ALIGNx
137 L(tp1): add w0, -16(up,i,8)
138 adc w1, w2
139 adc $0, w3
140 mov (mp,i,8), %rax
141 mul q0
142 mov %rax, w0
143 mov %rdx, w1
144 add w2, -8(up,i,8)
145 adc w3, w0
146 adc $0, w1
147 mov 8(mp,i,8), %rax
148 mul q0
149 mov %rax, w2
150 mov %rdx, w3
151 add w0, (up,i,8)
152 adc w1, w2
153 adc $0, w3
154 L(e1): mov 16(mp,i,8), %rax
155 mul q0
156 mov %rax, w0
157 mov %rdx, w1
158 add w2, 8(up,i,8)
159 adc w3, w0
160 adc $0, w1
161 mov 24(mp,i,8), %rax
162 mul q0
163 mov %rax, w2
164 mov %rdx, w3
165 add $4, i
166 js L(tp1) C repeat while i is still negative
C Wind-down: flush the two pending words into the last two limbs below up.
168 L(ed1): add w0, I(-16(up),-16(up,i,8))
169 adc w1, w2
170 adc $0, w3
171 add w2, I(-8(up),-8(up,i,8))
172 adc $0, w3
173 mov w3, (up,n,8) C up[0]
174 mov %rbx, q0 C previously computed q limb -> q0
175 lea 8(up), up C up++
176 dec j
177 jnz L(otp1) C another outer pass while j is non-zero
178 jmp L(cj) C all passes done: go to the common tail
C Path for a negated count whose low two bits are 01. A negated count of
C exactly -3 is diverted to the L(n3) basecase. The outer pass has the same
C structure as L(otp1) but starts i at n + 3 and enters the unrolled loop
C at L(e3).
180 L(b3): cmp $-3, R32(n)
181 jz L(n3)
183 L(otp3):lea 3(n), i C i = n + 3
184 mov (mp,n,8), %rax
185 mul q0 C first modulus limb times q0
186 mov %rax, w2
187 mov %rdx, w3
188 mov 8(mp,n,8), %rax
189 mul q0 C second modulus limb times q0
190 mov %rax, %rbx
191 mov %rdx, w1
192 add (up,n,8), w2 C sum is not stored (w2 is reloaded below); only the carry is used
193 adc w3, %rbx
194 adc $0, w1
195 mov 16(mp,n,8), %rax
196 mul q0 C third modulus limb times q0
197 mov %rax, w2
198 mov %rdx, w3
199 add 8(up,n,8), %rbx
200 mov %rbx, 8(up,n,8) C store the updated second limb
201 adc w1, w2
202 adc $0, w3
203 imul u0inv, %rbx C next q limb
204 jmp L(e3) C enter the unrolled loop part-way through
C Inner loop, four modulus limbs per iteration, same scheme as L(tp1).
206 ALIGNx
207 L(tp3): add w0, -16(up,i,8)
208 adc w1, w2
209 adc $0, w3
210 L(e3): mov (mp,i,8), %rax
211 mul q0
212 mov %rax, w0
213 mov %rdx, w1
214 add w2, -8(up,i,8)
215 adc w3, w0
216 adc $0, w1
217 mov 8(mp,i,8), %rax
218 mul q0
219 mov %rax, w2
220 mov %rdx, w3
221 add w0, (up,i,8)
222 adc w1, w2
223 adc $0, w3
224 mov 16(mp,i,8), %rax
225 mul q0
226 mov %rax, w0
227 mov %rdx, w1
228 add w2, 8(up,i,8)
229 adc w3, w0
230 adc $0, w1
231 mov 24(mp,i,8), %rax
232 mul q0
233 mov %rax, w2
234 mov %rdx, w3
235 add $4, i
236 js L(tp3) C repeat while i is still negative
C Wind-down: flush the two pending words into the last two limbs below up.
238 L(ed3): add w0, I(-16(up),-16(up,i,8))
239 adc w1, w2
240 adc $0, w3
241 add w2, I(-8(up),-8(up,i,8))
242 adc $0, w3
243 mov w3, (up,n,8) C up[0]
244 mov %rbx, q0 C previously computed q limb -> q0
245 lea 8(up), up C up++
246 dec j
247 jnz L(otp3) C another outer pass while j is non-zero
248 C jmp L(cj)
C No jump is needed here: L(cj) follows directly.
C Common tail for the loop paths. Sets up the arguments for mpn_add_n and
C calls it: destination rp, first source up + 8*n, second source that
C address + 8*n again, and the limb count. n still holds the negated count
C on entry to this label and is negated back just before the call. The
C pops below do not write %rax, so the value mpn_add_n leaves there is
C still in %rax when this function returns.
250 L(cj):
251 IFSTD(` lea (up,n,8), up C param 2: up
252 lea (up,n,8), %rdx C param 3: up - n
253 neg R32(n) ') C param 4: n
255 IFDOS(` lea (up,n,8), %rdx C param 2: up
256 lea (%rdx,n,8), %r8 C param 3: up - n
257 neg R32(n)
258 mov n, %r9 C param 4: n
259 mov rp, %rcx ') C param 1: rp
C Stack adjustment around the call: 8 bytes in the IFSTD text, 40 bytes in
C the IFDOS text.
261 IFSTD(` sub $8, %rsp ')
262 IFDOS(` sub $40, %rsp ')
C NOTE(review): assuming the usual 8 mod 16 stack offset at entry, six pushes
C plus the sub above leave %rsp 16-byte aligned here on STD64 - confirm that
C nz is the intended ASSERT condition.
263 ASSERT(nz, `test $15, %rsp')
264 CALL( mpn_add_n)
265 IFSTD(` add $8, %rsp ')
266 IFDOS(` add $40, %rsp ')
C Shared exit, also reached by jmp from L(n1), L(n2) and L(n3): restore the
C registers pushed at entry in reverse order, then return.
268 L(ret): pop %r15
269 pop %r14
270 pop %r13
271 pop %r12
272 pop %rbp
273 pop %rbx
274 FUNC_EXIT()
275 ret C return to the caller instead of running on into L(bx0)
C Reached from the entry dispatch when bit 0 of the negated count is clear.
C Bit 1 set goes to L(b2); otherwise the L(b0) path below runs. There is no
C basecase check on this path. The outer pass has the same structure as the
C other paths but uses w0/w1 and w3 for the head products, starts i at n,
C and enters the unrolled loop at L(e0).
277 L(bx0): test $2, R8(n)
278 jnz L(b2)
280 L(b0):
281 L(otp0):lea (n), i C i = n
282 mov (mp,n,8), %rax
283 mul q0 C first modulus limb times q0
284 mov %rax, w0
285 mov %rdx, w1
286 mov 8(mp,n,8), %rax
287 mul q0 C second modulus limb times q0
288 mov %rax, %rbx
289 mov %rdx, w3
290 add (up,n,8), w0 C sum is not stored (w0 is reloaded below); only the carry is used
291 adc w1, %rbx
292 adc $0, w3
293 mov 16(mp,n,8), %rax
294 mul q0 C third modulus limb times q0
295 mov %rax, w0
296 mov %rdx, w1
297 add 8(up,n,8), %rbx
298 mov %rbx, 8(up,n,8) C store the updated second limb
299 adc w3, w0
300 adc $0, w1
301 imul u0inv, %rbx C next q limb
302 jmp L(e0) C enter the unrolled loop part-way through
C Inner loop, four modulus limbs per iteration. Each step multiplies one
C limb at mp by q0 into a w register pair, adds the pending low word into
C memory at up, and carries into the new pair.
304 ALIGNx
305 L(tp0): add w0, -16(up,i,8)
306 adc w1, w2
307 adc $0, w3
308 mov (mp,i,8), %rax
309 mul q0
310 mov %rax, w0
311 mov %rdx, w1
312 add w2, -8(up,i,8)
313 adc w3, w0
314 adc $0, w1
315 mov 8(mp,i,8), %rax
316 mul q0
317 mov %rax, w2
318 mov %rdx, w3
319 add w0, (up,i,8)
320 adc w1, w2
321 adc $0, w3
322 mov 16(mp,i,8), %rax
323 mul q0
324 mov %rax, w0
325 mov %rdx, w1
326 add w2, 8(up,i,8)
327 adc w3, w0
328 adc $0, w1
329 L(e0): mov 24(mp,i,8), %rax
330 mul q0
331 mov %rax, w2
332 mov %rdx, w3
333 add $4, i
334 js L(tp0) C repeat while i is still negative
C Wind-down: flush the two pending words into the last two limbs below up.
336 L(ed0): add w0, I(-16(up),-16(up,i,8))
337 adc w1, w2
338 adc $0, w3
339 add w2, I(-8(up),-8(up,i,8))
340 adc $0, w3
341 mov w3, (up,n,8) C up[0]
342 mov %rbx, q0 C previously computed q limb -> q0
343 lea 8(up), up C up++
344 dec j
345 jnz L(otp0) C another outer pass while j is non-zero
346 jmp L(cj) C all passes done: go to the common tail
C Path for a negated count with bit 0 clear and bit 1 set. A negated count
C of exactly -2 is diverted to the L(n2) basecase. The outer pass has the
C same structure as L(otp0) but starts i at n + 2 and enters the unrolled
C loop at L(e2).
348 L(b2): cmp $-2, R32(n)
349 jz L(n2)
351 L(otp2):lea 2(n), i C i = n + 2
352 mov (mp,n,8), %rax
353 mul q0 C first modulus limb times q0
354 mov %rax, w0
355 mov %rdx, w1
356 mov 8(mp,n,8), %rax
357 mul q0 C second modulus limb times q0
358 mov %rax, %rbx
359 mov %rdx, w3
360 add (up,n,8), w0 C sum is not stored (w0 is reloaded below); only the carry is used
361 adc w1, %rbx
362 adc $0, w3
363 mov 16(mp,n,8), %rax
364 mul q0 C third modulus limb times q0
365 mov %rax, w0
366 mov %rdx, w1
367 add 8(up,n,8), %rbx
368 mov %rbx, 8(up,n,8) C store the updated second limb
369 adc w3, w0
370 adc $0, w1
371 imul u0inv, %rbx C next q limb
372 jmp L(e2) C enter the unrolled loop part-way through
C Inner loop, four modulus limbs per iteration, same scheme as L(tp0).
374 ALIGNx
375 L(tp2): add w0, -16(up,i,8)
376 adc w1, w2
377 adc $0, w3
378 mov (mp,i,8), %rax
379 mul q0
380 mov %rax, w0
381 mov %rdx, w1
382 add w2, -8(up,i,8)
383 adc w3, w0
384 adc $0, w1
385 L(e2): mov 8(mp,i,8), %rax
386 mul q0
387 mov %rax, w2
388 mov %rdx, w3
389 add w0, (up,i,8)
390 adc w1, w2
391 adc $0, w3
392 mov 16(mp,i,8), %rax
393 mul q0
394 mov %rax, w0
395 mov %rdx, w1
396 add w2, 8(up,i,8)
397 adc w3, w0
398 adc $0, w1
399 mov 24(mp,i,8), %rax
400 mul q0
401 mov %rax, w2
402 mov %rdx, w3
403 add $4, i
404 js L(tp2) C repeat while i is still negative
C Wind-down: flush the two pending words into the last two limbs below up.
406 L(ed2): add w0, I(-16(up),-16(up,i,8))
407 adc w1, w2
408 adc $0, w3
409 add w2, I(-8(up),-8(up,i,8))
410 adc $0, w3
411 mov w3, (up,n,8) C up[0]
412 mov %rbx, q0 C previously computed q limb -> q0
413 lea 8(up), up C up++
414 dec j
415 jnz L(otp2) C another outer pass while j is non-zero
416 jmp L(cj) C all passes done: go to the common tail
C Basecase reached from L(b1) when the negated count is -1. Multiplies the
C single modulus limb by q0, adds the product to the two limbs at -8(up) and
C (up), stores the upper sum at rp and leaves the final carry in %rax. Does
C not call mpn_add_n.
418 L(n1): mov (mp_param), %rax
419 mul q0 C rdx:rax = modulus limb * q0
420 add -8(up), %rax C low sum is never stored; only its carry is used
421 adc (up), %rdx
422 mov %rdx, (rp) C the one result limb
423 mov $0, R32(%rax) C mov leaves the carry flag from the adc above intact
424 adc R32(%rax), R32(%rax) C rax = that carry
425 jmp L(ret)
C Basecase reached from L(b2) when the negated count is -2. Runs two
C straight-line passes, one per q0 value, over the two modulus limbs, writes
C two result limbs to rp and leaves the final carry in %rax. Does not call
C mpn_add_n. Uses %rbp, %r9, %r10, %r11 and %r14 as plain scratch registers.
427 L(n2): mov (mp_param), %rax
428 mov -16(up), %rbp
429 mul q0 C first pass, low modulus limb
430 add %rax, %rbp C sum is never stored; only its carry is used
431 mov %rdx, %r9
432 adc $0, %r9
433 mov -8(mp), %rax
434 mov -8(up), %r10
435 mul q0 C first pass, high modulus limb
436 add %rax, %r10
437 mov %rdx, %r11
438 adc $0, %r11
439 add %r9, %r10
440 adc $0, %r11 C r11 = carry limb of the first pass
441 mov %r10, q0
442 imul u0inv, q0 C next q0
443 mov -16(mp), %rax
444 mul q0 C second pass, low modulus limb
445 add %rax, %r10 C sum is never stored; only its carry is used
446 mov %rdx, %r9
447 adc $0, %r9
448 mov -8(mp), %rax
449 mov (up), %r14
450 mul q0 C second pass, high modulus limb
451 add %rax, %r14
452 adc $0, %rdx
453 add %r9, %r14
454 adc $0, %rdx
455 xor R32(%rax), R32(%rax) C clear rax before the final add/adc chain
456 add %r11, %r14 C fold in the carry limb of the first pass
457 adc 8(up), %rdx
458 mov %r14, (rp)
459 mov %rdx, 8(rp)
460 adc R32(%rax), R32(%rax) C rax = carry out of the adc into rdx
461 jmp L(ret)
C Basecase reached from L(b3) when the negated count is -3. The loop at
C L(n3) runs j times; each pass multiplies the three modulus limbs by q0,
C updates the limbs at -16(up) and -8(up), stores the carry limb at -24(up)
C and advances up by one limb. The code after the loop adds three-limb
C values with carry, writes three result limbs to rp and leaves the final
C carry in %rax. Does not call mpn_add_n.
463 ALIGNx
464 L(n3): mov -24(mp), %rax
465 mov -24(up), %r10
466 mul q0 C lowest modulus limb
467 add %rax, %r10 C sum is never stored; only its carry is used
468 mov -16(mp), %rax
469 mov %rdx, %r11
470 adc $0, %r11
471 mov -16(up), %rbp
472 mul q0 C middle modulus limb
473 add %rax, %rbp
474 mov %rdx, %r9
475 adc $0, %r9
476 mov -8(mp), %rax
477 add %r11, %rbp
478 mov -8(up), %r10
479 adc $0, %r9
480 mul q0 C highest modulus limb, still using the current q0
481 mov %rbp, q0
482 imul u0inv, q0 C next q0
483 add %rax, %r10
484 mov %rdx, %r11
485 adc $0, %r11
486 mov %rbp, -16(up)
487 add %r9, %r10
488 adc $0, %r11
489 mov %r10, -8(up)
490 mov %r11, -24(up) C up[0]
491 lea 8(up), up C up++
492 dec j
493 jnz L(n3) C another pass while j is non-zero
C Final addition. %rbp, %r10 and %r11 still hold the values from the last
C pass; they are added to the limbs at -48(up), -40(up) and -8(up).
495 mov -48(up), %rdx
496 mov -40(up), %rbx
497 xor R32(%rax), R32(%rax) C clear rax before the add/adc chain
498 add %rbp, %rdx
499 adc %r10, %rbx
500 adc -8(up), %r11
501 mov %rdx, (rp)
502 mov %rbx, 8(rp)
503 mov %r11, 16(rp)
504 adc R32(%rax), R32(%rax) C rax = carry out of the chain above
505 jmp L(ret)
506 EPILOGUE()
507 ASM_END()