dnl  (web-viewer metadata preserved from scrape:)
dnl  fix getsup (HH)
dnl  source/libs/gmp/gmp-src/mpn/x86_64/coreibwl/sqr_basecase.asm
dnl  blob 447ba00e435e4f2cd2507f1ca3752d4bbb0fed45
dnl  AMD64 mpn_sqr_basecase optimised for Intel Broadwell.

dnl  Copyright 2015 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
include(`../config.m4')

C cycles/limb	mul_1		addmul_1
C AMD K8,K9	n/a		n/a
C AMD K10	n/a		n/a
C AMD bull	n/a		n/a
C AMD pile	n/a		n/a
C AMD steam	n/a		n/a
C AMD excavator	 ?		 ?
C AMD bobcat	n/a		n/a
C AMD jaguar	n/a		n/a
C Intel P4	n/a		n/a
C Intel core2	n/a		n/a
C Intel NHM	n/a		n/a
C Intel SBR	n/a		n/a
C Intel IBR	n/a		n/a
C Intel HWL	 1.68		n/a
C Intel BWL	 1.69		1.8-1.9
C Intel atom	n/a		n/a
C Intel SLM	n/a		n/a
C VIA nano	n/a		n/a

C The inner loops of this code are the result of running a code generation and
C optimisation tool suite written by David Harvey and Torbjorn Granlund.

C TODO
C  * We have 8 addmul_1 loops which fall into each other.  The idea is to save
C    on switching code, since a circularly updated computed goto target will
C    hardly allow correct branch prediction.  On 2nd thought, we now might make
C    each of the 8 loop branches be poorly predicted since they will be
C    executed fewer times for each time.  With just one addmul_1 loop, the loop
C    count will change only once each 8th time!
C  * Replace sqr_diag_addlsh1 code (from haswell) with adx-aware code.  We have
C    3 variants below, but the haswell code turns out to be fastest.
C  * Do overlapped software pipelining.
C  * When changing this, make sure the code which falls into the inner loops
C    does not execute too many no-ops (for both PIC and non-PIC).

C Register roles (SysV args rdi/rsi/rdx; note u0 aliases un_param's register,
C so un_param must be consumed before u0 is loaded).
define(`rp',       `%rdi')	C result pointer
define(`up',       `%rsi')	C source pointer
define(`un_param', `%rdx')	C source size (limbs)

define(`n',        `%rcx')	C inner-loop counter
define(`un_save',  `%rbx')	C negated, rounded size; loop offset base
define(`u0',       `%rdx')	C current multiplier limb (implicit mulx source)

define(`w0',       `%r8')	C product pipeline registers
define(`w1',       `%r9')
define(`w2',       `%r10')
define(`w3',       `%r11')

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)
ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_sqr_basecase)
	FUNC_ENTRY(3)

	cmp	$2, un_param
	jae	L(gt1)

C n = 1: single square, two result limbs.
	mov	(up), %rdx
	mulx(	%rdx, %rax, %rdx)
	mov	%rax, (rp)
	mov	%rdx, 8(rp)
	FUNC_EXIT()
	ret

L(gt1):	jne	L(gt2)

C n = 2: three products, double the cross product into the middle limbs.
	mov	(up), %rdx
	mov	8(up), %rcx
	mulx(	%rcx, %r9, %r10)	C v0 * v1	W 1 2
	mulx(	%rdx, %rax, %r8)	C v0 * v0	W 0 1
	mov	%rcx, %rdx
	mulx(	%rdx, %r11, %rdx)	C v1 * v1	W 2 3
	add	%r9, %r9		C W 1
	adc	%r10, %r10		C W 2
	adc	$0, %rdx		C W 3
	add	%r9, %r8		C W 1
	adc	%r11, %r10		C W 2
	adc	$0, %rdx		C W 3
	mov	%rax, (rp)
	mov	%r8, 8(rp)
	mov	%r10, 16(rp)
	mov	%rdx, 24(rp)
	FUNC_EXIT()
	ret

L(gt2):	cmp	$4, un_param
	jae	L(gt3)

C n = 3: cross products accumulated with adc, then combined with the
C diagonal squares using an adcx/adox chain.
	push	%rbx
	mov	(up), %rdx
	mulx(	8,(up), w2, w3)
	mulx(	16,(up), w0, w1)
	add	w3, w0
	mov	8(up), %rdx
	mulx(	16,(up), %rax, w3)
	adc	%rax, w1
	adc	$0, w3
	test	R32(%rbx), R32(%rbx)	C clear CF and OF for adcx/adox chain
	mov	(up), %rdx
	mulx(	%rdx, %rbx, %rcx)
	mov	%rbx, (rp)
	mov	8(up), %rdx
	mulx(	%rdx, %rax, %rbx)
	mov	16(up), %rdx
	mulx(	%rdx, %rsi, %rdx)
	adcx(	w2, w2)			C double cross products (CF chain)
	adcx(	w0, w0)
	adcx(	w1, w1)
	adcx(	w3, w3)
	adox(	w2, %rcx)		C add into diagonal squares (OF chain)
	adox(	w0, %rax)
	adox(	w1, %rbx)
	adox(	w3, %rsi)
	mov	$0, R32(%r8)
	adox(	%r8, %rdx)		C fold out both pending carries
	adcx(	%r8, %rdx)
	mov	%rcx, 8(rp)
	mov	%rax, 16(rp)
	mov	%rbx, 24(rp)
	mov	%rsi, 32(rp)
	mov	%rdx, 40(rp)
	pop	%rbx
	FUNC_EXIT()
	ret

C n >= 4: mul_1 for the first row, then 8 mutually falling-through addmul_1
C loops selected by n mod 8, then sqr_diag_addlsh1 to finish.
L(gt3):	push	%rbx

	push	rp
	push	up
	push	un_param

	lea	-3(un_param), R32(un_save)
	lea	5(un_param), n
	mov	R32(un_param), R32(%rax)
	and	$-8, R32(un_save)
	shr	$3, R32(n)		C count for mul_1 loop
	neg	un_save			C 8*count and offset for addmul_1 loops
	and	$7, R32(%rax)		C clear CF for adc as side-effect

	mov	(up), u0

C Dispatch into the mul_1 code on (n-1) mod 8 via L(mtab).
	lea	L(mtab)(%rip), %r10
ifdef(`PIC',
`	movslq	(%r10,%rax,4), %r8
	lea	(%r8, %r10), %r10
	jmp	*%r10
',`
	jmp	*(%r10,%rax,8)
')

L(mf0):	mulx(	8,(up), w2, w3)
	lea	64(up), up
C	lea	(rp), rp
	jmp	L(mb0)

L(mf3):	mulx(	8,(up), w0, w1)
	lea	24(up), up
	lea	24(rp), rp
	jmp	L(mb3)

L(mf4):	mulx(	8,(up), w2, w3)
	lea	32(up), up
	lea	32(rp), rp
	jmp	L(mb4)

L(mf5):	mulx(	8,(up), w0, w1)
	lea	40(up), up
	lea	40(rp), rp
	jmp	L(mb5)

L(mf6):	mulx(	8,(up), w2, w3)
	lea	48(up), up
	lea	48(rp), rp
	jmp	L(mb6)

L(mf7):	mulx(	8,(up), w0, w1)
	lea	56(up), up
	lea	56(rp), rp
	jmp	L(mb7)

L(mf1):	mulx(	8,(up), w0, w1)
	lea	8(up), up
	lea	8(rp), rp
	jmp	L(mb1)

L(mf2):	mulx(	8,(up), w2, w3)
	lea	16(up), up
	lea	16(rp), rp
	dec	R32(n)
	mulx(	(up), w0, w1)

C mul_1 inner loop: 8 limbs of u0 * up[] per iteration, plain adc chain.
	ALIGN(16)
L(top):	mov	w2, -8(rp)
	adc	w3, w0
L(mb1):	mulx(	8,(up), w2, w3)
	adc	w1, w2
	lea	64(up), up
	mov	w0, (rp)
L(mb0):	mov	w2, 8(rp)
	mulx(	-48,(up), w0, w1)
	lea	64(rp), rp
	adc	w3, w0
L(mb7):	mulx(	-40,(up), w2, w3)
	mov	w0, -48(rp)
	adc	w1, w2
L(mb6):	mov	w2, -40(rp)
	mulx(	-32,(up), w0, w1)
	adc	w3, w0
L(mb5):	mulx(	-24,(up), w2, w3)
	mov	w0, -32(rp)
	adc	w1, w2
L(mb4):	mulx(	-16,(up), w0, w1)
	mov	w2, -24(rp)
	adc	w3, w0
L(mb3):	mulx(	-8,(up), w2, w3)
	adc	w1, w2
	mov	w0, -16(rp)
	dec	R32(n)
	mulx(	(up), w0, w1)
	jnz	L(top)

L(end):	mov	w2, -8(rp)
	adc	w3, w0
	mov	w0, (rp)
	adc	%rcx, w1		C n is zero here, so rcx = 0
	mov	w1, 8(rp)

C Dispatch into the addmul_1 chain on un mod 8 via L(atab).
	lea	L(atab)(%rip), %r10
ifdef(`PIC',
`	movslq	(%r10,%rax,4), %r11
	lea	(%r11, %r10), %r11
	jmp	*%r11
',`
	jmp	*(%r10,%rax,8)
')

L(ed0):	adox(	(rp), w0)
	adox(	%rcx, w1)		C relies on rcx = 0
	mov	w0, (rp)
	adc	%rcx, w1		C relies on rcx = 0
	mov	w1, 8(rp)
L(f7):	lea	-64(up,un_save,8), up
	or	R32(un_save), R32(n)
	mov	8(up), u0
	mulx(	16,(up), w0, w1)
	lea	-56(rp,un_save,8), rp
	jmp	L(b7)

	ALIGN(16)
L(tp0):	adox(	-8,(rp), w2)
	adcx(	w3, w0)
	mov	w2, -8(rp)
	jrcxz	L(ed0)
	mulx(	8,(up), w2, w3)
	adox(	(rp), w0)
	lea	8(n), R32(n)
	mov	w0, (rp)
	adcx(	w1, w2)
L(b0):	mulx(	16,(up), w0, w1)
	adcx(	w3, w0)
	adox(	8,(rp), w2)
	mov	w2, 8(rp)
	mulx(	24,(up), w2, w3)
	lea	64(up), up
	adcx(	w1, w2)
	adox(	16,(rp), w0)
	mov	w0, 16(rp)
	mulx(	-32,(up), w0, w1)
	adox(	24,(rp), w2)
	adcx(	w3, w0)
	mov	w2, 24(rp)
	mulx(	-24,(up), w2, w3)
	adcx(	w1, w2)
	adox(	32,(rp), w0)
	mov	w0, 32(rp)
	mulx(	-16,(up), w0, w1)
	adox(	40,(rp), w2)
	adcx(	w3, w0)
	mov	w2, 40(rp)
	adox(	48,(rp), w0)
	mulx(	-8,(up), w2, w3)
	mov	w0, 48(rp)
	lea	64(rp), rp
	adcx(	w1, w2)
	mulx(	(up), w0, w1)
	jmp	L(tp0)

L(ed1):	adox(	(rp), w0)
	adox(	%rcx, w1)		C relies on rcx = 0
	mov	w0, (rp)
	adc	%rcx, w1		C relies on rcx = 0
	mov	w1, 8(rp)
L(f0):	lea	-64(up,un_save,8), up
	or	R32(un_save), R32(n)
	mov	(up), u0
	mulx(	8,(up), w2, w3)
	lea	-56(rp,un_save,8), rp
	jmp	L(b0)

	ALIGN(16)
L(tp1):	adox(	-8,(rp), w2)
	adcx(	w3, w0)
	mov	w2, -8(rp)
	jrcxz	L(ed1)
L(b1):	mulx(	8,(up), w2, w3)
	adox(	(rp), w0)
	lea	8(n), R32(n)
	mov	w0, (rp)
	adcx(	w1, w2)
	mulx(	16,(up), w0, w1)
	adcx(	w3, w0)
	adox(	8,(rp), w2)
	mov	w2, 8(rp)
	mulx(	24,(up), w2, w3)
	lea	64(up), up
	adcx(	w1, w2)
	adox(	16,(rp), w0)
	mov	w0, 16(rp)
	mulx(	-32,(up), w0, w1)
	adox(	24,(rp), w2)
	adcx(	w3, w0)
	mov	w2, 24(rp)
	mulx(	-24,(up), w2, w3)
	adcx(	w1, w2)
	adox(	32,(rp), w0)
	mov	w0, 32(rp)
	mulx(	-16,(up), w0, w1)
	adox(	40,(rp), w2)
	adcx(	w3, w0)
	mov	w2, 40(rp)
	adox(	48,(rp), w0)
	mulx(	-8,(up), w2, w3)
	mov	w0, 48(rp)
	lea	64(rp), rp
	adcx(	w1, w2)
	mulx(	(up), w0, w1)
	jmp	L(tp1)

L(ed2):	adox(	(rp), w0)
	adox(	%rcx, w1)		C relies on rcx = 0
	mov	w0, (rp)
	adc	%rcx, w1		C relies on rcx = 0
	mov	w1, 8(rp)
L(f1):	lea	(up,un_save,8), up
	or	R32(un_save), R32(n)
	lea	8(un_save), un_save
	mov	-8(up), u0
	mulx(	(up), w0, w1)
	lea	-56(rp,un_save,8), rp
	jmp	L(b1)

	ALIGN(16)
L(tp2):	adox(	-8,(rp), w2)
	adcx(	w3, w0)
	mov	w2, -8(rp)
	jrcxz	L(ed2)
	mulx(	8,(up), w2, w3)
	adox(	(rp), w0)
	lea	8(n), R32(n)
	mov	w0, (rp)
	adcx(	w1, w2)
	mulx(	16,(up), w0, w1)
	adcx(	w3, w0)
	adox(	8,(rp), w2)
	mov	w2, 8(rp)
	mulx(	24,(up), w2, w3)
	lea	64(up), up
	adcx(	w1, w2)
	adox(	16,(rp), w0)
	mov	w0, 16(rp)
	mulx(	-32,(up), w0, w1)
	adox(	24,(rp), w2)
	adcx(	w3, w0)
	mov	w2, 24(rp)
	mulx(	-24,(up), w2, w3)
	adcx(	w1, w2)
	adox(	32,(rp), w0)
	mov	w0, 32(rp)
	mulx(	-16,(up), w0, w1)
	adox(	40,(rp), w2)
	adcx(	w3, w0)
	mov	w2, 40(rp)
	adox(	48,(rp), w0)
	mulx(	-8,(up), w2, w3)
	mov	w0, 48(rp)
	lea	64(rp), rp
	adcx(	w1, w2)
	mulx(	(up), w0, w1)
	jmp	L(tp2)

L(ed3):	adox(	(rp), w0)
	adox(	%rcx, w1)		C relies on rcx = 0
	mov	w0, (rp)
	adc	%rcx, w1		C relies on rcx = 0
	mov	w1, 8(rp)
L(f2):	lea	(up,un_save,8), up
	or	R32(un_save), R32(n)
	jz	L(corner2)
	mov	-16(up), u0
	mulx(	-8,(up), w2, w3)
	lea	8(rp,un_save,8), rp
	mulx(	(up), w0, w1)
	jmp	L(tp2)

	ALIGN(16)
L(tp3):	adox(	-8,(rp), w2)
	adcx(	w3, w0)
	mov	w2, -8(rp)
	jrcxz	L(ed3)
	mulx(	8,(up), w2, w3)
	adox(	(rp), w0)
	lea	8(n), R32(n)
	mov	w0, (rp)
	adcx(	w1, w2)
	mulx(	16,(up), w0, w1)
	adcx(	w3, w0)
	adox(	8,(rp), w2)
	mov	w2, 8(rp)
	mulx(	24,(up), w2, w3)
	lea	64(up), up
	adcx(	w1, w2)
	adox(	16,(rp), w0)
	mov	w0, 16(rp)
	mulx(	-32,(up), w0, w1)
	adox(	24,(rp), w2)
	adcx(	w3, w0)
	mov	w2, 24(rp)
	mulx(	-24,(up), w2, w3)
	adcx(	w1, w2)
	adox(	32,(rp), w0)
	mov	w0, 32(rp)
	mulx(	-16,(up), w0, w1)
	adox(	40,(rp), w2)
	adcx(	w3, w0)
	mov	w2, 40(rp)
L(b3):	adox(	48,(rp), w0)
	mulx(	-8,(up), w2, w3)
	mov	w0, 48(rp)
	lea	64(rp), rp
	adcx(	w1, w2)
	mulx(	(up), w0, w1)
	jmp	L(tp3)

L(ed4):	adox(	(rp), w0)
	adox(	%rcx, w1)		C relies on rcx = 0
	mov	w0, (rp)
	adc	%rcx, w1		C relies on rcx = 0
	mov	w1, 8(rp)
L(f3):	lea	(up,un_save,8), up
	or	R32(un_save), R32(n)
	jz	L(corner3)
	mov	-24(up), u0
	mulx(	-16,(up), w0, w1)
	lea	-56(rp,un_save,8), rp
	jmp	L(b3)

	ALIGN(16)
L(tp4):	adox(	-8,(rp), w2)
	adcx(	w3, w0)
	mov	w2, -8(rp)
	jrcxz	L(ed4)
	mulx(	8,(up), w2, w3)
	adox(	(rp), w0)
	lea	8(n), R32(n)
	mov	w0, (rp)
	adcx(	w1, w2)
	mulx(	16,(up), w0, w1)
	adcx(	w3, w0)
	adox(	8,(rp), w2)
	mov	w2, 8(rp)
	mulx(	24,(up), w2, w3)
	lea	64(up), up
	adcx(	w1, w2)
	adox(	16,(rp), w0)
	mov	w0, 16(rp)
	mulx(	-32,(up), w0, w1)
	adox(	24,(rp), w2)
	adcx(	w3, w0)
	mov	w2, 24(rp)
	mulx(	-24,(up), w2, w3)
	adcx(	w1, w2)
	adox(	32,(rp), w0)
	mov	w0, 32(rp)
L(b4):	mulx(	-16,(up), w0, w1)
	adox(	40,(rp), w2)
	adcx(	w3, w0)
	mov	w2, 40(rp)
	adox(	48,(rp), w0)
	mulx(	-8,(up), w2, w3)
	mov	w0, 48(rp)
	lea	64(rp), rp
	adcx(	w1, w2)
	mulx(	(up), w0, w1)
	jmp	L(tp4)

L(ed5):	adox(	(rp), w0)
	adox(	%rcx, w1)		C relies on rcx = 0
	mov	w0, (rp)
	adc	%rcx, w1		C relies on rcx = 0
	mov	w1, 8(rp)
L(f4):	lea	(up,un_save,8), up
	or	R32(un_save), R32(n)
	mov	-32(up), u0
	mulx(	-24,(up), w2, w3)
	lea	-56(rp,un_save,8), rp
	jmp	L(b4)

	ALIGN(16)
L(tp5):	adox(	-8,(rp), w2)
	adcx(	w3, w0)
	mov	w2, -8(rp)
	jrcxz	L(ed5)
	mulx(	8,(up), w2, w3)
	adox(	(rp), w0)
	lea	8(n), R32(n)
	mov	w0, (rp)
	adcx(	w1, w2)
	mulx(	16,(up), w0, w1)
	adcx(	w3, w0)
	adox(	8,(rp), w2)
	mov	w2, 8(rp)
	mulx(	24,(up), w2, w3)
	lea	64(up), up
	adcx(	w1, w2)
	adox(	16,(rp), w0)
	mov	w0, 16(rp)
	mulx(	-32,(up), w0, w1)
	adox(	24,(rp), w2)
	adcx(	w3, w0)
	mov	w2, 24(rp)
L(b5):	mulx(	-24,(up), w2, w3)
	adcx(	w1, w2)
	adox(	32,(rp), w0)
	mov	w0, 32(rp)
	mulx(	-16,(up), w0, w1)
	adox(	40,(rp), w2)
	adcx(	w3, w0)
	mov	w2, 40(rp)
	adox(	48,(rp), w0)
	mulx(	-8,(up), w2, w3)
	mov	w0, 48(rp)
	lea	64(rp), rp
	adcx(	w1, w2)
	mulx(	(up), w0, w1)
	jmp	L(tp5)

L(ed6):	adox(	(rp), w0)
	adox(	%rcx, w1)		C relies on rcx = 0
	mov	w0, (rp)
	adc	%rcx, w1		C relies on rcx = 0
	mov	w1, 8(rp)
L(f5):	lea	(up,un_save,8), up
	or	R32(un_save), R32(n)
	mov	-40(up), u0
	mulx(	-32,(up), w0, w1)
	lea	-56(rp,un_save,8), rp
	jmp	L(b5)

	ALIGN(16)
L(tp6):	adox(	-8,(rp), w2)
	adcx(	w3, w0)
	mov	w2, -8(rp)
	jrcxz	L(ed6)
	mulx(	8,(up), w2, w3)
	adox(	(rp), w0)
	lea	8(n), R32(n)
	mov	w0, (rp)
	adcx(	w1, w2)
	mulx(	16,(up), w0, w1)
	adcx(	w3, w0)
	adox(	8,(rp), w2)
	mov	w2, 8(rp)
	mulx(	24,(up), w2, w3)
	lea	64(up), up
	adcx(	w1, w2)
	adox(	16,(rp), w0)
	mov	w0, 16(rp)
L(b6):	mulx(	-32,(up), w0, w1)
	adox(	24,(rp), w2)
	adcx(	w3, w0)
	mov	w2, 24(rp)
	mulx(	-24,(up), w2, w3)
	adcx(	w1, w2)
	adox(	32,(rp), w0)
	mov	w0, 32(rp)
	mulx(	-16,(up), w0, w1)
	adox(	40,(rp), w2)
	adcx(	w3, w0)
	mov	w2, 40(rp)
	adox(	48,(rp), w0)
	mulx(	-8,(up), w2, w3)
	mov	w0, 48(rp)
	lea	64(rp), rp
	adcx(	w1, w2)
	mulx(	(up), w0, w1)
	jmp	L(tp6)

L(ed7):	adox(	(rp), w0)
	adox(	%rcx, w1)		C relies on rcx = 0
	mov	w0, (rp)
	adc	%rcx, w1		C relies on rcx = 0
	mov	w1, 8(rp)
L(f6):	lea	(up,un_save,8), up
	or	R32(un_save), R32(n)
	mov	-48(up), u0
	mulx(	-40,(up), w2, w3)
	lea	-56(rp,un_save,8), rp
	jmp	L(b6)

	ALIGN(16)
L(tp7):	adox(	-8,(rp), w2)
	adcx(	w3, w0)
	mov	w2, -8(rp)
	jrcxz	L(ed7)
	mulx(	8,(up), w2, w3)
	adox(	(rp), w0)
	lea	8(n), R32(n)
	mov	w0, (rp)
	adcx(	w1, w2)
	mulx(	16,(up), w0, w1)
	adcx(	w3, w0)
	adox(	8,(rp), w2)
	mov	w2, 8(rp)
L(b7):	mulx(	24,(up), w2, w3)
	lea	64(up), up
	adcx(	w1, w2)
	adox(	16,(rp), w0)
	mov	w0, 16(rp)
	mulx(	-32,(up), w0, w1)
	adox(	24,(rp), w2)
	adcx(	w3, w0)
	mov	w2, 24(rp)
	mulx(	-24,(up), w2, w3)
	adcx(	w1, w2)
	adox(	32,(rp), w0)
	mov	w0, 32(rp)
	mulx(	-16,(up), w0, w1)
	adox(	40,(rp), w2)
	adcx(	w3, w0)
	mov	w2, 40(rp)
	adox(	48,(rp), w0)
	mulx(	-8,(up), w2, w3)
	mov	w0, 48(rp)
	lea	64(rp), rp
	adcx(	w1, w2)
	mulx(	(up), w0, w1)
	jmp	L(tp7)

C Wind-down: last 3x3 and 2x2 corner products of the triangle.
L(corner3):
	mov	-24(up), u0
	mulx(	-16,(up), w0, w1)
	adox(	-8,(rp), w0)
	mulx(	-8,(up), w2, w3)
	mov	w0, -8(rp)
	lea	8(rp), rp
	adcx(	w1, w2)
	mulx(	(up), w0, w1)
	adox(	-8,(rp), w2)
	adcx(	w3, w0)
	mov	w2, -8(rp)
	adox(	(rp), w0)
	adox(	%rcx, w1)		C relies on rcx = 0
	adcx(	%rcx, w1)		C relies on rcx = 0
L(corner2):
	mov	-16(up), u0
	mulx(	-8,(up), w2, w3)
	mulx(	(up), %rax, %rbx)
	adox(	w0, w2)
	adcx(	w3, %rax)
	mov	w2, (rp)
	adox(	w1, %rax)
	adox(	%rcx, %rbx)		C relies on rcx = 0
	mov	%rax, 8(rp)
	adc	%rcx, %rbx		C relies on rcx = 0
	mov	-8(up), %rdx
	mulx(	(up), %rax, %rdx)
	add	%rbx, %rax
	mov	%rax, 16(rp)
	adc	%rcx, %rdx		C relies on rcx = 0
	mov	%rdx, 24(rp)

C Add the diagonal squares to twice the off-diagonal triangle:
C rp[] = 2*rp[] + up[i]^2 terms.  Three experimental variants; 2 is default.
L(sqr_diag_addlsh1):
	pop	n
	pop	up
	pop	rp

ifdef(`SDA_VARIANT',,`define(`SDA_VARIANT', 2)')

ifelse(SDA_VARIANT,1,`
	lea	(n,n), %rax
	movq	$0, -8(rp,%rax,8)	C FIXME
	test	R32(%rax), R32(%rax)
	mov	(up), %rdx
	lea	8(up), up
	mulx(	%rdx, %r8, %rdx)
	jmp	L(dm)

	ALIGN(16)
L(dtop):mov	8(rp), %r9
	adcx(	%r9, %r9)
	adox(	%rdx, %r9)
	mov	%r9, 8(rp)
	lea	16(rp), rp
	jrcxz	L(dend)
	mov	(up), %rdx
	mulx(	%rdx, %rax, %rdx)
	lea	8(up), up
	mov	(rp), %r8
	adcx(	%r8, %r8)
	adox(	%rax, %r8)
L(dm):	mov	%r8, (rp)
	lea	-1(n), n
	jmp	L(dtop)
L(dend):
')

ifelse(SDA_VARIANT,2,`
	dec	R32(n)
	mov	(up), %rdx
	xor	R32(%rbx), R32(%rbx)	C clear CF as side effect
	mulx(	%rdx, %rax, %r10)
	mov	%rax, (rp)
	mov	8(rp), %r8
	mov	16(rp), %r9
	jmp	L(dm)

	ALIGN(16)
L(dtop):mov	24(rp), %r8
	mov	32(rp), %r9
	lea	16(rp), rp
	lea	(%rdx,%rbx), %r10
L(dm):	adc	%r8, %r8
	adc	%r9, %r9
	setc	R8(%rbx)		C remember shifted-out carry
	mov	8(up), %rdx
	lea	8(up), up
	mulx(	%rdx, %rax, %rdx)
	add	%r10, %r8
	adc	%rax, %r9
	mov	%r8, 8(rp)
	mov	%r9, 16(rp)
	dec	R32(n)
	jnz	L(dtop)

L(dend):adc	%rbx, %rdx
	mov	%rdx, 24(rp)
')

ifelse(SDA_VARIANT,3,`
	dec	R32(n)
	mov	(up), %rdx
	test	R32(%rbx), R32(%rbx)	C clear CF and OF
	mulx(	%rdx, %rax, %r10)
	mov	%rax, (rp)
	mov	8(rp), %r8
	mov	16(rp), %r9
	jmp	L(dm)

	ALIGN(16)
L(dtop):jrcxz	L(dend)
	mov	24(rp), %r8
	mov	32(rp), %r9
	lea	16(rp), rp
L(dm):	adcx(	%r8, %r8)
	adcx(	%r9, %r9)
	mov	8(up), %rdx
	lea	8(up), up
	adox(	%r10, %r8)
	mulx(	%rdx, %rax, %r10)
	adox(	%rax, %r9)
	mov	%r8, 8(rp)
	mov	%r9, 16(rp)
	lea	-1(n), R32(n)
	jmp	L(dtop)
L(dend):adcx(	%rcx, %r10)
	adox(	%rcx, %r10)
	mov	%r10, 24(rp)
')

	pop	%rbx
	FUNC_EXIT()
	ret

	JUMPTABSECT
	ALIGN(8)
L(mtab):JMPENT(	L(mf7), L(mtab))
	JMPENT(	L(mf0), L(mtab))
	JMPENT(	L(mf1), L(mtab))
	JMPENT(	L(mf2), L(mtab))
	JMPENT(	L(mf3), L(mtab))
	JMPENT(	L(mf4), L(mtab))
	JMPENT(	L(mf5), L(mtab))
	JMPENT(	L(mf6), L(mtab))
L(atab):JMPENT(	L(f6), L(atab))
	JMPENT(	L(f7), L(atab))
	JMPENT(	L(f0), L(atab))
	JMPENT(	L(f1), L(atab))
	JMPENT(	L(f2), L(atab))
	JMPENT(	L(f3), L(atab))
	JMPENT(	L(f4), L(atab))
	JMPENT(	L(f5), L(atab))
	TEXT
EPILOGUE()