# lib/libcrypto/asm/x86_64-mont.s  (DragonFly BSD libcrypto)
# blob 8bb3b2124a93cbaeb9280ba0a4998900b7cf9499
# (Unrelated commit subject from the scraped page, kept for provenance:
#  "if_iwm - Recognize IWM_FW_PAGING_BLOCK_CMD wide cmd response correctly.")
1 .text
5 .globl bn_mul_mont
6 .type bn_mul_mont,@function
7 .align 16
8 bn_mul_mont:
9 testl $3,%r9d
10 jnz .Lmul_enter
11 cmpl $8,%r9d
12 jb .Lmul_enter
13 movl OPENSSL_ia32cap_P+8(%rip),%r11d
14 cmpq %rsi,%rdx
15 jne .Lmul4x_enter
16 testl $7,%r9d
17 jz .Lsqr8x_enter
18 jmp .Lmul4x_enter
20 .align 16
21 .Lmul_enter:
22 pushq %rbx
23 pushq %rbp
24 pushq %r12
25 pushq %r13
26 pushq %r14
27 pushq %r15
29 movl %r9d,%r9d
30 leaq 2(%r9),%r10
31 movq %rsp,%r11
32 negq %r10
33 leaq (%rsp,%r10,8),%rsp
34 andq $-1024,%rsp
36 movq %r11,8(%rsp,%r9,8)
37 .Lmul_body:
44 subq %rsp,%r11
45 andq $-4096,%r11
46 .Lmul_page_walk:
47 movq (%rsp,%r11,1),%r10
48 subq $4096,%r11
49 .byte 0x66,0x2e
50 jnc .Lmul_page_walk
52 movq %rdx,%r12
53 movq (%r8),%r8
54 movq (%r12),%rbx
55 movq (%rsi),%rax
57 xorq %r14,%r14
58 xorq %r15,%r15
60 movq %r8,%rbp
61 mulq %rbx
62 movq %rax,%r10
63 movq (%rcx),%rax
65 imulq %r10,%rbp
66 movq %rdx,%r11
68 mulq %rbp
69 addq %rax,%r10
70 movq 8(%rsi),%rax
71 adcq $0,%rdx
72 movq %rdx,%r13
74 leaq 1(%r15),%r15
75 jmp .L1st_enter
77 .align 16
78 .L1st:
79 addq %rax,%r13
80 movq (%rsi,%r15,8),%rax
81 adcq $0,%rdx
82 addq %r11,%r13
83 movq %r10,%r11
84 adcq $0,%rdx
85 movq %r13,-16(%rsp,%r15,8)
86 movq %rdx,%r13
88 .L1st_enter:
89 mulq %rbx
90 addq %rax,%r11
91 movq (%rcx,%r15,8),%rax
92 adcq $0,%rdx
93 leaq 1(%r15),%r15
94 movq %rdx,%r10
96 mulq %rbp
97 cmpq %r9,%r15
98 jne .L1st
100 addq %rax,%r13
101 movq (%rsi),%rax
102 adcq $0,%rdx
103 addq %r11,%r13
104 adcq $0,%rdx
105 movq %r13,-16(%rsp,%r15,8)
106 movq %rdx,%r13
107 movq %r10,%r11
109 xorq %rdx,%rdx
110 addq %r11,%r13
111 adcq $0,%rdx
112 movq %r13,-8(%rsp,%r9,8)
113 movq %rdx,(%rsp,%r9,8)
115 leaq 1(%r14),%r14
116 jmp .Louter
117 .align 16
118 .Louter:
119 movq (%r12,%r14,8),%rbx
120 xorq %r15,%r15
121 movq %r8,%rbp
122 movq (%rsp),%r10
123 mulq %rbx
124 addq %rax,%r10
125 movq (%rcx),%rax
126 adcq $0,%rdx
128 imulq %r10,%rbp
129 movq %rdx,%r11
131 mulq %rbp
132 addq %rax,%r10
133 movq 8(%rsi),%rax
134 adcq $0,%rdx
135 movq 8(%rsp),%r10
136 movq %rdx,%r13
138 leaq 1(%r15),%r15
139 jmp .Linner_enter
141 .align 16
142 .Linner:
143 addq %rax,%r13
144 movq (%rsi,%r15,8),%rax
145 adcq $0,%rdx
146 addq %r10,%r13
147 movq (%rsp,%r15,8),%r10
148 adcq $0,%rdx
149 movq %r13,-16(%rsp,%r15,8)
150 movq %rdx,%r13
152 .Linner_enter:
153 mulq %rbx
154 addq %rax,%r11
155 movq (%rcx,%r15,8),%rax
156 adcq $0,%rdx
157 addq %r11,%r10
158 movq %rdx,%r11
159 adcq $0,%r11
160 leaq 1(%r15),%r15
162 mulq %rbp
163 cmpq %r9,%r15
164 jne .Linner
166 addq %rax,%r13
167 movq (%rsi),%rax
168 adcq $0,%rdx
169 addq %r10,%r13
170 movq (%rsp,%r15,8),%r10
171 adcq $0,%rdx
172 movq %r13,-16(%rsp,%r15,8)
173 movq %rdx,%r13
175 xorq %rdx,%rdx
176 addq %r11,%r13
177 adcq $0,%rdx
178 addq %r10,%r13
179 adcq $0,%rdx
180 movq %r13,-8(%rsp,%r9,8)
181 movq %rdx,(%rsp,%r9,8)
183 leaq 1(%r14),%r14
184 cmpq %r9,%r14
185 jb .Louter
187 xorq %r14,%r14
188 movq (%rsp),%rax
189 leaq (%rsp),%rsi
190 movq %r9,%r15
191 jmp .Lsub
192 .align 16
193 .Lsub: sbbq (%rcx,%r14,8),%rax
194 movq %rax,(%rdi,%r14,8)
195 movq 8(%rsi,%r14,8),%rax
196 leaq 1(%r14),%r14
197 decq %r15
198 jnz .Lsub
200 sbbq $0,%rax
201 xorq %r14,%r14
202 andq %rax,%rsi
203 notq %rax
204 movq %rdi,%rcx
205 andq %rax,%rcx
206 movq %r9,%r15
207 orq %rcx,%rsi
208 .align 16
209 .Lcopy:
210 movq (%rsi,%r14,8),%rax
211 movq %r14,(%rsp,%r14,8)
212 movq %rax,(%rdi,%r14,8)
213 leaq 1(%r14),%r14
214 subq $1,%r15
215 jnz .Lcopy
217 movq 8(%rsp,%r9,8),%rsi
218 movq $1,%rax
219 movq (%rsi),%r15
220 movq 8(%rsi),%r14
221 movq 16(%rsi),%r13
222 movq 24(%rsi),%r12
223 movq 32(%rsi),%rbp
224 movq 40(%rsi),%rbx
225 leaq 48(%rsi),%rsp
226 .Lmul_epilogue:
227 .byte 0xf3,0xc3
228 .size bn_mul_mont,.-bn_mul_mont
229 .type bn_mul4x_mont,@function
230 .align 16
231 bn_mul4x_mont:
232 .Lmul4x_enter:
233 andl $0x80100,%r11d
234 cmpl $0x80100,%r11d
235 je .Lmulx4x_enter
236 pushq %rbx
237 pushq %rbp
238 pushq %r12
239 pushq %r13
240 pushq %r14
241 pushq %r15
243 movl %r9d,%r9d
244 leaq 4(%r9),%r10
245 movq %rsp,%r11
246 negq %r10
247 leaq (%rsp,%r10,8),%rsp
248 andq $-1024,%rsp
250 movq %r11,8(%rsp,%r9,8)
251 .Lmul4x_body:
252 subq %rsp,%r11
253 andq $-4096,%r11
254 .Lmul4x_page_walk:
255 movq (%rsp,%r11,1),%r10
256 subq $4096,%r11
257 .byte 0x2e
258 jnc .Lmul4x_page_walk
260 movq %rdi,16(%rsp,%r9,8)
261 movq %rdx,%r12
262 movq (%r8),%r8
263 movq (%r12),%rbx
264 movq (%rsi),%rax
266 xorq %r14,%r14
267 xorq %r15,%r15
269 movq %r8,%rbp
270 mulq %rbx
271 movq %rax,%r10
272 movq (%rcx),%rax
274 imulq %r10,%rbp
275 movq %rdx,%r11
277 mulq %rbp
278 addq %rax,%r10
279 movq 8(%rsi),%rax
280 adcq $0,%rdx
281 movq %rdx,%rdi
283 mulq %rbx
284 addq %rax,%r11
285 movq 8(%rcx),%rax
286 adcq $0,%rdx
287 movq %rdx,%r10
289 mulq %rbp
290 addq %rax,%rdi
291 movq 16(%rsi),%rax
292 adcq $0,%rdx
293 addq %r11,%rdi
294 leaq 4(%r15),%r15
295 adcq $0,%rdx
296 movq %rdi,(%rsp)
297 movq %rdx,%r13
298 jmp .L1st4x
299 .align 16
300 .L1st4x:
301 mulq %rbx
302 addq %rax,%r10
303 movq -16(%rcx,%r15,8),%rax
304 adcq $0,%rdx
305 movq %rdx,%r11
307 mulq %rbp
308 addq %rax,%r13
309 movq -8(%rsi,%r15,8),%rax
310 adcq $0,%rdx
311 addq %r10,%r13
312 adcq $0,%rdx
313 movq %r13,-24(%rsp,%r15,8)
314 movq %rdx,%rdi
316 mulq %rbx
317 addq %rax,%r11
318 movq -8(%rcx,%r15,8),%rax
319 adcq $0,%rdx
320 movq %rdx,%r10
322 mulq %rbp
323 addq %rax,%rdi
324 movq (%rsi,%r15,8),%rax
325 adcq $0,%rdx
326 addq %r11,%rdi
327 adcq $0,%rdx
328 movq %rdi,-16(%rsp,%r15,8)
329 movq %rdx,%r13
331 mulq %rbx
332 addq %rax,%r10
333 movq (%rcx,%r15,8),%rax
334 adcq $0,%rdx
335 movq %rdx,%r11
337 mulq %rbp
338 addq %rax,%r13
339 movq 8(%rsi,%r15,8),%rax
340 adcq $0,%rdx
341 addq %r10,%r13
342 adcq $0,%rdx
343 movq %r13,-8(%rsp,%r15,8)
344 movq %rdx,%rdi
346 mulq %rbx
347 addq %rax,%r11
348 movq 8(%rcx,%r15,8),%rax
349 adcq $0,%rdx
350 leaq 4(%r15),%r15
351 movq %rdx,%r10
353 mulq %rbp
354 addq %rax,%rdi
355 movq -16(%rsi,%r15,8),%rax
356 adcq $0,%rdx
357 addq %r11,%rdi
358 adcq $0,%rdx
359 movq %rdi,-32(%rsp,%r15,8)
360 movq %rdx,%r13
361 cmpq %r9,%r15
362 jb .L1st4x
364 mulq %rbx
365 addq %rax,%r10
366 movq -16(%rcx,%r15,8),%rax
367 adcq $0,%rdx
368 movq %rdx,%r11
370 mulq %rbp
371 addq %rax,%r13
372 movq -8(%rsi,%r15,8),%rax
373 adcq $0,%rdx
374 addq %r10,%r13
375 adcq $0,%rdx
376 movq %r13,-24(%rsp,%r15,8)
377 movq %rdx,%rdi
379 mulq %rbx
380 addq %rax,%r11
381 movq -8(%rcx,%r15,8),%rax
382 adcq $0,%rdx
383 movq %rdx,%r10
385 mulq %rbp
386 addq %rax,%rdi
387 movq (%rsi),%rax
388 adcq $0,%rdx
389 addq %r11,%rdi
390 adcq $0,%rdx
391 movq %rdi,-16(%rsp,%r15,8)
392 movq %rdx,%r13
394 xorq %rdi,%rdi
395 addq %r10,%r13
396 adcq $0,%rdi
397 movq %r13,-8(%rsp,%r15,8)
398 movq %rdi,(%rsp,%r15,8)
400 leaq 1(%r14),%r14
401 .align 4
402 .Louter4x:
403 movq (%r12,%r14,8),%rbx
404 xorq %r15,%r15
405 movq (%rsp),%r10
406 movq %r8,%rbp
407 mulq %rbx
408 addq %rax,%r10
409 movq (%rcx),%rax
410 adcq $0,%rdx
412 imulq %r10,%rbp
413 movq %rdx,%r11
415 mulq %rbp
416 addq %rax,%r10
417 movq 8(%rsi),%rax
418 adcq $0,%rdx
419 movq %rdx,%rdi
421 mulq %rbx
422 addq %rax,%r11
423 movq 8(%rcx),%rax
424 adcq $0,%rdx
425 addq 8(%rsp),%r11
426 adcq $0,%rdx
427 movq %rdx,%r10
429 mulq %rbp
430 addq %rax,%rdi
431 movq 16(%rsi),%rax
432 adcq $0,%rdx
433 addq %r11,%rdi
434 leaq 4(%r15),%r15
435 adcq $0,%rdx
436 movq %rdi,(%rsp)
437 movq %rdx,%r13
438 jmp .Linner4x
439 .align 16
440 .Linner4x:
441 mulq %rbx
442 addq %rax,%r10
443 movq -16(%rcx,%r15,8),%rax
444 adcq $0,%rdx
445 addq -16(%rsp,%r15,8),%r10
446 adcq $0,%rdx
447 movq %rdx,%r11
449 mulq %rbp
450 addq %rax,%r13
451 movq -8(%rsi,%r15,8),%rax
452 adcq $0,%rdx
453 addq %r10,%r13
454 adcq $0,%rdx
455 movq %r13,-24(%rsp,%r15,8)
456 movq %rdx,%rdi
458 mulq %rbx
459 addq %rax,%r11
460 movq -8(%rcx,%r15,8),%rax
461 adcq $0,%rdx
462 addq -8(%rsp,%r15,8),%r11
463 adcq $0,%rdx
464 movq %rdx,%r10
466 mulq %rbp
467 addq %rax,%rdi
468 movq (%rsi,%r15,8),%rax
469 adcq $0,%rdx
470 addq %r11,%rdi
471 adcq $0,%rdx
472 movq %rdi,-16(%rsp,%r15,8)
473 movq %rdx,%r13
475 mulq %rbx
476 addq %rax,%r10
477 movq (%rcx,%r15,8),%rax
478 adcq $0,%rdx
479 addq (%rsp,%r15,8),%r10
480 adcq $0,%rdx
481 movq %rdx,%r11
483 mulq %rbp
484 addq %rax,%r13
485 movq 8(%rsi,%r15,8),%rax
486 adcq $0,%rdx
487 addq %r10,%r13
488 adcq $0,%rdx
489 movq %r13,-8(%rsp,%r15,8)
490 movq %rdx,%rdi
492 mulq %rbx
493 addq %rax,%r11
494 movq 8(%rcx,%r15,8),%rax
495 adcq $0,%rdx
496 addq 8(%rsp,%r15,8),%r11
497 adcq $0,%rdx
498 leaq 4(%r15),%r15
499 movq %rdx,%r10
501 mulq %rbp
502 addq %rax,%rdi
503 movq -16(%rsi,%r15,8),%rax
504 adcq $0,%rdx
505 addq %r11,%rdi
506 adcq $0,%rdx
507 movq %rdi,-32(%rsp,%r15,8)
508 movq %rdx,%r13
509 cmpq %r9,%r15
510 jb .Linner4x
512 mulq %rbx
513 addq %rax,%r10
514 movq -16(%rcx,%r15,8),%rax
515 adcq $0,%rdx
516 addq -16(%rsp,%r15,8),%r10
517 adcq $0,%rdx
518 movq %rdx,%r11
520 mulq %rbp
521 addq %rax,%r13
522 movq -8(%rsi,%r15,8),%rax
523 adcq $0,%rdx
524 addq %r10,%r13
525 adcq $0,%rdx
526 movq %r13,-24(%rsp,%r15,8)
527 movq %rdx,%rdi
529 mulq %rbx
530 addq %rax,%r11
531 movq -8(%rcx,%r15,8),%rax
532 adcq $0,%rdx
533 addq -8(%rsp,%r15,8),%r11
534 adcq $0,%rdx
535 leaq 1(%r14),%r14
536 movq %rdx,%r10
538 mulq %rbp
539 addq %rax,%rdi
540 movq (%rsi),%rax
541 adcq $0,%rdx
542 addq %r11,%rdi
543 adcq $0,%rdx
544 movq %rdi,-16(%rsp,%r15,8)
545 movq %rdx,%r13
547 xorq %rdi,%rdi
548 addq %r10,%r13
549 adcq $0,%rdi
550 addq (%rsp,%r9,8),%r13
551 adcq $0,%rdi
552 movq %r13,-8(%rsp,%r15,8)
553 movq %rdi,(%rsp,%r15,8)
555 cmpq %r9,%r14
556 jb .Louter4x
557 movq 16(%rsp,%r9,8),%rdi
558 movq 0(%rsp),%rax
559 pxor %xmm0,%xmm0
560 movq 8(%rsp),%rdx
561 shrq $2,%r9
562 leaq (%rsp),%rsi
563 xorq %r14,%r14
565 subq 0(%rcx),%rax
566 movq 16(%rsi),%rbx
567 movq 24(%rsi),%rbp
568 sbbq 8(%rcx),%rdx
569 leaq -1(%r9),%r15
570 jmp .Lsub4x
571 .align 16
572 .Lsub4x:
573 movq %rax,0(%rdi,%r14,8)
574 movq %rdx,8(%rdi,%r14,8)
575 sbbq 16(%rcx,%r14,8),%rbx
576 movq 32(%rsi,%r14,8),%rax
577 movq 40(%rsi,%r14,8),%rdx
578 sbbq 24(%rcx,%r14,8),%rbp
579 movq %rbx,16(%rdi,%r14,8)
580 movq %rbp,24(%rdi,%r14,8)
581 sbbq 32(%rcx,%r14,8),%rax
582 movq 48(%rsi,%r14,8),%rbx
583 movq 56(%rsi,%r14,8),%rbp
584 sbbq 40(%rcx,%r14,8),%rdx
585 leaq 4(%r14),%r14
586 decq %r15
587 jnz .Lsub4x
589 movq %rax,0(%rdi,%r14,8)
590 movq 32(%rsi,%r14,8),%rax
591 sbbq 16(%rcx,%r14,8),%rbx
592 movq %rdx,8(%rdi,%r14,8)
593 sbbq 24(%rcx,%r14,8),%rbp
594 movq %rbx,16(%rdi,%r14,8)
596 sbbq $0,%rax
597 movq %rbp,24(%rdi,%r14,8)
598 xorq %r14,%r14
599 andq %rax,%rsi
600 notq %rax
601 movq %rdi,%rcx
602 andq %rax,%rcx
603 leaq -1(%r9),%r15
604 orq %rcx,%rsi
606 movdqu (%rsi),%xmm1
607 movdqa %xmm0,(%rsp)
608 movdqu %xmm1,(%rdi)
609 jmp .Lcopy4x
610 .align 16
611 .Lcopy4x:
612 movdqu 16(%rsi,%r14,1),%xmm2
613 movdqu 32(%rsi,%r14,1),%xmm1
614 movdqa %xmm0,16(%rsp,%r14,1)
615 movdqu %xmm2,16(%rdi,%r14,1)
616 movdqa %xmm0,32(%rsp,%r14,1)
617 movdqu %xmm1,32(%rdi,%r14,1)
618 leaq 32(%r14),%r14
619 decq %r15
620 jnz .Lcopy4x
622 shlq $2,%r9
623 movdqu 16(%rsi,%r14,1),%xmm2
624 movdqa %xmm0,16(%rsp,%r14,1)
625 movdqu %xmm2,16(%rdi,%r14,1)
626 movq 8(%rsp,%r9,8),%rsi
627 movq $1,%rax
628 movq (%rsi),%r15
629 movq 8(%rsi),%r14
630 movq 16(%rsi),%r13
631 movq 24(%rsi),%r12
632 movq 32(%rsi),%rbp
633 movq 40(%rsi),%rbx
634 leaq 48(%rsi),%rsp
635 .Lmul4x_epilogue:
636 .byte 0xf3,0xc3
637 .size bn_mul4x_mont,.-bn_mul4x_mont
641 .type bn_sqr8x_mont,@function
642 .align 32
643 bn_sqr8x_mont:
644 .Lsqr8x_enter:
645 movq %rsp,%rax
646 pushq %rbx
647 pushq %rbp
648 pushq %r12
649 pushq %r13
650 pushq %r14
651 pushq %r15
653 movl %r9d,%r10d
654 shll $3,%r9d
655 shlq $3+2,%r10
656 negq %r9
663 leaq -64(%rsp,%r9,2),%r11
664 movq (%r8),%r8
665 subq %rsi,%r11
666 andq $4095,%r11
667 cmpq %r11,%r10
668 jb .Lsqr8x_sp_alt
669 subq %r11,%rsp
670 leaq -64(%rsp,%r9,2),%rsp
671 jmp .Lsqr8x_sp_done
673 .align 32
674 .Lsqr8x_sp_alt:
675 leaq 4096-64(,%r9,2),%r10
676 leaq -64(%rsp,%r9,2),%rsp
677 subq %r10,%r11
678 movq $0,%r10
679 cmovcq %r10,%r11
680 subq %r11,%rsp
681 .Lsqr8x_sp_done:
682 andq $-64,%rsp
683 movq %rax,%r11
684 subq %rsp,%r11
685 andq $-4096,%r11
686 .Lsqr8x_page_walk:
687 movq (%rsp,%r11,1),%r10
688 subq $4096,%r11
689 .byte 0x2e
690 jnc .Lsqr8x_page_walk
692 movq %r9,%r10
693 negq %r9
695 movq %r8,32(%rsp)
696 movq %rax,40(%rsp)
697 .Lsqr8x_body:
699 .byte 102,72,15,110,209
700 pxor %xmm0,%xmm0
701 .byte 102,72,15,110,207
702 .byte 102,73,15,110,218
703 movl OPENSSL_ia32cap_P+8(%rip),%eax
704 andl $0x80100,%eax
705 cmpl $0x80100,%eax
706 jne .Lsqr8x_nox
708 call bn_sqrx8x_internal
713 leaq (%r8,%rcx,1),%rbx
714 movq %rcx,%r9
715 movq %rcx,%rdx
716 .byte 102,72,15,126,207
717 sarq $3+2,%rcx
718 jmp .Lsqr8x_sub
720 .align 32
721 .Lsqr8x_nox:
722 call bn_sqr8x_internal
727 leaq (%rdi,%r9,1),%rbx
728 movq %r9,%rcx
729 movq %r9,%rdx
730 .byte 102,72,15,126,207
731 sarq $3+2,%rcx
732 jmp .Lsqr8x_sub
734 .align 32
735 .Lsqr8x_sub:
736 movq 0(%rbx),%r12
737 movq 8(%rbx),%r13
738 movq 16(%rbx),%r14
739 movq 24(%rbx),%r15
740 leaq 32(%rbx),%rbx
741 sbbq 0(%rbp),%r12
742 sbbq 8(%rbp),%r13
743 sbbq 16(%rbp),%r14
744 sbbq 24(%rbp),%r15
745 leaq 32(%rbp),%rbp
746 movq %r12,0(%rdi)
747 movq %r13,8(%rdi)
748 movq %r14,16(%rdi)
749 movq %r15,24(%rdi)
750 leaq 32(%rdi),%rdi
751 incq %rcx
752 jnz .Lsqr8x_sub
754 sbbq $0,%rax
755 leaq (%rbx,%r9,1),%rbx
756 leaq (%rdi,%r9,1),%rdi
758 .byte 102,72,15,110,200
759 pxor %xmm0,%xmm0
760 pshufd $0,%xmm1,%xmm1
761 movq 40(%rsp),%rsi
762 jmp .Lsqr8x_cond_copy
764 .align 32
765 .Lsqr8x_cond_copy:
766 movdqa 0(%rbx),%xmm2
767 movdqa 16(%rbx),%xmm3
768 leaq 32(%rbx),%rbx
769 movdqu 0(%rdi),%xmm4
770 movdqu 16(%rdi),%xmm5
771 leaq 32(%rdi),%rdi
772 movdqa %xmm0,-32(%rbx)
773 movdqa %xmm0,-16(%rbx)
774 movdqa %xmm0,-32(%rbx,%rdx,1)
775 movdqa %xmm0,-16(%rbx,%rdx,1)
776 pcmpeqd %xmm1,%xmm0
777 pand %xmm1,%xmm2
778 pand %xmm1,%xmm3
779 pand %xmm0,%xmm4
780 pand %xmm0,%xmm5
781 pxor %xmm0,%xmm0
782 por %xmm2,%xmm4
783 por %xmm3,%xmm5
784 movdqu %xmm4,-32(%rdi)
785 movdqu %xmm5,-16(%rdi)
786 addq $32,%r9
787 jnz .Lsqr8x_cond_copy
789 movq $1,%rax
790 movq -48(%rsi),%r15
791 movq -40(%rsi),%r14
792 movq -32(%rsi),%r13
793 movq -24(%rsi),%r12
794 movq -16(%rsi),%rbp
795 movq -8(%rsi),%rbx
796 leaq (%rsi),%rsp
797 .Lsqr8x_epilogue:
798 .byte 0xf3,0xc3
799 .size bn_sqr8x_mont,.-bn_sqr8x_mont
800 .type bn_mulx4x_mont,@function
801 .align 32
802 bn_mulx4x_mont:
803 .Lmulx4x_enter:
804 movq %rsp,%rax
805 pushq %rbx
806 pushq %rbp
807 pushq %r12
808 pushq %r13
809 pushq %r14
810 pushq %r15
812 shll $3,%r9d
813 .byte 0x67
814 xorq %r10,%r10
815 subq %r9,%r10
816 movq (%r8),%r8
817 leaq -72(%rsp,%r10,1),%rsp
818 andq $-128,%rsp
819 movq %rax,%r11
820 subq %rsp,%r11
821 andq $-4096,%r11
822 .Lmulx4x_page_walk:
823 movq (%rsp,%r11,1),%r10
824 subq $4096,%r11
825 .byte 0x66,0x2e
826 jnc .Lmulx4x_page_walk
828 leaq (%rdx,%r9,1),%r10
841 movq %r9,0(%rsp)
842 shrq $5,%r9
843 movq %r10,16(%rsp)
844 subq $1,%r9
845 movq %r8,24(%rsp)
846 movq %rdi,32(%rsp)
847 movq %rax,40(%rsp)
848 movq %r9,48(%rsp)
849 jmp .Lmulx4x_body
851 .align 32
852 .Lmulx4x_body:
853 leaq 8(%rdx),%rdi
854 movq (%rdx),%rdx
855 leaq 64+32(%rsp),%rbx
856 movq %rdx,%r9
858 mulxq 0(%rsi),%r8,%rax
859 mulxq 8(%rsi),%r11,%r14
860 addq %rax,%r11
861 movq %rdi,8(%rsp)
862 mulxq 16(%rsi),%r12,%r13
863 adcq %r14,%r12
864 adcq $0,%r13
866 movq %r8,%rdi
867 imulq 24(%rsp),%r8
868 xorq %rbp,%rbp
870 mulxq 24(%rsi),%rax,%r14
871 movq %r8,%rdx
872 leaq 32(%rsi),%rsi
873 adcxq %rax,%r13
874 adcxq %rbp,%r14
876 mulxq 0(%rcx),%rax,%r10
877 adcxq %rax,%rdi
878 adoxq %r11,%r10
879 mulxq 8(%rcx),%rax,%r11
880 adcxq %rax,%r10
881 adoxq %r12,%r11
882 .byte 0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00
883 movq 48(%rsp),%rdi
884 movq %r10,-32(%rbx)
885 adcxq %rax,%r11
886 adoxq %r13,%r12
887 mulxq 24(%rcx),%rax,%r15
888 movq %r9,%rdx
889 movq %r11,-24(%rbx)
890 adcxq %rax,%r12
891 adoxq %rbp,%r15
892 leaq 32(%rcx),%rcx
893 movq %r12,-16(%rbx)
895 jmp .Lmulx4x_1st
897 .align 32
898 .Lmulx4x_1st:
899 adcxq %rbp,%r15
900 mulxq 0(%rsi),%r10,%rax
901 adcxq %r14,%r10
902 mulxq 8(%rsi),%r11,%r14
903 adcxq %rax,%r11
904 mulxq 16(%rsi),%r12,%rax
905 adcxq %r14,%r12
906 mulxq 24(%rsi),%r13,%r14
907 .byte 0x67,0x67
908 movq %r8,%rdx
909 adcxq %rax,%r13
910 adcxq %rbp,%r14
911 leaq 32(%rsi),%rsi
912 leaq 32(%rbx),%rbx
914 adoxq %r15,%r10
915 mulxq 0(%rcx),%rax,%r15
916 adcxq %rax,%r10
917 adoxq %r15,%r11
918 mulxq 8(%rcx),%rax,%r15
919 adcxq %rax,%r11
920 adoxq %r15,%r12
921 mulxq 16(%rcx),%rax,%r15
922 movq %r10,-40(%rbx)
923 adcxq %rax,%r12
924 movq %r11,-32(%rbx)
925 adoxq %r15,%r13
926 mulxq 24(%rcx),%rax,%r15
927 movq %r9,%rdx
928 movq %r12,-24(%rbx)
929 adcxq %rax,%r13
930 adoxq %rbp,%r15
931 leaq 32(%rcx),%rcx
932 movq %r13,-16(%rbx)
934 decq %rdi
935 jnz .Lmulx4x_1st
937 movq 0(%rsp),%rax
938 movq 8(%rsp),%rdi
939 adcq %rbp,%r15
940 addq %r15,%r14
941 sbbq %r15,%r15
942 movq %r14,-8(%rbx)
943 jmp .Lmulx4x_outer
945 .align 32
946 .Lmulx4x_outer:
947 movq (%rdi),%rdx
948 leaq 8(%rdi),%rdi
949 subq %rax,%rsi
950 movq %r15,(%rbx)
951 leaq 64+32(%rsp),%rbx
952 subq %rax,%rcx
954 mulxq 0(%rsi),%r8,%r11
955 xorl %ebp,%ebp
956 movq %rdx,%r9
957 mulxq 8(%rsi),%r14,%r12
958 adoxq -32(%rbx),%r8
959 adcxq %r14,%r11
960 mulxq 16(%rsi),%r15,%r13
961 adoxq -24(%rbx),%r11
962 adcxq %r15,%r12
963 adoxq %rbp,%r12
964 adcxq %rbp,%r13
966 movq %rdi,8(%rsp)
967 .byte 0x67
968 movq %r8,%r15
969 imulq 24(%rsp),%r8
970 xorl %ebp,%ebp
972 mulxq 24(%rsi),%rax,%r14
973 movq %r8,%rdx
974 adoxq -16(%rbx),%r12
975 adcxq %rax,%r13
976 adoxq -8(%rbx),%r13
977 adcxq %rbp,%r14
978 leaq 32(%rsi),%rsi
979 adoxq %rbp,%r14
981 mulxq 0(%rcx),%rax,%r10
982 adcxq %rax,%r15
983 adoxq %r11,%r10
984 mulxq 8(%rcx),%rax,%r11
985 adcxq %rax,%r10
986 adoxq %r12,%r11
987 mulxq 16(%rcx),%rax,%r12
988 movq %r10,-32(%rbx)
989 adcxq %rax,%r11
990 adoxq %r13,%r12
991 mulxq 24(%rcx),%rax,%r15
992 movq %r9,%rdx
993 movq %r11,-24(%rbx)
994 leaq 32(%rcx),%rcx
995 adcxq %rax,%r12
996 adoxq %rbp,%r15
997 movq 48(%rsp),%rdi
998 movq %r12,-16(%rbx)
1000 jmp .Lmulx4x_inner
1002 .align 32
1003 .Lmulx4x_inner:
1004 mulxq 0(%rsi),%r10,%rax
1005 adcxq %rbp,%r15
1006 adoxq %r14,%r10
1007 mulxq 8(%rsi),%r11,%r14
1008 adcxq 0(%rbx),%r10
1009 adoxq %rax,%r11
1010 mulxq 16(%rsi),%r12,%rax
1011 adcxq 8(%rbx),%r11
1012 adoxq %r14,%r12
1013 mulxq 24(%rsi),%r13,%r14
1014 movq %r8,%rdx
1015 adcxq 16(%rbx),%r12
1016 adoxq %rax,%r13
1017 adcxq 24(%rbx),%r13
1018 adoxq %rbp,%r14
1019 leaq 32(%rsi),%rsi
1020 leaq 32(%rbx),%rbx
1021 adcxq %rbp,%r14
1023 adoxq %r15,%r10
1024 mulxq 0(%rcx),%rax,%r15
1025 adcxq %rax,%r10
1026 adoxq %r15,%r11
1027 mulxq 8(%rcx),%rax,%r15
1028 adcxq %rax,%r11
1029 adoxq %r15,%r12
1030 mulxq 16(%rcx),%rax,%r15
1031 movq %r10,-40(%rbx)
1032 adcxq %rax,%r12
1033 adoxq %r15,%r13
1034 mulxq 24(%rcx),%rax,%r15
1035 movq %r9,%rdx
1036 movq %r11,-32(%rbx)
1037 movq %r12,-24(%rbx)
1038 adcxq %rax,%r13
1039 adoxq %rbp,%r15
1040 leaq 32(%rcx),%rcx
1041 movq %r13,-16(%rbx)
1043 decq %rdi
1044 jnz .Lmulx4x_inner
1046 movq 0(%rsp),%rax
1047 movq 8(%rsp),%rdi
1048 adcq %rbp,%r15
1049 subq 0(%rbx),%rbp
1050 adcq %r15,%r14
1051 sbbq %r15,%r15
1052 movq %r14,-8(%rbx)
1054 cmpq 16(%rsp),%rdi
1055 jne .Lmulx4x_outer
1057 leaq 64(%rsp),%rbx
1058 subq %rax,%rcx
1059 negq %r15
1060 movq %rax,%rdx
1061 shrq $3+2,%rax
1062 movq 32(%rsp),%rdi
1063 jmp .Lmulx4x_sub
1065 .align 32
1066 .Lmulx4x_sub:
1067 movq 0(%rbx),%r11
1068 movq 8(%rbx),%r12
1069 movq 16(%rbx),%r13
1070 movq 24(%rbx),%r14
1071 leaq 32(%rbx),%rbx
1072 sbbq 0(%rcx),%r11
1073 sbbq 8(%rcx),%r12
1074 sbbq 16(%rcx),%r13
1075 sbbq 24(%rcx),%r14
1076 leaq 32(%rcx),%rcx
1077 movq %r11,0(%rdi)
1078 movq %r12,8(%rdi)
1079 movq %r13,16(%rdi)
1080 movq %r14,24(%rdi)
1081 leaq 32(%rdi),%rdi
1082 decq %rax
1083 jnz .Lmulx4x_sub
1085 sbbq $0,%r15
1086 leaq 64(%rsp),%rbx
1087 subq %rdx,%rdi
1089 .byte 102,73,15,110,207
1090 pxor %xmm0,%xmm0
1091 pshufd $0,%xmm1,%xmm1
1092 movq 40(%rsp),%rsi
1093 jmp .Lmulx4x_cond_copy
1095 .align 32
1096 .Lmulx4x_cond_copy:
1097 movdqa 0(%rbx),%xmm2
1098 movdqa 16(%rbx),%xmm3
1099 leaq 32(%rbx),%rbx
1100 movdqu 0(%rdi),%xmm4
1101 movdqu 16(%rdi),%xmm5
1102 leaq 32(%rdi),%rdi
1103 movdqa %xmm0,-32(%rbx)
1104 movdqa %xmm0,-16(%rbx)
1105 pcmpeqd %xmm1,%xmm0
1106 pand %xmm1,%xmm2
1107 pand %xmm1,%xmm3
1108 pand %xmm0,%xmm4
1109 pand %xmm0,%xmm5
1110 pxor %xmm0,%xmm0
1111 por %xmm2,%xmm4
1112 por %xmm3,%xmm5
1113 movdqu %xmm4,-32(%rdi)
1114 movdqu %xmm5,-16(%rdi)
1115 subq $32,%rdx
1116 jnz .Lmulx4x_cond_copy
1118 movq %rdx,(%rbx)
1120 movq $1,%rax
1121 movq -48(%rsi),%r15
1122 movq -40(%rsi),%r14
1123 movq -32(%rsi),%r13
1124 movq -24(%rsi),%r12
1125 movq -16(%rsi),%rbp
1126 movq -8(%rsi),%rbx
1127 leaq (%rsi),%rsp
1128 .Lmulx4x_epilogue:
1129 .byte 0xf3,0xc3
1130 .size bn_mulx4x_mont,.-bn_mulx4x_mont
1131 .byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
1132 .align 16
1133 .section .note.GNU-stack,"",%progbits