OpenSSL 1.0.2g
[tomato.git] release/src/router/openssl/crypto/bn/asm/rsaz-x86_64.pl
1 #!/usr/bin/env perl
3 ##############################################################################
4 # #
5 # Copyright (c) 2012, Intel Corporation #
6 # #
7 # All rights reserved. #
8 # #
9 # Redistribution and use in source and binary forms, with or without #
10 # modification, are permitted provided that the following conditions are #
11 # met: #
12 # #
13 # * Redistributions of source code must retain the above copyright #
14 # notice, this list of conditions and the following disclaimer. #
15 # #
16 # * Redistributions in binary form must reproduce the above copyright #
17 # notice, this list of conditions and the following disclaimer in the #
18 # documentation and/or other materials provided with the #
19 # distribution. #
20 # #
21 # * Neither the name of the Intel Corporation nor the names of its #
22 # contributors may be used to endorse or promote products derived from #
23 # this software without specific prior written permission. #
24 # #
25 # #
26 # THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY #
27 # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE #
28 # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR #
29 # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR #
30 # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, #
31 # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, #
32 # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR #
33 # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF #
34 # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING #
35 # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS #
36 # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #
37 # #
38 ##############################################################################
39 # Developers and authors: #
40 # Shay Gueron (1, 2), and Vlad Krasnov (1) #
41 # (1) Intel Architecture Group, Microprocessor and Chipset Development, #
42 # Israel Development Center, Haifa, Israel #
43 # (2) University of Haifa #
44 ##############################################################################
45 # Reference: #
46 # [1] S. Gueron, "Efficient Software Implementations of Modular #
47 # Exponentiation", http://eprint.iacr.org/2011/239 #
48 # [2] S. Gueron, V. Krasnov. "Speeding up Big-Numbers Squaring". #
49 # IEEE Proceedings of 9th International Conference on Information #
50 # Technology: New Generations (ITNG 2012), 821-823 (2012). #
51 # [3] S. Gueron, Efficient Software Implementations of Modular Exponentiation#
52 # Journal of Cryptographic Engineering 2:31-43 (2012). #
53 # [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis #
54 # resistant 512-bit and 1024-bit modular exponentiation for optimizing #
55 # RSA1024 and RSA2048 on x86_64 platforms", #
56 # http://rt.openssl.org/Ticket/Display.html?id=2582&user=guest&pass=guest#
57 ##############################################################################
59 # While the original submission covers 512- and 1024-bit exponentiation,
60 # this module is limited to the 512-bit version only (and as such
61 # accelerates RSA1024 sign). This is because the improvement for longer
62 # keys is not high enough to justify the effort; the highest measured
63 # gain was ~5% on Westmere. [This is relative to OpenSSL 1.0.2, which
64 # was upcoming at the time of this writing!] Nor does this module
65 # implement a "monolithic" complete-exponentiation jumbo-subroutine; it
66 # adheres to a more modular mixture of C and assembly. It is also
67 # optimized for processors other than the Intel Core family (see the
68 # table below for improvement coefficients).
69 # <appro@openssl.org>
71 # RSA1024 sign/sec   this/original  |this/rsax(*)   this/fips(*)
72 # -------------------+---------------------------------------------
73 # Opteron                 +13%      |    +5%            +20%
74 # Bulldozer                -0%      |    -1%            +10%
75 # P4                      +11%      |    +7%             +8%
76 # Westmere                 +5%      |   +14%            +17%
77 # Sandy Bridge             +2%      |   +12%            +29%
78 # Ivy Bridge               +1%      |   +11%            +35%
79 # Haswell(**)              -0%      |   +12%            +39%
80 # Atom                    +13%      |   +11%             +4%
81 # VIA Nano                +70%      |    +9%            +25%
83 # (*) rsax engine and fips numbers are presented for reference
84 # purposes;
85 # (**) MULX was attempted, but found to give only marginal improvement;
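#
# The C half of that "modular mixture" lives in crypto/bn/rsaz_exp.c. As a
# rough sketch only (argument order inferred from the register assignments
# used further down in this file; the authoritative declarations are in
# crypto/bn/rsaz_exp.h, which uses BN_ULONG rather than uint64_t), the
# entry points below are called roughly as:
#
#	void rsaz_512_sqr(uint64_t *ret, const uint64_t *a,
#			  const uint64_t *mod, uint64_t n0, int times);
#	void rsaz_512_mul(uint64_t *ret, const uint64_t *a, const uint64_t *b,
#			  const uint64_t *mod, uint64_t n0);
#	void rsaz_512_mul_gather4(uint64_t *ret, const uint64_t *a,
#			  const uint64_t *table, const uint64_t *mod,
#			  uint64_t n0, int power);
#	void rsaz_512_mul_scatter4(uint64_t *ret, const uint64_t *a,
#			  const uint64_t *mod, uint64_t n0,
#			  uint64_t *table, int power);
#	void rsaz_512_mul_by_one(uint64_t *ret, const uint64_t *a,
#			  const uint64_t *mod, uint64_t n0);
#	void rsaz_512_scatter4(uint64_t *table, const uint64_t *val, int power);
#	void rsaz_512_gather4(uint64_t *val, const uint64_t *table, int power);
#
# All big-number operands are little-endian arrays of eight 64-bit limbs,
# and n0 is the usual Montgomery constant -mod^-1 mod 2^64.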
87 $flavour = shift;
88 $output = shift;
89 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
91 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
93 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
94 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
95 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
96 die "can't locate x86_64-xlate.pl";
98 open OUT,"| \"$^X\" $xlate $flavour $output";
99 *STDOUT=*OUT;
101 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
102 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
103 $addx = ($1>=2.23);
106 if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
107 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
108 $addx = ($1>=2.10);
111 if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
112 `ml64 2>&1` =~ /Version ([0-9]+)\./) {
113 $addx = ($1>=12);
116 if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9])\.([0-9]+)/) {
117 my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10
118 $addx = ($ver>=3.03);
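#
# Note: $addx enables the alternative MULX/ADCX/ADOX code paths below; each
# entry point additionally re-checks the CPU at run time against
# OPENSSL_ia32cap_P. The 0x80100 mask used in those run-time checks tests
# word 2 of the capability vector, which caches CPUID.(EAX=7,ECX=0).EBX:
#
#	bit  8 (0x00100) - BMI2, i.e. MULX
#	bit 19 (0x80000) - ADX,  i.e. ADCX/ADOX
#	0x80100 == (1<<8) | (1<<19)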
121 ($out, $inp, $mod) = ("%rdi", "%rsi", "%rbp"); # common internal API
123 my ($out,$inp,$mod,$n0,$times) = ("%rdi","%rsi","%rdx","%rcx","%r8d");
125 $code.=<<___;
126 .text
128 .extern OPENSSL_ia32cap_P
130 .globl rsaz_512_sqr
131 .type rsaz_512_sqr,\@function,5
132 .align 32
133 rsaz_512_sqr: # 25-29% faster than rsaz_512_mul
134 push %rbx
135 push %rbp
136 push %r12
137 push %r13
138 push %r14
139 push %r15
141 subq \$128+24, %rsp
142 .Lsqr_body:
143 movq $mod, %rbp # common argument
144 movq ($inp), %rdx
145 movq 8($inp), %rax
146 movq $n0, 128(%rsp)
148 $code.=<<___ if ($addx);
149 movl \$0x80100,%r11d
150 andl OPENSSL_ia32cap_P+8(%rip),%r11d
151 cmpl \$0x80100,%r11d # check for MULX and ADO/CX
152 je .Loop_sqrx
154 $code.=<<___;
155 jmp .Loop_sqr
157 .align 32
158 .Loop_sqr:
159 movl $times,128+8(%rsp)
160 #first iteration
161 movq %rdx, %rbx
162 mulq %rdx
163 movq %rax, %r8
164 movq 16($inp), %rax
165 movq %rdx, %r9
167 mulq %rbx
168 addq %rax, %r9
169 movq 24($inp), %rax
170 movq %rdx, %r10
171 adcq \$0, %r10
173 mulq %rbx
174 addq %rax, %r10
175 movq 32($inp), %rax
176 movq %rdx, %r11
177 adcq \$0, %r11
179 mulq %rbx
180 addq %rax, %r11
181 movq 40($inp), %rax
182 movq %rdx, %r12
183 adcq \$0, %r12
185 mulq %rbx
186 addq %rax, %r12
187 movq 48($inp), %rax
188 movq %rdx, %r13
189 adcq \$0, %r13
191 mulq %rbx
192 addq %rax, %r13
193 movq 56($inp), %rax
194 movq %rdx, %r14
195 adcq \$0, %r14
197 mulq %rbx
198 addq %rax, %r14
199 movq %rbx, %rax
200 movq %rdx, %r15
201 adcq \$0, %r15
203 addq %r8, %r8 #shlq \$1, %r8
204 movq %r9, %rcx
205 adcq %r9, %r9 #shld \$1, %r8, %r9
207 mulq %rax
208 movq %rax, (%rsp)
209 addq %rdx, %r8
210 adcq \$0, %r9
212 movq %r8, 8(%rsp)
213 shrq \$63, %rcx
215 #second iteration
216 movq 8($inp), %r8
217 movq 16($inp), %rax
218 mulq %r8
219 addq %rax, %r10
220 movq 24($inp), %rax
221 movq %rdx, %rbx
222 adcq \$0, %rbx
224 mulq %r8
225 addq %rax, %r11
226 movq 32($inp), %rax
227 adcq \$0, %rdx
228 addq %rbx, %r11
229 movq %rdx, %rbx
230 adcq \$0, %rbx
232 mulq %r8
233 addq %rax, %r12
234 movq 40($inp), %rax
235 adcq \$0, %rdx
236 addq %rbx, %r12
237 movq %rdx, %rbx
238 adcq \$0, %rbx
240 mulq %r8
241 addq %rax, %r13
242 movq 48($inp), %rax
243 adcq \$0, %rdx
244 addq %rbx, %r13
245 movq %rdx, %rbx
246 adcq \$0, %rbx
248 mulq %r8
249 addq %rax, %r14
250 movq 56($inp), %rax
251 adcq \$0, %rdx
252 addq %rbx, %r14
253 movq %rdx, %rbx
254 adcq \$0, %rbx
256 mulq %r8
257 addq %rax, %r15
258 movq %r8, %rax
259 adcq \$0, %rdx
260 addq %rbx, %r15
261 movq %rdx, %r8
262 movq %r10, %rdx
263 adcq \$0, %r8
265 add %rdx, %rdx
266 lea (%rcx,%r10,2), %r10 #shld \$1, %rcx, %r10
267 movq %r11, %rbx
268 adcq %r11, %r11 #shld \$1, %r10, %r11
270 mulq %rax
271 addq %rax, %r9
272 adcq %rdx, %r10
273 adcq \$0, %r11
275 movq %r9, 16(%rsp)
276 movq %r10, 24(%rsp)
277 shrq \$63, %rbx
279 #third iteration
280 movq 16($inp), %r9
281 movq 24($inp), %rax
282 mulq %r9
283 addq %rax, %r12
284 movq 32($inp), %rax
285 movq %rdx, %rcx
286 adcq \$0, %rcx
288 mulq %r9
289 addq %rax, %r13
290 movq 40($inp), %rax
291 adcq \$0, %rdx
292 addq %rcx, %r13
293 movq %rdx, %rcx
294 adcq \$0, %rcx
296 mulq %r9
297 addq %rax, %r14
298 movq 48($inp), %rax
299 adcq \$0, %rdx
300 addq %rcx, %r14
301 movq %rdx, %rcx
302 adcq \$0, %rcx
304 mulq %r9
305 movq %r12, %r10
306 lea (%rbx,%r12,2), %r12 #shld \$1, %rbx, %r12
307 addq %rax, %r15
308 movq 56($inp), %rax
309 adcq \$0, %rdx
310 addq %rcx, %r15
311 movq %rdx, %rcx
312 adcq \$0, %rcx
314 mulq %r9
315 shrq \$63, %r10
316 addq %rax, %r8
317 movq %r9, %rax
318 adcq \$0, %rdx
319 addq %rcx, %r8
320 movq %rdx, %r9
321 adcq \$0, %r9
323 movq %r13, %rcx
324 leaq (%r10,%r13,2), %r13 #shld \$1, %r12, %r13
326 mulq %rax
327 addq %rax, %r11
328 adcq %rdx, %r12
329 adcq \$0, %r13
331 movq %r11, 32(%rsp)
332 movq %r12, 40(%rsp)
333 shrq \$63, %rcx
335 #fourth iteration
336 movq 24($inp), %r10
337 movq 32($inp), %rax
338 mulq %r10
339 addq %rax, %r14
340 movq 40($inp), %rax
341 movq %rdx, %rbx
342 adcq \$0, %rbx
344 mulq %r10
345 addq %rax, %r15
346 movq 48($inp), %rax
347 adcq \$0, %rdx
348 addq %rbx, %r15
349 movq %rdx, %rbx
350 adcq \$0, %rbx
352 mulq %r10
353 movq %r14, %r12
354 leaq (%rcx,%r14,2), %r14 #shld \$1, %rcx, %r14
355 addq %rax, %r8
356 movq 56($inp), %rax
357 adcq \$0, %rdx
358 addq %rbx, %r8
359 movq %rdx, %rbx
360 adcq \$0, %rbx
362 mulq %r10
363 shrq \$63, %r12
364 addq %rax, %r9
365 movq %r10, %rax
366 adcq \$0, %rdx
367 addq %rbx, %r9
368 movq %rdx, %r10
369 adcq \$0, %r10
371 movq %r15, %rbx
372 leaq (%r12,%r15,2),%r15 #shld \$1, %r14, %r15
374 mulq %rax
375 addq %rax, %r13
376 adcq %rdx, %r14
377 adcq \$0, %r15
379 movq %r13, 48(%rsp)
380 movq %r14, 56(%rsp)
381 shrq \$63, %rbx
383 #fifth iteration
384 movq 32($inp), %r11
385 movq 40($inp), %rax
386 mulq %r11
387 addq %rax, %r8
388 movq 48($inp), %rax
389 movq %rdx, %rcx
390 adcq \$0, %rcx
392 mulq %r11
393 addq %rax, %r9
394 movq 56($inp), %rax
395 adcq \$0, %rdx
396 movq %r8, %r12
397 leaq (%rbx,%r8,2), %r8 #shld \$1, %rbx, %r8
398 addq %rcx, %r9
399 movq %rdx, %rcx
400 adcq \$0, %rcx
402 mulq %r11
403 shrq \$63, %r12
404 addq %rax, %r10
405 movq %r11, %rax
406 adcq \$0, %rdx
407 addq %rcx, %r10
408 movq %rdx, %r11
409 adcq \$0, %r11
411 movq %r9, %rcx
412 leaq (%r12,%r9,2), %r9 #shld \$1, %r8, %r9
414 mulq %rax
415 addq %rax, %r15
416 adcq %rdx, %r8
417 adcq \$0, %r9
419 movq %r15, 64(%rsp)
420 movq %r8, 72(%rsp)
421 shrq \$63, %rcx
423 #sixth iteration
424 movq 40($inp), %r12
425 movq 48($inp), %rax
426 mulq %r12
427 addq %rax, %r10
428 movq 56($inp), %rax
429 movq %rdx, %rbx
430 adcq \$0, %rbx
432 mulq %r12
433 addq %rax, %r11
434 movq %r12, %rax
435 movq %r10, %r15
436 leaq (%rcx,%r10,2), %r10 #shld \$1, %rcx, %r10
437 adcq \$0, %rdx
438 shrq \$63, %r15
439 addq %rbx, %r11
440 movq %rdx, %r12
441 adcq \$0, %r12
443 movq %r11, %rbx
444 leaq (%r15,%r11,2), %r11 #shld \$1, %r10, %r11
446 mulq %rax
447 addq %rax, %r9
448 adcq %rdx, %r10
449 adcq \$0, %r11
451 movq %r9, 80(%rsp)
452 movq %r10, 88(%rsp)
454 #seventh iteration
455 movq 48($inp), %r13
456 movq 56($inp), %rax
457 mulq %r13
458 addq %rax, %r12
459 movq %r13, %rax
460 movq %rdx, %r13
461 adcq \$0, %r13
463 xorq %r14, %r14
464 shlq \$1, %rbx
465 adcq %r12, %r12 #shld \$1, %rbx, %r12
466 adcq %r13, %r13 #shld \$1, %r12, %r13
467 adcq %r14, %r14 #shld \$1, %r13, %r14
469 mulq %rax
470 addq %rax, %r11
471 adcq %rdx, %r12
472 adcq \$0, %r13
474 movq %r11, 96(%rsp)
475 movq %r12, 104(%rsp)
477 #eighth iteration
478 movq 56($inp), %rax
479 mulq %rax
480 addq %rax, %r13
481 adcq \$0, %rdx
483 addq %rdx, %r14
485 movq %r13, 112(%rsp)
486 movq %r14, 120(%rsp)
488 movq (%rsp), %r8
489 movq 8(%rsp), %r9
490 movq 16(%rsp), %r10
491 movq 24(%rsp), %r11
492 movq 32(%rsp), %r12
493 movq 40(%rsp), %r13
494 movq 48(%rsp), %r14
495 movq 56(%rsp), %r15
497 call __rsaz_512_reduce
499 addq 64(%rsp), %r8
500 adcq 72(%rsp), %r9
501 adcq 80(%rsp), %r10
502 adcq 88(%rsp), %r11
503 adcq 96(%rsp), %r12
504 adcq 104(%rsp), %r13
505 adcq 112(%rsp), %r14
506 adcq 120(%rsp), %r15
507 sbbq %rcx, %rcx
509 call __rsaz_512_subtract
511 movq %r8, %rdx
512 movq %r9, %rax
513 movl 128+8(%rsp), $times
514 movq $out, $inp
516 decl $times
517 jnz .Loop_sqr
519 if ($addx) {
520 $code.=<<___;
521 jmp .Lsqr_tail
523 .align 32
524 .Loop_sqrx:
525 movl $times,128+8(%rsp)
526 movq $out, %xmm0 # off-load
527 movq %rbp, %xmm1 # off-load
528 #first iteration
529 mulx %rax, %r8, %r9
531 mulx 16($inp), %rcx, %r10
532 xor %rbp, %rbp # cf=0, of=0
534 mulx 24($inp), %rax, %r11
535 adcx %rcx, %r9
537 mulx 32($inp), %rcx, %r12
538 adcx %rax, %r10
540 mulx 40($inp), %rax, %r13
541 adcx %rcx, %r11
543 .byte 0xc4,0x62,0xf3,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($inp), %rcx, %r14
544 adcx %rax, %r12
545 adcx %rcx, %r13
547 .byte 0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00 # mulx 56($inp), %rax, %r15
548 adcx %rax, %r14
549 adcx %rbp, %r15 # %rbp is 0
551 mov %r9, %rcx
552 shld \$1, %r8, %r9
553 shl \$1, %r8
555 xor %ebp, %ebp
556 mulx %rdx, %rax, %rdx
557 adcx %rdx, %r8
558 mov 8($inp), %rdx
559 adcx %rbp, %r9
561 mov %rax, (%rsp)
562 mov %r8, 8(%rsp)
564 #second iteration
565 mulx 16($inp), %rax, %rbx
566 adox %rax, %r10
567 adcx %rbx, %r11
569 .byte 0xc4,0x62,0xc3,0xf6,0x86,0x18,0x00,0x00,0x00 # mulx 24($inp), $out, %r8
570 adox $out, %r11
571 adcx %r8, %r12
573 mulx 32($inp), %rax, %rbx
574 adox %rax, %r12
575 adcx %rbx, %r13
577 mulx 40($inp), $out, %r8
578 adox $out, %r13
579 adcx %r8, %r14
581 .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rbx
582 adox %rax, %r14
583 adcx %rbx, %r15
585 .byte 0xc4,0x62,0xc3,0xf6,0x86,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r8
586 adox $out, %r15
587 adcx %rbp, %r8
588 adox %rbp, %r8
590 mov %r11, %rbx
591 shld \$1, %r10, %r11
592 shld \$1, %rcx, %r10
594 xor %ebp,%ebp
595 mulx %rdx, %rax, %rcx
596 mov 16($inp), %rdx
597 adcx %rax, %r9
598 adcx %rcx, %r10
599 adcx %rbp, %r11
601 mov %r9, 16(%rsp)
602 .byte 0x4c,0x89,0x94,0x24,0x18,0x00,0x00,0x00 # mov %r10, 24(%rsp)
604 #third iteration
605 .byte 0xc4,0x62,0xc3,0xf6,0x8e,0x18,0x00,0x00,0x00 # mulx 24($inp), $out, %r9
606 adox $out, %r12
607 adcx %r9, %r13
609 mulx 32($inp), %rax, %rcx
610 adox %rax, %r13
611 adcx %rcx, %r14
613 mulx 40($inp), $out, %r9
614 adox $out, %r14
615 adcx %r9, %r15
617 .byte 0xc4,0xe2,0xfb,0xf6,0x8e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rcx
618 adox %rax, %r15
619 adcx %rcx, %r8
621 .byte 0xc4,0x62,0xc3,0xf6,0x8e,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r9
622 adox $out, %r8
623 adcx %rbp, %r9
624 adox %rbp, %r9
626 mov %r13, %rcx
627 shld \$1, %r12, %r13
628 shld \$1, %rbx, %r12
630 xor %ebp, %ebp
631 mulx %rdx, %rax, %rdx
632 adcx %rax, %r11
633 adcx %rdx, %r12
634 mov 24($inp), %rdx
635 adcx %rbp, %r13
637 mov %r11, 32(%rsp)
638 .byte 0x4c,0x89,0xa4,0x24,0x28,0x00,0x00,0x00 # mov %r12, 40(%rsp)
640 #fourth iteration
641 .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x20,0x00,0x00,0x00 # mulx 32($inp), %rax, %rbx
642 adox %rax, %r14
643 adcx %rbx, %r15
645 mulx 40($inp), $out, %r10
646 adox $out, %r15
647 adcx %r10, %r8
649 mulx 48($inp), %rax, %rbx
650 adox %rax, %r8
651 adcx %rbx, %r9
653 mulx 56($inp), $out, %r10
654 adox $out, %r9
655 adcx %rbp, %r10
656 adox %rbp, %r10
658 .byte 0x66
659 mov %r15, %rbx
660 shld \$1, %r14, %r15
661 shld \$1, %rcx, %r14
663 xor %ebp, %ebp
664 mulx %rdx, %rax, %rdx
665 adcx %rax, %r13
666 adcx %rdx, %r14
667 mov 32($inp), %rdx
668 adcx %rbp, %r15
670 mov %r13, 48(%rsp)
671 mov %r14, 56(%rsp)
673 #fifth iteration
674 .byte 0xc4,0x62,0xc3,0xf6,0x9e,0x28,0x00,0x00,0x00 # mulx 40($inp), $out, %r11
675 adox $out, %r8
676 adcx %r11, %r9
678 mulx 48($inp), %rax, %rcx
679 adox %rax, %r9
680 adcx %rcx, %r10
682 mulx 56($inp), $out, %r11
683 adox $out, %r10
684 adcx %rbp, %r11
685 adox %rbp, %r11
687 mov %r9, %rcx
688 shld \$1, %r8, %r9
689 shld \$1, %rbx, %r8
691 xor %ebp, %ebp
692 mulx %rdx, %rax, %rdx
693 adcx %rax, %r15
694 adcx %rdx, %r8
695 mov 40($inp), %rdx
696 adcx %rbp, %r9
698 mov %r15, 64(%rsp)
699 mov %r8, 72(%rsp)
701 #sixth iteration
702 .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rbx
703 adox %rax, %r10
704 adcx %rbx, %r11
706 .byte 0xc4,0x62,0xc3,0xf6,0xa6,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r12
707 adox $out, %r11
708 adcx %rbp, %r12
709 adox %rbp, %r12
711 mov %r11, %rbx
712 shld \$1, %r10, %r11
713 shld \$1, %rcx, %r10
715 xor %ebp, %ebp
716 mulx %rdx, %rax, %rdx
717 adcx %rax, %r9
718 adcx %rdx, %r10
719 mov 48($inp), %rdx
720 adcx %rbp, %r11
722 mov %r9, 80(%rsp)
723 mov %r10, 88(%rsp)
725 #seventh iteration
726 .byte 0xc4,0x62,0xfb,0xf6,0xae,0x38,0x00,0x00,0x00 # mulx 56($inp), %rax, %r13
727 adox %rax, %r12
728 adox %rbp, %r13
730 xor %r14, %r14
731 shld \$1, %r13, %r14
732 shld \$1, %r12, %r13
733 shld \$1, %rbx, %r12
735 xor %ebp, %ebp
736 mulx %rdx, %rax, %rdx
737 adcx %rax, %r11
738 adcx %rdx, %r12
739 mov 56($inp), %rdx
740 adcx %rbp, %r13
742 .byte 0x4c,0x89,0x9c,0x24,0x60,0x00,0x00,0x00 # mov %r11, 96(%rsp)
743 .byte 0x4c,0x89,0xa4,0x24,0x68,0x00,0x00,0x00 # mov %r12, 104(%rsp)
745 #eighth iteration
746 mulx %rdx, %rax, %rdx
747 adox %rax, %r13
748 adox %rbp, %rdx
750 .byte 0x66
751 add %rdx, %r14
753 movq %r13, 112(%rsp)
754 movq %r14, 120(%rsp)
755 movq %xmm0, $out
756 movq %xmm1, %rbp
758 movq 128(%rsp), %rdx # pull $n0
759 movq (%rsp), %r8
760 movq 8(%rsp), %r9
761 movq 16(%rsp), %r10
762 movq 24(%rsp), %r11
763 movq 32(%rsp), %r12
764 movq 40(%rsp), %r13
765 movq 48(%rsp), %r14
766 movq 56(%rsp), %r15
768 call __rsaz_512_reducex
770 addq 64(%rsp), %r8
771 adcq 72(%rsp), %r9
772 adcq 80(%rsp), %r10
773 adcq 88(%rsp), %r11
774 adcq 96(%rsp), %r12
775 adcq 104(%rsp), %r13
776 adcq 112(%rsp), %r14
777 adcq 120(%rsp), %r15
778 sbbq %rcx, %rcx
780 call __rsaz_512_subtract
782 movq %r8, %rdx
783 movq %r9, %rax
784 movl 128+8(%rsp), $times
785 movq $out, $inp
787 decl $times
788 jnz .Loop_sqrx
790 .Lsqr_tail:
793 $code.=<<___;
795 leaq 128+24+48(%rsp), %rax
796 movq -48(%rax), %r15
797 movq -40(%rax), %r14
798 movq -32(%rax), %r13
799 movq -24(%rax), %r12
800 movq -16(%rax), %rbp
801 movq -8(%rax), %rbx
802 leaq (%rax), %rsp
803 .Lsqr_epilogue:
805 .size rsaz_512_sqr,.-rsaz_512_sqr
809 my ($out,$ap,$bp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8");
810 $code.=<<___;
811 .globl rsaz_512_mul
812 .type rsaz_512_mul,\@function,5
813 .align 32
814 rsaz_512_mul:
815 push %rbx
816 push %rbp
817 push %r12
818 push %r13
819 push %r14
820 push %r15
822 subq \$128+24, %rsp
823 .Lmul_body:
824 movq $out, %xmm0 # off-load arguments
825 movq $mod, %xmm1
826 movq $n0, 128(%rsp)
828 $code.=<<___ if ($addx);
829 movl \$0x80100,%r11d
830 andl OPENSSL_ia32cap_P+8(%rip),%r11d
831 cmpl \$0x80100,%r11d # check for MULX and ADO/CX
832 je .Lmulx
834 $code.=<<___;
835 movq ($bp), %rbx # pass b[0]
836 movq $bp, %rbp # pass argument
837 call __rsaz_512_mul
839 movq %xmm0, $out
840 movq %xmm1, %rbp
842 movq (%rsp), %r8
843 movq 8(%rsp), %r9
844 movq 16(%rsp), %r10
845 movq 24(%rsp), %r11
846 movq 32(%rsp), %r12
847 movq 40(%rsp), %r13
848 movq 48(%rsp), %r14
849 movq 56(%rsp), %r15
851 call __rsaz_512_reduce
853 $code.=<<___ if ($addx);
854 jmp .Lmul_tail
856 .align 32
857 .Lmulx:
858 movq $bp, %rbp # pass argument
859 movq ($bp), %rdx # pass b[0]
860 call __rsaz_512_mulx
862 movq %xmm0, $out
863 movq %xmm1, %rbp
865 movq 128(%rsp), %rdx # pull $n0
866 movq (%rsp), %r8
867 movq 8(%rsp), %r9
868 movq 16(%rsp), %r10
869 movq 24(%rsp), %r11
870 movq 32(%rsp), %r12
871 movq 40(%rsp), %r13
872 movq 48(%rsp), %r14
873 movq 56(%rsp), %r15
875 call __rsaz_512_reducex
876 .Lmul_tail:
878 $code.=<<___;
879 addq 64(%rsp), %r8
880 adcq 72(%rsp), %r9
881 adcq 80(%rsp), %r10
882 adcq 88(%rsp), %r11
883 adcq 96(%rsp), %r12
884 adcq 104(%rsp), %r13
885 adcq 112(%rsp), %r14
886 adcq 120(%rsp), %r15
887 sbbq %rcx, %rcx
889 call __rsaz_512_subtract
891 leaq 128+24+48(%rsp), %rax
892 movq -48(%rax), %r15
893 movq -40(%rax), %r14
894 movq -32(%rax), %r13
895 movq -24(%rax), %r12
896 movq -16(%rax), %rbp
897 movq -8(%rax), %rbx
898 leaq (%rax), %rsp
899 .Lmul_epilogue:
901 .size rsaz_512_mul,.-rsaz_512_mul
905 my ($out,$ap,$bp,$mod,$n0,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
906 $code.=<<___;
907 .globl rsaz_512_mul_gather4
908 .type rsaz_512_mul_gather4,\@function,6
909 .align 32
910 rsaz_512_mul_gather4:
911 push %rbx
912 push %rbp
913 push %r12
914 push %r13
915 push %r14
916 push %r15
918 subq \$`128+24+($win64?0xb0:0)`, %rsp
920 $code.=<<___ if ($win64);
921 movaps %xmm6,0xa0(%rsp)
922 movaps %xmm7,0xb0(%rsp)
923 movaps %xmm8,0xc0(%rsp)
924 movaps %xmm9,0xd0(%rsp)
925 movaps %xmm10,0xe0(%rsp)
926 movaps %xmm11,0xf0(%rsp)
927 movaps %xmm12,0x100(%rsp)
928 movaps %xmm13,0x110(%rsp)
929 movaps %xmm14,0x120(%rsp)
930 movaps %xmm15,0x130(%rsp)
932 $code.=<<___;
933 .Lmul_gather4_body:
934 movd $pwr,%xmm8
935 movdqa .Linc+16(%rip),%xmm1 # 00000002000000020000000200000002
936 movdqa .Linc(%rip),%xmm0 # 00000001000000010000000000000000
938 pshufd \$0,%xmm8,%xmm8 # broadcast $power
939 movdqa %xmm1,%xmm7
940 movdqa %xmm1,%xmm2
942 ########################################################################
943 # calculate mask by comparing 0..15 to $power
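# The point of the masks is that the gather must not leak $power through a
# data-dependent memory access pattern: instead of indexing the table
# directly, the code reads all 16 candidate entries for each limb and keeps
# only the wanted one. A rough scalar equivalent of what the xmm0-xmm7
# masks implement (illustrative sketch only, not the generated code):
#
#	uint64_t limb = 0;
#	for (k = 0; k < 16; k++) {
#		uint64_t mask = 0 - (uint64_t)(k == power);	/* pcmpeqd  */
#		limb |= table[k] & mask;			/* pand/por */
#	}
#
# rsaz_512_scatter4 stores limb j of power k at offset k*8 + j*128, so the
# same 128-byte row is swept here for every value of $power.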
945 for($i=0;$i<4;$i++) {
946 $code.=<<___;
947 paddd %xmm`$i`,%xmm`$i+1`
948 pcmpeqd %xmm8,%xmm`$i`
949 movdqa %xmm7,%xmm`$i+3`
952 for(;$i<7;$i++) {
953 $code.=<<___;
954 paddd %xmm`$i`,%xmm`$i+1`
955 pcmpeqd %xmm8,%xmm`$i`
958 $code.=<<___;
959 pcmpeqd %xmm8,%xmm7
961 movdqa 16*0($bp),%xmm8
962 movdqa 16*1($bp),%xmm9
963 movdqa 16*2($bp),%xmm10
964 movdqa 16*3($bp),%xmm11
965 pand %xmm0,%xmm8
966 movdqa 16*4($bp),%xmm12
967 pand %xmm1,%xmm9
968 movdqa 16*5($bp),%xmm13
969 pand %xmm2,%xmm10
970 movdqa 16*6($bp),%xmm14
971 pand %xmm3,%xmm11
972 movdqa 16*7($bp),%xmm15
973 leaq 128($bp), %rbp
974 pand %xmm4,%xmm12
975 pand %xmm5,%xmm13
976 pand %xmm6,%xmm14
977 pand %xmm7,%xmm15
978 por %xmm10,%xmm8
979 por %xmm11,%xmm9
980 por %xmm12,%xmm8
981 por %xmm13,%xmm9
982 por %xmm14,%xmm8
983 por %xmm15,%xmm9
985 por %xmm9,%xmm8
986 pshufd \$0x4e,%xmm8,%xmm9
987 por %xmm9,%xmm8
989 $code.=<<___ if ($addx);
990 movl \$0x80100,%r11d
991 andl OPENSSL_ia32cap_P+8(%rip),%r11d
992 cmpl \$0x80100,%r11d # check for MULX and ADO/CX
993 je .Lmulx_gather
995 $code.=<<___;
996 movq %xmm8,%rbx
998 movq $n0, 128(%rsp) # off-load arguments
999 movq $out, 128+8(%rsp)
1000 movq $mod, 128+16(%rsp)
1002 movq ($ap), %rax
1003 movq 8($ap), %rcx
1004 mulq %rbx # 0 iteration
1005 movq %rax, (%rsp)
1006 movq %rcx, %rax
1007 movq %rdx, %r8
1009 mulq %rbx
1010 addq %rax, %r8
1011 movq 16($ap), %rax
1012 movq %rdx, %r9
1013 adcq \$0, %r9
1015 mulq %rbx
1016 addq %rax, %r9
1017 movq 24($ap), %rax
1018 movq %rdx, %r10
1019 adcq \$0, %r10
1021 mulq %rbx
1022 addq %rax, %r10
1023 movq 32($ap), %rax
1024 movq %rdx, %r11
1025 adcq \$0, %r11
1027 mulq %rbx
1028 addq %rax, %r11
1029 movq 40($ap), %rax
1030 movq %rdx, %r12
1031 adcq \$0, %r12
1033 mulq %rbx
1034 addq %rax, %r12
1035 movq 48($ap), %rax
1036 movq %rdx, %r13
1037 adcq \$0, %r13
1039 mulq %rbx
1040 addq %rax, %r13
1041 movq 56($ap), %rax
1042 movq %rdx, %r14
1043 adcq \$0, %r14
1045 mulq %rbx
1046 addq %rax, %r14
1047 movq ($ap), %rax
1048 movq %rdx, %r15
1049 adcq \$0, %r15
1051 leaq 8(%rsp), %rdi
1052 movl \$7, %ecx
1053 jmp .Loop_mul_gather
1055 .align 32
1056 .Loop_mul_gather:
1057 movdqa 16*0(%rbp),%xmm8
1058 movdqa 16*1(%rbp),%xmm9
1059 movdqa 16*2(%rbp),%xmm10
1060 movdqa 16*3(%rbp),%xmm11
1061 pand %xmm0,%xmm8
1062 movdqa 16*4(%rbp),%xmm12
1063 pand %xmm1,%xmm9
1064 movdqa 16*5(%rbp),%xmm13
1065 pand %xmm2,%xmm10
1066 movdqa 16*6(%rbp),%xmm14
1067 pand %xmm3,%xmm11
1068 movdqa 16*7(%rbp),%xmm15
1069 leaq 128(%rbp), %rbp
1070 pand %xmm4,%xmm12
1071 pand %xmm5,%xmm13
1072 pand %xmm6,%xmm14
1073 pand %xmm7,%xmm15
1074 por %xmm10,%xmm8
1075 por %xmm11,%xmm9
1076 por %xmm12,%xmm8
1077 por %xmm13,%xmm9
1078 por %xmm14,%xmm8
1079 por %xmm15,%xmm9
1081 por %xmm9,%xmm8
1082 pshufd \$0x4e,%xmm8,%xmm9
1083 por %xmm9,%xmm8
1084 movq %xmm8,%rbx
1086 mulq %rbx
1087 addq %rax, %r8
1088 movq 8($ap), %rax
1089 movq %r8, (%rdi)
1090 movq %rdx, %r8
1091 adcq \$0, %r8
1093 mulq %rbx
1094 addq %rax, %r9
1095 movq 16($ap), %rax
1096 adcq \$0, %rdx
1097 addq %r9, %r8
1098 movq %rdx, %r9
1099 adcq \$0, %r9
1101 mulq %rbx
1102 addq %rax, %r10
1103 movq 24($ap), %rax
1104 adcq \$0, %rdx
1105 addq %r10, %r9
1106 movq %rdx, %r10
1107 adcq \$0, %r10
1109 mulq %rbx
1110 addq %rax, %r11
1111 movq 32($ap), %rax
1112 adcq \$0, %rdx
1113 addq %r11, %r10
1114 movq %rdx, %r11
1115 adcq \$0, %r11
1117 mulq %rbx
1118 addq %rax, %r12
1119 movq 40($ap), %rax
1120 adcq \$0, %rdx
1121 addq %r12, %r11
1122 movq %rdx, %r12
1123 adcq \$0, %r12
1125 mulq %rbx
1126 addq %rax, %r13
1127 movq 48($ap), %rax
1128 adcq \$0, %rdx
1129 addq %r13, %r12
1130 movq %rdx, %r13
1131 adcq \$0, %r13
1133 mulq %rbx
1134 addq %rax, %r14
1135 movq 56($ap), %rax
1136 adcq \$0, %rdx
1137 addq %r14, %r13
1138 movq %rdx, %r14
1139 adcq \$0, %r14
1141 mulq %rbx
1142 addq %rax, %r15
1143 movq ($ap), %rax
1144 adcq \$0, %rdx
1145 addq %r15, %r14
1146 movq %rdx, %r15
1147 adcq \$0, %r15
1149 leaq 8(%rdi), %rdi
1151 decl %ecx
1152 jnz .Loop_mul_gather
1154 movq %r8, (%rdi)
1155 movq %r9, 8(%rdi)
1156 movq %r10, 16(%rdi)
1157 movq %r11, 24(%rdi)
1158 movq %r12, 32(%rdi)
1159 movq %r13, 40(%rdi)
1160 movq %r14, 48(%rdi)
1161 movq %r15, 56(%rdi)
1163 movq 128+8(%rsp), $out
1164 movq 128+16(%rsp), %rbp
1166 movq (%rsp), %r8
1167 movq 8(%rsp), %r9
1168 movq 16(%rsp), %r10
1169 movq 24(%rsp), %r11
1170 movq 32(%rsp), %r12
1171 movq 40(%rsp), %r13
1172 movq 48(%rsp), %r14
1173 movq 56(%rsp), %r15
1175 call __rsaz_512_reduce
1177 $code.=<<___ if ($addx);
1178 jmp .Lmul_gather_tail
1180 .align 32
1181 .Lmulx_gather:
1182 movq %xmm8,%rdx
1184 mov $n0, 128(%rsp) # off-load arguments
1185 mov $out, 128+8(%rsp)
1186 mov $mod, 128+16(%rsp)
1188 mulx ($ap), %rbx, %r8 # 0 iteration
1189 mov %rbx, (%rsp)
1190 xor %edi, %edi # cf=0, of=0
1192 mulx 8($ap), %rax, %r9
1194 mulx 16($ap), %rbx, %r10
1195 adcx %rax, %r8
1197 mulx 24($ap), %rax, %r11
1198 adcx %rbx, %r9
1200 mulx 32($ap), %rbx, %r12
1201 adcx %rax, %r10
1203 mulx 40($ap), %rax, %r13
1204 adcx %rbx, %r11
1206 mulx 48($ap), %rbx, %r14
1207 adcx %rax, %r12
1209 mulx 56($ap), %rax, %r15
1210 adcx %rbx, %r13
1211 adcx %rax, %r14
1212 .byte 0x67
1213 mov %r8, %rbx
1214 adcx %rdi, %r15 # %rdi is 0
1216 mov \$-7, %rcx
1217 jmp .Loop_mulx_gather
1219 .align 32
1220 .Loop_mulx_gather:
1221 movdqa 16*0(%rbp),%xmm8
1222 movdqa 16*1(%rbp),%xmm9
1223 movdqa 16*2(%rbp),%xmm10
1224 movdqa 16*3(%rbp),%xmm11
1225 pand %xmm0,%xmm8
1226 movdqa 16*4(%rbp),%xmm12
1227 pand %xmm1,%xmm9
1228 movdqa 16*5(%rbp),%xmm13
1229 pand %xmm2,%xmm10
1230 movdqa 16*6(%rbp),%xmm14
1231 pand %xmm3,%xmm11
1232 movdqa 16*7(%rbp),%xmm15
1233 leaq 128(%rbp), %rbp
1234 pand %xmm4,%xmm12
1235 pand %xmm5,%xmm13
1236 pand %xmm6,%xmm14
1237 pand %xmm7,%xmm15
1238 por %xmm10,%xmm8
1239 por %xmm11,%xmm9
1240 por %xmm12,%xmm8
1241 por %xmm13,%xmm9
1242 por %xmm14,%xmm8
1243 por %xmm15,%xmm9
1245 por %xmm9,%xmm8
1246 pshufd \$0x4e,%xmm8,%xmm9
1247 por %xmm9,%xmm8
1248 movq %xmm8,%rdx
1250 .byte 0xc4,0x62,0xfb,0xf6,0x86,0x00,0x00,0x00,0x00 # mulx ($ap), %rax, %r8
1251 adcx %rax, %rbx
1252 adox %r9, %r8
1254 mulx 8($ap), %rax, %r9
1255 adcx %rax, %r8
1256 adox %r10, %r9
1258 mulx 16($ap), %rax, %r10
1259 adcx %rax, %r9
1260 adox %r11, %r10
1262 .byte 0xc4,0x62,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00 # mulx 24($ap), %rax, %r11
1263 adcx %rax, %r10
1264 adox %r12, %r11
1266 mulx 32($ap), %rax, %r12
1267 adcx %rax, %r11
1268 adox %r13, %r12
1270 mulx 40($ap), %rax, %r13
1271 adcx %rax, %r12
1272 adox %r14, %r13
1274 .byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($ap), %rax, %r14
1275 adcx %rax, %r13
1276 .byte 0x67
1277 adox %r15, %r14
1279 mulx 56($ap), %rax, %r15
1280 mov %rbx, 64(%rsp,%rcx,8)
1281 adcx %rax, %r14
1282 adox %rdi, %r15
1283 mov %r8, %rbx
1284 adcx %rdi, %r15 # cf=0
1286 inc %rcx # of=0
1287 jnz .Loop_mulx_gather
1289 mov %r8, 64(%rsp)
1290 mov %r9, 64+8(%rsp)
1291 mov %r10, 64+16(%rsp)
1292 mov %r11, 64+24(%rsp)
1293 mov %r12, 64+32(%rsp)
1294 mov %r13, 64+40(%rsp)
1295 mov %r14, 64+48(%rsp)
1296 mov %r15, 64+56(%rsp)
1298 mov 128(%rsp), %rdx # pull arguments
1299 mov 128+8(%rsp), $out
1300 mov 128+16(%rsp), %rbp
1302 mov (%rsp), %r8
1303 mov 8(%rsp), %r9
1304 mov 16(%rsp), %r10
1305 mov 24(%rsp), %r11
1306 mov 32(%rsp), %r12
1307 mov 40(%rsp), %r13
1308 mov 48(%rsp), %r14
1309 mov 56(%rsp), %r15
1311 call __rsaz_512_reducex
1313 .Lmul_gather_tail:
1315 $code.=<<___;
1316 addq 64(%rsp), %r8
1317 adcq 72(%rsp), %r9
1318 adcq 80(%rsp), %r10
1319 adcq 88(%rsp), %r11
1320 adcq 96(%rsp), %r12
1321 adcq 104(%rsp), %r13
1322 adcq 112(%rsp), %r14
1323 adcq 120(%rsp), %r15
1324 sbbq %rcx, %rcx
1326 call __rsaz_512_subtract
1328 leaq 128+24+48(%rsp), %rax
1330 $code.=<<___ if ($win64);
1331 movaps 0xa0-0xc8(%rax),%xmm6
1332 movaps 0xb0-0xc8(%rax),%xmm7
1333 movaps 0xc0-0xc8(%rax),%xmm8
1334 movaps 0xd0-0xc8(%rax),%xmm9
1335 movaps 0xe0-0xc8(%rax),%xmm10
1336 movaps 0xf0-0xc8(%rax),%xmm11
1337 movaps 0x100-0xc8(%rax),%xmm12
1338 movaps 0x110-0xc8(%rax),%xmm13
1339 movaps 0x120-0xc8(%rax),%xmm14
1340 movaps 0x130-0xc8(%rax),%xmm15
1341 lea 0xb0(%rax),%rax
1343 $code.=<<___;
1344 movq -48(%rax), %r15
1345 movq -40(%rax), %r14
1346 movq -32(%rax), %r13
1347 movq -24(%rax), %r12
1348 movq -16(%rax), %rbp
1349 movq -8(%rax), %rbx
1350 leaq (%rax), %rsp
1351 .Lmul_gather4_epilogue:
1353 .size rsaz_512_mul_gather4,.-rsaz_512_mul_gather4
1357 my ($out,$ap,$mod,$n0,$tbl,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
1358 $code.=<<___;
1359 .globl rsaz_512_mul_scatter4
1360 .type rsaz_512_mul_scatter4,\@function,6
1361 .align 32
1362 rsaz_512_mul_scatter4:
1363 push %rbx
1364 push %rbp
1365 push %r12
1366 push %r13
1367 push %r14
1368 push %r15
1370 mov $pwr, $pwr # zero-extend $pwr (32-bit argument used for 64-bit addressing below)
1371 subq \$128+24, %rsp
1372 .Lmul_scatter4_body:
1373 leaq ($tbl,$pwr,8), $tbl
1374 movq $out, %xmm0 # off-load arguments
1375 movq $mod, %xmm1
1376 movq $tbl, %xmm2
1377 movq $n0, 128(%rsp)
1379 movq $out, %rbp
1381 $code.=<<___ if ($addx);
1382 movl \$0x80100,%r11d
1383 andl OPENSSL_ia32cap_P+8(%rip),%r11d
1384 cmpl \$0x80100,%r11d # check for MULX and ADO/CX
1385 je .Lmulx_scatter
1387 $code.=<<___;
1388 movq ($out),%rbx # pass b[0]
1389 call __rsaz_512_mul
1391 movq %xmm0, $out
1392 movq %xmm1, %rbp
1394 movq (%rsp), %r8
1395 movq 8(%rsp), %r9
1396 movq 16(%rsp), %r10
1397 movq 24(%rsp), %r11
1398 movq 32(%rsp), %r12
1399 movq 40(%rsp), %r13
1400 movq 48(%rsp), %r14
1401 movq 56(%rsp), %r15
1403 call __rsaz_512_reduce
1405 $code.=<<___ if ($addx);
1406 jmp .Lmul_scatter_tail
1408 .align 32
1409 .Lmulx_scatter:
1410 movq ($out), %rdx # pass b[0]
1411 call __rsaz_512_mulx
1413 movq %xmm0, $out
1414 movq %xmm1, %rbp
1416 movq 128(%rsp), %rdx # pull $n0
1417 movq (%rsp), %r8
1418 movq 8(%rsp), %r9
1419 movq 16(%rsp), %r10
1420 movq 24(%rsp), %r11
1421 movq 32(%rsp), %r12
1422 movq 40(%rsp), %r13
1423 movq 48(%rsp), %r14
1424 movq 56(%rsp), %r15
1426 call __rsaz_512_reducex
1428 .Lmul_scatter_tail:
1430 $code.=<<___;
1431 addq 64(%rsp), %r8
1432 adcq 72(%rsp), %r9
1433 adcq 80(%rsp), %r10
1434 adcq 88(%rsp), %r11
1435 adcq 96(%rsp), %r12
1436 adcq 104(%rsp), %r13
1437 adcq 112(%rsp), %r14
1438 adcq 120(%rsp), %r15
1439 movq %xmm2, $inp
1440 sbbq %rcx, %rcx
1442 call __rsaz_512_subtract
1444 movq %r8, 128*0($inp) # scatter
1445 movq %r9, 128*1($inp)
1446 movq %r10, 128*2($inp)
1447 movq %r11, 128*3($inp)
1448 movq %r12, 128*4($inp)
1449 movq %r13, 128*5($inp)
1450 movq %r14, 128*6($inp)
1451 movq %r15, 128*7($inp)
1453 leaq 128+24+48(%rsp), %rax
1454 movq -48(%rax), %r15
1455 movq -40(%rax), %r14
1456 movq -32(%rax), %r13
1457 movq -24(%rax), %r12
1458 movq -16(%rax), %rbp
1459 movq -8(%rax), %rbx
1460 leaq (%rax), %rsp
1461 .Lmul_scatter4_epilogue:
1463 .size rsaz_512_mul_scatter4,.-rsaz_512_mul_scatter4
1467 my ($out,$inp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx");
1468 $code.=<<___;
1469 .globl rsaz_512_mul_by_one
1470 .type rsaz_512_mul_by_one,\@function,4
1471 .align 32
1472 rsaz_512_mul_by_one:
1473 push %rbx
1474 push %rbp
1475 push %r12
1476 push %r13
1477 push %r14
1478 push %r15
1480 subq \$128+24, %rsp
1481 .Lmul_by_one_body:
1483 $code.=<<___ if ($addx);
1484 movl OPENSSL_ia32cap_P+8(%rip),%eax
1486 $code.=<<___;
1487 movq $mod, %rbp # reassign argument
1488 movq $n0, 128(%rsp)
1490 movq ($inp), %r8
1491 pxor %xmm0, %xmm0
1492 movq 8($inp), %r9
1493 movq 16($inp), %r10
1494 movq 24($inp), %r11
1495 movq 32($inp), %r12
1496 movq 40($inp), %r13
1497 movq 48($inp), %r14
1498 movq 56($inp), %r15
1500 movdqa %xmm0, (%rsp)
1501 movdqa %xmm0, 16(%rsp)
1502 movdqa %xmm0, 32(%rsp)
1503 movdqa %xmm0, 48(%rsp)
1504 movdqa %xmm0, 64(%rsp)
1505 movdqa %xmm0, 80(%rsp)
1506 movdqa %xmm0, 96(%rsp)
1508 $code.=<<___ if ($addx);
1509 andl \$0x80100,%eax
1510 cmpl \$0x80100,%eax # check for MULX and ADO/CX
1511 je .Lby_one_callx
1513 $code.=<<___;
1514 call __rsaz_512_reduce
1516 $code.=<<___ if ($addx);
1517 jmp .Lby_one_tail
1518 .align 32
1519 .Lby_one_callx:
1520 movq 128(%rsp), %rdx # pull $n0
1521 call __rsaz_512_reducex
1522 .Lby_one_tail:
1524 $code.=<<___;
1525 movq %r8, ($out)
1526 movq %r9, 8($out)
1527 movq %r10, 16($out)
1528 movq %r11, 24($out)
1529 movq %r12, 32($out)
1530 movq %r13, 40($out)
1531 movq %r14, 48($out)
1532 movq %r15, 56($out)
1534 leaq 128+24+48(%rsp), %rax
1535 movq -48(%rax), %r15
1536 movq -40(%rax), %r14
1537 movq -32(%rax), %r13
1538 movq -24(%rax), %r12
1539 movq -16(%rax), %rbp
1540 movq -8(%rax), %rbx
1541 leaq (%rax), %rsp
1542 .Lmul_by_one_epilogue:
1544 .size rsaz_512_mul_by_one,.-rsaz_512_mul_by_one
1547 { # __rsaz_512_reduce
1549 # input: %r8-%r15, %rbp - mod, 128(%rsp) - n0
1550 # output: %r8-%r15
1551 # clobbers: everything except %rbp and %rdi
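#
# This is word-serial Montgomery reduction of the low half of the 1024-bit
# product (the upper half stays at the caller's 64(%rsp)..120(%rsp)).
# Conceptually each of the 8 rounds below does, with n0 = -mod^-1 mod 2^64:
#
#	m = t[0] * n0 mod 2^64		# multiplier that zeroes t[0]
#	t = (t + m * mod) / 2^64	# exact division, i.e. drop one limb
#
# Afterwards the caller adds the product's upper 512 bits to %r8-%r15 and
# lets __rsaz_512_subtract apply the final masked subtraction of mod.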
1552 $code.=<<___;
1553 .type __rsaz_512_reduce,\@abi-omnipotent
1554 .align 32
1555 __rsaz_512_reduce:
1556 movq %r8, %rbx
1557 imulq 128+8(%rsp), %rbx
1558 movq 0(%rbp), %rax
1559 movl \$8, %ecx
1560 jmp .Lreduction_loop
1562 .align 32
1563 .Lreduction_loop:
1564 mulq %rbx
1565 movq 8(%rbp), %rax
1566 negq %r8
1567 movq %rdx, %r8
1568 adcq \$0, %r8
1570 mulq %rbx
1571 addq %rax, %r9
1572 movq 16(%rbp), %rax
1573 adcq \$0, %rdx
1574 addq %r9, %r8
1575 movq %rdx, %r9
1576 adcq \$0, %r9
1578 mulq %rbx
1579 addq %rax, %r10
1580 movq 24(%rbp), %rax
1581 adcq \$0, %rdx
1582 addq %r10, %r9
1583 movq %rdx, %r10
1584 adcq \$0, %r10
1586 mulq %rbx
1587 addq %rax, %r11
1588 movq 32(%rbp), %rax
1589 adcq \$0, %rdx
1590 addq %r11, %r10
1591 movq 128+8(%rsp), %rsi
1592 #movq %rdx, %r11
1593 #adcq \$0, %r11
1594 adcq \$0, %rdx
1595 movq %rdx, %r11
1597 mulq %rbx
1598 addq %rax, %r12
1599 movq 40(%rbp), %rax
1600 adcq \$0, %rdx
1601 imulq %r8, %rsi
1602 addq %r12, %r11
1603 movq %rdx, %r12
1604 adcq \$0, %r12
1606 mulq %rbx
1607 addq %rax, %r13
1608 movq 48(%rbp), %rax
1609 adcq \$0, %rdx
1610 addq %r13, %r12
1611 movq %rdx, %r13
1612 adcq \$0, %r13
1614 mulq %rbx
1615 addq %rax, %r14
1616 movq 56(%rbp), %rax
1617 adcq \$0, %rdx
1618 addq %r14, %r13
1619 movq %rdx, %r14
1620 adcq \$0, %r14
1622 mulq %rbx
1623 movq %rsi, %rbx
1624 addq %rax, %r15
1625 movq 0(%rbp), %rax
1626 adcq \$0, %rdx
1627 addq %r15, %r14
1628 movq %rdx, %r15
1629 adcq \$0, %r15
1631 decl %ecx
1632 jne .Lreduction_loop
1635 .size __rsaz_512_reduce,.-__rsaz_512_reduce
1638 if ($addx) {
1639 # __rsaz_512_reducex
1641 # input: %r8-%r15, %rbp - mod, 128(%rsp) - n0
1642 # output: %r8-%r15
1643 # clobbers: everything except %rbp and %rdi
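#
# Same word-serial Montgomery reduction as __rsaz_512_reduce, but built on
# MULX/ADCX/ADOX. MULX performs a 64x64->128 multiply without touching the
# flags, while ADCX and ADOX carry through CF and OF respectively, so the
# two accumulations per multiply run as independent carry chains. The
# pattern, taken from the loop below:
#
#	mulx	0(%rbp), %rax, %r8	# {%r8:%rax} = %rdx * mod[0], flags intact
#	adcx	%rbx, %rax		# carry chain #1 (CF)
#	adox	%r9,  %r8		# carry chain #2 (OF)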
1644 $code.=<<___;
1645 .type __rsaz_512_reducex,\@abi-omnipotent
1646 .align 32
1647 __rsaz_512_reducex:
1648 #movq 128+8(%rsp), %rdx # pull $n0
1649 imulq %r8, %rdx
1650 xorq %rsi, %rsi # cf=0,of=0
1651 movl \$8, %ecx
1652 jmp .Lreduction_loopx
1654 .align 32
1655 .Lreduction_loopx:
1656 mov %r8, %rbx
1657 mulx 0(%rbp), %rax, %r8
1658 adcx %rbx, %rax
1659 adox %r9, %r8
1661 mulx 8(%rbp), %rax, %r9
1662 adcx %rax, %r8
1663 adox %r10, %r9
1665 mulx 16(%rbp), %rbx, %r10
1666 adcx %rbx, %r9
1667 adox %r11, %r10
1669 mulx 24(%rbp), %rbx, %r11
1670 adcx %rbx, %r10
1671 adox %r12, %r11
1673 .byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 32(%rbp), %rbx, %r12
1674 mov %rdx, %rax
1675 mov %r8, %rdx
1676 adcx %rbx, %r11
1677 adox %r13, %r12
1679 mulx 128+8(%rsp), %rbx, %rdx
1680 mov %rax, %rdx
1682 mulx 40(%rbp), %rax, %r13
1683 adcx %rax, %r12
1684 adox %r14, %r13
1686 .byte 0xc4,0x62,0xfb,0xf6,0xb5,0x30,0x00,0x00,0x00 # mulx 48(%rbp), %rax, %r14
1687 adcx %rax, %r13
1688 adox %r15, %r14
1690 mulx 56(%rbp), %rax, %r15
1691 mov %rbx, %rdx
1692 adcx %rax, %r14
1693 adox %rsi, %r15 # %rsi is 0
1694 adcx %rsi, %r15 # cf=0
1696 decl %ecx # of=0
1697 jne .Lreduction_loopx
1700 .size __rsaz_512_reducex,.-__rsaz_512_reducex
1703 { # __rsaz_512_subtract
1704 # input: %r8-%r15, %rdi - $out, %rbp - $mod, %rcx - mask
1705 # output:
1706 # clobbers: everything but %rdi, %rsi and %rbp
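#
# %rcx is expected to be either all-zeroes or all-ones (the callers derive
# it from the carry of the preceding 512-bit addition via "sbbq %rcx,%rcx").
# The negq/notq sequence below forms the two's complement of the modulus,
# so the final addition computes, with no data-dependent branch,
#
#	out = r + ((2^512 - mod) & mask)  mod 2^512
#	    = r - mod	if mask is all-ones
#	    = r		otherwise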
1707 $code.=<<___;
1708 .type __rsaz_512_subtract,\@abi-omnipotent
1709 .align 32
1710 __rsaz_512_subtract:
1711 movq %r8, ($out)
1712 movq %r9, 8($out)
1713 movq %r10, 16($out)
1714 movq %r11, 24($out)
1715 movq %r12, 32($out)
1716 movq %r13, 40($out)
1717 movq %r14, 48($out)
1718 movq %r15, 56($out)
1720 movq 0($mod), %r8
1721 movq 8($mod), %r9
1722 negq %r8
1723 notq %r9
1724 andq %rcx, %r8
1725 movq 16($mod), %r10
1726 andq %rcx, %r9
1727 notq %r10
1728 movq 24($mod), %r11
1729 andq %rcx, %r10
1730 notq %r11
1731 movq 32($mod), %r12
1732 andq %rcx, %r11
1733 notq %r12
1734 movq 40($mod), %r13
1735 andq %rcx, %r12
1736 notq %r13
1737 movq 48($mod), %r14
1738 andq %rcx, %r13
1739 notq %r14
1740 movq 56($mod), %r15
1741 andq %rcx, %r14
1742 notq %r15
1743 andq %rcx, %r15
1745 addq ($out), %r8
1746 adcq 8($out), %r9
1747 adcq 16($out), %r10
1748 adcq 24($out), %r11
1749 adcq 32($out), %r12
1750 adcq 40($out), %r13
1751 adcq 48($out), %r14
1752 adcq 56($out), %r15
1754 movq %r8, ($out)
1755 movq %r9, 8($out)
1756 movq %r10, 16($out)
1757 movq %r11, 24($out)
1758 movq %r12, 32($out)
1759 movq %r13, 40($out)
1760 movq %r14, 48($out)
1761 movq %r15, 56($out)
1764 .size __rsaz_512_subtract,.-__rsaz_512_subtract
1767 { # __rsaz_512_mul
1769 # input: %rsi - ap, %rbp - bp
1770 # output:
1771 # clobbers: everything
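#
# Plain schoolbook 512x512 -> 1024-bit multiplication, one unrolled pass per
# limb of b, accumulating into the 16-qword scratch at 8(%rsp) (i.e. the
# caller's (%rsp), just past the return address); the caller preloads b[0]
# in %rbx. A conceptual C sketch (not the generated code; t[] assumed
# zero-initialized, unsigned __int128 used for the 64x64->128 products):
#
#	for (i = 0; i < 8; i++) {
#		uint64_t carry = 0;
#		for (j = 0; j < 8; j++) {
#			unsigned __int128 acc =
#				(unsigned __int128)a[j]*b[i] + t[i+j] + carry;
#			t[i+j] = (uint64_t)acc;
#			carry  = (uint64_t)(acc >> 64);
#		}
#		t[i+8] = carry;
#	}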
1772 my ($ap,$bp) = ("%rsi","%rbp");
1773 $code.=<<___;
1774 .type __rsaz_512_mul,\@abi-omnipotent
1775 .align 32
1776 __rsaz_512_mul:
1777 leaq 8(%rsp), %rdi
1779 movq ($ap), %rax
1780 mulq %rbx
1781 movq %rax, (%rdi)
1782 movq 8($ap), %rax
1783 movq %rdx, %r8
1785 mulq %rbx
1786 addq %rax, %r8
1787 movq 16($ap), %rax
1788 movq %rdx, %r9
1789 adcq \$0, %r9
1791 mulq %rbx
1792 addq %rax, %r9
1793 movq 24($ap), %rax
1794 movq %rdx, %r10
1795 adcq \$0, %r10
1797 mulq %rbx
1798 addq %rax, %r10
1799 movq 32($ap), %rax
1800 movq %rdx, %r11
1801 adcq \$0, %r11
1803 mulq %rbx
1804 addq %rax, %r11
1805 movq 40($ap), %rax
1806 movq %rdx, %r12
1807 adcq \$0, %r12
1809 mulq %rbx
1810 addq %rax, %r12
1811 movq 48($ap), %rax
1812 movq %rdx, %r13
1813 adcq \$0, %r13
1815 mulq %rbx
1816 addq %rax, %r13
1817 movq 56($ap), %rax
1818 movq %rdx, %r14
1819 adcq \$0, %r14
1821 mulq %rbx
1822 addq %rax, %r14
1823 movq ($ap), %rax
1824 movq %rdx, %r15
1825 adcq \$0, %r15
1827 leaq 8($bp), $bp
1828 leaq 8(%rdi), %rdi
1830 movl \$7, %ecx
1831 jmp .Loop_mul
1833 .align 32
1834 .Loop_mul:
1835 movq ($bp), %rbx
1836 mulq %rbx
1837 addq %rax, %r8
1838 movq 8($ap), %rax
1839 movq %r8, (%rdi)
1840 movq %rdx, %r8
1841 adcq \$0, %r8
1843 mulq %rbx
1844 addq %rax, %r9
1845 movq 16($ap), %rax
1846 adcq \$0, %rdx
1847 addq %r9, %r8
1848 movq %rdx, %r9
1849 adcq \$0, %r9
1851 mulq %rbx
1852 addq %rax, %r10
1853 movq 24($ap), %rax
1854 adcq \$0, %rdx
1855 addq %r10, %r9
1856 movq %rdx, %r10
1857 adcq \$0, %r10
1859 mulq %rbx
1860 addq %rax, %r11
1861 movq 32($ap), %rax
1862 adcq \$0, %rdx
1863 addq %r11, %r10
1864 movq %rdx, %r11
1865 adcq \$0, %r11
1867 mulq %rbx
1868 addq %rax, %r12
1869 movq 40($ap), %rax
1870 adcq \$0, %rdx
1871 addq %r12, %r11
1872 movq %rdx, %r12
1873 adcq \$0, %r12
1875 mulq %rbx
1876 addq %rax, %r13
1877 movq 48($ap), %rax
1878 adcq \$0, %rdx
1879 addq %r13, %r12
1880 movq %rdx, %r13
1881 adcq \$0, %r13
1883 mulq %rbx
1884 addq %rax, %r14
1885 movq 56($ap), %rax
1886 adcq \$0, %rdx
1887 addq %r14, %r13
1888 movq %rdx, %r14
1889 leaq 8($bp), $bp
1890 adcq \$0, %r14
1892 mulq %rbx
1893 addq %rax, %r15
1894 movq ($ap), %rax
1895 adcq \$0, %rdx
1896 addq %r15, %r14
1897 movq %rdx, %r15
1898 adcq \$0, %r15
1900 leaq 8(%rdi), %rdi
1902 decl %ecx
1903 jnz .Loop_mul
1905 movq %r8, (%rdi)
1906 movq %r9, 8(%rdi)
1907 movq %r10, 16(%rdi)
1908 movq %r11, 24(%rdi)
1909 movq %r12, 32(%rdi)
1910 movq %r13, 40(%rdi)
1911 movq %r14, 48(%rdi)
1912 movq %r15, 56(%rdi)
1915 .size __rsaz_512_mul,.-__rsaz_512_mul
1918 if ($addx) {
1919 # __rsaz_512_mulx
1921 # input: %rsi - ap, %rbp - bp
1922 # output:
1923 # clobbers: everything
1924 my ($ap,$bp,$zero) = ("%rsi","%rbp","%rdi");
1925 $code.=<<___;
1926 .type __rsaz_512_mulx,\@abi-omnipotent
1927 .align 32
1928 __rsaz_512_mulx:
1929 mulx ($ap), %rbx, %r8 # initial %rdx preloaded by caller
1930 mov \$-6, %rcx
1932 mulx 8($ap), %rax, %r9
1933 movq %rbx, 8(%rsp)
1935 mulx 16($ap), %rbx, %r10
1936 adc %rax, %r8
1938 mulx 24($ap), %rax, %r11
1939 adc %rbx, %r9
1941 mulx 32($ap), %rbx, %r12
1942 adc %rax, %r10
1944 mulx 40($ap), %rax, %r13
1945 adc %rbx, %r11
1947 mulx 48($ap), %rbx, %r14
1948 adc %rax, %r12
1950 mulx 56($ap), %rax, %r15
1951 mov 8($bp), %rdx
1952 adc %rbx, %r13
1953 adc %rax, %r14
1954 adc \$0, %r15
1956 xor $zero, $zero # cf=0,of=0
1957 jmp .Loop_mulx
1959 .align 32
1960 .Loop_mulx:
1961 movq %r8, %rbx
1962 mulx ($ap), %rax, %r8
1963 adcx %rax, %rbx
1964 adox %r9, %r8
1966 mulx 8($ap), %rax, %r9
1967 adcx %rax, %r8
1968 adox %r10, %r9
1970 mulx 16($ap), %rax, %r10
1971 adcx %rax, %r9
1972 adox %r11, %r10
1974 mulx 24($ap), %rax, %r11
1975 adcx %rax, %r10
1976 adox %r12, %r11
1978 .byte 0x3e,0xc4,0x62,0xfb,0xf6,0xa6,0x20,0x00,0x00,0x00 # mulx 32($ap), %rax, %r12
1979 adcx %rax, %r11
1980 adox %r13, %r12
1982 mulx 40($ap), %rax, %r13
1983 adcx %rax, %r12
1984 adox %r14, %r13
1986 mulx 48($ap), %rax, %r14
1987 adcx %rax, %r13
1988 adox %r15, %r14
1990 mulx 56($ap), %rax, %r15
1991 movq 64($bp,%rcx,8), %rdx
1992 movq %rbx, 8+64-8(%rsp,%rcx,8)
1993 adcx %rax, %r14
1994 adox $zero, %r15
1995 adcx $zero, %r15 # cf=0
1997 inc %rcx # of=0
1998 jnz .Loop_mulx
2000 movq %r8, %rbx
2001 mulx ($ap), %rax, %r8
2002 adcx %rax, %rbx
2003 adox %r9, %r8
2005 .byte 0xc4,0x62,0xfb,0xf6,0x8e,0x08,0x00,0x00,0x00 # mulx 8($ap), %rax, %r9
2006 adcx %rax, %r8
2007 adox %r10, %r9
2009 .byte 0xc4,0x62,0xfb,0xf6,0x96,0x10,0x00,0x00,0x00 # mulx 16($ap), %rax, %r10
2010 adcx %rax, %r9
2011 adox %r11, %r10
2013 mulx 24($ap), %rax, %r11
2014 adcx %rax, %r10
2015 adox %r12, %r11
2017 mulx 32($ap), %rax, %r12
2018 adcx %rax, %r11
2019 adox %r13, %r12
2021 mulx 40($ap), %rax, %r13
2022 adcx %rax, %r12
2023 adox %r14, %r13
2025 .byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($ap), %rax, %r14
2026 adcx %rax, %r13
2027 adox %r15, %r14
2029 .byte 0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00 # mulx 56($ap), %rax, %r15
2030 adcx %rax, %r14
2031 adox $zero, %r15
2032 adcx $zero, %r15
2034 mov %rbx, 8+64-8(%rsp)
2035 mov %r8, 8+64(%rsp)
2036 mov %r9, 8+64+8(%rsp)
2037 mov %r10, 8+64+16(%rsp)
2038 mov %r11, 8+64+24(%rsp)
2039 mov %r12, 8+64+32(%rsp)
2040 mov %r13, 8+64+40(%rsp)
2041 mov %r14, 8+64+48(%rsp)
2042 mov %r15, 8+64+56(%rsp)
2045 .size __rsaz_512_mulx,.-__rsaz_512_mulx
2049 my ($out,$inp,$power)= $win64 ? ("%rcx","%rdx","%r8d") : ("%rdi","%rsi","%edx");
2050 $code.=<<___;
2051 .globl rsaz_512_scatter4
2052 .type rsaz_512_scatter4,\@abi-omnipotent
2053 .align 16
2054 rsaz_512_scatter4:
2055 leaq ($out,$power,8), $out
2056 movl \$8, %r9d
2057 jmp .Loop_scatter
2058 .align 16
2059 .Loop_scatter:
2060 movq ($inp), %rax
2061 leaq 8($inp), $inp
2062 movq %rax, ($out)
2063 leaq 128($out), $out
2064 decl %r9d
2065 jnz .Loop_scatter
2067 .size rsaz_512_scatter4,.-rsaz_512_scatter4
2069 .globl rsaz_512_gather4
2070 .type rsaz_512_gather4,\@abi-omnipotent
2071 .align 16
2072 rsaz_512_gather4:
2074 $code.=<<___ if ($win64);
2075 .LSEH_begin_rsaz_512_gather4:
2076 .byte 0x48,0x81,0xec,0xa8,0x00,0x00,0x00 # sub $0xa8,%rsp
2077 .byte 0x0f,0x29,0x34,0x24 # movaps %xmm6,(%rsp)
2078 .byte 0x0f,0x29,0x7c,0x24,0x10 # movaps %xmm7,0x10(%rsp)
2079 .byte 0x44,0x0f,0x29,0x44,0x24,0x20 # movaps %xmm8,0x20(%rsp)
2080 .byte 0x44,0x0f,0x29,0x4c,0x24,0x30 # movaps %xmm9,0x30(%rsp)
2081 .byte 0x44,0x0f,0x29,0x54,0x24,0x40 # movaps %xmm10,0x40(%rsp)
2082 .byte 0x44,0x0f,0x29,0x5c,0x24,0x50 # movaps %xmm11,0x50(%rsp)
2083 .byte 0x44,0x0f,0x29,0x64,0x24,0x60 # movaps %xmm12,0x60(%rsp)
2084 .byte 0x44,0x0f,0x29,0x6c,0x24,0x70 # movaps %xmm13,0x70(%rsp)
2085 .byte 0x44,0x0f,0x29,0xb4,0x24,0x80,0,0,0 # movaps %xmm14,0x80(%rsp)
2086 .byte 0x44,0x0f,0x29,0xbc,0x24,0x90,0,0,0 # movaps %xmm15,0x90(%rsp)
2088 $code.=<<___;
2089 movd $power,%xmm8
2090 movdqa .Linc+16(%rip),%xmm1 # 00000002000000020000000200000002
2091 movdqa .Linc(%rip),%xmm0 # 00000001000000010000000000000000
2093 pshufd \$0,%xmm8,%xmm8 # broadcast $power
2094 movdqa %xmm1,%xmm7
2095 movdqa %xmm1,%xmm2
2097 ########################################################################
2098 # calculate mask by comparing 0..15 to $power
2100 for($i=0;$i<4;$i++) {
2101 $code.=<<___;
2102 paddd %xmm`$i`,%xmm`$i+1`
2103 pcmpeqd %xmm8,%xmm`$i`
2104 movdqa %xmm7,%xmm`$i+3`
2107 for(;$i<7;$i++) {
2108 $code.=<<___;
2109 paddd %xmm`$i`,%xmm`$i+1`
2110 pcmpeqd %xmm8,%xmm`$i`
2113 $code.=<<___;
2114 pcmpeqd %xmm8,%xmm7
2115 movl \$8, %r9d
2116 jmp .Loop_gather
2117 .align 16
2118 .Loop_gather:
2119 movdqa 16*0($inp),%xmm8
2120 movdqa 16*1($inp),%xmm9
2121 movdqa 16*2($inp),%xmm10
2122 movdqa 16*3($inp),%xmm11
2123 pand %xmm0,%xmm8
2124 movdqa 16*4($inp),%xmm12
2125 pand %xmm1,%xmm9
2126 movdqa 16*5($inp),%xmm13
2127 pand %xmm2,%xmm10
2128 movdqa 16*6($inp),%xmm14
2129 pand %xmm3,%xmm11
2130 movdqa 16*7($inp),%xmm15
2131 leaq 128($inp), $inp
2132 pand %xmm4,%xmm12
2133 pand %xmm5,%xmm13
2134 pand %xmm6,%xmm14
2135 pand %xmm7,%xmm15
2136 por %xmm10,%xmm8
2137 por %xmm11,%xmm9
2138 por %xmm12,%xmm8
2139 por %xmm13,%xmm9
2140 por %xmm14,%xmm8
2141 por %xmm15,%xmm9
2143 por %xmm9,%xmm8
2144 pshufd \$0x4e,%xmm8,%xmm9
2145 por %xmm9,%xmm8
2146 movq %xmm8,($out)
2147 leaq 8($out), $out
2148 decl %r9d
2149 jnz .Loop_gather
2151 $code.=<<___ if ($win64);
2152 movaps 0x00(%rsp),%xmm6
2153 movaps 0x10(%rsp),%xmm7
2154 movaps 0x20(%rsp),%xmm8
2155 movaps 0x30(%rsp),%xmm9
2156 movaps 0x40(%rsp),%xmm10
2157 movaps 0x50(%rsp),%xmm11
2158 movaps 0x60(%rsp),%xmm12
2159 movaps 0x70(%rsp),%xmm13
2160 movaps 0x80(%rsp),%xmm14
2161 movaps 0x90(%rsp),%xmm15
2162 add \$0xa8,%rsp
2164 $code.=<<___;
2166 .LSEH_end_rsaz_512_gather4:
2167 .size rsaz_512_gather4,.-rsaz_512_gather4
2169 .align 64
2170 .Linc:
2171 .long 0,0, 1,1
2172 .long 2,2, 2,2
2176 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
2177 # CONTEXT *context,DISPATCHER_CONTEXT *disp)
2178 if ($win64) {
2179 $rec="%rcx";
2180 $frame="%rdx";
2181 $context="%r8";
2182 $disp="%r9";
2184 $code.=<<___;
2185 .extern __imp_RtlVirtualUnwind
2186 .type se_handler,\@abi-omnipotent
2187 .align 16
2188 se_handler:
2189 push %rsi
2190 push %rdi
2191 push %rbx
2192 push %rbp
2193 push %r12
2194 push %r13
2195 push %r14
2196 push %r15
2197 pushfq
2198 sub \$64,%rsp
2200 mov 120($context),%rax # pull context->Rax
2201 mov 248($context),%rbx # pull context->Rip
2203 mov 8($disp),%rsi # disp->ImageBase
2204 mov 56($disp),%r11 # disp->HandlerData
2206 mov 0(%r11),%r10d # HandlerData[0]
2207 lea (%rsi,%r10),%r10 # end of prologue label
2208 cmp %r10,%rbx # context->Rip<end of prologue label
2209 jb .Lcommon_seh_tail
2211 mov 152($context),%rax # pull context->Rsp
2213 mov 4(%r11),%r10d # HandlerData[1]
2214 lea (%rsi,%r10),%r10 # epilogue label
2215 cmp %r10,%rbx # context->Rip>=epilogue label
2216 jae .Lcommon_seh_tail
2218 lea 128+24+48(%rax),%rax
2220 lea .Lmul_gather4_epilogue(%rip),%rbx
2221 cmp %r10,%rbx
2222 jne .Lse_not_in_mul_gather4
2224 lea 0xb0(%rax),%rax
2226 lea -48-0xa8(%rax),%rsi
2227 lea 512($context),%rdi
2228 mov \$20,%ecx
2229 .long 0xa548f3fc # cld; rep movsq
2231 .Lse_not_in_mul_gather4:
2232 mov -8(%rax),%rbx
2233 mov -16(%rax),%rbp
2234 mov -24(%rax),%r12
2235 mov -32(%rax),%r13
2236 mov -40(%rax),%r14
2237 mov -48(%rax),%r15
2238 mov %rbx,144($context) # restore context->Rbx
2239 mov %rbp,160($context) # restore context->Rbp
2240 mov %r12,216($context) # restore context->R12
2241 mov %r13,224($context) # restore context->R13
2242 mov %r14,232($context) # restore context->R14
2243 mov %r15,240($context) # restore context->R15
2245 .Lcommon_seh_tail:
2246 mov 8(%rax),%rdi
2247 mov 16(%rax),%rsi
2248 mov %rax,152($context) # restore context->Rsp
2249 mov %rsi,168($context) # restore context->Rsi
2250 mov %rdi,176($context) # restore context->Rdi
2252 mov 40($disp),%rdi # disp->ContextRecord
2253 mov $context,%rsi # context
2254 mov \$154,%ecx # sizeof(CONTEXT)
2255 .long 0xa548f3fc # cld; rep movsq
2257 mov $disp,%rsi
2258 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
2259 mov 8(%rsi),%rdx # arg2, disp->ImageBase
2260 mov 0(%rsi),%r8 # arg3, disp->ControlPc
2261 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
2262 mov 40(%rsi),%r10 # disp->ContextRecord
2263 lea 56(%rsi),%r11 # &disp->HandlerData
2264 lea 24(%rsi),%r12 # &disp->EstablisherFrame
2265 mov %r10,32(%rsp) # arg5
2266 mov %r11,40(%rsp) # arg6
2267 mov %r12,48(%rsp) # arg7
2268 mov %rcx,56(%rsp) # arg8, (NULL)
2269 call *__imp_RtlVirtualUnwind(%rip)
2271 mov \$1,%eax # ExceptionContinueSearch
2272 add \$64,%rsp
2273 popfq
2274 pop %r15
2275 pop %r14
2276 pop %r13
2277 pop %r12
2278 pop %rbp
2279 pop %rbx
2280 pop %rdi
2281 pop %rsi
2283 .size se_handler,.-se_handler
2285 .section .pdata
2286 .align 4
2287 .rva .LSEH_begin_rsaz_512_sqr
2288 .rva .LSEH_end_rsaz_512_sqr
2289 .rva .LSEH_info_rsaz_512_sqr
2291 .rva .LSEH_begin_rsaz_512_mul
2292 .rva .LSEH_end_rsaz_512_mul
2293 .rva .LSEH_info_rsaz_512_mul
2295 .rva .LSEH_begin_rsaz_512_mul_gather4
2296 .rva .LSEH_end_rsaz_512_mul_gather4
2297 .rva .LSEH_info_rsaz_512_mul_gather4
2299 .rva .LSEH_begin_rsaz_512_mul_scatter4
2300 .rva .LSEH_end_rsaz_512_mul_scatter4
2301 .rva .LSEH_info_rsaz_512_mul_scatter4
2303 .rva .LSEH_begin_rsaz_512_mul_by_one
2304 .rva .LSEH_end_rsaz_512_mul_by_one
2305 .rva .LSEH_info_rsaz_512_mul_by_one
2307 .rva .LSEH_begin_rsaz_512_gather4
2308 .rva .LSEH_end_rsaz_512_gather4
2309 .rva .LSEH_info_rsaz_512_gather4
2311 .section .xdata
2312 .align 8
2313 .LSEH_info_rsaz_512_sqr:
2314 .byte 9,0,0,0
2315 .rva se_handler
2316 .rva .Lsqr_body,.Lsqr_epilogue # HandlerData[]
2317 .LSEH_info_rsaz_512_mul:
2318 .byte 9,0,0,0
2319 .rva se_handler
2320 .rva .Lmul_body,.Lmul_epilogue # HandlerData[]
2321 .LSEH_info_rsaz_512_mul_gather4:
2322 .byte 9,0,0,0
2323 .rva se_handler
2324 .rva .Lmul_gather4_body,.Lmul_gather4_epilogue # HandlerData[]
2325 .LSEH_info_rsaz_512_mul_scatter4:
2326 .byte 9,0,0,0
2327 .rva se_handler
2328 .rva .Lmul_scatter4_body,.Lmul_scatter4_epilogue # HandlerData[]
2329 .LSEH_info_rsaz_512_mul_by_one:
2330 .byte 9,0,0,0
2331 .rva se_handler
2332 .rva .Lmul_by_one_body,.Lmul_by_one_epilogue # HandlerData[]
2333 .LSEH_info_rsaz_512_gather4:
2334 .byte 0x01,0x46,0x16,0x00
2335 .byte 0x46,0xf8,0x09,0x00 # vmovaps 0x90(rsp),xmm15
2336 .byte 0x3d,0xe8,0x08,0x00 # vmovaps 0x80(rsp),xmm14
2337 .byte 0x34,0xd8,0x07,0x00 # vmovaps 0x70(rsp),xmm13
2338 .byte 0x2e,0xc8,0x06,0x00 # vmovaps 0x60(rsp),xmm12
2339 .byte 0x28,0xb8,0x05,0x00 # vmovaps 0x50(rsp),xmm11
2340 .byte 0x22,0xa8,0x04,0x00 # vmovaps 0x40(rsp),xmm10
2341 .byte 0x1c,0x98,0x03,0x00 # vmovaps 0x30(rsp),xmm9
2342 .byte 0x16,0x88,0x02,0x00 # vmovaps 0x20(rsp),xmm8
2343 .byte 0x10,0x78,0x01,0x00 # vmovaps 0x10(rsp),xmm7
2344 .byte 0x0b,0x68,0x00,0x00 # vmovaps 0x00(rsp),xmm6
2345 .byte 0x07,0x01,0x15,0x00 # sub rsp,0xa8
2349 $code =~ s/\`([^\`]*)\`/eval $1/gem;
2350 print $code;
2351 close STDOUT;