1 #!/usr/bin/env perl
3 ##############################################################################
4 # #
5 # Copyright (c) 2012, Intel Corporation #
6 # #
7 # All rights reserved. #
8 # #
9 # Redistribution and use in source and binary forms, with or without #
10 # modification, are permitted provided that the following conditions are #
11 # met: #
12 # #
13 # * Redistributions of source code must retain the above copyright #
14 # notice, this list of conditions and the following disclaimer. #
15 # #
16 # * Redistributions in binary form must reproduce the above copyright #
17 # notice, this list of conditions and the following disclaimer in the #
18 # documentation and/or other materials provided with the #
19 # distribution. #
20 # #
21 # * Neither the name of the Intel Corporation nor the names of its #
22 # contributors may be used to endorse or promote products derived from #
23 # this software without specific prior written permission. #
24 # #
25 # #
26 # THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY #
27 # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE #
28 # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR #
29 # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR #
30 # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, #
31 # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, #
32 # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR #
33 # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF #
34 # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING #
35 # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS #
36 # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #
37 # #
38 ##############################################################################
39 # Developers and authors: #
40 # Shay Gueron (1, 2), and Vlad Krasnov (1) #
41 # (1) Intel Architecture Group, Microprocessor and Chipset Development, #
42 # Israel Development Center, Haifa, Israel #
43 # (2) University of Haifa #
44 ##############################################################################
45 # Reference: #
46 # [1] S. Gueron, "Efficient Software Implementations of Modular #
47 # Exponentiation", http://eprint.iacr.org/2011/239 #
48 # [2] S. Gueron, V. Krasnov. "Speeding up Big-Numbers Squaring". #
49 # IEEE Proceedings of 9th International Conference on Information #
50 # Technology: New Generations (ITNG 2012), 821-823 (2012). #
51 # [3] S. Gueron, Efficient Software Implementations of Modular Exponentiation#
52 # Journal of Cryptographic Engineering 2:31-43 (2012). #
53 # [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis #
54 # resistant 512-bit and 1024-bit modular exponentiation for optimizing #
55 # RSA1024 and RSA2048 on x86_64 platforms", #
56 # http://rt.openssl.org/Ticket/Display.html?id=2582&user=guest&pass=guest#
57 ##############################################################################
59 # While the original submission covers 512- and 1024-bit exponentiation,
60 # this module is limited to the 512-bit version only (and as such
61 # accelerates RSA1024 signing). This is because the improvement for
62 # longer keys is not high enough to justify the effort; the highest
63 # measured was ~5% on Westmere. [This is relative to OpenSSL 1.0.2,
64 # upcoming at the time of this writing.] Nor does this module implement
65 # a "monolithic" all-in-one exponentiation jumbo-subroutine; it adheres
66 # to a more modular mixture of C and assembly. And it is optimized even
67 # for processors other than the Intel Core family (see the table below
68 # for improvement coefficients).
69 # <appro@openssl.org>
71 # RSA1024 sign/sec    this/original  |this/rsax(*)   this/fips(*)
72 # -----------------------------------+------------------------------
73 # Opteron               +13%         |    +5%            +20%
74 # Bulldozer              -0%         |    -1%            +10%
75 # P4                    +11%         |    +7%             +8%
76 # Westmere               +5%         |   +14%            +17%
77 # Sandy Bridge           +2%         |   +12%            +29%
78 # Ivy Bridge             +1%         |   +11%            +35%
79 # Haswell(**)            -0%         |   +12%            +39%
80 # Atom                  +13%         |   +11%             +4%
81 # VIA Nano              +70%         |    +9%            +25%
83 # (*) rsax engine and fips numbers are presented for reference
84 # purposes;
85 # (**) MULX was attempted, but found to give only marginal improvement;
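#
# The entry points generated below (rsaz_512_sqr, rsaz_512_mul,
# rsaz_512_mul_gather4, rsaz_512_mul_scatter4, rsaz_512_mul_by_one,
# rsaz_512_scatter4 and rsaz_512_gather4) all operate on 512-bit values
# held as eight little-endian 64-bit limbs and are meant to be driven
# from the companion C code (rsaz_exp.c) as the primitive steps of a
# fixed-window Montgomery exponentiation; the scatter/gather routines
# maintain its table of 16 pre-computed powers (a 4-bit window).
# Roughly, and assuming the conventional prototypes of the companion C
# code, the two core primitives look like
#
#	void rsaz_512_sqr(uint64_t ret[8], const uint64_t a[8],
#			  const uint64_t n[8], uint64_t n0, int cnt);
#	void rsaz_512_mul(uint64_t ret[8], const uint64_t a[8],
#			  const uint64_t b[8], const uint64_t n[8],
#			  uint64_t n0);
#
# where n0 = -n^-1 mod 2^64 is the usual Montgomery constant.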
87 $flavour = shift;
88 $output = shift;
89 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
91 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
93 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
94 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
95 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
96 die "can't locate x86_64-xlate.pl";
98 open OUT,"| \"$^X\" $xlate $flavour $output";
99 *STDOUT=*OUT;
101 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
102 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
103 $addx = ($1>=2.23);
106 if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
107 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
108 $addx = ($1>=2.10);
111 if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
112 `ml64 2>&1` =~ /Version ([0-9]+)\./) {
113 $addx = ($1>=12);
116 if (!$addx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9])\.([0-9]+)/) {
117 my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10
118 $addx = ($ver>=3.03);
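# $addx is set only when the assembler (and, on Windows, the chosen
# toolchain) is new enough to encode MULX/ADCX/ADOX: the cut-off
# versions probed above are GNU as 2.23, nasm 2.10, MASM (ml64) 12 and
# clang 3.3.  Whether that code path is actually taken is still decided
# at run time, by testing bits 8 (BMI2, i.e. MULX) and 19 (ADX) of the
# third OPENSSL_ia32cap_P word, which mirrors CPUID leaf 7 EBX; hence
# the 0x80100 mask, (1<<19)|(1<<8), used in the functions below.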
121 ($out, $inp, $mod) = ("%rdi", "%rsi", "%rbp"); # common internal API
123 my ($out,$inp,$mod,$n0,$times) = ("%rdi","%rsi","%rdx","%rcx","%r8d");
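# rsaz_512_sqr(out, inp, mod, n0, times) performs `times' consecutive
# Montgomery squarings of the 512-bit value at inp, each step mapping
# x to x^2*R^-1 mod mod with R = 2^512, so that on return
# out = inp^(2^times) * R^-(2^times-1) mod mod.  Every pass squares to
# a 1024-bit intermediate on the stack, reduces it with
# __rsaz_512_reduce (or __rsaz_512_reducex on ADX-capable parts) and
# lets __rsaz_512_subtract fold in the final conditional subtraction.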
125 $code.=<<___;
126 .text
128 .extern OPENSSL_ia32cap_P
130 .globl rsaz_512_sqr
131 .type rsaz_512_sqr,\@function,5
132 .align 32
133 rsaz_512_sqr: # 25-29% faster than rsaz_512_mul
134 push %rbx
135 push %rbp
136 push %r12
137 push %r13
138 push %r14
139 push %r15
141 subq \$128+24, %rsp
142 .Lsqr_body:
143 movq $mod, %rbp # common argument
144 movq ($inp), %rdx
145 movq 8($inp), %rax
146 movq $n0, 128(%rsp)
148 $code.=<<___ if ($addx);
149 movl \$0x80100,%r11d
150 andl OPENSSL_ia32cap_P+8(%rip),%r11d
151 cmpl \$0x80100,%r11d # check for MULX and ADO/CX
152 je .Loop_sqrx
154 $code.=<<___;
155 jmp .Loop_sqr
157 .align 32
158 .Loop_sqr:
159 movl $times,128+8(%rsp)
160 #first iteration
161 movq %rdx, %rbx
162 mulq %rdx
163 movq %rax, %r8
164 movq 16($inp), %rax
165 movq %rdx, %r9
167 mulq %rbx
168 addq %rax, %r9
169 movq 24($inp), %rax
170 movq %rdx, %r10
171 adcq \$0, %r10
173 mulq %rbx
174 addq %rax, %r10
175 movq 32($inp), %rax
176 movq %rdx, %r11
177 adcq \$0, %r11
179 mulq %rbx
180 addq %rax, %r11
181 movq 40($inp), %rax
182 movq %rdx, %r12
183 adcq \$0, %r12
185 mulq %rbx
186 addq %rax, %r12
187 movq 48($inp), %rax
188 movq %rdx, %r13
189 adcq \$0, %r13
191 mulq %rbx
192 addq %rax, %r13
193 movq 56($inp), %rax
194 movq %rdx, %r14
195 adcq \$0, %r14
197 mulq %rbx
198 addq %rax, %r14
199 movq %rbx, %rax
200 movq %rdx, %r15
201 adcq \$0, %r15
203 addq %r8, %r8 #shlq \$1, %r8
204 movq %r9, %rcx
205 adcq %r9, %r9 #shld \$1, %r8, %r9
207 mulq %rax
208 movq %rax, (%rsp)
209 addq %rdx, %r8
210 adcq \$0, %r9
212 movq %r8, 8(%rsp)
213 shrq \$63, %rcx
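# %r8-%r15 hold the row a[0]*a[1..7] at this point.  In a square every
# cross product occurs twice, so the row is doubled as it is retired
# (the add/adc and lea-by-2 sequences stand in for the slower shld, as
# the #shld remarks indicate), the diagonal term a[0]^2 is folded into
# the two lowest output words, and the bit saved by the shift above
# carries the doubling over into the next iteration.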
215 #second iteration
216 movq 8($inp), %r8
217 movq 16($inp), %rax
218 mulq %r8
219 addq %rax, %r10
220 movq 24($inp), %rax
221 movq %rdx, %rbx
222 adcq \$0, %rbx
224 mulq %r8
225 addq %rax, %r11
226 movq 32($inp), %rax
227 adcq \$0, %rdx
228 addq %rbx, %r11
229 movq %rdx, %rbx
230 adcq \$0, %rbx
232 mulq %r8
233 addq %rax, %r12
234 movq 40($inp), %rax
235 adcq \$0, %rdx
236 addq %rbx, %r12
237 movq %rdx, %rbx
238 adcq \$0, %rbx
240 mulq %r8
241 addq %rax, %r13
242 movq 48($inp), %rax
243 adcq \$0, %rdx
244 addq %rbx, %r13
245 movq %rdx, %rbx
246 adcq \$0, %rbx
248 mulq %r8
249 addq %rax, %r14
250 movq 56($inp), %rax
251 adcq \$0, %rdx
252 addq %rbx, %r14
253 movq %rdx, %rbx
254 adcq \$0, %rbx
256 mulq %r8
257 addq %rax, %r15
258 movq %r8, %rax
259 adcq \$0, %rdx
260 addq %rbx, %r15
261 movq %rdx, %r8
262 movq %r10, %rdx
263 adcq \$0, %r8
265 add %rdx, %rdx
266 lea (%rcx,%r10,2), %r10 #shld \$1, %rcx, %r10
267 movq %r11, %rbx
268 adcq %r11, %r11 #shld \$1, %r10, %r11
270 mulq %rax
271 addq %rax, %r9
272 adcq %rdx, %r10
273 adcq \$0, %r11
275 movq %r9, 16(%rsp)
276 movq %r10, 24(%rsp)
277 shrq \$63, %rbx
279 #third iteration
280 movq 16($inp), %r9
281 movq 24($inp), %rax
282 mulq %r9
283 addq %rax, %r12
284 movq 32($inp), %rax
285 movq %rdx, %rcx
286 adcq \$0, %rcx
288 mulq %r9
289 addq %rax, %r13
290 movq 40($inp), %rax
291 adcq \$0, %rdx
292 addq %rcx, %r13
293 movq %rdx, %rcx
294 adcq \$0, %rcx
296 mulq %r9
297 addq %rax, %r14
298 movq 48($inp), %rax
299 adcq \$0, %rdx
300 addq %rcx, %r14
301 movq %rdx, %rcx
302 adcq \$0, %rcx
304 mulq %r9
305 movq %r12, %r10
306 lea (%rbx,%r12,2), %r12 #shld \$1, %rbx, %r12
307 addq %rax, %r15
308 movq 56($inp), %rax
309 adcq \$0, %rdx
310 addq %rcx, %r15
311 movq %rdx, %rcx
312 adcq \$0, %rcx
314 mulq %r9
315 shrq \$63, %r10
316 addq %rax, %r8
317 movq %r9, %rax
318 adcq \$0, %rdx
319 addq %rcx, %r8
320 movq %rdx, %r9
321 adcq \$0, %r9
323 movq %r13, %rcx
324 leaq (%r10,%r13,2), %r13 #shld \$1, %r12, %r13
326 mulq %rax
327 addq %rax, %r11
328 adcq %rdx, %r12
329 adcq \$0, %r13
331 movq %r11, 32(%rsp)
332 movq %r12, 40(%rsp)
333 shrq \$63, %rcx
335 #fourth iteration
336 movq 24($inp), %r10
337 movq 32($inp), %rax
338 mulq %r10
339 addq %rax, %r14
340 movq 40($inp), %rax
341 movq %rdx, %rbx
342 adcq \$0, %rbx
344 mulq %r10
345 addq %rax, %r15
346 movq 48($inp), %rax
347 adcq \$0, %rdx
348 addq %rbx, %r15
349 movq %rdx, %rbx
350 adcq \$0, %rbx
352 mulq %r10
353 movq %r14, %r12
354 leaq (%rcx,%r14,2), %r14 #shld \$1, %rcx, %r14
355 addq %rax, %r8
356 movq 56($inp), %rax
357 adcq \$0, %rdx
358 addq %rbx, %r8
359 movq %rdx, %rbx
360 adcq \$0, %rbx
362 mulq %r10
363 shrq \$63, %r12
364 addq %rax, %r9
365 movq %r10, %rax
366 adcq \$0, %rdx
367 addq %rbx, %r9
368 movq %rdx, %r10
369 adcq \$0, %r10
371 movq %r15, %rbx
372 leaq (%r12,%r15,2),%r15 #shld \$1, %r14, %r15
374 mulq %rax
375 addq %rax, %r13
376 adcq %rdx, %r14
377 adcq \$0, %r15
379 movq %r13, 48(%rsp)
380 movq %r14, 56(%rsp)
381 shrq \$63, %rbx
383 #fifth iteration
384 movq 32($inp), %r11
385 movq 40($inp), %rax
386 mulq %r11
387 addq %rax, %r8
388 movq 48($inp), %rax
389 movq %rdx, %rcx
390 adcq \$0, %rcx
392 mulq %r11
393 addq %rax, %r9
394 movq 56($inp), %rax
395 adcq \$0, %rdx
396 movq %r8, %r12
397 leaq (%rbx,%r8,2), %r8 #shld \$1, %rbx, %r8
398 addq %rcx, %r9
399 movq %rdx, %rcx
400 adcq \$0, %rcx
402 mulq %r11
403 shrq \$63, %r12
404 addq %rax, %r10
405 movq %r11, %rax
406 adcq \$0, %rdx
407 addq %rcx, %r10
408 movq %rdx, %r11
409 adcq \$0, %r11
411 movq %r9, %rcx
412 leaq (%r12,%r9,2), %r9 #shld \$1, %r8, %r9
414 mulq %rax
415 addq %rax, %r15
416 adcq %rdx, %r8
417 adcq \$0, %r9
419 movq %r15, 64(%rsp)
420 movq %r8, 72(%rsp)
421 shrq \$63, %rcx
423 #sixth iteration
424 movq 40($inp), %r12
425 movq 48($inp), %rax
426 mulq %r12
427 addq %rax, %r10
428 movq 56($inp), %rax
429 movq %rdx, %rbx
430 adcq \$0, %rbx
432 mulq %r12
433 addq %rax, %r11
434 movq %r12, %rax
435 movq %r10, %r15
436 leaq (%rcx,%r10,2), %r10 #shld \$1, %rcx, %r10
437 adcq \$0, %rdx
438 shrq \$63, %r15
439 addq %rbx, %r11
440 movq %rdx, %r12
441 adcq \$0, %r12
443 movq %r11, %rbx
444 leaq (%r15,%r11,2), %r11 #shld \$1, %r10, %r11
446 mulq %rax
447 addq %rax, %r9
448 adcq %rdx, %r10
449 adcq \$0, %r11
451 movq %r9, 80(%rsp)
452 movq %r10, 88(%rsp)
454 #seventh iteration
455 movq 48($inp), %r13
456 movq 56($inp), %rax
457 mulq %r13
458 addq %rax, %r12
459 movq %r13, %rax
460 movq %rdx, %r13
461 adcq \$0, %r13
463 xorq %r14, %r14
464 shlq \$1, %rbx
465 adcq %r12, %r12 #shld \$1, %rbx, %r12
466 adcq %r13, %r13 #shld \$1, %r12, %r13
467 adcq %r14, %r14 #shld \$1, %r13, %r14
469 mulq %rax
470 addq %rax, %r11
471 adcq %rdx, %r12
472 adcq \$0, %r13
474 movq %r11, 96(%rsp)
475 movq %r12, 104(%rsp)
477 #eighth iteration
478 movq 56($inp), %rax
479 mulq %rax
480 addq %rax, %r13
481 adcq \$0, %rdx
483 addq %rdx, %r14
485 movq %r13, 112(%rsp)
486 movq %r14, 120(%rsp)
488 movq (%rsp), %r8
489 movq 8(%rsp), %r9
490 movq 16(%rsp), %r10
491 movq 24(%rsp), %r11
492 movq 32(%rsp), %r12
493 movq 40(%rsp), %r13
494 movq 48(%rsp), %r14
495 movq 56(%rsp), %r15
497 call __rsaz_512_reduce
499 addq 64(%rsp), %r8
500 adcq 72(%rsp), %r9
501 adcq 80(%rsp), %r10
502 adcq 88(%rsp), %r11
503 adcq 96(%rsp), %r12
504 adcq 104(%rsp), %r13
505 adcq 112(%rsp), %r14
506 adcq 120(%rsp), %r15
507 sbbq %rcx, %rcx
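# %rcx is now all-ones if the addition above carried out of 2^512 and
# zero otherwise; __rsaz_512_subtract takes it as the mask for a
# constant-time conditional subtraction of the modulus.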
509 call __rsaz_512_subtract
511 movq %r8, %rdx
512 movq %r9, %rax
513 movl 128+8(%rsp), $times
514 movq $out, $inp
516 decl $times
517 jnz .Loop_sqr
519 if ($addx) {
520 $code.=<<___;
521 jmp .Lsqr_tail
523 .align 32
524 .Loop_sqrx:
525 movl $times,128+8(%rsp)
526 movq $out, %xmm0 # off-load
527 movq %rbp, %xmm1 # off-load
528 #first iteration
529 mulx %rax, %r8, %r9
531 mulx 16($inp), %rcx, %r10
532 xor %rbp, %rbp # cf=0, of=0
534 mulx 24($inp), %rax, %r11
535 adcx %rcx, %r9
537 mulx 32($inp), %rcx, %r12
538 adcx %rax, %r10
540 mulx 40($inp), %rax, %r13
541 adcx %rcx, %r11
543 .byte 0xc4,0x62,0xf3,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($inp), %rcx, %r14
544 adcx %rax, %r12
545 adcx %rcx, %r13
547 .byte 0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00 # mulx 56($inp), %rax, %r15
548 adcx %rax, %r14
549 adcx %rbp, %r15 # %rbp is 0
551 mov %r9, %rcx
552 shld \$1, %r8, %r9
553 shl \$1, %r8
555 xor %ebp, %ebp
556 mulx %rdx, %rax, %rdx
557 adcx %rdx, %r8
558 mov 8($inp), %rdx
559 adcx %rbp, %r9
561 mov %rax, (%rsp)
562 mov %r8, 8(%rsp)
564 #second iteration
565 mulx 16($inp), %rax, %rbx
566 adox %rax, %r10
567 adcx %rbx, %r11
569 .byte 0xc4,0x62,0xc3,0xf6,0x86,0x18,0x00,0x00,0x00 # mulx 24($inp), $out, %r8
570 adox $out, %r11
571 adcx %r8, %r12
573 mulx 32($inp), %rax, %rbx
574 adox %rax, %r12
575 adcx %rbx, %r13
577 mulx 40($inp), $out, %r8
578 adox $out, %r13
579 adcx %r8, %r14
581 .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rbx
582 adox %rax, %r14
583 adcx %rbx, %r15
585 .byte 0xc4,0x62,0xc3,0xf6,0x86,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r8
586 adox $out, %r15
587 adcx %rbp, %r8
588 adox %rbp, %r8
590 mov %r11, %rbx
591 shld \$1, %r10, %r11
592 shld \$1, %rcx, %r10
594 xor %ebp,%ebp
595 mulx %rdx, %rax, %rcx
596 mov 16($inp), %rdx
597 adcx %rax, %r9
598 adcx %rcx, %r10
599 adcx %rbp, %r11
601 mov %r9, 16(%rsp)
602 .byte 0x4c,0x89,0x94,0x24,0x18,0x00,0x00,0x00 # mov %r10, 24(%rsp)
604 #third iteration
605 .byte 0xc4,0x62,0xc3,0xf6,0x8e,0x18,0x00,0x00,0x00 # mulx 24($inp), $out, %r9
606 adox $out, %r12
607 adcx %r9, %r13
609 mulx 32($inp), %rax, %rcx
610 adox %rax, %r13
611 adcx %rcx, %r14
613 mulx 40($inp), $out, %r9
614 adox $out, %r14
615 adcx %r9, %r15
617 .byte 0xc4,0xe2,0xfb,0xf6,0x8e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rcx
618 adox %rax, %r15
619 adcx %rcx, %r8
621 .byte 0xc4,0x62,0xc3,0xf6,0x8e,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r9
622 adox $out, %r8
623 adcx %rbp, %r9
624 adox %rbp, %r9
626 mov %r13, %rcx
627 shld \$1, %r12, %r13
628 shld \$1, %rbx, %r12
630 xor %ebp, %ebp
631 mulx %rdx, %rax, %rdx
632 adcx %rax, %r11
633 adcx %rdx, %r12
634 mov 24($inp), %rdx
635 adcx %rbp, %r13
637 mov %r11, 32(%rsp)
638 .byte 0x4c,0x89,0xa4,0x24,0x28,0x00,0x00,0x00 # mov %r12, 40(%rsp)
640 #fourth iteration
641 .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x20,0x00,0x00,0x00 # mulx 32($inp), %rax, %rbx
642 adox %rax, %r14
643 adcx %rbx, %r15
645 mulx 40($inp), $out, %r10
646 adox $out, %r15
647 adcx %r10, %r8
649 mulx 48($inp), %rax, %rbx
650 adox %rax, %r8
651 adcx %rbx, %r9
653 mulx 56($inp), $out, %r10
654 adox $out, %r9
655 adcx %rbp, %r10
656 adox %rbp, %r10
658 .byte 0x66
659 mov %r15, %rbx
660 shld \$1, %r14, %r15
661 shld \$1, %rcx, %r14
663 xor %ebp, %ebp
664 mulx %rdx, %rax, %rdx
665 adcx %rax, %r13
666 adcx %rdx, %r14
667 mov 32($inp), %rdx
668 adcx %rbp, %r15
670 mov %r13, 48(%rsp)
671 mov %r14, 56(%rsp)
673 #fifth iteration
674 .byte 0xc4,0x62,0xc3,0xf6,0x9e,0x28,0x00,0x00,0x00 # mulx 40($inp), $out, %r11
675 adox $out, %r8
676 adcx %r11, %r9
678 mulx 48($inp), %rax, %rcx
679 adox %rax, %r9
680 adcx %rcx, %r10
682 mulx 56($inp), $out, %r11
683 adox $out, %r10
684 adcx %rbp, %r11
685 adox %rbp, %r11
687 mov %r9, %rcx
688 shld \$1, %r8, %r9
689 shld \$1, %rbx, %r8
691 xor %ebp, %ebp
692 mulx %rdx, %rax, %rdx
693 adcx %rax, %r15
694 adcx %rdx, %r8
695 mov 40($inp), %rdx
696 adcx %rbp, %r9
698 mov %r15, 64(%rsp)
699 mov %r8, 72(%rsp)
701 #sixth iteration
702 .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rbx
703 adox %rax, %r10
704 adcx %rbx, %r11
706 .byte 0xc4,0x62,0xc3,0xf6,0xa6,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r12
707 adox $out, %r11
708 adcx %rbp, %r12
709 adox %rbp, %r12
711 mov %r11, %rbx
712 shld \$1, %r10, %r11
713 shld \$1, %rcx, %r10
715 xor %ebp, %ebp
716 mulx %rdx, %rax, %rdx
717 adcx %rax, %r9
718 adcx %rdx, %r10
719 mov 48($inp), %rdx
720 adcx %rbp, %r11
722 mov %r9, 80(%rsp)
723 mov %r10, 88(%rsp)
725 #seventh iteration
726 .byte 0xc4,0x62,0xfb,0xf6,0xae,0x38,0x00,0x00,0x00 # mulx 56($inp), %rax, %r13
727 adox %rax, %r12
728 adox %rbp, %r13
730 xor %r14, %r14
731 shld \$1, %r13, %r14
732 shld \$1, %r12, %r13
733 shld \$1, %rbx, %r12
735 xor %ebp, %ebp
736 mulx %rdx, %rax, %rdx
737 adcx %rax, %r11
738 adcx %rdx, %r12
739 mov 56($inp), %rdx
740 adcx %rbp, %r13
742 .byte 0x4c,0x89,0x9c,0x24,0x60,0x00,0x00,0x00 # mov %r11, 96(%rsp)
743 .byte 0x4c,0x89,0xa4,0x24,0x68,0x00,0x00,0x00 # mov %r12, 104(%rsp)
745 #eighth iteration
746 mulx %rdx, %rax, %rdx
747 adox %rax, %r13
748 adox %rbp, %rdx
750 .byte 0x66
751 add %rdx, %r14
753 movq %r13, 112(%rsp)
754 movq %r14, 120(%rsp)
755 movq %xmm0, $out
756 movq %xmm1, %rbp
758 movq 128(%rsp), %rdx # pull $n0
759 movq (%rsp), %r8
760 movq 8(%rsp), %r9
761 movq 16(%rsp), %r10
762 movq 24(%rsp), %r11
763 movq 32(%rsp), %r12
764 movq 40(%rsp), %r13
765 movq 48(%rsp), %r14
766 movq 56(%rsp), %r15
768 call __rsaz_512_reducex
770 addq 64(%rsp), %r8
771 adcq 72(%rsp), %r9
772 adcq 80(%rsp), %r10
773 adcq 88(%rsp), %r11
774 adcq 96(%rsp), %r12
775 adcq 104(%rsp), %r13
776 adcq 112(%rsp), %r14
777 adcq 120(%rsp), %r15
778 sbbq %rcx, %rcx
780 call __rsaz_512_subtract
782 movq %r8, %rdx
783 movq %r9, %rax
784 movl 128+8(%rsp), $times
785 movq $out, $inp
787 decl $times
788 jnz .Loop_sqrx
790 .Lsqr_tail:
793 $code.=<<___;
795 leaq 128+24+48(%rsp), %rax
796 movq -48(%rax), %r15
797 movq -40(%rax), %r14
798 movq -32(%rax), %r13
799 movq -24(%rax), %r12
800 movq -16(%rax), %rbp
801 movq -8(%rax), %rbx
802 leaq (%rax), %rsp
803 .Lsqr_epilogue:
805 .size rsaz_512_sqr,.-rsaz_512_sqr
809 my ($out,$ap,$bp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8");
810 $code.=<<___;
811 .globl rsaz_512_mul
812 .type rsaz_512_mul,\@function,5
813 .align 32
814 rsaz_512_mul:
815 push %rbx
816 push %rbp
817 push %r12
818 push %r13
819 push %r14
820 push %r15
822 subq \$128+24, %rsp
823 .Lmul_body:
824 movq $out, %xmm0 # off-load arguments
825 movq $mod, %xmm1
826 movq $n0, 128(%rsp)
828 $code.=<<___ if ($addx);
829 movl \$0x80100,%r11d
830 andl OPENSSL_ia32cap_P+8(%rip),%r11d
831 cmpl \$0x80100,%r11d # check for MULX and ADO/CX
832 je .Lmulx
834 $code.=<<___;
835 movq ($bp), %rbx # pass b[0]
836 movq $bp, %rbp # pass argument
837 call __rsaz_512_mul
839 movq %xmm0, $out
840 movq %xmm1, %rbp
842 movq (%rsp), %r8
843 movq 8(%rsp), %r9
844 movq 16(%rsp), %r10
845 movq 24(%rsp), %r11
846 movq 32(%rsp), %r12
847 movq 40(%rsp), %r13
848 movq 48(%rsp), %r14
849 movq 56(%rsp), %r15
851 call __rsaz_512_reduce
853 $code.=<<___ if ($addx);
854 jmp .Lmul_tail
856 .align 32
857 .Lmulx:
858 movq $bp, %rbp # pass argument
859 movq ($bp), %rdx # pass b[0]
860 call __rsaz_512_mulx
862 movq %xmm0, $out
863 movq %xmm1, %rbp
865 movq 128(%rsp), %rdx # pull $n0
866 movq (%rsp), %r8
867 movq 8(%rsp), %r9
868 movq 16(%rsp), %r10
869 movq 24(%rsp), %r11
870 movq 32(%rsp), %r12
871 movq 40(%rsp), %r13
872 movq 48(%rsp), %r14
873 movq 56(%rsp), %r15
875 call __rsaz_512_reducex
876 .Lmul_tail:
878 $code.=<<___;
879 addq 64(%rsp), %r8
880 adcq 72(%rsp), %r9
881 adcq 80(%rsp), %r10
882 adcq 88(%rsp), %r11
883 adcq 96(%rsp), %r12
884 adcq 104(%rsp), %r13
885 adcq 112(%rsp), %r14
886 adcq 120(%rsp), %r15
887 sbbq %rcx, %rcx
889 call __rsaz_512_subtract
891 leaq 128+24+48(%rsp), %rax
892 movq -48(%rax), %r15
893 movq -40(%rax), %r14
894 movq -32(%rax), %r13
895 movq -24(%rax), %r12
896 movq -16(%rax), %rbp
897 movq -8(%rax), %rbx
898 leaq (%rax), %rsp
899 .Lmul_epilogue:
901 .size rsaz_512_mul,.-rsaz_512_mul
905 my ($out,$ap,$bp,$mod,$n0,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
906 $code.=<<___;
907 .globl rsaz_512_mul_gather4
908 .type rsaz_512_mul_gather4,\@function,6
909 .align 32
910 rsaz_512_mul_gather4:
911 push %rbx
912 push %rbp
913 push %r12
914 push %r13
915 push %r14
916 push %r15
918 mov $pwr, $pwr
919 subq \$128+24, %rsp
920 .Lmul_gather4_body:
922 $code.=<<___ if ($addx);
923 movl \$0x80100,%r11d
924 andl OPENSSL_ia32cap_P+8(%rip),%r11d
925 cmpl \$0x80100,%r11d # check for MULX and ADO/CX
926 je .Lmulx_gather
928 $code.=<<___;
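# Gather b[0] for the requested power: each 64-bit word of a table
# entry is kept as two 32-bit halves 64 bytes apart (see the layout
# notes before rsaz_512_scatter4 below), so b[0] is reassembled here
# with a shift and an or before the multiply-and-gather loop starts.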
929 movl 64($bp,$pwr,4), %eax
930 movq $out, %xmm0 # off-load arguments
931 movl ($bp,$pwr,4), %ebx
932 movq $mod, %xmm1
933 movq $n0, 128(%rsp)
935 shlq \$32, %rax
936 or %rax, %rbx
937 movq ($ap), %rax
938 movq 8($ap), %rcx
939 leaq 128($bp,$pwr,4), %rbp
940 mulq %rbx # 0 iteration
941 movq %rax, (%rsp)
942 movq %rcx, %rax
943 movq %rdx, %r8
945 mulq %rbx
946 movd (%rbp), %xmm4
947 addq %rax, %r8
948 movq 16($ap), %rax
949 movq %rdx, %r9
950 adcq \$0, %r9
952 mulq %rbx
953 movd 64(%rbp), %xmm5
954 addq %rax, %r9
955 movq 24($ap), %rax
956 movq %rdx, %r10
957 adcq \$0, %r10
959 mulq %rbx
960 pslldq \$4, %xmm5
961 addq %rax, %r10
962 movq 32($ap), %rax
963 movq %rdx, %r11
964 adcq \$0, %r11
966 mulq %rbx
967 por %xmm5, %xmm4
968 addq %rax, %r11
969 movq 40($ap), %rax
970 movq %rdx, %r12
971 adcq \$0, %r12
973 mulq %rbx
974 addq %rax, %r12
975 movq 48($ap), %rax
976 movq %rdx, %r13
977 adcq \$0, %r13
979 mulq %rbx
980 leaq 128(%rbp), %rbp
981 addq %rax, %r13
982 movq 56($ap), %rax
983 movq %rdx, %r14
984 adcq \$0, %r14
986 mulq %rbx
987 movq %xmm4, %rbx
988 addq %rax, %r14
989 movq ($ap), %rax
990 movq %rdx, %r15
991 adcq \$0, %r15
993 leaq 8(%rsp), %rdi
994 movl \$7, %ecx
995 jmp .Loop_mul_gather
997 .align 32
998 .Loop_mul_gather:
999 mulq %rbx
1000 addq %rax, %r8
1001 movq 8($ap), %rax
1002 movq %r8, (%rdi)
1003 movq %rdx, %r8
1004 adcq \$0, %r8
1006 mulq %rbx
1007 movd (%rbp), %xmm4
1008 addq %rax, %r9
1009 movq 16($ap), %rax
1010 adcq \$0, %rdx
1011 addq %r9, %r8
1012 movq %rdx, %r9
1013 adcq \$0, %r9
1015 mulq %rbx
1016 movd 64(%rbp), %xmm5
1017 addq %rax, %r10
1018 movq 24($ap), %rax
1019 adcq \$0, %rdx
1020 addq %r10, %r9
1021 movq %rdx, %r10
1022 adcq \$0, %r10
1024 mulq %rbx
1025 pslldq \$4, %xmm5
1026 addq %rax, %r11
1027 movq 32($ap), %rax
1028 adcq \$0, %rdx
1029 addq %r11, %r10
1030 movq %rdx, %r11
1031 adcq \$0, %r11
1033 mulq %rbx
1034 por %xmm5, %xmm4
1035 addq %rax, %r12
1036 movq 40($ap), %rax
1037 adcq \$0, %rdx
1038 addq %r12, %r11
1039 movq %rdx, %r12
1040 adcq \$0, %r12
1042 mulq %rbx
1043 addq %rax, %r13
1044 movq 48($ap), %rax
1045 adcq \$0, %rdx
1046 addq %r13, %r12
1047 movq %rdx, %r13
1048 adcq \$0, %r13
1050 mulq %rbx
1051 addq %rax, %r14
1052 movq 56($ap), %rax
1053 adcq \$0, %rdx
1054 addq %r14, %r13
1055 movq %rdx, %r14
1056 adcq \$0, %r14
1058 mulq %rbx
1059 movq %xmm4, %rbx
1060 addq %rax, %r15
1061 movq ($ap), %rax
1062 adcq \$0, %rdx
1063 addq %r15, %r14
1064 movq %rdx, %r15
1065 adcq \$0, %r15
1067 leaq 128(%rbp), %rbp
1068 leaq 8(%rdi), %rdi
1070 decl %ecx
1071 jnz .Loop_mul_gather
1073 movq %r8, (%rdi)
1074 movq %r9, 8(%rdi)
1075 movq %r10, 16(%rdi)
1076 movq %r11, 24(%rdi)
1077 movq %r12, 32(%rdi)
1078 movq %r13, 40(%rdi)
1079 movq %r14, 48(%rdi)
1080 movq %r15, 56(%rdi)
1082 movq %xmm0, $out
1083 movq %xmm1, %rbp
1085 movq (%rsp), %r8
1086 movq 8(%rsp), %r9
1087 movq 16(%rsp), %r10
1088 movq 24(%rsp), %r11
1089 movq 32(%rsp), %r12
1090 movq 40(%rsp), %r13
1091 movq 48(%rsp), %r14
1092 movq 56(%rsp), %r15
1094 call __rsaz_512_reduce
1096 $code.=<<___ if ($addx);
1097 jmp .Lmul_gather_tail
1099 .align 32
1100 .Lmulx_gather:
1101 mov 64($bp,$pwr,4), %eax
1102 movq $out, %xmm0 # off-load arguments
1103 lea 128($bp,$pwr,4), %rbp
1104 mov ($bp,$pwr,4), %edx
1105 movq $mod, %xmm1
1106 mov $n0, 128(%rsp)
1108 shl \$32, %rax
1109 or %rax, %rdx
1110 mulx ($ap), %rbx, %r8 # 0 iteration
1111 mov %rbx, (%rsp)
1112 xor %edi, %edi # cf=0, of=0
1114 mulx 8($ap), %rax, %r9
1115 movd (%rbp), %xmm4
1117 mulx 16($ap), %rbx, %r10
1118 movd 64(%rbp), %xmm5
1119 adcx %rax, %r8
1121 mulx 24($ap), %rax, %r11
1122 pslldq \$4, %xmm5
1123 adcx %rbx, %r9
1125 mulx 32($ap), %rbx, %r12
1126 por %xmm5, %xmm4
1127 adcx %rax, %r10
1129 mulx 40($ap), %rax, %r13
1130 adcx %rbx, %r11
1132 mulx 48($ap), %rbx, %r14
1133 lea 128(%rbp), %rbp
1134 adcx %rax, %r12
1136 mulx 56($ap), %rax, %r15
1137 movq %xmm4, %rdx
1138 adcx %rbx, %r13
1139 adcx %rax, %r14
1140 mov %r8, %rbx
1141 adcx %rdi, %r15 # %rdi is 0
1143 mov \$-7, %rcx
1144 jmp .Loop_mulx_gather
1146 .align 32
1147 .Loop_mulx_gather:
1148 mulx ($ap), %rax, %r8
1149 adcx %rax, %rbx
1150 adox %r9, %r8
1152 mulx 8($ap), %rax, %r9
1153 .byte 0x66,0x0f,0x6e,0xa5,0x00,0x00,0x00,0x00 # movd (%rbp), %xmm4
1154 adcx %rax, %r8
1155 adox %r10, %r9
1157 mulx 16($ap), %rax, %r10
1158 movd 64(%rbp), %xmm5
1159 lea 128(%rbp), %rbp
1160 adcx %rax, %r9
1161 adox %r11, %r10
1163 .byte 0xc4,0x62,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00 # mulx 24($ap), %rax, %r11
1164 pslldq \$4, %xmm5
1165 por %xmm5, %xmm4
1166 adcx %rax, %r10
1167 adox %r12, %r11
1169 mulx 32($ap), %rax, %r12
1170 adcx %rax, %r11
1171 adox %r13, %r12
1173 mulx 40($ap), %rax, %r13
1174 adcx %rax, %r12
1175 adox %r14, %r13
1177 .byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($ap), %rax, %r14
1178 adcx %rax, %r13
1179 adox %r15, %r14
1181 mulx 56($ap), %rax, %r15
1182 movq %xmm4, %rdx
1183 mov %rbx, 64(%rsp,%rcx,8)
1184 adcx %rax, %r14
1185 adox %rdi, %r15
1186 mov %r8, %rbx
1187 adcx %rdi, %r15 # cf=0
1189 inc %rcx # of=0
1190 jnz .Loop_mulx_gather
1192 mov %r8, 64(%rsp)
1193 mov %r9, 64+8(%rsp)
1194 mov %r10, 64+16(%rsp)
1195 mov %r11, 64+24(%rsp)
1196 mov %r12, 64+32(%rsp)
1197 mov %r13, 64+40(%rsp)
1198 mov %r14, 64+48(%rsp)
1199 mov %r15, 64+56(%rsp)
1201 movq %xmm0, $out
1202 movq %xmm1, %rbp
1204 mov 128(%rsp), %rdx # pull $n0
1205 mov (%rsp), %r8
1206 mov 8(%rsp), %r9
1207 mov 16(%rsp), %r10
1208 mov 24(%rsp), %r11
1209 mov 32(%rsp), %r12
1210 mov 40(%rsp), %r13
1211 mov 48(%rsp), %r14
1212 mov 56(%rsp), %r15
1214 call __rsaz_512_reducex
1216 .Lmul_gather_tail:
1218 $code.=<<___;
1219 addq 64(%rsp), %r8
1220 adcq 72(%rsp), %r9
1221 adcq 80(%rsp), %r10
1222 adcq 88(%rsp), %r11
1223 adcq 96(%rsp), %r12
1224 adcq 104(%rsp), %r13
1225 adcq 112(%rsp), %r14
1226 adcq 120(%rsp), %r15
1227 sbbq %rcx, %rcx
1229 call __rsaz_512_subtract
1231 leaq 128+24+48(%rsp), %rax
1232 movq -48(%rax), %r15
1233 movq -40(%rax), %r14
1234 movq -32(%rax), %r13
1235 movq -24(%rax), %r12
1236 movq -16(%rax), %rbp
1237 movq -8(%rax), %rbx
1238 leaq (%rax), %rsp
1239 .Lmul_gather4_epilogue:
1241 .size rsaz_512_mul_gather4,.-rsaz_512_mul_gather4
1245 my ($out,$ap,$mod,$n0,$tbl,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
1246 $code.=<<___;
1247 .globl rsaz_512_mul_scatter4
1248 .type rsaz_512_mul_scatter4,\@function,6
1249 .align 32
1250 rsaz_512_mul_scatter4:
1251 push %rbx
1252 push %rbp
1253 push %r12
1254 push %r13
1255 push %r14
1256 push %r15
1258 mov $pwr, $pwr
1259 subq \$128+24, %rsp
1260 .Lmul_scatter4_body:
1261 leaq ($tbl,$pwr,4), $tbl
1262 movq $out, %xmm0 # off-load arguments
1263 movq $mod, %xmm1
1264 movq $tbl, %xmm2
1265 movq $n0, 128(%rsp)
1267 movq $out, %rbp
1269 $code.=<<___ if ($addx);
1270 movl \$0x80100,%r11d
1271 andl OPENSSL_ia32cap_P+8(%rip),%r11d
1272 cmpl \$0x80100,%r11d # check for MULX and ADO/CX
1273 je .Lmulx_scatter
1275 $code.=<<___;
1276 movq ($out),%rbx # pass b[0]
1277 call __rsaz_512_mul
1279 movq %xmm0, $out
1280 movq %xmm1, %rbp
1282 movq (%rsp), %r8
1283 movq 8(%rsp), %r9
1284 movq 16(%rsp), %r10
1285 movq 24(%rsp), %r11
1286 movq 32(%rsp), %r12
1287 movq 40(%rsp), %r13
1288 movq 48(%rsp), %r14
1289 movq 56(%rsp), %r15
1291 call __rsaz_512_reduce
1293 $code.=<<___ if ($addx);
1294 jmp .Lmul_scatter_tail
1296 .align 32
1297 .Lmulx_scatter:
1298 movq ($out), %rdx # pass b[0]
1299 call __rsaz_512_mulx
1301 movq %xmm0, $out
1302 movq %xmm1, %rbp
1304 movq 128(%rsp), %rdx # pull $n0
1305 movq (%rsp), %r8
1306 movq 8(%rsp), %r9
1307 movq 16(%rsp), %r10
1308 movq 24(%rsp), %r11
1309 movq 32(%rsp), %r12
1310 movq 40(%rsp), %r13
1311 movq 48(%rsp), %r14
1312 movq 56(%rsp), %r15
1314 call __rsaz_512_reducex
1316 .Lmul_scatter_tail:
1318 $code.=<<___;
1319 addq 64(%rsp), %r8
1320 adcq 72(%rsp), %r9
1321 adcq 80(%rsp), %r10
1322 adcq 88(%rsp), %r11
1323 adcq 96(%rsp), %r12
1324 adcq 104(%rsp), %r13
1325 adcq 112(%rsp), %r14
1326 adcq 120(%rsp), %r15
1327 movq %xmm2, $inp
1328 sbbq %rcx, %rcx
1330 call __rsaz_512_subtract
1332 movl %r8d, 64*0($inp) # scatter
1333 shrq \$32, %r8
1334 movl %r9d, 64*2($inp)
1335 shrq \$32, %r9
1336 movl %r10d, 64*4($inp)
1337 shrq \$32, %r10
1338 movl %r11d, 64*6($inp)
1339 shrq \$32, %r11
1340 movl %r12d, 64*8($inp)
1341 shrq \$32, %r12
1342 movl %r13d, 64*10($inp)
1343 shrq \$32, %r13
1344 movl %r14d, 64*12($inp)
1345 shrq \$32, %r14
1346 movl %r15d, 64*14($inp)
1347 shrq \$32, %r15
1348 movl %r8d, 64*1($inp)
1349 movl %r9d, 64*3($inp)
1350 movl %r10d, 64*5($inp)
1351 movl %r11d, 64*7($inp)
1352 movl %r12d, 64*9($inp)
1353 movl %r13d, 64*11($inp)
1354 movl %r14d, 64*13($inp)
1355 movl %r15d, 64*15($inp)
1357 leaq 128+24+48(%rsp), %rax
1358 movq -48(%rax), %r15
1359 movq -40(%rax), %r14
1360 movq -32(%rax), %r13
1361 movq -24(%rax), %r12
1362 movq -16(%rax), %rbp
1363 movq -8(%rax), %rbx
1364 leaq (%rax), %rsp
1365 .Lmul_scatter4_epilogue:
1367 .size rsaz_512_mul_scatter4,.-rsaz_512_mul_scatter4
1371 my ($out,$inp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx");
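# rsaz_512_mul_by_one(out, inp, mod, n0) takes a value out of the
# Montgomery domain: inp is, in effect, treated as the low half of a
# 1024-bit number whose upper half is zero, and a bare Montgomery
# reduction then leaves out = inp * R^-1 mod mod (R = 2^512), i.e. the
# result of a Montgomery multiplication by 1.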
1372 $code.=<<___;
1373 .globl rsaz_512_mul_by_one
1374 .type rsaz_512_mul_by_one,\@function,4
1375 .align 32
1376 rsaz_512_mul_by_one:
1377 push %rbx
1378 push %rbp
1379 push %r12
1380 push %r13
1381 push %r14
1382 push %r15
1384 subq \$128+24, %rsp
1385 .Lmul_by_one_body:
1387 $code.=<<___ if ($addx);
1388 movl OPENSSL_ia32cap_P+8(%rip),%eax
1390 $code.=<<___;
1391 movq $mod, %rbp # reassign argument
1392 movq $n0, 128(%rsp)
1394 movq ($inp), %r8
1395 pxor %xmm0, %xmm0
1396 movq 8($inp), %r9
1397 movq 16($inp), %r10
1398 movq 24($inp), %r11
1399 movq 32($inp), %r12
1400 movq 40($inp), %r13
1401 movq 48($inp), %r14
1402 movq 56($inp), %r15
1404 movdqa %xmm0, (%rsp)
1405 movdqa %xmm0, 16(%rsp)
1406 movdqa %xmm0, 32(%rsp)
1407 movdqa %xmm0, 48(%rsp)
1408 movdqa %xmm0, 64(%rsp)
1409 movdqa %xmm0, 80(%rsp)
1410 movdqa %xmm0, 96(%rsp)
1412 $code.=<<___ if ($addx);
1413 andl \$0x80100,%eax
1414 cmpl \$0x80100,%eax # check for MULX and ADO/CX
1415 je .Lby_one_callx
1417 $code.=<<___;
1418 call __rsaz_512_reduce
1420 $code.=<<___ if ($addx);
1421 jmp .Lby_one_tail
1422 .align 32
1423 .Lby_one_callx:
1424 movq 128(%rsp), %rdx # pull $n0
1425 call __rsaz_512_reducex
1426 .Lby_one_tail:
1428 $code.=<<___;
1429 movq %r8, ($out)
1430 movq %r9, 8($out)
1431 movq %r10, 16($out)
1432 movq %r11, 24($out)
1433 movq %r12, 32($out)
1434 movq %r13, 40($out)
1435 movq %r14, 48($out)
1436 movq %r15, 56($out)
1438 leaq 128+24+48(%rsp), %rax
1439 movq -48(%rax), %r15
1440 movq -40(%rax), %r14
1441 movq -32(%rax), %r13
1442 movq -24(%rax), %r12
1443 movq -16(%rax), %rbp
1444 movq -8(%rax), %rbx
1445 leaq (%rax), %rsp
1446 .Lmul_by_one_epilogue:
1448 .size rsaz_512_mul_by_one,.-rsaz_512_mul_by_one
1451 { # __rsaz_512_reduce
1453 # input: %r8-%r15, %rbp - mod, 128(%rsp) - n0
1454 # output: %r8-%r15
1455 # clobbers: everything except %rbp and %rdi
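#
# This is one word-by-word Montgomery reduction pass over the value in
# %r8-%r15 (the callers then add in the upper half of the 1024-bit
# product from the stack).  Each of the 8 iterations picks
#
#	m = n0 * t[0] mod 2^64		(n0 = -mod^-1 mod 2^64)
#
# and adds m*mod, which clears the lowest word so the running value can
# be shifted down by one word; after 8 iterations the registers hold
# t * 2^-512 mod mod, up to the final conditional subtraction performed
# by the caller.  n0 is read from 128+8(%rsp) rather than 128(%rsp)
# because the call pushed a return address on top of the caller's frame.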
1456 $code.=<<___;
1457 .type __rsaz_512_reduce,\@abi-omnipotent
1458 .align 32
1459 __rsaz_512_reduce:
1460 movq %r8, %rbx
1461 imulq 128+8(%rsp), %rbx
1462 movq 0(%rbp), %rax
1463 movl \$8, %ecx
1464 jmp .Lreduction_loop
1466 .align 32
1467 .Lreduction_loop:
1468 mulq %rbx
1469 movq 8(%rbp), %rax
1470 negq %r8
1471 movq %rdx, %r8
1472 adcq \$0, %r8
1474 mulq %rbx
1475 addq %rax, %r9
1476 movq 16(%rbp), %rax
1477 adcq \$0, %rdx
1478 addq %r9, %r8
1479 movq %rdx, %r9
1480 adcq \$0, %r9
1482 mulq %rbx
1483 addq %rax, %r10
1484 movq 24(%rbp), %rax
1485 adcq \$0, %rdx
1486 addq %r10, %r9
1487 movq %rdx, %r10
1488 adcq \$0, %r10
1490 mulq %rbx
1491 addq %rax, %r11
1492 movq 32(%rbp), %rax
1493 adcq \$0, %rdx
1494 addq %r11, %r10
1495 movq 128+8(%rsp), %rsi
1496 #movq %rdx, %r11
1497 #adcq \$0, %r11
1498 adcq \$0, %rdx
1499 movq %rdx, %r11
1501 mulq %rbx
1502 addq %rax, %r12
1503 movq 40(%rbp), %rax
1504 adcq \$0, %rdx
1505 imulq %r8, %rsi
1506 addq %r12, %r11
1507 movq %rdx, %r12
1508 adcq \$0, %r12
1510 mulq %rbx
1511 addq %rax, %r13
1512 movq 48(%rbp), %rax
1513 adcq \$0, %rdx
1514 addq %r13, %r12
1515 movq %rdx, %r13
1516 adcq \$0, %r13
1518 mulq %rbx
1519 addq %rax, %r14
1520 movq 56(%rbp), %rax
1521 adcq \$0, %rdx
1522 addq %r14, %r13
1523 movq %rdx, %r14
1524 adcq \$0, %r14
1526 mulq %rbx
1527 movq %rsi, %rbx
1528 addq %rax, %r15
1529 movq 0(%rbp), %rax
1530 adcq \$0, %rdx
1531 addq %r15, %r14
1532 movq %rdx, %r15
1533 adcq \$0, %r15
1535 decl %ecx
1536 jne .Lreduction_loop
1539 .size __rsaz_512_reduce,.-__rsaz_512_reduce
1542 if ($addx) {
1543 # __rsaz_512_reducex
1545 # input: %r8-%r15, %rbp - mod, 128(%rsp) - n0
1546 # output: %r8-%r15
1547 # clobbers: everything except %rbp and %rdi
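#
# Same reduction as above, but built around MULX/ADCX/ADOX: MULX takes
# its multiplicand from %rdx and leaves the flags untouched, while ADCX
# uses only CF and ADOX only OF, so two independent carry chains can be
# run through each row without the extra moves the plain mulq version
# needs to juggle partial sums.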
1548 $code.=<<___;
1549 .type __rsaz_512_reducex,\@abi-omnipotent
1550 .align 32
1551 __rsaz_512_reducex:
1552 #movq 128+8(%rsp), %rdx # pull $n0
1553 imulq %r8, %rdx
1554 xorq %rsi, %rsi # cf=0,of=0
1555 movl \$8, %ecx
1556 jmp .Lreduction_loopx
1558 .align 32
1559 .Lreduction_loopx:
1560 mov %r8, %rbx
1561 mulx 0(%rbp), %rax, %r8
1562 adcx %rbx, %rax
1563 adox %r9, %r8
1565 mulx 8(%rbp), %rax, %r9
1566 adcx %rax, %r8
1567 adox %r10, %r9
1569 mulx 16(%rbp), %rbx, %r10
1570 adcx %rbx, %r9
1571 adox %r11, %r10
1573 mulx 24(%rbp), %rbx, %r11
1574 adcx %rbx, %r10
1575 adox %r12, %r11
1577 .byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 32(%rbp), %rbx, %r12
1578 mov %rdx, %rax
1579 mov %r8, %rdx
1580 adcx %rbx, %r11
1581 adox %r13, %r12
1583 mulx 128+8(%rsp), %rbx, %rdx
1584 mov %rax, %rdx
1586 mulx 40(%rbp), %rax, %r13
1587 adcx %rax, %r12
1588 adox %r14, %r13
1590 .byte 0xc4,0x62,0xfb,0xf6,0xb5,0x30,0x00,0x00,0x00 # mulx 48(%rbp), %rax, %r14
1591 adcx %rax, %r13
1592 adox %r15, %r14
1594 mulx 56(%rbp), %rax, %r15
1595 mov %rbx, %rdx
1596 adcx %rax, %r14
1597 adox %rsi, %r15 # %rsi is 0
1598 adcx %rsi, %r15 # cf=0
1600 decl %ecx # of=0
1601 jne .Lreduction_loopx
1604 .size __rsaz_512_reducex,.-__rsaz_512_reducex
1607 { # __rsaz_512_subtract
1608 # input: %r8-%r15, %rdi - $out, %rbp - $mod, %rcx - mask
1609 # output:
1610 # clobbers: everything but %rdi, %rsi and %rbp
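#
# The mask in %rcx is expected to be either 0 or all-ones (the callers
# derive it with sbbq %rcx,%rcx from the carry of the preceding
# addition).  The routine stores %r8-%r15 to $out, builds (-$mod) & mask
# by negating the low word and complementing the rest, and adds it back
# in: with the mask set this subtracts the modulus, with the mask clear
# it adds zero, so the final reduction step executes the same
# instruction sequence either way.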
1611 $code.=<<___;
1612 .type __rsaz_512_subtract,\@abi-omnipotent
1613 .align 32
1614 __rsaz_512_subtract:
1615 movq %r8, ($out)
1616 movq %r9, 8($out)
1617 movq %r10, 16($out)
1618 movq %r11, 24($out)
1619 movq %r12, 32($out)
1620 movq %r13, 40($out)
1621 movq %r14, 48($out)
1622 movq %r15, 56($out)
1624 movq 0($mod), %r8
1625 movq 8($mod), %r9
1626 negq %r8
1627 notq %r9
1628 andq %rcx, %r8
1629 movq 16($mod), %r10
1630 andq %rcx, %r9
1631 notq %r10
1632 movq 24($mod), %r11
1633 andq %rcx, %r10
1634 notq %r11
1635 movq 32($mod), %r12
1636 andq %rcx, %r11
1637 notq %r12
1638 movq 40($mod), %r13
1639 andq %rcx, %r12
1640 notq %r13
1641 movq 48($mod), %r14
1642 andq %rcx, %r13
1643 notq %r14
1644 movq 56($mod), %r15
1645 andq %rcx, %r14
1646 notq %r15
1647 andq %rcx, %r15
1649 addq ($out), %r8
1650 adcq 8($out), %r9
1651 adcq 16($out), %r10
1652 adcq 24($out), %r11
1653 adcq 32($out), %r12
1654 adcq 40($out), %r13
1655 adcq 48($out), %r14
1656 adcq 56($out), %r15
1658 movq %r8, ($out)
1659 movq %r9, 8($out)
1660 movq %r10, 16($out)
1661 movq %r11, 24($out)
1662 movq %r12, 32($out)
1663 movq %r13, 40($out)
1664 movq %r14, 48($out)
1665 movq %r15, 56($out)
1668 .size __rsaz_512_subtract,.-__rsaz_512_subtract
1671 { # __rsaz_512_mul
1673 # input: %rsi - ap, %rbp - bp
1674 # output:
1675 # clobbers: everything
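#
# Plain schoolbook 512x512-bit multiplication producing a 1024-bit
# result at 8(%rsp): the caller preloads b[0] in %rbx, the first row
# a[]*b[0] is written out directly, and .Loop_mul then runs seven more
# times, multiplying a[] by the next word of b[] and accumulating into
# the running result.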
1676 my ($ap,$bp) = ("%rsi","%rbp");
1677 $code.=<<___;
1678 .type __rsaz_512_mul,\@abi-omnipotent
1679 .align 32
1680 __rsaz_512_mul:
1681 leaq 8(%rsp), %rdi
1683 movq ($ap), %rax
1684 mulq %rbx
1685 movq %rax, (%rdi)
1686 movq 8($ap), %rax
1687 movq %rdx, %r8
1689 mulq %rbx
1690 addq %rax, %r8
1691 movq 16($ap), %rax
1692 movq %rdx, %r9
1693 adcq \$0, %r9
1695 mulq %rbx
1696 addq %rax, %r9
1697 movq 24($ap), %rax
1698 movq %rdx, %r10
1699 adcq \$0, %r10
1701 mulq %rbx
1702 addq %rax, %r10
1703 movq 32($ap), %rax
1704 movq %rdx, %r11
1705 adcq \$0, %r11
1707 mulq %rbx
1708 addq %rax, %r11
1709 movq 40($ap), %rax
1710 movq %rdx, %r12
1711 adcq \$0, %r12
1713 mulq %rbx
1714 addq %rax, %r12
1715 movq 48($ap), %rax
1716 movq %rdx, %r13
1717 adcq \$0, %r13
1719 mulq %rbx
1720 addq %rax, %r13
1721 movq 56($ap), %rax
1722 movq %rdx, %r14
1723 adcq \$0, %r14
1725 mulq %rbx
1726 addq %rax, %r14
1727 movq ($ap), %rax
1728 movq %rdx, %r15
1729 adcq \$0, %r15
1731 leaq 8($bp), $bp
1732 leaq 8(%rdi), %rdi
1734 movl \$7, %ecx
1735 jmp .Loop_mul
1737 .align 32
1738 .Loop_mul:
1739 movq ($bp), %rbx
1740 mulq %rbx
1741 addq %rax, %r8
1742 movq 8($ap), %rax
1743 movq %r8, (%rdi)
1744 movq %rdx, %r8
1745 adcq \$0, %r8
1747 mulq %rbx
1748 addq %rax, %r9
1749 movq 16($ap), %rax
1750 adcq \$0, %rdx
1751 addq %r9, %r8
1752 movq %rdx, %r9
1753 adcq \$0, %r9
1755 mulq %rbx
1756 addq %rax, %r10
1757 movq 24($ap), %rax
1758 adcq \$0, %rdx
1759 addq %r10, %r9
1760 movq %rdx, %r10
1761 adcq \$0, %r10
1763 mulq %rbx
1764 addq %rax, %r11
1765 movq 32($ap), %rax
1766 adcq \$0, %rdx
1767 addq %r11, %r10
1768 movq %rdx, %r11
1769 adcq \$0, %r11
1771 mulq %rbx
1772 addq %rax, %r12
1773 movq 40($ap), %rax
1774 adcq \$0, %rdx
1775 addq %r12, %r11
1776 movq %rdx, %r12
1777 adcq \$0, %r12
1779 mulq %rbx
1780 addq %rax, %r13
1781 movq 48($ap), %rax
1782 adcq \$0, %rdx
1783 addq %r13, %r12
1784 movq %rdx, %r13
1785 adcq \$0, %r13
1787 mulq %rbx
1788 addq %rax, %r14
1789 movq 56($ap), %rax
1790 adcq \$0, %rdx
1791 addq %r14, %r13
1792 movq %rdx, %r14
1793 leaq 8($bp), $bp
1794 adcq \$0, %r14
1796 mulq %rbx
1797 addq %rax, %r15
1798 movq ($ap), %rax
1799 adcq \$0, %rdx
1800 addq %r15, %r14
1801 movq %rdx, %r15
1802 adcq \$0, %r15
1804 leaq 8(%rdi), %rdi
1806 decl %ecx
1807 jnz .Loop_mul
1809 movq %r8, (%rdi)
1810 movq %r9, 8(%rdi)
1811 movq %r10, 16(%rdi)
1812 movq %r11, 24(%rdi)
1813 movq %r12, 32(%rdi)
1814 movq %r13, 40(%rdi)
1815 movq %r14, 48(%rdi)
1816 movq %r15, 56(%rdi)
1819 .size __rsaz_512_mul,.-__rsaz_512_mul
1822 if ($addx) {
1823 # __rsaz_512_mulx
1825 # input: %rsi - ap, %rbp - bp
1826 # output:
1827 # clobbers: everything
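#
# MULX counterpart of __rsaz_512_mul: the caller preloads b[0] in %rdx,
# the first row gets by with plain adc (only one carry chain is live),
# and the remaining rows use the ADCX/ADOX dual carry chains described
# for __rsaz_512_reducex above, with a negative loop counter indexing
# the b[] words relative to 64($bp).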
1828 my ($ap,$bp,$zero) = ("%rsi","%rbp","%rdi");
1829 $code.=<<___;
1830 .type __rsaz_512_mulx,\@abi-omnipotent
1831 .align 32
1832 __rsaz_512_mulx:
1833 mulx ($ap), %rbx, %r8 # initial %rdx preloaded by caller
1834 mov \$-6, %rcx
1836 mulx 8($ap), %rax, %r9
1837 movq %rbx, 8(%rsp)
1839 mulx 16($ap), %rbx, %r10
1840 adc %rax, %r8
1842 mulx 24($ap), %rax, %r11
1843 adc %rbx, %r9
1845 mulx 32($ap), %rbx, %r12
1846 adc %rax, %r10
1848 mulx 40($ap), %rax, %r13
1849 adc %rbx, %r11
1851 mulx 48($ap), %rbx, %r14
1852 adc %rax, %r12
1854 mulx 56($ap), %rax, %r15
1855 mov 8($bp), %rdx
1856 adc %rbx, %r13
1857 adc %rax, %r14
1858 adc \$0, %r15
1860 xor $zero, $zero # cf=0,of=0
1861 jmp .Loop_mulx
1863 .align 32
1864 .Loop_mulx:
1865 movq %r8, %rbx
1866 mulx ($ap), %rax, %r8
1867 adcx %rax, %rbx
1868 adox %r9, %r8
1870 mulx 8($ap), %rax, %r9
1871 adcx %rax, %r8
1872 adox %r10, %r9
1874 mulx 16($ap), %rax, %r10
1875 adcx %rax, %r9
1876 adox %r11, %r10
1878 mulx 24($ap), %rax, %r11
1879 adcx %rax, %r10
1880 adox %r12, %r11
1882 .byte 0x3e,0xc4,0x62,0xfb,0xf6,0xa6,0x20,0x00,0x00,0x00 # mulx 32($ap), %rax, %r12
1883 adcx %rax, %r11
1884 adox %r13, %r12
1886 mulx 40($ap), %rax, %r13
1887 adcx %rax, %r12
1888 adox %r14, %r13
1890 mulx 48($ap), %rax, %r14
1891 adcx %rax, %r13
1892 adox %r15, %r14
1894 mulx 56($ap), %rax, %r15
1895 movq 64($bp,%rcx,8), %rdx
1896 movq %rbx, 8+64-8(%rsp,%rcx,8)
1897 adcx %rax, %r14
1898 adox $zero, %r15
1899 adcx $zero, %r15 # cf=0
1901 inc %rcx # of=0
1902 jnz .Loop_mulx
1904 movq %r8, %rbx
1905 mulx ($ap), %rax, %r8
1906 adcx %rax, %rbx
1907 adox %r9, %r8
1909 .byte 0xc4,0x62,0xfb,0xf6,0x8e,0x08,0x00,0x00,0x00 # mulx 8($ap), %rax, %r9
1910 adcx %rax, %r8
1911 adox %r10, %r9
1913 .byte 0xc4,0x62,0xfb,0xf6,0x96,0x10,0x00,0x00,0x00 # mulx 16($ap), %rax, %r10
1914 adcx %rax, %r9
1915 adox %r11, %r10
1917 mulx 24($ap), %rax, %r11
1918 adcx %rax, %r10
1919 adox %r12, %r11
1921 mulx 32($ap), %rax, %r12
1922 adcx %rax, %r11
1923 adox %r13, %r12
1925 mulx 40($ap), %rax, %r13
1926 adcx %rax, %r12
1927 adox %r14, %r13
1929 .byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($ap), %rax, %r14
1930 adcx %rax, %r13
1931 adox %r15, %r14
1933 .byte 0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00 # mulx 56($ap), %rax, %r15
1934 adcx %rax, %r14
1935 adox $zero, %r15
1936 adcx $zero, %r15
1938 mov %rbx, 8+64-8(%rsp)
1939 mov %r8, 8+64(%rsp)
1940 mov %r9, 8+64+8(%rsp)
1941 mov %r10, 8+64+16(%rsp)
1942 mov %r11, 8+64+24(%rsp)
1943 mov %r12, 8+64+32(%rsp)
1944 mov %r13, 8+64+40(%rsp)
1945 mov %r14, 8+64+48(%rsp)
1946 mov %r15, 8+64+56(%rsp)
1949 .size __rsaz_512_mulx,.-__rsaz_512_mulx
1953 my ($out,$inp,$power)= $win64 ? ("%rcx","%rdx","%r8d") : ("%rdi","%rsi","%edx");
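#
# rsaz_512_scatter4/rsaz_512_gather4 maintain the table of 16 powers
# used by the exponentiation.  Each 512-bit entry is split into 32-bit
# halves: half number j of power p lives at byte offset 64*j + 4*p, so
# the low half of 64-bit word i sits at 128*i + 4*p and the high half
# 64 bytes later, and the whole table occupies 16*64 = 1024 bytes.
# scatter4 stores one entry into that layout, gather4 reads one back
# and reassembles the 64-bit words with a shift and an or.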
1954 $code.=<<___;
1955 .globl rsaz_512_scatter4
1956 .type rsaz_512_scatter4,\@abi-omnipotent
1957 .align 16
1958 rsaz_512_scatter4:
1959 leaq ($out,$power,4), $out
1960 movl \$8, %r9d
1961 jmp .Loop_scatter
1962 .align 16
1963 .Loop_scatter:
1964 movq ($inp), %rax
1965 leaq 8($inp), $inp
1966 movl %eax, ($out)
1967 shrq \$32, %rax
1968 movl %eax, 64($out)
1969 leaq 128($out), $out
1970 decl %r9d
1971 jnz .Loop_scatter
1973 .size rsaz_512_scatter4,.-rsaz_512_scatter4
1975 .globl rsaz_512_gather4
1976 .type rsaz_512_gather4,\@abi-omnipotent
1977 .align 16
1978 rsaz_512_gather4:
1979 leaq ($inp,$power,4), $inp
1980 movl \$8, %r9d
1981 jmp .Loop_gather
1982 .align 16
1983 .Loop_gather:
1984 movl ($inp), %eax
1985 movl 64($inp), %r8d
1986 leaq 128($inp), $inp
1987 shlq \$32, %r8
1988 or %r8, %rax
1989 movq %rax, ($out)
1990 leaq 8($out), $out
1991 decl %r9d
1992 jnz .Loop_gather
1994 .size rsaz_512_gather4,.-rsaz_512_gather4
1998 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1999 # CONTEXT *context,DISPATCHER_CONTEXT *disp)
2000 if ($win64) {
2001 $rec="%rcx";
2002 $frame="%rdx";
2003 $context="%r8";
2004 $disp="%r9";
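#
# Win64 structured-exception-handling glue: se_handler is registered
# through the .pdata/.xdata tables below for every public entry point.
# HandlerData[] carries the body and epilogue labels of the function;
# if the faulting RIP lies between them, the callee-saved registers are
# recovered from the fixed 128+24+48-byte frame and written back into
# the CONTEXT record before the unwind continues via RtlVirtualUnwind.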
2006 $code.=<<___;
2007 .extern __imp_RtlVirtualUnwind
2008 .type se_handler,\@abi-omnipotent
2009 .align 16
2010 se_handler:
2011 push %rsi
2012 push %rdi
2013 push %rbx
2014 push %rbp
2015 push %r12
2016 push %r13
2017 push %r14
2018 push %r15
2019 pushfq
2020 sub \$64,%rsp
2022 mov 120($context),%rax # pull context->Rax
2023 mov 248($context),%rbx # pull context->Rip
2025 mov 8($disp),%rsi # disp->ImageBase
2026 mov 56($disp),%r11 # disp->HandlerData
2028 mov 0(%r11),%r10d # HandlerData[0]
2029 lea (%rsi,%r10),%r10 # end of prologue label
2030 cmp %r10,%rbx # context->Rip<end of prologue label
2031 jb .Lcommon_seh_tail
2033 mov 152($context),%rax # pull context->Rsp
2035 mov 4(%r11),%r10d # HandlerData[1]
2036 lea (%rsi,%r10),%r10 # epilogue label
2037 cmp %r10,%rbx # context->Rip>=epilogue label
2038 jae .Lcommon_seh_tail
2040 lea 128+24+48(%rax),%rax
2042 mov -8(%rax),%rbx
2043 mov -16(%rax),%rbp
2044 mov -24(%rax),%r12
2045 mov -32(%rax),%r13
2046 mov -40(%rax),%r14
2047 mov -48(%rax),%r15
2048 mov %rbx,144($context) # restore context->Rbx
2049 mov %rbp,160($context) # restore context->Rbp
2050 mov %r12,216($context) # restore context->R12
2051 mov %r13,224($context) # restore context->R13
2052 mov %r14,232($context) # restore context->R14
2053 mov %r15,240($context) # restore context->R15
2055 .Lcommon_seh_tail:
2056 mov 8(%rax),%rdi
2057 mov 16(%rax),%rsi
2058 mov %rax,152($context) # restore context->Rsp
2059 mov %rsi,168($context) # restore context->Rsi
2060 mov %rdi,176($context) # restore context->Rdi
2062 mov 40($disp),%rdi # disp->ContextRecord
2063 mov $context,%rsi # context
2064 mov \$154,%ecx # sizeof(CONTEXT)
2065 .long 0xa548f3fc # cld; rep movsq
2067 mov $disp,%rsi
2068 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
2069 mov 8(%rsi),%rdx # arg2, disp->ImageBase
2070 mov 0(%rsi),%r8 # arg3, disp->ControlPc
2071 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
2072 mov 40(%rsi),%r10 # disp->ContextRecord
2073 lea 56(%rsi),%r11 # &disp->HandlerData
2074 lea 24(%rsi),%r12 # &disp->EstablisherFrame
2075 mov %r10,32(%rsp) # arg5
2076 mov %r11,40(%rsp) # arg6
2077 mov %r12,48(%rsp) # arg7
2078 mov %rcx,56(%rsp) # arg8, (NULL)
2079 call *__imp_RtlVirtualUnwind(%rip)
2081 mov \$1,%eax # ExceptionContinueSearch
2082 add \$64,%rsp
2083 popfq
2084 pop %r15
2085 pop %r14
2086 pop %r13
2087 pop %r12
2088 pop %rbp
2089 pop %rbx
2090 pop %rdi
2091 pop %rsi
2093 .size se_handler,.-se_handler
2095 .section .pdata
2096 .align 4
2097 .rva .LSEH_begin_rsaz_512_sqr
2098 .rva .LSEH_end_rsaz_512_sqr
2099 .rva .LSEH_info_rsaz_512_sqr
2101 .rva .LSEH_begin_rsaz_512_mul
2102 .rva .LSEH_end_rsaz_512_mul
2103 .rva .LSEH_info_rsaz_512_mul
2105 .rva .LSEH_begin_rsaz_512_mul_gather4
2106 .rva .LSEH_end_rsaz_512_mul_gather4
2107 .rva .LSEH_info_rsaz_512_mul_gather4
2109 .rva .LSEH_begin_rsaz_512_mul_scatter4
2110 .rva .LSEH_end_rsaz_512_mul_scatter4
2111 .rva .LSEH_info_rsaz_512_mul_scatter4
2113 .rva .LSEH_begin_rsaz_512_mul_by_one
2114 .rva .LSEH_end_rsaz_512_mul_by_one
2115 .rva .LSEH_info_rsaz_512_mul_by_one
2117 .section .xdata
2118 .align 8
2119 .LSEH_info_rsaz_512_sqr:
2120 .byte 9,0,0,0
2121 .rva se_handler
2122 .rva .Lsqr_body,.Lsqr_epilogue # HandlerData[]
2123 .LSEH_info_rsaz_512_mul:
2124 .byte 9,0,0,0
2125 .rva se_handler
2126 .rva .Lmul_body,.Lmul_epilogue # HandlerData[]
2127 .LSEH_info_rsaz_512_mul_gather4:
2128 .byte 9,0,0,0
2129 .rva se_handler
2130 .rva .Lmul_gather4_body,.Lmul_gather4_epilogue # HandlerData[]
2131 .LSEH_info_rsaz_512_mul_scatter4:
2132 .byte 9,0,0,0
2133 .rva se_handler
2134 .rva .Lmul_scatter4_body,.Lmul_scatter4_epilogue # HandlerData[]
2135 .LSEH_info_rsaz_512_mul_by_one:
2136 .byte 9,0,0,0
2137 .rva se_handler
2138 .rva .Lmul_by_one_body,.Lmul_by_one_epilogue # HandlerData[]
2142 $code =~ s/\`([^\`]*)\`/eval $1/gem;
2143 print $code;
2144 close STDOUT;