#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# August 2011.

# Companion to x86_64-mont.pl that optimizes cache-timing attack
# countermeasures. The subroutines are produced by replacing bp[i]
# references in their x86_64-mont.pl counterparts with cache-neutral
# references to the powers table computed in BN_mod_exp_mont_consttime.
# In addition, a subroutine that scatters elements of the powers table
# is implemented, so that scatter-/gathering can be tuned without
# bn_exp.c modifications.
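#
# The idea, roughly: instead of loading bp[i] directly (which would leak
# the exponent window through data-cache timing), every slot of the powers
# table is touched and the wanted one is kept with arithmetic masks.  A
# loose C sketch of that selection (illustrative only, not part of this
# module; a real implementation would use a constant-time compare):
#
#	for (j = 0; j < num; j++) {
#		BN_ULONG acc = 0;
#		for (k = 0; k < 32; k++) {
#			BN_ULONG mask = 0 - (BN_ULONG)(k == idx);  /* 0 or ~0 */
#			acc |= table[32*j + k] & mask;
#		}
#		b[j] = acc;
#	}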

$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

# int bn_mul_mont_gather5(
$rp="%rdi";	# BN_ULONG *rp,
$ap="%rsi";	# const BN_ULONG *ap,
$bp="%rdx";	# const BN_ULONG *bp,
$np="%rcx";	# const BN_ULONG *np,
$n0="%r8";	# const BN_ULONG *n0,
$num="%r9";	# int num,
		# int idx);	# 0 to 2^5-1, "index" in $bp holding
		#		# pre-computed powers of a', interlaced
		#		# in such manner that b[0] is $bp[idx],
		#		# b[1] is [2^5+idx], etc.
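		#		# Concretely: limb j of power idx sits at
		#		# bp[2^5*j+idx], so for idx=5 the gathered
		#		# value is bp[5], bp[37], bp[69], ...
		#		# bn_scatter5 below stores the table in
		#		# exactly that shape.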
$lo0="%r10";
$hi0="%r11";
$hi1="%r13";
$i="%r14";
$j="%r15";
$m0="%rbx";
$m1="%rbp";

$code=<<___;
.text

.globl	bn_mul_mont_gather5
.type	bn_mul_mont_gather5,\@function,6
.align	64
bn_mul_mont_gather5:
	test	\$3,${num}d
	jnz	.Lmul_enter
	cmp	\$8,${num}d
	jb	.Lmul_enter
	jmp	.Lmul4x_enter
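	# a num that is a multiple of 4 and at least 8 limbs takes the
	# unrolled bn_mul4x_mont_gather5 path; anything else uses the
	# generic 1x loop that follows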

.align	16
.Lmul_enter:
	mov	${num}d,${num}d
	movd	`($win64?56:8)`(%rsp),%xmm5	# load 7th argument
	lea	.Linc(%rip),%r10
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15

.Lmul_alloca:
	mov	%rsp,%rax
	lea	2($num),%r11
	neg	%r11
	lea	-264(%rsp,%r11,8),%rsp	# tp=alloca(8*(num+2)+256+8)
	and	\$-1024,%rsp		# minimize TLB usage

	mov	%rax,8(%rsp,$num,8)	# tp[num+1]=%rsp
.Lmul_body:
	lea	128($bp),%r12		# reassign $bp (+size optimization)
___
		$bp="%r12";
		$STRIDE=2**5*8;		# 5 is "window size"
		$N=$STRIDE/4;		# should match cache line size
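# $STRIDE is the byte distance between limb j and limb j+1 of the same power:
# 2^5 powers x 8 bytes = 256 bytes.  The selection mask built below has the
# same size, 32 candidate indices x 8 bytes of all-ones/all-zeros each.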
$code.=<<___;
	movdqa	0(%r10),%xmm0		# 00000001000000010000000000000000
	movdqa	16(%r10),%xmm1		# 00000002000000020000000200000002
	lea	24-112(%rsp,$num,8),%r10	# place the mask after tp[num+3] (+ICache optimization)
	and	\$-16,%r10

	pshufd	\$0,%xmm5,%xmm5		# broadcast index
	movdqa	%xmm1,%xmm4
	movdqa	%xmm1,%xmm2
___
########################################################################
# calculate mask by comparing 0..31 to index and save result to stack
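#
# The SSE2 trick: %xmm5 carries the index broadcast to all four dwords, and
# the paddd/pcmpeqd ladder below walks the candidate values 0..31, each
# compare yielding a pair of 64-bit all-ones/all-zeros lanes, until 256 bytes
# of mask sit on the stack.  A rough scalar equivalent (illustrative only):
#
#	for (k = 0; k < 32; k++)
#		mask[k] = (k == idx) ? ~(uint64_t)0 : 0;
#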
$code.=<<___;
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0		# compare to 1,0
	.byte	0x67
	movdqa	%xmm4,%xmm3
___
for($k=0;$k<$STRIDE/16-4;$k+=4) {
$code.=<<___;
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1		# compare to 3,2
	movdqa	%xmm0,`16*($k+0)+112`(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2		# compare to 5,4
	movdqa	%xmm1,`16*($k+1)+112`(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3		# compare to 7,6
	movdqa	%xmm2,`16*($k+2)+112`(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,`16*($k+3)+112`(%r10)
	movdqa	%xmm4,%xmm3
___
}
$code.=<<___;				# last iteration can be optimized
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,`16*($k+0)+112`(%r10)

	paddd	%xmm2,%xmm3
	.byte	0x67
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,`16*($k+1)+112`(%r10)

	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,`16*($k+2)+112`(%r10)
	pand	`16*($k+0)-128`($bp),%xmm0	# while it's still in register

	pand	`16*($k+1)-128`($bp),%xmm1
	pand	`16*($k+2)-128`($bp),%xmm2
	movdqa	%xmm3,`16*($k+3)+112`(%r10)
	pand	`16*($k+3)-128`($bp),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
___
for($k=0;$k<$STRIDE/16-4;$k+=4) {
$code.=<<___;
	movdqa	`16*($k+0)-128`($bp),%xmm4
	movdqa	`16*($k+1)-128`($bp),%xmm5
	movdqa	`16*($k+2)-128`($bp),%xmm2
	pand	`16*($k+0)+112`(%r10),%xmm4
	movdqa	`16*($k+3)-128`($bp),%xmm3
	pand	`16*($k+1)+112`(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	`16*($k+2)+112`(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	`16*($k+3)+112`(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
___
}
$code.=<<___;
	por	%xmm1,%xmm0
	pshufd	\$0x4e,%xmm0,%xmm1
	por	%xmm1,%xmm0
	lea	$STRIDE($bp),$bp
	movd	%xmm0,$m0		# m0=bp[0]

	mov	($n0),$n0		# pull n0[0] value
	mov	($ap),%rax

	xor	$i,$i			# i=0
	xor	$j,$j			# j=0

	mov	$n0,$m1
	mulq	$m0			# ap[0]*bp[0]
	mov	%rax,$lo0
	mov	($np),%rax

	imulq	$lo0,$m1		# "tp[0]"*n0
	mov	%rdx,$hi0

	mulq	$m1			# np[0]*m1
	add	%rax,$lo0		# discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	%rdx,$hi1

	lea	1($j),$j		# j++
	jmp	.L1st_enter

.align	16
.L1st:
	add	%rax,$hi1
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
	mov	$lo0,$hi0
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1

.L1st_enter:
	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$hi0
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	lea	1($j),$j		# j++
	mov	%rdx,$lo0

	mulq	$m1			# np[j]*m1
	cmp	$num,$j
	jl	.L1st

	add	%rax,$hi1
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1
	mov	$lo0,$hi0

	xor	%rdx,%rdx
	add	$hi0,$hi1
	adc	\$0,%rdx
	mov	$hi1,-8(%rsp,$num,8)
	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit

	lea	1($i),$i		# i++
	jmp	.Louter
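	# Each outer pass folds in one more limb of b: with m1 = tp[0]*n0 mod 2^64,
	# the loop below computes tp = (tp + ap*bp[i] + m1*np) / 2^64, i.e. the
	# usual word-by-word Montgomery step; the low limb becomes divisible by
	# 2^64 by construction, so only the carry out of it is kept.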
.align	16
.Louter:
	lea	24+128(%rsp,$num,8),%rdx	# where 256-byte mask is (+size optimization)
	and	\$-16,%rdx
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
___
for($k=0;$k<$STRIDE/16;$k+=4) {
$code.=<<___;
	movdqa	`16*($k+0)-128`($bp),%xmm0
	movdqa	`16*($k+1)-128`($bp),%xmm1
	movdqa	`16*($k+2)-128`($bp),%xmm2
	movdqa	`16*($k+3)-128`($bp),%xmm3
	pand	`16*($k+0)-128`(%rdx),%xmm0
	pand	`16*($k+1)-128`(%rdx),%xmm1
	por	%xmm0,%xmm4
	pand	`16*($k+2)-128`(%rdx),%xmm2
	por	%xmm1,%xmm5
	pand	`16*($k+3)-128`(%rdx),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
___
}
$code.=<<___;
	por	%xmm5,%xmm4
	pshufd	\$0x4e,%xmm4,%xmm0
	por	%xmm4,%xmm0
	lea	$STRIDE($bp),$bp
	movd	%xmm0,$m0		# m0=bp[i]

	xor	$j,$j			# j=0
	mov	$n0,$m1
	mov	(%rsp),$lo0

	mulq	$m0			# ap[0]*bp[i]
	add	%rax,$lo0		# ap[0]*bp[i]+tp[0]
	mov	($np),%rax
	adc	\$0,%rdx

	imulq	$lo0,$m1		# tp[0]*n0
	mov	%rdx,$hi0

	mulq	$m1			# np[0]*m1
	add	%rax,$lo0		# discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	8(%rsp),$lo0		# tp[1]
	mov	%rdx,$hi1

	lea	1($j),$j		# j++
	jmp	.Linner_enter

.align	16
.Linner:
	add	%rax,$hi1
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
	mov	(%rsp,$j,8),$lo0
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1

.Linner_enter:
	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$hi0
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	add	$hi0,$lo0		# ap[j]*bp[i]+tp[j]
	mov	%rdx,$hi0
	adc	\$0,$hi0
	lea	1($j),$j		# j++

	mulq	$m1			# np[j]*m1
	cmp	$num,$j
	jl	.Linner

	add	%rax,$hi1
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
	mov	(%rsp,$j,8),$lo0
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1

	xor	%rdx,%rdx
	add	$hi0,$hi1
	adc	\$0,%rdx
	add	$lo0,$hi1		# pull upmost overflow bit
	adc	\$0,%rdx
	mov	$hi1,-8(%rsp,$num,8)
	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit

	lea	1($i),$i		# i++
	cmp	$num,$i
	jl	.Louter

	xor	$i,$i			# i=0 and clear CF!
	mov	(%rsp),%rax		# tp[0]
	lea	(%rsp),$ap		# borrow ap for tp
	mov	$num,$j			# j=num
	jmp	.Lsub
.align	16
.Lsub:	sbb	($np,$i,8),%rax
	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]-np[i]
	mov	8($ap,$i,8),%rax	# tp[i+1]
	lea	1($i),$i		# i++
	dec	$j			# doesn't affect CF!
	jnz	.Lsub

	sbb	\$0,%rax		# handle upmost overflow bit
	xor	$i,$i
	and	%rax,$ap
	not	%rax
	mov	$rp,$np
	and	%rax,$np
	mov	$num,$j			# j=num
	or	$np,$ap			# ap=borrow?tp:rp
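	# Branch-free select: after .Lsub, rax is 0 (no borrow, tp >= np) or
	# all-ones (borrow occurred), so the and/or pairs above compute, roughly,
	#
	#	src = (tp & mask) | (rp & ~mask),  mask = 0 - borrow
	#
	# i.e. the copy loop re-reads tp when the subtraction underflowed and
	# rp otherwise, while zeroing the stack copy in either case.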
.align	16
.Lcopy:					# copy or in-place refresh
	mov	($ap,$i,8),%rax
	mov	$i,(%rsp,$i,8)		# zap temporary vector
	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]
	lea	1($i),$i
	sub	\$1,$j
	jnz	.Lcopy

	mov	8(%rsp,$num,8),%rsi	# restore %rsp
	mov	\$1,%rax

	mov	(%rsi),%r15
	mov	8(%rsi),%r14
	mov	16(%rsi),%r13
	mov	24(%rsi),%r12
	mov	32(%rsi),%rbp
	mov	40(%rsi),%rbx
	lea	48(%rsi),%rsp
.Lmul_epilogue:
	ret
.size	bn_mul_mont_gather5,.-bn_mul_mont_gather5
___

my @A=("%r10","%r11");
my @N=("%r13","%rdi");
$code.=<<___;
.type	bn_mul4x_mont_gather5,\@function,6
.align	16
bn_mul4x_mont_gather5:
.Lmul4x_enter:
	mov	${num}d,${num}d
	movd	`($win64?56:8)`(%rsp),%xmm5	# load 7th argument
	lea	.Linc(%rip),%r10
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15

.Lmul4x_alloca:
	mov	%rsp,%rax
	lea	4($num),%r11
	neg	%r11
	lea	-256(%rsp,%r11,8),%rsp	# tp=alloca(8*(num+4)+256)
	and	\$-1024,%rsp		# minimize TLB usage

	mov	%rax,8(%rsp,$num,8)	# tp[num+1]=%rsp
.Lmul4x_body:
	mov	$rp,16(%rsp,$num,8)	# tp[num+2]=$rp
	lea	128(%rdx),%r12		# reassign $bp (+size optimization)
___
		$bp="%r12";
		$STRIDE=2**5*8;		# 5 is "window size"
		$N=$STRIDE/4;		# should match cache line size
$code.=<<___;
	movdqa	0(%r10),%xmm0		# 00000001000000010000000000000000
	movdqa	16(%r10),%xmm1		# 00000002000000020000000200000002
	lea	32-112(%rsp,$num,8),%r10	# place the mask after tp[num+4] (+ICache optimization)

	pshufd	\$0,%xmm5,%xmm5		# broadcast index
	movdqa	%xmm1,%xmm4
	.byte	0x67,0x67
	movdqa	%xmm1,%xmm2
___
########################################################################
# calculate mask by comparing 0..31 to index and save result to stack
#
$code.=<<___;
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0		# compare to 1,0
	.byte	0x67
	movdqa	%xmm4,%xmm3
___
for($k=0;$k<$STRIDE/16-4;$k+=4) {
$code.=<<___;
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1		# compare to 3,2
	movdqa	%xmm0,`16*($k+0)+112`(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2		# compare to 5,4
	movdqa	%xmm1,`16*($k+1)+112`(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3		# compare to 7,6
	movdqa	%xmm2,`16*($k+2)+112`(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,`16*($k+3)+112`(%r10)
	movdqa	%xmm4,%xmm3
___
}
$code.=<<___;				# last iteration can be optimized
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,`16*($k+0)+112`(%r10)

	paddd	%xmm2,%xmm3
	.byte	0x67
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,`16*($k+1)+112`(%r10)

	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,`16*($k+2)+112`(%r10)
	pand	`16*($k+0)-128`($bp),%xmm0	# while it's still in register

	pand	`16*($k+1)-128`($bp),%xmm1
	pand	`16*($k+2)-128`($bp),%xmm2
	movdqa	%xmm3,`16*($k+3)+112`(%r10)
	pand	`16*($k+3)-128`($bp),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
___
for($k=0;$k<$STRIDE/16-4;$k+=4) {
$code.=<<___;
	movdqa	`16*($k+0)-128`($bp),%xmm4
	movdqa	`16*($k+1)-128`($bp),%xmm5
	movdqa	`16*($k+2)-128`($bp),%xmm2
	pand	`16*($k+0)+112`(%r10),%xmm4
	movdqa	`16*($k+3)-128`($bp),%xmm3
	pand	`16*($k+1)+112`(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	`16*($k+2)+112`(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	`16*($k+3)+112`(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
___
}
$code.=<<___;
	por	%xmm1,%xmm0
	pshufd	\$0x4e,%xmm0,%xmm1
	por	%xmm1,%xmm0
	lea	$STRIDE($bp),$bp
	movd	%xmm0,$m0		# m0=bp[0]

	mov	($n0),$n0		# pull n0[0] value
	mov	($ap),%rax

	xor	$i,$i			# i=0
	xor	$j,$j			# j=0

	mov	$n0,$m1
	mulq	$m0			# ap[0]*bp[0]
	mov	%rax,$A[0]
	mov	($np),%rax

	imulq	$A[0],$m1		# "tp[0]"*n0
	mov	%rdx,$A[1]

	mulq	$m1			# np[0]*m1
	add	%rax,$A[0]		# discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	%rdx,$N[1]

	mulq	$m0
	add	%rax,$A[1]
	mov	8($np),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1
	add	%rax,$N[1]
	mov	16($ap),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]
	lea	4($j),$j		# j++
	adc	\$0,%rdx
	mov	$N[1],(%rsp)
	mov	%rdx,$N[0]
	jmp	.L1st4x
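	# The 4x path works on four limbs per iteration, rotating two
	# accumulator pairs (r10/r11 for the ap*bp column, r13/rdi for the
	# np*m1 column) so that one mulq chain can proceed while the other
	# pair absorbs carries.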
.align	16
.L1st4x:
	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[0]
	mov	-16($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	-8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[1]
	mov	-8($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[0]
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[0],-8(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[1]
	mov	8($np,$j,8),%rax
	adc	\$0,%rdx
	lea	4($j),$j		# j++
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	-16($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[1],-32(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]
	cmp	$num,$j
	jl	.L1st4x

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[0]
	mov	-16($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	-8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[1]
	mov	-8($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]

	xor	$N[1],$N[1]
	add	$A[0],$N[0]
	adc	\$0,$N[1]
	mov	$N[0],-8(%rsp,$j,8)
	mov	$N[1],(%rsp,$j,8)	# store upmost overflow bit

	lea	1($i),$i		# i++
.align	4
.Louter4x:
	lea	32+128(%rsp,$num,8),%rdx	# where 256-byte mask is (+size optimization)
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
___
for($k=0;$k<$STRIDE/16;$k+=4) {
$code.=<<___;
	movdqa	`16*($k+0)-128`($bp),%xmm0
	movdqa	`16*($k+1)-128`($bp),%xmm1
	movdqa	`16*($k+2)-128`($bp),%xmm2
	movdqa	`16*($k+3)-128`($bp),%xmm3
	pand	`16*($k+0)-128`(%rdx),%xmm0
	pand	`16*($k+1)-128`(%rdx),%xmm1
	por	%xmm0,%xmm4
	pand	`16*($k+2)-128`(%rdx),%xmm2
	por	%xmm1,%xmm5
	pand	`16*($k+3)-128`(%rdx),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
___
}
$code.=<<___;
	por	%xmm5,%xmm4
	pshufd	\$0x4e,%xmm4,%xmm0
	por	%xmm4,%xmm0
	lea	$STRIDE($bp),$bp
	movd	%xmm0,$m0		# m0=bp[i]

	xor	$j,$j			# j=0

	mov	(%rsp),$A[0]
	mov	$n0,$m1
	mulq	$m0			# ap[0]*bp[i]
	add	%rax,$A[0]		# ap[0]*bp[i]+tp[0]
	mov	($np),%rax
	adc	\$0,%rdx

	imulq	$A[0],$m1		# tp[0]*n0
	mov	%rdx,$A[1]

	mulq	$m1			# np[0]*m1
	add	%rax,$A[0]		# "$N[0]", discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[1]
	mov	8($np),%rax
	adc	\$0,%rdx
	add	8(%rsp),$A[1]		# +tp[1]
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	16($ap),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[i]+tp[j]
	lea	4($j),$j		# j+=2
	adc	\$0,%rdx
	mov	%rdx,$N[0]
	jmp	.Linner4x
.align	16
.Linner4x:
	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[0]
	mov	-16($np,$j,8),%rax
	adc	\$0,%rdx
	add	-16(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	-8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]
	adc	\$0,%rdx
	mov	$N[1],-32(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[1]
	mov	-8($np,$j,8),%rax
	adc	\$0,%rdx
	add	-8(%rsp,$j,8),$A[1]
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]
	adc	\$0,%rdx
	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[0]
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	add	(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]
	adc	\$0,%rdx
	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[1]
	mov	8($np,$j,8),%rax
	adc	\$0,%rdx
	add	8(%rsp,$j,8),$A[1]
	adc	\$0,%rdx
	lea	4($j),$j		# j++
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	-16($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]
	adc	\$0,%rdx
	mov	$N[0],-40(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]
	cmp	$num,$j
	jl	.Linner4x

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[0]
	mov	-16($np,$j,8),%rax
	adc	\$0,%rdx
	add	-16(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	-8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]
	adc	\$0,%rdx
	mov	$N[1],-32(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[1]
	mov	-8($np,$j,8),%rax
	adc	\$0,%rdx
	add	-8(%rsp,$j,8),$A[1]
	adc	\$0,%rdx
	lea	1($i),$i		# i++
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$A[1],$N[1]
	adc	\$0,%rdx
	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]

	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]

	xor	$N[1],$N[1]
	add	$A[0],$N[0]
	adc	\$0,$N[1]
	add	(%rsp,$num,8),$N[0]	# pull upmost overflow bit
	adc	\$0,$N[1]
	mov	$N[0],-8(%rsp,$j,8)
	mov	$N[1],(%rsp,$j,8)	# store upmost overflow bit

	cmp	$num,$i
	jl	.Louter4x
___

my @ri=("%rax","%rdx",$m0,$m1);
$code.=<<___;
	mov	16(%rsp,$num,8),$rp	# restore $rp
	mov	0(%rsp),@ri[0]		# tp[0]
	pxor	%xmm0,%xmm0
	mov	8(%rsp),@ri[1]		# tp[1]
	shr	\$2,$num		# num/=4
	lea	(%rsp),$ap		# borrow ap for tp
	xor	$i,$i			# i=0 and clear CF!

	sub	0($np),@ri[0]
	mov	16($ap),@ri[2]		# tp[2]
	mov	24($ap),@ri[3]		# tp[3]
	sbb	8($np),@ri[1]
	lea	-1($num),$j		# j=num/4-1
	jmp	.Lsub4x
.align	16
.Lsub4x:
	mov	@ri[0],0($rp,$i,8)	# rp[i]=tp[i]-np[i]
	mov	@ri[1],8($rp,$i,8)	# rp[i]=tp[i]-np[i]
	sbb	16($np,$i,8),@ri[2]
	mov	32($ap,$i,8),@ri[0]	# tp[i+1]
	mov	40($ap,$i,8),@ri[1]
	sbb	24($np,$i,8),@ri[3]
	mov	@ri[2],16($rp,$i,8)	# rp[i]=tp[i]-np[i]
	mov	@ri[3],24($rp,$i,8)	# rp[i]=tp[i]-np[i]
	sbb	32($np,$i,8),@ri[0]
	mov	48($ap,$i,8),@ri[2]
	mov	56($ap,$i,8),@ri[3]
	sbb	40($np,$i,8),@ri[1]
	lea	4($i),$i		# i++
	dec	$j			# doesn't affect CF!
	jnz	.Lsub4x

	mov	@ri[0],0($rp,$i,8)	# rp[i]=tp[i]-np[i]
	mov	32($ap,$i,8),@ri[0]	# load overflow bit
	sbb	16($np,$i,8),@ri[2]
	mov	@ri[1],8($rp,$i,8)	# rp[i]=tp[i]-np[i]
	sbb	24($np,$i,8),@ri[3]
	mov	@ri[2],16($rp,$i,8)	# rp[i]=tp[i]-np[i]

	sbb	\$0,@ri[0]		# handle upmost overflow bit
	mov	@ri[3],24($rp,$i,8)	# rp[i]=tp[i]-np[i]
	xor	$i,$i			# i=0
	and	@ri[0],$ap
	not	@ri[0]
	mov	$rp,$np
	and	@ri[0],$np
	lea	-1($num),$j
	or	$np,$ap			# ap=borrow?tp:rp

	movdqu	($ap),%xmm1
	movdqa	%xmm0,(%rsp)
	movdqu	%xmm1,($rp)
	jmp	.Lcopy4x
.align	16
.Lcopy4x:				# copy or in-place refresh
	movdqu	16($ap,$i),%xmm2
	movdqu	32($ap,$i),%xmm1
	movdqa	%xmm0,16(%rsp,$i)
	movdqu	%xmm2,16($rp,$i)
	movdqa	%xmm0,32(%rsp,$i)
	movdqu	%xmm1,32($rp,$i)
	lea	32($i),$i
	dec	$j
	jnz	.Lcopy4x

	shl	\$2,$num
	movdqu	16($ap,$i),%xmm2
	movdqa	%xmm0,16(%rsp,$i)
	movdqu	%xmm2,16($rp,$i)
___
$code.=<<___;
	mov	8(%rsp,$num,8),%rsi	# restore %rsp
	mov	\$1,%rax

	mov	(%rsi),%r15
	mov	8(%rsi),%r14
	mov	16(%rsi),%r13
	mov	24(%rsi),%r12
	mov	32(%rsi),%rbp
	mov	40(%rsi),%rbx
	lea	48(%rsi),%rsp
.Lmul4x_epilogue:
	ret
.size	bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
___

my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%rdx","%r8", "%r9d") :	# Win64 order
			       ("%rdi","%rsi","%rdx","%ecx");	# Unix order
my $out=$inp;
my $STRIDE=2**5*8;
my $N=$STRIDE/4;

$code.=<<___;
.globl	bn_scatter5
.type	bn_scatter5,\@abi-omnipotent
.align	16
bn_scatter5:
	cmp	\$0, $num
	jz	.Lscatter_epilogue
	lea	($tbl,$idx,8),$tbl
.Lscatter:
	mov	($inp),%rax
	lea	8($inp),$inp
	mov	%rax,($tbl)
	lea	32*8($tbl),$tbl
	sub	\$1,$num
	jnz	.Lscatter
.Lscatter_epilogue:
	ret
.size	bn_scatter5,.-bn_scatter5
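
	# bn_scatter5 above, as loose C (illustrative only, not the exact
	# prototype used by bn_exp.c):
	#
	#	void bn_scatter5(const BN_ULONG *inp, size_t num,
	#			 BN_ULONG *tbl, size_t idx)
	#	{
	#		for (size_t j = 0; j < num; j++)
	#			tbl[32*j + idx] = inp[j];   /* 32*8-byte stride */
	#	}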

.globl	bn_gather5
.type	bn_gather5,\@abi-omnipotent
.align	16
bn_gather5:
.LSEH_begin_bn_gather5:			# Win64 thing, but harmless in other cases
	# I can't trust assembler to use specific encoding:-(
	.byte	0x4c,0x8d,0x14,0x24		# lea (%rsp),%r10
	.byte	0x48,0x81,0xec,0x08,0x01,0x00,0x00	# sub $0x108,%rsp
	lea	.Linc(%rip),%rax
	and	\$-16,%rsp		# shouldn't be formally required

	movd	$idx,%xmm5
	movdqa	0(%rax),%xmm0		# 00000001000000010000000000000000
	movdqa	16(%rax),%xmm1		# 00000002000000020000000200000002
	lea	128($tbl),%r11		# size optimization
	lea	128(%rsp),%rax		# size optimization

	pshufd	\$0,%xmm5,%xmm5		# broadcast $idx
	movdqa	%xmm1,%xmm4
	movdqa	%xmm1,%xmm2
___
########################################################################
# calculate mask by comparing 0..31 to $idx and save result to stack
#
for($i=0;$i<$STRIDE/16;$i+=4) {
$code.=<<___;
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0		# compare to 1,0
___
$code.=<<___	if ($i);
	movdqa	%xmm3,`16*($i-1)-128`(%rax)
___
$code.=<<___;
	movdqa	%xmm4,%xmm3

	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1		# compare to 3,2
	movdqa	%xmm0,`16*($i+0)-128`(%rax)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2		# compare to 5,4
	movdqa	%xmm1,`16*($i+1)-128`(%rax)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3		# compare to 7,6
	movdqa	%xmm2,`16*($i+2)-128`(%rax)
	movdqa	%xmm4,%xmm2
___
}
$code.=<<___;
	movdqa	%xmm3,`16*($i-1)-128`(%rax)
	jmp	.Lgather

.align	32
.Lgather:
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
___
for($i=0;$i<$STRIDE/16;$i+=4) {
$code.=<<___;
	movdqa	`16*($i+0)-128`(%r11),%xmm0
	movdqa	`16*($i+1)-128`(%r11),%xmm1
	movdqa	`16*($i+2)-128`(%r11),%xmm2
	pand	`16*($i+0)-128`(%rax),%xmm0
	movdqa	`16*($i+3)-128`(%r11),%xmm3
	pand	`16*($i+1)-128`(%rax),%xmm1
	por	%xmm0,%xmm4
	pand	`16*($i+2)-128`(%rax),%xmm2
	por	%xmm1,%xmm5
	pand	`16*($i+3)-128`(%rax),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
___
}
$code.=<<___;
	por	%xmm5,%xmm4
	lea	$STRIDE(%r11),%r11
	pshufd	\$0x4e,%xmm4,%xmm0
	por	%xmm4,%xmm0
	movq	%xmm0,($out)		# m0=bp[0]
	lea	8($out),$out
	sub	\$1,$num
	jnz	.Lgather

	lea	(%r10),%rsp
	ret
.LSEH_end_bn_gather5:
.size	bn_gather5,.-bn_gather5
___
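# bn_gather5 is the constant-time read-back: every table slot is loaded and
# masked, so the access pattern is independent of the index.  A loose C
# sketch of its effect (illustrative only; the assembly builds the 256-byte
# mask once and then ORs 16-byte chunks per limb, but the result is the same,
# and a real implementation would use a constant-time compare):
#
#	void bn_gather5(BN_ULONG *out, size_t num,
#			const BN_ULONG *tbl, size_t idx)
#	{
#		for (size_t j = 0; j < num; j++) {
#			BN_ULONG acc = 0;
#			for (size_t k = 0; k < 32; k++)
#				acc |= tbl[32*j + k] & (0 - (BN_ULONG)(k == idx));
#			out[j] = acc;
#		}
#	}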

$code.=<<___;
.align	64
.Linc:
	.long	0,0, 1,1
	.long	2,2, 2,2
.asciz	"Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	mul_handler,\@abi-omnipotent
.align	16
mul_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp
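	# mul_handler decides which of three regions Rip stopped in (prologue,
	# allocated body, or past the epilogue) by comparing it against the
	# HandlerData[] labels, then recovers the caller's stack pointer and
	# the six saved non-volatile registers before handing the CONTEXT back
	# to RtlVirtualUnwind.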

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<end of prologue label
	jb	.Lcommon_seh_tail

	lea	48(%rax),%rax

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# end of alloca label
	cmp	%r10,%rbx		# context->Rip<end of alloca label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	8(%r11),%r10d		# HandlerData[2]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	mov	192($context),%r10	# pull $num
	mov	8(%rax,%r10,8),%rax	# pull saved stack pointer

	lea	48(%rax),%rax

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	mul_handler,.-mul_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_bn_mul_mont_gather5
	.rva	.LSEH_end_bn_mul_mont_gather5
	.rva	.LSEH_info_bn_mul_mont_gather5

	.rva	.LSEH_begin_bn_mul4x_mont_gather5
	.rva	.LSEH_end_bn_mul4x_mont_gather5
	.rva	.LSEH_info_bn_mul4x_mont_gather5

	.rva	.LSEH_begin_bn_gather5
	.rva	.LSEH_end_bn_gather5
	.rva	.LSEH_info_bn_gather5

.section	.xdata
.align	8
.LSEH_info_bn_mul_mont_gather5:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lmul_alloca,.Lmul_body,.Lmul_epilogue		# HandlerData[]
.align	8
.LSEH_info_bn_mul4x_mont_gather5:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lmul4x_alloca,.Lmul4x_body,.Lmul4x_epilogue	# HandlerData[]
.align	8
.LSEH_info_bn_gather5:
	.byte	0x01,0x0b,0x03,0x0a
	.byte	0x0b,0x01,0x21,0x00	# sub rsp,0x108
	.byte	0x04,0xa3,0x00,0x00	# lea r10,(rsp), set_frame r10
.align	8
___
}

$code =~ s/\`([^\`]*)\`/eval($1)/gem;

print $code;
close STDOUT;