# ppc-mont.pl — Montgomery multiplication for PowerPC
# Origin: LibreSSL 2.5.2, lib/libcrypto/bn/asm/ppc-mont.pl
#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# April 2006

# "Teaser" Montgomery multiplication module for PowerPC. It's possible
# to gain a bit more by modulo-scheduling outer loop, then dedicated
# squaring procedure should give further 20% and code can be adapted
# for 32-bit application running on 64-bit CPU. As for the latter.
# It won't be able to achieve "native" 64-bit performance, because in
# 32-bit application context every addc instruction will have to be
# expanded as addc, twice right shift by 32 and finally adde, etc.
# So far RSA *sign* performance improvement over pre-bn_mul_mont asm
# for 64-bit application running on PPC970/G5 is:
#
#	512-bit		+65%
#	1024-bit	+35%
#	2048-bit	+18%
#	4096-bit	+4%
# First command-line argument selects the target word size: a flavour
# string containing "32" picks 32-bit mnemonics, "64" picks 64-bit.
$flavour = shift;

if ($flavour =~ /32/) {
	($BITS,$SIZE_T,$RZONE)	= (32,4,224);
	# 32-bit load/store mnemonics: plain, update, and indexed forms.
	($LD,$LDU,$LDX)		= ("lwz","lwzu","lwzx");
	($ST,$STU,$STX,$STUX)	= ("stw","stwu","stwx","stwux");
	# Unsigned multiply low/high, unsigned compare, shift right imm.
	($UMULL,$UMULH)		= ("mullw","mulhwu");
	($UCMP,$SHRI)		= ("cmplw","srwi");
} elsif ($flavour =~ /64/) {
	($BITS,$SIZE_T,$RZONE)	= (64,8,288);
	# same as above, but 64-bit mnemonics...
	($LD,$LDU,$LDX)		= ("ld","ldu","ldx");
	($ST,$STU,$STX,$STUX)	= ("std","stdu","stdx","stdux");
	($UMULL,$UMULH)		= ("mulld","mulhdu");
	($UCMP,$SHRI)		= ("cmpld","srdi");
} else { die "nonsense $flavour"; }

$BNSZ	= $BITS/8;	# bytes per bignum limb
$PUSH	= $ST;		# register save/restore use the plain store/load
$POP	= $LD;

$FRAME	= 8*$SIZE_T+$RZONE;	# stack frame: linkage area plus ABI red zone
$LOCALS	= 8*$SIZE_T;		# frame offset of the tp[] scratch vector
# Locate the ppc-xlate.pl assembler translator, first next to this
# script, then in the shared perlasm directory, and pipe all of our
# output through it (second command-line argument is the output file).
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

# Low-precedence "or" is required here: with "||" the test binds to
# shift's result rather than open's, so a failed open went unnoticed.
open STDOUT,"| $^X $xlate $flavour ".shift or die "can't call $xlate: $!";
# Symbolic names for hardware registers.  Volatile (caller-saved)
# registers carry the procedure arguments and short-lived scratch
# values; r20-r31 are non-volatile and saved/restored in the prologue
# and epilogue.
($sp,$toc)	= ("r1","r2");			# stack pointer, TOC
($rp,$ap,$bp,$np,$n0,$num)
		= ("r3","r4","r5","r6","r7","r8");	# incoming arguments
$ovf	= $rp;		# r3 doubles as the overflow word once rp moves
$rp	= "r9";		# $rp is reassigned (result pointer lives in r9)
($aj,$nj,$tj)	= ("r10","r11","r12");		# ap[j], np[j], temp
# non-volatile registers
($i,$j,$tp)	= ("r20","r21","r22");		# outer index, inner offset, tp
($m0,$m1)	= ("r23","r24");		# bp[i] and Montgomery factor
($lo0,$hi0)	= ("r25","r26");		# ap[j]*bp[i] accumulator pair
($lo1,$hi1)	= ("r27","r28");		# np[j]*m1 accumulator pair
($alo,$ahi)	= ("r29","r30");		# next ap[j]*bp[i] product
$nlo	= "r31";				# next np[j]*m1 product (low)

$nhi	= "r0";					# next np[j]*m1 product (high)
# Procedure entry for bn_mul_mont_int(rp,ap,bp,np,n0,num): return 0
# (leaving the generic C path to handle it) when num<4.  The original
# heredoc terminators were lost in extraction and are restored here.
$code=<<___;
.machine "any"
.text

.globl	.bn_mul_mont_int
.align	4
.bn_mul_mont_int:
	cmpwi	$num,4
	mr	$rp,r3		; $rp is reassigned
	li	r3,0
	bltlr
___
# On 32-bit builds the C code wins for long keys, so cap num there too.
$code.=<<___ if ($BNSZ==4);
	cmpwi	$num,32		; longer key performance is not better
	bgelr
___
# Main body of bn_mul_mont_int: classic word-by-word Montgomery
# multiplication.  L1st computes tp[]=ap[]*bp[0]+np[]*m1 for the first
# b-word; Louter/Linner fold in each subsequent bp[i]; Lsub/Lcopy do
# the final conditional subtraction of the modulus.  The heredoc
# terminator "___" and the function's "blr" return (both dropped by
# the extraction) are restored; the assembly itself is verbatim.
$code.=<<___;
	slwi	$num,$num,`log($BNSZ)/log(2)`
	li	$tj,-4096
	addi	$ovf,$num,$FRAME
	subf	$ovf,$ovf,$sp	; $sp-$ovf
	and	$ovf,$ovf,$tj	; minimize TLB usage
	subf	$ovf,$sp,$ovf	; $ovf-$sp
	mr	$tj,$sp
	srwi	$num,$num,`log($BNSZ)/log(2)`
	$STUX	$sp,$sp,$ovf

	$PUSH	r20,`-12*$SIZE_T`($tj)
	$PUSH	r21,`-11*$SIZE_T`($tj)
	$PUSH	r22,`-10*$SIZE_T`($tj)
	$PUSH	r23,`-9*$SIZE_T`($tj)
	$PUSH	r24,`-8*$SIZE_T`($tj)
	$PUSH	r25,`-7*$SIZE_T`($tj)
	$PUSH	r26,`-6*$SIZE_T`($tj)
	$PUSH	r27,`-5*$SIZE_T`($tj)
	$PUSH	r28,`-4*$SIZE_T`($tj)
	$PUSH	r29,`-3*$SIZE_T`($tj)
	$PUSH	r30,`-2*$SIZE_T`($tj)
	$PUSH	r31,`-1*$SIZE_T`($tj)

	$LD	$n0,0($n0)	; pull n0[0] value
	addi	$num,$num,-2	; adjust $num for counter register

	$LD	$m0,0($bp)	; m0=bp[0]
	$LD	$aj,0($ap)	; ap[0]
	addi	$tp,$sp,$LOCALS
	$UMULL	$lo0,$aj,$m0	; ap[0]*bp[0]
	$UMULH	$hi0,$aj,$m0

	$LD	$aj,$BNSZ($ap)	; ap[1]
	$LD	$nj,0($np)	; np[0]

	$UMULL	$m1,$lo0,$n0	; "tp[0]"*n0

	$UMULL	$alo,$aj,$m0	; ap[1]*bp[0]
	$UMULH	$ahi,$aj,$m0

	$UMULL	$lo1,$nj,$m1	; np[0]*m1
	$UMULH	$hi1,$nj,$m1
	$LD	$nj,$BNSZ($np)	; np[1]
	addc	$lo1,$lo1,$lo0
	addze	$hi1,$hi1

	$UMULL	$nlo,$nj,$m1	; np[1]*m1
	$UMULH	$nhi,$nj,$m1

	mtctr	$num
	li	$j,`2*$BNSZ`
.align	4
L1st:
	$LDX	$aj,$ap,$j	; ap[j]
	addc	$lo0,$alo,$hi0
	$LDX	$nj,$np,$j	; np[j]
	addze	$hi0,$ahi
	$UMULL	$alo,$aj,$m0	; ap[j]*bp[0]
	addc	$lo1,$nlo,$hi1
	$UMULH	$ahi,$aj,$m0
	addze	$hi1,$nhi
	$UMULL	$nlo,$nj,$m1	; np[j]*m1
	addc	$lo1,$lo1,$lo0	; np[j]*m1+ap[j]*bp[0]
	$UMULH	$nhi,$nj,$m1
	addze	$hi1,$hi1
	$ST	$lo1,0($tp)	; tp[j-1]

	addi	$j,$j,$BNSZ	; j++
	addi	$tp,$tp,$BNSZ	; tp++
	bdnz-	L1st
;L1st
	addc	$lo0,$alo,$hi0
	addze	$hi0,$ahi

	addc	$lo1,$nlo,$hi1
	addze	$hi1,$nhi
	addc	$lo1,$lo1,$lo0	; np[j]*m1+ap[j]*bp[0]
	addze	$hi1,$hi1
	$ST	$lo1,0($tp)	; tp[j-1]

	li	$ovf,0
	addc	$hi1,$hi1,$hi0
	addze	$ovf,$ovf	; upmost overflow bit
	$ST	$hi1,$BNSZ($tp)

	li	$i,$BNSZ
.align	4
Louter:
	$LDX	$m0,$bp,$i	; m0=bp[i]
	$LD	$aj,0($ap)	; ap[0]
	addi	$tp,$sp,$LOCALS
	$LD	$tj,$LOCALS($sp); tp[0]
	$UMULL	$lo0,$aj,$m0	; ap[0]*bp[i]
	$UMULH	$hi0,$aj,$m0
	$LD	$aj,$BNSZ($ap)	; ap[1]
	$LD	$nj,0($np)	; np[0]
	addc	$lo0,$lo0,$tj	; ap[0]*bp[i]+tp[0]
	$UMULL	$alo,$aj,$m0	; ap[j]*bp[i]
	addze	$hi0,$hi0
	$UMULL	$m1,$lo0,$n0	; tp[0]*n0
	$UMULH	$ahi,$aj,$m0
	$UMULL	$lo1,$nj,$m1	; np[0]*m1
	$UMULH	$hi1,$nj,$m1
	$LD	$nj,$BNSZ($np)	; np[1]
	addc	$lo1,$lo1,$lo0
	$UMULL	$nlo,$nj,$m1	; np[1]*m1
	addze	$hi1,$hi1
	$UMULH	$nhi,$nj,$m1

	mtctr	$num
	li	$j,`2*$BNSZ`
.align	4
Linner:
	$LDX	$aj,$ap,$j	; ap[j]
	addc	$lo0,$alo,$hi0
	$LD	$tj,$BNSZ($tp)	; tp[j]
	addze	$hi0,$ahi
	$LDX	$nj,$np,$j	; np[j]
	addc	$lo1,$nlo,$hi1
	$UMULL	$alo,$aj,$m0	; ap[j]*bp[i]
	addze	$hi1,$nhi
	$UMULH	$ahi,$aj,$m0
	addc	$lo0,$lo0,$tj	; ap[j]*bp[i]+tp[j]
	$UMULL	$nlo,$nj,$m1	; np[j]*m1
	addze	$hi0,$hi0
	$UMULH	$nhi,$nj,$m1
	addc	$lo1,$lo1,$lo0	; np[j]*m1+ap[j]*bp[i]+tp[j]
	addi	$j,$j,$BNSZ	; j++
	addze	$hi1,$hi1
	$ST	$lo1,0($tp)	; tp[j-1]
	addi	$tp,$tp,$BNSZ	; tp++
	bdnz-	Linner
;Linner
	$LD	$tj,$BNSZ($tp)	; tp[j]
	addc	$lo0,$alo,$hi0
	addze	$hi0,$ahi
	addc	$lo0,$lo0,$tj	; ap[j]*bp[i]+tp[j]
	addze	$hi0,$hi0

	addc	$lo1,$nlo,$hi1
	addze	$hi1,$nhi
	addc	$lo1,$lo1,$lo0	; np[j]*m1+ap[j]*bp[i]+tp[j]
	addze	$hi1,$hi1
	$ST	$lo1,0($tp)	; tp[j-1]

	addic	$ovf,$ovf,-1	; move upmost overflow to XER[CA]
	li	$ovf,0
	adde	$hi1,$hi1,$hi0
	addze	$ovf,$ovf
	$ST	$hi1,$BNSZ($tp)

	slwi	$tj,$num,`log($BNSZ)/log(2)`
	$UCMP	$i,$tj
	addi	$i,$i,$BNSZ
	ble-	Louter

	addi	$num,$num,2	; restore $num
	subfc	$j,$j,$j	; j=0 and "clear" XER[CA]
	addi	$tp,$sp,$LOCALS
	mtctr	$num

.align	4
Lsub:	$LDX	$tj,$tp,$j
	$LDX	$nj,$np,$j
	subfe	$aj,$nj,$tj	; tp[j]-np[j]
	$STX	$aj,$rp,$j
	addi	$j,$j,$BNSZ
	bdnz-	Lsub

	li	$j,0
	mtctr	$num
	subfe	$ovf,$j,$ovf	; handle upmost overflow bit
	and	$ap,$tp,$ovf
	andc	$np,$rp,$ovf
	or	$ap,$ap,$np	; ap=borrow?tp:rp

.align	4
Lcopy:				; copy or in-place refresh
	$LDX	$tj,$ap,$j
	$STX	$tj,$rp,$j
	$STX	$j,$tp,$j	; zap at once
	addi	$j,$j,$BNSZ
	bdnz-	Lcopy

	$POP	$tj,0($sp)
	li	r3,1
	$POP	r20,`-12*$SIZE_T`($tj)
	$POP	r21,`-11*$SIZE_T`($tj)
	$POP	r22,`-10*$SIZE_T`($tj)
	$POP	r23,`-9*$SIZE_T`($tj)
	$POP	r24,`-8*$SIZE_T`($tj)
	$POP	r25,`-7*$SIZE_T`($tj)
	$POP	r26,`-6*$SIZE_T`($tj)
	$POP	r27,`-5*$SIZE_T`($tj)
	$POP	r28,`-4*$SIZE_T`($tj)
	$POP	r29,`-3*$SIZE_T`($tj)
	$POP	r30,`-2*$SIZE_T`($tj)
	$POP	r31,`-1*$SIZE_T`($tj)
	mr	$sp,$tj
	blr
	.long	0
	.byte	0,12,4,0,0x80,12,6,0
	.long	0

.asciz  "Montgomery Multiplication for PPC, CRYPTOGAMS by <appro\@openssl.org>"
___
# Evaluate the backticked compile-time arithmetic embedded in the
# generated text, emit it down the xlate pipe, and check the close:
# STDOUT is a pipe to the translator, so buffered-write and child
# failures only surface here.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT or die "error closing STDOUT: $!";