1 #!/usr/bin/env perl
3 ######################################################################
4 ## Constant-time SSSE3 AES core implementation.
5 ## version 0.1
6 ##
7 ## By Mike Hamburg (Stanford University), 2009
8 ## Public domain.
9 ##
10 ## For details see http://shiftleft.org/papers/vector_aes/ and
11 ## http://crypto.stanford.edu/vpaes/.
13 # CBC encrypt/decrypt performance in cycles per byte processed with
14 # 128-bit key.
#
#		aes-ppc.pl		this
# G4e		35.5/52.1/(23.8)	11.9(*)/15.4
# POWER6	42.7/54.3/(28.2)	63.0/92.8(**)
# POWER7	32.3/42.9/(18.4)	18.5/23.3
21 # (*) This is ~10% worse than reported in paper. The reason is
22 # twofold. This module doesn't make any assumption about
23 # key schedule (or data for that matter) alignment and handles
24 # it in-line. Secondly it, being transliterated from
25 # vpaes-x86_64.pl, relies on "nested inversion" better suited
26 # for Intel CPUs.
27 # (**) Inadequate POWER6 performance is due to astronomic AltiVec
28 # latency, 9 cycles per simple logical operation.
30 $flavour = shift;
32 if ($flavour =~ /64/) {
33 $SIZE_T =8;
34 $LRSAVE =2*$SIZE_T;
35 $STU ="stdu";
36 $POP ="ld";
37 $PUSH ="std";
38 $UCMP ="cmpld";
39 } elsif ($flavour =~ /32/) {
40 $SIZE_T =4;
41 $LRSAVE =$SIZE_T;
42 $STU ="stwu";
43 $POP ="lwz";
44 $PUSH ="stw";
45 $UCMP ="cmplw";
46 } else { die "nonsense $flavour"; }
48 $sp="r1";
49 $FRAME=6*$SIZE_T+13*16; # 13*16 is for v20-v31 offload
51 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
52 ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
53 ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
54 die "can't locate ppc-xlate.pl";
56 open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
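# The flavour argument (e.g. linux32, linux64, linux64le) selects the ABI
# parameters above and the assembler dialect produced by ppc-xlate.pl; the
# next argument names the output file, so a typical (hypothetical) build
# step is roughly "perl vpaes-ppc.pl linux64 vpaes-ppc.s".  $FRAME covers
# 6*$SIZE_T bytes of ABI-mandated stack space plus a 13*16-byte area in
# which the twelve non-volatile vector registers v20-v31 are spilled at
# 16-byte-aligned offsets (lvx/stvx ignore the low four bits of the
# address, hence the extra quadword of slack).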
58 $code.=<<___;
59 .machine "any"
61 .text
63 .align 7 # totally strategic alignment
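##
## The ?inv, ?rev and ?asis suffixes on the constant rows below are not
## assembler syntax: they are endian markers consumed by the perl
## post-processing loop at the bottom of this file, which re-emits each
## marked row as .byte values.  On big-endian targets the bytes are kept
## exactly as written; on little-endian targets ?rev reverses the 16
## bytes of a row, ?inv xors every byte with 0x0f, and ?asis leaves the
## row untouched, so that these (mostly vperm index) tables keep working
## with the reversed byte order.  For example, on a little-endian build
## the first Lk_mc_forward row (0x01020300, 0x05060704, ..., marked
## "?inv") is emitted as .byte 0x0e,0x0d,0x0c,0x0f,0x0a,0x09,0x08,0x0b,...
## The same loop also rewrites the '?'-prefixed instructions further down
## (?lvsl/?lvsr, ?vperm, ?vsldoi, ?vspltw) for little-endian targets.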
64 _vpaes_consts:
65 Lk_mc_forward: # mc_forward
66 .long 0x01020300, 0x05060704, 0x090a0b08, 0x0d0e0f0c ?inv
67 .long 0x05060704, 0x090a0b08, 0x0d0e0f0c, 0x01020300 ?inv
68 .long 0x090a0b08, 0x0d0e0f0c, 0x01020300, 0x05060704 ?inv
69 .long 0x0d0e0f0c, 0x01020300, 0x05060704, 0x090a0b08 ?inv
70 Lk_mc_backward: # mc_backward
71 .long 0x03000102, 0x07040506, 0x0b08090a, 0x0f0c0d0e ?inv
72 .long 0x0f0c0d0e, 0x03000102, 0x07040506, 0x0b08090a ?inv
73 .long 0x0b08090a, 0x0f0c0d0e, 0x03000102, 0x07040506 ?inv
74 .long 0x07040506, 0x0b08090a, 0x0f0c0d0e, 0x03000102 ?inv
75 Lk_sr: # sr
76 .long 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f ?inv
77 .long 0x00050a0f, 0x04090e03, 0x080d0207, 0x0c01060b ?inv
78 .long 0x0009020b, 0x040d060f, 0x08010a03, 0x0c050e07 ?inv
79 .long 0x000d0a07, 0x04010e0b, 0x0805020f, 0x0c090603 ?inv
82 ## "Hot" constants
84 Lk_inv: # inv, inva
85 .long 0xf001080d, 0x0f06050e, 0x020c0b0a, 0x09030704 ?rev
86 .long 0xf0070b0f, 0x060a0401, 0x09080502, 0x0c0e0d03 ?rev
87 Lk_ipt: # input transform (lo, hi)
88 .long 0x00702a5a, 0x98e8b2c2, 0x08782252, 0x90e0baca ?rev
89 .long 0x004d7c31, 0x7d30014c, 0x81ccfdb0, 0xfcb180cd ?rev
90 Lk_sbo: # sbou, sbot
91 .long 0x00c7bd6f, 0x176dd2d0, 0x78a802c5, 0x7abfaa15 ?rev
92 .long 0x006abb5f, 0xa574e4cf, 0xfa352b41, 0xd1901e8e ?rev
93 Lk_sb1: # sb1u, sb1t
94 .long 0x0023e2fa, 0x15d41836, 0xefd92e0d, 0xc1ccf73b ?rev
95 .long 0x003e50cb, 0x8fe19bb1, 0x44f52a14, 0x6e7adfa5 ?rev
96 Lk_sb2: # sb2u, sb2t
97 .long 0x0029e10a, 0x4088eb69, 0x4a2382ab, 0xc863a1c2 ?rev
98 .long 0x0024710b, 0xc6937ae2, 0xcd2f98bc, 0x55e9b75e ?rev
101 ## Decryption stuff
103 Lk_dipt: # decryption input transform
104 .long 0x005f540b, 0x045b500f, 0x1a454e11, 0x1e414a15 ?rev
105 .long 0x00650560, 0xe683e386, 0x94f191f4, 0x72177712 ?rev
106 Lk_dsbo: # decryption sbox final output
107 .long 0x0040f97e, 0x53ea8713, 0x2d3e94d4, 0xb96daac7 ?rev
108 .long 0x001d4493, 0x0f56d712, 0x9c8ec5d8, 0x59814bca ?rev
109 Lk_dsb9: # decryption sbox output *9*u, *9*t
110 .long 0x00d6869a, 0x53031c85, 0xc94c994f, 0x501fd5ca ?rev
111 .long 0x0049d7ec, 0x89173bc0, 0x65a5fbb2, 0x9e2c5e72 ?rev
112 Lk_dsbd: # decryption sbox output *D*u, *D*t
113 .long 0x00a2b1e6, 0xdfcc577d, 0x39442a88, 0x139b6ef5 ?rev
114 .long 0x00cbc624, 0xf7fae23c, 0xd3efde15, 0x0d183129 ?rev
115 Lk_dsbb: # decryption sbox output *B*u, *B*t
116 .long 0x0042b496, 0x926422d0, 0x04d4f2b0, 0xf6462660 ?rev
117 .long 0x006759cd, 0xa69894c1, 0x6baa5532, 0x3e0cfff3 ?rev
118 Lk_dsbe: # decryption sbox output *E*u, *E*t
119 .long 0x00d0d426, 0x9692f246, 0xb0f6b464, 0x04604222 ?rev
120 .long 0x00c1aaff, 0xcda6550c, 0x323e5998, 0x6bf36794 ?rev
123 ## Key schedule constants
125 Lk_dksd: # decryption key schedule: invskew x*D
126 .long 0x0047e4a3, 0x5d1ab9fe, 0xf9be1d5a, 0xa4e34007 ?rev
127 .long 0x008336b5, 0xf477c241, 0x1e9d28ab, 0xea69dc5f ?rev
128 Lk_dksb: # decryption key schedule: invskew x*B
129 .long 0x00d55085, 0x1fca4f9a, 0x994cc91c, 0x8653d603 ?rev
130 .long 0x004afcb6, 0xa7ed5b11, 0xc882347e, 0x6f2593d9 ?rev
131 Lk_dkse: # decryption key schedule: invskew x*E + 0x63
132 .long 0x00d6c91f, 0xca1c03d5, 0x86504f99, 0x4c9a8553 ?rev
133 .long 0xe87bdc4f, 0x059631a2, 0x8714b320, 0x6af95ecd ?rev
134 Lk_dks9: # decryption key schedule: invskew x*9
135 .long 0x00a7d97e, 0xc86f11b6, 0xfc5b2582, 0x3493ed4a ?rev
136 .long 0x00331427, 0x62517645, 0xcefddae9, 0xac9fb88b ?rev
138 Lk_rcon: # rcon
139 .long 0xb6ee9daf, 0xb991831f, 0x817d7c4d, 0x08982a70 ?asis
140 Lk_s63:
141 .long 0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b ?asis
143 Lk_opt: # output transform
144 .long 0x0060b6d6, 0x29499fff, 0x0868bede, 0x214197f7 ?rev
145 .long 0x00ecbc50, 0x51bded01, 0xe00c5cb0, 0xb15d0de1 ?rev
146 Lk_deskew: # deskew tables: inverts the sbox's "skew"
147 .long 0x00e3a447, 0x40a3e407, 0x1af9be5d, 0x5ab9fe1d ?rev
148 .long 0x0069ea83, 0xdcb5365f, 0x771e9df4, 0xabc24128 ?rev
149 .align 5
150 Lconsts:
151 mflr r0
152 bcl 20,31,\$+4
153 mflr r12 #vvvvv "distance between . and _vpaes_consts
154 addi r12,r12,-0x308
mtlr r0
blr
157 .long 0
158 .byte 0,12,0x14,0,0,0,0,0
159 .asciz "Vector Permutation AES for AltiVec, Mike Hamburg (Stanford University)"
.align 6
___
163 my ($inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm) = map("v$_",(26..31));
165 my ($inp,$out,$key) = map("r$_",(3..5));
167 my ($invlo,$invhi,$iptlo,$ipthi,$sbou,$sbot) = map("v$_",(10..15));
168 my ($sb1u,$sb1t,$sb2u,$sb2t) = map("v$_",(16..19));
169 my ($sb9u,$sb9t,$sbdu,$sbdt,$sbbu,$sbbt,$sbeu,$sbet)=map("v$_",(16..23));
171 $code.=<<___;
173 ## _aes_preheat
175 ## Fills register %r10 -> .aes_consts (so you can -fPIC)
176 ## and %xmm9-%xmm15 as specified below.
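##
## In this PPC translation r12 is set by Lconsts to the address of
## _vpaes_consts, so the small li constants below are simply byte
## offsets into that table: 0xc0 is Lk_inv, 0xe0 is Lk_ipt, 0x100 is
## Lk_sbo, and so on (this r12-relative addressing is what makes the
## code PIC-friendly).  The xmm register names in the comments refer to
## the original vpaes-x86_64.pl; here the same values live in v10-v19
## for encryption and up to v23 for decryption, with v7/v8/v9 holding
## the 0x00.., 0x04.. and 0x0f.. splat constants used for the nibble
## split.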
178 .align 4
179 _vpaes_encrypt_preheat:
180 mflr r8
181 bl Lconsts
182 mtlr r8
183 li r11, 0xc0 # Lk_inv
184 li r10, 0xd0
185 li r9, 0xe0 # Lk_ipt
186 li r8, 0xf0
187 vxor v7, v7, v7 # 0x00..00
188 vspltisb v8,4 # 0x04..04
189 vspltisb v9,0x0f # 0x0f..0f
190 lvx $invlo, r12, r11
191 li r11, 0x100
192 lvx $invhi, r12, r10
193 li r10, 0x110
194 lvx $iptlo, r12, r9
195 li r9, 0x120
196 lvx $ipthi, r12, r8
197 li r8, 0x130
198 lvx $sbou, r12, r11
199 li r11, 0x140
200 lvx $sbot, r12, r10
201 li r10, 0x150
202 lvx $sb1u, r12, r9
203 lvx $sb1t, r12, r8
204 lvx $sb2u, r12, r11
lvx $sb2t, r12, r10
blr
207 .long 0
208 .byte 0,12,0x14,0,0,0,0,0
211 ## _aes_encrypt_core
213 ## AES-encrypt %xmm0.
215 ## Inputs:
216 ## %xmm0 = input
217 ## %xmm9-%xmm15 as in _vpaes_preheat
218 ## (%rdx) = scheduled keys
220 ## Output in %xmm0
221 ## Clobbers %xmm1-%xmm6, %r9, %r10, %r11, %rax
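##
## The recurring trick (sketched below in C purely as an illustration,
## not as code that exists anywhere in OpenSSL) is that every byte of
## the state is split into its two nibbles and each nibble is used as a
## vperm index into a 16-byte table, the two lookups being combined with
## xor.  The input transform right below computes, per byte b,
##
##	/* illustrative C, one byte at a time */
##	lo = b & 0x0f;  hi = b >> 4;
##	out = ipt_lo[lo] ^ ipt_hi[hi] ^ round_key_byte;
##
## which is exactly the vsrb/vperm/vperm/vxor/vxor sequence at the top
## of _vpaes_encrypt_core.  Because vperm is a data-independent permute,
## no secret-dependent memory address is ever formed, which is what
## makes the implementation constant-time.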
224 .align 5
225 _vpaes_encrypt_core:
226 lwz r8, 240($key) # pull rounds
227 li r9, 16
228 lvx v5, 0, $key # vmovdqu (%r9), %xmm5 # round0 key
229 li r11, 0x10
230 lvx v6, r9, $key
231 addi r9, r9, 16
232 ?vperm v5, v5, v6, $keyperm # align round key
233 addi r10, r11, 0x40
234 vsrb v1, v0, v8 # vpsrlb \$4, %xmm0, %xmm0
235 vperm v0, $iptlo, $iptlo, v0 # vpshufb %xmm1, %xmm2, %xmm1
236 vperm v1, $ipthi, $ipthi, v1 # vpshufb %xmm0, %xmm3, %xmm2
237 vxor v0, v0, v5 # vpxor %xmm5, %xmm1, %xmm0
238 vxor v0, v0, v1 # vpxor %xmm2, %xmm0, %xmm0
239 mtctr r8
240 b Lenc_entry
242 .align 4
243 Lenc_loop:
244 # middle of middle round
245 vperm v4, $sb1t, v7, v2 # vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u
246 lvx v1, r12, r11 # vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[]
247 addi r11, r11, 16
248 vperm v0, $sb1u, v7, v3 # vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t
249 vxor v4, v4, v5 # vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
250 andi. r11, r11, 0x30 # and \$0x30, %r11 # ... mod 4
251 vperm v5, $sb2t, v7, v2 # vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u
252 vxor v0, v0, v4 # vpxor %xmm4, %xmm0, %xmm0 # 0 = A
253 vperm v2, $sb2u, v7, v3 # vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t
254 lvx v4, r12, r10 # vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[]
255 addi r10, r11, 0x40
256 vperm v3, v0, v7, v1 # vpshufb %xmm1, %xmm0, %xmm3 # 0 = B
257 vxor v2, v2, v5 # vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A
258 vperm v0, v0, v7, v4 # vpshufb %xmm4, %xmm0, %xmm0 # 3 = D
259 vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B
260 vperm v4, v3, v7, v1 # vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C
261 vxor v0, v0, v3 # vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D
262 vxor v0, v0, v4 # vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D
264 Lenc_entry:
265 # top of round
266 vsrb v1, v0, v8 # vpsrlb \$4, %xmm0, %xmm0 # 1 = i
267 vperm v5, $invhi, $invhi, v0 # vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k
268 vxor v0, v0, v1 # vpxor %xmm0, %xmm1, %xmm1 # 0 = j
269 vperm v3, $invlo, $invlo, v1 # vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
270 vperm v4, $invlo, $invlo, v0 # vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
271 vand v0, v0, v9
272 vxor v3, v3, v5 # vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
273 vxor v4, v4, v5 # vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
274 vperm v2, $invlo, v7, v3 # vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
275 vmr v5, v6
276 lvx v6, r9, $key # vmovdqu (%r9), %xmm5
277 vperm v3, $invlo, v7, v4 # vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
278 addi r9, r9, 16
279 vxor v2, v2, v0 # vpxor %xmm1, %xmm2, %xmm2 # 2 = io
280 ?vperm v5, v5, v6, $keyperm # align round key
281 vxor v3, v3, v1 # vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
282 bdnz Lenc_loop
284 # middle of last round
285 addi r10, r11, 0x80
286 # vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo
287 # vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16
288 vperm v4, $sbou, v7, v2 # vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
289 lvx v1, r12, r10 # vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[]
290 vperm v0, $sbot, v7, v3 # vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t
291 vxor v4, v4, v5 # vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
292 vxor v0, v0, v4 # vpxor %xmm4, %xmm0, %xmm0 # 0 = A
vperm v0, v0, v7, v1 # vpshufb %xmm1, %xmm0, %xmm0
blr
295 .long 0
296 .byte 0,12,0x14,0,0,0,0,0
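##
## .vpaes_encrypt is the public, C-callable entry point, declared on the
## C side approximately as
##
##	void vpaes_encrypt(const unsigned char *in, unsigned char *out,
##	                   const AES_KEY *key);
##
## with in/out/key arriving in r3/r4/r5.  Because lvx/stvx ignore the
## low four bits of the address, unaligned buffers are handled by hand:
## the input block is assembled from the two quadwords that straddle it
## with vperm (hence the "addi ...,15" pointer bumps), and the output is
## stored read-modify-write through vsel under an alignment mask so that
## bytes outside the 16-byte block are preserved.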
298 .globl .vpaes_encrypt
299 .align 5
300 .vpaes_encrypt:
301 $STU $sp,-$FRAME($sp)
302 li r10,`15+6*$SIZE_T`
303 li r11,`31+6*$SIZE_T`
304 mflr r6
305 mfspr r7, 256 # save vrsave
306 stvx v20,r10,$sp
307 addi r10,r10,32
308 stvx v21,r11,$sp
309 addi r11,r11,32
310 stvx v22,r10,$sp
311 addi r10,r10,32
312 stvx v23,r11,$sp
313 addi r11,r11,32
314 stvx v24,r10,$sp
315 addi r10,r10,32
316 stvx v25,r11,$sp
317 addi r11,r11,32
318 stvx v26,r10,$sp
319 addi r10,r10,32
320 stvx v27,r11,$sp
321 addi r11,r11,32
322 stvx v28,r10,$sp
323 addi r10,r10,32
324 stvx v29,r11,$sp
325 addi r11,r11,32
326 stvx v30,r10,$sp
327 stvx v31,r11,$sp
328 stw r7,`$FRAME-4`($sp) # save vrsave
329 li r0, -1
330 $PUSH r6,`$FRAME+$LRSAVE`($sp)
331 mtspr 256, r0 # preserve all AltiVec registers
333 bl _vpaes_encrypt_preheat
335 ?lvsl $inpperm, 0, $inp # prepare for unaligned access
336 lvx v0, 0, $inp
337 addi $inp, $inp, 15 # 15 is not a typo
338 ?lvsr $outperm, 0, $out
339 ?lvsl $keyperm, 0, $key # prepare for unaligned access
340 vnor $outmask, v7, v7 # 0xff..ff
341 lvx $inptail, 0, $inp # redundant in aligned case
342 ?vperm $outmask, v7, $outmask, $outperm
343 lvx $outhead, 0, $out
344 ?vperm v0, v0, $inptail, $inpperm
346 bl _vpaes_encrypt_core
348 vperm v0, v0, v0, $outperm # rotate right/left
349 vsel v1, $outhead, v0, $outmask
350 vmr $outhead, v0
351 stvx v1, 0, $out
352 addi $out, $out, 15 # 15 is not a typo
353 ########
355 lvx v1, 0, $out # redundant in aligned case
356 vsel v1, $outhead, v1, $outmask
357 stvx v1, 0, $out
359 li r10,`15+6*$SIZE_T`
360 li r11,`31+6*$SIZE_T`
361 mtlr r6
362 mtspr 256, r7 # restore vrsave
363 lvx v20,r10,$sp
364 addi r10,r10,32
365 lvx v21,r11,$sp
366 addi r11,r11,32
367 lvx v22,r10,$sp
368 addi r10,r10,32
369 lvx v23,r11,$sp
370 addi r11,r11,32
371 lvx v24,r10,$sp
372 addi r10,r10,32
373 lvx v25,r11,$sp
374 addi r11,r11,32
375 lvx v26,r10,$sp
376 addi r10,r10,32
377 lvx v27,r11,$sp
378 addi r11,r11,32
379 lvx v28,r10,$sp
380 addi r10,r10,32
381 lvx v29,r11,$sp
382 addi r11,r11,32
383 lvx v30,r10,$sp
384 lvx v31,r11,$sp
addi $sp,$sp,$FRAME
blr
387 .long 0
388 .byte 0,12,0x04,1,0x80,0,3,0
389 .long 0
390 .size .vpaes_encrypt,.-.vpaes_encrypt
392 .align 4
393 _vpaes_decrypt_preheat:
394 mflr r8
395 bl Lconsts
396 mtlr r8
397 li r11, 0xc0 # Lk_inv
398 li r10, 0xd0
399 li r9, 0x160 # Ldipt
400 li r8, 0x170
401 vxor v7, v7, v7 # 0x00..00
402 vspltisb v8,4 # 0x04..04
403 vspltisb v9,0x0f # 0x0f..0f
404 lvx $invlo, r12, r11
405 li r11, 0x180
406 lvx $invhi, r12, r10
407 li r10, 0x190
408 lvx $iptlo, r12, r9
409 li r9, 0x1a0
410 lvx $ipthi, r12, r8
411 li r8, 0x1b0
412 lvx $sbou, r12, r11
413 li r11, 0x1c0
414 lvx $sbot, r12, r10
415 li r10, 0x1d0
416 lvx $sb9u, r12, r9
417 li r9, 0x1e0
418 lvx $sb9t, r12, r8
419 li r8, 0x1f0
420 lvx $sbdu, r12, r11
421 li r11, 0x200
422 lvx $sbdt, r12, r10
423 li r10, 0x210
424 lvx $sbbu, r12, r9
425 lvx $sbbt, r12, r8
426 lvx $sbeu, r12, r11
lvx $sbet, r12, r10
blr
429 .long 0
430 .byte 0,12,0x14,0,0,0,0,0
433 ## Decryption core
435 ## Same API as encryption core.
437 .align 4
438 _vpaes_decrypt_core:
439 lwz r8, 240($key) # pull rounds
440 li r9, 16
441 lvx v5, 0, $key # vmovdqu (%r9), %xmm4 # round0 key
442 li r11, 0x30
443 lvx v6, r9, $key
444 addi r9, r9, 16
445 ?vperm v5, v5, v6, $keyperm # align round key
446 vsrb v1, v0, v8 # vpsrlb \$4, %xmm0, %xmm0
447 vperm v0, $iptlo, $iptlo, v0 # vpshufb %xmm1, %xmm2, %xmm2
448 vperm v1, $ipthi, $ipthi, v1 # vpshufb %xmm0, %xmm1, %xmm0
449 vxor v0, v0, v5 # vpxor %xmm4, %xmm2, %xmm2
450 vxor v0, v0, v1 # vpxor %xmm2, %xmm0, %xmm0
451 mtctr r8
452 b Ldec_entry
454 .align 4
455 Ldec_loop:
457 # Inverse mix columns
459 lvx v0, r12, r11 # v5 and v0 are flipped
460 # vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u
461 # vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t
462 vperm v4, $sb9u, v7, v2 # vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u
463 subi r11, r11, 16
464 vperm v1, $sb9t, v7, v3 # vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t
465 andi. r11, r11, 0x30
466 vxor v5, v5, v4 # vpxor %xmm4, %xmm0, %xmm0
467 # vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu
468 vxor v5, v5, v1 # vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
469 # vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt
471 vperm v4, $sbdu, v7, v2 # vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu
472 vperm v5, v5, v7, v0 # vpshufb %xmm5, %xmm0, %xmm0 # MC ch
473 vperm v1, $sbdt, v7, v3 # vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt
474 vxor v5, v5, v4 # vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
475 # vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu
476 vxor v5, v5, v1 # vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
477 # vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt
479 vperm v4, $sbbu, v7, v2 # vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu
480 vperm v5, v5, v7, v0 # vpshufb %xmm5, %xmm0, %xmm0 # MC ch
481 vperm v1, $sbbt, v7, v3 # vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt
482 vxor v5, v5, v4 # vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
483 # vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu
484 vxor v5, v5, v1 # vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
485 # vmovdqa 0x50(%r10), %xmm1 # 0 : sbet
487 vperm v4, $sbeu, v7, v2 # vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu
488 vperm v5, v5, v7, v0 # vpshufb %xmm5, %xmm0, %xmm0 # MC ch
489 vperm v1, $sbet, v7, v3 # vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet
490 vxor v0, v5, v4 # vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
491 vxor v0, v0, v1 # vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
493 Ldec_entry:
494 # top of round
495 vsrb v1, v0, v8 # vpsrlb \$4, %xmm0, %xmm0 # 1 = i
496 vperm v2, $invhi, $invhi, v0 # vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
497 vxor v0, v0, v1 # vpxor %xmm0, %xmm1, %xmm1 # 0 = j
498 vperm v3, $invlo, $invlo, v1 # vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
499 vperm v4, $invlo, $invlo, v0 # vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
500 vand v0, v0, v9
501 vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
502 vxor v4, v4, v2 # vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
503 vperm v2, $invlo, v7, v3 # vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
504 vmr v5, v6
505 lvx v6, r9, $key # vmovdqu (%r9), %xmm0
506 vperm v3, $invlo, v7, v4 # vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
507 addi r9, r9, 16
508 vxor v2, v2, v0 # vpxor %xmm1, %xmm2, %xmm2 # 2 = io
509 ?vperm v5, v5, v6, $keyperm # align round key
510 vxor v3, v3, v1 # vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
511 bdnz Ldec_loop
513 # middle of last round
514 addi r10, r11, 0x80
515 # vmovdqa 0x60(%r10), %xmm4 # 3 : sbou
516 vperm v4, $sbou, v7, v2 # vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
517 # vmovdqa 0x70(%r10), %xmm1 # 0 : sbot
518 lvx v2, r12, r10 # vmovdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160
519 vperm v1, $sbot, v7, v3 # vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t
520 vxor v4, v4, v5 # vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k
521 vxor v0, v1, v4 # vpxor %xmm4, %xmm1, %xmm0 # 0 = A
vperm v0, v0, v7, v2 # vpshufb %xmm2, %xmm0, %xmm0
blr
524 .long 0
525 .byte 0,12,0x14,0,0,0,0,0
527 .globl .vpaes_decrypt
528 .align 5
529 .vpaes_decrypt:
530 $STU $sp,-$FRAME($sp)
531 li r10,`15+6*$SIZE_T`
532 li r11,`31+6*$SIZE_T`
533 mflr r6
534 mfspr r7, 256 # save vrsave
535 stvx v20,r10,$sp
536 addi r10,r10,32
537 stvx v21,r11,$sp
538 addi r11,r11,32
539 stvx v22,r10,$sp
540 addi r10,r10,32
541 stvx v23,r11,$sp
542 addi r11,r11,32
543 stvx v24,r10,$sp
544 addi r10,r10,32
545 stvx v25,r11,$sp
546 addi r11,r11,32
547 stvx v26,r10,$sp
548 addi r10,r10,32
549 stvx v27,r11,$sp
550 addi r11,r11,32
551 stvx v28,r10,$sp
552 addi r10,r10,32
553 stvx v29,r11,$sp
554 addi r11,r11,32
555 stvx v30,r10,$sp
556 stvx v31,r11,$sp
557 stw r7,`$FRAME-4`($sp) # save vrsave
558 li r0, -1
559 $PUSH r6,`$FRAME+$LRSAVE`($sp)
560 mtspr 256, r0 # preserve all AltiVec registers
562 bl _vpaes_decrypt_preheat
564 ?lvsl $inpperm, 0, $inp # prepare for unaligned access
565 lvx v0, 0, $inp
566 addi $inp, $inp, 15 # 15 is not a typo
567 ?lvsr $outperm, 0, $out
568 ?lvsl $keyperm, 0, $key
569 vnor $outmask, v7, v7 # 0xff..ff
570 lvx $inptail, 0, $inp # redundant in aligned case
571 ?vperm $outmask, v7, $outmask, $outperm
572 lvx $outhead, 0, $out
573 ?vperm v0, v0, $inptail, $inpperm
575 bl _vpaes_decrypt_core
577 vperm v0, v0, v0, $outperm # rotate right/left
578 vsel v1, $outhead, v0, $outmask
579 vmr $outhead, v0
580 stvx v1, 0, $out
581 addi $out, $out, 15 # 15 is not a typo
582 ########
584 lvx v1, 0, $out # redundant in aligned case
585 vsel v1, $outhead, v1, $outmask
586 stvx v1, 0, $out
588 li r10,`15+6*$SIZE_T`
589 li r11,`31+6*$SIZE_T`
590 mtlr r6
591 mtspr 256, r7 # restore vrsave
592 lvx v20,r10,$sp
593 addi r10,r10,32
594 lvx v21,r11,$sp
595 addi r11,r11,32
596 lvx v22,r10,$sp
597 addi r10,r10,32
598 lvx v23,r11,$sp
599 addi r11,r11,32
600 lvx v24,r10,$sp
601 addi r10,r10,32
602 lvx v25,r11,$sp
603 addi r11,r11,32
604 lvx v26,r10,$sp
605 addi r10,r10,32
606 lvx v27,r11,$sp
607 addi r11,r11,32
608 lvx v28,r10,$sp
609 addi r10,r10,32
610 lvx v29,r11,$sp
611 addi r11,r11,32
612 lvx v30,r10,$sp
613 lvx v31,r11,$sp
addi $sp,$sp,$FRAME
blr
616 .long 0
617 .byte 0,12,0x04,1,0x80,0,3,0
618 .long 0
619 .size .vpaes_decrypt,.-.vpaes_decrypt
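##
## .vpaes_cbc_encrypt handles whole 16-byte blocks only: the length is
## rounded down to a multiple of 16 and anything shorter than one block
## returns immediately via the bltlr- below.  The C-side declaration is
## roughly
##
##	void vpaes_cbc_encrypt(const unsigned char *in, unsigned char *out,
##	                       size_t length, const AES_KEY *key,
##	                       unsigned char *ivec, int enc);
##
## and the two loops implement textbook CBC chaining, i.e. (sketched in
## C only to show the data flow)
##
##	/* encrypt */  c[i] = E(p[i] ^ iv); iv = c[i];
##	/* decrypt */  p[i] = D(c[i]) ^ iv; iv = c[i];
##
## with the final iv written back to ivec through the same unaligned
## store machinery used for the data blocks.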
621 .globl .vpaes_cbc_encrypt
622 .align 5
623 .vpaes_cbc_encrypt:
624 ${UCMP}i r5,16
625 bltlr-
627 $STU $sp,-`($FRAME+2*$SIZE_T)`($sp)
628 mflr r0
629 li r10,`15+6*$SIZE_T`
630 li r11,`31+6*$SIZE_T`
631 mfspr r12, 256
632 stvx v20,r10,$sp
633 addi r10,r10,32
634 stvx v21,r11,$sp
635 addi r11,r11,32
636 stvx v22,r10,$sp
637 addi r10,r10,32
638 stvx v23,r11,$sp
639 addi r11,r11,32
640 stvx v24,r10,$sp
641 addi r10,r10,32
642 stvx v25,r11,$sp
643 addi r11,r11,32
644 stvx v26,r10,$sp
645 addi r10,r10,32
646 stvx v27,r11,$sp
647 addi r11,r11,32
648 stvx v28,r10,$sp
649 addi r10,r10,32
650 stvx v29,r11,$sp
651 addi r11,r11,32
652 stvx v30,r10,$sp
653 stvx v31,r11,$sp
654 stw r12,`$FRAME-4`($sp) # save vrsave
655 $PUSH r30,`$FRAME+$SIZE_T*0`($sp)
656 $PUSH r31,`$FRAME+$SIZE_T*1`($sp)
657 li r9, -16
658 $PUSH r0, `$FRAME+$SIZE_T*2+$LRSAVE`($sp)
660 and r30, r5, r9 # copy length&-16
661 mr r5, r6 # copy pointer to key
662 mr r31, r7 # copy pointer to iv
663 blt Lcbc_abort
664 cmpwi r8, 0 # test direction
665 li r6, -1
666 mr r7, r12 # copy vrsave
667 mtspr 256, r6 # preserve all AltiVec registers
669 lvx v24, 0, r31 # load [potentially unaligned] iv
670 li r9, 15
671 ?lvsl $inpperm, 0, r31
672 lvx v25, r9, r31
673 ?vperm v24, v24, v25, $inpperm
675 neg r8, $inp # prepare for unaligned access
676 vxor v7, v7, v7
677 ?lvsl $keyperm, 0, $key
678 ?lvsr $outperm, 0, $out
679 ?lvsr $inpperm, 0, r8 # -$inp
680 vnor $outmask, v7, v7 # 0xff..ff
681 lvx $inptail, 0, $inp
682 ?vperm $outmask, v7, $outmask, $outperm
683 addi $inp, $inp, 15 # 15 is not a typo
684 lvx $outhead, 0, $out
686 beq Lcbc_decrypt
688 bl _vpaes_encrypt_preheat
689 li r0, 16
691 Lcbc_enc_loop:
692 vmr v0, $inptail
693 lvx $inptail, 0, $inp
694 addi $inp, $inp, 16
695 ?vperm v0, v0, $inptail, $inpperm
696 vxor v0, v0, v24 # ^= iv
698 bl _vpaes_encrypt_core
700 vmr v24, v0 # put aside iv
701 sub. r30, r30, r0 # len -= 16
702 vperm v0, v0, v0, $outperm # rotate right/left
703 vsel v1, $outhead, v0, $outmask
704 vmr $outhead, v0
705 stvx v1, 0, $out
706 addi $out, $out, 16
707 bne Lcbc_enc_loop
709 b Lcbc_done
711 .align 5
712 Lcbc_decrypt:
713 bl _vpaes_decrypt_preheat
714 li r0, 16
716 Lcbc_dec_loop:
717 vmr v0, $inptail
718 lvx $inptail, 0, $inp
719 addi $inp, $inp, 16
720 ?vperm v0, v0, $inptail, $inpperm
721 vmr v25, v0 # put aside input
723 bl _vpaes_decrypt_core
725 vxor v0, v0, v24 # ^= iv
726 vmr v24, v25
727 sub. r30, r30, r0 # len -= 16
728 vperm v0, v0, v0, $outperm # rotate right/left
729 vsel v1, $outhead, v0, $outmask
730 vmr $outhead, v0
731 stvx v1, 0, $out
732 addi $out, $out, 16
733 bne Lcbc_dec_loop
735 Lcbc_done:
736 addi $out, $out, -1
737 lvx v1, 0, $out # redundant in aligned case
738 vsel v1, $outhead, v1, $outmask
739 stvx v1, 0, $out
741 neg r8, r31 # write [potentially unaligned] iv
742 ?lvsl $outperm, 0, r8
743 li r6, 15
744 vnor $outmask, v7, v7 # 0xff..ff
745 ?vperm $outmask, v7, $outmask, $outperm
746 lvx $outhead, 0, r31
747 vperm v24, v24, v24, $outperm # rotate right/left
748 vsel v0, $outhead, v24, $outmask
749 lvx v1, r6, r31
750 stvx v0, 0, r31
751 vsel v1, v24, v1, $outmask
752 stvx v1, r6, r31
754 mtspr 256, r7 # restore vrsave
755 li r10,`15+6*$SIZE_T`
756 li r11,`31+6*$SIZE_T`
757 lvx v20,r10,$sp
758 addi r10,r10,32
759 lvx v21,r11,$sp
760 addi r11,r11,32
761 lvx v22,r10,$sp
762 addi r10,r10,32
763 lvx v23,r11,$sp
764 addi r11,r11,32
765 lvx v24,r10,$sp
766 addi r10,r10,32
767 lvx v25,r11,$sp
768 addi r11,r11,32
769 lvx v26,r10,$sp
770 addi r10,r10,32
771 lvx v27,r11,$sp
772 addi r11,r11,32
773 lvx v28,r10,$sp
774 addi r10,r10,32
775 lvx v29,r11,$sp
776 addi r11,r11,32
777 lvx v30,r10,$sp
778 lvx v31,r11,$sp
779 Lcbc_abort:
780 $POP r0, `$FRAME+$SIZE_T*2+$LRSAVE`($sp)
781 $POP r30,`$FRAME+$SIZE_T*0`($sp)
782 $POP r31,`$FRAME+$SIZE_T*1`($sp)
783 mtlr r0
addi $sp,$sp,`$FRAME+$SIZE_T*2`
blr
786 .long 0
787 .byte 0,12,0x04,1,0x80,2,6,0
788 .long 0
.size .vpaes_cbc_encrypt,.-.vpaes_cbc_encrypt
___
793 my ($inp,$bits,$out)=map("r$_",(3..5));
794 my $dir="cr1";
795 my ($invlo,$invhi,$iptlo,$ipthi,$rcon) = map("v$_",(10..13,24));
797 $code.=<<___;
798 ########################################################
799 ## ##
800 ## AES key schedule ##
801 ## ##
802 ########################################################
803 .align 4
804 _vpaes_key_preheat:
805 mflr r8
806 bl Lconsts
807 mtlr r8
808 li r11, 0xc0 # Lk_inv
809 li r10, 0xd0
810 li r9, 0xe0 # L_ipt
811 li r8, 0xf0
813 vspltisb v8,4 # 0x04..04
814 vxor v9,v9,v9 # 0x00..00
815 lvx $invlo, r12, r11 # Lk_inv
816 li r11, 0x120
817 lvx $invhi, r12, r10
818 li r10, 0x130
819 lvx $iptlo, r12, r9 # Lk_ipt
820 li r9, 0x220
821 lvx $ipthi, r12, r8
822 li r8, 0x230
824 lvx v14, r12, r11 # Lk_sb1
825 li r11, 0x240
826 lvx v15, r12, r10
827 li r10, 0x250
829 lvx v16, r12, r9 # Lk_dksd
830 li r9, 0x260
831 lvx v17, r12, r8
832 li r8, 0x270
833 lvx v18, r12, r11 # Lk_dksb
834 li r11, 0x280
835 lvx v19, r12, r10
836 li r10, 0x290
837 lvx v20, r12, r9 # Lk_dkse
838 li r9, 0x2a0
839 lvx v21, r12, r8
840 li r8, 0x2b0
841 lvx v22, r12, r11 # Lk_dks9
842 lvx v23, r12, r10
844 lvx v24, r12, r9 # Lk_rcon
845 lvx v25, 0, r12 # Lk_mc_forward[0]
lvx v26, r12, r8 # Lk_s63
blr
848 .long 0
849 .byte 0,12,0x14,0,0,0,0,0
851 .align 4
852 _vpaes_schedule_core:
853 mflr r7
855 bl _vpaes_key_preheat # load the tables
857 #lvx v0, 0, $inp # vmovdqu (%rdi), %xmm0 # load key (unaligned)
858 neg r8, $inp # prepare for unaligned access
859 lvx v0, 0, $inp
860 addi $inp, $inp, 15 # 15 is not typo
861 ?lvsr $inpperm, 0, r8 # -$inp
862 lvx v6, 0, $inp # v6 serves as inptail
863 addi $inp, $inp, 8
864 ?vperm v0, v0, v6, $inpperm
866 # input transform
867 vmr v3, v0 # vmovdqa %xmm0, %xmm3
868 bl _vpaes_schedule_transform
869 vmr v7, v0 # vmovdqa %xmm0, %xmm7
871 bne $dir, Lschedule_am_decrypting
873 # encrypting, output zeroth round key after transform
874 li r8, 0x30 # mov \$0x30,%r8d
875 addi r10, r12, 0x80 # lea .Lk_sr(%rip),%r10
877 ?lvsr $outperm, 0, $out # prepare for unaligned access
878 vnor $outmask, v9, v9 # 0xff..ff
879 lvx $outhead, 0, $out
880 ?vperm $outmask, v9, $outmask, $outperm
882 #stvx v0, 0, $out # vmovdqu %xmm0, (%rdx)
883 vperm v1, v0, v0, $outperm # rotate right/left
884 vsel v2, $outhead, v1, $outmask
885 vmr $outhead, v1
886 stvx v2, 0, $out
887 b Lschedule_go
889 Lschedule_am_decrypting:
890 srwi r8, $bits, 1 # shr \$1,%r8d
891 andi. r8, r8, 32 # and \$32,%r8d
892 xori r8, r8, 32 # xor \$32,%r8d # nbits==192?0:32
893 addi r10, r12, 0x80 # lea .Lk_sr(%rip),%r10
894 # decrypting, output zeroth round key after shiftrows
895 lvx v1, r8, r10 # vmovdqa (%r8,%r10), %xmm1
896 vperm v4, v3, v3, v1 # vpshufb %xmm1, %xmm3, %xmm3
898 neg r0, $out # prepare for unaligned access
899 ?lvsl $outperm, 0, r0
900 addi $out, $out, 15 # 15 is not typo
901 vnor $outmask, v9, v9 # 0xff..ff
902 lvx $outhead, 0, $out
903 ?vperm $outmask, $outmask, v9, $outperm
905 #stvx v4, 0, $out # vmovdqu %xmm3, (%rdx)
906 vperm v4, v4, v4, $outperm # rotate right/left
907 vsel v2, $outhead, v4, $outmask
908 vmr $outhead, v4
909 stvx v2, 0, $out
910 xori r8, r8, 0x30 # xor \$0x30, %r8
912 Lschedule_go:
913 cmplwi $bits, 192 # cmp \$192, %esi
914 bgt Lschedule_256
915 beq Lschedule_192
# 128: fall through
919 ## .schedule_128
921 ## 128-bit specific part of key schedule.
923 ## This schedule is really simple, because all its parts
924 ## are accomplished by the subroutines.
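##
## Counting helps here: the counter is 10, _vpaes_schedule_round runs 10
## times, the first nine results are written by _vpaes_schedule_mangle
## and the tenth by Lschedule_mangle_last, which together with the
## round-0 key already stored above gives the 11 round keys of AES-128.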
926 Lschedule_128:
927 li r0, 10 # mov \$10, %esi
928 mtctr r0
930 Loop_schedule_128:
931 bl _vpaes_schedule_round
932 bdz Lschedule_mangle_last # dec %esi
933 bl _vpaes_schedule_mangle # write output
934 b Loop_schedule_128
937 ## .aes_schedule_192
939 ## 192-bit specific part of key schedule.
941 ## The main body of this schedule is the same as the 128-bit
942 ## schedule, but with more smearing. The long, high side is
943 ## stored in %xmm7 as before, and the short, low side is in
944 ## the high bits of %xmm6.
946 ## This schedule is somewhat nastier, however, because each
947 ## round produces 192 bits of key material, or 1.5 round keys.
948 ## Therefore, on each cycle we do 2 rounds and produce 3 round
949 ## keys.
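##
## Arithmetic check: the counter is 4 and every trip through the loop
## writes out three more round keys (via _vpaes_schedule_mangle, with
## Lschedule_mangle_last supplying the very last one on the final trip),
## so 4*3 = 12 keys follow the round-0 key, giving the 13 round keys
## AES-192 needs.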
951 .align 4
952 Lschedule_192:
953 li r0, 4 # mov \$4, %esi
954 lvx v0, 0, $inp
955 ?vperm v0, v6, v0, $inpperm
956 ?vsldoi v0, v3, v0, 8 # vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned)
957 bl _vpaes_schedule_transform # input transform
958 ?vsldoi v6, v0, v9, 8
959 ?vsldoi v6, v9, v6, 8 # clobber "low" side with zeros
960 mtctr r0
962 Loop_schedule_192:
963 bl _vpaes_schedule_round
964 ?vsldoi v0, v6, v0, 8 # vpalignr \$8,%xmm6,%xmm0,%xmm0
965 bl _vpaes_schedule_mangle # save key n
966 bl _vpaes_schedule_192_smear
967 bl _vpaes_schedule_mangle # save key n+1
968 bl _vpaes_schedule_round
969 bdz Lschedule_mangle_last # dec %esi
970 bl _vpaes_schedule_mangle # save key n+2
971 bl _vpaes_schedule_192_smear
972 b Loop_schedule_192
975 ## .aes_schedule_256
977 ## 256-bit specific part of key schedule.
979 ## The structure here is very similar to the 128-bit
980 ## schedule, but with an additional "low side" in
981 ## %xmm6. The low side's rounds are the same as the
982 ## high side's, except no rcon and no rotation.
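##
## Here the counter is 7 and each trip emits two round keys, one for the
## "low" half at the top of the loop and one after the high round (the
## final one coming from Lschedule_mangle_last), so 1 + 7*2 = 15 round
## keys, as AES-256 requires.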
984 .align 4
985 Lschedule_256:
986 li r0, 7 # mov \$7, %esi
987 addi $inp, $inp, 8
988 lvx v0, 0, $inp # vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)
989 ?vperm v0, v6, v0, $inpperm
990 bl _vpaes_schedule_transform # input transform
991 mtctr r0
993 Loop_schedule_256:
994 bl _vpaes_schedule_mangle # output low result
995 vmr v6, v0 # vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6
997 # high round
998 bl _vpaes_schedule_round
999 bdz Lschedule_mangle_last # dec %esi
1000 bl _vpaes_schedule_mangle
1002 # low round. swap xmm7 and xmm6
1003 ?vspltw v0, v0, 3 # vpshufd \$0xFF, %xmm0, %xmm0
1004 vmr v5, v7 # vmovdqa %xmm7, %xmm5
1005 vmr v7, v6 # vmovdqa %xmm6, %xmm7
1006 bl _vpaes_schedule_low_round
1007 vmr v7, v5 # vmovdqa %xmm5, %xmm7
1009 b Loop_schedule_256
1011 ## .aes_schedule_mangle_last
1013 ## Mangler for last round of key schedule
1014 ## Mangles %xmm0
1015 ## when encrypting, outputs out(%xmm0) ^ 63
1016 ## when decrypting, outputs unskew(%xmm0)
1018 ## Always called right before return... jumps to cleanup and exits
1020 .align 4
1021 Lschedule_mangle_last:
1022 # schedule last round key from xmm0
1023 li r11, 0x2e0 # lea .Lk_deskew(%rip),%r11
1024 li r9, 0x2f0
1025 bne $dir, Lschedule_mangle_last_dec
1027 # encrypting
1028 lvx v1, r8, r10 # vmovdqa (%r8,%r10),%xmm1
1029 li r11, 0x2c0 # lea .Lk_opt(%rip), %r11 # prepare to output transform
1030 li r9, 0x2d0 # prepare to output transform
1031 vperm v0, v0, v0, v1 # vpshufb %xmm1, %xmm0, %xmm0 # output permute
1033 lvx $iptlo, r11, r12 # reload $ipt
1034 lvx $ipthi, r9, r12
1035 addi $out, $out, 16 # add \$16, %rdx
1036 vxor v0, v0, v26 # vpxor .Lk_s63(%rip), %xmm0, %xmm0
1037 bl _vpaes_schedule_transform # output transform
1039 #stvx v0, r0, $out # vmovdqu %xmm0, (%rdx) # save last key
1040 vperm v0, v0, v0, $outperm # rotate right/left
1041 vsel v2, $outhead, v0, $outmask
1042 vmr $outhead, v0
1043 stvx v2, 0, $out
1045 addi $out, $out, 15 # 15 is not typo
1046 lvx v1, 0, $out # redundant in aligned case
1047 vsel v1, $outhead, v1, $outmask
1048 stvx v1, 0, $out
1049 b Lschedule_mangle_done
1051 .align 4
1052 Lschedule_mangle_last_dec:
1053 lvx $iptlo, r11, r12 # reload $ipt
1054 lvx $ipthi, r9, r12
1055 addi $out, $out, -16 # add \$-16, %rdx
1056 vxor v0, v0, v26 # vpxor .Lk_s63(%rip), %xmm0, %xmm0
1057 bl _vpaes_schedule_transform # output transform
1059 #stvx v0, r0, $out # vmovdqu %xmm0, (%rdx) # save last key
1060 vperm v0, v0, v0, $outperm # rotate right/left
1061 vsel v2, $outhead, v0, $outmask
1062 vmr $outhead, v0
1063 stvx v2, 0, $out
1065 addi $out, $out, -15 # -15 is not typo
1066 lvx v1, 0, $out # redundant in aligned case
1067 vsel v1, $outhead, v1, $outmask
1068 stvx v1, 0, $out
1070 Lschedule_mangle_done:
1071 mtlr r7
1072 # cleanup
1073 vxor v0, v0, v0 # vpxor %xmm0, %xmm0, %xmm0
1074 vxor v1, v1, v1 # vpxor %xmm1, %xmm1, %xmm1
1075 vxor v2, v2, v2 # vpxor %xmm2, %xmm2, %xmm2
1076 vxor v3, v3, v3 # vpxor %xmm3, %xmm3, %xmm3
1077 vxor v4, v4, v4 # vpxor %xmm4, %xmm4, %xmm4
1078 vxor v5, v5, v5 # vpxor %xmm5, %xmm5, %xmm5
1079 vxor v6, v6, v6 # vpxor %xmm6, %xmm6, %xmm6
vxor v7, v7, v7 # vpxor %xmm7, %xmm7, %xmm7
blr
1083 .long 0
1084 .byte 0,12,0x14,0,0,0,0,0
1087 ## .aes_schedule_192_smear
1089 ## Smear the short, low side in the 192-bit key schedule.
1091 ## Inputs:
1092 ## %xmm7: high side, b a x y
1093 ## %xmm6: low side, d c 0 0
1094 ## %xmm13: 0
1096 ## Outputs:
1097 ## %xmm6: b+c+d b+c 0 0
1098 ## %xmm0: b+c+d b+c b a
1100 .align 4
1101 _vpaes_schedule_192_smear:
1102 ?vspltw v0, v7, 3
1103 ?vsldoi v1, v9, v6, 12 # vpshufd \$0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0
1104 ?vsldoi v0, v7, v0, 8 # vpshufd \$0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a
1105 vxor v6, v6, v1 # vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0
1106 vxor v6, v6, v0 # vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a
1107 vmr v0, v6
1108 ?vsldoi v6, v6, v9, 8
?vsldoi v6, v9, v6, 8 # clobber low side with zeros
blr
1111 .long 0
1112 .byte 0,12,0x14,0,0,0,0,0
1115 ## .aes_schedule_round
1117 ## Runs one main round of the key schedule on %xmm0, %xmm7
1119 ## Specifically, runs subbytes on the high dword of %xmm0
1120 ## then rotates it by one byte and xors into the low dword of
1121 ## %xmm7.
1123 ## Adds rcon from low byte of %xmm8, then rotates %xmm8 for
1124 ## next rcon.
1126 ## Smears the dwords of %xmm7 by xoring the low into the
1127 ## second low, result into third, result into highest.
1129 ## Returns results in %xmm7 = %xmm0.
1130 ## Clobbers %xmm1-%xmm4, %r11.
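##
## For orientation, the classical computation being performed is the
## standard AES-128 expansion step, sketched below in C (vpaes does the
## same thing in its transformed basis with vperm lookups; SubWord and
## RotWord are left abstract and byte order is glossed over):
##
##	t = SubWord(RotWord(prev[3])) ^ rcon;
##	next[0] = prev[0] ^ t;
##	next[1] = prev[1] ^ next[0];   /* the "smear" */
##	next[2] = prev[2] ^ next[1];
##	next[3] = prev[3] ^ next[2];
##
## _vpaes_schedule_low_round below is the same computation minus the
## rotation and the rcon, as used for the 256-bit schedule's low half.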
1132 .align 4
1133 _vpaes_schedule_round:
1134 # extract rcon from xmm8
1135 #vxor v4, v4, v4 # vpxor %xmm4, %xmm4, %xmm4
1136 ?vsldoi v1, $rcon, v9, 15 # vpalignr \$15, %xmm8, %xmm4, %xmm1
1137 ?vsldoi $rcon, $rcon, $rcon, 15 # vpalignr \$15, %xmm8, %xmm8, %xmm8
1138 vxor v7, v7, v1 # vpxor %xmm1, %xmm7, %xmm7
1140 # rotate
1141 ?vspltw v0, v0, 3 # vpshufd \$0xFF, %xmm0, %xmm0
1142 ?vsldoi v0, v0, v0, 1 # vpalignr \$1, %xmm0, %xmm0, %xmm0
1144 # fall through...
1146 # low round: same as high round, but no rotation and no rcon.
1147 _vpaes_schedule_low_round:
1148 # smear xmm7
1149 ?vsldoi v1, v9, v7, 12 # vpslldq \$4, %xmm7, %xmm1
1150 vxor v7, v7, v1 # vpxor %xmm1, %xmm7, %xmm7
1151 vspltisb v1, 0x0f # 0x0f..0f
1152 ?vsldoi v4, v9, v7, 8 # vpslldq \$8, %xmm7, %xmm4
1154 # subbytes
1155 vand v1, v1, v0 # vpand %xmm9, %xmm0, %xmm1 # 0 = k
1156 vsrb v0, v0, v8 # vpsrlb \$4, %xmm0, %xmm0 # 1 = i
1157 vxor v7, v7, v4 # vpxor %xmm4, %xmm7, %xmm7
1158 vperm v2, $invhi, v9, v1 # vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
1159 vxor v1, v1, v0 # vpxor %xmm0, %xmm1, %xmm1 # 0 = j
1160 vperm v3, $invlo, v9, v0 # vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
1161 vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
1162 vperm v4, $invlo, v9, v1 # vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
1163 vxor v7, v7, v26 # vpxor .Lk_s63(%rip), %xmm7, %xmm7
1164 vperm v3, $invlo, v9, v3 # vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak
1165 vxor v4, v4, v2 # vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
1166 vperm v2, $invlo, v9, v4 # vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak
1167 vxor v3, v3, v1 # vpxor %xmm1, %xmm3, %xmm3 # 2 = io
1168 vxor v2, v2, v0 # vpxor %xmm0, %xmm2, %xmm2 # 3 = jo
1169 vperm v4, v15, v9, v3 # vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou
1170 vperm v1, v14, v9, v2 # vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t
1171 vxor v1, v1, v4 # vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output
1173 # add in smeared stuff
1174 vxor v0, v1, v7 # vpxor %xmm7, %xmm1, %xmm0
vxor v7, v1, v7 # vmovdqa %xmm0, %xmm7
blr
1177 .long 0
1178 .byte 0,12,0x14,0,0,0,0,0
1181 ## .aes_schedule_transform
1183 ## Linear-transform %xmm0 according to tables at (%r11)
1185 ## Requires that %xmm9 = 0x0F0F... as in preheat
1186 ## Output in %xmm0
1187 ## Clobbers %xmm2
1189 .align 4
1190 _vpaes_schedule_transform:
1191 #vand v1, v0, v9 # vpand %xmm9, %xmm0, %xmm1
1192 vsrb v2, v0, v8 # vpsrlb \$4, %xmm0, %xmm0
1193 # vmovdqa (%r11), %xmm2 # lo
1194 vperm v0, $iptlo, $iptlo, v0 # vpshufb %xmm1, %xmm2, %xmm2
1195 # vmovdqa 16(%r11), %xmm1 # hi
1196 vperm v2, $ipthi, $ipthi, v2 # vpshufb %xmm0, %xmm1, %xmm0
vxor v0, v0, v2 # vpxor %xmm2, %xmm0, %xmm0
blr
1199 .long 0
1200 .byte 0,12,0x14,0,0,0,0,0
1203 ## .aes_schedule_mangle
1205 ## Mangle xmm0 from (basis-transformed) standard version
1206 ## to our version.
1208 ## On encrypt,
1209 ## xor with 0x63
1210 ## multiply by circulant 0,1,1,1
1211 ## apply shiftrows transform
1213 ## On decrypt,
1214 ## xor with 0x63
1215 ## multiply by "inverse mixcolumns" circulant E,B,D,9
1216 ## deskew
1217 ## apply shiftrows transform
1220 ## Writes out to (%rdx), and increments or decrements it
1221 ## Keeps track of round number mod 4 in %r8
1222 ## Preserves xmm0
1223 ## Clobbers xmm1-xmm5
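##
## PPC mapping for the x86 comments above: the (%rdx) output pointer is
## r5 here (written through the usual vperm/vsel unaligned-store
## machinery), the round-number-mod-4 state in %r8 lives in r8 as well
## and indexes a row of Lk_sr via r10, and v25/v26 hold Lk_mc_forward[0]
## and Lk_s63, loaded once by _vpaes_key_preheat.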
1225 .align 4
1226 _vpaes_schedule_mangle:
1227 #vmr v4, v0 # vmovdqa %xmm0, %xmm4 # save xmm0 for later
1228 # vmovdqa .Lk_mc_forward(%rip),%xmm5
1229 bne $dir, Lschedule_mangle_dec
1231 # encrypting
1232 vxor v4, v0, v26 # vpxor .Lk_s63(%rip), %xmm0, %xmm4
1233 addi $out, $out, 16 # add \$16, %rdx
1234 vperm v4, v4, v4, v25 # vpshufb %xmm5, %xmm4, %xmm4
1235 vperm v1, v4, v4, v25 # vpshufb %xmm5, %xmm4, %xmm1
1236 vperm v3, v1, v1, v25 # vpshufb %xmm5, %xmm1, %xmm3
1237 vxor v4, v4, v1 # vpxor %xmm1, %xmm4, %xmm4
1238 lvx v1, r8, r10 # vmovdqa (%r8,%r10), %xmm1
1239 vxor v3, v3, v4 # vpxor %xmm4, %xmm3, %xmm3
1241 vperm v3, v3, v3, v1 # vpshufb %xmm1, %xmm3, %xmm3
1242 addi r8, r8, -16 # add \$-16, %r8
1243 andi. r8, r8, 0x30 # and \$0x30, %r8
1245 #stvx v3, 0, $out # vmovdqu %xmm3, (%rdx)
1246 vperm v1, v3, v3, $outperm # rotate right/left
1247 vsel v2, $outhead, v1, $outmask
1248 vmr $outhead, v1
stvx v2, 0, $out
blr
1252 .align 4
1253 Lschedule_mangle_dec:
1254 # inverse mix columns
1255 # lea .Lk_dksd(%rip),%r11
1256 vsrb v1, v0, v8 # vpsrlb \$4, %xmm4, %xmm1 # 1 = hi
1257 #and v4, v0, v9 # vpand %xmm9, %xmm4, %xmm4 # 4 = lo
1259 # vmovdqa 0x00(%r11), %xmm2
1260 vperm v2, v16, v16, v0 # vpshufb %xmm4, %xmm2, %xmm2
1261 # vmovdqa 0x10(%r11), %xmm3
1262 vperm v3, v17, v17, v1 # vpshufb %xmm1, %xmm3, %xmm3
1263 vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3
1264 vperm v3, v3, v9, v25 # vpshufb %xmm5, %xmm3, %xmm3
1266 # vmovdqa 0x20(%r11), %xmm2
1267 vperm v2, v18, v18, v0 # vpshufb %xmm4, %xmm2, %xmm2
1268 vxor v2, v2, v3 # vpxor %xmm3, %xmm2, %xmm2
1269 # vmovdqa 0x30(%r11), %xmm3
1270 vperm v3, v19, v19, v1 # vpshufb %xmm1, %xmm3, %xmm3
1271 vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3
1272 vperm v3, v3, v9, v25 # vpshufb %xmm5, %xmm3, %xmm3
1274 # vmovdqa 0x40(%r11), %xmm2
1275 vperm v2, v20, v20, v0 # vpshufb %xmm4, %xmm2, %xmm2
1276 vxor v2, v2, v3 # vpxor %xmm3, %xmm2, %xmm2
1277 # vmovdqa 0x50(%r11), %xmm3
1278 vperm v3, v21, v21, v1 # vpshufb %xmm1, %xmm3, %xmm3
1279 vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3
1281 # vmovdqa 0x60(%r11), %xmm2
1282 vperm v2, v22, v22, v0 # vpshufb %xmm4, %xmm2, %xmm2
1283 vperm v3, v3, v9, v25 # vpshufb %xmm5, %xmm3, %xmm3
1284 # vmovdqa 0x70(%r11), %xmm4
1285 vperm v4, v23, v23, v1 # vpshufb %xmm1, %xmm4, %xmm4
1286 lvx v1, r8, r10 # vmovdqa (%r8,%r10), %xmm1
1287 vxor v2, v2, v3 # vpxor %xmm3, %xmm2, %xmm2
1288 vxor v3, v4, v2 # vpxor %xmm2, %xmm4, %xmm3
1290 addi $out, $out, -16 # add \$-16, %rdx
1292 vperm v3, v3, v3, v1 # vpshufb %xmm1, %xmm3, %xmm3
1293 addi r8, r8, -16 # add \$-16, %r8
1294 andi. r8, r8, 0x30 # and \$0x30, %r8
1296 #stvx v3, 0, $out # vmovdqu %xmm3, (%rdx)
1297 vperm v1, v3, v3, $outperm # rotate right/left
1298 vsel v2, $outhead, v1, $outmask
1299 vmr $outhead, v1
stvx v2, 0, $out
blr
1302 .long 0
1303 .byte 0,12,0x14,0,0,0,0,0
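##
## The two key-setup entry points are declared on the C side roughly as
##
##	int vpaes_set_encrypt_key(const unsigned char *userKey, int bits,
##	                          AES_KEY *key);
##	int vpaes_set_decrypt_key(const unsigned char *userKey, int bits,
##	                          AES_KEY *key);
##
## Both return 0 (r3 is cleared before returning) and store bits/32 + 6,
## i.e. 10/12/14, into key->rounds; the "add \$5" / "nbits/32+5" comments
## alongside are carried over from vpaes-x86_64.pl and describe the x86
## code, not the value computed here.  The decrypt variant additionally
## advances the output pointer to the end of the schedule, since the
## decryption schedule is written backwards.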
1305 .globl .vpaes_set_encrypt_key
1306 .align 5
1307 .vpaes_set_encrypt_key:
1308 $STU $sp,-$FRAME($sp)
1309 li r10,`15+6*$SIZE_T`
1310 li r11,`31+6*$SIZE_T`
1311 mflr r0
1312 mfspr r6, 256 # save vrsave
1313 stvx v20,r10,$sp
1314 addi r10,r10,32
1315 stvx v21,r11,$sp
1316 addi r11,r11,32
1317 stvx v22,r10,$sp
1318 addi r10,r10,32
1319 stvx v23,r11,$sp
1320 addi r11,r11,32
1321 stvx v24,r10,$sp
1322 addi r10,r10,32
1323 stvx v25,r11,$sp
1324 addi r11,r11,32
1325 stvx v26,r10,$sp
1326 addi r10,r10,32
1327 stvx v27,r11,$sp
1328 addi r11,r11,32
1329 stvx v28,r10,$sp
1330 addi r10,r10,32
1331 stvx v29,r11,$sp
1332 addi r11,r11,32
1333 stvx v30,r10,$sp
1334 stvx v31,r11,$sp
1335 stw r6,`$FRAME-4`($sp) # save vrsave
1336 li r7, -1
1337 $PUSH r0, `$FRAME+$LRSAVE`($sp)
1338 mtspr 256, r7 # preserve all AltiVec registers
1340 srwi r9, $bits, 5 # shr \$5,%eax
1341 addi r9, r9, 6 # add \$5,%eax
1342 stw r9, 240($out) # mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
1344 cmplw $dir, $bits, $bits # set encrypt direction
1345 li r8, 0x30 # mov \$0x30,%r8d
1346 bl _vpaes_schedule_core
1348 $POP r0, `$FRAME+$LRSAVE`($sp)
1349 li r10,`15+6*$SIZE_T`
1350 li r11,`31+6*$SIZE_T`
1351 mtspr 256, r6 # restore vrsave
1352 mtlr r0
1353 xor r3, r3, r3
1354 lvx v20,r10,$sp
1355 addi r10,r10,32
1356 lvx v21,r11,$sp
1357 addi r11,r11,32
1358 lvx v22,r10,$sp
1359 addi r10,r10,32
1360 lvx v23,r11,$sp
1361 addi r11,r11,32
1362 lvx v24,r10,$sp
1363 addi r10,r10,32
1364 lvx v25,r11,$sp
1365 addi r11,r11,32
1366 lvx v26,r10,$sp
1367 addi r10,r10,32
1368 lvx v27,r11,$sp
1369 addi r11,r11,32
1370 lvx v28,r10,$sp
1371 addi r10,r10,32
1372 lvx v29,r11,$sp
1373 addi r11,r11,32
1374 lvx v30,r10,$sp
1375 lvx v31,r11,$sp
addi $sp,$sp,$FRAME
blr
1378 .long 0
1379 .byte 0,12,0x04,1,0x80,0,3,0
1380 .long 0
1381 .size .vpaes_set_encrypt_key,.-.vpaes_set_encrypt_key
1383 .globl .vpaes_set_decrypt_key
1384 .align 4
1385 .vpaes_set_decrypt_key:
1386 $STU $sp,-$FRAME($sp)
1387 li r10,`15+6*$SIZE_T`
1388 li r11,`31+6*$SIZE_T`
1389 mflr r0
1390 mfspr r6, 256 # save vrsave
1391 stvx v20,r10,$sp
1392 addi r10,r10,32
1393 stvx v21,r11,$sp
1394 addi r11,r11,32
1395 stvx v22,r10,$sp
1396 addi r10,r10,32
1397 stvx v23,r11,$sp
1398 addi r11,r11,32
1399 stvx v24,r10,$sp
1400 addi r10,r10,32
1401 stvx v25,r11,$sp
1402 addi r11,r11,32
1403 stvx v26,r10,$sp
1404 addi r10,r10,32
1405 stvx v27,r11,$sp
1406 addi r11,r11,32
1407 stvx v28,r10,$sp
1408 addi r10,r10,32
1409 stvx v29,r11,$sp
1410 addi r11,r11,32
1411 stvx v30,r10,$sp
1412 stvx v31,r11,$sp
1413 stw r6,`$FRAME-4`($sp) # save vrsave
1414 li r7, -1
1415 $PUSH r0, `$FRAME+$LRSAVE`($sp)
1416 mtspr 256, r7 # preserve all AltiVec registers
1418 srwi r9, $bits, 5 # shr \$5,%eax
1419 addi r9, r9, 6 # add \$5,%eax
1420 stw r9, 240($out) # mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
1422 slwi r9, r9, 4 # shl \$4,%eax
1423 add $out, $out, r9 # lea (%rdx,%rax),%rdx
1425 cmplwi $dir, $bits, 0 # set decrypt direction
1426 srwi r8, $bits, 1 # shr \$1,%r8d
1427 andi. r8, r8, 32 # and \$32,%r8d
1428 xori r8, r8, 32 # xor \$32,%r8d # nbits==192?0:32
1429 bl _vpaes_schedule_core
1431 $POP r0, `$FRAME+$LRSAVE`($sp)
1432 li r10,`15+6*$SIZE_T`
1433 li r11,`31+6*$SIZE_T`
1434 mtspr 256, r6 # restore vrsave
1435 mtlr r0
1436 xor r3, r3, r3
1437 lvx v20,r10,$sp
1438 addi r10,r10,32
1439 lvx v21,r11,$sp
1440 addi r11,r11,32
1441 lvx v22,r10,$sp
1442 addi r10,r10,32
1443 lvx v23,r11,$sp
1444 addi r11,r11,32
1445 lvx v24,r10,$sp
1446 addi r10,r10,32
1447 lvx v25,r11,$sp
1448 addi r11,r11,32
1449 lvx v26,r10,$sp
1450 addi r10,r10,32
1451 lvx v27,r11,$sp
1452 addi r11,r11,32
1453 lvx v28,r10,$sp
1454 addi r10,r10,32
1455 lvx v29,r11,$sp
1456 addi r11,r11,32
1457 lvx v30,r10,$sp
1458 lvx v31,r11,$sp
addi $sp,$sp,$FRAME
blr
1461 .long 0
1462 .byte 0,12,0x04,1,0x80,0,3,0
1463 .long 0
1464 .size .vpaes_set_decrypt_key,.-.vpaes_set_decrypt_key
___

my $consts=1;
foreach (split("\n",$code)) {
    s/\`([^\`]*)\`/eval $1/geo;

    # constants table endian-specific conversion
    if ($consts && m/\.long\s+(.+)\s+(\?[a-z]*)$/o) {
        my $conv=$2;
        my @bytes=();

        # convert to endian-agnostic format
        foreach (split(/,\s+/,$1)) {
            my $l = /^0/?oct:int;
            push @bytes,($l>>24)&0xff,($l>>16)&0xff,($l>>8)&0xff,$l&0xff;
        }

        # little-endian conversion
        if ($flavour =~ /le$/o) {
            SWITCH: for($conv) {
                /\?inv/ && do { @bytes=map($_^0xf,@bytes); last; };
                /\?rev/ && do { @bytes=reverse(@bytes); last; };
            }
        }

        #emit
        print ".byte\t",join(',',map (sprintf("0x%02x",$_),@bytes)),"\n";
        next;
    }
    $consts=0 if (m/Lconsts:/o);	# end of table

    # instructions prefixed with '?' are endian-specific and need
    # to be adjusted accordingly...
    if ($flavour =~ /le$/o) {	# little-endian
        s/\?lvsr/lvsl/o or
        s/\?lvsl/lvsr/o or
        s/\?(vperm\s+v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+)/$1$3$2$4/o or
        s/\?(vsldoi\s+v[0-9]+,\s*)(v[0-9]+,)\s*(v[0-9]+,\s*)([0-9]+)/$1$3$2 16-$4/o or
        s/\?(vspltw\s+v[0-9]+,\s*)(v[0-9]+,)\s*([0-9])/$1$2 3-$3/o;
    } else {			# big-endian
        s/\?([a-z]+)/$1/o;
    }

    print $_,"\n";
}

close STDOUT;