/*
 * Implement the AES algorithm using Intel AES-NI instructions.
 *
 * The white paper of AES-NI instructions can be downloaded from:
 *   http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
 *
 * Copyright (C) 2008, Intel Corp.
 *    Author: Huang Ying <ying.huang@intel.com>
 *            Vinodh Gopal <vinodh.gopal@intel.com>
 *
 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
 * interface for 64-bit kernels.
 *    Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
 *             Aidan O'Mahony (aidan.o.mahony@intel.com)
 *             Adrian Hoban <adrian.hoban@intel.com>
 *             James Guilford (james.guilford@intel.com)
 *             Gabriele Paoloni <gabriele.paoloni@intel.com>
 *             Tadeusz Struk (tadeusz.struk@intel.com)
 *             Wajdi Feghali (wajdi.k.feghali@intel.com)
 *    Copyright (c) 2010, Intel Corporation.
 *
 * Ported x86_64 version to x86:
 *    Author: Mathias Krause <minipli@googlemail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */
#include <linux/linkage.h>

POLY:        .octa 0xC2000000000000000000000000000001
TWOONE:      .octa 0x00000001000000000000000000000001

# order of these constants should not change.
# more specifically, ALL_F should follow SHIFT_MASK,
# and ZERO should follow ALL_F
SHUF_MASK:   .octa 0x000102030405060708090A0B0C0D0E0F
MASK1:       .octa 0x0000000000000000ffffffffffffffff
MASK2:       .octa 0xffffffffffffffff0000000000000000
SHIFT_MASK:  .octa 0x0f0e0d0c0b0a09080706050403020100
ALL_F:       .octa 0xffffffffffffffffffffffffffffffff
ZERO:        .octa 0x00000000000000000000000000000000
ONE:         .octa 0x00000000000000000000000000000001
F_MIN_MASK:  .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
#define STACK_OFFSET    8*3
#define HashKey         16*0    // store HashKey <<1 mod poly here
#define HashKey_2       16*1    // store HashKey^2 <<1 mod poly here
#define HashKey_3       16*2    // store HashKey^3 <<1 mod poly here
#define HashKey_4       16*3    // store HashKey^4 <<1 mod poly here
#define HashKey_k       16*4    // store XOR of High 64 bits and Low 64
                                // bits of HashKey <<1 mod poly here
                                // (for Karatsuba purposes)
#define HashKey_2_k     16*5    // store XOR of High 64 bits and Low 64
                                // bits of HashKey^2 <<1 mod poly here
                                // (for Karatsuba purposes)
#define HashKey_3_k     16*6    // store XOR of High 64 bits and Low 64
                                // bits of HashKey^3 <<1 mod poly here
                                // (for Karatsuba purposes)
#define HashKey_4_k     16*7    // store XOR of High 64 bits and Low 64
                                // bits of HashKey^4 <<1 mod poly here
                                // (for Karatsuba purposes)
#define VARIABLE_OFFSET 16*8
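/*
 * For orientation, the scratch area reserved above can be pictured as the
 * following C struct (a sketch; the struct and field names are hypothetical,
 * only the offsets mirror the #defines):
 *
 *      #include <stdint.h>
 *
 *      struct gcm_hash_scratch {               // one 16-byte vector each
 *              uint8_t hash_key[16];           // HashKey <<1 mod poly
 *              uint8_t hash_key_2[16];         // HashKey^2 <<1 mod poly
 *              uint8_t hash_key_3[16];         // HashKey^3 <<1 mod poly
 *              uint8_t hash_key_4[16];         // HashKey^4 <<1 mod poly
 *              uint8_t hash_key_k[16];         // hi64 XOR lo64 of HashKey
 *              uint8_t hash_key_2_k[16];       // ... of HashKey^2 (Karatsuba)
 *              uint8_t hash_key_3_k[16];       // ... of HashKey^3
 *              uint8_t hash_key_4_k[16];       // ... of HashKey^4
 *      };                                      // sizeof == VARIABLE_OFFSET
 */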
#define arg7  STACK_OFFSET+8(%r14)
#define arg8  STACK_OFFSET+16(%r14)
#define arg9  STACK_OFFSET+24(%r14)
#define arg10 STACK_OFFSET+32(%r14)

#define BSWAP_MASK %xmm10
/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
 *
 * Input: A and B (128-bits each, bit-reflected)
 * Output: C = A*B*x mod poly, (i.e. >>1)
 * To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input.
 * GH = GH * HK * x mod poly, which is equivalent to GH*HashKey mod poly.
 */
.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
        pshufd    $78, \GH, \TMP2
        pshufd    $78, \HK, \TMP3
        pxor      \GH, \TMP2            # TMP2 = a1+a0
        pxor      \HK, \TMP3            # TMP3 = b1+b0
        PCLMULQDQ 0x11, \HK, \TMP1      # TMP1 = a1*b1
        PCLMULQDQ 0x00, \HK, \GH        # GH = a0*b0
        PCLMULQDQ 0x00, \TMP3, \TMP2    # TMP2 = (a1+a0)*(b1+b0)
        pxor      \TMP1, \TMP2          # TMP2 = (a0*b1)+(a1*b0)
        pslldq    $8, \TMP3             # left shift TMP3 2 DWs
        psrldq    $8, \TMP2             # right shift TMP2 2 DWs
        pxor      \TMP2, \TMP1          # TMP1:GH holds the result of GH*HK

        # first phase of the reduction
        movdqa    \GH, \TMP4            # copy GH into TMP2, TMP3 and TMP4
                                        # in order to perform 3 shifts
                                        # independently
        pslld     $31, \TMP2            # packed left shift <<31
        pslld     $30, \TMP3            # packed left shift <<30
        pslld     $25, \TMP4            # packed left shift <<25
        pxor      \TMP3, \TMP2          # xor the shifted versions
        psrldq    $4, \TMP5             # right shift TMP5 1 DW
        pslldq    $12, \TMP2            # left shift TMP2 3 DWs

        # second phase of the reduction
        movdqa    \GH, \TMP2            # copy GH into TMP2, TMP3 and TMP4
                                        # in order to perform 3 shifts
                                        # independently
        psrld     $1, \TMP2             # packed right shift >>1
        psrld     $2, \TMP3             # packed right shift >>2
        psrld     $7, \TMP4             # packed right shift >>7
        pxor      \TMP3, \TMP2          # xor the shifted versions
        pxor      \TMP1, \GH            # result is in GH
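/*
 * For reference, a C intrinsics transcription of GHASH_MUL (a sketch, not
 * part of the kernel build; compile with -mpclmul -msse2; ghash_mul() is a
 * name chosen here).  It mirrors the Karatsuba multiply and the two-phase
 * reduction above, operating on bit-reflected operands:
 *
 *      #include <emmintrin.h>      // SSE2
 *      #include <wmmintrin.h>      // PCLMULQDQ
 *
 *      static __m128i ghash_mul(__m128i gh, __m128i hk)
 *      {
 *              __m128i hi  = _mm_clmulepi64_si128(gh, hk, 0x11); // a1*b1
 *              __m128i lo  = _mm_clmulepi64_si128(gh, hk, 0x00); // a0*b0
 *              __m128i a10 = _mm_xor_si128(_mm_shuffle_epi32(gh, 78), gh);
 *              __m128i b10 = _mm_xor_si128(_mm_shuffle_epi32(hk, 78), hk);
 *              __m128i mid = _mm_clmulepi64_si128(a10, b10, 0x00);
 *              mid = _mm_xor_si128(mid, _mm_xor_si128(lo, hi));  // a0*b1^a1*b0
 *              lo  = _mm_xor_si128(lo, _mm_slli_si128(mid, 8));
 *              hi  = _mm_xor_si128(hi, _mm_srli_si128(mid, 8));
 *
 *              // first phase of the reduction
 *              __m128i t = _mm_xor_si128(_mm_slli_epi32(lo, 31),
 *                          _mm_xor_si128(_mm_slli_epi32(lo, 30),
 *                                        _mm_slli_epi32(lo, 25)));
 *              __m128i carry = _mm_srli_si128(t, 4);  // fed to second phase
 *              lo = _mm_xor_si128(lo, _mm_slli_si128(t, 12));
 *
 *              // second phase of the reduction
 *              t = _mm_xor_si128(_mm_srli_epi32(lo, 1),
 *                  _mm_xor_si128(_mm_srli_epi32(lo, 2),
 *                                _mm_srli_epi32(lo, 7)));
 *              lo = _mm_xor_si128(lo, _mm_xor_si128(t, carry));
 *              return _mm_xor_si128(lo, hi);          // bit-reflected result
 *      }
 */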
/*
 * if a = number of total plaintext bytes
 *    b = floor(a/16)
 *    num_initial_blocks = b mod 4
 * encrypt the initial num_initial_blocks blocks and apply ghash on
 * the ciphertext
 * %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
 * %arg1, %arg2, %arg3, %r14 are used as pointers only, not modified
 */
.macro INITIAL_BLOCKS num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
        mov       arg7, %r10            # %r10 = AAD
        mov       arg8, %r12            # %r12 = aadLen
_get_AAD_loop\num_initial_blocks\operation:
        jne       _get_AAD_loop\num_initial_blocks\operation
        je        _get_AAD_loop2_done\num_initial_blocks\operation
_get_AAD_loop2\num_initial_blocks\operation:
        jne       _get_AAD_loop2\num_initial_blocks\operation
_get_AAD_loop2_done\num_initial_blocks\operation:
        pshufb    SHUF_MASK(%rip), %xmm\i   # byte-reflect the AAD data
        xor       %r11, %r11            # initialise the data pointer offset as zero

        # start AES for num_initial_blocks blocks

        mov       %arg5, %rax           # %rax = *Y0
        movdqu    (%rax), \XMM0         # XMM0 = Y0
        pshufb    SHUF_MASK(%rip), \XMM0
        paddd     ONE(%rip), \XMM0      # INCR Y0
        movdqa    \XMM0, %xmm\index
        pshufb    SHUF_MASK(%rip), %xmm\index   # perform a 16 byte swap
        pxor      16*0(%arg1), %xmm\index
        movaps    0x10(%arg1), \TMP1
        AESENC    \TMP1, %xmm\index     # Round 1
        movaps    0x20(%arg1), \TMP1
        AESENC    \TMP1, %xmm\index     # Round 2
        movaps    0x30(%arg1), \TMP1
        AESENC    \TMP1, %xmm\index     # Round 3
        movaps    0x40(%arg1), \TMP1
        AESENC    \TMP1, %xmm\index     # Round 4
        movaps    0x50(%arg1), \TMP1
        AESENC    \TMP1, %xmm\index     # Round 5
        movaps    0x60(%arg1), \TMP1
        AESENC    \TMP1, %xmm\index     # Round 6
        movaps    0x70(%arg1), \TMP1
        AESENC    \TMP1, %xmm\index     # Round 7
        movaps    0x80(%arg1), \TMP1
        AESENC    \TMP1, %xmm\index     # Round 8
        movaps    0x90(%arg1), \TMP1
        AESENC    \TMP1, %xmm\index     # Round 9
        movaps    0xa0(%arg1), \TMP1
        AESENCLAST \TMP1, %xmm\index    # Round 10
        movdqu    (%arg3, %r11, 1), \TMP1
        pxor      \TMP1, %xmm\index
        movdqu    %xmm\index, (%arg2, %r11, 1)
        # write back plaintext/ciphertext for num_initial_blocks
.if \operation == dec
        movdqa    \TMP1, %xmm\index
        pshufb    SHUF_MASK(%rip), %xmm\index
        # prepare plaintext/ciphertext for GHASH computation
        GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
        # apply GHASH on num_initial_blocks blocks
        GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
        GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
        GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
        GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
        GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
        GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
        jl        _initial_blocks_done\num_initial_blocks\operation
        # no need for precomputed values
/*
 * Precomputations for HashKey parallel with encryption of first 4 blocks.
 * HashKey_i_k holds XORed values of the low and high parts of the HashKey_i.
 */
        paddd     ONE(%rip), \XMM0      # INCR Y0
        pshufb    SHUF_MASK(%rip), \XMM1    # perform a 16 byte swap
        paddd     ONE(%rip), \XMM0      # INCR Y0
        pshufb    SHUF_MASK(%rip), \XMM2    # perform a 16 byte swap
        paddd     ONE(%rip), \XMM0      # INCR Y0
        pshufb    SHUF_MASK(%rip), \XMM3    # perform a 16 byte swap
        paddd     ONE(%rip), \XMM0      # INCR Y0
        pshufb    SHUF_MASK(%rip), \XMM4    # perform a 16 byte swap
        pxor      16*0(%arg1), \XMM1
        pxor      16*0(%arg1), \XMM2
        pxor      16*0(%arg1), \XMM3
        pxor      16*0(%arg1), \XMM4
        pshufd    $78, \TMP3, \TMP1
        movdqa    \TMP1, HashKey_k(%rsp)
        GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
                                        # TMP5 = HashKey^2<<1 (mod poly)
        movdqa    \TMP5, HashKey_2(%rsp)
                                        # HashKey_2 = HashKey^2<<1 (mod poly)
        pshufd    $78, \TMP5, \TMP1
        movdqa    \TMP1, HashKey_2_k(%rsp)
.irpc index, 1234 # do 4 rounds
        movaps    0x10*\index(%arg1), \TMP1
        GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
                                        # TMP5 = HashKey^3<<1 (mod poly)
        movdqa    \TMP5, HashKey_3(%rsp)
        pshufd    $78, \TMP5, \TMP1
        movdqa    \TMP1, HashKey_3_k(%rsp)
.irpc index, 56789 # do next 5 rounds
        movaps    0x10*\index(%arg1), \TMP1
        GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
                                        # TMP5 = HashKey^4<<1 (mod poly)
        movdqa    \TMP5, HashKey_4(%rsp)
        pshufd    $78, \TMP5, \TMP1
        movdqa    \TMP1, HashKey_4_k(%rsp)
        movaps    0xa0(%arg1), \TMP2
        AESENCLAST \TMP2, \XMM1
        AESENCLAST \TMP2, \XMM2
        AESENCLAST \TMP2, \XMM3
        AESENCLAST \TMP2, \XMM4
        movdqu    16*0(%arg3, %r11, 1), \TMP1
.if \operation == dec
        movdqu    \XMM1, 16*0(%arg2, %r11, 1)
        movdqu    16*1(%arg3, %r11, 1), \TMP1
.if \operation == dec
        movdqu    \XMM2, 16*1(%arg2, %r11, 1)
        movdqu    16*2(%arg3, %r11, 1), \TMP1
.if \operation == dec
        movdqu    \XMM3, 16*2(%arg2, %r11, 1)
        movdqu    16*3(%arg3, %r11, 1), \TMP1
.if \operation == dec
        movdqu    \XMM4, 16*3(%arg2, %r11, 1)
        movdqu    \XMM1, 16*0(%arg2, %r11, 1)
        movdqu    \XMM2, 16*1(%arg2, %r11, 1)
        movdqu    \XMM3, 16*2(%arg2, %r11, 1)
        movdqu    \XMM4, 16*3(%arg2, %r11, 1)
        pshufb    SHUF_MASK(%rip), \XMM1    # perform a 16 byte swap
        # combine GHASHed value with the corresponding ciphertext
        pshufb    SHUF_MASK(%rip), \XMM2    # perform a 16 byte swap
        pshufb    SHUF_MASK(%rip), \XMM3    # perform a 16 byte swap
        pshufb    SHUF_MASK(%rip), \XMM4    # perform a 16 byte swap
_initial_blocks_done\num_initial_blocks\operation:
/*
 * encrypt 4 blocks at a time
 * ghash the 4 previously encrypted ciphertext blocks
 * %arg1, %arg2, %arg3 are used as pointers only, not modified
 * %r11 is the data offset value
 */
.macro GHASH_4_ENCRYPT_4_PARALLEL TMP1 TMP2 TMP3 TMP4 TMP5 \
TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
        # multiply TMP5 * HashKey using Karatsuba

        pshufd    $78, \XMM5, \TMP6
        paddd     ONE(%rip), \XMM0      # INCR CNT
        movdqa    HashKey_4(%rsp), \TMP5
        PCLMULQDQ 0x11, \TMP5, \TMP4    # TMP4 = a1*b1
        paddd     ONE(%rip), \XMM0      # INCR CNT
        paddd     ONE(%rip), \XMM0      # INCR CNT
        paddd     ONE(%rip), \XMM0      # INCR CNT
        pshufb    SHUF_MASK(%rip), \XMM1    # perform a 16 byte swap
        PCLMULQDQ 0x00, \TMP5, \XMM5    # XMM5 = a0*b0
        pshufb    SHUF_MASK(%rip), \XMM2    # perform a 16 byte swap
        pshufb    SHUF_MASK(%rip), \XMM3    # perform a 16 byte swap
        pshufb    SHUF_MASK(%rip), \XMM4    # perform a 16 byte swap
        movdqa    HashKey_4_k(%rsp), \TMP5
        PCLMULQDQ 0x00, \TMP5, \TMP6    # TMP6 = (a1+a0)*(b1+b0)
        movaps    0x10(%arg1), \TMP1
        AESENC    \TMP1, \XMM1          # Round 1
        movaps    0x20(%arg1), \TMP1
        AESENC    \TMP1, \XMM1          # Round 2
        pshufd    $78, \XMM6, \TMP2
        movdqa    HashKey_3(%rsp), \TMP5
        PCLMULQDQ 0x11, \TMP5, \TMP1    # TMP1 = a1 * b1
        movaps    0x30(%arg1), \TMP3
        AESENC    \TMP3, \XMM1          # Round 3
        PCLMULQDQ 0x00, \TMP5, \XMM6    # XMM6 = a0*b0
        movaps    0x40(%arg1), \TMP3
        AESENC    \TMP3, \XMM1          # Round 4
        movdqa    HashKey_3_k(%rsp), \TMP5
        PCLMULQDQ 0x00, \TMP5, \TMP2    # TMP2 = (a1+a0)*(b1+b0)
        movaps    0x50(%arg1), \TMP3
        AESENC    \TMP3, \XMM1          # Round 5
        # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
        pshufd    $78, \XMM7, \TMP2
        movdqa    HashKey_2(%rsp), \TMP5

        # Multiply TMP5 * HashKey using Karatsuba

        PCLMULQDQ 0x11, \TMP5, \TMP1    # TMP1 = a1*b1
        movaps    0x60(%arg1), \TMP3
        AESENC    \TMP3, \XMM1          # Round 6
        PCLMULQDQ 0x00, \TMP5, \XMM7    # XMM7 = a0*b0
        movaps    0x70(%arg1), \TMP3
        AESENC    \TMP3, \XMM1          # Round 7
        movdqa    HashKey_2_k(%rsp), \TMP5
        PCLMULQDQ 0x00, \TMP5, \TMP2    # TMP2 = (a1+a0)*(b1+b0)
        movaps    0x80(%arg1), \TMP3
        AESENC    \TMP3, \XMM1          # Round 8
        # accumulate the results in TMP4:XMM5, TMP6 holds the middle part

        # Multiply XMM8 * HashKey
        # XMM8 and TMP5 hold the values for the two operands

        pshufd    $78, \XMM8, \TMP2
        movdqa    HashKey(%rsp), \TMP5
        PCLMULQDQ 0x11, \TMP5, \TMP1    # TMP1 = a1*b1
        movaps    0x90(%arg1), \TMP3
        AESENC    \TMP3, \XMM1          # Round 9
        PCLMULQDQ 0x00, \TMP5, \XMM8    # XMM8 = a0*b0
        movaps    0xa0(%arg1), \TMP3
        AESENCLAST \TMP3, \XMM1         # Round 10
        AESENCLAST \TMP3, \XMM2
        AESENCLAST \TMP3, \XMM3
        AESENCLAST \TMP3, \XMM4
        movdqa    HashKey_k(%rsp), \TMP5
        PCLMULQDQ 0x00, \TMP5, \TMP2    # TMP2 = (a1+a0)*(b1+b0)
        movdqu    (%arg3, %r11, 1), \TMP3
        pxor      \TMP3, \XMM1          # Ciphertext/Plaintext XOR EK
.if \operation == dec
        movdqu    \XMM1, (%arg2, %r11, 1)   # Write to plaintext buffer
        movdqu    16(%arg3, %r11, 1), \TMP3
        pxor      \TMP3, \XMM2          # Ciphertext/Plaintext XOR EK
.if \operation == dec
        movdqu    \XMM2, 16(%arg2, %r11, 1) # Write to plaintext buffer
        movdqu    32(%arg3, %r11, 1), \TMP3
        pxor      \TMP3, \XMM3          # Ciphertext/Plaintext XOR EK
.if \operation == dec
        movdqu    \XMM3, 32(%arg2, %r11, 1) # Write to plaintext buffer
        movdqu    48(%arg3, %r11, 1), \TMP3
        pxor      \TMP3, \XMM4          # Ciphertext/Plaintext XOR EK
.if \operation == dec
        movdqu    \XMM4, 48(%arg2, %r11, 1) # Write to plaintext buffer
        movdqu    \XMM1, (%arg2, %r11, 1)   # Write to the ciphertext buffer
        movdqu    \XMM2, 16(%arg2, %r11, 1) # Write to the ciphertext buffer
        movdqu    \XMM3, 32(%arg2, %r11, 1) # Write to the ciphertext buffer
        movdqu    \XMM4, 48(%arg2, %r11, 1) # Write to the ciphertext buffer
        pshufb    SHUF_MASK(%rip), \XMM1    # perform a 16 byte swap
        pshufb    SHUF_MASK(%rip), \XMM2    # perform a 16 byte swap
        pshufb    SHUF_MASK(%rip), \XMM3    # perform a 16 byte swap
        pshufb    SHUF_MASK(%rip), \XMM4    # perform a 16 byte swap
        pslldq    $8, \TMP3             # left shift TMP3 2 DWs
        psrldq    $8, \TMP2             # right shift TMP2 2 DWs
        pxor      \TMP2, \TMP1          # accumulate the results in TMP1:XMM5

        # first phase of reduction

        # move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
        pslld     $31, \TMP2            # packed left shift << 31
        pslld     $30, \TMP3            # packed left shift << 30
        pslld     $25, \TMP4            # packed left shift << 25
        pxor      \TMP3, \TMP2          # xor the shifted versions
        psrldq    $4, \TMP5             # right shift TMP5 1 DW
        pslldq    $12, \TMP2            # left shift TMP2 3 DWs

        # second phase of reduction

        movdqa    \XMM5, \TMP2          # make 3 copies of XMM5 into TMP2, TMP3, TMP4
        psrld     $1, \TMP2             # packed right shift >> 1
        psrld     $2, \TMP3             # packed right shift >> 2
        psrld     $7, \TMP4             # packed right shift >> 7
        pxor      \TMP3, \TMP2          # xor the shifted versions
        pxor      \TMP1, \XMM5          # result is in XMM5
/* GHASH the last 4 ciphertext blocks. */
.macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst

        # Multiply XMM1 * HashKey^4 (using Karatsuba)

        pshufd    $78, \XMM1, \TMP2
        movdqa    HashKey_4(%rsp), \TMP5
        PCLMULQDQ 0x11, \TMP5, \TMP6    # TMP6 = a1*b1
        PCLMULQDQ 0x00, \TMP5, \XMM1    # XMM1 = a0*b0
        movdqa    HashKey_4_k(%rsp), \TMP4
        PCLMULQDQ 0x00, \TMP4, \TMP2    # TMP2 = (a1+a0)*(b1+b0)
        movdqa    \XMM1, \XMMDst
        movdqa    \TMP2, \XMM1          # result in TMP6, XMMDst, XMM1

        # Multiply XMM2 * HashKey^3 (using Karatsuba)

        pshufd    $78, \XMM2, \TMP2
        movdqa    HashKey_3(%rsp), \TMP5
        PCLMULQDQ 0x11, \TMP5, \TMP1    # TMP1 = a1*b1
        PCLMULQDQ 0x00, \TMP5, \XMM2    # XMM2 = a0*b0
        movdqa    HashKey_3_k(%rsp), \TMP4
        PCLMULQDQ 0x00, \TMP4, \TMP2    # TMP2 = (a1+a0)*(b1+b0)
        # results accumulated in TMP6, XMMDst, XMM1

        # Multiply XMM3 * HashKey^2 (using Karatsuba)

        pshufd    $78, \XMM3, \TMP2
        movdqa    HashKey_2(%rsp), \TMP5
        PCLMULQDQ 0x11, \TMP5, \TMP1    # TMP1 = a1*b1
        PCLMULQDQ 0x00, \TMP5, \XMM3    # XMM3 = a0*b0
        movdqa    HashKey_2_k(%rsp), \TMP4
        PCLMULQDQ 0x00, \TMP4, \TMP2    # TMP2 = (a1+a0)*(b1+b0)
        pxor      \TMP2, \XMM1          # results accumulated in TMP6, XMMDst, XMM1

        # Multiply XMM4 * HashKey (using Karatsuba)

        pshufd    $78, \XMM4, \TMP2
        movdqa    HashKey(%rsp), \TMP5
        PCLMULQDQ 0x11, \TMP5, \TMP1    # TMP1 = a1*b1
        PCLMULQDQ 0x00, \TMP5, \XMM4    # XMM4 = a0*b0
        movdqa    HashKey_k(%rsp), \TMP4
        PCLMULQDQ 0x00, \TMP4, \TMP2    # TMP2 = (a1+a0)*(b1+b0)
        # middle section of the temp results combined as in Karatsuba algorithm
        pslldq    $8, \TMP4             # left shift TMP4 2 DWs
        psrldq    $8, \TMP2             # right shift TMP2 2 DWs
        # TMP6:XMMDst holds the result of the accumulated carry-less multiplications

        # first phase of the reduction
        movdqa    \XMMDst, \TMP2
        movdqa    \XMMDst, \TMP3
        movdqa    \XMMDst, \TMP4
        # move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
        pslld     $31, \TMP2            # packed left shifting << 31
        pslld     $30, \TMP3            # packed left shifting << 30
        pslld     $25, \TMP4            # packed left shifting << 25
        pxor      \TMP3, \TMP2          # xor the shifted versions
        psrldq    $4, \TMP7             # right shift TMP7 1 DW
        pslldq    $12, \TMP2            # left shift TMP2 3 DWs

        # second phase of the reduction
        movdqa    \XMMDst, \TMP2
        # make 3 copies of XMMDst for doing 3 shift operations
        movdqa    \XMMDst, \TMP3
        movdqa    \XMMDst, \TMP4
        psrld     $1, \TMP2             # packed right shift >> 1
        psrld     $2, \TMP3             # packed right shift >> 2
        psrld     $7, \TMP4             # packed right shift >> 7
        pxor      \TMP3, \TMP2          # xor the shifted versions
        pxor      \TMP6, \XMMDst        # reduced result is in XMMDst
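/*
 * The parallel accumulation above computes, in a single reduction,
 * X1*H^4 ^ X2*H^3 ^ X3*H^2 ^ X4*H, which equals folding the four blocks
 * serially.  A sketch in C, reusing the hypothetical ghash_mul() from the
 * GHASH_MUL comment:
 *
 *      static __m128i ghash_last_4(__m128i h, __m128i x1, __m128i x2,
 *                                  __m128i x3, __m128i x4)
 *      {
 *              __m128i acc = x1;
 *              acc = _mm_xor_si128(ghash_mul(acc, h), x2);
 *              acc = _mm_xor_si128(ghash_mul(acc, h), x3);
 *              acc = _mm_xor_si128(ghash_mul(acc, h), x4);
 *              return ghash_mul(acc, h);   // == XMMDst computed above
 *      }
 */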
/* Encryption of a single block */
.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
        movaps    16(%arg1), \TMP1
        movaps    32(%arg1), \TMP1
        movaps    48(%arg1), \TMP1
        movaps    64(%arg1), \TMP1
        movaps    80(%arg1), \TMP1
        movaps    96(%arg1), \TMP1
        movaps    112(%arg1), \TMP1
        movaps    128(%arg1), \TMP1
        movaps    144(%arg1), \TMP1
        movaps    160(%arg1), \TMP1
        AESENCLAST \TMP1, \XMM0
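/*
 * The C intrinsics equivalent of this macro (a sketch; compile with -maes;
 * rk[] stands for the 11 expanded round keys at %arg1):
 *
 *      #include <wmmintrin.h>
 *
 *      static __m128i aes128_enc_block(const __m128i rk[11], __m128i x)
 *      {
 *              x = _mm_xor_si128(x, rk[0]);             // round 0
 *              for (int i = 1; i < 10; i++)
 *                      x = _mm_aesenc_si128(x, rk[i]);  // rounds 1-9
 *              return _mm_aesenclast_si128(x, rk[10]);  // round 10
 *      }
 */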
/*****************************************************************************
 * void aesni_gcm_dec(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
 *                    u8 *out,            // Plaintext output. Decrypt in-place is allowed.
 *                    const u8 *in,       // Ciphertext input
 *                    u64 plaintext_len,  // Length of data in bytes for decryption.
 *                    u8 *iv,             // Pre-counter block j0: 4 byte salt (from Security Association)
 *                                        // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
 *                                        // concatenated with 0x00000001. 16-byte aligned pointer.
 *                    u8 *hash_subkey,    // H, the Hash sub key input. Data starts on a 16-byte boundary.
 *                    const u8 *aad,      // Additional Authentication Data (AAD)
 *                    u64 aad_len,        // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
 *                    u8 *auth_tag,       // Authenticated Tag output. The driver will compare this to the
 *                                        // given authentication tag and only return the plaintext if they match.
 *                    u64 auth_tag_len);  // Authenticated Tag Length in bytes. Valid values are 16
 *                                        // (most likely), 12 or 8.
 *
 * Keys are pre-expanded and aligned to 16 bytes. We are using the first
 * set of 11 keys in the data structure void *aes_ctx.
 *
 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 * |                             Salt  (From the SA)               |
 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 * |                     Initialization Vector                     |
 * |         (This is the sequence number from IPSec header)       |
 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *
 * AAD padded to 128 bits with 0
 * for example, assume AAD is a u32 vector
 *
 * if AAD is 8 bytes:
 * AAD[3] = {A0, A1};
 * padded AAD in xmm register = {A1 A0 0 0}
 *
 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 * |                     32-bit Sequence Number (A0)               |
 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *
 *                     AAD Format with 32-bit Sequence Number
 *
 * if AAD is 12 bytes:
 * AAD[3] = {A0, A1, A2};
 * padded AAD in xmm register = {A2 A1 A0 0}
 *
 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 * |                 64-bit Extended Sequence Number {A1,A0}      |
 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *
 *                 AAD Format with 64-bit Extended Sequence Number
 *
 * From the definition of the spec, aadLen can only be 8 or 12 bytes.
 * The code supports 16 too, but for other sizes the code will fail.
 *
 * From the definition of the spec, TLen can only be 8, 12 or 16 bytes.
 * For other sizes, the code will fail.
 *
 * poly = x^128 + x^127 + x^126 + x^121 + 1
 *
 *****************************************************************************/
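/*
 * A hypothetical call from C, mirroring the parameter list above (variable
 * names invented for illustration; the real caller also brackets this with
 * kernel_fpu_begin()/kernel_fpu_end()):
 *
 *      u8 tag[16];
 *
 *      aesni_gcm_dec(aes_ctx, out, in, ciphertext_len,
 *                    iv,            // salt || IV || 0x00000001, 16-byte aligned
 *                    hash_subkey,   // H = E(K, 0^128)
 *                    aad, aad_len,  // 8 or 12 (or 16) bytes
 *                    tag, 16);
 *      if (memcmp(tag, expected_tag, 16))   // constant-time compare in real code
 *              return -EBADMSG;             // auth failed, discard plaintext
 */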
ENTRY(aesni_gcm_dec)
/*
 * states of %xmm registers %xmm6:%xmm15 not saved
 * all %xmm registers are clobbered
 */
        sub       $VARIABLE_OFFSET, %rsp
        and       $~63, %rsp            # align rsp to 64 bytes
        movdqu    (%r12), %xmm13        # %xmm13 = HashKey
        pshufb    SHUF_MASK(%rip), %xmm13

        # Precompute HashKey<<1 (mod poly) from the hash key (required for GHASH)

        pshufd    $0x24, %xmm1, %xmm2
        pcmpeqd   TWOONE(%rip), %xmm2
        pand      POLY(%rip), %xmm2
        pxor      %xmm2, %xmm13         # %xmm13 holds the HashKey<<1 (mod poly)
        # Decrypt first few blocks

        movdqa    %xmm13, HashKey(%rsp) # store HashKey<<1 (mod poly)
        mov       %arg4, %r13           # save the number of bytes of plaintext/ciphertext
        and       $-16, %r13            # %r13 = %r13 - (%r13 mod 16)
        jz        _initial_num_blocks_is_0_decrypt
        jb        _initial_num_blocks_is_1_decrypt
        je        _initial_num_blocks_is_2_decrypt
_initial_num_blocks_is_3_decrypt:
        INITIAL_BLOCKS 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec
        jmp       _initial_blocks_decrypted
_initial_num_blocks_is_2_decrypt:
        INITIAL_BLOCKS 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec
        jmp       _initial_blocks_decrypted
_initial_num_blocks_is_1_decrypt:
        INITIAL_BLOCKS 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec
        jmp       _initial_blocks_decrypted
_initial_num_blocks_is_0_decrypt:
        INITIAL_BLOCKS 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec
_initial_blocks_decrypted:
        je        _zero_cipher_left_decrypt
        je        _four_cipher_left_decrypt
        GHASH_4_ENCRYPT_4_PARALLEL %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, dec
_four_cipher_left_decrypt:
        GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
_zero_cipher_left_decrypt:
        and       $15, %r13             # %r13 = arg4 (mod 16)
        je        _multiple_of_16_bytes_decrypt
        # Handle the last <16 byte block separately

        paddd     ONE(%rip), %xmm0      # increment CNT to get Yn
        pshufb    SHUF_MASK(%rip), %xmm0
        ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1   # E(K, Yn)
        movdqu    (%arg3, %r11, 1), %xmm1   # receive the last <16 byte block
        lea       SHIFT_MASK+16(%rip), %r12
        # adjust the shuffle mask pointer to be able to shift 16-%r13 bytes
        # (%r13 is the number of bytes in plaintext mod 16)
        movdqu    (%r12), %xmm2         # get the appropriate shuffle mask
        pshufb    %xmm2, %xmm1          # right shift 16-%r13 bytes
        pxor      %xmm1, %xmm0          # Ciphertext XOR E(K, Yn)
        movdqu    ALL_F-SHIFT_MASK(%r12), %xmm1
        # get the appropriate mask to mask out top 16-%r13 bytes of %xmm0
        pand      %xmm1, %xmm0          # mask out top 16-%r13 bytes of %xmm0
        pshufb    SHUF_MASK(%rip), %xmm2
        GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
        # GHASH computation for the last <16 byte block
        jle       _less_than_8_bytes_left_decrypt
        mov       %rax, (%arg2, %r11, 1)
_less_than_8_bytes_left_decrypt:
        mov       %al, (%arg2, %r11, 1)
        jne       _less_than_8_bytes_left_decrypt
_multiple_of_16_bytes_decrypt:
        mov       arg8, %r12            # %r12 = aadLen (number of bytes)
        shl       $3, %r12              # convert into number of bits
        movd      %r12d, %xmm15         # len(A) in %xmm15
        shl       $3, %arg4             # len(C) in bits (*8)
        pslldq    $8, %xmm15            # %xmm15 = len(A)||0x0000000000000000
        pxor      %xmm1, %xmm15         # %xmm15 = len(A)||len(C)
        GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
        # final GHASH computation
        pshufb    SHUF_MASK(%rip), %xmm8
        mov       %arg5, %rax           # %rax = *Y0
        movdqu    (%rax), %xmm0         # %xmm0 = Y0
        ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1   # E(K, Y0)
        mov       arg9, %r10            # %r10 = authTag
        mov       arg10, %r11           # %r11 = auth_tag_len
        jmp       _return_T_done_decrypt
        jmp       _return_T_done_decrypt
        movdqu    %xmm0, (%r10)
_return_T_done_decrypt:
/*****************************************************************************
 * void aesni_gcm_enc(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
 *                    u8 *out,            // Ciphertext output. Encrypt in-place is allowed.
 *                    const u8 *in,       // Plaintext input
 *                    u64 plaintext_len,  // Length of data in bytes for encryption.
 *                    u8 *iv,             // Pre-counter block j0: 4 byte salt (from Security Association)
 *                                        // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
 *                                        // concatenated with 0x00000001. 16-byte aligned pointer.
 *                    u8 *hash_subkey,    // H, the Hash sub key input. Data starts on a 16-byte boundary.
 *                    const u8 *aad,      // Additional Authentication Data (AAD)
 *                    u64 aad_len,        // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
 *                    u8 *auth_tag,       // Authenticated Tag output.
 *                    u64 auth_tag_len);  // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
 *                                        // 12 or 8.
 *
 * Keys are pre-expanded and aligned to 16 bytes. We are using the
 * first set of 11 keys in the data structure void *aes_ctx.
 *
 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 * |                             Salt  (From the SA)               |
 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 * |                     Initialization Vector                     |
 * |         (This is the sequence number from IPSec header)       |
 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *
 * AAD padded to 128 bits with 0
 * for example, assume AAD is a u32 vector
 *
 * if AAD is 8 bytes:
 * AAD[3] = {A0, A1};
 * padded AAD in xmm register = {A1 A0 0 0}
 *
 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 * |                     32-bit Sequence Number (A0)               |
 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *
 *                     AAD Format with 32-bit Sequence Number
 *
 * if AAD is 12 bytes:
 * AAD[3] = {A0, A1, A2};
 * padded AAD in xmm register = {A2 A1 A0 0}
 *
 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 * |                 64-bit Extended Sequence Number {A1,A0}      |
 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *
 *                 AAD Format with 64-bit Extended Sequence Number
 *
 * From the definition of the spec, aadLen can only be 8 or 12 bytes.
 * The code supports 16 too, but for other sizes the code will fail.
 *
 * From the definition of the spec, TLen can only be 8, 12 or 16 bytes.
 * For other sizes, the code will fail.
 *
 * poly = x^128 + x^127 + x^126 + x^121 + 1
 ***************************************************************************/
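/*
 * Illustrative construction of the pre-counter block j0 described above
 * (buffer names are hypothetical):
 *
 *      u8 iv[16] __attribute__((aligned(16)));
 *
 *      memcpy(iv, salt, 4);           // 4-byte salt from the SA
 *      memcpy(iv + 4, esp_iv, 8);     // 8-byte IV from the ESP payload
 *      iv[12] = 0; iv[13] = 0; iv[14] = 0; iv[15] = 1;  // trailing 0x00000001
 */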
ENTRY(aesni_gcm_enc)
        # states of %xmm registers %xmm6:%xmm15 not saved
        # all %xmm registers are clobbered
        sub       $VARIABLE_OFFSET, %rsp
        movdqu    (%r12), %xmm13
        pshufb    SHUF_MASK(%rip), %xmm13

        # precompute HashKey<<1 mod poly from the HashKey (required for GHASH)

        movdqa    %xmm13, %xmm2
        pshufd    $0x24, %xmm1, %xmm2
        pcmpeqd   TWOONE(%rip), %xmm2
        pand      POLY(%rip), %xmm2
        movdqa    %xmm13, HashKey(%rsp) # store HashKey<<1 (mod poly)
        mov       %arg4, %r13           # save the number of bytes of plaintext/ciphertext
        # Encrypt first few blocks

        jz        _initial_num_blocks_is_0_encrypt
        jb        _initial_num_blocks_is_1_encrypt
        je        _initial_num_blocks_is_2_encrypt
_initial_num_blocks_is_3_encrypt:
        INITIAL_BLOCKS 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc
        jmp       _initial_blocks_encrypted
_initial_num_blocks_is_2_encrypt:
        INITIAL_BLOCKS 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc
        jmp       _initial_blocks_encrypted
_initial_num_blocks_is_1_encrypt:
        INITIAL_BLOCKS 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc
        jmp       _initial_blocks_encrypted
_initial_num_blocks_is_0_encrypt:
        INITIAL_BLOCKS 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc
_initial_blocks_encrypted:

        # Main loop - Encrypt remaining blocks

        je        _zero_cipher_left_encrypt
        je        _four_cipher_left_encrypt
_encrypt_by_4_encrypt:
        GHASH_4_ENCRYPT_4_PARALLEL %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc
        jne       _encrypt_by_4_encrypt
_four_cipher_left_encrypt:
        GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
_zero_cipher_left_encrypt:
        and       $15, %r13             # %r13 = arg4 (mod 16)
        je        _multiple_of_16_bytes_encrypt
        # Handle the last <16 byte block separately

        paddd     ONE(%rip), %xmm0      # INCR CNT to get Yn
        pshufb    SHUF_MASK(%rip), %xmm0
        ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1   # Encrypt(K, Yn)
        movdqu    (%arg3, %r11, 1), %xmm1   # receive the last <16 byte block
        lea       SHIFT_MASK+16(%rip), %r12
        # adjust the shuffle mask pointer to be able to shift 16-%r13 bytes
        # (%r13 is the number of bytes in plaintext mod 16)
        movdqu    (%r12), %xmm2         # get the appropriate shuffle mask
        pshufb    %xmm2, %xmm1          # shift right 16-%r13 bytes
        pxor      %xmm1, %xmm0          # Plaintext XOR Encrypt(K, Yn)
        movdqu    ALL_F-SHIFT_MASK(%r12), %xmm1
        # get the appropriate mask to mask out top 16-%r13 bytes of %xmm0
        pand      %xmm1, %xmm0          # mask out top 16-%r13 bytes of %xmm0
        pshufb    SHUF_MASK(%rip), %xmm0
        GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
        # GHASH computation for the last <16 byte block
        pshufb    SHUF_MASK(%rip), %xmm0
        # shuffle xmm0 back to output as ciphertext
        jle       _less_than_8_bytes_left_encrypt
        mov       %rax, (%arg2, %r11, 1)
_less_than_8_bytes_left_encrypt:
        mov       %al, (%arg2, %r11, 1)
        jne       _less_than_8_bytes_left_encrypt
_multiple_of_16_bytes_encrypt:
        mov       arg8, %r12            # %r12 = aadLen (number of bytes)
        movd      %r12d, %xmm15         # len(A) in %xmm15
        shl       $3, %arg4             # len(C) in bits (*8)
        pslldq    $8, %xmm15            # %xmm15 = len(A)||0x0000000000000000
        pxor      %xmm1, %xmm15         # %xmm15 = len(A)||len(C)
        GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
        # final GHASH computation
        pshufb    SHUF_MASK(%rip), %xmm8    # perform a 16 byte swap
        mov       %arg5, %rax           # %rax = *Y0
        movdqu    (%rax), %xmm0         # %xmm0 = Y0
        ENCRYPT_SINGLE_BLOCK %xmm0, %xmm15  # Encrypt(K, Y0)
        mov       arg9, %r10            # %r10 = authTag
        mov       arg10, %r11           # %r11 = auth_tag_len
        jmp       _return_T_done_encrypt
        jmp       _return_T_done_encrypt
        movdqu    %xmm0, (%r10)
_return_T_done_encrypt:
_key_expansion_256a:
        pshufd    $0b11111111, %xmm1, %xmm1
        shufps    $0b00010000, %xmm0, %xmm4
        shufps    $0b10001100, %xmm0, %xmm4
        movaps    %xmm0, (TKEYP)

_key_expansion_192a:
        pshufd    $0b01010101, %xmm1, %xmm1
        shufps    $0b00010000, %xmm0, %xmm4
        shufps    $0b10001100, %xmm0, %xmm4
        pshufd    $0b11111111, %xmm0, %xmm3
        shufps    $0b01000100, %xmm0, %xmm6
        movaps    %xmm6, (TKEYP)
        shufps    $0b01001110, %xmm2, %xmm1
        movaps    %xmm1, 0x10(TKEYP)

_key_expansion_192b:
        pshufd    $0b01010101, %xmm1, %xmm1
        shufps    $0b00010000, %xmm0, %xmm4
        shufps    $0b10001100, %xmm0, %xmm4
        pshufd    $0b11111111, %xmm0, %xmm3
        movaps    %xmm0, (TKEYP)

_key_expansion_256b:
        pshufd    $0b10101010, %xmm1, %xmm1
        shufps    $0b00010000, %xmm2, %xmm4
        shufps    $0b10001100, %xmm2, %xmm4
        movaps    %xmm2, (TKEYP)
/*
 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
 *                   unsigned int key_len)
 */
ENTRY(aesni_set_key)
        movl      8(%esp), KEYP         # ctx
        movl      12(%esp), UKEYP       # in_key
        movl      16(%esp), %edx        # key_len
        movups    (UKEYP), %xmm0        # user key (first 16 bytes)
        movaps    %xmm0, (KEYP)
        lea       0x10(KEYP), TKEYP     # key addr
        movl      %edx, 480(KEYP)
        pxor      %xmm4, %xmm4          # xmm4 is assumed 0 in _key_expansion_x
        movups    0x10(UKEYP), %xmm2    # second 16 bytes of the user key
        movaps    %xmm2, (TKEYP)
        AESKEYGENASSIST 0x1 %xmm2 %xmm1     # round 1
        call      _key_expansion_256a
        AESKEYGENASSIST 0x1 %xmm0 %xmm1
        call      _key_expansion_256b
        AESKEYGENASSIST 0x2 %xmm2 %xmm1     # round 2
        call      _key_expansion_256a
        AESKEYGENASSIST 0x2 %xmm0 %xmm1
        call      _key_expansion_256b
        AESKEYGENASSIST 0x4 %xmm2 %xmm1     # round 3
        call      _key_expansion_256a
        AESKEYGENASSIST 0x4 %xmm0 %xmm1
        call      _key_expansion_256b
        AESKEYGENASSIST 0x8 %xmm2 %xmm1     # round 4
        call      _key_expansion_256a
        AESKEYGENASSIST 0x8 %xmm0 %xmm1
        call      _key_expansion_256b
        AESKEYGENASSIST 0x10 %xmm2 %xmm1    # round 5
        call      _key_expansion_256a
        AESKEYGENASSIST 0x10 %xmm0 %xmm1
        call      _key_expansion_256b
        AESKEYGENASSIST 0x20 %xmm2 %xmm1    # round 6
        call      _key_expansion_256a
        AESKEYGENASSIST 0x20 %xmm0 %xmm1
        call      _key_expansion_256b
        AESKEYGENASSIST 0x40 %xmm2 %xmm1    # round 7
        call      _key_expansion_256a
        movq      0x10(UKEYP), %xmm2    # second 8 bytes of the user key
        AESKEYGENASSIST 0x1 %xmm2 %xmm1     # round 1
        call      _key_expansion_192a
        AESKEYGENASSIST 0x2 %xmm2 %xmm1     # round 2
        call      _key_expansion_192b
        AESKEYGENASSIST 0x4 %xmm2 %xmm1     # round 3
        call      _key_expansion_192a
        AESKEYGENASSIST 0x8 %xmm2 %xmm1     # round 4
        call      _key_expansion_192b
        AESKEYGENASSIST 0x10 %xmm2 %xmm1    # round 5
        call      _key_expansion_192a
        AESKEYGENASSIST 0x20 %xmm2 %xmm1    # round 6
        call      _key_expansion_192b
        AESKEYGENASSIST 0x40 %xmm2 %xmm1    # round 7
        call      _key_expansion_192a
        AESKEYGENASSIST 0x80 %xmm2 %xmm1    # round 8
        call      _key_expansion_192b
        AESKEYGENASSIST 0x1 %xmm0 %xmm1     # round 1
        call      _key_expansion_128
        AESKEYGENASSIST 0x2 %xmm0 %xmm1     # round 2
        call      _key_expansion_128
        AESKEYGENASSIST 0x4 %xmm0 %xmm1     # round 3
        call      _key_expansion_128
        AESKEYGENASSIST 0x8 %xmm0 %xmm1     # round 4
        call      _key_expansion_128
        AESKEYGENASSIST 0x10 %xmm0 %xmm1    # round 5
        call      _key_expansion_128
        AESKEYGENASSIST 0x20 %xmm0 %xmm1    # round 6
        call      _key_expansion_128
        AESKEYGENASSIST 0x40 %xmm0 %xmm1    # round 7
        call      _key_expansion_128
        AESKEYGENASSIST 0x80 %xmm0 %xmm1    # round 8
        call      _key_expansion_128
        AESKEYGENASSIST 0x1b %xmm0 %xmm1    # round 9
        call      _key_expansion_128
        AESKEYGENASSIST 0x36 %xmm0 %xmm1    # round 10
        call      _key_expansion_128
        movaps    (KEYP), %xmm0
        movaps    (TKEYP), %xmm1
        movaps    %xmm0, 240(TKEYP)
        movaps    %xmm1, 240(KEYP)
        lea       240-16(TKEYP), UKEYP
        movaps    (KEYP), %xmm0
        movaps    %xmm1, (UKEYP)
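/*
 * One round of the AES-128 schedule in C intrinsics, the analogue of an
 * AESKEYGENASSIST/_key_expansion_128 pair above (a sketch; the rcon value
 * must be an immediate, which is why the assembly repeats the pair per
 * round):
 *
 *      #include <wmmintrin.h>
 *
 *      static __m128i expand128_round(__m128i prev, __m128i assist)
 *      {
 *              assist = _mm_shuffle_epi32(assist, 0xff);  // rot/sub word
 *              prev = _mm_xor_si128(prev, _mm_slli_si128(prev, 4));
 *              prev = _mm_xor_si128(prev, _mm_slli_si128(prev, 4));
 *              prev = _mm_xor_si128(prev, _mm_slli_si128(prev, 4));
 *              return _mm_xor_si128(prev, assist);
 *      }
 *
 *      // e.g.: rk[1] = expand128_round(rk[0],
 *      //                 _mm_aeskeygenassist_si128(rk[0], 0x01));
 */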
/*
 * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
 */
        movl      480(KEYP), KLEN       # key length
        movups    (INP), STATE          # input
        movups    STATE, (OUTP)         # output

/*
 * _aesni_enc1: internal ABI
 *      KEYP:  key struct pointer
 *      STATE: initial state (input)
 *      STATE: final state (output)
 */
        movaps    (KEYP), KEY           # key
        pxor      KEY, STATE            # round 0
        lea       0x20(TKEYP), TKEYP
        movaps    -0x60(TKEYP), KEY
        movaps    -0x50(TKEYP), KEY
        movaps    -0x40(TKEYP), KEY
        movaps    -0x30(TKEYP), KEY
        movaps    -0x20(TKEYP), KEY
        movaps    -0x10(TKEYP), KEY
        movaps    0x10(TKEYP), KEY
        movaps    0x20(TKEYP), KEY
        movaps    0x30(TKEYP), KEY
        movaps    0x40(TKEYP), KEY
        movaps    0x50(TKEYP), KEY
        movaps    0x60(TKEYP), KEY
        movaps    0x70(TKEYP), KEY
        AESENCLAST KEY STATE
/*
 * _aesni_enc4: internal ABI
 *      KEYP:   key struct pointer
 *      STATE1: initial state (input)
 *      STATE1: final state (output)
 */
        movaps    (KEYP), KEY           # key
        pxor      KEY, STATE1           # round 0
        lea       0x20(TKEYP), TKEYP
        movaps    -0x60(TKEYP), KEY
        movaps    -0x50(TKEYP), KEY
        movaps    -0x40(TKEYP), KEY
        movaps    -0x30(TKEYP), KEY
        movaps    -0x20(TKEYP), KEY
        movaps    -0x10(TKEYP), KEY
        movaps    0x10(TKEYP), KEY
        movaps    0x20(TKEYP), KEY
        movaps    0x30(TKEYP), KEY
        movaps    0x40(TKEYP), KEY
        movaps    0x50(TKEYP), KEY
        movaps    0x60(TKEYP), KEY
        movaps    0x70(TKEYP), KEY
        AESENCLAST KEY STATE1           # last round
        AESENCLAST KEY STATE2
        AESENCLAST KEY STATE3
        AESENCLAST KEY STATE4
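/*
 * Why four blocks at once: AESENC has several cycles of latency but the
 * unit is pipelined, so four independent streams hide most of it.  The C
 * shape of the interleaving (a sketch, reusing aes128_enc_block's rk[]):
 *
 *      static void aes128_enc4(const __m128i rk[11], __m128i s[4])
 *      {
 *              for (int b = 0; b < 4; b++)
 *                      s[b] = _mm_xor_si128(s[b], rk[0]);
 *              for (int i = 1; i < 10; i++)
 *                      for (int b = 0; b < 4; b++)   // independent AESENCs
 *                              s[b] = _mm_aesenc_si128(s[b], rk[i]);
 *              for (int b = 0; b < 4; b++)
 *                      s[b] = _mm_aesenclast_si128(s[b], rk[10]);
 *      }
 */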
/*
 * void aesni_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
 */
        mov       480(KEYP), KLEN       # key length
        movups    (INP), STATE          # input
        movups    STATE, (OUTP)         # output

/*
 * _aesni_dec1: internal ABI
 *      KEYP:  key struct pointer
 *      STATE: initial state (input)
 *      STATE: final state (output)
 */
        movaps    (KEYP), KEY           # key
        pxor      KEY, STATE            # round 0
        lea       0x20(TKEYP), TKEYP
        movaps    -0x60(TKEYP), KEY
        movaps    -0x50(TKEYP), KEY
        movaps    -0x40(TKEYP), KEY
        movaps    -0x30(TKEYP), KEY
        movaps    -0x20(TKEYP), KEY
        movaps    -0x10(TKEYP), KEY
        movaps    0x10(TKEYP), KEY
        movaps    0x20(TKEYP), KEY
        movaps    0x30(TKEYP), KEY
        movaps    0x40(TKEYP), KEY
        movaps    0x50(TKEYP), KEY
        movaps    0x60(TKEYP), KEY
        movaps    0x70(TKEYP), KEY
        AESDECLAST KEY STATE

/*
 * _aesni_dec4: internal ABI
 *      KEYP:   key struct pointer
 *      STATE1: initial state (input)
 *      STATE1: final state (output)
 */
        movaps    (KEYP), KEY           # key
        pxor      KEY, STATE1           # round 0
        lea       0x20(TKEYP), TKEYP
        movaps    -0x60(TKEYP), KEY
        movaps    -0x50(TKEYP), KEY
        movaps    -0x40(TKEYP), KEY
        movaps    -0x30(TKEYP), KEY
        movaps    -0x20(TKEYP), KEY
        movaps    -0x10(TKEYP), KEY
        movaps    0x10(TKEYP), KEY
        movaps    0x20(TKEYP), KEY
        movaps    0x30(TKEYP), KEY
        movaps    0x40(TKEYP), KEY
        movaps    0x50(TKEYP), KEY
        movaps    0x60(TKEYP), KEY
        movaps    0x70(TKEYP), KEY
        AESDECLAST KEY STATE1           # last round
        AESDECLAST KEY STATE2
        AESDECLAST KEY STATE3
        AESDECLAST KEY STATE4
/*
 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *                    size_t len)
 */
ENTRY(aesni_ecb_enc)
        test      LEN, LEN              # check length
        movups    (INP), STATE1
        movups    0x10(INP), STATE2
        movups    0x20(INP), STATE3
        movups    0x30(INP), STATE4
        movups    STATE1, (OUTP)
        movups    STATE2, 0x10(OUTP)
        movups    STATE3, 0x20(OUTP)
        movups    STATE4, 0x30(OUTP)
        movups    (INP), STATE1
        movups    STATE1, (OUTP)

/*
 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *                    size_t len)
 */
ENTRY(aesni_ecb_dec)
        movups    (INP), STATE1
        movups    0x10(INP), STATE2
        movups    0x20(INP), STATE3
        movups    0x30(INP), STATE4
        movups    STATE1, (OUTP)
        movups    STATE2, 0x10(OUTP)
        movups    STATE3, 0x20(OUTP)
        movups    STATE4, 0x30(OUTP)
        movups    (INP), STATE1
        movups    STATE1, (OUTP)
/*
 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *                    size_t len, u8 *iv)
 */
ENTRY(aesni_cbc_enc)
        movups    (IVP), STATE          # load iv as initial state
        movups    (INP), IN             # load input
        movups    STATE, (OUTP)         # store output
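/*
 * CBC encryption is inherently serial: each block's cipher input depends on
 * the previous ciphertext block.  The loop above in C (a sketch, reusing the
 * aes128_enc_block() example; rk, in, out, len and iv are illustrative):
 *
 *      __m128i state = _mm_loadu_si128((const __m128i *)iv);
 *
 *      for (size_t i = 0; i + 16 <= len; i += 16) {
 *              state = _mm_xor_si128(state,
 *                      _mm_loadu_si128((const __m128i *)(in + i)));
 *              state = aes128_enc_block(rk, state);
 *              _mm_storeu_si128((__m128i *)(out + i), state);
 *      }
 *      _mm_storeu_si128((__m128i *)iv, state);   // updated chaining value
 */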
/*
 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *                    size_t len, u8 *iv)
 */
ENTRY(aesni_cbc_dec)
        jb        .Lcbc_dec_just_ret
        movups    0x10(INP), IN2
        movups    0x20(INP), IN3
        movups    0x30(INP), IN4
        movups    0x20(INP), IN1
        movups    0x30(INP), IN2
        pxor      0x10(INP), STATE3
        movups    STATE1, (OUTP)
        movups    STATE2, 0x10(OUTP)
        movups    STATE3, 0x20(OUTP)
        movups    STATE4, 0x30(OUTP)
        movups    STATE, (OUTP)
        .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

/*
 * _aesni_inc_init: internal ABI
 * setup registers used by _aesni_inc
 *      CTR:      == IV, in little endian
 *      TCTR_LOW: == lower qword of CTR
 *      INC:      == 1, in little endian
 *      BSWAP_MASK == endian swapping mask
 */
        movaps    .Lbswap_mask, BSWAP_MASK
        PSHUFB_XMM BSWAP_MASK CTR
        MOVQ_R64_XMM TCTR_LOW INC
        MOVQ_R64_XMM CTR TCTR_LOW
/*
 * _aesni_inc: internal ABI
 * Increase IV by 1, IV is in big endian
 * input:
 *      CTR:      == IV, in little endian
 *      TCTR_LOW: == lower qword of CTR
 *      INC:      == 1, in little endian
 *      BSWAP_MASK == endian swapping mask
 * output:
 *      CTR:      == output IV, in little endian
 *      TCTR_LOW: == lower qword of CTR
 */
        PSHUFB_XMM BSWAP_MASK IV
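/*
 * The counter is kept byte-reversed (little endian) so it can be bumped
 * with 64-bit integer adds; only the final pshufb converts back to the big
 * endian IV the cipher consumes.  Scalar sketch (u64 halves of the reversed
 * IV; names invented):
 *
 *      struct le_ctr { u64 lo, hi; };
 *
 *      static void aesni_inc_sketch(struct le_ctr *c, u8 iv_out[16])
 *      {
 *              if (++c->lo == 0)             // carry out of the low qword
 *                      c->hi++;
 *              for (int i = 0; i < 8; i++) { // byte-reflect back to BE
 *                      iv_out[7 - i]  = c->hi >> (8 * i);
 *                      iv_out[15 - i] = c->lo >> (8 * i);
 *              }
 *      }
 */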
/*
 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *                    size_t len, u8 *iv)
 */
ENTRY(aesni_ctr_enc)
        jb        .Lctr_enc_just_ret
        call      _aesni_inc_init
        movups    0x10(INP), IN2
        movups    0x20(INP), IN3
        movups    0x30(INP), IN4
        movups    STATE1, (OUTP)
        movups    STATE2, 0x10(OUTP)
        movups    STATE3, 0x20(OUTP)
        movups    STATE4, 0x30(OUTP)
        movups    STATE, (OUTP)
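/*
 * CTR mode turns the block cipher into a keystream generator:
 * out = in XOR E(K, ctr), with ctr incremented per block.  C shape of the
 * single-block tail loop above (a sketch; next_iv() stands in for
 * _aesni_inc, aes128_enc_block() is the earlier example):
 *
 *      while (len >= 16) {
 *              __m128i ks = aes128_enc_block(rk, next_iv(&ctr));
 *              _mm_storeu_si128((__m128i *)out,
 *                      _mm_xor_si128(_mm_loadu_si128((const __m128i *)in), ks));
 *              in += 16; out += 16; len -= 16;
 *      }
 */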