/*
 * Implement AES algorithm in Intel AES-NI instructions.
 *
 * The white paper of AES-NI instructions can be downloaded from:
 *   http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
 *
 * Copyright (C) 2008, Intel Corp.
 *    Author: Huang Ying <ying.huang@intel.com>
 *            Vinodh Gopal <vinodh.gopal@intel.com>
 *
 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
 * interface for 64-bit kernels.
 *    Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
 *             Aidan O'Mahony (aidan.o.mahony@intel.com)
 *             Adrian Hoban <adrian.hoban@intel.com>
 *             James Guilford (james.guilford@intel.com)
 *             Gabriele Paoloni <gabriele.paoloni@intel.com>
 *             Tadeusz Struk (tadeusz.struk@intel.com)
 *             Wajdi Feghali (wajdi.k.feghali@intel.com)
 *    Copyright (c) 2010, Intel Corporation.
 *
 * Ported x86_64 version to x86:
 *    Author: Mathias Krause <minipli@googlemail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */
#define ENDPROC(name) \
	.type name, @function ;

#define FRAME_OFFSET 0

#include "inst-intel.h"
/*
 * The following macros are used to move an (un)aligned 16 byte value to/from
 * an XMM register. This can be done for either FP or integer values: for FP,
 * use movaps (move aligned packed single); for integer, use movdqa (move
 * double quad aligned). It has made no performance difference which
 * instruction is used since Nehalem (the original Core i7) was released.
 * However, movaps is a byte shorter, so that is the one we'll use for now
 * (same for unaligned).
 */
.Lgf128mul_x_ble_mask:
	.octa 0x00000000000000010000000000000087
POLY:   .octa 0xC2000000000000000000000000000001
TWOONE: .octa 0x00000001000000000000000000000001

# order of these constants should not change.
# more specifically, ALL_F should follow SHIFT_MASK,
# and ZERO should follow ALL_F
SHUF_MASK:  .octa 0x000102030405060708090A0B0C0D0E0F
MASK1:      .octa 0x0000000000000000ffffffffffffffff
MASK2:      .octa 0xffffffffffffffff0000000000000000
SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
ALL_F:      .octa 0xffffffffffffffffffffffffffffffff
ZERO:       .octa 0x00000000000000000000000000000000
ONE:        .octa 0x00000000000000000000000000000001
F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
#define STACK_OFFSET    8*3
#define HashKey		16*0	// store HashKey <<1 mod poly here
#define HashKey_2	16*1	// store HashKey^2 <<1 mod poly here
#define HashKey_3	16*2	// store HashKey^3 <<1 mod poly here
#define HashKey_4	16*3	// store HashKey^4 <<1 mod poly here
#define HashKey_k	16*4	// store XOR of High 64 bits and Low 64
				// bits of HashKey <<1 mod poly here
				// (for Karatsuba purposes)
#define HashKey_2_k	16*5	// store XOR of High 64 bits and Low 64
				// bits of HashKey^2 <<1 mod poly here
				// (for Karatsuba purposes)
#define HashKey_3_k	16*6	// store XOR of High 64 bits and Low 64
				// bits of HashKey^3 <<1 mod poly here
				// (for Karatsuba purposes)
#define HashKey_4_k	16*7	// store XOR of High 64 bits and Low 64
				// bits of HashKey^4 <<1 mod poly here
				// (for Karatsuba purposes)
#define VARIABLE_OFFSET	16*8

#define arg7 STACK_OFFSET+8(%r14)
#define arg8 STACK_OFFSET+16(%r14)
#define arg9 STACK_OFFSET+24(%r14)
#define arg10 STACK_OFFSET+32(%r14)
#define keysize 2*15*16(%arg1)

#define BSWAP_MASK %xmm10
#define GF128MUL_MASK %xmm10
/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
 *
 * Input: A and B (128-bits each, bit-reflected)
 * Output: C = A*B*x mod poly, (i.e. >>1 )
 * To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input.
 * GH = GH * HK * x mod poly, which is equivalent to GH*HashKey mod poly.
 */
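
/*
 * Editor's note (illustrative, not from the original file): the Karatsuba
 * split used throughout, written out as plain math with A = a1*x^64 + a0
 * and B = b1*x^64 + b0 (all arithmetic carry-less, so + is XOR):
 *
 *   A*B = a1*b1*x^128 + ((a1+a0)*(b1+b0) + a1*b1 + a0*b0)*x^64 + a0*b0
 *
 * so three PCLMULQDQ operations replace four, and the middle term is then
 * folded into the high/low halves before reducing modulo
 * x^128 + x^127 + x^126 + x^121 + 1.
 */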
.macro	GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
	pshufd	  $78, \GH, \TMP2
	pshufd	  $78, \HK, \TMP3
	pxor	  \GH, \TMP2		# TMP2 = a1+a0
	pxor	  \HK, \TMP3		# TMP3 = b1+b0
	PCLMULQDQ 0x11, \HK, \TMP1	# TMP1 = a1*b1
	PCLMULQDQ 0x00, \HK, \GH	# GH = a0*b0
	PCLMULQDQ 0x00, \TMP3, \TMP2	# TMP2 = (a0+a1)*(b1+b0)
	pxor	  \TMP1, \TMP2		# TMP2 = (a0*b0)+(a1*b0)
	pslldq	  $8, \TMP3		# left shift TMP3 2 DWs
	psrldq	  $8, \TMP2		# right shift TMP2 2 DWs
	pxor	  \TMP2, \TMP1		# TMP2:GH holds the result of GH*HK

	# first phase of the reduction

	movdqa	  \GH, \TMP4		# copy GH into TMP2, TMP3 and TMP4
					# in order to perform 3 shifts independently
	pslld	  $31, \TMP2		# packed left shift <<31
	pslld	  $30, \TMP3		# packed left shift <<30
	pslld	  $25, \TMP4		# packed left shift <<25
	pxor	  \TMP3, \TMP2		# xor the shifted versions
	psrldq	  $4, \TMP5		# right shift TMP5 1 DW
	pslldq	  $12, \TMP2		# left shift TMP2 3 DWs

	# second phase of the reduction

	movdqa	  \GH,\TMP2		# copy GH into TMP2, TMP3 and TMP4
					# in order to perform 3 shifts independently
	psrld	  $1,\TMP2		# packed right shift >>1
	psrld	  $2,\TMP3		# packed right shift >>2
	psrld	  $7,\TMP4		# packed right shift >>7
	pxor	  \TMP3,\TMP2		# xor the shifted versions
	pxor	  \TMP1, \GH		# result is in GH
.endm
/*
 * if a = number of total plaintext bytes
 * b = floor(a/16)
 * num_initial_blocks = b mod 4
 * encrypt the initial num_initial_blocks blocks and apply ghash on
 * the ciphertext
 * %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
 * are clobbered by the macro
 * arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified
 */
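
/*
 * Worked example (added for clarity; hypothetical sizes): with a = 100
 * plaintext bytes, b = floor(100/16) = 6 full blocks, so
 * num_initial_blocks = 6 mod 4 = 2; those two blocks are handled here and
 * the remaining groups of four go through the parallel path further down.
 */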
.macro INITIAL_BLOCKS_DEC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
	MOVADQ	   SHUF_MASK(%rip), %xmm14
	mov	   arg7, %r10		# %r10 = AAD
	mov	   arg8, %r12		# %r12 = aadLen
_get_AAD_loop\num_initial_blocks\operation:
	jne	   _get_AAD_loop\num_initial_blocks\operation
	je	   _get_AAD_loop2_done\num_initial_blocks\operation
_get_AAD_loop2\num_initial_blocks\operation:
	jne	   _get_AAD_loop2\num_initial_blocks\operation
_get_AAD_loop2_done\num_initial_blocks\operation:
	PSHUFB_XMM %xmm14, %xmm\i	# byte-reflect the AAD data
	xor	   %r11, %r11		# initialise the data pointer offset as zero

	# start AES for num_initial_blocks blocks

	mov	   %arg5, %rax		# %rax = *Y0
	movdqu	   (%rax), \XMM0	# XMM0 = Y0
	PSHUFB_XMM %xmm14, \XMM0

.if (\i == 5) || (\i == 6) || (\i == 7)
	MOVADQ	   ONE(%RIP),\TMP1
	paddd	   \TMP1, \XMM0		# INCR Y0
	movdqa	   \XMM0, %xmm\index
	PSHUFB_XMM %xmm14, %xmm\index	# perform a 16 byte swap
	pxor	   \TMP2, %xmm\index
	shr	   $2,%eax		# 128->4, 192->6, 256->8
	add	   $5,%eax		# 128->9, 192->11, 256->13

aes_loop_initial_dec\num_initial_blocks:
	AESENC	   \TMP1, %xmm\index
	jnz	   aes_loop_initial_dec\num_initial_blocks

	AESENCLAST \TMP1, %xmm\index	# Last Round
	movdqu	   (%arg3 , %r11, 1), \TMP1
	pxor	   \TMP1, %xmm\index
	movdqu	   %xmm\index, (%arg2 , %r11, 1)
	# write back plaintext/ciphertext for num_initial_blocks
	movdqa	   \TMP1, %xmm\index
	PSHUFB_XMM %xmm14, %xmm\index
	# prepare plaintext/ciphertext for GHASH computation

	GHASH_MUL  %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	# apply GHASH on num_initial_blocks blocks
	GHASH_MUL  %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	jl	   _initial_blocks_done\num_initial_blocks\operation
	# no need for precomputed values
/*
 * Precomputations for HashKey parallel with encryption of first 4 blocks.
 * HashKey_i_k holds XORed values of the low and high parts of the HashKey_i.
 */
	MOVADQ	   ONE(%rip), \TMP1
	paddd	   \TMP1, \XMM0		# INCR Y0
	PSHUFB_XMM %xmm14, \XMM1	# perform a 16 byte swap
	paddd	   \TMP1, \XMM0		# INCR Y0
	PSHUFB_XMM %xmm14, \XMM2	# perform a 16 byte swap
	paddd	   \TMP1, \XMM0		# INCR Y0
	PSHUFB_XMM %xmm14, \XMM3	# perform a 16 byte swap
	paddd	   \TMP1, \XMM0		# INCR Y0
	PSHUFB_XMM %xmm14, \XMM4	# perform a 16 byte swap
	MOVADQ	   0(%arg1),\TMP1
	pshufd	   $78, \TMP3, \TMP1
	movdqa	   \TMP1, HashKey_k(%rsp)
	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
	# TMP5 = HashKey^2<<1 (mod poly)
	movdqa	   \TMP5, HashKey_2(%rsp)
	# HashKey_2 = HashKey^2<<1 (mod poly)
	pshufd	   $78, \TMP5, \TMP1
	movdqa	   \TMP1, HashKey_2_k(%rsp)
.irpc index, 1234 # do 4 rounds
	movaps	   0x10*\index(%arg1), \TMP1
	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
	# TMP5 = HashKey^3<<1 (mod poly)
	movdqa	   \TMP5, HashKey_3(%rsp)
	pshufd	   $78, \TMP5, \TMP1
	movdqa	   \TMP1, HashKey_3_k(%rsp)
.irpc index, 56789 # do next 5 rounds
	movaps	   0x10*\index(%arg1), \TMP1
	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
	# TMP5 = HashKey^4<<1 (mod poly)
	movdqa	   \TMP5, HashKey_4(%rsp)
	pshufd	   $78, \TMP5, \TMP1
	movdqa	   \TMP1, HashKey_4_k(%rsp)
	shr	   $2,%eax		# 128->4, 192->6, 256->8
	sub	   $4,%eax		# 128->0, 192->2, 256->4
	jz	   aes_loop_pre_dec_done\num_initial_blocks

aes_loop_pre_dec\num_initial_blocks:
	AESENC	   \TMP2, %xmm\index
	jnz	   aes_loop_pre_dec\num_initial_blocks

aes_loop_pre_dec_done\num_initial_blocks:
	AESENCLAST \TMP2, \XMM1
	AESENCLAST \TMP2, \XMM2
	AESENCLAST \TMP2, \XMM3
	AESENCLAST \TMP2, \XMM4
	movdqu	   16*0(%arg3 , %r11, 1), \TMP1
	movdqu	   \XMM1, 16*0(%arg2 , %r11, 1)
	movdqu	   16*1(%arg3 , %r11, 1), \TMP1
	movdqu	   \XMM2, 16*1(%arg2 , %r11, 1)
	movdqu	   16*2(%arg3 , %r11, 1), \TMP1
	movdqu	   \XMM3, 16*2(%arg2 , %r11, 1)
	movdqu	   16*3(%arg3 , %r11, 1), \TMP1
	movdqu	   \XMM4, 16*3(%arg2 , %r11, 1)
	PSHUFB_XMM %xmm14, \XMM1	# perform a 16 byte swap
	# combine GHASHed value with the corresponding ciphertext
	PSHUFB_XMM %xmm14, \XMM2	# perform a 16 byte swap
	PSHUFB_XMM %xmm14, \XMM3	# perform a 16 byte swap
	PSHUFB_XMM %xmm14, \XMM4	# perform a 16 byte swap

_initial_blocks_done\num_initial_blocks\operation:
.endm
/*
 * if a = number of total plaintext bytes
 * b = floor(a/16)
 * num_initial_blocks = b mod 4
 * encrypt the initial num_initial_blocks blocks and apply ghash on
 * the ciphertext
 * %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
 * are clobbered by the macro
 * arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified
 */
.macro INITIAL_BLOCKS_ENC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
	MOVADQ	   SHUF_MASK(%rip), %xmm14
	mov	   arg7, %r10		# %r10 = AAD
	mov	   arg8, %r12		# %r12 = aadLen
_get_AAD_loop\num_initial_blocks\operation:
	jne	   _get_AAD_loop\num_initial_blocks\operation
	je	   _get_AAD_loop2_done\num_initial_blocks\operation
_get_AAD_loop2\num_initial_blocks\operation:
	jne	   _get_AAD_loop2\num_initial_blocks\operation
_get_AAD_loop2_done\num_initial_blocks\operation:
	PSHUFB_XMM %xmm14, %xmm\i	# byte-reflect the AAD data
	xor	   %r11, %r11		# initialise the data pointer offset as zero

	# start AES for num_initial_blocks blocks

	mov	   %arg5, %rax		# %rax = *Y0
	movdqu	   (%rax), \XMM0	# XMM0 = Y0
	PSHUFB_XMM %xmm14, \XMM0

.if (\i == 5) || (\i == 6) || (\i == 7)
	MOVADQ	   ONE(%RIP),\TMP1
	MOVADQ	   0(%arg1),\TMP2
	paddd	   \TMP1, \XMM0		# INCR Y0
	MOVADQ	   \XMM0, %xmm\index
	PSHUFB_XMM %xmm14, %xmm\index	# perform a 16 byte swap
	pxor	   \TMP2, %xmm\index
	shr	   $2,%eax		# 128->4, 192->6, 256->8
	add	   $5,%eax		# 128->9, 192->11, 256->13

aes_loop_initial_enc\num_initial_blocks:
	AESENC	   \TMP1, %xmm\index
	jnz	   aes_loop_initial_enc\num_initial_blocks

	AESENCLAST \TMP1, %xmm\index	# Last Round
	movdqu	   (%arg3 , %r11, 1), \TMP1
	pxor	   \TMP1, %xmm\index
	movdqu	   %xmm\index, (%arg2 , %r11, 1)
	# write back plaintext/ciphertext for num_initial_blocks
	PSHUFB_XMM %xmm14, %xmm\index

	# prepare plaintext/ciphertext for GHASH computation

	GHASH_MUL  %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	# apply GHASH on num_initial_blocks blocks
	GHASH_MUL  %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	jl	   _initial_blocks_done\num_initial_blocks\operation
	# no need for precomputed values

/*
 * Precomputations for HashKey parallel with encryption of first 4 blocks.
 * HashKey_i_k holds XORed values of the low and high parts of the HashKey_i.
 */
	MOVADQ	   ONE(%RIP),\TMP1
	paddd	   \TMP1, \XMM0		# INCR Y0
	PSHUFB_XMM %xmm14, \XMM1	# perform a 16 byte swap
	paddd	   \TMP1, \XMM0		# INCR Y0
	PSHUFB_XMM %xmm14, \XMM2	# perform a 16 byte swap
	paddd	   \TMP1, \XMM0		# INCR Y0
	PSHUFB_XMM %xmm14, \XMM3	# perform a 16 byte swap
	paddd	   \TMP1, \XMM0		# INCR Y0
	PSHUFB_XMM %xmm14, \XMM4	# perform a 16 byte swap
	MOVADQ	   0(%arg1),\TMP1
	pshufd	   $78, \TMP3, \TMP1
	movdqa	   \TMP1, HashKey_k(%rsp)
	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
	# TMP5 = HashKey^2<<1 (mod poly)
	movdqa	   \TMP5, HashKey_2(%rsp)
	# HashKey_2 = HashKey^2<<1 (mod poly)
	pshufd	   $78, \TMP5, \TMP1
	movdqa	   \TMP1, HashKey_2_k(%rsp)
.irpc index, 1234 # do 4 rounds
	movaps	   0x10*\index(%arg1), \TMP1
	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
	# TMP5 = HashKey^3<<1 (mod poly)
	movdqa	   \TMP5, HashKey_3(%rsp)
	pshufd	   $78, \TMP5, \TMP1
	movdqa	   \TMP1, HashKey_3_k(%rsp)
.irpc index, 56789 # do next 5 rounds
	movaps	   0x10*\index(%arg1), \TMP1
	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
	# TMP5 = HashKey^4<<1 (mod poly)
	movdqa	   \TMP5, HashKey_4(%rsp)
	pshufd	   $78, \TMP5, \TMP1
	movdqa	   \TMP1, HashKey_4_k(%rsp)
	shr	   $2,%eax		# 128->4, 192->6, 256->8
	sub	   $4,%eax		# 128->0, 192->2, 256->4
	jz	   aes_loop_pre_enc_done\num_initial_blocks

aes_loop_pre_enc\num_initial_blocks:
	AESENC	   \TMP2, %xmm\index
	jnz	   aes_loop_pre_enc\num_initial_blocks

aes_loop_pre_enc_done\num_initial_blocks:
	AESENCLAST \TMP2, \XMM1
	AESENCLAST \TMP2, \XMM2
	AESENCLAST \TMP2, \XMM3
	AESENCLAST \TMP2, \XMM4
	movdqu	   16*0(%arg3 , %r11, 1), \TMP1
	movdqu	   16*1(%arg3 , %r11, 1), \TMP1
	movdqu	   16*2(%arg3 , %r11, 1), \TMP1
	movdqu	   16*3(%arg3 , %r11, 1), \TMP1
	movdqu	   \XMM1, 16*0(%arg2 , %r11, 1)
	movdqu	   \XMM2, 16*1(%arg2 , %r11, 1)
	movdqu	   \XMM3, 16*2(%arg2 , %r11, 1)
	movdqu	   \XMM4, 16*3(%arg2 , %r11, 1)
	PSHUFB_XMM %xmm14, \XMM1	# perform a 16 byte swap
	# combine GHASHed value with the corresponding ciphertext
	PSHUFB_XMM %xmm14, \XMM2	# perform a 16 byte swap
	PSHUFB_XMM %xmm14, \XMM3	# perform a 16 byte swap
	PSHUFB_XMM %xmm14, \XMM4	# perform a 16 byte swap

_initial_blocks_done\num_initial_blocks\operation:
.endm
/*
 * encrypt 4 blocks at a time
 * ghash the 4 previously encrypted ciphertext blocks
 * arg1, %arg2, %arg3 are used as pointers only, not modified
 * %r11 is the data offset value
 */
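
/*
 * Editor's note (illustrative): the macro below interleaves the AES rounds
 * of the next four counter blocks with the PCLMULQDQ/pshufd work that
 * hashes the previous four blocks -- roughly one AESENC between every one
 * or two multiply steps -- so the AES unit and the carry-less multiplier
 * stay busy at the same time instead of running the two passes back to back.
 */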
.macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \
TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
	movdqa	  SHUF_MASK(%rip), %xmm15
	# multiply TMP5 * HashKey using karatsuba
	pshufd	  $78, \XMM5, \TMP6
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	movdqa	  HashKey_4(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP4		# TMP4 = a1*b1
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	PSHUFB_XMM %xmm15, \XMM1		# perform a 16 byte swap
	PCLMULQDQ 0x00, \TMP5, \XMM5		# XMM5 = a0*b0
	PSHUFB_XMM %xmm15, \XMM2		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM3		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM4		# perform a 16 byte swap
	movdqa	  HashKey_4_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP6		# TMP6 = (a1+a0)*(b1+b0)
	movaps	  0x10(%arg1), \TMP1
	AESENC	  \TMP1, \XMM1			# Round 1
	movaps	  0x20(%arg1), \TMP1
	AESENC	  \TMP1, \XMM1			# Round 2
	pshufd	  $78, \XMM6, \TMP2
	movdqa	  HashKey_3(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1 * b1
	movaps	  0x30(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 3
	PCLMULQDQ 0x00, \TMP5, \XMM6		# XMM6 = a0*b0
	movaps	  0x40(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 4
	movdqa	  HashKey_3_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movaps	  0x50(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 5
	# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	pshufd	  $78, \XMM7, \TMP2
	movdqa	  HashKey_2(%rsp), \TMP5

	# Multiply TMP5 * HashKey using karatsuba

	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	movaps	  0x60(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 6
	PCLMULQDQ 0x00, \TMP5, \XMM7		# XMM7 = a0*b0
	movaps	  0x70(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 7
	movdqa	  HashKey_2_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movaps	  0x80(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 8
	# accumulate the results in TMP4:XMM5, TMP6 holds the middle part

	# Multiply XMM8 * HashKey
	# XMM8 and TMP5 hold the values for the two operands

	pshufd	  $78, \XMM8, \TMP2
	movdqa	  HashKey(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	movaps	  0x90(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 9
	PCLMULQDQ 0x00, \TMP5, \XMM8		# XMM8 = a0*b0
	shr	  $2,%eax			# 128->4, 192->6, 256->8
	sub	  $4,%eax			# 128->0, 192->2, 256->4
	jz	  aes_loop_par_enc_done

	AESENC	  \TMP3, %xmm\index

aes_loop_par_enc_done:
	AESENCLAST \TMP3, \XMM1			# Round 10
	AESENCLAST \TMP3, \XMM2
	AESENCLAST \TMP3, \XMM3
	AESENCLAST \TMP3, \XMM4
	movdqa	  HashKey_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movdqu	  (%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM1			# Ciphertext/Plaintext XOR EK
	movdqu	  16(%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM2			# Ciphertext/Plaintext XOR EK
	movdqu	  32(%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM3			# Ciphertext/Plaintext XOR EK
	movdqu	  48(%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM4			# Ciphertext/Plaintext XOR EK
	movdqu	  \XMM1, (%arg2,%r11,1)		# Write to the ciphertext buffer
	movdqu	  \XMM2, 16(%arg2,%r11,1)	# Write to the ciphertext buffer
	movdqu	  \XMM3, 32(%arg2,%r11,1)	# Write to the ciphertext buffer
	movdqu	  \XMM4, 48(%arg2,%r11,1)	# Write to the ciphertext buffer
	PSHUFB_XMM %xmm15, \XMM1		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM2		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM3		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM4		# perform a 16 byte swap

	pslldq	  $8, \TMP3			# left shift TMP3 2 DWs
	psrldq	  $8, \TMP2			# right shift TMP2 2 DWs
	pxor	  \TMP2, \TMP1			# accumulate the results in TMP1:XMM5

	# first phase of reduction

	# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
	pslld	  $31, \TMP2			# packed left shift << 31
	pslld	  $30, \TMP3			# packed left shift << 30
	pslld	  $25, \TMP4			# packed left shift << 25
	pxor	  \TMP3, \TMP2			# xor the shifted versions
	psrldq	  $4, \TMP5			# right shift T5 1 DW
	pslldq	  $12, \TMP2			# left shift T2 3 DWs

	# second phase of reduction

	movdqa	  \XMM5,\TMP2			# make 3 copies of XMM5 into TMP2, TMP3, TMP4
	psrld	  $1, \TMP2			# packed right shift >>1
	psrld	  $2, \TMP3			# packed right shift >>2
	psrld	  $7, \TMP4			# packed right shift >>7
	pxor	  \TMP3,\TMP2			# xor the shifted versions
	pxor	  \TMP1, \XMM5			# result is in XMM5
.endm
/*
 * decrypt 4 blocks at a time
 * ghash the 4 previously decrypted ciphertext blocks
 * arg1, %arg2, %arg3 are used as pointers only, not modified
 * %r11 is the data offset value
 */
.macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \
TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
	movdqa	  SHUF_MASK(%rip), %xmm15
	# multiply TMP5 * HashKey using karatsuba
	pshufd	  $78, \XMM5, \TMP6
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	movdqa	  HashKey_4(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP4		# TMP4 = a1*b1
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	PSHUFB_XMM %xmm15, \XMM1		# perform a 16 byte swap
	PCLMULQDQ 0x00, \TMP5, \XMM5		# XMM5 = a0*b0
	PSHUFB_XMM %xmm15, \XMM2		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM3		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM4		# perform a 16 byte swap
	movdqa	  HashKey_4_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP6		# TMP6 = (a1+a0)*(b1+b0)
	movaps	  0x10(%arg1), \TMP1
	AESENC	  \TMP1, \XMM1			# Round 1
	movaps	  0x20(%arg1), \TMP1
	AESENC	  \TMP1, \XMM1			# Round 2
	pshufd	  $78, \XMM6, \TMP2
	movdqa	  HashKey_3(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1 * b1
	movaps	  0x30(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 3
	PCLMULQDQ 0x00, \TMP5, \XMM6		# XMM6 = a0*b0
	movaps	  0x40(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 4
	movdqa	  HashKey_3_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movaps	  0x50(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 5
	# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	pshufd	  $78, \XMM7, \TMP2
	movdqa	  HashKey_2(%rsp), \TMP5

	# Multiply TMP5 * HashKey using karatsuba

	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	movaps	  0x60(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 6
	PCLMULQDQ 0x00, \TMP5, \XMM7		# XMM7 = a0*b0
	movaps	  0x70(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 7
	movdqa	  HashKey_2_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movaps	  0x80(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 8
	# accumulate the results in TMP4:XMM5, TMP6 holds the middle part

	# Multiply XMM8 * HashKey
	# XMM8 and TMP5 hold the values for the two operands

	pshufd	  $78, \XMM8, \TMP2
	movdqa	  HashKey(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	movaps	  0x90(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 9
	PCLMULQDQ 0x00, \TMP5, \XMM8		# XMM8 = a0*b0
	shr	  $2,%eax			# 128->4, 192->6, 256->8
	sub	  $4,%eax			# 128->0, 192->2, 256->4
	jz	  aes_loop_par_dec_done

	AESENC	  \TMP3, %xmm\index
	jnz	  aes_loop_par_dec

aes_loop_par_dec_done:
	MOVADQ	  (%r10), \TMP3
	AESENCLAST \TMP3, \XMM1			# last round
	AESENCLAST \TMP3, \XMM2
	AESENCLAST \TMP3, \XMM3
	AESENCLAST \TMP3, \XMM4
	movdqa	  HashKey_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movdqu	  (%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM1			# Ciphertext/Plaintext XOR EK
	movdqu	  \XMM1, (%arg2,%r11,1)		# Write to plaintext buffer
	movdqu	  16(%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM2			# Ciphertext/Plaintext XOR EK
	movdqu	  \XMM2, 16(%arg2,%r11,1)	# Write to plaintext buffer
	movdqu	  32(%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM3			# Ciphertext/Plaintext XOR EK
	movdqu	  \XMM3, 32(%arg2,%r11,1)	# Write to plaintext buffer
	movdqu	  48(%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM4			# Ciphertext/Plaintext XOR EK
	movdqu	  \XMM4, 48(%arg2,%r11,1)	# Write to plaintext buffer
	PSHUFB_XMM %xmm15, \XMM1		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM2		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM3		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM4		# perform a 16 byte swap

	pslldq	  $8, \TMP3			# left shift TMP3 2 DWs
	psrldq	  $8, \TMP2			# right shift TMP2 2 DWs
	pxor	  \TMP2, \TMP1			# accumulate the results in TMP1:XMM5

	# first phase of reduction

	# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
	pslld	  $31, \TMP2			# packed left shift << 31
	pslld	  $30, \TMP3			# packed left shift << 30
	pslld	  $25, \TMP4			# packed left shift << 25
	pxor	  \TMP3, \TMP2			# xor the shifted versions
	psrldq	  $4, \TMP5			# right shift T5 1 DW
	pslldq	  $12, \TMP2			# left shift T2 3 DWs

	# second phase of reduction

	movdqa	  \XMM5,\TMP2			# make 3 copies of XMM5 into TMP2, TMP3, TMP4
	psrld	  $1, \TMP2			# packed right shift >>1
	psrld	  $2, \TMP3			# packed right shift >>2
	psrld	  $7, \TMP4			# packed right shift >>7
	pxor	  \TMP3,\TMP2			# xor the shifted versions
	pxor	  \TMP1, \XMM5			# result is in XMM5
.endm
/* GHASH the last 4 ciphertext blocks. */
.macro	GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst

	# Multiply TMP6 * HashKey (using Karatsuba)

	pshufd	  $78, \XMM1, \TMP2
	movdqa	  HashKey_4(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP6		# TMP6 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM1		# XMM1 = a0*b0
	movdqa	  HashKey_4_k(%rsp), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movdqa	  \XMM1, \XMMDst
	movdqa	  \TMP2, \XMM1			# result in TMP6, XMMDst, XMM1

	# Multiply TMP1 * HashKey (using Karatsuba)

	pshufd	  $78, \XMM2, \TMP2
	movdqa	  HashKey_3(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM2		# XMM2 = a0*b0
	movdqa	  HashKey_3_k(%rsp), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	# results accumulated in TMP6, XMMDst, XMM1

	# Multiply TMP1 * HashKey (using Karatsuba)

	pshufd	  $78, \XMM3, \TMP2
	movdqa	  HashKey_2(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM3		# XMM3 = a0*b0
	movdqa	  HashKey_2_k(%rsp), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	pxor	  \TMP2, \XMM1			# results accumulated in TMP6, XMMDst, XMM1

	# Multiply TMP1 * HashKey (using Karatsuba)

	pshufd	  $78, \XMM4, \TMP2
	movdqa	  HashKey(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM4		# XMM4 = a0*b0
	movdqa	  HashKey_k(%rsp), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	# middle section of the temp results combined as in karatsuba algorithm
	pslldq	  $8, \TMP4			# left shift TMP4 2 DWs
	psrldq	  $8, \TMP2			# right shift TMP2 2 DWs
	# TMP6:XMMDst holds the result of the accumulated carry-less multiplications

	# first phase of the reduction
	movdqa	  \XMMDst, \TMP2
	movdqa	  \XMMDst, \TMP3
	movdqa	  \XMMDst, \TMP4
	# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
	pslld	  $31, \TMP2			# packed left shifting << 31
	pslld	  $30, \TMP3			# packed left shifting << 30
	pslld	  $25, \TMP4			# packed left shifting << 25
	pxor	  \TMP3, \TMP2			# xor the shifted versions
	psrldq	  $4, \TMP7			# right shift TMP7 1 DW
	pslldq	  $12, \TMP2			# left shift TMP2 3 DWs

	# second phase of the reduction
	movdqa	  \XMMDst, \TMP2
	# make 3 copies of XMMDst for doing 3 shift operations
	movdqa	  \XMMDst, \TMP3
	movdqa	  \XMMDst, \TMP4
	psrld	  $1, \TMP2			# packed right shift >> 1
	psrld	  $2, \TMP3			# packed right shift >> 2
	psrld	  $7, \TMP4			# packed right shift >> 7
	pxor	  \TMP3, \TMP2			# xor the shifted versions
	pxor	  \TMP6, \XMMDst		# reduced result is in XMMDst
.endm
/* Encryption of a single block
 */
.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
	shr	$2,%eax			# 128->4, 192->6, 256->8
	add	$5,%eax			# 128->9, 192->11, 256->13
	lea	16(%arg1), %r10		# get first expanded key address
	AESENCLAST	\TMP1,\XMM0
.endm
/*****************************************************************************
 * void aesni_gcm_dec(void *aes_ctx,     // AES Key schedule. Starts on a 16 byte boundary.
 *                    u8 *out,           // Plaintext output. Decrypt in-place is allowed.
 *                    const u8 *in,      // Ciphertext input
 *                    u64 plaintext_len, // Length of data in bytes for decryption.
 *                    u8 *iv,            // Pre-counter block j0: 4 byte salt (from Security Association)
 *                                       // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
 *                                       // concatenated with 0x00000001. 16-byte aligned pointer.
 *                    u8 *hash_subkey,   // H, the Hash sub key input. Data starts on a 16-byte boundary.
 *                    const u8 *aad,     // Additional Authentication Data (AAD)
 *                    u64 aad_len,       // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
 *                    u8 *auth_tag,      // Authenticated Tag output. The driver will compare this to the
 *                                       // given authentication tag and only return the plaintext if they match.
 *                    u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
 *                                       // (most likely), 12 or 8.
 *
 * keys:
 *	keys are pre-expanded and aligned to 16 bytes. we are using the first
 *	set of 11 keys in the data structure void *aes_ctx
 *
 * iv:
 *	0                   1                   2                   3
 *	0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
 *	+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *	|                        Salt  (From the SA)                    |
 *	+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *	|                     Initialization Vector                     |
 *	|         (This is the sequence number from IPSec header)       |
 *	+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *	|                              0x1                              |
 *	+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *
 * AAD:
 *	AAD padded to 128 bits with 0
 *	for example, assume AAD is a u32 vector
 *
 *	if AAD is 8 bytes:
 *	AAD[3] = {A0, A1};
 *	padded AAD in xmm register = {A1 A0 0 0}
 *
 *	0                   1                   2                   3
 *	0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
 *	+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *	|                          SPI (A1)                             |
 *	+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *	|                 32-bit Sequence Number (A0)                   |
 *	+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *	|                              0x0                              |
 *	+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *
 *                               AAD Format with 32-bit Sequence Number
 *
 *	if AAD is 12 bytes:
 *	AAD[3] = {A0, A1, A2};
 *	padded AAD in xmm register = {A2 A1 A0 0}
 *
 *	0                   1                   2                   3
 *	0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
 *	+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *	|                          SPI (A2)                             |
 *	+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *	|              64-bit Extended Sequence Number {A1,A0}          |
 *	|                                                               |
 *	+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *	|                              0x0                              |
 *	+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *
 *                       AAD Format with 64-bit Extended Sequence Number
 *
 * aadLen:
 *	from the definition of the spec, aadLen can only be 8 or 12 bytes.
 *	The code supports 16 too but for other sizes, the code will fail.
 *
 * TLen:
 *	from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
 *	For other sizes, the code will fail.
 *
 * poly = x^128 + x^127 + x^126 + x^121 + 1
 *
 *****************************************************************************/
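
/*
 * Editor's sketch (assumption-laden, not from the original file): a minimal
 * C-side call matching the prototype above, for an RFC4106-style setup.
 * The buffer names (salt, esp_iv, aes_ctx, hash_subkey, aad, cipher_in,
 * plain_out) are hypothetical; only the aesni_gcm_dec signature comes from
 * this file.
 *
 *	u8 iv[16];			// salt(4) || esp_iv(8) || 0x00000001
 *	u8 tag[16];
 *	memcpy(iv, salt, 4);
 *	memcpy(iv + 4, esp_iv, 8);
 *	*(__be32 *)(iv + 12) = cpu_to_be32(1);
 *	aesni_gcm_dec(aes_ctx, plain_out, cipher_in, cipher_len,
 *		      iv, hash_subkey, aad, 8, tag, 16);
 *	// the caller must then compare 'tag' against the received ICV
 */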
ENTRY(aesni_gcm_dec)
	/*
	 * states of %xmm registers %xmm6:%xmm15 not saved
	 * all %xmm registers are clobbered
	 */
	sub	$VARIABLE_OFFSET, %rsp
	and	$~63, %rsp		# align rsp to 64 bytes
	movdqu	(%r12), %xmm13		# %xmm13 = HashKey
	movdqa	SHUF_MASK(%rip), %xmm2
	PSHUFB_XMM %xmm2, %xmm13

	# Precompute HashKey<<1 (mod poly) from the hash key (required for GHASH)

	movdqa	%xmm13, %xmm2
	pshufd	$0x24, %xmm1, %xmm2
	pcmpeqd	TWOONE(%rip), %xmm2
	pand	POLY(%rip), %xmm2
	pxor	%xmm2, %xmm13		# %xmm13 holds the HashKey<<1 (mod poly)

	# Decrypt first few blocks

	movdqa	%xmm13, HashKey(%rsp)	# store HashKey<<1 (mod poly)
	mov	%arg4, %r13		# save the number of bytes of plaintext/ciphertext
	and	$-16, %r13		# %r13 = %r13 - (%r13 mod 16)
	jz	_initial_num_blocks_is_0_decrypt
	jb	_initial_num_blocks_is_1_decrypt
	je	_initial_num_blocks_is_2_decrypt
_initial_num_blocks_is_3_decrypt:
	INITIAL_BLOCKS_DEC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec
	jmp	_initial_blocks_decrypted
_initial_num_blocks_is_2_decrypt:
	INITIAL_BLOCKS_DEC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec
	jmp	_initial_blocks_decrypted
_initial_num_blocks_is_1_decrypt:
	INITIAL_BLOCKS_DEC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec
	jmp	_initial_blocks_decrypted
_initial_num_blocks_is_0_decrypt:
	INITIAL_BLOCKS_DEC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec
_initial_blocks_decrypted:
	je	_zero_cipher_left_decrypt
	je	_four_cipher_left_decrypt
	GHASH_4_ENCRYPT_4_PARALLEL_DEC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, dec
_four_cipher_left_decrypt:
	GHASH_LAST_4	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
_zero_cipher_left_decrypt:
	and	$15, %r13		# %r13 = arg4 (mod 16)
	je	_multiple_of_16_bytes_decrypt

	# Handle the last <16 byte block separately

	paddd	ONE(%rip), %xmm0	# increment CNT to get Yn
	movdqa	SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm0

	ENCRYPT_SINGLE_BLOCK	%xmm0, %xmm1	# E(K, Yn)
	movdqu	(%arg3,%r11,1), %xmm1	# receive the last <16 byte block
	lea	SHIFT_MASK+16(%rip), %r12

	# adjust the shuffle mask pointer to be able to shift 16-%r13 bytes
	# (%r13 is the number of bytes in plaintext mod 16)
	movdqu	(%r12), %xmm2		# get the appropriate shuffle mask
	PSHUFB_XMM %xmm2, %xmm1		# right shift 16-%r13 bytes
	pxor	%xmm1, %xmm0		# Ciphertext XOR E(K, Yn)
	movdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
	# get the appropriate mask to mask out top 16-%r13 bytes of %xmm0
	pand	%xmm1, %xmm0		# mask out top 16-%r13 bytes of %xmm0
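	# Editor's note (illustrative, hypothetical value %r13 = 5): %r12 then
	# points 5 bytes below SHIFT_MASK+16, so the 16-byte load at
	# ALL_F-SHIFT_MASK(%r12) starts 11 bytes into ALL_F and, because ZERO
	# follows ALL_F in memory, yields 0xff in exactly the 5 low byte
	# positions -- this is why the constant ordering noted above matters.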
	movdqa	SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10 ,%xmm2

	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
	# GHASH computation for the last <16 byte block

	MOVQ_R64_XMM	%xmm0, %rax
	jle	_less_than_8_bytes_left_decrypt
	mov	%rax, (%arg2 , %r11, 1)
	MOVQ_R64_XMM	%xmm0, %rax
_less_than_8_bytes_left_decrypt:
	mov	%al, (%arg2, %r11, 1)
	jne	_less_than_8_bytes_left_decrypt
_multiple_of_16_bytes_decrypt:
	mov	arg8, %r12		# %r12 = aadLen (number of bytes)
	shl	$3, %r12		# convert into number of bits
	movd	%r12d, %xmm15		# len(A) in %xmm15
	shl	$3, %arg4		# len(C) in bits (*128)
	MOVQ_R64_XMM	%arg4, %xmm1
	pslldq	$8, %xmm15		# %xmm15 = len(A)||0x0000000000000000
	pxor	%xmm1, %xmm15		# %xmm15 = len(A)||len(C)
	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
	# final GHASH computation
	movdqa	SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm8

	mov	%arg5, %rax		# %rax = *Y0
	movdqu	(%rax), %xmm0		# %xmm0 = Y0
	ENCRYPT_SINGLE_BLOCK	%xmm0, %xmm1	# E(K, Y0)

	mov	arg9, %r10		# %r10 = authTag
	mov	arg10, %r11		# %r11 = auth_tag_len
	MOVQ_R64_XMM	%xmm0, %rax
	jmp	_return_T_done_decrypt
	MOVQ_R64_XMM	%xmm0, %rax
	jmp	_return_T_done_decrypt
	movdqu	%xmm0, (%r10)
_return_T_done_decrypt:
ENDPROC(aesni_gcm_dec)
/*****************************************************************************
 * void aesni_gcm_enc(void *aes_ctx,     // AES Key schedule. Starts on a 16 byte boundary.
 *                    u8 *out,           // Ciphertext output. Encrypt in-place is allowed.
 *                    const u8 *in,      // Plaintext input
 *                    u64 plaintext_len, // Length of data in bytes for encryption.
 *                    u8 *iv,            // Pre-counter block j0: 4 byte salt (from Security Association)
 *                                       // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
 *                                       // concatenated with 0x00000001. 16-byte aligned pointer.
 *                    u8 *hash_subkey,   // H, the Hash sub key input. Data starts on a 16-byte boundary.
 *                    const u8 *aad,     // Additional Authentication Data (AAD)
 *                    u64 aad_len,       // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
 *                    u8 *auth_tag,      // Authenticated Tag output.
 *                    u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
 *                                       // 12 or 8.
 *
 * keys:
 *	keys are pre-expanded and aligned to 16 bytes. we are using the
 *	first set of 11 keys in the data structure void *aes_ctx
 *
 * iv:
 *	0                   1                   2                   3
 *	0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
 *	+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *	|                        Salt  (From the SA)                    |
 *	+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *	|                     Initialization Vector                     |
 *	|         (This is the sequence number from IPSec header)       |
 *	+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *	|                              0x1                              |
 *	+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *
 * AAD:
 *	AAD padded to 128 bits with 0
 *	for example, assume AAD is a u32 vector
 *
 *	if AAD is 8 bytes:
 *	AAD[3] = {A0, A1};
 *	padded AAD in xmm register = {A1 A0 0 0}
 *
 *	0                   1                   2                   3
 *	0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
 *	+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *	|                          SPI (A1)                             |
 *	+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *	|                 32-bit Sequence Number (A0)                   |
 *	+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *	|                              0x0                              |
 *	+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *
 *                               AAD Format with 32-bit Sequence Number
 *
 *	if AAD is 12 bytes:
 *	AAD[3] = {A0, A1, A2};
 *	padded AAD in xmm register = {A2 A1 A0 0}
 *
 *	0                   1                   2                   3
 *	0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
 *	+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *	|                          SPI (A2)                             |
 *	+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *	|              64-bit Extended Sequence Number {A1,A0}          |
 *	|                                                               |
 *	+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *	|                              0x0                              |
 *	+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *
 *                       AAD Format with 64-bit Extended Sequence Number
 *
 * aadLen:
 *	from the definition of the spec, aadLen can only be 8 or 12 bytes.
 *	The code supports 16 too but for other sizes, the code will fail.
 *
 * TLen:
 *	from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
 *	For other sizes, the code will fail.
 *
 * poly = x^128 + x^127 + x^126 + x^121 + 1
 ***************************************************************************/
ENTRY(aesni_gcm_enc)
	# states of %xmm registers %xmm6:%xmm15 not saved
	# all %xmm registers are clobbered
	sub	$VARIABLE_OFFSET, %rsp
	movdqu	(%r12), %xmm13
	movdqa	SHUF_MASK(%rip), %xmm2
	PSHUFB_XMM %xmm2, %xmm13

	# precompute HashKey<<1 mod poly from the HashKey (required for GHASH)

	movdqa	%xmm13, %xmm2
	pshufd	$0x24, %xmm1, %xmm2
	pcmpeqd	TWOONE(%rip), %xmm2
	pand	POLY(%rip), %xmm2
	movdqa	%xmm13, HashKey(%rsp)	# %xmm13 holds the HashKey<<1 (mod poly)
	mov	%arg4, %r13		# save the number of bytes of plaintext/ciphertext

	# Encrypt first few blocks

	jz	_initial_num_blocks_is_0_encrypt
	jb	_initial_num_blocks_is_1_encrypt
	je	_initial_num_blocks_is_2_encrypt
_initial_num_blocks_is_3_encrypt:
	INITIAL_BLOCKS_ENC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc
	jmp	_initial_blocks_encrypted
_initial_num_blocks_is_2_encrypt:
	INITIAL_BLOCKS_ENC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc
	jmp	_initial_blocks_encrypted
_initial_num_blocks_is_1_encrypt:
	INITIAL_BLOCKS_ENC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc
	jmp	_initial_blocks_encrypted
_initial_num_blocks_is_0_encrypt:
	INITIAL_BLOCKS_ENC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc
_initial_blocks_encrypted:

	# Main loop - Encrypt remaining blocks

	je	_zero_cipher_left_encrypt
	je	_four_cipher_left_encrypt
_encrypt_by_4_encrypt:
	GHASH_4_ENCRYPT_4_PARALLEL_ENC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc
	jne	_encrypt_by_4_encrypt
_four_cipher_left_encrypt:
	GHASH_LAST_4	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
_zero_cipher_left_encrypt:
	and	$15, %r13		# %r13 = arg4 (mod 16)
	je	_multiple_of_16_bytes_encrypt

	# Handle the last <16 Byte block separately
	paddd	ONE(%rip), %xmm0	# INCR CNT to get Yn
	movdqa	SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm0

	ENCRYPT_SINGLE_BLOCK	%xmm0, %xmm1	# Encrypt(K, Yn)
	movdqu	(%arg3,%r11,1), %xmm1	# receive the last <16 byte blocks
	lea	SHIFT_MASK+16(%rip), %r12

	# adjust the shuffle mask pointer to be able to shift 16-%r13 bytes
	# (%r13 is the number of bytes in plaintext mod 16)
	movdqu	(%r12), %xmm2		# get the appropriate shuffle mask
	PSHUFB_XMM %xmm2, %xmm1		# shift right 16-%r13 bytes
	pxor	%xmm1, %xmm0		# Plaintext XOR Encrypt(K, Yn)
	movdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
	# get the appropriate mask to mask out top 16-%r13 bytes of %xmm0
	pand	%xmm1, %xmm0		# mask out top 16-%r13 bytes of %xmm0
	movdqa	SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10 ,%xmm0

	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
	# GHASH computation for the last <16 byte block
	movdqa	SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm0

	# shuffle xmm0 back to output as ciphertext

	MOVQ_R64_XMM	%xmm0, %rax
	jle	_less_than_8_bytes_left_encrypt
	mov	%rax, (%arg2, %r11, 1)
	MOVQ_R64_XMM	%xmm0, %rax
_less_than_8_bytes_left_encrypt:
	mov	%al, (%arg2, %r11, 1)
	jne	_less_than_8_bytes_left_encrypt
_multiple_of_16_bytes_encrypt:
	mov	arg8, %r12		# %r12 = aadLen (number of bytes)
	movd	%r12d, %xmm15		# len(A) in %xmm15
	shl	$3, %arg4		# len(C) in bits (*128)
	MOVQ_R64_XMM	%arg4, %xmm1
	pslldq	$8, %xmm15		# %xmm15 = len(A)||0x0000000000000000
	pxor	%xmm1, %xmm15		# %xmm15 = len(A)||len(C)
	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
	# final GHASH computation
	movdqa	SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm8	# perform a 16 byte swap

	mov	%arg5, %rax		# %rax = *Y0
	movdqu	(%rax), %xmm0		# %xmm0 = Y0
	ENCRYPT_SINGLE_BLOCK	%xmm0, %xmm15	# Encrypt(K, Y0)

	mov	arg9, %r10		# %r10 = authTag
	mov	arg10, %r11		# %r11 = auth_tag_len
	MOVQ_R64_XMM	%xmm0, %rax
	jmp	_return_T_done_encrypt
	MOVQ_R64_XMM	%xmm0, %rax
	jmp	_return_T_done_encrypt
	movdqu	%xmm0, (%r10)
_return_T_done_encrypt:
ENDPROC(aesni_gcm_enc)
_key_expansion_256a:
	pshufd $0b11111111, %xmm1, %xmm1
	shufps $0b00010000, %xmm0, %xmm4
	shufps $0b10001100, %xmm0, %xmm4
	movaps %xmm0, (TKEYP)
ENDPROC(_key_expansion_128)
ENDPROC(_key_expansion_256a)

_key_expansion_192a:
	pshufd $0b01010101, %xmm1, %xmm1
	shufps $0b00010000, %xmm0, %xmm4
	shufps $0b10001100, %xmm0, %xmm4
	pshufd $0b11111111, %xmm0, %xmm3
	shufps $0b01000100, %xmm0, %xmm6
	movaps %xmm6, (TKEYP)
	shufps $0b01001110, %xmm2, %xmm1
	movaps %xmm1, 0x10(TKEYP)
ENDPROC(_key_expansion_192a)

_key_expansion_192b:
	pshufd $0b01010101, %xmm1, %xmm1
	shufps $0b00010000, %xmm0, %xmm4
	shufps $0b10001100, %xmm0, %xmm4
	pshufd $0b11111111, %xmm0, %xmm3
	movaps %xmm0, (TKEYP)
ENDPROC(_key_expansion_192b)

_key_expansion_256b:
	pshufd $0b10101010, %xmm1, %xmm1
	shufps $0b00010000, %xmm2, %xmm4
	shufps $0b10001100, %xmm2, %xmm4
	movaps %xmm2, (TKEYP)
ENDPROC(_key_expansion_256b)
/*
 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
 *		     unsigned int key_len)
 */
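
/*
 * Editor's sketch (assumptions flagged): a minimal C caller for the key
 * setup below paired with the single-block aesni_enc() declared further
 * down; struct crypto_aes_ctx comes from <crypto/aes.h>, the key/in/out
 * buffers are hypothetical.
 *
 *	struct crypto_aes_ctx ctx __aligned(16);
 *	u8 out[16];
 *	aesni_set_key(&ctx, key, 16);	// expand a 128-bit key
 *	aesni_enc(&ctx, out, in);	// encrypt one 16-byte block
 */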
ENTRY(aesni_set_key)
	movl (FRAME_OFFSET+8)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+12)(%esp), UKEYP	# in_key
	movl (FRAME_OFFSET+16)(%esp), %edx	# key_len
	movups (UKEYP), %xmm0		# user key (first 16 bytes)
	movaps %xmm0, (KEYP)
	lea 0x10(KEYP), TKEYP		# key addr
	movl %edx, 480(KEYP)
	pxor %xmm4, %xmm4		# xmm4 is assumed 0 in _key_expansion_x
	movups 0x10(UKEYP), %xmm2	# other user key
	movaps %xmm2, (TKEYP)
	AESKEYGENASSIST 0x1 %xmm2 %xmm1		# round 1
	call _key_expansion_256a
	AESKEYGENASSIST 0x1 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x2 %xmm2 %xmm1		# round 2
	call _key_expansion_256a
	AESKEYGENASSIST 0x2 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x4 %xmm2 %xmm1		# round 3
	call _key_expansion_256a
	AESKEYGENASSIST 0x4 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x8 %xmm2 %xmm1		# round 4
	call _key_expansion_256a
	AESKEYGENASSIST 0x8 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x10 %xmm2 %xmm1	# round 5
	call _key_expansion_256a
	AESKEYGENASSIST 0x10 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x20 %xmm2 %xmm1	# round 6
	call _key_expansion_256a
	AESKEYGENASSIST 0x20 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x40 %xmm2 %xmm1	# round 7
	call _key_expansion_256a
	movq 0x10(UKEYP), %xmm2		# other user key
	AESKEYGENASSIST 0x1 %xmm2 %xmm1		# round 1
	call _key_expansion_192a
	AESKEYGENASSIST 0x2 %xmm2 %xmm1		# round 2
	call _key_expansion_192b
	AESKEYGENASSIST 0x4 %xmm2 %xmm1		# round 3
	call _key_expansion_192a
	AESKEYGENASSIST 0x8 %xmm2 %xmm1		# round 4
	call _key_expansion_192b
	AESKEYGENASSIST 0x10 %xmm2 %xmm1	# round 5
	call _key_expansion_192a
	AESKEYGENASSIST 0x20 %xmm2 %xmm1	# round 6
	call _key_expansion_192b
	AESKEYGENASSIST 0x40 %xmm2 %xmm1	# round 7
	call _key_expansion_192a
	AESKEYGENASSIST 0x80 %xmm2 %xmm1	# round 8
	call _key_expansion_192b
	AESKEYGENASSIST 0x1 %xmm0 %xmm1		# round 1
	call _key_expansion_128
	AESKEYGENASSIST 0x2 %xmm0 %xmm1		# round 2
	call _key_expansion_128
	AESKEYGENASSIST 0x4 %xmm0 %xmm1		# round 3
	call _key_expansion_128
	AESKEYGENASSIST 0x8 %xmm0 %xmm1		# round 4
	call _key_expansion_128
	AESKEYGENASSIST 0x10 %xmm0 %xmm1	# round 5
	call _key_expansion_128
	AESKEYGENASSIST 0x20 %xmm0 %xmm1	# round 6
	call _key_expansion_128
	AESKEYGENASSIST 0x40 %xmm0 %xmm1	# round 7
	call _key_expansion_128
	AESKEYGENASSIST 0x80 %xmm0 %xmm1	# round 8
	call _key_expansion_128
	AESKEYGENASSIST 0x1b %xmm0 %xmm1	# round 9
	call _key_expansion_128
	AESKEYGENASSIST 0x36 %xmm0 %xmm1	# round 10
	call _key_expansion_128
	movaps (KEYP), %xmm0
	movaps (TKEYP), %xmm1
	movaps %xmm0, 240(TKEYP)
	movaps %xmm1, 240(KEYP)
	lea 240-16(TKEYP), UKEYP
	movaps (KEYP), %xmm0
	movaps %xmm1, (UKEYP)
ENDPROC(aesni_set_key)
/*
 * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
 */
	movl (FRAME_OFFSET+12)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+16)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+20)(%esp), INP	# src
	movl 480(KEYP), KLEN		# key length
	movups (INP), STATE		# input
	movups STATE, (OUTP)		# output
/*
 * _aesni_enc1:	internal ABI
 * input:
 *	KEYP:	key struct pointer
 *	STATE:	initial state (input)
 * output:
 *	STATE:	final state (output)
 */
	movaps (KEYP), KEY		# key
	pxor KEY, STATE			# round 0
	lea 0x20(TKEYP), TKEYP
	movaps -0x60(TKEYP), KEY
	movaps -0x50(TKEYP), KEY
	movaps -0x40(TKEYP), KEY
	movaps -0x30(TKEYP), KEY
	movaps -0x20(TKEYP), KEY
	movaps -0x10(TKEYP), KEY
	movaps 0x10(TKEYP), KEY
	movaps 0x20(TKEYP), KEY
	movaps 0x30(TKEYP), KEY
	movaps 0x40(TKEYP), KEY
	movaps 0x50(TKEYP), KEY
	movaps 0x60(TKEYP), KEY
	movaps 0x70(TKEYP), KEY
	AESENCLAST KEY STATE
ENDPROC(_aesni_enc1)
/*
 * _aesni_enc4:	internal ABI
 * input:
 *	KEYP:	key struct pointer
 *	STATE1:	initial state (input)
 * output:
 *	STATE1:	final state (output)
 */
	movaps (KEYP), KEY		# key
	pxor KEY, STATE1		# round 0
	lea 0x20(TKEYP), TKEYP
	movaps -0x60(TKEYP), KEY
	movaps -0x50(TKEYP), KEY
	movaps -0x40(TKEYP), KEY
	movaps -0x30(TKEYP), KEY
	movaps -0x20(TKEYP), KEY
	movaps -0x10(TKEYP), KEY
	movaps 0x10(TKEYP), KEY
	movaps 0x20(TKEYP), KEY
	movaps 0x30(TKEYP), KEY
	movaps 0x40(TKEYP), KEY
	movaps 0x50(TKEYP), KEY
	movaps 0x60(TKEYP), KEY
	movaps 0x70(TKEYP), KEY
	AESENCLAST KEY STATE1		# last round
	AESENCLAST KEY STATE2
	AESENCLAST KEY STATE3
	AESENCLAST KEY STATE4
ENDPROC(_aesni_enc4)
/*
 * void aesni_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
 */
	movl (FRAME_OFFSET+12)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+16)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+20)(%esp), INP	# src
	mov 480(KEYP), KLEN		# key length
	movups (INP), STATE		# input
	movups STATE, (OUTP)		# output
/*
 * _aesni_dec1:	internal ABI
 * input:
 *	KEYP:	key struct pointer
 *	STATE:	initial state (input)
 * output:
 *	STATE:	final state (output)
 */
	movaps (KEYP), KEY		# key
	pxor KEY, STATE			# round 0
	lea 0x20(TKEYP), TKEYP
	movaps -0x60(TKEYP), KEY
	movaps -0x50(TKEYP), KEY
	movaps -0x40(TKEYP), KEY
	movaps -0x30(TKEYP), KEY
	movaps -0x20(TKEYP), KEY
	movaps -0x10(TKEYP), KEY
	movaps 0x10(TKEYP), KEY
	movaps 0x20(TKEYP), KEY
	movaps 0x30(TKEYP), KEY
	movaps 0x40(TKEYP), KEY
	movaps 0x50(TKEYP), KEY
	movaps 0x60(TKEYP), KEY
	movaps 0x70(TKEYP), KEY
	AESDECLAST KEY STATE
ENDPROC(_aesni_dec1)
/*
 * _aesni_dec4:	internal ABI
 * input:
 *	KEYP:	key struct pointer
 *	STATE1:	initial state (input)
 * output:
 *	STATE1:	final state (output)
 */
	movaps (KEYP), KEY		# key
	pxor KEY, STATE1		# round 0
	lea 0x20(TKEYP), TKEYP
	movaps -0x60(TKEYP), KEY
	movaps -0x50(TKEYP), KEY
	movaps -0x40(TKEYP), KEY
	movaps -0x30(TKEYP), KEY
	movaps -0x20(TKEYP), KEY
	movaps -0x10(TKEYP), KEY
	movaps 0x10(TKEYP), KEY
	movaps 0x20(TKEYP), KEY
	movaps 0x30(TKEYP), KEY
	movaps 0x40(TKEYP), KEY
	movaps 0x50(TKEYP), KEY
	movaps 0x60(TKEYP), KEY
	movaps 0x70(TKEYP), KEY
	AESDECLAST KEY STATE1		# last round
	AESDECLAST KEY STATE2
	AESDECLAST KEY STATE3
	AESDECLAST KEY STATE4
ENDPROC(_aesni_dec4)
/*
 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len)
 */
ENTRY(aesni_ecb_enc)
	movl (FRAME_OFFSET+16)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+20)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+24)(%esp), INP	# src
	movl (FRAME_OFFSET+28)(%esp), LEN	# len
	test LEN, LEN			# check length
	movups (INP), STATE1
	movups 0x10(INP), STATE2
	movups 0x20(INP), STATE3
	movups 0x30(INP), STATE4
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	movups (INP), STATE1
	movups STATE1, (OUTP)
ENDPROC(aesni_ecb_enc)
/*
 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len)
 */
ENTRY(aesni_ecb_dec)
	movl (FRAME_OFFSET+16)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+20)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+24)(%esp), INP	# src
	movl (FRAME_OFFSET+28)(%esp), LEN	# len
	movups (INP), STATE1
	movups 0x10(INP), STATE2
	movups 0x20(INP), STATE3
	movups 0x30(INP), STATE4
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	movups (INP), STATE1
	movups STATE1, (OUTP)
ENDPROC(aesni_ecb_dec)
/*
 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 */
ENTRY(aesni_cbc_enc)
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
	movups (IVP), STATE		# load iv as initial state
	movups (INP), IN		# load input
	movups STATE, (OUTP)		# store output
ENDPROC(aesni_cbc_enc)
/*
 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 */
ENTRY(aesni_cbc_dec)
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
	jb .Lcbc_dec_just_ret
	movups 0x10(INP), IN2
	movups 0x20(INP), IN3
	movups 0x30(INP), IN4
	movups 0x20(INP), IN1
	movups 0x30(INP), IN2
	movups 0x10(INP), IN2
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	movups STATE, (OUTP)
ENDPROC(aesni_cbc_dec)
.Lbswap_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
/*
 * _aesni_inc_init:	internal ABI
 *	setup registers used by _aesni_inc
 * output:
 *	CTR:	== IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 *	INC:	== 1, in little endian
 *	BSWAP_MASK == endian swapping mask
 */
	movaps .Lbswap_mask(%rip), BSWAP_MASK
	PSHUFB_XMM BSWAP_MASK CTR
	MOVQ_R64_XMM TCTR_LOW INC
	MOVQ_R64_XMM CTR TCTR_LOW
ENDPROC(_aesni_inc_init)
/*
 * _aesni_inc:	internal ABI
 *	Increase IV by 1, IV is in big endian
 * input:
 *	CTR:	== IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 *	INC:	== 1, in little endian
 *	BSWAP_MASK == endian swapping mask
 * output:
 *	CTR:	== output IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 */
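
/*
 * Editor's model (illustrative C, assumptions flagged): what _aesni_inc
 * computes, expressed on a byte array; the function name is hypothetical.
 *
 *	static void ctr_inc_be(u8 iv[16])
 *	{
 *		int i;
 *		for (i = 15; i >= 0; i--)	// big-endian +1 with carry
 *			if (++iv[i])
 *				break;
 *	}
 *
 * the assembly instead keeps CTR in little-endian form (per the contract
 * above), so the increment is plain integer arithmetic, and only byte-swaps
 * back into IV (the PSHUFB below) when the counter block is needed.
 */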
	PSHUFB_XMM BSWAP_MASK IV
/*
 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 */
ENTRY(aesni_ctr_enc)
	jb .Lctr_enc_just_ret
	call _aesni_inc_init
	movups 0x10(INP), IN2
	movups 0x20(INP), IN3
	movups 0x30(INP), IN4
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	movups STATE, (OUTP)
ENDPROC(aesni_ctr_enc)
/*
 * _aesni_gf128mul_x_ble:	internal ABI
 *	Multiply in GF(2^128) for XTS IVs
 *	GF128MUL_MASK == mask with 0x87 and 0x01
 * changed:
 *	CTR:	== temporary value
 */
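
/*
 * Editor's model (illustrative C, assumptions flagged): the multiply-by-x
 * on a little-endian ("ble") 128-bit XTS tweak that the macro below
 * implements; the function name and the u64[2] view are hypothetical.
 *
 *	static void gf128mul_x_ble(u64 t[2])
 *	{
 *		u64 carry = t[1] >> 63;			// bit shifted out on top
 *		t[1] = (t[1] << 1) | (t[0] >> 63);
 *		t[0] = (t[0] << 1) ^ (carry * 0x87);	// fold: x^128 = x^7+x^2+x+1
 *	}
 *
 * the pshufd/pand/pxor sequence in the macro achieves the same effect
 * branchlessly with SSE masks built from GF128MUL_MASK ({0x87, 0x01}).
 */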
#define _aesni_gf128mul_x_ble() \
	pshufd $0x13, IV, CTR; \
	paddq IV, IV; \
	psrad $31, CTR; \
	pand GF128MUL_MASK, CTR; \
	pxor CTR, IV;
/*
 * void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *			 bool enc, u8 *iv)
 */
ENTRY(aesni_xts_crypt8)
	leaq _aesni_enc4(%rip), %r11
	leaq _aesni_dec4(%rip), %rax
	movdqa .Lgf128mul_x_ble_mask(%rip), GF128MUL_MASK
	movdqu 0x00(INP), INC
	movdqu IV, 0x00(OUTP)
	_aesni_gf128mul_x_ble()
	movdqu 0x10(INP), INC
	movdqu IV, 0x10(OUTP)
	_aesni_gf128mul_x_ble()
	movdqu 0x20(INP), INC
	movdqu IV, 0x20(OUTP)
	_aesni_gf128mul_x_ble()
	movdqu 0x30(INP), INC
	movdqu IV, 0x30(OUTP)
	movdqu 0x00(OUTP), INC
	movdqu STATE1, 0x00(OUTP)
	_aesni_gf128mul_x_ble()
	movdqu 0x40(INP), INC
	movdqu IV, 0x40(OUTP)
	movdqu 0x10(OUTP), INC
	movdqu STATE2, 0x10(OUTP)
	_aesni_gf128mul_x_ble()
	movdqu 0x50(INP), INC
	movdqu IV, 0x50(OUTP)
	movdqu 0x20(OUTP), INC
	movdqu STATE3, 0x20(OUTP)
	_aesni_gf128mul_x_ble()
	movdqu 0x60(INP), INC
	movdqu IV, 0x60(OUTP)
	movdqu 0x30(OUTP), INC
	movdqu STATE4, 0x30(OUTP)
	_aesni_gf128mul_x_ble()
	movdqu 0x70(INP), INC
	movdqu IV, 0x70(OUTP)
	_aesni_gf128mul_x_ble()
	movdqu 0x40(OUTP), INC
	movdqu STATE1, 0x40(OUTP)
	movdqu 0x50(OUTP), INC
	movdqu STATE2, 0x50(OUTP)
	movdqu 0x60(OUTP), INC
	movdqu STATE3, 0x60(OUTP)
	movdqu 0x70(OUTP), INC
	movdqu STATE4, 0x70(OUTP)
ENDPROC(aesni_xts_crypt8)