third_party/aesni-intel/aesni-intel_asm.c
1 /*
2 * Implement AES algorithm in Intel AES-NI instructions.
4 * The white paper of AES-NI instructions can be downloaded from:
5 * http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
7 * Copyright (C) 2008, Intel Corp.
8 * Author: Huang Ying <ying.huang@intel.com>
9 * Vinodh Gopal <vinodh.gopal@intel.com>
10 * Kahraman Akdemir
12 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
13 * interface for 64-bit kernels.
14 * Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
15 * Aidan O'Mahony (aidan.o.mahony@intel.com)
16 * Adrian Hoban <adrian.hoban@intel.com>
17 * James Guilford (james.guilford@intel.com)
18 * Gabriele Paoloni <gabriele.paoloni@intel.com>
19 * Tadeusz Struk (tadeusz.struk@intel.com)
20 * Wajdi Feghali (wajdi.k.feghali@intel.com)
21 * Copyright (c) 2010, Intel Corporation.
23 * Ported x86_64 version to x86:
24 * Author: Mathias Krause <minipli@googlemail.com>
26 * This program is free software; you can redistribute it and/or modify
27 * it under the terms of the GNU General Public License as published by
28 * the Free Software Foundation; either version 2 of the License, or
29 * (at your option) any later version.
32 #define ENTRY(name) \
33 .globl name ; \
34 .align 4,0x90 ; \
35 name:
36 #define ENDPROC(name) \
37 .type name, @function ; \
38 .size name, .-name
40 #define FRAME_BEGIN
41 #define FRAME_END
42 #define FRAME_OFFSET 0
44 #include "inst-intel.h"
47 * The following macros are used to move an (un)aligned 16 byte value to/from
48 * an XMM register. This can be done for either FP or integer values: for FP
49 * use movaps (move aligned packed single), for integer use movdqa (move
50 * double quadword aligned). Since Nehalem (the original Core i7) there has
51 * been no performance difference between the two, but movaps is one byte
52 * shorter, so that is the one used here (likewise movups for the unaligned case).
54 #define MOVADQ movaps
55 #define MOVUDQ movups
57 #ifdef __x86_64__
59 .data
60 .align 16
61 .Lgf128mul_x_ble_mask:
62 .octa 0x00000000000000010000000000000087
63 POLY: .octa 0xC2000000000000000000000000000001
64 TWOONE: .octa 0x00000001000000000000000000000001
66 # order of these constants should not change.
67 # more specifically, ALL_F should follow SHIFT_MASK,
68 # and ZERO should follow ALL_F
70 SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F
71 MASK1: .octa 0x0000000000000000ffffffffffffffff
72 MASK2: .octa 0xffffffffffffffff0000000000000000
73 SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
74 ALL_F: .octa 0xffffffffffffffffffffffffffffffff
75 ZERO: .octa 0x00000000000000000000000000000000
76 ONE: .octa 0x00000000000000000000000000000001
77 F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
78 dec: .octa 0x1
79 enc: .octa 0x2
82 .text
85 #define STACK_OFFSET 8*3
86 #define HashKey 16*0 // store HashKey <<1 mod poly here
87 #define HashKey_2 16*1 // store HashKey^2 <<1 mod poly here
88 #define HashKey_3 16*2 // store HashKey^3 <<1 mod poly here
89 #define HashKey_4 16*3 // store HashKey^4 <<1 mod poly here
90 #define HashKey_k 16*4 // store XOR of High 64 bits and Low 64
91 // bits of HashKey <<1 mod poly here
92 //(for Karatsuba purposes)
93 #define HashKey_2_k 16*5 // store XOR of High 64 bits and Low 64
94 // bits of HashKey^2 <<1 mod poly here
95 // (for Karatsuba purposes)
96 #define HashKey_3_k 16*6 // store XOR of High 64 bits and Low 64
97 // bits of HashKey^3 <<1 mod poly here
98 // (for Karatsuba purposes)
99 #define HashKey_4_k 16*7 // store XOR of High 64 bits and Low 64
100 // bits of HashKey^4 <<1 mod poly here
101 // (for Karatsuba purposes)
102 #define VARIABLE_OFFSET 16*8
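/*
 * For orientation only: the eight 16-byte stack slots above can be pictured as
 * the following hypothetical C layout.  The struct and its field names are
 * illustrative and do not exist in this file or its callers.
 *
 *     struct gcm_hashkey_table {
 *         uint8_t hash_key[16];      // HashKey   <<1 mod poly
 *         uint8_t hash_key_2[16];    // HashKey^2 <<1 mod poly
 *         uint8_t hash_key_3[16];    // HashKey^3 <<1 mod poly
 *         uint8_t hash_key_4[16];    // HashKey^4 <<1 mod poly
 *         uint8_t hash_key_k[16];    // high64 ^ low64 of hash_key   (Karatsuba)
 *         uint8_t hash_key_2_k[16];  // high64 ^ low64 of hash_key_2 (Karatsuba)
 *         uint8_t hash_key_3_k[16];  // high64 ^ low64 of hash_key_3 (Karatsuba)
 *         uint8_t hash_key_4_k[16];  // high64 ^ low64 of hash_key_4 (Karatsuba)
 *     };                             // sizeof == VARIABLE_OFFSET == 16*8
 */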
104 #define arg1 rdi
105 #define arg2 rsi
106 #define arg3 rdx
107 #define arg4 rcx
108 #define arg5 r8
109 #define arg6 r9
110 #define arg7 STACK_OFFSET+8(%r14)
111 #define arg8 STACK_OFFSET+16(%r14)
112 #define arg9 STACK_OFFSET+24(%r14)
113 #define arg10 STACK_OFFSET+32(%r14)
114 #define keysize 2*15*16(%arg1)
115 #endif
118 #define STATE1 %xmm0
119 #define STATE2 %xmm4
120 #define STATE3 %xmm5
121 #define STATE4 %xmm6
122 #define STATE STATE1
123 #define IN1 %xmm1
124 #define IN2 %xmm7
125 #define IN3 %xmm8
126 #define IN4 %xmm9
127 #define IN IN1
128 #define KEY %xmm2
129 #define IV %xmm3
131 #define BSWAP_MASK %xmm10
132 #define CTR %xmm11
133 #define INC %xmm12
135 #define GF128MUL_MASK %xmm10
137 #ifdef __x86_64__
138 #define AREG %rax
139 #define KEYP %rdi
140 #define OUTP %rsi
141 #define UKEYP OUTP
142 #define INP %rdx
143 #define LEN %rcx
144 #define IVP %r8
145 #define KLEN %r9d
146 #define T1 %r10
147 #define TKEYP T1
148 #define T2 %r11
149 #define TCTR_LOW T2
150 #else
151 #define AREG %eax
152 #define KEYP %edi
153 #define OUTP AREG
154 #define UKEYP OUTP
155 #define INP %edx
156 #define LEN %esi
157 #define IVP %ebp
158 #define KLEN %ebx
159 #define T1 %ecx
160 #define TKEYP T1
161 #endif
164 #ifdef __x86_64__
165 /* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
168 * Input: A and B (128-bits each, bit-reflected)
169 * Output: C = A*B*x mod poly, (i.e. >>1 )
170 * To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
171 * GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
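 * For reference, a plain-C sketch of the GF(2^128) multiply that this macro
 * performs with PCLMULQDQ and a Karatsuba split.  It uses the textbook
 * right-shift formulation over the GHASH polynomial
 * x^128 + x^127 + x^126 + x^121 + 1 (unreflected form, R = 0xE1 || 0^120);
 * the asm below works on bit-reflected operands with HK = HashKey<<1 mod poly,
 * so the sketch is a functional model only.  The u128 type and the helper are
 * illustrative and not part of this file.
 *
 *     typedef struct { uint64_t hi, lo; } u128;     // hi = most significant half
 *
 *     static u128 ghash_mul(u128 x, u128 y)
 *     {
 *         u128 z = { 0, 0 }, v = y;
 *         for (int i = 0; i < 128; i++) {           // bit 0 = MSB of x
 *             uint64_t bit = (i < 64) ? (x.hi >> (63 - i)) & 1
 *                                     : (x.lo >> (127 - i)) & 1;
 *             if (bit) { z.hi ^= v.hi; z.lo ^= v.lo; }
 *             uint64_t lsb = v.lo & 1;              // v = v*x mod poly: shift right,
 *             v.lo = (v.lo >> 1) | (v.hi << 63);    // fold in R if a bit fell off
 *             v.hi >>= 1;
 *             if (lsb) v.hi ^= 0xe100000000000000ULL;
 *         }
 *         return z;
 *     }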
174 .macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
175 movdqa \GH, \TMP1
176 pshufd $78, \GH, \TMP2
177 pshufd $78, \HK, \TMP3
178 pxor \GH, \TMP2 # TMP2 = a1+a0
179 pxor \HK, \TMP3 # TMP3 = b1+b0
180 PCLMULQDQ 0x11, \HK, \TMP1 # TMP1 = a1*b1
181 PCLMULQDQ 0x00, \HK, \GH # GH = a0*b0
182 PCLMULQDQ 0x00, \TMP3, \TMP2 # TMP2 = (a0+a1)*(b1+b0)
183 pxor \GH, \TMP2
184 pxor \TMP1, \TMP2 # TMP2 = a0*b1 + a1*b0 (the Karatsuba middle term)
185 movdqa \TMP2, \TMP3
186 pslldq $8, \TMP3 # left shift TMP3 2 DWs
187 psrldq $8, \TMP2 # right shift TMP2 2 DWs
188 pxor \TMP3, \GH
189 pxor \TMP2, \TMP1 # TMP1:GH holds the result of GH*HK
191 # first phase of the reduction
193 movdqa \GH, \TMP2
194 movdqa \GH, \TMP3
195 movdqa \GH, \TMP4 # copy GH into TMP2,TMP3 and TMP4
196 # in order to perform
197 # independent shifts
198 pslld $31, \TMP2 # packed left shift <<31
199 pslld $30, \TMP3 # packed left shift <<30
200 pslld $25, \TMP4 # packed left shift <<25
201 pxor \TMP3, \TMP2 # xor the shifted versions
202 pxor \TMP4, \TMP2
203 movdqa \TMP2, \TMP5
204 psrldq $4, \TMP5 # right shift TMP5 1 DW
205 pslldq $12, \TMP2 # left shift TMP2 3 DWs
206 pxor \TMP2, \GH
208 # second phase of the reduction
210 movdqa \GH,\TMP2 # copy GH into TMP2,TMP3 and TMP4
211 # in order to perform
212 # independent shifts
213 movdqa \GH,\TMP3
214 movdqa \GH,\TMP4
215 psrld $1,\TMP2 # packed right shift >>1
216 psrld $2,\TMP3 # packed right shift >>2
217 psrld $7,\TMP4 # packed right shift >>7
218 pxor \TMP3,\TMP2 # xor the shifted versions
219 pxor \TMP4,\TMP2
220 pxor \TMP5, \TMP2
221 pxor \TMP2, \GH
222 pxor \TMP1, \GH # result is in GH
223 .endm
226 * if a = number of total plaintext bytes
227 * b = floor(a/16)
228 * num_initial_blocks = b mod 4
229 * encrypt the initial num_initial_blocks blocks and apply ghash on
230 * the ciphertext
231 * %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
232 * are clobbered
233 * %arg1, %arg2, %arg3, %r14 are used as pointers only, not modified
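 * In C terms (illustrative only), with the byte count taken from arg4:
 *
 *     uint64_t b = plaintext_len / 16;        // number of full 16-byte blocks
 *     unsigned num_initial_blocks = b % 4;    // 0..3 blocks handled here; the
 *                                             // rest go through the 4-wide loop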
237 .macro INITIAL_BLOCKS_DEC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
238 XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
239 MOVADQ SHUF_MASK(%rip), %xmm14
240 mov arg7, %r10 # %r10 = AAD
241 mov arg8, %r12 # %r12 = aadLen
242 mov %r12, %r11
243 pxor %xmm\i, %xmm\i
245 _get_AAD_loop\num_initial_blocks\operation:
246 movd (%r10), \TMP1
247 pslldq $12, \TMP1
248 psrldq $4, %xmm\i
249 pxor \TMP1, %xmm\i
250 add $4, %r10
251 sub $4, %r12
252 jne _get_AAD_loop\num_initial_blocks\operation
254 cmp $16, %r11
255 je _get_AAD_loop2_done\num_initial_blocks\operation
257 mov $16, %r12
258 _get_AAD_loop2\num_initial_blocks\operation:
259 psrldq $4, %xmm\i
260 sub $4, %r12
261 cmp %r11, %r12
262 jne _get_AAD_loop2\num_initial_blocks\operation
264 _get_AAD_loop2_done\num_initial_blocks\operation:
265 PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
267 xor %r11, %r11 # initialise the data pointer offset as zero
269 # start AES for num_initial_blocks blocks
271 mov %arg5, %rax # %rax = *Y0
272 movdqu (%rax), \XMM0 # XMM0 = Y0
273 PSHUFB_XMM %xmm14, \XMM0
275 .if (\i == 5) || (\i == 6) || (\i == 7)
276 MOVADQ ONE(%RIP),\TMP1
277 MOVADQ (%arg1),\TMP2
278 .irpc index, \i_seq
279 paddd \TMP1, \XMM0 # INCR Y0
280 movdqa \XMM0, %xmm\index
281 PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap
282 pxor \TMP2, %xmm\index
283 .endr
284 lea 0x10(%arg1),%r10
285 mov keysize,%eax
286 shr $2,%eax # 128->4, 192->6, 256->8
287 add $5,%eax # 128->9, 192->11, 256->13
289 aes_loop_initial_dec\num_initial_blocks:
290 MOVADQ (%r10),\TMP1
291 .irpc index, \i_seq
292 AESENC \TMP1, %xmm\index
293 .endr
294 add $16,%r10
295 sub $1,%eax
296 jnz aes_loop_initial_dec\num_initial_blocks
298 MOVADQ (%r10), \TMP1
299 .irpc index, \i_seq
300 AESENCLAST \TMP1, %xmm\index # Last Round
301 .endr
302 .irpc index, \i_seq
303 movdqu (%arg3 , %r11, 1), \TMP1
304 pxor \TMP1, %xmm\index
305 movdqu %xmm\index, (%arg2 , %r11, 1)
306 # write back plaintext/ciphertext for num_initial_blocks
307 add $16, %r11
309 movdqa \TMP1, %xmm\index
310 PSHUFB_XMM %xmm14, %xmm\index
311 # prepare plaintext/ciphertext for GHASH computation
312 .endr
313 .endif
314 GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
315 # apply GHASH on num_initial_blocks blocks
317 .if \i == 5
318 pxor %xmm5, %xmm6
319 GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
320 pxor %xmm6, %xmm7
321 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
322 pxor %xmm7, %xmm8
323 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
324 .elseif \i == 6
325 pxor %xmm6, %xmm7
326 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
327 pxor %xmm7, %xmm8
328 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
329 .elseif \i == 7
330 pxor %xmm7, %xmm8
331 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
332 .endif
333 cmp $64, %r13
334 jl _initial_blocks_done\num_initial_blocks\operation
335 # no need for precomputed values
338 * Precomputations for HashKey parallel with encryption of first 4 blocks.
339 * HashKey_i_k holds the XOR of the low and high 64 bits of HashKey^i
341 MOVADQ ONE(%rip), \TMP1
342 paddd \TMP1, \XMM0 # INCR Y0
343 MOVADQ \XMM0, \XMM1
344 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
346 paddd \TMP1, \XMM0 # INCR Y0
347 MOVADQ \XMM0, \XMM2
348 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
350 paddd \TMP1, \XMM0 # INCR Y0
351 MOVADQ \XMM0, \XMM3
352 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
354 paddd \TMP1, \XMM0 # INCR Y0
355 MOVADQ \XMM0, \XMM4
356 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
358 MOVADQ 0(%arg1),\TMP1
359 pxor \TMP1, \XMM1
360 pxor \TMP1, \XMM2
361 pxor \TMP1, \XMM3
362 pxor \TMP1, \XMM4
363 movdqa \TMP3, \TMP5
364 pshufd $78, \TMP3, \TMP1
365 pxor \TMP3, \TMP1
366 movdqa \TMP1, HashKey_k(%rsp)
367 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
368 # TMP5 = HashKey^2<<1 (mod poly)
369 movdqa \TMP5, HashKey_2(%rsp)
370 # HashKey_2 = HashKey^2<<1 (mod poly)
371 pshufd $78, \TMP5, \TMP1
372 pxor \TMP5, \TMP1
373 movdqa \TMP1, HashKey_2_k(%rsp)
374 .irpc index, 1234 # do 4 rounds
375 movaps 0x10*\index(%arg1), \TMP1
376 AESENC \TMP1, \XMM1
377 AESENC \TMP1, \XMM2
378 AESENC \TMP1, \XMM3
379 AESENC \TMP1, \XMM4
380 .endr
381 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
382 # TMP5 = HashKey^3<<1 (mod poly)
383 movdqa \TMP5, HashKey_3(%rsp)
384 pshufd $78, \TMP5, \TMP1
385 pxor \TMP5, \TMP1
386 movdqa \TMP1, HashKey_3_k(%rsp)
387 .irpc index, 56789 # do next 5 rounds
388 movaps 0x10*\index(%arg1), \TMP1
389 AESENC \TMP1, \XMM1
390 AESENC \TMP1, \XMM2
391 AESENC \TMP1, \XMM3
392 AESENC \TMP1, \XMM4
393 .endr
394 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
395 # TMP5 = HashKey^4<<1 (mod poly)
396 movdqa \TMP5, HashKey_4(%rsp)
397 pshufd $78, \TMP5, \TMP1
398 pxor \TMP5, \TMP1
399 movdqa \TMP1, HashKey_4_k(%rsp)
400 lea 0xa0(%arg1),%r10
401 mov keysize,%eax
402 shr $2,%eax # 128->4, 192->6, 256->8
403 sub $4,%eax # 128->0, 192->2, 256->4
404 jz aes_loop_pre_dec_done\num_initial_blocks
406 aes_loop_pre_dec\num_initial_blocks:
407 MOVADQ (%r10),\TMP2
408 .irpc index, 1234
409 AESENC \TMP2, %xmm\index
410 .endr
411 add $16,%r10
412 sub $1,%eax
413 jnz aes_loop_pre_dec\num_initial_blocks
415 aes_loop_pre_dec_done\num_initial_blocks:
416 MOVADQ (%r10), \TMP2
417 AESENCLAST \TMP2, \XMM1
418 AESENCLAST \TMP2, \XMM2
419 AESENCLAST \TMP2, \XMM3
420 AESENCLAST \TMP2, \XMM4
421 movdqu 16*0(%arg3 , %r11 , 1), \TMP1
422 pxor \TMP1, \XMM1
423 movdqu \XMM1, 16*0(%arg2 , %r11 , 1)
424 movdqa \TMP1, \XMM1
425 movdqu 16*1(%arg3 , %r11 , 1), \TMP1
426 pxor \TMP1, \XMM2
427 movdqu \XMM2, 16*1(%arg2 , %r11 , 1)
428 movdqa \TMP1, \XMM2
429 movdqu 16*2(%arg3 , %r11 , 1), \TMP1
430 pxor \TMP1, \XMM3
431 movdqu \XMM3, 16*2(%arg2 , %r11 , 1)
432 movdqa \TMP1, \XMM3
433 movdqu 16*3(%arg3 , %r11 , 1), \TMP1
434 pxor \TMP1, \XMM4
435 movdqu \XMM4, 16*3(%arg2 , %r11 , 1)
436 movdqa \TMP1, \XMM4
437 add $64, %r11
438 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
439 pxor \XMMDst, \XMM1
440 # combine GHASHed value with the corresponding ciphertext
441 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
442 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
443 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
445 _initial_blocks_done\num_initial_blocks\operation:
447 .endm
451 * if a = number of total plaintext bytes
452 * b = floor(a/16)
453 * num_initial_blocks = b mod 4
454 * encrypt the initial num_initial_blocks blocks and apply ghash on
455 * the ciphertext
456 * %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
457 * are clobbered
458 * %arg1, %arg2, %arg3, %r14 are used as pointers only, not modified
462 .macro INITIAL_BLOCKS_ENC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
463 XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
464 MOVADQ SHUF_MASK(%rip), %xmm14
465 mov arg7, %r10 # %r10 = AAD
466 mov arg8, %r12 # %r12 = aadLen
467 mov %r12, %r11
468 pxor %xmm\i, %xmm\i
469 _get_AAD_loop\num_initial_blocks\operation:
470 movd (%r10), \TMP1
471 pslldq $12, \TMP1
472 psrldq $4, %xmm\i
473 pxor \TMP1, %xmm\i
474 add $4, %r10
475 sub $4, %r12
476 jne _get_AAD_loop\num_initial_blocks\operation
477 cmp $16, %r11
478 je _get_AAD_loop2_done\num_initial_blocks\operation
479 mov $16, %r12
480 _get_AAD_loop2\num_initial_blocks\operation:
481 psrldq $4, %xmm\i
482 sub $4, %r12
483 cmp %r11, %r12
484 jne _get_AAD_loop2\num_initial_blocks\operation
485 _get_AAD_loop2_done\num_initial_blocks\operation:
486 PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
488 xor %r11, %r11 # initialise the data pointer offset as zero
490 # start AES for num_initial_blocks blocks
492 mov %arg5, %rax # %rax = *Y0
493 movdqu (%rax), \XMM0 # XMM0 = Y0
494 PSHUFB_XMM %xmm14, \XMM0
496 .if (\i == 5) || (\i == 6) || (\i == 7)
498 MOVADQ ONE(%RIP),\TMP1
499 MOVADQ 0(%arg1),\TMP2
500 .irpc index, \i_seq
501 paddd \TMP1, \XMM0 # INCR Y0
502 MOVADQ \XMM0, %xmm\index
503 PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap
504 pxor \TMP2, %xmm\index
505 .endr
506 lea 0x10(%arg1),%r10
507 mov keysize,%eax
508 shr $2,%eax # 128->4, 192->6, 256->8
509 add $5,%eax # 128->9, 192->11, 256->13
511 aes_loop_initial_enc\num_initial_blocks:
512 MOVADQ (%r10),\TMP1
513 .irpc index, \i_seq
514 AESENC \TMP1, %xmm\index
515 .endr
516 add $16,%r10
517 sub $1,%eax
518 jnz aes_loop_initial_enc\num_initial_blocks
520 MOVADQ (%r10), \TMP1
521 .irpc index, \i_seq
522 AESENCLAST \TMP1, %xmm\index # Last Round
523 .endr
524 .irpc index, \i_seq
525 movdqu (%arg3 , %r11, 1), \TMP1
526 pxor \TMP1, %xmm\index
527 movdqu %xmm\index, (%arg2 , %r11, 1)
528 # write back plaintext/ciphertext for num_initial_blocks
529 add $16, %r11
530 PSHUFB_XMM %xmm14, %xmm\index
532 # prepare plaintext/ciphertext for GHASH computation
533 .endr
534 .endif
535 GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
536 # apply GHASH on num_initial_blocks blocks
538 .if \i == 5
539 pxor %xmm5, %xmm6
540 GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
541 pxor %xmm6, %xmm7
542 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
543 pxor %xmm7, %xmm8
544 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
545 .elseif \i == 6
546 pxor %xmm6, %xmm7
547 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
548 pxor %xmm7, %xmm8
549 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
550 .elseif \i == 7
551 pxor %xmm7, %xmm8
552 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
553 .endif
554 cmp $64, %r13
555 jl _initial_blocks_done\num_initial_blocks\operation
556 # no need for precomputed values
559 * Precomputations for HashKey parallel with encryption of first 4 blocks.
560 * HashKey_i_k holds the XOR of the low and high 64 bits of HashKey^i
562 MOVADQ ONE(%RIP),\TMP1
563 paddd \TMP1, \XMM0 # INCR Y0
564 MOVADQ \XMM0, \XMM1
565 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
567 paddd \TMP1, \XMM0 # INCR Y0
568 MOVADQ \XMM0, \XMM2
569 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
571 paddd \TMP1, \XMM0 # INCR Y0
572 MOVADQ \XMM0, \XMM3
573 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
575 paddd \TMP1, \XMM0 # INCR Y0
576 MOVADQ \XMM0, \XMM4
577 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
579 MOVADQ 0(%arg1),\TMP1
580 pxor \TMP1, \XMM1
581 pxor \TMP1, \XMM2
582 pxor \TMP1, \XMM3
583 pxor \TMP1, \XMM4
584 movdqa \TMP3, \TMP5
585 pshufd $78, \TMP3, \TMP1
586 pxor \TMP3, \TMP1
587 movdqa \TMP1, HashKey_k(%rsp)
588 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
589 # TMP5 = HashKey^2<<1 (mod poly)
590 movdqa \TMP5, HashKey_2(%rsp)
591 # HashKey_2 = HashKey^2<<1 (mod poly)
592 pshufd $78, \TMP5, \TMP1
593 pxor \TMP5, \TMP1
594 movdqa \TMP1, HashKey_2_k(%rsp)
595 .irpc index, 1234 # do 4 rounds
596 movaps 0x10*\index(%arg1), \TMP1
597 AESENC \TMP1, \XMM1
598 AESENC \TMP1, \XMM2
599 AESENC \TMP1, \XMM3
600 AESENC \TMP1, \XMM4
601 .endr
602 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
603 # TMP5 = HashKey^3<<1 (mod poly)
604 movdqa \TMP5, HashKey_3(%rsp)
605 pshufd $78, \TMP5, \TMP1
606 pxor \TMP5, \TMP1
607 movdqa \TMP1, HashKey_3_k(%rsp)
608 .irpc index, 56789 # do next 5 rounds
609 movaps 0x10*\index(%arg1), \TMP1
610 AESENC \TMP1, \XMM1
611 AESENC \TMP1, \XMM2
612 AESENC \TMP1, \XMM3
613 AESENC \TMP1, \XMM4
614 .endr
615 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
616 # TMP5 = HashKey^4<<1 (mod poly)
617 movdqa \TMP5, HashKey_4(%rsp)
618 pshufd $78, \TMP5, \TMP1
619 pxor \TMP5, \TMP1
620 movdqa \TMP1, HashKey_4_k(%rsp)
621 lea 0xa0(%arg1),%r10
622 mov keysize,%eax
623 shr $2,%eax # 128->4, 192->6, 256->8
624 sub $4,%eax # 128->0, 192->2, 256->4
625 jz aes_loop_pre_enc_done\num_initial_blocks
627 aes_loop_pre_enc\num_initial_blocks:
628 MOVADQ (%r10),\TMP2
629 .irpc index, 1234
630 AESENC \TMP2, %xmm\index
631 .endr
632 add $16,%r10
633 sub $1,%eax
634 jnz aes_loop_pre_enc\num_initial_blocks
636 aes_loop_pre_enc_done\num_initial_blocks:
637 MOVADQ (%r10), \TMP2
638 AESENCLAST \TMP2, \XMM1
639 AESENCLAST \TMP2, \XMM2
640 AESENCLAST \TMP2, \XMM3
641 AESENCLAST \TMP2, \XMM4
642 movdqu 16*0(%arg3 , %r11 , 1), \TMP1
643 pxor \TMP1, \XMM1
644 movdqu 16*1(%arg3 , %r11 , 1), \TMP1
645 pxor \TMP1, \XMM2
646 movdqu 16*2(%arg3 , %r11 , 1), \TMP1
647 pxor \TMP1, \XMM3
648 movdqu 16*3(%arg3 , %r11 , 1), \TMP1
649 pxor \TMP1, \XMM4
650 movdqu \XMM1, 16*0(%arg2 , %r11 , 1)
651 movdqu \XMM2, 16*1(%arg2 , %r11 , 1)
652 movdqu \XMM3, 16*2(%arg2 , %r11 , 1)
653 movdqu \XMM4, 16*3(%arg2 , %r11 , 1)
655 add $64, %r11
656 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
657 pxor \XMMDst, \XMM1
658 # combine GHASHed value with the corresponding ciphertext
659 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
660 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
661 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
663 _initial_blocks_done\num_initial_blocks\operation:
665 .endm
668 * encrypt 4 blocks at a time
669 * ghash the 4 previously encrypted ciphertext blocks
670 * arg1, %arg2, %arg3 are used as pointers only, not modified
671 * %r11 is the data offset value
673 .macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \
674 TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
676 movdqa \XMM1, \XMM5
677 movdqa \XMM2, \XMM6
678 movdqa \XMM3, \XMM7
679 movdqa \XMM4, \XMM8
681 movdqa SHUF_MASK(%rip), %xmm15
682 # multiply XMM5 * HashKey_4 using Karatsuba
684 movdqa \XMM5, \TMP4
685 pshufd $78, \XMM5, \TMP6
686 pxor \XMM5, \TMP6
687 paddd ONE(%rip), \XMM0 # INCR CNT
688 movdqa HashKey_4(%rsp), \TMP5
689 PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
690 movdqa \XMM0, \XMM1
691 paddd ONE(%rip), \XMM0 # INCR CNT
692 movdqa \XMM0, \XMM2
693 paddd ONE(%rip), \XMM0 # INCR CNT
694 movdqa \XMM0, \XMM3
695 paddd ONE(%rip), \XMM0 # INCR CNT
696 movdqa \XMM0, \XMM4
697 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
698 PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0
699 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
700 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
701 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
703 pxor (%arg1), \XMM1
704 pxor (%arg1), \XMM2
705 pxor (%arg1), \XMM3
706 pxor (%arg1), \XMM4
707 movdqa HashKey_4_k(%rsp), \TMP5
708 PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
709 movaps 0x10(%arg1), \TMP1
710 AESENC \TMP1, \XMM1 # Round 1
711 AESENC \TMP1, \XMM2
712 AESENC \TMP1, \XMM3
713 AESENC \TMP1, \XMM4
714 movaps 0x20(%arg1), \TMP1
715 AESENC \TMP1, \XMM1 # Round 2
716 AESENC \TMP1, \XMM2
717 AESENC \TMP1, \XMM3
718 AESENC \TMP1, \XMM4
719 movdqa \XMM6, \TMP1
720 pshufd $78, \XMM6, \TMP2
721 pxor \XMM6, \TMP2
722 movdqa HashKey_3(%rsp), \TMP5
723 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
724 movaps 0x30(%arg1), \TMP3
725 AESENC \TMP3, \XMM1 # Round 3
726 AESENC \TMP3, \XMM2
727 AESENC \TMP3, \XMM3
728 AESENC \TMP3, \XMM4
729 PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0
730 movaps 0x40(%arg1), \TMP3
731 AESENC \TMP3, \XMM1 # Round 4
732 AESENC \TMP3, \XMM2
733 AESENC \TMP3, \XMM3
734 AESENC \TMP3, \XMM4
735 movdqa HashKey_3_k(%rsp), \TMP5
736 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
737 movaps 0x50(%arg1), \TMP3
738 AESENC \TMP3, \XMM1 # Round 5
739 AESENC \TMP3, \XMM2
740 AESENC \TMP3, \XMM3
741 AESENC \TMP3, \XMM4
742 pxor \TMP1, \TMP4
743 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
744 pxor \XMM6, \XMM5
745 pxor \TMP2, \TMP6
746 movdqa \XMM7, \TMP1
747 pshufd $78, \XMM7, \TMP2
748 pxor \XMM7, \TMP2
749 movdqa HashKey_2(%rsp ), \TMP5
751 # Multiply XMM7 * HashKey_2 using Karatsuba
753 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
754 movaps 0x60(%arg1), \TMP3
755 AESENC \TMP3, \XMM1 # Round 6
756 AESENC \TMP3, \XMM2
757 AESENC \TMP3, \XMM3
758 AESENC \TMP3, \XMM4
759 PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0
760 movaps 0x70(%arg1), \TMP3
761 AESENC \TMP3, \XMM1 # Round 7
762 AESENC \TMP3, \XMM2
763 AESENC \TMP3, \XMM3
764 AESENC \TMP3, \XMM4
765 movdqa HashKey_2_k(%rsp), \TMP5
766 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
767 movaps 0x80(%arg1), \TMP3
768 AESENC \TMP3, \XMM1 # Round 8
769 AESENC \TMP3, \XMM2
770 AESENC \TMP3, \XMM3
771 AESENC \TMP3, \XMM4
772 pxor \TMP1, \TMP4
773 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
774 pxor \XMM7, \XMM5
775 pxor \TMP2, \TMP6
777 # Multiply XMM8 * HashKey
778 # XMM8 and TMP5 hold the values for the two operands
780 movdqa \XMM8, \TMP1
781 pshufd $78, \XMM8, \TMP2
782 pxor \XMM8, \TMP2
783 movdqa HashKey(%rsp), \TMP5
784 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
785 movaps 0x90(%arg1), \TMP3
786 AESENC \TMP3, \XMM1 # Round 9
787 AESENC \TMP3, \XMM2
788 AESENC \TMP3, \XMM3
789 AESENC \TMP3, \XMM4
790 PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
791 lea 0xa0(%arg1),%r10
792 mov keysize,%eax
793 shr $2,%eax # 128->4, 192->6, 256->8
794 sub $4,%eax # 128->0, 192->2, 256->4
795 jz aes_loop_par_enc_done
797 aes_loop_par_enc:
798 MOVADQ (%r10),\TMP3
799 .irpc index, 1234
800 AESENC \TMP3, %xmm\index
801 .endr
802 add $16,%r10
803 sub $1,%eax
804 jnz aes_loop_par_enc
806 aes_loop_par_enc_done:
807 MOVADQ (%r10), \TMP3
808 AESENCLAST \TMP3, \XMM1 # Round 10
809 AESENCLAST \TMP3, \XMM2
810 AESENCLAST \TMP3, \XMM3
811 AESENCLAST \TMP3, \XMM4
812 movdqa HashKey_k(%rsp), \TMP5
813 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
814 movdqu (%arg3,%r11,1), \TMP3
815 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
816 movdqu 16(%arg3,%r11,1), \TMP3
817 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
818 movdqu 32(%arg3,%r11,1), \TMP3
819 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
820 movdqu 48(%arg3,%r11,1), \TMP3
821 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
822 movdqu \XMM1, (%arg2,%r11,1) # Write to the ciphertext buffer
823 movdqu \XMM2, 16(%arg2,%r11,1) # Write to the ciphertext buffer
824 movdqu \XMM3, 32(%arg2,%r11,1) # Write to the ciphertext buffer
825 movdqu \XMM4, 48(%arg2,%r11,1) # Write to the ciphertext buffer
826 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
827 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
828 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
829 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
831 pxor \TMP4, \TMP1
832 pxor \XMM8, \XMM5
833 pxor \TMP6, \TMP2
834 pxor \TMP1, \TMP2
835 pxor \XMM5, \TMP2
836 movdqa \TMP2, \TMP3
837 pslldq $8, \TMP3 # left shift TMP3 2 DWs
838 psrldq $8, \TMP2 # right shift TMP2 2 DWs
839 pxor \TMP3, \XMM5
840 pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
842 # first phase of reduction
844 movdqa \XMM5, \TMP2
845 movdqa \XMM5, \TMP3
846 movdqa \XMM5, \TMP4
847 # move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
848 pslld $31, \TMP2 # packed left shift << 31
849 pslld $30, \TMP3 # packed left shift << 30
850 pslld $25, \TMP4 # packed left shift << 25
851 pxor \TMP3, \TMP2 # xor the shifted versions
852 pxor \TMP4, \TMP2
853 movdqa \TMP2, \TMP5
854 psrldq $4, \TMP5 # right shift T5 1 DW
855 pslldq $12, \TMP2 # left shift T2 3 DWs
856 pxor \TMP2, \XMM5
858 # second phase of reduction
860 movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
861 movdqa \XMM5,\TMP3
862 movdqa \XMM5,\TMP4
863 psrld $1, \TMP2 # packed right shift >>1
864 psrld $2, \TMP3 # packed right shift >>2
865 psrld $7, \TMP4 # packed right shift >>7
866 pxor \TMP3,\TMP2 # xor the shifted versions
867 pxor \TMP4,\TMP2
868 pxor \TMP5, \TMP2
869 pxor \TMP2, \XMM5
870 pxor \TMP1, \XMM5 # result is in XMM5
872 pxor \XMM5, \XMM1
873 .endm
876 * decrypt 4 blocks at a time
877 * ghash the 4 previously decrypted ciphertext blocks
878 * arg1, %arg2, %arg3 are used as pointers only, not modified
879 * %r11 is the data offset value
881 .macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \
882 TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
884 movdqa \XMM1, \XMM5
885 movdqa \XMM2, \XMM6
886 movdqa \XMM3, \XMM7
887 movdqa \XMM4, \XMM8
889 movdqa SHUF_MASK(%rip), %xmm15
890 # multiply XMM5 * HashKey_4 using Karatsuba
892 movdqa \XMM5, \TMP4
893 pshufd $78, \XMM5, \TMP6
894 pxor \XMM5, \TMP6
895 paddd ONE(%rip), \XMM0 # INCR CNT
896 movdqa HashKey_4(%rsp), \TMP5
897 PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
898 movdqa \XMM0, \XMM1
899 paddd ONE(%rip), \XMM0 # INCR CNT
900 movdqa \XMM0, \XMM2
901 paddd ONE(%rip), \XMM0 # INCR CNT
902 movdqa \XMM0, \XMM3
903 paddd ONE(%rip), \XMM0 # INCR CNT
904 movdqa \XMM0, \XMM4
905 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
906 PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0
907 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
908 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
909 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
911 pxor (%arg1), \XMM1
912 pxor (%arg1), \XMM2
913 pxor (%arg1), \XMM3
914 pxor (%arg1), \XMM4
915 movdqa HashKey_4_k(%rsp), \TMP5
916 PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
917 movaps 0x10(%arg1), \TMP1
918 AESENC \TMP1, \XMM1 # Round 1
919 AESENC \TMP1, \XMM2
920 AESENC \TMP1, \XMM3
921 AESENC \TMP1, \XMM4
922 movaps 0x20(%arg1), \TMP1
923 AESENC \TMP1, \XMM1 # Round 2
924 AESENC \TMP1, \XMM2
925 AESENC \TMP1, \XMM3
926 AESENC \TMP1, \XMM4
927 movdqa \XMM6, \TMP1
928 pshufd $78, \XMM6, \TMP2
929 pxor \XMM6, \TMP2
930 movdqa HashKey_3(%rsp), \TMP5
931 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
932 movaps 0x30(%arg1), \TMP3
933 AESENC \TMP3, \XMM1 # Round 3
934 AESENC \TMP3, \XMM2
935 AESENC \TMP3, \XMM3
936 AESENC \TMP3, \XMM4
937 PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0
938 movaps 0x40(%arg1), \TMP3
939 AESENC \TMP3, \XMM1 # Round 4
940 AESENC \TMP3, \XMM2
941 AESENC \TMP3, \XMM3
942 AESENC \TMP3, \XMM4
943 movdqa HashKey_3_k(%rsp), \TMP5
944 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
945 movaps 0x50(%arg1), \TMP3
946 AESENC \TMP3, \XMM1 # Round 5
947 AESENC \TMP3, \XMM2
948 AESENC \TMP3, \XMM3
949 AESENC \TMP3, \XMM4
950 pxor \TMP1, \TMP4
951 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
952 pxor \XMM6, \XMM5
953 pxor \TMP2, \TMP6
954 movdqa \XMM7, \TMP1
955 pshufd $78, \XMM7, \TMP2
956 pxor \XMM7, \TMP2
957 movdqa HashKey_2(%rsp ), \TMP5
959 # Multiply XMM7 * HashKey_2 using Karatsuba
961 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
962 movaps 0x60(%arg1), \TMP3
963 AESENC \TMP3, \XMM1 # Round 6
964 AESENC \TMP3, \XMM2
965 AESENC \TMP3, \XMM3
966 AESENC \TMP3, \XMM4
967 PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0
968 movaps 0x70(%arg1), \TMP3
969 AESENC \TMP3, \XMM1 # Round 7
970 AESENC \TMP3, \XMM2
971 AESENC \TMP3, \XMM3
972 AESENC \TMP3, \XMM4
973 movdqa HashKey_2_k(%rsp), \TMP5
974 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
975 movaps 0x80(%arg1), \TMP3
976 AESENC \TMP3, \XMM1 # Round 8
977 AESENC \TMP3, \XMM2
978 AESENC \TMP3, \XMM3
979 AESENC \TMP3, \XMM4
980 pxor \TMP1, \TMP4
981 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
982 pxor \XMM7, \XMM5
983 pxor \TMP2, \TMP6
985 # Multiply XMM8 * HashKey
986 # XMM8 and TMP5 hold the values for the two operands
988 movdqa \XMM8, \TMP1
989 pshufd $78, \XMM8, \TMP2
990 pxor \XMM8, \TMP2
991 movdqa HashKey(%rsp), \TMP5
992 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
993 movaps 0x90(%arg1), \TMP3
994 AESENC \TMP3, \XMM1 # Round 9
995 AESENC \TMP3, \XMM2
996 AESENC \TMP3, \XMM3
997 AESENC \TMP3, \XMM4
998 PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
999 lea 0xa0(%arg1),%r10
1000 mov keysize,%eax
1001 shr $2,%eax # 128->4, 192->6, 256->8
1002 sub $4,%eax # 128->0, 192->2, 256->4
1003 jz aes_loop_par_dec_done
1005 aes_loop_par_dec:
1006 MOVADQ (%r10),\TMP3
1007 .irpc index, 1234
1008 AESENC \TMP3, %xmm\index
1009 .endr
1010 add $16,%r10
1011 sub $1,%eax
1012 jnz aes_loop_par_dec
1014 aes_loop_par_dec_done:
1015 MOVADQ (%r10), \TMP3
1016 AESENCLAST \TMP3, \XMM1 # last round
1017 AESENCLAST \TMP3, \XMM2
1018 AESENCLAST \TMP3, \XMM3
1019 AESENCLAST \TMP3, \XMM4
1020 movdqa HashKey_k(%rsp), \TMP5
1021 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1022 movdqu (%arg3,%r11,1), \TMP3
1023 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
1024 movdqu \XMM1, (%arg2,%r11,1) # Write to plaintext buffer
1025 movdqa \TMP3, \XMM1
1026 movdqu 16(%arg3,%r11,1), \TMP3
1027 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
1028 movdqu \XMM2, 16(%arg2,%r11,1) # Write to plaintext buffer
1029 movdqa \TMP3, \XMM2
1030 movdqu 32(%arg3,%r11,1), \TMP3
1031 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
1032 movdqu \XMM3, 32(%arg2,%r11,1) # Write to plaintext buffer
1033 movdqa \TMP3, \XMM3
1034 movdqu 48(%arg3,%r11,1), \TMP3
1035 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
1036 movdqu \XMM4, 48(%arg2,%r11,1) # Write to plaintext buffer
1037 movdqa \TMP3, \XMM4
1038 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
1039 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
1040 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
1041 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
1043 pxor \TMP4, \TMP1
1044 pxor \XMM8, \XMM5
1045 pxor \TMP6, \TMP2
1046 pxor \TMP1, \TMP2
1047 pxor \XMM5, \TMP2
1048 movdqa \TMP2, \TMP3
1049 pslldq $8, \TMP3 # left shift TMP3 2 DWs
1050 psrldq $8, \TMP2 # right shift TMP2 2 DWs
1051 pxor \TMP3, \XMM5
1052 pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
1054 # first phase of reduction
1056 movdqa \XMM5, \TMP2
1057 movdqa \XMM5, \TMP3
1058 movdqa \XMM5, \TMP4
1059 # move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
1060 pslld $31, \TMP2 # packed left shift << 31
1061 pslld $30, \TMP3 # packed left shift << 30
1062 pslld $25, \TMP4 # packed left shift << 25
1063 pxor \TMP3, \TMP2 # xor the shifted versions
1064 pxor \TMP4, \TMP2
1065 movdqa \TMP2, \TMP5
1066 psrldq $4, \TMP5 # right shift T5 1 DW
1067 pslldq $12, \TMP2 # left shift T2 3 DWs
1068 pxor \TMP2, \XMM5
1070 # second phase of reduction
1072 movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
1073 movdqa \XMM5,\TMP3
1074 movdqa \XMM5,\TMP4
1075 psrld $1, \TMP2 # packed right shift >>1
1076 psrld $2, \TMP3 # packed right shift >>2
1077 psrld $7, \TMP4 # packed right shift >>7
1078 pxor \TMP3,\TMP2 # xor the shifted versions
1079 pxor \TMP4,\TMP2
1080 pxor \TMP5, \TMP2
1081 pxor \TMP2, \XMM5
1082 pxor \TMP1, \XMM5 # result is in XMM5
1084 pxor \XMM5, \XMM1
1085 .endm
1087 /* GHASH the last 4 ciphertext blocks. */
1088 .macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
1089 TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
1091 # Multiply XMM1 * HashKey_4 (using Karatsuba)
1093 movdqa \XMM1, \TMP6
1094 pshufd $78, \XMM1, \TMP2
1095 pxor \XMM1, \TMP2
1096 movdqa HashKey_4(%rsp), \TMP5
1097 PCLMULQDQ 0x11, \TMP5, \TMP6 # TMP6 = a1*b1
1098 PCLMULQDQ 0x00, \TMP5, \XMM1 # XMM1 = a0*b0
1099 movdqa HashKey_4_k(%rsp), \TMP4
1100 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1101 movdqa \XMM1, \XMMDst
1102 movdqa \TMP2, \XMM1 # result in TMP6, XMMDst, XMM1
1104 # Multiply XMM2 * HashKey_3 (using Karatsuba)
1106 movdqa \XMM2, \TMP1
1107 pshufd $78, \XMM2, \TMP2
1108 pxor \XMM2, \TMP2
1109 movdqa HashKey_3(%rsp), \TMP5
1110 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1111 PCLMULQDQ 0x00, \TMP5, \XMM2 # XMM2 = a0*b0
1112 movdqa HashKey_3_k(%rsp), \TMP4
1113 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1114 pxor \TMP1, \TMP6
1115 pxor \XMM2, \XMMDst
1116 pxor \TMP2, \XMM1
1117 # results accumulated in TMP6, XMMDst, XMM1
1119 # Multiply XMM3 * HashKey_2 (using Karatsuba)
1121 movdqa \XMM3, \TMP1
1122 pshufd $78, \XMM3, \TMP2
1123 pxor \XMM3, \TMP2
1124 movdqa HashKey_2(%rsp), \TMP5
1125 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1126 PCLMULQDQ 0x00, \TMP5, \XMM3 # XMM3 = a0*b0
1127 movdqa HashKey_2_k(%rsp), \TMP4
1128 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1129 pxor \TMP1, \TMP6
1130 pxor \XMM3, \XMMDst
1131 pxor \TMP2, \XMM1 # results accumulated in TMP6, XMMDst, XMM1
1133 # Multiply XMM4 * HashKey (using Karatsuba)
1134 movdqa \XMM4, \TMP1
1135 pshufd $78, \XMM4, \TMP2
1136 pxor \XMM4, \TMP2
1137 movdqa HashKey(%rsp), \TMP5
1138 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1139 PCLMULQDQ 0x00, \TMP5, \XMM4 # XMM4 = a0*b0
1140 movdqa HashKey_k(%rsp), \TMP4
1141 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1142 pxor \TMP1, \TMP6
1143 pxor \XMM4, \XMMDst
1144 pxor \XMM1, \TMP2
1145 pxor \TMP6, \TMP2
1146 pxor \XMMDst, \TMP2
1147 # middle section of the temp results combined as in karatsuba algorithm
1148 movdqa \TMP2, \TMP4
1149 pslldq $8, \TMP4 # left shift TMP4 2 DWs
1150 psrldq $8, \TMP2 # right shift TMP2 2 DWs
1151 pxor \TMP4, \XMMDst
1152 pxor \TMP2, \TMP6
1153 # TMP6:XMMDst holds the result of the accumulated carry-less multiplications
1154 # first phase of the reduction
1155 movdqa \XMMDst, \TMP2
1156 movdqa \XMMDst, \TMP3
1157 movdqa \XMMDst, \TMP4
1158 # move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
1159 pslld $31, \TMP2 # packed left shifting << 31
1160 pslld $30, \TMP3 # packed left shifting << 30
1161 pslld $25, \TMP4 # packed left shifting << 25
1162 pxor \TMP3, \TMP2 # xor the shifted versions
1163 pxor \TMP4, \TMP2
1164 movdqa \TMP2, \TMP7
1165 psrldq $4, \TMP7 # right shift TMP7 1 DW
1166 pslldq $12, \TMP2 # left shift TMP2 3 DWs
1167 pxor \TMP2, \XMMDst
1169 # second phase of the reduction
1170 movdqa \XMMDst, \TMP2
1171 # make 3 copies of XMMDst for doing 3 shift operations
1172 movdqa \XMMDst, \TMP3
1173 movdqa \XMMDst, \TMP4
1174 psrld $1, \TMP2 # packed right shift >> 1
1175 psrld $2, \TMP3 # packed right shift >> 2
1176 psrld $7, \TMP4 # packed right shift >> 7
1177 pxor \TMP3, \TMP2 # xor the shifted versions
1178 pxor \TMP4, \TMP2
1179 pxor \TMP7, \TMP2
1180 pxor \TMP2, \XMMDst
1181 pxor \TMP6, \XMMDst # reduced result is in XMMDst
1182 .endm
1185 /* Encryption of a single block
1186 * uses eax & r10
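 * The round count is derived from the key length in bytes held in the keysize
 * field of the AES context; in C terms (illustrative only):
 *
 *     int nr_enc_rounds = key_len_bytes / 4 + 5;  // 16 -> 9, 24 -> 11, 32 -> 13
 *     // i.e. Nr-1 AESENC rounds followed by a single AESENCLAST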
1189 .macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
1191 pxor (%arg1), \XMM0
1192 mov keysize,%eax
1193 shr $2,%eax # 128->4, 192->6, 256->8
1194 add $5,%eax # 128->9, 192->11, 256->13
1195 lea 16(%arg1), %r10 # get first expanded key address
1197 _esb_loop_\@:
1198 MOVADQ (%r10),\TMP1
1199 AESENC \TMP1,\XMM0
1200 add $16,%r10
1201 sub $1,%eax
1202 jnz _esb_loop_\@
1204 MOVADQ (%r10),\TMP1
1205 AESENCLAST \TMP1,\XMM0
1206 .endm
1207 /*****************************************************************************
1208 * void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1209 * u8 *out, // Plaintext output. Encrypt in-place is allowed.
1210 * const u8 *in, // Ciphertext input
1211 * u64 plaintext_len, // Length of data in bytes for decryption.
1212 * u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1213 * // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1214 * // concatenated with 0x00000001. 16-byte aligned pointer.
1215 * u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1216 * const u8 *aad, // Additional Authentication Data (AAD)
1217 * u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1218 * u8 *auth_tag, // Authenticated Tag output. The driver will compare this to the
1219 * // given authentication tag and only return the plaintext if they match.
1220 * u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
1221 * // (most likely), 12 or 8.
1223 * Assumptions:
1225 * keys:
1226 * keys are pre-expanded and aligned to 16 bytes. we are using the first
1227 * set of 11 keys in the data structure void *aes_ctx
1229 * iv:
1230 * 0 1 2 3
1231 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1232 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1233 * | Salt (From the SA) |
1234 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1235 * | Initialization Vector |
1236 * | (This is the sequence number from IPSec header) |
1237 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1238 * | 0x1 |
1239 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1243 * AAD:
1244 * AAD padded to 128 bits with 0
1245 * for example, assume AAD is a u32 vector
1247 * if AAD is 8 bytes:
1248 * AAD[3] = {A0, A1};
1249 * padded AAD in xmm register = {A1 A0 0 0}
1251 * 0 1 2 3
1252 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1253 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1254 * | SPI (A1) |
1255 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1256 * | 32-bit Sequence Number (A0) |
1257 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1258 * | 0x0 |
1259 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1261 * AAD Format with 32-bit Sequence Number
1263 * if AAD is 12 bytes:
1264 * AAD[3] = {A0, A1, A2};
1265 * padded AAD in xmm register = {A2 A1 A0 0}
1267 * 0 1 2 3
1268 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1269 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1272 * | SPI (A2) |
1273 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1274 * | 64-bit Extended Sequence Number {A1,A0} |
1275 * | |
1276 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1277 * | 0x0 |
1278 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1280 * AAD Format with 64-bit Extended Sequence Number
1282 * aadLen:
1283 * from the definition of the spec, aadLen can only be 8 or 12 bytes.
1284 * The code supports 16 too but for other sizes, the code will fail.
1286 * TLen:
1287 * from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
1288 * For other sizes, the code will fail.
1290 * poly = x^128 + x^127 + x^126 + x^121 + 1
1292 *****************************************************************************/
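/*
 * A minimal, hypothetical C-side call matching the prototype above.  It assumes
 * the caller has already expanded the key into aes_ctx, derived the hash subkey
 * H = AES_K(0^128), and assembled the 16-byte pre-counter block iv
 * (salt || IV || 0x00000001); all buffer names are illustrative.
 *
 *     u8 tag[16];
 *     aesni_gcm_dec(aes_ctx, plaintext_out, ciphertext_in, ciphertext_len,
 *                   iv, hash_subkey, aad, aad_len, tag, sizeof(tag));
 *     // compare tag[] with the received tag (ideally in constant time) and
 *     // discard the plaintext on mismatch
 */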
1293 ENTRY(aesni_gcm_dec)
1294 push %r12
1295 push %r13
1296 push %r14
1297 mov %rsp, %r14
1299 * states of %xmm registers %xmm6:%xmm15 not saved
1300 * all %xmm registers are clobbered
1302 sub $VARIABLE_OFFSET, %rsp
1303 and $~63, %rsp # align rsp to 64 bytes
1304 mov %arg6, %r12
1305 movdqu (%r12), %xmm13 # %xmm13 = HashKey
1306 movdqa SHUF_MASK(%rip), %xmm2
1307 PSHUFB_XMM %xmm2, %xmm13
1310 # Precompute HashKey<<1 (mod poly) from the hash key (required for GHASH)
1312 movdqa %xmm13, %xmm2
1313 psllq $1, %xmm13
1314 psrlq $63, %xmm2
1315 movdqa %xmm2, %xmm1
1316 pslldq $8, %xmm2
1317 psrldq $8, %xmm1
1318 por %xmm2, %xmm13
1320 # Reduction
1322 pshufd $0x24, %xmm1, %xmm2
1323 pcmpeqd TWOONE(%rip), %xmm2
1324 pand POLY(%rip), %xmm2
1325 pxor %xmm2, %xmm13 # %xmm13 holds the HashKey<<1 (mod poly)
1328 # Decrypt first few blocks
1330 movdqa %xmm13, HashKey(%rsp) # store HashKey<<1 (mod poly)
1331 mov %arg4, %r13 # save the number of bytes of plaintext/ciphertext
1332 and $-16, %r13 # %r13 = %r13 - (%r13 mod 16)
1333 mov %r13, %r12
1334 and $(3<<4), %r12
1335 jz _initial_num_blocks_is_0_decrypt
1336 cmp $(2<<4), %r12
1337 jb _initial_num_blocks_is_1_decrypt
1338 je _initial_num_blocks_is_2_decrypt
1339 _initial_num_blocks_is_3_decrypt:
1340 INITIAL_BLOCKS_DEC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1341 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec
1342 sub $48, %r13
1343 jmp _initial_blocks_decrypted
1344 _initial_num_blocks_is_2_decrypt:
1345 INITIAL_BLOCKS_DEC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1346 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec
1347 sub $32, %r13
1348 jmp _initial_blocks_decrypted
1349 _initial_num_blocks_is_1_decrypt:
1350 INITIAL_BLOCKS_DEC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1351 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec
1352 sub $16, %r13
1353 jmp _initial_blocks_decrypted
1354 _initial_num_blocks_is_0_decrypt:
1355 INITIAL_BLOCKS_DEC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1356 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec
1357 _initial_blocks_decrypted:
1358 cmp $0, %r13
1359 je _zero_cipher_left_decrypt
1360 sub $64, %r13
1361 je _four_cipher_left_decrypt
1362 _decrypt_by_4:
1363 GHASH_4_ENCRYPT_4_PARALLEL_DEC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
1364 %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, dec
1365 add $64, %r11
1366 sub $64, %r13
1367 jne _decrypt_by_4
1368 _four_cipher_left_decrypt:
1369 GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
1370 %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
1371 _zero_cipher_left_decrypt:
1372 mov %arg4, %r13
1373 and $15, %r13 # %r13 = arg4 (mod 16)
1374 je _multiple_of_16_bytes_decrypt
1376 # Handle the last <16 byte block separately
1378 paddd ONE(%rip), %xmm0 # increment CNT to get Yn
1379 movdqa SHUF_MASK(%rip), %xmm10
1380 PSHUFB_XMM %xmm10, %xmm0
1382 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Yn)
1383 sub $16, %r11
1384 add %r13, %r11
1385 movdqu (%arg3,%r11,1), %xmm1 # receive the last <16 byte block
1386 lea SHIFT_MASK+16(%rip), %r12
1387 sub %r13, %r12
1388 # adjust the shuffle mask pointer to be able to shift 16-%r13 bytes
1389 # (%r13 is the number of bytes in plaintext mod 16)
1390 movdqu (%r12), %xmm2 # get the appropriate shuffle mask
1391 PSHUFB_XMM %xmm2, %xmm1 # right shift 16-%r13 bytes
1393 movdqa %xmm1, %xmm2
1394 pxor %xmm1, %xmm0 # Ciphertext XOR E(K, Yn)
1395 movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
1396 # get the appropriate mask to mask out top 16-%r13 bytes of %xmm0
1397 pand %xmm1, %xmm0 # mask out top 16-%r13 bytes of %xmm0
1398 pand %xmm1, %xmm2
1399 movdqa SHUF_MASK(%rip), %xmm10
1400 PSHUFB_XMM %xmm10 ,%xmm2
1402 pxor %xmm2, %xmm8
1403 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1404 # GHASH computation for the last <16 byte block
1405 sub %r13, %r11
1406 add $16, %r11
1408 # output %r13 bytes
1409 MOVQ_R64_XMM %xmm0, %rax
1410 cmp $8, %r13
1411 jle _less_than_8_bytes_left_decrypt
1412 mov %rax, (%arg2 , %r11, 1)
1413 add $8, %r11
1414 psrldq $8, %xmm0
1415 MOVQ_R64_XMM %xmm0, %rax
1416 sub $8, %r13
1417 _less_than_8_bytes_left_decrypt:
1418 mov %al, (%arg2, %r11, 1)
1419 add $1, %r11
1420 shr $8, %rax
1421 sub $1, %r13
1422 jne _less_than_8_bytes_left_decrypt
1423 _multiple_of_16_bytes_decrypt:
1424 mov arg8, %r12 # %r12 = aadLen (number of bytes)
1425 shl $3, %r12 # convert into number of bits
1426 movd %r12d, %xmm15 # len(A) in %xmm15
1427 shl $3, %arg4 # len(C) in bits (*8)
1428 MOVQ_R64_XMM %arg4, %xmm1
1429 pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
1430 pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
1431 pxor %xmm15, %xmm8
1432 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1433 # final GHASH computation
1434 movdqa SHUF_MASK(%rip), %xmm10
1435 PSHUFB_XMM %xmm10, %xmm8
1437 mov %arg5, %rax # %rax = *Y0
1438 movdqu (%rax), %xmm0 # %xmm0 = Y0
1439 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Y0)
1440 pxor %xmm8, %xmm0
1441 _return_T_decrypt:
1442 mov arg9, %r10 # %r10 = authTag
1443 mov arg10, %r11 # %r11 = auth_tag_len
1444 cmp $16, %r11
1445 je _T_16_decrypt
1446 cmp $12, %r11
1447 je _T_12_decrypt
1448 _T_8_decrypt:
1449 MOVQ_R64_XMM %xmm0, %rax
1450 mov %rax, (%r10)
1451 jmp _return_T_done_decrypt
1452 _T_12_decrypt:
1453 MOVQ_R64_XMM %xmm0, %rax
1454 mov %rax, (%r10)
1455 psrldq $8, %xmm0
1456 movd %xmm0, %eax
1457 mov %eax, 8(%r10)
1458 jmp _return_T_done_decrypt
1459 _T_16_decrypt:
1460 movdqu %xmm0, (%r10)
1461 _return_T_done_decrypt:
1462 mov %r14, %rsp
1463 pop %r14
1464 pop %r13
1465 pop %r12
1467 ENDPROC(aesni_gcm_dec)
1470 /*****************************************************************************
1471 * void aesni_gcm_enc(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1472 * u8 *out, // Ciphertext output. Encrypt in-place is allowed.
1473 * const u8 *in, // Plaintext input
1474 * u64 plaintext_len, // Length of data in bytes for encryption.
1475 * u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1476 * // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1477 * // concatenated with 0x00000001. 16-byte aligned pointer.
1478 * u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1479 * const u8 *aad, // Additional Authentication Data (AAD)
1480 * u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1481 * u8 *auth_tag, // Authenticated Tag output.
1482 * u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1483 * // 12 or 8.
1485 * Assumptions:
1487 * keys:
1488 * keys are pre-expanded and aligned to 16 bytes. we are using the
1489 * first set of 11 keys in the data structure void *aes_ctx
1492 * iv:
1493 * 0 1 2 3
1494 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1495 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1496 * | Salt (From the SA) |
1497 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1498 * | Initialization Vector |
1499 * | (This is the sequence number from IPSec header) |
1500 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1501 * | 0x1 |
1502 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1506 * AAD:
1507 * AAD padded to 128 bits with 0
1508 * for example, assume AAD is a u32 vector
1510 * if AAD is 8 bytes:
1511 * AAD[3] = {A0, A1};
1512 * padded AAD in xmm register = {A1 A0 0 0}
1514 * 0 1 2 3
1515 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1516 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1517 * | SPI (A1) |
1518 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1519 * | 32-bit Sequence Number (A0) |
1520 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1521 * | 0x0 |
1522 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1524 * AAD Format with 32-bit Sequence Number
1526 * if AAD is 12 bytes:
1527 * AAD[3] = {A0, A1, A2};
1528 * padded AAD in xmm register = {A2 A1 A0 0}
1530 * 0 1 2 3
1531 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1532 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1533 * | SPI (A2) |
1534 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1535 * | 64-bit Extended Sequence Number {A1,A0} |
1536 * | |
1537 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1538 * | 0x0 |
1539 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1541 * AAD Format with 64-bit Extended Sequence Number
1543 * aadLen:
1544 * from the definition of the spec, aadLen can only be 8 or 12 bytes.
1545 * The code supports 16 too but for other sizes, the code will fail.
1547 * TLen:
1548 * from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
1549 * For other sizes, the code will fail.
1551 * poly = x^128 + x^127 + x^126 + x^121 + 1
1552 ***************************************************************************/
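/*
 * The pre-counter block described above can be assembled in C roughly as
 * follows (illustrative only; variable names are not from this file):
 *
 *     u8 j0[16];
 *     memcpy(j0, salt, 4);           // 4-byte salt from the SA
 *     memcpy(j0 + 4, esp_iv, 8);     // 8-byte IV from the ESP payload
 *     j0[12] = 0; j0[13] = 0;
 *     j0[14] = 0; j0[15] = 1;        // trailing big-endian 0x00000001
 */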
1553 ENTRY(aesni_gcm_enc)
1554 push %r12
1555 push %r13
1556 push %r14
1557 mov %rsp, %r14
1559 # states of %xmm registers %xmm6:%xmm15 not saved
1560 # all %xmm registers are clobbered
1562 sub $VARIABLE_OFFSET, %rsp
1563 and $~63, %rsp
1564 mov %arg6, %r12
1565 movdqu (%r12), %xmm13
1566 movdqa SHUF_MASK(%rip), %xmm2
1567 PSHUFB_XMM %xmm2, %xmm13
1570 # precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
1572 movdqa %xmm13, %xmm2
1573 psllq $1, %xmm13
1574 psrlq $63, %xmm2
1575 movdqa %xmm2, %xmm1
1576 pslldq $8, %xmm2
1577 psrldq $8, %xmm1
1578 por %xmm2, %xmm13
1580 # reduce HashKey<<1
1582 pshufd $0x24, %xmm1, %xmm2
1583 pcmpeqd TWOONE(%rip), %xmm2
1584 pand POLY(%rip), %xmm2
1585 pxor %xmm2, %xmm13
1586 movdqa %xmm13, HashKey(%rsp) # store HashKey<<1 (mod poly)
1587 mov %arg4, %r13 # save the number of bytes of plaintext/ciphertext
1588 and $-16, %r13
1589 mov %r13, %r12
1591 # Encrypt first few blocks
1593 and $(3<<4), %r12
1594 jz _initial_num_blocks_is_0_encrypt
1595 cmp $(2<<4), %r12
1596 jb _initial_num_blocks_is_1_encrypt
1597 je _initial_num_blocks_is_2_encrypt
1598 _initial_num_blocks_is_3_encrypt:
1599 INITIAL_BLOCKS_ENC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1600 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc
1601 sub $48, %r13
1602 jmp _initial_blocks_encrypted
1603 _initial_num_blocks_is_2_encrypt:
1604 INITIAL_BLOCKS_ENC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1605 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc
1606 sub $32, %r13
1607 jmp _initial_blocks_encrypted
1608 _initial_num_blocks_is_1_encrypt:
1609 INITIAL_BLOCKS_ENC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1610 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc
1611 sub $16, %r13
1612 jmp _initial_blocks_encrypted
1613 _initial_num_blocks_is_0_encrypt:
1614 INITIAL_BLOCKS_ENC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1615 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc
1616 _initial_blocks_encrypted:
1618 # Main loop - Encrypt remaining blocks
1620 cmp $0, %r13
1621 je _zero_cipher_left_encrypt
1622 sub $64, %r13
1623 je _four_cipher_left_encrypt
1624 _encrypt_by_4_encrypt:
1625 GHASH_4_ENCRYPT_4_PARALLEL_ENC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
1626 %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc
1627 add $64, %r11
1628 sub $64, %r13
1629 jne _encrypt_by_4_encrypt
1630 _four_cipher_left_encrypt:
1631 GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
1632 %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
1633 _zero_cipher_left_encrypt:
1634 mov %arg4, %r13
1635 and $15, %r13 # %r13 = arg4 (mod 16)
1636 je _multiple_of_16_bytes_encrypt
1638 # Handle the last <16 Byte block separately
1639 paddd ONE(%rip), %xmm0 # INCR CNT to get Yn
1640 movdqa SHUF_MASK(%rip), %xmm10
1641 PSHUFB_XMM %xmm10, %xmm0
1644 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn)
1645 sub $16, %r11
1646 add %r13, %r11
1647 movdqu (%arg3,%r11,1), %xmm1 # receive the last <16 byte blocks
1648 lea SHIFT_MASK+16(%rip), %r12
1649 sub %r13, %r12
1650 # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
1651 # (%r13 is the number of bytes in plaintext mod 16)
1652 movdqu (%r12), %xmm2 # get the appropriate shuffle mask
1653 PSHUFB_XMM %xmm2, %xmm1 # right shift 16-%r13 bytes
1654 pxor %xmm1, %xmm0 # Plaintext XOR Encrypt(K, Yn)
1655 movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
1656 # get the appropriate mask to mask out top 16-r13 bytes of xmm0
1657 pand %xmm1, %xmm0 # mask out top 16-r13 bytes of xmm0
1658 movdqa SHUF_MASK(%rip), %xmm10
1659 PSHUFB_XMM %xmm10,%xmm0
1661 pxor %xmm0, %xmm8
1662 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1663 # GHASH computation for the last <16 byte block
1664 sub %r13, %r11
1665 add $16, %r11
1667 movdqa SHUF_MASK(%rip), %xmm10
1668 PSHUFB_XMM %xmm10, %xmm0
1670 # shuffle xmm0 back to output as ciphertext
1672 # Output %r13 bytes
1673 MOVQ_R64_XMM %xmm0, %rax
1674 cmp $8, %r13
1675 jle _less_than_8_bytes_left_encrypt
1676 mov %rax, (%arg2 , %r11, 1)
1677 add $8, %r11
1678 psrldq $8, %xmm0
1679 MOVQ_R64_XMM %xmm0, %rax
1680 sub $8, %r13
1681 _less_than_8_bytes_left_encrypt:
1682 mov %al, (%arg2, %r11, 1)
1683 add $1, %r11
1684 shr $8, %rax
1685 sub $1, %r13
1686 jne _less_than_8_bytes_left_encrypt
1687 _multiple_of_16_bytes_encrypt:
1688 mov arg8, %r12 # %r12 = aadLen (number of bytes)
1689 shl $3, %r12
1690 movd %r12d, %xmm15 # len(A) in %xmm15
1691 shl $3, %arg4 # len(C) in bits (*8)
1692 MOVQ_R64_XMM %arg4, %xmm1
1693 pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
1694 pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
1695 pxor %xmm15, %xmm8
1696 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1697 # final GHASH computation
1698 movdqa SHUF_MASK(%rip), %xmm10
1699 PSHUFB_XMM %xmm10, %xmm8 # perform a 16 byte swap
1701 mov %arg5, %rax # %rax = *Y0
1702 movdqu (%rax), %xmm0 # %xmm0 = Y0
1703 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm15 # Encrypt(K, Y0)
1704 pxor %xmm8, %xmm0
1705 _return_T_encrypt:
1706 mov arg9, %r10 # %r10 = authTag
1707 mov arg10, %r11 # %r11 = auth_tag_len
1708 cmp $16, %r11
1709 je _T_16_encrypt
1710 cmp $12, %r11
1711 je _T_12_encrypt
1712 _T_8_encrypt:
1713 MOVQ_R64_XMM %xmm0, %rax
1714 mov %rax, (%r10)
1715 jmp _return_T_done_encrypt
1716 _T_12_encrypt:
1717 MOVQ_R64_XMM %xmm0, %rax
1718 mov %rax, (%r10)
1719 psrldq $8, %xmm0
1720 movd %xmm0, %eax
1721 mov %eax, 8(%r10)
1722 jmp _return_T_done_encrypt
1723 _T_16_encrypt:
1724 movdqu %xmm0, (%r10)
1725 _return_T_done_encrypt:
1726 mov %r14, %rsp
1727 pop %r14
1728 pop %r13
1729 pop %r12
1731 ENDPROC(aesni_gcm_enc)
1733 #endif
1736 .align 4
1737 _key_expansion_128:
1738 _key_expansion_256a:
1739 pshufd $0b11111111, %xmm1, %xmm1
1740 shufps $0b00010000, %xmm0, %xmm4
1741 pxor %xmm4, %xmm0
1742 shufps $0b10001100, %xmm0, %xmm4
1743 pxor %xmm4, %xmm0
1744 pxor %xmm1, %xmm0
1745 movaps %xmm0, (TKEYP)
1746 add $0x10, TKEYP
1748 ENDPROC(_key_expansion_128)
1749 ENDPROC(_key_expansion_256a)
1751 .align 4
1752 _key_expansion_192a:
1753 pshufd $0b01010101, %xmm1, %xmm1
1754 shufps $0b00010000, %xmm0, %xmm4
1755 pxor %xmm4, %xmm0
1756 shufps $0b10001100, %xmm0, %xmm4
1757 pxor %xmm4, %xmm0
1758 pxor %xmm1, %xmm0
1760 movaps %xmm2, %xmm5
1761 movaps %xmm2, %xmm6
1762 pslldq $4, %xmm5
1763 pshufd $0b11111111, %xmm0, %xmm3
1764 pxor %xmm3, %xmm2
1765 pxor %xmm5, %xmm2
1767 movaps %xmm0, %xmm1
1768 shufps $0b01000100, %xmm0, %xmm6
1769 movaps %xmm6, (TKEYP)
1770 shufps $0b01001110, %xmm2, %xmm1
1771 movaps %xmm1, 0x10(TKEYP)
1772 add $0x20, TKEYP
1774 ENDPROC(_key_expansion_192a)
1776 .align 4
1777 _key_expansion_192b:
1778 pshufd $0b01010101, %xmm1, %xmm1
1779 shufps $0b00010000, %xmm0, %xmm4
1780 pxor %xmm4, %xmm0
1781 shufps $0b10001100, %xmm0, %xmm4
1782 pxor %xmm4, %xmm0
1783 pxor %xmm1, %xmm0
1785 movaps %xmm2, %xmm5
1786 pslldq $4, %xmm5
1787 pshufd $0b11111111, %xmm0, %xmm3
1788 pxor %xmm3, %xmm2
1789 pxor %xmm5, %xmm2
1791 movaps %xmm0, (TKEYP)
1792 add $0x10, TKEYP
1794 ENDPROC(_key_expansion_192b)
1796 .align 4
1797 _key_expansion_256b:
1798 pshufd $0b10101010, %xmm1, %xmm1
1799 shufps $0b00010000, %xmm2, %xmm4
1800 pxor %xmm4, %xmm2
1801 shufps $0b10001100, %xmm2, %xmm4
1802 pxor %xmm4, %xmm2
1803 pxor %xmm1, %xmm2
1804 movaps %xmm2, (TKEYP)
1805 add $0x10, TKEYP
1807 ENDPROC(_key_expansion_256b)
1810 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
1811 * unsigned int key_len)
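# Key schedule layout produced below: the encryption round keys start at
# KEYP, the decryption (Equivalent Inverse Cipher) round keys at KEYP+240,
# and the key length in bytes (16, 24 or 32) is stored at KEYP+480.  The
# cmp $24, %dl dispatches to the 128-, 192- or 256-bit expansion path.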
1813 ENTRY(aesni_set_key)
1814 FRAME_BEGIN
1815 #ifndef __x86_64__
1816 pushl KEYP
1817 movl (FRAME_OFFSET+8)(%esp), KEYP # ctx
1818 movl (FRAME_OFFSET+12)(%esp), UKEYP # in_key
1819 movl (FRAME_OFFSET+16)(%esp), %edx # key_len
1820 #endif
1821 movups (UKEYP), %xmm0 # user key (first 16 bytes)
1822 movaps %xmm0, (KEYP)
1823 lea 0x10(KEYP), TKEYP # key addr
1824 movl %edx, 480(KEYP)
1825 pxor %xmm4, %xmm4 # xmm4 is assumed 0 in _key_expansion_x
1826 cmp $24, %dl
1827 jb .Lenc_key128
1828 je .Lenc_key192
1829 movups 0x10(UKEYP), %xmm2 # other user key
1830 movaps %xmm2, (TKEYP)
1831 add $0x10, TKEYP
1832 AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
1833 call _key_expansion_256a
1834 AESKEYGENASSIST 0x1 %xmm0 %xmm1
1835 call _key_expansion_256b
1836 AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
1837 call _key_expansion_256a
1838 AESKEYGENASSIST 0x2 %xmm0 %xmm1
1839 call _key_expansion_256b
1840 AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3
1841 call _key_expansion_256a
1842 AESKEYGENASSIST 0x4 %xmm0 %xmm1
1843 call _key_expansion_256b
1844 AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4
1845 call _key_expansion_256a
1846 AESKEYGENASSIST 0x8 %xmm0 %xmm1
1847 call _key_expansion_256b
1848 AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5
1849 call _key_expansion_256a
1850 AESKEYGENASSIST 0x10 %xmm0 %xmm1
1851 call _key_expansion_256b
1852 AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6
1853 call _key_expansion_256a
1854 AESKEYGENASSIST 0x20 %xmm0 %xmm1
1855 call _key_expansion_256b
1856 AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7
1857 call _key_expansion_256a
1858 jmp .Ldec_key
1859 .Lenc_key192:
1860 movq 0x10(UKEYP), %xmm2 # other user key
1861 AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
1862 call _key_expansion_192a
1863 AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
1864 call _key_expansion_192b
1865 AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3
1866 call _key_expansion_192a
1867 AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4
1868 call _key_expansion_192b
1869 AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5
1870 call _key_expansion_192a
1871 AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6
1872 call _key_expansion_192b
1873 AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7
1874 call _key_expansion_192a
1875 AESKEYGENASSIST 0x80 %xmm2 %xmm1 # round 8
1876 call _key_expansion_192b
1877 jmp .Ldec_key
1878 .Lenc_key128:
1879 AESKEYGENASSIST 0x1 %xmm0 %xmm1 # round 1
1880 call _key_expansion_128
1881 AESKEYGENASSIST 0x2 %xmm0 %xmm1 # round 2
1882 call _key_expansion_128
1883 AESKEYGENASSIST 0x4 %xmm0 %xmm1 # round 3
1884 call _key_expansion_128
1885 AESKEYGENASSIST 0x8 %xmm0 %xmm1 # round 4
1886 call _key_expansion_128
1887 AESKEYGENASSIST 0x10 %xmm0 %xmm1 # round 5
1888 call _key_expansion_128
1889 AESKEYGENASSIST 0x20 %xmm0 %xmm1 # round 6
1890 call _key_expansion_128
1891 AESKEYGENASSIST 0x40 %xmm0 %xmm1 # round 7
1892 call _key_expansion_128
1893 AESKEYGENASSIST 0x80 %xmm0 %xmm1 # round 8
1894 call _key_expansion_128
1895 AESKEYGENASSIST 0x1b %xmm0 %xmm1 # round 9
1896 call _key_expansion_128
1897 AESKEYGENASSIST 0x36 %xmm0 %xmm1 # round 10
1898 call _key_expansion_128
1899 .Ldec_key:
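# Build the decryption schedule for the Equivalent Inverse Cipher: the
# first and last encryption round keys are copied as-is, every inner round
# key goes through AESIMC (InvMixColumns), and the results are stored in
# reverse order starting at KEYP+240.  Roughly (illustrative C, with
# hypothetical names enc[]/dec[] and nr = number of rounds):
#
#       dec[0]  = enc[nr];
#       for (i = 1; i < nr; i++)
#               dec[i] = InvMixColumns(enc[nr - i]);
#       dec[nr] = enc[0];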
1900 sub $0x10, TKEYP
1901 movaps (KEYP), %xmm0
1902 movaps (TKEYP), %xmm1
1903 movaps %xmm0, 240(TKEYP)
1904 movaps %xmm1, 240(KEYP)
1905 add $0x10, KEYP
1906 lea 240-16(TKEYP), UKEYP
1907 .align 4
1908 .Ldec_key_loop:
1909 movaps (KEYP), %xmm0
1910 AESIMC %xmm0 %xmm1
1911 movaps %xmm1, (UKEYP)
1912 add $0x10, KEYP
1913 sub $0x10, UKEYP
1914 cmp TKEYP, KEYP
1915 jb .Ldec_key_loop
1916 xor AREG, AREG
1917 #ifndef __x86_64__
1918 popl KEYP
1919 #endif
1920 FRAME_END
1922 ENDPROC(aesni_set_key)
1925 * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
1927 ENTRY(aesni_enc)
1928 FRAME_BEGIN
1929 #ifndef __x86_64__
1930 pushl KEYP
1931 pushl KLEN
1932 movl (FRAME_OFFSET+12)(%esp), KEYP # ctx
1933 movl (FRAME_OFFSET+16)(%esp), OUTP # dst
1934 movl (FRAME_OFFSET+20)(%esp), INP # src
1935 #endif
1936 movl 480(KEYP), KLEN # key length
1937 movups (INP), STATE # input
1938 call _aesni_enc1
1939 movups STATE, (OUTP) # output
1940 #ifndef __x86_64__
1941 popl KLEN
1942 popl KEYP
1943 #endif
1944 FRAME_END
1946 ENDPROC(aesni_enc)
1949 * _aesni_enc1: internal ABI
1950 * input:
1951 * KEYP: key struct pointer
1952 * KLEN: key length
1953 * STATE: initial state (input)
1954 * output:
1955 * STATE: final state (output)
1956 * changed:
1957 * KEY
1958 * TKEYP (T1)
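# Informal C-level sketch of _aesni_enc1 (TKEYP is biased into the schedule
# so the 128/192/256-bit cases share one tail; the extra rounds of the
# longer keys use the negative offsets):
#
#       state ^= rk[0];                        /* round 0 whitening */
#       for (i = 1; i < nr; i++)
#               state = AESENC(state, rk[i]);
#       state = AESENCLAST(state, rk[nr]);     /* final round, no MixColumns */
#
# with nr = 10, 12 or 14 rounds for 128-, 192- and 256-bit keys.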
1960 .align 4
1961 _aesni_enc1:
1962 movaps (KEYP), KEY # key
1963 mov KEYP, TKEYP
1964 pxor KEY, STATE # round 0
1965 add $0x30, TKEYP
1966 cmp $24, KLEN
1967 jb .Lenc128
1968 lea 0x20(TKEYP), TKEYP
1969 je .Lenc192
1970 add $0x20, TKEYP
1971 movaps -0x60(TKEYP), KEY
1972 AESENC KEY STATE
1973 movaps -0x50(TKEYP), KEY
1974 AESENC KEY STATE
1975 .align 4
1976 .Lenc192:
1977 movaps -0x40(TKEYP), KEY
1978 AESENC KEY STATE
1979 movaps -0x30(TKEYP), KEY
1980 AESENC KEY STATE
1981 .align 4
1982 .Lenc128:
1983 movaps -0x20(TKEYP), KEY
1984 AESENC KEY STATE
1985 movaps -0x10(TKEYP), KEY
1986 AESENC KEY STATE
1987 movaps (TKEYP), KEY
1988 AESENC KEY STATE
1989 movaps 0x10(TKEYP), KEY
1990 AESENC KEY STATE
1991 movaps 0x20(TKEYP), KEY
1992 AESENC KEY STATE
1993 movaps 0x30(TKEYP), KEY
1994 AESENC KEY STATE
1995 movaps 0x40(TKEYP), KEY
1996 AESENC KEY STATE
1997 movaps 0x50(TKEYP), KEY
1998 AESENC KEY STATE
1999 movaps 0x60(TKEYP), KEY
2000 AESENC KEY STATE
2001 movaps 0x70(TKEYP), KEY
2002 AESENCLAST KEY STATE
2004 ENDPROC(_aesni_enc1)
2007 * _aesni_enc4: internal ABI
2008 * input:
2009 * KEYP: key struct pointer
2010 * KLEN: key length
2011 * STATE1: initial state (input)
2012 * STATE2
2013 * STATE3
2014 * STATE4
2015 * output:
2016 * STATE1: final state (output)
2017 * STATE2
2018 * STATE3
2019 * STATE4
2020 * changed:
2021 * KEY
2022 * TKEYP (T1)
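# _aesni_enc4 is _aesni_enc1 unrolled over four independent blocks: each
# round key is loaded once and applied to STATE1..STATE4 back to back, so
# the pipelined AESENC units can overlap the four streams instead of
# serializing on the latency of a single dependent chain.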
2024 .align 4
2025 _aesni_enc4:
2026 movaps (KEYP), KEY # key
2027 mov KEYP, TKEYP
2028 pxor KEY, STATE1 # round 0
2029 pxor KEY, STATE2
2030 pxor KEY, STATE3
2031 pxor KEY, STATE4
2032 add $0x30, TKEYP
2033 cmp $24, KLEN
2034 jb .L4enc128
2035 lea 0x20(TKEYP), TKEYP
2036 je .L4enc192
2037 add $0x20, TKEYP
2038 movaps -0x60(TKEYP), KEY
2039 AESENC KEY STATE1
2040 AESENC KEY STATE2
2041 AESENC KEY STATE3
2042 AESENC KEY STATE4
2043 movaps -0x50(TKEYP), KEY
2044 AESENC KEY STATE1
2045 AESENC KEY STATE2
2046 AESENC KEY STATE3
2047 AESENC KEY STATE4
2048 #.align 4
2049 .L4enc192:
2050 movaps -0x40(TKEYP), KEY
2051 AESENC KEY STATE1
2052 AESENC KEY STATE2
2053 AESENC KEY STATE3
2054 AESENC KEY STATE4
2055 movaps -0x30(TKEYP), KEY
2056 AESENC KEY STATE1
2057 AESENC KEY STATE2
2058 AESENC KEY STATE3
2059 AESENC KEY STATE4
2060 #.align 4
2061 .L4enc128:
2062 movaps -0x20(TKEYP), KEY
2063 AESENC KEY STATE1
2064 AESENC KEY STATE2
2065 AESENC KEY STATE3
2066 AESENC KEY STATE4
2067 movaps -0x10(TKEYP), KEY
2068 AESENC KEY STATE1
2069 AESENC KEY STATE2
2070 AESENC KEY STATE3
2071 AESENC KEY STATE4
2072 movaps (TKEYP), KEY
2073 AESENC KEY STATE1
2074 AESENC KEY STATE2
2075 AESENC KEY STATE3
2076 AESENC KEY STATE4
2077 movaps 0x10(TKEYP), KEY
2078 AESENC KEY STATE1
2079 AESENC KEY STATE2
2080 AESENC KEY STATE3
2081 AESENC KEY STATE4
2082 movaps 0x20(TKEYP), KEY
2083 AESENC KEY STATE1
2084 AESENC KEY STATE2
2085 AESENC KEY STATE3
2086 AESENC KEY STATE4
2087 movaps 0x30(TKEYP), KEY
2088 AESENC KEY STATE1
2089 AESENC KEY STATE2
2090 AESENC KEY STATE3
2091 AESENC KEY STATE4
2092 movaps 0x40(TKEYP), KEY
2093 AESENC KEY STATE1
2094 AESENC KEY STATE2
2095 AESENC KEY STATE3
2096 AESENC KEY STATE4
2097 movaps 0x50(TKEYP), KEY
2098 AESENC KEY STATE1
2099 AESENC KEY STATE2
2100 AESENC KEY STATE3
2101 AESENC KEY STATE4
2102 movaps 0x60(TKEYP), KEY
2103 AESENC KEY STATE1
2104 AESENC KEY STATE2
2105 AESENC KEY STATE3
2106 AESENC KEY STATE4
2107 movaps 0x70(TKEYP), KEY
2108 AESENCLAST KEY STATE1 # last round
2109 AESENCLAST KEY STATE2
2110 AESENCLAST KEY STATE3
2111 AESENCLAST KEY STATE4
2113 ENDPROC(_aesni_enc4)
2116 * void aesni_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
2118 ENTRY(aesni_dec)
2119 FRAME_BEGIN
2120 #ifndef __x86_64__
2121 pushl KEYP
2122 pushl KLEN
2123 movl (FRAME_OFFSET+12)(%esp), KEYP # ctx
2124 movl (FRAME_OFFSET+16)(%esp), OUTP # dst
2125 movl (FRAME_OFFSET+20)(%esp), INP # src
2126 #endif
2127 mov 480(KEYP), KLEN # key length
2128 add $240, KEYP
2129 movups (INP), STATE # input
2130 call _aesni_dec1
2131 movups STATE, (OUTP) #output
2132 #ifndef __x86_64__
2133 popl KLEN
2134 popl KEYP
2135 #endif
2136 FRAME_END
2138 ENDPROC(aesni_dec)
2141 * _aesni_dec1: internal ABI
2142 * input:
2143 * KEYP: key struct pointer
2144 * KLEN: key length
2145 * STATE: initial state (input)
2146 * output:
2147 * STATE: final state (output)
2148 * changed:
2149 * KEY
2150 * TKEYP (T1)
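# _aesni_dec1 mirrors _aesni_enc1 with AESDEC/AESDECLAST and expects KEYP
# to point at the decryption schedule built by aesni_set_key (callers such
# as aesni_dec add 240 to the context pointer before calling in).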
2152 .align 4
2153 _aesni_dec1:
2154 movaps (KEYP), KEY # key
2155 mov KEYP, TKEYP
2156 pxor KEY, STATE # round 0
2157 add $0x30, TKEYP
2158 cmp $24, KLEN
2159 jb .Ldec128
2160 lea 0x20(TKEYP), TKEYP
2161 je .Ldec192
2162 add $0x20, TKEYP
2163 movaps -0x60(TKEYP), KEY
2164 AESDEC KEY STATE
2165 movaps -0x50(TKEYP), KEY
2166 AESDEC KEY STATE
2167 .align 4
2168 .Ldec192:
2169 movaps -0x40(TKEYP), KEY
2170 AESDEC KEY STATE
2171 movaps -0x30(TKEYP), KEY
2172 AESDEC KEY STATE
2173 .align 4
2174 .Ldec128:
2175 movaps -0x20(TKEYP), KEY
2176 AESDEC KEY STATE
2177 movaps -0x10(TKEYP), KEY
2178 AESDEC KEY STATE
2179 movaps (TKEYP), KEY
2180 AESDEC KEY STATE
2181 movaps 0x10(TKEYP), KEY
2182 AESDEC KEY STATE
2183 movaps 0x20(TKEYP), KEY
2184 AESDEC KEY STATE
2185 movaps 0x30(TKEYP), KEY
2186 AESDEC KEY STATE
2187 movaps 0x40(TKEYP), KEY
2188 AESDEC KEY STATE
2189 movaps 0x50(TKEYP), KEY
2190 AESDEC KEY STATE
2191 movaps 0x60(TKEYP), KEY
2192 AESDEC KEY STATE
2193 movaps 0x70(TKEYP), KEY
2194 AESDECLAST KEY STATE
2196 ENDPROC(_aesni_dec1)
2199 * _aesni_dec4: internal ABI
2200 * input:
2201 * KEYP: key struct pointer
2202 * KLEN: key length
2203 * STATE1: initial state (input)
2204 * STATE2
2205 * STATE3
2206 * STATE4
2207 * output:
2208 * STATE1: final state (output)
2209 * STATE2
2210 * STATE3
2211 * STATE4
2212 * changed:
2213 * KEY
2214 * TKEYP (T1)
2216 .align 4
2217 _aesni_dec4:
2218 movaps (KEYP), KEY # key
2219 mov KEYP, TKEYP
2220 pxor KEY, STATE1 # round 0
2221 pxor KEY, STATE2
2222 pxor KEY, STATE3
2223 pxor KEY, STATE4
2224 add $0x30, TKEYP
2225 cmp $24, KLEN
2226 jb .L4dec128
2227 lea 0x20(TKEYP), TKEYP
2228 je .L4dec192
2229 add $0x20, TKEYP
2230 movaps -0x60(TKEYP), KEY
2231 AESDEC KEY STATE1
2232 AESDEC KEY STATE2
2233 AESDEC KEY STATE3
2234 AESDEC KEY STATE4
2235 movaps -0x50(TKEYP), KEY
2236 AESDEC KEY STATE1
2237 AESDEC KEY STATE2
2238 AESDEC KEY STATE3
2239 AESDEC KEY STATE4
2240 .align 4
2241 .L4dec192:
2242 movaps -0x40(TKEYP), KEY
2243 AESDEC KEY STATE1
2244 AESDEC KEY STATE2
2245 AESDEC KEY STATE3
2246 AESDEC KEY STATE4
2247 movaps -0x30(TKEYP), KEY
2248 AESDEC KEY STATE1
2249 AESDEC KEY STATE2
2250 AESDEC KEY STATE3
2251 AESDEC KEY STATE4
2252 .align 4
2253 .L4dec128:
2254 movaps -0x20(TKEYP), KEY
2255 AESDEC KEY STATE1
2256 AESDEC KEY STATE2
2257 AESDEC KEY STATE3
2258 AESDEC KEY STATE4
2259 movaps -0x10(TKEYP), KEY
2260 AESDEC KEY STATE1
2261 AESDEC KEY STATE2
2262 AESDEC KEY STATE3
2263 AESDEC KEY STATE4
2264 movaps (TKEYP), KEY
2265 AESDEC KEY STATE1
2266 AESDEC KEY STATE2
2267 AESDEC KEY STATE3
2268 AESDEC KEY STATE4
2269 movaps 0x10(TKEYP), KEY
2270 AESDEC KEY STATE1
2271 AESDEC KEY STATE2
2272 AESDEC KEY STATE3
2273 AESDEC KEY STATE4
2274 movaps 0x20(TKEYP), KEY
2275 AESDEC KEY STATE1
2276 AESDEC KEY STATE2
2277 AESDEC KEY STATE3
2278 AESDEC KEY STATE4
2279 movaps 0x30(TKEYP), KEY
2280 AESDEC KEY STATE1
2281 AESDEC KEY STATE2
2282 AESDEC KEY STATE3
2283 AESDEC KEY STATE4
2284 movaps 0x40(TKEYP), KEY
2285 AESDEC KEY STATE1
2286 AESDEC KEY STATE2
2287 AESDEC KEY STATE3
2288 AESDEC KEY STATE4
2289 movaps 0x50(TKEYP), KEY
2290 AESDEC KEY STATE1
2291 AESDEC KEY STATE2
2292 AESDEC KEY STATE3
2293 AESDEC KEY STATE4
2294 movaps 0x60(TKEYP), KEY
2295 AESDEC KEY STATE1
2296 AESDEC KEY STATE2
2297 AESDEC KEY STATE3
2298 AESDEC KEY STATE4
2299 movaps 0x70(TKEYP), KEY
2300 AESDECLAST KEY STATE1 # last round
2301 AESDECLAST KEY STATE2
2302 AESDECLAST KEY STATE3
2303 AESDECLAST KEY STATE4
2305 ENDPROC(_aesni_dec4)
2308 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2309 * size_t len)
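# ECB encrypts every 16-byte block independently, roughly
#
#       for (each 16-byte block)
#               out = AES_encrypt(key, in);
#
# so the body below takes a four-block fast path through _aesni_enc4 while
# at least 64 bytes remain and then finishes one block at a time; any
# trailing partial block (len not a multiple of 16) is left untouched.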
2311 ENTRY(aesni_ecb_enc)
2312 FRAME_BEGIN
2313 #ifndef __x86_64__
2314 pushl LEN
2315 pushl KEYP
2316 pushl KLEN
2317 movl (FRAME_OFFSET+16)(%esp), KEYP # ctx
2318 movl (FRAME_OFFSET+20)(%esp), OUTP # dst
2319 movl (FRAME_OFFSET+24)(%esp), INP # src
2320 movl (FRAME_OFFSET+28)(%esp), LEN # len
2321 #endif
2322 test LEN, LEN # check length
2323 jz .Lecb_enc_ret
2324 mov 480(KEYP), KLEN
2325 cmp $16, LEN
2326 jb .Lecb_enc_ret
2327 cmp $64, LEN
2328 jb .Lecb_enc_loop1
2329 .align 4
2330 .Lecb_enc_loop4:
2331 movups (INP), STATE1
2332 movups 0x10(INP), STATE2
2333 movups 0x20(INP), STATE3
2334 movups 0x30(INP), STATE4
2335 call _aesni_enc4
2336 movups STATE1, (OUTP)
2337 movups STATE2, 0x10(OUTP)
2338 movups STATE3, 0x20(OUTP)
2339 movups STATE4, 0x30(OUTP)
2340 sub $64, LEN
2341 add $64, INP
2342 add $64, OUTP
2343 cmp $64, LEN
2344 jge .Lecb_enc_loop4
2345 cmp $16, LEN
2346 jb .Lecb_enc_ret
2347 .align 4
2348 .Lecb_enc_loop1:
2349 movups (INP), STATE1
2350 call _aesni_enc1
2351 movups STATE1, (OUTP)
2352 sub $16, LEN
2353 add $16, INP
2354 add $16, OUTP
2355 cmp $16, LEN
2356 jge .Lecb_enc_loop1
2357 .Lecb_enc_ret:
2358 #ifndef __x86_64__
2359 popl KLEN
2360 popl KEYP
2361 popl LEN
2362 #endif
2363 FRAME_END
2365 ENDPROC(aesni_ecb_enc)
2368 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2369 * size_t len);
2371 ENTRY(aesni_ecb_dec)
2372 FRAME_BEGIN
2373 #ifndef __x86_64__
2374 pushl LEN
2375 pushl KEYP
2376 pushl KLEN
2377 movl (FRAME_OFFSET+16)(%esp), KEYP # ctx
2378 movl (FRAME_OFFSET+20)(%esp), OUTP # dst
2379 movl (FRAME_OFFSET+24)(%esp), INP # src
2380 movl (FRAME_OFFSET+28)(%esp), LEN # len
2381 #endif
2382 test LEN, LEN
2383 jz .Lecb_dec_ret
2384 mov 480(KEYP), KLEN
2385 add $240, KEYP
2386 cmp $16, LEN
2387 jb .Lecb_dec_ret
2388 cmp $64, LEN
2389 jb .Lecb_dec_loop1
2390 .align 4
2391 .Lecb_dec_loop4:
2392 movups (INP), STATE1
2393 movups 0x10(INP), STATE2
2394 movups 0x20(INP), STATE3
2395 movups 0x30(INP), STATE4
2396 call _aesni_dec4
2397 movups STATE1, (OUTP)
2398 movups STATE2, 0x10(OUTP)
2399 movups STATE3, 0x20(OUTP)
2400 movups STATE4, 0x30(OUTP)
2401 sub $64, LEN
2402 add $64, INP
2403 add $64, OUTP
2404 cmp $64, LEN
2405 jge .Lecb_dec_loop4
2406 cmp $16, LEN
2407 jb .Lecb_dec_ret
2408 .align 4
2409 .Lecb_dec_loop1:
2410 movups (INP), STATE1
2411 call _aesni_dec1
2412 movups STATE1, (OUTP)
2413 sub $16, LEN
2414 add $16, INP
2415 add $16, OUTP
2416 cmp $16, LEN
2417 jge .Lecb_dec_loop1
2418 .Lecb_dec_ret:
2419 #ifndef __x86_64__
2420 popl KLEN
2421 popl KEYP
2422 popl LEN
2423 #endif
2424 FRAME_END
2426 ENDPROC(aesni_ecb_dec)
2429 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2430 * size_t len, u8 *iv)
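# CBC encryption chains blocks, C[i] = E(K, P[i] ^ C[i-1]) with C[-1] = IV,
# so it cannot be parallelized: the loop below is strictly one block at a
# time and writes the last ciphertext block back through IVP so a follow-up
# call can continue the chain.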
2432 ENTRY(aesni_cbc_enc)
2433 FRAME_BEGIN
2434 #ifndef __x86_64__
2435 pushl IVP
2436 pushl LEN
2437 pushl KEYP
2438 pushl KLEN
2439 movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
2440 movl (FRAME_OFFSET+24)(%esp), OUTP # dst
2441 movl (FRAME_OFFSET+28)(%esp), INP # src
2442 movl (FRAME_OFFSET+32)(%esp), LEN # len
2443 movl (FRAME_OFFSET+36)(%esp), IVP # iv
2444 #endif
2445 cmp $16, LEN
2446 jb .Lcbc_enc_ret
2447 mov 480(KEYP), KLEN
2448 movups (IVP), STATE # load iv as initial state
2449 .align 4
2450 .Lcbc_enc_loop:
2451 movups (INP), IN # load input
2452 pxor IN, STATE
2453 call _aesni_enc1
2454 movups STATE, (OUTP) # store output
2455 sub $16, LEN
2456 add $16, INP
2457 add $16, OUTP
2458 cmp $16, LEN
2459 jge .Lcbc_enc_loop
2460 movups STATE, (IVP)
2461 .Lcbc_enc_ret:
2462 #ifndef __x86_64__
2463 popl KLEN
2464 popl KEYP
2465 popl LEN
2466 popl IVP
2467 #endif
2468 FRAME_END
2470 ENDPROC(aesni_cbc_enc)
2473 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2474 * size_t len, u8 *iv)
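# CBC decryption computes P[i] = D(K, C[i]) ^ C[i-1].  The input blocks are
# independent, so four of them can be pushed through _aesni_dec4 at once;
# the previous ciphertext blocks are kept in IV/IN1..IN4 for the final XOR,
# and the last ciphertext block is stored back through IVP.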
2476 ENTRY(aesni_cbc_dec)
2477 FRAME_BEGIN
2478 #ifndef __x86_64__
2479 pushl IVP
2480 pushl LEN
2481 pushl KEYP
2482 pushl KLEN
2483 movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
2484 movl (FRAME_OFFSET+24)(%esp), OUTP # dst
2485 movl (FRAME_OFFSET+28)(%esp), INP # src
2486 movl (FRAME_OFFSET+32)(%esp), LEN # len
2487 movl (FRAME_OFFSET+36)(%esp), IVP # iv
2488 #endif
2489 cmp $16, LEN
2490 jb .Lcbc_dec_just_ret
2491 mov 480(KEYP), KLEN
2492 add $240, KEYP
2493 movups (IVP), IV
2494 cmp $64, LEN
2495 jb .Lcbc_dec_loop1
2496 .align 4
2497 .Lcbc_dec_loop4:
2498 movups (INP), IN1
2499 movaps IN1, STATE1
2500 movups 0x10(INP), IN2
2501 movaps IN2, STATE2
2502 #ifdef __x86_64__
2503 movups 0x20(INP), IN3
2504 movaps IN3, STATE3
2505 movups 0x30(INP), IN4
2506 movaps IN4, STATE4
2507 #else
2508 movups 0x20(INP), IN1
2509 movaps IN1, STATE3
2510 movups 0x30(INP), IN2
2511 movaps IN2, STATE4
2512 #endif
2513 call _aesni_dec4
2514 pxor IV, STATE1
2515 #ifdef __x86_64__
2516 pxor IN1, STATE2
2517 pxor IN2, STATE3
2518 pxor IN3, STATE4
2519 movaps IN4, IV
2520 #else
2521 pxor IN1, STATE4
2522 movaps IN2, IV
2523 movups (INP), IN1
2524 pxor IN1, STATE2
2525 movups 0x10(INP), IN2
2526 pxor IN2, STATE3
2527 #endif
2528 movups STATE1, (OUTP)
2529 movups STATE2, 0x10(OUTP)
2530 movups STATE3, 0x20(OUTP)
2531 movups STATE4, 0x30(OUTP)
2532 sub $64, LEN
2533 add $64, INP
2534 add $64, OUTP
2535 cmp $64, LEN
2536 jge .Lcbc_dec_loop4
2537 cmp $16, LEN
2538 jb .Lcbc_dec_ret
2539 .align 4
2540 .Lcbc_dec_loop1:
2541 movups (INP), IN
2542 movaps IN, STATE
2543 call _aesni_dec1
2544 pxor IV, STATE
2545 movups STATE, (OUTP)
2546 movaps IN, IV
2547 sub $16, LEN
2548 add $16, INP
2549 add $16, OUTP
2550 cmp $16, LEN
2551 jge .Lcbc_dec_loop1
2552 .Lcbc_dec_ret:
2553 movups IV, (IVP)
2554 .Lcbc_dec_just_ret:
2555 #ifndef __x86_64__
2556 popl KLEN
2557 popl KEYP
2558 popl LEN
2559 popl IVP
2560 #endif
2561 FRAME_END
2563 ENDPROC(aesni_cbc_dec)
2565 #ifdef __x86_64__
2566 .align 16
2567 .Lbswap_mask:
2568 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
2571 * _aesni_inc_init: internal ABI
2572 * set up registers used by _aesni_inc
2573 * input:
2574 * IV
2575 * output:
2576 * CTR: == IV, in little endian
2577 * TCTR_LOW: == lower qword of CTR
2578 * INC: == 1, in little endian
2579 * BSWAP_MASK == endian swapping mask
2581 .align 4
2582 _aesni_inc_init:
2583 movaps .Lbswap_mask(%rip), BSWAP_MASK
2584 movaps IV, CTR
2585 PSHUFB_XMM BSWAP_MASK CTR
2586 mov $1, TCTR_LOW
2587 MOVQ_R64_XMM TCTR_LOW INC
2588 MOVQ_R64_XMM CTR TCTR_LOW
2590 ENDPROC(_aesni_inc_init)
2593 * _aesni_inc: internal ABI
2594 * Increment IV by 1; IV is in big endian
2595 * input:
2596 * IV
2597 * CTR: == IV, in little endian
2598 * TCTR_LOW: == lower qword of CTR
2599 * INC: == 1, in little endian
2600 * BSWAP_MASK == endian swapping mask
2601 * output:
2602 * IV: incremented by 1
2603 * changed:
2604 * CTR: == output IV, in little endian
2605 * TCTR_LOW: == lower qword of CTR
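# The counter is kept byte-swapped in CTR so paddq can increment it;
# TCTR_LOW shadows the low qword in a general-purpose register purely to
# detect the 2^64 wrap: the add $1/jnc below notices the carry and, when it
# occurs, propagates the +1 into the high qword by temporarily shifting INC
# up.  IV is then produced by swapping CTR back to big endian.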
2607 .align 4
2608 _aesni_inc:
2609 paddq INC, CTR
2610 add $1, TCTR_LOW
2611 jnc .Linc_low
2612 pslldq $8, INC
2613 paddq INC, CTR
2614 psrldq $8, INC
2615 .Linc_low:
2616 movaps CTR, IV
2617 PSHUFB_XMM BSWAP_MASK IV
2619 ENDPROC(_aesni_inc)
2622 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2623 * size_t len, u8 *iv)
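# CTR mode builds a keystream, out = in ^ E(K, counter++), so this one
# routine serves for both encryption and decryption.  The loop below steps
# the counter with _aesni_inc, encrypts four counter blocks at a time via
# _aesni_enc4 when at least 64 bytes remain, XORs them into the data, and
# finally writes the updated big-endian counter back through IVP.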
2625 ENTRY(aesni_ctr_enc)
2626 FRAME_BEGIN
2627 cmp $16, LEN
2628 jb .Lctr_enc_just_ret
2629 mov 480(KEYP), KLEN
2630 movups (IVP), IV
2631 call _aesni_inc_init
2632 cmp $64, LEN
2633 jb .Lctr_enc_loop1
2634 .align 4
2635 .Lctr_enc_loop4:
2636 movaps IV, STATE1
2637 call _aesni_inc
2638 movups (INP), IN1
2639 movaps IV, STATE2
2640 call _aesni_inc
2641 movups 0x10(INP), IN2
2642 movaps IV, STATE3
2643 call _aesni_inc
2644 movups 0x20(INP), IN3
2645 movaps IV, STATE4
2646 call _aesni_inc
2647 movups 0x30(INP), IN4
2648 call _aesni_enc4
2649 pxor IN1, STATE1
2650 movups STATE1, (OUTP)
2651 pxor IN2, STATE2
2652 movups STATE2, 0x10(OUTP)
2653 pxor IN3, STATE3
2654 movups STATE3, 0x20(OUTP)
2655 pxor IN4, STATE4
2656 movups STATE4, 0x30(OUTP)
2657 sub $64, LEN
2658 add $64, INP
2659 add $64, OUTP
2660 cmp $64, LEN
2661 jge .Lctr_enc_loop4
2662 cmp $16, LEN
2663 jb .Lctr_enc_ret
2664 .align 4
2665 .Lctr_enc_loop1:
2666 movaps IV, STATE
2667 call _aesni_inc
2668 movups (INP), IN
2669 call _aesni_enc1
2670 pxor IN, STATE
2671 movups STATE, (OUTP)
2672 sub $16, LEN
2673 add $16, INP
2674 add $16, OUTP
2675 cmp $16, LEN
2676 jge .Lctr_enc_loop1
2677 .Lctr_enc_ret:
2678 movups IV, (IVP)
2679 .Lctr_enc_just_ret:
2680 FRAME_END
2682 ENDPROC(aesni_ctr_enc)
2685 * _aesni_gf128mul_x_ble: internal ABI
2686 * Multiply in GF(2^128) for XTS IVs
2687 * input:
2688 * IV: current IV
2689 * GF128MUL_MASK == mask with 0x87 and 0x01
2690 * output:
2691 * IV: next IV
2692 * changed:
2693 * CTR: == temporary value
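# The macro multiplies the XTS tweak by x in GF(2^128) using the
# little-endian block ("ble") convention: paddq IV, IV shifts each 64-bit
# half left by one, pshufd/psrad turn the two discarded top bits into
# full-dword masks, and the pand with GF128MUL_MASK (0x87 / 0x01) selects
# the reduction constant for a carry out of bit 127 and the carry from the
# low into the high qword, both of which are XORed back into IV.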
2695 #define _aesni_gf128mul_x_ble() \
2696 pshufd $0x13, IV, CTR; \
2697 paddq IV, IV; \
2698 psrad $31, CTR; \
2699 pand GF128MUL_MASK, CTR; \
2700 pxor CTR, IV;
2703 * void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2704 * bool enc, u8 *iv)
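# XTS whitens each block with a per-block tweak T[j] = T[0] * x^j:
#
#       out[j] = E_or_D(K, in[j] ^ T[j]) ^ T[j]
#
# The cmov pair below selects _aesni_enc4 or _aesni_dec4 (and the matching
# key schedule offset, 0 or 240) from the `enc' flag without branching.
# The eight tweaks are parked in the output buffer while the two 4-block
# calls run and XORed back in afterwards, and the ninth tweak is written to
# IVP for the caller.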
2706 ENTRY(aesni_xts_crypt8)
2707 FRAME_BEGIN
2708 cmpb $0, %cl
2709 movl $0, %ecx
2710 movl $240, %r10d
2711 leaq _aesni_enc4(%rip), %r11
2712 leaq _aesni_dec4(%rip), %rax
2713 cmovel %r10d, %ecx
2714 cmoveq %rax, %r11
2716 movdqa .Lgf128mul_x_ble_mask(%rip), GF128MUL_MASK
2717 movups (IVP), IV
2719 mov 480(KEYP), KLEN
2720 addq %rcx, KEYP
2722 movdqa IV, STATE1
2723 movdqu 0x00(INP), INC
2724 pxor INC, STATE1
2725 movdqu IV, 0x00(OUTP)
2727 _aesni_gf128mul_x_ble()
2728 movdqa IV, STATE2
2729 movdqu 0x10(INP), INC
2730 pxor INC, STATE2
2731 movdqu IV, 0x10(OUTP)
2733 _aesni_gf128mul_x_ble()
2734 movdqa IV, STATE3
2735 movdqu 0x20(INP), INC
2736 pxor INC, STATE3
2737 movdqu IV, 0x20(OUTP)
2739 _aesni_gf128mul_x_ble()
2740 movdqa IV, STATE4
2741 movdqu 0x30(INP), INC
2742 pxor INC, STATE4
2743 movdqu IV, 0x30(OUTP)
2745 call *%r11
2747 movdqu 0x00(OUTP), INC
2748 pxor INC, STATE1
2749 movdqu STATE1, 0x00(OUTP)
2751 _aesni_gf128mul_x_ble()
2752 movdqa IV, STATE1
2753 movdqu 0x40(INP), INC
2754 pxor INC, STATE1
2755 movdqu IV, 0x40(OUTP)
2757 movdqu 0x10(OUTP), INC
2758 pxor INC, STATE2
2759 movdqu STATE2, 0x10(OUTP)
2761 _aesni_gf128mul_x_ble()
2762 movdqa IV, STATE2
2763 movdqu 0x50(INP), INC
2764 pxor INC, STATE2
2765 movdqu IV, 0x50(OUTP)
2767 movdqu 0x20(OUTP), INC
2768 pxor INC, STATE3
2769 movdqu STATE3, 0x20(OUTP)
2771 _aesni_gf128mul_x_ble()
2772 movdqa IV, STATE3
2773 movdqu 0x60(INP), INC
2774 pxor INC, STATE3
2775 movdqu IV, 0x60(OUTP)
2777 movdqu 0x30(OUTP), INC
2778 pxor INC, STATE4
2779 movdqu STATE4, 0x30(OUTP)
2781 _aesni_gf128mul_x_ble()
2782 movdqa IV, STATE4
2783 movdqu 0x70(INP), INC
2784 pxor INC, STATE4
2785 movdqu IV, 0x70(OUTP)
2787 _aesni_gf128mul_x_ble()
2788 movups IV, (IVP)
2790 call *%r11
2792 movdqu 0x40(OUTP), INC
2793 pxor INC, STATE1
2794 movdqu STATE1, 0x40(OUTP)
2796 movdqu 0x50(OUTP), INC
2797 pxor INC, STATE2
2798 movdqu STATE2, 0x50(OUTP)
2800 movdqu 0x60(OUTP), INC
2801 pxor INC, STATE3
2802 movdqu STATE3, 0x60(OUTP)
2804 movdqu 0x70(OUTP), INC
2805 pxor INC, STATE4
2806 movdqu STATE4, 0x70(OUTP)
2808 FRAME_END
2810 ENDPROC(aesni_xts_crypt8)
2812 #endif