/*
 * Cast6 Cipher 8-way parallel algorithm (AVX/x86_64)
 *
 * Copyright (C) 2012 Johannes Goetzfried
 *     <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
 *
 * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
 * USA
 */
#include <linux/linkage.h>
#include <asm/frame.h>
#include "glue_helper-asm-avx.S"

.file "cast6-avx-x86_64-asm_64.S"
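/*
 * Note: the load_8way/store_8way, store_cbc_8way, load_ctr_8way,
 * store_ctr_8way, load_xts_8way and store_xts_8way macros used by the
 * entry points below are provided by glue_helper-asm-avx.S.
 */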
/* structure of crypto context */
#define km	0
#define kr	(12*4*4)

/* s-boxes */
#define s1	cast_s1
#define s2	cast_s2
#define s3	cast_s3
#define s4	cast_s4

/**********************************************************************
  8-way AVX cast6
 **********************************************************************/
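/*
 * lookup_32bit() performs the four CAST S-box lookups for one 32-bit
 * intermediate value held in a general purpose register: two index
 * bytes at a time are extracted through bh/bl, and op1/op2/op3 select
 * how the s2/s3/s4 results are folded into the s1 result (xor/sub/add
 * in the order required by the round function type).  interleave_op()
 * is a hook that lets the caller shift the next index bytes into
 * position between lookups.
 */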
#define lookup_32bit(src, dst, op1, op2, op3, interleave_op, il_reg) \
	movzbl		src ## bh,     RID1d;    \
	movzbl		src ## bl,     RID2d;    \
	shrq $16,	src;                     \
	movl		s1(, RID1, 4), dst ## d; \
	op1		s2(, RID2, 4), dst ## d; \
	movzbl		src ## bh,     RID1d;    \
	movzbl		src ## bl,     RID2d;    \
	interleave_op(il_reg);			 \
	op2		s3(, RID1, 4), dst ## d; \
	op3		s4(, RID2, 4), dst ## d;
#define dummy(d) /* do nothing */

#define shr_next(reg) \
	shrq $16,	reg;
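/*
 * The round function is split in two: F_head() does the vector part
 * (combine the data with the masking key via op0, then rotate left by
 * the per-round rotation amount using the vpslld/vpsrld/vpor pair) and
 * moves the four 32-bit results into two general purpose registers for
 * the S-box phase; F_tail() runs the scalar S-box lookups and packs
 * the results back into a vector register.
 */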
#define F_head(a, x, gi1, gi2, op0) \
	op0	a,	RKM,  x;    \
	vpslld	RKRF,	x,    RTMP; \
	vpsrld	RKRR,	x,    x;    \
	vpor	RTMP,	x,    x;    \
	\
	vmovq		x,    gi1;  \
	vpextrq $1,	x,    gi2;
#define F_tail(a, x, gi1, gi2, op1, op2, op3) \
	lookup_32bit(##gi1, RFS1, op1, op2, op3, shr_next, ##gi1); \
	lookup_32bit(##gi2, RFS3, op1, op2, op3, shr_next, ##gi2); \
	\
	lookup_32bit(##gi1, RFS2, op1, op2, op3, dummy, none);     \
	shlq $32,	RFS2;                                      \
	orq		RFS1, RFS2;                                \
	lookup_32bit(##gi2, RFS1, op1, op2, op3, dummy, none);     \
	shlq $32,	RFS1;                                      \
	orq		RFS1, RFS3;                                \
	\
	vmovq		RFS2, x;                                   \
	vpinsrq $1,	RFS3, x, x;
#define F_2(a1, b1, a2, b2, op0, op1, op2, op3) \
	F_head(b1, RX, RGI1, RGI2, op0);              \
	F_head(b2, RX, RGI3, RGI4, op0);              \
	\
	F_tail(b1, RX, RGI1, RGI2, op1, op2, op3);    \
	F_tail(b2, RTMP, RGI3, RGI4, op1, op2, op3);  \
	\
	vpxor		a1, RX,   a1;                 \
	vpxor		a2, RTMP, a2;
#define F1_2(a1, b1, a2, b2) \
	F_2(a1, b1, a2, b2, vpaddd, xorl, subl, addl)
#define F2_2(a1, b1, a2, b2) \
	F_2(a1, b1, a2, b2, vpxor, subl, addl, xorl)
#define F3_2(a1, b1, a2, b2) \
	F_2(a1, b1, a2, b2, vpsubd, addl, xorl, subl)
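/*
 * F1_2/F2_2/F3_2 correspond to the three CAST-256 round function types
 * (RFC 2612), e.g. f1: I = ((Km + D) <<< Kr),
 * f = ((S1[Ia] ^ S2[Ib]) - S3[Ic]) + S4[Id].  op0 is the vector op that
 * applies the masking key, op1..op3 are the matching scalar ops used on
 * the S-box outputs; each *_2 variant processes two groups of four
 * blocks at once.
 */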
#define qop(in, out, f) \
	F ## f ## _2(out ## 1, in ## 1, out ## 2, in ## 2);
#define get_round_keys(nn) \
	vbroadcastss	(km+(4*(nn)))(CTX), RKM;        \
	vpand		R1ST,               RKR,  RKRF; \
	vpsubq		RKRF,               R32,  RKRR; \
	vpsrldq $1,	RKR,                RKR;
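/*
 * get_round_keys() broadcasts the 32-bit masking key km[nn] into RKM
 * and derives this round's rotation amounts from RKR: RKRF is the
 * current rotation byte masked by R1ST, RKRR is 32 - RKRF (for the
 * right-shift half of the rotate), and RKR is shifted down one byte so
 * the next call sees the next rotation key.
 */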
#define Q(n) \
	get_round_keys(4*n+0); \
	qop(RD, RC, 1); \
	get_round_keys(4*n+1); \
	qop(RC, RB, 2); \
	get_round_keys(4*n+2); \
	qop(RB, RA, 3); \
	get_round_keys(4*n+3); \
	qop(RA, RD, 1);

#define QBAR(n) \
	get_round_keys(4*n+3); \
	qop(RA, RD, 1); \
	get_round_keys(4*n+2); \
	qop(RB, RA, 3); \
	get_round_keys(4*n+1); \
	qop(RC, RB, 2); \
	get_round_keys(4*n+0); \
	qop(RD, RC, 1);
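/*
 * Q(n) is the forward quad-round of CAST-256 (C ^= f1(D); B ^= f2(C);
 * A ^= f3(B); D ^= f1(A)) and QBAR(n) is the reverse quad-round, which
 * applies the same four steps in the opposite order.  Encryption uses
 * Q for quad-rounds 0-5 and QBAR for 6-11; decryption runs the whole
 * sequence backwards.
 */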
#define shuffle(mask) \
	vpshufb		mask,            RKR, RKR;
#define preload_rkr(n, do_mask, mask) \
	vbroadcastss	.L16_mask,                RKR;      \
	/* add 16-bit rotation to key rotations (mod 32) */ \
	vpxor		(kr+n*16)(CTX),           RKR, RKR; \
	do_mask(mask);
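/*
 * transpose_4x4() is a 4x4 transpose of 32-bit words across four xmm
 * registers: afterwards each register holds the same word (A, B, C or
 * D) of four different blocks, so one vector instruction operates on
 * four blocks at a time.  inpack_blocks() byte-swaps the input words
 * and transposes; outunpack_blocks() undoes both.
 */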
#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	vpunpckldq		x1, x0, t0; \
	vpunpckhdq		x1, x0, t2; \
	vpunpckldq		x3, x2, t1; \
	vpunpckhdq		x3, x2, x3; \
	\
	vpunpcklqdq		t1, t0, x0; \
	vpunpckhqdq		t1, t0, x1; \
	vpunpcklqdq		x3, t2, x2; \
	vpunpckhqdq		x3, t2, x3;

#define inpack_blocks(x0, x1, x2, x3, t0, t1, t2, rmask) \
	vpshufb rmask,	x0, x0; \
	vpshufb rmask,	x1, x1; \
	vpshufb rmask,	x2, x2; \
	vpshufb rmask,	x3, x3; \
	\
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

#define outunpack_blocks(x0, x1, x2, x3, t0, t1, t2, rmask) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	\
	vpshufb rmask,	x0, x0; \
	vpshufb rmask,	x1, x1; \
	vpshufb rmask,	x2, x2; \
	vpshufb rmask,	x3, x3;
.section	.rodata.cst16, "aM", @progbits, 16
.align 16
.Lxts_gf128mul_and_shl1_mask:
	.byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
.Lbswap_mask:
	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
.Lbswap128_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
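/*
 * The .Lrkr_* tables below are vpshufb masks applied to the 16
 * rotation-key bytes loaded by preload_rkr(): they reorder the four
 * 4-byte groups (and reverse the bytes within a group for QBAR
 * quad-rounds) so that get_round_keys() can consume RKR one byte at a
 * time in execution order for both encryption and decryption.
 */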
.Lrkr_enc_Q_Q_QBAR_QBAR:
	.byte 0, 1, 2, 3, 4, 5, 6, 7, 11, 10, 9, 8, 15, 14, 13, 12
.Lrkr_enc_QBAR_QBAR_QBAR_QBAR:
	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
.Lrkr_dec_Q_Q_Q_Q:
	.byte 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3
.Lrkr_dec_Q_Q_QBAR_QBAR:
	.byte 12, 13, 14, 15, 8, 9, 10, 11, 7, 6, 5, 4, 3, 2, 1, 0
.Lrkr_dec_QBAR_QBAR_QBAR_QBAR:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
.section	.rodata.cst4.L16_mask, "aM", @progbits, 4
.align 4
.L16_mask:
	.byte 16, 16, 16, 16

.section	.rodata.cst4.L32_mask, "aM", @progbits, 4
.align 4
.L32_mask:
	.byte 32, 0, 0, 0

.section	.rodata.cst4.first_mask, "aM", @progbits, 4
.align 4
.Lfirst_mask:
	.byte 0x1f, 0, 0, 0

.text
.align 8
__cast6_enc_blk8:
	/* input:
	 *	%rdi: ctx
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
	 * output:
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
	 */
	vmovdqa .Lbswap_mask, RKM;
	vmovd .Lfirst_mask, R1ST;
	vmovd .L32_mask, R32;

	inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
	inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);

	preload_rkr(0, dummy, none);
	Q(0);
	Q(1);
	Q(2);
	Q(3);
	preload_rkr(1, shuffle, .Lrkr_enc_Q_Q_QBAR_QBAR);
	Q(4);
	Q(5);
	QBAR(6);
	QBAR(7);
	preload_rkr(2, shuffle, .Lrkr_enc_QBAR_QBAR_QBAR_QBAR);
	QBAR(8);
	QBAR(9);
	QBAR(10);
	QBAR(11);
	vmovdqa .Lbswap_mask, RKM;

	outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
	outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
ENDPROC(__cast6_enc_blk8)
.align 8
__cast6_dec_blk8:
	/* input:
	 *	%rdi: ctx
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
	 * output:
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: decrypted blocks
	 */
	vmovdqa .Lbswap_mask, RKM;
	vmovd .Lfirst_mask, R1ST;
	vmovd .L32_mask, R32;

	inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
	inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);

	preload_rkr(2, shuffle, .Lrkr_dec_Q_Q_Q_Q);
	Q(11);
	Q(10);
	Q(9);
	Q(8);
	preload_rkr(1, shuffle, .Lrkr_dec_Q_Q_QBAR_QBAR);
	Q(7);
	Q(6);
	QBAR(5);
	QBAR(4);
	preload_rkr(0, shuffle, .Lrkr_dec_QBAR_QBAR_QBAR_QBAR);
	QBAR(3);
	QBAR(2);
	QBAR(1);
	QBAR(0);
	vmovdqa .Lbswap_mask, RKM;

	outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
	outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
ENDPROC(__cast6_dec_blk8)
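/*
 * The exported entry points below are thin wrappers: they load eight
 * 128-bit blocks into RA1..RD2, call __cast6_enc_blk8 or
 * __cast6_dec_blk8, and store the result; the mode-specific chaining
 * (CBC xor, CTR counter generation, XTS tweaking) is handled by the
 * glue_helper load/store macros.
 */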
ENTRY(cast6_ecb_enc_8way)
	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __cast6_enc_blk8;

	store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
ENDPROC(cast6_ecb_enc_8way)
ENTRY(cast6_ecb_dec_8way)
	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __cast6_dec_blk8;

	store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
ENDPROC(cast6_ecb_dec_8way)
ENTRY(cast6_cbc_dec_8way)
	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __cast6_dec_blk8;

	store_cbc_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
ENDPROC(cast6_cbc_dec_8way)
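/*
 * CBC decryption parallelizes because chaining only feeds ciphertext
 * forward: all eight blocks are decrypted at once, and store_cbc_8way()
 * (from glue_helper-asm-avx.S) xors the results with the preceding
 * ciphertext blocks while writing them out.
 */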
ENTRY(cast6_ctr_8way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: iv (little endian, 128bit)
	 */
	load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
		      RD2, RX, RKR, RKM);
	call __cast6_enc_blk8;

	store_ctr_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
ENDPROC(cast6_ctr_8way)
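/*
 * For CTR, load_ctr_8way() expands the 128-bit counter at %rcx (kept
 * little-endian, hence the .Lbswap128_mask byte-swap) into eight
 * consecutive counter blocks, which are encrypted and then xored into
 * the data by store_ctr_8way().  The XTS entry points below derive the
 * eight per-block tweaks from the input tweak by repeated
 * multiplication by α in GF(2^128) (.Lxts_gf128mul_and_shl1_mask) and
 * xor them in before the block cipher and again on store, as the
 * "regs <= src, dst <= IVs" comments describe.
 */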
ENTRY(cast6_xts_enc_8way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
	 */
	/* regs <= src, dst <= IVs, regs <= regs xor IVs */
	load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
		      RX, RKR, RKM, .Lxts_gf128mul_and_shl1_mask);
	call __cast6_enc_blk8;

	/* dst <= regs xor IVs(in dst) */
	store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
ENDPROC(cast6_xts_enc_8way)
ENTRY(cast6_xts_dec_8way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
	 */
	/* regs <= src, dst <= IVs, regs <= regs xor IVs */
	load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
		      RX, RKR, RKM, .Lxts_gf128mul_and_shl1_mask);
	call __cast6_dec_blk8;

	/* dst <= regs xor IVs(in dst) */
	store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
ENDPROC(cast6_xts_dec_8way)