2 * Blowfish Cipher Algorithm (x86_64)
4 * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
23 #include <linux/linkage.h>
25 .file "blowfish-x86_64-asm.S"
28 /* structure of crypto context */
/*
 * Byte displacements of the four lookup tables, used relative to CTX
 * with a scaled index, e.g. s0(CTX,RT0,4).
 *   s0 = (16 + 2) * 4 =   72
 *   s1 = s0 + 256 * 4 = 1096
 *   s2 = s1 + 256 * 4 = 2120
 *   s3 = s2 + 256 * 4 = 3144
 */
30 #define s0 ((16 + 2) * 4)
31 #define s1 ((16 + 2 + (1 * 256)) * 4)
32 #define s2 ((16 + 2 + (2 * 256)) * 4)
33 #define s3 ((16 + 2 + (3 * 256)) * 4)
71 /***********************************************************************
73 ***********************************************************************/
79 movl s0(CTX,RT0,4), RT0d; /* RT0d = 32-bit load from s0 + CTX + 4*RT0 */ \
80 addl s1(CTX,RT1,4), RT0d; /* RT0d += 32-bit value at s1 + CTX + 4*RT1 */ \
84 xorl s2(CTX,RT1,4), RT0d; /* RT0d ^= 32-bit value at s2 + CTX + 4*RT1 */ \
85 addl s3(CTX,RT2,4), RT0d; /* RT0d += 32-bit value at s3 + CTX + 4*RT2 */ \
/*
 * add_roundkey_enc(n): XOR the quadword at byte offset p + 4*(n) from
 * CTX into RX0.
 */
88 #define add_roundkey_enc(n) \
89 xorq p+4*(n)(CTX), RX0;
/*
 * round_enc(n): the first step is add_roundkey_enc(n), i.e. the quadword
 * at p + 4*(n) from CTX is XORed into RX0.
 */
91 #define round_enc(n) \
92 add_roundkey_enc(n); \
/*
 * add_roundkey_dec(n): the first step loads the quadword at byte offset
 * p + 4*((n) - 1) from CTX into RT0.
 *
 * n is parenthesized so that an expression argument is evaluated as a
 * unit, the same way preload_roundkey_dec() spells this offset.
 */
97 #define add_roundkey_dec(n) \
98 movq p+4*((n)-1)(CTX), RT0; \
/*
 * round_dec(n): the first step is add_roundkey_dec(n), which begins by
 * loading the quadword at p + 4*(n-1) from CTX.
 */
102 #define round_dec(n) \
103 add_roundkey_dec(n); \
108 #define read_block() \
113 #define write_block() \
117 #define xor_block() \
/*
 * __blowfish_enc_blk
 *
 * As listed in the argument notes below, %rcx is a boolean; when it is
 * true the output is XORed.
 */
121 ENTRY(__blowfish_enc_blk)
126 * %rcx: bool, if true: xor output
144 add_roundkey_enc(16); /* XOR the quadword at p + 4*16 from CTX into RX0 */
157 ENDPROC(__blowfish_enc_blk)
159 ENTRY(blowfish_dec_blk)
189 ENDPROC(blowfish_dec_blk)
191 /**********************************************************************
192 4-way Blowfish, four blocks in parallel
193 **********************************************************************/
195 /* F() for 4-way. Slower when used alone/1-way, but faster when used
196 * in parallel/4-way (tested on AMD Phenom II & Intel Xeon E7330).
199 movzbl x ## bh, RT1d; /* RT1d = zero-extended byte operand x##bh */ \
200 movzbl x ## bl, RT3d; /* RT3d = zero-extended byte operand x##bl */ \
202 movzbl x ## bh, RT0d; /* RT0d = zero-extended byte operand x##bh */ \
203 movzbl x ## bl, RT2d; /* RT2d = zero-extended byte operand x##bl */ \
205 movl s0(CTX,RT0,4), RT0d; /* RT0d = 32-bit load from s0 + CTX + 4*RT0 */ \
206 addl s1(CTX,RT2,4), RT0d; /* RT0d += 32-bit value at s1 + CTX + 4*RT2 */ \
207 xorl s2(CTX,RT1,4), RT0d; /* RT0d ^= 32-bit value at s2 + CTX + 4*RT1 */ \
208 addl s3(CTX,RT3,4), RT0d; /* RT0d += 32-bit value at s3 + CTX + 4*RT3 */ \
211 #define add_preloaded_roundkey4() \
/*
 * preload_roundkey_enc(n): load the quadword at byte offset p + 4*(n)
 * from CTX into RKEY.
 */
217 #define preload_roundkey_enc(n) \
218 movq p+4*(n)(CTX), RKEY;
/*
 * add_roundkey_enc4(n): run add_preloaded_roundkey4(), then reload RKEY
 * through preload_roundkey_enc(n + 2), i.e. from byte offset
 * p + 4*(n + 2) relative to CTX.
 */
220 #define add_roundkey_enc4(n) \
221 add_preloaded_roundkey4(); \
222 preload_roundkey_enc(n + 2);
/*
 * round_enc4(n): the first step is add_roundkey_enc4(n), which runs
 * add_preloaded_roundkey4() and then preloads RKEY for index n + 2.
 */
224 #define round_enc4(n) \
225 add_roundkey_enc4(n); \
/*
 * preload_roundkey_dec(n): the first step loads the quadword at byte
 * offset p + 4*((n) - 1) from CTX into RKEY.
 */
237 #define preload_roundkey_dec(n) \
238 movq p+4*((n)-1)(CTX), RKEY; \
/*
 * add_roundkey_dec4(n): run add_preloaded_roundkey4(), then reload RKEY
 * through preload_roundkey_dec(n - 2), whose load address is
 * p + 4*((n - 2) - 1) relative to CTX.
 */
241 #define add_roundkey_dec4(n) \
242 add_preloaded_roundkey4(); \
243 preload_roundkey_dec(n - 2);
/*
 * round_dec4(n): the first step is add_roundkey_dec4(n), which runs
 * add_preloaded_roundkey4() and then preloads RKEY for index n - 2.
 */
245 #define round_dec4(n) \
246 add_roundkey_dec4(n); \
258 #define read_block4() \
275 #define write_block4() \
288 #define xor_block4() \
/*
 * __blowfish_enc_blk_4way
 *
 * As listed in the argument notes below, %rcx is a boolean; when it is
 * true the output is XORed.
 */
301 ENTRY(__blowfish_enc_blk_4way)
306 * %rcx: bool, if true: xor output
316 preload_roundkey_enc(0); /* RKEY = quadword at p + 4*0 from CTX */
328 add_preloaded_roundkey4();
348 ENDPROC(__blowfish_enc_blk_4way)
/*
 * blowfish_dec_blk_4way
 *
 * Key loading starts at index 17 (load address p + 4*(17 - 1) relative
 * to CTX); add_preloaded_roundkey4() is invoked again before the end of
 * the routine.
 */
350 ENTRY(blowfish_dec_blk_4way)
363 preload_roundkey_dec(17); /* load address: p + 4*(17 - 1) from CTX */
374 add_preloaded_roundkey4();
383 ENDPROC(blowfish_dec_blk_4way)