1 #if ENABLE_SHA256_HWACCEL && defined(__GNUC__) && defined(__i386__)
2 /* The code is adapted from Linux kernel's source */
4 // We use shorter insns, even though they are for "wrong"
5 // data type (fp, not int).
6 // For Intel, there is no penalty for doing it at all
7 // (CPUs which do have such penalty do not support SHA insns).
8 // For AMD, the penalty is one extra cycle
9 // (allegedly: I failed to find measurable difference).
/* Aligned 128-bit move. The active alias is the "ps" form; the commented-out
 * line is the integer-form mnemonic it stands in for. */
11 //#define mova128 movdqa
12 #define mova128 movaps
/* Unaligned 128-bit move. */
13 //#define movu128 movdqu
14 #define movu128 movups
/* 4 x 32-bit shuffle. shufps takes result dwords 0,1 from its destination and
 * dwords 2,3 from its source, so it matches pshufd only when both register
 * operands are the same - which is how every shuf128_32 use in this file is
 * written (shuf128_32 $imm, MSG, MSG). */
15 //#define shuf128_32 pshufd
16 #define shuf128_32 shufps
18 // pshufb and palignr are SSSE3 insns.
19 // We do not check SSSE3 in cpuid,
20 // all SHA-capable CPUs support it as well.
/* Empty .note.GNU-stack section: by GNU toolchain convention this marks the
 * object as not requiring an executable stack. */
23 .section .note.GNU-stack, "", @progbits
/* The function gets its own allocatable ("a"), executable ("x") section. */
25 .section .text.sha256_process_block64_shaNI, "ax", @progbits
/* Global so other objects can link to it, with hidden ELF visibility. */
26 .globl sha256_process_block64_shaNI
27 .hidden sha256_process_block64_shaNI
28 .type sha256_process_block64_shaNI, @function
/* Register holding a (biased) pointer into the K256 round-constant table;
 * it is loaded with $K256+8*16 at function entry. */
32 #define SHA256CONSTANTS %ecx
/* Builds a shuffle immediate from four 2-bit dword selectors:
 * a selects result dword 0, b dword 1, c dword 2, d dword 3. */
44 #define SHUF(a,b,c,d) $(a+(b<<2)+(c<<4)+(d<<6))
/*
 * sha256_process_block64_shaNI
 *
 * Runs the SHA-256 round function over one message block using the SHA-NI
 * instructions (sha256rnds2 / sha256msg1 / sha256msg2).
 *
 * In:   %eax = base pointer. The eight 32-bit hash words live at
 *       76(%eax) .. 76+31(%eax). The message block is read through DATA_PTR
 *       as four 16-byte loads (64 bytes).
 *       NOTE(review): an argument arriving in %eax implies a register-based
 *       calling convention (e.g. regparm) - confirm against the C prototype.
 * Out:  hash words stored back to 76(%eax) .. 76+31(%eax).
 * Uses: %ecx (SHA256CONSTANTS) and the XMM registers behind the aliases MSG,
 *       STATE0, STATE1, MSGTMP0..3 and XMMTMP (presumably preprocessor
 *       register aliases like SHA256CONSTANTS - see their #defines).
 *       NOTE(review): per the Intel SDM sha256rnds2 takes its round input
 *       implicitly from XMM0, so MSG presumably aliases %xmm0 - confirm.
 *
 * Structure: 16 groups, one per 16-byte row of K256. Each group adds a row
 * of round constants to MSG and issues two sha256rnds2 (two rounds each per
 * the Intel SDM), 64 rounds in total. Between the two sha256rnds2 of a
 * group, "shuf128_32 $0x0E, MSG, MSG" moves the upper two dwords of MSG into
 * the lower two positions, where the second sha256rnds2 reads them.
 * STATE0 and STATE1 swap source/destination roles on every sha256rnds2.
 *
 * Message schedule (groups that carry it): mova128 + palignr $4 place a
 * 4-byte-shifted concatenation of two schedule registers in XMMTMP, paddd
 * adds it into the target register, sha256msg2 finishes that target, and
 * sha256msg1 starts the update of the next register in the rotation
 * MSGTMP0 -> MSGTMP1 -> MSGTMP2 -> MSGTMP3 -> MSGTMP0.
 */
46 .balign 8 # allow decoders to fetch at least 2 first insns
47 sha256_process_block64_shaNI:
/* Load the saved hash words and regroup them into the FEBA / HGDC pairing
 * shown on the shufps lines below. */
49 movu128 76+0*16(%eax), XMMTMP /* ABCD (little-endian dword order) */
50 movu128 76+1*16(%eax), STATE1 /* EFGH */
51 /* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */
52 mova128 STATE1, STATE0
53 /* --- -------------- ABCD -- EFGH */
54 shufps SHUF(1,0,1,0), XMMTMP, STATE0 /* FEBA */
55 shufps SHUF(3,2,3,2), XMMTMP, STATE1 /* HGDC */
57 /* XMMTMP holds flip mask from here... */
58 mova128 PSHUFFLE_BSWAP32_FLIP_MASK, XMMTMP
/* The constant pointer is biased by 8*16 so that the row displacements used
 * below (n*16-8*16, n = 0..15) span -128..+112, i.e. each fits a signed byte. */
59 movl $K256+8*16, SHA256CONSTANTS
/* Rounds 0-3: message bytes 0..15, K256 row 0 */
62 movu128 0*16(DATA_PTR), MSG
65 paddd 0*16-8*16(SHA256CONSTANTS), MSG
66 sha256rnds2 MSG, STATE0, STATE1
67 shuf128_32 $0x0E, MSG, MSG
68 sha256rnds2 MSG, STATE1, STATE0
/* Rounds 4-7: message bytes 16..31, K256 row 1 */
71 movu128 1*16(DATA_PTR), MSG
74 paddd 1*16-8*16(SHA256CONSTANTS), MSG
75 sha256rnds2 MSG, STATE0, STATE1
76 shuf128_32 $0x0E, MSG, MSG
77 sha256rnds2 MSG, STATE1, STATE0
78 sha256msg1 MSGTMP1, MSGTMP0
/* Rounds 8-11: message bytes 32..47, K256 row 2 */
81 movu128 2*16(DATA_PTR), MSG
84 paddd 2*16-8*16(SHA256CONSTANTS), MSG
85 sha256rnds2 MSG, STATE0, STATE1
86 shuf128_32 $0x0E, MSG, MSG
87 sha256rnds2 MSG, STATE1, STATE0
88 sha256msg1 MSGTMP2, MSGTMP1
/* Rounds 12-15: message bytes 48..63 (last load through DATA_PTR), K256 row 3 */
91 movu128 3*16(DATA_PTR), MSG
95 paddd 3*16-8*16(SHA256CONSTANTS), MSG
96 sha256rnds2 MSG, STATE0, STATE1
/* XMMTMP is overwritten here and serves as schedule scratch from now on. */
97 mova128 MSGTMP3, XMMTMP
98 palignr $4, MSGTMP2, XMMTMP
100 sha256msg2 MSGTMP3, MSGTMP0
101 shuf128_32 $0x0E, MSG, MSG
102 sha256rnds2 MSG, STATE1, STATE0
103 sha256msg1 MSGTMP3, MSGTMP2
/* Rounds 16-19: K256 row 4; schedule target MSGTMP1 */
107 paddd 4*16-8*16(SHA256CONSTANTS), MSG
108 sha256rnds2 MSG, STATE0, STATE1
109 mova128 MSGTMP0, XMMTMP
110 palignr $4, MSGTMP3, XMMTMP
111 paddd XMMTMP, MSGTMP1
112 sha256msg2 MSGTMP0, MSGTMP1
113 shuf128_32 $0x0E, MSG, MSG
114 sha256rnds2 MSG, STATE1, STATE0
115 sha256msg1 MSGTMP0, MSGTMP3
/* Rounds 20-23: K256 row 5; schedule target MSGTMP2 */
119 paddd 5*16-8*16(SHA256CONSTANTS), MSG
120 sha256rnds2 MSG, STATE0, STATE1
121 mova128 MSGTMP1, XMMTMP
122 palignr $4, MSGTMP0, XMMTMP
123 paddd XMMTMP, MSGTMP2
124 sha256msg2 MSGTMP1, MSGTMP2
125 shuf128_32 $0x0E, MSG, MSG
126 sha256rnds2 MSG, STATE1, STATE0
127 sha256msg1 MSGTMP1, MSGTMP0
/* Rounds 24-27: K256 row 6; schedule target MSGTMP3 */
131 paddd 6*16-8*16(SHA256CONSTANTS), MSG
132 sha256rnds2 MSG, STATE0, STATE1
133 mova128 MSGTMP2, XMMTMP
134 palignr $4, MSGTMP1, XMMTMP
135 paddd XMMTMP, MSGTMP3
136 sha256msg2 MSGTMP2, MSGTMP3
137 shuf128_32 $0x0E, MSG, MSG
138 sha256rnds2 MSG, STATE1, STATE0
139 sha256msg1 MSGTMP2, MSGTMP1
/* Rounds 28-31: K256 row 7; schedule target MSGTMP0 */
143 paddd 7*16-8*16(SHA256CONSTANTS), MSG
144 sha256rnds2 MSG, STATE0, STATE1
145 mova128 MSGTMP3, XMMTMP
146 palignr $4, MSGTMP2, XMMTMP
147 paddd XMMTMP, MSGTMP0
148 sha256msg2 MSGTMP3, MSGTMP0
149 shuf128_32 $0x0E, MSG, MSG
150 sha256rnds2 MSG, STATE1, STATE0
151 sha256msg1 MSGTMP3, MSGTMP2
/* Rounds 32-35: K256 row 8; schedule target MSGTMP1 */
155 paddd 8*16-8*16(SHA256CONSTANTS), MSG
156 sha256rnds2 MSG, STATE0, STATE1
157 mova128 MSGTMP0, XMMTMP
158 palignr $4, MSGTMP3, XMMTMP
159 paddd XMMTMP, MSGTMP1
160 sha256msg2 MSGTMP0, MSGTMP1
161 shuf128_32 $0x0E, MSG, MSG
162 sha256rnds2 MSG, STATE1, STATE0
163 sha256msg1 MSGTMP0, MSGTMP3
/* Rounds 36-39: K256 row 9; schedule target MSGTMP2 */
167 paddd 9*16-8*16(SHA256CONSTANTS), MSG
168 sha256rnds2 MSG, STATE0, STATE1
169 mova128 MSGTMP1, XMMTMP
170 palignr $4, MSGTMP0, XMMTMP
171 paddd XMMTMP, MSGTMP2
172 sha256msg2 MSGTMP1, MSGTMP2
173 shuf128_32 $0x0E, MSG, MSG
174 sha256rnds2 MSG, STATE1, STATE0
175 sha256msg1 MSGTMP1, MSGTMP0
/* Rounds 40-43: K256 row 10; schedule target MSGTMP3 */
179 paddd 10*16-8*16(SHA256CONSTANTS), MSG
180 sha256rnds2 MSG, STATE0, STATE1
181 mova128 MSGTMP2, XMMTMP
182 palignr $4, MSGTMP1, XMMTMP
183 paddd XMMTMP, MSGTMP3
184 sha256msg2 MSGTMP2, MSGTMP3
185 shuf128_32 $0x0E, MSG, MSG
186 sha256rnds2 MSG, STATE1, STATE0
187 sha256msg1 MSGTMP2, MSGTMP1
/* Rounds 44-47: K256 row 11; schedule target MSGTMP0 */
191 paddd 11*16-8*16(SHA256CONSTANTS), MSG
192 sha256rnds2 MSG, STATE0, STATE1
193 mova128 MSGTMP3, XMMTMP
194 palignr $4, MSGTMP2, XMMTMP
195 paddd XMMTMP, MSGTMP0
196 sha256msg2 MSGTMP3, MSGTMP0
197 shuf128_32 $0x0E, MSG, MSG
198 sha256rnds2 MSG, STATE1, STATE0
199 sha256msg1 MSGTMP3, MSGTMP2
/* Rounds 48-51: K256 row 12; schedule target MSGTMP1 */
203 paddd 12*16-8*16(SHA256CONSTANTS), MSG
204 sha256rnds2 MSG, STATE0, STATE1
205 mova128 MSGTMP0, XMMTMP
206 palignr $4, MSGTMP3, XMMTMP
207 paddd XMMTMP, MSGTMP1
208 sha256msg2 MSGTMP0, MSGTMP1
209 shuf128_32 $0x0E, MSG, MSG
210 sha256rnds2 MSG, STATE1, STATE0
211 sha256msg1 MSGTMP0, MSGTMP3
/* Rounds 52-55: K256 row 13; schedule target MSGTMP2 (no further sha256msg1) */
215 paddd 13*16-8*16(SHA256CONSTANTS), MSG
216 sha256rnds2 MSG, STATE0, STATE1
217 mova128 MSGTMP1, XMMTMP
218 palignr $4, MSGTMP0, XMMTMP
219 paddd XMMTMP, MSGTMP2
220 sha256msg2 MSGTMP1, MSGTMP2
221 shuf128_32 $0x0E, MSG, MSG
222 sha256rnds2 MSG, STATE1, STATE0
/* Rounds 56-59: K256 row 14; schedule target MSGTMP3 (last schedule update) */
226 paddd 14*16-8*16(SHA256CONSTANTS), MSG
227 sha256rnds2 MSG, STATE0, STATE1
228 mova128 MSGTMP2, XMMTMP
229 palignr $4, MSGTMP1, XMMTMP
230 paddd XMMTMP, MSGTMP3
231 sha256msg2 MSGTMP2, MSGTMP3
232 shuf128_32 $0x0E, MSG, MSG
233 sha256rnds2 MSG, STATE1, STATE0
/* Rounds 60-63: K256 row 15; rounds only */
237 paddd 15*16-8*16(SHA256CONSTANTS), MSG
238 sha256rnds2 MSG, STATE0, STATE1
239 shuf128_32 $0x0E, MSG, MSG
240 sha256rnds2 MSG, STATE1, STATE0
/* Reverse of the entry shuffle: STATE0 gets ABCD back, XMMTMP gets EFGH. */
242 /* Write hash values back in the correct order */
243 mova128 STATE0, XMMTMP
244 /* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */
245 /* --- -------------- HGDC -- FEBA */
246 shufps SHUF(3,2,3,2), STATE1, STATE0 /* ABCD */
247 shufps SHUF(1,0,1,0), STATE1, XMMTMP /* EFGH */
248 /* add current hash values to previous ones */
249 movu128 76+1*16(%eax), STATE1
251 movu128 STATE1, 76+1*16(%eax)
252 movu128 76+0*16(%eax), XMMTMP
254 movu128 STATE0, 76+0*16(%eax)
/* Record the function's byte length in the ELF symbol table. */
257 .size sha256_process_block64_shaNI, .-sha256_process_block64_shaNI
/*
 * SHA-256 round-constant table: 16 rows of four 32-bit words (64 words,
 * 256 bytes), placed in a read-only mergeable section with entity size 256.
 * The function addresses it as K256 and uses each 16-byte row as a paddd
 * memory operand, which requires the table to be 16-byte aligned.
 */
259 .section .rodata.cst256.K256, "aM", @progbits, 256
262 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
263 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
264 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
265 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
266 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
267 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
268 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
269 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
270 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
271 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
272 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
273 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
274 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
275 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
276 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
277 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
/*
 * 16-byte index table in a read-only mergeable section (entity size 16).
 * In memory order the bytes are 03 02 01 00, 07 06 05 04, 0b 0a 09 08,
 * 0f 0e 0d 0c; used as a pshufb control this reverses the byte order inside
 * each 32-bit word. The function loads it with mova128 (an aligned move), so
 * it must sit on a 16-byte boundary.
 */
279 .section .rodata.cst16.PSHUFFLE_BSWAP32_FLIP_MASK, "aM", @progbits, 16
281 PSHUFFLE_BSWAP32_FLIP_MASK:
282 .octa 0x0c0d0e0f08090a0b0405060700010203