### Generated by hash_sha1_x86-64.S.sh ###

#if CONFIG_SHA1_SMALL == 0 && defined(__GNUC__) && defined(__x86_64__)
	.section	.note.GNU-stack, "", @progbits
	.section	.text.sha1_process_block64, "ax", @progbits
	.globl	sha1_process_block64
	.hidden	sha1_process_block64
	.type	sha1_process_block64, @function

	.balign	8	# allow decoders to fetch at least 5 first insns
sha1_process_block64:
	pushq	%rbp	# 1 byte insn
	pushq	%rbx	# 1 byte insn
#	pushq	%r15	# 2 byte insn
	pushq	%r14	# 2 byte insn
	pushq	%r13	# 2 byte insn
	pushq	%r12	# 2 byte insn
	pushq	%rdi	# we need ctx at the end

#Register and stack use:
# eax..edx: a..d
# ebp: e
# esi,edi,r8..r14: temps
# xmm0..xmm3: W[] vector temps
# xmm4,xmm5: temps
# xmm6: current round constant
# xmm7: all round constants
# -64(%rsp): area for passing RCONST + W[] from vector to integer units
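
# A rough C picture of the context this routine assumes (a sketch,
# not the authoritative busybox header; field names and padding here
# are hypothetical).  The only hard requirements visible below are
# that the 64 message bytes sit at byte offset 0 of *ctx and the
# five hash words at byte offset 80, since the code uses 0(%rdi)..
# and 80(%rdi)..96(%rdi):
#	typedef struct {
#		uint8_t  wbuffer[64];	/* message block, offset 0 */
#		/* ...16 bytes of other fields up to offset 80... */
#		uint32_t hash[5];	/* offset 80: a,b,c,d,e below */
#	} sha1_ctx_t;
#	void sha1_process_block64(sha1_ctx_t *ctx);	/* ctx arrives in %rdi */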
	movl	80(%rdi), %eax	# a = ctx->hash[0]
	movl	84(%rdi), %ebx	# b = ctx->hash[1]
	movl	88(%rdi), %ecx	# c = ctx->hash[2]
	movl	92(%rdi), %edx	# d = ctx->hash[3]
	movl	96(%rdi), %ebp	# e = ctx->hash[4]

	movaps	sha1const(%rip), %xmm7
	pshufd	$0x00, %xmm7, %xmm6
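
# xmm7 holds all four SHA-1 round constants as one 16-byte vector;
# pshufd with immediate 0x00/0x55/0xaa/0xff replicates dword 0/1/2/3
# of xmm7 into every lane of xmm6, so switching to the next 20-round
# group costs one instruction.  In C terms (illustrative sketch):
#	static const uint32_t rconst[4] = {
#		0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6
#	};
#	/* k = 0,1,2,3 for rounds 0-19, 20-39, 40-59, 60-79 */
#	xmm6.lane[0..3] = rconst[k];	/* broadcast one constant */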
# Load W[] to xmm0..3, byteswapping on the fly.
#
# For iterations 0..15, we pass W[] in rsi,r8..r14
# for use in RD1As instead of spilling them to stack.
# We lose parallelized addition of RCONST, but LEA
# can do two additions at once, so it is probably a wash.
# (We use rsi instead of rN because this makes two
# LEAs in two first RD1As shorter by one byte).
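
# In C terms each of those register-passed first rounds computes
# (sketch; rotl32 is the usual 32-bit rotate):
#	e += 0x5A827999 + W[n]		/* one leal below does both adds */
#	   + (((c ^ d) & b) ^ d)	/* round-1 boolean function */
#	   + rotl32(a, 5);
#	b = rotl32(b, 30);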
	movq	4*0(%rdi), %rsi	# W[0,1]
	movq	4*2(%rdi), %r8	# W[2,3]
	bswapq	%rsi
	bswapq	%r8
	rolq	$32, %rsi	# rsi = W[1]:W[0]
	rolq	$32, %r8	# r8 = W[3]:W[2]
	movq	%rsi, %xmm0
	movq	%r8, %xmm4
	punpcklqdq %xmm4, %xmm0	# xmm0 = r8:rsi = (W[0],W[1],W[2],W[3])
#	movaps	%xmm0, %xmm4	# add RCONST, spill to stack
#	paddd	%xmm6, %xmm4
#	movups	%xmm4, -64+16*0(%rsp)

	movq	4*4(%rdi), %r9	# W[4,5]
	movq	4*6(%rdi), %r10	# W[6,7]
	bswapq	%r9
	bswapq	%r10
	rolq	$32, %r9	# r9 = W[5]:W[4]
	rolq	$32, %r10	# r10 = W[7]:W[6]
	movq	%r9, %xmm1
	movq	%r10, %xmm4
	punpcklqdq %xmm4, %xmm1	# xmm1 = r10:r9 = (W[4],W[5],W[6],W[7])

	movq	4*8(%rdi), %r11	# W[8,9]
	movq	4*10(%rdi), %r12	# W[10,11]
	bswapq	%r11
	bswapq	%r12
	rolq	$32, %r11	# r11 = W[9]:W[8]
	rolq	$32, %r12	# r12 = W[11]:W[10]
	movq	%r11, %xmm2
	movq	%r12, %xmm4
	punpcklqdq %xmm4, %xmm2	# xmm2 = r12:r11 = (W[8],W[9],W[10],W[11])

	movq	4*12(%rdi), %r13	# W[12,13]
	movq	4*14(%rdi), %r14	# W[14,15]
	bswapq	%r13
	bswapq	%r14
	rolq	$32, %r13	# r13 = W[13]:W[12]
	rolq	$32, %r14	# r14 = W[15]:W[14]
	movq	%r13, %xmm3
	movq	%r14, %xmm4
	punpcklqdq %xmm4, %xmm3	# xmm3 = r14:r13 = (W[12],W[13],W[14],W[15])
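
# W[0..15] is now in xmm0..xmm3 (and rsi,r8..r14) in host byte order.
# SHA-1 treats the message as big-endian 32-bit words, hence the
# bswap while loading; a portable C loader would do, per word
# (sketch):
#	W[n] = ((uint32_t)p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3];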
	leal	0x5A827999(%rbp,%rsi), %ebp	# e += RCONST + W[n]
	xorl	%edx, %edi	# (((c ^ d) & b) ^ d)
	addl	%edi, %ebp	# e += (((c ^ d) & b) ^ d)
	roll	$5, %edi	# rotl32(a,5)
	addl	%edi, %ebp	# e += rotl32(a,5)
	rorl	$2, %ebx	# b = rotl32(b,30)
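
# The expression (((c ^ d) & b) ^ d) is the round-1 "choice" function
# Ch(b,c,d) = (b & c) | (~b & d), rewritten so it needs only two XORs
# and one AND (and no NOT).  Equivalent C (sketch):
#	static inline uint32_t ch(uint32_t b, uint32_t c, uint32_t d)
#	{
#		return ((c ^ d) & b) ^ d;	/* == (b & c) | (~b & d) */
#	}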
	leal	0x5A827999(%rdx,%rsi), %edx	# e += RCONST + W[n]
	xorl	%ecx, %edi	# (((c ^ d) & b) ^ d)
	addl	%edi, %edx	# e += (((c ^ d) & b) ^ d)
	roll	$5, %edi	# rotl32(a,5)
	addl	%edi, %edx	# e += rotl32(a,5)
	rorl	$2, %eax	# b = rotl32(b,30)

	leal	0x5A827999(%rcx,%r8), %ecx	# e += RCONST + W[n]
	xorl	%ebx, %edi	# (((c ^ d) & b) ^ d)
	addl	%edi, %ecx	# e += (((c ^ d) & b) ^ d)
	roll	$5, %edi	# rotl32(a,5)
	addl	%edi, %ecx	# e += rotl32(a,5)
	rorl	$2, %ebp	# b = rotl32(b,30)

	leal	0x5A827999(%rbx,%r8), %ebx	# e += RCONST + W[n]
	xorl	%eax, %edi	# (((c ^ d) & b) ^ d)
	addl	%edi, %ebx	# e += (((c ^ d) & b) ^ d)
	roll	$5, %edi	# rotl32(a,5)
	addl	%edi, %ebx	# e += rotl32(a,5)
	rorl	$2, %edx	# b = rotl32(b,30)

	leal	0x5A827999(%rax,%r9), %eax	# e += RCONST + W[n]
	xorl	%ebp, %edi	# (((c ^ d) & b) ^ d)
	addl	%edi, %eax	# e += (((c ^ d) & b) ^ d)
	roll	$5, %edi	# rotl32(a,5)
	addl	%edi, %eax	# e += rotl32(a,5)
	rorl	$2, %ecx	# b = rotl32(b,30)

	leal	0x5A827999(%rbp,%r9), %ebp	# e += RCONST + W[n]
	xorl	%edx, %edi	# (((c ^ d) & b) ^ d)
	addl	%edi, %ebp	# e += (((c ^ d) & b) ^ d)
	roll	$5, %edi	# rotl32(a,5)
	addl	%edi, %ebp	# e += rotl32(a,5)
	rorl	$2, %ebx	# b = rotl32(b,30)

	leal	0x5A827999(%rdx,%r10), %edx	# e += RCONST + W[n]
	xorl	%ecx, %edi	# (((c ^ d) & b) ^ d)
	addl	%edi, %edx	# e += (((c ^ d) & b) ^ d)
	roll	$5, %edi	# rotl32(a,5)
	addl	%edi, %edx	# e += rotl32(a,5)
	rorl	$2, %eax	# b = rotl32(b,30)

	leal	0x5A827999(%rcx,%r10), %ecx	# e += RCONST + W[n]
	xorl	%ebx, %edi	# (((c ^ d) & b) ^ d)
	addl	%edi, %ecx	# e += (((c ^ d) & b) ^ d)
	roll	$5, %edi	# rotl32(a,5)
	addl	%edi, %ecx	# e += rotl32(a,5)
	rorl	$2, %ebp	# b = rotl32(b,30)
# PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp)
	movaps	%xmm3, %xmm4
	psrldq	$4, %xmm4	# rshift by 4 bytes: T1 = ([13],[14],[15],0)
#	pshufd	$0x4e, %xmm0, %xmm5	# 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
#	punpcklqdq %xmm1, %xmm5	# T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
# same result as above, but shorter and faster:
# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
	movaps	%xmm0, %xmm5
	shufps	$0x4e, %xmm1, %xmm5	# 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
	xorps	%xmm2, %xmm0	# ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
	xorps	%xmm4, %xmm5	# ([13],[14],[15],0) ^ ([2],[3],[4],[5])
	xorps	%xmm5, %xmm0	# ^
# W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
	movaps	%xmm0, %xmm5	# keep unrotated W0 for the fixup
	xorps	%xmm4, %xmm4	# rol(W0,1):
	pcmpgtd	%xmm0, %xmm4	#  ffffffff for elements <0 (ones with msb bit 1)
	paddd	%xmm0, %xmm0	#  shift left by 1
	psubd	%xmm4, %xmm0	#  add 1 to those who had msb bit 1
# W0 = rotated (W[0]..W[3]), still needs W[3] fixup
	pslldq	$12, %xmm5	# lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
	movaps	%xmm5, %xmm4
	pslld	$2, %xmm5	# rol(T2,2) computed in two halves:
	psrld	$30, %xmm4
#	xorps	%xmm4, %xmm5	# rol((0,0,0,unrotW[0]),2)
	xorps	%xmm4, %xmm0	# same result, but does not depend on/does not modify T2
	xorps	%xmm5, %xmm0	# W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
	movaps	%xmm0, %xmm5
	paddd	%xmm6, %xmm5	# add RCONST
	movups	%xmm5, -64+16*0(%rsp)
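
# The PREP block above is one step of the SHA-1 message schedule,
# four words at a time.  Equivalent scalar C (sketch):
#	for (n = 0; n < 4; n++)	/* W[16..19] from W[0..15] */
#		W[16+n] = rotl32(W[13+n] ^ W[8+n] ^ W[2+n] ^ W[n], 1);
# except that the lane needing W[16], which is not ready yet, is
# computed with that input as zero and patched afterwards (the
# "W[3] fixup"): since rotation distributes over XOR, the missing
# contribution is exactly rol(unrotW[0],2), XORed in at the end.
# The vector rol-by-1 itself avoids a shift/or pair: pcmpgtd builds
# a mask from the sign bits, paddd doubles each lane, and psubd
# adds the mask back in, i.e. for unsigned x (sketch):
#	rol1(x) = (x + x) + (x >> 31);	/* == (x << 1) | (x >> 31) */
# Finally the current round constant (xmm6) is added and the four
# ready-made "RCONST + W[n]" sums are parked on the stack for the
# integer rounds to consume.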
	leal	0x5A827999(%rbx,%r11), %ebx	# e += RCONST + W[n]
	xorl	%eax, %edi	# (((c ^ d) & b) ^ d)
	addl	%edi, %ebx	# e += (((c ^ d) & b) ^ d)
	roll	$5, %edi	# rotl32(a,5)
	addl	%edi, %ebx	# e += rotl32(a,5)
	rorl	$2, %edx	# b = rotl32(b,30)

	leal	0x5A827999(%rax,%r11), %eax	# e += RCONST + W[n]
	xorl	%ebp, %edi	# (((c ^ d) & b) ^ d)
	addl	%edi, %eax	# e += (((c ^ d) & b) ^ d)
	roll	$5, %edi	# rotl32(a,5)
	addl	%edi, %eax	# e += rotl32(a,5)
	rorl	$2, %ecx	# b = rotl32(b,30)

	leal	0x5A827999(%rbp,%r12), %ebp	# e += RCONST + W[n]
	xorl	%edx, %edi	# (((c ^ d) & b) ^ d)
	addl	%edi, %ebp	# e += (((c ^ d) & b) ^ d)
	roll	$5, %edi	# rotl32(a,5)
	addl	%edi, %ebp	# e += rotl32(a,5)
	rorl	$2, %ebx	# b = rotl32(b,30)

	leal	0x5A827999(%rdx,%r12), %edx	# e += RCONST + W[n]
	xorl	%ecx, %edi	# (((c ^ d) & b) ^ d)
	addl	%edi, %edx	# e += (((c ^ d) & b) ^ d)
	roll	$5, %edi	# rotl32(a,5)
	addl	%edi, %edx	# e += rotl32(a,5)
	rorl	$2, %eax	# b = rotl32(b,30)
	pshufd	$0x55, %xmm7, %xmm6
# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp)
	movaps	%xmm0, %xmm4
	psrldq	$4, %xmm4	# rshift by 4 bytes: T1 = ([13],[14],[15],0)
#	pshufd	$0x4e, %xmm1, %xmm5	# 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
#	punpcklqdq %xmm2, %xmm5	# T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
# same result as above, but shorter and faster:
# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
	movaps	%xmm1, %xmm5
	shufps	$0x4e, %xmm2, %xmm5	# 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
	xorps	%xmm3, %xmm1	# ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
	xorps	%xmm4, %xmm5	# ([13],[14],[15],0) ^ ([2],[3],[4],[5])
	xorps	%xmm5, %xmm1	# ^
# W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
	movaps	%xmm1, %xmm5	# keep unrotated W0 for the fixup
	xorps	%xmm4, %xmm4	# rol(W0,1):
	pcmpgtd	%xmm1, %xmm4	#  ffffffff for elements <0 (ones with msb bit 1)
	paddd	%xmm1, %xmm1	#  shift left by 1
	psubd	%xmm4, %xmm1	#  add 1 to those who had msb bit 1
# W0 = rotated (W[0]..W[3]), still needs W[3] fixup
	pslldq	$12, %xmm5	# lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
	movaps	%xmm5, %xmm4
	pslld	$2, %xmm5	# rol(T2,2) computed in two halves:
	psrld	$30, %xmm4
#	xorps	%xmm4, %xmm5	# rol((0,0,0,unrotW[0]),2)
	xorps	%xmm4, %xmm1	# same result, but does not depend on/does not modify T2
	xorps	%xmm5, %xmm1	# W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
	movaps	%xmm1, %xmm5
	paddd	%xmm6, %xmm5	# add RCONST
	movups	%xmm5, -64+16*1(%rsp)
	leal	0x5A827999(%rcx,%r13), %ecx	# e += RCONST + W[n]
	xorl	%ebx, %edi	# (((c ^ d) & b) ^ d)
	addl	%edi, %ecx	# e += (((c ^ d) & b) ^ d)
	roll	$5, %edi	# rotl32(a,5)
	addl	%edi, %ecx	# e += rotl32(a,5)
	rorl	$2, %ebp	# b = rotl32(b,30)

	leal	0x5A827999(%rbx,%r13), %ebx	# e += RCONST + W[n]
	xorl	%eax, %edi	# (((c ^ d) & b) ^ d)
	addl	%edi, %ebx	# e += (((c ^ d) & b) ^ d)
	roll	$5, %edi	# rotl32(a,5)
	addl	%edi, %ebx	# e += rotl32(a,5)
	rorl	$2, %edx	# b = rotl32(b,30)

	leal	0x5A827999(%rax,%r14), %eax	# e += RCONST + W[n]
	xorl	%ebp, %edi	# (((c ^ d) & b) ^ d)
	addl	%edi, %eax	# e += (((c ^ d) & b) ^ d)
	roll	$5, %edi	# rotl32(a,5)
	addl	%edi, %eax	# e += rotl32(a,5)
	rorl	$2, %ecx	# b = rotl32(b,30)

	leal	0x5A827999(%rbp,%r14), %ebp	# e += RCONST + W[n]
	xorl	%edx, %edi	# (((c ^ d) & b) ^ d)
	addl	%edi, %ebp	# e += (((c ^ d) & b) ^ d)
	roll	$5, %edi	# rotl32(a,5)
	addl	%edi, %ebp	# e += rotl32(a,5)
	rorl	$2, %ebx	# b = rotl32(b,30)
# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp)
	movaps	%xmm1, %xmm4
	psrldq	$4, %xmm4	# rshift by 4 bytes: T1 = ([13],[14],[15],0)
#	pshufd	$0x4e, %xmm2, %xmm5	# 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
#	punpcklqdq %xmm3, %xmm5	# T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
# same result as above, but shorter and faster:
# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
	movaps	%xmm2, %xmm5
	shufps	$0x4e, %xmm3, %xmm5	# 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
	xorps	%xmm0, %xmm2	# ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
	xorps	%xmm4, %xmm5	# ([13],[14],[15],0) ^ ([2],[3],[4],[5])
	xorps	%xmm5, %xmm2	# ^
# W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
	movaps	%xmm2, %xmm5	# keep unrotated W0 for the fixup
	xorps	%xmm4, %xmm4	# rol(W0,1):
	pcmpgtd	%xmm2, %xmm4	#  ffffffff for elements <0 (ones with msb bit 1)
	paddd	%xmm2, %xmm2	#  shift left by 1
	psubd	%xmm4, %xmm2	#  add 1 to those who had msb bit 1
# W0 = rotated (W[0]..W[3]), still needs W[3] fixup
	pslldq	$12, %xmm5	# lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
	movaps	%xmm5, %xmm4
	pslld	$2, %xmm5	# rol(T2,2) computed in two halves:
	psrld	$30, %xmm4
#	xorps	%xmm4, %xmm5	# rol((0,0,0,unrotW[0]),2)
	xorps	%xmm4, %xmm2	# same result, but does not depend on/does not modify T2
	xorps	%xmm5, %xmm2	# W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
	movaps	%xmm2, %xmm5
	paddd	%xmm6, %xmm5	# add RCONST
	movups	%xmm5, -64+16*2(%rsp)
	xorl	%ecx, %edi	# (((c ^ d) & b) ^ d)
	addl	-64+4*0(%rsp), %edx	# e += RCONST + W[n & 15]
	addl	%edi, %edx	# e += (((c ^ d) & b) ^ d)
	roll	$5, %esi	# rotl32(a,5)
	addl	%esi, %edx	# e += rotl32(a,5)
	rorl	$2, %eax	# b = rotl32(b,30)
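
# Rounds 16..79 fetch their "RCONST + W[n & 15]" term straight from
# the sixteen dword slots at -64(%rsp) (inside the 128-byte x86-64
# SysV red zone, so no explicit stack allocation is needed); the
# round constant was already folded in by the vector paddd above.
# Per round this is just (sketch):
#	e += slot[n & 15] + F(b, c, d) + rotl32(a, 5);	/* slot[] = RCONST + W[] */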
	xorl	%ebx, %edi	# (((c ^ d) & b) ^ d)
	addl	-64+4*1(%rsp), %ecx	# e += RCONST + W[n & 15]
	addl	%edi, %ecx	# e += (((c ^ d) & b) ^ d)
	roll	$5, %esi	# rotl32(a,5)
	addl	%esi, %ecx	# e += rotl32(a,5)
	rorl	$2, %ebp	# b = rotl32(b,30)

	xorl	%eax, %edi	# (((c ^ d) & b) ^ d)
	addl	-64+4*2(%rsp), %ebx	# e += RCONST + W[n & 15]
	addl	%edi, %ebx	# e += (((c ^ d) & b) ^ d)
	roll	$5, %esi	# rotl32(a,5)
	addl	%esi, %ebx	# e += rotl32(a,5)
	rorl	$2, %edx	# b = rotl32(b,30)

	xorl	%ebp, %edi	# (((c ^ d) & b) ^ d)
	addl	-64+4*3(%rsp), %eax	# e += RCONST + W[n & 15]
	addl	%edi, %eax	# e += (((c ^ d) & b) ^ d)
	roll	$5, %esi	# rotl32(a,5)
	addl	%esi, %eax	# e += rotl32(a,5)
	rorl	$2, %ecx	# b = rotl32(b,30)
# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp)
	movaps	%xmm2, %xmm4
	psrldq	$4, %xmm4	# rshift by 4 bytes: T1 = ([13],[14],[15],0)
#	pshufd	$0x4e, %xmm3, %xmm5	# 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
#	punpcklqdq %xmm0, %xmm5	# T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
# same result as above, but shorter and faster:
# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
	movaps	%xmm3, %xmm5
	shufps	$0x4e, %xmm0, %xmm5	# 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
	xorps	%xmm1, %xmm3	# ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
	xorps	%xmm4, %xmm5	# ([13],[14],[15],0) ^ ([2],[3],[4],[5])
	xorps	%xmm5, %xmm3	# ^
# W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
	movaps	%xmm3, %xmm5	# keep unrotated W0 for the fixup
	xorps	%xmm4, %xmm4	# rol(W0,1):
	pcmpgtd	%xmm3, %xmm4	#  ffffffff for elements <0 (ones with msb bit 1)
	paddd	%xmm3, %xmm3	#  shift left by 1
	psubd	%xmm4, %xmm3	#  add 1 to those who had msb bit 1
# W0 = rotated (W[0]..W[3]), still needs W[3] fixup
	pslldq	$12, %xmm5	# lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
	movaps	%xmm5, %xmm4
	pslld	$2, %xmm5	# rol(T2,2) computed in two halves:
	psrld	$30, %xmm4
#	xorps	%xmm4, %xmm5	# rol((0,0,0,unrotW[0]),2)
	xorps	%xmm4, %xmm3	# same result, but does not depend on/does not modify T2
	xorps	%xmm5, %xmm3	# W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
	movaps	%xmm3, %xmm5
	paddd	%xmm6, %xmm5	# add RCONST
	movups	%xmm5, -64+16*3(%rsp)
	addl	-64+4*4(%rsp), %ebp	# e += RCONST + W[n & 15]
	addl	%edi, %ebp	# e += (c ^ d ^ b)
	roll	$5, %esi	# rotl32(a,5)
	addl	%esi, %ebp	# e += rotl32(a,5)
	rorl	$2, %ebx	# b = rotl32(b,30)
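
# Rounds 20..39 (and 60..79) use the parity function; the XOR chain
# that produces %edi above is, in C (sketch):
#	static inline uint32_t parity(uint32_t b, uint32_t c, uint32_t d)
#	{
#		return b ^ c ^ d;
#	}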
	addl	-64+4*5(%rsp), %edx	# e += RCONST + W[n & 15]
	addl	%edi, %edx	# e += (c ^ d ^ b)
	roll	$5, %esi	# rotl32(a,5)
	addl	%esi, %edx	# e += rotl32(a,5)
	rorl	$2, %eax	# b = rotl32(b,30)

	addl	-64+4*6(%rsp), %ecx	# e += RCONST + W[n & 15]
	addl	%edi, %ecx	# e += (c ^ d ^ b)
	roll	$5, %esi	# rotl32(a,5)
	addl	%esi, %ecx	# e += rotl32(a,5)
	rorl	$2, %ebp	# b = rotl32(b,30)

	addl	-64+4*7(%rsp), %ebx	# e += RCONST + W[n & 15]
	addl	%edi, %ebx	# e += (c ^ d ^ b)
	roll	$5, %esi	# rotl32(a,5)
	addl	%esi, %ebx	# e += rotl32(a,5)
	rorl	$2, %edx	# b = rotl32(b,30)
# PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp)
	movaps	%xmm3, %xmm4
	psrldq	$4, %xmm4	# rshift by 4 bytes: T1 = ([13],[14],[15],0)
#	pshufd	$0x4e, %xmm0, %xmm5	# 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
#	punpcklqdq %xmm1, %xmm5	# T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
# same result as above, but shorter and faster:
# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
	movaps	%xmm0, %xmm5
	shufps	$0x4e, %xmm1, %xmm5	# 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
	xorps	%xmm2, %xmm0	# ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
	xorps	%xmm4, %xmm5	# ([13],[14],[15],0) ^ ([2],[3],[4],[5])
	xorps	%xmm5, %xmm0	# ^
# W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
	movaps	%xmm0, %xmm5	# keep unrotated W0 for the fixup
	xorps	%xmm4, %xmm4	# rol(W0,1):
	pcmpgtd	%xmm0, %xmm4	#  ffffffff for elements <0 (ones with msb bit 1)
	paddd	%xmm0, %xmm0	#  shift left by 1
	psubd	%xmm4, %xmm0	#  add 1 to those who had msb bit 1
# W0 = rotated (W[0]..W[3]), still needs W[3] fixup
	pslldq	$12, %xmm5	# lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
	movaps	%xmm5, %xmm4
	pslld	$2, %xmm5	# rol(T2,2) computed in two halves:
	psrld	$30, %xmm4
#	xorps	%xmm4, %xmm5	# rol((0,0,0,unrotW[0]),2)
	xorps	%xmm4, %xmm0	# same result, but does not depend on/does not modify T2
	xorps	%xmm5, %xmm0	# W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
	movaps	%xmm0, %xmm5
	paddd	%xmm6, %xmm5	# add RCONST
	movups	%xmm5, -64+16*0(%rsp)
	addl	-64+4*8(%rsp), %eax	# e += RCONST + W[n & 15]
	addl	%edi, %eax	# e += (c ^ d ^ b)
	roll	$5, %esi	# rotl32(a,5)
	addl	%esi, %eax	# e += rotl32(a,5)
	rorl	$2, %ecx	# b = rotl32(b,30)

	addl	-64+4*9(%rsp), %ebp	# e += RCONST + W[n & 15]
	addl	%edi, %ebp	# e += (c ^ d ^ b)
	roll	$5, %esi	# rotl32(a,5)
	addl	%esi, %ebp	# e += rotl32(a,5)
	rorl	$2, %ebx	# b = rotl32(b,30)

	addl	-64+4*10(%rsp), %edx	# e += RCONST + W[n & 15]
	addl	%edi, %edx	# e += (c ^ d ^ b)
	roll	$5, %esi	# rotl32(a,5)
	addl	%esi, %edx	# e += rotl32(a,5)
	rorl	$2, %eax	# b = rotl32(b,30)

	addl	-64+4*11(%rsp), %ecx	# e += RCONST + W[n & 15]
	addl	%edi, %ecx	# e += (c ^ d ^ b)
	roll	$5, %esi	# rotl32(a,5)
	addl	%esi, %ecx	# e += rotl32(a,5)
	rorl	$2, %ebp	# b = rotl32(b,30)
# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp)
	movaps	%xmm0, %xmm4
	psrldq	$4, %xmm4	# rshift by 4 bytes: T1 = ([13],[14],[15],0)
#	pshufd	$0x4e, %xmm1, %xmm5	# 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
#	punpcklqdq %xmm2, %xmm5	# T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
# same result as above, but shorter and faster:
# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
	movaps	%xmm1, %xmm5
	shufps	$0x4e, %xmm2, %xmm5	# 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
	xorps	%xmm3, %xmm1	# ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
	xorps	%xmm4, %xmm5	# ([13],[14],[15],0) ^ ([2],[3],[4],[5])
	xorps	%xmm5, %xmm1	# ^
# W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
	movaps	%xmm1, %xmm5	# keep unrotated W0 for the fixup
	xorps	%xmm4, %xmm4	# rol(W0,1):
	pcmpgtd	%xmm1, %xmm4	#  ffffffff for elements <0 (ones with msb bit 1)
	paddd	%xmm1, %xmm1	#  shift left by 1
	psubd	%xmm4, %xmm1	#  add 1 to those who had msb bit 1
# W0 = rotated (W[0]..W[3]), still needs W[3] fixup
	pslldq	$12, %xmm5	# lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
	movaps	%xmm5, %xmm4
	pslld	$2, %xmm5	# rol(T2,2) computed in two halves:
	psrld	$30, %xmm4
#	xorps	%xmm4, %xmm5	# rol((0,0,0,unrotW[0]),2)
	xorps	%xmm4, %xmm1	# same result, but does not depend on/does not modify T2
	xorps	%xmm5, %xmm1	# W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
	movaps	%xmm1, %xmm5
	paddd	%xmm6, %xmm5	# add RCONST
	movups	%xmm5, -64+16*1(%rsp)
	addl	-64+4*12(%rsp), %ebx	# e += RCONST + W[n & 15]
	addl	%edi, %ebx	# e += (c ^ d ^ b)
	roll	$5, %esi	# rotl32(a,5)
	addl	%esi, %ebx	# e += rotl32(a,5)
	rorl	$2, %edx	# b = rotl32(b,30)

	addl	-64+4*13(%rsp), %eax	# e += RCONST + W[n & 15]
	addl	%edi, %eax	# e += (c ^ d ^ b)
	roll	$5, %esi	# rotl32(a,5)
	addl	%esi, %eax	# e += rotl32(a,5)
	rorl	$2, %ecx	# b = rotl32(b,30)

	addl	-64+4*14(%rsp), %ebp	# e += RCONST + W[n & 15]
	addl	%edi, %ebp	# e += (c ^ d ^ b)
	roll	$5, %esi	# rotl32(a,5)
	addl	%esi, %ebp	# e += rotl32(a,5)
	rorl	$2, %ebx	# b = rotl32(b,30)

	addl	-64+4*15(%rsp), %edx	# e += RCONST + W[n & 15]
	addl	%edi, %edx	# e += (c ^ d ^ b)
	roll	$5, %esi	# rotl32(a,5)
	addl	%esi, %edx	# e += rotl32(a,5)
	rorl	$2, %eax	# b = rotl32(b,30)

	pshufd	$0xaa, %xmm7, %xmm6
# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp)
	movaps	%xmm1, %xmm4
	psrldq	$4, %xmm4	# rshift by 4 bytes: T1 = ([13],[14],[15],0)
#	pshufd	$0x4e, %xmm2, %xmm5	# 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
#	punpcklqdq %xmm3, %xmm5	# T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
# same result as above, but shorter and faster:
# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
	movaps	%xmm2, %xmm5
	shufps	$0x4e, %xmm3, %xmm5	# 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
	xorps	%xmm0, %xmm2	# ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
	xorps	%xmm4, %xmm5	# ([13],[14],[15],0) ^ ([2],[3],[4],[5])
	xorps	%xmm5, %xmm2	# ^
# W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
	movaps	%xmm2, %xmm5	# keep unrotated W0 for the fixup
	xorps	%xmm4, %xmm4	# rol(W0,1):
	pcmpgtd	%xmm2, %xmm4	#  ffffffff for elements <0 (ones with msb bit 1)
	paddd	%xmm2, %xmm2	#  shift left by 1
	psubd	%xmm4, %xmm2	#  add 1 to those who had msb bit 1
# W0 = rotated (W[0]..W[3]), still needs W[3] fixup
	pslldq	$12, %xmm5	# lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
	movaps	%xmm5, %xmm4
	pslld	$2, %xmm5	# rol(T2,2) computed in two halves:
	psrld	$30, %xmm4
#	xorps	%xmm4, %xmm5	# rol((0,0,0,unrotW[0]),2)
	xorps	%xmm4, %xmm2	# same result, but does not depend on/does not modify T2
	xorps	%xmm5, %xmm2	# W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
	movaps	%xmm2, %xmm5
	paddd	%xmm6, %xmm5	# add RCONST
	movups	%xmm5, -64+16*2(%rsp)
	addl	-64+4*0(%rsp), %ecx	# e += RCONST + W[n & 15]
	addl	%edi, %ecx	# e += (c ^ d ^ b)
	roll	$5, %esi	# rotl32(a,5)
	addl	%esi, %ecx	# e += rotl32(a,5)
	rorl	$2, %ebp	# b = rotl32(b,30)

	addl	-64+4*1(%rsp), %ebx	# e += RCONST + W[n & 15]
	addl	%edi, %ebx	# e += (c ^ d ^ b)
	roll	$5, %esi	# rotl32(a,5)
	addl	%esi, %ebx	# e += rotl32(a,5)
	rorl	$2, %edx	# b = rotl32(b,30)

	addl	-64+4*2(%rsp), %eax	# e += RCONST + W[n & 15]
	addl	%edi, %eax	# e += (c ^ d ^ b)
	roll	$5, %esi	# rotl32(a,5)
	addl	%esi, %eax	# e += rotl32(a,5)
	rorl	$2, %ecx	# b = rotl32(b,30)

	addl	-64+4*3(%rsp), %ebp	# e += RCONST + W[n & 15]
	addl	%edi, %ebp	# e += (c ^ d ^ b)
	roll	$5, %esi	# rotl32(a,5)
	addl	%esi, %ebp	# e += rotl32(a,5)
	rorl	$2, %ebx	# b = rotl32(b,30)
# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp)
	movaps	%xmm2, %xmm4
	psrldq	$4, %xmm4	# rshift by 4 bytes: T1 = ([13],[14],[15],0)
#	pshufd	$0x4e, %xmm3, %xmm5	# 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
#	punpcklqdq %xmm0, %xmm5	# T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
# same result as above, but shorter and faster:
# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
	movaps	%xmm3, %xmm5
	shufps	$0x4e, %xmm0, %xmm5	# 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
	xorps	%xmm1, %xmm3	# ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
	xorps	%xmm4, %xmm5	# ([13],[14],[15],0) ^ ([2],[3],[4],[5])
	xorps	%xmm5, %xmm3	# ^
# W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
	movaps	%xmm3, %xmm5	# keep unrotated W0 for the fixup
	xorps	%xmm4, %xmm4	# rol(W0,1):
	pcmpgtd	%xmm3, %xmm4	#  ffffffff for elements <0 (ones with msb bit 1)
	paddd	%xmm3, %xmm3	#  shift left by 1
	psubd	%xmm4, %xmm3	#  add 1 to those who had msb bit 1
# W0 = rotated (W[0]..W[3]), still needs W[3] fixup
	pslldq	$12, %xmm5	# lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
	movaps	%xmm5, %xmm4
	pslld	$2, %xmm5	# rol(T2,2) computed in two halves:
	psrld	$30, %xmm4
#	xorps	%xmm4, %xmm5	# rol((0,0,0,unrotW[0]),2)
	xorps	%xmm4, %xmm3	# same result, but does not depend on/does not modify T2
	xorps	%xmm5, %xmm3	# W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
	movaps	%xmm3, %xmm5
	paddd	%xmm6, %xmm5	# add RCONST
	movups	%xmm5, -64+16*3(%rsp)
	addl	-64+4*4(%rsp), %edx	# e += RCONST + W[n & 15]
	addl	%edi, %edx	# e += (c ^ d ^ b)
	roll	$5, %esi	# rotl32(a,5)
	addl	%esi, %edx	# e += rotl32(a,5)
	rorl	$2, %eax	# b = rotl32(b,30)

	addl	-64+4*5(%rsp), %ecx	# e += RCONST + W[n & 15]
	addl	%edi, %ecx	# e += (c ^ d ^ b)
	roll	$5, %esi	# rotl32(a,5)
	addl	%esi, %ecx	# e += rotl32(a,5)
	rorl	$2, %ebp	# b = rotl32(b,30)

	addl	-64+4*6(%rsp), %ebx	# e += RCONST + W[n & 15]
	addl	%edi, %ebx	# e += (c ^ d ^ b)
	roll	$5, %esi	# rotl32(a,5)
	addl	%esi, %ebx	# e += rotl32(a,5)
	rorl	$2, %edx	# b = rotl32(b,30)

	addl	-64+4*7(%rsp), %eax	# e += RCONST + W[n & 15]
	addl	%edi, %eax	# e += (c ^ d ^ b)
	roll	$5, %esi	# rotl32(a,5)
	addl	%esi, %eax	# e += rotl32(a,5)
	rorl	$2, %ecx	# b = rotl32(b,30)
# PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp)
	movaps	%xmm3, %xmm4
	psrldq	$4, %xmm4	# rshift by 4 bytes: T1 = ([13],[14],[15],0)
#	pshufd	$0x4e, %xmm0, %xmm5	# 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
#	punpcklqdq %xmm1, %xmm5	# T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
# same result as above, but shorter and faster:
# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
	movaps	%xmm0, %xmm5
	shufps	$0x4e, %xmm1, %xmm5	# 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
	xorps	%xmm2, %xmm0	# ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
	xorps	%xmm4, %xmm5	# ([13],[14],[15],0) ^ ([2],[3],[4],[5])
	xorps	%xmm5, %xmm0	# ^
# W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
	movaps	%xmm0, %xmm5	# keep unrotated W0 for the fixup
	xorps	%xmm4, %xmm4	# rol(W0,1):
	pcmpgtd	%xmm0, %xmm4	#  ffffffff for elements <0 (ones with msb bit 1)
	paddd	%xmm0, %xmm0	#  shift left by 1
	psubd	%xmm4, %xmm0	#  add 1 to those who had msb bit 1
# W0 = rotated (W[0]..W[3]), still needs W[3] fixup
	pslldq	$12, %xmm5	# lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
	movaps	%xmm5, %xmm4
	pslld	$2, %xmm5	# rol(T2,2) computed in two halves:
	psrld	$30, %xmm4
#	xorps	%xmm4, %xmm5	# rol((0,0,0,unrotW[0]),2)
	xorps	%xmm4, %xmm0	# same result, but does not depend on/does not modify T2
	xorps	%xmm5, %xmm0	# W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
	movaps	%xmm0, %xmm5
	paddd	%xmm6, %xmm5	# add RCONST
	movups	%xmm5, -64+16*0(%rsp)
	movl	%ebx, %edi	# di: b
	movl	%ebx, %esi	# si: b
	orl	%ecx, %edi	# di: b | c
	andl	%ecx, %esi	# si: b & c
	andl	%edx, %edi	# di: (b | c) & d
	orl	%esi, %edi	# ((b | c) & d) | (b & c)
	addl	%edi, %ebp	# += ((b | c) & d) | (b & c)
	addl	-64+4*8(%rsp), %ebp	# e += RCONST + W[n & 15]
	roll	$5, %esi	# rotl32(a,5)
	addl	%esi, %ebp	# e += rotl32(a,5)
	rorl	$2, %ebx	# b = rotl32(b,30)
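
# Rounds 40..59 use the majority function.  The or/and/or sequence
# above computes it with one operation fewer than the textbook
# (b & c) | (b & d) | (c & d); the two forms agree because "at least
# two of three are set" regroups as below.  C sketch:
#	static inline uint32_t maj(uint32_t b, uint32_t c, uint32_t d)
#	{
#		return ((b | c) & d) | (b & c);	/* == (b&c)|(b&d)|(c&d) */
#	}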
	movl	%eax, %edi	# di: b
	movl	%eax, %esi	# si: b
	orl	%ebx, %edi	# di: b | c
	andl	%ebx, %esi	# si: b & c
	andl	%ecx, %edi	# di: (b | c) & d
	orl	%esi, %edi	# ((b | c) & d) | (b & c)
	addl	%edi, %edx	# += ((b | c) & d) | (b & c)
	addl	-64+4*9(%rsp), %edx	# e += RCONST + W[n & 15]
	roll	$5, %esi	# rotl32(a,5)
	addl	%esi, %edx	# e += rotl32(a,5)
	rorl	$2, %eax	# b = rotl32(b,30)

	movl	%ebp, %edi	# di: b
	movl	%ebp, %esi	# si: b
	orl	%eax, %edi	# di: b | c
	andl	%eax, %esi	# si: b & c
	andl	%ebx, %edi	# di: (b | c) & d
	orl	%esi, %edi	# ((b | c) & d) | (b & c)
	addl	%edi, %ecx	# += ((b | c) & d) | (b & c)
	addl	-64+4*10(%rsp), %ecx	# e += RCONST + W[n & 15]
	roll	$5, %esi	# rotl32(a,5)
	addl	%esi, %ecx	# e += rotl32(a,5)
	rorl	$2, %ebp	# b = rotl32(b,30)

	movl	%edx, %edi	# di: b
	movl	%edx, %esi	# si: b
	orl	%ebp, %edi	# di: b | c
	andl	%ebp, %esi	# si: b & c
	andl	%eax, %edi	# di: (b | c) & d
	orl	%esi, %edi	# ((b | c) & d) | (b & c)
	addl	%edi, %ebx	# += ((b | c) & d) | (b & c)
	addl	-64+4*11(%rsp), %ebx	# e += RCONST + W[n & 15]
	roll	$5, %esi	# rotl32(a,5)
	addl	%esi, %ebx	# e += rotl32(a,5)
	rorl	$2, %edx	# b = rotl32(b,30)
# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp)
	movaps	%xmm0, %xmm4
	psrldq	$4, %xmm4	# rshift by 4 bytes: T1 = ([13],[14],[15],0)
#	pshufd	$0x4e, %xmm1, %xmm5	# 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
#	punpcklqdq %xmm2, %xmm5	# T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
# same result as above, but shorter and faster:
# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
	movaps	%xmm1, %xmm5
	shufps	$0x4e, %xmm2, %xmm5	# 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
	xorps	%xmm3, %xmm1	# ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
	xorps	%xmm4, %xmm5	# ([13],[14],[15],0) ^ ([2],[3],[4],[5])
	xorps	%xmm5, %xmm1	# ^
# W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
	movaps	%xmm1, %xmm5	# keep unrotated W0 for the fixup
	xorps	%xmm4, %xmm4	# rol(W0,1):
	pcmpgtd	%xmm1, %xmm4	#  ffffffff for elements <0 (ones with msb bit 1)
	paddd	%xmm1, %xmm1	#  shift left by 1
	psubd	%xmm4, %xmm1	#  add 1 to those who had msb bit 1
# W0 = rotated (W[0]..W[3]), still needs W[3] fixup
	pslldq	$12, %xmm5	# lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
	movaps	%xmm5, %xmm4
	pslld	$2, %xmm5	# rol(T2,2) computed in two halves:
	psrld	$30, %xmm4
#	xorps	%xmm4, %xmm5	# rol((0,0,0,unrotW[0]),2)
	xorps	%xmm4, %xmm1	# same result, but does not depend on/does not modify T2
	xorps	%xmm5, %xmm1	# W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
	movaps	%xmm1, %xmm5
	paddd	%xmm6, %xmm5	# add RCONST
	movups	%xmm5, -64+16*1(%rsp)
	movl	%ecx, %edi	# di: b
	movl	%ecx, %esi	# si: b
	orl	%edx, %edi	# di: b | c
	andl	%edx, %esi	# si: b & c
	andl	%ebp, %edi	# di: (b | c) & d
	orl	%esi, %edi	# ((b | c) & d) | (b & c)
	addl	%edi, %eax	# += ((b | c) & d) | (b & c)
	addl	-64+4*12(%rsp), %eax	# e += RCONST + W[n & 15]
	roll	$5, %esi	# rotl32(a,5)
	addl	%esi, %eax	# e += rotl32(a,5)
	rorl	$2, %ecx	# b = rotl32(b,30)

	movl	%ebx, %edi	# di: b
	movl	%ebx, %esi	# si: b
	orl	%ecx, %edi	# di: b | c
	andl	%ecx, %esi	# si: b & c
	andl	%edx, %edi	# di: (b | c) & d
	orl	%esi, %edi	# ((b | c) & d) | (b & c)
	addl	%edi, %ebp	# += ((b | c) & d) | (b & c)
	addl	-64+4*13(%rsp), %ebp	# e += RCONST + W[n & 15]
	roll	$5, %esi	# rotl32(a,5)
	addl	%esi, %ebp	# e += rotl32(a,5)
	rorl	$2, %ebx	# b = rotl32(b,30)

	movl	%eax, %edi	# di: b
	movl	%eax, %esi	# si: b
	orl	%ebx, %edi	# di: b | c
	andl	%ebx, %esi	# si: b & c
	andl	%ecx, %edi	# di: (b | c) & d
	orl	%esi, %edi	# ((b | c) & d) | (b & c)
	addl	%edi, %edx	# += ((b | c) & d) | (b & c)
	addl	-64+4*14(%rsp), %edx	# e += RCONST + W[n & 15]
	roll	$5, %esi	# rotl32(a,5)
	addl	%esi, %edx	# e += rotl32(a,5)
	rorl	$2, %eax	# b = rotl32(b,30)

	movl	%ebp, %edi	# di: b
	movl	%ebp, %esi	# si: b
	orl	%eax, %edi	# di: b | c
	andl	%eax, %esi	# si: b & c
	andl	%ebx, %edi	# di: (b | c) & d
	orl	%esi, %edi	# ((b | c) & d) | (b & c)
	addl	%edi, %ecx	# += ((b | c) & d) | (b & c)
	addl	-64+4*15(%rsp), %ecx	# e += RCONST + W[n & 15]
	roll	$5, %esi	# rotl32(a,5)
	addl	%esi, %ecx	# e += rotl32(a,5)
	rorl	$2, %ebp	# b = rotl32(b,30)
# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp)
	movaps	%xmm1, %xmm4
	psrldq	$4, %xmm4	# rshift by 4 bytes: T1 = ([13],[14],[15],0)
#	pshufd	$0x4e, %xmm2, %xmm5	# 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
#	punpcklqdq %xmm3, %xmm5	# T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
# same result as above, but shorter and faster:
# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
	movaps	%xmm2, %xmm5
	shufps	$0x4e, %xmm3, %xmm5	# 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
	xorps	%xmm0, %xmm2	# ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
	xorps	%xmm4, %xmm5	# ([13],[14],[15],0) ^ ([2],[3],[4],[5])
	xorps	%xmm5, %xmm2	# ^
# W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
	movaps	%xmm2, %xmm5	# keep unrotated W0 for the fixup
	xorps	%xmm4, %xmm4	# rol(W0,1):
	pcmpgtd	%xmm2, %xmm4	#  ffffffff for elements <0 (ones with msb bit 1)
	paddd	%xmm2, %xmm2	#  shift left by 1
	psubd	%xmm4, %xmm2	#  add 1 to those who had msb bit 1
# W0 = rotated (W[0]..W[3]), still needs W[3] fixup
	pslldq	$12, %xmm5	# lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
	movaps	%xmm5, %xmm4
	pslld	$2, %xmm5	# rol(T2,2) computed in two halves:
	psrld	$30, %xmm4
#	xorps	%xmm4, %xmm5	# rol((0,0,0,unrotW[0]),2)
	xorps	%xmm4, %xmm2	# same result, but does not depend on/does not modify T2
	xorps	%xmm5, %xmm2	# W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
	movaps	%xmm2, %xmm5
	paddd	%xmm6, %xmm5	# add RCONST
	movups	%xmm5, -64+16*2(%rsp)
	movl	%edx, %edi	# di: b
	movl	%edx, %esi	# si: b
	orl	%ebp, %edi	# di: b | c
	andl	%ebp, %esi	# si: b & c
	andl	%eax, %edi	# di: (b | c) & d
	orl	%esi, %edi	# ((b | c) & d) | (b & c)
	addl	%edi, %ebx	# += ((b | c) & d) | (b & c)
	addl	-64+4*0(%rsp), %ebx	# e += RCONST + W[n & 15]
	roll	$5, %esi	# rotl32(a,5)
	addl	%esi, %ebx	# e += rotl32(a,5)
	rorl	$2, %edx	# b = rotl32(b,30)

	movl	%ecx, %edi	# di: b
	movl	%ecx, %esi	# si: b
	orl	%edx, %edi	# di: b | c
	andl	%edx, %esi	# si: b & c
	andl	%ebp, %edi	# di: (b | c) & d
	orl	%esi, %edi	# ((b | c) & d) | (b & c)
	addl	%edi, %eax	# += ((b | c) & d) | (b & c)
	addl	-64+4*1(%rsp), %eax	# e += RCONST + W[n & 15]
	roll	$5, %esi	# rotl32(a,5)
	addl	%esi, %eax	# e += rotl32(a,5)
	rorl	$2, %ecx	# b = rotl32(b,30)

	movl	%ebx, %edi	# di: b
	movl	%ebx, %esi	# si: b
	orl	%ecx, %edi	# di: b | c
	andl	%ecx, %esi	# si: b & c
	andl	%edx, %edi	# di: (b | c) & d
	orl	%esi, %edi	# ((b | c) & d) | (b & c)
	addl	%edi, %ebp	# += ((b | c) & d) | (b & c)
	addl	-64+4*2(%rsp), %ebp	# e += RCONST + W[n & 15]
	roll	$5, %esi	# rotl32(a,5)
	addl	%esi, %ebp	# e += rotl32(a,5)
	rorl	$2, %ebx	# b = rotl32(b,30)

	movl	%eax, %edi	# di: b
	movl	%eax, %esi	# si: b
	orl	%ebx, %edi	# di: b | c
	andl	%ebx, %esi	# si: b & c
	andl	%ecx, %edi	# di: (b | c) & d
	orl	%esi, %edi	# ((b | c) & d) | (b & c)
	addl	%edi, %edx	# += ((b | c) & d) | (b & c)
	addl	-64+4*3(%rsp), %edx	# e += RCONST + W[n & 15]
	roll	$5, %esi	# rotl32(a,5)
	addl	%esi, %edx	# e += rotl32(a,5)
	rorl	$2, %eax	# b = rotl32(b,30)

	pshufd	$0xff, %xmm7, %xmm6
# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp)
	movaps	%xmm2, %xmm4
	psrldq	$4, %xmm4	# rshift by 4 bytes: T1 = ([13],[14],[15],0)
#	pshufd	$0x4e, %xmm3, %xmm5	# 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
#	punpcklqdq %xmm0, %xmm5	# T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
# same result as above, but shorter and faster:
# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
	movaps	%xmm3, %xmm5
	shufps	$0x4e, %xmm0, %xmm5	# 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
	xorps	%xmm1, %xmm3	# ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
	xorps	%xmm4, %xmm5	# ([13],[14],[15],0) ^ ([2],[3],[4],[5])
	xorps	%xmm5, %xmm3	# ^
# W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
	movaps	%xmm3, %xmm5	# keep unrotated W0 for the fixup
	xorps	%xmm4, %xmm4	# rol(W0,1):
	pcmpgtd	%xmm3, %xmm4	#  ffffffff for elements <0 (ones with msb bit 1)
	paddd	%xmm3, %xmm3	#  shift left by 1
	psubd	%xmm4, %xmm3	#  add 1 to those who had msb bit 1
# W0 = rotated (W[0]..W[3]), still needs W[3] fixup
	pslldq	$12, %xmm5	# lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
	movaps	%xmm5, %xmm4
	pslld	$2, %xmm5	# rol(T2,2) computed in two halves:
	psrld	$30, %xmm4
#	xorps	%xmm4, %xmm5	# rol((0,0,0,unrotW[0]),2)
	xorps	%xmm4, %xmm3	# same result, but does not depend on/does not modify T2
	xorps	%xmm5, %xmm3	# W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
	movaps	%xmm3, %xmm5
	paddd	%xmm6, %xmm5	# add RCONST
	movups	%xmm5, -64+16*3(%rsp)
	movl	%ebp, %edi	# di: b
	movl	%ebp, %esi	# si: b
	orl	%eax, %edi	# di: b | c
	andl	%eax, %esi	# si: b & c
	andl	%ebx, %edi	# di: (b | c) & d
	orl	%esi, %edi	# ((b | c) & d) | (b & c)
	addl	%edi, %ecx	# += ((b | c) & d) | (b & c)
	addl	-64+4*4(%rsp), %ecx	# e += RCONST + W[n & 15]
	roll	$5, %esi	# rotl32(a,5)
	addl	%esi, %ecx	# e += rotl32(a,5)
	rorl	$2, %ebp	# b = rotl32(b,30)

	movl	%edx, %edi	# di: b
	movl	%edx, %esi	# si: b
	orl	%ebp, %edi	# di: b | c
	andl	%ebp, %esi	# si: b & c
	andl	%eax, %edi	# di: (b | c) & d
	orl	%esi, %edi	# ((b | c) & d) | (b & c)
	addl	%edi, %ebx	# += ((b | c) & d) | (b & c)
	addl	-64+4*5(%rsp), %ebx	# e += RCONST + W[n & 15]
	roll	$5, %esi	# rotl32(a,5)
	addl	%esi, %ebx	# e += rotl32(a,5)
	rorl	$2, %edx	# b = rotl32(b,30)

	movl	%ecx, %edi	# di: b
	movl	%ecx, %esi	# si: b
	orl	%edx, %edi	# di: b | c
	andl	%edx, %esi	# si: b & c
	andl	%ebp, %edi	# di: (b | c) & d
	orl	%esi, %edi	# ((b | c) & d) | (b & c)
	addl	%edi, %eax	# += ((b | c) & d) | (b & c)
	addl	-64+4*6(%rsp), %eax	# e += RCONST + W[n & 15]
	roll	$5, %esi	# rotl32(a,5)
	addl	%esi, %eax	# e += rotl32(a,5)
	rorl	$2, %ecx	# b = rotl32(b,30)

	movl	%ebx, %edi	# di: b
	movl	%ebx, %esi	# si: b
	orl	%ecx, %edi	# di: b | c
	andl	%ecx, %esi	# si: b & c
	andl	%edx, %edi	# di: (b | c) & d
	orl	%esi, %edi	# ((b | c) & d) | (b & c)
	addl	%edi, %ebp	# += ((b | c) & d) | (b & c)
	addl	-64+4*7(%rsp), %ebp	# e += RCONST + W[n & 15]
	roll	$5, %esi	# rotl32(a,5)
	addl	%esi, %ebp	# e += rotl32(a,5)
	rorl	$2, %ebx	# b = rotl32(b,30)
# PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp)
	movaps	%xmm3, %xmm4
	psrldq	$4, %xmm4	# rshift by 4 bytes: T1 = ([13],[14],[15],0)
#	pshufd	$0x4e, %xmm0, %xmm5	# 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
#	punpcklqdq %xmm1, %xmm5	# T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
# same result as above, but shorter and faster:
# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
	movaps	%xmm0, %xmm5
	shufps	$0x4e, %xmm1, %xmm5	# 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
	xorps	%xmm2, %xmm0	# ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
	xorps	%xmm4, %xmm5	# ([13],[14],[15],0) ^ ([2],[3],[4],[5])
	xorps	%xmm5, %xmm0	# ^
# W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
	movaps	%xmm0, %xmm5	# keep unrotated W0 for the fixup
	xorps	%xmm4, %xmm4	# rol(W0,1):
	pcmpgtd	%xmm0, %xmm4	#  ffffffff for elements <0 (ones with msb bit 1)
	paddd	%xmm0, %xmm0	#  shift left by 1
	psubd	%xmm4, %xmm0	#  add 1 to those who had msb bit 1
# W0 = rotated (W[0]..W[3]), still needs W[3] fixup
	pslldq	$12, %xmm5	# lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
	movaps	%xmm5, %xmm4
	pslld	$2, %xmm5	# rol(T2,2) computed in two halves:
	psrld	$30, %xmm4
#	xorps	%xmm4, %xmm5	# rol((0,0,0,unrotW[0]),2)
	xorps	%xmm4, %xmm0	# same result, but does not depend on/does not modify T2
	xorps	%xmm5, %xmm0	# W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
	movaps	%xmm0, %xmm5
	paddd	%xmm6, %xmm5	# add RCONST
	movups	%xmm5, -64+16*0(%rsp)
	movl	%eax, %edi	# di: b
	movl	%eax, %esi	# si: b
	orl	%ebx, %edi	# di: b | c
	andl	%ebx, %esi	# si: b & c
	andl	%ecx, %edi	# di: (b | c) & d
	orl	%esi, %edi	# ((b | c) & d) | (b & c)
	addl	%edi, %edx	# += ((b | c) & d) | (b & c)
	addl	-64+4*8(%rsp), %edx	# e += RCONST + W[n & 15]
	roll	$5, %esi	# rotl32(a,5)
	addl	%esi, %edx	# e += rotl32(a,5)
	rorl	$2, %eax	# b = rotl32(b,30)

	movl	%ebp, %edi	# di: b
	movl	%ebp, %esi	# si: b
	orl	%eax, %edi	# di: b | c
	andl	%eax, %esi	# si: b & c
	andl	%ebx, %edi	# di: (b | c) & d
	orl	%esi, %edi	# ((b | c) & d) | (b & c)
	addl	%edi, %ecx	# += ((b | c) & d) | (b & c)
	addl	-64+4*9(%rsp), %ecx	# e += RCONST + W[n & 15]
	roll	$5, %esi	# rotl32(a,5)
	addl	%esi, %ecx	# e += rotl32(a,5)
	rorl	$2, %ebp	# b = rotl32(b,30)

	movl	%edx, %edi	# di: b
	movl	%edx, %esi	# si: b
	orl	%ebp, %edi	# di: b | c
	andl	%ebp, %esi	# si: b & c
	andl	%eax, %edi	# di: (b | c) & d
	orl	%esi, %edi	# ((b | c) & d) | (b & c)
	addl	%edi, %ebx	# += ((b | c) & d) | (b & c)
	addl	-64+4*10(%rsp), %ebx	# e += RCONST + W[n & 15]
	roll	$5, %esi	# rotl32(a,5)
	addl	%esi, %ebx	# e += rotl32(a,5)
	rorl	$2, %edx	# b = rotl32(b,30)

	movl	%ecx, %edi	# di: b
	movl	%ecx, %esi	# si: b
	orl	%edx, %edi	# di: b | c
	andl	%edx, %esi	# si: b & c
	andl	%ebp, %edi	# di: (b | c) & d
	orl	%esi, %edi	# ((b | c) & d) | (b & c)
	addl	%edi, %eax	# += ((b | c) & d) | (b & c)
	addl	-64+4*11(%rsp), %eax	# e += RCONST + W[n & 15]
	roll	$5, %esi	# rotl32(a,5)
	addl	%esi, %eax	# e += rotl32(a,5)
	rorl	$2, %ecx	# b = rotl32(b,30)
# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp)
	movaps	%xmm0, %xmm4
	psrldq	$4, %xmm4	# rshift by 4 bytes: T1 = ([13],[14],[15],0)
#	pshufd	$0x4e, %xmm1, %xmm5	# 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
#	punpcklqdq %xmm2, %xmm5	# T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
# same result as above, but shorter and faster:
# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
	movaps	%xmm1, %xmm5
	shufps	$0x4e, %xmm2, %xmm5	# 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
	xorps	%xmm3, %xmm1	# ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
	xorps	%xmm4, %xmm5	# ([13],[14],[15],0) ^ ([2],[3],[4],[5])
	xorps	%xmm5, %xmm1	# ^
# W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
	movaps	%xmm1, %xmm5	# keep unrotated W0 for the fixup
	xorps	%xmm4, %xmm4	# rol(W0,1):
	pcmpgtd	%xmm1, %xmm4	#  ffffffff for elements <0 (ones with msb bit 1)
	paddd	%xmm1, %xmm1	#  shift left by 1
	psubd	%xmm4, %xmm1	#  add 1 to those who had msb bit 1
# W0 = rotated (W[0]..W[3]), still needs W[3] fixup
	pslldq	$12, %xmm5	# lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
	movaps	%xmm5, %xmm4
	pslld	$2, %xmm5	# rol(T2,2) computed in two halves:
	psrld	$30, %xmm4
#	xorps	%xmm4, %xmm5	# rol((0,0,0,unrotW[0]),2)
	xorps	%xmm4, %xmm1	# same result, but does not depend on/does not modify T2
	xorps	%xmm5, %xmm1	# W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
	movaps	%xmm1, %xmm5
	paddd	%xmm6, %xmm5	# add RCONST
	movups	%xmm5, -64+16*1(%rsp)
	xorl	%edx, %edi	# ^d
	xorl	%ebx, %edi	# ^b
	addl	-64+4*12(%rsp), %ebp	# e += RCONST + W[n & 15]
	addl	%edi, %ebp	# e += (c ^ d ^ b)
	roll	$5, %esi	# rotl32(a,5)
	addl	%esi, %ebp	# e += rotl32(a,5)
	rorl	$2, %ebx	# b = rotl32(b,30)

	xorl	%ecx, %edi	# ^d
	xorl	%eax, %edi	# ^b
	addl	-64+4*13(%rsp), %edx	# e += RCONST + W[n & 15]
	addl	%edi, %edx	# e += (c ^ d ^ b)
	roll	$5, %esi	# rotl32(a,5)
	addl	%esi, %edx	# e += rotl32(a,5)
	rorl	$2, %eax	# b = rotl32(b,30)

	xorl	%ebx, %edi	# ^d
	xorl	%ebp, %edi	# ^b
	addl	-64+4*14(%rsp), %ecx	# e += RCONST + W[n & 15]
	addl	%edi, %ecx	# e += (c ^ d ^ b)
	roll	$5, %esi	# rotl32(a,5)
	addl	%esi, %ecx	# e += rotl32(a,5)
	rorl	$2, %ebp	# b = rotl32(b,30)

	xorl	%eax, %edi	# ^d
	xorl	%edx, %edi	# ^b
	addl	-64+4*15(%rsp), %ebx	# e += RCONST + W[n & 15]
	addl	%edi, %ebx	# e += (c ^ d ^ b)
	roll	$5, %esi	# rotl32(a,5)
	addl	%esi, %ebx	# e += rotl32(a,5)
	rorl	$2, %edx	# b = rotl32(b,30)
# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp)
	movaps	%xmm1, %xmm4
	psrldq	$4, %xmm4	# rshift by 4 bytes: T1 = ([13],[14],[15],0)
#	pshufd	$0x4e, %xmm2, %xmm5	# 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
#	punpcklqdq %xmm3, %xmm5	# T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
# same result as above, but shorter and faster:
# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
	movaps	%xmm2, %xmm5
	shufps	$0x4e, %xmm3, %xmm5	# 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
	xorps	%xmm0, %xmm2	# ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
	xorps	%xmm4, %xmm5	# ([13],[14],[15],0) ^ ([2],[3],[4],[5])
	xorps	%xmm5, %xmm2	# ^
# W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
	movaps	%xmm2, %xmm5	# keep unrotated W0 for the fixup
	xorps	%xmm4, %xmm4	# rol(W0,1):
	pcmpgtd	%xmm2, %xmm4	#  ffffffff for elements <0 (ones with msb bit 1)
	paddd	%xmm2, %xmm2	#  shift left by 1
	psubd	%xmm4, %xmm2	#  add 1 to those who had msb bit 1
# W0 = rotated (W[0]..W[3]), still needs W[3] fixup
	pslldq	$12, %xmm5	# lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
	movaps	%xmm5, %xmm4
	pslld	$2, %xmm5	# rol(T2,2) computed in two halves:
	psrld	$30, %xmm4
#	xorps	%xmm4, %xmm5	# rol((0,0,0,unrotW[0]),2)
	xorps	%xmm4, %xmm2	# same result, but does not depend on/does not modify T2
	xorps	%xmm5, %xmm2	# W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
	movaps	%xmm2, %xmm5
	paddd	%xmm6, %xmm5	# add RCONST
	movups	%xmm5, -64+16*2(%rsp)
	xorl	%ebp, %edi	# ^d
	xorl	%ecx, %edi	# ^b
	addl	-64+4*0(%rsp), %eax	# e += RCONST + W[n & 15]
	addl	%edi, %eax	# e += (c ^ d ^ b)
	roll	$5, %esi	# rotl32(a,5)
	addl	%esi, %eax	# e += rotl32(a,5)
	rorl	$2, %ecx	# b = rotl32(b,30)

	xorl	%edx, %edi	# ^d
	xorl	%ebx, %edi	# ^b
	addl	-64+4*1(%rsp), %ebp	# e += RCONST + W[n & 15]
	addl	%edi, %ebp	# e += (c ^ d ^ b)
	roll	$5, %esi	# rotl32(a,5)
	addl	%esi, %ebp	# e += rotl32(a,5)
	rorl	$2, %ebx	# b = rotl32(b,30)

	xorl	%ecx, %edi	# ^d
	xorl	%eax, %edi	# ^b
	addl	-64+4*2(%rsp), %edx	# e += RCONST + W[n & 15]
	addl	%edi, %edx	# e += (c ^ d ^ b)
	roll	$5, %esi	# rotl32(a,5)
	addl	%esi, %edx	# e += rotl32(a,5)
	rorl	$2, %eax	# b = rotl32(b,30)

	xorl	%ebx, %edi	# ^d
	xorl	%ebp, %edi	# ^b
	addl	-64+4*3(%rsp), %ecx	# e += RCONST + W[n & 15]
	addl	%edi, %ecx	# e += (c ^ d ^ b)
	roll	$5, %esi	# rotl32(a,5)
	addl	%esi, %ecx	# e += rotl32(a,5)
	rorl	$2, %ebp	# b = rotl32(b,30)
# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp)
	movaps	%xmm2, %xmm4
	psrldq	$4, %xmm4	# rshift by 4 bytes: T1 = ([13],[14],[15],0)
#	pshufd	$0x4e, %xmm3, %xmm5	# 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
#	punpcklqdq %xmm0, %xmm5	# T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
# same result as above, but shorter and faster:
# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
	movaps	%xmm3, %xmm5
	shufps	$0x4e, %xmm0, %xmm5	# 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
	xorps	%xmm1, %xmm3	# ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
	xorps	%xmm4, %xmm5	# ([13],[14],[15],0) ^ ([2],[3],[4],[5])
	xorps	%xmm5, %xmm3	# ^
# W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
	movaps	%xmm3, %xmm5	# keep unrotated W0 for the fixup
	xorps	%xmm4, %xmm4	# rol(W0,1):
	pcmpgtd	%xmm3, %xmm4	#  ffffffff for elements <0 (ones with msb bit 1)
	paddd	%xmm3, %xmm3	#  shift left by 1
	psubd	%xmm4, %xmm3	#  add 1 to those who had msb bit 1
# W0 = rotated (W[0]..W[3]), still needs W[3] fixup
	pslldq	$12, %xmm5	# lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
	movaps	%xmm5, %xmm4
	pslld	$2, %xmm5	# rol(T2,2) computed in two halves:
	psrld	$30, %xmm4
#	xorps	%xmm4, %xmm5	# rol((0,0,0,unrotW[0]),2)
	xorps	%xmm4, %xmm3	# same result, but does not depend on/does not modify T2
	xorps	%xmm5, %xmm3	# W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
	movaps	%xmm3, %xmm5
	paddd	%xmm6, %xmm5	# add RCONST
	movups	%xmm5, -64+16*3(%rsp)
	xorl	%eax, %edi	# ^d
	xorl	%edx, %edi	# ^b
	addl	-64+4*4(%rsp), %ebx	# e += RCONST + W[n & 15]
	addl	%edi, %ebx	# e += (c ^ d ^ b)
	roll	$5, %esi	# rotl32(a,5)
	addl	%esi, %ebx	# e += rotl32(a,5)
	rorl	$2, %edx	# b = rotl32(b,30)

	xorl	%ebp, %edi	# ^d
	xorl	%ecx, %edi	# ^b
	addl	-64+4*5(%rsp), %eax	# e += RCONST + W[n & 15]
	addl	%edi, %eax	# e += (c ^ d ^ b)
	roll	$5, %esi	# rotl32(a,5)
	addl	%esi, %eax	# e += rotl32(a,5)
	rorl	$2, %ecx	# b = rotl32(b,30)

	xorl	%edx, %edi	# ^d
	xorl	%ebx, %edi	# ^b
	addl	-64+4*6(%rsp), %ebp	# e += RCONST + W[n & 15]
	addl	%edi, %ebp	# e += (c ^ d ^ b)
	roll	$5, %esi	# rotl32(a,5)
	addl	%esi, %ebp	# e += rotl32(a,5)
	rorl	$2, %ebx	# b = rotl32(b,30)

	xorl	%ecx, %edi	# ^d
	xorl	%eax, %edi	# ^b
	addl	-64+4*7(%rsp), %edx	# e += RCONST + W[n & 15]
	addl	%edi, %edx	# e += (c ^ d ^ b)
	roll	$5, %esi	# rotl32(a,5)
	addl	%esi, %edx	# e += rotl32(a,5)
	rorl	$2, %eax	# b = rotl32(b,30)

	xorl	%ebx, %edi	# ^d
	xorl	%ebp, %edi	# ^b
	addl	-64+4*8(%rsp), %ecx	# e += RCONST + W[n & 15]
	addl	%edi, %ecx	# e += (c ^ d ^ b)
	roll	$5, %esi	# rotl32(a,5)
	addl	%esi, %ecx	# e += rotl32(a,5)
	rorl	$2, %ebp	# b = rotl32(b,30)

	xorl	%eax, %edi	# ^d
	xorl	%edx, %edi	# ^b
	addl	-64+4*9(%rsp), %ebx	# e += RCONST + W[n & 15]
	addl	%edi, %ebx	# e += (c ^ d ^ b)
	roll	$5, %esi	# rotl32(a,5)
	addl	%esi, %ebx	# e += rotl32(a,5)
	rorl	$2, %edx	# b = rotl32(b,30)

	xorl	%ebp, %edi	# ^d
	xorl	%ecx, %edi	# ^b
	addl	-64+4*10(%rsp), %eax	# e += RCONST + W[n & 15]
	addl	%edi, %eax	# e += (c ^ d ^ b)
	roll	$5, %esi	# rotl32(a,5)
	addl	%esi, %eax	# e += rotl32(a,5)
	rorl	$2, %ecx	# b = rotl32(b,30)

	xorl	%edx, %edi	# ^d
	xorl	%ebx, %edi	# ^b
	addl	-64+4*11(%rsp), %ebp	# e += RCONST + W[n & 15]
	addl	%edi, %ebp	# e += (c ^ d ^ b)
	roll	$5, %esi	# rotl32(a,5)
	addl	%esi, %ebp	# e += rotl32(a,5)
	rorl	$2, %ebx	# b = rotl32(b,30)

	xorl	%ecx, %edi	# ^d
	xorl	%eax, %edi	# ^b
	addl	-64+4*12(%rsp), %edx	# e += RCONST + W[n & 15]
	addl	%edi, %edx	# e += (c ^ d ^ b)
	roll	$5, %esi	# rotl32(a,5)
	addl	%esi, %edx	# e += rotl32(a,5)
	rorl	$2, %eax	# b = rotl32(b,30)

	xorl	%ebx, %edi	# ^d
	xorl	%ebp, %edi	# ^b
	addl	-64+4*13(%rsp), %ecx	# e += RCONST + W[n & 15]
	addl	%edi, %ecx	# e += (c ^ d ^ b)
	roll	$5, %esi	# rotl32(a,5)
	addl	%esi, %ecx	# e += rotl32(a,5)
	rorl	$2, %ebp	# b = rotl32(b,30)

	xorl	%eax, %edi	# ^d
	xorl	%edx, %edi	# ^b
	addl	-64+4*14(%rsp), %ebx	# e += RCONST + W[n & 15]
	addl	%edi, %ebx	# e += (c ^ d ^ b)
	roll	$5, %esi	# rotl32(a,5)
	addl	%esi, %ebx	# e += rotl32(a,5)
	rorl	$2, %edx	# b = rotl32(b,30)

	xorl	%ebp, %edi	# ^d
	xorl	%ecx, %edi	# ^b
	addl	-64+4*15(%rsp), %eax	# e += RCONST + W[n & 15]
	addl	%edi, %eax	# e += (c ^ d ^ b)
	roll	$5, %esi	# rotl32(a,5)
	addl	%esi, %eax	# e += rotl32(a,5)
	rorl	$2, %ecx	# b = rotl32(b,30)
	popq	%rdi	# restore ctx
	addl	%eax, 80(%rdi)	# ctx->hash[0] += a
	addl	%ebx, 84(%rdi)	# ctx->hash[1] += b
	addl	%ecx, 88(%rdi)	# ctx->hash[2] += c
	addl	%edx, 92(%rdi)	# ctx->hash[3] += d
	addl	%ebp, 96(%rdi)	# ctx->hash[4] += e

	popq	%r12
	popq	%r13
	popq	%r14
#	popq	%r15
	popq	%rbx
	popq	%rbp
	ret
	.size	sha1_process_block64, .-sha1_process_block64

	.section	.rodata.cst16.sha1const, "aM", @progbits, 16
	.balign	16
sha1const:
# SHA-1 round constants, one per 20-round group; dword 0 is the
# 0x5A827999 used by the leal instructions above, the rest are the
# standard K values broadcast by pshufd $0x55/$0xaa/$0xff.
	.long	0x5A827999
	.long	0x6ED9EBA1
	.long	0x8F1BBCDC
	.long	0xCA62C1D6

#endif