libbb/hash_sha1_x86-64.S

   1 ### Generated by hash_sha1_x86-64.S.sh ###
   2
   3 #if CONFIG_SHA1_SMALL == 0 && defined(__GNUC__) && defined(__x86_64__)
   4 #ifdef __linux__
   5         .section        .note.GNU-stack, "", @progbits
   6 #endif
   7         .section        .text.sha1_process_block64, "ax", @progbits
   8         .globl  sha1_process_block64
   9         .hidden sha1_process_block64
  10         .type   sha1_process_block64, @function
  11
  12         .balign 8       # allow decoders to fetch at least 5 first insns
  13 sha1_process_block64:
  14         pushq   %rbp    # 1 byte insn
  15         pushq   %rbx    # 1 byte insn
  16 #       pushq   %r15    # 2 byte insn
  17         pushq   %r14    # 2 byte insn
  18         pushq   %r13    # 2 byte insn
  19         pushq   %r12    # 2 byte insn
  20         pushq   %rdi    # we need ctx at the end
  21
  22 #Register and stack use:
  23 # eax..edx: a..d
  24 # ebp: e
  25 # esi,edi,r8..r14: temps
  26 # r15: unused
  27 # xmm0..xmm3: W[]
  28 # xmm4,xmm5: temps
  29 # xmm6: current round constant
  30 # xmm7: all round constants
  31 # -64(%rsp): 64-byte scratch area just below %rsp (relies on the SysV red zone) for passing RCONST + W[] from vector to integer units
  32
  33         movl    80(%rdi), %eax          # a = ctx->hash[0]
  34         movl    84(%rdi), %ebx          # b = ctx->hash[1]
  35         movl    88(%rdi), %ecx          # c = ctx->hash[2]
  36         movl    92(%rdi), %edx          # d = ctx->hash[3]
  37         movl    96(%rdi), %ebp          # e = ctx->hash[4]
  38
  39         movaps  sha1const(%rip), %xmm7
  40         pshufd  $0x00, %xmm7, %xmm6
  41
  42         # Load W[] to xmm0..3, byteswapping on the fly.
  43         #
  44         # For iterations 0..15, we pass W[] in rsi,r8..r14
  45         # for use in RD1As instead of spilling them to the stack.
  46         # We lose the parallelized addition of RCONST, but LEA
  47         # can do two additions at once, so it is probably a wash.
  48         # (We use rsi instead of rN because this makes the
  49         # LEAs in the first two RD1As shorter by one byte).
  50         movq    4*0(%rdi), %rsi
  51         movq    4*2(%rdi), %r8
  52         bswapq  %rsi
  53         bswapq  %r8
  54         rolq    $32, %rsi               # rsi = W[1]:W[0]
  55         rolq    $32, %r8                # r8  = W[3]:W[2]
  56         movq    %rsi, %xmm0
  57         movq    %r8, %xmm4
  58         punpcklqdq %xmm4, %xmm0 # xmm0 = r8:rsi = (W[0],W[1],W[2],W[3])
  59 #       movaps  %xmm0, %xmm4            # add RCONST, spill to stack
  60 #       paddd   %xmm6, %xmm4
  61 #       movups  %xmm4, -64+16*0(%rsp)
  62
  63         movq    4*4(%rdi), %r9
  64         movq    4*6(%rdi), %r10
  65         bswapq  %r9
  66         bswapq  %r10
  67         rolq    $32, %r9                # r9  = W[5]:W[4]
  68         rolq    $32, %r10               # r10 = W[7]:W[6]
  69         movq    %r9, %xmm1
  70         movq    %r10, %xmm4
  71         punpcklqdq %xmm4, %xmm1 # xmm1 = r10:r9 = (W[4],W[5],W[6],W[7])
  72
  73         movq    4*8(%rdi), %r11
  74         movq    4*10(%rdi), %r12
  75         bswapq  %r11
  76         bswapq  %r12
  77         rolq    $32, %r11               # r11 = W[9]:W[8]
  78         rolq    $32, %r12               # r12 = W[11]:W[10]
  79         movq    %r11, %xmm2
  80         movq    %r12, %xmm4
  81         punpcklqdq %xmm4, %xmm2 # xmm2 = r12:r11 = (W[8],W[9],W[10],W[11])
  82
  83         movq    4*12(%rdi), %r13
  84         movq    4*14(%rdi), %r14
  85         bswapq  %r13
  86         bswapq  %r14
  87         rolq    $32, %r13               # r13 = W[13]:W[12]
  88         rolq    $32, %r14               # r14 = W[15]:W[14]
  89         movq    %r13, %xmm3
  90         movq    %r14, %xmm4
  91         punpcklqdq %xmm4, %xmm3 # xmm3 = r14:r13 = (W[12],W[13],W[14],W[15])
  92
  93 # 0
  94         leal    0x5A827999(%rbp,%rsi), %ebp # e += RCONST + W[n]
  95         shrq    $32, %rsi
  96         movl    %ecx, %edi              # c
  97         xorl    %edx, %edi              # ^d
  98         andl    %ebx, %edi              # &b
  99         xorl    %edx, %edi              # (((c ^ d) & b) ^ d)
 100         addl    %edi, %ebp              # e += (((c ^ d) & b) ^ d)
 101         movl    %eax, %edi              #
 102         roll    $5, %edi                # rotl32(a,5)
 103         addl    %edi, %ebp              # e += rotl32(a,5)
 104         rorl    $2, %ebx                # b = rotl32(b,30)
 105 # 1
 106         leal    0x5A827999(%rdx,%rsi), %edx # e += RCONST + W[n]
 107         movl    %ebx, %edi              # c
 108         xorl    %ecx, %edi              # ^d
 109         andl    %eax, %edi              # &b
 110         xorl    %ecx, %edi              # (((c ^ d) & b) ^ d)
 111         addl    %edi, %edx              # e += (((c ^ d) & b) ^ d)
 112         movl    %ebp, %edi              #
 113         roll    $5, %edi                # rotl32(a,5)
 114         addl    %edi, %edx              # e += rotl32(a,5)
 115         rorl    $2, %eax                # b = rotl32(b,30)
 116 # 2
 117         leal    0x5A827999(%rcx,%r8), %ecx # e += RCONST + W[n]
 118         shrq    $32, %r8
 119         movl    %eax, %edi              # c
 120         xorl    %ebx, %edi              # ^d
 121         andl    %ebp, %edi              # &b
 122         xorl    %ebx, %edi              # (((c ^ d) & b) ^ d)
 123         addl    %edi, %ecx              # e += (((c ^ d) & b) ^ d)
 124         movl    %edx, %edi              #
 125         roll    $5, %edi                # rotl32(a,5)
 126         addl    %edi, %ecx              # e += rotl32(a,5)
 127         rorl    $2, %ebp                # b = rotl32(b,30)
 128 # 3
 129         leal    0x5A827999(%rbx,%r8), %ebx # e += RCONST + W[n]
 130         movl    %ebp, %edi              # c
 131         xorl    %eax, %edi              # ^d
 132         andl    %edx, %edi              # &b
 133         xorl    %eax, %edi              # (((c ^ d) & b) ^ d)
 134         addl    %edi, %ebx              # e += (((c ^ d) & b) ^ d)
 135         movl    %ecx, %edi              #
 136         roll    $5, %edi                # rotl32(a,5)
 137         addl    %edi, %ebx              # e += rotl32(a,5)
 138         rorl    $2, %edx                # b = rotl32(b,30)
 139 # 4
 140         leal    0x5A827999(%rax,%r9), %eax # e += RCONST + W[n]
 141         shrq    $32, %r9
 142         movl    %edx, %edi              # c
 143         xorl    %ebp, %edi              # ^d
 144         andl    %ecx, %edi              # &b
 145         xorl    %ebp, %edi              # (((c ^ d) & b) ^ d)
 146         addl    %edi, %eax              # e += (((c ^ d) & b) ^ d)
 147         movl    %ebx, %edi              #
 148         roll    $5, %edi                # rotl32(a,5)
 149         addl    %edi, %eax              # e += rotl32(a,5)
 150         rorl    $2, %ecx                # b = rotl32(b,30)
 151 # 5
 152         leal    0x5A827999(%rbp,%r9), %ebp # e += RCONST + W[n]
 153         movl    %ecx, %edi              # c
 154         xorl    %edx, %edi              # ^d
 155         andl    %ebx, %edi              # &b
 156         xorl    %edx, %edi              # (((c ^ d) & b) ^ d)
 157         addl    %edi, %ebp              # e += (((c ^ d) & b) ^ d)
 158         movl    %eax, %edi              #
 159         roll    $5, %edi                # rotl32(a,5)
 160         addl    %edi, %ebp              # e += rotl32(a,5)
 161         rorl    $2, %ebx                # b = rotl32(b,30)
 162 # 6
 163         leal    0x5A827999(%rdx,%r10), %edx # e += RCONST + W[n]
 164         shrq    $32, %r10
 165         movl    %ebx, %edi              # c
 166         xorl    %ecx, %edi              # ^d
 167         andl    %eax, %edi              # &b
 168         xorl    %ecx, %edi              # (((c ^ d) & b) ^ d)
 169         addl    %edi, %edx              # e += (((c ^ d) & b) ^ d)
 170         movl    %ebp, %edi              #
 171         roll    $5, %edi                # rotl32(a,5)
 172         addl    %edi, %edx              # e += rotl32(a,5)
 173         rorl    $2, %eax                # b = rotl32(b,30)
 174 # 7
 175         leal    0x5A827999(%rcx,%r10), %ecx # e += RCONST + W[n]
 176         movl    %eax, %edi              # c
 177         xorl    %ebx, %edi              # ^d
 178         andl    %ebp, %edi              # &b
 179         xorl    %ebx, %edi              # (((c ^ d) & b) ^ d)
 180         addl    %edi, %ecx              # e += (((c ^ d) & b) ^ d)
 181         movl    %edx, %edi              #
 182         roll    $5, %edi                # rotl32(a,5)
 183         addl    %edi, %ecx              # e += rotl32(a,5)
 184         rorl    $2, %ebp                # b = rotl32(b,30)
 185 # PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp)
 186         movaps  %xmm3, %xmm4
 187         psrldq  $4, %xmm4       # rshift by 4 bytes: T1 = ([13],[14],[15],0)
 188 #       pshufd  $0x4e, %xmm0, %xmm5     # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
 189 #       punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
 190 # same result as above, but shorter and faster:
 191 # pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
 192 # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
 193         movaps  %xmm0, %xmm5
 194         shufps  $0x4e, %xmm1, %xmm5     # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
 195         xorps   %xmm2, %xmm0    # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
 196         xorps   %xmm4, %xmm5    # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
 197         xorps   %xmm5, %xmm0    # ^
 198         # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
 199         movaps  %xmm0, %xmm5
 200         xorps   %xmm4, %xmm4    # rol(W0,1):
 201         pcmpgtd %xmm0, %xmm4    #  ffffffff for elements <0 (ones with msb bit 1)
 202         paddd   %xmm0, %xmm0    #  shift left by 1
 203         psubd   %xmm4, %xmm0    #  add 1 to those who had msb bit 1
 204         # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
 205         pslldq  $12, %xmm5      # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
 206         movaps  %xmm5, %xmm4
 207         pslld   $2, %xmm5
 208         psrld   $30, %xmm4
 209 #       xorps   %xmm4, %xmm5    # rol((0,0,0,unrotW[0]),2)
 210         xorps   %xmm4, %xmm0    # same result, but does not depend on/does not modify T2
 211         xorps   %xmm5, %xmm0    # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
 212         movaps  %xmm0, %xmm5
 213         paddd   %xmm6, %xmm5
 214         movups  %xmm5, -64+16*0(%rsp)
 215 # 8
 216         leal    0x5A827999(%rbx,%r11), %ebx # e += RCONST + W[n]
 217         shrq    $32, %r11
 218         movl    %ebp, %edi              # c
 219         xorl    %eax, %edi              # ^d
 220         andl    %edx, %edi              # &b
 221         xorl    %eax, %edi              # (((c ^ d) & b) ^ d)
 222         addl    %edi, %ebx              # e += (((c ^ d) & b) ^ d)
 223         movl    %ecx, %edi              #
 224         roll    $5, %edi                # rotl32(a,5)
 225         addl    %edi, %ebx              # e += rotl32(a,5)
 226         rorl    $2, %edx                # b = rotl32(b,30)
 227 # 9
 228         leal    0x5A827999(%rax,%r11), %eax # e += RCONST + W[n]
 229         movl    %edx, %edi              # c
 230         xorl    %ebp, %edi              # ^d
 231         andl    %ecx, %edi              # &b
 232         xorl    %ebp, %edi              # (((c ^ d) & b) ^ d)
 233         addl    %edi, %eax              # e += (((c ^ d) & b) ^ d)
 234         movl    %ebx, %edi              #
 235         roll    $5, %edi                # rotl32(a,5)
 236         addl    %edi, %eax              # e += rotl32(a,5)
 237         rorl    $2, %ecx                # b = rotl32(b,30)
 238 # 10
 239         leal    0x5A827999(%rbp,%r12), %ebp # e += RCONST + W[n]
 240         shrq    $32, %r12
 241         movl    %ecx, %edi              # c
 242         xorl    %edx, %edi              # ^d
 243         andl    %ebx, %edi              # &b
 244         xorl    %edx, %edi              # (((c ^ d) & b) ^ d)
 245         addl    %edi, %ebp              # e += (((c ^ d) & b) ^ d)
 246         movl    %eax, %edi              #
 247         roll    $5, %edi                # rotl32(a,5)
 248         addl    %edi, %ebp              # e += rotl32(a,5)
 249         rorl    $2, %ebx                # b = rotl32(b,30)
 250 # 11
 251         leal    0x5A827999(%rdx,%r12), %edx # e += RCONST + W[n]
 252         movl    %ebx, %edi              # c
 253         xorl    %ecx, %edi              # ^d
 254         andl    %eax, %edi              # &b
 255         xorl    %ecx, %edi              # (((c ^ d) & b) ^ d)
 256         addl    %edi, %edx              # e += (((c ^ d) & b) ^ d)
 257         movl    %ebp, %edi              #
 258         roll    $5, %edi                # rotl32(a,5)
 259         addl    %edi, %edx              # e += rotl32(a,5)
 260         rorl    $2, %eax                # b = rotl32(b,30)
 261         pshufd  $0x55, %xmm7, %xmm6
 262 # PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp)
 263         movaps  %xmm0, %xmm4
 264         psrldq  $4, %xmm4       # rshift by 4 bytes: T1 = ([13],[14],[15],0)
 265 #       pshufd  $0x4e, %xmm1, %xmm5     # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
 266 #       punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
 267 # same result as above, but shorter and faster:
 268 # pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
 269 # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
 270         movaps  %xmm1, %xmm5
 271         shufps  $0x4e, %xmm2, %xmm5     # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
 272         xorps   %xmm3, %xmm1    # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
 273         xorps   %xmm4, %xmm5    # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
 274         xorps   %xmm5, %xmm1    # ^
 275         # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
 276         movaps  %xmm1, %xmm5
 277         xorps   %xmm4, %xmm4    # rol(W0,1):
 278         pcmpgtd %xmm1, %xmm4    #  ffffffff for elements <0 (ones with msb bit 1)
 279         paddd   %xmm1, %xmm1    #  shift left by 1
 280         psubd   %xmm4, %xmm1    #  add 1 to those who had msb bit 1
 281         # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
 282         pslldq  $12, %xmm5      # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
 283         movaps  %xmm5, %xmm4
 284         pslld   $2, %xmm5
 285         psrld   $30, %xmm4
 286 #       xorps   %xmm4, %xmm5    # rol((0,0,0,unrotW[0]),2)
 287         xorps   %xmm4, %xmm1    # same result, but does not depend on/does not modify T2
 288         xorps   %xmm5, %xmm1    # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
 289         movaps  %xmm1, %xmm5
 290         paddd   %xmm6, %xmm5
 291         movups  %xmm5, -64+16*1(%rsp)
 292 # 12
 293         leal    0x5A827999(%rcx,%r13), %ecx # e += RCONST + W[n]
 294         shrq    $32, %r13
 295         movl    %eax, %edi              # c
 296         xorl    %ebx, %edi              # ^d
 297         andl    %ebp, %edi              # &b
 298         xorl    %ebx, %edi              # (((c ^ d) & b) ^ d)
 299         addl    %edi, %ecx              # e += (((c ^ d) & b) ^ d)
 300         movl    %edx, %edi              #
 301         roll    $5, %edi                # rotl32(a,5)
 302         addl    %edi, %ecx              # e += rotl32(a,5)
 303         rorl    $2, %ebp                # b = rotl32(b,30)
 304 # 13
 305         leal    0x5A827999(%rbx,%r13), %ebx # e += RCONST + W[n]
 306         movl    %ebp, %edi              # c
 307         xorl    %eax, %edi              # ^d
 308         andl    %edx, %edi              # &b
 309         xorl    %eax, %edi              # (((c ^ d) & b) ^ d)
 310         addl    %edi, %ebx              # e += (((c ^ d) & b) ^ d)
 311         movl    %ecx, %edi              #
 312         roll    $5, %edi                # rotl32(a,5)
 313         addl    %edi, %ebx              # e += rotl32(a,5)
 314         rorl    $2, %edx                # b = rotl32(b,30)
 315 # 14
 316         leal    0x5A827999(%rax,%r14), %eax # e += RCONST + W[n]
 317         shrq    $32, %r14
 318         movl    %edx, %edi              # c
 319         xorl    %ebp, %edi              # ^d
 320         andl    %ecx, %edi              # &b
 321         xorl    %ebp, %edi              # (((c ^ d) & b) ^ d)
 322         addl    %edi, %eax              # e += (((c ^ d) & b) ^ d)
 323         movl    %ebx, %edi              #
 324         roll    $5, %edi                # rotl32(a,5)
 325         addl    %edi, %eax              # e += rotl32(a,5)
 326         rorl    $2, %ecx                # b = rotl32(b,30)
 327 # 15
 328         leal    0x5A827999(%rbp,%r14), %ebp # e += RCONST + W[n]
 329         movl    %ecx, %edi              # c
 330         xorl    %edx, %edi              # ^d
 331         andl    %ebx, %edi              # &b
 332         xorl    %edx, %edi              # (((c ^ d) & b) ^ d)
 333         addl    %edi, %ebp              # e += (((c ^ d) & b) ^ d)
 334         movl    %eax, %edi              #
 335         roll    $5, %edi                # rotl32(a,5)
 336         addl    %edi, %ebp              # e += rotl32(a,5)
 337         rorl    $2, %ebx                # b = rotl32(b,30)
 338 # PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp)
 339         movaps  %xmm1, %xmm4
 340         psrldq  $4, %xmm4       # rshift by 4 bytes: T1 = ([13],[14],[15],0)
 341 #       pshufd  $0x4e, %xmm2, %xmm5     # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
 342 #       punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
 343 # same result as above, but shorter and faster:
 344 # pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
 345 # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
 346         movaps  %xmm2, %xmm5
 347         shufps  $0x4e, %xmm3, %xmm5     # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
 348         xorps   %xmm0, %xmm2    # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
 349         xorps   %xmm4, %xmm5    # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
 350         xorps   %xmm5, %xmm2    # ^
 351         # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
 352         movaps  %xmm2, %xmm5
 353         xorps   %xmm4, %xmm4    # rol(W0,1):
 354         pcmpgtd %xmm2, %xmm4    #  ffffffff for elements <0 (ones with msb bit 1)
 355         paddd   %xmm2, %xmm2    #  shift left by 1
 356         psubd   %xmm4, %xmm2    #  add 1 to those who had msb bit 1
 357         # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
 358         pslldq  $12, %xmm5      # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
 359         movaps  %xmm5, %xmm4
 360         pslld   $2, %xmm5
 361         psrld   $30, %xmm4
 362 #       xorps   %xmm4, %xmm5    # rol((0,0,0,unrotW[0]),2)
 363         xorps   %xmm4, %xmm2    # same result, but does not depend on/does not modify T2
 364         xorps   %xmm5, %xmm2    # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
 365         movaps  %xmm2, %xmm5
 366         paddd   %xmm6, %xmm5
 367         movups  %xmm5, -64+16*2(%rsp)
 368 # 16
 369         movl    %ebx, %edi              # c
 370         xorl    %ecx, %edi              # ^d
 371         andl    %eax, %edi              # &b
 372         xorl    %ecx, %edi              # (((c ^ d) & b) ^ d)
 373         addl    -64+4*0(%rsp), %edx     # e += RCONST + W[n & 15]
 374         addl    %edi, %edx              # e += (((c ^ d) & b) ^ d)
 375         movl    %ebp, %esi              #
 376         roll    $5, %esi                # rotl32(a,5)
 377         addl    %esi, %edx              # e += rotl32(a,5)
 378         rorl    $2, %eax                # b = rotl32(b,30)
 379 # 17
 380         movl    %eax, %edi              # c
 381         xorl    %ebx, %edi              # ^d
 382         andl    %ebp, %edi              # &b
 383         xorl    %ebx, %edi              # (((c ^ d) & b) ^ d)
 384         addl    -64+4*1(%rsp), %ecx     # e += RCONST + W[n & 15]
 385         addl    %edi, %ecx              # e += (((c ^ d) & b) ^ d)
 386         movl    %edx, %esi              #
 387         roll    $5, %esi                # rotl32(a,5)
 388         addl    %esi, %ecx              # e += rotl32(a,5)
 389         rorl    $2, %ebp                # b = rotl32(b,30)
 390 # 18
 391         movl    %ebp, %edi              # c
 392         xorl    %eax, %edi              # ^d
 393         andl    %edx, %edi              # &b
 394         xorl    %eax, %edi              # (((c ^ d) & b) ^ d)
 395         addl    -64+4*2(%rsp), %ebx     # e += RCONST + W[n & 15]
 396         addl    %edi, %ebx              # e += (((c ^ d) & b) ^ d)
 397         movl    %ecx, %esi              #
 398         roll    $5, %esi                # rotl32(a,5)
 399         addl    %esi, %ebx              # e += rotl32(a,5)
 400         rorl    $2, %edx                # b = rotl32(b,30)
 401 # 19
 402         movl    %edx, %edi              # c
 403         xorl    %ebp, %edi              # ^d
 404         andl    %ecx, %edi              # &b
 405         xorl    %ebp, %edi              # (((c ^ d) & b) ^ d)
 406         addl    -64+4*3(%rsp), %eax     # e += RCONST + W[n & 15]
 407         addl    %edi, %eax              # e += (((c ^ d) & b) ^ d)
 408         movl    %ebx, %esi              #
 409         roll    $5, %esi                # rotl32(a,5)
 410         addl    %esi, %eax              # e += rotl32(a,5)
 411         rorl    $2, %ecx                # b = rotl32(b,30)
 412 # PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp)
 413         movaps  %xmm2, %xmm4
 414         psrldq  $4, %xmm4       # rshift by 4 bytes: T1 = ([13],[14],[15],0)
 415 #       pshufd  $0x4e, %xmm3, %xmm5     # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
 416 #       punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
 417 # same result as above, but shorter and faster:
 418 # pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
 419 # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
 420         movaps  %xmm3, %xmm5
 421         shufps  $0x4e, %xmm0, %xmm5     # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
 422         xorps   %xmm1, %xmm3    # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
 423         xorps   %xmm4, %xmm5    # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
 424         xorps   %xmm5, %xmm3    # ^
 425         # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
 426         movaps  %xmm3, %xmm5
 427         xorps   %xmm4, %xmm4    # rol(W0,1):
 428         pcmpgtd %xmm3, %xmm4    #  ffffffff for elements <0 (ones with msb bit 1)
 429         paddd   %xmm3, %xmm3    #  shift left by 1
 430         psubd   %xmm4, %xmm3    #  add 1 to those who had msb bit 1
 431         # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
 432         pslldq  $12, %xmm5      # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
 433         movaps  %xmm5, %xmm4
 434         pslld   $2, %xmm5
 435         psrld   $30, %xmm4
 436 #       xorps   %xmm4, %xmm5    # rol((0,0,0,unrotW[0]),2)
 437         xorps   %xmm4, %xmm3    # same result, but does not depend on/does not modify T2
 438         xorps   %xmm5, %xmm3    # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
 439         movaps  %xmm3, %xmm5
 440         paddd   %xmm6, %xmm5
 441         movups  %xmm5, -64+16*3(%rsp)
 442 # 20
 443         movl    %ecx, %edi              # c
 444         xorl    %edx, %edi              # ^d
 445         xorl    %ebx, %edi              # ^b
 446         addl    -64+4*4(%rsp), %ebp     # e += RCONST + W[n & 15]
 447         addl    %edi, %ebp              # e += (c ^ d ^ b)
 448         movl    %eax, %esi              #
 449         roll    $5, %esi                # rotl32(a,5)
 450         addl    %esi, %ebp              # e += rotl32(a,5)
 451         rorl    $2, %ebx                # b = rotl32(b,30)
 452 # 21
 453         movl    %ebx, %edi              # c
 454         xorl    %ecx, %edi              # ^d
 455         xorl    %eax, %edi              # ^b
 456         addl    -64+4*5(%rsp), %edx     # e += RCONST + W[n & 15]
 457         addl    %edi, %edx              # e += (c ^ d ^ b)
 458         movl    %ebp, %esi              #
 459         roll    $5, %esi                # rotl32(a,5)
 460         addl    %esi, %edx              # e += rotl32(a,5)
 461         rorl    $2, %eax                # b = rotl32(b,30)
 462 # 22
 463         movl    %eax, %edi              # c
 464         xorl    %ebx, %edi              # ^d
 465         xorl    %ebp, %edi              # ^b
 466         addl    -64+4*6(%rsp), %ecx     # e += RCONST + W[n & 15]
 467         addl    %edi, %ecx              # e += (c ^ d ^ b)
 468         movl    %edx, %esi              #
 469         roll    $5, %esi                # rotl32(a,5)
 470         addl    %esi, %ecx              # e += rotl32(a,5)
 471         rorl    $2, %ebp                # b = rotl32(b,30)
 472 # 23
 473         movl    %ebp, %edi              # c
 474         xorl    %eax, %edi              # ^d
 475         xorl    %edx, %edi              # ^b
 476         addl    -64+4*7(%rsp), %ebx     # e += RCONST + W[n & 15]
 477         addl    %edi, %ebx              # e += (c ^ d ^ b)
 478         movl    %ecx, %esi              #
 479         roll    $5, %esi                # rotl32(a,5)
 480         addl    %esi, %ebx              # e += rotl32(a,5)
 481         rorl    $2, %edx                # b = rotl32(b,30)
 482 # PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp)
 483         movaps  %xmm3, %xmm4
 484         psrldq  $4, %xmm4       # rshift by 4 bytes: T1 = ([13],[14],[15],0)
 485 #       pshufd  $0x4e, %xmm0, %xmm5     # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
 486 #       punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
 487 # same result as above, but shorter and faster:
 488 # pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
 489 # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
 490         movaps  %xmm0, %xmm5
 491         shufps  $0x4e, %xmm1, %xmm5     # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
 492         xorps   %xmm2, %xmm0    # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
 493         xorps   %xmm4, %xmm5    # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
 494         xorps   %xmm5, %xmm0    # ^
 495         # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
 496         movaps  %xmm0, %xmm5
 497         xorps   %xmm4, %xmm4    # rol(W0,1):
 498         pcmpgtd %xmm0, %xmm4    #  ffffffff for elements <0 (ones with msb bit 1)
 499         paddd   %xmm0, %xmm0    #  shift left by 1
 500         psubd   %xmm4, %xmm0    #  add 1 to those who had msb bit 1
 501         # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
 502         pslldq  $12, %xmm5      # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
 503         movaps  %xmm5, %xmm4
 504         pslld   $2, %xmm5
 505         psrld   $30, %xmm4
 506 #       xorps   %xmm4, %xmm5    # rol((0,0,0,unrotW[0]),2)
 507         xorps   %xmm4, %xmm0    # same result, but does not depend on/does not modify T2
 508         xorps   %xmm5, %xmm0    # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
 509         movaps  %xmm0, %xmm5
 510         paddd   %xmm6, %xmm5
 511         movups  %xmm5, -64+16*0(%rsp)
 512 # 24
 513         movl    %edx, %edi              # c
 514         xorl    %ebp, %edi              # ^d
 515         xorl    %ecx, %edi              # ^b
 516         addl    -64+4*8(%rsp), %eax     # e += RCONST + W[n & 15]
 517         addl    %edi, %eax              # e += (c ^ d ^ b)
 518         movl    %ebx, %esi              #
 519         roll    $5, %esi                # rotl32(a,5)
 520         addl    %esi, %eax              # e += rotl32(a,5)
 521         rorl    $2, %ecx                # b = rotl32(b,30)
 522 # 25
 523         movl    %ecx, %edi              # c
 524         xorl    %edx, %edi              # ^d
 525         xorl    %ebx, %edi              # ^b
 526         addl    -64+4*9(%rsp), %ebp     # e += RCONST + W[n & 15]
 527         addl    %edi, %ebp              # e += (c ^ d ^ b)
 528         movl    %eax, %esi              #
 529         roll    $5, %esi                # rotl32(a,5)
 530         addl    %esi, %ebp              # e += rotl32(a,5)
 531         rorl    $2, %ebx                # b = rotl32(b,30)
 532 # 26
 533         movl    %ebx, %edi              # c
 534         xorl    %ecx, %edi              # ^d
 535         xorl    %eax, %edi              # ^b
 536         addl    -64+4*10(%rsp), %edx    # e += RCONST + W[n & 15]
 537         addl    %edi, %edx              # e += (c ^ d ^ b)
 538         movl    %ebp, %esi              #
 539         roll    $5, %esi                # rotl32(a,5)
 540         addl    %esi, %edx              # e += rotl32(a,5)
 541         rorl    $2, %eax                # b = rotl32(b,30)
 542 # 27
 543         movl    %eax, %edi              # c
 544         xorl    %ebx, %edi              # ^d
 545         xorl    %ebp, %edi              # ^b
 546         addl    -64+4*11(%rsp), %ecx    # e += RCONST + W[n & 15]
 547         addl    %edi, %ecx              # e += (c ^ d ^ b)
 548         movl    %edx, %esi              #
 549         roll    $5, %esi                # rotl32(a,5)
 550         addl    %esi, %ecx              # e += rotl32(a,5)
 551         rorl    $2, %ebp                # b = rotl32(b,30)
 552 # PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp)
 553         movaps  %xmm0, %xmm4
 554         psrldq  $4, %xmm4       # rshift by 4 bytes: T1 = ([13],[14],[15],0)
 555 #       pshufd  $0x4e, %xmm1, %xmm5     # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
 556 #       punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
 557 # same result as above, but shorter and faster:
 558 # pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
 559 # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
 560         movaps  %xmm1, %xmm5
 561         shufps  $0x4e, %xmm2, %xmm5     # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
 562         xorps   %xmm3, %xmm1    # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
 563         xorps   %xmm4, %xmm5    # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
 564         xorps   %xmm5, %xmm1    # ^
 565         # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
 566         movaps  %xmm1, %xmm5
 567         xorps   %xmm4, %xmm4    # rol(W0,1):
 568         pcmpgtd %xmm1, %xmm4    #  ffffffff for elements <0 (ones with msb bit 1)
 569         paddd   %xmm1, %xmm1    #  shift left by 1
 570         psubd   %xmm4, %xmm1    #  add 1 to those who had msb bit 1
 571         # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
 572         pslldq  $12, %xmm5      # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
 573         movaps  %xmm5, %xmm4
 574         pslld   $2, %xmm5
 575         psrld   $30, %xmm4
 576 #       xorps   %xmm4, %xmm5    # rol((0,0,0,unrotW[0]),2)
 577         xorps   %xmm4, %xmm1    # same result, but does not depend on/does not modify T2
 578         xorps   %xmm5, %xmm1    # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
 579         movaps  %xmm1, %xmm5
 580         paddd   %xmm6, %xmm5
 581         movups  %xmm5, -64+16*1(%rsp)
 582 # 28
 583         movl    %ebp, %edi              # edi = c  (round 28: a=%ecx b=%edx c=%ebp d=%eax e=%ebx)
 584         xorl    %eax, %edi              # edi = c ^ d
 585         xorl    %edx, %edi              # edi = c ^ d ^ b
 586         addl    -64+4*12(%rsp), %ebx    # e += RCONST + W[n & 15]
 587         addl    %edi, %ebx              # e += (c ^ d ^ b)
 588         movl    %ecx, %esi              # esi = a
 589         roll    $5, %esi                # esi = rotl32(a,5)
 590         addl    %esi, %ebx              # e += rotl32(a,5)
 591         rorl    $2, %edx                # b = rotl32(b,30)
 592 # 29
 593         movl    %edx, %edi              # edi = c  (round 29: a=%ebx b=%ecx c=%edx d=%ebp e=%eax)
 594         xorl    %ebp, %edi              # edi = c ^ d
 595         xorl    %ecx, %edi              # edi = c ^ d ^ b
 596         addl    -64+4*13(%rsp), %eax    # e += RCONST + W[n & 15]
 597         addl    %edi, %eax              # e += (c ^ d ^ b)
 598         movl    %ebx, %esi              # esi = a
 599         roll    $5, %esi                # esi = rotl32(a,5)
 600         addl    %esi, %eax              # e += rotl32(a,5)
 601         rorl    $2, %ecx                # b = rotl32(b,30)
 602 # 30
 603         movl    %ecx, %edi              # edi = c  (round 30: a=%eax b=%ebx c=%ecx d=%edx e=%ebp)
 604         xorl    %edx, %edi              # edi = c ^ d
 605         xorl    %ebx, %edi              # edi = c ^ d ^ b
 606         addl    -64+4*14(%rsp), %ebp    # e += RCONST + W[n & 15]
 607         addl    %edi, %ebp              # e += (c ^ d ^ b)
 608         movl    %eax, %esi              # esi = a
 609         roll    $5, %esi                # esi = rotl32(a,5)
 610         addl    %esi, %ebp              # e += rotl32(a,5)
 611         rorl    $2, %ebx                # b = rotl32(b,30)
 612 # 31
 613         movl    %ebx, %edi              # edi = c  (round 31: a=%ebp b=%eax c=%ebx d=%ecx e=%edx)
 614         xorl    %ecx, %edi              # edi = c ^ d
 615         xorl    %eax, %edi              # edi = c ^ d ^ b
 616         addl    -64+4*15(%rsp), %edx    # e += RCONST + W[n & 15]
 617         addl    %edi, %edx              # e += (c ^ d ^ b)
 618         movl    %ebp, %esi              # esi = a
 619         roll    $5, %esi                # esi = rotl32(a,5)
 620         addl    %esi, %edx              # e += rotl32(a,5)
 621         rorl    $2, %eax                # b = rotl32(b,30)
 622         pshufd  $0xaa, %xmm7, %xmm6     # xmm6 = dword 2 of xmm7 (all round constants) copied into all 4 lanes
 623 # PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp): %xmm2 = ([0]..[3]) is replaced by the next 4 W[] words; W + RCONST is stored at -64+16*2(%rsp)
 624         movaps  %xmm1, %xmm4    # T1 = ([12],[13],[14],[15])
 625         psrldq  $4, %xmm4       # rshift by 4 bytes: T1 = ([13],[14],[15],0)
 626 #       pshufd  $0x4e, %xmm2, %xmm5     # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
 627 #       punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
 628 # same result as above, but shorter and faster:
 629 # pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
 630 # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
 631         movaps  %xmm2, %xmm5    # T2 = ([0],[1],[2],[3])
 632         shufps  $0x4e, %xmm3, %xmm5     # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
 633         xorps   %xmm0, %xmm2    # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
 634         xorps   %xmm4, %xmm5    # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
 635         xorps   %xmm5, %xmm2    # W0 = xor of the two partial results above
 636         # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
 637         movaps  %xmm2, %xmm5    # T2 = copy of unrotated W0, consumed by the W[3] fixup below
 638         xorps   %xmm4, %xmm4    # rol(W0,1): start from T1 = 0
 639         pcmpgtd %xmm2, %xmm4    #  ffffffff for elements <0 (ones with msb bit 1)
 640         paddd   %xmm2, %xmm2    #  shift left by 1
 641         psubd   %xmm4, %xmm2    #  add 1 to those who had msb bit 1
 642         # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
 643         pslldq  $12, %xmm5      # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
 644         movaps  %xmm5, %xmm4    # T1 = T2
 645         pslld   $2, %xmm5       # T2 = (0,0,0,unrotW[0] << 2)
 646         psrld   $30, %xmm4      # T1 = (0,0,0,unrotW[0] >> 30)
 647 #       xorps   %xmm4, %xmm5    # rol((0,0,0,unrotW[0]),2)
 648         xorps   %xmm4, %xmm2    # same result, but does not depend on/does not modify T2
 649         xorps   %xmm5, %xmm2    # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
 650         movaps  %xmm2, %xmm5    # T2 = finished W0
 651         paddd   %xmm6, %xmm5    # T2 = W0 + RCONST (xmm6 holds the current round constant)
 652         movups  %xmm5, -64+16*2(%rsp)   # pass RCONST + W[] to the integer rounds through the stack area
 653 # 32
 654         movl    %eax, %edi              # edi = c  (round 32: a=%edx b=%ebp c=%eax d=%ebx e=%ecx)
 655         xorl    %ebx, %edi              # edi = c ^ d
 656         xorl    %ebp, %edi              # edi = c ^ d ^ b
 657         addl    -64+4*0(%rsp), %ecx     # e += RCONST + W[n & 15]
 658         addl    %edi, %ecx              # e += (c ^ d ^ b)
 659         movl    %edx, %esi              # esi = a
 660         roll    $5, %esi                # esi = rotl32(a,5)
 661         addl    %esi, %ecx              # e += rotl32(a,5)
 662         rorl    $2, %ebp                # b = rotl32(b,30)
 663 # 33
 664         movl    %ebp, %edi              # edi = c  (round 33: a=%ecx b=%edx c=%ebp d=%eax e=%ebx)
 665         xorl    %eax, %edi              # edi = c ^ d
 666         xorl    %edx, %edi              # edi = c ^ d ^ b
 667         addl    -64+4*1(%rsp), %ebx     # e += RCONST + W[n & 15]
 668         addl    %edi, %ebx              # e += (c ^ d ^ b)
 669         movl    %ecx, %esi              # esi = a
 670         roll    $5, %esi                # esi = rotl32(a,5)
 671         addl    %esi, %ebx              # e += rotl32(a,5)
 672         rorl    $2, %edx                # b = rotl32(b,30)
 673 # 34
 674         movl    %edx, %edi              # edi = c  (round 34: a=%ebx b=%ecx c=%edx d=%ebp e=%eax)
 675         xorl    %ebp, %edi              # edi = c ^ d
 676         xorl    %ecx, %edi              # edi = c ^ d ^ b
 677         addl    -64+4*2(%rsp), %eax     # e += RCONST + W[n & 15]
 678         addl    %edi, %eax              # e += (c ^ d ^ b)
 679         movl    %ebx, %esi              # esi = a
 680         roll    $5, %esi                # esi = rotl32(a,5)
 681         addl    %esi, %eax              # e += rotl32(a,5)
 682         rorl    $2, %ecx                # b = rotl32(b,30)
 683 # 35
 684         movl    %ecx, %edi              # edi = c  (round 35: a=%eax b=%ebx c=%ecx d=%edx e=%ebp)
 685         xorl    %edx, %edi              # edi = c ^ d
 686         xorl    %ebx, %edi              # edi = c ^ d ^ b
 687         addl    -64+4*3(%rsp), %ebp     # e += RCONST + W[n & 15]
 688         addl    %edi, %ebp              # e += (c ^ d ^ b)
 689         movl    %eax, %esi              # esi = a
 690         roll    $5, %esi                # esi = rotl32(a,5)
 691         addl    %esi, %ebp              # e += rotl32(a,5)
 692         rorl    $2, %ebx                # b = rotl32(b,30)
 693 # PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp): %xmm3 = ([0]..[3]) is replaced by the next 4 W[] words; W + RCONST is stored at -64+16*3(%rsp)
 694         movaps  %xmm2, %xmm4    # T1 = ([12],[13],[14],[15])
 695         psrldq  $4, %xmm4       # rshift by 4 bytes: T1 = ([13],[14],[15],0)
 696 #       pshufd  $0x4e, %xmm3, %xmm5     # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
 697 #       punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
 698 # same result as above, but shorter and faster:
 699 # pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
 700 # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
 701         movaps  %xmm3, %xmm5    # T2 = ([0],[1],[2],[3])
 702         shufps  $0x4e, %xmm0, %xmm5     # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
 703         xorps   %xmm1, %xmm3    # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
 704         xorps   %xmm4, %xmm5    # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
 705         xorps   %xmm5, %xmm3    # W0 = xor of the two partial results above
 706         # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
 707         movaps  %xmm3, %xmm5    # T2 = copy of unrotated W0, consumed by the W[3] fixup below
 708         xorps   %xmm4, %xmm4    # rol(W0,1): start from T1 = 0
 709         pcmpgtd %xmm3, %xmm4    #  ffffffff for elements <0 (ones with msb bit 1)
 710         paddd   %xmm3, %xmm3    #  shift left by 1
 711         psubd   %xmm4, %xmm3    #  add 1 to those who had msb bit 1
 712         # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
 713         pslldq  $12, %xmm5      # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
 714         movaps  %xmm5, %xmm4    # T1 = T2
 715         pslld   $2, %xmm5       # T2 = (0,0,0,unrotW[0] << 2)
 716         psrld   $30, %xmm4      # T1 = (0,0,0,unrotW[0] >> 30)
 717 #       xorps   %xmm4, %xmm5    # rol((0,0,0,unrotW[0]),2)
 718         xorps   %xmm4, %xmm3    # same result, but does not depend on/does not modify T2
 719         xorps   %xmm5, %xmm3    # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
 720         movaps  %xmm3, %xmm5    # T2 = finished W0
 721         paddd   %xmm6, %xmm5    # T2 = W0 + RCONST (xmm6 holds the current round constant)
 722         movups  %xmm5, -64+16*3(%rsp)   # pass RCONST + W[] to the integer rounds through the stack area
 723 # 36
 724         movl    %ebx, %edi              # edi = c  (round 36: a=%ebp b=%eax c=%ebx d=%ecx e=%edx)
 725         xorl    %ecx, %edi              # edi = c ^ d
 726         xorl    %eax, %edi              # edi = c ^ d ^ b
 727         addl    -64+4*4(%rsp), %edx     # e += RCONST + W[n & 15]
 728         addl    %edi, %edx              # e += (c ^ d ^ b)
 729         movl    %ebp, %esi              # esi = a
 730         roll    $5, %esi                # esi = rotl32(a,5)
 731         addl    %esi, %edx              # e += rotl32(a,5)
 732         rorl    $2, %eax                # b = rotl32(b,30)
 733 # 37
 734         movl    %eax, %edi              # edi = c  (round 37: a=%edx b=%ebp c=%eax d=%ebx e=%ecx)
 735         xorl    %ebx, %edi              # edi = c ^ d
 736         xorl    %ebp, %edi              # edi = c ^ d ^ b
 737         addl    -64+4*5(%rsp), %ecx     # e += RCONST + W[n & 15]
 738         addl    %edi, %ecx              # e += (c ^ d ^ b)
 739         movl    %edx, %esi              # esi = a
 740         roll    $5, %esi                # esi = rotl32(a,5)
 741         addl    %esi, %ecx              # e += rotl32(a,5)
 742         rorl    $2, %ebp                # b = rotl32(b,30)
 743 # 38
 744         movl    %ebp, %edi              # edi = c  (round 38: a=%ecx b=%edx c=%ebp d=%eax e=%ebx)
 745         xorl    %eax, %edi              # edi = c ^ d
 746         xorl    %edx, %edi              # edi = c ^ d ^ b
 747         addl    -64+4*6(%rsp), %ebx     # e += RCONST + W[n & 15]
 748         addl    %edi, %ebx              # e += (c ^ d ^ b)
 749         movl    %ecx, %esi              # esi = a
 750         roll    $5, %esi                # esi = rotl32(a,5)
 751         addl    %esi, %ebx              # e += rotl32(a,5)
 752         rorl    $2, %edx                # b = rotl32(b,30)
 753 # 39
 754         movl    %edx, %edi              # edi = c  (round 39: a=%ebx b=%ecx c=%edx d=%ebp e=%eax)
 755         xorl    %ebp, %edi              # edi = c ^ d
 756         xorl    %ecx, %edi              # edi = c ^ d ^ b
 757         addl    -64+4*7(%rsp), %eax     # e += RCONST + W[n & 15]
 758         addl    %edi, %eax              # e += (c ^ d ^ b)
 759         movl    %ebx, %esi              # esi = a
 760         roll    $5, %esi                # esi = rotl32(a,5)
 761         addl    %esi, %eax              # e += rotl32(a,5)
 762         rorl    $2, %ecx                # b = rotl32(b,30)
 763 # PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp): %xmm0 = ([0]..[3]) is replaced by the next 4 W[] words; W + RCONST is stored at -64+16*0(%rsp)
 764         movaps  %xmm3, %xmm4    # T1 = ([12],[13],[14],[15])
 765         psrldq  $4, %xmm4       # rshift by 4 bytes: T1 = ([13],[14],[15],0)
 766 #       pshufd  $0x4e, %xmm0, %xmm5     # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
 767 #       punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
 768 # same result as above, but shorter and faster:
 769 # pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
 770 # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
 771         movaps  %xmm0, %xmm5    # T2 = ([0],[1],[2],[3])
 772         shufps  $0x4e, %xmm1, %xmm5     # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
 773         xorps   %xmm2, %xmm0    # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
 774         xorps   %xmm4, %xmm5    # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
 775         xorps   %xmm5, %xmm0    # W0 = xor of the two partial results above
 776         # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
 777         movaps  %xmm0, %xmm5    # T2 = copy of unrotated W0, consumed by the W[3] fixup below
 778         xorps   %xmm4, %xmm4    # rol(W0,1): start from T1 = 0
 779         pcmpgtd %xmm0, %xmm4    #  ffffffff for elements <0 (ones with msb bit 1)
 780         paddd   %xmm0, %xmm0    #  shift left by 1
 781         psubd   %xmm4, %xmm0    #  add 1 to those who had msb bit 1
 782         # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
 783         pslldq  $12, %xmm5      # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
 784         movaps  %xmm5, %xmm4    # T1 = T2
 785         pslld   $2, %xmm5       # T2 = (0,0,0,unrotW[0] << 2)
 786         psrld   $30, %xmm4      # T1 = (0,0,0,unrotW[0] >> 30)
 787 #       xorps   %xmm4, %xmm5    # rol((0,0,0,unrotW[0]),2)
 788         xorps   %xmm4, %xmm0    # same result, but does not depend on/does not modify T2
 789         xorps   %xmm5, %xmm0    # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
 790         movaps  %xmm0, %xmm5    # T2 = finished W0
 791         paddd   %xmm6, %xmm5    # T2 = W0 + RCONST (xmm6 holds the current round constant)
 792         movups  %xmm5, -64+16*0(%rsp)   # pass RCONST + W[] to the integer rounds through the stack area
 793 # 40
 794         movl    %ebx, %edi              # di: b  (round 40: a=%eax b=%ebx c=%ecx d=%edx e=%ebp)
 795         movl    %ebx, %esi              # si: b
 796         orl     %ecx, %edi              # di: b | c
 797         andl    %ecx, %esi              # si: b & c
 798         andl    %edx, %edi              # di: (b | c) & d
 799         orl     %esi, %edi              # di: ((b | c) & d) | (b & c)
 800         addl    %edi, %ebp              # e += ((b | c) & d) | (b & c)
 801         addl    -64+4*8(%rsp), %ebp     # e += RCONST + W[n & 15]
 802         movl    %eax, %esi              # si: a
 803         roll    $5, %esi                # si: rotl32(a,5)
 804         addl    %esi, %ebp              # e += rotl32(a,5)
 805         rorl    $2, %ebx                # b = rotl32(b,30)
 806 # 41
 807         movl    %eax, %edi              # di: b  (round 41: a=%ebp b=%eax c=%ebx d=%ecx e=%edx)
 808         movl    %eax, %esi              # si: b
 809         orl     %ebx, %edi              # di: b | c
 810         andl    %ebx, %esi              # si: b & c
 811         andl    %ecx, %edi              # di: (b | c) & d
 812         orl     %esi, %edi              # di: ((b | c) & d) | (b & c)
 813         addl    %edi, %edx              # e += ((b | c) & d) | (b & c)
 814         addl    -64+4*9(%rsp), %edx     # e += RCONST + W[n & 15]
 815         movl    %ebp, %esi              # si: a
 816         roll    $5, %esi                # si: rotl32(a,5)
 817         addl    %esi, %edx              # e += rotl32(a,5)
 818         rorl    $2, %eax                # b = rotl32(b,30)
 819 # 42
 820         movl    %ebp, %edi              # di: b  (round 42: a=%edx b=%ebp c=%eax d=%ebx e=%ecx)
 821         movl    %ebp, %esi              # si: b
 822         orl     %eax, %edi              # di: b | c
 823         andl    %eax, %esi              # si: b & c
 824         andl    %ebx, %edi              # di: (b | c) & d
 825         orl     %esi, %edi              # di: ((b | c) & d) | (b & c)
 826         addl    %edi, %ecx              # e += ((b | c) & d) | (b & c)
 827         addl    -64+4*10(%rsp), %ecx    # e += RCONST + W[n & 15]
 828         movl    %edx, %esi              # si: a
 829         roll    $5, %esi                # si: rotl32(a,5)
 830         addl    %esi, %ecx              # e += rotl32(a,5)
 831         rorl    $2, %ebp                # b = rotl32(b,30)
 832 # 43
 833         movl    %edx, %edi              # di: b  (round 43: a=%ecx b=%edx c=%ebp d=%eax e=%ebx)
 834         movl    %edx, %esi              # si: b
 835         orl     %ebp, %edi              # di: b | c
 836         andl    %ebp, %esi              # si: b & c
 837         andl    %eax, %edi              # di: (b | c) & d
 838         orl     %esi, %edi              # di: ((b | c) & d) | (b & c)
 839         addl    %edi, %ebx              # e += ((b | c) & d) | (b & c)
 840         addl    -64+4*11(%rsp), %ebx    # e += RCONST + W[n & 15]
 841         movl    %ecx, %esi              # si: a
 842         roll    $5, %esi                # si: rotl32(a,5)
 843         addl    %esi, %ebx              # e += rotl32(a,5)
 844         rorl    $2, %edx                # b = rotl32(b,30)
 845 # PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp): %xmm1 = ([0]..[3]) is replaced by the next 4 W[] words; W + RCONST is stored at -64+16*1(%rsp)
 846         movaps  %xmm0, %xmm4    # T1 = ([12],[13],[14],[15])
 847         psrldq  $4, %xmm4       # rshift by 4 bytes: T1 = ([13],[14],[15],0)
 848 #       pshufd  $0x4e, %xmm1, %xmm5     # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
 849 #       punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
 850 # same result as above, but shorter and faster:
 851 # pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
 852 # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
 853         movaps  %xmm1, %xmm5    # T2 = ([0],[1],[2],[3])
 854         shufps  $0x4e, %xmm2, %xmm5     # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
 855         xorps   %xmm3, %xmm1    # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
 856         xorps   %xmm4, %xmm5    # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
 857         xorps   %xmm5, %xmm1    # W0 = xor of the two partial results above
 858         # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
 859         movaps  %xmm1, %xmm5    # T2 = copy of unrotated W0, consumed by the W[3] fixup below
 860         xorps   %xmm4, %xmm4    # rol(W0,1): start from T1 = 0
 861         pcmpgtd %xmm1, %xmm4    #  ffffffff for elements <0 (ones with msb bit 1)
 862         paddd   %xmm1, %xmm1    #  shift left by 1
 863         psubd   %xmm4, %xmm1    #  add 1 to those who had msb bit 1
 864         # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
 865         pslldq  $12, %xmm5      # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
 866         movaps  %xmm5, %xmm4    # T1 = T2
 867         pslld   $2, %xmm5       # T2 = (0,0,0,unrotW[0] << 2)
 868         psrld   $30, %xmm4      # T1 = (0,0,0,unrotW[0] >> 30)
 869 #       xorps   %xmm4, %xmm5    # rol((0,0,0,unrotW[0]),2)
 870         xorps   %xmm4, %xmm1    # same result, but does not depend on/does not modify T2
 871         xorps   %xmm5, %xmm1    # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
 872         movaps  %xmm1, %xmm5    # T2 = finished W0
 873         paddd   %xmm6, %xmm5    # T2 = W0 + RCONST (xmm6 holds the current round constant)
 874         movups  %xmm5, -64+16*1(%rsp)   # pass RCONST + W[] to the integer rounds through the stack area
 875 # 44
 876         movl    %ecx, %edi              # di: b  (round 44: a=%ebx b=%ecx c=%edx d=%ebp e=%eax)
 877         movl    %ecx, %esi              # si: b
 878         orl     %edx, %edi              # di: b | c
 879         andl    %edx, %esi              # si: b & c
 880         andl    %ebp, %edi              # di: (b | c) & d
 881         orl     %esi, %edi              # di: ((b | c) & d) | (b & c)
 882         addl    %edi, %eax              # e += ((b | c) & d) | (b & c)
 883         addl    -64+4*12(%rsp), %eax    # e += RCONST + W[n & 15]
 884         movl    %ebx, %esi              # si: a
 885         roll    $5, %esi                # si: rotl32(a,5)
 886         addl    %esi, %eax              # e += rotl32(a,5)
 887         rorl    $2, %ecx                # b = rotl32(b,30)
 888 # 45
 889         movl    %ebx, %edi              # di: b  (round 45: a=%eax b=%ebx c=%ecx d=%edx e=%ebp)
 890         movl    %ebx, %esi              # si: b
 891         orl     %ecx, %edi              # di: b | c
 892         andl    %ecx, %esi              # si: b & c
 893         andl    %edx, %edi              # di: (b | c) & d
 894         orl     %esi, %edi              # di: ((b | c) & d) | (b & c)
 895         addl    %edi, %ebp              # e += ((b | c) & d) | (b & c)
 896         addl    -64+4*13(%rsp), %ebp    # e += RCONST + W[n & 15]
 897         movl    %eax, %esi              # si: a
 898         roll    $5, %esi                # si: rotl32(a,5)
 899         addl    %esi, %ebp              # e += rotl32(a,5)
 900         rorl    $2, %ebx                # b = rotl32(b,30)
 901 # 46
 902         movl    %eax, %edi              # di: b  (round 46: a=%ebp b=%eax c=%ebx d=%ecx e=%edx)
 903         movl    %eax, %esi              # si: b
 904         orl     %ebx, %edi              # di: b | c
 905         andl    %ebx, %esi              # si: b & c
 906         andl    %ecx, %edi              # di: (b | c) & d
 907         orl     %esi, %edi              # di: ((b | c) & d) | (b & c)
 908         addl    %edi, %edx              # e += ((b | c) & d) | (b & c)
 909         addl    -64+4*14(%rsp), %edx    # e += RCONST + W[n & 15]
 910         movl    %ebp, %esi              # si: a
 911         roll    $5, %esi                # si: rotl32(a,5)
 912         addl    %esi, %edx              # e += rotl32(a,5)
 913         rorl    $2, %eax                # b = rotl32(b,30)
 914 # 47
 915         movl    %ebp, %edi              # di: b  (round 47: a=%edx b=%ebp c=%eax d=%ebx e=%ecx)
 916         movl    %ebp, %esi              # si: b
 917         orl     %eax, %edi              # di: b | c
 918         andl    %eax, %esi              # si: b & c
 919         andl    %ebx, %edi              # di: (b | c) & d
 920         orl     %esi, %edi              # di: ((b | c) & d) | (b & c)
 921         addl    %edi, %ecx              # e += ((b | c) & d) | (b & c)
 922         addl    -64+4*15(%rsp), %ecx    # e += RCONST + W[n & 15]
 923         movl    %edx, %esi              # si: a
 924         roll    $5, %esi                # si: rotl32(a,5)
 925         addl    %esi, %ecx              # e += rotl32(a,5)
 926         rorl    $2, %ebp                # b = rotl32(b,30)
 927 # PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp): %xmm2 = ([0]..[3]) is replaced by the next 4 W[] words; W + RCONST is stored at -64+16*2(%rsp)
 928         movaps  %xmm1, %xmm4    # T1 = ([12],[13],[14],[15])
 929         psrldq  $4, %xmm4       # rshift by 4 bytes: T1 = ([13],[14],[15],0)
 930 #       pshufd  $0x4e, %xmm2, %xmm5     # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
 931 #       punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
 932 # same result as above, but shorter and faster:
 933 # pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
 934 # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
 935         movaps  %xmm2, %xmm5    # T2 = ([0],[1],[2],[3])
 936         shufps  $0x4e, %xmm3, %xmm5     # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
 937         xorps   %xmm0, %xmm2    # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
 938         xorps   %xmm4, %xmm5    # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
 939         xorps   %xmm5, %xmm2    # W0 = xor of the two partial results above
 940         # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
 941         movaps  %xmm2, %xmm5    # T2 = copy of unrotated W0, consumed by the W[3] fixup below
 942         xorps   %xmm4, %xmm4    # rol(W0,1): start from T1 = 0
 943         pcmpgtd %xmm2, %xmm4    #  ffffffff for elements <0 (ones with msb bit 1)
 944         paddd   %xmm2, %xmm2    #  shift left by 1
 945         psubd   %xmm4, %xmm2    #  add 1 to those who had msb bit 1
 946         # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
 947         pslldq  $12, %xmm5      # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
 948         movaps  %xmm5, %xmm4    # T1 = T2
 949         pslld   $2, %xmm5       # T2 = (0,0,0,unrotW[0] << 2)
 950         psrld   $30, %xmm4      # T1 = (0,0,0,unrotW[0] >> 30)
 951 #       xorps   %xmm4, %xmm5    # rol((0,0,0,unrotW[0]),2)
 952         xorps   %xmm4, %xmm2    # same result, but does not depend on/does not modify T2
 953         xorps   %xmm5, %xmm2    # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
 954         movaps  %xmm2, %xmm5    # T2 = finished W0
 955         paddd   %xmm6, %xmm5    # T2 = W0 + RCONST (xmm6 holds the current round constant)
 956         movups  %xmm5, -64+16*2(%rsp)   # pass RCONST + W[] to the integer rounds through the stack area
 957 # 48
 958         movl    %edx, %edi              # di: b  (round 48: a=%ecx b=%edx c=%ebp d=%eax e=%ebx)
 959         movl    %edx, %esi              # si: b
 960         orl     %ebp, %edi              # di: b | c
 961         andl    %ebp, %esi              # si: b & c
 962         andl    %eax, %edi              # di: (b | c) & d
 963         orl     %esi, %edi              # di: ((b | c) & d) | (b & c)
 964         addl    %edi, %ebx              # e += ((b | c) & d) | (b & c)
 965         addl    -64+4*0(%rsp), %ebx     # e += RCONST + W[n & 15]
 966         movl    %ecx, %esi              # si: a
 967         roll    $5, %esi                # si: rotl32(a,5)
 968         addl    %esi, %ebx              # e += rotl32(a,5)
 969         rorl    $2, %edx                # b = rotl32(b,30)
 970 # 49
 971         movl    %ecx, %edi              # di: b  (round 49: a=%ebx b=%ecx c=%edx d=%ebp e=%eax)
 972         movl    %ecx, %esi              # si: b
 973         orl     %edx, %edi              # di: b | c
 974         andl    %edx, %esi              # si: b & c
 975         andl    %ebp, %edi              # di: (b | c) & d
 976         orl     %esi, %edi              # di: ((b | c) & d) | (b & c)
 977         addl    %edi, %eax              # e += ((b | c) & d) | (b & c)
 978         addl    -64+4*1(%rsp), %eax     # e += RCONST + W[n & 15]
 979         movl    %ebx, %esi              # si: a
 980         roll    $5, %esi                # si: rotl32(a,5)
 981         addl    %esi, %eax              # e += rotl32(a,5)
 982         rorl    $2, %ecx                # b = rotl32(b,30)
 983 # 50
 984         movl    %ebx, %edi              # di: b  (round 50: a=%eax b=%ebx c=%ecx d=%edx e=%ebp)
 985         movl    %ebx, %esi              # si: b
 986         orl     %ecx, %edi              # di: b | c
 987         andl    %ecx, %esi              # si: b & c
 988         andl    %edx, %edi              # di: (b | c) & d
 989         orl     %esi, %edi              # di: ((b | c) & d) | (b & c)
 990         addl    %edi, %ebp              # e += ((b | c) & d) | (b & c)
 991         addl    -64+4*2(%rsp), %ebp     # e += RCONST + W[n & 15]
 992         movl    %eax, %esi              # si: a
 993         roll    $5, %esi                # si: rotl32(a,5)
 994         addl    %esi, %ebp              # e += rotl32(a,5)
 995         rorl    $2, %ebx                # b = rotl32(b,30)
 996 # 51
 997         movl    %eax, %edi              # di: b  (round 51: a=%ebp b=%eax c=%ebx d=%ecx e=%edx)
 998         movl    %eax, %esi              # si: b
 999         orl     %ebx, %edi              # di: b | c
1000         andl    %ebx, %esi              # si: b & c
1001         andl    %ecx, %edi              # di: (b | c) & d
1002         orl     %esi, %edi              # di: ((b | c) & d) | (b & c)
1003         addl    %edi, %edx              # e += ((b | c) & d) | (b & c)
1004         addl    -64+4*3(%rsp), %edx     # e += RCONST + W[n & 15]
1005         movl    %ebp, %esi              # si: a
1006         roll    $5, %esi                # si: rotl32(a,5)
1007         addl    %esi, %edx              # e += rotl32(a,5)
1008         rorl    $2, %eax                # b = rotl32(b,30)
1009         pshufd  $0xff, %xmm7, %xmm6     # xmm6 = dword 3 of xmm7 (all round constants) copied into all 4 lanes
1010 # PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp): %xmm3 = ([0]..[3]) is replaced by the next 4 W[] words; W + RCONST is stored at -64+16*3(%rsp)
1011         movaps  %xmm2, %xmm4    # T1 = ([12],[13],[14],[15])
1012         psrldq  $4, %xmm4       # rshift by 4 bytes: T1 = ([13],[14],[15],0)
1013 #       pshufd  $0x4e, %xmm3, %xmm5     # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
1014 #       punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
1015 # same result as above, but shorter and faster:
1016 # pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
1017 # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
1018         movaps  %xmm3, %xmm5    # T2 = ([0],[1],[2],[3])
1019         shufps  $0x4e, %xmm0, %xmm5     # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
1020         xorps   %xmm1, %xmm3    # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
1021         xorps   %xmm4, %xmm5    # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
1022         xorps   %xmm5, %xmm3    # W0 = xor of the two partial results above
1023         # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
1024         movaps  %xmm3, %xmm5    # T2 = copy of unrotated W0, consumed by the W[3] fixup below
1025         xorps   %xmm4, %xmm4    # rol(W0,1): start from T1 = 0
1026         pcmpgtd %xmm3, %xmm4    #  ffffffff for elements <0 (ones with msb bit 1)
1027         paddd   %xmm3, %xmm3    #  shift left by 1
1028         psubd   %xmm4, %xmm3    #  add 1 to those who had msb bit 1
1029         # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
1030         pslldq  $12, %xmm5      # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
1031         movaps  %xmm5, %xmm4    # T1 = T2
1032         pslld   $2, %xmm5       # T2 = (0,0,0,unrotW[0] << 2)
1033         psrld   $30, %xmm4      # T1 = (0,0,0,unrotW[0] >> 30)
1034 #       xorps   %xmm4, %xmm5    # rol((0,0,0,unrotW[0]),2)
1035         xorps   %xmm4, %xmm3    # same result, but does not depend on/does not modify T2
1036         xorps   %xmm5, %xmm3    # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
1037         movaps  %xmm3, %xmm5    # T2 = finished W0
1038         paddd   %xmm6, %xmm5    # T2 = W0 + RCONST (xmm6 holds the current round constant)
1039         movups  %xmm5, -64+16*3(%rsp)   # pass RCONST + W[] to the integer rounds through the stack area
1040 # 52
1041         movl    %ebp, %edi              # di: b  (round 52: a=%edx b=%ebp c=%eax d=%ebx e=%ecx)
1042         movl    %ebp, %esi              # si: b
1043         orl     %eax, %edi              # di: b | c
1044         andl    %eax, %esi              # si: b & c
1045         andl    %ebx, %edi              # di: (b | c) & d
1046         orl     %esi, %edi              # di: ((b | c) & d) | (b & c)
1047         addl    %edi, %ecx              # e += ((b | c) & d) | (b & c)
1048         addl    -64+4*4(%rsp), %ecx     # e += RCONST + W[n & 15]
1049         movl    %edx, %esi              # si: a
1050         roll    $5, %esi                # si: rotl32(a,5)
1051         addl    %esi, %ecx              # e += rotl32(a,5)
1052         rorl    $2, %ebp                # b = rotl32(b,30)
1053 # 53
1054         movl    %edx, %edi              # di: b  (round 53: a=%ecx b=%edx c=%ebp d=%eax e=%ebx)
1055         movl    %edx, %esi              # si: b
1056         orl     %ebp, %edi              # di: b | c
1057         andl    %ebp, %esi              # si: b & c
1058         andl    %eax, %edi              # di: (b | c) & d
1059         orl     %esi, %edi              # di: ((b | c) & d) | (b & c)
1060         addl    %edi, %ebx              # e += ((b | c) & d) | (b & c)
1061         addl    -64+4*5(%rsp), %ebx     # e += RCONST + W[n & 15]
1062         movl    %ecx, %esi              # si: a
1063         roll    $5, %esi                # si: rotl32(a,5)
1064         addl    %esi, %ebx              # e += rotl32(a,5)
1065         rorl    $2, %edx                # b = rotl32(b,30)
1066 # 54
1067         movl    %ecx, %edi              # di: b  (round 54: a=%ebx b=%ecx c=%edx d=%ebp e=%eax)
1068         movl    %ecx, %esi              # si: b
1069         orl     %edx, %edi              # di: b | c
1070         andl    %edx, %esi              # si: b & c
1071         andl    %ebp, %edi              # di: (b | c) & d
1072         orl     %esi, %edi              # di: ((b | c) & d) | (b & c)
1073         addl    %edi, %eax              # e += ((b | c) & d) | (b & c)
1074         addl    -64+4*6(%rsp), %eax     # e += RCONST + W[n & 15]
1075         movl    %ebx, %esi              # si: a
1076         roll    $5, %esi                # si: rotl32(a,5)
1077         addl    %esi, %eax              # e += rotl32(a,5)
1078         rorl    $2, %ecx                # b = rotl32(b,30)
1079 # 55
1080         movl    %ebx, %edi              # di: b  (round 55: a=%eax b=%ebx c=%ecx d=%edx e=%ebp)
1081         movl    %ebx, %esi              # si: b
1082         orl     %ecx, %edi              # di: b | c
1083         andl    %ecx, %esi              # si: b & c
1084         andl    %edx, %edi              # di: (b | c) & d
1085         orl     %esi, %edi              # di: ((b | c) & d) | (b & c)
1086         addl    %edi, %ebp              # e += ((b | c) & d) | (b & c)
1087         addl    -64+4*7(%rsp), %ebp     # e += RCONST + W[n & 15]
1088         movl    %eax, %esi              # si: a
1089         roll    $5, %esi                # si: rotl32(a,5)
1090         addl    %esi, %ebp              # e += rotl32(a,5)
1091         rorl    $2, %ebx                # b = rotl32(b,30)
1092 # PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp): %xmm0 = ([0]..[3]) is replaced by the next 4 W[] words; W + RCONST is stored at -64+16*0(%rsp)
1093         movaps  %xmm3, %xmm4    # T1 = ([12],[13],[14],[15])
1094         psrldq  $4, %xmm4       # rshift by 4 bytes: T1 = ([13],[14],[15],0)
1095 #       pshufd  $0x4e, %xmm0, %xmm5     # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
1096 #       punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
1097 # same result as above, but shorter and faster:
1098 # pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
1099 # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
1100         movaps  %xmm0, %xmm5    # T2 = ([0],[1],[2],[3])
1101         shufps  $0x4e, %xmm1, %xmm5     # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
1102         xorps   %xmm2, %xmm0    # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
1103         xorps   %xmm4, %xmm5    # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
1104         xorps   %xmm5, %xmm0    # W0 = xor of the two partial results above
1105         # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
1106         movaps  %xmm0, %xmm5    # T2 = copy of unrotated W0, consumed by the W[3] fixup below
1107         xorps   %xmm4, %xmm4    # rol(W0,1): start from T1 = 0
1108         pcmpgtd %xmm0, %xmm4    #  ffffffff for elements <0 (ones with msb bit 1)
1109         paddd   %xmm0, %xmm0    #  shift left by 1
1110         psubd   %xmm4, %xmm0    #  add 1 to those who had msb bit 1
1111         # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
1112         pslldq  $12, %xmm5      # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
1113         movaps  %xmm5, %xmm4    # T1 = T2
1114         pslld   $2, %xmm5       # T2 = (0,0,0,unrotW[0] << 2)
1115         psrld   $30, %xmm4      # T1 = (0,0,0,unrotW[0] >> 30)
1116 #       xorps   %xmm4, %xmm5    # rol((0,0,0,unrotW[0]),2)
1117         xorps   %xmm4, %xmm0    # same result, but does not depend on/does not modify T2
1118         xorps   %xmm5, %xmm0    # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
1119         movaps  %xmm0, %xmm5    # T2 = finished W0
1120         paddd   %xmm6, %xmm5    # T2 = W0 + RCONST (xmm6 holds the current round constant)
1121         movups  %xmm5, -64+16*0(%rsp)   # pass RCONST + W[] to the integer rounds through the stack area
1122 # 56
1123         movl    %eax, %edi              # di: b  (round 56: a=%ebp b=%eax c=%ebx d=%ecx e=%edx)
1124         movl    %eax, %esi              # si: b
1125         orl     %ebx, %edi              # di: b | c
1126         andl    %ebx, %esi              # si: b & c
1127         andl    %ecx, %edi              # di: (b | c) & d
1128         orl     %esi, %edi              # di: ((b | c) & d) | (b & c)
1129         addl    %edi, %edx              # e += ((b | c) & d) | (b & c)
1130         addl    -64+4*8(%rsp), %edx     # e += RCONST + W[n & 15]
1131         movl    %ebp, %esi              # si: a
1132         roll    $5, %esi                # si: rotl32(a,5)
1133         addl    %esi, %edx              # e += rotl32(a,5)
1134         rorl    $2, %eax                # b = rotl32(b,30)
1135 # 57
1136         movl    %ebp, %edi              # di: b  (round 57: a=%edx b=%ebp c=%eax d=%ebx e=%ecx)
1137         movl    %ebp, %esi              # si: b
1138         orl     %eax, %edi              # di: b | c
1139         andl    %eax, %esi              # si: b & c
1140         andl    %ebx, %edi              # di: (b | c) & d
1141         orl     %esi, %edi              # di: ((b | c) & d) | (b & c)
1142         addl    %edi, %ecx              # e += ((b | c) & d) | (b & c)
1143         addl    -64+4*9(%rsp), %ecx     # e += RCONST + W[n & 15]
1144         movl    %edx, %esi              # si: a
1145         roll    $5, %esi                # si: rotl32(a,5)
1146         addl    %esi, %ecx              # e += rotl32(a,5)
1147         rorl    $2, %ebp                # b = rotl32(b,30)
1148 # 58
1149         movl    %edx, %edi              # di: b  (round 58: a=%ecx b=%edx c=%ebp d=%eax e=%ebx)
1150         movl    %edx, %esi              # si: b
1151         orl     %ebp, %edi              # di: b | c
1152         andl    %ebp, %esi              # si: b & c
1153         andl    %eax, %edi              # di: (b | c) & d
1154         orl     %esi, %edi              # di: ((b | c) & d) | (b & c)
1155         addl    %edi, %ebx              # e += ((b | c) & d) | (b & c)
1156         addl    -64+4*10(%rsp), %ebx    # e += RCONST + W[n & 15]
1157         movl    %ecx, %esi              # si: a
1158         roll    $5, %esi                # si: rotl32(a,5)
1159         addl    %esi, %ebx              # e += rotl32(a,5)
1160         rorl    $2, %edx                # b = rotl32(b,30)
1161 # 59
1162         movl    %ecx, %edi              # di: b  (round 59: a=%ebx b=%ecx c=%edx d=%ebp e=%eax)
1163         movl    %ecx, %esi              # si: b
1164         orl     %edx, %edi              # di: b | c
1165         andl    %edx, %esi              # si: b & c
1166         andl    %ebp, %edi              # di: (b | c) & d
1167         orl     %esi, %edi              # di: ((b | c) & d) | (b & c)
1168         addl    %edi, %eax              # e += ((b | c) & d) | (b & c)
1169         addl    -64+4*11(%rsp), %eax    # e += RCONST + W[n & 15]
1170         movl    %ebx, %esi              # si: a
1171         roll    $5, %esi                # si: rotl32(a,5)
1172         addl    %esi, %eax              # e += rotl32(a,5)
1173         rorl    $2, %ecx                # b = rotl32(b,30)
1174 # PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp)
1175         movaps  %xmm0, %xmm4
1176         psrldq  $4, %xmm4       # rshift by 4 bytes: T1 = ([13],[14],[15],0)
1177 #       pshufd  $0x4e, %xmm1, %xmm5     # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
1178 #       punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
1179 # same result as above, but shorter and faster:
1180 # pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
1181 # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
1182         movaps  %xmm1, %xmm5
1183         shufps  $0x4e, %xmm2, %xmm5     # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
1184         xorps   %xmm3, %xmm1    # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
1185         xorps   %xmm4, %xmm5    # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
1186         xorps   %xmm5, %xmm1    # ^
1187         # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
1188         movaps  %xmm1, %xmm5
1189         xorps   %xmm4, %xmm4    # rol(W0,1):
1190         pcmpgtd %xmm1, %xmm4    #  ffffffff for elements <0 (ones with msb bit 1)
1191         paddd   %xmm1, %xmm1    #  shift left by 1
1192         psubd   %xmm4, %xmm1    #  add 1 to those who had msb bit 1
1193         # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
1194         pslldq  $12, %xmm5      # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
1195         movaps  %xmm5, %xmm4
1196         pslld   $2, %xmm5
1197         psrld   $30, %xmm4
1198 #       xorps   %xmm4, %xmm5    # rol((0,0,0,unrotW[0]),2)
1199         xorps   %xmm4, %xmm1    # same result, but does not depend on/does not modify T2
1200         xorps   %xmm5, %xmm1    # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
1201         movaps  %xmm1, %xmm5
1202         paddd   %xmm6, %xmm5
1203         movups  %xmm5, -64+16*1(%rsp)
1204 # 60
1205         movl    %ecx, %edi              # c
1206         xorl    %edx, %edi              # ^d
1207         xorl    %ebx, %edi              # ^b
1208         addl    -64+4*12(%rsp), %ebp    # e += RCONST + W[n & 15]
1209         addl    %edi, %ebp              # e += (c ^ d ^ b)
1210         movl    %eax, %esi              #
1211         roll    $5, %esi                # rotl32(a,5)
1212         addl    %esi, %ebp              # e += rotl32(a,5)
1213         rorl    $2, %ebx                # b = rotl32(b,30)
1214 # 61
1215         movl    %ebx, %edi              # c
1216         xorl    %ecx, %edi              # ^d
1217         xorl    %eax, %edi              # ^b
1218         addl    -64+4*13(%rsp), %edx    # e += RCONST + W[n & 15]
1219         addl    %edi, %edx              # e += (c ^ d ^ b)
1220         movl    %ebp, %esi              #
1221         roll    $5, %esi                # rotl32(a,5)
1222         addl    %esi, %edx              # e += rotl32(a,5)
1223         rorl    $2, %eax                # b = rotl32(b,30)
1224 # 62
1225         movl    %eax, %edi              # c
1226         xorl    %ebx, %edi              # ^d
1227         xorl    %ebp, %edi              # ^b
1228         addl    -64+4*14(%rsp), %ecx    # e += RCONST + W[n & 15]
1229         addl    %edi, %ecx              # e += (c ^ d ^ b)
1230         movl    %edx, %esi              #
1231         roll    $5, %esi                # rotl32(a,5)
1232         addl    %esi, %ecx              # e += rotl32(a,5)
1233         rorl    $2, %ebp                # b = rotl32(b,30)
1234 # 63
1235         movl    %ebp, %edi              # c
1236         xorl    %eax, %edi              # ^d
1237         xorl    %edx, %edi              # ^b
1238         addl    -64+4*15(%rsp), %ebx    # e += RCONST + W[n & 15]
1239         addl    %edi, %ebx              # e += (c ^ d ^ b)
1240         movl    %ecx, %esi              #
1241         roll    $5, %esi                # rotl32(a,5)
1242         addl    %esi, %ebx              # e += rotl32(a,5)
1243         rorl    $2, %edx                # b = rotl32(b,30)
1244 # PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp)
1245         movaps  %xmm1, %xmm4
1246         psrldq  $4, %xmm4       # rshift by 4 bytes: T1 = ([13],[14],[15],0)
1247 #       pshufd  $0x4e, %xmm2, %xmm5     # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
1248 #       punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
1249 # same result as above, but shorter and faster:
1250 # pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
1251 # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
1252         movaps  %xmm2, %xmm5
1253         shufps  $0x4e, %xmm3, %xmm5     # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
1254         xorps   %xmm0, %xmm2    # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
1255         xorps   %xmm4, %xmm5    # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
1256         xorps   %xmm5, %xmm2    # ^
1257         # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
1258         movaps  %xmm2, %xmm5
1259         xorps   %xmm4, %xmm4    # rol(W0,1):
1260         pcmpgtd %xmm2, %xmm4    #  ffffffff for elements <0 (ones with msb bit 1)
1261         paddd   %xmm2, %xmm2    #  shift left by 1
1262         psubd   %xmm4, %xmm2    #  add 1 to those who had msb bit 1
1263         # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
1264         pslldq  $12, %xmm5      # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
1265         movaps  %xmm5, %xmm4
1266         pslld   $2, %xmm5
1267         psrld   $30, %xmm4
1268 #       xorps   %xmm4, %xmm5    # rol((0,0,0,unrotW[0]),2)
1269         xorps   %xmm4, %xmm2    # same result, but does not depend on/does not modify T2
1270         xorps   %xmm5, %xmm2    # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
1271         movaps  %xmm2, %xmm5
1272         paddd   %xmm6, %xmm5
1273         movups  %xmm5, -64+16*2(%rsp)
1274 # 64
1275         movl    %edx, %edi              # c
1276         xorl    %ebp, %edi              # ^d
1277         xorl    %ecx, %edi              # ^b
1278         addl    -64+4*0(%rsp), %eax     # e += RCONST + W[n & 15]
1279         addl    %edi, %eax              # e += (c ^ d ^ b)
1280         movl    %ebx, %esi              #
1281         roll    $5, %esi                # rotl32(a,5)
1282         addl    %esi, %eax              # e += rotl32(a,5)
1283         rorl    $2, %ecx                # b = rotl32(b,30)
1284 # 65
1285         movl    %ecx, %edi              # c
1286         xorl    %edx, %edi              # ^d
1287         xorl    %ebx, %edi              # ^b
1288         addl    -64+4*1(%rsp), %ebp     # e += RCONST + W[n & 15]
1289         addl    %edi, %ebp              # e += (c ^ d ^ b)
1290         movl    %eax, %esi              #
1291         roll    $5, %esi                # rotl32(a,5)
1292         addl    %esi, %ebp              # e += rotl32(a,5)
1293         rorl    $2, %ebx                # b = rotl32(b,30)
1294 # 66
1295         movl    %ebx, %edi              # c
1296         xorl    %ecx, %edi              # ^d
1297         xorl    %eax, %edi              # ^b
1298         addl    -64+4*2(%rsp), %edx     # e += RCONST + W[n & 15]
1299         addl    %edi, %edx              # e += (c ^ d ^ b)
1300         movl    %ebp, %esi              #
1301         roll    $5, %esi                # rotl32(a,5)
1302         addl    %esi, %edx              # e += rotl32(a,5)
1303         rorl    $2, %eax                # b = rotl32(b,30)
1304 # 67
1305         movl    %eax, %edi              # c
1306         xorl    %ebx, %edi              # ^d
1307         xorl    %ebp, %edi              # ^b
1308         addl    -64+4*3(%rsp), %ecx     # e += RCONST + W[n & 15]
1309         addl    %edi, %ecx              # e += (c ^ d ^ b)
1310         movl    %edx, %esi              #
1311         roll    $5, %esi                # rotl32(a,5)
1312         addl    %esi, %ecx              # e += rotl32(a,5)
1313         rorl    $2, %ebp                # b = rotl32(b,30)
1314 # PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp)
1315         movaps  %xmm2, %xmm4
1316         psrldq  $4, %xmm4       # rshift by 4 bytes: T1 = ([13],[14],[15],0)
1317 #       pshufd  $0x4e, %xmm3, %xmm5     # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
1318 #       punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
1319 # same result as above, but shorter and faster:
1320 # pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
1321 # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
1322         movaps  %xmm3, %xmm5
1323         shufps  $0x4e, %xmm0, %xmm5     # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
1324         xorps   %xmm1, %xmm3    # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
1325         xorps   %xmm4, %xmm5    # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
1326         xorps   %xmm5, %xmm3    # ^
1327         # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
1328         movaps  %xmm3, %xmm5
1329         xorps   %xmm4, %xmm4    # rol(W0,1):
1330         pcmpgtd %xmm3, %xmm4    #  ffffffff for elements <0 (ones with msb bit 1)
1331         paddd   %xmm3, %xmm3    #  shift left by 1
1332         psubd   %xmm4, %xmm3    #  add 1 to those who had msb bit 1
1333         # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
1334         pslldq  $12, %xmm5      # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
1335         movaps  %xmm5, %xmm4
1336         pslld   $2, %xmm5
1337         psrld   $30, %xmm4
1338 #       xorps   %xmm4, %xmm5    # rol((0,0,0,unrotW[0]),2)
1339         xorps   %xmm4, %xmm3    # same result, but does not depend on/does not modify T2
1340         xorps   %xmm5, %xmm3    # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
1341         movaps  %xmm3, %xmm5
1342         paddd   %xmm6, %xmm5
1343         movups  %xmm5, -64+16*3(%rsp)
1344 # 68
1345         movl    %ebp, %edi              # c
1346         xorl    %eax, %edi              # ^d
1347         xorl    %edx, %edi              # ^b
1348         addl    -64+4*4(%rsp), %ebx     # e += RCONST + W[n & 15]
1349         addl    %edi, %ebx              # e += (c ^ d ^ b)
1350         movl    %ecx, %esi              #
1351         roll    $5, %esi                # rotl32(a,5)
1352         addl    %esi, %ebx              # e += rotl32(a,5)
1353         rorl    $2, %edx                # b = rotl32(b,30)
1354 # 69
1355         movl    %edx, %edi              # c
1356         xorl    %ebp, %edi              # ^d
1357         xorl    %ecx, %edi              # ^b
1358         addl    -64+4*5(%rsp), %eax     # e += RCONST + W[n & 15]
1359         addl    %edi, %eax              # e += (c ^ d ^ b)
1360         movl    %ebx, %esi              #
1361         roll    $5, %esi                # rotl32(a,5)
1362         addl    %esi, %eax              # e += rotl32(a,5)
1363         rorl    $2, %ecx                # b = rotl32(b,30)
1364 # 70
1365         movl    %ecx, %edi              # c
1366         xorl    %edx, %edi              # ^d
1367         xorl    %ebx, %edi              # ^b
1368         addl    -64+4*6(%rsp), %ebp     # e += RCONST + W[n & 15]
1369         addl    %edi, %ebp              # e += (c ^ d ^ b)
1370         movl    %eax, %esi              #
1371         roll    $5, %esi                # rotl32(a,5)
1372         addl    %esi, %ebp              # e += rotl32(a,5)
1373         rorl    $2, %ebx                # b = rotl32(b,30)
1374 # 71
1375         movl    %ebx, %edi              # c
1376         xorl    %ecx, %edi              # ^d
1377         xorl    %eax, %edi              # ^b
1378         addl    -64+4*7(%rsp), %edx     # e += RCONST + W[n & 15]
1379         addl    %edi, %edx              # e += (c ^ d ^ b)
1380         movl    %ebp, %esi              #
1381         roll    $5, %esi                # rotl32(a,5)
1382         addl    %esi, %edx              # e += rotl32(a,5)
1383         rorl    $2, %eax                # b = rotl32(b,30)
1384 # 72
1385         movl    %eax, %edi              # c
1386         xorl    %ebx, %edi              # ^d
1387         xorl    %ebp, %edi              # ^b
1388         addl    -64+4*8(%rsp), %ecx     # e += RCONST + W[n & 15]
1389         addl    %edi, %ecx              # e += (c ^ d ^ b)
1390         movl    %edx, %esi              #
1391         roll    $5, %esi                # rotl32(a,5)
1392         addl    %esi, %ecx              # e += rotl32(a,5)
1393         rorl    $2, %ebp                # b = rotl32(b,30)
1394 # 73
1395         movl    %ebp, %edi              # c
1396         xorl    %eax, %edi              # ^d
1397         xorl    %edx, %edi              # ^b
1398         addl    -64+4*9(%rsp), %ebx     # e += RCONST + W[n & 15]
1399         addl    %edi, %ebx              # e += (c ^ d ^ b)
1400         movl    %ecx, %esi              #
1401         roll    $5, %esi                # rotl32(a,5)
1402         addl    %esi, %ebx              # e += rotl32(a,5)
1403         rorl    $2, %edx                # b = rotl32(b,30)
1404 # 74
1405         movl    %edx, %edi              # c
1406         xorl    %ebp, %edi              # ^d
1407         xorl    %ecx, %edi              # ^b
1408         addl    -64+4*10(%rsp), %eax    # e += RCONST + W[n & 15]
1409         addl    %edi, %eax              # e += (c ^ d ^ b)
1410         movl    %ebx, %esi              #
1411         roll    $5, %esi                # rotl32(a,5)
1412         addl    %esi, %eax              # e += rotl32(a,5)
1413         rorl    $2, %ecx                # b = rotl32(b,30)
1414 # 75
1415         movl    %ecx, %edi              # c
1416         xorl    %edx, %edi              # ^d
1417         xorl    %ebx, %edi              # ^b
1418         addl    -64+4*11(%rsp), %ebp    # e += RCONST + W[n & 15]
1419         addl    %edi, %ebp              # e += (c ^ d ^ b)
1420         movl    %eax, %esi              #
1421         roll    $5, %esi                # rotl32(a,5)
1422         addl    %esi, %ebp              # e += rotl32(a,5)
1423         rorl    $2, %ebx                # b = rotl32(b,30)
1424 # 76
1425         movl    %ebx, %edi              # c
1426         xorl    %ecx, %edi              # ^d
1427         xorl    %eax, %edi              # ^b
1428         addl    -64+4*12(%rsp), %edx    # e += RCONST + W[n & 15]
1429         addl    %edi, %edx              # e += (c ^ d ^ b)
1430         movl    %ebp, %esi              #
1431         roll    $5, %esi                # rotl32(a,5)
1432         addl    %esi, %edx              # e += rotl32(a,5)
1433         rorl    $2, %eax                # b = rotl32(b,30)
1434 # 77
1435         movl    %eax, %edi              # c
1436         xorl    %ebx, %edi              # ^d
1437         xorl    %ebp, %edi              # ^b
1438         addl    -64+4*13(%rsp), %ecx    # e += RCONST + W[n & 15]
1439         addl    %edi, %ecx              # e += (c ^ d ^ b)
1440         movl    %edx, %esi              #
1441         roll    $5, %esi                # rotl32(a,5)
1442         addl    %esi, %ecx              # e += rotl32(a,5)
1443         rorl    $2, %ebp                # b = rotl32(b,30)
1444 # 78
1445         movl    %ebp, %edi              # c
1446         xorl    %eax, %edi              # ^d
1447         xorl    %edx, %edi              # ^b
1448         addl    -64+4*14(%rsp), %ebx    # e += RCONST + W[n & 15]
1449         addl    %edi, %ebx              # e += (c ^ d ^ b)
1450         movl    %ecx, %esi              #
1451         roll    $5, %esi                # rotl32(a,5)
1452         addl    %esi, %ebx              # e += rotl32(a,5)
1453         rorl    $2, %edx                # b = rotl32(b,30)
1454 # 79
1455         movl    %edx, %edi              # c
1456         xorl    %ebp, %edi              # ^d
1457         xorl    %ecx, %edi              # ^b
1458         addl    -64+4*15(%rsp), %eax    # e += RCONST + W[n & 15]
1459         addl    %edi, %eax              # e += (c ^ d ^ b)
1460         movl    %ebx, %esi              #
1461         roll    $5, %esi                # rotl32(a,5)
1462         addl    %esi, %eax              # e += rotl32(a,5)
1463         rorl    $2, %ecx                # b = rotl32(b,30)
1464
1465         popq    %rdi            #
1466         popq    %r12            #
1467         addl    %eax, 80(%rdi)  # ctx->hash[0] += a
1468         popq    %r13            #
1469         addl    %ebx, 84(%rdi)  # ctx->hash[1] += b
1470         popq    %r14            #
1471         addl    %ecx, 88(%rdi)  # ctx->hash[2] += c
1472 #       popq    %r15            #
1473         addl    %edx, 92(%rdi)  # ctx->hash[3] += d
1474         popq    %rbx            #
1475         addl    %ebp, 96(%rdi)  # ctx->hash[4] += e
1476         popq    %rbp            #
1477
1478         ret
1479         .size   sha1_process_block64, .-sha1_process_block64
1480
1481         .section        .rodata.cst16.sha1const, "aM", @progbits, 16	# read-only data; "M" = mergeable section with 16-byte entities
1482         .balign 16	# the table is fetched with movaps, which needs a 16-byte aligned address
1483 sha1const:	# four 32-bit SHA-1 round constants, loaded as one vector into %xmm7
1484         .long   0x5A827999	# dword 0: FIPS 180-4 SHA-1 K for t = 0..19 (broadcast into %xmm6 at function start)
1485         .long   0x6ED9EBA1	# dword 1: FIPS 180-4 SHA-1 K for t = 20..39
1486         .long   0x8F1BBCDC	# dword 2: FIPS 180-4 SHA-1 K for t = 40..59
1487         .long   0xCA62C1D6	# dword 3: FIPS 180-4 SHA-1 K for t = 60..79
1488
1489 #endif