// Copyright (c) 2017 The Bitcoin Core developers
// Distributed under the MIT software license, see the accompanying
// file COPYING or http://www.opensource.org/licenses/mit-license.php.
// This is a translation to GCC extended asm syntax from YASM code by Intel
// (available at the bottom of this file).
#if defined(__x86_64__) || defined(__amd64__)
void Transform(uint32_t* s, const unsigned char* chunk, size_t blocks)
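// Compresses `blocks` consecutive 64-byte chunks at `chunk` into the eight
// 32-bit state words at `s` (the SHA-256 block transform; padding and
// finalization are handled by the caller).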
static const uint32_t K256 alignas(16) [] = {
0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
static const uint32_t FLIP_MASK alignas(16) [] = {0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f};
static const uint32_t SHUF_00BA alignas(16) [] = {0x03020100, 0x0b0a0908, 0xffffffff, 0xffffffff};
static const uint32_t SHUF_DC00 alignas(16) [] = {0xffffffff, 0xffffffff, 0x03020100, 0x0b0a0908};
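// All three constants are pshufb masks: FLIP_MASK reverses the bytes of each
// 32-bit word (the message is big-endian), SHUF_00BA moves dwords {x,B,x,A}
// to {0,0,B,A}, and SHUF_DC00 moves dwords {x,D,x,C} to {D,C,0,0}, gathering
// the two valid s1 lanes produced by the psrlq-based sigma1 step below.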
uint32_t a, b, c, d, f, g, h, y0, y1, y2;
uint64_t inp_end, inp;
uint32_t xfer alignas(16) [4];
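// Register roles in the asm below: xmm4..xmm7 carry the four message-schedule
// vectors (X0..X3 in the YASM source; their roles rotate every 16 rounds),
// xmm12 holds FLIP_MASK, xmm10/xmm11 hold SHUF_00BA/SHUF_DC00, and xmm9
// stages K256[i] + W[i] sums that are spilled to `xfer` for the scalar round
// code. Operand %1 is `chunk`; %13 is `tbl`, which points into the K256 table.
// The movdqu/pshufb pairs load the 16 message dwords and byte-swap them to
// host order.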
63 "pshufb %%xmm12,%%xmm4;"
64 "movdqu 0x10(%1),%%xmm5;"
65 "pshufb %%xmm12,%%xmm5;"
66 "movdqu 0x20(%1),%%xmm6;"
67 "pshufb %%xmm12,%%xmm6;"
68 "movdqu 0x30(%1),%%xmm7;"
69 "pshufb %%xmm12,%%xmm7;"
74 "movdqa 0x0(%13),%%xmm9;"
75 "paddd %%xmm4,%%xmm9;"
77 "movdqa %%xmm7,%%xmm0;"
81 "palignr $0x4,%%xmm6,%%xmm0;"
86 "movdqa %%xmm5,%%xmm1;"
89 "paddd %%xmm4,%%xmm0;"
93 "palignr $0x4,%%xmm4,%%xmm1;"
97 "movdqa %%xmm1,%%xmm2;"
101 "movdqa %%xmm1,%%xmm3;"
105 "pslld $0x19,%%xmm1;"
115 "movdqa %%xmm3,%%xmm2;"
118 "movdqa %%xmm3,%%xmm8;"
127 "psrld $0x12,%%xmm2;"
132 "pxor %%xmm3,%%xmm1;"
139 "pxor %%xmm2,%%xmm1;"
143 "pxor %%xmm8,%%xmm1;"
147 "pshufd $0xfa,%%xmm7,%%xmm2;"
150 "paddd %%xmm1,%%xmm0;"
153 "movdqa %%xmm2,%%xmm3;"
157 "movdqa %%xmm2,%%xmm8;"
163 "psrlq $0x11,%%xmm2;"
165 "psrlq $0x13,%%xmm3;"
173 "pxor %%xmm3,%%xmm2;"
177 "pxor %%xmm2,%%xmm8;"
181 "pshufb %%xmm10,%%xmm8;"
185 "paddd %%xmm8,%%xmm0;"
188 "pshufd $0x50,%%xmm0,%%xmm2;"
191 "movdqa %%xmm2,%%xmm3;"
195 "movdqa %%xmm2,%%xmm4;"
200 "psrlq $0x11,%%xmm2;"
203 "psrlq $0x13,%%xmm3;"
211 "pxor %%xmm3,%%xmm2;"
215 "pxor %%xmm2,%%xmm4;"
219 "pshufb %%xmm11,%%xmm4;"
223 "paddd %%xmm0,%%xmm4;"
228 "movdqa 0x10(%13),%%xmm9;"
229 "paddd %%xmm5,%%xmm9;"
231 "movdqa %%xmm4,%%xmm0;"
235 "palignr $0x4,%%xmm7,%%xmm0;"
240 "movdqa %%xmm6,%%xmm1;"
243 "paddd %%xmm5,%%xmm0;"
247 "palignr $0x4,%%xmm5,%%xmm1;"
251 "movdqa %%xmm1,%%xmm2;"
255 "movdqa %%xmm1,%%xmm3;"
259 "pslld $0x19,%%xmm1;"
269 "movdqa %%xmm3,%%xmm2;"
272 "movdqa %%xmm3,%%xmm8;"
281 "psrld $0x12,%%xmm2;"
286 "pxor %%xmm3,%%xmm1;"
293 "pxor %%xmm2,%%xmm1;"
297 "pxor %%xmm8,%%xmm1;"
301 "pshufd $0xfa,%%xmm4,%%xmm2;"
304 "paddd %%xmm1,%%xmm0;"
307 "movdqa %%xmm2,%%xmm3;"
311 "movdqa %%xmm2,%%xmm8;"
317 "psrlq $0x11,%%xmm2;"
319 "psrlq $0x13,%%xmm3;"
327 "pxor %%xmm3,%%xmm2;"
331 "pxor %%xmm2,%%xmm8;"
335 "pshufb %%xmm10,%%xmm8;"
339 "paddd %%xmm8,%%xmm0;"
342 "pshufd $0x50,%%xmm0,%%xmm2;"
345 "movdqa %%xmm2,%%xmm3;"
349 "movdqa %%xmm2,%%xmm5;"
354 "psrlq $0x11,%%xmm2;"
357 "psrlq $0x13,%%xmm3;"
365 "pxor %%xmm3,%%xmm2;"
369 "pxor %%xmm2,%%xmm5;"
373 "pshufb %%xmm11,%%xmm5;"
377 "paddd %%xmm0,%%xmm5;"
382 "movdqa 0x20(%13),%%xmm9;"
383 "paddd %%xmm6,%%xmm9;"
385 "movdqa %%xmm5,%%xmm0;"
389 "palignr $0x4,%%xmm4,%%xmm0;"
394 "movdqa %%xmm7,%%xmm1;"
397 "paddd %%xmm6,%%xmm0;"
401 "palignr $0x4,%%xmm6,%%xmm1;"
405 "movdqa %%xmm1,%%xmm2;"
409 "movdqa %%xmm1,%%xmm3;"
413 "pslld $0x19,%%xmm1;"
423 "movdqa %%xmm3,%%xmm2;"
426 "movdqa %%xmm3,%%xmm8;"
435 "psrld $0x12,%%xmm2;"
440 "pxor %%xmm3,%%xmm1;"
447 "pxor %%xmm2,%%xmm1;"
451 "pxor %%xmm8,%%xmm1;"
455 "pshufd $0xfa,%%xmm5,%%xmm2;"
458 "paddd %%xmm1,%%xmm0;"
461 "movdqa %%xmm2,%%xmm3;"
465 "movdqa %%xmm2,%%xmm8;"
471 "psrlq $0x11,%%xmm2;"
473 "psrlq $0x13,%%xmm3;"
481 "pxor %%xmm3,%%xmm2;"
485 "pxor %%xmm2,%%xmm8;"
489 "pshufb %%xmm10,%%xmm8;"
493 "paddd %%xmm8,%%xmm0;"
496 "pshufd $0x50,%%xmm0,%%xmm2;"
499 "movdqa %%xmm2,%%xmm3;"
503 "movdqa %%xmm2,%%xmm6;"
508 "psrlq $0x11,%%xmm2;"
511 "psrlq $0x13,%%xmm3;"
519 "pxor %%xmm3,%%xmm2;"
523 "pxor %%xmm2,%%xmm6;"
527 "pshufb %%xmm11,%%xmm6;"
531 "paddd %%xmm0,%%xmm6;"
536 "movdqa 0x30(%13),%%xmm9;"
537 "paddd %%xmm7,%%xmm9;"
540 "movdqa %%xmm6,%%xmm0;"
544 "palignr $0x4,%%xmm5,%%xmm0;"
549 "movdqa %%xmm4,%%xmm1;"
552 "paddd %%xmm7,%%xmm0;"
556 "palignr $0x4,%%xmm7,%%xmm1;"
560 "movdqa %%xmm1,%%xmm2;"
564 "movdqa %%xmm1,%%xmm3;"
568 "pslld $0x19,%%xmm1;"
578 "movdqa %%xmm3,%%xmm2;"
581 "movdqa %%xmm3,%%xmm8;"
590 "psrld $0x12,%%xmm2;"
595 "pxor %%xmm3,%%xmm1;"
602 "pxor %%xmm2,%%xmm1;"
606 "pxor %%xmm8,%%xmm1;"
610 "pshufd $0xfa,%%xmm6,%%xmm2;"
613 "paddd %%xmm1,%%xmm0;"
616 "movdqa %%xmm2,%%xmm3;"
620 "movdqa %%xmm2,%%xmm8;"
626 "psrlq $0x11,%%xmm2;"
628 "psrlq $0x13,%%xmm3;"
636 "pxor %%xmm3,%%xmm2;"
640 "pxor %%xmm2,%%xmm8;"
644 "pshufb %%xmm10,%%xmm8;"
648 "paddd %%xmm8,%%xmm0;"
651 "pshufd $0x50,%%xmm0,%%xmm2;"
654 "movdqa %%xmm2,%%xmm3;"
658 "movdqa %%xmm2,%%xmm7;"
663 "psrlq $0x11,%%xmm2;"
666 "psrlq $0x13,%%xmm3;"
674 "pxor %%xmm3,%%xmm2;"
678 "pxor %%xmm2,%%xmm7;"
682 "pshufb %%xmm11,%%xmm7;"
686 "paddd %%xmm0,%%xmm7;"
696 "paddd 0x0(%13),%%xmm4;"
810 "paddd 0x10(%13),%%xmm5;"
925 "movdqa %%xmm6,%%xmm4;"
926 "movdqa %%xmm7,%%xmm5;"
952 : "+r"(s
), "+r"(chunk
), "+r"(blocks
), "=r"(a
), "=r"(b
), "=r"(c
), "=r"(d
), /* e = chunk */ "=r"(f
), "=r"(g
), "=r"(h
), "=r"(y0
), "=r"(y1
), "=r"(y2
), "=r"(tbl
), "+m"(inp_end
), "+m"(inp
), "+m"(xfer
)
953 : "m"(K256
), "m"(FLIP_MASK
), "m"(SHUF_00BA
), "m"(SHUF_DC00
)
954 : "cc", "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12"
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copyright (c) 2012, Intel Corporation
; All rights reserved.
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are
; * Redistributions of source code must retain the above copyright
; notice, this list of conditions and the following disclaimer.
; * Redistributions in binary form must reproduce the above copyright
; notice, this list of conditions and the following disclaimer in the
; documentation and/or other materials provided with the
; * Neither the name of the Intel Corporation nor the names of its
; contributors may be used to endorse or promote products derived from
; this software without specific prior written permission.
; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
; PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Example YASM command lines:
; Windows: yasm -Xvc -f x64 -rnasm -pnasm -o sha256_sse4.obj -g cv8 sha256_sse4.asm
; Linux: yasm -f x64 -f elf64 -X gnu -g dwarf2 -D LINUX -o sha256_sse4.o sha256_sse4.asm
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; This code is described in an Intel White-Paper:
; "Fast SHA-256 Implementations on Intel Architecture Processors"
; To find it, surf to http://www.intel.com/p/en_US/embedded
; and search for that title.
; The paper is expected to be released roughly at the end of April, 2012
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; This code schedules 1 block at a time, with 4 lanes per block
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%define MOVDQ movdqu ;; assume buffers not aligned
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros
; Add reg to mem using reg-mem add and store
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
; Load xmm with mem and byte swap each dword
%macro COPY_XMM_AND_BSWAP 3
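; In effect (cf. the movdqu/pshufb pairs in the GCC translation above):
;     MOVDQ  %1, %2   ; unaligned 16-byte load
;     pshufb %1, %3   ; byte swap each dword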
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%define SHUF_00BA xmm10 ; shuffle xBxA -> 00BA
%define SHUF_DC00 xmm11 ; shuffle xDxC -> DC00
%define BYTE_FLIP_MASK xmm12
%define NUM_BLKS rdx ; 3rd arg
%define CTX rsi ; 2nd arg
%define INP rdi ; 1st arg
%define SRND rdi ; clobbers INP
%define NUM_BLKS r8 ; 3rd arg
%define CTX rdx ; 2nd arg
%define INP rcx ; 1st arg
%define SRND rcx ; clobbers INP
_XMM_SAVE_SIZE equ 0
_XMM_SAVE_SIZE equ 7*16
; STACK_SIZE plus pushes must be an odd multiple of 8
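; (the CALL pushed an 8-byte return address onto a 16-byte aligned stack, so an
; odd multiple of 8 here restores 16-byte alignment for the aligned movdqa
; saves and loads below)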
_INP equ _INP_END + _INP_END_SIZE
_XFER equ _INP + _INP_SIZE
_XMM_SAVE equ _XFER + _XFER_SIZE + _ALIGN_SIZE
STACK_SIZE equ _XMM_SAVE + _XMM_SAVE_SIZE
; Rotate values of symbols X0...X3
; Rotate values of symbols a...h
%macro ROTATE_ARGS 0
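; In effect the macro renames the round variables down one position after each
; round, so the symbols keep their usual meaning: a takes the register that
; held h, b takes a's, c takes b's, d takes c's, e takes d's, f takes e's,
; g takes f's, and h takes g's.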
%macro FOUR_ROUNDS_AND_SCHED 0
;; compute s0 four at a time and s1 two at a time
;; compute W[-16] + W[-7] 4 at a time
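;; For reference, the scalar schedule being vectorized here (standard SHA-256):
;;     s0   = (W[-15] ror 7)  ^ (W[-15] ror 18) ^ (W[-15] >> 3)
;;     s1   = (W[-2]  ror 17) ^ (W[-2]  ror 19) ^ (W[-2]  >> 10)
;;     W[0] = W[-16] + s0 + W[-7] + s1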
ror y0, (25-11) ; y0 = e >> (25-11)
palignr XTMP0, X2, 4 ; XTMP0 = W[-7]
ror y1, (22-13) ; y1 = a >> (22-13)
xor y0, e ; y0 = e ^ (e >> (25-11))
ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
xor y1, a ; y1 = a ^ (a >> (22-13))
xor y2, g ; y2 = f^g
paddd XTMP0, X0 ; XTMP0 = W[-7] + W[-16]
xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
and y2, e ; y2 = (f^g)&e
ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
palignr XTMP1, X0, 4 ; XTMP1 = W[-15]
xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
ror y0, 6 ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
xor y2, g ; y2 = CH = ((f^g)&e)^g
movdqa XTMP2, XTMP1 ; XTMP2 = W[-15]
ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
add y2, y0 ; y2 = S1 + CH
add y2, [rsp + _XFER + 0*4] ; y2 = k + w + S1 + CH
movdqa XTMP3, XTMP1 ; XTMP3 = W[-15]
add h, y2 ; h = h + S1 + CH + k + w
add d, h ; d = d + h + S1 + CH + k + w
and y2, c ; y2 = a&c
and y0, b ; y0 = (a|c)&b
add h, y1 ; h = h + S1 + CH + k + w + S0
por XTMP1, XTMP2 ; XTMP1 = W[-15] ror 7
or y0, y2 ; y0 = MAJ = ((a|c)&b)|(a&c)
add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
movdqa XTMP2, XTMP3 ; XTMP2 = W[-15]
movdqa XTMP4, XTMP3 ; XTMP4 = W[-15]
ror y0, (25-11) ; y0 = e >> (25-11)
xor y0, e ; y0 = e ^ (e >> (25-11))
ror y1, (22-13) ; y1 = a >> (22-13)
pslld XTMP3, (32-18)
xor y1, a ; y1 = a ^ (a >> (22-13))
ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
xor y2, g ; y2 = f^g
ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
and y2, e ; y2 = (f^g)&e
ror y0, 6 ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
xor y2, g ; y2 = CH = ((f^g)&e)^g
psrld XTMP4, 3 ; XTMP4 = W[-15] >> 3
add y2, y0 ; y2 = S1 + CH
add y2, [rsp + _XFER + 1*4] ; y2 = k + w + S1 + CH
ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
pxor XTMP1, XTMP2 ; XTMP1 = W[-15] ror 7 ^ W[-15] ror 18
add h, y2 ; h = h + S1 + CH + k + w
pxor XTMP1, XTMP4 ; XTMP1 = s0
add d, h ; d = d + h + S1 + CH + k + w
and y2, c ; y2 = a&c
pshufd XTMP2, X3, 11111010b ; XTMP2 = W[-2] {BBAA}
and y0, b ; y0 = (a|c)&b
add h, y1 ; h = h + S1 + CH + k + w + S0
paddd XTMP0, XTMP1 ; XTMP0 = W[-16] + W[-7] + s0
or y0, y2 ; y0 = MAJ = ((a|c)&b)|(a&c)
add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
movdqa XTMP3, XTMP2 ; XTMP3 = W[-2] {BBAA}
ror y0, (25-11) ; y0 = e >> (25-11)
movdqa XTMP4, XTMP2 ; XTMP4 = W[-2] {BBAA}
xor y0, e ; y0 = e ^ (e >> (25-11))
ror y1, (22-13) ; y1 = a >> (22-13)
xor y1, a ; y1 = a ^ (a >> (22-13))
ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
psrlq XTMP2, 17 ; XTMP2 = W[-2] ror 17 {xBxA}
xor y2, g ; y2 = f^g
psrlq XTMP3, 19 ; XTMP3 = W[-2] ror 19 {xBxA}
xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
and y2, e ; y2 = (f^g)&e
psrld XTMP4, 10 ; XTMP4 = W[-2] >> 10 {BBAA}
ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
xor y2, g ; y2 = CH = ((f^g)&e)^g
ror y0, 6 ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
add y2, y0 ; y2 = S1 + CH
ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
add y2, [rsp + _XFER + 2*4] ; y2 = k + w + S1 + CH
pxor XTMP4, XTMP2 ; XTMP4 = s1 {xBxA}
add h, y2 ; h = h + S1 + CH + k + w
pshufb XTMP4, SHUF_00BA ; XTMP4 = s1 {00BA}
add d, h ; d = d + h + S1 + CH + k + w
and y2, c ; y2 = a&c
paddd XTMP0, XTMP4 ; XTMP0 = {..., ..., W[1], W[0]}
and y0, b ; y0 = (a|c)&b
add h, y1 ; h = h + S1 + CH + k + w + S0
pshufd XTMP2, XTMP0, 01010000b ; XTMP2 = W[-2] {DDCC}
or y0, y2 ; y0 = MAJ = ((a|c)&b)|(a&c)
add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
movdqa XTMP3, XTMP2 ; XTMP3 = W[-2] {DDCC}
ror y0, (25-11) ; y0 = e >> (25-11)
movdqa X0, XTMP2 ; X0 = W[-2] {DDCC}
ror y1, (22-13) ; y1 = a >> (22-13)
xor y0, e ; y0 = e ^ (e >> (25-11))
ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
psrlq XTMP2, 17 ; XTMP2 = W[-2] ror 17 {xDxC}
xor y1, a ; y1 = a ^ (a >> (22-13))
xor y2, g ; y2 = f^g
psrlq XTMP3, 19 ; XTMP3 = W[-2] ror 19 {xDxC}
xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
and y2, e ; y2 = (f^g)&e
ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
psrld X0, 10 ; X0 = W[-2] >> 10 {DDCC}
xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
ror y0, 6 ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
xor y2, g ; y2 = CH = ((f^g)&e)^g
ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
add y2, y0 ; y2 = S1 + CH
add y2, [rsp + _XFER + 3*4] ; y2 = k + w + S1 + CH
pxor X0, XTMP2 ; X0 = s1 {xDxC}
add h, y2 ; h = h + S1 + CH + k + w
pshufb X0, SHUF_DC00 ; X0 = s1 {DC00}
add d, h ; d = d + h + S1 + CH + k + w
and y2, c ; y2 = a&c
paddd X0, XTMP0 ; X0 = {W[3], W[2], W[1], W[0]}
and y0, b ; y0 = (a|c)&b
add h, y1 ; h = h + S1 + CH + k + w + S0
or y0, y2 ; y0 = MAJ = ((a|c)&b)|(a&c)
add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
;; input is [rsp + _XFER + %1 * 4]
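;; For reference, the scalar round this macro implements (standard SHA-256;
;; CH and MAJ appear in the equivalent forms noted in the comments):
;;     S1  = (e ror 6) ^ (e ror 11) ^ (e ror 25)
;;     CH  = (e & f) ^ (~e & g)              ; computed as ((f^g)&e)^g
;;     S0  = (a ror 2) ^ (a ror 13) ^ (a ror 22)
;;     MAJ = (a & b) ^ (a & c) ^ (b & c)     ; computed as ((a|c)&b)|(a&c)
;;     h += S1 + CH + k + w ; d += h ; h += S0 + MAJ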
ror y0, (25-11) ; y0 = e >> (25-11)
xor y0, e ; y0 = e ^ (e >> (25-11))
ror y1, (22-13) ; y1 = a >> (22-13)
xor y1, a ; y1 = a ^ (a >> (22-13))
ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
xor y2, g ; y2 = f^g
xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
and y2, e ; y2 = (f^g)&e
xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
ror y0, 6 ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
xor y2, g ; y2 = CH = ((f^g)&e)^g
add y2, y0 ; y2 = S1 + CH
ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
add y2, [rsp + _XFER + %1 * 4] ; y2 = k + w + S1 + CH
add h, y2 ; h = h + S1 + CH + k + w
add d, h ; d = d + h + S1 + CH + k + w
and y2, c ; y2 = a&c
and y0, b ; y0 = (a|c)&b
add h, y1 ; h = h + S1 + CH + k + w + S0
or y0, y2 ; y0 = MAJ = ((a|c)&b)|(a&c)
add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; void sha256_sse4(void *input_data, UINT32 digest[8], UINT64 num_blks)
;; arg 1 : pointer to input data
;; arg 2 : pointer to digest
;; arg 3 : Num blocks
movdqa [rsp + _XMM_SAVE + 0*16],xmm6
movdqa [rsp + _XMM_SAVE + 1*16],xmm7
movdqa [rsp + _XMM_SAVE + 2*16],xmm8
movdqa [rsp + _XMM_SAVE + 3*16],xmm9
movdqa [rsp + _XMM_SAVE + 4*16],xmm10
movdqa [rsp + _XMM_SAVE + 5*16],xmm11
movdqa [rsp + _XMM_SAVE + 6*16],xmm12
shl NUM_BLKS, 6 ; convert to bytes
add NUM_BLKS, INP ; pointer to end of data
mov [rsp + _INP_END], NUM_BLKS
;; load initial digest
movdqa BYTE_FLIP_MASK, [PSHUFFLE_BYTE_FLIP_MASK wrt rip]
movdqa SHUF_00BA, [_SHUF_00BA wrt rip]
movdqa SHUF_DC00, [_SHUF_DC00 wrt rip]
lea TBL,[K256 wrt rip]
;; byte swap first 16 dwords
COPY_XMM_AND_BSWAP X0, [INP + 0*16], BYTE_FLIP_MASK
COPY_XMM_AND_BSWAP X1, [INP + 1*16], BYTE_FLIP_MASK
COPY_XMM_AND_BSWAP X2, [INP + 2*16], BYTE_FLIP_MASK
COPY_XMM_AND_BSWAP X3, [INP + 3*16], BYTE_FLIP_MASK
mov [rsp + _INP], INP
;; schedule 48 input dwords, by doing 3 rounds of 16 each
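;; (each FOUR_ROUNDS_AND_SCHED below consumes the four k+w values staged in
;; _XFER and schedules four new message words, so four invocations cover 16
;; rounds; three passes cover rounds 0..47, and the final 16 rounds reuse the
;; already-scheduled X0..X3 via DO_ROUND)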
movdqa XFER, [TBL + 0*16]
movdqa [rsp + _XFER], XFER
FOUR_ROUNDS_AND_SCHED
movdqa XFER, [TBL + 1*16]
movdqa [rsp + _XFER], XFER
FOUR_ROUNDS_AND_SCHED
movdqa XFER, [TBL + 2*16]
movdqa [rsp + _XFER], XFER
FOUR_ROUNDS_AND_SCHED
movdqa XFER, [TBL + 3*16]
movdqa [rsp + _XFER], XFER
FOUR_ROUNDS_AND_SCHED
paddd X0, [TBL + 0*16]
movdqa [rsp + _XFER], X0
paddd X1, [TBL + 1*16]
movdqa [rsp + _XFER], X1
mov INP, [rsp + _INP]
cmp INP, [rsp + _INP_END]
movdqa xmm6,[rsp + _XMM_SAVE + 0*16]
movdqa xmm7,[rsp + _XMM_SAVE + 1*16]
movdqa xmm8,[rsp + _XMM_SAVE + 2*16]
movdqa xmm9,[rsp + _XMM_SAVE + 3*16]
movdqa xmm10,[rsp + _XMM_SAVE + 4*16]
movdqa xmm11,[rsp + _XMM_SAVE + 5*16]
movdqa xmm12,[rsp + _XMM_SAVE + 6*16]
dd 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
dd 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
dd 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
dd 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
dd 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
dd 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
dd 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
dd 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
dd 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
dd 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
dd 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
dd 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
dd 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
dd 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
dd 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
dd 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
PSHUFFLE_BYTE_FLIP_MASK: ddq 0x0c0d0e0f08090a0b0405060700010203
; shuffle xBxA -> 00BA
_SHUF_00BA: ddq 0xFFFFFFFFFFFFFFFF0b0a090803020100
; shuffle xDxC -> DC00
_SHUF_DC00: ddq 0x0b0a090803020100FFFFFFFFFFFFFFFF