/*
 * RISC-V Vector Crypto Extension Helpers for QEMU.
 *
 * Copyright (C) 2023 SiFive, Inc.
 * Written by Codethink Ltd and SiFive.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2 or later, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program.  If not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "qemu/host-utils.h"
#include "qemu/bitops.h"
#include "qemu/bswap.h"
#include "cpu.h"
#include "crypto/aes.h"
#include "crypto/aes-round.h"
#include "crypto/sm4.h"
#include "exec/memop.h"
#include "exec/exec-all.h"
#include "exec/helper-proto.h"
#include "internals.h"
#include "vector_internals.h"
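
/*
 * Carry-less multiply over GF(2): a schoolbook shift-and-XOR loop, e.g.
 * clmul(0b101, 0b11) = 0b1111.  clmul64() returns the low 64 bits of the
 * 128-bit product and clmulh64() the high 64 bits, as used by the Zvbc
 * vclmul/vclmulh instructions.
 */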
static uint64_t clmul64(uint64_t y, uint64_t x)
{
    uint64_t result = 0;

    for (int j = 63; j >= 0; j--) {
        if ((y >> j) & 1) {
            result ^= (x << j);
        }
    }
    return result;
}

static uint64_t clmulh64(uint64_t y, uint64_t x)
{
    uint64_t result = 0;

    for (int j = 63; j >= 1; j--) {
        if ((y >> j) & 1) {
            result ^= (x >> (64 - j));
        }
    }
    return result;
}

RVVCALL(OPIVV2, vclmul_vv, OP_UUU_D, H8, H8, H8, clmul64)
GEN_VEXT_VV(vclmul_vv, 8)
RVVCALL(OPIVX2, vclmul_vx, OP_UUU_D, H8, H8, clmul64)
GEN_VEXT_VX(vclmul_vx, 8)
RVVCALL(OPIVV2, vclmulh_vv, OP_UUU_D, H8, H8, H8, clmulh64)
GEN_VEXT_VV(vclmulh_vv, 8)
RVVCALL(OPIVX2, vclmulh_vx, OP_UUU_D, H8, H8, clmulh64)
GEN_VEXT_VX(vclmulh_vx, 8)

RVVCALL(OPIVV2, vror_vv_b, OP_UUU_B, H1, H1, H1, ror8)
RVVCALL(OPIVV2, vror_vv_h, OP_UUU_H, H2, H2, H2, ror16)
RVVCALL(OPIVV2, vror_vv_w, OP_UUU_W, H4, H4, H4, ror32)
RVVCALL(OPIVV2, vror_vv_d, OP_UUU_D, H8, H8, H8, ror64)
GEN_VEXT_VV(vror_vv_b, 1)
GEN_VEXT_VV(vror_vv_h, 2)
GEN_VEXT_VV(vror_vv_w, 4)
GEN_VEXT_VV(vror_vv_d, 8)

RVVCALL(OPIVX2, vror_vx_b, OP_UUU_B, H1, H1, ror8)
RVVCALL(OPIVX2, vror_vx_h, OP_UUU_H, H2, H2, ror16)
RVVCALL(OPIVX2, vror_vx_w, OP_UUU_W, H4, H4, ror32)
RVVCALL(OPIVX2, vror_vx_d, OP_UUU_D, H8, H8, ror64)
GEN_VEXT_VX(vror_vx_b, 1)
GEN_VEXT_VX(vror_vx_h, 2)
GEN_VEXT_VX(vror_vx_w, 4)
GEN_VEXT_VX(vror_vx_d, 8)

RVVCALL(OPIVV2, vrol_vv_b, OP_UUU_B, H1, H1, H1, rol8)
RVVCALL(OPIVV2, vrol_vv_h, OP_UUU_H, H2, H2, H2, rol16)
RVVCALL(OPIVV2, vrol_vv_w, OP_UUU_W, H4, H4, H4, rol32)
RVVCALL(OPIVV2, vrol_vv_d, OP_UUU_D, H8, H8, H8, rol64)
GEN_VEXT_VV(vrol_vv_b, 1)
GEN_VEXT_VV(vrol_vv_h, 2)
GEN_VEXT_VV(vrol_vv_w, 4)
GEN_VEXT_VV(vrol_vv_d, 8)

RVVCALL(OPIVX2, vrol_vx_b, OP_UUU_B, H1, H1, rol8)
RVVCALL(OPIVX2, vrol_vx_h, OP_UUU_H, H2, H2, rol16)
RVVCALL(OPIVX2, vrol_vx_w, OP_UUU_W, H4, H4, rol32)
RVVCALL(OPIVX2, vrol_vx_d, OP_UUU_D, H8, H8, rol64)
GEN_VEXT_VX(vrol_vx_b, 1)
GEN_VEXT_VX(vrol_vx_h, 2)
GEN_VEXT_VX(vrol_vx_w, 4)
GEN_VEXT_VX(vrol_vx_d, 8)
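
/*
 * Reverse the bits within each byte of a 64-bit value: swap adjacent
 * bits, then bit pairs, then nibbles.  Used by vbrev8 and by the GHASH
 * helpers further down to convert between bit orderings.
 */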
static uint64_t brev8(uint64_t val)
{
    val = ((val & 0x5555555555555555ull) << 1) |
          ((val & 0xAAAAAAAAAAAAAAAAull) >> 1);
    val = ((val & 0x3333333333333333ull) << 2) |
          ((val & 0xCCCCCCCCCCCCCCCCull) >> 2);
    val = ((val & 0x0F0F0F0F0F0F0F0Full) << 4) |
          ((val & 0xF0F0F0F0F0F0F0F0ull) >> 4);

    return val;
}

RVVCALL(OPIVV1, vbrev8_v_b, OP_UU_B, H1, H1, brev8)
RVVCALL(OPIVV1, vbrev8_v_h, OP_UU_H, H2, H2, brev8)
RVVCALL(OPIVV1, vbrev8_v_w, OP_UU_W, H4, H4, brev8)
RVVCALL(OPIVV1, vbrev8_v_d, OP_UU_D, H8, H8, brev8)
GEN_VEXT_V(vbrev8_v_b, 1)
GEN_VEXT_V(vbrev8_v_h, 2)
GEN_VEXT_V(vbrev8_v_w, 4)
GEN_VEXT_V(vbrev8_v_d, 8)

#define DO_IDENTITY(a) (a)
RVVCALL(OPIVV1, vrev8_v_b, OP_UU_B, H1, H1, DO_IDENTITY)
RVVCALL(OPIVV1, vrev8_v_h, OP_UU_H, H2, H2, bswap16)
RVVCALL(OPIVV1, vrev8_v_w, OP_UU_W, H4, H4, bswap32)
RVVCALL(OPIVV1, vrev8_v_d, OP_UU_D, H8, H8, bswap64)
GEN_VEXT_V(vrev8_v_b, 1)
GEN_VEXT_V(vrev8_v_h, 2)
GEN_VEXT_V(vrev8_v_w, 4)
GEN_VEXT_V(vrev8_v_d, 8)

#define DO_ANDN(a, b) ((a) & ~(b))
RVVCALL(OPIVV2, vandn_vv_b, OP_UUU_B, H1, H1, H1, DO_ANDN)
RVVCALL(OPIVV2, vandn_vv_h, OP_UUU_H, H2, H2, H2, DO_ANDN)
RVVCALL(OPIVV2, vandn_vv_w, OP_UUU_W, H4, H4, H4, DO_ANDN)
RVVCALL(OPIVV2, vandn_vv_d, OP_UUU_D, H8, H8, H8, DO_ANDN)
GEN_VEXT_VV(vandn_vv_b, 1)
GEN_VEXT_VV(vandn_vv_h, 2)
GEN_VEXT_VV(vandn_vv_w, 4)
GEN_VEXT_VV(vandn_vv_d, 8)

RVVCALL(OPIVX2, vandn_vx_b, OP_UUU_B, H1, H1, DO_ANDN)
RVVCALL(OPIVX2, vandn_vx_h, OP_UUU_H, H2, H2, DO_ANDN)
RVVCALL(OPIVX2, vandn_vx_w, OP_UUU_W, H4, H4, DO_ANDN)
RVVCALL(OPIVX2, vandn_vx_d, OP_UUU_D, H8, H8, DO_ANDN)
GEN_VEXT_VX(vandn_vx_b, 1)
GEN_VEXT_VX(vandn_vx_h, 2)
GEN_VEXT_VX(vandn_vx_w, 4)
GEN_VEXT_VX(vandn_vx_d, 8)

RVVCALL(OPIVV1, vbrev_v_b, OP_UU_B, H1, H1, revbit8)
RVVCALL(OPIVV1, vbrev_v_h, OP_UU_H, H2, H2, revbit16)
RVVCALL(OPIVV1, vbrev_v_w, OP_UU_W, H4, H4, revbit32)
RVVCALL(OPIVV1, vbrev_v_d, OP_UU_D, H8, H8, revbit64)
GEN_VEXT_V(vbrev_v_b, 1)
GEN_VEXT_V(vbrev_v_h, 2)
GEN_VEXT_V(vbrev_v_w, 4)
GEN_VEXT_V(vbrev_v_d, 8)

RVVCALL(OPIVV1, vclz_v_b, OP_UU_B, H1, H1, clz8)
RVVCALL(OPIVV1, vclz_v_h, OP_UU_H, H2, H2, clz16)
RVVCALL(OPIVV1, vclz_v_w, OP_UU_W, H4, H4, clz32)
RVVCALL(OPIVV1, vclz_v_d, OP_UU_D, H8, H8, clz64)
GEN_VEXT_V(vclz_v_b, 1)
GEN_VEXT_V(vclz_v_h, 2)
GEN_VEXT_V(vclz_v_w, 4)
GEN_VEXT_V(vclz_v_d, 8)

RVVCALL(OPIVV1, vctz_v_b, OP_UU_B, H1, H1, ctz8)
RVVCALL(OPIVV1, vctz_v_h, OP_UU_H, H2, H2, ctz16)
RVVCALL(OPIVV1, vctz_v_w, OP_UU_W, H4, H4, ctz32)
RVVCALL(OPIVV1, vctz_v_d, OP_UU_D, H8, H8, ctz64)
GEN_VEXT_V(vctz_v_b, 1)
GEN_VEXT_V(vctz_v_h, 2)
GEN_VEXT_V(vctz_v_w, 4)
GEN_VEXT_V(vctz_v_d, 8)

RVVCALL(OPIVV1, vcpop_v_b, OP_UU_B, H1, H1, ctpop8)
RVVCALL(OPIVV1, vcpop_v_h, OP_UU_H, H2, H2, ctpop16)
RVVCALL(OPIVV1, vcpop_v_w, OP_UU_W, H4, H4, ctpop32)
RVVCALL(OPIVV1, vcpop_v_d, OP_UU_D, H8, H8, ctpop64)
GEN_VEXT_V(vcpop_v_b, 1)
GEN_VEXT_V(vcpop_v_h, 2)
GEN_VEXT_V(vcpop_v_w, 4)
GEN_VEXT_V(vcpop_v_d, 8)

#define DO_SLL(N, M) (N << (M & (sizeof(N) * 8 - 1)))
RVVCALL(OPIVV2, vwsll_vv_b, WOP_UUU_B, H2, H1, H1, DO_SLL)
RVVCALL(OPIVV2, vwsll_vv_h, WOP_UUU_H, H4, H2, H2, DO_SLL)
RVVCALL(OPIVV2, vwsll_vv_w, WOP_UUU_W, H8, H4, H4, DO_SLL)
GEN_VEXT_VV(vwsll_vv_b, 2)
GEN_VEXT_VV(vwsll_vv_h, 4)
GEN_VEXT_VV(vwsll_vv_w, 8)

RVVCALL(OPIVX2, vwsll_vx_b, WOP_UUU_B, H2, H1, DO_SLL)
RVVCALL(OPIVX2, vwsll_vx_h, WOP_UUU_H, H4, H2, DO_SLL)
RVVCALL(OPIVX2, vwsll_vx_w, WOP_UUU_W, H8, H4, DO_SLL)
GEN_VEXT_VX(vwsll_vx_b, 2)
GEN_VEXT_VX(vwsll_vx_h, 4)
GEN_VEXT_VX(vwsll_vx_w, 8)
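
/*
 * Vector crypto instructions operate on element groups of egs elements.
 * Both vl and vstart must be multiples of the element group size;
 * anything else is an illegal instruction.
 */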
void HELPER(egs_check)(uint32_t egs, CPURISCVState *env)
{
    uint32_t vl = env->vl;
    uint32_t vstart = env->vstart;

    if (vl % egs != 0 || vstart % egs != 0) {
        riscv_raise_exception(env, RISCV_EXCP_ILLEGAL_INST, GETPC());
    }
}

static inline void xor_round_key(AESState *round_state, AESState *round_key)
{
    round_state->v = round_state->v ^ round_key->v;
}
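
/*
 * Generators for the Zvkned AES round helpers.  The .vv form takes the
 * round key from the matching element group of vs2, while the .vs form
 * uses the single round key held in element group 0 of vs2 for every
 * group of vd.  The per-round body is spliced in via __VA_ARGS__.
 */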
#define GEN_ZVKNED_HELPER_VV(NAME, ...)                                   \
    void HELPER(NAME)(void *vd, void *vs2, CPURISCVState *env,            \
                      uint32_t desc)                                      \
    {                                                                     \
        uint32_t vl = env->vl;                                            \
        uint32_t total_elems = vext_get_total_elems(env, desc, 4);        \
        uint32_t vta = vext_vta(desc);                                    \
                                                                          \
        VSTART_CHECK_EARLY_EXIT(env);                                     \
                                                                          \
        for (uint32_t i = env->vstart / 4; i < env->vl / 4; i++) {        \
            AESState round_key;                                           \
            round_key.d[0] = *((uint64_t *)vs2 + H8(i * 2 + 0));          \
            round_key.d[1] = *((uint64_t *)vs2 + H8(i * 2 + 1));          \
            AESState round_state;                                         \
            round_state.d[0] = *((uint64_t *)vd + H8(i * 2 + 0));         \
            round_state.d[1] = *((uint64_t *)vd + H8(i * 2 + 1));         \
            __VA_ARGS__;                                                  \
            *((uint64_t *)vd + H8(i * 2 + 0)) = round_state.d[0];         \
            *((uint64_t *)vd + H8(i * 2 + 1)) = round_state.d[1];         \
        }                                                                 \
        env->vstart = 0;                                                  \
        /* set tail elements to 1s */                                     \
        vext_set_elems_1s(vd, vta, vl * 4, total_elems * 4);              \
    }

#define GEN_ZVKNED_HELPER_VS(NAME, ...)                                   \
    void HELPER(NAME)(void *vd, void *vs2, CPURISCVState *env,            \
                      uint32_t desc)                                      \
    {                                                                     \
        uint32_t vl = env->vl;                                            \
        uint32_t total_elems = vext_get_total_elems(env, desc, 4);        \
        uint32_t vta = vext_vta(desc);                                    \
                                                                          \
        VSTART_CHECK_EARLY_EXIT(env);                                     \
                                                                          \
        for (uint32_t i = env->vstart / 4; i < env->vl / 4; i++) {        \
            AESState round_key;                                           \
            round_key.d[0] = *((uint64_t *)vs2 + H8(0));                  \
            round_key.d[1] = *((uint64_t *)vs2 + H8(1));                  \
            AESState round_state;                                         \
            round_state.d[0] = *((uint64_t *)vd + H8(i * 2 + 0));         \
            round_state.d[1] = *((uint64_t *)vd + H8(i * 2 + 1));         \
            __VA_ARGS__;                                                  \
            *((uint64_t *)vd + H8(i * 2 + 0)) = round_state.d[0];         \
            *((uint64_t *)vd + H8(i * 2 + 1)) = round_state.d[1];         \
        }                                                                 \
        env->vstart = 0;                                                  \
        /* set tail elements to 1s */                                     \
        vext_set_elems_1s(vd, vta, vl * 4, total_elems * 4);              \
    }

GEN_ZVKNED_HELPER_VV(vaesef_vv, aesenc_SB_SR_AK(&round_state,
                                                &round_state,
                                                &round_key,
                                                false);)
GEN_ZVKNED_HELPER_VS(vaesef_vs, aesenc_SB_SR_AK(&round_state,
                                                &round_state,
                                                &round_key,
                                                false);)
GEN_ZVKNED_HELPER_VV(vaesdf_vv, aesdec_ISB_ISR_AK(&round_state,
                                                  &round_state,
                                                  &round_key,
                                                  false);)
GEN_ZVKNED_HELPER_VS(vaesdf_vs, aesdec_ISB_ISR_AK(&round_state,
                                                  &round_state,
                                                  &round_key,
                                                  false);)
GEN_ZVKNED_HELPER_VV(vaesem_vv, aesenc_SB_SR_MC_AK(&round_state,
                                                   &round_state,
                                                   &round_key,
                                                   false);)
GEN_ZVKNED_HELPER_VS(vaesem_vs, aesenc_SB_SR_MC_AK(&round_state,
                                                   &round_state,
                                                   &round_key,
                                                   false);)
GEN_ZVKNED_HELPER_VV(vaesdm_vv, aesdec_ISB_ISR_AK_IMC(&round_state,
                                                      &round_state,
                                                      &round_key,
                                                      false);)
GEN_ZVKNED_HELPER_VS(vaesdm_vs, aesdec_ISB_ISR_AK_IMC(&round_state,
                                                      &round_state,
                                                      &round_key,
                                                      false);)
GEN_ZVKNED_HELPER_VS(vaesz_vs, xor_round_key(&round_state, &round_key);)
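
/*
 * AES-128 forward key schedule (vaeskf1.vi): each element group of vs2
 * holds four round-key words and produces the next four.  Out-of-range
 * round numbers have bit 3 flipped, as the specification requires.
 */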
void HELPER(vaeskf1_vi)(void *vd_vptr, void *vs2_vptr, uint32_t uimm,
                        CPURISCVState *env, uint32_t desc)
{
    uint32_t *vd = vd_vptr;
    uint32_t *vs2 = vs2_vptr;
    uint32_t vl = env->vl;
    uint32_t total_elems = vext_get_total_elems(env, desc, 4);
    uint32_t vta = vext_vta(desc);

    VSTART_CHECK_EARLY_EXIT(env);

    uimm &= 0b1111;
    if (uimm > 10 || uimm == 0) {
        uimm ^= 0b1000;
    }

    for (uint32_t i = env->vstart / 4; i < env->vl / 4; i++) {
        uint32_t rk[8], tmp;
        static const uint32_t rcon[] = {
            0x00000001, 0x00000002, 0x00000004, 0x00000008, 0x00000010,
            0x00000020, 0x00000040, 0x00000080, 0x0000001B, 0x00000036,
        };

        rk[0] = vs2[i * 4 + H4(0)];
        rk[1] = vs2[i * 4 + H4(1)];
        rk[2] = vs2[i * 4 + H4(2)];
        rk[3] = vs2[i * 4 + H4(3)];
        tmp = ror32(rk[3], 8);

        rk[4] = rk[0] ^ (((uint32_t)AES_sbox[(tmp >> 24) & 0xff] << 24) |
                         ((uint32_t)AES_sbox[(tmp >> 16) & 0xff] << 16) |
                         ((uint32_t)AES_sbox[(tmp >> 8) & 0xff] << 8) |
                         ((uint32_t)AES_sbox[(tmp >> 0) & 0xff] << 0))
                      ^ rcon[uimm - 1];
        rk[5] = rk[1] ^ rk[4];
        rk[6] = rk[2] ^ rk[5];
        rk[7] = rk[3] ^ rk[6];

        vd[i * 4 + H4(0)] = rk[4];
        vd[i * 4 + H4(1)] = rk[5];
        vd[i * 4 + H4(2)] = rk[6];
        vd[i * 4 + H4(3)] = rk[7];
    }
    env->vstart = 0;
    /* set tail elements to 1s */
    vext_set_elems_1s(vd, vta, vl * 4, total_elems * 4);
}

void HELPER(vaeskf2_vi)(void *vd_vptr, void *vs2_vptr, uint32_t uimm,
                        CPURISCVState *env, uint32_t desc)
{
    uint32_t *vd = vd_vptr;
    uint32_t *vs2 = vs2_vptr;
    uint32_t vl = env->vl;
    uint32_t total_elems = vext_get_total_elems(env, desc, 4);
    uint32_t vta = vext_vta(desc);

    VSTART_CHECK_EARLY_EXIT(env);

    uimm &= 0b1111;
    if (uimm > 14 || uimm < 2) {
        uimm ^= 0b1000;
    }

    for (uint32_t i = env->vstart / 4; i < env->vl / 4; i++) {
        uint32_t rk[12], tmp;
        static const uint32_t rcon[] = {
            0x00000001, 0x00000002, 0x00000004, 0x00000008, 0x00000010,
            0x00000020, 0x00000040, 0x00000080, 0x0000001B, 0x00000036,
        };

        rk[0] = vd[i * 4 + H4(0)];
        rk[1] = vd[i * 4 + H4(1)];
        rk[2] = vd[i * 4 + H4(2)];
        rk[3] = vd[i * 4 + H4(3)];
        rk[4] = vs2[i * 4 + H4(0)];
        rk[5] = vs2[i * 4 + H4(1)];
        rk[6] = vs2[i * 4 + H4(2)];
        rk[7] = vs2[i * 4 + H4(3)];

        if (uimm % 2 == 0) {
            tmp = ror32(rk[7], 8);
            rk[8] = rk[0] ^ (((uint32_t)AES_sbox[(tmp >> 24) & 0xff] << 24) |
                             ((uint32_t)AES_sbox[(tmp >> 16) & 0xff] << 16) |
                             ((uint32_t)AES_sbox[(tmp >> 8) & 0xff] << 8) |
                             ((uint32_t)AES_sbox[(tmp >> 0) & 0xff] << 0))
                          ^ rcon[(uimm - 1) / 2];
        } else {
            rk[8] = rk[0] ^ (((uint32_t)AES_sbox[(rk[7] >> 24) & 0xff] << 24) |
                             ((uint32_t)AES_sbox[(rk[7] >> 16) & 0xff] << 16) |
                             ((uint32_t)AES_sbox[(rk[7] >> 8) & 0xff] << 8) |
                             ((uint32_t)AES_sbox[(rk[7] >> 0) & 0xff] << 0));
        }
        rk[9] = rk[1] ^ rk[8];
        rk[10] = rk[2] ^ rk[9];
        rk[11] = rk[3] ^ rk[10];

        vd[i * 4 + H4(0)] = rk[8];
        vd[i * 4 + H4(1)] = rk[9];
        vd[i * 4 + H4(2)] = rk[10];
        vd[i * 4 + H4(3)] = rk[11];
    }
    env->vstart = 0;
    /* set tail elements to 1s */
    vext_set_elems_1s(vd, vta, vl * 4, total_elems * 4);
}
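
/*
 * SHA-2 "small sigma" functions from FIPS 180-4, used by the vsha2ms
 * message-schedule helpers below.
 */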
static inline uint32_t sig0_sha256(uint32_t x)
{
    return ror32(x, 7) ^ ror32(x, 18) ^ (x >> 3);
}

static inline uint32_t sig1_sha256(uint32_t x)
{
    return ror32(x, 17) ^ ror32(x, 19) ^ (x >> 10);
}

static inline uint64_t sig0_sha512(uint64_t x)
{
    return ror64(x, 1) ^ ror64(x, 8) ^ (x >> 7);
}

static inline uint64_t sig1_sha512(uint64_t x)
{
    return ror64(x, 19) ^ ror64(x, 61) ^ (x >> 6);
}
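
/*
 * One SHA-2 message-schedule step per element group: computes the next
 * four schedule words from the words held in vd, vs1 and vs2.
 */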
static inline void vsha2ms_e32(uint32_t *vd, uint32_t *vs1, uint32_t *vs2)
{
    uint32_t res[4];
    res[0] = sig1_sha256(vs1[H4(2)]) + vs2[H4(1)] + sig0_sha256(vd[H4(1)]) +
             vd[H4(0)];
    res[1] = sig1_sha256(vs1[H4(3)]) + vs2[H4(2)] + sig0_sha256(vd[H4(2)]) +
             vd[H4(1)];
    res[2] =
        sig1_sha256(res[0]) + vs2[H4(3)] + sig0_sha256(vd[H4(3)]) + vd[H4(2)];
    res[3] =
        sig1_sha256(res[1]) + vs1[H4(0)] + sig0_sha256(vs2[H4(0)]) + vd[H4(3)];
    vd[H4(3)] = res[3];
    vd[H4(2)] = res[2];
    vd[H4(1)] = res[1];
    vd[H4(0)] = res[0];
}

static inline void vsha2ms_e64(uint64_t *vd, uint64_t *vs1, uint64_t *vs2)
{
    uint64_t res[4];
    res[0] = sig1_sha512(vs1[2]) + vs2[1] + sig0_sha512(vd[1]) + vd[0];
    res[1] = sig1_sha512(vs1[3]) + vs2[2] + sig0_sha512(vd[2]) + vd[1];
    res[2] = sig1_sha512(res[0]) + vs2[3] + sig0_sha512(vd[3]) + vd[2];
    res[3] = sig1_sha512(res[1]) + vs1[0] + sig0_sha512(vs2[0]) + vd[3];
    vd[3] = res[3];
    vd[2] = res[2];
    vd[1] = res[1];
    vd[0] = res[0];
}

void HELPER(vsha2ms_vv)(void *vd, void *vs1, void *vs2, CPURISCVState *env,
                        uint32_t desc)
{
    uint32_t sew = FIELD_EX64(env->vtype, VTYPE, VSEW);
    uint32_t esz = sew == MO_32 ? 4 : 8;
    uint32_t total_elems;
    uint32_t vta = vext_vta(desc);

    VSTART_CHECK_EARLY_EXIT(env);

    for (uint32_t i = env->vstart / 4; i < env->vl / 4; i++) {
        if (sew == MO_32) {
            vsha2ms_e32(((uint32_t *)vd) + i * 4, ((uint32_t *)vs1) + i * 4,
                        ((uint32_t *)vs2) + i * 4);
        } else {
            /* If not 32 then SEW should be 64 */
            vsha2ms_e64(((uint64_t *)vd) + i * 4, ((uint64_t *)vs1) + i * 4,
                        ((uint64_t *)vs2) + i * 4);
        }
    }
    /* set tail elements to 1s */
    total_elems = vext_get_total_elems(env, desc, esz);
    vext_set_elems_1s(vd, vta, env->vl * esz, total_elems * esz);
    env->vstart = 0;
}

static inline uint64_t sum0_64(uint64_t x)
{
    return ror64(x, 28) ^ ror64(x, 34) ^ ror64(x, 39);
}

static inline uint32_t sum0_32(uint32_t x)
{
    return ror32(x, 2) ^ ror32(x, 13) ^ ror32(x, 22);
}

static inline uint64_t sum1_64(uint64_t x)
{
    return ror64(x, 14) ^ ror64(x, 18) ^ ror64(x, 41);
}

static inline uint32_t sum1_32(uint32_t x)
{
    return ror32(x, 6) ^ ror32(x, 11) ^ ror32(x, 25);
}

#define ch(x, y, z) ((x & y) ^ ((~x) & z))

#define maj(x, y, z) ((x & y) ^ (x & z) ^ (y & z))
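
/*
 * Two rounds of SHA-256/SHA-512 compression per element group.
 * vs2 carries the working words {a, b, e, f}, vd carries {c, d, g, h},
 * and vs1 supplies the two message-schedule words for the rounds.
 */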
static void vsha2c_64(uint64_t *vs2, uint64_t *vd, uint64_t *vs1)
{
    uint64_t a = vs2[3], b = vs2[2], e = vs2[1], f = vs2[0];
    uint64_t c = vd[3], d = vd[2], g = vd[1], h = vd[0];
    uint64_t W0 = vs1[0], W1 = vs1[1];
    uint64_t T1 = h + sum1_64(e) + ch(e, f, g) + W0;
    uint64_t T2 = sum0_64(a) + maj(a, b, c);

    h = g;
    g = f;
    f = e;
    e = d + T1;
    d = c;
    c = b;
    b = a;
    a = T1 + T2;

    T1 = h + sum1_64(e) + ch(e, f, g) + W1;
    T2 = sum0_64(a) + maj(a, b, c);
    h = g;
    g = f;
    f = e;
    e = d + T1;
    d = c;
    c = b;
    b = a;
    a = T1 + T2;

    vd[0] = f;
    vd[1] = e;
    vd[2] = b;
    vd[3] = a;
}

static void vsha2c_32(uint32_t *vs2, uint32_t *vd, uint32_t *vs1)
{
    uint32_t a = vs2[H4(3)], b = vs2[H4(2)], e = vs2[H4(1)], f = vs2[H4(0)];
    uint32_t c = vd[H4(3)], d = vd[H4(2)], g = vd[H4(1)], h = vd[H4(0)];
    uint32_t W0 = vs1[H4(0)], W1 = vs1[H4(1)];
    uint32_t T1 = h + sum1_32(e) + ch(e, f, g) + W0;
    uint32_t T2 = sum0_32(a) + maj(a, b, c);

    h = g;
    g = f;
    f = e;
    e = d + T1;
    d = c;
    c = b;
    b = a;
    a = T1 + T2;

    T1 = h + sum1_32(e) + ch(e, f, g) + W1;
    T2 = sum0_32(a) + maj(a, b, c);
    h = g;
    g = f;
    f = e;
    e = d + T1;
    d = c;
    c = b;
    b = a;
    a = T1 + T2;

    vd[H4(0)] = f;
    vd[H4(1)] = e;
    vd[H4(2)] = b;
    vd[H4(3)] = a;
}

void HELPER(vsha2ch32_vv)(void *vd, void *vs1, void *vs2, CPURISCVState *env,
                          uint32_t desc)
{
    const uint32_t esz = 4;
    uint32_t total_elems;
    uint32_t vta = vext_vta(desc);

    VSTART_CHECK_EARLY_EXIT(env);

    for (uint32_t i = env->vstart / 4; i < env->vl / 4; i++) {
        vsha2c_32(((uint32_t *)vs2) + 4 * i, ((uint32_t *)vd) + 4 * i,
                  ((uint32_t *)vs1) + 4 * i + 2);
    }

    /* set tail elements to 1s */
    total_elems = vext_get_total_elems(env, desc, esz);
    vext_set_elems_1s(vd, vta, env->vl * esz, total_elems * esz);
    env->vstart = 0;
}

void HELPER(vsha2ch64_vv)(void *vd, void *vs1, void *vs2, CPURISCVState *env,
                          uint32_t desc)
{
    const uint32_t esz = 8;
    uint32_t total_elems;
    uint32_t vta = vext_vta(desc);

    VSTART_CHECK_EARLY_EXIT(env);

    for (uint32_t i = env->vstart / 4; i < env->vl / 4; i++) {
        vsha2c_64(((uint64_t *)vs2) + 4 * i, ((uint64_t *)vd) + 4 * i,
                  ((uint64_t *)vs1) + 4 * i + 2);
    }

    /* set tail elements to 1s */
    total_elems = vext_get_total_elems(env, desc, esz);
    vext_set_elems_1s(vd, vta, env->vl * esz, total_elems * esz);
    env->vstart = 0;
}

void HELPER(vsha2cl32_vv)(void *vd, void *vs1, void *vs2, CPURISCVState *env,
                          uint32_t desc)
{
    const uint32_t esz = 4;
    uint32_t total_elems;
    uint32_t vta = vext_vta(desc);

    VSTART_CHECK_EARLY_EXIT(env);

    for (uint32_t i = env->vstart / 4; i < env->vl / 4; i++) {
        vsha2c_32(((uint32_t *)vs2) + 4 * i, ((uint32_t *)vd) + 4 * i,
                  (((uint32_t *)vs1) + 4 * i));
    }

    /* set tail elements to 1s */
    total_elems = vext_get_total_elems(env, desc, esz);
    vext_set_elems_1s(vd, vta, env->vl * esz, total_elems * esz);
    env->vstart = 0;
}

void HELPER(vsha2cl64_vv)(void *vd, void *vs1, void *vs2, CPURISCVState *env,
                          uint32_t desc)
{
    const uint32_t esz = 8;
    uint32_t total_elems;
    uint32_t vta = vext_vta(desc);

    VSTART_CHECK_EARLY_EXIT(env);

    for (uint32_t i = env->vstart / 4; i < env->vl / 4; i++) {
        vsha2c_64(((uint64_t *)vs2) + 4 * i, ((uint64_t *)vd) + 4 * i,
                  (((uint64_t *)vs1) + 4 * i));
    }

    /* set tail elements to 1s */
    total_elems = vext_get_total_elems(env, desc, esz);
    vext_set_elems_1s(vd, vta, env->vl * esz, total_elems * esz);
    env->vstart = 0;
}
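
/*
 * SM3 (Zvksh) helpers: p1() is the P1 permutation from the SM3
 * specification and zvksh_w() computes one expanded message word.
 */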
static inline uint32_t p1(uint32_t x)
{
    return x ^ rol32(x, 15) ^ rol32(x, 23);
}

static inline uint32_t zvksh_w(uint32_t m16, uint32_t m9, uint32_t m3,
                               uint32_t m13, uint32_t m6)
{
    return p1(m16 ^ m9 ^ rol32(m3, 15)) ^ rol32(m13, 7) ^ m6;
}

void HELPER(vsm3me_vv)(void *vd_vptr, void *vs1_vptr, void *vs2_vptr,
                       CPURISCVState *env, uint32_t desc)
{
    uint32_t esz = memop_size(FIELD_EX64(env->vtype, VTYPE, VSEW));
    uint32_t total_elems = vext_get_total_elems(env, desc, esz);
    uint32_t vta = vext_vta(desc);
    uint32_t *vd = vd_vptr;
    uint32_t *vs1 = vs1_vptr;
    uint32_t *vs2 = vs2_vptr;

    VSTART_CHECK_EARLY_EXIT(env);

    for (int i = env->vstart / 8; i < env->vl / 8; i++) {
        uint32_t w[24];
        for (int j = 0; j < 8; j++) {
            w[j] = bswap32(vs1[H4((i * 8) + j)]);
            w[j + 8] = bswap32(vs2[H4((i * 8) + j)]);
        }
        for (int j = 0; j < 8; j++) {
            w[j + 16] =
                zvksh_w(w[j], w[j + 7], w[j + 13], w[j + 3], w[j + 10]);
        }
        for (int j = 0; j < 8; j++) {
            vd[(i * 8) + j] = bswap32(w[H4(j + 16)]);
        }
    }
    vext_set_elems_1s(vd_vptr, vta, env->vl * esz, total_elems * esz);
    env->vstart = 0;
}
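
/*
 * SM3 boolean and constant functions: FF_j/GG_j switch from the XOR
 * form to the majority/choose form after round 15, and t_j selects the
 * matching round constant.
 */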
static inline uint32_t ff1(uint32_t x, uint32_t y, uint32_t z)
{
    return x ^ y ^ z;
}

static inline uint32_t ff2(uint32_t x, uint32_t y, uint32_t z)
{
    return (x & y) | (x & z) | (y & z);
}

static inline uint32_t ff_j(uint32_t x, uint32_t y, uint32_t z, uint32_t j)
{
    return (j <= 15) ? ff1(x, y, z) : ff2(x, y, z);
}

static inline uint32_t gg1(uint32_t x, uint32_t y, uint32_t z)
{
    return x ^ y ^ z;
}

static inline uint32_t gg2(uint32_t x, uint32_t y, uint32_t z)
{
    return (x & y) | (~x & z);
}

static inline uint32_t gg_j(uint32_t x, uint32_t y, uint32_t z, uint32_t j)
{
    return (j <= 15) ? gg1(x, y, z) : gg2(x, y, z);
}

static inline uint32_t t_j(uint32_t j)
{
    return (j <= 15) ? 0x79cc4519 : 0x7a879d8a;
}

static inline uint32_t p_0(uint32_t x)
{
    return x ^ rol32(x, 9) ^ rol32(x, 17);
}

static void sm3c(uint32_t *vd, uint32_t *vs1, uint32_t *vs2, uint32_t uimm)
{
    uint32_t x0, x1;
    uint32_t j;
    uint32_t ss1, ss2, tt1, tt2;
    x0 = vs2[0] ^ vs2[4];
    x1 = vs2[1] ^ vs2[5];
    j = 2 * uimm;
    ss1 = rol32(rol32(vs1[0], 12) + vs1[4] + rol32(t_j(j), j % 32), 7);
    ss2 = ss1 ^ rol32(vs1[0], 12);
    tt1 = ff_j(vs1[0], vs1[1], vs1[2], j) + vs1[3] + ss2 + x0;
    tt2 = gg_j(vs1[4], vs1[5], vs1[6], j) + vs1[7] + ss1 + vs2[0];
    vd[1] = tt1;
    vd[3] = rol32(vs1[1], 9);
    vd[5] = p_0(tt2);
    vd[7] = rol32(vs1[5], 19);
    j = 2 * uimm + 1;
    ss1 = rol32(rol32(vd[1], 12) + vd[5] + rol32(t_j(j), j % 32), 7);
    ss2 = ss1 ^ rol32(vd[1], 12);
    tt1 = ff_j(vd[1], vs1[1], vd[3], j) + vs1[3] + ss2 + x1;
    tt2 = gg_j(vd[5], vs1[5], vd[7], j) + vs1[7] + ss1 + vs2[1];
    vd[0] = tt1;
    vd[2] = rol32(vs1[1], 9);
    vd[4] = p_0(tt2);
    vd[6] = rol32(vs1[5], 19);
}

void HELPER(vsm3c_vi)(void *vd_vptr, void *vs2_vptr, uint32_t uimm,
                      CPURISCVState *env, uint32_t desc)
{
    uint32_t esz = memop_size(FIELD_EX64(env->vtype, VTYPE, VSEW));
    uint32_t total_elems = vext_get_total_elems(env, desc, esz);
    uint32_t vta = vext_vta(desc);
    uint32_t *vd = vd_vptr;
    uint32_t *vs2 = vs2_vptr;
    uint32_t v1[8], v2[8], v3[8];

    VSTART_CHECK_EARLY_EXIT(env);

    for (int i = env->vstart / 8; i < env->vl / 8; i++) {
        for (int k = 0; k < 8; k++) {
            v2[k] = bswap32(vd[H4(i * 8 + k)]);
            v3[k] = bswap32(vs2[H4(i * 8 + k)]);
        }
        sm3c(v1, v2, v3, uimm);
        for (int k = 0; k < 8; k++) {
            vd[i * 8 + k] = bswap32(v1[H4(k)]);
        }
    }
    vext_set_elems_1s(vd_vptr, vta, env->vl * esz, total_elems * esz);
    env->vstart = 0;
}
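
/*
 * GHASH multiply (Zvkg): a shift-and-add multiply in GF(2^128).
 * Operands are bit-reversed within each byte (brev8) so the loop can
 * scan bits in ascending order; 0x87 is the reduction polynomial
 * x^128 + x^7 + x^2 + x + 1 in this bit order.
 */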
void HELPER(vghsh_vv)(void *vd_vptr, void *vs1_vptr, void *vs2_vptr,
                      CPURISCVState *env, uint32_t desc)
{
    uint64_t *vd = vd_vptr;
    uint64_t *vs1 = vs1_vptr;
    uint64_t *vs2 = vs2_vptr;
    uint32_t vta = vext_vta(desc);
    uint32_t total_elems = vext_get_total_elems(env, desc, 4);

    VSTART_CHECK_EARLY_EXIT(env);

    for (uint32_t i = env->vstart / 4; i < env->vl / 4; i++) {
        uint64_t Y[2] = {vd[i * 2 + 0], vd[i * 2 + 1]};
        uint64_t H[2] = {brev8(vs2[i * 2 + 0]), brev8(vs2[i * 2 + 1])};
        uint64_t X[2] = {vs1[i * 2 + 0], vs1[i * 2 + 1]};
        uint64_t Z[2] = {0, 0};

        uint64_t S[2] = {brev8(Y[0] ^ X[0]), brev8(Y[1] ^ X[1])};

        for (int j = 0; j < 128; j++) {
            if ((S[j / 64] >> (j % 64)) & 1) {
                Z[0] ^= H[0];
                Z[1] ^= H[1];
            }
            bool reduce = ((H[1] >> 63) & 1);
            H[1] = H[1] << 1 | H[0] >> 63;
            H[0] = H[0] << 1;
            if (reduce) {
                H[0] ^= 0x87;
            }
        }

        vd[i * 2 + 0] = brev8(Z[0]);
        vd[i * 2 + 1] = brev8(Z[1]);
    }
    /* set tail elements to 1s */
    vext_set_elems_1s(vd, vta, env->vl * 4, total_elems * 4);
    env->vstart = 0;
}

void HELPER(vgmul_vv)(void *vd_vptr, void *vs2_vptr, CPURISCVState *env,
                      uint32_t desc)
{
    uint64_t *vd = vd_vptr;
    uint64_t *vs2 = vs2_vptr;
    uint32_t vta = vext_vta(desc);
    uint32_t total_elems = vext_get_total_elems(env, desc, 4);

    VSTART_CHECK_EARLY_EXIT(env);

    for (uint32_t i = env->vstart / 4; i < env->vl / 4; i++) {
        uint64_t Y[2] = {brev8(vd[i * 2 + 0]), brev8(vd[i * 2 + 1])};
        uint64_t H[2] = {brev8(vs2[i * 2 + 0]), brev8(vs2[i * 2 + 1])};
        uint64_t Z[2] = {0, 0};

        for (int j = 0; j < 128; j++) {
            if ((Y[j / 64] >> (j % 64)) & 1) {
                Z[0] ^= H[0];
                Z[1] ^= H[1];
            }
            bool reduce = ((H[1] >> 63) & 1);
            H[1] = H[1] << 1 | H[0] >> 63;
            H[0] = H[0] << 1;
            if (reduce) {
                H[0] ^= 0x87;
            }
        }

        vd[i * 2 + 0] = brev8(Z[0]);
        vd[i * 2 + 1] = brev8(Z[1]);
    }
    /* set tail elements to 1s */
    vext_set_elems_1s(vd, vta, env->vl * 4, total_elems * 4);
    env->vstart = 0;
}
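
/*
 * SM4 (Zvksed) key expansion: four round keys per element group, using
 * the CK constants and the key-schedule linear transform
 * L'(x) = x ^ rol32(x, 13) ^ rol32(x, 23).
 */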
void HELPER(vsm4k_vi)(void *vd, void *vs2, uint32_t uimm5, CPURISCVState *env,
                      uint32_t desc)
{
    const uint32_t egs = 4;
    uint32_t rnd = uimm5 & 0x7;
    uint32_t group_start = env->vstart / egs;
    uint32_t group_end = env->vl / egs;
    uint32_t esz = sizeof(uint32_t);
    uint32_t total_elems = vext_get_total_elems(env, desc, esz);

    VSTART_CHECK_EARLY_EXIT(env);

    for (uint32_t i = group_start; i < group_end; ++i) {
        uint32_t vstart = i * egs;
        uint32_t vend = (i + 1) * egs;
        uint32_t rk[4] = {0};
        uint32_t tmp[8] = {0};

        for (uint32_t j = vstart; j < vend; ++j) {
            rk[j - vstart] = *((uint32_t *)vs2 + H4(j));
        }

        for (uint32_t j = 0; j < egs; ++j) {
            tmp[j] = rk[j];
        }

        for (uint32_t j = 0; j < egs; ++j) {
            uint32_t b, s;
            b = tmp[j + 1] ^ tmp[j + 2] ^ tmp[j + 3] ^ sm4_ck[rnd * 4 + j];

            s = sm4_subword(b);

            tmp[j + 4] = tmp[j] ^ (s ^ rol32(s, 13) ^ rol32(s, 23));
        }

        for (uint32_t j = vstart; j < vend; ++j) {
            *((uint32_t *)vd + H4(j)) = tmp[egs + (j - vstart)];
        }
    }

    env->vstart = 0;
    /* set tail elements to 1s */
    vext_set_elems_1s(vd, vext_vta(desc), env->vl * esz, total_elems * esz);
}
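
/*
 * Run four SM4 rounds on one element group: buf[0..3] hold the current
 * state, rk the four round keys, and the cipher's linear transform
 * L(x) = x ^ rol32(x, 2) ^ rol32(x, 10) ^ rol32(x, 18) ^ rol32(x, 24)
 * produces buf[4..7].
 */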
static void do_sm4_round(uint32_t *rk, uint32_t *buf)
{
    const uint32_t egs = 4;
    uint32_t s, b;

    for (uint32_t j = egs; j < egs * 2; ++j) {
        b = buf[j - 3] ^ buf[j - 2] ^ buf[j - 1] ^ rk[j - 4];
        s = sm4_subword(b);
        buf[j] = buf[j - 4] ^ (s ^ rol32(s, 2) ^ rol32(s, 10) ^ rol32(s, 18) ^
                               rol32(s, 24));
    }
}

void HELPER(vsm4r_vv)(void *vd, void *vs2, CPURISCVState *env, uint32_t desc)
{
    const uint32_t egs = 4;
    uint32_t group_start = env->vstart / egs;
    uint32_t group_end = env->vl / egs;
    uint32_t esz = sizeof(uint32_t);
    uint32_t total_elems = vext_get_total_elems(env, desc, esz);

    VSTART_CHECK_EARLY_EXIT(env);

    for (uint32_t i = group_start; i < group_end; ++i) {
        uint32_t vstart = i * egs;
        uint32_t vend = (i + 1) * egs;
        uint32_t rk[4] = {0};
        uint32_t tmp[8] = {0};

        for (uint32_t j = vstart; j < vend; ++j) {
            rk[j - vstart] = *((uint32_t *)vs2 + H4(j));
        }

        for (uint32_t j = vstart; j < vend; ++j) {
            tmp[j - vstart] = *((uint32_t *)vd + H4(j));
        }

        do_sm4_round(rk, tmp);

        for (uint32_t j = vstart; j < vend; ++j) {
            *((uint32_t *)vd + H4(j)) = tmp[egs + (j - vstart)];
        }
    }

    env->vstart = 0;
    /* set tail elements to 1s */
    vext_set_elems_1s(vd, vext_vta(desc), env->vl * esz, total_elems * esz);
}

void HELPER(vsm4r_vs)(void *vd, void *vs2, CPURISCVState *env, uint32_t desc)
{
    const uint32_t egs = 4;
    uint32_t group_start = env->vstart / egs;
    uint32_t group_end = env->vl / egs;
    uint32_t esz = sizeof(uint32_t);
    uint32_t total_elems = vext_get_total_elems(env, desc, esz);

    VSTART_CHECK_EARLY_EXIT(env);

    for (uint32_t i = group_start; i < group_end; ++i) {
        uint32_t vstart = i * egs;
        uint32_t vend = (i + 1) * egs;
        uint32_t rk[4] = {0};
        uint32_t tmp[8] = {0};

        for (uint32_t j = 0; j < egs; ++j) {
            rk[j] = *((uint32_t *)vs2 + H4(j));
        }

        for (uint32_t j = vstart; j < vend; ++j) {
            tmp[j - vstart] = *((uint32_t *)vd + H4(j));
        }

        do_sm4_round(rk, tmp);

        for (uint32_t j = vstart; j < vend; ++j) {
            *((uint32_t *)vd + H4(j)) = tmp[egs + (j - vstart)];
        }
    }

    env->vstart = 0;
    /* set tail elements to 1s */
    vext_set_elems_1s(vd, vext_vta(desc), env->vl * esz, total_elems * esz);
}