/*
 * AArch64 SME translation
 *
 * Copyright (c) 2022 Linaro, Ltd
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */
20 #include "qemu/osdep.h"
21 #include "translate.h"
22 #include "translate-a64.h"
/*
 * Include the generated decoder.
 */
28 #include "decode-sme.c.inc"
/*
 * Resolve tile.size[index] to a host pointer, where tile and index
 * are always decoded together, dependent on the element size.
 */
/*
 * get_tile_rowcol: emit TCG ops that compute a pointer, relative to
 * tcg_env, to one slice of a ZA tile, and hand it back as a TCGv_ptr.
 *
 * @s:          translation context, passed to cpu_reg() and
 *              streaming_vec_reg_size().
 * @esz:        element size as a log2/MemOp value (compared against
 *              MO_128 and MO_64 below).
 * @rs:         number of the general register holding the run-time part
 *              of the slice index.
 * @tile_index: packed field; the low (4 - esz) bits are the immediate
 *              index, the bits above them are the tile number.
 * @vertical:   presumably selects the column computation over the row
 *              computation -- the test on it is not visible in this view.
 *
 * NOTE(review): this view of the function lacks its braces, the local
 * declarations (tmp, addr, pos, len, offset), the branch that chooses
 * between the two deposit sequences, and the final return.  Confirm
 * against the complete file before relying on the notes below.
 */
35 static TCGv_ptr
get_tile_rowcol(DisasContext
*s
, int esz
, int rs
,
36 int tile_index
, bool vertical
)
/* Tile number: everything above the low (4 - esz) bits of tile_index. */
38 int tile
= tile_index
>> (4 - esz
);
/*
 * Immediate index: the low (4 - esz) bits of tile_index.  The extract is
 * skipped for MO_128, where the immediate is simply 0 (presumably because
 * 4 - esz would be a zero-width field there -- confirm MO_128 == 4).
 */
39 int index
= esz
== MO_128
? 0 : extract32(tile_index
, 0, 4 - esz
);
44 /* Compute the final index, which is Rs+imm. */
45 tmp
= tcg_temp_new_i32();
/* Bring the register value down to 32 bits, then add the immediate. */
46 tcg_gen_trunc_tl_i32(tmp
, cpu_reg(s
, rs
));
47 tcg_gen_addi_i32(tmp
, tmp
, index
);
49 /* Prepare a power-of-two modulo via extraction of @len bits. */
/*
 * len = ctz32(svl) - esz.  ctz32 acts as log2 here, which assumes the
 * streaming vector size is a power of two (as the comment above implies).
 */
50 len
= ctz32(streaming_vec_reg_size(s
)) - esz
;
54 * Compute the byte offset of the index within the tile:
55 * (index % (svl / size)) * size
56 * = (index % (svl >> esz)) << esz
57 * Perform the power-of-two modulo via extraction of the low @len bits.
58 * Perform the multiply by shifting left by @pos bits.
59 * Perform these operations simultaneously via deposit into zero.
/*
 * NOTE(review): the assignment to `pos` used by this first deposit is not
 * visible in this view; the formula above suggests pos = esz -- confirm.
 */
62 tcg_gen_deposit_z_i32(tmp
, tmp
, pos
, len
);
65 * For big-endian, adjust the indexed column byte offset within
66 * the uint64_t host words that make up env->zarray[].
/*
 * Only elements narrower than 64 bits are adjusted; the XOR constant is
 * 8 minus the element size in bytes.
 */
68 if (HOST_BIG_ENDIAN
&& esz
< MO_64
) {
69 tcg_gen_xori_i32(tmp
, tmp
, 8 - (1 << esz
));
73 * Compute the byte offset of the index within the tile:
74 * (index % (svl / size)) * (size * sizeof(row))
75 * = (index % (svl >> esz)) << (esz + log2(sizeof(row)))
/* Here the shift also includes log2 of the size of one ARMVectorReg. */
77 pos
= esz
+ ctz32(sizeof(ARMVectorReg
));
78 tcg_gen_deposit_z_i32(tmp
, tmp
, pos
, len
);
80 /* Row slices are always aligned and need no endian adjustment. */
83 /* The tile byte offset within env->zarray is the row. */
84 offset
= tile
* sizeof(ARMVectorReg
);
86 /* Include the byte offset of zarray to make this relative to env. */
87 offset
+= offsetof(CPUARMState
, zarray
);
/* Fold the translate-time constant part into the run-time offset. */
88 tcg_gen_addi_i32(tmp
, tmp
, offset
);
90 /* Add the byte offset to env to produce the final pointer. */
91 addr
= tcg_temp_new_ptr();
/* Widen the 32-bit offset to pointer width before adding tcg_env. */
92 tcg_gen_ext_i32_ptr(addr
, tmp
);
93 tcg_gen_add_ptr(addr
, addr
, tcg_env
);
/*
 * Resolve tile.size[0] to a host pointer.
 * Used by e.g. outer product insns where we require the entire tile.
 */
/*
 * get_tile: emit a TCG op producing a pointer to the start of a ZA tile:
 *     tcg_env + offsetof(CPUARMState, zarray) + tile * sizeof(ARMVectorReg)
 *
 * @s:    translation context (not referenced in the visible lines).
 * @esz:  element size (not referenced in the visible lines).
 * @tile: tile number; scaled by sizeof(ARMVectorReg) to a byte offset.
 *
 * NOTE(review): the declaration of `offset`, the braces and the return
 * statement are not visible in this view; the TCGv_ptr return type
 * suggests `addr` is what is returned -- confirm.
 */
102 static TCGv_ptr
get_tile(DisasContext
*s
, int esz
, int tile
)
104 TCGv_ptr addr
= tcg_temp_new_ptr();
/* Byte offset of the tile relative to env, known at translate time. */
107 offset
= tile
* sizeof(ARMVectorReg
) + offsetof(CPUARMState
, zarray
);
109 tcg_gen_addi_ptr(addr
, tcg_env
, offset
);
/*
 * trans_ZERO: translate the ZERO instruction.
 *
 * Tests the aa64_sme feature, then, if sme_za_enabled_check() succeeds,
 * emits a call to the sme_zero helper with the instruction immediate
 * (a->imm) and the value of streaming_vec_reg_size(s), both as constants.
 *
 * NOTE(review): the statement guarded by the feature test and the
 * function's return statements are not visible in this view.
 */
113 static bool trans_ZERO(DisasContext
*s
, arg_ZERO
*a
)
115 if (!dc_isar_feature(aa64_sme
, s
)) {
/* The helper is only emitted when the ZA-enabled check passes. */
118 if (sme_za_enabled_check(s
)) {
119 gen_helper_sme_zero(tcg_env
, tcg_constant_i32(a
->imm
),
120 tcg_constant_i32(streaming_vec_reg_size(s
)));
/*
 * trans_MOVA: translate MOVA, a predicated move between a ZA tile slice
 * and a vector register.
 *
 * After the feature test and sme_smza_enabled_check(), the function
 * builds pointers to the ZA slice (get_tile_rowcol), the vector register
 * (a->zr) and the predicate (a->pg), then calls one helper chosen by
 * element size (a->esz) from one of three tables.
 *
 * NOTE(review): the conditions that choose between the four helper calls
 * below, the declarations of `svl` and `t_desc`, the closing of the table
 * initializers and the return statements are not visible in this view.
 * The in-body comments indicate the vertical/horizontal split; the choice
 * within each pair presumably depends on the move direction -- confirm.
 */
125 static bool trans_MOVA(DisasContext
*s
, arg_MOVA
*a
)
/* Helper tables, five entries each, ordered b, h, s, d, q by name. */
127 static gen_helper_gvec_4
* const h_fns
[5] = {
128 gen_helper_sve_sel_zpzz_b
, gen_helper_sve_sel_zpzz_h
,
129 gen_helper_sve_sel_zpzz_s
, gen_helper_sve_sel_zpzz_d
,
130 gen_helper_sve_sel_zpzz_q
132 static gen_helper_gvec_3
* const cz_fns
[5] = {
133 gen_helper_sme_mova_cz_b
, gen_helper_sme_mova_cz_h
,
134 gen_helper_sme_mova_cz_s
, gen_helper_sme_mova_cz_d
,
135 gen_helper_sme_mova_cz_q
,
137 static gen_helper_gvec_3
* const zc_fns
[5] = {
138 gen_helper_sme_mova_zc_b
, gen_helper_sme_mova_zc_h
,
139 gen_helper_sme_mova_zc_s
, gen_helper_sme_mova_zc_d
,
140 gen_helper_sme_mova_zc_q
,
143 TCGv_ptr t_za
, t_zr
, t_pg
;
147 if (!dc_isar_feature(aa64_sme
, s
)) {
150 if (!sme_smza_enabled_check(s
)) {
/* Pointers to the ZA slice, the vector register and the predicate. */
154 t_za
= get_tile_rowcol(s
, a
->esz
, a
->rs
, a
->za_imm
, a
->v
);
155 t_zr
= vec_full_reg_ptr(s
, a
->zr
);
156 t_pg
= pred_full_reg_ptr(s
, a
->pg
);
/* Descriptor: both simd_desc sizes are the streaming vector size, data 0. */
158 svl
= streaming_vec_reg_size(s
);
159 t_desc
= tcg_constant_i32(simd_desc(svl
, svl
, 0));
162 /* Vertical slice -- use sme mova helpers. */
/*
 * zc_fns takes the vector pointer first and the ZA pointer second; cz_fns
 * takes them in the opposite order.  Presumably the first pointer is the
 * destination -- confirm against the helper definitions.
 */
164 zc_fns
[a
->esz
](t_zr
, t_za
, t_pg
, t_desc
);
166 cz_fns
[a
->esz
](t_za
, t_zr
, t_pg
, t_desc
);
169 /* Horizontal slice -- reuse sve sel helpers. */
/*
 * In both calls the first and third pointers are the same object, with
 * the other object as the second pointer.  NOTE(review): presumably this
 * keeps the old value where the predicate is false -- confirm against the
 * sve_sel_zpzz helper semantics.
 */
171 h_fns
[a
->esz
](t_zr
, t_za
, t_zr
, t_pg
, t_desc
);
173 h_fns
[a
->esz
](t_za
, t_zr
, t_za
, t_pg
, t_desc
);
/*
 * trans_LDST1: translate the predicated single-slice ZA load/store.
 *
 * Steps visible here: test the aa64_sme feature and
 * sme_smza_enabled_check(); build pointers to the ZA slice and the
 * predicate; form the address as Xn + (Xm << esz); assemble a descriptor;
 * then call the helper selected by [esz][be][v][mte][st].
 *
 * NOTE(review): the contents of the `fns` initializer, the declarations
 * of t_za, t_pg, addr, svl and desc (including desc's initial value), the
 * branch that separates the MTE descriptor setup from the
 * clean_data_tbi() call, any #undef of the FN_* macros and the return
 * statements are not visible in this view.
 */
179 static bool trans_LDST1(DisasContext
*s
, arg_LDST1
*a
)
/* Helper signature; matches the call at the end of this function. */
181 typedef void GenLdSt1(TCGv_env
, TCGv_ptr
, TCGv_ptr
, TCGv
, TCGv_i32
);
184 * Indexed by [esz][be][v][mte][st], which is (except for load/store)
185 * also the order in which the elements appear in the function names,
186 * and so how we must concatenate the pieces.
/*
 * Each macro adds one array dimension by token pasting:
 *   FN_LS  -> { ld1, st1 }
 *   FN_MTE -> { plain, _mte }
 *   FN_HV  -> { _h, _v }
 *   FN_END -> { L, B }   (index 1 is used when s->be_data == MO_BE)
 */
189 #define FN_LS(F) { gen_helper_sme_ld1##F, gen_helper_sme_st1##F }
190 #define FN_MTE(F) { FN_LS(F), FN_LS(F##_mte) }
191 #define FN_HV(F) { FN_MTE(F##_h), FN_MTE(F##_v) }
192 #define FN_END(L, B) { FN_HV(L), FN_HV(B) }
194 static GenLdSt1
* const fns
[5][2][2][2][2] = {
/* Table indices taken from the translation context. */
210 bool be
= s
->be_data
== MO_BE
;
211 bool mte
= s
->mte_active
[0];
213 if (!dc_isar_feature(aa64_sme
, s
)) {
216 if (!sme_smza_enabled_check(s
)) {
220 t_za
= get_tile_rowcol(s
, a
->esz
, a
->rs
, a
->za_imm
, a
->v
);
221 t_pg
= pred_full_reg_ptr(s
, a
->pg
);
222 addr
= tcg_temp_new_i64();
/* addr = (Xm << esz) + Xn; the base is read with cpu_reg_sp(). */
224 tcg_gen_shli_i64(addr
, cpu_reg(s
, a
->rm
), a
->esz
);
225 tcg_gen_add_i64(addr
, addr
, cpu_reg_sp(s
, a
->rn
));
/*
 * MTE descriptor fields: memory index, TBI, TCMA, write flag and access
 * size minus one, then shifted up by SVE_MTEDESC_SHIFT.
 * NOTE(review): presumably executed only when `mte` is set, with the
 * clean_data_tbi() call below as the alternative -- the branch itself is
 * not visible here.
 */
228 desc
= FIELD_DP32(desc
, MTEDESC
, MIDX
, get_mem_index(s
));
229 desc
= FIELD_DP32(desc
, MTEDESC
, TBI
, s
->tbid
);
230 desc
= FIELD_DP32(desc
, MTEDESC
, TCMA
, s
->tcma
);
231 desc
= FIELD_DP32(desc
, MTEDESC
, WRITE
, a
->st
);
232 desc
= FIELD_DP32(desc
, MTEDESC
, SIZEM1
, (1 << a
->esz
) - 1);
233 desc
<<= SVE_MTEDESC_SHIFT
;
235 addr
= clean_data_tbi(s
, addr
);
/* Wrap the accumulated bits as the data argument of simd_desc(). */
237 svl
= streaming_vec_reg_size(s
);
238 desc
= simd_desc(svl
, svl
, desc
);
240 fns
[a
->esz
][be
][a
->v
][mte
][a
->st
](tcg_env
, t_za
, t_pg
, addr
,
241 tcg_constant_i32(desc
));
/*
 * Signature of the load/store generator handed to do_ldst_r(), which
 * calls it as fn(s, base, 0, svl, a->rn, imm * svl).
 */
245 typedef void GenLdStR(DisasContext
*, TCGv_ptr
, int, int, int, int);
/*
 * do_ldst_r: common body for the LDR/STR translators instantiated below.
 *
 * @s:  translation context.
 * @a:  decoded operands; a->rv and a->rn are read in the visible lines.
 * @fn: generator that emits the actual transfer.
 *
 * Resolves the ZA row as a byte-sized (MO_8) horizontal slice
 * (vertical == false), then calls @fn with that pointer, a literal 0,
 * the streaming vector size, a->rn and imm * svl.
 *
 * NOTE(review): the declarations of `imm` (presumably a->imm) and `base`,
 * the statement guarded by the ZA check and the return statements are not
 * visible in this view.
 */
247 static bool do_ldst_r(DisasContext
*s
, arg_ldstr
*a
, GenLdStR
*fn
)
249 int svl
= streaming_vec_reg_size(s
);
253 if (!sme_za_enabled_check(s
)) {
257 /* ZA[n] equates to ZA0H.B[n]. */
258 base
= get_tile_rowcol(s
, MO_8
, a
->rv
, imm
, false);
/* The last argument scales the immediate by the vector size. */
260 fn(s
, base
, 0, svl
, a
->rn
, imm
* svl
);
/*
 * LDR and STR both go through do_ldst_r(), differing only in the
 * generator passed (gen_sve_ldr vs gen_sve_str); both name aa64_sme as
 * the feature.  TRANS_FEAT is defined outside this view; presumably it
 * expands to a trans_<NAME> function that tests the feature first --
 * confirm.
 */
264 TRANS_FEAT(LDR
, aa64_sme
, do_ldst_r
, a
, gen_sve_ldr
)
265 TRANS_FEAT(STR
, aa64_sme
, do_ldst_r
, a
, gen_sve_str
)
/*
 * do_adda: common body for the ADDHA/ADDVA translators instantiated below.
 *
 * @s:   translation context.
 * @a:   decoded operands; zad, zn, pn and pm are read here.
 * @esz: element size, forwarded to get_tile().
 * @fn:  out-of-line helper, called with the tile, vector and two
 *       predicate pointers plus a constant descriptor.
 *
 * NOTE(review): the statement guarded by sme_smza_enabled_check() and the
 * return statements are not visible in this view.
 */
267 static bool do_adda(DisasContext
*s
, arg_adda
*a
, MemOp esz
,
268 gen_helper_gvec_4
*fn
)
/* Descriptor: both simd_desc sizes are the streaming vector size, data 0. */
270 int svl
= streaming_vec_reg_size(s
);
271 uint32_t desc
= simd_desc(svl
, svl
, 0);
272 TCGv_ptr za
, zn
, pn
, pm
;
274 if (!sme_smza_enabled_check(s
)) {
/* Pointers to the whole tile, the source vector and the two predicates. */
278 za
= get_tile(s
, esz
, a
->zad
);
279 zn
= vec_full_reg_ptr(s
, a
->zn
);
280 pn
= pred_full_reg_ptr(s
, a
->pn
);
281 pm
= pred_full_reg_ptr(s
, a
->pm
);
283 fn(za
, zn
, pn
, pm
, tcg_constant_i32(desc
));
/*
 * ADDHA/ADDVA instantiations through do_adda().  The _s forms name
 * aa64_sme and pass MO_32; the _d forms name aa64_sme_i16i64 and pass
 * MO_64.
 */
287 TRANS_FEAT(ADDHA_s
, aa64_sme
, do_adda
, a
, MO_32
, gen_helper_sme_addha_s
)
288 TRANS_FEAT(ADDVA_s
, aa64_sme
, do_adda
, a
, MO_32
, gen_helper_sme_addva_s
)
289 TRANS_FEAT(ADDHA_d
, aa64_sme_i16i64
, do_adda
, a
, MO_64
, gen_helper_sme_addha_d
)
290 TRANS_FEAT(ADDVA_d
, aa64_sme_i16i64
, do_adda
, a
, MO_64
, gen_helper_sme_addva_d
)
/*
 * do_outprod: common body for the outer-product translators that do not
 * take a floating-point status pointer.
 *
 * @s:   translation context.
 * @a:   decoded operands; zad, zn, zm, pn, pm and sub are read here.
 * @esz: element size, forwarded to get_tile().
 * @fn:  out-of-line helper, called with the tile, two vector and two
 *       predicate pointers plus a constant descriptor.
 *
 * NOTE(review): the statement guarded by sme_smza_enabled_check() and the
 * return statements are not visible in this view.
 */
292 static bool do_outprod(DisasContext
*s
, arg_op
*a
, MemOp esz
,
293 gen_helper_gvec_5
*fn
)
/* a->sub is carried to the helper in the descriptor's data field. */
295 int svl
= streaming_vec_reg_size(s
);
296 uint32_t desc
= simd_desc(svl
, svl
, a
->sub
);
297 TCGv_ptr za
, zn
, zm
, pn
, pm
;
299 if (!sme_smza_enabled_check(s
)) {
/* Pointers to the whole tile, both source vectors and both predicates. */
303 za
= get_tile(s
, esz
, a
->zad
);
304 zn
= vec_full_reg_ptr(s
, a
->zn
);
305 zm
= vec_full_reg_ptr(s
, a
->zm
);
306 pn
= pred_full_reg_ptr(s
, a
->pn
);
307 pm
= pred_full_reg_ptr(s
, a
->pm
);
309 fn(za
, zn
, zm
, pn
, pm
, tcg_constant_i32(desc
));
/*
 * do_outprod_fpst: same shape as do_outprod(), but the helper also
 * receives a floating-point status pointer from fpstatus_ptr(FPST_FPCR).
 *
 * @s:   translation context.
 * @a:   decoded operands; zad, zn, zm, pn, pm and sub are read here.
 * @esz: element size, forwarded to get_tile().
 * @fn:  out-of-line helper taking five pointers, the status pointer and
 *       a constant descriptor.
 *
 * NOTE(review): the statement guarded by sme_smza_enabled_check() and the
 * return statements are not visible in this view.
 */
313 static bool do_outprod_fpst(DisasContext
*s
, arg_op
*a
, MemOp esz
,
314 gen_helper_gvec_5_ptr
*fn
)
/* a->sub is carried to the helper in the descriptor's data field. */
316 int svl
= streaming_vec_reg_size(s
);
317 uint32_t desc
= simd_desc(svl
, svl
, a
->sub
);
318 TCGv_ptr za
, zn
, zm
, pn
, pm
, fpst
;
320 if (!sme_smza_enabled_check(s
)) {
324 za
= get_tile(s
, esz
, a
->zad
);
325 zn
= vec_full_reg_ptr(s
, a
->zn
);
326 zm
= vec_full_reg_ptr(s
, a
->zm
);
327 pn
= pred_full_reg_ptr(s
, a
->pn
);
328 pm
= pred_full_reg_ptr(s
, a
->pm
);
/* Floating-point status selected by FPST_FPCR. */
329 fpst
= fpstatus_ptr(FPST_FPCR
);
331 fn(za
, zn
, zm
, pn
, pm
, fpst
, tcg_constant_i32(desc
));
/*
 * Outer-product instantiations.
 *
 * The floating-point forms go through do_outprod_fpst().  FMOPA_h and
 * FMOPA_s both pass MO_32; FMOPA_d passes MO_64 and names
 * aa64_sme_f64f64.
 */
335 TRANS_FEAT(FMOPA_h
, aa64_sme
, do_outprod_fpst
, a
, MO_32
, gen_helper_sme_fmopa_h
)
336 TRANS_FEAT(FMOPA_s
, aa64_sme
, do_outprod_fpst
, a
, MO_32
, gen_helper_sme_fmopa_s
)
337 TRANS_FEAT(FMOPA_d
, aa64_sme_f64f64
, do_outprod_fpst
, a
, MO_64
, gen_helper_sme_fmopa_d
)
339 /* TODO: FEAT_EBF16 */
/* BFMOPA goes through do_outprod(), i.e. without a status pointer. */
340 TRANS_FEAT(BFMOPA
, aa64_sme
, do_outprod
, a
, MO_32
, gen_helper_sme_bfmopa
)
/* Integer forms passing MO_32, all naming aa64_sme. */
342 TRANS_FEAT(SMOPA_s
, aa64_sme
, do_outprod
, a
, MO_32
, gen_helper_sme_smopa_s
)
343 TRANS_FEAT(UMOPA_s
, aa64_sme
, do_outprod
, a
, MO_32
, gen_helper_sme_umopa_s
)
344 TRANS_FEAT(SUMOPA_s
, aa64_sme
, do_outprod
, a
, MO_32
, gen_helper_sme_sumopa_s
)
345 TRANS_FEAT(USMOPA_s
, aa64_sme
, do_outprod
, a
, MO_32
, gen_helper_sme_usmopa_s
)
/* Integer forms passing MO_64, all naming aa64_sme_i16i64. */
347 TRANS_FEAT(SMOPA_d
, aa64_sme_i16i64
, do_outprod
, a
, MO_64
, gen_helper_sme_smopa_d
)
348 TRANS_FEAT(UMOPA_d
, aa64_sme_i16i64
, do_outprod
, a
, MO_64
, gen_helper_sme_umopa_d
)
349 TRANS_FEAT(SUMOPA_d
, aa64_sme_i16i64
, do_outprod
, a
, MO_64
, gen_helper_sme_sumopa_d
)
350 TRANS_FEAT(USMOPA_d
, aa64_sme_i16i64
, do_outprod
, a
, MO_64
, gen_helper_sme_usmopa_d
)