/*
 * AArch64 SME translation
 *
 * Copyright (c) 2022 Linaro, Ltd
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */
20 #include "qemu/osdep.h"
22 #include "tcg/tcg-op.h"
23 #include "tcg/tcg-op-gvec.h"
24 #include "tcg/tcg-gvec-desc.h"
25 #include "translate.h"
26 #include "exec/helper-gen.h"
27 #include "translate-a64.h"
28 #include "fpu/softfloat.h"
/*
 * Include the generated decoder.
 */
35 #include "decode-sme.c.inc"
39 * Resolve tile.size[index] to a host pointer, where tile and index
40 * are always decoded together, dependent on the element size.
42 static TCGv_ptr
get_tile_rowcol(DisasContext
*s
, int esz
, int rs
,
43 int tile_index
, bool vertical
)
45 int tile
= tile_index
>> (4 - esz
);
46 int index
= esz
== MO_128
? 0 : extract32(tile_index
, 0, 4 - esz
);
51 /* Compute the final index, which is Rs+imm. */
52 tmp
= tcg_temp_new_i32();
53 tcg_gen_trunc_tl_i32(tmp
, cpu_reg(s
, rs
));
54 tcg_gen_addi_i32(tmp
, tmp
, index
);
56 /* Prepare a power-of-two modulo via extraction of @len bits. */
57 len
= ctz32(streaming_vec_reg_size(s
)) - esz
;
61 * Compute the byte offset of the index within the tile:
62 * (index % (svl / size)) * size
63 * = (index % (svl >> esz)) << esz
64 * Perform the power-of-two modulo via extraction of the low @len bits.
65 * Perform the multiply by shifting left by @pos bits.
66 * Perform these operations simultaneously via deposit into zero.
69 tcg_gen_deposit_z_i32(tmp
, tmp
, pos
, len
);
72 * For big-endian, adjust the indexed column byte offset within
73 * the uint64_t host words that make up env->zarray[].
75 if (HOST_BIG_ENDIAN
&& esz
< MO_64
) {
76 tcg_gen_xori_i32(tmp
, tmp
, 8 - (1 << esz
));
80 * Compute the byte offset of the index within the tile:
81 * (index % (svl / size)) * (size * sizeof(row))
82 * = (index % (svl >> esz)) << (esz + log2(sizeof(row)))
84 pos
= esz
+ ctz32(sizeof(ARMVectorReg
));
85 tcg_gen_deposit_z_i32(tmp
, tmp
, pos
, len
);
87 /* Row slices are always aligned and need no endian adjustment. */
90 /* The tile byte offset within env->zarray is the row. */
91 offset
= tile
* sizeof(ARMVectorReg
);
93 /* Include the byte offset of zarray to make this relative to env. */
94 offset
+= offsetof(CPUARMState
, zarray
);
95 tcg_gen_addi_i32(tmp
, tmp
, offset
);
97 /* Add the byte offset to env to produce the final pointer. */
98 addr
= tcg_temp_new_ptr();
99 tcg_gen_ext_i32_ptr(addr
, tmp
);
100 tcg_temp_free_i32(tmp
);
101 tcg_gen_add_ptr(addr
, addr
, cpu_env
);
106 static bool trans_ZERO(DisasContext
*s
, arg_ZERO
*a
)
108 if (!dc_isar_feature(aa64_sme
, s
)) {
111 if (sme_za_enabled_check(s
)) {
112 gen_helper_sme_zero(cpu_env
, tcg_constant_i32(a
->imm
),
113 tcg_constant_i32(streaming_vec_reg_size(s
)));
118 static bool trans_MOVA(DisasContext
*s
, arg_MOVA
*a
)
120 static gen_helper_gvec_4
* const h_fns
[5] = {
121 gen_helper_sve_sel_zpzz_b
, gen_helper_sve_sel_zpzz_h
,
122 gen_helper_sve_sel_zpzz_s
, gen_helper_sve_sel_zpzz_d
,
123 gen_helper_sve_sel_zpzz_q
125 static gen_helper_gvec_3
* const cz_fns
[5] = {
126 gen_helper_sme_mova_cz_b
, gen_helper_sme_mova_cz_h
,
127 gen_helper_sme_mova_cz_s
, gen_helper_sme_mova_cz_d
,
128 gen_helper_sme_mova_cz_q
,
130 static gen_helper_gvec_3
* const zc_fns
[5] = {
131 gen_helper_sme_mova_zc_b
, gen_helper_sme_mova_zc_h
,
132 gen_helper_sme_mova_zc_s
, gen_helper_sme_mova_zc_d
,
133 gen_helper_sme_mova_zc_q
,
136 TCGv_ptr t_za
, t_zr
, t_pg
;
140 if (!dc_isar_feature(aa64_sme
, s
)) {
143 if (!sme_smza_enabled_check(s
)) {
147 t_za
= get_tile_rowcol(s
, a
->esz
, a
->rs
, a
->za_imm
, a
->v
);
148 t_zr
= vec_full_reg_ptr(s
, a
->zr
);
149 t_pg
= pred_full_reg_ptr(s
, a
->pg
);
151 svl
= streaming_vec_reg_size(s
);
152 t_desc
= tcg_constant_i32(simd_desc(svl
, svl
, 0));
155 /* Vertical slice -- use sme mova helpers. */
157 zc_fns
[a
->esz
](t_zr
, t_za
, t_pg
, t_desc
);
159 cz_fns
[a
->esz
](t_za
, t_zr
, t_pg
, t_desc
);
162 /* Horizontal slice -- reuse sve sel helpers. */
164 h_fns
[a
->esz
](t_zr
, t_za
, t_zr
, t_pg
, t_desc
);
166 h_fns
[a
->esz
](t_za
, t_zr
, t_za
, t_pg
, t_desc
);
170 tcg_temp_free_ptr(t_za
);
171 tcg_temp_free_ptr(t_zr
);
172 tcg_temp_free_ptr(t_pg
);
177 static bool trans_LDST1(DisasContext
*s
, arg_LDST1
*a
)
179 typedef void GenLdSt1(TCGv_env
, TCGv_ptr
, TCGv_ptr
, TCGv
, TCGv_i32
);
182 * Indexed by [esz][be][v][mte][st], which is (except for load/store)
183 * also the order in which the elements appear in the function names,
184 * and so how we must concatenate the pieces.
187 #define FN_LS(F) { gen_helper_sme_ld1##F, gen_helper_sme_st1##F }
188 #define FN_MTE(F) { FN_LS(F), FN_LS(F##_mte) }
189 #define FN_HV(F) { FN_MTE(F##_h), FN_MTE(F##_v) }
190 #define FN_END(L, B) { FN_HV(L), FN_HV(B) }
192 static GenLdSt1
* const fns
[5][2][2][2][2] = {
208 bool be
= s
->be_data
== MO_BE
;
209 bool mte
= s
->mte_active
[0];
211 if (!dc_isar_feature(aa64_sme
, s
)) {
214 if (!sme_smza_enabled_check(s
)) {
218 t_za
= get_tile_rowcol(s
, a
->esz
, a
->rs
, a
->za_imm
, a
->v
);
219 t_pg
= pred_full_reg_ptr(s
, a
->pg
);
220 addr
= tcg_temp_new_i64();
222 tcg_gen_shli_i64(addr
, cpu_reg(s
, a
->rm
), a
->esz
);
223 tcg_gen_add_i64(addr
, addr
, cpu_reg_sp(s
, a
->rn
));
226 desc
= FIELD_DP32(desc
, MTEDESC
, MIDX
, get_mem_index(s
));
227 desc
= FIELD_DP32(desc
, MTEDESC
, TBI
, s
->tbid
);
228 desc
= FIELD_DP32(desc
, MTEDESC
, TCMA
, s
->tcma
);
229 desc
= FIELD_DP32(desc
, MTEDESC
, WRITE
, a
->st
);
230 desc
= FIELD_DP32(desc
, MTEDESC
, SIZEM1
, (1 << a
->esz
) - 1);
231 desc
<<= SVE_MTEDESC_SHIFT
;
233 addr
= clean_data_tbi(s
, addr
);
235 svl
= streaming_vec_reg_size(s
);
236 desc
= simd_desc(svl
, svl
, desc
);
238 fns
[a
->esz
][be
][a
->v
][mte
][a
->st
](cpu_env
, t_za
, t_pg
, addr
,
239 tcg_constant_i32(desc
));
241 tcg_temp_free_ptr(t_za
);
242 tcg_temp_free_ptr(t_pg
);
243 tcg_temp_free_i64(addr
);
247 typedef void GenLdStR(DisasContext
*, TCGv_ptr
, int, int, int, int);
249 static bool do_ldst_r(DisasContext
*s
, arg_ldstr
*a
, GenLdStR
*fn
)
251 int svl
= streaming_vec_reg_size(s
);
255 if (!sme_za_enabled_check(s
)) {
259 /* ZA[n] equates to ZA0H.B[n]. */
260 base
= get_tile_rowcol(s
, MO_8
, a
->rv
, imm
, false);
262 fn(s
, base
, 0, svl
, a
->rn
, imm
* svl
);
264 tcg_temp_free_ptr(base
);
268 TRANS_FEAT(LDR
, aa64_sme
, do_ldst_r
, a
, gen_sve_ldr
)
269 TRANS_FEAT(STR
, aa64_sme
, do_ldst_r
, a
, gen_sve_str
)
271 static bool do_adda(DisasContext
*s
, arg_adda
*a
, MemOp esz
,
272 gen_helper_gvec_4
*fn
)
274 int svl
= streaming_vec_reg_size(s
);
275 uint32_t desc
= simd_desc(svl
, svl
, 0);
276 TCGv_ptr za
, zn
, pn
, pm
;
278 if (!sme_smza_enabled_check(s
)) {
282 /* Sum XZR+zad to find ZAd. */
283 za
= get_tile_rowcol(s
, esz
, 31, a
->zad
, false);
284 zn
= vec_full_reg_ptr(s
, a
->zn
);
285 pn
= pred_full_reg_ptr(s
, a
->pn
);
286 pm
= pred_full_reg_ptr(s
, a
->pm
);
288 fn(za
, zn
, pn
, pm
, tcg_constant_i32(desc
));
290 tcg_temp_free_ptr(za
);
291 tcg_temp_free_ptr(zn
);
292 tcg_temp_free_ptr(pn
);
293 tcg_temp_free_ptr(pm
);
297 TRANS_FEAT(ADDHA_s
, aa64_sme
, do_adda
, a
, MO_32
, gen_helper_sme_addha_s
)
298 TRANS_FEAT(ADDVA_s
, aa64_sme
, do_adda
, a
, MO_32
, gen_helper_sme_addva_s
)
299 TRANS_FEAT(ADDHA_d
, aa64_sme_i16i64
, do_adda
, a
, MO_64
, gen_helper_sme_addha_d
)
300 TRANS_FEAT(ADDVA_d
, aa64_sme_i16i64
, do_adda
, a
, MO_64
, gen_helper_sme_addva_d
)
302 static bool do_outprod(DisasContext
*s
, arg_op
*a
, MemOp esz
,
303 gen_helper_gvec_5
*fn
)
305 int svl
= streaming_vec_reg_size(s
);
306 uint32_t desc
= simd_desc(svl
, svl
, a
->sub
);
307 TCGv_ptr za
, zn
, zm
, pn
, pm
;
309 if (!sme_smza_enabled_check(s
)) {
313 /* Sum XZR+zad to find ZAd. */
314 za
= get_tile_rowcol(s
, esz
, 31, a
->zad
, false);
315 zn
= vec_full_reg_ptr(s
, a
->zn
);
316 zm
= vec_full_reg_ptr(s
, a
->zm
);
317 pn
= pred_full_reg_ptr(s
, a
->pn
);
318 pm
= pred_full_reg_ptr(s
, a
->pm
);
320 fn(za
, zn
, zm
, pn
, pm
, tcg_constant_i32(desc
));
322 tcg_temp_free_ptr(za
);
323 tcg_temp_free_ptr(zn
);
324 tcg_temp_free_ptr(pn
);
325 tcg_temp_free_ptr(pm
);
329 static bool do_outprod_fpst(DisasContext
*s
, arg_op
*a
, MemOp esz
,
330 gen_helper_gvec_5_ptr
*fn
)
332 int svl
= streaming_vec_reg_size(s
);
333 uint32_t desc
= simd_desc(svl
, svl
, a
->sub
);
334 TCGv_ptr za
, zn
, zm
, pn
, pm
, fpst
;
336 if (!sme_smza_enabled_check(s
)) {
340 /* Sum XZR+zad to find ZAd. */
341 za
= get_tile_rowcol(s
, esz
, 31, a
->zad
, false);
342 zn
= vec_full_reg_ptr(s
, a
->zn
);
343 zm
= vec_full_reg_ptr(s
, a
->zm
);
344 pn
= pred_full_reg_ptr(s
, a
->pn
);
345 pm
= pred_full_reg_ptr(s
, a
->pm
);
346 fpst
= fpstatus_ptr(FPST_FPCR
);
348 fn(za
, zn
, zm
, pn
, pm
, fpst
, tcg_constant_i32(desc
));
350 tcg_temp_free_ptr(za
);
351 tcg_temp_free_ptr(zn
);
352 tcg_temp_free_ptr(pn
);
353 tcg_temp_free_ptr(pm
);
354 tcg_temp_free_ptr(fpst
);
358 TRANS_FEAT(FMOPA_h
, aa64_sme
, do_outprod_fpst
, a
, MO_32
, gen_helper_sme_fmopa_h
)
359 TRANS_FEAT(FMOPA_s
, aa64_sme
, do_outprod_fpst
, a
, MO_32
, gen_helper_sme_fmopa_s
)
360 TRANS_FEAT(FMOPA_d
, aa64_sme_f64f64
, do_outprod_fpst
, a
, MO_64
, gen_helper_sme_fmopa_d
)
362 /* TODO: FEAT_EBF16 */
363 TRANS_FEAT(BFMOPA
, aa64_sme
, do_outprod
, a
, MO_32
, gen_helper_sme_bfmopa
)
365 TRANS_FEAT(SMOPA_s
, aa64_sme
, do_outprod
, a
, MO_32
, gen_helper_sme_smopa_s
)
366 TRANS_FEAT(UMOPA_s
, aa64_sme
, do_outprod
, a
, MO_32
, gen_helper_sme_umopa_s
)
367 TRANS_FEAT(SUMOPA_s
, aa64_sme
, do_outprod
, a
, MO_32
, gen_helper_sme_sumopa_s
)
368 TRANS_FEAT(USMOPA_s
, aa64_sme
, do_outprod
, a
, MO_32
, gen_helper_sme_usmopa_s
)
370 TRANS_FEAT(SMOPA_d
, aa64_sme_i16i64
, do_outprod
, a
, MO_64
, gen_helper_sme_smopa_d
)
371 TRANS_FEAT(UMOPA_d
, aa64_sme_i16i64
, do_outprod
, a
, MO_64
, gen_helper_sme_umopa_d
)
372 TRANS_FEAT(SUMOPA_d
, aa64_sme_i16i64
, do_outprod
, a
, MO_64
, gen_helper_sme_sumopa_d
)
373 TRANS_FEAT(USMOPA_d
, aa64_sme_i16i64
, do_outprod
, a
, MO_64
, gen_helper_sme_usmopa_d
)