target/arm/tcg/translate-sme.c

   1 /*
   2  * AArch64 SME translation
   3  *
   4  * Copyright (c) 2022 Linaro, Ltd
   5  *
   6  * This library is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2.1 of the License, or (at your option) any later version.
  10  *
  11  * This library is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
  18  */
  19
  20 #include "qemu/osdep.h"
  21 #include "translate.h"
  22 #include "translate-a64.h"
  23
  24 /*
  25  * Include the generated decoder.
  26  */
  27
  28 #include "decode-sme.c.inc"
  29
  30
  31 /*
  32  * Resolve tile.size[index] to a host pointer, where tile and index
  33  * are always decoded together, dependent on the element size.
  34  */
  35 static TCGv_ptr get_tile_rowcol(DisasContext *s, int esz, int rs,
  36                                 int tile_index, bool vertical)
  37 {
  38     int tile = tile_index >> (4 - esz);
  39     int index = esz == MO_128 ? 0 : extract32(tile_index, 0, 4 - esz);
  40     int pos, len, offset;
  41     TCGv_i32 tmp;
  42     TCGv_ptr addr;
  43
  44     /* Compute the final index, which is Rs+imm. */
  45     tmp = tcg_temp_new_i32();
  46     tcg_gen_trunc_tl_i32(tmp, cpu_reg(s, rs));
  47     tcg_gen_addi_i32(tmp, tmp, index);
  48
  49     /* Prepare a power-of-two modulo via extraction of @len bits. */
  50     len = ctz32(streaming_vec_reg_size(s)) - esz;
  51
  52     if (vertical) {
  53         /*
  54          * Compute the byte offset of the index within the tile:
  55          *     (index % (svl / size)) * size
  56          *   = (index % (svl >> esz)) << esz
  57          * Perform the power-of-two modulo via extraction of the low @len bits.
  58          * Perform the multiply by shifting left by @pos bits.
  59          * Perform these operations simultaneously via deposit into zero.
  60          */
  61         pos = esz;
  62         tcg_gen_deposit_z_i32(tmp, tmp, pos, len);
  63
  64         /*
  65          * For big-endian, adjust the indexed column byte offset within
  66          * the uint64_t host words that make up env->zarray[].
  67          */
  68         if (HOST_BIG_ENDIAN && esz < MO_64) {
  69             tcg_gen_xori_i32(tmp, tmp, 8 - (1 << esz));
  70         }
  71     } else {
  72         /*
  73          * Compute the byte offset of the index within the tile:
  74          *     (index % (svl / size)) * (size * sizeof(row))
  75          *   = (index % (svl >> esz)) << (esz + log2(sizeof(row)))
  76          */
  77         pos = esz + ctz32(sizeof(ARMVectorReg));
  78         tcg_gen_deposit_z_i32(tmp, tmp, pos, len);
  79
  80         /* Row slices are always aligned and need no endian adjustment. */
  81     }
  82
  83     /* The tile byte offset within env->zarray is the row. */
  84     offset = tile * sizeof(ARMVectorReg);
  85
  86     /* Include the byte offset of zarray to make this relative to env. */
  87     offset += offsetof(CPUARMState, zarray);
  88     tcg_gen_addi_i32(tmp, tmp, offset);
  89
  90     /* Add the byte offset to env to produce the final pointer. */
  91     addr = tcg_temp_new_ptr();
  92     tcg_gen_ext_i32_ptr(addr, tmp);
  93     tcg_gen_add_ptr(addr, addr, tcg_env);
  94
  95     return addr;
  96 }
  97
  98 /*
  99  * Resolve tile.size[0] to a host pointer.
 100  * Used by e.g. outer product insns where we require the entire tile.
 101  */
 102 static TCGv_ptr get_tile(DisasContext *s, int esz, int tile)
 103 {
 104     TCGv_ptr addr = tcg_temp_new_ptr();
 105     int offset;
 106
 107     offset = tile * sizeof(ARMVectorReg) + offsetof(CPUARMState, zarray);
 108
 109     tcg_gen_addi_ptr(addr, tcg_env, offset);
 110     return addr;
 111 }
 112
 113 static bool trans_ZERO(DisasContext *s, arg_ZERO *a)
 114 {
 115     if (!dc_isar_feature(aa64_sme, s)) {
 116         return false;
 117     }
 118     if (sme_za_enabled_check(s)) {
 119         gen_helper_sme_zero(tcg_env, tcg_constant_i32(a->imm),
 120                             tcg_constant_i32(streaming_vec_reg_size(s)));
 121     }
 122     return true;
 123 }
 124
 125 static bool trans_MOVA(DisasContext *s, arg_MOVA *a)
 126 {
 127     static gen_helper_gvec_4 * const h_fns[5] = {
 128         gen_helper_sve_sel_zpzz_b, gen_helper_sve_sel_zpzz_h,
 129         gen_helper_sve_sel_zpzz_s, gen_helper_sve_sel_zpzz_d,
 130         gen_helper_sve_sel_zpzz_q
 131     };
 132     static gen_helper_gvec_3 * const cz_fns[5] = {
 133         gen_helper_sme_mova_cz_b, gen_helper_sme_mova_cz_h,
 134         gen_helper_sme_mova_cz_s, gen_helper_sme_mova_cz_d,
 135         gen_helper_sme_mova_cz_q,
 136     };
 137     static gen_helper_gvec_3 * const zc_fns[5] = {
 138         gen_helper_sme_mova_zc_b, gen_helper_sme_mova_zc_h,
 139         gen_helper_sme_mova_zc_s, gen_helper_sme_mova_zc_d,
 140         gen_helper_sme_mova_zc_q,
 141     };
 142
 143     TCGv_ptr t_za, t_zr, t_pg;
 144     TCGv_i32 t_desc;
 145     int svl;
 146
 147     if (!dc_isar_feature(aa64_sme, s)) {
 148         return false;
 149     }
 150     if (!sme_smza_enabled_check(s)) {
 151         return true;
 152     }
 153
 154     t_za = get_tile_rowcol(s, a->esz, a->rs, a->za_imm, a->v);
 155     t_zr = vec_full_reg_ptr(s, a->zr);
 156     t_pg = pred_full_reg_ptr(s, a->pg);
 157
 158     svl = streaming_vec_reg_size(s);
 159     t_desc = tcg_constant_i32(simd_desc(svl, svl, 0));
 160
 161     if (a->v) {
 162         /* Vertical slice -- use sme mova helpers. */
 163         if (a->to_vec) {
 164             zc_fns[a->esz](t_zr, t_za, t_pg, t_desc);
 165         } else {
 166             cz_fns[a->esz](t_za, t_zr, t_pg, t_desc);
 167         }
 168     } else {
 169         /* Horizontal slice -- reuse sve sel helpers. */
 170         if (a->to_vec) {
 171             h_fns[a->esz](t_zr, t_za, t_zr, t_pg, t_desc);
 172         } else {
 173             h_fns[a->esz](t_za, t_zr, t_za, t_pg, t_desc);
 174         }
 175     }
 176     return true;
 177 }
 178
 179 static bool trans_LDST1(DisasContext *s, arg_LDST1 *a)
 180 {
 181     typedef void GenLdSt1(TCGv_env, TCGv_ptr, TCGv_ptr, TCGv, TCGv_i32);
 182
 183     /*
 184      * Indexed by [esz][be][v][mte][st], which is (except for load/store)
 185      * also the order in which the elements appear in the function names,
 186      * and so how we must concatenate the pieces.
 187      */
 188
 189 #define FN_LS(F)     { gen_helper_sme_ld1##F, gen_helper_sme_st1##F }
 190 #define FN_MTE(F)    { FN_LS(F), FN_LS(F##_mte) }
 191 #define FN_HV(F)     { FN_MTE(F##_h), FN_MTE(F##_v) }
 192 #define FN_END(L, B) { FN_HV(L), FN_HV(B) }
 193
 194     static GenLdSt1 * const fns[5][2][2][2][2] = {
 195         FN_END(b, b),
 196         FN_END(h_le, h_be),
 197         FN_END(s_le, s_be),
 198         FN_END(d_le, d_be),
 199         FN_END(q_le, q_be),
 200     };
 201
 202 #undef FN_LS
 203 #undef FN_MTE
 204 #undef FN_HV
 205 #undef FN_END
 206
 207     TCGv_ptr t_za, t_pg;
 208     TCGv_i64 addr;
 209     int svl, desc = 0;
 210     bool be = s->be_data == MO_BE;
 211     bool mte = s->mte_active[0];
 212
 213     if (!dc_isar_feature(aa64_sme, s)) {
 214         return false;
 215     }
 216     if (!sme_smza_enabled_check(s)) {
 217         return true;
 218     }
 219
 220     t_za = get_tile_rowcol(s, a->esz, a->rs, a->za_imm, a->v);
 221     t_pg = pred_full_reg_ptr(s, a->pg);
 222     addr = tcg_temp_new_i64();
 223
 224     tcg_gen_shli_i64(addr, cpu_reg(s, a->rm), a->esz);
 225     tcg_gen_add_i64(addr, addr, cpu_reg_sp(s, a->rn));
 226
 227     if (mte) {
 228         desc = FIELD_DP32(desc, MTEDESC, MIDX, get_mem_index(s));
 229         desc = FIELD_DP32(desc, MTEDESC, TBI, s->tbid);
 230         desc = FIELD_DP32(desc, MTEDESC, TCMA, s->tcma);
 231         desc = FIELD_DP32(desc, MTEDESC, WRITE, a->st);
 232         desc = FIELD_DP32(desc, MTEDESC, SIZEM1, (1 << a->esz) - 1);
 233         desc <<= SVE_MTEDESC_SHIFT;
 234     } else {
 235         addr = clean_data_tbi(s, addr);
 236     }
 237     svl = streaming_vec_reg_size(s);
 238     desc = simd_desc(svl, svl, desc);
 239
 240     fns[a->esz][be][a->v][mte][a->st](tcg_env, t_za, t_pg, addr,
 241                                       tcg_constant_i32(desc));
 242     return true;
 243 }
 244
 245 typedef void GenLdStR(DisasContext *, TCGv_ptr, int, int, int, int);
 246
 247 static bool do_ldst_r(DisasContext *s, arg_ldstr *a, GenLdStR *fn)
 248 {
 249     int svl = streaming_vec_reg_size(s);
 250     int imm = a->imm;
 251     TCGv_ptr base;
 252
 253     if (!sme_za_enabled_check(s)) {
 254         return true;
 255     }
 256
 257     /* ZA[n] equates to ZA0H.B[n]. */
 258     base = get_tile_rowcol(s, MO_8, a->rv, imm, false);
 259
 260     fn(s, base, 0, svl, a->rn, imm * svl);
 261     return true;
 262 }
 263
 264 TRANS_FEAT(LDR, aa64_sme, do_ldst_r, a, gen_sve_ldr)
 265 TRANS_FEAT(STR, aa64_sme, do_ldst_r, a, gen_sve_str)
 266
 267 static bool do_adda(DisasContext *s, arg_adda *a, MemOp esz,
 268                     gen_helper_gvec_4 *fn)
 269 {
 270     int svl = streaming_vec_reg_size(s);
 271     uint32_t desc = simd_desc(svl, svl, 0);
 272     TCGv_ptr za, zn, pn, pm;
 273
 274     if (!sme_smza_enabled_check(s)) {
 275         return true;
 276     }
 277
 278     za = get_tile(s, esz, a->zad);
 279     zn = vec_full_reg_ptr(s, a->zn);
 280     pn = pred_full_reg_ptr(s, a->pn);
 281     pm = pred_full_reg_ptr(s, a->pm);
 282
 283     fn(za, zn, pn, pm, tcg_constant_i32(desc));
 284     return true;
 285 }
 286
 287 TRANS_FEAT(ADDHA_s, aa64_sme, do_adda, a, MO_32, gen_helper_sme_addha_s)
 288 TRANS_FEAT(ADDVA_s, aa64_sme, do_adda, a, MO_32, gen_helper_sme_addva_s)
 289 TRANS_FEAT(ADDHA_d, aa64_sme_i16i64, do_adda, a, MO_64, gen_helper_sme_addha_d)
 290 TRANS_FEAT(ADDVA_d, aa64_sme_i16i64, do_adda, a, MO_64, gen_helper_sme_addva_d)
 291
 292 static bool do_outprod(DisasContext *s, arg_op *a, MemOp esz,
 293                        gen_helper_gvec_5 *fn)
 294 {
 295     int svl = streaming_vec_reg_size(s);
 296     uint32_t desc = simd_desc(svl, svl, a->sub);
 297     TCGv_ptr za, zn, zm, pn, pm;
 298
 299     if (!sme_smza_enabled_check(s)) {
 300         return true;
 301     }
 302
 303     za = get_tile(s, esz, a->zad);
 304     zn = vec_full_reg_ptr(s, a->zn);
 305     zm = vec_full_reg_ptr(s, a->zm);
 306     pn = pred_full_reg_ptr(s, a->pn);
 307     pm = pred_full_reg_ptr(s, a->pm);
 308
 309     fn(za, zn, zm, pn, pm, tcg_constant_i32(desc));
 310     return true;
 311 }
 312
 313 static bool do_outprod_fpst(DisasContext *s, arg_op *a, MemOp esz,
 314                             gen_helper_gvec_5_ptr *fn)
 315 {
 316     int svl = streaming_vec_reg_size(s);
 317     uint32_t desc = simd_desc(svl, svl, a->sub);
 318     TCGv_ptr za, zn, zm, pn, pm, fpst;
 319
 320     if (!sme_smza_enabled_check(s)) {
 321         return true;
 322     }
 323
 324     za = get_tile(s, esz, a->zad);
 325     zn = vec_full_reg_ptr(s, a->zn);
 326     zm = vec_full_reg_ptr(s, a->zm);
 327     pn = pred_full_reg_ptr(s, a->pn);
 328     pm = pred_full_reg_ptr(s, a->pm);
 329     fpst = fpstatus_ptr(FPST_FPCR);
 330
 331     fn(za, zn, zm, pn, pm, fpst, tcg_constant_i32(desc));
 332     return true;
 333 }
 334
 335 TRANS_FEAT(FMOPA_h, aa64_sme, do_outprod_fpst, a, MO_32, gen_helper_sme_fmopa_h)
 336 TRANS_FEAT(FMOPA_s, aa64_sme, do_outprod_fpst, a, MO_32, gen_helper_sme_fmopa_s)
 337 TRANS_FEAT(FMOPA_d, aa64_sme_f64f64, do_outprod_fpst, a, MO_64, gen_helper_sme_fmopa_d)
 338
 339 /* TODO: FEAT_EBF16 */
 340 TRANS_FEAT(BFMOPA, aa64_sme, do_outprod, a, MO_32, gen_helper_sme_bfmopa)
 341
 342 TRANS_FEAT(SMOPA_s, aa64_sme, do_outprod, a, MO_32, gen_helper_sme_smopa_s)
 343 TRANS_FEAT(UMOPA_s, aa64_sme, do_outprod, a, MO_32, gen_helper_sme_umopa_s)
 344 TRANS_FEAT(SUMOPA_s, aa64_sme, do_outprod, a, MO_32, gen_helper_sme_sumopa_s)
 345 TRANS_FEAT(USMOPA_s, aa64_sme, do_outprod, a, MO_32, gen_helper_sme_usmopa_s)
 346
 347 TRANS_FEAT(SMOPA_d, aa64_sme_i16i64, do_outprod, a, MO_64, gen_helper_sme_smopa_d)
 348 TRANS_FEAT(UMOPA_d, aa64_sme_i16i64, do_outprod, a, MO_64, gen_helper_sme_umopa_d)
 349 TRANS_FEAT(SUMOPA_d, aa64_sme_i16i64, do_outprod, a, MO_64, gen_helper_sme_sumopa_d)
 350 TRANS_FEAT(USMOPA_d, aa64_sme_i16i64, do_outprod, a, MO_64, gen_helper_sme_usmopa_d)