RISC-V: Disallow transformation into VLMAX AVL for cond_len_xxx when length is in...
[official-gcc.git] / gcc / config / riscv / riscv-v.cc
blob b4c7e0f0126eb68412628fccb8c8379c46427f5b
1 /* Subroutines used for code generation for RISC-V 'V' Extension for
2 GNU compiler.
3 Copyright (C) 2022-2023 Free Software Foundation, Inc.
4 Contributed by Juzhe Zhong (juzhe.zhong@rivai.ai), RiVAI Technologies Ltd.
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it
9 under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 3, or (at your option)
11 any later version.
13 GCC is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #define IN_TARGET_CODE 1
24 /* We have a maximum of 11 operands for RVV instruction patterns according to
25 the vector.md. */
26 #define RVV_INSN_OPERANDS_MAX 11
28 #include "config.h"
29 #include "system.h"
30 #include "coretypes.h"
31 #include "tm.h"
32 #include "backend.h"
33 #include "rtl.h"
34 #include "insn-config.h"
35 #include "insn-attr.h"
36 #include "recog.h"
37 #include "alias.h"
38 #include "tree.h"
39 #include "stringpool.h"
40 #include "attribs.h"
41 #include "explow.h"
42 #include "memmodel.h"
43 #include "emit-rtl.h"
44 #include "tm_p.h"
45 #include "target.h"
46 #include "targhooks.h"
47 #include "expr.h"
48 #include "optabs.h"
49 #include "tm-constrs.h"
50 #include "rtx-vector-builder.h"
51 #include "targhooks.h"
52 #include "predict.h"
54 using namespace riscv_vector;
56 namespace riscv_vector {
58 /* Return true if NUNITS <= 31 so that we can use an immediate AVL in vsetivli. */
59 bool
60 imm_avl_p (machine_mode mode)
62 poly_uint64 nunits = GET_MODE_NUNITS (mode);
64 return nunits.is_constant ()
65 /* The vsetivli can only encode an immediate AVL in the range 0~31. */
66 ? (IN_RANGE (nunits.to_constant (), 0, 31))
67 /* Only allowed in VLS-VLMAX mode. */
68 : false;
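/* Illustrative examples (assuming TARGET_MIN_VLEN == 128): a VLS mode such as
   V8SImode has a compile-time constant NUNITS of 8, which fits the vsetivli
   immediate, so imm_avl_p returns true.  A VLA mode such as RVVM1SImode has
   NUNITS = [4, 4], which is not a compile-time constant, so imm_avl_p
   returns false.  */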
71 /* Return true if LEN is equal to NUNITS and is out of the range [0, 31]. */
72 static bool
73 is_vlmax_len_p (machine_mode mode, rtx len)
75 poly_int64 value;
76 return poly_int_rtx_p (len, &value)
77 && known_eq (value, GET_MODE_NUNITS (mode))
78 && !satisfies_constraint_K (len);
81 /* Helper functions for insn_flags && insn_types */
83 /* Return true if the caller needs to pass a mask operand for an insn pattern
84 with INSN_FLAGS. */
86 static bool
87 need_mask_operand_p (unsigned insn_flags)
89 return (insn_flags & HAS_MASK_P)
90 && !(insn_flags & (USE_ONE_TRUE_MASK_P | USE_ALL_TRUES_MASK_P));
93 template <int MAX_OPERANDS> class insn_expander
95 public:
96 insn_expander () = delete;
98 insn_expander (unsigned insn_flags, bool vlmax_p)
99 : m_insn_flags (insn_flags), m_opno (0), m_vlmax_p (vlmax_p),
100 m_vl_op (NULL_RTX)
102 check_insn_flags ();
105 void check_insn_flags () const
107 if (m_insn_flags & USE_ONE_TRUE_MASK_P)
108 /* USE_ONE_TRUE_MASK_P is dependent on HAS_MASK_P. */
109 gcc_assert ((m_insn_flags & HAS_MASK_P));
111 if (m_insn_flags & USE_ALL_TRUES_MASK_P)
112 /* USE_ALL_TRUES_MASK_P is dependent on HAS_MASK_P. */
113 gcc_assert ((m_insn_flags & HAS_MASK_P));
115 /* USE_ONE_TRUE_MASK_P and USE_ALL_TRUES_MASK_P are mutually exclusive. */
116 gcc_assert (!((m_insn_flags & USE_ONE_TRUE_MASK_P)
117 && (m_insn_flags & USE_ALL_TRUES_MASK_P)));
119 if (m_insn_flags & USE_VUNDEF_MERGE_P)
120 /* USE_VUNDEF_MERGE_P is dependent on HAS_MERGE_P. */
121 gcc_assert ((m_insn_flags & HAS_MERGE_P));
123 /* TU_POLICY_P and TDEFAULT_POLICY_P are mutually exclusive. */
124 gcc_assert (
125 !((m_insn_flags & TU_POLICY_P) && (m_insn_flags & TDEFAULT_POLICY_P)));
127 /* MU_POLICY_P and MDEFAULT_POLICY_P are mutually exclusive. */
128 gcc_assert (
129 !((m_insn_flags & MU_POLICY_P) && (m_insn_flags & MDEFAULT_POLICY_P)));
131 /* NULLARY_OP_P, UNARY_OP_P, BINARY_OP_P, TERNARY_OP_P are mutually
132 exclusive. */
133 gcc_assert (
134 !((m_insn_flags & NULLARY_OP_P)
135 && ((m_insn_flags & UNARY_OP_P) || (m_insn_flags & BINARY_OP_P)
136 || (m_insn_flags & TERNARY_OP_P))));
137 gcc_assert (
138 !((m_insn_flags & UNARY_OP_P)
139 && ((m_insn_flags & NULLARY_OP_P) || (m_insn_flags & BINARY_OP_P)
140 || (m_insn_flags & TERNARY_OP_P))));
141 gcc_assert (
142 !((m_insn_flags & BINARY_OP_P)
143 && ((m_insn_flags & NULLARY_OP_P) || (m_insn_flags & UNARY_OP_P)
144 || (m_insn_flags & TERNARY_OP_P))));
145 gcc_assert (
146 !((m_insn_flags & TERNARY_OP_P)
147 && ((m_insn_flags & NULLARY_OP_P) || (m_insn_flags & UNARY_OP_P)
148 || (m_insn_flags & BINARY_OP_P))));
151 void set_vl (rtx vl) { m_vl_op = vl; }
153 void add_output_operand (rtx x, machine_mode mode)
155 create_output_operand (&m_ops[m_opno++], x, mode);
156 gcc_assert (m_opno <= MAX_OPERANDS);
158 void add_input_operand (rtx x, machine_mode mode)
160 create_input_operand (&m_ops[m_opno++], x, mode);
161 gcc_assert (m_opno <= MAX_OPERANDS);
163 void add_all_one_mask_operand (machine_mode mask_mode)
165 add_input_operand (CONSTM1_RTX (mask_mode), mask_mode);
167 void add_first_one_true_mask_operand (machine_mode mask_mode)
169 add_input_operand (gen_scalar_move_mask (mask_mode), mask_mode);
171 void add_vundef_operand (machine_mode dest_mode)
173 add_input_operand (RVV_VUNDEF (dest_mode), dest_mode);
175 void add_policy_operand ()
177 if (m_insn_flags & TU_POLICY_P)
179 rtx tail_policy_rtx = gen_int_mode (TAIL_UNDISTURBED, Pmode);
180 add_input_operand (tail_policy_rtx, Pmode);
182 else if (m_insn_flags & TDEFAULT_POLICY_P)
184 rtx tail_policy_rtx = gen_int_mode (get_prefer_tail_policy (), Pmode);
185 add_input_operand (tail_policy_rtx, Pmode);
188 if (m_insn_flags & MU_POLICY_P)
190 rtx mask_policy_rtx = gen_int_mode (MASK_UNDISTURBED, Pmode);
191 add_input_operand (mask_policy_rtx, Pmode);
193 else if (m_insn_flags & MDEFAULT_POLICY_P)
195 rtx mask_policy_rtx = gen_int_mode (get_prefer_mask_policy (), Pmode);
196 add_input_operand (mask_policy_rtx, Pmode);
199 void add_avl_type_operand (avl_type type)
201 add_input_operand (gen_int_mode (type, Pmode), Pmode);
204 void
205 add_rounding_mode_operand (enum floating_point_rounding_mode rounding_mode)
207 rtx frm_rtx = gen_int_mode (rounding_mode, Pmode);
208 add_input_operand (frm_rtx, Pmode);
211 /* Return the vtype mode based on insn_flags.
212 The vtype mode means the mode that the vsetvl insn sets. */
213 machine_mode
214 get_vtype_mode (rtx *ops)
216 machine_mode vtype_mode;
217 if (m_insn_flags & VTYPE_MODE_FROM_OP1_P)
218 vtype_mode = GET_MODE (ops[1]);
219 else
220 vtype_mode = GET_MODE (ops[0]);
221 return vtype_mode;
224 void emit_insn (enum insn_code icode, rtx *ops)
226 int opno = 0;
227 int num_ops;
228 /* It's true if any operand is memory operand. */
229 bool any_mem_p = false;
231 machine_mode vtype_mode = get_vtype_mode (ops);
232 machine_mode mask_mode = get_mask_mode (vtype_mode);
234 /* Add dest operand. */
235 if (m_insn_flags & HAS_DEST_P)
237 rtx op = ops[opno++];
238 any_mem_p |= MEM_P (op);
239 add_output_operand (op, GET_MODE (op));
242 /* Add mask operand. */
243 if (m_insn_flags & USE_ONE_TRUE_MASK_P)
244 add_first_one_true_mask_operand (mask_mode);
245 else if (m_insn_flags & USE_ALL_TRUES_MASK_P)
246 add_all_one_mask_operand (mask_mode);
247 else if (m_insn_flags & HAS_MASK_P)
249 machine_mode mode = insn_data[(int) icode].operand[m_opno].mode;
250 gcc_assert (mode != VOIDmode);
251 add_input_operand (ops[opno++], mode);
254 /* Add merge operand. */
255 if (m_insn_flags & USE_VUNDEF_MERGE_P)
256 /* Same as dest operand. */
257 add_vundef_operand (GET_MODE (ops[0]));
258 else if (m_insn_flags & HAS_MERGE_P)
260 machine_mode mode = insn_data[(int) icode].operand[m_opno].mode;
261 gcc_assert (mode != VOIDmode);
262 add_input_operand (ops[opno++], mode);
265 if (m_insn_flags & NULLARY_OP_P)
266 num_ops = 0;
267 else if (m_insn_flags & UNARY_OP_P)
268 num_ops = 1;
269 else if (m_insn_flags & BINARY_OP_P)
270 num_ops = 2;
271 else if (m_insn_flags & TERNARY_OP_P)
272 num_ops = 3;
273 else
274 gcc_unreachable ();
276 /* Add the remaining operands. */
277 for (; num_ops; num_ops--, opno++)
279 any_mem_p |= MEM_P (ops[opno]);
280 machine_mode mode = insn_data[(int) icode].operand[m_opno].mode;
281 /* create_input_operand doesn't allow VOIDmode.
282 According to vector.md, we may have some patterns that do not have an
283 explicit machine mode specifying the operand. Such operands are
284 always Pmode. */
285 if (mode == VOIDmode)
286 mode = Pmode;
287 else
288 /* Early assertion ensures same mode since maybe_legitimize_operand
289 will check this. */
290 gcc_assert (GET_MODE (ops[opno]) == VOIDmode
291 || GET_MODE (ops[opno]) == mode);
293 add_input_operand (ops[opno], mode);
296 /* Add vl operand. */
297 rtx len = m_vl_op;
298 bool vls_p = false;
299 if (m_vlmax_p)
301 if (riscv_v_ext_vls_mode_p (vtype_mode))
303 /* VLS modes always set VSETVL by
304 "vsetvl zero, rs1/imm". */
305 poly_uint64 nunits = GET_MODE_NUNITS (vtype_mode);
306 len = gen_int_mode (nunits, Pmode);
307 vls_p = true;
309 else if (can_create_pseudo_p ())
311 len = gen_reg_rtx (Pmode);
312 emit_vlmax_vsetvl (vtype_mode, len);
316 gcc_assert (len != NULL_RTX);
317 add_input_operand (len, Pmode);
319 /* Add tail and mask policy operands. */
320 add_policy_operand ();
322 /* Add avl_type operand. */
323 add_avl_type_operand (
324 vls_p ? avl_type::VLS
325 : (m_vlmax_p ? avl_type::VLMAX : avl_type::NONVLMAX));
327 /* Add rounding mode operand. */
328 if (m_insn_flags & FRM_DYN_P)
329 add_rounding_mode_operand (FRM_DYN);
330 else if (m_insn_flags & FRM_RUP_P)
331 add_rounding_mode_operand (FRM_RUP);
332 else if (m_insn_flags & FRM_RDN_P)
333 add_rounding_mode_operand (FRM_RDN);
334 else if (m_insn_flags & FRM_RMM_P)
335 add_rounding_mode_operand (FRM_RMM);
336 else if (m_insn_flags & FRM_RNE_P)
337 add_rounding_mode_operand (FRM_RNE);
339 gcc_assert (insn_data[(int) icode].n_operands == m_opno);
340 expand (icode, any_mem_p);
343 void expand (enum insn_code icode, bool temporary_volatile_p = false)
345 if (temporary_volatile_p)
347 temporary_volatile_ok v (true);
348 expand_insn (icode, m_opno, m_ops);
350 else
351 expand_insn (icode, m_opno, m_ops);
354 private:
355 unsigned m_insn_flags;
356 int m_opno;
357 bool m_vlmax_p;
358 rtx m_vl_op;
359 expand_operand m_ops[MAX_OPERANDS];
362 /* Emit an RVV insn with a vector length that equals the number of units of the
363 vector mode. For VLA modes this corresponds to VLMAX.
365 Unless the vector length can be encoded in the vsetivl[i] instruction this
366 function must only be used as long as we can create pseudo registers. This is
367 because it will set a pseudo register to VLMAX using vsetvl and use this as
368 definition for the vector length. */
369 void
370 emit_vlmax_insn (unsigned icode, unsigned insn_flags, rtx *ops)
372 insn_expander<RVV_INSN_OPERANDS_MAX> e (insn_flags, true);
373 gcc_assert (can_create_pseudo_p () || imm_avl_p (e.get_vtype_mode (ops)));
375 e.emit_insn ((enum insn_code) icode, ops);
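/* A typical use of emit_vlmax_insn, as in the expanders below (illustrative
   sketch only):

     rtx ops[] = {target, src1, src2};
     emit_vlmax_insn (code_for_pred (PLUS, mode), BINARY_OP, ops);

   which emits a predicated vadd whose AVL is the VLMAX of MODE, with the
   mask, merge, policy and avl_type operands filled in by the expander.  */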
378 /* Like emit_vlmax_insn but must only be used when we cannot create pseudo
379 registers anymore. This function, however, takes a predefined vector length
380 from the value in VL. */
381 void
382 emit_vlmax_insn_lra (unsigned icode, unsigned insn_flags, rtx *ops, rtx vl)
384 gcc_assert (!can_create_pseudo_p ());
385 machine_mode mode = GET_MODE (ops[0]);
387 if (imm_avl_p (mode))
389 /* Even though VL is a real hardreg already allocated since
390 we are post-RA now, we still gain a benefit by emitting
391 vsetivli zero, imm instead of vsetvli VL, zero, since it
392 allows more flexibility in post-RA instruction scheduling. */
393 insn_expander<RVV_INSN_OPERANDS_MAX> e (insn_flags, false);
394 e.set_vl (gen_int_mode (GET_MODE_NUNITS (mode), Pmode));
395 e.emit_insn ((enum insn_code) icode, ops);
397 else
399 insn_expander<RVV_INSN_OPERANDS_MAX> e (insn_flags, true);
400 e.set_vl (vl);
401 e.emit_insn ((enum insn_code) icode, ops);
405 /* Emit an RVV insn with a predefined vector length. Contrary to
406 emit_vlmax_insn the instruction's vector length is not deduced from its mode
407 but taken from the value in VL. */
408 void
409 emit_nonvlmax_insn (unsigned icode, unsigned insn_flags, rtx *ops, rtx vl)
411 insn_expander<RVV_INSN_OPERANDS_MAX> e (insn_flags, false);
412 e.set_vl (vl);
413 e.emit_insn ((enum insn_code) icode, ops);
416 class rvv_builder : public rtx_vector_builder
418 public:
419 rvv_builder () : rtx_vector_builder () {}
420 rvv_builder (machine_mode mode, unsigned int npatterns,
421 unsigned int nelts_per_pattern)
422 : rtx_vector_builder (mode, npatterns, nelts_per_pattern)
424 m_inner_mode = GET_MODE_INNER (mode);
425 m_inner_bits_size = GET_MODE_BITSIZE (m_inner_mode);
426 m_inner_bytes_size = GET_MODE_SIZE (m_inner_mode);
427 m_mask_mode = get_mask_mode (mode);
429 gcc_assert (
430 int_mode_for_size (inner_bits_size (), 0).exists (&m_inner_int_mode));
431 m_int_mode
432 = get_vector_mode (m_inner_int_mode, GET_MODE_NUNITS (mode)).require ();
435 bool can_duplicate_repeating_sequence_p ();
436 rtx get_merged_repeating_sequence ();
438 bool repeating_sequence_use_merge_profitable_p ();
439 bool combine_sequence_use_slideup_profitable_p ();
440 bool combine_sequence_use_merge_profitable_p ();
441 rtx get_merge_scalar_mask (unsigned int, machine_mode) const;
443 bool single_step_npatterns_p () const;
444 bool npatterns_all_equal_p () const;
445 bool interleaved_stepped_npatterns_p () const;
446 bool npatterns_vid_diff_repeated_p () const;
448 machine_mode new_mode () const { return m_new_mode; }
449 scalar_mode inner_mode () const { return m_inner_mode; }
450 scalar_int_mode inner_int_mode () const { return m_inner_int_mode; }
451 machine_mode mask_mode () const { return m_mask_mode; }
452 machine_mode int_mode () const { return m_int_mode; }
453 unsigned int inner_bits_size () const { return m_inner_bits_size; }
454 unsigned int inner_bytes_size () const { return m_inner_bytes_size; }
456 private:
457 scalar_mode m_inner_mode;
458 scalar_int_mode m_inner_int_mode;
459 machine_mode m_new_mode;
460 scalar_int_mode m_new_inner_mode;
461 machine_mode m_mask_mode;
462 machine_mode m_int_mode;
463 unsigned int m_inner_bits_size;
464 unsigned int m_inner_bytes_size;
467 /* Return true if the vector can be built by duplicating a super element which
468 is the fusion of consecutive elements.
470 v = { a, b, a, b } super element = ab, v = { ab, ab } */
471 bool
472 rvv_builder::can_duplicate_repeating_sequence_p ()
474 poly_uint64 new_size = exact_div (full_nelts (), npatterns ());
475 unsigned int new_inner_size = m_inner_bits_size * npatterns ();
476 if (!int_mode_for_size (new_inner_size, 0).exists (&m_new_inner_mode)
477 || GET_MODE_SIZE (m_new_inner_mode) > UNITS_PER_WORD
478 || !get_vector_mode (m_new_inner_mode, new_size).exists (&m_new_mode))
479 return false;
480 if (full_nelts ().is_constant ())
481 return repeating_sequence_p (0, full_nelts ().to_constant (), npatterns ());
482 return nelts_per_pattern () == 1;
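/* Illustrative example: for a 16-bit element vector v = { a, b, a, b, ... }
   with NPATTERNS = 2, the fused super element is the 32-bit value
   (b << 16) | a, so the whole sequence can be emitted as one broadcast of
   that 32-bit constant in a vector mode with half as many units.  */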
485 /* Return true if it is a repeating sequence for which the
486 merge approach gives better codegen than the default
487 approach (slide1down).
489 Sequence A:
490 {a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b}
492 nelts = 16
493 npatterns = 2
495 for merging a we need mask 101010....
496 for merging b we need mask 010101....
498 For each element in the npattern, we need to build a mask in a scalar register.
499 Mostly we need 3 instructions (aka COST = 3), which consist of 2 scalar
500 instructions and 1 scalar move to the v0 register. Finally we need a vector
501 merge to merge them.
503 lui a5, #imm
504 add a5, #imm
505 vmov.s.x v0, a5
506 vmerge.vxm v9, v9, a1, v0
508 So the overall (roughly) COST of Sequence A = (3 + 1) * npatterns = 8.
509 If we use slide1down, the COST = nelts = 16 > 8 (COST of merge).
510 So return true in this case as it is profitable.
512 Sequence B:
513 {a, b, c, d, e, f, g, h, a, b, c, d, e, f, g, h}
515 nelts = 16
516 npatterns = 8
518 COST of merge approach = (3 + 1) * npatterns = 32
519 COST of slide1down approach = nelts = 16
520 Return false in this case as the merge approach is NOT profitable.
522 bool
523 rvv_builder::repeating_sequence_use_merge_profitable_p ()
525 if (inner_bytes_size () > UNITS_PER_WORD)
526 return false;
528 unsigned int nelts = full_nelts ().to_constant ();
530 if (!repeating_sequence_p (0, nelts, npatterns ()))
531 return false;
533 unsigned int merge_cost = 1;
534 unsigned int build_merge_mask_cost = 3;
535 unsigned int slide1down_cost = nelts;
537 return (build_merge_mask_cost + merge_cost) * npatterns () < slide1down_cost;
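/* Worked cost check (illustrative): for { a, b, c, d, a, b, c, d, ... } with
   nelts = 16 and npatterns = 4, the merge cost is (3 + 1) * 4 = 16, which is
   not strictly less than the slide1down cost of 16, so we return false and
   keep the default slide1down expansion.  */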
540 /* Return true if it's worthwhile to use slideup combine 2 vectors. */
541 bool
542 rvv_builder::combine_sequence_use_slideup_profitable_p ()
544 int nelts = full_nelts ().to_constant ();
545 int leading_ndups = this->count_dups (0, nelts - 1, 1);
546 int trailing_ndups = this->count_dups (nelts - 1, -1, -1);
548 /* ??? The current heuristic is that we combine 2 vectors
549 by slideup when:
550 1. # of leading identical elements is equal to # of trailing identical elements.
551 2. Both of the above are equal to nelts / 2.
552 Otherwise, it is not profitable. */
553 return leading_ndups == trailing_ndups && trailing_ndups == nelts / 2;
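/* E.g. (illustrative) for nelts = 8: { a, a, a, a, b, b, b, b } has four
   leading and four trailing identical elements, both equal to nelts / 2, so
   slideup is considered profitable; { a, a, a, b, b, b, b, b } is not.  */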
556 /* Return true if it's worthwhile to use merge to combine a vector with a scalar. */
557 bool
558 rvv_builder::combine_sequence_use_merge_profitable_p ()
560 int nelts = full_nelts ().to_constant ();
561 int leading_ndups = this->count_dups (0, nelts - 1, 1);
562 int trailing_ndups = this->count_dups (nelts - 1, -1, -1);
563 int nregs = riscv_get_v_regno_alignment (int_mode ());
565 if (leading_ndups + trailing_ndups != nelts)
566 return false;
568 /* If the number of leading elements is > 255, which exceeds the maximum
569 value of QImode, we will need to use HImode. */
570 machine_mode mode;
571 if (leading_ndups > 255 || nregs > 2)
573 if (!get_vector_mode (HImode, nelts).exists (&mode))
574 return false;
575 /* We will need one more AVL/VL toggling vsetvl instruction. */
576 return leading_ndups > 4 && trailing_ndups > 4;
579 /* { a, a, a, b, b, ... , b } and { b, b, b, a, a, ... , a }
580 consume 3 slide instructions. */
581 return leading_ndups > 3 && trailing_ndups > 3;
584 /* Merge the repeating sequence into a single element and return the RTX. */
586 rvv_builder::get_merged_repeating_sequence ()
588 scalar_int_mode mode = Pmode;
589 rtx target = gen_reg_rtx (mode);
590 emit_move_insn (target, const0_rtx);
591 rtx imm = gen_int_mode ((1ULL << m_inner_bits_size) - 1, mode);
592 /* { a, b, a, b }: Generate duplicate element = b << bits | a. */
593 for (unsigned int i = 0; i < npatterns (); i++)
595 unsigned int loc = m_inner_bits_size * i;
596 rtx shift = gen_int_mode (loc, mode);
597 rtx ele = gen_lowpart (mode, elt (i));
598 rtx tmp = expand_simple_binop (mode, AND, ele, imm, NULL_RTX, false,
599 OPTAB_DIRECT);
600 rtx tmp2 = expand_simple_binop (mode, ASHIFT, tmp, shift, NULL_RTX, false,
601 OPTAB_DIRECT);
602 rtx tmp3 = expand_simple_binop (mode, IOR, tmp2, target, NULL_RTX, false,
603 OPTAB_DIRECT);
604 emit_move_insn (target, tmp3);
606 if (GET_MODE_SIZE (m_new_inner_mode) < UNITS_PER_WORD)
607 return gen_lowpart (m_new_inner_mode, target);
608 return target;
611 /* Get the mask for the merge approach.
613 Consider the following case:
614 {a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b}
615 To merge "a", the mask should be 1010....
616 To merge "b", the mask should be 0101....
619 rvv_builder::get_merge_scalar_mask (unsigned int index_in_pattern,
620 machine_mode inner_mode) const
622 unsigned HOST_WIDE_INT mask = 0;
623 unsigned HOST_WIDE_INT base_mask = (1ULL << index_in_pattern);
624 /* Here we construct a mask pattern that will later be broadcast
625 to a vector register. The maximum broadcast size for vmv.v.x/vmv.s.x
626 is determined by the length of a vector element (ELEN) and not by
627 XLEN so make sure we do not exceed it. One example is -march=zve32*
628 which mandates ELEN == 32 but can be combined with -march=rv64
629 with XLEN == 64. */
630 unsigned int elen = TARGET_VECTOR_ELEN_64 ? 64 : 32;
632 gcc_assert (elen % npatterns () == 0);
634 int limit = elen / npatterns ();
636 for (int i = 0; i < limit; i++)
637 mask |= base_mask << (i * npatterns ());
639 return gen_int_mode (mask, inner_mode);
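/* Worked example (illustrative, assuming ELEN == 32): with NPATTERNS = 2,
   the limit is 32 / 2 = 16 iterations, so INDEX_IN_PATTERN = 0 yields the
   mask 0x55555555 (every even element selected) and INDEX_IN_PATTERN = 1
   yields 0xaaaaaaaa.  */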
642 /* Return true if the variable-length vector is single step.
643 Single step means the steps of all patterns in NPATTERNS are equal.
644 Consider this following case:
646 CASE 1: NPATTERNS = 2, NELTS_PER_PATTERN = 3.
647 { 0, 2, 2, 4, 4, 6, ... }
648 First pattern: step1 = 2 - 0 = 2
649 step2 = 4 - 2 = 2
650 Second pattern: step1 = 4 - 2 = 2
651 step2 = 6 - 4 = 2
652 Since all steps of NPATTERNS are equal step = 2.
653 Return true in this case.
655 CASE 2: NPATTERNS = 2, NELTS_PER_PATTERN = 3.
656 { 0, 1, 2, 4, 4, 7, ... }
657 First pattern: step1 = 2 - 0 = 2
658 step2 = 4 - 2 = 2
659 Second pattern: step1 = 4 - 1 = 3
660 step2 = 7 - 4 = 3
661 Since not all steps are equal, return false. */
662 bool
663 rvv_builder::single_step_npatterns_p () const
665 if (nelts_per_pattern () != 3)
666 return false;
668 poly_int64 step
669 = rtx_to_poly_int64 (elt (npatterns ())) - rtx_to_poly_int64 (elt (0));
670 for (unsigned int i = 0; i < npatterns (); i++)
672 poly_int64 ele0 = rtx_to_poly_int64 (elt (i));
673 poly_int64 ele1 = rtx_to_poly_int64 (elt (npatterns () + i));
674 poly_int64 ele2 = rtx_to_poly_int64 (elt (npatterns () * 2 + i));
675 poly_int64 diff1 = ele1 - ele0;
676 poly_int64 diff2 = ele2 - ele1;
677 if (maybe_ne (step, diff1) || maybe_ne (step, diff2))
678 return false;
680 return true;
683 /* Return true if the diff between const vector and vid sequence
684 is repeated. For example as below cases:
685 The diff means the const vector - vid.
686 CASE 1:
687 CONST VECTOR: {3, 2, 1, 0, 7, 6, 5, 4, ... }
688 VID : {0, 1, 2, 3, 4, 5, 6, 7, ... }
689 DIFF(MINUS) : {3, 1,-1,-3, 3, 1,-1,-3, ... }
690 The diff sequence {3, 1,-1,-3} is repeated in the npattern, so we
691 return TRUE for case 1.
693 CASE 2:
694 CONST VECTOR: {-4, 4,-3, 5,-2, 6,-1, 7, ...}
695 VID : { 0, 1, 2, 3, 4, 5, 6, 7, ... }
696 DIFF(MINUS) : {-4, 3,-5, 2,-6, 1,-7, 0, ... }
697 The diff sequence {-4, 3} is not repeated in the npattern, so we
698 return FALSE for case 2. */
699 bool
700 rvv_builder::npatterns_vid_diff_repeated_p () const
702 if (nelts_per_pattern () != 3)
703 return false;
704 else if (npatterns () == 0)
705 return false;
707 for (unsigned i = 0; i < npatterns (); i++)
709 poly_int64 diff_0 = rtx_to_poly_int64 (elt (i)) - i;
710 poly_int64 diff_1
711 = rtx_to_poly_int64 (elt (npatterns () + i)) - npatterns () - i;
713 if (maybe_ne (diff_0, diff_1))
714 return false;
717 return true;
720 /* Return true if the permutation consists of two
721 interleaved patterns with a constant step each.
722 TODO: We currently only support NPATTERNS = 2. */
723 bool
724 rvv_builder::interleaved_stepped_npatterns_p () const
726 if (npatterns () != 2 || nelts_per_pattern () != 3)
727 return false;
728 for (unsigned int i = 0; i < npatterns (); i++)
730 poly_int64 ele0 = rtx_to_poly_int64 (elt (i));
731 poly_int64 ele1 = rtx_to_poly_int64 (elt (npatterns () + i));
732 poly_int64 ele2 = rtx_to_poly_int64 (elt (npatterns () * 2 + i));
733 poly_int64 diff1 = ele1 - ele0;
734 poly_int64 diff2 = ele2 - ele1;
735 if (maybe_ne (diff1, diff2))
736 return false;
738 return true;
741 /* Return true if all elements of NPATTERNS are equal.
743 E.g. NPATTERNS = 4:
744 { 2, 2, 2, 2, 4, 4, 4, 4, 8, 8, 8, 8, 16, 16, 16, 16, ... }
745 E.g. NPATTERNS = 8:
746 { 2, 2, 2, 2, 2, 2, 2, 2, 8, 8, 8, 8, 8, 8, 8, 8, ... }
747 We only check whether ele[0] ~ ele[NPATTERNS - 1] are the same.
748 We don't need to check elements[n] with n >= NPATTERNS since
749 they don't belong to the same pattern.
751 bool
752 rvv_builder::npatterns_all_equal_p () const
754 poly_int64 ele0 = rtx_to_poly_int64 (elt (0));
755 for (unsigned int i = 1; i < npatterns (); i++)
757 poly_int64 ele = rtx_to_poly_int64 (elt (i));
758 if (!known_eq (ele, ele0))
759 return false;
761 return true;
764 static unsigned
765 get_sew (machine_mode mode)
767 unsigned int sew = GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
768 ? 8
769 : GET_MODE_BITSIZE (GET_MODE_INNER (mode));
770 return sew;
773 /* Return true if X is a const_vector with all duplicate elements, and that
774 element is in the range between MINVAL and MAXVAL. */
775 bool
776 const_vec_all_same_in_range_p (rtx x, HOST_WIDE_INT minval,
777 HOST_WIDE_INT maxval)
779 rtx elt;
780 return (const_vec_duplicate_p (x, &elt) && CONST_INT_P (elt)
781 && IN_RANGE (INTVAL (elt), minval, maxval));
784 /* Return true if VEC is a constant in which every element is in the range
785 [MINVAL, MAXVAL]. The elements do not need to have the same value.
787 This function also exists in aarch64, we may unify it in middle-end in the
788 future. */
790 static bool
791 const_vec_all_in_range_p (rtx vec, poly_int64 minval, poly_int64 maxval)
793 if (!CONST_VECTOR_P (vec)
794 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
795 return false;
797 int nunits;
798 if (!CONST_VECTOR_STEPPED_P (vec))
799 nunits = const_vector_encoded_nelts (vec);
800 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
801 return false;
803 for (int i = 0; i < nunits; i++)
805 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
806 poly_int64 value;
807 if (!poly_int_rtx_p (vec_elem, &value)
808 || maybe_lt (value, minval)
809 || maybe_gt (value, maxval))
810 return false;
812 return true;
815 /* Return a const vector of VAL. The VAL can be either const_int or
816 const_poly_int. */
818 static rtx
819 gen_const_vector_dup (machine_mode mode, poly_int64 val)
821 scalar_mode smode = GET_MODE_INNER (mode);
822 rtx c = gen_int_mode (val, smode);
823 if (!val.is_constant () && GET_MODE_SIZE (smode) > GET_MODE_SIZE (Pmode))
825 /* When VAL is const_poly_int value, we need to explicitly broadcast
826 it into a vector using RVV broadcast instruction. */
827 return expand_vector_broadcast (mode, c);
829 return gen_const_vec_duplicate (mode, c);
832 /* Emit a vlmax vsetvl instruction. This should only be used when
833 optimization is disabled or after vsetvl insertion pass. */
834 void
835 emit_hard_vlmax_vsetvl (machine_mode vmode, rtx vl)
837 unsigned int sew = get_sew (vmode);
838 emit_insn (gen_vsetvl (Pmode, vl, RVV_VLMAX, gen_int_mode (sew, Pmode),
839 gen_int_mode (get_vlmul (vmode), Pmode), const0_rtx,
840 const0_rtx));
843 void
844 emit_vlmax_vsetvl (machine_mode vmode, rtx vl)
846 unsigned int sew = get_sew (vmode);
847 enum vlmul_type vlmul = get_vlmul (vmode);
848 unsigned int ratio = calculate_ratio (sew, vlmul);
850 if (!optimize)
851 emit_hard_vlmax_vsetvl (vmode, vl);
852 else
853 emit_insn (gen_vlmax_avl (Pmode, vl, gen_int_mode (ratio, Pmode)));
856 /* Calculate SEW/LMUL ratio. */
857 unsigned int
858 calculate_ratio (unsigned int sew, enum vlmul_type vlmul)
860 unsigned int ratio;
861 switch (vlmul)
863 case LMUL_1:
864 ratio = sew;
865 break;
866 case LMUL_2:
867 ratio = sew / 2;
868 break;
869 case LMUL_4:
870 ratio = sew / 4;
871 break;
872 case LMUL_8:
873 ratio = sew / 8;
874 break;
875 case LMUL_F8:
876 ratio = sew * 8;
877 break;
878 case LMUL_F4:
879 ratio = sew * 4;
880 break;
881 case LMUL_F2:
882 ratio = sew * 2;
883 break;
884 default:
885 gcc_unreachable ();
887 return ratio;
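/* E.g. (illustrative): SEW = 32 with LMUL_2 gives ratio = 32 / 2 = 16, while
   SEW = 16 with the fractional LMUL_F2 gives ratio = 16 * 2 = 32.  Modes
   sharing the same SEW/LMUL ratio have the same VLMAX and can therefore
   share one vsetvl setting.  */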
890 /* SCALABLE means that the vector-length is agnostic (run-time invariant and
891 compile-time unknown). FIXED means that the vector-length is specific
892 (compile-time known). Both RVV_SCALABLE and RVV_FIXED_VLMAX do
893 auto-vectorization using the VLMAX vsetvl configuration. */
894 static bool
895 autovec_use_vlmax_p (void)
897 return (riscv_autovec_preference == RVV_SCALABLE
898 || riscv_autovec_preference == RVV_FIXED_VLMAX);
901 /* This function emits VLMAX vrgather instruction. Emit vrgather.vx/vi when sel
902 is a const duplicate vector. Otherwise, emit vrgather.vv. */
903 static void
904 emit_vlmax_gather_insn (rtx target, rtx op, rtx sel)
906 rtx elt;
907 insn_code icode;
908 machine_mode data_mode = GET_MODE (target);
909 machine_mode sel_mode = GET_MODE (sel);
910 if (const_vec_duplicate_p (sel, &elt))
912 icode = code_for_pred_gather_scalar (data_mode);
913 sel = elt;
915 else if (maybe_ne (GET_MODE_SIZE (data_mode), GET_MODE_SIZE (sel_mode)))
916 icode = code_for_pred_gatherei16 (data_mode);
917 else
918 icode = code_for_pred_gather (data_mode);
919 rtx ops[] = {target, op, sel};
920 emit_vlmax_insn (icode, BINARY_OP, ops);
923 static void
924 emit_vlmax_masked_gather_mu_insn (rtx target, rtx op, rtx sel, rtx mask)
926 rtx elt;
927 insn_code icode;
928 machine_mode data_mode = GET_MODE (target);
929 machine_mode sel_mode = GET_MODE (sel);
930 if (const_vec_duplicate_p (sel, &elt))
932 icode = code_for_pred_gather_scalar (data_mode);
933 sel = elt;
935 else if (maybe_ne (GET_MODE_SIZE (data_mode), GET_MODE_SIZE (sel_mode)))
936 icode = code_for_pred_gatherei16 (data_mode);
937 else
938 icode = code_for_pred_gather (data_mode);
939 rtx ops[] = {target, mask, target, op, sel};
940 emit_vlmax_insn (icode, BINARY_OP_TAMU, ops);
943 /* According to RVV ISA spec (16.5.1. Synthesizing vdecompress):
944 https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc
946 There is no inverse vdecompress provided, as this operation can be readily
947 synthesized using iota and a masked vrgather:
949 Desired functionality of 'vdecompress'
950 7 6 5 4 3 2 1 0 # vid
952 e d c b a # packed vector of 5 elements
953 1 0 0 1 1 1 0 1 # mask vector of 8 elements
954 p q r s t u v w # destination register before vdecompress
956 e q r d c b v a # result of vdecompress
957 # v0 holds mask
958 # v1 holds packed data
959 # v11 holds input expanded vector and result
960 viota.m v10, v0 # Calc iota from mask in v0
961 vrgather.vv v11, v1, v10, v0.t # Expand into destination
962 p q r s t u v w # v11 destination register
963 e d c b a # v1 source vector
964 1 0 0 1 1 1 0 1 # v0 mask vector
966 4 4 4 3 2 1 1 0 # v10 result of viota.m
967 e q r d c b v a # v11 destination after vrgather using viota.m under mask
969 static void
970 emit_vlmax_decompress_insn (rtx target, rtx op0, rtx op1, rtx mask)
972 machine_mode data_mode = GET_MODE (target);
973 machine_mode sel_mode = related_int_vector_mode (data_mode).require ();
974 if (GET_MODE_INNER (data_mode) == QImode)
975 sel_mode = get_vector_mode (HImode, GET_MODE_NUNITS (data_mode)).require ();
977 rtx sel = gen_reg_rtx (sel_mode);
978 rtx iota_ops[] = {sel, mask};
979 emit_vlmax_insn (code_for_pred_iota (sel_mode), UNARY_OP, iota_ops);
980 emit_vlmax_gather_insn (target, op0, sel);
981 emit_vlmax_masked_gather_mu_insn (target, op1, sel, mask);
984 /* Emit merge instruction. */
986 static machine_mode
987 get_repeating_sequence_dup_machine_mode (const rvv_builder &builder,
988 machine_mode mask_bit_mode)
990 unsigned mask_precision = GET_MODE_PRECISION (mask_bit_mode).to_constant ();
991 unsigned mask_scalar_size = mask_precision > builder.inner_bits_size ()
992 ? builder.inner_bits_size () : mask_precision;
994 scalar_mode inner_mode;
995 unsigned minimal_bits_size;
997 switch (mask_scalar_size)
999 case 8:
1000 inner_mode = QImode;
1001 minimal_bits_size = TARGET_MIN_VLEN / 8; /* AKA RVVMF8. */
1002 break;
1003 case 16:
1004 inner_mode = HImode;
1005 minimal_bits_size = TARGET_MIN_VLEN / 4; /* AKA RVVMF4. */
1006 break;
1007 case 32:
1008 inner_mode = SImode;
1009 minimal_bits_size = TARGET_MIN_VLEN / 2; /* AKA RVVMF2. */
1010 break;
1011 case 64:
1012 inner_mode = DImode;
1013 minimal_bits_size = TARGET_MIN_VLEN / 1; /* AKA RVVM1. */
1014 break;
1015 default:
1016 gcc_unreachable ();
1017 break;
1020 gcc_assert (mask_precision % mask_scalar_size == 0);
1022 uint64_t dup_nunit = mask_precision > mask_scalar_size
1023 ? mask_precision / mask_scalar_size : minimal_bits_size / mask_scalar_size;
1025 return get_vector_mode (inner_mode, dup_nunit).require ();
1028 /* Expand series const vector. If VID is NULL_RTX, we use vid.v
1029 instructions to generate sequence for VID:
1031 VID = { 0, 1, 2, 3, ... }
1033 Otherwise, we use the VID argument directly. */
1035 void
1036 expand_vec_series (rtx dest, rtx base, rtx step, rtx vid)
1038 machine_mode mode = GET_MODE (dest);
1039 poly_int64 nunits_m1 = GET_MODE_NUNITS (mode) - 1;
1040 poly_int64 value;
1041 rtx result = register_operand (dest, mode) ? dest : gen_reg_rtx (mode);
1043 /* VECT_IV = BASE + I * STEP. */
1045 /* Step 1: Generate I = { 0, 1, 2, ... } by vid.v. */
1046 bool reverse_p = !vid && rtx_equal_p (step, constm1_rtx)
1047 && poly_int_rtx_p (base, &value)
1048 && known_eq (nunits_m1, value);
1049 if (!vid)
1051 vid = gen_reg_rtx (mode);
1052 rtx op[] = {vid};
1053 emit_vlmax_insn (code_for_pred_series (mode), NULLARY_OP, op);
1056 rtx step_adj;
1057 if (reverse_p)
1059 /* Special case:
1060 {nunits - 1, nunits - 2, ... , 0}.
1061 nunits can be either const_int or const_poly_int.
1063 Code sequence:
1064 vid.v v
1065 vrsub nunits - 1, v. */
1066 rtx ops[]
1067 = {result, vid, gen_int_mode (nunits_m1, GET_MODE_INNER (mode))};
1068 insn_code icode = code_for_pred_sub_reverse_scalar (mode);
1069 emit_vlmax_insn (icode, BINARY_OP, ops);
1071 else
1073 /* Step 2: Generate I * STEP.
1074 - STEP is 1, we don't emit any instructions.
1075 - STEP is power of 2, we use vsll.vi/vsll.vx.
1076 - STEP is non-power of 2, we use vmul.vx. */
1077 if (rtx_equal_p (step, const1_rtx))
1078 step_adj = vid;
1079 else
1081 step_adj = gen_reg_rtx (mode);
1082 if (CONST_INT_P (step) && pow2p_hwi (INTVAL (step)))
1084 /* Emit logical left shift operation. */
1085 int shift = exact_log2 (INTVAL (step));
1086 rtx shift_amount = gen_int_mode (shift, Pmode);
1087 insn_code icode = code_for_pred_scalar (ASHIFT, mode);
1088 rtx ops[] = {step_adj, vid, shift_amount};
1089 emit_vlmax_insn (icode, BINARY_OP, ops);
1091 else
1093 insn_code icode = code_for_pred_scalar (MULT, mode);
1094 rtx ops[] = {step_adj, vid, step};
1095 emit_vlmax_insn (icode, BINARY_OP, ops);
1099 /* Step 3: Generate BASE + I * STEP.
1100 - BASE is 0, use result of vid.
1101 - BASE is not 0, we use vadd.vx/vadd.vi. */
1102 if (rtx_equal_p (base, const0_rtx))
1103 emit_move_insn (result, step_adj);
1104 else
1106 insn_code icode = code_for_pred_scalar (PLUS, mode);
1107 rtx ops[] = {result, step_adj, base};
1108 emit_vlmax_insn (icode, BINARY_OP, ops);
1112 if (result != dest)
1113 emit_move_insn (dest, result);
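/* Illustrative example of the expansion above: BASE = 3, STEP = 4 roughly
   becomes

     vid.v   v1             # v1 = { 0, 1, 2, 3, ... }
     vsll.vi v1, v1, 2      # v1 = { 0, 4, 8, 12, ... }  (STEP is a power of 2)
     vadd.vx v1, v1, a0     # v1 = { 3, 7, 11, 15, ... } (a0 holds BASE)

   A non-power-of-two STEP would use vmul.vx instead of the shift, and a
   zero BASE skips the final add.  */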
1116 static void
1117 expand_const_vector (rtx target, rtx src)
1119 machine_mode mode = GET_MODE (target);
1120 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
1122 rtx elt;
1123 gcc_assert (
1124 const_vec_duplicate_p (src, &elt)
1125 && (rtx_equal_p (elt, const0_rtx) || rtx_equal_p (elt, const1_rtx)));
1126 rtx ops[] = {target, src};
1127 emit_vlmax_insn (code_for_pred_mov (mode), UNARY_MASK_OP, ops);
1128 return;
1131 rtx elt;
1132 if (const_vec_duplicate_p (src, &elt))
1134 rtx tmp = register_operand (target, mode) ? target : gen_reg_rtx (mode);
1135 /* For an integer element in the range -16 ~ 15 or a 0.0 floating-point
1136 element, we use the vmv.v.i instruction. */
1137 if (satisfies_constraint_vi (src) || satisfies_constraint_Wc0 (src))
1139 rtx ops[] = {tmp, src};
1140 emit_vlmax_insn (code_for_pred_mov (mode), UNARY_OP, ops);
1142 else
1144 /* Emit the vec_duplicate<mode> split pattern before RA so that
1145 we have better optimization opportunities in LICM,
1146 which will hoist vmv.v.x outside the loop, and in fwprop && combine,
1147 which will transform 'vv' into 'vx' instructions.
1149 The reason we don't emit the vec_duplicate<mode> split pattern during
1150 RA is that the split stage after RA is too late to generate an
1151 RVV instruction which needs an additional register (we can't
1152 allocate a new register after RA) for the VL operand of the vsetvl
1153 instruction (vsetvl a5, zero). */
1154 if (lra_in_progress)
1156 rtx ops[] = {tmp, elt};
1157 emit_vlmax_insn (code_for_pred_broadcast (mode), UNARY_OP, ops);
1159 else
1161 struct expand_operand ops[2];
1162 enum insn_code icode = optab_handler (vec_duplicate_optab, mode);
1163 gcc_assert (icode != CODE_FOR_nothing);
1164 create_output_operand (&ops[0], tmp, mode);
1165 create_input_operand (&ops[1], elt, GET_MODE_INNER (mode));
1166 expand_insn (icode, 2, ops);
1167 tmp = ops[0].value;
1171 if (tmp != target)
1172 emit_move_insn (target, tmp);
1173 return;
1176 /* Support scalable const series vector. */
1177 rtx base, step;
1178 if (const_vec_series_p (src, &base, &step))
1180 expand_vec_series (target, base, step);
1181 return;
1184 /* Handle variable-length vector. */
1185 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
1186 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
1187 rvv_builder builder (mode, npatterns, nelts_per_pattern);
1188 for (unsigned int i = 0; i < nelts_per_pattern; i++)
1190 for (unsigned int j = 0; j < npatterns; j++)
1191 builder.quick_push (CONST_VECTOR_ELT (src, i * npatterns + j));
1193 builder.finalize ();
1195 if (CONST_VECTOR_DUPLICATE_P (src))
1197 /* Handle the case of a repeating sequence with NELTS_PER_PATTERN = 1.
1198 E.g. NPATTERNS = 4, v = { 0, 2, 6, 7, ... }
1199 NPATTERNS = 8, v = { 0, 2, 6, 7, 19, 20, 8, 7 ... }
1200 The elements within NPATTERNS are not necessarily regular. */
1201 if (builder.can_duplicate_repeating_sequence_p ())
1203 /* We handle the case where we can find a vector container to hold
1204 element bitsize = NPATTERNS * ele_bitsize.
1206 NPATTERNS = 8, element width = 8
1207 v = { 0, 1, 2, 3, 4, 5, 6, 7, ... }
1208 In this case, we can combine NPATTERNS elements into a larger
1209 element. Use element width = 64 and broadcast a vector with
1210 all elements equal to 0x0706050403020100. */
1211 rtx ele = builder.get_merged_repeating_sequence ();
1212 rtx dup = expand_vector_broadcast (builder.new_mode (), ele);
1213 emit_move_insn (target, gen_lowpart (mode, dup));
1215 else
1217 /* We handle the case where we can't find a vector container to hold
1218 element bitsize = NPATTERNS * ele_bitsize.
1220 NPATTERNS = 8, element width = 16
1221 v = { 0, 1, 2, 3, 4, 5, 6, 7, ... }
1222 Since NPATTERNS * element width = 128, we can't find a container
1223 to hold it.
1225 In this case, we use NPATTERNS merge operations to generate such
1226 vector. */
1227 unsigned int nbits = npatterns - 1;
1229 /* Generate vid = { 0, 1, 2, 3, 4, 5, 6, 7, ... }. */
1230 rtx vid = gen_reg_rtx (builder.int_mode ());
1231 rtx op[] = {vid};
1232 emit_vlmax_insn (code_for_pred_series (builder.int_mode ()),
1233 NULLARY_OP, op);
1235 /* Generate vid_repeat = { 0, 1, ... nbits, ... } */
1236 rtx vid_repeat = gen_reg_rtx (builder.int_mode ());
1237 rtx and_ops[] = {vid_repeat, vid,
1238 gen_int_mode (nbits, builder.inner_int_mode ())};
1239 emit_vlmax_insn (code_for_pred_scalar (AND, builder.int_mode ()),
1240 BINARY_OP, and_ops);
1242 rtx tmp = gen_reg_rtx (builder.mode ());
1243 rtx dup_ops[] = {tmp, builder.elt (0)};
1244 emit_vlmax_insn (code_for_pred_broadcast (builder.mode ()), UNARY_OP,
1245 dup_ops);
1246 for (unsigned int i = 1; i < builder.npatterns (); i++)
1248 /* Generate mask according to i. */
1249 rtx mask = gen_reg_rtx (builder.mask_mode ());
1250 rtx const_vec = gen_const_vector_dup (builder.int_mode (), i);
1251 expand_vec_cmp (mask, EQ, vid_repeat, const_vec);
1253 /* Merge scalar to each i. */
1254 rtx tmp2 = gen_reg_rtx (builder.mode ());
1255 rtx merge_ops[] = {tmp2, tmp, builder.elt (i), mask};
1256 insn_code icode = code_for_pred_merge_scalar (builder.mode ());
1257 emit_vlmax_insn (icode, MERGE_OP, merge_ops);
1258 tmp = tmp2;
1260 emit_move_insn (target, tmp);
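/* Illustrative walk-through of the merge loop above for NPATTERNS = 4 and
   v = { a, b, c, d, a, b, c, d, ... }: vid_repeat = vid & 3
   = { 0, 1, 2, 3, 0, 1, 2, 3, ... }; we broadcast a, then for i = 1, 2, 3
   compare vid_repeat == i to form a mask and vmerge the scalar elt (i) under
   that mask, which leaves { a, b, c, d, a, b, c, d, ... } in TARGET.  */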
1263 else if (CONST_VECTOR_STEPPED_P (src))
1265 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
1266 if (builder.single_step_npatterns_p ())
1268 /* Describe the case by choosing NPATTERNS = 4 as an example. */
1269 insn_code icode;
1271 /* Step 1: Generate vid = { 0, 1, 2, 3, 4, 5, 6, 7, ... }. */
1272 rtx vid = gen_reg_rtx (builder.mode ());
1273 rtx vid_ops[] = {vid};
1274 icode = code_for_pred_series (builder.mode ());
1275 emit_vlmax_insn (icode, NULLARY_OP, vid_ops);
1277 if (builder.npatterns_all_equal_p ())
1279 /* Generate the variable-length vector following this rule:
1280 { a, a, a + step, a + step, a + step * 2, a + step * 2, ...}
1281 E.g. { 0, 0, 8, 8, 16, 16, ... } */
1282 /* We want to create a pattern where value[ix] = floor (ix /
1283 NPATTERNS). As NPATTERNS is always a power of two we can
1284 rewrite this as = ix & -NPATTERNS. */
1285 /* Step 2: VID AND -NPATTERNS:
1286 { 0&-4, 1&-4, 2&-4, 3 &-4, 4 &-4, 5 &-4, 6 &-4, 7 &-4, ... }
1288 rtx imm
1289 = gen_int_mode (-builder.npatterns (), builder.inner_mode ());
1290 rtx tmp = gen_reg_rtx (builder.mode ());
1291 rtx and_ops[] = {tmp, vid, imm};
1292 icode = code_for_pred_scalar (AND, builder.mode ());
1293 emit_vlmax_insn (icode, BINARY_OP, and_ops);
1294 HOST_WIDE_INT init_val = INTVAL (builder.elt (0));
1295 if (init_val == 0)
1296 emit_move_insn (target, tmp);
1297 else
1299 rtx dup = gen_const_vector_dup (builder.mode (), init_val);
1300 rtx add_ops[] = {target, tmp, dup};
1301 icode = code_for_pred (PLUS, builder.mode ());
1302 emit_vlmax_insn (icode, BINARY_OP, add_ops);
1305 else
1307 /* Generate the variable-length vector following this rule:
1308 { a, b, a + step, b + step, a + step*2, b + step*2, ... } */
1310 if (builder.npatterns_vid_diff_repeated_p ())
1312 /* Case 1: For example as below:
1313 {3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8... }
1314 We have 3 - 0 = 3, which equals 7 - 4 = 3; the sequence is
1315 repeated as below after subtracting vid.
1316 {3, 1, -1, -3, 3, 1, -1, -3...}
1317 Then we can simplify the diff code gen to at most
1318 npatterns (). */
1319 rvv_builder v (builder.mode (), builder.npatterns (), 1);
1321 /* Step 1: Generate diff = TARGET - VID. */
1322 for (unsigned int i = 0; i < v.npatterns (); ++i)
1324 poly_int64 diff = rtx_to_poly_int64 (builder.elt (i)) - i;
1325 v.quick_push (gen_int_mode (diff, v.inner_mode ()));
1328 /* Step 2: Generate result = VID + diff. */
1329 rtx vec = v.build ();
1330 rtx add_ops[] = {target, vid, vec};
1331 emit_vlmax_insn (code_for_pred (PLUS, builder.mode ()),
1332 BINARY_OP, add_ops);
1334 else
1336 /* Case 2: For example as below:
1337 { -4, 4, -4 + 1, 4 + 1, -4 + 2, 4 + 2, -4 + 3, 4 + 3, ... }
1339 rvv_builder v (builder.mode (), builder.npatterns (), 1);
1341 /* Step 1: Generate { a, b, a, b, ... } */
1342 for (unsigned int i = 0; i < v.npatterns (); ++i)
1343 v.quick_push (builder.elt (i));
1344 rtx new_base = v.build ();
1346 /* Step 2: Generate tmp = VID >> LOG2 (NPATTERNS). */
1347 rtx shift_count
1348 = gen_int_mode (exact_log2 (builder.npatterns ()),
1349 builder.inner_mode ());
1350 rtx tmp = expand_simple_binop (builder.mode (), LSHIFTRT,
1351 vid, shift_count, NULL_RTX,
1352 false, OPTAB_DIRECT);
1354 /* Step 3: Generate tmp2 = tmp * step. */
1355 rtx tmp2 = gen_reg_rtx (builder.mode ());
1356 rtx step
1357 = simplify_binary_operation (MINUS, builder.inner_mode (),
1358 builder.elt (v.npatterns()),
1359 builder.elt (0));
1360 expand_vec_series (tmp2, const0_rtx, step, tmp);
1362 /* Step 4: Generate target = tmp2 + new_base. */
1363 rtx add_ops[] = {target, tmp2, new_base};
1364 emit_vlmax_insn (code_for_pred (PLUS, builder.mode ()),
1365 BINARY_OP, add_ops);
1369 else if (builder.interleaved_stepped_npatterns_p ())
1371 rtx base1 = builder.elt (0);
1372 rtx base2 = builder.elt (1);
1373 poly_int64 step1
1374 = rtx_to_poly_int64 (builder.elt (builder.npatterns ()))
1375 - rtx_to_poly_int64 (base1);
1376 poly_int64 step2
1377 = rtx_to_poly_int64 (builder.elt (builder.npatterns () + 1))
1378 - rtx_to_poly_int64 (base2);
1380 /* For { 1, 0, 2, 0, ... , n - 1, 0 }, we can use larger EEW
1381 integer vector mode to generate such vector efficiently.
1383 E.g. EEW = 16, { 2, 0, 4, 0, ... }
1385 can be interpreted into:
1387 EEW = 32, { 2, 4, ... } */
1388 unsigned int new_smode_bitsize = builder.inner_bits_size () * 2;
1389 scalar_int_mode new_smode;
1390 machine_mode new_mode;
1391 poly_uint64 new_nunits
1392 = exact_div (GET_MODE_NUNITS (builder.mode ()), 2);
1393 if (int_mode_for_size (new_smode_bitsize, 0).exists (&new_smode)
1394 && get_vector_mode (new_smode, new_nunits).exists (&new_mode))
1396 rtx tmp = gen_reg_rtx (new_mode);
1397 base1 = gen_int_mode (rtx_to_poly_int64 (base1), new_smode);
1398 expand_vec_series (tmp, base1, gen_int_mode (step1, new_smode));
1400 if (rtx_equal_p (base2, const0_rtx) && known_eq (step2, 0))
1401 /* { 1, 0, 2, 0, ... }. */
1402 emit_move_insn (target, gen_lowpart (mode, tmp));
1403 else if (known_eq (step2, 0))
1405 /* { 1, 1, 2, 1, ... }. */
1406 rtx scalar = expand_simple_binop (
1407 new_smode, ASHIFT,
1408 gen_int_mode (rtx_to_poly_int64 (base2), new_smode),
1409 gen_int_mode (builder.inner_bits_size (), new_smode),
1410 NULL_RTX, false, OPTAB_DIRECT);
1411 rtx tmp2 = gen_reg_rtx (new_mode);
1412 rtx and_ops[] = {tmp2, tmp, scalar};
1413 emit_vlmax_insn (code_for_pred_scalar (AND, new_mode),
1414 BINARY_OP, and_ops);
1415 emit_move_insn (target, gen_lowpart (mode, tmp2));
1417 else
1419 /* { 1, 3, 2, 6, ... }. */
1420 rtx tmp2 = gen_reg_rtx (new_mode);
1421 base2 = gen_int_mode (rtx_to_poly_int64 (base2), new_smode);
1422 expand_vec_series (tmp2, base2,
1423 gen_int_mode (step2, new_smode));
1424 rtx shifted_tmp2 = expand_simple_binop (
1425 new_mode, ASHIFT, tmp2,
1426 gen_int_mode (builder.inner_bits_size (), Pmode), NULL_RTX,
1427 false, OPTAB_DIRECT);
1428 rtx tmp3 = gen_reg_rtx (new_mode);
1429 rtx ior_ops[] = {tmp3, tmp, shifted_tmp2};
1430 emit_vlmax_insn (code_for_pred (IOR, new_mode), BINARY_OP,
1431 ior_ops);
1432 emit_move_insn (target, gen_lowpart (mode, tmp3));
1435 else
1437 rtx vid = gen_reg_rtx (mode);
1438 expand_vec_series (vid, const0_rtx, const1_rtx);
1439 /* Transform into { 0, 0, 1, 1, 2, 2, ... }. */
1440 rtx shifted_vid
1441 = expand_simple_binop (mode, LSHIFTRT, vid, const1_rtx,
1442 NULL_RTX, false, OPTAB_DIRECT);
1443 rtx tmp1 = gen_reg_rtx (mode);
1444 rtx tmp2 = gen_reg_rtx (mode);
1445 expand_vec_series (tmp1, base1,
1446 gen_int_mode (step1, builder.inner_mode ()),
1447 shifted_vid);
1448 expand_vec_series (tmp2, base2,
1449 gen_int_mode (step2, builder.inner_mode ()),
1450 shifted_vid);
1452 /* Transform into { 0, 1, 0, 1, 0, 1, ... }. */
1453 rtx and_vid = gen_reg_rtx (mode);
1454 rtx and_ops[] = {and_vid, vid, const1_rtx};
1455 emit_vlmax_insn (code_for_pred_scalar (AND, mode), BINARY_OP,
1456 and_ops);
1457 rtx mask = gen_reg_rtx (builder.mask_mode ());
1458 expand_vec_cmp (mask, EQ, and_vid, CONST1_RTX (mode));
1460 rtx ops[] = {target, tmp1, tmp2, mask};
1461 emit_vlmax_insn (code_for_pred_merge (mode), MERGE_OP, ops);
1464 else if (npatterns == 1 && nelts_per_pattern == 3)
1466 /* Generate the following CONST_VECTOR:
1467 { base0, base1, base1 + step, base1 + step * 2, ... } */
1468 rtx base0 = builder.elt (0);
1469 rtx base1 = builder.elt (1);
1470 rtx base2 = builder.elt (2);
1472 rtx step = simplify_binary_operation (MINUS, builder.inner_mode (),
1473 base2, base1);
1475 /* Step 1 - { base1, base1 + step, base1 + step * 2, ... } */
1476 rtx tmp = gen_reg_rtx (mode);
1477 expand_vec_series (tmp, base1, step);
1478 /* Step 2 - { base0, base1, base1 + step, base1 + step * 2, ... } */
1479 if (!rtx_equal_p (base0, const0_rtx))
1480 base0 = force_reg (builder.inner_mode (), base0);
1482 insn_code icode = optab_handler (vec_shl_insert_optab, mode);
1483 gcc_assert (icode != CODE_FOR_nothing);
1484 emit_insn (GEN_FCN (icode) (target, tmp, base0));
1486 else
1487 /* TODO: We will enable more variable-length vector in the future. */
1488 gcc_unreachable ();
1490 else
1491 gcc_unreachable ();
1494 /* Get the frm mode from the given CONST_INT rtx; the default mode is
1495 FRM_DYN. */
1496 enum floating_point_rounding_mode
1497 get_frm_mode (rtx operand)
1499 gcc_assert (CONST_INT_P (operand));
1501 switch (INTVAL (operand))
1503 case FRM_RNE:
1504 return FRM_RNE;
1505 case FRM_RTZ:
1506 return FRM_RTZ;
1507 case FRM_RDN:
1508 return FRM_RDN;
1509 case FRM_RUP:
1510 return FRM_RUP;
1511 case FRM_RMM:
1512 return FRM_RMM;
1513 case FRM_DYN:
1514 return FRM_DYN;
1515 default:
1516 gcc_unreachable ();
1519 gcc_unreachable ();
1522 /* Expand a pre-RA RVV data move from SRC to DEST.
1523 It expands moves for RVV fractional vector modes.
1524 Return true if the move has already been emitted. */
1525 bool
1526 legitimize_move (rtx dest, rtx *srcp)
1528 rtx src = *srcp;
1529 machine_mode mode = GET_MODE (dest);
1530 if (CONST_VECTOR_P (src))
1532 expand_const_vector (dest, src);
1533 return true;
1536 if (riscv_v_ext_vls_mode_p (mode))
1538 if (GET_MODE_NUNITS (mode).to_constant () <= 31)
1540 /* For NUNITS <= 31 VLS modes, we don't need an extra
1541 scalar register, so we apply the naive (set (op0) (op1)) pattern. */
1542 if (can_create_pseudo_p ())
1544 /* Need to force register if mem <- !reg. */
1545 if (MEM_P (dest) && !REG_P (src))
1546 *srcp = force_reg (mode, src);
1548 return false;
1551 else if (GET_MODE_NUNITS (mode).to_constant () > 31 && lra_in_progress)
1553 emit_insn (gen_mov_lra (mode, Pmode, dest, src));
1554 return true;
1557 else
1559 /* In order to decrease the memory traffic, we don't use whole register
1560 * load/store for LMUL less than 1 and mask modes, so those cases will
1561 * require one extra general purpose register, but that's not allowed during
1562 * the LRA process, so we have a special move pattern used for LRA, which will
1563 * defer the expansion until after LRA. */
1564 if ((known_lt (GET_MODE_SIZE (mode), BYTES_PER_RISCV_VECTOR)
1565 || GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
1566 && lra_in_progress)
1568 emit_insn (gen_mov_lra (mode, Pmode, dest, src));
1569 return true;
1572 if (known_ge (GET_MODE_SIZE (mode), BYTES_PER_RISCV_VECTOR)
1573 && GET_MODE_CLASS (mode) != MODE_VECTOR_BOOL)
1575 /* Need to force register if mem <- !reg. */
1576 if (MEM_P (dest) && !REG_P (src))
1577 *srcp = force_reg (mode, src);
1579 return false;
1583 if (register_operand (src, mode) && register_operand (dest, mode))
1585 emit_insn (gen_rtx_SET (dest, src));
1586 return true;
1589 unsigned insn_flags
1590 = GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL ? UNARY_MASK_OP : UNARY_OP;
1591 if (!register_operand (src, mode) && !register_operand (dest, mode))
1593 rtx tmp = gen_reg_rtx (mode);
1594 if (MEM_P (src))
1596 rtx ops[] = {tmp, src};
1597 emit_vlmax_insn (code_for_pred_mov (mode), insn_flags, ops);
1599 else
1600 emit_move_insn (tmp, src);
1601 src = tmp;
1604 if (satisfies_constraint_vu (src))
1605 return false;
1607 rtx ops[] = {dest, src};
1608 emit_vlmax_insn (code_for_pred_mov (mode), insn_flags, ops);
1609 return true;
1612 /* VTYPE information for machine_mode. */
1613 struct mode_vtype_group
1615 enum vlmul_type vlmul[NUM_MACHINE_MODES];
1616 uint8_t ratio[NUM_MACHINE_MODES];
1617 machine_mode subpart_mode[NUM_MACHINE_MODES];
1618 uint8_t nf[NUM_MACHINE_MODES];
1619 mode_vtype_group ()
1621 #define ENTRY(MODE, REQUIREMENT, VLMUL, RATIO) \
1622 vlmul[MODE##mode] = VLMUL; \
1623 ratio[MODE##mode] = RATIO;
1624 #define TUPLE_ENTRY(MODE, REQUIREMENT, SUBPART_MODE, NF, VLMUL, RATIO) \
1625 subpart_mode[MODE##mode] = SUBPART_MODE##mode; \
1626 nf[MODE##mode] = NF; \
1627 vlmul[MODE##mode] = VLMUL; \
1628 ratio[MODE##mode] = RATIO;
1629 #include "riscv-vector-switch.def"
1630 #undef ENTRY
1631 #undef TUPLE_ENTRY
1635 static mode_vtype_group mode_vtype_infos;
1637 /* Get vlmul field value by comparing LMUL with BYTES_PER_RISCV_VECTOR. */
1638 enum vlmul_type
1639 get_vlmul (machine_mode mode)
1641 /* For VLS modes, the vlmul should be dynamically
1642 calculated since we need to adjust VLMUL according
1643 to TARGET_MIN_VLEN. */
1644 if (riscv_v_ext_vls_mode_p (mode))
1646 int size = GET_MODE_BITSIZE (mode).to_constant ();
1647 int inner_size = GET_MODE_BITSIZE (GET_MODE_INNER (mode));
1648 if (size < TARGET_MIN_VLEN)
1650 int factor = TARGET_MIN_VLEN / size;
1651 if (inner_size == 8)
1652 factor = MIN (factor, 8);
1653 else if (inner_size == 16)
1654 factor = MIN (factor, 4);
1655 else if (inner_size == 32)
1656 factor = MIN (factor, 2);
1657 else if (inner_size == 64)
1658 factor = MIN (factor, 1);
1659 else
1660 gcc_unreachable ();
1662 switch (factor)
1664 case 1:
1665 return LMUL_1;
1666 case 2:
1667 return LMUL_F2;
1668 case 4:
1669 return LMUL_F4;
1670 case 8:
1671 return LMUL_F8;
1673 default:
1674 gcc_unreachable ();
1677 else
1679 int factor = size / TARGET_MIN_VLEN;
1680 switch (factor)
1682 case 1:
1683 return LMUL_1;
1684 case 2:
1685 return LMUL_2;
1686 case 4:
1687 return LMUL_4;
1688 case 8:
1689 return LMUL_8;
1691 default:
1692 gcc_unreachable ();
1696 return mode_vtype_infos.vlmul[mode];
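/* Illustrative examples (assuming TARGET_MIN_VLEN == 128): a 64-bit VLS mode
   with 32-bit elements gives factor = 128 / 64 = 2, clamped to MIN (2, 2),
   i.e. LMUL_F2; a 256-bit VLS mode gives factor = 256 / 128 = 2,
   i.e. LMUL_2.  */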
1699 /* Return the VLMAX rtx of vector mode MODE. */
1701 get_vlmax_rtx (machine_mode mode)
1703 gcc_assert (riscv_v_ext_vector_mode_p (mode));
1704 return gen_int_mode (GET_MODE_NUNITS (mode), Pmode);
1707 /* Return the NF value of the corresponding mode. */
1708 unsigned int
1709 get_nf (machine_mode mode)
1711 /* We don't allow non-tuple modes to go through this function. */
1712 gcc_assert (riscv_v_ext_tuple_mode_p (mode));
1713 return mode_vtype_infos.nf[mode];
1716 /* Return the subpart mode of the tuple mode. For RVVM2x2SImode,
1717 the subpart mode is RVVM2SImode. This will help to build
1718 array/struct type in builtins. */
1719 machine_mode
1720 get_subpart_mode (machine_mode mode)
1722 /* We don't allow non-tuple modes to go through this function. */
1723 gcc_assert (riscv_v_ext_tuple_mode_p (mode));
1724 return mode_vtype_infos.subpart_mode[mode];
1727 /* Get ratio according to machine mode. */
1728 unsigned int
1729 get_ratio (machine_mode mode)
1731 if (riscv_v_ext_vls_mode_p (mode))
1733 unsigned int sew = get_sew (mode);
1734 vlmul_type vlmul = get_vlmul (mode);
1735 switch (vlmul)
1737 case LMUL_1:
1738 return sew;
1739 case LMUL_2:
1740 return sew / 2;
1741 case LMUL_4:
1742 return sew / 4;
1743 case LMUL_8:
1744 return sew / 8;
1745 case LMUL_F8:
1746 return sew * 8;
1747 case LMUL_F4:
1748 return sew * 4;
1749 case LMUL_F2:
1750 return sew * 2;
1752 default:
1753 gcc_unreachable ();
1756 return mode_vtype_infos.ratio[mode];
1759 /* Get ta according to operand[tail_op_idx]. */
1761 get_ta (rtx ta)
1763 if (INTVAL (ta) == TAIL_ANY)
1764 return INVALID_ATTRIBUTE;
1765 return INTVAL (ta);
1768 /* Get ma according to operand[mask_op_idx]. */
1770 get_ma (rtx ma)
1772 if (INTVAL (ma) == MASK_ANY)
1773 return INVALID_ATTRIBUTE;
1774 return INTVAL (ma);
1777 /* Get prefer tail policy. */
1778 enum tail_policy
1779 get_prefer_tail_policy ()
1781 /* TODO: By default, we choose to use TAIL_ANY which allows the
1782 compiler to pick either agnostic or undisturbed. Maybe we
1783 will have a compile option like -mprefer=agnostic to set
1784 this value??? */
1785 return TAIL_ANY;
1788 /* Get prefer mask policy. */
1789 enum mask_policy
1790 get_prefer_mask_policy ()
1792 /* TODO: By default, we choose to use MASK_ANY which allows the
1793 compiler to pick either agnostic or undisturbed. Maybe we
1794 will have a compile option like -mprefer=agnostic to set
1795 this value??? */
1796 return MASK_ANY;
1799 /* Get avl_type rtx. */
1801 get_avl_type_rtx (enum avl_type type)
1803 return gen_int_mode (type, Pmode);
1806 /* Return the appropriate mask mode for MODE. */
1808 machine_mode
1809 get_mask_mode (machine_mode mode)
1811 poly_int64 nunits = GET_MODE_NUNITS (mode);
1812 if (riscv_v_ext_tuple_mode_p (mode))
1814 unsigned int nf = get_nf (mode);
1815 nunits = exact_div (nunits, nf);
1817 return get_vector_mode (BImode, nunits).require ();
1820 /* Return the appropriate M1 mode for MODE. */
1822 static opt_machine_mode
1823 get_m1_mode (machine_mode mode)
1825 scalar_mode smode = GET_MODE_INNER (mode);
1826 unsigned int bytes = GET_MODE_SIZE (smode);
1827 poly_uint64 m1_nunits = exact_div (BYTES_PER_RISCV_VECTOR, bytes);
1828 return get_vector_mode (smode, m1_nunits);
1831 /* Return the RVV vector mode that has NUNITS elements of mode INNER_MODE.
1832 This function is not only used by builtins, but also will be used by
1833 auto-vectorization in the future. */
1834 opt_machine_mode
1835 get_vector_mode (scalar_mode inner_mode, poly_uint64 nunits)
1837 enum mode_class mclass;
1838 if (inner_mode == E_BImode)
1839 mclass = MODE_VECTOR_BOOL;
1840 else if (FLOAT_MODE_P (inner_mode))
1841 mclass = MODE_VECTOR_FLOAT;
1842 else
1843 mclass = MODE_VECTOR_INT;
1844 machine_mode mode;
1845 FOR_EACH_MODE_IN_CLASS (mode, mclass)
1846 if (inner_mode == GET_MODE_INNER (mode)
1847 && known_eq (nunits, GET_MODE_NUNITS (mode))
1848 && (riscv_v_ext_vector_mode_p (mode)
1849 || riscv_v_ext_vls_mode_p (mode)))
1850 return mode;
1851 return opt_machine_mode ();
1854 /* Return the RVV tuple mode if we can find the legal tuple mode for the
1855 corresponding subpart mode and NF. */
1856 opt_machine_mode
1857 get_tuple_mode (machine_mode subpart_mode, unsigned int nf)
1859 poly_uint64 nunits = GET_MODE_NUNITS (subpart_mode) * nf;
1860 scalar_mode inner_mode = GET_MODE_INNER (subpart_mode);
1861 enum mode_class mclass = GET_MODE_CLASS (subpart_mode);
1862 machine_mode mode;
1863 FOR_EACH_MODE_IN_CLASS (mode, mclass)
1864 if (inner_mode == GET_MODE_INNER (mode)
1865 && known_eq (nunits, GET_MODE_NUNITS (mode))
1866 && riscv_v_ext_tuple_mode_p (mode)
1867 && get_subpart_mode (mode) == subpart_mode)
1868 return mode;
1869 return opt_machine_mode ();
1872 bool
1873 simm5_p (rtx x)
1875 if (!CONST_INT_P (x))
1876 return false;
1877 return IN_RANGE (INTVAL (x), -16, 15);
1880 bool
1881 neg_simm5_p (rtx x)
1883 if (!CONST_INT_P (x))
1884 return false;
1885 return IN_RANGE (INTVAL (x), -15, 16);
1888 bool
1889 has_vi_variant_p (rtx_code code, rtx x)
1891 switch (code)
1893 case PLUS:
1894 case AND:
1895 case IOR:
1896 case XOR:
1897 case SS_PLUS:
1898 case US_PLUS:
1899 case EQ:
1900 case NE:
1901 case LE:
1902 case LEU:
1903 case GT:
1904 case GTU:
1905 return simm5_p (x);
1907 case LT:
1908 case LTU:
1909 case GE:
1910 case GEU:
1911 case MINUS:
1912 case SS_MINUS:
1913 return neg_simm5_p (x);
1915 default:
1916 return false;
1920 bool
1921 sew64_scalar_helper (rtx *operands, rtx *scalar_op, rtx vl,
1922 machine_mode vector_mode, bool has_vi_variant_p,
1923 void (*emit_vector_func) (rtx *, rtx), enum avl_type type)
1925 machine_mode scalar_mode = GET_MODE_INNER (vector_mode);
1926 if (has_vi_variant_p)
1928 *scalar_op = force_reg (scalar_mode, *scalar_op);
1929 return false;
1932 if (TARGET_64BIT)
1934 if (!rtx_equal_p (*scalar_op, const0_rtx))
1935 *scalar_op = force_reg (scalar_mode, *scalar_op);
1936 return false;
1939 if (immediate_operand (*scalar_op, Pmode))
1941 if (!rtx_equal_p (*scalar_op, const0_rtx))
1942 *scalar_op = force_reg (Pmode, *scalar_op);
1944 *scalar_op = gen_rtx_SIGN_EXTEND (scalar_mode, *scalar_op);
1945 return false;
1948 if (CONST_INT_P (*scalar_op))
1950 if (maybe_gt (GET_MODE_SIZE (scalar_mode), GET_MODE_SIZE (Pmode)))
1951 *scalar_op = force_const_mem (scalar_mode, *scalar_op);
1952 else
1953 *scalar_op = force_reg (scalar_mode, *scalar_op);
1956 rtx tmp = gen_reg_rtx (vector_mode);
1957 rtx ops[] = {tmp, *scalar_op};
1958 if (type == VLMAX)
1959 emit_vlmax_insn (code_for_pred_broadcast (vector_mode), UNARY_OP, ops);
1960 else
1961 emit_nonvlmax_insn (code_for_pred_broadcast (vector_mode), UNARY_OP, ops,
1962 vl);
1963 emit_vector_func (operands, tmp);
1965 return true;
1968 /* Get the { 1, 0, 0, ..., 0 } mask, i.e. only element 0 is active. */
1970 gen_scalar_move_mask (machine_mode mode)
1972 rtx_vector_builder builder (mode, 1, 2);
1973 builder.quick_push (const1_rtx);
1974 builder.quick_push (const0_rtx);
1975 return builder.build ();
1978 static unsigned
1979 compute_vlmax (unsigned vector_bits, unsigned elt_size, unsigned min_size)
1981 // Original equation:
1982 // VLMAX = (VectorBits / EltSize) * LMUL
1983 // where LMUL = MinSize / TARGET_MIN_VLEN
1984 // The following equations have been reordered to prevent loss of precision
1985 // when calculating fractional LMUL.
1986 return ((vector_bits / elt_size) * min_size) / TARGET_MIN_VLEN;
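// Worked example (illustrative, assuming TARGET_MIN_VLEN = 128): for
// vector_bits = 256, elt_size = 32 and min_size = 256 (LMUL_2), we get
// ((256 / 32) * 256) / 128 = 16, matching VLMAX = (256 / 32) * 2.
// Keeping the multiplication before the division also avoids truncating a
// fractional LMUL to zero in integer arithmetic, e.g. a min_size of 64
// (LMUL = 1/2) would otherwise compute 64 / 128 = 0.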
1989 static unsigned
1990 get_unknown_min_value (machine_mode mode)
1992 enum vlmul_type vlmul = get_vlmul (mode);
1993 switch (vlmul)
1995 case LMUL_1:
1996 return TARGET_MIN_VLEN;
1997 case LMUL_2:
1998 return TARGET_MIN_VLEN * 2;
1999 case LMUL_4:
2000 return TARGET_MIN_VLEN * 4;
2001 case LMUL_8:
2002 return TARGET_MIN_VLEN * 8;
2003 default:
2004 gcc_unreachable ();
2008 static rtx
2009 force_vector_length_operand (rtx vl)
2011 if (CONST_INT_P (vl) && !satisfies_constraint_K (vl))
2012 return force_reg (Pmode, vl);
2013 return vl;
2017 gen_no_side_effects_vsetvl_rtx (machine_mode vmode, rtx vl, rtx avl)
2019 unsigned int sew = get_sew (vmode);
2020 rtx tail_policy = gen_int_mode (get_prefer_tail_policy (), Pmode);
2021 rtx mask_policy = gen_int_mode (get_prefer_mask_policy (), Pmode);
2022 return gen_vsetvl_no_side_effects (Pmode, vl, avl, gen_int_mode (sew, Pmode),
2023 gen_int_mode (get_vlmul (vmode), Pmode),
2024 tail_policy, mask_policy);
2027 /* Get the VL * 2 rtx. */
2028 static rtx
2029 get_vl_x2_rtx (rtx avl, machine_mode mode, machine_mode demote_mode)
2031 rtx i32vl = NULL_RTX;
2032 if (CONST_INT_P (avl))
2034 unsigned elt_size = GET_MODE_BITSIZE (GET_MODE_INNER (mode));
2035 unsigned min_size = get_unknown_min_value (mode);
2036 unsigned vlen_max = RVV_65536;
2037 unsigned vlmax_max = compute_vlmax (vlen_max, elt_size, min_size);
2038 unsigned vlen_min = TARGET_MIN_VLEN;
2039 unsigned vlmax_min = compute_vlmax (vlen_min, elt_size, min_size);
2041 unsigned HOST_WIDE_INT avl_int = INTVAL (avl);
2042 if (avl_int <= vlmax_min)
2043 i32vl = gen_int_mode (2 * avl_int, Pmode);
2044 else if (avl_int >= 2 * vlmax_max)
2046 // Just set i32vl to VLMAX in this situation
2047 i32vl = gen_reg_rtx (Pmode);
2048 emit_insn (
2049 gen_no_side_effects_vsetvl_rtx (demote_mode, i32vl, RVV_VLMAX));
2051 else
2053 // For an AVL between (MinVLMAX, 2 * MaxVLMAX), the actual working vl
2054 // depends on the hardware implementation,
2055 // so let the following code handle it.
2058 if (!i32vl)
2060 // Use a vsetvli instruction to get the actually used length, which is
2061 // related to the hardware implementation.
2062 rtx i64vl = gen_reg_rtx (Pmode);
2063 emit_insn (
2064 gen_no_side_effects_vsetvl_rtx (mode, i64vl, force_reg (Pmode, avl)));
2065 // Scale by 2 to get the 32-bit length.
2066 i32vl = gen_reg_rtx (Pmode);
2067 emit_insn (
2068 gen_rtx_SET (i32vl, gen_rtx_ASHIFT (Pmode, i64vl, const1_rtx)));
2071 return force_vector_length_operand (i32vl);
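/* Rough illustration of the cases above, assuming a SEW64 LMUL_1 mode with
   TARGET_MIN_VLEN = 128 and a maximum VLEN of 65536 bits (RVV_65536):
   vlmax_min = 2 and vlmax_max = 1024.
   - AVL = 2 (<= vlmax_min): i32vl is simply 2 * 2 = 4.
   - AVL = 4096 (>= 2 * vlmax_max): i32vl is the VLMAX of the demoted
     SEW32 mode, obtained with a vsetvl.
   - Anything in between: read the actually used vl with a vsetvl on the
     SEW64 mode and double it with a shift.  */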
2074 bool
2075 slide1_sew64_helper (int unspec, machine_mode mode, machine_mode demote_mode,
2076 machine_mode demote_mask_mode, rtx *ops)
2078 rtx scalar_op = ops[4];
2079 rtx avl = ops[5];
2080 machine_mode scalar_mode = GET_MODE_INNER (mode);
2081 if (rtx_equal_p (scalar_op, const0_rtx))
2083 ops[5] = force_vector_length_operand (ops[5]);
2084 return false;
2087 if (TARGET_64BIT)
2089 ops[4] = force_reg (scalar_mode, scalar_op);
2090 ops[5] = force_vector_length_operand (ops[5]);
2091 return false;
2094 if (immediate_operand (scalar_op, Pmode))
2096 ops[4] = gen_rtx_SIGN_EXTEND (scalar_mode, force_reg (Pmode, scalar_op));
2097 ops[5] = force_vector_length_operand (ops[5]);
2098 return false;
2101 if (CONST_INT_P (scalar_op))
2102 scalar_op = force_reg (scalar_mode, scalar_op);
2104 rtx vl_x2 = get_vl_x2_rtx (avl, mode, demote_mode);
2106 rtx demote_scalar_op1, demote_scalar_op2;
2107 if (unspec == UNSPEC_VSLIDE1UP)
2109 demote_scalar_op1 = gen_highpart (Pmode, scalar_op);
2110 demote_scalar_op2 = gen_lowpart (Pmode, scalar_op);
2112 else
2114 demote_scalar_op1 = gen_lowpart (Pmode, scalar_op);
2115 demote_scalar_op2 = gen_highpart (Pmode, scalar_op);
2118 rtx temp = gen_reg_rtx (demote_mode);
2119 rtx ta = gen_int_mode (get_prefer_tail_policy (), Pmode);
2120 rtx ma = gen_int_mode (get_prefer_mask_policy (), Pmode);
2121 rtx merge = RVV_VUNDEF (demote_mode);
2122 /* Handle vslide1<ud>_tu. */
2123 if (register_operand (ops[2], mode)
2124 && rtx_equal_p (ops[1], CONSTM1_RTX (GET_MODE (ops[1]))))
2126 merge = gen_lowpart (demote_mode, ops[2]);
2127 ta = ops[6];
2128 ma = ops[7];
2131 emit_insn (gen_pred_slide (unspec, demote_mode, temp,
2132 CONSTM1_RTX (demote_mask_mode), merge,
2133 gen_lowpart (demote_mode, ops[3]),
2134 demote_scalar_op1, vl_x2, ta, ma, ops[8]));
2135 emit_insn (gen_pred_slide (unspec, demote_mode,
2136 gen_lowpart (demote_mode, ops[0]),
2137 CONSTM1_RTX (demote_mask_mode), merge, temp,
2138 demote_scalar_op2, vl_x2, ta, ma, ops[8]));
2140 if (!rtx_equal_p (ops[1], CONSTM1_RTX (GET_MODE (ops[1])))
2141 && !rtx_equal_p (ops[2], RVV_VUNDEF (GET_MODE (ops[2]))))
2142 emit_insn (gen_pred_merge (mode, ops[0], ops[2], ops[2], ops[0], ops[1],
2143 force_vector_length_operand (ops[5]), ops[6],
2144 ops[8]));
2145 return true;
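/* Rough sketch of what the helper above emits on !TARGET_64BIT for a
   64-bit vslide1up (operand names are only illustrative):

     # vl is doubled on the demoted SEW32 type (see get_vl_x2_rtx).
     vslide1up.vx  v_tmp, v_src, hi32(scalar)
     vslide1up.vx  v_dst, v_tmp, lo32(scalar)

   i.e. the 64-bit scalar is inserted as two 32-bit halves; for
   vslide1down the low half goes first instead.  */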
2149 gen_avl_for_scalar_move (rtx avl)
2151 /* The AVL for a scalar move behaves differently for 0 and for values larger than 0. */
2152 if (CONST_INT_P (avl))
2154 /* So we could just set AVL to 1 for any constant other than 0. */
2155 if (rtx_equal_p (avl, const0_rtx))
2156 return const0_rtx;
2157 else
2158 return const1_rtx;
2160 else
2162 /* For a non-constant value, we set any nonzero value to 1 by
2163 `sgtu new_avl,input_avl,zero` + `vsetvli`. */
2164 rtx tmp = gen_reg_rtx (Pmode);
2165 emit_insn (
2166 gen_rtx_SET (tmp, gen_rtx_fmt_ee (GTU, Pmode, avl, const0_rtx)));
2167 return tmp;
2171 /* Expand data movement for tuple modes. */
2172 void
2173 expand_tuple_move (rtx *ops)
2175 unsigned int i;
2176 machine_mode tuple_mode = GET_MODE (ops[0]);
2177 machine_mode subpart_mode = get_subpart_mode (tuple_mode);
2178 poly_int64 subpart_size = GET_MODE_SIZE (subpart_mode);
2179 unsigned int nf = get_nf (tuple_mode);
2180 bool fractional_p = known_lt (subpart_size, BYTES_PER_RISCV_VECTOR);
2182 if (REG_P (ops[0]) && CONST_VECTOR_P (ops[1]))
2184 rtx val;
2185 gcc_assert (can_create_pseudo_p ()
2186 && const_vec_duplicate_p (ops[1], &val));
2187 for (i = 0; i < nf; ++i)
2189 poly_int64 offset = i * subpart_size;
2190 rtx subreg
2191 = simplify_gen_subreg (subpart_mode, ops[0], tuple_mode, offset);
2192 rtx dup = gen_const_vec_duplicate (subpart_mode, val);
2193 emit_move_insn (subreg, dup);
2196 else if (REG_P (ops[0]) && REG_P (ops[1]))
2198 for (i = 0; i < nf; ++i)
2200 int index = i;
2202 /* Take NF = 2 and LMUL = 1 for example:
2204 - move v8 to v9:
2205 vmv1r v10,v9
2206 vmv1r v9,v8
2208 - move v8 to v7:
2209 vmv1r v7,v8
2210 vmv1r v8,v9 */
2211 if (REGNO (ops[0]) > REGNO (ops[1]))
2212 index = nf - 1 - i;
2213 poly_int64 offset = index * subpart_size;
2214 rtx dst_subreg
2215 = simplify_gen_subreg (subpart_mode, ops[0], tuple_mode, offset);
2216 rtx src_subreg
2217 = simplify_gen_subreg (subpart_mode, ops[1], tuple_mode, offset);
2218 emit_insn (gen_rtx_SET (dst_subreg, src_subreg));
2221 else
2223 /* Expand tuple memory data movement. */
2224 gcc_assert (MEM_P (ops[0]) || MEM_P (ops[1]));
2225 rtx offset = gen_int_mode (subpart_size, Pmode);
2226 if (!subpart_size.is_constant ())
2228 emit_move_insn (ops[2], gen_int_mode (BYTES_PER_RISCV_VECTOR, Pmode));
2229 if (fractional_p)
2231 unsigned int factor
2232 = exact_div (BYTES_PER_RISCV_VECTOR, subpart_size)
2233 .to_constant ();
2234 rtx pat
2235 = gen_rtx_ASHIFTRT (Pmode, ops[2],
2236 gen_int_mode (exact_log2 (factor), Pmode));
2237 emit_insn (gen_rtx_SET (ops[2], pat));
2240 if (known_gt (subpart_size, BYTES_PER_RISCV_VECTOR))
2242 unsigned int factor
2243 = exact_div (subpart_size, BYTES_PER_RISCV_VECTOR)
2244 .to_constant ();
2245 rtx pat
2246 = gen_rtx_ASHIFT (Pmode, ops[2],
2247 gen_int_mode (exact_log2 (factor), Pmode));
2248 emit_insn (gen_rtx_SET (ops[2], pat));
2250 offset = ops[2];
2253 /* Non-fractional LMUL has whole register moves that don't require a
2254 vsetvl for VLMAX. */
2255 if (fractional_p)
2256 emit_vlmax_vsetvl (subpart_mode, ops[4]);
2257 if (MEM_P (ops[1]))
2259 /* Load operations. */
2260 emit_move_insn (ops[3], XEXP (ops[1], 0));
2261 for (i = 0; i < nf; i++)
2263 rtx subreg = simplify_gen_subreg (subpart_mode, ops[0],
2264 tuple_mode, i * subpart_size);
2265 if (i != 0)
2267 rtx new_addr = gen_rtx_PLUS (Pmode, ops[3], offset);
2268 emit_insn (gen_rtx_SET (ops[3], new_addr));
2270 rtx mem = gen_rtx_MEM (subpart_mode, ops[3]);
2272 if (fractional_p)
2274 rtx operands[] = {subreg, mem};
2275 emit_vlmax_insn_lra (code_for_pred_mov (subpart_mode),
2276 UNARY_OP, operands, ops[4]);
2278 else
2279 emit_move_insn (subreg, mem);
2282 else
2284 /* Store operations. */
2285 emit_move_insn (ops[3], XEXP (ops[0], 0));
2286 for (i = 0; i < nf; i++)
2288 rtx subreg = simplify_gen_subreg (subpart_mode, ops[1],
2289 tuple_mode, i * subpart_size);
2290 if (i != 0)
2292 rtx new_addr = gen_rtx_PLUS (Pmode, ops[3], offset);
2293 emit_insn (gen_rtx_SET (ops[3], new_addr));
2295 rtx mem = gen_rtx_MEM (subpart_mode, ops[3]);
2297 if (fractional_p)
2299 rtx operands[] = {mem, subreg};
2300 emit_vlmax_insn_lra (code_for_pred_mov (subpart_mode),
2301 UNARY_OP, operands, ops[4]);
2303 else
2304 emit_move_insn (mem, subreg);
2310 /* Return the vectorization machine mode for RVV according to LMUL. */
2311 machine_mode
2312 preferred_simd_mode (scalar_mode mode)
2314 if (autovec_use_vlmax_p ())
2316 /* We use LMUL = 1 as the base byte size, which is BYTES_PER_RISCV_VECTOR,
2317 and riscv_autovec_lmul as the multiply factor to calculate the NUNITS
2318 used to get the auto-vectorization mode. */
2319 poly_uint64 nunits;
2320 poly_uint64 vector_size = BYTES_PER_RISCV_VECTOR * TARGET_MAX_LMUL;
2321 poly_uint64 scalar_size = GET_MODE_SIZE (mode);
2322 /* Disable vectorization when we can't find a RVV mode for it.
2323 E.g. -march=rv64gc_zve32x doesn't have a vector mode to vectorize
2324 a double (DFmode) type. */
2325 if (!multiple_p (vector_size, scalar_size, &nunits))
2326 return word_mode;
2327 machine_mode rvv_mode;
2328 if (get_vector_mode (mode, nunits).exists (&rvv_mode))
2329 return rvv_mode;
2331 return word_mode;
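/* Illustrative example: on a VLA target where BYTES_PER_RISCV_VECTOR is
   the poly value (16, 16) and TARGET_MAX_LMUL is 2, vector_size is
   (32, 32) bytes, so an SImode element (4 bytes) gives nunits = (8, 8)
   and the corresponding RVV mode is returned if it exists; otherwise we
   fall back to word_mode as described above.  */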
2334 /* Subroutine of expand_vec_init.
2335 Works as follows:
2336 (a) Initialize TARGET by broadcasting element 0 of BUILDER.
2337 (b) Skip the leading elements of BUILDER that are the same as
2338 element 0.
2339 (c) Insert the remaining elements into TARGET using vslide1down. */
2341 static void
2342 expand_vector_init_insert_elems (rtx target, const rvv_builder &builder,
2343 int nelts_reqd)
2345 machine_mode mode = GET_MODE (target);
2346 rtx dup = expand_vector_broadcast (mode, builder.elt (0));
2347 emit_move_insn (target, dup);
2348 int ndups = builder.count_dups (0, nelts_reqd - 1, 1);
2349 for (int i = ndups; i < nelts_reqd; i++)
2351 unsigned int unspec
2352 = FLOAT_MODE_P (mode) ? UNSPEC_VFSLIDE1DOWN : UNSPEC_VSLIDE1DOWN;
2353 insn_code icode = code_for_pred_slide (unspec, mode);
2354 rtx ops[] = {target, target, builder.elt (i)};
2355 emit_vlmax_insn (icode, BINARY_OP, ops);
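/* Worked example (illustrative): for TARGET = { a, b, c, d } the code above
   emits roughly

     vmv.v.x        v_dst, a      # broadcast element 0
     vslide1down.vx v_dst, v_dst, b
     vslide1down.vx v_dst, v_dst, c
     vslide1down.vx v_dst, v_dst, d

   i.e. { a, a, a, a } -> { a, a, a, b } -> { a, a, b, c } -> { a, b, c, d }.
   For floating-point modes the vfslide1down.vf form is used instead.  */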
2359 /* Use merge approach to initialize the vector with repeating sequence.
2360 v = {a, b, a, b, a, b, a, b}.
2362 v = broadcast (a).
2363 mask = 0b01010101....
2364 v = merge (v, b, mask)
2366 static void
2367 expand_vector_init_merge_repeating_sequence (rtx target,
2368 const rvv_builder &builder)
2370 /* We can't use BIT mode (BI) directly to generate mask = 0b01010...
2371 since we don't have such an instruction in RVV.
2372 Instead, we should use an INT mode (QI/HI/SI/DI) with an integer move
2373 instruction to generate the mask data we want. */
2374 machine_mode mask_bit_mode = get_mask_mode (builder.mode ());
2375 machine_mode mask_int_mode
2376 = get_repeating_sequence_dup_machine_mode (builder, mask_bit_mode);
2377 uint64_t full_nelts = builder.full_nelts ().to_constant ();
2379 /* Step 1: Broadcast the first pattern. */
2380 rtx ops[] = {target, force_reg (builder.inner_mode (), builder.elt (0))};
2381 emit_vlmax_insn (code_for_pred_broadcast (builder.mode ()),
2382 UNARY_OP, ops);
2383 /* Step 2: Merge the rest iteration of pattern. */
2384 for (unsigned int i = 1; i < builder.npatterns (); i++)
2386 /* Step 2-1: Generate mask register v0 for each merge. */
2387 rtx merge_mask
2388 = builder.get_merge_scalar_mask (i, GET_MODE_INNER (mask_int_mode));
2389 rtx mask = gen_reg_rtx (mask_bit_mode);
2390 rtx dup = gen_reg_rtx (mask_int_mode);
2392 if (full_nelts <= builder.inner_bits_size ()) /* vmv.s.x. */
2394 rtx ops[] = {dup, merge_mask};
2395 emit_nonvlmax_insn (code_for_pred_broadcast (GET_MODE (dup)),
2396 SCALAR_MOVE_OP, ops, CONST1_RTX (Pmode));
2398 else /* vmv.v.x. */
2400 rtx ops[] = {dup,
2401 force_reg (GET_MODE_INNER (mask_int_mode), merge_mask)};
2402 rtx vl = gen_int_mode (CEIL (full_nelts, builder.inner_bits_size ()),
2403 Pmode);
2404 emit_nonvlmax_insn (code_for_pred_broadcast (mask_int_mode), UNARY_OP,
2405 ops, vl);
2408 emit_move_insn (mask, gen_lowpart (mask_bit_mode, dup));
2410 /* Step 2-2: Merge pattern according to the mask. */
2411 rtx ops[] = {target, target, builder.elt (i), mask};
2412 emit_vlmax_insn (code_for_pred_merge_scalar (GET_MODE (target)),
2413 MERGE_OP, ops);
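/* Rough sketch of the sequence built above for v = { a, b, a, b, ... } with
   8 elements (register names are only illustrative):

     vmv.v.x    v_dst, a          # step 1: broadcast the first pattern
     li         t0, 0b10101010    # bit set where pattern 1 (b) is wanted
     vmv.s.x    v_tmp, t0         # materialize the mask bits ...
                                  # ... then reinterpret v_tmp as v0
     vmerge.vxm v_dst, v_dst, b, v0

   With more than two patterns, one mask/merge pair is emitted per
   remaining pattern.  */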
2417 /* Use slideup approach to combine the vectors.
2418 v = {a, a, a, a, b, b, b, b}
2420 First:
2421 v1 = {a, a, a, a, a, a, a, a}
2422 v2 = {b, b, b, b, b, b, b, b}
2423 v = slideup (v1, v2, nelt / 2)
2425 static void
2426 expand_vector_init_slideup_combine_sequence (rtx target,
2427 const rvv_builder &builder)
2429 machine_mode mode = GET_MODE (target);
2430 int nelts = builder.full_nelts ().to_constant ();
2431 rtx first_elt = builder.elt (0);
2432 rtx last_elt = builder.elt (nelts - 1);
2433 rtx low = expand_vector_broadcast (mode, first_elt);
2434 rtx high = expand_vector_broadcast (mode, last_elt);
2435 insn_code icode = code_for_pred_slide (UNSPEC_VSLIDEUP, mode);
2436 rtx ops[] = {target, low, high, gen_int_mode (nelts / 2, Pmode)};
2437 emit_vlmax_insn (icode, SLIDEUP_OP_MERGE, ops);
2440 /* Use merge approach to merge a scalar into a vector.
2441 v = {a, a, a, a, a, a, b, b}
2443 v1 = {a, a, a, a, a, a, a, a}
2444 scalar = b
2445 mask = {0, 0, 0, 0, 0, 0, 1, 1}
2447 static void
2448 expand_vector_init_merge_combine_sequence (rtx target,
2449 const rvv_builder &builder)
2451 machine_mode mode = GET_MODE (target);
2452 machine_mode imode = builder.int_mode ();
2453 machine_mode mmode = builder.mask_mode ();
2454 int nelts = builder.full_nelts ().to_constant ();
2455 int leading_ndups = builder.count_dups (0, nelts - 1, 1);
2456 if ((leading_ndups > 255 && GET_MODE_INNER (imode) == QImode)
2457 || riscv_get_v_regno_alignment (imode) > 1)
2458 imode = get_vector_mode (HImode, nelts).require ();
2460 /* Generate vid = { 0, 1, 2, ..., n }. */
2461 rtx vid = gen_reg_rtx (imode);
2462 expand_vec_series (vid, const0_rtx, const1_rtx);
2464 /* Generate mask. */
2465 rtx mask = gen_reg_rtx (mmode);
2466 insn_code icode = code_for_pred_cmp_scalar (imode);
2467 rtx index = gen_int_mode (leading_ndups - 1, builder.inner_int_mode ());
2468 rtx dup_rtx = gen_rtx_VEC_DUPLICATE (imode, index);
2469 /* vmsgtu.vi/vmsgtu.vx. */
2470 rtx cmp = gen_rtx_fmt_ee (GTU, mmode, vid, dup_rtx);
2471 rtx sel = builder.elt (nelts - 1);
2472 rtx mask_ops[] = {mask, cmp, vid, index};
2473 emit_vlmax_insn (icode, COMPARE_OP, mask_ops);
2475 /* Duplicate the first elements. */
2476 rtx dup = expand_vector_broadcast (mode, builder.elt (0));
2477 /* Merge scalar into vector according to mask. */
2478 rtx merge_ops[] = {target, dup, sel, mask};
2479 icode = code_for_pred_merge_scalar (mode);
2480 emit_vlmax_insn (icode, MERGE_OP, merge_ops);
2483 /* Subroutine of expand_vec_init to handle the case
2484 when all trailing elements of the builder are the same.
2485 This works as follows:
2486 (a) Broadcast the last vector element into TARGET.
2487 (b) Insert the remaining elements into TARGET using vslide1up.
2489 ??? The heuristic used is to do the above if the number of identical
2490 trailing elements is greater than leading_ndups, loosely based on
2491 the heuristic from mostly_zeros_p. May need fine-tuning. */
2493 static bool
2494 expand_vector_init_trailing_same_elem (rtx target,
2495 const rtx_vector_builder &builder,
2496 int nelts_reqd)
2498 int leading_ndups = builder.count_dups (0, nelts_reqd - 1, 1);
2499 int trailing_ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
2500 machine_mode mode = GET_MODE (target);
2502 if (trailing_ndups > leading_ndups)
2504 rtx dup = expand_vector_broadcast (mode, builder.elt (nelts_reqd - 1));
2505 for (int i = nelts_reqd - trailing_ndups - 1; i >= 0; i--)
2507 unsigned int unspec
2508 = FLOAT_MODE_P (mode) ? UNSPEC_VFSLIDE1UP : UNSPEC_VSLIDE1UP;
2509 insn_code icode = code_for_pred_slide (unspec, mode);
2510 rtx tmp = gen_reg_rtx (mode);
2511 rtx ops[] = {tmp, dup, builder.elt (i)};
2512 emit_vlmax_insn (icode, BINARY_OP, ops);
2513 /* slide1up needs source and dest to be different registers. */
2514 dup = tmp;
2517 emit_move_insn (target, dup);
2518 return true;
2521 return false;
2524 /* Initialize register TARGET from the elements in PARALLEL rtx VALS. */
2526 void
2527 expand_vec_init (rtx target, rtx vals)
2529 machine_mode mode = GET_MODE (target);
2530 int nelts = XVECLEN (vals, 0);
2532 rvv_builder v (mode, nelts, 1);
2533 for (int i = 0; i < nelts; i++)
2534 v.quick_push (XVECEXP (vals, 0, i));
2535 v.finalize ();
2537 if (nelts > 3)
2539 /* Case 1: Convert v = { a, b, a, b } into v = { ab, ab }. */
2540 if (v.can_duplicate_repeating_sequence_p ())
2542 rtx ele = v.get_merged_repeating_sequence ();
2543 rtx dup = expand_vector_broadcast (v.new_mode (), ele);
2544 emit_move_insn (target, gen_lowpart (mode, dup));
2545 return;
2548 /* Case 2: Optimize repeating sequence cases that Case 1 cannot
2549 handle and where it is profitable. For example:
2550 ELEMENT BITSIZE = 64.
2551 v = {a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b}.
2552 We can't find a vector mode for "ab" which will be combined into
2553 128-bit element to duplicate. */
2554 if (v.repeating_sequence_use_merge_profitable_p ())
2556 expand_vector_init_merge_repeating_sequence (target, v);
2557 return;
2560 /* Case 3: Optimize combine sequence.
2561 E.g. v = {a, a, a, a, a, a, a, a, b, b, b, b, b, b, b, b}.
2562 We can combine:
2563 v1 = {a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a}.
2565 v2 = {b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b}.
2566 by slideup. */
2567 if (v.combine_sequence_use_slideup_profitable_p ())
2569 expand_vector_init_slideup_combine_sequence (target, v);
2570 return;
2573 /* Case 4: Optimize combine sequence.
2574 E.g. v = {a, a, a, a, a, a, a, a, a, a, a, b, b, b, b, b}.
2576 Generate vector:
2577 v = {a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a}.
2579 Generate mask:
2580 mask = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1}.
2582 Merge b into v by mask:
2583 v = {a, a, a, a, a, a, a, a, a, a, a, b, b, b, b, b}. */
2584 if (v.combine_sequence_use_merge_profitable_p ())
2586 expand_vector_init_merge_combine_sequence (target, v);
2587 return;
2591 /* Optimize a sequence with identical trailing elements:
2592 v = {y, y2, y3, y4, y5, x, x, x, x, x, x, x, x, x, x, x}; */
2593 if (!expand_vector_init_trailing_same_elem (target, v, nelts))
2594 /* Handle the common situation by vslide1down. This function can handle any
2595 vec_init<mode> situation. Only the cases that are not optimized above
2596 will fall through to here. */
2597 expand_vector_init_insert_elems (target, v, nelts);
2600 /* Get insn code for corresponding comparison. */
2602 static insn_code
2603 get_cmp_insn_code (rtx_code code, machine_mode mode)
2605 insn_code icode;
2606 switch (code)
2608 case EQ:
2609 case NE:
2610 case LE:
2611 case LEU:
2612 case GT:
2613 case GTU:
2614 case LTGT:
2615 icode = code_for_pred_cmp (mode);
2616 break;
2617 case LT:
2618 case LTU:
2619 case GE:
2620 case GEU:
2621 if (FLOAT_MODE_P (mode))
2622 icode = code_for_pred_cmp (mode);
2623 else
2624 icode = code_for_pred_ltge (mode);
2625 break;
2626 default:
2627 gcc_unreachable ();
2629 return icode;
2632 /* This hook gives the vectorizer more vector mode options. We want it to not
2633 only try modes with the maximum number of units a full vector can hold but
2634 for example also half the number of units for a smaller element size.
2635 Such vectors can be promoted to a full vector of widened elements
2636 (still with the same number of elements, essentially vectorizing at a
2637 fixed number of units rather than a fixed number of bytes). */
2638 unsigned int
2639 autovectorize_vector_modes (vector_modes *modes, bool)
2641 if (autovec_use_vlmax_p ())
2643 poly_uint64 full_size = BYTES_PER_RISCV_VECTOR * TARGET_MAX_LMUL;
2645 /* Start with a RVV<LMUL>QImode where LMUL is the number of units that
2646 fit a whole vector.
2647 Then try LMUL = nunits / 2, nunits / 4 and nunits / 8 which
2648 is guided by the extensions we have available (vf2, vf4 and vf8).
2650 - full_size: Try using full vectors for all element types.
2651 - full_size / 2:
2652 Try using 16-bit containers for 8-bit elements and full vectors
2653 for wider elements.
2654 - full_size / 4:
2655 Try using 32-bit containers for 8-bit and 16-bit elements and
2656 full vectors for wider elements.
2657 - full_size / 8:
2658 Try using 64-bit containers for all element types. */
2659 static const int rvv_factors[] = {1, 2, 4, 8, 16, 32, 64};
2660 for (unsigned int i = 0; i < sizeof (rvv_factors) / sizeof (int); i++)
2662 poly_uint64 units;
2663 machine_mode mode;
2664 if (can_div_trunc_p (full_size, rvv_factors[i], &units)
2665 && get_vector_mode (QImode, units).exists (&mode))
2666 modes->safe_push (mode);
2669 /* Push all VLSmodes according to TARGET_MIN_VLEN. */
2670 unsigned int i = 0;
2671 unsigned int base_size = TARGET_MIN_VLEN * TARGET_MAX_LMUL / 8;
2672 unsigned int size = base_size;
2673 machine_mode mode;
2674 while (size > 0 && get_vector_mode (QImode, size).exists (&mode))
2676 if (vls_mode_valid_p (mode))
2677 modes->safe_push (mode);
2679 i++;
2680 size = base_size / (1U << i);
2682 /* Enable LOOP_VINFO comparison in COST model. */
2683 return VECT_COMPARE_COSTS;
2686 /* Return true if we can find the related MODE according to default LMUL. */
2687 static bool
2688 can_find_related_mode_p (machine_mode vector_mode, scalar_mode element_mode,
2689 poly_uint64 *nunits)
2691 if (!autovec_use_vlmax_p ())
2692 return false;
2693 if (riscv_v_ext_vector_mode_p (vector_mode)
2694 && multiple_p (BYTES_PER_RISCV_VECTOR * TARGET_MAX_LMUL,
2695 GET_MODE_SIZE (element_mode), nunits))
2696 return true;
2697 if (riscv_v_ext_vls_mode_p (vector_mode)
2698 && multiple_p (TARGET_MIN_VLEN * TARGET_MAX_LMUL,
2699 GET_MODE_SIZE (element_mode), nunits))
2700 return true;
2701 return false;
2704 /* If the given VECTOR_MODE is an RVV mode, first get the largest number
2705 of units that fit into a full vector at the given ELEMENT_MODE.
2706 We will have the vectorizer call us with a successively decreasing
2707 number of units (as specified in autovectorize_vector_modes).
2708 The starting mode is always the one specified by preferred_simd_mode. */
2709 opt_machine_mode
2710 vectorize_related_mode (machine_mode vector_mode, scalar_mode element_mode,
2711 poly_uint64 nunits)
2713 /* TODO: We will support RVV VLS auto-vectorization mode in the future. */
2714 poly_uint64 min_units;
2715 if (can_find_related_mode_p (vector_mode, element_mode, &min_units))
2717 machine_mode rvv_mode;
2718 if (maybe_ne (nunits, 0U))
2720 /* If we were given a number of units NUNITS, try to find an
2721 RVV vector mode of inner mode ELEMENT_MODE with the same
2722 number of units. */
2723 if (multiple_p (min_units, nunits)
2724 && get_vector_mode (element_mode, nunits).exists (&rvv_mode))
2725 return rvv_mode;
2727 else
2729 /* Look for a vector mode with the same number of units as the
2730 VECTOR_MODE we were given. We keep track of the minimum
2731 number of units so far which determines the smallest necessary
2732 but largest possible, suitable mode for vectorization. */
2733 min_units = ordered_min (min_units, GET_MODE_SIZE (vector_mode));
2734 if (get_vector_mode (element_mode, min_units).exists (&rvv_mode))
2735 return rvv_mode;
2739 return default_vectorize_related_mode (vector_mode, element_mode, nunits);
2742 /* Expand an RVV comparison. */
2744 void
2745 expand_vec_cmp (rtx target, rtx_code code, rtx op0, rtx op1)
2747 machine_mode mask_mode = GET_MODE (target);
2748 machine_mode data_mode = GET_MODE (op0);
2749 insn_code icode = get_cmp_insn_code (code, data_mode);
2751 if (code == LTGT)
2753 rtx lt = gen_reg_rtx (mask_mode);
2754 rtx gt = gen_reg_rtx (mask_mode);
2755 expand_vec_cmp (lt, LT, op0, op1);
2756 expand_vec_cmp (gt, GT, op0, op1);
2757 icode = code_for_pred (IOR, mask_mode);
2758 rtx ops[] = {target, lt, gt};
2759 emit_vlmax_insn (icode, BINARY_MASK_OP, ops);
2760 return;
2763 rtx cmp = gen_rtx_fmt_ee (code, mask_mode, op0, op1);
2764 rtx ops[] = {target, cmp, op0, op1};
2765 emit_vlmax_insn (icode, COMPARE_OP, ops);
2768 void
2769 expand_vec_cmp (rtx target, rtx_code code, rtx mask, rtx maskoff, rtx op0,
2770 rtx op1)
2772 machine_mode mask_mode = GET_MODE (target);
2773 machine_mode data_mode = GET_MODE (op0);
2774 insn_code icode = get_cmp_insn_code (code, data_mode);
2776 if (code == LTGT)
2778 rtx lt = gen_reg_rtx (mask_mode);
2779 rtx gt = gen_reg_rtx (mask_mode);
2780 expand_vec_cmp (lt, LT, mask, maskoff, op0, op1);
2781 expand_vec_cmp (gt, GT, mask, maskoff, op0, op1);
2782 icode = code_for_pred (IOR, mask_mode);
2783 rtx ops[] = {target, lt, gt};
2784 emit_vlmax_insn (icode, BINARY_MASK_OP, ops);
2785 return;
2788 rtx cmp = gen_rtx_fmt_ee (code, mask_mode, op0, op1);
2789 rtx ops[] = {target, mask, maskoff, cmp, op0, op1};
2790 emit_vlmax_insn (icode, COMPARE_OP_MU, ops);
2793 /* Expand an RVV floating-point comparison:
2795 If CAN_INVERT_P is true, the caller can also handle inverted results;
2796 return true if the result is in fact inverted. */
2798 bool
2799 expand_vec_cmp_float (rtx target, rtx_code code, rtx op0, rtx op1,
2800 bool can_invert_p)
2802 machine_mode mask_mode = GET_MODE (target);
2803 machine_mode data_mode = GET_MODE (op0);
2805 /* If can_invert_p = true:
2806 It suffices to implement a u>= b as !(a < b) but with the NaNs masked off:
2808 vmfeq.vv v0, va, va
2809 vmfeq.vv v1, vb, vb
2810 vmand.mm v0, v0, v1
2811 vmflt.vv v0, va, vb, v0.t
2812 vmnot.m v0, v0
2814 And, if !HONOR_SNANS, then you can remove the vmand.mm by masking the
2815 second vmfeq.vv:
2817 vmfeq.vv v0, va, va
2818 vmfeq.vv v0, vb, vb, v0.t
2819 vmflt.vv v0, va, vb, v0.t
2820 vmnot.m v0, v0
2822 If can_invert_p = false:
2824 # Example of implementing isgreater()
2825 vmfeq.vv v0, va, va # Only set where A is not NaN.
2826 vmfeq.vv v1, vb, vb # Only set where B is not NaN.
2827 vmand.mm v0, v0, v1 # Only set where A and B are ordered,
2828 vmfgt.vv v0, va, vb, v0.t # so only set flags on ordered values.
2831 rtx eq0 = gen_reg_rtx (mask_mode);
2832 rtx eq1 = gen_reg_rtx (mask_mode);
2833 switch (code)
2835 case EQ:
2836 case NE:
2837 case LT:
2838 case LE:
2839 case GT:
2840 case GE:
2841 case LTGT:
2842 /* There is native support for the comparison. */
2843 expand_vec_cmp (target, code, op0, op1);
2844 return false;
2845 case UNEQ:
2846 case ORDERED:
2847 case UNORDERED:
2848 case UNLT:
2849 case UNLE:
2850 case UNGT:
2851 case UNGE:
2852 /* vmfeq.vv v0, va, va */
2853 expand_vec_cmp (eq0, EQ, op0, op0);
2854 if (HONOR_SNANS (data_mode))
2857 vmfeq.vv v1, vb, vb
2858 vmand.mm v0, v0, v1
2860 expand_vec_cmp (eq1, EQ, op1, op1);
2861 insn_code icode = code_for_pred (AND, mask_mode);
2862 rtx ops[] = {eq0, eq0, eq1};
2863 emit_vlmax_insn (icode, BINARY_MASK_OP, ops);
2865 else
2867 /* vmfeq.vv v0, vb, vb, v0.t */
2868 expand_vec_cmp (eq0, EQ, eq0, eq0, op1, op1);
2870 break;
2871 default:
2872 gcc_unreachable ();
2875 if (code == ORDERED)
2877 emit_move_insn (target, eq0);
2878 return false;
2881 /* There is native support for the inverse comparison. */
2882 code = reverse_condition_maybe_unordered (code);
2883 if (code == ORDERED)
2884 emit_move_insn (target, eq0);
2885 else
2886 expand_vec_cmp (eq0, code, eq0, eq0, op0, op1);
2888 if (can_invert_p)
2890 emit_move_insn (target, eq0);
2891 return true;
2894 /* We use one_cmpl<mode>2 so that the combine pass can combine mask
2895 instructions into vmand.mm/vmnand.mm/vmnor.mm/vmxnor.mm. */
2896 emit_insn (gen_rtx_SET (target, gen_rtx_NOT (mask_mode, eq0)));
2897 return false;
2900 /* Modulo all SEL indices to ensure they are all in the range [0, MAX_SEL].
2901 MAX_SEL is nunits - 1 if rtx_equal_p (op0, op1). Otherwise, it is
2902 2 * nunits - 1. */
2903 static rtx
2904 modulo_sel_indices (rtx op0, rtx op1, rtx sel)
2906 rtx sel_mod;
2907 machine_mode sel_mode = GET_MODE (sel);
2908 poly_uint64 nunits = GET_MODE_NUNITS (sel_mode);
2909 poly_uint64 max_sel = rtx_equal_p (op0, op1) ? nunits - 1 : 2 * nunits - 1;
2910 /* If SEL is a variable-length CONST_VECTOR, we don't need to modulo it.
2911 Likewise, if SEL is constant-length and within [0, MAX_SEL], there is no
2912 need to modulo the indices. */
2913 if (CONST_VECTOR_P (sel)
2914 && (!nunits.is_constant () || const_vec_all_in_range_p (sel, 0, max_sel)))
2915 sel_mod = sel;
2916 else
2918 rtx mod = gen_const_vector_dup (sel_mode, max_sel);
2919 sel_mod
2920 = expand_simple_binop (sel_mode, AND, sel, mod, NULL, 0, OPTAB_DIRECT);
2922 return sel_mod;
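/* Worked example (illustrative): with nunits = 4 and op0 == op1, MAX_SEL
   is 3, so sel = { 0, 5, 2, 7 } is ANDed with { 3, 3, 3, 3 } to give
   { 0, 1, 2, 3 }.  With op0 != op1, MAX_SEL is 7 and that selector is
   already in range, so it is left untouched.  */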
2925 /* Implement vec_perm<mode>. */
2927 void
2928 expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
2930 machine_mode data_mode = GET_MODE (target);
2931 machine_mode sel_mode = GET_MODE (sel);
2932 poly_uint64 nunits = GET_MODE_NUNITS (sel_mode);
2934 /* Check if sel only references the first values vector. If each select
2935 index is in the range [0, nunits - 1], a single vrgather instruction is
2936 enough. Since we will use vrgatherei16.vv for variable-length vectors,
2937 it is never out of range and we don't need to modulo the index. */
2938 if (nunits.is_constant () && const_vec_all_in_range_p (sel, 0, nunits - 1))
2940 emit_vlmax_gather_insn (target, op0, sel);
2941 return;
2944 /* Check if all the indices are the same. */
2945 rtx elt;
2946 if (const_vec_duplicate_p (sel, &elt))
2948 poly_uint64 value = rtx_to_poly_int64 (elt);
2949 rtx op = op0;
2950 if (maybe_gt (value, nunits - 1))
2952 sel = gen_const_vector_dup (sel_mode, value - nunits);
2953 op = op1;
2955 emit_vlmax_gather_insn (target, op, sel);
2958 /* Note: vec_perm indices are supposed to wrap when they go beyond the
2959 size of the two value vectors, i.e. the upper bits of the indices
2960 are effectively ignored. RVV vrgather instead produces 0 for any
2961 out-of-range indices, so we need to modulo all the vec_perm indices
2962 to ensure they are all in range of [0, nunits - 1] when op0 == op1
2963 or all in range of [0, 2 * nunits - 1] when op0 != op1. */
2964 rtx sel_mod = modulo_sel_indices (op0, op1, sel);
2966 /* Check if the two values vectors are the same. */
2967 if (rtx_equal_p (op0, op1))
2969 emit_vlmax_gather_insn (target, op0, sel_mod);
2970 return;
2973 /* The following sequence handles the case of
2974 __builtin_shufflevector (vec1, vec2, index...), where each index can be
2975 any value in the range [0, 2 * nunits - 1]. */
2976 machine_mode mask_mode;
2977 mask_mode = get_mask_mode (data_mode);
2978 rtx mask = gen_reg_rtx (mask_mode);
2979 rtx max_sel = gen_const_vector_dup (sel_mode, nunits);
2981 /* Step 1: generate a mask that should select everything >= nunits into the
2982 * mask. */
2983 expand_vec_cmp (mask, GEU, sel_mod, max_sel);
2985 /* Step 2: gather the op0 values indexed by sel into target;
2986 we don't need to care about the result of the elements
2987 whose index >= nunits. */
2988 emit_vlmax_gather_insn (target, op0, sel_mod);
2990 /* Step 3: shift the range from (nunits, max_of_mode] to
2991 [0, max_of_mode - nunits]. */
2992 rtx tmp = gen_reg_rtx (sel_mode);
2993 rtx ops[] = {tmp, sel_mod, max_sel};
2994 emit_vlmax_insn (code_for_pred (MINUS, sel_mode), BINARY_OP, ops);
2996 /* Step 4: gather those into the previously masked-out elements
2997 of target. */
2998 emit_vlmax_masked_gather_mu_insn (target, op1, tmp, mask);
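/* Worked example of the general two-source path above (illustrative):
   nunits = 4, sel = { 0, 5, 2, 7 }.
   Step 1: mask = (sel >= 4)            -> { 0, 1, 0, 1 }
   Step 2: gather op0 by sel            -> lanes 0 and 2 hold op0[0], op0[2]
   Step 3: tmp = sel - 4                -> meaningful only in the masked lanes
   Step 4: masked gather from op1 by tmp fills lanes 1 and 3 with
           op1[1] and op1[3], giving { op0[0], op1[1], op0[2], op1[3] }.  */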
3001 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST for RVV. */
3003 /* vec_perm support. */
3005 struct expand_vec_perm_d
3007 rtx target, op0, op1;
3008 vec_perm_indices perm;
3009 machine_mode vmode;
3010 machine_mode op_mode;
3011 bool one_vector_p;
3012 bool testing_p;
3015 /* Return the appropriate index mode for gather instructions. */
3016 opt_machine_mode
3017 get_gather_index_mode (struct expand_vec_perm_d *d)
3019 machine_mode sel_mode = related_int_vector_mode (d->vmode).require ();
3020 poly_uint64 nunits = GET_MODE_NUNITS (d->vmode);
3022 if (GET_MODE_INNER (d->vmode) == QImode)
3024 if (nunits.is_constant ())
3026 /* If the indices are an LMUL8 CONST_VECTOR and any element value
3027 exceeds the range 0 ~ 255, forbid such a permutation
3028 since we would need a vector HI mode to hold such indices and
3029 we don't have one. */
3030 if (!d->perm.all_in_range_p (0, 255)
3031 && !get_vector_mode (HImode, nunits).exists (&sel_mode))
3032 return opt_machine_mode ();
3034 else
3036 /* Permuting two SEW8 variable-length vectors needs vrgatherei16.vv.
3037 Otherwise, it could overflow the index range. */
3038 if (!get_vector_mode (HImode, nunits).exists (&sel_mode))
3039 return opt_machine_mode ();
3042 else if (riscv_get_v_regno_alignment (sel_mode) > 1
3043 && GET_MODE_INNER (sel_mode) != HImode)
3044 sel_mode = get_vector_mode (HImode, nunits).require ();
3045 return sel_mode;
3048 /* Recognize the patterns where we can use a merge operation to shuffle the
3049 vectors. The value of each element (index i) in the selector can only be
3050 either i or nunits + i. We will check that the pattern is actually monotonic.
3052 E.g.
3053 v = VEC_PERM_EXPR (v0, v1, selector),
3054 selector = { 0, nunits + 1, 2, nunits + 3, 4, nunits + 5, ... }
3056 We can transform such pattern into:
3058 v = vcond_mask (v0, v1, mask),
3059 mask = { 0, 1, 0, 1, 0, 1, ... }. */
3061 static bool
3062 shuffle_merge_patterns (struct expand_vec_perm_d *d)
3064 machine_mode vmode = d->vmode;
3065 machine_mode sel_mode = related_int_vector_mode (vmode).require ();
3066 int n_patterns = d->perm.encoding ().npatterns ();
3067 poly_int64 vec_len = d->perm.length ();
3069 for (int i = 0; i < n_patterns; ++i)
3070 if (!known_eq (d->perm[i], i) && !known_eq (d->perm[i], vec_len + i))
3071 return false;
3073 /* Check that the pattern is monotonic here; otherwise, return false. */
3074 for (int i = n_patterns; i < n_patterns * 2; i++)
3075 if (!d->perm.series_p (i, n_patterns, i, n_patterns)
3076 && !d->perm.series_p (i, n_patterns, vec_len + i, n_patterns))
3077 return false;
3079 /* We need to use a precomputed mask in such a situation, and such a mask
3080 can only be computed for compile-time known size modes. */
3081 bool indices_fit_selector_p
3082 = GET_MODE_BITSIZE (GET_MODE_INNER (vmode)) > 8 || known_lt (vec_len, 256);
3083 if (!indices_fit_selector_p && !vec_len.is_constant ())
3084 return false;
3086 if (d->testing_p)
3087 return true;
3089 machine_mode mask_mode = get_mask_mode (vmode);
3090 rtx mask = gen_reg_rtx (mask_mode);
3092 if (indices_fit_selector_p)
3094 /* MASK = SELECTOR < NUNITS ? 1 : 0. */
3095 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
3096 rtx x = gen_int_mode (vec_len, GET_MODE_INNER (sel_mode));
3097 insn_code icode = code_for_pred_cmp_scalar (sel_mode);
3098 rtx cmp = gen_rtx_fmt_ee (LTU, mask_mode, sel, x);
3099 rtx ops[] = {mask, cmp, sel, x};
3100 emit_vlmax_insn (icode, COMPARE_OP, ops);
3102 else
3104 /* For EEW8, NUNITS may be larger than 255, so we can't use vmsltu
3105 directly to generate the selector mask; instead, we can only use a
3106 precomputed mask.
3108 E.g. for selector = <0, 257, 2, 259> on an EEW8 vector with NUNITS = 256,
3109 we don't have a QImode scalar register that can hold a value larger than 255.
3110 We also cannot hold that in a vector QImode register if LMUL = 8, and,
3111 since there is no larger HI mode vector we cannot create a larger
3112 selector.
3114 As the mask is a simple {0, 1, ...} pattern and the length is known we
3115 can store it in a scalar register and broadcast it to a mask register.
3117 gcc_assert (vec_len.is_constant ());
3118 int size = CEIL (GET_MODE_NUNITS (mask_mode).to_constant (), 8);
3119 machine_mode mode = get_vector_mode (QImode, size).require ();
3120 rtx tmp = gen_reg_rtx (mode);
3121 rvv_builder v (mode, 1, size);
3122 for (int i = 0; i < vec_len.to_constant () / 8; i++)
3124 uint8_t value = 0;
3125 for (int j = 0; j < 8; j++)
3127 int index = i * 8 + j;
3128 if (known_lt (d->perm[index], 256))
3129 value |= 1 << j;
3131 v.quick_push (gen_int_mode (value, QImode));
3133 emit_move_insn (tmp, v.build ());
3134 emit_move_insn (mask, gen_lowpart (mask_mode, tmp));
3137 /* TARGET = MASK ? OP0 : OP1. */
3138 /* swap op0 and op1 since the order is opposite to pred_merge. */
3139 rtx ops2[] = {d->target, d->op1, d->op0, mask};
3140 emit_vlmax_insn (code_for_pred_merge (vmode), MERGE_OP, ops2);
3141 return true;
3144 /* Recognize consecutive index patterns where we can use a single
3145 vrgather.v[x|i] to shuffle the vectors.
3147 e.g. short[8] = VEC_PERM_EXPR <a, a, {0,1,0,1,0,1,0,1}>
3148 Use SEW = 32, index = 1 vrgather.vi to get the result. */
3149 static bool
3150 shuffle_consecutive_patterns (struct expand_vec_perm_d *d)
3152 machine_mode vmode = d->vmode;
3153 scalar_mode smode = GET_MODE_INNER (vmode);
3154 poly_int64 vec_len = d->perm.length ();
3155 HOST_WIDE_INT elt;
3157 if (!vec_len.is_constant () || !d->perm[0].is_constant (&elt))
3158 return false;
3159 int vlen = vec_len.to_constant ();
3161 /* Compute the last element index of consecutive pattern from the leading
3162 consecutive elements. */
3163 int last_consecutive_idx = -1;
3164 int consecutive_num = -1;
3165 for (int i = 1; i < vlen; i++)
3167 if (maybe_ne (d->perm[i], d->perm[i - 1] + 1))
3168 break;
3169 last_consecutive_idx = i;
3170 consecutive_num = last_consecutive_idx + 1;
3173 int new_vlen = vlen / consecutive_num;
3174 if (last_consecutive_idx < 0 || consecutive_num == vlen
3175 || !pow2p_hwi (consecutive_num) || !pow2p_hwi (new_vlen))
3176 return false;
3177 /* VEC_PERM <..., (index, index + 1, ... index + consecutive_num - 1)>.
3178 All elements of index, index + 1, ... index + consecutive_num - 1 should
3179 be located in the same vector. */
3180 if (maybe_ge (d->perm[0], vec_len)
3181 != maybe_ge (d->perm[last_consecutive_idx], vec_len))
3182 return false;
3183 /* If a vector has 8 elements, we allow optimizations on consecutive
3184 patterns, e.g. <0, 1, 2, 3, 0, 1, 2, 3> or <4, 5, 6, 7, 4, 5, 6, 7>.
3185 Other patterns like <2, 3, 4, 5, 2, 3, 4, 5> are not feasible patterns
3186 to optimize. */
3187 if (d->perm[0].to_constant () % consecutive_num != 0)
3188 return false;
3189 unsigned int container_bits = consecutive_num * GET_MODE_BITSIZE (smode);
3190 if (container_bits > 64)
3191 return false;
3192 else if (container_bits == 64)
3194 if (!TARGET_VECTOR_ELEN_64)
3195 return false;
3196 else if (FLOAT_MODE_P (smode) && !TARGET_VECTOR_ELEN_FP_64)
3197 return false;
3200 /* Check that the rest of the elements follow the same consecutive pattern. */
3201 for (int i = consecutive_num; i < vlen; i++)
3202 if (maybe_ne (d->perm[i], d->perm[i % consecutive_num]))
3203 return false;
3205 if (FLOAT_MODE_P (smode))
3206 smode = float_mode_for_size (container_bits).require ();
3207 else
3208 smode = int_mode_for_size (container_bits, 0).require ();
3209 if (!get_vector_mode (smode, new_vlen).exists (&vmode))
3210 return false;
3211 machine_mode sel_mode = related_int_vector_mode (vmode).require ();
3213 /* Success! */
3214 if (d->testing_p)
3215 return true;
3217 int index = elt / consecutive_num;
3218 if (index >= new_vlen)
3219 index = index - new_vlen;
3220 rtx sel = gen_const_vector_dup (sel_mode, index);
3221 rtx op = elt >= vlen ? d->op0 : d->op1;
3222 emit_vlmax_gather_insn (gen_lowpart (vmode, d->target),
3223 gen_lowpart (vmode, op), sel);
3224 return true;
3227 /* Recognize the patterns where we can use a compress operation to shuffle the
3228 vectors. The perm selector of a compress pattern is divided into 2 parts:
3229 the first part consists of arbitrary index numbers < NUNITS;
3230 the second part is the last N consecutive index numbers >= NUNITS.
3232 E.g.
3233 v = VEC_PERM_EXPR (v0, v1, selector),
3234 selector = { 0, 2, 6, 7 }
3236 We can transform such pattern into:
3238 op1 = vcompress (op0, mask)
3239 mask = { 1, 0, 1, 0 }
3240 v = op1. */
3242 static bool
3243 shuffle_compress_patterns (struct expand_vec_perm_d *d)
3245 machine_mode vmode = d->vmode;
3246 poly_int64 vec_len = d->perm.length ();
3248 if (!vec_len.is_constant ())
3249 return false;
3251 int vlen = vec_len.to_constant ();
3253 /* The compress pattern is not worthwhile when it has fewer than 4 elements,
3254 and we can't modulo the indices for the compress pattern. */
3255 if (known_ge (d->perm[vlen - 1], vlen * 2) || vlen < 4)
3256 return false;
3258 /* Compress pattern doesn't work for one vector. */
3259 if (d->one_vector_p)
3260 return false;
3262 /* The compress point is the point at which all selector elements with
3263 index i >= compress point form a consecutively increasing series and
3264 each selector value is >= NUNITS. In this case, we can compress all
3265 elements with i < compress point into op1. */
3266 int compress_point = -1;
3267 for (int i = 0; i < vlen; i++)
3269 if (compress_point < 0 && known_ge (d->perm[i], vec_len))
3271 compress_point = i;
3272 break;
3276 /* We don't apply compress approach if we can't find the compress point. */
3277 if (compress_point < 0)
3278 return false;
3280 /* We can only apply compress approach when all index values from 0 to
3281 compress point are increasing. */
3282 for (int i = 1; i < compress_point; i++)
3283 if (maybe_le (d->perm[i], d->perm[i - 1]))
3284 return false;
3286 /* The series must be consecutively increasing from the compress point. */
3287 for (int i = 1 + compress_point; i < vlen; i++)
3288 if (maybe_ne (d->perm[i], d->perm[i - 1] + 1))
3289 return false;
3291 /* Success! */
3292 if (d->testing_p)
3293 return true;
3295 /* Check whether we need to slide op1 up to apply the compress approach.
3297 E.g. for index = { 0, 2, 6, 7 }, since d->perm[vlen - 1] = 7, which
3298 is 2 * NUNITS - 1, we don't need to slide up.
3300 For index = { 0, 2, 5, 6 }, we need to slide op1 up before
3301 we apply the compress approach. */
3302 bool need_slideup_p = maybe_ne (d->perm[vlen - 1], 2 * vec_len - 1)
3303 && !const_vec_duplicate_p (d->op1);
3305 /* If we leave it to be handled directly by the general gather,
3306 the code sequence will be:
3307 VECTOR LOAD selector
3308 GEU mask, selector, NUNITS
3309 GATHER dest, op0, selector
3310 SUB selector, selector, NUNITS
3311 GATHER dest, op1, selector, mask
3312 Each ALU operation is considered COST = 1 and a VECTOR LOAD is considered
3313 COST = 4, so we consider the general gather handling to have COST = 9.
3314 TODO: This cost is not accurate; we can adjust it by tune info. */
3315 int general_cost = 9;
3317 /* If we can use the compress approach, the code sequence will be:
3318 MASK LOAD mask
3319 COMPRESS op1, op0, mask
3320 If it needs slide up, it will be:
3321 MASK LOAD mask
3322 SLIDEUP op1
3323 COMPRESS op1, op0, mask
3324 By default, a mask load has COST = 2.
3325 TODO: This cost is not accurate; we can adjust it by tune info. */
3326 int compress_cost = 4;
3328 if (general_cost <= compress_cost)
3329 return false;
3331 /* Build a mask that is true for each op0 element selected by the first part of the selector. */
3332 machine_mode mask_mode = get_mask_mode (vmode);
3333 rvv_builder builder (mask_mode, vlen, 1);
3334 for (int i = 0; i < vlen; i++)
3336 bool is_compress_index = false;
3337 for (int j = 0; j < compress_point; j++)
3339 if (known_eq (d->perm[j], i))
3341 is_compress_index = true;
3342 break;
3345 if (is_compress_index)
3346 builder.quick_push (CONST1_RTX (BImode));
3347 else
3348 builder.quick_push (CONST0_RTX (BImode));
3350 rtx mask = force_reg (mask_mode, builder.build ());
3352 rtx merge = d->op1;
3353 if (need_slideup_p)
3355 int slideup_cnt = vlen - (d->perm[vlen - 1].to_constant () % vlen) - 1;
3356 merge = gen_reg_rtx (vmode);
3357 rtx ops[] = {merge, d->op1, gen_int_mode (slideup_cnt, Pmode)};
3358 insn_code icode = code_for_pred_slide (UNSPEC_VSLIDEUP, vmode);
3359 emit_vlmax_insn (icode, BINARY_OP, ops);
3362 insn_code icode = code_for_pred_compress (vmode);
3363 rtx ops[] = {d->target, merge, d->op0, mask};
3364 emit_vlmax_insn (icode, COMPRESS_OP_MERGE, ops);
3365 return true;
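/* Worked example (illustrative): vlen = 4, selector = { 0, 2, 6, 7 }.
   The compress point is 2 (the first index >= vlen), the op0 mask is
   { 1, 0, 1, 0 }, and since d->perm[3] = 7 = 2 * vlen - 1 no slide-up of
   op1 is needed.  The vcompress with op1 as the merge operand then yields
   { op0[0], op0[2], op1[2], op1[3] }.  */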
3368 /* Recognize decompress patterns:
3370 1. VEC_PERM_EXPR op0 and op1
3371 with isel = { 0, nunits, 1, nunits + 1, ... }.
3372 Decompress op0 and op1 vector with the mask = { 0, 1, 0, 1, ... }.
3374 2. VEC_PERM_EXPR op0 and op1
3375 with isel = { 1/2 nunits, 3/2 nunits, 1/2 nunits+1, 3/2 nunits+1,... }.
3376 Slide down op0 and op1 with OFFSET = 1/2 nunits.
3377 Decompress op0 and op1 vector with the mask = { 0, 1, 0, 1, ... }.
3379 static bool
3380 shuffle_decompress_patterns (struct expand_vec_perm_d *d)
3382 poly_uint64 nelt = d->perm.length ();
3383 machine_mode mask_mode = get_mask_mode (d->vmode);
3385 /* For constant size indices, we don't need to handle them here.
3386 Just leave it to vec_perm<mode>. */
3387 if (d->perm.length ().is_constant ())
3388 return false;
3390 poly_uint64 first = d->perm[0];
3391 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
3392 || !d->perm.series_p (0, 2, first, 1)
3393 || !d->perm.series_p (1, 2, first + nelt, 1))
3394 return false;
3396 /* Permuting two SEW8 variable-length vectors needs vrgatherei16.vv.
3397 Otherwise, it could overflow the index range. */
3398 machine_mode sel_mode = related_int_vector_mode (d->vmode).require ();
3399 if (GET_MODE_INNER (d->vmode) == QImode
3400 && !get_vector_mode (HImode, nelt).exists (&sel_mode))
3401 return false;
3403 /* Success! */
3404 if (d->testing_p)
3405 return true;
3407 rtx op0, op1;
3408 if (known_eq (first, 0U))
3410 op0 = d->op0;
3411 op1 = d->op1;
3413 else
3415 op0 = gen_reg_rtx (d->vmode);
3416 op1 = gen_reg_rtx (d->vmode);
3417 insn_code icode = code_for_pred_slide (UNSPEC_VSLIDEDOWN, d->vmode);
3418 rtx ops0[] = {op0, d->op0, gen_int_mode (first, Pmode)};
3419 rtx ops1[] = {op1, d->op1, gen_int_mode (first, Pmode)};
3420 emit_vlmax_insn (icode, BINARY_OP, ops0);
3421 emit_vlmax_insn (icode, BINARY_OP, ops1);
3423 /* Generate the { 0, 1, 0, 1, ... } mask. */
3424 rtx vid = gen_reg_rtx (sel_mode);
3425 rtx vid_repeat = gen_reg_rtx (sel_mode);
3426 expand_vec_series (vid, const0_rtx, const1_rtx);
3427 rtx and_ops[] = {vid_repeat, vid, const1_rtx};
3428 emit_vlmax_insn (code_for_pred_scalar (AND, sel_mode), BINARY_OP, and_ops);
3429 rtx const_vec = gen_const_vector_dup (sel_mode, 1);
3430 rtx mask = gen_reg_rtx (mask_mode);
3431 expand_vec_cmp (mask, EQ, vid_repeat, const_vec);
3432 emit_vlmax_decompress_insn (d->target, op0, op1, mask);
3433 return true;
3436 static bool
3437 shuffle_bswap_pattern (struct expand_vec_perm_d *d)
3439 HOST_WIDE_INT diff;
3440 unsigned i, size, step;
3442 if (!d->one_vector_p || !d->perm[0].is_constant (&diff) || !diff)
3443 return false;
3445 step = diff + 1;
3446 size = step * GET_MODE_UNIT_BITSIZE (d->vmode);
3448 switch (size)
3450 case 16:
3451 break;
3452 case 32:
3453 case 64:
3454 /* We will have a VEC_PERM_EXPR after RTL expansion when invoking
3455 __builtin_bswap. It will generate about 9 instructions in
3456 the loop below, no matter whether it is bswap16, bswap32 or bswap64.
3457 .L2:
3458 1 vle16.v v4,0(a0)
3459 2 vmv.v.x v2,a7
3460 3 vand.vv v2,v6,v2
3461 4 slli a2,a5,1
3462 5 vrgatherei16.vv v1,v4,v2
3463 6 sub a4,a4,a5
3464 7 vse16.v v1,0(a3)
3465 8 add a0,a0,a2
3466 9 add a3,a3,a2
3467 bne a4,zero,.L2
3469 But for bswap16 we may have even simpler code generation, which
3470 has only 7 instructions in the loop below.
3472 1 vle8.v v2,0(a5)
3473 2 addi a5,a5,32
3474 3 vsrl.vi v4,v2,8
3475 4 vsll.vi v2,v2,8
3476 5 vor.vv v4,v4,v2
3477 6 vse8.v v4,0(a4)
3478 7 addi a4,a4,32
3479 bne a5,a6,.L5
3481 Unfortunately, the instructions in the loop grow to 13 and 24
3482 for bswap32 and bswap64 respectively. Thus, we leverage vrgather (9 insns)
3483 for both bswap64 and bswap32, but use shift and or (7 insns)
3484 for bswap16.
3486 default:
3487 return false;
3490 for (i = 0; i < step; i++)
3491 if (!d->perm.series_p (i, step, diff - i, step))
3492 return false;
3494 /* Disable when nunits < 4 since the generic approach used later
3495 is more profitable for BSWAP. */
3496 if (!known_gt (GET_MODE_NUNITS (d->vmode), 2))
3497 return false;
3499 if (d->testing_p)
3500 return true;
3502 machine_mode vhi_mode;
3503 poly_uint64 vhi_nunits = exact_div (GET_MODE_NUNITS (d->vmode), 2);
3505 if (!get_vector_mode (HImode, vhi_nunits).exists (&vhi_mode))
3506 return false;
3508 /* Step-1: Move op0 to src with VHI mode. */
3509 rtx src = gen_reg_rtx (vhi_mode);
3510 emit_move_insn (src, gen_lowpart (vhi_mode, d->op0));
3512 /* Step-2: Shift right 8 bits to dest. */
3513 rtx dest = expand_binop (vhi_mode, lshr_optab, src, gen_int_mode (8, Pmode),
3514 NULL_RTX, 0, OPTAB_DIRECT);
3516 /* Step-3: Shift left 8 bits to src. */
3517 src = expand_binop (vhi_mode, ashl_optab, src, gen_int_mode (8, Pmode),
3518 NULL_RTX, 0, OPTAB_DIRECT);
3520 /* Step-4: Logic Or dest and src to dest. */
3521 dest = expand_binop (vhi_mode, ior_optab, dest, src,
3522 NULL_RTX, 0, OPTAB_DIRECT);
3524 /* Step-5: Move src to target with VQI mode. */
3525 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
3527 return true;
3530 /* Recognize the pattern that can be shuffled by vec_extract and slide1up
3531 approach. */
3533 static bool
3534 shuffle_extract_and_slide1up_patterns (struct expand_vec_perm_d *d)
3536 poly_int64 nunits = GET_MODE_NUNITS (d->vmode);
3538 /* Recognize { nunits - 1, nunits, nunits + 1, ... }. */
3539 if (!d->perm.series_p (0, 2, nunits - 1, 2)
3540 || !d->perm.series_p (1, 2, nunits, 2))
3541 return false;
3543 /* Disable when nunits < 4 since the generic approach used later
3544 is more profitable for indices = { nunits - 1, nunits }. */
3545 if (!known_gt (nunits, 2))
3546 return false;
3548 /* Success! */
3549 if (d->testing_p)
3550 return true;
3552 /* Extract the last element of the first vector. */
3553 scalar_mode smode = GET_MODE_INNER (d->vmode);
3554 rtx tmp = gen_reg_rtx (smode);
3555 emit_vec_extract (tmp, d->op0, gen_int_mode (nunits - 1, Pmode));
3557 /* Insert the scalar into element 0. */
3558 unsigned int unspec
3559 = FLOAT_MODE_P (d->vmode) ? UNSPEC_VFSLIDE1UP : UNSPEC_VSLIDE1UP;
3560 insn_code icode = code_for_pred_slide (unspec, d->vmode);
3561 rtx ops[] = {d->target, d->op1, tmp};
3562 emit_vlmax_insn (icode, BINARY_OP, ops);
3563 return true;
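/* Worked example (illustrative): nunits = 4, selector = { 3, 4, 5, 6 }.
   We extract op0[3] into a scalar and vslide1up op1 by it, giving
   { op0[3], op1[0], op1[1], op1[2] }, which is exactly elements
   { 3, 4, 5, 6 } of the concatenated inputs.  */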
3566 static bool
3567 shuffle_series_patterns (struct expand_vec_perm_d *d)
3569 if (!d->one_vector_p || d->perm.encoding ().npatterns () != 1)
3570 return false;
3572 poly_int64 el1 = d->perm[0];
3573 poly_int64 el2 = d->perm[1];
3574 poly_int64 el3 = d->perm[2];
3576 poly_int64 step1 = el2 - el1;
3577 poly_int64 step2 = el3 - el2;
3579 bool need_insert = false;
3580 bool have_series = false;
3582 /* Check for a full series. */
3583 if (known_ne (step1, 0) && d->perm.series_p (0, 1, el1, step1))
3584 have_series = true;
3586 /* Check for a series starting at the second element. */
3587 else if (known_ne (step2, 0) && d->perm.series_p (1, 1, el2, step2))
3589 have_series = true;
3590 need_insert = true;
3593 if (!have_series)
3594 return false;
3596 /* Disable shuffle if we can't find an appropriate integer index mode for
3597 gather. */
3598 machine_mode sel_mode;
3599 if (!get_gather_index_mode (d).exists (&sel_mode))
3600 return false;
3602 /* Success! */
3603 if (d->testing_p)
3604 return true;
3606 /* Create the series. */
3607 machine_mode eltmode = Pmode;
3608 rtx series = gen_reg_rtx (sel_mode);
3609 expand_vec_series (series, gen_int_mode (need_insert ? el2 : el1, eltmode),
3610 gen_int_mode (need_insert ? step2 : step1, eltmode));
3612 /* Insert the remaining element if necessary. */
3613 if (need_insert)
3615 insn_code icode = code_for_pred_slide (UNSPEC_VSLIDE1UP, sel_mode);
3616 rtx ops[]
3617 = {series, series, gen_int_mode (el1, GET_MODE_INNER (sel_mode))};
3618 emit_vlmax_insn (icode, BINARY_OP, ops);
3621 emit_vlmax_gather_insn (d->target, d->op0, series);
3623 return true;
3626 /* Recognize the pattern that can be shuffled by generic approach. */
3628 static bool
3629 shuffle_generic_patterns (struct expand_vec_perm_d *d)
3631 machine_mode sel_mode;
3633 /* We don't enable SLP for non-power of 2 NPATTERNS. */
3634 if (!pow2p_hwi (d->perm.encoding().npatterns ()))
3635 return false;
3637 /* Disable shuffle if we can't find an appropriate integer index mode for
3638 gather. */
3639 if (!get_gather_index_mode (d).exists (&sel_mode))
3640 return false;
3642 /* Success! */
3643 if (d->testing_p)
3644 return true;
3646 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
3647 /* Some FIXED-VLMAX/VLS vector permutation situations call the target hook
3648 instead of expanding vec_perm<mode>, so we handle them directly. */
3649 expand_vec_perm (d->target, d->op0, d->op1, sel);
3650 return true;
3653 /* This function recognizes and supports different permutation patterns
3654 and enable VLA SLP auto-vectorization. */
3655 static bool
3656 expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
3658 gcc_assert (d->op_mode != E_VOIDmode);
3660 /* The pattern matching functions above are written to look for a small
3661 number to begin the sequence (0, 1, N/2). If we begin with an index
3662 from the second operand, we can swap the operands. */
3663 poly_int64 nelt = d->perm.length ();
3664 if (known_ge (d->perm[0], nelt))
3666 d->perm.rotate_inputs (1);
3667 std::swap (d->op0, d->op1);
3670 if (known_gt (nelt, 1))
3672 if (d->vmode == d->op_mode)
3674 if (shuffle_merge_patterns (d))
3675 return true;
3676 if (shuffle_consecutive_patterns (d))
3677 return true;
3678 if (shuffle_compress_patterns (d))
3679 return true;
3680 if (shuffle_decompress_patterns (d))
3681 return true;
3682 if (shuffle_bswap_pattern (d))
3683 return true;
3684 if (shuffle_extract_and_slide1up_patterns (d))
3685 return true;
3686 if (shuffle_series_patterns (d))
3687 return true;
3688 if (shuffle_generic_patterns (d))
3689 return true;
3690 return false;
3692 else
3693 return false;
3695 return false;
3698 /* This function implements TARGET_VECTORIZE_VEC_PERM_CONST by using RVV
3699 * instructions. */
3700 bool
3701 expand_vec_perm_const (machine_mode vmode, machine_mode op_mode, rtx target,
3702 rtx op0, rtx op1, const vec_perm_indices &sel)
3704 /* RVV doesn't have mask-mode pack/unpack instructions and we don't use
3705 a mask to control the loop iteration. Just disable it directly. */
3706 if (GET_MODE_CLASS (vmode) == MODE_VECTOR_BOOL)
3707 return false;
3708 /* FIXME: Explicitly disable VLA interleaved SLP vectorization because we
3709 may encounter an ICE for poly size (1, 1) vectors in the loop vectorizer.
3710 Ideally, the middle-end loop vectorizer should be able to disable it
3711 itself; we can remove the code here once the middle end is able
3712 to disable VLA SLP vectorization for a poly size (1, 1) VF. */
3713 if (!BYTES_PER_RISCV_VECTOR.is_constant ()
3714 && maybe_lt (BYTES_PER_RISCV_VECTOR * TARGET_MAX_LMUL,
3715 poly_int64 (16, 16)))
3716 return false;
3718 struct expand_vec_perm_d d;
3720 /* Check whether the mask can be applied to a single vector. */
3721 if (sel.ninputs () == 1 || (op0 && rtx_equal_p (op0, op1)))
3722 d.one_vector_p = true;
3723 else if (sel.all_from_input_p (0))
3725 d.one_vector_p = true;
3726 op1 = op0;
3728 else if (sel.all_from_input_p (1))
3730 d.one_vector_p = true;
3731 op0 = op1;
3733 else
3734 d.one_vector_p = false;
3736 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
3737 sel.nelts_per_input ());
3738 d.vmode = vmode;
3739 d.op_mode = op_mode;
3740 d.target = target;
3741 d.op0 = op0;
3742 if (op0 == op1)
3743 d.op1 = d.op0;
3744 else
3745 d.op1 = op1;
3746 d.testing_p = !target;
3748 if (!d.testing_p)
3749 return expand_vec_perm_const_1 (&d);
3751 rtx_insn *last = get_last_insn ();
3752 bool ret = expand_vec_perm_const_1 (&d);
3753 gcc_assert (last == get_last_insn ());
3755 return ret;
3758 /* Generate a vsetvl with no side effects to get the vector length. */
3759 void
3760 expand_select_vl (rtx *ops)
3762 poly_int64 nunits = rtx_to_poly_int64 (ops[2]);
3763 if (CONST_INT_P (ops[1]) && known_le (INTVAL (ops[1]), nunits))
3765 /* If the length is known to be <= VF, we just use it directly instead
3766 of emitting a vsetvli.
3768 E.g. _255 = .SELECT_VL (3, POLY_INT_CST [4, 4]);
3769 We move 3 into _255 instead of using an explicit vsetvl. */
3770 emit_move_insn (ops[0], ops[1]);
3771 return;
3773 /* We arbitrarily pick QImode as the inner scalar mode to get a vector mode,
3774 since vsetvl only demands the ratio. We let the VSETVL pass optimize it. */
3775 scalar_int_mode mode = QImode;
3776 machine_mode rvv_mode = get_vector_mode (mode, nunits).require ();
3777 emit_insn (gen_no_side_effects_vsetvl_rtx (rvv_mode, ops[0], ops[1]));
3780 /* Expand MASK_LEN_{LOAD,STORE}. */
3781 void
3782 expand_load_store (rtx *ops, bool is_load)
3784 poly_int64 value;
3785 rtx mask = ops[2];
3786 rtx len = ops[3];
3787 machine_mode mode = GET_MODE (ops[0]);
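/* Rough illustration of the dispatch below: a LEN equal to the mode's poly
   NUNITS, e.g. (4, 4) for a scalable mode, takes the VLMAX path; a constant
   LEN that fits the vsetivli immediate range [0, 31] (constraint K) is kept
   as an immediate AVL; any other LEN is forced into a register first. */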
3789 if (is_vlmax_len_p (mode, len))
3791 /* If the length operand is equal to VF, it is a VLMAX load/store. */
3792 if (is_load)
3794 rtx m_ops[] = {ops[0], mask, ops[1]};
3795 emit_vlmax_insn (code_for_pred_mov (mode), UNARY_OP_TAMA, m_ops);
3797 else
3799 len = gen_reg_rtx (Pmode);
3800 emit_vlmax_vsetvl (mode, len);
3801 emit_insn (gen_pred_store (mode, ops[0], mask, ops[1], len,
3802 get_avl_type_rtx (VLMAX)));
3805 else
3807 if (!satisfies_constraint_K (len))
3808 len = force_reg (Pmode, len);
3809 if (is_load)
3811 rtx m_ops[] = {ops[0], mask, ops[1]};
3812 emit_nonvlmax_insn (code_for_pred_mov (mode), UNARY_OP_TAMA, m_ops,
3813 len);
3815 else
3816 emit_insn (gen_pred_store (mode, ops[0], mask, ops[1], len,
3817 get_avl_type_rtx (NONVLMAX)));
3822 /* Return true if the operation is a floating-point operation that needs FRM. */
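/* Note: the operations excluded below are exactly those whose results do not
   depend on the dynamic rounding mode, e.g. min/max, neg/abs, copysign, moves,
   FP widening and widening integer-to-FP conversions, which are all exact. */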
3823 static bool
3824 needs_fp_rounding (unsigned icode, machine_mode mode)
3826 if (!FLOAT_MODE_P (mode))
3827 return false;
3829 return icode != maybe_code_for_pred (SMIN, mode)
3830 && icode != maybe_code_for_pred (UNSPEC_VFMIN, mode)
3831 && icode != maybe_code_for_pred (SMAX, mode)
3832 && icode != maybe_code_for_pred (UNSPEC_VFMAX, mode)
3833 && icode != maybe_code_for_pred (NEG, mode)
3834 && icode != maybe_code_for_pred (ABS, mode)
3835 /* narrower-FP -> FP */
3836 && icode != maybe_code_for_pred_extend (mode)
3837 /* narrower-INT -> FP */
3838 && icode != maybe_code_for_pred_widen (FLOAT, mode)
3839 && icode != maybe_code_for_pred_widen (UNSIGNED_FLOAT, mode)
3840 /* vfsgnj */
3841 && icode != maybe_code_for_pred (UNSPEC_VCOPYSIGN, mode)
3842 && icode != maybe_code_for_pred_mov (mode);
3845 /* Subroutine to expand COND_LEN_* patterns. */
3846 static void
3847 expand_cond_len_op (unsigned icode, insn_flags op_type, rtx *ops, rtx len)
3849 rtx dest = ops[0];
3850 rtx mask = ops[1];
3851 machine_mode mode = GET_MODE (dest);
3852 machine_mode mask_mode = GET_MODE (mask);
3853 poly_int64 value;
3854 bool is_dummy_mask = rtx_equal_p (mask, CONSTM1_RTX (mask_mode));
3855 bool is_vlmax_len = is_vlmax_len_p (mode, len);
3857 unsigned insn_flags = HAS_DEST_P | HAS_MASK_P | HAS_MERGE_P | op_type;
3858 if (is_dummy_mask)
3859 insn_flags |= TU_POLICY_P | MDEFAULT_POLICY_P;
3860 else if (is_vlmax_len)
3861 insn_flags |= TDEFAULT_POLICY_P | MU_POLICY_P;
3862 else
3863 insn_flags |= TU_POLICY_P | MU_POLICY_P;
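/* Rationale for the policy selection above: with an all-ones mask there are
   no inactive elements, so the mask policy can be the default; with a VLMAX
   length there is no tail, so the tail policy can be the default; otherwise
   both tail and inactive elements must stay undisturbed so that the merge
   value is preserved. */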
3865 if (needs_fp_rounding (icode, mode))
3866 insn_flags |= FRM_DYN_P;
3868 if (is_vlmax_len)
3869 emit_vlmax_insn (icode, insn_flags, ops);
3870 else
3871 emit_nonvlmax_insn (icode, insn_flags, ops, len);
3874 /* Return RVV_VUNDEF if the ELSE value is a scratch rtx. */
3875 static rtx
3876 get_else_operand (rtx op)
3878 return GET_CODE (op) == SCRATCH ? RVV_VUNDEF (GET_MODE (op)) : op;
3881 /* Expand unary ops COND_LEN_*. */
3882 void
3883 expand_cond_len_unop (unsigned icode, rtx *ops)
3885 rtx dest = ops[0];
3886 rtx mask = ops[1];
3887 rtx src = ops[2];
3888 rtx merge = get_else_operand (ops[3]);
3889 rtx len = ops[4];
3891 rtx cond_ops[] = {dest, mask, merge, src};
3892 expand_cond_len_op (icode, UNARY_OP_P, cond_ops, len);
3895 /* Expand unary ops COND_*. */
3896 void
3897 expand_cond_unop (unsigned icode, rtx *ops)
3899 rtx dest = ops[0];
3900 rtx mask = ops[1];
3901 rtx src = ops[2];
3902 rtx merge = get_else_operand (ops[3]);
3903 rtx len = gen_int_mode (GET_MODE_NUNITS (GET_MODE (dest)), Pmode);
3905 rtx cond_ops[] = {dest, mask, merge, src};
3906 expand_cond_len_op (icode, UNARY_OP_P, cond_ops, len);
3909 /* Expand binary ops COND_LEN_*. */
3910 void
3911 expand_cond_len_binop (unsigned icode, rtx *ops)
3913 rtx dest = ops[0];
3914 rtx mask = ops[1];
3915 rtx src1 = ops[2];
3916 rtx src2 = ops[3];
3917 rtx merge = get_else_operand (ops[4]);
3918 rtx len = ops[5];
3920 rtx cond_ops[] = {dest, mask, merge, src1, src2};
3921 expand_cond_len_op (icode, BINARY_OP_P, cond_ops, len);
3924 /* Expand binary ops COND_*. */
3925 void
3926 expand_cond_binop (unsigned icode, rtx *ops)
3928 rtx dest = ops[0];
3929 rtx mask = ops[1];
3930 rtx src1 = ops[2];
3931 rtx src2 = ops[3];
3932 rtx merge = get_else_operand (ops[4]);
3933 rtx len = gen_int_mode (GET_MODE_NUNITS (GET_MODE (dest)), Pmode);
3935 rtx cond_ops[] = {dest, mask, merge, src1, src2};
3936 expand_cond_len_op (icode, BINARY_OP_P, cond_ops, len);
3939 /* Prepare insn_code for gather_load/scatter_store according to
3940 the vector mode and index mode. */
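/* E.g. an indexed load of V4DI with V4SI offsets uses the x2-greater-EEW
   pattern (64 / 32 == 2), while V4QI offsets would use the x8 variant. */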
3941 static insn_code
3942 prepare_gather_scatter (machine_mode vec_mode, machine_mode idx_mode,
3943 bool is_load)
3945 if (!is_load)
3946 return code_for_pred_indexed_store (UNSPEC_UNORDERED, vec_mode, idx_mode);
3947 else
3949 unsigned src_eew_bitsize = GET_MODE_BITSIZE (GET_MODE_INNER (idx_mode));
3950 unsigned dst_eew_bitsize = GET_MODE_BITSIZE (GET_MODE_INNER (vec_mode));
3951 if (dst_eew_bitsize == src_eew_bitsize)
3952 return code_for_pred_indexed_load_same_eew (UNSPEC_UNORDERED, vec_mode);
3953 else if (dst_eew_bitsize > src_eew_bitsize)
3955 unsigned factor = dst_eew_bitsize / src_eew_bitsize;
3956 switch (factor)
3958 case 2:
3959 return code_for_pred_indexed_load_x2_greater_eew (
3960 UNSPEC_UNORDERED, vec_mode);
3961 case 4:
3962 return code_for_pred_indexed_load_x4_greater_eew (
3963 UNSPEC_UNORDERED, vec_mode);
3964 case 8:
3965 return code_for_pred_indexed_load_x8_greater_eew (
3966 UNSPEC_UNORDERED, vec_mode);
3967 default:
3968 gcc_unreachable ();
3971 else
3973 unsigned factor = src_eew_bitsize / dst_eew_bitsize;
3974 switch (factor)
3976 case 2:
3977 return code_for_pred_indexed_load_x2_smaller_eew (
3978 UNSPEC_UNORDERED, vec_mode);
3979 case 4:
3980 return code_for_pred_indexed_load_x4_smaller_eew (
3981 UNSPEC_UNORDERED, vec_mode);
3982 case 8:
3983 return code_for_pred_indexed_load_x8_smaller_eew (
3984 UNSPEC_UNORDERED, vec_mode);
3985 default:
3986 gcc_unreachable ();
3992 /* Expand LEN_MASK_{GATHER_LOAD,SCATTER_STORE}. */
3993 void
3994 expand_gather_scatter (rtx *ops, bool is_load)
3996 rtx ptr, vec_offset, vec_reg;
3997 bool zero_extend_p;
3998 int scale_log2;
3999 rtx mask = ops[5];
4000 rtx len = ops[6];
4001 if (is_load)
4003 vec_reg = ops[0];
4004 ptr = ops[1];
4005 vec_offset = ops[2];
4006 zero_extend_p = INTVAL (ops[3]);
4007 scale_log2 = exact_log2 (INTVAL (ops[4]));
4009 else
4011 vec_reg = ops[4];
4012 ptr = ops[0];
4013 vec_offset = ops[1];
4014 zero_extend_p = INTVAL (ops[2]);
4015 scale_log2 = exact_log2 (INTVAL (ops[3]));
4018 machine_mode vec_mode = GET_MODE (vec_reg);
4019 machine_mode idx_mode = GET_MODE (vec_offset);
4020 scalar_mode inner_idx_mode = GET_MODE_INNER (idx_mode);
4021 unsigned inner_offsize = GET_MODE_BITSIZE (inner_idx_mode);
4022 poly_int64 nunits = GET_MODE_NUNITS (vec_mode);
4023 poly_int64 value;
4024 bool is_vlmax = is_vlmax_len_p (vec_mode, len);
4026 /* Extend the offset elements to the address width. */
4027 if (inner_offsize < BITS_PER_WORD)
4029 /* 7.2. Vector Load/Store Addressing Modes.
4030 If the vector offset elements are narrower than XLEN, they are
4031 zero-extended to XLEN before adding to the ptr effective address. If
4032 the vector offset elements are wider than XLEN, the least-significant
4033 XLEN bits are used in the address calculation. An implementation must
4034 raise an illegal instruction exception if the EEW is not supported for
4035 offset elements.
4037 RVV spec only refers to the scale_log == 0 case. */
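/* E.g. a zero-extended 16-bit offset with scale_log2 == 0 can be used as-is
   since the hardware zero-extends it, but once it has to be scaled it is
   first widened to 32 bits so the shift below cannot overflow; a
   sign-extended offset is always widened to XLEN first. */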
4038 if (!zero_extend_p || scale_log2 != 0)
4040 if (zero_extend_p)
4041 inner_idx_mode
4042 = int_mode_for_size (inner_offsize * 2, 0).require ();
4043 else
4044 inner_idx_mode = int_mode_for_size (BITS_PER_WORD, 0).require ();
4045 machine_mode new_idx_mode
4046 = get_vector_mode (inner_idx_mode, nunits).require ();
4047 rtx tmp = gen_reg_rtx (new_idx_mode);
4048 emit_insn (gen_extend_insn (tmp, vec_offset, new_idx_mode, idx_mode,
4049 zero_extend_p ? true : false));
4050 vec_offset = tmp;
4051 idx_mode = new_idx_mode;
4055 if (scale_log2 != 0)
4057 rtx tmp = expand_binop (idx_mode, ashl_optab, vec_offset,
4058 gen_int_mode (scale_log2, Pmode), NULL_RTX, 0,
4059 OPTAB_DIRECT);
4060 vec_offset = tmp;
4063 insn_code icode = prepare_gather_scatter (vec_mode, idx_mode, is_load);
4064 if (is_vlmax)
4066 if (is_load)
4068 rtx load_ops[]
4069 = {vec_reg, mask, ptr, vec_offset};
4070 emit_vlmax_insn (icode, BINARY_OP_TAMA, load_ops);
4072 else
4074 rtx store_ops[] = {mask, ptr, vec_offset, vec_reg};
4075 emit_vlmax_insn (icode, SCATTER_OP_M, store_ops);
4078 else
4080 if (is_load)
4082 rtx load_ops[]
4083 = {vec_reg, mask, ptr, vec_offset};
4084 emit_nonvlmax_insn (icode, BINARY_OP_TAMA, load_ops, len);
4086 else
4088 rtx store_ops[] = {mask, ptr, vec_offset, vec_reg};
4089 emit_nonvlmax_insn (icode, SCATTER_OP_M, store_ops, len);
4094 /* Expand COND_LEN_*. */
4095 void
4096 expand_cond_len_ternop (unsigned icode, rtx *ops)
4098 rtx dest = ops[0];
4099 rtx mask = ops[1];
4100 rtx src1 = ops[2];
4101 rtx src2 = ops[3];
4102 rtx src3 = ops[4];
4103 rtx merge = get_else_operand (ops[5]);
4104 rtx len = ops[6];
4106 rtx cond_ops[] = {dest, mask, src1, src2, src3, merge};
4107 expand_cond_len_op (icode, TERNARY_OP_P, cond_ops, len);
4110 /* Expand COND_*. */
4111 void
4112 expand_cond_ternop (unsigned icode, rtx *ops)
4114 rtx dest = ops[0];
4115 rtx mask = ops[1];
4116 rtx src1 = ops[2];
4117 rtx src2 = ops[3];
4118 rtx src3 = ops[4];
4119 rtx merge = get_else_operand (ops[5]);
4120 rtx len = gen_int_mode (GET_MODE_NUNITS (GET_MODE (dest)), Pmode);
4122 rtx cond_ops[] = {dest, mask, src1, src2, src3, merge};
4123 expand_cond_len_op (icode, TERNARY_OP_P, cond_ops, len);
4126 /* Expand reduction operations.
4127 Case 1: ops = {scalar_dest, vector_src}
4128 Case 2: ops = {scalar_dest, vector_src, mask, vl}. */
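/* The expansion below is roughly: broadcast INIT into an LMUL-1 register,
   reduce VECTOR_SRC against it (masked and length-limited in case 2), and
   extract element 0 of the result into the scalar destination. */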
4130 void
4131 expand_reduction (unsigned unspec, unsigned insn_flags, rtx *ops, rtx init)
4133 rtx scalar_dest = ops[0];
4134 rtx vector_src = ops[1];
4135 machine_mode vmode = GET_MODE (vector_src);
4136 machine_mode vel_mode = GET_MODE (scalar_dest);
4137 machine_mode m1_mode = get_m1_mode (vel_mode).require ();
4139 rtx m1_tmp = gen_reg_rtx (m1_mode);
4140 rtx scalar_move_ops[] = {m1_tmp, init};
4141 emit_nonvlmax_insn (code_for_pred_broadcast (m1_mode), SCALAR_MOVE_OP,
4142 scalar_move_ops,
4143 need_mask_operand_p (insn_flags) ? ops[3]
4144 : CONST1_RTX (Pmode));
4145 rtx m1_tmp2 = gen_reg_rtx (m1_mode);
4146 rtx reduc_ops[] = {m1_tmp2, vector_src, m1_tmp};
4147 insn_code icode = code_for_pred (unspec, vmode);
4149 if (need_mask_operand_p (insn_flags))
4151 rtx mask_len_reduc_ops[] = {m1_tmp2, ops[2], vector_src, m1_tmp};
4152 emit_nonvlmax_insn (icode, insn_flags, mask_len_reduc_ops, ops[3]);
4154 else
4155 emit_vlmax_insn (icode, insn_flags, reduc_ops);
4157 emit_insn (gen_pred_extract_first (m1_mode, scalar_dest, m1_tmp2));
4160 /* Prepare ops for ternary operations.
4161 It can be called before or after RA. */
4162 void
4163 prepare_ternary_operands (rtx *ops)
4165 machine_mode mode = GET_MODE (ops[0]);
4167 if (!rtx_equal_p (ops[5], RVV_VUNDEF (mode))
4168 && (VECTOR_MODE_P (GET_MODE (ops[2]))
4169 && !rtx_equal_p (ops[2], ops[5]))
4170 && !rtx_equal_p (ops[3], ops[5])
4171 && !rtx_equal_p (ops[4], ops[5]))
4173 /* RA will fail to find a vector REG and report an ICE, so we pre-merge
4174 the ops for LMUL = 8. */
4175 if (satisfies_constraint_Wc1 (ops[1]))
4177 emit_move_insn (ops[0], ops[5]);
4178 emit_insn (gen_pred_mov (mode, ops[0], ops[1], ops[0], ops[4], ops[6],
4179 ops[7], ops[8], ops[9]));
4181 else
4182 emit_insn (gen_pred_merge (mode, ops[0], RVV_VUNDEF (mode), ops[5],
4183 ops[4], ops[1], ops[6], ops[7], ops[9]));
4184 ops[5] = ops[4] = ops[0];
4186 else
4188 /* Swap the multiplication ops if the fallback value is the
4189 second of the two. */
4190 if (rtx_equal_p (ops[3], ops[5]))
4191 std::swap (ops[2], ops[3]);
4193 /* TODO: ??? Maybe we could support splitting FMA (a, 4, b)
4194 into PLUS (ASHIFT (a, 2), b) according to uarchs. */
4196 gcc_assert (rtx_equal_p (ops[5], RVV_VUNDEF (mode))
4197 || rtx_equal_p (ops[5], ops[2]) || rtx_equal_p (ops[5], ops[4]));
4200 /* Expand VEC_MASK_LEN_{LOAD_LANES,STORE_LANES}. */
4201 void
4202 expand_lanes_load_store (rtx *ops, bool is_load)
4204 poly_int64 value;
4205 rtx mask = ops[2];
4206 rtx len = ops[3];
4207 rtx addr = is_load ? XEXP (ops[1], 0) : XEXP (ops[0], 0);
4208 rtx reg = is_load ? ops[0] : ops[1];
4209 machine_mode mode = GET_MODE (ops[0]);
4211 if (is_vlmax_len_p (mode, len))
4213 /* If the length operand is equal to VF, it is a VLMAX load/store. */
4214 if (is_load)
4216 rtx m_ops[] = {reg, mask, addr};
4217 emit_vlmax_insn (code_for_pred_unit_strided_load (mode), UNARY_OP_TAMA,
4218 m_ops);
4220 else
4222 len = gen_reg_rtx (Pmode);
4223 emit_vlmax_vsetvl (mode, len);
4224 emit_insn (gen_pred_unit_strided_store (mode, mask, addr, reg, len,
4225 get_avl_type_rtx (VLMAX)));
4228 else
4230 if (!satisfies_constraint_K (len))
4231 len = force_reg (Pmode, len);
4232 if (is_load)
4234 rtx m_ops[] = {reg, mask, addr};
4235 emit_nonvlmax_insn (code_for_pred_unit_strided_load (mode),
4236 UNARY_OP_TAMA, m_ops, len);
4238 else
4239 emit_insn (gen_pred_unit_strided_store (mode, mask, addr, reg, len,
4240 get_avl_type_rtx (NONVLMAX)));
4244 /* Expand LEN_FOLD_EXTRACT_LAST. */
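/* A sketch of the strategy below: count the active mask bits; if the count
   is zero, fall through to the DEFAULT value; otherwise compress the active
   elements to the front, slide the last active element down to index 0 and
   extract it with v(f)mv.[xf].s. */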
4245 void
4246 expand_fold_extract_last (rtx *ops)
4248 rtx dst = ops[0];
4249 rtx default_value = ops[1];
4250 rtx mask = ops[2];
4251 rtx anchor = gen_reg_rtx (Pmode);
4252 rtx index = gen_reg_rtx (Pmode);
4253 rtx vect = ops[3];
4254 rtx else_label = gen_label_rtx ();
4255 rtx end_label = gen_label_rtx ();
4256 rtx len = ops[4];
4257 poly_int64 value;
4258 machine_mode mode = GET_MODE (vect);
4259 machine_mode mask_mode = GET_MODE (mask);
4260 rtx compress_vect = gen_reg_rtx (mode);
4261 rtx slide_vect = gen_reg_rtx (mode);
4262 insn_code icode;
4264 if (is_vlmax_len_p (mode, len))
4265 len = NULL_RTX;
4267 /* Calculate the number of 1-bits in the mask. */
4268 rtx cpop_ops[] = {anchor, mask};
4269 if (len)
4270 emit_nonvlmax_insn (code_for_pred_popcount (mask_mode, Pmode), CPOP_OP,
4271 cpop_ops, len);
4272 else
4273 emit_vlmax_insn (code_for_pred_popcount (mask_mode, Pmode), CPOP_OP,
4274 cpop_ops);
4276 riscv_expand_conditional_branch (else_label, EQ, anchor, const0_rtx);
4277 emit_insn (gen_rtx_SET (index, gen_rtx_PLUS (Pmode, anchor, constm1_rtx)));
4278 /* Compress the vector. */
4279 icode = code_for_pred_compress (mode);
4280 rtx compress_ops[] = {compress_vect, vect, mask};
4281 if (len)
4282 emit_nonvlmax_insn (icode, COMPRESS_OP, compress_ops, len);
4283 else
4284 emit_vlmax_insn (icode, COMPRESS_OP, compress_ops);
4285 /* Emit the slide down to index 0 in a new vector. */
4286 rtx slide_ops[] = {slide_vect, compress_vect, index};
4287 icode = code_for_pred_slide (UNSPEC_VSLIDEDOWN, mode);
4288 if (len)
4289 emit_nonvlmax_insn (icode, BINARY_OP, slide_ops, len);
4290 else
4291 emit_vlmax_insn (icode, BINARY_OP, slide_ops);
4292 /* Emit v(f)mv.[xf].s. */
4293 emit_insn (gen_pred_extract_first (mode, dst, slide_vect));
4295 emit_jump_insn (gen_jump (end_label));
4296 emit_barrier ();
4297 emit_label (else_label);
4298 emit_move_insn (dst, default_value);
4299 emit_label (end_label);
4302 /* Return true if the LMUL of the comparison mode is less than or equal to one. */
4303 bool
4304 cmp_lmul_le_one (machine_mode mode)
4306 if (riscv_v_ext_vector_mode_p (mode))
4307 return known_le (GET_MODE_SIZE (mode), BYTES_PER_RISCV_VECTOR);
4308 else if (riscv_v_ext_vls_mode_p (mode))
4309 return known_le (GET_MODE_BITSIZE (mode), TARGET_MIN_VLEN);
4310 return false;
4313 /* Return true if the LMUL of the comparison mode is greater than one. */
4314 bool
4315 cmp_lmul_gt_one (machine_mode mode)
4317 if (riscv_v_ext_vector_mode_p (mode))
4318 return known_gt (GET_MODE_SIZE (mode), BYTES_PER_RISCV_VECTOR);
4319 else if (riscv_v_ext_vls_mode_p (mode))
4320 return known_gt (GET_MODE_BITSIZE (mode), TARGET_MIN_VLEN);
4321 return false;
4324 /* Return true if the VLS mode is legal. There are 2 cases here.
4326 1. Enable VLS modes for VLA vectorization since the fixed-length VLMAX mode
4327 is the highest priority choice and should not conflict with VLS modes.
4328 2. Enable VLS modes for some cases in fixed-vlmax, i.e. when the bitsize of
4329 the VLS mode is smaller than that of the minimal VLA mode.
4331 Take vlen = 2048 as an example for case 2.
4333 Note: The table below is based on vlen = 2048.
4334 +----------------------------------------------------+----------------------+
4335 | VLS mode | VLA mode |
4336 +----------------------------------------------------+----------------------+
4337 | Name | Precision | Inner Precision | Enabled | Min mode | Min bits |
4338 +------------+-----------+-----------------+---------+-----------+----------+
4339 | V1BI | 1 | 1 | Yes | RVVMF64BI | 32 |
4340 | V2BI | 2 | 1 | Yes | RVVMF64BI | 32 |
4341 | V4BI | 4 | 1 | Yes | RVVMF64BI | 32 |
4342 | V8BI | 8 | 1 | Yes | RVVMF64BI | 32 |
4343 | V16BI | 16 | 1 | Yes | RVVMF64BI | 32 |
4344 | V32BI | 32 | 1 | NO | RVVMF64BI | 32 |
4345 | V64BI | 64 | 1 | NO | RVVMF64BI | 32 |
4346 | ... | ... | ... | ... | RVVMF64BI | 32 |
4347 | V4096BI | 4096 | 1 | NO | RVVMF64BI | 32 |
4348 +------------+-----------+-----------------+---------+-----------+----------+
4349 | V1QI | 8 | 8 | Yes | RVVMF8QI | 256 |
4350 | V2QI | 16 | 8 | Yes | RVVMF8QI | 256 |
4351 | V4QI | 32 | 8 | Yes | RVVMF8QI | 256 |
4352 | V8QI | 64 | 8 | Yes | RVVMF8QI | 256 |
4353 | V16QI | 128 | 8 | Yes | RVVMF8QI | 256 |
4354 | V32QI | 256 | 8 | NO | RVVMF8QI | 256 |
4355 | V64QI | 512 | 8 | NO | RVVMF8QI | 256 |
4356 | ... | ... | .. | ... | RVVMF8QI | 256 |
4357 | V4096QI | 32768 | 8 | NO | RVVMF8QI | 256 |
4358 +------------+-----------+-----------------+---------+-----------+----------+
4359 | V1HI | 16 | 16 | Yes | RVVMF4HI | 512 |
4360 | V2HI | 32 | 16 | Yes | RVVMF4HI | 512 |
4361 | V4HI | 64 | 16 | Yes | RVVMF4HI | 512 |
4362 | V8HI | 128 | 16 | Yes | RVVMF4HI | 512 |
4363 | V16HI | 256 | 16 | Yes | RVVMF4HI | 512 |
4364 | V32HI | 512 | 16 | NO | RVVMF4HI | 512 |
4365 | V64HI | 1024 | 16 | NO | RVVMF4HI | 512 |
4366 | ... | ... | .. | ... | RVVMF4HI | 512 |
4367 | V2048HI | 32768 | 16 | NO | RVVMF4HI | 512 |
4368 +------------+-----------+-----------------+---------+-----------+----------+
4369 | V1SI/SF | 32 | 32 | Yes | RVVMF2SI | 1024 |
4370 | V2SI/SF | 64 | 32 | Yes | RVVMF2SI | 1024 |
4371 | V4SI/SF | 128 | 32 | Yes | RVVMF2SI | 1024 |
4372 | V8SI/SF | 256 | 32 | Yes | RVVMF2SI | 1024 |
4373 | V16SI/SF | 512 | 32 | Yes | RVVMF2SI | 1024 |
4374 | V32SI/SF | 1024 | 32 | NO | RVVMF2SI | 1024 |
4375 | V64SI/SF | 2048 | 32 | NO | RVVMF2SI | 1024 |
4376 | ... | ... | .. | ... | RVVMF2SI | 1024 |
4377 | V1024SI/SF | 32768 | 32 | NO | RVVMF2SI | 1024 |
4378 +------------+-----------+-----------------+---------+-----------+----------+
4379 | V1DI/DF | 64 | 64 | Yes | RVVM1DI | 2048 |
4380 | V2DI/DF | 128 | 64 | Yes | RVVM1DI | 2048 |
4381 | V4DI/DF | 256 | 64 | Yes | RVVM1DI | 2048 |
4382 | V8DI/DF | 512 | 64 | Yes | RVVM1DI | 2048 |
4383 | V16DI/DF | 1024 | 64 | Yes | RVVM1DI | 2048 |
4384 | V32DI/DF | 2048 | 64 | NO | RVVM1DI | 2048 |
4385 | V64DI/DF | 4096 | 64 | NO | RVVM1DI | 2048 |
4386 | ... | ... | .. | ... | RVVM1DI | 2048 |
4387 | V512DI/DF | 32768 | 64 | NO | RVVM1DI | 2048 |
4388 +------------+-----------+-----------------+---------+-----------+----------+
4390 This gives the condition for a VLS mode in fixed-vlmax, i.e.:
4391 PRECISION (VLSmode) < VLEN / (64 / PRECISION (VLS_inner_mode)). */
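/* Worked example of the condition above: with vlen = 2048, V16QI has
   PRECISION = 128 and inner precision 8, so the limit is 2048 / (64 / 8) = 256
   and V16QI is enabled, while V32QI (PRECISION = 256) is not, matching the
   table above. */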
4392 bool
4393 vls_mode_valid_p (machine_mode vls_mode)
4395 if (!TARGET_VECTOR)
4396 return false;
4398 if (riscv_autovec_preference == RVV_SCALABLE)
4400 if (GET_MODE_CLASS (vls_mode) != MODE_VECTOR_BOOL
4401 && !ordered_p (TARGET_MAX_LMUL * BITS_PER_RISCV_VECTOR,
4402 GET_MODE_PRECISION (vls_mode)))
4403 /* We only enable VLS modes whose size is ordered against
4404 TARGET_MAX_LMUL * BITS_PER_RISCV_VECTOR.
4406 E.g. when TARGET_MAX_LMUL = 1 and BITS_PER_RISCV_VECTOR = (128, 128),
4407 we enable VLS modes with a fixed size <= 128 bits. For larger VLS
4408 modes ordered_p is false against the (128, 128) bit VLA size, and
4409 allowing them would end up causing multiple ICEs in generic
4410 middle-end code. */
4411 return false;
4412 return true;
4415 if (riscv_autovec_preference == RVV_FIXED_VLMAX)
4417 machine_mode inner_mode = GET_MODE_INNER (vls_mode);
4418 int precision = GET_MODE_PRECISION (inner_mode).to_constant ();
4419 int min_vlmax_bitsize = TARGET_MIN_VLEN / (64 / precision);
4421 return GET_MODE_PRECISION (vls_mode).to_constant () < min_vlmax_bitsize;
4424 return false;
4427 /* We don't have to convert a floating-point value to an integer when its
4428 fractional part is zero. Thus, there is a limit value for half,
4429 single and double precision floating point: a value has no
4430 fractional part once it is greater than or equal to that limit.
4432 1. Half floating point.
4433 +-----------+---------------+
4434 | float | binary layout |
4435 +-----------+---------------+
4436 | 1023.5 | 0x63ff |
4437 +-----------+---------------+
4438 | 1024.0 | 0x6400 |
4439 +-----------+---------------+
4440 | 1025.0 | 0x6401 |
4441 +-----------+---------------+
4442 | ... | ... |
4444 All half-precision floating-point values are unchanged by ceil if they
4445 are greater than or equal to 1024.
4447 2. Single floating point.
4448 +-----------+---------------+
4449 | float | binary layout |
4450 +-----------+---------------+
4451 | 8388607.5 | 0x4affffff |
4452 +-----------+---------------+
4453 | 8388608.0 | 0x4b000000 |
4454 +-----------+---------------+
4455 | 8388609.0 | 0x4b000001 |
4456 +-----------+---------------+
4457 | ... | ... |
4459 All single-precision floating-point values are unchanged by ceil if they
4460 are greater than or equal to 8388608.
4462 3. Double floating point.
4463 +--------------------+--------------------+
4464 | float | binary layout |
4465 +--------------------+--------------------+
4466 | 4503599627370495.5 | 0X432fffffffffffff |
4467 +--------------------+--------------------+
4468 | 4503599627370496.0 | 0X4330000000000000 |
4469 +--------------------+--------------------+
4470 | 4503599627370497.0 | 0X4330000000000001 |
4471 +--------------------+--------------------+
4472 | ... | ... |
4474 All double-precision floating-point values are unchanged by ceil if they
4475 are greater than or equal to 4503599627370496. */
4477 static rtx
4478 get_fp_rounding_coefficient (machine_mode inner_mode)
4480 REAL_VALUE_TYPE real;
4482 if (inner_mode == E_HFmode)
4483 real_from_integer (&real, inner_mode, 1024, SIGNED);
4484 else if (inner_mode == E_SFmode)
4485 real_from_integer (&real, inner_mode, 8388608, SIGNED);
4486 else if (inner_mode == E_DFmode)
4487 real_from_integer (&real, inner_mode, 4503599627370496, SIGNED);
4488 else
4489 gcc_unreachable ();
4491 return const_double_from_real_value (real, inner_mode);
4494 static rtx
4495 emit_vec_float_cmp_mask (rtx fp_vector, rtx_code code, rtx fp_scalar,
4496 machine_mode vec_fp_mode)
4498 /* Step-1: Prepare the scalar float compare register. */
4499 rtx fp_reg = gen_reg_rtx (GET_MODE_INNER (vec_fp_mode));
4500 emit_insn (gen_move_insn (fp_reg, fp_scalar));
4502 /* Step-2: Generate the mask. */
4503 machine_mode mask_mode = get_mask_mode (vec_fp_mode);
4504 rtx mask = gen_reg_rtx (mask_mode);
4505 rtx cmp = gen_rtx_fmt_ee (code, mask_mode, fp_vector, fp_reg);
4506 rtx cmp_ops[] = {mask, cmp, fp_vector, fp_reg};
4507 insn_code icode = code_for_pred_cmp_scalar (vec_fp_mode);
4508 emit_vlmax_insn (icode, COMPARE_OP, cmp_ops);
4510 return mask;
4513 static void
4514 emit_vec_copysign (rtx op_dest, rtx op_src_0, rtx op_src_1,
4515 machine_mode vec_mode)
4517 rtx sgnj_ops[] = {op_dest, op_src_0, op_src_1};
4518 insn_code icode = code_for_pred (UNSPEC_VCOPYSIGN, vec_mode);
4520 emit_vlmax_insn (icode, BINARY_OP, sgnj_ops);
4523 static void
4524 emit_vec_abs (rtx op_dest, rtx op_src, machine_mode vec_mode)
4526 rtx abs_ops[] = {op_dest, op_src};
4527 insn_code icode = code_for_pred (ABS, vec_mode);
4529 emit_vlmax_insn (icode, UNARY_OP, abs_ops);
4532 static void
4533 emit_vec_cvt_x_f (rtx op_dest, rtx op_src, rtx mask,
4534 insn_type type, machine_mode vec_mode)
4536 insn_code icode = code_for_pred_fcvt_x_f (UNSPEC_VFCVT, vec_mode);
4538 if (type & USE_VUNDEF_MERGE_P)
4540 rtx cvt_x_ops[] = {op_dest, mask, op_src};
4541 emit_vlmax_insn (icode, type, cvt_x_ops);
4543 else
4545 rtx cvt_x_ops[] = {op_dest, mask, op_dest, op_src};
4546 emit_vlmax_insn (icode, type, cvt_x_ops);
4550 static void
4551 emit_vec_cvt_x_f (rtx op_dest, rtx op_src, insn_type type,
4552 machine_mode vec_mode)
4554 rtx ops[] = {op_dest, op_src};
4555 insn_code icode = code_for_pred_fcvt_x_f (UNSPEC_VFCVT, vec_mode);
4557 emit_vlmax_insn (icode, type, ops);
4560 static void
4561 emit_vec_narrow_cvt_x_f (rtx op_dest, rtx op_src, insn_type type,
4562 machine_mode vec_mode)
4564 rtx ops[] = {op_dest, op_src};
4565 insn_code icode = code_for_pred_narrow_fcvt_x_f (UNSPEC_VFCVT, vec_mode);
4567 emit_vlmax_insn (icode, type, ops);
4570 static void
4571 emit_vec_widden_cvt_x_f (rtx op_dest, rtx op_src, insn_type type,
4572 machine_mode vec_mode)
4574 rtx ops[] = {op_dest, op_src};
4575 insn_code icode = code_for_pred_widen_fcvt_x_f (UNSPEC_VFCVT, vec_mode);
4577 emit_vlmax_insn (icode, type, ops);
4580 static void
4581 emit_vec_widden_cvt_f_f (rtx op_dest, rtx op_src, insn_type type,
4582 machine_mode vec_mode)
4584 rtx ops[] = {op_dest, op_src};
4585 insn_code icode = code_for_pred_extend (vec_mode);
4587 emit_vlmax_insn (icode, type, ops);
4590 static void
4591 emit_vec_cvt_f_x (rtx op_dest, rtx op_src, rtx mask,
4592 insn_type type, machine_mode vec_mode)
4594 rtx cvt_fp_ops[] = {op_dest, mask, op_dest, op_src};
4595 insn_code icode = code_for_pred (FLOAT, vec_mode);
4597 emit_vlmax_insn (icode, type, cvt_fp_ops);
4600 static void
4601 emit_vec_cvt_x_f_rtz (rtx op_dest, rtx op_src, rtx mask,
4602 insn_type type, machine_mode vec_mode)
4604 insn_code icode = code_for_pred (FIX, vec_mode);
4606 if (type & USE_VUNDEF_MERGE_P)
4608 rtx cvt_x_ops[] = {op_dest, mask, op_src};
4609 emit_vlmax_insn (icode, type, cvt_x_ops);
4611 else
4613 rtx cvt_x_ops[] = {op_dest, mask, op_dest, op_src};
4614 emit_vlmax_insn (icode, type, cvt_x_ops);
4618 void
4619 expand_vec_ceil (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
4620 machine_mode vec_int_mode)
4622 /* Step-1: Get the abs float value for mask generation. */
4623 emit_vec_abs (op_0, op_1, vec_fp_mode);
4625 /* Step-2: Generate the mask on const fp. */
4626 rtx const_fp = get_fp_rounding_coefficient (GET_MODE_INNER (vec_fp_mode));
4627 rtx mask = emit_vec_float_cmp_mask (op_0, LT, const_fp, vec_fp_mode);
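/* E.g. for SFmode, 3.2 and -0.3 have |x| < 8388608, so their lanes are
   active and get converted (giving 4.0 and, after the final copysign, -0.0);
   a lane holding -2^30 is inactive, keeps |x| from Step-1 through the
   mask-undisturbed steps and gets its sign back from the copysign in
   Step-5. */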
4629 /* Step-3: Convert to integer on mask, with rounding up (aka ceil). */
4630 rtx tmp = gen_reg_rtx (vec_int_mode);
4631 emit_vec_cvt_x_f (tmp, op_1, mask, UNARY_OP_TAMA_FRM_RUP, vec_fp_mode);
4633 /* Step-4: Convert back to floating-point on mask for the final result.
4634 To avoid an unnecessary frm register access, we reuse RUP here; it will
4635 never actually round up because the tmp rtx comes from the float-to-int
4636 conversion and is therefore already integral. */
4637 emit_vec_cvt_f_x (op_0, tmp, mask, UNARY_OP_TAMU_FRM_RUP, vec_fp_mode);
4639 /* Step-5: Retrieve the sign bit for -0.0. */
4640 emit_vec_copysign (op_0, op_0, op_1, vec_fp_mode);
4643 void
4644 expand_vec_floor (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
4645 machine_mode vec_int_mode)
4647 /* Step-1: Get the abs float value for mask generation. */
4648 emit_vec_abs (op_0, op_1, vec_fp_mode);
4650 /* Step-2: Generate the mask on const fp. */
4651 rtx const_fp = get_fp_rounding_coefficient (GET_MODE_INNER (vec_fp_mode));
4652 rtx mask = emit_vec_float_cmp_mask (op_0, LT, const_fp, vec_fp_mode);
4654 /* Step-3: Convert to integer on mask, with rounding down (aka floor). */
4655 rtx tmp = gen_reg_rtx (vec_int_mode);
4656 emit_vec_cvt_x_f (tmp, op_1, mask, UNARY_OP_TAMA_FRM_RDN, vec_fp_mode);
4658 /* Step-4: Convert to floating-point on mask for the floor result. */
4659 emit_vec_cvt_f_x (op_0, tmp, mask, UNARY_OP_TAMU_FRM_RDN, vec_fp_mode);
4661 /* Step-5: Retrieve the sign bit for -0.0. */
4662 emit_vec_copysign (op_0, op_0, op_1, vec_fp_mode);
4665 void
4666 expand_vec_nearbyint (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
4667 machine_mode vec_int_mode)
4669 /* Step-1: Get the abs float value for mask generation. */
4670 emit_vec_abs (op_0, op_1, vec_fp_mode);
4672 /* Step-2: Generate the mask on const fp. */
4673 rtx const_fp = get_fp_rounding_coefficient (GET_MODE_INNER (vec_fp_mode));
4674 rtx mask = emit_vec_float_cmp_mask (op_0, LT, const_fp, vec_fp_mode);
4676 /* Step-3: Back up the FP exception flags; nearbyint never raises exceptions. */
4677 rtx fflags = gen_reg_rtx (SImode);
4678 emit_insn (gen_riscv_frflags (fflags));
4680 /* Step-4: Convert to integer on mask, with dynamic rounding (aka nearbyint). */
4681 rtx tmp = gen_reg_rtx (vec_int_mode);
4682 emit_vec_cvt_x_f (tmp, op_1, mask, UNARY_OP_TAMA_FRM_DYN, vec_fp_mode);
4684 /* Step-5: Convert to floating-point on mask for the nearbyint result. */
4685 emit_vec_cvt_f_x (op_0, tmp, mask, UNARY_OP_TAMU_FRM_DYN, vec_fp_mode);
4687 /* Step-6: Restore FP exception flags. */
4688 emit_insn (gen_riscv_fsflags (fflags));
4690 /* Step-7: Retrieve the sign bit for -0.0. */
4691 emit_vec_copysign (op_0, op_0, op_1, vec_fp_mode);
4694 void
4695 expand_vec_rint (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
4696 machine_mode vec_int_mode)
4698 /* Step-1: Get the abs float value for mask generation. */
4699 emit_vec_abs (op_0, op_1, vec_fp_mode);
4701 /* Step-2: Generate the mask on const fp. */
4702 rtx const_fp = get_fp_rounding_coefficient (GET_MODE_INNER (vec_fp_mode));
4703 rtx mask = emit_vec_float_cmp_mask (op_0, LT, const_fp, vec_fp_mode);
4705 /* Step-3: Convert to integer on mask, with dyn rounding (aka rint). */
4706 rtx tmp = gen_reg_rtx (vec_int_mode);
4707 emit_vec_cvt_x_f (tmp, op_1, mask, UNARY_OP_TAMA_FRM_DYN, vec_fp_mode);
4709 /* Step-4: Convert to floating-point on mask for the rint result. */
4710 emit_vec_cvt_f_x (op_0, tmp, mask, UNARY_OP_TAMU_FRM_DYN, vec_fp_mode);
4712 /* Step-5: Retrieve the sign bit for -0.0. */
4713 emit_vec_copysign (op_0, op_0, op_1, vec_fp_mode);
4716 void
4717 expand_vec_round (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
4718 machine_mode vec_int_mode)
4720 /* Step-1: Get the abs float value for mask generation. */
4721 emit_vec_abs (op_0, op_1, vec_fp_mode);
4723 /* Step-2: Generate the mask on const fp. */
4724 rtx const_fp = get_fp_rounding_coefficient (GET_MODE_INNER (vec_fp_mode));
4725 rtx mask = emit_vec_float_cmp_mask (op_0, LT, const_fp, vec_fp_mode);
4727 /* Step-3: Convert to integer on mask, rounding to nearest (aka round). */
4728 rtx tmp = gen_reg_rtx (vec_int_mode);
4729 emit_vec_cvt_x_f (tmp, op_1, mask, UNARY_OP_TAMA_FRM_RMM, vec_fp_mode);
4731 /* Step-4: Convert to floating-point on mask for the round result. */
4732 emit_vec_cvt_f_x (op_0, tmp, mask, UNARY_OP_TAMU_FRM_RMM, vec_fp_mode);
4734 /* Step-5: Retrieve the sign bit for -0.0. */
4735 emit_vec_copysign (op_0, op_0, op_1, vec_fp_mode);
4738 void
4739 expand_vec_trunc (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
4740 machine_mode vec_int_mode)
4742 /* Step-1: Get the abs float value for mask generation. */
4743 emit_vec_abs (op_0, op_1, vec_fp_mode);
4745 /* Step-2: Generate the mask on const fp. */
4746 rtx const_fp = get_fp_rounding_coefficient (GET_MODE_INNER (vec_fp_mode));
4747 rtx mask = emit_vec_float_cmp_mask (op_0, LT, const_fp, vec_fp_mode);
4749 /* Step-3: Convert to integer on mask, rounding to zero (aka truncate). */
4750 rtx tmp = gen_reg_rtx (vec_int_mode);
4751 emit_vec_cvt_x_f_rtz (tmp, op_1, mask, UNARY_OP_TAMA, vec_fp_mode);
4753 /* Step-4: Convert to floating-point on mask for the truncation result. */
4754 emit_vec_cvt_f_x (op_0, tmp, mask, UNARY_OP_TAMU_FRM_DYN, vec_fp_mode);
4756 /* Step-5: Retrieve the sign bit for -0.0. */
4757 emit_vec_copysign (op_0, op_0, op_1, vec_fp_mode);
4760 void
4761 expand_vec_roundeven (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
4762 machine_mode vec_int_mode)
4764 /* Step-1: Get the abs float value for mask generation. */
4765 emit_vec_abs (op_0, op_1, vec_fp_mode);
4767 /* Step-2: Generate the mask on const fp. */
4768 rtx const_fp = get_fp_rounding_coefficient (GET_MODE_INNER (vec_fp_mode));
4769 rtx mask = emit_vec_float_cmp_mask (op_0, LT, const_fp, vec_fp_mode);
4771 /* Step-3: Convert to integer on mask, rounding to nearest, ties to even. */
4772 rtx tmp = gen_reg_rtx (vec_int_mode);
4773 emit_vec_cvt_x_f (tmp, op_1, mask, UNARY_OP_TAMA_FRM_RNE, vec_fp_mode);
4775 /* Step-4: Convert to floating-point on mask for the roundeven result. */
4776 emit_vec_cvt_f_x (op_0, tmp, mask, UNARY_OP_TAMU_FRM_RNE, vec_fp_mode);
4778 /* Step-5: Retrieve the sign bit for -0.0. */
4779 emit_vec_copysign (op_0, op_0, op_1, vec_fp_mode);
4782 /* Handle the rounding from floating-point to int/long/long long. */
4783 static void
4784 emit_vec_rounding_to_integer (rtx op_0, rtx op_1, insn_type type,
4785 machine_mode vec_fp_mode,
4786 machine_mode vec_int_mode,
4787 machine_mode vec_bridge_mode = E_VOIDmode)
4789 poly_uint16 vec_fp_size = GET_MODE_SIZE (vec_fp_mode);
4790 poly_uint16 vec_int_size = GET_MODE_SIZE (vec_int_mode);
4792 if (known_eq (vec_fp_size, vec_int_size)) /* SF => SI, DF => DI. */
4793 emit_vec_cvt_x_f (op_0, op_1, type, vec_fp_mode);
4794 else if (maybe_eq (vec_fp_size, vec_int_size * 2)) /* DF => SI. */
4795 emit_vec_narrow_cvt_x_f (op_0, op_1, type, vec_fp_mode);
4796 else if (maybe_eq (vec_fp_size * 2, vec_int_size)) /* SF => DI, HF => SI. */
4797 emit_vec_widden_cvt_x_f (op_0, op_1, type, vec_int_mode);
4798 else if (maybe_eq (vec_fp_size * 4, vec_int_size)) /* HF => DI. */
4800 gcc_assert (vec_bridge_mode != E_VOIDmode);
4802 rtx op_sf = gen_reg_rtx (vec_bridge_mode);
4804 /* Step-1: HF => SF, no rounding here. */
4805 emit_vec_widden_cvt_f_f (op_sf, op_1, UNARY_OP, vec_bridge_mode);
4806 /* Step-2: SF => DI. */
4807 emit_vec_widden_cvt_x_f (op_0, op_sf, type, vec_int_mode);
4809 else
4810 gcc_unreachable ();
4813 void
4814 expand_vec_lrint (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
4815 machine_mode vec_int_mode, machine_mode vec_bridge_mode)
4817 emit_vec_rounding_to_integer (op_0, op_1, UNARY_OP_FRM_DYN, vec_fp_mode,
4818 vec_int_mode, vec_bridge_mode);
4821 void
4822 expand_vec_lround (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
4823 machine_mode vec_int_mode, machine_mode vec_bridge_mode)
4825 emit_vec_rounding_to_integer (op_0, op_1, UNARY_OP_FRM_RMM, vec_fp_mode,
4826 vec_int_mode, vec_bridge_mode);
4829 void
4830 expand_vec_lceil (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
4831 machine_mode vec_int_mode)
4833 emit_vec_rounding_to_integer (op_0, op_1, UNARY_OP_FRM_RUP, vec_fp_mode,
4834 vec_int_mode);
4837 void
4838 expand_vec_lfloor (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
4839 machine_mode vec_int_mode)
4841 emit_vec_rounding_to_integer (op_0, op_1, UNARY_OP_FRM_RDN, vec_fp_mode,
4842 vec_int_mode);
4845 /* Vectorize popcount by the Wilkes-Wheeler-Gill algorithm that libgcc uses as
4846 well. */
4847 void
4848 expand_popcount (rtx *ops)
4850 rtx dst = ops[0];
4851 rtx src = ops[1];
4852 machine_mode mode = GET_MODE (dst);
4853 scalar_mode imode = GET_MODE_INNER (mode);
4854 static const uint64_t m5 = 0x5555555555555555ULL;
4855 static const uint64_t m3 = 0x3333333333333333ULL;
4856 static const uint64_t mf = 0x0F0F0F0F0F0F0F0FULL;
4857 static const uint64_t m1 = 0x0101010101010101ULL;
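/* Worked example on a single 8-bit element, src = 0xa7 (popcount 5):
   x1 = 0xa7 - (0x53 & 0x55) = 0x56 (2-bit pair counts 1, 1, 1, 2),
   x2 = (0x56 & 0x33) + (0x15 & 0x33) = 0x23 (nibble counts 2 and 3),
   x3 = (0x23 + 0x02) & 0x0f = 0x05,
   dst = (0x05 * 0x01) >> 0 = 5. */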
4859 rtx x1 = gen_reg_rtx (mode);
4860 rtx x2 = gen_reg_rtx (mode);
4861 rtx x3 = gen_reg_rtx (mode);
4862 rtx x4 = gen_reg_rtx (mode);
4864 /* x1 = src - ((src >> 1) & 0x555...); */
4865 rtx shift1 = expand_binop (mode, lshr_optab, src, GEN_INT (1), NULL, true,
4866 OPTAB_DIRECT);
4868 rtx and1 = gen_reg_rtx (mode);
4869 rtx ops1[] = {and1, shift1, gen_int_mode (m5, imode)};
4870 emit_vlmax_insn (code_for_pred_scalar (AND, mode), riscv_vector::BINARY_OP,
4871 ops1);
4873 x1 = expand_binop (mode, sub_optab, src, and1, NULL, true, OPTAB_DIRECT);
4875 /* x2 = (x1 & 0x3333333333333333ULL) + ((x1 >> 2) & 0x3333333333333333ULL); */
4877 rtx and2 = gen_reg_rtx (mode);
4878 rtx ops2[] = {and2, x1, gen_int_mode (m3, imode)};
4879 emit_vlmax_insn (code_for_pred_scalar (AND, mode), riscv_vector::BINARY_OP,
4880 ops2);
4882 rtx shift2 = expand_binop (mode, lshr_optab, x1, GEN_INT (2), NULL, true,
4883 OPTAB_DIRECT);
4885 rtx and22 = gen_reg_rtx (mode);
4886 rtx ops22[] = {and22, shift2, gen_int_mode (m3, imode)};
4887 emit_vlmax_insn (code_for_pred_scalar (AND, mode), riscv_vector::BINARY_OP,
4888 ops22);
4890 x2 = expand_binop (mode, add_optab, and2, and22, NULL, true, OPTAB_DIRECT);
4892 /* x3 = (x2 + (x2 >> 4)) & 0x0f0f0f0f0f0f0f0fULL; */
4893 rtx shift3 = expand_binop (mode, lshr_optab, x2, GEN_INT (4), NULL, true,
4894 OPTAB_DIRECT);
4896 rtx plus3
4897 = expand_binop (mode, add_optab, x2, shift3, NULL, true, OPTAB_DIRECT);
4899 rtx ops3[] = {x3, plus3, gen_int_mode (mf, imode)};
4900 emit_vlmax_insn (code_for_pred_scalar (AND, mode), riscv_vector::BINARY_OP,
4901 ops3);
4903 /* dest = (x3 * 0x0101010101010101ULL) >> 56; */
4904 rtx mul4 = gen_reg_rtx (mode);
4905 rtx ops4[] = {mul4, x3, gen_int_mode (m1, imode)};
4906 emit_vlmax_insn (code_for_pred_scalar (MULT, mode), riscv_vector::BINARY_OP,
4907 ops4);
4909 x4 = expand_binop (mode, lshr_optab, mul4,
4910 GEN_INT (GET_MODE_BITSIZE (imode) - 8), NULL, true,
4911 OPTAB_DIRECT);
4913 emit_move_insn (dst, x4);
4916 /* Return true if it is VLMAX AVL TYPE. */
4917 bool
4918 vlmax_avl_type_p (rtx_insn *rinsn)
4920 extract_insn_cached (rinsn);
4921 int index = get_attr_avl_type_idx (rinsn);
4922 if (index == INVALID_ATTRIBUTE)
4923 return false;
4924 rtx avl_type = recog_data.operand[index];
4925 return INTVAL (avl_type) == VLMAX;
4928 /* Return true if it is an RVV instruction that depends on the VL global
4929 status register. */
4930 bool
4931 has_vl_op (rtx_insn *rinsn)
4933 return recog_memoized (rinsn) >= 0 && get_attr_has_vl_op (rinsn);
4936 /* Get default tail policy. */
4937 static bool
4938 get_default_ta ()
4940 /* For instructions that don't require TA, we still need a default value
4941 to emit vsetvl. We pick the default value according to the preferred policy. */
4942 return (bool) (get_prefer_tail_policy () & 0x1
4943 || (get_prefer_tail_policy () >> 1 & 0x1));
4946 /* Helper function to get TA operand. */
4947 bool
4948 tail_agnostic_p (rtx_insn *rinsn)
4950 /* If it doesn't have TA, we return agnostic by default. */
4951 extract_insn_cached (rinsn);
4952 int ta = get_attr_ta (rinsn);
4953 return ta == INVALID_ATTRIBUTE ? get_default_ta () : IS_AGNOSTIC (ta);
4956 /* Change the insn and assert that the change always succeeds. */
4957 void
4958 validate_change_or_fail (rtx object, rtx *loc, rtx new_rtx, bool in_group)
4960 bool change_p = validate_change (object, loc, new_rtx, in_group);
4961 gcc_assert (change_p);
4964 /* Return true if it is NONVLMAX AVL TYPE. */
4965 bool
4966 nonvlmax_avl_type_p (rtx_insn *rinsn)
4968 extract_insn_cached (rinsn);
4969 int index = get_attr_avl_type_idx (rinsn);
4970 if (index == INVALID_ATTRIBUTE)
4971 return false;
4972 rtx avl_type = recog_data.operand[index];
4973 return INTVAL (avl_type) == NONVLMAX;
4976 /* Return true if RTX is RVV VLMAX AVL. */
4977 bool
4978 vlmax_avl_p (rtx x)
4980 return x && rtx_equal_p (x, RVV_VLMAX);
4983 /* Helper function to get the SEW operand. We always have a SEW value for
4984 all RVV instructions that have a VTYPE operand. */
4985 uint8_t
4986 get_sew (rtx_insn *rinsn)
4988 return get_attr_sew (rinsn);
4991 /* Helper function to get the VLMUL operand. We always have a VLMUL value for
4992 all RVV instructions that have a VTYPE operand. */
4993 enum vlmul_type
4994 get_vlmul (rtx_insn *rinsn)
4996 return (enum vlmul_type) get_attr_vlmul (rinsn);
4999 /* Count the number of occurrences of REGNO in RINSN. */
5000 int
5001 count_regno_occurrences (rtx_insn *rinsn, unsigned int regno)
5003 int count = 0;
5004 extract_insn (rinsn);
5005 for (int i = 0; i < recog_data.n_operands; i++)
5006 if (refers_to_regno_p (regno, recog_data.operand[i]))
5007 count++;
5008 return count;
5012 /* Return true if OP can be broadcast directly. */
5012 bool
5013 can_be_broadcasted_p (rtx op)
5015 machine_mode mode = GET_MODE (op);
5016 /* We don't allow RA (register allocation) reload to generate
5017 (vec_duplicate:DI reg) on an RV32 system, whereas we do allow
5018 (vec_duplicate:DI mem). */
5019 if (!can_create_pseudo_p () && !FLOAT_MODE_P (mode)
5020 && maybe_gt (GET_MODE_SIZE (mode), GET_MODE_SIZE (Pmode))
5021 && !satisfies_constraint_Wdm (op))
5022 return false;
5024 if (satisfies_constraint_K (op) || register_operand (op, mode)
5025 || satisfies_constraint_Wdm (op) || rtx_equal_p (op, CONST0_RTX (mode)))
5026 return true;
5028 return can_create_pseudo_p () && nonmemory_operand (op, mode);
5031 void
5032 emit_vec_extract (rtx target, rtx src, rtx index)
5034 machine_mode vmode = GET_MODE (src);
5035 machine_mode smode = GET_MODE (target);
5036 class expand_operand ops[3];
5037 enum insn_code icode
5038 = convert_optab_handler (vec_extract_optab, vmode, smode);
5039 gcc_assert (icode != CODE_FOR_nothing);
5040 create_output_operand (&ops[0], target, smode);
5041 ops[0].target = 1;
5042 create_input_operand (&ops[1], src, vmode);
5044 poly_int64 val;
5045 if (poly_int_rtx_p (index, &val))
5046 create_integer_operand (&ops[2], val);
5047 else
5048 create_input_operand (&ops[2], index, Pmode);
5050 expand_insn (icode, 3, ops);
5051 if (ops[0].value != target)
5052 emit_move_insn (target, ops[0].value);
5055 /* Return true if the offset mode is a valid mode that we can use for
5056 gather/scatter auto-vectorization. */
5057 bool
5058 gather_scatter_valid_offset_p (machine_mode mode)
5060 /* If the element size of offset mode is already >= Pmode size,
5061 we don't need any extensions. */
5062 if (known_ge (GET_MODE_SIZE (GET_MODE_INNER (mode)), UNITS_PER_WORD))
5063 return true;
5065 /* Since we will very likely extend the offset mode into a vector of Pmode
5066 elements, disable gather/scatter auto-vectorization if we can't extend
5067 the offset mode into vector Pmode. */
5068 if (!get_vector_mode (Pmode, GET_MODE_NUNITS (mode)).exists ())
5069 return false;
5070 return true;
5073 /* Implement TARGET_ESTIMATED_POLY_VALUE.
5074 Look into the tuning structure for an estimate.
5075 KIND specifies the type of requested estimate: min, max or likely.
5076 For cores with a known VLA width all three estimates are the same.
5077 For generic VLA tuning we want to distinguish the maximum estimate from
5078 the minimum and likely ones.
5079 The likely estimate is the same as the minimum in that case to give
5080 conservative behavior: auto-vectorize with VLA only when it is a win
5081 even for the minimum vector length.
5082 When VLA width information is available VAL.coeffs[1] is multiplied by
5083 the number of VLA chunks over the initial VLS bits. */
5084 HOST_WIDE_INT
5085 estimated_poly_value (poly_int64 val, unsigned int kind)
5087 unsigned int width_source
5088 = BITS_PER_RISCV_VECTOR.is_constant ()
5089 ? (unsigned int) BITS_PER_RISCV_VECTOR.to_constant ()
5090 : (unsigned int) RVV_SCALABLE;
5092 /* If there is no core-specific information then the minimum and likely
5093 values are based on TARGET_MIN_VLEN vectors and the maximum is based on
5094 the architectural maximum of 65536 bits. */
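/* E.g. with TARGET_MIN_VLEN = 128, a poly value of (4, 4) gives a minimum
   and likely estimate of 4 and a maximum estimate of 4 + 4 * 15 = 64 in the
   RVV_SCALABLE case below. */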
5095 unsigned int min_vlen_bytes = TARGET_MIN_VLEN / 8 - 1;
5096 if (width_source == RVV_SCALABLE)
5097 switch (kind)
5099 case POLY_VALUE_MIN:
5100 case POLY_VALUE_LIKELY:
5101 return val.coeffs[0];
5103 case POLY_VALUE_MAX:
5104 return val.coeffs[0] + val.coeffs[1] * min_vlen_bytes;
5107 /* Allow BITS_PER_RISCV_VECTOR to be a bitmask of different VL, treating the
5108 lowest as likely. This could be made more general if future -mtune
5109 options need it to be. */
5110 if (kind == POLY_VALUE_MAX)
5111 width_source = 1 << floor_log2 (width_source);
5112 else
5113 width_source = least_bit_hwi (width_source);
5115 /* If the core provides width information, use that. */
5116 HOST_WIDE_INT over_min_vlen = width_source - TARGET_MIN_VLEN;
5117 return val.coeffs[0] + val.coeffs[1] * over_min_vlen / TARGET_MIN_VLEN;
5120 } // namespace riscv_vector