gcc/config/riscv/riscv-v.cc
1 /* Subroutines used for code generation for RISC-V 'V' Extension for
2 GNU compiler.
3 Copyright (C) 2022-2024 Free Software Foundation, Inc.
4 Contributed by Juzhe Zhong (juzhe.zhong@rivai.ai), RiVAI Technologies Ltd.
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it
9 under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 3, or (at your option)
11 any later version.
13 GCC is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #define IN_TARGET_CODE 1
24 /* We have a maximum of 11 operands for RVV instruction patterns according to
25 the vector.md. */
26 #define RVV_INSN_OPERANDS_MAX 11
28 #include "config.h"
29 #include "system.h"
30 #include "coretypes.h"
31 #include "tm.h"
32 #include "backend.h"
33 #include "rtl.h"
34 #include "insn-config.h"
35 #include "insn-attr.h"
36 #include "recog.h"
37 #include "alias.h"
38 #include "tree.h"
39 #include "stringpool.h"
40 #include "attribs.h"
41 #include "explow.h"
42 #include "memmodel.h"
43 #include "emit-rtl.h"
44 #include "tm_p.h"
45 #include "target.h"
46 #include "targhooks.h"
47 #include "expr.h"
48 #include "optabs.h"
49 #include "tm-constrs.h"
50 #include "rtx-vector-builder.h"
51 #include "targhooks.h"
52 #include "predict.h"
53 #include "errors.h"
54 #include "riscv-v.h"
56 using namespace riscv_vector;
58 namespace riscv_vector {
60 /* Return true if NUNITS <= 31, so that we can use an immediate AVL in vsetivli. */
61 bool
62 imm_avl_p (machine_mode mode)
64 poly_uint64 nunits = GET_MODE_NUNITS (mode);
66 return nunits.is_constant ()
68 /* vsetivli can only hold an immediate AVL in the range 0 ~ 31. */
68 ? (IN_RANGE (nunits.to_constant (), 0, 31))
69 /* Only allowed in VLS-VLMAX mode. */
70 : false;
73 /* Return true if LEN is equal to the NUNITS of MODE (which may be outside the range [0, 31]). */
74 static bool
75 is_vlmax_len_p (machine_mode mode, rtx len)
77 poly_int64 value;
78 return poly_int_rtx_p (len, &value)
79 && known_eq (value, GET_MODE_NUNITS (mode));
82 /* Helper functions for insn_flags and insn_types. */
84 /* Return true if the caller needs to pass a mask operand for an insn
85 pattern with INSN_FLAGS. */
87 static bool
88 need_mask_operand_p (unsigned insn_flags)
90 return (insn_flags & HAS_MASK_P)
91 && !(insn_flags & (USE_ONE_TRUE_MASK_P | USE_ALL_TRUES_MASK_P));
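/* For example, the gather expanders below pass BINARY_OP_TAMU together with
   an explicit mask operand (see emit_vlmax_masked_gather_mu_insn), while
   plain BINARY_OP uses an implicit all-trues mask and therefore needs no
   mask operand from the caller (see emit_vlmax_gather_insn).  */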
94 template <int MAX_OPERANDS> class insn_expander
96 public:
97 insn_expander () = delete;
99 insn_expander (unsigned insn_flags, bool vlmax_p)
100 : m_insn_flags (insn_flags), m_opno (0), m_vlmax_p (vlmax_p),
101 m_vl_op (NULL_RTX)
103 check_insn_flags ();
106 void check_insn_flags () const
108 if (m_insn_flags & USE_ONE_TRUE_MASK_P)
109 /* USE_ONE_TRUE_MASK_P is dependent on HAS_MASK_P. */
110 gcc_assert ((m_insn_flags & HAS_MASK_P));
112 if (m_insn_flags & USE_ALL_TRUES_MASK_P)
113 /* USE_ALL_TRUES_MASK_P is dependent on HAS_MASK_P. */
114 gcc_assert ((m_insn_flags & HAS_MASK_P));
116 /* USE_ONE_TRUE_MASK_P and USE_ALL_TRUES_MASK_P are mutually exclusive. */
117 gcc_assert (!((m_insn_flags & USE_ONE_TRUE_MASK_P)
118 && (m_insn_flags & USE_ALL_TRUES_MASK_P)));
120 if (m_insn_flags & USE_VUNDEF_MERGE_P)
121 /* USE_VUNDEF_MERGE_P is dependent on HAS_MERGE_P. */
122 gcc_assert ((m_insn_flags & HAS_MERGE_P));
124 /* TU_POLICY_P and TDEFAULT_POLICY_P are mutually exclusive. */
125 gcc_assert (
126 !((m_insn_flags & TU_POLICY_P) && (m_insn_flags & TDEFAULT_POLICY_P)));
128 /* MU_POLICY_P and MDEFAULT_POLICY_P are mutually exclusive. */
129 gcc_assert (
130 !((m_insn_flags & MU_POLICY_P) && (m_insn_flags & MDEFAULT_POLICY_P)));
132 /* NULLARY_OP_P, UNARY_OP_P, BINARY_OP_P, TERNARY_OP_P are mutually
133 exclusive. */
134 gcc_assert (
135 !((m_insn_flags & NULLARY_OP_P)
136 && ((m_insn_flags & UNARY_OP_P) || (m_insn_flags & BINARY_OP_P)
137 || (m_insn_flags & TERNARY_OP_P))));
138 gcc_assert (
139 !((m_insn_flags & UNARY_OP_P)
140 && ((m_insn_flags & NULLARY_OP_P) || (m_insn_flags & BINARY_OP_P)
141 || (m_insn_flags & TERNARY_OP_P))));
142 gcc_assert (
143 !((m_insn_flags & BINARY_OP_P)
144 && ((m_insn_flags & NULLARY_OP_P) || (m_insn_flags & UNARY_OP_P)
145 || (m_insn_flags & TERNARY_OP_P))));
146 gcc_assert (
147 !((m_insn_flags & TERNARY_OP_P)
148 && ((m_insn_flags & NULLARY_OP_P) || (m_insn_flags & UNARY_OP_P)
149 || (m_insn_flags & BINARY_OP_P))));
152 void set_vl (rtx vl) { m_vl_op = vl; }
154 void add_output_operand (rtx x, machine_mode mode)
156 create_output_operand (&m_ops[m_opno++], x, mode);
157 gcc_assert (m_opno <= MAX_OPERANDS);
159 void add_input_operand (rtx x, machine_mode mode)
161 create_input_operand (&m_ops[m_opno++], x, mode);
162 gcc_assert (m_opno <= MAX_OPERANDS);
164 void add_all_one_mask_operand (machine_mode mask_mode)
166 add_input_operand (CONSTM1_RTX (mask_mode), mask_mode);
168 void add_first_one_true_mask_operand (machine_mode mask_mode)
170 add_input_operand (gen_scalar_move_mask (mask_mode), mask_mode);
172 void add_vundef_operand (machine_mode dest_mode)
174 add_input_operand (RVV_VUNDEF (dest_mode), dest_mode);
176 void add_policy_operand ()
178 if (m_insn_flags & TU_POLICY_P)
180 rtx tail_policy_rtx = gen_int_mode (TAIL_UNDISTURBED, Pmode);
181 add_input_operand (tail_policy_rtx, Pmode);
183 else if (m_insn_flags & TDEFAULT_POLICY_P)
185 rtx tail_policy_rtx = gen_int_mode (get_prefer_tail_policy (), Pmode);
186 add_input_operand (tail_policy_rtx, Pmode);
189 if (m_insn_flags & MU_POLICY_P)
191 rtx mask_policy_rtx = gen_int_mode (MASK_UNDISTURBED, Pmode);
192 add_input_operand (mask_policy_rtx, Pmode);
194 else if (m_insn_flags & MDEFAULT_POLICY_P)
196 rtx mask_policy_rtx = gen_int_mode (get_prefer_mask_policy (), Pmode);
197 add_input_operand (mask_policy_rtx, Pmode);
200 void add_avl_type_operand (avl_type type)
202 add_input_operand (gen_int_mode (type, Pmode), Pmode);
205 void
206 add_rounding_mode_operand (enum floating_point_rounding_mode rounding_mode)
208 rtx frm_rtx = gen_int_mode (rounding_mode, Pmode);
209 add_input_operand (frm_rtx, Pmode);
212 void
213 add_rounding_mode_operand (enum fixed_point_rounding_mode rounding_mode)
215 rtx frm_rtx = gen_int_mode (rounding_mode, Pmode);
216 add_input_operand (frm_rtx, Pmode);
219 /* Return the vtype mode based on insn_flags.
220 The vtype mode is the mode that the vsetvl insn sets. */
221 machine_mode
222 get_vtype_mode (rtx *ops)
224 machine_mode vtype_mode;
225 if (m_insn_flags & VTYPE_MODE_FROM_OP1_P)
226 vtype_mode = GET_MODE (ops[1]);
227 else
228 vtype_mode = GET_MODE (ops[0]);
229 return vtype_mode;
232 void emit_insn (enum insn_code icode, rtx *ops)
234 int opno = 0;
235 int num_ops;
236 /* True if any operand is a memory operand. */
237 bool any_mem_p = false;
239 machine_mode vtype_mode = get_vtype_mode (ops);
240 machine_mode mask_mode = get_mask_mode (vtype_mode);
242 /* Add dest operand. */
243 if (m_insn_flags & HAS_DEST_P)
245 rtx op = ops[opno++];
246 any_mem_p |= MEM_P (op);
247 add_output_operand (op, GET_MODE (op));
250 /* Add mask operand. */
251 if (m_insn_flags & USE_ONE_TRUE_MASK_P)
252 add_first_one_true_mask_operand (mask_mode);
253 else if (m_insn_flags & USE_ALL_TRUES_MASK_P)
254 add_all_one_mask_operand (mask_mode);
255 else if (m_insn_flags & HAS_MASK_P)
257 machine_mode mode = insn_data[(int) icode].operand[m_opno].mode;
258 gcc_assert (mode != VOIDmode);
259 add_input_operand (ops[opno++], mode);
262 /* Add merge operand. */
263 if (m_insn_flags & USE_VUNDEF_MERGE_P)
264 /* Same as dest operand. */
265 add_vundef_operand (GET_MODE (ops[0]));
266 else if (m_insn_flags & HAS_MERGE_P)
268 machine_mode mode = insn_data[(int) icode].operand[m_opno].mode;
269 gcc_assert (mode != VOIDmode);
270 add_input_operand (ops[opno++], mode);
273 if (m_insn_flags & NULLARY_OP_P)
274 num_ops = 0;
275 else if (m_insn_flags & UNARY_OP_P)
276 num_ops = 1;
277 else if (m_insn_flags & BINARY_OP_P)
278 num_ops = 2;
279 else if (m_insn_flags & TERNARY_OP_P)
280 num_ops = 3;
281 else
282 gcc_unreachable ();
284 /* Add the remaining operands. */
285 for (; num_ops; num_ops--, opno++)
287 any_mem_p |= MEM_P (ops[opno]);
288 machine_mode mode = insn_data[(int) icode].operand[m_opno].mode;
289 /* create_input_operand doesn't allow VOIDmode.
290 According to vector.md, some patterns do not specify an explicit
291 machine mode for an operand. Such operands are
292 always Pmode. */
293 if (mode == VOIDmode)
294 mode = Pmode;
296 /* Assert early that the modes match, since maybe_legitimize_operand
297 will check this later anyway. */
298 machine_mode required_mode = GET_MODE (ops[opno]);
299 if (required_mode != VOIDmode && required_mode != mode)
300 internal_error ("expected mode %s for operand %d of "
301 "insn %s but got mode %s.\n",
302 GET_MODE_NAME (mode),
303 opno,
304 insn_data[(int) icode].name,
305 GET_MODE_NAME (required_mode));
307 add_input_operand (ops[opno], mode);
310 /* Add vl operand. */
311 rtx len = m_vl_op;
312 bool vls_p = false;
313 if (m_vlmax_p)
315 if (riscv_v_ext_vls_mode_p (vtype_mode))
317 /* VLS modes always set VL with
318 "vsetvli zero, rs1" or "vsetivli zero, imm". */
319 poly_uint64 nunits = GET_MODE_NUNITS (vtype_mode);
320 len = gen_int_mode (nunits, Pmode);
321 vls_p = true;
323 else if (can_create_pseudo_p ())
325 len = gen_reg_rtx (Pmode);
326 emit_vlmax_vsetvl (vtype_mode, len);
330 gcc_assert (len != NULL_RTX);
331 add_input_operand (len, Pmode);
333 /* Add tail and mask policy operands. */
334 add_policy_operand ();
336 /* Add avl_type operand. */
337 add_avl_type_operand (
338 vls_p ? avl_type::VLS
339 : (m_vlmax_p ? avl_type::VLMAX : avl_type::NONVLMAX));
341 /* Add rounding mode operand. */
342 if (m_insn_flags & FRM_DYN_P)
343 add_rounding_mode_operand (FRM_DYN);
344 else if (m_insn_flags & FRM_RUP_P)
345 add_rounding_mode_operand (FRM_RUP);
346 else if (m_insn_flags & FRM_RDN_P)
347 add_rounding_mode_operand (FRM_RDN);
348 else if (m_insn_flags & FRM_RMM_P)
349 add_rounding_mode_operand (FRM_RMM);
350 else if (m_insn_flags & FRM_RNE_P)
351 add_rounding_mode_operand (FRM_RNE);
352 else if (m_insn_flags & VXRM_RNU_P)
353 add_rounding_mode_operand (VXRM_RNU);
354 else if (m_insn_flags & VXRM_RDN_P)
355 add_rounding_mode_operand (VXRM_RDN);
358 if (insn_data[(int) icode].n_operands != m_opno)
359 internal_error ("invalid number of operands for insn %s, "
360 "expected %d but got %d.\n",
361 insn_data[(int) icode].name,
362 insn_data[(int) icode].n_operands, m_opno);
364 expand (icode, any_mem_p);
367 void expand (enum insn_code icode, bool temporary_volatile_p = false)
369 if (temporary_volatile_p)
371 temporary_volatile_ok v (true);
372 expand_insn (icode, m_opno, m_ops);
374 else
375 expand_insn (icode, m_opno, m_ops);
378 private:
379 unsigned m_insn_flags;
380 int m_opno;
381 bool m_vlmax_p;
382 rtx m_vl_op;
383 expand_operand m_ops[MAX_OPERANDS];
386 /* Emit an RVV insn with a vector length that equals the number of units of the
387 vector mode. For VLA modes this corresponds to VLMAX.
389 Unless the vector length can be encoded in the vsetivli instruction, this
390 function must only be used as long as we can create pseudo registers. This is
391 because it will set a pseudo register to VLMAX using vsetvl and use it as the
392 definition of the vector length. */
393 void
394 emit_vlmax_insn (unsigned icode, unsigned insn_flags, rtx *ops)
396 insn_expander<RVV_INSN_OPERANDS_MAX> e (insn_flags, true);
397 gcc_assert (can_create_pseudo_p () || imm_avl_p (e.get_vtype_mode (ops)));
399 e.emit_insn ((enum insn_code) icode, ops);
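/* A typical use emits a whole-vector operation with VLMAX length, e.g.
   (as done in expand_const_vector below):

     rtx ops[] = {result, src};
     emit_vlmax_insn (code_for_pred_mov (mode), UNARY_OP, ops);

   which, for a valid vector immediate SRC, ends up as a vmv.v.i style
   move covering all elements.  */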
402 /* Like emit_vlmax_insn but must only be used when we cannot create pseudo
403 registers anymore. This function, however, takes a predefined vector length
404 from the value in VL. */
405 void
406 emit_vlmax_insn_lra (unsigned icode, unsigned insn_flags, rtx *ops, rtx vl)
408 gcc_assert (!can_create_pseudo_p ());
409 machine_mode mode = GET_MODE (ops[0]);
411 if (imm_avl_p (mode))
413 /* Even though VL is already a hard register allocated by RA (we are
414 post-RA now), we still benefit from emitting "vsetivli zero, imm"
415 instead of "vsetvli VL, zero" because it gives post-RA instruction
416 scheduling more freedom. */
417 insn_expander<RVV_INSN_OPERANDS_MAX> e (insn_flags, false);
418 e.set_vl (gen_int_mode (GET_MODE_NUNITS (mode), Pmode));
419 e.emit_insn ((enum insn_code) icode, ops);
421 else
423 insn_expander<RVV_INSN_OPERANDS_MAX> e (insn_flags, true);
424 e.set_vl (vl);
425 e.emit_insn ((enum insn_code) icode, ops);
429 /* Emit an RVV insn with a predefined vector length. Contrary to
430 emit_vlmax_insn the instruction's vector length is not deduced from its mode
431 but taken from the value in VL. */
432 void
433 emit_nonvlmax_insn (unsigned icode, unsigned insn_flags, rtx *ops, rtx vl)
435 insn_expander<RVV_INSN_OPERANDS_MAX> e (insn_flags, false);
436 e.set_vl (vl);
437 e.emit_insn ((enum insn_code) icode, ops);
440 /* Return true if the vector can be duplicated from a super element that is
441 the fusion of consecutive elements, e.g.
443 v = { a, b, a, b }, super element = ab, v = { ab, ab }. */
444 bool
445 rvv_builder::can_duplicate_repeating_sequence_p ()
447 poly_uint64 new_size = exact_div (full_nelts (), npatterns ());
448 unsigned int new_inner_size = m_inner_bits_size * npatterns ();
449 if (m_inner_mode == Pmode
450 || !int_mode_for_size (new_inner_size, 0).exists (&m_new_inner_mode)
451 || GET_MODE_SIZE (m_new_inner_mode) > UNITS_PER_WORD
452 || !get_vector_mode (m_new_inner_mode, new_size).exists (&m_new_mode))
453 return false;
454 return repeating_sequence_p (0, encoded_nelts (), npatterns ());
457 /* Return true if the vector is a simple sequence with one pattern and all
458 elements the same. */
459 bool
460 rvv_builder::is_repeating_sequence ()
462 if (npatterns () > 1)
463 return false;
464 return repeating_sequence_p (0, encoded_nelts (), 1);
467 /* Return true if this is a repeating sequence for which the
468 merge approach gives better codegen than the default
469 approach (slide1down).
471 Sequence A:
472 {a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b}
474 nelts = 16
475 npatterns = 2
477 for merging a we need mask 101010....
478 for merging b we need mask 010101....
480 For each element in the pattern, we need to build a mask in a scalar register.
481 Mostly we need 3 instructions (i.e. COST = 3): 2 scalar
482 instructions and 1 scalar move to the v0 register. Finally we need a vector
483 merge to merge them.
485 lui a5, #imm
486 addi a5, a5, #imm
487 vmv.s.x v0, a5
488 vmerge.vxm v9, v9, a1, v0
490 So the overall (roughly) COST of Sequence A = (3 + 1) * npatterns = 8.
491 If we use slide1down, the COST = nelts = 16 > 8 (COST of merge).
492 So return true in this case as it is profitable.
494 Sequence B:
495 {a, b, c, d, e, f, g, h, a, b, c, d, e, f, g, h}
497 nelts = 16
498 npatterns = 8
500 COST of merge approach = (3 + 1) * npatterns = 24
501 COST of slide1down approach = nelts = 16
502 Return false in this case as the merge approach is NOT profitable. */
504 bool
505 rvv_builder::repeating_sequence_use_merge_profitable_p ()
507 if (inner_bytes_size () > UNITS_PER_WORD)
508 return false;
510 unsigned int nelts = full_nelts ().to_constant ();
512 if (!repeating_sequence_p (0, encoded_nelts (), npatterns ()))
513 return false;
515 unsigned int merge_cost = 1;
516 unsigned int build_merge_mask_cost = 3;
517 unsigned int slide1down_cost = nelts;
519 return (build_merge_mask_cost + merge_cost) * npatterns () < slide1down_cost;
522 /* Return true if it's worthwhile to use a slideup to combine 2 vectors. */
523 bool
524 rvv_builder::combine_sequence_use_slideup_profitable_p ()
526 int nelts = full_nelts ().to_constant ();
527 int leading_ndups = this->count_dups (0, nelts - 1, 1);
528 int trailing_ndups = this->count_dups (nelts - 1, -1, -1);
530 /* ??? The current heuristic is that we combine 2 vectors
531 by slideup when:
532 1. # of leading identical elements equals # of trailing identical elements.
533 2. Both of the above equal nelts / 2.
534 Otherwise, it is not profitable. */
535 return leading_ndups == trailing_ndups && trailing_ndups == nelts / 2;
538 /* Return true if it's worthwhile to use a merge to combine a vector with a scalar. */
539 bool
540 rvv_builder::combine_sequence_use_merge_profitable_p ()
542 int nelts = full_nelts ().to_constant ();
543 int leading_ndups = this->count_dups (0, nelts - 1, 1);
544 int trailing_ndups = this->count_dups (nelts - 1, -1, -1);
545 int nregs = riscv_get_v_regno_alignment (int_mode ());
547 if (leading_ndups + trailing_ndups != nelts)
548 return false;
550 /* If the number of leading elements is > 255, which exceeds the maximum
551 value of QImode, we will need to use HImode. */
552 machine_mode mode;
553 if (leading_ndups > 255 || nregs > 2)
555 if (!get_vector_mode (HImode, nelts).exists (&mode))
556 return false;
557 /* We will need one more AVL/VL toggling vsetvl instruction. */
558 return leading_ndups > 4 && trailing_ndups > 4;
561 /* { a, a, a, b, b, ... , b } and { b, b, b, a, a, ... , a }
562 consume 3 slide instructions. */
563 return leading_ndups > 3 && trailing_ndups > 3;
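/* For example, { a, a, a, a, a, b, b, b, b, b } (nelts = 10) has
   leading_ndups = trailing_ndups = 5; they sum to nelts and exceed both
   thresholds above, so the sequence is normally considered profitable to
   combine with a merge.  */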
566 /* Merge the repeating sequence into a single element and return the RTX. */
567 rtx
568 rvv_builder::get_merged_repeating_sequence ()
570 scalar_int_mode mode = Pmode;
571 rtx target = gen_reg_rtx (mode);
572 emit_move_insn (target, const0_rtx);
573 rtx imm = gen_int_mode ((1ULL << m_inner_bits_size) - 1, mode);
574 /* { a, b, a, b }: Generate duplicate element = b << bits | a. */
575 for (unsigned int i = 0; i < npatterns (); i++)
577 unsigned int loc = m_inner_bits_size * i;
578 rtx shift = gen_int_mode (loc, mode);
579 rtx ele = gen_lowpart (mode, elt (i));
580 rtx tmp = expand_simple_binop (mode, AND, ele, imm, NULL_RTX, false,
581 OPTAB_DIRECT);
582 rtx tmp2 = expand_simple_binop (mode, ASHIFT, tmp, shift, NULL_RTX, false,
583 OPTAB_DIRECT);
584 rtx tmp3 = expand_simple_binop (mode, IOR, tmp2, target, NULL_RTX, false,
585 OPTAB_DIRECT);
586 emit_move_insn (target, tmp3);
588 if (GET_MODE_SIZE (m_new_inner_mode) < UNITS_PER_WORD)
589 return gen_lowpart (m_new_inner_mode, target);
590 return target;
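/* For example, with m_inner_bits_size = 8 and npatterns () = 4, the elements
   { 1, 2, 3, 4 } are merged into the single scalar 0x04030201 (element i is
   masked to 8 bits and shifted left by 8 * i), which the caller can then
   broadcast as one wider element.  */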
593 /* Get the mask for merge approach.
595 Consider the following case:
596 {a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b}
597 To merge "a", the mask should be 1010....
598 To merge "b", the mask should be 0101....
599 */
600 rtx
601 rvv_builder::get_merge_scalar_mask (unsigned int index_in_pattern,
602 machine_mode inner_mode) const
604 unsigned HOST_WIDE_INT mask = 0;
605 unsigned HOST_WIDE_INT base_mask = (1ULL << index_in_pattern);
606 /* Here we construct a mask pattern that will later be broadcast
607 to a vector register. The maximum broadcast size for vmv.v.x/vmv.s.x
608 is determined by the length of a vector element (ELEN) and not by
609 XLEN so make sure we do not exceed it. One example is -march=zve32*
610 which mandates ELEN == 32 but can be combined with -march=rv64
611 with XLEN == 64. */
612 unsigned int elen = TARGET_VECTOR_ELEN_64 ? 64 : 32;
614 gcc_assert (elen % npatterns () == 0);
616 int limit = elen / npatterns ();
618 for (int i = 0; i < limit; i++)
619 mask |= base_mask << (i * npatterns ());
621 return gen_int_mode (mask, inner_mode);
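/* For example, with npatterns () = 2 and ELEN = 64, index_in_pattern = 0
   yields the mask 0x5555555555555555 and index_in_pattern = 1 yields
   0xaaaaaaaaaaaaaaaa, selecting the even and odd element positions
   respectively.  */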
624 /* Return true if the variable-length vector is single-step.
625 Single-step means the steps of all patterns in NPATTERNS are equal.
626 Consider the following cases:
628 CASE 1: NPATTERNS = 2, NELTS_PER_PATTERN = 3.
629 { 0, 2, 2, 4, 4, 6, ... }
630 First pattern: step1 = 2 - 0 = 2
631 step2 = 4 - 2 = 2
632 Second pattern: step1 = 4 - 2 = 2
633 step2 = 6 - 4 = 2
634 Since all steps of NPATTERNS are equal step = 2.
635 Return true in this case.
637 CASE 2: NPATTERNS = 2, NELTS_PER_PATTERN = 3.
638 { 0, 1, 2, 4, 4, 7, ... }
639 First pattern: step1 = 2 - 0 = 2
640 step2 = 4 - 2 = 2
641 Second pattern: step1 = 4 - 1 = 3
642 step2 = 7 - 4 = 3
643 Since not all steps are equal, return false. */
644 bool
645 rvv_builder::single_step_npatterns_p () const
647 if (nelts_per_pattern () != 3)
648 return false;
650 poly_int64 step
651 = rtx_to_poly_int64 (elt (npatterns ())) - rtx_to_poly_int64 (elt (0));
652 for (unsigned int i = 0; i < npatterns (); i++)
654 poly_int64 ele0 = rtx_to_poly_int64 (elt (i));
655 poly_int64 ele1 = rtx_to_poly_int64 (elt (npatterns () + i));
656 poly_int64 ele2 = rtx_to_poly_int64 (elt (npatterns () * 2 + i));
657 poly_int64 diff1 = ele1 - ele0;
658 poly_int64 diff2 = ele2 - ele1;
659 if (maybe_ne (step, diff1) || maybe_ne (step, diff2))
660 return false;
662 return true;
665 /* Return true if the diff between const vector and vid sequence
666 is repeated. For example, consider the cases below.
667 The diff means CONST VECTOR - VID.
668 CASE 1:
669 CONST VECTOR: {3, 2, 1, 0, 7, 6, 5, 4, ... }
670 VID : {0, 1, 2, 3, 4, 5, 6, 7, ... }
671 DIFF(MINUS) : {3, 1,-1,-3, 3, 1,-1,-3, ... }
672 The diff sequence {3, 1,-1,-3} is repeated within the pattern, so we
673 return TRUE for case 1.
675 CASE 2:
676 CONST VECTOR: {-4, 4,-3, 5,-2, 6,-1, 7, ...}
677 VID : { 0, 1, 2, 3, 4, 5, 6, 7, ... }
678 DIFF(MINUS) : {-4, 3,-5, 2,-6, 1,-7, 0, ... }
679 The diff sequence {-4, 3} is not repeated within the pattern, so we
680 return FALSE for case 2. */
681 bool
682 rvv_builder::npatterns_vid_diff_repeated_p () const
684 if (nelts_per_pattern () != 3)
685 return false;
686 else if (npatterns () == 0)
687 return false;
689 for (unsigned i = 0; i < npatterns (); i++)
691 poly_int64 diff_0 = rtx_to_poly_int64 (elt (i)) - i;
692 poly_int64 diff_1
693 = rtx_to_poly_int64 (elt (npatterns () + i)) - npatterns () - i;
695 if (maybe_ne (diff_0, diff_1))
696 return false;
699 return true;
702 /* Return true if the permutation consists of two
703 interleaved patterns with a constant step each.
704 TODO: We currently only support NPATTERNS = 2. */
705 bool
706 rvv_builder::interleaved_stepped_npatterns_p () const
708 if (npatterns () != 2 || nelts_per_pattern () != 3)
709 return false;
710 for (unsigned int i = 0; i < npatterns (); i++)
712 poly_int64 ele0 = rtx_to_poly_int64 (elt (i));
713 poly_int64 ele1 = rtx_to_poly_int64 (elt (npatterns () + i));
714 poly_int64 ele2 = rtx_to_poly_int64 (elt (npatterns () * 2 + i));
715 poly_int64 diff1 = ele1 - ele0;
716 poly_int64 diff2 = ele2 - ele1;
717 if (maybe_ne (diff1, diff2))
718 return false;
720 return true;
723 /* Return true if all elements of NPATTERNS are equal.
725 E.g. NPATTERNS = 4:
726 { 2, 2, 2, 2, 4, 4, 4, 4, 8, 8, 8, 8, 16, 16, 16, 16, ... }
727 E.g. NPATTERNS = 8:
728 { 2, 2, 2, 2, 2, 2, 2, 2, 8, 8, 8, 8, 8, 8, 8, 8, ... }
729 We only check whether ele[0] ~ ele[NPATTERNS - 1] are the same.
730 We don't need to check elements[n] with n >= NPATTERNS since
731 they don't belong to the same pattern. */
733 bool
734 rvv_builder::npatterns_all_equal_p () const
736 poly_int64 ele0 = rtx_to_poly_int64 (elt (0));
737 for (unsigned int i = 1; i < npatterns (); i++)
739 poly_int64 ele = rtx_to_poly_int64 (elt (i));
740 if (!known_eq (ele, ele0))
741 return false;
743 return true;
746 static unsigned
747 get_sew (machine_mode mode)
749 unsigned int sew = GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
750 ? 8
751 : GET_MODE_BITSIZE (GET_MODE_INNER (mode));
752 return sew;
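/* E.g. a vector of 32-bit elements has SEW = 32, while mask (bool) vector
   modes are treated as SEW = 8 here.  */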
755 /* Return true if X is a const_vector whose elements are all the same and
756 lie in the range [MINVAL, MAXVAL]. */
757 bool
758 const_vec_all_same_in_range_p (rtx x, HOST_WIDE_INT minval,
759 HOST_WIDE_INT maxval)
761 rtx elt;
762 return (const_vec_duplicate_p (x, &elt) && CONST_INT_P (elt)
763 && IN_RANGE (INTVAL (elt), minval, maxval));
766 /* Return true if VEC is a constant in which every element is in the range
767 [MINVAL, MAXVAL]. The elements do not need to have the same value.
769 This function also exists in aarch64; we may unify it in the middle-end in the
770 future. */
772 static bool
773 const_vec_all_in_range_p (rtx vec, poly_int64 minval, poly_int64 maxval)
775 if (!CONST_VECTOR_P (vec)
776 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
777 return false;
779 int nunits;
780 if (!CONST_VECTOR_STEPPED_P (vec))
781 nunits = const_vector_encoded_nelts (vec);
782 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
783 return false;
785 for (int i = 0; i < nunits; i++)
787 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
788 poly_int64 value;
789 if (!poly_int_rtx_p (vec_elem, &value)
790 || maybe_lt (value, minval)
791 || maybe_gt (value, maxval))
792 return false;
794 return true;
797 /* Return true if the vector's elements are all duplicates that are either
798 an integer in the range -16 ~ 15 or 0.0 floating-point. */
800 bool
801 valid_vec_immediate_p (rtx x)
803 return (satisfies_constraint_vi (x) || satisfies_constraint_Wc0 (x));
806 /* Return a const vector of VAL. The VAL can be either const_int or
807 const_poly_int. */
809 static rtx
810 gen_const_vector_dup (machine_mode mode, poly_int64 val)
812 scalar_mode smode = GET_MODE_INNER (mode);
813 rtx c = gen_int_mode (val, smode);
814 if (!val.is_constant () && GET_MODE_SIZE (smode) > GET_MODE_SIZE (Pmode))
816 /* When VAL is const_poly_int value, we need to explicitly broadcast
817 it into a vector using RVV broadcast instruction. */
818 return expand_vector_broadcast (mode, c);
820 return gen_const_vec_duplicate (mode, c);
823 /* Emit a vlmax vsetvl instruction. This should only be used when
824 optimization is disabled or after vsetvl insertion pass. */
825 void
826 emit_hard_vlmax_vsetvl (machine_mode vmode, rtx vl)
828 unsigned int sew = get_sew (vmode);
829 emit_insn (gen_vsetvl (Pmode, vl, RVV_VLMAX, gen_int_mode (sew, Pmode),
830 gen_int_mode (get_vlmul (vmode), Pmode), const0_rtx,
831 const0_rtx));
834 void
835 emit_vlmax_vsetvl (machine_mode vmode, rtx vl)
837 unsigned int sew = get_sew (vmode);
838 enum vlmul_type vlmul = get_vlmul (vmode);
839 unsigned int ratio = calculate_ratio (sew, vlmul);
841 if (!optimize)
842 emit_hard_vlmax_vsetvl (vmode, vl);
843 else
844 emit_insn (gen_vlmax_avl (Pmode, vl, gen_int_mode (ratio, Pmode)));
847 /* Calculate SEW/LMUL ratio. */
848 unsigned int
849 calculate_ratio (unsigned int sew, enum vlmul_type vlmul)
851 unsigned int ratio;
852 switch (vlmul)
854 case LMUL_1:
855 ratio = sew;
856 break;
857 case LMUL_2:
858 ratio = sew / 2;
859 break;
860 case LMUL_4:
861 ratio = sew / 4;
862 break;
863 case LMUL_8:
864 ratio = sew / 8;
865 break;
866 case LMUL_F8:
867 ratio = sew * 8;
868 break;
869 case LMUL_F4:
870 ratio = sew * 4;
871 break;
872 case LMUL_F2:
873 ratio = sew * 2;
874 break;
875 default:
876 gcc_unreachable ();
878 return ratio;
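/* For example, SEW = 16 with LMUL_2 gives a ratio of 16 / 2 = 8, while
   SEW = 32 with LMUL_F2 (LMUL = 1/2) gives a ratio of 32 * 2 = 64.  */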
881 /* SCALABLE means that the vector-length is agnostic (run-time invariant and
882 compile-time unknown). ZVL means that the vector-length is specific
883 (compile-time known from -march via zvl*b). Both SCALABLE and ZVL do
884 auto-vectorization using the VLMAX vsetvl configuration. */
885 static bool
886 autovec_use_vlmax_p (void)
888 return rvv_vector_bits == RVV_VECTOR_BITS_SCALABLE
889 || rvv_vector_bits == RVV_VECTOR_BITS_ZVL;
892 /* This function emits VLMAX vrgather instruction. Emit vrgather.vx/vi when sel
893 is a const duplicate vector. Otherwise, emit vrgather.vv. */
894 static void
895 emit_vlmax_gather_insn (rtx target, rtx op, rtx sel)
897 rtx elt;
898 insn_code icode;
899 machine_mode data_mode = GET_MODE (target);
900 machine_mode sel_mode = GET_MODE (sel);
901 if (const_vec_duplicate_p (sel, &elt))
903 icode = code_for_pred_gather_scalar (data_mode);
904 sel = elt;
906 else if (maybe_ne (GET_MODE_SIZE (data_mode), GET_MODE_SIZE (sel_mode)))
907 icode = code_for_pred_gatherei16 (data_mode);
908 else
909 icode = code_for_pred_gather (data_mode);
910 rtx ops[] = {target, op, sel};
911 emit_vlmax_insn (icode, BINARY_OP, ops);
914 static void
915 emit_vlmax_masked_gather_mu_insn (rtx target, rtx op, rtx sel, rtx mask)
917 rtx elt;
918 insn_code icode;
919 machine_mode data_mode = GET_MODE (target);
920 machine_mode sel_mode = GET_MODE (sel);
921 if (const_vec_duplicate_p (sel, &elt))
923 icode = code_for_pred_gather_scalar (data_mode);
924 sel = elt;
926 else if (maybe_ne (GET_MODE_SIZE (data_mode), GET_MODE_SIZE (sel_mode)))
927 icode = code_for_pred_gatherei16 (data_mode);
928 else
929 icode = code_for_pred_gather (data_mode);
930 rtx ops[] = {target, mask, target, op, sel};
931 emit_vlmax_insn (icode, BINARY_OP_TAMU, ops);
934 /* According to RVV ISA spec (16.5.1. Synthesizing vdecompress):
935 https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc
937 There is no inverse vdecompress provided, as this operation can be readily
938 synthesized using iota and a masked vrgather:
940 Desired functionality of 'vdecompress'
941 7 6 5 4 3 2 1 0 # vid
943 e d c b a # packed vector of 5 elements
944 1 0 0 1 1 1 0 1 # mask vector of 8 elements
945 p q r s t u v w # destination register before vdecompress
947 e q r d c b v a # result of vdecompress
948 # v0 holds mask
949 # v1 holds packed data
950 # v11 holds input expanded vector and result
951 viota.m v10, v0 # Calc iota from mask in v0
952 vrgather.vv v11, v1, v10, v0.t # Expand into destination
953 p q r s t u v w # v11 destination register
954 e d c b a # v1 source vector
955 1 0 0 1 1 1 0 1 # v0 mask vector
957 4 4 4 3 2 1 1 0 # v10 result of viota.m
958 e q r d c b v a # v11 destination after vrgather using viota.m under mask */
960 static void
961 emit_vlmax_decompress_insn (rtx target, rtx op0, rtx op1, rtx mask)
963 machine_mode data_mode = GET_MODE (target);
964 machine_mode sel_mode = related_int_vector_mode (data_mode).require ();
965 if (GET_MODE_INNER (data_mode) == QImode)
966 sel_mode = get_vector_mode (HImode, GET_MODE_NUNITS (data_mode)).require ();
968 rtx sel = gen_reg_rtx (sel_mode);
969 rtx iota_ops[] = {sel, mask};
970 emit_vlmax_insn (code_for_pred_iota (sel_mode), UNARY_OP, iota_ops);
971 emit_vlmax_gather_insn (target, op0, sel);
972 emit_vlmax_masked_gather_mu_insn (target, op1, sel, mask);
975 /* Return the machine mode used to duplicate the repeating sequence for mask generation in the merge approach. */
977 static machine_mode
978 get_repeating_sequence_dup_machine_mode (const rvv_builder &builder,
979 machine_mode mask_bit_mode)
981 unsigned mask_precision = GET_MODE_PRECISION (mask_bit_mode).to_constant ();
982 unsigned mask_scalar_size = mask_precision > builder.inner_bits_size ()
983 ? builder.inner_bits_size () : mask_precision;
985 scalar_mode inner_mode;
986 unsigned minimal_bits_size;
988 switch (mask_scalar_size)
990 case 8:
991 inner_mode = QImode;
992 minimal_bits_size = TARGET_MIN_VLEN / 8; /* AKA RVVMF8. */
993 break;
994 case 16:
995 inner_mode = HImode;
996 minimal_bits_size = TARGET_MIN_VLEN / 4; /* AKA RVVMF4. */
997 break;
998 case 32:
999 inner_mode = SImode;
1000 minimal_bits_size = TARGET_MIN_VLEN / 2; /* AKA RVVMF2. */
1001 break;
1002 case 64:
1003 inner_mode = DImode;
1004 minimal_bits_size = TARGET_MIN_VLEN / 1; /* AKA RVVM1. */
1005 break;
1006 default:
1007 gcc_unreachable ();
1008 break;
1011 gcc_assert (mask_precision % mask_scalar_size == 0);
1013 uint64_t dup_nunit = mask_precision > mask_scalar_size
1014 ? mask_precision / mask_scalar_size : minimal_bits_size / mask_scalar_size;
1016 return get_vector_mode (inner_mode, dup_nunit).require ();
1019 /* Expand a series const vector. If VID is NULL_RTX, we use a vid.v
1020 instruction to generate the sequence for VID:
1022 VID = { 0, 1, 2, 3, ... }
1024 Otherwise, we use the VID argument directly. */
1026 void
1027 expand_vec_series (rtx dest, rtx base, rtx step, rtx vid)
1029 machine_mode mode = GET_MODE (dest);
1030 poly_int64 nunits_m1 = GET_MODE_NUNITS (mode) - 1;
1031 poly_int64 value;
1032 rtx result = register_operand (dest, mode) ? dest : gen_reg_rtx (mode);
1034 /* VECT_IV = BASE + I * STEP. */
1036 /* Step 1: Generate I = { 0, 1, 2, ... } by vid.v. */
1037 bool reverse_p = !vid && rtx_equal_p (step, constm1_rtx)
1038 && poly_int_rtx_p (base, &value)
1039 && known_eq (nunits_m1, value);
1040 if (!vid)
1042 vid = gen_reg_rtx (mode);
1043 rtx op[] = {vid};
1044 emit_vlmax_insn (code_for_pred_series (mode), NULLARY_OP, op);
1047 rtx step_adj;
1048 if (reverse_p)
1050 /* Special case:
1051 {nunits - 1, nunits - 2, ... , 0}.
1052 nunits can be either const_int or const_poly_int.
1054 Code sequence:
1055 vid.v v
1056 vrsub nunits - 1, v. */
1057 rtx ops[]
1058 = {result, vid, gen_int_mode (nunits_m1, GET_MODE_INNER (mode))};
1059 insn_code icode = code_for_pred_sub_reverse_scalar (mode);
1060 emit_vlmax_insn (icode, BINARY_OP, ops);
1062 else
1064 /* Step 2: Generate I * STEP.
1065 - If STEP is 1, we don't emit any instructions.
1066 - If STEP is a power of 2, we use vsll.vi/vsll.vx.
1067 - If STEP is not a power of 2, we use vmul.vx. */
1068 if (rtx_equal_p (step, const1_rtx))
1069 step_adj = vid;
1070 else
1072 step_adj = gen_reg_rtx (mode);
1073 if (CONST_INT_P (step) && pow2p_hwi (INTVAL (step)))
1075 /* Emit logical left shift operation. */
1076 int shift = exact_log2 (INTVAL (step));
1077 rtx shift_amount = gen_int_mode (shift, Pmode);
1078 insn_code icode = code_for_pred_scalar (ASHIFT, mode);
1079 rtx ops[] = {step_adj, vid, shift_amount};
1080 emit_vlmax_insn (icode, BINARY_OP, ops);
1082 else
1084 insn_code icode = code_for_pred_scalar (MULT, mode);
1085 rtx ops[] = {step_adj, vid, step};
1086 emit_vlmax_insn (icode, BINARY_OP, ops);
1090 /* Step 3: Generate BASE + I * STEP.
1091 - If BASE is 0, use the result of vid.
1092 - If BASE is not 0, we use vadd.vx/vadd.vi. */
1093 if (rtx_equal_p (base, const0_rtx))
1094 emit_move_insn (result, step_adj);
1095 else
1097 insn_code icode = code_for_pred_scalar (PLUS, mode);
1098 rtx ops[] = {result, step_adj, base};
1099 emit_vlmax_insn (icode, BINARY_OP, ops);
1103 if (result != dest)
1104 emit_move_insn (dest, result);
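/* For example, BASE = 2 and STEP = 3 expands roughly to:

     vid.v    v        ; v = { 0, 1, 2, 3, ... }
     vmul.vx  v, v, 3  ; v = { 0, 3, 6, 9, ... }
     vadd.vi  v, v, 2  ; v = { 2, 5, 8, 11, ... }

   following the non-power-of-2 STEP and non-zero BASE paths above.  */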
1107 /* Subroutine of riscv_vector_expand_vector_init.
1108 Works as follows:
1109 (a) Initialize TARGET by broadcasting element 0 of BUILDER.
1110 (b) Skip leading elements from BUILDER, which are the same as
1111 element 0.
1112 (c) Insert the remaining elements in TARGET in order using vslide1down. */
1114 static void
1115 expand_vector_init_insert_elems (rtx target, const rvv_builder &builder,
1116 int nelts_reqd)
1118 machine_mode mode = GET_MODE (target);
1119 rtx dup = expand_vector_broadcast (mode, builder.elt (0));
1120 emit_move_insn (target, dup);
1121 int ndups = builder.count_dups (0, nelts_reqd - 1, 1);
1122 for (int i = ndups; i < nelts_reqd; i++)
1124 unsigned int unspec
1125 = FLOAT_MODE_P (mode) ? UNSPEC_VFSLIDE1DOWN : UNSPEC_VSLIDE1DOWN;
1126 insn_code icode = code_for_pred_slide (unspec, mode);
1127 rtx ops[] = {target, target, builder.elt (i)};
1128 emit_vlmax_insn (icode, BINARY_OP, ops);
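/* For example, for { a, a, b, c } we broadcast a (covering the two leading
   duplicates) and then vslide1down b and c into the vector in order.  */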
1132 /* Subroutine of expand_vec_init to handle case
1133 when all trailing elements of builder are same.
1134 This works as follows:
1135 (a) Use the expand_insn interface to broadcast the last vector element into TARGET.
1136 (b) Insert the remaining elements into TARGET using vslide1up.
1138 ??? The heuristic used is to do the above if the number of identical trailing
1139 elements is greater than leading_ndups, loosely based on the
1140 heuristic from mostly_zeros_p. May need fine-tuning. */
1142 static bool
1143 expand_vector_init_trailing_same_elem (rtx target,
1144 const rtx_vector_builder &builder,
1145 int nelts_reqd)
1147 int leading_ndups = builder.count_dups (0, nelts_reqd - 1, 1);
1148 int trailing_ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
1149 machine_mode mode = GET_MODE (target);
1151 if (trailing_ndups > leading_ndups)
1153 rtx dup = expand_vector_broadcast (mode, builder.elt (nelts_reqd - 1));
1154 for (int i = nelts_reqd - trailing_ndups - 1; i >= 0; i--)
1156 unsigned int unspec
1157 = FLOAT_MODE_P (mode) ? UNSPEC_VFSLIDE1UP : UNSPEC_VSLIDE1UP;
1158 insn_code icode = code_for_pred_slide (unspec, mode);
1159 rtx tmp = gen_reg_rtx (mode);
1160 rtx ops[] = {tmp, dup, builder.elt (i)};
1161 emit_vlmax_insn (icode, BINARY_OP, ops);
1162 /* slide1up needs the source and dest to be different REGs. */
1163 dup = tmp;
1166 emit_move_insn (target, dup);
1167 return true;
1170 return false;
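/* For example, for { a, b, c, c, c, c } the trailing run of c (4 elements)
   is longer than the leading run (1), so we broadcast c and then vslide1up
   b and finally a into the vector.  */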
1173 static void
1174 expand_const_vector (rtx target, rtx src)
1176 machine_mode mode = GET_MODE (target);
1177 rtx result = register_operand (target, mode) ? target : gen_reg_rtx (mode);
1178 rtx elt;
1179 if (const_vec_duplicate_p (src, &elt))
1181 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
1183 gcc_assert (rtx_equal_p (elt, const0_rtx)
1184 || rtx_equal_p (elt, const1_rtx));
1185 rtx ops[] = {result, src};
1186 emit_vlmax_insn (code_for_pred_mov (mode), UNARY_MASK_OP, ops);
1188 /* For an integer element in the range -16 ~ 15 or a 0.0 floating-point
1189 element, we use the vmv.v.i instruction. */
1190 else if (valid_vec_immediate_p (src))
1192 rtx ops[] = {result, src};
1193 emit_vlmax_insn (code_for_pred_mov (mode), UNARY_OP, ops);
1195 else
1197 /* Emit the vec_duplicate<mode> split pattern before RA so that
1198 we have better optimization opportunities in LICM,
1199 which will hoist vmv.v.x outside the loop, and in fwprop and combine,
1200 which will transform a 'vv' into a 'vx' instruction.
1202 The reason we don't emit the vec_duplicate<mode> split pattern during
1203 RA is that the split stage after RA is too late to generate an
1204 RVV instruction which needs an additional register (we can't
1205 allocate a new register after RA) for the VL operand of the vsetvl
1206 instruction (vsetvl a5, zero). */
1207 if (lra_in_progress)
1209 rtx ops[] = {result, elt};
1210 emit_vlmax_insn (code_for_pred_broadcast (mode), UNARY_OP, ops);
1212 else
1214 struct expand_operand ops[2];
1215 enum insn_code icode = optab_handler (vec_duplicate_optab, mode);
1216 gcc_assert (icode != CODE_FOR_nothing);
1217 create_output_operand (&ops[0], result, mode);
1218 create_input_operand (&ops[1], elt, GET_MODE_INNER (mode));
1219 expand_insn (icode, 2, ops);
1220 result = ops[0].value;
1224 if (result != target)
1225 emit_move_insn (target, result);
1226 return;
1229 /* Support scalable const series vector. */
1230 rtx base, step;
1231 if (const_vec_series_p (src, &base, &step))
1233 expand_vec_series (result, base, step);
1235 if (result != target)
1236 emit_move_insn (target, result);
1237 return;
1240 /* Handle variable-length vector. */
1241 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
1242 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
1243 rvv_builder builder (mode, npatterns, nelts_per_pattern);
1244 for (unsigned int i = 0; i < nelts_per_pattern; i++)
1246 for (unsigned int j = 0; j < npatterns; j++)
1247 builder.quick_push (CONST_VECTOR_ELT (src, i * npatterns + j));
1249 builder.finalize ();
1251 if (CONST_VECTOR_DUPLICATE_P (src))
1253 /* Handle the case of a repeating sequence with NELTS_PER_PATTERN = 1.
1254 E.g. NPATTERNS = 4, v = { 0, 2, 6, 7, ... }
1255 NPATTERNS = 8, v = { 0, 2, 6, 7, 19, 20, 8, 7 ... }
1256 The elements within NPATTERNS are not necessarily regular. */
1257 if (builder.can_duplicate_repeating_sequence_p ())
1259 /* We handle the case that we can find a vector container to hold
1260 element bitsize = NPATTERNS * ele_bitsize.
1262 NPATTERNS = 8, element width = 8
1263 v = { 0, 1, 2, 3, 4, 5, 6, 7, ... }
1264 In this case, we can combine NPATTERNS elements into a larger
1265 element. Use element width = 64 and broadcast a vector with
1266 all elements equal to 0x0706050403020100. */
1267 rtx ele = builder.get_merged_repeating_sequence ();
1268 rtx dup = expand_vector_broadcast (builder.new_mode (), ele);
1269 emit_move_insn (result, gen_lowpart (mode, dup));
1271 else
1273 /* We handle the case that we can't find a vector container to hold
1274 element bitsize = NPATTERNS * ele_bitsize.
1276 NPATTERNS = 8, element width = 16
1277 v = { 0, 1, 2, 3, 4, 5, 6, 7, ... }
1278 Since NPATTERNS * element width = 128, we can't find a container
1279 to hold it.
1281 In this case, we use NPATTERNS merge operations to generate such
1282 vector. */
1283 unsigned int nbits = npatterns - 1;
1285 /* Generate vid = { 0, 1, 2, 3, 4, 5, 6, 7, ... }. */
1286 rtx vid = gen_reg_rtx (builder.int_mode ());
1287 rtx op[] = {vid};
1288 emit_vlmax_insn (code_for_pred_series (builder.int_mode ()),
1289 NULLARY_OP, op);
1291 /* Generate vid_repeat = { 0, 1, ... nbits, ... } */
1292 rtx vid_repeat = gen_reg_rtx (builder.int_mode ());
1293 rtx and_ops[] = {vid_repeat, vid,
1294 gen_int_mode (nbits, builder.inner_int_mode ())};
1295 emit_vlmax_insn (code_for_pred_scalar (AND, builder.int_mode ()),
1296 BINARY_OP, and_ops);
1298 rtx tmp1 = gen_reg_rtx (builder.mode ());
1299 rtx dup_ops[] = {tmp1, builder.elt (0)};
1300 emit_vlmax_insn (code_for_pred_broadcast (builder.mode ()), UNARY_OP,
1301 dup_ops);
1302 for (unsigned int i = 1; i < builder.npatterns (); i++)
1304 /* Generate mask according to i. */
1305 rtx mask = gen_reg_rtx (builder.mask_mode ());
1306 rtx const_vec = gen_const_vector_dup (builder.int_mode (), i);
1307 expand_vec_cmp (mask, EQ, vid_repeat, const_vec);
1309 /* Merge scalar to each i. */
1310 rtx tmp2 = gen_reg_rtx (builder.mode ());
1311 rtx merge_ops[] = {tmp2, tmp1, builder.elt (i), mask};
1312 insn_code icode = code_for_pred_merge_scalar (builder.mode ());
1313 emit_vlmax_insn (icode, MERGE_OP, merge_ops);
1314 tmp1 = tmp2;
1316 emit_move_insn (result, tmp1);
1319 else if (CONST_VECTOR_STEPPED_P (src))
1321 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
1322 if (builder.single_step_npatterns_p ())
1324 /* Describe the case by choosing NPATTERNS = 4 as an example. */
1325 insn_code icode;
1327 /* Step 1: Generate vid = { 0, 1, 2, 3, 4, 5, 6, 7, ... }. */
1328 rtx vid = gen_reg_rtx (builder.mode ());
1329 rtx vid_ops[] = {vid};
1330 icode = code_for_pred_series (builder.mode ());
1331 emit_vlmax_insn (icode, NULLARY_OP, vid_ops);
1333 if (builder.npatterns_all_equal_p ())
1335 /* Generate the variable-length vector following this rule:
1336 { a, a, a + step, a + step, a + step * 2, a + step * 2, ...}
1337 E.g. { 0, 0, 8, 8, 16, 16, ... } */
1339 /* We want to create a pattern where value[idx] = floor (idx /
1340 NPATTERNS). As NPATTERNS is always a power of two we can
1341 rewrite this as = idx & -NPATTERNS. */
1342 /* Step 2: VID AND -NPATTERNS:
1343 { 0&-4, 1&-4, 2&-4, 3 &-4, 4 &-4, 5 &-4, 6 &-4, 7 &-4, ... } */
1345 rtx imm
1346 = gen_int_mode (-builder.npatterns (), builder.inner_mode ());
1347 rtx tmp1 = gen_reg_rtx (builder.mode ());
1348 rtx and_ops[] = {tmp1, vid, imm};
1349 icode = code_for_pred_scalar (AND, builder.mode ());
1350 emit_vlmax_insn (icode, BINARY_OP, and_ops);
1352 /* Step 3: Convert to step size 1. */
1353 rtx tmp2 = gen_reg_rtx (builder.mode ());
1354 /* log2 (npatterns) to get the shift amount to convert
1355 Eg. { 0, 0, 0, 0, 4, 4, ... }
1356 into { 0, 0, 0, 0, 1, 1, ... }. */
1357 HOST_WIDE_INT shift_amt = exact_log2 (builder.npatterns ());
1358 rtx shift = gen_int_mode (shift_amt, builder.inner_mode ());
1359 rtx shift_ops[] = {tmp2, tmp1, shift};
1360 icode = code_for_pred_scalar (ASHIFTRT, builder.mode ());
1361 emit_vlmax_insn (icode, BINARY_OP, shift_ops);
1363 /* Step 4: Multiply to step size n. */
1364 HOST_WIDE_INT step_size =
1365 INTVAL (builder.elt (builder.npatterns ()))
1366 - INTVAL (builder.elt (0));
1367 rtx tmp3 = gen_reg_rtx (builder.mode ());
1368 if (pow2p_hwi (step_size))
1370 /* Power of 2 can be handled with a left shift. */
1371 HOST_WIDE_INT shift = exact_log2 (step_size);
1372 rtx shift_amount = gen_int_mode (shift, Pmode);
1373 insn_code icode = code_for_pred_scalar (ASHIFT, mode);
1374 rtx ops[] = {tmp3, tmp2, shift_amount};
1375 emit_vlmax_insn (icode, BINARY_OP, ops);
1377 else
1379 rtx mult_amt = gen_int_mode (step_size, builder.inner_mode ());
1380 insn_code icode = code_for_pred_scalar (MULT, builder.mode ());
1381 rtx ops[] = {tmp3, tmp2, mult_amt};
1382 emit_vlmax_insn (icode, BINARY_OP, ops);
1385 /* Step 5: Add starting value to all elements. */
1386 HOST_WIDE_INT init_val = INTVAL (builder.elt (0));
1387 if (init_val == 0)
1388 emit_move_insn (result, tmp3);
1389 else
1391 rtx dup = gen_const_vector_dup (builder.mode (), init_val);
1392 rtx add_ops[] = {result, tmp3, dup};
1393 icode = code_for_pred (PLUS, builder.mode ());
1394 emit_vlmax_insn (icode, BINARY_OP, add_ops);
1397 else
1399 /* Generate the variable-length vector following this rule:
1400 { a, b, a + step, b + step, a + step*2, b + step*2, ... } */
1402 if (builder.npatterns_vid_diff_repeated_p ())
1404 /* Case 1: For example as below:
1405 {3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8... }
1406 We have 3 - 0 = 3 equals 7 - 4 = 3, the sequence is
1407 repeated as below after minus vid.
1408 {3, 1, -1, -3, 3, 1, -1, -3...}
1409 Then we can simplify the diff code gen to at most
1410 npatterns(). */
1411 rvv_builder v (builder.mode (), builder.npatterns (), 1);
1413 /* Step 1: Generate diff = TARGET - VID. */
1414 for (unsigned int i = 0; i < v.npatterns (); ++i)
1416 poly_int64 diff = rtx_to_poly_int64 (builder.elt (i)) - i;
1417 v.quick_push (gen_int_mode (diff, v.inner_mode ()));
1420 /* Step 2: Generate result = VID + diff. */
1421 rtx vec = v.build ();
1422 rtx add_ops[] = {result, vid, vec};
1423 emit_vlmax_insn (code_for_pred (PLUS, builder.mode ()),
1424 BINARY_OP, add_ops);
1426 else
1428 /* Case 2: For example as below:
1429 { -4, 4, -4 + 1, 4 + 1, -4 + 2, 4 + 2, -4 + 3, 4 + 3, ... } */
1431 rvv_builder v (builder.mode (), builder.npatterns (), 1);
1433 /* Step 1: Generate { a, b, a, b, ... } */
1434 for (unsigned int i = 0; i < v.npatterns (); ++i)
1435 v.quick_push (builder.elt (i));
1436 rtx new_base = v.build ();
1438 /* Step 2: Generate tmp1 = VID >> LOG2 (NPATTERNS).  */
1439 rtx shift_count
1440 = gen_int_mode (exact_log2 (builder.npatterns ()),
1441 builder.inner_mode ());
1442 rtx tmp1 = expand_simple_binop (builder.mode (), LSHIFTRT,
1443 vid, shift_count, NULL_RTX,
1444 false, OPTAB_DIRECT);
1446 /* Step 3: Generate tmp2 = tmp1 * step.  */
1447 rtx tmp2 = gen_reg_rtx (builder.mode ());
1448 rtx step
1449 = simplify_binary_operation (MINUS, builder.inner_mode (),
1450 builder.elt (v.npatterns()),
1451 builder.elt (0));
1452 expand_vec_series (tmp2, const0_rtx, step, tmp1);
1454 /* Step 4: Generate result = tmp2 + new_base.  */
1455 rtx add_ops[] = {result, tmp2, new_base};
1456 emit_vlmax_insn (code_for_pred (PLUS, builder.mode ()),
1457 BINARY_OP, add_ops);
1461 else if (builder.interleaved_stepped_npatterns_p ())
1463 rtx base1 = builder.elt (0);
1464 rtx base2 = builder.elt (1);
1465 poly_int64 step1
1466 = rtx_to_poly_int64 (builder.elt (builder.npatterns ()))
1467 - rtx_to_poly_int64 (base1);
1468 poly_int64 step2
1469 = rtx_to_poly_int64 (builder.elt (builder.npatterns () + 1))
1470 - rtx_to_poly_int64 (base2);
1472 /* For { 1, 0, 2, 0, ... , n - 1, 0 }, we can use a larger EEW
1473 integer vector mode to generate such a vector efficiently.
1475 E.g. EEW = 16, { 2, 0, 4, 0, ... }
1477 can be interpreted into:
1479 EEW = 32, { 2, 4, ... } */
1480 unsigned int new_smode_bitsize = builder.inner_bits_size () * 2;
1481 scalar_int_mode new_smode;
1482 machine_mode new_mode;
1483 poly_uint64 new_nunits
1484 = exact_div (GET_MODE_NUNITS (builder.mode ()), 2);
1485 if (int_mode_for_size (new_smode_bitsize, 0).exists (&new_smode)
1486 && get_vector_mode (new_smode, new_nunits).exists (&new_mode))
1488 rtx tmp1 = gen_reg_rtx (new_mode);
1489 base1 = gen_int_mode (rtx_to_poly_int64 (base1), new_smode);
1490 expand_vec_series (tmp1, base1, gen_int_mode (step1, new_smode));
1492 if (rtx_equal_p (base2, const0_rtx) && known_eq (step2, 0))
1493 /* { 1, 0, 2, 0, ... }. */
1494 emit_move_insn (result, gen_lowpart (mode, tmp1));
1495 else if (known_eq (step2, 0))
1497 /* { 1, 1, 2, 1, ... }. */
1498 rtx scalar = expand_simple_binop (
1499 new_smode, ASHIFT,
1500 gen_int_mode (rtx_to_poly_int64 (base2), new_smode),
1501 gen_int_mode (builder.inner_bits_size (), new_smode),
1502 NULL_RTX, false, OPTAB_DIRECT);
1503 rtx tmp2 = gen_reg_rtx (new_mode);
1504 rtx ior_ops[] = {tmp2, tmp1, scalar};
1505 emit_vlmax_insn (code_for_pred_scalar (IOR, new_mode),
1506 BINARY_OP, ior_ops);
1507 emit_move_insn (result, gen_lowpart (mode, tmp2));
1509 else
1511 /* { 1, 3, 2, 6, ... }. */
1512 rtx tmp2 = gen_reg_rtx (new_mode);
1513 base2 = gen_int_mode (rtx_to_poly_int64 (base2), new_smode);
1514 expand_vec_series (tmp2, base2,
1515 gen_int_mode (step2, new_smode));
1516 rtx shifted_tmp2 = expand_simple_binop (
1517 new_mode, ASHIFT, tmp2,
1518 gen_int_mode (builder.inner_bits_size (), Pmode), NULL_RTX,
1519 false, OPTAB_DIRECT);
1520 rtx tmp3 = gen_reg_rtx (new_mode);
1521 rtx ior_ops[] = {tmp3, tmp1, shifted_tmp2};
1522 emit_vlmax_insn (code_for_pred (IOR, new_mode), BINARY_OP,
1523 ior_ops);
1524 emit_move_insn (result, gen_lowpart (mode, tmp3));
1527 else
1529 rtx vid = gen_reg_rtx (mode);
1530 expand_vec_series (vid, const0_rtx, const1_rtx);
1531 /* Transform into { 0, 0, 1, 1, 2, 2, ... }. */
1532 rtx shifted_vid
1533 = expand_simple_binop (mode, LSHIFTRT, vid, const1_rtx,
1534 NULL_RTX, false, OPTAB_DIRECT);
1535 rtx tmp1 = gen_reg_rtx (mode);
1536 rtx tmp2 = gen_reg_rtx (mode);
1537 expand_vec_series (tmp1, base1,
1538 gen_int_mode (step1, builder.inner_mode ()),
1539 shifted_vid);
1540 expand_vec_series (tmp2, base2,
1541 gen_int_mode (step2, builder.inner_mode ()),
1542 shifted_vid);
1544 /* Transform into { 0, 1, 0, 1, 0, 1, ... }. */
1545 rtx and_vid = gen_reg_rtx (mode);
1546 rtx and_ops[] = {and_vid, vid, const1_rtx};
1547 emit_vlmax_insn (code_for_pred_scalar (AND, mode), BINARY_OP,
1548 and_ops);
1549 rtx mask = gen_reg_rtx (builder.mask_mode ());
1550 expand_vec_cmp (mask, EQ, and_vid, CONST1_RTX (mode));
1552 rtx ops[] = {result, tmp1, tmp2, mask};
1553 emit_vlmax_insn (code_for_pred_merge (mode), MERGE_OP, ops);
1556 else
1557 /* TODO: We will enable more variable-length vector in the future. */
1558 gcc_unreachable ();
1560 else
1561 gcc_unreachable ();
1563 if (result != target)
1564 emit_move_insn (target, result);
1567 /* Get the frm mode from the given CONST_INT rtx; the default mode is
1568 FRM_DYN. */
1569 enum floating_point_rounding_mode
1570 get_frm_mode (rtx operand)
1572 gcc_assert (CONST_INT_P (operand));
1574 switch (INTVAL (operand))
1576 case FRM_RNE:
1577 return FRM_RNE;
1578 case FRM_RTZ:
1579 return FRM_RTZ;
1580 case FRM_RDN:
1581 return FRM_RDN;
1582 case FRM_RUP:
1583 return FRM_RUP;
1584 case FRM_RMM:
1585 return FRM_RMM;
1586 case FRM_DYN:
1587 return FRM_DYN;
1588 default:
1589 gcc_unreachable ();
1592 gcc_unreachable ();
1595 /* Expand a pre-RA RVV data move from SRC to DEST.
1596 It expands moves for RVV fractional vector modes.
1597 Return true if the move has already been emitted. */
1598 bool
1599 legitimize_move (rtx dest, rtx *srcp)
1601 rtx src = *srcp;
1602 machine_mode mode = GET_MODE (dest);
1603 if (CONST_VECTOR_P (src))
1605 expand_const_vector (dest, src);
1606 return true;
1609 if (riscv_v_ext_vls_mode_p (mode))
1611 if (GET_MODE_NUNITS (mode).to_constant () <= 31)
1613 /* For VLS modes with NUNITS <= 31, we don't need to extract
1614 scalar registers, so we apply the naive (set (op0) (op1)) pattern. */
1615 if (can_create_pseudo_p ())
1617 /* Need to force register if mem <- !reg. */
1618 if (MEM_P (dest) && !REG_P (src))
1619 *srcp = force_reg (mode, src);
1621 return false;
1624 else if (GET_MODE_NUNITS (mode).to_constant () > 31 && lra_in_progress)
1626 emit_insn (gen_mov_lra (mode, Pmode, dest, src));
1627 return true;
1630 else
1632 /* In order to decrease the memory traffic, we don't use whole register
1633 * load/store for LMUL less than 1 and for mask modes, so those cases
1634 * require one extra general purpose register, which is not allowed during
1635 * the LRA process. Therefore we have a special move pattern used for LRA,
1636 * which defers the expansion until after LRA. */
1637 if ((known_lt (GET_MODE_SIZE (mode), BYTES_PER_RISCV_VECTOR)
1638 || GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
1639 && lra_in_progress)
1641 emit_insn (gen_mov_lra (mode, Pmode, dest, src));
1642 return true;
1645 if (known_ge (GET_MODE_SIZE (mode), BYTES_PER_RISCV_VECTOR)
1646 && GET_MODE_CLASS (mode) != MODE_VECTOR_BOOL)
1648 /* Need to force register if mem <- !reg. */
1649 if (MEM_P (dest) && !REG_P (src))
1650 *srcp = force_reg (mode, src);
1652 return false;
1656 if (register_operand (src, mode) && register_operand (dest, mode))
1658 emit_insn (gen_rtx_SET (dest, src));
1659 return true;
1662 unsigned insn_flags
1663 = GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL ? UNARY_MASK_OP : UNARY_OP;
1664 if (!register_operand (src, mode) && !register_operand (dest, mode))
1666 rtx tmp = gen_reg_rtx (mode);
1667 if (MEM_P (src))
1669 rtx ops[] = {tmp, src};
1670 emit_vlmax_insn (code_for_pred_mov (mode), insn_flags, ops);
1672 else
1673 emit_move_insn (tmp, src);
1674 src = tmp;
1677 if (satisfies_constraint_vu (src))
1678 return false;
1680 rtx ops[] = {dest, src};
1681 emit_vlmax_insn (code_for_pred_mov (mode), insn_flags, ops);
1682 return true;
1685 /* VTYPE information for machine_mode. */
1686 struct mode_vtype_group
1688 enum vlmul_type vlmul[NUM_MACHINE_MODES];
1689 uint8_t ratio[NUM_MACHINE_MODES];
1690 machine_mode subpart_mode[NUM_MACHINE_MODES];
1691 uint8_t nf[NUM_MACHINE_MODES];
1692 mode_vtype_group ()
1694 #define ENTRY(MODE, REQUIREMENT, VLMUL, RATIO) \
1695 vlmul[MODE##mode] = VLMUL; \
1696 ratio[MODE##mode] = RATIO;
1697 #define TUPLE_ENTRY(MODE, REQUIREMENT, SUBPART_MODE, NF, VLMUL, RATIO) \
1698 subpart_mode[MODE##mode] = SUBPART_MODE##mode; \
1699 nf[MODE##mode] = NF; \
1700 vlmul[MODE##mode] = VLMUL; \
1701 ratio[MODE##mode] = RATIO;
1702 #include "riscv-vector-switch.def"
1703 #undef ENTRY
1704 #undef TUPLE_ENTRY
1708 static mode_vtype_group mode_vtype_infos;
1710 /* Get vlmul field value by comparing LMUL with BYTES_PER_RISCV_VECTOR. */
1711 enum vlmul_type
1712 get_vlmul (machine_mode mode)
1714 /* For VLS modes, the vlmul should be dynamically
1715 calculated since we need to adjust VLMUL according
1716 to TARGET_MIN_VLEN. */
1717 if (riscv_v_ext_vls_mode_p (mode))
1719 int size = GET_MODE_BITSIZE (mode).to_constant ();
1720 int inner_size = GET_MODE_BITSIZE (GET_MODE_INNER (mode));
1721 if (size < TARGET_MIN_VLEN)
1723 int factor = TARGET_MIN_VLEN / size;
1724 if (inner_size == 8)
1725 factor = MIN (factor, 8);
1726 else if (inner_size == 16)
1727 factor = MIN (factor, 4);
1728 else if (inner_size == 32)
1729 factor = MIN (factor, 2);
1730 else if (inner_size == 64)
1731 factor = MIN (factor, 1);
1732 else
1733 gcc_unreachable ();
1735 switch (factor)
1737 case 1:
1738 return LMUL_1;
1739 case 2:
1740 return LMUL_F2;
1741 case 4:
1742 return LMUL_F4;
1743 case 8:
1744 return LMUL_F8;
1746 default:
1747 gcc_unreachable ();
1750 else
1752 int factor = size / TARGET_MIN_VLEN;
1753 switch (factor)
1755 case 1:
1756 return LMUL_1;
1757 case 2:
1758 return LMUL_2;
1759 case 4:
1760 return LMUL_4;
1761 case 8:
1762 return LMUL_8;
1764 default:
1765 gcc_unreachable ();
1769 return mode_vtype_infos.vlmul[mode];
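/* For example, with TARGET_MIN_VLEN = 128, a 32-bit VLS mode of 8-bit
   elements has factor = 128 / 32 = 4 and thus LMUL_F4, whereas a 256-bit
   VLS mode has factor = 256 / 128 = 2 and thus LMUL_2.  */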
1772 /* Return the VLMAX rtx of vector mode MODE. */
1773 rtx
1774 get_vlmax_rtx (machine_mode mode)
1776 gcc_assert (riscv_v_ext_vector_mode_p (mode));
1777 return gen_int_mode (GET_MODE_NUNITS (mode), Pmode);
1780 /* Return the NF value of the corresponding mode. */
1781 unsigned int
1782 get_nf (machine_mode mode)
1784 /* We don't allow non-tuple modes to go through this function. */
1785 gcc_assert (riscv_v_ext_tuple_mode_p (mode));
1786 return mode_vtype_infos.nf[mode];
1789 /* Return the subpart mode of the tuple mode. For RVVM2x2SImode,
1790 the subpart mode is RVVM2SImode. This will help to build
1791 array/struct type in builtins. */
1792 machine_mode
1793 get_subpart_mode (machine_mode mode)
1795 /* We don't allow non-tuple modes to go through this function. */
1796 gcc_assert (riscv_v_ext_tuple_mode_p (mode));
1797 return mode_vtype_infos.subpart_mode[mode];
1800 /* Get ratio according to machine mode. */
1801 unsigned int
1802 get_ratio (machine_mode mode)
1804 if (riscv_v_ext_vls_mode_p (mode))
1806 unsigned int sew = get_sew (mode);
1807 vlmul_type vlmul = get_vlmul (mode);
1808 switch (vlmul)
1810 case LMUL_1:
1811 return sew;
1812 case LMUL_2:
1813 return sew / 2;
1814 case LMUL_4:
1815 return sew / 4;
1816 case LMUL_8:
1817 return sew / 8;
1818 case LMUL_F8:
1819 return sew * 8;
1820 case LMUL_F4:
1821 return sew * 4;
1822 case LMUL_F2:
1823 return sew * 2;
1825 default:
1826 gcc_unreachable ();
1829 return mode_vtype_infos.ratio[mode];
1832 /* Get ta according to operand[tail_op_idx]. */
1834 get_ta (rtx ta)
1836 if (INTVAL (ta) == TAIL_ANY)
1837 return INVALID_ATTRIBUTE;
1838 return INTVAL (ta);
1841 /* Get ma according to operand[mask_op_idx]. */
1843 get_ma (rtx ma)
1845 if (INTVAL (ma) == MASK_ANY)
1846 return INVALID_ATTRIBUTE;
1847 return INTVAL (ma);
1850 /* Get prefer tail policy. */
1851 enum tail_policy
1852 get_prefer_tail_policy ()
1854 /* TODO: By default, we choose to use TAIL_ANY, which allows the
1855 compiler to pick either agnostic or undisturbed. Maybe we
1856 will add a compile option like -mprefer=agnostic to set
1857 this value? */
1858 return TAIL_ANY;
1861 /* Get prefer mask policy. */
1862 enum mask_policy
1863 get_prefer_mask_policy ()
1865 /* TODO: By default, we choose MASK_ANY, which allows the
1866 compiler to pick either agnostic or undisturbed. Maybe we
1867 will add a compile option like -mprefer=agnostic to set
1868 this value in the future. */
1869 return MASK_ANY;
1872 /* Get avl_type rtx. */
1874 get_avl_type_rtx (enum avl_type type)
1876 return gen_int_mode (type, Pmode);
1879 /* Return the appropriate mask mode for MODE. */
1881 machine_mode
1882 get_mask_mode (machine_mode mode)
1884 poly_int64 nunits = GET_MODE_NUNITS (mode);
1885 if (riscv_v_ext_tuple_mode_p (mode))
1887 unsigned int nf = get_nf (mode);
1888 nunits = exact_div (nunits, nf);
1890 return get_vector_mode (BImode, nunits).require ();
1893 /* Return the appropriate LMUL mode for MODE. */
1895 opt_machine_mode
1896 get_lmul_mode (scalar_mode mode, int lmul)
1898 poly_uint64 lmul_nunits;
1899 unsigned int bytes = GET_MODE_SIZE (mode);
1900 if (multiple_p (BYTES_PER_RISCV_VECTOR * lmul, bytes, &lmul_nunits))
1901 return get_vector_mode (mode, lmul_nunits);
1902 return E_VOIDmode;
1905 /* Return the appropriate M1 mode for MODE. */
1907 static opt_machine_mode
1908 get_m1_mode (machine_mode mode)
1910 scalar_mode smode = GET_MODE_INNER (mode);
1911 unsigned int bytes = GET_MODE_SIZE (smode);
1912 poly_uint64 m1_nunits = exact_div (BYTES_PER_RISCV_VECTOR, bytes);
1913 return get_vector_mode (smode, m1_nunits);
1916 /* Return the RVV vector mode that has NUNITS elements of mode INNER_MODE.
1917 This function is not only used by builtins, but also will be used by
1918 auto-vectorization in the future. */
1919 opt_machine_mode
1920 get_vector_mode (scalar_mode inner_mode, poly_uint64 nunits)
1922 enum mode_class mclass;
1923 if (inner_mode == E_BImode)
1924 mclass = MODE_VECTOR_BOOL;
1925 else if (FLOAT_MODE_P (inner_mode))
1926 mclass = MODE_VECTOR_FLOAT;
1927 else
1928 mclass = MODE_VECTOR_INT;
1929 machine_mode mode;
1930 FOR_EACH_MODE_IN_CLASS (mode, mclass)
1931 if (inner_mode == GET_MODE_INNER (mode)
1932 && known_eq (nunits, GET_MODE_NUNITS (mode))
1933 && (riscv_v_ext_vector_mode_p (mode)
1934 || riscv_v_ext_vls_mode_p (mode)))
1935 return mode;
1936 return opt_machine_mode ();
1939 /* Return the RVV tuple mode if we can find the legal tuple mode for the
1940 corresponding subpart mode and NF. */
1941 opt_machine_mode
1942 get_tuple_mode (machine_mode subpart_mode, unsigned int nf)
1944 poly_uint64 nunits = GET_MODE_NUNITS (subpart_mode) * nf;
1945 scalar_mode inner_mode = GET_MODE_INNER (subpart_mode);
1946 enum mode_class mclass = GET_MODE_CLASS (subpart_mode);
1947 machine_mode mode;
1948 FOR_EACH_MODE_IN_CLASS (mode, mclass)
1949 if (inner_mode == GET_MODE_INNER (mode)
1950 && known_eq (nunits, GET_MODE_NUNITS (mode))
1951 && riscv_v_ext_tuple_mode_p (mode)
1952 && get_subpart_mode (mode) == subpart_mode)
1953 return mode;
1954 return opt_machine_mode ();
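/* Return true if X is a CONST_INT that fits in a signed 5-bit immediate,
   i.e. is in the range [-16, 15] accepted by the .vi instruction forms.  */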
1957 bool
1958 simm5_p (rtx x)
1960 if (!CONST_INT_P (x))
1961 return false;
1962 return IN_RANGE (INTVAL (x), -16, 15);
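/* Return true if X is a CONST_INT in [-15, 16], i.e. its negation fits in a
   signed 5-bit immediate; this is used when the operation is reversed, for
   example turning a subtract into an add of -X.  */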
1965 bool
1966 neg_simm5_p (rtx x)
1968 if (!CONST_INT_P (x))
1969 return false;
1970 return IN_RANGE (INTVAL (x), -15, 16);
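/* Return true if immediate operand X of CODE can be encoded in a .vi
   instruction variant, possibly after reversing or negating the operation
   (see the neg_simm5_p cases below).  */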
1973 bool
1974 has_vi_variant_p (rtx_code code, rtx x)
1976 switch (code)
1978 case PLUS:
1979 case AND:
1980 case IOR:
1981 case XOR:
1982 case SS_PLUS:
1983 case US_PLUS:
1984 case EQ:
1985 case NE:
1986 case LE:
1987 case LEU:
1988 case GT:
1989 case GTU:
1990 return simm5_p (x);
1992 case LT:
1993 case LTU:
1994 case GE:
1995 case GEU:
1996 case MINUS:
1997 case SS_MINUS:
1998 return neg_simm5_p (x);
2000 default:
2001 return false;
2005 bool
2006 sew64_scalar_helper (rtx *operands, rtx *scalar_op, rtx vl,
2007 machine_mode vector_mode, bool has_vi_variant_p,
2008 void (*emit_vector_func) (rtx *, rtx), enum avl_type type)
2010 machine_mode scalar_mode = GET_MODE_INNER (vector_mode);
2011 if (has_vi_variant_p)
2013 *scalar_op = force_reg (scalar_mode, *scalar_op);
2014 return false;
2017 if (TARGET_64BIT)
2019 if (!rtx_equal_p (*scalar_op, const0_rtx))
2020 *scalar_op = force_reg (scalar_mode, *scalar_op);
2021 return false;
2024 if (immediate_operand (*scalar_op, Pmode))
2026 if (!rtx_equal_p (*scalar_op, const0_rtx))
2027 *scalar_op = force_reg (Pmode, *scalar_op);
2029 *scalar_op = gen_rtx_SIGN_EXTEND (scalar_mode, *scalar_op);
2030 return false;
2033 if (CONST_INT_P (*scalar_op))
2035 if (maybe_gt (GET_MODE_SIZE (scalar_mode), GET_MODE_SIZE (Pmode)))
2036 *scalar_op = force_const_mem (scalar_mode, *scalar_op);
2037 else
2038 *scalar_op = force_reg (scalar_mode, *scalar_op);
2041 rtx tmp = gen_reg_rtx (vector_mode);
2042 rtx ops[] = {tmp, *scalar_op};
2043 if (type == VLMAX)
2044 emit_vlmax_insn (code_for_pred_broadcast (vector_mode), UNARY_OP, ops);
2045 else
2046 emit_nonvlmax_insn (code_for_pred_broadcast (vector_mode), UNARY_OP, ops,
2047 vl);
2048 emit_vector_func (operands, tmp);
2050 return true;
2053 /* Get the scalar-move mask { 1, 0, 0, ..., 0 }, i.e. only element 0 is active. */
2055 gen_scalar_move_mask (machine_mode mode)
2057 rtx_vector_builder builder (mode, 1, 2);
2058 builder.quick_push (const1_rtx);
2059 builder.quick_push (const0_rtx);
2060 return builder.build ();
2063 static unsigned
2064 compute_vlmax (unsigned vector_bits, unsigned elt_size, unsigned min_size)
2066 // Original equation:
2067 // VLMAX = (VectorBits / EltSize) * LMUL
2068 // where LMUL = MinSize / TARGET_MIN_VLEN
2069 // The following equations have been reordered to prevent loss of precision
2070 // when calculating fractional LMUL.
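// Worked example with assumed numbers: vector_bits = 128, elt_size = 32,
// min_size = 64 and TARGET_MIN_VLEN = 128 (i.e. LMUL = 1/2) gives
// VLMAX = ((128 / 32) * 64) / 128 = 2.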
2071 return ((vector_bits / elt_size) * min_size) / TARGET_MIN_VLEN;
2074 static unsigned
2075 get_unknown_min_value (machine_mode mode)
2077 enum vlmul_type vlmul = get_vlmul (mode);
2078 switch (vlmul)
2080 case LMUL_1:
2081 return TARGET_MIN_VLEN;
2082 case LMUL_2:
2083 return TARGET_MIN_VLEN * 2;
2084 case LMUL_4:
2085 return TARGET_MIN_VLEN * 4;
2086 case LMUL_8:
2087 return TARGET_MIN_VLEN * 8;
2088 default:
2089 gcc_unreachable ();
2093 static rtx
2094 force_vector_length_operand (rtx vl)
2096 if (CONST_INT_P (vl) && !satisfies_constraint_K (vl))
2097 return force_reg (Pmode, vl);
2098 return vl;
2102 gen_no_side_effects_vsetvl_rtx (machine_mode vmode, rtx vl, rtx avl)
2104 unsigned int sew = get_sew (vmode);
2105 rtx tail_policy = gen_int_mode (get_prefer_tail_policy (), Pmode);
2106 rtx mask_policy = gen_int_mode (get_prefer_mask_policy (), Pmode);
2107 return gen_vsetvl_no_side_effects (Pmode, vl, avl, gen_int_mode (sew, Pmode),
2108 gen_int_mode (get_vlmul (vmode), Pmode),
2109 tail_policy, mask_policy);
2112 /* Get a VL * 2 rtx for the demoted (SEW / 2) mode. */
2113 static rtx
2114 get_vl_x2_rtx (rtx avl, machine_mode mode, machine_mode demote_mode)
2116 rtx i32vl = NULL_RTX;
2117 if (CONST_INT_P (avl))
2119 unsigned elt_size = GET_MODE_BITSIZE (GET_MODE_INNER (mode));
2120 unsigned min_size = get_unknown_min_value (mode);
2121 unsigned vlen_max = RVV_65536;
2122 unsigned vlmax_max = compute_vlmax (vlen_max, elt_size, min_size);
2123 unsigned vlen_min = TARGET_MIN_VLEN;
2124 unsigned vlmax_min = compute_vlmax (vlen_min, elt_size, min_size);
2126 unsigned HOST_WIDE_INT avl_int = INTVAL (avl);
2127 if (avl_int <= vlmax_min)
2128 i32vl = gen_int_mode (2 * avl_int, Pmode);
2129 else if (avl_int >= 2 * vlmax_max)
2131 // Just set i32vl to VLMAX in this situation
2132 i32vl = gen_reg_rtx (Pmode);
2133 emit_insn (
2134 gen_no_side_effects_vsetvl_rtx (demote_mode, i32vl, RVV_VLMAX));
2136 else
2138 // For AVL between (MinVLMAX, 2 * MaxVLMAX), the actual working vl
2139 // is related to the hardware implementation.
2140 // So let the following code handle it.
2143 if (!i32vl)
2145 // Use a vsetvli instruction to get the actually used length, which is
2146 // related to the hardware implementation.
2147 rtx i64vl = gen_reg_rtx (Pmode);
2148 emit_insn (
2149 gen_no_side_effects_vsetvl_rtx (mode, i64vl, force_reg (Pmode, avl)));
2150 // scale 2 for 32-bit length
2151 i32vl = gen_reg_rtx (Pmode);
2152 emit_insn (
2153 gen_rtx_SET (i32vl, gen_rtx_ASHIFT (Pmode, i64vl, const1_rtx)));
2156 return force_vector_length_operand (i32vl);
2159 bool
2160 slide1_sew64_helper (int unspec, machine_mode mode, machine_mode demote_mode,
2161 machine_mode demote_mask_mode, rtx *ops)
2163 rtx scalar_op = ops[4];
2164 rtx avl = ops[5];
2165 machine_mode scalar_mode = GET_MODE_INNER (mode);
2166 if (rtx_equal_p (scalar_op, const0_rtx))
2168 ops[5] = force_vector_length_operand (ops[5]);
2169 return false;
2172 if (TARGET_64BIT)
2174 ops[4] = force_reg (scalar_mode, scalar_op);
2175 ops[5] = force_vector_length_operand (ops[5]);
2176 return false;
2179 if (immediate_operand (scalar_op, Pmode))
2181 ops[4] = gen_rtx_SIGN_EXTEND (scalar_mode, force_reg (Pmode, scalar_op));
2182 ops[5] = force_vector_length_operand (ops[5]);
2183 return false;
2186 if (CONST_INT_P (scalar_op))
2187 scalar_op = force_reg (scalar_mode, scalar_op);
2189 rtx vl_x2 = get_vl_x2_rtx (avl, mode, demote_mode);
2191 rtx demote_scalar_op1, demote_scalar_op2;
2192 if (unspec == UNSPEC_VSLIDE1UP)
2194 demote_scalar_op1 = gen_highpart (Pmode, scalar_op);
2195 demote_scalar_op2 = gen_lowpart (Pmode, scalar_op);
2197 else
2199 demote_scalar_op1 = gen_lowpart (Pmode, scalar_op);
2200 demote_scalar_op2 = gen_highpart (Pmode, scalar_op);
2203 rtx temp = gen_reg_rtx (demote_mode);
2204 rtx ta = gen_int_mode (get_prefer_tail_policy (), Pmode);
2205 rtx ma = gen_int_mode (get_prefer_mask_policy (), Pmode);
2206 rtx merge = RVV_VUNDEF (demote_mode);
2207 /* Handle vslide1<ud>_tu. */
2208 if (register_operand (ops[2], mode)
2209 && rtx_equal_p (ops[1], CONSTM1_RTX (GET_MODE (ops[1]))))
2211 merge = gen_lowpart (demote_mode, ops[2]);
2212 ta = ops[6];
2213 ma = ops[7];
2216 emit_insn (gen_pred_slide (unspec, demote_mode, temp,
2217 CONSTM1_RTX (demote_mask_mode), merge,
2218 gen_lowpart (demote_mode, ops[3]),
2219 demote_scalar_op1, vl_x2, ta, ma, ops[8]));
2220 emit_insn (gen_pred_slide (unspec, demote_mode,
2221 gen_lowpart (demote_mode, ops[0]),
2222 CONSTM1_RTX (demote_mask_mode), merge, temp,
2223 demote_scalar_op2, vl_x2, ta, ma, ops[8]));
2225 if (!rtx_equal_p (ops[1], CONSTM1_RTX (GET_MODE (ops[1])))
2226 && !rtx_equal_p (ops[2], RVV_VUNDEF (GET_MODE (ops[2]))))
2227 emit_insn (gen_pred_merge (mode, ops[0], ops[2], ops[2], ops[0], ops[1],
2228 force_vector_length_operand (ops[5]), ops[6],
2229 ops[8]));
2230 return true;
2234 gen_avl_for_scalar_move (rtx avl)
2236 /* AVL for scalar move behaves differently for 0 and for values larger than 0. */
2237 if (CONST_INT_P (avl))
2239 /* So we could just set AVL to 1 for any constant other than 0. */
2240 if (rtx_equal_p (avl, const0_rtx))
2241 return const0_rtx;
2242 else
2243 return const1_rtx;
2245 else
2247 /* For a non-constant value, we set any nonzero value to 1 by
2248 `sgtu new_avl,input_avl,zero` + `vsetvli`. */
2249 rtx tmp = gen_reg_rtx (Pmode);
2250 emit_insn (
2251 gen_rtx_SET (tmp, gen_rtx_fmt_ee (GTU, Pmode, avl, const0_rtx)));
2252 return tmp;
2256 /* Expand data movement for tuple modes. */
2257 void
2258 expand_tuple_move (rtx *ops)
2260 unsigned int i;
2261 machine_mode tuple_mode = GET_MODE (ops[0]);
2262 machine_mode subpart_mode = get_subpart_mode (tuple_mode);
2263 poly_int64 subpart_size = GET_MODE_SIZE (subpart_mode);
2264 unsigned int nf = get_nf (tuple_mode);
2265 bool fractional_p = known_lt (subpart_size, BYTES_PER_RISCV_VECTOR);
2267 if (REG_P (ops[0]) && CONST_VECTOR_P (ops[1]))
2269 rtx val;
2270 gcc_assert (can_create_pseudo_p ()
2271 && const_vec_duplicate_p (ops[1], &val));
2272 for (i = 0; i < nf; ++i)
2274 poly_int64 offset = i * subpart_size;
2275 rtx subreg
2276 = simplify_gen_subreg (subpart_mode, ops[0], tuple_mode, offset);
2277 rtx dup = gen_const_vec_duplicate (subpart_mode, val);
2278 emit_move_insn (subreg, dup);
2281 else if (REG_P (ops[0]) && REG_P (ops[1]))
2283 for (i = 0; i < nf; ++i)
2285 int index = i;
2287 /* Take NF = 2 and LMUL = 1 for example:
2289 - move v8 to v9:
2290 vmv1r v10,v9
2291 vmv1r v9,v8
2293 - move v8 to v7:
2294 vmv1r v7,v8
2295 vmv1r v8,v9 */
2296 if (REGNO (ops[0]) > REGNO (ops[1]))
2297 index = nf - 1 - i;
2298 poly_int64 offset = index * subpart_size;
2299 rtx dst_subreg
2300 = simplify_gen_subreg (subpart_mode, ops[0], tuple_mode, offset);
2301 rtx src_subreg
2302 = simplify_gen_subreg (subpart_mode, ops[1], tuple_mode, offset);
2303 emit_insn (gen_rtx_SET (dst_subreg, src_subreg));
2306 else
2308 /* Expand tuple memory data movement. */
2309 gcc_assert (MEM_P (ops[0]) || MEM_P (ops[1]));
2310 rtx offset = gen_int_mode (subpart_size, Pmode);
2311 if (!subpart_size.is_constant ())
2313 emit_move_insn (ops[2], gen_int_mode (BYTES_PER_RISCV_VECTOR, Pmode));
2314 if (fractional_p)
2316 unsigned int factor
2317 = exact_div (BYTES_PER_RISCV_VECTOR, subpart_size)
2318 .to_constant ();
2319 rtx pat
2320 = gen_rtx_ASHIFTRT (Pmode, ops[2],
2321 gen_int_mode (exact_log2 (factor), Pmode));
2322 emit_insn (gen_rtx_SET (ops[2], pat));
2325 if (known_gt (subpart_size, BYTES_PER_RISCV_VECTOR))
2327 unsigned int factor
2328 = exact_div (subpart_size, BYTES_PER_RISCV_VECTOR)
2329 .to_constant ();
2330 rtx pat
2331 = gen_rtx_ASHIFT (Pmode, ops[2],
2332 gen_int_mode (exact_log2 (factor), Pmode));
2333 emit_insn (gen_rtx_SET (ops[2], pat));
2335 offset = ops[2];
2338 /* Non-fractional LMUL has whole register moves that don't require a
2339 vsetvl for VLMAX. */
2340 if (fractional_p)
2341 emit_vlmax_vsetvl (subpart_mode, ops[4]);
2342 if (MEM_P (ops[1]))
2344 /* Load operations. */
2345 emit_move_insn (ops[3], XEXP (ops[1], 0));
2346 for (i = 0; i < nf; i++)
2348 rtx subreg = simplify_gen_subreg (subpart_mode, ops[0],
2349 tuple_mode, i * subpart_size);
2350 if (i != 0)
2352 rtx new_addr = gen_rtx_PLUS (Pmode, ops[3], offset);
2353 emit_insn (gen_rtx_SET (ops[3], new_addr));
2355 rtx mem = gen_rtx_MEM (subpart_mode, ops[3]);
2357 if (fractional_p)
2359 rtx operands[] = {subreg, mem};
2360 emit_vlmax_insn_lra (code_for_pred_mov (subpart_mode),
2361 UNARY_OP, operands, ops[4]);
2363 else
2364 emit_move_insn (subreg, mem);
2367 else
2369 /* Store operations. */
2370 emit_move_insn (ops[3], XEXP (ops[0], 0));
2371 for (i = 0; i < nf; i++)
2373 rtx subreg = simplify_gen_subreg (subpart_mode, ops[1],
2374 tuple_mode, i * subpart_size);
2375 if (i != 0)
2377 rtx new_addr = gen_rtx_PLUS (Pmode, ops[3], offset);
2378 emit_insn (gen_rtx_SET (ops[3], new_addr));
2380 rtx mem = gen_rtx_MEM (subpart_mode, ops[3]);
2382 if (fractional_p)
2384 rtx operands[] = {mem, subreg};
2385 emit_vlmax_insn_lra (code_for_pred_mov (subpart_mode),
2386 UNARY_OP, operands, ops[4]);
2388 else
2389 emit_move_insn (mem, subreg);
2395 /* Return the vectorization machine mode for RVV according to LMUL. */
2396 machine_mode
2397 preferred_simd_mode (scalar_mode mode)
2399 if (autovec_use_vlmax_p ())
2401 /* We use BYTES_PER_RISCV_VECTOR (the LMUL = 1 byte size) multiplied by
2402 rvv_max_lmul as the vector size and divide it by the scalar size to
2403 calculate the NUNITS of the auto-vectorization mode. */
2404 poly_uint64 nunits;
2405 poly_uint64 vector_size = BYTES_PER_RISCV_VECTOR * TARGET_MAX_LMUL;
2406 poly_uint64 scalar_size = GET_MODE_SIZE (mode);
2407 /* Disable vectorization when we can't find an RVV mode for it.
2408 E.g. -march=rv64gc_zve32x doesn't have a vector mode to vectorize
2409 a double (DFmode) type. */
2410 if (!multiple_p (vector_size, scalar_size, &nunits))
2411 return word_mode;
2412 machine_mode rvv_mode;
2413 if (get_vector_mode (mode, nunits).exists (&rvv_mode))
2414 return rvv_mode;
2416 return word_mode;
2419 /* Use merge approach to initialize the vector with repeating sequence.
2420 v = {a, b, a, b, a, b, a, b}.
2422 v = broadcast (a).
2423 mask = 0b01010101....
2424 v = merge (v, b, mask)
2426 static void
2427 expand_vector_init_merge_repeating_sequence (rtx target,
2428 const rvv_builder &builder)
2430 /* We can't use BIT mode (BI) directly to generate mask = 0b01010...
2431 since we don't have such an instruction in RVV.
2432 Instead, we should use an INT mode (QI/HI/SI/DI) with an integer
2433 move instruction to generate the mask data we want. */
2434 machine_mode mask_bit_mode = get_mask_mode (builder.mode ());
2435 machine_mode mask_int_mode
2436 = get_repeating_sequence_dup_machine_mode (builder, mask_bit_mode);
2437 uint64_t full_nelts = builder.full_nelts ().to_constant ();
2439 /* Step 1: Broadcast the first pattern. */
2440 rtx ops[] = {target, force_reg (builder.inner_mode (), builder.elt (0))};
2441 emit_vlmax_insn (code_for_pred_broadcast (builder.mode ()),
2442 UNARY_OP, ops);
2443 /* Step 2: Merge the remaining iterations of the pattern. */
2444 for (unsigned int i = 1; i < builder.npatterns (); i++)
2446 /* Step 2-1: Generate mask register v0 for each merge. */
2447 rtx merge_mask
2448 = builder.get_merge_scalar_mask (i, GET_MODE_INNER (mask_int_mode));
2449 rtx mask = gen_reg_rtx (mask_bit_mode);
2450 rtx dup = gen_reg_rtx (mask_int_mode);
2452 if (full_nelts <= builder.inner_bits_size ()) /* vmv.s.x. */
2454 rtx ops[] = {dup, merge_mask};
2455 emit_nonvlmax_insn (code_for_pred_broadcast (GET_MODE (dup)),
2456 SCALAR_MOVE_OP, ops, CONST1_RTX (Pmode));
2458 else /* vmv.v.x. */
2460 rtx ops[] = {dup,
2461 force_reg (GET_MODE_INNER (mask_int_mode), merge_mask)};
2462 rtx vl = gen_int_mode (CEIL (full_nelts, builder.inner_bits_size ()),
2463 Pmode);
2464 emit_nonvlmax_insn (code_for_pred_broadcast (mask_int_mode), UNARY_OP,
2465 ops, vl);
2468 emit_move_insn (mask, gen_lowpart (mask_bit_mode, dup));
2470 /* Step 2-2: Merge pattern according to the mask. */
2471 rtx ops[] = {target, target, builder.elt (i), mask};
2472 emit_vlmax_insn (code_for_pred_merge_scalar (GET_MODE (target)),
2473 MERGE_OP, ops);
2477 /* Use slideup approach to combine the vectors.
2478 v = {a, a, a, a, b, b, b, b}
2480 First:
2481 v1 = {a, a, a, a, a, a, a, a}
2482 v2 = {b, b, b, b, b, b, b, b}
2483 v = slideup (v1, v2, nelt / 2)
2485 static void
2486 expand_vector_init_slideup_combine_sequence (rtx target,
2487 const rvv_builder &builder)
2489 machine_mode mode = GET_MODE (target);
2490 int nelts = builder.full_nelts ().to_constant ();
2491 rtx first_elt = builder.elt (0);
2492 rtx last_elt = builder.elt (nelts - 1);
2493 rtx low = expand_vector_broadcast (mode, first_elt);
2494 rtx high = expand_vector_broadcast (mode, last_elt);
2495 insn_code icode = code_for_pred_slide (UNSPEC_VSLIDEUP, mode);
2496 rtx ops[] = {target, low, high, gen_int_mode (nelts / 2, Pmode)};
2497 emit_vlmax_insn (icode, SLIDEUP_OP_MERGE, ops);
2500 /* Use merge approach to merge a scalar into a vector.
2501 v = {a, a, a, a, a, a, b, b}
2503 v1 = {a, a, a, a, a, a, a, a}
2504 scalar = b
2505 mask = {0, 0, 0, 0, 0, 0, 1, 1}
2507 static void
2508 expand_vector_init_merge_combine_sequence (rtx target,
2509 const rvv_builder &builder)
2511 machine_mode mode = GET_MODE (target);
2512 machine_mode imode = builder.int_mode ();
2513 machine_mode mmode = builder.mask_mode ();
2514 int nelts = builder.full_nelts ().to_constant ();
2515 int leading_ndups = builder.count_dups (0, nelts - 1, 1);
2516 if ((leading_ndups > 255 && GET_MODE_INNER (imode) == QImode)
2517 || riscv_get_v_regno_alignment (imode) > 1)
2518 imode = get_vector_mode (HImode, nelts).require ();
2520 /* Generate vid = { 0, 1, 2, ..., n }. */
2521 rtx vid = gen_reg_rtx (imode);
2522 expand_vec_series (vid, const0_rtx, const1_rtx);
2524 /* Generate mask. */
2525 rtx mask = gen_reg_rtx (mmode);
2526 insn_code icode = code_for_pred_cmp_scalar (imode);
2527 rtx index = gen_int_mode (leading_ndups - 1, builder.inner_int_mode ());
2528 rtx dup_rtx = gen_rtx_VEC_DUPLICATE (imode, index);
2529 /* vmsgtu.vi/vmsgtu.vx. */
2530 rtx cmp = gen_rtx_fmt_ee (GTU, mmode, vid, dup_rtx);
2531 rtx sel = builder.elt (nelts - 1);
2532 rtx mask_ops[] = {mask, cmp, vid, index};
2533 emit_vlmax_insn (icode, COMPARE_OP, mask_ops);
2535 /* Duplicate the first elements. */
2536 rtx dup = expand_vector_broadcast (mode, builder.elt (0));
2537 /* Merge scalar into vector according to mask. */
2538 rtx merge_ops[] = {target, dup, sel, mask};
2539 icode = code_for_pred_merge_scalar (mode);
2540 emit_vlmax_insn (icode, MERGE_OP, merge_ops);
2543 /* Initialize register TARGET from the elements in PARALLEL rtx VALS. */
2545 void
2546 expand_vec_init (rtx target, rtx vals)
2548 machine_mode mode = GET_MODE (target);
2549 int nelts = XVECLEN (vals, 0);
2551 rvv_builder v (mode, nelts, 1);
2552 for (int i = 0; i < nelts; i++)
2553 v.quick_push (XVECEXP (vals, 0, i));
2554 v.finalize ();
2556 /* If the sequence is v = { a, a, a, a } just broadcast an element. */
2557 if (v.is_repeating_sequence ())
2559 machine_mode mode = GET_MODE (target);
2560 rtx dup = expand_vector_broadcast (mode, v.elt (0));
2561 emit_move_insn (target, dup);
2562 return;
2565 if (nelts > 3)
2567 /* Case 1: Convert v = { a, b, a, b } into v = { ab, ab }. */
2568 if (v.can_duplicate_repeating_sequence_p ())
2570 rtx ele = v.get_merged_repeating_sequence ();
2571 rtx dup = expand_vector_broadcast (v.new_mode (), ele);
2572 emit_move_insn (target, gen_lowpart (mode, dup));
2573 return;
2576 /* Case 2: Optimize repeating sequence cases that Case 1 cannot
2577 handle, when it is profitable. For example:
2578 ELEMENT BITSIZE = 64.
2579 v = {a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b}.
2580 We can't find a vector mode for "ab" which will be combined into
2581 128-bit element to duplicate. */
2582 if (v.repeating_sequence_use_merge_profitable_p ())
2584 expand_vector_init_merge_repeating_sequence (target, v);
2585 return;
2588 /* Case 3: Optimize combine sequence.
2589 E.g. v = {a, a, a, a, a, a, a, a, b, b, b, b, b, b, b, b}.
2590 We can combine:
2591 v1 = {a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a}.
2593 v2 = {b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b}.
2594 by slideup. */
2595 if (v.combine_sequence_use_slideup_profitable_p ())
2597 expand_vector_init_slideup_combine_sequence (target, v);
2598 return;
2601 /* Case 4: Optimize combine sequence.
2602 E.g. v = {a, a, a, a, a, a, a, a, a, a, a, b, b, b, b, b}.
2604 Generate vector:
2605 v = {a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a}.
2607 Generate mask:
2608 mask = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1}.
2610 Merge b into v by mask:
2611 v = {a, a, a, a, a, a, a, a, a, a, a, b, b, b, b, b}. */
2612 if (v.combine_sequence_use_merge_profitable_p ())
2614 expand_vector_init_merge_combine_sequence (target, v);
2615 return;
2619 /* Optimize a sequence with trailing identical elements:
2620 v = {y, y2, y3, y4, y5, x, x, x, x, x, x, x, x, x, x, x}; */
2621 if (!expand_vector_init_trailing_same_elem (target, v, nelts))
2622 /* Handle common situation by vslide1down. This function can handle any
2623 situation of vec_init<mode>. Only the cases that are not optimized above
2624 will fall through here. */
2625 expand_vector_init_insert_elems (target, v, nelts);
2628 /* Get insn code for corresponding comparison. */
2630 static insn_code
2631 get_cmp_insn_code (rtx_code code, machine_mode mode)
2633 insn_code icode;
2634 switch (code)
2636 case EQ:
2637 case NE:
2638 case LE:
2639 case LEU:
2640 case GT:
2641 case GTU:
2642 case LTGT:
2643 icode = code_for_pred_cmp (mode);
2644 break;
2645 case LT:
2646 case LTU:
2647 case GE:
2648 case GEU:
2649 if (FLOAT_MODE_P (mode))
2650 icode = code_for_pred_cmp (mode);
2651 else
2652 icode = code_for_pred_ltge (mode);
2653 break;
2654 default:
2655 gcc_unreachable ();
2657 return icode;
2660 /* This hook gives the vectorizer more vector mode options. We want it to not
2661 only try modes with the maximum number of units a full vector can hold but
2662 for example also half the number of units for a smaller elements size.
2663 Such vectors can be promoted to a full vector of widened elements
2664 (still with the same number of elements, essentially vectorizing at a
2665 fixed number of units rather than a fixed number of bytes). */
2666 unsigned int
2667 autovectorize_vector_modes (vector_modes *modes, bool)
2669 if (autovec_use_vlmax_p ())
2671 poly_uint64 full_size = BYTES_PER_RISCV_VECTOR * TARGET_MAX_LMUL;
2673 /* Start with a RVV<LMUL>QImode where LMUL is the number of units that
2674 fit a whole vector.
2675 Then try LMUL = nunits / 2, nunits / 4 and nunits / 8 which
2676 is guided by the extensions we have available (vf2, vf4 and vf8).
2678 - full_size: Try using full vectors for all element types.
2679 - full_size / 2:
2680 Try using 16-bit containers for 8-bit elements and full vectors
2681 for wider elements.
2682 - full_size / 4:
2683 Try using 32-bit containers for 8-bit and 16-bit elements and
2684 full vectors for wider elements.
2685 - full_size / 8:
2686 Try using 64-bit containers for all element types. */
2687 static const int rvv_factors[] = {1, 2, 4, 8, 16, 32, 64};
2688 for (unsigned int i = 0; i < sizeof (rvv_factors) / sizeof (int); i++)
2690 poly_uint64 units;
2691 machine_mode mode;
2692 if (can_div_trunc_p (full_size, rvv_factors[i], &units)
2693 && get_vector_mode (QImode, units).exists (&mode))
2694 modes->safe_push (mode);
2697 /* Push all VLSmodes according to TARGET_MIN_VLEN. */
2698 unsigned int i = 0;
2699 unsigned int base_size = TARGET_MIN_VLEN * TARGET_MAX_LMUL / 8;
2700 unsigned int size = base_size;
2701 machine_mode mode;
2702 while (size > 0 && get_vector_mode (QImode, size).exists (&mode))
2704 if (vls_mode_valid_p (mode))
2705 modes->safe_push (mode);
2707 i++;
2708 size = base_size / (1U << i);
2710 /* Enable LOOP_VINFO comparison in COST model. */
2711 return VECT_COMPARE_COSTS;
2714 /* Return true if we can find the related MODE according to default LMUL. */
2715 static bool
2716 can_find_related_mode_p (machine_mode vector_mode, scalar_mode element_mode,
2717 poly_uint64 *nunits)
2719 if (!autovec_use_vlmax_p ())
2720 return false;
2721 if (riscv_v_ext_vector_mode_p (vector_mode)
2722 && multiple_p (BYTES_PER_RISCV_VECTOR * TARGET_MAX_LMUL,
2723 GET_MODE_SIZE (element_mode), nunits))
2724 return true;
2725 if (riscv_v_ext_vls_mode_p (vector_mode)
2726 && multiple_p (TARGET_MIN_VLEN * TARGET_MAX_LMUL,
2727 GET_MODE_SIZE (element_mode), nunits))
2728 return true;
2729 return false;
2732 /* If the given VECTOR_MODE is an RVV mode, first get the largest number
2733 of units that fit into a full vector at the given ELEMENT_MODE.
2734 We will have the vectorizer call us with a successively decreasing
2735 number of units (as specified in autovectorize_vector_modes).
2736 The starting mode is always the one specified by preferred_simd_mode. */
2737 opt_machine_mode
2738 vectorize_related_mode (machine_mode vector_mode, scalar_mode element_mode,
2739 poly_uint64 nunits)
2741 /* TODO: We will support RVV VLS auto-vectorization mode in the future. */
2742 poly_uint64 min_units;
2743 if (can_find_related_mode_p (vector_mode, element_mode, &min_units))
2745 machine_mode rvv_mode;
2746 if (maybe_ne (nunits, 0U))
2748 /* If we were given a number of units NUNITS, try to find an
2749 RVV vector mode of inner mode ELEMENT_MODE with the same
2750 number of units. */
2751 if (multiple_p (min_units, nunits)
2752 && get_vector_mode (element_mode, nunits).exists (&rvv_mode))
2753 return rvv_mode;
2755 else
2757 /* Look for a vector mode with the same number of units as the
2758 VECTOR_MODE we were given. We keep track of the minimum
2759 number of units so far which determines the smallest necessary
2760 but largest possible, suitable mode for vectorization. */
2761 min_units = ordered_min (min_units, GET_MODE_SIZE (vector_mode));
2762 if (get_vector_mode (element_mode, min_units).exists (&rvv_mode))
2763 return rvv_mode;
2767 return default_vectorize_related_mode (vector_mode, element_mode, nunits);
2770 /* Expand an RVV comparison. */
2772 void
2773 expand_vec_cmp (rtx target, rtx_code code, rtx op0, rtx op1, rtx mask,
2774 rtx maskoff)
2776 machine_mode mask_mode = GET_MODE (target);
2777 machine_mode data_mode = GET_MODE (op0);
2778 insn_code icode = get_cmp_insn_code (code, data_mode);
2780 if (code == LTGT)
2782 rtx lt = gen_reg_rtx (mask_mode);
2783 rtx gt = gen_reg_rtx (mask_mode);
2784 expand_vec_cmp (lt, LT, op0, op1, mask, maskoff);
2785 expand_vec_cmp (gt, GT, op0, op1, mask, maskoff);
2786 icode = code_for_pred (IOR, mask_mode);
2787 rtx ops[] = {target, lt, gt};
2788 emit_vlmax_insn (icode, BINARY_MASK_OP, ops);
2789 return;
2792 rtx cmp = gen_rtx_fmt_ee (code, mask_mode, op0, op1);
2793 if (!mask && !maskoff)
2795 rtx ops[] = {target, cmp, op0, op1};
2796 emit_vlmax_insn (icode, COMPARE_OP, ops);
2798 else
2800 rtx ops[] = {target, mask, maskoff, cmp, op0, op1};
2801 emit_vlmax_insn (icode, COMPARE_OP_MU, ops);
2805 /* Expand an RVV floating-point comparison:
2807 If CAN_INVERT_P is true, the caller can also handle inverted results;
2808 return true if the result is in fact inverted. */
2810 bool
2811 expand_vec_cmp_float (rtx target, rtx_code code, rtx op0, rtx op1,
2812 bool can_invert_p)
2814 machine_mode mask_mode = GET_MODE (target);
2815 machine_mode data_mode = GET_MODE (op0);
2817 /* If can_invert_p = true:
2818 It suffices to implement a u>= b as !(a < b) but with the NaNs masked off:
2820 vmfeq.vv v0, va, va
2821 vmfeq.vv v1, vb, vb
2822 vmand.mm v0, v0, v1
2823 vmflt.vv v0, va, vb, v0.t
2824 vmnot.m v0, v0
2826 And, if !HONOR_SNANS, then you can remove the vmand.mm by masking the
2827 second vmfeq.vv:
2829 vmfeq.vv v0, va, va
2830 vmfeq.vv v0, vb, vb, v0.t
2831 vmflt.vv v0, va, vb, v0.t
2832 vmnot.m v0, v0
2834 If can_invert_p = false:
2836 # Example of implementing isgreater()
2837 vmfeq.vv v0, va, va # Only set where A is not NaN.
2838 vmfeq.vv v1, vb, vb # Only set where B is not NaN.
2839 vmand.mm v0, v0, v1 # Only set where A and B are ordered,
2840 vmfgt.vv v0, va, vb, v0.t # so only set flags on ordered values.
2843 rtx eq0 = gen_reg_rtx (mask_mode);
2844 rtx eq1 = gen_reg_rtx (mask_mode);
2845 switch (code)
2847 case EQ:
2848 case NE:
2849 case LT:
2850 case LE:
2851 case GT:
2852 case GE:
2853 case LTGT:
2854 /* There is native support for the comparison. */
2855 expand_vec_cmp (target, code, op0, op1);
2856 return false;
2857 case UNEQ:
2858 case ORDERED:
2859 case UNORDERED:
2860 case UNLT:
2861 case UNLE:
2862 case UNGT:
2863 case UNGE:
2864 /* vmfeq.vv v0, va, va */
2865 expand_vec_cmp (eq0, EQ, op0, op0);
2866 if (HONOR_SNANS (data_mode))
2869 vmfeq.vv v1, vb, vb
2870 vmand.mm v0, v0, v1
2872 expand_vec_cmp (eq1, EQ, op1, op1);
2873 insn_code icode = code_for_pred (AND, mask_mode);
2874 rtx ops[] = {eq0, eq0, eq1};
2875 emit_vlmax_insn (icode, BINARY_MASK_OP, ops);
2877 else
2879 /* vmfeq.vv v0, vb, vb, v0.t */
2880 expand_vec_cmp (eq0, EQ, op1, op1, eq0, eq0);
2882 break;
2883 default:
2884 gcc_unreachable ();
2887 if (code == ORDERED)
2889 emit_move_insn (target, eq0);
2890 return false;
2893 /* There is native support for the inverse comparison. */
2894 code = reverse_condition_maybe_unordered (code);
2895 if (code == ORDERED)
2896 emit_move_insn (target, eq0);
2897 else
2898 expand_vec_cmp (eq0, code, op0, op1, eq0, eq0);
2900 if (can_invert_p)
2902 emit_move_insn (target, eq0);
2903 return true;
2906 /* We use one_cmpl<mode>2 so that the combine pass can combine mask
2907 instructions into vmand.mm/vmnor.mm/vmnand.mm/vmxnor.mm. */
2908 emit_insn (gen_rtx_SET (target, gen_rtx_NOT (mask_mode, eq0)));
2909 return false;
2912 /* Modulo all SEL indices to ensure they are all in the range [0, MAX_SEL].
2913 MAX_SEL is nunits - 1 if rtx_equal_p (op0, op1). Otherwise, it is
2914 2 * nunits - 1. */
2915 static rtx
2916 modulo_sel_indices (rtx op0, rtx op1, rtx sel)
2918 rtx sel_mod;
2919 machine_mode sel_mode = GET_MODE (sel);
2920 poly_uint64 nunits = GET_MODE_NUNITS (sel_mode);
2921 poly_uint64 max_sel = rtx_equal_p (op0, op1) ? nunits - 1 : 2 * nunits - 1;
2922 /* If SEL is a variable-length CONST_VECTOR, we don't need to modulo it.
2923 Likewise, if SEL is constant-length with all indices within [0, MAX_SEL],
2924 there is no need to modulo them. */
2925 if (CONST_VECTOR_P (sel)
2926 && (!nunits.is_constant () || const_vec_all_in_range_p (sel, 0, max_sel)))
2927 sel_mod = sel;
2928 else
2930 rtx mod = gen_const_vector_dup (sel_mode, max_sel);
2931 sel_mod
2932 = expand_simple_binop (sel_mode, AND, sel, mod, NULL, 0, OPTAB_DIRECT);
2934 return sel_mod;
2937 /* Implement vec_perm<mode>. */
2939 void
2940 expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
2942 machine_mode data_mode = GET_MODE (target);
2943 machine_mode sel_mode = GET_MODE (sel);
2944 poly_uint64 nunits = GET_MODE_NUNITS (sel_mode);
2946 /* Check if the selector only references the first value vector, i.e. each
2947 select index is in the range [0, nunits - 1]. A single vrgather instruction
2948 is enough. Since we will use vrgatherei16.vv for variable-length vectors,
2949 the index is never out of range and we don't need to modulo it. */
2950 if (nunits.is_constant () && const_vec_all_in_range_p (sel, 0, nunits - 1))
2952 emit_vlmax_gather_insn (target, op0, sel);
2953 return;
2956 /* Check if all the indices are same. */
2957 rtx elt;
2958 if (const_vec_duplicate_p (sel, &elt))
2960 poly_uint64 value = rtx_to_poly_int64 (elt);
2961 rtx op = op0;
2962 if (maybe_gt (value, nunits - 1))
2964 sel = gen_const_vector_dup (sel_mode, value - nunits);
2965 op = op1;
2967 emit_vlmax_gather_insn (target, op, sel);
2970 /* Note: vec_perm indices are supposed to wrap when they go beyond the
2971 size of the two value vectors, i.e. the upper bits of the indices
2972 are effectively ignored. RVV vrgather instead produces 0 for any
2973 out-of-range indices, so we need to modulo all the vec_perm indices
2974 to ensure they are all in range of [0, nunits - 1] when op0 == op1
2975 or all in range of [0, 2 * nunits - 1] when op0 != op1. */
2976 rtx sel_mod = modulo_sel_indices (op0, op1, sel);
2978 /* Check if the two value vectors are the same. */
2979 if (rtx_equal_p (op0, op1))
2981 emit_vlmax_gather_insn (target, op0, sel_mod);
2982 return;
2985 /* The following sequence handles the case of
2986 __builtin_shufflevector (vec1, vec2, index...), where the index can be
2987 any value in the range [0, 2 * nunits - 1]. */
2988 machine_mode mask_mode;
2989 mask_mode = get_mask_mode (data_mode);
2990 rtx mask = gen_reg_rtx (mask_mode);
2991 rtx max_sel = gen_const_vector_dup (sel_mode, nunits);
2993 /* Step 1: generate a mask that should select everything >= nunits into the
2994 * mask. */
2995 expand_vec_cmp (mask, GEU, sel_mod, max_sel);
2997 /* Step 2: gather every op0 value indexed by sel into target;
2998 we don't need to care about the result of the elements
2999 whose index >= nunits. */
3000 emit_vlmax_gather_insn (target, op0, sel_mod);
3002 /* Step 3: shift the range from (nunits, max_of_mode] to
3003 [0, max_of_mode - nunits]. */
3004 rtx tmp = gen_reg_rtx (sel_mode);
3005 rtx ops[] = {tmp, sel_mod, max_sel};
3006 emit_vlmax_insn (code_for_pred (MINUS, sel_mode), BINARY_OP, ops);
3008 /* Step 4: gather those into the previously masked-out elements
3009 of target. */
3010 emit_vlmax_masked_gather_mu_insn (target, op1, tmp, mask);
3013 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST for RVV. */
3015 /* vec_perm support. */
3017 struct expand_vec_perm_d
3019 rtx target, op0, op1;
3020 vec_perm_indices perm;
3021 machine_mode vmode;
3022 machine_mode op_mode;
3023 bool one_vector_p;
3024 bool testing_p;
3027 /* Return the appropriate index mode for gather instructions. */
3028 opt_machine_mode
3029 get_gather_index_mode (struct expand_vec_perm_d *d)
3031 machine_mode sel_mode = related_int_vector_mode (d->vmode).require ();
3032 poly_uint64 nunits = GET_MODE_NUNITS (d->vmode);
3034 if (GET_MODE_INNER (d->vmode) == QImode)
3036 if (nunits.is_constant ())
3038 /* If the indices form an LMUL8 CONST_VECTOR and any element value
3039 exceeds the range 0 ~ 255, forbid such a permutation
3040 since we would need a vector HI mode to hold the indices and
3041 we don't have it. */
3042 if (!d->perm.all_in_range_p (0, 255)
3043 && !get_vector_mode (HImode, nunits).exists (&sel_mode))
3044 return opt_machine_mode ();
3046 else
3048 /* Permuting two SEW8 variable-length vectors needs vrgatherei16.vv.
3049 Otherwise, it could overflow the index range. */
3050 if (!get_vector_mode (HImode, nunits).exists (&sel_mode))
3051 return opt_machine_mode ();
3054 else if (riscv_get_v_regno_alignment (sel_mode) > 1
3055 && GET_MODE_INNER (sel_mode) != HImode)
3056 sel_mode = get_vector_mode (HImode, nunits).require ();
3057 return sel_mode;
3060 /* Recognize the patterns where we can use a merge operation to shuffle the
3061 vectors. The value of each element (index i) in the selector can only be
3062 either i or nunits + i. We will check that the pattern is actually monotonic.
3064 E.g.
3065 v = VEC_PERM_EXPR (v0, v1, selector),
3066 selector = { 0, nunits + 1, 2, nunits + 3, 4, nunits + 5, ... }
3068 We can transform such pattern into:
3070 v = vcond_mask (v0, v1, mask),
3071 mask = { 0, 1, 0, 1, 0, 1, ... }. */
3073 static bool
3074 shuffle_merge_patterns (struct expand_vec_perm_d *d)
3076 machine_mode vmode = d->vmode;
3077 machine_mode sel_mode = related_int_vector_mode (vmode).require ();
3078 int n_patterns = d->perm.encoding ().npatterns ();
3079 poly_int64 vec_len = d->perm.length ();
3081 for (int i = 0; i < n_patterns; ++i)
3082 if (!known_eq (d->perm[i], i) && !known_eq (d->perm[i], vec_len + i))
3083 return false;
3085 /* Check the pattern is monotonic here, otherwise, return false. */
3086 for (int i = n_patterns; i < n_patterns * 2; i++)
3087 if (!d->perm.series_p (i, n_patterns, i, n_patterns)
3088 && !d->perm.series_p (i, n_patterns, vec_len + i, n_patterns))
3089 return false;
3091 /* We need to use a precomputed mask for such a situation, and such a mask
3092 can only be computed for compile-time known size modes. */
3093 bool indices_fit_selector_p
3094 = GET_MODE_BITSIZE (GET_MODE_INNER (vmode)) > 8 || known_lt (vec_len, 256);
3095 if (!indices_fit_selector_p && !vec_len.is_constant ())
3096 return false;
3098 if (d->testing_p)
3099 return true;
3101 machine_mode mask_mode = get_mask_mode (vmode);
3102 rtx mask = gen_reg_rtx (mask_mode);
3104 if (indices_fit_selector_p)
3106 /* MASK = SELECTOR < NUNITS ? 1 : 0. */
3107 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
3108 rtx x = gen_int_mode (vec_len, GET_MODE_INNER (sel_mode));
3109 insn_code icode = code_for_pred_cmp_scalar (sel_mode);
3110 rtx cmp = gen_rtx_fmt_ee (LTU, mask_mode, sel, x);
3111 rtx ops[] = {mask, cmp, sel, x};
3112 emit_vlmax_insn (icode, COMPARE_OP, ops);
3114 else
3116 /* For EEW8, NUNITS may be larger than 255, so we can't use vmsltu
3117 directly to generate the selector mask; instead, we can only use a
3118 precomputed mask.
3120 E.g. selector = <0, 257, 2, 259> for EEW8 vector with NUNITS = 256, we
3121 don't have a QImode scalar register to hold larger than 255.
3122 We also cannot hold that in a vector QImode register if LMUL = 8, and,
3123 since there is no larger HI mode vector we cannot create a larger
3124 selector.
3126 As the mask is a simple {0, 1, ...} pattern and the length is known we
3127 can store it in a scalar register and broadcast it to a mask register.
3129 gcc_assert (vec_len.is_constant ());
3130 int size = CEIL (GET_MODE_NUNITS (mask_mode).to_constant (), 8);
3131 machine_mode mode = get_vector_mode (QImode, size).require ();
3132 rtx tmp = gen_reg_rtx (mode);
3133 rvv_builder v (mode, 1, size);
3134 for (int i = 0; i < vec_len.to_constant () / 8; i++)
3136 uint8_t value = 0;
3137 for (int j = 0; j < 8; j++)
3139 int index = i * 8 + j;
3140 if (known_lt (d->perm[index], 256))
3141 value |= 1 << j;
3143 v.quick_push (gen_int_mode (value, QImode));
3145 emit_move_insn (tmp, v.build ());
3146 emit_move_insn (mask, gen_lowpart (mask_mode, tmp));
3149 /* TARGET = MASK ? OP0 : OP1. */
3150 /* swap op0 and op1 since the order is opposite to pred_merge. */
3151 rtx ops2[] = {d->target, d->op1, d->op0, mask};
3152 emit_vlmax_insn (code_for_pred_merge (vmode), MERGE_OP, ops2);
3153 return true;
3156 /* Recognize consecutive indices for which we can use a single
3157 vrgather.v[x|i] to shuffle the vectors.
3159 e.g. short[8] = VEC_PERM_EXPR <a, a, {0,1,0,1,0,1,0,1}>
3160 Use SEW = 32, index = 1 vrgather.vi to get the result. */
3161 static bool
3162 shuffle_consecutive_patterns (struct expand_vec_perm_d *d)
3164 machine_mode vmode = d->vmode;
3165 scalar_mode smode = GET_MODE_INNER (vmode);
3166 poly_int64 vec_len = d->perm.length ();
3167 HOST_WIDE_INT elt;
3169 if (!vec_len.is_constant () || !d->perm[0].is_constant (&elt))
3170 return false;
3171 int vlen = vec_len.to_constant ();
3173 /* Compute the last element index of consecutive pattern from the leading
3174 consecutive elements. */
3175 int last_consecutive_idx = -1;
3176 int consecutive_num = -1;
3177 for (int i = 1; i < vlen; i++)
3179 if (maybe_ne (d->perm[i], d->perm[i - 1] + 1))
3180 break;
3181 last_consecutive_idx = i;
3182 consecutive_num = last_consecutive_idx + 1;
3185 int new_vlen = vlen / consecutive_num;
3186 if (last_consecutive_idx < 0 || consecutive_num == vlen
3187 || !pow2p_hwi (consecutive_num) || !pow2p_hwi (new_vlen))
3188 return false;
3189 /* VEC_PERM <..., (index, index + 1, ... index + consecutive_num - 1)>.
3190 All elements of index, index + 1, ... index + consecutive_num - 1 should
3191 be located in the same vector. */
3192 if (maybe_ge (d->perm[0], vec_len)
3193 != maybe_ge (d->perm[last_consecutive_idx], vec_len))
3194 return false;
3195 /* If a vector has 8 elements, we allow optimizations on consecutive
3196 patterns e.g. <0, 1, 2, 3, 0, 1, 2, 3> or <4, 5, 6, 7, 4, 5, 6, 7>.
3197 Other patterns like <2, 3, 4, 5, 2, 3, 4, 5> are not feasible patterns
3198 to be optimized. */
3199 if (d->perm[0].to_constant () % consecutive_num != 0)
3200 return false;
3201 unsigned int container_bits = consecutive_num * GET_MODE_BITSIZE (smode);
3202 if (container_bits > 64)
3203 return false;
3204 else if (container_bits == 64)
3206 if (!TARGET_VECTOR_ELEN_64)
3207 return false;
3208 else if (FLOAT_MODE_P (smode) && !TARGET_VECTOR_ELEN_FP_64)
3209 return false;
3212 /* Check that the rest of the elements follow the same consecutive pattern. */
3213 for (int i = consecutive_num; i < vlen; i++)
3214 if (maybe_ne (d->perm[i], d->perm[i % consecutive_num]))
3215 return false;
3217 if (FLOAT_MODE_P (smode))
3218 smode = float_mode_for_size (container_bits).require ();
3219 else
3220 smode = int_mode_for_size (container_bits, 0).require ();
3221 if (!get_vector_mode (smode, new_vlen).exists (&vmode))
3222 return false;
3223 machine_mode sel_mode = related_int_vector_mode (vmode).require ();
3225 /* Success! */
3226 if (d->testing_p)
3227 return true;
3229 int index = elt / consecutive_num;
3230 if (index >= new_vlen)
3231 index = index - new_vlen;
3232 rtx sel = gen_const_vector_dup (sel_mode, index);
3233 rtx op = elt >= vlen ? d->op0 : d->op1;
3234 emit_vlmax_gather_insn (gen_lowpart (vmode, d->target),
3235 gen_lowpart (vmode, op), sel);
3236 return true;
3239 /* Recognize the patterns where we can use a compress operation to shuffle
3240 the vectors. The perm selector of a compress pattern is divided into 2 parts:
3241 The first part is arbitrary index numbers < NUNITS.
3242 The second part is the last N consecutive index numbers >= NUNITS.
3244 E.g.
3245 v = VEC_PERM_EXPR (v0, v1, selector),
3246 selector = { 0, 2, 6, 7 }
3248 We can transform such pattern into:
3250 op1 = vcompress (op0, mask)
3251 mask = { 1, 0, 1, 0 }
3252 v = op1. */
3254 static bool
3255 shuffle_compress_patterns (struct expand_vec_perm_d *d)
3257 machine_mode vmode = d->vmode;
3258 poly_int64 vec_len = d->perm.length ();
3260 if (!vec_len.is_constant ())
3261 return false;
3263 int vlen = vec_len.to_constant ();
3265 /* It's not worthwhile if the compress pattern has fewer than 4 elements,
3266 and we can't modulo indices for the compress pattern. */
3267 if (known_ge (d->perm[vlen - 1], vlen * 2) || vlen < 4)
3268 return false;
3270 /* Compress pattern doesn't work for one vector. */
3271 if (d->one_vector_p)
3272 return false;
3274 /* The compress point is the index from which all selector values with
3275 index i >= compress point form a consecutive increasing series and
3276 each selector value is >= NUNITS. In this case, we can compress all
3277 elements with i < compress point into op1. */
3278 int compress_point = -1;
3279 for (int i = 0; i < vlen; i++)
3281 if (compress_point < 0 && known_ge (d->perm[i], vec_len))
3283 compress_point = i;
3284 break;
3288 /* We don't apply compress approach if we can't find the compress point. */
3289 if (compress_point < 0)
3290 return false;
3292 /* We can only apply compress approach when all index values from 0 to
3293 compress point are increasing. */
3294 for (int i = 1; i < compress_point; i++)
3295 if (maybe_le (d->perm[i], d->perm[i - 1]))
3296 return false;
3298 /* It must be series increasing from compress point. */
3299 for (int i = 1 + compress_point; i < vlen; i++)
3300 if (maybe_ne (d->perm[i], d->perm[i - 1] + 1))
3301 return false;
3303 /* Success! */
3304 if (d->testing_p)
3305 return true;
3307 /* Check whether we need to slide up op1 to apply the compress approach.
3309 E.g. for index = { 0, 2, 6, 7 }, d->perm[vlen - 1] = 7, which
3310 is 2 * NUNITS - 1, so we don't need to slide up.
3312 For index = { 0, 2, 5, 6 }, we need to slide op1 up before
3313 we apply the compress approach. */
3314 bool need_slideup_p = maybe_ne (d->perm[vlen - 1], 2 * vec_len - 1)
3315 && !const_vec_duplicate_p (d->op1);
3317 /* If we leave it to be handled directly by the general gather,
3318 the code sequence will be:
3319 VECTOR LOAD selector
3320 GEU mask, selector, NUNITS
3321 GATHER dest, op0, selector
3322 SUB selector, selector, NUNITS
3323 GATHER dest, op1, selector, mask
3324 Each ALU operation is considered as COST = 1 and VECTOR LOAD is considered
3325 as COST = 4. So, we consider the general gather handling COST = 9.
3326 TODO: This cost is not accurate, we can adjust it by tune info. */
3327 int general_cost = 9;
3329 /* If we can use compress approach, the code sequence will be:
3330 MASK LOAD mask
3331 COMPRESS op1, op0, mask
3332 If it needs slide up, it will be:
3333 MASK LOAD mask
3334 SLIDEUP op1
3335 COMPRESS op1, op0, mask
3336 By default, mask load COST = 2.
3337 TODO: This cost is not accurate, we can adjust it by tune info. */
3338 int compress_cost = 4;
3340 if (general_cost <= compress_cost)
3341 return false;
3343 /* Build a mask that is true for the op0 elements selected before the compress point. */
3344 machine_mode mask_mode = get_mask_mode (vmode);
3345 rvv_builder builder (mask_mode, vlen, 1);
3346 for (int i = 0; i < vlen; i++)
3348 bool is_compress_index = false;
3349 for (int j = 0; j < compress_point; j++)
3351 if (known_eq (d->perm[j], i))
3353 is_compress_index = true;
3354 break;
3357 if (is_compress_index)
3358 builder.quick_push (CONST1_RTX (BImode));
3359 else
3360 builder.quick_push (CONST0_RTX (BImode));
3362 rtx mask = force_reg (mask_mode, builder.build ());
3364 rtx merge = d->op1;
3365 if (need_slideup_p)
3367 int slideup_cnt = vlen - (d->perm[vlen - 1].to_constant () % vlen) - 1;
3368 merge = gen_reg_rtx (vmode);
3369 rtx ops[] = {merge, d->op1, gen_int_mode (slideup_cnt, Pmode)};
3370 insn_code icode = code_for_pred_slide (UNSPEC_VSLIDEUP, vmode);
3371 emit_vlmax_insn (icode, BINARY_OP, ops);
3374 insn_code icode = code_for_pred_compress (vmode);
3375 rtx ops[] = {d->target, merge, d->op0, mask};
3376 emit_vlmax_insn (icode, COMPRESS_OP_MERGE, ops);
3377 return true;
3380 /* Recognize decompress patterns:
3382 1. VEC_PERM_EXPR op0 and op1
3383 with isel = { 0, nunits, 1, nunits + 1, ... }.
3384 Decompress op0 and op1 vector with the mask = { 0, 1, 0, 1, ... }.
3386 2. VEC_PERM_EXPR op0 and op1
3387 with isel = { 1/2 nunits, 3/2 nunits, 1/2 nunits+1, 3/2 nunits+1,... }.
3388 Slide down op0 and op1 with OFFSET = 1/2 nunits.
3389 Decompress op0 and op1 vector with the mask = { 0, 1, 0, 1, ... }.
3391 static bool
3392 shuffle_decompress_patterns (struct expand_vec_perm_d *d)
3394 poly_uint64 nelt = d->perm.length ();
3395 machine_mode mask_mode = get_mask_mode (d->vmode);
3397 /* For constant size indices, we don't need to handle them here.
3398 Just leave them to vec_perm<mode>. */
3399 if (d->perm.length ().is_constant ())
3400 return false;
3402 poly_uint64 first = d->perm[0];
3403 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
3404 || !d->perm.series_p (0, 2, first, 1)
3405 || !d->perm.series_p (1, 2, first + nelt, 1))
3406 return false;
3408 /* Permuting two SEW8 variable-length vectors needs vrgatherei16.vv.
3409 Otherwise, it could overflow the index range. */
3410 machine_mode sel_mode = related_int_vector_mode (d->vmode).require ();
3411 if (GET_MODE_INNER (d->vmode) == QImode
3412 && !get_vector_mode (HImode, nelt).exists (&sel_mode))
3413 return false;
3415 /* Success! */
3416 if (d->testing_p)
3417 return true;
3419 rtx op0, op1;
3420 if (known_eq (first, 0U))
3422 op0 = d->op0;
3423 op1 = d->op1;
3425 else
3427 op0 = gen_reg_rtx (d->vmode);
3428 op1 = gen_reg_rtx (d->vmode);
3429 insn_code icode = code_for_pred_slide (UNSPEC_VSLIDEDOWN, d->vmode);
3430 rtx ops0[] = {op0, d->op0, gen_int_mode (first, Pmode)};
3431 rtx ops1[] = {op1, d->op1, gen_int_mode (first, Pmode)};
3432 emit_vlmax_insn (icode, BINARY_OP, ops0);
3433 emit_vlmax_insn (icode, BINARY_OP, ops1);
3435 /* Generate the { 0, 1, 0, 1, ... } mask. */
3436 rtx vid = gen_reg_rtx (sel_mode);
3437 rtx vid_repeat = gen_reg_rtx (sel_mode);
3438 expand_vec_series (vid, const0_rtx, const1_rtx);
3439 rtx and_ops[] = {vid_repeat, vid, const1_rtx};
3440 emit_vlmax_insn (code_for_pred_scalar (AND, sel_mode), BINARY_OP, and_ops);
3441 rtx const_vec = gen_const_vector_dup (sel_mode, 1);
3442 rtx mask = gen_reg_rtx (mask_mode);
3443 expand_vec_cmp (mask, EQ, vid_repeat, const_vec);
3444 emit_vlmax_decompress_insn (d->target, op0, op1, mask);
3445 return true;
3448 static bool
3449 shuffle_bswap_pattern (struct expand_vec_perm_d *d)
3451 HOST_WIDE_INT diff;
3452 unsigned i, size, step;
3454 if (!d->one_vector_p || !d->perm[0].is_constant (&diff) || !diff)
3455 return false;
3457 step = diff + 1;
3458 size = step * GET_MODE_UNIT_BITSIZE (d->vmode);
3460 switch (size)
3462 case 16:
3463 break;
3464 case 32:
3465 case 64:
3466 /* We will have VEC_PERM_EXPR after rtl expand when invoking
3467 __builtin_bswap. It will generate about 9 instructions in the
3468 loop as below, no matter whether it is bswap16, bswap32 or bswap64.
3469 .L2:
3470 1 vle16.v v4,0(a0)
3471 2 vmv.v.x v2,a7
3472 3 vand.vv v2,v6,v2
3473 4 slli a2,a5,1
3474 5 vrgatherei16.vv v1,v4,v2
3475 6 sub a4,a4,a5
3476 7 vse16.v v1,0(a3)
3477 8 add a0,a0,a2
3478 9 add a3,a3,a2
3479 bne a4,zero,.L2
3481 But for bswap16 we may have an even simpler code sequence, which
3482 has only 7 instructions in the loop as below.
3484 1 vle8.v v2,0(a5)
3485 2 addi a5,a5,32
3486 3 vsrl.vi v4,v2,8
3487 4 vsll.vi v2,v2,8
3488 5 vor.vv v4,v4,v2
3489 6 vse8.v v4,0(a4)
3490 7 addi a4,a4,32
3491 bne a5,a6,.L5
3493 Unfortunately, the instructions in the loop would grow to 13 and 24
3494 for bswap32 and bswap64. Thus, we leverage vrgather (9 insns)
3495 for both bswap64 and bswap32, but use shift and or (7 insns)
3496 for bswap16.
3498 default:
3499 return false;
3502 for (i = 0; i < step; i++)
3503 if (!d->perm.series_p (i, step, diff - i, step))
3504 return false;
3506 /* Disable when nunits < 4 since the later generic approach
3507 is more profitable on BSWAP. */
3508 if (!known_gt (GET_MODE_NUNITS (d->vmode), 2))
3509 return false;
3511 if (d->testing_p)
3512 return true;
3514 machine_mode vhi_mode;
3515 poly_uint64 vhi_nunits = exact_div (GET_MODE_NUNITS (d->vmode), 2);
3517 if (!get_vector_mode (HImode, vhi_nunits).exists (&vhi_mode))
3518 return false;
3520 /* Step-1: Move op0 to src with VHI mode. */
3521 rtx src = gen_reg_rtx (vhi_mode);
3522 emit_move_insn (src, gen_lowpart (vhi_mode, d->op0));
3524 /* Step-2: Shift right 8 bits to dest. */
3525 rtx dest = expand_binop (vhi_mode, lshr_optab, src, gen_int_mode (8, Pmode),
3526 NULL_RTX, 0, OPTAB_DIRECT);
3528 /* Step-3: Shift left 8 bits to src. */
3529 src = expand_binop (vhi_mode, ashl_optab, src, gen_int_mode (8, Pmode),
3530 NULL_RTX, 0, OPTAB_DIRECT);
3532 /* Step-4: Logic Or dest and src to dest. */
3533 dest = expand_binop (vhi_mode, ior_optab, dest, src,
3534 NULL_RTX, 0, OPTAB_DIRECT);
3536 /* Step-5: Move dest to target with VQI mode. */
3537 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
3539 return true;
3542 /* Recognize the pattern that can be shuffled by vec_extract and slide1up
3543 approach. */
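/* E.g. with nunits = 4 the selector is { 3, 4, 5, 6 }: the last element of
   op0 is extracted and slid into element 0 of op1 (illustrative example).  */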
3545 static bool
3546 shuffle_extract_and_slide1up_patterns (struct expand_vec_perm_d *d)
3548 poly_int64 nunits = GET_MODE_NUNITS (d->vmode);
3550 /* Recognize { nunits - 1, nunits, nunits + 1, ... }. */
3551 if (!d->perm.series_p (0, 2, nunits - 1, 2)
3552 || !d->perm.series_p (1, 2, nunits, 2))
3553 return false;
3555 /* Disable when nunits < 4 since the later generic approach
3556 is more profitable for indices = { nunits - 1, nunits }. */
3557 if (!known_gt (nunits, 2))
3558 return false;
3560 /* Success! */
3561 if (d->testing_p)
3562 return true;
3564 /* Extract the last element of the first vector. */
3565 scalar_mode smode = GET_MODE_INNER (d->vmode);
3566 rtx tmp = gen_reg_rtx (smode);
3567 emit_vec_extract (tmp, d->op0, gen_int_mode (nunits - 1, Pmode));
3569 /* Insert the scalar into element 0. */
3570 unsigned int unspec
3571 = FLOAT_MODE_P (d->vmode) ? UNSPEC_VFSLIDE1UP : UNSPEC_VSLIDE1UP;
3572 insn_code icode = code_for_pred_slide (unspec, d->vmode);
3573 rtx ops[] = {d->target, d->op1, tmp};
3574 emit_vlmax_insn (icode, BINARY_OP, ops);
3575 return true;
3578 /* This looks for a series pattern in the provided vector permute structure D.
3579 If successful it emits a series insn as well as a gather to implement it.
3580 Return true if successful, false otherwise. */
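/* Illustrative examples: { 0, 2, 4, 6, ... } is a full series with step 2,
   while { 5, 0, 1, 2, ... } only has a series from the second element onwards
   and needs the leading 5 inserted with vslide1up afterwards.  */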
3582 static bool
3583 shuffle_series_patterns (struct expand_vec_perm_d *d)
3585 if (!d->one_vector_p || d->perm.encoding ().npatterns () != 1)
3586 return false;
3588 poly_int64 el1 = d->perm[0];
3589 poly_int64 el2 = d->perm[1];
3590 poly_int64 el3 = d->perm[2];
3592 poly_int64 step1 = el2 - el1;
3593 poly_int64 step2 = el3 - el2;
3595 bool need_insert = false;
3596 bool have_series = false;
3598 /* Check for a full series. */
3599 if (known_ne (step1, 0) && d->perm.series_p (0, 1, el1, step1))
3600 have_series = true;
3602 /* Check for a series starting at the second element. */
3603 else if (known_ne (step2, 0) && d->perm.series_p (1, 1, el2, step2))
3605 have_series = true;
3606 need_insert = true;
3609 if (!have_series)
3610 return false;
3612 /* Disable shuffle if we can't find an appropriate integer index mode for
3613 gather. */
3614 machine_mode sel_mode;
3615 if (!get_gather_index_mode (d).exists (&sel_mode))
3616 return false;
3618 /* Success! */
3619 if (d->testing_p)
3620 return true;
3622 /* Create the series. */
3623 machine_mode eltmode = Pmode;
3624 rtx series = gen_reg_rtx (sel_mode);
3625 expand_vec_series (series, gen_int_mode (need_insert ? el2 : el1, eltmode),
3626 gen_int_mode (need_insert ? step2 : step1, eltmode));
3628 /* Insert the remaining element if necessary. */
3629 if (need_insert)
3631 insn_code icode = code_for_pred_slide (UNSPEC_VSLIDE1UP, sel_mode);
3632 rtx ops[]
3633 = {series, series, gen_int_mode (el1, GET_MODE_INNER (sel_mode))};
3634 emit_vlmax_insn (icode, BINARY_OP, ops);
3637 emit_vlmax_gather_insn (d->target, d->op0, series);
3639 return true;
3642 /* Recognize the pattern that can be shuffled by generic approach. */
3644 static bool
3645 shuffle_generic_patterns (struct expand_vec_perm_d *d)
3647 machine_mode sel_mode;
3649 /* We don't enable SLP for non-power of 2 NPATTERNS. */
3650 if (!pow2p_hwi (d->perm.encoding().npatterns ()))
3651 return false;
3653 /* Disable shuffle if we can't find an appropriate integer index mode for
3654 gather. */
3655 if (!get_gather_index_mode (d).exists (&sel_mode))
3656 return false;
3658 /* Success! */
3659 if (d->testing_p)
3660 return true;
3662 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
3663 /* Some FIXED-VLMAX/VLS vector permutation situations call the target hook
3664 instead of expanding vec_perm<mode>; we handle them directly. */
3665 expand_vec_perm (d->target, d->op0, d->op1, sel);
3666 return true;
3669 /* This function recognizes and supports different permutation patterns
3670 and enables VLA SLP auto-vectorization. */
3671 static bool
3672 expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
3674 gcc_assert (d->op_mode != E_VOIDmode);
3676 /* The pattern matching functions above are written to look for a small
3677 number to begin the sequence (0, 1, N/2). If we begin with an index
3678 from the second operand, we can swap the operands. */
3679 poly_int64 nelt = d->perm.length ();
3680 if (known_ge (d->perm[0], nelt))
3682 d->perm.rotate_inputs (1);
3683 std::swap (d->op0, d->op1);
3686 if (known_gt (nelt, 1))
3688 if (d->vmode == d->op_mode)
3690 if (shuffle_merge_patterns (d))
3691 return true;
3692 if (shuffle_consecutive_patterns (d))
3693 return true;
3694 if (shuffle_compress_patterns (d))
3695 return true;
3696 if (shuffle_decompress_patterns (d))
3697 return true;
3698 if (shuffle_bswap_pattern (d))
3699 return true;
3700 if (shuffle_extract_and_slide1up_patterns (d))
3701 return true;
3702 if (shuffle_series_patterns (d))
3703 return true;
3704 if (shuffle_generic_patterns (d))
3705 return true;
3706 return false;
3708 else
3709 return false;
3711 return false;
3714 /* This function implements TARGET_VECTORIZE_VEC_PERM_CONST by using RVV
3715 * instructions. */
3716 bool
3717 expand_vec_perm_const (machine_mode vmode, machine_mode op_mode, rtx target,
3718 rtx op0, rtx op1, const vec_perm_indices &sel)
3720 /* RVV doesn't have mask-type pack/unpack instructions and we don't use a
3721 mask to control the iteration loop. Just disable it directly. */
3722 if (GET_MODE_CLASS (vmode) == MODE_VECTOR_BOOL)
3723 return false;
3724 /* FIXME: Explicitly disable VLA interleave SLP vectorization when we
3725 may encounter an ICE for poly size (1, 1) vectors in the loop vectorizer.
3726 Ideally, the middle-end loop vectorizer should be able to disable it
3727 itself; we can remove the code here once the middle-end is able
3728 to disable VLA SLP vectorization for a poly size (1, 1) VF. */
3729 if (!BYTES_PER_RISCV_VECTOR.is_constant ()
3730 && maybe_lt (BYTES_PER_RISCV_VECTOR * TARGET_MAX_LMUL,
3731 poly_int64 (16, 16)))
3732 return false;
3734 struct expand_vec_perm_d d;
3736 /* Check whether the mask can be applied to a single vector. */
3737 if (sel.ninputs () == 1 || (op0 && rtx_equal_p (op0, op1)))
3738 d.one_vector_p = true;
3739 else if (sel.all_from_input_p (0))
3741 d.one_vector_p = true;
3742 op1 = op0;
3744 else if (sel.all_from_input_p (1))
3746 d.one_vector_p = true;
3747 op0 = op1;
3749 else
3750 d.one_vector_p = false;
3752 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
3753 sel.nelts_per_input ());
3754 d.vmode = vmode;
3755 d.op_mode = op_mode;
3756 d.target = target;
3757 d.op0 = op0;
3758 if (op0 == op1)
3759 d.op1 = d.op0;
3760 else
3761 d.op1 = op1;
3762 d.testing_p = !target;
3764 if (!d.testing_p)
3765 return expand_vec_perm_const_1 (&d);
3767 rtx_insn *last = get_last_insn ();
3768 bool ret = expand_vec_perm_const_1 (&d);
3769 gcc_assert (last == get_last_insn ());
3771 return ret;
3774 /* Generate a vsetvl with no side effects to get the vector length. */
3775 void
3776 expand_select_vl (rtx *ops)
3778 poly_int64 nunits = rtx_to_poly_int64 (ops[2]);
3779 if (CONST_INT_P (ops[1]) && known_le (INTVAL (ops[1]), nunits))
3781 /* If length is known <= VF, we just use the length directly instead
3782 of using vsetvli.
3784 E.g. _255 = .SELECT_VL (3, POLY_INT_CST [4, 4]);
3785 We move 3 into _255 instead of using an explicit vsetvl. */
3786 emit_move_insn (ops[0], ops[1]);
3787 return;
3789 /* We arbitrarily pick QImode as the inner scalar mode to get a vector mode,
3790 since vsetvl only demands the SEW/LMUL ratio. We let the VSETVL pass optimize it. */
3791 scalar_int_mode mode = QImode;
3792 machine_mode rvv_mode = get_vector_mode (mode, nunits).require ();
3793 emit_insn (gen_no_side_effects_vsetvl_rtx (rvv_mode, ops[0], ops[1]));
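/* For example, _7 = .SELECT_VL (n_5, POLY_INT_CST [4, 4]) with a
   non-constant n_5 is expected to become a plain vsetvli, roughly

     vsetvli a0, a1, e8, mf4, ta, ma

   on a 128-bit-VLEN target: only the SEW/LMUL ratio of the chosen QImode
   vector mode matters, and the VSETVL pass may later rewrite or fuse the
   instruction.  The exact vtype encoding shown here is illustrative.  */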
3796 /* Expand MASK_LEN_{LOAD,STORE}. */
3797 void
3798 expand_load_store (rtx *ops, bool is_load)
3800 rtx mask = ops[2];
3801 rtx len = ops[3];
3802 machine_mode mode = GET_MODE (ops[0]);
3804 if (is_vlmax_len_p (mode, len))
3806 /* If the length operand is equal to VF, it is VLMAX load/store. */
3807 if (is_load)
3809 rtx m_ops[] = {ops[0], mask, ops[1]};
3810 emit_vlmax_insn (code_for_pred_mov (mode), UNARY_OP_TAMA, m_ops);
3812 else
3814 len = gen_reg_rtx (Pmode);
3815 emit_vlmax_vsetvl (mode, len);
3816 emit_insn (gen_pred_store (mode, ops[0], mask, ops[1], len,
3817 get_avl_type_rtx (VLMAX)));
3820 else
3822 if (!satisfies_constraint_K (len))
3823 len = force_reg (Pmode, len);
3824 if (is_load)
3826 rtx m_ops[] = {ops[0], mask, ops[1]};
3827 emit_nonvlmax_insn (code_for_pred_mov (mode), UNARY_OP_TAMA, m_ops,
3828 len);
3830 else
3831 emit_insn (gen_pred_store (mode, ops[0], mask, ops[1], len,
3832 get_avl_type_rtx (NONVLMAX)));
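/* As a concrete sketch, a MASK_LEN_LOAD of 32-bit elements with a
   non-VLMAX length in a2 and a mask in v0 is expected to expand to roughly

     vsetvli zero, a2, e32, m1, ta, ma
     vle32.v v8, (a0), v0.t

   whereas the VLMAX case uses a vsetvli with the VLMAX AVL instead of the
   explicit length.  The store path mirrors this with vse32.v; register
   names and the exact vtype are illustrative only.  */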
3836 /* Expand MASK_LEN_STRIDED_LOAD. */
3837 void
3838 expand_strided_load (machine_mode mode, rtx *ops)
3840 rtx v_reg = ops[0];
3841 rtx base = ops[1];
3842 rtx stride = ops[2];
3843 rtx mask = ops[3];
3844 rtx len = ops[4];
3845 poly_int64 len_val;
3847 insn_code icode = code_for_pred_strided_load (mode);
3848 rtx emit_ops[] = {v_reg, mask, gen_rtx_MEM (mode, base), stride};
3850 if (poly_int_rtx_p (len, &len_val)
3851 && known_eq (len_val, GET_MODE_NUNITS (mode)))
3852 emit_vlmax_insn (icode, BINARY_OP_TAMA, emit_ops);
3853 else
3855 len = satisfies_constraint_K (len) ? len : force_reg (Pmode, len);
3856 emit_nonvlmax_insn (icode, BINARY_OP_TAMA, emit_ops, len);
3860 /* Expand MASK_LEN_STRIDED_STORE. */
3861 void
3862 expand_strided_store (machine_mode mode, rtx *ops)
3864 rtx v_reg = ops[2];
3865 rtx base = ops[0];
3866 rtx stride = ops[1];
3867 rtx mask = ops[3];
3868 rtx len = ops[4];
3869 poly_int64 len_val;
3870 rtx vl_type;
3872 if (poly_int_rtx_p (len, &len_val)
3873 && known_eq (len_val, GET_MODE_NUNITS (mode)))
3875 len = gen_reg_rtx (Pmode);
3876 emit_vlmax_vsetvl (mode, len);
3877 vl_type = get_avl_type_rtx (VLMAX);
3879 else
3881 len = satisfies_constraint_K (len) ? len : force_reg (Pmode, len);
3882 vl_type = get_avl_type_rtx (NONVLMAX);
3885 emit_insn (gen_pred_strided_store (mode, gen_rtx_MEM (mode, base),
3886 mask, stride, v_reg, len, vl_type));
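/* Illustrative expansion: a strided load of 32-bit elements with a byte
   stride held in a1 becomes roughly

     vsetvli  zero, a2, e32, m1, ta, ma
     vlse32.v v8, (a0), a1, v0.t

   and the corresponding strided store uses vsse32.v.  When the length
   equals the number of units of the mode, the VLMAX form is used instead.
   Registers and vtype shown here are only for illustration.  */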
3889 /* Return true if the operation is a floating-point operation that needs FRM. */
3890 static bool
3891 needs_fp_rounding (unsigned icode, machine_mode mode)
3893 if (!FLOAT_MODE_P (mode))
3894 return false;
3896 return icode != maybe_code_for_pred (SMIN, mode)
3897 && icode != maybe_code_for_pred (UNSPEC_VFMIN, mode)
3898 && icode != maybe_code_for_pred (SMAX, mode)
3899 && icode != maybe_code_for_pred (UNSPEC_VFMAX, mode)
3900 && icode != maybe_code_for_pred (NEG, mode)
3901 && icode != maybe_code_for_pred (ABS, mode)
3902 /* narrower-FP -> FP */
3903 && icode != maybe_code_for_pred_extend (mode)
3904 /* narrower-INT -> FP */
3905 && icode != maybe_code_for_pred_widen (FLOAT, mode)
3906 && icode != maybe_code_for_pred_widen (UNSIGNED_FLOAT, mode)
3907 /* vfsgnj */
3908 && icode != maybe_code_for_pred (UNSPEC_VCOPYSIGN, mode)
3909 && icode != maybe_code_for_pred_mov (mode);
3912 /* Subroutine to expand COND_LEN_* patterns. */
3913 static void
3914 expand_cond_len_op (unsigned icode, insn_flags op_type, rtx *ops, rtx len)
3916 rtx dest = ops[0];
3917 rtx mask = ops[1];
3918 machine_mode mode = GET_MODE (dest);
3919 machine_mode mask_mode = GET_MODE (mask);
3920 bool is_dummy_mask = rtx_equal_p (mask, CONSTM1_RTX (mask_mode));
3921 bool is_vlmax_len = is_vlmax_len_p (mode, len);
3923 unsigned insn_flags = HAS_DEST_P | HAS_MASK_P | HAS_MERGE_P | op_type;
3924 /* FIXME: We don't support simplification of COND_LEN_NEG (..., dummy len,
3925 dummy mask) into NEG_EXPR in GIMPLE FOLD yet. So, we do such
3926 simplification in the RISC-V backend and may move it to the middle-end
3927 in the future. */
3928 if (is_dummy_mask && is_vlmax_len)
3929 insn_flags |= TDEFAULT_POLICY_P | MDEFAULT_POLICY_P;
3930 else if (is_dummy_mask)
3931 insn_flags |= TU_POLICY_P | MDEFAULT_POLICY_P;
3932 else if (is_vlmax_len)
3933 insn_flags |= TDEFAULT_POLICY_P | MU_POLICY_P;
3934 else
3935 insn_flags |= TU_POLICY_P | MU_POLICY_P;
3937 if (needs_fp_rounding (icode, mode))
3938 insn_flags |= FRM_DYN_P;
3940 if (is_vlmax_len)
3941 emit_vlmax_insn (icode, insn_flags, ops);
3942 else
3943 emit_nonvlmax_insn (icode, insn_flags, ops, len);
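/* For example, a COND_LEN_ADD whose mask is all-ones and whose length is
   VLMAX degenerates into a plain vadd.vv with the ta,ma policy, while a
   COND_LEN_ADD with a real mask and a partial length must preserve the
   merge operand and therefore uses tu,mu, roughly

     vsetvli zero, a2, e32, m1, tu, mu
     vadd.vv v8, v9, v10, v0.t

   where v8 initially holds the merge (else) value.  This is only a sketch
   of the policy selection above, not the exact emitted RTL.  */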
3946 /* Return RVV_VUNDEF if the ELSE value is a scratch rtx. */
3947 static rtx
3948 get_else_operand (rtx op)
3950 return GET_CODE (op) == SCRATCH ? RVV_VUNDEF (GET_MODE (op)) : op;
3953 /* Expand unary ops COND_LEN_*. */
3954 void
3955 expand_cond_len_unop (unsigned icode, rtx *ops)
3957 rtx dest = ops[0];
3958 rtx mask = ops[1];
3959 rtx src = ops[2];
3960 rtx merge = get_else_operand (ops[3]);
3961 rtx len = ops[4];
3963 rtx cond_ops[] = {dest, mask, merge, src};
3964 expand_cond_len_op (icode, UNARY_OP_P, cond_ops, len);
3967 /* Expand unary ops COND_*. */
3968 void
3969 expand_cond_unop (unsigned icode, rtx *ops)
3971 rtx dest = ops[0];
3972 rtx mask = ops[1];
3973 rtx src = ops[2];
3974 rtx merge = get_else_operand (ops[3]);
3975 rtx len = gen_int_mode (GET_MODE_NUNITS (GET_MODE (dest)), Pmode);
3977 rtx cond_ops[] = {dest, mask, merge, src};
3978 expand_cond_len_op (icode, UNARY_OP_P, cond_ops, len);
3981 /* Expand binary ops COND_LEN_*. */
3982 void
3983 expand_cond_len_binop (unsigned icode, rtx *ops)
3985 rtx dest = ops[0];
3986 rtx mask = ops[1];
3987 rtx src1 = ops[2];
3988 rtx src2 = ops[3];
3989 rtx merge = get_else_operand (ops[4]);
3990 rtx len = ops[5];
3992 rtx cond_ops[] = {dest, mask, merge, src1, src2};
3993 expand_cond_len_op (icode, BINARY_OP_P, cond_ops, len);
3996 /* Expand binary ops COND_*. */
3997 void
3998 expand_cond_binop (unsigned icode, rtx *ops)
4000 rtx dest = ops[0];
4001 rtx mask = ops[1];
4002 rtx src1 = ops[2];
4003 rtx src2 = ops[3];
4004 rtx merge = get_else_operand (ops[4]);
4005 rtx len = gen_int_mode (GET_MODE_NUNITS (GET_MODE (dest)), Pmode);
4007 rtx cond_ops[] = {dest, mask, merge, src1, src2};
4008 expand_cond_len_op (icode, BINARY_OP_P, cond_ops, len);
4011 /* Prepare insn_code for gather_load/scatter_store according to
4012 the vector mode and index mode. */
4013 static insn_code
4014 prepare_gather_scatter (machine_mode vec_mode, machine_mode idx_mode,
4015 bool is_load)
4017 if (!is_load)
4018 return code_for_pred_indexed_store (UNSPEC_UNORDERED, vec_mode, idx_mode);
4019 else
4021 unsigned src_eew_bitsize = GET_MODE_BITSIZE (GET_MODE_INNER (idx_mode));
4022 unsigned dst_eew_bitsize = GET_MODE_BITSIZE (GET_MODE_INNER (vec_mode));
4023 if (dst_eew_bitsize == src_eew_bitsize)
4024 return code_for_pred_indexed_load_same_eew (UNSPEC_UNORDERED, vec_mode);
4025 else if (dst_eew_bitsize > src_eew_bitsize)
4027 unsigned factor = dst_eew_bitsize / src_eew_bitsize;
4028 switch (factor)
4030 case 2:
4031 return code_for_pred_indexed_load_x2_greater_eew (
4032 UNSPEC_UNORDERED, vec_mode);
4033 case 4:
4034 return code_for_pred_indexed_load_x4_greater_eew (
4035 UNSPEC_UNORDERED, vec_mode);
4036 case 8:
4037 return code_for_pred_indexed_load_x8_greater_eew (
4038 UNSPEC_UNORDERED, vec_mode);
4039 default:
4040 gcc_unreachable ();
4043 else
4045 unsigned factor = src_eew_bitsize / dst_eew_bitsize;
4046 switch (factor)
4048 case 2:
4049 return code_for_pred_indexed_load_x2_smaller_eew (
4050 UNSPEC_UNORDERED, vec_mode);
4051 case 4:
4052 return code_for_pred_indexed_load_x4_smaller_eew (
4053 UNSPEC_UNORDERED, vec_mode);
4054 case 8:
4055 return code_for_pred_indexed_load_x8_smaller_eew (
4056 UNSPEC_UNORDERED, vec_mode);
4057 default:
4058 gcc_unreachable ();
4064 /* Expand LEN_MASK_{GATHER_LOAD,SCATTER_STORE}. */
4065 void
4066 expand_gather_scatter (rtx *ops, bool is_load)
4068 rtx ptr, vec_offset, vec_reg;
4069 bool zero_extend_p;
4070 int shift;
4071 rtx mask = ops[5];
4072 rtx len = ops[6];
4073 if (is_load)
4075 vec_reg = ops[0];
4076 ptr = ops[1];
4077 vec_offset = ops[2];
4078 zero_extend_p = INTVAL (ops[3]);
4079 shift = exact_log2 (INTVAL (ops[4]));
4081 else
4083 vec_reg = ops[4];
4084 ptr = ops[0];
4085 vec_offset = ops[1];
4086 zero_extend_p = INTVAL (ops[2]);
4087 shift = exact_log2 (INTVAL (ops[3]));
4090 machine_mode vec_mode = GET_MODE (vec_reg);
4091 machine_mode idx_mode = GET_MODE (vec_offset);
4092 scalar_mode inner_idx_mode = GET_MODE_INNER (idx_mode);
4093 unsigned inner_offsize = GET_MODE_BITSIZE (inner_idx_mode);
4094 poly_int64 nunits = GET_MODE_NUNITS (vec_mode);
4095 bool is_vlmax = is_vlmax_len_p (vec_mode, len);
4097 bool use_widening_shift = false;
4099 /* Extend the offset element to address width. */
4100 if (inner_offsize < BITS_PER_WORD)
4102 use_widening_shift = TARGET_ZVBB && zero_extend_p && shift == 1;
4103 /* 7.2. Vector Load/Store Addressing Modes.
4104 If the vector offset elements are narrower than XLEN, they are
4105 zero-extended to XLEN before adding to the ptr effective address. If
4106 the vector offset elements are wider than XLEN, the least-significant
4107 XLEN bits are used in the address calculation. An implementation must
4108 raise an illegal instruction exception if the EEW is not supported for
4109 offset elements.
4111 RVV spec only refers to the shift == 0 case. */
4112 if (!zero_extend_p || shift)
4114 if (zero_extend_p)
4115 inner_idx_mode
4116 = int_mode_for_size (inner_offsize * 2, 0).require ();
4117 else
4118 inner_idx_mode = int_mode_for_size (BITS_PER_WORD, 0).require ();
4119 machine_mode new_idx_mode
4120 = get_vector_mode (inner_idx_mode, nunits).require ();
4121 if (!use_widening_shift)
4123 rtx tmp = gen_reg_rtx (new_idx_mode);
4124 emit_insn (gen_extend_insn (tmp, vec_offset, new_idx_mode, idx_mode,
4125 zero_extend_p ? true : false));
4126 vec_offset = tmp;
4128 idx_mode = new_idx_mode;
4132 if (shift)
4134 rtx tmp;
4135 if (!use_widening_shift)
4136 tmp = expand_binop (idx_mode, ashl_optab, vec_offset,
4137 gen_int_mode (shift, Pmode), NULL_RTX, 0,
4138 OPTAB_DIRECT);
4139 else
4141 tmp = gen_reg_rtx (idx_mode);
4142 insn_code icode = code_for_pred_vwsll_scalar (idx_mode);
4143 rtx ops[] = {tmp, vec_offset, const1_rtx};
4144 emit_vlmax_insn (icode, BINARY_OP, ops);
4147 vec_offset = tmp;
4150 insn_code icode = prepare_gather_scatter (vec_mode, idx_mode, is_load);
4151 if (is_vlmax)
4153 if (is_load)
4155 rtx load_ops[]
4156 = {vec_reg, mask, ptr, vec_offset};
4157 emit_vlmax_insn (icode, BINARY_OP_TAMA, load_ops);
4159 else
4161 rtx store_ops[] = {mask, ptr, vec_offset, vec_reg};
4162 emit_vlmax_insn (icode, SCATTER_OP_M, store_ops);
4165 else
4167 if (is_load)
4169 rtx load_ops[]
4170 = {vec_reg, mask, ptr, vec_offset};
4171 emit_nonvlmax_insn (icode, BINARY_OP_TAMA, load_ops, len);
4173 else
4175 rtx store_ops[] = {mask, ptr, vec_offset, vec_reg};
4176 emit_nonvlmax_insn (icode, SCATTER_OP_M, store_ops, len);
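/* Sketch of the index handling above: for a gather of 32-bit elements with
   unsigned 16-bit offsets scaled by 4 (shift == 2), the offsets are first
   zero-extended to 32 bits, then shifted left by 2, and finally fed to an
   unordered indexed load, roughly

     vzext.vf2   v4, v2
     vsll.vi     v4, v4, 2
     vluxei32.v  v8, (a0), v4, v0.t

   When Zvbb is available and the scale is 2 (shift == 1), the extend and
   shift can instead be fused into a single vwsll.vi.  Scatters end with a
   vsuxei store.  Registers and EEW here are illustrative.  */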
4181 /* Expand COND_LEN_*. */
4182 void
4183 expand_cond_len_ternop (unsigned icode, rtx *ops)
4185 rtx dest = ops[0];
4186 rtx mask = ops[1];
4187 rtx src1 = ops[2];
4188 rtx src2 = ops[3];
4189 rtx src3 = ops[4];
4190 rtx merge = get_else_operand (ops[5]);
4191 rtx len = ops[6];
4193 rtx cond_ops[] = {dest, mask, src1, src2, src3, merge};
4194 expand_cond_len_op (icode, TERNARY_OP_P, cond_ops, len);
4197 /* Expand COND_*. */
4198 void
4199 expand_cond_ternop (unsigned icode, rtx *ops)
4201 rtx dest = ops[0];
4202 rtx mask = ops[1];
4203 rtx src1 = ops[2];
4204 rtx src2 = ops[3];
4205 rtx src3 = ops[4];
4206 rtx merge = get_else_operand (ops[5]);
4207 rtx len = gen_int_mode (GET_MODE_NUNITS (GET_MODE (dest)), Pmode);
4209 rtx cond_ops[] = {dest, mask, src1, src2, src3, merge};
4210 expand_cond_len_op (icode, TERNARY_OP_P, cond_ops, len);
4213 /* Expand reduction operations.
4214 Case 1: ops = {scalar_dest, vector_src}
4215 Case 2: ops = {scalar_dest, vector_src, mask, vl}
4217 void
4218 expand_reduction (unsigned unspec, unsigned insn_flags, rtx *ops, rtx init)
4220 rtx scalar_dest = ops[0];
4221 rtx vector_src = ops[1];
4222 machine_mode vmode = GET_MODE (vector_src);
4223 machine_mode vel_mode = GET_MODE (scalar_dest);
4224 machine_mode m1_mode = get_m1_mode (vel_mode).require ();
4226 rtx m1_tmp = gen_reg_rtx (m1_mode);
4227 rtx scalar_move_ops[] = {m1_tmp, init};
4228 insn_code icode = code_for_pred_broadcast (m1_mode);
4229 if (need_mask_operand_p (insn_flags))
4230 emit_nonvlmax_insn (icode, SCALAR_MOVE_OP, scalar_move_ops, ops[3]);
4231 else
4232 emit_vlmax_insn (icode, SCALAR_MOVE_OP, scalar_move_ops);
4234 rtx m1_tmp2 = gen_reg_rtx (m1_mode);
4235 rtx reduc_ops[] = {m1_tmp2, vector_src, m1_tmp};
4236 icode = code_for_pred (unspec, vmode);
4238 if (need_mask_operand_p (insn_flags))
4240 rtx mask_len_reduc_ops[] = {m1_tmp2, ops[2], vector_src, m1_tmp};
4241 emit_nonvlmax_insn (icode, insn_flags, mask_len_reduc_ops, ops[3]);
4243 else
4244 emit_vlmax_insn (icode, insn_flags, reduc_ops);
4246 emit_insn (gen_pred_extract_first (m1_mode, scalar_dest, m1_tmp2));
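/* Illustrative lowering of an unmasked integer sum reduction into a scalar:

     vmv.s.x    v1, a1            ; a1 = init value, moved into an M1 reg
     vredsum.vs v2, v8, v1        ; v8 = vector_src
     vmv.x.s    a0, v2            ; extract element 0 into scalar_dest

   The masked/length variant emits the reduction with a non-VLMAX AVL and
   the mask operand; floating-point reductions use the vfmv/vfred forms.
   Register numbers are only for illustration.  */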
4249 /* Prepare ops for ternary operations.
4250 It can be called before or after RA. */
4251 void
4252 prepare_ternary_operands (rtx *ops)
4254 machine_mode mode = GET_MODE (ops[0]);
4256 if (!rtx_equal_p (ops[5], RVV_VUNDEF (mode))
4257 && (VECTOR_MODE_P (GET_MODE (ops[2]))
4258 && !rtx_equal_p (ops[2], ops[5]))
4259 && !rtx_equal_p (ops[3], ops[5])
4260 && !rtx_equal_p (ops[4], ops[5]))
4262 /* RA will fail to find vector REG and report ICE, so we pre-merge
4263 the ops for LMUL = 8. */
4264 if (satisfies_constraint_Wc1 (ops[1]))
4266 emit_move_insn (ops[0], ops[5]);
4267 emit_insn (gen_pred_mov (mode, ops[0], ops[1], ops[0], ops[4], ops[6],
4268 ops[7], ops[8], ops[9]));
4270 else
4271 emit_insn (gen_pred_merge (mode, ops[0], RVV_VUNDEF (mode), ops[5],
4272 ops[4], ops[1], ops[6], ops[7], ops[9]));
4273 ops[5] = ops[4] = ops[0];
4275 else
4277 /* Swap the multiplication ops if the fallback value is the
4278 second of the two. */
4279 if (rtx_equal_p (ops[3], ops[5]))
4280 std::swap (ops[2], ops[3]);
4282 /* TODO: ??? Maybe we could support splitting FMA (a, 4, b)
4283 into PLUS (ASHIFT (a, 2), b) according to uarchs. */
4285 gcc_assert (rtx_equal_p (ops[5], RVV_VUNDEF (mode))
4286 || rtx_equal_p (ops[5], ops[2]) || rtx_equal_p (ops[5], ops[4]));
4289 /* Expand VEC_MASK_LEN_{LOAD_LANES,STORE_LANES}. */
4290 void
4291 expand_lanes_load_store (rtx *ops, bool is_load)
4293 rtx mask = ops[2];
4294 rtx len = ops[3];
4295 rtx addr = is_load ? XEXP (ops[1], 0) : XEXP (ops[0], 0);
4296 rtx reg = is_load ? ops[0] : ops[1];
4297 machine_mode mode = GET_MODE (ops[0]);
4299 if (is_vlmax_len_p (mode, len))
4301 /* If the length operand is equal to VF, it is VLMAX load/store. */
4302 if (is_load)
4304 rtx m_ops[] = {reg, mask, addr};
4305 emit_vlmax_insn (code_for_pred_unit_strided_load (mode), UNARY_OP_TAMA,
4306 m_ops);
4308 else
4310 len = gen_reg_rtx (Pmode);
4311 emit_vlmax_vsetvl (mode, len);
4312 emit_insn (gen_pred_unit_strided_store (mode, mask, addr, reg, len,
4313 get_avl_type_rtx (VLMAX)));
4316 else
4318 if (!satisfies_constraint_K (len))
4319 len = force_reg (Pmode, len);
4320 if (is_load)
4322 rtx m_ops[] = {reg, mask, addr};
4323 emit_nonvlmax_insn (code_for_pred_unit_strided_load (mode),
4324 UNARY_OP_TAMA, m_ops, len);
4326 else
4327 emit_insn (gen_pred_unit_strided_store (mode, mask, addr, reg, len,
4328 get_avl_type_rtx (NONVLMAX)));
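/* Sketch: a MASK_LEN_LOAD_LANES of a two-field tuple of 32-bit element
   vectors is expected to become a segment load, roughly

     vsetvli     zero, a2, e32, m1, ta, ma
     vlseg2e32.v v8, (a0), v0.t

   and the store side a matching vsseg2e32.v.  The number of fields (NF)
   comes from the tuple mode; registers shown are illustrative.  */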
4332 /* Expand LEN_FOLD_EXTRACT_LAST. */
4333 void
4334 expand_fold_extract_last (rtx *ops)
4336 rtx dst = ops[0];
4337 rtx default_value = ops[1];
4338 rtx mask = ops[2];
4339 rtx anchor = gen_reg_rtx (Pmode);
4340 rtx index = gen_reg_rtx (Pmode);
4341 rtx vect = ops[3];
4342 rtx else_label = gen_label_rtx ();
4343 rtx end_label = gen_label_rtx ();
4344 rtx len = ops[4];
4345 machine_mode mode = GET_MODE (vect);
4346 machine_mode mask_mode = GET_MODE (mask);
4347 rtx compress_vect = gen_reg_rtx (mode);
4348 rtx slide_vect = gen_reg_rtx (mode);
4349 insn_code icode;
4351 if (is_vlmax_len_p (mode, len))
4352 len = NULL_RTX;
4354 /* Calculate the number of set bits in the mask. */
4355 rtx cpop_ops[] = {anchor, mask};
4356 if (len)
4357 emit_nonvlmax_insn (code_for_pred_popcount (mask_mode, Pmode), CPOP_OP,
4358 cpop_ops, len);
4359 else
4360 emit_vlmax_insn (code_for_pred_popcount (mask_mode, Pmode), CPOP_OP,
4361 cpop_ops);
4363 riscv_expand_conditional_branch (else_label, EQ, anchor, const0_rtx);
4364 emit_insn (gen_rtx_SET (index, gen_rtx_PLUS (Pmode, anchor, constm1_rtx)));
4365 /* Compress the vector. */
4366 icode = code_for_pred_compress (mode);
4367 rtx compress_ops[] = {compress_vect, vect, mask};
4368 if (len)
4369 emit_nonvlmax_insn (icode, COMPRESS_OP, compress_ops, len);
4370 else
4371 emit_vlmax_insn (icode, COMPRESS_OP, compress_ops);
4372 /* Emit the slide down to index 0 in a new vector. */
4373 rtx slide_ops[] = {slide_vect, compress_vect, index};
4374 icode = code_for_pred_slide (UNSPEC_VSLIDEDOWN, mode);
4375 if (len)
4376 emit_nonvlmax_insn (icode, BINARY_OP, slide_ops, len);
4377 else
4378 emit_vlmax_insn (icode, BINARY_OP, slide_ops);
4379 /* Emit v(f)mv.[xf].s. */
4380 emit_insn (gen_pred_extract_first (mode, dst, slide_vect));
4382 emit_jump_insn (gen_jump (end_label));
4383 emit_barrier ();
4384 emit_label (else_label);
4385 emit_move_insn (dst, default_value);
4386 emit_label (end_label);
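/* Rough shape of the code emitted above for LEN_FOLD_EXTRACT_LAST:

     vcpop.m       a3, v0         ; number of active elements
     beqz          a3, .Lelse
     addi          a4, a3, -1
     vcompress.vm  v4, v8, v0     ; pack active elements to the front
     vslidedown.vx v4, v4, a4     ; bring the last active element to index 0
     vmv.x.s       a0, v4         ; (or vfmv.f.s for FP)
     j             .Lend
   .Lelse:
     mv            a0, a1         ; default value
   .Lend:

   This only illustrates the control flow; the real sequence is built from
   the predicated patterns used above.  */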
4389 /* Return true if the LMUL of the comparison is less than or equal to one. */
4390 bool
4391 cmp_lmul_le_one (machine_mode mode)
4393 if (riscv_v_ext_vector_mode_p (mode))
4394 return known_le (GET_MODE_SIZE (mode), BYTES_PER_RISCV_VECTOR);
4395 else if (riscv_v_ext_vls_mode_p (mode))
4396 return known_le (GET_MODE_BITSIZE (mode), TARGET_MIN_VLEN);
4397 return false;
4400 /* Return true if the LMUL of the comparison is greater than one. */
4401 bool
4402 cmp_lmul_gt_one (machine_mode mode)
4404 if (riscv_v_ext_vector_mode_p (mode))
4405 return known_gt (GET_MODE_SIZE (mode), BYTES_PER_RISCV_VECTOR);
4406 else if (riscv_v_ext_vls_mode_p (mode))
4407 return known_gt (GET_MODE_BITSIZE (mode), TARGET_MIN_VLEN);
4408 return false;
4411 /* Return true if the VLS mode is legal. There are 2 cases here.
4413 1. Enable VLS modes for VLA vectorization since the fixed-length VLMAX mode
4414 is the highest priority choice and should not conflict with VLS modes.
4415 2. Enable VLS modes for some cases of fixed-vlmax, i.e. when the bitsize of
4416 the VLS mode is smaller than the minimal VLA mode.
4418 Take vlen = 2048 as an example for case 2.
4420 Note: the table below is based on vlen = 2048.
4421 +----------------------------------------------------+----------------------+
4422 | VLS mode | VLA mode |
4423 +----------------------------------------------------+----------------------+
4424 | Name | Precision | Inner Precision | Enabled | Min mode | Min bits |
4425 +------------+-----------+-----------------+---------+-----------+----------+
4426 | V1BI | 1 | 1 | Yes | RVVMF64BI | 32 |
4427 | V2BI | 2 | 1 | Yes | RVVMF64BI | 32 |
4428 | V4BI | 4 | 1 | Yes | RVVMF64BI | 32 |
4429 | V8BI | 8 | 1 | Yes | RVVMF64BI | 32 |
4430 | V16BI | 16 | 1 | Yes | RVVMF64BI | 32 |
4431 | V32BI | 32 | 1 | NO | RVVMF64BI | 32 |
4432 | V64BI | 64 | 1 | NO | RVVMF64BI | 32 |
4433 | ... | ... | ... | ... | RVVMF64BI | 32 |
4434 | V4096BI | 4096 | 1 | NO | RVVMF64BI | 32 |
4435 +------------+-----------+-----------------+---------+-----------+----------+
4436 | V1QI | 8 | 8 | Yes | RVVMF8QI | 256 |
4437 | V2QI | 16 | 8 | Yes | RVVMF8QI | 256 |
4438 | V4QI | 32 | 8 | Yes | RVVMF8QI | 256 |
4439 | V8QI | 64 | 8 | Yes | RVVMF8QI | 256 |
4440 | V16QI | 128 | 8 | Yes | RVVMF8QI | 256 |
4441 | V32QI | 256 | 8 | NO | RVVMF8QI | 256 |
4442 | V64QI | 512 | 8 | NO | RVVMF8QI | 256 |
4443 | ... | ... | .. | ... | RVVMF8QI | 256 |
4444 | V4096QI | 32768 | 8 | NO | RVVMF8QI | 256 |
4445 +------------+-----------+-----------------+---------+-----------+----------+
4446 | V1HI | 16 | 16 | Yes | RVVMF4HI | 512 |
4447 | V2HI | 32 | 16 | Yes | RVVMF4HI | 512 |
4448 | V4HI | 64 | 16 | Yes | RVVMF4HI | 512 |
4449 | V8HI | 128 | 16 | Yes | RVVMF4HI | 512 |
4450 | V16HI | 256 | 16 | Yes | RVVMF4HI | 512 |
4451 | V32HI | 512 | 16 | NO | RVVMF4HI | 512 |
4452 | V64HI | 1024 | 16 | NO | RVVMF4HI | 512 |
4453 | ... | ... | .. | ... | RVVMF4HI | 512 |
4454 | V2048HI | 32768 | 16 | NO | RVVMF4HI | 512 |
4455 +------------+-----------+-----------------+---------+-----------+----------+
4456 | V1SI/SF | 32 | 32 | Yes | RVVMF2SI | 1024 |
4457 | V2SI/SF | 64 | 32 | Yes | RVVMF2SI | 1024 |
4458 | V4SI/SF | 128 | 32 | Yes | RVVMF2SI | 1024 |
4459 | V8SI/SF | 256 | 32 | Yes | RVVMF2SI | 1024 |
4460 | V16SI/SF | 512 | 32 | Yes | RVVMF2SI | 1024 |
4461 | V32SI/SF | 1024 | 32 | NO | RVVMF2SI | 1024 |
4462 | V64SI/SF | 2048 | 32 | NO | RVVMF2SI | 1024 |
4463 | ... | ... | .. | ... | RVVMF2SI | 1024 |
4464 | V1024SI/SF | 32768 | 32 | NO | RVVMF2SI | 1024 |
4465 +------------+-----------+-----------------+---------+-----------+----------+
4466 | V1DI/DF | 64 | 64 | Yes | RVVM1DI | 2048 |
4467 | V2DI/DF | 128 | 64 | Yes | RVVM1DI | 2048 |
4468 | V4DI/DF | 256 | 64 | Yes | RVVM1DI | 2048 |
4469 | V8DI/DF | 512 | 64 | Yes | RVVM1DI | 2048 |
4470 | V16DI/DF | 1024 | 64 | Yes | RVVM1DI | 2048 |
4471 | V32DI/DF | 2048 | 64 | NO | RVVM1DI | 2048 |
4472 | V64DI/DF | 4096 | 64 | NO | RVVM1DI | 2048 |
4473 | ... | ... | .. | ... | RVVM1DI | 2048 |
4474 | V512DI/DF | 32768 | 64 | NO | RVVM1DI | 2048 |
4475 +------------+-----------+-----------------+---------+-----------+----------+
4477 Then the condition for a VLS mode in fixed-vlmax is:
4478 PRECISION (VLSmode) < VLEN / (64 / PRECISION(VLS_inner_mode)). */
4479 bool
4480 vls_mode_valid_p (machine_mode vls_mode)
4482 if (!TARGET_VECTOR || TARGET_XTHEADVECTOR)
4483 return false;
4485 if (rvv_vector_bits == RVV_VECTOR_BITS_SCALABLE)
4487 if (GET_MODE_CLASS (vls_mode) != MODE_VECTOR_BOOL
4488 && !ordered_p (TARGET_MAX_LMUL * BITS_PER_RISCV_VECTOR,
4489 GET_MODE_PRECISION (vls_mode)))
4490 /* We enable VLS modes which are aligned with TARGET_MAX_LMUL and
4491 BITS_PER_RISCV_VECTOR.
4493 E.g. when TARGET_MAX_LMUL = 1 and BITS_PER_RISCV_VECTOR = (128, 128),
4494 we enable VLS modes that have a fixed size <= 128 bits. Since ordered_p is
4495 false between VLA modes with size = (128, 128) bits and a VLS mode
4496 with size = 128 bits, we would otherwise end up with multiple ICEs in
4497 middle-end generic code. */
4498 return false;
4499 return true;
4502 if (rvv_vector_bits == RVV_VECTOR_BITS_ZVL)
4504 machine_mode inner_mode = GET_MODE_INNER (vls_mode);
4505 int precision = GET_MODE_PRECISION (inner_mode).to_constant ();
4506 int min_vlmax_bitsize = TARGET_MIN_VLEN / (64 / precision);
4508 return GET_MODE_PRECISION (vls_mode).to_constant () < min_vlmax_bitsize;
4511 return false;
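/* Worked example for the RVV_VECTOR_BITS_ZVL case: with TARGET_MIN_VLEN
   = 128 and a DImode inner mode (precision 64), min_vlmax_bitsize is
   128 / (64 / 64) = 128, so V1DI (64 bits) is enabled while V2DI
   (128 bits) is not, because the VLA mode RVVM1DI already covers it.  */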
4514 /* We don't have to convert a floating point value to an integer when
4515 it has no fractional part. Thus, there is a limit for the half, single
4516 and double precision floating point types: any value greater than or
4517 equal to the limit has no fractional bits and is already integral.
4519 1. Half floating point.
4520 +-----------+---------------+
4521 | float | binary layout |
4522 +-----------+---------------+
4523 | 1023.5 | 0x63ff |
4524 +-----------+---------------+
4525 | 1024.0 | 0x6400 |
4526 +-----------+---------------+
4527 | 1025.0 | 0x6401 |
4528 +-----------+---------------+
4529 | ... | ... |
4531 All half precision floating point values are unchanged by ceil if
4532 they are greater than or equal to 1024.
4534 2. Single floating point.
4535 +-----------+---------------+
4536 | float | binary layout |
4537 +-----------+---------------+
4538 | 8388607.5 | 0x4affffff |
4539 +-----------+---------------+
4540 | 8388608.0 | 0x4b000000 |
4541 +-----------+---------------+
4542 | 8388609.0 | 0x4b000001 |
4543 +-----------+---------------+
4544 | ... | ... |
4546 All single precision floating point values are unchanged by ceil if
4547 they are greater than or equal to 8388608.
4549 3. Double floating point.
4550 +--------------------+--------------------+
4551 | float | binary layout |
4552 +--------------------+--------------------+
4553 | 4503599627370495.5 | 0X432fffffffffffff |
4554 +--------------------+--------------------+
4555 | 4503599627370496.0 | 0X4330000000000000 |
4556 +--------------------+--------------------+
4557 | 4503599627370497.0 | 0X4330000000000001 |
4558 +--------------------+--------------------+
4559 | ... | ... |
4561 All double precision floating point values are unchanged by ceil if
4562 they are greater than or equal to 4503599627370496.
4565 get_fp_rounding_coefficient (machine_mode inner_mode)
4567 REAL_VALUE_TYPE real;
4569 if (inner_mode == E_HFmode)
4570 real_from_integer (&real, inner_mode, 1024, SIGNED);
4571 else if (inner_mode == E_SFmode)
4572 real_from_integer (&real, inner_mode, 8388608, SIGNED);
4573 else if (inner_mode == E_DFmode)
4574 real_from_integer (&real, inner_mode, 4503599627370496, SIGNED);
4575 else
4576 gcc_unreachable ();
4578 return const_double_from_real_value (real, inner_mode);
4581 static rtx
4582 emit_vec_float_cmp_mask (rtx fp_vector, rtx_code code, rtx fp_scalar,
4583 machine_mode vec_fp_mode)
4585 /* Step-1: Prepare the scalar float compare register. */
4586 rtx fp_reg = gen_reg_rtx (GET_MODE_INNER (vec_fp_mode));
4587 emit_insn (gen_move_insn (fp_reg, fp_scalar));
4589 /* Step-2: Generate the mask. */
4590 machine_mode mask_mode = get_mask_mode (vec_fp_mode);
4591 rtx mask = gen_reg_rtx (mask_mode);
4592 rtx cmp = gen_rtx_fmt_ee (code, mask_mode, fp_vector, fp_reg);
4593 rtx cmp_ops[] = {mask, cmp, fp_vector, fp_reg};
4594 insn_code icode = code_for_pred_cmp_scalar (vec_fp_mode);
4595 emit_vlmax_insn (icode, COMPARE_OP, cmp_ops);
4597 return mask;
4600 static void
4601 emit_vec_copysign (rtx op_dest, rtx op_src_0, rtx op_src_1,
4602 machine_mode vec_mode)
4604 rtx sgnj_ops[] = {op_dest, op_src_0, op_src_1};
4605 insn_code icode = code_for_pred (UNSPEC_VCOPYSIGN, vec_mode);
4607 emit_vlmax_insn (icode, BINARY_OP, sgnj_ops);
4610 static void
4611 emit_vec_abs (rtx op_dest, rtx op_src, machine_mode vec_mode)
4613 rtx abs_ops[] = {op_dest, op_src};
4614 insn_code icode = code_for_pred (ABS, vec_mode);
4616 emit_vlmax_insn (icode, UNARY_OP, abs_ops);
4619 static void
4620 emit_vec_cvt_x_f (rtx op_dest, rtx op_src, rtx mask,
4621 insn_type type, machine_mode vec_mode)
4623 insn_code icode = code_for_pred_fcvt_x_f (UNSPEC_VFCVT, vec_mode);
4625 if (type & USE_VUNDEF_MERGE_P)
4627 rtx cvt_x_ops[] = {op_dest, mask, op_src};
4628 emit_vlmax_insn (icode, type, cvt_x_ops);
4630 else
4632 rtx cvt_x_ops[] = {op_dest, mask, op_dest, op_src};
4633 emit_vlmax_insn (icode, type, cvt_x_ops);
4637 static void
4638 emit_vec_cvt_x_f (rtx op_dest, rtx op_src, insn_type type,
4639 machine_mode vec_mode)
4641 rtx ops[] = {op_dest, op_src};
4642 insn_code icode = code_for_pred_fcvt_x_f (UNSPEC_VFCVT, vec_mode);
4644 emit_vlmax_insn (icode, type, ops);
4647 static void
4648 emit_vec_narrow_cvt_x_f (rtx op_dest, rtx op_src, insn_type type,
4649 machine_mode vec_mode)
4651 rtx ops[] = {op_dest, op_src};
4652 insn_code icode = code_for_pred_narrow_fcvt_x_f (UNSPEC_VFCVT, vec_mode);
4654 emit_vlmax_insn (icode, type, ops);
4657 static void
4658 emit_vec_widen_cvt_x_f (rtx op_dest, rtx op_src, insn_type type,
4659 machine_mode vec_mode)
4661 rtx ops[] = {op_dest, op_src};
4662 insn_code icode = code_for_pred_widen_fcvt_x_f (UNSPEC_VFCVT, vec_mode);
4664 emit_vlmax_insn (icode, type, ops);
4667 static void
4668 emit_vec_widen_cvt_f_f (rtx op_dest, rtx op_src, insn_type type,
4669 machine_mode vec_mode)
4671 rtx ops[] = {op_dest, op_src};
4672 insn_code icode = code_for_pred_extend (vec_mode);
4674 emit_vlmax_insn (icode, type, ops);
4677 static void
4678 emit_vec_cvt_f_x (rtx op_dest, rtx op_src, rtx mask,
4679 insn_type type, machine_mode vec_mode)
4681 rtx cvt_fp_ops[] = {op_dest, mask, op_dest, op_src};
4682 insn_code icode = code_for_pred (FLOAT, vec_mode);
4684 emit_vlmax_insn (icode, type, cvt_fp_ops);
4687 static void
4688 emit_vec_cvt_x_f_rtz (rtx op_dest, rtx op_src, rtx mask,
4689 insn_type type, machine_mode vec_mode)
4691 insn_code icode = code_for_pred (FIX, vec_mode);
4693 if (type & USE_VUNDEF_MERGE_P)
4695 rtx cvt_x_ops[] = {op_dest, mask, op_src};
4696 emit_vlmax_insn (icode, type, cvt_x_ops);
4698 else
4700 rtx cvt_x_ops[] = {op_dest, mask, op_dest, op_src};
4701 emit_vlmax_insn (icode, type, cvt_x_ops);
4705 static void
4706 emit_vec_binary_alu (rtx op_dest, rtx op_1, rtx op_2, enum rtx_code rcode,
4707 machine_mode vec_mode)
4709 rtx ops[] = {op_dest, op_1, op_2};
4710 insn_code icode = code_for_pred (rcode, vec_mode);
4712 emit_vlmax_insn (icode, BINARY_OP, ops);
4715 void
4716 expand_vec_ceil (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
4717 machine_mode vec_int_mode)
4719 /* Step-1: Get the abs float value for mask generation. */
4720 emit_vec_abs (op_0, op_1, vec_fp_mode);
4722 /* Step-2: Generate the mask on const fp. */
4723 rtx const_fp = get_fp_rounding_coefficient (GET_MODE_INNER (vec_fp_mode));
4724 rtx mask = emit_vec_float_cmp_mask (op_0, LT, const_fp, vec_fp_mode);
4726 /* Step-3: Convert to integer on mask, with rounding up (aka ceil). */
4727 rtx tmp = gen_reg_rtx (vec_int_mode);
4728 emit_vec_cvt_x_f (tmp, op_1, mask, UNARY_OP_TAMA_FRM_RUP, vec_fp_mode);
4730 /* Step-4: Convert to floating-point on mask for the final result.
4731 To avoid unnecessary frm register access, we use RUP here and it will
4732 never do the rounding up because the tmp rtx comes from the float
4733 to int conversion. */
4734 emit_vec_cvt_f_x (op_0, tmp, mask, UNARY_OP_TAMU_FRM_RUP, vec_fp_mode);
4736 /* Step-5: Retrieve the sign bit for -0.0. */
4737 emit_vec_copysign (op_0, op_0, op_1, vec_fp_mode);
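/* Worked example of the mask trick used by the rounding expanders above,
   for ceil on SFmode: for an input lane x = 2.3, |x| < 8388608.0 so the
   lane is active; converting with FRM = RUP gives 3, converting back gives
   3.0, and the final copysign leaves 3.0.  For a huge lane such as 1e30
   the compare masks it off, the conversions leave it untouched (it is
   already integral), and the copysign restores the original value,
   including the sign of -0.0.  */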
4740 void
4741 expand_vec_floor (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
4742 machine_mode vec_int_mode)
4744 /* Step-1: Get the abs float value for mask generation. */
4745 emit_vec_abs (op_0, op_1, vec_fp_mode);
4747 /* Step-2: Generate the mask on const fp. */
4748 rtx const_fp = get_fp_rounding_coefficient (GET_MODE_INNER (vec_fp_mode));
4749 rtx mask = emit_vec_float_cmp_mask (op_0, LT, const_fp, vec_fp_mode);
4751 /* Step-3: Convert to integer on mask, with rounding down (aka floor). */
4752 rtx tmp = gen_reg_rtx (vec_int_mode);
4753 emit_vec_cvt_x_f (tmp, op_1, mask, UNARY_OP_TAMA_FRM_RDN, vec_fp_mode);
4755 /* Step-4: Convert to floating-point on mask for the floor result. */
4756 emit_vec_cvt_f_x (op_0, tmp, mask, UNARY_OP_TAMU_FRM_RDN, vec_fp_mode);
4758 /* Step-5: Retrieve the sign bit for -0.0. */
4759 emit_vec_copysign (op_0, op_0, op_1, vec_fp_mode);
4762 void
4763 expand_vec_nearbyint (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
4764 machine_mode vec_int_mode)
4766 /* Step-1: Get the abs float value for mask generation. */
4767 emit_vec_abs (op_0, op_1, vec_fp_mode);
4769 /* Step-2: Generate the mask on const fp. */
4770 rtx const_fp = get_fp_rounding_coefficient (GET_MODE_INNER (vec_fp_mode));
4771 rtx mask = emit_vec_float_cmp_mask (op_0, LT, const_fp, vec_fp_mode);
4773 /* Step-3: Back up the FP exception flags; nearbyint never raises exceptions. */
4774 rtx fflags = gen_reg_rtx (SImode);
4775 emit_insn (gen_riscv_frflags (fflags));
4777 /* Step-4: Convert to integer on mask, with the dynamic rounding mode (aka nearbyint). */
4778 rtx tmp = gen_reg_rtx (vec_int_mode);
4779 emit_vec_cvt_x_f (tmp, op_1, mask, UNARY_OP_TAMA_FRM_DYN, vec_fp_mode);
4781 /* Step-5: Convert to floating-point on mask for the nearbyint result. */
4782 emit_vec_cvt_f_x (op_0, tmp, mask, UNARY_OP_TAMU_FRM_DYN, vec_fp_mode);
4784 /* Step-6: Restore FP exception flags. */
4785 emit_insn (gen_riscv_fsflags (fflags));
4787 /* Step-7: Retrieve the sign bit for -0.0. */
4788 emit_vec_copysign (op_0, op_0, op_1, vec_fp_mode);
4791 void
4792 expand_vec_rint (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
4793 machine_mode vec_int_mode)
4795 /* Step-1: Get the abs float value for mask generation. */
4796 emit_vec_abs (op_0, op_1, vec_fp_mode);
4798 /* Step-2: Generate the mask on const fp. */
4799 rtx const_fp = get_fp_rounding_coefficient (GET_MODE_INNER (vec_fp_mode));
4800 rtx mask = emit_vec_float_cmp_mask (op_0, LT, const_fp, vec_fp_mode);
4802 /* Step-3: Convert to integer on mask, with dyn rounding (aka rint). */
4803 rtx tmp = gen_reg_rtx (vec_int_mode);
4804 emit_vec_cvt_x_f (tmp, op_1, mask, UNARY_OP_TAMA_FRM_DYN, vec_fp_mode);
4806 /* Step-4: Convert to floating-point on mask for the rint result. */
4807 emit_vec_cvt_f_x (op_0, tmp, mask, UNARY_OP_TAMU_FRM_DYN, vec_fp_mode);
4809 /* Step-5: Retrieve the sign bit for -0.0. */
4810 emit_vec_copysign (op_0, op_0, op_1, vec_fp_mode);
4813 void
4814 expand_vec_round (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
4815 machine_mode vec_int_mode)
4817 /* Step-1: Get the abs float value for mask generation. */
4818 emit_vec_abs (op_0, op_1, vec_fp_mode);
4820 /* Step-2: Generate the mask on const fp. */
4821 rtx const_fp = get_fp_rounding_coefficient (GET_MODE_INNER (vec_fp_mode));
4822 rtx mask = emit_vec_float_cmp_mask (op_0, LT, const_fp, vec_fp_mode);
4824 /* Step-3: Convert to integer on mask, rounding to nearest (aka round). */
4825 rtx tmp = gen_reg_rtx (vec_int_mode);
4826 emit_vec_cvt_x_f (tmp, op_1, mask, UNARY_OP_TAMA_FRM_RMM, vec_fp_mode);
4828 /* Step-4: Convert to floating-point on mask for the round result. */
4829 emit_vec_cvt_f_x (op_0, tmp, mask, UNARY_OP_TAMU_FRM_RMM, vec_fp_mode);
4831 /* Step-5: Retrieve the sign bit for -0.0. */
4832 emit_vec_copysign (op_0, op_0, op_1, vec_fp_mode);
4835 void
4836 expand_vec_trunc (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
4837 machine_mode vec_int_mode)
4839 /* Step-1: Get the abs float value for mask generation. */
4840 emit_vec_abs (op_0, op_1, vec_fp_mode);
4842 /* Step-2: Generate the mask on const fp. */
4843 rtx const_fp = get_fp_rounding_coefficient (GET_MODE_INNER (vec_fp_mode));
4844 rtx mask = emit_vec_float_cmp_mask (op_0, LT, const_fp, vec_fp_mode);
4846 /* Step-3: Convert to integer on mask, rounding to zero (aka truncate). */
4847 rtx tmp = gen_reg_rtx (vec_int_mode);
4848 emit_vec_cvt_x_f_rtz (tmp, op_1, mask, UNARY_OP_TAMA, vec_fp_mode);
4850 /* Step-4: Convert to floating-point on mask for the trunc result. */
4851 emit_vec_cvt_f_x (op_0, tmp, mask, UNARY_OP_TAMU_FRM_DYN, vec_fp_mode);
4853 /* Step-5: Retrieve the sign bit for -0.0. */
4854 emit_vec_copysign (op_0, op_0, op_1, vec_fp_mode);
4857 void
4858 expand_vec_roundeven (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
4859 machine_mode vec_int_mode)
4861 /* Step-1: Get the abs float value for mask generation. */
4862 emit_vec_abs (op_0, op_1, vec_fp_mode);
4864 /* Step-2: Generate the mask on const fp. */
4865 rtx const_fp = get_fp_rounding_coefficient (GET_MODE_INNER (vec_fp_mode));
4866 rtx mask = emit_vec_float_cmp_mask (op_0, LT, const_fp, vec_fp_mode);
4868 /* Step-3: Convert to integer on mask, rounding to nearest, ties to even. */
4869 rtx tmp = gen_reg_rtx (vec_int_mode);
4870 emit_vec_cvt_x_f (tmp, op_1, mask, UNARY_OP_TAMA_FRM_RNE, vec_fp_mode);
4872 /* Step-4: Convert to floating-point on mask for the roundeven result. */
4873 emit_vec_cvt_f_x (op_0, tmp, mask, UNARY_OP_TAMU_FRM_RNE, vec_fp_mode);
4875 /* Step-5: Retrieve the sign bit for -0.0. */
4876 emit_vec_copysign (op_0, op_0, op_1, vec_fp_mode);
4879 /* Handle the rounding from floating-point to int/long/long long. */
4880 static void
4881 emit_vec_rounding_to_integer (rtx op_0, rtx op_1, insn_type type,
4882 machine_mode vec_fp_mode,
4883 machine_mode vec_int_mode,
4884 machine_mode vec_bridge_mode = E_VOIDmode)
4886 poly_uint16 vec_fp_size = GET_MODE_SIZE (vec_fp_mode);
4887 poly_uint16 vec_int_size = GET_MODE_SIZE (vec_int_mode);
4889 if (known_eq (vec_fp_size, vec_int_size)) /* SF => SI, DF => DI. */
4890 emit_vec_cvt_x_f (op_0, op_1, type, vec_fp_mode);
4891 else if (maybe_eq (vec_fp_size, vec_int_size * 2)) /* DF => SI. */
4892 emit_vec_narrow_cvt_x_f (op_0, op_1, type, vec_fp_mode);
4893 else if (maybe_eq (vec_fp_size * 2, vec_int_size)) /* SF => DI, HF => SI. */
4894 emit_vec_widen_cvt_x_f (op_0, op_1, type, vec_int_mode);
4895 else if (maybe_eq (vec_fp_size * 4, vec_int_size)) /* HF => DI. */
4897 gcc_assert (vec_bridge_mode != E_VOIDmode);
4899 rtx op_sf = gen_reg_rtx (vec_bridge_mode);
4901 /* Step-1: HF => SF, no rounding here. */
4902 emit_vec_widen_cvt_f_f (op_sf, op_1, UNARY_OP, vec_bridge_mode);
4903 /* Step-2: SF => DI. */
4904 emit_vec_widen_cvt_x_f (op_0, op_sf, type, vec_int_mode);
4906 else
4907 gcc_unreachable ();
4910 void
4911 expand_vec_lrint (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
4912 machine_mode vec_int_mode, machine_mode vec_bridge_mode)
4914 emit_vec_rounding_to_integer (op_0, op_1, UNARY_OP_FRM_DYN, vec_fp_mode,
4915 vec_int_mode, vec_bridge_mode);
4918 void
4919 expand_vec_lround (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
4920 machine_mode vec_int_mode, machine_mode vec_bridge_mode)
4922 emit_vec_rounding_to_integer (op_0, op_1, UNARY_OP_FRM_RMM, vec_fp_mode,
4923 vec_int_mode, vec_bridge_mode);
4926 void
4927 expand_vec_lceil (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
4928 machine_mode vec_int_mode)
4930 emit_vec_rounding_to_integer (op_0, op_1, UNARY_OP_FRM_RUP, vec_fp_mode,
4931 vec_int_mode);
4934 void
4935 expand_vec_lfloor (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
4936 machine_mode vec_int_mode)
4938 emit_vec_rounding_to_integer (op_0, op_1, UNARY_OP_FRM_RDN, vec_fp_mode,
4939 vec_int_mode);
4942 /* Expand the standard name usadd<mode>3 for vector modes; we can leverage
4943 the vector fixed-point single-width saturating add directly. */
4945 void
4946 expand_vec_usadd (rtx op_0, rtx op_1, rtx op_2, machine_mode vec_mode)
4948 emit_vec_binary_alu (op_0, op_1, op_2, US_PLUS, vec_mode);
4951 /* Expand the standard name ssadd<mode>3 for vector modes; we can leverage
4952 the vector fixed-point single-width saturating add directly. */
4954 void
4955 expand_vec_ssadd (rtx op_0, rtx op_1, rtx op_2, machine_mode vec_mode)
4957 emit_vec_binary_alu (op_0, op_1, op_2, SS_PLUS, vec_mode);
4960 /* Expand the standard name ussub<mode>3 for vector modes; we can leverage
4961 the vector fixed-point single-width saturating subtract directly. */
4963 void
4964 expand_vec_ussub (rtx op_0, rtx op_1, rtx op_2, machine_mode vec_mode)
4966 emit_vec_binary_alu (op_0, op_1, op_2, US_MINUS, vec_mode);
4969 /* Expand the standard name sssub<mode>3 for vector modes; we can leverage
4970 the vector fixed-point single-width saturating subtract directly. */
4972 void
4973 expand_vec_sssub (rtx op_0, rtx op_1, rtx op_2, machine_mode vec_mode)
4975 emit_vec_binary_alu (op_0, op_1, op_2, SS_MINUS, vec_mode);
4978 /* Expand the standard name ustrunc<m><n>2 for double vector modes, like
4979 DI => SI. We can leverage the vector fixed-point narrowing clip
4980 directly. */
4982 void
4983 expand_vec_double_ustrunc (rtx op_0, rtx op_1, machine_mode vec_mode)
4985 insn_code icode;
4986 rtx zero = CONST0_RTX (Xmode);
4987 enum unspec unspec = UNSPEC_VNCLIPU;
4988 rtx ops[] = {op_0, op_1, zero};
4990 icode = code_for_pred_narrow_clip_scalar (unspec, vec_mode);
4991 emit_vlmax_insn (icode, BINARY_OP_VXRM_RNU, ops);
4994 /* Expand the standard name sstrunc<m><n>2 for double vector modes, like
4995 DI => SI. We can leverage the vector fixed-point narrowing clip
4996 directly. */
4998 void
4999 expand_vec_double_sstrunc (rtx op_0, rtx op_1, machine_mode vec_mode)
5001 insn_code icode;
5002 rtx zero = CONST0_RTX (Xmode);
5003 enum unspec unspec = UNSPEC_VNCLIP;
5004 rtx ops[] = {op_0, op_1, zero};
5006 icode = code_for_pred_narrow_clip_scalar (unspec, vec_mode);
5007 emit_vlmax_insn (icode, BINARY_OP_VXRM_RNU, ops);
5010 /* Expand the standard name ustrunc<m><n>2 for quad vector modes, like
5011 DI => HI. We can leverage the vector fixed-point narrowing clip
5012 directly. */
5014 void
5015 expand_vec_quad_ustrunc (rtx op_0, rtx op_1, machine_mode vec_mode,
5016 machine_mode double_mode)
5018 rtx double_rtx = gen_reg_rtx (double_mode);
5020 expand_vec_double_ustrunc (double_rtx, op_1, vec_mode);
5021 expand_vec_double_ustrunc (op_0, double_rtx, double_mode);
5024 /* Expand the standard name sstrunc<m><n>2 for quad vector modes, like
5025 DI => HI. We can leverage the vector fixed-point narrowing clip
5026 directly. */
5028 void
5029 expand_vec_quad_sstrunc (rtx op_0, rtx op_1, machine_mode vec_mode,
5030 machine_mode double_mode)
5032 rtx double_rtx = gen_reg_rtx (double_mode);
5034 expand_vec_double_sstrunc (double_rtx, op_1, vec_mode);
5035 expand_vec_double_sstrunc (op_0, double_rtx, double_mode);
5038 /* Expand the standard name ustrunc<m><n>2 for oct vector modes, like
5039 DI => QI. We can leverage the vector fixed-point narrowing clip
5040 directly. */
5042 void
5043 expand_vec_oct_ustrunc (rtx op_0, rtx op_1, machine_mode vec_mode,
5044 machine_mode double_mode, machine_mode quad_mode)
5046 rtx double_rtx = gen_reg_rtx (double_mode);
5047 rtx quad_rtx = gen_reg_rtx (quad_mode);
5049 expand_vec_double_ustrunc (double_rtx, op_1, vec_mode);
5050 expand_vec_double_ustrunc (quad_rtx, double_rtx, double_mode);
5051 expand_vec_double_ustrunc (op_0, quad_rtx, quad_mode);
5054 /* Expand the standard name sstrunc<m><n>2 for oct vector modes, like
5055 DI => QI. We can leverage the vector fixed-point narrowing clip
5056 directly. */
5058 void
5059 expand_vec_oct_sstrunc (rtx op_0, rtx op_1, machine_mode vec_mode,
5060 machine_mode double_mode, machine_mode quad_mode)
5062 rtx double_rtx = gen_reg_rtx (double_mode);
5063 rtx quad_rtx = gen_reg_rtx (quad_mode);
5065 expand_vec_double_sstrunc (double_rtx, op_1, vec_mode);
5066 expand_vec_double_sstrunc (quad_rtx, double_rtx, double_mode);
5067 expand_vec_double_sstrunc (op_0, quad_rtx, quad_mode);
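/* The truncation helpers above chain the narrowing clip: e.g. a saturating
   DI -> QI truncation is performed as DI -> SI -> HI -> QI, emitting three
   vnclip/vnclipu steps with a zero shift amount.  Because the shift is
   zero, the fixed-point rounding mode (vxrm) has no effect; the instruction
   is used purely for its saturating narrowing behavior.  */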
5070 /* Vectorize popcount by the Wilkes-Wheeler-Gill algorithm that libgcc uses as
5071 well. */
5072 void
5073 expand_popcount (rtx *ops)
5075 rtx dst = ops[0];
5076 rtx src = ops[1];
5077 machine_mode mode = GET_MODE (dst);
5078 scalar_mode imode = GET_MODE_INNER (mode);
5079 static const uint64_t m5 = 0x5555555555555555ULL;
5080 static const uint64_t m3 = 0x3333333333333333ULL;
5081 static const uint64_t mf = 0x0F0F0F0F0F0F0F0FULL;
5082 static const uint64_t m1 = 0x0101010101010101ULL;
5084 rtx x1 = gen_reg_rtx (mode);
5085 rtx x2 = gen_reg_rtx (mode);
5086 rtx x3 = gen_reg_rtx (mode);
5087 rtx x4 = gen_reg_rtx (mode);
5089 /* x1 = src - ((src >> 1) & 0x555...); */
5090 rtx shift1 = expand_binop (mode, lshr_optab, src, GEN_INT (1), NULL, true,
5091 OPTAB_DIRECT);
5093 rtx and1 = gen_reg_rtx (mode);
5094 rtx ops1[] = {and1, shift1, gen_int_mode (m5, imode)};
5095 emit_vlmax_insn (code_for_pred_scalar (AND, mode), riscv_vector::BINARY_OP,
5096 ops1);
5098 x1 = expand_binop (mode, sub_optab, src, and1, NULL, true, OPTAB_DIRECT);
5100 /* x2 = (x1 & 0x3333333333333333ULL) + ((x1 >> 2) & 0x3333333333333333ULL);
5102 rtx and2 = gen_reg_rtx (mode);
5103 rtx ops2[] = {and2, x1, gen_int_mode (m3, imode)};
5104 emit_vlmax_insn (code_for_pred_scalar (AND, mode), riscv_vector::BINARY_OP,
5105 ops2);
5107 rtx shift2 = expand_binop (mode, lshr_optab, x1, GEN_INT (2), NULL, true,
5108 OPTAB_DIRECT);
5110 rtx and22 = gen_reg_rtx (mode);
5111 rtx ops22[] = {and22, shift2, gen_int_mode (m3, imode)};
5112 emit_vlmax_insn (code_for_pred_scalar (AND, mode), riscv_vector::BINARY_OP,
5113 ops22);
5115 x2 = expand_binop (mode, add_optab, and2, and22, NULL, true, OPTAB_DIRECT);
5117 /* x3 = (x2 + (x2 >> 4)) & 0x0f0f0f0f0f0f0f0fULL; */
5118 rtx shift3 = expand_binop (mode, lshr_optab, x2, GEN_INT (4), NULL, true,
5119 OPTAB_DIRECT);
5121 rtx plus3
5122 = expand_binop (mode, add_optab, x2, shift3, NULL, true, OPTAB_DIRECT);
5124 rtx ops3[] = {x3, plus3, gen_int_mode (mf, imode)};
5125 emit_vlmax_insn (code_for_pred_scalar (AND, mode), riscv_vector::BINARY_OP,
5126 ops3);
5128 /* dest = (x3 * 0x0101010101010101ULL) >> 56; */
5129 rtx mul4 = gen_reg_rtx (mode);
5130 rtx ops4[] = {mul4, x3, gen_int_mode (m1, imode)};
5131 emit_vlmax_insn (code_for_pred_scalar (MULT, mode), riscv_vector::BINARY_OP,
5132 ops4);
5134 x4 = expand_binop (mode, lshr_optab, mul4,
5135 GEN_INT (GET_MODE_BITSIZE (imode) - 8), NULL, true,
5136 OPTAB_DIRECT);
5138 emit_move_insn (dst, x4);
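/* Worked example of the bit counting above on the single byte 0xda
   (binary 11011010, five bits set):

     x1 = 0xda - ((0xda >> 1) & 0x55) = 0xda - 0x45 = 0x95      ; 2,1,1,1 per pair
     x2 = (0x95 & 0x33) + ((0x95 >> 2) & 0x33) = 0x11 + 0x21 = 0x32 ; 3,2 per nibble
     x3 = (0x32 + (0x32 >> 4)) & 0x0f = 0x05                    ; 5 per byte

   For wider element sizes the final multiply by 0x0101...01 sums the
   per-byte counts into the most significant byte, which the right shift
   by (element size - 8) bits then extracts.  */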
5141 /* Return true if it is VLMAX AVL TYPE. */
5142 bool
5143 vlmax_avl_type_p (rtx_insn *rinsn)
5145 extract_insn_cached (rinsn);
5146 int index = get_attr_avl_type_idx (rinsn);
5147 if (index == INVALID_ATTRIBUTE)
5148 return false;
5149 rtx avl_type = recog_data.operand[index];
5150 return INTVAL (avl_type) == VLMAX;
5153 /* Return true if it is an RVV instruction that depends on the VL global
5154 status register. */
5155 bool
5156 has_vl_op (rtx_insn *rinsn)
5158 return recog_memoized (rinsn) >= 0 && get_attr_has_vl_op (rinsn);
5161 /* Get default tail policy. */
5162 static bool
5163 get_default_ta ()
5165 /* For an instruction that doesn't require TA, we still need a default value
5166 to emit vsetvl. We pick the default value according to the preferred policy. */
5167 return (bool) (get_prefer_tail_policy () & 0x1
5168 || (get_prefer_tail_policy () >> 1 & 0x1));
5171 /* Helper function to get TA operand. */
5172 bool
5173 tail_agnostic_p (rtx_insn *rinsn)
5175 /* If it doesn't have TA, we return agnostic by default. */
5176 extract_insn_cached (rinsn);
5177 int ta = get_attr_ta (rinsn);
5178 return ta == INVALID_ATTRIBUTE ? get_default_ta () : IS_AGNOSTIC (ta);
5181 /* Change the insn and assert that the change always succeeds. */
5182 void
5183 validate_change_or_fail (rtx object, rtx *loc, rtx new_rtx, bool in_group)
5185 bool change_p = validate_change (object, loc, new_rtx, in_group);
5186 gcc_assert (change_p);
5189 /* Return true if it is NONVLMAX AVL TYPE. */
5190 bool
5191 nonvlmax_avl_type_p (rtx_insn *rinsn)
5193 extract_insn_cached (rinsn);
5194 int index = get_attr_avl_type_idx (rinsn);
5195 if (index == INVALID_ATTRIBUTE)
5196 return false;
5197 rtx avl_type = recog_data.operand[index];
5198 return INTVAL (avl_type) == NONVLMAX;
5201 /* Return true if RTX is RVV VLMAX AVL. */
5202 bool
5203 vlmax_avl_p (rtx x)
5205 return x && rtx_equal_p (x, RVV_VLMAX);
5208 /* Helper function to get SEW operand. We always have SEW value for
5209 all RVV instructions that have VTYPE OP. */
5210 uint8_t
5211 get_sew (rtx_insn *rinsn)
5213 return get_attr_sew (rinsn);
5216 /* Helper function to get VLMUL operand. We always have VLMUL value for
5217 all RVV instructions that have VTYPE OP. */
5218 enum vlmul_type
5219 get_vlmul (rtx_insn *rinsn)
5221 return (enum vlmul_type) get_attr_vlmul (rinsn);
5224 /* Count the number of occurrences of REGNO in RINSN. */
5226 count_regno_occurrences (rtx_insn *rinsn, unsigned int regno)
5228 int count = 0;
5229 extract_insn (rinsn);
5230 for (int i = 0; i < recog_data.n_operands; i++)
5231 if (refers_to_regno_p (regno, recog_data.operand[i]))
5232 count++;
5233 return count;
5236 /* Return true if the OP can be directly broadcasted. */
5237 bool
5238 can_be_broadcasted_p (rtx op)
5240 machine_mode mode = GET_MODE (op);
5241 /* We don't allow RA (register allocation) reload to generate
5242 (vec_duplicate:DI reg) on an RV32 system, whereas we do allow
5243 (vec_duplicate:DI mem) on an RV32 system. */
5244 if (!can_create_pseudo_p () && !FLOAT_MODE_P (mode)
5245 && maybe_gt (GET_MODE_SIZE (mode), GET_MODE_SIZE (Pmode))
5246 && !satisfies_constraint_Wdm (op))
5247 return false;
5249 if (satisfies_constraint_K (op) || register_operand (op, mode)
5250 || satisfies_constraint_Wdm (op) || rtx_equal_p (op, CONST0_RTX (mode)))
5251 return true;
5253 return can_create_pseudo_p () && nonmemory_operand (op, mode);
5256 void
5257 emit_vec_extract (rtx target, rtx src, rtx index)
5259 machine_mode vmode = GET_MODE (src);
5260 machine_mode smode = GET_MODE (target);
5261 class expand_operand ops[3];
5262 enum insn_code icode
5263 = convert_optab_handler (vec_extract_optab, vmode, smode);
5264 gcc_assert (icode != CODE_FOR_nothing);
5265 create_output_operand (&ops[0], target, smode);
5266 ops[0].target = 1;
5267 create_input_operand (&ops[1], src, vmode);
5269 poly_int64 val;
5270 if (poly_int_rtx_p (index, &val))
5271 create_integer_operand (&ops[2], val);
5272 else
5273 create_input_operand (&ops[2], index, Pmode);
5275 expand_insn (icode, 3, ops);
5276 if (ops[0].value != target)
5277 emit_move_insn (target, ops[0].value);
5280 /* Return true if the offset mode is a valid mode that we can use for
5281 gather/scatter autovectorization. */
5282 bool
5283 gather_scatter_valid_offset_p (machine_mode mode)
5285 /* If the element size of offset mode is already >= Pmode size,
5286 we don't need any extensions. */
5287 if (known_ge (GET_MODE_SIZE (GET_MODE_INNER (mode)), UNITS_PER_WORD))
5288 return true;
5290 /* Since we will very likely need to extend the offset mode into a vector
5291 Pmode, disable gather/scatter autovectorization if we can't extend the
5292 offset mode into vector Pmode. */
5293 if (!get_vector_mode (Pmode, GET_MODE_NUNITS (mode)).exists ())
5294 return false;
5295 return true;
5298 /* Implement TARGET_ESTIMATED_POLY_VALUE.
5299 Look into the tuning structure for an estimate.
5300 KIND specifies the type of requested estimate: min, max or likely.
5301 For cores with a known VLA width all three estimates are the same.
5302 For generic VLA tuning we want to distinguish the maximum estimate from
5303 the minimum and likely ones.
5304 The likely estimate is the same as the minimum in that case to give a
5305 conservative behavior of auto-vectorizing with VLA when it is a win
5306 even for VLA vectorization.
5307 When VLA width information is available VAL.coeffs[1] is multiplied by
5308 the number of VLA chunks over the initial VLS bits. */
5309 HOST_WIDE_INT
5310 estimated_poly_value (poly_int64 val, unsigned int kind)
5312 unsigned int width_source
5313 = BITS_PER_RISCV_VECTOR.is_constant ()
5314 ? (unsigned int) BITS_PER_RISCV_VECTOR.to_constant ()
5315 : (unsigned int) RVV_VECTOR_BITS_SCALABLE;
5317 /* If there is no core-specific information then the minimum and likely
5318 values are based on TARGET_MIN_VLEN vectors and the maximum is based on
5319 the architectural maximum of 65536 bits. */
5320 unsigned int min_vlen_bytes = TARGET_MIN_VLEN / 8 - 1;
5321 if (width_source == RVV_VECTOR_BITS_SCALABLE)
5322 switch (kind)
5324 case POLY_VALUE_MIN:
5325 case POLY_VALUE_LIKELY:
5326 return val.coeffs[0];
5328 case POLY_VALUE_MAX:
5329 return val.coeffs[0] + val.coeffs[1] * min_vlen_bytes;
5332 /* Allow BITS_PER_RISCV_VECTOR to be a bitmask of different VL, treating the
5333 lowest as likely. This could be made more general if future -mtune
5334 options need it to be. */
5335 if (kind == POLY_VALUE_MAX)
5336 width_source = 1 << floor_log2 (width_source);
5337 else
5338 width_source = least_bit_hwi (width_source);
5340 /* If the core provides width information, use that. */
5341 HOST_WIDE_INT over_min_vlen = width_source - TARGET_MIN_VLEN;
5342 return val.coeffs[0] + val.coeffs[1] * over_min_vlen / TARGET_MIN_VLEN;
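/* Example: with generic VLA tuning and TARGET_MIN_VLEN = 128, a poly value
   of (4, 4) yields a minimum/likely estimate of 4 and a maximum estimate of
   4 + 4 * (128 / 8 - 1) = 64.  */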
5345 /* Return true if it is a whole register-to-register move. */
5346 bool
5347 whole_reg_to_reg_move_p (rtx *ops, machine_mode mode, int avl_type_index)
5349 /* An operation is a whole-register move if either
5350 (1) Its vlmax operand equals VLMAX
5351 (2) Its vl operand equals the number of units of its mode. */
5352 if (register_operand (ops[0], mode)
5353 && register_operand (ops[3], mode)
5354 && satisfies_constraint_vu (ops[2])
5355 && satisfies_constraint_Wc1 (ops[1]))
5357 if (INTVAL (ops[avl_type_index]) == VLMAX)
5358 return true;
5359 /* AVL propagation PASS will transform FIXED-VLMAX with NUNITS < 32
5360 into NON-VLMAX with LEN = NUNITS. */
5361 else if (CONST_INT_P (ops[4])
5362 && known_eq (INTVAL (ops[4]), GET_MODE_NUNITS (mode)))
5363 return true;
5365 return false;
5368 /* Return true if we can transform vmv.v.x/vfmv.v.f to vmv.s.x/vfmv.s.f. */
5369 bool
5370 splat_to_scalar_move_p (rtx *ops)
5372 return satisfies_constraint_Wc1 (ops[1])
5373 && satisfies_constraint_vu (ops[2])
5374 && !MEM_P (ops[3])
5375 && satisfies_constraint_c01 (ops[4])
5376 && INTVAL (ops[7]) == NONVLMAX
5377 && known_ge (GET_MODE_SIZE (Pmode), GET_MODE_SIZE (GET_MODE (ops[3])));
5380 } // namespace riscv_vector