gcc/config/riscv/riscv-v.cc
1 /* Subroutines used for code generation for RISC-V 'V' Extension for
2 GNU compiler.
3 Copyright (C) 2022-2024 Free Software Foundation, Inc.
4 Contributed by Juzhe Zhong (juzhe.zhong@rivai.ai), RiVAI Technologies Ltd.
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it
9 under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 3, or (at your option)
11 any later version.
13 GCC is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #define IN_TARGET_CODE 1
24 /* We have a maximum of 11 operands for RVV instruction patterns according to
25 the vector.md. */
26 #define RVV_INSN_OPERANDS_MAX 11
28 #include "config.h"
29 #include "system.h"
30 #include "coretypes.h"
31 #include "tm.h"
32 #include "backend.h"
33 #include "rtl.h"
34 #include "insn-config.h"
35 #include "insn-attr.h"
36 #include "recog.h"
37 #include "alias.h"
38 #include "tree.h"
39 #include "stringpool.h"
40 #include "attribs.h"
41 #include "explow.h"
42 #include "memmodel.h"
43 #include "emit-rtl.h"
44 #include "tm_p.h"
45 #include "target.h"
46 #include "targhooks.h"
47 #include "expr.h"
48 #include "optabs.h"
49 #include "tm-constrs.h"
50 #include "rtx-vector-builder.h"
51 #include "targhooks.h"
52 #include "predict.h"
53 #include "errors.h"
54 #include "riscv-v.h"
56 using namespace riscv_vector;
58 namespace riscv_vector {
60 /* Return true if NUNITS <= 31, so that we can use an immediate AVL in vsetivli. */
61 bool
62 imm_avl_p (machine_mode mode)
64 poly_uint64 nunits = GET_MODE_NUNITS (mode);
66 return nunits.is_constant ()
68 /* vsetivli can only hold an immediate AVL in the range 0 ~ 31. */
68 ? (IN_RANGE (nunits.to_constant (), 0, 31))
69 /* Only allowed in VLS-VLMAX mode. */
70 : false;
73 /* Return true if LEN is equal to the NUNITS of MODE (which may be outside the range [0, 31]). */
74 static bool
75 is_vlmax_len_p (machine_mode mode, rtx len)
77 poly_int64 value;
78 return poly_int_rtx_p (len, &value)
79 && known_eq (value, GET_MODE_NUNITS (mode));
82 /* Helper functions for insn_flags and insn_types. */
84 /* Return true if the caller needs to pass a mask operand for an insn
85 pattern with INSN_FLAGS. */
87 static bool
88 need_mask_operand_p (unsigned insn_flags)
90 return (insn_flags & HAS_MASK_P)
91 && !(insn_flags & (USE_ONE_TRUE_MASK_P | USE_ALL_TRUES_MASK_P));
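/* For example, the gather expanders below pass BINARY_OP_TAMU together with
   an explicit mask operand (see emit_vlmax_masked_gather_mu_insn), while
   plain BINARY_OP uses an implicit all-trues mask and therefore needs no
   mask operand from the caller (see emit_vlmax_gather_insn).  */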
94 template <int MAX_OPERANDS> class insn_expander
96 public:
97 insn_expander () = delete;
99 insn_expander (unsigned insn_flags, bool vlmax_p)
100 : m_insn_flags (insn_flags), m_opno (0), m_vlmax_p (vlmax_p),
101 m_vl_op (NULL_RTX)
103 check_insn_flags ();
106 void check_insn_flags () const
108 if (m_insn_flags & USE_ONE_TRUE_MASK_P)
109 /* USE_ONE_TRUE_MASK_P is dependent on HAS_MASK_P. */
110 gcc_assert ((m_insn_flags & HAS_MASK_P));
112 if (m_insn_flags & USE_ALL_TRUES_MASK_P)
113 /* USE_ALL_TRUES_MASK_P is dependent on HAS_MASK_P. */
114 gcc_assert ((m_insn_flags & HAS_MASK_P));
116 /* USE_ONE_TRUE_MASK_P and USE_ALL_TRUES_MASK_P are mutually exclusive. */
117 gcc_assert (!((m_insn_flags & USE_ONE_TRUE_MASK_P)
118 && (m_insn_flags & USE_ALL_TRUES_MASK_P)));
120 if (m_insn_flags & USE_VUNDEF_MERGE_P)
121 /* USE_VUNDEF_MERGE_P is dependent on HAS_MERGE_P. */
122 gcc_assert ((m_insn_flags & HAS_MERGE_P));
124 /* TU_POLICY_P and TDEFAULT_POLICY_P are mutually exclusive. */
125 gcc_assert (
126 !((m_insn_flags & TU_POLICY_P) && (m_insn_flags & TDEFAULT_POLICY_P)));
128 /* MU_POLICY_P and MDEFAULT_POLICY_P are mutually exclusive. */
129 gcc_assert (
130 !((m_insn_flags & MU_POLICY_P) && (m_insn_flags & MDEFAULT_POLICY_P)));
132 /* NULLARY_OP_P, UNARY_OP_P, BINARY_OP_P, TERNARY_OP_P are mutually
133 exclusive. */
134 gcc_assert (
135 !((m_insn_flags & NULLARY_OP_P)
136 && ((m_insn_flags & UNARY_OP_P) || (m_insn_flags & BINARY_OP_P)
137 || (m_insn_flags & TERNARY_OP_P))));
138 gcc_assert (
139 !((m_insn_flags & UNARY_OP_P)
140 && ((m_insn_flags & NULLARY_OP_P) || (m_insn_flags & BINARY_OP_P)
141 || (m_insn_flags & TERNARY_OP_P))));
142 gcc_assert (
143 !((m_insn_flags & BINARY_OP_P)
144 && ((m_insn_flags & NULLARY_OP_P) || (m_insn_flags & UNARY_OP_P)
145 || (m_insn_flags & TERNARY_OP_P))));
146 gcc_assert (
147 !((m_insn_flags & TERNARY_OP_P)
148 && ((m_insn_flags & NULLARY_OP_P) || (m_insn_flags & UNARY_OP_P)
149 || (m_insn_flags & BINARY_OP_P))));
152 void set_vl (rtx vl) { m_vl_op = vl; }
154 void add_output_operand (rtx x, machine_mode mode)
156 create_output_operand (&m_ops[m_opno++], x, mode);
157 gcc_assert (m_opno <= MAX_OPERANDS);
159 void add_input_operand (rtx x, machine_mode mode)
161 create_input_operand (&m_ops[m_opno++], x, mode);
162 gcc_assert (m_opno <= MAX_OPERANDS);
164 void add_all_one_mask_operand (machine_mode mask_mode)
166 add_input_operand (CONSTM1_RTX (mask_mode), mask_mode);
168 void add_first_one_true_mask_operand (machine_mode mask_mode)
170 add_input_operand (gen_scalar_move_mask (mask_mode), mask_mode);
172 void add_vundef_operand (machine_mode dest_mode)
174 add_input_operand (RVV_VUNDEF (dest_mode), dest_mode);
176 void add_policy_operand ()
178 if (m_insn_flags & TU_POLICY_P)
180 rtx tail_policy_rtx = gen_int_mode (TAIL_UNDISTURBED, Pmode);
181 add_input_operand (tail_policy_rtx, Pmode);
183 else if (m_insn_flags & TDEFAULT_POLICY_P)
185 rtx tail_policy_rtx = gen_int_mode (get_prefer_tail_policy (), Pmode);
186 add_input_operand (tail_policy_rtx, Pmode);
189 if (m_insn_flags & MU_POLICY_P)
191 rtx mask_policy_rtx = gen_int_mode (MASK_UNDISTURBED, Pmode);
192 add_input_operand (mask_policy_rtx, Pmode);
194 else if (m_insn_flags & MDEFAULT_POLICY_P)
196 rtx mask_policy_rtx = gen_int_mode (get_prefer_mask_policy (), Pmode);
197 add_input_operand (mask_policy_rtx, Pmode);
200 void add_avl_type_operand (avl_type type)
202 add_input_operand (gen_int_mode (type, Pmode), Pmode);
205 void
206 add_rounding_mode_operand (enum floating_point_rounding_mode rounding_mode)
208 rtx frm_rtx = gen_int_mode (rounding_mode, Pmode);
209 add_input_operand (frm_rtx, Pmode);
212 void
213 add_rounding_mode_operand (enum fixed_point_rounding_mode rounding_mode)
215 rtx frm_rtx = gen_int_mode (rounding_mode, Pmode);
216 add_input_operand (frm_rtx, Pmode);
219 /* Return the vtype mode based on insn_flags.
220 The vtype mode is the mode that the vsetvl insn sets. */
221 machine_mode
222 get_vtype_mode (rtx *ops)
224 machine_mode vtype_mode;
225 if (m_insn_flags & VTYPE_MODE_FROM_OP1_P)
226 vtype_mode = GET_MODE (ops[1]);
227 else
228 vtype_mode = GET_MODE (ops[0]);
229 return vtype_mode;
232 void emit_insn (enum insn_code icode, rtx *ops)
234 int opno = 0;
235 int num_ops;
236 /* True if any operand is a memory operand. */
237 bool any_mem_p = false;
239 machine_mode vtype_mode = get_vtype_mode (ops);
240 machine_mode mask_mode = get_mask_mode (vtype_mode);
242 /* Add dest operand. */
243 if (m_insn_flags & HAS_DEST_P)
245 rtx op = ops[opno++];
246 any_mem_p |= MEM_P (op);
247 add_output_operand (op, GET_MODE (op));
250 /* Add mask operand. */
251 if (m_insn_flags & USE_ONE_TRUE_MASK_P)
252 add_first_one_true_mask_operand (mask_mode);
253 else if (m_insn_flags & USE_ALL_TRUES_MASK_P)
254 add_all_one_mask_operand (mask_mode);
255 else if (m_insn_flags & HAS_MASK_P)
257 machine_mode mode = insn_data[(int) icode].operand[m_opno].mode;
258 gcc_assert (mode != VOIDmode);
259 add_input_operand (ops[opno++], mode);
262 /* Add merge operand. */
263 if (m_insn_flags & USE_VUNDEF_MERGE_P)
264 /* Same as dest operand. */
265 add_vundef_operand (GET_MODE (ops[0]));
266 else if (m_insn_flags & HAS_MERGE_P)
268 machine_mode mode = insn_data[(int) icode].operand[m_opno].mode;
269 gcc_assert (mode != VOIDmode);
270 add_input_operand (ops[opno++], mode);
273 if (m_insn_flags & NULLARY_OP_P)
274 num_ops = 0;
275 else if (m_insn_flags & UNARY_OP_P)
276 num_ops = 1;
277 else if (m_insn_flags & BINARY_OP_P)
278 num_ops = 2;
279 else if (m_insn_flags & TERNARY_OP_P)
280 num_ops = 3;
281 else
282 gcc_unreachable ();
284 /* Add the remaining operands. */
285 for (; num_ops; num_ops--, opno++)
287 any_mem_p |= MEM_P (ops[opno]);
288 machine_mode mode = insn_data[(int) icode].operand[m_opno].mode;
289 /* create_input_operand doesn't allow VOIDmode.
290 According to vector.md, some patterns do not specify an explicit
291 machine mode for an operand. Such operands are
292 always Pmode. */
293 if (mode == VOIDmode)
294 mode = Pmode;
296 /* Assert early that the modes match, since maybe_legitimize_operand
297 will check this later anyway. */
298 machine_mode required_mode = GET_MODE (ops[opno]);
299 if (required_mode != VOIDmode && required_mode != mode)
300 internal_error ("expected mode %s for operand %d of "
301 "insn %s but got mode %s.\n",
302 GET_MODE_NAME (mode),
303 opno,
304 insn_data[(int) icode].name,
305 GET_MODE_NAME (required_mode));
307 add_input_operand (ops[opno], mode);
310 /* Add vl operand. */
311 rtx len = m_vl_op;
312 bool vls_p = false;
313 if (m_vlmax_p)
315 if (riscv_v_ext_vls_mode_p (vtype_mode))
317 /* VLS modes always set VL with
318 "vsetvli zero, rs1" or "vsetivli zero, imm". */
319 poly_uint64 nunits = GET_MODE_NUNITS (vtype_mode);
320 len = gen_int_mode (nunits, Pmode);
321 vls_p = true;
323 else if (can_create_pseudo_p ())
325 len = gen_reg_rtx (Pmode);
326 emit_vlmax_vsetvl (vtype_mode, len);
330 gcc_assert (len != NULL_RTX);
331 add_input_operand (len, Pmode);
333 /* Add tail and mask policy operands. */
334 add_policy_operand ();
336 /* Add avl_type operand. */
337 add_avl_type_operand (
338 vls_p ? avl_type::VLS
339 : (m_vlmax_p ? avl_type::VLMAX : avl_type::NONVLMAX));
341 /* Add rounding mode operand. */
342 if (m_insn_flags & FRM_DYN_P)
343 add_rounding_mode_operand (FRM_DYN);
344 else if (m_insn_flags & FRM_RUP_P)
345 add_rounding_mode_operand (FRM_RUP);
346 else if (m_insn_flags & FRM_RDN_P)
347 add_rounding_mode_operand (FRM_RDN);
348 else if (m_insn_flags & FRM_RMM_P)
349 add_rounding_mode_operand (FRM_RMM);
350 else if (m_insn_flags & FRM_RNE_P)
351 add_rounding_mode_operand (FRM_RNE);
352 else if (m_insn_flags & VXRM_RNU_P)
353 add_rounding_mode_operand (VXRM_RNU);
354 else if (m_insn_flags & VXRM_RDN_P)
355 add_rounding_mode_operand (VXRM_RDN);
358 if (insn_data[(int) icode].n_operands != m_opno)
359 internal_error ("invalid number of operands for insn %s, "
360 "expected %d but got %d.\n",
361 insn_data[(int) icode].name,
362 insn_data[(int) icode].n_operands, m_opno);
364 expand (icode, any_mem_p);
367 void expand (enum insn_code icode, bool temporary_volatile_p = false)
369 if (temporary_volatile_p)
371 temporary_volatile_ok v (true);
372 expand_insn (icode, m_opno, m_ops);
374 else
375 expand_insn (icode, m_opno, m_ops);
378 private:
379 unsigned m_insn_flags;
380 int m_opno;
381 bool m_vlmax_p;
382 rtx m_vl_op;
383 expand_operand m_ops[MAX_OPERANDS];
386 /* Emit an RVV insn with a vector length that equals the number of units of the
387 vector mode. For VLA modes this corresponds to VLMAX.
389 Unless the vector length can be encoded in the vsetivli instruction, this
390 function must only be used as long as we can create pseudo registers. This is
391 because it will set a pseudo register to VLMAX using vsetvl and use it as the
392 definition of the vector length. */
393 void
394 emit_vlmax_insn (unsigned icode, unsigned insn_flags, rtx *ops)
396 insn_expander<RVV_INSN_OPERANDS_MAX> e (insn_flags, true);
397 gcc_assert (can_create_pseudo_p () || imm_avl_p (e.get_vtype_mode (ops)));
399 e.emit_insn ((enum insn_code) icode, ops);
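/* A typical use emits a whole-vector operation with VLMAX length, e.g.
   (as done in expand_const_vector below):

     rtx ops[] = {result, src};
     emit_vlmax_insn (code_for_pred_mov (mode), UNARY_OP, ops);

   which, for a valid vector immediate SRC, ends up as a vmv.v.i style
   move covering all elements.  */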
402 /* Like emit_vlmax_insn but must only be used when we cannot create pseudo
403 registers anymore. This function, however, takes a predefined vector length
404 from the value in VL. */
405 void
406 emit_vlmax_insn_lra (unsigned icode, unsigned insn_flags, rtx *ops, rtx vl)
408 gcc_assert (!can_create_pseudo_p ());
409 machine_mode mode = GET_MODE (ops[0]);
411 if (imm_avl_p (mode))
413 /* Even though VL is already a hard register allocated by RA (we are
414 post-RA now), we still benefit from emitting "vsetivli zero, imm"
415 instead of "vsetvli VL, zero" because it gives post-RA instruction
416 scheduling more freedom. */
417 insn_expander<RVV_INSN_OPERANDS_MAX> e (insn_flags, false);
418 e.set_vl (gen_int_mode (GET_MODE_NUNITS (mode), Pmode));
419 e.emit_insn ((enum insn_code) icode, ops);
421 else
423 insn_expander<RVV_INSN_OPERANDS_MAX> e (insn_flags, true);
424 e.set_vl (vl);
425 e.emit_insn ((enum insn_code) icode, ops);
429 /* Emit an RVV insn with a predefined vector length. Contrary to
430 emit_vlmax_insn the instruction's vector length is not deduced from its mode
431 but taken from the value in VL. */
432 void
433 emit_nonvlmax_insn (unsigned icode, unsigned insn_flags, rtx *ops, rtx vl)
435 insn_expander<RVV_INSN_OPERANDS_MAX> e (insn_flags, false);
436 e.set_vl (vl);
437 e.emit_insn ((enum insn_code) icode, ops);
440 /* Return true if the vector can be duplicated from a super element that is
441 the fusion of consecutive elements, e.g.
443 v = { a, b, a, b }, super element = ab, v = { ab, ab }. */
444 bool
445 rvv_builder::can_duplicate_repeating_sequence_p ()
447 poly_uint64 new_size = exact_div (full_nelts (), npatterns ());
448 unsigned int new_inner_size = m_inner_bits_size * npatterns ();
449 if (m_inner_mode == Pmode
450 || !int_mode_for_size (new_inner_size, 0).exists (&m_new_inner_mode)
451 || GET_MODE_SIZE (m_new_inner_mode) > UNITS_PER_WORD
452 || !get_vector_mode (m_new_inner_mode, new_size).exists (&m_new_mode))
453 return false;
454 return repeating_sequence_p (0, encoded_nelts (), npatterns ());
457 /* Return true if the vector is a simple sequence with one pattern and all
458 elements the same. */
459 bool
460 rvv_builder::is_repeating_sequence ()
462 if (npatterns () > 1)
463 return false;
464 return repeating_sequence_p (0, encoded_nelts (), 1);
467 /* Return true if this is a repeating sequence for which the
468 merge approach gives better codegen than the default
469 approach (slide1down).
471 Sequence A:
472 {a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b}
474 nelts = 16
475 npatterns = 2
477 for merging a we need mask 101010....
478 for merging b we need mask 010101....
480 For each element in the pattern, we need to build a mask in a scalar register.
481 Mostly we need 3 instructions (i.e. COST = 3): 2 scalar
482 instructions and 1 scalar move to the v0 register. Finally we need a vector
483 merge to merge them.
485 lui a5, #imm
486 addi a5, a5, #imm
487 vmv.s.x v0, a5
488 vmerge.vxm v9, v9, a1, v0
490 So the overall (roughly) COST of Sequence A = (3 + 1) * npatterns = 8.
491 If we use slide1down, the COST = nelts = 16 > 8 (COST of merge).
492 So return true in this case as it is profitable.
494 Sequence B:
495 {a, b, c, d, e, f, g, h, a, b, c, d, e, f, g, h}
497 nelts = 16
498 npatterns = 8
500 COST of merge approach = (3 + 1) * npatterns = 24
501 COST of slide1down approach = nelts = 16
502 Return false in this case as the merge approach is NOT profitable. */
504 bool
505 rvv_builder::repeating_sequence_use_merge_profitable_p ()
507 if (inner_bytes_size () > UNITS_PER_WORD)
508 return false;
510 unsigned int nelts = full_nelts ().to_constant ();
512 if (!repeating_sequence_p (0, encoded_nelts (), npatterns ()))
513 return false;
515 unsigned int merge_cost = 1;
516 unsigned int build_merge_mask_cost = 3;
517 unsigned int slide1down_cost = nelts;
519 return (build_merge_mask_cost + merge_cost) * npatterns () < slide1down_cost;
522 /* Return true if it's worthwhile to use a slideup to combine 2 vectors. */
523 bool
524 rvv_builder::combine_sequence_use_slideup_profitable_p ()
526 int nelts = full_nelts ().to_constant ();
527 int leading_ndups = this->count_dups (0, nelts - 1, 1);
528 int trailing_ndups = this->count_dups (nelts - 1, -1, -1);
530 /* ??? The current heuristic is that we combine 2 vectors
531 by slideup when:
532 1. # of leading identical elements equals # of trailing identical elements.
533 2. Both of the above equal nelts / 2.
534 Otherwise, it is not profitable. */
535 return leading_ndups == trailing_ndups && trailing_ndups == nelts / 2;
538 /* Return true if it's worthwhile to use a merge to combine a vector with a scalar. */
539 bool
540 rvv_builder::combine_sequence_use_merge_profitable_p ()
542 int nelts = full_nelts ().to_constant ();
543 int leading_ndups = this->count_dups (0, nelts - 1, 1);
544 int trailing_ndups = this->count_dups (nelts - 1, -1, -1);
545 int nregs = riscv_get_v_regno_alignment (int_mode ());
547 if (leading_ndups + trailing_ndups != nelts)
548 return false;
550 /* If the number of leading elements is > 255, which exceeds the maximum
551 value of QImode, we will need to use HImode. */
552 machine_mode mode;
553 if (leading_ndups > 255 || nregs > 2)
555 if (!get_vector_mode (HImode, nelts).exists (&mode))
556 return false;
557 /* We will need one more AVL/VL toggling vsetvl instruction. */
558 return leading_ndups > 4 && trailing_ndups > 4;
561 /* { a, a, a, b, b, ... , b } and { b, b, b, a, a, ... , a }
562 consume 3 slide instructions. */
563 return leading_ndups > 3 && trailing_ndups > 3;
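/* For example, { a, a, a, a, a, b, b, b, b, b } (nelts = 10) has
   leading_ndups = trailing_ndups = 5; they sum to nelts and exceed both
   thresholds above, so the sequence is normally considered profitable to
   combine with a merge.  */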
566 /* Merge the repeating sequence into a single element and return the RTX. */
567 rtx
568 rvv_builder::get_merged_repeating_sequence ()
570 scalar_int_mode mode = Pmode;
571 rtx target = gen_reg_rtx (mode);
572 emit_move_insn (target, const0_rtx);
573 rtx imm = gen_int_mode ((1ULL << m_inner_bits_size) - 1, mode);
574 /* { a, b, a, b }: Generate duplicate element = b << bits | a. */
575 for (unsigned int i = 0; i < npatterns (); i++)
577 unsigned int loc = m_inner_bits_size * i;
578 rtx shift = gen_int_mode (loc, mode);
579 rtx ele = gen_lowpart (mode, elt (i));
580 rtx tmp = expand_simple_binop (mode, AND, ele, imm, NULL_RTX, false,
581 OPTAB_DIRECT);
582 rtx tmp2 = expand_simple_binop (mode, ASHIFT, tmp, shift, NULL_RTX, false,
583 OPTAB_DIRECT);
584 rtx tmp3 = expand_simple_binop (mode, IOR, tmp2, target, NULL_RTX, false,
585 OPTAB_DIRECT);
586 emit_move_insn (target, tmp3);
588 if (GET_MODE_SIZE (m_new_inner_mode) < UNITS_PER_WORD)
589 return gen_lowpart (m_new_inner_mode, target);
590 return target;
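/* For example, with m_inner_bits_size = 8 and npatterns () = 4, the elements
   { 1, 2, 3, 4 } are merged into the single scalar 0x04030201 (element i is
   masked to 8 bits and shifted left by 8 * i), which the caller can then
   broadcast as one wider element.  */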
593 /* Get the mask for merge approach.
595 Consider the following case:
596 {a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b}
597 To merge "a", the mask should be 1010....
598 To merge "b", the mask should be 0101....
599 */
600 rtx
601 rvv_builder::get_merge_scalar_mask (unsigned int index_in_pattern,
602 machine_mode inner_mode) const
604 unsigned HOST_WIDE_INT mask = 0;
605 unsigned HOST_WIDE_INT base_mask = (1ULL << index_in_pattern);
606 /* Here we construct a mask pattern that will later be broadcast
607 to a vector register. The maximum broadcast size for vmv.v.x/vmv.s.x
608 is determined by the length of a vector element (ELEN) and not by
609 XLEN so make sure we do not exceed it. One example is -march=zve32*
610 which mandates ELEN == 32 but can be combined with -march=rv64
611 with XLEN == 64. */
612 unsigned int elen = TARGET_VECTOR_ELEN_64 ? 64 : 32;
614 gcc_assert (elen % npatterns () == 0);
616 int limit = elen / npatterns ();
618 for (int i = 0; i < limit; i++)
619 mask |= base_mask << (i * npatterns ());
621 return gen_int_mode (mask, inner_mode);
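/* For example, with npatterns () = 2 and ELEN = 64, index_in_pattern = 0
   yields the mask 0x5555555555555555 and index_in_pattern = 1 yields
   0xaaaaaaaaaaaaaaaa, selecting the even and odd element positions
   respectively.  */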
624 /* Return true if the variable-length vector is single-step.
625 Single-step means the steps of all patterns in NPATTERNS are equal.
626 Consider the following cases:
628 CASE 1: NPATTERNS = 2, NELTS_PER_PATTERN = 3.
629 { 0, 2, 2, 4, 4, 6, ... }
630 First pattern: step1 = 2 - 0 = 2
631 step2 = 4 - 2 = 2
632 Second pattern: step1 = 4 - 2 = 2
633 step2 = 6 - 4 = 2
634 Since all steps of NPATTERNS are equal step = 2.
635 Return true in this case.
637 CASE 2: NPATTERNS = 2, NELTS_PER_PATTERN = 3.
638 { 0, 1, 2, 4, 4, 7, ... }
639 First pattern: step1 = 2 - 0 = 2
640 step2 = 4 - 2 = 2
641 Second pattern: step1 = 4 - 1 = 3
642 step2 = 7 - 4 = 3
643 Since not all steps are equal, return false. */
644 bool
645 rvv_builder::single_step_npatterns_p () const
647 if (nelts_per_pattern () != 3)
648 return false;
650 poly_int64 step
651 = rtx_to_poly_int64 (elt (npatterns ())) - rtx_to_poly_int64 (elt (0));
652 for (unsigned int i = 0; i < npatterns (); i++)
654 poly_int64 ele0 = rtx_to_poly_int64 (elt (i));
655 poly_int64 ele1 = rtx_to_poly_int64 (elt (npatterns () + i));
656 poly_int64 ele2 = rtx_to_poly_int64 (elt (npatterns () * 2 + i));
657 poly_int64 diff1 = ele1 - ele0;
658 poly_int64 diff2 = ele2 - ele1;
659 if (maybe_ne (step, diff1) || maybe_ne (step, diff2))
660 return false;
662 return true;
665 /* Return true if the diff between const vector and vid sequence
666 is repeated. For example, consider the cases below.
667 The diff means CONST VECTOR - VID.
668 CASE 1:
669 CONST VECTOR: {3, 2, 1, 0, 7, 6, 5, 4, ... }
670 VID : {0, 1, 2, 3, 4, 5, 6, 7, ... }
671 DIFF(MINUS) : {3, 1,-1,-3, 3, 1,-1,-3, ... }
672 The diff sequence {3, 1,-1,-3} is repeated within the pattern, so we
673 return TRUE for case 1.
675 CASE 2:
676 CONST VECTOR: {-4, 4,-3, 5,-2, 6,-1, 7, ...}
677 VID : { 0, 1, 2, 3, 4, 5, 6, 7, ... }
678 DIFF(MINUS) : {-4, 3,-5, 2,-6, 1,-7, 0, ... }
679 The diff sequence {-4, 3} is not repeated within the pattern, so we
680 return FALSE for case 2. */
681 bool
682 rvv_builder::npatterns_vid_diff_repeated_p () const
684 if (nelts_per_pattern () != 3)
685 return false;
686 else if (npatterns () == 0)
687 return false;
689 for (unsigned i = 0; i < npatterns (); i++)
691 poly_int64 diff_0 = rtx_to_poly_int64 (elt (i)) - i;
692 poly_int64 diff_1
693 = rtx_to_poly_int64 (elt (npatterns () + i)) - npatterns () - i;
695 if (maybe_ne (diff_0, diff_1))
696 return false;
699 return true;
702 /* Return true if the permutation consists of two
703 interleaved patterns with a constant step each.
704 TODO: We currently only support NPATTERNS = 2. */
705 bool
706 rvv_builder::interleaved_stepped_npatterns_p () const
708 if (npatterns () != 2 || nelts_per_pattern () != 3)
709 return false;
710 for (unsigned int i = 0; i < npatterns (); i++)
712 poly_int64 ele0 = rtx_to_poly_int64 (elt (i));
713 poly_int64 ele1 = rtx_to_poly_int64 (elt (npatterns () + i));
714 poly_int64 ele2 = rtx_to_poly_int64 (elt (npatterns () * 2 + i));
715 poly_int64 diff1 = ele1 - ele0;
716 poly_int64 diff2 = ele2 - ele1;
717 if (maybe_ne (diff1, diff2))
718 return false;
720 return true;
723 /* Return true if all elements of NPATTERNS are equal.
725 E.g. NPATTERNS = 4:
726 { 2, 2, 2, 2, 4, 4, 4, 4, 8, 8, 8, 8, 16, 16, 16, 16, ... }
727 E.g. NPATTERNS = 8:
728 { 2, 2, 2, 2, 2, 2, 2, 2, 8, 8, 8, 8, 8, 8, 8, 8, ... }
729 We only check whether ele[0] ~ ele[NPATTERNS - 1] are the same.
730 We don't need to check elements[n] with n >= NPATTERNS since
731 they don't belong to the same pattern. */
733 bool
734 rvv_builder::npatterns_all_equal_p () const
736 poly_int64 ele0 = rtx_to_poly_int64 (elt (0));
737 for (unsigned int i = 1; i < npatterns (); i++)
739 poly_int64 ele = rtx_to_poly_int64 (elt (i));
740 if (!known_eq (ele, ele0))
741 return false;
743 return true;
746 static unsigned
747 get_sew (machine_mode mode)
749 unsigned int sew = GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
750 ? 8
751 : GET_MODE_BITSIZE (GET_MODE_INNER (mode));
752 return sew;
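/* E.g. a vector of 32-bit elements has SEW = 32, while mask (bool) vector
   modes are treated as SEW = 8 here.  */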
755 /* Return true if X is a const_vector whose elements are all the same and
756 lie in the range [MINVAL, MAXVAL]. */
757 bool
758 const_vec_all_same_in_range_p (rtx x, HOST_WIDE_INT minval,
759 HOST_WIDE_INT maxval)
761 rtx elt;
762 return (const_vec_duplicate_p (x, &elt) && CONST_INT_P (elt)
763 && IN_RANGE (INTVAL (elt), minval, maxval));
766 /* Return true if VEC is a constant in which every element is in the range
767 [MINVAL, MAXVAL]. The elements do not need to have the same value.
769 This function also exists in aarch64; we may unify it in the middle-end in the
770 future. */
772 static bool
773 const_vec_all_in_range_p (rtx vec, poly_int64 minval, poly_int64 maxval)
775 if (!CONST_VECTOR_P (vec)
776 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
777 return false;
779 int nunits;
780 if (!CONST_VECTOR_STEPPED_P (vec))
781 nunits = const_vector_encoded_nelts (vec);
782 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
783 return false;
785 for (int i = 0; i < nunits; i++)
787 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
788 poly_int64 value;
789 if (!poly_int_rtx_p (vec_elem, &value)
790 || maybe_lt (value, minval)
791 || maybe_gt (value, maxval))
792 return false;
794 return true;
797 /* Return true if the vector's elements are all duplicates that are either
798 an integer in the range -16 ~ 15 or 0.0 floating-point. */
800 bool
801 valid_vec_immediate_p (rtx x)
803 return (satisfies_constraint_vi (x) || satisfies_constraint_Wc0 (x));
806 /* Return a const vector of VAL. The VAL can be either const_int or
807 const_poly_int. */
809 static rtx
810 gen_const_vector_dup (machine_mode mode, poly_int64 val)
812 scalar_mode smode = GET_MODE_INNER (mode);
813 rtx c = gen_int_mode (val, smode);
814 if (!val.is_constant () && GET_MODE_SIZE (smode) > GET_MODE_SIZE (Pmode))
816 /* When VAL is const_poly_int value, we need to explicitly broadcast
817 it into a vector using RVV broadcast instruction. */
818 return expand_vector_broadcast (mode, c);
820 return gen_const_vec_duplicate (mode, c);
823 /* Emit a vlmax vsetvl instruction. This should only be used when
824 optimization is disabled or after vsetvl insertion pass. */
825 void
826 emit_hard_vlmax_vsetvl (machine_mode vmode, rtx vl)
828 unsigned int sew = get_sew (vmode);
829 emit_insn (gen_vsetvl (Pmode, vl, RVV_VLMAX, gen_int_mode (sew, Pmode),
830 gen_int_mode (get_vlmul (vmode), Pmode), const0_rtx,
831 const0_rtx));
834 void
835 emit_vlmax_vsetvl (machine_mode vmode, rtx vl)
837 unsigned int sew = get_sew (vmode);
838 enum vlmul_type vlmul = get_vlmul (vmode);
839 unsigned int ratio = calculate_ratio (sew, vlmul);
841 if (!optimize)
842 emit_hard_vlmax_vsetvl (vmode, vl);
843 else
844 emit_insn (gen_vlmax_avl (Pmode, vl, gen_int_mode (ratio, Pmode)));
847 /* Calculate SEW/LMUL ratio. */
848 unsigned int
849 calculate_ratio (unsigned int sew, enum vlmul_type vlmul)
851 unsigned int ratio;
852 switch (vlmul)
854 case LMUL_1:
855 ratio = sew;
856 break;
857 case LMUL_2:
858 ratio = sew / 2;
859 break;
860 case LMUL_4:
861 ratio = sew / 4;
862 break;
863 case LMUL_8:
864 ratio = sew / 8;
865 break;
866 case LMUL_F8:
867 ratio = sew * 8;
868 break;
869 case LMUL_F4:
870 ratio = sew * 4;
871 break;
872 case LMUL_F2:
873 ratio = sew * 2;
874 break;
875 default:
876 gcc_unreachable ();
878 return ratio;
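/* For example, SEW = 16 with LMUL_2 gives a ratio of 16 / 2 = 8, while
   SEW = 32 with LMUL_F2 (LMUL = 1/2) gives a ratio of 32 * 2 = 64.  */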
881 /* SCALABLE means that the vector-length is agnostic (run-time invariant and
882 compile-time unknown). ZVL means that the vector-length is specific
883 (compile-time known from -march via zvl*b). Both SCALABLE and ZVL do
884 auto-vectorization using the VLMAX vsetvl configuration. */
885 static bool
886 autovec_use_vlmax_p (void)
888 return rvv_vector_bits == RVV_VECTOR_BITS_SCALABLE
889 || rvv_vector_bits == RVV_VECTOR_BITS_ZVL;
892 /* This function emits VLMAX vrgather instruction. Emit vrgather.vx/vi when sel
893 is a const duplicate vector. Otherwise, emit vrgather.vv. */
894 static void
895 emit_vlmax_gather_insn (rtx target, rtx op, rtx sel)
897 rtx elt;
898 insn_code icode;
899 machine_mode data_mode = GET_MODE (target);
900 machine_mode sel_mode = GET_MODE (sel);
901 if (const_vec_duplicate_p (sel, &elt))
903 icode = code_for_pred_gather_scalar (data_mode);
904 sel = elt;
906 else if (maybe_ne (GET_MODE_SIZE (data_mode), GET_MODE_SIZE (sel_mode)))
907 icode = code_for_pred_gatherei16 (data_mode);
908 else
909 icode = code_for_pred_gather (data_mode);
910 rtx ops[] = {target, op, sel};
911 emit_vlmax_insn (icode, BINARY_OP, ops);
914 static void
915 emit_vlmax_masked_gather_mu_insn (rtx target, rtx op, rtx sel, rtx mask)
917 rtx elt;
918 insn_code icode;
919 machine_mode data_mode = GET_MODE (target);
920 machine_mode sel_mode = GET_MODE (sel);
921 if (const_vec_duplicate_p (sel, &elt))
923 icode = code_for_pred_gather_scalar (data_mode);
924 sel = elt;
926 else if (maybe_ne (GET_MODE_SIZE (data_mode), GET_MODE_SIZE (sel_mode)))
927 icode = code_for_pred_gatherei16 (data_mode);
928 else
929 icode = code_for_pred_gather (data_mode);
930 rtx ops[] = {target, mask, target, op, sel};
931 emit_vlmax_insn (icode, BINARY_OP_TAMU, ops);
934 /* According to RVV ISA spec (16.5.1. Synthesizing vdecompress):
935 https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc
937 There is no inverse vdecompress provided, as this operation can be readily
938 synthesized using iota and a masked vrgather:
940 Desired functionality of 'vdecompress'
941 7 6 5 4 3 2 1 0 # vid
943 e d c b a # packed vector of 5 elements
944 1 0 0 1 1 1 0 1 # mask vector of 8 elements
945 p q r s t u v w # destination register before vdecompress
947 e q r d c b v a # result of vdecompress
948 # v0 holds mask
949 # v1 holds packed data
950 # v11 holds input expanded vector and result
951 viota.m v10, v0 # Calc iota from mask in v0
952 vrgather.vv v11, v1, v10, v0.t # Expand into destination
953 p q r s t u v w # v11 destination register
954 e d c b a # v1 source vector
955 1 0 0 1 1 1 0 1 # v0 mask vector
957 4 4 4 3 2 1 1 0 # v10 result of viota.m
958 e q r d c b v a # v11 destination after vrgather using viota.m under mask */
960 static void
961 emit_vlmax_decompress_insn (rtx target, rtx op0, rtx op1, rtx mask)
963 machine_mode data_mode = GET_MODE (target);
964 machine_mode sel_mode = related_int_vector_mode (data_mode).require ();
965 if (GET_MODE_INNER (data_mode) == QImode)
966 sel_mode = get_vector_mode (HImode, GET_MODE_NUNITS (data_mode)).require ();
968 rtx sel = gen_reg_rtx (sel_mode);
969 rtx iota_ops[] = {sel, mask};
970 emit_vlmax_insn (code_for_pred_iota (sel_mode), UNARY_OP, iota_ops);
971 emit_vlmax_gather_insn (target, op0, sel);
972 emit_vlmax_masked_gather_mu_insn (target, op1, sel, mask);
975 /* Return the machine mode used to duplicate the repeating sequence for mask generation in the merge approach. */
977 static machine_mode
978 get_repeating_sequence_dup_machine_mode (const rvv_builder &builder,
979 machine_mode mask_bit_mode)
981 unsigned mask_precision = GET_MODE_PRECISION (mask_bit_mode).to_constant ();
982 unsigned mask_scalar_size = mask_precision > builder.inner_bits_size ()
983 ? builder.inner_bits_size () : mask_precision;
985 scalar_mode inner_mode;
986 unsigned minimal_bits_size;
988 switch (mask_scalar_size)
990 case 8:
991 inner_mode = QImode;
992 minimal_bits_size = TARGET_MIN_VLEN / 8; /* AKA RVVMF8. */
993 break;
994 case 16:
995 inner_mode = HImode;
996 minimal_bits_size = TARGET_MIN_VLEN / 4; /* AKA RVVMF4. */
997 break;
998 case 32:
999 inner_mode = SImode;
1000 minimal_bits_size = TARGET_MIN_VLEN / 2; /* AKA RVVMF2. */
1001 break;
1002 case 64:
1003 inner_mode = DImode;
1004 minimal_bits_size = TARGET_MIN_VLEN / 1; /* AKA RVVM1. */
1005 break;
1006 default:
1007 gcc_unreachable ();
1008 break;
1011 gcc_assert (mask_precision % mask_scalar_size == 0);
1013 uint64_t dup_nunit = mask_precision > mask_scalar_size
1014 ? mask_precision / mask_scalar_size : minimal_bits_size / mask_scalar_size;
1016 return get_vector_mode (inner_mode, dup_nunit).require ();
1019 /* Expand a series const vector. If VID is NULL_RTX, we use a vid.v
1020 instruction to generate the sequence for VID:
1022 VID = { 0, 1, 2, 3, ... }
1024 Otherwise, we use the VID argument directly. */
1026 void
1027 expand_vec_series (rtx dest, rtx base, rtx step, rtx vid)
1029 machine_mode mode = GET_MODE (dest);
1030 poly_int64 nunits_m1 = GET_MODE_NUNITS (mode) - 1;
1031 poly_int64 value;
1032 rtx result = register_operand (dest, mode) ? dest : gen_reg_rtx (mode);
1034 /* VECT_IV = BASE + I * STEP. */
1036 /* Step 1: Generate I = { 0, 1, 2, ... } by vid.v. */
1037 bool reverse_p = !vid && rtx_equal_p (step, constm1_rtx)
1038 && poly_int_rtx_p (base, &value)
1039 && known_eq (nunits_m1, value);
1040 if (!vid)
1042 vid = gen_reg_rtx (mode);
1043 rtx op[] = {vid};
1044 emit_vlmax_insn (code_for_pred_series (mode), NULLARY_OP, op);
1047 rtx step_adj;
1048 if (reverse_p)
1050 /* Special case:
1051 {nunits - 1, nunits - 2, ... , 0}.
1052 nunits can be either const_int or const_poly_int.
1054 Code sequence:
1055 vid.v v
1056 vrsub nunits - 1, v. */
1057 rtx ops[]
1058 = {result, vid, gen_int_mode (nunits_m1, GET_MODE_INNER (mode))};
1059 insn_code icode = code_for_pred_sub_reverse_scalar (mode);
1060 emit_vlmax_insn (icode, BINARY_OP, ops);
1062 else
1064 /* Step 2: Generate I * STEP.
1065 - If STEP is 1, we don't emit any instructions.
1066 - If STEP is a power of 2, we use vsll.vi/vsll.vx.
1067 - If STEP is not a power of 2, we use vmul.vx. */
1068 if (rtx_equal_p (step, const1_rtx))
1069 step_adj = vid;
1070 else
1072 step_adj = gen_reg_rtx (mode);
1073 if (CONST_INT_P (step) && pow2p_hwi (INTVAL (step)))
1075 /* Emit logical left shift operation. */
1076 int shift = exact_log2 (INTVAL (step));
1077 rtx shift_amount = gen_int_mode (shift, Pmode);
1078 insn_code icode = code_for_pred_scalar (ASHIFT, mode);
1079 rtx ops[] = {step_adj, vid, shift_amount};
1080 emit_vlmax_insn (icode, BINARY_OP, ops);
1082 else
1084 insn_code icode = code_for_pred_scalar (MULT, mode);
1085 rtx ops[] = {step_adj, vid, step};
1086 emit_vlmax_insn (icode, BINARY_OP, ops);
1090 /* Step 3: Generate BASE + I * STEP.
1091 - If BASE is 0, use the result of vid.
1092 - If BASE is not 0, we use vadd.vx/vadd.vi. */
1093 if (rtx_equal_p (base, const0_rtx))
1094 emit_move_insn (result, step_adj);
1095 else
1097 insn_code icode = code_for_pred_scalar (PLUS, mode);
1098 rtx ops[] = {result, step_adj, base};
1099 emit_vlmax_insn (icode, BINARY_OP, ops);
1103 if (result != dest)
1104 emit_move_insn (dest, result);
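/* For example, BASE = 2 and STEP = 3 expands roughly to:

     vid.v    v        ; v = { 0, 1, 2, 3, ... }
     vmul.vx  v, v, 3  ; v = { 0, 3, 6, 9, ... }
     vadd.vi  v, v, 2  ; v = { 2, 5, 8, 11, ... }

   following the non-power-of-2 STEP and non-zero BASE paths above.  */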
1107 /* Subroutine of riscv_vector_expand_vector_init.
1108 Works as follows:
1109 (a) Initialize TARGET by broadcasting element 0 of BUILDER.
1110 (b) Skip leading elements from BUILDER, which are the same as
1111 element 0.
1112 (c) Insert the remaining elements in TARGET in order using vslide1down. */
1114 static void
1115 expand_vector_init_insert_elems (rtx target, const rvv_builder &builder,
1116 int nelts_reqd)
1118 machine_mode mode = GET_MODE (target);
1119 rtx dup = expand_vector_broadcast (mode, builder.elt (0));
1120 emit_move_insn (target, dup);
1121 int ndups = builder.count_dups (0, nelts_reqd - 1, 1);
1122 for (int i = ndups; i < nelts_reqd; i++)
1124 unsigned int unspec
1125 = FLOAT_MODE_P (mode) ? UNSPEC_VFSLIDE1DOWN : UNSPEC_VSLIDE1DOWN;
1126 insn_code icode = code_for_pred_slide (unspec, mode);
1127 rtx ops[] = {target, target, builder.elt (i)};
1128 emit_vlmax_insn (icode, BINARY_OP, ops);
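/* For example, for { a, a, b, c } we broadcast a (covering the two leading
   duplicates) and then vslide1down b and c into the vector in order.  */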
1132 /* Subroutine of expand_vec_init to handle case
1133 when all trailing elements of builder are same.
1134 This works as follows:
1135 (a) Use the expand_insn interface to broadcast the last vector element into TARGET.
1136 (b) Insert the remaining elements into TARGET using vslide1up.
1138 ??? The heuristic used is to do the above if the number of identical trailing
1139 elements is greater than leading_ndups, loosely based on the
1140 heuristic from mostly_zeros_p. May need fine-tuning. */
1142 static bool
1143 expand_vector_init_trailing_same_elem (rtx target,
1144 const rtx_vector_builder &builder,
1145 int nelts_reqd)
1147 int leading_ndups = builder.count_dups (0, nelts_reqd - 1, 1);
1148 int trailing_ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
1149 machine_mode mode = GET_MODE (target);
1151 if (trailing_ndups > leading_ndups)
1153 rtx dup = expand_vector_broadcast (mode, builder.elt (nelts_reqd - 1));
1154 for (int i = nelts_reqd - trailing_ndups - 1; i >= 0; i--)
1156 unsigned int unspec
1157 = FLOAT_MODE_P (mode) ? UNSPEC_VFSLIDE1UP : UNSPEC_VSLIDE1UP;
1158 insn_code icode = code_for_pred_slide (unspec, mode);
1159 rtx tmp = gen_reg_rtx (mode);
1160 rtx ops[] = {tmp, dup, builder.elt (i)};
1161 emit_vlmax_insn (icode, BINARY_OP, ops);
1162 /* slide1up needs the source and dest to be different REGs. */
1163 dup = tmp;
1166 emit_move_insn (target, dup);
1167 return true;
1170 return false;
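/* For example, for { a, b, c, c, c, c } the trailing run of c (4 elements)
   is longer than the leading run (1), so we broadcast c and then vslide1up
   b and finally a into the vector.  */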
1173 static void
1174 expand_const_vector (rtx target, rtx src)
1176 machine_mode mode = GET_MODE (target);
1177 rtx result = register_operand (target, mode) ? target : gen_reg_rtx (mode);
1178 rtx elt;
1179 if (const_vec_duplicate_p (src, &elt))
1181 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
1183 gcc_assert (rtx_equal_p (elt, const0_rtx)
1184 || rtx_equal_p (elt, const1_rtx));
1185 rtx ops[] = {result, src};
1186 emit_vlmax_insn (code_for_pred_mov (mode), UNARY_MASK_OP, ops);
1188 /* For an integer element in the range -16 ~ 15 or a 0.0 floating-point
1189 element, we use the vmv.v.i instruction. */
1190 else if (valid_vec_immediate_p (src))
1192 rtx ops[] = {result, src};
1193 emit_vlmax_insn (code_for_pred_mov (mode), UNARY_OP, ops);
1195 else
1197 /* Emit the vec_duplicate<mode> split pattern before RA so that
1198 we have better optimization opportunities in LICM,
1199 which will hoist vmv.v.x outside the loop, and in fwprop and combine,
1200 which will transform a 'vv' into a 'vx' instruction.
1202 The reason we don't emit the vec_duplicate<mode> split pattern during
1203 RA is that the split stage after RA is too late to generate an
1204 RVV instruction which needs an additional register (we can't
1205 allocate a new register after RA) for the VL operand of the vsetvl
1206 instruction (vsetvl a5, zero). */
1207 if (lra_in_progress)
1209 rtx ops[] = {result, elt};
1210 emit_vlmax_insn (code_for_pred_broadcast (mode), UNARY_OP, ops);
1212 else
1214 struct expand_operand ops[2];
1215 enum insn_code icode = optab_handler (vec_duplicate_optab, mode);
1216 gcc_assert (icode != CODE_FOR_nothing);
1217 create_output_operand (&ops[0], result, mode);
1218 create_input_operand (&ops[1], elt, GET_MODE_INNER (mode));
1219 expand_insn (icode, 2, ops);
1220 result = ops[0].value;
1224 if (result != target)
1225 emit_move_insn (target, result);
1226 return;
1229 /* Support scalable const series vector. */
1230 rtx base, step;
1231 if (const_vec_series_p (src, &base, &step))
1233 expand_vec_series (result, base, step);
1235 if (result != target)
1236 emit_move_insn (target, result);
1237 return;
1240 /* Handle variable-length vector. */
1241 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
1242 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
1243 rvv_builder builder (mode, npatterns, nelts_per_pattern);
1244 for (unsigned int i = 0; i < nelts_per_pattern; i++)
1246 for (unsigned int j = 0; j < npatterns; j++)
1247 builder.quick_push (CONST_VECTOR_ELT (src, i * npatterns + j));
1249 builder.finalize ();
1251 if (CONST_VECTOR_DUPLICATE_P (src))
1253 /* Handle the case of a repeating sequence with NELTS_PER_PATTERN = 1.
1254 E.g. NPATTERNS = 4, v = { 0, 2, 6, 7, ... }
1255 NPATTERNS = 8, v = { 0, 2, 6, 7, 19, 20, 8, 7 ... }
1256 The elements within NPATTERNS are not necessarily regular. */
1257 if (builder.can_duplicate_repeating_sequence_p ())
1259 /* We handle the case that we can find a vector container to hold
1260 element bitsize = NPATTERNS * ele_bitsize.
1262 NPATTERNS = 8, element width = 8
1263 v = { 0, 1, 2, 3, 4, 5, 6, 7, ... }
1264 In this case, we can combine NPATTERNS elements into a larger
1265 element. Use element width = 64 and broadcast a vector with
1266 all elements equal to 0x0706050403020100. */
1267 rtx ele = builder.get_merged_repeating_sequence ();
1268 rtx dup = expand_vector_broadcast (builder.new_mode (), ele);
1269 emit_move_insn (result, gen_lowpart (mode, dup));
1271 else
1273 /* We handle the case that we can't find a vector container to hold
1274 element bitsize = NPATTERNS * ele_bitsize.
1276 NPATTERNS = 8, element width = 16
1277 v = { 0, 1, 2, 3, 4, 5, 6, 7, ... }
1278 Since NPATTERNS * element width = 128, we can't find a container
1279 to hold it.
1281 In this case, we use NPATTERNS merge operations to generate such
1282 vector. */
1283 unsigned int nbits = npatterns - 1;
1285 /* Generate vid = { 0, 1, 2, 3, 4, 5, 6, 7, ... }. */
1286 rtx vid = gen_reg_rtx (builder.int_mode ());
1287 rtx op[] = {vid};
1288 emit_vlmax_insn (code_for_pred_series (builder.int_mode ()),
1289 NULLARY_OP, op);
1291 /* Generate vid_repeat = { 0, 1, ... nbits, ... } */
1292 rtx vid_repeat = gen_reg_rtx (builder.int_mode ());
1293 rtx and_ops[] = {vid_repeat, vid,
1294 gen_int_mode (nbits, builder.inner_int_mode ())};
1295 emit_vlmax_insn (code_for_pred_scalar (AND, builder.int_mode ()),
1296 BINARY_OP, and_ops);
1298 rtx tmp1 = gen_reg_rtx (builder.mode ());
1299 rtx dup_ops[] = {tmp1, builder.elt (0)};
1300 emit_vlmax_insn (code_for_pred_broadcast (builder.mode ()), UNARY_OP,
1301 dup_ops);
1302 for (unsigned int i = 1; i < builder.npatterns (); i++)
1304 /* Generate mask according to i. */
1305 rtx mask = gen_reg_rtx (builder.mask_mode ());
1306 rtx const_vec = gen_const_vector_dup (builder.int_mode (), i);
1307 expand_vec_cmp (mask, EQ, vid_repeat, const_vec);
1309 /* Merge scalar to each i. */
1310 rtx tmp2 = gen_reg_rtx (builder.mode ());
1311 rtx merge_ops[] = {tmp2, tmp1, builder.elt (i), mask};
1312 insn_code icode = code_for_pred_merge_scalar (builder.mode ());
1313 emit_vlmax_insn (icode, MERGE_OP, merge_ops);
1314 tmp1 = tmp2;
1316 emit_move_insn (result, tmp1);
1319 else if (CONST_VECTOR_STEPPED_P (src))
1321 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
1322 if (builder.single_step_npatterns_p ())
1324 /* Describe the case by choosing NPATTERNS = 4 as an example. */
1325 insn_code icode;
1327 /* Step 1: Generate vid = { 0, 1, 2, 3, 4, 5, 6, 7, ... }. */
1328 rtx vid = gen_reg_rtx (builder.mode ());
1329 rtx vid_ops[] = {vid};
1330 icode = code_for_pred_series (builder.mode ());
1331 emit_vlmax_insn (icode, NULLARY_OP, vid_ops);
1333 if (builder.npatterns_all_equal_p ())
1335 /* Generate the variable-length vector following this rule:
1336 { a, a, a + step, a + step, a + step * 2, a + step * 2, ...}
1337 E.g. { 0, 0, 8, 8, 16, 16, ... } */
1339 /* We want to create a pattern where value[idx] = floor (idx /
1340 NPATTERNS). As NPATTERNS is always a power of two we can
1341 rewrite this as = idx & -NPATTERNS. */
1342 /* Step 2: VID AND -NPATTERNS:
1343 { 0&-4, 1&-4, 2&-4, 3 &-4, 4 &-4, 5 &-4, 6 &-4, 7 &-4, ... } */
1345 rtx imm
1346 = gen_int_mode (-builder.npatterns (), builder.inner_mode ());
1347 rtx tmp1 = gen_reg_rtx (builder.mode ());
1348 rtx and_ops[] = {tmp1, vid, imm};
1349 icode = code_for_pred_scalar (AND, builder.mode ());
1350 emit_vlmax_insn (icode, BINARY_OP, and_ops);
1352 /* Step 3: Convert to step size 1. */
1353 rtx tmp2 = gen_reg_rtx (builder.mode ());
1354 /* log2 (npatterns) to get the shift amount to convert
1355 Eg. { 0, 0, 0, 0, 4, 4, ... }
1356 into { 0, 0, 0, 0, 1, 1, ... }. */
1357 HOST_WIDE_INT shift_amt = exact_log2 (builder.npatterns ());
1358 rtx shift = gen_int_mode (shift_amt, builder.inner_mode ());
1359 rtx shift_ops[] = {tmp2, tmp1, shift};
1360 icode = code_for_pred_scalar (ASHIFTRT, builder.mode ());
1361 emit_vlmax_insn (icode, BINARY_OP, shift_ops);
1363 /* Step 4: Multiply to step size n. */
1364 HOST_WIDE_INT step_size =
1365 INTVAL (builder.elt (builder.npatterns ()))
1366 - INTVAL (builder.elt (0));
1367 rtx tmp3 = gen_reg_rtx (builder.mode ());
1368 if (pow2p_hwi (step_size))
1370 /* Power of 2 can be handled with a left shift. */
1371 HOST_WIDE_INT shift = exact_log2 (step_size);
1372 rtx shift_amount = gen_int_mode (shift, Pmode);
1373 insn_code icode = code_for_pred_scalar (ASHIFT, mode);
1374 rtx ops[] = {tmp3, tmp2, shift_amount};
1375 emit_vlmax_insn (icode, BINARY_OP, ops);
1377 else
1379 rtx mult_amt = gen_int_mode (step_size, builder.inner_mode ());
1380 insn_code icode = code_for_pred_scalar (MULT, builder.mode ());
1381 rtx ops[] = {tmp3, tmp2, mult_amt};
1382 emit_vlmax_insn (icode, BINARY_OP, ops);
1385 /* Step 5: Add starting value to all elements. */
1386 HOST_WIDE_INT init_val = INTVAL (builder.elt (0));
1387 if (init_val == 0)
1388 emit_move_insn (result, tmp3);
1389 else
1391 rtx dup = gen_const_vector_dup (builder.mode (), init_val);
1392 rtx add_ops[] = {result, tmp3, dup};
1393 icode = code_for_pred (PLUS, builder.mode ());
1394 emit_vlmax_insn (icode, BINARY_OP, add_ops);
1397 else
1399 /* Generate the variable-length vector following this rule:
1400 { a, b, a + step, b + step, a + step*2, b + step*2, ... } */
1402 if (builder.npatterns_vid_diff_repeated_p ())
1404 /* Case 1: For example as below:
1405 {3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8... }
1406 We have 3 - 0 = 3 equals 7 - 4 = 3, the sequence is
1407 repeated as below after minus vid.
1408 {3, 1, -1, -3, 3, 1, -1, -3...}
1409 Then we can simplify the diff code gen to at most
1410 npatterns(). */
1411 rvv_builder v (builder.mode (), builder.npatterns (), 1);
1413 /* Step 1: Generate diff = TARGET - VID. */
1414 for (unsigned int i = 0; i < v.npatterns (); ++i)
1416 poly_int64 diff = rtx_to_poly_int64 (builder.elt (i)) - i;
1417 v.quick_push (gen_int_mode (diff, v.inner_mode ()));
1420 /* Step 2: Generate result = VID + diff. */
1421 rtx vec = v.build ();
1422 rtx add_ops[] = {result, vid, vec};
1423 emit_vlmax_insn (code_for_pred (PLUS, builder.mode ()),
1424 BINARY_OP, add_ops);
1426 else
1428 /* Case 2: For example as below:
1429 { -4, 4, -4 + 1, 4 + 1, -4 + 2, 4 + 2, -4 + 3, 4 + 3, ... } */
1431 rvv_builder v (builder.mode (), builder.npatterns (), 1);
1433 /* Step 1: Generate { a, b, a, b, ... } */
1434 for (unsigned int i = 0; i < v.npatterns (); ++i)
1435 v.quick_push (builder.elt (i));
1436 rtx new_base = v.build ();
1438 /* Step 2: Generate tmp1 = VID >> LOG2 (NPATTERNS).  */
1439 rtx shift_count
1440 = gen_int_mode (exact_log2 (builder.npatterns ()),
1441 builder.inner_mode ());
1442 rtx tmp1 = expand_simple_binop (builder.mode (), LSHIFTRT,
1443 vid, shift_count, NULL_RTX,
1444 false, OPTAB_DIRECT);
1446 /* Step 3: Generate tmp2 = tmp1 * step.  */
1447 rtx tmp2 = gen_reg_rtx (builder.mode ());
1448 rtx step
1449 = simplify_binary_operation (MINUS, builder.inner_mode (),
1450 builder.elt (v.npatterns()),
1451 builder.elt (0));
1452 expand_vec_series (tmp2, const0_rtx, step, tmp1);
1454 /* Step 4: Generate result = tmp2 + new_base.  */
1455 rtx add_ops[] = {result, tmp2, new_base};
1456 emit_vlmax_insn (code_for_pred (PLUS, builder.mode ()),
1457 BINARY_OP, add_ops);
1461 else if (builder.interleaved_stepped_npatterns_p ())
1463 rtx base1 = builder.elt (0);
1464 rtx base2 = builder.elt (1);
1465 poly_int64 step1
1466 = rtx_to_poly_int64 (builder.elt (builder.npatterns ()))
1467 - rtx_to_poly_int64 (base1);
1468 poly_int64 step2
1469 = rtx_to_poly_int64 (builder.elt (builder.npatterns () + 1))
1470 - rtx_to_poly_int64 (base2);
1472 /* For { 1, 0, 2, 0, ... , n - 1, 0 }, we can use a larger EEW
1473 integer vector mode to generate such a vector efficiently.
1475 E.g. EEW = 16, { 2, 0, 4, 0, ... }
1477 can be interpreted into:
1479 EEW = 32, { 2, 4, ... } */
1480 unsigned int new_smode_bitsize = builder.inner_bits_size () * 2;
1481 scalar_int_mode new_smode;
1482 machine_mode new_mode;
1483 poly_uint64 new_nunits
1484 = exact_div (GET_MODE_NUNITS (builder.mode ()), 2);
1485 if (int_mode_for_size (new_smode_bitsize, 0).exists (&new_smode)
1486 && get_vector_mode (new_smode, new_nunits).exists (&new_mode))
1488 rtx tmp1 = gen_reg_rtx (new_mode);
1489 base1 = gen_int_mode (rtx_to_poly_int64 (base1), new_smode);
1490 expand_vec_series (tmp1, base1, gen_int_mode (step1, new_smode));
1492 if (rtx_equal_p (base2, const0_rtx) && known_eq (step2, 0))
1493 /* { 1, 0, 2, 0, ... }. */
1494 emit_move_insn (result, gen_lowpart (mode, tmp1));
1495 else if (known_eq (step2, 0))
1497 /* { 1, 1, 2, 1, ... }. */
1498 rtx scalar = expand_simple_binop (
1499 new_smode, ASHIFT,
1500 gen_int_mode (rtx_to_poly_int64 (base2), new_smode),
1501 gen_int_mode (builder.inner_bits_size (), new_smode),
1502 NULL_RTX, false, OPTAB_DIRECT);
1503 rtx tmp2 = gen_reg_rtx (new_mode);
1504 rtx ior_ops[] = {tmp2, tmp1, scalar};
1505 emit_vlmax_insn (code_for_pred_scalar (IOR, new_mode),
1506 BINARY_OP, ior_ops);
1507 emit_move_insn (result, gen_lowpart (mode, tmp2));
1509 else
1511 /* { 1, 3, 2, 6, ... }. */
1512 rtx tmp2 = gen_reg_rtx (new_mode);
1513 base2 = gen_int_mode (rtx_to_poly_int64 (base2), new_smode);
1514 expand_vec_series (tmp2, base2,
1515 gen_int_mode (step2, new_smode));
1516 rtx shifted_tmp2 = expand_simple_binop (
1517 new_mode, ASHIFT, tmp2,
1518 gen_int_mode (builder.inner_bits_size (), Pmode), NULL_RTX,
1519 false, OPTAB_DIRECT);
1520 rtx tmp3 = gen_reg_rtx (new_mode);
1521 rtx ior_ops[] = {tmp3, tmp1, shifted_tmp2};
1522 emit_vlmax_insn (code_for_pred (IOR, new_mode), BINARY_OP,
1523 ior_ops);
1524 emit_move_insn (result, gen_lowpart (mode, tmp3));
1527 else
1529 rtx vid = gen_reg_rtx (mode);
1530 expand_vec_series (vid, const0_rtx, const1_rtx);
1531 /* Transform into { 0, 0, 1, 1, 2, 2, ... }. */
1532 rtx shifted_vid
1533 = expand_simple_binop (mode, LSHIFTRT, vid, const1_rtx,
1534 NULL_RTX, false, OPTAB_DIRECT);
1535 rtx tmp1 = gen_reg_rtx (mode);
1536 rtx tmp2 = gen_reg_rtx (mode);
1537 expand_vec_series (tmp1, base1,
1538 gen_int_mode (step1, builder.inner_mode ()),
1539 shifted_vid);
1540 expand_vec_series (tmp2, base2,
1541 gen_int_mode (step2, builder.inner_mode ()),
1542 shifted_vid);
1544 /* Transform into { 0, 1, 0, 1, 0, 1, ... }. */
1545 rtx and_vid = gen_reg_rtx (mode);
1546 rtx and_ops[] = {and_vid, vid, const1_rtx};
1547 emit_vlmax_insn (code_for_pred_scalar (AND, mode), BINARY_OP,
1548 and_ops);
1549 rtx mask = gen_reg_rtx (builder.mask_mode ());
1550 expand_vec_cmp (mask, EQ, and_vid, CONST1_RTX (mode));
1552 rtx ops[] = {result, tmp1, tmp2, mask};
1553 emit_vlmax_insn (code_for_pred_merge (mode), MERGE_OP, ops);
1556 else
1557 /* TODO: We will enable more variable-length vector in the future. */
1558 gcc_unreachable ();
1560 else
1561 gcc_unreachable ();
1563 if (result != target)
1564 emit_move_insn (target, result);
1567 /* Get the frm mode from the given CONST_INT rtx; the default mode is
1568 FRM_DYN. */
1569 enum floating_point_rounding_mode
1570 get_frm_mode (rtx operand)
1572 gcc_assert (CONST_INT_P (operand));
1574 switch (INTVAL (operand))
1576 case FRM_RNE:
1577 return FRM_RNE;
1578 case FRM_RTZ:
1579 return FRM_RTZ;
1580 case FRM_RDN:
1581 return FRM_RDN;
1582 case FRM_RUP:
1583 return FRM_RUP;
1584 case FRM_RMM:
1585 return FRM_RMM;
1586 case FRM_DYN:
1587 return FRM_DYN;
1588 default:
1589 gcc_unreachable ();
1592 gcc_unreachable ();
1595 /* Expand a pre-RA RVV data move from SRC to DEST.
1596 It expands moves for RVV fractional vector modes.
1597 Return true if the move has already been emitted. */
1598 bool
1599 legitimize_move (rtx dest, rtx *srcp)
1601 rtx src = *srcp;
1602 machine_mode mode = GET_MODE (dest);
1603 if (CONST_VECTOR_P (src))
1605 expand_const_vector (dest, src);
1606 return true;
1609 if (riscv_v_ext_vls_mode_p (mode))
1611 if (GET_MODE_NUNITS (mode).to_constant () <= 31)
1613 /* For VLS modes with NUNITS <= 31, we don't need to extract
1614 scalar registers, so we apply the naive (set (op0) (op1)) pattern. */
1615 if (can_create_pseudo_p ())
1617 /* Need to force register if mem <- !reg. */
1618 if (MEM_P (dest) && !REG_P (src))
1619 *srcp = force_reg (mode, src);
1621 return false;
1624 else if (GET_MODE_NUNITS (mode).to_constant () > 31 && lra_in_progress)
1626 emit_insn (gen_mov_lra (mode, Pmode, dest, src));
1627 return true;
1630 else
1632 /* In order to decrease the memory traffic, we don't use whole register
1633 * load/store for LMUL less than 1 and for mask modes, so those cases
1634 * require one extra general purpose register, which is not allowed during
1635 * the LRA process. Therefore we have a special move pattern used for LRA,
1636 * which defers the expansion until after LRA. */
1637 if ((known_lt (GET_MODE_SIZE (mode), BYTES_PER_RISCV_VECTOR)
1638 || GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
1639 && lra_in_progress)
1641 emit_insn (gen_mov_lra (mode, Pmode, dest, src));
1642 return true;
1645 if (known_ge (GET_MODE_SIZE (mode), BYTES_PER_RISCV_VECTOR)
1646 && GET_MODE_CLASS (mode) != MODE_VECTOR_BOOL)
1648 /* Need to force register if mem <- !reg. */
1649 if (MEM_P (dest) && !REG_P (src))
1650 *srcp = force_reg (mode, src);
1652 return false;
1656 if (register_operand (src, mode) && register_operand (dest, mode))
1658 emit_insn (gen_rtx_SET (dest, src));
1659 return true;
1662 unsigned insn_flags
1663 = GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL ? UNARY_MASK_OP : UNARY_OP;
1664 if (!register_operand (src, mode) && !register_operand (dest, mode))
1666 rtx tmp = gen_reg_rtx (mode);
1667 if (MEM_P (src))
1669 rtx ops[] = {tmp, src};
1670 emit_vlmax_insn (code_for_pred_mov (mode), insn_flags, ops);
1672 else
1673 emit_move_insn (tmp, src);
1674 src = tmp;
1677 if (satisfies_constraint_vu (src))
1678 return false;
1680 rtx ops[] = {dest, src};
1681 emit_vlmax_insn (code_for_pred_mov (mode), insn_flags, ops);
1682 return true;
1685 /* VTYPE information for machine_mode. */
1686 struct mode_vtype_group
1688 enum vlmul_type vlmul[NUM_MACHINE_MODES];
1689 uint8_t ratio[NUM_MACHINE_MODES];
1690 machine_mode subpart_mode[NUM_MACHINE_MODES];
1691 uint8_t nf[NUM_MACHINE_MODES];
1692 mode_vtype_group ()
1694 #define ENTRY(MODE, REQUIREMENT, VLMUL, RATIO) \
1695 vlmul[MODE##mode] = VLMUL; \
1696 ratio[MODE##mode] = RATIO;
1697 #define TUPLE_ENTRY(MODE, REQUIREMENT, SUBPART_MODE, NF, VLMUL, RATIO) \
1698 subpart_mode[MODE##mode] = SUBPART_MODE##mode; \
1699 nf[MODE##mode] = NF; \
1700 vlmul[MODE##mode] = VLMUL; \
1701 ratio[MODE##mode] = RATIO;
1702 #include "riscv-vector-switch.def"
1703 #undef ENTRY
1704 #undef TUPLE_ENTRY
1708 static mode_vtype_group mode_vtype_infos;
1710 /* Get vlmul field value by comparing LMUL with BYTES_PER_RISCV_VECTOR. */
1711 enum vlmul_type
1712 get_vlmul (machine_mode mode)
1714 /* For VLS modes, the vlmul should be dynamically
1715 calculated since we need to adjust VLMUL according
1716 to TARGET_MIN_VLEN. */
1717 if (riscv_v_ext_vls_mode_p (mode))
1719 int size = GET_MODE_BITSIZE (mode).to_constant ();
1720 int inner_size = GET_MODE_BITSIZE (GET_MODE_INNER (mode));
1721 if (size < TARGET_MIN_VLEN)
1723 int factor = TARGET_MIN_VLEN / size;
1724 if (inner_size == 8)
1725 factor = MIN (factor, 8);
1726 else if (inner_size == 16)
1727 factor = MIN (factor, 4);
1728 else if (inner_size == 32)
1729 factor = MIN (factor, 2);
1730 else if (inner_size == 64)
1731 factor = MIN (factor, 1);
1732 else
1733 gcc_unreachable ();
1735 switch (factor)
1737 case 1:
1738 return LMUL_1;
1739 case 2:
1740 return LMUL_F2;
1741 case 4:
1742 return LMUL_F4;
1743 case 8:
1744 return LMUL_F8;
1746 default:
1747 gcc_unreachable ();
1750 else
1752 int factor = size / TARGET_MIN_VLEN;
1753 switch (factor)
1755 case 1:
1756 return LMUL_1;
1757 case 2:
1758 return LMUL_2;
1759 case 4:
1760 return LMUL_4;
1761 case 8:
1762 return LMUL_8;
1764 default:
1765 gcc_unreachable ();
1769 return mode_vtype_infos.vlmul[mode];
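/* For example, with TARGET_MIN_VLEN = 128, a 32-bit VLS mode of 8-bit
   elements has factor = 128 / 32 = 4 and thus LMUL_F4, whereas a 256-bit
   VLS mode has factor = 256 / 128 = 2 and thus LMUL_2.  */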
1772 /* Return the VLMAX rtx of vector mode MODE. */
1773 rtx
1774 get_vlmax_rtx (machine_mode mode)
1776 gcc_assert (riscv_v_ext_vector_mode_p (mode));
1777 return gen_int_mode (GET_MODE_NUNITS (mode), Pmode);
1780 /* Return the NF value of the corresponding mode. */
1781 unsigned int
1782 get_nf (machine_mode mode)
1784 /* We don't allow non-tuple modes to go through this function. */
1785 gcc_assert (riscv_v_ext_tuple_mode_p (mode));
1786 return mode_vtype_infos.nf[mode];
1789 /* Return the subpart mode of the tuple mode. For RVVM2x2SImode,
1790 the subpart mode is RVVM2SImode. This will help to build
1791 array/struct type in builtins. */
1792 machine_mode
1793 get_subpart_mode (machine_mode mode)
1795 /* We don't allow non-tuple modes to go through this function. */
1796 gcc_assert (riscv_v_ext_tuple_mode_p (mode));
1797 return mode_vtype_infos.subpart_mode[mode];
1800 /* Get ratio according to machine mode. */
1801 unsigned int
1802 get_ratio (machine_mode mode)
1804 if (riscv_v_ext_vls_mode_p (mode))
1806 unsigned int sew = get_sew (mode);
1807 vlmul_type vlmul = get_vlmul (mode);
1808 switch (vlmul)
1810 case LMUL_1:
1811 return sew;
1812 case LMUL_2:
1813 return sew / 2;
1814 case LMUL_4:
1815 return sew / 4;
1816 case LMUL_8:
1817 return sew / 8;
1818 case LMUL_F8:
1819 return sew * 8;
1820 case LMUL_F4:
1821 return sew * 4;
1822 case LMUL_F2:
1823 return sew * 2;
1825 default:
1826 gcc_unreachable ();
1829 return mode_vtype_infos.ratio[mode];
1832 /* Get ta according to operand[tail_op_idx]. */
1834 get_ta (rtx ta)
1836 if (INTVAL (ta) == TAIL_ANY)
1837 return INVALID_ATTRIBUTE;
1838 return INTVAL (ta);
1841 /* Get ma according to operand[mask_op_idx]. */
1843 get_ma (rtx ma)
1845 if (INTVAL (ma) == MASK_ANY)
1846 return INVALID_ATTRIBUTE;
1847 return INTVAL (ma);
1850 /* Get prefer tail policy. */
1851 enum tail_policy
1852 get_prefer_tail_policy ()
1854 /* TODO: By default, we choose to use TAIL_ANY, which allows the
1855 compiler to pick either agnostic or undisturbed. Maybe we
1856 will add a compile option like -mprefer=agnostic to set
1857 this value? */
1858 return TAIL_ANY;
1861 /* Get prefer mask policy. */
1862 enum mask_policy
1863 get_prefer_mask_policy ()
1865 /* TODO: By default, we choose MASK_ANY, which allows the
1866 compiler to pick either agnostic or undisturbed. Maybe we
1867 will add a compile option like -mprefer=agnostic to set
1868 this value in the future. */
1869 return MASK_ANY;
1872 /* Get avl_type rtx. */
1874 get_avl_type_rtx (enum avl_type type)
1876 return gen_int_mode (type, Pmode);
1879 /* Return the appropriate mask mode for MODE. */
1881 machine_mode
1882 get_mask_mode (machine_mode mode)
1884 poly_int64 nunits = GET_MODE_NUNITS (mode);
1885 if (riscv_v_ext_tuple_mode_p (mode))
1887 unsigned int nf = get_nf (mode);
1888 nunits = exact_div (nunits, nf);
1890 return get_vector_mode (BImode, nunits).require ();
1893 /* Return the appropriate LMUL mode for MODE. */
1895 opt_machine_mode
1896 get_lmul_mode (scalar_mode mode, int lmul)
1898 poly_uint64 lmul_nunits;
1899 unsigned int bytes = GET_MODE_SIZE (mode);
1900 if (multiple_p (BYTES_PER_RISCV_VECTOR * lmul, bytes, &lmul_nunits))
1901 return get_vector_mode (mode, lmul_nunits);
1902 return E_VOIDmode;
1905 /* Return the appropriate M1 mode for MODE. */
1907 static opt_machine_mode
1908 get_m1_mode (machine_mode mode)
1910 scalar_mode smode = GET_MODE_INNER (mode);
1911 unsigned int bytes = GET_MODE_SIZE (smode);
1912 poly_uint64 m1_nunits = exact_div (BYTES_PER_RISCV_VECTOR, bytes);
1913 return get_vector_mode (smode, m1_nunits);
1916 /* Return the RVV vector mode that has NUNITS elements of mode INNER_MODE.
1917 This function is not only used by builtins, but also will be used by
1918 auto-vectorization in the future. */
1919 opt_machine_mode
1920 get_vector_mode (scalar_mode inner_mode, poly_uint64 nunits)
1922 enum mode_class mclass;
1923 if (inner_mode == E_BImode)
1924 mclass = MODE_VECTOR_BOOL;
1925 else if (FLOAT_MODE_P (inner_mode))
1926 mclass = MODE_VECTOR_FLOAT;
1927 else
1928 mclass = MODE_VECTOR_INT;
1929 machine_mode mode;
1930 FOR_EACH_MODE_IN_CLASS (mode, mclass)
1931 if (inner_mode == GET_MODE_INNER (mode)
1932 && known_eq (nunits, GET_MODE_NUNITS (mode))
1933 && (riscv_v_ext_vector_mode_p (mode)
1934 || riscv_v_ext_vls_mode_p (mode)))
1935 return mode;
1936 return opt_machine_mode ();
1939 /* Return the RVV tuple mode if we can find the legal tuple mode for the
1940 corresponding subpart mode and NF. */
1941 opt_machine_mode
1942 get_tuple_mode (machine_mode subpart_mode, unsigned int nf)
1944 poly_uint64 nunits = GET_MODE_NUNITS (subpart_mode) * nf;
1945 scalar_mode inner_mode = GET_MODE_INNER (subpart_mode);
1946 enum mode_class mclass = GET_MODE_CLASS (subpart_mode);
1947 machine_mode mode;
1948 FOR_EACH_MODE_IN_CLASS (mode, mclass)
1949 if (inner_mode == GET_MODE_INNER (mode)
1950 && known_eq (nunits, GET_MODE_NUNITS (mode))
1951 && riscv_v_ext_tuple_mode_p (mode)
1952 && get_subpart_mode (mode) == subpart_mode)
1953 return mode;
1954 return opt_machine_mode ();
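/* Return true if X is a CONST_INT that fits in a signed 5-bit immediate,
   i.e. is in the range [-16, 15] accepted by the .vi instruction forms.  */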
1957 bool
1958 simm5_p (rtx x)
1960 if (!CONST_INT_P (x))
1961 return false;
1962 return IN_RANGE (INTVAL (x), -16, 15);
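/* Return true if X is a CONST_INT in [-15, 16], i.e. its negation fits in a
   signed 5-bit immediate; this is used when the operation is reversed, for
   example turning a subtract into an add of -X.  */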
1965 bool
1966 neg_simm5_p (rtx x)
1968 if (!CONST_INT_P (x))
1969 return false;
1970 return IN_RANGE (INTVAL (x), -15, 16);
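/* Return true if immediate operand X of CODE can be encoded in a .vi
   instruction variant, possibly after reversing or negating the operation
   (see the neg_simm5_p cases below).  */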
1973 bool
1974 has_vi_variant_p (rtx_code code, rtx x)
1976 switch (code)
1978 case PLUS:
1979 case AND:
1980 case IOR:
1981 case XOR:
1982 case SS_PLUS:
1983 case US_PLUS:
1984 case EQ:
1985 case NE:
1986 case LE:
1987 case LEU:
1988 case GT:
1989 case GTU:
1990 return simm5_p (x);
1992 case LT:
1993 case LTU:
1994 case GE:
1995 case GEU:
1996 case MINUS:
1997 case SS_MINUS:
1998 return neg_simm5_p (x);
2000 default:
2001 return false;
2005 bool
2006 sew64_scalar_helper (rtx *operands, rtx *scalar_op, rtx vl,
2007 machine_mode vector_mode, bool has_vi_variant_p,
2008 void (*emit_vector_func) (rtx *, rtx), enum avl_type type)
2010 machine_mode scalar_mode = GET_MODE_INNER (vector_mode);
2011 if (has_vi_variant_p)
2013 *scalar_op = force_reg (scalar_mode, *scalar_op);
2014 return false;
2017 if (TARGET_64BIT)
2019 if (!rtx_equal_p (*scalar_op, const0_rtx))
2020 *scalar_op = force_reg (scalar_mode, *scalar_op);
2021 return false;
2024 if (immediate_operand (*scalar_op, Pmode))
2026 if (!rtx_equal_p (*scalar_op, const0_rtx))
2027 *scalar_op = force_reg (Pmode, *scalar_op);
2029 *scalar_op = gen_rtx_SIGN_EXTEND (scalar_mode, *scalar_op);
2030 return false;
2033 if (CONST_INT_P (*scalar_op))
2035 if (maybe_gt (GET_MODE_SIZE (scalar_mode), GET_MODE_SIZE (Pmode)))
2036 *scalar_op = force_const_mem (scalar_mode, *scalar_op);
2037 else
2038 *scalar_op = force_reg (scalar_mode, *scalar_op);
2041 rtx tmp = gen_reg_rtx (vector_mode);
2042 rtx ops[] = {tmp, *scalar_op};
2043 if (type == VLMAX)
2044 emit_vlmax_insn (code_for_pred_broadcast (vector_mode), UNARY_OP, ops);
2045 else
2046 emit_nonvlmax_insn (code_for_pred_broadcast (vector_mode), UNARY_OP, ops,
2047 vl);
2048 emit_vector_func (operands, tmp);
2050 return true;
2053 /* Get the scalar-move mask { 1, 0, 0, ..., 0 }, i.e. only element 0 is active. */
2055 gen_scalar_move_mask (machine_mode mode)
2057 rtx_vector_builder builder (mode, 1, 2);
2058 builder.quick_push (const1_rtx);
2059 builder.quick_push (const0_rtx);
2060 return builder.build ();
2063 static unsigned
2064 compute_vlmax (unsigned vector_bits, unsigned elt_size, unsigned min_size)
2066 // Original equation:
2067 // VLMAX = (VectorBits / EltSize) * LMUL
2068 // where LMUL = MinSize / TARGET_MIN_VLEN
2069 // The following equations have been reordered to prevent loss of precision
2070 // when calculating fractional LMUL.
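// Worked example with assumed numbers: vector_bits = 128, elt_size = 32,
// min_size = 64 and TARGET_MIN_VLEN = 128 (i.e. LMUL = 1/2) gives
// VLMAX = ((128 / 32) * 64) / 128 = 2.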
2071 return ((vector_bits / elt_size) * min_size) / TARGET_MIN_VLEN;
2074 static unsigned
2075 get_unknown_min_value (machine_mode mode)
2077 enum vlmul_type vlmul = get_vlmul (mode);
2078 switch (vlmul)
2080 case LMUL_1:
2081 return TARGET_MIN_VLEN;
2082 case LMUL_2:
2083 return TARGET_MIN_VLEN * 2;
2084 case LMUL_4:
2085 return TARGET_MIN_VLEN * 4;
2086 case LMUL_8:
2087 return TARGET_MIN_VLEN * 8;
2088 default:
2089 gcc_unreachable ();
2093 static rtx
2094 force_vector_length_operand (rtx vl)
2096 if (CONST_INT_P (vl) && !satisfies_constraint_K (vl))
2097 return force_reg (Pmode, vl);
2098 return vl;
2102 gen_no_side_effects_vsetvl_rtx (machine_mode vmode, rtx vl, rtx avl)
2104 unsigned int sew = get_sew (vmode);
2105 rtx tail_policy = gen_int_mode (get_prefer_tail_policy (), Pmode);
2106 rtx mask_policy = gen_int_mode (get_prefer_mask_policy (), Pmode);
2107 return gen_vsetvl_no_side_effects (Pmode, vl, avl, gen_int_mode (sew, Pmode),
2108 gen_int_mode (get_vlmul (vmode), Pmode),
2109 tail_policy, mask_policy);
2112 /* Get a VL * 2 rtx for the demoted (SEW / 2) mode. */
2113 static rtx
2114 get_vl_x2_rtx (rtx avl, machine_mode mode, machine_mode demote_mode)
2116 rtx i32vl = NULL_RTX;
2117 if (CONST_INT_P (avl))
2119 unsigned elt_size = GET_MODE_BITSIZE (GET_MODE_INNER (mode));
2120 unsigned min_size = get_unknown_min_value (mode);
2121 unsigned vlen_max = RVV_65536;
2122 unsigned vlmax_max = compute_vlmax (vlen_max, elt_size, min_size);
2123 unsigned vlen_min = TARGET_MIN_VLEN;
2124 unsigned vlmax_min = compute_vlmax (vlen_min, elt_size, min_size);
2126 unsigned HOST_WIDE_INT avl_int = INTVAL (avl);
2127 if (avl_int <= vlmax_min)
2128 i32vl = gen_int_mode (2 * avl_int, Pmode);
2129 else if (avl_int >= 2 * vlmax_max)
2131 // Just set i32vl to VLMAX in this situation
2132 i32vl = gen_reg_rtx (Pmode);
2133 emit_insn (
2134 gen_no_side_effects_vsetvl_rtx (demote_mode, i32vl, RVV_VLMAX));
2136 else
2138 // For AVL between (MinVLMAX, 2 * MaxVLMAX), the actual working vl
2139 // is related to the hardware implementation.
2140 // So let the following code handle it.
2143 if (!i32vl)
2145 // Use a vsetvli instruction to get the actually used length, which is
2146 // related to the hardware implementation.
2147 rtx i64vl = gen_reg_rtx (Pmode);
2148 emit_insn (
2149 gen_no_side_effects_vsetvl_rtx (mode, i64vl, force_reg (Pmode, avl)));
2150 // scale 2 for 32-bit length
2151 i32vl = gen_reg_rtx (Pmode);
2152 emit_insn (
2153 gen_rtx_SET (i32vl, gen_rtx_ASHIFT (Pmode, i64vl, const1_rtx)));
2156 return force_vector_length_operand (i32vl);
2159 bool
2160 slide1_sew64_helper (int unspec, machine_mode mode, machine_mode demote_mode,
2161 machine_mode demote_mask_mode, rtx *ops)
2163 rtx scalar_op = ops[4];
2164 rtx avl = ops[5];
2165 machine_mode scalar_mode = GET_MODE_INNER (mode);
2166 if (rtx_equal_p (scalar_op, const0_rtx))
2168 ops[5] = force_vector_length_operand (ops[5]);
2169 return false;
2172 if (TARGET_64BIT)
2174 ops[4] = force_reg (scalar_mode, scalar_op);
2175 ops[5] = force_vector_length_operand (ops[5]);
2176 return false;
2179 if (immediate_operand (scalar_op, Pmode))
2181 ops[4] = gen_rtx_SIGN_EXTEND (scalar_mode, force_reg (Pmode, scalar_op));
2182 ops[5] = force_vector_length_operand (ops[5]);
2183 return false;
2186 if (CONST_INT_P (scalar_op))
2187 scalar_op = force_reg (scalar_mode, scalar_op);
2189 rtx vl_x2 = get_vl_x2_rtx (avl, mode, demote_mode);
2191 rtx demote_scalar_op1, demote_scalar_op2;
2192 if (unspec == UNSPEC_VSLIDE1UP)
2194 demote_scalar_op1 = gen_highpart (Pmode, scalar_op);
2195 demote_scalar_op2 = gen_lowpart (Pmode, scalar_op);
2197 else
2199 demote_scalar_op1 = gen_lowpart (Pmode, scalar_op);
2200 demote_scalar_op2 = gen_highpart (Pmode, scalar_op);
2203 rtx temp = gen_reg_rtx (demote_mode);
2204 rtx ta = gen_int_mode (get_prefer_tail_policy (), Pmode);
2205 rtx ma = gen_int_mode (get_prefer_mask_policy (), Pmode);
2206 rtx merge = RVV_VUNDEF (demote_mode);
2207 /* Handle vslide1<ud>_tu. */
2208 if (register_operand (ops[2], mode)
2209 && rtx_equal_p (ops[1], CONSTM1_RTX (GET_MODE (ops[1]))))
2211 merge = gen_lowpart (demote_mode, ops[2]);
2212 ta = ops[6];
2213 ma = ops[7];
2216 emit_insn (gen_pred_slide (unspec, demote_mode, temp,
2217 CONSTM1_RTX (demote_mask_mode), merge,
2218 gen_lowpart (demote_mode, ops[3]),
2219 demote_scalar_op1, vl_x2, ta, ma, ops[8]));
2220 emit_insn (gen_pred_slide (unspec, demote_mode,
2221 gen_lowpart (demote_mode, ops[0]),
2222 CONSTM1_RTX (demote_mask_mode), merge, temp,
2223 demote_scalar_op2, vl_x2, ta, ma, ops[8]));
2225 if (!rtx_equal_p (ops[1], CONSTM1_RTX (GET_MODE (ops[1])))
2226 && !rtx_equal_p (ops[2], RVV_VUNDEF (GET_MODE (ops[2]))))
2227 emit_insn (gen_pred_merge (mode, ops[0], ops[2], ops[2], ops[0], ops[1],
2228 force_vector_length_operand (ops[5]), ops[6],
2229 ops[8]));
2230 return true;
2234 gen_avl_for_scalar_move (rtx avl)
2236 /* AVL for scalar move behaves differently for 0 and for values larger than 0. */
2237 if (CONST_INT_P (avl))
2239 /* So we could just set AVL to 1 for any constant other than 0. */
2240 if (rtx_equal_p (avl, const0_rtx))
2241 return const0_rtx;
2242 else
2243 return const1_rtx;
2245 else
2247 /* For a non-constant value, we set any nonzero value to 1 by
2248 `sgtu new_avl,input_avl,zero` + `vsetvli`. */
2249 rtx tmp = gen_reg_rtx (Pmode);
2250 emit_insn (
2251 gen_rtx_SET (tmp, gen_rtx_fmt_ee (GTU, Pmode, avl, const0_rtx)));
2252 return tmp;
2256 /* Expand data movement for tuple modes. */
2257 void
2258 expand_tuple_move (rtx *ops)
2260 unsigned int i;
2261 machine_mode tuple_mode = GET_MODE (ops[0]);
2262 machine_mode subpart_mode = get_subpart_mode (tuple_mode);
2263 poly_int64 subpart_size = GET_MODE_SIZE (subpart_mode);
2264 unsigned int nf = get_nf (tuple_mode);
2265 bool fractional_p = known_lt (subpart_size, BYTES_PER_RISCV_VECTOR);
2267 if (REG_P (ops[0]) && CONST_VECTOR_P (ops[1]))
2269 rtx val;
2270 gcc_assert (can_create_pseudo_p ()
2271 && const_vec_duplicate_p (ops[1], &val));
2272 for (i = 0; i < nf; ++i)
2274 poly_int64 offset = i * subpart_size;
2275 rtx subreg
2276 = simplify_gen_subreg (subpart_mode, ops[0], tuple_mode, offset);
2277 rtx dup = gen_const_vec_duplicate (subpart_mode, val);
2278 emit_move_insn (subreg, dup);
2281 else if (REG_P (ops[0]) && REG_P (ops[1]))
2283 for (i = 0; i < nf; ++i)
2285 int index = i;
2287 /* Take NF = 2 and LMUL = 1 for example:
2289 - move v8 to v9:
2290 vmv1r v10,v9
2291 vmv1r v9,v8
2293 - move v8 to v7:
2294 vmv1r v7,v8
2295 vmv1r v8,v9 */
2296 if (REGNO (ops[0]) > REGNO (ops[1]))
2297 index = nf - 1 - i;
2298 poly_int64 offset = index * subpart_size;
2299 rtx dst_subreg
2300 = simplify_gen_subreg (subpart_mode, ops[0], tuple_mode, offset);
2301 rtx src_subreg
2302 = simplify_gen_subreg (subpart_mode, ops[1], tuple_mode, offset);
2303 emit_insn (gen_rtx_SET (dst_subreg, src_subreg));
2306 else
2308 /* Expand tuple memory data movement. */
2309 gcc_assert (MEM_P (ops[0]) || MEM_P (ops[1]));
2310 rtx offset = gen_int_mode (subpart_size, Pmode);
2311 if (!subpart_size.is_constant ())
2313 emit_move_insn (ops[2], gen_int_mode (BYTES_PER_RISCV_VECTOR, Pmode));
2314 if (fractional_p)
2316 unsigned int factor
2317 = exact_div (BYTES_PER_RISCV_VECTOR, subpart_size)
2318 .to_constant ();
2319 rtx pat
2320 = gen_rtx_ASHIFTRT (Pmode, ops[2],
2321 gen_int_mode (exact_log2 (factor), Pmode));
2322 emit_insn (gen_rtx_SET (ops[2], pat));
2325 if (known_gt (subpart_size, BYTES_PER_RISCV_VECTOR))
2327 unsigned int factor
2328 = exact_div (subpart_size, BYTES_PER_RISCV_VECTOR)
2329 .to_constant ();
2330 rtx pat
2331 = gen_rtx_ASHIFT (Pmode, ops[2],
2332 gen_int_mode (exact_log2 (factor), Pmode));
2333 emit_insn (gen_rtx_SET (ops[2], pat));
2335 offset = ops[2];
2338 /* Non-fractional LMUL has whole register moves that don't require a
2339 vsetvl for VLMAX. */
2340 if (fractional_p)
2341 emit_vlmax_vsetvl (subpart_mode, ops[4]);
2342 if (MEM_P (ops[1]))
2344 /* Load operations. */
2345 emit_move_insn (ops[3], XEXP (ops[1], 0));
2346 for (i = 0; i < nf; i++)
2348 rtx subreg = simplify_gen_subreg (subpart_mode, ops[0],
2349 tuple_mode, i * subpart_size);
2350 if (i != 0)
2352 rtx new_addr = gen_rtx_PLUS (Pmode, ops[3], offset);
2353 emit_insn (gen_rtx_SET (ops[3], new_addr));
2355 rtx mem = gen_rtx_MEM (subpart_mode, ops[3]);
2357 if (fractional_p)
2359 rtx operands[] = {subreg, mem};
2360 emit_vlmax_insn_lra (code_for_pred_mov (subpart_mode),
2361 UNARY_OP, operands, ops[4]);
2363 else
2364 emit_move_insn (subreg, mem);
2367 else
2369 /* Store operations. */
2370 emit_move_insn (ops[3], XEXP (ops[0], 0));
2371 for (i = 0; i < nf; i++)
2373 rtx subreg = simplify_gen_subreg (subpart_mode, ops[1],
2374 tuple_mode, i * subpart_size);
2375 if (i != 0)
2377 rtx new_addr = gen_rtx_PLUS (Pmode, ops[3], offset);
2378 emit_insn (gen_rtx_SET (ops[3], new_addr));
2380 rtx mem = gen_rtx_MEM (subpart_mode, ops[3]);
2382 if (fractional_p)
2384 rtx operands[] = {mem, subreg};
2385 emit_vlmax_insn_lra (code_for_pred_mov (subpart_mode),
2386 UNARY_OP, operands, ops[4]);
2388 else
2389 emit_move_insn (mem, subreg);
2395 /* Return the vectorization machine mode for RVV according to LMUL. */
2396 machine_mode
2397 preferred_simd_mode (scalar_mode mode)
2399 if (autovec_use_vlmax_p ())
2401 /* We use BYTES_PER_RISCV_VECTOR (the LMUL = 1 byte size) multiplied by
2402 rvv_max_lmul as the vector size and divide it by the scalar size to
2403 calculate the NUNITS of the auto-vectorization mode. */
2404 poly_uint64 nunits;
2405 poly_uint64 vector_size = BYTES_PER_RISCV_VECTOR * TARGET_MAX_LMUL;
2406 poly_uint64 scalar_size = GET_MODE_SIZE (mode);
2407 /* Disable vectorization when we can't find an RVV mode for it.
2408 E.g. -march=rv64gc_zve32x doesn't have a vector mode to vectorize
2409 a double (DFmode) type. */
2410 if (!multiple_p (vector_size, scalar_size, &nunits))
2411 return word_mode;
2412 machine_mode rvv_mode;
2413 if (get_vector_mode (mode, nunits).exists (&rvv_mode))
2414 return rvv_mode;
2416 return word_mode;
2419 /* Use merge approach to initialize the vector with repeating sequence.
2420 v = {a, b, a, b, a, b, a, b}.
2422 v = broadcast (a).
2423 mask = 0b01010101....
2424 v = merge (v, b, mask)
2426 static void
2427 expand_vector_init_merge_repeating_sequence (rtx target,
2428 const rvv_builder &builder)
2430 /* We can't use BIT mode (BI) directly to generate mask = 0b01010...
2431 since we don't have such an instruction in RVV.
2432 Instead, we should use an INT mode (QI/HI/SI/DI) with an integer
2433 move instruction to generate the mask data we want. */
2434 machine_mode mask_bit_mode = get_mask_mode (builder.mode ());
2435 machine_mode mask_int_mode
2436 = get_repeating_sequence_dup_machine_mode (builder, mask_bit_mode);
2437 uint64_t full_nelts = builder.full_nelts ().to_constant ();
2439 /* Step 1: Broadcast the first pattern. */
2440 rtx ops[] = {target, force_reg (builder.inner_mode (), builder.elt (0))};
2441 emit_vlmax_insn (code_for_pred_broadcast (builder.mode ()),
2442 UNARY_OP, ops);
2443 /* Step 2: Merge the remaining iterations of the pattern. */
2444 for (unsigned int i = 1; i < builder.npatterns (); i++)
2446 /* Step 2-1: Generate mask register v0 for each merge. */
2447 rtx merge_mask
2448 = builder.get_merge_scalar_mask (i, GET_MODE_INNER (mask_int_mode));
2449 rtx mask = gen_reg_rtx (mask_bit_mode);
2450 rtx dup = gen_reg_rtx (mask_int_mode);
2452 if (full_nelts <= builder.inner_bits_size ()) /* vmv.s.x. */
2454 rtx ops[] = {dup, merge_mask};
2455 emit_nonvlmax_insn (code_for_pred_broadcast (GET_MODE (dup)),
2456 SCALAR_MOVE_OP, ops, CONST1_RTX (Pmode));
2458 else /* vmv.v.x. */
2460 rtx ops[] = {dup,
2461 force_reg (GET_MODE_INNER (mask_int_mode), merge_mask)};
2462 rtx vl = gen_int_mode (CEIL (full_nelts, builder.inner_bits_size ()),
2463 Pmode);
2464 emit_nonvlmax_insn (code_for_pred_broadcast (mask_int_mode), UNARY_OP,
2465 ops, vl);
2468 emit_move_insn (mask, gen_lowpart (mask_bit_mode, dup));
2470 /* Step 2-2: Merge pattern according to the mask. */
2471 rtx ops[] = {target, target, builder.elt (i), mask};
2472 emit_vlmax_insn (code_for_pred_merge_scalar (GET_MODE (target)),
2473 MERGE_OP, ops);
2477 /* Use slideup approach to combine the vectors.
2478 v = {a, a, a, a, b, b, b, b}
2480 First:
2481 v1 = {a, a, a, a, a, a, a, a}
2482 v2 = {b, b, b, b, b, b, b, b}
2483 v = slideup (v1, v2, nelt / 2)
2485 static void
2486 expand_vector_init_slideup_combine_sequence (rtx target,
2487 const rvv_builder &builder)
2489 machine_mode mode = GET_MODE (target);
2490 int nelts = builder.full_nelts ().to_constant ();
2491 rtx first_elt = builder.elt (0);
2492 rtx last_elt = builder.elt (nelts - 1);
2493 rtx low = expand_vector_broadcast (mode, first_elt);
2494 rtx high = expand_vector_broadcast (mode, last_elt);
2495 insn_code icode = code_for_pred_slide (UNSPEC_VSLIDEUP, mode);
2496 rtx ops[] = {target, low, high, gen_int_mode (nelts / 2, Pmode)};
2497 emit_vlmax_insn (icode, SLIDEUP_OP_MERGE, ops);
2500 /* Use merge approach to merge a scalar into a vector.
2501 v = {a, a, a, a, a, a, b, b}
2503 v1 = {a, a, a, a, a, a, a, a}
2504 scalar = b
2505 mask = {0, 0, 0, 0, 0, 0, 1, 1}
2507 static void
2508 expand_vector_init_merge_combine_sequence (rtx target,
2509 const rvv_builder &builder)
2511 machine_mode mode = GET_MODE (target);
2512 machine_mode imode = builder.int_mode ();
2513 machine_mode mmode = builder.mask_mode ();
2514 int nelts = builder.full_nelts ().to_constant ();
2515 int leading_ndups = builder.count_dups (0, nelts - 1, 1);
2516 if ((leading_ndups > 255 && GET_MODE_INNER (imode) == QImode)
2517 || riscv_get_v_regno_alignment (imode) > 1)
2518 imode = get_vector_mode (HImode, nelts).require ();
2520 /* Generate vid = { 0, 1, 2, ..., n }. */
2521 rtx vid = gen_reg_rtx (imode);
2522 expand_vec_series (vid, const0_rtx, const1_rtx);
2524 /* Generate mask. */
2525 rtx mask = gen_reg_rtx (mmode);
2526 insn_code icode = code_for_pred_cmp_scalar (imode);
2527 rtx index = gen_int_mode (leading_ndups - 1, builder.inner_int_mode ());
2528 rtx dup_rtx = gen_rtx_VEC_DUPLICATE (imode, index);
2529 /* vmsgtu.vi/vmsgtu.vx. */
2530 rtx cmp = gen_rtx_fmt_ee (GTU, mmode, vid, dup_rtx);
2531 rtx sel = builder.elt (nelts - 1);
2532 rtx mask_ops[] = {mask, cmp, vid, index};
2533 emit_vlmax_insn (icode, COMPARE_OP, mask_ops);
2535 /* Duplicate the first elements. */
2536 rtx dup = expand_vector_broadcast (mode, builder.elt (0));
2537 /* Merge scalar into vector according to mask. */
2538 rtx merge_ops[] = {target, dup, sel, mask};
2539 icode = code_for_pred_merge_scalar (mode);
2540 emit_vlmax_insn (icode, MERGE_OP, merge_ops);
2543 /* Initialize register TARGET from the elements in PARALLEL rtx VALS. */
2545 void
2546 expand_vec_init (rtx target, rtx vals)
2548 machine_mode mode = GET_MODE (target);
2549 int nelts = XVECLEN (vals, 0);
2551 rvv_builder v (mode, nelts, 1);
2552 for (int i = 0; i < nelts; i++)
2553 v.quick_push (XVECEXP (vals, 0, i));
2554 v.finalize ();
2556 /* If the sequence is v = { a, a, a, a } just broadcast an element. */
2557 if (v.is_repeating_sequence ())
2559 machine_mode mode = GET_MODE (target);
2560 rtx dup = expand_vector_broadcast (mode, v.elt (0));
2561 emit_move_insn (target, dup);
2562 return;
2565 if (nelts > 3)
2567 /* Case 1: Convert v = { a, b, a, b } into v = { ab, ab }. */
2568 if (v.can_duplicate_repeating_sequence_p ())
2570 rtx ele = v.get_merged_repeating_sequence ();
2571 rtx dup = expand_vector_broadcast (v.new_mode (), ele);
2572 emit_move_insn (target, gen_lowpart (mode, dup));
2573 return;
2576 /* Case 2: Optimize repeating sequence cases that Case 1 cannot
2577 handle, when it is profitable. For example:
2578 ELEMENT BITSIZE = 64.
2579 v = {a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b}.
2580 We can't find a vector mode for "ab" which will be combined into
2581 128-bit element to duplicate. */
2582 if (v.repeating_sequence_use_merge_profitable_p ())
2584 expand_vector_init_merge_repeating_sequence (target, v);
2585 return;
2588 /* Case 3: Optimize combine sequence.
2589 E.g. v = {a, a, a, a, a, a, a, a, b, b, b, b, b, b, b, b}.
2590 We can combine:
2591 v1 = {a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a}.
2593 v2 = {b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b}.
2594 by slideup. */
2595 if (v.combine_sequence_use_slideup_profitable_p ())
2597 expand_vector_init_slideup_combine_sequence (target, v);
2598 return;
2601 /* Case 4: Optimize combine sequence.
2602 E.g. v = {a, a, a, a, a, a, a, a, a, a, a, b, b, b, b, b}.
2604 Generate vector:
2605 v = {a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a}.
2607 Generate mask:
2608 mask = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1}.
2610 Merge b into v by mask:
2611 v = {a, a, a, a, a, a, a, a, a, a, a, b, b, b, b, b}. */
2612 if (v.combine_sequence_use_merge_profitable_p ())
2614 expand_vector_init_merge_combine_sequence (target, v);
2615 return;
2619 /* Optimize a sequence with trailing identical elements:
2620 v = {y, y2, y3, y4, y5, x, x, x, x, x, x, x, x, x, x, x}; */
2621 if (!expand_vector_init_trailing_same_elem (target, v, nelts))
2622 /* Handle common situation by vslide1down. This function can handle any
2623 situation of vec_init<mode>. Only the cases that are not optimized above
2624 will fall through here. */
2625 expand_vector_init_insert_elems (target, v, nelts);
2628 /* Get insn code for corresponding comparison. */
2630 static insn_code
2631 get_cmp_insn_code (rtx_code code, machine_mode mode)
2633 insn_code icode;
2634 switch (code)
2636 case EQ:
2637 case NE:
2638 case LE:
2639 case LEU:
2640 case GT:
2641 case GTU:
2642 case LTGT:
2643 icode = code_for_pred_cmp (mode);
2644 break;
2645 case LT:
2646 case LTU:
2647 case GE:
2648 case GEU:
2649 if (FLOAT_MODE_P (mode))
2650 icode = code_for_pred_cmp (mode);
2651 else
2652 icode = code_for_pred_ltge (mode);
2653 break;
2654 default:
2655 gcc_unreachable ();
2657 return icode;
2660 /* This hook gives the vectorizer more vector mode options. We want it to not
2661 only try modes with the maximum number of units a full vector can hold but
2662 for example also half the number of units for a smaller elements size.
2663 Such vectors can be promoted to a full vector of widened elements
2664 (still with the same number of elements, essentially vectorizing at a
2665 fixed number of units rather than a fixed number of bytes). */
2666 unsigned int
2667 autovectorize_vector_modes (vector_modes *modes, bool)
2669 if (autovec_use_vlmax_p ())
2671 poly_uint64 full_size = BYTES_PER_RISCV_VECTOR * TARGET_MAX_LMUL;
2673 /* Start with a RVV<LMUL>QImode where LMUL is the number of units that
2674 fit a whole vector.
2675 Then try LMUL = nunits / 2, nunits / 4 and nunits / 8 which
2676 is guided by the extensions we have available (vf2, vf4 and vf8).
2678 - full_size: Try using full vectors for all element types.
2679 - full_size / 2:
2680 Try using 16-bit containers for 8-bit elements and full vectors
2681 for wider elements.
2682 - full_size / 4:
2683 Try using 32-bit containers for 8-bit and 16-bit elements and
2684 full vectors for wider elements.
2685 - full_size / 8:
2686 Try using 64-bit containers for all element types. */
2687 static const int rvv_factors[] = {1, 2, 4, 8, 16, 32, 64};
2688 for (unsigned int i = 0; i < sizeof (rvv_factors) / sizeof (int); i++)
2690 poly_uint64 units;
2691 machine_mode mode;
2692 if (can_div_trunc_p (full_size, rvv_factors[i], &units)
2693 && get_vector_mode (QImode, units).exists (&mode))
2694 modes->safe_push (mode);
2697 /* Push all VLSmodes according to TARGET_MIN_VLEN. */
2698 unsigned int i = 0;
2699 unsigned int base_size = TARGET_MIN_VLEN * TARGET_MAX_LMUL / 8;
2700 unsigned int size = base_size;
2701 machine_mode mode;
2702 while (size > 0 && get_vector_mode (QImode, size).exists (&mode))
2704 if (vls_mode_valid_p (mode))
2705 modes->safe_push (mode);
2707 i++;
2708 size = base_size / (1U << i);
2710 /* Enable LOOP_VINFO comparison in COST model. */
2711 return VECT_COMPARE_COSTS;
2714 /* Return true if we can find the related MODE according to default LMUL. */
2715 static bool
2716 can_find_related_mode_p (machine_mode vector_mode, scalar_mode element_mode,
2717 poly_uint64 *nunits)
2719 if (!autovec_use_vlmax_p ())
2720 return false;
2721 if (riscv_v_ext_vector_mode_p (vector_mode)
2722 && multiple_p (BYTES_PER_RISCV_VECTOR * TARGET_MAX_LMUL,
2723 GET_MODE_SIZE (element_mode), nunits))
2724 return true;
2725 if (riscv_v_ext_vls_mode_p (vector_mode)
2726 && multiple_p (TARGET_MIN_VLEN * TARGET_MAX_LMUL,
2727 GET_MODE_SIZE (element_mode), nunits))
2728 return true;
2729 return false;
2732 /* If the given VECTOR_MODE is an RVV mode, first get the largest number
2733 of units that fit into a full vector at the given ELEMENT_MODE.
2734 We will have the vectorizer call us with a successively decreasing
2735 number of units (as specified in autovectorize_vector_modes).
2736 The starting mode is always the one specified by preferred_simd_mode. */
2737 opt_machine_mode
2738 vectorize_related_mode (machine_mode vector_mode, scalar_mode element_mode,
2739 poly_uint64 nunits)
2741 /* TODO: We will support RVV VLS auto-vectorization mode in the future. */
2742 poly_uint64 min_units;
2743 if (can_find_related_mode_p (vector_mode, element_mode, &min_units))
2745 machine_mode rvv_mode;
2746 if (maybe_ne (nunits, 0U))
2748 /* If we were given a number of units NUNITS, try to find an
2749 RVV vector mode of inner mode ELEMENT_MODE with the same
2750 number of units. */
2751 if (multiple_p (min_units, nunits)
2752 && get_vector_mode (element_mode, nunits).exists (&rvv_mode))
2753 return rvv_mode;
2755 else
2757 /* Look for a vector mode with the same number of units as the
2758 VECTOR_MODE we were given. We keep track of the minimum
2759 number of units so far which determines the smallest necessary
2760 but largest possible, suitable mode for vectorization. */
2761 min_units = ordered_min (min_units, GET_MODE_SIZE (vector_mode));
2762 if (get_vector_mode (element_mode, min_units).exists (&rvv_mode))
2763 return rvv_mode;
2767 return default_vectorize_related_mode (vector_mode, element_mode, nunits);
2770 /* Expand an RVV comparison. */
2772 void
2773 expand_vec_cmp (rtx target, rtx_code code, rtx op0, rtx op1, rtx mask,
2774 rtx maskoff)
2776 machine_mode mask_mode = GET_MODE (target);
2777 machine_mode data_mode = GET_MODE (op0);
2778 insn_code icode = get_cmp_insn_code (code, data_mode);
2780 if (code == LTGT)
2782 rtx lt = gen_reg_rtx (mask_mode);
2783 rtx gt = gen_reg_rtx (mask_mode);
2784 expand_vec_cmp (lt, LT, op0, op1, mask, maskoff);
2785 expand_vec_cmp (gt, GT, op0, op1, mask, maskoff);
2786 icode = code_for_pred (IOR, mask_mode);
2787 rtx ops[] = {target, lt, gt};
2788 emit_vlmax_insn (icode, BINARY_MASK_OP, ops);
2789 return;
2792 rtx cmp = gen_rtx_fmt_ee (code, mask_mode, op0, op1);
2793 if (!mask && !maskoff)
2795 rtx ops[] = {target, cmp, op0, op1};
2796 emit_vlmax_insn (icode, COMPARE_OP, ops);
2798 else
2800 rtx ops[] = {target, mask, maskoff, cmp, op0, op1};
2801 emit_vlmax_insn (icode, COMPARE_OP_MU, ops);
2805 /* Expand an RVV floating-point comparison:
2807 If CAN_INVERT_P is true, the caller can also handle inverted results;
2808 return true if the result is in fact inverted. */
2810 bool
2811 expand_vec_cmp_float (rtx target, rtx_code code, rtx op0, rtx op1,
2812 bool can_invert_p)
2814 machine_mode mask_mode = GET_MODE (target);
2815 machine_mode data_mode = GET_MODE (op0);
2817 /* If can_invert_p = true:
2818 It suffices to implement a u>= b as !(a < b) but with the NaNs masked off:
2820 vmfeq.vv v0, va, va
2821 vmfeq.vv v1, vb, vb
2822 vmand.mm v0, v0, v1
2823 vmflt.vv v0, va, vb, v0.t
2824 vmnot.m v0, v0
2826 And, if !HONOR_SNANS, then you can remove the vmand.mm by masking the
2827 second vmfeq.vv:
2829 vmfeq.vv v0, va, va
2830 vmfeq.vv v0, vb, vb, v0.t
2831 vmflt.vv v0, va, vb, v0.t
2832 vmnot.m v0, v0
2834 If can_invert_p = false:
2836 # Example of implementing isgreater()
2837 vmfeq.vv v0, va, va # Only set where A is not NaN.
2838 vmfeq.vv v1, vb, vb # Only set where B is not NaN.
2839 vmand.mm v0, v0, v1 # Only set where A and B are ordered,
2840 vmfgt.vv v0, va, vb, v0.t # so only set flags on ordered values.
2843 rtx eq0 = gen_reg_rtx (mask_mode);
2844 rtx eq1 = gen_reg_rtx (mask_mode);
2845 switch (code)
2847 case EQ:
2848 case NE:
2849 case LT:
2850 case LE:
2851 case GT:
2852 case GE:
2853 case LTGT:
2854 /* There is native support for the comparison. */
2855 expand_vec_cmp (target, code, op0, op1);
2856 return false;
2857 case UNEQ:
2858 case ORDERED:
2859 case UNORDERED:
2860 case UNLT:
2861 case UNLE:
2862 case UNGT:
2863 case UNGE:
2864 /* vmfeq.vv v0, va, va */
2865 expand_vec_cmp (eq0, EQ, op0, op0);
2866 if (HONOR_SNANS (data_mode))
2869 vmfeq.vv v1, vb, vb
2870 vmand.mm v0, v0, v1
2872 expand_vec_cmp (eq1, EQ, op1, op1);
2873 insn_code icode = code_for_pred (AND, mask_mode);
2874 rtx ops[] = {eq0, eq0, eq1};
2875 emit_vlmax_insn (icode, BINARY_MASK_OP, ops);
2877 else
2879 /* vmfeq.vv v0, vb, vb, v0.t */
2880 expand_vec_cmp (eq0, EQ, op1, op1, eq0, eq0);
2882 break;
2883 default:
2884 gcc_unreachable ();
2887 if (code == ORDERED)
2889 emit_move_insn (target, eq0);
2890 return false;
2893 /* There is native support for the inverse comparison. */
2894 code = reverse_condition_maybe_unordered (code);
2895 if (code == ORDERED)
2896 emit_move_insn (target, eq0);
2897 else
2898 expand_vec_cmp (eq0, code, op0, op1, eq0, eq0);
2900 if (can_invert_p)
2902 emit_move_insn (target, eq0);
2903 return true;
2906 /* We use one_cmpl<mode>2 so that the combine pass can combine mask
2907 instructions into vmand.mm/vmnor.mm/vmnand.mm/vmxnor.mm. */
2908 emit_insn (gen_rtx_SET (target, gen_rtx_NOT (mask_mode, eq0)));
2909 return false;
2912 /* Modulo all SEL indices to ensure they are all in the range [0, MAX_SEL].
2913 MAX_SEL is nunits - 1 if rtx_equal_p (op0, op1). Otherwise, it is
2914 2 * nunits - 1. */
2915 static rtx
2916 modulo_sel_indices (rtx op0, rtx op1, rtx sel)
2918 rtx sel_mod;
2919 machine_mode sel_mode = GET_MODE (sel);
2920 poly_uint64 nunits = GET_MODE_NUNITS (sel_mode);
2921 poly_uint64 max_sel = rtx_equal_p (op0, op1) ? nunits - 1 : 2 * nunits - 1;
2922 /* If SEL is a variable-length CONST_VECTOR, we don't need to modulo it.
2923 Likewise, if SEL is constant-length with all indices within [0, MAX_SEL],
2924 there is no need to modulo them. */
2925 if (CONST_VECTOR_P (sel)
2926 && (!nunits.is_constant () || const_vec_all_in_range_p (sel, 0, max_sel)))
2927 sel_mod = sel;
2928 else
2930 rtx mod = gen_const_vector_dup (sel_mode, max_sel);
2931 sel_mod
2932 = expand_simple_binop (sel_mode, AND, sel, mod, NULL, 0, OPTAB_DIRECT);
2934 return sel_mod;
2937 /* Implement vec_perm<mode>. */
2939 void
2940 expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
2942 machine_mode data_mode = GET_MODE (target);
2943 machine_mode sel_mode = GET_MODE (sel);
2944 poly_uint64 nunits = GET_MODE_NUNITS (sel_mode);
2946 /* Check if the selector only references the first value vector, i.e. each
2947 select index is in the range [0, nunits - 1]. A single vrgather instruction
2948 is enough. Since we will use vrgatherei16.vv for variable-length vectors,
2949 the index is never out of range and we don't need to modulo it. */
2950 if (nunits.is_constant () && const_vec_all_in_range_p (sel, 0, nunits - 1))
2952 emit_vlmax_gather_insn (target, op0, sel);
2953 return;
2956 /* Check if all the indices are same. */
2957 rtx elt;
2958 if (const_vec_duplicate_p (sel, &elt))
2960 poly_uint64 value = rtx_to_poly_int64 (elt);
2961 rtx op = op0;
2962 if (maybe_gt (value, nunits - 1))
2964 sel = gen_const_vector_dup (sel_mode, value - nunits);
2965 op = op1;
2967 emit_vlmax_gather_insn (target, op, sel);
2970 /* Note: vec_perm indices are supposed to wrap when they go beyond the
2971 size of the two value vectors, i.e. the upper bits of the indices
2972 are effectively ignored. RVV vrgather instead produces 0 for any
2973 out-of-range indices, so we need to modulo all the vec_perm indices
2974 to ensure they are all in range of [0, nunits - 1] when op0 == op1
2975 or all in range of [0, 2 * nunits - 1] when op0 != op1. */
2976 rtx sel_mod = modulo_sel_indices (op0, op1, sel);
2978 /* Check if the two value vectors are the same. */
2979 if (rtx_equal_p (op0, op1))
2981 emit_vlmax_gather_insn (target, op0, sel_mod);
2982 return;
2985 /* The following sequence handles the case of
2986 __builtin_shufflevector (vec1, vec2, index...), where the index can be
2987 any value in the range [0, 2 * nunits - 1]. */
2988 machine_mode mask_mode;
2989 mask_mode = get_mask_mode (data_mode);
2990 rtx mask = gen_reg_rtx (mask_mode);
2991 rtx max_sel = gen_const_vector_dup (sel_mode, nunits);
2993 /* Step 1: generate a mask that should select everything >= nunits into the
2994 * mask. */
2995 expand_vec_cmp (mask, GEU, sel_mod, max_sel);
2997 /* Step 2: gather every op0 value indexed by sel into target;
2998 we don't need to care about the result of the elements
2999 whose index >= nunits. */
3000 emit_vlmax_gather_insn (target, op0, sel_mod);
3002 /* Step 3: shift the range from (nunits, max_of_mode] to
3003 [0, max_of_mode - nunits]. */
3004 rtx tmp = gen_reg_rtx (sel_mode);
3005 rtx ops[] = {tmp, sel_mod, max_sel};
3006 emit_vlmax_insn (code_for_pred (MINUS, sel_mode), BINARY_OP, ops);
3008 /* Step 4: gather those into the previously masked-out elements
3009 of target. */
3010 emit_vlmax_masked_gather_mu_insn (target, op1, tmp, mask);
3013 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST for RVV. */
3015 /* vec_perm support. */
3017 struct expand_vec_perm_d
3019 rtx target, op0, op1;
3020 vec_perm_indices perm;
3021 machine_mode vmode;
3022 machine_mode op_mode;
3023 bool one_vector_p;
3024 bool testing_p;
3027 /* Return the appropriate index mode for gather instructions. */
3028 opt_machine_mode
3029 get_gather_index_mode (struct expand_vec_perm_d *d)
3031 machine_mode sel_mode = related_int_vector_mode (d->vmode).require ();
3032 poly_uint64 nunits = GET_MODE_NUNITS (d->vmode);
3034 if (GET_MODE_INNER (d->vmode) == QImode)
3036 if (nunits.is_constant ())
3038 /* If the indices form an LMUL8 CONST_VECTOR and any element value
3039 exceeds the range 0 ~ 255, forbid such a permutation
3040 since we would need a vector HI mode to hold the indices and
3041 we don't have it. */
3042 if (!d->perm.all_in_range_p (0, 255)
3043 && !get_vector_mode (HImode, nunits).exists (&sel_mode))
3044 return opt_machine_mode ();
3046 else
3048 /* Permuting two SEW8 variable-length vectors needs vrgatherei16.vv.
3049 Otherwise, it could overflow the index range. */
3050 if (!get_vector_mode (HImode, nunits).exists (&sel_mode))
3051 return opt_machine_mode ();
3054 else if (riscv_get_v_regno_alignment (sel_mode) > 1
3055 && GET_MODE_INNER (sel_mode) != HImode)
3056 sel_mode = get_vector_mode (HImode, nunits).require ();
3057 return sel_mode;
3060 /* Recognize the patterns where we can use a merge operation to shuffle the
3061 vectors. The value of each element (index i) in the selector can only be
3062 either i or nunits + i. We will check that the pattern is actually monotonic.
3064 E.g.
3065 v = VEC_PERM_EXPR (v0, v1, selector),
3066 selector = { 0, nunits + 1, 2, nunits + 3, 4, nunits + 5, ... }
3068 We can transform such pattern into:
3070 v = vcond_mask (v0, v1, mask),
3071 mask = { 0, 1, 0, 1, 0, 1, ... }. */
3073 static bool
3074 shuffle_merge_patterns (struct expand_vec_perm_d *d)
3076 machine_mode vmode = d->vmode;
3077 machine_mode sel_mode = related_int_vector_mode (vmode).require ();
3078 int n_patterns = d->perm.encoding ().npatterns ();
3079 poly_int64 vec_len = d->perm.length ();
3081 for (int i = 0; i < n_patterns; ++i)
3082 if (!known_eq (d->perm[i], i) && !known_eq (d->perm[i], vec_len + i))
3083 return false;
3085 /* Check the pattern is monotonic here, otherwise, return false. */
3086 for (int i = n_patterns; i < n_patterns * 2; i++)
3087 if (!d->perm.series_p (i, n_patterns, i, n_patterns)
3088 && !d->perm.series_p (i, n_patterns, vec_len + i, n_patterns))
3089 return false;
3091 /* We need to use a precomputed mask for such a situation, and such a mask
3092 can only be computed for compile-time known size modes. */
3093 bool indices_fit_selector_p
3094 = GET_MODE_BITSIZE (GET_MODE_INNER (vmode)) > 8 || known_lt (vec_len, 256);
3095 if (!indices_fit_selector_p && !vec_len.is_constant ())
3096 return false;
3098 if (d->testing_p)
3099 return true;
3101 machine_mode mask_mode = get_mask_mode (vmode);
3102 rtx mask = gen_reg_rtx (mask_mode);
3104 if (indices_fit_selector_p)
3106 /* MASK = SELECTOR < NUNITS ? 1 : 0. */
3107 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
3108 rtx x = gen_int_mode (vec_len, GET_MODE_INNER (sel_mode));
3109 insn_code icode = code_for_pred_cmp_scalar (sel_mode);
3110 rtx cmp = gen_rtx_fmt_ee (LTU, mask_mode, sel, x);
3111 rtx ops[] = {mask, cmp, sel, x};
3112 emit_vlmax_insn (icode, COMPARE_OP, ops);
3114 else
3116 /* For EEW8, NUNITS may be larger than 255, so we can't use vmsltu
3117 directly to generate the selector mask; instead, we can only use a
3118 precomputed mask.
3120 E.g. selector = <0, 257, 2, 259> for EEW8 vector with NUNITS = 256, we
3121 don't have a QImode scalar register to hold larger than 255.
3122 We also cannot hold that in a vector QImode register if LMUL = 8, and,
3123 since there is no larger HI mode vector we cannot create a larger
3124 selector.
3126 As the mask is a simple {0, 1, ...} pattern and the length is known we
3127 can store it in a scalar register and broadcast it to a mask register.
3129 gcc_assert (vec_len.is_constant ());
3130 int size = CEIL (GET_MODE_NUNITS (mask_mode).to_constant (), 8);
3131 machine_mode mode = get_vector_mode (QImode, size).require ();
3132 rtx tmp = gen_reg_rtx (mode);
3133 rvv_builder v (mode, 1, size);
3134 for (int i = 0; i < vec_len.to_constant () / 8; i++)
3136 uint8_t value = 0;
3137 for (int j = 0; j < 8; j++)
3139 int index = i * 8 + j;
3140 if (known_lt (d->perm[index], 256))
3141 value |= 1 << j;
3143 v.quick_push (gen_int_mode (value, QImode));
3145 emit_move_insn (tmp, v.build ());
3146 emit_move_insn (mask, gen_lowpart (mask_mode, tmp));
3149 /* TARGET = MASK ? OP0 : OP1. */
3150 /* swap op0 and op1 since the order is opposite to pred_merge. */
3151 rtx ops2[] = {d->target, d->op1, d->op0, mask};
3152 emit_vlmax_insn (code_for_pred_merge (vmode), MERGE_OP, ops2);
3153 return true;
3156 /* Recognize consecutive indices for which we can use a single
3157 vrgather.v[x|i] to shuffle the vectors.
3159 e.g. short[8] = VEC_PERM_EXPR <a, a, {0,1,0,1,0,1,0,1}>
3160 Use SEW = 32, index = 1 vrgather.vi to get the result. */
3161 static bool
3162 shuffle_consecutive_patterns (struct expand_vec_perm_d *d)
3164 machine_mode vmode = d->vmode;
3165 scalar_mode smode = GET_MODE_INNER (vmode);
3166 poly_int64 vec_len = d->perm.length ();
3167 HOST_WIDE_INT elt;
3169 if (!vec_len.is_constant () || !d->perm[0].is_constant (&elt))
3170 return false;
3171 int vlen = vec_len.to_constant ();
3173 /* Compute the last element index of consecutive pattern from the leading
3174 consecutive elements. */
3175 int last_consecutive_idx = -1;
3176 int consecutive_num = -1;
3177 for (int i = 1; i < vlen; i++)
3179 if (maybe_ne (d->perm[i], d->perm[i - 1] + 1))
3180 break;
3181 last_consecutive_idx = i;
3182 consecutive_num = last_consecutive_idx + 1;
3185 int new_vlen = vlen / consecutive_num;
3186 if (last_consecutive_idx < 0 || consecutive_num == vlen
3187 || !pow2p_hwi (consecutive_num) || !pow2p_hwi (new_vlen))
3188 return false;
3189 /* VEC_PERM <..., (index, index + 1, ... index + consecutive_num - 1)>.
3190 All elements of index, index + 1, ... index + consecutive_num - 1 should
3191 be located in the same vector. */
3192 if (maybe_ge (d->perm[0], vec_len)
3193 != maybe_ge (d->perm[last_consecutive_idx], vec_len))
3194 return false;
3195 /* If a vector has 8 elements, we allow optimizations on consecutive
3196 patterns e.g. <0, 1, 2, 3, 0, 1, 2, 3> or <4, 5, 6, 7, 4, 5, 6, 7>.
3197 Other patterns like <2, 3, 4, 5, 2, 3, 4, 5> are not feasible patterns
3198 to be optimized. */
3199 if (d->perm[0].to_constant () % consecutive_num != 0)
3200 return false;
3201 unsigned int container_bits = consecutive_num * GET_MODE_BITSIZE (smode);
3202 if (container_bits > 64)
3203 return false;
3204 else if (container_bits == 64)
3206 if (!TARGET_VECTOR_ELEN_64)
3207 return false;
3208 else if (FLOAT_MODE_P (smode) && !TARGET_VECTOR_ELEN_FP_64)
3209 return false;
3212 /* Check that the rest of the elements follow the same consecutive pattern. */
3213 for (int i = consecutive_num; i < vlen; i++)
3214 if (maybe_ne (d->perm[i], d->perm[i % consecutive_num]))
3215 return false;
3217 if (FLOAT_MODE_P (smode))
3218 smode = float_mode_for_size (container_bits).require ();
3219 else
3220 smode = int_mode_for_size (container_bits, 0).require ();
3221 if (!get_vector_mode (smode, new_vlen).exists (&vmode))
3222 return false;
3223 machine_mode sel_mode = related_int_vector_mode (vmode).require ();
3225 /* Success! */
3226 if (d->testing_p)
3227 return true;
3229 int index = elt / consecutive_num;
3230 if (index >= new_vlen)
3231 index = index - new_vlen;
3232 rtx sel = gen_const_vector_dup (sel_mode, index);
3233 rtx op = elt >= vlen ? d->op0 : d->op1;
3234 emit_vlmax_gather_insn (gen_lowpart (vmode, d->target),
3235 gen_lowpart (vmode, op), sel);
3236 return true;
3239 /* Recognize the patterns where we can use a compress operation to shuffle
3240 the vectors. The perm selector of a compress pattern is divided into 2 parts:
3241 The first part is arbitrary index numbers < NUNITS.
3242 The second part is the last N consecutive index numbers >= NUNITS.
3244 E.g.
3245 v = VEC_PERM_EXPR (v0, v1, selector),
3246 selector = { 0, 2, 6, 7 }
3248 We can transform such pattern into:
3250 op1 = vcompress (op0, mask)
3251 mask = { 1, 0, 1, 0 }
3252 v = op1. */
3254 static bool
3255 shuffle_compress_patterns (struct expand_vec_perm_d *d)
3257 machine_mode vmode = d->vmode;
3258 poly_int64 vec_len = d->perm.length ();
3260 if (!vec_len.is_constant ())
3261 return false;
3263 int vlen = vec_len.to_constant ();
3265 /* It's not worthwhile if the compress pattern has fewer than 4 elements,
3266 and we can't modulo indices for the compress pattern. */
3267 if (known_ge (d->perm[vlen - 1], vlen * 2) || vlen < 4)
3268 return false;
3270 /* Compress pattern doesn't work for one vector. */
3271 if (d->one_vector_p)
3272 return false;
3274 /* The compress point is the index from which all selector values with
3275 index i >= compress point form a consecutive increasing series and
3276 each selector value is >= NUNITS. In this case, we can compress all
3277 elements with i < compress point into op1. */
3278 int compress_point = -1;
3279 for (int i = 0; i < vlen; i++)
3281 if (compress_point < 0 && known_ge (d->perm[i], vec_len))
3283 compress_point = i;
3284 break;
3288 /* We don't apply compress approach if we can't find the compress point. */
3289 if (compress_point < 0)
3290 return false;
3292 /* We can only apply compress approach when all index values from 0 to
3293 compress point are increasing. */
3294 for (int i = 1; i < compress_point; i++)
3295 if (maybe_le (d->perm[i], d->perm[i - 1]))
3296 return false;
3298 /* It must be series increasing from compress point. */
3299 for (int i = 1 + compress_point; i < vlen; i++)
3300 if (maybe_ne (d->perm[i], d->perm[i - 1] + 1))
3301 return false;
3303 /* Success! */
3304 if (d->testing_p)
3305 return true;
3307 /* Check whether we need to slide up op1 to apply the compress approach.
3309 E.g. for index = { 0, 2, 6, 7 }, d->perm[vlen - 1] = 7, which
3310 is 2 * NUNITS - 1, so we don't need to slide up.
3312 For index = { 0, 2, 5, 6 }, we need to slide op1 up before
3313 we apply the compress approach. */
3314 bool need_slideup_p = maybe_ne (d->perm[vlen - 1], 2 * vec_len - 1)
3315 && !const_vec_duplicate_p (d->op1);
3317 /* If we leave it to be handled directly by the general gather,
3318 the code sequence will be:
3319 VECTOR LOAD selector
3320 GEU mask, selector, NUNITS
3321 GATHER dest, op0, selector
3322 SUB selector, selector, NUNITS
3323 GATHER dest, op1, selector, mask
3324 Each ALU operation is considered as COST = 1 and VECTOR LOAD is considered
3325 as COST = 4. So, we consider the general gather handling COST = 9.
3326 TODO: This cost is not accurate, we can adjust it by tune info. */
3327 int general_cost = 9;
3329 /* If we can use compress approach, the code sequence will be:
3330 MASK LOAD mask
3331 COMPRESS op1, op0, mask
3332 If it needs slide up, it will be:
3333 MASK LOAD mask
3334 SLIDEUP op1
3335 COMPRESS op1, op0, mask
3336 By default, mask load COST = 2.
3337 TODO: This cost is not accurate, we can adjust it by tune info. */
3338 int compress_cost = 4;
3340 if (general_cost <= compress_cost)
3341 return false;
3343 /* Build a mask that is true for the op0 elements selected before the compress point. */
3344 machine_mode mask_mode = get_mask_mode (vmode);
3345 rvv_builder builder (mask_mode, vlen, 1);
3346 for (int i = 0; i < vlen; i++)
3348 bool is_compress_index = false;
3349 for (int j = 0; j < compress_point; j++)
3351 if (known_eq (d->perm[j], i))
3353 is_compress_index = true;
3354 break;
3357 if (is_compress_index)
3358 builder.quick_push (CONST1_RTX (BImode));
3359 else
3360 builder.quick_push (CONST0_RTX (BImode));
3362 rtx mask = force_reg (mask_mode, builder.build ());
3364 rtx merge = d->op1;
3365 if (need_slideup_p)
3367 int slideup_cnt = vlen - (d->perm[vlen - 1].to_constant () % vlen) - 1;
3368 merge = gen_reg_rtx (vmode);
3369 rtx ops[] = {merge, d->op1, gen_int_mode (slideup_cnt, Pmode)};
3370 insn_code icode = code_for_pred_slide (UNSPEC_VSLIDEUP, vmode);
3371 emit_vlmax_insn (icode, BINARY_OP, ops);
3374 insn_code icode = code_for_pred_compress (vmode);
3375 rtx ops[] = {d->target, merge, d->op0, mask};
3376 emit_vlmax_insn (icode, COMPRESS_OP_MERGE, ops);
3377 return true;
3380 /* Recognize decompress patterns:
3382 1. VEC_PERM_EXPR op0 and op1
3383 with isel = { 0, nunits, 1, nunits + 1, ... }.
3384 Decompress op0 and op1 vector with the mask = { 0, 1, 0, 1, ... }.
3386 2. VEC_PERM_EXPR op0 and op1
3387 with isel = { 1/2 nunits, 3/2 nunits, 1/2 nunits+1, 3/2 nunits+1,... }.
3388 Slide down op0 and op1 with OFFSET = 1/2 nunits.
3389 Decompress op0 and op1 vector with the mask = { 0, 1, 0, 1, ... }.
3391 static bool
3392 shuffle_decompress_patterns (struct expand_vec_perm_d *d)
3394 poly_uint64 nelt = d->perm.length ();
3395 machine_mode mask_mode = get_mask_mode (d->vmode);
3397 /* For constant size indices, we don't need to handle them here.
3398 Just leave them to vec_perm<mode>. */
3399 if (d->perm.length ().is_constant ())
3400 return false;
3402 poly_uint64 first = d->perm[0];
3403 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
3404 || !d->perm.series_p (0, 2, first, 1)
3405 || !d->perm.series_p (1, 2, first + nelt, 1))
3406 return false;
3408 /* Permuting two SEW8 variable-length vectors needs vrgatherei16.vv.
3409 Otherwise, it could overflow the index range. */
3410 machine_mode sel_mode = related_int_vector_mode (d->vmode).require ();
3411 if (GET_MODE_INNER (d->vmode) == QImode
3412 && !get_vector_mode (HImode, nelt).exists (&sel_mode))
3413 return false;
3415 /* Success! */
3416 if (d->testing_p)
3417 return true;
3419 rtx op0, op1;
3420 if (known_eq (first, 0U))
3422 op0 = d->op0;
3423 op1 = d->op1;
3425 else
3427 op0 = gen_reg_rtx (d->vmode);
3428 op1 = gen_reg_rtx (d->vmode);
3429 insn_code icode = code_for_pred_slide (UNSPEC_VSLIDEDOWN, d->vmode);
3430 rtx ops0[] = {op0, d->op0, gen_int_mode (first, Pmode)};
3431 rtx ops1[] = {op1, d->op1, gen_int_mode (first, Pmode)};
3432 emit_vlmax_insn (icode, BINARY_OP, ops0);
3433 emit_vlmax_insn (icode, BINARY_OP, ops1);
3435 /* Generate the { 0, 1, 0, 1, ... } mask. */
3436 rtx vid = gen_reg_rtx (sel_mode);
3437 rtx vid_repeat = gen_reg_rtx (sel_mode);
3438 expand_vec_series (vid, const0_rtx, const1_rtx);
3439 rtx and_ops[] = {vid_repeat, vid, const1_rtx};
3440 emit_vlmax_insn (code_for_pred_scalar (AND, sel_mode), BINARY_OP, and_ops);
3441 rtx const_vec = gen_const_vector_dup (sel_mode, 1);
3442 rtx mask = gen_reg_rtx (mask_mode);
3443 expand_vec_cmp (mask, EQ, vid_repeat, const_vec);
3444 emit_vlmax_decompress_insn (d->target, op0, op1, mask);
3445 return true;
3448 static bool
3449 shuffle_bswap_pattern (struct expand_vec_perm_d *d)
3451 HOST_WIDE_INT diff;
3452 unsigned i, size, step;
3454 if (!d->one_vector_p || !d->perm[0].is_constant (&diff) || !diff)
3455 return false;
3457 step = diff + 1;
3458 size = step * GET_MODE_UNIT_BITSIZE (d->vmode);
3460 switch (size)
3462 case 16:
3463 break;
3464 case 32:
3465 case 64:
3466 /* We will have VEC_PERM_EXPR after rtl expand when invoking
3467 __builtin_bswap. It will generate about 9 instructions in the
3468 loop as below, no matter whether it is bswap16, bswap32 or bswap64.
3469 .L2:
3470 1 vle16.v v4,0(a0)
3471 2 vmv.v.x v2,a7
3472 3 vand.vv v2,v6,v2
3473 4 slli a2,a5,1
3474 5 vrgatherei16.vv v1,v4,v2
3475 6 sub a4,a4,a5
3476 7 vse16.v v1,0(a3)
3477 8 add a0,a0,a2
3478 9 add a3,a3,a2
3479 bne a4,zero,.L2
3481 But for bswap16 we may have an even simpler code sequence, which
3482 has only 7 instructions in the loop as below.
3484 1 vle8.v v2,0(a5)
3485 2 addi a5,a5,32
3486 3 vsrl.vi v4,v2,8
3487 4 vsll.vi v2,v2,8
3488 5 vor.vv v4,v4,v2
3489 6 vse8.v v4,0(a4)
3490 7 addi a4,a4,32
3491 bne a5,a6,.L5
3493 Unfortunately, the instructions in the loop would grow to 13 and 24
3494 for bswap32 and bswap64. Thus, we leverage vrgather (9 insns)
3495 for both bswap64 and bswap32, but use shift and or (7 insns)
3496 for bswap16.
3498 default:
3499 return false;
3502 for (i = 0; i < step; i++)
3503 if (!d->perm.series_p (i, step, diff - i, step))
3504 return false;
3506 /* Disable when nunits < 4 since the later generic approach
3507 is more profitable on BSWAP. */
3508 if (!known_gt (GET_MODE_NUNITS (d->vmode), 2))
3509 return false;
3511 if (d->testing_p)
3512 return true;
3514 machine_mode vhi_mode;
3515 poly_uint64 vhi_nunits = exact_div (GET_MODE_NUNITS (d->vmode), 2);
3517 if (!get_vector_mode (HImode, vhi_nunits).exists (&vhi_mode))
3518 return false;
3520 /* Step-1: Move op0 to src with VHI mode. */
3521 rtx src = gen_reg_rtx (vhi_mode);
3522 emit_move_insn (src, gen_lowpart (vhi_mode, d->op0));
3524 /* Step-2: Shift right 8 bits to dest. */
3525 rtx dest = expand_binop (vhi_mode, lshr_optab, src, gen_int_mode (8, Pmode),
3526 NULL_RTX, 0, OPTAB_DIRECT);
3528 /* Step-3: Shift left 8 bits to src. */
3529 src = expand_binop (vhi_mode, ashl_optab, src, gen_int_mode (8, Pmode),
3530 NULL_RTX, 0, OPTAB_DIRECT);
3532 /* Step-4: Logic Or dest and src to dest. */
3533 dest = expand_binop (vhi_mode, ior_optab, dest, src,
3534 NULL_RTX, 0, OPTAB_DIRECT);
3536 /* Step-5: Move dest to target with VQI mode. */
3537 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
3539 return true;
3542 /* Recognize the pattern that can be shuffled by vec_extract and slide1up
3543 approach. */
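/* E.g. with nunits = 4 the selector is { 3, 4, 5, 6 }: the last element of
   op0 is extracted and slid into element 0 of op1 (illustrative example).  */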
3545 static bool
3546 shuffle_extract_and_slide1up_patterns (struct expand_vec_perm_d *d)
3548 poly_int64 nunits = GET_MODE_NUNITS (d->vmode);
3550 /* Recognize { nunits - 1, nunits, nunits + 1, ... }. */
3551 if (!d->perm.series_p (0, 2, nunits - 1, 2)
3552 || !d->perm.series_p (1, 2, nunits, 2))
3553 return false;
3555 /* Disable when nunits < 4 since the later generic approach
3556 is more profitable for indices = { nunits - 1, nunits }. */
3557 if (!known_gt (nunits, 2))
3558 return false;
3560 /* Success! */
3561 if (d->testing_p)
3562 return true;
3564 /* Extract the last element of the first vector. */
3565 scalar_mode smode = GET_MODE_INNER (d->vmode);
3566 rtx tmp = gen_reg_rtx (smode);
3567 emit_vec_extract (tmp, d->op0, gen_int_mode (nunits - 1, Pmode));
3569 /* Insert the scalar into element 0. */
3570 unsigned int unspec
3571 = FLOAT_MODE_P (d->vmode) ? UNSPEC_VFSLIDE1UP : UNSPEC_VSLIDE1UP;
3572 insn_code icode = code_for_pred_slide (unspec, d->vmode);
3573 rtx ops[] = {d->target, d->op1, tmp};
3574 emit_vlmax_insn (icode, BINARY_OP, ops);
3575 return true;
3578 /* This looks for a series pattern in the provided vector permute structure D.
3579 If successful it emits a series insn as well as a gather to implement it.
3580 Return true if successful, false otherwise. */
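/* Illustrative examples: { 0, 2, 4, 6, ... } is a full series with step 2,
   while { 5, 0, 1, 2, ... } only has a series from the second element onwards
   and needs the leading 5 inserted with vslide1up afterwards.  */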
3582 static bool
3583 shuffle_series_patterns (struct expand_vec_perm_d *d)
3585 if (!d->one_vector_p || d->perm.encoding ().npatterns () != 1)
3586 return false;
3588 poly_int64 el1 = d->perm[0];
3589 poly_int64 el2 = d->perm[1];
3590 poly_int64 el3 = d->perm[2];
3592 poly_int64 step1 = el2 - el1;
3593 poly_int64 step2 = el3 - el2;
3595 bool need_insert = false;
3596 bool have_series = false;
3598 /* Check for a full series. */
3599 if (known_ne (step1, 0) && d->perm.series_p (0, 1, el1, step1))
3600 have_series = true;
3602 /* Check for a series starting at the second element. */
3603 else if (known_ne (step2, 0) && d->perm.series_p (1, 1, el2, step2))
3605 have_series = true;
3606 need_insert = true;
3609 if (!have_series)
3610 return false;
3612 /* Disable shuffle if we can't find an appropriate integer index mode for
3613 gather. */
3614 machine_mode sel_mode;
3615 if (!get_gather_index_mode (d).exists (&sel_mode))
3616 return false;
3618 /* Success! */
3619 if (d->testing_p)
3620 return true;
3622 /* Create the series. */
3623 machine_mode eltmode = Pmode;
3624 rtx series = gen_reg_rtx (sel_mode);
3625 expand_vec_series (series, gen_int_mode (need_insert ? el2 : el1, eltmode),
3626 gen_int_mode (need_insert ? step2 : step1, eltmode));
3628 /* Insert the remaining element if necessary. */
3629 if (need_insert)
3631 insn_code icode = code_for_pred_slide (UNSPEC_VSLIDE1UP, sel_mode);
3632 rtx ops[]
3633 = {series, series, gen_int_mode (el1, GET_MODE_INNER (sel_mode))};
3634 emit_vlmax_insn (icode, BINARY_OP, ops);
3637 emit_vlmax_gather_insn (d->target, d->op0, series);
3639 return true;
3642 /* Recognize the pattern that can be shuffled by generic approach. */
3644 static bool
3645 shuffle_generic_patterns (struct expand_vec_perm_d *d)
3647 machine_mode sel_mode;
3649 /* We don't enable SLP for non-power of 2 NPATTERNS. */
3650 if (!pow2p_hwi (d->perm.encoding().npatterns ()))
3651 return false;
3653 /* Disable shuffle if we can't find an appropriate integer index mode for
3654 gather. */
3655 if (!get_gather_index_mode (d).exists (&sel_mode))
3656 return false;
3658 /* Success! */
3659 if (d->testing_p)
3660 return true;
3662 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
3663 /* Some FIXED-VLMAX/VLS vector permutation situations call the target hook
3664 instead of expanding vec_perm<mode>; we handle them directly. */
3665 expand_vec_perm (d->target, d->op0, d->op1, sel);
3666 return true;
3669 /* This function recognizes and supports different permutation patterns
3670 and enables VLA SLP auto-vectorization. */
3671 static bool
3672 expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
3674 gcc_assert (d->op_mode != E_VOIDmode);
3676 /* The pattern matching functions above are written to look for a small
3677 number to begin the sequence (0, 1, N/2). If we begin with an index
3678 from the second operand, we can swap the operands. */
3679 poly_int64 nelt = d->perm.length ();
3680 if (known_ge (d->perm[0], nelt))
3682 d->perm.rotate_inputs (1);
3683 std::swap (d->op0, d->op1);
3686 if (known_gt (nelt, 1))
3688 if (d->vmode == d->op_mode)
3690 if (shuffle_merge_patterns (d))
3691 return true;
3692 if (shuffle_consecutive_patterns (d))
3693 return true;
3694 if (shuffle_compress_patterns (d))
3695 return true;
3696 if (shuffle_decompress_patterns (d))
3697 return true;
3698 if (shuffle_bswap_pattern (d))
3699 return true;
3700 if (shuffle_extract_and_slide1up_patterns (d))
3701 return true;
3702 if (shuffle_series_patterns (d))
3703 return true;
3704 if (shuffle_generic_patterns (d))
3705 return true;
3706 return false;
3708 else
3709 return false;
3711 return false;
3714 /* This function implements TARGET_VECTORIZE_VEC_PERM_CONST by using RVV
3715 * instructions. */
3716 bool
3717 expand_vec_perm_const (machine_mode vmode, machine_mode op_mode, rtx target,
3718 rtx op0, rtx op1, const vec_perm_indices &sel)
3720 /* RVV doesn't have mask-type pack/unpack instructions and we don't use a
3721 mask to control the iteration loop. Just disable it directly. */
3722 if (GET_MODE_CLASS (vmode) == MODE_VECTOR_BOOL)
3723 return false;
3724 /* FIXME: Explicitly disable VLA interleave SLP vectorization when we
3725 may encounter an ICE for poly size (1, 1) vectors in the loop vectorizer.
3726 Ideally, the middle-end loop vectorizer should be able to disable it
3727 itself; we can remove the code here once the middle-end is able
3728 to disable VLA SLP vectorization for a poly size (1, 1) VF. */
3729 if (!BYTES_PER_RISCV_VECTOR.is_constant ()
3730 && maybe_lt (BYTES_PER_RISCV_VECTOR * TARGET_MAX_LMUL,
3731 poly_int64 (16, 16)))
3732 return false;
3734 struct expand_vec_perm_d d;
3736 /* Check whether the mask can be applied to a single vector. */
3737 if (sel.ninputs () == 1 || (op0 && rtx_equal_p (op0, op1)))
3738 d.one_vector_p = true;
3739 else if (sel.all_from_input_p (0))
3741 d.one_vector_p = true;
3742 op1 = op0;
3744 else if (sel.all_from_input_p (1))
3746 d.one_vector_p = true;
3747 op0 = op1;
3749 else
3750 d.one_vector_p = false;
3752 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
3753 sel.nelts_per_input ());
3754 d.vmode = vmode;
3755 d.op_mode = op_mode;
3756 d.target = target;
3757 d.op0 = op0;
3758 if (op0 == op1)
3759 d.op1 = d.op0;
3760 else
3761 d.op1 = op1;
3762 d.testing_p = !target;
3764 if (!d.testing_p)
3765 return expand_vec_perm_const_1 (&d);
3767 rtx_insn *last = get_last_insn ();
3768 bool ret = expand_vec_perm_const_1 (&d);
3769 gcc_assert (last == get_last_insn ());
3771 return ret;
3774 /* Generate a vsetvl with no side effects to get the vector length. */
3775 void
3776 expand_select_vl (rtx *ops)
3778 poly_int64 nunits = rtx_to_poly_int64 (ops[2]);
3779 if (CONST_INT_P (ops[1]) && known_le (INTVAL (ops[1]), nunits))
3781 /* If length is known <= VF, we just use the length directly instead
3782 of using vsetvli.
3784 E.g. _255 = .SELECT_VL (3, POLY_INT_CST [4, 4]);
3785 We move 3 into _255 instead of using an explicit vsetvl. */
3786 emit_move_insn (ops[0], ops[1]);
3787 return;
3789 /* We arbitrarily pick QImode as the inner scalar mode to get a vector mode,
3790 since vsetvl only demands the SEW/LMUL ratio. We let the VSETVL pass optimize it. */
3791 scalar_int_mode mode = QImode;
3792 machine_mode rvv_mode = get_vector_mode (mode, nunits).require ();
3793 emit_insn (gen_no_side_effects_vsetvl_rtx (rvv_mode, ops[0], ops[1]));
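/* For example, _7 = .SELECT_VL (n_5, POLY_INT_CST [4, 4]) with a
   non-constant n_5 is expected to become a plain vsetvli, roughly

     vsetvli a0, a1, e8, mf4, ta, ma

   on a 128-bit-VLEN target: only the SEW/LMUL ratio of the chosen QImode
   vector mode matters, and the VSETVL pass may later rewrite or fuse the
   instruction.  The exact vtype encoding shown here is illustrative.  */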
3796 /* Expand MASK_LEN_{LOAD,STORE}. */
3797 void
3798 expand_load_store (rtx *ops, bool is_load)
3800 rtx mask = ops[2];
3801 rtx len = ops[3];
3802 machine_mode mode = GET_MODE (ops[0]);
3804 if (is_vlmax_len_p (mode, len))
3806 /* If the length operand is equal to VF, it is VLMAX load/store. */
3807 if (is_load)
3809 rtx m_ops[] = {ops[0], mask, ops[1]};
3810 emit_vlmax_insn (code_for_pred_mov (mode), UNARY_OP_TAMA, m_ops);
3812 else
3814 len = gen_reg_rtx (Pmode);
3815 emit_vlmax_vsetvl (mode, len);
3816 emit_insn (gen_pred_store (mode, ops[0], mask, ops[1], len,
3817 get_avl_type_rtx (VLMAX)));
3820 else
3822 if (!satisfies_constraint_K (len))
3823 len = force_reg (Pmode, len);
3824 if (is_load)
3826 rtx m_ops[] = {ops[0], mask, ops[1]};
3827 emit_nonvlmax_insn (code_for_pred_mov (mode), UNARY_OP_TAMA, m_ops,
3828 len);
3830 else
3831 emit_insn (gen_pred_store (mode, ops[0], mask, ops[1], len,
3832 get_avl_type_rtx (NONVLMAX)));
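/* As a concrete sketch, a MASK_LEN_LOAD of 32-bit elements with a
   non-VLMAX length in a2 and a mask in v0 is expected to expand to roughly

     vsetvli zero, a2, e32, m1, ta, ma
     vle32.v v8, (a0), v0.t

   whereas the VLMAX case uses a vsetvli with the VLMAX AVL instead of the
   explicit length.  The store path mirrors this with vse32.v; register
   names and the exact vtype are illustrative only.  */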
3836 /* Expand MASK_LEN_STRIDED_LOAD. */
3837 void
3838 expand_strided_load (machine_mode mode, rtx *ops)
3840 rtx v_reg = ops[0];
3841 rtx base = ops[1];
3842 rtx stride = ops[2];
3843 rtx mask = ops[3];
3844 rtx len = ops[4];
3845 poly_int64 len_val;
3847 insn_code icode = code_for_pred_strided_load (mode);
3848 rtx emit_ops[] = {v_reg, mask, gen_rtx_MEM (mode, base), stride};
3850 if (poly_int_rtx_p (len, &len_val)
3851 && known_eq (len_val, GET_MODE_NUNITS (mode)))
3852 emit_vlmax_insn (icode, BINARY_OP_TAMA, emit_ops);
3853 else
3855 len = satisfies_constraint_K (len) ? len : force_reg (Pmode, len);
3856 emit_nonvlmax_insn (icode, BINARY_OP_TAMA, emit_ops, len);
3860 /* Expand MASK_LEN_STRIDED_STORE. */
3861 void
3862 expand_strided_store (machine_mode mode, rtx *ops)
3864 rtx v_reg = ops[2];
3865 rtx base = ops[0];
3866 rtx stride = ops[1];
3867 rtx mask = ops[3];
3868 rtx len = ops[4];
3869 poly_int64 len_val;
3870 rtx vl_type;
3872 if (poly_int_rtx_p (len, &len_val)
3873 && known_eq (len_val, GET_MODE_NUNITS (mode)))
3875 len = gen_reg_rtx (Pmode);
3876 emit_vlmax_vsetvl (mode, len);
3877 vl_type = get_avl_type_rtx (VLMAX);
3879 else
3881 len = satisfies_constraint_K (len) ? len : force_reg (Pmode, len);
3882 vl_type = get_avl_type_rtx (NONVLMAX);
3885 emit_insn (gen_pred_strided_store (mode, gen_rtx_MEM (mode, base),
3886 mask, stride, v_reg, len, vl_type));
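/* Illustrative expansion: a strided load of 32-bit elements with a byte
   stride held in a1 becomes roughly

     vsetvli  zero, a2, e32, m1, ta, ma
     vlse32.v v8, (a0), a1, v0.t

   and the corresponding strided store uses vsse32.v.  When the length
   equals the number of units of the mode, the VLMAX form is used instead.
   Registers and vtype shown here are only for illustration.  */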
3889 /* Return true if the operation is a floating-point operation that needs FRM. */
3890 static bool
3891 needs_fp_rounding (unsigned icode, machine_mode mode)
3893 if (!FLOAT_MODE_P (mode))
3894 return false;
3896 return icode != maybe_code_for_pred (SMIN, mode)
3897 && icode != maybe_code_for_pred (UNSPEC_VFMIN, mode)
3898 && icode != maybe_code_for_pred (SMAX, mode)
3899 && icode != maybe_code_for_pred (UNSPEC_VFMAX, mode)
3900 && icode != maybe_code_for_pred (NEG, mode)
3901 && icode != maybe_code_for_pred (ABS, mode)
3902 /* narrower-FP -> FP */
3903 && icode != maybe_code_for_pred_extend (mode)
3904 /* narrower-INT -> FP */
3905 && icode != maybe_code_for_pred_widen (FLOAT, mode)
3906 && icode != maybe_code_for_pred_widen (UNSIGNED_FLOAT, mode)
3907 /* vfsgnj */
3908 && icode != maybe_code_for_pred (UNSPEC_VCOPYSIGN, mode)
3909 && icode != maybe_code_for_pred_mov (mode);
3912 /* Subroutine to expand COND_LEN_* patterns. */
3913 static void
3914 expand_cond_len_op (unsigned icode, insn_flags op_type, rtx *ops, rtx len)
3916 rtx dest = ops[0];
3917 rtx mask = ops[1];
3918 machine_mode mode = GET_MODE (dest);
3919 machine_mode mask_mode = GET_MODE (mask);
3920 bool is_dummy_mask = rtx_equal_p (mask, CONSTM1_RTX (mask_mode));
3921 bool is_vlmax_len = is_vlmax_len_p (mode, len);
3923 unsigned insn_flags = HAS_DEST_P | HAS_MASK_P | HAS_MERGE_P | op_type;
3924 /* FIXME: We don't support simplification of COND_LEN_NEG (..., dummy len,
3925 dummy mask) into NEG_EXPR in GIMPLE FOLD yet. So, we do such
3926 simplification in the RISC-V backend and may move it to the middle-end
3927 in the future. */
3928 if (is_dummy_mask && is_vlmax_len)
3929 insn_flags |= TDEFAULT_POLICY_P | MDEFAULT_POLICY_P;
3930 else if (is_dummy_mask)
3931 insn_flags |= TU_POLICY_P | MDEFAULT_POLICY_P;
3932 else if (is_vlmax_len)
3933 insn_flags |= TDEFAULT_POLICY_P | MU_POLICY_P;
3934 else
3935 insn_flags |= TU_POLICY_P | MU_POLICY_P;
3937 if (needs_fp_rounding (icode, mode))
3938 insn_flags |= FRM_DYN_P;
3940 if (is_vlmax_len)
3941 emit_vlmax_insn (icode, insn_flags, ops);
3942 else
3943 emit_nonvlmax_insn (icode, insn_flags, ops, len);
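/* For example, a COND_LEN_ADD whose mask is all-ones and whose length is
   VLMAX degenerates into a plain vadd.vv with the ta,ma policy, while a
   COND_LEN_ADD with a real mask and a partial length must preserve the
   merge operand and therefore uses tu,mu, roughly

     vsetvli zero, a2, e32, m1, tu, mu
     vadd.vv v8, v9, v10, v0.t

   where v8 initially holds the merge (else) value.  This is only a sketch
   of the policy selection above, not the exact emitted RTL.  */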
3946 /* Return RVV_VUNDEF if the ELSE value is a scratch rtx. */
3947 static rtx
3948 get_else_operand (rtx op)
3950 return GET_CODE (op) == SCRATCH ? RVV_VUNDEF (GET_MODE (op)) : op;
3953 /* Expand unary ops COND_LEN_*. */
3954 void
3955 expand_cond_len_unop (unsigned icode, rtx *ops)
3957 rtx dest = ops[0];
3958 rtx mask = ops[1];
3959 rtx src = ops[2];
3960 rtx merge = get_else_operand (ops[3]);
3961 rtx len = ops[4];
3963 rtx cond_ops[] = {dest, mask, merge, src};
3964 expand_cond_len_op (icode, UNARY_OP_P, cond_ops, len);
3967 /* Expand unary ops COND_*. */
3968 void
3969 expand_cond_unop (unsigned icode, rtx *ops)
3971 rtx dest = ops[0];
3972 rtx mask = ops[1];
3973 rtx src = ops[2];
3974 rtx merge = get_else_operand (ops[3]);
3975 rtx len = gen_int_mode (GET_MODE_NUNITS (GET_MODE (dest)), Pmode);
3977 rtx cond_ops[] = {dest, mask, merge, src};
3978 expand_cond_len_op (icode, UNARY_OP_P, cond_ops, len);
3981 /* Expand binary ops COND_LEN_*. */
3982 void
3983 expand_cond_len_binop (unsigned icode, rtx *ops)
3985 rtx dest = ops[0];
3986 rtx mask = ops[1];
3987 rtx src1 = ops[2];
3988 rtx src2 = ops[3];
3989 rtx merge = get_else_operand (ops[4]);
3990 rtx len = ops[5];
3992 rtx cond_ops[] = {dest, mask, merge, src1, src2};
3993 expand_cond_len_op (icode, BINARY_OP_P, cond_ops, len);
3996 /* Expand binary ops COND_*. */
3997 void
3998 expand_cond_binop (unsigned icode, rtx *ops)
4000 rtx dest = ops[0];
4001 rtx mask = ops[1];
4002 rtx src1 = ops[2];
4003 rtx src2 = ops[3];
4004 rtx merge = get_else_operand (ops[4]);
4005 rtx len = gen_int_mode (GET_MODE_NUNITS (GET_MODE (dest)), Pmode);
4007 rtx cond_ops[] = {dest, mask, merge, src1, src2};
4008 expand_cond_len_op (icode, BINARY_OP_P, cond_ops, len);
4011 /* Prepare insn_code for gather_load/scatter_store according to
4012 the vector mode and index mode. */
4013 static insn_code
4014 prepare_gather_scatter (machine_mode vec_mode, machine_mode idx_mode,
4015 bool is_load)
4017 if (!is_load)
4018 return code_for_pred_indexed_store (UNSPEC_UNORDERED, vec_mode, idx_mode);
4019 else
4021 unsigned src_eew_bitsize = GET_MODE_BITSIZE (GET_MODE_INNER (idx_mode));
4022 unsigned dst_eew_bitsize = GET_MODE_BITSIZE (GET_MODE_INNER (vec_mode));
4023 if (dst_eew_bitsize == src_eew_bitsize)
4024 return code_for_pred_indexed_load_same_eew (UNSPEC_UNORDERED, vec_mode);
4025 else if (dst_eew_bitsize > src_eew_bitsize)
4027 unsigned factor = dst_eew_bitsize / src_eew_bitsize;
4028 switch (factor)
4030 case 2:
4031 return code_for_pred_indexed_load_x2_greater_eew (
4032 UNSPEC_UNORDERED, vec_mode);
4033 case 4:
4034 return code_for_pred_indexed_load_x4_greater_eew (
4035 UNSPEC_UNORDERED, vec_mode);
4036 case 8:
4037 return code_for_pred_indexed_load_x8_greater_eew (
4038 UNSPEC_UNORDERED, vec_mode);
4039 default:
4040 gcc_unreachable ();
4043 else
4045 unsigned factor = src_eew_bitsize / dst_eew_bitsize;
4046 switch (factor)
4048 case 2:
4049 return code_for_pred_indexed_load_x2_smaller_eew (
4050 UNSPEC_UNORDERED, vec_mode);
4051 case 4:
4052 return code_for_pred_indexed_load_x4_smaller_eew (
4053 UNSPEC_UNORDERED, vec_mode);
4054 case 8:
4055 return code_for_pred_indexed_load_x8_smaller_eew (
4056 UNSPEC_UNORDERED, vec_mode);
4057 default:
4058 gcc_unreachable ();
4064 /* Expand LEN_MASK_{GATHER_LOAD,SCATTER_STORE}. */
4065 void
4066 expand_gather_scatter (rtx *ops, bool is_load)
4068 rtx ptr, vec_offset, vec_reg;
4069 bool zero_extend_p;
4070 int shift;
4071 rtx mask = ops[5];
4072 rtx len = ops[6];
4073 if (is_load)
4075 vec_reg = ops[0];
4076 ptr = ops[1];
4077 vec_offset = ops[2];
4078 zero_extend_p = INTVAL (ops[3]);
4079 shift = exact_log2 (INTVAL (ops[4]));
4081 else
4083 vec_reg = ops[4];
4084 ptr = ops[0];
4085 vec_offset = ops[1];
4086 zero_extend_p = INTVAL (ops[2]);
4087 shift = exact_log2 (INTVAL (ops[3]));
4090 machine_mode vec_mode = GET_MODE (vec_reg);
4091 machine_mode idx_mode = GET_MODE (vec_offset);
4092 scalar_mode inner_idx_mode = GET_MODE_INNER (idx_mode);
4093 unsigned inner_offsize = GET_MODE_BITSIZE (inner_idx_mode);
4094 poly_int64 nunits = GET_MODE_NUNITS (vec_mode);
4095 bool is_vlmax = is_vlmax_len_p (vec_mode, len);
4097 bool use_widening_shift = false;
4099 /* Extend the offset element to address width. */
4100 if (inner_offsize < BITS_PER_WORD)
4102 use_widening_shift = TARGET_ZVBB && zero_extend_p && shift == 1;
4103 /* 7.2. Vector Load/Store Addressing Modes.
4104 If the vector offset elements are narrower than XLEN, they are
4105 zero-extended to XLEN before adding to the ptr effective address. If
4106 the vector offset elements are wider than XLEN, the least-significant
4107 XLEN bits are used in the address calculation. An implementation must
4108 raise an illegal instruction exception if the EEW is not supported for
4109 offset elements.
4111 RVV spec only refers to the shift == 0 case. */
4112 if (!zero_extend_p || shift)
4114 if (zero_extend_p)
4115 inner_idx_mode
4116 = int_mode_for_size (inner_offsize * 2, 0).require ();
4117 else
4118 inner_idx_mode = int_mode_for_size (BITS_PER_WORD, 0).require ();
4119 machine_mode new_idx_mode
4120 = get_vector_mode (inner_idx_mode, nunits).require ();
4121 if (!use_widening_shift)
4123 rtx tmp = gen_reg_rtx (new_idx_mode);
4124 emit_insn (gen_extend_insn (tmp, vec_offset, new_idx_mode, idx_mode,
4125 zero_extend_p ? true : false));
4126 vec_offset = tmp;
4128 idx_mode = new_idx_mode;
4132 if (shift)
4134 rtx tmp;
4135 if (!use_widening_shift)
4136 tmp = expand_binop (idx_mode, ashl_optab, vec_offset,
4137 gen_int_mode (shift, Pmode), NULL_RTX, 0,
4138 OPTAB_DIRECT);
4139 else
4141 tmp = gen_reg_rtx (idx_mode);
4142 insn_code icode = code_for_pred_vwsll_scalar (idx_mode);
4143 rtx ops[] = {tmp, vec_offset, const1_rtx};
4144 emit_vlmax_insn (icode, BINARY_OP, ops);
4147 vec_offset = tmp;
4150 insn_code icode = prepare_gather_scatter (vec_mode, idx_mode, is_load);
4151 if (is_vlmax)
4153 if (is_load)
4155 rtx load_ops[]
4156 = {vec_reg, mask, ptr, vec_offset};
4157 emit_vlmax_insn (icode, BINARY_OP_TAMA, load_ops);
4159 else
4161 rtx store_ops[] = {mask, ptr, vec_offset, vec_reg};
4162 emit_vlmax_insn (icode, SCATTER_OP_M, store_ops);
4165 else
4167 if (is_load)
4169 rtx load_ops[]
4170 = {vec_reg, mask, ptr, vec_offset};
4171 emit_nonvlmax_insn (icode, BINARY_OP_TAMA, load_ops, len);
4173 else
4175 rtx store_ops[] = {mask, ptr, vec_offset, vec_reg};
4176 emit_nonvlmax_insn (icode, SCATTER_OP_M, store_ops, len);
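/* Sketch of the index handling above: for a gather of 32-bit elements with
   unsigned 16-bit offsets scaled by 4 (shift == 2), the offsets are first
   zero-extended to 32 bits, then shifted left by 2, and finally fed to an
   unordered indexed load, roughly

     vzext.vf2   v4, v2
     vsll.vi     v4, v4, 2
     vluxei32.v  v8, (a0), v4, v0.t

   When Zvbb is available and the scale is 2 (shift == 1), the extend and
   shift can instead be fused into a single vwsll.vi.  Scatters end with a
   vsuxei store.  Registers and EEW here are illustrative.  */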
4181 /* Expand COND_LEN_*. */
4182 void
4183 expand_cond_len_ternop (unsigned icode, rtx *ops)
4185 rtx dest = ops[0];
4186 rtx mask = ops[1];
4187 rtx src1 = ops[2];
4188 rtx src2 = ops[3];
4189 rtx src3 = ops[4];
4190 rtx merge = get_else_operand (ops[5]);
4191 rtx len = ops[6];
4193 rtx cond_ops[] = {dest, mask, src1, src2, src3, merge};
4194 expand_cond_len_op (icode, TERNARY_OP_P, cond_ops, len);
4197 /* Expand COND_*. */
4198 void
4199 expand_cond_ternop (unsigned icode, rtx *ops)
4201 rtx dest = ops[0];
4202 rtx mask = ops[1];
4203 rtx src1 = ops[2];
4204 rtx src2 = ops[3];
4205 rtx src3 = ops[4];
4206 rtx merge = get_else_operand (ops[5]);
4207 rtx len = gen_int_mode (GET_MODE_NUNITS (GET_MODE (dest)), Pmode);
4209 rtx cond_ops[] = {dest, mask, src1, src2, src3, merge};
4210 expand_cond_len_op (icode, TERNARY_OP_P, cond_ops, len);
4213 /* Expand reduction operations.
4214 Case 1: ops = {scalar_dest, vector_src}
4215 Case 2: ops = {scalar_dest, vector_src, mask, vl}
4217 void
4218 expand_reduction (unsigned unspec, unsigned insn_flags, rtx *ops, rtx init)
4220 rtx scalar_dest = ops[0];
4221 rtx vector_src = ops[1];
4222 machine_mode vmode = GET_MODE (vector_src);
4223 machine_mode vel_mode = GET_MODE (scalar_dest);
4224 machine_mode m1_mode = get_m1_mode (vel_mode).require ();
4226 rtx m1_tmp = gen_reg_rtx (m1_mode);
4227 rtx scalar_move_ops[] = {m1_tmp, init};
4228 insn_code icode = code_for_pred_broadcast (m1_mode);
4229 if (need_mask_operand_p (insn_flags))
4230 emit_nonvlmax_insn (icode, SCALAR_MOVE_OP, scalar_move_ops, ops[3]);
4231 else
4232 emit_vlmax_insn (icode, SCALAR_MOVE_OP, scalar_move_ops);
4234 rtx m1_tmp2 = gen_reg_rtx (m1_mode);
4235 rtx reduc_ops[] = {m1_tmp2, vector_src, m1_tmp};
4236 icode = code_for_pred (unspec, vmode);
4238 if (need_mask_operand_p (insn_flags))
4240 rtx mask_len_reduc_ops[] = {m1_tmp2, ops[2], vector_src, m1_tmp};
4241 emit_nonvlmax_insn (icode, insn_flags, mask_len_reduc_ops, ops[3]);
4243 else
4244 emit_vlmax_insn (icode, insn_flags, reduc_ops);
4246 emit_insn (gen_pred_extract_first (m1_mode, scalar_dest, m1_tmp2));
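/* Illustrative lowering of an unmasked integer sum reduction into a scalar:

     vmv.s.x    v1, a1            ; a1 = init value, moved into an M1 reg
     vredsum.vs v2, v8, v1        ; v8 = vector_src
     vmv.x.s    a0, v2            ; extract element 0 into scalar_dest

   The masked/length variant emits the reduction with a non-VLMAX AVL and
   the mask operand; floating-point reductions use the vfmv/vfred forms.
   Register numbers are only for illustration.  */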
4249 /* Prepare ops for ternary operations.
4250 It can be called before or after RA. */
4251 void
4252 prepare_ternary_operands (rtx *ops)
4254 machine_mode mode = GET_MODE (ops[0]);
4256 if (!rtx_equal_p (ops[5], RVV_VUNDEF (mode))
4257 && (VECTOR_MODE_P (GET_MODE (ops[2]))
4258 && !rtx_equal_p (ops[2], ops[5]))
4259 && !rtx_equal_p (ops[3], ops[5])
4260 && !rtx_equal_p (ops[4], ops[5]))
4262 /* RA will fail to find vector REG and report ICE, so we pre-merge
4263 the ops for LMUL = 8. */
4264 if (satisfies_constraint_Wc1 (ops[1]))
4266 emit_move_insn (ops[0], ops[5]);
4267 emit_insn (gen_pred_mov (mode, ops[0], ops[1], ops[0], ops[4], ops[6],
4268 ops[7], ops[8], ops[9]));
4270 else
4271 emit_insn (gen_pred_merge (mode, ops[0], RVV_VUNDEF (mode), ops[5],
4272 ops[4], ops[1], ops[6], ops[7], ops[9]));
4273 ops[5] = ops[4] = ops[0];
4275 else
4277 /* Swap the multiplication ops if the fallback value is the
4278 second of the two. */
4279 if (rtx_equal_p (ops[3], ops[5]))
4280 std::swap (ops[2], ops[3]);
4282 /* TODO: ??? Maybe we could support splitting FMA (a, 4, b)
4283 into PLUS (ASHIFT (a, 2), b) according to uarchs. */
4285 gcc_assert (rtx_equal_p (ops[5], RVV_VUNDEF (mode))
4286 || rtx_equal_p (ops[5], ops[2]) || rtx_equal_p (ops[5], ops[4]));
4289 /* Expand VEC_MASK_LEN_{LOAD_LANES,STORE_LANES}. */
4290 void
4291 expand_lanes_load_store (rtx *ops, bool is_load)
4293 rtx mask = ops[2];
4294 rtx len = ops[3];
4295 rtx addr = is_load ? XEXP (ops[1], 0) : XEXP (ops[0], 0);
4296 rtx reg = is_load ? ops[0] : ops[1];
4297 machine_mode mode = GET_MODE (ops[0]);
4299 if (is_vlmax_len_p (mode, len))
4301 /* If the length operand is equal to VF, it is VLMAX load/store. */
4302 if (is_load)
4304 rtx m_ops[] = {reg, mask, addr};
4305 emit_vlmax_insn (code_for_pred_unit_strided_load (mode), UNARY_OP_TAMA,
4306 m_ops);
4308 else
4310 len = gen_reg_rtx (Pmode);
4311 emit_vlmax_vsetvl (mode, len);
4312 emit_insn (gen_pred_unit_strided_store (mode, mask, addr, reg, len,
4313 get_avl_type_rtx (VLMAX)));
4316 else
4318 if (!satisfies_constraint_K (len))
4319 len = force_reg (Pmode, len);
4320 if (is_load)
4322 rtx m_ops[] = {reg, mask, addr};
4323 emit_nonvlmax_insn (code_for_pred_unit_strided_load (mode),
4324 UNARY_OP_TAMA, m_ops, len);
4326 else
4327 emit_insn (gen_pred_unit_strided_store (mode, mask, addr, reg, len,
4328 get_avl_type_rtx (NONVLMAX)));
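/* Sketch: a MASK_LEN_LOAD_LANES of a two-field tuple of 32-bit element
   vectors is expected to become a segment load, roughly

     vsetvli     zero, a2, e32, m1, ta, ma
     vlseg2e32.v v8, (a0), v0.t

   and the store side a matching vsseg2e32.v.  The number of fields (NF)
   comes from the tuple mode; registers shown are illustrative.  */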
4332 /* Expand LEN_FOLD_EXTRACT_LAST. */
4333 void
4334 expand_fold_extract_last (rtx *ops)
4336 rtx dst = ops[0];
4337 rtx default_value = ops[1];
4338 rtx mask = ops[2];
4339 rtx anchor = gen_reg_rtx (Pmode);
4340 rtx index = gen_reg_rtx (Pmode);
4341 rtx vect = ops[3];
4342 rtx else_label = gen_label_rtx ();
4343 rtx end_label = gen_label_rtx ();
4344 rtx len = ops[4];
4345 machine_mode mode = GET_MODE (vect);
4346 machine_mode mask_mode = GET_MODE (mask);
4347 rtx compress_vect = gen_reg_rtx (mode);
4348 rtx slide_vect = gen_reg_rtx (mode);
4349 insn_code icode;
4351 if (is_vlmax_len_p (mode, len))
4352 len = NULL_RTX;
4354 /* Calculate the number of set bits in the mask. */
4355 rtx cpop_ops[] = {anchor, mask};
4356 if (len)
4357 emit_nonvlmax_insn (code_for_pred_popcount (mask_mode, Pmode), CPOP_OP,
4358 cpop_ops, len);
4359 else
4360 emit_vlmax_insn (code_for_pred_popcount (mask_mode, Pmode), CPOP_OP,
4361 cpop_ops);
4363 riscv_expand_conditional_branch (else_label, EQ, anchor, const0_rtx);
4364 emit_insn (gen_rtx_SET (index, gen_rtx_PLUS (Pmode, anchor, constm1_rtx)));
4365 /* Compress the vector. */
4366 icode = code_for_pred_compress (mode);
4367 rtx compress_ops[] = {compress_vect, vect, mask};
4368 if (len)
4369 emit_nonvlmax_insn (icode, COMPRESS_OP, compress_ops, len);
4370 else
4371 emit_vlmax_insn (icode, COMPRESS_OP, compress_ops);
4372 /* Emit the slide down to index 0 in a new vector. */
4373 rtx slide_ops[] = {slide_vect, compress_vect, index};
4374 icode = code_for_pred_slide (UNSPEC_VSLIDEDOWN, mode);
4375 if (len)
4376 emit_nonvlmax_insn (icode, BINARY_OP, slide_ops, len);
4377 else
4378 emit_vlmax_insn (icode, BINARY_OP, slide_ops);
4379 /* Emit v(f)mv.[xf].s. */
4380 emit_insn (gen_pred_extract_first (mode, dst, slide_vect));
4382 emit_jump_insn (gen_jump (end_label));
4383 emit_barrier ();
4384 emit_label (else_label);
4385 emit_move_insn (dst, default_value);
4386 emit_label (end_label);
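/* Rough shape of the code emitted above for LEN_FOLD_EXTRACT_LAST:

     vcpop.m       a3, v0         ; number of active elements
     beqz          a3, .Lelse
     addi          a4, a3, -1
     vcompress.vm  v4, v8, v0     ; pack active elements to the front
     vslidedown.vx v4, v4, a4     ; bring the last active element to index 0
     vmv.x.s       a0, v4         ; (or vfmv.f.s for FP)
     j             .Lend
   .Lelse:
     mv            a0, a1         ; default value
   .Lend:

   This only illustrates the control flow; the real sequence is built from
   the predicated patterns used above.  */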
4389 /* Return true if the LMUL of the comparison is less than or equal to one. */
4390 bool
4391 cmp_lmul_le_one (machine_mode mode)
4393 if (riscv_v_ext_vector_mode_p (mode))
4394 return known_le (GET_MODE_SIZE (mode), BYTES_PER_RISCV_VECTOR);
4395 else if (riscv_v_ext_vls_mode_p (mode))
4396 return known_le (GET_MODE_BITSIZE (mode), TARGET_MIN_VLEN);
4397 return false;
4400 /* Return true if the LMUL of the comparison is greater than one. */
4401 bool
4402 cmp_lmul_gt_one (machine_mode mode)
4404 if (riscv_v_ext_vector_mode_p (mode))
4405 return known_gt (GET_MODE_SIZE (mode), BYTES_PER_RISCV_VECTOR);
4406 else if (riscv_v_ext_vls_mode_p (mode))
4407 return known_gt (GET_MODE_BITSIZE (mode), TARGET_MIN_VLEN);
4408 return false;
4411 /* Return true if the VLS mode is legal. There are 2 cases here.
4413 1. Enable VLS modes for VLA vectorization since the fixed-length VLMAX mode
4414 is the highest priority choice and should not conflict with VLS modes.
4415 2. Enable VLS modes for some cases of fixed-vlmax, i.e. when the bitsize of
4416 the VLS mode is smaller than the minimal VLA mode.
4418 Take vlen = 2048 as an example for case 2.
4420 Note: the table below is based on vlen = 2048.
4421 +----------------------------------------------------+----------------------+
4422 | VLS mode | VLA mode |
4423 +----------------------------------------------------+----------------------+
4424 | Name | Precision | Inner Precision | Enabled | Min mode | Min bits |
4425 +------------+-----------+-----------------+---------+-----------+----------+
4426 | V1BI | 1 | 1 | Yes | RVVMF64BI | 32 |
4427 | V2BI | 2 | 1 | Yes | RVVMF64BI | 32 |
4428 | V4BI | 4 | 1 | Yes | RVVMF64BI | 32 |
4429 | V8BI | 8 | 1 | Yes | RVVMF64BI | 32 |
4430 | V16BI | 16 | 1 | Yes | RVVMF64BI | 32 |
4431 | V32BI | 32 | 1 | NO | RVVMF64BI | 32 |
4432 | V64BI | 64 | 1 | NO | RVVMF64BI | 32 |
4433 | ... | ... | ... | ... | RVVMF64BI | 32 |
4434 | V4096BI | 4096 | 1 | NO | RVVMF64BI | 32 |
4435 +------------+-----------+-----------------+---------+-----------+----------+
4436 | V1QI | 8 | 8 | Yes | RVVMF8QI | 256 |
4437 | V2QI | 16 | 8 | Yes | RVVMF8QI | 256 |
4438 | V4QI | 32 | 8 | Yes | RVVMF8QI | 256 |
4439 | V8QI | 64 | 8 | Yes | RVVMF8QI | 256 |
4440 | V16QI | 128 | 8 | Yes | RVVMF8QI | 256 |
4441 | V32QI | 256 | 8 | NO | RVVMF8QI | 256 |
4442 | V64QI | 512 | 8 | NO | RVVMF8QI | 256 |
4443 | ... | ... | .. | ... | RVVMF8QI | 256 |
4444 | V4096QI | 32768 | 8 | NO | RVVMF8QI | 256 |
4445 +------------+-----------+-----------------+---------+-----------+----------+
4446 | V1HI | 16 | 16 | Yes | RVVMF4HI | 512 |
4447 | V2HI | 32 | 16 | Yes | RVVMF4HI | 512 |
4448 | V4HI | 64 | 16 | Yes | RVVMF4HI | 512 |
4449 | V8HI | 128 | 16 | Yes | RVVMF4HI | 512 |
4450 | V16HI | 256 | 16 | Yes | RVVMF4HI | 512 |
4451 | V32HI | 512 | 16 | NO | RVVMF4HI | 512 |
4452 | V64HI | 1024 | 16 | NO | RVVMF4HI | 512 |
4453 | ... | ... | .. | ... | RVVMF4HI | 512 |
4454 | V2048HI | 32768 | 16 | NO | RVVMF4HI | 512 |
4455 +------------+-----------+-----------------+---------+-----------+----------+
4456 | V1SI/SF | 32 | 32 | Yes | RVVMF2SI | 1024 |
4457 | V2SI/SF | 64 | 32 | Yes | RVVMF2SI | 1024 |
4458 | V4SI/SF | 128 | 32 | Yes | RVVMF2SI | 1024 |
4459 | V8SI/SF | 256 | 32 | Yes | RVVMF2SI | 1024 |
4460 | V16SI/SF | 512 | 32 | Yes | RVVMF2SI | 1024 |
4461 | V32SI/SF | 1024 | 32 | NO | RVVMF2SI | 1024 |
4462 | V64SI/SF | 2048 | 32 | NO | RVVMF2SI | 1024 |
4463 | ... | ... | .. | ... | RVVMF2SI | 1024 |
4464 | V1024SI/SF | 32768 | 32 | NO | RVVMF2SI | 1024 |
4465 +------------+-----------+-----------------+---------+-----------+----------+
4466 | V1DI/DF | 64 | 64 | Yes | RVVM1DI | 2048 |
4467 | V2DI/DF | 128 | 64 | Yes | RVVM1DI | 2048 |
4468 | V4DI/DF | 256 | 64 | Yes | RVVM1DI | 2048 |
4469 | V8DI/DF | 512 | 64 | Yes | RVVM1DI | 2048 |
4470 | V16DI/DF | 1024 | 64 | Yes | RVVM1DI | 2048 |
4471 | V32DI/DF | 2048 | 64 | NO | RVVM1DI | 2048 |
4472 | V64DI/DF | 4096 | 64 | NO | RVVM1DI | 2048 |
4473 | ... | ... | .. | ... | RVVM1DI | 2048 |
4474 | V512DI/DF | 32768 | 64 | NO | RVVM1DI | 2048 |
4475 +------------+-----------+-----------------+---------+-----------+----------+
4477 Then the condition for a VLS mode in fixed-vlmax is:
4478 PRECISION (VLSmode) < VLEN / (64 / PRECISION(VLS_inner_mode)). */
4479 bool
4480 vls_mode_valid_p (machine_mode vls_mode)
4482 if (!TARGET_VECTOR || TARGET_XTHEADVECTOR)
4483 return false;
4485 if (rvv_vector_bits == RVV_VECTOR_BITS_SCALABLE)
4487 if (GET_MODE_CLASS (vls_mode) != MODE_VECTOR_BOOL
4488 && !ordered_p (TARGET_MAX_LMUL * BITS_PER_RISCV_VECTOR,
4489 GET_MODE_PRECISION (vls_mode)))
4490 /* We enable VLS modes which are aligned with TARGET_MAX_LMUL and
4491 BITS_PER_RISCV_VECTOR.
4493 E.g. when TARGET_MAX_LMUL = 1 and BITS_PER_RISCV_VECTOR = (128, 128),
4494 we enable VLS modes that have a fixed size <= 128 bits. Since ordered_p is
4495 false between VLA modes with size = (128, 128) bits and a VLS mode
4496 with size = 128 bits, we would otherwise end up with multiple ICEs in
4497 middle-end generic code. */
4498 return false;
4499 return true;
4502 if (rvv_vector_bits == RVV_VECTOR_BITS_ZVL)
4504 machine_mode inner_mode = GET_MODE_INNER (vls_mode);
4505 int precision = GET_MODE_PRECISION (inner_mode).to_constant ();
4506 int min_vlmax_bitsize = TARGET_MIN_VLEN / (64 / precision);
4508 return GET_MODE_PRECISION (vls_mode).to_constant () < min_vlmax_bitsize;
4511 return false;
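/* Worked example for the RVV_VECTOR_BITS_ZVL case: with TARGET_MIN_VLEN
   = 128 and a DImode inner mode (precision 64), min_vlmax_bitsize is
   128 / (64 / 64) = 128, so V1DI (64 bits) is enabled while V2DI
   (128 bits) is not, because the VLA mode RVVM1DI already covers it.  */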
4514 /* We don't have to convert a floating point value to an integer when
4515 it has no fractional part. Thus, there is a limit for the half, single
4516 and double precision floating point types: any value greater than or
4517 equal to the limit has no fractional bits and is already integral.
4519 1. Half floating point.
4520 +-----------+---------------+
4521 | float | binary layout |
4522 +-----------+---------------+
4523 | 1023.5 | 0x63ff |
4524 +-----------+---------------+
4525 | 1024.0 | 0x6400 |
4526 +-----------+---------------+
4527 | 1025.0 | 0x6401 |
4528 +-----------+---------------+
4529 | ... | ... |
4531 All half precision floating point values are unchanged by ceil if
4532 they are greater than or equal to 1024.
4534 2. Single floating point.
4535 +-----------+---------------+
4536 | float | binary layout |
4537 +-----------+---------------+
4538 | 8388607.5 | 0x4affffff |
4539 +-----------+---------------+
4540 | 8388608.0 | 0x4b000000 |
4541 +-----------+---------------+
4542 | 8388609.0 | 0x4b000001 |
4543 +-----------+---------------+
4544 | ... | ... |
4546 All single precision floating point values are unchanged by ceil if
4547 they are greater than or equal to 8388608.
4549 3. Double floating point.
4550 +--------------------+--------------------+
4551 | float | binary layout |
4552 +--------------------+--------------------+
4553 | 4503599627370495.5 | 0X432fffffffffffff |
4554 +--------------------+--------------------+
4555 | 4503599627370496.0 | 0X4330000000000000 |
4556 +--------------------+--------------------+
4557 | 4503599627370497.0 | 0X4330000000000001 |
4558 +--------------------+--------------------+
4559 | ... | ... |
4561 All double precision floating point values are unchanged by ceil if
4562 they are greater than or equal to 4503599627370496.
4565 get_fp_rounding_coefficient (machine_mode inner_mode)
4567 REAL_VALUE_TYPE real;
4569 if (inner_mode == E_HFmode)
4570 real_from_integer (&real, inner_mode, 1024, SIGNED);
4571 else if (inner_mode == E_SFmode)
4572 real_from_integer (&real, inner_mode, 8388608, SIGNED);
4573 else if (inner_mode == E_DFmode)
4574 real_from_integer (&real, inner_mode, 4503599627370496, SIGNED);
4575 else
4576 gcc_unreachable ();
4578 return const_double_from_real_value (real, inner_mode);
4581 static rtx
4582 emit_vec_float_cmp_mask (rtx fp_vector, rtx_code code, rtx fp_scalar,
4583 machine_mode vec_fp_mode)
4585 /* Step-1: Prepare the scalar float compare register. */
4586 rtx fp_reg = gen_reg_rtx (GET_MODE_INNER (vec_fp_mode));
4587 emit_insn (gen_move_insn (fp_reg, fp_scalar));
4589 /* Step-2: Generate the mask. */
4590 machine_mode mask_mode = get_mask_mode (vec_fp_mode);
4591 rtx mask = gen_reg_rtx (mask_mode);
4592 rtx cmp = gen_rtx_fmt_ee (code, mask_mode, fp_vector, fp_reg);
4593 rtx cmp_ops[] = {mask, cmp, fp_vector, fp_reg};
4594 insn_code icode = code_for_pred_cmp_scalar (vec_fp_mode);
4595 emit_vlmax_insn (icode, COMPARE_OP, cmp_ops);
4597 return mask;
4600 static void
4601 emit_vec_copysign (rtx op_dest, rtx op_src_0, rtx op_src_1,
4602 machine_mode vec_mode)
4604 rtx sgnj_ops[] = {op_dest, op_src_0, op_src_1};
4605 insn_code icode = code_for_pred (UNSPEC_VCOPYSIGN, vec_mode);
4607 emit_vlmax_insn (icode, BINARY_OP, sgnj_ops);
4610 static void
4611 emit_vec_abs (rtx op_dest, rtx op_src, machine_mode vec_mode)
4613 rtx abs_ops[] = {op_dest, op_src};
4614 insn_code icode = code_for_pred (ABS, vec_mode);
4616 emit_vlmax_insn (icode, UNARY_OP, abs_ops);
4619 static void
4620 emit_vec_cvt_x_f (rtx op_dest, rtx op_src, rtx mask,
4621 insn_type type, machine_mode vec_mode)
4623 insn_code icode = code_for_pred_fcvt_x_f (UNSPEC_VFCVT, vec_mode);
4625 if (type & USE_VUNDEF_MERGE_P)
4627 rtx cvt_x_ops[] = {op_dest, mask, op_src};
4628 emit_vlmax_insn (icode, type, cvt_x_ops);
4630 else
4632 rtx cvt_x_ops[] = {op_dest, mask, op_dest, op_src};
4633 emit_vlmax_insn (icode, type, cvt_x_ops);
4637 static void
4638 emit_vec_cvt_x_f (rtx op_dest, rtx op_src, insn_type type,
4639 machine_mode vec_mode)
4641 rtx ops[] = {op_dest, op_src};
4642 insn_code icode = code_for_pred_fcvt_x_f (UNSPEC_VFCVT, vec_mode);
4644 emit_vlmax_insn (icode, type, ops);
4647 static void
4648 emit_vec_narrow_cvt_x_f (rtx op_dest, rtx op_src, insn_type type,
4649 machine_mode vec_mode)
4651 rtx ops[] = {op_dest, op_src};
4652 insn_code icode = code_for_pred_narrow_fcvt_x_f (UNSPEC_VFCVT, vec_mode);
4654 emit_vlmax_insn (icode, type, ops);
4657 static void
4658 emit_vec_widen_cvt_x_f (rtx op_dest, rtx op_src, insn_type type,
4659 machine_mode vec_mode)
4661 rtx ops[] = {op_dest, op_src};
4662 insn_code icode = code_for_pred_widen_fcvt_x_f (UNSPEC_VFCVT, vec_mode);
4664 emit_vlmax_insn (icode, type, ops);
4667 static void
4668 emit_vec_widen_cvt_f_f (rtx op_dest, rtx op_src, insn_type type,
4669 machine_mode vec_mode)
4671 rtx ops[] = {op_dest, op_src};
4672 insn_code icode = code_for_pred_extend (vec_mode);
4674 emit_vlmax_insn (icode, type, ops);
4677 static void
4678 emit_vec_cvt_f_x (rtx op_dest, rtx op_src, rtx mask,
4679 insn_type type, machine_mode vec_mode)
4681 rtx cvt_fp_ops[] = {op_dest, mask, op_dest, op_src};
4682 insn_code icode = code_for_pred (FLOAT, vec_mode);
4684 emit_vlmax_insn (icode, type, cvt_fp_ops);
4687 static void
4688 emit_vec_cvt_x_f_rtz (rtx op_dest, rtx op_src, rtx mask,
4689 insn_type type, machine_mode vec_mode)
4691 insn_code icode = code_for_pred (FIX, vec_mode);
4693 if (type & USE_VUNDEF_MERGE_P)
4695 rtx cvt_x_ops[] = {op_dest, mask, op_src};
4696 emit_vlmax_insn (icode, type, cvt_x_ops);
4698 else
4700 rtx cvt_x_ops[] = {op_dest, mask, op_dest, op_src};
4701 emit_vlmax_insn (icode, type, cvt_x_ops);
4705 static void
4706 emit_vec_binary_alu (rtx op_dest, rtx op_1, rtx op_2, enum rtx_code rcode,
4707 machine_mode vec_mode)
4709 rtx ops[] = {op_dest, op_1, op_2};
4710 insn_code icode = code_for_pred (rcode, vec_mode);
4712 emit_vlmax_insn (icode, BINARY_OP, ops);
4715 void
4716 expand_vec_ceil (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
4717 machine_mode vec_int_mode)
4719 /* Step-1: Get the abs float value for mask generation. */
4720 emit_vec_abs (op_0, op_1, vec_fp_mode);
4722 /* Step-2: Generate the mask on const fp. */
4723 rtx const_fp = get_fp_rounding_coefficient (GET_MODE_INNER (vec_fp_mode));
4724 rtx mask = emit_vec_float_cmp_mask (op_0, LT, const_fp, vec_fp_mode);
4726 /* Step-3: Convert to integer on mask, with rounding up (aka ceil). */
4727 rtx tmp = gen_reg_rtx (vec_int_mode);
4728 emit_vec_cvt_x_f (tmp, op_1, mask, UNARY_OP_TAMA_FRM_RUP, vec_fp_mode);
4730 /* Step-4: Convert to floating-point on mask for the final result.
4731 To avoid unnecessary frm register access, we use RUP here and it will
4732 never do the rounding up because the tmp rtx comes from the float
4733 to int conversion. */
4734 emit_vec_cvt_f_x (op_0, tmp, mask, UNARY_OP_TAMU_FRM_RUP, vec_fp_mode);
4736 /* Step-5: Retrieve the sign bit for -0.0. */
4737 emit_vec_copysign (op_0, op_0, op_1, vec_fp_mode);
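/* Worked example of the mask trick used by the rounding expanders above,
   for ceil on SFmode: for an input lane x = 2.3, |x| < 8388608.0 so the
   lane is active; converting with FRM = RUP gives 3, converting back gives
   3.0, and the final copysign leaves 3.0.  For a huge lane such as 1e30
   the compare masks it off, the conversions leave it untouched (it is
   already integral), and the copysign restores the original value,
   including the sign of -0.0.  */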
4740 void
4741 expand_vec_floor (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
4742 machine_mode vec_int_mode)
4744 /* Step-1: Get the abs float value for mask generation. */
4745 emit_vec_abs (op_0, op_1, vec_fp_mode);
4747 /* Step-2: Generate the mask on const fp. */
4748 rtx const_fp = get_fp_rounding_coefficient (GET_MODE_INNER (vec_fp_mode));
4749 rtx mask = emit_vec_float_cmp_mask (op_0, LT, const_fp, vec_fp_mode);
4751 /* Step-3: Convert to integer on mask, with rounding down (aka floor). */
4752 rtx tmp = gen_reg_rtx (vec_int_mode);
4753 emit_vec_cvt_x_f (tmp, op_1, mask, UNARY_OP_TAMA_FRM_RDN, vec_fp_mode);
4755 /* Step-4: Convert to floating-point on mask for the floor result. */
4756 emit_vec_cvt_f_x (op_0, tmp, mask, UNARY_OP_TAMU_FRM_RDN, vec_fp_mode);
4758 /* Step-5: Retrieve the sign bit for -0.0. */
4759 emit_vec_copysign (op_0, op_0, op_1, vec_fp_mode);
4762 void
4763 expand_vec_nearbyint (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
4764 machine_mode vec_int_mode)
4766 /* Step-1: Get the abs float value for mask generation. */
4767 emit_vec_abs (op_0, op_1, vec_fp_mode);
4769 /* Step-2: Generate the mask on const fp. */
4770 rtx const_fp = get_fp_rounding_coefficient (GET_MODE_INNER (vec_fp_mode));
4771 rtx mask = emit_vec_float_cmp_mask (op_0, LT, const_fp, vec_fp_mode);
4773 /* Step-3: Back up the FP exception flags; nearbyint never raises exceptions. */
4774 rtx fflags = gen_reg_rtx (SImode);
4775 emit_insn (gen_riscv_frflags (fflags));
4777 /* Step-4: Convert to integer on mask, with the dynamic rounding mode (aka nearbyint). */
4778 rtx tmp = gen_reg_rtx (vec_int_mode);
4779 emit_vec_cvt_x_f (tmp, op_1, mask, UNARY_OP_TAMA_FRM_DYN, vec_fp_mode);
4781 /* Step-5: Convert to floating-point on mask for the nearbyint result. */
4782 emit_vec_cvt_f_x (op_0, tmp, mask, UNARY_OP_TAMU_FRM_DYN, vec_fp_mode);
4784 /* Step-6: Restore FP exception flags. */
4785 emit_insn (gen_riscv_fsflags (fflags));
4787 /* Step-7: Retrieve the sign bit for -0.0. */
4788 emit_vec_copysign (op_0, op_0, op_1, vec_fp_mode);
4791 void
4792 expand_vec_rint (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
4793 machine_mode vec_int_mode)
4795 /* Step-1: Get the abs float value for mask generation. */
4796 emit_vec_abs (op_0, op_1, vec_fp_mode);
4798 /* Step-2: Generate the mask on const fp. */
4799 rtx const_fp = get_fp_rounding_coefficient (GET_MODE_INNER (vec_fp_mode));
4800 rtx mask = emit_vec_float_cmp_mask (op_0, LT, const_fp, vec_fp_mode);
4802 /* Step-3: Convert to integer on mask, with dyn rounding (aka rint). */
4803 rtx tmp = gen_reg_rtx (vec_int_mode);
4804 emit_vec_cvt_x_f (tmp, op_1, mask, UNARY_OP_TAMA_FRM_DYN, vec_fp_mode);
4806 /* Step-4: Convert to floating-point on mask for the rint result. */
4807 emit_vec_cvt_f_x (op_0, tmp, mask, UNARY_OP_TAMU_FRM_DYN, vec_fp_mode);
4809 /* Step-5: Retrieve the sign bit for -0.0. */
4810 emit_vec_copysign (op_0, op_0, op_1, vec_fp_mode);
4813 void
4814 expand_vec_round (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
4815 machine_mode vec_int_mode)
4817 /* Step-1: Get the abs float value for mask generation. */
4818 emit_vec_abs (op_0, op_1, vec_fp_mode);
4820 /* Step-2: Generate the mask on const fp. */
4821 rtx const_fp = get_fp_rounding_coefficient (GET_MODE_INNER (vec_fp_mode));
4822 rtx mask = emit_vec_float_cmp_mask (op_0, LT, const_fp, vec_fp_mode);
4824 /* Step-3: Convert to integer on mask, rounding to nearest (aka round). */
4825 rtx tmp = gen_reg_rtx (vec_int_mode);
4826 emit_vec_cvt_x_f (tmp, op_1, mask, UNARY_OP_TAMA_FRM_RMM, vec_fp_mode);
4828 /* Step-4: Convert to floating-point on mask for the round result. */
4829 emit_vec_cvt_f_x (op_0, tmp, mask, UNARY_OP_TAMU_FRM_RMM, vec_fp_mode);
4831 /* Step-5: Retrieve the sign bit for -0.0. */
4832 emit_vec_copysign (op_0, op_0, op_1, vec_fp_mode);
4835 void
4836 expand_vec_trunc (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
4837 machine_mode vec_int_mode)
4839 /* Step-1: Get the abs float value for mask generation. */
4840 emit_vec_abs (op_0, op_1, vec_fp_mode);
4842 /* Step-2: Generate the mask on const fp. */
4843 rtx const_fp = get_fp_rounding_coefficient (GET_MODE_INNER (vec_fp_mode));
4844 rtx mask = emit_vec_float_cmp_mask (op_0, LT, const_fp, vec_fp_mode);
4846 /* Step-3: Convert to integer on mask, rounding to zero (aka truncate). */
4847 rtx tmp = gen_reg_rtx (vec_int_mode);
4848 emit_vec_cvt_x_f_rtz (tmp, op_1, mask, UNARY_OP_TAMA, vec_fp_mode);
4850 /* Step-4: Convert to floating-point on mask for the trunc result. */
4851 emit_vec_cvt_f_x (op_0, tmp, mask, UNARY_OP_TAMU_FRM_DYN, vec_fp_mode);
4853 /* Step-5: Retrieve the sign bit for -0.0. */
4854 emit_vec_copysign (op_0, op_0, op_1, vec_fp_mode);
4857 void
4858 expand_vec_roundeven (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
4859 machine_mode vec_int_mode)
4861 /* Step-1: Get the abs float value for mask generation. */
4862 emit_vec_abs (op_0, op_1, vec_fp_mode);
4864 /* Step-2: Generate the mask on const fp. */
4865 rtx const_fp = get_fp_rounding_coefficient (GET_MODE_INNER (vec_fp_mode));
4866 rtx mask = emit_vec_float_cmp_mask (op_0, LT, const_fp, vec_fp_mode);
4868 /* Step-3: Convert to integer on mask, rounding to nearest, ties to even. */
4869 rtx tmp = gen_reg_rtx (vec_int_mode);
4870 emit_vec_cvt_x_f (tmp, op_1, mask, UNARY_OP_TAMA_FRM_RNE, vec_fp_mode);
4872 /* Step-4: Convert to floating-point on mask for the roundeven result. */
4873 emit_vec_cvt_f_x (op_0, tmp, mask, UNARY_OP_TAMU_FRM_RNE, vec_fp_mode);
4875 /* Step-5: Retrieve the sign bit for -0.0. */
4876 emit_vec_copysign (op_0, op_0, op_1, vec_fp_mode);
4879 /* Handle the rounding from floating-point to int/long/long long. */
4880 static void
4881 emit_vec_rounding_to_integer (rtx op_0, rtx op_1, insn_type type,
4882 machine_mode vec_fp_mode,
4883 machine_mode vec_int_mode,
4884 machine_mode vec_bridge_mode = E_VOIDmode)
4886 poly_uint16 vec_fp_size = GET_MODE_SIZE (vec_fp_mode);
4887 poly_uint16 vec_int_size = GET_MODE_SIZE (vec_int_mode);
4889 if (known_eq (vec_fp_size, vec_int_size)) /* SF => SI, DF => DI. */
4890 emit_vec_cvt_x_f (op_0, op_1, type, vec_fp_mode);
4891 else if (maybe_eq (vec_fp_size, vec_int_size * 2)) /* DF => SI. */
4892 emit_vec_narrow_cvt_x_f (op_0, op_1, type, vec_fp_mode);
4893 else if (maybe_eq (vec_fp_size * 2, vec_int_size)) /* SF => DI, HF => SI. */
4894 emit_vec_widen_cvt_x_f (op_0, op_1, type, vec_int_mode);
4895 else if (maybe_eq (vec_fp_size * 4, vec_int_size)) /* HF => DI. */
4897 gcc_assert (vec_bridge_mode != E_VOIDmode);
4899 rtx op_sf = gen_reg_rtx (vec_bridge_mode);
4901 /* Step-1: HF => SF, no rounding here. */
4902 emit_vec_widen_cvt_f_f (op_sf, op_1, UNARY_OP, vec_bridge_mode);
4903 /* Step-2: SF => DI. */
4904 emit_vec_widen_cvt_x_f (op_0, op_sf, type, vec_int_mode);
4906 else
4907 gcc_unreachable ();
4910 void
4911 expand_vec_lrint (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
4912 machine_mode vec_int_mode, machine_mode vec_bridge_mode)
4914 emit_vec_rounding_to_integer (op_0, op_1, UNARY_OP_FRM_DYN, vec_fp_mode,
4915 vec_int_mode, vec_bridge_mode);
4918 void
4919 expand_vec_lround (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
4920 machine_mode vec_int_mode, machine_mode vec_bridge_mode)
4922 emit_vec_rounding_to_integer (op_0, op_1, UNARY_OP_FRM_RMM, vec_fp_mode,
4923 vec_int_mode, vec_bridge_mode);
4926 void
4927 expand_vec_lceil (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
4928 machine_mode vec_int_mode)
4930 emit_vec_rounding_to_integer (op_0, op_1, UNARY_OP_FRM_RUP, vec_fp_mode,
4931 vec_int_mode);
4934 void
4935 expand_vec_lfloor (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
4936 machine_mode vec_int_mode)
4938 emit_vec_rounding_to_integer (op_0, op_1, UNARY_OP_FRM_RDN, vec_fp_mode,
4939 vec_int_mode);
4942 /* Expand the standard name usadd<mode>3 for vector modes; we can leverage
4943 the vector fixed-point single-width saturating add directly. */
4945 void
4946 expand_vec_usadd (rtx op_0, rtx op_1, rtx op_2, machine_mode vec_mode)
4948 emit_vec_binary_alu (op_0, op_1, op_2, US_PLUS, vec_mode);
4951 /* Expand the standard name ssadd<mode>3 for vector modes; we can leverage
4952 the vector fixed-point single-width saturating add directly. */
4954 void
4955 expand_vec_ssadd (rtx op_0, rtx op_1, rtx op_2, machine_mode vec_mode)
4957 emit_vec_binary_alu (op_0, op_1, op_2, SS_PLUS, vec_mode);
4960 /* Expand the standard name ussub<mode>3 for vector modes; we can leverage
4961 the vector fixed-point single-width saturating subtract directly. */
4963 void
4964 expand_vec_ussub (rtx op_0, rtx op_1, rtx op_2, machine_mode vec_mode)
4966 emit_vec_binary_alu (op_0, op_1, op_2, US_MINUS, vec_mode);
4969 /* Expand the standard name sssub<mode>3 for vector modes; we can leverage
4970 the vector fixed-point single-width saturating subtract directly. */
4972 void
4973 expand_vec_sssub (rtx op_0, rtx op_1, rtx op_2, machine_mode vec_mode)
4975 emit_vec_binary_alu (op_0, op_1, op_2, SS_MINUS, vec_mode);
4978 /* Expand the standard name ustrunc<m><n>2 for double vector modes, like
4979 DI => SI. We can leverage the vector fixed-point narrowing clip
4980 directly. */
4982 void
4983 expand_vec_double_ustrunc (rtx op_0, rtx op_1, machine_mode vec_mode)
4985 insn_code icode;
4986 rtx zero = CONST0_RTX (Xmode);
4987 enum unspec unspec = UNSPEC_VNCLIPU;
4988 rtx ops[] = {op_0, op_1, zero};
4990 icode = code_for_pred_narrow_clip_scalar (unspec, vec_mode);
4991 emit_vlmax_insn (icode, BINARY_OP_VXRM_RNU, ops);
4994 /* Expand the standard name sstrunc<m><n>2 for double vector modes, like
4995 DI => SI. We can leverage the vector fixed-point narrowing clip
4996 directly. */
4998 void
4999 expand_vec_double_sstrunc (rtx op_0, rtx op_1, machine_mode vec_mode)
5001 insn_code icode;
5002 rtx zero = CONST0_RTX (Xmode);
5003 enum unspec unspec = UNSPEC_VNCLIP;
5004 rtx ops[] = {op_0, op_1, zero};
5006 icode = code_for_pred_narrow_clip_scalar (unspec, vec_mode);
5007 emit_vlmax_insn (icode, BINARY_OP_VXRM_RNU, ops);
5010 /* Expand the standard name ustrunc<m><n>2 for quad vector modes, like
5011 DI => HI. We can leverage the vector fixed-point narrowing clip
5012 directly. */
5014 void
5015 expand_vec_quad_ustrunc (rtx op_0, rtx op_1, machine_mode vec_mode,
5016 machine_mode double_mode)
5018 rtx double_rtx = gen_reg_rtx (double_mode);
5020 expand_vec_double_ustrunc (double_rtx, op_1, vec_mode);
5021 expand_vec_double_ustrunc (op_0, double_rtx, double_mode);
5024 /* Expand the standard name sstrunc<m><n>2 for quad vector modes, like
5025 DI => HI. We can leverage the vector fixed-point narrowing clip
5026 directly. */
5028 void
5029 expand_vec_quad_sstrunc (rtx op_0, rtx op_1, machine_mode vec_mode,
5030 machine_mode double_mode)
5032 rtx double_rtx = gen_reg_rtx (double_mode);
5034 expand_vec_double_sstrunc (double_rtx, op_1, vec_mode);
5035 expand_vec_double_sstrunc (op_0, double_rtx, double_mode);
5038 /* Expand the standard name ustrunc<m><n>2 for oct vector modes, like
5039 DI => QI. We can leverage the vector fixed-point narrowing clip
5040 directly. */
5042 void
5043 expand_vec_oct_ustrunc (rtx op_0, rtx op_1, machine_mode vec_mode,
5044 machine_mode double_mode, machine_mode quad_mode)
5046 rtx double_rtx = gen_reg_rtx (double_mode);
5047 rtx quad_rtx = gen_reg_rtx (quad_mode);
5049 expand_vec_double_ustrunc (double_rtx, op_1, vec_mode);
5050 expand_vec_double_ustrunc (quad_rtx, double_rtx, double_mode);
5051 expand_vec_double_ustrunc (op_0, quad_rtx, quad_mode);
5054 /* Expand the standard name sstrunc<m><n>2 for oct vector modes, like
5055 DI => QI. We can leverage the vector fixed-point narrowing clip
5056 directly. */
5058 void
5059 expand_vec_oct_sstrunc (rtx op_0, rtx op_1, machine_mode vec_mode,
5060 machine_mode double_mode, machine_mode quad_mode)
5062 rtx double_rtx = gen_reg_rtx (double_mode);
5063 rtx quad_rtx = gen_reg_rtx (quad_mode);
5065 expand_vec_double_sstrunc (double_rtx, op_1, vec_mode);
5066 expand_vec_double_sstrunc (quad_rtx, double_rtx, double_mode);
5067 expand_vec_double_sstrunc (op_0, quad_rtx, quad_mode);
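/* The truncation helpers above chain the narrowing clip: e.g. a saturating
   DI -> QI truncation is performed as DI -> SI -> HI -> QI, emitting three
   vnclip/vnclipu steps with a zero shift amount.  Because the shift is
   zero, the fixed-point rounding mode (vxrm) has no effect; the instruction
   is used purely for its saturating narrowing behavior.  */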
5070 /* Vectorize popcount by the Wilkes-Wheeler-Gill algorithm that libgcc uses as
5071 well. */
5072 void
5073 expand_popcount (rtx *ops)
5075 rtx dst = ops[0];
5076 rtx src = ops[1];
5077 machine_mode mode = GET_MODE (dst);
5078 scalar_mode imode = GET_MODE_INNER (mode);
5079 static const uint64_t m5 = 0x5555555555555555ULL;
5080 static const uint64_t m3 = 0x3333333333333333ULL;
5081 static const uint64_t mf = 0x0F0F0F0F0F0F0F0FULL;
5082 static const uint64_t m1 = 0x0101010101010101ULL;
5084 rtx x1 = gen_reg_rtx (mode);
5085 rtx x2 = gen_reg_rtx (mode);
5086 rtx x3 = gen_reg_rtx (mode);
5087 rtx x4 = gen_reg_rtx (mode);
5089 /* x1 = src - ((src >> 1) & 0x555...); */
5090 rtx shift1 = expand_binop (mode, lshr_optab, src, GEN_INT (1), NULL, true,
5091 OPTAB_DIRECT);
5093 rtx and1 = gen_reg_rtx (mode);
5094 rtx ops1[] = {and1, shift1, gen_int_mode (m5, imode)};
5095 emit_vlmax_insn (code_for_pred_scalar (AND, mode), riscv_vector::BINARY_OP,
5096 ops1);
5098 x1 = expand_binop (mode, sub_optab, src, and1, NULL, true, OPTAB_DIRECT);
5100 /* x2 = (x1 & 0x3333333333333333ULL) + ((x1 >> 2) & 0x3333333333333333ULL);
5102 rtx and2 = gen_reg_rtx (mode);
5103 rtx ops2[] = {and2, x1, gen_int_mode (m3, imode)};
5104 emit_vlmax_insn (code_for_pred_scalar (AND, mode), riscv_vector::BINARY_OP,
5105 ops2);
5107 rtx shift2 = expand_binop (mode, lshr_optab, x1, GEN_INT (2), NULL, true,
5108 OPTAB_DIRECT);
5110 rtx and22 = gen_reg_rtx (mode);
5111 rtx ops22[] = {and22, shift2, gen_int_mode (m3, imode)};
5112 emit_vlmax_insn (code_for_pred_scalar (AND, mode), riscv_vector::BINARY_OP,
5113 ops22);
5115 x2 = expand_binop (mode, add_optab, and2, and22, NULL, true, OPTAB_DIRECT);
5117 /* x3 = (x2 + (x2 >> 4)) & 0x0f0f0f0f0f0f0f0fULL; */
5118 rtx shift3 = expand_binop (mode, lshr_optab, x2, GEN_INT (4), NULL, true,
5119 OPTAB_DIRECT);
5121 rtx plus3
5122 = expand_binop (mode, add_optab, x2, shift3, NULL, true, OPTAB_DIRECT);
5124 rtx ops3[] = {x3, plus3, gen_int_mode (mf, imode)};
5125 emit_vlmax_insn (code_for_pred_scalar (AND, mode), riscv_vector::BINARY_OP,
5126 ops3);
5128 /* dest = (x3 * 0x0101010101010101ULL) >> 56; */
5129 rtx mul4 = gen_reg_rtx (mode);
5130 rtx ops4[] = {mul4, x3, gen_int_mode (m1, imode)};
5131 emit_vlmax_insn (code_for_pred_scalar (MULT, mode), riscv_vector::BINARY_OP,
5132 ops4);
5134 x4 = expand_binop (mode, lshr_optab, mul4,
5135 GEN_INT (GET_MODE_BITSIZE (imode) - 8), NULL, true,
5136 OPTAB_DIRECT);
5138 emit_move_insn (dst, x4);
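/* Worked example of the bit counting above on the single byte 0xda
   (binary 11011010, five bits set):

     x1 = 0xda - ((0xda >> 1) & 0x55) = 0xda - 0x45 = 0x95      ; 2,1,1,1 per pair
     x2 = (0x95 & 0x33) + ((0x95 >> 2) & 0x33) = 0x11 + 0x21 = 0x32 ; 3,2 per nibble
     x3 = (0x32 + (0x32 >> 4)) & 0x0f = 0x05                    ; 5 per byte

   For wider element sizes the final multiply by 0x0101...01 sums the
   per-byte counts into the most significant byte, which the right shift
   by (element size - 8) bits then extracts.  */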
5141 /* Return true if it is VLMAX AVL TYPE. */
5142 bool
5143 vlmax_avl_type_p (rtx_insn *rinsn)
5145 extract_insn_cached (rinsn);
5146 int index = get_attr_avl_type_idx (rinsn);
5147 if (index == INVALID_ATTRIBUTE)
5148 return false;
5149 rtx avl_type = recog_data.operand[index];
5150 return INTVAL (avl_type) == VLMAX;
5153 /* Return true if it is an RVV instruction that depends on the VL global
5154 status register. */
5155 bool
5156 has_vl_op (rtx_insn *rinsn)
5158 return recog_memoized (rinsn) >= 0 && get_attr_has_vl_op (rinsn);
5161 /* Get default tail policy. */
5162 static bool
5163 get_default_ta ()
5165 /* For an instruction that doesn't require TA, we still need a default value
5166 to emit vsetvl. We pick the default value according to the preferred policy. */
5167 return (bool) (get_prefer_tail_policy () & 0x1
5168 || (get_prefer_tail_policy () >> 1 & 0x1));
5171 /* Helper function to get TA operand. */
5172 bool
5173 tail_agnostic_p (rtx_insn *rinsn)
5175 /* If it doesn't have TA, we return agnostic by default. */
5176 extract_insn_cached (rinsn);
5177 int ta = get_attr_ta (rinsn);
5178 return ta == INVALID_ATTRIBUTE ? get_default_ta () : IS_AGNOSTIC (ta);
5181 /* Change the insn and assert that the change always succeeds. */
5182 void
5183 validate_change_or_fail (rtx object, rtx *loc, rtx new_rtx, bool in_group)
5185 bool change_p = validate_change (object, loc, new_rtx, in_group);
5186 gcc_assert (change_p);
5189 /* Return true if it is NONVLMAX AVL TYPE. */
5190 bool
5191 nonvlmax_avl_type_p (rtx_insn *rinsn)
5193 extract_insn_cached (rinsn);
5194 int index = get_attr_avl_type_idx (rinsn);
5195 if (index == INVALID_ATTRIBUTE)
5196 return false;
5197 rtx avl_type = recog_data.operand[index];
5198 return INTVAL (avl_type) == NONVLMAX;
5201 /* Return true if RTX is RVV VLMAX AVL. */
5202 bool
5203 vlmax_avl_p (rtx x)
5205 return x && rtx_equal_p (x, RVV_VLMAX);
5208 /* Helper function to get SEW operand. We always have SEW value for
5209 all RVV instructions that have VTYPE OP. */
5210 uint8_t
5211 get_sew (rtx_insn *rinsn)
5213 return get_attr_sew (rinsn);
5216 /* Helper function to get VLMUL operand. We always have VLMUL value for
5217 all RVV instructions that have VTYPE OP. */
5218 enum vlmul_type
5219 get_vlmul (rtx_insn *rinsn)
5221 return (enum vlmul_type) get_attr_vlmul (rinsn);
5224 /* Count the number of occurrences of REGNO in RINSN. */
5226 count_regno_occurrences (rtx_insn *rinsn, unsigned int regno)
5228 int count = 0;
5229 extract_insn (rinsn);
5230 for (int i = 0; i < recog_data.n_operands; i++)
5231 if (refers_to_regno_p (regno, recog_data.operand[i]))
5232 count++;
5233 return count;
5236 /* Return true if the OP can be directly broadcasted. */
5237 bool
5238 can_be_broadcasted_p (rtx op)
5240 machine_mode mode = GET_MODE (op);
5241 /* We don't allow RA (register allocation) reload to generate
5242 (vec_duplicate:DI reg) on an RV32 system, whereas we do allow
5243 (vec_duplicate:DI mem) on an RV32 system. */
5244 if (!can_create_pseudo_p () && !FLOAT_MODE_P (mode)
5245 && maybe_gt (GET_MODE_SIZE (mode), GET_MODE_SIZE (Pmode))
5246 && !satisfies_constraint_Wdm (op))
5247 return false;
5249 if (satisfies_constraint_K (op) || register_operand (op, mode)
5250 || satisfies_constraint_Wdm (op) || rtx_equal_p (op, CONST0_RTX (mode)))
5251 return true;
5253 return can_create_pseudo_p () && nonmemory_operand (op, mode);
5256 void
5257 emit_vec_extract (rtx target, rtx src, rtx index)
5259 machine_mode vmode = GET_MODE (src);
5260 machine_mode smode = GET_MODE (target);
5261 class expand_operand ops[3];
5262 enum insn_code icode
5263 = convert_optab_handler (vec_extract_optab, vmode, smode);
5264 gcc_assert (icode != CODE_FOR_nothing);
5265 create_output_operand (&ops[0], target, smode);
5266 ops[0].target = 1;
5267 create_input_operand (&ops[1], src, vmode);
5269 poly_int64 val;
5270 if (poly_int_rtx_p (index, &val))
5271 create_integer_operand (&ops[2], val);
5272 else
5273 create_input_operand (&ops[2], index, Pmode);
5275 expand_insn (icode, 3, ops);
5276 if (ops[0].value != target)
5277 emit_move_insn (target, ops[0].value);
5280 /* Return true if the offset mode is a valid mode that we can use for
5281 gather/scatter autovectorization. */
5282 bool
5283 gather_scatter_valid_offset_p (machine_mode mode)
5285 /* If the element size of offset mode is already >= Pmode size,
5286 we don't need any extensions. */
5287 if (known_ge (GET_MODE_SIZE (GET_MODE_INNER (mode)), UNITS_PER_WORD))
5288 return true;
5290 /* Since we will very likely need to extend the offset mode into a vector
5291 Pmode, disable gather/scatter autovectorization if we can't extend the
5292 offset mode into vector Pmode. */
5293 if (!get_vector_mode (Pmode, GET_MODE_NUNITS (mode)).exists ())
5294 return false;
5295 return true;
5298 /* Implement TARGET_ESTIMATED_POLY_VALUE.
5299 Look into the tuning structure for an estimate.
5300 KIND specifies the type of requested estimate: min, max or likely.
5301 For cores with a known VLA width all three estimates are the same.
5302 For generic VLA tuning we want to distinguish the maximum estimate from
5303 the minimum and likely ones.
5304 The likely estimate is the same as the minimum in that case to give a
5305 conservative behavior of auto-vectorizing with VLA when it is a win
5306 even for VLA vectorization.
5307 When VLA width information is available VAL.coeffs[1] is multiplied by
5308 the number of VLA chunks over the initial VLS bits. */
5309 HOST_WIDE_INT
5310 estimated_poly_value (poly_int64 val, unsigned int kind)
5312 unsigned int width_source
5313 = BITS_PER_RISCV_VECTOR.is_constant ()
5314 ? (unsigned int) BITS_PER_RISCV_VECTOR.to_constant ()
5315 : (unsigned int) RVV_VECTOR_BITS_SCALABLE;
5317 /* If there is no core-specific information then the minimum and likely
5318 values are based on TARGET_MIN_VLEN vectors and the maximum is based on
5319 the architectural maximum of 65536 bits. */
5320 unsigned int min_vlen_bytes = TARGET_MIN_VLEN / 8 - 1;
5321 if (width_source == RVV_VECTOR_BITS_SCALABLE)
5322 switch (kind)
5324 case POLY_VALUE_MIN:
5325 case POLY_VALUE_LIKELY:
5326 return val.coeffs[0];
5328 case POLY_VALUE_MAX:
5329 return val.coeffs[0] + val.coeffs[1] * min_vlen_bytes;
5332 /* Allow BITS_PER_RISCV_VECTOR to be a bitmask of different VL, treating the
5333 lowest as likely. This could be made more general if future -mtune
5334 options need it to be. */
5335 if (kind == POLY_VALUE_MAX)
5336 width_source = 1 << floor_log2 (width_source);
5337 else
5338 width_source = least_bit_hwi (width_source);
5340 /* If the core provides width information, use that. */
5341 HOST_WIDE_INT over_min_vlen = width_source - TARGET_MIN_VLEN;
5342 return val.coeffs[0] + val.coeffs[1] * over_min_vlen / TARGET_MIN_VLEN;
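/* Example: with generic VLA tuning and TARGET_MIN_VLEN = 128, a poly value
   of (4, 4) yields a minimum/likely estimate of 4 and a maximum estimate of
   4 + 4 * (128 / 8 - 1) = 64.  */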
5345 /* Return true if it is a whole register-to-register move. */
5346 bool
5347 whole_reg_to_reg_move_p (rtx *ops, machine_mode mode, int avl_type_index)
5349 /* An operation is a whole-register move if either
5350 (1) Its vlmax operand equals VLMAX
5351 (2) Its vl operand equals the number of units of its mode. */
5352 if (register_operand (ops[0], mode)
5353 && register_operand (ops[3], mode)
5354 && satisfies_constraint_vu (ops[2])
5355 && satisfies_constraint_Wc1 (ops[1]))
5357 if (INTVAL (ops[avl_type_index]) == VLMAX)
5358 return true;
5359 /* AVL propagation PASS will transform FIXED-VLMAX with NUNITS < 32
5360 into NON-VLMAX with LEN = NUNITS. */
5361 else if (CONST_INT_P (ops[4])
5362 && known_eq (INTVAL (ops[4]), GET_MODE_NUNITS (mode)))
5363 return true;
5365 return false;
5368 /* Return true if we can transform vmv.v.x/vfmv.v.f to vmv.s.x/vfmv.s.f. */
5369 bool
5370 splat_to_scalar_move_p (rtx *ops)
5372 return satisfies_constraint_Wc1 (ops[1])
5373 && satisfies_constraint_vu (ops[2])
5374 && !MEM_P (ops[3])
5375 && satisfies_constraint_c01 (ops[4])
5376 && INTVAL (ops[7]) == NONVLMAX
5377 && known_ge (GET_MODE_SIZE (Pmode), GET_MODE_SIZE (GET_MODE (ops[3])));
5380 } // namespace riscv_vector