RISC-V: Disallow transformation into VLMAX AVL for cond_len_xxx when length is in...
[official-gcc.git] / gcc / config / riscv / riscv-v.cc
blob b4c7e0f0126eb68412628fccb8c8379c46427f5b
1 /* Subroutines used for code generation for RISC-V 'V' Extension for
2 GNU compiler.
3 Copyright (C) 2022-2023 Free Software Foundation, Inc.
4 Contributed by Juzhe Zhong (juzhe.zhong@rivai.ai), RiVAI Technologies Ltd.
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it
9 under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 3, or (at your option)
11 any later version.
13 GCC is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #define IN_TARGET_CODE 1
24 /* We have a maximum of 11 operands for RVV instruction patterns according to
25 the vector.md. */
26 #define RVV_INSN_OPERANDS_MAX 11
28 #include "config.h"
29 #include "system.h"
30 #include "coretypes.h"
31 #include "tm.h"
32 #include "backend.h"
33 #include "rtl.h"
34 #include "insn-config.h"
35 #include "insn-attr.h"
36 #include "recog.h"
37 #include "alias.h"
38 #include "tree.h"
39 #include "stringpool.h"
40 #include "attribs.h"
41 #include "explow.h"
42 #include "memmodel.h"
43 #include "emit-rtl.h"
44 #include "tm_p.h"
45 #include "target.h"
46 #include "targhooks.h"
47 #include "expr.h"
48 #include "optabs.h"
49 #include "tm-constrs.h"
50 #include "rtx-vector-builder.h"
51 #include "targhooks.h"
52 #include "predict.h"
54 using namespace riscv_vector;
56 namespace riscv_vector {
58 /* Return true if NUNITS <= 31 so that we can use an immediate AVL in vsetivli. */
59 bool
60 imm_avl_p (machine_mode mode)
62 poly_uint64 nunits = GET_MODE_NUNITS (mode);
64 return nunits.is_constant ()
65 /* The vsetivli can only encode an immediate AVL in the range 0~31. */
66 ? (IN_RANGE (nunits.to_constant (), 0, 31))
67 /* Only allowed in VLS-VLMAX mode. */
68 : false;
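/* Illustrative examples (assuming TARGET_MIN_VLEN == 128): a VLS mode such as
   V8SImode has a compile-time constant NUNITS of 8, which fits the vsetivli
   immediate, so imm_avl_p returns true.  A VLA mode such as RVVM1SImode has
   NUNITS = [4, 4], which is not a compile-time constant, so imm_avl_p
   returns false.  */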
71 /* Return true if LEN is equal to NUNITS and is out of the range [0, 31]. */
72 static bool
73 is_vlmax_len_p (machine_mode mode, rtx len)
75 poly_int64 value;
76 return poly_int_rtx_p (len, &value)
77 && known_eq (value, GET_MODE_NUNITS (mode))
78 && !satisfies_constraint_K (len);
81 /* Helper functions for insn_flags && insn_types */
83 /* Return true if the caller needs to pass a mask operand for an insn pattern
84 with INSN_FLAGS. */
86 static bool
87 need_mask_operand_p (unsigned insn_flags)
89 return (insn_flags & HAS_MASK_P)
90 && !(insn_flags & (USE_ONE_TRUE_MASK_P | USE_ALL_TRUES_MASK_P));
93 template <int MAX_OPERANDS> class insn_expander
95 public:
96 insn_expander () = delete;
98 insn_expander (unsigned insn_flags, bool vlmax_p)
99 : m_insn_flags (insn_flags), m_opno (0), m_vlmax_p (vlmax_p),
100 m_vl_op (NULL_RTX)
102 check_insn_flags ();
105 void check_insn_flags () const
107 if (m_insn_flags & USE_ONE_TRUE_MASK_P)
108 /* USE_ONE_TRUE_MASK_P is dependent on HAS_MASK_P. */
109 gcc_assert ((m_insn_flags & HAS_MASK_P));
111 if (m_insn_flags & USE_ALL_TRUES_MASK_P)
112 /* USE_ALL_TRUES_MASK_P is dependent on HAS_MASK_P. */
113 gcc_assert ((m_insn_flags & HAS_MASK_P));
115 /* USE_ONE_TRUE_MASK_P and USE_ALL_TRUES_MASK_P are mutually exclusive. */
116 gcc_assert (!((m_insn_flags & USE_ONE_TRUE_MASK_P)
117 && (m_insn_flags & USE_ALL_TRUES_MASK_P)));
119 if (m_insn_flags & USE_VUNDEF_MERGE_P)
120 /* USE_VUNDEF_MERGE_P is dependent on HAS_MERGE_P. */
121 gcc_assert ((m_insn_flags & HAS_MERGE_P));
123 /* TU_POLICY_P and TDEFAULT_POLICY_P are mutually exclusive. */
124 gcc_assert (
125 !((m_insn_flags & TU_POLICY_P) && (m_insn_flags & TDEFAULT_POLICY_P)));
127 /* MU_POLICY_P and MDEFAULT_POLICY_P are mutually exclusive. */
128 gcc_assert (
129 !((m_insn_flags & MU_POLICY_P) && (m_insn_flags & MDEFAULT_POLICY_P)));
131 /* NULLARY_OP_P, UNARY_OP_P, BINARY_OP_P, TERNARY_OP_P are mutually
132 exclusive. */
133 gcc_assert (
134 !((m_insn_flags & NULLARY_OP_P)
135 && ((m_insn_flags & UNARY_OP_P) || (m_insn_flags & BINARY_OP_P)
136 || (m_insn_flags & TERNARY_OP_P))));
137 gcc_assert (
138 !((m_insn_flags & UNARY_OP_P)
139 && ((m_insn_flags & NULLARY_OP_P) || (m_insn_flags & BINARY_OP_P)
140 || (m_insn_flags & TERNARY_OP_P))));
141 gcc_assert (
142 !((m_insn_flags & BINARY_OP_P)
143 && ((m_insn_flags & NULLARY_OP_P) || (m_insn_flags & UNARY_OP_P)
144 || (m_insn_flags & TERNARY_OP_P))));
145 gcc_assert (
146 !((m_insn_flags & TERNARY_OP_P)
147 && ((m_insn_flags & NULLARY_OP_P) || (m_insn_flags & UNARY_OP_P)
148 || (m_insn_flags & BINARY_OP_P))));
151 void set_vl (rtx vl) { m_vl_op = vl; }
153 void add_output_operand (rtx x, machine_mode mode)
155 create_output_operand (&m_ops[m_opno++], x, mode);
156 gcc_assert (m_opno <= MAX_OPERANDS);
158 void add_input_operand (rtx x, machine_mode mode)
160 create_input_operand (&m_ops[m_opno++], x, mode);
161 gcc_assert (m_opno <= MAX_OPERANDS);
163 void add_all_one_mask_operand (machine_mode mask_mode)
165 add_input_operand (CONSTM1_RTX (mask_mode), mask_mode);
167 void add_first_one_true_mask_operand (machine_mode mask_mode)
169 add_input_operand (gen_scalar_move_mask (mask_mode), mask_mode);
171 void add_vundef_operand (machine_mode dest_mode)
173 add_input_operand (RVV_VUNDEF (dest_mode), dest_mode);
175 void add_policy_operand ()
177 if (m_insn_flags & TU_POLICY_P)
179 rtx tail_policy_rtx = gen_int_mode (TAIL_UNDISTURBED, Pmode);
180 add_input_operand (tail_policy_rtx, Pmode);
182 else if (m_insn_flags & TDEFAULT_POLICY_P)
184 rtx tail_policy_rtx = gen_int_mode (get_prefer_tail_policy (), Pmode);
185 add_input_operand (tail_policy_rtx, Pmode);
188 if (m_insn_flags & MU_POLICY_P)
190 rtx mask_policy_rtx = gen_int_mode (MASK_UNDISTURBED, Pmode);
191 add_input_operand (mask_policy_rtx, Pmode);
193 else if (m_insn_flags & MDEFAULT_POLICY_P)
195 rtx mask_policy_rtx = gen_int_mode (get_prefer_mask_policy (), Pmode);
196 add_input_operand (mask_policy_rtx, Pmode);
199 void add_avl_type_operand (avl_type type)
201 add_input_operand (gen_int_mode (type, Pmode), Pmode);
204 void
205 add_rounding_mode_operand (enum floating_point_rounding_mode rounding_mode)
207 rtx frm_rtx = gen_int_mode (rounding_mode, Pmode);
208 add_input_operand (frm_rtx, Pmode);
211 /* Return the vtype mode based on insn_flags.
212 The vtype mode means the mode that the vsetvl insn sets. */
213 machine_mode
214 get_vtype_mode (rtx *ops)
216 machine_mode vtype_mode;
217 if (m_insn_flags & VTYPE_MODE_FROM_OP1_P)
218 vtype_mode = GET_MODE (ops[1]);
219 else
220 vtype_mode = GET_MODE (ops[0]);
221 return vtype_mode;
224 void emit_insn (enum insn_code icode, rtx *ops)
226 int opno = 0;
227 int num_ops;
228 /* It's true if any operand is memory operand. */
229 bool any_mem_p = false;
231 machine_mode vtype_mode = get_vtype_mode (ops);
232 machine_mode mask_mode = get_mask_mode (vtype_mode);
234 /* Add dest operand. */
235 if (m_insn_flags & HAS_DEST_P)
237 rtx op = ops[opno++];
238 any_mem_p |= MEM_P (op);
239 add_output_operand (op, GET_MODE (op));
242 /* Add mask operand. */
243 if (m_insn_flags & USE_ONE_TRUE_MASK_P)
244 add_first_one_true_mask_operand (mask_mode);
245 else if (m_insn_flags & USE_ALL_TRUES_MASK_P)
246 add_all_one_mask_operand (mask_mode);
247 else if (m_insn_flags & HAS_MASK_P)
249 machine_mode mode = insn_data[(int) icode].operand[m_opno].mode;
250 gcc_assert (mode != VOIDmode);
251 add_input_operand (ops[opno++], mode);
254 /* Add merge operand. */
255 if (m_insn_flags & USE_VUNDEF_MERGE_P)
256 /* Same as dest operand. */
257 add_vundef_operand (GET_MODE (ops[0]));
258 else if (m_insn_flags & HAS_MERGE_P)
260 machine_mode mode = insn_data[(int) icode].operand[m_opno].mode;
261 gcc_assert (mode != VOIDmode);
262 add_input_operand (ops[opno++], mode);
265 if (m_insn_flags & NULLARY_OP_P)
266 num_ops = 0;
267 else if (m_insn_flags & UNARY_OP_P)
268 num_ops = 1;
269 else if (m_insn_flags & BINARY_OP_P)
270 num_ops = 2;
271 else if (m_insn_flags & TERNARY_OP_P)
272 num_ops = 3;
273 else
274 gcc_unreachable ();
276 /* Add the remaining operands. */
277 for (; num_ops; num_ops--, opno++)
279 any_mem_p |= MEM_P (ops[opno]);
280 machine_mode mode = insn_data[(int) icode].operand[m_opno].mode;
281 /* create_input_operand doesn't allow VOIDmode.
282 According to vector.md, we may have some patterns that do not have an
283 explicit machine mode specifying the operand. Such operands are
284 always Pmode. */
285 if (mode == VOIDmode)
286 mode = Pmode;
287 else
288 /* Early assertion ensures same mode since maybe_legitimize_operand
289 will check this. */
290 gcc_assert (GET_MODE (ops[opno]) == VOIDmode
291 || GET_MODE (ops[opno]) == mode);
293 add_input_operand (ops[opno], mode);
296 /* Add vl operand. */
297 rtx len = m_vl_op;
298 bool vls_p = false;
299 if (m_vlmax_p)
301 if (riscv_v_ext_vls_mode_p (vtype_mode))
303 /* VLS modes always set VSETVL by
304 "vsetvl zero, rs1/imm". */
305 poly_uint64 nunits = GET_MODE_NUNITS (vtype_mode);
306 len = gen_int_mode (nunits, Pmode);
307 vls_p = true;
309 else if (can_create_pseudo_p ())
311 len = gen_reg_rtx (Pmode);
312 emit_vlmax_vsetvl (vtype_mode, len);
316 gcc_assert (len != NULL_RTX);
317 add_input_operand (len, Pmode);
319 /* Add tail and mask policy operands. */
320 add_policy_operand ();
322 /* Add avl_type operand. */
323 add_avl_type_operand (
324 vls_p ? avl_type::VLS
325 : (m_vlmax_p ? avl_type::VLMAX : avl_type::NONVLMAX));
327 /* Add rounding mode operand. */
328 if (m_insn_flags & FRM_DYN_P)
329 add_rounding_mode_operand (FRM_DYN);
330 else if (m_insn_flags & FRM_RUP_P)
331 add_rounding_mode_operand (FRM_RUP);
332 else if (m_insn_flags & FRM_RDN_P)
333 add_rounding_mode_operand (FRM_RDN);
334 else if (m_insn_flags & FRM_RMM_P)
335 add_rounding_mode_operand (FRM_RMM);
336 else if (m_insn_flags & FRM_RNE_P)
337 add_rounding_mode_operand (FRM_RNE);
339 gcc_assert (insn_data[(int) icode].n_operands == m_opno);
340 expand (icode, any_mem_p);
343 void expand (enum insn_code icode, bool temporary_volatile_p = false)
345 if (temporary_volatile_p)
347 temporary_volatile_ok v (true);
348 expand_insn (icode, m_opno, m_ops);
350 else
351 expand_insn (icode, m_opno, m_ops);
354 private:
355 unsigned m_insn_flags;
356 int m_opno;
357 bool m_vlmax_p;
358 rtx m_vl_op;
359 expand_operand m_ops[MAX_OPERANDS];
362 /* Emit an RVV insn with a vector length that equals the number of units of the
363 vector mode. For VLA modes this corresponds to VLMAX.
365 Unless the vector length can be encoded in the vsetivl[i] instruction this
366 function must only be used as long as we can create pseudo registers. This is
367 because it will set a pseudo register to VLMAX using vsetvl and use this as
368 definition for the vector length. */
369 void
370 emit_vlmax_insn (unsigned icode, unsigned insn_flags, rtx *ops)
372 insn_expander<RVV_INSN_OPERANDS_MAX> e (insn_flags, true);
373 gcc_assert (can_create_pseudo_p () || imm_avl_p (e.get_vtype_mode (ops)));
375 e.emit_insn ((enum insn_code) icode, ops);
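/* A typical use of emit_vlmax_insn, as in the expanders below (illustrative
   sketch only):

     rtx ops[] = {target, src1, src2};
     emit_vlmax_insn (code_for_pred (PLUS, mode), BINARY_OP, ops);

   which emits a predicated vadd whose AVL is the VLMAX of MODE, with the
   mask, merge, policy and avl_type operands filled in by the expander.  */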
378 /* Like emit_vlmax_insn but must only be used when we cannot create pseudo
379 registers anymore. This function, however, takes a predefined vector length
380 from the value in VL. */
381 void
382 emit_vlmax_insn_lra (unsigned icode, unsigned insn_flags, rtx *ops, rtx vl)
384 gcc_assert (!can_create_pseudo_p ());
385 machine_mode mode = GET_MODE (ops[0]);
387 if (imm_avl_p (mode))
389 /* Even though VL is a real hardreg already allocated since
390 we are post-RA now, we still gain a benefit by emitting
391 vsetivli zero, imm instead of vsetvli VL, zero, since it
392 allows more flexibility in post-RA instruction scheduling. */
393 insn_expander<RVV_INSN_OPERANDS_MAX> e (insn_flags, false);
394 e.set_vl (gen_int_mode (GET_MODE_NUNITS (mode), Pmode));
395 e.emit_insn ((enum insn_code) icode, ops);
397 else
399 insn_expander<RVV_INSN_OPERANDS_MAX> e (insn_flags, true);
400 e.set_vl (vl);
401 e.emit_insn ((enum insn_code) icode, ops);
405 /* Emit an RVV insn with a predefined vector length. Contrary to
406 emit_vlmax_insn the instruction's vector length is not deduced from its mode
407 but taken from the value in VL. */
408 void
409 emit_nonvlmax_insn (unsigned icode, unsigned insn_flags, rtx *ops, rtx vl)
411 insn_expander<RVV_INSN_OPERANDS_MAX> e (insn_flags, false);
412 e.set_vl (vl);
413 e.emit_insn ((enum insn_code) icode, ops);
416 class rvv_builder : public rtx_vector_builder
418 public:
419 rvv_builder () : rtx_vector_builder () {}
420 rvv_builder (machine_mode mode, unsigned int npatterns,
421 unsigned int nelts_per_pattern)
422 : rtx_vector_builder (mode, npatterns, nelts_per_pattern)
424 m_inner_mode = GET_MODE_INNER (mode);
425 m_inner_bits_size = GET_MODE_BITSIZE (m_inner_mode);
426 m_inner_bytes_size = GET_MODE_SIZE (m_inner_mode);
427 m_mask_mode = get_mask_mode (mode);
429 gcc_assert (
430 int_mode_for_size (inner_bits_size (), 0).exists (&m_inner_int_mode));
431 m_int_mode
432 = get_vector_mode (m_inner_int_mode, GET_MODE_NUNITS (mode)).require ();
435 bool can_duplicate_repeating_sequence_p ();
436 rtx get_merged_repeating_sequence ();
438 bool repeating_sequence_use_merge_profitable_p ();
439 bool combine_sequence_use_slideup_profitable_p ();
440 bool combine_sequence_use_merge_profitable_p ();
441 rtx get_merge_scalar_mask (unsigned int, machine_mode) const;
443 bool single_step_npatterns_p () const;
444 bool npatterns_all_equal_p () const;
445 bool interleaved_stepped_npatterns_p () const;
446 bool npatterns_vid_diff_repeated_p () const;
448 machine_mode new_mode () const { return m_new_mode; }
449 scalar_mode inner_mode () const { return m_inner_mode; }
450 scalar_int_mode inner_int_mode () const { return m_inner_int_mode; }
451 machine_mode mask_mode () const { return m_mask_mode; }
452 machine_mode int_mode () const { return m_int_mode; }
453 unsigned int inner_bits_size () const { return m_inner_bits_size; }
454 unsigned int inner_bytes_size () const { return m_inner_bytes_size; }
456 private:
457 scalar_mode m_inner_mode;
458 scalar_int_mode m_inner_int_mode;
459 machine_mode m_new_mode;
460 scalar_int_mode m_new_inner_mode;
461 machine_mode m_mask_mode;
462 machine_mode m_int_mode;
463 unsigned int m_inner_bits_size;
464 unsigned int m_inner_bytes_size;
467 /* Return true if the vector can be built by duplicating a super element which
468 is the fusion of consecutive elements.
470 v = { a, b, a, b } super element = ab, v = { ab, ab } */
471 bool
472 rvv_builder::can_duplicate_repeating_sequence_p ()
474 poly_uint64 new_size = exact_div (full_nelts (), npatterns ());
475 unsigned int new_inner_size = m_inner_bits_size * npatterns ();
476 if (!int_mode_for_size (new_inner_size, 0).exists (&m_new_inner_mode)
477 || GET_MODE_SIZE (m_new_inner_mode) > UNITS_PER_WORD
478 || !get_vector_mode (m_new_inner_mode, new_size).exists (&m_new_mode))
479 return false;
480 if (full_nelts ().is_constant ())
481 return repeating_sequence_p (0, full_nelts ().to_constant (), npatterns ());
482 return nelts_per_pattern () == 1;
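/* Illustrative example: for a 16-bit element vector v = { a, b, a, b, ... }
   with NPATTERNS = 2, the fused super element is the 32-bit value
   (b << 16) | a, so the whole sequence can be emitted as one broadcast of
   that 32-bit constant in a vector mode with half as many units.  */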
485 /* Return true if it is a repeating sequence for which the
486 merge approach gives better codegen than the default
487 approach (slide1down).
489 Sequence A:
490 {a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b}
492 nelts = 16
493 npatterns = 2
495 for merging a we need mask 101010....
496 for merging b we need mask 010101....
498 For each element in the npattern, we need to build a mask in a scalar register.
499 Mostly we need 3 instructions (aka COST = 3), which consist of 2 scalar
500 instructions and 1 scalar move to the v0 register. Finally we need a vector
501 merge to merge them.
503 lui a5, #imm
504 add a5, #imm
505 vmov.s.x v0, a5
506 vmerge.vxm v9, v9, a1, v0
508 So the overall (roughly) COST of Sequence A = (3 + 1) * npatterns = 8.
509 If we use slide1down, the COST = nelts = 16 > 8 (COST of merge).
510 So return true in this case as it is profitable.
512 Sequence B:
513 {a, b, c, d, e, f, g, h, a, b, c, d, e, f, g, h}
515 nelts = 16
516 npatterns = 8
518 COST of merge approach = (3 + 1) * npatterns = 32
519 COST of slide1down approach = nelts = 16
520 Return false in this case as the merge approach is NOT profitable.
522 bool
523 rvv_builder::repeating_sequence_use_merge_profitable_p ()
525 if (inner_bytes_size () > UNITS_PER_WORD)
526 return false;
528 unsigned int nelts = full_nelts ().to_constant ();
530 if (!repeating_sequence_p (0, nelts, npatterns ()))
531 return false;
533 unsigned int merge_cost = 1;
534 unsigned int build_merge_mask_cost = 3;
535 unsigned int slide1down_cost = nelts;
537 return (build_merge_mask_cost + merge_cost) * npatterns () < slide1down_cost;
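/* Worked cost check (illustrative): for { a, b, c, d, a, b, c, d, ... } with
   nelts = 16 and npatterns = 4, the merge cost is (3 + 1) * 4 = 16, which is
   not strictly less than the slide1down cost of 16, so we return false and
   keep the default slide1down expansion.  */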
540 /* Return true if it's worthwhile to use slideup combine 2 vectors. */
541 bool
542 rvv_builder::combine_sequence_use_slideup_profitable_p ()
544 int nelts = full_nelts ().to_constant ();
545 int leading_ndups = this->count_dups (0, nelts - 1, 1);
546 int trailing_ndups = this->count_dups (nelts - 1, -1, -1);
548 /* ??? The current heuristic is that we combine 2 vectors
549 by slideup when:
550 1. # of leading identical elements is equal to # of trailing identical elements.
551 2. Both of the above are equal to nelts / 2.
552 Otherwise, it is not profitable. */
553 return leading_ndups == trailing_ndups && trailing_ndups == nelts / 2;
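/* E.g. (illustrative) for nelts = 8: { a, a, a, a, b, b, b, b } has four
   leading and four trailing identical elements, both equal to nelts / 2, so
   slideup is considered profitable; { a, a, a, b, b, b, b, b } is not.  */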
556 /* Return true if it's worthwhile to use merge to combine a vector with a scalar. */
557 bool
558 rvv_builder::combine_sequence_use_merge_profitable_p ()
560 int nelts = full_nelts ().to_constant ();
561 int leading_ndups = this->count_dups (0, nelts - 1, 1);
562 int trailing_ndups = this->count_dups (nelts - 1, -1, -1);
563 int nregs = riscv_get_v_regno_alignment (int_mode ());
565 if (leading_ndups + trailing_ndups != nelts)
566 return false;
568 /* If the number of leading elements is > 255, which exceeds the maximum
569 value of QImode, we will need to use HImode. */
570 machine_mode mode;
571 if (leading_ndups > 255 || nregs > 2)
573 if (!get_vector_mode (HImode, nelts).exists (&mode))
574 return false;
575 /* We will need one more AVL/VL toggling vsetvl instruction. */
576 return leading_ndups > 4 && trailing_ndups > 4;
579 /* { a, a, a, b, b, ... , b } and { b, b, b, a, a, ... , a }
580 consume 3 slide instructions. */
581 return leading_ndups > 3 && trailing_ndups > 3;
584 /* Merge the repeating sequence into a single element and return the RTX. */
586 rvv_builder::get_merged_repeating_sequence ()
588 scalar_int_mode mode = Pmode;
589 rtx target = gen_reg_rtx (mode);
590 emit_move_insn (target, const0_rtx);
591 rtx imm = gen_int_mode ((1ULL << m_inner_bits_size) - 1, mode);
592 /* { a, b, a, b }: Generate duplicate element = b << bits | a. */
593 for (unsigned int i = 0; i < npatterns (); i++)
595 unsigned int loc = m_inner_bits_size * i;
596 rtx shift = gen_int_mode (loc, mode);
597 rtx ele = gen_lowpart (mode, elt (i));
598 rtx tmp = expand_simple_binop (mode, AND, ele, imm, NULL_RTX, false,
599 OPTAB_DIRECT);
600 rtx tmp2 = expand_simple_binop (mode, ASHIFT, tmp, shift, NULL_RTX, false,
601 OPTAB_DIRECT);
602 rtx tmp3 = expand_simple_binop (mode, IOR, tmp2, target, NULL_RTX, false,
603 OPTAB_DIRECT);
604 emit_move_insn (target, tmp3);
606 if (GET_MODE_SIZE (m_new_inner_mode) < UNITS_PER_WORD)
607 return gen_lowpart (m_new_inner_mode, target);
608 return target;
611 /* Get the mask for the merge approach.
613 Consider the following case:
614 {a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b}
615 To merge "a", the mask should be 1010....
616 To merge "b", the mask should be 0101....
619 rvv_builder::get_merge_scalar_mask (unsigned int index_in_pattern,
620 machine_mode inner_mode) const
622 unsigned HOST_WIDE_INT mask = 0;
623 unsigned HOST_WIDE_INT base_mask = (1ULL << index_in_pattern);
624 /* Here we construct a mask pattern that will later be broadcast
625 to a vector register. The maximum broadcast size for vmv.v.x/vmv.s.x
626 is determined by the length of a vector element (ELEN) and not by
627 XLEN so make sure we do not exceed it. One example is -march=zve32*
628 which mandates ELEN == 32 but can be combined with -march=rv64
629 with XLEN == 64. */
630 unsigned int elen = TARGET_VECTOR_ELEN_64 ? 64 : 32;
632 gcc_assert (elen % npatterns () == 0);
634 int limit = elen / npatterns ();
636 for (int i = 0; i < limit; i++)
637 mask |= base_mask << (i * npatterns ());
639 return gen_int_mode (mask, inner_mode);
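/* Worked example (illustrative, assuming ELEN == 32): with NPATTERNS = 2,
   the limit is 32 / 2 = 16 iterations, so INDEX_IN_PATTERN = 0 yields the
   mask 0x55555555 (every even element selected) and INDEX_IN_PATTERN = 1
   yields 0xaaaaaaaa.  */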
642 /* Return true if the variable-length vector is single step.
643 Single step means the steps of all patterns in NPATTERNS are equal.
644 Consider this following case:
646 CASE 1: NPATTERNS = 2, NELTS_PER_PATTERN = 3.
647 { 0, 2, 2, 4, 4, 6, ... }
648 First pattern: step1 = 2 - 0 = 2
649 step2 = 4 - 2 = 2
650 Second pattern: step1 = 4 - 2 = 2
651 step2 = 6 - 4 = 2
652 Since all steps of NPATTERNS are equal step = 2.
653 Return true in this case.
655 CASE 2: NPATTERNS = 2, NELTS_PER_PATTERN = 3.
656 { 0, 1, 2, 4, 4, 7, ... }
657 First pattern: step1 = 2 - 0 = 2
658 step2 = 4 - 2 = 2
659 Second pattern: step1 = 4 - 1 = 3
660 step2 = 7 - 4 = 3
661 Since not all steps are equal, return false. */
662 bool
663 rvv_builder::single_step_npatterns_p () const
665 if (nelts_per_pattern () != 3)
666 return false;
668 poly_int64 step
669 = rtx_to_poly_int64 (elt (npatterns ())) - rtx_to_poly_int64 (elt (0));
670 for (unsigned int i = 0; i < npatterns (); i++)
672 poly_int64 ele0 = rtx_to_poly_int64 (elt (i));
673 poly_int64 ele1 = rtx_to_poly_int64 (elt (npatterns () + i));
674 poly_int64 ele2 = rtx_to_poly_int64 (elt (npatterns () * 2 + i));
675 poly_int64 diff1 = ele1 - ele0;
676 poly_int64 diff2 = ele2 - ele1;
677 if (maybe_ne (step, diff1) || maybe_ne (step, diff2))
678 return false;
680 return true;
683 /* Return true if the diff between const vector and vid sequence
684 is repeated. For example as below cases:
685 The diff means the const vector - vid.
686 CASE 1:
687 CONST VECTOR: {3, 2, 1, 0, 7, 6, 5, 4, ... }
688 VID : {0, 1, 2, 3, 4, 5, 6, 7, ... }
689 DIFF(MINUS) : {3, 1,-1,-3, 3, 1,-1,-3, ... }
690 The diff sequence {3, 1,-1,-3} is repeated in the npattern, so we
691 return TRUE for case 1.
693 CASE 2:
694 CONST VECTOR: {-4, 4,-3, 5,-2, 6,-1, 7, ...}
695 VID : { 0, 1, 2, 3, 4, 5, 6, 7, ... }
696 DIFF(MINUS) : {-4, 3,-5, 2,-6, 1,-7, 0, ... }
697 The diff sequence {-4, 3} is not repeated in the npattern, so we
698 return FALSE for case 2. */
699 bool
700 rvv_builder::npatterns_vid_diff_repeated_p () const
702 if (nelts_per_pattern () != 3)
703 return false;
704 else if (npatterns () == 0)
705 return false;
707 for (unsigned i = 0; i < npatterns (); i++)
709 poly_int64 diff_0 = rtx_to_poly_int64 (elt (i)) - i;
710 poly_int64 diff_1
711 = rtx_to_poly_int64 (elt (npatterns () + i)) - npatterns () - i;
713 if (maybe_ne (diff_0, diff_1))
714 return false;
717 return true;
720 /* Return true if the permutation consists of two
721 interleaved patterns with a constant step each.
722 TODO: We currently only support NPATTERNS = 2. */
723 bool
724 rvv_builder::interleaved_stepped_npatterns_p () const
726 if (npatterns () != 2 || nelts_per_pattern () != 3)
727 return false;
728 for (unsigned int i = 0; i < npatterns (); i++)
730 poly_int64 ele0 = rtx_to_poly_int64 (elt (i));
731 poly_int64 ele1 = rtx_to_poly_int64 (elt (npatterns () + i));
732 poly_int64 ele2 = rtx_to_poly_int64 (elt (npatterns () * 2 + i));
733 poly_int64 diff1 = ele1 - ele0;
734 poly_int64 diff2 = ele2 - ele1;
735 if (maybe_ne (diff1, diff2))
736 return false;
738 return true;
741 /* Return true if all elements of NPATTERNS are equal.
743 E.g. NPATTERNS = 4:
744 { 2, 2, 2, 2, 4, 4, 4, 4, 8, 8, 8, 8, 16, 16, 16, 16, ... }
745 E.g. NPATTERNS = 8:
746 { 2, 2, 2, 2, 2, 2, 2, 2, 8, 8, 8, 8, 8, 8, 8, 8, ... }
747 We only check whether ele[0] ~ ele[NPATTERNS - 1] are the same.
748 We don't need to check elements[n] with n >= NPATTERNS since
749 they don't belong to the same pattern.
751 bool
752 rvv_builder::npatterns_all_equal_p () const
754 poly_int64 ele0 = rtx_to_poly_int64 (elt (0));
755 for (unsigned int i = 1; i < npatterns (); i++)
757 poly_int64 ele = rtx_to_poly_int64 (elt (i));
758 if (!known_eq (ele, ele0))
759 return false;
761 return true;
764 static unsigned
765 get_sew (machine_mode mode)
767 unsigned int sew = GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
768 ? 8
769 : GET_MODE_BITSIZE (GET_MODE_INNER (mode));
770 return sew;
773 /* Return true if X is a const_vector with all duplicate elements, and that
774 element is in the range between MINVAL and MAXVAL. */
775 bool
776 const_vec_all_same_in_range_p (rtx x, HOST_WIDE_INT minval,
777 HOST_WIDE_INT maxval)
779 rtx elt;
780 return (const_vec_duplicate_p (x, &elt) && CONST_INT_P (elt)
781 && IN_RANGE (INTVAL (elt), minval, maxval));
784 /* Return true if VEC is a constant in which every element is in the range
785 [MINVAL, MAXVAL]. The elements do not need to have the same value.
787 This function also exists in aarch64, we may unify it in middle-end in the
788 future. */
790 static bool
791 const_vec_all_in_range_p (rtx vec, poly_int64 minval, poly_int64 maxval)
793 if (!CONST_VECTOR_P (vec)
794 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
795 return false;
797 int nunits;
798 if (!CONST_VECTOR_STEPPED_P (vec))
799 nunits = const_vector_encoded_nelts (vec);
800 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
801 return false;
803 for (int i = 0; i < nunits; i++)
805 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
806 poly_int64 value;
807 if (!poly_int_rtx_p (vec_elem, &value)
808 || maybe_lt (value, minval)
809 || maybe_gt (value, maxval))
810 return false;
812 return true;
815 /* Return a const vector of VAL. The VAL can be either const_int or
816 const_poly_int. */
818 static rtx
819 gen_const_vector_dup (machine_mode mode, poly_int64 val)
821 scalar_mode smode = GET_MODE_INNER (mode);
822 rtx c = gen_int_mode (val, smode);
823 if (!val.is_constant () && GET_MODE_SIZE (smode) > GET_MODE_SIZE (Pmode))
825 /* When VAL is const_poly_int value, we need to explicitly broadcast
826 it into a vector using RVV broadcast instruction. */
827 return expand_vector_broadcast (mode, c);
829 return gen_const_vec_duplicate (mode, c);
832 /* Emit a vlmax vsetvl instruction. This should only be used when
833 optimization is disabled or after vsetvl insertion pass. */
834 void
835 emit_hard_vlmax_vsetvl (machine_mode vmode, rtx vl)
837 unsigned int sew = get_sew (vmode);
838 emit_insn (gen_vsetvl (Pmode, vl, RVV_VLMAX, gen_int_mode (sew, Pmode),
839 gen_int_mode (get_vlmul (vmode), Pmode), const0_rtx,
840 const0_rtx));
843 void
844 emit_vlmax_vsetvl (machine_mode vmode, rtx vl)
846 unsigned int sew = get_sew (vmode);
847 enum vlmul_type vlmul = get_vlmul (vmode);
848 unsigned int ratio = calculate_ratio (sew, vlmul);
850 if (!optimize)
851 emit_hard_vlmax_vsetvl (vmode, vl);
852 else
853 emit_insn (gen_vlmax_avl (Pmode, vl, gen_int_mode (ratio, Pmode)));
856 /* Calculate SEW/LMUL ratio. */
857 unsigned int
858 calculate_ratio (unsigned int sew, enum vlmul_type vlmul)
860 unsigned int ratio;
861 switch (vlmul)
863 case LMUL_1:
864 ratio = sew;
865 break;
866 case LMUL_2:
867 ratio = sew / 2;
868 break;
869 case LMUL_4:
870 ratio = sew / 4;
871 break;
872 case LMUL_8:
873 ratio = sew / 8;
874 break;
875 case LMUL_F8:
876 ratio = sew * 8;
877 break;
878 case LMUL_F4:
879 ratio = sew * 4;
880 break;
881 case LMUL_F2:
882 ratio = sew * 2;
883 break;
884 default:
885 gcc_unreachable ();
887 return ratio;
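/* E.g. (illustrative): SEW = 32 with LMUL_2 gives ratio = 32 / 2 = 16, while
   SEW = 16 with the fractional LMUL_F2 gives ratio = 16 * 2 = 32.  Modes
   sharing the same SEW/LMUL ratio have the same VLMAX and can therefore
   share one vsetvl setting.  */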
890 /* SCALABLE means that the vector-length is agnostic (run-time invariant and
891 compile-time unknown). FIXED means that the vector-length is specific
892 (compile-time known). Both RVV_SCALABLE and RVV_FIXED_VLMAX do
893 auto-vectorization using the VLMAX vsetvl configuration. */
894 static bool
895 autovec_use_vlmax_p (void)
897 return (riscv_autovec_preference == RVV_SCALABLE
898 || riscv_autovec_preference == RVV_FIXED_VLMAX);
901 /* This function emits VLMAX vrgather instruction. Emit vrgather.vx/vi when sel
902 is a const duplicate vector. Otherwise, emit vrgather.vv. */
903 static void
904 emit_vlmax_gather_insn (rtx target, rtx op, rtx sel)
906 rtx elt;
907 insn_code icode;
908 machine_mode data_mode = GET_MODE (target);
909 machine_mode sel_mode = GET_MODE (sel);
910 if (const_vec_duplicate_p (sel, &elt))
912 icode = code_for_pred_gather_scalar (data_mode);
913 sel = elt;
915 else if (maybe_ne (GET_MODE_SIZE (data_mode), GET_MODE_SIZE (sel_mode)))
916 icode = code_for_pred_gatherei16 (data_mode);
917 else
918 icode = code_for_pred_gather (data_mode);
919 rtx ops[] = {target, op, sel};
920 emit_vlmax_insn (icode, BINARY_OP, ops);
923 static void
924 emit_vlmax_masked_gather_mu_insn (rtx target, rtx op, rtx sel, rtx mask)
926 rtx elt;
927 insn_code icode;
928 machine_mode data_mode = GET_MODE (target);
929 machine_mode sel_mode = GET_MODE (sel);
930 if (const_vec_duplicate_p (sel, &elt))
932 icode = code_for_pred_gather_scalar (data_mode);
933 sel = elt;
935 else if (maybe_ne (GET_MODE_SIZE (data_mode), GET_MODE_SIZE (sel_mode)))
936 icode = code_for_pred_gatherei16 (data_mode);
937 else
938 icode = code_for_pred_gather (data_mode);
939 rtx ops[] = {target, mask, target, op, sel};
940 emit_vlmax_insn (icode, BINARY_OP_TAMU, ops);
943 /* According to RVV ISA spec (16.5.1. Synthesizing vdecompress):
944 https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc
946 There is no inverse vdecompress provided, as this operation can be readily
947 synthesized using iota and a masked vrgather:
949 Desired functionality of 'vdecompress'
950 7 6 5 4 3 2 1 0 # vid
952 e d c b a # packed vector of 5 elements
953 1 0 0 1 1 1 0 1 # mask vector of 8 elements
954 p q r s t u v w # destination register before vdecompress
956 e q r d c b v a # result of vdecompress
957 # v0 holds mask
958 # v1 holds packed data
959 # v11 holds input expanded vector and result
960 viota.m v10, v0 # Calc iota from mask in v0
961 vrgather.vv v11, v1, v10, v0.t # Expand into destination
962 p q r s t u v w # v11 destination register
963 e d c b a # v1 source vector
964 1 0 0 1 1 1 0 1 # v0 mask vector
966 4 4 4 3 2 1 1 0 # v10 result of viota.m
967 e q r d c b v a # v11 destination after vrgather using viota.m under mask
969 static void
970 emit_vlmax_decompress_insn (rtx target, rtx op0, rtx op1, rtx mask)
972 machine_mode data_mode = GET_MODE (target);
973 machine_mode sel_mode = related_int_vector_mode (data_mode).require ();
974 if (GET_MODE_INNER (data_mode) == QImode)
975 sel_mode = get_vector_mode (HImode, GET_MODE_NUNITS (data_mode)).require ();
977 rtx sel = gen_reg_rtx (sel_mode);
978 rtx iota_ops[] = {sel, mask};
979 emit_vlmax_insn (code_for_pred_iota (sel_mode), UNARY_OP, iota_ops);
980 emit_vlmax_gather_insn (target, op0, sel);
981 emit_vlmax_masked_gather_mu_insn (target, op1, sel, mask);
984 /* Emit merge instruction. */
986 static machine_mode
987 get_repeating_sequence_dup_machine_mode (const rvv_builder &builder,
988 machine_mode mask_bit_mode)
990 unsigned mask_precision = GET_MODE_PRECISION (mask_bit_mode).to_constant ();
991 unsigned mask_scalar_size = mask_precision > builder.inner_bits_size ()
992 ? builder.inner_bits_size () : mask_precision;
994 scalar_mode inner_mode;
995 unsigned minimal_bits_size;
997 switch (mask_scalar_size)
999 case 8:
1000 inner_mode = QImode;
1001 minimal_bits_size = TARGET_MIN_VLEN / 8; /* AKA RVVMF8. */
1002 break;
1003 case 16:
1004 inner_mode = HImode;
1005 minimal_bits_size = TARGET_MIN_VLEN / 4; /* AKA RVVMF4. */
1006 break;
1007 case 32:
1008 inner_mode = SImode;
1009 minimal_bits_size = TARGET_MIN_VLEN / 2; /* AKA RVVMF2. */
1010 break;
1011 case 64:
1012 inner_mode = DImode;
1013 minimal_bits_size = TARGET_MIN_VLEN / 1; /* AKA RVVM1. */
1014 break;
1015 default:
1016 gcc_unreachable ();
1017 break;
1020 gcc_assert (mask_precision % mask_scalar_size == 0);
1022 uint64_t dup_nunit = mask_precision > mask_scalar_size
1023 ? mask_precision / mask_scalar_size : minimal_bits_size / mask_scalar_size;
1025 return get_vector_mode (inner_mode, dup_nunit).require ();
1028 /* Expand series const vector. If VID is NULL_RTX, we use vid.v
1029 instructions to generate sequence for VID:
1031 VID = { 0, 1, 2, 3, ... }
1033 Otherwise, we use the VID argument directly. */
1035 void
1036 expand_vec_series (rtx dest, rtx base, rtx step, rtx vid)
1038 machine_mode mode = GET_MODE (dest);
1039 poly_int64 nunits_m1 = GET_MODE_NUNITS (mode) - 1;
1040 poly_int64 value;
1041 rtx result = register_operand (dest, mode) ? dest : gen_reg_rtx (mode);
1043 /* VECT_IV = BASE + I * STEP. */
1045 /* Step 1: Generate I = { 0, 1, 2, ... } by vid.v. */
1046 bool reverse_p = !vid && rtx_equal_p (step, constm1_rtx)
1047 && poly_int_rtx_p (base, &value)
1048 && known_eq (nunits_m1, value);
1049 if (!vid)
1051 vid = gen_reg_rtx (mode);
1052 rtx op[] = {vid};
1053 emit_vlmax_insn (code_for_pred_series (mode), NULLARY_OP, op);
1056 rtx step_adj;
1057 if (reverse_p)
1059 /* Special case:
1060 {nunits - 1, nunits - 2, ... , 0}.
1061 nunits can be either const_int or const_poly_int.
1063 Code sequence:
1064 vid.v v
1065 vrsub nunits - 1, v. */
1066 rtx ops[]
1067 = {result, vid, gen_int_mode (nunits_m1, GET_MODE_INNER (mode))};
1068 insn_code icode = code_for_pred_sub_reverse_scalar (mode);
1069 emit_vlmax_insn (icode, BINARY_OP, ops);
1071 else
1073 /* Step 2: Generate I * STEP.
1074 - STEP is 1, we don't emit any instructions.
1075 - STEP is power of 2, we use vsll.vi/vsll.vx.
1076 - STEP is non-power of 2, we use vmul.vx. */
1077 if (rtx_equal_p (step, const1_rtx))
1078 step_adj = vid;
1079 else
1081 step_adj = gen_reg_rtx (mode);
1082 if (CONST_INT_P (step) && pow2p_hwi (INTVAL (step)))
1084 /* Emit logical left shift operation. */
1085 int shift = exact_log2 (INTVAL (step));
1086 rtx shift_amount = gen_int_mode (shift, Pmode);
1087 insn_code icode = code_for_pred_scalar (ASHIFT, mode);
1088 rtx ops[] = {step_adj, vid, shift_amount};
1089 emit_vlmax_insn (icode, BINARY_OP, ops);
1091 else
1093 insn_code icode = code_for_pred_scalar (MULT, mode);
1094 rtx ops[] = {step_adj, vid, step};
1095 emit_vlmax_insn (icode, BINARY_OP, ops);
1099 /* Step 3: Generate BASE + I * STEP.
1100 - BASE is 0, use result of vid.
1101 - BASE is not 0, we use vadd.vx/vadd.vi. */
1102 if (rtx_equal_p (base, const0_rtx))
1103 emit_move_insn (result, step_adj);
1104 else
1106 insn_code icode = code_for_pred_scalar (PLUS, mode);
1107 rtx ops[] = {result, step_adj, base};
1108 emit_vlmax_insn (icode, BINARY_OP, ops);
1112 if (result != dest)
1113 emit_move_insn (dest, result);
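/* Illustrative example of the expansion above: BASE = 3, STEP = 4 roughly
   becomes

     vid.v   v1             # v1 = { 0, 1, 2, 3, ... }
     vsll.vi v1, v1, 2      # v1 = { 0, 4, 8, 12, ... }  (STEP is a power of 2)
     vadd.vx v1, v1, a0     # v1 = { 3, 7, 11, 15, ... } (a0 holds BASE)

   A non-power-of-two STEP would use vmul.vx instead of the shift, and a
   zero BASE skips the final add.  */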
1116 static void
1117 expand_const_vector (rtx target, rtx src)
1119 machine_mode mode = GET_MODE (target);
1120 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
1122 rtx elt;
1123 gcc_assert (
1124 const_vec_duplicate_p (src, &elt)
1125 && (rtx_equal_p (elt, const0_rtx) || rtx_equal_p (elt, const1_rtx)));
1126 rtx ops[] = {target, src};
1127 emit_vlmax_insn (code_for_pred_mov (mode), UNARY_MASK_OP, ops);
1128 return;
1131 rtx elt;
1132 if (const_vec_duplicate_p (src, &elt))
1134 rtx tmp = register_operand (target, mode) ? target : gen_reg_rtx (mode);
1135 /* For an integer element in the range -16 ~ 15 or a 0.0 floating-point
1136 element, we use the vmv.v.i instruction. */
1137 if (satisfies_constraint_vi (src) || satisfies_constraint_Wc0 (src))
1139 rtx ops[] = {tmp, src};
1140 emit_vlmax_insn (code_for_pred_mov (mode), UNARY_OP, ops);
1142 else
1144 /* Emit the vec_duplicate<mode> split pattern before RA so that
1145 we have better optimization opportunities in LICM,
1146 which will hoist vmv.v.x outside the loop, and in fwprop && combine,
1147 which will transform 'vv' into 'vx' instructions.
1149 The reason we don't emit the vec_duplicate<mode> split pattern during
1150 RA is that the split stage after RA is too late to generate an
1151 RVV instruction which needs an additional register (we can't
1152 allocate a new register after RA) for the VL operand of the vsetvl
1153 instruction (vsetvl a5, zero). */
1154 if (lra_in_progress)
1156 rtx ops[] = {tmp, elt};
1157 emit_vlmax_insn (code_for_pred_broadcast (mode), UNARY_OP, ops);
1159 else
1161 struct expand_operand ops[2];
1162 enum insn_code icode = optab_handler (vec_duplicate_optab, mode);
1163 gcc_assert (icode != CODE_FOR_nothing);
1164 create_output_operand (&ops[0], tmp, mode);
1165 create_input_operand (&ops[1], elt, GET_MODE_INNER (mode));
1166 expand_insn (icode, 2, ops);
1167 tmp = ops[0].value;
1171 if (tmp != target)
1172 emit_move_insn (target, tmp);
1173 return;
1176 /* Support scalable const series vector. */
1177 rtx base, step;
1178 if (const_vec_series_p (src, &base, &step))
1180 expand_vec_series (target, base, step);
1181 return;
1184 /* Handle variable-length vector. */
1185 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
1186 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
1187 rvv_builder builder (mode, npatterns, nelts_per_pattern);
1188 for (unsigned int i = 0; i < nelts_per_pattern; i++)
1190 for (unsigned int j = 0; j < npatterns; j++)
1191 builder.quick_push (CONST_VECTOR_ELT (src, i * npatterns + j));
1193 builder.finalize ();
1195 if (CONST_VECTOR_DUPLICATE_P (src))
1197 /* Handle the case of a repeating sequence with NELTS_PER_PATTERN = 1.
1198 E.g. NPATTERNS = 4, v = { 0, 2, 6, 7, ... }
1199 NPATTERNS = 8, v = { 0, 2, 6, 7, 19, 20, 8, 7 ... }
1200 The elements within NPATTERNS are not necessarily regular. */
1201 if (builder.can_duplicate_repeating_sequence_p ())
1203 /* We handle the case where we can find a vector container to hold
1204 element bitsize = NPATTERNS * ele_bitsize.
1206 NPATTERNS = 8, element width = 8
1207 v = { 0, 1, 2, 3, 4, 5, 6, 7, ... }
1208 In this case, we can combine NPATTERNS elements into a larger
1209 element. Use element width = 64 and broadcast a vector with
1210 all elements equal to 0x0706050403020100. */
1211 rtx ele = builder.get_merged_repeating_sequence ();
1212 rtx dup = expand_vector_broadcast (builder.new_mode (), ele);
1213 emit_move_insn (target, gen_lowpart (mode, dup));
1215 else
1217 /* We handle the case where we can't find a vector container to hold
1218 element bitsize = NPATTERNS * ele_bitsize.
1220 NPATTERNS = 8, element width = 16
1221 v = { 0, 1, 2, 3, 4, 5, 6, 7, ... }
1222 Since NPATTERNS * element width = 128, we can't find a container
1223 to hold it.
1225 In this case, we use NPATTERNS merge operations to generate such
1226 vector. */
1227 unsigned int nbits = npatterns - 1;
1229 /* Generate vid = { 0, 1, 2, 3, 4, 5, 6, 7, ... }. */
1230 rtx vid = gen_reg_rtx (builder.int_mode ());
1231 rtx op[] = {vid};
1232 emit_vlmax_insn (code_for_pred_series (builder.int_mode ()),
1233 NULLARY_OP, op);
1235 /* Generate vid_repeat = { 0, 1, ... nbits, ... } */
1236 rtx vid_repeat = gen_reg_rtx (builder.int_mode ());
1237 rtx and_ops[] = {vid_repeat, vid,
1238 gen_int_mode (nbits, builder.inner_int_mode ())};
1239 emit_vlmax_insn (code_for_pred_scalar (AND, builder.int_mode ()),
1240 BINARY_OP, and_ops);
1242 rtx tmp = gen_reg_rtx (builder.mode ());
1243 rtx dup_ops[] = {tmp, builder.elt (0)};
1244 emit_vlmax_insn (code_for_pred_broadcast (builder.mode ()), UNARY_OP,
1245 dup_ops);
1246 for (unsigned int i = 1; i < builder.npatterns (); i++)
1248 /* Generate mask according to i. */
1249 rtx mask = gen_reg_rtx (builder.mask_mode ());
1250 rtx const_vec = gen_const_vector_dup (builder.int_mode (), i);
1251 expand_vec_cmp (mask, EQ, vid_repeat, const_vec);
1253 /* Merge scalar to each i. */
1254 rtx tmp2 = gen_reg_rtx (builder.mode ());
1255 rtx merge_ops[] = {tmp2, tmp, builder.elt (i), mask};
1256 insn_code icode = code_for_pred_merge_scalar (builder.mode ());
1257 emit_vlmax_insn (icode, MERGE_OP, merge_ops);
1258 tmp = tmp2;
1260 emit_move_insn (target, tmp);
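/* Illustrative walk-through of the merge loop above for NPATTERNS = 4 and
   v = { a, b, c, d, a, b, c, d, ... }: vid_repeat = vid & 3
   = { 0, 1, 2, 3, 0, 1, 2, 3, ... }; we broadcast a, then for i = 1, 2, 3
   compare vid_repeat == i to form a mask and vmerge the scalar elt (i) under
   that mask, which leaves { a, b, c, d, a, b, c, d, ... } in TARGET.  */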
1263 else if (CONST_VECTOR_STEPPED_P (src))
1265 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
1266 if (builder.single_step_npatterns_p ())
1268 /* Describe the case by choosing NPATTERNS = 4 as an example. */
1269 insn_code icode;
1271 /* Step 1: Generate vid = { 0, 1, 2, 3, 4, 5, 6, 7, ... }. */
1272 rtx vid = gen_reg_rtx (builder.mode ());
1273 rtx vid_ops[] = {vid};
1274 icode = code_for_pred_series (builder.mode ());
1275 emit_vlmax_insn (icode, NULLARY_OP, vid_ops);
1277 if (builder.npatterns_all_equal_p ())
1279 /* Generate the variable-length vector following this rule:
1280 { a, a, a + step, a + step, a + step * 2, a + step * 2, ...}
1281 E.g. { 0, 0, 8, 8, 16, 16, ... } */
1282 /* We want to create a pattern where value[ix] = floor (ix /
1283 NPATTERNS). As NPATTERNS is always a power of two we can
1284 rewrite this as = ix & -NPATTERNS. */
1285 /* Step 2: VID AND -NPATTERNS:
1286 { 0&-4, 1&-4, 2&-4, 3 &-4, 4 &-4, 5 &-4, 6 &-4, 7 &-4, ... }
1288 rtx imm
1289 = gen_int_mode (-builder.npatterns (), builder.inner_mode ());
1290 rtx tmp = gen_reg_rtx (builder.mode ());
1291 rtx and_ops[] = {tmp, vid, imm};
1292 icode = code_for_pred_scalar (AND, builder.mode ());
1293 emit_vlmax_insn (icode, BINARY_OP, and_ops);
1294 HOST_WIDE_INT init_val = INTVAL (builder.elt (0));
1295 if (init_val == 0)
1296 emit_move_insn (target, tmp);
1297 else
1299 rtx dup = gen_const_vector_dup (builder.mode (), init_val);
1300 rtx add_ops[] = {target, tmp, dup};
1301 icode = code_for_pred (PLUS, builder.mode ());
1302 emit_vlmax_insn (icode, BINARY_OP, add_ops);
1305 else
1307 /* Generate the variable-length vector following this rule:
1308 { a, b, a + step, b + step, a + step*2, b + step*2, ... } */
1310 if (builder.npatterns_vid_diff_repeated_p ())
1312 /* Case 1: For example as below:
1313 {3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8... }
1314 We have 3 - 0 = 3, which equals 7 - 4 = 3; the sequence is
1315 repeated as below after subtracting vid.
1316 {3, 1, -1, -3, 3, 1, -1, -3...}
1317 Then we can simplify the diff code gen to at most
1318 npatterns (). */
1319 rvv_builder v (builder.mode (), builder.npatterns (), 1);
1321 /* Step 1: Generate diff = TARGET - VID. */
1322 for (unsigned int i = 0; i < v.npatterns (); ++i)
1324 poly_int64 diff = rtx_to_poly_int64 (builder.elt (i)) - i;
1325 v.quick_push (gen_int_mode (diff, v.inner_mode ()));
1328 /* Step 2: Generate result = VID + diff. */
1329 rtx vec = v.build ();
1330 rtx add_ops[] = {target, vid, vec};
1331 emit_vlmax_insn (code_for_pred (PLUS, builder.mode ()),
1332 BINARY_OP, add_ops);
1334 else
1336 /* Case 2: For example as below:
1337 { -4, 4, -4 + 1, 4 + 1, -4 + 2, 4 + 2, -4 + 3, 4 + 3, ... }
1339 rvv_builder v (builder.mode (), builder.npatterns (), 1);
1341 /* Step 1: Generate { a, b, a, b, ... } */
1342 for (unsigned int i = 0; i < v.npatterns (); ++i)
1343 v.quick_push (builder.elt (i));
1344 rtx new_base = v.build ();
1346 /* Step 2: Generate tmp = VID >> LOG2 (NPATTERNS). */
1347 rtx shift_count
1348 = gen_int_mode (exact_log2 (builder.npatterns ()),
1349 builder.inner_mode ());
1350 rtx tmp = expand_simple_binop (builder.mode (), LSHIFTRT,
1351 vid, shift_count, NULL_RTX,
1352 false, OPTAB_DIRECT);
1354 /* Step 3: Generate tmp2 = tmp * step. */
1355 rtx tmp2 = gen_reg_rtx (builder.mode ());
1356 rtx step
1357 = simplify_binary_operation (MINUS, builder.inner_mode (),
1358 builder.elt (v.npatterns()),
1359 builder.elt (0));
1360 expand_vec_series (tmp2, const0_rtx, step, tmp);
1362 /* Step 4: Generate target = tmp2 + new_base. */
1363 rtx add_ops[] = {target, tmp2, new_base};
1364 emit_vlmax_insn (code_for_pred (PLUS, builder.mode ()),
1365 BINARY_OP, add_ops);
1369 else if (builder.interleaved_stepped_npatterns_p ())
1371 rtx base1 = builder.elt (0);
1372 rtx base2 = builder.elt (1);
1373 poly_int64 step1
1374 = rtx_to_poly_int64 (builder.elt (builder.npatterns ()))
1375 - rtx_to_poly_int64 (base1);
1376 poly_int64 step2
1377 = rtx_to_poly_int64 (builder.elt (builder.npatterns () + 1))
1378 - rtx_to_poly_int64 (base2);
1380 /* For { 1, 0, 2, 0, ... , n - 1, 0 }, we can use larger EEW
1381 integer vector mode to generate such vector efficiently.
1383 E.g. EEW = 16, { 2, 0, 4, 0, ... }
1385 can be interpreted into:
1387 EEW = 32, { 2, 4, ... } */
1388 unsigned int new_smode_bitsize = builder.inner_bits_size () * 2;
1389 scalar_int_mode new_smode;
1390 machine_mode new_mode;
1391 poly_uint64 new_nunits
1392 = exact_div (GET_MODE_NUNITS (builder.mode ()), 2);
1393 if (int_mode_for_size (new_smode_bitsize, 0).exists (&new_smode)
1394 && get_vector_mode (new_smode, new_nunits).exists (&new_mode))
1396 rtx tmp = gen_reg_rtx (new_mode);
1397 base1 = gen_int_mode (rtx_to_poly_int64 (base1), new_smode);
1398 expand_vec_series (tmp, base1, gen_int_mode (step1, new_smode));
1400 if (rtx_equal_p (base2, const0_rtx) && known_eq (step2, 0))
1401 /* { 1, 0, 2, 0, ... }. */
1402 emit_move_insn (target, gen_lowpart (mode, tmp));
1403 else if (known_eq (step2, 0))
1405 /* { 1, 1, 2, 1, ... }. */
1406 rtx scalar = expand_simple_binop (
1407 new_smode, ASHIFT,
1408 gen_int_mode (rtx_to_poly_int64 (base2), new_smode),
1409 gen_int_mode (builder.inner_bits_size (), new_smode),
1410 NULL_RTX, false, OPTAB_DIRECT);
1411 rtx tmp2 = gen_reg_rtx (new_mode);
1412 rtx and_ops[] = {tmp2, tmp, scalar};
1413 emit_vlmax_insn (code_for_pred_scalar (AND, new_mode),
1414 BINARY_OP, and_ops);
1415 emit_move_insn (target, gen_lowpart (mode, tmp2));
1417 else
1419 /* { 1, 3, 2, 6, ... }. */
1420 rtx tmp2 = gen_reg_rtx (new_mode);
1421 base2 = gen_int_mode (rtx_to_poly_int64 (base2), new_smode);
1422 expand_vec_series (tmp2, base2,
1423 gen_int_mode (step2, new_smode));
1424 rtx shifted_tmp2 = expand_simple_binop (
1425 new_mode, ASHIFT, tmp2,
1426 gen_int_mode (builder.inner_bits_size (), Pmode), NULL_RTX,
1427 false, OPTAB_DIRECT);
1428 rtx tmp3 = gen_reg_rtx (new_mode);
1429 rtx ior_ops[] = {tmp3, tmp, shifted_tmp2};
1430 emit_vlmax_insn (code_for_pred (IOR, new_mode), BINARY_OP,
1431 ior_ops);
1432 emit_move_insn (target, gen_lowpart (mode, tmp3));
1435 else
1437 rtx vid = gen_reg_rtx (mode);
1438 expand_vec_series (vid, const0_rtx, const1_rtx);
1439 /* Transform into { 0, 0, 1, 1, 2, 2, ... }. */
1440 rtx shifted_vid
1441 = expand_simple_binop (mode, LSHIFTRT, vid, const1_rtx,
1442 NULL_RTX, false, OPTAB_DIRECT);
1443 rtx tmp1 = gen_reg_rtx (mode);
1444 rtx tmp2 = gen_reg_rtx (mode);
1445 expand_vec_series (tmp1, base1,
1446 gen_int_mode (step1, builder.inner_mode ()),
1447 shifted_vid);
1448 expand_vec_series (tmp2, base2,
1449 gen_int_mode (step2, builder.inner_mode ()),
1450 shifted_vid);
1452 /* Transform into { 0, 1, 0, 1, 0, 1, ... }. */
1453 rtx and_vid = gen_reg_rtx (mode);
1454 rtx and_ops[] = {and_vid, vid, const1_rtx};
1455 emit_vlmax_insn (code_for_pred_scalar (AND, mode), BINARY_OP,
1456 and_ops);
1457 rtx mask = gen_reg_rtx (builder.mask_mode ());
1458 expand_vec_cmp (mask, EQ, and_vid, CONST1_RTX (mode));
1460 rtx ops[] = {target, tmp1, tmp2, mask};
1461 emit_vlmax_insn (code_for_pred_merge (mode), MERGE_OP, ops);
1464 else if (npatterns == 1 && nelts_per_pattern == 3)
1466 /* Generate the following CONST_VECTOR:
1467 { base0, base1, base1 + step, base1 + step * 2, ... } */
1468 rtx base0 = builder.elt (0);
1469 rtx base1 = builder.elt (1);
1470 rtx base2 = builder.elt (2);
1472 rtx step = simplify_binary_operation (MINUS, builder.inner_mode (),
1473 base2, base1);
1475 /* Step 1 - { base1, base1 + step, base1 + step * 2, ... } */
1476 rtx tmp = gen_reg_rtx (mode);
1477 expand_vec_series (tmp, base1, step);
1478 /* Step 2 - { base0, base1, base1 + step, base1 + step * 2, ... } */
1479 if (!rtx_equal_p (base0, const0_rtx))
1480 base0 = force_reg (builder.inner_mode (), base0);
1482 insn_code icode = optab_handler (vec_shl_insert_optab, mode);
1483 gcc_assert (icode != CODE_FOR_nothing);
1484 emit_insn (GEN_FCN (icode) (target, tmp, base0));
1486 else
1487 /* TODO: We will enable more variable-length vector in the future. */
1488 gcc_unreachable ();
1490 else
1491 gcc_unreachable ();
1494 /* Get the frm mode from the given CONST_INT rtx; the default mode is
1495 FRM_DYN. */
1496 enum floating_point_rounding_mode
1497 get_frm_mode (rtx operand)
1499 gcc_assert (CONST_INT_P (operand));
1501 switch (INTVAL (operand))
1503 case FRM_RNE:
1504 return FRM_RNE;
1505 case FRM_RTZ:
1506 return FRM_RTZ;
1507 case FRM_RDN:
1508 return FRM_RDN;
1509 case FRM_RUP:
1510 return FRM_RUP;
1511 case FRM_RMM:
1512 return FRM_RMM;
1513 case FRM_DYN:
1514 return FRM_DYN;
1515 default:
1516 gcc_unreachable ();
1519 gcc_unreachable ();
1522 /* Expand a pre-RA RVV data move from SRC to DEST.
1523 It expands moves for RVV fractional vector modes.
1524 Return true if the move has already been emitted. */
1525 bool
1526 legitimize_move (rtx dest, rtx *srcp)
1528 rtx src = *srcp;
1529 machine_mode mode = GET_MODE (dest);
1530 if (CONST_VECTOR_P (src))
1532 expand_const_vector (dest, src);
1533 return true;
1536 if (riscv_v_ext_vls_mode_p (mode))
1538 if (GET_MODE_NUNITS (mode).to_constant () <= 31)
1540 /* For NUNITS <= 31 VLS modes, we don't need an extra
1541 scalar register, so we apply the naive (set (op0) (op1)) pattern. */
1542 if (can_create_pseudo_p ())
1544 /* Need to force register if mem <- !reg. */
1545 if (MEM_P (dest) && !REG_P (src))
1546 *srcp = force_reg (mode, src);
1548 return false;
1551 else if (GET_MODE_NUNITS (mode).to_constant () > 31 && lra_in_progress)
1553 emit_insn (gen_mov_lra (mode, Pmode, dest, src));
1554 return true;
1557 else
1559 /* In order to decrease the memory traffic, we don't use whole register
1560 * load/store for LMUL less than 1 and mask modes, so those cases will
1561 * require one extra general purpose register, but that's not allowed during
1562 * the LRA process, so we have a special move pattern used for LRA, which will
1563 * defer the expansion until after LRA. */
1564 if ((known_lt (GET_MODE_SIZE (mode), BYTES_PER_RISCV_VECTOR)
1565 || GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
1566 && lra_in_progress)
1568 emit_insn (gen_mov_lra (mode, Pmode, dest, src));
1569 return true;
1572 if (known_ge (GET_MODE_SIZE (mode), BYTES_PER_RISCV_VECTOR)
1573 && GET_MODE_CLASS (mode) != MODE_VECTOR_BOOL)
1575 /* Need to force register if mem <- !reg. */
1576 if (MEM_P (dest) && !REG_P (src))
1577 *srcp = force_reg (mode, src);
1579 return false;
1583 if (register_operand (src, mode) && register_operand (dest, mode))
1585 emit_insn (gen_rtx_SET (dest, src));
1586 return true;
1589 unsigned insn_flags
1590 = GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL ? UNARY_MASK_OP : UNARY_OP;
1591 if (!register_operand (src, mode) && !register_operand (dest, mode))
1593 rtx tmp = gen_reg_rtx (mode);
1594 if (MEM_P (src))
1596 rtx ops[] = {tmp, src};
1597 emit_vlmax_insn (code_for_pred_mov (mode), insn_flags, ops);
1599 else
1600 emit_move_insn (tmp, src);
1601 src = tmp;
1604 if (satisfies_constraint_vu (src))
1605 return false;
1607 rtx ops[] = {dest, src};
1608 emit_vlmax_insn (code_for_pred_mov (mode), insn_flags, ops);
1609 return true;
1612 /* VTYPE information for machine_mode. */
1613 struct mode_vtype_group
1615 enum vlmul_type vlmul[NUM_MACHINE_MODES];
1616 uint8_t ratio[NUM_MACHINE_MODES];
1617 machine_mode subpart_mode[NUM_MACHINE_MODES];
1618 uint8_t nf[NUM_MACHINE_MODES];
1619 mode_vtype_group ()
1621 #define ENTRY(MODE, REQUIREMENT, VLMUL, RATIO) \
1622 vlmul[MODE##mode] = VLMUL; \
1623 ratio[MODE##mode] = RATIO;
1624 #define TUPLE_ENTRY(MODE, REQUIREMENT, SUBPART_MODE, NF, VLMUL, RATIO) \
1625 subpart_mode[MODE##mode] = SUBPART_MODE##mode; \
1626 nf[MODE##mode] = NF; \
1627 vlmul[MODE##mode] = VLMUL; \
1628 ratio[MODE##mode] = RATIO;
1629 #include "riscv-vector-switch.def"
1630 #undef ENTRY
1631 #undef TUPLE_ENTRY
1635 static mode_vtype_group mode_vtype_infos;
1637 /* Get vlmul field value by comparing LMUL with BYTES_PER_RISCV_VECTOR. */
1638 enum vlmul_type
1639 get_vlmul (machine_mode mode)
1641 /* For VLS modes, the vlmul should be dynamically
1642 calculated since we need to adjust VLMUL according
1643 to TARGET_MIN_VLEN. */
1644 if (riscv_v_ext_vls_mode_p (mode))
1646 int size = GET_MODE_BITSIZE (mode).to_constant ();
1647 int inner_size = GET_MODE_BITSIZE (GET_MODE_INNER (mode));
1648 if (size < TARGET_MIN_VLEN)
1650 int factor = TARGET_MIN_VLEN / size;
1651 if (inner_size == 8)
1652 factor = MIN (factor, 8);
1653 else if (inner_size == 16)
1654 factor = MIN (factor, 4);
1655 else if (inner_size == 32)
1656 factor = MIN (factor, 2);
1657 else if (inner_size == 64)
1658 factor = MIN (factor, 1);
1659 else
1660 gcc_unreachable ();
1662 switch (factor)
1664 case 1:
1665 return LMUL_1;
1666 case 2:
1667 return LMUL_F2;
1668 case 4:
1669 return LMUL_F4;
1670 case 8:
1671 return LMUL_F8;
1673 default:
1674 gcc_unreachable ();
1677 else
1679 int factor = size / TARGET_MIN_VLEN;
1680 switch (factor)
1682 case 1:
1683 return LMUL_1;
1684 case 2:
1685 return LMUL_2;
1686 case 4:
1687 return LMUL_4;
1688 case 8:
1689 return LMUL_8;
1691 default:
1692 gcc_unreachable ();
1696 return mode_vtype_infos.vlmul[mode];
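/* Illustrative examples (assuming TARGET_MIN_VLEN == 128): a 64-bit VLS mode
   with 32-bit elements gives factor = 128 / 64 = 2, clamped to MIN (2, 2),
   i.e. LMUL_F2; a 256-bit VLS mode gives factor = 256 / 128 = 2,
   i.e. LMUL_2.  */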
1699 /* Return the VLMAX rtx of vector mode MODE. */
1701 get_vlmax_rtx (machine_mode mode)
1703 gcc_assert (riscv_v_ext_vector_mode_p (mode));
1704 return gen_int_mode (GET_MODE_NUNITS (mode), Pmode);
1707 /* Return the NF value of the corresponding mode. */
1708 unsigned int
1709 get_nf (machine_mode mode)
1711 /* We don't allow non-tuple modes to go through this function. */
1712 gcc_assert (riscv_v_ext_tuple_mode_p (mode));
1713 return mode_vtype_infos.nf[mode];
1716 /* Return the subpart mode of the tuple mode. For RVVM2x2SImode,
1717 the subpart mode is RVVM2SImode. This will help to build
1718 array/struct type in builtins. */
1719 machine_mode
1720 get_subpart_mode (machine_mode mode)
1722 /* We don't allow non-tuple modes to go through this function. */
1723 gcc_assert (riscv_v_ext_tuple_mode_p (mode));
1724 return mode_vtype_infos.subpart_mode[mode];
1727 /* Get ratio according to machine mode. */
1728 unsigned int
1729 get_ratio (machine_mode mode)
1731 if (riscv_v_ext_vls_mode_p (mode))
1733 unsigned int sew = get_sew (mode);
1734 vlmul_type vlmul = get_vlmul (mode);
1735 switch (vlmul)
1737 case LMUL_1:
1738 return sew;
1739 case LMUL_2:
1740 return sew / 2;
1741 case LMUL_4:
1742 return sew / 4;
1743 case LMUL_8:
1744 return sew / 8;
1745 case LMUL_F8:
1746 return sew * 8;
1747 case LMUL_F4:
1748 return sew * 4;
1749 case LMUL_F2:
1750 return sew * 2;
1752 default:
1753 gcc_unreachable ();
1756 return mode_vtype_infos.ratio[mode];
1759 /* Get ta according to operand[tail_op_idx]. */
1761 get_ta (rtx ta)
1763 if (INTVAL (ta) == TAIL_ANY)
1764 return INVALID_ATTRIBUTE;
1765 return INTVAL (ta);
1768 /* Get ma according to operand[mask_op_idx]. */
1770 get_ma (rtx ma)
1772 if (INTVAL (ma) == MASK_ANY)
1773 return INVALID_ATTRIBUTE;
1774 return INTVAL (ma);
1777 /* Get prefer tail policy. */
1778 enum tail_policy
1779 get_prefer_tail_policy ()
1781 /* TODO: By default, we choose to use TAIL_ANY which allows the
1782 compiler to pick either agnostic or undisturbed. Maybe we
1783 will have a compile option like -mprefer=agnostic to set
1784 this value??? */
1785 return TAIL_ANY;
1788 /* Get prefer mask policy. */
1789 enum mask_policy
1790 get_prefer_mask_policy ()
1792 /* TODO: By default, we choose to use MASK_ANY which allows the
1793 compiler to pick either agnostic or undisturbed. Maybe we
1794 will have a compile option like -mprefer=agnostic to set
1795 this value??? */
1796 return MASK_ANY;
1799 /* Get avl_type rtx. */
1801 get_avl_type_rtx (enum avl_type type)
1803 return gen_int_mode (type, Pmode);
1806 /* Return the appropriate mask mode for MODE. */
1808 machine_mode
1809 get_mask_mode (machine_mode mode)
1811 poly_int64 nunits = GET_MODE_NUNITS (mode);
1812 if (riscv_v_ext_tuple_mode_p (mode))
1814 unsigned int nf = get_nf (mode);
1815 nunits = exact_div (nunits, nf);
1817 return get_vector_mode (BImode, nunits).require ();
1820 /* Return the appropriate M1 mode for MODE. */
1822 static opt_machine_mode
1823 get_m1_mode (machine_mode mode)
1825 scalar_mode smode = GET_MODE_INNER (mode);
1826 unsigned int bytes = GET_MODE_SIZE (smode);
1827 poly_uint64 m1_nunits = exact_div (BYTES_PER_RISCV_VECTOR, bytes);
1828 return get_vector_mode (smode, m1_nunits);
1831 /* Return the RVV vector mode that has NUNITS elements of mode INNER_MODE.
1832 This function is not only used by builtins, but also will be used by
1833 auto-vectorization in the future. */
1834 opt_machine_mode
1835 get_vector_mode (scalar_mode inner_mode, poly_uint64 nunits)
1837 enum mode_class mclass;
1838 if (inner_mode == E_BImode)
1839 mclass = MODE_VECTOR_BOOL;
1840 else if (FLOAT_MODE_P (inner_mode))
1841 mclass = MODE_VECTOR_FLOAT;
1842 else
1843 mclass = MODE_VECTOR_INT;
1844 machine_mode mode;
1845 FOR_EACH_MODE_IN_CLASS (mode, mclass)
1846 if (inner_mode == GET_MODE_INNER (mode)
1847 && known_eq (nunits, GET_MODE_NUNITS (mode))
1848 && (riscv_v_ext_vector_mode_p (mode)
1849 || riscv_v_ext_vls_mode_p (mode)))
1850 return mode;
1851 return opt_machine_mode ();
1854 /* Return the RVV tuple mode if we can find the legal tuple mode for the
1855 corresponding subpart mode and NF. */
1856 opt_machine_mode
1857 get_tuple_mode (machine_mode subpart_mode, unsigned int nf)
1859 poly_uint64 nunits = GET_MODE_NUNITS (subpart_mode) * nf;
1860 scalar_mode inner_mode = GET_MODE_INNER (subpart_mode);
1861 enum mode_class mclass = GET_MODE_CLASS (subpart_mode);
1862 machine_mode mode;
1863 FOR_EACH_MODE_IN_CLASS (mode, mclass)
1864 if (inner_mode == GET_MODE_INNER (mode)
1865 && known_eq (nunits, GET_MODE_NUNITS (mode))
1866 && riscv_v_ext_tuple_mode_p (mode)
1867 && get_subpart_mode (mode) == subpart_mode)
1868 return mode;
1869 return opt_machine_mode ();
1872 bool
1873 simm5_p (rtx x)
1875 if (!CONST_INT_P (x))
1876 return false;
1877 return IN_RANGE (INTVAL (x), -16, 15);
1880 bool
1881 neg_simm5_p (rtx x)
1883 if (!CONST_INT_P (x))
1884 return false;
1885 return IN_RANGE (INTVAL (x), -15, 16);
1888 bool
1889 has_vi_variant_p (rtx_code code, rtx x)
1891 switch (code)
1893 case PLUS:
1894 case AND:
1895 case IOR:
1896 case XOR:
1897 case SS_PLUS:
1898 case US_PLUS:
1899 case EQ:
1900 case NE:
1901 case LE:
1902 case LEU:
1903 case GT:
1904 case GTU:
1905 return simm5_p (x);
1907 case LT:
1908 case LTU:
1909 case GE:
1910 case GEU:
1911 case MINUS:
1912 case SS_MINUS:
1913 return neg_simm5_p (x);
1915 default:
1916 return false;
1920 bool
1921 sew64_scalar_helper (rtx *operands, rtx *scalar_op, rtx vl,
1922 machine_mode vector_mode, bool has_vi_variant_p,
1923 void (*emit_vector_func) (rtx *, rtx), enum avl_type type)
1925 machine_mode scalar_mode = GET_MODE_INNER (vector_mode);
1926 if (has_vi_variant_p)
1928 *scalar_op = force_reg (scalar_mode, *scalar_op);
1929 return false;
1932 if (TARGET_64BIT)
1934 if (!rtx_equal_p (*scalar_op, const0_rtx))
1935 *scalar_op = force_reg (scalar_mode, *scalar_op);
1936 return false;
1939 if (immediate_operand (*scalar_op, Pmode))
1941 if (!rtx_equal_p (*scalar_op, const0_rtx))
1942 *scalar_op = force_reg (Pmode, *scalar_op);
1944 *scalar_op = gen_rtx_SIGN_EXTEND (scalar_mode, *scalar_op);
1945 return false;
1948 if (CONST_INT_P (*scalar_op))
1950 if (maybe_gt (GET_MODE_SIZE (scalar_mode), GET_MODE_SIZE (Pmode)))
1951 *scalar_op = force_const_mem (scalar_mode, *scalar_op);
1952 else
1953 *scalar_op = force_reg (scalar_mode, *scalar_op);
1956 rtx tmp = gen_reg_rtx (vector_mode);
1957 rtx ops[] = {tmp, *scalar_op};
1958 if (type == VLMAX)
1959 emit_vlmax_insn (code_for_pred_broadcast (vector_mode), UNARY_OP, ops);
1960 else
1961 emit_nonvlmax_insn (code_for_pred_broadcast (vector_mode), UNARY_OP, ops,
1962 vl);
1963 emit_vector_func (operands, tmp);
1965 return true;
1968 /* Get the { 1, 0, 0, ..., 0 } mask, i.e. only element 0 is active. */
1970 gen_scalar_move_mask (machine_mode mode)
1972 rtx_vector_builder builder (mode, 1, 2);
1973 builder.quick_push (const1_rtx);
1974 builder.quick_push (const0_rtx);
1975 return builder.build ();
1978 static unsigned
1979 compute_vlmax (unsigned vector_bits, unsigned elt_size, unsigned min_size)
1981 // Original equation:
1982 // VLMAX = (VectorBits / EltSize) * LMUL
1983 // where LMUL = MinSize / TARGET_MIN_VLEN
1984 // The following equations have been reordered to prevent loss of precision
1985 // when calculating fractional LMUL.
1986 return ((vector_bits / elt_size) * min_size) / TARGET_MIN_VLEN;
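// Worked example (illustrative, assuming TARGET_MIN_VLEN = 128): for
// vector_bits = 256, elt_size = 32 and min_size = 256 (LMUL_2), we get
// ((256 / 32) * 256) / 128 = 16, matching VLMAX = (256 / 32) * 2.
// Keeping the multiplication before the division also avoids truncating a
// fractional LMUL to zero in integer arithmetic, e.g. a min_size of 64
// (LMUL = 1/2) would otherwise compute 64 / 128 = 0.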
1989 static unsigned
1990 get_unknown_min_value (machine_mode mode)
1992 enum vlmul_type vlmul = get_vlmul (mode);
1993 switch (vlmul)
1995 case LMUL_1:
1996 return TARGET_MIN_VLEN;
1997 case LMUL_2:
1998 return TARGET_MIN_VLEN * 2;
1999 case LMUL_4:
2000 return TARGET_MIN_VLEN * 4;
2001 case LMUL_8:
2002 return TARGET_MIN_VLEN * 8;
2003 default:
2004 gcc_unreachable ();
2008 static rtx
2009 force_vector_length_operand (rtx vl)
2011 if (CONST_INT_P (vl) && !satisfies_constraint_K (vl))
2012 return force_reg (Pmode, vl);
2013 return vl;
2017 gen_no_side_effects_vsetvl_rtx (machine_mode vmode, rtx vl, rtx avl)
2019 unsigned int sew = get_sew (vmode);
2020 rtx tail_policy = gen_int_mode (get_prefer_tail_policy (), Pmode);
2021 rtx mask_policy = gen_int_mode (get_prefer_mask_policy (), Pmode);
2022 return gen_vsetvl_no_side_effects (Pmode, vl, avl, gen_int_mode (sew, Pmode),
2023 gen_int_mode (get_vlmul (vmode), Pmode),
2024 tail_policy, mask_policy);
2027 /* Get the VL * 2 rtx. */
2028 static rtx
2029 get_vl_x2_rtx (rtx avl, machine_mode mode, machine_mode demote_mode)
2031 rtx i32vl = NULL_RTX;
2032 if (CONST_INT_P (avl))
2034 unsigned elt_size = GET_MODE_BITSIZE (GET_MODE_INNER (mode));
2035 unsigned min_size = get_unknown_min_value (mode);
2036 unsigned vlen_max = RVV_65536;
2037 unsigned vlmax_max = compute_vlmax (vlen_max, elt_size, min_size);
2038 unsigned vlen_min = TARGET_MIN_VLEN;
2039 unsigned vlmax_min = compute_vlmax (vlen_min, elt_size, min_size);
2041 unsigned HOST_WIDE_INT avl_int = INTVAL (avl);
2042 if (avl_int <= vlmax_min)
2043 i32vl = gen_int_mode (2 * avl_int, Pmode);
2044 else if (avl_int >= 2 * vlmax_max)
2046 // Just set i32vl to VLMAX in this situation
2047 i32vl = gen_reg_rtx (Pmode);
2048 emit_insn (
2049 gen_no_side_effects_vsetvl_rtx (demote_mode, i32vl, RVV_VLMAX));
2051 else
2053 // For an AVL between (MinVLMAX, 2 * MaxVLMAX), the actual working vl
2054 // depends on the hardware implementation,
2055 // so let the following code handle it.
2058 if (!i32vl)
2060 // Use a vsetvli instruction to get the actually used length, which is
2061 // related to the hardware implementation.
2062 rtx i64vl = gen_reg_rtx (Pmode);
2063 emit_insn (
2064 gen_no_side_effects_vsetvl_rtx (mode, i64vl, force_reg (Pmode, avl)));
2065 // Scale by 2 to get the 32-bit length.
2066 i32vl = gen_reg_rtx (Pmode);
2067 emit_insn (
2068 gen_rtx_SET (i32vl, gen_rtx_ASHIFT (Pmode, i64vl, const1_rtx)));
2071 return force_vector_length_operand (i32vl);
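/* Rough illustration of the cases above, assuming a SEW64 LMUL_1 mode with
   TARGET_MIN_VLEN = 128 and a maximum VLEN of 65536 bits (RVV_65536):
   vlmax_min = 2 and vlmax_max = 1024.
   - AVL = 2 (<= vlmax_min): i32vl is simply 2 * 2 = 4.
   - AVL = 4096 (>= 2 * vlmax_max): i32vl is the VLMAX of the demoted
     SEW32 mode, obtained with a vsetvl.
   - Anything in between: read the actually used vl with a vsetvl on the
     SEW64 mode and double it with a shift.  */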
2074 bool
2075 slide1_sew64_helper (int unspec, machine_mode mode, machine_mode demote_mode,
2076 machine_mode demote_mask_mode, rtx *ops)
2078 rtx scalar_op = ops[4];
2079 rtx avl = ops[5];
2080 machine_mode scalar_mode = GET_MODE_INNER (mode);
2081 if (rtx_equal_p (scalar_op, const0_rtx))
2083 ops[5] = force_vector_length_operand (ops[5]);
2084 return false;
2087 if (TARGET_64BIT)
2089 ops[4] = force_reg (scalar_mode, scalar_op);
2090 ops[5] = force_vector_length_operand (ops[5]);
2091 return false;
2094 if (immediate_operand (scalar_op, Pmode))
2096 ops[4] = gen_rtx_SIGN_EXTEND (scalar_mode, force_reg (Pmode, scalar_op));
2097 ops[5] = force_vector_length_operand (ops[5]);
2098 return false;
2101 if (CONST_INT_P (scalar_op))
2102 scalar_op = force_reg (scalar_mode, scalar_op);
2104 rtx vl_x2 = get_vl_x2_rtx (avl, mode, demote_mode);
2106 rtx demote_scalar_op1, demote_scalar_op2;
2107 if (unspec == UNSPEC_VSLIDE1UP)
2109 demote_scalar_op1 = gen_highpart (Pmode, scalar_op);
2110 demote_scalar_op2 = gen_lowpart (Pmode, scalar_op);
2112 else
2114 demote_scalar_op1 = gen_lowpart (Pmode, scalar_op);
2115 demote_scalar_op2 = gen_highpart (Pmode, scalar_op);
2118 rtx temp = gen_reg_rtx (demote_mode);
2119 rtx ta = gen_int_mode (get_prefer_tail_policy (), Pmode);
2120 rtx ma = gen_int_mode (get_prefer_mask_policy (), Pmode);
2121 rtx merge = RVV_VUNDEF (demote_mode);
2122 /* Handle vslide1<ud>_tu. */
2123 if (register_operand (ops[2], mode)
2124 && rtx_equal_p (ops[1], CONSTM1_RTX (GET_MODE (ops[1]))))
2126 merge = gen_lowpart (demote_mode, ops[2]);
2127 ta = ops[6];
2128 ma = ops[7];
2131 emit_insn (gen_pred_slide (unspec, demote_mode, temp,
2132 CONSTM1_RTX (demote_mask_mode), merge,
2133 gen_lowpart (demote_mode, ops[3]),
2134 demote_scalar_op1, vl_x2, ta, ma, ops[8]));
2135 emit_insn (gen_pred_slide (unspec, demote_mode,
2136 gen_lowpart (demote_mode, ops[0]),
2137 CONSTM1_RTX (demote_mask_mode), merge, temp,
2138 demote_scalar_op2, vl_x2, ta, ma, ops[8]));
2140 if (!rtx_equal_p (ops[1], CONSTM1_RTX (GET_MODE (ops[1])))
2141 && !rtx_equal_p (ops[2], RVV_VUNDEF (GET_MODE (ops[2]))))
2142 emit_insn (gen_pred_merge (mode, ops[0], ops[2], ops[2], ops[0], ops[1],
2143 force_vector_length_operand (ops[5]), ops[6],
2144 ops[8]));
2145 return true;
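/* Rough sketch of what the helper above emits on !TARGET_64BIT for a
   64-bit vslide1up (operand names are only illustrative):

     # vl is doubled on the demoted SEW32 type (see get_vl_x2_rtx).
     vslide1up.vx  v_tmp, v_src, hi32(scalar)
     vslide1up.vx  v_dst, v_tmp, lo32(scalar)

   i.e. the 64-bit scalar is inserted as two 32-bit halves; for
   vslide1down the low half goes first instead.  */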
2149 gen_avl_for_scalar_move (rtx avl)
2151 /* The AVL for a scalar move behaves differently for 0 and for values larger than 0. */
2152 if (CONST_INT_P (avl))
2154 /* So we could just set AVL to 1 for any constant other than 0. */
2155 if (rtx_equal_p (avl, const0_rtx))
2156 return const0_rtx;
2157 else
2158 return const1_rtx;
2160 else
2162 /* For a non-constant value, we set any nonzero value to 1 by
2163 `sgtu new_avl,input_avl,zero` + `vsetvli`. */
2164 rtx tmp = gen_reg_rtx (Pmode);
2165 emit_insn (
2166 gen_rtx_SET (tmp, gen_rtx_fmt_ee (GTU, Pmode, avl, const0_rtx)));
2167 return tmp;
2171 /* Expand data movement for tuple modes. */
2172 void
2173 expand_tuple_move (rtx *ops)
2175 unsigned int i;
2176 machine_mode tuple_mode = GET_MODE (ops[0]);
2177 machine_mode subpart_mode = get_subpart_mode (tuple_mode);
2178 poly_int64 subpart_size = GET_MODE_SIZE (subpart_mode);
2179 unsigned int nf = get_nf (tuple_mode);
2180 bool fractional_p = known_lt (subpart_size, BYTES_PER_RISCV_VECTOR);
2182 if (REG_P (ops[0]) && CONST_VECTOR_P (ops[1]))
2184 rtx val;
2185 gcc_assert (can_create_pseudo_p ()
2186 && const_vec_duplicate_p (ops[1], &val));
2187 for (i = 0; i < nf; ++i)
2189 poly_int64 offset = i * subpart_size;
2190 rtx subreg
2191 = simplify_gen_subreg (subpart_mode, ops[0], tuple_mode, offset);
2192 rtx dup = gen_const_vec_duplicate (subpart_mode, val);
2193 emit_move_insn (subreg, dup);
2196 else if (REG_P (ops[0]) && REG_P (ops[1]))
2198 for (i = 0; i < nf; ++i)
2200 int index = i;
2202 /* Take NF = 2 and LMUL = 1 for example:
2204 - move v8 to v9:
2205 vmv1r v10,v9
2206 vmv1r v9,v8
2208 - move v8 to v7:
2209 vmv1r v7,v8
2210 vmv1r v8,v9 */
2211 if (REGNO (ops[0]) > REGNO (ops[1]))
2212 index = nf - 1 - i;
2213 poly_int64 offset = index * subpart_size;
2214 rtx dst_subreg
2215 = simplify_gen_subreg (subpart_mode, ops[0], tuple_mode, offset);
2216 rtx src_subreg
2217 = simplify_gen_subreg (subpart_mode, ops[1], tuple_mode, offset);
2218 emit_insn (gen_rtx_SET (dst_subreg, src_subreg));
2221 else
2223 /* Expand tuple memory data movement. */
2224 gcc_assert (MEM_P (ops[0]) || MEM_P (ops[1]));
2225 rtx offset = gen_int_mode (subpart_size, Pmode);
2226 if (!subpart_size.is_constant ())
2228 emit_move_insn (ops[2], gen_int_mode (BYTES_PER_RISCV_VECTOR, Pmode));
2229 if (fractional_p)
2231 unsigned int factor
2232 = exact_div (BYTES_PER_RISCV_VECTOR, subpart_size)
2233 .to_constant ();
2234 rtx pat
2235 = gen_rtx_ASHIFTRT (Pmode, ops[2],
2236 gen_int_mode (exact_log2 (factor), Pmode));
2237 emit_insn (gen_rtx_SET (ops[2], pat));
2240 if (known_gt (subpart_size, BYTES_PER_RISCV_VECTOR))
2242 unsigned int factor
2243 = exact_div (subpart_size, BYTES_PER_RISCV_VECTOR)
2244 .to_constant ();
2245 rtx pat
2246 = gen_rtx_ASHIFT (Pmode, ops[2],
2247 gen_int_mode (exact_log2 (factor), Pmode));
2248 emit_insn (gen_rtx_SET (ops[2], pat));
2250 offset = ops[2];
2253 /* Non-fractional LMUL has whole register moves that don't require a
2254 vsetvl for VLMAX. */
2255 if (fractional_p)
2256 emit_vlmax_vsetvl (subpart_mode, ops[4]);
2257 if (MEM_P (ops[1]))
2259 /* Load operations. */
2260 emit_move_insn (ops[3], XEXP (ops[1], 0));
2261 for (i = 0; i < nf; i++)
2263 rtx subreg = simplify_gen_subreg (subpart_mode, ops[0],
2264 tuple_mode, i * subpart_size);
2265 if (i != 0)
2267 rtx new_addr = gen_rtx_PLUS (Pmode, ops[3], offset);
2268 emit_insn (gen_rtx_SET (ops[3], new_addr));
2270 rtx mem = gen_rtx_MEM (subpart_mode, ops[3]);
2272 if (fractional_p)
2274 rtx operands[] = {subreg, mem};
2275 emit_vlmax_insn_lra (code_for_pred_mov (subpart_mode),
2276 UNARY_OP, operands, ops[4]);
2278 else
2279 emit_move_insn (subreg, mem);
2282 else
2284 /* Store operations. */
2285 emit_move_insn (ops[3], XEXP (ops[0], 0));
2286 for (i = 0; i < nf; i++)
2288 rtx subreg = simplify_gen_subreg (subpart_mode, ops[1],
2289 tuple_mode, i * subpart_size);
2290 if (i != 0)
2292 rtx new_addr = gen_rtx_PLUS (Pmode, ops[3], offset);
2293 emit_insn (gen_rtx_SET (ops[3], new_addr));
2295 rtx mem = gen_rtx_MEM (subpart_mode, ops[3]);
2297 if (fractional_p)
2299 rtx operands[] = {mem, subreg};
2300 emit_vlmax_insn_lra (code_for_pred_mov (subpart_mode),
2301 UNARY_OP, operands, ops[4]);
2303 else
2304 emit_move_insn (mem, subreg);
2310 /* Return the vectorization machine mode for RVV according to LMUL. */
2311 machine_mode
2312 preferred_simd_mode (scalar_mode mode)
2314 if (autovec_use_vlmax_p ())
2316 /* We use LMUL = 1 as the base byte size, which is BYTES_PER_RISCV_VECTOR,
2317 and riscv_autovec_lmul as the multiply factor to calculate the NUNITS
2318 used to get the auto-vectorization mode. */
2319 poly_uint64 nunits;
2320 poly_uint64 vector_size = BYTES_PER_RISCV_VECTOR * TARGET_MAX_LMUL;
2321 poly_uint64 scalar_size = GET_MODE_SIZE (mode);
2322 /* Disable vectorization when we can't find a RVV mode for it.
2323 E.g. -march=rv64gc_zve32x doesn't have a vector mode to vectorize
2324 a double (DFmode) type. */
2325 if (!multiple_p (vector_size, scalar_size, &nunits))
2326 return word_mode;
2327 machine_mode rvv_mode;
2328 if (get_vector_mode (mode, nunits).exists (&rvv_mode))
2329 return rvv_mode;
2331 return word_mode;
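/* Illustrative example: on a VLA target where BYTES_PER_RISCV_VECTOR is
   the poly value (16, 16) and TARGET_MAX_LMUL is 2, vector_size is
   (32, 32) bytes, so an SImode element (4 bytes) gives nunits = (8, 8)
   and the corresponding RVV mode is returned if it exists; otherwise we
   fall back to word_mode as described above.  */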
2334 /* Subroutine of expand_vec_init.
2335 Works as follows:
2336 (a) Initialize TARGET by broadcasting element 0 of BUILDER.
2337 (b) Skip the leading elements of BUILDER that are the same as
2338 element 0.
2339 (c) Insert the remaining elements into TARGET using vslide1down. */
2341 static void
2342 expand_vector_init_insert_elems (rtx target, const rvv_builder &builder,
2343 int nelts_reqd)
2345 machine_mode mode = GET_MODE (target);
2346 rtx dup = expand_vector_broadcast (mode, builder.elt (0));
2347 emit_move_insn (target, dup);
2348 int ndups = builder.count_dups (0, nelts_reqd - 1, 1);
2349 for (int i = ndups; i < nelts_reqd; i++)
2351 unsigned int unspec
2352 = FLOAT_MODE_P (mode) ? UNSPEC_VFSLIDE1DOWN : UNSPEC_VSLIDE1DOWN;
2353 insn_code icode = code_for_pred_slide (unspec, mode);
2354 rtx ops[] = {target, target, builder.elt (i)};
2355 emit_vlmax_insn (icode, BINARY_OP, ops);
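/* Worked example (illustrative): for TARGET = { a, b, c, d } the code above
   emits roughly

     vmv.v.x        v_dst, a      # broadcast element 0
     vslide1down.vx v_dst, v_dst, b
     vslide1down.vx v_dst, v_dst, c
     vslide1down.vx v_dst, v_dst, d

   i.e. { a, a, a, a } -> { a, a, a, b } -> { a, a, b, c } -> { a, b, c, d }.
   For floating-point modes the vfslide1down.vf form is used instead.  */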
2359 /* Use merge approach to initialize the vector with repeating sequence.
2360 v = {a, b, a, b, a, b, a, b}.
2362 v = broadcast (a).
2363 mask = 0b01010101....
2364 v = merge (v, b, mask)
2366 static void
2367 expand_vector_init_merge_repeating_sequence (rtx target,
2368 const rvv_builder &builder)
2370 /* We can't use BIT mode (BI) directly to generate mask = 0b01010...
2371 since we don't have such an instruction in RVV.
2372 Instead, we should use an INT mode (QI/HI/SI/DI) with an integer move
2373 instruction to generate the mask data we want. */
2374 machine_mode mask_bit_mode = get_mask_mode (builder.mode ());
2375 machine_mode mask_int_mode
2376 = get_repeating_sequence_dup_machine_mode (builder, mask_bit_mode);
2377 uint64_t full_nelts = builder.full_nelts ().to_constant ();
2379 /* Step 1: Broadcast the first pattern. */
2380 rtx ops[] = {target, force_reg (builder.inner_mode (), builder.elt (0))};
2381 emit_vlmax_insn (code_for_pred_broadcast (builder.mode ()),
2382 UNARY_OP, ops);
2383 /* Step 2: Merge the rest iteration of pattern. */
2384 for (unsigned int i = 1; i < builder.npatterns (); i++)
2386 /* Step 2-1: Generate mask register v0 for each merge. */
2387 rtx merge_mask
2388 = builder.get_merge_scalar_mask (i, GET_MODE_INNER (mask_int_mode));
2389 rtx mask = gen_reg_rtx (mask_bit_mode);
2390 rtx dup = gen_reg_rtx (mask_int_mode);
2392 if (full_nelts <= builder.inner_bits_size ()) /* vmv.s.x. */
2394 rtx ops[] = {dup, merge_mask};
2395 emit_nonvlmax_insn (code_for_pred_broadcast (GET_MODE (dup)),
2396 SCALAR_MOVE_OP, ops, CONST1_RTX (Pmode));
2398 else /* vmv.v.x. */
2400 rtx ops[] = {dup,
2401 force_reg (GET_MODE_INNER (mask_int_mode), merge_mask)};
2402 rtx vl = gen_int_mode (CEIL (full_nelts, builder.inner_bits_size ()),
2403 Pmode);
2404 emit_nonvlmax_insn (code_for_pred_broadcast (mask_int_mode), UNARY_OP,
2405 ops, vl);
2408 emit_move_insn (mask, gen_lowpart (mask_bit_mode, dup));
2410 /* Step 2-2: Merge pattern according to the mask. */
2411 rtx ops[] = {target, target, builder.elt (i), mask};
2412 emit_vlmax_insn (code_for_pred_merge_scalar (GET_MODE (target)),
2413 MERGE_OP, ops);
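/* Rough sketch of the sequence built above for v = { a, b, a, b, ... } with
   8 elements (register names are only illustrative):

     vmv.v.x    v_dst, a          # step 1: broadcast the first pattern
     li         t0, 0b10101010    # bit set where pattern 1 (b) is wanted
     vmv.s.x    v_tmp, t0         # materialize the mask bits ...
                                  # ... then reinterpret v_tmp as v0
     vmerge.vxm v_dst, v_dst, b, v0

   With more than two patterns, one mask/merge pair is emitted per
   remaining pattern.  */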
2417 /* Use slideup approach to combine the vectors.
2418 v = {a, a, a, a, b, b, b, b}
2420 First:
2421 v1 = {a, a, a, a, a, a, a, a}
2422 v2 = {b, b, b, b, b, b, b, b}
2423 v = slideup (v1, v2, nelt / 2)
2425 static void
2426 expand_vector_init_slideup_combine_sequence (rtx target,
2427 const rvv_builder &builder)
2429 machine_mode mode = GET_MODE (target);
2430 int nelts = builder.full_nelts ().to_constant ();
2431 rtx first_elt = builder.elt (0);
2432 rtx last_elt = builder.elt (nelts - 1);
2433 rtx low = expand_vector_broadcast (mode, first_elt);
2434 rtx high = expand_vector_broadcast (mode, last_elt);
2435 insn_code icode = code_for_pred_slide (UNSPEC_VSLIDEUP, mode);
2436 rtx ops[] = {target, low, high, gen_int_mode (nelts / 2, Pmode)};
2437 emit_vlmax_insn (icode, SLIDEUP_OP_MERGE, ops);
2440 /* Use merge approach to merge a scalar into a vector.
2441 v = {a, a, a, a, a, a, b, b}
2443 v1 = {a, a, a, a, a, a, a, a}
2444 scalar = b
2445 mask = {0, 0, 0, 0, 0, 0, 1, 1}
2447 static void
2448 expand_vector_init_merge_combine_sequence (rtx target,
2449 const rvv_builder &builder)
2451 machine_mode mode = GET_MODE (target);
2452 machine_mode imode = builder.int_mode ();
2453 machine_mode mmode = builder.mask_mode ();
2454 int nelts = builder.full_nelts ().to_constant ();
2455 int leading_ndups = builder.count_dups (0, nelts - 1, 1);
2456 if ((leading_ndups > 255 && GET_MODE_INNER (imode) == QImode)
2457 || riscv_get_v_regno_alignment (imode) > 1)
2458 imode = get_vector_mode (HImode, nelts).require ();
2460 /* Generate vid = { 0, 1, 2, ..., n }. */
2461 rtx vid = gen_reg_rtx (imode);
2462 expand_vec_series (vid, const0_rtx, const1_rtx);
2464 /* Generate mask. */
2465 rtx mask = gen_reg_rtx (mmode);
2466 insn_code icode = code_for_pred_cmp_scalar (imode);
2467 rtx index = gen_int_mode (leading_ndups - 1, builder.inner_int_mode ());
2468 rtx dup_rtx = gen_rtx_VEC_DUPLICATE (imode, index);
2469 /* vmsgtu.vi/vmsgtu.vx. */
2470 rtx cmp = gen_rtx_fmt_ee (GTU, mmode, vid, dup_rtx);
2471 rtx sel = builder.elt (nelts - 1);
2472 rtx mask_ops[] = {mask, cmp, vid, index};
2473 emit_vlmax_insn (icode, COMPARE_OP, mask_ops);
2475 /* Duplicate the first elements. */
2476 rtx dup = expand_vector_broadcast (mode, builder.elt (0));
2477 /* Merge scalar into vector according to mask. */
2478 rtx merge_ops[] = {target, dup, sel, mask};
2479 icode = code_for_pred_merge_scalar (mode);
2480 emit_vlmax_insn (icode, MERGE_OP, merge_ops);
2483 /* Subroutine of expand_vec_init to handle the case
2484 when all trailing elements of the builder are the same.
2485 This works as follows:
2486 (a) Broadcast the last vector element into TARGET.
2487 (b) Insert the remaining elements into TARGET using vslide1up.
2489 ??? The heuristic used is to do the above if the number of identical
2490 trailing elements is greater than leading_ndups, loosely based on
2491 the heuristic from mostly_zeros_p. May need fine-tuning. */
2493 static bool
2494 expand_vector_init_trailing_same_elem (rtx target,
2495 const rtx_vector_builder &builder,
2496 int nelts_reqd)
2498 int leading_ndups = builder.count_dups (0, nelts_reqd - 1, 1);
2499 int trailing_ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
2500 machine_mode mode = GET_MODE (target);
2502 if (trailing_ndups > leading_ndups)
2504 rtx dup = expand_vector_broadcast (mode, builder.elt (nelts_reqd - 1));
2505 for (int i = nelts_reqd - trailing_ndups - 1; i >= 0; i--)
2507 unsigned int unspec
2508 = FLOAT_MODE_P (mode) ? UNSPEC_VFSLIDE1UP : UNSPEC_VSLIDE1UP;
2509 insn_code icode = code_for_pred_slide (unspec, mode);
2510 rtx tmp = gen_reg_rtx (mode);
2511 rtx ops[] = {tmp, dup, builder.elt (i)};
2512 emit_vlmax_insn (icode, BINARY_OP, ops);
2513 /* slide1up needs source and dest to be different registers. */
2514 dup = tmp;
2517 emit_move_insn (target, dup);
2518 return true;
2521 return false;
2524 /* Initialize register TARGET from the elements in PARALLEL rtx VALS. */
2526 void
2527 expand_vec_init (rtx target, rtx vals)
2529 machine_mode mode = GET_MODE (target);
2530 int nelts = XVECLEN (vals, 0);
2532 rvv_builder v (mode, nelts, 1);
2533 for (int i = 0; i < nelts; i++)
2534 v.quick_push (XVECEXP (vals, 0, i));
2535 v.finalize ();
2537 if (nelts > 3)
2539 /* Case 1: Convert v = { a, b, a, b } into v = { ab, ab }. */
2540 if (v.can_duplicate_repeating_sequence_p ())
2542 rtx ele = v.get_merged_repeating_sequence ();
2543 rtx dup = expand_vector_broadcast (v.new_mode (), ele);
2544 emit_move_insn (target, gen_lowpart (mode, dup));
2545 return;
2548 /* Case 2: Optimize repeating sequence cases that Case 1 cannot
2549 handle and where it is profitable. For example:
2550 ELEMENT BITSIZE = 64.
2551 v = {a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b}.
2552 We can't find a vector mode for "ab" which will be combined into
2553 128-bit element to duplicate. */
2554 if (v.repeating_sequence_use_merge_profitable_p ())
2556 expand_vector_init_merge_repeating_sequence (target, v);
2557 return;
2560 /* Case 3: Optimize combine sequence.
2561 E.g. v = {a, a, a, a, a, a, a, a, b, b, b, b, b, b, b, b}.
2562 We can combine:
2563 v1 = {a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a}.
2565 v2 = {b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b}.
2566 by slideup. */
2567 if (v.combine_sequence_use_slideup_profitable_p ())
2569 expand_vector_init_slideup_combine_sequence (target, v);
2570 return;
2573 /* Case 4: Optimize combine sequence.
2574 E.g. v = {a, a, a, a, a, a, a, a, a, a, a, b, b, b, b, b}.
2576 Generate vector:
2577 v = {a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a}.
2579 Generate mask:
2580 mask = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1}.
2582 Merge b into v by mask:
2583 v = {a, a, a, a, a, a, a, a, a, a, a, b, b, b, b, b}. */
2584 if (v.combine_sequence_use_merge_profitable_p ())
2586 expand_vector_init_merge_combine_sequence (target, v);
2587 return;
2591 /* Optimize a sequence with identical trailing elements:
2592 v = {y, y2, y3, y4, y5, x, x, x, x, x, x, x, x, x, x, x}; */
2593 if (!expand_vector_init_trailing_same_elem (target, v, nelts))
2594 /* Handle the common situation by vslide1down. This function can handle any
2595 vec_init<mode> situation. Only the cases that are not optimized above
2596 will fall through to here. */
2597 expand_vector_init_insert_elems (target, v, nelts);
2600 /* Get insn code for corresponding comparison. */
2602 static insn_code
2603 get_cmp_insn_code (rtx_code code, machine_mode mode)
2605 insn_code icode;
2606 switch (code)
2608 case EQ:
2609 case NE:
2610 case LE:
2611 case LEU:
2612 case GT:
2613 case GTU:
2614 case LTGT:
2615 icode = code_for_pred_cmp (mode);
2616 break;
2617 case LT:
2618 case LTU:
2619 case GE:
2620 case GEU:
2621 if (FLOAT_MODE_P (mode))
2622 icode = code_for_pred_cmp (mode);
2623 else
2624 icode = code_for_pred_ltge (mode);
2625 break;
2626 default:
2627 gcc_unreachable ();
2629 return icode;
2632 /* This hook gives the vectorizer more vector mode options. We want it to not
2633 only try modes with the maximum number of units a full vector can hold but
2634 for example also half the number of units for a smaller element size.
2635 Such vectors can be promoted to a full vector of widened elements
2636 (still with the same number of elements, essentially vectorizing at a
2637 fixed number of units rather than a fixed number of bytes). */
2638 unsigned int
2639 autovectorize_vector_modes (vector_modes *modes, bool)
2641 if (autovec_use_vlmax_p ())
2643 poly_uint64 full_size = BYTES_PER_RISCV_VECTOR * TARGET_MAX_LMUL;
2645 /* Start with a RVV<LMUL>QImode where LMUL is the number of units that
2646 fit a whole vector.
2647 Then try LMUL = nunits / 2, nunits / 4 and nunits / 8 which
2648 is guided by the extensions we have available (vf2, vf4 and vf8).
2650 - full_size: Try using full vectors for all element types.
2651 - full_size / 2:
2652 Try using 16-bit containers for 8-bit elements and full vectors
2653 for wider elements.
2654 - full_size / 4:
2655 Try using 32-bit containers for 8-bit and 16-bit elements and
2656 full vectors for wider elements.
2657 - full_size / 8:
2658 Try using 64-bit containers for all element types. */
2659 static const int rvv_factors[] = {1, 2, 4, 8, 16, 32, 64};
2660 for (unsigned int i = 0; i < sizeof (rvv_factors) / sizeof (int); i++)
2662 poly_uint64 units;
2663 machine_mode mode;
2664 if (can_div_trunc_p (full_size, rvv_factors[i], &units)
2665 && get_vector_mode (QImode, units).exists (&mode))
2666 modes->safe_push (mode);
2669 /* Push all VLSmodes according to TARGET_MIN_VLEN. */
2670 unsigned int i = 0;
2671 unsigned int base_size = TARGET_MIN_VLEN * TARGET_MAX_LMUL / 8;
2672 unsigned int size = base_size;
2673 machine_mode mode;
2674 while (size > 0 && get_vector_mode (QImode, size).exists (&mode))
2676 if (vls_mode_valid_p (mode))
2677 modes->safe_push (mode);
2679 i++;
2680 size = base_size / (1U << i);
2682 /* Enable LOOP_VINFO comparison in COST model. */
2683 return VECT_COMPARE_COSTS;
2686 /* Return true if we can find the related MODE according to default LMUL. */
2687 static bool
2688 can_find_related_mode_p (machine_mode vector_mode, scalar_mode element_mode,
2689 poly_uint64 *nunits)
2691 if (!autovec_use_vlmax_p ())
2692 return false;
2693 if (riscv_v_ext_vector_mode_p (vector_mode)
2694 && multiple_p (BYTES_PER_RISCV_VECTOR * TARGET_MAX_LMUL,
2695 GET_MODE_SIZE (element_mode), nunits))
2696 return true;
2697 if (riscv_v_ext_vls_mode_p (vector_mode)
2698 && multiple_p (TARGET_MIN_VLEN * TARGET_MAX_LMUL,
2699 GET_MODE_SIZE (element_mode), nunits))
2700 return true;
2701 return false;
2704 /* If the given VECTOR_MODE is an RVV mode, first get the largest number
2705 of units that fit into a full vector at the given ELEMENT_MODE.
2706 We will have the vectorizer call us with a successively decreasing
2707 number of units (as specified in autovectorize_vector_modes).
2708 The starting mode is always the one specified by preferred_simd_mode. */
2709 opt_machine_mode
2710 vectorize_related_mode (machine_mode vector_mode, scalar_mode element_mode,
2711 poly_uint64 nunits)
2713 /* TODO: We will support RVV VLS auto-vectorization mode in the future. */
2714 poly_uint64 min_units;
2715 if (can_find_related_mode_p (vector_mode, element_mode, &min_units))
2717 machine_mode rvv_mode;
2718 if (maybe_ne (nunits, 0U))
2720 /* If we were given a number of units NUNITS, try to find an
2721 RVV vector mode of inner mode ELEMENT_MODE with the same
2722 number of units. */
2723 if (multiple_p (min_units, nunits)
2724 && get_vector_mode (element_mode, nunits).exists (&rvv_mode))
2725 return rvv_mode;
2727 else
2729 /* Look for a vector mode with the same number of units as the
2730 VECTOR_MODE we were given. We keep track of the minimum
2731 number of units so far which determines the smallest necessary
2732 but largest possible, suitable mode for vectorization. */
2733 min_units = ordered_min (min_units, GET_MODE_SIZE (vector_mode));
2734 if (get_vector_mode (element_mode, min_units).exists (&rvv_mode))
2735 return rvv_mode;
2739 return default_vectorize_related_mode (vector_mode, element_mode, nunits);
2742 /* Expand an RVV comparison. */
2744 void
2745 expand_vec_cmp (rtx target, rtx_code code, rtx op0, rtx op1)
2747 machine_mode mask_mode = GET_MODE (target);
2748 machine_mode data_mode = GET_MODE (op0);
2749 insn_code icode = get_cmp_insn_code (code, data_mode);
2751 if (code == LTGT)
2753 rtx lt = gen_reg_rtx (mask_mode);
2754 rtx gt = gen_reg_rtx (mask_mode);
2755 expand_vec_cmp (lt, LT, op0, op1);
2756 expand_vec_cmp (gt, GT, op0, op1);
2757 icode = code_for_pred (IOR, mask_mode);
2758 rtx ops[] = {target, lt, gt};
2759 emit_vlmax_insn (icode, BINARY_MASK_OP, ops);
2760 return;
2763 rtx cmp = gen_rtx_fmt_ee (code, mask_mode, op0, op1);
2764 rtx ops[] = {target, cmp, op0, op1};
2765 emit_vlmax_insn (icode, COMPARE_OP, ops);
2768 void
2769 expand_vec_cmp (rtx target, rtx_code code, rtx mask, rtx maskoff, rtx op0,
2770 rtx op1)
2772 machine_mode mask_mode = GET_MODE (target);
2773 machine_mode data_mode = GET_MODE (op0);
2774 insn_code icode = get_cmp_insn_code (code, data_mode);
2776 if (code == LTGT)
2778 rtx lt = gen_reg_rtx (mask_mode);
2779 rtx gt = gen_reg_rtx (mask_mode);
2780 expand_vec_cmp (lt, LT, mask, maskoff, op0, op1);
2781 expand_vec_cmp (gt, GT, mask, maskoff, op0, op1);
2782 icode = code_for_pred (IOR, mask_mode);
2783 rtx ops[] = {target, lt, gt};
2784 emit_vlmax_insn (icode, BINARY_MASK_OP, ops);
2785 return;
2788 rtx cmp = gen_rtx_fmt_ee (code, mask_mode, op0, op1);
2789 rtx ops[] = {target, mask, maskoff, cmp, op0, op1};
2790 emit_vlmax_insn (icode, COMPARE_OP_MU, ops);
2793 /* Expand an RVV floating-point comparison:
2795 If CAN_INVERT_P is true, the caller can also handle inverted results;
2796 return true if the result is in fact inverted. */
2798 bool
2799 expand_vec_cmp_float (rtx target, rtx_code code, rtx op0, rtx op1,
2800 bool can_invert_p)
2802 machine_mode mask_mode = GET_MODE (target);
2803 machine_mode data_mode = GET_MODE (op0);
2805 /* If can_invert_p = true:
2806 It suffices to implement a u>= b as !(a < b) but with the NaNs masked off:
2808 vmfeq.vv v0, va, va
2809 vmfeq.vv v1, vb, vb
2810 vmand.mm v0, v0, v1
2811 vmflt.vv v0, va, vb, v0.t
2812 vmnot.m v0, v0
2814 And, if !HONOR_SNANS, then you can remove the vmand.mm by masking the
2815 second vmfeq.vv:
2817 vmfeq.vv v0, va, va
2818 vmfeq.vv v0, vb, vb, v0.t
2819 vmflt.vv v0, va, vb, v0.t
2820 vmnot.m v0, v0
2822 If can_invert_p = false:
2824 # Example of implementing isgreater()
2825 vmfeq.vv v0, va, va # Only set where A is not NaN.
2826 vmfeq.vv v1, vb, vb # Only set where B is not NaN.
2827 vmand.mm v0, v0, v1 # Only set where A and B are ordered,
2828 vmfgt.vv v0, va, vb, v0.t # so only set flags on ordered values.
2831 rtx eq0 = gen_reg_rtx (mask_mode);
2832 rtx eq1 = gen_reg_rtx (mask_mode);
2833 switch (code)
2835 case EQ:
2836 case NE:
2837 case LT:
2838 case LE:
2839 case GT:
2840 case GE:
2841 case LTGT:
2842 /* There is native support for the comparison. */
2843 expand_vec_cmp (target, code, op0, op1);
2844 return false;
2845 case UNEQ:
2846 case ORDERED:
2847 case UNORDERED:
2848 case UNLT:
2849 case UNLE:
2850 case UNGT:
2851 case UNGE:
2852 /* vmfeq.vv v0, va, va */
2853 expand_vec_cmp (eq0, EQ, op0, op0);
2854 if (HONOR_SNANS (data_mode))
2857 vmfeq.vv v1, vb, vb
2858 vmand.mm v0, v0, v1
2860 expand_vec_cmp (eq1, EQ, op1, op1);
2861 insn_code icode = code_for_pred (AND, mask_mode);
2862 rtx ops[] = {eq0, eq0, eq1};
2863 emit_vlmax_insn (icode, BINARY_MASK_OP, ops);
2865 else
2867 /* vmfeq.vv v0, vb, vb, v0.t */
2868 expand_vec_cmp (eq0, EQ, eq0, eq0, op1, op1);
2870 break;
2871 default:
2872 gcc_unreachable ();
2875 if (code == ORDERED)
2877 emit_move_insn (target, eq0);
2878 return false;
2881 /* There is native support for the inverse comparison. */
2882 code = reverse_condition_maybe_unordered (code);
2883 if (code == ORDERED)
2884 emit_move_insn (target, eq0);
2885 else
2886 expand_vec_cmp (eq0, code, eq0, eq0, op0, op1);
2888 if (can_invert_p)
2890 emit_move_insn (target, eq0);
2891 return true;
2894 /* We use one_cmpl<mode>2 so that the combine pass can combine mask
2895 instructions into vmand.mm/vmnand.mm/vmnor.mm/vmxnor.mm. */
2896 emit_insn (gen_rtx_SET (target, gen_rtx_NOT (mask_mode, eq0)));
2897 return false;
2900 /* Modulo all SEL indices to ensure they are all in the range [0, MAX_SEL].
2901 MAX_SEL is nunits - 1 if rtx_equal_p (op0, op1). Otherwise, it is
2902 2 * nunits - 1. */
2903 static rtx
2904 modulo_sel_indices (rtx op0, rtx op1, rtx sel)
2906 rtx sel_mod;
2907 machine_mode sel_mode = GET_MODE (sel);
2908 poly_uint64 nunits = GET_MODE_NUNITS (sel_mode);
2909 poly_uint64 max_sel = rtx_equal_p (op0, op1) ? nunits - 1 : 2 * nunits - 1;
2910 /* If SEL is a variable-length CONST_VECTOR, we don't need to modulo it.
2911 Likewise, if SEL is constant-length and within [0, MAX_SEL], there is no
2912 need to modulo the indices. */
2913 if (CONST_VECTOR_P (sel)
2914 && (!nunits.is_constant () || const_vec_all_in_range_p (sel, 0, max_sel)))
2915 sel_mod = sel;
2916 else
2918 rtx mod = gen_const_vector_dup (sel_mode, max_sel);
2919 sel_mod
2920 = expand_simple_binop (sel_mode, AND, sel, mod, NULL, 0, OPTAB_DIRECT);
2922 return sel_mod;
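/* Worked example (illustrative): with nunits = 4 and op0 == op1, MAX_SEL
   is 3, so sel = { 0, 5, 2, 7 } is ANDed with { 3, 3, 3, 3 } to give
   { 0, 1, 2, 3 }.  With op0 != op1, MAX_SEL is 7 and that selector is
   already in range, so it is left untouched.  */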
2925 /* Implement vec_perm<mode>. */
2927 void
2928 expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
2930 machine_mode data_mode = GET_MODE (target);
2931 machine_mode sel_mode = GET_MODE (sel);
2932 poly_uint64 nunits = GET_MODE_NUNITS (sel_mode);
2934 /* Check if sel only references the first values vector. If each select
2935 index is in the range [0, nunits - 1], a single vrgather instruction is
2936 enough. Since we will use vrgatherei16.vv for variable-length vectors,
2937 it is never out of range and we don't need to modulo the index. */
2938 if (nunits.is_constant () && const_vec_all_in_range_p (sel, 0, nunits - 1))
2940 emit_vlmax_gather_insn (target, op0, sel);
2941 return;
2944 /* Check if all the indices are the same. */
2945 rtx elt;
2946 if (const_vec_duplicate_p (sel, &elt))
2948 poly_uint64 value = rtx_to_poly_int64 (elt);
2949 rtx op = op0;
2950 if (maybe_gt (value, nunits - 1))
2952 sel = gen_const_vector_dup (sel_mode, value - nunits);
2953 op = op1;
2955 emit_vlmax_gather_insn (target, op, sel);
2958 /* Note: vec_perm indices are supposed to wrap when they go beyond the
2959 size of the two value vectors, i.e. the upper bits of the indices
2960 are effectively ignored. RVV vrgather instead produces 0 for any
2961 out-of-range indices, so we need to modulo all the vec_perm indices
2962 to ensure they are all in range of [0, nunits - 1] when op0 == op1
2963 or all in range of [0, 2 * nunits - 1] when op0 != op1. */
2964 rtx sel_mod = modulo_sel_indices (op0, op1, sel);
2966 /* Check if the two values vectors are the same. */
2967 if (rtx_equal_p (op0, op1))
2969 emit_vlmax_gather_insn (target, op0, sel_mod);
2970 return;
2973 /* The following sequence handles the case of
2974 __builtin_shufflevector (vec1, vec2, index...), where each index can be
2975 any value in the range [0, 2 * nunits - 1]. */
2976 machine_mode mask_mode;
2977 mask_mode = get_mask_mode (data_mode);
2978 rtx mask = gen_reg_rtx (mask_mode);
2979 rtx max_sel = gen_const_vector_dup (sel_mode, nunits);
2981 /* Step 1: generate a mask that should select everything >= nunits into the
2982 * mask. */
2983 expand_vec_cmp (mask, GEU, sel_mod, max_sel);
2985 /* Step 2: gather the op0 values indexed by sel into target;
2986 we don't need to care about the result of the elements
2987 whose index >= nunits. */
2988 emit_vlmax_gather_insn (target, op0, sel_mod);
2990 /* Step 3: shift the range from (nunits, max_of_mode] to
2991 [0, max_of_mode - nunits]. */
2992 rtx tmp = gen_reg_rtx (sel_mode);
2993 rtx ops[] = {tmp, sel_mod, max_sel};
2994 emit_vlmax_insn (code_for_pred (MINUS, sel_mode), BINARY_OP, ops);
2996 /* Step 4: gather those into the previously masked-out elements
2997 of target. */
2998 emit_vlmax_masked_gather_mu_insn (target, op1, tmp, mask);
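/* Worked example of the general two-source path above (illustrative):
   nunits = 4, sel = { 0, 5, 2, 7 }.
   Step 1: mask = (sel >= 4)            -> { 0, 1, 0, 1 }
   Step 2: gather op0 by sel            -> lanes 0 and 2 hold op0[0], op0[2]
   Step 3: tmp = sel - 4                -> meaningful only in the masked lanes
   Step 4: masked gather from op1 by tmp fills lanes 1 and 3 with
           op1[1] and op1[3], giving { op0[0], op1[1], op0[2], op1[3] }.  */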
3001 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST for RVV. */
3003 /* vec_perm support. */
3005 struct expand_vec_perm_d
3007 rtx target, op0, op1;
3008 vec_perm_indices perm;
3009 machine_mode vmode;
3010 machine_mode op_mode;
3011 bool one_vector_p;
3012 bool testing_p;
3015 /* Return the appropriate index mode for gather instructions. */
3016 opt_machine_mode
3017 get_gather_index_mode (struct expand_vec_perm_d *d)
3019 machine_mode sel_mode = related_int_vector_mode (d->vmode).require ();
3020 poly_uint64 nunits = GET_MODE_NUNITS (d->vmode);
3022 if (GET_MODE_INNER (d->vmode) == QImode)
3024 if (nunits.is_constant ())
3026 /* If the indices are an LMUL8 CONST_VECTOR and any element value
3027 exceeds the range 0 ~ 255, forbid such a permutation
3028 since we would need a vector HI mode to hold such indices and
3029 we don't have one. */
3030 if (!d->perm.all_in_range_p (0, 255)
3031 && !get_vector_mode (HImode, nunits).exists (&sel_mode))
3032 return opt_machine_mode ();
3034 else
3036 /* Permuting two SEW8 variable-length vectors needs vrgatherei16.vv.
3037 Otherwise, it could overflow the index range. */
3038 if (!get_vector_mode (HImode, nunits).exists (&sel_mode))
3039 return opt_machine_mode ();
3042 else if (riscv_get_v_regno_alignment (sel_mode) > 1
3043 && GET_MODE_INNER (sel_mode) != HImode)
3044 sel_mode = get_vector_mode (HImode, nunits).require ();
3045 return sel_mode;
3048 /* Recognize the patterns where we can use a merge operation to shuffle the
3049 vectors. The value of each element (index i) in the selector can only be
3050 either i or nunits + i. We will check that the pattern is actually monotonic.
3052 E.g.
3053 v = VEC_PERM_EXPR (v0, v1, selector),
3054 selector = { 0, nunits + 1, 2, nunits + 3, 4, nunits + 5, ... }
3056 We can transform such pattern into:
3058 v = vcond_mask (v0, v1, mask),
3059 mask = { 0, 1, 0, 1, 0, 1, ... }. */
3061 static bool
3062 shuffle_merge_patterns (struct expand_vec_perm_d *d)
3064 machine_mode vmode = d->vmode;
3065 machine_mode sel_mode = related_int_vector_mode (vmode).require ();
3066 int n_patterns = d->perm.encoding ().npatterns ();
3067 poly_int64 vec_len = d->perm.length ();
3069 for (int i = 0; i < n_patterns; ++i)
3070 if (!known_eq (d->perm[i], i) && !known_eq (d->perm[i], vec_len + i))
3071 return false;
3073 /* Check that the pattern is monotonic here; otherwise, return false. */
3074 for (int i = n_patterns; i < n_patterns * 2; i++)
3075 if (!d->perm.series_p (i, n_patterns, i, n_patterns)
3076 && !d->perm.series_p (i, n_patterns, vec_len + i, n_patterns))
3077 return false;
3079 /* We need to use a precomputed mask in such a situation, and such a mask
3080 can only be computed for compile-time known size modes. */
3081 bool indices_fit_selector_p
3082 = GET_MODE_BITSIZE (GET_MODE_INNER (vmode)) > 8 || known_lt (vec_len, 256);
3083 if (!indices_fit_selector_p && !vec_len.is_constant ())
3084 return false;
3086 if (d->testing_p)
3087 return true;
3089 machine_mode mask_mode = get_mask_mode (vmode);
3090 rtx mask = gen_reg_rtx (mask_mode);
3092 if (indices_fit_selector_p)
3094 /* MASK = SELECTOR < NUNITS ? 1 : 0. */
3095 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
3096 rtx x = gen_int_mode (vec_len, GET_MODE_INNER (sel_mode));
3097 insn_code icode = code_for_pred_cmp_scalar (sel_mode);
3098 rtx cmp = gen_rtx_fmt_ee (LTU, mask_mode, sel, x);
3099 rtx ops[] = {mask, cmp, sel, x};
3100 emit_vlmax_insn (icode, COMPARE_OP, ops);
3102 else
3104 /* For EEW8, NUNITS may be larger than 255, so we can't use vmsltu
3105 directly to generate the selector mask; instead, we can only use a
3106 precomputed mask.
3108 E.g. for selector = <0, 257, 2, 259> on an EEW8 vector with NUNITS = 256,
3109 we don't have a QImode scalar register that can hold a value larger than 255.
3110 We also cannot hold that in a vector QImode register if LMUL = 8, and,
3111 since there is no larger HI mode vector we cannot create a larger
3112 selector.
3114 As the mask is a simple {0, 1, ...} pattern and the length is known we
3115 can store it in a scalar register and broadcast it to a mask register.
3117 gcc_assert (vec_len.is_constant ());
3118 int size = CEIL (GET_MODE_NUNITS (mask_mode).to_constant (), 8);
3119 machine_mode mode = get_vector_mode (QImode, size).require ();
3120 rtx tmp = gen_reg_rtx (mode);
3121 rvv_builder v (mode, 1, size);
3122 for (int i = 0; i < vec_len.to_constant () / 8; i++)
3124 uint8_t value = 0;
3125 for (int j = 0; j < 8; j++)
3127 int index = i * 8 + j;
3128 if (known_lt (d->perm[index], 256))
3129 value |= 1 << j;
3131 v.quick_push (gen_int_mode (value, QImode));
3133 emit_move_insn (tmp, v.build ());
3134 emit_move_insn (mask, gen_lowpart (mask_mode, tmp));
3137 /* TARGET = MASK ? OP0 : OP1. */
3138 /* swap op0 and op1 since the order is opposite to pred_merge. */
3139 rtx ops2[] = {d->target, d->op1, d->op0, mask};
3140 emit_vlmax_insn (code_for_pred_merge (vmode), MERGE_OP, ops2);
3141 return true;
3144 /* Recognize consecutive index patterns where we can use a single
3145 vrgather.v[x|i] to shuffle the vectors.
3147 e.g. short[8] = VEC_PERM_EXPR <a, a, {0,1,0,1,0,1,0,1}>
3148 Use SEW = 32, index = 1 vrgather.vi to get the result. */
3149 static bool
3150 shuffle_consecutive_patterns (struct expand_vec_perm_d *d)
3152 machine_mode vmode = d->vmode;
3153 scalar_mode smode = GET_MODE_INNER (vmode);
3154 poly_int64 vec_len = d->perm.length ();
3155 HOST_WIDE_INT elt;
3157 if (!vec_len.is_constant () || !d->perm[0].is_constant (&elt))
3158 return false;
3159 int vlen = vec_len.to_constant ();
3161 /* Compute the last element index of consecutive pattern from the leading
3162 consecutive elements. */
3163 int last_consecutive_idx = -1;
3164 int consecutive_num = -1;
3165 for (int i = 1; i < vlen; i++)
3167 if (maybe_ne (d->perm[i], d->perm[i - 1] + 1))
3168 break;
3169 last_consecutive_idx = i;
3170 consecutive_num = last_consecutive_idx + 1;
3173 int new_vlen = vlen / consecutive_num;
3174 if (last_consecutive_idx < 0 || consecutive_num == vlen
3175 || !pow2p_hwi (consecutive_num) || !pow2p_hwi (new_vlen))
3176 return false;
3177 /* VEC_PERM <..., (index, index + 1, ... index + consecutive_num - 1)>.
3178 All elements of index, index + 1, ... index + consecutive_num - 1 should
3179 be located in the same vector. */
3180 if (maybe_ge (d->perm[0], vec_len)
3181 != maybe_ge (d->perm[last_consecutive_idx], vec_len))
3182 return false;
3183 /* If a vector has 8 elements, we allow optimizations on consecutive
3184 patterns, e.g. <0, 1, 2, 3, 0, 1, 2, 3> or <4, 5, 6, 7, 4, 5, 6, 7>.
3185 Other patterns like <2, 3, 4, 5, 2, 3, 4, 5> are not feasible patterns
3186 to optimize. */
3187 if (d->perm[0].to_constant () % consecutive_num != 0)
3188 return false;
3189 unsigned int container_bits = consecutive_num * GET_MODE_BITSIZE (smode);
3190 if (container_bits > 64)
3191 return false;
3192 else if (container_bits == 64)
3194 if (!TARGET_VECTOR_ELEN_64)
3195 return false;
3196 else if (FLOAT_MODE_P (smode) && !TARGET_VECTOR_ELEN_FP_64)
3197 return false;
3200 /* Check that the rest of the elements follow the same consecutive pattern. */
3201 for (int i = consecutive_num; i < vlen; i++)
3202 if (maybe_ne (d->perm[i], d->perm[i % consecutive_num]))
3203 return false;
3205 if (FLOAT_MODE_P (smode))
3206 smode = float_mode_for_size (container_bits).require ();
3207 else
3208 smode = int_mode_for_size (container_bits, 0).require ();
3209 if (!get_vector_mode (smode, new_vlen).exists (&vmode))
3210 return false;
3211 machine_mode sel_mode = related_int_vector_mode (vmode).require ();
3213 /* Success! */
3214 if (d->testing_p)
3215 return true;
3217 int index = elt / consecutive_num;
3218 if (index >= new_vlen)
3219 index = index - new_vlen;
3220 rtx sel = gen_const_vector_dup (sel_mode, index);
3221 rtx op = elt >= vlen ? d->op0 : d->op1;
3222 emit_vlmax_gather_insn (gen_lowpart (vmode, d->target),
3223 gen_lowpart (vmode, op), sel);
3224 return true;
3227 /* Recognize the patterns where we can use a compress operation to shuffle the
3228 vectors. The perm selector of a compress pattern is divided into 2 parts:
3229 the first part consists of arbitrary index numbers < NUNITS;
3230 the second part is the last N consecutive index numbers >= NUNITS.
3232 E.g.
3233 v = VEC_PERM_EXPR (v0, v1, selector),
3234 selector = { 0, 2, 6, 7 }
3236 We can transform such pattern into:
3238 op1 = vcompress (op0, mask)
3239 mask = { 1, 0, 1, 0 }
3240 v = op1. */
3242 static bool
3243 shuffle_compress_patterns (struct expand_vec_perm_d *d)
3245 machine_mode vmode = d->vmode;
3246 poly_int64 vec_len = d->perm.length ();
3248 if (!vec_len.is_constant ())
3249 return false;
3251 int vlen = vec_len.to_constant ();
3253 /* The compress pattern is not worthwhile when it has fewer than 4 elements,
3254 and we can't modulo the indices for the compress pattern. */
3255 if (known_ge (d->perm[vlen - 1], vlen * 2) || vlen < 4)
3256 return false;
3258 /* Compress pattern doesn't work for one vector. */
3259 if (d->one_vector_p)
3260 return false;
3262 /* The compress point is the point at which all selector elements with
3263 index i >= compress point form a consecutively increasing series and
3264 each selector value is >= NUNITS. In this case, we can compress all
3265 elements with i < compress point into op1. */
3266 int compress_point = -1;
3267 for (int i = 0; i < vlen; i++)
3269 if (compress_point < 0 && known_ge (d->perm[i], vec_len))
3271 compress_point = i;
3272 break;
3276 /* We don't apply compress approach if we can't find the compress point. */
3277 if (compress_point < 0)
3278 return false;
3280 /* We can only apply compress approach when all index values from 0 to
3281 compress point are increasing. */
3282 for (int i = 1; i < compress_point; i++)
3283 if (maybe_le (d->perm[i], d->perm[i - 1]))
3284 return false;
3286 /* The series must be consecutively increasing from the compress point. */
3287 for (int i = 1 + compress_point; i < vlen; i++)
3288 if (maybe_ne (d->perm[i], d->perm[i - 1] + 1))
3289 return false;
3291 /* Success! */
3292 if (d->testing_p)
3293 return true;
3295 /* Check whether we need to slide op1 up to apply the compress approach.
3297 E.g. for index = { 0, 2, 6, 7 }, since d->perm[vlen - 1] = 7, which
3298 is 2 * NUNITS - 1, we don't need to slide up.
3300 For index = { 0, 2, 5, 6 }, we need to slide op1 up before
3301 we apply the compress approach. */
3302 bool need_slideup_p = maybe_ne (d->perm[vlen - 1], 2 * vec_len - 1)
3303 && !const_vec_duplicate_p (d->op1);
3305 /* If we leave it to be handled directly by the general gather,
3306 the code sequence will be:
3307 VECTOR LOAD selector
3308 GEU mask, selector, NUNITS
3309 GATHER dest, op0, selector
3310 SUB selector, selector, NUNITS
3311 GATHER dest, op1, selector, mask
3312 Each ALU operation is considered COST = 1 and a VECTOR LOAD is considered
3313 COST = 4, so we consider the general gather handling to have COST = 9.
3314 TODO: This cost is not accurate; we can adjust it by tune info. */
3315 int general_cost = 9;
3317 /* If we can use the compress approach, the code sequence will be:
3318 MASK LOAD mask
3319 COMPRESS op1, op0, mask
3320 If it needs slide up, it will be:
3321 MASK LOAD mask
3322 SLIDEUP op1
3323 COMPRESS op1, op0, mask
3324 By default, a mask load has COST = 2.
3325 TODO: This cost is not accurate; we can adjust it by tune info. */
3326 int compress_cost = 4;
3328 if (general_cost <= compress_cost)
3329 return false;
3331 /* Build a mask that is true for each op0 element selected by the first part of the selector. */
3332 machine_mode mask_mode = get_mask_mode (vmode);
3333 rvv_builder builder (mask_mode, vlen, 1);
3334 for (int i = 0; i < vlen; i++)
3336 bool is_compress_index = false;
3337 for (int j = 0; j < compress_point; j++)
3339 if (known_eq (d->perm[j], i))
3341 is_compress_index = true;
3342 break;
3345 if (is_compress_index)
3346 builder.quick_push (CONST1_RTX (BImode));
3347 else
3348 builder.quick_push (CONST0_RTX (BImode));
3350 rtx mask = force_reg (mask_mode, builder.build ());
3352 rtx merge = d->op1;
3353 if (need_slideup_p)
3355 int slideup_cnt = vlen - (d->perm[vlen - 1].to_constant () % vlen) - 1;
3356 merge = gen_reg_rtx (vmode);
3357 rtx ops[] = {merge, d->op1, gen_int_mode (slideup_cnt, Pmode)};
3358 insn_code icode = code_for_pred_slide (UNSPEC_VSLIDEUP, vmode);
3359 emit_vlmax_insn (icode, BINARY_OP, ops);
3362 insn_code icode = code_for_pred_compress (vmode);
3363 rtx ops[] = {d->target, merge, d->op0, mask};
3364 emit_vlmax_insn (icode, COMPRESS_OP_MERGE, ops);
3365 return true;
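/* Worked example (illustrative): vlen = 4, selector = { 0, 2, 6, 7 }.
   The compress point is 2 (the first index >= vlen), the op0 mask is
   { 1, 0, 1, 0 }, and since d->perm[3] = 7 = 2 * vlen - 1 no slide-up of
   op1 is needed.  The vcompress with op1 as the merge operand then yields
   { op0[0], op0[2], op1[2], op1[3] }.  */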
3368 /* Recognize decompress patterns:
3370 1. VEC_PERM_EXPR op0 and op1
3371 with isel = { 0, nunits, 1, nunits + 1, ... }.
3372 Decompress op0 and op1 vector with the mask = { 0, 1, 0, 1, ... }.
3374 2. VEC_PERM_EXPR op0 and op1
3375 with isel = { 1/2 nunits, 3/2 nunits, 1/2 nunits+1, 3/2 nunits+1,... }.
3376 Slide down op0 and op1 with OFFSET = 1/2 nunits.
3377 Decompress op0 and op1 vector with the mask = { 0, 1, 0, 1, ... }.
3379 static bool
3380 shuffle_decompress_patterns (struct expand_vec_perm_d *d)
3382 poly_uint64 nelt = d->perm.length ();
3383 machine_mode mask_mode = get_mask_mode (d->vmode);
3385 /* For constant size indices, we don't need to handle them here.
3386 Just leave it to vec_perm<mode>. */
3387 if (d->perm.length ().is_constant ())
3388 return false;
3390 poly_uint64 first = d->perm[0];
3391 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
3392 || !d->perm.series_p (0, 2, first, 1)
3393 || !d->perm.series_p (1, 2, first + nelt, 1))
3394 return false;
3396 /* Permuting two SEW8 variable-length vectors needs vrgatherei16.vv.
3397 Otherwise, it could overflow the index range. */
3398 machine_mode sel_mode = related_int_vector_mode (d->vmode).require ();
3399 if (GET_MODE_INNER (d->vmode) == QImode
3400 && !get_vector_mode (HImode, nelt).exists (&sel_mode))
3401 return false;
3403 /* Success! */
3404 if (d->testing_p)
3405 return true;
3407 rtx op0, op1;
3408 if (known_eq (first, 0U))
3410 op0 = d->op0;
3411 op1 = d->op1;
3413 else
3415 op0 = gen_reg_rtx (d->vmode);
3416 op1 = gen_reg_rtx (d->vmode);
3417 insn_code icode = code_for_pred_slide (UNSPEC_VSLIDEDOWN, d->vmode);
3418 rtx ops0[] = {op0, d->op0, gen_int_mode (first, Pmode)};
3419 rtx ops1[] = {op1, d->op1, gen_int_mode (first, Pmode)};
3420 emit_vlmax_insn (icode, BINARY_OP, ops0);
3421 emit_vlmax_insn (icode, BINARY_OP, ops1);
3423 /* Generate the { 0, 1, 0, 1, ... } mask. */
3424 rtx vid = gen_reg_rtx (sel_mode);
3425 rtx vid_repeat = gen_reg_rtx (sel_mode);
3426 expand_vec_series (vid, const0_rtx, const1_rtx);
3427 rtx and_ops[] = {vid_repeat, vid, const1_rtx};
3428 emit_vlmax_insn (code_for_pred_scalar (AND, sel_mode), BINARY_OP, and_ops);
3429 rtx const_vec = gen_const_vector_dup (sel_mode, 1);
3430 rtx mask = gen_reg_rtx (mask_mode);
3431 expand_vec_cmp (mask, EQ, vid_repeat, const_vec);
3432 emit_vlmax_decompress_insn (d->target, op0, op1, mask);
3433 return true;
3436 static bool
3437 shuffle_bswap_pattern (struct expand_vec_perm_d *d)
3439 HOST_WIDE_INT diff;
3440 unsigned i, size, step;
3442 if (!d->one_vector_p || !d->perm[0].is_constant (&diff) || !diff)
3443 return false;
3445 step = diff + 1;
3446 size = step * GET_MODE_UNIT_BITSIZE (d->vmode);
3448 switch (size)
3450 case 16:
3451 break;
3452 case 32:
3453 case 64:
3454 /* We will have a VEC_PERM_EXPR after RTL expansion when invoking
3455 __builtin_bswap. It will generate about 9 instructions in
3456 the loop below, no matter whether it is bswap16, bswap32 or bswap64.
3457 .L2:
3458 1 vle16.v v4,0(a0)
3459 2 vmv.v.x v2,a7
3460 3 vand.vv v2,v6,v2
3461 4 slli a2,a5,1
3462 5 vrgatherei16.vv v1,v4,v2
3463 6 sub a4,a4,a5
3464 7 vse16.v v1,0(a3)
3465 8 add a0,a0,a2
3466 9 add a3,a3,a2
3467 bne a4,zero,.L2
3469 But for bswap16 we may have even simpler code generation, which
3470 has only 7 instructions in the loop below.
3472 1 vle8.v v2,0(a5)
3473 2 addi a5,a5,32
3474 3 vsrl.vi v4,v2,8
3475 4 vsll.vi v2,v2,8
3476 5 vor.vv v4,v4,v2
3477 6 vse8.v v4,0(a4)
3478 7 addi a4,a4,32
3479 bne a5,a6,.L5
3481 Unfortunately, the instructions in the loop grow to 13 and 24
3482 for bswap32 and bswap64 respectively. Thus, we leverage vrgather (9 insns)
3483 for both bswap64 and bswap32, but use shift and or (7 insns)
3484 for bswap16.
3486 default:
3487 return false;
3490 for (i = 0; i < step; i++)
3491 if (!d->perm.series_p (i, step, diff - i, step))
3492 return false;
3494 /* Disable when nunits < 4 since the generic approach used later
3495 is more profitable for BSWAP. */
3496 if (!known_gt (GET_MODE_NUNITS (d->vmode), 2))
3497 return false;
3499 if (d->testing_p)
3500 return true;
3502 machine_mode vhi_mode;
3503 poly_uint64 vhi_nunits = exact_div (GET_MODE_NUNITS (d->vmode), 2);
3505 if (!get_vector_mode (HImode, vhi_nunits).exists (&vhi_mode))
3506 return false;
3508 /* Step-1: Move op0 to src with VHI mode. */
3509 rtx src = gen_reg_rtx (vhi_mode);
3510 emit_move_insn (src, gen_lowpart (vhi_mode, d->op0));
3512 /* Step-2: Shift right 8 bits to dest. */
3513 rtx dest = expand_binop (vhi_mode, lshr_optab, src, gen_int_mode (8, Pmode),
3514 NULL_RTX, 0, OPTAB_DIRECT);
3516 /* Step-3: Shift left 8 bits to src. */
3517 src = expand_binop (vhi_mode, ashl_optab, src, gen_int_mode (8, Pmode),
3518 NULL_RTX, 0, OPTAB_DIRECT);
3520 /* Step-4: Logic Or dest and src to dest. */
3521 dest = expand_binop (vhi_mode, ior_optab, dest, src,
3522 NULL_RTX, 0, OPTAB_DIRECT);
3524 /* Step-5: Move src to target with VQI mode. */
3525 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
3527 return true;
3530 /* Recognize the pattern that can be shuffled by vec_extract and slide1up
3531 approach. */
3533 static bool
3534 shuffle_extract_and_slide1up_patterns (struct expand_vec_perm_d *d)
3536 poly_int64 nunits = GET_MODE_NUNITS (d->vmode);
3538 /* Recognize { nunits - 1, nunits, nunits + 1, ... }. */
3539 if (!d->perm.series_p (0, 2, nunits - 1, 2)
3540 || !d->perm.series_p (1, 2, nunits, 2))
3541 return false;
3543 /* Disable when nunits < 4 since the generic approach used later
3544 is more profitable for indices = { nunits - 1, nunits }. */
3545 if (!known_gt (nunits, 2))
3546 return false;
3548 /* Success! */
3549 if (d->testing_p)
3550 return true;
3552 /* Extract the last element of the first vector. */
3553 scalar_mode smode = GET_MODE_INNER (d->vmode);
3554 rtx tmp = gen_reg_rtx (smode);
3555 emit_vec_extract (tmp, d->op0, gen_int_mode (nunits - 1, Pmode));
3557 /* Insert the scalar into element 0. */
3558 unsigned int unspec
3559 = FLOAT_MODE_P (d->vmode) ? UNSPEC_VFSLIDE1UP : UNSPEC_VSLIDE1UP;
3560 insn_code icode = code_for_pred_slide (unspec, d->vmode);
3561 rtx ops[] = {d->target, d->op1, tmp};
3562 emit_vlmax_insn (icode, BINARY_OP, ops);
3563 return true;
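/* Worked example (illustrative): nunits = 4, selector = { 3, 4, 5, 6 }.
   We extract op0[3] into a scalar and vslide1up op1 by it, giving
   { op0[3], op1[0], op1[1], op1[2] }, which is exactly elements
   { 3, 4, 5, 6 } of the concatenated inputs.  */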
3566 static bool
3567 shuffle_series_patterns (struct expand_vec_perm_d *d)
3569 if (!d->one_vector_p || d->perm.encoding ().npatterns () != 1)
3570 return false;
3572 poly_int64 el1 = d->perm[0];
3573 poly_int64 el2 = d->perm[1];
3574 poly_int64 el3 = d->perm[2];
3576 poly_int64 step1 = el2 - el1;
3577 poly_int64 step2 = el3 - el2;
3579 bool need_insert = false;
3580 bool have_series = false;
3582 /* Check for a full series. */
3583 if (known_ne (step1, 0) && d->perm.series_p (0, 1, el1, step1))
3584 have_series = true;
3586 /* Check for a series starting at the second element. */
3587 else if (known_ne (step2, 0) && d->perm.series_p (1, 1, el2, step2))
3589 have_series = true;
3590 need_insert = true;
3593 if (!have_series)
3594 return false;
3596 /* Disable shuffle if we can't find an appropriate integer index mode for
3597 gather. */
3598 machine_mode sel_mode;
3599 if (!get_gather_index_mode (d).exists (&sel_mode))
3600 return false;
3602 /* Success! */
3603 if (d->testing_p)
3604 return true;
3606 /* Create the series. */
3607 machine_mode eltmode = Pmode;
3608 rtx series = gen_reg_rtx (sel_mode);
3609 expand_vec_series (series, gen_int_mode (need_insert ? el2 : el1, eltmode),
3610 gen_int_mode (need_insert ? step2 : step1, eltmode));
3612 /* Insert the remaining element if necessary. */
3613 if (need_insert)
3615 insn_code icode = code_for_pred_slide (UNSPEC_VSLIDE1UP, sel_mode);
3616 rtx ops[]
3617 = {series, series, gen_int_mode (el1, GET_MODE_INNER (sel_mode))};
3618 emit_vlmax_insn (icode, BINARY_OP, ops);
3621 emit_vlmax_gather_insn (d->target, d->op0, series);
3623 return true;
3626 /* Recognize the pattern that can be shuffled by generic approach. */
3628 static bool
3629 shuffle_generic_patterns (struct expand_vec_perm_d *d)
3631 machine_mode sel_mode;
3633 /* We don't enable SLP for non-power of 2 NPATTERNS. */
3634 if (!pow2p_hwi (d->perm.encoding().npatterns ()))
3635 return false;
3637 /* Disable shuffle if we can't find an appropriate integer index mode for
3638 gather. */
3639 if (!get_gather_index_mode (d).exists (&sel_mode))
3640 return false;
3642 /* Success! */
3643 if (d->testing_p)
3644 return true;
3646 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
3647 /* Some FIXED-VLMAX/VLS vector permutation situations call the target hook
3648 instead of expanding vec_perm<mode>, so we handle them directly. */
3649 expand_vec_perm (d->target, d->op0, d->op1, sel);
3650 return true;
3653 /* This function recognizes and supports different permutation patterns
3654 and enable VLA SLP auto-vectorization. */
3655 static bool
3656 expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
3658 gcc_assert (d->op_mode != E_VOIDmode);
3660 /* The pattern matching functions above are written to look for a small
3661 number to begin the sequence (0, 1, N/2). If we begin with an index
3662 from the second operand, we can swap the operands. */
3663 poly_int64 nelt = d->perm.length ();
3664 if (known_ge (d->perm[0], nelt))
3666 d->perm.rotate_inputs (1);
3667 std::swap (d->op0, d->op1);
3670 if (known_gt (nelt, 1))
3672 if (d->vmode == d->op_mode)
3674 if (shuffle_merge_patterns (d))
3675 return true;
3676 if (shuffle_consecutive_patterns (d))
3677 return true;
3678 if (shuffle_compress_patterns (d))
3679 return true;
3680 if (shuffle_decompress_patterns (d))
3681 return true;
3682 if (shuffle_bswap_pattern (d))
3683 return true;
3684 if (shuffle_extract_and_slide1up_patterns (d))
3685 return true;
3686 if (shuffle_series_patterns (d))
3687 return true;
3688 if (shuffle_generic_patterns (d))
3689 return true;
3690 return false;
3692 else
3693 return false;
3695 return false;
3698 /* This function implements TARGET_VECTORIZE_VEC_PERM_CONST by using RVV
3699 * instructions. */
3700 bool
3701 expand_vec_perm_const (machine_mode vmode, machine_mode op_mode, rtx target,
3702 rtx op0, rtx op1, const vec_perm_indices &sel)
3704 /* RVV doesn't have mask-mode pack/unpack instructions and we don't use
3705 a mask to control the loop iteration. Just disable it directly. */
3706 if (GET_MODE_CLASS (vmode) == MODE_VECTOR_BOOL)
3707 return false;
3708 /* FIXME: Explicitly disable VLA interleaved SLP vectorization because we
3709 may encounter an ICE for poly size (1, 1) vectors in the loop vectorizer.
3710 Ideally, the middle-end loop vectorizer should be able to disable it
3711 itself; we can remove the code here once the middle end is able
3712 to disable VLA SLP vectorization for a poly size (1, 1) VF. */
3713 if (!BYTES_PER_RISCV_VECTOR.is_constant ()
3714 && maybe_lt (BYTES_PER_RISCV_VECTOR * TARGET_MAX_LMUL,
3715 poly_int64 (16, 16)))
3716 return false;
3718 struct expand_vec_perm_d d;
3720 /* Check whether the mask can be applied to a single vector. */
3721 if (sel.ninputs () == 1 || (op0 && rtx_equal_p (op0, op1)))
3722 d.one_vector_p = true;
3723 else if (sel.all_from_input_p (0))
3725 d.one_vector_p = true;
3726 op1 = op0;
3728 else if (sel.all_from_input_p (1))
3730 d.one_vector_p = true;
3731 op0 = op1;
3733 else
3734 d.one_vector_p = false;
3736 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
3737 sel.nelts_per_input ());
3738 d.vmode = vmode;
3739 d.op_mode = op_mode;
3740 d.target = target;
3741 d.op0 = op0;
3742 if (op0 == op1)
3743 d.op1 = d.op0;
3744 else
3745 d.op1 = op1;
3746 d.testing_p = !target;
3748 if (!d.testing_p)
3749 return expand_vec_perm_const_1 (&d);
3751 rtx_insn *last = get_last_insn ();
3752 bool ret = expand_vec_perm_const_1 (&d);
3753 gcc_assert (last == get_last_insn ());
3755 return ret;
3758 /* Generate a vsetvl with no side effects to get the vector length. */
3759 void
3760 expand_select_vl (rtx *ops)
3762 poly_int64 nunits = rtx_to_poly_int64 (ops[2]);
3763 if (CONST_INT_P (ops[1]) && known_le (INTVAL (ops[1]), nunits))
3765 /* If the length is known to be <= VF, we just use it directly instead
3766 of emitting a vsetvli.
3768 E.g. _255 = .SELECT_VL (3, POLY_INT_CST [4, 4]);
3769 We move 3 into _255 instead of using an explicit vsetvl. */
3770 emit_move_insn (ops[0], ops[1]);
3771 return;
3773 /* We arbitrarily pick QImode as the inner scalar mode to get a vector mode,
3774 since vsetvl only demands the ratio. We let the VSETVL pass optimize it. */
3775 scalar_int_mode mode = QImode;
3776 machine_mode rvv_mode = get_vector_mode (mode, nunits).require ();
3777 emit_insn (gen_no_side_effects_vsetvl_rtx (rvv_mode, ops[0], ops[1]));
3780 /* Expand MASK_LEN_{LOAD,STORE}. */
3781 void
3782 expand_load_store (rtx *ops, bool is_load)
3784 poly_int64 value;
3785 rtx mask = ops[2];
3786 rtx len = ops[3];
3787 machine_mode mode = GET_MODE (ops[0]);
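/* Rough illustration of the dispatch below: a LEN equal to the mode's poly
   NUNITS, e.g. (4, 4) for a scalable mode, takes the VLMAX path; a constant
   LEN that fits the vsetivli immediate range [0, 31] (constraint K) is kept
   as an immediate AVL; any other LEN is forced into a register first. */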
3789 if (is_vlmax_len_p (mode, len))
3791 /* If the length operand is equal to VF, it is a VLMAX load/store. */
3792 if (is_load)
3794 rtx m_ops[] = {ops[0], mask, ops[1]};
3795 emit_vlmax_insn (code_for_pred_mov (mode), UNARY_OP_TAMA, m_ops);
3797 else
3799 len = gen_reg_rtx (Pmode);
3800 emit_vlmax_vsetvl (mode, len);
3801 emit_insn (gen_pred_store (mode, ops[0], mask, ops[1], len,
3802 get_avl_type_rtx (VLMAX)));
3805 else
3807 if (!satisfies_constraint_K (len))
3808 len = force_reg (Pmode, len);
3809 if (is_load)
3811 rtx m_ops[] = {ops[0], mask, ops[1]};
3812 emit_nonvlmax_insn (code_for_pred_mov (mode), UNARY_OP_TAMA, m_ops,
3813 len);
3815 else
3816 emit_insn (gen_pred_store (mode, ops[0], mask, ops[1], len,
3817 get_avl_type_rtx (NONVLMAX)));
3822 /* Return true if the operation is a floating-point operation that needs FRM. */
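/* Note: the operations excluded below are exactly those whose results do not
   depend on the dynamic rounding mode, e.g. min/max, neg/abs, copysign, moves,
   FP widening and widening integer-to-FP conversions, which are all exact. */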
3823 static bool
3824 needs_fp_rounding (unsigned icode, machine_mode mode)
3826 if (!FLOAT_MODE_P (mode))
3827 return false;
3829 return icode != maybe_code_for_pred (SMIN, mode)
3830 && icode != maybe_code_for_pred (UNSPEC_VFMIN, mode)
3831 && icode != maybe_code_for_pred (SMAX, mode)
3832 && icode != maybe_code_for_pred (UNSPEC_VFMAX, mode)
3833 && icode != maybe_code_for_pred (NEG, mode)
3834 && icode != maybe_code_for_pred (ABS, mode)
3835 /* narrower-FP -> FP */
3836 && icode != maybe_code_for_pred_extend (mode)
3837 /* narrower-INT -> FP */
3838 && icode != maybe_code_for_pred_widen (FLOAT, mode)
3839 && icode != maybe_code_for_pred_widen (UNSIGNED_FLOAT, mode)
3840 /* vfsgnj */
3841 && icode != maybe_code_for_pred (UNSPEC_VCOPYSIGN, mode)
3842 && icode != maybe_code_for_pred_mov (mode);
3845 /* Subroutine to expand COND_LEN_* patterns. */
3846 static void
3847 expand_cond_len_op (unsigned icode, insn_flags op_type, rtx *ops, rtx len)
3849 rtx dest = ops[0];
3850 rtx mask = ops[1];
3851 machine_mode mode = GET_MODE (dest);
3852 machine_mode mask_mode = GET_MODE (mask);
3853 poly_int64 value;
3854 bool is_dummy_mask = rtx_equal_p (mask, CONSTM1_RTX (mask_mode));
3855 bool is_vlmax_len = is_vlmax_len_p (mode, len);
3857 unsigned insn_flags = HAS_DEST_P | HAS_MASK_P | HAS_MERGE_P | op_type;
3858 if (is_dummy_mask)
3859 insn_flags |= TU_POLICY_P | MDEFAULT_POLICY_P;
3860 else if (is_vlmax_len)
3861 insn_flags |= TDEFAULT_POLICY_P | MU_POLICY_P;
3862 else
3863 insn_flags |= TU_POLICY_P | MU_POLICY_P;
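/* Rationale for the policy selection above: with an all-ones mask there are
   no inactive elements, so the mask policy can be the default; with a VLMAX
   length there is no tail, so the tail policy can be the default; otherwise
   both tail and inactive elements must stay undisturbed so that the merge
   value is preserved. */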
3865 if (needs_fp_rounding (icode, mode))
3866 insn_flags |= FRM_DYN_P;
3868 if (is_vlmax_len)
3869 emit_vlmax_insn (icode, insn_flags, ops);
3870 else
3871 emit_nonvlmax_insn (icode, insn_flags, ops, len);
3874 /* Return RVV_VUNDEF if the ELSE value is a scratch rtx. */
3875 static rtx
3876 get_else_operand (rtx op)
3878 return GET_CODE (op) == SCRATCH ? RVV_VUNDEF (GET_MODE (op)) : op;
3881 /* Expand unary ops COND_LEN_*. */
3882 void
3883 expand_cond_len_unop (unsigned icode, rtx *ops)
3885 rtx dest = ops[0];
3886 rtx mask = ops[1];
3887 rtx src = ops[2];
3888 rtx merge = get_else_operand (ops[3]);
3889 rtx len = ops[4];
3891 rtx cond_ops[] = {dest, mask, merge, src};
3892 expand_cond_len_op (icode, UNARY_OP_P, cond_ops, len);
3895 /* Expand unary ops COND_*. */
3896 void
3897 expand_cond_unop (unsigned icode, rtx *ops)
3899 rtx dest = ops[0];
3900 rtx mask = ops[1];
3901 rtx src = ops[2];
3902 rtx merge = get_else_operand (ops[3]);
3903 rtx len = gen_int_mode (GET_MODE_NUNITS (GET_MODE (dest)), Pmode);
3905 rtx cond_ops[] = {dest, mask, merge, src};
3906 expand_cond_len_op (icode, UNARY_OP_P, cond_ops, len);
3909 /* Expand binary ops COND_LEN_*. */
3910 void
3911 expand_cond_len_binop (unsigned icode, rtx *ops)
3913 rtx dest = ops[0];
3914 rtx mask = ops[1];
3915 rtx src1 = ops[2];
3916 rtx src2 = ops[3];
3917 rtx merge = get_else_operand (ops[4]);
3918 rtx len = ops[5];
3920 rtx cond_ops[] = {dest, mask, merge, src1, src2};
3921 expand_cond_len_op (icode, BINARY_OP_P, cond_ops, len);
3924 /* Expand binary ops COND_*. */
3925 void
3926 expand_cond_binop (unsigned icode, rtx *ops)
3928 rtx dest = ops[0];
3929 rtx mask = ops[1];
3930 rtx src1 = ops[2];
3931 rtx src2 = ops[3];
3932 rtx merge = get_else_operand (ops[4]);
3933 rtx len = gen_int_mode (GET_MODE_NUNITS (GET_MODE (dest)), Pmode);
3935 rtx cond_ops[] = {dest, mask, merge, src1, src2};
3936 expand_cond_len_op (icode, BINARY_OP_P, cond_ops, len);
3939 /* Prepare insn_code for gather_load/scatter_store according to
3940 the vector mode and index mode. */
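/* E.g. an indexed load of V4DI with V4SI offsets uses the x2-greater-EEW
   pattern (64 / 32 == 2), while V4QI offsets would use the x8 variant. */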
3941 static insn_code
3942 prepare_gather_scatter (machine_mode vec_mode, machine_mode idx_mode,
3943 bool is_load)
3945 if (!is_load)
3946 return code_for_pred_indexed_store (UNSPEC_UNORDERED, vec_mode, idx_mode);
3947 else
3949 unsigned src_eew_bitsize = GET_MODE_BITSIZE (GET_MODE_INNER (idx_mode));
3950 unsigned dst_eew_bitsize = GET_MODE_BITSIZE (GET_MODE_INNER (vec_mode));
3951 if (dst_eew_bitsize == src_eew_bitsize)
3952 return code_for_pred_indexed_load_same_eew (UNSPEC_UNORDERED, vec_mode);
3953 else if (dst_eew_bitsize > src_eew_bitsize)
3955 unsigned factor = dst_eew_bitsize / src_eew_bitsize;
3956 switch (factor)
3958 case 2:
3959 return code_for_pred_indexed_load_x2_greater_eew (
3960 UNSPEC_UNORDERED, vec_mode);
3961 case 4:
3962 return code_for_pred_indexed_load_x4_greater_eew (
3963 UNSPEC_UNORDERED, vec_mode);
3964 case 8:
3965 return code_for_pred_indexed_load_x8_greater_eew (
3966 UNSPEC_UNORDERED, vec_mode);
3967 default:
3968 gcc_unreachable ();
3971 else
3973 unsigned factor = src_eew_bitsize / dst_eew_bitsize;
3974 switch (factor)
3976 case 2:
3977 return code_for_pred_indexed_load_x2_smaller_eew (
3978 UNSPEC_UNORDERED, vec_mode);
3979 case 4:
3980 return code_for_pred_indexed_load_x4_smaller_eew (
3981 UNSPEC_UNORDERED, vec_mode);
3982 case 8:
3983 return code_for_pred_indexed_load_x8_smaller_eew (
3984 UNSPEC_UNORDERED, vec_mode);
3985 default:
3986 gcc_unreachable ();
3992 /* Expand LEN_MASK_{GATHER_LOAD,SCATTER_STORE}. */
3993 void
3994 expand_gather_scatter (rtx *ops, bool is_load)
3996 rtx ptr, vec_offset, vec_reg;
3997 bool zero_extend_p;
3998 int scale_log2;
3999 rtx mask = ops[5];
4000 rtx len = ops[6];
4001 if (is_load)
4003 vec_reg = ops[0];
4004 ptr = ops[1];
4005 vec_offset = ops[2];
4006 zero_extend_p = INTVAL (ops[3]);
4007 scale_log2 = exact_log2 (INTVAL (ops[4]));
4009 else
4011 vec_reg = ops[4];
4012 ptr = ops[0];
4013 vec_offset = ops[1];
4014 zero_extend_p = INTVAL (ops[2]);
4015 scale_log2 = exact_log2 (INTVAL (ops[3]));
4018 machine_mode vec_mode = GET_MODE (vec_reg);
4019 machine_mode idx_mode = GET_MODE (vec_offset);
4020 scalar_mode inner_idx_mode = GET_MODE_INNER (idx_mode);
4021 unsigned inner_offsize = GET_MODE_BITSIZE (inner_idx_mode);
4022 poly_int64 nunits = GET_MODE_NUNITS (vec_mode);
4023 poly_int64 value;
4024 bool is_vlmax = is_vlmax_len_p (vec_mode, len);
4026 /* Extend the offset elements to the address width. */
4027 if (inner_offsize < BITS_PER_WORD)
4029 /* 7.2. Vector Load/Store Addressing Modes.
4030 If the vector offset elements are narrower than XLEN, they are
4031 zero-extended to XLEN before adding to the ptr effective address. If
4032 the vector offset elements are wider than XLEN, the least-significant
4033 XLEN bits are used in the address calculation. An implementation must
4034 raise an illegal instruction exception if the EEW is not supported for
4035 offset elements.
4037 RVV spec only refers to the scale_log == 0 case. */
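/* E.g. a zero-extended 16-bit offset with scale_log2 == 0 can be used as-is
   since the hardware zero-extends it, but once it has to be scaled it is
   first widened to 32 bits so the shift below cannot overflow; a
   sign-extended offset is always widened to XLEN first. */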
4038 if (!zero_extend_p || scale_log2 != 0)
4040 if (zero_extend_p)
4041 inner_idx_mode
4042 = int_mode_for_size (inner_offsize * 2, 0).require ();
4043 else
4044 inner_idx_mode = int_mode_for_size (BITS_PER_WORD, 0).require ();
4045 machine_mode new_idx_mode
4046 = get_vector_mode (inner_idx_mode, nunits).require ();
4047 rtx tmp = gen_reg_rtx (new_idx_mode);
4048 emit_insn (gen_extend_insn (tmp, vec_offset, new_idx_mode, idx_mode,
4049 zero_extend_p ? true : false));
4050 vec_offset = tmp;
4051 idx_mode = new_idx_mode;
4055 if (scale_log2 != 0)
4057 rtx tmp = expand_binop (idx_mode, ashl_optab, vec_offset,
4058 gen_int_mode (scale_log2, Pmode), NULL_RTX, 0,
4059 OPTAB_DIRECT);
4060 vec_offset = tmp;
4063 insn_code icode = prepare_gather_scatter (vec_mode, idx_mode, is_load);
4064 if (is_vlmax)
4066 if (is_load)
4068 rtx load_ops[]
4069 = {vec_reg, mask, ptr, vec_offset};
4070 emit_vlmax_insn (icode, BINARY_OP_TAMA, load_ops);
4072 else
4074 rtx store_ops[] = {mask, ptr, vec_offset, vec_reg};
4075 emit_vlmax_insn (icode, SCATTER_OP_M, store_ops);
4078 else
4080 if (is_load)
4082 rtx load_ops[]
4083 = {vec_reg, mask, ptr, vec_offset};
4084 emit_nonvlmax_insn (icode, BINARY_OP_TAMA, load_ops, len);
4086 else
4088 rtx store_ops[] = {mask, ptr, vec_offset, vec_reg};
4089 emit_nonvlmax_insn (icode, SCATTER_OP_M, store_ops, len);
4094 /* Expand COND_LEN_*. */
4095 void
4096 expand_cond_len_ternop (unsigned icode, rtx *ops)
4098 rtx dest = ops[0];
4099 rtx mask = ops[1];
4100 rtx src1 = ops[2];
4101 rtx src2 = ops[3];
4102 rtx src3 = ops[4];
4103 rtx merge = get_else_operand (ops[5]);
4104 rtx len = ops[6];
4106 rtx cond_ops[] = {dest, mask, src1, src2, src3, merge};
4107 expand_cond_len_op (icode, TERNARY_OP_P, cond_ops, len);
4110 /* Expand COND_*. */
4111 void
4112 expand_cond_ternop (unsigned icode, rtx *ops)
4114 rtx dest = ops[0];
4115 rtx mask = ops[1];
4116 rtx src1 = ops[2];
4117 rtx src2 = ops[3];
4118 rtx src3 = ops[4];
4119 rtx merge = get_else_operand (ops[5]);
4120 rtx len = gen_int_mode (GET_MODE_NUNITS (GET_MODE (dest)), Pmode);
4122 rtx cond_ops[] = {dest, mask, src1, src2, src3, merge};
4123 expand_cond_len_op (icode, TERNARY_OP_P, cond_ops, len);
4126 /* Expand reduction operations.
4127 Case 1: ops = {scalar_dest, vector_src}
4128 Case 2: ops = {scalar_dest, vector_src, mask, vl}. */
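/* The expansion below is roughly: broadcast INIT into an LMUL-1 register,
   reduce VECTOR_SRC against it (masked and length-limited in case 2), and
   extract element 0 of the result into the scalar destination. */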
4130 void
4131 expand_reduction (unsigned unspec, unsigned insn_flags, rtx *ops, rtx init)
4133 rtx scalar_dest = ops[0];
4134 rtx vector_src = ops[1];
4135 machine_mode vmode = GET_MODE (vector_src);
4136 machine_mode vel_mode = GET_MODE (scalar_dest);
4137 machine_mode m1_mode = get_m1_mode (vel_mode).require ();
4139 rtx m1_tmp = gen_reg_rtx (m1_mode);
4140 rtx scalar_move_ops[] = {m1_tmp, init};
4141 emit_nonvlmax_insn (code_for_pred_broadcast (m1_mode), SCALAR_MOVE_OP,
4142 scalar_move_ops,
4143 need_mask_operand_p (insn_flags) ? ops[3]
4144 : CONST1_RTX (Pmode));
4145 rtx m1_tmp2 = gen_reg_rtx (m1_mode);
4146 rtx reduc_ops[] = {m1_tmp2, vector_src, m1_tmp};
4147 insn_code icode = code_for_pred (unspec, vmode);
4149 if (need_mask_operand_p (insn_flags))
4151 rtx mask_len_reduc_ops[] = {m1_tmp2, ops[2], vector_src, m1_tmp};
4152 emit_nonvlmax_insn (icode, insn_flags, mask_len_reduc_ops, ops[3]);
4154 else
4155 emit_vlmax_insn (icode, insn_flags, reduc_ops);
4157 emit_insn (gen_pred_extract_first (m1_mode, scalar_dest, m1_tmp2));
4160 /* Prepare ops for ternary operations.
4161 It can be called before or after RA. */
4162 void
4163 prepare_ternary_operands (rtx *ops)
4165 machine_mode mode = GET_MODE (ops[0]);
4167 if (!rtx_equal_p (ops[5], RVV_VUNDEF (mode))
4168 && (VECTOR_MODE_P (GET_MODE (ops[2]))
4169 && !rtx_equal_p (ops[2], ops[5]))
4170 && !rtx_equal_p (ops[3], ops[5])
4171 && !rtx_equal_p (ops[4], ops[5]))
4173 /* RA will fail to find a vector REG and report an ICE, so we pre-merge
4174 the ops for LMUL = 8. */
4175 if (satisfies_constraint_Wc1 (ops[1]))
4177 emit_move_insn (ops[0], ops[5]);
4178 emit_insn (gen_pred_mov (mode, ops[0], ops[1], ops[0], ops[4], ops[6],
4179 ops[7], ops[8], ops[9]));
4181 else
4182 emit_insn (gen_pred_merge (mode, ops[0], RVV_VUNDEF (mode), ops[5],
4183 ops[4], ops[1], ops[6], ops[7], ops[9]));
4184 ops[5] = ops[4] = ops[0];
4186 else
4188 /* Swap the multiplication ops if the fallback value is the
4189 second of the two. */
4190 if (rtx_equal_p (ops[3], ops[5]))
4191 std::swap (ops[2], ops[3]);
4193 /* TODO: ??? Maybe we could support splitting FMA (a, 4, b)
4194 into PLUS (ASHIFT (a, 2), b) according to uarchs. */
4196 gcc_assert (rtx_equal_p (ops[5], RVV_VUNDEF (mode))
4197 || rtx_equal_p (ops[5], ops[2]) || rtx_equal_p (ops[5], ops[4]));
4200 /* Expand VEC_MASK_LEN_{LOAD_LANES,STORE_LANES}. */
4201 void
4202 expand_lanes_load_store (rtx *ops, bool is_load)
4204 poly_int64 value;
4205 rtx mask = ops[2];
4206 rtx len = ops[3];
4207 rtx addr = is_load ? XEXP (ops[1], 0) : XEXP (ops[0], 0);
4208 rtx reg = is_load ? ops[0] : ops[1];
4209 machine_mode mode = GET_MODE (ops[0]);
4211 if (is_vlmax_len_p (mode, len))
4213 /* If the length operand is equal to VF, it is a VLMAX load/store. */
4214 if (is_load)
4216 rtx m_ops[] = {reg, mask, addr};
4217 emit_vlmax_insn (code_for_pred_unit_strided_load (mode), UNARY_OP_TAMA,
4218 m_ops);
4220 else
4222 len = gen_reg_rtx (Pmode);
4223 emit_vlmax_vsetvl (mode, len);
4224 emit_insn (gen_pred_unit_strided_store (mode, mask, addr, reg, len,
4225 get_avl_type_rtx (VLMAX)));
4228 else
4230 if (!satisfies_constraint_K (len))
4231 len = force_reg (Pmode, len);
4232 if (is_load)
4234 rtx m_ops[] = {reg, mask, addr};
4235 emit_nonvlmax_insn (code_for_pred_unit_strided_load (mode),
4236 UNARY_OP_TAMA, m_ops, len);
4238 else
4239 emit_insn (gen_pred_unit_strided_store (mode, mask, addr, reg, len,
4240 get_avl_type_rtx (NONVLMAX)));
4244 /* Expand LEN_FOLD_EXTRACT_LAST. */
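/* A sketch of the strategy below: count the active mask bits; if the count
   is zero, fall through to the DEFAULT value; otherwise compress the active
   elements to the front, slide the last active element down to index 0 and
   extract it with v(f)mv.[xf].s. */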
4245 void
4246 expand_fold_extract_last (rtx *ops)
4248 rtx dst = ops[0];
4249 rtx default_value = ops[1];
4250 rtx mask = ops[2];
4251 rtx anchor = gen_reg_rtx (Pmode);
4252 rtx index = gen_reg_rtx (Pmode);
4253 rtx vect = ops[3];
4254 rtx else_label = gen_label_rtx ();
4255 rtx end_label = gen_label_rtx ();
4256 rtx len = ops[4];
4257 poly_int64 value;
4258 machine_mode mode = GET_MODE (vect);
4259 machine_mode mask_mode = GET_MODE (mask);
4260 rtx compress_vect = gen_reg_rtx (mode);
4261 rtx slide_vect = gen_reg_rtx (mode);
4262 insn_code icode;
4264 if (is_vlmax_len_p (mode, len))
4265 len = NULL_RTX;
4267 /* Calculate the number of 1-bits in the mask. */
4268 rtx cpop_ops[] = {anchor, mask};
4269 if (len)
4270 emit_nonvlmax_insn (code_for_pred_popcount (mask_mode, Pmode), CPOP_OP,
4271 cpop_ops, len);
4272 else
4273 emit_vlmax_insn (code_for_pred_popcount (mask_mode, Pmode), CPOP_OP,
4274 cpop_ops);
4276 riscv_expand_conditional_branch (else_label, EQ, anchor, const0_rtx);
4277 emit_insn (gen_rtx_SET (index, gen_rtx_PLUS (Pmode, anchor, constm1_rtx)));
4278 /* Compress the vector. */
4279 icode = code_for_pred_compress (mode);
4280 rtx compress_ops[] = {compress_vect, vect, mask};
4281 if (len)
4282 emit_nonvlmax_insn (icode, COMPRESS_OP, compress_ops, len);
4283 else
4284 emit_vlmax_insn (icode, COMPRESS_OP, compress_ops);
4285 /* Emit the slide down to index 0 in a new vector. */
4286 rtx slide_ops[] = {slide_vect, compress_vect, index};
4287 icode = code_for_pred_slide (UNSPEC_VSLIDEDOWN, mode);
4288 if (len)
4289 emit_nonvlmax_insn (icode, BINARY_OP, slide_ops, len);
4290 else
4291 emit_vlmax_insn (icode, BINARY_OP, slide_ops);
4292 /* Emit v(f)mv.[xf].s. */
4293 emit_insn (gen_pred_extract_first (mode, dst, slide_vect));
4295 emit_jump_insn (gen_jump (end_label));
4296 emit_barrier ();
4297 emit_label (else_label);
4298 emit_move_insn (dst, default_value);
4299 emit_label (end_label);
4302 /* Return true if the LMUL of the comparison mode is less than or equal to one. */
4303 bool
4304 cmp_lmul_le_one (machine_mode mode)
4306 if (riscv_v_ext_vector_mode_p (mode))
4307 return known_le (GET_MODE_SIZE (mode), BYTES_PER_RISCV_VECTOR);
4308 else if (riscv_v_ext_vls_mode_p (mode))
4309 return known_le (GET_MODE_BITSIZE (mode), TARGET_MIN_VLEN);
4310 return false;
4313 /* Return true if the LMUL of the comparison mode is greater than one. */
4314 bool
4315 cmp_lmul_gt_one (machine_mode mode)
4317 if (riscv_v_ext_vector_mode_p (mode))
4318 return known_gt (GET_MODE_SIZE (mode), BYTES_PER_RISCV_VECTOR);
4319 else if (riscv_v_ext_vls_mode_p (mode))
4320 return known_gt (GET_MODE_BITSIZE (mode), TARGET_MIN_VLEN);
4321 return false;
4324 /* Return true if the VLS mode is legal. There are 2 cases here.
4326 1. Enable VLS modes for VLA vectorization since the fixed-length VLMAX mode
4327 is the highest priority choice and should not conflict with VLS modes.
4328 2. Enable VLS modes for some cases in fixed-vlmax, i.e. when the bitsize of
4329 the VLS mode is smaller than that of the minimal VLA mode.
4331 Take vlen = 2048 as an example for case 2.
4333 Note: The table below is based on vlen = 2048.
4334 +----------------------------------------------------+----------------------+
4335 | VLS mode | VLA mode |
4336 +----------------------------------------------------+----------------------+
4337 | Name | Precision | Inner Precision | Enabled | Min mode | Min bits |
4338 +------------+-----------+-----------------+---------+-----------+----------+
4339 | V1BI | 1 | 1 | Yes | RVVMF64BI | 32 |
4340 | V2BI | 2 | 1 | Yes | RVVMF64BI | 32 |
4341 | V4BI | 4 | 1 | Yes | RVVMF64BI | 32 |
4342 | V8BI | 8 | 1 | Yes | RVVMF64BI | 32 |
4343 | V16BI | 16 | 1 | Yes | RVVMF64BI | 32 |
4344 | V32BI | 32 | 1 | NO | RVVMF64BI | 32 |
4345 | V64BI | 64 | 1 | NO | RVVMF64BI | 32 |
4346 | ... | ... | ... | ... | RVVMF64BI | 32 |
4347 | V4096BI | 4096 | 1 | NO | RVVMF64BI | 32 |
4348 +------------+-----------+-----------------+---------+-----------+----------+
4349 | V1QI | 8 | 8 | Yes | RVVMF8QI | 256 |
4350 | V2QI | 16 | 8 | Yes | RVVMF8QI | 256 |
4351 | V4QI | 32 | 8 | Yes | RVVMF8QI | 256 |
4352 | V8QI | 64 | 8 | Yes | RVVMF8QI | 256 |
4353 | V16QI | 128 | 8 | Yes | RVVMF8QI | 256 |
4354 | V32QI | 256 | 8 | NO | RVVMF8QI | 256 |
4355 | V64QI | 512 | 8 | NO | RVVMF8QI | 256 |
4356 | ... | ... | .. | ... | RVVMF8QI | 256 |
4357 | V4096QI | 32768 | 8 | NO | RVVMF8QI | 256 |
4358 +------------+-----------+-----------------+---------+-----------+----------+
4359 | V1HI | 16 | 16 | Yes | RVVMF4HI | 512 |
4360 | V2HI | 32 | 16 | Yes | RVVMF4HI | 512 |
4361 | V4HI | 64 | 16 | Yes | RVVMF4HI | 512 |
4362 | V8HI | 128 | 16 | Yes | RVVMF4HI | 512 |
4363 | V16HI | 256 | 16 | Yes | RVVMF4HI | 512 |
4364 | V32HI | 512 | 16 | NO | RVVMF4HI | 512 |
4365 | V64HI | 1024 | 16 | NO | RVVMF4HI | 512 |
4366 | ... | ... | .. | ... | RVVMF4HI | 512 |
4367 | V2048HI | 32768 | 16 | NO | RVVMF4HI | 512 |
4368 +------------+-----------+-----------------+---------+-----------+----------+
4369 | V1SI/SF | 32 | 32 | Yes | RVVMF2SI | 1024 |
4370 | V2SI/SF | 64 | 32 | Yes | RVVMF2SI | 1024 |
4371 | V4SI/SF | 128 | 32 | Yes | RVVMF2SI | 1024 |
4372 | V8SI/SF | 256 | 32 | Yes | RVVMF2SI | 1024 |
4373 | V16SI/SF | 512 | 32 | Yes | RVVMF2SI | 1024 |
4374 | V32SI/SF | 1024 | 32 | NO | RVVMF2SI | 1024 |
4375 | V64SI/SF | 2048 | 32 | NO | RVVMF2SI | 1024 |
4376 | ... | ... | .. | ... | RVVMF2SI | 1024 |
4377 | V1024SI/SF | 32768 | 32 | NO | RVVMF2SI | 1024 |
4378 +------------+-----------+-----------------+---------+-----------+----------+
4379 | V1DI/DF | 64 | 64 | Yes | RVVM1DI | 2048 |
4380 | V2DI/DF | 128 | 64 | Yes | RVVM1DI | 2048 |
4381 | V4DI/DF | 256 | 64 | Yes | RVVM1DI | 2048 |
4382 | V8DI/DF | 512 | 64 | Yes | RVVM1DI | 2048 |
4383 | V16DI/DF | 1024 | 64 | Yes | RVVM1DI | 2048 |
4384 | V32DI/DF | 2048 | 64 | NO | RVVM1DI | 2048 |
4385 | V64DI/DF | 4096 | 64 | NO | RVVM1DI | 2048 |
4386 | ... | ... | .. | ... | RVVM1DI | 2048 |
4387 | V512DI/DF | 32768 | 64 | NO | RVVM1DI | 2048 |
4388 +------------+-----------+-----------------+---------+-----------+----------+
4390 This gives the condition for a VLS mode in fixed-vlmax, i.e.:
4391 PRECISION (VLSmode) < VLEN / (64 / PRECISION (VLS_inner_mode)). */
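/* Worked example of the condition above: with vlen = 2048, V16QI has
   PRECISION = 128 and inner precision 8, so the limit is 2048 / (64 / 8) = 256
   and V16QI is enabled, while V32QI (PRECISION = 256) is not, matching the
   table above. */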
4392 bool
4393 vls_mode_valid_p (machine_mode vls_mode)
4395 if (!TARGET_VECTOR)
4396 return false;
4398 if (riscv_autovec_preference == RVV_SCALABLE)
4400 if (GET_MODE_CLASS (vls_mode) != MODE_VECTOR_BOOL
4401 && !ordered_p (TARGET_MAX_LMUL * BITS_PER_RISCV_VECTOR,
4402 GET_MODE_PRECISION (vls_mode)))
4403 /* We only enable VLS modes whose size is ordered against
4404 TARGET_MAX_LMUL * BITS_PER_RISCV_VECTOR.
4406 E.g. when TARGET_MAX_LMUL = 1 and BITS_PER_RISCV_VECTOR = (128, 128),
4407 we enable VLS modes with a fixed size <= 128 bits. For larger VLS
4408 modes ordered_p is false against the (128, 128) bit VLA size, and
4409 allowing them would end up causing multiple ICEs in generic
4410 middle-end code. */
4411 return false;
4412 return true;
4415 if (riscv_autovec_preference == RVV_FIXED_VLMAX)
4417 machine_mode inner_mode = GET_MODE_INNER (vls_mode);
4418 int precision = GET_MODE_PRECISION (inner_mode).to_constant ();
4419 int min_vlmax_bitsize = TARGET_MIN_VLEN / (64 / precision);
4421 return GET_MODE_PRECISION (vls_mode).to_constant () < min_vlmax_bitsize;
4424 return false;
4427 /* We don't have to convert a floating-point value to an integer when its
4428 fractional part is zero. Thus, there is a limit value for half,
4429 single and double precision floating point: a value has no
4430 fractional part once it is greater than or equal to that limit.
4432 1. Half floating point.
4433 +-----------+---------------+
4434 | float | binary layout |
4435 +-----------+---------------+
4436 | 1023.5 | 0x63ff |
4437 +-----------+---------------+
4438 | 1024.0 | 0x6400 |
4439 +-----------+---------------+
4440 | 1025.0 | 0x6401 |
4441 +-----------+---------------+
4442 | ... | ... |
4444 All half-precision floating-point values are unchanged by ceil if they
4445 are greater than or equal to 1024.
4447 2. Single floating point.
4448 +-----------+---------------+
4449 | float | binary layout |
4450 +-----------+---------------+
4451 | 8388607.5 | 0x4affffff |
4452 +-----------+---------------+
4453 | 8388608.0 | 0x4b000000 |
4454 +-----------+---------------+
4455 | 8388609.0 | 0x4b000001 |
4456 +-----------+---------------+
4457 | ... | ... |
4459 All single-precision floating-point values are unchanged by ceil if they
4460 are greater than or equal to 8388608.
4462 3. Double floating point.
4463 +--------------------+--------------------+
4464 | float | binary layout |
4465 +--------------------+--------------------+
4466 | 4503599627370495.5 | 0X432fffffffffffff |
4467 +--------------------+--------------------+
4468 | 4503599627370496.0 | 0X4330000000000000 |
4469 +--------------------+--------------------+
4470 | 4503599627370497.0 | 0X4330000000000001 |
4471 +--------------------+--------------------+
4472 | ... | ... |
4474 All double-precision floating-point values are unchanged by ceil if they
4475 are greater than or equal to 4503599627370496. */
4477 static rtx
4478 get_fp_rounding_coefficient (machine_mode inner_mode)
4480 REAL_VALUE_TYPE real;
4482 if (inner_mode == E_HFmode)
4483 real_from_integer (&real, inner_mode, 1024, SIGNED);
4484 else if (inner_mode == E_SFmode)
4485 real_from_integer (&real, inner_mode, 8388608, SIGNED);
4486 else if (inner_mode == E_DFmode)
4487 real_from_integer (&real, inner_mode, 4503599627370496, SIGNED);
4488 else
4489 gcc_unreachable ();
4491 return const_double_from_real_value (real, inner_mode);
4494 static rtx
4495 emit_vec_float_cmp_mask (rtx fp_vector, rtx_code code, rtx fp_scalar,
4496 machine_mode vec_fp_mode)
4498 /* Step-1: Prepare the scalar float compare register. */
4499 rtx fp_reg = gen_reg_rtx (GET_MODE_INNER (vec_fp_mode));
4500 emit_insn (gen_move_insn (fp_reg, fp_scalar));
4502 /* Step-2: Generate the mask. */
4503 machine_mode mask_mode = get_mask_mode (vec_fp_mode);
4504 rtx mask = gen_reg_rtx (mask_mode);
4505 rtx cmp = gen_rtx_fmt_ee (code, mask_mode, fp_vector, fp_reg);
4506 rtx cmp_ops[] = {mask, cmp, fp_vector, fp_reg};
4507 insn_code icode = code_for_pred_cmp_scalar (vec_fp_mode);
4508 emit_vlmax_insn (icode, COMPARE_OP, cmp_ops);
4510 return mask;
4513 static void
4514 emit_vec_copysign (rtx op_dest, rtx op_src_0, rtx op_src_1,
4515 machine_mode vec_mode)
4517 rtx sgnj_ops[] = {op_dest, op_src_0, op_src_1};
4518 insn_code icode = code_for_pred (UNSPEC_VCOPYSIGN, vec_mode);
4520 emit_vlmax_insn (icode, BINARY_OP, sgnj_ops);
4523 static void
4524 emit_vec_abs (rtx op_dest, rtx op_src, machine_mode vec_mode)
4526 rtx abs_ops[] = {op_dest, op_src};
4527 insn_code icode = code_for_pred (ABS, vec_mode);
4529 emit_vlmax_insn (icode, UNARY_OP, abs_ops);
4532 static void
4533 emit_vec_cvt_x_f (rtx op_dest, rtx op_src, rtx mask,
4534 insn_type type, machine_mode vec_mode)
4536 insn_code icode = code_for_pred_fcvt_x_f (UNSPEC_VFCVT, vec_mode);
4538 if (type & USE_VUNDEF_MERGE_P)
4540 rtx cvt_x_ops[] = {op_dest, mask, op_src};
4541 emit_vlmax_insn (icode, type, cvt_x_ops);
4543 else
4545 rtx cvt_x_ops[] = {op_dest, mask, op_dest, op_src};
4546 emit_vlmax_insn (icode, type, cvt_x_ops);
4550 static void
4551 emit_vec_cvt_x_f (rtx op_dest, rtx op_src, insn_type type,
4552 machine_mode vec_mode)
4554 rtx ops[] = {op_dest, op_src};
4555 insn_code icode = code_for_pred_fcvt_x_f (UNSPEC_VFCVT, vec_mode);
4557 emit_vlmax_insn (icode, type, ops);
4560 static void
4561 emit_vec_narrow_cvt_x_f (rtx op_dest, rtx op_src, insn_type type,
4562 machine_mode vec_mode)
4564 rtx ops[] = {op_dest, op_src};
4565 insn_code icode = code_for_pred_narrow_fcvt_x_f (UNSPEC_VFCVT, vec_mode);
4567 emit_vlmax_insn (icode, type, ops);
4570 static void
4571 emit_vec_widden_cvt_x_f (rtx op_dest, rtx op_src, insn_type type,
4572 machine_mode vec_mode)
4574 rtx ops[] = {op_dest, op_src};
4575 insn_code icode = code_for_pred_widen_fcvt_x_f (UNSPEC_VFCVT, vec_mode);
4577 emit_vlmax_insn (icode, type, ops);
4580 static void
4581 emit_vec_widden_cvt_f_f (rtx op_dest, rtx op_src, insn_type type,
4582 machine_mode vec_mode)
4584 rtx ops[] = {op_dest, op_src};
4585 insn_code icode = code_for_pred_extend (vec_mode);
4587 emit_vlmax_insn (icode, type, ops);
4590 static void
4591 emit_vec_cvt_f_x (rtx op_dest, rtx op_src, rtx mask,
4592 insn_type type, machine_mode vec_mode)
4594 rtx cvt_fp_ops[] = {op_dest, mask, op_dest, op_src};
4595 insn_code icode = code_for_pred (FLOAT, vec_mode);
4597 emit_vlmax_insn (icode, type, cvt_fp_ops);
4600 static void
4601 emit_vec_cvt_x_f_rtz (rtx op_dest, rtx op_src, rtx mask,
4602 insn_type type, machine_mode vec_mode)
4604 insn_code icode = code_for_pred (FIX, vec_mode);
4606 if (type & USE_VUNDEF_MERGE_P)
4608 rtx cvt_x_ops[] = {op_dest, mask, op_src};
4609 emit_vlmax_insn (icode, type, cvt_x_ops);
4611 else
4613 rtx cvt_x_ops[] = {op_dest, mask, op_dest, op_src};
4614 emit_vlmax_insn (icode, type, cvt_x_ops);
4618 void
4619 expand_vec_ceil (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
4620 machine_mode vec_int_mode)
4622 /* Step-1: Get the abs float value for mask generation. */
4623 emit_vec_abs (op_0, op_1, vec_fp_mode);
4625 /* Step-2: Generate the mask on const fp. */
4626 rtx const_fp = get_fp_rounding_coefficient (GET_MODE_INNER (vec_fp_mode));
4627 rtx mask = emit_vec_float_cmp_mask (op_0, LT, const_fp, vec_fp_mode);
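/* E.g. for SFmode, 3.2 and -0.3 have |x| < 8388608, so their lanes are
   active and get converted (giving 4.0 and, after the final copysign, -0.0);
   a lane holding -2^30 is inactive, keeps |x| from Step-1 through the
   mask-undisturbed steps and gets its sign back from the copysign in
   Step-5. */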
4629 /* Step-3: Convert to integer on mask, with rounding up (aka ceil). */
4630 rtx tmp = gen_reg_rtx (vec_int_mode);
4631 emit_vec_cvt_x_f (tmp, op_1, mask, UNARY_OP_TAMA_FRM_RUP, vec_fp_mode);
4633 /* Step-4: Convert back to floating-point on mask for the final result.
4634 To avoid an unnecessary frm register access, we reuse RUP here; it will
4635 never actually round up because the tmp rtx comes from the float-to-int
4636 conversion and is therefore already integral. */
4637 emit_vec_cvt_f_x (op_0, tmp, mask, UNARY_OP_TAMU_FRM_RUP, vec_fp_mode);
4639 /* Step-5: Retrieve the sign bit for -0.0. */
4640 emit_vec_copysign (op_0, op_0, op_1, vec_fp_mode);
4643 void
4644 expand_vec_floor (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
4645 machine_mode vec_int_mode)
4647 /* Step-1: Get the abs float value for mask generation. */
4648 emit_vec_abs (op_0, op_1, vec_fp_mode);
4650 /* Step-2: Generate the mask on const fp. */
4651 rtx const_fp = get_fp_rounding_coefficient (GET_MODE_INNER (vec_fp_mode));
4652 rtx mask = emit_vec_float_cmp_mask (op_0, LT, const_fp, vec_fp_mode);
4654 /* Step-3: Convert to integer on mask, with rounding down (aka floor). */
4655 rtx tmp = gen_reg_rtx (vec_int_mode);
4656 emit_vec_cvt_x_f (tmp, op_1, mask, UNARY_OP_TAMA_FRM_RDN, vec_fp_mode);
4658 /* Step-4: Convert to floating-point on mask for the floor result. */
4659 emit_vec_cvt_f_x (op_0, tmp, mask, UNARY_OP_TAMU_FRM_RDN, vec_fp_mode);
4661 /* Step-5: Retrieve the sign bit for -0.0. */
4662 emit_vec_copysign (op_0, op_0, op_1, vec_fp_mode);
4665 void
4666 expand_vec_nearbyint (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
4667 machine_mode vec_int_mode)
4669 /* Step-1: Get the abs float value for mask generation. */
4670 emit_vec_abs (op_0, op_1, vec_fp_mode);
4672 /* Step-2: Generate the mask on const fp. */
4673 rtx const_fp = get_fp_rounding_coefficient (GET_MODE_INNER (vec_fp_mode));
4674 rtx mask = emit_vec_float_cmp_mask (op_0, LT, const_fp, vec_fp_mode);
4676 /* Step-3: Back up the FP exception flags; nearbyint never raises exceptions. */
4677 rtx fflags = gen_reg_rtx (SImode);
4678 emit_insn (gen_riscv_frflags (fflags));
4680 /* Step-4: Convert to integer on mask, with dynamic rounding (aka nearbyint). */
4681 rtx tmp = gen_reg_rtx (vec_int_mode);
4682 emit_vec_cvt_x_f (tmp, op_1, mask, UNARY_OP_TAMA_FRM_DYN, vec_fp_mode);
4684 /* Step-5: Convert to floating-point on mask for the nearbyint result. */
4685 emit_vec_cvt_f_x (op_0, tmp, mask, UNARY_OP_TAMU_FRM_DYN, vec_fp_mode);
4687 /* Step-6: Restore FP exception flags. */
4688 emit_insn (gen_riscv_fsflags (fflags));
4690 /* Step-7: Retrieve the sign bit for -0.0. */
4691 emit_vec_copysign (op_0, op_0, op_1, vec_fp_mode);
4694 void
4695 expand_vec_rint (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
4696 machine_mode vec_int_mode)
4698 /* Step-1: Get the abs float value for mask generation. */
4699 emit_vec_abs (op_0, op_1, vec_fp_mode);
4701 /* Step-2: Generate the mask on const fp. */
4702 rtx const_fp = get_fp_rounding_coefficient (GET_MODE_INNER (vec_fp_mode));
4703 rtx mask = emit_vec_float_cmp_mask (op_0, LT, const_fp, vec_fp_mode);
4705 /* Step-3: Convert to integer on mask, with dyn rounding (aka rint). */
4706 rtx tmp = gen_reg_rtx (vec_int_mode);
4707 emit_vec_cvt_x_f (tmp, op_1, mask, UNARY_OP_TAMA_FRM_DYN, vec_fp_mode);
4709 /* Step-4: Convert to floating-point on mask for the rint result. */
4710 emit_vec_cvt_f_x (op_0, tmp, mask, UNARY_OP_TAMU_FRM_DYN, vec_fp_mode);
4712 /* Step-5: Retrieve the sign bit for -0.0. */
4713 emit_vec_copysign (op_0, op_0, op_1, vec_fp_mode);
4716 void
4717 expand_vec_round (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
4718 machine_mode vec_int_mode)
4720 /* Step-1: Get the abs float value for mask generation. */
4721 emit_vec_abs (op_0, op_1, vec_fp_mode);
4723 /* Step-2: Generate the mask on const fp. */
4724 rtx const_fp = get_fp_rounding_coefficient (GET_MODE_INNER (vec_fp_mode));
4725 rtx mask = emit_vec_float_cmp_mask (op_0, LT, const_fp, vec_fp_mode);
4727 /* Step-3: Convert to integer on mask, rounding to nearest (aka round). */
4728 rtx tmp = gen_reg_rtx (vec_int_mode);
4729 emit_vec_cvt_x_f (tmp, op_1, mask, UNARY_OP_TAMA_FRM_RMM, vec_fp_mode);
4731 /* Step-4: Convert to floating-point on mask for the round result. */
4732 emit_vec_cvt_f_x (op_0, tmp, mask, UNARY_OP_TAMU_FRM_RMM, vec_fp_mode);
4734 /* Step-5: Retrieve the sign bit for -0.0. */
4735 emit_vec_copysign (op_0, op_0, op_1, vec_fp_mode);
4738 void
4739 expand_vec_trunc (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
4740 machine_mode vec_int_mode)
4742 /* Step-1: Get the abs float value for mask generation. */
4743 emit_vec_abs (op_0, op_1, vec_fp_mode);
4745 /* Step-2: Generate the mask on const fp. */
4746 rtx const_fp = get_fp_rounding_coefficient (GET_MODE_INNER (vec_fp_mode));
4747 rtx mask = emit_vec_float_cmp_mask (op_0, LT, const_fp, vec_fp_mode);
4749 /* Step-3: Convert to integer on mask, rounding to zero (aka truncate). */
4750 rtx tmp = gen_reg_rtx (vec_int_mode);
4751 emit_vec_cvt_x_f_rtz (tmp, op_1, mask, UNARY_OP_TAMA, vec_fp_mode);
4753 /* Step-4: Convert to floating-point on mask for the truncation result. */
4754 emit_vec_cvt_f_x (op_0, tmp, mask, UNARY_OP_TAMU_FRM_DYN, vec_fp_mode);
4756 /* Step-5: Retrieve the sign bit for -0.0. */
4757 emit_vec_copysign (op_0, op_0, op_1, vec_fp_mode);
4760 void
4761 expand_vec_roundeven (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
4762 machine_mode vec_int_mode)
4764 /* Step-1: Get the abs float value for mask generation. */
4765 emit_vec_abs (op_0, op_1, vec_fp_mode);
4767 /* Step-2: Generate the mask on const fp. */
4768 rtx const_fp = get_fp_rounding_coefficient (GET_MODE_INNER (vec_fp_mode));
4769 rtx mask = emit_vec_float_cmp_mask (op_0, LT, const_fp, vec_fp_mode);
4771 /* Step-3: Convert to integer on mask, rounding to nearest, ties to even. */
4772 rtx tmp = gen_reg_rtx (vec_int_mode);
4773 emit_vec_cvt_x_f (tmp, op_1, mask, UNARY_OP_TAMA_FRM_RNE, vec_fp_mode);
4775 /* Step-4: Convert to floating-point on mask for the roundeven result. */
4776 emit_vec_cvt_f_x (op_0, tmp, mask, UNARY_OP_TAMU_FRM_RNE, vec_fp_mode);
4778 /* Step-5: Retrieve the sign bit for -0.0. */
4779 emit_vec_copysign (op_0, op_0, op_1, vec_fp_mode);
4782 /* Handle the rounding from floating-point to int/long/long long. */
4783 static void
4784 emit_vec_rounding_to_integer (rtx op_0, rtx op_1, insn_type type,
4785 machine_mode vec_fp_mode,
4786 machine_mode vec_int_mode,
4787 machine_mode vec_bridge_mode = E_VOIDmode)
4789 poly_uint16 vec_fp_size = GET_MODE_SIZE (vec_fp_mode);
4790 poly_uint16 vec_int_size = GET_MODE_SIZE (vec_int_mode);
4792 if (known_eq (vec_fp_size, vec_int_size)) /* SF => SI, DF => DI. */
4793 emit_vec_cvt_x_f (op_0, op_1, type, vec_fp_mode);
4794 else if (maybe_eq (vec_fp_size, vec_int_size * 2)) /* DF => SI. */
4795 emit_vec_narrow_cvt_x_f (op_0, op_1, type, vec_fp_mode);
4796 else if (maybe_eq (vec_fp_size * 2, vec_int_size)) /* SF => DI, HF => SI. */
4797 emit_vec_widden_cvt_x_f (op_0, op_1, type, vec_int_mode);
4798 else if (maybe_eq (vec_fp_size * 4, vec_int_size)) /* HF => DI. */
4800 gcc_assert (vec_bridge_mode != E_VOIDmode);
4802 rtx op_sf = gen_reg_rtx (vec_bridge_mode);
4804 /* Step-1: HF => SF, no rounding here. */
4805 emit_vec_widden_cvt_f_f (op_sf, op_1, UNARY_OP, vec_bridge_mode);
4806 /* Step-2: SF => DI. */
4807 emit_vec_widden_cvt_x_f (op_0, op_sf, type, vec_int_mode);
4809 else
4810 gcc_unreachable ();
4813 void
4814 expand_vec_lrint (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
4815 machine_mode vec_int_mode, machine_mode vec_bridge_mode)
4817 emit_vec_rounding_to_integer (op_0, op_1, UNARY_OP_FRM_DYN, vec_fp_mode,
4818 vec_int_mode, vec_bridge_mode);
4821 void
4822 expand_vec_lround (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
4823 machine_mode vec_int_mode, machine_mode vec_bridge_mode)
4825 emit_vec_rounding_to_integer (op_0, op_1, UNARY_OP_FRM_RMM, vec_fp_mode,
4826 vec_int_mode, vec_bridge_mode);
4829 void
4830 expand_vec_lceil (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
4831 machine_mode vec_int_mode)
4833 emit_vec_rounding_to_integer (op_0, op_1, UNARY_OP_FRM_RUP, vec_fp_mode,
4834 vec_int_mode);
4837 void
4838 expand_vec_lfloor (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
4839 machine_mode vec_int_mode)
4841 emit_vec_rounding_to_integer (op_0, op_1, UNARY_OP_FRM_RDN, vec_fp_mode,
4842 vec_int_mode);
4845 /* Vectorize popcount by the Wilkes-Wheeler-Gill algorithm that libgcc uses as
4846 well. */
4847 void
4848 expand_popcount (rtx *ops)
4850 rtx dst = ops[0];
4851 rtx src = ops[1];
4852 machine_mode mode = GET_MODE (dst);
4853 scalar_mode imode = GET_MODE_INNER (mode);
4854 static const uint64_t m5 = 0x5555555555555555ULL;
4855 static const uint64_t m3 = 0x3333333333333333ULL;
4856 static const uint64_t mf = 0x0F0F0F0F0F0F0F0FULL;
4857 static const uint64_t m1 = 0x0101010101010101ULL;
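/* Worked example on a single 8-bit element, src = 0xa7 (popcount 5):
   x1 = 0xa7 - (0x53 & 0x55) = 0x56 (2-bit pair counts 1, 1, 1, 2),
   x2 = (0x56 & 0x33) + (0x15 & 0x33) = 0x23 (nibble counts 2 and 3),
   x3 = (0x23 + 0x02) & 0x0f = 0x05,
   dst = (0x05 * 0x01) >> 0 = 5. */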
4859 rtx x1 = gen_reg_rtx (mode);
4860 rtx x2 = gen_reg_rtx (mode);
4861 rtx x3 = gen_reg_rtx (mode);
4862 rtx x4 = gen_reg_rtx (mode);
4864 /* x1 = src - ((src >> 1) & 0x555...); */
4865 rtx shift1 = expand_binop (mode, lshr_optab, src, GEN_INT (1), NULL, true,
4866 OPTAB_DIRECT);
4868 rtx and1 = gen_reg_rtx (mode);
4869 rtx ops1[] = {and1, shift1, gen_int_mode (m5, imode)};
4870 emit_vlmax_insn (code_for_pred_scalar (AND, mode), riscv_vector::BINARY_OP,
4871 ops1);
4873 x1 = expand_binop (mode, sub_optab, src, and1, NULL, true, OPTAB_DIRECT);
4875 /* x2 = (x1 & 0x3333333333333333ULL) + ((x1 >> 2) & 0x3333333333333333ULL); */
4877 rtx and2 = gen_reg_rtx (mode);
4878 rtx ops2[] = {and2, x1, gen_int_mode (m3, imode)};
4879 emit_vlmax_insn (code_for_pred_scalar (AND, mode), riscv_vector::BINARY_OP,
4880 ops2);
4882 rtx shift2 = expand_binop (mode, lshr_optab, x1, GEN_INT (2), NULL, true,
4883 OPTAB_DIRECT);
4885 rtx and22 = gen_reg_rtx (mode);
4886 rtx ops22[] = {and22, shift2, gen_int_mode (m3, imode)};
4887 emit_vlmax_insn (code_for_pred_scalar (AND, mode), riscv_vector::BINARY_OP,
4888 ops22);
4890 x2 = expand_binop (mode, add_optab, and2, and22, NULL, true, OPTAB_DIRECT);
4892 /* x3 = (x2 + (x2 >> 4)) & 0x0f0f0f0f0f0f0f0fULL; */
4893 rtx shift3 = expand_binop (mode, lshr_optab, x2, GEN_INT (4), NULL, true,
4894 OPTAB_DIRECT);
4896 rtx plus3
4897 = expand_binop (mode, add_optab, x2, shift3, NULL, true, OPTAB_DIRECT);
4899 rtx ops3[] = {x3, plus3, gen_int_mode (mf, imode)};
4900 emit_vlmax_insn (code_for_pred_scalar (AND, mode), riscv_vector::BINARY_OP,
4901 ops3);
4903 /* dest = (x3 * 0x0101010101010101ULL) >> 56; */
4904 rtx mul4 = gen_reg_rtx (mode);
4905 rtx ops4[] = {mul4, x3, gen_int_mode (m1, imode)};
4906 emit_vlmax_insn (code_for_pred_scalar (MULT, mode), riscv_vector::BINARY_OP,
4907 ops4);
4909 x4 = expand_binop (mode, lshr_optab, mul4,
4910 GEN_INT (GET_MODE_BITSIZE (imode) - 8), NULL, true,
4911 OPTAB_DIRECT);
4913 emit_move_insn (dst, x4);
4916 /* Return true if it is VLMAX AVL TYPE. */
4917 bool
4918 vlmax_avl_type_p (rtx_insn *rinsn)
4920 extract_insn_cached (rinsn);
4921 int index = get_attr_avl_type_idx (rinsn);
4922 if (index == INVALID_ATTRIBUTE)
4923 return false;
4924 rtx avl_type = recog_data.operand[index];
4925 return INTVAL (avl_type) == VLMAX;
4928 /* Return true if it is an RVV instruction that depends on the VL global
4929 status register. */
4930 bool
4931 has_vl_op (rtx_insn *rinsn)
4933 return recog_memoized (rinsn) >= 0 && get_attr_has_vl_op (rinsn);
4936 /* Get default tail policy. */
4937 static bool
4938 get_default_ta ()
4940 /* For instructions that don't require TA, we still need a default value
4941 to emit vsetvl. We pick the default value according to the preferred policy. */
4942 return (bool) (get_prefer_tail_policy () & 0x1
4943 || (get_prefer_tail_policy () >> 1 & 0x1));
4946 /* Helper function to get TA operand. */
4947 bool
4948 tail_agnostic_p (rtx_insn *rinsn)
4950 /* If it doesn't have TA, we return agnostic by default. */
4951 extract_insn_cached (rinsn);
4952 int ta = get_attr_ta (rinsn);
4953 return ta == INVALID_ATTRIBUTE ? get_default_ta () : IS_AGNOSTIC (ta);
4956 /* Change the insn and assert that the change always succeeds. */
4957 void
4958 validate_change_or_fail (rtx object, rtx *loc, rtx new_rtx, bool in_group)
4960 bool change_p = validate_change (object, loc, new_rtx, in_group);
4961 gcc_assert (change_p);
4964 /* Return true if it is NONVLMAX AVL TYPE. */
4965 bool
4966 nonvlmax_avl_type_p (rtx_insn *rinsn)
4968 extract_insn_cached (rinsn);
4969 int index = get_attr_avl_type_idx (rinsn);
4970 if (index == INVALID_ATTRIBUTE)
4971 return false;
4972 rtx avl_type = recog_data.operand[index];
4973 return INTVAL (avl_type) == NONVLMAX;
4976 /* Return true if RTX is RVV VLMAX AVL. */
4977 bool
4978 vlmax_avl_p (rtx x)
4980 return x && rtx_equal_p (x, RVV_VLMAX);
4983 /* Helper function to get the SEW operand. We always have a SEW value for
4984 all RVV instructions that have a VTYPE operand. */
4985 uint8_t
4986 get_sew (rtx_insn *rinsn)
4988 return get_attr_sew (rinsn);
4991 /* Helper function to get the VLMUL operand. We always have a VLMUL value for
4992 all RVV instructions that have a VTYPE operand. */
4993 enum vlmul_type
4994 get_vlmul (rtx_insn *rinsn)
4996 return (enum vlmul_type) get_attr_vlmul (rinsn);
4999 /* Count the number of occurrences of REGNO in RINSN. */
5000 int
5001 count_regno_occurrences (rtx_insn *rinsn, unsigned int regno)
5003 int count = 0;
5004 extract_insn (rinsn);
5005 for (int i = 0; i < recog_data.n_operands; i++)
5006 if (refers_to_regno_p (regno, recog_data.operand[i]))
5007 count++;
5008 return count;
5012 /* Return true if OP can be broadcast directly. */
5012 bool
5013 can_be_broadcasted_p (rtx op)
5015 machine_mode mode = GET_MODE (op);
5016 /* We don't allow RA (register allocation) reload to generate
5017 (vec_duplicate:DI reg) on an RV32 system, whereas we do allow
5018 (vec_duplicate:DI mem). */
5019 if (!can_create_pseudo_p () && !FLOAT_MODE_P (mode)
5020 && maybe_gt (GET_MODE_SIZE (mode), GET_MODE_SIZE (Pmode))
5021 && !satisfies_constraint_Wdm (op))
5022 return false;
5024 if (satisfies_constraint_K (op) || register_operand (op, mode)
5025 || satisfies_constraint_Wdm (op) || rtx_equal_p (op, CONST0_RTX (mode)))
5026 return true;
5028 return can_create_pseudo_p () && nonmemory_operand (op, mode);
5031 void
5032 emit_vec_extract (rtx target, rtx src, rtx index)
5034 machine_mode vmode = GET_MODE (src);
5035 machine_mode smode = GET_MODE (target);
5036 class expand_operand ops[3];
5037 enum insn_code icode
5038 = convert_optab_handler (vec_extract_optab, vmode, smode);
5039 gcc_assert (icode != CODE_FOR_nothing);
5040 create_output_operand (&ops[0], target, smode);
5041 ops[0].target = 1;
5042 create_input_operand (&ops[1], src, vmode);
5044 poly_int64 val;
5045 if (poly_int_rtx_p (index, &val))
5046 create_integer_operand (&ops[2], val);
5047 else
5048 create_input_operand (&ops[2], index, Pmode);
5050 expand_insn (icode, 3, ops);
5051 if (ops[0].value != target)
5052 emit_move_insn (target, ops[0].value);
5055 /* Return true if the offset mode is a valid mode that we can use for
5056 gather/scatter auto-vectorization. */
5057 bool
5058 gather_scatter_valid_offset_p (machine_mode mode)
5060 /* If the element size of offset mode is already >= Pmode size,
5061 we don't need any extensions. */
5062 if (known_ge (GET_MODE_SIZE (GET_MODE_INNER (mode)), UNITS_PER_WORD))
5063 return true;
5065 /* Since we will very likely extend the offset mode into a vector of Pmode
5066 elements, disable gather/scatter auto-vectorization if we can't extend
5067 the offset mode into vector Pmode. */
5068 if (!get_vector_mode (Pmode, GET_MODE_NUNITS (mode)).exists ())
5069 return false;
5070 return true;
5073 /* Implement TARGET_ESTIMATED_POLY_VALUE.
5074 Look into the tuning structure for an estimate.
5075 KIND specifies the type of requested estimate: min, max or likely.
5076 For cores with a known VLA width all three estimates are the same.
5077 For generic VLA tuning we want to distinguish the maximum estimate from
5078 the minimum and likely ones.
5079 The likely estimate is the same as the minimum in that case to give
5080 conservative behavior: auto-vectorize with VLA only when it is a win
5081 even for the minimum vector length.
5082 When VLA width information is available VAL.coeffs[1] is multiplied by
5083 the number of VLA chunks over the initial VLS bits. */
5084 HOST_WIDE_INT
5085 estimated_poly_value (poly_int64 val, unsigned int kind)
5087 unsigned int width_source
5088 = BITS_PER_RISCV_VECTOR.is_constant ()
5089 ? (unsigned int) BITS_PER_RISCV_VECTOR.to_constant ()
5090 : (unsigned int) RVV_SCALABLE;
5092 /* If there is no core-specific information then the minimum and likely
5093 values are based on TARGET_MIN_VLEN vectors and the maximum is based on
5094 the architectural maximum of 65536 bits. */
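/* E.g. with TARGET_MIN_VLEN = 128, a poly value of (4, 4) gives a minimum
   and likely estimate of 4 and a maximum estimate of 4 + 4 * 15 = 64 in the
   RVV_SCALABLE case below. */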
5095 unsigned int min_vlen_bytes = TARGET_MIN_VLEN / 8 - 1;
5096 if (width_source == RVV_SCALABLE)
5097 switch (kind)
5099 case POLY_VALUE_MIN:
5100 case POLY_VALUE_LIKELY:
5101 return val.coeffs[0];
5103 case POLY_VALUE_MAX:
5104 return val.coeffs[0] + val.coeffs[1] * min_vlen_bytes;
5107 /* Allow BITS_PER_RISCV_VECTOR to be a bitmask of different VL, treating the
5108 lowest as likely. This could be made more general if future -mtune
5109 options need it to be. */
5110 if (kind == POLY_VALUE_MAX)
5111 width_source = 1 << floor_log2 (width_source);
5112 else
5113 width_source = least_bit_hwi (width_source);
5115 /* If the core provides width information, use that. */
5116 HOST_WIDE_INT over_min_vlen = width_source - TARGET_MIN_VLEN;
5117 return val.coeffs[0] + val.coeffs[1] * over_min_vlen / TARGET_MIN_VLEN;
5120 } // namespace riscv_vector