1 /* Subroutines used for code generation for RISC-V 'V' Extension for
3 Copyright (C) 2022-2024 Free Software Foundation, Inc.
4 Contributed by Juzhe Zhong (juzhe.zhong@rivai.ai), RiVAI Technologies Ltd.
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it
9 under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 3, or (at your option)
13 GCC is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #define IN_TARGET_CODE 1
24 /* We have a maximum of 11 operands for RVV instruction patterns according to
26 #define RVV_INSN_OPERANDS_MAX 11
30 #include "coretypes.h"
34 #include "insn-config.h"
35 #include "insn-attr.h"
39 #include "stringpool.h"
46 #include "targhooks.h"
49 #include "tm-constrs.h"
50 #include "rtx-vector-builder.h"
51 #include "targhooks.h"
56 using namespace riscv_vector
;
58 namespace riscv_vector
{
60 /* Return true if NUNITS <=31 so that we can use immediate AVL in vsetivli. */
62 imm_avl_p (machine_mode mode
)
64 poly_uint64 nunits
= GET_MODE_NUNITS (mode
);
66 return nunits
.is_constant ()
67 /* The vsetivli can only hold register 0~31. */
68 ? (IN_RANGE (nunits
.to_constant (), 0, 31))
69 /* Only allowed in VLS-VLMAX mode. */
73 /* Return true if LEN is equal to NUNITS that out of the range [0, 31]. */
75 is_vlmax_len_p (machine_mode mode
, rtx len
)
78 return poly_int_rtx_p (len
, &value
)
79 && known_eq (value
, GET_MODE_NUNITS (mode
));
82 /* Helper functions for insn_flags && insn_types */
84 /* Return true if caller need pass mask operand for insn pattern with
88 need_mask_operand_p (unsigned insn_flags
)
90 return (insn_flags
& HAS_MASK_P
)
91 && !(insn_flags
& (USE_ONE_TRUE_MASK_P
| USE_ALL_TRUES_MASK_P
));
94 template <int MAX_OPERANDS
> class insn_expander
97 insn_expander () = delete;
99 insn_expander (unsigned insn_flags
, bool vlmax_p
)
100 : m_insn_flags (insn_flags
), m_opno (0), m_vlmax_p (vlmax_p
),
106 void check_insn_flags () const
108 if (m_insn_flags
& USE_ONE_TRUE_MASK_P
)
109 /* USE_ONE_TRUE_MASK_P is dependent on HAS_MASK_P. */
110 gcc_assert ((m_insn_flags
& HAS_MASK_P
));
112 if (m_insn_flags
& USE_ALL_TRUES_MASK_P
)
113 /* USE_ALL_TRUES_MASK_P is dependent on HAS_MASK_P. */
114 gcc_assert ((m_insn_flags
& HAS_MASK_P
));
116 /* USE_ONE_TRUE_MASK_P and USE_ALL_TRUES_MASK_P are mutually exclusive. */
117 gcc_assert (!((m_insn_flags
& USE_ONE_TRUE_MASK_P
)
118 && (m_insn_flags
& USE_ALL_TRUES_MASK_P
)));
120 if (m_insn_flags
& USE_VUNDEF_MERGE_P
)
121 /* USE_VUNDEF_MERGE_P is dependent on HAS_MERGE_P. */
122 gcc_assert ((m_insn_flags
& HAS_MERGE_P
));
124 /* TU_POLICY_P and TDEFAULT_POLICY_P are mutually exclusive. */
126 !((m_insn_flags
& TU_POLICY_P
) && (m_insn_flags
& TDEFAULT_POLICY_P
)));
128 /* MU_POLICY_P and MDEFAULT_POLICY_P are mutually exclusive. */
130 !((m_insn_flags
& MU_POLICY_P
) && (m_insn_flags
& MDEFAULT_POLICY_P
)));
132 /* NULLARY_OP_P, UNARY_OP_P, BINARY_OP_P, TERNARY_OP_P are mutually
135 !((m_insn_flags
& NULLARY_OP_P
)
136 && ((m_insn_flags
& UNARY_OP_P
) || (m_insn_flags
& BINARY_OP_P
)
137 || (m_insn_flags
& TERNARY_OP_P
))));
139 !((m_insn_flags
& UNARY_OP_P
)
140 && ((m_insn_flags
& NULLARY_OP_P
) || (m_insn_flags
& BINARY_OP_P
)
141 || (m_insn_flags
& TERNARY_OP_P
))));
143 !((m_insn_flags
& BINARY_OP_P
)
144 && ((m_insn_flags
& NULLARY_OP_P
) || (m_insn_flags
& UNARY_OP_P
)
145 || (m_insn_flags
& TERNARY_OP_P
))));
147 !((m_insn_flags
& TERNARY_OP_P
)
148 && ((m_insn_flags
& NULLARY_OP_P
) || (m_insn_flags
& UNARY_OP_P
)
149 || (m_insn_flags
& BINARY_OP_P
))));
152 void set_vl (rtx vl
) { m_vl_op
= vl
; }
154 void add_output_operand (rtx x
, machine_mode mode
)
156 create_output_operand (&m_ops
[m_opno
++], x
, mode
);
157 gcc_assert (m_opno
<= MAX_OPERANDS
);
159 void add_input_operand (rtx x
, machine_mode mode
)
161 create_input_operand (&m_ops
[m_opno
++], x
, mode
);
162 gcc_assert (m_opno
<= MAX_OPERANDS
);
164 void add_all_one_mask_operand (machine_mode mask_mode
)
166 add_input_operand (CONSTM1_RTX (mask_mode
), mask_mode
);
168 void add_first_one_true_mask_operand (machine_mode mask_mode
)
170 add_input_operand (gen_scalar_move_mask (mask_mode
), mask_mode
);
172 void add_vundef_operand (machine_mode dest_mode
)
174 add_input_operand (RVV_VUNDEF (dest_mode
), dest_mode
);
176 void add_policy_operand ()
178 if (m_insn_flags
& TU_POLICY_P
)
180 rtx tail_policy_rtx
= gen_int_mode (TAIL_UNDISTURBED
, Pmode
);
181 add_input_operand (tail_policy_rtx
, Pmode
);
183 else if (m_insn_flags
& TDEFAULT_POLICY_P
)
185 rtx tail_policy_rtx
= gen_int_mode (get_prefer_tail_policy (), Pmode
);
186 add_input_operand (tail_policy_rtx
, Pmode
);
189 if (m_insn_flags
& MU_POLICY_P
)
191 rtx mask_policy_rtx
= gen_int_mode (MASK_UNDISTURBED
, Pmode
);
192 add_input_operand (mask_policy_rtx
, Pmode
);
194 else if (m_insn_flags
& MDEFAULT_POLICY_P
)
196 rtx mask_policy_rtx
= gen_int_mode (get_prefer_mask_policy (), Pmode
);
197 add_input_operand (mask_policy_rtx
, Pmode
);
200 void add_avl_type_operand (avl_type type
)
202 add_input_operand (gen_int_mode (type
, Pmode
), Pmode
);
206 add_rounding_mode_operand (enum floating_point_rounding_mode rounding_mode
)
208 rtx frm_rtx
= gen_int_mode (rounding_mode
, Pmode
);
209 add_input_operand (frm_rtx
, Pmode
);
213 add_rounding_mode_operand (enum fixed_point_rounding_mode rounding_mode
)
215 rtx frm_rtx
= gen_int_mode (rounding_mode
, Pmode
);
216 add_input_operand (frm_rtx
, Pmode
);
219 /* Return the vtype mode based on insn_flags.
220 vtype mode mean the mode vsetvl insn set. */
222 get_vtype_mode (rtx
*ops
)
224 machine_mode vtype_mode
;
225 if (m_insn_flags
& VTYPE_MODE_FROM_OP1_P
)
226 vtype_mode
= GET_MODE (ops
[1]);
228 vtype_mode
= GET_MODE (ops
[0]);
232 void emit_insn (enum insn_code icode
, rtx
*ops
)
236 /* It's true if any operand is memory operand. */
237 bool any_mem_p
= false;
239 machine_mode vtype_mode
= get_vtype_mode (ops
);
240 machine_mode mask_mode
= get_mask_mode (vtype_mode
);
242 /* Add dest operand. */
243 if (m_insn_flags
& HAS_DEST_P
)
245 rtx op
= ops
[opno
++];
246 any_mem_p
|= MEM_P (op
);
247 add_output_operand (op
, GET_MODE (op
));
250 /* Add mask operand. */
251 if (m_insn_flags
& USE_ONE_TRUE_MASK_P
)
252 add_first_one_true_mask_operand (mask_mode
);
253 else if (m_insn_flags
& USE_ALL_TRUES_MASK_P
)
254 add_all_one_mask_operand (mask_mode
);
255 else if (m_insn_flags
& HAS_MASK_P
)
257 machine_mode mode
= insn_data
[(int) icode
].operand
[m_opno
].mode
;
258 gcc_assert (mode
!= VOIDmode
);
259 add_input_operand (ops
[opno
++], mode
);
262 /* Add merge operand. */
263 if (m_insn_flags
& USE_VUNDEF_MERGE_P
)
264 /* Same as dest operand. */
265 add_vundef_operand (GET_MODE (ops
[0]));
266 else if (m_insn_flags
& HAS_MERGE_P
)
268 machine_mode mode
= insn_data
[(int) icode
].operand
[m_opno
].mode
;
269 gcc_assert (mode
!= VOIDmode
);
270 add_input_operand (ops
[opno
++], mode
);
273 if (m_insn_flags
& NULLARY_OP_P
)
275 else if (m_insn_flags
& UNARY_OP_P
)
277 else if (m_insn_flags
& BINARY_OP_P
)
279 else if (m_insn_flags
& TERNARY_OP_P
)
284 /* Add the remain operands. */
285 for (; num_ops
; num_ops
--, opno
++)
287 any_mem_p
|= MEM_P (ops
[opno
]);
288 machine_mode mode
= insn_data
[(int) icode
].operand
[m_opno
].mode
;
289 /* 'create_input_operand doesn't allow VOIDmode.
290 According to vector.md, we may have some patterns that do not have
291 explicit machine mode specifying the operand. Such operands are
293 if (mode
== VOIDmode
)
296 /* Early assertion ensures same mode since maybe_legitimize_operand
298 machine_mode required_mode
= GET_MODE (ops
[opno
]);
299 if (required_mode
!= VOIDmode
&& required_mode
!= mode
)
300 internal_error ("expected mode %s for operand %d of "
301 "insn %s but got mode %s.\n",
302 GET_MODE_NAME (mode
),
304 insn_data
[(int) icode
].name
,
305 GET_MODE_NAME (required_mode
));
307 add_input_operand (ops
[opno
], mode
);
310 /* Add vl operand. */
315 if (riscv_v_ext_vls_mode_p (vtype_mode
))
317 /* VLS modes always set VSETVL by
318 "vsetvl zero, rs1/imm". */
319 poly_uint64 nunits
= GET_MODE_NUNITS (vtype_mode
);
320 len
= gen_int_mode (nunits
, Pmode
);
323 else if (can_create_pseudo_p ())
325 len
= gen_reg_rtx (Pmode
);
326 emit_vlmax_vsetvl (vtype_mode
, len
);
330 gcc_assert (len
!= NULL_RTX
);
331 add_input_operand (len
, Pmode
);
333 /* Add tail and mask policy operands. */
334 add_policy_operand ();
336 /* Add avl_type operand. */
337 add_avl_type_operand (
338 vls_p
? avl_type::VLS
339 : (m_vlmax_p
? avl_type::VLMAX
: avl_type::NONVLMAX
));
341 /* Add rounding mode operand. */
342 if (m_insn_flags
& FRM_DYN_P
)
343 add_rounding_mode_operand (FRM_DYN
);
344 else if (m_insn_flags
& FRM_RUP_P
)
345 add_rounding_mode_operand (FRM_RUP
);
346 else if (m_insn_flags
& FRM_RDN_P
)
347 add_rounding_mode_operand (FRM_RDN
);
348 else if (m_insn_flags
& FRM_RMM_P
)
349 add_rounding_mode_operand (FRM_RMM
);
350 else if (m_insn_flags
& FRM_RNE_P
)
351 add_rounding_mode_operand (FRM_RNE
);
352 else if (m_insn_flags
& VXRM_RNU_P
)
353 add_rounding_mode_operand (VXRM_RNU
);
354 else if (m_insn_flags
& VXRM_RDN_P
)
355 add_rounding_mode_operand (VXRM_RDN
);
358 if (insn_data
[(int) icode
].n_operands
!= m_opno
)
359 internal_error ("invalid number of operands for insn %s, "
360 "expected %d but got %d.\n",
361 insn_data
[(int) icode
].name
,
362 insn_data
[(int) icode
].n_operands
, m_opno
);
364 expand (icode
, any_mem_p
);
367 void expand (enum insn_code icode
, bool temporary_volatile_p
= false)
369 if (temporary_volatile_p
)
371 temporary_volatile_ok
v (true);
372 expand_insn (icode
, m_opno
, m_ops
);
375 expand_insn (icode
, m_opno
, m_ops
);
379 unsigned m_insn_flags
;
383 expand_operand m_ops
[MAX_OPERANDS
];
386 /* Emit an RVV insn with a vector length that equals the number of units of the
387 vector mode. For VLA modes this corresponds to VLMAX.
389 Unless the vector length can be encoded in the vsetivl[i] instruction this
390 function must only be used as long as we can create pseudo registers. This is
391 because it will set a pseudo register to VLMAX using vsetvl and use this as
392 definition for the vector length. */
394 emit_vlmax_insn (unsigned icode
, unsigned insn_flags
, rtx
*ops
)
396 insn_expander
<RVV_INSN_OPERANDS_MAX
> e (insn_flags
, true);
397 gcc_assert (can_create_pseudo_p () || imm_avl_p (e
.get_vtype_mode (ops
)));
399 e
.emit_insn ((enum insn_code
) icode
, ops
);
402 /* Like emit_vlmax_insn but must only be used when we cannot create pseudo
403 registers anymore. This function, however, takes a predefined vector length
404 from the value in VL. */
406 emit_vlmax_insn_lra (unsigned icode
, unsigned insn_flags
, rtx
*ops
, rtx vl
)
408 gcc_assert (!can_create_pseudo_p ());
409 machine_mode mode
= GET_MODE (ops
[0]);
411 if (imm_avl_p (mode
))
413 /* Even though VL is a real hardreg already allocated since
414 it is post-RA now, we still gain benefits that we emit
415 vsetivli zero, imm instead of vsetvli VL, zero which is
416 we can be more flexible in post-RA instruction scheduling. */
417 insn_expander
<RVV_INSN_OPERANDS_MAX
> e (insn_flags
, false);
418 e
.set_vl (gen_int_mode (GET_MODE_NUNITS (mode
), Pmode
));
419 e
.emit_insn ((enum insn_code
) icode
, ops
);
423 insn_expander
<RVV_INSN_OPERANDS_MAX
> e (insn_flags
, true);
425 e
.emit_insn ((enum insn_code
) icode
, ops
);
429 /* Emit an RVV insn with a predefined vector length. Contrary to
430 emit_vlmax_insn the instruction's vector length is not deduced from its mode
431 but taken from the value in VL. */
433 emit_nonvlmax_insn (unsigned icode
, unsigned insn_flags
, rtx
*ops
, rtx vl
)
435 insn_expander
<RVV_INSN_OPERANDS_MAX
> e (insn_flags
, false);
437 e
.emit_insn ((enum insn_code
) icode
, ops
);
440 /* Return true if the vector duplicated by a super element which is the fusion
441 of consecutive elements.
443 v = { a, b, a, b } super element = ab, v = { ab, ab } */
445 rvv_builder::can_duplicate_repeating_sequence_p ()
447 poly_uint64 new_size
= exact_div (full_nelts (), npatterns ());
448 unsigned int new_inner_size
= m_inner_bits_size
* npatterns ();
449 if (m_inner_mode
== Pmode
450 || !int_mode_for_size (new_inner_size
, 0).exists (&m_new_inner_mode
)
451 || GET_MODE_SIZE (m_new_inner_mode
) > UNITS_PER_WORD
452 || !get_vector_mode (m_new_inner_mode
, new_size
).exists (&m_new_mode
))
454 return repeating_sequence_p (0, encoded_nelts (), npatterns ());
457 /* Return true if the vector is a simple sequence with one pattern and all
458 elements the same. */
460 rvv_builder::is_repeating_sequence ()
462 if (npatterns () > 1)
464 return repeating_sequence_p (0, encoded_nelts (), 1);
467 /* Return true if it is a repeating sequence that using
468 merge approach has better codegen than using default
469 approach (slide1down).
472 {a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b}
477 for merging a we need mask 101010....
478 for merging b we need mask 010101....
480 Foreach element in the npattern, we need to build a mask in scalar register.
481 Mostly we need 3 instructions (aka COST = 3), which consists of 2 scalar
482 instructions and 1 scalar move to v0 register. Finally we need vector merge
488 vmerge.vxm v9, v9, a1, v0
490 So the overall (roughly) COST of Sequence A = (3 + 1) * npatterns = 8.
491 If we use slide1down, the COST = nelts = 16 > 8 (COST of merge).
492 So return true in this case as it is profitable.
495 {a, b, c, d, e, f, g, h, a, b, c, d, e, f, g, h}
500 COST of merge approach = (3 + 1) * npatterns = 24
501 COST of slide1down approach = nelts = 16
502 Return false in this case as it is NOT profitable in merge approach.
505 rvv_builder::repeating_sequence_use_merge_profitable_p ()
507 if (inner_bytes_size () > UNITS_PER_WORD
)
510 unsigned int nelts
= full_nelts ().to_constant ();
512 if (!repeating_sequence_p (0, encoded_nelts (), npatterns ()))
515 unsigned int merge_cost
= 1;
516 unsigned int build_merge_mask_cost
= 3;
517 unsigned int slide1down_cost
= nelts
;
519 return (build_merge_mask_cost
+ merge_cost
) * npatterns () < slide1down_cost
;
522 /* Return true if it's worthwhile to use slideup combine 2 vectors. */
524 rvv_builder::combine_sequence_use_slideup_profitable_p ()
526 int nelts
= full_nelts ().to_constant ();
527 int leading_ndups
= this->count_dups (0, nelts
- 1, 1);
528 int trailing_ndups
= this->count_dups (nelts
- 1, -1, -1);
530 /* ??? Current heuristic we do is we do combine 2 vectors
532 1. # of leading same elements is equal to # of trailing same elements.
533 2. Both of above are equal to nelts / 2.
534 Otherwise, it is not profitable. */
535 return leading_ndups
== trailing_ndups
&& trailing_ndups
== nelts
/ 2;
538 /* Return true if it's worthwhile to use merge combine vector with a scalar. */
540 rvv_builder::combine_sequence_use_merge_profitable_p ()
542 int nelts
= full_nelts ().to_constant ();
543 int leading_ndups
= this->count_dups (0, nelts
- 1, 1);
544 int trailing_ndups
= this->count_dups (nelts
- 1, -1, -1);
545 int nregs
= riscv_get_v_regno_alignment (int_mode ());
547 if (leading_ndups
+ trailing_ndups
!= nelts
)
550 /* Leading elements num > 255 which exceeds the maximum value
551 of QImode, we will need to use HImode. */
553 if (leading_ndups
> 255 || nregs
> 2)
555 if (!get_vector_mode (HImode
, nelts
).exists (&mode
))
557 /* We will need one more AVL/VL toggling vsetvl instruction. */
558 return leading_ndups
> 4 && trailing_ndups
> 4;
561 /* { a, a, a, b, b, ... , b } and { b, b, b, a, a, ... , a }
562 consume 3 slide instructions. */
563 return leading_ndups
> 3 && trailing_ndups
> 3;
566 /* Merge the repeating sequence into a single element and return the RTX. */
568 rvv_builder::get_merged_repeating_sequence ()
570 scalar_int_mode mode
= Pmode
;
571 rtx target
= gen_reg_rtx (mode
);
572 emit_move_insn (target
, const0_rtx
);
573 rtx imm
= gen_int_mode ((1ULL << m_inner_bits_size
) - 1, mode
);
574 /* { a, b, a, b }: Generate duplicate element = b << bits | a. */
575 for (unsigned int i
= 0; i
< npatterns (); i
++)
577 unsigned int loc
= m_inner_bits_size
* i
;
578 rtx shift
= gen_int_mode (loc
, mode
);
579 rtx ele
= gen_lowpart (mode
, elt (i
));
580 rtx tmp
= expand_simple_binop (mode
, AND
, ele
, imm
, NULL_RTX
, false,
582 rtx tmp2
= expand_simple_binop (mode
, ASHIFT
, tmp
, shift
, NULL_RTX
, false,
584 rtx tmp3
= expand_simple_binop (mode
, IOR
, tmp2
, target
, NULL_RTX
, false,
586 emit_move_insn (target
, tmp3
);
588 if (GET_MODE_SIZE (m_new_inner_mode
) < UNITS_PER_WORD
)
589 return gen_lowpart (m_new_inner_mode
, target
);
593 /* Get the mask for merge approach.
595 Consider such following case:
596 {a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b}
597 To merge "a", the mask should be 1010....
598 To merge "b", the mask should be 0101....
601 rvv_builder::get_merge_scalar_mask (unsigned int index_in_pattern
,
602 machine_mode inner_mode
) const
604 unsigned HOST_WIDE_INT mask
= 0;
605 unsigned HOST_WIDE_INT base_mask
= (1ULL << index_in_pattern
);
606 /* Here we construct a mask pattern that will later be broadcast
607 to a vector register. The maximum broadcast size for vmv.v.x/vmv.s.x
608 is determined by the length of a vector element (ELEN) and not by
609 XLEN so make sure we do not exceed it. One example is -march=zve32*
610 which mandates ELEN == 32 but can be combined with -march=rv64
612 unsigned int elen
= TARGET_VECTOR_ELEN_64
? 64 : 32;
614 gcc_assert (elen
% npatterns () == 0);
616 int limit
= elen
/ npatterns ();
618 for (int i
= 0; i
< limit
; i
++)
619 mask
|= base_mask
<< (i
* npatterns ());
621 return gen_int_mode (mask
, inner_mode
);
624 /* Return true if the variable-length vector is single step.
625 Single step means step all patterns in NPATTERNS are equal.
626 Consider this following case:
628 CASE 1: NPATTERNS = 2, NELTS_PER_PATTERN = 3.
629 { 0, 2, 2, 4, 4, 6, ... }
630 First pattern: step1 = 2 - 0 = 2
632 Second pattern: step1 = 4 - 2 = 2
634 Since all steps of NPATTERNS are equal step = 2.
635 Return true in this case.
637 CASE 2: NPATTERNS = 2, NELTS_PER_PATTERN = 3.
638 { 0, 1, 2, 4, 4, 7, ... }
639 First pattern: step1 = 2 - 0 = 2
641 Second pattern: step1 = 4 - 1 = 3
643 Since not all steps are equal, return false. */
645 rvv_builder::single_step_npatterns_p () const
647 if (nelts_per_pattern () != 3)
651 = rtx_to_poly_int64 (elt (npatterns ())) - rtx_to_poly_int64 (elt (0));
652 for (unsigned int i
= 0; i
< npatterns (); i
++)
654 poly_int64 ele0
= rtx_to_poly_int64 (elt (i
));
655 poly_int64 ele1
= rtx_to_poly_int64 (elt (npatterns () + i
));
656 poly_int64 ele2
= rtx_to_poly_int64 (elt (npatterns () * 2 + i
));
657 poly_int64 diff1
= ele1
- ele0
;
658 poly_int64 diff2
= ele2
- ele1
;
659 if (maybe_ne (step
, diff1
) || maybe_ne (step
, diff2
))
665 /* Return true if the diff between const vector and vid sequence
666 is repeated. For example as below cases:
667 The diff means the const vector - vid.
669 CONST VECTOR: {3, 2, 1, 0, 7, 6, 5, 4, ... }
670 VID : {0, 1, 2, 3, 4, 5, 6, 7, ... }
671 DIFF(MINUS) : {3, 1,-1,-3, 3, 1,-1,-3, ... }
672 The diff sequence {3, 1,-1,-3} is repeated in the npattern and
673 return TRUE for case 1.
676 CONST VECTOR: {-4, 4,-3, 5,-2, 6,-1, 7, ...}
677 VID : { 0, 1, 2, 3, 4, 5, 6, 7, ... }
678 DIFF(MINUS) : {-4, 3,-5,-2,-6, 1,-7, 0, ... }
679 The diff sequence {-4, 3} is not repeated in the npattern and
680 return FALSE for case 2. */
682 rvv_builder::npatterns_vid_diff_repeated_p () const
684 if (nelts_per_pattern () != 3)
686 else if (npatterns () == 0)
689 for (unsigned i
= 0; i
< npatterns (); i
++)
691 poly_int64 diff_0
= rtx_to_poly_int64 (elt (i
)) - i
;
693 = rtx_to_poly_int64 (elt (npatterns () + i
)) - npatterns () - i
;
695 if (maybe_ne (diff_0
, diff_1
))
702 /* Return true if the permutation consists of two
703 interleaved patterns with a constant step each.
704 TODO: We currently only support NPATTERNS = 2. */
706 rvv_builder::interleaved_stepped_npatterns_p () const
708 if (npatterns () != 2 || nelts_per_pattern () != 3)
710 for (unsigned int i
= 0; i
< npatterns (); i
++)
712 poly_int64 ele0
= rtx_to_poly_int64 (elt (i
));
713 poly_int64 ele1
= rtx_to_poly_int64 (elt (npatterns () + i
));
714 poly_int64 ele2
= rtx_to_poly_int64 (elt (npatterns () * 2 + i
));
715 poly_int64 diff1
= ele1
- ele0
;
716 poly_int64 diff2
= ele2
- ele1
;
717 if (maybe_ne (diff1
, diff2
))
723 /* Return true if all elements of NPATTERNS are equal.
726 { 2, 2, 2, 2, 4, 4, 4, 4, 8, 8, 8, 8, 16, 16, 16, 16, ... }
728 { 2, 2, 2, 2, 2, 2, 2, 2, 8, 8, 8, 8, 8, 8, 8, 8, ... }
729 We only check ele[0] ~ ele[NPATTERNS - 1] whether they are the same.
730 We don't need to check the elements[n] with n >= NPATTERNS since
731 they don't belong to the same pattern.
734 rvv_builder::npatterns_all_equal_p () const
736 poly_int64 ele0
= rtx_to_poly_int64 (elt (0));
737 for (unsigned int i
= 1; i
< npatterns (); i
++)
739 poly_int64 ele
= rtx_to_poly_int64 (elt (i
));
740 if (!known_eq (ele
, ele0
))
747 get_sew (machine_mode mode
)
749 unsigned int sew
= GET_MODE_CLASS (mode
) == MODE_VECTOR_BOOL
751 : GET_MODE_BITSIZE (GET_MODE_INNER (mode
));
755 /* Return true if X is a const_vector with all duplicate elements, which is in
756 the range between MINVAL and MAXVAL. */
758 const_vec_all_same_in_range_p (rtx x
, HOST_WIDE_INT minval
,
759 HOST_WIDE_INT maxval
)
762 return (const_vec_duplicate_p (x
, &elt
) && CONST_INT_P (elt
)
763 && IN_RANGE (INTVAL (elt
), minval
, maxval
));
766 /* Return true if VEC is a constant in which every element is in the range
767 [MINVAL, MAXVAL]. The elements do not need to have the same value.
769 This function also exists in aarch64, we may unify it in middle-end in the
773 const_vec_all_in_range_p (rtx vec
, poly_int64 minval
, poly_int64 maxval
)
775 if (!CONST_VECTOR_P (vec
)
776 || GET_MODE_CLASS (GET_MODE (vec
)) != MODE_VECTOR_INT
)
780 if (!CONST_VECTOR_STEPPED_P (vec
))
781 nunits
= const_vector_encoded_nelts (vec
);
782 else if (!CONST_VECTOR_NUNITS (vec
).is_constant (&nunits
))
785 for (int i
= 0; i
< nunits
; i
++)
787 rtx vec_elem
= CONST_VECTOR_ELT (vec
, i
);
789 if (!poly_int_rtx_p (vec_elem
, &value
)
790 || maybe_lt (value
, minval
)
791 || maybe_gt (value
, maxval
))
797 /* Returns true if the vector's elements are all duplicates in
798 range -16 ~ 15 integer or 0.0 floating-point. */
801 valid_vec_immediate_p (rtx x
)
803 return (satisfies_constraint_vi (x
) || satisfies_constraint_Wc0 (x
));
806 /* Return a const vector of VAL. The VAL can be either const_int or
810 gen_const_vector_dup (machine_mode mode
, poly_int64 val
)
812 scalar_mode smode
= GET_MODE_INNER (mode
);
813 rtx c
= gen_int_mode (val
, smode
);
814 if (!val
.is_constant () && GET_MODE_SIZE (smode
) > GET_MODE_SIZE (Pmode
))
816 /* When VAL is const_poly_int value, we need to explicitly broadcast
817 it into a vector using RVV broadcast instruction. */
818 return expand_vector_broadcast (mode
, c
);
820 return gen_const_vec_duplicate (mode
, c
);
823 /* Emit a vlmax vsetvl instruction. This should only be used when
824 optimization is disabled or after vsetvl insertion pass. */
826 emit_hard_vlmax_vsetvl (machine_mode vmode
, rtx vl
)
828 unsigned int sew
= get_sew (vmode
);
829 emit_insn (gen_vsetvl (Pmode
, vl
, RVV_VLMAX
, gen_int_mode (sew
, Pmode
),
830 gen_int_mode (get_vlmul (vmode
), Pmode
), const0_rtx
,
835 emit_vlmax_vsetvl (machine_mode vmode
, rtx vl
)
837 unsigned int sew
= get_sew (vmode
);
838 enum vlmul_type vlmul
= get_vlmul (vmode
);
839 unsigned int ratio
= calculate_ratio (sew
, vlmul
);
842 emit_hard_vlmax_vsetvl (vmode
, vl
);
844 emit_insn (gen_vlmax_avl (Pmode
, vl
, gen_int_mode (ratio
, Pmode
)));
847 /* Calculate SEW/LMUL ratio. */
849 calculate_ratio (unsigned int sew
, enum vlmul_type vlmul
)
881 /* SCALABLE means that the vector-length is agnostic (run-time invariant and
882 compile-time unknown). ZVL means that the vector-length is specific
883 (compile-time known by march like zvl*b). Both SCALABLE and ZVL are doing
884 auto-vectorization using VLMAX vsetvl configuration. */
886 autovec_use_vlmax_p (void)
888 return rvv_vector_bits
== RVV_VECTOR_BITS_SCALABLE
889 || rvv_vector_bits
== RVV_VECTOR_BITS_ZVL
;
892 /* This function emits VLMAX vrgather instruction. Emit vrgather.vx/vi when sel
893 is a const duplicate vector. Otherwise, emit vrgather.vv. */
895 emit_vlmax_gather_insn (rtx target
, rtx op
, rtx sel
)
899 machine_mode data_mode
= GET_MODE (target
);
900 machine_mode sel_mode
= GET_MODE (sel
);
901 if (const_vec_duplicate_p (sel
, &elt
))
903 icode
= code_for_pred_gather_scalar (data_mode
);
906 else if (maybe_ne (GET_MODE_SIZE (data_mode
), GET_MODE_SIZE (sel_mode
)))
907 icode
= code_for_pred_gatherei16 (data_mode
);
909 icode
= code_for_pred_gather (data_mode
);
910 rtx ops
[] = {target
, op
, sel
};
911 emit_vlmax_insn (icode
, BINARY_OP
, ops
);
915 emit_vlmax_masked_gather_mu_insn (rtx target
, rtx op
, rtx sel
, rtx mask
)
919 machine_mode data_mode
= GET_MODE (target
);
920 machine_mode sel_mode
= GET_MODE (sel
);
921 if (const_vec_duplicate_p (sel
, &elt
))
923 icode
= code_for_pred_gather_scalar (data_mode
);
926 else if (maybe_ne (GET_MODE_SIZE (data_mode
), GET_MODE_SIZE (sel_mode
)))
927 icode
= code_for_pred_gatherei16 (data_mode
);
929 icode
= code_for_pred_gather (data_mode
);
930 rtx ops
[] = {target
, mask
, target
, op
, sel
};
931 emit_vlmax_insn (icode
, BINARY_OP_TAMU
, ops
);
934 /* According to RVV ISA spec (16.5.1. Synthesizing vdecompress):
935 https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc
937 There is no inverse vdecompress provided, as this operation can be readily
938 synthesized using iota and a masked vrgather:
940 Desired functionality of 'vdecompress'
941 7 6 5 4 3 2 1 0 # vid
943 e d c b a # packed vector of 5 elements
944 1 0 0 1 1 1 0 1 # mask vector of 8 elements
945 p q r s t u v w # destination register before vdecompress
947 e q r d c b v a # result of vdecompress
949 # v1 holds packed data
950 # v11 holds input expanded vector and result
951 viota.m v10, v0 # Calc iota from mask in v0
952 vrgather.vv v11, v1, v10, v0.t # Expand into destination
953 p q r s t u v w # v11 destination register
954 e d c b a # v1 source vector
955 1 0 0 1 1 1 0 1 # v0 mask vector
957 4 4 4 3 2 1 1 0 # v10 result of viota.m
958 e q r d c b v a # v11 destination after vrgather using viota.m under mask
961 emit_vlmax_decompress_insn (rtx target
, rtx op0
, rtx op1
, rtx mask
)
963 machine_mode data_mode
= GET_MODE (target
);
964 machine_mode sel_mode
= related_int_vector_mode (data_mode
).require ();
965 if (GET_MODE_INNER (data_mode
) == QImode
)
966 sel_mode
= get_vector_mode (HImode
, GET_MODE_NUNITS (data_mode
)).require ();
968 rtx sel
= gen_reg_rtx (sel_mode
);
969 rtx iota_ops
[] = {sel
, mask
};
970 emit_vlmax_insn (code_for_pred_iota (sel_mode
), UNARY_OP
, iota_ops
);
971 emit_vlmax_gather_insn (target
, op0
, sel
);
972 emit_vlmax_masked_gather_mu_insn (target
, op1
, sel
, mask
);
975 /* Emit merge instruction. */
978 get_repeating_sequence_dup_machine_mode (const rvv_builder
&builder
,
979 machine_mode mask_bit_mode
)
981 unsigned mask_precision
= GET_MODE_PRECISION (mask_bit_mode
).to_constant ();
982 unsigned mask_scalar_size
= mask_precision
> builder
.inner_bits_size ()
983 ? builder
.inner_bits_size () : mask_precision
;
985 scalar_mode inner_mode
;
986 unsigned minimal_bits_size
;
988 switch (mask_scalar_size
)
992 minimal_bits_size
= TARGET_MIN_VLEN
/ 8; /* AKA RVVMF8. */
996 minimal_bits_size
= TARGET_MIN_VLEN
/ 4; /* AKA RVVMF4. */
1000 minimal_bits_size
= TARGET_MIN_VLEN
/ 2; /* AKA RVVMF2. */
1003 inner_mode
= DImode
;
1004 minimal_bits_size
= TARGET_MIN_VLEN
/ 1; /* AKA RVVM1. */
1011 gcc_assert (mask_precision
% mask_scalar_size
== 0);
1013 uint64_t dup_nunit
= mask_precision
> mask_scalar_size
1014 ? mask_precision
/ mask_scalar_size
: minimal_bits_size
/ mask_scalar_size
;
1016 return get_vector_mode (inner_mode
, dup_nunit
).require ();
1019 /* Expand series const vector. If VID is NULL_RTX, we use vid.v
1020 instructions to generate sequence for VID:
1022 VID = { 0, 1, 2, 3, ... }
1024 Otherwise, we use the VID argument directly. */
1027 expand_vec_series (rtx dest
, rtx base
, rtx step
, rtx vid
)
1029 machine_mode mode
= GET_MODE (dest
);
1030 poly_int64 nunits_m1
= GET_MODE_NUNITS (mode
) - 1;
1032 rtx result
= register_operand (dest
, mode
) ? dest
: gen_reg_rtx (mode
);
1034 /* VECT_IV = BASE + I * STEP. */
1036 /* Step 1: Generate I = { 0, 1, 2, ... } by vid.v. */
1037 bool reverse_p
= !vid
&& rtx_equal_p (step
, constm1_rtx
)
1038 && poly_int_rtx_p (base
, &value
)
1039 && known_eq (nunits_m1
, value
);
1042 vid
= gen_reg_rtx (mode
);
1044 emit_vlmax_insn (code_for_pred_series (mode
), NULLARY_OP
, op
);
1051 {nunits - 1, nunits - 2, ... , 0}.
1052 nunits can be either const_int or const_poly_int.
1056 vrsub nunits - 1, v. */
1058 = {result
, vid
, gen_int_mode (nunits_m1
, GET_MODE_INNER (mode
))};
1059 insn_code icode
= code_for_pred_sub_reverse_scalar (mode
);
1060 emit_vlmax_insn (icode
, BINARY_OP
, ops
);
1064 /* Step 2: Generate I * STEP.
1065 - STEP is 1, we don't emit any instructions.
1066 - STEP is power of 2, we use vsll.vi/vsll.vx.
1067 - STEP is non-power of 2, we use vmul.vx. */
1068 if (rtx_equal_p (step
, const1_rtx
))
1072 step_adj
= gen_reg_rtx (mode
);
1073 if (CONST_INT_P (step
) && pow2p_hwi (INTVAL (step
)))
1075 /* Emit logical left shift operation. */
1076 int shift
= exact_log2 (INTVAL (step
));
1077 rtx shift_amount
= gen_int_mode (shift
, Pmode
);
1078 insn_code icode
= code_for_pred_scalar (ASHIFT
, mode
);
1079 rtx ops
[] = {step_adj
, vid
, shift_amount
};
1080 emit_vlmax_insn (icode
, BINARY_OP
, ops
);
1084 insn_code icode
= code_for_pred_scalar (MULT
, mode
);
1085 rtx ops
[] = {step_adj
, vid
, step
};
1086 emit_vlmax_insn (icode
, BINARY_OP
, ops
);
1090 /* Step 3: Generate BASE + I * STEP.
1091 - BASE is 0, use result of vid.
1092 - BASE is not 0, we use vadd.vx/vadd.vi. */
1093 if (rtx_equal_p (base
, const0_rtx
))
1094 emit_move_insn (result
, step_adj
);
1097 insn_code icode
= code_for_pred_scalar (PLUS
, mode
);
1098 rtx ops
[] = {result
, step_adj
, base
};
1099 emit_vlmax_insn (icode
, BINARY_OP
, ops
);
1104 emit_move_insn (dest
, result
);
1107 /* Subroutine of riscv_vector_expand_vector_init.
1109 (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER.
1110 (b) Skip leading elements from BUILDER, which are the same as
1111 element NELTS_REQD - 1.
1112 (c) Insert earlier elements in reverse order in TARGET using vslide1down. */
1115 expand_vector_init_insert_elems (rtx target
, const rvv_builder
&builder
,
1118 machine_mode mode
= GET_MODE (target
);
1119 rtx dup
= expand_vector_broadcast (mode
, builder
.elt (0));
1120 emit_move_insn (target
, dup
);
1121 int ndups
= builder
.count_dups (0, nelts_reqd
- 1, 1);
1122 for (int i
= ndups
; i
< nelts_reqd
; i
++)
1125 = FLOAT_MODE_P (mode
) ? UNSPEC_VFSLIDE1DOWN
: UNSPEC_VSLIDE1DOWN
;
1126 insn_code icode
= code_for_pred_slide (unspec
, mode
);
1127 rtx ops
[] = {target
, target
, builder
.elt (i
)};
1128 emit_vlmax_insn (icode
, BINARY_OP
, ops
);
1132 /* Subroutine of expand_vec_init to handle case
1133 when all trailing elements of builder are same.
1134 This works as follows:
1135 (a) Use expand_insn interface to broadcast last vector element in TARGET.
1136 (b) Insert remaining elements in TARGET using insr.
1138 ??? The heuristic used is to do above if number of same trailing elements
1139 is greater than leading_ndups, loosely based on
1140 heuristic from mostly_zeros_p. May need fine-tuning. */
1143 expand_vector_init_trailing_same_elem (rtx target
,
1144 const rtx_vector_builder
&builder
,
1147 int leading_ndups
= builder
.count_dups (0, nelts_reqd
- 1, 1);
1148 int trailing_ndups
= builder
.count_dups (nelts_reqd
- 1, -1, -1);
1149 machine_mode mode
= GET_MODE (target
);
1151 if (trailing_ndups
> leading_ndups
)
1153 rtx dup
= expand_vector_broadcast (mode
, builder
.elt (nelts_reqd
- 1));
1154 for (int i
= nelts_reqd
- trailing_ndups
- 1; i
>= 0; i
--)
1157 = FLOAT_MODE_P (mode
) ? UNSPEC_VFSLIDE1UP
: UNSPEC_VSLIDE1UP
;
1158 insn_code icode
= code_for_pred_slide (unspec
, mode
);
1159 rtx tmp
= gen_reg_rtx (mode
);
1160 rtx ops
[] = {tmp
, dup
, builder
.elt (i
)};
1161 emit_vlmax_insn (icode
, BINARY_OP
, ops
);
1162 /* slide1up need source and dest to be different REG. */
1166 emit_move_insn (target
, dup
);
1174 expand_const_vector (rtx target
, rtx src
)
1176 machine_mode mode
= GET_MODE (target
);
1177 rtx result
= register_operand (target
, mode
) ? target
: gen_reg_rtx (mode
);
1179 if (const_vec_duplicate_p (src
, &elt
))
1181 if (GET_MODE_CLASS (mode
) == MODE_VECTOR_BOOL
)
1183 gcc_assert (rtx_equal_p (elt
, const0_rtx
)
1184 || rtx_equal_p (elt
, const1_rtx
));
1185 rtx ops
[] = {result
, src
};
1186 emit_vlmax_insn (code_for_pred_mov (mode
), UNARY_MASK_OP
, ops
);
1188 /* Element in range -16 ~ 15 integer or 0.0 floating-point,
1189 we use vmv.v.i instruction. */
1190 else if (valid_vec_immediate_p (src
))
1192 rtx ops
[] = {result
, src
};
1193 emit_vlmax_insn (code_for_pred_mov (mode
), UNARY_OP
, ops
);
1197 /* Emit vec_duplicate<mode> split pattern before RA so that
1198 we could have a better optimization opportunity in LICM
1199 which will hoist vmv.v.x outside the loop and in fwprop && combine
1200 which will transform 'vv' into 'vx' instruction.
1202 The reason we don't emit vec_duplicate<mode> split pattern during
1203 RA since the split stage after RA is a too late stage to generate
1204 RVV instruction which need an additional register (We can't
1205 allocate a new register after RA) for VL operand of vsetvl
1206 instruction (vsetvl a5, zero). */
1207 if (lra_in_progress
)
1209 rtx ops
[] = {result
, elt
};
1210 emit_vlmax_insn (code_for_pred_broadcast (mode
), UNARY_OP
, ops
);
1214 struct expand_operand ops
[2];
1215 enum insn_code icode
= optab_handler (vec_duplicate_optab
, mode
);
1216 gcc_assert (icode
!= CODE_FOR_nothing
);
1217 create_output_operand (&ops
[0], result
, mode
);
1218 create_input_operand (&ops
[1], elt
, GET_MODE_INNER (mode
));
1219 expand_insn (icode
, 2, ops
);
1220 result
= ops
[0].value
;
1224 if (result
!= target
)
1225 emit_move_insn (target
, result
);
1229 /* Support scalable const series vector. */
1231 if (const_vec_series_p (src
, &base
, &step
))
1233 expand_vec_series (result
, base
, step
);
1235 if (result
!= target
)
1236 emit_move_insn (target
, result
);
1240 /* Handle variable-length vector. */
1241 unsigned int nelts_per_pattern
= CONST_VECTOR_NELTS_PER_PATTERN (src
);
1242 unsigned int npatterns
= CONST_VECTOR_NPATTERNS (src
);
1243 rvv_builder
builder (mode
, npatterns
, nelts_per_pattern
);
1244 for (unsigned int i
= 0; i
< nelts_per_pattern
; i
++)
1246 for (unsigned int j
= 0; j
< npatterns
; j
++)
1247 builder
.quick_push (CONST_VECTOR_ELT (src
, i
* npatterns
+ j
));
1249 builder
.finalize ();
1251 if (CONST_VECTOR_DUPLICATE_P (src
))
1253 /* Handle the case with repeating sequence that NELTS_PER_PATTERN = 1
1254 E.g. NPATTERNS = 4, v = { 0, 2, 6, 7, ... }
1255 NPATTERNS = 8, v = { 0, 2, 6, 7, 19, 20, 8, 7 ... }
1256 The elements within NPATTERNS are not necessary regular. */
1257 if (builder
.can_duplicate_repeating_sequence_p ())
1259 /* We handle the case that we can find a vector container to hold
1260 element bitsize = NPATTERNS * ele_bitsize.
1262 NPATTERNS = 8, element width = 8
1263 v = { 0, 1, 2, 3, 4, 5, 6, 7, ... }
1264 In this case, we can combine NPATTERNS element into a larger
1265 element. Use element width = 64 and broadcast a vector with
1266 all element equal to 0x0706050403020100. */
1267 rtx ele
= builder
.get_merged_repeating_sequence ();
1268 rtx dup
= expand_vector_broadcast (builder
.new_mode (), ele
);
1269 emit_move_insn (result
, gen_lowpart (mode
, dup
));
1273 /* We handle the case that we can't find a vector container to hold
1274 element bitsize = NPATTERNS * ele_bitsize.
1276 NPATTERNS = 8, element width = 16
1277 v = { 0, 1, 2, 3, 4, 5, 6, 7, ... }
1278 Since NPATTERNS * element width = 128, we can't find a container
1281 In this case, we use NPATTERNS merge operations to generate such
1283 unsigned int nbits
= npatterns
- 1;
1285 /* Generate vid = { 0, 1, 2, 3, 4, 5, 6, 7, ... }. */
1286 rtx vid
= gen_reg_rtx (builder
.int_mode ());
1288 emit_vlmax_insn (code_for_pred_series (builder
.int_mode ()),
1291 /* Generate vid_repeat = { 0, 1, ... nbits, ... } */
1292 rtx vid_repeat
= gen_reg_rtx (builder
.int_mode ());
1293 rtx and_ops
[] = {vid_repeat
, vid
,
1294 gen_int_mode (nbits
, builder
.inner_int_mode ())};
1295 emit_vlmax_insn (code_for_pred_scalar (AND
, builder
.int_mode ()),
1296 BINARY_OP
, and_ops
);
1298 rtx tmp1
= gen_reg_rtx (builder
.mode ());
1299 rtx dup_ops
[] = {tmp1
, builder
.elt (0)};
1300 emit_vlmax_insn (code_for_pred_broadcast (builder
.mode ()), UNARY_OP
,
1302 for (unsigned int i
= 1; i
< builder
.npatterns (); i
++)
1304 /* Generate mask according to i. */
1305 rtx mask
= gen_reg_rtx (builder
.mask_mode ());
1306 rtx const_vec
= gen_const_vector_dup (builder
.int_mode (), i
);
1307 expand_vec_cmp (mask
, EQ
, vid_repeat
, const_vec
);
1309 /* Merge scalar to each i. */
1310 rtx tmp2
= gen_reg_rtx (builder
.mode ());
1311 rtx merge_ops
[] = {tmp2
, tmp1
, builder
.elt (i
), mask
};
1312 insn_code icode
= code_for_pred_merge_scalar (builder
.mode ());
1313 emit_vlmax_insn (icode
, MERGE_OP
, merge_ops
);
1316 emit_move_insn (result
, tmp1
);
1319 else if (CONST_VECTOR_STEPPED_P (src
))
1321 gcc_assert (GET_MODE_CLASS (mode
) == MODE_VECTOR_INT
);
1322 if (builder
.single_step_npatterns_p ())
1324 /* Describe the case by choosing NPATTERNS = 4 as an example. */
1327 /* Step 1: Generate vid = { 0, 1, 2, 3, 4, 5, 6, 7, ... }. */
1328 rtx vid
= gen_reg_rtx (builder
.mode ());
1329 rtx vid_ops
[] = {vid
};
1330 icode
= code_for_pred_series (builder
.mode ());
1331 emit_vlmax_insn (icode
, NULLARY_OP
, vid_ops
);
1333 if (builder
.npatterns_all_equal_p ())
1335 /* Generate the variable-length vector following this rule:
1336 { a, a, a + step, a + step, a + step * 2, a + step * 2, ...}
1337 E.g. { 0, 0, 8, 8, 16, 16, ... } */
1339 /* We want to create a pattern where value[idx] = floor (idx /
1340 NPATTERNS). As NPATTERNS is always a power of two we can
1341 rewrite this as = idx & -NPATTERNS. */
1342 /* Step 2: VID AND -NPATTERNS:
1343 { 0&-4, 1&-4, 2&-4, 3 &-4, 4 &-4, 5 &-4, 6 &-4, 7 &-4, ... }
1346 = gen_int_mode (-builder
.npatterns (), builder
.inner_mode ());
1347 rtx tmp1
= gen_reg_rtx (builder
.mode ());
1348 rtx and_ops
[] = {tmp1
, vid
, imm
};
1349 icode
= code_for_pred_scalar (AND
, builder
.mode ());
1350 emit_vlmax_insn (icode
, BINARY_OP
, and_ops
);
1352 /* Step 3: Convert to step size 1. */
1353 rtx tmp2
= gen_reg_rtx (builder
.mode ());
1354 /* log2 (npatterns) to get the shift amount to convert
1355 Eg. { 0, 0, 0, 0, 4, 4, ... }
1356 into { 0, 0, 0, 0, 1, 1, ... }. */
1357 HOST_WIDE_INT shift_amt
= exact_log2 (builder
.npatterns ()) ;
1358 rtx shift
= gen_int_mode (shift_amt
, builder
.inner_mode ());
1359 rtx shift_ops
[] = {tmp2
, tmp1
, shift
};
1360 icode
= code_for_pred_scalar (ASHIFTRT
, builder
.mode ());
1361 emit_vlmax_insn (icode
, BINARY_OP
, shift_ops
);
1363 /* Step 4: Multiply to step size n. */
1364 HOST_WIDE_INT step_size
=
1365 INTVAL (builder
.elt (builder
.npatterns ()))
1366 - INTVAL (builder
.elt (0));
1367 rtx tmp3
= gen_reg_rtx (builder
.mode ());
1368 if (pow2p_hwi (step_size
))
1370 /* Power of 2 can be handled with a left shift. */
1371 HOST_WIDE_INT shift
= exact_log2 (step_size
);
1372 rtx shift_amount
= gen_int_mode (shift
, Pmode
);
1373 insn_code icode
= code_for_pred_scalar (ASHIFT
, mode
);
1374 rtx ops
[] = {tmp3
, tmp2
, shift_amount
};
1375 emit_vlmax_insn (icode
, BINARY_OP
, ops
);
1379 rtx mult_amt
= gen_int_mode (step_size
, builder
.inner_mode ());
1380 insn_code icode
= code_for_pred_scalar (MULT
, builder
.mode ());
1381 rtx ops
[] = {tmp3
, tmp2
, mult_amt
};
1382 emit_vlmax_insn (icode
, BINARY_OP
, ops
);
1385 /* Step 5: Add starting value to all elements. */
1386 HOST_WIDE_INT init_val
= INTVAL (builder
.elt (0));
1388 emit_move_insn (result
, tmp3
);
1391 rtx dup
= gen_const_vector_dup (builder
.mode (), init_val
);
1392 rtx add_ops
[] = {result
, tmp3
, dup
};
1393 icode
= code_for_pred (PLUS
, builder
.mode ());
1394 emit_vlmax_insn (icode
, BINARY_OP
, add_ops
);
1399 /* Generate the variable-length vector following this rule:
1400 { a, b, a + step, b + step, a + step*2, b + step*2, ... } */
1402 if (builder
.npatterns_vid_diff_repeated_p ())
1404 /* Case 1: For example as below:
1405 {3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8... }
1406 We have 3 - 0 = 3 equals 7 - 4 = 3, the sequence is
1407 repeated as below after minus vid.
1408 {3, 1, -1, -3, 3, 1, -1, -3...}
1409 Then we can simplify the diff code gen to at most
1411 rvv_builder
v (builder
.mode (), builder
.npatterns (), 1);
1413 /* Step 1: Generate diff = TARGET - VID. */
1414 for (unsigned int i
= 0; i
< v
.npatterns (); ++i
)
1416 poly_int64 diff
= rtx_to_poly_int64 (builder
.elt (i
)) - i
;
1417 v
.quick_push (gen_int_mode (diff
, v
.inner_mode ()));
1420 /* Step 2: Generate result = VID + diff. */
1421 rtx vec
= v
.build ();
1422 rtx add_ops
[] = {result
, vid
, vec
};
1423 emit_vlmax_insn (code_for_pred (PLUS
, builder
.mode ()),
1424 BINARY_OP
, add_ops
);
1428 /* Case 2: For example as below:
1429 { -4, 4, -4 + 1, 4 + 1, -4 + 2, 4 + 2, -4 + 3, 4 + 3, ... }
1431 rvv_builder
v (builder
.mode (), builder
.npatterns (), 1);
1433 /* Step 1: Generate { a, b, a, b, ... } */
1434 for (unsigned int i
= 0; i
< v
.npatterns (); ++i
)
1435 v
.quick_push (builder
.elt (i
));
1436 rtx new_base
= v
.build ();
1438 /* Step 2: Generate tmp1 = VID >> LOG2 (NPATTERNS). Â */
1440 = gen_int_mode (exact_log2 (builder
.npatterns ()),
1441 builder
.inner_mode ());
1442 rtx tmp1
= expand_simple_binop (builder
.mode (), LSHIFTRT
,
1443 vid
, shift_count
, NULL_RTX
,
1444 false, OPTAB_DIRECT
);
1446 /* Step 3: Generate tmp2 = tmp1 * step. Â */
1447 rtx tmp2
= gen_reg_rtx (builder
.mode ());
1449 = simplify_binary_operation (MINUS
, builder
.inner_mode (),
1450 builder
.elt (v
.npatterns()),
1452 expand_vec_series (tmp2
, const0_rtx
, step
, tmp1
);
1454 /* Step 4: Generate result = tmp2 + new_base. Â */
1455 rtx add_ops
[] = {result
, tmp2
, new_base
};
1456 emit_vlmax_insn (code_for_pred (PLUS
, builder
.mode ()),
1457 BINARY_OP
, add_ops
);
1461 else if (builder
.interleaved_stepped_npatterns_p ())
1463 rtx base1
= builder
.elt (0);
1464 rtx base2
= builder
.elt (1);
1466 = rtx_to_poly_int64 (builder
.elt (builder
.npatterns ()))
1467 - rtx_to_poly_int64 (base1
);
1469 = rtx_to_poly_int64 (builder
.elt (builder
.npatterns () + 1))
1470 - rtx_to_poly_int64 (base2
);
1472 /* For { 1, 0, 2, 0, ... , n - 1, 0 }, we can use larger EEW
1473 integer vector mode to generate such vector efficiently.
1475 E.g. EEW = 16, { 2, 0, 4, 0, ... }
1477 can be interpreted into:
1479 EEW = 32, { 2, 4, ... } */
1480 unsigned int new_smode_bitsize
= builder
.inner_bits_size () * 2;
1481 scalar_int_mode new_smode
;
1482 machine_mode new_mode
;
1483 poly_uint64 new_nunits
1484 = exact_div (GET_MODE_NUNITS (builder
.mode ()), 2);
1485 if (int_mode_for_size (new_smode_bitsize
, 0).exists (&new_smode
)
1486 && get_vector_mode (new_smode
, new_nunits
).exists (&new_mode
))
1488 rtx tmp1
= gen_reg_rtx (new_mode
);
1489 base1
= gen_int_mode (rtx_to_poly_int64 (base1
), new_smode
);
1490 expand_vec_series (tmp1
, base1
, gen_int_mode (step1
, new_smode
));
1492 if (rtx_equal_p (base2
, const0_rtx
) && known_eq (step2
, 0))
1493 /* { 1, 0, 2, 0, ... }. */
1494 emit_move_insn (result
, gen_lowpart (mode
, tmp1
));
1495 else if (known_eq (step2
, 0))
1497 /* { 1, 1, 2, 1, ... }. */
1498 rtx scalar
= expand_simple_binop (
1500 gen_int_mode (rtx_to_poly_int64 (base2
), new_smode
),
1501 gen_int_mode (builder
.inner_bits_size (), new_smode
),
1502 NULL_RTX
, false, OPTAB_DIRECT
);
1503 rtx tmp2
= gen_reg_rtx (new_mode
);
1504 rtx ior_ops
[] = {tmp2
, tmp1
, scalar
};
1505 emit_vlmax_insn (code_for_pred_scalar (IOR
, new_mode
),
1506 BINARY_OP
, ior_ops
);
1507 emit_move_insn (result
, gen_lowpart (mode
, tmp2
));
1511 /* { 1, 3, 2, 6, ... }. */
1512 rtx tmp2
= gen_reg_rtx (new_mode
);
1513 base2
= gen_int_mode (rtx_to_poly_int64 (base2
), new_smode
);
1514 expand_vec_series (tmp2
, base2
,
1515 gen_int_mode (step2
, new_smode
));
1516 rtx shifted_tmp2
= expand_simple_binop (
1517 new_mode
, ASHIFT
, tmp2
,
1518 gen_int_mode (builder
.inner_bits_size (), Pmode
), NULL_RTX
,
1519 false, OPTAB_DIRECT
);
1520 rtx tmp3
= gen_reg_rtx (new_mode
);
1521 rtx ior_ops
[] = {tmp3
, tmp1
, shifted_tmp2
};
1522 emit_vlmax_insn (code_for_pred (IOR
, new_mode
), BINARY_OP
,
1524 emit_move_insn (result
, gen_lowpart (mode
, tmp3
));
1529 rtx vid
= gen_reg_rtx (mode
);
1530 expand_vec_series (vid
, const0_rtx
, const1_rtx
);
1531 /* Transform into { 0, 0, 1, 1, 2, 2, ... }. */
1533 = expand_simple_binop (mode
, LSHIFTRT
, vid
, const1_rtx
,
1534 NULL_RTX
, false, OPTAB_DIRECT
);
1535 rtx tmp1
= gen_reg_rtx (mode
);
1536 rtx tmp2
= gen_reg_rtx (mode
);
1537 expand_vec_series (tmp1
, base1
,
1538 gen_int_mode (step1
, builder
.inner_mode ()),
1540 expand_vec_series (tmp2
, base2
,
1541 gen_int_mode (step2
, builder
.inner_mode ()),
1544 /* Transform into { 0, 1, 0, 1, 0, 1, ... }. */
1545 rtx and_vid
= gen_reg_rtx (mode
);
1546 rtx and_ops
[] = {and_vid
, vid
, const1_rtx
};
1547 emit_vlmax_insn (code_for_pred_scalar (AND
, mode
), BINARY_OP
,
1549 rtx mask
= gen_reg_rtx (builder
.mask_mode ());
1550 expand_vec_cmp (mask
, EQ
, and_vid
, CONST1_RTX (mode
));
1552 rtx ops
[] = {result
, tmp1
, tmp2
, mask
};
1553 emit_vlmax_insn (code_for_pred_merge (mode
), MERGE_OP
, ops
);
1557 /* TODO: We will enable more variable-length vector in the future. */
1563 if (result
!= target
)
1564 emit_move_insn (target
, result
);
1567 /* Get the frm mode with given CONST_INT rtx, the default mode is
1569 enum floating_point_rounding_mode
1570 get_frm_mode (rtx operand
)
1572 gcc_assert (CONST_INT_P (operand
));
1574 switch (INTVAL (operand
))
1595 /* Expand a pre-RA RVV data move from SRC to DEST.
1596 It expands move for RVV fractional vector modes.
1597 Return true if the move as already been emitted. */
1599 legitimize_move (rtx dest
, rtx
*srcp
)
1602 machine_mode mode
= GET_MODE (dest
);
1603 if (CONST_VECTOR_P (src
))
1605 expand_const_vector (dest
, src
);
1609 if (riscv_v_ext_vls_mode_p (mode
))
1611 if (GET_MODE_NUNITS (mode
).to_constant () <= 31)
1613 /* For NUNITS <= 31 VLS modes, we don't need extract
1614 scalar registers so we apply the naive (set (op0) (op1)) pattern. */
1615 if (can_create_pseudo_p ())
1617 /* Need to force register if mem <- !reg. */
1618 if (MEM_P (dest
) && !REG_P (src
))
1619 *srcp
= force_reg (mode
, src
);
1624 else if (GET_MODE_NUNITS (mode
).to_constant () > 31 && lra_in_progress
)
1626 emit_insn (gen_mov_lra (mode
, Pmode
, dest
, src
));
1632 /* In order to decrease the memory traffic, we don't use whole register
1633 * load/store for the LMUL less than 1 and mask mode, so those case will
1634 * require one extra general purpose register, but it's not allowed during
1635 * LRA process, so we have a special move pattern used for LRA, which will
1636 * defer the expansion after LRA. */
1637 if ((known_lt (GET_MODE_SIZE (mode
), BYTES_PER_RISCV_VECTOR
)
1638 || GET_MODE_CLASS (mode
) == MODE_VECTOR_BOOL
)
1641 emit_insn (gen_mov_lra (mode
, Pmode
, dest
, src
));
1645 if (known_ge (GET_MODE_SIZE (mode
), BYTES_PER_RISCV_VECTOR
)
1646 && GET_MODE_CLASS (mode
) != MODE_VECTOR_BOOL
)
1648 /* Need to force register if mem <- !reg. */
1649 if (MEM_P (dest
) && !REG_P (src
))
1650 *srcp
= force_reg (mode
, src
);
1656 if (register_operand (src
, mode
) && register_operand (dest
, mode
))
1658 emit_insn (gen_rtx_SET (dest
, src
));
1663 = GET_MODE_CLASS (mode
) == MODE_VECTOR_BOOL
? UNARY_MASK_OP
: UNARY_OP
;
1664 if (!register_operand (src
, mode
) && !register_operand (dest
, mode
))
1666 rtx tmp
= gen_reg_rtx (mode
);
1669 rtx ops
[] = {tmp
, src
};
1670 emit_vlmax_insn (code_for_pred_mov (mode
), insn_flags
, ops
);
1673 emit_move_insn (tmp
, src
);
1677 if (satisfies_constraint_vu (src
))
1680 rtx ops
[] = {dest
, src
};
1681 emit_vlmax_insn (code_for_pred_mov (mode
), insn_flags
, ops
);
1685 /* VTYPE information for machine_mode. */
1686 struct mode_vtype_group
1688 enum vlmul_type vlmul
[NUM_MACHINE_MODES
];
1689 uint8_t ratio
[NUM_MACHINE_MODES
];
1690 machine_mode subpart_mode
[NUM_MACHINE_MODES
];
1691 uint8_t nf
[NUM_MACHINE_MODES
];
1694 #define ENTRY(MODE, REQUIREMENT, VLMUL, RATIO) \
1695 vlmul[MODE##mode] = VLMUL; \
1696 ratio[MODE##mode] = RATIO;
1697 #define TUPLE_ENTRY(MODE, REQUIREMENT, SUBPART_MODE, NF, VLMUL, RATIO) \
1698 subpart_mode[MODE##mode] = SUBPART_MODE##mode; \
1699 nf[MODE##mode] = NF; \
1700 vlmul[MODE##mode] = VLMUL; \
1701 ratio[MODE##mode] = RATIO;
1702 #include "riscv-vector-switch.def"
1708 static mode_vtype_group mode_vtype_infos
;
1710 /* Get vlmul field value by comparing LMUL with BYTES_PER_RISCV_VECTOR. */
1712 get_vlmul (machine_mode mode
)
1714 /* For VLS modes, the vlmul should be dynamically
1715 calculated since we need to adjust VLMUL according
1716 to TARGET_MIN_VLEN. */
1717 if (riscv_v_ext_vls_mode_p (mode
))
1719 int size
= GET_MODE_BITSIZE (mode
).to_constant ();
1720 int inner_size
= GET_MODE_BITSIZE (GET_MODE_INNER (mode
));
1721 if (size
< TARGET_MIN_VLEN
)
1723 int factor
= TARGET_MIN_VLEN
/ size
;
1724 if (inner_size
== 8)
1725 factor
= MIN (factor
, 8);
1726 else if (inner_size
== 16)
1727 factor
= MIN (factor
, 4);
1728 else if (inner_size
== 32)
1729 factor
= MIN (factor
, 2);
1730 else if (inner_size
== 64)
1731 factor
= MIN (factor
, 1);
1752 int factor
= size
/ TARGET_MIN_VLEN
;
1769 return mode_vtype_infos
.vlmul
[mode
];
1772 /* Return the VLMAX rtx of vector mode MODE. */
1774 get_vlmax_rtx (machine_mode mode
)
1776 gcc_assert (riscv_v_ext_vector_mode_p (mode
));
1777 return gen_int_mode (GET_MODE_NUNITS (mode
), Pmode
);
1780 /* Return the NF value of the corresponding mode. */
1782 get_nf (machine_mode mode
)
1784 /* We don't allow non-tuple modes go through this function. */
1785 gcc_assert (riscv_v_ext_tuple_mode_p (mode
));
1786 return mode_vtype_infos
.nf
[mode
];
1789 /* Return the subpart mode of the tuple mode. For RVVM2x2SImode,
1790 the subpart mode is RVVM2SImode. This will help to build
1791 array/struct type in builtins. */
1793 get_subpart_mode (machine_mode mode
)
1795 /* We don't allow non-tuple modes go through this function. */
1796 gcc_assert (riscv_v_ext_tuple_mode_p (mode
));
1797 return mode_vtype_infos
.subpart_mode
[mode
];
1800 /* Get ratio according to machine mode. */
1802 get_ratio (machine_mode mode
)
1804 if (riscv_v_ext_vls_mode_p (mode
))
1806 unsigned int sew
= get_sew (mode
);
1807 vlmul_type vlmul
= get_vlmul (mode
);
1829 return mode_vtype_infos
.ratio
[mode
];
1832 /* Get ta according to operand[tail_op_idx]. */
1836 if (INTVAL (ta
) == TAIL_ANY
)
1837 return INVALID_ATTRIBUTE
;
1841 /* Get ma according to operand[mask_op_idx]. */
1845 if (INTVAL (ma
) == MASK_ANY
)
1846 return INVALID_ATTRIBUTE
;
1850 /* Get prefer tail policy. */
1852 get_prefer_tail_policy ()
1854 /* TODO: By default, we choose to use TAIL_ANY which allows
1855 compiler pick up either agnostic or undisturbed. Maybe we
1856 will have a compile option like -mprefer=agnostic to set
1861 /* Get prefer mask policy. */
1863 get_prefer_mask_policy ()
1865 /* TODO: By default, we choose to use MASK_ANY which allows
1866 compiler pick up either agnostic or undisturbed. Maybe we
1867 will have a compile option like -mprefer=agnostic to set
1872 /* Get avl_type rtx. */
1874 get_avl_type_rtx (enum avl_type type
)
1876 return gen_int_mode (type
, Pmode
);
1879 /* Return the appropriate mask mode for MODE. */
1882 get_mask_mode (machine_mode mode
)
1884 poly_int64 nunits
= GET_MODE_NUNITS (mode
);
1885 if (riscv_v_ext_tuple_mode_p (mode
))
1887 unsigned int nf
= get_nf (mode
);
1888 nunits
= exact_div (nunits
, nf
);
1890 return get_vector_mode (BImode
, nunits
).require ();
1893 /* Return the appropriate LMUL mode for MODE. */
1896 get_lmul_mode (scalar_mode mode
, int lmul
)
1898 poly_uint64 lmul_nunits
;
1899 unsigned int bytes
= GET_MODE_SIZE (mode
);
1900 if (multiple_p (BYTES_PER_RISCV_VECTOR
* lmul
, bytes
, &lmul_nunits
))
1901 return get_vector_mode (mode
, lmul_nunits
);
1905 /* Return the appropriate M1 mode for MODE. */
1907 static opt_machine_mode
1908 get_m1_mode (machine_mode mode
)
1910 scalar_mode smode
= GET_MODE_INNER (mode
);
1911 unsigned int bytes
= GET_MODE_SIZE (smode
);
1912 poly_uint64 m1_nunits
= exact_div (BYTES_PER_RISCV_VECTOR
, bytes
);
1913 return get_vector_mode (smode
, m1_nunits
);
1916 /* Return the RVV vector mode that has NUNITS elements of mode INNER_MODE.
1917 This function is not only used by builtins, but also will be used by
1918 auto-vectorization in the future. */
1920 get_vector_mode (scalar_mode inner_mode
, poly_uint64 nunits
)
1922 enum mode_class mclass
;
1923 if (inner_mode
== E_BImode
)
1924 mclass
= MODE_VECTOR_BOOL
;
1925 else if (FLOAT_MODE_P (inner_mode
))
1926 mclass
= MODE_VECTOR_FLOAT
;
1928 mclass
= MODE_VECTOR_INT
;
1930 FOR_EACH_MODE_IN_CLASS (mode
, mclass
)
1931 if (inner_mode
== GET_MODE_INNER (mode
)
1932 && known_eq (nunits
, GET_MODE_NUNITS (mode
))
1933 && (riscv_v_ext_vector_mode_p (mode
)
1934 || riscv_v_ext_vls_mode_p (mode
)))
1936 return opt_machine_mode ();
1939 /* Return the RVV tuple mode if we can find the legal tuple mode for the
1940 corresponding subpart mode and NF. */
1942 get_tuple_mode (machine_mode subpart_mode
, unsigned int nf
)
1944 poly_uint64 nunits
= GET_MODE_NUNITS (subpart_mode
) * nf
;
1945 scalar_mode inner_mode
= GET_MODE_INNER (subpart_mode
);
1946 enum mode_class mclass
= GET_MODE_CLASS (subpart_mode
);
1948 FOR_EACH_MODE_IN_CLASS (mode
, mclass
)
1949 if (inner_mode
== GET_MODE_INNER (mode
)
1950 && known_eq (nunits
, GET_MODE_NUNITS (mode
))
1951 && riscv_v_ext_tuple_mode_p (mode
)
1952 && get_subpart_mode (mode
) == subpart_mode
)
1954 return opt_machine_mode ();
1960 if (!CONST_INT_P (x
))
1962 return IN_RANGE (INTVAL (x
), -16, 15);
1968 if (!CONST_INT_P (x
))
1970 return IN_RANGE (INTVAL (x
), -15, 16);
1974 has_vi_variant_p (rtx_code code
, rtx x
)
1998 return neg_simm5_p (x
);
2006 sew64_scalar_helper (rtx
*operands
, rtx
*scalar_op
, rtx vl
,
2007 machine_mode vector_mode
, bool has_vi_variant_p
,
2008 void (*emit_vector_func
) (rtx
*, rtx
), enum avl_type type
)
2010 machine_mode scalar_mode
= GET_MODE_INNER (vector_mode
);
2011 if (has_vi_variant_p
)
2013 *scalar_op
= force_reg (scalar_mode
, *scalar_op
);
2019 if (!rtx_equal_p (*scalar_op
, const0_rtx
))
2020 *scalar_op
= force_reg (scalar_mode
, *scalar_op
);
2024 if (immediate_operand (*scalar_op
, Pmode
))
2026 if (!rtx_equal_p (*scalar_op
, const0_rtx
))
2027 *scalar_op
= force_reg (Pmode
, *scalar_op
);
2029 *scalar_op
= gen_rtx_SIGN_EXTEND (scalar_mode
, *scalar_op
);
2033 if (CONST_INT_P (*scalar_op
))
2035 if (maybe_gt (GET_MODE_SIZE (scalar_mode
), GET_MODE_SIZE (Pmode
)))
2036 *scalar_op
= force_const_mem (scalar_mode
, *scalar_op
);
2038 *scalar_op
= force_reg (scalar_mode
, *scalar_op
);
2041 rtx tmp
= gen_reg_rtx (vector_mode
);
2042 rtx ops
[] = {tmp
, *scalar_op
};
2044 emit_vlmax_insn (code_for_pred_broadcast (vector_mode
), UNARY_OP
, ops
);
2046 emit_nonvlmax_insn (code_for_pred_broadcast (vector_mode
), UNARY_OP
, ops
,
2048 emit_vector_func (operands
, tmp
);
2053 /* Get { ... ,0, 0, 0, ..., 0, 0, 0, 1 } mask. */
2055 gen_scalar_move_mask (machine_mode mode
)
2057 rtx_vector_builder
builder (mode
, 1, 2);
2058 builder
.quick_push (const1_rtx
);
2059 builder
.quick_push (const0_rtx
);
2060 return builder
.build ();
2064 compute_vlmax (unsigned vector_bits
, unsigned elt_size
, unsigned min_size
)
2066 // Original equation:
2067 // VLMAX = (VectorBits / EltSize) * LMUL
2068 // where LMUL = MinSize / TARGET_MIN_VLEN
2069 // The following equations have been reordered to prevent loss of precision
2070 // when calculating fractional LMUL.
2071 return ((vector_bits
/ elt_size
) * min_size
) / TARGET_MIN_VLEN
;
/* Return TARGET_MIN_VLEN scaled by the integral LMUL of MODE, i.e. the
   minimum register-group size in bits when the real VLEN is unknown.  */
static unsigned
get_unknown_min_value (machine_mode mode)
{
  enum vlmul_type vlmul = get_vlmul (mode);
  switch (vlmul)
    {
    case LMUL_1:
      return TARGET_MIN_VLEN;
    case LMUL_2:
      return TARGET_MIN_VLEN * 2;
    case LMUL_4:
      return TARGET_MIN_VLEN * 4;
    case LMUL_8:
      return TARGET_MIN_VLEN * 8;
    default:
      gcc_unreachable ();
    }
}
/* Force VL into a register if it is a constant that does not satisfy the
   K constraint (the 5-bit immediate accepted by vsetivli).  */
static rtx
force_vector_length_operand (rtx vl)
{
  if (CONST_INT_P (vl) && !satisfies_constraint_K (vl))
    return force_reg (Pmode, vl);
  return vl;
}
static rtx
gen_no_side_effects_vsetvl_rtx (machine_mode vmode, rtx vl, rtx avl)
{
  unsigned int sew = get_sew (vmode);
  rtx tail_policy = gen_int_mode (get_prefer_tail_policy (), Pmode);
  rtx mask_policy = gen_int_mode (get_prefer_mask_policy (), Pmode);
  return gen_vsetvl_no_side_effects (Pmode, vl, avl, gen_int_mode (sew, Pmode),
                                     gen_int_mode (get_vlmul (vmode), Pmode),
                                     tail_policy, mask_policy);
}
/* Get the VL * 2 rtx.  */
static rtx
get_vl_x2_rtx (rtx avl, machine_mode mode, machine_mode demote_mode)
{
  rtx i32vl = NULL_RTX;
  if (CONST_INT_P (avl))
    {
      unsigned elt_size = GET_MODE_BITSIZE (GET_MODE_INNER (mode));
      unsigned min_size = get_unknown_min_value (mode);
      unsigned vlen_max = RVV_65536;
      unsigned vlmax_max = compute_vlmax (vlen_max, elt_size, min_size);
      unsigned vlen_min = TARGET_MIN_VLEN;
      unsigned vlmax_min = compute_vlmax (vlen_min, elt_size, min_size);

      unsigned HOST_WIDE_INT avl_int = INTVAL (avl);
      if (avl_int <= vlmax_min)
        i32vl = gen_int_mode (2 * avl_int, Pmode);
      else if (avl_int >= 2 * vlmax_max)
        {
          // Just set i32vl to VLMAX in this situation
          i32vl = gen_reg_rtx (Pmode);
          emit_insn (
            gen_no_side_effects_vsetvl_rtx (demote_mode, i32vl, RVV_VLMAX));
        }
      else
        {
          // For AVL between (MinVLMAX, 2 * MaxVLMAX), the actual working vl
          // is related to the hardware implementation.
          // So let the following code handle it.
        }
    }
  if (!i32vl)
    {
      // Using vsetvli instruction to get actually used length which related to
      // the hardware implementation
      rtx i64vl = gen_reg_rtx (Pmode);
      emit_insn (
        gen_no_side_effects_vsetvl_rtx (mode, i64vl, force_reg (Pmode, avl)));
      // scale 2 for 32-bit length
      i32vl = gen_reg_rtx (Pmode);
      emit_insn (
        gen_rtx_SET (i32vl, gen_rtx_ASHIFT (Pmode, i64vl, const1_rtx)));
    }

  return force_vector_length_operand (i32vl);
}
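
/* Note (illustrative): on RV32 a 64-bit element operation such as
   vslide1up.vx cannot take its scalar operand in a single GPR, so the helper
   below reinterprets the vector with SEW = 32.  Every 64-bit element then
   corresponds to two 32-bit elements, which is why the working vector length
   computed above must be doubled.  */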
bool
slide1_sew64_helper (int unspec, machine_mode mode, machine_mode demote_mode,
                     machine_mode demote_mask_mode, rtx *ops)
{
  rtx scalar_op = ops[4];
  rtx avl = ops[5];
  machine_mode scalar_mode = GET_MODE_INNER (mode);
  if (rtx_equal_p (scalar_op, const0_rtx))
    {
      ops[5] = force_vector_length_operand (ops[5]);
      return false;
    }

  if (TARGET_64BIT)
    {
      ops[4] = force_reg (scalar_mode, scalar_op);
      ops[5] = force_vector_length_operand (ops[5]);
      return false;
    }

  if (immediate_operand (scalar_op, Pmode))
    {
      ops[4] = gen_rtx_SIGN_EXTEND (scalar_mode, force_reg (Pmode, scalar_op));
      ops[5] = force_vector_length_operand (ops[5]);
      return false;
    }

  if (CONST_INT_P (scalar_op))
    scalar_op = force_reg (scalar_mode, scalar_op);

  rtx vl_x2 = get_vl_x2_rtx (avl, mode, demote_mode);

  rtx demote_scalar_op1, demote_scalar_op2;
  if (unspec == UNSPEC_VSLIDE1UP)
    {
      demote_scalar_op1 = gen_highpart (Pmode, scalar_op);
      demote_scalar_op2 = gen_lowpart (Pmode, scalar_op);
    }
  else
    {
      demote_scalar_op1 = gen_lowpart (Pmode, scalar_op);
      demote_scalar_op2 = gen_highpart (Pmode, scalar_op);
    }

  rtx temp = gen_reg_rtx (demote_mode);
  rtx ta = gen_int_mode (get_prefer_tail_policy (), Pmode);
  rtx ma = gen_int_mode (get_prefer_mask_policy (), Pmode);
  rtx merge = RVV_VUNDEF (demote_mode);
  /* Handle vslide1<ud>_tu.  */
  if (register_operand (ops[2], mode)
      && rtx_equal_p (ops[1], CONSTM1_RTX (GET_MODE (ops[1]))))
    merge = gen_lowpart (demote_mode, ops[2]);

  emit_insn (gen_pred_slide (unspec, demote_mode, temp,
                             CONSTM1_RTX (demote_mask_mode), merge,
                             gen_lowpart (demote_mode, ops[3]),
                             demote_scalar_op1, vl_x2, ta, ma, ops[8]));
  emit_insn (gen_pred_slide (unspec, demote_mode,
                             gen_lowpart (demote_mode, ops[0]),
                             CONSTM1_RTX (demote_mask_mode), merge, temp,
                             demote_scalar_op2, vl_x2, ta, ma, ops[8]));

  if (!rtx_equal_p (ops[1], CONSTM1_RTX (GET_MODE (ops[1])))
      && !rtx_equal_p (ops[2], RVV_VUNDEF (GET_MODE (ops[2]))))
    emit_insn (gen_pred_merge (mode, ops[0], ops[2], ops[2], ops[0], ops[1],
                               force_vector_length_operand (ops[5]), ops[6],
                               ops[8]));
  return true;
}
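
/* Sketch (illustrative) of the sequence emitted above on RV32: the 64-bit
   scalar is split into its two 32-bit halves; for vslide1up the high half is
   inserted first and the low half second (the order is reversed for
   vslide1down), each slide using the doubled VL from get_vl_x2_rtx, and a
   trailing vmerge restores the mask/merge semantics of the original
   SEW = 64 pattern when needed.  */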
static rtx
gen_avl_for_scalar_move (rtx avl)
{
  /* AVL for a scalar move behaves differently for 0 and for values larger
     than 0.  */
  if (CONST_INT_P (avl))
    {
      /* So we could just set AVL to 1 for any constant other than 0.  */
      if (rtx_equal_p (avl, const0_rtx))
        return const0_rtx;
      else
        return const1_rtx;
    }
  else
    {
      /* For a non-constant value, we set any non-zero value to 1 by
         `sgtu new_avl,input_avl,zero` + `vsetvli`.  */
      rtx tmp = gen_reg_rtx (Pmode);
      emit_insn (
        gen_rtx_SET (tmp, gen_rtx_fmt_ee (GTU, Pmode, avl, const0_rtx)));
      return tmp;
    }
}
/* Expand data movement for tuple modes.  */
void
expand_tuple_move (rtx *ops)
{
  unsigned int i;
  machine_mode tuple_mode = GET_MODE (ops[0]);
  machine_mode subpart_mode = get_subpart_mode (tuple_mode);
  poly_int64 subpart_size = GET_MODE_SIZE (subpart_mode);
  unsigned int nf = get_nf (tuple_mode);
  bool fractional_p = known_lt (subpart_size, BYTES_PER_RISCV_VECTOR);

  if (REG_P (ops[0]) && CONST_VECTOR_P (ops[1]))
    {
      rtx val;
      gcc_assert (can_create_pseudo_p ()
                  && const_vec_duplicate_p (ops[1], &val));
      for (i = 0; i < nf; ++i)
        {
          poly_int64 offset = i * subpart_size;
          rtx subreg
            = simplify_gen_subreg (subpart_mode, ops[0], tuple_mode, offset);
          rtx dup = gen_const_vec_duplicate (subpart_mode, val);
          emit_move_insn (subreg, dup);
        }
    }
  else if (REG_P (ops[0]) && REG_P (ops[1]))
    {
      for (i = 0; i < nf; ++i)
        {
          int index = i;

          /* Take NF = 2 and LMUL = 1 for example: when the source and
             destination register groups overlap, copy the subparts in the
             order that guarantees no subpart is clobbered before it has
             been read.  */
          if (REGNO (ops[0]) > REGNO (ops[1]))
            index = nf - 1 - i;
          poly_int64 offset = index * subpart_size;
          rtx dst_subreg
            = simplify_gen_subreg (subpart_mode, ops[0], tuple_mode, offset);
          rtx src_subreg
            = simplify_gen_subreg (subpart_mode, ops[1], tuple_mode, offset);
          emit_insn (gen_rtx_SET (dst_subreg, src_subreg));
        }
    }
  else
    {
      /* Expand tuple memory data movement.  */
      gcc_assert (MEM_P (ops[0]) || MEM_P (ops[1]));
      rtx offset = gen_int_mode (subpart_size, Pmode);
      if (!subpart_size.is_constant ())
        {
          emit_move_insn (ops[2], gen_int_mode (BYTES_PER_RISCV_VECTOR, Pmode));
          if (fractional_p)
            {
              unsigned int factor
                = exact_div (BYTES_PER_RISCV_VECTOR, subpart_size)
                    .to_constant ();
              rtx pat
                = gen_rtx_ASHIFTRT (Pmode, ops[2],
                                    gen_int_mode (exact_log2 (factor), Pmode));
              emit_insn (gen_rtx_SET (ops[2], pat));
            }

          if (known_gt (subpart_size, BYTES_PER_RISCV_VECTOR))
            {
              unsigned int factor
                = exact_div (subpart_size, BYTES_PER_RISCV_VECTOR)
                    .to_constant ();
              rtx pat
                = gen_rtx_ASHIFT (Pmode, ops[2],
                                  gen_int_mode (exact_log2 (factor), Pmode));
              emit_insn (gen_rtx_SET (ops[2], pat));
            }
          offset = ops[2];
        }

      /* Non-fractional LMUL has whole register moves that don't require a
         vsetvl for VLMAX.  */
      if (fractional_p)
        emit_vlmax_vsetvl (subpart_mode, ops[4]);

      if (MEM_P (ops[1]))
        {
          /* Load operations.  */
          emit_move_insn (ops[3], XEXP (ops[1], 0));
          for (i = 0; i < nf; i++)
            {
              rtx subreg = simplify_gen_subreg (subpart_mode, ops[0],
                                                tuple_mode, i * subpart_size);
              if (i != 0)
                {
                  rtx new_addr = gen_rtx_PLUS (Pmode, ops[3], offset);
                  emit_insn (gen_rtx_SET (ops[3], new_addr));
                }
              rtx mem = gen_rtx_MEM (subpart_mode, ops[3]);

              if (fractional_p)
                {
                  rtx operands[] = {subreg, mem};
                  emit_vlmax_insn_lra (code_for_pred_mov (subpart_mode),
                                       UNARY_OP, operands, ops[4]);
                }
              else
                emit_move_insn (subreg, mem);
            }
        }
      else
        {
          /* Store operations.  */
          emit_move_insn (ops[3], XEXP (ops[0], 0));
          for (i = 0; i < nf; i++)
            {
              rtx subreg = simplify_gen_subreg (subpart_mode, ops[1],
                                                tuple_mode, i * subpart_size);
              if (i != 0)
                {
                  rtx new_addr = gen_rtx_PLUS (Pmode, ops[3], offset);
                  emit_insn (gen_rtx_SET (ops[3], new_addr));
                }
              rtx mem = gen_rtx_MEM (subpart_mode, ops[3]);

              if (fractional_p)
                {
                  rtx operands[] = {mem, subreg};
                  emit_vlmax_insn_lra (code_for_pred_mov (subpart_mode),
                                       UNARY_OP, operands, ops[4]);
                }
              else
                emit_move_insn (mem, subreg);
            }
        }
    }
}
/* Return the vectorization machine mode for RVV according to LMUL.  */
machine_mode
preferred_simd_mode (scalar_mode mode)
{
  if (autovec_use_vlmax_p ())
    {
      /* We use LMUL = 1 as the base byte size, which is BYTES_PER_RISCV_VECTOR,
         and rvv_max_lmul as the multiply factor to calculate the NUNITS that
         gives the auto-vectorization mode.  */
      poly_uint64 nunits;
      poly_uint64 vector_size = BYTES_PER_RISCV_VECTOR * TARGET_MAX_LMUL;
      poly_uint64 scalar_size = GET_MODE_SIZE (mode);
      /* Disable vectorization when we can't find an RVV mode for it.
         E.g. -march=rv64gc_zve32x doesn't have a vector mode to vectorize
         a double (DFmode) type.  */
      if (!multiple_p (vector_size, scalar_size, &nunits))
        return word_mode;
      machine_mode rvv_mode;
      if (get_vector_mode (mode, nunits).exists (&rvv_mode))
        return rvv_mode;
    }
  return word_mode;
}
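
/* Example (illustrative): with TARGET_MIN_VLEN = 128
   (BYTES_PER_RISCV_VECTOR = [16, 16]) and TARGET_MAX_LMUL = 1, a 4-byte
   element gives NUNITS = [4, 4] and RVVM1SImode is returned; with
   TARGET_MAX_LMUL = 8 the same element gives NUNITS = [32, 32] and
   RVVM8SImode.  */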
/* Use the merge approach to initialize a vector with a repeating sequence.
     v = {a, b, a, b, a, b, a, b}.

     v = broadcast (a).
     mask = 0b01010101....
     v = merge (v, b, mask)  */
static void
expand_vector_init_merge_repeating_sequence (rtx target,
                                             const rvv_builder &builder)
{
  /* We can't use BIT mode (BI) directly to generate mask = 0b01010...
     since we don't have such an instruction in RVV.
     Instead, we should use INT mode (QI/HI/SI/DI) with an integer move
     instruction to generate the mask data we want.  */
  machine_mode mask_bit_mode = get_mask_mode (builder.mode ());
  machine_mode mask_int_mode
    = get_repeating_sequence_dup_machine_mode (builder, mask_bit_mode);
  uint64_t full_nelts = builder.full_nelts ().to_constant ();

  /* Step 1: Broadcast the first pattern.  */
  rtx ops[] = {target, force_reg (builder.inner_mode (), builder.elt (0))};
  emit_vlmax_insn (code_for_pred_broadcast (builder.mode ()),
                   UNARY_OP, ops);
  /* Step 2: Merge the rest iteration of pattern.  */
  for (unsigned int i = 1; i < builder.npatterns (); i++)
    {
      /* Step 2-1: Generate mask register v0 for each merge.  */
      rtx merge_mask
        = builder.get_merge_scalar_mask (i, GET_MODE_INNER (mask_int_mode));
      rtx mask = gen_reg_rtx (mask_bit_mode);
      rtx dup = gen_reg_rtx (mask_int_mode);

      if (full_nelts <= builder.inner_bits_size ()) /* vmv.s.x.  */
        {
          rtx ops[] = {dup, merge_mask};
          emit_nonvlmax_insn (code_for_pred_broadcast (GET_MODE (dup)),
                              SCALAR_MOVE_OP, ops, CONST1_RTX (Pmode));
        }
      else
        {
          rtx ops[] = {dup,
                       force_reg (GET_MODE_INNER (mask_int_mode), merge_mask)};
          rtx vl = gen_int_mode (CEIL (full_nelts, builder.inner_bits_size ()),
                                 Pmode);
          emit_nonvlmax_insn (code_for_pred_broadcast (mask_int_mode), UNARY_OP,
                              ops, vl);
        }

      emit_move_insn (mask, gen_lowpart (mask_bit_mode, dup));

      /* Step 2-2: Merge pattern according to the mask.  */
      rtx merge_ops[] = {target, target, builder.elt (i), mask};
      emit_vlmax_insn (code_for_pred_merge_scalar (GET_MODE (target)),
                       MERGE_OP, merge_ops);
    }
}
/* Use slideup approach to combine the vectors.
     v = {a, a, a, a, b, b, b, b}

     First:
     v1 = {a, a, a, a, a, a, a, a}
     v2 = {b, b, b, b, b, b, b, b}
     v = slideup (v1, v2, nelt / 2)  */
static void
expand_vector_init_slideup_combine_sequence (rtx target,
                                             const rvv_builder &builder)
{
  machine_mode mode = GET_MODE (target);
  int nelts = builder.full_nelts ().to_constant ();
  rtx first_elt = builder.elt (0);
  rtx last_elt = builder.elt (nelts - 1);
  rtx low = expand_vector_broadcast (mode, first_elt);
  rtx high = expand_vector_broadcast (mode, last_elt);
  insn_code icode = code_for_pred_slide (UNSPEC_VSLIDEUP, mode);
  rtx ops[] = {target, low, high, gen_int_mode (nelts / 2, Pmode)};
  emit_vlmax_insn (icode, SLIDEUP_OP_MERGE, ops);
}
/* Use merge approach to merge a scalar into a vector.
     v = {a, a, a, a, a, a, b, b}

     v1 = {a, a, a, a, a, a, a, a}
     scalar = b
     mask = {0, 0, 0, 0, 0, 0, 1, 1}  */
static void
expand_vector_init_merge_combine_sequence (rtx target,
                                           const rvv_builder &builder)
{
  machine_mode mode = GET_MODE (target);
  machine_mode imode = builder.int_mode ();
  machine_mode mmode = builder.mask_mode ();
  int nelts = builder.full_nelts ().to_constant ();
  int leading_ndups = builder.count_dups (0, nelts - 1, 1);
  if ((leading_ndups > 255 && GET_MODE_INNER (imode) == QImode)
      || riscv_get_v_regno_alignment (imode) > 1)
    imode = get_vector_mode (HImode, nelts).require ();

  /* Generate vid = { 0, 1, 2, ..., n }.  */
  rtx vid = gen_reg_rtx (imode);
  expand_vec_series (vid, const0_rtx, const1_rtx);

  /* Generate mask.  */
  rtx mask = gen_reg_rtx (mmode);
  insn_code icode = code_for_pred_cmp_scalar (imode);
  rtx index = gen_int_mode (leading_ndups - 1, builder.inner_int_mode ());
  rtx dup_rtx = gen_rtx_VEC_DUPLICATE (imode, index);
  /* vmsgtu.vi/vmsgtu.vx.  */
  rtx cmp = gen_rtx_fmt_ee (GTU, mmode, vid, dup_rtx);
  rtx sel = builder.elt (nelts - 1);
  rtx mask_ops[] = {mask, cmp, vid, index};
  emit_vlmax_insn (icode, COMPARE_OP, mask_ops);

  /* Duplicate the first elements.  */
  rtx dup = expand_vector_broadcast (mode, builder.elt (0));
  /* Merge scalar into vector according to mask.  */
  rtx merge_ops[] = {target, dup, sel, mask};
  icode = code_for_pred_merge_scalar (mode);
  emit_vlmax_insn (icode, MERGE_OP, merge_ops);
}
/* Initialize register TARGET from the elements in PARALLEL rtx VALS.  */
void
expand_vec_init (rtx target, rtx vals)
{
  machine_mode mode = GET_MODE (target);
  int nelts = XVECLEN (vals, 0);

  rvv_builder v (mode, nelts, 1);
  for (int i = 0; i < nelts; i++)
    v.quick_push (XVECEXP (vals, 0, i));
  v.finalize ();

  /* If the sequence is v = { a, a, a, a } just broadcast an element.  */
  if (v.is_repeating_sequence ())
    {
      machine_mode mode = GET_MODE (target);
      rtx dup = expand_vector_broadcast (mode, v.elt (0));
      emit_move_insn (target, dup);
      return;
    }

  if (nelts > 3)
    {
      /* Case 1: Convert v = { a, b, a, b } into v = { ab, ab }.  */
      if (v.can_duplicate_repeating_sequence_p ())
        {
          rtx ele = v.get_merged_repeating_sequence ();
          rtx dup = expand_vector_broadcast (v.new_mode (), ele);
          emit_move_insn (target, gen_lowpart (mode, dup));
          return;
        }

      /* Case 2: Optimize repeating sequence cases that Case 1 cannot
         handle when it is profitable.  For example:
         ELEMENT BITSIZE = 64.
         v = {a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b}.
         We can't find a vector mode for "ab" which would have to be a
         128-bit element to duplicate.  */
      if (v.repeating_sequence_use_merge_profitable_p ())
        {
          expand_vector_init_merge_repeating_sequence (target, v);
          return;
        }

      /* Case 3: Optimize combine sequence.
         E.g. v = {a, a, a, a, a, a, a, a, b, b, b, b, b, b, b, b}.
         Combine:
         v1 = {a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a}.
         and
         v2 = {b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b}.
         by slideup.  */
      if (v.combine_sequence_use_slideup_profitable_p ())
        {
          expand_vector_init_slideup_combine_sequence (target, v);
          return;
        }

      /* Case 4: Optimize combine sequence.
         E.g. v = {a, a, a, a, a, a, a, a, a, a, a, b, b, b, b, b}.

         Generate vector:
         v = {a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a}.

         Generate mask:
         mask = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1}.

         Merge b into v by mask:
         v = {a, a, a, a, a, a, a, a, a, a, a, b, b, b, b, b}.  */
      if (v.combine_sequence_use_merge_profitable_p ())
        {
          expand_vector_init_merge_combine_sequence (target, v);
          return;
        }
    }

  /* Optimize trailing same elements sequence:
     v = {y, y2, y3, y4, y5, x, x, x, x, x, x, x, x, x, x, x};  */
  if (!expand_vector_init_trailing_same_elem (target, v, nelts))
    /* Handle the common situation by vslide1down.  This function can handle
       any case of vec_init<mode>; only the cases that are not optimized above
       fall through here.  */
    expand_vector_init_insert_elems (target, v, nelts);
}
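
/* Dispatch example (illustrative) for an 8-element vector of 32-bit ints:
     { x, x, x, x, x, x, x, x }     -> plain broadcast;
     { a, b, a, b, a, b, a, b }     -> Case 1, broadcast of the 64-bit "ab";
     { a, a, a, a, b, b, b, b }     -> Case 3, slideup combine;
     { y0, y1, y2, x, x, x, x, x }  -> trailing-same-element path;
   anything else falls through to the vslide1down insertion loop.  */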
/* Get the insn code for the corresponding comparison.  */
static insn_code
get_cmp_insn_code (rtx_code code, machine_mode mode)
{
  insn_code icode;
  switch (code)
    {
    case EQ:
    case NE:
    case LE:
    case LEU:
    case GT:
    case GTU:
    case LTGT:
      icode = code_for_pred_cmp (mode);
      break;
    case LT:
    case LTU:
    case GE:
    case GEU:
      if (FLOAT_MODE_P (mode))
        icode = code_for_pred_cmp (mode);
      else
        icode = code_for_pred_ltge (mode);
      break;
    default:
      gcc_unreachable ();
    }
  return icode;
}
/* This hook gives the vectorizer more vector mode options.  We want it to not
   only try modes with the maximum number of units a full vector can hold but
   for example also half the number of units for a smaller elements size.
   Such vectors can be promoted to a full vector of widened elements
   (still with the same number of elements, essentially vectorizing at a
   fixed number of units rather than a fixed number of bytes).  */
unsigned int
autovectorize_vector_modes (vector_modes *modes, bool)
{
  if (autovec_use_vlmax_p ())
    {
      poly_uint64 full_size = BYTES_PER_RISCV_VECTOR * TARGET_MAX_LMUL;

      /* Start with a RVV<LMUL>QImode where LMUL is the number of units that
         fit a whole vector.
         Then try LMUL = nunits / 2, nunits / 4 and nunits / 8 which
         is guided by the extensions we have available (vf2, vf4 and vf8).

         - full_size: Try using full vectors for all element types.
         - full_size / 2:
           Try using 16-bit containers for 8-bit elements and full vectors
           for wider elements.
         - full_size / 4:
           Try using 32-bit containers for 8-bit and 16-bit elements and
           full vectors for wider elements.
         - full_size / 8:
           Try using 64-bit containers for all element types.  */
      static const int rvv_factors[] = {1, 2, 4, 8, 16, 32, 64};
      for (unsigned int i = 0; i < sizeof (rvv_factors) / sizeof (int); i++)
        {
          poly_uint64 units;
          machine_mode mode;
          if (can_div_trunc_p (full_size, rvv_factors[i], &units)
              && get_vector_mode (QImode, units).exists (&mode))
            modes->safe_push (mode);
        }
    }
  /* Push all VLSmodes according to TARGET_MIN_VLEN.  */
  unsigned int i = 0;
  unsigned int base_size = TARGET_MIN_VLEN * TARGET_MAX_LMUL / 8;
  unsigned int size = base_size;
  machine_mode mode;
  while (size > 0 && get_vector_mode (QImode, size).exists (&mode))
    {
      if (vls_mode_valid_p (mode))
        modes->safe_push (mode);

      i++;
      size = base_size / (1U << i);
    }
  /* Enable LOOP_VINFO comparison in COST model.  */
  return VECT_COMPARE_COSTS;
}
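
/* Example (illustrative): for TARGET_MIN_VLEN = 128 and TARGET_MAX_LMUL = 8
   the VLA loop above pushes the QImode modes for LMUL = 8, 4, 2, 1, 1/2, 1/4
   and 1/8 (RVVM8QI down to RVVMF8QI), and the VLS loop pushes V128QI, V64QI,
   ... down to V1QI, subject to vls_mode_valid_p.  */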
/* Return true if we can find the related MODE according to default LMUL.  */
static bool
can_find_related_mode_p (machine_mode vector_mode, scalar_mode element_mode,
                         poly_uint64 *nunits)
{
  if (!autovec_use_vlmax_p ())
    return false;
  if (riscv_v_ext_vector_mode_p (vector_mode)
      && multiple_p (BYTES_PER_RISCV_VECTOR * TARGET_MAX_LMUL,
                     GET_MODE_SIZE (element_mode), nunits))
    return true;
  if (riscv_v_ext_vls_mode_p (vector_mode)
      && multiple_p (TARGET_MIN_VLEN * TARGET_MAX_LMUL,
                     GET_MODE_SIZE (element_mode), nunits))
    return true;
  return false;
}
/* If the given VECTOR_MODE is an RVV mode, first get the largest number
   of units that fit into a full vector at the given ELEMENT_MODE.
   We will have the vectorizer call us with a successively decreasing
   number of units (as specified in autovectorize_vector_modes).
   The starting mode is always the one specified by preferred_simd_mode.  */
opt_machine_mode
vectorize_related_mode (machine_mode vector_mode, scalar_mode element_mode,
                        poly_uint64 nunits)
{
  /* TODO: We will support RVV VLS auto-vectorization mode in the future.  */
  poly_uint64 min_units;
  if (can_find_related_mode_p (vector_mode, element_mode, &min_units))
    {
      machine_mode rvv_mode;
      if (maybe_ne (nunits, 0U))
        {
          /* If we were given a number of units NUNITS, try to find an
             RVV vector mode of inner mode ELEMENT_MODE with the same
             number of units.  */
          if (multiple_p (min_units, nunits)
              && get_vector_mode (element_mode, nunits).exists (&rvv_mode))
            return rvv_mode;
        }
      else
        {
          /* Look for a vector mode with the same number of units as the
             VECTOR_MODE we were given.  We keep track of the minimum
             number of units so far which determines the smallest necessary
             but largest possible, suitable mode for vectorization.  */
          min_units = ordered_min (min_units, GET_MODE_SIZE (vector_mode));
          if (get_vector_mode (element_mode, min_units).exists (&rvv_mode))
            return rvv_mode;
        }
    }

  return default_vectorize_related_mode (vector_mode, element_mode, nunits);
}
/* Expand an RVV comparison.  */
void
expand_vec_cmp (rtx target, rtx_code code, rtx op0, rtx op1, rtx mask,
                rtx maskoff)
{
  machine_mode mask_mode = GET_MODE (target);
  machine_mode data_mode = GET_MODE (op0);
  insn_code icode = get_cmp_insn_code (code, data_mode);

  if (code == LTGT)
    {
      rtx lt = gen_reg_rtx (mask_mode);
      rtx gt = gen_reg_rtx (mask_mode);
      expand_vec_cmp (lt, LT, op0, op1, mask, maskoff);
      expand_vec_cmp (gt, GT, op0, op1, mask, maskoff);
      icode = code_for_pred (IOR, mask_mode);
      rtx ops[] = {target, lt, gt};
      emit_vlmax_insn (icode, BINARY_MASK_OP, ops);
      return;
    }

  rtx cmp = gen_rtx_fmt_ee (code, mask_mode, op0, op1);
  if (!mask && !maskoff)
    {
      rtx ops[] = {target, cmp, op0, op1};
      emit_vlmax_insn (icode, COMPARE_OP, ops);
    }
  else
    {
      rtx ops[] = {target, mask, maskoff, cmp, op0, op1};
      emit_vlmax_insn (icode, COMPARE_OP_MU, ops);
    }
}
/* Expand an RVV floating-point comparison:

   If CAN_INVERT_P is true, the caller can also handle inverted results;
   return true if the result is in fact inverted.  */
bool
expand_vec_cmp_float (rtx target, rtx_code code, rtx op0, rtx op1,
                      bool can_invert_p)
{
  machine_mode mask_mode = GET_MODE (target);
  machine_mode data_mode = GET_MODE (op0);

  /* If can_invert_p = true:
     It suffices to implement a u>= b as !(a < b) but with the NaNs masked off:

       vmfeq.vv    v0, va, va
       vmfeq.vv    v1, vb, vb
       vmand.mm    v0, v0, v1
       vmflt.vv    v0, va, vb, v0.t
       vmnot.m     v0, v0

     And, if !HONOR_SNANS, then you can remove the vmand.mm by masking the
     second vmfeq.vv:

       vmfeq.vv    v0, va, va
       vmfeq.vv    v0, vb, vb, v0.t
       vmflt.vv    v0, va, vb, v0.t
       vmnot.m     v0, v0

     If can_invert_p = false:

     # Example of implementing isgreater()
     vmfeq.vv v0, va, va        # Only set where A is not NaN.
     vmfeq.vv v1, vb, vb        # Only set where B is not NaN.
     vmand.mm v0, v0, v1        # Only set where A and B are ordered,
     vmfgt.vv v0, va, vb, v0.t  #  so only set flags on ordered values.  */

  rtx eq0 = gen_reg_rtx (mask_mode);
  rtx eq1 = gen_reg_rtx (mask_mode);
  switch (code)
    {
    case EQ:
    case NE:
    case LT:
    case LE:
    case GT:
    case GE:
    case LTGT:
      /* There is native support for the comparison.  */
      expand_vec_cmp (target, code, op0, op1);
      return false;
    case UNEQ:
    case ORDERED:
    case UNORDERED:
    case UNLT:
    case UNLE:
    case UNGT:
    case UNGE:
      /* vmfeq.vv v0, va, va  */
      expand_vec_cmp (eq0, EQ, op0, op0);
      if (HONOR_SNANS (data_mode))
        {
          /*
             vmfeq.vv    v1, vb, vb
             vmand.mm    v0, v0, v1
          */
          expand_vec_cmp (eq1, EQ, op1, op1);
          insn_code icode = code_for_pred (AND, mask_mode);
          rtx ops[] = {eq0, eq0, eq1};
          emit_vlmax_insn (icode, BINARY_MASK_OP, ops);
        }
      else
        {
          /* vmfeq.vv v0, vb, vb, v0.t  */
          expand_vec_cmp (eq0, EQ, op1, op1, eq0, eq0);
        }
      break;
    default:
      gcc_unreachable ();
    }

  if (code == ORDERED)
    {
      emit_move_insn (target, eq0);
      return false;
    }

  /* There is native support for the inverse comparison.  */
  code = reverse_condition_maybe_unordered (code);
  if (code == ORDERED)
    emit_move_insn (target, eq0);
  else
    expand_vec_cmp (eq0, code, op0, op1, eq0, eq0);

  if (can_invert_p)
    {
      emit_move_insn (target, eq0);
      return true;
    }

  /* We use one_cmpl<mode>2 to make Combine PASS to combine mask instructions
     into: vmand.mm/vmnor.mm/vmnand.mm/vmxnor.mm.  */
  emit_insn (gen_rtx_SET (target, gen_rtx_NOT (mask_mode, eq0)));
  return false;
}
/* Modulo all SEL indices to ensure they are all in range of [0, MAX_SEL].
   MAX_SEL is nunits - 1 if rtx_equal_p (op0, op1).  Otherwise, it is
   2 * nunits - 1.  */
static rtx
modulo_sel_indices (rtx op0, rtx op1, rtx sel)
{
  rtx sel_mod;
  machine_mode sel_mode = GET_MODE (sel);
  poly_uint64 nunits = GET_MODE_NUNITS (sel_mode);
  poly_uint64 max_sel = rtx_equal_p (op0, op1) ? nunits - 1 : 2 * nunits - 1;
  /* If SEL is a variable-length CONST_VECTOR, we don't need to modulo it.
     Or if SEL is constant-length and within [0, MAX_SEL], no need to modulo
     the indices either.  */
  if (CONST_VECTOR_P (sel)
      && (!nunits.is_constant () || const_vec_all_in_range_p (sel, 0, max_sel)))
    sel_mod = sel;
  else
    {
      rtx mod = gen_const_vector_dup (sel_mode, max_sel);
      sel_mod
        = expand_simple_binop (sel_mode, AND, sel, mod, NULL, 0, OPTAB_DIRECT);
    }
  return sel_mod;
}
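
/* Example (illustrative): for nunits = 4 with op0 == op1, MAX_SEL = 3 and a
   selector { 4, 5, 6, 7 } is reduced by the AND above to { 0, 1, 2, 3 },
   matching the wrap-around semantics that vec_perm requires.  */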
/* Implement vec_perm<mode>.  */
void
expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
{
  machine_mode data_mode = GET_MODE (target);
  machine_mode sel_mode = GET_MODE (sel);
  poly_uint64 nunits = GET_MODE_NUNITS (sel_mode);

  /* Check if the sel only references the first values vector.  If each select
     index is in range of [0, nunits - 1], a single vrgather instruction is
     enough.  Since we will use vrgatherei16.vv for variable-length vectors,
     it is never out of range and we don't need to modulo the index.  */
  if (nunits.is_constant () && const_vec_all_in_range_p (sel, 0, nunits - 1))
    {
      emit_vlmax_gather_insn (target, op0, sel);
      return;
    }

  /* Check if all the indices are the same.  */
  rtx elt;
  if (const_vec_duplicate_p (sel, &elt))
    {
      poly_uint64 value = rtx_to_poly_int64 (elt);
      rtx op = op0;
      if (maybe_gt (value, nunits - 1))
        {
          sel = gen_const_vector_dup (sel_mode, value - nunits);
          op = op1;
        }
      emit_vlmax_gather_insn (target, op, sel);
      return;
    }

  /* Note: vec_perm indices are supposed to wrap when they go beyond the
     size of the two value vectors, i.e. the upper bits of the indices
     are effectively ignored.  RVV vrgather instead produces 0 for any
     out-of-range indices, so we need to modulo all the vec_perm indices
     to ensure they are all in range of [0, nunits - 1] when op0 == op1
     or all in range of [0, 2 * nunits - 1] when op0 != op1.  */
  rtx sel_mod = modulo_sel_indices (op0, op1, sel);

  /* Check if the two values vectors are the same.  */
  if (rtx_equal_p (op0, op1))
    {
      emit_vlmax_gather_insn (target, op0, sel_mod);
      return;
    }

  /* The following sequence handles the case that:
     __builtin_shufflevector (vec1, vec2, index...), where the index can be
     any value in range of [0, 2 * nunits - 1].  */
  machine_mode mask_mode;
  mask_mode = get_mask_mode (data_mode);
  rtx mask = gen_reg_rtx (mask_mode);
  rtx max_sel = gen_const_vector_dup (sel_mode, nunits);

  /* Step 1: generate a mask that selects everything >= nunits.  */
  expand_vec_cmp (mask, GEU, sel_mod, max_sel);

  /* Step 2: gather every op0 value indexed by sel into target;
     we don't need to care about the result of any element
     whose index >= nunits.  */
  emit_vlmax_gather_insn (target, op0, sel_mod);

  /* Step 3: shift the range from (nunits, max_of_mode] to
     [0, max_of_mode - nunits].  */
  rtx tmp = gen_reg_rtx (sel_mode);
  rtx ops[] = {tmp, sel_mod, max_sel};
  emit_vlmax_insn (code_for_pred (MINUS, sel_mode), BINARY_OP, ops);

  /* Step 4: gather those into the previously masked-out elements
     of target.  */
  emit_vlmax_masked_gather_mu_insn (target, op1, tmp, mask);
}
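
/* Worked example (illustrative): with nunits = 4, op0 = {a0,a1,a2,a3},
   op1 = {b0,b1,b2,b3} and sel = {0, 5, 2, 7}, step 1 produces the mask
   {0, 1, 0, 1}; step 2 gathers {a0, x, a2, x} from op0; step 3 rewrites the
   selector so the masked lanes read 1 and 3; and step 4 gathers b1 and b3
   into those lanes, giving {a0, b1, a2, b3}.  */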
3013 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST for RVV. */
3015 /* vec_perm support. */
3017 struct expand_vec_perm_d
3019 rtx target
, op0
, op1
;
3020 vec_perm_indices perm
;
3022 machine_mode op_mode
;
3027 /* Return the appropriate index mode for gather instructions. */
3029 get_gather_index_mode (struct expand_vec_perm_d
*d
)
3031 machine_mode sel_mode
= related_int_vector_mode (d
->vmode
).require ();
3032 poly_uint64 nunits
= GET_MODE_NUNITS (d
->vmode
);
3034 if (GET_MODE_INNER (d
->vmode
) == QImode
)
3036 if (nunits
.is_constant ())
3038 /* If indice is LMUL8 CONST_VECTOR and any element value
3039 exceed the range of 0 ~ 255, Forbid such permutation
3040 since we need vector HI mode to hold such indice and
3041 we don't have it. */
3042 if (!d
->perm
.all_in_range_p (0, 255)
3043 && !get_vector_mode (HImode
, nunits
).exists (&sel_mode
))
3044 return opt_machine_mode ();
3048 /* Permuting two SEW8 variable-length vectors need vrgatherei16.vv.
3049 Otherwise, it could overflow the index range. */
3050 if (!get_vector_mode (HImode
, nunits
).exists (&sel_mode
))
3051 return opt_machine_mode ();
3054 else if (riscv_get_v_regno_alignment (sel_mode
) > 1
3055 && GET_MODE_INNER (sel_mode
) != HImode
)
3056 sel_mode
= get_vector_mode (HImode
, nunits
).require ();
3060 /* Recognize the patterns that we can use merge operation to shuffle the
3061 vectors. The value of Each element (index i) in selector can only be
3062 either i or nunits + i. We will check the pattern is actually monotonic.
3065 v = VEC_PERM_EXPR (v0, v1, selector),
3066 selector = { 0, nunits + 1, 2, nunits + 3, 4, nunits + 5, ... }
3068 We can transform such pattern into:
3070 v = vcond_mask (v0, v1, mask),
3071 mask = { 0, 1, 0, 1, 0, 1, ... }. */
3074 shuffle_merge_patterns (struct expand_vec_perm_d
*d
)
3076 machine_mode vmode
= d
->vmode
;
3077 machine_mode sel_mode
= related_int_vector_mode (vmode
).require ();
3078 int n_patterns
= d
->perm
.encoding ().npatterns ();
3079 poly_int64 vec_len
= d
->perm
.length ();
3081 for (int i
= 0; i
< n_patterns
; ++i
)
3082 if (!known_eq (d
->perm
[i
], i
) && !known_eq (d
->perm
[i
], vec_len
+ i
))
3085 /* Check the pattern is monotonic here, otherwise, return false. */
3086 for (int i
= n_patterns
; i
< n_patterns
* 2; i
++)
3087 if (!d
->perm
.series_p (i
, n_patterns
, i
, n_patterns
)
3088 && !d
->perm
.series_p (i
, n_patterns
, vec_len
+ i
, n_patterns
))
3091 /* We need to use precomputed mask for such situation and such mask
3092 can only be computed in compile-time known size modes. */
3093 bool indices_fit_selector_p
3094 = GET_MODE_BITSIZE (GET_MODE_INNER (vmode
)) > 8 || known_lt (vec_len
, 256);
3095 if (!indices_fit_selector_p
&& !vec_len
.is_constant ())
3101 machine_mode mask_mode
= get_mask_mode (vmode
);
3102 rtx mask
= gen_reg_rtx (mask_mode
);
3104 if (indices_fit_selector_p
)
3106 /* MASK = SELECTOR < NUNITS ? 1 : 0. */
3107 rtx sel
= vec_perm_indices_to_rtx (sel_mode
, d
->perm
);
3108 rtx x
= gen_int_mode (vec_len
, GET_MODE_INNER (sel_mode
));
3109 insn_code icode
= code_for_pred_cmp_scalar (sel_mode
);
3110 rtx cmp
= gen_rtx_fmt_ee (LTU
, mask_mode
, sel
, x
);
3111 rtx ops
[] = {mask
, cmp
, sel
, x
};
3112 emit_vlmax_insn (icode
, COMPARE_OP
, ops
);
3116 /* For EEW8 and NUNITS may be larger than 255, we can't use vmsltu
3117 directly to generate the selector mask, instead, we can only use
3120 E.g. selector = <0, 257, 2, 259> for EEW8 vector with NUNITS = 256, we
3121 don't have a QImode scalar register to hold larger than 255.
3122 We also cannot hold that in a vector QImode register if LMUL = 8, and,
3123 since there is no larger HI mode vector we cannot create a larger
3126 As the mask is a simple {0, 1, ...} pattern and the length is known we
3127 can store it in a scalar register and broadcast it to a mask register.
3129 gcc_assert (vec_len
.is_constant ());
3130 int size
= CEIL (GET_MODE_NUNITS (mask_mode
).to_constant (), 8);
3131 machine_mode mode
= get_vector_mode (QImode
, size
).require ();
3132 rtx tmp
= gen_reg_rtx (mode
);
3133 rvv_builder
v (mode
, 1, size
);
3134 for (int i
= 0; i
< vec_len
.to_constant () / 8; i
++)
3137 for (int j
= 0; j
< 8; j
++)
3139 int index
= i
* 8 + j
;
3140 if (known_lt (d
->perm
[index
], 256))
3143 v
.quick_push (gen_int_mode (value
, QImode
));
3145 emit_move_insn (tmp
, v
.build ());
3146 emit_move_insn (mask
, gen_lowpart (mask_mode
, tmp
));
3149 /* TARGET = MASK ? OP0 : OP1. */
3150 /* swap op0 and op1 since the order is opposite to pred_merge. */
3151 rtx ops2
[] = {d
->target
, d
->op1
, d
->op0
, mask
};
3152 emit_vlmax_insn (code_for_pred_merge (vmode
), MERGE_OP
, ops2
);
3156 /* Recognize the consecutive index that we can use a single
3157 vrgather.v[x|i] to shuffle the vectors.
3159 e.g. short[8] = VEC_PERM_EXPR <a, a, {0,1,0,1,0,1,0,1}>
3160 Use SEW = 32, index = 1 vrgather.vi to get the result. */
3162 shuffle_consecutive_patterns (struct expand_vec_perm_d
*d
)
3164 machine_mode vmode
= d
->vmode
;
3165 scalar_mode smode
= GET_MODE_INNER (vmode
);
3166 poly_int64 vec_len
= d
->perm
.length ();
3169 if (!vec_len
.is_constant () || !d
->perm
[0].is_constant (&elt
))
3171 int vlen
= vec_len
.to_constant ();
3173 /* Compute the last element index of consecutive pattern from the leading
3174 consecutive elements. */
3175 int last_consecutive_idx
= -1;
3176 int consecutive_num
= -1;
3177 for (int i
= 1; i
< vlen
; i
++)
3179 if (maybe_ne (d
->perm
[i
], d
->perm
[i
- 1] + 1))
3181 last_consecutive_idx
= i
;
3182 consecutive_num
= last_consecutive_idx
+ 1;
3185 int new_vlen
= vlen
/ consecutive_num
;
3186 if (last_consecutive_idx
< 0 || consecutive_num
== vlen
3187 || !pow2p_hwi (consecutive_num
) || !pow2p_hwi (new_vlen
))
3189 /* VEC_PERM <..., (index, index + 1, ... index + consecutive_num - 1)>.
3190 All elements of index, index + 1, ... index + consecutive_num - 1 should
3191 locate at the same vector. */
3192 if (maybe_ge (d
->perm
[0], vec_len
)
3193 != maybe_ge (d
->perm
[last_consecutive_idx
], vec_len
))
3195 /* If a vector has 8 elements. We allow optimizations on consecutive
3196 patterns e.g. <0, 1, 2, 3, 0, 1, 2, 3> or <4, 5, 6, 7, 4, 5, 6, 7>.
3197 Other patterns like <2, 3, 4, 5, 2, 3, 4, 5> are not feasible patterns
3199 if (d
->perm
[0].to_constant () % consecutive_num
!= 0)
3201 unsigned int container_bits
= consecutive_num
* GET_MODE_BITSIZE (smode
);
3202 if (container_bits
> 64)
3204 else if (container_bits
== 64)
3206 if (!TARGET_VECTOR_ELEN_64
)
3208 else if (FLOAT_MODE_P (smode
) && !TARGET_VECTOR_ELEN_FP_64
)
3212 /* Check the rest of elements are the same consecutive pattern. */
3213 for (int i
= consecutive_num
; i
< vlen
; i
++)
3214 if (maybe_ne (d
->perm
[i
], d
->perm
[i
% consecutive_num
]))
3217 if (FLOAT_MODE_P (smode
))
3218 smode
= float_mode_for_size (container_bits
).require ();
3220 smode
= int_mode_for_size (container_bits
, 0).require ();
3221 if (!get_vector_mode (smode
, new_vlen
).exists (&vmode
))
3223 machine_mode sel_mode
= related_int_vector_mode (vmode
).require ();
3229 int index
= elt
/ consecutive_num
;
3230 if (index
>= new_vlen
)
3231 index
= index
- new_vlen
;
3232 rtx sel
= gen_const_vector_dup (sel_mode
, index
);
3233 rtx op
= elt
>= vlen
? d
->op0
: d
->op1
;
3234 emit_vlmax_gather_insn (gen_lowpart (vmode
, d
->target
),
3235 gen_lowpart (vmode
, op
), sel
);
3239 /* Recognize the patterns that we can use compress operation to shuffle the
3240 vectors. The perm selector of compress pattern is divided into 2 part:
3241 The first part is the random index number < NUNITS.
3242 The second part is consecutive last N index number >= NUNITS.
3245 v = VEC_PERM_EXPR (v0, v1, selector),
3246 selector = { 0, 2, 6, 7 }
3248 We can transform such pattern into:
3250 op1 = vcompress (op0, mask)
3251 mask = { 1, 0, 1, 0 }
3255 shuffle_compress_patterns (struct expand_vec_perm_d
*d
)
3257 machine_mode vmode
= d
->vmode
;
3258 poly_int64 vec_len
= d
->perm
.length ();
3260 if (!vec_len
.is_constant ())
3263 int vlen
= vec_len
.to_constant ();
3265 /* It's not worthwhile the compress pattern has elements < 4
3266 and we can't modulo indices for compress pattern. */
3267 if (known_ge (d
->perm
[vlen
- 1], vlen
* 2) || vlen
< 4)
3270 /* Compress pattern doesn't work for one vector. */
3271 if (d
->one_vector_p
)
3274 /* Compress point is the point that all elements value with index i >=
3275 compress point of the selector are all consecutive series increasing and
3276 each selector value >= NUNITS. In this case, we could compress all elements
3277 of i < compress point into the op1. */
3278 int compress_point
= -1;
3279 for (int i
= 0; i
< vlen
; i
++)
3281 if (compress_point
< 0 && known_ge (d
->perm
[i
], vec_len
))
3288 /* We don't apply compress approach if we can't find the compress point. */
3289 if (compress_point
< 0)
3292 /* We can only apply compress approach when all index values from 0 to
3293 compress point are increasing. */
3294 for (int i
= 1; i
< compress_point
; i
++)
3295 if (maybe_le (d
->perm
[i
], d
->perm
[i
- 1]))
3298 /* It must be series increasing from compress point. */
3299 for (int i
= 1 + compress_point
; i
< vlen
; i
++)
3300 if (maybe_ne (d
->perm
[i
], d
->perm
[i
- 1] + 1))
3307 /* Check whether we need to slideup op1 to apply compress approach.
3309 E.g. For index = { 0, 2, 6, 7}, since d->perm[i - 1] = 7 which
3310 is 2 * NUNITS - 1, so we don't need to slide up.
3312 For index = { 0, 2, 5, 6}, we need to slide op1 up before
3313 we apply compress approach. */
3314 bool need_slideup_p
= maybe_ne (d
->perm
[vlen
- 1], 2 * vec_len
- 1)
3315 && !const_vec_duplicate_p (d
->op1
);
3317 /* If we leave it directly be handled by general gather,
3318 the code sequence will be:
3319 VECTOR LOAD selector
3320 GEU mask, selector, NUNITS
3321 GATHER dest, op0, selector
3322 SUB selector, selector, NUNITS
3323 GATHER dest, op1, selector, mask
3324 Each ALU operation is considered as COST = 1 and VECTOR LOAD is considered
3325 as COST = 4. So, we consider the general gather handling COST = 9.
3326 TODO: This cost is not accurate, we can adjust it by tune info. */
3327 int general_cost
= 9;
3329 /* If we can use compress approach, the code sequence will be:
3331 COMPRESS op1, op0, mask
3332 If it needs slide up, it will be:
3335 COMPRESS op1, op0, mask
3336 By default, mask load COST = 2.
3337 TODO: This cost is not accurate, we can adjust it by tune info. */
3338 int compress_cost
= 4;
3340 if (general_cost
<= compress_cost
)
3343 /* Build a mask that is true when selector element is true. */
3344 machine_mode mask_mode
= get_mask_mode (vmode
);
3345 rvv_builder
builder (mask_mode
, vlen
, 1);
3346 for (int i
= 0; i
< vlen
; i
++)
3348 bool is_compress_index
= false;
3349 for (int j
= 0; j
< compress_point
; j
++)
3351 if (known_eq (d
->perm
[j
], i
))
3353 is_compress_index
= true;
3357 if (is_compress_index
)
3358 builder
.quick_push (CONST1_RTX (BImode
));
3360 builder
.quick_push (CONST0_RTX (BImode
));
3362 rtx mask
= force_reg (mask_mode
, builder
.build ());
3367 int slideup_cnt
= vlen
- (d
->perm
[vlen
- 1].to_constant () % vlen
) - 1;
3368 merge
= gen_reg_rtx (vmode
);
3369 rtx ops
[] = {merge
, d
->op1
, gen_int_mode (slideup_cnt
, Pmode
)};
3370 insn_code icode
= code_for_pred_slide (UNSPEC_VSLIDEUP
, vmode
);
3371 emit_vlmax_insn (icode
, BINARY_OP
, ops
);
3374 insn_code icode
= code_for_pred_compress (vmode
);
3375 rtx ops
[] = {d
->target
, merge
, d
->op0
, mask
};
3376 emit_vlmax_insn (icode
, COMPRESS_OP_MERGE
, ops
);
3380 /* Recognize decompress patterns:
3382 1. VEC_PERM_EXPR op0 and op1
3383 with isel = { 0, nunits, 1, nunits + 1, ... }.
3384 Decompress op0 and op1 vector with the mask = { 0, 1, 0, 1, ... }.
3386 2. VEC_PERM_EXPR op0 and op1
3387 with isel = { 1/2 nunits, 3/2 nunits, 1/2 nunits+1, 3/2 nunits+1,... }.
3388 Slide down op0 and op1 with OFFSET = 1/2 nunits.
3389 Decompress op0 and op1 vector with the mask = { 0, 1, 0, 1, ... }.
3392 shuffle_decompress_patterns (struct expand_vec_perm_d
*d
)
3394 poly_uint64 nelt
= d
->perm
.length ();
3395 machine_mode mask_mode
= get_mask_mode (d
->vmode
);
3397 /* For constant size indices, we dont't need to handle it here.
3398 Just leave it to vec_perm<mode>. */
3399 if (d
->perm
.length ().is_constant ())
3402 poly_uint64 first
= d
->perm
[0];
3403 if ((maybe_ne (first
, 0U) && maybe_ne (first
* 2, nelt
))
3404 || !d
->perm
.series_p (0, 2, first
, 1)
3405 || !d
->perm
.series_p (1, 2, first
+ nelt
, 1))
3408 /* Permuting two SEW8 variable-length vectors need vrgatherei16.vv.
3409 Otherwise, it could overflow the index range. */
3410 machine_mode sel_mode
= related_int_vector_mode (d
->vmode
).require ();
3411 if (GET_MODE_INNER (d
->vmode
) == QImode
3412 && !get_vector_mode (HImode
, nelt
).exists (&sel_mode
))
3420 if (known_eq (first
, 0U))
3427 op0
= gen_reg_rtx (d
->vmode
);
3428 op1
= gen_reg_rtx (d
->vmode
);
3429 insn_code icode
= code_for_pred_slide (UNSPEC_VSLIDEDOWN
, d
->vmode
);
3430 rtx ops0
[] = {op0
, d
->op0
, gen_int_mode (first
, Pmode
)};
3431 rtx ops1
[] = {op1
, d
->op1
, gen_int_mode (first
, Pmode
)};
3432 emit_vlmax_insn (icode
, BINARY_OP
, ops0
);
3433 emit_vlmax_insn (icode
, BINARY_OP
, ops1
);
3435 /* Generate { 0, 1, .... } mask. */
3436 rtx vid
= gen_reg_rtx (sel_mode
);
3437 rtx vid_repeat
= gen_reg_rtx (sel_mode
);
3438 expand_vec_series (vid
, const0_rtx
, const1_rtx
);
3439 rtx and_ops
[] = {vid_repeat
, vid
, const1_rtx
};
3440 emit_vlmax_insn (code_for_pred_scalar (AND
, sel_mode
), BINARY_OP
, and_ops
);
3441 rtx const_vec
= gen_const_vector_dup (sel_mode
, 1);
3442 rtx mask
= gen_reg_rtx (mask_mode
);
3443 expand_vec_cmp (mask
, EQ
, vid_repeat
, const_vec
);
3444 emit_vlmax_decompress_insn (d
->target
, op0
, op1
, mask
);
3449 shuffle_bswap_pattern (struct expand_vec_perm_d
*d
)
3452 unsigned i
, size
, step
;
3454 if (!d
->one_vector_p
|| !d
->perm
[0].is_constant (&diff
) || !diff
)
3458 size
= step
* GET_MODE_UNIT_BITSIZE (d
->vmode
);
3466 /* We will have VEC_PERM_EXPR after rtl expand when invoking
3467 __builtin_bswap. It will generate about 9 instructions in
3468 loop as below, no matter it is bswap16, bswap32 or bswap64.
3474 5 vrgatherei16.vv v1,v4,v2
3481 But for bswap16 we may have a even simple code gen, which
3482 has only 7 instructions in loop as below.
3493 Unfortunately, the instructions in loop will grow to 13 and 24
3494 for bswap32 and bswap64. Thus, we will leverage vrgather (9 insn)
3495 for both the bswap64 and bswap32, but take shift and or (7 insn)
3502 for (i
= 0; i
< step
; i
++)
3503 if (!d
->perm
.series_p (i
, step
, diff
- i
, step
))
3506 /* Disable when nunits < 4 since the later generic approach
3507 is more profitable on BSWAP. */
3508 if (!known_gt (GET_MODE_NUNITS (d
->vmode
), 2))
3514 machine_mode vhi_mode
;
3515 poly_uint64 vhi_nunits
= exact_div (GET_MODE_NUNITS (d
->vmode
), 2);
3517 if (!get_vector_mode (HImode
, vhi_nunits
).exists (&vhi_mode
))
3520 /* Step-1: Move op0 to src with VHI mode. */
3521 rtx src
= gen_reg_rtx (vhi_mode
);
3522 emit_move_insn (src
, gen_lowpart (vhi_mode
, d
->op0
));
3524 /* Step-2: Shift right 8 bits to dest. */
3525 rtx dest
= expand_binop (vhi_mode
, lshr_optab
, src
, gen_int_mode (8, Pmode
),
3526 NULL_RTX
, 0, OPTAB_DIRECT
);
3528 /* Step-3: Shift left 8 bits to src. */
3529 src
= expand_binop (vhi_mode
, ashl_optab
, src
, gen_int_mode (8, Pmode
),
3530 NULL_RTX
, 0, OPTAB_DIRECT
);
3532 /* Step-4: Logic Or dest and src to dest. */
3533 dest
= expand_binop (vhi_mode
, ior_optab
, dest
, src
,
3534 NULL_RTX
, 0, OPTAB_DIRECT
);
3536 /* Step-5: Move src to target with VQI mode. */
3537 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, dest
));
3542 /* Recognize the pattern that can be shuffled by vec_extract and slide1up
3546 shuffle_extract_and_slide1up_patterns (struct expand_vec_perm_d
*d
)
3548 poly_int64 nunits
= GET_MODE_NUNITS (d
->vmode
);
3550 /* Recognize { nunits - 1, nunits, nunits + 1, ... }. */
3551 if (!d
->perm
.series_p (0, 2, nunits
- 1, 2)
3552 || !d
->perm
.series_p (1, 2, nunits
, 2))
3555 /* Disable when nunits < 4 since the later generic approach
3556 is more profitable on indice = { nunits - 1, nunits }. */
3557 if (!known_gt (nunits
, 2))
3564 /* Extract the last element of the first vector. */
3565 scalar_mode smode
= GET_MODE_INNER (d
->vmode
);
3566 rtx tmp
= gen_reg_rtx (smode
);
3567 emit_vec_extract (tmp
, d
->op0
, gen_int_mode (nunits
- 1, Pmode
));
3569 /* Insert the scalar into element 0. */
3571 = FLOAT_MODE_P (d
->vmode
) ? UNSPEC_VFSLIDE1UP
: UNSPEC_VSLIDE1UP
;
3572 insn_code icode
= code_for_pred_slide (unspec
, d
->vmode
);
3573 rtx ops
[] = {d
->target
, d
->op1
, tmp
};
3574 emit_vlmax_insn (icode
, BINARY_OP
, ops
);
3578 /* This looks for a series pattern in the provided vector permute structure D.
3579 If successful it emits a series insn as well as a gather to implement it.
3580 Return true if successful, false otherwise. */
3583 shuffle_series_patterns (struct expand_vec_perm_d
*d
)
3585 if (!d
->one_vector_p
|| d
->perm
.encoding ().npatterns () != 1)
3588 poly_int64 el1
= d
->perm
[0];
3589 poly_int64 el2
= d
->perm
[1];
3590 poly_int64 el3
= d
->perm
[2];
3592 poly_int64 step1
= el2
- el1
;
3593 poly_int64 step2
= el3
- el2
;
3595 bool need_insert
= false;
3596 bool have_series
= false;
3598 /* Check for a full series. */
3599 if (known_ne (step1
, 0) && d
->perm
.series_p (0, 1, el1
, step1
))
3602 /* Check for a series starting at the second element. */
3603 else if (known_ne (step2
, 0) && d
->perm
.series_p (1, 1, el2
, step2
))
3612 /* Disable shuffle if we can't find an appropriate integer index mode for
3614 machine_mode sel_mode
;
3615 if (!get_gather_index_mode (d
).exists (&sel_mode
))
3622 /* Create the series. */
3623 machine_mode eltmode
= Pmode
;
3624 rtx series
= gen_reg_rtx (sel_mode
);
3625 expand_vec_series (series
, gen_int_mode (need_insert
? el2
: el1
, eltmode
),
3626 gen_int_mode (need_insert
? step2
: step1
, eltmode
));
3628 /* Insert the remaining element if necessary. */
3631 insn_code icode
= code_for_pred_slide (UNSPEC_VSLIDE1UP
, sel_mode
);
3633 = {series
, series
, gen_int_mode (el1
, GET_MODE_INNER (sel_mode
))};
3634 emit_vlmax_insn (icode
, BINARY_OP
, ops
);
3637 emit_vlmax_gather_insn (d
->target
, d
->op0
, series
);
3642 /* Recognize the pattern that can be shuffled by generic approach. */
3645 shuffle_generic_patterns (struct expand_vec_perm_d
*d
)
3647 machine_mode sel_mode
;
3649 /* We don't enable SLP for non-power of 2 NPATTERNS. */
3650 if (!pow2p_hwi (d
->perm
.encoding().npatterns ()))
3653 /* Disable shuffle if we can't find an appropriate integer index mode for
3655 if (!get_gather_index_mode (d
).exists (&sel_mode
))
3662 rtx sel
= vec_perm_indices_to_rtx (sel_mode
, d
->perm
);
3663 /* Some FIXED-VLMAX/VLS vector permutation situations call targethook
3664 instead of expand vec_perm<mode>, we handle it directly. */
3665 expand_vec_perm (d
->target
, d
->op0
, d
->op1
, sel
);
3669 /* This function recognizes and supports different permutation patterns
3670 and enable VLA SLP auto-vectorization. */
3672 expand_vec_perm_const_1 (struct expand_vec_perm_d
*d
)
3674 gcc_assert (d
->op_mode
!= E_VOIDmode
);
3676 /* The pattern matching functions above are written to look for a small
3677 number to begin the sequence (0, 1, N/2). If we begin with an index
3678 from the second operand, we can swap the operands. */
3679 poly_int64 nelt
= d
->perm
.length ();
3680 if (known_ge (d
->perm
[0], nelt
))
3682 d
->perm
.rotate_inputs (1);
3683 std::swap (d
->op0
, d
->op1
);
3686 if (known_gt (nelt
, 1))
3688 if (d
->vmode
== d
->op_mode
)
3690 if (shuffle_merge_patterns (d
))
3692 if (shuffle_consecutive_patterns (d
))
3694 if (shuffle_compress_patterns (d
))
3696 if (shuffle_decompress_patterns (d
))
3698 if (shuffle_bswap_pattern (d
))
3700 if (shuffle_extract_and_slide1up_patterns (d
))
3702 if (shuffle_series_patterns (d
))
3704 if (shuffle_generic_patterns (d
))
3714 /* This function implements TARGET_VECTORIZE_VEC_PERM_CONST by using RVV
3717 expand_vec_perm_const (machine_mode vmode
, machine_mode op_mode
, rtx target
,
3718 rtx op0
, rtx op1
, const vec_perm_indices
&sel
)
3720 /* RVV doesn't have Mask type pack/unpack instructions and we don't use
3721 mask to do the iteration loop control. Just disable it directly. */
3722 if (GET_MODE_CLASS (vmode
) == MODE_VECTOR_BOOL
)
3724 /* FIXME: Explicitly disable VLA interleave SLP vectorization when we
3725 may encounter ICE for poly size (1, 1) vectors in loop vectorizer.
3726 Ideally, middle-end loop vectorizer should be able to disable it
3727 itself, We can remove the codes here when middle-end code is able
3728 to disable VLA SLP vectorization for poly size (1, 1) VF. */
3729 if (!BYTES_PER_RISCV_VECTOR
.is_constant ()
3730 && maybe_lt (BYTES_PER_RISCV_VECTOR
* TARGET_MAX_LMUL
,
3731 poly_int64 (16, 16)))
3734 struct expand_vec_perm_d d
;
3736 /* Check whether the mask can be applied to a single vector. */
3737 if (sel
.ninputs () == 1 || (op0
&& rtx_equal_p (op0
, op1
)))
3738 d
.one_vector_p
= true;
3739 else if (sel
.all_from_input_p (0))
3741 d
.one_vector_p
= true;
3744 else if (sel
.all_from_input_p (1))
3746 d
.one_vector_p
= true;
3750 d
.one_vector_p
= false;
3752 d
.perm
.new_vector (sel
.encoding (), d
.one_vector_p
? 1 : 2,
3753 sel
.nelts_per_input ());
3755 d
.op_mode
= op_mode
;
3762 d
.testing_p
= !target
;
3765 return expand_vec_perm_const_1 (&d
);
3767 rtx_insn
*last
= get_last_insn ();
3768 bool ret
= expand_vec_perm_const_1 (&d
);
3769 gcc_assert (last
== get_last_insn ());
/* Generate a no-side-effects vsetvl to get the vector length.  */
void
expand_select_vl (rtx *ops)
{
  poly_int64 nunits = rtx_to_poly_int64 (ops[2]);
  if (CONST_INT_P (ops[1]) && known_le (INTVAL (ops[1]), nunits))
    {
      /* If the length is known to be <= VF, we just use the length directly
         instead of using vsetvli.

         E.g. _255 = .SELECT_VL (3, POLY_INT_CST [4, 4]);
         We move 3 into _255 instead of using an explicit vsetvl.  */
      emit_move_insn (ops[0], ops[1]);
      return;
    }

  /* We arbitrarily picked QImode as the inner scalar mode to get a vector
     mode, since vsetvl only demands the ratio.  We let the VSETVL pass
     optimize it.  */
  scalar_int_mode mode = QImode;
  machine_mode rvv_mode = get_vector_mode (mode, nunits).require ();
  emit_insn (gen_no_side_effects_vsetvl_rtx (rvv_mode, ops[0], ops[1]));
}
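
/* Example (illustrative): for a loop vectorized with VF = [4, 4] the
   vectorizer emits len = .SELECT_VL (remaining, VF); when "remaining" is a
   constant <= 4 this becomes a plain move, otherwise it becomes a vsetvli
   on a QImode-ratio vtype that the VSETVL pass later fuses with the real
   SEW/LMUL of the loop body.  */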
/* Expand MASK_LEN_{LOAD,STORE}.  */
void
expand_load_store (rtx *ops, bool is_load)
{
  rtx mask = ops[2];
  rtx len = ops[3];
  machine_mode mode = GET_MODE (ops[0]);

  if (is_vlmax_len_p (mode, len))
    {
      /* If the length operand is equal to VF, it is a VLMAX load/store.  */
      if (is_load)
        {
          rtx m_ops[] = {ops[0], mask, ops[1]};
          emit_vlmax_insn (code_for_pred_mov (mode), UNARY_OP_TAMA, m_ops);
        }
      else
        {
          len = gen_reg_rtx (Pmode);
          emit_vlmax_vsetvl (mode, len);
          emit_insn (gen_pred_store (mode, ops[0], mask, ops[1], len,
                                     get_avl_type_rtx (VLMAX)));
        }
    }
  else
    {
      if (!satisfies_constraint_K (len))
        len = force_reg (Pmode, len);
      if (is_load)
        {
          rtx m_ops[] = {ops[0], mask, ops[1]};
          emit_nonvlmax_insn (code_for_pred_mov (mode), UNARY_OP_TAMA, m_ops,
                              len);
        }
      else
        emit_insn (gen_pred_store (mode, ops[0], mask, ops[1], len,
                                   get_avl_type_rtx (NONVLMAX)));
    }
}
3836 /* Expand MASK_LEN_STRIDED_LOAD. */
3838 expand_strided_load (machine_mode mode
, rtx
*ops
)
3842 rtx stride
= ops
[2];
3847 insn_code icode
= code_for_pred_strided_load (mode
);
3848 rtx emit_ops
[] = {v_reg
, mask
, gen_rtx_MEM (mode
, base
), stride
};
3850 if (poly_int_rtx_p (len
, &len_val
)
3851 && known_eq (len_val
, GET_MODE_NUNITS (mode
)))
3852 emit_vlmax_insn (icode
, BINARY_OP_TAMA
, emit_ops
);
3855 len
= satisfies_constraint_K (len
) ? len
: force_reg (Pmode
, len
);
3856 emit_nonvlmax_insn (icode
, BINARY_OP_TAMA
, emit_ops
, len
);
3860 /* Expand MASK_LEN_STRIDED_STORE. */
3862 expand_strided_store (machine_mode mode
, rtx
*ops
)
3866 rtx stride
= ops
[1];
3872 if (poly_int_rtx_p (len
, &len_val
)
3873 && known_eq (len_val
, GET_MODE_NUNITS (mode
)))
3875 len
= gen_reg_rtx (Pmode
);
3876 emit_vlmax_vsetvl (mode
, len
);
3877 vl_type
= get_avl_type_rtx (VLMAX
);
3881 len
= satisfies_constraint_K (len
) ? len
: force_reg (Pmode
, len
);
3882 vl_type
= get_avl_type_rtx (NONVLMAX
);
3885 emit_insn (gen_pred_strided_store (mode
, gen_rtx_MEM (mode
, base
),
3886 mask
, stride
, v_reg
, len
, vl_type
));
/* Return true if the operation is a floating-point operation that needs the
   dynamic FRM (rounding-mode) operand.  */
static bool
needs_fp_rounding (unsigned icode, machine_mode mode)
{
  if (!FLOAT_MODE_P (mode))
    return false;

  return icode != maybe_code_for_pred (SMIN, mode)
         && icode != maybe_code_for_pred (UNSPEC_VFMIN, mode)
         && icode != maybe_code_for_pred (SMAX, mode)
         && icode != maybe_code_for_pred (UNSPEC_VFMAX, mode)
         && icode != maybe_code_for_pred (NEG, mode)
         && icode != maybe_code_for_pred (ABS, mode)
         /* narrower-FP -> FP  */
         && icode != maybe_code_for_pred_extend (mode)
         /* narrower-INT -> FP  */
         && icode != maybe_code_for_pred_widen (FLOAT, mode)
         && icode != maybe_code_for_pred_widen (UNSIGNED_FLOAT, mode)
         /* vfsgnj does not round.  */
         && icode != maybe_code_for_pred (UNSPEC_VCOPYSIGN, mode)
         && icode != maybe_code_for_pred_mov (mode);
}
/* Subroutine to expand COND_LEN_* patterns.  */
static void
expand_cond_len_op (unsigned icode, insn_flags op_type, rtx *ops, rtx len)
{
  rtx dest = ops[0];
  rtx mask = ops[1];
  machine_mode mode = GET_MODE (dest);
  machine_mode mask_mode = GET_MODE (mask);
  bool is_dummy_mask = rtx_equal_p (mask, CONSTM1_RTX (mask_mode));
  bool is_vlmax_len = is_vlmax_len_p (mode, len);

  unsigned insn_flags = HAS_DEST_P | HAS_MASK_P | HAS_MERGE_P | op_type;

  /* FIXME: We don't support simplification of COND_LEN_NEG (..., dummy len,
     dummy mask) into NEG_EXPR in GIMPLE FOLD yet.  So, we do such
     simplification in the RISC-V backend and may do that in the middle-end in
     the future.  */
  if (is_dummy_mask && is_vlmax_len)
    insn_flags |= TDEFAULT_POLICY_P | MDEFAULT_POLICY_P;
  else if (is_dummy_mask)
    insn_flags |= TU_POLICY_P | MDEFAULT_POLICY_P;
  else if (is_vlmax_len)
    insn_flags |= TDEFAULT_POLICY_P | MU_POLICY_P;
  else
    insn_flags |= TU_POLICY_P | MU_POLICY_P;

  if (needs_fp_rounding (icode, mode))
    insn_flags |= FRM_DYN_P;

  if (is_vlmax_len)
    emit_vlmax_insn (icode, insn_flags, ops);
  else
    emit_nonvlmax_insn (icode, insn_flags, ops, len);
}
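
/* Summary (illustrative) of the policy selection above, assuming the
   preferred default policies are agnostic:

     all-ones mask + len == VLMAX  ->  default/default (typically ta, ma)
     all-ones mask + partial len   ->  tu, ma   (tail comes from MERGE)
     real mask     + len == VLMAX  ->  ta, mu   (keep masked-off lanes)
     real mask     + partial len   ->  tu, mu   (keep both)  */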
/* Return RVV_VUNDEF if the ELSE value is a scratch rtx.  */
static rtx
get_else_operand (rtx op)
{
  return GET_CODE (op) == SCRATCH ? RVV_VUNDEF (GET_MODE (op)) : op;
}
/* Expand unary ops COND_LEN_*.  */
void
expand_cond_len_unop (unsigned icode, rtx *ops)
{
  rtx dest = ops[0];
  rtx mask = ops[1];
  rtx src = ops[2];
  rtx merge = get_else_operand (ops[3]);
  rtx len = ops[4];

  rtx cond_ops[] = {dest, mask, merge, src};
  expand_cond_len_op (icode, UNARY_OP_P, cond_ops, len);
}

/* Expand unary ops COND_*.  */
void
expand_cond_unop (unsigned icode, rtx *ops)
{
  rtx dest = ops[0];
  rtx mask = ops[1];
  rtx src = ops[2];
  rtx merge = get_else_operand (ops[3]);
  rtx len = gen_int_mode (GET_MODE_NUNITS (GET_MODE (dest)), Pmode);

  rtx cond_ops[] = {dest, mask, merge, src};
  expand_cond_len_op (icode, UNARY_OP_P, cond_ops, len);
}

/* Expand binary ops COND_LEN_*.  */
void
expand_cond_len_binop (unsigned icode, rtx *ops)
{
  rtx dest = ops[0];
  rtx mask = ops[1];
  rtx src1 = ops[2];
  rtx src2 = ops[3];
  rtx merge = get_else_operand (ops[4]);
  rtx len = ops[5];

  rtx cond_ops[] = {dest, mask, merge, src1, src2};
  expand_cond_len_op (icode, BINARY_OP_P, cond_ops, len);
}

/* Expand binary ops COND_*.  */
void
expand_cond_binop (unsigned icode, rtx *ops)
{
  rtx dest = ops[0];
  rtx mask = ops[1];
  rtx src1 = ops[2];
  rtx src2 = ops[3];
  rtx merge = get_else_operand (ops[4]);
  rtx len = gen_int_mode (GET_MODE_NUNITS (GET_MODE (dest)), Pmode);

  rtx cond_ops[] = {dest, mask, merge, src1, src2};
  expand_cond_len_op (icode, BINARY_OP_P, cond_ops, len);
4011 /* Prepare insn_code for gather_load/scatter_store according to
4012 the vector mode and index mode. */
4014 prepare_gather_scatter (machine_mode vec_mode
, machine_mode idx_mode
,
4018 return code_for_pred_indexed_store (UNSPEC_UNORDERED
, vec_mode
, idx_mode
);
4021 unsigned src_eew_bitsize
= GET_MODE_BITSIZE (GET_MODE_INNER (idx_mode
));
4022 unsigned dst_eew_bitsize
= GET_MODE_BITSIZE (GET_MODE_INNER (vec_mode
));
4023 if (dst_eew_bitsize
== src_eew_bitsize
)
4024 return code_for_pred_indexed_load_same_eew (UNSPEC_UNORDERED
, vec_mode
);
4025 else if (dst_eew_bitsize
> src_eew_bitsize
)
4027 unsigned factor
= dst_eew_bitsize
/ src_eew_bitsize
;
4031 return code_for_pred_indexed_load_x2_greater_eew (
4032 UNSPEC_UNORDERED
, vec_mode
);
4034 return code_for_pred_indexed_load_x4_greater_eew (
4035 UNSPEC_UNORDERED
, vec_mode
);
4037 return code_for_pred_indexed_load_x8_greater_eew (
4038 UNSPEC_UNORDERED
, vec_mode
);
4045 unsigned factor
= src_eew_bitsize
/ dst_eew_bitsize
;
4049 return code_for_pred_indexed_load_x2_smaller_eew (
4050 UNSPEC_UNORDERED
, vec_mode
);
4052 return code_for_pred_indexed_load_x4_smaller_eew (
4053 UNSPEC_UNORDERED
, vec_mode
);
4055 return code_for_pred_indexed_load_x8_smaller_eew (
4056 UNSPEC_UNORDERED
, vec_mode
);
/* Expand LEN_MASK_{GATHER_LOAD,SCATTER_STORE}.  */
void
expand_gather_scatter (rtx *ops, bool is_load)
{
  rtx ptr, vec_offset, vec_reg;
  bool zero_extend_p;
  int shift;
  rtx mask = ops[5];
  rtx len = ops[6];
  if (is_load)
    {
      vec_reg = ops[0];
      ptr = ops[1];
      vec_offset = ops[2];
      zero_extend_p = INTVAL (ops[3]);
      shift = exact_log2 (INTVAL (ops[4]));
    }
  else
    {
      vec_reg = ops[4];
      ptr = ops[0];
      vec_offset = ops[1];
      zero_extend_p = INTVAL (ops[2]);
      shift = exact_log2 (INTVAL (ops[3]));
    }

  machine_mode vec_mode = GET_MODE (vec_reg);
  machine_mode idx_mode = GET_MODE (vec_offset);
  scalar_mode inner_idx_mode = GET_MODE_INNER (idx_mode);
  unsigned inner_offsize = GET_MODE_BITSIZE (inner_idx_mode);
  poly_int64 nunits = GET_MODE_NUNITS (vec_mode);
  bool is_vlmax = is_vlmax_len_p (vec_mode, len);

  bool use_widening_shift = false;

  /* Extend the offset element to address width.  */
  if (inner_offsize < BITS_PER_WORD)
    {
      use_widening_shift = TARGET_ZVBB && zero_extend_p && shift == 1;
      /* 7.2. Vector Load/Store Addressing Modes.
         If the vector offset elements are narrower than XLEN, they are
         zero-extended to XLEN before adding to the ptr effective address.  If
         the vector offset elements are wider than XLEN, the least-significant
         XLEN bits are used in the address calculation.  An implementation must
         raise an illegal instruction exception if the EEW is not supported for
         offset elements.

         RVV spec only refers to the shift == 0 case.  */
      if (!zero_extend_p || shift)
        {
          if (zero_extend_p)
            inner_idx_mode
              = int_mode_for_size (inner_offsize * 2, 0).require ();
          else
            inner_idx_mode = int_mode_for_size (BITS_PER_WORD, 0).require ();
          machine_mode new_idx_mode
            = get_vector_mode (inner_idx_mode, nunits).require ();
          if (!use_widening_shift)
            {
              rtx tmp = gen_reg_rtx (new_idx_mode);
              emit_insn (gen_extend_insn (tmp, vec_offset, new_idx_mode,
                                          idx_mode,
                                          zero_extend_p ? true : false));
              vec_offset = tmp;
            }
          idx_mode = new_idx_mode;
        }
    }

  if (shift)
    {
      rtx tmp;
      if (!use_widening_shift)
        tmp = expand_binop (idx_mode, ashl_optab, vec_offset,
                            gen_int_mode (shift, Pmode), NULL_RTX, 0,
                            OPTAB_DIRECT);
      else
        {
          tmp = gen_reg_rtx (idx_mode);
          insn_code icode = code_for_pred_vwsll_scalar (idx_mode);
          rtx ops[] = {tmp, vec_offset, const1_rtx};
          emit_vlmax_insn (icode, BINARY_OP, ops);
        }

      vec_offset = tmp;
    }

  insn_code icode = prepare_gather_scatter (vec_mode, idx_mode, is_load);
  if (is_vlmax)
    {
      if (is_load)
        {
          rtx load_ops[] = {vec_reg, mask, ptr, vec_offset};
          emit_vlmax_insn (icode, BINARY_OP_TAMA, load_ops);
        }
      else
        {
          rtx store_ops[] = {mask, ptr, vec_offset, vec_reg};
          emit_vlmax_insn (icode, SCATTER_OP_M, store_ops);
        }
    }
  else
    {
      if (is_load)
        {
          rtx load_ops[] = {vec_reg, mask, ptr, vec_offset};
          emit_nonvlmax_insn (icode, BINARY_OP_TAMA, load_ops, len);
        }
      else
        {
          rtx store_ops[] = {mask, ptr, vec_offset, vec_reg};
          emit_nonvlmax_insn (icode, SCATTER_OP_M, store_ops, len);
        }
    }
}

/* Expand COND_LEN_*.  */
void
expand_cond_len_ternop (unsigned icode, rtx *ops)
{
  rtx dest = ops[0];
  rtx mask = ops[1];
  rtx src1 = ops[2];
  rtx src2 = ops[3];
  rtx src3 = ops[4];
  rtx merge = get_else_operand (ops[5]);
  rtx len = ops[6];

  rtx cond_ops[] = {dest, mask, src1, src2, src3, merge};
  expand_cond_len_op (icode, TERNARY_OP_P, cond_ops, len);
}

/* Expand COND_*.  */
void
expand_cond_ternop (unsigned icode, rtx *ops)
{
  rtx dest = ops[0];
  rtx mask = ops[1];
  rtx src1 = ops[2];
  rtx src2 = ops[3];
  rtx src3 = ops[4];
  rtx merge = get_else_operand (ops[5]);
  rtx len = gen_int_mode (GET_MODE_NUNITS (GET_MODE (dest)), Pmode);

  rtx cond_ops[] = {dest, mask, src1, src2, src3, merge};
  expand_cond_len_op (icode, TERNARY_OP_P, cond_ops, len);
}

/* Expand reduction operations.
   Case 1: ops = {scalar_dest, vector_src}
   Case 2: ops = {scalar_dest, vector_src, mask, vl}.  */
void
expand_reduction (unsigned unspec, unsigned insn_flags, rtx *ops, rtx init)
{
  rtx scalar_dest = ops[0];
  rtx vector_src = ops[1];
  machine_mode vmode = GET_MODE (vector_src);
  machine_mode vel_mode = GET_MODE (scalar_dest);
  machine_mode m1_mode = get_m1_mode (vel_mode).require ();

  rtx m1_tmp = gen_reg_rtx (m1_mode);
  rtx scalar_move_ops[] = {m1_tmp, init};
  insn_code icode = code_for_pred_broadcast (m1_mode);
  if (need_mask_operand_p (insn_flags))
    emit_nonvlmax_insn (icode, SCALAR_MOVE_OP, scalar_move_ops, ops[3]);
  else
    emit_vlmax_insn (icode, SCALAR_MOVE_OP, scalar_move_ops);

  rtx m1_tmp2 = gen_reg_rtx (m1_mode);
  rtx reduc_ops[] = {m1_tmp2, vector_src, m1_tmp};
  icode = code_for_pred (unspec, vmode);

  if (need_mask_operand_p (insn_flags))
    {
      rtx mask_len_reduc_ops[] = {m1_tmp2, ops[2], vector_src, m1_tmp};
      emit_nonvlmax_insn (icode, insn_flags, mask_len_reduc_ops, ops[3]);
    }
  else
    emit_vlmax_insn (icode, insn_flags, reduc_ops);

  emit_insn (gen_pred_extract_first (m1_mode, scalar_dest, m1_tmp2));
}

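/* The emitted sequence is: broadcast INIT into an LMUL=1 register, apply the
   predicated reduction pattern for UNSPEC on VMODE (e.g. vredsum.vs for a
   plus reduction) accumulating VECTOR_SRC against that seed, and finally
   extract element 0 of the result into SCALAR_DEST.  */
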
/* Prepare ops for ternary operations.
   It can be called before or after RA.  */
void
prepare_ternary_operands (rtx *ops)
{
  machine_mode mode = GET_MODE (ops[0]);

  if (!rtx_equal_p (ops[5], RVV_VUNDEF (mode))
      && (VECTOR_MODE_P (GET_MODE (ops[2]))
          && !rtx_equal_p (ops[2], ops[5]))
      && !rtx_equal_p (ops[3], ops[5])
      && !rtx_equal_p (ops[4], ops[5]))
    {
      /* RA will fail to find vector REG and report ICE, so we pre-merge
         the ops for LMUL = 8.  */
      if (satisfies_constraint_Wc1 (ops[1]))
        {
          emit_move_insn (ops[0], ops[5]);
          emit_insn (gen_pred_mov (mode, ops[0], ops[1], ops[0], ops[4], ops[6],
                                   ops[7], ops[8], ops[9]));
        }
      else
        emit_insn (gen_pred_merge (mode, ops[0], RVV_VUNDEF (mode), ops[5],
                                   ops[4], ops[1], ops[6], ops[7], ops[9]));
      ops[5] = ops[4] = ops[0];
    }
  else
    {
      /* Swap the multiplication ops if the fallback value is the
         second of the two.  */
      if (rtx_equal_p (ops[3], ops[5]))
        std::swap (ops[2], ops[3]);

      /* TODO: ??? Maybe we could support splitting FMA (a, 4, b)
         into PLUS (ASHIFT (a, 2), b) according to uarchs.  */
    }
  gcc_assert (rtx_equal_p (ops[5], RVV_VUNDEF (mode))
              || rtx_equal_p (ops[5], ops[2]) || rtx_equal_p (ops[5], ops[4]));
}

/* Expand VEC_MASK_LEN_{LOAD_LANES,STORE_LANES}.  */
void
expand_lanes_load_store (rtx *ops, bool is_load)
{
  rtx mask = ops[2];
  rtx len = ops[3];
  rtx addr = is_load ? XEXP (ops[1], 0) : XEXP (ops[0], 0);
  rtx reg = is_load ? ops[0] : ops[1];
  machine_mode mode = GET_MODE (ops[0]);

  if (is_vlmax_len_p (mode, len))
    {
      /* If the length operand is equal to VF, it is VLMAX load/store.  */
      if (is_load)
        {
          rtx m_ops[] = {reg, mask, addr};
          emit_vlmax_insn (code_for_pred_unit_strided_load (mode),
                           UNARY_OP_TAMA, m_ops);
        }
      else
        {
          len = gen_reg_rtx (Pmode);
          emit_vlmax_vsetvl (mode, len);
          emit_insn (gen_pred_unit_strided_store (mode, mask, addr, reg, len,
                                                  get_avl_type_rtx (VLMAX)));
        }
    }
  else
    {
      if (!satisfies_constraint_K (len))
        len = force_reg (Pmode, len);
      if (is_load)
        {
          rtx m_ops[] = {reg, mask, addr};
          emit_nonvlmax_insn (code_for_pred_unit_strided_load (mode),
                              UNARY_OP_TAMA, m_ops, len);
        }
      else
        emit_insn (gen_pred_unit_strided_store (mode, mask, addr, reg, len,
                                                get_avl_type_rtx (NONVLMAX)));
    }
}

/* Expand LEN_FOLD_EXTRACT_LAST.  */
void
expand_fold_extract_last (rtx *ops)
{
  rtx dst = ops[0];
  rtx default_value = ops[1];
  rtx mask = ops[2];
  rtx anchor = gen_reg_rtx (Pmode);
  rtx index = gen_reg_rtx (Pmode);
  rtx vect = ops[3];
  rtx else_label = gen_label_rtx ();
  rtx end_label = gen_label_rtx ();
  rtx len = ops[4];
  insn_code icode;
  machine_mode mode = GET_MODE (vect);
  machine_mode mask_mode = GET_MODE (mask);
  rtx compress_vect = gen_reg_rtx (mode);
  rtx slide_vect = gen_reg_rtx (mode);

  if (is_vlmax_len_p (mode, len))
    len = NULL_RTX;

  /* Calculate the number of 1-bit in mask.  */
  rtx cpop_ops[] = {anchor, mask};
  if (len)
    emit_nonvlmax_insn (code_for_pred_popcount (mask_mode, Pmode), CPOP_OP,
                        cpop_ops, len);
  else
    emit_vlmax_insn (code_for_pred_popcount (mask_mode, Pmode), CPOP_OP,
                     cpop_ops);

  riscv_expand_conditional_branch (else_label, EQ, anchor, const0_rtx);
  emit_insn (gen_rtx_SET (index, gen_rtx_PLUS (Pmode, anchor, constm1_rtx)));

  /* Compress the vector.  */
  icode = code_for_pred_compress (mode);
  rtx compress_ops[] = {compress_vect, vect, mask};
  if (len)
    emit_nonvlmax_insn (icode, COMPRESS_OP, compress_ops, len);
  else
    emit_vlmax_insn (icode, COMPRESS_OP, compress_ops);

  /* Emit the slide down to index 0 in a new vector.  */
  rtx slide_ops[] = {slide_vect, compress_vect, index};
  icode = code_for_pred_slide (UNSPEC_VSLIDEDOWN, mode);
  if (len)
    emit_nonvlmax_insn (icode, BINARY_OP, slide_ops, len);
  else
    emit_vlmax_insn (icode, BINARY_OP, slide_ops);

  /* Emit v(f)mv.[xf].s.  */
  emit_insn (gen_pred_extract_first (mode, dst, slide_vect));

  emit_jump_insn (gen_jump (end_label));
  emit_barrier ();

  emit_label (else_label);
  emit_move_insn (dst, default_value);

  emit_label (end_label);
}

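/* The emitted sequence is: count the active mask bits (vcpop.m); if the count
   is zero, branch to the else arm and use DEFAULT_VALUE; otherwise compress
   the active elements to the front (vcompress.vm), slide the last active
   element down to index 0 (vslidedown) and extract element 0 with
   v(f)mv.[xf].s.  */
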
/* Return true if the LMUL of the comparison is less than or equal to one.  */
bool
cmp_lmul_le_one (machine_mode mode)
{
  if (riscv_v_ext_vector_mode_p (mode))
    return known_le (GET_MODE_SIZE (mode), BYTES_PER_RISCV_VECTOR);
  else if (riscv_v_ext_vls_mode_p (mode))
    return known_le (GET_MODE_BITSIZE (mode), TARGET_MIN_VLEN);
  return false;
}

/* Return true if the LMUL of the comparison is greater than one.  */
bool
cmp_lmul_gt_one (machine_mode mode)
{
  if (riscv_v_ext_vector_mode_p (mode))
    return known_gt (GET_MODE_SIZE (mode), BYTES_PER_RISCV_VECTOR);
  else if (riscv_v_ext_vls_mode_p (mode))
    return known_gt (GET_MODE_BITSIZE (mode), TARGET_MIN_VLEN);
  return false;
}

/* Return true if the VLS mode is legal.  There are 2 cases here.

   1. Enable VLS modes for VLA vectorization since fixed length VLMAX mode
      is the highest priority choice and should not conflict with VLS modes.
   2. Enable VLS modes for some cases in fixed-vlmax, aka the bitsize of the
      VLS mode is smaller than the minimal vla.

   Take vlen = 2048 as example for case 2.

   Note: Below table based on vlen = 2048.
   +----------------------------------------------------+----------------------+
   | VLS mode                                            | VLA mode             |
   +----------------------------------------------------+----------------------+
   | Name       | Precision | Inner Precision | Enabled | Min mode  | Min bits |
   +------------+-----------+-----------------+---------+-----------+----------+
   | V1BI       | 1         | 1               | Yes     | RVVMF64BI | 32       |
   | V2BI       | 2         | 1               | Yes     | RVVMF64BI | 32       |
   | V4BI       | 4         | 1               | Yes     | RVVMF64BI | 32       |
   | V8BI       | 8         | 1               | Yes     | RVVMF64BI | 32       |
   | V16BI      | 16        | 1               | Yes     | RVVMF64BI | 32       |
   | V32BI      | 32        | 1               | NO      | RVVMF64BI | 32       |
   | V64BI      | 64        | 1               | NO      | RVVMF64BI | 32       |
   | ...        | ...       | ...             | ...     | RVVMF64BI | 32       |
   | V4096BI    | 4096      | 1               | NO      | RVVMF64BI | 32       |
   +------------+-----------+-----------------+---------+-----------+----------+
   | V1QI       | 8         | 8               | Yes     | RVVMF8QI  | 256      |
   | V2QI       | 16        | 8               | Yes     | RVVMF8QI  | 256      |
   | V4QI       | 32        | 8               | Yes     | RVVMF8QI  | 256      |
   | V8QI       | 64        | 8               | Yes     | RVVMF8QI  | 256      |
   | V16QI      | 128       | 8               | Yes     | RVVMF8QI  | 256      |
   | V32QI      | 256       | 8               | NO      | RVVMF8QI  | 256      |
   | V64QI      | 512       | 8               | NO      | RVVMF8QI  | 256      |
   | ...        | ...       | ..              | ...     | RVVMF8QI  | 256      |
   | V4096QI    | 32768     | 8               | NO      | RVVMF8QI  | 256      |
   +------------+-----------+-----------------+---------+-----------+----------+
   | V1HI       | 16        | 16              | Yes     | RVVMF4HI  | 512      |
   | V2HI       | 32        | 16              | Yes     | RVVMF4HI  | 512      |
   | V4HI       | 64        | 16              | Yes     | RVVMF4HI  | 512      |
   | V8HI       | 128       | 16              | Yes     | RVVMF4HI  | 512      |
   | V16HI      | 256       | 16              | Yes     | RVVMF4HI  | 512      |
   | V32HI      | 512       | 16              | NO      | RVVMF4HI  | 512      |
   | V64HI      | 1024      | 16              | NO      | RVVMF4HI  | 512      |
   | ...        | ...       | ..              | ...     | RVVMF4HI  | 512      |
   | V2048HI    | 32768     | 16              | NO      | RVVMF4HI  | 512      |
   +------------+-----------+-----------------+---------+-----------+----------+
   | V1SI/SF    | 32        | 32              | Yes     | RVVMF2SI  | 1024     |
   | V2SI/SF    | 64        | 32              | Yes     | RVVMF2SI  | 1024     |
   | V4SI/SF    | 128       | 32              | Yes     | RVVMF2SI  | 1024     |
   | V8SI/SF    | 256       | 32              | Yes     | RVVMF2SI  | 1024     |
   | V16SI/SF   | 512       | 32              | Yes     | RVVMF2SI  | 1024     |
   | V32SI/SF   | 1024      | 32              | NO      | RVVMF2SI  | 1024     |
   | V64SI/SF   | 2048      | 32              | NO      | RVVMF2SI  | 1024     |
   | ...        | ...       | ..              | ...     | RVVMF2SI  | 1024     |
   | V1024SI/SF | 32768     | 32              | NO      | RVVMF2SI  | 1024     |
   +------------+-----------+-----------------+---------+-----------+----------+
   | V1DI/DF    | 64        | 64              | Yes     | RVVM1DI   | 2048     |
   | V2DI/DF    | 128       | 64              | Yes     | RVVM1DI   | 2048     |
   | V4DI/DF    | 256       | 64              | Yes     | RVVM1DI   | 2048     |
   | V8DI/DF    | 512       | 64              | Yes     | RVVM1DI   | 2048     |
   | V16DI/DF   | 1024      | 64              | Yes     | RVVM1DI   | 2048     |
   | V32DI/DF   | 2048      | 64              | NO      | RVVM1DI   | 2048     |
   | V64DI/DF   | 4096      | 64              | NO      | RVVM1DI   | 2048     |
   | ...        | ...       | ..              | ...     | RVVM1DI   | 2048     |
   | V512DI/DF  | 32768     | 64              | NO      | RVVM1DI   | 2048     |
   +------------+-----------+-----------------+---------+-----------+----------+

   Then we can have the condition for VLS mode in fixed-vlmax, aka:
     PRECISION (VLSmode) < VLEN / (64 / PRECISION (VLS_inner_mode)).  */
bool
vls_mode_valid_p (machine_mode vls_mode)
{
  if (!TARGET_VECTOR || TARGET_XTHEADVECTOR)
    return false;

  if (rvv_vector_bits == RVV_VECTOR_BITS_SCALABLE)
    {
      if (GET_MODE_CLASS (vls_mode) != MODE_VECTOR_BOOL
          && !ordered_p (TARGET_MAX_LMUL * BITS_PER_RISCV_VECTOR,
                         GET_MODE_PRECISION (vls_mode)))
        /* We enable VLS modes which are aligned with TARGET_MAX_LMUL and
           BITS_PER_RISCV_VECTOR.

           e.g. When TARGET_MAX_LMUL = 1 and BITS_PER_RISCV_VECTOR = (128, 128),
           we enable VLS modes that have a fixed size <= 128 bits.  Since
           ordered_p is false between VLA modes with size = (128, 128) bits and
           a VLS mode with size = 128 bits, we would otherwise end up with
           multiple ICEs in middle-end generic code.  */
        return false;
      return true;
    }

  if (rvv_vector_bits == RVV_VECTOR_BITS_ZVL)
    {
      machine_mode inner_mode = GET_MODE_INNER (vls_mode);
      int precision = GET_MODE_PRECISION (inner_mode).to_constant ();
      int min_vlmax_bitsize = TARGET_MIN_VLEN / (64 / precision);

      return GET_MODE_PRECISION (vls_mode).to_constant () < min_vlmax_bitsize;
    }

  return true;
}

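/* Worked example for the fixed-vlmax rule above: with TARGET_MIN_VLEN = 2048
   and a QImode inner mode (precision 8), min_vlmax_bitsize = 2048 / (64 / 8)
   = 256, so V16QI (128 bits) is enabled while V32QI (256 bits) is not, which
   matches the table.  */
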
/* We don't have to convert the floating point to integer when the
   mantissa is zero.  Thus, there will be a limitation for both the
   single and double precision floating point.  There will be no
   mantissa if the floating point is greater than the limit.

   1. Half floating point.
     +-----------+---------------+
     | float     | binary layout |
     +-----------+---------------+
     | 1023.5    | 0x63ff        |
     +-----------+---------------+
     | 1024.0    | 0x6400        |
     +-----------+---------------+
     | 1025.0    | 0x6401        |
     +-----------+---------------+

     All half floating point will be unchanged for ceil if it is
     greater than or equal to 1024.

   2. Single floating point.
     +-----------+---------------+
     | float     | binary layout |
     +-----------+---------------+
     | 8388607.5 | 0x4affffff    |
     +-----------+---------------+
     | 8388608.0 | 0x4b000000    |
     +-----------+---------------+
     | 8388609.0 | 0x4b000001    |
     +-----------+---------------+

     All single floating point will be unchanged for ceil if it is
     greater than or equal to 8388608.

   3. Double floating point.
     +--------------------+--------------------+
     | float              | binary layout      |
     +--------------------+--------------------+
     | 4503599627370495.5 | 0X432fffffffffffff |
     +--------------------+--------------------+
     | 4503599627370496.0 | 0X4330000000000000 |
     +--------------------+--------------------+
     | 4503599627370497.0 | 0X4330000000000001 |
     +--------------------+--------------------+

     All double floating point will be unchanged for ceil if it is
     greater than or equal to 4503599627370496.  */
static rtx
get_fp_rounding_coefficient (machine_mode inner_mode)
{
  REAL_VALUE_TYPE real;

  if (inner_mode == E_HFmode)
    real_from_integer (&real, inner_mode, 1024, SIGNED);
  else if (inner_mode == E_SFmode)
    real_from_integer (&real, inner_mode, 8388608, SIGNED);
  else if (inner_mode == E_DFmode)
    real_from_integer (&real, inner_mode, 4503599627370496, SIGNED);
  else
    gcc_unreachable ();

  return const_double_from_real_value (real, inner_mode);
}

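/* The three constants are 2^10, 2^23 and 2^52, i.e. two raised to the number
   of mantissa bits of HFmode, SFmode and DFmode: at or above that magnitude
   every representable value is already an integer, so the rounding expanders
   below only apply the int/float round trip to lanes below the threshold.  */
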
static rtx
emit_vec_float_cmp_mask (rtx fp_vector, rtx_code code, rtx fp_scalar,
                         machine_mode vec_fp_mode)
{
  /* Step-1: Prepare the scalar float compare register.  */
  rtx fp_reg = gen_reg_rtx (GET_MODE_INNER (vec_fp_mode));
  emit_insn (gen_move_insn (fp_reg, fp_scalar));

  /* Step-2: Generate the mask.  */
  machine_mode mask_mode = get_mask_mode (vec_fp_mode);
  rtx mask = gen_reg_rtx (mask_mode);
  rtx cmp = gen_rtx_fmt_ee (code, mask_mode, fp_vector, fp_reg);
  rtx cmp_ops[] = {mask, cmp, fp_vector, fp_reg};
  insn_code icode = code_for_pred_cmp_scalar (vec_fp_mode);
  emit_vlmax_insn (icode, COMPARE_OP, cmp_ops);

  return mask;
}

static void
emit_vec_copysign (rtx op_dest, rtx op_src_0, rtx op_src_1,
                   machine_mode vec_mode)
{
  rtx sgnj_ops[] = {op_dest, op_src_0, op_src_1};
  insn_code icode = code_for_pred (UNSPEC_VCOPYSIGN, vec_mode);

  emit_vlmax_insn (icode, BINARY_OP, sgnj_ops);
}

static void
emit_vec_abs (rtx op_dest, rtx op_src, machine_mode vec_mode)
{
  rtx abs_ops[] = {op_dest, op_src};
  insn_code icode = code_for_pred (ABS, vec_mode);

  emit_vlmax_insn (icode, UNARY_OP, abs_ops);
}

static void
emit_vec_cvt_x_f (rtx op_dest, rtx op_src, rtx mask,
                  insn_type type, machine_mode vec_mode)
{
  insn_code icode = code_for_pred_fcvt_x_f (UNSPEC_VFCVT, vec_mode);

  if (type & USE_VUNDEF_MERGE_P)
    {
      rtx cvt_x_ops[] = {op_dest, mask, op_src};
      emit_vlmax_insn (icode, type, cvt_x_ops);
    }
  else
    {
      rtx cvt_x_ops[] = {op_dest, mask, op_dest, op_src};
      emit_vlmax_insn (icode, type, cvt_x_ops);
    }
}

static void
emit_vec_cvt_x_f (rtx op_dest, rtx op_src, insn_type type,
                  machine_mode vec_mode)
{
  rtx ops[] = {op_dest, op_src};
  insn_code icode = code_for_pred_fcvt_x_f (UNSPEC_VFCVT, vec_mode);

  emit_vlmax_insn (icode, type, ops);
}

static void
emit_vec_narrow_cvt_x_f (rtx op_dest, rtx op_src, insn_type type,
                         machine_mode vec_mode)
{
  rtx ops[] = {op_dest, op_src};
  insn_code icode = code_for_pred_narrow_fcvt_x_f (UNSPEC_VFCVT, vec_mode);

  emit_vlmax_insn (icode, type, ops);
}

static void
emit_vec_widen_cvt_x_f (rtx op_dest, rtx op_src, insn_type type,
                        machine_mode vec_mode)
{
  rtx ops[] = {op_dest, op_src};
  insn_code icode = code_for_pred_widen_fcvt_x_f (UNSPEC_VFCVT, vec_mode);

  emit_vlmax_insn (icode, type, ops);
}

static void
emit_vec_widen_cvt_f_f (rtx op_dest, rtx op_src, insn_type type,
                        machine_mode vec_mode)
{
  rtx ops[] = {op_dest, op_src};
  insn_code icode = code_for_pred_extend (vec_mode);

  emit_vlmax_insn (icode, type, ops);
}

static void
emit_vec_cvt_f_x (rtx op_dest, rtx op_src, rtx mask,
                  insn_type type, machine_mode vec_mode)
{
  rtx cvt_fp_ops[] = {op_dest, mask, op_dest, op_src};
  insn_code icode = code_for_pred (FLOAT, vec_mode);

  emit_vlmax_insn (icode, type, cvt_fp_ops);
}

static void
emit_vec_cvt_x_f_rtz (rtx op_dest, rtx op_src, rtx mask,
                      insn_type type, machine_mode vec_mode)
{
  insn_code icode = code_for_pred (FIX, vec_mode);

  if (type & USE_VUNDEF_MERGE_P)
    {
      rtx cvt_x_ops[] = {op_dest, mask, op_src};
      emit_vlmax_insn (icode, type, cvt_x_ops);
    }
  else
    {
      rtx cvt_x_ops[] = {op_dest, mask, op_dest, op_src};
      emit_vlmax_insn (icode, type, cvt_x_ops);
    }
}

static void
emit_vec_binary_alu (rtx op_dest, rtx op_1, rtx op_2, enum rtx_code rcode,
                     machine_mode vec_mode)
{
  rtx ops[] = {op_dest, op_1, op_2};
  insn_code icode = code_for_pred (rcode, vec_mode);

  emit_vlmax_insn (icode, BINARY_OP, ops);
}

void
expand_vec_ceil (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
                 machine_mode vec_int_mode)
{
  /* Step-1: Get the abs float value for mask generation.  */
  emit_vec_abs (op_0, op_1, vec_fp_mode);

  /* Step-2: Generate the mask on const fp.  */
  rtx const_fp = get_fp_rounding_coefficient (GET_MODE_INNER (vec_fp_mode));
  rtx mask = emit_vec_float_cmp_mask (op_0, LT, const_fp, vec_fp_mode);

  /* Step-3: Convert to integer on mask, with rounding up (aka ceil).  */
  rtx tmp = gen_reg_rtx (vec_int_mode);
  emit_vec_cvt_x_f (tmp, op_1, mask, UNARY_OP_TAMA_FRM_RUP, vec_fp_mode);

  /* Step-4: Convert to floating-point on mask for the final result.
     To avoid unnecessary frm register access, we use RUP here and it will
     never do the rounding up because the tmp rtx comes from the float
     to int conversion.  */
  emit_vec_cvt_f_x (op_0, tmp, mask, UNARY_OP_TAMU_FRM_RUP, vec_fp_mode);

  /* Step-5: Retrieve the sign bit for -0.0.  */
  emit_vec_copysign (op_0, op_0, op_1, vec_fp_mode);
}

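/* The comparison mask limits the int/float round trip to lanes whose
   magnitude is below the coefficient from get_fp_rounding_coefficient.
   Inactive lanes (already integral, or not finite) keep the |x| computed in
   step 1 through the mask-undisturbed merge, and step 5 copies the original
   sign back so those lanes end up unchanged.  The same masking scheme is used
   by the other rounding expanders below.  */
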
void
expand_vec_floor (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
                  machine_mode vec_int_mode)
{
  /* Step-1: Get the abs float value for mask generation.  */
  emit_vec_abs (op_0, op_1, vec_fp_mode);

  /* Step-2: Generate the mask on const fp.  */
  rtx const_fp = get_fp_rounding_coefficient (GET_MODE_INNER (vec_fp_mode));
  rtx mask = emit_vec_float_cmp_mask (op_0, LT, const_fp, vec_fp_mode);

  /* Step-3: Convert to integer on mask, with rounding down (aka floor).  */
  rtx tmp = gen_reg_rtx (vec_int_mode);
  emit_vec_cvt_x_f (tmp, op_1, mask, UNARY_OP_TAMA_FRM_RDN, vec_fp_mode);

  /* Step-4: Convert to floating-point on mask for the floor result.  */
  emit_vec_cvt_f_x (op_0, tmp, mask, UNARY_OP_TAMU_FRM_RDN, vec_fp_mode);

  /* Step-5: Retrieve the sign bit for -0.0.  */
  emit_vec_copysign (op_0, op_0, op_1, vec_fp_mode);
}

void
expand_vec_nearbyint (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
                      machine_mode vec_int_mode)
{
  /* Step-1: Get the abs float value for mask generation.  */
  emit_vec_abs (op_0, op_1, vec_fp_mode);

  /* Step-2: Generate the mask on const fp.  */
  rtx const_fp = get_fp_rounding_coefficient (GET_MODE_INNER (vec_fp_mode));
  rtx mask = emit_vec_float_cmp_mask (op_0, LT, const_fp, vec_fp_mode);

  /* Step-3: Backup FP exception flags, nearbyint never raises exceptions.  */
  rtx fflags = gen_reg_rtx (SImode);
  emit_insn (gen_riscv_frflags (fflags));

  /* Step-4: Convert to integer on mask, with dynamic rounding (aka
     nearbyint).  */
  rtx tmp = gen_reg_rtx (vec_int_mode);
  emit_vec_cvt_x_f (tmp, op_1, mask, UNARY_OP_TAMA_FRM_DYN, vec_fp_mode);

  /* Step-5: Convert to floating-point on mask for the nearbyint result.  */
  emit_vec_cvt_f_x (op_0, tmp, mask, UNARY_OP_TAMU_FRM_DYN, vec_fp_mode);

  /* Step-6: Restore FP exception flags.  */
  emit_insn (gen_riscv_fsflags (fflags));

  /* Step-7: Retrieve the sign bit for -0.0.  */
  emit_vec_copysign (op_0, op_0, op_1, vec_fp_mode);
}

void
expand_vec_rint (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
                 machine_mode vec_int_mode)
{
  /* Step-1: Get the abs float value for mask generation.  */
  emit_vec_abs (op_0, op_1, vec_fp_mode);

  /* Step-2: Generate the mask on const fp.  */
  rtx const_fp = get_fp_rounding_coefficient (GET_MODE_INNER (vec_fp_mode));
  rtx mask = emit_vec_float_cmp_mask (op_0, LT, const_fp, vec_fp_mode);

  /* Step-3: Convert to integer on mask, with dyn rounding (aka rint).  */
  rtx tmp = gen_reg_rtx (vec_int_mode);
  emit_vec_cvt_x_f (tmp, op_1, mask, UNARY_OP_TAMA_FRM_DYN, vec_fp_mode);

  /* Step-4: Convert to floating-point on mask for the rint result.  */
  emit_vec_cvt_f_x (op_0, tmp, mask, UNARY_OP_TAMU_FRM_DYN, vec_fp_mode);

  /* Step-5: Retrieve the sign bit for -0.0.  */
  emit_vec_copysign (op_0, op_0, op_1, vec_fp_mode);
}

void
expand_vec_round (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
                  machine_mode vec_int_mode)
{
  /* Step-1: Get the abs float value for mask generation.  */
  emit_vec_abs (op_0, op_1, vec_fp_mode);

  /* Step-2: Generate the mask on const fp.  */
  rtx const_fp = get_fp_rounding_coefficient (GET_MODE_INNER (vec_fp_mode));
  rtx mask = emit_vec_float_cmp_mask (op_0, LT, const_fp, vec_fp_mode);

  /* Step-3: Convert to integer on mask, rounding to nearest (aka round).  */
  rtx tmp = gen_reg_rtx (vec_int_mode);
  emit_vec_cvt_x_f (tmp, op_1, mask, UNARY_OP_TAMA_FRM_RMM, vec_fp_mode);

  /* Step-4: Convert to floating-point on mask for the round result.  */
  emit_vec_cvt_f_x (op_0, tmp, mask, UNARY_OP_TAMU_FRM_RMM, vec_fp_mode);

  /* Step-5: Retrieve the sign bit for -0.0.  */
  emit_vec_copysign (op_0, op_0, op_1, vec_fp_mode);
}

void
expand_vec_trunc (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
                  machine_mode vec_int_mode)
{
  /* Step-1: Get the abs float value for mask generation.  */
  emit_vec_abs (op_0, op_1, vec_fp_mode);

  /* Step-2: Generate the mask on const fp.  */
  rtx const_fp = get_fp_rounding_coefficient (GET_MODE_INNER (vec_fp_mode));
  rtx mask = emit_vec_float_cmp_mask (op_0, LT, const_fp, vec_fp_mode);

  /* Step-3: Convert to integer on mask, rounding to zero (aka truncate).  */
  rtx tmp = gen_reg_rtx (vec_int_mode);
  emit_vec_cvt_x_f_rtz (tmp, op_1, mask, UNARY_OP_TAMA, vec_fp_mode);

  /* Step-4: Convert to floating-point on mask for the truncate result.  */
  emit_vec_cvt_f_x (op_0, tmp, mask, UNARY_OP_TAMU_FRM_DYN, vec_fp_mode);

  /* Step-5: Retrieve the sign bit for -0.0.  */
  emit_vec_copysign (op_0, op_0, op_1, vec_fp_mode);
}

void
expand_vec_roundeven (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
                      machine_mode vec_int_mode)
{
  /* Step-1: Get the abs float value for mask generation.  */
  emit_vec_abs (op_0, op_1, vec_fp_mode);

  /* Step-2: Generate the mask on const fp.  */
  rtx const_fp = get_fp_rounding_coefficient (GET_MODE_INNER (vec_fp_mode));
  rtx mask = emit_vec_float_cmp_mask (op_0, LT, const_fp, vec_fp_mode);

  /* Step-3: Convert to integer on mask, rounding to nearest, ties to even.  */
  rtx tmp = gen_reg_rtx (vec_int_mode);
  emit_vec_cvt_x_f (tmp, op_1, mask, UNARY_OP_TAMA_FRM_RNE, vec_fp_mode);

  /* Step-4: Convert to floating-point on mask for the roundeven result.  */
  emit_vec_cvt_f_x (op_0, tmp, mask, UNARY_OP_TAMU_FRM_RNE, vec_fp_mode);

  /* Step-5: Retrieve the sign bit for -0.0.  */
  emit_vec_copysign (op_0, op_0, op_1, vec_fp_mode);
}

/* Handling the rounding from floating-point to int/long/long long.  */
static void
emit_vec_rounding_to_integer (rtx op_0, rtx op_1, insn_type type,
                              machine_mode vec_fp_mode,
                              machine_mode vec_int_mode,
                              machine_mode vec_bridge_mode = E_VOIDmode)
{
  poly_uint16 vec_fp_size = GET_MODE_SIZE (vec_fp_mode);
  poly_uint16 vec_int_size = GET_MODE_SIZE (vec_int_mode);

  if (known_eq (vec_fp_size, vec_int_size)) /* SF => SI, DF => DI.  */
    emit_vec_cvt_x_f (op_0, op_1, type, vec_fp_mode);
  else if (maybe_eq (vec_fp_size, vec_int_size * 2)) /* DF => SI.  */
    emit_vec_narrow_cvt_x_f (op_0, op_1, type, vec_fp_mode);
  else if (maybe_eq (vec_fp_size * 2, vec_int_size)) /* SF => DI, HF => SI.  */
    emit_vec_widen_cvt_x_f (op_0, op_1, type, vec_int_mode);
  else if (maybe_eq (vec_fp_size * 4, vec_int_size)) /* HF => DI.  */
    {
      gcc_assert (vec_bridge_mode != E_VOIDmode);

      rtx op_sf = gen_reg_rtx (vec_bridge_mode);

      /* Step-1: HF => SF, no rounding here.  */
      emit_vec_widen_cvt_f_f (op_sf, op_1, UNARY_OP, vec_bridge_mode);
      /* Step-2: SF => DI.  */
      emit_vec_widen_cvt_x_f (op_0, op_sf, type, vec_int_mode);
    }
  else
    gcc_unreachable ();
}

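/* E.g. a DFmode vector converted to a same-size DImode vector uses the
   same-width conversion (emit_vec_cvt_x_f); DF => SI uses the narrowing
   variant, SF => DI and HF => SI the widening variant, and HF => DI first
   widens HF => SF through VEC_BRIDGE_MODE and then widens again to the
   integer mode.  */
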
void
expand_vec_lrint (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
                  machine_mode vec_int_mode, machine_mode vec_bridge_mode)
{
  emit_vec_rounding_to_integer (op_0, op_1, UNARY_OP_FRM_DYN, vec_fp_mode,
                                vec_int_mode, vec_bridge_mode);
}

void
expand_vec_lround (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
                   machine_mode vec_int_mode, machine_mode vec_bridge_mode)
{
  emit_vec_rounding_to_integer (op_0, op_1, UNARY_OP_FRM_RMM, vec_fp_mode,
                                vec_int_mode, vec_bridge_mode);
}

void
expand_vec_lceil (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
                  machine_mode vec_int_mode)
{
  emit_vec_rounding_to_integer (op_0, op_1, UNARY_OP_FRM_RUP, vec_fp_mode,
                                vec_int_mode);
}

void
expand_vec_lfloor (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
                   machine_mode vec_int_mode)
{
  emit_vec_rounding_to_integer (op_0, op_1, UNARY_OP_FRM_RDN, vec_fp_mode,
                                vec_int_mode);
}

/* Expand the standard name usadd<mode>3 for vector mode, we can leverage
   the vector fixed-point single-width saturating add directly.  */
void
expand_vec_usadd (rtx op_0, rtx op_1, rtx op_2, machine_mode vec_mode)
{
  emit_vec_binary_alu (op_0, op_1, op_2, US_PLUS, vec_mode);
}

/* Expand the standard name ssadd<mode>3 for vector mode, we can leverage
   the vector fixed-point single-width saturating add directly.  */
void
expand_vec_ssadd (rtx op_0, rtx op_1, rtx op_2, machine_mode vec_mode)
{
  emit_vec_binary_alu (op_0, op_1, op_2, SS_PLUS, vec_mode);
}

/* Expand the standard name ussub<mode>3 for vector mode, we can leverage
   the vector fixed-point single-width saturating subtract directly.  */
void
expand_vec_ussub (rtx op_0, rtx op_1, rtx op_2, machine_mode vec_mode)
{
  emit_vec_binary_alu (op_0, op_1, op_2, US_MINUS, vec_mode);
}

/* Expand the standard name sssub<mode>3 for vector mode, we can leverage
   the vector fixed-point single-width saturating subtract directly.  */
void
expand_vec_sssub (rtx op_0, rtx op_1, rtx op_2, machine_mode vec_mode)
{
  emit_vec_binary_alu (op_0, op_1, op_2, SS_MINUS, vec_mode);
}

/* Expand the standard name ustrunc<m><n>2 for double vector mode, like
   DI => SI.  We can leverage the vector fixed-point narrowing clip
   directly.  */
void
expand_vec_double_ustrunc (rtx op_0, rtx op_1, machine_mode vec_mode)
{
  insn_code icode;
  rtx zero = CONST0_RTX (Xmode);
  enum unspec unspec = UNSPEC_VNCLIPU;
  rtx ops[] = {op_0, op_1, zero};

  icode = code_for_pred_narrow_clip_scalar (unspec, vec_mode);
  emit_vlmax_insn (icode, BINARY_OP_VXRM_RNU, ops);
}

/* Expand the standard name sstrunc<m><n>2 for double vector mode, like
   DI => SI.  We can leverage the vector fixed-point narrowing clip
   directly.  */
void
expand_vec_double_sstrunc (rtx op_0, rtx op_1, machine_mode vec_mode)
{
  insn_code icode;
  rtx zero = CONST0_RTX (Xmode);
  enum unspec unspec = UNSPEC_VNCLIP;
  rtx ops[] = {op_0, op_1, zero};

  icode = code_for_pred_narrow_clip_scalar (unspec, vec_mode);
  emit_vlmax_insn (icode, BINARY_OP_VXRM_RNU, ops);
}

/* Expand the standard name ustrunc<m><n>2 for quad vector mode, like
   DI => HI.  We can leverage the vector fixed-point narrowing clip
   directly.  */
void
expand_vec_quad_ustrunc (rtx op_0, rtx op_1, machine_mode vec_mode,
                         machine_mode double_mode)
{
  rtx double_rtx = gen_reg_rtx (double_mode);

  expand_vec_double_ustrunc (double_rtx, op_1, vec_mode);
  expand_vec_double_ustrunc (op_0, double_rtx, double_mode);
}

/* Expand the standard name sstrunc<m><n>2 for quad vector mode, like
   DI => HI.  We can leverage the vector fixed-point narrowing clip
   directly.  */
void
expand_vec_quad_sstrunc (rtx op_0, rtx op_1, machine_mode vec_mode,
                         machine_mode double_mode)
{
  rtx double_rtx = gen_reg_rtx (double_mode);

  expand_vec_double_sstrunc (double_rtx, op_1, vec_mode);
  expand_vec_double_sstrunc (op_0, double_rtx, double_mode);
}

/* Expand the standard name ustrunc<m><n>2 for oct vector mode, like
   DI => QI.  We can leverage the vector fixed-point narrowing clip
   directly.  */
void
expand_vec_oct_ustrunc (rtx op_0, rtx op_1, machine_mode vec_mode,
                        machine_mode double_mode, machine_mode quad_mode)
{
  rtx double_rtx = gen_reg_rtx (double_mode);
  rtx quad_rtx = gen_reg_rtx (quad_mode);

  expand_vec_double_ustrunc (double_rtx, op_1, vec_mode);
  expand_vec_double_ustrunc (quad_rtx, double_rtx, double_mode);
  expand_vec_double_ustrunc (op_0, quad_rtx, quad_mode);
}

/* Expand the standard name sstrunc<m><n>2 for oct vector mode, like
   DI => QI.  We can leverage the vector fixed-point narrowing clip
   directly.  */
void
expand_vec_oct_sstrunc (rtx op_0, rtx op_1, machine_mode vec_mode,
                        machine_mode double_mode, machine_mode quad_mode)
{
  rtx double_rtx = gen_reg_rtx (double_mode);
  rtx quad_rtx = gen_reg_rtx (quad_mode);

  expand_vec_double_sstrunc (double_rtx, op_1, vec_mode);
  expand_vec_double_sstrunc (quad_rtx, double_rtx, double_mode);
  expand_vec_double_sstrunc (op_0, quad_rtx, quad_mode);
}

/* Vectorize popcount by the Wilkes-Wheeler-Gill algorithm that libgcc uses as
   well.  */
void
expand_popcount (rtx *ops)
{
  rtx dst = ops[0];
  rtx src = ops[1];
  machine_mode mode = GET_MODE (dst);
  scalar_mode imode = GET_MODE_INNER (mode);
  static const uint64_t m5 = 0x5555555555555555ULL;
  static const uint64_t m3 = 0x3333333333333333ULL;
  static const uint64_t mf = 0x0F0F0F0F0F0F0F0FULL;
  static const uint64_t m1 = 0x0101010101010101ULL;

  rtx x1 = gen_reg_rtx (mode);
  rtx x2 = gen_reg_rtx (mode);
  rtx x3 = gen_reg_rtx (mode);
  rtx x4 = gen_reg_rtx (mode);

  /* x1 = src - ((src >> 1) & 0x555...);  */
  rtx shift1 = expand_binop (mode, lshr_optab, src, GEN_INT (1), NULL, true,
                             OPTAB_DIRECT);

  rtx and1 = gen_reg_rtx (mode);
  rtx ops1[] = {and1, shift1, gen_int_mode (m5, imode)};
  emit_vlmax_insn (code_for_pred_scalar (AND, mode), riscv_vector::BINARY_OP,
                   ops1);

  x1 = expand_binop (mode, sub_optab, src, and1, NULL, true, OPTAB_DIRECT);

  /* x2 = (x1 & 0x3333333333333333ULL) + ((x1 >> 2) & 0x3333333333333333ULL);
   */
  rtx and2 = gen_reg_rtx (mode);
  rtx ops2[] = {and2, x1, gen_int_mode (m3, imode)};
  emit_vlmax_insn (code_for_pred_scalar (AND, mode), riscv_vector::BINARY_OP,
                   ops2);

  rtx shift2 = expand_binop (mode, lshr_optab, x1, GEN_INT (2), NULL, true,
                             OPTAB_DIRECT);

  rtx and22 = gen_reg_rtx (mode);
  rtx ops22[] = {and22, shift2, gen_int_mode (m3, imode)};
  emit_vlmax_insn (code_for_pred_scalar (AND, mode), riscv_vector::BINARY_OP,
                   ops22);

  x2 = expand_binop (mode, add_optab, and2, and22, NULL, true, OPTAB_DIRECT);

  /* x3 = (x2 + (x2 >> 4)) & 0x0f0f0f0f0f0f0f0fULL;  */
  rtx shift3 = expand_binop (mode, lshr_optab, x2, GEN_INT (4), NULL, true,
                             OPTAB_DIRECT);

  rtx plus3
    = expand_binop (mode, add_optab, x2, shift3, NULL, true, OPTAB_DIRECT);
  rtx ops3[] = {x3, plus3, gen_int_mode (mf, imode)};
  emit_vlmax_insn (code_for_pred_scalar (AND, mode), riscv_vector::BINARY_OP,
                   ops3);

  /* dest = (x3 * 0x0101010101010101ULL) >> 56;  */
  rtx mul4 = gen_reg_rtx (mode);
  rtx ops4[] = {mul4, x3, gen_int_mode (m1, imode)};
  emit_vlmax_insn (code_for_pred_scalar (MULT, mode), riscv_vector::BINARY_OP,
                   ops4);
  x4 = expand_binop (mode, lshr_optab, mul4,
                     GEN_INT (GET_MODE_BITSIZE (imode) - 8), NULL, true,
                     OPTAB_DIRECT);

  emit_move_insn (dst, x4);
}

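/* Worked example on one 64-bit element with value 0xff:
   x1 = 0xff - (0x7f & 0x55...) = 0xaa    (each 2-bit field holds its count)
   x2 = (0xaa & 0x33) + (0x2a & 0x33) = 0x44  (each 4-bit field holds its count)
   x3 = (0x44 + 0x04) & 0x0f = 0x08       (each byte holds its count)
   x4 = (0x08 * 0x0101...) >> 56 = 8      (sum of the byte counts).  */
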
/* Return true if it is VLMAX AVL TYPE.  */
bool
vlmax_avl_type_p (rtx_insn *rinsn)
{
  extract_insn_cached (rinsn);
  int index = get_attr_avl_type_idx (rinsn);
  if (index == INVALID_ATTRIBUTE)
    return false;
  rtx avl_type = recog_data.operand[index];
  return INTVAL (avl_type) == VLMAX;
}

/* Return true if it is an RVV instruction that depends on the VL global
   status register.  */
bool
has_vl_op (rtx_insn *rinsn)
{
  return recog_memoized (rinsn) >= 0 && get_attr_has_vl_op (rinsn);
}

/* Get default tail policy.  */
static bool
get_default_ta ()
{
  /* For the instruction that doesn't require TA, we still need a default value
     to emit vsetvl.  We pick up the default value according to prefer
     policy.  */
  return (bool) (get_prefer_tail_policy () & 0x1
                 || (get_prefer_tail_policy () >> 1 & 0x1));
}

/* Helper function to get TA operand.  */
bool
tail_agnostic_p (rtx_insn *rinsn)
{
  /* If it doesn't have TA, we return agnostic by default.  */
  extract_insn_cached (rinsn);
  int ta = get_attr_ta (rinsn);
  return ta == INVALID_ATTRIBUTE ? get_default_ta () : IS_AGNOSTIC (ta);
}

/* Change insn and assert that the change always happens.  */
void
validate_change_or_fail (rtx object, rtx *loc, rtx new_rtx, bool in_group)
{
  bool change_p = validate_change (object, loc, new_rtx, in_group);
  gcc_assert (change_p);
}

/* Return true if it is NONVLMAX AVL TYPE.  */
bool
nonvlmax_avl_type_p (rtx_insn *rinsn)
{
  extract_insn_cached (rinsn);
  int index = get_attr_avl_type_idx (rinsn);
  if (index == INVALID_ATTRIBUTE)
    return false;
  rtx avl_type = recog_data.operand[index];
  return INTVAL (avl_type) == NONVLMAX;
}

/* Return true if RTX is RVV VLMAX AVL.  */
bool
vlmax_avl_p (rtx x)
{
  return x && rtx_equal_p (x, RVV_VLMAX);
}

/* Helper function to get SEW operand.  We always have SEW value for
   all RVV instructions that have VTYPE OP.  */
uint8_t
get_sew (rtx_insn *rinsn)
{
  return get_attr_sew (rinsn);
}

/* Helper function to get VLMUL operand.  We always have VLMUL value for
   all RVV instructions that have VTYPE OP.  */
enum vlmul_type
get_vlmul (rtx_insn *rinsn)
{
  return (enum vlmul_type) get_attr_vlmul (rinsn);
}

/* Count the number of REGNO in RINSN.  */
int
count_regno_occurrences (rtx_insn *rinsn, unsigned int regno)
{
  int count = 0;
  extract_insn (rinsn);
  for (int i = 0; i < recog_data.n_operands; i++)
    if (refers_to_regno_p (regno, recog_data.operand[i]))
      count++;
  return count;
}

/* Return true if the OP can be directly broadcasted.  */
bool
can_be_broadcasted_p (rtx op)
{
  machine_mode mode = GET_MODE (op);
  /* We don't allow RA (register allocation) reload to generate
     (vec_duplicate:DI reg) in an RV32 system, whereas we do allow
     (vec_duplicate:DI mem) in an RV32 system.  */
  if (!can_create_pseudo_p () && !FLOAT_MODE_P (mode)
      && maybe_gt (GET_MODE_SIZE (mode), GET_MODE_SIZE (Pmode))
      && !satisfies_constraint_Wdm (op))
    return false;

  if (satisfies_constraint_K (op) || register_operand (op, mode)
      || satisfies_constraint_Wdm (op) || rtx_equal_p (op, CONST0_RTX (mode)))
    return true;

  return can_create_pseudo_p () && nonmemory_operand (op, mode);
}

void
emit_vec_extract (rtx target, rtx src, rtx index)
{
  machine_mode vmode = GET_MODE (src);
  machine_mode smode = GET_MODE (target);
  class expand_operand ops[3];
  enum insn_code icode
    = convert_optab_handler (vec_extract_optab, vmode, smode);
  gcc_assert (icode != CODE_FOR_nothing);
  create_output_operand (&ops[0], target, smode);
  ops[0].target = 1;
  create_input_operand (&ops[1], src, vmode);

  poly_int64 val;
  if (poly_int_rtx_p (index, &val))
    create_integer_operand (&ops[2], val);
  else
    create_input_operand (&ops[2], index, Pmode);

  expand_insn (icode, 3, ops);
  if (ops[0].value != target)
    emit_move_insn (target, ops[0].value);
}

/* Return true if the offset mode is a valid mode that we use for
   gather/scatter autovectorization.  */
bool
gather_scatter_valid_offset_p (machine_mode mode)
{
  /* If the element size of offset mode is already >= Pmode size,
     we don't need any extensions.  */
  if (known_ge (GET_MODE_SIZE (GET_MODE_INNER (mode)), UNITS_PER_WORD))
    return true;

  /* Since we will very likely extend the offset mode into vector Pmode,
     disable gather/scatter autovectorization if we can't extend the offset
     mode into vector Pmode.  */
  if (!get_vector_mode (Pmode, GET_MODE_NUNITS (mode)).exists ())
    return false;

  return true;
}

/* Implement TARGET_ESTIMATED_POLY_VALUE.
   Look into the tuning structure for an estimate.
   KIND specifies the type of requested estimate: min, max or likely.
   For cores with a known VLA width all three estimates are the same.
   For generic VLA tuning we want to distinguish the maximum estimate from
   the minimum and likely ones.
   The likely estimate is the same as the minimum in that case to give a
   conservative behavior of auto-vectorizing with VLA when it is a win
   even for VLA vectorization.
   When VLA width information is available VAL.coeffs[1] is multiplied by
   the number of VLA chunks over the initial VLS bits.  */
HOST_WIDE_INT
estimated_poly_value (poly_int64 val, unsigned int kind)
{
  unsigned int width_source
    = BITS_PER_RISCV_VECTOR.is_constant ()
        ? (unsigned int) BITS_PER_RISCV_VECTOR.to_constant ()
        : (unsigned int) RVV_VECTOR_BITS_SCALABLE;

  /* If there is no core-specific information then the minimum and likely
     values are based on TARGET_MIN_VLEN vectors and the maximum is based on
     the architectural maximum of 65536 bits.  */
  unsigned int min_vlen_bytes = TARGET_MIN_VLEN / 8 - 1;
  if (width_source == RVV_VECTOR_BITS_SCALABLE)
    switch (kind)
      {
      case POLY_VALUE_MIN:
      case POLY_VALUE_LIKELY:
        return val.coeffs[0];

      case POLY_VALUE_MAX:
        return val.coeffs[0] + val.coeffs[1] * min_vlen_bytes;
      }

  /* Allow BITS_PER_RISCV_VECTOR to be a bitmask of different VL, treating the
     lowest as likely.  This could be made more general if future -mtune
     options need it to be.  */
  if (kind == POLY_VALUE_MAX)
    width_source = 1 << floor_log2 (width_source);
  else
    width_source = least_bit_hwi (width_source);

  /* If the core provides width information, use that.  */
  HOST_WIDE_INT over_min_vlen = width_source - TARGET_MIN_VLEN;
  return val.coeffs[0] + val.coeffs[1] * over_min_vlen / TARGET_MIN_VLEN;
}

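/* Numeric example: with TARGET_MIN_VLEN = 128 and a tuning width of 256 bits,
   over_min_vlen = 128, so the poly value {4, 4} (4 plus 4 per extra 128-bit
   chunk) is estimated as 4 + 4 * 128 / 128 = 8.  */
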
/* Return true if it is a whole register-register move.  */
bool
whole_reg_to_reg_move_p (rtx *ops, machine_mode mode, int avl_type_index)
{
  /* An operation is a whole-register move if either
     (1) Its vlmax operand equals VLMAX
     (2) Its vl operand equals the number of units of its mode.  */
  if (register_operand (ops[0], mode)
      && register_operand (ops[3], mode)
      && satisfies_constraint_vu (ops[2])
      && satisfies_constraint_Wc1 (ops[1]))
    {
      if (INTVAL (ops[avl_type_index]) == VLMAX)
        return true;
      /* AVL propagation PASS will transform FIXED-VLMAX with NUNITS < 32
         into NON-VLMAX with LEN = NUNITS.  */
      else if (CONST_INT_P (ops[4])
               && known_eq (INTVAL (ops[4]), GET_MODE_NUNITS (mode)))
        return true;
    }
  return false;
}

/* Return true if we can transform vmv.v.x/vfmv.v.f to vmv.s.x/vfmv.s.f.  */
bool
splat_to_scalar_move_p (rtx *ops)
{
  return satisfies_constraint_Wc1 (ops[1])
         && satisfies_constraint_vu (ops[2])
         && satisfies_constraint_c01 (ops[4])
         && INTVAL (ops[7]) == NONVLMAX
         && known_ge (GET_MODE_SIZE (Pmode), GET_MODE_SIZE (GET_MODE (ops[3])));
}

} // namespace riscv_vector