/* Cost model implementation for RISC-V 'V' Extension for GNU compiler.
   Copyright (C) 2023 Free Software Foundation, Inc.
   Contributed by Juzhe Zhong (juzhe.zhong@rivai.ai), RiVAI Technologies Ltd.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.

GCC is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
#define IN_TARGET_CODE 1

#define INCLUDE_STRING
#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "rtl.h"
#include "basic-block.h"
#include "function.h"
#include "bitmap.h"
#include "ssa.h"
#include "gimple.h"
#include "targhooks.h"
#include "cfgloop.h"
#include "fold-const.h"
#include "tm_p.h"
#include "tree-vectorizer.h"
#include "gimple-iterator.h"
#include "tree-data-ref.h"
#include "tree-ssa-loop-niter.h"

/* This file should be included last.  */
#include "riscv-vector-costs.h"
namespace riscv_vector {
/* Dynamic LMUL philosophy - Local linear-scan SSA live range based analysis
   to determine LMUL:

     - Collect all vectorized STMTs locally for each loop block.
     - Build a program point based graph, ignoring non-vectorized STMTs:

	 vectorized STMT 0 - point 0
	 scalar STMT 0     - ignored
	 vectorized STMT 1 - point 1
	 ...
     - Compute the number of V_REGs live at each program point.
     - Determine LMUL in the VECTOR COST model according to the program
       point which has the maximum number of live V_REGs.

   Note:

     - BIGGEST_MODE is the biggest LMUL auto-vectorization element mode.
       It's important for mixed-size auto-vectorization (conversions, etc.).
       E.g. for a loop vectorizing a conversion of INT32 -> INT64, the
       biggest mode is DImode with LMUL = 8, and LMUL = 4 for SImode.
       We compute the number of live V_REGs at each program point according
       to this information.
     - We only compute program points and live ranges locally (within a
       block) since we just need the number of live V_REGs at each program
       point and we are not really allocating the registers for each SSA.
       We can give a variable another local live range in another block if
       it lives out of/into another block.  Such an approach doesn't affect
       our accurate live range analysis.
     - The current analysis doesn't consider instruction scheduling, which
       may reduce register pressure, so we do the analysis conservatively
       and may end up with a smaller LMUL.
       TODO: Maybe we could support a reasonable live range shrinking
       algorithm that takes advantage of instruction scheduling.
     - We may have the following possible auto-vectorization mode analyses:

	 1. M8 -> M4 -> M2 -> M1 (stop analysis here) -> MF2 -> MF4 -> MF8
	 2. M8 -> M1(M4) -> MF2(M2) -> MF4(M1) (stop analysis here) -> MF8(MF2)
	 3. M1(M8) -> MF2(M4) -> MF4(M2) -> MF8(M1)
*/
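
/* As an illustration of the analysis above (the numbers are hypothetical),
   suppose some program point has 4 SSA names live simultaneously and
   BIGGEST_MODE is DImode.  At LMUL = 8 each DImode vector value occupies
   8 V_REGs, so that point needs 4 * 8 = 32 V_REGs, i.e. the whole RVV
   register file; the cost model would then prefer a smaller LMUL for this
   loop.  */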
static bool
is_gimple_assign_or_call (gimple *stmt)
{
  return is_gimple_assign (stmt) || is_gimple_call (stmt);
}
/* Return the program point of the first vectorized lanes statement.  */
static unsigned int
get_first_lane_point (const vec<stmt_point> program_points,
		      stmt_vec_info stmt_info)
{
  for (const auto program_point : program_points)
    if (program_point.stmt_info == DR_GROUP_FIRST_ELEMENT (stmt_info))
      return program_point.point;
  return 0;
}
/* Return the program point of the last vectorized lanes statement.  */
static unsigned int
get_last_lane_point (const vec<stmt_point> program_points,
		     stmt_vec_info stmt_info)
{
  unsigned int max_point = 0;
  for (auto s = DR_GROUP_FIRST_ELEMENT (stmt_info); s != NULL;
       s = DR_GROUP_NEXT_ELEMENT (s))
    {
      for (const auto program_point : program_points)
	if (program_point.stmt_info == s && program_point.point > max_point)
	  max_point = program_point.point;
    }
  return max_point;
}
/* Return the live range of ARG, looking through conversions when ARG
   itself has no live range recorded yet.  */
static pair *
get_live_range (hash_map<tree, pair> *live_ranges, tree arg)
{
  auto *r = live_ranges->get (arg);
  if (r)
    return r;

  tree t = arg;
  gimple *def_stmt = NULL;
  while (t && TREE_CODE (t) == SSA_NAME && !r
	 && (def_stmt = SSA_NAME_DEF_STMT (t)))
    {
      if (gimple_assign_cast_p (def_stmt))
	{
	  t = gimple_assign_rhs1 (def_stmt);
	  r = live_ranges->get (t);
	}
      else
	/* FIXME: Currently we don't see any fold for
	   non-conversion statements.  */
	t = NULL_TREE;
    }
  if (r)
    return r;

  bool insert_p = live_ranges->put (arg, pair (0, 0));
  gcc_assert (!insert_p);
  return live_ranges->get (arg);
}
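
/* E.g. if ARG has no live range recorded yet but was produced by a cast
   such as _2 = (unsigned int) _1, the walk above falls back to the live
   range recorded for the cast operand _1 instead of inventing a new
   one.  */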
/* Collect all STMTs that are vectorized and compute their program points.
   Note that we don't care about the STMTs that are not vectorized and
   we only build the local graph (within a block) of program points.

   Loop:
     bb 2:
       STMT 1 (be vectorized)	  -- point 0
       STMT 2 (not be vectorized) -- ignored
       STMT 3 (be vectorized)	  -- point 1
       STMT 4 (be vectorized)	  -- point 2
       STMT 5 (be vectorized)	  -- point 3
       ...
     bb 3:
       STMT 1 (be vectorized)	  -- point 0
       STMT 2 (be vectorized)	  -- point 1
       STMT 3 (not be vectorized) -- ignored
       STMT 4 (not be vectorized) -- ignored
       STMT 5 (be vectorized)	  -- point 2
       ...  */
static void
compute_local_program_points (
  vec_info *vinfo,
  hash_map<basic_block, vec<stmt_point>> &program_points_per_bb)
{
  if (loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo))
    {
      class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
      basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
      unsigned int nbbs = loop->num_nodes;
      gimple_stmt_iterator si;
      unsigned int i;
      /* Collect the stmts that are vectorized and mark their program
	 points.  */
      for (i = 0; i < nbbs; i++)
	{
	  int point = 1;
	  basic_block bb = bbs[i];
	  vec<stmt_point> program_points = vNULL;
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Compute local program points for bb %d:\n",
			     bb->index);
	  for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
	    {
	      if (!is_gimple_assign_or_call (gsi_stmt (si)))
		continue;
	      stmt_vec_info stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
	      enum stmt_vec_info_type type
		= STMT_VINFO_TYPE (vect_stmt_to_vectorize (stmt_info));
	      if (type != undef_vec_info_type)
		{
		  stmt_point info = {point, gsi_stmt (si), stmt_info};
		  program_points.safe_push (info);
		  point++;
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_NOTE, vect_location,
				     "program point %d: %G", info.point,
				     gsi_stmt (si));
		}
	    }
	  program_points_per_bb.put (bb, program_points);
	}
    }
}
/* Return the bigger mode of MODE1 and MODE2.  */
static machine_mode
get_biggest_mode (machine_mode mode1, machine_mode mode2)
{
  unsigned int mode1_size = GET_MODE_BITSIZE (mode1).to_constant ();
  unsigned int mode2_size = GET_MODE_BITSIZE (mode2).to_constant ();
  return mode1_size >= mode2_size ? mode1 : mode2;
}
/* Compute local live ranges of each vectorized variable.
   Note that we only compute local live ranges (within a block) since
   local live range information is accurate enough for us to determine
   the LMUL/vectorization factor of the loop.

   Loop:
     bb 2:
       STMT 1 -- point 0
       STMT 2 (def SSA 1) -- point 1
       STMT 3 (use SSA 1) -- point 2
       STMT 4 -- point 3
     bb 3:
       STMT 1 -- point 0
       STMT 2 -- point 1
       STMT 3 -- point 2
       STMT 4 (use SSA 2) -- point 3

   The live range of SSA 1 is [1, 3] in bb 2.
   The live range of SSA 2 is [0, 4] in bb 3.  */
static machine_mode
compute_local_live_ranges (
  const hash_map<basic_block, vec<stmt_point>> &program_points_per_bb,
  hash_map<basic_block, hash_map<tree, pair>> &live_ranges_per_bb)
{
  machine_mode biggest_mode = QImode;
  if (!program_points_per_bb.is_empty ())
    {
      auto_vec<tree> visited_vars;
      unsigned int i;
      for (hash_map<basic_block, vec<stmt_point>>::iterator iter
	   = program_points_per_bb.begin ();
	   iter != program_points_per_bb.end (); ++iter)
	{
	  basic_block bb = (*iter).first;
	  vec<stmt_point> program_points = (*iter).second;
	  bool existed_p = false;
	  hash_map<tree, pair> *live_ranges
	    = &live_ranges_per_bb.get_or_insert (bb, &existed_p);
	  gcc_assert (!existed_p);
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Compute local live ranges for bb %d:\n",
			     bb->index);
	  for (const auto program_point : program_points)
	    {
	      unsigned int point = program_point.point;
	      gimple *stmt = program_point.stmt;
	      stmt_vec_info stmt_info = program_point.stmt_info;
	      tree lhs = gimple_get_lhs (stmt);
	      if (lhs != NULL_TREE && is_gimple_reg (lhs)
		  && (!POINTER_TYPE_P (TREE_TYPE (lhs))
		      || STMT_VINFO_TYPE (vect_stmt_to_vectorize (stmt_info))
			   != store_vec_info_type))
		{
		  biggest_mode
		    = get_biggest_mode (biggest_mode,
					TYPE_MODE (TREE_TYPE (lhs)));
		  bool existed_p = false;
		  pair &live_range
		    = live_ranges->get_or_insert (lhs, &existed_p);
		  gcc_assert (!existed_p);
		  if (STMT_VINFO_MEMORY_ACCESS_TYPE (program_point.stmt_info)
		      == VMAT_LOAD_STORE_LANES)
		    point = get_first_lane_point (program_points,
						  program_point.stmt_info);
		  live_range = pair (point, point);
		}
	      for (i = 0; i < gimple_num_args (stmt); i++)
		{
		  tree var = gimple_arg (stmt, i);
		  /* Both IMM and REG are included since a VECTOR_CST may be
		     potentially held in a vector register.  However, it's
		     not accurate, since a PLUS_EXPR can be vectorized into
		     vadd.vi if the IMM is in the range [-16, 15].

		     TODO: We may be able to elide the unnecessary IMM cases
		     in the future.  */
		  if (poly_int_tree_p (var)
		      || (is_gimple_val (var)
			  && (!POINTER_TYPE_P (TREE_TYPE (var))
			      || STMT_VINFO_TYPE (
				   vect_stmt_to_vectorize (stmt_info))
				   != load_vec_info_type)))
		    {
		      biggest_mode
			= get_biggest_mode (biggest_mode,
					    TYPE_MODE (TREE_TYPE (var)));
		      bool existed_p = false;
		      pair &live_range
			= live_ranges->get_or_insert (var, &existed_p);
		      if (STMT_VINFO_MEMORY_ACCESS_TYPE (
			    program_point.stmt_info)
			  == VMAT_LOAD_STORE_LANES)
			point = get_last_lane_point (program_points,
						     program_point.stmt_info);
		      else if (existed_p)
			point = MAX (live_range.second, point);
		      if (existed_p)
			/* We will grow the live range for each use.  */
			live_range = pair (live_range.first, point);
		      else
			{
			  gimple *def_stmt;
			  if (TREE_CODE (var) == SSA_NAME
			      && (def_stmt = SSA_NAME_DEF_STMT (var))
			      && gimple_bb (def_stmt) == bb
			      && is_gimple_assign_or_call (def_stmt))
			    {
			      live_ranges->remove (var);
			      for (unsigned int j = 0;
				   j < gimple_num_args (def_stmt); j++)
				{
				  tree arg = gimple_arg (def_stmt, j);
				  auto *r
				    = get_live_range (live_ranges, arg);
				  (*r).second = MAX (point, (*r).second);
				}
			    }
			  else
			    /* The splat vector lives the whole block.  */
			    live_range
			      = pair (0, program_points.length ());
			}
		    }
		}
	    }
	  if (dump_enabled_p ())
	    for (hash_map<tree, pair>::iterator iter = live_ranges->begin ();
		 iter != live_ranges->end (); ++iter)
	      dump_printf_loc (MSG_NOTE, vect_location,
			       "%T: type = %T, start = %d, end = %d\n",
			       (*iter).first, TREE_TYPE ((*iter).first),
			       (*iter).second.first, (*iter).second.second);
	}
    }
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "Biggest mode = %s\n",
		     GET_MODE_NAME (biggest_mode));
  return biggest_mode;
}
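
/* Tracing the example in the comment above: the def of SSA 1 at point 1
   starts its range as [1, 1], the use at point 2 grows it to [1, 2], and
   a constant operand that must be splat into a vector register is given
   the whole-block range [0, program_points.length ()].  */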
/* Compute the number of V_REGs that a value of MODE occupies, given
   BIGGEST_MODE and the LMUL used for BIGGEST_MODE.

   E.g. if mode = SImode, biggest_mode = DImode and LMUL = 4, then SImode
   is vectorized with half the LMUL of DImode (i.e. RVVM2SImode), so a
   value occupies 2 V_REGs.  */
static unsigned int
compute_nregs_for_mode (machine_mode mode, machine_mode biggest_mode,
			int lmul)
{
  unsigned int mode_size = GET_MODE_SIZE (mode).to_constant ();
  unsigned int biggest_size = GET_MODE_SIZE (biggest_mode).to_constant ();
  gcc_assert (biggest_size >= mode_size);
  unsigned int ratio = biggest_size / mode_size;
  return MAX (lmul / ratio, 1);
}
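
/* Note the clamp to a whole register: e.g. compute_nregs_for_mode (QImode,
   DImode, 4) gives ratio = 8 / 1 = 8 and lmul / ratio = 0, so the result
   is MAX (0, 1) = 1, since even a fractional-LMUL value still occupies
   (part of) one V_REG.  */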
/* This function helps to determine whether the current LMUL will cause
   potential vector register (V_REG) spills according to live range
   information.

     - First, compute how many variables are live at each program point
       in each bb of the loop.
     - Second, compute how many V_REGs are live at each program point
       in each bb of the loop, according to BIGGEST_MODE and the variable
       modes.
     - Third, return the maximum number of live V_REGs of the loop.  */
static unsigned int
max_number_of_live_regs (const basic_block bb,
			 const hash_map<tree, pair> &live_ranges,
			 unsigned int max_point, machine_mode biggest_mode,
			 int lmul)
{
  unsigned int max_nregs = 0;
  unsigned int i;
  unsigned int live_point = 0;
  auto_vec<unsigned int> live_vars_vec;
  live_vars_vec.safe_grow_cleared (max_point, true);
  for (hash_map<tree, pair>::iterator iter = live_ranges.begin ();
       iter != live_ranges.end (); ++iter)
    {
      tree var = (*iter).first;
      pair live_range = (*iter).second;
      for (i = live_range.first + 1; i <= live_range.second; i++)
	{
	  machine_mode mode = TYPE_MODE (TREE_TYPE (var));
	  unsigned int nregs
	    = compute_nregs_for_mode (mode, biggest_mode, lmul);
	  live_vars_vec[i] += nregs;
	  if (live_vars_vec[i] > max_nregs)
	    {
	      max_nregs = live_vars_vec[i];
	      live_point = i;
	    }
	}
    }

  /* Collect user explicit RVV types.  */
  auto_vec<basic_block> all_preds
    = get_all_dominated_blocks (CDI_POST_DOMINATORS, bb);
  tree t;
  FOR_EACH_SSA_NAME (i, t, cfun)
    {
      machine_mode mode = TYPE_MODE (TREE_TYPE (t));
      if (!lookup_vector_type_attribute (TREE_TYPE (t))
	  && !riscv_v_ext_vls_mode_p (mode))
	continue;

      gimple *def = SSA_NAME_DEF_STMT (t);
      if (gimple_bb (def) && !all_preds.contains (gimple_bb (def)))
	continue;
      use_operand_p use_p;
      imm_use_iterator iterator;

      FOR_EACH_IMM_USE_FAST (use_p, iterator, t)
	{
	  if (!USE_STMT (use_p) || is_gimple_debug (USE_STMT (use_p))
	      || !dominated_by_p (CDI_POST_DOMINATORS, bb,
				  gimple_bb (USE_STMT (use_p))))
	    continue;

	  int regno_alignment = riscv_get_v_regno_alignment (mode);
	  max_nregs += regno_alignment;
	  if (dump_enabled_p ())
	    dump_printf_loc (
	      MSG_NOTE, vect_location,
	      "Explicitly used SSA %T, vectype = %T, mode = %s, causes %d "
	      "V_REGs live in bb %d at program point %d\n",
	      t, TREE_TYPE (t), GET_MODE_NAME (mode), regno_alignment,
	      bb->index, live_point);
	  break;
	}
    }

  if (dump_enabled_p ())
    dump_printf_loc (
      MSG_NOTE, vect_location,
      "Maximum lmul = %d, at most %d live V_REGs at program point %d "
      "for bb %d\n",
      lmul, max_nregs, live_point, bb->index);
  return max_nregs;
}
/* Get the STORE value.  */
static tree
get_store_value (gimple *stmt)
{
  if (is_gimple_call (stmt) && gimple_call_internal_p (stmt))
    {
      if (gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
	return gimple_call_arg (stmt, 3);
      else
	gcc_unreachable ();
    }
  else
    return gimple_assign_rhs1 (stmt);
}
/* Return true if STMT_INFO is a non-contiguous load/store.  */
static bool
non_contiguous_memory_access_p (stmt_vec_info stmt_info)
{
  enum stmt_vec_info_type type
    = STMT_VINFO_TYPE (vect_stmt_to_vectorize (stmt_info));
  return ((type == load_vec_info_type || type == store_vec_info_type)
	  && !adjacent_dr_p (STMT_VINFO_DATA_REF (stmt_info)));
}
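
/* E.g. a strided access such as a[2 * i] or an indexed (gather/scatter)
   access such as a[b[i]] is non-contiguous, while a unit-stride access
   a[i] is adjacent and returns false here.  */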
/* Return the LMUL of the current analysis.  */
static int
compute_estimated_lmul (loop_vec_info loop_vinfo, machine_mode mode)
{
  gcc_assert (GET_MODE_BITSIZE (mode).is_constant ());
  int regno_alignment
    = riscv_get_v_regno_alignment (loop_vinfo->vector_mode);
  if (riscv_v_ext_vls_mode_p (loop_vinfo->vector_mode))
    return regno_alignment;
  else if (known_eq (LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo), 1U))
    {
      int estimated_vf = vect_vf_for_cost (loop_vinfo);
      return estimated_vf * GET_MODE_BITSIZE (mode).to_constant ()
	     / TARGET_MIN_VLEN;
    }
  else
    {
      /* Estimate the VLA SLP LMUL.  */
      if (regno_alignment > RVV_M1)
	return regno_alignment;
      else if (mode != QImode
	       || LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo)
		    .is_constant ())
	{
	  int ratio;
	  if (can_div_trunc_p (BYTES_PER_RISCV_VECTOR,
			       GET_MODE_SIZE (loop_vinfo->vector_mode),
			       &ratio))
	    {
	      if (ratio == 1)
		return RVV_M4;
	      else if (ratio == 2)
		return RVV_M2;
	    }
	}
    }
  return 0;
}
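
/* As a worked example (illustrative numbers only): with TARGET_MIN_VLEN
   = 128, mode = SImode (32 bits) and an estimated VF of 16 elements, the
   non-SLP path above estimates LMUL = 16 * 32 / 128 = 4.  */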
/* Update the live ranges according to PHIs.

   Loop:
     bb 2:
       STMT 1 -- point 0
       STMT 2 (def SSA 1) -- point 1
       STMT 3 (use SSA 1) -- point 2
       STMT 4 -- point 3
     bb 3:
       STMT 1 -- point 0
       STMT 2 -- point 1
       STMT 3 (use SSA 2) -- point 2
       STMT 4 -- point 3

   Before this function, the live range of SSA 1 is [2, 3] in bb 2
   and the live range of SSA 2 is [0, 3] in bb 3.

   Then, after this function, we update the live range of SSA 1 in bb 2
   to [2, 4] since SSA 1 is live out into bb 3.  */
static void
update_local_live_ranges (
  vec_info *vinfo,
  hash_map<basic_block, vec<stmt_point>> &program_points_per_bb,
  hash_map<basic_block, hash_map<tree, pair>> &live_ranges_per_bb,
  machine_mode *biggest_mode)
{
  loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo);
  if (!loop_vinfo)
    return;

  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  unsigned int nbbs = loop->num_nodes;
  unsigned int i, j;
  gphi_iterator psi;
  gimple_stmt_iterator si;
  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Update local program points for bb %d:\n",
			 bb->index);
      for (psi = gsi_start_phis (bb); !gsi_end_p (psi); gsi_next (&psi))
	{
	  gphi *phi = psi.phi ();
	  stmt_vec_info stmt_info = vinfo->lookup_stmt (phi);
	  if (STMT_VINFO_TYPE (vect_stmt_to_vectorize (stmt_info))
	      == undef_vec_info_type)
	    continue;

	  for (j = 0; j < gimple_phi_num_args (phi); j++)
	    {
	      edge e = gimple_phi_arg_edge (phi, j);
	      tree def = gimple_phi_arg_def (phi, j);
	      auto *live_ranges = live_ranges_per_bb.get (bb);
	      auto *live_range = live_ranges->get (def);
	      if (poly_int_tree_p (def))
		{
		  /* Insert the live range of an INTEGER_CST or POLY_CST
		     since we will need to allocate a vector register
		     for it.

		     E.g. # j_17 = PHI <j_11(9), 0(5)> will be transformed
		     into # vect_vec_iv_.8_24 = PHI <_25(9), { 0, ... }(5)>

		     The live range for such a value is short and only
		     lives from program point 0 to 1.  */
		  if (live_range)
		    {
		      unsigned int start = (*live_range).first;
		      (*live_range).first = 0;
		      if (dump_enabled_p ())
			dump_printf_loc (
			  MSG_NOTE, vect_location,
			  "Update %T start point from %d to 0:\n", def,
			  start);
		    }
		  else
		    {
		      live_ranges->put (def, pair (0, 1));
		      auto &program_points
			= (*program_points_per_bb.get (bb));
		      if (program_points.is_empty ())
			{
			  stmt_point info = {1, phi, stmt_info};
			  program_points.safe_push (info);
			}
		      if (dump_enabled_p ())
			dump_printf_loc (MSG_NOTE, vect_location,
					 "Add %T start point from 0 to 1:\n",
					 def);
		    }
		  continue;
		}
	      if (live_range && flow_bb_inside_loop_p (loop, e->src))
		{
		  unsigned int start = (*live_range).first;
		  (*live_range).first = 0;
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_NOTE, vect_location,
				     "Update %T start point from %d to %d:\n",
				     def, start, (*live_range).first);
		}
	      live_ranges = live_ranges_per_bb.get (e->src);
	      if (!program_points_per_bb.get (e->src))
		continue;
	      unsigned int max_point
		= (*program_points_per_bb.get (e->src)).length ();
	      live_range = live_ranges->get (def);
	      if (!live_range)
		continue;

	      unsigned int end = (*live_range).second;
	      (*live_range).second = max_point;
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "Update %T end point from %d to %d:\n", def,
				 end, (*live_range).second);
	    }
	}
      for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
	{
	  if (!is_gimple_assign_or_call (gsi_stmt (si)))
	    continue;
	  stmt_vec_info stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
	  enum stmt_vec_info_type type
	    = STMT_VINFO_TYPE (vect_stmt_to_vectorize (stmt_info));
	  if (non_contiguous_memory_access_p (stmt_info)
	      /* LOAD_LANES/STORE_LANES doesn't need a perm index.  */
	      && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info)
		   != VMAT_LOAD_STORE_LANES)
	    {
	      /* For a non-adjacent load/store STMT, we will potentially
		 convert it into:

		   1. MASK_LEN_GATHER_LOAD (..., perm index).
		   2. Contiguous load/store + VEC_PERM (..., perm index)

		 We are likely to use one more vector variable.  */
	      unsigned int max_point
		= (*program_points_per_bb.get (bb)).length () - 1;
	      auto *live_ranges = live_ranges_per_bb.get (bb);
	      bool existed_p = false;
	      tree var = type == load_vec_info_type
			   ? gimple_get_lhs (gsi_stmt (si))
			   : get_store_value (gsi_stmt (si));
	      tree sel_type = build_nonstandard_integer_type (
		TYPE_PRECISION (TREE_TYPE (var)), 1);
	      *biggest_mode
		= get_biggest_mode (*biggest_mode, TYPE_MODE (sel_type));
	      tree sel = build_decl (UNKNOWN_LOCATION, VAR_DECL,
				     get_identifier ("vect_perm"), sel_type);
	      pair &live_range
		= live_ranges->get_or_insert (sel, &existed_p);
	      gcc_assert (!existed_p);
	      live_range = pair (0, max_point);
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "Add perm index %T, start = 0, end = %d\n",
				 sel, max_point);
	    }
	}
    }
}
/* Compute the maximum live V_REGs.  */
static bool
has_unexpected_spills_p (loop_vec_info loop_vinfo)
{
  /* Compute local program points.
     It's a fast and effective computation.  */
  hash_map<basic_block, vec<stmt_point>> program_points_per_bb;
  compute_local_program_points (loop_vinfo, program_points_per_bb);

  /* Compute local live ranges.  */
  hash_map<basic_block, hash_map<tree, pair>> live_ranges_per_bb;
  machine_mode biggest_mode
    = compute_local_live_ranges (program_points_per_bb, live_ranges_per_bb);

  /* Update live ranges according to PHIs.  */
  update_local_live_ranges (loop_vinfo, program_points_per_bb,
			    live_ranges_per_bb, &biggest_mode);

  int lmul = compute_estimated_lmul (loop_vinfo, biggest_mode);
  /* TODO: We calculate the maximum live vars based on the current STMT
     sequence.  We can support live range shrinking if it gives us a big
     improvement in the future.  */
  if (lmul > RVV_M1)
    {
      if (!live_ranges_per_bb.is_empty ())
	{
	  unsigned int max_nregs = 0;
	  for (hash_map<basic_block, hash_map<tree, pair>>::iterator iter
	       = live_ranges_per_bb.begin ();
	       iter != live_ranges_per_bb.end (); ++iter)
	    {
	      basic_block bb = (*iter).first;
	      unsigned int max_point
		= (*program_points_per_bb.get (bb)).length () + 1;
	      if ((*iter).second.is_empty ())
		continue;
	      /* We prefer a larger LMUL unless it causes register
		 spills.  */
	      unsigned int nregs
		= max_number_of_live_regs (bb, (*iter).second, max_point,
					   biggest_mode, lmul);
	      if (nregs > max_nregs)
		max_nregs = nregs;
	    }
	  live_ranges_per_bb.empty ();
	  if (max_nregs > V_REG_NUM)
	    return true;
	}
    }
  if (!program_points_per_bb.is_empty ())
    {
      for (hash_map<basic_block, vec<stmt_point>>::iterator iter
	   = program_points_per_bb.begin ();
	   iter != program_points_per_bb.end (); ++iter)
	{
	  vec<stmt_point> program_points = (*iter).second;
	  if (!program_points.is_empty ())
	    program_points.release ();
	}
      program_points_per_bb.empty ();
    }
  return false;
}
costs::costs (vec_info *vinfo, bool costing_for_scalar)
  : vector_costs (vinfo, costing_for_scalar)
{
  if (costing_for_scalar)
    m_cost_type = SCALAR_COST;
  else if (riscv_v_ext_vector_mode_p (vinfo->vector_mode))
    m_cost_type = VLA_VECTOR_COST;
  else
    m_cost_type = VLS_VECTOR_COST;
}
/* Do one-time initialization of the costs given that we're
   costing the loop vectorization described by LOOP_VINFO.  */
void
costs::analyze_loop_vinfo (loop_vec_info loop_vinfo)
{
  /* Record the number of times that the vector loop would execute,
     if known.  */
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  auto scalar_niters = max_stmt_executions_int (loop);
  if (scalar_niters >= 0)
    {
      unsigned int vf = vect_vf_for_cost (loop_vinfo);
      if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
	m_num_vector_iterations = scalar_niters / vf;
      else
	m_num_vector_iterations = CEIL (scalar_niters, vf);
    }
  /* Detect whether we're vectorizing for VLA and should apply the unrolling
     heuristic described above m_unrolled_vls_niters.  */
  record_potential_vls_unrolling (loop_vinfo);

  /* Detect whether the LOOP has unexpected spills.  */
  record_potential_unexpected_spills (loop_vinfo);
}
/* Analyze the vectorized program statements and use the dynamic LMUL
   heuristic to detect whether the loop has unexpected spills.  */
void
costs::record_potential_unexpected_spills (loop_vec_info loop_vinfo)
{
  /* We only want to apply the heuristic if LOOP_VINFO is being vectorized
     for VLA, or for VLS with known NITERS.  */
  if (riscv_autovec_lmul == RVV_DYNAMIC
      && (m_cost_type == VLA_VECTOR_COST
	  || (m_cost_type == VLS_VECTOR_COST
	      && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))))
    {
      bool post_dom_available_p = dom_info_available_p (CDI_POST_DOMINATORS);
      if (!post_dom_available_p)
	calculate_dominance_info (CDI_POST_DOMINATORS);
      m_has_unexpected_spills_p = has_unexpected_spills_p (loop_vinfo);
      if (!post_dom_available_p)
	free_dominance_info (CDI_POST_DOMINATORS);
    }
}
/* Decide whether to use the unrolling heuristic described above
   m_unrolled_vls_niters, updating that field if so.  LOOP_VINFO
   describes the loop that we're vectorizing.  */
void
costs::record_potential_vls_unrolling (loop_vec_info loop_vinfo)
{
  /* We only want to apply the heuristic if LOOP_VINFO is being
     vectorized for VLA.  */
  if (m_cost_type != VLA_VECTOR_COST)
    return;

  /* We don't want to apply the heuristic to outer loops, since it's
     harder to track two levels of unrolling.  */
  if (LOOP_VINFO_LOOP (loop_vinfo)->inner)
    return;

  /* Only handle cases in which the number of VLS iterations
     would be known at compile time but the number of VLA iterations
     would not.  */
  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      || BYTES_PER_RISCV_VECTOR.is_constant ())
    return;

  /* Guess how many times the VLS loop would iterate and make
     sure that it is within the complete unrolling limit.  Even if the
     number of iterations is small enough, the number of statements might
     not be, which is why we need to estimate the number of statements
     too.  */
  unsigned int vls_vf = vect_vf_for_cost (loop_vinfo);
  unsigned HOST_WIDE_INT unrolled_vls_niters
    = LOOP_VINFO_INT_NITERS (loop_vinfo) / vls_vf;
  if (unrolled_vls_niters > (unsigned int) param_max_completely_peel_times)
    return;

  /* Record that we're applying the heuristic and should try to estimate
     the number of statements in the VLS loop.  */
  m_unrolled_vls_niters = unrolled_vls_niters;
}
/* Return true if (a) we're applying the VLS vs. VLA unrolling
   heuristic described above m_unrolled_vls_niters and (b) the heuristic
   says that we should prefer the VLS loop.  */
bool
costs::prefer_unrolled_loop () const
{
  if (!m_unrolled_vls_stmts)
    return false;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "Number of insns in"
		     " unrolled VLS loop = " HOST_WIDE_INT_PRINT_UNSIGNED
		     "\n",
		     m_unrolled_vls_stmts);

  /* The balance here is tricky.  On the one hand, we can't be sure whether
     the code is vectorizable with VLS or not.  However, even if
     it isn't vectorizable with VLS, there's a possibility that
     the scalar code could also be unrolled.  Some of the code might then
     benefit from SLP, or from combining adjacent loads and stores.  We
     therefore apply the heuristic regardless of can_use_vls_p.  */
  return (m_unrolled_vls_stmts
	  && (m_unrolled_vls_stmts
	      <= (unsigned int) param_max_completely_peeled_insns));
}
bool
costs::better_main_loop_than_p (const vector_costs *uncast_other) const
{
  auto other = static_cast<const costs *> (uncast_other);
  auto this_loop_vinfo = as_a<loop_vec_info> (this->m_vinfo);
  auto other_loop_vinfo = as_a<loop_vec_info> (other->m_vinfo);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "Comparing two main loops (%s at VF %d vs %s at VF %d)\n",
		     GET_MODE_NAME (this_loop_vinfo->vector_mode),
		     vect_vf_for_cost (this_loop_vinfo),
		     GET_MODE_NAME (other_loop_vinfo->vector_mode),
		     vect_vf_for_cost (other_loop_vinfo));

  /* Apply the unrolling heuristic described above m_unrolled_vls_niters.  */
  if (bool (m_unrolled_vls_stmts) != bool (other->m_unrolled_vls_stmts))
    {
      bool this_prefer_unrolled = this->prefer_unrolled_loop ();
      bool other_prefer_unrolled = other->prefer_unrolled_loop ();
      if (this_prefer_unrolled != other_prefer_unrolled)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Preferring VLS loop because"
			     " it can be unrolled\n");
	  return other_prefer_unrolled;
	}
    }
  else if (riscv_autovec_lmul == RVV_DYNAMIC)
    {
      if (other->m_has_unexpected_spills_p)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Preferring smaller LMUL loop because"
			     " it has unexpected spills\n");
	  return true;
	}
      else if (riscv_v_ext_vector_mode_p (other_loop_vinfo->vector_mode))
	{
	  if (LOOP_VINFO_NITERS_KNOWN_P (other_loop_vinfo))
	    {
	      if (maybe_gt (LOOP_VINFO_INT_NITERS (this_loop_vinfo),
			    LOOP_VINFO_VECT_FACTOR (this_loop_vinfo)))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_NOTE, vect_location,
				     "Keep current LMUL loop because"
				     " known NITERS exceed the new VF\n");
		  return false;
		}
	    }
	  else
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "Keep current LMUL loop because"
				 " NITERS is unknown\n");
	      return false;
	    }
	}
    }

  return vector_costs::better_main_loop_than_p (other);
}
unsigned
costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
		      stmt_vec_info stmt_info, slp_tree, tree vectype,
		      int misalign, vect_cost_model_location where)
{
  int stmt_cost
    = targetm.vectorize.builtin_vectorization_cost (kind, vectype, misalign);

  /* Do one-time initialization based on the vinfo.  */
  loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo);
  if (!m_analyzed_vinfo)
    {
      if (loop_vinfo)
	analyze_loop_vinfo (loop_vinfo);

      m_analyzed_vinfo = true;
    }

  if (loop_vinfo)
    {
      /* If we're applying the VLA vs. VLS unrolling heuristic,
	 estimate the number of statements in the unrolled VLS
	 loop.  For simplicity, we assume that one iteration of the
	 VLS loop would need the same number of statements
	 as one iteration of the VLA loop.  */
      if (where == vect_body && m_unrolled_vls_niters)
	m_unrolled_vls_stmts += count * m_unrolled_vls_niters;
    }

  return record_stmt_cost (stmt_info, where, count * stmt_cost);
}
void
costs::finish_cost (const vector_costs *scalar_costs)
{
  vector_costs::finish_cost (scalar_costs);
}

} // namespace riscv_vector