RISC-V: Count pointer type SSA into RVV regs liveness for dynamic LMUL cost model
[official-gcc.git] / gcc / config / riscv / riscv-vector-costs.cc

/* Cost model implementation for RISC-V 'V' Extension for GNU compiler.
   Copyright (C) 2023-2023 Free Software Foundation, Inc.
   Contributed by Juzhe Zhong (juzhe.zhong@rivai.ai), RiVAI Technologies Ltd.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.

GCC is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#define IN_TARGET_CODE 1

#define INCLUDE_STRING
#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "tm.h"
#include "target.h"
#include "function.h"
#include "tree.h"
#include "basic-block.h"
#include "rtl.h"
#include "gimple.h"
#include "targhooks.h"
#include "cfgloop.h"
#include "fold-const.h"
#include "tm_p.h"
#include "tree-vectorizer.h"
#include "gimple-iterator.h"
#include "bitmap.h"
#include "ssa.h"
#include "backend.h"
#include "tree-data-ref.h"
#include "tree-ssa-loop-niter.h"

/* This file should be included last.  */
#include "riscv-vector-costs.h"

namespace riscv_vector {

/* Dynamic LMUL philosophy - Local linear-scan SSA live range based analysis
   to determine LMUL.

     - Collect all vectorized STMTs locally for each loop block.
     - Build a program point based graph, ignoring non-vectorized STMTs:

	 vectorized STMT 0 - point 0
	 scalar STMT 0     - ignored
	 vectorized STMT 1 - point 1
	 ...
     - Compute the number of V_REGs live at each program point.
     - Determine LMUL in the VECTOR COST model according to the program
       point which has the maximum number of live V_REGs.

   Note:

     - BIGGEST_MODE is the biggest LMUL auto-vectorization element mode.
       It's important for mixed-size auto-vectorization (conversions, etc.).
       E.g. for a loop vectorizing a conversion of INT32 -> INT64, the
       biggest mode is DImode; with LMUL = 8 for DImode, SImode uses
       LMUL = 4.  We compute the number of live V_REGs at each program
       point according to this information.
     - We only compute program points and live ranges locally (within a
       block) since we just need the number of live V_REGs at each program
       point and we are not really allocating the registers for each SSA.
       We can give a variable another local live range in another block if
       it is live out of/into that block.  Such an approach doesn't affect
       the accuracy of our live range analysis.
     - The current analysis doesn't consider instruction scheduling, which
       may reduce register pressure, so we are conservative and may end up
       with a smaller LMUL.
       TODO: Maybe we could support a reasonable live range shrinking
       algorithm which takes advantage of instruction scheduling.
     - We may have the following possible autovec mode analyses:

	 1. M8 -> M4 -> M2 -> M1 (stop analysis here) -> MF2 -> MF4 -> MF8
	 2. M8 -> M1(M4) -> MF2(M2) -> MF4(M1) (stop analysis here) -> MF8(MF2)
	 3. M1(M8) -> MF2(M4) -> MF4(M2) -> MF8(M1)
*/

static bool
is_gimple_assign_or_call (gimple *stmt)
{
  return is_gimple_assign (stmt) || is_gimple_call (stmt);
}

/* Return the program point of the 1st vectorized lanes statement.  */
static unsigned int
get_first_lane_point (const vec<stmt_point> program_points,
		      stmt_vec_info stmt_info)
{
  for (const auto program_point : program_points)
    if (program_point.stmt_info == DR_GROUP_FIRST_ELEMENT (stmt_info))
      return program_point.point;
  return 0;
}

/* Return the program point of the last vectorized lanes statement.  */
static unsigned int
get_last_lane_point (const vec<stmt_point> program_points,
		     stmt_vec_info stmt_info)
{
  unsigned int max_point = 0;
  for (auto s = DR_GROUP_FIRST_ELEMENT (stmt_info); s != NULL;
       s = DR_GROUP_NEXT_ELEMENT (s))
    {
      for (const auto program_point : program_points)
	if (program_point.stmt_info == s && program_point.point > max_point)
	  max_point = program_point.point;
    }
  return max_point;
}

/* Return the last variable that is in the live range list.  */
static pair *
get_live_range (hash_map<tree, pair> *live_ranges, tree arg)
{
  auto *r = live_ranges->get (arg);
  if (r)
    return r;
  else
    {
      tree t = arg;
      gimple *def_stmt = NULL;
      while (t && TREE_CODE (t) == SSA_NAME && !r
	     && (def_stmt = SSA_NAME_DEF_STMT (t)))
	{
	  if (gimple_assign_cast_p (def_stmt))
	    {
	      t = gimple_assign_rhs1 (def_stmt);
	      r = live_ranges->get (t);
	      def_stmt = NULL;
	    }
	  else
	    /* FIXME: Currently we don't see any fold for
	       non-conversion statements.  */
	    t = NULL_TREE;
	}
      if (r)
	return r;
      else
	{
	  bool insert_p = live_ranges->put (arg, pair (0, 0));
	  gcc_assert (!insert_p);
	  return live_ranges->get (arg);
	}
    }
}
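
/* Illustrative example (not from the original sources): if ARG is _3 and
   _3 is defined by a cast statement "_3 = (unsigned long) _2" where _2
   already has a recorded range, the lookup above walks through the cast
   and returns the live range of _2 instead of inserting a fresh (0, 0)
   range for _3.  */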

/* Collect all STMTs that are vectorized and compute their program points.
   Note that we don't care about the STMTs that are not vectorized and
   we only build the local graph (within a block) of program points.

   Loop:
     bb 2:
       STMT 1 (vectorized)      -- point 0
       STMT 2 (not vectorized)  -- ignored
       STMT 3 (vectorized)      -- point 1
       STMT 4 (vectorized)      -- point 2
       STMT 5 (vectorized)      -- point 3

     bb 3:
       STMT 1 (vectorized)      -- point 0
       STMT 2 (vectorized)      -- point 1
       STMT 3 (not vectorized)  -- ignored
       STMT 4 (not vectorized)  -- ignored
       STMT 5 (vectorized)      -- point 2  */
static void
compute_local_program_points (
  vec_info *vinfo,
  hash_map<basic_block, vec<stmt_point>> &program_points_per_bb)
{
  if (loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo))
    {
      class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
      basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
      unsigned int nbbs = loop->num_nodes;
      gimple_stmt_iterator si;
      unsigned int i;
      /* Collect the stmts that are vectorized and mark their program
	 points.  */
      for (i = 0; i < nbbs; i++)
	{
	  int point = 1;
	  basic_block bb = bbs[i];
	  vec<stmt_point> program_points = vNULL;
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Compute local program points for bb %d:\n",
			     bb->index);
	  for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
	    {
	      if (!is_gimple_assign_or_call (gsi_stmt (si)))
		continue;
	      stmt_vec_info stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
	      enum stmt_vec_info_type type
		= STMT_VINFO_TYPE (vect_stmt_to_vectorize (stmt_info));
	      if (type != undef_vec_info_type)
		{
		  stmt_point info = {point, gsi_stmt (si), stmt_info};
		  program_points.safe_push (info);
		  point++;
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_NOTE, vect_location,
				     "program point %d: %G", info.point,
				     gsi_stmt (si));
		}
	    }
	  program_points_per_bb.put (bb, program_points);
	}
    }
}

/* Return the wider of MODE1 and MODE2, comparing bit sizes.  */
static machine_mode
get_biggest_mode (machine_mode mode1, machine_mode mode2)
{
  unsigned int mode1_size = GET_MODE_BITSIZE (mode1).to_constant ();
  unsigned int mode2_size = GET_MODE_BITSIZE (mode2).to_constant ();
  return mode1_size >= mode2_size ? mode1 : mode2;
}
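
/* For example (illustrative), get_biggest_mode (SImode, DImode) returns
   DImode since 64 >= 32 bits; scanning a mixed INT32 -> INT64 conversion
   loop therefore settles on DImode as the biggest mode.  */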

/* Compute local live ranges of each vectorized variable.
   Note that we only compute local live ranges (within a block) since
   local live range information is accurate enough for us to determine
   the LMUL/vectorization factor of the loop.

   Loop:
     bb 2:
       STMT 1              -- point 0
       STMT 2 (def SSA 1)  -- point 1
       STMT 3 (use SSA 1)  -- point 2
       STMT 4              -- point 3
     bb 3:
       STMT 1              -- point 0
       STMT 2              -- point 1
       STMT 3              -- point 2
       STMT 4 (use SSA 2)  -- point 3

   The live range of SSA 1 is [1, 3] in bb 2.
   The live range of SSA 2 is [0, 4] in bb 3.  */
static machine_mode
compute_local_live_ranges (
  const hash_map<basic_block, vec<stmt_point>> &program_points_per_bb,
  hash_map<basic_block, hash_map<tree, pair>> &live_ranges_per_bb)
{
  machine_mode biggest_mode = QImode;
  if (!program_points_per_bb.is_empty ())
    {
      auto_vec<tree> visited_vars;
      unsigned int i;
      for (hash_map<basic_block, vec<stmt_point>>::iterator iter
	   = program_points_per_bb.begin ();
	   iter != program_points_per_bb.end (); ++iter)
	{
	  basic_block bb = (*iter).first;
	  vec<stmt_point> program_points = (*iter).second;
	  bool existed_p = false;
	  hash_map<tree, pair> *live_ranges
	    = &live_ranges_per_bb.get_or_insert (bb, &existed_p);
	  gcc_assert (!existed_p);
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Compute local live ranges for bb %d:\n",
			     bb->index);
	  for (const auto program_point : program_points)
	    {
	      unsigned int point = program_point.point;
	      gimple *stmt = program_point.stmt;
	      stmt_vec_info stmt_info = program_point.stmt_info;
	      tree lhs = gimple_get_lhs (stmt);
	      if (lhs != NULL_TREE && is_gimple_reg (lhs)
		  && (!POINTER_TYPE_P (TREE_TYPE (lhs))
		      || STMT_VINFO_TYPE (vect_stmt_to_vectorize (stmt_info))
			   != store_vec_info_type))
		{
		  biggest_mode
		    = get_biggest_mode (biggest_mode,
					TYPE_MODE (TREE_TYPE (lhs)));
		  bool existed_p = false;
		  pair &live_range
		    = live_ranges->get_or_insert (lhs, &existed_p);
		  gcc_assert (!existed_p);
		  if (STMT_VINFO_MEMORY_ACCESS_TYPE (program_point.stmt_info)
		      == VMAT_LOAD_STORE_LANES)
		    point = get_first_lane_point (program_points,
						  program_point.stmt_info);
		  live_range = pair (point, point);
		}
	      for (i = 0; i < gimple_num_args (stmt); i++)
		{
		  tree var = gimple_arg (stmt, i);
		  /* Both IMM and REG are included since a VECTOR_CST may be
		     potentially held in a vector register.  However, it's
		     not accurate, since a PLUS_EXPR can be vectorized into
		     vadd.vi if IMM is -16 ~ 15.

		     TODO: We may elide the unnecessary IMM cases in the
		     future.  */
		  if (poly_int_tree_p (var)
		      || (is_gimple_val (var)
			  && (!POINTER_TYPE_P (TREE_TYPE (var))
			      || STMT_VINFO_TYPE (
				   vect_stmt_to_vectorize (stmt_info))
				   != load_vec_info_type)))
		    {
		      biggest_mode
			= get_biggest_mode (biggest_mode,
					    TYPE_MODE (TREE_TYPE (var)));
		      bool existed_p = false;
		      pair &live_range
			= live_ranges->get_or_insert (var, &existed_p);
		      if (STMT_VINFO_MEMORY_ACCESS_TYPE (
			    program_point.stmt_info)
			  == VMAT_LOAD_STORE_LANES)
			point = get_last_lane_point (program_points,
						     program_point.stmt_info);
		      else if (existed_p)
			point = MAX (live_range.second, point);
		      if (existed_p)
			/* We will grow the live range for each use.  */
			live_range = pair (live_range.first, point);
		      else
			{
			  gimple *def_stmt;
			  if (TREE_CODE (var) == SSA_NAME
			      && (def_stmt = SSA_NAME_DEF_STMT (var))
			      && gimple_bb (def_stmt) == bb
			      && is_gimple_assign_or_call (def_stmt))
			    {
			      live_ranges->remove (var);
			      for (unsigned int j = 0;
				   j < gimple_num_args (def_stmt); j++)
				{
				  tree arg = gimple_arg (def_stmt, j);
				  auto *r = get_live_range (live_ranges, arg);
				  gcc_assert (r);
				  (*r).second = MAX (point, (*r).second);
				}
			    }
			  else
			    /* The splat vector lives the whole block.  */
			    live_range = pair (0, program_points.length ());
			}
		    }
		}
	    }
	  if (dump_enabled_p ())
	    for (hash_map<tree, pair>::iterator iter = live_ranges->begin ();
		 iter != live_ranges->end (); ++iter)
	      dump_printf_loc (MSG_NOTE, vect_location,
			       "%T: type = %T, start = %d, end = %d\n",
			       (*iter).first, TREE_TYPE ((*iter).first),
			       (*iter).second.first, (*iter).second.second);
	}
    }
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "Biggest mode = %s\n",
		     GET_MODE_NAME (biggest_mode));
  return biggest_mode;
}

/* Compute the number of V_REGs that a variable of MODE occupies, given
   BIGGEST_MODE and LMUL, where LMUL is the register group size chosen
   for BIGGEST_MODE.

   E.g. if mode = SImode, biggest_mode = DImode and LMUL = 8, the SImode
   variable effectively uses LMUL = 4, i.e. 4 V_REGs.  */
static unsigned int
compute_nregs_for_mode (machine_mode mode, machine_mode biggest_mode, int lmul)
{
  unsigned int mode_size = GET_MODE_SIZE (mode).to_constant ();
  unsigned int biggest_size = GET_MODE_SIZE (biggest_mode).to_constant ();
  gcc_assert (biggest_size >= mode_size);
  unsigned int ratio = biggest_size / mode_size;
  return MAX (lmul / ratio, 1);
}
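
/* Worked examples (illustrative), with biggest_mode = DImode (8 bytes)
   and lmul = 8: an HImode variable (2 bytes) has ratio = 8 / 2 = 4 and
   occupies MAX (8 / 4, 1) = 2 V_REGs, while a QImode variable has
   ratio = 8 and occupies MAX (8 / 8, 1) = 1 V_REG.  */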

/* This function helps to determine whether the current LMUL will cause
   potential vector register (V_REG) spills according to live range
   information.

     - First, compute how many variables are live at each program point
       in each bb of the loop.
     - Second, compute how many V_REGs are live at each program point
       in each bb of the loop according to BIGGEST_MODE and the variable
       modes.
     - Third, return the maximum number of live V_REGs of the loop.  */
static unsigned int
max_number_of_live_regs (const basic_block bb,
			 const hash_map<tree, pair> &live_ranges,
			 unsigned int max_point, machine_mode biggest_mode,
			 int lmul)
{
  unsigned int max_nregs = 0;
  unsigned int i;
  unsigned int live_point = 0;
  auto_vec<unsigned int> live_vars_vec;
  live_vars_vec.safe_grow_cleared (max_point, true);
  for (hash_map<tree, pair>::iterator iter = live_ranges.begin ();
       iter != live_ranges.end (); ++iter)
    {
      tree var = (*iter).first;
      pair live_range = (*iter).second;
      for (i = live_range.first + 1; i <= live_range.second; i++)
	{
	  machine_mode mode = TYPE_MODE (TREE_TYPE (var));
	  unsigned int nregs
	    = compute_nregs_for_mode (mode, biggest_mode, lmul);
	  live_vars_vec[i] += nregs;
	  if (live_vars_vec[i] > max_nregs)
	    {
	      max_nregs = live_vars_vec[i];
	      live_point = i;
	    }
	}
    }

  /* Collect user explicit RVV types.  */
  auto_vec<basic_block> all_preds
    = get_all_dominated_blocks (CDI_POST_DOMINATORS, bb);
  tree t;
  FOR_EACH_SSA_NAME (i, t, cfun)
    {
      machine_mode mode = TYPE_MODE (TREE_TYPE (t));
      if (!lookup_vector_type_attribute (TREE_TYPE (t))
	  && !riscv_v_ext_vls_mode_p (mode))
	continue;

      gimple *def = SSA_NAME_DEF_STMT (t);
      if (gimple_bb (def) && !all_preds.contains (gimple_bb (def)))
	continue;
      use_operand_p use_p;
      imm_use_iterator iterator;

      FOR_EACH_IMM_USE_FAST (use_p, iterator, t)
	{
	  if (!USE_STMT (use_p) || is_gimple_debug (USE_STMT (use_p))
	      || !dominated_by_p (CDI_POST_DOMINATORS, bb,
				  gimple_bb (USE_STMT (use_p))))
	    continue;

	  int regno_alignment = riscv_get_v_regno_alignment (mode);
	  max_nregs += regno_alignment;
	  if (dump_enabled_p ())
	    dump_printf_loc (
	      MSG_NOTE, vect_location,
	      "Explicitly used SSA %T, vectype = %T, mode = %s, causes %d "
	      "V_REGs live in bb %d at program point %d\n",
	      t, TREE_TYPE (t), GET_MODE_NAME (mode), regno_alignment,
	      bb->index, live_point);
	  break;
	}
    }

  if (dump_enabled_p ())
    dump_printf_loc (
      MSG_NOTE, vect_location,
      "Maximum lmul = %d, at most %d live V_REGs at program "
      "point %d for bb %d\n",
      lmul, max_nregs, live_point, bb->index);
  return max_nregs;
}
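
/* Worked example (illustrative, assuming V_REG_NUM is 32): with
   biggest_mode = DImode and lmul = 8, a program point where one DImode
   variable (8 V_REGs) and two SImode variables (4 V_REGs each) are
   simultaneously live yields live_vars_vec[point] = 8 + 4 + 4 = 16,
   which still fits; two additional live DImode variables at the same
   point would push the count to 32, and anything beyond that would
   indicate a potential spill.  */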

/* Return the value stored by STMT.  */
static tree
get_store_value (gimple *stmt)
{
  if (is_gimple_call (stmt) && gimple_call_internal_p (stmt))
    {
      if (gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
	return gimple_call_arg (stmt, 3);
      else
	gcc_unreachable ();
    }
  else
    return gimple_assign_rhs1 (stmt);
}
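
/* E.g. for an internal call ".MASK_STORE (addr, align, mask, value)"
   the stored value is call argument 3, while for a plain assignment
   "MEM[...] = value_4" it is the first operand of the RHS.  */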

/* Return true if STMT_INFO is a non-contiguous load/store.  */
static bool
non_contiguous_memory_access_p (stmt_vec_info stmt_info)
{
  enum stmt_vec_info_type type
    = STMT_VINFO_TYPE (vect_stmt_to_vectorize (stmt_info));
  return ((type == load_vec_info_type || type == store_vec_info_type)
	  && !adjacent_dr_p (STMT_VINFO_DATA_REF (stmt_info)));
}

/* Return the LMUL of the current analysis.  */
static int
compute_estimated_lmul (loop_vec_info loop_vinfo, machine_mode mode)
{
  gcc_assert (GET_MODE_BITSIZE (mode).is_constant ());
  int regno_alignment = riscv_get_v_regno_alignment (loop_vinfo->vector_mode);
  if (riscv_v_ext_vls_mode_p (loop_vinfo->vector_mode))
    return regno_alignment;
  else if (known_eq (LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo), 1U))
    {
      int estimated_vf = vect_vf_for_cost (loop_vinfo);
      return estimated_vf * GET_MODE_BITSIZE (mode).to_constant ()
	     / TARGET_MIN_VLEN;
    }
  else
    {
      /* Estimate the VLA SLP LMUL.  */
      if (regno_alignment > RVV_M1)
	return regno_alignment;
      else if (mode != QImode
	       || LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo).is_constant ())
	{
	  int ratio;
	  if (can_div_trunc_p (BYTES_PER_RISCV_VECTOR,
			       GET_MODE_SIZE (loop_vinfo->vector_mode),
			       &ratio))
	    {
	      if (ratio == 1)
		return RVV_M4;
	      else if (ratio == 2)
		return RVV_M2;
	    }
	}
    }
  return 0;
}
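
/* Worked example (illustrative): for a non-SLP VLA loop whose biggest
   element mode is SImode (32 bits), an estimated VF of 16 on a
   TARGET_MIN_VLEN = 128 target gives 16 * 32 / 128 = 4, i.e. the loop
   is estimated to run at LMUL = 4.  */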

/* Update the live ranges according to PHIs.

   Loop:
     bb 2:
       STMT 1              -- point 0
       STMT 2 (def SSA 1)  -- point 1
       STMT 3 (use SSA 1)  -- point 2
       STMT 4              -- point 3
     bb 3:
       SSA 2 = PHI<SSA 1>
       STMT 1              -- point 0
       STMT 2              -- point 1
       STMT 3 (use SSA 2)  -- point 2
       STMT 4              -- point 3

   Before this function, the live range of SSA 1 is [2, 3] in bb 2
   and that of SSA 2 is [0, 3] in bb 3.

   After this function, we update the live range of SSA 1 in bb 2
   to [2, 4] since SSA 1 is live out into bb 3.  */
static void
update_local_live_ranges (
  vec_info *vinfo,
  hash_map<basic_block, vec<stmt_point>> &program_points_per_bb,
  hash_map<basic_block, hash_map<tree, pair>> &live_ranges_per_bb,
  machine_mode *biggest_mode)
{
  loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo);
  if (!loop_vinfo)
    return;

  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  unsigned int nbbs = loop->num_nodes;
  unsigned int i, j;
  gphi_iterator psi;
  gimple_stmt_iterator si;
  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Update local program points for bb %d:\n",
			 bbs[i]->index);
      for (psi = gsi_start_phis (bb); !gsi_end_p (psi); gsi_next (&psi))
	{
	  gphi *phi = psi.phi ();
	  stmt_vec_info stmt_info = vinfo->lookup_stmt (phi);
	  if (STMT_VINFO_TYPE (vect_stmt_to_vectorize (stmt_info))
	      == undef_vec_info_type)
	    continue;

	  for (j = 0; j < gimple_phi_num_args (phi); j++)
	    {
	      edge e = gimple_phi_arg_edge (phi, j);
	      tree def = gimple_phi_arg_def (phi, j);
	      auto *live_ranges = live_ranges_per_bb.get (bb);
	      auto *live_range = live_ranges->get (def);
	      if (poly_int_tree_p (def))
		{
		  /* Insert the live range of an INTEGER_CST or POLY_CST
		     since we will need to allocate a vector register
		     for it.

		     E.g. # j_17 = PHI <j_11(9), 0(5)> will be transformed
		     into # vect_vec_iv_.8_24 = PHI <_25(9), { 0, ... }(5)>

		     The live range for such a value is short; it only
		     lives from program point 0 to 1.  */
		  if (live_range)
		    {
		      unsigned int start = (*live_range).first;
		      (*live_range).first = 0;
		      if (dump_enabled_p ())
			dump_printf_loc (
			  MSG_NOTE, vect_location,
			  "Update %T start point from %d to 0:\n", def,
			  start);
		    }
		  else
		    {
		      live_ranges->put (def, pair (0, 1));
		      auto &program_points
			= (*program_points_per_bb.get (bb));
		      if (program_points.is_empty ())
			{
			  stmt_point info = {1, phi, stmt_info};
			  program_points.safe_push (info);
			}
		      if (dump_enabled_p ())
			dump_printf_loc (MSG_NOTE, vect_location,
					 "Add %T start point from 0 to 1:\n",
					 def);
		    }
		  continue;
		}
	      if (live_range && flow_bb_inside_loop_p (loop, e->src))
		{
		  unsigned int start = (*live_range).first;
		  (*live_range).first = 0;
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_NOTE, vect_location,
				     "Update %T start point from %d to %d:\n",
				     def, start, (*live_range).first);
		}
	      live_ranges = live_ranges_per_bb.get (e->src);
	      if (!program_points_per_bb.get (e->src))
		continue;
	      unsigned int max_point
		= (*program_points_per_bb.get (e->src)).length ();
	      live_range = live_ranges->get (def);
	      if (!live_range)
		continue;

	      unsigned int end = (*live_range).second;
	      (*live_range).second = max_point;
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "Update %T end point from %d to %d:\n", def,
				 end, (*live_range).second);
	    }
	}
      for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
	{
	  if (!is_gimple_assign_or_call (gsi_stmt (si)))
	    continue;
	  stmt_vec_info stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
	  enum stmt_vec_info_type type
	    = STMT_VINFO_TYPE (vect_stmt_to_vectorize (stmt_info));
	  if (non_contiguous_memory_access_p (stmt_info)
	      /* LOAD_LANES/STORE_LANES doesn't need a perm index.  */
	      && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info)
		   != VMAT_LOAD_STORE_LANES)
	    {
	      /* For a non-adjacent load/store STMT, we will potentially
		 convert it into:

		   1. MASK_LEN_GATHER_LOAD (..., perm index).
		   2. Contiguous load/store + VEC_PERM (..., perm index)

		 We will likely be using one more vector variable.  */
	      unsigned int max_point
		= (*program_points_per_bb.get (bb)).length () - 1;
	      auto *live_ranges = live_ranges_per_bb.get (bb);
	      bool existed_p = false;
	      tree var = type == load_vec_info_type
			   ? gimple_get_lhs (gsi_stmt (si))
			   : get_store_value (gsi_stmt (si));
	      tree sel_type = build_nonstandard_integer_type (
		TYPE_PRECISION (TREE_TYPE (var)), 1);
	      *biggest_mode
		= get_biggest_mode (*biggest_mode, TYPE_MODE (sel_type));
	      tree sel = build_decl (UNKNOWN_LOCATION, VAR_DECL,
				     get_identifier ("vect_perm"), sel_type);
	      pair &live_range
		= live_ranges->get_or_insert (sel, &existed_p);
	      gcc_assert (!existed_p);
	      live_range = pair (0, max_point);
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "Add perm index %T, start = 0, end = %d\n",
				 sel, max_point);
	    }
	}
    }
}

/* Compute the maximum number of live V_REGs and return true if it
   exceeds the number of available vector registers.  */
static bool
has_unexpected_spills_p (loop_vec_info loop_vinfo)
{
  /* Compute local program points.
     It's a fast and effective computation.  */
  hash_map<basic_block, vec<stmt_point>> program_points_per_bb;
  compute_local_program_points (loop_vinfo, program_points_per_bb);

  /* Compute local live ranges.  */
  hash_map<basic_block, hash_map<tree, pair>> live_ranges_per_bb;
  machine_mode biggest_mode
    = compute_local_live_ranges (program_points_per_bb, live_ranges_per_bb);

  /* Update live ranges according to PHIs.  */
  update_local_live_ranges (loop_vinfo, program_points_per_bb,
			    live_ranges_per_bb, &biggest_mode);

  int lmul = compute_estimated_lmul (loop_vinfo, biggest_mode);
  /* TODO: We calculate the maximum number of live vars based on the
     current STMT sequence.  We could support live range shrinking if it
     gives a big improvement in the future.  */
  if (lmul > RVV_M1)
    {
      if (!live_ranges_per_bb.is_empty ())
	{
	  unsigned int max_nregs = 0;
	  for (hash_map<basic_block, hash_map<tree, pair>>::iterator iter
	       = live_ranges_per_bb.begin ();
	       iter != live_ranges_per_bb.end (); ++iter)
	    {
	      basic_block bb = (*iter).first;
	      unsigned int max_point
		= (*program_points_per_bb.get (bb)).length () + 1;
	      if ((*iter).second.is_empty ())
		continue;
	      /* We prefer larger LMUL unless it causes register spills.  */
	      unsigned int nregs
		= max_number_of_live_regs (bb, (*iter).second, max_point,
					   biggest_mode, lmul);
	      if (nregs > max_nregs)
		max_nregs = nregs;
	    }
	  live_ranges_per_bb.empty ();
	  if (max_nregs > V_REG_NUM)
	    return true;
	}
    }
  if (!program_points_per_bb.is_empty ())
    {
      for (hash_map<basic_block, vec<stmt_point>>::iterator iter
	   = program_points_per_bb.begin ();
	   iter != program_points_per_bb.end (); ++iter)
	{
	  vec<stmt_point> program_points = (*iter).second;
	  if (!program_points.is_empty ())
	    program_points.release ();
	}
      program_points_per_bb.empty ();
    }
  return false;
}

costs::costs (vec_info *vinfo, bool costing_for_scalar)
  : vector_costs (vinfo, costing_for_scalar)
{
  if (costing_for_scalar)
    m_cost_type = SCALAR_COST;
  else if (riscv_v_ext_vector_mode_p (vinfo->vector_mode))
    m_cost_type = VLA_VECTOR_COST;
  else
    m_cost_type = VLS_VECTOR_COST;
}

/* Do one-time initialization of the costs given that we're
   costing the loop vectorization described by LOOP_VINFO.  */
void
costs::analyze_loop_vinfo (loop_vec_info loop_vinfo)
{
  /* Record the number of times that the vector loop would execute,
     if known.  */
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  auto scalar_niters = max_stmt_executions_int (loop);
  if (scalar_niters >= 0)
    {
      unsigned int vf = vect_vf_for_cost (loop_vinfo);
      if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
	m_num_vector_iterations = scalar_niters / vf;
      else
	m_num_vector_iterations = CEIL (scalar_niters, vf);
    }

  /* Detect whether we're vectorizing for VLA and should apply the unrolling
     heuristic described above m_unrolled_vls_niters.  */
  record_potential_vls_unrolling (loop_vinfo);

  /* Detect whether the LOOP has unexpected spills.  */
  record_potential_unexpected_spills (loop_vinfo);
}

/* Analyze the vectorized program statements and use the dynamic LMUL
   heuristic to detect whether the loop has unexpected spills.  */
void
costs::record_potential_unexpected_spills (loop_vec_info loop_vinfo)
{
  /* We only want to apply the heuristic if LOOP_VINFO is being
     vectorized for VLA, or for VLS with known NITERS.  */
  if (riscv_autovec_lmul == RVV_DYNAMIC
      && (m_cost_type == VLA_VECTOR_COST
	  || (m_cost_type == VLS_VECTOR_COST
	      && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))))
    {
      bool post_dom_available_p = dom_info_available_p (CDI_POST_DOMINATORS);
      if (!post_dom_available_p)
	calculate_dominance_info (CDI_POST_DOMINATORS);
      m_has_unexpected_spills_p = has_unexpected_spills_p (loop_vinfo);
      if (!post_dom_available_p)
	free_dominance_info (CDI_POST_DOMINATORS);
    }
}

/* Decide whether to use the unrolling heuristic described above
   m_unrolled_vls_niters, updating that field if so.  LOOP_VINFO
   describes the loop that we're vectorizing.  */
void
costs::record_potential_vls_unrolling (loop_vec_info loop_vinfo)
{
  /* We only want to apply the heuristic if LOOP_VINFO is being
     vectorized for VLA.  */
  if (m_cost_type != VLA_VECTOR_COST)
    return;

  /* We don't want to apply the heuristic to outer loops, since it's
     harder to track two levels of unrolling.  */
  if (LOOP_VINFO_LOOP (loop_vinfo)->inner)
    return;

  /* Only handle cases in which the number of VLS iterations
     would be known at compile time but the number of VLA iterations
     would not.  */
  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      || BYTES_PER_RISCV_VECTOR.is_constant ())
    return;

  /* Guess how many times the VLS loop would iterate and make
     sure that it is within the complete unrolling limit.  Even if the
     number of iterations is small enough, the number of statements might
     not be, which is why we need to estimate the number of statements
     too.  */
  unsigned int vls_vf = vect_vf_for_cost (loop_vinfo);
  unsigned HOST_WIDE_INT unrolled_vls_niters
    = LOOP_VINFO_INT_NITERS (loop_vinfo) / vls_vf;
  if (unrolled_vls_niters > (unsigned int) param_max_completely_peel_times)
    return;

  /* Record that we're applying the heuristic and should try to estimate
     the number of statements in the VLS loop.  */
  m_unrolled_vls_niters = unrolled_vls_niters;
}

/* Return true if (a) we're applying the VLS vs. VLA unrolling
   heuristic described above m_unrolled_vls_niters and (b) the heuristic
   says that we should prefer the VLS loop.  */
bool
costs::prefer_unrolled_loop () const
{
  if (!m_unrolled_vls_stmts)
    return false;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "Number of insns in"
		     " unrolled VLS loop = " HOST_WIDE_INT_PRINT_UNSIGNED
		     "\n",
		     m_unrolled_vls_stmts);

  /* The balance here is tricky.  On the one hand, we can't be sure whether
     the code is vectorizable with VLS or not.  However, even if
     it isn't vectorizable with VLS, there's a possibility that
     the scalar code could also be unrolled.  Some of the code might then
     benefit from SLP, or from using LDP and STP.  We therefore apply
     the heuristic regardless of can_use_vls_p.  */
  return (m_unrolled_vls_stmts
	  && (m_unrolled_vls_stmts
	      <= (unsigned int) param_max_completely_peeled_insns));
}

bool
costs::better_main_loop_than_p (const vector_costs *uncast_other) const
{
  auto other = static_cast<const costs *> (uncast_other);
  auto this_loop_vinfo = as_a<loop_vec_info> (this->m_vinfo);
  auto other_loop_vinfo = as_a<loop_vec_info> (other->m_vinfo);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "Comparing two main loops (%s at VF %d vs %s at VF %d)\n",
		     GET_MODE_NAME (this_loop_vinfo->vector_mode),
		     vect_vf_for_cost (this_loop_vinfo),
		     GET_MODE_NAME (other_loop_vinfo->vector_mode),
		     vect_vf_for_cost (other_loop_vinfo));

  /* Apply the unrolling heuristic described above
     m_unrolled_vls_niters.  */
  if (bool (m_unrolled_vls_stmts) != bool (other->m_unrolled_vls_stmts))
    {
      bool this_prefer_unrolled = this->prefer_unrolled_loop ();
      bool other_prefer_unrolled = other->prefer_unrolled_loop ();
      if (this_prefer_unrolled != other_prefer_unrolled)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Preferring VLS loop because"
			     " it can be unrolled\n");
	  return other_prefer_unrolled;
	}
    }
  else if (riscv_autovec_lmul == RVV_DYNAMIC)
    {
      if (other->m_has_unexpected_spills_p)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Preferring smaller LMUL loop because"
			     " it has unexpected spills\n");
	  return true;
	}
      else if (riscv_v_ext_vector_mode_p (other_loop_vinfo->vector_mode))
	{
	  if (LOOP_VINFO_NITERS_KNOWN_P (other_loop_vinfo))
	    {
	      if (maybe_gt (LOOP_VINFO_INT_NITERS (this_loop_vinfo),
			    LOOP_VINFO_VECT_FACTOR (this_loop_vinfo)))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_NOTE, vect_location,
				     "Keep current LMUL loop because"
				     " known NITERS exceed the new VF\n");
		  return false;
		}
	    }
	  else
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "Keep current LMUL loop because"
				 " NITERS is unknown\n");
	      return false;
	    }
	}
    }
  return vector_costs::better_main_loop_than_p (other);
}

unsigned
costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
		      stmt_vec_info stmt_info, slp_tree, tree vectype,
		      int misalign, vect_cost_model_location where)
{
  int stmt_cost
    = targetm.vectorize.builtin_vectorization_cost (kind, vectype, misalign);

  /* Do one-time initialization based on the vinfo.  */
  loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo);
  if (!m_analyzed_vinfo)
    {
      if (loop_vinfo)
	analyze_loop_vinfo (loop_vinfo);

      m_analyzed_vinfo = true;
    }

  if (stmt_info)
    {
      /* If we're applying the VLA vs. VLS unrolling heuristic,
	 estimate the number of statements in the unrolled VLS
	 loop.  For simplicity, we assume that one iteration of the
	 VLS loop would need the same number of statements
	 as one iteration of the VLA loop.  */
      if (where == vect_body && m_unrolled_vls_niters)
	m_unrolled_vls_stmts += count * m_unrolled_vls_niters;
    }

  return record_stmt_cost (stmt_info, where, count * stmt_cost);
}

void
costs::finish_cost (const vector_costs *scalar_costs)
{
  vector_costs::finish_cost (scalar_costs);
}

} // namespace riscv_vector