gcc/tree-vect-slp.cc
1 /* SLP - Basic Block Vectorization
2 Copyright (C) 2007-2023 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #define INCLUDE_ALGORITHM
24 #include "system.h"
25 #include "coretypes.h"
26 #include "backend.h"
27 #include "target.h"
28 #include "rtl.h"
29 #include "tree.h"
30 #include "gimple.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "insn-config.h"
35 #include "recog.h" /* FIXME: for insn_data */
36 #include "fold-const.h"
37 #include "stor-layout.h"
38 #include "gimple-iterator.h"
39 #include "cfgloop.h"
40 #include "tree-vectorizer.h"
41 #include "langhooks.h"
42 #include "gimple-walk.h"
43 #include "dbgcnt.h"
44 #include "tree-vector-builder.h"
45 #include "vec-perm-indices.h"
46 #include "gimple-fold.h"
47 #include "internal-fn.h"
48 #include "dump-context.h"
49 #include "cfganal.h"
50 #include "tree-eh.h"
51 #include "tree-cfg.h"
52 #include "alloc-pool.h"
53 #include "sreal.h"
54 #include "predict.h"
56 static bool vect_transform_slp_perm_load_1 (vec_info *, slp_tree,
57 load_permutation_t &,
58 const vec<tree> &,
59 gimple_stmt_iterator *,
60 poly_uint64, bool, bool,
61 unsigned *,
62 unsigned * = nullptr,
63 bool = false);
64 static int vectorizable_slp_permutation_1 (vec_info *, gimple_stmt_iterator *,
65 slp_tree, lane_permutation_t &,
66 vec<slp_tree> &, bool);
67 static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
68 slp_tree, stmt_vector_for_cost *);
69 static void vect_print_slp_tree (dump_flags_t, dump_location_t, slp_tree);
71 static object_allocator<_slp_tree> *slp_tree_pool;
72 static slp_tree slp_first_node;
74 void
75 vect_slp_init (void)
77 slp_tree_pool = new object_allocator<_slp_tree> ("SLP nodes");
80 void
81 vect_slp_fini (void)
83 while (slp_first_node)
84 delete slp_first_node;
85 delete slp_tree_pool;
86 slp_tree_pool = NULL;
89 void *
90 _slp_tree::operator new (size_t n)
92 gcc_assert (n == sizeof (_slp_tree));
93 return slp_tree_pool->allocate_raw ();
96 void
97 _slp_tree::operator delete (void *node, size_t n)
99 gcc_assert (n == sizeof (_slp_tree));
100 slp_tree_pool->remove_raw (node);
104 /* Initialize a SLP node. */
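/* Newly created nodes are chained into a global doubly-linked list headed
   by slp_first_node so that vect_slp_fini can reclaim any nodes still
   live when the pass finishes.  */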
106 _slp_tree::_slp_tree ()
108 this->prev_node = NULL;
109 if (slp_first_node)
110 slp_first_node->prev_node = this;
111 this->next_node = slp_first_node;
112 slp_first_node = this;
113 SLP_TREE_SCALAR_STMTS (this) = vNULL;
114 SLP_TREE_SCALAR_OPS (this) = vNULL;
115 SLP_TREE_VEC_DEFS (this) = vNULL;
116 SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
117 SLP_TREE_CHILDREN (this) = vNULL;
118 SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
119 SLP_TREE_LANE_PERMUTATION (this) = vNULL;
120 SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
121 SLP_TREE_CODE (this) = ERROR_MARK;
122 SLP_TREE_VECTYPE (this) = NULL_TREE;
123 SLP_TREE_REPRESENTATIVE (this) = NULL;
124 SLP_TREE_REF_COUNT (this) = 1;
125 this->failed = NULL;
126 this->max_nunits = 1;
127 this->lanes = 0;
130 /* Tear down a SLP node. */
132 _slp_tree::~_slp_tree ()
134 if (this->prev_node)
135 this->prev_node->next_node = this->next_node;
136 else
137 slp_first_node = this->next_node;
138 if (this->next_node)
139 this->next_node->prev_node = this->prev_node;
140 SLP_TREE_CHILDREN (this).release ();
141 SLP_TREE_SCALAR_STMTS (this).release ();
142 SLP_TREE_SCALAR_OPS (this).release ();
143 SLP_TREE_VEC_DEFS (this).release ();
144 SLP_TREE_LOAD_PERMUTATION (this).release ();
145 SLP_TREE_LANE_PERMUTATION (this).release ();
146 if (this->failed)
147 free (failed);
150 /* Push the single SSA definition in DEF to the vector of vector defs. */
152 void
153 _slp_tree::push_vec_def (gimple *def)
155 if (gphi *phi = dyn_cast <gphi *> (def))
156 vec_defs.quick_push (gimple_phi_result (phi));
157 else
159 def_operand_p defop = single_ssa_def_operand (def, SSA_OP_ALL_DEFS);
160 vec_defs.quick_push (get_def_from_ptr (defop));
164 /* Recursively free the memory allocated for the SLP tree rooted at NODE. */
166 void
167 vect_free_slp_tree (slp_tree node)
169 int i;
170 slp_tree child;
172 if (--SLP_TREE_REF_COUNT (node) != 0)
173 return;
175 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
176 if (child)
177 vect_free_slp_tree (child);
179 /* If the node defines any SLP only patterns then those patterns are no
180 longer valid and should be removed. */
181 stmt_vec_info rep_stmt_info = SLP_TREE_REPRESENTATIVE (node);
182 if (rep_stmt_info && STMT_VINFO_SLP_VECT_ONLY_PATTERN (rep_stmt_info))
184 stmt_vec_info stmt_info = vect_orig_stmt (rep_stmt_info);
185 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
186 STMT_SLP_TYPE (stmt_info) = STMT_SLP_TYPE (rep_stmt_info);
189 delete node;
 192 /* Return a location suitable for dumps related to the SLP instance. */
194 dump_user_location_t
195 _slp_instance::location () const
197 if (!root_stmts.is_empty ())
198 return root_stmts[0]->stmt;
199 else
200 return SLP_TREE_SCALAR_STMTS (root)[0]->stmt;
204 /* Free the memory allocated for the SLP instance. */
206 void
207 vect_free_slp_instance (slp_instance instance)
209 vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
210 SLP_INSTANCE_LOADS (instance).release ();
211 SLP_INSTANCE_ROOT_STMTS (instance).release ();
212 SLP_INSTANCE_REMAIN_DEFS (instance).release ();
213 instance->subgraph_entries.release ();
214 instance->cost_vec.release ();
215 free (instance);
 219 /* Create an SLP node for a CODE operation with room for NOPS children. */
221 slp_tree
222 vect_create_new_slp_node (unsigned nops, tree_code code)
224 slp_tree node = new _slp_tree;
225 SLP_TREE_SCALAR_STMTS (node) = vNULL;
226 SLP_TREE_CHILDREN (node).create (nops);
227 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
228 SLP_TREE_CODE (node) = code;
229 return node;
231 /* Create an SLP node for SCALAR_STMTS. */
233 static slp_tree
234 vect_create_new_slp_node (slp_tree node,
235 vec<stmt_vec_info> scalar_stmts, unsigned nops)
237 SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
238 SLP_TREE_CHILDREN (node).create (nops);
239 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
240 SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
241 SLP_TREE_LANES (node) = scalar_stmts.length ();
242 return node;
245 /* Create an SLP node for SCALAR_STMTS. */
247 static slp_tree
248 vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
250 return vect_create_new_slp_node (new _slp_tree, scalar_stmts, nops);
253 /* Create an SLP node for OPS. */
255 static slp_tree
256 vect_create_new_slp_node (slp_tree node, vec<tree> ops)
258 SLP_TREE_SCALAR_OPS (node) = ops;
259 SLP_TREE_DEF_TYPE (node) = vect_external_def;
260 SLP_TREE_LANES (node) = ops.length ();
261 return node;
264 /* Create an SLP node for OPS. */
266 static slp_tree
267 vect_create_new_slp_node (vec<tree> ops)
269 return vect_create_new_slp_node (new _slp_tree, ops);
273 /* This structure is used in creation of an SLP tree. Each instance
274 corresponds to the same operand in a group of scalar stmts in an SLP
275 node. */
276 typedef struct _slp_oprnd_info
278 /* Def-stmts for the operands. */
279 vec<stmt_vec_info> def_stmts;
280 /* Operands. */
281 vec<tree> ops;
282 /* Information about the first statement, its vector def-type, type, the
283 operand itself in case it's constant, and an indication if it's a pattern
284 stmt. */
285 tree first_op_type;
286 enum vect_def_type first_dt;
287 bool any_pattern;
288 } *slp_oprnd_info;
291 /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
292 operand. */
293 static vec<slp_oprnd_info>
294 vect_create_oprnd_info (int nops, int group_size)
296 int i;
297 slp_oprnd_info oprnd_info;
298 vec<slp_oprnd_info> oprnds_info;
300 oprnds_info.create (nops);
301 for (i = 0; i < nops; i++)
303 oprnd_info = XNEW (struct _slp_oprnd_info);
304 oprnd_info->def_stmts.create (group_size);
305 oprnd_info->ops.create (group_size);
306 oprnd_info->first_dt = vect_uninitialized_def;
307 oprnd_info->first_op_type = NULL_TREE;
308 oprnd_info->any_pattern = false;
309 oprnds_info.quick_push (oprnd_info);
312 return oprnds_info;
316 /* Free operands info. */
318 static void
319 vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
321 int i;
322 slp_oprnd_info oprnd_info;
324 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
326 oprnd_info->def_stmts.release ();
327 oprnd_info->ops.release ();
328 XDELETE (oprnd_info);
331 oprnds_info.release ();
334 /* Return the execution frequency of NODE (so that a higher value indicates
335 a "more important" node when optimizing for speed). */
337 static sreal
338 vect_slp_node_weight (slp_tree node)
340 stmt_vec_info stmt_info = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (node));
341 basic_block bb = gimple_bb (stmt_info->stmt);
342 return bb->count.to_sreal_scale (ENTRY_BLOCK_PTR_FOR_FN (cfun)->count);
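/* That is, the weight is the block count scaled relative to the function
   entry count, so e.g. a stmt executing roughly ten times per function
   invocation gets a weight of about 10.  */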
345 /* Return true if STMTS contains a pattern statement. */
347 static bool
348 vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
350 stmt_vec_info stmt_info;
351 unsigned int i;
352 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
353 if (is_pattern_stmt_p (stmt_info))
354 return true;
355 return false;
358 /* Return true when all lanes in the external or constant NODE have
359 the same value. */
361 static bool
362 vect_slp_tree_uniform_p (slp_tree node)
364 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
365 || SLP_TREE_DEF_TYPE (node) == vect_external_def);
 367 /* Pre-existing vectors. */
368 if (SLP_TREE_SCALAR_OPS (node).is_empty ())
369 return false;
371 unsigned i;
372 tree op, first = NULL_TREE;
373 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
374 if (!first)
375 first = op;
376 else if (!operand_equal_p (first, op, 0))
377 return false;
379 return true;
382 /* Find the place of the data-ref in STMT_INFO in the interleaving chain
383 that starts from FIRST_STMT_INFO. Return -1 if the data-ref is not a part
384 of the chain. */
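/* The place is computed as the sum of the DR_GROUP_GAPs of the chain
   members after FIRST_STMT_INFO up to and including STMT_INFO; for a
   dense chain without gaps this is simply the zero-based position of
   STMT_INFO in the group.  */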
387 vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
388 stmt_vec_info first_stmt_info)
390 stmt_vec_info next_stmt_info = first_stmt_info;
391 int result = 0;
393 if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
394 return -1;
398 if (next_stmt_info == stmt_info)
399 return result;
400 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
401 if (next_stmt_info)
402 result += DR_GROUP_GAP (next_stmt_info);
404 while (next_stmt_info);
406 return -1;
409 /* Check whether it is possible to load COUNT elements of type ELT_TYPE
410 using the method implemented by duplicate_and_interleave. Return true
411 if so, returning the number of intermediate vectors in *NVECTORS_OUT
412 (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
413 (if nonnull). */
415 bool
416 can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
417 tree elt_type, unsigned int *nvectors_out,
418 tree *vector_type_out,
419 tree *permutes)
421 tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
422 if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
423 return false;
425 machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
426 poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
427 unsigned int nvectors = 1;
428 for (;;)
430 scalar_int_mode int_mode;
431 poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
432 if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
434 /* Get the natural vector type for this SLP group size. */
435 tree int_type = build_nonstandard_integer_type
436 (GET_MODE_BITSIZE (int_mode), 1);
437 tree vector_type
438 = get_vectype_for_scalar_type (vinfo, int_type, count);
439 poly_int64 half_nelts;
440 if (vector_type
441 && VECTOR_MODE_P (TYPE_MODE (vector_type))
442 && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
443 GET_MODE_SIZE (base_vector_mode))
444 && multiple_p (GET_MODE_NUNITS (TYPE_MODE (vector_type)),
445 2, &half_nelts))
447 /* Try fusing consecutive sequences of COUNT / NVECTORS elements
448 together into elements of type INT_TYPE and using the result
449 to build NVECTORS vectors. */
450 poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
451 vec_perm_builder sel1 (nelts, 2, 3);
452 vec_perm_builder sel2 (nelts, 2, 3);
454 for (unsigned int i = 0; i < 3; ++i)
456 sel1.quick_push (i);
457 sel1.quick_push (i + nelts);
458 sel2.quick_push (half_nelts + i);
459 sel2.quick_push (half_nelts + i + nelts);
461 vec_perm_indices indices1 (sel1, 2, nelts);
462 vec_perm_indices indices2 (sel2, 2, nelts);
463 machine_mode vmode = TYPE_MODE (vector_type);
464 if (can_vec_perm_const_p (vmode, vmode, indices1)
465 && can_vec_perm_const_p (vmode, vmode, indices2))
467 if (nvectors_out)
468 *nvectors_out = nvectors;
469 if (vector_type_out)
470 *vector_type_out = vector_type;
471 if (permutes)
473 permutes[0] = vect_gen_perm_mask_checked (vector_type,
474 indices1);
475 permutes[1] = vect_gen_perm_mask_checked (vector_type,
476 indices2);
478 return true;
482 if (!multiple_p (elt_bytes, 2, &elt_bytes))
483 return false;
484 nvectors *= 2;
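/* As a sketch: for COUNT == 4 32-bit elements the first iteration looks
   for a 128-bit integer mode fusing all four elements into one value;
   whenever no suitable vector type or permutation exists, ELT_BYTES is
   halved and NVECTORS doubled, so the next attempt builds two vectors
   from 64-bit chunks, and so on.  INDICES1 and INDICES2 interleave the
   low respectively high halves of two such vectors.  */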
488 /* Return true if DTA and DTB match. */
490 static bool
491 vect_def_types_match (enum vect_def_type dta, enum vect_def_type dtb)
493 return (dta == dtb
494 || ((dta == vect_external_def || dta == vect_constant_def)
495 && (dtb == vect_external_def || dtb == vect_constant_def)));
498 static const int cond_expr_maps[3][5] = {
499 { 4, -1, -2, 1, 2 },
500 { 4, -2, -1, 1, 2 },
501 { 4, -1, -2, 2, 1 }
503 static const int arg1_map[] = { 1, 1 };
504 static const int arg2_map[] = { 1, 2 };
505 static const int arg1_arg4_map[] = { 2, 1, 4 };
506 static const int arg3_arg2_map[] = { 2, 3, 2 };
507 static const int op1_op0_map[] = { 2, 1, 0 };
509 /* For most SLP statements, there is a one-to-one mapping between
510 gimple arguments and child nodes. If that is not true for STMT,
511 return an array that contains:
513 - the number of child nodes, followed by
514 - for each child node, the index of the argument associated with that node.
515 The special index -1 is the first operand of an embedded comparison and
516 the special index -2 is the second operand of an embedded comparison.
518 SWAP is as for vect_get_and_check_slp_defs. */
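/* For example arg1_arg4_map above says that an IFN_MASK_GATHER_LOAD call
   has two SLP children which correspond to call arguments 1 and 4, while
   cond_expr_maps[1] describes a COND_EXPR with four children whose first
   two are the (swapped) second and first operands of the embedded
   comparison and whose last two are operands 1 and 2 of the COND_EXPR.  */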
520 static const int *
521 vect_get_operand_map (const gimple *stmt, unsigned char swap = 0)
523 if (auto assign = dyn_cast<const gassign *> (stmt))
525 if (gimple_assign_rhs_code (assign) == COND_EXPR
526 && COMPARISON_CLASS_P (gimple_assign_rhs1 (assign)))
527 return cond_expr_maps[swap];
528 if (TREE_CODE_CLASS (gimple_assign_rhs_code (assign)) == tcc_comparison
529 && swap)
530 return op1_op0_map;
532 gcc_assert (!swap);
533 if (auto call = dyn_cast<const gcall *> (stmt))
535 if (gimple_call_internal_p (call))
536 switch (gimple_call_internal_fn (call))
538 case IFN_MASK_LOAD:
539 return arg2_map;
541 case IFN_GATHER_LOAD:
542 return arg1_map;
544 case IFN_MASK_GATHER_LOAD:
545 return arg1_arg4_map;
547 case IFN_MASK_STORE:
548 return arg3_arg2_map;
550 default:
551 break;
554 return nullptr;
557 /* Return the SLP node child index for operand OP of STMT. */
560 vect_slp_child_index_for_operand (const gimple *stmt, int op)
562 const int *opmap = vect_get_operand_map (stmt);
563 if (!opmap)
564 return op;
565 for (int i = 1; i < 1 + opmap[0]; ++i)
566 if (opmap[i] == op)
567 return i - 1;
568 gcc_unreachable ();
571 /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
572 they are of a valid type and that they match the defs of the first stmt of
573 the SLP group (stored in OPRNDS_INFO). This function tries to match stmts
574 by swapping operands of STMTS[STMT_NUM] when possible. Non-zero SWAP
575 indicates swap is required for cond_expr stmts. Specifically, SWAP
576 is 1 if STMT is cond and operands of comparison need to be swapped;
577 SWAP is 2 if STMT is cond and code of comparison needs to be inverted.
579 If there was a fatal error return -1; if the error could be corrected by
 580 swapping the operands of this node's father, return 1; if everything is
581 ok return 0. */
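/* As an illustration, for the group { x0 = a0 + b0; x1 = b1 + a1 } where
   the a's and b's have different def types, the operands of the second
   stmt are swapped so that defs of the same type end up in the same
   operand slot.  */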
582 static int
583 vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap,
584 bool *skip_args,
585 vec<stmt_vec_info> stmts, unsigned stmt_num,
586 vec<slp_oprnd_info> *oprnds_info)
588 stmt_vec_info stmt_info = stmts[stmt_num];
589 tree oprnd;
590 unsigned int i, number_of_oprnds;
591 enum vect_def_type dt = vect_uninitialized_def;
592 slp_oprnd_info oprnd_info;
593 unsigned int commutative_op = -1U;
594 bool first = stmt_num == 0;
596 if (!is_a<gcall *> (stmt_info->stmt)
597 && !is_a<gassign *> (stmt_info->stmt)
598 && !is_a<gphi *> (stmt_info->stmt))
599 return -1;
601 number_of_oprnds = gimple_num_args (stmt_info->stmt);
602 const int *map = vect_get_operand_map (stmt_info->stmt, swap);
603 if (map)
604 number_of_oprnds = *map++;
605 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
607 if (gimple_call_internal_p (stmt))
609 internal_fn ifn = gimple_call_internal_fn (stmt);
610 commutative_op = first_commutative_argument (ifn);
613 else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
615 if (commutative_tree_code (gimple_assign_rhs_code (stmt)))
616 commutative_op = 0;
619 bool swapped = (swap != 0);
620 bool backedge = false;
621 enum vect_def_type *dts = XALLOCAVEC (enum vect_def_type, number_of_oprnds);
622 for (i = 0; i < number_of_oprnds; i++)
624 int opno = map ? map[i] : int (i);
625 if (opno < 0)
626 oprnd = TREE_OPERAND (gimple_arg (stmt_info->stmt, 0), -1 - opno);
627 else
629 oprnd = gimple_arg (stmt_info->stmt, opno);
630 if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
631 backedge = dominated_by_p (CDI_DOMINATORS,
632 gimple_phi_arg_edge (stmt, opno)->src,
633 gimple_bb (stmt_info->stmt));
635 if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
636 oprnd = TREE_OPERAND (oprnd, 0);
638 oprnd_info = (*oprnds_info)[i];
640 stmt_vec_info def_stmt_info;
641 if (!vect_is_simple_use (oprnd, vinfo, &dts[i], &def_stmt_info))
643 if (dump_enabled_p ())
644 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
645 "Build SLP failed: can't analyze def for %T\n",
646 oprnd);
648 return -1;
651 if (skip_args[i])
653 oprnd_info->def_stmts.quick_push (NULL);
654 oprnd_info->ops.quick_push (NULL_TREE);
655 oprnd_info->first_dt = vect_uninitialized_def;
656 continue;
659 oprnd_info->def_stmts.quick_push (def_stmt_info);
660 oprnd_info->ops.quick_push (oprnd);
662 if (def_stmt_info
663 && is_pattern_stmt_p (def_stmt_info))
665 if (STMT_VINFO_RELATED_STMT (vect_orig_stmt (def_stmt_info))
666 != def_stmt_info)
667 oprnd_info->any_pattern = true;
668 else
 669 /* If we promote this to external, use the original stmt def. */
670 oprnd_info->ops.last ()
671 = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
 674 /* If there's an extern def on a backedge make sure we can
675 code-generate at the region start.
676 ??? This is another case that could be fixed by adjusting
677 how we split the function but at the moment we'd have conflicting
678 goals there. */
679 if (backedge
680 && dts[i] == vect_external_def
681 && is_a <bb_vec_info> (vinfo)
682 && TREE_CODE (oprnd) == SSA_NAME
683 && !SSA_NAME_IS_DEFAULT_DEF (oprnd)
684 && !dominated_by_p (CDI_DOMINATORS,
685 as_a <bb_vec_info> (vinfo)->bbs[0],
686 gimple_bb (SSA_NAME_DEF_STMT (oprnd))))
688 if (dump_enabled_p ())
689 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
690 "Build SLP failed: extern def %T only defined "
691 "on backedge\n", oprnd);
692 return -1;
695 if (first)
697 tree type = TREE_TYPE (oprnd);
698 dt = dts[i];
699 if ((dt == vect_constant_def
700 || dt == vect_external_def)
701 && !GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
702 && (TREE_CODE (type) == BOOLEAN_TYPE
703 || !can_duplicate_and_interleave_p (vinfo, stmts.length (),
704 type)))
706 if (dump_enabled_p ())
707 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
708 "Build SLP failed: invalid type of def "
709 "for variable-length SLP %T\n", oprnd);
710 return -1;
713 /* For the swapping logic below force vect_reduction_def
714 for the reduction op in a SLP reduction group. */
715 if (!STMT_VINFO_DATA_REF (stmt_info)
716 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
717 && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
718 && def_stmt_info)
719 dts[i] = dt = vect_reduction_def;
721 /* Check the types of the definition. */
722 switch (dt)
724 case vect_external_def:
725 case vect_constant_def:
726 case vect_internal_def:
727 case vect_reduction_def:
728 case vect_induction_def:
729 case vect_nested_cycle:
730 case vect_first_order_recurrence:
731 break;
733 default:
734 /* FORNOW: Not supported. */
735 if (dump_enabled_p ())
736 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
737 "Build SLP failed: illegal type of def %T\n",
738 oprnd);
739 return -1;
742 oprnd_info->first_dt = dt;
743 oprnd_info->first_op_type = type;
746 if (first)
747 return 0;
749 /* Now match the operand definition types to that of the first stmt. */
750 for (i = 0; i < number_of_oprnds;)
752 if (skip_args[i])
754 ++i;
755 continue;
758 oprnd_info = (*oprnds_info)[i];
759 dt = dts[i];
760 stmt_vec_info def_stmt_info = oprnd_info->def_stmts[stmt_num];
761 oprnd = oprnd_info->ops[stmt_num];
762 tree type = TREE_TYPE (oprnd);
764 if (!types_compatible_p (oprnd_info->first_op_type, type))
766 if (dump_enabled_p ())
767 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
768 "Build SLP failed: different operand types\n");
769 return 1;
772 /* Not first stmt of the group, check that the def-stmt/s match
773 the def-stmt/s of the first stmt. Allow different definition
774 types for reduction chains: the first stmt must be a
775 vect_reduction_def (a phi node), and the rest
776 end in the reduction chain. */
777 if ((!vect_def_types_match (oprnd_info->first_dt, dt)
778 && !(oprnd_info->first_dt == vect_reduction_def
779 && !STMT_VINFO_DATA_REF (stmt_info)
780 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
781 && def_stmt_info
782 && !STMT_VINFO_DATA_REF (def_stmt_info)
783 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
784 == REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
785 || (!STMT_VINFO_DATA_REF (stmt_info)
786 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
787 && ((!def_stmt_info
788 || STMT_VINFO_DATA_REF (def_stmt_info)
789 || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
790 != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
791 != (oprnd_info->first_dt != vect_reduction_def))))
793 /* Try swapping operands if we got a mismatch. For BB
794 vectorization only in case it will clearly improve things. */
795 if (i == commutative_op && !swapped
796 && (!is_a <bb_vec_info> (vinfo)
797 || (!vect_def_types_match ((*oprnds_info)[i+1]->first_dt,
798 dts[i+1])
799 && (vect_def_types_match (oprnd_info->first_dt, dts[i+1])
800 || vect_def_types_match
801 ((*oprnds_info)[i+1]->first_dt, dts[i])))))
803 if (dump_enabled_p ())
804 dump_printf_loc (MSG_NOTE, vect_location,
805 "trying swapped operands\n");
806 std::swap (dts[i], dts[i+1]);
807 std::swap ((*oprnds_info)[i]->def_stmts[stmt_num],
808 (*oprnds_info)[i+1]->def_stmts[stmt_num]);
809 std::swap ((*oprnds_info)[i]->ops[stmt_num],
810 (*oprnds_info)[i+1]->ops[stmt_num]);
811 swapped = true;
812 continue;
815 if (is_a <bb_vec_info> (vinfo)
816 && !oprnd_info->any_pattern)
818 /* Now for commutative ops we should see whether we can
 819 make the other operand match. */
820 if (dump_enabled_p ())
821 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
822 "treating operand as external\n");
823 oprnd_info->first_dt = dt = vect_external_def;
825 else
827 if (dump_enabled_p ())
828 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
829 "Build SLP failed: different types\n");
830 return 1;
834 /* Make sure to demote the overall operand to external. */
835 if (dt == vect_external_def)
836 oprnd_info->first_dt = vect_external_def;
837 /* For a SLP reduction chain we want to duplicate the reduction to
838 each of the chain members. That gets us a sane SLP graph (still
839 the stmts are not 100% correct wrt the initial values). */
840 else if ((dt == vect_internal_def
841 || dt == vect_reduction_def)
842 && oprnd_info->first_dt == vect_reduction_def
843 && !STMT_VINFO_DATA_REF (stmt_info)
844 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
845 && !STMT_VINFO_DATA_REF (def_stmt_info)
846 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
847 == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
849 oprnd_info->def_stmts[stmt_num] = oprnd_info->def_stmts[0];
850 oprnd_info->ops[stmt_num] = oprnd_info->ops[0];
853 ++i;
856 /* Swap operands. */
857 if (swapped)
859 if (dump_enabled_p ())
860 dump_printf_loc (MSG_NOTE, vect_location,
861 "swapped operands to match def types in %G",
862 stmt_info->stmt);
865 return 0;
868 /* Return true if call statements CALL1 and CALL2 are similar enough
869 to be combined into the same SLP group. */
871 bool
872 compatible_calls_p (gcall *call1, gcall *call2)
874 unsigned int nargs = gimple_call_num_args (call1);
875 if (nargs != gimple_call_num_args (call2))
876 return false;
878 if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
879 return false;
881 if (gimple_call_internal_p (call1))
883 if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
884 TREE_TYPE (gimple_call_lhs (call2))))
885 return false;
886 for (unsigned int i = 0; i < nargs; ++i)
887 if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
888 TREE_TYPE (gimple_call_arg (call2, i))))
889 return false;
891 else
893 if (!operand_equal_p (gimple_call_fn (call1),
894 gimple_call_fn (call2), 0))
895 return false;
897 if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
898 return false;
901 /* Check that any unvectorized arguments are equal. */
902 if (const int *map = vect_get_operand_map (call1))
904 unsigned int nkept = *map++;
905 unsigned int mapi = 0;
906 for (unsigned int i = 0; i < nargs; ++i)
907 if (mapi < nkept && map[mapi] == int (i))
908 mapi += 1;
909 else if (!operand_equal_p (gimple_call_arg (call1, i),
910 gimple_call_arg (call2, i)))
911 return false;
914 return true;
917 /* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
918 caller's attempt to find the vector type in STMT_INFO with the narrowest
919 element type. Return true if VECTYPE is nonnull and if it is valid
920 for STMT_INFO. When returning true, update MAX_NUNITS to reflect the
921 number of units in VECTYPE. GROUP_SIZE and MAX_NUNITS are as for
922 vect_build_slp_tree. */
924 static bool
925 vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
926 unsigned int group_size,
927 tree vectype, poly_uint64 *max_nunits)
929 if (!vectype)
931 if (dump_enabled_p ())
932 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
933 "Build SLP failed: unsupported data-type in %G\n",
934 stmt_info->stmt);
935 /* Fatal mismatch. */
936 return false;
939 /* If populating the vector type requires unrolling then fail
940 before adjusting *max_nunits for basic-block vectorization. */
941 if (is_a <bb_vec_info> (vinfo)
942 && !multiple_p (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
944 if (dump_enabled_p ())
945 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
946 "Build SLP failed: unrolling required "
947 "in basic block SLP\n");
948 /* Fatal mismatch. */
949 return false;
952 /* In case of multiple types we need to detect the smallest type. */
953 vect_update_max_nunits (max_nunits, vectype);
954 return true;
 957 /* Verify that the scalar stmts STMTS are isomorphic, do not require data
 958 permutation and are not of unsupported types of operation. Return
 959 true if so, otherwise return false and indicate in *MATCHES
960 which stmts are not isomorphic to the first one. If MATCHES[0]
961 is false then this indicates the comparison could not be
962 carried out or the stmts will never be vectorized by SLP.
964 Note COND_EXPR is possibly isomorphic to another one after swapping its
965 operands. Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
966 the first stmt by swapping the two operands of comparison; set SWAP[i]
 967 to 2 if stmt I is isomorphic to the first stmt by inverting the code
 968 of comparison. Take A1 >= B1 ? X1 : Y1 as an example: it can be swapped
969 to (B1 <= A1 ? X1 : Y1); or be inverted to (A1 < B1) ? Y1 : X1. */
971 static bool
972 vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
973 vec<stmt_vec_info> stmts, unsigned int group_size,
974 poly_uint64 *max_nunits, bool *matches,
975 bool *two_operators, tree *node_vectype)
977 unsigned int i;
978 stmt_vec_info first_stmt_info = stmts[0];
979 code_helper first_stmt_code = ERROR_MARK;
980 code_helper alt_stmt_code = ERROR_MARK;
981 code_helper rhs_code = ERROR_MARK;
982 code_helper first_cond_code = ERROR_MARK;
983 tree lhs;
984 bool need_same_oprnds = false;
985 tree vectype = NULL_TREE, first_op1 = NULL_TREE;
986 stmt_vec_info first_load = NULL, prev_first_load = NULL;
987 bool first_stmt_ldst_p = false, ldst_p = false;
988 bool first_stmt_phi_p = false, phi_p = false;
989 bool maybe_soft_fail = false;
990 tree soft_fail_nunits_vectype = NULL_TREE;
992 /* For every stmt in NODE find its def stmt/s. */
993 stmt_vec_info stmt_info;
994 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
996 gimple *stmt = stmt_info->stmt;
997 swap[i] = 0;
998 matches[i] = false;
1000 if (dump_enabled_p ())
1001 dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);
 1003 /* Fail to vectorize statements marked as unvectorizable, that can
 1004 throw, or that have volatile operands. */
1005 if (!STMT_VINFO_VECTORIZABLE (stmt_info)
1006 || stmt_can_throw_internal (cfun, stmt)
1007 || gimple_has_volatile_ops (stmt))
1009 if (dump_enabled_p ())
1010 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1011 "Build SLP failed: unvectorizable statement %G",
1012 stmt);
 1013 /* ??? For BB vectorization we want to commute operands in a way
 1014 that shuffles all unvectorizable defs into one operand and keeps
 1015 the other still vectorizable. The following doesn't reliably
 1016 work for this, but it's the easiest we can do here. */
1017 if (is_a <bb_vec_info> (vinfo) && i != 0)
1018 continue;
1019 /* Fatal mismatch. */
1020 matches[0] = false;
1021 return false;
1024 gcall *call_stmt = dyn_cast <gcall *> (stmt);
1025 lhs = gimple_get_lhs (stmt);
1026 if (lhs == NULL_TREE
1027 && (!call_stmt
1028 || !gimple_call_internal_p (stmt)
1029 || !internal_store_fn_p (gimple_call_internal_fn (stmt))))
1031 if (dump_enabled_p ())
1032 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1033 "Build SLP failed: not GIMPLE_ASSIGN nor "
1034 "GIMPLE_CALL %G", stmt);
1035 if (is_a <bb_vec_info> (vinfo) && i != 0)
1036 continue;
1037 /* Fatal mismatch. */
1038 matches[0] = false;
1039 return false;
1042 tree nunits_vectype;
1043 if (!vect_get_vector_types_for_stmt (vinfo, stmt_info, &vectype,
1044 &nunits_vectype, group_size))
1046 if (is_a <bb_vec_info> (vinfo) && i != 0)
1047 continue;
1048 /* Fatal mismatch. */
1049 matches[0] = false;
1050 return false;
1052 /* Record nunits required but continue analysis, producing matches[]
1053 as if nunits was not an issue. This allows splitting of groups
1054 to happen. */
1055 if (nunits_vectype
1056 && !vect_record_max_nunits (vinfo, stmt_info, group_size,
1057 nunits_vectype, max_nunits))
1059 gcc_assert (is_a <bb_vec_info> (vinfo));
1060 maybe_soft_fail = true;
1061 soft_fail_nunits_vectype = nunits_vectype;
1064 gcc_assert (vectype);
1066 if (call_stmt)
1068 combined_fn cfn = gimple_call_combined_fn (call_stmt);
1069 if (cfn != CFN_LAST)
1070 rhs_code = cfn;
1071 else
1072 rhs_code = CALL_EXPR;
1074 if (cfn == CFN_MASK_LOAD
1075 || cfn == CFN_GATHER_LOAD
1076 || cfn == CFN_MASK_GATHER_LOAD)
1077 ldst_p = true;
1078 else if (cfn == CFN_MASK_STORE)
1080 ldst_p = true;
1081 rhs_code = CFN_MASK_STORE;
1083 else if ((internal_fn_p (cfn)
1084 && !vectorizable_internal_fn_p (as_internal_fn (cfn)))
1085 || gimple_call_tail_p (call_stmt)
1086 || gimple_call_noreturn_p (call_stmt)
1087 || gimple_call_chain (call_stmt))
1089 if (dump_enabled_p ())
1090 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1091 "Build SLP failed: unsupported call type %G",
1092 (gimple *) call_stmt);
1093 if (is_a <bb_vec_info> (vinfo) && i != 0)
1094 continue;
1095 /* Fatal mismatch. */
1096 matches[0] = false;
1097 return false;
1100 else if (gimple_code (stmt) == GIMPLE_PHI)
1102 rhs_code = ERROR_MARK;
1103 phi_p = true;
1105 else
1107 rhs_code = gimple_assign_rhs_code (stmt);
1108 ldst_p = STMT_VINFO_DATA_REF (stmt_info) != nullptr;
1111 /* Check the operation. */
1112 if (i == 0)
1114 *node_vectype = vectype;
1115 first_stmt_code = rhs_code;
1116 first_stmt_ldst_p = ldst_p;
1117 first_stmt_phi_p = phi_p;
1119 /* Shift arguments should be equal in all the packed stmts for a
1120 vector shift with scalar shift operand. */
1121 if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
1122 || rhs_code == LROTATE_EXPR
1123 || rhs_code == RROTATE_EXPR)
1125 /* First see if we have a vector/vector shift. */
1126 if (!directly_supported_p (rhs_code, vectype, optab_vector))
1128 /* No vector/vector shift, try for a vector/scalar shift. */
1129 if (!directly_supported_p (rhs_code, vectype, optab_scalar))
1131 if (dump_enabled_p ())
1132 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1133 "Build SLP failed: "
1134 "op not supported by target.\n");
1135 if (is_a <bb_vec_info> (vinfo) && i != 0)
1136 continue;
1137 /* Fatal mismatch. */
1138 matches[0] = false;
1139 return false;
1141 need_same_oprnds = true;
1142 first_op1 = gimple_assign_rhs2 (stmt);
1145 else if (rhs_code == WIDEN_LSHIFT_EXPR)
1147 need_same_oprnds = true;
1148 first_op1 = gimple_assign_rhs2 (stmt);
1150 else if (!ldst_p
1151 && rhs_code == BIT_FIELD_REF)
1153 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
1154 if (!is_a <bb_vec_info> (vinfo)
1155 || TREE_CODE (vec) != SSA_NAME
1156 /* When the element types are not compatible we pun the
1157 source to the target vectype which requires equal size. */
1158 || ((!VECTOR_TYPE_P (TREE_TYPE (vec))
1159 || !types_compatible_p (TREE_TYPE (vectype),
1160 TREE_TYPE (TREE_TYPE (vec))))
1161 && !operand_equal_p (TYPE_SIZE (vectype),
1162 TYPE_SIZE (TREE_TYPE (vec)))))
1164 if (dump_enabled_p ())
1165 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1166 "Build SLP failed: "
1167 "BIT_FIELD_REF not supported\n");
1168 /* Fatal mismatch. */
1169 matches[0] = false;
1170 return false;
1173 else if (rhs_code == CFN_DIV_POW2)
1175 need_same_oprnds = true;
1176 first_op1 = gimple_call_arg (call_stmt, 1);
1179 else
1181 if (first_stmt_code != rhs_code
1182 && alt_stmt_code == ERROR_MARK)
1183 alt_stmt_code = rhs_code;
1184 if ((first_stmt_code != rhs_code
1185 && (first_stmt_code != IMAGPART_EXPR
1186 || rhs_code != REALPART_EXPR)
1187 && (first_stmt_code != REALPART_EXPR
1188 || rhs_code != IMAGPART_EXPR)
1189 /* Handle mismatches in plus/minus by computing both
1190 and merging the results. */
1191 && !((first_stmt_code == PLUS_EXPR
1192 || first_stmt_code == MINUS_EXPR)
1193 && (alt_stmt_code == PLUS_EXPR
1194 || alt_stmt_code == MINUS_EXPR)
1195 && rhs_code == alt_stmt_code)
1196 && !(first_stmt_code.is_tree_code ()
1197 && rhs_code.is_tree_code ()
1198 && (TREE_CODE_CLASS (tree_code (first_stmt_code))
1199 == tcc_comparison)
1200 && (swap_tree_comparison (tree_code (first_stmt_code))
1201 == tree_code (rhs_code)))
1202 && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
1203 && (first_stmt_code == ARRAY_REF
1204 || first_stmt_code == BIT_FIELD_REF
1205 || first_stmt_code == INDIRECT_REF
1206 || first_stmt_code == COMPONENT_REF
1207 || first_stmt_code == MEM_REF)
1208 && (rhs_code == ARRAY_REF
1209 || rhs_code == BIT_FIELD_REF
1210 || rhs_code == INDIRECT_REF
1211 || rhs_code == COMPONENT_REF
1212 || rhs_code == MEM_REF)))
1213 || first_stmt_ldst_p != ldst_p
1214 || first_stmt_phi_p != phi_p)
1216 if (dump_enabled_p ())
1218 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1219 "Build SLP failed: different operation "
1220 "in stmt %G", stmt);
1221 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1222 "original stmt %G", first_stmt_info->stmt);
1224 /* Mismatch. */
1225 continue;
1228 if (!ldst_p
1229 && first_stmt_code == BIT_FIELD_REF
1230 && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
1231 != TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0)))
1233 if (dump_enabled_p ())
1234 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1235 "Build SLP failed: different BIT_FIELD_REF "
1236 "arguments in %G", stmt);
1237 /* Mismatch. */
1238 continue;
1241 if (call_stmt
1242 && first_stmt_code != CFN_MASK_LOAD
1243 && first_stmt_code != CFN_MASK_STORE)
1245 if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
1246 call_stmt))
1248 if (dump_enabled_p ())
1249 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1250 "Build SLP failed: different calls in %G",
1251 stmt);
1252 /* Mismatch. */
1253 continue;
1257 if ((phi_p || gimple_could_trap_p (stmt_info->stmt))
1258 && (gimple_bb (first_stmt_info->stmt)
1259 != gimple_bb (stmt_info->stmt)))
1261 if (dump_enabled_p ())
1262 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1263 "Build SLP failed: different BB for PHI "
1264 "or possibly trapping operation in %G", stmt);
1265 /* Mismatch. */
1266 continue;
1269 if (need_same_oprnds)
1271 tree other_op1 = gimple_arg (stmt, 1);
1272 if (!operand_equal_p (first_op1, other_op1, 0))
1274 if (dump_enabled_p ())
1275 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1276 "Build SLP failed: different shift "
1277 "arguments in %G", stmt);
1278 /* Mismatch. */
1279 continue;
1283 if (!types_compatible_p (vectype, *node_vectype))
1285 if (dump_enabled_p ())
1286 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1287 "Build SLP failed: different vector type "
1288 "in %G", stmt);
1289 /* Mismatch. */
1290 continue;
1294 /* Grouped store or load. */
1295 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1297 gcc_assert (ldst_p);
1298 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)))
1300 /* Store. */
1301 gcc_assert (rhs_code == CFN_MASK_STORE
1302 || REFERENCE_CLASS_P (lhs)
1303 || DECL_P (lhs));
1305 else
1307 /* Load. */
1308 first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
1309 if (prev_first_load)
1311 /* Check that there are no loads from different interleaving
1312 chains in the same node. */
1313 if (prev_first_load != first_load)
1315 if (dump_enabled_p ())
1316 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
1317 vect_location,
1318 "Build SLP failed: different "
1319 "interleaving chains in one node %G",
1320 stmt);
1321 /* Mismatch. */
1322 continue;
1325 else
1326 prev_first_load = first_load;
1329 /* Non-grouped store or load. */
1330 else if (ldst_p)
1332 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))
1333 && rhs_code != CFN_GATHER_LOAD
1334 && rhs_code != CFN_MASK_GATHER_LOAD
1335 /* Not grouped loads are handled as externals for BB
1336 vectorization. For loop vectorization we can handle
 1337 splats the same way we handle single element interleaving. */
1338 && (is_a <bb_vec_info> (vinfo)
1339 || stmt_info != first_stmt_info
1340 || STMT_VINFO_GATHER_SCATTER_P (stmt_info)))
1342 /* Not grouped load. */
1343 if (dump_enabled_p ())
1344 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1345 "Build SLP failed: not grouped load %G", stmt);
1347 if (i != 0)
1348 continue;
1349 /* Fatal mismatch. */
1350 matches[0] = false;
1351 return false;
1354 /* Not memory operation. */
1355 else
1357 if (!phi_p
1358 && rhs_code.is_tree_code ()
1359 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_binary
1360 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_unary
1361 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_expression
1362 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_comparison
1363 && rhs_code != VIEW_CONVERT_EXPR
1364 && rhs_code != CALL_EXPR
1365 && rhs_code != BIT_FIELD_REF)
1367 if (dump_enabled_p ())
1368 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1369 "Build SLP failed: operation unsupported %G",
1370 stmt);
1371 if (is_a <bb_vec_info> (vinfo) && i != 0)
1372 continue;
1373 /* Fatal mismatch. */
1374 matches[0] = false;
1375 return false;
1378 if (rhs_code == COND_EXPR)
1380 tree cond_expr = gimple_assign_rhs1 (stmt);
1381 enum tree_code cond_code = TREE_CODE (cond_expr);
1382 enum tree_code swap_code = ERROR_MARK;
1383 enum tree_code invert_code = ERROR_MARK;
1385 if (i == 0)
1386 first_cond_code = TREE_CODE (cond_expr);
1387 else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1389 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
1390 swap_code = swap_tree_comparison (cond_code);
1391 invert_code = invert_tree_comparison (cond_code, honor_nans);
1394 if (first_cond_code == cond_code)
1396 /* Isomorphic can be achieved by swapping. */
1397 else if (first_cond_code == swap_code)
1398 swap[i] = 1;
1399 /* Isomorphic can be achieved by inverting. */
1400 else if (first_cond_code == invert_code)
1401 swap[i] = 2;
1402 else
1404 if (dump_enabled_p ())
1405 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1406 "Build SLP failed: different"
1407 " operation %G", stmt);
1408 /* Mismatch. */
1409 continue;
1413 if (rhs_code.is_tree_code ()
1414 && TREE_CODE_CLASS ((tree_code)rhs_code) == tcc_comparison
1415 && (swap_tree_comparison ((tree_code)first_stmt_code)
1416 == (tree_code)rhs_code))
1417 swap[i] = 1;
1420 matches[i] = true;
1423 for (i = 0; i < group_size; ++i)
1424 if (!matches[i])
1425 return false;
1427 /* If we allowed a two-operation SLP node verify the target can cope
1428 with the permute we are going to use. */
1429 if (alt_stmt_code != ERROR_MARK
1430 && (!alt_stmt_code.is_tree_code ()
1431 || (TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_reference
1432 && TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_comparison)))
1434 *two_operators = true;
1437 if (maybe_soft_fail)
1439 unsigned HOST_WIDE_INT const_nunits;
1440 if (!TYPE_VECTOR_SUBPARTS
1441 (soft_fail_nunits_vectype).is_constant (&const_nunits)
1442 || const_nunits > group_size)
1443 matches[0] = false;
1444 else
1446 /* With constant vector elements simulate a mismatch at the
1447 point we need to split. */
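/* E.g. for a group of 6 stmts and vectors of 4 elements the last two
   lanes are marked as mismatching so the group gets split after the
   first full vector.  */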
1448 unsigned tail = group_size & (const_nunits - 1);
1449 memset (&matches[group_size - tail], 0, sizeof (bool) * tail);
1451 return false;
1454 return true;
 1457 /* Traits for the hash_map used to record failed SLP builds for a stmt set.
1458 Note we never remove apart from at destruction time so we do not
1459 need a special value for deleted that differs from empty. */
1460 struct bst_traits
1462 typedef vec <stmt_vec_info> value_type;
1463 typedef vec <stmt_vec_info> compare_type;
1464 static inline hashval_t hash (value_type);
1465 static inline bool equal (value_type existing, value_type candidate);
1466 static inline bool is_empty (value_type x) { return !x.exists (); }
1467 static inline bool is_deleted (value_type x) { return !x.exists (); }
1468 static const bool empty_zero_p = true;
1469 static inline void mark_empty (value_type &x) { x.release (); }
1470 static inline void mark_deleted (value_type &x) { x.release (); }
1471 static inline void remove (value_type &x) { x.release (); }
1473 inline hashval_t
1474 bst_traits::hash (value_type x)
1476 inchash::hash h;
1477 for (unsigned i = 0; i < x.length (); ++i)
1478 h.add_int (gimple_uid (x[i]->stmt));
1479 return h.end ();
1481 inline bool
1482 bst_traits::equal (value_type existing, value_type candidate)
1484 if (existing.length () != candidate.length ())
1485 return false;
1486 for (unsigned i = 0; i < existing.length (); ++i)
1487 if (existing[i] != candidate[i])
1488 return false;
1489 return true;
1492 /* ??? This was std::pair<std::pair<tree_code, vect_def_type>, tree>
1493 but then vec::insert does memmove and that's not compatible with
1494 std::pair. */
1495 struct chain_op_t
1497 chain_op_t (tree_code code_, vect_def_type dt_, tree op_)
1498 : code (code_), dt (dt_), op (op_) {}
1499 tree_code code;
1500 vect_def_type dt;
1501 tree op;
1504 /* Comparator for sorting associatable chains. */
1506 static int
1507 dt_sort_cmp (const void *op1_, const void *op2_, void *)
1509 auto *op1 = (const chain_op_t *) op1_;
1510 auto *op2 = (const chain_op_t *) op2_;
1511 if (op1->dt != op2->dt)
1512 return (int)op1->dt - (int)op2->dt;
1513 return (int)op1->code - (int)op2->code;
1516 /* Linearize the associatable expression chain at START with the
1517 associatable operation CODE (where PLUS_EXPR also allows MINUS_EXPR),
1518 filling CHAIN with the result and using WORKLIST as intermediate storage.
1519 CODE_STMT and ALT_CODE_STMT are filled with the first stmt using CODE
1520 or MINUS_EXPR. *CHAIN_STMTS if not NULL is filled with all computation
1521 stmts, starting with START. */
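/* For example the lane computation x = ((a + b) - c) + d is linearized
   into the chain { (+, d), (-, c), (+, a), (+, b) }, i.e. the multiset
   of signed leaf operands independent of the original association.  */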
1523 static void
1524 vect_slp_linearize_chain (vec_info *vinfo,
1525 vec<std::pair<tree_code, gimple *> > &worklist,
1526 vec<chain_op_t> &chain,
1527 enum tree_code code, gimple *start,
1528 gimple *&code_stmt, gimple *&alt_code_stmt,
1529 vec<gimple *> *chain_stmts)
1531 /* For each lane linearize the addition/subtraction (or other
1532 uniform associatable operation) expression tree. */
1533 worklist.safe_push (std::make_pair (code, start));
1534 while (!worklist.is_empty ())
1536 auto entry = worklist.pop ();
1537 gassign *stmt = as_a <gassign *> (entry.second);
1538 enum tree_code in_code = entry.first;
1539 enum tree_code this_code = gimple_assign_rhs_code (stmt);
1540 /* Pick some stmts suitable for SLP_TREE_REPRESENTATIVE. */
1541 if (!code_stmt
1542 && gimple_assign_rhs_code (stmt) == code)
1543 code_stmt = stmt;
1544 else if (!alt_code_stmt
1545 && gimple_assign_rhs_code (stmt) == MINUS_EXPR)
1546 alt_code_stmt = stmt;
1547 if (chain_stmts)
1548 chain_stmts->safe_push (stmt);
1549 for (unsigned opnum = 1; opnum <= 2; ++opnum)
1551 tree op = gimple_op (stmt, opnum);
1552 vect_def_type dt;
1553 stmt_vec_info def_stmt_info;
1554 bool res = vect_is_simple_use (op, vinfo, &dt, &def_stmt_info);
1555 gcc_assert (res);
1556 if (dt == vect_internal_def
1557 && is_pattern_stmt_p (def_stmt_info))
1558 op = gimple_get_lhs (def_stmt_info->stmt);
1559 gimple *use_stmt;
1560 use_operand_p use_p;
1561 if (dt == vect_internal_def
1562 && single_imm_use (op, &use_p, &use_stmt)
1563 && is_gimple_assign (def_stmt_info->stmt)
1564 && (gimple_assign_rhs_code (def_stmt_info->stmt) == code
1565 || (code == PLUS_EXPR
1566 && (gimple_assign_rhs_code (def_stmt_info->stmt)
1567 == MINUS_EXPR))))
1569 tree_code op_def_code = this_code;
1570 if (op_def_code == MINUS_EXPR && opnum == 1)
1571 op_def_code = PLUS_EXPR;
1572 if (in_code == MINUS_EXPR)
1573 op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
1574 worklist.safe_push (std::make_pair (op_def_code,
1575 def_stmt_info->stmt));
1577 else
1579 tree_code op_def_code = this_code;
1580 if (op_def_code == MINUS_EXPR && opnum == 1)
1581 op_def_code = PLUS_EXPR;
1582 if (in_code == MINUS_EXPR)
1583 op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
1584 chain.safe_push (chain_op_t (op_def_code, dt, op));
1590 typedef hash_map <vec <stmt_vec_info>, slp_tree,
1591 simple_hashmap_traits <bst_traits, slp_tree> >
1592 scalar_stmts_to_slp_tree_map_t;
1594 static slp_tree
1595 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1596 vec<stmt_vec_info> stmts, unsigned int group_size,
1597 poly_uint64 *max_nunits,
1598 bool *matches, unsigned *limit, unsigned *tree_size,
1599 scalar_stmts_to_slp_tree_map_t *bst_map);
1601 static slp_tree
1602 vect_build_slp_tree (vec_info *vinfo,
1603 vec<stmt_vec_info> stmts, unsigned int group_size,
1604 poly_uint64 *max_nunits,
1605 bool *matches, unsigned *limit, unsigned *tree_size,
1606 scalar_stmts_to_slp_tree_map_t *bst_map)
1608 if (slp_tree *leader = bst_map->get (stmts))
1610 if (dump_enabled_p ())
1611 dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
1612 !(*leader)->failed ? "" : "failed ",
1613 (void *) *leader);
1614 if (!(*leader)->failed)
1616 SLP_TREE_REF_COUNT (*leader)++;
1617 vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
1618 stmts.release ();
1619 return *leader;
1621 memcpy (matches, (*leader)->failed, sizeof (bool) * group_size);
1622 return NULL;
1625 /* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
1626 so we can pick up backedge destinations during discovery. */
1627 slp_tree res = new _slp_tree;
1628 SLP_TREE_DEF_TYPE (res) = vect_internal_def;
1629 SLP_TREE_SCALAR_STMTS (res) = stmts;
1630 bst_map->put (stmts.copy (), res);
1632 if (*limit == 0)
1634 if (dump_enabled_p ())
1635 dump_printf_loc (MSG_NOTE, vect_location,
1636 "SLP discovery limit exceeded\n");
1637 /* Mark the node invalid so we can detect those when still in use
1638 as backedge destinations. */
1639 SLP_TREE_SCALAR_STMTS (res) = vNULL;
1640 SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1641 res->failed = XNEWVEC (bool, group_size);
1642 memset (res->failed, 0, sizeof (bool) * group_size);
1643 memset (matches, 0, sizeof (bool) * group_size);
1644 return NULL;
1646 --*limit;
1648 if (dump_enabled_p ())
1649 dump_printf_loc (MSG_NOTE, vect_location,
1650 "starting SLP discovery for node %p\n", (void *) res);
1652 poly_uint64 this_max_nunits = 1;
1653 slp_tree res_ = vect_build_slp_tree_2 (vinfo, res, stmts, group_size,
1654 &this_max_nunits,
1655 matches, limit, tree_size, bst_map);
1656 if (!res_)
1658 if (dump_enabled_p ())
1659 dump_printf_loc (MSG_NOTE, vect_location,
1660 "SLP discovery for node %p failed\n", (void *) res);
1661 /* Mark the node invalid so we can detect those when still in use
1662 as backedge destinations. */
1663 SLP_TREE_SCALAR_STMTS (res) = vNULL;
1664 SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1665 res->failed = XNEWVEC (bool, group_size);
1666 if (flag_checking)
1668 unsigned i;
1669 for (i = 0; i < group_size; ++i)
1670 if (!matches[i])
1671 break;
1672 gcc_assert (i < group_size);
1674 memcpy (res->failed, matches, sizeof (bool) * group_size);
1676 else
1678 if (dump_enabled_p ())
1679 dump_printf_loc (MSG_NOTE, vect_location,
1680 "SLP discovery for node %p succeeded\n",
1681 (void *) res);
1682 gcc_assert (res_ == res);
1683 res->max_nunits = this_max_nunits;
1684 vect_update_max_nunits (max_nunits, this_max_nunits);
1685 /* Keep a reference for the bst_map use. */
1686 SLP_TREE_REF_COUNT (res)++;
1688 return res_;
1691 /* Helper for building an associated SLP node chain. */
1693 static void
1694 vect_slp_build_two_operator_nodes (slp_tree perm, tree vectype,
1695 slp_tree op0, slp_tree op1,
1696 stmt_vec_info oper1, stmt_vec_info oper2,
1697 vec<std::pair<unsigned, unsigned> > lperm)
1699 unsigned group_size = SLP_TREE_LANES (op1);
1701 slp_tree child1 = new _slp_tree;
1702 SLP_TREE_DEF_TYPE (child1) = vect_internal_def;
1703 SLP_TREE_VECTYPE (child1) = vectype;
1704 SLP_TREE_LANES (child1) = group_size;
1705 SLP_TREE_CHILDREN (child1).create (2);
1706 SLP_TREE_CHILDREN (child1).quick_push (op0);
1707 SLP_TREE_CHILDREN (child1).quick_push (op1);
1708 SLP_TREE_REPRESENTATIVE (child1) = oper1;
1710 slp_tree child2 = new _slp_tree;
1711 SLP_TREE_DEF_TYPE (child2) = vect_internal_def;
1712 SLP_TREE_VECTYPE (child2) = vectype;
1713 SLP_TREE_LANES (child2) = group_size;
1714 SLP_TREE_CHILDREN (child2).create (2);
1715 SLP_TREE_CHILDREN (child2).quick_push (op0);
1716 SLP_TREE_REF_COUNT (op0)++;
1717 SLP_TREE_CHILDREN (child2).quick_push (op1);
1718 SLP_TREE_REF_COUNT (op1)++;
1719 SLP_TREE_REPRESENTATIVE (child2) = oper2;
1721 SLP_TREE_DEF_TYPE (perm) = vect_internal_def;
1722 SLP_TREE_CODE (perm) = VEC_PERM_EXPR;
1723 SLP_TREE_VECTYPE (perm) = vectype;
1724 SLP_TREE_LANES (perm) = group_size;
1725 /* ??? We should set this NULL but that's not expected. */
1726 SLP_TREE_REPRESENTATIVE (perm) = oper1;
1727 SLP_TREE_LANE_PERMUTATION (perm) = lperm;
1728 SLP_TREE_CHILDREN (perm).quick_push (child1);
1729 SLP_TREE_CHILDREN (perm).quick_push (child2);
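/* The PERM node built above selects, per lane, either the result of
   CHILD1 (OPER1 applied to all lanes) or of CHILD2 (OPER2 applied to all
   lanes) according to LPERM; e.g. an alternating minus/plus group becomes
   a MINUS node, a PLUS node and a blending permute.  */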
1732 /* Recursively build an SLP tree starting from NODE.
 1733 Fail (and return NULL) if def-stmts are not
 1734 isomorphic, require data permutation or are of unsupported types of
 1735 operation. Otherwise, return the built SLP node, with MATCHES
 1736 indicating which lanes matched the first one. */
1739 static slp_tree
1740 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1741 vec<stmt_vec_info> stmts, unsigned int group_size,
1742 poly_uint64 *max_nunits,
1743 bool *matches, unsigned *limit, unsigned *tree_size,
1744 scalar_stmts_to_slp_tree_map_t *bst_map)
1746 unsigned nops, i, this_tree_size = 0;
1747 poly_uint64 this_max_nunits = *max_nunits;
1749 matches[0] = false;
1751 stmt_vec_info stmt_info = stmts[0];
1752 if (!is_a<gcall *> (stmt_info->stmt)
1753 && !is_a<gassign *> (stmt_info->stmt)
1754 && !is_a<gphi *> (stmt_info->stmt))
1755 return NULL;
1757 nops = gimple_num_args (stmt_info->stmt);
1758 if (const int *map = vect_get_operand_map (stmt_info->stmt))
1759 nops = map[0];
1761 /* If the SLP node is a PHI (induction or reduction), terminate
1762 the recursion. */
1763 bool *skip_args = XALLOCAVEC (bool, nops);
1764 memset (skip_args, 0, sizeof (bool) * nops);
1765 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
1766 if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
1768 tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
1769 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
1770 group_size);
1771 if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
1772 max_nunits))
1773 return NULL;
1775 vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
1776 if (def_type == vect_induction_def)
1778 /* Induction PHIs are not cycles but walk the initial
 1779 value. Only for inner loops though; for outer loops
1780 we need to pick up the value from the actual PHIs
1781 to more easily support peeling and epilogue vectorization. */
1782 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1783 if (!nested_in_vect_loop_p (loop, stmt_info))
1784 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1785 else
1786 loop = loop->inner;
1787 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1789 else if (def_type == vect_reduction_def
1790 || def_type == vect_double_reduction_def
1791 || def_type == vect_nested_cycle
1792 || def_type == vect_first_order_recurrence)
1794 /* Else def types have to match. */
1795 stmt_vec_info other_info;
1796 bool all_same = true;
1797 FOR_EACH_VEC_ELT (stmts, i, other_info)
1799 if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
1800 return NULL;
1801 if (other_info != stmt_info)
1802 all_same = false;
1804 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 1805 /* Reduction initial values are not explicitly represented. */
1806 if (def_type != vect_first_order_recurrence
1807 && !nested_in_vect_loop_p (loop, stmt_info))
1808 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1809 /* Reduction chain backedge defs are filled manually.
1810 ??? Need a better way to identify a SLP reduction chain PHI.
1811 Or a better overall way to SLP match those. */
1812 if (all_same && def_type == vect_reduction_def)
1813 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1815 else if (def_type != vect_internal_def)
1816 return NULL;
1820 bool two_operators = false;
1821 unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
1822 tree vectype = NULL_TREE;
1823 if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
1824 &this_max_nunits, matches, &two_operators,
1825 &vectype))
1826 return NULL;
1828 /* If the SLP node is a load, terminate the recursion unless masked. */
1829 if (STMT_VINFO_DATA_REF (stmt_info)
1830 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1832 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
1833 gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD)
1834 || gimple_call_internal_p (stmt, IFN_GATHER_LOAD)
1835 || gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD));
1836 else
1838 *max_nunits = this_max_nunits;
1839 (*tree_size)++;
1840 node = vect_create_new_slp_node (node, stmts, 0);
1841 SLP_TREE_VECTYPE (node) = vectype;
1842 /* And compute the load permutation. Whether it is actually
1843 a permutation depends on the unrolling factor which is
1844 decided later. */
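/* E.g. a node whose lanes load a[1], a[0], a[3], a[2] from a group
   starting at a[0] gets the load permutation { 1, 0, 3, 2 }.  */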
1845 vec<unsigned> load_permutation;
1846 int j;
1847 stmt_vec_info load_info;
1848 load_permutation.create (group_size);
1849 stmt_vec_info first_stmt_info
1850 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
1851 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
1853 int load_place;
1854 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1855 load_place = vect_get_place_in_interleaving_chain
1856 (load_info, first_stmt_info);
1857 else
1858 load_place = 0;
1859 gcc_assert (load_place != -1);
1860 load_permutation.safe_push (load_place);
1862 SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
1863 return node;
1866 else if (gimple_assign_single_p (stmt_info->stmt)
1867 && !gimple_vuse (stmt_info->stmt)
1868 && gimple_assign_rhs_code (stmt_info->stmt) == BIT_FIELD_REF)
1870 /* vect_build_slp_tree_2 determined that all BIT_FIELD_REFs reference
1871 the same SSA name vector, with a type compatible with vectype. */
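/* A purely illustrative example, not derived from the code here: with a
   V4SF source vector _1 holding 32-bit float elements, the extract
     x = BIT_FIELD_REF <_1, 32, 64>;
   has bit_field_size 32 and bit_field_offset 64 and therefore selects
   lane 64 / 32 == 2.  */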
1872 vec<std::pair<unsigned, unsigned> > lperm = vNULL;
1873 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0);
1874 stmt_vec_info estmt_info;
1875 FOR_EACH_VEC_ELT (stmts, i, estmt_info)
1877 gassign *estmt = as_a <gassign *> (estmt_info->stmt);
1878 tree bfref = gimple_assign_rhs1 (estmt);
1879 HOST_WIDE_INT lane;
1880 if (!known_eq (bit_field_size (bfref),
1881 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype))))
1882 || !constant_multiple_p (bit_field_offset (bfref),
1883 bit_field_size (bfref), &lane))
1885 lperm.release ();
1886 matches[0] = false;
1887 return NULL;
1889 lperm.safe_push (std::make_pair (0, (unsigned)lane));
1891 slp_tree vnode = vect_create_new_slp_node (vNULL);
1892 if (operand_equal_p (TYPE_SIZE (vectype), TYPE_SIZE (TREE_TYPE (vec))))
1893 /* ??? We record vectype here but we hide eventually necessary
1894 punning and instead rely on code generation to materialize
1895 VIEW_CONVERT_EXPRs as necessary. We instead should make
1896 this explicit somehow. */
1897 SLP_TREE_VECTYPE (vnode) = vectype;
1898 else
1900 /* For different size but compatible elements we can still
1901 use VEC_PERM_EXPR without punning. */
1902 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (vec))
1903 && types_compatible_p (TREE_TYPE (vectype),
1904 TREE_TYPE (TREE_TYPE (vec))));
1905 SLP_TREE_VECTYPE (vnode) = TREE_TYPE (vec);
1907 auto nunits = TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (vnode));
1908 unsigned HOST_WIDE_INT const_nunits;
1909 if (nunits.is_constant (&const_nunits))
1910 SLP_TREE_LANES (vnode) = const_nunits;
1911 SLP_TREE_VEC_DEFS (vnode).safe_push (vec);
1912 /* We are always building a permutation node even if it is an identity
1913 permute to shield the rest of the vectorizer from the odd node
1914 representing an actual vector without any scalar ops.
1915 ??? We could hide it completely by making the permute node
1916 external? */
1917 node = vect_create_new_slp_node (node, stmts, 1);
1918 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
1919 SLP_TREE_LANE_PERMUTATION (node) = lperm;
1920 SLP_TREE_VECTYPE (node) = vectype;
1921 SLP_TREE_CHILDREN (node).quick_push (vnode);
1922 return node;
1924 /* When discovery reaches an associatable operation see whether we can
1925 improve that to match up lanes in a way superior to the operand
1926 swapping code which at most looks at two defs.
1927 ??? For BB vectorization we cannot do the brute-force search
1928 for matching as we can succeed by means of builds from scalars
1929 and have no good way to "cost" one build against another. */
1930 else if (is_a <loop_vec_info> (vinfo)
1931 /* ??? We don't handle !vect_internal_def defs below. */
1932 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
1933 && is_gimple_assign (stmt_info->stmt)
1934 && (associative_tree_code (gimple_assign_rhs_code (stmt_info->stmt))
1935 || gimple_assign_rhs_code (stmt_info->stmt) == MINUS_EXPR)
1936 && ((FLOAT_TYPE_P (vectype) && flag_associative_math)
1937 || (INTEGRAL_TYPE_P (TREE_TYPE (vectype))
1938 && TYPE_OVERFLOW_WRAPS (TREE_TYPE (vectype)))))
1940 /* See if we have a chain of (mixed) adds or subtracts or other
1941 associatable ops. */
1942 enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
1943 if (code == MINUS_EXPR)
1944 code = PLUS_EXPR;
1945 stmt_vec_info other_op_stmt_info = NULL;
1946 stmt_vec_info op_stmt_info = NULL;
1947 unsigned chain_len = 0;
1948 auto_vec<chain_op_t> chain;
1949 auto_vec<std::pair<tree_code, gimple *> > worklist;
1950 auto_vec<vec<chain_op_t> > chains (group_size);
1951 auto_vec<slp_tree, 4> children;
1952 bool hard_fail = true;
1953 for (unsigned lane = 0; lane < group_size; ++lane)
1955 /* For each lane linearize the addition/subtraction (or other
1956 uniform associatable operation) expression tree. */
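/* A purely illustrative example: for a lane computing
     _1 = a - b;  x = _1 + c;
   the linearized chain is { +a, -b, +c } with CODE taken as PLUS_EXPR,
   i.e. each chain element records whether its operand is added or
   subtracted.  */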
1957 gimple *op_stmt = NULL, *other_op_stmt = NULL;
1958 vect_slp_linearize_chain (vinfo, worklist, chain, code,
1959 stmts[lane]->stmt, op_stmt, other_op_stmt,
1960 NULL);
1961 if (!op_stmt_info && op_stmt)
1962 op_stmt_info = vinfo->lookup_stmt (op_stmt);
1963 if (!other_op_stmt_info && other_op_stmt)
1964 other_op_stmt_info = vinfo->lookup_stmt (other_op_stmt);
1965 if (chain.length () == 2)
1967 /* In a chain of just two elements resort to the regular
1968 operand swapping scheme. If we run into a length
1969 mismatch still hard-FAIL. */
1970 if (chain_len == 0)
1971 hard_fail = false;
1972 else
1974 matches[lane] = false;
1975 /* ??? We might want to process the other lanes, but
1976 make sure to not give false matching hints to the
1977 caller for lanes we did not process. */
1978 if (lane != group_size - 1)
1979 matches[0] = false;
1981 break;
1983 else if (chain_len == 0)
1984 chain_len = chain.length ();
1985 else if (chain.length () != chain_len)
1987 /* ??? Here we could slip in magic to compensate with
1988 neutral operands. */
1989 matches[lane] = false;
1990 if (lane != group_size - 1)
1991 matches[0] = false;
1992 break;
1994 chains.quick_push (chain.copy ());
1995 chain.truncate (0);
1997 if (chains.length () == group_size)
1999 /* We cannot yet use SLP_TREE_CODE to communicate the operation. */
2000 if (!op_stmt_info)
2002 hard_fail = false;
2003 goto out;
2005 /* Now we have a set of chains with the same length. */
2006 /* 1. pre-sort according to def_type and operation. */
2007 for (unsigned lane = 0; lane < group_size; ++lane)
2008 chains[lane].stablesort (dt_sort_cmp, vinfo);
2009 if (dump_enabled_p ())
2011 dump_printf_loc (MSG_NOTE, vect_location,
2012 "pre-sorted chains of %s\n",
2013 get_tree_code_name (code));
2014 for (unsigned lane = 0; lane < group_size; ++lane)
2016 for (unsigned opnum = 0; opnum < chain_len; ++opnum)
2017 dump_printf (MSG_NOTE, "%s %T ",
2018 get_tree_code_name (chains[lane][opnum].code),
2019 chains[lane][opnum].op);
2020 dump_printf (MSG_NOTE, "\n");
2023 /* 2. try to build children nodes, associating as necessary. */
2024 for (unsigned n = 0; n < chain_len; ++n)
2026 vect_def_type dt = chains[0][n].dt;
2027 unsigned lane;
2028 for (lane = 0; lane < group_size; ++lane)
2029 if (chains[lane][n].dt != dt)
2031 if (dt == vect_constant_def
2032 && chains[lane][n].dt == vect_external_def)
2033 dt = vect_external_def;
2034 else if (dt == vect_external_def
2035 && chains[lane][n].dt == vect_constant_def)
2037 else
2038 break;
2040 if (lane != group_size)
2042 if (dump_enabled_p ())
2043 dump_printf_loc (MSG_NOTE, vect_location,
2044 "giving up on chain due to mismatched "
2045 "def types\n");
2046 matches[lane] = false;
2047 if (lane != group_size - 1)
2048 matches[0] = false;
2049 goto out;
2051 if (dt == vect_constant_def
2052 || dt == vect_external_def)
2054 /* Check whether we can build the invariant. If we can't
2055 we never will be able to. */
2056 tree type = TREE_TYPE (chains[0][n].op);
2057 if (!GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
2058 && (TREE_CODE (type) == BOOLEAN_TYPE
2059 || !can_duplicate_and_interleave_p (vinfo, group_size,
2060 type)))
2062 matches[0] = false;
2063 goto out;
2065 vec<tree> ops;
2066 ops.create (group_size);
2067 for (lane = 0; lane < group_size; ++lane)
2068 ops.quick_push (chains[lane][n].op);
2069 slp_tree child = vect_create_new_slp_node (ops);
2070 SLP_TREE_DEF_TYPE (child) = dt;
2071 children.safe_push (child);
2073 else if (dt != vect_internal_def)
2075 /* Not sure, we might need something special.
2076 gcc.dg/vect/pr96854.c,
2077 gfortran.dg/vect/fast-math-pr37021.f90
2078 and gfortran.dg/vect/pr61171.f trigger. */
2079 /* Soft-fail for now. */
2080 hard_fail = false;
2081 goto out;
2083 else
2085 vec<stmt_vec_info> op_stmts;
2086 op_stmts.create (group_size);
2087 slp_tree child = NULL;
2088 /* Brute-force our way. We have to consider a lane
2089 failing after fixing an earlier fail up in the
2090 SLP discovery recursion. So track the current
2091 permute per lane. */
2092 unsigned *perms = XALLOCAVEC (unsigned, group_size);
2093 memset (perms, 0, sizeof (unsigned) * group_size);
2096 op_stmts.truncate (0);
2097 for (lane = 0; lane < group_size; ++lane)
2098 op_stmts.quick_push
2099 (vinfo->lookup_def (chains[lane][n].op));
2100 child = vect_build_slp_tree (vinfo, op_stmts,
2101 group_size, &this_max_nunits,
2102 matches, limit,
2103 &this_tree_size, bst_map);
2104 /* ??? We're likely getting too many fatal mismatches
2105 here so maybe we want to ignore them (but then we
2106 have no idea which lanes fatally mismatched). */
2107 if (child || !matches[0])
2108 break;
2109 /* Swap another lane we have not yet matched up into
2110 lanes that did not match. If we run out of
2111 permute possibilities for a lane terminate the
2112 search. */
2113 bool term = false;
2114 for (lane = 1; lane < group_size; ++lane)
2115 if (!matches[lane])
2117 if (n + perms[lane] + 1 == chain_len)
2119 term = true;
2120 break;
2122 std::swap (chains[lane][n],
2123 chains[lane][n + perms[lane] + 1]);
2124 perms[lane]++;
2126 if (term)
2127 break;
2129 while (1);
2130 if (!child)
2132 if (dump_enabled_p ())
2133 dump_printf_loc (MSG_NOTE, vect_location,
2134 "failed to match up op %d\n", n);
2135 op_stmts.release ();
2136 if (lane != group_size - 1)
2137 matches[0] = false;
2138 else
2139 matches[lane] = false;
2140 goto out;
2142 if (dump_enabled_p ())
2144 dump_printf_loc (MSG_NOTE, vect_location,
2145 "matched up op %d to\n", n);
2146 vect_print_slp_tree (MSG_NOTE, vect_location, child);
2148 children.safe_push (child);
2151 /* 3. build SLP nodes to combine the chain. */
2152 for (unsigned lane = 0; lane < group_size; ++lane)
2153 if (chains[lane][0].code != code)
2155 /* See if there's any alternate all-PLUS entry. */
2156 unsigned n;
2157 for (n = 1; n < chain_len; ++n)
2159 for (lane = 0; lane < group_size; ++lane)
2160 if (chains[lane][n].code != code)
2161 break;
2162 if (lane == group_size)
2163 break;
2165 if (n != chain_len)
2167 /* Swap that in at first position. */
2168 std::swap (children[0], children[n]);
2169 for (lane = 0; lane < group_size; ++lane)
2170 std::swap (chains[lane][0], chains[lane][n]);
2172 else
2174 /* ??? When this triggers and we end up with two
2175 vect_constant/external_def operands up-front, things break (ICE)
2176 spectacularly when finding an insertion place for the
2177 all-constant op. We should have a fully
2178 vect_internal_def operand though(?) so we can swap
2179 that into first place and then prepend the all-zero
2180 constant. */
2181 if (dump_enabled_p ())
2182 dump_printf_loc (MSG_NOTE, vect_location,
2183 "inserting constant zero to compensate "
2184 "for (partially) negated first "
2185 "operand\n");
2186 chain_len++;
2187 for (lane = 0; lane < group_size; ++lane)
2188 chains[lane].safe_insert
2189 (0, chain_op_t (code, vect_constant_def, NULL_TREE));
2190 vec<tree> zero_ops;
2191 zero_ops.create (group_size);
2192 zero_ops.quick_push (build_zero_cst (TREE_TYPE (vectype)));
2193 for (lane = 1; lane < group_size; ++lane)
2194 zero_ops.quick_push (zero_ops[0]);
2195 slp_tree zero = vect_create_new_slp_node (zero_ops);
2196 SLP_TREE_DEF_TYPE (zero) = vect_constant_def;
2197 children.safe_insert (0, zero);
2199 break;
2201 for (unsigned i = 1; i < children.length (); ++i)
2203 slp_tree op0 = children[i - 1];
2204 slp_tree op1 = children[i];
2205 bool this_two_op = false;
2206 for (unsigned lane = 0; lane < group_size; ++lane)
2207 if (chains[lane][i].code != chains[0][i].code)
2209 this_two_op = true;
2210 break;
2212 slp_tree child;
2213 if (i == children.length () - 1)
2214 child = vect_create_new_slp_node (node, stmts, 2);
2215 else
2216 child = vect_create_new_slp_node (2, ERROR_MARK);
2217 if (this_two_op)
2219 vec<std::pair<unsigned, unsigned> > lperm;
2220 lperm.create (group_size);
2221 for (unsigned lane = 0; lane < group_size; ++lane)
2222 lperm.quick_push (std::make_pair
2223 (chains[lane][i].code != chains[0][i].code, lane));
2224 vect_slp_build_two_operator_nodes (child, vectype, op0, op1,
2225 (chains[0][i].code == code
2226 ? op_stmt_info
2227 : other_op_stmt_info),
2228 (chains[0][i].code == code
2229 ? other_op_stmt_info
2230 : op_stmt_info),
2231 lperm);
2233 else
2235 SLP_TREE_DEF_TYPE (child) = vect_internal_def;
2236 SLP_TREE_VECTYPE (child) = vectype;
2237 SLP_TREE_LANES (child) = group_size;
2238 SLP_TREE_CHILDREN (child).quick_push (op0);
2239 SLP_TREE_CHILDREN (child).quick_push (op1);
2240 SLP_TREE_REPRESENTATIVE (child)
2241 = (chains[0][i].code == code
2242 ? op_stmt_info : other_op_stmt_info);
2244 children[i] = child;
2246 *tree_size += this_tree_size + 1;
2247 *max_nunits = this_max_nunits;
2248 while (!chains.is_empty ())
2249 chains.pop ().release ();
2250 return node;
2252 out:
2253 while (!children.is_empty ())
2254 vect_free_slp_tree (children.pop ());
2255 while (!chains.is_empty ())
2256 chains.pop ().release ();
2257 /* Hard-fail, otherwise we might run into quadratic processing of the
2258 chains starting one stmt into the chain again. */
2259 if (hard_fail)
2260 return NULL;
2261 /* Fall thru to normal processing. */
2264 /* Get at the operands, verifying they are compatible. */
2265 vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
2266 slp_oprnd_info oprnd_info;
2267 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
2269 int res = vect_get_and_check_slp_defs (vinfo, swap[i], skip_args,
2270 stmts, i, &oprnds_info);
2271 if (res != 0)
2272 matches[(res == -1) ? 0 : i] = false;
2273 if (!matches[0])
2274 break;
2276 for (i = 0; i < group_size; ++i)
2277 if (!matches[i])
2279 vect_free_oprnd_info (oprnds_info);
2280 return NULL;
2282 swap = NULL;
2284 auto_vec<slp_tree, 4> children;
2286 stmt_info = stmts[0];
2288 /* Create SLP_TREE nodes for the definition node/s. */
2289 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
2291 slp_tree child;
2292 unsigned int j;
2294 /* We're skipping certain operands from processing, for example
2295 outer loop reduction initial defs. */
2296 if (skip_args[i])
2298 children.safe_push (NULL);
2299 continue;
2302 if (oprnd_info->first_dt == vect_uninitialized_def)
2304 /* COND_EXPRs eventually have one operand too many if the
2305 condition is an SSA name. */
2306 gcc_assert (i == 3 && nops == 4);
2307 continue;
2310 if (is_a <bb_vec_info> (vinfo)
2311 && oprnd_info->first_dt == vect_internal_def
2312 && !oprnd_info->any_pattern)
2314 /* For BB vectorization, if all defs are the same do not
2315 bother to continue the build along the single-lane
2316 graph but use a splat of the scalar value. */
2317 stmt_vec_info first_def = oprnd_info->def_stmts[0];
2318 for (j = 1; j < group_size; ++j)
2319 if (oprnd_info->def_stmts[j] != first_def)
2320 break;
2321 if (j == group_size
2322 /* But avoid doing this for loads where we may be
2323 able to CSE things, unless the stmt is not
2324 vectorizable. */
2325 && (!STMT_VINFO_VECTORIZABLE (first_def)
2326 || !gimple_vuse (first_def->stmt)))
2328 if (dump_enabled_p ())
2329 dump_printf_loc (MSG_NOTE, vect_location,
2330 "Using a splat of the uniform operand %G",
2331 first_def->stmt);
2332 oprnd_info->first_dt = vect_external_def;
2336 if (oprnd_info->first_dt == vect_external_def
2337 || oprnd_info->first_dt == vect_constant_def)
2339 slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
2340 SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
2341 oprnd_info->ops = vNULL;
2342 children.safe_push (invnode);
2343 continue;
2346 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2347 group_size, &this_max_nunits,
2348 matches, limit,
2349 &this_tree_size, bst_map)) != NULL)
2351 oprnd_info->def_stmts = vNULL;
2352 children.safe_push (child);
2353 continue;
2356 /* If the SLP build for operand zero failed and operand zero
2357 and one can be commuted, try that for the scalar stmts
2358 that failed the match. */
2359 if (i == 0
2360 /* A first scalar stmt mismatch signals a fatal mismatch. */
2361 && matches[0]
2362 /* ??? For COND_EXPRs we can swap the comparison operands
2363 as well as the arms under some constraints. */
2364 && nops == 2
2365 && oprnds_info[1]->first_dt == vect_internal_def
2366 && is_gimple_assign (stmt_info->stmt)
2367 /* Swapping operands for reductions breaks assumptions later on. */
2368 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
2369 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
2371 /* See whether we can swap the matching or the non-matching
2372 stmt operands. */
2373 bool swap_not_matching = true;
2376 for (j = 0; j < group_size; ++j)
2378 if (matches[j] != !swap_not_matching)
2379 continue;
2380 stmt_vec_info stmt_info = stmts[j];
2381 /* Verify if we can swap operands of this stmt. */
2382 gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
2383 if (!stmt
2384 || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
2386 if (!swap_not_matching)
2387 goto fail;
2388 swap_not_matching = false;
2389 break;
2393 while (j != group_size);
2395 /* Swap mismatched definition stmts. */
2396 if (dump_enabled_p ())
2397 dump_printf_loc (MSG_NOTE, vect_location,
2398 "Re-trying with swapped operands of stmts ");
2399 for (j = 0; j < group_size; ++j)
2400 if (matches[j] == !swap_not_matching)
2402 std::swap (oprnds_info[0]->def_stmts[j],
2403 oprnds_info[1]->def_stmts[j]);
2404 std::swap (oprnds_info[0]->ops[j],
2405 oprnds_info[1]->ops[j]);
2406 if (dump_enabled_p ())
2407 dump_printf (MSG_NOTE, "%d ", j);
2409 if (dump_enabled_p ())
2410 dump_printf (MSG_NOTE, "\n");
2411 /* After swapping some operands we lost track of whether an
2412 operand has any pattern defs, so be conservative here. */
2413 if (oprnds_info[0]->any_pattern || oprnds_info[1]->any_pattern)
2414 oprnds_info[0]->any_pattern = oprnds_info[1]->any_pattern = true;
2415 /* And try again with scratch 'matches' ... */
2416 bool *tem = XALLOCAVEC (bool, group_size);
2417 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2418 group_size, &this_max_nunits,
2419 tem, limit,
2420 &this_tree_size, bst_map)) != NULL)
2422 oprnd_info->def_stmts = vNULL;
2423 children.safe_push (child);
2424 continue;
2427 fail:
2429 /* If the SLP build failed and we are analyzing a basic block,
2430 simply treat nodes we fail to build as externally defined
2431 (and thus build vectors from the scalar defs).
2432 The cost model will reject outright expensive cases.
2433 ??? This doesn't treat cases where permutation ultimately
2434 fails (or we don't try permutation below). Ideally we'd
2435 even compute a permutation that will end up with the maximum
2436 SLP tree size... */
2437 if (is_a <bb_vec_info> (vinfo)
2438 /* ??? Rejecting patterns this way doesn't work. We'd have to
2439 do extra work to cancel the pattern so the uses see the
2440 scalar version. */
2441 && !is_pattern_stmt_p (stmt_info)
2442 && !oprnd_info->any_pattern)
2444 /* But if there's a leading vector sized set of matching stmts
2445 fail here so we can split the group. This matches the condition
2446 vect_analyze_slp_instance uses. */
2447 /* ??? We might want to split here and combine the results to support
2448 multiple vector sizes better. */
2449 for (j = 0; j < group_size; ++j)
2450 if (!matches[j])
2451 break;
2452 if (!known_ge (j, TYPE_VECTOR_SUBPARTS (vectype)))
2454 if (dump_enabled_p ())
2455 dump_printf_loc (MSG_NOTE, vect_location,
2456 "Building vector operands from scalars\n");
2457 this_tree_size++;
2458 child = vect_create_new_slp_node (oprnd_info->ops);
2459 children.safe_push (child);
2460 oprnd_info->ops = vNULL;
2461 continue;
2465 gcc_assert (child == NULL);
2466 FOR_EACH_VEC_ELT (children, j, child)
2467 if (child)
2468 vect_free_slp_tree (child);
2469 vect_free_oprnd_info (oprnds_info);
2470 return NULL;
2473 vect_free_oprnd_info (oprnds_info);
2475 /* If all children of this node are built up from uniform scalars,
2476 or if building it requires more than one possibly expensive vector
2477 construction, just throw the node away so that it is built up from
2478 scalars instead. The exception is the SLP node for the vector store. */
2479 if (is_a <bb_vec_info> (vinfo)
2480 && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
2481 /* ??? Rejecting patterns this way doesn't work. We'd have to
2482 do extra work to cancel the pattern so the uses see the
2483 scalar version. */
2484 && !is_pattern_stmt_p (stmt_info))
2486 slp_tree child;
2487 unsigned j;
2488 bool all_uniform_p = true;
2489 unsigned n_vector_builds = 0;
2490 FOR_EACH_VEC_ELT (children, j, child)
2492 if (!child)
2494 else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
2495 all_uniform_p = false;
2496 else if (!vect_slp_tree_uniform_p (child))
2498 all_uniform_p = false;
2499 if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
2500 n_vector_builds++;
2503 if (all_uniform_p
2504 || n_vector_builds > 1
2505 || (n_vector_builds == children.length ()
2506 && is_a <gphi *> (stmt_info->stmt)))
2508 /* Roll back. */
2509 matches[0] = false;
2510 FOR_EACH_VEC_ELT (children, j, child)
2511 if (child)
2512 vect_free_slp_tree (child);
2514 if (dump_enabled_p ())
2515 dump_printf_loc (MSG_NOTE, vect_location,
2516 "Building parent vector operands from "
2517 "scalars instead\n");
2518 return NULL;
2522 *tree_size += this_tree_size + 1;
2523 *max_nunits = this_max_nunits;
2525 if (two_operators)
2527 /* ??? We'd likely want to either cache in bst_map something like
2528 { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
2529 the true { a+b, a+b, a+b, a+b } ... but there we don't have
2530 explicit stmts to put in so the keying on 'stmts' doesn't
2531 work (but we have the same issue with nodes that use 'ops'). */
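/* A purely illustrative example of the two-operator case handled here:
   for the group
     a[0] = b[0] + c[0];   a[1] = b[1] - c[1];
     a[2] = b[2] + c[2];   a[3] = b[3] - c[3];
   node ONE computes all lanes as PLUS_EXPR, node TWO as MINUS_EXPR, and
   the lane permutation built below is { (0,0) (1,1) (0,2) (1,3) },
   selecting the even lanes from ONE and the odd lanes from TWO.  */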
2532 slp_tree one = new _slp_tree;
2533 slp_tree two = new _slp_tree;
2534 SLP_TREE_DEF_TYPE (one) = vect_internal_def;
2535 SLP_TREE_DEF_TYPE (two) = vect_internal_def;
2536 SLP_TREE_VECTYPE (one) = vectype;
2537 SLP_TREE_VECTYPE (two) = vectype;
2538 SLP_TREE_CHILDREN (one).safe_splice (children);
2539 SLP_TREE_CHILDREN (two).safe_splice (children);
2540 slp_tree child;
2541 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
2542 SLP_TREE_REF_COUNT (child)++;
2544 /* Here we record the original defs since this
2545 node represents the final lane configuration. */
2546 node = vect_create_new_slp_node (node, stmts, 2);
2547 SLP_TREE_VECTYPE (node) = vectype;
2548 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
2549 SLP_TREE_CHILDREN (node).quick_push (one);
2550 SLP_TREE_CHILDREN (node).quick_push (two);
2551 gassign *stmt = as_a <gassign *> (stmts[0]->stmt);
2552 enum tree_code code0 = gimple_assign_rhs_code (stmt);
2553 enum tree_code ocode = ERROR_MARK;
2554 stmt_vec_info ostmt_info;
2555 unsigned j = 0;
2556 FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
2558 gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
2559 if (gimple_assign_rhs_code (ostmt) != code0)
2561 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (1, i));
2562 ocode = gimple_assign_rhs_code (ostmt);
2563 j = i;
2565 else
2566 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (0, i));
2568 SLP_TREE_CODE (one) = code0;
2569 SLP_TREE_CODE (two) = ocode;
2570 SLP_TREE_LANES (one) = stmts.length ();
2571 SLP_TREE_LANES (two) = stmts.length ();
2572 SLP_TREE_REPRESENTATIVE (one) = stmts[0];
2573 SLP_TREE_REPRESENTATIVE (two) = stmts[j];
2574 return node;
2577 node = vect_create_new_slp_node (node, stmts, nops);
2578 SLP_TREE_VECTYPE (node) = vectype;
2579 SLP_TREE_CHILDREN (node).splice (children);
2580 return node;
2583 /* Dump a single SLP tree NODE. */
2585 static void
2586 vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
2587 slp_tree node)
2589 unsigned i, j;
2590 slp_tree child;
2591 stmt_vec_info stmt_info;
2592 tree op;
2594 dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
2595 dump_user_location_t user_loc = loc.get_user_location ();
2596 dump_printf_loc (metadata, user_loc,
2597 "node%s %p (max_nunits=" HOST_WIDE_INT_PRINT_UNSIGNED
2598 ", refcnt=%u)",
2599 SLP_TREE_DEF_TYPE (node) == vect_external_def
2600 ? " (external)"
2601 : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
2602 ? " (constant)"
2603 : ""), (void *) node,
2604 estimated_poly_value (node->max_nunits),
2605 SLP_TREE_REF_COUNT (node));
2606 if (SLP_TREE_VECTYPE (node))
2607 dump_printf (metadata, " %T", SLP_TREE_VECTYPE (node));
2608 dump_printf (metadata, "\n");
2609 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
2611 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
2612 dump_printf_loc (metadata, user_loc, "op: VEC_PERM_EXPR\n");
2613 else
2614 dump_printf_loc (metadata, user_loc, "op template: %G",
2615 SLP_TREE_REPRESENTATIVE (node)->stmt);
2617 if (SLP_TREE_SCALAR_STMTS (node).exists ())
2618 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2619 dump_printf_loc (metadata, user_loc, "\tstmt %u %G", i, stmt_info->stmt);
2620 else
2622 dump_printf_loc (metadata, user_loc, "\t{ ");
2623 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
2624 dump_printf (metadata, "%T%s ", op,
2625 i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
2626 dump_printf (metadata, "}\n");
2628 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2630 dump_printf_loc (metadata, user_loc, "\tload permutation {");
2631 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
2632 dump_printf (dump_kind, " %u", j);
2633 dump_printf (dump_kind, " }\n");
2635 if (SLP_TREE_LANE_PERMUTATION (node).exists ())
2637 dump_printf_loc (metadata, user_loc, "\tlane permutation {");
2638 for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
2639 dump_printf (dump_kind, " %u[%u]",
2640 SLP_TREE_LANE_PERMUTATION (node)[i].first,
2641 SLP_TREE_LANE_PERMUTATION (node)[i].second);
2642 dump_printf (dump_kind, " }\n");
2644 if (SLP_TREE_CHILDREN (node).is_empty ())
2645 return;
2646 dump_printf_loc (metadata, user_loc, "\tchildren");
2647 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2648 dump_printf (dump_kind, " %p", (void *)child);
2649 dump_printf (dump_kind, "\n");
2652 DEBUG_FUNCTION void
2653 debug (slp_tree node)
2655 debug_dump_context ctx;
2656 vect_print_slp_tree (MSG_NOTE,
2657 dump_location_t::from_location_t (UNKNOWN_LOCATION),
2658 node);
2661 /* Recursive helper for the dot producer below. */
2663 static void
2664 dot_slp_tree (FILE *f, slp_tree node, hash_set<slp_tree> &visited)
2666 if (visited.add (node))
2667 return;
2669 fprintf (f, "\"%p\" [label=\"", (void *)node);
2670 vect_print_slp_tree (MSG_NOTE,
2671 dump_location_t::from_location_t (UNKNOWN_LOCATION),
2672 node);
2673 fprintf (f, "\"];\n");
2676 for (slp_tree child : SLP_TREE_CHILDREN (node))
2677 fprintf (f, "\"%p\" -> \"%p\";", (void *)node, (void *)child);
2679 for (slp_tree child : SLP_TREE_CHILDREN (node))
2680 if (child)
2681 dot_slp_tree (f, child, visited);
2684 DEBUG_FUNCTION void
2685 dot_slp_tree (const char *fname, slp_tree node)
2687 FILE *f = fopen (fname, "w");
2688 fprintf (f, "digraph {\n");
2689 fflush (f);
2691 debug_dump_context ctx (f);
2692 hash_set<slp_tree> visited;
2693 dot_slp_tree (f, node, visited);
2695 fflush (f);
2696 fprintf (f, "}\n");
2697 fclose (f);
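/* Both debug () and dot_slp_tree () above are DEBUG_FUNCTIONs meant to be
   invoked from a debugger session rather than from the compiler itself;
   for example (illustrative):
     (gdb) call debug (node)
     (gdb) call dot_slp_tree ("/tmp/slp.dot", node)
   The resulting dot file can then be rendered with graphviz,
   e.g. "dot -Tpdf".  */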
2700 /* Dump a slp tree NODE using flags specified in DUMP_KIND. */
2702 static void
2703 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2704 slp_tree node, hash_set<slp_tree> &visited)
2706 unsigned i;
2707 slp_tree child;
2709 if (visited.add (node))
2710 return;
2712 vect_print_slp_tree (dump_kind, loc, node);
2714 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2715 if (child)
2716 vect_print_slp_graph (dump_kind, loc, child, visited);
2719 static void
2720 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2721 slp_tree entry)
2723 hash_set<slp_tree> visited;
2724 vect_print_slp_graph (dump_kind, loc, entry, visited);
2727 /* Mark the tree rooted at NODE with PURE_SLP. */
2729 static void
2730 vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
2732 int i;
2733 stmt_vec_info stmt_info;
2734 slp_tree child;
2736 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2737 return;
2739 if (visited.add (node))
2740 return;
2742 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2743 STMT_SLP_TYPE (stmt_info) = pure_slp;
2745 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2746 if (child)
2747 vect_mark_slp_stmts (child, visited);
2750 static void
2751 vect_mark_slp_stmts (slp_tree node)
2753 hash_set<slp_tree> visited;
2754 vect_mark_slp_stmts (node, visited);
2757 /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */
2759 static void
2760 vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
2762 int i;
2763 stmt_vec_info stmt_info;
2764 slp_tree child;
2766 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2767 return;
2769 if (visited.add (node))
2770 return;
2772 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2774 gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
2775 || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
2776 STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
2779 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2780 if (child)
2781 vect_mark_slp_stmts_relevant (child, visited);
2784 static void
2785 vect_mark_slp_stmts_relevant (slp_tree node)
2787 hash_set<slp_tree> visited;
2788 vect_mark_slp_stmts_relevant (node, visited);
2792 /* Gather loads in the SLP graph NODE and populate the LOADS array. */
2794 static void
2795 vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
2796 hash_set<slp_tree> &visited)
2798 if (!node || visited.add (node))
2799 return;
2801 if (SLP_TREE_CHILDREN (node).length () == 0)
2803 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2804 return;
2805 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2806 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2807 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2808 loads.safe_push (node);
2810 else
2812 unsigned i;
2813 slp_tree child;
2814 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2815 vect_gather_slp_loads (loads, child, visited);
2822 /* Find the last scalar stmt in NODE. */
2822 stmt_vec_info
2823 vect_find_last_scalar_stmt_in_slp (slp_tree node)
2825 stmt_vec_info last = NULL;
2826 stmt_vec_info stmt_vinfo;
2828 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2830 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2831 last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
2834 return last;
2837 /* Find the first stmt in NODE. */
2839 stmt_vec_info
2840 vect_find_first_scalar_stmt_in_slp (slp_tree node)
2842 stmt_vec_info first = NULL;
2843 stmt_vec_info stmt_vinfo;
2845 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2847 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2848 if (!first
2849 || get_later_stmt (stmt_vinfo, first) == first)
2850 first = stmt_vinfo;
2853 return first;
2856 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
2857 two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
2858 (also containing the first GROUP1_SIZE stmts, since stores are
2859 consecutive), the second containing the remainder.
2860 Return the first stmt in the second group. */
2862 static stmt_vec_info
2863 vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
2865 gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
2866 gcc_assert (group1_size > 0);
2867 int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
2868 gcc_assert (group2_size > 0);
2869 DR_GROUP_SIZE (first_vinfo) = group1_size;
2871 stmt_vec_info stmt_info = first_vinfo;
2872 for (unsigned i = group1_size; i > 1; i--)
2874 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
2875 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2877 /* STMT is now the last element of the first group. */
2878 stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
2879 DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
2881 DR_GROUP_SIZE (group2) = group2_size;
2882 for (stmt_info = group2; stmt_info;
2883 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
2885 DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
2886 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2889 /* For the second group, the DR_GROUP_GAP is that before the original group,
2890 plus skipping over the first vector. */
2891 DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
2893 /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
2894 DR_GROUP_GAP (first_vinfo) += group2_size;
2896 if (dump_enabled_p ())
2897 dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
2898 group1_size, group2_size);
2900 return group2;
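/* A purely illustrative example of the splitting above: a store group of
   size 6 with DR_GROUP_GAP 0, split at GROUP1_SIZE == 4, becomes a first
   group of size 4 with gap 0 + 2 == 2 (skipping the second group) and a
   second group of size 2 with gap 0 + 4 == 4 (skipping the first group),
   so both groups still cover a stride of 6 scalar elements.  */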
2903 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
2904 statements and a vector of NUNITS elements. */
2906 static poly_uint64
2907 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
2909 return exact_div (common_multiple (nunits, group_size), group_size);
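/* For instance (purely illustrative), with nunits == 4 and a group of
   6 stmts, common_multiple (4, 6) == 12 and the unrolling factor is
   12 / 6 == 2, i.e. two copies of the group are needed to fill whole
   vectors.  */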
2912 /* Helper that checks to see if a node is a load node. */
2914 static inline bool
2915 vect_is_slp_load_node (slp_tree root)
2917 return SLP_TREE_DEF_TYPE (root) == vect_internal_def
2918 && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_REPRESENTATIVE (root))
2919 && DR_IS_READ (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (root)));
2923 /* Helper function of optimize_load_redistribution that performs the operation
2924 recursively. */
2926 static slp_tree
2927 optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t *bst_map,
2928 vec_info *vinfo, unsigned int group_size,
2929 hash_map<slp_tree, slp_tree> *load_map,
2930 slp_tree root)
2932 if (slp_tree *leader = load_map->get (root))
2933 return *leader;
2935 slp_tree node;
2936 unsigned i;
2938 /* For now, we don't know anything about externals so do not do anything. */
2939 if (!root || SLP_TREE_DEF_TYPE (root) != vect_internal_def)
2940 return NULL;
2941 else if (SLP_TREE_CODE (root) == VEC_PERM_EXPR)
2943 /* First convert this node into a load node and add it to the leaves
2944 list and flatten the permute from a lane to a load one. If it's
2945 unneeded it will be elided later. */
2946 vec<stmt_vec_info> stmts;
2947 stmts.create (SLP_TREE_LANES (root));
2948 lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION (root);
2949 for (unsigned j = 0; j < lane_perm.length (); j++)
2951 std::pair<unsigned, unsigned> perm = lane_perm[j];
2952 node = SLP_TREE_CHILDREN (root)[perm.first];
2954 if (!vect_is_slp_load_node (node)
2955 || SLP_TREE_CHILDREN (node).exists ())
2957 stmts.release ();
2958 goto next;
2961 stmts.quick_push (SLP_TREE_SCALAR_STMTS (node)[perm.second]);
2964 if (dump_enabled_p ())
2965 dump_printf_loc (MSG_NOTE, vect_location,
2966 "converting stmts on permute node %p\n",
2967 (void *) root);
2969 bool *matches = XALLOCAVEC (bool, group_size);
2970 poly_uint64 max_nunits = 1;
2971 unsigned tree_size = 0, limit = 1;
2972 node = vect_build_slp_tree (vinfo, stmts, group_size, &max_nunits,
2973 matches, &limit, &tree_size, bst_map);
2974 if (!node)
2975 stmts.release ();
2977 load_map->put (root, node);
2978 return node;
2981 next:
2982 load_map->put (root, NULL);
2984 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
2986 slp_tree value
2987 = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
2988 node);
2989 if (value)
2991 SLP_TREE_REF_COUNT (value)++;
2992 SLP_TREE_CHILDREN (root)[i] = value;
2993 /* ??? We know the original leaves of the replaced nodes will
2994 be referenced by bst_map, only the permutes created by
2995 pattern matching are not. */
2996 if (SLP_TREE_REF_COUNT (node) == 1)
2997 load_map->remove (node);
2998 vect_free_slp_tree (node);
3002 return NULL;
3005 /* Temporary workaround for loads not being CSEd during SLP build. This
3006 function will traverse the SLP tree rooted in ROOT for INSTANCE and find
3007 VEC_PERM nodes that blend vectors from multiple nodes that all read from the
3008 same DR such that the final operation is equal to a permuted load. Such
3009 NODES are then directly converted into LOADS themselves. The nodes are
3010 CSEd using BST_MAP. */
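/* A purely illustrative example: a VEC_PERM node blending the two-lane
   loads { a[0], a[2] } and { a[1], a[3] } with lane permutation
   { (0,0) (1,0) (0,1) (1,1) } reads the four consecutive elements
   a[0], a[1], a[2], a[3] from the same DR and is therefore rebuilt as a
   single four-lane load node.  */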
3012 static void
3013 optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t *bst_map,
3014 vec_info *vinfo, unsigned int group_size,
3015 hash_map<slp_tree, slp_tree> *load_map,
3016 slp_tree root)
3018 slp_tree node;
3019 unsigned i;
3021 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
3023 slp_tree value
3024 = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
3025 node);
3026 if (value)
3028 SLP_TREE_REF_COUNT (value)++;
3029 SLP_TREE_CHILDREN (root)[i] = value;
3030 /* ??? We know the original leaves of the replaced nodes will
3031 be referenced by bst_map, only the permutes created by
3032 pattern matching are not. */
3033 if (SLP_TREE_REF_COUNT (node) == 1)
3034 load_map->remove (node);
3035 vect_free_slp_tree (node);
3040 /* Helper function of vect_match_slp_patterns.
3042 Attempts to match patterns against the slp tree rooted in REF_NODE using
3043 VINFO. Patterns are matched in post-order traversal.
3045 If matching is successful the value in REF_NODE is updated in place.
3046 Returns true if any pattern was matched in the tree. */
3048 static bool
3049 vect_match_slp_patterns_2 (slp_tree *ref_node, vec_info *vinfo,
3050 slp_tree_to_load_perm_map_t *perm_cache,
3051 slp_compat_nodes_map_t *compat_cache,
3052 hash_set<slp_tree> *visited)
3054 unsigned i;
3055 slp_tree node = *ref_node;
3056 bool found_p = false;
3057 if (!node || visited->add (node))
3058 return false;
3060 slp_tree child;
3061 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3062 found_p |= vect_match_slp_patterns_2 (&SLP_TREE_CHILDREN (node)[i],
3063 vinfo, perm_cache, compat_cache,
3064 visited);
3066 for (unsigned x = 0; x < num__slp_patterns; x++)
3068 vect_pattern *pattern
3069 = slp_patterns[x] (perm_cache, compat_cache, ref_node);
3070 if (pattern)
3072 pattern->build (vinfo);
3073 delete pattern;
3074 found_p = true;
3078 return found_p;
3081 /* Applies pattern matching to the given SLP tree rooted in REF_NODE using
3082 vec_info VINFO.
3084 The tree is modified in place; returns true if any pattern matched.
3085 Patterns are tried in order and multiple patterns may match. */
3087 static bool
3088 vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
3089 hash_set<slp_tree> *visited,
3090 slp_tree_to_load_perm_map_t *perm_cache,
3091 slp_compat_nodes_map_t *compat_cache)
3093 DUMP_VECT_SCOPE ("vect_match_slp_patterns");
3094 slp_tree *ref_node = &SLP_INSTANCE_TREE (instance);
3096 if (dump_enabled_p ())
3097 dump_printf_loc (MSG_NOTE, vect_location,
3098 "Analyzing SLP tree %p for patterns\n",
3099 (void *) SLP_INSTANCE_TREE (instance));
3101 return vect_match_slp_patterns_2 (ref_node, vinfo, perm_cache, compat_cache,
3102 visited);
3105 /* STMT_INFO is a store group of size GROUP_SIZE that we are considering
3106 splitting into two, with the first split group having size NEW_GROUP_SIZE.
3107 Return true if we could use IFN_STORE_LANES instead and if that appears
3108 to be the better approach. */
3110 static bool
3111 vect_slp_prefer_store_lanes_p (vec_info *vinfo, stmt_vec_info stmt_info,
3112 unsigned int group_size,
3113 unsigned int new_group_size)
3115 tree scalar_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
3116 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
3117 if (!vectype)
3118 return false;
3119 /* Allow the split if one of the two new groups would operate on full
3120 vectors *within* rather than across one scalar loop iteration.
3121 This is purely a heuristic, but it should work well for group
3122 sizes of 3 and 4, where the possible splits are:
3124 3->2+1: OK if the vector has exactly two elements
3125 4->2+2: Likewise
3126 4->3+1: Less clear-cut. */
3127 if (multiple_p (group_size - new_group_size, TYPE_VECTOR_SUBPARTS (vectype))
3128 || multiple_p (new_group_size, TYPE_VECTOR_SUBPARTS (vectype)))
3129 return false;
3130 return vect_store_lanes_supported (vectype, group_size, false) != IFN_LAST;
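/* Purely illustrative examples of the heuristic above, assuming a
   two-element vector type: splitting a group of 3 at NEW_GROUP_SIZE == 2
   leaves sub-groups of 2 and 1; 2 is a multiple of the vector length so
   the split is allowed (return false).  Splitting a group of 4 at 3
   leaves 3 and 1, neither a multiple of 2, so IFN_STORE_LANES is
   preferred whenever the target supports it for this group size.  */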
3133 /* Analyze an SLP instance starting from a group of grouped stores. Call
3134 vect_build_slp_tree to build a tree of packed stmts if possible.
3135 Return FALSE if it's impossible to SLP any stmt in the loop. */
3137 static bool
3138 vect_analyze_slp_instance (vec_info *vinfo,
3139 scalar_stmts_to_slp_tree_map_t *bst_map,
3140 stmt_vec_info stmt_info, slp_instance_kind kind,
3141 unsigned max_tree_size, unsigned *limit);
3143 /* Analyze an SLP instance starting from SCALAR_STMTS which are a group
3144 of KIND. Return true if successful. */
3146 static bool
3147 vect_build_slp_instance (vec_info *vinfo,
3148 slp_instance_kind kind,
3149 vec<stmt_vec_info> &scalar_stmts,
3150 vec<stmt_vec_info> &root_stmt_infos,
3151 vec<tree> &remain,
3152 unsigned max_tree_size, unsigned *limit,
3153 scalar_stmts_to_slp_tree_map_t *bst_map,
3154 /* ??? We need stmt_info for group splitting. */
3155 stmt_vec_info stmt_info_)
3157 if (kind == slp_inst_kind_ctor)
3159 if (dump_enabled_p ())
3160 dump_printf_loc (MSG_NOTE, vect_location,
3161 "Analyzing vectorizable constructor: %G\n",
3162 root_stmt_infos[0]->stmt);
3165 if (dump_enabled_p ())
3167 dump_printf_loc (MSG_NOTE, vect_location,
3168 "Starting SLP discovery for\n");
3169 for (unsigned i = 0; i < scalar_stmts.length (); ++i)
3170 dump_printf_loc (MSG_NOTE, vect_location,
3171 " %G", scalar_stmts[i]->stmt);
3174 /* When a BB reduction doesn't have an even number of lanes
3175 strip it down, treating the remaining lane as scalar.
3176 ??? Selecting the optimal set of lanes to vectorize would be nice
3177 but SLP build for all lanes will fail quickly because we think
3178 we're going to need unrolling. */
3179 if (kind == slp_inst_kind_bb_reduc
3180 && (scalar_stmts.length () & 1))
3181 remain.safe_insert (0, gimple_get_lhs (scalar_stmts.pop ()->stmt));
3183 /* Build the tree for the SLP instance. */
3184 unsigned int group_size = scalar_stmts.length ();
3185 bool *matches = XALLOCAVEC (bool, group_size);
3186 poly_uint64 max_nunits = 1;
3187 unsigned tree_size = 0;
3188 unsigned i;
3189 slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
3190 &max_nunits, matches, limit,
3191 &tree_size, bst_map);
3192 if (node != NULL)
3194 /* Calculate the unrolling factor based on the smallest type. */
3195 poly_uint64 unrolling_factor
3196 = calculate_unrolling_factor (max_nunits, group_size);
3198 if (maybe_ne (unrolling_factor, 1U)
3199 && is_a <bb_vec_info> (vinfo))
3201 unsigned HOST_WIDE_INT const_max_nunits;
3202 if (!max_nunits.is_constant (&const_max_nunits)
3203 || const_max_nunits > group_size)
3205 if (dump_enabled_p ())
3206 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3207 "Build SLP failed: store group "
3208 "size not a multiple of the vector size "
3209 "in basic block SLP\n");
3210 vect_free_slp_tree (node);
3211 return false;
3213 /* Fatal mismatch. */
3214 if (dump_enabled_p ())
3215 dump_printf_loc (MSG_NOTE, vect_location,
3216 "SLP discovery succeeded but node needs "
3217 "splitting\n");
3218 memset (matches, true, group_size);
3219 matches[group_size / const_max_nunits * const_max_nunits] = false;
3220 vect_free_slp_tree (node);
3222 else
3224 /* Create a new SLP instance. */
3225 slp_instance new_instance = XNEW (class _slp_instance);
3226 SLP_INSTANCE_TREE (new_instance) = node;
3227 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
3228 SLP_INSTANCE_LOADS (new_instance) = vNULL;
3229 SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
3230 SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
3231 SLP_INSTANCE_KIND (new_instance) = kind;
3232 new_instance->reduc_phis = NULL;
3233 new_instance->cost_vec = vNULL;
3234 new_instance->subgraph_entries = vNULL;
3236 if (dump_enabled_p ())
3237 dump_printf_loc (MSG_NOTE, vect_location,
3238 "SLP size %u vs. limit %u.\n",
3239 tree_size, max_tree_size);
3241 /* Fixup SLP reduction chains. */
3242 if (kind == slp_inst_kind_reduc_chain)
3244 /* If this is a reduction chain with a conversion in front
3245 amend the SLP tree with a node for that. */
3246 gimple *scalar_def
3247 = vect_orig_stmt (scalar_stmts[group_size - 1])->stmt;
3248 if (STMT_VINFO_DEF_TYPE (scalar_stmts[0]) != vect_reduction_def)
3250 /* Get at the conversion stmt - we know it's the single use
3251 of the last stmt of the reduction chain. */
3252 use_operand_p use_p;
3253 bool r = single_imm_use (gimple_assign_lhs (scalar_def),
3254 &use_p, &scalar_def);
3255 gcc_assert (r);
3256 stmt_vec_info next_info = vinfo->lookup_stmt (scalar_def);
3257 next_info = vect_stmt_to_vectorize (next_info);
3258 scalar_stmts = vNULL;
3259 scalar_stmts.create (group_size);
3260 for (unsigned i = 0; i < group_size; ++i)
3261 scalar_stmts.quick_push (next_info);
3262 slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
3263 SLP_TREE_VECTYPE (conv) = STMT_VINFO_VECTYPE (next_info);
3264 SLP_TREE_CHILDREN (conv).quick_push (node);
3265 SLP_INSTANCE_TREE (new_instance) = conv;
3266 /* We also have to fake this conversion stmt as SLP reduction
3267 group so we don't have to mess with too much code
3268 elsewhere. */
3269 REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
3270 REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
3272 /* Fill the backedge child of the PHI SLP node. The
3273 general matching code cannot find it because the
3274 scalar code does not reflect how we vectorize the
3275 reduction. */
3276 use_operand_p use_p;
3277 imm_use_iterator imm_iter;
3278 class loop *loop = LOOP_VINFO_LOOP (as_a <loop_vec_info> (vinfo));
3279 FOR_EACH_IMM_USE_FAST (use_p, imm_iter,
3280 gimple_get_lhs (scalar_def))
3281 /* There are exactly two non-debug uses, the reduction
3282 PHI and the loop-closed PHI node. */
3283 if (!is_gimple_debug (USE_STMT (use_p))
3284 && gimple_bb (USE_STMT (use_p)) == loop->header)
3286 auto_vec<stmt_vec_info, 64> phis (group_size);
3287 stmt_vec_info phi_info
3288 = vinfo->lookup_stmt (USE_STMT (use_p));
3289 for (unsigned i = 0; i < group_size; ++i)
3290 phis.quick_push (phi_info);
3291 slp_tree *phi_node = bst_map->get (phis);
3292 unsigned dest_idx = loop_latch_edge (loop)->dest_idx;
3293 SLP_TREE_CHILDREN (*phi_node)[dest_idx]
3294 = SLP_INSTANCE_TREE (new_instance);
3295 SLP_INSTANCE_TREE (new_instance)->refcnt++;
3299 vinfo->slp_instances.safe_push (new_instance);
3301 /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
3302 the number of scalar stmts in the root in a few places.
3303 Verify that assumption holds. */
3304 gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
3305 .length () == group_size);
3307 if (dump_enabled_p ())
3309 dump_printf_loc (MSG_NOTE, vect_location,
3310 "Final SLP tree for instance %p:\n",
3311 (void *) new_instance);
3312 vect_print_slp_graph (MSG_NOTE, vect_location,
3313 SLP_INSTANCE_TREE (new_instance));
3316 return true;
3319 else
3321 /* Failed to SLP. */
3322 /* Free the allocated memory. */
3323 scalar_stmts.release ();
3326 stmt_vec_info stmt_info = stmt_info_;
3327 /* Try to break the group up into pieces. */
3328 if (kind == slp_inst_kind_store)
3330 /* ??? We could delay all the actual splitting of store-groups
3331 until after SLP discovery of the original group completed.
3332 Then we can recurse to vect_build_slp_instance directly. */
3333 for (i = 0; i < group_size; i++)
3334 if (!matches[i])
3335 break;
3337 /* For basic block SLP, try to break the group up into multiples of
3338 a vector size. */
3339 if (is_a <bb_vec_info> (vinfo)
3340 && (i > 1 && i < group_size))
3342 tree scalar_type
3343 = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
3344 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
3345 1 << floor_log2 (i));
3346 unsigned HOST_WIDE_INT const_nunits;
3347 if (vectype
3348 && TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits))
3350 /* Split into two groups at the first vector boundary. */
3351 gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
3352 unsigned group1_size = i & ~(const_nunits - 1);
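/* E.g. (illustrative) with const_nunits == 4 and i == 6 this yields
   group1_size == 4, i.e. I rounded down to a multiple of the vector
   length.  */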
3354 if (dump_enabled_p ())
3355 dump_printf_loc (MSG_NOTE, vect_location,
3356 "Splitting SLP group at stmt %u\n", i);
3357 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
3358 group1_size);
3359 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
3360 kind, max_tree_size,
3361 limit);
3362 /* Split the rest at the failure point and possibly
3363 re-analyze the remaining matching part if it has
3364 at least two lanes. */
3365 if (group1_size < i
3366 && (i + 1 < group_size
3367 || i - group1_size > 1))
3369 stmt_vec_info rest2 = rest;
3370 rest = vect_split_slp_store_group (rest, i - group1_size);
3371 if (i - group1_size > 1)
3372 res |= vect_analyze_slp_instance (vinfo, bst_map, rest2,
3373 kind, max_tree_size,
3374 limit);
3376 /* Re-analyze the non-matching tail if it has at least
3377 two lanes. */
3378 if (i + 1 < group_size)
3379 res |= vect_analyze_slp_instance (vinfo, bst_map,
3380 rest, kind, max_tree_size,
3381 limit);
3382 return res;
3386 /* For loop vectorization split into arbitrary pieces of size > 1. */
3387 if (is_a <loop_vec_info> (vinfo)
3388 && (i > 1 && i < group_size)
3389 && !vect_slp_prefer_store_lanes_p (vinfo, stmt_info, group_size, i))
3391 unsigned group1_size = i;
3393 if (dump_enabled_p ())
3394 dump_printf_loc (MSG_NOTE, vect_location,
3395 "Splitting SLP group at stmt %u\n", i);
3397 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
3398 group1_size);
3399 /* Loop vectorization cannot handle gaps in stores; make sure
3400 the split group appears as strided. */
3401 STMT_VINFO_STRIDED_P (rest) = 1;
3402 DR_GROUP_GAP (rest) = 0;
3403 STMT_VINFO_STRIDED_P (stmt_info) = 1;
3404 DR_GROUP_GAP (stmt_info) = 0;
3406 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
3407 kind, max_tree_size, limit);
3408 if (i + 1 < group_size)
3409 res |= vect_analyze_slp_instance (vinfo, bst_map,
3410 rest, kind, max_tree_size, limit);
3412 return res;
3415 /* Even though the first vector did not all match, we might be able to SLP
3416 (some) of the remainder. FORNOW ignore this possibility. */
3419 /* Failed to SLP. */
3420 if (dump_enabled_p ())
3421 dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
3422 return false;
3426 /* Analyze an SLP instance starting from a group of grouped stores. Call
3427 vect_build_slp_tree to build a tree of packed stmts if possible.
3428 Return FALSE if it's impossible to SLP any stmt in the loop. */
3430 static bool
3431 vect_analyze_slp_instance (vec_info *vinfo,
3432 scalar_stmts_to_slp_tree_map_t *bst_map,
3433 stmt_vec_info stmt_info,
3434 slp_instance_kind kind,
3435 unsigned max_tree_size, unsigned *limit)
3437 unsigned int i;
3438 vec<stmt_vec_info> scalar_stmts;
3440 if (is_a <bb_vec_info> (vinfo))
3441 vect_location = stmt_info->stmt;
3443 stmt_vec_info next_info = stmt_info;
3444 if (kind == slp_inst_kind_store)
3446 /* Collect the stores and store them in scalar_stmts. */
3447 scalar_stmts.create (DR_GROUP_SIZE (stmt_info));
3448 while (next_info)
3450 scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
3451 next_info = DR_GROUP_NEXT_ELEMENT (next_info);
3454 else if (kind == slp_inst_kind_reduc_chain)
3456 /* Collect the reduction stmts and store them in scalar_stmts. */
3457 scalar_stmts.create (REDUC_GROUP_SIZE (stmt_info));
3458 while (next_info)
3460 scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
3461 next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
3463 /* Mark the first element of the reduction chain as reduction to properly
3464 transform the node. In the reduction analysis phase only the last
3465 element of the chain is marked as reduction. */
3466 STMT_VINFO_DEF_TYPE (stmt_info)
3467 = STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
3468 STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
3469 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
3471 else if (kind == slp_inst_kind_reduc_group)
3473 /* Collect reduction statements. */
3474 const vec<stmt_vec_info> &reductions
3475 = as_a <loop_vec_info> (vinfo)->reductions;
3476 scalar_stmts.create (reductions.length ());
3477 for (i = 0; reductions.iterate (i, &next_info); i++)
3478 if ((STMT_VINFO_RELEVANT_P (next_info)
3479 || STMT_VINFO_LIVE_P (next_info))
3480 /* ??? Make sure we didn't skip a conversion around a reduction
3481 path. In that case we'd have to reverse engineer that conversion
3482 stmt following the chain using reduc_idx and from the PHI
3483 using reduc_def. */
3484 && STMT_VINFO_DEF_TYPE (next_info) == vect_reduction_def)
3485 scalar_stmts.quick_push (next_info);
3486 /* If fewer than two were relevant/live there's nothing to SLP. */
3487 if (scalar_stmts.length () < 2)
3488 return false;
3490 else
3491 gcc_unreachable ();
3493 vec<stmt_vec_info> roots = vNULL;
3494 vec<tree> remain = vNULL;
3495 /* Build the tree for the SLP instance. */
3496 bool res = vect_build_slp_instance (vinfo, kind, scalar_stmts,
3497 roots, remain,
3498 max_tree_size, limit, bst_map,
3499 kind == slp_inst_kind_store
3500 ? stmt_info : NULL);
3502 /* ??? If this is slp_inst_kind_store and the above succeeded here's
3503 where we should do store group splitting. */
3505 return res;
3508 /* Check if there are stmts in the loop that can be vectorized using SLP. Build SLP
3509 trees of packed scalar stmts if SLP is possible. */
3511 opt_result
3512 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
3514 unsigned int i;
3515 stmt_vec_info first_element;
3516 slp_instance instance;
3518 DUMP_VECT_SCOPE ("vect_analyze_slp");
3520 unsigned limit = max_tree_size;
3522 scalar_stmts_to_slp_tree_map_t *bst_map
3523 = new scalar_stmts_to_slp_tree_map_t ();
3525 /* Find SLP sequences starting from groups of grouped stores. */
3526 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
3527 vect_analyze_slp_instance (vinfo, bst_map, first_element,
3528 slp_inst_kind_store, max_tree_size, &limit);
3530 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
3532 for (unsigned i = 0; i < bb_vinfo->roots.length (); ++i)
3534 vect_location = bb_vinfo->roots[i].roots[0]->stmt;
3535 if (vect_build_slp_instance (bb_vinfo, bb_vinfo->roots[i].kind,
3536 bb_vinfo->roots[i].stmts,
3537 bb_vinfo->roots[i].roots,
3538 bb_vinfo->roots[i].remain,
3539 max_tree_size, &limit, bst_map, NULL))
3541 bb_vinfo->roots[i].stmts = vNULL;
3542 bb_vinfo->roots[i].roots = vNULL;
3543 bb_vinfo->roots[i].remain = vNULL;
3548 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3550 /* Find SLP sequences starting from reduction chains. */
3551 FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
3552 if (! STMT_VINFO_RELEVANT_P (first_element)
3553 && ! STMT_VINFO_LIVE_P (first_element))
3555 else if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
3556 slp_inst_kind_reduc_chain,
3557 max_tree_size, &limit))
3559 /* Dissolve reduction chain group. */
3560 stmt_vec_info vinfo = first_element;
3561 stmt_vec_info last = NULL;
3562 while (vinfo)
3564 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
3565 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
3566 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
3567 last = vinfo;
3568 vinfo = next;
3570 STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
3571 /* It can still be vectorized as part of an SLP reduction. */
3572 loop_vinfo->reductions.safe_push (last);
3575 /* Find SLP sequences starting from groups of reductions. */
3576 if (loop_vinfo->reductions.length () > 1)
3577 vect_analyze_slp_instance (vinfo, bst_map, loop_vinfo->reductions[0],
3578 slp_inst_kind_reduc_group, max_tree_size,
3579 &limit);
3582 hash_set<slp_tree> visited_patterns;
3583 slp_tree_to_load_perm_map_t perm_cache;
3584 slp_compat_nodes_map_t compat_cache;
3586 /* See if any patterns can be found in the SLP tree. */
3587 bool pattern_found = false;
3588 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3589 pattern_found |= vect_match_slp_patterns (instance, vinfo,
3590 &visited_patterns, &perm_cache,
3591 &compat_cache);
3593 /* If any were found optimize permutations of loads. */
3594 if (pattern_found)
3596 hash_map<slp_tree, slp_tree> load_map;
3597 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3599 slp_tree root = SLP_INSTANCE_TREE (instance);
3600 optimize_load_redistribution (bst_map, vinfo, SLP_TREE_LANES (root),
3601 &load_map, root);
3607 /* The map keeps a reference on SLP nodes built; release that. */
3608 for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
3609 it != bst_map->end (); ++it)
3610 if ((*it).second)
3611 vect_free_slp_tree ((*it).second);
3612 delete bst_map;
3614 if (pattern_found && dump_enabled_p ())
3616 dump_printf_loc (MSG_NOTE, vect_location,
3617 "Pattern matched SLP tree\n");
3618 hash_set<slp_tree> visited;
3619 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3620 vect_print_slp_graph (MSG_NOTE, vect_location,
3621 SLP_INSTANCE_TREE (instance), visited);
3624 return opt_result::success ();
3627 /* Estimates the cost of inserting layout changes into the SLP graph.
3628 It can also say that the insertion is impossible. */
3630 struct slpg_layout_cost
3632 slpg_layout_cost () = default;
3633 slpg_layout_cost (sreal, bool);
3635 static slpg_layout_cost impossible () { return { sreal::max (), 0 }; }
3636 bool is_possible () const { return depth != sreal::max (); }
3638 bool operator== (const slpg_layout_cost &) const;
3639 bool operator!= (const slpg_layout_cost &) const;
3641 bool is_better_than (const slpg_layout_cost &, bool) const;
3643 void add_parallel_cost (const slpg_layout_cost &);
3644 void add_serial_cost (const slpg_layout_cost &);
3645 void split (unsigned int);
3647 /* The longest sequence of layout changes needed during any traversal
3648 of the partition dag, weighted by execution frequency.
3650 This is the most important metric when optimizing for speed, since
3651 it helps to ensure that we keep the number of operations on
3652 critical paths to a minimum. */
3653 sreal depth = 0;
3655 /* An estimate of the total number of operations needed. It is weighted by
3656 execution frequency when optimizing for speed but not when optimizing for
3657 size. In order to avoid double-counting, a node with a fanout of N will
3658 distribute 1/N of its total cost to each successor.
3660 This is the most important metric when optimizing for size, since
3661 it helps to keep the total number of operations to a minimum. */
3662 sreal total = 0;
3665 /* Construct costs for a node with weight WEIGHT. A higher weight
3666 indicates more frequent execution. IS_FOR_SIZE is true if we are
3667 optimizing for size rather than speed. */
3669 slpg_layout_cost::slpg_layout_cost (sreal weight, bool is_for_size)
3670 : depth (weight), total (is_for_size && weight > 0 ? 1 : weight)
3674 bool
3675 slpg_layout_cost::operator== (const slpg_layout_cost &other) const
3677 return depth == other.depth && total == other.total;
3680 bool
3681 slpg_layout_cost::operator!= (const slpg_layout_cost &other) const
3683 return !operator== (other);
3686 /* Return true if these costs are better than OTHER. IS_FOR_SIZE is
3687 true if we are optimizing for size rather than speed. */
3689 bool
3690 slpg_layout_cost::is_better_than (const slpg_layout_cost &other,
3691 bool is_for_size) const
3693 if (is_for_size)
3695 if (total != other.total)
3696 return total < other.total;
3697 return depth < other.depth;
3699 else
3701 if (depth != other.depth)
3702 return depth < other.depth;
3703 return total < other.total;
3707 /* Increase the costs to account for something with cost INPUT_COST
3708 happening in parallel with the current costs. */
3710 void
3711 slpg_layout_cost::add_parallel_cost (const slpg_layout_cost &input_cost)
3713 depth = std::max (depth, input_cost.depth);
3714 total += input_cost.total;
3717 /* Increase the costs to account for something with cost INPUT_COST
3718 happening in series with the current costs. */
3720 void
3721 slpg_layout_cost::add_serial_cost (const slpg_layout_cost &other)
3723 depth += other.depth;
3724 total += other.total;
3727 /* Split the total cost among TIMES successors or predecessors. */
3729 void
3730 slpg_layout_cost::split (unsigned int times)
3732 if (times > 1)
3733 total /= times;
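
/* As a hypothetical worked example of how these operations compose (not
   taken from any particular graph): combining { depth 2, total 3 } with a
   parallel input of { depth 1, total 1 } gives { depth 2, total 4 }, since
   overlapping depths do not add but operation counts do.  Feeding that
   through a serial step of { depth 1, total 1 } gives { depth 3, total 5 },
   and split (2) then charges { depth 3, total 2.5 } to each of two
   consumers, so that the total is not double-counted.  */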
3736 /* Information about one node in the SLP graph, for use during
3737 vect_optimize_slp_pass. */
3739 struct slpg_vertex
3741 slpg_vertex (slp_tree node_) : node (node_) {}
3743 /* The node itself. */
3744 slp_tree node;
3746 /* Which partition the node belongs to, or -1 if none. Nodes outside of
3747 partitions are flexible; they can have whichever layout consumers
3748 want them to have. */
3749 int partition = -1;
3751 /* The number of nodes that directly use the result of this one
3752 (i.e. the number of nodes that count this one as a child). */
3753 unsigned int out_degree = 0;
3755 /* The execution frequency of the node. */
3756 sreal weight = 0;
3758 /* The total execution frequency of all nodes that directly use the
3759 result of this one. */
3760 sreal out_weight = 0;
3763 /* Information about one partition of the SLP graph, for use during
3764 vect_optimize_slp_pass. */
3766 struct slpg_partition_info
3768 /* The nodes in the partition occupy indices [NODE_BEGIN, NODE_END)
3769 of m_partitioned_nodes. */
3770 unsigned int node_begin = 0;
3771 unsigned int node_end = 0;
3773 /* Which layout we've chosen to use for this partition, or -1 if
3774 we haven't picked one yet. */
3775 int layout = -1;
3777 /* The number of predecessors and successors in the partition dag.
3778 The predecessors always have lower partition numbers and the
3779 successors always have higher partition numbers.
3781 Note that the directions of these edges are not necessarily the
3782 same as in the data flow graph. For example, if an SCC has separate
3783 partitions for an inner loop and an outer loop, the inner loop's
3784 partition will have at least two incoming edges from the outer loop's
3785 partition: one for a live-in value and one for a live-out value.
3786 In data flow terms, one of these edges would also be from the outer loop
3787 to the inner loop, but the other would be in the opposite direction. */
3788 unsigned int in_degree = 0;
3789 unsigned int out_degree = 0;
3792 /* Information about the costs of using a particular layout for a
3793 particular partition. It can also say that the combination is
3794 impossible. */
3796 struct slpg_partition_layout_costs
3798 bool is_possible () const { return internal_cost.is_possible (); }
3799 void mark_impossible () { internal_cost = slpg_layout_cost::impossible (); }
3801 /* The costs inherited from predecessor partitions. */
3802 slpg_layout_cost in_cost;
3804 /* The inherent cost of the layout within the node itself. For example,
3805 this is nonzero for a load if choosing a particular layout would require
3806 the load to permute the loaded elements. It is nonzero for a
3807 VEC_PERM_EXPR if the permutation cannot be eliminated or converted
3808 to full-vector moves. */
3809 slpg_layout_cost internal_cost;
3811 /* The costs inherited from successor partitions. */
3812 slpg_layout_cost out_cost;
3815 /* This class tries to optimize the layout of vectors in order to avoid
3816 unnecessary shuffling. At the moment, the set of possible layouts is
3817 restricted to bijective permutations.
3819 The goal of the pass depends on whether we're optimizing for size or
3820 for speed. When optimizing for size, the goal is to reduce the overall
3821 number of layout changes (including layout changes implied by things
3822 like load permutations). When optimizing for speed, the goal is to
3823 reduce the maximum latency attributable to layout changes on any
3824 non-cyclical path through the data flow graph.
3826 For example, when optimizing a loop nest for speed, we will prefer
3827 to make layout changes outside of a loop rather than inside of a loop,
3828 and will prefer to make layout changes in parallel rather than serially,
3829 even if that increases the overall number of layout changes.
3831 The high-level procedure is:
3833 (1) Build a graph in which edges go from uses (parents) to definitions
3834 (children).
3836 (2) Divide the graph into a dag of strongly-connected components (SCCs).
3838 (3) When optimizing for speed, partition the nodes in each SCC based
3839 on their containing cfg loop. When optimizing for size, treat
3840 each SCC as a single partition.
3842 This gives us a dag of partitions. The goal is now to assign a
3843 layout to each partition.
3845 (4) Construct a set of vector layouts that are worth considering.
3846 Record which nodes must keep their current layout.
3848 (5) Perform a forward walk over the partition dag (from loads to stores)
3849 accumulating the "forward" cost of using each layout. When visiting
3850 each partition, assign a tentative choice of layout to the partition
3851 and use that choice when calculating the cost of using a different
3852 layout in successor partitions.
3854 (6) Perform a backward walk over the partition dag (from stores to loads),
3855 accumulating the "backward" cost of using each layout. When visiting
3856 each partition, make a final choice of layout for that partition based
3857 on the accumulated forward costs (from (5)) and backward costs
3858 (from (6)).
3860 (7) Apply the chosen layouts to the SLP graph.
3862 For example, consider the SLP statements:
3864 S1: a_1 = load
3865 loop:
3866 S2: a_2 = PHI<a_1, a_3>
3867 S3: b_1 = load
3868 S4: a_3 = a_2 + b_1
3869 exit:
3870 S5: a_4 = PHI<a_3>
3871 S6: store a_4
3873 S2 and S4 form an SCC and are part of the same loop. Every other
3874 statement is in a singleton SCC. In this example there is a one-to-one
3875 mapping between SCCs and partitions and the partition dag looks like this:
3877 S1 S3
3879 S2+S4
3885 S2, S3 and S4 will have a higher execution frequency than the other
3886 statements, so when optimizing for speed, the goal is to avoid any
3887 layout changes:
3889 - within S3
3890 - within S2+S4
3891 - on the S3->S2+S4 edge
3893 For example, if S3 was originally a reversing load, the goal of the
3894 pass is to make it an unreversed load and change the layout on the
3895 S1->S2+S4 and S2+S4->S5 edges to compensate. (Changing the layout
3896 on S1->S2+S4 and S5->S6 would also be acceptable.)
3898 The difference between SCCs and partitions becomes important if we
3899 add an outer loop:
3901 S1: a_1 = ...
3902 loop1:
3903 S2: a_2 = PHI<a_1, a_6>
3904 S3: b_1 = load
3905 S4: a_3 = a_2 + b_1
3906 loop2:
3907 S5: a_4 = PHI<a_3, a_5>
3908 S6: c_1 = load
3909 S7: a_5 = a_4 + c_1
3910 exit2:
3911 S8: a_6 = PHI<a_5>
3912 S9: store a_6
3913 exit1:
3915 Here, S2, S4, S5, S7 and S8 form a single SCC. However, when optimizing
3916 for speed, we usually do not want restrictions in the outer loop to "infect"
3917 the decision for the inner loop. For example, if an outer-loop node
3918 in the SCC contains a statement with a fixed layout, that should not
3919 prevent the inner loop from using a different layout. Conversely,
3920 the inner loop should not dictate a layout to the outer loop: if the
3921 outer loop does a lot of computation, then it may not be efficient to
3922 do all of that computation in the inner loop's preferred layout.
3924 So when optimizing for speed, we partition the SCC into S2+S4+S8 (outer)
3925 and S5+S7 (inner). We also try to arrange partitions so that:
3927 - the partition for an outer loop comes before the partition for
3928 an inner loop
3930 - if a sibling loop A dominates a sibling loop B, A's partition
3931 comes before B's
3933 This gives the following partition dag for the example above:
3935 S1 S3
3937 S2+S4+S8 S6
3938 | \\ /
3939 | S5+S7
3943 There are two edges from S2+S4+S8 to S5+S7: one for the edge S4->S5 and
3944 one for a reversal of the edge S7->S8.
3946 The backward walk picks a layout for S5+S7 before S2+S4+S8. The choice
3947 for S2+S4+S8 therefore has to balance the cost of using the outer loop's
3948 preferred layout against the cost of changing the layout on entry to the
3949 inner loop (S4->S5) and on exit from the inner loop (S7->S8 reversed).
3951 Although this works well when optimizing for speed, it has the downside
3952 when optimizing for size that the choice of layout for S5+S7 is completely
3953 independent of S9, which lessens the chance of reducing the overall number
3954 of permutations. We therefore do not partition SCCs when optimizing
3955 for size.
3957 To give a concrete example of the difference between optimizing
3958 for size and speed, consider:
3960 a[0] = (b[1] << c[3]) - d[1];
3961 a[1] = (b[0] << c[2]) - d[0];
3962 a[2] = (b[3] << c[1]) - d[3];
3963 a[3] = (b[2] << c[0]) - d[2];
3965 There are three different layouts here: one for a, one for b and d,
3966 and one for c. When optimizing for speed it is better to permute each
3967 of b, c and d into the order required by a, since those permutations
3968 happen in parallel. But when optimizing for size, it is better to:
3970 - permute c into the same order as b
3971 - do the arithmetic
3972 - permute the result into the order required by a
3974 This gives 2 permutations rather than 3. */
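
/* Put in terms of the slpg_layout_cost metrics above (an informal
   illustration that treats each layout change as a unit-weight permutation
   and ignores the arithmetic common to both schemes): the speed-oriented
   choice costs { depth 1, total 3 }, three permutations in parallel,
   whereas the size-oriented choice costs { depth 2, total 2 }, two
   permutations in series on the path through c.  is_better_than therefore
   prefers the former when optimizing for speed and the latter when
   optimizing for size.  */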
3976 class vect_optimize_slp_pass
3978 public:
3979 vect_optimize_slp_pass (vec_info *vinfo) : m_vinfo (vinfo) {}
3980 void run ();
3982 private:
3983 /* Graph building. */
3984 struct loop *containing_loop (slp_tree);
3985 bool is_cfg_latch_edge (graph_edge *);
3986 void build_vertices (hash_set<slp_tree> &, slp_tree);
3987 void build_vertices ();
3988 void build_graph ();
3990 /* Partitioning. */
3991 void create_partitions ();
3992 template<typename T> void for_each_partition_edge (unsigned int, T);
3994 /* Layout selection. */
3995 bool is_compatible_layout (slp_tree, unsigned int);
3996 int change_layout_cost (slp_tree, unsigned int, unsigned int);
3997 slpg_partition_layout_costs &partition_layout_costs (unsigned int,
3998 unsigned int);
3999 void change_vec_perm_layout (slp_tree, lane_permutation_t &,
4000 int, unsigned int);
4001 int internal_node_cost (slp_tree, int, unsigned int);
4002 void start_choosing_layouts ();
4004 /* Cost propagation. */
4005 slpg_layout_cost edge_layout_cost (graph_edge *, unsigned int,
4006 unsigned int, unsigned int);
4007 slpg_layout_cost total_in_cost (unsigned int);
4008 slpg_layout_cost forward_cost (graph_edge *, unsigned int, unsigned int);
4009 slpg_layout_cost backward_cost (graph_edge *, unsigned int, unsigned int);
4010 void forward_pass ();
4011 void backward_pass ();
4013 /* Rematerialization. */
4014 slp_tree get_result_with_layout (slp_tree, unsigned int);
4015 void materialize ();
4017 /* Clean-up. */
4018 void remove_redundant_permutations ();
4020 void dump ();
4022 vec_info *m_vinfo;
4024 /* True if we should optimize the graph for size, false if we should
4025 optimize it for speed. (It wouldn't be easy to make this decision
4026 more locally.) */
4027 bool m_optimize_size;
4029 /* A graph of all SLP nodes, with edges leading from uses to definitions.
4030 In other words, a node's predecessors are its slp_tree parents and
4031 a node's successors are its slp_tree children. */
4032 graph *m_slpg = nullptr;
4034 /* The vertices of M_SLPG, indexed by slp_tree::vertex. */
4035 auto_vec<slpg_vertex> m_vertices;
4037 /* The list of all leaves of M_SLPG, such as external definitions, constants,
4038 and loads. */
4039 auto_vec<int> m_leafs;
4041 /* This array has one entry for every vector layout that we're considering.
4042 Element 0 is null and indicates "no change". Other entries describe
4043 permutations that are inherent in the current graph and that we would
4044 like to reverse if possible.
4046 For example, a permutation { 1, 2, 3, 0 } means that something has
4047 effectively been permuted in that way, such as a load group
4048 { a[1], a[2], a[3], a[0] } (viewed as a permutation of a[0:3]).
4049 We'd then like to apply the reverse permutation { 3, 0, 1, 2 }
4050 in order to put things "back" in order. */
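
/* To make the reversal in the comment above concrete: indexing the loaded
   group { a[1], a[2], a[3], a[0] } through { 3, 0, 1, 2 }
   (result[i] = group[perm[i]]) yields { a[0], a[1], a[2], a[3] },
   i.e. the original order.  */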
4051 auto_vec<vec<unsigned> > m_perms;
4053 /* A partitioning of the nodes for which a layout must be chosen.
4054 Each partition represents an <SCC, cfg loop> pair; that is,
4055 nodes in different SCCs belong to different partitions, and nodes
4056 within an SCC can be further partitioned according to a containing
4057 cfg loop. Partition <SCC1, L1> comes before <SCC2, L2> if:
4059 - SCC1 != SCC2 and SCC1 is a predecessor of SCC2 in a forward walk
4060 from leaves (such as loads) to roots (such as stores).
4062 - SCC1 == SCC2 and L1's header strictly dominates L2's header. */
4063 auto_vec<slpg_partition_info> m_partitions;
4065 /* The list of all nodes for which a layout must be chosen. Nodes for
4066 partition P come before the nodes for partition P+1. Nodes within a
4067 partition are in reverse postorder. */
4068 auto_vec<unsigned int> m_partitioned_nodes;
4070 /* Index P * num-layouts + L contains the cost of using layout L
4071 for partition P. */
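
/* For example, with three candidate layouts the costs for partition 2
   occupy indices 6, 7 and 8 of this array.  */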
4072 auto_vec<slpg_partition_layout_costs> m_partition_layout_costs;
4074 /* Index N * num-layouts + L, if nonnull, is a node that provides the
4075 original output of node N adjusted to have layout L. */
4076 auto_vec<slp_tree> m_node_layouts;
4079 /* Fill the vertices and leafs vector with all nodes in the SLP graph.
4080 Also record whether we should optimize anything for speed rather
4081 than size. */
4083 void
4084 vect_optimize_slp_pass::build_vertices (hash_set<slp_tree> &visited,
4085 slp_tree node)
4087 unsigned i;
4088 slp_tree child;
4090 if (visited.add (node))
4091 return;
4093 if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
4095 basic_block bb = gimple_bb (vect_orig_stmt (rep)->stmt);
4096 if (optimize_bb_for_speed_p (bb))
4097 m_optimize_size = false;
4100 node->vertex = m_vertices.length ();
4101 m_vertices.safe_push (slpg_vertex (node));
4103 bool leaf = true;
4104 bool force_leaf = false;
4105 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4106 if (child)
4108 leaf = false;
4109 build_vertices (visited, child);
4111 else
4112 force_leaf = true;
4113 /* Since SLP discovery works along use-def edges all cycles have an
4114 entry - but there is an exception for cycles where we do not handle
4115 the entry explicitly (but with a NULL SLP node), like some reductions
4116 and inductions. Force those SLP PHIs to act as leafs to make them
4117 backwards reachable. */
4118 if (leaf || force_leaf)
4119 m_leafs.safe_push (node->vertex);
4122 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
4124 void
4125 vect_optimize_slp_pass::build_vertices ()
4127 hash_set<slp_tree> visited;
4128 unsigned i;
4129 slp_instance instance;
4130 FOR_EACH_VEC_ELT (m_vinfo->slp_instances, i, instance)
4131 build_vertices (visited, SLP_INSTANCE_TREE (instance));
4134 /* Apply the bijective permutation PERM to VEC; if REVERSE, apply its inverse. */
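
/* For example, with PERM = { 1, 2, 3, 0 } and VEC = { a, b, c, d },
   the forward direction produces { b, c, d, a } (vec[i] = old[perm[i]])
   and the REVERSE direction produces { d, a, b, c } (vec[perm[i]] = old[i]),
   undoing a previous forward application.  */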
4136 template <class T>
4137 static void
4138 vect_slp_permute (vec<unsigned> perm,
4139 vec<T> &vec, bool reverse)
4141 auto_vec<T, 64> saved;
4142 saved.create (vec.length ());
4143 for (unsigned i = 0; i < vec.length (); ++i)
4144 saved.quick_push (vec[i]);
4146 if (reverse)
4148 for (unsigned i = 0; i < vec.length (); ++i)
4149 vec[perm[i]] = saved[i];
4150 for (unsigned i = 0; i < vec.length (); ++i)
4151 gcc_assert (vec[perm[i]] == saved[i]);
4153 else
4155 for (unsigned i = 0; i < vec.length (); ++i)
4156 vec[i] = saved[perm[i]];
4157 for (unsigned i = 0; i < vec.length (); ++i)
4158 gcc_assert (vec[i] == saved[perm[i]]);
4162 /* Return the cfg loop that contains NODE. */
4164 struct loop *
4165 vect_optimize_slp_pass::containing_loop (slp_tree node)
4167 stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
4168 if (!rep)
4169 return ENTRY_BLOCK_PTR_FOR_FN (cfun)->loop_father;
4170 return gimple_bb (vect_orig_stmt (rep)->stmt)->loop_father;
4173 /* Return true if UD (an edge from a use to a definition) is associated
4174 with a loop latch edge in the cfg. */
4176 bool
4177 vect_optimize_slp_pass::is_cfg_latch_edge (graph_edge *ud)
4179 slp_tree use = m_vertices[ud->src].node;
4180 slp_tree def = m_vertices[ud->dest].node;
4181 if (SLP_TREE_DEF_TYPE (use) != vect_internal_def
4182 || SLP_TREE_DEF_TYPE (def) != vect_internal_def)
4183 return false;
4185 stmt_vec_info use_rep = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (use));
4186 return (is_a<gphi *> (use_rep->stmt)
4187 && bb_loop_header_p (gimple_bb (use_rep->stmt))
4188 && containing_loop (def) == containing_loop (use));
4191 /* Build the graph. Mark edges that correspond to cfg loop latch edges with
4192 a nonnull data field. */
4194 void
4195 vect_optimize_slp_pass::build_graph ()
4197 m_optimize_size = true;
4198 build_vertices ();
4200 m_slpg = new_graph (m_vertices.length ());
4201 for (slpg_vertex &v : m_vertices)
4202 for (slp_tree child : SLP_TREE_CHILDREN (v.node))
4203 if (child)
4205 graph_edge *ud = add_edge (m_slpg, v.node->vertex, child->vertex);
4206 if (is_cfg_latch_edge (ud))
4207 ud->data = this;
4211 /* Return true if E corresponds to a loop latch edge in the cfg. */
4213 static bool
4214 skip_cfg_latch_edges (graph_edge *e)
4216 return e->data;
4219 /* Create the node partitions. */
4221 void
4222 vect_optimize_slp_pass::create_partitions ()
4224 /* Calculate a postorder of the graph, ignoring edges that correspond
4225 to natural latch edges in the cfg. Reading the vector from the end
4226 to the beginning gives the reverse postorder. */
4227 auto_vec<int> initial_rpo;
4228 graphds_dfs (m_slpg, &m_leafs[0], m_leafs.length (), &initial_rpo,
4229 false, NULL, skip_cfg_latch_edges);
4230 gcc_assert (initial_rpo.length () == m_vertices.length ());
4232 /* Calculate the strongly connected components of the graph. */
4233 auto_vec<int> scc_grouping;
4234 unsigned int num_sccs = graphds_scc (m_slpg, NULL, NULL, &scc_grouping);
4236 /* Create a new index order in which all nodes from the same SCC are
4237 consecutive. Use scc_pos to record the index of the first node in
4238 each SCC. */
4239 auto_vec<unsigned int> scc_pos (num_sccs);
4240 int last_component = -1;
4241 unsigned int node_count = 0;
4242 for (unsigned int node_i : scc_grouping)
4244 if (last_component != m_slpg->vertices[node_i].component)
4246 last_component = m_slpg->vertices[node_i].component;
4247 gcc_assert (last_component == int (scc_pos.length ()));
4248 scc_pos.quick_push (node_count);
4250 node_count += 1;
4252 gcc_assert (node_count == initial_rpo.length ()
4253 && last_component + 1 == int (num_sccs));
4255 /* Use m_partitioned_nodes to group nodes into SCC order, with the nodes
4256 inside each SCC following the RPO we calculated above. The fact that
4257 we ignored natural latch edges when calculating the RPO should ensure
4258 that, for natural loop nests:
4260 - the first node that we encounter in a cfg loop is the loop header phi
4261 - the loop header phis are in dominance order
4263 Arranging for this is an optimization (see below) rather than a
4264 correctness issue. Unnatural loops with a tangled mess of backedges
4265 will still work correctly, but might give poorer results.
4267 Also update scc_pos so that it gives 1 + the index of the last node
4268 in the SCC. */
4269 m_partitioned_nodes.safe_grow (node_count);
4270 for (unsigned int old_i = initial_rpo.length (); old_i-- > 0;)
4272 unsigned int node_i = initial_rpo[old_i];
4273 unsigned int new_i = scc_pos[m_slpg->vertices[node_i].component]++;
4274 m_partitioned_nodes[new_i] = node_i;
4277 /* When optimizing for speed, partition each SCC based on the containing
4278 cfg loop. The order we constructed above should ensure that, for natural
4279 cfg loops, we'll create sub-SCC partitions for outer loops before
4280 the corresponding sub-SCC partitions for inner loops. Similarly,
4281 when one sibling loop A dominates another sibling loop B, we should
4282 create a sub-SCC partition for A before a sub-SCC partition for B.
4284 As above, nothing depends for correctness on whether this achieves
4285 a natural nesting, but we should get better results when it does. */
4286 m_partitions.reserve (m_vertices.length ());
4287 unsigned int next_partition_i = 0;
4288 hash_map<struct loop *, int> loop_partitions;
4289 unsigned int rpo_begin = 0;
4290 unsigned int num_partitioned_nodes = 0;
4291 for (unsigned int rpo_end : scc_pos)
4293 loop_partitions.empty ();
4294 unsigned int partition_i = next_partition_i;
4295 for (unsigned int rpo_i = rpo_begin; rpo_i < rpo_end; ++rpo_i)
4297 /* Handle externals and constants optimistically throughout.
4298 But treat existing vectors as fixed since we do not handle
4299 permuting them. */
4300 unsigned int node_i = m_partitioned_nodes[rpo_i];
4301 auto &vertex = m_vertices[node_i];
4302 if ((SLP_TREE_DEF_TYPE (vertex.node) == vect_external_def
4303 && !SLP_TREE_VEC_DEFS (vertex.node).exists ())
4304 || SLP_TREE_DEF_TYPE (vertex.node) == vect_constant_def)
4305 vertex.partition = -1;
4306 else
4308 bool existed;
4309 if (m_optimize_size)
4310 existed = next_partition_i > partition_i;
4311 else
4313 struct loop *loop = containing_loop (vertex.node);
4314 auto &entry = loop_partitions.get_or_insert (loop, &existed);
4315 if (!existed)
4316 entry = next_partition_i;
4317 partition_i = entry;
4319 if (!existed)
4321 m_partitions.quick_push (slpg_partition_info ());
4322 next_partition_i += 1;
4324 vertex.partition = partition_i;
4325 num_partitioned_nodes += 1;
4326 m_partitions[partition_i].node_end += 1;
4329 rpo_begin = rpo_end;
4332 /* Assign ranges of consecutive node indices to each partition,
4333 in partition order. Start with node_end being the same as
4334 node_begin so that the next loop can use it as a counter. */
4335 unsigned int node_begin = 0;
4336 for (auto &partition : m_partitions)
4338 partition.node_begin = node_begin;
4339 node_begin += partition.node_end;
4340 partition.node_end = partition.node_begin;
4342 gcc_assert (node_begin == num_partitioned_nodes);
4344 /* Finally build the list of nodes in partition order. */
4345 m_partitioned_nodes.truncate (num_partitioned_nodes);
4346 for (unsigned int node_i = 0; node_i < m_vertices.length (); ++node_i)
4348 int partition_i = m_vertices[node_i].partition;
4349 if (partition_i >= 0)
4351 unsigned int order_i = m_partitions[partition_i].node_end++;
4352 m_partitioned_nodes[order_i] = node_i;
4357 /* Look for edges from earlier partitions into node NODE_I and edges from
4358 node NODE_I into later partitions. Call:
4360 FN (ud, other_node_i)
4362 for each such use-to-def edge ud, where other_node_i is the node at the
4363 other end of the edge. */
4365 template<typename T>
4366 void
4367 vect_optimize_slp_pass::for_each_partition_edge (unsigned int node_i, T fn)
4369 int partition_i = m_vertices[node_i].partition;
4370 for (graph_edge *pred = m_slpg->vertices[node_i].pred;
4371 pred; pred = pred->pred_next)
4373 int src_partition_i = m_vertices[pred->src].partition;
4374 if (src_partition_i >= 0 && src_partition_i != partition_i)
4375 fn (pred, pred->src);
4377 for (graph_edge *succ = m_slpg->vertices[node_i].succ;
4378 succ; succ = succ->succ_next)
4380 int dest_partition_i = m_vertices[succ->dest].partition;
4381 if (dest_partition_i >= 0 && dest_partition_i != partition_i)
4382 fn (succ, succ->dest);
4386 /* Return true if layout LAYOUT_I is compatible with the number of SLP lanes
4387 that NODE would operate on. This test is independent of NODE's actual
4388 operation. */
4390 bool
4391 vect_optimize_slp_pass::is_compatible_layout (slp_tree node,
4392 unsigned int layout_i)
4394 if (layout_i == 0)
4395 return true;
4397 if (SLP_TREE_LANES (node) != m_perms[layout_i].length ())
4398 return false;
4400 return true;
4403 /* Return the cost (in arbitrary units) of going from layout FROM_LAYOUT_I
4404 to layout TO_LAYOUT_I for a node like NODE. Return -1 if either of the
4405 layouts is incompatible with NODE or if the change is not possible for
4406 some other reason.
4408 The properties taken from NODE include the number of lanes and the
4409 vector type. The actual operation doesn't matter. */
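
/* As a hypothetical illustration: moving a 4-lane node from layout
   { 1, 0, 3, 2 } back to layout 0 asks the target for the single lane
   permutation { 1, 0, 3, 2 } on the node's vector type; the cost is then
   the number of permutes the target needs (but at least 1), while
   identical FROM_LAYOUT_I and TO_LAYOUT_I cost 0.  */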
4411 int
4412 vect_optimize_slp_pass::change_layout_cost (slp_tree node,
4413 unsigned int from_layout_i,
4414 unsigned int to_layout_i)
4416 if (!is_compatible_layout (node, from_layout_i)
4417 || !is_compatible_layout (node, to_layout_i))
4418 return -1;
4420 if (from_layout_i == to_layout_i)
4421 return 0;
4423 auto_vec<slp_tree, 1> children (1);
4424 children.quick_push (node);
4425 auto_lane_permutation_t perm (SLP_TREE_LANES (node));
4426 if (from_layout_i > 0)
4427 for (unsigned int i : m_perms[from_layout_i])
4428 perm.quick_push ({ 0, i });
4429 else
4430 for (unsigned int i = 0; i < SLP_TREE_LANES (node); ++i)
4431 perm.quick_push ({ 0, i });
4432 if (to_layout_i > 0)
4433 vect_slp_permute (m_perms[to_layout_i], perm, true);
4434 auto count = vectorizable_slp_permutation_1 (m_vinfo, nullptr, node, perm,
4435 children, false);
4436 if (count >= 0)
4437 return MAX (count, 1);
4439 /* ??? In principle we could try changing via layout 0, giving two
4440 layout changes rather than 1. Doing that would require
4441 corresponding support in get_result_with_layout. */
4442 return -1;
4445 /* Return the costs of assigning layout LAYOUT_I to partition PARTITION_I. */
4447 inline slpg_partition_layout_costs &
4448 vect_optimize_slp_pass::partition_layout_costs (unsigned int partition_i,
4449 unsigned int layout_i)
4451 return m_partition_layout_costs[partition_i * m_perms.length () + layout_i];
4454 /* Change PERM in one of two ways:
4456 - if IN_LAYOUT_I < 0, accept input operand I in the layout that has been
4457 chosen for child I of NODE.
4459 - if IN_LAYOUT_I >= 0, accept all input operands with that layout.
4461 In both cases, arrange for the output to have layout OUT_LAYOUT_I. */
4463 void
4464 vect_optimize_slp_pass::
4465 change_vec_perm_layout (slp_tree node, lane_permutation_t &perm,
4466 int in_layout_i, unsigned int out_layout_i)
4468 for (auto &entry : perm)
4470 int this_in_layout_i = in_layout_i;
4471 if (this_in_layout_i < 0)
4473 slp_tree in_node = SLP_TREE_CHILDREN (node)[entry.first];
4474 unsigned int in_partition_i = m_vertices[in_node->vertex].partition;
4475 this_in_layout_i = m_partitions[in_partition_i].layout;
4477 if (this_in_layout_i > 0)
4478 entry.second = m_perms[this_in_layout_i][entry.second];
4480 if (out_layout_i > 0)
4481 vect_slp_permute (m_perms[out_layout_i], perm, true);
4484 /* Check whether the target allows NODE to be rearranged so that the node's
4485 output has layout OUT_LAYOUT_I. Return the cost of the change if so,
4486 in the same arbitrary units as for change_layout_cost. Return -1 otherwise.
4488 If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I < 0, also check whether
4489 NODE can adapt to the layout changes that have (perhaps provisionally)
4490 been chosen for NODE's children, so that no extra permutations are
4491 needed on either the input or the output of NODE.
4493 If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I >= 0, instead assume
4494 that all inputs will be forced into layout IN_LAYOUT_I beforehand.
4496 IN_LAYOUT_I has no meaning for other types of node.
4498 Keeping the node as-is is always valid. If the target doesn't appear
4499 to support the node as-is, but might realistically support other layouts,
4500 then layout 0 instead has the cost of a worst-case permutation. On the
4501 one hand, this ensures that every node has at least one valid layout,
4502 avoiding what would otherwise be an awkward special case. On the other,
4503 it still encourages the pass to change an invalid pre-existing layout
4504 choice into a valid one. */
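
/* Informally, for the cases handled below: a VEC_PERM_EXPR whose adjusted
   permutation turns out to be a no-op costs 0, one that still needs a real
   permute costs 1, and one the target cannot handle returns -1, except that
   the layout-0-to-layout-0 query falls back to cost 1 when the node could
   in principle support some other layout.  A similar scheme applies to
   permuted loads.  */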
4506 int
4507 vect_optimize_slp_pass::internal_node_cost (slp_tree node, int in_layout_i,
4508 unsigned int out_layout_i)
4510 const int fallback_cost = 1;
4512 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
4514 auto_lane_permutation_t tmp_perm;
4515 tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
4517 /* Check that the child nodes support the chosen layout. Checking
4518 the first child is enough, since any second child would have the
4519 same shape. */
4520 auto first_child = SLP_TREE_CHILDREN (node)[0];
4521 if (in_layout_i > 0
4522 && !is_compatible_layout (first_child, in_layout_i))
4523 return -1;
4525 change_vec_perm_layout (node, tmp_perm, in_layout_i, out_layout_i);
4526 int count = vectorizable_slp_permutation_1 (m_vinfo, nullptr,
4527 node, tmp_perm,
4528 SLP_TREE_CHILDREN (node),
4529 false);
4530 if (count < 0)
4532 if (in_layout_i == 0 && out_layout_i == 0)
4534 /* Use the fallback cost if the node could in principle support
4535 some nonzero layout for both the inputs and the outputs.
4536 Otherwise assume that the node will be rejected later
4537 and rebuilt from scalars. */
4538 if (SLP_TREE_LANES (node) == SLP_TREE_LANES (first_child))
4539 return fallback_cost;
4540 return 0;
4542 return -1;
4545 /* We currently have no way of telling whether the new layout is cheaper
4546 or more expensive than the old one. But at least in principle,
4547 it should be worth making zero permutations (whole-vector shuffles)
4548 cheaper than real permutations, in case the pass is able to remove
4549 the latter. */
4550 return count == 0 ? 0 : 1;
4553 stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
4554 if (rep
4555 && STMT_VINFO_DATA_REF (rep)
4556 && DR_IS_READ (STMT_VINFO_DATA_REF (rep))
4557 && SLP_TREE_LOAD_PERMUTATION (node).exists ())
4559 auto_load_permutation_t tmp_perm;
4560 tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
4561 if (out_layout_i > 0)
4562 vect_slp_permute (m_perms[out_layout_i], tmp_perm, true);
4564 poly_uint64 vf = 1;
4565 if (auto loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo))
4566 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
4567 unsigned int n_perms;
4568 if (!vect_transform_slp_perm_load_1 (m_vinfo, node, tmp_perm, vNULL,
4569 nullptr, vf, true, false, &n_perms))
4571 auto rep = SLP_TREE_REPRESENTATIVE (node);
4572 if (out_layout_i == 0)
4574 /* Use the fallback cost if the load is an N-to-N permutation.
4575 Otherwise assume that the node will be rejected later
4576 and rebuilt from scalars. */
4577 if (STMT_VINFO_GROUPED_ACCESS (rep)
4578 && (DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (rep))
4579 == SLP_TREE_LANES (node)))
4580 return fallback_cost;
4581 return 0;
4583 return -1;
4586 /* See the comment above the corresponding VEC_PERM_EXPR handling. */
4587 return n_perms == 0 ? 0 : 1;
4590 return 0;
4593 /* Decide which element layouts we should consider using. Calculate the
4594 weights associated with inserting layout changes on partition edges.
4595 Also mark partitions that cannot change layout, by setting their
4596 layout to zero. */
4598 void
4599 vect_optimize_slp_pass::start_choosing_layouts ()
4601 /* Used to assign unique permutation indices. */
4602 using perm_hash = unbounded_hashmap_traits<
4603 vec_free_hash_base<int_hash_base<unsigned>>,
4604 int_hash<int, -1, -2>
4606 hash_map<vec<unsigned>, int, perm_hash> layout_ids;
4608 /* Layout 0 is "no change". */
4609 m_perms.safe_push (vNULL);
4611 /* Create layouts from existing permutations. */
4612 auto_load_permutation_t tmp_perm;
4613 for (unsigned int node_i : m_partitioned_nodes)
4615 /* Leafs also double as entries to the reverse graph. Allow the
4616 layout of those to be changed. */
4617 auto &vertex = m_vertices[node_i];
4618 auto &partition = m_partitions[vertex.partition];
4619 if (!m_slpg->vertices[node_i].succ)
4620 partition.layout = 0;
4622 /* Loads and VEC_PERM_EXPRs are the only things generating permutes. */
4623 slp_tree node = vertex.node;
4624 stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node);
4625 slp_tree child;
4626 unsigned HOST_WIDE_INT imin, imax = 0;
4627 bool any_permute = false;
4628 tmp_perm.truncate (0);
4629 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
4631 /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the node
4632 unpermuted, record a layout that reverses this permutation.
4634 We would need more work to cope with loads that are internally
4635 permuted and also have inputs (such as masks for
4636 IFN_MASK_LOADs). */
4637 gcc_assert (partition.layout == 0 && !m_slpg->vertices[node_i].succ);
4638 if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
4640 partition.layout = -1;
4641 continue;
4643 dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt);
4644 imin = DR_GROUP_SIZE (dr_stmt) + 1;
4645 tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
4647 else if (SLP_TREE_CODE (node) == VEC_PERM_EXPR
4648 && SLP_TREE_CHILDREN (node).length () == 1
4649 && (child = SLP_TREE_CHILDREN (node)[0])
4650 && (TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (child))
4651 .is_constant (&imin)))
4653 /* If the child has the same vector size as this node,
4654 reversing the permutation can make the permutation a no-op.
4655 In other cases it can change a true permutation into a
4656 full-vector extract. */
4657 tmp_perm.reserve (SLP_TREE_LANES (node));
4658 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4659 tmp_perm.quick_push (SLP_TREE_LANE_PERMUTATION (node)[j].second);
4661 else
4662 continue;
4664 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4666 unsigned idx = tmp_perm[j];
4667 imin = MIN (imin, idx);
4668 imax = MAX (imax, idx);
4669 if (idx - tmp_perm[0] != j)
4670 any_permute = true;
4672 /* If the span doesn't match we'd disrupt VF computation; avoid
4673 that for now. */
4674 if (imax - imin + 1 != SLP_TREE_LANES (node))
4675 continue;
4676 /* If there's no permute, there's no need to split one out. In this case
4677 we can consider turning a load into a permuted load, if that
4678 turns out to be cheaper than alternatives. */
4679 if (!any_permute)
4681 partition.layout = -1;
4682 continue;
4685 /* For now only handle true permutes, like
4686 vect_attempt_slp_rearrange_stmts did. This allows us to be lazy
4687 when permuting constants and invariants, keeping the permute
4688 bijective. */
4689 auto_sbitmap load_index (SLP_TREE_LANES (node));
4690 bitmap_clear (load_index);
4691 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4692 bitmap_set_bit (load_index, tmp_perm[j] - imin);
4693 unsigned j;
4694 for (j = 0; j < SLP_TREE_LANES (node); ++j)
4695 if (!bitmap_bit_p (load_index, j))
4696 break;
4697 if (j != SLP_TREE_LANES (node))
4698 continue;
4700 vec<unsigned> perm = vNULL;
4701 perm.safe_grow (SLP_TREE_LANES (node), true);
4702 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4703 perm[j] = tmp_perm[j] - imin;
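
/* As a hypothetical example: a node that loads { a[2], a[3], a[0], a[1] }
   from a group of size 4 reaches this point with tmp_perm { 2, 3, 0, 1 },
   imin 0 and imax 3; the span matches the four lanes and the permutation is
   bijective, so (unless the candidate limit below has been reached) the
   partition records the layout { 2, 3, 0, 1 }.  */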
4705 if (int (m_perms.length ()) >= param_vect_max_layout_candidates)
4707 /* Continue to use existing layouts, but don't add any more. */
4708 int *entry = layout_ids.get (perm);
4709 partition.layout = entry ? *entry : 0;
4710 perm.release ();
4712 else
4714 bool existed;
4715 int &layout_i = layout_ids.get_or_insert (perm, &existed);
4716 if (existed)
4717 perm.release ();
4718 else
4720 layout_i = m_perms.length ();
4721 m_perms.safe_push (perm);
4723 partition.layout = layout_i;
4727 /* Initially assume that every layout is possible and has zero cost
4728 in every partition. */
4729 m_partition_layout_costs.safe_grow_cleared (m_partitions.length ()
4730 * m_perms.length ());
4732 /* We have to mark outgoing permutations facing non-associating-reduction
4733 graph entries that are not represented as needing to be materialized.
4734 slp_inst_kind_bb_reduc currently only covers associatable reductions. */
4735 for (slp_instance instance : m_vinfo->slp_instances)
4736 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor)
4738 unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex;
4739 m_partitions[m_vertices[node_i].partition].layout = 0;
4741 else if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_reduc_chain)
4743 stmt_vec_info stmt_info
4744 = SLP_TREE_REPRESENTATIVE (SLP_INSTANCE_TREE (instance));
4745 stmt_vec_info reduc_info = info_for_reduction (m_vinfo, stmt_info);
4746 if (needs_fold_left_reduction_p (TREE_TYPE
4747 (gimple_get_lhs (stmt_info->stmt)),
4748 STMT_VINFO_REDUC_CODE (reduc_info)))
4750 unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex;
4751 m_partitions[m_vertices[node_i].partition].layout = 0;
4755 /* Check which layouts each node and partition can handle. Calculate the
4756 weights associated with inserting layout changes on edges. */
4757 for (unsigned int node_i : m_partitioned_nodes)
4759 auto &vertex = m_vertices[node_i];
4760 auto &partition = m_partitions[vertex.partition];
4761 slp_tree node = vertex.node;
4763 if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
4765 vertex.weight = vect_slp_node_weight (node);
4767 /* We do not handle stores with a permutation, so all
4768 incoming permutations must have been materialized.
4770 We also don't handle masked grouped loads, which lack a
4771 permutation vector. In this case the memory locations
4772 form an implicit second input to the loads, on top of the
4773 explicit mask input, and the memory input's layout cannot
4774 be changed.
4776 On the other hand, we do support permuting gather loads and
4777 masked gather loads, where each scalar load is independent
4778 of the others. This can be useful if the address/index input
4779 benefits from permutation. */
4780 if (STMT_VINFO_DATA_REF (rep)
4781 && STMT_VINFO_GROUPED_ACCESS (rep)
4782 && !SLP_TREE_LOAD_PERMUTATION (node).exists ())
4783 partition.layout = 0;
4785 /* We cannot change the layout of an operation that is
4786 not independent of the lanes. Note this is an explicit
4787 negative list since that's much shorter than the respective
4788 positive one but it's critical to keep maintaining it. */
4789 if (is_gimple_call (STMT_VINFO_STMT (rep)))
4790 switch (gimple_call_combined_fn (STMT_VINFO_STMT (rep)))
4792 case CFN_COMPLEX_ADD_ROT90:
4793 case CFN_COMPLEX_ADD_ROT270:
4794 case CFN_COMPLEX_MUL:
4795 case CFN_COMPLEX_MUL_CONJ:
4796 case CFN_VEC_ADDSUB:
4797 case CFN_VEC_FMADDSUB:
4798 case CFN_VEC_FMSUBADD:
4799 partition.layout = 0;
4800 default:;
4804 auto process_edge = [&](graph_edge *ud, unsigned int other_node_i)
4806 auto &other_vertex = m_vertices[other_node_i];
4808 /* Count the number of edges from earlier partitions and the number
4809 of edges to later partitions. */
4810 if (other_vertex.partition < vertex.partition)
4811 partition.in_degree += 1;
4812 else
4813 partition.out_degree += 1;
4815 /* If the current node uses the result of OTHER_NODE_I, accumulate
4816 the effects of that. */
4817 if (ud->src == int (node_i))
4819 other_vertex.out_weight += vertex.weight;
4820 other_vertex.out_degree += 1;
4823 for_each_partition_edge (node_i, process_edge);
4827 /* Return the incoming costs for node NODE_I, assuming that each input keeps
4828 its current (provisional) choice of layout. The inputs do not necessarily
4829 have the same layout as each other. */
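
/* For example (hypothetical values): if two earlier partitions contribute
   { depth 1, total 2 } and { depth 2, total 1 } once their own costs have
   been split among their consumers, the combined incoming cost is
   { depth 2, total 3 }, since parallel inputs overlap in depth but their
   operation counts add.  */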
4831 slpg_layout_cost
4832 vect_optimize_slp_pass::total_in_cost (unsigned int node_i)
4834 auto &vertex = m_vertices[node_i];
4835 slpg_layout_cost cost;
4836 auto add_cost = [&](graph_edge *, unsigned int other_node_i)
4838 auto &other_vertex = m_vertices[other_node_i];
4839 if (other_vertex.partition < vertex.partition)
4841 auto &other_partition = m_partitions[other_vertex.partition];
4842 auto &other_costs = partition_layout_costs (other_vertex.partition,
4843 other_partition.layout);
4844 slpg_layout_cost this_cost = other_costs.in_cost;
4845 this_cost.add_serial_cost (other_costs.internal_cost);
4846 this_cost.split (other_partition.out_degree);
4847 cost.add_parallel_cost (this_cost);
4850 for_each_partition_edge (node_i, add_cost);
4851 return cost;
4854 /* Return the cost of switching between layout LAYOUT1_I (at node NODE1_I)
4855 and layout LAYOUT2_I on cross-partition use-to-def edge UD. Return
4856 slpg_layout_cost::impossible () if the change isn't possible. */
4858 slpg_layout_cost
4859 vect_optimize_slp_pass::
4860 edge_layout_cost (graph_edge *ud, unsigned int node1_i, unsigned int layout1_i,
4861 unsigned int layout2_i)
4863 auto &def_vertex = m_vertices[ud->dest];
4864 auto &use_vertex = m_vertices[ud->src];
4865 auto def_layout_i = ud->dest == int (node1_i) ? layout1_i : layout2_i;
4866 auto use_layout_i = ud->dest == int (node1_i) ? layout2_i : layout1_i;
4867 auto factor = change_layout_cost (def_vertex.node, def_layout_i,
4868 use_layout_i);
4869 if (factor < 0)
4870 return slpg_layout_cost::impossible ();
4872 /* We have a choice of putting the layout change at the site of the
4873 definition or at the site of the use. Prefer the former when
4874 optimizing for size or when the execution frequency of the
4875 definition is no greater than the combined execution frequencies of
4876 the uses. When putting the layout change at the site of the definition,
4877 divvy up the cost among all consumers. */
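/* For example (hypothetical numbers): a definition of weight 1 with
   out_degree 4 and out_weight 8 whose layout change needs a single permute
   (factor 1) is charged at the definition site; each consumer then sees
   { depth 1, total 0.25 } rather than the { depth 2, total 2 } that a
   weight-2 use would pay for a use-site permute.  */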
4878 if (m_optimize_size || def_vertex.weight <= def_vertex.out_weight)
4880 slpg_layout_cost cost = { def_vertex.weight * factor, m_optimize_size };
4881 cost.split (def_vertex.out_degree);
4882 return cost;
4884 return { use_vertex.weight * factor, m_optimize_size };
4887 /* UD represents a use-def link between FROM_NODE_I and a node in a later
4888 partition; FROM_NODE_I could be the definition node or the use node.
4889 The node at the other end of the link wants to use layout TO_LAYOUT_I.
4890 Return the cost of any necessary fix-ups on edge UD, or return
4891 slpg_layout_cost::impossible () if the change isn't possible.
4893 At this point, FROM_NODE_I's partition has chosen the cheapest
4894 layout based on the information available so far, but this choice
4895 is only provisional. */
4897 slpg_layout_cost
4898 vect_optimize_slp_pass::forward_cost (graph_edge *ud, unsigned int from_node_i,
4899 unsigned int to_layout_i)
4901 auto &from_vertex = m_vertices[from_node_i];
4902 unsigned int from_partition_i = from_vertex.partition;
4903 slpg_partition_info &from_partition = m_partitions[from_partition_i];
4904 gcc_assert (from_partition.layout >= 0);
4906 /* First calculate the cost on the assumption that FROM_PARTITION sticks
4907 with its current layout preference. */
4908 slpg_layout_cost cost = slpg_layout_cost::impossible ();
4909 auto edge_cost = edge_layout_cost (ud, from_node_i,
4910 from_partition.layout, to_layout_i);
4911 if (edge_cost.is_possible ())
4913 auto &from_costs = partition_layout_costs (from_partition_i,
4914 from_partition.layout);
4915 cost = from_costs.in_cost;
4916 cost.add_serial_cost (from_costs.internal_cost);
4917 cost.split (from_partition.out_degree);
4918 cost.add_serial_cost (edge_cost);
4921 /* Take the minimum of that cost and the cost that applies if
4922 FROM_PARTITION instead switches to TO_LAYOUT_I. */
4923 auto &direct_layout_costs = partition_layout_costs (from_partition_i,
4924 to_layout_i);
4925 if (direct_layout_costs.is_possible ())
4927 slpg_layout_cost direct_cost = direct_layout_costs.in_cost;
4928 direct_cost.add_serial_cost (direct_layout_costs.internal_cost);
4929 direct_cost.split (from_partition.out_degree);
4930 if (!cost.is_possible ()
4931 || direct_cost.is_better_than (cost, m_optimize_size))
4932 cost = direct_cost;
4935 return cost;
4938 /* UD represents a use-def link between TO_NODE_I and a node in an earlier
4939 partition; TO_NODE_I could be the definition node or the use node.
4940 The node at the other end of the link wants to use layout FROM_LAYOUT_I;
4941 return the cost of any necessary fix-ups on edge UD, or
4942 slpg_layout_cost::impossible () if the choice cannot be made.
4944 At this point, TO_NODE_I's partition has a fixed choice of layout. */
4946 slpg_layout_cost
4947 vect_optimize_slp_pass::backward_cost (graph_edge *ud, unsigned int to_node_i,
4948 unsigned int from_layout_i)
4950 auto &to_vertex = m_vertices[to_node_i];
4951 unsigned int to_partition_i = to_vertex.partition;
4952 slpg_partition_info &to_partition = m_partitions[to_partition_i];
4953 gcc_assert (to_partition.layout >= 0);
4955 /* If TO_NODE_I is a VEC_PERM_EXPR consumer, see whether it can be
4956 adjusted for this input having layout FROM_LAYOUT_I. Assume that
4957 any other inputs keep their current choice of layout. */
4958 auto &to_costs = partition_layout_costs (to_partition_i,
4959 to_partition.layout);
4960 if (ud->src == int (to_node_i)
4961 && SLP_TREE_CODE (to_vertex.node) == VEC_PERM_EXPR)
4963 auto &from_partition = m_partitions[m_vertices[ud->dest].partition];
4964 auto old_layout = from_partition.layout;
4965 from_partition.layout = from_layout_i;
4966 int factor = internal_node_cost (to_vertex.node, -1,
4967 to_partition.layout);
4968 from_partition.layout = old_layout;
4969 if (factor >= 0)
4971 slpg_layout_cost cost = to_costs.out_cost;
4972 cost.add_serial_cost ({ to_vertex.weight * factor,
4973 m_optimize_size });
4974 cost.split (to_partition.in_degree);
4975 return cost;
4979 /* Compute the cost if we insert any necessary layout change on edge UD. */
4980 auto edge_cost = edge_layout_cost (ud, to_node_i,
4981 to_partition.layout, from_layout_i);
4982 if (edge_cost.is_possible ())
4984 slpg_layout_cost cost = to_costs.out_cost;
4985 cost.add_serial_cost (to_costs.internal_cost);
4986 cost.split (to_partition.in_degree);
4987 cost.add_serial_cost (edge_cost);
4988 return cost;
4991 return slpg_layout_cost::impossible ();
4994 /* Make a forward pass through the partitions, accumulating input costs.
4995 Make a tentative (provisional) choice of layout for each partition,
4996 ensuring that this choice still allows later partitions to keep
4997 their original layout. */
4999 void
5000 vect_optimize_slp_pass::forward_pass ()
5002 for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
5003 ++partition_i)
5005 auto &partition = m_partitions[partition_i];
5007 /* If the partition consists of a single VEC_PERM_EXPR, precompute
5008 the incoming cost that would apply if every predecessor partition
5009 keeps its current layout. This is used within the loop below. */
5010 slpg_layout_cost in_cost;
5011 slp_tree single_node = nullptr;
5012 if (partition.node_end == partition.node_begin + 1)
5014 unsigned int node_i = m_partitioned_nodes[partition.node_begin];
5015 single_node = m_vertices[node_i].node;
5016 if (SLP_TREE_CODE (single_node) == VEC_PERM_EXPR)
5017 in_cost = total_in_cost (node_i);
5020 /* Go through the possible layouts. Decide which ones are valid
5021 for this partition and record which of the valid layouts has
5022 the lowest cost. */
5023 unsigned int min_layout_i = 0;
5024 slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
5025 for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
5027 auto &layout_costs = partition_layout_costs (partition_i, layout_i);
5028 if (!layout_costs.is_possible ())
5029 continue;
5031 /* If the recorded layout is already 0 then the layout cannot
5032 change. */
5033 if (partition.layout == 0 && layout_i != 0)
5035 layout_costs.mark_impossible ();
5036 continue;
5039 bool is_possible = true;
5040 for (unsigned int order_i = partition.node_begin;
5041 order_i < partition.node_end; ++order_i)
5043 unsigned int node_i = m_partitioned_nodes[order_i];
5044 auto &vertex = m_vertices[node_i];
5046 /* Reject the layout if it is individually incompatible
5047 with any node in the partition. */
5048 if (!is_compatible_layout (vertex.node, layout_i))
5050 is_possible = false;
5051 break;
5054 auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
5056 auto &other_vertex = m_vertices[other_node_i];
5057 if (other_vertex.partition < vertex.partition)
5059 /* Accumulate the incoming costs from earlier
5060 partitions, plus the cost of any layout changes
5061 on UD itself. */
5062 auto cost = forward_cost (ud, other_node_i, layout_i);
5063 if (!cost.is_possible ())
5064 is_possible = false;
5065 else
5066 layout_costs.in_cost.add_parallel_cost (cost);
5068 else
5069 /* Reject the layout if it would make layout 0 impossible
5070 for later partitions. This amounts to testing that the
5071 target supports reversing the layout change on edges
5072 to later partitions.
5074 In principle, it might be possible to push a layout
5075 change all the way down a graph, so that it never
5076 needs to be reversed and so that the target doesn't
5077 need to support the reverse operation. But it would
5078 be awkward to bail out if we hit a partition that
5079 does not support the new layout, especially since
5080 we are not dealing with a lattice. */
5081 is_possible &= edge_layout_cost (ud, other_node_i, 0,
5082 layout_i).is_possible ();
5084 for_each_partition_edge (node_i, add_cost);
5086 /* Accumulate the cost of using LAYOUT_I within NODE,
5087 both for the inputs and the outputs. */
5088 int factor = internal_node_cost (vertex.node, layout_i,
5089 layout_i);
5090 if (factor < 0)
5092 is_possible = false;
5093 break;
5095 else if (factor)
5096 layout_costs.internal_cost.add_serial_cost
5097 ({ vertex.weight * factor, m_optimize_size });
5099 if (!is_possible)
5101 layout_costs.mark_impossible ();
5102 continue;
5105 /* Combine the incoming and partition-internal costs. */
5106 slpg_layout_cost combined_cost = layout_costs.in_cost;
5107 combined_cost.add_serial_cost (layout_costs.internal_cost);
5109 /* If this partition consists of a single VEC_PERM_EXPR, see
5110 if the VEC_PERM_EXPR can be changed to support output layout
5111 LAYOUT_I while keeping all the provisional choices of input
5112 layout. */
5113 if (single_node
5114 && SLP_TREE_CODE (single_node) == VEC_PERM_EXPR)
5116 int factor = internal_node_cost (single_node, -1, layout_i);
5117 if (factor >= 0)
5119 auto weight = m_vertices[single_node->vertex].weight;
5120 slpg_layout_cost internal_cost
5121 = { weight * factor, m_optimize_size };
5123 slpg_layout_cost alt_cost = in_cost;
5124 alt_cost.add_serial_cost (internal_cost);
5125 if (alt_cost.is_better_than (combined_cost, m_optimize_size))
5127 combined_cost = alt_cost;
5128 layout_costs.in_cost = in_cost;
5129 layout_costs.internal_cost = internal_cost;
5134 /* Record the layout with the lowest cost. Prefer layout 0 in
5135 the event of a tie between it and another layout. */
5136 if (!min_layout_cost.is_possible ()
5137 || combined_cost.is_better_than (min_layout_cost,
5138 m_optimize_size))
5140 min_layout_i = layout_i;
5141 min_layout_cost = combined_cost;
5145 /* This loop's handling of earlier partitions should ensure that
5146 choosing the original layout for the current partition is no
5147 less valid than it was in the original graph, even with the
5148 provisional layout choices for those earlier partitions. */
5149 gcc_assert (min_layout_cost.is_possible ());
5150 partition.layout = min_layout_i;
5154 /* Make a backward pass through the partitions, accumulating output costs.
5155 Make a final choice of layout for each partition. */
5157 void
5158 vect_optimize_slp_pass::backward_pass ()
5160 for (unsigned int partition_i = m_partitions.length (); partition_i-- > 0;)
5162 auto &partition = m_partitions[partition_i];
5164 unsigned int min_layout_i = 0;
5165 slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
5166 for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
5168 auto &layout_costs = partition_layout_costs (partition_i, layout_i);
5169 if (!layout_costs.is_possible ())
5170 continue;
5172 /* Accumulate the costs from successor partitions. */
5173 bool is_possible = true;
5174 for (unsigned int order_i = partition.node_begin;
5175 order_i < partition.node_end; ++order_i)
5177 unsigned int node_i = m_partitioned_nodes[order_i];
5178 auto &vertex = m_vertices[node_i];
5179 auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
5181 auto &other_vertex = m_vertices[other_node_i];
5182 auto &other_partition = m_partitions[other_vertex.partition];
5183 if (other_vertex.partition > vertex.partition)
5185 /* Accumulate the incoming costs from later
5186 partitions, plus the cost of any layout changes
5187 on UD itself. */
5188 auto cost = backward_cost (ud, other_node_i, layout_i);
5189 if (!cost.is_possible ())
5190 is_possible = false;
5191 else
5192 layout_costs.out_cost.add_parallel_cost (cost);
5194 else
5195 /* Make sure that earlier partitions can (if necessary
5196 or beneficial) keep the layout that they chose in
5197 the forward pass. This ensures that there is at
5198 least one valid choice of layout. */
5199 is_possible &= edge_layout_cost (ud, other_node_i,
5200 other_partition.layout,
5201 layout_i).is_possible ();
5203 for_each_partition_edge (node_i, add_cost);
5205 if (!is_possible)
5207 layout_costs.mark_impossible ();
5208 continue;
5211 /* Locally combine the costs from the forward and backward passes.
5212 (This combined cost is not passed on, since that would lead
5213 to double counting.) */
5214 slpg_layout_cost combined_cost = layout_costs.in_cost;
5215 combined_cost.add_serial_cost (layout_costs.internal_cost);
5216 combined_cost.add_serial_cost (layout_costs.out_cost);
5218 /* Record the layout with the lowest cost. Prefer layout 0 in
5219 the event of a tie between it and another layout. */
5220 if (!min_layout_cost.is_possible ()
5221 || combined_cost.is_better_than (min_layout_cost,
5222 m_optimize_size))
5224 min_layout_i = layout_i;
5225 min_layout_cost = combined_cost;
5229 gcc_assert (min_layout_cost.is_possible ());
5230 partition.layout = min_layout_i;
5234 /* Return a node that applies layout TO_LAYOUT_I to the original form of NODE.
5235 NODE already has the layout that was selected for its partition. */
5237 slp_tree
5238 vect_optimize_slp_pass::get_result_with_layout (slp_tree node,
5239 unsigned int to_layout_i)
5241 unsigned int result_i = node->vertex * m_perms.length () + to_layout_i;
5242 slp_tree result = m_node_layouts[result_i];
5243 if (result)
5244 return result;
5246 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
5247 || (SLP_TREE_DEF_TYPE (node) == vect_external_def
5248 /* We can't permute vector defs in place. */
5249 && SLP_TREE_VEC_DEFS (node).is_empty ()))
5251 /* If the vector is uniform or unchanged, there's nothing to do. */
5252 if (to_layout_i == 0 || vect_slp_tree_uniform_p (node))
5253 result = node;
5254 else
5256 auto scalar_ops = SLP_TREE_SCALAR_OPS (node).copy ();
5257 result = vect_create_new_slp_node (scalar_ops);
5258 vect_slp_permute (m_perms[to_layout_i], scalar_ops, true);
5261 else
5263 unsigned int partition_i = m_vertices[node->vertex].partition;
5264 unsigned int from_layout_i = m_partitions[partition_i].layout;
5265 if (from_layout_i == to_layout_i)
5266 return node;
5268 /* If NODE is itself a VEC_PERM_EXPR, try to create a parallel
5269 permutation instead of a serial one. Leave the new permutation
5270 in TMP_PERM on success. */
5271 auto_lane_permutation_t tmp_perm;
5272 unsigned int num_inputs = 1;
5273 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5275 tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
5276 if (from_layout_i != 0)
5277 vect_slp_permute (m_perms[from_layout_i], tmp_perm, false);
5278 if (to_layout_i != 0)
5279 vect_slp_permute (m_perms[to_layout_i], tmp_perm, true);
5280 if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
5281 tmp_perm,
5282 SLP_TREE_CHILDREN (node),
5283 false) >= 0)
5284 num_inputs = SLP_TREE_CHILDREN (node).length ();
5285 else
5286 tmp_perm.truncate (0);
5289 if (dump_enabled_p ())
5291 if (tmp_perm.length () > 0)
5292 dump_printf_loc (MSG_NOTE, vect_location,
5293 "duplicating permutation node %p with"
5294 " layout %d\n",
5295 (void *) node, to_layout_i);
5296 else
5297 dump_printf_loc (MSG_NOTE, vect_location,
5298 "inserting permutation node in place of %p\n",
5299 (void *) node);
5302 unsigned int num_lanes = SLP_TREE_LANES (node);
5303 result = vect_create_new_slp_node (num_inputs, VEC_PERM_EXPR);
5304 if (SLP_TREE_SCALAR_STMTS (node).length ())
5306 auto &stmts = SLP_TREE_SCALAR_STMTS (result);
5307 stmts.safe_splice (SLP_TREE_SCALAR_STMTS (node));
5308 if (from_layout_i != 0)
5309 vect_slp_permute (m_perms[from_layout_i], stmts, false);
5310 if (to_layout_i != 0)
5311 vect_slp_permute (m_perms[to_layout_i], stmts, true);
5313 SLP_TREE_REPRESENTATIVE (result) = SLP_TREE_REPRESENTATIVE (node);
5314 SLP_TREE_LANES (result) = num_lanes;
5315 SLP_TREE_VECTYPE (result) = SLP_TREE_VECTYPE (node);
5316 result->vertex = -1;
5318 auto &lane_perm = SLP_TREE_LANE_PERMUTATION (result);
5319 if (tmp_perm.length ())
5321 lane_perm.safe_splice (tmp_perm);
5322 SLP_TREE_CHILDREN (result).safe_splice (SLP_TREE_CHILDREN (node));
5324 else
5326 lane_perm.create (num_lanes);
5327 for (unsigned j = 0; j < num_lanes; ++j)
5328 lane_perm.quick_push ({ 0, j });
5329 if (from_layout_i != 0)
5330 vect_slp_permute (m_perms[from_layout_i], lane_perm, false);
5331 if (to_layout_i != 0)
5332 vect_slp_permute (m_perms[to_layout_i], lane_perm, true);
5333 SLP_TREE_CHILDREN (result).safe_push (node);
5335 for (slp_tree child : SLP_TREE_CHILDREN (result))
5336 child->refcnt++;
5338 m_node_layouts[result_i] = result;
5339 return result;
5342 /* Apply the chosen vector layouts to the SLP graph. */
5344 void
5345 vect_optimize_slp_pass::materialize ()
5347 /* We no longer need the costs, so avoid having two O(N * P) arrays
5348 live at the same time. */
5349 m_partition_layout_costs.release ();
5350 m_node_layouts.safe_grow_cleared (m_vertices.length () * m_perms.length ());
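/* FULLY_FOLDED tracks VEC_PERM nodes whose lane permutation already absorbed
   the layouts of their inputs; the child-replacement walk below leaves the
   children of such nodes untouched.  */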
5352 auto_sbitmap fully_folded (m_vertices.length ());
5353 bitmap_clear (fully_folded);
5354 for (unsigned int node_i : m_partitioned_nodes)
5356 auto &vertex = m_vertices[node_i];
5357 slp_tree node = vertex.node;
5358 int layout_i = m_partitions[vertex.partition].layout;
5359 gcc_assert (layout_i >= 0);
5361 /* Rearrange the scalar statements to match the chosen layout. */
5362 if (layout_i > 0)
5363 vect_slp_permute (m_perms[layout_i],
5364 SLP_TREE_SCALAR_STMTS (node), true);
5366 /* Update load and lane permutations. */
5367 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5369 /* First try to absorb the input vector layouts. If that fails,
5370 force the inputs to have layout LAYOUT_I too. We checked that
5371 that was possible before deciding to use nonzero output layouts.
5372 (Note that at this stage we don't really have any guarantee that
5373 the target supports the original VEC_PERM_EXPR.) */
5374 auto &perm = SLP_TREE_LANE_PERMUTATION (node);
5375 auto_lane_permutation_t tmp_perm;
5376 tmp_perm.safe_splice (perm);
5377 change_vec_perm_layout (node, tmp_perm, -1, layout_i);
5378 if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
5379 tmp_perm,
5380 SLP_TREE_CHILDREN (node),
5381 false) >= 0)
5383 if (dump_enabled_p ()
5384 && !std::equal (tmp_perm.begin (), tmp_perm.end (),
5385 perm.begin ()))
5386 dump_printf_loc (MSG_NOTE, vect_location,
5387 "absorbing input layouts into %p\n",
5388 (void *) node);
5389 std::copy (tmp_perm.begin (), tmp_perm.end (), perm.begin ());
5390 bitmap_set_bit (fully_folded, node_i);
5392 else
5394 /* Not MSG_MISSED because it would make no sense to users. */
5395 if (dump_enabled_p ())
5396 dump_printf_loc (MSG_NOTE, vect_location,
5397 "failed to absorb input layouts into %p\n",
5398 (void *) node);
5399 change_vec_perm_layout (nullptr, perm, layout_i, layout_i);
5402 else
5404 gcc_assert (!SLP_TREE_LANE_PERMUTATION (node).exists ());
5405 auto &load_perm = SLP_TREE_LOAD_PERMUTATION (node);
5406 if (layout_i > 0)
5407 /* ??? When we handle non-bijective permutes the idea
5408 is that we can force the load-permutation to be
5409 { min, min + 1, min + 2, ... max }. But then the
5410 scalar defs might no longer match the lane content
5411 which means wrong-code with live lane vectorization.
5412 So we possibly have to have NULL entries for those. */
5413 vect_slp_permute (m_perms[layout_i], load_perm, true);
5417 /* Do this before any nodes disappear, since it involves a walk
5418 over the leaves. */
5419 remove_redundant_permutations ();
5421 /* Replace each child with a correctly laid-out version. */
5422 for (unsigned int node_i : m_partitioned_nodes)
5424 /* Skip nodes that have already been handled above. */
5425 if (bitmap_bit_p (fully_folded, node_i))
5426 continue;
5428 auto &vertex = m_vertices[node_i];
5429 int in_layout_i = m_partitions[vertex.partition].layout;
5430 gcc_assert (in_layout_i >= 0);
5432 unsigned j;
5433 slp_tree child;
5434 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (vertex.node), j, child)
5436 if (!child)
5437 continue;
5439 slp_tree new_child = get_result_with_layout (child, in_layout_i);
5440 if (new_child != child)
5442 vect_free_slp_tree (child);
5443 SLP_TREE_CHILDREN (vertex.node)[j] = new_child;
5444 new_child->refcnt += 1;
5450 /* Elide load permutations that are not necessary. Such permutations might
5451 be pre-existing, rather than created by the layout optimizations. */
5453 void
5454 vect_optimize_slp_pass::remove_redundant_permutations ()
5456 for (unsigned int node_i : m_leafs)
5458 slp_tree node = m_vertices[node_i].node;
5459 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
5460 continue;
5462 /* In basic block vectorization we allow any subchain of an interleaving
5463 chain.
5464 FORNOW: not in loop SLP because of realignment complications. */
5465 if (is_a <bb_vec_info> (m_vinfo))
5467 bool subchain_p = true;
5468 stmt_vec_info next_load_info = NULL;
5469 stmt_vec_info load_info;
5470 unsigned j;
5471 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
5473 if (j != 0
5474 && (next_load_info != load_info
5475 || DR_GROUP_GAP (load_info) != 1))
5477 subchain_p = false;
5478 break;
5480 next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
5482 if (subchain_p)
5484 SLP_TREE_LOAD_PERMUTATION (node).release ();
5485 continue;
5488 else
5490 loop_vec_info loop_vinfo = as_a<loop_vec_info> (m_vinfo);
5491 stmt_vec_info load_info;
5492 bool this_load_permuted = false;
5493 unsigned j;
5494 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
5495 if (SLP_TREE_LOAD_PERMUTATION (node)[j] != j)
5497 this_load_permuted = true;
5498 break;
 5500 /* When this isn't a grouped access we know it's a single element
5501 and contiguous. */
5502 if (!STMT_VINFO_GROUPED_ACCESS (SLP_TREE_SCALAR_STMTS (node)[0]))
5504 if (!this_load_permuted
5505 && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
5506 || SLP_TREE_LANES (node) == 1))
5507 SLP_TREE_LOAD_PERMUTATION (node).release ();
5508 continue;
5510 stmt_vec_info first_stmt_info
5511 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
5512 if (!this_load_permuted
5513 /* The load requires permutation when unrolling exposes
5514 a gap either because the group is larger than the SLP
5515 group-size or because there is a gap between the groups. */
5516 && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
5517 || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
5518 && DR_GROUP_GAP (first_stmt_info) == 0)))
5520 SLP_TREE_LOAD_PERMUTATION (node).release ();
5521 continue;
5527 /* Print the partition graph and layout information to the dump file. */
5529 void
5530 vect_optimize_slp_pass::dump ()
5532 dump_printf_loc (MSG_NOTE, vect_location,
5533 "SLP optimize permutations:\n");
5534 for (unsigned int layout_i = 1; layout_i < m_perms.length (); ++layout_i)
5536 dump_printf_loc (MSG_NOTE, vect_location, " %d: { ", layout_i);
5537 const char *sep = "";
5538 for (unsigned int idx : m_perms[layout_i])
5540 dump_printf (MSG_NOTE, "%s%d", sep, idx);
5541 sep = ", ";
5543 dump_printf (MSG_NOTE, " }\n");
5545 dump_printf_loc (MSG_NOTE, vect_location,
5546 "SLP optimize partitions:\n");
5547 for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
5548 ++partition_i)
5550 auto &partition = m_partitions[partition_i];
5551 dump_printf_loc (MSG_NOTE, vect_location, " -------------\n");
5552 dump_printf_loc (MSG_NOTE, vect_location,
5553 " partition %d (layout %d):\n",
5554 partition_i, partition.layout);
5555 dump_printf_loc (MSG_NOTE, vect_location, " nodes:\n");
5556 for (unsigned int order_i = partition.node_begin;
5557 order_i < partition.node_end; ++order_i)
5559 auto &vertex = m_vertices[m_partitioned_nodes[order_i]];
5560 dump_printf_loc (MSG_NOTE, vect_location, " - %p:\n",
5561 (void *) vertex.node);
5562 dump_printf_loc (MSG_NOTE, vect_location,
5563 " weight: %f\n",
5564 vertex.weight.to_double ());
5565 if (vertex.out_degree)
5566 dump_printf_loc (MSG_NOTE, vect_location,
5567 " out weight: %f (degree %d)\n",
5568 vertex.out_weight.to_double (),
5569 vertex.out_degree);
5570 if (SLP_TREE_CODE (vertex.node) == VEC_PERM_EXPR)
5571 dump_printf_loc (MSG_NOTE, vect_location,
5572 " op: VEC_PERM_EXPR\n");
5573 else if (auto rep = SLP_TREE_REPRESENTATIVE (vertex.node))
5574 dump_printf_loc (MSG_NOTE, vect_location,
5575 " op template: %G", rep->stmt);
5577 dump_printf_loc (MSG_NOTE, vect_location, " edges:\n");
5578 for (unsigned int order_i = partition.node_begin;
5579 order_i < partition.node_end; ++order_i)
5581 unsigned int node_i = m_partitioned_nodes[order_i];
5582 auto &vertex = m_vertices[node_i];
5583 auto print_edge = [&](graph_edge *, unsigned int other_node_i)
5585 auto &other_vertex = m_vertices[other_node_i];
5586 if (other_vertex.partition < vertex.partition)
5587 dump_printf_loc (MSG_NOTE, vect_location,
5588 " - %p [%d] --> %p\n",
5589 (void *) other_vertex.node,
5590 other_vertex.partition,
5591 (void *) vertex.node);
5592 else
5593 dump_printf_loc (MSG_NOTE, vect_location,
5594 " - %p --> [%d] %p\n",
5595 (void *) vertex.node,
5596 other_vertex.partition,
5597 (void *) other_vertex.node);
5599 for_each_partition_edge (node_i, print_edge);
5602 for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
5604 auto &layout_costs = partition_layout_costs (partition_i, layout_i);
5605 if (layout_costs.is_possible ())
5607 dump_printf_loc (MSG_NOTE, vect_location,
5608 " layout %d:%s\n", layout_i,
5609 partition.layout == int (layout_i)
5610 ? " (*)" : "");
5611 slpg_layout_cost combined_cost = layout_costs.in_cost;
5612 combined_cost.add_serial_cost (layout_costs.internal_cost);
5613 combined_cost.add_serial_cost (layout_costs.out_cost);
5614 #define TEMPLATE "{depth: %f, total: %f}"
5615 dump_printf_loc (MSG_NOTE, vect_location,
5616 " " TEMPLATE "\n",
5617 layout_costs.in_cost.depth.to_double (),
5618 layout_costs.in_cost.total.to_double ());
5619 dump_printf_loc (MSG_NOTE, vect_location,
5620 " + " TEMPLATE "\n",
5621 layout_costs.internal_cost.depth.to_double (),
5622 layout_costs.internal_cost.total.to_double ());
5623 dump_printf_loc (MSG_NOTE, vect_location,
5624 " + " TEMPLATE "\n",
5625 layout_costs.out_cost.depth.to_double (),
5626 layout_costs.out_cost.total.to_double ());
5627 dump_printf_loc (MSG_NOTE, vect_location,
5628 " = " TEMPLATE "\n",
5629 combined_cost.depth.to_double (),
5630 combined_cost.total.to_double ());
5631 #undef TEMPLATE
5633 else
5634 dump_printf_loc (MSG_NOTE, vect_location,
5635 " layout %d: rejected\n", layout_i);
5640 /* Main entry point for the SLP graph optimization pass. */
5642 void
5643 vect_optimize_slp_pass::run ()
5645 build_graph ();
5646 create_partitions ();
5647 start_choosing_layouts ();
5648 if (m_perms.length () > 1)
5650 forward_pass ();
5651 backward_pass ();
5652 if (dump_enabled_p ())
5653 dump ();
5654 materialize ();
5655 while (!m_perms.is_empty ())
5656 m_perms.pop ().release ();
5658 else
5659 remove_redundant_permutations ();
5660 free_graph (m_slpg);
5663 /* Optimize the SLP graph of VINFO. */
5665 void
5666 vect_optimize_slp (vec_info *vinfo)
5668 if (vinfo->slp_instances.is_empty ())
5669 return;
5670 vect_optimize_slp_pass (vinfo).run ();
5673 /* Gather loads reachable from the individual SLP graph entries. */
5675 void
5676 vect_gather_slp_loads (vec_info *vinfo)
5678 unsigned i;
5679 slp_instance instance;
5680 FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
5682 hash_set<slp_tree> visited;
5683 vect_gather_slp_loads (SLP_INSTANCE_LOADS (instance),
5684 SLP_INSTANCE_TREE (instance), visited);
 5689 /* For each possible SLP instance decide whether to SLP it and calculate the
 5690 overall unrolling factor needed to SLP the loop.  Return TRUE if decided to SLP at
5691 least one instance. */
5693 bool
5694 vect_make_slp_decision (loop_vec_info loop_vinfo)
5696 unsigned int i;
5697 poly_uint64 unrolling_factor = 1;
5698 const vec<slp_instance> &slp_instances
5699 = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
5700 slp_instance instance;
5701 int decided_to_slp = 0;
5703 DUMP_VECT_SCOPE ("vect_make_slp_decision");
5705 FOR_EACH_VEC_ELT (slp_instances, i, instance)
5707 /* FORNOW: SLP if you can. */
5708 /* All unroll factors have the form:
5710 GET_MODE_SIZE (vinfo->vector_mode) * X
5712 for some rational X, so they must have a common multiple. */
5713 unrolling_factor
5714 = force_common_multiple (unrolling_factor,
5715 SLP_INSTANCE_UNROLLING_FACTOR (instance));
5717 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
5718 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
5719 loop-based vectorization. Such stmts will be marked as HYBRID. */
5720 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
5721 decided_to_slp++;
5724 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
5726 if (decided_to_slp && dump_enabled_p ())
5728 dump_printf_loc (MSG_NOTE, vect_location,
5729 "Decided to SLP %d instances. Unrolling factor ",
5730 decided_to_slp);
5731 dump_dec (MSG_NOTE, unrolling_factor);
5732 dump_printf (MSG_NOTE, "\n");
5735 return (decided_to_slp > 0);
5738 /* Private data for vect_detect_hybrid_slp. */
5739 struct vdhs_data
5741 loop_vec_info loop_vinfo;
5742 vec<stmt_vec_info> *worklist;
5745 /* Walker for walk_gimple_op. */
5747 static tree
5748 vect_detect_hybrid_slp (tree *tp, int *, void *data)
5750 walk_stmt_info *wi = (walk_stmt_info *)data;
5751 vdhs_data *dat = (vdhs_data *)wi->info;
5753 if (wi->is_lhs)
5754 return NULL_TREE;
5756 stmt_vec_info def_stmt_info = dat->loop_vinfo->lookup_def (*tp);
5757 if (!def_stmt_info)
5758 return NULL_TREE;
5759 def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
5760 if (PURE_SLP_STMT (def_stmt_info))
5762 if (dump_enabled_p ())
5763 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
5764 def_stmt_info->stmt);
5765 STMT_SLP_TYPE (def_stmt_info) = hybrid;
5766 dat->worklist->safe_push (def_stmt_info);
5769 return NULL_TREE;
 5772 /* Check whether STMT_INFO is consumed by SLP indirectly and mark it pure_slp
 5773 if so, otherwise push it to WORKLIST. */
5775 static void
5776 maybe_push_to_hybrid_worklist (vec_info *vinfo,
5777 vec<stmt_vec_info> &worklist,
5778 stmt_vec_info stmt_info)
5780 if (dump_enabled_p ())
5781 dump_printf_loc (MSG_NOTE, vect_location,
5782 "Processing hybrid candidate : %G", stmt_info->stmt);
5783 stmt_vec_info orig_info = vect_orig_stmt (stmt_info);
5784 imm_use_iterator iter2;
5785 ssa_op_iter iter1;
5786 use_operand_p use_p;
5787 def_operand_p def_p;
5788 bool any_def = false;
5789 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_info->stmt, iter1, SSA_OP_DEF)
5791 any_def = true;
5792 FOR_EACH_IMM_USE_FAST (use_p, iter2, DEF_FROM_PTR (def_p))
5794 if (is_gimple_debug (USE_STMT (use_p)))
5795 continue;
5796 stmt_vec_info use_info = vinfo->lookup_stmt (USE_STMT (use_p));
 5797 /* An out-of-loop use means this is a loop_vect sink. */
5798 if (!use_info)
5800 if (dump_enabled_p ())
5801 dump_printf_loc (MSG_NOTE, vect_location,
5802 "Found loop_vect sink: %G", stmt_info->stmt);
5803 worklist.safe_push (stmt_info);
5804 return;
5806 else if (!STMT_SLP_TYPE (vect_stmt_to_vectorize (use_info)))
5808 if (dump_enabled_p ())
5809 dump_printf_loc (MSG_NOTE, vect_location,
5810 "Found loop_vect use: %G", use_info->stmt);
5811 worklist.safe_push (stmt_info);
5812 return;
 5816 /* No def means this is a loop_vect sink. */
5817 if (!any_def)
5819 if (dump_enabled_p ())
5820 dump_printf_loc (MSG_NOTE, vect_location,
5821 "Found loop_vect sink: %G", stmt_info->stmt);
5822 worklist.safe_push (stmt_info);
5823 return;
5825 if (dump_enabled_p ())
5826 dump_printf_loc (MSG_NOTE, vect_location,
5827 "Marked SLP consumed stmt pure: %G", stmt_info->stmt);
5828 STMT_SLP_TYPE (stmt_info) = pure_slp;
5831 /* Find stmts that must be both vectorized and SLPed. */
5833 void
5834 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
5836 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
5838 /* All stmts participating in SLP are marked pure_slp, all other
5839 stmts are loop_vect.
5840 First collect all loop_vect stmts into a worklist.
5841 SLP patterns cause not all original scalar stmts to appear in
5842 SLP_TREE_SCALAR_STMTS and thus not all of them are marked pure_slp.
 5843 Rectify this here by doing a backward walk over the IL, considering
 5844 stmts as loop_vect only when they are used by a loop_vect stmt, and
 5845 otherwise marking them as pure_slp. */
5846 auto_vec<stmt_vec_info> worklist;
5847 for (int i = LOOP_VINFO_LOOP (loop_vinfo)->num_nodes - 1; i >= 0; --i)
5849 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
5850 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
5851 gsi_next (&gsi))
5853 gphi *phi = gsi.phi ();
5854 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (phi);
5855 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
5856 maybe_push_to_hybrid_worklist (loop_vinfo,
5857 worklist, stmt_info);
5859 for (gimple_stmt_iterator gsi = gsi_last_bb (bb); !gsi_end_p (gsi);
5860 gsi_prev (&gsi))
5862 gimple *stmt = gsi_stmt (gsi);
5863 if (is_gimple_debug (stmt))
5864 continue;
5865 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
5866 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
5868 for (gimple_stmt_iterator gsi2
5869 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
5870 !gsi_end_p (gsi2); gsi_next (&gsi2))
5872 stmt_vec_info patt_info
5873 = loop_vinfo->lookup_stmt (gsi_stmt (gsi2));
5874 if (!STMT_SLP_TYPE (patt_info)
5875 && STMT_VINFO_RELEVANT (patt_info))
5876 maybe_push_to_hybrid_worklist (loop_vinfo,
5877 worklist, patt_info);
5879 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
5881 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
5882 maybe_push_to_hybrid_worklist (loop_vinfo,
5883 worklist, stmt_info);
5887 /* Now we have a worklist of non-SLP stmts, follow use->def chains and
5888 mark any SLP vectorized stmt as hybrid.
5889 ??? We're visiting def stmts N times (once for each non-SLP and
5890 once for each hybrid-SLP use). */
5891 walk_stmt_info wi;
5892 vdhs_data dat;
5893 dat.worklist = &worklist;
5894 dat.loop_vinfo = loop_vinfo;
5895 memset (&wi, 0, sizeof (wi));
5896 wi.info = (void *)&dat;
5897 while (!worklist.is_empty ())
5899 stmt_vec_info stmt_info = worklist.pop ();
5900 /* Since SSA operands are not set up for pattern stmts we need
5901 to use walk_gimple_op. */
5902 wi.is_lhs = 0;
5903 walk_gimple_op (stmt_info->stmt, vect_detect_hybrid_slp, &wi);
 5904 /* For gather/scatter make sure to walk the offset operand, which
 5905 can be a scaling and conversion away. */
5906 gather_scatter_info gs_info;
5907 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
5908 && vect_check_gather_scatter (stmt_info, loop_vinfo, &gs_info))
5910 int dummy;
5911 vect_detect_hybrid_slp (&gs_info.offset, &dummy, &wi);
5917 /* Initialize a bb_vec_info struct for the statements in BBS basic blocks. */
5919 _bb_vec_info::_bb_vec_info (vec<basic_block> _bbs, vec_info_shared *shared)
5920 : vec_info (vec_info::bb, shared),
5921 bbs (_bbs),
5922 roots (vNULL)
5924 for (unsigned i = 0; i < bbs.length (); ++i)
5926 if (i != 0)
5927 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
5928 gsi_next (&si))
5930 gphi *phi = si.phi ();
5931 gimple_set_uid (phi, 0);
5932 add_stmt (phi);
5934 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
5935 !gsi_end_p (gsi); gsi_next (&gsi))
5937 gimple *stmt = gsi_stmt (gsi);
5938 gimple_set_uid (stmt, 0);
5939 if (is_gimple_debug (stmt))
5940 continue;
5941 add_stmt (stmt);
5947 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
5948 stmts in the basic block. */
5950 _bb_vec_info::~_bb_vec_info ()
5952 /* Reset region marker. */
5953 for (unsigned i = 0; i < bbs.length (); ++i)
5955 if (i != 0)
5956 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
5957 gsi_next (&si))
5959 gphi *phi = si.phi ();
5960 gimple_set_uid (phi, -1);
5962 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
5963 !gsi_end_p (gsi); gsi_next (&gsi))
5965 gimple *stmt = gsi_stmt (gsi);
5966 gimple_set_uid (stmt, -1);
5970 for (unsigned i = 0; i < roots.length (); ++i)
5972 roots[i].stmts.release ();
5973 roots[i].roots.release ();
5974 roots[i].remain.release ();
5976 roots.release ();
5979 /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
 5980 given that its child nodes have already been processed, and that
5981 their def types currently match their SLP node's def type. */
5983 static bool
5984 vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
5985 slp_instance node_instance,
5986 stmt_vector_for_cost *cost_vec)
5988 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
5990 /* Calculate the number of vector statements to be created for the
5991 scalar stmts in this node. For SLP reductions it is equal to the
5992 number of vector statements in the children (which has already been
5993 calculated by the recursive call). Otherwise it is the number of
5994 scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
5995 VF divided by the number of elements in a vector. */
5996 if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
5997 && !STMT_VINFO_DATA_REF (stmt_info)
5998 && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6000 for (unsigned i = 0; i < SLP_TREE_CHILDREN (node).length (); ++i)
6001 if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node)[i]) == vect_internal_def)
6003 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
6004 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[i]);
6005 break;
6008 else
6010 poly_uint64 vf;
6011 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
6012 vf = loop_vinfo->vectorization_factor;
6013 else
6014 vf = 1;
6015 unsigned int group_size = SLP_TREE_LANES (node);
6016 tree vectype = SLP_TREE_VECTYPE (node);
6017 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
6018 = vect_get_num_vectors (vf * group_size, vectype);
6021 /* Handle purely internal nodes. */
6022 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
6024 if (!vectorizable_slp_permutation (vinfo, NULL, node, cost_vec))
6025 return false;
6027 stmt_vec_info slp_stmt_info;
6028 unsigned int i;
6029 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info)
6031 if (STMT_VINFO_LIVE_P (slp_stmt_info)
6032 && !vectorizable_live_operation (vinfo, slp_stmt_info, node,
6033 node_instance, i,
6034 false, cost_vec))
6035 return false;
6037 return true;
6040 bool dummy;
6041 return vect_analyze_stmt (vinfo, stmt_info, &dummy,
6042 node, node_instance, cost_vec);
6045 /* Try to build NODE from scalars, returning true on success.
6046 NODE_INSTANCE is the SLP instance that contains NODE. */
6048 static bool
6049 vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
6050 slp_instance node_instance)
6052 stmt_vec_info stmt_info;
6053 unsigned int i;
6055 if (!is_a <bb_vec_info> (vinfo)
6056 || node == SLP_INSTANCE_TREE (node_instance)
6057 || !SLP_TREE_SCALAR_STMTS (node).exists ()
6058 || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node))
6059 /* Force the mask use to be built from scalars instead. */
6060 || VECTOR_BOOLEAN_TYPE_P (SLP_TREE_VECTYPE (node)))
6061 return false;
6063 if (dump_enabled_p ())
6064 dump_printf_loc (MSG_NOTE, vect_location,
6065 "Building vector operands of %p from scalars instead\n",
6066 (void *) node);
6068 /* Don't remove and free the child nodes here, since they could be
6069 referenced by other structures. The analysis and scheduling phases
6070 (need to) ignore child nodes of anything that isn't vect_internal_def. */
6071 unsigned int group_size = SLP_TREE_LANES (node);
6072 SLP_TREE_DEF_TYPE (node) = vect_external_def;
6073 /* Invariants get their vector type from the uses. */
6074 SLP_TREE_VECTYPE (node) = NULL_TREE;
6075 SLP_TREE_SCALAR_OPS (node).safe_grow (group_size, true);
6076 SLP_TREE_LOAD_PERMUTATION (node).release ();
6077 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6079 tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
6080 SLP_TREE_SCALAR_OPS (node)[i] = lhs;
6082 return true;
6085 /* Return true if all elements of the slice are the same. */
6086 bool
6087 vect_scalar_ops_slice::all_same_p () const
6089 for (unsigned int i = 1; i < length; ++i)
6090 if (!operand_equal_p (op (0), op (i)))
6091 return false;
6092 return true;
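/* Hash the slice S by combining the hashes of its scalar operands.  */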
6095 hashval_t
6096 vect_scalar_ops_slice_hash::hash (const value_type &s)
6098 hashval_t hash = 0;
6099 for (unsigned i = 0; i < s.length; ++i)
6100 hash = iterative_hash_expr (s.op (i), hash);
6101 return hash;
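/* Return true if slices S1 and S2 have the same length and pairwise
   operand_equal_p elements.  */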
6104 bool
6105 vect_scalar_ops_slice_hash::equal (const value_type &s1,
6106 const compare_type &s2)
6108 if (s1.length != s2.length)
6109 return false;
6110 for (unsigned i = 0; i < s1.length; ++i)
6111 if (!operand_equal_p (s1.op (i), s2.op (i)))
6112 return false;
6113 return true;
6116 /* Compute the prologue cost for invariant or constant operands represented
6117 by NODE. */
6119 static void
6120 vect_prologue_cost_for_slp (slp_tree node,
6121 stmt_vector_for_cost *cost_vec)
 6123 /* There's a special case of an existing vector, which costs nothing. */
6124 if (SLP_TREE_SCALAR_OPS (node).length () == 0
6125 && !SLP_TREE_VEC_DEFS (node).is_empty ())
6126 return;
6127 /* Without looking at the actual initializer a vector of
 6128 constants can be implemented as a load from the constant pool.
6129 When all elements are the same we can use a splat. */
6130 tree vectype = SLP_TREE_VECTYPE (node);
6131 unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
6132 unsigned HOST_WIDE_INT const_nunits;
6133 unsigned nelt_limit;
6134 auto ops = &SLP_TREE_SCALAR_OPS (node);
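/* STARTS records the first scalar-op index of each distinct vector that
   needs to be built, so that duplicate vectors are costed only once.  */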
6135 auto_vec<unsigned int> starts (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
6136 if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
6137 && ! multiple_p (const_nunits, group_size))
6139 nelt_limit = const_nunits;
6140 hash_set<vect_scalar_ops_slice_hash> vector_ops;
6141 for (unsigned int i = 0; i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); ++i)
6142 if (!vector_ops.add ({ ops, i * const_nunits, const_nunits }))
6143 starts.quick_push (i * const_nunits);
6145 else
6147 /* If either the vector has variable length or the vectors
6148 are composed of repeated whole groups we only need to
6149 cost construction once. All vectors will be the same. */
6150 nelt_limit = group_size;
6151 starts.quick_push (0);
6153 /* ??? We're just tracking whether vectors in a single node are the same.
6154 Ideally we'd do something more global. */
6155 bool passed = false;
6156 for (unsigned int start : starts)
6158 vect_cost_for_stmt kind;
6159 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def)
6160 kind = vector_load;
6161 else if (vect_scalar_ops_slice { ops, start, nelt_limit }.all_same_p ())
6162 kind = scalar_to_vec;
6163 else
6164 kind = vec_construct;
6165 /* The target cost hook has no idea which part of the SLP node
6166 we are costing so avoid passing it down more than once. Pass
6167 it to the first vec_construct or scalar_to_vec part since for those
6168 the x86 backend tries to account for GPR to XMM register moves. */
6169 record_stmt_cost (cost_vec, 1, kind,
6170 (kind != vector_load && !passed) ? node : nullptr,
6171 vectype, 0, vect_prologue);
6172 if (kind != vector_load)
6173 passed = true;
6177 /* Analyze statements contained in SLP tree NODE after recursively analyzing
 6178 the subtree.  NODE_INSTANCE contains NODE and VINFO contains NODE_INSTANCE.
6180 Return true if the operations are supported. */
6182 static bool
6183 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
6184 slp_instance node_instance,
6185 hash_set<slp_tree> &visited_set,
6186 vec<slp_tree> &visited_vec,
6187 stmt_vector_for_cost *cost_vec)
6189 int i, j;
6190 slp_tree child;
6192 /* Assume we can code-generate all invariants. */
6193 if (!node
6194 || SLP_TREE_DEF_TYPE (node) == vect_constant_def
6195 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
6196 return true;
6198 if (SLP_TREE_DEF_TYPE (node) == vect_uninitialized_def)
6200 if (dump_enabled_p ())
6201 dump_printf_loc (MSG_NOTE, vect_location,
6202 "Failed cyclic SLP reference in %p\n", (void *) node);
6203 return false;
6205 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_internal_def);
6207 /* If we already analyzed the exact same set of scalar stmts we're done.
6208 We share the generated vector stmts for those. */
6209 if (visited_set.add (node))
6210 return true;
6211 visited_vec.safe_push (node);
6213 bool res = true;
6214 unsigned visited_rec_start = visited_vec.length ();
6215 unsigned cost_vec_rec_start = cost_vec->length ();
6216 bool seen_non_constant_child = false;
6217 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6219 res = vect_slp_analyze_node_operations (vinfo, child, node_instance,
6220 visited_set, visited_vec,
6221 cost_vec);
6222 if (!res)
6223 break;
6224 if (child && SLP_TREE_DEF_TYPE (child) != vect_constant_def)
6225 seen_non_constant_child = true;
6227 /* We're having difficulties scheduling nodes with just constant
6228 operands and no scalar stmts since we then cannot compute a stmt
6229 insertion place. */
6230 if (!seen_non_constant_child && SLP_TREE_SCALAR_STMTS (node).is_empty ())
6232 if (dump_enabled_p ())
6233 dump_printf_loc (MSG_NOTE, vect_location,
6234 "Cannot vectorize all-constant op node %p\n",
6235 (void *) node);
6236 res = false;
6239 if (res)
6240 res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
6241 cost_vec);
6242 /* If analysis failed we have to pop all recursive visited nodes
6243 plus ourselves. */
6244 if (!res)
6246 while (visited_vec.length () >= visited_rec_start)
6247 visited_set.remove (visited_vec.pop ());
6248 cost_vec->truncate (cost_vec_rec_start);
 6251 /* When the node can be vectorized, cost the invariant nodes it references.
 6252 This is not done in DFS order to allow the referring node's
 6253 vectorizable_* calls to nail down the invariant node's vector type
6254 and possibly unshare it if it needs a different vector type than
6255 other referrers. */
6256 if (res)
6257 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
6258 if (child
6259 && (SLP_TREE_DEF_TYPE (child) == vect_constant_def
6260 || SLP_TREE_DEF_TYPE (child) == vect_external_def)
6261 /* Perform usual caching, note code-generation still
6262 code-gens these nodes multiple times but we expect
6263 to CSE them later. */
6264 && !visited_set.add (child))
6266 visited_vec.safe_push (child);
6267 /* ??? After auditing more code paths make a "default"
6268 and push the vector type from NODE to all children
6269 if it is not already set. */
6270 /* Compute the number of vectors to be generated. */
6271 tree vector_type = SLP_TREE_VECTYPE (child);
6272 if (!vector_type)
6274 /* For shifts with a scalar argument we don't need
6275 to cost or code-generate anything.
 6276 ??? Represent this more explicitly. */
6277 gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
6278 == shift_vec_info_type)
6279 && j == 1);
6280 continue;
6282 unsigned group_size = SLP_TREE_LANES (child);
6283 poly_uint64 vf = 1;
6284 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
6285 vf = loop_vinfo->vectorization_factor;
6286 SLP_TREE_NUMBER_OF_VEC_STMTS (child)
6287 = vect_get_num_vectors (vf * group_size, vector_type);
6288 /* And cost them. */
6289 vect_prologue_cost_for_slp (child, cost_vec);
6292 /* If this node or any of its children can't be vectorized, try pruning
6293 the tree here rather than felling the whole thing. */
6294 if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
6296 /* We'll need to revisit this for invariant costing and number
6297 of vectorized stmt setting. */
6298 res = true;
6301 return res;
6304 /* Mark lanes of NODE that are live outside of the basic-block vectorized
6305 region and that can be vectorized using vectorizable_live_operation
 6306 with STMT_VINFO_LIVE_P.  Live operations that are not handled will cause
 6307 the scalar code computing them to be retained. */
6309 static void
6310 vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
6311 slp_instance instance,
6312 stmt_vector_for_cost *cost_vec,
6313 hash_set<stmt_vec_info> &svisited,
6314 hash_set<slp_tree> &visited)
6316 if (visited.add (node))
6317 return;
6319 unsigned i;
6320 stmt_vec_info stmt_info;
6321 stmt_vec_info last_stmt = vect_find_last_scalar_stmt_in_slp (node);
6322 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6324 if (svisited.contains (stmt_info))
6325 continue;
6326 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
6327 if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)
6328 && STMT_VINFO_RELATED_STMT (orig_stmt_info) != stmt_info)
6329 /* Only the pattern root stmt computes the original scalar value. */
6330 continue;
6331 bool mark_visited = true;
6332 gimple *orig_stmt = orig_stmt_info->stmt;
6333 ssa_op_iter op_iter;
6334 def_operand_p def_p;
6335 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
6337 imm_use_iterator use_iter;
6338 gimple *use_stmt;
6339 stmt_vec_info use_stmt_info;
6340 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
6341 if (!is_gimple_debug (use_stmt))
6343 use_stmt_info = bb_vinfo->lookup_stmt (use_stmt);
6344 if (!use_stmt_info
6345 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
6347 STMT_VINFO_LIVE_P (stmt_info) = true;
6348 if (vectorizable_live_operation (bb_vinfo, stmt_info,
6349 node, instance, i,
6350 false, cost_vec))
6351 /* ??? So we know we can vectorize the live stmt
6352 from one SLP node. If we cannot do so from all
6353 or none consistently we'd have to record which
6354 SLP node (and lane) we want to use for the live
6355 operation. So make sure we can code-generate
6356 from all nodes. */
6357 mark_visited = false;
6358 else
6359 STMT_VINFO_LIVE_P (stmt_info) = false;
6360 break;
6363 /* We have to verify whether we can insert the lane extract
6364 before all uses. The following is a conservative approximation.
6365 We cannot put this into vectorizable_live_operation because
6366 iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
6367 doesn't work.
6368 Note that while the fact that we emit code for loads at the
 6369 first load should make this a non-problem, leaves we construct
6370 from scalars are vectorized after the last scalar def.
6371 ??? If we'd actually compute the insert location during
 6372 analysis we could use something less conservative than the last
6373 scalar stmt in the node for the dominance check. */
6374 /* ??? What remains is "live" uses in vector CTORs in the same
6375 SLP graph which is where those uses can end up code-generated
6376 right after their definition instead of close to their original
6377 use. But that would restrict us to code-generate lane-extracts
6378 from the latest stmt in a node. So we compensate for this
6379 during code-generation, simply not replacing uses for those
6380 hopefully rare cases. */
6381 if (STMT_VINFO_LIVE_P (stmt_info))
6382 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
6383 if (!is_gimple_debug (use_stmt)
6384 && (!(use_stmt_info = bb_vinfo->lookup_stmt (use_stmt))
6385 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
6386 && !vect_stmt_dominates_stmt_p (last_stmt->stmt, use_stmt))
6388 if (dump_enabled_p ())
6389 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6390 "Cannot determine insertion place for "
6391 "lane extract\n");
6392 STMT_VINFO_LIVE_P (stmt_info) = false;
6393 mark_visited = true;
6396 if (mark_visited)
6397 svisited.add (stmt_info);
6400 slp_tree child;
6401 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6402 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
6403 vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance,
6404 cost_vec, svisited, visited);
6407 /* Determine whether we can vectorize the reduction epilogue for INSTANCE. */
6409 static bool
6410 vectorizable_bb_reduc_epilogue (slp_instance instance,
6411 stmt_vector_for_cost *cost_vec)
6413 gassign *stmt = as_a <gassign *> (instance->root_stmts[0]->stmt);
6414 enum tree_code reduc_code = gimple_assign_rhs_code (stmt);
6415 if (reduc_code == MINUS_EXPR)
6416 reduc_code = PLUS_EXPR;
6417 internal_fn reduc_fn;
6418 tree vectype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (instance));
6419 if (!vectype
6420 || !reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
6421 || reduc_fn == IFN_LAST
6422 || !direct_internal_fn_supported_p (reduc_fn, vectype, OPTIMIZE_FOR_BOTH)
6423 || !useless_type_conversion_p (TREE_TYPE (gimple_assign_lhs (stmt)),
6424 TREE_TYPE (vectype)))
6426 if (dump_enabled_p ())
6427 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6428 "not vectorized: basic block reduction epilogue "
6429 "operation unsupported.\n");
6430 return false;
6433 /* There's no way to cost a horizontal vector reduction via REDUC_FN so
6434 cost log2 vector operations plus shuffles and one extraction. */
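/* For example, an 8-lane vector gives floor_log2 (8) == 3, i.e. three
   vector stmts, three permutes and a single vec_to_scalar extraction.  */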
6435 unsigned steps = floor_log2 (vect_nunits_for_cost (vectype));
6436 record_stmt_cost (cost_vec, steps, vector_stmt, instance->root_stmts[0],
6437 vectype, 0, vect_body);
6438 record_stmt_cost (cost_vec, steps, vec_perm, instance->root_stmts[0],
6439 vectype, 0, vect_body);
6440 record_stmt_cost (cost_vec, 1, vec_to_scalar, instance->root_stmts[0],
6441 vectype, 0, vect_body);
6443 /* Since we replace all stmts of a possibly longer scalar reduction
 6444 chain, account for the extra scalar stmts for that. */
6445 record_stmt_cost (cost_vec, instance->remain_defs.length (), scalar_stmt,
6446 instance->root_stmts[0], 0, vect_body);
6447 return true;
6450 /* Prune from ROOTS all stmts that are computed as part of lanes of NODE
6451 and recurse to children. */
6453 static void
6454 vect_slp_prune_covered_roots (slp_tree node, hash_set<stmt_vec_info> &roots,
6455 hash_set<slp_tree> &visited)
6457 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def
6458 || visited.add (node))
6459 return;
6461 stmt_vec_info stmt;
6462 unsigned i;
6463 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
6464 roots.remove (vect_orig_stmt (stmt));
6466 slp_tree child;
6467 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6468 if (child)
6469 vect_slp_prune_covered_roots (child, roots, visited);
6472 /* Analyze statements in SLP instances of VINFO. Return true if the
6473 operations are supported. */
6475 bool
6476 vect_slp_analyze_operations (vec_info *vinfo)
6478 slp_instance instance;
6479 int i;
6481 DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
6483 hash_set<slp_tree> visited;
6484 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
6486 auto_vec<slp_tree> visited_vec;
6487 stmt_vector_for_cost cost_vec;
6488 cost_vec.create (2);
6489 if (is_a <bb_vec_info> (vinfo))
6490 vect_location = instance->location ();
6491 if (!vect_slp_analyze_node_operations (vinfo,
6492 SLP_INSTANCE_TREE (instance),
6493 instance, visited, visited_vec,
6494 &cost_vec)
6495 /* CTOR instances require vectorized defs for the SLP tree root. */
6496 || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor
6497 && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
6498 != vect_internal_def
6499 /* Make sure we vectorized with the expected type. */
6500 || !useless_type_conversion_p
6501 (TREE_TYPE (TREE_TYPE (gimple_assign_rhs1
6502 (instance->root_stmts[0]->stmt))),
6503 TREE_TYPE (SLP_TREE_VECTYPE
6504 (SLP_INSTANCE_TREE (instance))))))
6505 /* Check we can vectorize the reduction. */
6506 || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_bb_reduc
6507 && !vectorizable_bb_reduc_epilogue (instance, &cost_vec)))
6509 slp_tree node = SLP_INSTANCE_TREE (instance);
6510 stmt_vec_info stmt_info;
6511 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
6512 stmt_info = SLP_INSTANCE_ROOT_STMTS (instance)[0];
6513 else
6514 stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
6515 if (dump_enabled_p ())
6516 dump_printf_loc (MSG_NOTE, vect_location,
6517 "removing SLP instance operations starting from: %G",
6518 stmt_info->stmt);
6519 vect_free_slp_instance (instance);
6520 vinfo->slp_instances.ordered_remove (i);
6521 cost_vec.release ();
6522 while (!visited_vec.is_empty ())
6523 visited.remove (visited_vec.pop ());
6525 else
6527 i++;
6528 if (loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo))
6530 add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
6531 cost_vec.release ();
6533 else
6534 /* For BB vectorization remember the SLP graph entry
6535 cost for later. */
6536 instance->cost_vec = cost_vec;
6540 /* Now look for SLP instances with a root that are covered by other
6541 instances and remove them. */
6542 hash_set<stmt_vec_info> roots;
6543 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
6544 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
6545 roots.add (SLP_INSTANCE_ROOT_STMTS (instance)[0]);
6546 if (!roots.is_empty ())
6548 visited.empty ();
6549 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
6550 vect_slp_prune_covered_roots (SLP_INSTANCE_TREE (instance), roots,
6551 visited);
6552 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
6553 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ()
6554 && !roots.contains (SLP_INSTANCE_ROOT_STMTS (instance)[0]))
6556 stmt_vec_info root = SLP_INSTANCE_ROOT_STMTS (instance)[0];
6557 if (dump_enabled_p ())
6558 dump_printf_loc (MSG_NOTE, vect_location,
6559 "removing SLP instance operations starting "
6560 "from: %G", root->stmt);
6561 vect_free_slp_instance (instance);
6562 vinfo->slp_instances.ordered_remove (i);
6564 else
6565 ++i;
6568 /* Compute vectorizable live stmts. */
6569 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
6571 hash_set<stmt_vec_info> svisited;
6572 hash_set<slp_tree> visited;
6573 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
6575 vect_location = instance->location ();
6576 vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
6577 instance, &instance->cost_vec, svisited,
6578 visited);
6582 return !vinfo->slp_instances.is_empty ();
6585 /* Get the SLP instance leader from INSTANCE_LEADER thereby transitively
 6586 collapsing any chain of intermediate leaders. */
6588 static slp_instance
6589 get_ultimate_leader (slp_instance instance,
6590 hash_map<slp_instance, slp_instance> &instance_leader)
6592 auto_vec<slp_instance *, 8> chain;
6593 slp_instance *tem;
6594 while (*(tem = instance_leader.get (instance)) != instance)
6596 chain.safe_push (tem);
6597 instance = *tem;
6599 while (!chain.is_empty ())
6600 *chain.pop () = instance;
6601 return instance;
6604 namespace {
6605 /* Subroutine of vect_bb_partition_graph_r. Map KEY to INSTANCE in
6606 KEY_TO_INSTANCE, making INSTANCE the leader of any previous mapping
6607 for KEY. Return true if KEY was already in KEY_TO_INSTANCE.
6609 INSTANCE_LEADER is as for get_ultimate_leader. */
6611 template<typename T>
6612 bool
6613 vect_map_to_instance (slp_instance instance, T key,
6614 hash_map<T, slp_instance> &key_to_instance,
6615 hash_map<slp_instance, slp_instance> &instance_leader)
6617 bool existed_p;
6618 slp_instance &key_instance = key_to_instance.get_or_insert (key, &existed_p);
6619 if (!existed_p)
6621 else if (key_instance != instance)
6623 /* If we're running into a previously marked key make us the
6624 leader of the current ultimate leader. This keeps the
6625 leader chain acyclic and works even when the current instance
6626 connects two previously independent graph parts. */
6627 slp_instance key_leader
6628 = get_ultimate_leader (key_instance, instance_leader);
6629 if (key_leader != instance)
6630 instance_leader.put (key_leader, instance);
6632 key_instance = instance;
6633 return existed_p;
6637 /* Worker of vect_bb_partition_graph, recurse on NODE. */
6639 static void
6640 vect_bb_partition_graph_r (bb_vec_info bb_vinfo,
6641 slp_instance instance, slp_tree node,
6642 hash_map<stmt_vec_info, slp_instance> &stmt_to_instance,
6643 hash_map<slp_tree, slp_instance> &node_to_instance,
6644 hash_map<slp_instance, slp_instance> &instance_leader)
6646 stmt_vec_info stmt_info;
6647 unsigned i;
6649 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6650 vect_map_to_instance (instance, stmt_info, stmt_to_instance,
6651 instance_leader);
6653 if (vect_map_to_instance (instance, node, node_to_instance,
6654 instance_leader))
6655 return;
6657 slp_tree child;
6658 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6659 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
6660 vect_bb_partition_graph_r (bb_vinfo, instance, child, stmt_to_instance,
6661 node_to_instance, instance_leader);
6664 /* Partition the SLP graph into pieces that can be costed independently. */
6666 static void
6667 vect_bb_partition_graph (bb_vec_info bb_vinfo)
6669 DUMP_VECT_SCOPE ("vect_bb_partition_graph");
6671 /* First walk the SLP graph assigning each involved scalar stmt a
6672 corresponding SLP graph entry and upon visiting a previously
 6673 marked stmt, make the stmt's leader the current SLP graph entry. */
6674 hash_map<stmt_vec_info, slp_instance> stmt_to_instance;
6675 hash_map<slp_tree, slp_instance> node_to_instance;
6676 hash_map<slp_instance, slp_instance> instance_leader;
6677 slp_instance instance;
6678 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
6680 instance_leader.put (instance, instance);
6681 vect_bb_partition_graph_r (bb_vinfo,
6682 instance, SLP_INSTANCE_TREE (instance),
6683 stmt_to_instance, node_to_instance,
6684 instance_leader);
6687 /* Then collect entries to each independent subgraph. */
6688 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
6690 slp_instance leader = get_ultimate_leader (instance, instance_leader);
6691 leader->subgraph_entries.safe_push (instance);
6692 if (dump_enabled_p ()
6693 && leader != instance)
6694 dump_printf_loc (MSG_NOTE, vect_location,
6695 "instance %p is leader of %p\n",
6696 (void *) leader, (void *) instance);
6700 /* Compute the set of scalar stmts participating in internal and external
6701 nodes. */
6703 static void
6704 vect_slp_gather_vectorized_scalar_stmts (vec_info *vinfo, slp_tree node,
6705 hash_set<slp_tree> &visited,
6706 hash_set<stmt_vec_info> &vstmts,
6707 hash_set<stmt_vec_info> &estmts)
6709 int i;
6710 stmt_vec_info stmt_info;
6711 slp_tree child;
6713 if (visited.add (node))
6714 return;
6716 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
6718 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6719 vstmts.add (stmt_info);
6721 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6722 if (child)
6723 vect_slp_gather_vectorized_scalar_stmts (vinfo, child, visited,
6724 vstmts, estmts);
6726 else
6727 for (tree def : SLP_TREE_SCALAR_OPS (node))
6729 stmt_vec_info def_stmt = vinfo->lookup_def (def);
6730 if (def_stmt)
6731 estmts.add (def_stmt);
6736 /* Compute the scalar cost of the SLP node NODE and its children
 6737 and record it in COST_VEC.  Do not account defs that are marked in
 6738 LIFE and update LIFE according to uses of NODE. */
6740 static void
6741 vect_bb_slp_scalar_cost (vec_info *vinfo,
6742 slp_tree node, vec<bool, va_heap> *life,
6743 stmt_vector_for_cost *cost_vec,
6744 hash_set<stmt_vec_info> &vectorized_scalar_stmts,
6745 hash_set<slp_tree> &visited)
6747 unsigned i;
6748 stmt_vec_info stmt_info;
6749 slp_tree child;
6751 if (visited.add (node))
6752 return;
6754 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6756 ssa_op_iter op_iter;
6757 def_operand_p def_p;
6759 if ((*life)[i])
6760 continue;
6762 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
6763 gimple *orig_stmt = orig_stmt_info->stmt;
6765 /* If there is a non-vectorized use of the defs then the scalar
6766 stmt is kept live in which case we do not account it or any
6767 required defs in the SLP children in the scalar cost. This
6768 way we make the vectorization more costly when compared to
6769 the scalar cost. */
6770 if (!STMT_VINFO_LIVE_P (stmt_info))
6772 auto_vec<gimple *, 8> worklist;
6773 hash_set<gimple *> *worklist_visited = NULL;
6774 worklist.quick_push (orig_stmt);
6777 gimple *work_stmt = worklist.pop ();
6778 FOR_EACH_PHI_OR_STMT_DEF (def_p, work_stmt, op_iter, SSA_OP_DEF)
6780 imm_use_iterator use_iter;
6781 gimple *use_stmt;
6782 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter,
6783 DEF_FROM_PTR (def_p))
6784 if (!is_gimple_debug (use_stmt))
6786 stmt_vec_info use_stmt_info
6787 = vinfo->lookup_stmt (use_stmt);
6788 if (!use_stmt_info
6789 || !vectorized_scalar_stmts.contains (use_stmt_info))
6791 if (use_stmt_info
6792 && STMT_VINFO_IN_PATTERN_P (use_stmt_info))
6794 /* For stmts participating in patterns we have
 6795 to check their uses recursively. */
6796 if (!worklist_visited)
6797 worklist_visited = new hash_set<gimple *> ();
6798 if (!worklist_visited->add (use_stmt))
6799 worklist.safe_push (use_stmt);
6800 continue;
6802 (*life)[i] = true;
6803 goto next_lane;
6808 while (!worklist.is_empty ());
6809 next_lane:
6810 if (worklist_visited)
6811 delete worklist_visited;
6812 if ((*life)[i])
6813 continue;
6816 /* Count scalar stmts only once. */
6817 if (gimple_visited_p (orig_stmt))
6818 continue;
6819 gimple_set_visited (orig_stmt, true);
6821 vect_cost_for_stmt kind;
6822 if (STMT_VINFO_DATA_REF (orig_stmt_info))
6824 if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info)))
6825 kind = scalar_load;
6826 else
6827 kind = scalar_store;
6829 else if (vect_nop_conversion_p (orig_stmt_info))
6830 continue;
6831 /* For single-argument PHIs assume coalescing which means zero cost
6832 for the scalar and the vector PHIs. This avoids artificially
6833 favoring the vector path (but may pessimize it in some cases). */
6834 else if (is_a <gphi *> (orig_stmt_info->stmt)
6835 && gimple_phi_num_args
6836 (as_a <gphi *> (orig_stmt_info->stmt)) == 1)
6837 continue;
6838 else
6839 kind = scalar_stmt;
6840 record_stmt_cost (cost_vec, 1, kind, orig_stmt_info,
6841 SLP_TREE_VECTYPE (node), 0, vect_body);
6844 auto_vec<bool, 20> subtree_life;
6845 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6847 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
6849 /* Do not directly pass LIFE to the recursive call, copy it to
6850 confine changes in the callee to the current child/subtree. */
6851 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
6853 subtree_life.safe_grow_cleared (SLP_TREE_LANES (child), true);
6854 for (unsigned j = 0;
6855 j < SLP_TREE_LANE_PERMUTATION (node).length (); ++j)
6857 auto perm = SLP_TREE_LANE_PERMUTATION (node)[j];
6858 if (perm.first == i)
6859 subtree_life[perm.second] = (*life)[j];
6862 else
6864 gcc_assert (SLP_TREE_LANES (node) == SLP_TREE_LANES (child));
6865 subtree_life.safe_splice (*life);
6867 vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
6868 vectorized_scalar_stmts, visited);
6869 subtree_life.truncate (0);
6874 /* Comparator for the loop-index sorted cost vectors. */
6876 static int
6877 li_cost_vec_cmp (const void *a_, const void *b_)
6879 auto *a = (const std::pair<unsigned, stmt_info_for_cost *> *)a_;
6880 auto *b = (const std::pair<unsigned, stmt_info_for_cost *> *)b_;
6881 if (a->first < b->first)
6882 return -1;
6883 else if (a->first == b->first)
6884 return 0;
6885 return 1;
6888 /* Check if vectorization of the basic block is profitable for the
6889 subgraph denoted by SLP_INSTANCES. */
6891 static bool
6892 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
6893 vec<slp_instance> slp_instances,
6894 loop_p orig_loop)
6896 slp_instance instance;
6897 int i;
6898 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
6899 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
6901 if (dump_enabled_p ())
6903 dump_printf_loc (MSG_NOTE, vect_location, "Costing subgraph: \n");
6904 hash_set<slp_tree> visited;
6905 FOR_EACH_VEC_ELT (slp_instances, i, instance)
6906 vect_print_slp_graph (MSG_NOTE, vect_location,
6907 SLP_INSTANCE_TREE (instance), visited);
6910 /* Compute the set of scalar stmts we know will go away 'locally' when
6911 vectorizing. This used to be tracked with just PURE_SLP_STMT but that's
6912 not accurate for nodes promoted extern late or for scalar stmts that
6913 are used both in extern defs and in vectorized defs. */
6914 hash_set<stmt_vec_info> vectorized_scalar_stmts;
6915 hash_set<stmt_vec_info> scalar_stmts_in_externs;
6916 hash_set<slp_tree> visited;
6917 FOR_EACH_VEC_ELT (slp_instances, i, instance)
6919 vect_slp_gather_vectorized_scalar_stmts (bb_vinfo,
6920 SLP_INSTANCE_TREE (instance),
6921 visited,
6922 vectorized_scalar_stmts,
6923 scalar_stmts_in_externs);
6924 for (stmt_vec_info rstmt : SLP_INSTANCE_ROOT_STMTS (instance))
6925 vectorized_scalar_stmts.add (rstmt);
 6927 /* Scalar stmts used as defs in external nodes need to be preserved, so
6928 remove them from vectorized_scalar_stmts. */
6929 for (stmt_vec_info stmt : scalar_stmts_in_externs)
6930 vectorized_scalar_stmts.remove (stmt);
6932 /* Calculate scalar cost and sum the cost for the vector stmts
6933 previously collected. */
6934 stmt_vector_for_cost scalar_costs = vNULL;
6935 stmt_vector_for_cost vector_costs = vNULL;
6936 visited.empty ();
6937 FOR_EACH_VEC_ELT (slp_instances, i, instance)
6939 auto_vec<bool, 20> life;
6940 life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)),
6941 true);
6942 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
6943 record_stmt_cost (&scalar_costs,
6944 SLP_INSTANCE_ROOT_STMTS (instance).length (),
6945 scalar_stmt,
6946 SLP_INSTANCE_ROOT_STMTS (instance)[0], 0, vect_body);
6947 vect_bb_slp_scalar_cost (bb_vinfo,
6948 SLP_INSTANCE_TREE (instance),
6949 &life, &scalar_costs, vectorized_scalar_stmts,
6950 visited);
6951 vector_costs.safe_splice (instance->cost_vec);
6952 instance->cost_vec.release ();
6955 if (dump_enabled_p ())
6956 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
6958 /* When costing non-loop vectorization we need to consider each covered
6959 loop independently and make sure vectorization is profitable. For
 6960 now we assume a loop may not be entered or may be executed an arbitrary
 6961 number of iterations (??? static information can provide more
 6962 precise info here), which means we can simply cost each containing
 6963 loop's stmts separately. */
6965 /* First produce cost vectors sorted by loop index. */
6966 auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
6967 li_scalar_costs (scalar_costs.length ());
6968 auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
6969 li_vector_costs (vector_costs.length ());
6970 stmt_info_for_cost *cost;
6971 FOR_EACH_VEC_ELT (scalar_costs, i, cost)
6973 unsigned l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
6974 li_scalar_costs.quick_push (std::make_pair (l, cost));
6976 /* Use an arbitrary used loop as a fallback in case the first vector_costs
6977 entry does not have a stmt_info associated with it. */
6978 unsigned l = li_scalar_costs[0].first;
6979 FOR_EACH_VEC_ELT (vector_costs, i, cost)
6981 /* We inherit the loop from the previous COST; invariants, externals and
6982 extracts immediately follow the cost for the related stmt. */
6983 if (cost->stmt_info)
6984 l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
6985 li_vector_costs.quick_push (std::make_pair (l, cost));
6987 li_scalar_costs.qsort (li_cost_vec_cmp);
6988 li_vector_costs.qsort (li_cost_vec_cmp);
6990 /* Now cost the portions individually. */
6991 unsigned vi = 0;
6992 unsigned si = 0;
6993 bool profitable = true;
6994 while (si < li_scalar_costs.length ()
6995 && vi < li_vector_costs.length ())
6997 unsigned sl = li_scalar_costs[si].first;
6998 unsigned vl = li_vector_costs[vi].first;
6999 if (sl != vl)
7001 if (dump_enabled_p ())
7002 dump_printf_loc (MSG_NOTE, vect_location,
7003 "Scalar %d and vector %d loop part do not "
7004 "match up, skipping scalar part\n", sl, vl);
7005 /* Skip the scalar part, assuming zero cost on the vector side. */
7008 si++;
7010 while (si < li_scalar_costs.length ()
7011 && li_scalar_costs[si].first == sl);
7012 continue;
7015 class vector_costs *scalar_target_cost_data = init_cost (bb_vinfo, true);
7018 add_stmt_cost (scalar_target_cost_data, li_scalar_costs[si].second);
7019 si++;
7021 while (si < li_scalar_costs.length ()
7022 && li_scalar_costs[si].first == sl);
7023 unsigned dummy;
7024 finish_cost (scalar_target_cost_data, nullptr,
7025 &dummy, &scalar_cost, &dummy);
7027 /* Complete the target-specific vector cost calculation. */
7028 class vector_costs *vect_target_cost_data = init_cost (bb_vinfo, false);
7031 add_stmt_cost (vect_target_cost_data, li_vector_costs[vi].second);
7032 vi++;
7034 while (vi < li_vector_costs.length ()
7035 && li_vector_costs[vi].first == vl);
7036 finish_cost (vect_target_cost_data, scalar_target_cost_data,
7037 &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost);
7038 delete scalar_target_cost_data;
7039 delete vect_target_cost_data;
7041 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
7043 if (dump_enabled_p ())
7045 dump_printf_loc (MSG_NOTE, vect_location,
7046 "Cost model analysis for part in loop %d:\n", sl);
7047 dump_printf (MSG_NOTE, " Vector cost: %d\n",
7048 vec_inside_cost + vec_outside_cost);
7049 dump_printf (MSG_NOTE, " Scalar cost: %d\n", scalar_cost);
7052 /* Vectorization is profitable if its cost is not more than the cost of the
7053 scalar version. Note that we err on the vector side for equal cost because
7054 the cost estimate is otherwise quite pessimistic (constant uses are
7055 free on the scalar side but cost a load on the vector side for
7056 example). */
7057 if (vec_outside_cost + vec_inside_cost > scalar_cost)
7059 profitable = false;
7060 break;
7063 if (profitable && vi < li_vector_costs.length ())
7065 if (dump_enabled_p ())
7066 dump_printf_loc (MSG_NOTE, vect_location,
7067 "Excess vector cost for part in loop %d:\n",
7068 li_vector_costs[vi].first);
7069 profitable = false;
7072 /* Unset visited flag. This is delayed when the subgraph is profitable
7073 and we process the loop for remaining unvectorized if-converted code. */
7074 if (!orig_loop || !profitable)
7075 FOR_EACH_VEC_ELT (scalar_costs, i, cost)
7076 gimple_set_visited (cost->stmt_info->stmt, false);
7078 scalar_costs.release ();
7079 vector_costs.release ();
7081 return profitable;
7084 /* qsort comparator for lane defs. */
7086 static int
7087 vld_cmp (const void *a_, const void *b_)
7089 auto *a = (const std::pair<unsigned, tree> *)a_;
7090 auto *b = (const std::pair<unsigned, tree> *)b_;
7091 return a->first - b->first;
7094 /* Return true if USE_STMT is a vector lane insert into VEC and set
7095 *THIS_LANE to the lane number that is set. */
7097 static bool
7098 vect_slp_is_lane_insert (gimple *use_stmt, tree vec, unsigned *this_lane)
7100 gassign *use_ass = dyn_cast <gassign *> (use_stmt);
7101 if (!use_ass
7102 || gimple_assign_rhs_code (use_ass) != BIT_INSERT_EXPR
7103 || (vec
7104 ? gimple_assign_rhs1 (use_ass) != vec
7105 : ((vec = gimple_assign_rhs1 (use_ass)), false))
7106 || !useless_type_conversion_p (TREE_TYPE (TREE_TYPE (vec)),
7107 TREE_TYPE (gimple_assign_rhs2 (use_ass)))
7108 || !constant_multiple_p
7109 (tree_to_poly_uint64 (gimple_assign_rhs3 (use_ass)),
7110 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (TREE_TYPE (vec)))),
7111 this_lane))
7112 return false;
7113 return true;
7116 /* Find any vectorizable constructors, lane-insert chains and reduction
7117 chains in the region and record them as SLP instance roots in BB_VINFO. */
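/* Editorial example (assumed GIMPLE, not part of the original source):
   a CONSTRUCTOR root matched below looks like
     _5 = {a_1, b_2, c_3, d_4};
   where the LHS type is a four-lane vector and every element is an SSA
   name defined inside the analyzed region.  */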
7119 static void
7120 vect_slp_check_for_roots (bb_vec_info bb_vinfo)
7122 for (unsigned i = 0; i < bb_vinfo->bbs.length (); ++i)
7123 for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[i]);
7124 !gsi_end_p (gsi); gsi_next (&gsi))
7126 gassign *assign = dyn_cast<gassign *> (gsi_stmt (gsi));
7127 if (!assign)
7128 continue;
7130 tree rhs = gimple_assign_rhs1 (assign);
7131 enum tree_code code = gimple_assign_rhs_code (assign);
7132 use_operand_p use_p;
7133 gimple *use_stmt;
7134 if (code == CONSTRUCTOR)
7136 if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
7137 || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
7138 CONSTRUCTOR_NELTS (rhs))
7139 || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
7140 || uniform_vector_p (rhs))
7141 continue;
7143 unsigned j;
7144 tree val;
7145 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
7146 if (TREE_CODE (val) != SSA_NAME
7147 || !bb_vinfo->lookup_def (val))
7148 break;
7149 if (j != CONSTRUCTOR_NELTS (rhs))
7150 continue;
7152 vec<stmt_vec_info> roots = vNULL;
7153 roots.safe_push (bb_vinfo->lookup_stmt (assign));
7154 vec<stmt_vec_info> stmts;
7155 stmts.create (CONSTRUCTOR_NELTS (rhs));
7156 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
7157 stmts.quick_push
7158 (vect_stmt_to_vectorize (bb_vinfo->lookup_def (val)));
7159 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
7160 stmts, roots));
7162 else if (code == BIT_INSERT_EXPR
7163 && VECTOR_TYPE_P (TREE_TYPE (rhs))
7164 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).is_constant ()
7165 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).to_constant () > 1
7166 && integer_zerop (gimple_assign_rhs3 (assign))
7167 && useless_type_conversion_p
7168 (TREE_TYPE (TREE_TYPE (rhs)),
7169 TREE_TYPE (gimple_assign_rhs2 (assign)))
7170 && bb_vinfo->lookup_def (gimple_assign_rhs2 (assign)))
7172 /* We start matching at an insert to lane zero, but since the
7173 inserts need not be ordered we'd have to search both
7174 the def and the use chains. */
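/* Editorial illustration (assumed GIMPLE for a V4SI vector with 32-bit
   lanes, not part of the original source):
     v_1 = BIT_INSERT_EXPR <v_0, a_2, 96>;    lane 3, found via def chain
     v_3 = BIT_INSERT_EXPR <v_1, b_4, 0>;     lane 0, match starts here
     v_5 = BIT_INSERT_EXPR <v_3, c_6, 32>;    lane 1, found via use chain
     v_7 = BIT_INSERT_EXPR <v_5, d_8, 64>;    lane 2, found via use chain
   After the use- and def-chain walks below all four lanes are found and
   the last use-chain insert (here v_7) is swapped to roots[0] to serve
   as the instance root.  */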
7175 tree vectype = TREE_TYPE (rhs);
7176 unsigned nlanes = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
7177 auto_vec<std::pair<unsigned, tree> > lane_defs (nlanes);
7178 auto_sbitmap lanes (nlanes);
7179 bitmap_clear (lanes);
7180 bitmap_set_bit (lanes, 0);
7181 tree def = gimple_assign_lhs (assign);
7182 lane_defs.quick_push
7183 (std::make_pair (0, gimple_assign_rhs2 (assign)));
7184 unsigned lanes_found = 1;
7185 /* Start with the use chains; the last stmt will be the root. */
7186 stmt_vec_info last = bb_vinfo->lookup_stmt (assign);
7187 vec<stmt_vec_info> roots = vNULL;
7188 roots.safe_push (last);
7191 use_operand_p use_p;
7192 gimple *use_stmt;
7193 if (!single_imm_use (def, &use_p, &use_stmt))
7194 break;
7195 unsigned this_lane;
7196 if (!bb_vinfo->lookup_stmt (use_stmt)
7197 || !vect_slp_is_lane_insert (use_stmt, def, &this_lane)
7198 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (use_stmt)))
7199 break;
7200 if (bitmap_bit_p (lanes, this_lane))
7201 break;
7202 lanes_found++;
7203 bitmap_set_bit (lanes, this_lane);
7204 gassign *use_ass = as_a <gassign *> (use_stmt);
7205 lane_defs.quick_push (std::make_pair
7206 (this_lane, gimple_assign_rhs2 (use_ass)));
7207 last = bb_vinfo->lookup_stmt (use_ass);
7208 roots.safe_push (last);
7209 def = gimple_assign_lhs (use_ass);
7211 while (lanes_found < nlanes);
7212 if (roots.length () > 1)
7213 std::swap(roots[0], roots[roots.length () - 1]);
7214 if (lanes_found < nlanes)
7216 /* Now search the def chain. */
7217 def = gimple_assign_rhs1 (assign);
7220 if (TREE_CODE (def) != SSA_NAME
7221 || !has_single_use (def))
7222 break;
7223 gimple *def_stmt = SSA_NAME_DEF_STMT (def);
7224 unsigned this_lane;
7225 if (!bb_vinfo->lookup_stmt (def_stmt)
7226 || !vect_slp_is_lane_insert (def_stmt,
7227 NULL_TREE, &this_lane)
7228 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (def_stmt)))
7229 break;
7230 if (bitmap_bit_p (lanes, this_lane))
7231 break;
7232 lanes_found++;
7233 bitmap_set_bit (lanes, this_lane);
7234 lane_defs.quick_push (std::make_pair
7235 (this_lane,
7236 gimple_assign_rhs2 (def_stmt)));
7237 roots.safe_push (bb_vinfo->lookup_stmt (def_stmt));
7238 def = gimple_assign_rhs1 (def_stmt);
7240 while (lanes_found < nlanes);
7242 if (lanes_found == nlanes)
7244 /* Sort lane_defs by the lane index and register the root. */
7245 lane_defs.qsort (vld_cmp);
7246 vec<stmt_vec_info> stmts;
7247 stmts.create (nlanes);
7248 for (unsigned i = 0; i < nlanes; ++i)
7249 stmts.quick_push (bb_vinfo->lookup_def (lane_defs[i].second));
7250 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
7251 stmts, roots));
7253 else
7254 roots.release ();
7256 else if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
7257 && (associative_tree_code (code) || code == MINUS_EXPR)
7258 /* ??? This pessimizes a two-element reduction. PR54400.
7259 ??? In-order reduction could be handled if we only
7260 traverse one operand chain in vect_slp_linearize_chain. */
7261 && !needs_fold_left_reduction_p (TREE_TYPE (rhs), code)
7262 /* Ops with constants at the tail can be stripped here. */
7263 && TREE_CODE (rhs) == SSA_NAME
7264 && TREE_CODE (gimple_assign_rhs2 (assign)) == SSA_NAME
7265 /* Should be the chain end. */
7266 && (!single_imm_use (gimple_assign_lhs (assign),
7267 &use_p, &use_stmt)
7268 || !is_gimple_assign (use_stmt)
7269 || (gimple_assign_rhs_code (use_stmt) != code
7270 && ((code != PLUS_EXPR && code != MINUS_EXPR)
7271 || (gimple_assign_rhs_code (use_stmt)
7272 != (code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR))))))
7274 /* We start the match at the end of a possible association
7275 chain. */
7276 auto_vec<chain_op_t> chain;
7277 auto_vec<std::pair<tree_code, gimple *> > worklist;
7278 auto_vec<gimple *> chain_stmts;
7279 gimple *code_stmt = NULL, *alt_code_stmt = NULL;
7280 if (code == MINUS_EXPR)
7281 code = PLUS_EXPR;
7282 internal_fn reduc_fn;
7283 if (!reduction_fn_for_scalar_code (code, &reduc_fn)
7284 || reduc_fn == IFN_LAST)
7285 continue;
7286 vect_slp_linearize_chain (bb_vinfo, worklist, chain, code, assign,
7287 /* ??? */
7288 code_stmt, alt_code_stmt, &chain_stmts);
7289 if (chain.length () > 1)
7291 /* Sort the chain according to def_type and operation. */
7292 chain.sort (dt_sort_cmp, bb_vinfo);
7293 /* ??? Now we'd want to strip externals and constants
7294 but record those to be handled in the epilogue. */
7295 /* ??? For now do not allow mixing ops or externs/constants. */
7296 bool invalid = false;
7297 unsigned remain_cnt = 0;
7298 for (unsigned i = 0; i < chain.length (); ++i)
7300 if (chain[i].code != code)
7302 invalid = true;
7303 break;
7305 if (chain[i].dt != vect_internal_def)
7306 remain_cnt++;
7308 if (!invalid && chain.length () - remain_cnt > 1)
7310 vec<stmt_vec_info> stmts;
7311 vec<tree> remain = vNULL;
7312 stmts.create (chain.length ());
7313 if (remain_cnt > 0)
7314 remain.create (remain_cnt);
7315 for (unsigned i = 0; i < chain.length (); ++i)
7317 if (chain[i].dt == vect_internal_def)
7318 stmts.quick_push (bb_vinfo->lookup_def (chain[i].op));
7319 else
7320 remain.quick_push (chain[i].op);
7322 vec<stmt_vec_info> roots;
7323 roots.create (chain_stmts.length ());
7324 for (unsigned i = 0; i < chain_stmts.length (); ++i)
7325 roots.quick_push (bb_vinfo->lookup_stmt (chain_stmts[i]));
7326 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_bb_reduc,
7327 stmts, roots, remain));
7334 /* Walk the grouped store chains and replace entries with their
7335 pattern variant if any. */
7337 static void
7338 vect_fixup_store_groups_with_patterns (vec_info *vinfo)
7340 stmt_vec_info first_element;
7341 unsigned i;
7343 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
7345 /* We also have CTORs in this array. */
7346 if (!STMT_VINFO_GROUPED_ACCESS (first_element))
7347 continue;
7348 if (STMT_VINFO_IN_PATTERN_P (first_element))
7350 stmt_vec_info orig = first_element;
7351 first_element = STMT_VINFO_RELATED_STMT (first_element);
7352 DR_GROUP_FIRST_ELEMENT (first_element) = first_element;
7353 DR_GROUP_SIZE (first_element) = DR_GROUP_SIZE (orig);
7354 DR_GROUP_GAP (first_element) = DR_GROUP_GAP (orig);
7355 DR_GROUP_NEXT_ELEMENT (first_element) = DR_GROUP_NEXT_ELEMENT (orig);
7356 vinfo->grouped_stores[i] = first_element;
7358 stmt_vec_info prev = first_element;
7359 while (DR_GROUP_NEXT_ELEMENT (prev))
7361 stmt_vec_info elt = DR_GROUP_NEXT_ELEMENT (prev);
7362 if (STMT_VINFO_IN_PATTERN_P (elt))
7364 stmt_vec_info orig = elt;
7365 elt = STMT_VINFO_RELATED_STMT (elt);
7366 DR_GROUP_NEXT_ELEMENT (prev) = elt;
7367 DR_GROUP_GAP (elt) = DR_GROUP_GAP (orig);
7368 DR_GROUP_NEXT_ELEMENT (elt) = DR_GROUP_NEXT_ELEMENT (orig);
7370 DR_GROUP_FIRST_ELEMENT (elt) = first_element;
7371 prev = elt;
7376 /* Check if the region described by BB_VINFO can be vectorized, returning
7377 true if so. When returning false, set FATAL to true if the same failure
7378 would prevent vectorization at other vector sizes, false if it is still
7379 worth trying other sizes. N_STMTS is the number of statements in the
7380 region. */
7382 static bool
7383 vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
7384 vec<int> *dataref_groups)
7386 DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
7388 slp_instance instance;
7389 int i;
7390 poly_uint64 min_vf = 2;
7392 /* The first group of checks is independent of the vector size. */
7393 fatal = true;
7395 /* Analyze the data references. */
7397 if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
7399 if (dump_enabled_p ())
7400 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7401 "not vectorized: unhandled data-ref in basic "
7402 "block.\n");
7403 return false;
7406 if (!vect_analyze_data_ref_accesses (bb_vinfo, dataref_groups))
7408 if (dump_enabled_p ())
7409 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7410 "not vectorized: unhandled data access in "
7411 "basic block.\n");
7412 return false;
7415 vect_slp_check_for_roots (bb_vinfo);
7417 /* If there are no grouped stores and no constructors in the region
7418 there is no need to continue with pattern recog as vect_analyze_slp
7419 will fail anyway. */
7420 if (bb_vinfo->grouped_stores.is_empty ()
7421 && bb_vinfo->roots.is_empty ())
7423 if (dump_enabled_p ())
7424 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7425 "not vectorized: no grouped stores in "
7426 "basic block.\n");
7427 return false;
7430 /* The rest of the analysis below depends on the vector size in some way. */
7431 fatal = false;
7433 vect_pattern_recog (bb_vinfo);
7435 /* Update store groups from pattern processing. */
7436 vect_fixup_store_groups_with_patterns (bb_vinfo);
7438 /* Check the SLP opportunities in the basic block, analyze and build SLP
7439 trees. */
7440 if (!vect_analyze_slp (bb_vinfo, n_stmts))
7442 if (dump_enabled_p ())
7444 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7445 "Failed to SLP the basic block.\n");
7446 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7447 "not vectorized: failed to find SLP opportunities "
7448 "in basic block.\n");
7450 return false;
7453 /* Optimize permutations. */
7454 vect_optimize_slp (bb_vinfo);
7456 /* Gather the loads reachable from the SLP graph entries. */
7457 vect_gather_slp_loads (bb_vinfo);
7459 vect_record_base_alignments (bb_vinfo);
7461 /* Analyze and verify the alignment of data references and the
7462 dependence in the SLP instances. */
7463 for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
7465 vect_location = instance->location ();
7466 if (! vect_slp_analyze_instance_alignment (bb_vinfo, instance)
7467 || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
7469 slp_tree node = SLP_INSTANCE_TREE (instance);
7470 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
7471 if (dump_enabled_p ())
7472 dump_printf_loc (MSG_NOTE, vect_location,
7473 "removing SLP instance operations starting from: %G",
7474 stmt_info->stmt);
7475 vect_free_slp_instance (instance);
7476 BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
7477 continue;
7480 /* Mark all the statements that we want to vectorize as pure SLP and
7481 relevant. */
7482 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
7483 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
7484 unsigned j;
7485 stmt_vec_info root;
7486 /* Likewise consider instance root stmts as vectorized. */
7487 FOR_EACH_VEC_ELT (SLP_INSTANCE_ROOT_STMTS (instance), j, root)
7488 STMT_SLP_TYPE (root) = pure_slp;
7490 i++;
7492 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
7493 return false;
7495 if (!vect_slp_analyze_operations (bb_vinfo))
7497 if (dump_enabled_p ())
7498 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7499 "not vectorized: bad operation in basic block.\n");
7500 return false;
7503 vect_bb_partition_graph (bb_vinfo);
7505 return true;
7508 /* Subroutine of vect_slp_bbs. Try to vectorize the statements for all
7509 basic blocks in BBS, returning true on success.
7510 The region has N_STMTS statements and has the datarefs given by DATAREFS. */
7512 static bool
7513 vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
7514 vec<int> *dataref_groups, unsigned int n_stmts,
7515 loop_p orig_loop)
7517 bb_vec_info bb_vinfo;
7518 auto_vector_modes vector_modes;
7520 /* Autodetect first vector size we try. */
7521 machine_mode next_vector_mode = VOIDmode;
7522 targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
7523 unsigned int mode_i = 0;
7525 vec_info_shared shared;
7527 machine_mode autodetected_vector_mode = VOIDmode;
7528 while (1)
7530 bool vectorized = false;
7531 bool fatal = false;
7532 bb_vinfo = new _bb_vec_info (bbs, &shared);
7534 bool first_time_p = shared.datarefs.is_empty ();
7535 BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
7536 if (first_time_p)
7537 bb_vinfo->shared->save_datarefs ();
7538 else
7539 bb_vinfo->shared->check_datarefs ();
7540 bb_vinfo->vector_mode = next_vector_mode;
7542 if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups))
7544 if (dump_enabled_p ())
7546 dump_printf_loc (MSG_NOTE, vect_location,
7547 "***** Analysis succeeded with vector mode"
7548 " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
7549 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
7552 bb_vinfo->shared->check_datarefs ();
7554 auto_vec<slp_instance> profitable_subgraphs;
7555 for (slp_instance instance : BB_VINFO_SLP_INSTANCES (bb_vinfo))
7557 if (instance->subgraph_entries.is_empty ())
7558 continue;
7560 dump_user_location_t saved_vect_location = vect_location;
7561 vect_location = instance->location ();
7562 if (!unlimited_cost_model (NULL)
7563 && !vect_bb_vectorization_profitable_p
7564 (bb_vinfo, instance->subgraph_entries, orig_loop))
7566 if (dump_enabled_p ())
7567 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7568 "not vectorized: vectorization is not "
7569 "profitable.\n");
7570 vect_location = saved_vect_location;
7571 continue;
7574 vect_location = saved_vect_location;
7575 if (!dbg_cnt (vect_slp))
7576 continue;
7578 profitable_subgraphs.safe_push (instance);
7581 /* When we're vectorizing an if-converted loop body make sure
7582 we vectorized all if-converted code. */
7583 if (!profitable_subgraphs.is_empty ()
7584 && orig_loop)
7586 gcc_assert (bb_vinfo->bbs.length () == 1);
7587 for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[0]);
7588 !gsi_end_p (gsi); gsi_next (&gsi))
7590 /* The costing above left us with DCEable vectorized scalar
7591 stmts having the visited flag set on profitable
7592 subgraphs. Do the delayed clearing of the flag here. */
7593 if (gimple_visited_p (gsi_stmt (gsi)))
7595 gimple_set_visited (gsi_stmt (gsi), false);
7596 continue;
7598 if (flag_vect_cost_model == VECT_COST_MODEL_UNLIMITED)
7599 continue;
7601 if (gassign *ass = dyn_cast <gassign *> (gsi_stmt (gsi)))
7602 if (gimple_assign_rhs_code (ass) == COND_EXPR)
7604 if (!profitable_subgraphs.is_empty ()
7605 && dump_enabled_p ())
7606 dump_printf_loc (MSG_NOTE, vect_location,
7607 "not profitable because of "
7608 "unprofitable if-converted scalar "
7609 "code\n");
7610 profitable_subgraphs.truncate (0);
7615 /* Finally schedule the profitable subgraphs. */
7616 for (slp_instance instance : profitable_subgraphs)
7618 if (!vectorized && dump_enabled_p ())
7619 dump_printf_loc (MSG_NOTE, vect_location,
7620 "Basic block will be vectorized "
7621 "using SLP\n");
7622 vectorized = true;
7624 /* Dump before scheduling as store vectorization will remove
7625 the original stores and mess with the instance tree
7626 so querying its location will eventually ICE. */
7627 if (flag_checking)
7628 for (slp_instance sub : instance->subgraph_entries)
7629 gcc_assert (SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (sub)));
7630 unsigned HOST_WIDE_INT bytes;
7631 if (dump_enabled_p ())
7632 for (slp_instance sub : instance->subgraph_entries)
7634 tree vtype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (sub));
7635 if (GET_MODE_SIZE (TYPE_MODE (vtype)).is_constant (&bytes))
7636 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
7637 sub->location (),
7638 "basic block part vectorized using %wu "
7639 "byte vectors\n", bytes);
7640 else
7641 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
7642 sub->location (),
7643 "basic block part vectorized using "
7644 "variable length vectors\n");
7647 dump_user_location_t saved_vect_location = vect_location;
7648 vect_location = instance->location ();
7650 vect_schedule_slp (bb_vinfo, instance->subgraph_entries);
7652 vect_location = saved_vect_location;
7655 else
7657 if (dump_enabled_p ())
7658 dump_printf_loc (MSG_NOTE, vect_location,
7659 "***** Analysis failed with vector mode %s\n",
7660 GET_MODE_NAME (bb_vinfo->vector_mode));
7663 if (mode_i == 0)
7664 autodetected_vector_mode = bb_vinfo->vector_mode;
7666 if (!fatal)
7667 while (mode_i < vector_modes.length ()
7668 && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
7670 if (dump_enabled_p ())
7671 dump_printf_loc (MSG_NOTE, vect_location,
7672 "***** The result for vector mode %s would"
7673 " be the same\n",
7674 GET_MODE_NAME (vector_modes[mode_i]));
7675 mode_i += 1;
7678 delete bb_vinfo;
7680 if (mode_i < vector_modes.length ()
7681 && VECTOR_MODE_P (autodetected_vector_mode)
7682 && (related_vector_mode (vector_modes[mode_i],
7683 GET_MODE_INNER (autodetected_vector_mode))
7684 == autodetected_vector_mode)
7685 && (related_vector_mode (autodetected_vector_mode,
7686 GET_MODE_INNER (vector_modes[mode_i]))
7687 == vector_modes[mode_i]))
7689 if (dump_enabled_p ())
7690 dump_printf_loc (MSG_NOTE, vect_location,
7691 "***** Skipping vector mode %s, which would"
7692 " repeat the analysis for %s\n",
7693 GET_MODE_NAME (vector_modes[mode_i]),
7694 GET_MODE_NAME (autodetected_vector_mode));
7695 mode_i += 1;
7698 if (vectorized
7699 || mode_i == vector_modes.length ()
7700 || autodetected_vector_mode == VOIDmode
7701 /* If vect_slp_analyze_bb_1 signaled that analysis for all
7702 vector sizes will fail do not bother iterating. */
7703 || fatal)
7704 return vectorized;
7706 /* Try the next biggest vector size. */
7707 next_vector_mode = vector_modes[mode_i++];
7708 if (dump_enabled_p ())
7709 dump_printf_loc (MSG_NOTE, vect_location,
7710 "***** Re-trying analysis with vector mode %s\n",
7711 GET_MODE_NAME (next_vector_mode));
7716 /* Main entry for the BB vectorizer. Analyze and transform the basic blocks
7717 in BBS, returning true if anything in them was vectorized. */
7719 static bool
7720 vect_slp_bbs (const vec<basic_block> &bbs, loop_p orig_loop)
7722 vec<data_reference_p> datarefs = vNULL;
7723 auto_vec<int> dataref_groups;
7724 int insns = 0;
7725 int current_group = 0;
7727 for (unsigned i = 0; i < bbs.length (); i++)
7729 basic_block bb = bbs[i];
7730 for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
7731 gsi_next (&gsi))
7733 gimple *stmt = gsi_stmt (gsi);
7734 if (is_gimple_debug (stmt))
7735 continue;
7737 insns++;
7739 if (gimple_location (stmt) != UNKNOWN_LOCATION)
7740 vect_location = stmt;
7742 if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs,
7743 &dataref_groups, current_group))
7744 ++current_group;
7746 /* New BBs always start a new DR group. */
7747 ++current_group;
7750 return vect_slp_region (bbs, datarefs, &dataref_groups, insns, orig_loop);
7753 /* Special entry for the BB vectorizer. Analyze and transform a single
7754 if-converted BB with ORIG_LOOPs body being the not if-converted
7755 representation. Returns true if anything in the basic-block was
7756 vectorized. */
7758 bool
7759 vect_slp_if_converted_bb (basic_block bb, loop_p orig_loop)
7761 auto_vec<basic_block> bbs;
7762 bbs.safe_push (bb);
7763 return vect_slp_bbs (bbs, orig_loop);
7766 /* Main entry for the BB vectorizer when run over a whole function. Analyze
7767 and transform the basic blocks of FUN, returning true if anything was vectorized. */
7769 bool
7770 vect_slp_function (function *fun)
7772 bool r = false;
7773 int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (fun));
7774 unsigned n = pre_and_rev_post_order_compute_fn (fun, NULL, rpo, false);
7776 /* For the moment split the function into pieces to avoid making
7777 the iteration on the vector mode moot. Split at points we know
7778 not to handle well, which are CFG merges (SLP discovery doesn't
7779 handle non-loop-header PHIs) and loop exits. Since pattern
7780 recog requires reverse iteration to visit uses before defs
7781 simply chop RPO into pieces. */
7782 auto_vec<basic_block> bbs;
7783 for (unsigned i = 0; i < n; i++)
7785 basic_block bb = BASIC_BLOCK_FOR_FN (fun, rpo[i]);
7786 bool split = false;
7788 /* Split when a BB is not dominated by the first block. */
7789 if (!bbs.is_empty ()
7790 && !dominated_by_p (CDI_DOMINATORS, bb, bbs[0]))
7792 if (dump_enabled_p ())
7793 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7794 "splitting region at dominance boundary bb%d\n",
7795 bb->index);
7796 split = true;
7798 /* Split when the loop determined by the first block
7799 is exited. This is because we eventually insert
7800 invariants at region begin. */
7801 else if (!bbs.is_empty ()
7802 && bbs[0]->loop_father != bb->loop_father
7803 && !flow_loop_nested_p (bbs[0]->loop_father, bb->loop_father))
7805 if (dump_enabled_p ())
7806 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7807 "splitting region at loop %d exit at bb%d\n",
7808 bbs[0]->loop_father->num, bb->index);
7809 split = true;
7811 else if (!bbs.is_empty ()
7812 && bb->loop_father->header == bb
7813 && bb->loop_father->dont_vectorize)
7815 if (dump_enabled_p ())
7816 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7817 "splitting region at dont-vectorize loop %d "
7818 "entry at bb%d\n",
7819 bb->loop_father->num, bb->index);
7820 split = true;
7823 if (split && !bbs.is_empty ())
7825 r |= vect_slp_bbs (bbs, NULL);
7826 bbs.truncate (0);
7829 if (bbs.is_empty ())
7831 /* We need to be able to insert at the head of the region, which
7832 we cannot do for a region starting with a returns-twice call. */
7833 if (gcall *first = safe_dyn_cast <gcall *> (first_stmt (bb)))
7834 if (gimple_call_flags (first) & ECF_RETURNS_TWICE)
7836 if (dump_enabled_p ())
7837 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7838 "skipping bb%d as start of region as it "
7839 "starts with returns-twice call\n",
7840 bb->index);
7841 continue;
7843 /* If the loop this BB belongs to is marked as not to be vectorized
7844 honor that also for BB vectorization. */
7845 if (bb->loop_father->dont_vectorize)
7846 continue;
7849 bbs.safe_push (bb);
7851 /* When a stmt ends this block and defines a value, inserting a
7852 vector containing its definition after it would require inserting
7853 on edges. Avoid this for now. */
7854 if (gimple *last = *gsi_last_bb (bb))
7855 if (gimple_get_lhs (last)
7856 && is_ctrl_altering_stmt (last))
7858 if (dump_enabled_p ())
7859 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7860 "splitting region at control altering "
7861 "definition %G", last);
7862 r |= vect_slp_bbs (bbs, NULL);
7863 bbs.truncate (0);
7867 if (!bbs.is_empty ())
7868 r |= vect_slp_bbs (bbs, NULL);
7870 free (rpo);
7872 return r;
7875 /* Build a variable-length vector in which the elements in ELTS are repeated
7876 to fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
7877 RESULTS and add any new instructions to SEQ.
7879 The approach we use is:
7881 (1) Find a vector mode VM with integer elements of mode IM.
7883 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
7884 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
7885 from small vectors to IM.
7887 (3) Duplicate each ELTS'[I] into a vector of mode VM.
7889 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
7890 correct byte contents.
7892 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
7894 We try to find the largest IM for which this sequence works, in order
7895 to cut down on the number of interleaves. */
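/* Editorial sketch with assumed modes (not part of the original source):
   to repeat eight HImode elements {a, ..., h} into variable-length HI
   vectors, one valid outcome of step (1) is IM = DImode with NVECTORS = 2.
   Step (2) then packs {a,b,c,d} and {e,f,g,h} into two DI scalars, step
   (3) splats each into a variable-length DI vector, a single interleaving
   VEC_PERM_EXPR in step (4) produces the repeating a..h element pattern,
   and step (5) view-converts the result to the requested HI vector type.  */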
7897 void
7898 duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
7899 const vec<tree> &elts, unsigned int nresults,
7900 vec<tree> &results)
7902 unsigned int nelts = elts.length ();
7903 tree element_type = TREE_TYPE (vector_type);
7905 /* (1) Find a vector mode VM with integer elements of mode IM. */
7906 unsigned int nvectors = 1;
7907 tree new_vector_type;
7908 tree permutes[2];
7909 if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
7910 &nvectors, &new_vector_type,
7911 permutes))
7912 gcc_unreachable ();
7914 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
7915 unsigned int partial_nelts = nelts / nvectors;
7916 tree partial_vector_type = build_vector_type (element_type, partial_nelts);
7918 tree_vector_builder partial_elts;
7919 auto_vec<tree, 32> pieces (nvectors * 2);
7920 pieces.quick_grow_cleared (nvectors * 2);
7921 for (unsigned int i = 0; i < nvectors; ++i)
7923 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
7924 ELTS' has mode IM. */
7925 partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
7926 for (unsigned int j = 0; j < partial_nelts; ++j)
7927 partial_elts.quick_push (elts[i * partial_nelts + j]);
7928 tree t = gimple_build_vector (seq, &partial_elts);
7929 t = gimple_build (seq, VIEW_CONVERT_EXPR,
7930 TREE_TYPE (new_vector_type), t);
7932 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
7933 pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
7936 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
7937 correct byte contents.
7939 Conceptually, we need to repeat the following operation log2(nvectors)
7940 times, where hi_start = nvectors / 2:
7942 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
7943 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
7945 However, if each input repeats every N elements and the VF is
7946 a multiple of N * 2, the HI result is the same as the LO result.
7947 This will be true for the first N1 iterations of the outer loop,
7948 followed by N2 iterations for which both the LO and HI results
7949 are needed. I.e.:
7951 N1 + N2 = log2(nvectors)
7953 Each "N1 iteration" doubles the number of redundant vectors and the
7954 effect of the process as a whole is to have a sequence of nvectors/2**N1
7955 vectors that repeats 2**N1 times. Rather than generate these redundant
7956 vectors, we halve the number of vectors for each N1 iteration. */
7957 unsigned int in_start = 0;
7958 unsigned int out_start = nvectors;
7959 unsigned int new_nvectors = nvectors;
7960 for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
7962 unsigned int hi_start = new_nvectors / 2;
7963 unsigned int out_i = 0;
7964 for (unsigned int in_i = 0; in_i < new_nvectors; ++in_i)
7966 if ((in_i & 1) != 0
7967 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
7968 2 * in_repeat))
7969 continue;
7971 tree output = make_ssa_name (new_vector_type);
7972 tree input1 = pieces[in_start + (in_i / 2)];
7973 tree input2 = pieces[in_start + (in_i / 2) + hi_start];
7974 gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
7975 input1, input2,
7976 permutes[in_i & 1]);
7977 gimple_seq_add_stmt (seq, stmt);
7978 pieces[out_start + out_i] = output;
7979 out_i += 1;
7981 std::swap (in_start, out_start);
7982 new_nvectors = out_i;
7985 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
7986 results.reserve (nresults);
7987 for (unsigned int i = 0; i < nresults; ++i)
7988 if (i < new_nvectors)
7989 results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
7990 pieces[in_start + i]));
7991 else
7992 results.quick_push (results[i - new_nvectors]);
7996 /* For constant and loop invariant defs in OP_NODE this function creates
7997 vector defs that will be used in the vectorized stmts and stores them
7998 to SLP_TREE_VEC_DEFS of OP_NODE. */
8000 static void
8001 vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
8003 unsigned HOST_WIDE_INT nunits;
8004 tree vec_cst;
8005 unsigned j, number_of_places_left_in_vector;
8006 tree vector_type;
8007 tree vop;
8008 int group_size = op_node->ops.length ();
8009 unsigned int vec_num, i;
8010 unsigned number_of_copies = 1;
8011 bool constant_p;
8012 gimple_seq ctor_seq = NULL;
8013 auto_vec<tree, 16> permute_results;
8015 /* We always want SLP_TREE_VECTYPE (op_node) here correctly set. */
8016 vector_type = SLP_TREE_VECTYPE (op_node);
8018 unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node);
8019 SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
8020 auto_vec<tree> voprnds (number_of_vectors);
8022 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
8023 created vectors. It is greater than 1 if unrolling is performed.
8025 For example, we have two scalar operands, s1 and s2 (e.g., group of
8026 strided accesses of size two), while NUNITS is four (i.e., four scalars
8027 of this type can be packed in a vector). The output vector will contain
8028 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
8029 will be 2).
8031 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
8032 containing the operands.
8034 For example, NUNITS is four as before, and the group size is 8
8035 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
8036 {s5, s6, s7, s8}. */
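/* Editorial example (assumed numbers, not part of the original source):
   with GROUP_SIZE = 2, NUNITS = 4 and NUMBER_OF_VECTORS = 1 the
   computation below gives NUMBER_OF_COPIES = 4 * 1 / 2 = 2, matching the
   {s1, s2, s1, s2} case above.  */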
8038 /* When using duplicate_and_interleave, we just need one element for
8039 each scalar statement. */
8040 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
8041 nunits = group_size;
8043 number_of_copies = nunits * number_of_vectors / group_size;
8045 number_of_places_left_in_vector = nunits;
8046 constant_p = true;
8047 tree_vector_builder elts (vector_type, nunits, 1);
8048 elts.quick_grow (nunits);
8049 stmt_vec_info insert_after = NULL;
8050 for (j = 0; j < number_of_copies; j++)
8052 tree op;
8053 for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
8055 /* Create 'vect_ = {op0,op1,...,opn}'. */
8056 number_of_places_left_in_vector--;
8057 tree orig_op = op;
8058 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
8060 if (CONSTANT_CLASS_P (op))
8062 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
8064 /* Can't use VIEW_CONVERT_EXPR for booleans because
8065 of possibly different sizes of scalar value and
8066 vector element. */
8067 if (integer_zerop (op))
8068 op = build_int_cst (TREE_TYPE (vector_type), 0);
8069 else if (integer_onep (op))
8070 op = build_all_ones_cst (TREE_TYPE (vector_type));
8071 else
8072 gcc_unreachable ();
8074 else
8075 op = fold_unary (VIEW_CONVERT_EXPR,
8076 TREE_TYPE (vector_type), op);
8077 gcc_assert (op && CONSTANT_CLASS_P (op));
8079 else
8081 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
8082 gimple *init_stmt;
8083 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
8085 tree true_val
8086 = build_all_ones_cst (TREE_TYPE (vector_type));
8087 tree false_val
8088 = build_zero_cst (TREE_TYPE (vector_type));
8089 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
8090 init_stmt = gimple_build_assign (new_temp, COND_EXPR,
8091 op, true_val,
8092 false_val);
8094 else
8096 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
8097 op);
8098 init_stmt
8099 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
8100 op);
8102 gimple_seq_add_stmt (&ctor_seq, init_stmt);
8103 op = new_temp;
8106 elts[number_of_places_left_in_vector] = op;
8107 if (!CONSTANT_CLASS_P (op))
8108 constant_p = false;
8109 /* For BB vectorization we have to compute an insert location
8110 when a def is inside the analyzed region since we cannot
8111 simply insert at the BB start in this case. */
8112 stmt_vec_info opdef;
8113 if (TREE_CODE (orig_op) == SSA_NAME
8114 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
8115 && is_a <bb_vec_info> (vinfo)
8116 && (opdef = vinfo->lookup_def (orig_op)))
8118 if (!insert_after)
8119 insert_after = opdef;
8120 else
8121 insert_after = get_later_stmt (insert_after, opdef);
8124 if (number_of_places_left_in_vector == 0)
8126 if (constant_p
8127 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
8128 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
8129 vec_cst = gimple_build_vector (&ctor_seq, &elts);
8130 else
8132 if (permute_results.is_empty ())
8133 duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
8134 elts, number_of_vectors,
8135 permute_results);
8136 vec_cst = permute_results[number_of_vectors - j - 1];
8138 if (!gimple_seq_empty_p (ctor_seq))
8140 if (insert_after)
8142 gimple_stmt_iterator gsi;
8143 if (gimple_code (insert_after->stmt) == GIMPLE_PHI)
8145 gsi = gsi_after_labels (gimple_bb (insert_after->stmt));
8146 gsi_insert_seq_before (&gsi, ctor_seq,
8147 GSI_CONTINUE_LINKING);
8149 else if (!stmt_ends_bb_p (insert_after->stmt))
8151 gsi = gsi_for_stmt (insert_after->stmt);
8152 gsi_insert_seq_after (&gsi, ctor_seq,
8153 GSI_CONTINUE_LINKING);
8155 else
8157 /* When we want to insert after a def whose
8158 defining stmt throws, insert on the fallthru
8159 edge. */
8160 edge e = find_fallthru_edge
8161 (gimple_bb (insert_after->stmt)->succs);
8162 basic_block new_bb
8163 = gsi_insert_seq_on_edge_immediate (e, ctor_seq);
8164 gcc_assert (!new_bb);
8167 else
8168 vinfo->insert_seq_on_entry (NULL, ctor_seq);
8169 ctor_seq = NULL;
8171 voprnds.quick_push (vec_cst);
8172 insert_after = NULL;
8173 number_of_places_left_in_vector = nunits;
8174 constant_p = true;
8175 elts.new_vector (vector_type, nunits, 1);
8176 elts.quick_grow (nunits);
8181 /* Since the vectors were created in reverse order, reverse them
8182 here. */
8183 vec_num = voprnds.length ();
8184 for (j = vec_num; j != 0; j--)
8186 vop = voprnds[j - 1];
8187 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
8190 /* If the VF is greater than the unrolling factor needed for the SLP
8191 group of stmts, NUMBER_OF_VECTORS to be created is greater than
8192 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
8193 to replicate the vectors. */
8194 while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ())
8195 for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (i, &vop) && i < vec_num;
8196 i++)
8197 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
8200 /* Get the Ith vectorized definition from SLP_NODE. */
8202 tree
8203 vect_get_slp_vect_def (slp_tree slp_node, unsigned i)
8205 return SLP_TREE_VEC_DEFS (slp_node)[i];
8208 /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS. */
8210 void
8211 vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs)
8213 vec_defs->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
8214 vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node));
8217 /* Get N vectorized definitions for SLP_NODE. */
8219 void
8220 vect_get_slp_defs (vec_info *,
8221 slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
8223 if (n == -1U)
8224 n = SLP_TREE_CHILDREN (slp_node).length ();
8226 for (unsigned i = 0; i < n; ++i)
8228 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
8229 vec<tree> vec_defs = vNULL;
8230 vect_get_slp_defs (child, &vec_defs);
8231 vec_oprnds->quick_push (vec_defs);
8235 /* A subroutine of vect_transform_slp_perm_load with two extra arguments:
8236 - PERM gives the permutation that the caller wants to use for NODE,
8237 which might be different from SLP_LOAD_PERMUTATION.
8238 - DUMP_P controls whether the function dumps information. */
8240 static bool
8241 vect_transform_slp_perm_load_1 (vec_info *vinfo, slp_tree node,
8242 load_permutation_t &perm,
8243 const vec<tree> &dr_chain,
8244 gimple_stmt_iterator *gsi, poly_uint64 vf,
8245 bool analyze_only, bool dump_p,
8246 unsigned *n_perms, unsigned int *n_loads,
8247 bool dce_chain)
8249 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
8250 int vec_index = 0;
8251 tree vectype = SLP_TREE_VECTYPE (node);
8252 unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
8253 unsigned int mask_element;
8254 unsigned dr_group_size;
8255 machine_mode mode;
8257 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
8258 dr_group_size = 1;
8259 else
8261 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
8262 dr_group_size = DR_GROUP_SIZE (stmt_info);
8265 mode = TYPE_MODE (vectype);
8266 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8267 unsigned int nstmts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
8269 /* Initialize the vect stmts of NODE to properly insert the generated
8270 stmts later. */
8271 if (! analyze_only)
8272 for (unsigned i = SLP_TREE_VEC_DEFS (node).length (); i < nstmts; i++)
8273 SLP_TREE_VEC_DEFS (node).quick_push (NULL_TREE);
8275 /* Generate permutation masks for every NODE. The number of masks for each
8276 NODE is equal to GROUP_SIZE.
8277 E.g., we have a group of three nodes with three loads from the same
8278 location in each node, and the vector size is 4. I.e., we have an
8279 a0b0c0a1b1c1... sequence and we need to create the following vectors:
8280 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
8281 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
8284 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
8285 The last mask is illegal since we assume two operands for permute
8286 operation, and the mask element values can't be outside that range.
8287 Hence, the last mask must be converted into {2,5,5,5}.
8288 For the first two permutations we need the first and the second input
8289 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
8290 we need the second and the third vectors: {b1,c1,a2,b2} and
8291 {c2,a3,b3,c3}. */
8293 int vect_stmts_counter = 0;
8294 unsigned int index = 0;
8295 int first_vec_index = -1;
8296 int second_vec_index = -1;
8297 bool noop_p = true;
8298 *n_perms = 0;
8300 vec_perm_builder mask;
8301 unsigned int nelts_to_build;
8302 unsigned int nvectors_per_build;
8303 unsigned int in_nlanes;
8304 bool repeating_p = (group_size == dr_group_size
8305 && multiple_p (nunits, group_size));
8306 if (repeating_p)
8308 /* A single vector contains a whole number of copies of the node, so:
8309 (a) all permutes can use the same mask; and
8310 (b) the permutes only need a single vector input. */
8311 mask.new_vector (nunits, group_size, 3);
8312 nelts_to_build = mask.encoded_nelts ();
8313 /* It's possible to obtain zero nstmts during analyze_only, so make
8314 it at least one to ensure the later computation for n_perms
8315 proceeds. */
8316 nvectors_per_build = nstmts > 0 ? nstmts : 1;
8317 in_nlanes = dr_group_size * 3;
8319 else
8321 /* We need to construct a separate mask for each vector statement. */
8322 unsigned HOST_WIDE_INT const_nunits, const_vf;
8323 if (!nunits.is_constant (&const_nunits)
8324 || !vf.is_constant (&const_vf))
8325 return false;
8326 mask.new_vector (const_nunits, const_nunits, 1);
8327 nelts_to_build = const_vf * group_size;
8328 nvectors_per_build = 1;
8329 in_nlanes = const_vf * dr_group_size;
8331 auto_sbitmap used_in_lanes (in_nlanes);
8332 bitmap_clear (used_in_lanes);
8333 auto_bitmap used_defs;
8335 unsigned int count = mask.encoded_nelts ();
8336 mask.quick_grow (count);
8337 vec_perm_indices indices;
8339 for (unsigned int j = 0; j < nelts_to_build; j++)
8341 unsigned int iter_num = j / group_size;
8342 unsigned int stmt_num = j % group_size;
8343 unsigned int i = (iter_num * dr_group_size + perm[stmt_num]);
8344 bitmap_set_bit (used_in_lanes, i);
8345 if (repeating_p)
8347 first_vec_index = 0;
8348 mask_element = i;
8350 else
8352 /* Enforced before the loop when !repeating_p. */
8353 unsigned int const_nunits = nunits.to_constant ();
8354 vec_index = i / const_nunits;
8355 mask_element = i % const_nunits;
8356 if (vec_index == first_vec_index
8357 || first_vec_index == -1)
8359 first_vec_index = vec_index;
8361 else if (vec_index == second_vec_index
8362 || second_vec_index == -1)
8364 second_vec_index = vec_index;
8365 mask_element += const_nunits;
8367 else
8369 if (dump_p)
8370 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8371 "permutation requires at "
8372 "least three vectors %G",
8373 stmt_info->stmt);
8374 gcc_assert (analyze_only);
8375 return false;
8378 gcc_assert (mask_element < 2 * const_nunits);
8381 if (mask_element != index)
8382 noop_p = false;
8383 mask[index++] = mask_element;
8385 if (index == count)
8387 if (!noop_p)
8389 indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
8390 if (!can_vec_perm_const_p (mode, mode, indices))
8392 if (dump_p)
8394 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8395 "unsupported vect permute { ");
8396 for (i = 0; i < count; ++i)
8398 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
8399 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
8401 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
8403 gcc_assert (analyze_only);
8404 return false;
8407 tree mask_vec = NULL_TREE;
8408 if (!analyze_only)
8409 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
8411 if (second_vec_index == -1)
8412 second_vec_index = first_vec_index;
8414 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
8416 ++*n_perms;
8417 if (analyze_only)
8418 continue;
8419 /* Generate the permute statement if necessary. */
8420 tree first_vec = dr_chain[first_vec_index + ri];
8421 tree second_vec = dr_chain[second_vec_index + ri];
8422 gassign *stmt = as_a<gassign *> (stmt_info->stmt);
8423 tree perm_dest
8424 = vect_create_destination_var (gimple_assign_lhs (stmt),
8425 vectype);
8426 perm_dest = make_ssa_name (perm_dest);
8427 gimple *perm_stmt
8428 = gimple_build_assign (perm_dest, VEC_PERM_EXPR, first_vec,
8429 second_vec, mask_vec);
8430 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
8431 gsi);
8432 if (dce_chain)
8434 bitmap_set_bit (used_defs, first_vec_index + ri);
8435 bitmap_set_bit (used_defs, second_vec_index + ri);
8438 /* Store the vector statement in NODE. */
8439 SLP_TREE_VEC_DEFS (node)[vect_stmts_counter++] = perm_dest;
8442 else if (!analyze_only)
8444 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
8446 tree first_vec = dr_chain[first_vec_index + ri];
8447 /* If mask was NULL_TREE generate the requested
8448 identity transform. */
8449 if (dce_chain)
8450 bitmap_set_bit (used_defs, first_vec_index + ri);
8452 /* Store the vector statement in NODE. */
8453 SLP_TREE_VEC_DEFS (node)[vect_stmts_counter++] = first_vec;
8457 index = 0;
8458 first_vec_index = -1;
8459 second_vec_index = -1;
8460 noop_p = true;
8464 if (n_loads)
8466 if (repeating_p)
8467 *n_loads = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
8468 else
8470 /* Enforced above when !repeating_p. */
8471 unsigned int const_nunits = nunits.to_constant ();
8472 *n_loads = 0;
8473 bool load_seen = false;
8474 for (unsigned i = 0; i < in_nlanes; ++i)
8476 if (i % const_nunits == 0)
8478 if (load_seen)
8479 *n_loads += 1;
8480 load_seen = false;
8482 if (bitmap_bit_p (used_in_lanes, i))
8483 load_seen = true;
8485 if (load_seen)
8486 *n_loads += 1;
8490 if (dce_chain)
8491 for (unsigned i = 0; i < dr_chain.length (); ++i)
8492 if (!bitmap_bit_p (used_defs, i))
8494 gimple *stmt = SSA_NAME_DEF_STMT (dr_chain[i]);
8495 gimple_stmt_iterator rgsi = gsi_for_stmt (stmt);
8496 gsi_remove (&rgsi, true);
8497 release_defs (stmt);
8500 return true;
8503 /* Generate vector permute statements from a list of loads in DR_CHAIN.
8504 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
8505 permute statements for the SLP node NODE. Store the number of vector
8506 permute instructions in *N_PERMS and the number of vector load
8507 instructions in *N_LOADS. If DCE_CHAIN is true, remove all definitions
8508 that were not needed. */
8510 bool
8511 vect_transform_slp_perm_load (vec_info *vinfo,
8512 slp_tree node, const vec<tree> &dr_chain,
8513 gimple_stmt_iterator *gsi, poly_uint64 vf,
8514 bool analyze_only, unsigned *n_perms,
8515 unsigned int *n_loads, bool dce_chain)
8517 return vect_transform_slp_perm_load_1 (vinfo, node,
8518 SLP_TREE_LOAD_PERMUTATION (node),
8519 dr_chain, gsi, vf, analyze_only,
8520 dump_enabled_p (), n_perms, n_loads,
8521 dce_chain);
8524 /* Produce the next vector result for SLP permutation NODE by adding a vector
8525 statement at GSI. If MASK_VEC is nonnull, add:
8527 <new SSA name> = VEC_PERM_EXPR <FIRST_DEF, SECOND_DEF, MASK_VEC>
8529 otherwise add:
8531 <new SSA name> = FIRST_DEF. */
8533 static void
8534 vect_add_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
8535 slp_tree node, tree first_def, tree second_def,
8536 tree mask_vec, poly_uint64 identity_offset)
8538 tree vectype = SLP_TREE_VECTYPE (node);
8540 /* ??? We SLP match existing vector element extracts but
8541 allow punning which we need to re-instantiate at uses
8542 but have no good way of explicitly representing. */
8543 if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)), TYPE_SIZE (vectype))
8544 && !types_compatible_p (TREE_TYPE (first_def), vectype))
8546 gassign *conv_stmt
8547 = gimple_build_assign (make_ssa_name (vectype),
8548 build1 (VIEW_CONVERT_EXPR, vectype, first_def));
8549 vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
8550 first_def = gimple_assign_lhs (conv_stmt);
8552 gassign *perm_stmt;
8553 tree perm_dest = make_ssa_name (vectype);
8554 if (mask_vec)
8556 if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)),
8557 TYPE_SIZE (vectype))
8558 && !types_compatible_p (TREE_TYPE (second_def), vectype))
8560 gassign *conv_stmt
8561 = gimple_build_assign (make_ssa_name (vectype),
8562 build1 (VIEW_CONVERT_EXPR,
8563 vectype, second_def));
8564 vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
8565 second_def = gimple_assign_lhs (conv_stmt);
8567 perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
8568 first_def, second_def,
8569 mask_vec);
8571 else if (!types_compatible_p (TREE_TYPE (first_def), vectype))
8573 /* For identity permutes we still need to handle the case
8574 of offsetted extracts or concats. */
8575 unsigned HOST_WIDE_INT c;
8576 auto first_def_nunits
8577 = TYPE_VECTOR_SUBPARTS (TREE_TYPE (first_def));
8578 if (known_le (TYPE_VECTOR_SUBPARTS (vectype), first_def_nunits))
8580 unsigned HOST_WIDE_INT elsz
8581 = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (TREE_TYPE (first_def))));
8582 tree lowpart = build3 (BIT_FIELD_REF, vectype, first_def,
8583 TYPE_SIZE (vectype),
8584 bitsize_int (identity_offset * elsz));
8585 perm_stmt = gimple_build_assign (perm_dest, lowpart);
8587 else if (constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype),
8588 first_def_nunits, &c) && c == 2)
8590 tree ctor = build_constructor_va (vectype, 2, NULL_TREE, first_def,
8591 NULL_TREE, second_def);
8592 perm_stmt = gimple_build_assign (perm_dest, ctor);
8594 else
8595 gcc_unreachable ();
8597 else
8599 /* We need a copy here in case the def was external. */
8600 perm_stmt = gimple_build_assign (perm_dest, first_def);
8602 vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
8603 /* Store the vector statement in NODE. */
8604 node->push_vec_def (perm_stmt);
8607 /* Subroutine of vectorizable_slp_permutation. Check whether the target
8608 can perform permutation PERM on the (1 or 2) input nodes in CHILDREN.
8609 If GSI is nonnull, emit the permutation there.
8611 When GSI is null, the only purpose of NODE is to give properties
8612 of the result, such as the vector type and number of SLP lanes.
8613 The node does not need to be a VEC_PERM_EXPR.
8615 If the target supports the operation, return the number of individual
8616 VEC_PERM_EXPRs needed, otherwise return -1. Print information to the
8617 dump file if DUMP_P is true. */
8619 static int
8620 vectorizable_slp_permutation_1 (vec_info *vinfo, gimple_stmt_iterator *gsi,
8621 slp_tree node, lane_permutation_t &perm,
8622 vec<slp_tree> &children, bool dump_p)
8624 tree vectype = SLP_TREE_VECTYPE (node);
8626 /* ??? We currently only support all same vector input types
8627 while the SLP IL should really do a concat + select and thus accept
8628 arbitrary mismatches. */
8629 slp_tree child;
8630 unsigned i;
8631 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8632 bool repeating_p = multiple_p (nunits, SLP_TREE_LANES (node));
8633 tree op_vectype = NULL_TREE;
8634 FOR_EACH_VEC_ELT (children, i, child)
8635 if (SLP_TREE_VECTYPE (child))
8637 op_vectype = SLP_TREE_VECTYPE (child);
8638 break;
8640 if (!op_vectype)
8641 op_vectype = vectype;
8642 FOR_EACH_VEC_ELT (children, i, child)
8644 if ((SLP_TREE_DEF_TYPE (child) != vect_internal_def
8645 && !vect_maybe_update_slp_op_vectype (child, op_vectype))
8646 || !types_compatible_p (SLP_TREE_VECTYPE (child), op_vectype)
8647 || !types_compatible_p (TREE_TYPE (vectype), TREE_TYPE (op_vectype)))
8649 if (dump_p)
8650 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8651 "Unsupported vector types in lane permutation\n");
8652 return -1;
8654 if (SLP_TREE_LANES (child) != SLP_TREE_LANES (node))
8655 repeating_p = false;
8658 gcc_assert (perm.length () == SLP_TREE_LANES (node));
8659 if (dump_p)
8661 dump_printf_loc (MSG_NOTE, vect_location,
8662 "vectorizing permutation");
8663 for (unsigned i = 0; i < perm.length (); ++i)
8664 dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
8665 if (repeating_p)
8666 dump_printf (MSG_NOTE, " (repeat %d)\n", SLP_TREE_LANES (node));
8667 dump_printf (MSG_NOTE, "\n");
8670 /* REPEATING_P is true if every output vector is guaranteed to use the
8671 same permute vector. We can handle that case for both variable-length
8672 and constant-length vectors, but we only handle other cases for
8673 constant-length vectors.
8675 Set:
8677 - NPATTERNS and NELTS_PER_PATTERN to the encoding of the permute
8678 mask vector that we want to build.
8680 - NCOPIES to the number of copies of PERM that we need in order
8681 to build the necessary permute mask vectors.
8683 - NOUTPUTS_PER_MASK to the number of output vectors we want to create
8684 for each permute mask vector. This is only relevant when GSI is
8685 nonnull. */
8686 uint64_t npatterns;
8687 unsigned nelts_per_pattern;
8688 uint64_t ncopies;
8689 unsigned noutputs_per_mask;
8690 if (repeating_p)
8692 /* We need a single permute mask vector that has the form:
8694 { X1, ..., Xn, X1 + n, ..., Xn + n, X1 + 2n, ..., Xn + 2n, ... }
8696 In other words, the original n-element permute in PERM is
8697 "unrolled" to fill a full vector. The stepped vector encoding
8698 that we use for permutes requires 3n elements. */
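/* Editorial example (assumed two-lane node, not part of the original
   source): if PERM swaps the two lanes of a single operand, the
   unrolled mask is { 1, 0, 3, 2, 5, 4, ... } and the stepped encoding
   set up below represents it with NPATTERNS = 2 and
   NELTS_PER_PATTERN = 3, i.e. the six encoded elements 1,0,3,2,5,4.  */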
8699 npatterns = SLP_TREE_LANES (node);
8700 nelts_per_pattern = ncopies = 3;
8701 noutputs_per_mask = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
8703 else
8705 /* Calculate every element of every permute mask vector explicitly,
8706 instead of relying on the pattern described above. */
8707 if (!nunits.is_constant (&npatterns))
8708 return -1;
8709 nelts_per_pattern = ncopies = 1;
8710 if (loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo))
8711 if (!LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&ncopies))
8712 return -1;
8713 noutputs_per_mask = 1;
8715 unsigned olanes = ncopies * SLP_TREE_LANES (node);
8716 gcc_assert (repeating_p || multiple_p (olanes, nunits));
8718 /* Compute the { { SLP operand, vector index}, lane } permutation sequence
8719 from the { SLP operand, scalar lane } permutation as recorded in the
8720 SLP node as an intermediate step. This part should already work
8721 with SLP children with arbitrary number of lanes. */
8722 auto_vec<std::pair<std::pair<unsigned, unsigned>, unsigned> > vperm;
8723 auto_vec<unsigned> active_lane;
8724 vperm.create (olanes);
8725 active_lane.safe_grow_cleared (children.length (), true);
8726 for (unsigned i = 0; i < ncopies; ++i)
8728 for (unsigned pi = 0; pi < perm.length (); ++pi)
8730 std::pair<unsigned, unsigned> p = perm[pi];
8731 tree vtype = SLP_TREE_VECTYPE (children[p.first]);
8732 if (repeating_p)
8733 vperm.quick_push ({{p.first, 0}, p.second + active_lane[p.first]});
8734 else
8736 /* We checked above that the vectors are constant-length. */
8737 unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype).to_constant ();
8738 unsigned vi = (active_lane[p.first] + p.second) / vnunits;
8739 unsigned vl = (active_lane[p.first] + p.second) % vnunits;
8740 vperm.quick_push ({{p.first, vi}, vl});
8743 /* Advance to the next group. */
8744 for (unsigned j = 0; j < children.length (); ++j)
8745 active_lane[j] += SLP_TREE_LANES (children[j]);
8748 if (dump_p)
8750 dump_printf_loc (MSG_NOTE, vect_location,
8751 "vectorizing permutation");
8752 for (unsigned i = 0; i < perm.length (); ++i)
8753 dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
8754 if (repeating_p)
8755 dump_printf (MSG_NOTE, " (repeat %d)\n", SLP_TREE_LANES (node));
8756 dump_printf (MSG_NOTE, "\n");
8757 dump_printf_loc (MSG_NOTE, vect_location, "as");
8758 for (unsigned i = 0; i < vperm.length (); ++i)
8760 if (i != 0
8761 && (repeating_p
8762 ? multiple_p (i, npatterns)
8763 : multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype))))
8764 dump_printf (MSG_NOTE, ",");
8765 dump_printf (MSG_NOTE, " vops%u[%u][%u]",
8766 vperm[i].first.first, vperm[i].first.second,
8767 vperm[i].second);
8769 dump_printf (MSG_NOTE, "\n");
8772 /* We can only handle two-vector permutes; everything else should
8773 be lowered on the SLP level. The following is closely inspired
8774 by vect_transform_slp_perm_load and is supposed to eventually
8775 replace it.
8776 ??? As an intermediate step, do code-gen in the SLP tree
8777 representation somehow? */
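/* Note a VEC_PERM_EXPR has exactly two vector inputs, so each mask
built below may reference at most two distinct { operand, vector }
pairs; wider selections are rejected (see the "requires at least
three vectors" diagnostic below) and have to be lowered earlier. */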
8778 std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U);
8779 std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U);
8780 unsigned int index = 0;
8781 poly_uint64 mask_element;
8782 vec_perm_builder mask;
8783 mask.new_vector (nunits, npatterns, nelts_per_pattern);
8784 unsigned int count = mask.encoded_nelts ();
8785 mask.quick_grow (count);
8786 vec_perm_indices indices;
8787 unsigned nperms = 0;
8788 for (unsigned i = 0; i < vperm.length (); ++i)
8790 mask_element = vperm[i].second;
8791 if (first_vec.first == -1U
8792 || first_vec == vperm[i].first)
8793 first_vec = vperm[i].first;
8794 else if (second_vec.first == -1U
8795 || second_vec == vperm[i].first)
8797 second_vec = vperm[i].first;
8798 mask_element += nunits;
8800 else
8802 if (dump_p)
8803 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8804 "permutation requires at "
8805 "least three vectors\n");
8806 gcc_assert (!gsi);
8807 return -1;
8810 mask[index++] = mask_element;
8812 if (index == count)
8814 indices.new_vector (mask, second_vec.first == -1U ? 1 : 2,
8815 TYPE_VECTOR_SUBPARTS (op_vectype));
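/* The mask is an identity when it selects nunits consecutive elements
starting at a multiple of nunits, i.e. it forwards whole input
vector(s) unchanged and no permute instruction is needed. */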
8816 bool identity_p = (indices.series_p (0, 1, mask[0], 1)
8817 && constant_multiple_p (mask[0], nunits));
8818 machine_mode vmode = TYPE_MODE (vectype);
8819 machine_mode op_vmode = TYPE_MODE (op_vectype);
8820 unsigned HOST_WIDE_INT c;
8821 if ((!identity_p
8822 && !can_vec_perm_const_p (vmode, op_vmode, indices))
8823 || (identity_p
8824 && !known_le (nunits,
8825 TYPE_VECTOR_SUBPARTS (op_vectype))
8826 && (!constant_multiple_p (nunits,
8827 TYPE_VECTOR_SUBPARTS (op_vectype),
8828 &c) || c != 2)))
8830 if (dump_p)
8832 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
8833 vect_location,
8834 "unsupported vect permute { ");
8835 for (i = 0; i < count; ++i)
8837 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
8838 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
8840 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
8842 gcc_assert (!gsi);
8843 return -1;
8846 if (!identity_p)
8847 nperms++;
8848 if (gsi)
8850 if (second_vec.first == -1U)
8851 second_vec = first_vec;
8853 slp_tree
8854 first_node = children[first_vec.first],
8855 second_node = children[second_vec.first];
8857 tree mask_vec = NULL_TREE;
8858 if (!identity_p)
8859 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
8861 for (unsigned int vi = 0; vi < noutputs_per_mask; ++vi)
8863 tree first_def
8864 = vect_get_slp_vect_def (first_node,
8865 first_vec.second + vi);
8866 tree second_def
8867 = vect_get_slp_vect_def (second_node,
8868 second_vec.second + vi);
8869 vect_add_slp_permutation (vinfo, gsi, node, first_def,
8870 second_def, mask_vec, mask[0]);
8874 index = 0;
8875 first_vec = std::make_pair (-1U, -1U);
8876 second_vec = std::make_pair (-1U, -1U);
8880 return nperms;
8883 /* Vectorize the SLP permutations in NODE as specified
8884 in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
8885 child number and lane number.
8886 Interleaving of two two-lane two-child SLP subtrees (not supported):
8887 [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
8888 A blend of two four-lane two-child SLP subtrees:
8889 [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
8890 Highpart of a four-lane one-child SLP subtree (not supported):
8891 [ { 0, 2 }, { 0, 3 } ]
8892 Only a subset of these is currently supported by the code generation below. */
8894 static bool
8895 vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
8896 slp_tree node, stmt_vector_for_cost *cost_vec)
8898 tree vectype = SLP_TREE_VECTYPE (node);
8899 lane_permutation_t &perm = SLP_TREE_LANE_PERMUTATION (node);
8900 int nperms = vectorizable_slp_permutation_1 (vinfo, gsi, node, perm,
8901 SLP_TREE_CHILDREN (node),
8902 dump_enabled_p ());
8903 if (nperms < 0)
8904 return false;
8906 if (!gsi)
8907 record_stmt_cost (cost_vec, nperms, vec_perm, node, vectype, 0, vect_body);
8909 return true;
8912 /* Vectorize SLP NODE. */
8914 static void
8915 vect_schedule_slp_node (vec_info *vinfo,
8916 slp_tree node, slp_instance instance)
8918 gimple_stmt_iterator si;
8919 int i;
8920 slp_tree child;
8922 /* For existing vectors there's nothing to do. */
8923 if (SLP_TREE_DEF_TYPE (node) == vect_external_def
8924 && SLP_TREE_VEC_DEFS (node).exists ())
8925 return;
8927 gcc_assert (SLP_TREE_VEC_DEFS (node).is_empty ());
8929 /* Vectorize externals and constants. */
8930 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
8931 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
8933 /* ??? vectorizable_shift can end up using a scalar operand which is
8934 currently denoted as !SLP_TREE_VECTYPE. No need to vectorize the
8935 node in this case. */
8936 if (!SLP_TREE_VECTYPE (node))
8937 return;
8939 vect_create_constant_vectors (vinfo, node);
8940 return;
8943 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
8945 gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
8946 SLP_TREE_VEC_DEFS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
8948 if (dump_enabled_p ())
8949 dump_printf_loc (MSG_NOTE, vect_location,
8950 "------>vectorizing SLP node starting from: %G",
8951 stmt_info->stmt);
8953 if (STMT_VINFO_DATA_REF (stmt_info)
8954 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
8956 /* Vectorized loads go before the first scalar load to make it
8957 ready early; vectorized stores go before the last scalar
8958 stmt which is where all uses are ready. */
8959 stmt_vec_info last_stmt_info = NULL;
8960 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
8961 last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
8962 else /* DR_IS_WRITE */
8963 last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
8964 si = gsi_for_stmt (last_stmt_info->stmt);
8966 else if ((STMT_VINFO_TYPE (stmt_info) == cycle_phi_info_type
8967 || STMT_VINFO_TYPE (stmt_info) == induc_vec_info_type
8968 || STMT_VINFO_TYPE (stmt_info) == phi_info_type)
8969 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
8971 /* For PHI node vectorization we do not use the insertion iterator. */
8972 si = gsi_none ();
8974 else
8976 /* Emit other stmts after the children's vectorized defs, which is
8977 the earliest possible place. */
8978 gimple *last_stmt = NULL;
8979 bool seen_vector_def = false;
8980 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
8981 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
8983 /* For fold-left reductions we are retaining the scalar
8984 reduction PHI but we still have SLP_TREE_NUMBER_OF_VEC_STMTS
8985 set, so the representation isn't perfect. Resort to the
8986 last scalar def here. */
8987 if (SLP_TREE_VEC_DEFS (child).is_empty ())
8989 gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child))
8990 == cycle_phi_info_type);
8991 gphi *phi = as_a <gphi *>
8992 (vect_find_last_scalar_stmt_in_slp (child)->stmt);
8993 if (!last_stmt
8994 || vect_stmt_dominates_stmt_p (last_stmt, phi))
8995 last_stmt = phi;
8997 /* We are emitting all vectorized stmts in the same place, so
8998 the last def generated for a child is also the last stmt emitted.
8999 ??? Unless we have a load permutation applied and that
9000 happens to re-use an earlier generated load. */
9001 unsigned j;
9002 tree vdef;
9003 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
9005 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
9006 if (!last_stmt
9007 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
9008 last_stmt = vstmt;
9011 else if (!SLP_TREE_VECTYPE (child))
9013 /* Externals without a vector type are used unvectorized, so look at all their scalar defs. */
9014 unsigned j;
9015 tree def;
9016 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child), j, def)
9017 if (TREE_CODE (def) == SSA_NAME
9018 && !SSA_NAME_IS_DEFAULT_DEF (def))
9020 gimple *stmt = SSA_NAME_DEF_STMT (def);
9021 if (!last_stmt
9022 || vect_stmt_dominates_stmt_p (last_stmt, stmt))
9023 last_stmt = stmt;
9026 else
9028 /* For externals we have to look at all defs since their
9029 insertion place is decided per vector. But beware
9030 of pre-existing vectors where we need to make sure
9031 we do not insert before the region boundary. */
9032 if (SLP_TREE_SCALAR_OPS (child).is_empty ()
9033 && !vinfo->lookup_def (SLP_TREE_VEC_DEFS (child)[0]))
9034 seen_vector_def = true;
9035 else
9037 unsigned j;
9038 tree vdef;
9039 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
9040 if (TREE_CODE (vdef) == SSA_NAME
9041 && !SSA_NAME_IS_DEFAULT_DEF (vdef))
9043 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
9044 if (!last_stmt
9045 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
9046 last_stmt = vstmt;
9050 /* This can happen when all children are pre-existing vectors or
9051 constants. */
9052 if (!last_stmt)
9053 last_stmt = vect_find_first_scalar_stmt_in_slp (node)->stmt;
9054 if (!last_stmt)
9056 gcc_assert (seen_vector_def);
9057 si = gsi_after_labels (as_a <bb_vec_info> (vinfo)->bbs[0]);
9059 else if (is_ctrl_altering_stmt (last_stmt))
9061 /* We split regions to vectorize at control-altering stmts
9062 with a definition, so this must be an external which
9063 we can insert at the start of the region. */
9064 si = gsi_after_labels (as_a <bb_vec_info> (vinfo)->bbs[0]);
9066 else if (is_a <bb_vec_info> (vinfo)
9067 && gimple_bb (last_stmt) != gimple_bb (stmt_info->stmt)
9068 && gimple_could_trap_p (stmt_info->stmt))
9070 /* We've constrained possibly trapping operations to all come
9071 from the same basic-block; if vectorized defs would allow earlier
9072 scheduling, still force the vectorized stmts into the original block.
9073 This is only necessary for BB vectorization since for loop vect
9074 all operations are in a single BB and scalar-stmt-based
9075 placement doesn't play well with epilogue vectorization. */
9076 gcc_assert (dominated_by_p (CDI_DOMINATORS,
9077 gimple_bb (stmt_info->stmt),
9078 gimple_bb (last_stmt)));
9079 si = gsi_after_labels (gimple_bb (stmt_info->stmt));
9081 else if (is_a <gphi *> (last_stmt))
9082 si = gsi_after_labels (gimple_bb (last_stmt));
9083 else
9085 si = gsi_for_stmt (last_stmt);
9086 gsi_next (&si);
9090 /* Handle purely internal nodes. */
9091 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
9093 /* ??? The transform kind is stored in STMT_VINFO_TYPE which might
9094 be shared between different SLP nodes (but usually it's the same
9095 operation, apart from the case where the stmt is only there to denote
9096 the actual scalar lane defs ...). So do not call vect_transform_stmt
9097 but open-code it here (partly). */
9098 bool done = vectorizable_slp_permutation (vinfo, &si, node, NULL);
9099 gcc_assert (done);
9100 stmt_vec_info slp_stmt_info;
9101 unsigned int i;
9102 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info)
9103 if (STMT_VINFO_LIVE_P (slp_stmt_info))
9105 done = vectorizable_live_operation (vinfo, slp_stmt_info, node,
9106 instance, i, true, NULL);
9107 gcc_assert (done);
9110 else
9111 vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
9114 /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
9115 For loop vectorization this is done in vectorizable_call, but for SLP
9116 it needs to be deferred until the end of vect_schedule_slp, because multiple
9117 SLP instances may refer to the same scalar stmt. */
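/* E.g. a fully vectorized scalar call lhs = __builtin_sqrtf (x) is
replaced by lhs = 0.0f below, leaving a trivially dead assignment
for later DCE to clean up. */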
9119 static void
9120 vect_remove_slp_scalar_calls (vec_info *vinfo,
9121 slp_tree node, hash_set<slp_tree> &visited)
9123 gimple *new_stmt;
9124 gimple_stmt_iterator gsi;
9125 int i;
9126 slp_tree child;
9127 tree lhs;
9128 stmt_vec_info stmt_info;
9130 if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
9131 return;
9133 if (visited.add (node))
9134 return;
9136 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
9137 vect_remove_slp_scalar_calls (vinfo, child, visited);
9139 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
9141 gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
9142 if (!stmt || gimple_bb (stmt) == NULL)
9143 continue;
9144 if (is_pattern_stmt_p (stmt_info)
9145 || !PURE_SLP_STMT (stmt_info))
9146 continue;
9147 lhs = gimple_call_lhs (stmt);
9148 if (lhs)
9149 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
9150 else
9152 new_stmt = gimple_build_nop ();
9153 unlink_stmt_vdef (stmt_info->stmt);
9155 gsi = gsi_for_stmt (stmt);
9156 vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
9157 if (lhs)
9158 SSA_NAME_DEF_STMT (lhs) = new_stmt;
9162 static void
9163 vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
9165 hash_set<slp_tree> visited;
9166 vect_remove_slp_scalar_calls (vinfo, node, visited);
9169 /* Vectorize the instance root. */
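/* The root is either the scalar assignment from a CONSTRUCTOR
(slp_inst_kind_ctor) or the final stmt of a basic-block reduction
(slp_inst_kind_bb_reduc); it is rewritten to use the vector defs
produced for the SLP instance tree. */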
9171 void
9172 vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
9174 gassign *rstmt = NULL;
9176 if (instance->kind == slp_inst_kind_ctor)
9178 if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1)
9180 tree vect_lhs = SLP_TREE_VEC_DEFS (node)[0];
9181 tree root_lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
9182 if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
9183 TREE_TYPE (vect_lhs)))
9184 vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
9185 vect_lhs);
9186 rstmt = gimple_build_assign (root_lhs, vect_lhs);
9188 else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1)
9190 int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
9191 tree child_def;
9192 int j;
9193 vec<constructor_elt, va_gc> *v;
9194 vec_alloc (v, nelts);
9196 /* A CTOR can handle V16HI composition from VNx8HI so we
9197 do not need to convert vector elements if the types
9198 do not match. */
9199 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (node), j, child_def)
9200 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, child_def);
9201 tree lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
9202 tree rtype
9203 = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmts[0]->stmt));
9204 tree r_constructor = build_constructor (rtype, v);
9205 rstmt = gimple_build_assign (lhs, r_constructor);
9208 else if (instance->kind == slp_inst_kind_bb_reduc)
9210 /* Largely inspired by reduction chain epilogue handling in
9211 vect_create_epilog_for_reduction. */
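/* E.g. for a root like s = a0 + a1 + ... + a7 computed from two V4SI
vector defs, the defs are first added into a single vector, reduced
to a scalar with .REDUC_PLUS and combined with any remaining scalar
defs before replacing the rhs of the root stmt. */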
9212 vec<tree> vec_defs = vNULL;
9213 vect_get_slp_defs (node, &vec_defs);
9214 enum tree_code reduc_code
9215 = gimple_assign_rhs_code (instance->root_stmts[0]->stmt);
9216 /* ??? We actually have to reflect signs somewhere. */
9217 if (reduc_code == MINUS_EXPR)
9218 reduc_code = PLUS_EXPR;
9219 gimple_seq epilogue = NULL;
9220 /* We may end up with more than one vector result; reduce them
9221 to one vector. */
9222 tree vec_def = vec_defs[0];
9223 tree vectype = TREE_TYPE (vec_def);
9224 tree compute_vectype = vectype;
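/* The vectorized reduction reassociates the scalar additions, which
could introduce intermediate signed overflow; when that would be
undefined, do the vector arithmetic in the corresponding unsigned
type and convert the final result back. */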
9225 bool pun_for_overflow_p = (ANY_INTEGRAL_TYPE_P (vectype)
9226 && TYPE_OVERFLOW_UNDEFINED (vectype)
9227 && operation_can_overflow (reduc_code));
9228 if (pun_for_overflow_p)
9230 compute_vectype = unsigned_type_for (vectype);
9231 vec_def = gimple_build (&epilogue, VIEW_CONVERT_EXPR,
9232 compute_vectype, vec_def);
9234 for (unsigned i = 1; i < vec_defs.length (); ++i)
9236 tree def = vec_defs[i];
9237 if (pun_for_overflow_p)
9238 def = gimple_build (&epilogue, VIEW_CONVERT_EXPR,
9239 compute_vectype, def);
9240 vec_def = gimple_build (&epilogue, reduc_code, compute_vectype,
9241 vec_def, def);
9243 vec_defs.release ();
9244 /* ??? Support schemes other than a direct internal fn. */
9245 internal_fn reduc_fn;
9246 if (!reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
9247 || reduc_fn == IFN_LAST)
9248 gcc_unreachable ();
9249 tree scalar_def = gimple_build (&epilogue, as_combined_fn (reduc_fn),
9250 TREE_TYPE (compute_vectype), vec_def);
9251 if (!SLP_INSTANCE_REMAIN_DEFS (instance).is_empty ())
9253 tree rem_def = NULL_TREE;
9254 for (auto def : SLP_INSTANCE_REMAIN_DEFS (instance))
9256 def = gimple_convert (&epilogue, TREE_TYPE (scalar_def), def);
9257 if (!rem_def)
9258 rem_def = def;
9259 else
9260 rem_def = gimple_build (&epilogue, reduc_code,
9261 TREE_TYPE (scalar_def),
9262 rem_def, def);
9264 scalar_def = gimple_build (&epilogue, reduc_code,
9265 TREE_TYPE (scalar_def),
9266 scalar_def, rem_def);
9268 scalar_def = gimple_convert (&epilogue,
9269 TREE_TYPE (vectype), scalar_def);
9270 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
9271 gsi_insert_seq_before (&rgsi, epilogue, GSI_SAME_STMT);
9272 gimple_assign_set_rhs_from_tree (&rgsi, scalar_def);
9273 update_stmt (gsi_stmt (rgsi));
9274 return;
9276 else
9277 gcc_unreachable ();
9279 gcc_assert (rstmt);
9281 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
9282 gsi_replace (&rgsi, rstmt, true);
9285 struct slp_scc_info
9287 bool on_stack;
9288 int dfs;
9289 int lowlink;
9292 /* Schedule the SLP INSTANCE doing a DFS walk and collecting SCCs. */
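/* The dfs/lowlink/on_stack bookkeeping below is Tarjan's SCC algorithm:
an SCC is complete (and gets scheduled) when we return to its root,
the node whose lowlink still equals its own dfs number. */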
9294 static void
9295 vect_schedule_scc (vec_info *vinfo, slp_tree node, slp_instance instance,
9296 hash_map<slp_tree, slp_scc_info> &scc_info,
9297 int &maxdfs, vec<slp_tree> &stack)
9299 bool existed_p;
9300 slp_scc_info *info = &scc_info.get_or_insert (node, &existed_p);
9301 gcc_assert (!existed_p);
9302 info->dfs = maxdfs;
9303 info->lowlink = maxdfs;
9304 maxdfs++;
9306 /* Leaf. */
9307 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
9309 info->on_stack = false;
9310 vect_schedule_slp_node (vinfo, node, instance);
9311 return;
9314 info->on_stack = true;
9315 stack.safe_push (node);
9317 unsigned i;
9318 slp_tree child;
9319 /* DFS recurse. */
9320 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
9322 if (!child)
9323 continue;
9324 slp_scc_info *child_info = scc_info.get (child);
9325 if (!child_info)
9327 vect_schedule_scc (vinfo, child, instance, scc_info, maxdfs, stack);
9328 /* Recursion might have re-allocated the node. */
9329 info = scc_info.get (node);
9330 child_info = scc_info.get (child);
9331 info->lowlink = MIN (info->lowlink, child_info->lowlink);
9333 else if (child_info->on_stack)
9334 info->lowlink = MIN (info->lowlink, child_info->dfs);
9336 if (info->lowlink != info->dfs)
9337 return;
9339 auto_vec<slp_tree, 4> phis_to_fixup;
9341 /* Singleton. */
9342 if (stack.last () == node)
9344 stack.pop ();
9345 info->on_stack = false;
9346 vect_schedule_slp_node (vinfo, node, instance);
9347 if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
9348 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (node)->stmt))
9349 phis_to_fixup.quick_push (node);
9351 else
9353 /* SCC. */
9354 int last_idx = stack.length () - 1;
9355 while (stack[last_idx] != node)
9356 last_idx--;
9357 /* We can break the cycle at PHIs which have at least one child
9358 code generated. Then we could re-start the DFS walk until
9359 all nodes in the SCC are covered (we might have new entries
9360 for only back-reachable nodes). But it's simpler to just
9361 iterate and schedule those that are ready. */
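/* A PHI counts as ready once at least one of its children has been
scheduled (or is the absent backedge child), while any other node is
ready only when none of its children remain on the SCC stack. */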
9362 unsigned todo = stack.length () - last_idx;
9365 for (int idx = stack.length () - 1; idx >= last_idx; --idx)
9367 slp_tree entry = stack[idx];
9368 if (!entry)
9369 continue;
9370 bool phi = (SLP_TREE_CODE (entry) != VEC_PERM_EXPR
9371 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (entry)->stmt));
9372 bool ready = !phi;
9373 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry), i, child)
9374 if (!child)
9376 gcc_assert (phi);
9377 ready = true;
9378 break;
9380 else if (scc_info.get (child)->on_stack)
9382 if (!phi)
9384 ready = false;
9385 break;
9388 else
9390 if (phi)
9392 ready = true;
9393 break;
9396 if (ready)
9398 vect_schedule_slp_node (vinfo, entry, instance);
9399 scc_info.get (entry)->on_stack = false;
9400 stack[idx] = NULL;
9401 todo--;
9402 if (phi)
9403 phis_to_fixup.safe_push (entry);
9407 while (todo != 0);
9409 /* Pop the SCC. */
9410 stack.truncate (last_idx);
9413 /* Now fixup the backedge def of the vectorized PHIs in this SCC. */
9414 slp_tree phi_node;
9415 FOR_EACH_VEC_ELT (phis_to_fixup, i, phi_node)
9417 gphi *phi = as_a <gphi *> (SLP_TREE_REPRESENTATIVE (phi_node)->stmt);
9418 edge_iterator ei;
9419 edge e;
9420 FOR_EACH_EDGE (e, ei, gimple_bb (phi)->preds)
9422 unsigned dest_idx = e->dest_idx;
9423 child = SLP_TREE_CHILDREN (phi_node)[dest_idx];
9424 if (!child || SLP_TREE_DEF_TYPE (child) != vect_internal_def)
9425 continue;
9426 unsigned n = SLP_TREE_VEC_DEFS (phi_node).length ();
9427 /* Simply fill all args. */
9428 if (STMT_VINFO_DEF_TYPE (SLP_TREE_REPRESENTATIVE (phi_node))
9429 != vect_first_order_recurrence)
9430 for (unsigned i = 0; i < n; ++i)
9432 tree phidef = SLP_TREE_VEC_DEFS (phi_node)[i];
9433 gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef));
9434 add_phi_arg (phi, vect_get_slp_vect_def (child, i),
9435 e, gimple_phi_arg_location (phi, dest_idx));
9437 else
9439 /* Unless it is a first-order recurrence, which needs
9440 args filled in for both the PHI node and the permutes. */
9441 gimple *perm
9442 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (phi_node)[0]);
9443 gimple *rphi = SSA_NAME_DEF_STMT (gimple_assign_rhs1 (perm));
9444 add_phi_arg (as_a <gphi *> (rphi),
9445 vect_get_slp_vect_def (child, n - 1),
9446 e, gimple_phi_arg_location (phi, dest_idx));
9447 for (unsigned i = 0; i < n; ++i)
9449 gimple *perm
9450 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (phi_node)[i]);
9451 if (i > 0)
9452 gimple_assign_set_rhs1 (perm,
9453 vect_get_slp_vect_def (child, i - 1));
9454 gimple_assign_set_rhs2 (perm,
9455 vect_get_slp_vect_def (child, i));
9456 update_stmt (perm);
9463 /* Generate vector code for SLP_INSTANCES in the loop/basic block. */
9465 void
9466 vect_schedule_slp (vec_info *vinfo, const vec<slp_instance> &slp_instances)
9468 slp_instance instance;
9469 unsigned int i;
9471 hash_map<slp_tree, slp_scc_info> scc_info;
9472 int maxdfs = 0;
9473 FOR_EACH_VEC_ELT (slp_instances, i, instance)
9475 slp_tree node = SLP_INSTANCE_TREE (instance);
9476 if (dump_enabled_p ())
9478 dump_printf_loc (MSG_NOTE, vect_location,
9479 "Vectorizing SLP tree:\n");
9480 /* ??? Dump all? */
9481 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
9482 dump_printf_loc (MSG_NOTE, vect_location, "Root stmt: %G",
9483 SLP_INSTANCE_ROOT_STMTS (instance)[0]->stmt);
9484 vect_print_slp_graph (MSG_NOTE, vect_location,
9485 SLP_INSTANCE_TREE (instance));
9487 /* Schedule the tree of INSTANCE, scheduling SCCs in a way that
9488 a PHI is the node breaking the cycle. */
9489 auto_vec<slp_tree> stack;
9490 if (!scc_info.get (node))
9491 vect_schedule_scc (vinfo, node, instance, scc_info, maxdfs, stack);
9493 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
9494 vectorize_slp_instance_root_stmt (node, instance);
9496 if (dump_enabled_p ())
9497 dump_printf_loc (MSG_NOTE, vect_location,
9498 "vectorizing stmts using SLP.\n");
9501 FOR_EACH_VEC_ELT (slp_instances, i, instance)
9503 slp_tree root = SLP_INSTANCE_TREE (instance);
9504 stmt_vec_info store_info;
9505 unsigned int j;
9507 /* Remove scalar call stmts. Do not do this for basic-block
9508 vectorization as not all uses may be vectorized.
9509 ??? Why should this be necessary? DCE should be able to
9510 remove the stmts itself.
9511 ??? For BB vectorization we can as well remove scalar
9512 stmts starting from the SLP tree root if they have no
9513 uses. */
9514 if (is_a <loop_vec_info> (vinfo))
9515 vect_remove_slp_scalar_calls (vinfo, root);
9517 /* Remove the vectorized stores' original scalar stmts. */
9518 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
9520 if (!STMT_VINFO_DATA_REF (store_info)
9521 || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info)))
9522 break;
9524 store_info = vect_orig_stmt (store_info);
9525 /* Free the attached stmt_vec_info and remove the stmt. */
9526 vinfo->remove_stmt (store_info);
9528 /* Invalidate SLP_TREE_REPRESENTATIVE in case we released it,
9529 so we do not crash in vect_free_slp_tree later. */
9530 if (SLP_TREE_REPRESENTATIVE (root) == store_info)
9531 SLP_TREE_REPRESENTATIVE (root) = NULL;