c++: retval dtor on rethrow [PR112301]
[official-gcc.git] / gcc / tree-vect-slp.cc
blob43d742e3c92e766a14370cfd7d816a0d7f08a332
1 /* SLP - Basic Block Vectorization
2 Copyright (C) 2007-2023 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #define INCLUDE_ALGORITHM
24 #include "system.h"
25 #include "coretypes.h"
26 #include "backend.h"
27 #include "target.h"
28 #include "rtl.h"
29 #include "tree.h"
30 #include "gimple.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "insn-config.h"
35 #include "recog.h" /* FIXME: for insn_data */
36 #include "fold-const.h"
37 #include "stor-layout.h"
38 #include "gimple-iterator.h"
39 #include "cfgloop.h"
40 #include "tree-vectorizer.h"
41 #include "langhooks.h"
42 #include "gimple-walk.h"
43 #include "dbgcnt.h"
44 #include "tree-vector-builder.h"
45 #include "vec-perm-indices.h"
46 #include "gimple-fold.h"
47 #include "internal-fn.h"
48 #include "dump-context.h"
49 #include "cfganal.h"
50 #include "tree-eh.h"
51 #include "tree-cfg.h"
52 #include "alloc-pool.h"
53 #include "sreal.h"
54 #include "predict.h"
56 static bool vect_transform_slp_perm_load_1 (vec_info *, slp_tree,
57 load_permutation_t &,
58 const vec<tree> &,
59 gimple_stmt_iterator *,
60 poly_uint64, bool, bool,
61 unsigned *,
62 unsigned * = nullptr,
63 bool = false);
64 static int vectorizable_slp_permutation_1 (vec_info *, gimple_stmt_iterator *,
65 slp_tree, lane_permutation_t &,
66 vec<slp_tree> &, bool);
67 static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
68 slp_tree, stmt_vector_for_cost *);
69 static void vect_print_slp_tree (dump_flags_t, dump_location_t, slp_tree);
71 static object_allocator<_slp_tree> *slp_tree_pool;
72 static slp_tree slp_first_node;
74 void
75 vect_slp_init (void)
77 slp_tree_pool = new object_allocator<_slp_tree> ("SLP nodes");
80 void
81 vect_slp_fini (void)
83 while (slp_first_node)
84 delete slp_first_node;
85 delete slp_tree_pool;
86 slp_tree_pool = NULL;
89 void *
90 _slp_tree::operator new (size_t n)
92 gcc_assert (n == sizeof (_slp_tree));
93 return slp_tree_pool->allocate_raw ();
96 void
97 _slp_tree::operator delete (void *node, size_t n)
99 gcc_assert (n == sizeof (_slp_tree));
100 slp_tree_pool->remove_raw (node);
104 /* Initialize a SLP node. */
106 _slp_tree::_slp_tree ()
108 this->prev_node = NULL;
109 if (slp_first_node)
110 slp_first_node->prev_node = this;
111 this->next_node = slp_first_node;
112 slp_first_node = this;
113 SLP_TREE_SCALAR_STMTS (this) = vNULL;
114 SLP_TREE_SCALAR_OPS (this) = vNULL;
115 SLP_TREE_VEC_DEFS (this) = vNULL;
116 SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
117 SLP_TREE_CHILDREN (this) = vNULL;
118 SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
119 SLP_TREE_LANE_PERMUTATION (this) = vNULL;
120 SLP_TREE_SIMD_CLONE_INFO (this) = vNULL;
121 SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
122 SLP_TREE_CODE (this) = ERROR_MARK;
123 SLP_TREE_VECTYPE (this) = NULL_TREE;
124 SLP_TREE_REPRESENTATIVE (this) = NULL;
125 SLP_TREE_REF_COUNT (this) = 1;
126 this->failed = NULL;
127 this->max_nunits = 1;
128 this->lanes = 0;
131 /* Tear down a SLP node. */
133 _slp_tree::~_slp_tree ()
135 if (this->prev_node)
136 this->prev_node->next_node = this->next_node;
137 else
138 slp_first_node = this->next_node;
139 if (this->next_node)
140 this->next_node->prev_node = this->prev_node;
141 SLP_TREE_CHILDREN (this).release ();
142 SLP_TREE_SCALAR_STMTS (this).release ();
143 SLP_TREE_SCALAR_OPS (this).release ();
144 SLP_TREE_VEC_DEFS (this).release ();
145 SLP_TREE_LOAD_PERMUTATION (this).release ();
146 SLP_TREE_LANE_PERMUTATION (this).release ();
147 SLP_TREE_SIMD_CLONE_INFO (this).release ();
148 if (this->failed)
149 free (failed);
152 /* Push the single SSA definition in DEF to the vector of vector defs. */
154 void
155 _slp_tree::push_vec_def (gimple *def)
157 if (gphi *phi = dyn_cast <gphi *> (def))
158 vec_defs.quick_push (gimple_phi_result (phi));
159 else
161 def_operand_p defop = single_ssa_def_operand (def, SSA_OP_ALL_DEFS);
162 vec_defs.quick_push (get_def_from_ptr (defop));
166 /* Recursively free the memory allocated for the SLP tree rooted at NODE. */
168 void
169 vect_free_slp_tree (slp_tree node)
171 int i;
172 slp_tree child;
174 if (--SLP_TREE_REF_COUNT (node) != 0)
175 return;
177 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
178 if (child)
179 vect_free_slp_tree (child);
181 /* If the node defines any SLP only patterns then those patterns are no
182 longer valid and should be removed. */
183 stmt_vec_info rep_stmt_info = SLP_TREE_REPRESENTATIVE (node);
184 if (rep_stmt_info && STMT_VINFO_SLP_VECT_ONLY_PATTERN (rep_stmt_info))
186 stmt_vec_info stmt_info = vect_orig_stmt (rep_stmt_info);
187 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
188 STMT_SLP_TYPE (stmt_info) = STMT_SLP_TYPE (rep_stmt_info);
191 delete node;
194 /* Return a location suitable for dumpings related to the SLP instance. */
196 dump_user_location_t
197 _slp_instance::location () const
199 if (!root_stmts.is_empty ())
200 return root_stmts[0]->stmt;
201 else
202 return SLP_TREE_SCALAR_STMTS (root)[0]->stmt;
206 /* Free the memory allocated for the SLP instance. */
208 void
209 vect_free_slp_instance (slp_instance instance)
211 vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
212 SLP_INSTANCE_LOADS (instance).release ();
213 SLP_INSTANCE_ROOT_STMTS (instance).release ();
214 SLP_INSTANCE_REMAIN_DEFS (instance).release ();
215 instance->subgraph_entries.release ();
216 instance->cost_vec.release ();
217 free (instance);
221 /* Create an SLP node for SCALAR_STMTS. */
223 slp_tree
224 vect_create_new_slp_node (unsigned nops, tree_code code)
226 slp_tree node = new _slp_tree;
227 SLP_TREE_SCALAR_STMTS (node) = vNULL;
228 SLP_TREE_CHILDREN (node).create (nops);
229 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
230 SLP_TREE_CODE (node) = code;
231 return node;
233 /* Create an SLP node for SCALAR_STMTS. */
235 static slp_tree
236 vect_create_new_slp_node (slp_tree node,
237 vec<stmt_vec_info> scalar_stmts, unsigned nops)
239 SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
240 SLP_TREE_CHILDREN (node).create (nops);
241 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
242 SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
243 SLP_TREE_LANES (node) = scalar_stmts.length ();
244 return node;
247 /* Create an SLP node for SCALAR_STMTS. */
249 static slp_tree
250 vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
252 return vect_create_new_slp_node (new _slp_tree, scalar_stmts, nops);
255 /* Create an SLP node for OPS. */
257 static slp_tree
258 vect_create_new_slp_node (slp_tree node, vec<tree> ops)
260 SLP_TREE_SCALAR_OPS (node) = ops;
261 SLP_TREE_DEF_TYPE (node) = vect_external_def;
262 SLP_TREE_LANES (node) = ops.length ();
263 return node;
266 /* Create an SLP node for OPS. */
268 static slp_tree
269 vect_create_new_slp_node (vec<tree> ops)
271 return vect_create_new_slp_node (new _slp_tree, ops);
275 /* This structure is used in creation of an SLP tree. Each instance
276 corresponds to the same operand in a group of scalar stmts in an SLP
277 node. */
278 typedef struct _slp_oprnd_info
280 /* Def-stmts for the operands. */
281 vec<stmt_vec_info> def_stmts;
282 /* Operands. */
283 vec<tree> ops;
284 /* Information about the first statement, its vector def-type, type, the
285 operand itself in case it's constant, and an indication if it's a pattern
286 stmt and gather/scatter info. */
287 tree first_op_type;
288 enum vect_def_type first_dt;
289 bool any_pattern;
290 bool first_gs_p;
291 gather_scatter_info first_gs_info;
292 } *slp_oprnd_info;
295 /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
296 operand. */
297 static vec<slp_oprnd_info>
298 vect_create_oprnd_info (int nops, int group_size)
300 int i;
301 slp_oprnd_info oprnd_info;
302 vec<slp_oprnd_info> oprnds_info;
304 oprnds_info.create (nops);
305 for (i = 0; i < nops; i++)
307 oprnd_info = XNEW (struct _slp_oprnd_info);
308 oprnd_info->def_stmts.create (group_size);
309 oprnd_info->ops.create (group_size);
310 oprnd_info->first_dt = vect_uninitialized_def;
311 oprnd_info->first_op_type = NULL_TREE;
312 oprnd_info->any_pattern = false;
313 oprnd_info->first_gs_p = false;
314 oprnds_info.quick_push (oprnd_info);
317 return oprnds_info;
321 /* Free operands info. */
323 static void
324 vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
326 int i;
327 slp_oprnd_info oprnd_info;
329 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
331 oprnd_info->def_stmts.release ();
332 oprnd_info->ops.release ();
333 XDELETE (oprnd_info);
336 oprnds_info.release ();
339 /* Return the execution frequency of NODE (so that a higher value indicates
340 a "more important" node when optimizing for speed). */
342 static sreal
343 vect_slp_node_weight (slp_tree node)
345 stmt_vec_info stmt_info = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (node));
346 basic_block bb = gimple_bb (stmt_info->stmt);
347 return bb->count.to_sreal_scale (ENTRY_BLOCK_PTR_FOR_FN (cfun)->count);
350 /* Return true if STMTS contains a pattern statement. */
352 static bool
353 vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
355 stmt_vec_info stmt_info;
356 unsigned int i;
357 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
358 if (is_pattern_stmt_p (stmt_info))
359 return true;
360 return false;
363 /* Return true when all lanes in the external or constant NODE have
364 the same value. */
366 static bool
367 vect_slp_tree_uniform_p (slp_tree node)
369 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
370 || SLP_TREE_DEF_TYPE (node) == vect_external_def);
372 /* Pre-exsting vectors. */
373 if (SLP_TREE_SCALAR_OPS (node).is_empty ())
374 return false;
376 unsigned i;
377 tree op, first = NULL_TREE;
378 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
379 if (!first)
380 first = op;
381 else if (!operand_equal_p (first, op, 0))
382 return false;
384 return true;
387 /* Find the place of the data-ref in STMT_INFO in the interleaving chain
388 that starts from FIRST_STMT_INFO. Return -1 if the data-ref is not a part
389 of the chain. */
392 vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
393 stmt_vec_info first_stmt_info)
395 stmt_vec_info next_stmt_info = first_stmt_info;
396 int result = 0;
398 if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
399 return -1;
403 if (next_stmt_info == stmt_info)
404 return result;
405 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
406 if (next_stmt_info)
407 result += DR_GROUP_GAP (next_stmt_info);
409 while (next_stmt_info);
411 return -1;
414 /* Check whether it is possible to load COUNT elements of type ELT_TYPE
415 using the method implemented by duplicate_and_interleave. Return true
416 if so, returning the number of intermediate vectors in *NVECTORS_OUT
417 (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
418 (if nonnull). */
420 bool
421 can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
422 tree elt_type, unsigned int *nvectors_out,
423 tree *vector_type_out,
424 tree *permutes)
426 tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
427 if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
428 return false;
430 machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
431 poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
432 unsigned int nvectors = 1;
433 for (;;)
435 scalar_int_mode int_mode;
436 poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
437 if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
439 /* Get the natural vector type for this SLP group size. */
440 tree int_type = build_nonstandard_integer_type
441 (GET_MODE_BITSIZE (int_mode), 1);
442 tree vector_type
443 = get_vectype_for_scalar_type (vinfo, int_type, count);
444 poly_int64 half_nelts;
445 if (vector_type
446 && VECTOR_MODE_P (TYPE_MODE (vector_type))
447 && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
448 GET_MODE_SIZE (base_vector_mode))
449 && multiple_p (GET_MODE_NUNITS (TYPE_MODE (vector_type)),
450 2, &half_nelts))
452 /* Try fusing consecutive sequences of COUNT / NVECTORS elements
453 together into elements of type INT_TYPE and using the result
454 to build NVECTORS vectors. */
455 poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
456 vec_perm_builder sel1 (nelts, 2, 3);
457 vec_perm_builder sel2 (nelts, 2, 3);
459 for (unsigned int i = 0; i < 3; ++i)
461 sel1.quick_push (i);
462 sel1.quick_push (i + nelts);
463 sel2.quick_push (half_nelts + i);
464 sel2.quick_push (half_nelts + i + nelts);
466 vec_perm_indices indices1 (sel1, 2, nelts);
467 vec_perm_indices indices2 (sel2, 2, nelts);
468 machine_mode vmode = TYPE_MODE (vector_type);
469 if (can_vec_perm_const_p (vmode, vmode, indices1)
470 && can_vec_perm_const_p (vmode, vmode, indices2))
472 if (nvectors_out)
473 *nvectors_out = nvectors;
474 if (vector_type_out)
475 *vector_type_out = vector_type;
476 if (permutes)
478 permutes[0] = vect_gen_perm_mask_checked (vector_type,
479 indices1);
480 permutes[1] = vect_gen_perm_mask_checked (vector_type,
481 indices2);
483 return true;
487 if (!multiple_p (elt_bytes, 2, &elt_bytes))
488 return false;
489 nvectors *= 2;
493 /* Return true if DTA and DTB match. */
495 static bool
496 vect_def_types_match (enum vect_def_type dta, enum vect_def_type dtb)
498 return (dta == dtb
499 || ((dta == vect_external_def || dta == vect_constant_def)
500 && (dtb == vect_external_def || dtb == vect_constant_def)));
503 static const int cond_expr_maps[3][5] = {
504 { 4, -1, -2, 1, 2 },
505 { 4, -2, -1, 1, 2 },
506 { 4, -1, -2, 2, 1 }
508 static const int arg1_map[] = { 1, 1 };
509 static const int arg2_map[] = { 1, 2 };
510 static const int arg1_arg4_map[] = { 2, 1, 4 };
511 static const int arg3_arg2_map[] = { 2, 3, 2 };
512 static const int op1_op0_map[] = { 2, 1, 0 };
513 static const int off_map[] = { 1, -3 };
514 static const int off_op0_map[] = { 2, -3, 0 };
515 static const int off_arg2_map[] = { 2, -3, 2 };
516 static const int off_arg3_arg2_map[] = { 3, -3, 3, 2 };
517 static const int mask_call_maps[6][7] = {
518 { 1, 1, },
519 { 2, 1, 2, },
520 { 3, 1, 2, 3, },
521 { 4, 1, 2, 3, 4, },
522 { 5, 1, 2, 3, 4, 5, },
523 { 6, 1, 2, 3, 4, 5, 6 },
526 /* For most SLP statements, there is a one-to-one mapping between
527 gimple arguments and child nodes. If that is not true for STMT,
528 return an array that contains:
530 - the number of child nodes, followed by
531 - for each child node, the index of the argument associated with that node.
532 The special index -1 is the first operand of an embedded comparison and
533 the special index -2 is the second operand of an embedded comparison.
534 The special indes -3 is the offset of a gather as analyzed by
535 vect_check_gather_scatter.
537 SWAP is as for vect_get_and_check_slp_defs. */
539 static const int *
540 vect_get_operand_map (const gimple *stmt, bool gather_scatter_p = false,
541 unsigned char swap = 0)
543 if (auto assign = dyn_cast<const gassign *> (stmt))
545 if (gimple_assign_rhs_code (assign) == COND_EXPR
546 && COMPARISON_CLASS_P (gimple_assign_rhs1 (assign)))
547 return cond_expr_maps[swap];
548 if (TREE_CODE_CLASS (gimple_assign_rhs_code (assign)) == tcc_comparison
549 && swap)
550 return op1_op0_map;
551 if (gather_scatter_p)
552 return gimple_vdef (stmt) ? off_op0_map : off_map;
554 gcc_assert (!swap);
555 if (auto call = dyn_cast<const gcall *> (stmt))
557 if (gimple_call_internal_p (call))
558 switch (gimple_call_internal_fn (call))
560 case IFN_MASK_LOAD:
561 return gather_scatter_p ? off_arg2_map : arg2_map;
563 case IFN_GATHER_LOAD:
564 return arg1_map;
566 case IFN_MASK_GATHER_LOAD:
567 case IFN_MASK_LEN_GATHER_LOAD:
568 return arg1_arg4_map;
570 case IFN_MASK_STORE:
571 return gather_scatter_p ? off_arg3_arg2_map : arg3_arg2_map;
573 case IFN_MASK_CALL:
575 unsigned nargs = gimple_call_num_args (call);
576 if (nargs >= 2 && nargs <= 7)
577 return mask_call_maps[nargs-2];
578 else
579 return nullptr;
582 default:
583 break;
586 return nullptr;
589 /* Return the SLP node child index for operand OP of STMT. */
592 vect_slp_child_index_for_operand (const gimple *stmt, int op)
594 const int *opmap = vect_get_operand_map (stmt);
595 if (!opmap)
596 return op;
597 for (int i = 1; i < 1 + opmap[0]; ++i)
598 if (opmap[i] == op)
599 return i - 1;
600 gcc_unreachable ();
603 /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
604 they are of a valid type and that they match the defs of the first stmt of
605 the SLP group (stored in OPRNDS_INFO). This function tries to match stmts
606 by swapping operands of STMTS[STMT_NUM] when possible. Non-zero SWAP
607 indicates swap is required for cond_expr stmts. Specifically, SWAP
608 is 1 if STMT is cond and operands of comparison need to be swapped;
609 SWAP is 2 if STMT is cond and code of comparison needs to be inverted.
611 If there was a fatal error return -1; if the error could be corrected by
612 swapping operands of father node of this one, return 1; if everything is
613 ok return 0. */
614 static int
615 vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap,
616 bool *skip_args,
617 vec<stmt_vec_info> stmts, unsigned stmt_num,
618 vec<slp_oprnd_info> *oprnds_info)
620 stmt_vec_info stmt_info = stmts[stmt_num];
621 tree oprnd;
622 unsigned int i, number_of_oprnds;
623 enum vect_def_type dt = vect_uninitialized_def;
624 slp_oprnd_info oprnd_info;
625 gather_scatter_info gs_info;
626 unsigned int gs_op = -1u;
627 unsigned int commutative_op = -1U;
628 bool first = stmt_num == 0;
630 if (!is_a<gcall *> (stmt_info->stmt)
631 && !is_a<gassign *> (stmt_info->stmt)
632 && !is_a<gphi *> (stmt_info->stmt))
633 return -1;
635 number_of_oprnds = gimple_num_args (stmt_info->stmt);
636 const int *map
637 = vect_get_operand_map (stmt_info->stmt,
638 STMT_VINFO_GATHER_SCATTER_P (stmt_info), swap);
639 if (map)
640 number_of_oprnds = *map++;
641 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
643 if (gimple_call_internal_p (stmt))
645 internal_fn ifn = gimple_call_internal_fn (stmt);
646 commutative_op = first_commutative_argument (ifn);
649 else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
651 if (commutative_tree_code (gimple_assign_rhs_code (stmt)))
652 commutative_op = 0;
655 bool swapped = (swap != 0);
656 bool backedge = false;
657 enum vect_def_type *dts = XALLOCAVEC (enum vect_def_type, number_of_oprnds);
658 for (i = 0; i < number_of_oprnds; i++)
660 oprnd_info = (*oprnds_info)[i];
661 int opno = map ? map[i] : int (i);
662 if (opno == -3)
664 gcc_assert (STMT_VINFO_GATHER_SCATTER_P (stmt_info));
665 if (!is_a <loop_vec_info> (vinfo)
666 || !vect_check_gather_scatter (stmt_info,
667 as_a <loop_vec_info> (vinfo),
668 first ? &oprnd_info->first_gs_info
669 : &gs_info))
670 return -1;
672 if (first)
674 oprnd_info->first_gs_p = true;
675 oprnd = oprnd_info->first_gs_info.offset;
677 else
679 gs_op = i;
680 oprnd = gs_info.offset;
683 else if (opno < 0)
684 oprnd = TREE_OPERAND (gimple_arg (stmt_info->stmt, 0), -1 - opno);
685 else
687 oprnd = gimple_arg (stmt_info->stmt, opno);
688 if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
690 edge e = gimple_phi_arg_edge (stmt, opno);
691 backedge = (is_a <bb_vec_info> (vinfo)
692 ? e->flags & EDGE_DFS_BACK
693 : dominated_by_p (CDI_DOMINATORS, e->src,
694 gimple_bb (stmt_info->stmt)));
697 if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
698 oprnd = TREE_OPERAND (oprnd, 0);
700 stmt_vec_info def_stmt_info;
701 if (!vect_is_simple_use (oprnd, vinfo, &dts[i], &def_stmt_info))
703 if (dump_enabled_p ())
704 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
705 "Build SLP failed: can't analyze def for %T\n",
706 oprnd);
708 return -1;
711 if (skip_args[i])
713 oprnd_info->def_stmts.quick_push (NULL);
714 oprnd_info->ops.quick_push (NULL_TREE);
715 oprnd_info->first_dt = vect_uninitialized_def;
716 continue;
719 oprnd_info->def_stmts.quick_push (def_stmt_info);
720 oprnd_info->ops.quick_push (oprnd);
722 if (def_stmt_info
723 && is_pattern_stmt_p (def_stmt_info))
725 if (STMT_VINFO_RELATED_STMT (vect_orig_stmt (def_stmt_info))
726 != def_stmt_info)
727 oprnd_info->any_pattern = true;
728 else
729 /* If we promote this to external use the original stmt def. */
730 oprnd_info->ops.last ()
731 = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
734 /* If there's a extern def on a backedge make sure we can
735 code-generate at the region start.
736 ??? This is another case that could be fixed by adjusting
737 how we split the function but at the moment we'd have conflicting
738 goals there. */
739 if (backedge
740 && dts[i] == vect_external_def
741 && is_a <bb_vec_info> (vinfo)
742 && TREE_CODE (oprnd) == SSA_NAME
743 && !SSA_NAME_IS_DEFAULT_DEF (oprnd)
744 && !dominated_by_p (CDI_DOMINATORS,
745 as_a <bb_vec_info> (vinfo)->bbs[0],
746 gimple_bb (SSA_NAME_DEF_STMT (oprnd))))
748 if (dump_enabled_p ())
749 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
750 "Build SLP failed: extern def %T only defined "
751 "on backedge\n", oprnd);
752 return -1;
755 if (first)
757 tree type = TREE_TYPE (oprnd);
758 dt = dts[i];
759 if ((dt == vect_constant_def
760 || dt == vect_external_def)
761 && !GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
762 && (TREE_CODE (type) == BOOLEAN_TYPE
763 || !can_duplicate_and_interleave_p (vinfo, stmts.length (),
764 type)))
766 if (dump_enabled_p ())
767 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
768 "Build SLP failed: invalid type of def "
769 "for variable-length SLP %T\n", oprnd);
770 return -1;
773 /* For the swapping logic below force vect_reduction_def
774 for the reduction op in a SLP reduction group. */
775 if (!STMT_VINFO_DATA_REF (stmt_info)
776 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
777 && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
778 && def_stmt_info)
779 dts[i] = dt = vect_reduction_def;
781 /* Check the types of the definition. */
782 switch (dt)
784 case vect_external_def:
785 case vect_constant_def:
786 case vect_internal_def:
787 case vect_reduction_def:
788 case vect_induction_def:
789 case vect_nested_cycle:
790 case vect_first_order_recurrence:
791 break;
793 default:
794 /* FORNOW: Not supported. */
795 if (dump_enabled_p ())
796 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
797 "Build SLP failed: illegal type of def %T\n",
798 oprnd);
799 return -1;
802 oprnd_info->first_dt = dt;
803 oprnd_info->first_op_type = type;
806 if (first)
807 return 0;
809 /* Now match the operand definition types to that of the first stmt. */
810 for (i = 0; i < number_of_oprnds;)
812 if (skip_args[i])
814 ++i;
815 continue;
818 oprnd_info = (*oprnds_info)[i];
819 dt = dts[i];
820 stmt_vec_info def_stmt_info = oprnd_info->def_stmts[stmt_num];
821 oprnd = oprnd_info->ops[stmt_num];
822 tree type = TREE_TYPE (oprnd);
824 if (!types_compatible_p (oprnd_info->first_op_type, type))
826 if (dump_enabled_p ())
827 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
828 "Build SLP failed: different operand types\n");
829 return 1;
832 if ((gs_op == i) != oprnd_info->first_gs_p)
834 if (dump_enabled_p ())
835 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
836 "Build SLP failed: mixed gather and non-gather\n");
837 return 1;
839 else if (gs_op == i)
841 if (!operand_equal_p (oprnd_info->first_gs_info.base,
842 gs_info.base))
844 if (dump_enabled_p ())
845 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
846 "Build SLP failed: different gather base\n");
847 return 1;
849 if (oprnd_info->first_gs_info.scale != gs_info.scale)
851 if (dump_enabled_p ())
852 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
853 "Build SLP failed: different gather scale\n");
854 return 1;
858 /* Not first stmt of the group, check that the def-stmt/s match
859 the def-stmt/s of the first stmt. Allow different definition
860 types for reduction chains: the first stmt must be a
861 vect_reduction_def (a phi node), and the rest
862 end in the reduction chain. */
863 if ((!vect_def_types_match (oprnd_info->first_dt, dt)
864 && !(oprnd_info->first_dt == vect_reduction_def
865 && !STMT_VINFO_DATA_REF (stmt_info)
866 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
867 && def_stmt_info
868 && !STMT_VINFO_DATA_REF (def_stmt_info)
869 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
870 == REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
871 || (!STMT_VINFO_DATA_REF (stmt_info)
872 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
873 && ((!def_stmt_info
874 || STMT_VINFO_DATA_REF (def_stmt_info)
875 || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
876 != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
877 != (oprnd_info->first_dt != vect_reduction_def))))
879 /* Try swapping operands if we got a mismatch. For BB
880 vectorization only in case it will clearly improve things. */
881 if (i == commutative_op && !swapped
882 && (!is_a <bb_vec_info> (vinfo)
883 || (!vect_def_types_match ((*oprnds_info)[i+1]->first_dt,
884 dts[i+1])
885 && (vect_def_types_match (oprnd_info->first_dt, dts[i+1])
886 || vect_def_types_match
887 ((*oprnds_info)[i+1]->first_dt, dts[i])))))
889 if (dump_enabled_p ())
890 dump_printf_loc (MSG_NOTE, vect_location,
891 "trying swapped operands\n");
892 std::swap (dts[i], dts[i+1]);
893 std::swap ((*oprnds_info)[i]->def_stmts[stmt_num],
894 (*oprnds_info)[i+1]->def_stmts[stmt_num]);
895 std::swap ((*oprnds_info)[i]->ops[stmt_num],
896 (*oprnds_info)[i+1]->ops[stmt_num]);
897 swapped = true;
898 continue;
901 if (is_a <bb_vec_info> (vinfo)
902 && !oprnd_info->any_pattern)
904 /* Now for commutative ops we should see whether we can
905 make the other operand matching. */
906 if (dump_enabled_p ())
907 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
908 "treating operand as external\n");
909 oprnd_info->first_dt = dt = vect_external_def;
911 else
913 if (dump_enabled_p ())
914 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
915 "Build SLP failed: different types\n");
916 return 1;
920 /* Make sure to demote the overall operand to external. */
921 if (dt == vect_external_def)
922 oprnd_info->first_dt = vect_external_def;
923 /* For a SLP reduction chain we want to duplicate the reduction to
924 each of the chain members. That gets us a sane SLP graph (still
925 the stmts are not 100% correct wrt the initial values). */
926 else if ((dt == vect_internal_def
927 || dt == vect_reduction_def)
928 && oprnd_info->first_dt == vect_reduction_def
929 && !STMT_VINFO_DATA_REF (stmt_info)
930 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
931 && !STMT_VINFO_DATA_REF (def_stmt_info)
932 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
933 == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
935 oprnd_info->def_stmts[stmt_num] = oprnd_info->def_stmts[0];
936 oprnd_info->ops[stmt_num] = oprnd_info->ops[0];
939 ++i;
942 /* Swap operands. */
943 if (swapped)
945 if (dump_enabled_p ())
946 dump_printf_loc (MSG_NOTE, vect_location,
947 "swapped operands to match def types in %G",
948 stmt_info->stmt);
951 return 0;
954 /* Return true if call statements CALL1 and CALL2 are similar enough
955 to be combined into the same SLP group. */
957 bool
958 compatible_calls_p (gcall *call1, gcall *call2)
960 unsigned int nargs = gimple_call_num_args (call1);
961 if (nargs != gimple_call_num_args (call2))
962 return false;
964 if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
965 return false;
967 if (gimple_call_internal_p (call1))
969 if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
970 TREE_TYPE (gimple_call_lhs (call2))))
971 return false;
972 for (unsigned int i = 0; i < nargs; ++i)
973 if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
974 TREE_TYPE (gimple_call_arg (call2, i))))
975 return false;
977 else
979 if (!operand_equal_p (gimple_call_fn (call1),
980 gimple_call_fn (call2), 0))
981 return false;
983 if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
984 return false;
987 /* Check that any unvectorized arguments are equal. */
988 if (const int *map = vect_get_operand_map (call1))
990 unsigned int nkept = *map++;
991 unsigned int mapi = 0;
992 for (unsigned int i = 0; i < nargs; ++i)
993 if (mapi < nkept && map[mapi] == int (i))
994 mapi += 1;
995 else if (!operand_equal_p (gimple_call_arg (call1, i),
996 gimple_call_arg (call2, i)))
997 return false;
1000 return true;
1003 /* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
1004 caller's attempt to find the vector type in STMT_INFO with the narrowest
1005 element type. Return true if VECTYPE is nonnull and if it is valid
1006 for STMT_INFO. When returning true, update MAX_NUNITS to reflect the
1007 number of units in VECTYPE. GROUP_SIZE and MAX_NUNITS are as for
1008 vect_build_slp_tree. */
1010 static bool
1011 vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
1012 unsigned int group_size,
1013 tree vectype, poly_uint64 *max_nunits)
1015 if (!vectype)
1017 if (dump_enabled_p ())
1018 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1019 "Build SLP failed: unsupported data-type in %G\n",
1020 stmt_info->stmt);
1021 /* Fatal mismatch. */
1022 return false;
1025 /* If populating the vector type requires unrolling then fail
1026 before adjusting *max_nunits for basic-block vectorization. */
1027 if (is_a <bb_vec_info> (vinfo)
1028 && !multiple_p (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
1030 if (dump_enabled_p ())
1031 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1032 "Build SLP failed: unrolling required "
1033 "in basic block SLP\n");
1034 /* Fatal mismatch. */
1035 return false;
1038 /* In case of multiple types we need to detect the smallest type. */
1039 vect_update_max_nunits (max_nunits, vectype);
1040 return true;
1043 /* Verify if the scalar stmts STMTS are isomorphic, require data
1044 permutation or are of unsupported types of operation. Return
1045 true if they are, otherwise return false and indicate in *MATCHES
1046 which stmts are not isomorphic to the first one. If MATCHES[0]
1047 is false then this indicates the comparison could not be
1048 carried out or the stmts will never be vectorized by SLP.
1050 Note COND_EXPR is possibly isomorphic to another one after swapping its
1051 operands. Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
1052 the first stmt by swapping the two operands of comparison; set SWAP[i]
1053 to 2 if stmt I is isormorphic to the first stmt by inverting the code
1054 of comparison. Take A1 >= B1 ? X1 : Y1 as an exmple, it can be swapped
1055 to (B1 <= A1 ? X1 : Y1); or be inverted to (A1 < B1) ? Y1 : X1. */
1057 static bool
1058 vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
1059 vec<stmt_vec_info> stmts, unsigned int group_size,
1060 poly_uint64 *max_nunits, bool *matches,
1061 bool *two_operators, tree *node_vectype)
1063 unsigned int i;
1064 stmt_vec_info first_stmt_info = stmts[0];
1065 code_helper first_stmt_code = ERROR_MARK;
1066 code_helper alt_stmt_code = ERROR_MARK;
1067 code_helper rhs_code = ERROR_MARK;
1068 code_helper first_cond_code = ERROR_MARK;
1069 tree lhs;
1070 bool need_same_oprnds = false;
1071 tree vectype = NULL_TREE, first_op1 = NULL_TREE;
1072 stmt_vec_info first_load = NULL, prev_first_load = NULL;
1073 bool first_stmt_ldst_p = false, ldst_p = false;
1074 bool first_stmt_phi_p = false, phi_p = false;
1075 bool maybe_soft_fail = false;
1076 tree soft_fail_nunits_vectype = NULL_TREE;
1078 /* For every stmt in NODE find its def stmt/s. */
1079 stmt_vec_info stmt_info;
1080 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
1082 gimple *stmt = stmt_info->stmt;
1083 swap[i] = 0;
1084 matches[i] = false;
1086 if (dump_enabled_p ())
1087 dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);
1089 /* Fail to vectorize statements marked as unvectorizable, throw
1090 or are volatile. */
1091 if (!STMT_VINFO_VECTORIZABLE (stmt_info)
1092 || stmt_can_throw_internal (cfun, stmt)
1093 || gimple_has_volatile_ops (stmt))
1095 if (dump_enabled_p ())
1096 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1097 "Build SLP failed: unvectorizable statement %G",
1098 stmt);
1099 /* ??? For BB vectorization we want to commutate operands in a way
1100 to shuffle all unvectorizable defs into one operand and have
1101 the other still vectorized. The following doesn't reliably
1102 work for this though but it's the easiest we can do here. */
1103 if (is_a <bb_vec_info> (vinfo) && i != 0)
1104 continue;
1105 /* Fatal mismatch. */
1106 matches[0] = false;
1107 return false;
1110 gcall *call_stmt = dyn_cast <gcall *> (stmt);
1111 lhs = gimple_get_lhs (stmt);
1112 if (lhs == NULL_TREE
1113 && (!call_stmt
1114 || !gimple_call_internal_p (stmt)
1115 || !internal_store_fn_p (gimple_call_internal_fn (stmt))))
1117 if (dump_enabled_p ())
1118 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1119 "Build SLP failed: not GIMPLE_ASSIGN nor "
1120 "GIMPLE_CALL %G", stmt);
1121 if (is_a <bb_vec_info> (vinfo) && i != 0)
1122 continue;
1123 /* Fatal mismatch. */
1124 matches[0] = false;
1125 return false;
1128 tree nunits_vectype;
1129 if (!vect_get_vector_types_for_stmt (vinfo, stmt_info, &vectype,
1130 &nunits_vectype, group_size))
1132 if (is_a <bb_vec_info> (vinfo) && i != 0)
1133 continue;
1134 /* Fatal mismatch. */
1135 matches[0] = false;
1136 return false;
1138 /* Record nunits required but continue analysis, producing matches[]
1139 as if nunits was not an issue. This allows splitting of groups
1140 to happen. */
1141 if (nunits_vectype
1142 && !vect_record_max_nunits (vinfo, stmt_info, group_size,
1143 nunits_vectype, max_nunits))
1145 gcc_assert (is_a <bb_vec_info> (vinfo));
1146 maybe_soft_fail = true;
1147 soft_fail_nunits_vectype = nunits_vectype;
1150 gcc_assert (vectype);
1152 if (call_stmt)
1154 combined_fn cfn = gimple_call_combined_fn (call_stmt);
1155 if (cfn != CFN_LAST && cfn != CFN_MASK_CALL)
1156 rhs_code = cfn;
1157 else
1158 rhs_code = CALL_EXPR;
1160 if (cfn == CFN_MASK_LOAD
1161 || cfn == CFN_GATHER_LOAD
1162 || cfn == CFN_MASK_GATHER_LOAD
1163 || cfn == CFN_MASK_LEN_GATHER_LOAD)
1164 ldst_p = true;
1165 else if (cfn == CFN_MASK_STORE)
1167 ldst_p = true;
1168 rhs_code = CFN_MASK_STORE;
1170 else if ((cfn != CFN_LAST
1171 && cfn != CFN_MASK_CALL
1172 && internal_fn_p (cfn)
1173 && !vectorizable_internal_fn_p (as_internal_fn (cfn)))
1174 || gimple_call_tail_p (call_stmt)
1175 || gimple_call_noreturn_p (call_stmt)
1176 || gimple_call_chain (call_stmt))
1178 if (dump_enabled_p ())
1179 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1180 "Build SLP failed: unsupported call type %G",
1181 (gimple *) call_stmt);
1182 if (is_a <bb_vec_info> (vinfo) && i != 0)
1183 continue;
1184 /* Fatal mismatch. */
1185 matches[0] = false;
1186 return false;
1189 else if (gimple_code (stmt) == GIMPLE_PHI)
1191 rhs_code = ERROR_MARK;
1192 phi_p = true;
1194 else
1196 rhs_code = gimple_assign_rhs_code (stmt);
1197 ldst_p = STMT_VINFO_DATA_REF (stmt_info) != nullptr;
1200 /* Check the operation. */
1201 if (i == 0)
1203 *node_vectype = vectype;
1204 first_stmt_code = rhs_code;
1205 first_stmt_ldst_p = ldst_p;
1206 first_stmt_phi_p = phi_p;
1208 /* Shift arguments should be equal in all the packed stmts for a
1209 vector shift with scalar shift operand. */
1210 if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
1211 || rhs_code == LROTATE_EXPR
1212 || rhs_code == RROTATE_EXPR)
1214 /* First see if we have a vector/vector shift. */
1215 if (!directly_supported_p (rhs_code, vectype, optab_vector))
1217 /* No vector/vector shift, try for a vector/scalar shift. */
1218 if (!directly_supported_p (rhs_code, vectype, optab_scalar))
1220 if (dump_enabled_p ())
1221 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1222 "Build SLP failed: "
1223 "op not supported by target.\n");
1224 if (is_a <bb_vec_info> (vinfo) && i != 0)
1225 continue;
1226 /* Fatal mismatch. */
1227 matches[0] = false;
1228 return false;
1230 need_same_oprnds = true;
1231 first_op1 = gimple_assign_rhs2 (stmt);
1234 else if (rhs_code == WIDEN_LSHIFT_EXPR)
1236 need_same_oprnds = true;
1237 first_op1 = gimple_assign_rhs2 (stmt);
1239 else if (!ldst_p
1240 && rhs_code == BIT_FIELD_REF)
1242 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
1243 if (!is_a <bb_vec_info> (vinfo)
1244 || TREE_CODE (vec) != SSA_NAME
1245 /* When the element types are not compatible we pun the
1246 source to the target vectype which requires equal size. */
1247 || ((!VECTOR_TYPE_P (TREE_TYPE (vec))
1248 || !types_compatible_p (TREE_TYPE (vectype),
1249 TREE_TYPE (TREE_TYPE (vec))))
1250 && !operand_equal_p (TYPE_SIZE (vectype),
1251 TYPE_SIZE (TREE_TYPE (vec)))))
1253 if (dump_enabled_p ())
1254 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1255 "Build SLP failed: "
1256 "BIT_FIELD_REF not supported\n");
1257 /* Fatal mismatch. */
1258 matches[0] = false;
1259 return false;
1262 else if (rhs_code == CFN_DIV_POW2)
1264 need_same_oprnds = true;
1265 first_op1 = gimple_call_arg (call_stmt, 1);
1268 else
1270 if (first_stmt_code != rhs_code
1271 && alt_stmt_code == ERROR_MARK)
1272 alt_stmt_code = rhs_code;
1273 if ((first_stmt_code != rhs_code
1274 && (first_stmt_code != IMAGPART_EXPR
1275 || rhs_code != REALPART_EXPR)
1276 && (first_stmt_code != REALPART_EXPR
1277 || rhs_code != IMAGPART_EXPR)
1278 /* Handle mismatches in plus/minus by computing both
1279 and merging the results. */
1280 && !((first_stmt_code == PLUS_EXPR
1281 || first_stmt_code == MINUS_EXPR)
1282 && (alt_stmt_code == PLUS_EXPR
1283 || alt_stmt_code == MINUS_EXPR)
1284 && rhs_code == alt_stmt_code)
1285 && !(first_stmt_code.is_tree_code ()
1286 && rhs_code.is_tree_code ()
1287 && (TREE_CODE_CLASS (tree_code (first_stmt_code))
1288 == tcc_comparison)
1289 && (swap_tree_comparison (tree_code (first_stmt_code))
1290 == tree_code (rhs_code)))
1291 && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
1292 && (first_stmt_code == ARRAY_REF
1293 || first_stmt_code == BIT_FIELD_REF
1294 || first_stmt_code == INDIRECT_REF
1295 || first_stmt_code == COMPONENT_REF
1296 || first_stmt_code == MEM_REF)
1297 && (rhs_code == ARRAY_REF
1298 || rhs_code == BIT_FIELD_REF
1299 || rhs_code == INDIRECT_REF
1300 || rhs_code == COMPONENT_REF
1301 || rhs_code == MEM_REF)))
1302 || (ldst_p
1303 && (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1304 != STMT_VINFO_GROUPED_ACCESS (first_stmt_info)))
1305 || (ldst_p
1306 && (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1307 != STMT_VINFO_GATHER_SCATTER_P (first_stmt_info)))
1308 || first_stmt_ldst_p != ldst_p
1309 || first_stmt_phi_p != phi_p)
1311 if (dump_enabled_p ())
1313 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1314 "Build SLP failed: different operation "
1315 "in stmt %G", stmt);
1316 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1317 "original stmt %G", first_stmt_info->stmt);
1319 /* Mismatch. */
1320 continue;
1323 if (!ldst_p
1324 && first_stmt_code == BIT_FIELD_REF
1325 && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
1326 != TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0)))
1328 if (dump_enabled_p ())
1329 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1330 "Build SLP failed: different BIT_FIELD_REF "
1331 "arguments in %G", stmt);
1332 /* Mismatch. */
1333 continue;
1336 if (call_stmt
1337 && first_stmt_code != CFN_MASK_LOAD
1338 && first_stmt_code != CFN_MASK_STORE)
1340 if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
1341 call_stmt))
1343 if (dump_enabled_p ())
1344 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1345 "Build SLP failed: different calls in %G",
1346 stmt);
1347 /* Mismatch. */
1348 continue;
1352 if ((phi_p || gimple_could_trap_p (stmt_info->stmt))
1353 && (gimple_bb (first_stmt_info->stmt)
1354 != gimple_bb (stmt_info->stmt)))
1356 if (dump_enabled_p ())
1357 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1358 "Build SLP failed: different BB for PHI "
1359 "or possibly trapping operation in %G", stmt);
1360 /* Mismatch. */
1361 continue;
1364 if (need_same_oprnds)
1366 tree other_op1 = gimple_arg (stmt, 1);
1367 if (!operand_equal_p (first_op1, other_op1, 0))
1369 if (dump_enabled_p ())
1370 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1371 "Build SLP failed: different shift "
1372 "arguments in %G", stmt);
1373 /* Mismatch. */
1374 continue;
1378 if (!types_compatible_p (vectype, *node_vectype))
1380 if (dump_enabled_p ())
1381 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1382 "Build SLP failed: different vector type "
1383 "in %G", stmt);
1384 /* Mismatch. */
1385 continue;
1389 /* Grouped store or load. */
1390 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1392 gcc_assert (ldst_p);
1393 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)))
1395 /* Store. */
1396 gcc_assert (rhs_code == CFN_MASK_STORE
1397 || REFERENCE_CLASS_P (lhs)
1398 || DECL_P (lhs));
1400 else
1402 /* Load. */
1403 first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
1404 if (prev_first_load)
1406 /* Check that there are no loads from different interleaving
1407 chains in the same node. */
1408 if (prev_first_load != first_load)
1410 if (dump_enabled_p ())
1411 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
1412 vect_location,
1413 "Build SLP failed: different "
1414 "interleaving chains in one node %G",
1415 stmt);
1416 /* Mismatch. */
1417 continue;
1420 else
1421 prev_first_load = first_load;
1424 /* Non-grouped store or load. */
1425 else if (ldst_p)
1427 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))
1428 && rhs_code != CFN_GATHER_LOAD
1429 && rhs_code != CFN_MASK_GATHER_LOAD
1430 && rhs_code != CFN_MASK_LEN_GATHER_LOAD
1431 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1432 /* Not grouped loads are handled as externals for BB
1433 vectorization. For loop vectorization we can handle
1434 splats the same we handle single element interleaving. */
1435 && (is_a <bb_vec_info> (vinfo)
1436 || stmt_info != first_stmt_info))
1438 /* Not grouped load. */
1439 if (dump_enabled_p ())
1440 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1441 "Build SLP failed: not grouped load %G", stmt);
1443 if (i != 0)
1444 continue;
1445 /* Fatal mismatch. */
1446 matches[0] = false;
1447 return false;
1450 /* Not memory operation. */
1451 else
1453 if (!phi_p
1454 && rhs_code.is_tree_code ()
1455 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_binary
1456 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_unary
1457 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_expression
1458 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_comparison
1459 && rhs_code != VIEW_CONVERT_EXPR
1460 && rhs_code != CALL_EXPR
1461 && rhs_code != BIT_FIELD_REF)
1463 if (dump_enabled_p ())
1464 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1465 "Build SLP failed: operation unsupported %G",
1466 stmt);
1467 if (is_a <bb_vec_info> (vinfo) && i != 0)
1468 continue;
1469 /* Fatal mismatch. */
1470 matches[0] = false;
1471 return false;
1474 if (rhs_code == COND_EXPR)
1476 tree cond_expr = gimple_assign_rhs1 (stmt);
1477 enum tree_code cond_code = TREE_CODE (cond_expr);
1478 enum tree_code swap_code = ERROR_MARK;
1479 enum tree_code invert_code = ERROR_MARK;
1481 if (i == 0)
1482 first_cond_code = TREE_CODE (cond_expr);
1483 else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1485 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
1486 swap_code = swap_tree_comparison (cond_code);
1487 invert_code = invert_tree_comparison (cond_code, honor_nans);
1490 if (first_cond_code == cond_code)
1492 /* Isomorphic can be achieved by swapping. */
1493 else if (first_cond_code == swap_code)
1494 swap[i] = 1;
1495 /* Isomorphic can be achieved by inverting. */
1496 else if (first_cond_code == invert_code)
1497 swap[i] = 2;
1498 else
1500 if (dump_enabled_p ())
1501 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1502 "Build SLP failed: different"
1503 " operation %G", stmt);
1504 /* Mismatch. */
1505 continue;
1509 if (rhs_code.is_tree_code ()
1510 && TREE_CODE_CLASS ((tree_code)rhs_code) == tcc_comparison
1511 && (swap_tree_comparison ((tree_code)first_stmt_code)
1512 == (tree_code)rhs_code))
1513 swap[i] = 1;
1516 matches[i] = true;
1519 for (i = 0; i < group_size; ++i)
1520 if (!matches[i])
1521 return false;
1523 /* If we allowed a two-operation SLP node verify the target can cope
1524 with the permute we are going to use. */
1525 if (alt_stmt_code != ERROR_MARK
1526 && (!alt_stmt_code.is_tree_code ()
1527 || (TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_reference
1528 && TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_comparison)))
1530 *two_operators = true;
1533 if (maybe_soft_fail)
1535 unsigned HOST_WIDE_INT const_nunits;
1536 if (!TYPE_VECTOR_SUBPARTS
1537 (soft_fail_nunits_vectype).is_constant (&const_nunits)
1538 || const_nunits > group_size)
1539 matches[0] = false;
1540 else
1542 /* With constant vector elements simulate a mismatch at the
1543 point we need to split. */
1544 unsigned tail = group_size & (const_nunits - 1);
1545 memset (&matches[group_size - tail], 0, sizeof (bool) * tail);
1547 return false;
1550 return true;
1553 /* Traits for the hash_set to record failed SLP builds for a stmt set.
1554 Note we never remove apart from at destruction time so we do not
1555 need a special value for deleted that differs from empty. */
1556 struct bst_traits
1558 typedef vec <stmt_vec_info> value_type;
1559 typedef vec <stmt_vec_info> compare_type;
1560 static inline hashval_t hash (value_type);
1561 static inline bool equal (value_type existing, value_type candidate);
1562 static inline bool is_empty (value_type x) { return !x.exists (); }
1563 static inline bool is_deleted (value_type x) { return !x.exists (); }
1564 static const bool empty_zero_p = true;
1565 static inline void mark_empty (value_type &x) { x.release (); }
1566 static inline void mark_deleted (value_type &x) { x.release (); }
1567 static inline void remove (value_type &x) { x.release (); }
1569 inline hashval_t
1570 bst_traits::hash (value_type x)
1572 inchash::hash h;
1573 for (unsigned i = 0; i < x.length (); ++i)
1574 h.add_int (gimple_uid (x[i]->stmt));
1575 return h.end ();
1577 inline bool
1578 bst_traits::equal (value_type existing, value_type candidate)
1580 if (existing.length () != candidate.length ())
1581 return false;
1582 for (unsigned i = 0; i < existing.length (); ++i)
1583 if (existing[i] != candidate[i])
1584 return false;
1585 return true;
1588 /* ??? This was std::pair<std::pair<tree_code, vect_def_type>, tree>
1589 but then vec::insert does memmove and that's not compatible with
1590 std::pair. */
1591 struct chain_op_t
1593 chain_op_t (tree_code code_, vect_def_type dt_, tree op_)
1594 : code (code_), dt (dt_), op (op_) {}
1595 tree_code code;
1596 vect_def_type dt;
1597 tree op;
1600 /* Comparator for sorting associatable chains. */
1602 static int
1603 dt_sort_cmp (const void *op1_, const void *op2_, void *)
1605 auto *op1 = (const chain_op_t *) op1_;
1606 auto *op2 = (const chain_op_t *) op2_;
1607 if (op1->dt != op2->dt)
1608 return (int)op1->dt - (int)op2->dt;
1609 return (int)op1->code - (int)op2->code;
1612 /* Linearize the associatable expression chain at START with the
1613 associatable operation CODE (where PLUS_EXPR also allows MINUS_EXPR),
1614 filling CHAIN with the result and using WORKLIST as intermediate storage.
1615 CODE_STMT and ALT_CODE_STMT are filled with the first stmt using CODE
1616 or MINUS_EXPR. *CHAIN_STMTS if not NULL is filled with all computation
1617 stmts, starting with START. */
1619 static void
1620 vect_slp_linearize_chain (vec_info *vinfo,
1621 vec<std::pair<tree_code, gimple *> > &worklist,
1622 vec<chain_op_t> &chain,
1623 enum tree_code code, gimple *start,
1624 gimple *&code_stmt, gimple *&alt_code_stmt,
1625 vec<gimple *> *chain_stmts)
1627 /* For each lane linearize the addition/subtraction (or other
1628 uniform associatable operation) expression tree. */
1629 worklist.safe_push (std::make_pair (code, start));
1630 while (!worklist.is_empty ())
1632 auto entry = worklist.pop ();
1633 gassign *stmt = as_a <gassign *> (entry.second);
1634 enum tree_code in_code = entry.first;
1635 enum tree_code this_code = gimple_assign_rhs_code (stmt);
1636 /* Pick some stmts suitable for SLP_TREE_REPRESENTATIVE. */
1637 if (!code_stmt
1638 && gimple_assign_rhs_code (stmt) == code)
1639 code_stmt = stmt;
1640 else if (!alt_code_stmt
1641 && gimple_assign_rhs_code (stmt) == MINUS_EXPR)
1642 alt_code_stmt = stmt;
1643 if (chain_stmts)
1644 chain_stmts->safe_push (stmt);
1645 for (unsigned opnum = 1; opnum <= 2; ++opnum)
1647 tree op = gimple_op (stmt, opnum);
1648 vect_def_type dt;
1649 stmt_vec_info def_stmt_info;
1650 bool res = vect_is_simple_use (op, vinfo, &dt, &def_stmt_info);
1651 gcc_assert (res);
1652 if (dt == vect_internal_def
1653 && is_pattern_stmt_p (def_stmt_info))
1654 op = gimple_get_lhs (def_stmt_info->stmt);
1655 gimple *use_stmt;
1656 use_operand_p use_p;
1657 if (dt == vect_internal_def
1658 && single_imm_use (op, &use_p, &use_stmt)
1659 && is_gimple_assign (def_stmt_info->stmt)
1660 && (gimple_assign_rhs_code (def_stmt_info->stmt) == code
1661 || (code == PLUS_EXPR
1662 && (gimple_assign_rhs_code (def_stmt_info->stmt)
1663 == MINUS_EXPR))))
1665 tree_code op_def_code = this_code;
1666 if (op_def_code == MINUS_EXPR && opnum == 1)
1667 op_def_code = PLUS_EXPR;
1668 if (in_code == MINUS_EXPR)
1669 op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
1670 worklist.safe_push (std::make_pair (op_def_code,
1671 def_stmt_info->stmt));
1673 else
1675 tree_code op_def_code = this_code;
1676 if (op_def_code == MINUS_EXPR && opnum == 1)
1677 op_def_code = PLUS_EXPR;
1678 if (in_code == MINUS_EXPR)
1679 op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
1680 chain.safe_push (chain_op_t (op_def_code, dt, op));
1686 typedef hash_map <vec <stmt_vec_info>, slp_tree,
1687 simple_hashmap_traits <bst_traits, slp_tree> >
1688 scalar_stmts_to_slp_tree_map_t;
1690 static slp_tree
1691 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1692 vec<stmt_vec_info> stmts, unsigned int group_size,
1693 poly_uint64 *max_nunits,
1694 bool *matches, unsigned *limit, unsigned *tree_size,
1695 scalar_stmts_to_slp_tree_map_t *bst_map);
1697 static slp_tree
1698 vect_build_slp_tree (vec_info *vinfo,
1699 vec<stmt_vec_info> stmts, unsigned int group_size,
1700 poly_uint64 *max_nunits,
1701 bool *matches, unsigned *limit, unsigned *tree_size,
1702 scalar_stmts_to_slp_tree_map_t *bst_map)
1704 if (slp_tree *leader = bst_map->get (stmts))
1706 if (dump_enabled_p ())
1707 dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
1708 !(*leader)->failed ? "" : "failed ",
1709 (void *) *leader);
1710 if (!(*leader)->failed)
1712 SLP_TREE_REF_COUNT (*leader)++;
1713 vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
1714 stmts.release ();
1715 return *leader;
1717 memcpy (matches, (*leader)->failed, sizeof (bool) * group_size);
1718 return NULL;
1721 /* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
1722 so we can pick up backedge destinations during discovery. */
1723 slp_tree res = new _slp_tree;
1724 SLP_TREE_DEF_TYPE (res) = vect_internal_def;
1725 SLP_TREE_SCALAR_STMTS (res) = stmts;
1726 bst_map->put (stmts.copy (), res);
1728 if (*limit == 0)
1730 if (dump_enabled_p ())
1731 dump_printf_loc (MSG_NOTE, vect_location,
1732 "SLP discovery limit exceeded\n");
1733 /* Mark the node invalid so we can detect those when still in use
1734 as backedge destinations. */
1735 SLP_TREE_SCALAR_STMTS (res) = vNULL;
1736 SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1737 res->failed = XNEWVEC (bool, group_size);
1738 memset (res->failed, 0, sizeof (bool) * group_size);
1739 memset (matches, 0, sizeof (bool) * group_size);
1740 return NULL;
1742 --*limit;
1744 if (dump_enabled_p ())
1745 dump_printf_loc (MSG_NOTE, vect_location,
1746 "starting SLP discovery for node %p\n", (void *) res);
1748 poly_uint64 this_max_nunits = 1;
1749 slp_tree res_ = vect_build_slp_tree_2 (vinfo, res, stmts, group_size,
1750 &this_max_nunits,
1751 matches, limit, tree_size, bst_map);
1752 if (!res_)
1754 if (dump_enabled_p ())
1755 dump_printf_loc (MSG_NOTE, vect_location,
1756 "SLP discovery for node %p failed\n", (void *) res);
1757 /* Mark the node invalid so we can detect those when still in use
1758 as backedge destinations. */
1759 SLP_TREE_SCALAR_STMTS (res) = vNULL;
1760 SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1761 res->failed = XNEWVEC (bool, group_size);
1762 if (flag_checking)
1764 unsigned i;
1765 for (i = 0; i < group_size; ++i)
1766 if (!matches[i])
1767 break;
1768 gcc_assert (i < group_size);
1770 memcpy (res->failed, matches, sizeof (bool) * group_size);
1772 else
1774 if (dump_enabled_p ())
1775 dump_printf_loc (MSG_NOTE, vect_location,
1776 "SLP discovery for node %p succeeded\n",
1777 (void *) res);
1778 gcc_assert (res_ == res);
1779 res->max_nunits = this_max_nunits;
1780 vect_update_max_nunits (max_nunits, this_max_nunits);
1781 /* Keep a reference for the bst_map use. */
1782 SLP_TREE_REF_COUNT (res)++;
1784 return res_;
1787 /* Helper for building an associated SLP node chain. */
1789 static void
1790 vect_slp_build_two_operator_nodes (slp_tree perm, tree vectype,
1791 slp_tree op0, slp_tree op1,
1792 stmt_vec_info oper1, stmt_vec_info oper2,
1793 vec<std::pair<unsigned, unsigned> > lperm)
1795 unsigned group_size = SLP_TREE_LANES (op1);
1797 slp_tree child1 = new _slp_tree;
1798 SLP_TREE_DEF_TYPE (child1) = vect_internal_def;
1799 SLP_TREE_VECTYPE (child1) = vectype;
1800 SLP_TREE_LANES (child1) = group_size;
1801 SLP_TREE_CHILDREN (child1).create (2);
1802 SLP_TREE_CHILDREN (child1).quick_push (op0);
1803 SLP_TREE_CHILDREN (child1).quick_push (op1);
1804 SLP_TREE_REPRESENTATIVE (child1) = oper1;
1806 slp_tree child2 = new _slp_tree;
1807 SLP_TREE_DEF_TYPE (child2) = vect_internal_def;
1808 SLP_TREE_VECTYPE (child2) = vectype;
1809 SLP_TREE_LANES (child2) = group_size;
1810 SLP_TREE_CHILDREN (child2).create (2);
1811 SLP_TREE_CHILDREN (child2).quick_push (op0);
1812 SLP_TREE_REF_COUNT (op0)++;
1813 SLP_TREE_CHILDREN (child2).quick_push (op1);
1814 SLP_TREE_REF_COUNT (op1)++;
1815 SLP_TREE_REPRESENTATIVE (child2) = oper2;
1817 SLP_TREE_DEF_TYPE (perm) = vect_internal_def;
1818 SLP_TREE_CODE (perm) = VEC_PERM_EXPR;
1819 SLP_TREE_VECTYPE (perm) = vectype;
1820 SLP_TREE_LANES (perm) = group_size;
1821 /* ??? We should set this NULL but that's not expected. */
1822 SLP_TREE_REPRESENTATIVE (perm) = oper1;
1823 SLP_TREE_LANE_PERMUTATION (perm) = lperm;
1824 SLP_TREE_CHILDREN (perm).quick_push (child1);
1825 SLP_TREE_CHILDREN (perm).quick_push (child2);
1828 /* Recursively build an SLP tree starting from NODE.
1829 Fail (and return a value not equal to zero) if def-stmts are not
1830 isomorphic, require data permutation or are of unsupported types of
1831 operation. Otherwise, return 0.
1832 The value returned is the depth in the SLP tree where a mismatch
1833 was found. */
1835 static slp_tree
1836 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1837 vec<stmt_vec_info> stmts, unsigned int group_size,
1838 poly_uint64 *max_nunits,
1839 bool *matches, unsigned *limit, unsigned *tree_size,
1840 scalar_stmts_to_slp_tree_map_t *bst_map)
1842 unsigned nops, i, this_tree_size = 0;
1843 poly_uint64 this_max_nunits = *max_nunits;
1845 matches[0] = false;
1847 stmt_vec_info stmt_info = stmts[0];
1848 if (!is_a<gcall *> (stmt_info->stmt)
1849 && !is_a<gassign *> (stmt_info->stmt)
1850 && !is_a<gphi *> (stmt_info->stmt))
1851 return NULL;
1853 nops = gimple_num_args (stmt_info->stmt);
1854 if (const int *map = vect_get_operand_map (stmt_info->stmt,
1855 STMT_VINFO_GATHER_SCATTER_P
1856 (stmt_info)))
1857 nops = map[0];
1859 /* If the SLP node is a PHI (induction or reduction), terminate
1860 the recursion. */
1861 bool *skip_args = XALLOCAVEC (bool, nops);
1862 memset (skip_args, 0, sizeof (bool) * nops);
1863 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
1864 if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
1866 tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
1867 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
1868 group_size);
1869 if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
1870 max_nunits))
1871 return NULL;
1873 vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
1874 if (def_type == vect_induction_def)
1876 /* Induction PHIs are not cycles but walk the initial
1877 value. Only for inner loops though; for outer loops
1878 we need to pick up the value from the actual PHIs
1879 to more easily support peeling and epilogue vectorization. */
1880 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1881 if (!nested_in_vect_loop_p (loop, stmt_info))
1882 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1883 else
1884 loop = loop->inner;
1885 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1887 else if (def_type == vect_reduction_def
1888 || def_type == vect_double_reduction_def
1889 || def_type == vect_nested_cycle
1890 || def_type == vect_first_order_recurrence)
1892 /* Else def types have to match. */
1893 stmt_vec_info other_info;
1894 bool all_same = true;
1895 FOR_EACH_VEC_ELT (stmts, i, other_info)
1897 if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
1898 return NULL;
1899 if (other_info != stmt_info)
1900 all_same = false;
1902 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1903 /* Reduction initial values are not explicitly represented. */
1904 if (def_type != vect_first_order_recurrence
1905 && !nested_in_vect_loop_p (loop, stmt_info))
1906 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1907 /* Reduction chain backedge defs are filled manually.
1908 ??? Need a better way to identify a SLP reduction chain PHI.
1909 Or a better overall way to SLP match those. */
1910 if (all_same && def_type == vect_reduction_def)
1911 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1913 else if (def_type != vect_internal_def)
1914 return NULL;
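/* Verify the scalar stmts in the group are isomorphic (the same operation
   on compatible types), computing the common vector type and whether the
   group mixes two alternating operators (e.g. plus and minus). */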
1918 bool two_operators = false;
1919 unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
1920 tree vectype = NULL_TREE;
1921 if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
1922 &this_max_nunits, matches, &two_operators,
1923 &vectype))
1924 return NULL;
1926 /* If the SLP node is a load, terminate the recursion unless masked. */
1927 if (STMT_VINFO_DATA_REF (stmt_info)
1928 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1930 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
1931 gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD)
1932 || gimple_call_internal_p (stmt, IFN_GATHER_LOAD)
1933 || gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD)
1934 || gimple_call_internal_p (stmt, IFN_MASK_LEN_GATHER_LOAD));
1935 else if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
1936 gcc_assert (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)));
1937 else
1939 *max_nunits = this_max_nunits;
1940 (*tree_size)++;
1941 node = vect_create_new_slp_node (node, stmts, 0);
1942 SLP_TREE_VECTYPE (node) = vectype;
1943 /* And compute the load permutation. Whether it is actually
1944 a permutation depends on the unrolling factor which is
1945 decided later. */
1946 vec<unsigned> load_permutation;
1947 int j;
1948 stmt_vec_info load_info;
1949 load_permutation.create (group_size);
1950 stmt_vec_info first_stmt_info
1951 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
1952 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
1954 int load_place;
1955 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1956 load_place = vect_get_place_in_interleaving_chain
1957 (load_info, first_stmt_info);
1958 else
1959 load_place = 0;
1960 gcc_assert (load_place != -1);
1961 load_permutation.safe_push (load_place);
1963 SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
1964 return node;
1967 else if (gimple_assign_single_p (stmt_info->stmt)
1968 && !gimple_vuse (stmt_info->stmt)
1969 && gimple_assign_rhs_code (stmt_info->stmt) == BIT_FIELD_REF)
1971 /* vect_build_slp_tree_1 determined all BIT_FIELD_REFs reference
1972 the same SSA name vector of a type compatible with vectype. */
1973 vec<std::pair<unsigned, unsigned> > lperm = vNULL;
1974 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0);
1975 stmt_vec_info estmt_info;
1976 FOR_EACH_VEC_ELT (stmts, i, estmt_info)
1978 gassign *estmt = as_a <gassign *> (estmt_info->stmt);
1979 tree bfref = gimple_assign_rhs1 (estmt);
1980 HOST_WIDE_INT lane;
1981 if (!known_eq (bit_field_size (bfref),
1982 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype))))
1983 || !constant_multiple_p (bit_field_offset (bfref),
1984 bit_field_size (bfref), &lane))
1986 lperm.release ();
1987 matches[0] = false;
1988 return NULL;
1990 lperm.safe_push (std::make_pair (0, (unsigned)lane));
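/* Represent the vector the BIT_FIELD_REFs extract from as a leaf node
   carrying only vector defs and no scalar stmts. */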
1992 slp_tree vnode = vect_create_new_slp_node (vNULL);
1993 if (operand_equal_p (TYPE_SIZE (vectype), TYPE_SIZE (TREE_TYPE (vec))))
1994 /* ??? We record vectype here but hide the possibly necessary
1995 punning and instead rely on code generation to materialize
1996 VIEW_CONVERT_EXPRs as necessary. We should instead make
1997 this explicit somehow. */
1998 SLP_TREE_VECTYPE (vnode) = vectype;
1999 else
2001 /* For different size but compatible elements we can still
2002 use VEC_PERM_EXPR without punning. */
2003 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (vec))
2004 && types_compatible_p (TREE_TYPE (vectype),
2005 TREE_TYPE (TREE_TYPE (vec))));
2006 SLP_TREE_VECTYPE (vnode) = TREE_TYPE (vec);
2008 auto nunits = TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (vnode));
2009 unsigned HOST_WIDE_INT const_nunits;
2010 if (nunits.is_constant (&const_nunits))
2011 SLP_TREE_LANES (vnode) = const_nunits;
2012 SLP_TREE_VEC_DEFS (vnode).safe_push (vec);
2013 /* We always build a permutation node, even if it is an identity
2014 permute, to shield the rest of the vectorizer from the odd node
2015 representing an actual vector without any scalar ops.
2016 ??? We could hide it completely by making the permute node
2017 external? */
2018 node = vect_create_new_slp_node (node, stmts, 1);
2019 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
2020 SLP_TREE_LANE_PERMUTATION (node) = lperm;
2021 SLP_TREE_VECTYPE (node) = vectype;
2022 SLP_TREE_CHILDREN (node).quick_push (vnode);
2023 return node;
2025 /* When discovery reaches an associatable operation see whether we can
2026 improve that to match up lanes in a way superior to the operand
2027 swapping code which at most looks at two defs.
2028 ??? For BB vectorization we cannot do the brute-force search
2029 for matching as we can succeed by means of builds from scalars
2030 and have no good way to "cost" one build against another. */
2031 else if (is_a <loop_vec_info> (vinfo)
2032 /* ??? We don't handle !vect_internal_def defs below. */
2033 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
2034 && is_gimple_assign (stmt_info->stmt)
2035 && (associative_tree_code (gimple_assign_rhs_code (stmt_info->stmt))
2036 || gimple_assign_rhs_code (stmt_info->stmt) == MINUS_EXPR)
2037 && ((FLOAT_TYPE_P (vectype) && flag_associative_math)
2038 || (INTEGRAL_TYPE_P (TREE_TYPE (vectype))
2039 && TYPE_OVERFLOW_WRAPS (TREE_TYPE (vectype)))))
2041 /* See if we have a chain of (mixed) adds or subtracts or other
2042 associatable ops. */
2043 enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
2044 if (code == MINUS_EXPR)
2045 code = PLUS_EXPR;
2046 stmt_vec_info other_op_stmt_info = NULL;
2047 stmt_vec_info op_stmt_info = NULL;
2048 unsigned chain_len = 0;
2049 auto_vec<chain_op_t> chain;
2050 auto_vec<std::pair<tree_code, gimple *> > worklist;
2051 auto_vec<vec<chain_op_t> > chains (group_size);
2052 auto_vec<slp_tree, 4> children;
2053 bool hard_fail = true;
2054 for (unsigned lane = 0; lane < group_size; ++lane)
2056 /* For each lane linearize the addition/subtraction (or other
2057 uniform associatable operation) expression tree. */
2058 gimple *op_stmt = NULL, *other_op_stmt = NULL;
2059 vect_slp_linearize_chain (vinfo, worklist, chain, code,
2060 stmts[lane]->stmt, op_stmt, other_op_stmt,
2061 NULL);
2062 if (!op_stmt_info && op_stmt)
2063 op_stmt_info = vinfo->lookup_stmt (op_stmt);
2064 if (!other_op_stmt_info && other_op_stmt)
2065 other_op_stmt_info = vinfo->lookup_stmt (other_op_stmt);
2066 if (chain.length () == 2)
2068 /* In a chain of just two elements resort to the regular
2069 operand swapping scheme. If we run into a length
2070 mismatch still hard-FAIL. */
2071 if (chain_len == 0)
2072 hard_fail = false;
2073 else
2075 matches[lane] = false;
2076 /* ??? We might want to process the other lanes, but
2077 make sure to not give false matching hints to the
2078 caller for lanes we did not process. */
2079 if (lane != group_size - 1)
2080 matches[0] = false;
2082 break;
2084 else if (chain_len == 0)
2085 chain_len = chain.length ();
2086 else if (chain.length () != chain_len)
2088 /* ??? Here we could slip in magic to compensate with
2089 neutral operands. */
2090 matches[lane] = false;
2091 if (lane != group_size - 1)
2092 matches[0] = false;
2093 break;
2095 chains.quick_push (chain.copy ());
2096 chain.truncate (0);
2098 if (chains.length () == group_size)
2100 /* We cannot yet use SLP_TREE_CODE to communicate the operation. */
2101 if (!op_stmt_info)
2103 hard_fail = false;
2104 goto out;
2106 /* Now we have a set of chains with the same length. */
2107 /* 1. pre-sort according to def_type and operation. */
2108 for (unsigned lane = 0; lane < group_size; ++lane)
2109 chains[lane].stablesort (dt_sort_cmp, vinfo);
2110 if (dump_enabled_p ())
2112 dump_printf_loc (MSG_NOTE, vect_location,
2113 "pre-sorted chains of %s\n",
2114 get_tree_code_name (code));
2115 for (unsigned lane = 0; lane < group_size; ++lane)
2117 for (unsigned opnum = 0; opnum < chain_len; ++opnum)
2118 dump_printf (MSG_NOTE, "%s %T ",
2119 get_tree_code_name (chains[lane][opnum].code),
2120 chains[lane][opnum].op);
2121 dump_printf (MSG_NOTE, "\n");
2124 /* 2. try to build children nodes, associating as necessary. */
2125 for (unsigned n = 0; n < chain_len; ++n)
2127 vect_def_type dt = chains[0][n].dt;
2128 unsigned lane;
2129 for (lane = 0; lane < group_size; ++lane)
2130 if (chains[lane][n].dt != dt)
2132 if (dt == vect_constant_def
2133 && chains[lane][n].dt == vect_external_def)
2134 dt = vect_external_def;
2135 else if (dt == vect_external_def
2136 && chains[lane][n].dt == vect_constant_def)
2138 else
2139 break;
2141 if (lane != group_size)
2143 if (dump_enabled_p ())
2144 dump_printf_loc (MSG_NOTE, vect_location,
2145 "giving up on chain due to mismatched "
2146 "def types\n");
2147 matches[lane] = false;
2148 if (lane != group_size - 1)
2149 matches[0] = false;
2150 goto out;
2152 if (dt == vect_constant_def
2153 || dt == vect_external_def)
2155 /* Check whether we can build the invariant. If we can't
2156 we never will be able to. */
2157 tree type = TREE_TYPE (chains[0][n].op);
2158 if (!GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
2159 && (TREE_CODE (type) == BOOLEAN_TYPE
2160 || !can_duplicate_and_interleave_p (vinfo, group_size,
2161 type)))
2163 matches[0] = false;
2164 goto out;
2166 vec<tree> ops;
2167 ops.create (group_size);
2168 for (lane = 0; lane < group_size; ++lane)
2169 ops.quick_push (chains[lane][n].op);
2170 slp_tree child = vect_create_new_slp_node (ops);
2171 SLP_TREE_DEF_TYPE (child) = dt;
2172 children.safe_push (child);
2174 else if (dt != vect_internal_def)
2176 /* Not sure, we might need sth special.
2177 gcc.dg/vect/pr96854.c,
2178 gfortran.dg/vect/fast-math-pr37021.f90
2179 and gfortran.dg/vect/pr61171.f trigger. */
2180 /* Soft-fail for now. */
2181 hard_fail = false;
2182 goto out;
2184 else
2186 vec<stmt_vec_info> op_stmts;
2187 op_stmts.create (group_size);
2188 slp_tree child = NULL;
2189 /* Brute-force our way. We have to consider a lane
2190 failing after fixing an earlier fail up in the
2191 SLP discovery recursion. So track the current
2192 permute per lane. */
2193 unsigned *perms = XALLOCAVEC (unsigned, group_size);
2194 memset (perms, 0, sizeof (unsigned) * group_size);
2197 op_stmts.truncate (0);
2198 for (lane = 0; lane < group_size; ++lane)
2199 op_stmts.quick_push
2200 (vinfo->lookup_def (chains[lane][n].op));
2201 child = vect_build_slp_tree (vinfo, op_stmts,
2202 group_size, &this_max_nunits,
2203 matches, limit,
2204 &this_tree_size, bst_map);
2205 /* ??? We're likely getting too many fatal mismatches
2206 here so maybe we want to ignore them (but then we
2207 have no idea which lanes fatally mismatched). */
2208 if (child || !matches[0])
2209 break;
2210 /* Swap another lane we have not yet matched up into
2211 lanes that did not match. If we run out of
2212 permute possibilities for a lane terminate the
2213 search. */
2214 bool term = false;
2215 for (lane = 1; lane < group_size; ++lane)
2216 if (!matches[lane])
2218 if (n + perms[lane] + 1 == chain_len)
2220 term = true;
2221 break;
2223 std::swap (chains[lane][n],
2224 chains[lane][n + perms[lane] + 1]);
2225 perms[lane]++;
2227 if (term)
2228 break;
2230 while (1);
2231 if (!child)
2233 if (dump_enabled_p ())
2234 dump_printf_loc (MSG_NOTE, vect_location,
2235 "failed to match up op %d\n", n);
2236 op_stmts.release ();
2237 if (lane != group_size - 1)
2238 matches[0] = false;
2239 else
2240 matches[lane] = false;
2241 goto out;
2243 if (dump_enabled_p ())
2245 dump_printf_loc (MSG_NOTE, vect_location,
2246 "matched up op %d to\n", n);
2247 vect_print_slp_tree (MSG_NOTE, vect_location, child);
2249 children.safe_push (child);
2252 /* 3. build SLP nodes to combine the chain. */
2253 for (unsigned lane = 0; lane < group_size; ++lane)
2254 if (chains[lane][0].code != code)
2256 /* See if there's any alternate all-PLUS entry. */
2257 unsigned n;
2258 for (n = 1; n < chain_len; ++n)
2260 for (lane = 0; lane < group_size; ++lane)
2261 if (chains[lane][n].code != code)
2262 break;
2263 if (lane == group_size)
2264 break;
2266 if (n != chain_len)
2268 /* Swap that in at first position. */
2269 std::swap (children[0], children[n]);
2270 for (lane = 0; lane < group_size; ++lane)
2271 std::swap (chains[lane][0], chains[lane][n]);
2273 else
2275 /* ??? When this triggers and we end up with two
2276 vect_constant/external_def up-front things break (ICE)
2277 spectacularly finding an insertion place for the
2278 all-constant op. We should have a fully
2279 vect_internal_def operand though(?) so we can swap
2280 that into first place and then prepend the all-zero
2281 constant. */
2282 if (dump_enabled_p ())
2283 dump_printf_loc (MSG_NOTE, vect_location,
2284 "inserting constant zero to compensate "
2285 "for (partially) negated first "
2286 "operand\n");
2287 chain_len++;
2288 for (lane = 0; lane < group_size; ++lane)
2289 chains[lane].safe_insert
2290 (0, chain_op_t (code, vect_constant_def, NULL_TREE));
2291 vec<tree> zero_ops;
2292 zero_ops.create (group_size);
2293 zero_ops.quick_push (build_zero_cst (TREE_TYPE (vectype)));
2294 for (lane = 1; lane < group_size; ++lane)
2295 zero_ops.quick_push (zero_ops[0]);
2296 slp_tree zero = vect_create_new_slp_node (zero_ops);
2297 SLP_TREE_DEF_TYPE (zero) = vect_constant_def;
2298 children.safe_insert (0, zero);
2300 break;
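/* Combine the children left to right with a chain of two-operand nodes;
   the last combination reuses NODE and thus represents the original
   scalar stmts. */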
2302 for (unsigned i = 1; i < children.length (); ++i)
2304 slp_tree op0 = children[i - 1];
2305 slp_tree op1 = children[i];
2306 bool this_two_op = false;
2307 for (unsigned lane = 0; lane < group_size; ++lane)
2308 if (chains[lane][i].code != chains[0][i].code)
2310 this_two_op = true;
2311 break;
2313 slp_tree child;
2314 if (i == children.length () - 1)
2315 child = vect_create_new_slp_node (node, stmts, 2);
2316 else
2317 child = vect_create_new_slp_node (2, ERROR_MARK);
2318 if (this_two_op)
2320 vec<std::pair<unsigned, unsigned> > lperm;
2321 lperm.create (group_size);
2322 for (unsigned lane = 0; lane < group_size; ++lane)
2323 lperm.quick_push (std::make_pair
2324 (chains[lane][i].code != chains[0][i].code, lane));
2325 vect_slp_build_two_operator_nodes (child, vectype, op0, op1,
2326 (chains[0][i].code == code
2327 ? op_stmt_info
2328 : other_op_stmt_info),
2329 (chains[0][i].code == code
2330 ? other_op_stmt_info
2331 : op_stmt_info),
2332 lperm);
2334 else
2336 SLP_TREE_DEF_TYPE (child) = vect_internal_def;
2337 SLP_TREE_VECTYPE (child) = vectype;
2338 SLP_TREE_LANES (child) = group_size;
2339 SLP_TREE_CHILDREN (child).quick_push (op0);
2340 SLP_TREE_CHILDREN (child).quick_push (op1);
2341 SLP_TREE_REPRESENTATIVE (child)
2342 = (chains[0][i].code == code
2343 ? op_stmt_info : other_op_stmt_info);
2345 children[i] = child;
2347 *tree_size += this_tree_size + 1;
2348 *max_nunits = this_max_nunits;
2349 while (!chains.is_empty ())
2350 chains.pop ().release ();
2351 return node;
2353 out:
2354 while (!children.is_empty ())
2355 vect_free_slp_tree (children.pop ());
2356 while (!chains.is_empty ())
2357 chains.pop ().release ();
2358 /* Hard-fail, otherwise we might run into quadratic processing of the
2359 chains by starting discovery again one stmt into the chain. */
2360 if (hard_fail)
2361 return NULL;
2362 /* Fall thru to normal processing. */
2365 /* Get at the operands, verifying they are compatible. */
2366 vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
2367 slp_oprnd_info oprnd_info;
2368 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
2370 int res = vect_get_and_check_slp_defs (vinfo, swap[i], skip_args,
2371 stmts, i, &oprnds_info);
2372 if (res != 0)
2373 matches[(res == -1) ? 0 : i] = false;
2374 if (!matches[0])
2375 break;
2377 for (i = 0; i < group_size; ++i)
2378 if (!matches[i])
2380 vect_free_oprnd_info (oprnds_info);
2381 return NULL;
2383 swap = NULL;
2385 auto_vec<slp_tree, 4> children;
2387 stmt_info = stmts[0];
2389 /* Create SLP_TREE nodes for the definition node/s. */
2390 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
2392 slp_tree child;
2393 unsigned int j;
2395 /* We're skipping certain operands from processing, for example
2396 outer loop reduction initial defs. */
2397 if (skip_args[i])
2399 children.safe_push (NULL);
2400 continue;
2403 if (oprnd_info->first_dt == vect_uninitialized_def)
2405 /* COND_EXPRs can end up with one operand too many if the
2406 condition is an SSA name. */
2407 gcc_assert (i == 3 && nops == 4);
2408 continue;
2411 if (is_a <bb_vec_info> (vinfo)
2412 && oprnd_info->first_dt == vect_internal_def
2413 && !oprnd_info->any_pattern)
2415 /* For BB vectorization, if all defs are the same do not
2416 bother to continue the build along the single-lane
2417 graph but use a splat of the scalar value. */
2418 stmt_vec_info first_def = oprnd_info->def_stmts[0];
2419 for (j = 1; j < group_size; ++j)
2420 if (oprnd_info->def_stmts[j] != first_def)
2421 break;
2422 if (j == group_size
2423 /* But avoid doing this for loads where we may be
2424 able to CSE things, unless the stmt is not
2425 vectorizable. */
2426 && (!STMT_VINFO_VECTORIZABLE (first_def)
2427 || !gimple_vuse (first_def->stmt)))
2429 if (dump_enabled_p ())
2430 dump_printf_loc (MSG_NOTE, vect_location,
2431 "Using a splat of the uniform operand %G",
2432 first_def->stmt);
2433 oprnd_info->first_dt = vect_external_def;
2437 if (oprnd_info->first_dt == vect_external_def
2438 || oprnd_info->first_dt == vect_constant_def)
2440 slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
2441 SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
2442 oprnd_info->ops = vNULL;
2443 children.safe_push (invnode);
2444 continue;
2447 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2448 group_size, &this_max_nunits,
2449 matches, limit,
2450 &this_tree_size, bst_map)) != NULL)
2452 oprnd_info->def_stmts = vNULL;
2453 children.safe_push (child);
2454 continue;
2457 /* If the SLP build for operand zero failed and operands zero
2458 and one can be commuted, try that for the scalar stmts
2459 that failed the match. */
2460 if (i == 0
2461 /* A first scalar stmt mismatch signals a fatal mismatch. */
2462 && matches[0]
2463 /* ??? For COND_EXPRs we can swap the comparison operands
2464 as well as the arms under some constraints. */
2465 && nops == 2
2466 && oprnds_info[1]->first_dt == vect_internal_def
2467 && is_gimple_assign (stmt_info->stmt)
2468 /* Swapping operands for reductions breaks assumptions later on. */
2469 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
2470 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
2472 /* See whether we can swap the matching or the non-matching
2473 stmt operands. */
2474 bool swap_not_matching = true;
2477 for (j = 0; j < group_size; ++j)
2479 if (matches[j] != !swap_not_matching)
2480 continue;
2481 stmt_vec_info stmt_info = stmts[j];
2482 /* Verify if we can swap operands of this stmt. */
2483 gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
2484 if (!stmt
2485 || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
2487 if (!swap_not_matching)
2488 goto fail;
2489 swap_not_matching = false;
2490 break;
2494 while (j != group_size);
2496 /* Swap mismatched definition stmts. */
2497 if (dump_enabled_p ())
2498 dump_printf_loc (MSG_NOTE, vect_location,
2499 "Re-trying with swapped operands of stmts ");
2500 for (j = 0; j < group_size; ++j)
2501 if (matches[j] == !swap_not_matching)
2503 std::swap (oprnds_info[0]->def_stmts[j],
2504 oprnds_info[1]->def_stmts[j]);
2505 std::swap (oprnds_info[0]->ops[j],
2506 oprnds_info[1]->ops[j]);
2507 if (dump_enabled_p ())
2508 dump_printf (MSG_NOTE, "%d ", j);
2510 if (dump_enabled_p ())
2511 dump_printf (MSG_NOTE, "\n");
2512 /* After swapping some operands we lose track of whether an
2513 operand has any pattern defs, so be conservative here. */
2514 if (oprnds_info[0]->any_pattern || oprnds_info[1]->any_pattern)
2515 oprnds_info[0]->any_pattern = oprnds_info[1]->any_pattern = true;
2516 /* And try again with scratch 'matches' ... */
2517 bool *tem = XALLOCAVEC (bool, group_size);
2518 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2519 group_size, &this_max_nunits,
2520 tem, limit,
2521 &this_tree_size, bst_map)) != NULL)
2523 oprnd_info->def_stmts = vNULL;
2524 children.safe_push (child);
2525 continue;
2528 fail:
2530 /* If the SLP build failed and we are analyzing a basic block,
2531 simply treat nodes we fail to build as externally defined
2532 (and thus build vectors from the scalar defs).
2533 The cost model will reject outright expensive cases.
2534 ??? This doesn't handle cases where permutation ultimately
2535 fails (or we don't try permutation below). Ideally we'd
2536 even compute a permutation that will end up with the maximum
2537 SLP tree size... */
2538 if (is_a <bb_vec_info> (vinfo)
2539 /* ??? Rejecting patterns this way doesn't work. We'd have to
2540 do extra work to cancel the pattern so the uses see the
2541 scalar version. */
2542 && !is_pattern_stmt_p (stmt_info)
2543 && !oprnd_info->any_pattern)
2545 /* But if there's a leading vector sized set of matching stmts
2546 fail here so we can split the group. This matches the condition
2547 vect_analyze_slp_instance uses. */
2548 /* ??? We might want to split here and combine the results to support
2549 multiple vector sizes better. */
2550 for (j = 0; j < group_size; ++j)
2551 if (!matches[j])
2552 break;
2553 if (!known_ge (j, TYPE_VECTOR_SUBPARTS (vectype)))
2555 if (dump_enabled_p ())
2556 dump_printf_loc (MSG_NOTE, vect_location,
2557 "Building vector operands from scalars\n");
2558 this_tree_size++;
2559 child = vect_create_new_slp_node (oprnd_info->ops);
2560 children.safe_push (child);
2561 oprnd_info->ops = vNULL;
2562 continue;
2566 gcc_assert (child == NULL);
2567 FOR_EACH_VEC_ELT (children, j, child)
2568 if (child)
2569 vect_free_slp_tree (child);
2570 vect_free_oprnd_info (oprnds_info);
2571 return NULL;
2574 vect_free_oprnd_info (oprnds_info);
2576 /* If all children of this node are built up from uniform scalars,
2577 or if building them requires more than one possibly expensive
2578 vector construction, throw the node away so it is built up from
2579 scalars instead. The exception is the SLP node for the vector store. */
2580 if (is_a <bb_vec_info> (vinfo)
2581 && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
2582 /* ??? Rejecting patterns this way doesn't work. We'd have to
2583 do extra work to cancel the pattern so the uses see the
2584 scalar version. */
2585 && !is_pattern_stmt_p (stmt_info))
2587 slp_tree child;
2588 unsigned j;
2589 bool all_uniform_p = true;
2590 unsigned n_vector_builds = 0;
2591 FOR_EACH_VEC_ELT (children, j, child)
2593 if (!child)
2595 else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
2596 all_uniform_p = false;
2597 else if (!vect_slp_tree_uniform_p (child))
2599 all_uniform_p = false;
2600 if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
2601 n_vector_builds++;
2604 if (all_uniform_p
2605 || n_vector_builds > 1
2606 || (n_vector_builds == children.length ()
2607 && is_a <gphi *> (stmt_info->stmt)))
2609 /* Roll back. */
2610 matches[0] = false;
2611 FOR_EACH_VEC_ELT (children, j, child)
2612 if (child)
2613 vect_free_slp_tree (child);
2615 if (dump_enabled_p ())
2616 dump_printf_loc (MSG_NOTE, vect_location,
2617 "Building parent vector operands from "
2618 "scalars instead\n");
2619 return NULL;
2623 *tree_size += this_tree_size + 1;
2624 *max_nunits = this_max_nunits;
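/* For a group mixing two operation codes build one node per code with
   the same children and blend their lanes with a VEC_PERM_EXPR node
   on top. */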
2626 if (two_operators)
2628 /* ??? We'd likely want to either cache in bst_map sth like
2629 { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
2630 the true { a+b, a+b, a+b, a+b } ... but there we don't have
2631 explicit stmts to put in so the keying on 'stmts' doesn't
2632 work (but we have the same issue with nodes that use 'ops'). */
2633 slp_tree one = new _slp_tree;
2634 slp_tree two = new _slp_tree;
2635 SLP_TREE_DEF_TYPE (one) = vect_internal_def;
2636 SLP_TREE_DEF_TYPE (two) = vect_internal_def;
2637 SLP_TREE_VECTYPE (one) = vectype;
2638 SLP_TREE_VECTYPE (two) = vectype;
2639 SLP_TREE_CHILDREN (one).safe_splice (children);
2640 SLP_TREE_CHILDREN (two).safe_splice (children);
2641 slp_tree child;
2642 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
2643 SLP_TREE_REF_COUNT (child)++;
2645 /* Here we record the original defs since this
2646 node represents the final lane configuration. */
2647 node = vect_create_new_slp_node (node, stmts, 2);
2648 SLP_TREE_VECTYPE (node) = vectype;
2649 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
2650 SLP_TREE_CHILDREN (node).quick_push (one);
2651 SLP_TREE_CHILDREN (node).quick_push (two);
2652 gassign *stmt = as_a <gassign *> (stmts[0]->stmt);
2653 enum tree_code code0 = gimple_assign_rhs_code (stmt);
2654 enum tree_code ocode = ERROR_MARK;
2655 stmt_vec_info ostmt_info;
2656 unsigned j = 0;
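/* Record for each lane whether it uses the first or the alternate
   operation and remember the alternate code together with a lane
   providing a representative stmt for it. */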
2657 FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
2659 gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
2660 if (gimple_assign_rhs_code (ostmt) != code0)
2662 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (1, i));
2663 ocode = gimple_assign_rhs_code (ostmt);
2664 j = i;
2666 else
2667 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (0, i));
2669 SLP_TREE_CODE (one) = code0;
2670 SLP_TREE_CODE (two) = ocode;
2671 SLP_TREE_LANES (one) = stmts.length ();
2672 SLP_TREE_LANES (two) = stmts.length ();
2673 SLP_TREE_REPRESENTATIVE (one) = stmts[0];
2674 SLP_TREE_REPRESENTATIVE (two) = stmts[j];
2675 return node;
2678 node = vect_create_new_slp_node (node, stmts, nops);
2679 SLP_TREE_VECTYPE (node) = vectype;
2680 SLP_TREE_CHILDREN (node).splice (children);
2681 return node;
2684 /* Dump a single SLP tree NODE. */
2686 static void
2687 vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
2688 slp_tree node)
2690 unsigned i, j;
2691 slp_tree child;
2692 stmt_vec_info stmt_info;
2693 tree op;
2695 dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
2696 dump_user_location_t user_loc = loc.get_user_location ();
2697 dump_printf_loc (metadata, user_loc,
2698 "node%s %p (max_nunits=" HOST_WIDE_INT_PRINT_UNSIGNED
2699 ", refcnt=%u)",
2700 SLP_TREE_DEF_TYPE (node) == vect_external_def
2701 ? " (external)"
2702 : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
2703 ? " (constant)"
2704 : ""), (void *) node,
2705 estimated_poly_value (node->max_nunits),
2706 SLP_TREE_REF_COUNT (node));
2707 if (SLP_TREE_VECTYPE (node))
2708 dump_printf (metadata, " %T", SLP_TREE_VECTYPE (node));
2709 dump_printf (metadata, "\n");
2710 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
2712 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
2713 dump_printf_loc (metadata, user_loc, "op: VEC_PERM_EXPR\n");
2714 else
2715 dump_printf_loc (metadata, user_loc, "op template: %G",
2716 SLP_TREE_REPRESENTATIVE (node)->stmt);
2718 if (SLP_TREE_SCALAR_STMTS (node).exists ())
2719 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2720 dump_printf_loc (metadata, user_loc, "\tstmt %u %G", i, stmt_info->stmt);
2721 else
2723 dump_printf_loc (metadata, user_loc, "\t{ ");
2724 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
2725 dump_printf (metadata, "%T%s ", op,
2726 i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
2727 dump_printf (metadata, "}\n");
2729 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2731 dump_printf_loc (metadata, user_loc, "\tload permutation {");
2732 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
2733 dump_printf (dump_kind, " %u", j);
2734 dump_printf (dump_kind, " }\n");
2736 if (SLP_TREE_LANE_PERMUTATION (node).exists ())
2738 dump_printf_loc (metadata, user_loc, "\tlane permutation {");
2739 for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
2740 dump_printf (dump_kind, " %u[%u]",
2741 SLP_TREE_LANE_PERMUTATION (node)[i].first,
2742 SLP_TREE_LANE_PERMUTATION (node)[i].second);
2743 dump_printf (dump_kind, " }\n");
2745 if (SLP_TREE_CHILDREN (node).is_empty ())
2746 return;
2747 dump_printf_loc (metadata, user_loc, "\tchildren");
2748 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2749 dump_printf (dump_kind, " %p", (void *)child);
2750 dump_printf (dump_kind, "\n");
2753 DEBUG_FUNCTION void
2754 debug (slp_tree node)
2756 debug_dump_context ctx;
2757 vect_print_slp_tree (MSG_NOTE,
2758 dump_location_t::from_location_t (UNKNOWN_LOCATION),
2759 node);
2762 /* Recursive helper for the dot producer below. */
2764 static void
2765 dot_slp_tree (FILE *f, slp_tree node, hash_set<slp_tree> &visited)
2767 if (visited.add (node))
2768 return;
2770 fprintf (f, "\"%p\" [label=\"", (void *)node);
2771 vect_print_slp_tree (MSG_NOTE,
2772 dump_location_t::from_location_t (UNKNOWN_LOCATION),
2773 node);
2774 fprintf (f, "\"];\n");
2777 for (slp_tree child : SLP_TREE_CHILDREN (node))
2778 fprintf (f, "\"%p\" -> \"%p\";", (void *)node, (void *)child);
2780 for (slp_tree child : SLP_TREE_CHILDREN (node))
2781 if (child)
2782 dot_slp_tree (f, child, visited);
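/* Produce a GraphViz dot file FNAME with a representation of the SLP
   graph rooted at NODE. */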
2785 DEBUG_FUNCTION void
2786 dot_slp_tree (const char *fname, slp_tree node)
2788 FILE *f = fopen (fname, "w");
2789 fprintf (f, "digraph {\n");
2790 fflush (f);
2792 debug_dump_context ctx (f);
2793 hash_set<slp_tree> visited;
2794 dot_slp_tree (f, node, visited);
2796 fflush (f);
2797 fprintf (f, "}\n");
2798 fclose (f);
2801 /* Dump the SLP graph rooted at NODE using flags specified in DUMP_KIND. */
2803 static void
2804 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2805 slp_tree node, hash_set<slp_tree> &visited)
2807 unsigned i;
2808 slp_tree child;
2810 if (visited.add (node))
2811 return;
2813 vect_print_slp_tree (dump_kind, loc, node);
2815 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2816 if (child)
2817 vect_print_slp_graph (dump_kind, loc, child, visited);
2820 static void
2821 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2822 slp_tree entry)
2824 hash_set<slp_tree> visited;
2825 vect_print_slp_graph (dump_kind, loc, entry, visited);
2828 /* Mark the tree rooted at NODE with PURE_SLP. */
2830 static void
2831 vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
2833 int i;
2834 stmt_vec_info stmt_info;
2835 slp_tree child;
2837 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2838 return;
2840 if (visited.add (node))
2841 return;
2843 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2844 STMT_SLP_TYPE (stmt_info) = pure_slp;
2846 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2847 if (child)
2848 vect_mark_slp_stmts (child, visited);
2851 static void
2852 vect_mark_slp_stmts (slp_tree node)
2854 hash_set<slp_tree> visited;
2855 vect_mark_slp_stmts (node, visited);
2858 /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */
2860 static void
2861 vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
2863 int i;
2864 stmt_vec_info stmt_info;
2865 slp_tree child;
2867 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2868 return;
2870 if (visited.add (node))
2871 return;
2873 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2875 gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
2876 || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
2877 STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
2880 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2881 if (child)
2882 vect_mark_slp_stmts_relevant (child, visited);
2885 static void
2886 vect_mark_slp_stmts_relevant (slp_tree node)
2888 hash_set<slp_tree> visited;
2889 vect_mark_slp_stmts_relevant (node, visited);
2893 /* Gather loads in the SLP graph NODE and populate the LOADS array. */
2895 static void
2896 vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
2897 hash_set<slp_tree> &visited)
2899 if (!node || visited.add (node))
2900 return;
2902 if (SLP_TREE_CHILDREN (node).length () == 0)
2904 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2905 return;
2906 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2907 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2908 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2909 loads.safe_push (node);
2911 else
2913 unsigned i;
2914 slp_tree child;
2915 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2916 vect_gather_slp_loads (loads, child, visited);
2921 /* Find the last scalar stmt in NODE. */
2923 stmt_vec_info
2924 vect_find_last_scalar_stmt_in_slp (slp_tree node)
2926 stmt_vec_info last = NULL;
2927 stmt_vec_info stmt_vinfo;
2929 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2931 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2932 last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
2935 return last;
2938 /* Find the first stmt in NODE. */
2940 stmt_vec_info
2941 vect_find_first_scalar_stmt_in_slp (slp_tree node)
2943 stmt_vec_info first = NULL;
2944 stmt_vec_info stmt_vinfo;
2946 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2948 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2949 if (!first
2950 || get_later_stmt (stmt_vinfo, first) == first)
2951 first = stmt_vinfo;
2954 return first;
2957 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
2958 two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
2959 (also containing the first GROUP1_SIZE stmts, since stores are
2960 consecutive), the second containing the remainder.
2961 Return the first stmt in the second group. */
2963 static stmt_vec_info
2964 vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
2966 gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
2967 gcc_assert (group1_size > 0);
2968 int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
2969 gcc_assert (group2_size > 0);
2970 DR_GROUP_SIZE (first_vinfo) = group1_size;
2972 stmt_vec_info stmt_info = first_vinfo;
2973 for (unsigned i = group1_size; i > 1; i--)
2975 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
2976 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2978 /* STMT is now the last element of the first group. */
2979 stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
2980 DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
2982 DR_GROUP_SIZE (group2) = group2_size;
2983 for (stmt_info = group2; stmt_info;
2984 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
2986 DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
2987 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2990 /* For the second group, the DR_GROUP_GAP is that before the original group,
2991 plus skipping over the first vector. */
2992 DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
2994 /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
2995 DR_GROUP_GAP (first_vinfo) += group2_size;
2997 if (dump_enabled_p ())
2998 dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
2999 group1_size, group2_size);
3001 return group2;
3004 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
3005 statements and a vector of NUNITS elements. */
3007 static poly_uint64
3008 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
3010 return exact_div (common_multiple (nunits, group_size), group_size);
3013 /* Helper that checks whether ROOT is a grouped-load SLP node. */
3015 static inline bool
3016 vect_is_slp_load_node (slp_tree root)
3018 return SLP_TREE_DEF_TYPE (root) == vect_internal_def
3019 && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_REPRESENTATIVE (root))
3020 && DR_IS_READ (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (root)));
3024 /* Helper function of optimize_load_redistribution that performs the operation
3025 recursively. */
3027 static slp_tree
3028 optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t *bst_map,
3029 vec_info *vinfo, unsigned int group_size,
3030 hash_map<slp_tree, slp_tree> *load_map,
3031 slp_tree root)
3033 if (slp_tree *leader = load_map->get (root))
3034 return *leader;
3036 slp_tree node;
3037 unsigned i;
3039 /* For now, we don't know anything about externals so do not do anything. */
3040 if (!root || SLP_TREE_DEF_TYPE (root) != vect_internal_def)
3041 return NULL;
3042 else if (SLP_TREE_CODE (root) == VEC_PERM_EXPR)
3044 /* First convert this node into a load node and add it to the leaves
3045 list and flatten the permute from a lane to a load one. If it's
3046 unneeded it will be elided later. */
3047 vec<stmt_vec_info> stmts;
3048 stmts.create (SLP_TREE_LANES (root));
3049 lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION (root);
3050 for (unsigned j = 0; j < lane_perm.length (); j++)
3052 std::pair<unsigned, unsigned> perm = lane_perm[j];
3053 node = SLP_TREE_CHILDREN (root)[perm.first];
3055 if (!vect_is_slp_load_node (node)
3056 || SLP_TREE_CHILDREN (node).exists ())
3058 stmts.release ();
3059 goto next;
3062 stmts.quick_push (SLP_TREE_SCALAR_STMTS (node)[perm.second]);
3065 if (dump_enabled_p ())
3066 dump_printf_loc (MSG_NOTE, vect_location,
3067 "converting stmts on permute node %p\n",
3068 (void *) root);
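/* Re-run SLP discovery on the gathered load stmts; keying on BST_MAP
   CSEs the result with existing load nodes where possible. */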
3070 bool *matches = XALLOCAVEC (bool, group_size);
3071 poly_uint64 max_nunits = 1;
3072 unsigned tree_size = 0, limit = 1;
3073 node = vect_build_slp_tree (vinfo, stmts, group_size, &max_nunits,
3074 matches, &limit, &tree_size, bst_map);
3075 if (!node)
3076 stmts.release ();
3078 load_map->put (root, node);
3079 return node;
3082 next:
3083 load_map->put (root, NULL);
3085 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
3087 slp_tree value
3088 = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
3089 node);
3090 if (value)
3092 SLP_TREE_REF_COUNT (value)++;
3093 SLP_TREE_CHILDREN (root)[i] = value;
3094 /* ??? We know the original leaves of the replaced nodes will
3095 be referenced by bst_map, only the permutes created by
3096 pattern matching are not. */
3097 if (SLP_TREE_REF_COUNT (node) == 1)
3098 load_map->remove (node);
3099 vect_free_slp_tree (node);
3103 return NULL;
3106 /* Temporary workaround for loads not being CSEd during SLP build. This
3107 function will traverse the SLP tree rooted in ROOT and find
3108 VEC_PERM nodes that blend vectors from multiple nodes that all read from the
3109 same DR such that the final operation is equal to a permuted load. Such
3110 nodes are then directly converted into loads themselves. The nodes are
3111 CSEd using BST_MAP. */
3113 static void
3114 optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t *bst_map,
3115 vec_info *vinfo, unsigned int group_size,
3116 hash_map<slp_tree, slp_tree> *load_map,
3117 slp_tree root)
3119 slp_tree node;
3120 unsigned i;
3122 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
3124 slp_tree value
3125 = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
3126 node);
3127 if (value)
3129 SLP_TREE_REF_COUNT (value)++;
3130 SLP_TREE_CHILDREN (root)[i] = value;
3131 /* ??? We know the original leaves of the replaced nodes will
3132 be referenced by bst_map, only the permutes created by
3133 pattern matching are not. */
3134 if (SLP_TREE_REF_COUNT (node) == 1)
3135 load_map->remove (node);
3136 vect_free_slp_tree (node);
3141 /* Helper function of vect_match_slp_patterns.
3143 Attempts to match patterns against the slp tree rooted in REF_NODE using
3144 VINFO. Patterns are matched in post-order traversal.
3146 Returns true if any pattern matched; in that case the node referenced
3147 by REF_NODE has been updated in place. */
3149 static bool
3150 vect_match_slp_patterns_2 (slp_tree *ref_node, vec_info *vinfo,
3151 slp_tree_to_load_perm_map_t *perm_cache,
3152 slp_compat_nodes_map_t *compat_cache,
3153 hash_set<slp_tree> *visited)
3155 unsigned i;
3156 slp_tree node = *ref_node;
3157 bool found_p = false;
3158 if (!node || visited->add (node))
3159 return false;
3161 slp_tree child;
3162 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3163 found_p |= vect_match_slp_patterns_2 (&SLP_TREE_CHILDREN (node)[i],
3164 vinfo, perm_cache, compat_cache,
3165 visited);
3167 for (unsigned x = 0; x < num__slp_patterns; x++)
3169 vect_pattern *pattern
3170 = slp_patterns[x] (perm_cache, compat_cache, ref_node);
3171 if (pattern)
3173 pattern->build (vinfo);
3174 delete pattern;
3175 found_p = true;
3179 return found_p;
3182 /* Applies pattern matching to the SLP tree of INSTANCE using
3183 vec_info VINFO.
3185 Returns true if any pattern matched. Patterns are tried in order and
3186 multiple patterns may match. */
3188 static bool
3189 vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
3190 hash_set<slp_tree> *visited,
3191 slp_tree_to_load_perm_map_t *perm_cache,
3192 slp_compat_nodes_map_t *compat_cache)
3194 DUMP_VECT_SCOPE ("vect_match_slp_patterns");
3195 slp_tree *ref_node = &SLP_INSTANCE_TREE (instance);
3197 if (dump_enabled_p ())
3198 dump_printf_loc (MSG_NOTE, vect_location,
3199 "Analyzing SLP tree %p for patterns\n",
3200 (void *) SLP_INSTANCE_TREE (instance));
3202 return vect_match_slp_patterns_2 (ref_node, vinfo, perm_cache, compat_cache,
3203 visited);
3206 /* STMT_INFO is a store group of size GROUP_SIZE that we are considering
3207 splitting into two, with the first split group having size NEW_GROUP_SIZE.
3208 Return true if we could use IFN_STORE_LANES instead and if that appears
3209 to be the better approach. */
3211 static bool
3212 vect_slp_prefer_store_lanes_p (vec_info *vinfo, stmt_vec_info stmt_info,
3213 unsigned int group_size,
3214 unsigned int new_group_size)
3216 tree scalar_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
3217 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
3218 if (!vectype)
3219 return false;
3220 /* Allow the split if one of the two new groups would operate on full
3221 vectors *within* rather than across one scalar loop iteration.
3222 This is purely a heuristic, but it should work well for group
3223 sizes of 3 and 4, where the possible splits are:
3225 3->2+1: OK if the vector has exactly two elements
3226 4->2+2: Likewise
3227 4->3+1: Less clear-cut. */
3228 if (multiple_p (group_size - new_group_size, TYPE_VECTOR_SUBPARTS (vectype))
3229 || multiple_p (new_group_size, TYPE_VECTOR_SUBPARTS (vectype)))
3230 return false;
3231 return vect_store_lanes_supported (vectype, group_size, false) != IFN_LAST;
3234 /* Analyze an SLP instance starting from a group of grouped stores. Call
3235 vect_build_slp_tree to build a tree of packed stmts if possible.
3236 Return FALSE if it's impossible to SLP any stmt in the loop. */
3238 static bool
3239 vect_analyze_slp_instance (vec_info *vinfo,
3240 scalar_stmts_to_slp_tree_map_t *bst_map,
3241 stmt_vec_info stmt_info, slp_instance_kind kind,
3242 unsigned max_tree_size, unsigned *limit);
3244 /* Analyze an SLP instance starting from SCALAR_STMTS which are a group
3245 of KIND. Return true if successful. */
3247 static bool
3248 vect_build_slp_instance (vec_info *vinfo,
3249 slp_instance_kind kind,
3250 vec<stmt_vec_info> &scalar_stmts,
3251 vec<stmt_vec_info> &root_stmt_infos,
3252 vec<tree> &remain,
3253 unsigned max_tree_size, unsigned *limit,
3254 scalar_stmts_to_slp_tree_map_t *bst_map,
3255 /* ??? We need stmt_info for group splitting. */
3256 stmt_vec_info stmt_info_)
3258 if (kind == slp_inst_kind_ctor)
3260 if (dump_enabled_p ())
3261 dump_printf_loc (MSG_NOTE, vect_location,
3262 "Analyzing vectorizable constructor: %G\n",
3263 root_stmt_infos[0]->stmt);
3266 if (dump_enabled_p ())
3268 dump_printf_loc (MSG_NOTE, vect_location,
3269 "Starting SLP discovery for\n");
3270 for (unsigned i = 0; i < scalar_stmts.length (); ++i)
3271 dump_printf_loc (MSG_NOTE, vect_location,
3272 " %G", scalar_stmts[i]->stmt);
3275 /* When a BB reduction doesn't have an even number of lanes
3276 strip it down, treating the remaining lane as scalar.
3277 ??? Selecting the optimal set of lanes to vectorize would be nice
3278 but SLP build for all lanes will fail quickly because we think
3279 we're going to need unrolling. */
3280 if (kind == slp_inst_kind_bb_reduc
3281 && (scalar_stmts.length () & 1))
3282 remain.safe_insert (0, gimple_get_lhs (scalar_stmts.pop ()->stmt));
3284 /* Build the tree for the SLP instance. */
3285 unsigned int group_size = scalar_stmts.length ();
3286 bool *matches = XALLOCAVEC (bool, group_size);
3287 poly_uint64 max_nunits = 1;
3288 unsigned tree_size = 0;
3289 unsigned i;
3290 slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
3291 &max_nunits, matches, limit,
3292 &tree_size, bst_map);
3293 if (node != NULL)
3295 /* Calculate the unrolling factor based on the smallest type. */
3296 poly_uint64 unrolling_factor
3297 = calculate_unrolling_factor (max_nunits, group_size);
3299 if (maybe_ne (unrolling_factor, 1U)
3300 && is_a <bb_vec_info> (vinfo))
3302 unsigned HOST_WIDE_INT const_max_nunits;
3303 if (!max_nunits.is_constant (&const_max_nunits)
3304 || const_max_nunits > group_size)
3306 if (dump_enabled_p ())
3307 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3308 "Build SLP failed: store group "
3309 "size not a multiple of the vector size "
3310 "in basic block SLP\n");
3311 vect_free_slp_tree (node);
3312 return false;
3314 /* Fatal mismatch. */
3315 if (dump_enabled_p ())
3316 dump_printf_loc (MSG_NOTE, vect_location,
3317 "SLP discovery succeeded but node needs "
3318 "splitting\n");
3319 memset (matches, true, group_size);
3320 matches[group_size / const_max_nunits * const_max_nunits] = false;
3321 vect_free_slp_tree (node);
3323 else
3325 /* Create a new SLP instance. */
3326 slp_instance new_instance = XNEW (class _slp_instance);
3327 SLP_INSTANCE_TREE (new_instance) = node;
3328 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
3329 SLP_INSTANCE_LOADS (new_instance) = vNULL;
3330 SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
3331 SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
3332 SLP_INSTANCE_KIND (new_instance) = kind;
3333 new_instance->reduc_phis = NULL;
3334 new_instance->cost_vec = vNULL;
3335 new_instance->subgraph_entries = vNULL;
3337 if (dump_enabled_p ())
3338 dump_printf_loc (MSG_NOTE, vect_location,
3339 "SLP size %u vs. limit %u.\n",
3340 tree_size, max_tree_size);
3342 /* Fixup SLP reduction chains. */
3343 if (kind == slp_inst_kind_reduc_chain)
3345 /* If this is a reduction chain with a conversion in front
3346 amend the SLP tree with a node for that. */
3347 gimple *scalar_def
3348 = vect_orig_stmt (scalar_stmts[group_size - 1])->stmt;
3349 if (STMT_VINFO_DEF_TYPE (scalar_stmts[0]) != vect_reduction_def)
3351 /* Get at the conversion stmt - we know it's the single use
3352 of the last stmt of the reduction chain. */
3353 use_operand_p use_p;
3354 bool r = single_imm_use (gimple_assign_lhs (scalar_def),
3355 &use_p, &scalar_def);
3356 gcc_assert (r);
3357 stmt_vec_info next_info = vinfo->lookup_stmt (scalar_def);
3358 next_info = vect_stmt_to_vectorize (next_info);
3359 scalar_stmts = vNULL;
3360 scalar_stmts.create (group_size);
3361 for (unsigned i = 0; i < group_size; ++i)
3362 scalar_stmts.quick_push (next_info);
3363 slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
3364 SLP_TREE_VECTYPE (conv) = STMT_VINFO_VECTYPE (next_info);
3365 SLP_TREE_CHILDREN (conv).quick_push (node);
3366 SLP_INSTANCE_TREE (new_instance) = conv;
3367 /* We also have to fake this conversion stmt as SLP reduction
3368 group so we don't have to mess with too much code
3369 elsewhere. */
3370 REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
3371 REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
3373 /* Fill the backedge child of the PHI SLP node. The
3374 general matching code cannot find it because the
3375 scalar code does not reflect how we vectorize the
3376 reduction. */
3377 use_operand_p use_p;
3378 imm_use_iterator imm_iter;
3379 class loop *loop = LOOP_VINFO_LOOP (as_a <loop_vec_info> (vinfo));
3380 FOR_EACH_IMM_USE_FAST (use_p, imm_iter,
3381 gimple_get_lhs (scalar_def))
3382 /* There are exactly two non-debug uses, the reduction
3383 PHI and the loop-closed PHI node. */
3384 if (!is_gimple_debug (USE_STMT (use_p))
3385 && gimple_bb (USE_STMT (use_p)) == loop->header)
3387 auto_vec<stmt_vec_info, 64> phis (group_size);
3388 stmt_vec_info phi_info
3389 = vinfo->lookup_stmt (USE_STMT (use_p));
3390 for (unsigned i = 0; i < group_size; ++i)
3391 phis.quick_push (phi_info);
3392 slp_tree *phi_node = bst_map->get (phis);
3393 unsigned dest_idx = loop_latch_edge (loop)->dest_idx;
3394 SLP_TREE_CHILDREN (*phi_node)[dest_idx]
3395 = SLP_INSTANCE_TREE (new_instance);
3396 SLP_INSTANCE_TREE (new_instance)->refcnt++;
3400 vinfo->slp_instances.safe_push (new_instance);
3402 /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
3403 the number of scalar stmts in the root in a few places.
3404 Verify that assumption holds. */
3405 gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
3406 .length () == group_size);
3408 if (dump_enabled_p ())
3410 dump_printf_loc (MSG_NOTE, vect_location,
3411 "Final SLP tree for instance %p:\n",
3412 (void *) new_instance);
3413 vect_print_slp_graph (MSG_NOTE, vect_location,
3414 SLP_INSTANCE_TREE (new_instance));
3417 return true;
3420 else
3422 /* Failed to SLP. */
3423 /* Free the allocated memory. */
3424 scalar_stmts.release ();
3427 stmt_vec_info stmt_info = stmt_info_;
3428 /* Try to break the group up into pieces. */
3429 if (kind == slp_inst_kind_store)
3431 /* ??? We could delay all the actual splitting of store-groups
3432 until after SLP discovery of the original group completed.
3433 Then we can recurse to vect_build_slp_instance directly. */
3434 for (i = 0; i < group_size; i++)
3435 if (!matches[i])
3436 break;
3438 /* For basic block SLP, try to break the group up into multiples of
3439 a vector size. */
3440 if (is_a <bb_vec_info> (vinfo)
3441 && (i > 1 && i < group_size))
3443 tree scalar_type
3444 = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
3445 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
3446 1 << floor_log2 (i));
3447 unsigned HOST_WIDE_INT const_nunits;
3448 if (vectype
3449 && TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits))
3451 /* Split into two groups at the first vector boundary. */
3452 gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
3453 unsigned group1_size = i & ~(const_nunits - 1);
3455 if (dump_enabled_p ())
3456 dump_printf_loc (MSG_NOTE, vect_location,
3457 "Splitting SLP group at stmt %u\n", i);
3458 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
3459 group1_size);
3460 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
3461 kind, max_tree_size,
3462 limit);
3463 /* Split the rest at the failure point and possibly
3464 re-analyze the remaining matching part if it has
3465 at least two lanes. */
3466 if (group1_size < i
3467 && (i + 1 < group_size
3468 || i - group1_size > 1))
3470 stmt_vec_info rest2 = rest;
3471 rest = vect_split_slp_store_group (rest, i - group1_size);
3472 if (i - group1_size > 1)
3473 res |= vect_analyze_slp_instance (vinfo, bst_map, rest2,
3474 kind, max_tree_size,
3475 limit);
3477 /* Re-analyze the non-matching tail if it has at least
3478 two lanes. */
3479 if (i + 1 < group_size)
3480 res |= vect_analyze_slp_instance (vinfo, bst_map,
3481 rest, kind, max_tree_size,
3482 limit);
3483 return res;
3487 /* For loop vectorization split into arbitrary pieces of size > 1. */
3488 if (is_a <loop_vec_info> (vinfo)
3489 && (i > 1 && i < group_size)
3490 && !vect_slp_prefer_store_lanes_p (vinfo, stmt_info, group_size, i))
3492 unsigned group1_size = i;
3494 if (dump_enabled_p ())
3495 dump_printf_loc (MSG_NOTE, vect_location,
3496 "Splitting SLP group at stmt %u\n", i);
3498 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
3499 group1_size);
3500 /* Loop vectorization cannot handle gaps in stores, make sure
3501 the split group appears as strided. */
3502 STMT_VINFO_STRIDED_P (rest) = 1;
3503 DR_GROUP_GAP (rest) = 0;
3504 STMT_VINFO_STRIDED_P (stmt_info) = 1;
3505 DR_GROUP_GAP (stmt_info) = 0;
3507 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
3508 kind, max_tree_size, limit);
3509 if (i + 1 < group_size)
3510 res |= vect_analyze_slp_instance (vinfo, bst_map,
3511 rest, kind, max_tree_size, limit);
3513 return res;
3516 /* Even though the first vector did not all match, we might be able to SLP
3517 (some) of the remainder. FORNOW ignore this possibility. */
3520 /* Failed to SLP. */
3521 if (dump_enabled_p ())
3522 dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
3523 return false;
3527 /* Analyze an SLP instance starting from a group of grouped stores. Call
3528 vect_build_slp_tree to build a tree of packed stmts if possible.
3529 Return FALSE if it's impossible to SLP any stmt in the loop. */
3531 static bool
3532 vect_analyze_slp_instance (vec_info *vinfo,
3533 scalar_stmts_to_slp_tree_map_t *bst_map,
3534 stmt_vec_info stmt_info,
3535 slp_instance_kind kind,
3536 unsigned max_tree_size, unsigned *limit)
3538 unsigned int i;
3539 vec<stmt_vec_info> scalar_stmts;
3541 if (is_a <bb_vec_info> (vinfo))
3542 vect_location = stmt_info->stmt;
3544 stmt_vec_info next_info = stmt_info;
3545 if (kind == slp_inst_kind_store)
3547 /* Collect the stores and store them in scalar_stmts. */
3548 scalar_stmts.create (DR_GROUP_SIZE (stmt_info));
3549 while (next_info)
3551 scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
3552 next_info = DR_GROUP_NEXT_ELEMENT (next_info);
3555 else if (kind == slp_inst_kind_reduc_chain)
3557 /* Collect the reduction stmts and store them in scalar_stmts. */
3558 scalar_stmts.create (REDUC_GROUP_SIZE (stmt_info));
3559 while (next_info)
3561 scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
3562 next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
3564 /* Mark the first element of the reduction chain as reduction to properly
3565 transform the node. In the reduction analysis phase only the last
3566 element of the chain is marked as reduction. */
3567 STMT_VINFO_DEF_TYPE (stmt_info)
3568 = STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
3569 STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
3570 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
3572 else if (kind == slp_inst_kind_reduc_group)
3574 /* Collect reduction statements. */
3575 const vec<stmt_vec_info> &reductions
3576 = as_a <loop_vec_info> (vinfo)->reductions;
3577 scalar_stmts.create (reductions.length ());
3578 for (i = 0; reductions.iterate (i, &next_info); i++)
3579 if ((STMT_VINFO_RELEVANT_P (next_info)
3580 || STMT_VINFO_LIVE_P (next_info))
3581 /* ??? Make sure we didn't skip a conversion around a reduction
3582 path. In that case we'd have to reverse engineer that conversion
3583 stmt following the chain using reduc_idx and from the PHI
3584 using reduc_def. */
3585 && STMT_VINFO_DEF_TYPE (next_info) == vect_reduction_def)
3586 scalar_stmts.quick_push (next_info);
3587 /* If less than two were relevant/live there's nothing to SLP. */
3588 if (scalar_stmts.length () < 2)
3589 return false;
3591 else
3592 gcc_unreachable ();
3594 vec<stmt_vec_info> roots = vNULL;
3595 vec<tree> remain = vNULL;
3596 /* Build the tree for the SLP instance. */
3597 bool res = vect_build_slp_instance (vinfo, kind, scalar_stmts,
3598 roots, remain,
3599 max_tree_size, limit, bst_map,
3600 kind == slp_inst_kind_store
3601 ? stmt_info : NULL);
3603 /* ??? If this is slp_inst_kind_store and the above succeeded here's
3604 where we should do store group splitting. */
3606 return res;
3609 /* Check if there are stmts in the loop that can be vectorized using SLP.
3610 Build SLP trees of packed scalar stmts if SLP is possible. */
3612 opt_result
3613 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
3615 unsigned int i;
3616 stmt_vec_info first_element;
3617 slp_instance instance;
3619 DUMP_VECT_SCOPE ("vect_analyze_slp");
3621 unsigned limit = max_tree_size;
3623 scalar_stmts_to_slp_tree_map_t *bst_map
3624 = new scalar_stmts_to_slp_tree_map_t ();
3626 /* Find SLP sequences starting from groups of grouped stores. */
3627 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
3628 vect_analyze_slp_instance (vinfo, bst_map, first_element,
3629 slp_inst_kind_store, max_tree_size, &limit);
3631 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
3633 for (unsigned i = 0; i < bb_vinfo->roots.length (); ++i)
3635 vect_location = bb_vinfo->roots[i].roots[0]->stmt;
3636 if (vect_build_slp_instance (bb_vinfo, bb_vinfo->roots[i].kind,
3637 bb_vinfo->roots[i].stmts,
3638 bb_vinfo->roots[i].roots,
3639 bb_vinfo->roots[i].remain,
3640 max_tree_size, &limit, bst_map, NULL))
3642 bb_vinfo->roots[i].stmts = vNULL;
3643 bb_vinfo->roots[i].roots = vNULL;
3644 bb_vinfo->roots[i].remain = vNULL;
3649 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3651 /* Find SLP sequences starting from reduction chains. */
3652 FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
3653 if (! STMT_VINFO_RELEVANT_P (first_element)
3654 && ! STMT_VINFO_LIVE_P (first_element))
3656 else if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
3657 slp_inst_kind_reduc_chain,
3658 max_tree_size, &limit))
3660 /* Dissolve reduction chain group. */
3661 stmt_vec_info vinfo = first_element;
3662 stmt_vec_info last = NULL;
3663 while (vinfo)
3665 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
3666 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
3667 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
3668 last = vinfo;
3669 vinfo = next;
3671 STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
3672 /* It can still be vectorized as part of an SLP reduction. */
3673 loop_vinfo->reductions.safe_push (last);
3676 /* Find SLP sequences starting from groups of reductions. */
3677 if (loop_vinfo->reductions.length () > 1)
3678 vect_analyze_slp_instance (vinfo, bst_map, loop_vinfo->reductions[0],
3679 slp_inst_kind_reduc_group, max_tree_size,
3680 &limit);
3683 hash_set<slp_tree> visited_patterns;
3684 slp_tree_to_load_perm_map_t perm_cache;
3685 slp_compat_nodes_map_t compat_cache;
3687 /* See if any patterns can be found in the SLP tree. */
3688 bool pattern_found = false;
3689 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3690 pattern_found |= vect_match_slp_patterns (instance, vinfo,
3691 &visited_patterns, &perm_cache,
3692 &compat_cache);
3694 /* If any were found optimize permutations of loads. */
3695 if (pattern_found)
3697 hash_map<slp_tree, slp_tree> load_map;
3698 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3700 slp_tree root = SLP_INSTANCE_TREE (instance);
3701 optimize_load_redistribution (bst_map, vinfo, SLP_TREE_LANES (root),
3702 &load_map, root);
3708 /* The map keeps a reference to the SLP nodes built; release that. */
3709 for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
3710 it != bst_map->end (); ++it)
3711 if ((*it).second)
3712 vect_free_slp_tree ((*it).second);
3713 delete bst_map;
3715 if (pattern_found && dump_enabled_p ())
3717 dump_printf_loc (MSG_NOTE, vect_location,
3718 "Pattern matched SLP tree\n");
3719 hash_set<slp_tree> visited;
3720 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3721 vect_print_slp_graph (MSG_NOTE, vect_location,
3722 SLP_INSTANCE_TREE (instance), visited);
3725 return opt_result::success ();
3728 /* Estimates the cost of inserting layout changes into the SLP graph.
3729 It can also say that the insertion is impossible. */
3731 struct slpg_layout_cost
3733 slpg_layout_cost () = default;
3734 slpg_layout_cost (sreal, bool);
3736 static slpg_layout_cost impossible () { return { sreal::max (), 0 }; }
3737 bool is_possible () const { return depth != sreal::max (); }
3739 bool operator== (const slpg_layout_cost &) const;
3740 bool operator!= (const slpg_layout_cost &) const;
3742 bool is_better_than (const slpg_layout_cost &, bool) const;
3744 void add_parallel_cost (const slpg_layout_cost &);
3745 void add_serial_cost (const slpg_layout_cost &);
3746 void split (unsigned int);
3748 /* The longest sequence of layout changes needed during any traversal
3749 of the partition dag, weighted by execution frequency.
3751 This is the most important metric when optimizing for speed, since
3752 it helps to ensure that we keep the number of operations on
3753 critical paths to a minimum. */
3754 sreal depth = 0;
3756 /* An estimate of the total number of operations needed. It is weighted by
3757 execution frequency when optimizing for speed but not when optimizing for
3758 size. In order to avoid double-counting, a node with a fanout of N will
3759 distribute 1/N of its total cost to each successor.
3761 This is the most important metric when optimizing for size, since
3762 it helps to keep the total number of operations to a minimum. */
3763 sreal total = 0;
3766 /* Construct costs for a node with weight WEIGHT. A higher weight
3767 indicates more frequent execution. IS_FOR_SIZE is true if we are
3768 optimizing for size rather than speed. */
3770 slpg_layout_cost::slpg_layout_cost (sreal weight, bool is_for_size)
3771 : depth (weight), total (is_for_size && weight > 0 ? 1 : weight)
3775 bool
3776 slpg_layout_cost::operator== (const slpg_layout_cost &other) const
3778 return depth == other.depth && total == other.total;
3781 bool
3782 slpg_layout_cost::operator!= (const slpg_layout_cost &other) const
3784 return !operator== (other);
3787 /* Return true if these costs are better than OTHER. IS_FOR_SIZE is
3788 true if we are optimizing for size rather than speed. */
3790 bool
3791 slpg_layout_cost::is_better_than (const slpg_layout_cost &other,
3792 bool is_for_size) const
3794 if (is_for_size)
3796 if (total != other.total)
3797 return total < other.total;
3798 return depth < other.depth;
3800 else
3802 if (depth != other.depth)
3803 return depth < other.depth;
3804 return total < other.total;
3808 /* Increase the costs to account for something with cost INPUT_COST
3809 happening in parallel with the current costs. */
3811 void
3812 slpg_layout_cost::add_parallel_cost (const slpg_layout_cost &input_cost)
3814 depth = std::max (depth, input_cost.depth);
3815 total += input_cost.total;
3818 /* Increase the costs to account for something with cost INPUT_COST
3819 happening in series with the current costs. */
3821 void
3822 slpg_layout_cost::add_serial_cost (const slpg_layout_cost &other)
3824 depth += other.depth;
3825 total += other.total;
3828 /* Split the total cost among TIMES successors or predecessors. */
3830 void
3831 slpg_layout_cost::split (unsigned int times)
3833 if (times > 1)
3834 total /= times;
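/* An illustrative worked example, not from the original source: suppose a
partition with out_degree 2 has accumulated cost { depth = 4, total = 6 }.
split (2) charges each successor { depth = 4, total = 3 } so that the total
is not double-counted. A consumer combining two such inputs with
add_parallel_cost gets { depth = 4, total = 6 } (parallel combination takes
the maximum depth), whereas combining them with add_serial_cost would give
{ depth = 8, total = 6 } (serial combination adds the depths); both sum the
totals. */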
3837 /* Information about one node in the SLP graph, for use during
3838 vect_optimize_slp_pass. */
3840 struct slpg_vertex
3842 slpg_vertex (slp_tree node_) : node (node_) {}
3844 /* The node itself. */
3845 slp_tree node;
3847 /* Which partition the node belongs to, or -1 if none. Nodes outside of
3848 partitions are flexible; they can have whichever layout consumers
3849 want them to have. */
3850 int partition = -1;
3852 /* The number of nodes that directly use the result of this one
3853 (i.e. the number of nodes that count this one as a child). */
3854 unsigned int out_degree = 0;
3856 /* The execution frequency of the node. */
3857 sreal weight = 0;
3859 /* The total execution frequency of all nodes that directly use the
3860 result of this one. */
3861 sreal out_weight = 0;
3864 /* Information about one partition of the SLP graph, for use during
3865 vect_optimize_slp_pass. */
3867 struct slpg_partition_info
3869 /* The nodes in the partition occupy indices [NODE_BEGIN, NODE_END)
3870 of m_partitioned_nodes. */
3871 unsigned int node_begin = 0;
3872 unsigned int node_end = 0;
3874 /* Which layout we've chosen to use for this partition, or -1 if
3875 we haven't picked one yet. */
3876 int layout = -1;
3878 /* The number of predecessors and successors in the partition dag.
3879 The predecessors always have lower partition numbers and the
3880 successors always have higher partition numbers.
3882 Note that the directions of these edges are not necessarily the
3883 same as in the data flow graph. For example, if an SCC has separate
3884 partitions for an inner loop and an outer loop, the inner loop's
3885 partition will have at least two incoming edges from the outer loop's
3886 partition: one for a live-in value and one for a live-out value.
3887 In data flow terms, one of these edges would also be from the outer loop
3888 to the inner loop, but the other would be in the opposite direction. */
3889 unsigned int in_degree = 0;
3890 unsigned int out_degree = 0;
3893 /* Information about the costs of using a particular layout for a
3894 particular partition. It can also say that the combination is
3895 impossible. */
3897 struct slpg_partition_layout_costs
3899 bool is_possible () const { return internal_cost.is_possible (); }
3900 void mark_impossible () { internal_cost = slpg_layout_cost::impossible (); }
3902 /* The costs inherited from predecessor partitions. */
3903 slpg_layout_cost in_cost;
3905 /* The inherent cost of the layout within the node itself. For example,
3906 this is nonzero for a load if choosing a particular layout would require
3907 the load to permute the loaded elements. It is nonzero for a
3908 VEC_PERM_EXPR if the permutation cannot be eliminated or converted
3909 to full-vector moves. */
3910 slpg_layout_cost internal_cost;
3912 /* The costs inherited from successor partitions. */
3913 slpg_layout_cost out_cost;
3916 /* This class tries to optimize the layout of vectors in order to avoid
3917 unnecessary shuffling. At the moment, the set of possible layouts is
3918 restricted to bijective permutations.
3920 The goal of the pass depends on whether we're optimizing for size or
3921 for speed. When optimizing for size, the goal is to reduce the overall
3922 number of layout changes (including layout changes implied by things
3923 like load permutations). When optimizing for speed, the goal is to
3924 reduce the maximum latency attributable to layout changes on any
3925 non-cyclical path through the data flow graph.
3927 For example, when optimizing a loop nest for speed, we will prefer
3928 to make layout changes outside of a loop rather than inside of a loop,
3929 and will prefer to make layout changes in parallel rather than serially,
3930 even if that increases the overall number of layout changes.
3932 The high-level procedure is:
3934 (1) Build a graph in which edges go from uses (parents) to definitions
3935 (children).
3937 (2) Divide the graph into a dag of strongly-connected components (SCCs).
3939 (3) When optimizing for speed, partition the nodes in each SCC based
3940 on their containing cfg loop. When optimizing for size, treat
3941 each SCC as a single partition.
3943 This gives us a dag of partitions. The goal is now to assign a
3944 layout to each partition.
3946 (4) Construct a set of vector layouts that are worth considering.
3947 Record which nodes must keep their current layout.
3949 (5) Perform a forward walk over the partition dag (from loads to stores)
3950 accumulating the "forward" cost of using each layout. When visiting
3951 each partition, assign a tentative choice of layout to the partition
3952 and use that choice when calculating the cost of using a different
3953 layout in successor partitions.
3955 (6) Perform a backward walk over the partition dag (from stores to loads),
3956 accumulating the "backward" cost of using each layout. When visiting
3957 each partition, make a final choice of layout for that partition based
3958 on the accumulated forward costs (from (5)) and backward costs
3959 (from (6)).
3961 (7) Apply the chosen layouts to the SLP graph.
3963 For example, consider the SLP statements:
3965 S1: a_1 = load
3966 loop:
3967 S2: a_2 = PHI<a_1, a_3>
3968 S3: b_1 = load
3969 S4: a_3 = a_2 + b_1
3970 exit:
3971 S5: a_4 = PHI<a_3>
3972 S6: store a_4
3974 S2 and S4 form an SCC and are part of the same loop. Every other
3975 statement is in a singleton SCC. In this example there is a one-to-one
3976 mapping between SCCs and partitions and the partition dag looks like this:
3978 S1      S3
3979  \      /
3980   S2+S4
3981     |
3982     S5
3983     |
3984     S6
3986 S2, S3 and S4 will have a higher execution frequency than the other
3987 statements, so when optimizing for speed, the goal is to avoid any
3988 layout changes:
3990 - within S3
3991 - within S2+S4
3992 - on the S3->S2+S4 edge
3994 For example, if S3 was originally a reversing load, the goal of the
3995 pass is to make it an unreversed load and change the layout on the
3996 S1->S2+S4 and S2+S4->S5 edges to compensate. (Changing the layout
3997 on S1->S2+S4 and S5->S6 would also be acceptable.)
3999 The difference between SCCs and partitions becomes important if we
4000 add an outer loop:
4002 S1: a_1 = ...
4003 loop1:
4004 S2: a_2 = PHI<a_1, a_6>
4005 S3: b_1 = load
4006 S4: a_3 = a_2 + b_1
4007 loop2:
4008 S5: a_4 = PHI<a_3, a_5>
4009 S6: c_1 = load
4010 S7: a_5 = a_4 + c_1
4011 exit2:
4012 S8: a_6 = PHI<a_5>
4013 S9: store a_6
4014 exit1:
4016 Here, S2, S4, S5, S7 and S8 form a single SCC. However, when optimizing
4017 for speed, we usually do not want restrictions in the outer loop to "infect"
4018 the decision for the inner loop. For example, if an outer-loop node
4019 in the SCC contains a statement with a fixed layout, that should not
4020 prevent the inner loop from using a different layout. Conversely,
4021 the inner loop should not dictate a layout to the outer loop: if the
4022 outer loop does a lot of computation, then it may not be efficient to
4023 do all of that computation in the inner loop's preferred layout.
4025 So when optimizing for speed, we partition the SCC into S2+S4+S8 (outer)
4026 and S5+S7 (inner). We also try to arrange partitions so that:
4028 - the partition for an outer loop comes before the partition for
4029 an inner loop
4031 - if a sibling loop A dominates a sibling loop B, A's partition
4032 comes before B's
4034 This gives the following partition dag for the example above:
4036 S1      S3
4037  \      /
4038   S2+S4+S8    S6
4039     |    \\   /
4040     |     S5+S7
4041     |
4042    S9
4044 There are two edges from S2+S4+S8 to S5+S7: one for the edge S4->S5 and
4045 one for a reversal of the edge S7->S8.
4047 The backward walk picks a layout for S5+S7 before S2+S4+S8. The choice
4048 for S2+S4+S8 therefore has to balance the cost of using the outer loop's
4049 preferred layout against the cost of changing the layout on entry to the
4050 inner loop (S4->S5) and on exit from the inner loop (S7->S8 reversed).
4052 Although this works well when optimizing for speed, it has the downside
4053 when optimizing for size that the choice of layout for S5+S7 is completely
4054 independent of S9, which lessens the chance of reducing the overall number
4055 of permutations. We therefore do not partition SCCs when optimizing
4056 for size.
4058 To give a concrete example of the difference between optimizing
4059 for size and speed, consider:
4061 a[0] = (b[1] << c[3]) - d[1];
4062 a[1] = (b[0] << c[2]) - d[0];
4063 a[2] = (b[3] << c[1]) - d[3];
4064 a[3] = (b[2] << c[0]) - d[2];
4066 There are three different layouts here: one for a, one for b and d,
4067 and one for c. When optimizing for speed it is better to permute each
4068 of b, c and d into the order required by a, since those permutations
4069 happen in parallel. But when optimizing for size, it is better to:
4071 - permute c into the same order as b
4072 - do the arithmetic
4073 - permute the result into the order required by a
4075 This gives 2 permutations rather than 3. */
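/* To relate the example above to the cost metrics used below (an
illustrative reading, not spelled out in the original comment): permuting
b, c and d independently costs 3 in "total" but only 1 in "depth", because
the three permutations can run in parallel; permuting c to match b and then
permuting the result costs 2 in "total" but 2 in "depth", because those two
permutations are serial. Speed therefore prefers the first option and size
the second. */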
4077 class vect_optimize_slp_pass
4079 public:
4080 vect_optimize_slp_pass (vec_info *vinfo) : m_vinfo (vinfo) {}
4081 void run ();
4083 private:
4084 /* Graph building. */
4085 struct loop *containing_loop (slp_tree);
4086 bool is_cfg_latch_edge (graph_edge *);
4087 void build_vertices (hash_set<slp_tree> &, slp_tree);
4088 void build_vertices ();
4089 void build_graph ();
4091 /* Partitioning. */
4092 void create_partitions ();
4093 template<typename T> void for_each_partition_edge (unsigned int, T);
4095 /* Layout selection. */
4096 bool is_compatible_layout (slp_tree, unsigned int);
4097 int change_layout_cost (slp_tree, unsigned int, unsigned int);
4098 slpg_partition_layout_costs &partition_layout_costs (unsigned int,
4099 unsigned int);
4100 void change_vec_perm_layout (slp_tree, lane_permutation_t &,
4101 int, unsigned int);
4102 int internal_node_cost (slp_tree, int, unsigned int);
4103 void start_choosing_layouts ();
4105 /* Cost propagation. */
4106 slpg_layout_cost edge_layout_cost (graph_edge *, unsigned int,
4107 unsigned int, unsigned int);
4108 slpg_layout_cost total_in_cost (unsigned int);
4109 slpg_layout_cost forward_cost (graph_edge *, unsigned int, unsigned int);
4110 slpg_layout_cost backward_cost (graph_edge *, unsigned int, unsigned int);
4111 void forward_pass ();
4112 void backward_pass ();
4114 /* Rematerialization. */
4115 slp_tree get_result_with_layout (slp_tree, unsigned int);
4116 void materialize ();
4118 /* Clean-up. */
4119 void remove_redundant_permutations ();
4121 void dump ();
4123 vec_info *m_vinfo;
4125 /* True if we should optimize the graph for size, false if we should
4126 optimize it for speed. (It wouldn't be easy to make this decision
4127 more locally.) */
4128 bool m_optimize_size;
4130 /* A graph of all SLP nodes, with edges leading from uses to definitions.
4131 In other words, a node's predecessors are its slp_tree parents and
4132 a node's successors are its slp_tree children. */
4133 graph *m_slpg = nullptr;
4135 /* The vertices of M_SLPG, indexed by slp_tree::vertex. */
4136 auto_vec<slpg_vertex> m_vertices;
4138 /* The list of all leaves of M_SLPG, such as external definitions, constants,
4139 and loads. */
4140 auto_vec<int> m_leafs;
4142 /* This array has one entry for every vector layout that we're considering.
4143 Element 0 is null and indicates "no change". Other entries describe
4144 permutations that are inherent in the current graph and that we would
4145 like to reverse if possible.
4147 For example, a permutation { 1, 2, 3, 0 } means that something has
4148 effectively been permuted in that way, such as a load group
4149 { a[1], a[2], a[3], a[0] } (viewed as a permutation of a[0:3]).
4150 We'd then like to apply the reverse permutation { 3, 0, 1, 2 }
4151 in order to put things "back" in order. */
4152 auto_vec<vec<unsigned> > m_perms;
4154 /* A partitioning of the nodes for which a layout must be chosen.
4155 Each partition represents an <SCC, cfg loop> pair; that is,
4156 nodes in different SCCs belong to different partitions, and nodes
4157 within an SCC can be further partitioned according to a containing
4158 cfg loop. Partition <SCC1, L1> comes before <SCC2, L2> if:
4160 - SCC1 != SCC2 and SCC1 is a predecessor of SCC2 in a forward walk
4161 from leaves (such as loads) to roots (such as stores).
4163 - SCC1 == SCC2 and L1's header strictly dominates L2's header. */
4164 auto_vec<slpg_partition_info> m_partitions;
4166 /* The list of all nodes for which a layout must be chosen. Nodes for
4167 partition P come before the nodes for partition P+1. Nodes within a
4168 partition are in reverse postorder. */
4169 auto_vec<unsigned int> m_partitioned_nodes;
4171 /* Index P * num-layouts + L contains the cost of using layout L
4172 for partition P. */
4173 auto_vec<slpg_partition_layout_costs> m_partition_layout_costs;
4175 /* Index N * num-layouts + L, if nonnull, is a node that provides the
4176 original output of node N adjusted to have layout L. */
4177 auto_vec<slp_tree> m_node_layouts;
4180 /* Fill the vertices and leafs vector with all nodes in the SLP graph.
4181 Also record whether we should optimize anything for speed rather
4182 than size. */
4184 void
4185 vect_optimize_slp_pass::build_vertices (hash_set<slp_tree> &visited,
4186 slp_tree node)
4188 unsigned i;
4189 slp_tree child;
4191 if (visited.add (node))
4192 return;
4194 if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
4196 basic_block bb = gimple_bb (vect_orig_stmt (rep)->stmt);
4197 if (optimize_bb_for_speed_p (bb))
4198 m_optimize_size = false;
4201 node->vertex = m_vertices.length ();
4202 m_vertices.safe_push (slpg_vertex (node));
4204 bool leaf = true;
4205 bool force_leaf = false;
4206 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4207 if (child)
4209 leaf = false;
4210 build_vertices (visited, child);
4212 else
4213 force_leaf = true;
4214 /* Since SLP discovery works along use-def edges, all cycles have an
4215 entry - but there's the exception of cycles where we do not handle
4216 the entry explicitly (but with a NULL SLP node), like some reductions
4217 and inductions. Force those SLP PHIs to act as leafs to make them
4218 backwards reachable. */
4219 if (leaf || force_leaf)
4220 m_leafs.safe_push (node->vertex);
4223 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
4225 void
4226 vect_optimize_slp_pass::build_vertices ()
4228 hash_set<slp_tree> visited;
4229 unsigned i;
4230 slp_instance instance;
4231 FOR_EACH_VEC_ELT (m_vinfo->slp_instances, i, instance)
4232 build_vertices (visited, SLP_INSTANCE_TREE (instance));
4235 /* Apply (reverse) bijective PERM to VEC. */
4237 template <class T>
4238 static void
4239 vect_slp_permute (vec<unsigned> perm,
4240 vec<T> &vec, bool reverse)
4242 auto_vec<T, 64> saved;
4243 saved.create (vec.length ());
4244 for (unsigned i = 0; i < vec.length (); ++i)
4245 saved.quick_push (vec[i]);
4247 if (reverse)
4249 for (unsigned i = 0; i < vec.length (); ++i)
4250 vec[perm[i]] = saved[i];
4251 for (unsigned i = 0; i < vec.length (); ++i)
4252 gcc_assert (vec[perm[i]] == saved[i]);
4254 else
4256 for (unsigned i = 0; i < vec.length (); ++i)
4257 vec[i] = saved[perm[i]];
4258 for (unsigned i = 0; i < vec.length (); ++i)
4259 gcc_assert (vec[i] == saved[perm[i]]);
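/* Worked example (illustrative, not from the original source): with
PERM = { 1, 2, 3, 0 } and VEC = { a, b, c, d }, the forward direction
(REVERSE == false) computes vec[i] = saved[perm[i]] and yields
{ b, c, d, a }, while REVERSE == true computes vec[perm[i]] = saved[i]
and yields { d, a, b, c }. Applying one direction and then the other
restores the original order, since the two are inverse permutations. */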
4263 /* Return the cfg loop that contains NODE. */
4265 struct loop *
4266 vect_optimize_slp_pass::containing_loop (slp_tree node)
4268 stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
4269 if (!rep)
4270 return ENTRY_BLOCK_PTR_FOR_FN (cfun)->loop_father;
4271 return gimple_bb (vect_orig_stmt (rep)->stmt)->loop_father;
4274 /* Return true if UD (an edge from a use to a definition) is associated
4275 with a loop latch edge in the cfg. */
4277 bool
4278 vect_optimize_slp_pass::is_cfg_latch_edge (graph_edge *ud)
4280 slp_tree use = m_vertices[ud->src].node;
4281 slp_tree def = m_vertices[ud->dest].node;
4282 if (SLP_TREE_DEF_TYPE (use) != vect_internal_def
4283 || SLP_TREE_DEF_TYPE (def) != vect_internal_def)
4284 return false;
4286 stmt_vec_info use_rep = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (use));
4287 return (is_a<gphi *> (use_rep->stmt)
4288 && bb_loop_header_p (gimple_bb (use_rep->stmt))
4289 && containing_loop (def) == containing_loop (use));
4292 /* Build the graph. Mark edges that correspond to cfg loop latch edges with
4293 a nonnull data field. */
4295 void
4296 vect_optimize_slp_pass::build_graph ()
4298 m_optimize_size = true;
4299 build_vertices ();
4301 m_slpg = new_graph (m_vertices.length ());
4302 for (slpg_vertex &v : m_vertices)
4303 for (slp_tree child : SLP_TREE_CHILDREN (v.node))
4304 if (child)
4306 graph_edge *ud = add_edge (m_slpg, v.node->vertex, child->vertex);
4307 if (is_cfg_latch_edge (ud))
4308 ud->data = this;
4312 /* Return true if E corresponds to a loop latch edge in the cfg. */
4314 static bool
4315 skip_cfg_latch_edges (graph_edge *e)
4317 return e->data;
4320 /* Create the node partitions. */
4322 void
4323 vect_optimize_slp_pass::create_partitions ()
4325 /* Calculate a postorder of the graph, ignoring edges that correspond
4326 to natural latch edges in the cfg. Reading the vector from the end
4327 to the beginning gives the reverse postorder. */
4328 auto_vec<int> initial_rpo;
4329 graphds_dfs (m_slpg, &m_leafs[0], m_leafs.length (), &initial_rpo,
4330 false, NULL, skip_cfg_latch_edges);
4331 gcc_assert (initial_rpo.length () == m_vertices.length ());
4333 /* Calculate the strongly connected components of the graph. */
4334 auto_vec<int> scc_grouping;
4335 unsigned int num_sccs = graphds_scc (m_slpg, NULL, NULL, &scc_grouping);
4337 /* Create a new index order in which all nodes from the same SCC are
4338 consecutive. Use scc_pos to record the index of the first node in
4339 each SCC. */
4340 auto_vec<unsigned int> scc_pos (num_sccs);
4341 int last_component = -1;
4342 unsigned int node_count = 0;
4343 for (unsigned int node_i : scc_grouping)
4345 if (last_component != m_slpg->vertices[node_i].component)
4347 last_component = m_slpg->vertices[node_i].component;
4348 gcc_assert (last_component == int (scc_pos.length ()));
4349 scc_pos.quick_push (node_count);
4351 node_count += 1;
4353 gcc_assert (node_count == initial_rpo.length ()
4354 && last_component + 1 == int (num_sccs));
4356 /* Use m_partitioned_nodes to group nodes into SCC order, with the nodes
4357 inside each SCC following the RPO we calculated above. The fact that
4358 we ignored natural latch edges when calculating the RPO should ensure
4359 that, for natural loop nests:
4361 - the first node that we encounter in a cfg loop is the loop header phi
4362 - the loop header phis are in dominance order
4364 Arranging for this is an optimization (see below) rather than a
4365 correctness issue. Unnatural loops with a tangled mess of backedges
4366 will still work correctly, but might give poorer results.
4368 Also update scc_pos so that it gives 1 + the index of the last node
4369 in the SCC. */
4370 m_partitioned_nodes.safe_grow (node_count);
4371 for (unsigned int old_i = initial_rpo.length (); old_i-- > 0;)
4373 unsigned int node_i = initial_rpo[old_i];
4374 unsigned int new_i = scc_pos[m_slpg->vertices[node_i].component]++;
4375 m_partitioned_nodes[new_i] = node_i;
4378 /* When optimizing for speed, partition each SCC based on the containing
4379 cfg loop. The order we constructed above should ensure that, for natural
4380 cfg loops, we'll create sub-SCC partitions for outer loops before
4381 the corresponding sub-SCC partitions for inner loops. Similarly,
4382 when one sibling loop A dominates another sibling loop B, we should
4383 create a sub-SCC partition for A before a sub-SCC partition for B.
4385 As above, nothing depends for correctness on whether this achieves
4386 a natural nesting, but we should get better results when it does. */
4387 m_partitions.reserve (m_vertices.length ());
4388 unsigned int next_partition_i = 0;
4389 hash_map<struct loop *, int> loop_partitions;
4390 unsigned int rpo_begin = 0;
4391 unsigned int num_partitioned_nodes = 0;
4392 for (unsigned int rpo_end : scc_pos)
4394 loop_partitions.empty ();
4395 unsigned int partition_i = next_partition_i;
4396 for (unsigned int rpo_i = rpo_begin; rpo_i < rpo_end; ++rpo_i)
4398 /* Handle externals and constants optimistically throughout.
4399 But treat existing vectors as fixed since we do not handle
4400 permuting them. */
4401 unsigned int node_i = m_partitioned_nodes[rpo_i];
4402 auto &vertex = m_vertices[node_i];
4403 if ((SLP_TREE_DEF_TYPE (vertex.node) == vect_external_def
4404 && !SLP_TREE_VEC_DEFS (vertex.node).exists ())
4405 || SLP_TREE_DEF_TYPE (vertex.node) == vect_constant_def)
4406 vertex.partition = -1;
4407 else
4409 bool existed;
4410 if (m_optimize_size)
4411 existed = next_partition_i > partition_i;
4412 else
4414 struct loop *loop = containing_loop (vertex.node);
4415 auto &entry = loop_partitions.get_or_insert (loop, &existed);
4416 if (!existed)
4417 entry = next_partition_i;
4418 partition_i = entry;
4420 if (!existed)
4422 m_partitions.quick_push (slpg_partition_info ());
4423 next_partition_i += 1;
4425 vertex.partition = partition_i;
4426 num_partitioned_nodes += 1;
4427 m_partitions[partition_i].node_end += 1;
4430 rpo_begin = rpo_end;
4433 /* Assign ranges of consecutive node indices to each partition,
4434 in partition order. Start with node_end being the same as
4435 node_begin so that the next loop can use it as a counter. */
4436 unsigned int node_begin = 0;
4437 for (auto &partition : m_partitions)
4439 partition.node_begin = node_begin;
4440 node_begin += partition.node_end;
4441 partition.node_end = partition.node_begin;
4443 gcc_assert (node_begin == num_partitioned_nodes);
4445 /* Finally build the list of nodes in partition order. */
4446 m_partitioned_nodes.truncate (num_partitioned_nodes);
4447 for (unsigned int node_i = 0; node_i < m_vertices.length (); ++node_i)
4449 int partition_i = m_vertices[node_i].partition;
4450 if (partition_i >= 0)
4452 unsigned int order_i = m_partitions[partition_i].node_end++;
4453 m_partitioned_nodes[order_i] = node_i;
4458 /* Look for edges from earlier partitions into node NODE_I and edges from
4459 node NODE_I into later partitions. Call:
4461 FN (ud, other_node_i)
4463 for each such use-to-def edge ud, where other_node_i is the node at the
4464 other end of the edge. */
4466 template<typename T>
4467 void
4468 vect_optimize_slp_pass::for_each_partition_edge (unsigned int node_i, T fn)
4470 int partition_i = m_vertices[node_i].partition;
4471 for (graph_edge *pred = m_slpg->vertices[node_i].pred;
4472 pred; pred = pred->pred_next)
4474 int src_partition_i = m_vertices[pred->src].partition;
4475 if (src_partition_i >= 0 && src_partition_i != partition_i)
4476 fn (pred, pred->src);
4478 for (graph_edge *succ = m_slpg->vertices[node_i].succ;
4479 succ; succ = succ->succ_next)
4481 int dest_partition_i = m_vertices[succ->dest].partition;
4482 if (dest_partition_i >= 0 && dest_partition_i != partition_i)
4483 fn (succ, succ->dest);
4487 /* Return true if layout LAYOUT_I is compatible with the number of SLP lanes
4488 that NODE would operate on. This test is independent of NODE's actual
4489 operation. */
4491 bool
4492 vect_optimize_slp_pass::is_compatible_layout (slp_tree node,
4493 unsigned int layout_i)
4495 if (layout_i == 0)
4496 return true;
4498 if (SLP_TREE_LANES (node) != m_perms[layout_i].length ())
4499 return false;
4501 return true;
4504 /* Return the cost (in arbitrary units) of going from layout FROM_LAYOUT_I
4505 to layout TO_LAYOUT_I for a node like NODE. Return -1 if either of the
4506 layouts is incompatible with NODE or if the change is not possible for
4507 some other reason.
4509 The properties taken from NODE include the number of lanes and the
4510 vector type. The actual operation doesn't matter. */
4513 vect_optimize_slp_pass::change_layout_cost (slp_tree node,
4514 unsigned int from_layout_i,
4515 unsigned int to_layout_i)
4517 if (!is_compatible_layout (node, from_layout_i)
4518 || !is_compatible_layout (node, to_layout_i))
4519 return -1;
4521 if (from_layout_i == to_layout_i)
4522 return 0;
4524 auto_vec<slp_tree, 1> children (1);
4525 children.quick_push (node);
4526 auto_lane_permutation_t perm (SLP_TREE_LANES (node));
4527 if (from_layout_i > 0)
4528 for (unsigned int i : m_perms[from_layout_i])
4529 perm.quick_push ({ 0, i });
4530 else
4531 for (unsigned int i = 0; i < SLP_TREE_LANES (node); ++i)
4532 perm.quick_push ({ 0, i });
4533 if (to_layout_i > 0)
4534 vect_slp_permute (m_perms[to_layout_i], perm, true);
4535 auto count = vectorizable_slp_permutation_1 (m_vinfo, nullptr, node, perm,
4536 children, false);
4537 if (count >= 0)
4538 return MAX (count, 1);
4540 /* ??? In principle we could try changing via layout 0, giving two
4541 layout changes rather than 1. Doing that would require
4542 corresponding support in get_result_with_layout. */
4543 return -1;
4546 /* Return the costs of assigning layout LAYOUT_I to partition PARTITION_I. */
4548 inline slpg_partition_layout_costs &
4549 vect_optimize_slp_pass::partition_layout_costs (unsigned int partition_i,
4550 unsigned int layout_i)
4552 return m_partition_layout_costs[partition_i * m_perms.length () + layout_i];
4555 /* Change PERM in one of two ways:
4557 - if IN_LAYOUT_I < 0, accept input operand I in the layout that has been
4558 chosen for child I of NODE.
4560 - if IN_LAYOUT_I >= 0, accept all input operands with that layout.
4562 In both cases, arrange for the output to have layout OUT_LAYOUT_I. */
4564 void
4565 vect_optimize_slp_pass::
4566 change_vec_perm_layout (slp_tree node, lane_permutation_t &perm,
4567 int in_layout_i, unsigned int out_layout_i)
4569 for (auto &entry : perm)
4571 int this_in_layout_i = in_layout_i;
4572 if (this_in_layout_i < 0)
4574 slp_tree in_node = SLP_TREE_CHILDREN (node)[entry.first];
4575 unsigned int in_partition_i = m_vertices[in_node->vertex].partition;
4576 this_in_layout_i = m_partitions[in_partition_i].layout;
4578 if (this_in_layout_i > 0)
4579 entry.second = m_perms[this_in_layout_i][entry.second];
4581 if (out_layout_i > 0)
4582 vect_slp_permute (m_perms[out_layout_i], perm, true);
4585 /* Check whether the target allows NODE to be rearranged so that the node's
4586 output has layout OUT_LAYOUT_I. Return the cost of the change if so,
4587 in the same arbitrary units as for change_layout_cost. Return -1 otherwise.
4589 If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I < 0, also check whether
4590 NODE can adapt to the layout changes that have (perhaps provisionally)
4591 been chosen for NODE's children, so that no extra permutations are
4592 needed on either the input or the output of NODE.
4594 If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I >= 0, instead assume
4595 that all inputs will be forced into layout IN_LAYOUT_I beforehand.
4597 IN_LAYOUT_I has no meaning for other types of node.
4599 Keeping the node as-is is always valid. If the target doesn't appear
4600 to support the node as-is, but might realistically support other layouts,
4601 then layout 0 instead has the cost of a worst-case permutation. On the
4602 one hand, this ensures that every node has at least one valid layout,
4603 avoiding what would otherwise be an awkward special case. On the other,
4604 it still encourages the pass to change an invalid pre-existing layout
4605 choice into a valid one. */
4608 vect_optimize_slp_pass::internal_node_cost (slp_tree node, int in_layout_i,
4609 unsigned int out_layout_i)
4611 const int fallback_cost = 1;
4613 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
4615 auto_lane_permutation_t tmp_perm;
4616 tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
4618 /* Check that the child nodes support the chosen layout. Checking
4619 the first child is enough, since any second child would have the
4620 same shape. */
4621 auto first_child = SLP_TREE_CHILDREN (node)[0];
4622 if (in_layout_i > 0
4623 && !is_compatible_layout (first_child, in_layout_i))
4624 return -1;
4626 change_vec_perm_layout (node, tmp_perm, in_layout_i, out_layout_i);
4627 int count = vectorizable_slp_permutation_1 (m_vinfo, nullptr,
4628 node, tmp_perm,
4629 SLP_TREE_CHILDREN (node),
4630 false);
4631 if (count < 0)
4633 if (in_layout_i == 0 && out_layout_i == 0)
4635 /* Use the fallback cost if the node could in principle support
4636 some nonzero layout for both the inputs and the outputs.
4637 Otherwise assume that the node will be rejected later
4638 and rebuilt from scalars. */
4639 if (SLP_TREE_LANES (node) == SLP_TREE_LANES (first_child))
4640 return fallback_cost;
4641 return 0;
4643 return -1;
4646 /* We currently have no way of telling whether the new layout is cheaper
4647 or more expensive than the old one. But at least in principle,
4648 it should be worth making zero permutations (whole-vector shuffles)
4649 cheaper than real permutations, in case the pass is able to remove
4650 the latter. */
4651 return count == 0 ? 0 : 1;
4654 stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
4655 if (rep
4656 && STMT_VINFO_DATA_REF (rep)
4657 && DR_IS_READ (STMT_VINFO_DATA_REF (rep))
4658 && SLP_TREE_LOAD_PERMUTATION (node).exists ())
4660 auto_load_permutation_t tmp_perm;
4661 tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
4662 if (out_layout_i > 0)
4663 vect_slp_permute (m_perms[out_layout_i], tmp_perm, true);
4665 poly_uint64 vf = 1;
4666 if (auto loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo))
4667 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
4668 unsigned int n_perms;
4669 if (!vect_transform_slp_perm_load_1 (m_vinfo, node, tmp_perm, vNULL,
4670 nullptr, vf, true, false, &n_perms))
4672 auto rep = SLP_TREE_REPRESENTATIVE (node);
4673 if (out_layout_i == 0)
4675 /* Use the fallback cost if the load is an N-to-N permutation.
4676 Otherwise assume that the node will be rejected later
4677 and rebuilt from scalars. */
4678 if (STMT_VINFO_GROUPED_ACCESS (rep)
4679 && (DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (rep))
4680 == SLP_TREE_LANES (node)))
4681 return fallback_cost;
4682 return 0;
4684 return -1;
4687 /* See the comment above the corresponding VEC_PERM_EXPR handling. */
4688 return n_perms == 0 ? 0 : 1;
4691 return 0;
4694 /* Decide which element layouts we should consider using. Calculate the
4695 weights associated with inserting layout changes on partition edges.
4696 Also mark partitions that cannot change layout, by setting their
4697 layout to zero. */
4699 void
4700 vect_optimize_slp_pass::start_choosing_layouts ()
4702 /* Used to assign unique permutation indices. */
4703 using perm_hash = unbounded_hashmap_traits<
4704 vec_free_hash_base<int_hash_base<unsigned>>,
4705 int_hash<int, -1, -2>
4707 hash_map<vec<unsigned>, int, perm_hash> layout_ids;
4709 /* Layout 0 is "no change". */
4710 m_perms.safe_push (vNULL);
4712 /* Create layouts from existing permutations. */
4713 auto_load_permutation_t tmp_perm;
4714 for (unsigned int node_i : m_partitioned_nodes)
4716 /* Leafs also double as entries to the reverse graph. Allow the
4717 layout of those to be changed. */
4718 auto &vertex = m_vertices[node_i];
4719 auto &partition = m_partitions[vertex.partition];
4720 if (!m_slpg->vertices[node_i].succ)
4721 partition.layout = 0;
4723 /* Loads and VEC_PERM_EXPRs are the only things generating permutes. */
4724 slp_tree node = vertex.node;
4725 stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node);
4726 slp_tree child;
4727 unsigned HOST_WIDE_INT imin, imax = 0;
4728 bool any_permute = false;
4729 tmp_perm.truncate (0);
4730 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
4732 /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the node
4733 unpermuted, record a layout that reverses this permutation.
4735 We would need more work to cope with loads that are internally
4736 permuted and also have inputs (such as masks for
4737 IFN_MASK_LOADs). */
4738 gcc_assert (partition.layout == 0 && !m_slpg->vertices[node_i].succ);
4739 if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
4741 partition.layout = -1;
4742 continue;
4744 dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt);
4745 imin = DR_GROUP_SIZE (dr_stmt) + 1;
4746 tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
4748 else if (SLP_TREE_CODE (node) == VEC_PERM_EXPR
4749 && SLP_TREE_CHILDREN (node).length () == 1
4750 && (child = SLP_TREE_CHILDREN (node)[0])
4751 && (TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (child))
4752 .is_constant (&imin)))
4754 /* If the child has the same vector size as this node,
4755 reversing the permutation can make the permutation a no-op.
4756 In other cases it can change a true permutation into a
4757 full-vector extract. */
4758 tmp_perm.reserve (SLP_TREE_LANES (node));
4759 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4760 tmp_perm.quick_push (SLP_TREE_LANE_PERMUTATION (node)[j].second);
4762 else
4763 continue;
4765 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4767 unsigned idx = tmp_perm[j];
4768 imin = MIN (imin, idx);
4769 imax = MAX (imax, idx);
4770 if (idx - tmp_perm[0] != j)
4771 any_permute = true;
4773 /* If the span doesn't match we'd disrupt VF computation, avoid
4774 that for now. */
4775 if (imax - imin + 1 != SLP_TREE_LANES (node))
4776 continue;
4777 /* If there's no permute no need to split one out. In this case
4778 we can consider turning a load into a permuted load, if that
4779 turns out to be cheaper than alternatives. */
4780 if (!any_permute)
4782 partition.layout = -1;
4783 continue;
4786 /* For now only handle true permutes, like
4787 vect_attempt_slp_rearrange_stmts did. This allows us to be lazy
4788 when permuting constants and invariants keeping the permute
4789 bijective. */
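/* For instance (illustrative values only): tmp_perm = { 5, 7, 4, 6 } with
imin == 4 covers each offset 0..3 exactly once and is accepted, whereas
tmp_perm = { 4, 4, 5, 7 } leaves offset 2 uncovered, so the bitmap check
below skips it and no layout is recorded for the node. */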
4790 auto_sbitmap load_index (SLP_TREE_LANES (node));
4791 bitmap_clear (load_index);
4792 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4793 bitmap_set_bit (load_index, tmp_perm[j] - imin);
4794 unsigned j;
4795 for (j = 0; j < SLP_TREE_LANES (node); ++j)
4796 if (!bitmap_bit_p (load_index, j))
4797 break;
4798 if (j != SLP_TREE_LANES (node))
4799 continue;
4801 vec<unsigned> perm = vNULL;
4802 perm.safe_grow (SLP_TREE_LANES (node), true);
4803 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4804 perm[j] = tmp_perm[j] - imin;
4806 if (int (m_perms.length ()) >= param_vect_max_layout_candidates)
4808 /* Continue to use existing layouts, but don't add any more. */
4809 int *entry = layout_ids.get (perm);
4810 partition.layout = entry ? *entry : 0;
4811 perm.release ();
4813 else
4815 bool existed;
4816 int &layout_i = layout_ids.get_or_insert (perm, &existed);
4817 if (existed)
4818 perm.release ();
4819 else
4821 layout_i = m_perms.length ();
4822 m_perms.safe_push (perm);
4824 partition.layout = layout_i;
4828 /* Initially assume that every layout is possible and has zero cost
4829 in every partition. */
4830 m_partition_layout_costs.safe_grow_cleared (m_partitions.length ()
4831 * m_perms.length ());
4833 /* We have to mark outgoing permutations that face non-associating-reduction
4834 graph entries (which are not themselves represented) as needing to be
4835 materialized. slp_inst_kind_bb_reduc currently only covers associatable reductions. */
4836 for (slp_instance instance : m_vinfo->slp_instances)
4837 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor)
4839 unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex;
4840 m_partitions[m_vertices[node_i].partition].layout = 0;
4842 else if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_reduc_chain)
4844 stmt_vec_info stmt_info
4845 = SLP_TREE_REPRESENTATIVE (SLP_INSTANCE_TREE (instance));
4846 stmt_vec_info reduc_info = info_for_reduction (m_vinfo, stmt_info);
4847 if (needs_fold_left_reduction_p (TREE_TYPE
4848 (gimple_get_lhs (stmt_info->stmt)),
4849 STMT_VINFO_REDUC_CODE (reduc_info)))
4851 unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex;
4852 m_partitions[m_vertices[node_i].partition].layout = 0;
4856 /* Check which layouts each node and partition can handle. Calculate the
4857 weights associated with inserting layout changes on edges. */
4858 for (unsigned int node_i : m_partitioned_nodes)
4860 auto &vertex = m_vertices[node_i];
4861 auto &partition = m_partitions[vertex.partition];
4862 slp_tree node = vertex.node;
4864 if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
4866 vertex.weight = vect_slp_node_weight (node);
4868 /* We do not handle stores with a permutation, so all
4869 incoming permutations must have been materialized.
4871 We also don't handle masked grouped loads, which lack a
4872 permutation vector. In this case the memory locations
4873 form an implicit second input to the loads, on top of the
4874 explicit mask input, and the memory input's layout cannot
4875 be changed.
4877 On the other hand, we do support permuting gather loads and
4878 masked gather loads, where each scalar load is independent
4879 of the others. This can be useful if the address/index input
4880 benefits from permutation. */
4881 if (STMT_VINFO_DATA_REF (rep)
4882 && STMT_VINFO_GROUPED_ACCESS (rep)
4883 && !SLP_TREE_LOAD_PERMUTATION (node).exists ())
4884 partition.layout = 0;
4886 /* We cannot change the layout of an operation that does not
4887 operate independently on lanes. Note this is an explicit
4888 negative list since that's much shorter than the respective
4889 positive one but it's critical to keep maintaining it. */
4890 if (is_gimple_call (STMT_VINFO_STMT (rep)))
4891 switch (gimple_call_combined_fn (STMT_VINFO_STMT (rep)))
4893 case CFN_COMPLEX_ADD_ROT90:
4894 case CFN_COMPLEX_ADD_ROT270:
4895 case CFN_COMPLEX_MUL:
4896 case CFN_COMPLEX_MUL_CONJ:
4897 case CFN_VEC_ADDSUB:
4898 case CFN_VEC_FMADDSUB:
4899 case CFN_VEC_FMSUBADD:
4900 partition.layout = 0;
4901 default:;
4905 auto process_edge = [&](graph_edge *ud, unsigned int other_node_i)
4907 auto &other_vertex = m_vertices[other_node_i];
4909 /* Count the number of edges from earlier partitions and the number
4910 of edges to later partitions. */
4911 if (other_vertex.partition < vertex.partition)
4912 partition.in_degree += 1;
4913 else
4914 partition.out_degree += 1;
4916 /* If the current node uses the result of OTHER_NODE_I, accumulate
4917 the effects of that. */
4918 if (ud->src == int (node_i))
4920 other_vertex.out_weight += vertex.weight;
4921 other_vertex.out_degree += 1;
4924 for_each_partition_edge (node_i, process_edge);
4928 /* Return the incoming costs for node NODE_I, assuming that each input keeps
4929 its current (provisional) choice of layout. The inputs do not necessarily
4930 have the same layout as each other. */
4932 slpg_layout_cost
4933 vect_optimize_slp_pass::total_in_cost (unsigned int node_i)
4935 auto &vertex = m_vertices[node_i];
4936 slpg_layout_cost cost;
4937 auto add_cost = [&](graph_edge *, unsigned int other_node_i)
4939 auto &other_vertex = m_vertices[other_node_i];
4940 if (other_vertex.partition < vertex.partition)
4942 auto &other_partition = m_partitions[other_vertex.partition];
4943 auto &other_costs = partition_layout_costs (other_vertex.partition,
4944 other_partition.layout);
4945 slpg_layout_cost this_cost = other_costs.in_cost;
4946 this_cost.add_serial_cost (other_costs.internal_cost);
4947 this_cost.split (other_partition.out_degree);
4948 cost.add_parallel_cost (this_cost);
4951 for_each_partition_edge (node_i, add_cost);
4952 return cost;
4955 /* Return the cost of switching between layout LAYOUT1_I (at node NODE1_I)
4956 and layout LAYOUT2_I on cross-partition use-to-def edge UD. Return
4957 slpg_layout_cost::impossible () if the change isn't possible. */
4959 slpg_layout_cost
4960 vect_optimize_slp_pass::
4961 edge_layout_cost (graph_edge *ud, unsigned int node1_i, unsigned int layout1_i,
4962 unsigned int layout2_i)
4964 auto &def_vertex = m_vertices[ud->dest];
4965 auto &use_vertex = m_vertices[ud->src];
4966 auto def_layout_i = ud->dest == int (node1_i) ? layout1_i : layout2_i;
4967 auto use_layout_i = ud->dest == int (node1_i) ? layout2_i : layout1_i;
4968 auto factor = change_layout_cost (def_vertex.node, def_layout_i,
4969 use_layout_i);
4970 if (factor < 0)
4971 return slpg_layout_cost::impossible ();
4973 /* We have a choice of putting the layout change at the site of the
4974 definition or at the site of the use. Prefer the former when
4975 optimizing for size or when the execution frequency of the
4976 definition is no greater than the combined execution frequencies of
4977 the uses. When putting the layout change at the site of the definition,
4978 divvy up the cost among all consumers. */
4979 if (m_optimize_size || def_vertex.weight <= def_vertex.out_weight)
4981 slpg_layout_cost cost = { def_vertex.weight * factor, m_optimize_size };
4982 cost.split (def_vertex.out_degree);
4983 return cost;
4985 return { use_vertex.weight * factor, m_optimize_size };
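/* A numeric sketch of the trade-off above, under assumed weights that are
not from the original source: if the definition sits in a loop with weight
10 and feeds consumers whose combined out_weight is 2, then when optimizing
for speed the layout change is charged at the use site as
use_vertex.weight * factor. If the definition's weight were instead 1
(no greater than its out_weight), the change would be charged at the
definition and split across its out_degree consumers. */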
4988 /* UD represents a use-def link between FROM_NODE_I and a node in a later
4989 partition; FROM_NODE_I could be the definition node or the use node.
4990 The node at the other end of the link wants to use layout TO_LAYOUT_I.
4991 Return the cost of any necessary fix-ups on edge UD, or return
4992 slpg_layout_cost::impossible () if the change isn't possible.
4994 At this point, FROM_NODE_I's partition has chosen the cheapest
4995 layout based on the information available so far, but this choice
4996 is only provisional. */
4998 slpg_layout_cost
4999 vect_optimize_slp_pass::forward_cost (graph_edge *ud, unsigned int from_node_i,
5000 unsigned int to_layout_i)
5002 auto &from_vertex = m_vertices[from_node_i];
5003 unsigned int from_partition_i = from_vertex.partition;
5004 slpg_partition_info &from_partition = m_partitions[from_partition_i];
5005 gcc_assert (from_partition.layout >= 0);
5007 /* First calculate the cost on the assumption that FROM_PARTITION sticks
5008 with its current layout preference. */
5009 slpg_layout_cost cost = slpg_layout_cost::impossible ();
5010 auto edge_cost = edge_layout_cost (ud, from_node_i,
5011 from_partition.layout, to_layout_i);
5012 if (edge_cost.is_possible ())
5014 auto &from_costs = partition_layout_costs (from_partition_i,
5015 from_partition.layout);
5016 cost = from_costs.in_cost;
5017 cost.add_serial_cost (from_costs.internal_cost);
5018 cost.split (from_partition.out_degree);
5019 cost.add_serial_cost (edge_cost);
5022 /* Take the minimum of that cost and the cost that applies if
5023 FROM_PARTITION instead switches to TO_LAYOUT_I. */
5024 auto &direct_layout_costs = partition_layout_costs (from_partition_i,
5025 to_layout_i);
5026 if (direct_layout_costs.is_possible ())
5028 slpg_layout_cost direct_cost = direct_layout_costs.in_cost;
5029 direct_cost.add_serial_cost (direct_layout_costs.internal_cost);
5030 direct_cost.split (from_partition.out_degree);
5031 if (!cost.is_possible ()
5032 || direct_cost.is_better_than (cost, m_optimize_size))
5033 cost = direct_cost;
5036 return cost;
5039 /* UD represents a use-def link between TO_NODE_I and a node in an earlier
5040 partition; TO_NODE_I could be the definition node or the use node.
5041 The node at the other end of the link wants to use layout FROM_LAYOUT_I;
5042 return the cost of any necessary fix-ups on edge UD, or
5043 slpg_layout_cost::impossible () if the choice cannot be made.
5045 At this point, TO_NODE_I's partition has a fixed choice of layout. */
5047 slpg_layout_cost
5048 vect_optimize_slp_pass::backward_cost (graph_edge *ud, unsigned int to_node_i,
5049 unsigned int from_layout_i)
5051 auto &to_vertex = m_vertices[to_node_i];
5052 unsigned int to_partition_i = to_vertex.partition;
5053 slpg_partition_info &to_partition = m_partitions[to_partition_i];
5054 gcc_assert (to_partition.layout >= 0);
5056 /* If TO_NODE_I is a VEC_PERM_EXPR consumer, see whether it can be
5057 adjusted for this input having layout FROM_LAYOUT_I. Assume that
5058 any other inputs keep their current choice of layout. */
5059 auto &to_costs = partition_layout_costs (to_partition_i,
5060 to_partition.layout);
5061 if (ud->src == int (to_node_i)
5062 && SLP_TREE_CODE (to_vertex.node) == VEC_PERM_EXPR)
5064 auto &from_partition = m_partitions[m_vertices[ud->dest].partition];
5065 auto old_layout = from_partition.layout;
5066 from_partition.layout = from_layout_i;
5067 int factor = internal_node_cost (to_vertex.node, -1,
5068 to_partition.layout);
5069 from_partition.layout = old_layout;
5070 if (factor >= 0)
5072 slpg_layout_cost cost = to_costs.out_cost;
5073 cost.add_serial_cost ({ to_vertex.weight * factor,
5074 m_optimize_size });
5075 cost.split (to_partition.in_degree);
5076 return cost;
5080 /* Compute the cost if we insert any necessary layout change on edge UD. */
5081 auto edge_cost = edge_layout_cost (ud, to_node_i,
5082 to_partition.layout, from_layout_i);
5083 if (edge_cost.is_possible ())
5085 slpg_layout_cost cost = to_costs.out_cost;
5086 cost.add_serial_cost (to_costs.internal_cost);
5087 cost.split (to_partition.in_degree);
5088 cost.add_serial_cost (edge_cost);
5089 return cost;
5092 return slpg_layout_cost::impossible ();
5095 /* Make a forward pass through the partitions, accumulating input costs.
5096 Make a tentative (provisional) choice of layout for each partition,
5097 ensuring that this choice still allows later partitions to keep
5098 their original layout. */
5100 void
5101 vect_optimize_slp_pass::forward_pass ()
5103 for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
5104 ++partition_i)
5106 auto &partition = m_partitions[partition_i];
5108 /* If the partition consists of a single VEC_PERM_EXPR, precompute
5109 the incoming cost that would apply if every predecessor partition
5110 keeps its current layout. This is used within the loop below. */
5111 slpg_layout_cost in_cost;
5112 slp_tree single_node = nullptr;
5113 if (partition.node_end == partition.node_begin + 1)
5115 unsigned int node_i = m_partitioned_nodes[partition.node_begin];
5116 single_node = m_vertices[node_i].node;
5117 if (SLP_TREE_CODE (single_node) == VEC_PERM_EXPR)
5118 in_cost = total_in_cost (node_i);
5121 /* Go through the possible layouts. Decide which ones are valid
5122 for this partition and record which of the valid layouts has
5123 the lowest cost. */
5124 unsigned int min_layout_i = 0;
5125 slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
5126 for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
5128 auto &layout_costs = partition_layout_costs (partition_i, layout_i);
5129 if (!layout_costs.is_possible ())
5130 continue;
5132 /* If the recorded layout is already 0 then the layout cannot
5133 change. */
5134 if (partition.layout == 0 && layout_i != 0)
5136 layout_costs.mark_impossible ();
5137 continue;
5140 bool is_possible = true;
5141 for (unsigned int order_i = partition.node_begin;
5142 order_i < partition.node_end; ++order_i)
5144 unsigned int node_i = m_partitioned_nodes[order_i];
5145 auto &vertex = m_vertices[node_i];
5147 /* Reject the layout if it is individually incompatible
5148 with any node in the partition. */
5149 if (!is_compatible_layout (vertex.node, layout_i))
5151 is_possible = false;
5152 break;
5155 auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
5157 auto &other_vertex = m_vertices[other_node_i];
5158 if (other_vertex.partition < vertex.partition)
5160 /* Accumulate the incoming costs from earlier
5161 partitions, plus the cost of any layout changes
5162 on UD itself. */
5163 auto cost = forward_cost (ud, other_node_i, layout_i);
5164 if (!cost.is_possible ())
5165 is_possible = false;
5166 else
5167 layout_costs.in_cost.add_parallel_cost (cost);
5169 else
5170 /* Reject the layout if it would make layout 0 impossible
5171 for later partitions. This amounts to testing that the
5172 target supports reversing the layout change on edges
5173 to later partitions.
5175 In principle, it might be possible to push a layout
5176 change all the way down a graph, so that it never
5177 needs to be reversed and so that the target doesn't
5178 need to support the reverse operation. But it would
5179 be awkward to bail out if we hit a partition that
5180 does not support the new layout, especially since
5181 we are not dealing with a lattice. */
5182 is_possible &= edge_layout_cost (ud, other_node_i, 0,
5183 layout_i).is_possible ();
5185 for_each_partition_edge (node_i, add_cost);
5187 /* Accumulate the cost of using LAYOUT_I within NODE,
5188 both for the inputs and the outputs. */
5189 int factor = internal_node_cost (vertex.node, layout_i,
5190 layout_i);
5191 if (factor < 0)
5193 is_possible = false;
5194 break;
5196 else if (factor)
5197 layout_costs.internal_cost.add_serial_cost
5198 ({ vertex.weight * factor, m_optimize_size });
5200 if (!is_possible)
5202 layout_costs.mark_impossible ();
5203 continue;
5206 /* Combine the incoming and partition-internal costs. */
5207 slpg_layout_cost combined_cost = layout_costs.in_cost;
5208 combined_cost.add_serial_cost (layout_costs.internal_cost);
5210 /* If this partition consists of a single VEC_PERM_EXPR, see
5211 if the VEC_PERM_EXPR can be changed to support output layout
5212 LAYOUT_I while keeping all the provisional choices of input
5213 layout. */
5214 if (single_node
5215 && SLP_TREE_CODE (single_node) == VEC_PERM_EXPR)
5217 int factor = internal_node_cost (single_node, -1, layout_i);
5218 if (factor >= 0)
5220 auto weight = m_vertices[single_node->vertex].weight;
5221 slpg_layout_cost internal_cost
5222 = { weight * factor, m_optimize_size };
5224 slpg_layout_cost alt_cost = in_cost;
5225 alt_cost.add_serial_cost (internal_cost);
5226 if (alt_cost.is_better_than (combined_cost, m_optimize_size))
5228 combined_cost = alt_cost;
5229 layout_costs.in_cost = in_cost;
5230 layout_costs.internal_cost = internal_cost;
5235 /* Record the layout with the lowest cost. Prefer layout 0 in
5236 the event of a tie between it and another layout. */
5237 if (!min_layout_cost.is_possible ()
5238 || combined_cost.is_better_than (min_layout_cost,
5239 m_optimize_size))
5241 min_layout_i = layout_i;
5242 min_layout_cost = combined_cost;
5246 /* This loop's handling of earlier partitions should ensure that
5247 choosing the original layout for the current partition is no
5248 less valid than it was in the original graph, even with the
5249 provisional layout choices for those earlier partitions. */
5250 gcc_assert (min_layout_cost.is_possible ());
5251 partition.layout = min_layout_i;
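/* For intuition only (an illustration, not something the code relies on
   beyond what the comments above state): a "layout" is an entry of
   m_perms, i.e. a lane permutation such as { 2, 3, 0, 1 } for a four-lane
   partition, with layout 0 always denoting the original lane order.  The
   forward pass above therefore only needs, per partition and candidate
   layout, the cost of edges crossing into earlier partitions plus the
   internal cost of the partition's own nodes under that layout.  */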
5255 /* Make a backward pass through the partitions, accumulating output costs.
5256 Make a final choice of layout for each partition. */
5258 void
5259 vect_optimize_slp_pass::backward_pass ()
5261 for (unsigned int partition_i = m_partitions.length (); partition_i-- > 0;)
5263 auto &partition = m_partitions[partition_i];
5265 unsigned int min_layout_i = 0;
5266 slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
5267 for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
5269 auto &layout_costs = partition_layout_costs (partition_i, layout_i);
5270 if (!layout_costs.is_possible ())
5271 continue;
5273 /* Accumulate the costs from successor partitions. */
5274 bool is_possible = true;
5275 for (unsigned int order_i = partition.node_begin;
5276 order_i < partition.node_end; ++order_i)
5278 unsigned int node_i = m_partitioned_nodes[order_i];
5279 auto &vertex = m_vertices[node_i];
5280 auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
5282 auto &other_vertex = m_vertices[other_node_i];
5283 auto &other_partition = m_partitions[other_vertex.partition];
5284 if (other_vertex.partition > vertex.partition)
5286 /* Accumulate the incoming costs from later
5287 partitions, plus the cost of any layout changes
5288 on UD itself. */
5289 auto cost = backward_cost (ud, other_node_i, layout_i);
5290 if (!cost.is_possible ())
5291 is_possible = false;
5292 else
5293 layout_costs.out_cost.add_parallel_cost (cost);
5295 else
5296 /* Make sure that earlier partitions can (if necessary
5297 or beneficial) keep the layout that they chose in
5298 the forward pass. This ensures that there is at
5299 least one valid choice of layout. */
5300 is_possible &= edge_layout_cost (ud, other_node_i,
5301 other_partition.layout,
5302 layout_i).is_possible ();
5304 for_each_partition_edge (node_i, add_cost);
5306 if (!is_possible)
5308 layout_costs.mark_impossible ();
5309 continue;
5312 /* Locally combine the costs from the forward and backward passes.
5313 (This combined cost is not passed on, since that would lead
5314 to double counting.) */
5315 slpg_layout_cost combined_cost = layout_costs.in_cost;
5316 combined_cost.add_serial_cost (layout_costs.internal_cost);
5317 combined_cost.add_serial_cost (layout_costs.out_cost);
5319 /* Record the layout with the lowest cost. Prefer layout 0 in
5320 the event of a tie between it and another layout. */
5321 if (!min_layout_cost.is_possible ()
5322 || combined_cost.is_better_than (min_layout_cost,
5323 m_optimize_size))
5325 min_layout_i = layout_i;
5326 min_layout_cost = combined_cost;
5330 gcc_assert (min_layout_cost.is_possible ());
5331 partition.layout = min_layout_i;
5335 /* Return a node that applies layout TO_LAYOUT_I to the original form of NODE.
5336 NODE already has the layout that was selected for its partition. */
5338 slp_tree
5339 vect_optimize_slp_pass::get_result_with_layout (slp_tree node,
5340 unsigned int to_layout_i)
5342 unsigned int result_i = node->vertex * m_perms.length () + to_layout_i;
5343 slp_tree result = m_node_layouts[result_i];
5344 if (result)
5345 return result;
5347 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
5348 || (SLP_TREE_DEF_TYPE (node) == vect_external_def
5349 /* We can't permute vector defs in place. */
5350 && SLP_TREE_VEC_DEFS (node).is_empty ()))
5352 /* If the vector is uniform or unchanged, there's nothing to do. */
5353 if (to_layout_i == 0 || vect_slp_tree_uniform_p (node))
5354 result = node;
5355 else
5357 auto scalar_ops = SLP_TREE_SCALAR_OPS (node).copy ();
5358 result = vect_create_new_slp_node (scalar_ops);
5359 vect_slp_permute (m_perms[to_layout_i], scalar_ops, true);
5362 else
5364 unsigned int partition_i = m_vertices[node->vertex].partition;
5365 unsigned int from_layout_i = m_partitions[partition_i].layout;
5366 if (from_layout_i == to_layout_i)
5367 return node;
5369 /* If NODE is itself a VEC_PERM_EXPR, try to create a parallel
5370 permutation instead of a serial one. Leave the new permutation
5371 in TMP_PERM on success. */
5372 auto_lane_permutation_t tmp_perm;
5373 unsigned int num_inputs = 1;
5374 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5376 tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
5377 if (from_layout_i != 0)
5378 vect_slp_permute (m_perms[from_layout_i], tmp_perm, false);
5379 if (to_layout_i != 0)
5380 vect_slp_permute (m_perms[to_layout_i], tmp_perm, true);
5381 if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
5382 tmp_perm,
5383 SLP_TREE_CHILDREN (node),
5384 false) >= 0)
5385 num_inputs = SLP_TREE_CHILDREN (node).length ();
5386 else
5387 tmp_perm.truncate (0);
5390 if (dump_enabled_p ())
5392 if (tmp_perm.length () > 0)
5393 dump_printf_loc (MSG_NOTE, vect_location,
5394 "duplicating permutation node %p with"
5395 " layout %d\n",
5396 (void *) node, to_layout_i);
5397 else
5398 dump_printf_loc (MSG_NOTE, vect_location,
5399 "inserting permutation node in place of %p\n",
5400 (void *) node);
5403 unsigned int num_lanes = SLP_TREE_LANES (node);
5404 result = vect_create_new_slp_node (num_inputs, VEC_PERM_EXPR);
5405 if (SLP_TREE_SCALAR_STMTS (node).length ())
5407 auto &stmts = SLP_TREE_SCALAR_STMTS (result);
5408 stmts.safe_splice (SLP_TREE_SCALAR_STMTS (node));
5409 if (from_layout_i != 0)
5410 vect_slp_permute (m_perms[from_layout_i], stmts, false);
5411 if (to_layout_i != 0)
5412 vect_slp_permute (m_perms[to_layout_i], stmts, true);
5414 SLP_TREE_REPRESENTATIVE (result) = SLP_TREE_REPRESENTATIVE (node);
5415 SLP_TREE_LANES (result) = num_lanes;
5416 SLP_TREE_VECTYPE (result) = SLP_TREE_VECTYPE (node);
5417 result->vertex = -1;
5419 auto &lane_perm = SLP_TREE_LANE_PERMUTATION (result);
5420 if (tmp_perm.length ())
5422 lane_perm.safe_splice (tmp_perm);
5423 SLP_TREE_CHILDREN (result).safe_splice (SLP_TREE_CHILDREN (node));
5425 else
5427 lane_perm.create (num_lanes);
5428 for (unsigned j = 0; j < num_lanes; ++j)
5429 lane_perm.quick_push ({ 0, j });
5430 if (from_layout_i != 0)
5431 vect_slp_permute (m_perms[from_layout_i], lane_perm, false);
5432 if (to_layout_i != 0)
5433 vect_slp_permute (m_perms[to_layout_i], lane_perm, true);
5434 SLP_TREE_CHILDREN (result).safe_push (node);
5436 for (slp_tree child : SLP_TREE_CHILDREN (result))
5437 child->refcnt++;
5439 m_node_layouts[result_i] = result;
5440 return result;
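/* Illustrative example for the function above (hypothetical layouts, not
   taken from a testcase): if NODE's partition chose layout 1 == { 1, 0 }
   but a user needs layout 0, the function returns either a copy of NODE
   with the lane swap folded into its existing lane permutation (when NODE
   is itself a VEC_PERM_EXPR and the target can handle the combined
   permute) or a new single-input VEC_PERM_EXPR wrapping NODE that
   performs the swap.  The result is cached in m_node_layouts so that
   multiple users asking for the same layout share one node.  */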
5443 /* Apply the chosen vector layouts to the SLP graph. */
5445 void
5446 vect_optimize_slp_pass::materialize ()
5448 /* We no longer need the costs, so avoid having two O(N * P) arrays
5449 live at the same time. */
5450 m_partition_layout_costs.release ();
5451 m_node_layouts.safe_grow_cleared (m_vertices.length () * m_perms.length ());
5453 auto_sbitmap fully_folded (m_vertices.length ());
5454 bitmap_clear (fully_folded);
5455 for (unsigned int node_i : m_partitioned_nodes)
5457 auto &vertex = m_vertices[node_i];
5458 slp_tree node = vertex.node;
5459 int layout_i = m_partitions[vertex.partition].layout;
5460 gcc_assert (layout_i >= 0);
5462 /* Rearrange the scalar statements to match the chosen layout. */
5463 if (layout_i > 0)
5464 vect_slp_permute (m_perms[layout_i],
5465 SLP_TREE_SCALAR_STMTS (node), true);
5467 /* Update load and lane permutations. */
5468 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5470 /* First try to absorb the input vector layouts. If that fails,
5471 force the inputs to have layout LAYOUT_I too. We checked that
5472 that was possible before deciding to use nonzero output layouts.
5473 (Note that at this stage we don't really have any guarantee that
5474 the target supports the original VEC_PERM_EXPR.) */
5475 auto &perm = SLP_TREE_LANE_PERMUTATION (node);
5476 auto_lane_permutation_t tmp_perm;
5477 tmp_perm.safe_splice (perm);
5478 change_vec_perm_layout (node, tmp_perm, -1, layout_i);
5479 if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
5480 tmp_perm,
5481 SLP_TREE_CHILDREN (node),
5482 false) >= 0)
5484 if (dump_enabled_p ()
5485 && !std::equal (tmp_perm.begin (), tmp_perm.end (),
5486 perm.begin ()))
5487 dump_printf_loc (MSG_NOTE, vect_location,
5488 "absorbing input layouts into %p\n",
5489 (void *) node);
5490 std::copy (tmp_perm.begin (), tmp_perm.end (), perm.begin ());
5491 bitmap_set_bit (fully_folded, node_i);
5493 else
5495 /* Not MSG_MISSED because it would make no sense to users. */
5496 if (dump_enabled_p ())
5497 dump_printf_loc (MSG_NOTE, vect_location,
5498 "failed to absorb input layouts into %p\n",
5499 (void *) node);
5500 change_vec_perm_layout (nullptr, perm, layout_i, layout_i);
5503 else
5505 gcc_assert (!SLP_TREE_LANE_PERMUTATION (node).exists ());
5506 auto &load_perm = SLP_TREE_LOAD_PERMUTATION (node);
5507 if (layout_i > 0)
5508 /* ??? When we handle non-bijective permutes the idea
5509 is that we can force the load-permutation to be
5510 { min, min + 1, min + 2, ... max }. But then the
5511 scalar defs might no longer match the lane content
5512 which means wrong-code with live lane vectorization.
5513 So we possibly have to have NULL entries for those. */
5514 vect_slp_permute (m_perms[layout_i], load_perm, true);
5518 /* Do this before any nodes disappear, since it involves a walk
5519 over the leaves. */
5520 remove_redundant_permutations ();
5522 /* Replace each child with a correctly laid-out version. */
5523 for (unsigned int node_i : m_partitioned_nodes)
5525 /* Skip nodes that have already been handled above. */
5526 if (bitmap_bit_p (fully_folded, node_i))
5527 continue;
5529 auto &vertex = m_vertices[node_i];
5530 int in_layout_i = m_partitions[vertex.partition].layout;
5531 gcc_assert (in_layout_i >= 0);
5533 unsigned j;
5534 slp_tree child;
5535 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (vertex.node), j, child)
5537 if (!child)
5538 continue;
5540 slp_tree new_child = get_result_with_layout (child, in_layout_i);
5541 if (new_child != child)
5543 vect_free_slp_tree (child);
5544 SLP_TREE_CHILDREN (vertex.node)[j] = new_child;
5545 new_child->refcnt += 1;
5551 /* Elide load permutations that are not necessary. Such permutations might
5552 be pre-existing, rather than created by the layout optimizations. */
5554 void
5555 vect_optimize_slp_pass::remove_redundant_permutations ()
5557 for (unsigned int node_i : m_leafs)
5559 slp_tree node = m_vertices[node_i].node;
5560 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
5561 continue;
5563 /* In basic block vectorization we allow any subchain of an interleaving
5564 chain.
5565 FORNOW: not in loop SLP because of realignment complications. */
5566 if (is_a <bb_vec_info> (m_vinfo))
5568 bool subchain_p = true;
5569 stmt_vec_info next_load_info = NULL;
5570 stmt_vec_info load_info;
5571 unsigned j;
5572 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
5574 if (j != 0
5575 && (next_load_info != load_info
5576 || DR_GROUP_GAP (load_info) != 1))
5578 subchain_p = false;
5579 break;
5581 next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
5583 if (subchain_p)
5585 SLP_TREE_LOAD_PERMUTATION (node).release ();
5586 continue;
5589 else
5591 loop_vec_info loop_vinfo = as_a<loop_vec_info> (m_vinfo);
5592 stmt_vec_info load_info;
5593 bool this_load_permuted = false;
5594 unsigned j;
5595 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
5596 if (SLP_TREE_LOAD_PERMUTATION (node)[j] != j)
5598 this_load_permuted = true;
5599 break;
5601 /* When this isn't a grouped access we know it's single element
5602 and contiguous. */
5603 if (!STMT_VINFO_GROUPED_ACCESS (SLP_TREE_SCALAR_STMTS (node)[0]))
5605 if (!this_load_permuted
5606 && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
5607 || SLP_TREE_LANES (node) == 1))
5608 SLP_TREE_LOAD_PERMUTATION (node).release ();
5609 continue;
5611 stmt_vec_info first_stmt_info
5612 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
5613 if (!this_load_permuted
5614 /* The load requires permutation when unrolling exposes
5615 a gap either because the group is larger than the SLP
5616 group-size or because there is a gap between the groups. */
5617 && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
5618 || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
5619 && DR_GROUP_GAP (first_stmt_info) == 0)))
5621 SLP_TREE_LOAD_PERMUTATION (node).release ();
5622 continue;
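/* Illustrative example of the subchain case above: given an interleaving
   group a[0], a[1], a[2], a[3], a BB SLP node loading just { a[2], a[3] }
   carries load permutation { 2, 3 }.  Because the two loads are adjacent
   members of the group (gap 1), the permutation is released and the node
   is treated as a plain consecutive access.  */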
5628 /* Print the partition graph and layout information to the dump file. */
5630 void
5631 vect_optimize_slp_pass::dump ()
5633 dump_printf_loc (MSG_NOTE, vect_location,
5634 "SLP optimize permutations:\n");
5635 for (unsigned int layout_i = 1; layout_i < m_perms.length (); ++layout_i)
5637 dump_printf_loc (MSG_NOTE, vect_location, " %d: { ", layout_i);
5638 const char *sep = "";
5639 for (unsigned int idx : m_perms[layout_i])
5641 dump_printf (MSG_NOTE, "%s%d", sep, idx);
5642 sep = ", ";
5644 dump_printf (MSG_NOTE, " }\n");
5646 dump_printf_loc (MSG_NOTE, vect_location,
5647 "SLP optimize partitions:\n");
5648 for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
5649 ++partition_i)
5651 auto &partition = m_partitions[partition_i];
5652 dump_printf_loc (MSG_NOTE, vect_location, " -------------\n");
5653 dump_printf_loc (MSG_NOTE, vect_location,
5654 " partition %d (layout %d):\n",
5655 partition_i, partition.layout);
5656 dump_printf_loc (MSG_NOTE, vect_location, " nodes:\n");
5657 for (unsigned int order_i = partition.node_begin;
5658 order_i < partition.node_end; ++order_i)
5660 auto &vertex = m_vertices[m_partitioned_nodes[order_i]];
5661 dump_printf_loc (MSG_NOTE, vect_location, " - %p:\n",
5662 (void *) vertex.node);
5663 dump_printf_loc (MSG_NOTE, vect_location,
5664 " weight: %f\n",
5665 vertex.weight.to_double ());
5666 if (vertex.out_degree)
5667 dump_printf_loc (MSG_NOTE, vect_location,
5668 " out weight: %f (degree %d)\n",
5669 vertex.out_weight.to_double (),
5670 vertex.out_degree);
5671 if (SLP_TREE_CODE (vertex.node) == VEC_PERM_EXPR)
5672 dump_printf_loc (MSG_NOTE, vect_location,
5673 " op: VEC_PERM_EXPR\n");
5674 else if (auto rep = SLP_TREE_REPRESENTATIVE (vertex.node))
5675 dump_printf_loc (MSG_NOTE, vect_location,
5676 " op template: %G", rep->stmt);
5678 dump_printf_loc (MSG_NOTE, vect_location, " edges:\n");
5679 for (unsigned int order_i = partition.node_begin;
5680 order_i < partition.node_end; ++order_i)
5682 unsigned int node_i = m_partitioned_nodes[order_i];
5683 auto &vertex = m_vertices[node_i];
5684 auto print_edge = [&](graph_edge *, unsigned int other_node_i)
5686 auto &other_vertex = m_vertices[other_node_i];
5687 if (other_vertex.partition < vertex.partition)
5688 dump_printf_loc (MSG_NOTE, vect_location,
5689 " - %p [%d] --> %p\n",
5690 (void *) other_vertex.node,
5691 other_vertex.partition,
5692 (void *) vertex.node);
5693 else
5694 dump_printf_loc (MSG_NOTE, vect_location,
5695 " - %p --> [%d] %p\n",
5696 (void *) vertex.node,
5697 other_vertex.partition,
5698 (void *) other_vertex.node);
5700 for_each_partition_edge (node_i, print_edge);
5703 for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
5705 auto &layout_costs = partition_layout_costs (partition_i, layout_i);
5706 if (layout_costs.is_possible ())
5708 dump_printf_loc (MSG_NOTE, vect_location,
5709 " layout %d:%s\n", layout_i,
5710 partition.layout == int (layout_i)
5711 ? " (*)" : "");
5712 slpg_layout_cost combined_cost = layout_costs.in_cost;
5713 combined_cost.add_serial_cost (layout_costs.internal_cost);
5714 combined_cost.add_serial_cost (layout_costs.out_cost);
5715 #define TEMPLATE "{depth: %f, total: %f}"
5716 dump_printf_loc (MSG_NOTE, vect_location,
5717 " " TEMPLATE "\n",
5718 layout_costs.in_cost.depth.to_double (),
5719 layout_costs.in_cost.total.to_double ());
5720 dump_printf_loc (MSG_NOTE, vect_location,
5721 " + " TEMPLATE "\n",
5722 layout_costs.internal_cost.depth.to_double (),
5723 layout_costs.internal_cost.total.to_double ());
5724 dump_printf_loc (MSG_NOTE, vect_location,
5725 " + " TEMPLATE "\n",
5726 layout_costs.out_cost.depth.to_double (),
5727 layout_costs.out_cost.total.to_double ());
5728 dump_printf_loc (MSG_NOTE, vect_location,
5729 " = " TEMPLATE "\n",
5730 combined_cost.depth.to_double (),
5731 combined_cost.total.to_double ());
5732 #undef TEMPLATE
5734 else
5735 dump_printf_loc (MSG_NOTE, vect_location,
5736 " layout %d: rejected\n", layout_i);
5741 /* Main entry point for the SLP graph optimization pass. */
5743 void
5744 vect_optimize_slp_pass::run ()
5746 build_graph ();
5747 create_partitions ();
5748 start_choosing_layouts ();
5749 if (m_perms.length () > 1)
5751 forward_pass ();
5752 backward_pass ();
5753 if (dump_enabled_p ())
5754 dump ();
5755 materialize ();
5756 while (!m_perms.is_empty ())
5757 m_perms.pop ().release ();
5759 else
5760 remove_redundant_permutations ();
5761 free_graph (m_slpg);
5764 /* Optimize the SLP graph of VINFO. */
5766 void
5767 vect_optimize_slp (vec_info *vinfo)
5769 if (vinfo->slp_instances.is_empty ())
5770 return;
5771 vect_optimize_slp_pass (vinfo).run ();
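/* Minimal usage sketch (an assumption about the caller rather than
   something enforced here): the pass is run once per vec_info after the
   SLP instances have been discovered, e.g.

     vect_optimize_slp (vinfo);

   and is a no-op when there are no SLP instances.  */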
5774 /* Gather loads reachable from the individual SLP graph entries. */
5776 void
5777 vect_gather_slp_loads (vec_info *vinfo)
5779 unsigned i;
5780 slp_instance instance;
5781 FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
5783 hash_set<slp_tree> visited;
5784 vect_gather_slp_loads (SLP_INSTANCE_LOADS (instance),
5785 SLP_INSTANCE_TREE (instance), visited);
5790 /* For each possible SLP instance decide whether to SLP it and calculate
5791 the overall unrolling factor needed to SLP the loop. Return TRUE if we
5792 decided to SLP at least one instance. */
5794 bool
5795 vect_make_slp_decision (loop_vec_info loop_vinfo)
5797 unsigned int i;
5798 poly_uint64 unrolling_factor = 1;
5799 const vec<slp_instance> &slp_instances
5800 = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
5801 slp_instance instance;
5802 int decided_to_slp = 0;
5804 DUMP_VECT_SCOPE ("vect_make_slp_decision");
5806 FOR_EACH_VEC_ELT (slp_instances, i, instance)
5808 /* FORNOW: SLP if you can. */
5809 /* All unroll factors have the form:
5811 GET_MODE_SIZE (vinfo->vector_mode) * X
5813 for some rational X, so they must have a common multiple. */
5814 unrolling_factor
5815 = force_common_multiple (unrolling_factor,
5816 SLP_INSTANCE_UNROLLING_FACTOR (instance));
5818 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
5819 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
5820 loop-based vectorization. Such stmts will be marked as HYBRID. */
5821 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
5822 decided_to_slp++;
5825 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
5827 if (decided_to_slp && dump_enabled_p ())
5829 dump_printf_loc (MSG_NOTE, vect_location,
5830 "Decided to SLP %d instances. Unrolling factor ",
5831 decided_to_slp);
5832 dump_dec (MSG_NOTE, unrolling_factor);
5833 dump_printf (MSG_NOTE, "\n");
5836 return (decided_to_slp > 0);
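/* Worked example for the unrolling factor above (illustrative numbers):
   instances with unrolling factors 2 and 8 give an overall factor of 8,
   while factors 2 and 3 combine to 6; force_common_multiple computes the
   least common multiple, which is guaranteed to exist because every
   factor is a rational multiple of the vector mode size.  */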
5839 /* Private data for vect_detect_hybrid_slp. */
5840 struct vdhs_data
5842 loop_vec_info loop_vinfo;
5843 vec<stmt_vec_info> *worklist;
5846 /* Walker for walk_gimple_op. */
5848 static tree
5849 vect_detect_hybrid_slp (tree *tp, int *, void *data)
5851 walk_stmt_info *wi = (walk_stmt_info *)data;
5852 vdhs_data *dat = (vdhs_data *)wi->info;
5854 if (wi->is_lhs)
5855 return NULL_TREE;
5857 stmt_vec_info def_stmt_info = dat->loop_vinfo->lookup_def (*tp);
5858 if (!def_stmt_info)
5859 return NULL_TREE;
5860 def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
5861 if (PURE_SLP_STMT (def_stmt_info))
5863 if (dump_enabled_p ())
5864 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
5865 def_stmt_info->stmt);
5866 STMT_SLP_TYPE (def_stmt_info) = hybrid;
5867 dat->worklist->safe_push (def_stmt_info);
5870 return NULL_TREE;
5873 /* Look whether STMT_INFO is consumed by SLP indirectly and mark it
5874 pure_slp if so; otherwise push it to WORKLIST. */
5876 static void
5877 maybe_push_to_hybrid_worklist (vec_info *vinfo,
5878 vec<stmt_vec_info> &worklist,
5879 stmt_vec_info stmt_info)
5881 if (dump_enabled_p ())
5882 dump_printf_loc (MSG_NOTE, vect_location,
5883 "Processing hybrid candidate : %G", stmt_info->stmt);
5884 stmt_vec_info orig_info = vect_orig_stmt (stmt_info);
5885 imm_use_iterator iter2;
5886 ssa_op_iter iter1;
5887 use_operand_p use_p;
5888 def_operand_p def_p;
5889 bool any_def = false;
5890 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_info->stmt, iter1, SSA_OP_DEF)
5892 any_def = true;
5893 FOR_EACH_IMM_USE_FAST (use_p, iter2, DEF_FROM_PTR (def_p))
5895 if (is_gimple_debug (USE_STMT (use_p)))
5896 continue;
5897 stmt_vec_info use_info = vinfo->lookup_stmt (USE_STMT (use_p));
5898 /* An out-of-loop use means this is a loop_vect sink. */
5899 if (!use_info)
5901 if (dump_enabled_p ())
5902 dump_printf_loc (MSG_NOTE, vect_location,
5903 "Found loop_vect sink: %G", stmt_info->stmt);
5904 worklist.safe_push (stmt_info);
5905 return;
5907 else if (!STMT_SLP_TYPE (vect_stmt_to_vectorize (use_info)))
5909 if (dump_enabled_p ())
5910 dump_printf_loc (MSG_NOTE, vect_location,
5911 "Found loop_vect use: %G", use_info->stmt);
5912 worklist.safe_push (stmt_info);
5913 return;
5917 /* No def means this is a loop_vect sink. */
5918 if (!any_def)
5920 if (dump_enabled_p ())
5921 dump_printf_loc (MSG_NOTE, vect_location,
5922 "Found loop_vect sink: %G", stmt_info->stmt);
5923 worklist.safe_push (stmt_info);
5924 return;
5926 if (dump_enabled_p ())
5927 dump_printf_loc (MSG_NOTE, vect_location,
5928 "Marked SLP consumed stmt pure: %G", stmt_info->stmt);
5929 STMT_SLP_TYPE (stmt_info) = pure_slp;
5932 /* Find stmts that must be both vectorized and SLPed. */
5934 void
5935 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
5937 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
5939 /* All stmts participating in SLP are marked pure_slp, all other
5940 stmts are loop_vect.
5941 First collect all loop_vect stmts into a worklist.
5942 SLP patterns cause not all original scalar stmts to appear in
5943 SLP_TREE_SCALAR_STMTS and thus not all of them are marked pure_slp.
5944 Rectify this here and do a backward walk over the IL only considering
5945 stmts as loop_vect when they are used by a loop_vect stmt and otherwise
5946 mark them as pure_slp. */
5947 auto_vec<stmt_vec_info> worklist;
5948 for (int i = LOOP_VINFO_LOOP (loop_vinfo)->num_nodes - 1; i >= 0; --i)
5950 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
5951 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
5952 gsi_next (&gsi))
5954 gphi *phi = gsi.phi ();
5955 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (phi);
5956 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
5957 maybe_push_to_hybrid_worklist (loop_vinfo,
5958 worklist, stmt_info);
5960 for (gimple_stmt_iterator gsi = gsi_last_bb (bb); !gsi_end_p (gsi);
5961 gsi_prev (&gsi))
5963 gimple *stmt = gsi_stmt (gsi);
5964 if (is_gimple_debug (stmt))
5965 continue;
5966 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
5967 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
5969 for (gimple_stmt_iterator gsi2
5970 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
5971 !gsi_end_p (gsi2); gsi_next (&gsi2))
5973 stmt_vec_info patt_info
5974 = loop_vinfo->lookup_stmt (gsi_stmt (gsi2));
5975 if (!STMT_SLP_TYPE (patt_info)
5976 && STMT_VINFO_RELEVANT (patt_info))
5977 maybe_push_to_hybrid_worklist (loop_vinfo,
5978 worklist, patt_info);
5980 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
5982 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
5983 maybe_push_to_hybrid_worklist (loop_vinfo,
5984 worklist, stmt_info);
5988 /* Now we have a worklist of non-SLP stmts, follow use->def chains and
5989 mark any SLP vectorized stmt as hybrid.
5990 ??? We're visiting def stmts N times (once for each non-SLP and
5991 once for each hybrid-SLP use). */
5992 walk_stmt_info wi;
5993 vdhs_data dat;
5994 dat.worklist = &worklist;
5995 dat.loop_vinfo = loop_vinfo;
5996 memset (&wi, 0, sizeof (wi));
5997 wi.info = (void *)&dat;
5998 while (!worklist.is_empty ())
6000 stmt_vec_info stmt_info = worklist.pop ();
6001 /* Since SSA operands are not set up for pattern stmts we need
6002 to use walk_gimple_op. */
6003 wi.is_lhs = 0;
6004 walk_gimple_op (stmt_info->stmt, vect_detect_hybrid_slp, &wi);
6005 /* For gather/scatter make sure to walk the offset operand, which
6006 can be a scaling and conversion away. */
6007 gather_scatter_info gs_info;
6008 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
6009 && vect_check_gather_scatter (stmt_info, loop_vinfo, &gs_info))
6011 int dummy;
6012 vect_detect_hybrid_slp (&gs_info.offset, &dummy, &wi);
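/* Illustrative example of hybrid detection (hypothetical gimple, not from
   a testcase):

     a_1 = x_2 + y_3;        <-- covered by an SLP instance, pure_slp
     sum_4 = sum_5 + a_1;    <-- not covered by SLP, loop_vect

   The backward walk above pushes the loop_vect statement onto the
   worklist; the operand walk then finds that a_1 is defined by a pure_slp
   statement and marks that definition hybrid, so it is vectorized both as
   part of the SLP instance and by loop-based vectorization.  */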
6018 /* Initialize a bb_vec_info struct for the statements in BBS basic blocks. */
6020 _bb_vec_info::_bb_vec_info (vec<basic_block> _bbs, vec_info_shared *shared)
6021 : vec_info (vec_info::bb, shared),
6022 bbs (_bbs),
6023 roots (vNULL)
6025 for (unsigned i = 0; i < bbs.length (); ++i)
6027 if (i != 0)
6028 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
6029 gsi_next (&si))
6031 gphi *phi = si.phi ();
6032 gimple_set_uid (phi, 0);
6033 add_stmt (phi);
6035 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
6036 !gsi_end_p (gsi); gsi_next (&gsi))
6038 gimple *stmt = gsi_stmt (gsi);
6039 gimple_set_uid (stmt, 0);
6040 if (is_gimple_debug (stmt))
6041 continue;
6042 add_stmt (stmt);
6048 /* Free the BB_VINFO struct, as well as all the stmt_vec_info structs of
6049 all the stmts in the basic blocks. */
6051 _bb_vec_info::~_bb_vec_info ()
6053 /* Reset region marker. */
6054 for (unsigned i = 0; i < bbs.length (); ++i)
6056 if (i != 0)
6057 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
6058 gsi_next (&si))
6060 gphi *phi = si.phi ();
6061 gimple_set_uid (phi, -1);
6063 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
6064 !gsi_end_p (gsi); gsi_next (&gsi))
6066 gimple *stmt = gsi_stmt (gsi);
6067 gimple_set_uid (stmt, -1);
6071 for (unsigned i = 0; i < roots.length (); ++i)
6073 roots[i].stmts.release ();
6074 roots[i].roots.release ();
6075 roots[i].remain.release ();
6077 roots.release ();
6080 /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
6081 given that its child nodes have already been processed, and that
6082 their def types currently match their SLP node's def type. */
6084 static bool
6085 vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
6086 slp_instance node_instance,
6087 stmt_vector_for_cost *cost_vec)
6089 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
6091 /* Calculate the number of vector statements to be created for the
6092 scalar stmts in this node. For SLP reductions it is equal to the
6093 number of vector statements in the children (which has already been
6094 calculated by the recursive call). Otherwise it is the number of
6095 scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
6096 VF divided by the number of elements in a vector. */
6097 if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
6098 && !STMT_VINFO_DATA_REF (stmt_info)
6099 && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6101 for (unsigned i = 0; i < SLP_TREE_CHILDREN (node).length (); ++i)
6102 if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node)[i]) == vect_internal_def)
6104 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
6105 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[i]);
6106 break;
6109 else
6111 poly_uint64 vf;
6112 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
6113 vf = loop_vinfo->vectorization_factor;
6114 else
6115 vf = 1;
6116 unsigned int group_size = SLP_TREE_LANES (node);
6117 tree vectype = SLP_TREE_VECTYPE (node);
6118 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
6119 = vect_get_num_vectors (vf * group_size, vectype);
6122 /* Handle purely internal nodes. */
6123 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
6125 if (!vectorizable_slp_permutation (vinfo, NULL, node, cost_vec))
6126 return false;
6128 stmt_vec_info slp_stmt_info;
6129 unsigned int i;
6130 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info)
6132 if (STMT_VINFO_LIVE_P (slp_stmt_info)
6133 && !vectorizable_live_operation (vinfo, slp_stmt_info, node,
6134 node_instance, i,
6135 false, cost_vec))
6136 return false;
6138 return true;
6141 bool dummy;
6142 return vect_analyze_stmt (vinfo, stmt_info, &dummy,
6143 node, node_instance, cost_vec);
6146 /* Try to build NODE from scalars, returning true on success.
6147 NODE_INSTANCE is the SLP instance that contains NODE. */
6149 static bool
6150 vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
6151 slp_instance node_instance)
6153 stmt_vec_info stmt_info;
6154 unsigned int i;
6156 if (!is_a <bb_vec_info> (vinfo)
6157 || node == SLP_INSTANCE_TREE (node_instance)
6158 || !SLP_TREE_SCALAR_STMTS (node).exists ()
6159 || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node))
6160 /* Force the mask use to be built from scalars instead. */
6161 || VECTOR_BOOLEAN_TYPE_P (SLP_TREE_VECTYPE (node)))
6162 return false;
6164 if (dump_enabled_p ())
6165 dump_printf_loc (MSG_NOTE, vect_location,
6166 "Building vector operands of %p from scalars instead\n",
6167 (void *) node);
6169 /* Don't remove and free the child nodes here, since they could be
6170 referenced by other structures. The analysis and scheduling phases
6171 (need to) ignore child nodes of anything that isn't vect_internal_def. */
6172 unsigned int group_size = SLP_TREE_LANES (node);
6173 SLP_TREE_DEF_TYPE (node) = vect_external_def;
6174 /* Invariants get their vector type from the uses. */
6175 SLP_TREE_VECTYPE (node) = NULL_TREE;
6176 SLP_TREE_SCALAR_OPS (node).safe_grow (group_size, true);
6177 SLP_TREE_LOAD_PERMUTATION (node).release ();
6178 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6180 tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
6181 SLP_TREE_SCALAR_OPS (node)[i] = lhs;
6183 return true;
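/* Illustrative example (hypothetical stmts): if a two-lane node computing

     a_1 = b_2 * 3;
     c_3 = d_4 * 5;

   cannot be vectorized, the conversion above demotes it to an external
   def with SLP_TREE_SCALAR_OPS { a_1, c_3 }.  Users of the node then
   build their vector operand from those scalar results while a_1 and c_3
   continue to be computed by the scalar statements.  */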
6186 /* Return true if all elements of the slice are the same. */
6187 bool
6188 vect_scalar_ops_slice::all_same_p () const
6190 for (unsigned int i = 1; i < length; ++i)
6191 if (!operand_equal_p (op (0), op (i)))
6192 return false;
6193 return true;
6196 hashval_t
6197 vect_scalar_ops_slice_hash::hash (const value_type &s)
6199 hashval_t hash = 0;
6200 for (unsigned i = 0; i < s.length; ++i)
6201 hash = iterative_hash_expr (s.op (i), hash);
6202 return hash;
6205 bool
6206 vect_scalar_ops_slice_hash::equal (const value_type &s1,
6207 const compare_type &s2)
6209 if (s1.length != s2.length)
6210 return false;
6211 for (unsigned i = 0; i < s1.length; ++i)
6212 if (!operand_equal_p (s1.op (i), s2.op (i)))
6213 return false;
6214 return true;
6217 /* Compute the prologue cost for invariant or constant operands represented
6218 by NODE. */
6220 static void
6221 vect_prologue_cost_for_slp (slp_tree node,
6222 stmt_vector_for_cost *cost_vec)
6224 /* There's a special case of an existing vector def, which costs nothing. */
6225 if (SLP_TREE_SCALAR_OPS (node).length () == 0
6226 && !SLP_TREE_VEC_DEFS (node).is_empty ())
6227 return;
6228 /* Without looking at the actual initializer a vector of
6229 constants can be implemented as a load from the constant pool.
6230 When all elements are the same we can use a splat. */
6231 tree vectype = SLP_TREE_VECTYPE (node);
6232 unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
6233 unsigned HOST_WIDE_INT const_nunits;
6234 unsigned nelt_limit;
6235 auto ops = &SLP_TREE_SCALAR_OPS (node);
6236 auto_vec<unsigned int> starts (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
6237 if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
6238 && ! multiple_p (const_nunits, group_size))
6240 nelt_limit = const_nunits;
6241 hash_set<vect_scalar_ops_slice_hash> vector_ops;
6242 for (unsigned int i = 0; i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); ++i)
6243 if (!vector_ops.add ({ ops, i * const_nunits, const_nunits }))
6244 starts.quick_push (i * const_nunits);
6246 else
6248 /* If either the vector has variable length or the vectors
6249 are composed of repeated whole groups we only need to
6250 cost construction once. All vectors will be the same. */
6251 nelt_limit = group_size;
6252 starts.quick_push (0);
6254 /* ??? We're just tracking whether vectors in a single node are the same.
6255 Ideally we'd do something more global. */
6256 bool passed = false;
6257 for (unsigned int start : starts)
6259 vect_cost_for_stmt kind;
6260 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def)
6261 kind = vector_load;
6262 else if (vect_scalar_ops_slice { ops, start, nelt_limit }.all_same_p ())
6263 kind = scalar_to_vec;
6264 else
6265 kind = vec_construct;
6266 /* The target cost hook has no idea which part of the SLP node
6267 we are costing so avoid passing it down more than once. Pass
6268 it to the first vec_construct or scalar_to_vec part since for those
6269 the x86 backend tries to account for GPR to XMM register moves. */
6270 record_stmt_cost (cost_vec, 1, kind,
6271 (kind != vector_load && !passed) ? node : nullptr,
6272 vectype, 0, vect_prologue);
6273 if (kind != vector_load)
6274 passed = true;
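/* Worked example for the costing above (illustrative numbers, for an
   external rather than constant node): with group_size == 8,
   const_nunits == 4 and two vector stmts, the scalar ops are considered
   as two slices of four elements.  If the slices compare equal -- e.g.
   ops { a, b, c, d, a, b, c, d } -- only one construction is costed;
   a uniform slice such as { a, a, a, a } is costed as a scalar_to_vec
   splat instead of a vec_construct.  Constant nodes are costed as vector
   loads from the constant pool instead.  */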
6278 /* Analyze statements contained in SLP tree NODE after recursively analyzing
6279 the subtree. NODE_INSTANCE contains NODE and VINFO contains INSTANCE.
6281 Return true if the operations are supported. */
6283 static bool
6284 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
6285 slp_instance node_instance,
6286 hash_set<slp_tree> &visited_set,
6287 vec<slp_tree> &visited_vec,
6288 stmt_vector_for_cost *cost_vec)
6290 int i, j;
6291 slp_tree child;
6293 /* Assume we can code-generate all invariants. */
6294 if (!node
6295 || SLP_TREE_DEF_TYPE (node) == vect_constant_def
6296 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
6297 return true;
6299 if (SLP_TREE_DEF_TYPE (node) == vect_uninitialized_def)
6301 if (dump_enabled_p ())
6302 dump_printf_loc (MSG_NOTE, vect_location,
6303 "Failed cyclic SLP reference in %p\n", (void *) node);
6304 return false;
6306 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_internal_def);
6308 /* If we already analyzed the exact same set of scalar stmts we're done.
6309 We share the generated vector stmts for those. */
6310 if (visited_set.add (node))
6311 return true;
6312 visited_vec.safe_push (node);
6314 bool res = true;
6315 unsigned visited_rec_start = visited_vec.length ();
6316 unsigned cost_vec_rec_start = cost_vec->length ();
6317 bool seen_non_constant_child = false;
6318 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6320 res = vect_slp_analyze_node_operations (vinfo, child, node_instance,
6321 visited_set, visited_vec,
6322 cost_vec);
6323 if (!res)
6324 break;
6325 if (child && SLP_TREE_DEF_TYPE (child) != vect_constant_def)
6326 seen_non_constant_child = true;
6328 /* We're having difficulties scheduling nodes with just constant
6329 operands and no scalar stmts since we then cannot compute a stmt
6330 insertion place. */
6331 if (!seen_non_constant_child && SLP_TREE_SCALAR_STMTS (node).is_empty ())
6333 if (dump_enabled_p ())
6334 dump_printf_loc (MSG_NOTE, vect_location,
6335 "Cannot vectorize all-constant op node %p\n",
6336 (void *) node);
6337 res = false;
6340 if (res)
6341 res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
6342 cost_vec);
6343 /* If analysis failed we have to pop all recursive visited nodes
6344 plus ourselves. */
6345 if (!res)
6347 while (visited_vec.length () >= visited_rec_start)
6348 visited_set.remove (visited_vec.pop ());
6349 cost_vec->truncate (cost_vec_rec_start);
6352 /* When the node can be vectorized, cost the invariant nodes it references.
6353 This is not done in DFS order to allow the referring node's
6354 vectorizable_* calls to nail down the invariant nodes' vector type
6355 and possibly unshare it if it needs a different vector type than
6356 other referrers. */
6357 if (res)
6358 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
6359 if (child
6360 && (SLP_TREE_DEF_TYPE (child) == vect_constant_def
6361 || SLP_TREE_DEF_TYPE (child) == vect_external_def)
6362 /* Perform usual caching, note code-generation still
6363 code-gens these nodes multiple times but we expect
6364 to CSE them later. */
6365 && !visited_set.add (child))
6367 visited_vec.safe_push (child);
6368 /* ??? After auditing more code paths make a "default"
6369 and push the vector type from NODE to all children
6370 if it is not already set. */
6371 /* Compute the number of vectors to be generated. */
6372 tree vector_type = SLP_TREE_VECTYPE (child);
6373 if (!vector_type)
6375 /* For shifts with a scalar argument we don't need
6376 to cost or code-generate anything.
6377 ??? Represent this more explicitly. */
6378 gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
6379 == shift_vec_info_type)
6380 && j == 1);
6381 continue;
6383 unsigned group_size = SLP_TREE_LANES (child);
6384 poly_uint64 vf = 1;
6385 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
6386 vf = loop_vinfo->vectorization_factor;
6387 SLP_TREE_NUMBER_OF_VEC_STMTS (child)
6388 = vect_get_num_vectors (vf * group_size, vector_type);
6389 /* And cost them. */
6390 vect_prologue_cost_for_slp (child, cost_vec);
6393 /* If this node or any of its children can't be vectorized, try pruning
6394 the tree here rather than felling the whole thing. */
6395 if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
6397 /* We'll need to revisit this for invariant costing and number
6398 of vectorized stmt setting. */
6399 res = true;
6402 return res;
6405 /* Mark lanes of NODE that are live outside of the basic-block vectorized
6406 region and that can be vectorized using vectorizable_live_operation
6407 with STMT_VINFO_LIVE_P. Live operations that are not handled will cause
6408 the scalar code computing them to be retained. */
6410 static void
6411 vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
6412 slp_instance instance,
6413 stmt_vector_for_cost *cost_vec,
6414 hash_set<stmt_vec_info> &svisited,
6415 hash_set<slp_tree> &visited)
6417 if (visited.add (node))
6418 return;
6420 unsigned i;
6421 stmt_vec_info stmt_info;
6422 stmt_vec_info last_stmt = vect_find_last_scalar_stmt_in_slp (node);
6423 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6425 if (svisited.contains (stmt_info))
6426 continue;
6427 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
6428 if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)
6429 && STMT_VINFO_RELATED_STMT (orig_stmt_info) != stmt_info)
6430 /* Only the pattern root stmt computes the original scalar value. */
6431 continue;
6432 bool mark_visited = true;
6433 gimple *orig_stmt = orig_stmt_info->stmt;
6434 ssa_op_iter op_iter;
6435 def_operand_p def_p;
6436 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
6438 imm_use_iterator use_iter;
6439 gimple *use_stmt;
6440 stmt_vec_info use_stmt_info;
6441 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
6442 if (!is_gimple_debug (use_stmt))
6444 use_stmt_info = bb_vinfo->lookup_stmt (use_stmt);
6445 if (!use_stmt_info
6446 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
6448 STMT_VINFO_LIVE_P (stmt_info) = true;
6449 if (vectorizable_live_operation (bb_vinfo, stmt_info,
6450 node, instance, i,
6451 false, cost_vec))
6452 /* ??? So we know we can vectorize the live stmt
6453 from one SLP node. If we cannot do so from all
6454 or none consistently we'd have to record which
6455 SLP node (and lane) we want to use for the live
6456 operation. So make sure we can code-generate
6457 from all nodes. */
6458 mark_visited = false;
6459 else
6460 STMT_VINFO_LIVE_P (stmt_info) = false;
6461 break;
6464 /* We have to verify whether we can insert the lane extract
6465 before all uses. The following is a conservative approximation.
6466 We cannot put this into vectorizable_live_operation because
6467 iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
6468 doesn't work.
6469 Note that while the fact that we emit code for loads at the
6470 first load should make this a non-problem, leaves we construct
6471 from scalars are vectorized after the last scalar def.
6472 ??? If we'd actually compute the insert location during
6473 analysis we could use sth less conservative than the last
6474 scalar stmt in the node for the dominance check. */
6475 /* ??? What remains is "live" uses in vector CTORs in the same
6476 SLP graph which is where those uses can end up code-generated
6477 right after their definition instead of close to their original
6478 use. But that would restrict us to code-generate lane-extracts
6479 from the latest stmt in a node. So we compensate for this
6480 during code-generation, simply not replacing uses for those
6481 hopefully rare cases. */
6482 if (STMT_VINFO_LIVE_P (stmt_info))
6483 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
6484 if (!is_gimple_debug (use_stmt)
6485 && (!(use_stmt_info = bb_vinfo->lookup_stmt (use_stmt))
6486 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
6487 && !vect_stmt_dominates_stmt_p (last_stmt->stmt, use_stmt))
6489 if (dump_enabled_p ())
6490 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6491 "Cannot determine insertion place for "
6492 "lane extract\n");
6493 STMT_VINFO_LIVE_P (stmt_info) = false;
6494 mark_visited = true;
6497 if (mark_visited)
6498 svisited.add (stmt_info);
6501 slp_tree child;
6502 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6503 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
6504 vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance,
6505 cost_vec, svisited, visited);
6508 /* Determine whether we can vectorize the reduction epilogue for INSTANCE. */
6510 static bool
6511 vectorizable_bb_reduc_epilogue (slp_instance instance,
6512 stmt_vector_for_cost *cost_vec)
6514 gassign *stmt = as_a <gassign *> (instance->root_stmts[0]->stmt);
6515 enum tree_code reduc_code = gimple_assign_rhs_code (stmt);
6516 if (reduc_code == MINUS_EXPR)
6517 reduc_code = PLUS_EXPR;
6518 internal_fn reduc_fn;
6519 tree vectype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (instance));
6520 if (!vectype
6521 || !reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
6522 || reduc_fn == IFN_LAST
6523 || !direct_internal_fn_supported_p (reduc_fn, vectype, OPTIMIZE_FOR_BOTH)
6524 || !useless_type_conversion_p (TREE_TYPE (gimple_assign_lhs (stmt)),
6525 TREE_TYPE (vectype)))
6527 if (dump_enabled_p ())
6528 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6529 "not vectorized: basic block reduction epilogue "
6530 "operation unsupported.\n");
6531 return false;
6534 /* There's no way to cost a horizontal vector reduction via REDUC_FN so
6535 cost log2 vector operations plus shuffles and one extraction. */
6536 unsigned steps = floor_log2 (vect_nunits_for_cost (vectype));
6537 record_stmt_cost (cost_vec, steps, vector_stmt, instance->root_stmts[0],
6538 vectype, 0, vect_body);
6539 record_stmt_cost (cost_vec, steps, vec_perm, instance->root_stmts[0],
6540 vectype, 0, vect_body);
6541 record_stmt_cost (cost_vec, 1, vec_to_scalar, instance->root_stmts[0],
6542 vectype, 0, vect_body);
6544 /* Since we replace all stmts of a possibly longer scalar reduction
6545 chain, account for the extra scalar stmts for that. */
6546 record_stmt_cost (cost_vec, instance->remain_defs.length (), scalar_stmt,
6547 instance->root_stmts[0], 0, vect_body);
6548 return true;
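/* Worked cost example for the epilogue above (illustrative): for a
   four-element vector type vect_nunits_for_cost gives 4, so
   floor_log2 (4) == 2 and we record two vector_stmt and two vec_perm
   operations plus a single vec_to_scalar extraction, approximating a
   log2-step shuffle-and-reduce sequence.  */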
6551 /* Prune from ROOTS all stmts that are computed as part of lanes of NODE
6552 and recurse to children. */
6554 static void
6555 vect_slp_prune_covered_roots (slp_tree node, hash_set<stmt_vec_info> &roots,
6556 hash_set<slp_tree> &visited)
6558 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def
6559 || visited.add (node))
6560 return;
6562 stmt_vec_info stmt;
6563 unsigned i;
6564 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
6565 roots.remove (vect_orig_stmt (stmt));
6567 slp_tree child;
6568 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6569 if (child)
6570 vect_slp_prune_covered_roots (child, roots, visited);
6573 /* Analyze statements in SLP instances of VINFO. Return true if the
6574 operations are supported. */
6576 bool
6577 vect_slp_analyze_operations (vec_info *vinfo)
6579 slp_instance instance;
6580 int i;
6582 DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
6584 hash_set<slp_tree> visited;
6585 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
6587 auto_vec<slp_tree> visited_vec;
6588 stmt_vector_for_cost cost_vec;
6589 cost_vec.create (2);
6590 if (is_a <bb_vec_info> (vinfo))
6591 vect_location = instance->location ();
6592 if (!vect_slp_analyze_node_operations (vinfo,
6593 SLP_INSTANCE_TREE (instance),
6594 instance, visited, visited_vec,
6595 &cost_vec)
6596 /* CTOR instances require vectorized defs for the SLP tree root. */
6597 || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor
6598 && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
6599 != vect_internal_def
6600 /* Make sure we vectorized with the expected type. */
6601 || !useless_type_conversion_p
6602 (TREE_TYPE (TREE_TYPE (gimple_assign_rhs1
6603 (instance->root_stmts[0]->stmt))),
6604 TREE_TYPE (SLP_TREE_VECTYPE
6605 (SLP_INSTANCE_TREE (instance))))))
6606 /* Check we can vectorize the reduction. */
6607 || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_bb_reduc
6608 && !vectorizable_bb_reduc_epilogue (instance, &cost_vec)))
6610 slp_tree node = SLP_INSTANCE_TREE (instance);
6611 stmt_vec_info stmt_info;
6612 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
6613 stmt_info = SLP_INSTANCE_ROOT_STMTS (instance)[0];
6614 else
6615 stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
6616 if (dump_enabled_p ())
6617 dump_printf_loc (MSG_NOTE, vect_location,
6618 "removing SLP instance operations starting from: %G",
6619 stmt_info->stmt);
6620 vect_free_slp_instance (instance);
6621 vinfo->slp_instances.ordered_remove (i);
6622 cost_vec.release ();
6623 while (!visited_vec.is_empty ())
6624 visited.remove (visited_vec.pop ());
6626 else
6628 i++;
6629 if (loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo))
6631 add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
6632 cost_vec.release ();
6634 else
6635 /* For BB vectorization remember the SLP graph entry
6636 cost for later. */
6637 instance->cost_vec = cost_vec;
6641 /* Now look for SLP instances with a root that are covered by other
6642 instances and remove them. */
6643 hash_set<stmt_vec_info> roots;
6644 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
6645 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
6646 roots.add (SLP_INSTANCE_ROOT_STMTS (instance)[0]);
6647 if (!roots.is_empty ())
6649 visited.empty ();
6650 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
6651 vect_slp_prune_covered_roots (SLP_INSTANCE_TREE (instance), roots,
6652 visited);
6653 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
6654 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ()
6655 && !roots.contains (SLP_INSTANCE_ROOT_STMTS (instance)[0]))
6657 stmt_vec_info root = SLP_INSTANCE_ROOT_STMTS (instance)[0];
6658 if (dump_enabled_p ())
6659 dump_printf_loc (MSG_NOTE, vect_location,
6660 "removing SLP instance operations starting "
6661 "from: %G", root->stmt);
6662 vect_free_slp_instance (instance);
6663 vinfo->slp_instances.ordered_remove (i);
6665 else
6666 ++i;
6669 /* Compute vectorizable live stmts. */
6670 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
6672 hash_set<stmt_vec_info> svisited;
6673 hash_set<slp_tree> visited;
6674 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
6676 vect_location = instance->location ();
6677 vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
6678 instance, &instance->cost_vec, svisited,
6679 visited);
6683 return !vinfo->slp_instances.is_empty ();
6686 /* Get the SLP instance leader from INSTANCE_LEADER, thereby transitively
6687 closing any chain of leaders. */
6689 static slp_instance
6690 get_ultimate_leader (slp_instance instance,
6691 hash_map<slp_instance, slp_instance> &instance_leader)
6693 auto_vec<slp_instance *, 8> chain;
6694 slp_instance *tem;
6695 while (*(tem = instance_leader.get (instance)) != instance)
6697 chain.safe_push (tem);
6698 instance = *tem;
6700 while (!chain.is_empty ())
6701 *chain.pop () = instance;
6702 return instance;
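/* Example of the leader chasing above (hypothetical instances A, B, C):
   if INSTANCE_LEADER maps A -> B, B -> C and C -> C, then
   get_ultimate_leader (A, ...) returns C and compresses the chain so
   that A and B afterwards map directly to C.  */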
6705 namespace {
6706 /* Subroutine of vect_bb_partition_graph_r. Map KEY to INSTANCE in
6707 KEY_TO_INSTANCE, making INSTANCE the leader of any previous mapping
6708 for KEY. Return true if KEY was already in KEY_TO_INSTANCE.
6710 INSTANCE_LEADER is as for get_ultimate_leader. */
6712 template<typename T>
6713 bool
6714 vect_map_to_instance (slp_instance instance, T key,
6715 hash_map<T, slp_instance> &key_to_instance,
6716 hash_map<slp_instance, slp_instance> &instance_leader)
6718 bool existed_p;
6719 slp_instance &key_instance = key_to_instance.get_or_insert (key, &existed_p);
6720 if (!existed_p)
6722 else if (key_instance != instance)
6724 /* If we run into a previously marked key, make the current instance
6725 the leader of that key's ultimate leader. This keeps the
6726 leader chain acyclic and works even when the current instance
6727 connects two previously independent graph parts. */
6728 slp_instance key_leader
6729 = get_ultimate_leader (key_instance, instance_leader);
6730 if (key_leader != instance)
6731 instance_leader.put (key_leader, instance);
6733 key_instance = instance;
6734 return existed_p;
6738 /* Worker of vect_bb_partition_graph, recurse on NODE. */
6740 static void
6741 vect_bb_partition_graph_r (bb_vec_info bb_vinfo,
6742 slp_instance instance, slp_tree node,
6743 hash_map<stmt_vec_info, slp_instance> &stmt_to_instance,
6744 hash_map<slp_tree, slp_instance> &node_to_instance,
6745 hash_map<slp_instance, slp_instance> &instance_leader)
6747 stmt_vec_info stmt_info;
6748 unsigned i;
6750 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6751 vect_map_to_instance (instance, stmt_info, stmt_to_instance,
6752 instance_leader);
6754 if (vect_map_to_instance (instance, node, node_to_instance,
6755 instance_leader))
6756 return;
6758 slp_tree child;
6759 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6760 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
6761 vect_bb_partition_graph_r (bb_vinfo, instance, child, stmt_to_instance,
6762 node_to_instance, instance_leader);
6765 /* Partition the SLP graph into pieces that can be costed independently. */
6767 static void
6768 vect_bb_partition_graph (bb_vec_info bb_vinfo)
6770 DUMP_VECT_SCOPE ("vect_bb_partition_graph");
6772 /* First walk the SLP graph assigning each involved scalar stmt a
6773 corresponding SLP graph entry and upon visiting a previously
6774 marked stmt, make the stmt's leader the current SLP graph entry. */
6775 hash_map<stmt_vec_info, slp_instance> stmt_to_instance;
6776 hash_map<slp_tree, slp_instance> node_to_instance;
6777 hash_map<slp_instance, slp_instance> instance_leader;
6778 slp_instance instance;
6779 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
6781 instance_leader.put (instance, instance);
6782 vect_bb_partition_graph_r (bb_vinfo,
6783 instance, SLP_INSTANCE_TREE (instance),
6784 stmt_to_instance, node_to_instance,
6785 instance_leader);
6788 /* Then collect entries to each independent subgraph. */
6789 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
6791 slp_instance leader = get_ultimate_leader (instance, instance_leader);
6792 leader->subgraph_entries.safe_push (instance);
6793 if (dump_enabled_p ()
6794 && leader != instance)
6795 dump_printf_loc (MSG_NOTE, vect_location,
6796 "instance %p is leader of %p\n",
6797 (void *) leader, (void *) instance);
6801 /* Compute the set of scalar stmts participating in internal and external
6802 nodes. */
6804 static void
6805 vect_slp_gather_vectorized_scalar_stmts (vec_info *vinfo, slp_tree node,
6806 hash_set<slp_tree> &visited,
6807 hash_set<stmt_vec_info> &vstmts,
6808 hash_set<stmt_vec_info> &estmts)
6810 int i;
6811 stmt_vec_info stmt_info;
6812 slp_tree child;
6814 if (visited.add (node))
6815 return;
6817 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
6819 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6820 vstmts.add (stmt_info);
6822 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6823 if (child)
6824 vect_slp_gather_vectorized_scalar_stmts (vinfo, child, visited,
6825 vstmts, estmts);
6827 else
6828 for (tree def : SLP_TREE_SCALAR_OPS (node))
6830 stmt_vec_info def_stmt = vinfo->lookup_def (def);
6831 if (def_stmt)
6832 estmts.add (def_stmt);
6837 /* Compute the scalar cost of the SLP node NODE and its children
6838 and record it in COST_VEC. Do not account defs that are marked in LIFE and
6839 update LIFE according to uses of NODE. */
6841 static void
6842 vect_bb_slp_scalar_cost (vec_info *vinfo,
6843 slp_tree node, vec<bool, va_heap> *life,
6844 stmt_vector_for_cost *cost_vec,
6845 hash_set<stmt_vec_info> &vectorized_scalar_stmts,
6846 hash_set<slp_tree> &visited)
6848 unsigned i;
6849 stmt_vec_info stmt_info;
6850 slp_tree child;
6852 if (visited.add (node))
6853 return;
6855 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6857 ssa_op_iter op_iter;
6858 def_operand_p def_p;
6860 if ((*life)[i])
6861 continue;
6863 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
6864 gimple *orig_stmt = orig_stmt_info->stmt;
6866 /* If there is a non-vectorized use of the defs then the scalar
6867 stmt is kept live, in which case we do not account it or any
6868 required defs in the SLP children in the scalar cost. This
6869 way we make the vectorization more costly when compared to
6870 the scalar cost. */
6871 if (!STMT_VINFO_LIVE_P (stmt_info))
6873 auto_vec<gimple *, 8> worklist;
6874 hash_set<gimple *> *worklist_visited = NULL;
6875 worklist.quick_push (orig_stmt);
6878 gimple *work_stmt = worklist.pop ();
6879 FOR_EACH_PHI_OR_STMT_DEF (def_p, work_stmt, op_iter, SSA_OP_DEF)
6881 imm_use_iterator use_iter;
6882 gimple *use_stmt;
6883 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter,
6884 DEF_FROM_PTR (def_p))
6885 if (!is_gimple_debug (use_stmt))
6887 stmt_vec_info use_stmt_info
6888 = vinfo->lookup_stmt (use_stmt);
6889 if (!use_stmt_info
6890 || !vectorized_scalar_stmts.contains (use_stmt_info))
6892 if (use_stmt_info
6893 && STMT_VINFO_IN_PATTERN_P (use_stmt_info))
6895 /* For stmts participating in patterns we have
6896 to check their uses recursively. */
6897 if (!worklist_visited)
6898 worklist_visited = new hash_set<gimple *> ();
6899 if (!worklist_visited->add (use_stmt))
6900 worklist.safe_push (use_stmt);
6901 continue;
6903 (*life)[i] = true;
6904 goto next_lane;
6909 while (!worklist.is_empty ());
6910 next_lane:
6911 if (worklist_visited)
6912 delete worklist_visited;
6913 if ((*life)[i])
6914 continue;
6917 /* Count scalar stmts only once. */
6918 if (gimple_visited_p (orig_stmt))
6919 continue;
6920 gimple_set_visited (orig_stmt, true);
6922 vect_cost_for_stmt kind;
6923 if (STMT_VINFO_DATA_REF (orig_stmt_info))
6925 if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info)))
6926 kind = scalar_load;
6927 else
6928 kind = scalar_store;
6930 else if (vect_nop_conversion_p (orig_stmt_info))
6931 continue;
6932 /* For single-argument PHIs assume coalescing which means zero cost
6933 for the scalar and the vector PHIs. This avoids artificially
6934 favoring the vector path (but may pessimize it in some cases). */
6935 else if (is_a <gphi *> (orig_stmt_info->stmt)
6936 && gimple_phi_num_args
6937 (as_a <gphi *> (orig_stmt_info->stmt)) == 1)
6938 continue;
6939 else
6940 kind = scalar_stmt;
6941 record_stmt_cost (cost_vec, 1, kind, orig_stmt_info,
6942 SLP_TREE_VECTYPE (node), 0, vect_body);
6945 auto_vec<bool, 20> subtree_life;
6946 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6948 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
6950 /* Do not directly pass LIFE to the recursive call, copy it to
6951 confine changes in the callee to the current child/subtree. */
6952 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
6954 subtree_life.safe_grow_cleared (SLP_TREE_LANES (child), true);
6955 for (unsigned j = 0;
6956 j < SLP_TREE_LANE_PERMUTATION (node).length (); ++j)
6958 auto perm = SLP_TREE_LANE_PERMUTATION (node)[j];
6959 if (perm.first == i)
6960 subtree_life[perm.second] = (*life)[j];
6963 else
6965 gcc_assert (SLP_TREE_LANES (node) == SLP_TREE_LANES (child));
6966 subtree_life.safe_splice (*life);
6968 vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
6969 vectorized_scalar_stmts, visited);
6970 subtree_life.truncate (0);
6975 /* Comparator for the loop-index sorted cost vectors. */
6977 static int
6978 li_cost_vec_cmp (const void *a_, const void *b_)
6980 auto *a = (const std::pair<unsigned, stmt_info_for_cost *> *)a_;
6981 auto *b = (const std::pair<unsigned, stmt_info_for_cost *> *)b_;
6982 if (a->first < b->first)
6983 return -1;
6984 else if (a->first == b->first)
6985 return 0;
6986 return 1;
6989 /* Check if vectorization of the basic block is profitable for the
6990 subgraph denoted by SLP_INSTANCES. */
6992 static bool
6993 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
6994 vec<slp_instance> slp_instances,
6995 loop_p orig_loop)
6997 slp_instance instance;
6998 int i;
6999 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
7000 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
7002 if (dump_enabled_p ())
7004 dump_printf_loc (MSG_NOTE, vect_location, "Costing subgraph: \n");
7005 hash_set<slp_tree> visited;
7006 FOR_EACH_VEC_ELT (slp_instances, i, instance)
7007 vect_print_slp_graph (MSG_NOTE, vect_location,
7008 SLP_INSTANCE_TREE (instance), visited);
7011 /* Compute the set of scalar stmts we know will go away 'locally' when
7012 vectorizing. This used to be tracked with just PURE_SLP_STMT but that's
7013 not accurate for nodes promoted extern late or for scalar stmts that
7014 are used both in extern defs and in vectorized defs. */
7015 hash_set<stmt_vec_info> vectorized_scalar_stmts;
7016 hash_set<stmt_vec_info> scalar_stmts_in_externs;
7017 hash_set<slp_tree> visited;
7018 FOR_EACH_VEC_ELT (slp_instances, i, instance)
7020 vect_slp_gather_vectorized_scalar_stmts (bb_vinfo,
7021 SLP_INSTANCE_TREE (instance),
7022 visited,
7023 vectorized_scalar_stmts,
7024 scalar_stmts_in_externs);
7025 for (stmt_vec_info rstmt : SLP_INSTANCE_ROOT_STMTS (instance))
7026 vectorized_scalar_stmts.add (rstmt);
7028 /* Scalar stmts used as defs in external nodes need to be preserved, so
7029 remove them from vectorized_scalar_stmts. */
7030 for (stmt_vec_info stmt : scalar_stmts_in_externs)
7031 vectorized_scalar_stmts.remove (stmt);
7033 /* Calculate scalar cost and sum the cost for the vector stmts
7034 previously collected. */
7035 stmt_vector_for_cost scalar_costs = vNULL;
7036 stmt_vector_for_cost vector_costs = vNULL;
7037 visited.empty ();
7038 FOR_EACH_VEC_ELT (slp_instances, i, instance)
7040 auto_vec<bool, 20> life;
7041 life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)),
7042 true);
7043 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
7044 record_stmt_cost (&scalar_costs,
7045 SLP_INSTANCE_ROOT_STMTS (instance).length (),
7046 scalar_stmt,
7047 SLP_INSTANCE_ROOT_STMTS (instance)[0], 0, vect_body);
7048 vect_bb_slp_scalar_cost (bb_vinfo,
7049 SLP_INSTANCE_TREE (instance),
7050 &life, &scalar_costs, vectorized_scalar_stmts,
7051 visited);
7052 vector_costs.safe_splice (instance->cost_vec);
7053 instance->cost_vec.release ();
7056 if (dump_enabled_p ())
7057 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
7059 /* When costing non-loop vectorization we need to consider each covered
7060 loop independently and make sure vectorization is profitable. For
7061 now we assume a loop may not be entered or may be executed an arbitrary
7062 number of iterations (??? static information can provide more
7063 precise info here), which means we can simply cost the stmts of each
7064 containing loop separately. */
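/* For illustration (hypothetical numbers): a region spanning loops 1 and 2
   might produce scalar cost entries {(1, c0), (1, c1), (2, c2)} and vector
   cost entries {(1, v0), (2, v1)}; after sorting, the loop-1 and loop-2
   portions are summed up and compared separately by the code below.  */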
7066 /* First produce cost vectors sorted by loop index. */
7067 auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
7068 li_scalar_costs (scalar_costs.length ());
7069 auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
7070 li_vector_costs (vector_costs.length ());
7071 stmt_info_for_cost *cost;
7072 FOR_EACH_VEC_ELT (scalar_costs, i, cost)
7074 unsigned l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
7075 li_scalar_costs.quick_push (std::make_pair (l, cost));
7077 /* Use an arbitrary loop used in the region as fallback in case the first
7078 vector_costs entry does not have a stmt_info associated with it. */
7079 unsigned l = li_scalar_costs[0].first;
7080 FOR_EACH_VEC_ELT (vector_costs, i, cost)
7082 /* We inherit the loop index from the previous COST; invariants, externals
7083 and extracts immediately follow the cost for the related stmt. */
7084 if (cost->stmt_info)
7085 l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
7086 li_vector_costs.quick_push (std::make_pair (l, cost));
7088 li_scalar_costs.qsort (li_cost_vec_cmp);
7089 li_vector_costs.qsort (li_cost_vec_cmp);
7091 /* Now cost the portions individually. */
7092 unsigned vi = 0;
7093 unsigned si = 0;
7094 bool profitable = true;
7095 while (si < li_scalar_costs.length ()
7096 && vi < li_vector_costs.length ())
7098 unsigned sl = li_scalar_costs[si].first;
7099 unsigned vl = li_vector_costs[vi].first;
7100 if (sl != vl)
7102 if (dump_enabled_p ())
7103 dump_printf_loc (MSG_NOTE, vect_location,
7104 "Scalar %d and vector %d loop part do not "
7105 "match up, skipping scalar part\n", sl, vl);
7106 /* Skip the scalar part, assuming zero cost on the vector side. */
7109 si++;
7111 while (si < li_scalar_costs.length ()
7112 && li_scalar_costs[si].first == sl);
7113 continue;
7116 class vector_costs *scalar_target_cost_data = init_cost (bb_vinfo, true);
7119 add_stmt_cost (scalar_target_cost_data, li_scalar_costs[si].second);
7120 si++;
7122 while (si < li_scalar_costs.length ()
7123 && li_scalar_costs[si].first == sl);
7124 unsigned dummy;
7125 finish_cost (scalar_target_cost_data, nullptr,
7126 &dummy, &scalar_cost, &dummy);
7128 /* Complete the target-specific vector cost calculation. */
7129 class vector_costs *vect_target_cost_data = init_cost (bb_vinfo, false);
7132 add_stmt_cost (vect_target_cost_data, li_vector_costs[vi].second);
7133 vi++;
7135 while (vi < li_vector_costs.length ()
7136 && li_vector_costs[vi].first == vl);
7137 finish_cost (vect_target_cost_data, scalar_target_cost_data,
7138 &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost);
7139 delete scalar_target_cost_data;
7140 delete vect_target_cost_data;
7142 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
7144 if (dump_enabled_p ())
7146 dump_printf_loc (MSG_NOTE, vect_location,
7147 "Cost model analysis for part in loop %d:\n", sl);
7148 dump_printf (MSG_NOTE, " Vector cost: %d\n",
7149 vec_inside_cost + vec_outside_cost);
7150 dump_printf (MSG_NOTE, " Scalar cost: %d\n", scalar_cost);
7153 /* Vectorization is profitable if its cost is not more than the cost of
7154 the scalar version. Note that we err on the vector side for equal cost
7155 because the cost estimate is otherwise quite pessimistic (constant uses
7156 are free on the scalar side but cost a load on the vector side for
7157 example). */
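/* Purely illustrative numbers: with vec_prologue_cost == 2,
   vec_inside_cost == 12, vec_epilogue_cost == 0 and scalar_cost == 14 the
   comparison below is 14 > 14, which is false, so this part remains
   profitable; ties go to the vector side.  */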
7158 if (vec_outside_cost + vec_inside_cost > scalar_cost)
7160 profitable = false;
7161 break;
7164 if (profitable && vi < li_vector_costs.length ())
7166 if (dump_enabled_p ())
7167 dump_printf_loc (MSG_NOTE, vect_location,
7168 "Excess vector cost for part in loop %d:\n",
7169 li_vector_costs[vi].first);
7170 profitable = false;
7173 /* Unset visited flag. This is delayed when the subgraph is profitable
7174 and we process the loop for remaining unvectorized if-converted code. */
7175 if (!orig_loop || !profitable)
7176 FOR_EACH_VEC_ELT (scalar_costs, i, cost)
7177 gimple_set_visited (cost->stmt_info->stmt, false);
7179 scalar_costs.release ();
7180 vector_costs.release ();
7182 return profitable;
7185 /* qsort comparator for lane defs. */
7187 static int
7188 vld_cmp (const void *a_, const void *b_)
7190 auto *a = (const std::pair<unsigned, tree> *)a_;
7191 auto *b = (const std::pair<unsigned, tree> *)b_;
7192 return a->first - b->first;
7195 /* Return true if USE_STMT is a vector lane insert into VEC and set
7196 *THIS_LANE to the lane number that is set. */
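/* For example (purely illustrative), with a V4SI destination an insert
   v_2 = BIT_INSERT_EXPR <v_1, s_3, 64> yields *THIS_LANE == 2 because the
   bit position 64 is twice the 32-bit element size.  */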
7198 static bool
7199 vect_slp_is_lane_insert (gimple *use_stmt, tree vec, unsigned *this_lane)
7201 gassign *use_ass = dyn_cast <gassign *> (use_stmt);
7202 if (!use_ass
7203 || gimple_assign_rhs_code (use_ass) != BIT_INSERT_EXPR
7204 || (vec
7205 ? gimple_assign_rhs1 (use_ass) != vec
7206 : ((vec = gimple_assign_rhs1 (use_ass)), false))
7207 || !useless_type_conversion_p (TREE_TYPE (TREE_TYPE (vec)),
7208 TREE_TYPE (gimple_assign_rhs2 (use_ass)))
7209 || !constant_multiple_p
7210 (tree_to_poly_uint64 (gimple_assign_rhs3 (use_ass)),
7211 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (TREE_TYPE (vec)))),
7212 this_lane))
7213 return false;
7214 return true;
7217 /* Find vectorizable constructors, vector lane-insert chains and associative
7218 reduction chains in the region and record them in BB_VINFO->roots. */
7220 static void
7221 vect_slp_check_for_roots (bb_vec_info bb_vinfo)
7223 for (unsigned i = 0; i < bb_vinfo->bbs.length (); ++i)
7224 for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[i]);
7225 !gsi_end_p (gsi); gsi_next (&gsi))
7227 gassign *assign = dyn_cast<gassign *> (gsi_stmt (gsi));
7228 if (!assign)
7229 continue;
7231 tree rhs = gimple_assign_rhs1 (assign);
7232 enum tree_code code = gimple_assign_rhs_code (assign);
7233 use_operand_p use_p;
7234 gimple *use_stmt;
7235 if (code == CONSTRUCTOR)
7237 if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
7238 || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
7239 CONSTRUCTOR_NELTS (rhs))
7240 || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
7241 || uniform_vector_p (rhs))
7242 continue;
7244 unsigned j;
7245 tree val;
7246 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
7247 if (TREE_CODE (val) != SSA_NAME
7248 || !bb_vinfo->lookup_def (val))
7249 break;
7250 if (j != CONSTRUCTOR_NELTS (rhs))
7251 continue;
7253 vec<stmt_vec_info> roots = vNULL;
7254 roots.safe_push (bb_vinfo->lookup_stmt (assign));
7255 vec<stmt_vec_info> stmts;
7256 stmts.create (CONSTRUCTOR_NELTS (rhs));
7257 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
7258 stmts.quick_push
7259 (vect_stmt_to_vectorize (bb_vinfo->lookup_def (val)));
7260 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
7261 stmts, roots));
7263 else if (code == BIT_INSERT_EXPR
7264 && VECTOR_TYPE_P (TREE_TYPE (rhs))
7265 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).is_constant ()
7266 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).to_constant () > 1
7267 && integer_zerop (gimple_assign_rhs3 (assign))
7268 && useless_type_conversion_p
7269 (TREE_TYPE (TREE_TYPE (rhs)),
7270 TREE_TYPE (gimple_assign_rhs2 (assign)))
7271 && bb_vinfo->lookup_def (gimple_assign_rhs2 (assign)))
7273 /* We start matching on an insert to lane zero, but since the
7274 inserts need not be ordered we have to search both
7275 the def and the use chains. */
7276 tree vectype = TREE_TYPE (rhs);
7277 unsigned nlanes = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
7278 auto_vec<std::pair<unsigned, tree> > lane_defs (nlanes);
7279 auto_sbitmap lanes (nlanes);
7280 bitmap_clear (lanes);
7281 bitmap_set_bit (lanes, 0);
7282 tree def = gimple_assign_lhs (assign);
7283 lane_defs.quick_push
7284 (std::make_pair (0, gimple_assign_rhs2 (assign)));
7285 unsigned lanes_found = 1;
7286 /* Start with the use chains, the last stmt will be the root. */
7287 stmt_vec_info last = bb_vinfo->lookup_stmt (assign);
7288 vec<stmt_vec_info> roots = vNULL;
7289 roots.safe_push (last);
7292 use_operand_p use_p;
7293 gimple *use_stmt;
7294 if (!single_imm_use (def, &use_p, &use_stmt))
7295 break;
7296 unsigned this_lane;
7297 if (!bb_vinfo->lookup_stmt (use_stmt)
7298 || !vect_slp_is_lane_insert (use_stmt, def, &this_lane)
7299 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (use_stmt)))
7300 break;
7301 if (bitmap_bit_p (lanes, this_lane))
7302 break;
7303 lanes_found++;
7304 bitmap_set_bit (lanes, this_lane);
7305 gassign *use_ass = as_a <gassign *> (use_stmt);
7306 lane_defs.quick_push (std::make_pair
7307 (this_lane, gimple_assign_rhs2 (use_ass)));
7308 last = bb_vinfo->lookup_stmt (use_ass);
7309 roots.safe_push (last);
7310 def = gimple_assign_lhs (use_ass);
7312 while (lanes_found < nlanes);
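/* ROOTS was filled in use-chain order, so the final insert - the stmt
   whose result is the fully populated vector - comes last.  Swap it to
   index zero since that is the position the instance root stmt is
   expected in.  */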
7313 if (roots.length () > 1)
7314 std::swap(roots[0], roots[roots.length () - 1]);
7315 if (lanes_found < nlanes)
7317 /* Now search the def chain. */
7318 def = gimple_assign_rhs1 (assign);
7321 if (TREE_CODE (def) != SSA_NAME
7322 || !has_single_use (def))
7323 break;
7324 gimple *def_stmt = SSA_NAME_DEF_STMT (def);
7325 unsigned this_lane;
7326 if (!bb_vinfo->lookup_stmt (def_stmt)
7327 || !vect_slp_is_lane_insert (def_stmt,
7328 NULL_TREE, &this_lane)
7329 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (def_stmt)))
7330 break;
7331 if (bitmap_bit_p (lanes, this_lane))
7332 break;
7333 lanes_found++;
7334 bitmap_set_bit (lanes, this_lane);
7335 lane_defs.quick_push (std::make_pair
7336 (this_lane,
7337 gimple_assign_rhs2 (def_stmt)));
7338 roots.safe_push (bb_vinfo->lookup_stmt (def_stmt));
7339 def = gimple_assign_rhs1 (def_stmt);
7341 while (lanes_found < nlanes);
7343 if (lanes_found == nlanes)
7345 /* Sort lane_defs by lane index and register the root. */
7346 lane_defs.qsort (vld_cmp);
7347 vec<stmt_vec_info> stmts;
7348 stmts.create (nlanes);
7349 for (unsigned i = 0; i < nlanes; ++i)
7350 stmts.quick_push (bb_vinfo->lookup_def (lane_defs[i].second));
7351 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
7352 stmts, roots));
7354 else
7355 roots.release ();
7357 else if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
7358 && (associative_tree_code (code) || code == MINUS_EXPR)
7359 /* ??? This pessimizes a two-element reduction. PR54400.
7360 ??? In-order reduction could be handled if we only
7361 traverse one operand chain in vect_slp_linearize_chain. */
7362 && !needs_fold_left_reduction_p (TREE_TYPE (rhs), code)
7363 /* Ops with constants at the tail can be stripped here. */
7364 && TREE_CODE (rhs) == SSA_NAME
7365 && TREE_CODE (gimple_assign_rhs2 (assign)) == SSA_NAME
7366 /* Should be the chain end. */
7367 && (!single_imm_use (gimple_assign_lhs (assign),
7368 &use_p, &use_stmt)
7369 || !is_gimple_assign (use_stmt)
7370 || (gimple_assign_rhs_code (use_stmt) != code
7371 && ((code != PLUS_EXPR && code != MINUS_EXPR)
7372 || (gimple_assign_rhs_code (use_stmt)
7373 != (code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR))))))
7375 /* We start the match at the end of a possible association
7376 chain. */
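/* E.g. for t1 = a + b; t2 = t1 + c; x = t2 + d (names purely illustrative)
   we arrive here at the stmt computing x; vect_slp_linearize_chain below
   then collects the leaf operands {a, b, c, d}.  */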
7377 auto_vec<chain_op_t> chain;
7378 auto_vec<std::pair<tree_code, gimple *> > worklist;
7379 auto_vec<gimple *> chain_stmts;
7380 gimple *code_stmt = NULL, *alt_code_stmt = NULL;
7381 if (code == MINUS_EXPR)
7382 code = PLUS_EXPR;
7383 internal_fn reduc_fn;
7384 if (!reduction_fn_for_scalar_code (code, &reduc_fn)
7385 || reduc_fn == IFN_LAST)
7386 continue;
7387 vect_slp_linearize_chain (bb_vinfo, worklist, chain, code, assign,
7388 /* ??? */
7389 code_stmt, alt_code_stmt, &chain_stmts);
7390 if (chain.length () > 1)
7392 /* Sort the chain according to def_type and operation. */
7393 chain.sort (dt_sort_cmp, bb_vinfo);
7394 /* ??? Now we'd want to strip externals and constants
7395 but record those to be handled in the epilogue. */
7396 /* ??? For now do not allow mixing ops or externs/constants. */
7397 bool invalid = false;
7398 unsigned remain_cnt = 0;
7399 for (unsigned i = 0; i < chain.length (); ++i)
7401 if (chain[i].code != code)
7403 invalid = true;
7404 break;
7406 if (chain[i].dt != vect_internal_def)
7407 remain_cnt++;
7409 if (!invalid && chain.length () - remain_cnt > 1)
7411 vec<stmt_vec_info> stmts;
7412 vec<tree> remain = vNULL;
7413 stmts.create (chain.length ());
7414 if (remain_cnt > 0)
7415 remain.create (remain_cnt);
7416 for (unsigned i = 0; i < chain.length (); ++i)
7418 if (chain[i].dt == vect_internal_def)
7419 stmts.quick_push (bb_vinfo->lookup_def (chain[i].op));
7420 else
7421 remain.quick_push (chain[i].op);
7423 vec<stmt_vec_info> roots;
7424 roots.create (chain_stmts.length ());
7425 for (unsigned i = 0; i < chain_stmts.length (); ++i)
7426 roots.quick_push (bb_vinfo->lookup_stmt (chain_stmts[i]));
7427 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_bb_reduc,
7428 stmts, roots, remain));
7435 /* Walk the grouped store chains and replace entries with their
7436 pattern variant if any. */
7438 static void
7439 vect_fixup_store_groups_with_patterns (vec_info *vinfo)
7441 stmt_vec_info first_element;
7442 unsigned i;
7444 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
7446 /* We also have CTORs in this array. */
7447 if (!STMT_VINFO_GROUPED_ACCESS (first_element))
7448 continue;
7449 if (STMT_VINFO_IN_PATTERN_P (first_element))
7451 stmt_vec_info orig = first_element;
7452 first_element = STMT_VINFO_RELATED_STMT (first_element);
7453 DR_GROUP_FIRST_ELEMENT (first_element) = first_element;
7454 DR_GROUP_SIZE (first_element) = DR_GROUP_SIZE (orig);
7455 DR_GROUP_GAP (first_element) = DR_GROUP_GAP (orig);
7456 DR_GROUP_NEXT_ELEMENT (first_element) = DR_GROUP_NEXT_ELEMENT (orig);
7457 vinfo->grouped_stores[i] = first_element;
7459 stmt_vec_info prev = first_element;
7460 while (DR_GROUP_NEXT_ELEMENT (prev))
7462 stmt_vec_info elt = DR_GROUP_NEXT_ELEMENT (prev);
7463 if (STMT_VINFO_IN_PATTERN_P (elt))
7465 stmt_vec_info orig = elt;
7466 elt = STMT_VINFO_RELATED_STMT (elt);
7467 DR_GROUP_NEXT_ELEMENT (prev) = elt;
7468 DR_GROUP_GAP (elt) = DR_GROUP_GAP (orig);
7469 DR_GROUP_NEXT_ELEMENT (elt) = DR_GROUP_NEXT_ELEMENT (orig);
7471 DR_GROUP_FIRST_ELEMENT (elt) = first_element;
7472 prev = elt;
7477 /* Check if the region described by BB_VINFO can be vectorized, returning
7478 true if so. When returning false, set FATAL to true if the same failure
7479 would prevent vectorization at other vector sizes, false if it is still
7480 worth trying other sizes. N_STMTS is the number of statements in the
7481 region. */
7483 static bool
7484 vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
7485 vec<int> *dataref_groups)
7487 DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
7489 slp_instance instance;
7490 int i;
7491 poly_uint64 min_vf = 2;
7493 /* The first group of checks is independent of the vector size. */
7494 fatal = true;
7496 /* Analyze the data references. */
7498 if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
7500 if (dump_enabled_p ())
7501 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7502 "not vectorized: unhandled data-ref in basic "
7503 "block.\n");
7504 return false;
7507 if (!vect_analyze_data_ref_accesses (bb_vinfo, dataref_groups))
7509 if (dump_enabled_p ())
7510 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7511 "not vectorized: unhandled data access in "
7512 "basic block.\n");
7513 return false;
7516 vect_slp_check_for_roots (bb_vinfo);
7518 /* If there are no grouped stores and no constructors in the region
7519 there is no need to continue with pattern recog as vect_analyze_slp
7520 will fail anyway. */
7521 if (bb_vinfo->grouped_stores.is_empty ()
7522 && bb_vinfo->roots.is_empty ())
7524 if (dump_enabled_p ())
7525 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7526 "not vectorized: no grouped stores in "
7527 "basic block.\n");
7528 return false;
7531 /* The rest of the analysis below depends on the vector size, so failures are no longer necessarily fatal. */
7532 fatal = false;
7534 vect_pattern_recog (bb_vinfo);
7536 /* Update store groups from pattern processing. */
7537 vect_fixup_store_groups_with_patterns (bb_vinfo);
7539 /* Check the SLP opportunities in the basic block, analyze and build SLP
7540 trees. */
7541 if (!vect_analyze_slp (bb_vinfo, n_stmts))
7543 if (dump_enabled_p ())
7545 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7546 "Failed to SLP the basic block.\n");
7547 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7548 "not vectorized: failed to find SLP opportunities "
7549 "in basic block.\n");
7551 return false;
7554 /* Optimize permutations. */
7555 vect_optimize_slp (bb_vinfo);
7557 /* Gather the loads reachable from the SLP graph entries. */
7558 vect_gather_slp_loads (bb_vinfo);
7560 vect_record_base_alignments (bb_vinfo);
7562 /* Analyze and verify the alignment of data references and the
7563 dependence in the SLP instances. */
7564 for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
7566 vect_location = instance->location ();
7567 if (! vect_slp_analyze_instance_alignment (bb_vinfo, instance)
7568 || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
7570 slp_tree node = SLP_INSTANCE_TREE (instance);
7571 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
7572 if (dump_enabled_p ())
7573 dump_printf_loc (MSG_NOTE, vect_location,
7574 "removing SLP instance operations starting from: %G",
7575 stmt_info->stmt);
7576 vect_free_slp_instance (instance);
7577 BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
7578 continue;
7581 /* Mark all the statements that we want to vectorize as pure SLP and
7582 relevant. */
7583 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
7584 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
7585 unsigned j;
7586 stmt_vec_info root;
7587 /* Likewise consider instance root stmts as vectorized. */
7588 FOR_EACH_VEC_ELT (SLP_INSTANCE_ROOT_STMTS (instance), j, root)
7589 STMT_SLP_TYPE (root) = pure_slp;
7591 i++;
7593 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
7594 return false;
7596 if (!vect_slp_analyze_operations (bb_vinfo))
7598 if (dump_enabled_p ())
7599 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7600 "not vectorized: bad operation in basic block.\n");
7601 return false;
7604 vect_bb_partition_graph (bb_vinfo);
7606 return true;
7609 /* Subroutine of vect_slp_bbs. Try to vectorize the statements for all
7610 basic blocks in BBS, returning true on success.
7611 The region has N_STMTS statements and has the datarefs given by DATAREFS. */
7613 static bool
7614 vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
7615 vec<int> *dataref_groups, unsigned int n_stmts,
7616 loop_p orig_loop)
7618 bb_vec_info bb_vinfo;
7619 auto_vector_modes vector_modes;
7621 /* Autodetect first vector size we try. */
7622 machine_mode next_vector_mode = VOIDmode;
7623 targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
7624 unsigned int mode_i = 0;
7626 vec_info_shared shared;
7628 machine_mode autodetected_vector_mode = VOIDmode;
7629 while (1)
7631 bool vectorized = false;
7632 bool fatal = false;
7633 bb_vinfo = new _bb_vec_info (bbs, &shared);
7635 bool first_time_p = shared.datarefs.is_empty ();
7636 BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
7637 if (first_time_p)
7638 bb_vinfo->shared->save_datarefs ();
7639 else
7640 bb_vinfo->shared->check_datarefs ();
7641 bb_vinfo->vector_mode = next_vector_mode;
7643 if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups))
7645 if (dump_enabled_p ())
7647 dump_printf_loc (MSG_NOTE, vect_location,
7648 "***** Analysis succeeded with vector mode"
7649 " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
7650 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
7653 bb_vinfo->shared->check_datarefs ();
7655 auto_vec<slp_instance> profitable_subgraphs;
7656 for (slp_instance instance : BB_VINFO_SLP_INSTANCES (bb_vinfo))
7658 if (instance->subgraph_entries.is_empty ())
7659 continue;
7661 dump_user_location_t saved_vect_location = vect_location;
7662 vect_location = instance->location ();
7663 if (!unlimited_cost_model (NULL)
7664 && !vect_bb_vectorization_profitable_p
7665 (bb_vinfo, instance->subgraph_entries, orig_loop))
7667 if (dump_enabled_p ())
7668 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7669 "not vectorized: vectorization is not "
7670 "profitable.\n");
7671 vect_location = saved_vect_location;
7672 continue;
7675 vect_location = saved_vect_location;
7676 if (!dbg_cnt (vect_slp))
7677 continue;
7679 profitable_subgraphs.safe_push (instance);
7682 /* When we're vectorizing an if-converted loop body make sure
7683 we vectorized all if-converted code. */
7684 if (!profitable_subgraphs.is_empty ()
7685 && orig_loop)
7687 gcc_assert (bb_vinfo->bbs.length () == 1);
7688 for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[0]);
7689 !gsi_end_p (gsi); gsi_next (&gsi))
7691 /* The costing above left us with DCEable vectorized scalar
7692 stmts having the visited flag set on profitable
7693 subgraphs. Do the delayed clearing of the flag here. */
7694 if (gimple_visited_p (gsi_stmt (gsi)))
7696 gimple_set_visited (gsi_stmt (gsi), false);
7697 continue;
7699 if (flag_vect_cost_model == VECT_COST_MODEL_UNLIMITED)
7700 continue;
7702 if (gassign *ass = dyn_cast <gassign *> (gsi_stmt (gsi)))
7703 if (gimple_assign_rhs_code (ass) == COND_EXPR)
7705 if (!profitable_subgraphs.is_empty ()
7706 && dump_enabled_p ())
7707 dump_printf_loc (MSG_NOTE, vect_location,
7708 "not profitable because of "
7709 "unprofitable if-converted scalar "
7710 "code\n");
7711 profitable_subgraphs.truncate (0);
7716 /* Finally schedule the profitable subgraphs. */
7717 for (slp_instance instance : profitable_subgraphs)
7719 if (!vectorized && dump_enabled_p ())
7720 dump_printf_loc (MSG_NOTE, vect_location,
7721 "Basic block will be vectorized "
7722 "using SLP\n");
7723 vectorized = true;
7725 /* Dump before scheduling as store vectorization will remove
7726 the original stores and mess with the instance tree
7727 so querying its location will eventually ICE. */
7728 if (flag_checking)
7729 for (slp_instance sub : instance->subgraph_entries)
7730 gcc_assert (SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (sub)));
7731 unsigned HOST_WIDE_INT bytes;
7732 if (dump_enabled_p ())
7733 for (slp_instance sub : instance->subgraph_entries)
7735 tree vtype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (sub));
7736 if (GET_MODE_SIZE (TYPE_MODE (vtype)).is_constant (&bytes))
7737 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
7738 sub->location (),
7739 "basic block part vectorized using %wu "
7740 "byte vectors\n", bytes);
7741 else
7742 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
7743 sub->location (),
7744 "basic block part vectorized using "
7745 "variable length vectors\n");
7748 dump_user_location_t saved_vect_location = vect_location;
7749 vect_location = instance->location ();
7751 vect_schedule_slp (bb_vinfo, instance->subgraph_entries);
7753 vect_location = saved_vect_location;
7756 else
7758 if (dump_enabled_p ())
7759 dump_printf_loc (MSG_NOTE, vect_location,
7760 "***** Analysis failed with vector mode %s\n",
7761 GET_MODE_NAME (bb_vinfo->vector_mode));
7764 if (mode_i == 0)
7765 autodetected_vector_mode = bb_vinfo->vector_mode;
7767 if (!fatal)
7768 while (mode_i < vector_modes.length ()
7769 && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
7771 if (dump_enabled_p ())
7772 dump_printf_loc (MSG_NOTE, vect_location,
7773 "***** The result for vector mode %s would"
7774 " be the same\n",
7775 GET_MODE_NAME (vector_modes[mode_i]));
7776 mode_i += 1;
7779 delete bb_vinfo;
7781 if (mode_i < vector_modes.length ()
7782 && VECTOR_MODE_P (autodetected_vector_mode)
7783 && (related_vector_mode (vector_modes[mode_i],
7784 GET_MODE_INNER (autodetected_vector_mode))
7785 == autodetected_vector_mode)
7786 && (related_vector_mode (autodetected_vector_mode,
7787 GET_MODE_INNER (vector_modes[mode_i]))
7788 == vector_modes[mode_i]))
7790 if (dump_enabled_p ())
7791 dump_printf_loc (MSG_NOTE, vect_location,
7792 "***** Skipping vector mode %s, which would"
7793 " repeat the analysis for %s\n",
7794 GET_MODE_NAME (vector_modes[mode_i]),
7795 GET_MODE_NAME (autodetected_vector_mode));
7796 mode_i += 1;
7799 if (vectorized
7800 || mode_i == vector_modes.length ()
7801 || autodetected_vector_mode == VOIDmode
7802 /* If vect_slp_analyze_bb_1 signaled that analysis for all
7803 vector sizes will fail do not bother iterating. */
7804 || fatal)
7805 return vectorized;
7807 /* Try the next biggest vector size. */
7808 next_vector_mode = vector_modes[mode_i++];
7809 if (dump_enabled_p ())
7810 dump_printf_loc (MSG_NOTE, vect_location,
7811 "***** Re-trying analysis with vector mode %s\n",
7812 GET_MODE_NAME (next_vector_mode));
7817 /* Main entry for the BB vectorizer. Analyze and transform BBS, returns
7818 true if anything in the basic-block was vectorized. */
7820 static bool
7821 vect_slp_bbs (const vec<basic_block> &bbs, loop_p orig_loop)
7823 vec<data_reference_p> datarefs = vNULL;
7824 auto_vec<int> dataref_groups;
7825 int insns = 0;
7826 int current_group = 0;
7828 for (unsigned i = 0; i < bbs.length (); i++)
7830 basic_block bb = bbs[i];
7831 for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
7832 gsi_next (&gsi))
7834 gimple *stmt = gsi_stmt (gsi);
7835 if (is_gimple_debug (stmt))
7836 continue;
7838 insns++;
7840 if (gimple_location (stmt) != UNKNOWN_LOCATION)
7841 vect_location = stmt;
7843 if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs,
7844 &dataref_groups, current_group))
7845 ++current_group;
7847 /* New BBs always start a new DR group. */
7848 ++current_group;
7851 return vect_slp_region (bbs, datarefs, &dataref_groups, insns, orig_loop);
7854 /* Special entry for the BB vectorizer. Analyze and transform a single
7855 if-converted BB with ORIG_LOOPs body being the not if-converted
7856 representation. Returns true if anything in the basic-block was
7857 vectorized. */
7859 bool
7860 vect_slp_if_converted_bb (basic_block bb, loop_p orig_loop)
7862 auto_vec<basic_block> bbs;
7863 bbs.safe_push (bb);
7864 return vect_slp_bbs (bbs, orig_loop);
7867 /* Main entry for the BB vectorizer. Analyze and transform the basic blocks
7868 of FUN, returning true if anything was vectorized. */
7870 bool
7871 vect_slp_function (function *fun)
7873 bool r = false;
7874 int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (fun));
7875 auto_bitmap exit_bbs;
7876 bitmap_set_bit (exit_bbs, EXIT_BLOCK);
7877 edge entry = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (fun));
7878 unsigned n = rev_post_order_and_mark_dfs_back_seme (fun, entry, exit_bbs,
7879 true, rpo, NULL);
7881 /* For the moment split the function into pieces to avoid making
7882 the iteration on the vector mode moot. Split at points we know
7883 to not handle well which is CFG merges (SLP discovery doesn't
7884 handle non-loop-header PHIs) and loop exits. Since pattern
7885 recog requires reverse iteration to visit uses before defs
7886 simply chop RPO into pieces. */
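/* Concretely, the loop below starts a new region (1) at dominance
   boundaries, (2) when leaving the loop of the region's first block,
   (3) at the header of a dont-vectorize loop and (4) after a
   control-altering stmt that defines a value.  */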
7887 auto_vec<basic_block> bbs;
7888 for (unsigned i = 0; i < n; i++)
7890 basic_block bb = BASIC_BLOCK_FOR_FN (fun, rpo[i]);
7891 bool split = false;
7893 /* Split when a BB is not dominated by the first block. */
7894 if (!bbs.is_empty ()
7895 && !dominated_by_p (CDI_DOMINATORS, bb, bbs[0]))
7897 if (dump_enabled_p ())
7898 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7899 "splitting region at dominance boundary bb%d\n",
7900 bb->index);
7901 split = true;
7903 /* Split when the loop determined by the first block
7904 is exited. This is because we eventually insert
7905 invariants at region begin. */
7906 else if (!bbs.is_empty ()
7907 && bbs[0]->loop_father != bb->loop_father
7908 && !flow_loop_nested_p (bbs[0]->loop_father, bb->loop_father))
7910 if (dump_enabled_p ())
7911 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7912 "splitting region at loop %d exit at bb%d\n",
7913 bbs[0]->loop_father->num, bb->index);
7914 split = true;
7916 else if (!bbs.is_empty ()
7917 && bb->loop_father->header == bb
7918 && bb->loop_father->dont_vectorize)
7920 if (dump_enabled_p ())
7921 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7922 "splitting region at dont-vectorize loop %d "
7923 "entry at bb%d\n",
7924 bb->loop_father->num, bb->index);
7925 split = true;
7928 if (split && !bbs.is_empty ())
7930 r |= vect_slp_bbs (bbs, NULL);
7931 bbs.truncate (0);
7934 if (bbs.is_empty ())
7936 /* We need to be able to insert at the head of the region, which
7937 we cannot do for a region starting with a returns-twice call. */
7938 if (gcall *first = safe_dyn_cast <gcall *> (first_stmt (bb)))
7939 if (gimple_call_flags (first) & ECF_RETURNS_TWICE)
7941 if (dump_enabled_p ())
7942 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7943 "skipping bb%d as start of region as it "
7944 "starts with returns-twice call\n",
7945 bb->index);
7946 continue;
7948 /* If the loop this BB belongs to is marked as not to be vectorized
7949 honor that also for BB vectorization. */
7950 if (bb->loop_father->dont_vectorize)
7951 continue;
7954 bbs.safe_push (bb);
7956 /* When the stmt ending this block defines a value, inserting a
7957 vector containing its definition after it would require inserting
7958 on edges. Avoid this for now. */
7959 if (gimple *last = *gsi_last_bb (bb))
7960 if (gimple_get_lhs (last)
7961 && is_ctrl_altering_stmt (last))
7963 if (dump_enabled_p ())
7964 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7965 "splitting region at control altering "
7966 "definition %G", last);
7967 r |= vect_slp_bbs (bbs, NULL);
7968 bbs.truncate (0);
7972 if (!bbs.is_empty ())
7973 r |= vect_slp_bbs (bbs, NULL);
7975 free (rpo);
7977 return r;
7980 /* Build a variable-length vector in which the elements in ELTS are repeated
7981 to fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
7982 RESULTS and add any new instructions to SEQ.
7984 The approach we use is:
7986 (1) Find a vector mode VM with integer elements of mode IM.
7988 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
7989 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
7990 from small vectors to IM.
7992 (3) Duplicate each ELTS'[I] into a vector of mode VM.
7994 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
7995 correct byte contents.
7997 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
7999 We try to find the largest IM for which this sequence works, in order
8000 to cut down on the number of interleaves. */
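/* As an illustration (assuming a target with variable-length vectors where
   this strategy applies): for ELTS = {a, b} with 32-bit elements we may find
   IM == DImode, so {a, b} is view-converted to a single DImode value (2),
   that value is duplicated into a vector of DImode elements (3) and the
   result is view-converted back to VECTOR_TYPE (5); no interleaving (4) is
   needed because NVECTORS is 1.  */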
8002 void
8003 duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
8004 const vec<tree> &elts, unsigned int nresults,
8005 vec<tree> &results)
8007 unsigned int nelts = elts.length ();
8008 tree element_type = TREE_TYPE (vector_type);
8010 /* (1) Find a vector mode VM with integer elements of mode IM. */
8011 unsigned int nvectors = 1;
8012 tree new_vector_type;
8013 tree permutes[2];
8014 if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
8015 &nvectors, &new_vector_type,
8016 permutes))
8017 gcc_unreachable ();
8019 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
8020 unsigned int partial_nelts = nelts / nvectors;
8021 tree partial_vector_type = build_vector_type (element_type, partial_nelts);
8023 tree_vector_builder partial_elts;
8024 auto_vec<tree, 32> pieces (nvectors * 2);
8025 pieces.quick_grow_cleared (nvectors * 2);
8026 for (unsigned int i = 0; i < nvectors; ++i)
8028 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
8029 ELTS' has mode IM. */
8030 partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
8031 for (unsigned int j = 0; j < partial_nelts; ++j)
8032 partial_elts.quick_push (elts[i * partial_nelts + j]);
8033 tree t = gimple_build_vector (seq, &partial_elts);
8034 t = gimple_build (seq, VIEW_CONVERT_EXPR,
8035 TREE_TYPE (new_vector_type), t);
8037 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
8038 pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
8041 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
8042 correct byte contents.
8044 Conceptually, we need to repeat the following operation log2(nvectors)
8045 times, where hi_start = nvectors / 2:
8047 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
8048 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
8050 However, if each input repeats every N elements and the VF is
8051 a multiple of N * 2, the HI result is the same as the LO result.
8052 This will be true for the first N1 iterations of the outer loop,
8053 followed by N2 iterations for which both the LO and HI results
8054 are needed. I.e.:
8056 N1 + N2 = log2(nvectors)
8058 Each "N1 iteration" doubles the number of redundant vectors and the
8059 effect of the process as a whole is to have a sequence of nvectors/2**N1
8060 vectors that repeats 2**N1 times. Rather than generate these redundant
8061 vectors, we halve the number of vectors for each N1 iteration. */
8062 unsigned int in_start = 0;
8063 unsigned int out_start = nvectors;
8064 unsigned int new_nvectors = nvectors;
8065 for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
8067 unsigned int hi_start = new_nvectors / 2;
8068 unsigned int out_i = 0;
8069 for (unsigned int in_i = 0; in_i < new_nvectors; ++in_i)
8071 if ((in_i & 1) != 0
8072 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
8073 2 * in_repeat))
8074 continue;
8076 tree output = make_ssa_name (new_vector_type);
8077 tree input1 = pieces[in_start + (in_i / 2)];
8078 tree input2 = pieces[in_start + (in_i / 2) + hi_start];
8079 gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
8080 input1, input2,
8081 permutes[in_i & 1]);
8082 gimple_seq_add_stmt (seq, stmt);
8083 pieces[out_start + out_i] = output;
8084 out_i += 1;
8086 std::swap (in_start, out_start);
8087 new_nvectors = out_i;
8090 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
8091 results.reserve (nresults);
8092 for (unsigned int i = 0; i < nresults; ++i)
8093 if (i < new_nvectors)
8094 results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
8095 pieces[in_start + i]));
8096 else
8097 results.quick_push (results[i - new_nvectors]);
8101 /* For constant and loop invariant defs in OP_NODE this function creates
8102 vector defs that will be used in the vectorized stmts and stores them
8103 to SLP_TREE_VEC_DEFS of OP_NODE. */
8105 static void
8106 vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
8108 unsigned HOST_WIDE_INT nunits;
8109 tree vec_cst;
8110 unsigned j, number_of_places_left_in_vector;
8111 tree vector_type;
8112 tree vop;
8113 int group_size = op_node->ops.length ();
8114 unsigned int vec_num, i;
8115 unsigned number_of_copies = 1;
8116 bool constant_p;
8117 gimple_seq ctor_seq = NULL;
8118 auto_vec<tree, 16> permute_results;
8120 /* We always want SLP_TREE_VECTYPE (op_node) here correctly set. */
8121 vector_type = SLP_TREE_VECTYPE (op_node);
8123 unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node);
8124 SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
8125 auto_vec<tree> voprnds (number_of_vectors);
8127 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
8128 created vectors. It is greater than 1 if unrolling is performed.
8130 For example, we have two scalar operands, s1 and s2 (e.g., group of
8131 strided accesses of size two), while NUNITS is four (i.e., four scalars
8132 of this type can be packed in a vector). The output vector will contain
8133 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
8134 will be 2).
8136 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
8137 containing the operands.
8139 For example, NUNITS is four as before, and the group size is 8
8140 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
8141 {s5, s6, s7, s8}. */
8143 /* When using duplicate_and_interleave, we just need one element for
8144 each scalar statement. */
8145 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
8146 nunits = group_size;
8148 number_of_copies = nunits * number_of_vectors / group_size;
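/* With the first example above - NUNITS == 4, GROUP_SIZE == 2 and a single
   vector to create - this gives NUMBER_OF_COPIES == 2, i.e. {s1, s2, s1, s2};
   with the second example - NUNITS == 4, GROUP_SIZE == 8 and two vectors -
   it gives NUMBER_OF_COPIES == 1.  */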
8150 number_of_places_left_in_vector = nunits;
8151 constant_p = true;
8152 tree_vector_builder elts (vector_type, nunits, 1);
8153 elts.quick_grow (nunits);
8154 stmt_vec_info insert_after = NULL;
8155 for (j = 0; j < number_of_copies; j++)
8157 tree op;
8158 for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
8160 /* Create 'vect_ = {op0,op1,...,opn}'. */
8161 number_of_places_left_in_vector--;
8162 tree orig_op = op;
8163 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
8165 if (CONSTANT_CLASS_P (op))
8167 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
8169 /* Can't use VIEW_CONVERT_EXPR for booleans because
8170 of possibly different sizes of scalar value and
8171 vector element. */
8172 if (integer_zerop (op))
8173 op = build_int_cst (TREE_TYPE (vector_type), 0);
8174 else if (integer_onep (op))
8175 op = build_all_ones_cst (TREE_TYPE (vector_type));
8176 else
8177 gcc_unreachable ();
8179 else
8180 op = fold_unary (VIEW_CONVERT_EXPR,
8181 TREE_TYPE (vector_type), op);
8182 gcc_assert (op && CONSTANT_CLASS_P (op));
8184 else
8186 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
8187 gimple *init_stmt;
8188 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
8190 tree true_val
8191 = build_all_ones_cst (TREE_TYPE (vector_type));
8192 tree false_val
8193 = build_zero_cst (TREE_TYPE (vector_type));
8194 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
8195 init_stmt = gimple_build_assign (new_temp, COND_EXPR,
8196 op, true_val,
8197 false_val);
8199 else
8201 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
8202 op);
8203 init_stmt
8204 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
8205 op);
8207 gimple_seq_add_stmt (&ctor_seq, init_stmt);
8208 op = new_temp;
8211 elts[number_of_places_left_in_vector] = op;
8212 if (!CONSTANT_CLASS_P (op))
8213 constant_p = false;
8214 /* For BB vectorization we have to compute an insert location
8215 when a def is inside the analyzed region since we cannot
8216 simply insert at the BB start in this case. */
8217 stmt_vec_info opdef;
8218 if (TREE_CODE (orig_op) == SSA_NAME
8219 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
8220 && is_a <bb_vec_info> (vinfo)
8221 && (opdef = vinfo->lookup_def (orig_op)))
8223 if (!insert_after)
8224 insert_after = opdef;
8225 else
8226 insert_after = get_later_stmt (insert_after, opdef);
8229 if (number_of_places_left_in_vector == 0)
8231 if (constant_p
8232 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
8233 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
8234 vec_cst = gimple_build_vector (&ctor_seq, &elts);
8235 else
8237 if (permute_results.is_empty ())
8238 duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
8239 elts, number_of_vectors,
8240 permute_results);
8241 vec_cst = permute_results[number_of_vectors - j - 1];
8243 if (!gimple_seq_empty_p (ctor_seq))
8245 if (insert_after)
8247 gimple_stmt_iterator gsi;
8248 if (gimple_code (insert_after->stmt) == GIMPLE_PHI)
8250 gsi = gsi_after_labels (gimple_bb (insert_after->stmt));
8251 gsi_insert_seq_before (&gsi, ctor_seq,
8252 GSI_CONTINUE_LINKING);
8254 else if (!stmt_ends_bb_p (insert_after->stmt))
8256 gsi = gsi_for_stmt (insert_after->stmt);
8257 gsi_insert_seq_after (&gsi, ctor_seq,
8258 GSI_CONTINUE_LINKING);
8260 else
8262 /* When we want to insert after a def whose
8263 defining stmt throws, insert on the fallthru
8264 edge instead. */
8265 edge e = find_fallthru_edge
8266 (gimple_bb (insert_after->stmt)->succs);
8267 basic_block new_bb
8268 = gsi_insert_seq_on_edge_immediate (e, ctor_seq);
8269 gcc_assert (!new_bb);
8272 else
8273 vinfo->insert_seq_on_entry (NULL, ctor_seq);
8274 ctor_seq = NULL;
8276 voprnds.quick_push (vec_cst);
8277 insert_after = NULL;
8278 number_of_places_left_in_vector = nunits;
8279 constant_p = true;
8280 elts.new_vector (vector_type, nunits, 1);
8281 elts.quick_grow (nunits);
8286 /* Since the vectors were created in reverse order, reverse them again
8287 while copying to SLP_TREE_VEC_DEFS. */
8288 vec_num = voprnds.length ();
8289 for (j = vec_num; j != 0; j--)
8291 vop = voprnds[j - 1];
8292 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
8295 /* In case that VF is greater than the unrolling factor needed for the SLP
8296 group of stmts, NUMBER_OF_VECTORS to be created is greater than
8297 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
8298 to replicate the vectors. */
8299 while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ())
8300 for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (i, &vop) && i < vec_num;
8301 i++)
8302 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
8305 /* Get the Ith vectorized definition from SLP_NODE. */
8307 tree
8308 vect_get_slp_vect_def (slp_tree slp_node, unsigned i)
8310 return SLP_TREE_VEC_DEFS (slp_node)[i];
8313 /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS. */
8315 void
8316 vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs)
8318 vec_defs->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
8319 vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node));
8322 /* Get N vectorized definitions for SLP_NODE. */
8324 void
8325 vect_get_slp_defs (vec_info *,
8326 slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
8328 if (n == -1U)
8329 n = SLP_TREE_CHILDREN (slp_node).length ();
8331 for (unsigned i = 0; i < n; ++i)
8333 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
8334 vec<tree> vec_defs = vNULL;
8335 vect_get_slp_defs (child, &vec_defs);
8336 vec_oprnds->quick_push (vec_defs);
8340 /* A subroutine of vect_transform_slp_perm_load with two extra arguments:
8341 - PERM gives the permutation that the caller wants to use for NODE,
8342 which might be different from SLP_LOAD_PERMUTATION.
8343 - DUMP_P controls whether the function dumps information. */
8345 static bool
8346 vect_transform_slp_perm_load_1 (vec_info *vinfo, slp_tree node,
8347 load_permutation_t &perm,
8348 const vec<tree> &dr_chain,
8349 gimple_stmt_iterator *gsi, poly_uint64 vf,
8350 bool analyze_only, bool dump_p,
8351 unsigned *n_perms, unsigned int *n_loads,
8352 bool dce_chain)
8354 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
8355 int vec_index = 0;
8356 tree vectype = SLP_TREE_VECTYPE (node);
8357 unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
8358 unsigned int mask_element;
8359 unsigned dr_group_size;
8360 machine_mode mode;
8362 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
8363 dr_group_size = 1;
8364 else
8366 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
8367 dr_group_size = DR_GROUP_SIZE (stmt_info);
8370 mode = TYPE_MODE (vectype);
8371 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8372 unsigned int nstmts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
8374 /* Initialize the vect stmts of NODE to properly insert the generated
8375 stmts later. */
8376 if (! analyze_only)
8377 for (unsigned i = SLP_TREE_VEC_DEFS (node).length (); i < nstmts; i++)
8378 SLP_TREE_VEC_DEFS (node).quick_push (NULL_TREE);
8380 /* Generate permutation masks for every NODE. Number of masks for each NODE
8381 is equal to GROUP_SIZE.
8382 E.g., we have a group of three nodes with three loads from the same
8383 location in each node, and the vector size is 4. I.e., we have an
8384 a0b0c0a1b1c1... sequence and we need to create the following vectors:
8385 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
8386 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
8389 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
8390 The last mask is illegal since we assume two operands for permute
8391 operation, and the mask element values can't be outside that range.
8392 Hence, the last mask must be converted into {2,5,5,5}.
8393 For the first two permutations we need the first and the second input
8394 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
8395 we need the second and the third vectors: {b1,c1,a2,b2} and
8396 {c2,a3,b3,c3}. */
8398 int vect_stmts_counter = 0;
8399 unsigned int index = 0;
8400 int first_vec_index = -1;
8401 int second_vec_index = -1;
8402 bool noop_p = true;
8403 *n_perms = 0;
8405 vec_perm_builder mask;
8406 unsigned int nelts_to_build;
8407 unsigned int nvectors_per_build;
8408 unsigned int in_nlanes;
8409 bool repeating_p = (group_size == dr_group_size
8410 && multiple_p (nunits, group_size));
8411 if (repeating_p)
8413 /* A single vector contains a whole number of copies of the node, so:
8414 (a) all permutes can use the same mask; and
8415 (b) the permutes only need a single vector input. */
8416 mask.new_vector (nunits, group_size, 3);
8417 nelts_to_build = mask.encoded_nelts ();
8418 /* It's possible to obtain zero nstmts during analyze_only, so make
8419 it at least one to ensure the later computation for n_perms
8420 proceeds. */
8421 nvectors_per_build = nstmts > 0 ? nstmts : 1;
8422 in_nlanes = dr_group_size * 3;
8424 else
8426 /* We need to construct a separate mask for each vector statement. */
8427 unsigned HOST_WIDE_INT const_nunits, const_vf;
8428 if (!nunits.is_constant (&const_nunits)
8429 || !vf.is_constant (&const_vf))
8430 return false;
8431 mask.new_vector (const_nunits, const_nunits, 1);
8432 nelts_to_build = const_vf * group_size;
8433 nvectors_per_build = 1;
8434 in_nlanes = const_vf * dr_group_size;
8436 auto_sbitmap used_in_lanes (in_nlanes);
8437 bitmap_clear (used_in_lanes);
8438 auto_bitmap used_defs;
8440 unsigned int count = mask.encoded_nelts ();
8441 mask.quick_grow (count);
8442 vec_perm_indices indices;
8444 for (unsigned int j = 0; j < nelts_to_build; j++)
8446 unsigned int iter_num = j / group_size;
8447 unsigned int stmt_num = j % group_size;
8448 unsigned int i = (iter_num * dr_group_size + perm[stmt_num]);
8449 bitmap_set_bit (used_in_lanes, i);
8450 if (repeating_p)
8452 first_vec_index = 0;
8453 mask_element = i;
8455 else
8457 /* Enforced before the loop when !repeating_p. */
8458 unsigned int const_nunits = nunits.to_constant ();
8459 vec_index = i / const_nunits;
8460 mask_element = i % const_nunits;
8461 if (vec_index == first_vec_index
8462 || first_vec_index == -1)
8464 first_vec_index = vec_index;
8466 else if (vec_index == second_vec_index
8467 || second_vec_index == -1)
8469 second_vec_index = vec_index;
8470 mask_element += const_nunits;
8472 else
8474 if (dump_p)
8475 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8476 "permutation requires at "
8477 "least three vectors %G",
8478 stmt_info->stmt);
8479 gcc_assert (analyze_only);
8480 return false;
8483 gcc_assert (mask_element < 2 * const_nunits);
8486 if (mask_element != index)
8487 noop_p = false;
8488 mask[index++] = mask_element;
8490 if (index == count)
8492 if (!noop_p)
8494 indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
8495 if (!can_vec_perm_const_p (mode, mode, indices))
8497 if (dump_p)
8499 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8500 "unsupported vect permute { ");
8501 for (i = 0; i < count; ++i)
8503 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
8504 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
8506 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
8508 gcc_assert (analyze_only);
8509 return false;
8512 tree mask_vec = NULL_TREE;
8513 if (!analyze_only)
8514 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
8516 if (second_vec_index == -1)
8517 second_vec_index = first_vec_index;
8519 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
8521 ++*n_perms;
8522 if (analyze_only)
8523 continue;
8524 /* Generate the permute statement if necessary. */
8525 tree first_vec = dr_chain[first_vec_index + ri];
8526 tree second_vec = dr_chain[second_vec_index + ri];
8527 gassign *stmt = as_a<gassign *> (stmt_info->stmt);
8528 tree perm_dest
8529 = vect_create_destination_var (gimple_assign_lhs (stmt),
8530 vectype);
8531 perm_dest = make_ssa_name (perm_dest);
8532 gimple *perm_stmt
8533 = gimple_build_assign (perm_dest, VEC_PERM_EXPR, first_vec,
8534 second_vec, mask_vec);
8535 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
8536 gsi);
8537 if (dce_chain)
8539 bitmap_set_bit (used_defs, first_vec_index + ri);
8540 bitmap_set_bit (used_defs, second_vec_index + ri);
8543 /* Store the vector statement in NODE. */
8544 SLP_TREE_VEC_DEFS (node)[vect_stmts_counter++] = perm_dest;
8547 else if (!analyze_only)
8549 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
8551 tree first_vec = dr_chain[first_vec_index + ri];
8552 /* If mask was NULL_TREE generate the requested
8553 identity transform. */
8554 if (dce_chain)
8555 bitmap_set_bit (used_defs, first_vec_index + ri);
8557 /* Store the vector statement in NODE. */
8558 SLP_TREE_VEC_DEFS (node)[vect_stmts_counter++] = first_vec;
8562 index = 0;
8563 first_vec_index = -1;
8564 second_vec_index = -1;
8565 noop_p = true;
8569 if (n_loads)
8571 if (repeating_p)
8572 *n_loads = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
8573 else
8575 /* Enforced above when !repeating_p. */
8576 unsigned int const_nunits = nunits.to_constant ();
8577 *n_loads = 0;
8578 bool load_seen = false;
8579 for (unsigned i = 0; i < in_nlanes; ++i)
8581 if (i % const_nunits == 0)
8583 if (load_seen)
8584 *n_loads += 1;
8585 load_seen = false;
8587 if (bitmap_bit_p (used_in_lanes, i))
8588 load_seen = true;
8590 if (load_seen)
8591 *n_loads += 1;
8595 if (dce_chain)
8596 for (unsigned i = 0; i < dr_chain.length (); ++i)
8597 if (!bitmap_bit_p (used_defs, i))
8599 gimple *stmt = SSA_NAME_DEF_STMT (dr_chain[i]);
8600 gimple_stmt_iterator rgsi = gsi_for_stmt (stmt);
8601 gsi_remove (&rgsi, true);
8602 release_defs (stmt);
8605 return true;
8608 /* Generate vector permute statements from a list of loads in DR_CHAIN.
8609 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
8610 permute statements for the SLP node NODE. Store the number of vector
8611 permute instructions in *N_PERMS and the number of vector load
8612 instructions in *N_LOADS. If DCE_CHAIN is true, remove all definitions
8613 that were not needed. */
8615 bool
8616 vect_transform_slp_perm_load (vec_info *vinfo,
8617 slp_tree node, const vec<tree> &dr_chain,
8618 gimple_stmt_iterator *gsi, poly_uint64 vf,
8619 bool analyze_only, unsigned *n_perms,
8620 unsigned int *n_loads, bool dce_chain)
8622 return vect_transform_slp_perm_load_1 (vinfo, node,
8623 SLP_TREE_LOAD_PERMUTATION (node),
8624 dr_chain, gsi, vf, analyze_only,
8625 dump_enabled_p (), n_perms, n_loads,
8626 dce_chain);
8629 /* Produce the next vector result for SLP permutation NODE by adding a vector
8630 statement at GSI. If MASK_VEC is nonnull, add:
8632 <new SSA name> = VEC_PERM_EXPR <FIRST_DEF, SECOND_DEF, MASK_VEC>
8634 otherwise add:
8636 <new SSA name> = FIRST_DEF. */
8638 static void
8639 vect_add_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
8640 slp_tree node, tree first_def, tree second_def,
8641 tree mask_vec, poly_uint64 identity_offset)
8643 tree vectype = SLP_TREE_VECTYPE (node);
8645 /* ??? We SLP match existing vector element extracts but
8646 allow punning which we need to re-instantiate at uses
8647 but have no good way of explicitly representing. */
8648 if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)), TYPE_SIZE (vectype))
8649 && !types_compatible_p (TREE_TYPE (first_def), vectype))
8651 gassign *conv_stmt
8652 = gimple_build_assign (make_ssa_name (vectype),
8653 build1 (VIEW_CONVERT_EXPR, vectype, first_def));
8654 vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
8655 first_def = gimple_assign_lhs (conv_stmt);
8657 gassign *perm_stmt;
8658 tree perm_dest = make_ssa_name (vectype);
8659 if (mask_vec)
8661 if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)),
8662 TYPE_SIZE (vectype))
8663 && !types_compatible_p (TREE_TYPE (second_def), vectype))
8665 gassign *conv_stmt
8666 = gimple_build_assign (make_ssa_name (vectype),
8667 build1 (VIEW_CONVERT_EXPR,
8668 vectype, second_def));
8669 vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
8670 second_def = gimple_assign_lhs (conv_stmt);
8672 perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
8673 first_def, second_def,
8674 mask_vec);
8676 else if (!types_compatible_p (TREE_TYPE (first_def), vectype))
8678 /* For identity permutes we still need to handle the case
8679 of offsetted extracts or concats. */
8680 unsigned HOST_WIDE_INT c;
8681 auto first_def_nunits
8682 = TYPE_VECTOR_SUBPARTS (TREE_TYPE (first_def));
8683 if (known_le (TYPE_VECTOR_SUBPARTS (vectype), first_def_nunits))
8685 unsigned HOST_WIDE_INT elsz
8686 = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (TREE_TYPE (first_def))));
8687 tree lowpart = build3 (BIT_FIELD_REF, vectype, first_def,
8688 TYPE_SIZE (vectype),
8689 bitsize_int (identity_offset * elsz));
8690 perm_stmt = gimple_build_assign (perm_dest, lowpart);
8692 else if (constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype),
8693 first_def_nunits, &c) && c == 2)
8695 tree ctor = build_constructor_va (vectype, 2, NULL_TREE, first_def,
8696 NULL_TREE, second_def);
8697 perm_stmt = gimple_build_assign (perm_dest, ctor);
8699 else
8700 gcc_unreachable ();
8702 else
8704 /* We need a copy here in case the def was external. */
8705 perm_stmt = gimple_build_assign (perm_dest, first_def);
8707 vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
8708 /* Store the vector statement in NODE. */
8709 node->push_vec_def (perm_stmt);
8712 /* Subroutine of vectorizable_slp_permutation. Check whether the target
8713 can perform permutation PERM on the (1 or 2) input nodes in CHILDREN.
8714 If GSI is nonnull, emit the permutation there.
8716 When GSI is null, the only purpose of NODE is to give properties
8717 of the result, such as the vector type and number of SLP lanes.
8718 The node does not need to be a VEC_PERM_EXPR.
8720 If the target supports the operation, return the number of individual
8721 VEC_PERM_EXPRs needed, otherwise return -1. Print information to the
8722 dump file if DUMP_P is true. */
8724 static int
8725 vectorizable_slp_permutation_1 (vec_info *vinfo, gimple_stmt_iterator *gsi,
8726 slp_tree node, lane_permutation_t &perm,
8727 vec<slp_tree> &children, bool dump_p)
8729 tree vectype = SLP_TREE_VECTYPE (node);
8731 /* ??? We currently only support all same vector input types
8732 while the SLP IL should really do a concat + select and thus accept
8733 arbitrary mismatches. */
8734 slp_tree child;
8735 unsigned i;
8736 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8737 bool repeating_p = multiple_p (nunits, SLP_TREE_LANES (node));
8738 tree op_vectype = NULL_TREE;
8739 FOR_EACH_VEC_ELT (children, i, child)
8740 if (SLP_TREE_VECTYPE (child))
8742 op_vectype = SLP_TREE_VECTYPE (child);
8743 break;
8745 if (!op_vectype)
8746 op_vectype = vectype;
8747 FOR_EACH_VEC_ELT (children, i, child)
8749 if ((SLP_TREE_DEF_TYPE (child) != vect_internal_def
8750 && !vect_maybe_update_slp_op_vectype (child, op_vectype))
8751 || !types_compatible_p (SLP_TREE_VECTYPE (child), op_vectype)
8752 || !types_compatible_p (TREE_TYPE (vectype), TREE_TYPE (op_vectype)))
8754 if (dump_p)
8755 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8756 "Unsupported vector types in lane permutation\n");
8757 return -1;
8759 if (SLP_TREE_LANES (child) != SLP_TREE_LANES (node))
8760 repeating_p = false;
8763 gcc_assert (perm.length () == SLP_TREE_LANES (node));
8764 if (dump_p)
8766 dump_printf_loc (MSG_NOTE, vect_location,
8767 "vectorizing permutation");
8768 for (unsigned i = 0; i < perm.length (); ++i)
8769 dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
8770 if (repeating_p)
8771 dump_printf (MSG_NOTE, " (repeat %d)\n", SLP_TREE_LANES (node));
8772 dump_printf (MSG_NOTE, "\n");
8775 /* REPEATING_P is true if every output vector is guaranteed to use the
8776 same permute vector. We can handle that case for both variable-length
8777 and constant-length vectors, but we only handle other cases for
8778 constant-length vectors.
8780 Set:
8782 - NPATTERNS and NELTS_PER_PATTERN to the encoding of the permute
8783 mask vector that we want to build.
8785 - NCOPIES to the number of copies of PERM that we need in order
8786 to build the necessary permute mask vectors.
8788 - NOUTPUTS_PER_MASK to the number of output vectors we want to create
8789 for each permute mask vector. This is only relevant when GSI is
8790 nonnull. */
8791 uint64_t npatterns;
8792 unsigned nelts_per_pattern;
8793 uint64_t ncopies;
8794 unsigned noutputs_per_mask;
8795 if (repeating_p)
8797 /* We need a single permute mask vector that has the form:
8799 { X1, ..., Xn, X1 + n, ..., Xn + n, X1 + 2n, ..., Xn + 2n, ... }
8801 In other words, the original n-element permute in PERM is
8802 "unrolled" to fill a full vector. The stepped vector encoding
8803 that we use for permutes requires 3n elements. */
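/* For example, a two-lane swap PERM = { op0[1], op0[0] } is encoded
   with NPATTERNS = 2 and NELTS_PER_PATTERN = 3 as
   { 1, 0, 3, 2, 5, 4 }, i.e. the unrolled mask swaps each adjacent
   pair of elements whatever the actual vector length.  */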
8804 npatterns = SLP_TREE_LANES (node);
8805 nelts_per_pattern = ncopies = 3;
8806 noutputs_per_mask = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
8808 else
8810 /* Calculate every element of every permute mask vector explicitly,
8811 instead of relying on the pattern described above. */
8812 if (!nunits.is_constant (&npatterns))
8813 return -1;
8814 nelts_per_pattern = ncopies = 1;
8815 if (loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo))
8816 if (!LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&ncopies))
8817 return -1;
8818 noutputs_per_mask = 1;
8820 unsigned olanes = ncopies * SLP_TREE_LANES (node);
8821 gcc_assert (repeating_p || multiple_p (olanes, nunits));
8823 /* Compute the { { SLP operand, vector index }, lane } permutation
8824 sequence from the { SLP operand, scalar lane } permutation as recorded
8825 in the SLP node as an intermediate step. This part should already work
8826 for SLP children with an arbitrary number of lanes. */
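/* For example, an eight-lane blend of two eight-lane children with
   V4SI vectors, PERM = { op0[0], op1[1], op0[2], op1[3], op0[4],
   op1[5], op0[6], op1[7] } and NCOPIES = 1, becomes
     vops0[0][0] vops1[0][1] vops0[0][2] vops1[0][3],
     vops0[1][0] vops1[1][1] vops0[1][2] vops1[1][3]
   where the middle index selects the vector within the operand and the
   last index the lane within that vector.  */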
8827 auto_vec<std::pair<std::pair<unsigned, unsigned>, unsigned> > vperm;
8828 auto_vec<unsigned> active_lane;
8829 vperm.create (olanes);
8830 active_lane.safe_grow_cleared (children.length (), true);
8831 for (unsigned i = 0; i < ncopies; ++i)
8833 for (unsigned pi = 0; pi < perm.length (); ++pi)
8835 std::pair<unsigned, unsigned> p = perm[pi];
8836 tree vtype = SLP_TREE_VECTYPE (children[p.first]);
8837 if (repeating_p)
8838 vperm.quick_push ({{p.first, 0}, p.second + active_lane[p.first]});
8839 else
8841 /* We checked above that the vectors are constant-length. */
8842 unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype).to_constant ();
8843 unsigned vi = (active_lane[p.first] + p.second) / vnunits;
8844 unsigned vl = (active_lane[p.first] + p.second) % vnunits;
8845 vperm.quick_push ({{p.first, vi}, vl});
8848 /* Advance to the next group. */
8849 for (unsigned j = 0; j < children.length (); ++j)
8850 active_lane[j] += SLP_TREE_LANES (children[j]);
8853 if (dump_p)
8855 dump_printf_loc (MSG_NOTE, vect_location,
8856 "vectorizing permutation");
8857 for (unsigned i = 0; i < perm.length (); ++i)
8858 dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
8859 if (repeating_p)
8860 dump_printf (MSG_NOTE, " (repeat %d)\n", SLP_TREE_LANES (node));
8861 dump_printf (MSG_NOTE, "\n");
8862 dump_printf_loc (MSG_NOTE, vect_location, "as");
8863 for (unsigned i = 0; i < vperm.length (); ++i)
8865 if (i != 0
8866 && (repeating_p
8867 ? multiple_p (i, npatterns)
8868 : multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype))))
8869 dump_printf (MSG_NOTE, ",");
8870 dump_printf (MSG_NOTE, " vops%u[%u][%u]",
8871 vperm[i].first.first, vperm[i].first.second,
8872 vperm[i].second);
8874 dump_printf (MSG_NOTE, "\n");
8877 /* We can only handle two-vector permutes; everything else should
8878 be lowered on the SLP level. The following is closely inspired
8879 by vect_transform_slp_perm_load and is supposed to eventually
8880 replace it.
8881 ??? As intermediate step do code-gen in the SLP tree representation
8882 somehow? */
8883 std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U);
8884 std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U);
8885 unsigned int index = 0;
8886 poly_uint64 mask_element;
8887 vec_perm_builder mask;
8888 mask.new_vector (nunits, npatterns, nelts_per_pattern);
8889 unsigned int count = mask.encoded_nelts ();
8890 mask.quick_grow (count);
8891 vec_perm_indices indices;
8892 unsigned nperms = 0;
8893 for (unsigned i = 0; i < vperm.length (); ++i)
8895 mask_element = vperm[i].second;
8896 if (first_vec.first == -1U
8897 || first_vec == vperm[i].first)
8898 first_vec = vperm[i].first;
8899 else if (second_vec.first == -1U
8900 || second_vec == vperm[i].first)
8902 second_vec = vperm[i].first;
8903 mask_element += nunits;
8905 else
8907 if (dump_p)
8908 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8909 "permutation requires at "
8910 "least three vectors\n");
8911 gcc_assert (!gsi);
8912 return -1;
8915 mask[index++] = mask_element;
8917 if (index == count)
8919 indices.new_vector (mask, second_vec.first == -1U ? 1 : 2,
8920 TYPE_VECTOR_SUBPARTS (op_vectype));
8921 bool identity_p = (indices.series_p (0, 1, mask[0], 1)
8922 && constant_multiple_p (mask[0], nunits));
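/* E.g. with NUNITS 4 a mask of { 4, 5, 6, 7 } counts as an identity:
   a linear series starting at a multiple of the vector length, which
   merely selects the second vector-sized chunk of the input.  */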
8923 machine_mode vmode = TYPE_MODE (vectype);
8924 machine_mode op_vmode = TYPE_MODE (op_vectype);
8925 unsigned HOST_WIDE_INT c;
8926 if ((!identity_p
8927 && !can_vec_perm_const_p (vmode, op_vmode, indices))
8928 || (identity_p
8929 && !known_le (nunits,
8930 TYPE_VECTOR_SUBPARTS (op_vectype))
8931 && (!constant_multiple_p (nunits,
8932 TYPE_VECTOR_SUBPARTS (op_vectype),
8933 &c) || c != 2)))
8935 if (dump_p)
8937 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
8938 vect_location,
8939 "unsupported vect permute { ");
8940 for (i = 0; i < count; ++i)
8942 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
8943 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
8945 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
8947 gcc_assert (!gsi);
8948 return -1;
8951 if (!identity_p)
8952 nperms++;
8953 if (gsi)
8955 if (second_vec.first == -1U)
8956 second_vec = first_vec;
8958 slp_tree
8959 first_node = children[first_vec.first],
8960 second_node = children[second_vec.first];
8962 tree mask_vec = NULL_TREE;
8963 if (!identity_p)
8964 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
8966 for (unsigned int vi = 0; vi < noutputs_per_mask; ++vi)
8968 tree first_def
8969 = vect_get_slp_vect_def (first_node,
8970 first_vec.second + vi);
8971 tree second_def
8972 = vect_get_slp_vect_def (second_node,
8973 second_vec.second + vi);
8974 vect_add_slp_permutation (vinfo, gsi, node, first_def,
8975 second_def, mask_vec, mask[0]);
8979 index = 0;
8980 first_vec = std::make_pair (-1U, -1U);
8981 second_vec = std::make_pair (-1U, -1U);
8985 return nperms;
8988 /* Vectorize the SLP permutations in NODE as specified
8989 in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
8990 child number and lane number.
8991 Interleaving of two two-lane two-child SLP subtrees (not supported):
8992 [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
8993 A blend of two four-lane two-child SLP subtrees:
8994 [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
8995 Highpart of a four-lane one-child SLP subtree (not supported):
8996 [ { 0, 2 }, { 0, 3 } ]
8997 Of these, currently only a subset is supported by the code generation below. */
8999 static bool
9000 vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
9001 slp_tree node, stmt_vector_for_cost *cost_vec)
9003 tree vectype = SLP_TREE_VECTYPE (node);
9004 lane_permutation_t &perm = SLP_TREE_LANE_PERMUTATION (node);
9005 int nperms = vectorizable_slp_permutation_1 (vinfo, gsi, node, perm,
9006 SLP_TREE_CHILDREN (node),
9007 dump_enabled_p ());
9008 if (nperms < 0)
9009 return false;
9011 if (!gsi)
9012 record_stmt_cost (cost_vec, nperms, vec_perm, node, vectype, 0, vect_body);
9014 return true;
9017 /* Vectorize SLP NODE. */
9019 static void
9020 vect_schedule_slp_node (vec_info *vinfo,
9021 slp_tree node, slp_instance instance)
9023 gimple_stmt_iterator si;
9024 int i;
9025 slp_tree child;
9027 /* For existing vectors there's nothing to do. */
9028 if (SLP_TREE_DEF_TYPE (node) == vect_external_def
9029 && SLP_TREE_VEC_DEFS (node).exists ())
9030 return;
9032 gcc_assert (SLP_TREE_VEC_DEFS (node).is_empty ());
9034 /* Vectorize externals and constants. */
9035 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
9036 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
9038 /* ??? vectorizable_shift can end up using a scalar operand which is
9039 currently denoted as !SLP_TREE_VECTYPE. No need to vectorize the
9040 node in this case. */
9041 if (!SLP_TREE_VECTYPE (node))
9042 return;
9044 vect_create_constant_vectors (vinfo, node);
9045 return;
9048 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
9050 gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
9051 SLP_TREE_VEC_DEFS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
9053 if (dump_enabled_p ())
9054 dump_printf_loc (MSG_NOTE, vect_location,
9055 "------>vectorizing SLP node starting from: %G",
9056 stmt_info->stmt);
9058 if (STMT_VINFO_DATA_REF (stmt_info)
9059 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
9061 /* Vectorized loads go before the first scalar load to make the
9062 result available early; vectorized stores go before the last scalar
9063 stmt, which is where all uses are ready. */
9064 stmt_vec_info last_stmt_info = NULL;
9065 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
9066 last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
9067 else /* DR_IS_WRITE */
9068 last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
9069 si = gsi_for_stmt (last_stmt_info->stmt);
9071 else if ((STMT_VINFO_TYPE (stmt_info) == cycle_phi_info_type
9072 || STMT_VINFO_TYPE (stmt_info) == induc_vec_info_type
9073 || STMT_VINFO_TYPE (stmt_info) == phi_info_type)
9074 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
9076 /* For PHI node vectorization we do not use the insertion iterator. */
9077 si = gsi_none ();
9079 else
9081 /* Emit other stmts after the children's vectorized defs, which is
9082 the earliest possible insertion point. */
9083 gimple *last_stmt = NULL;
9084 bool seen_vector_def = false;
9085 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
9086 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
9088 /* For fold-left reductions we are retaining the scalar
9089 reduction PHI but we still have SLP_TREE_NUM_VEC_STMTS
9090 set so the representation isn't perfect. Resort to the
9091 last scalar def here. */
9092 if (SLP_TREE_VEC_DEFS (child).is_empty ())
9094 gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child))
9095 == cycle_phi_info_type);
9096 gphi *phi = as_a <gphi *>
9097 (vect_find_last_scalar_stmt_in_slp (child)->stmt);
9098 if (!last_stmt
9099 || vect_stmt_dominates_stmt_p (last_stmt, phi))
9100 last_stmt = phi;
9102 /* We are emitting all vectorized stmts in the same place, so
9103 the last vector def of a child is also the last one emitted.
9104 ??? Unless we have a load permutation applied and that
9105 happens to re-use an earlier generated load. */
9106 unsigned j;
9107 tree vdef;
9108 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
9110 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
9111 if (!last_stmt
9112 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
9113 last_stmt = vstmt;
9116 else if (!SLP_TREE_VECTYPE (child))
9119 /* For externals left unvectorized we look at all scalar defs. */
9119 unsigned j;
9120 tree def;
9121 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child), j, def)
9122 if (TREE_CODE (def) == SSA_NAME
9123 && !SSA_NAME_IS_DEFAULT_DEF (def))
9125 gimple *stmt = SSA_NAME_DEF_STMT (def);
9126 if (!last_stmt
9127 || vect_stmt_dominates_stmt_p (last_stmt, stmt))
9128 last_stmt = stmt;
9131 else
9133 /* For externals we have to look at all defs since their
9134 insertion place is decided per vector. But beware
9135 of pre-existing vectors where we need to make sure
9136 we do not insert before the region boundary. */
9137 if (SLP_TREE_SCALAR_OPS (child).is_empty ()
9138 && !vinfo->lookup_def (SLP_TREE_VEC_DEFS (child)[0]))
9139 seen_vector_def = true;
9140 else
9142 unsigned j;
9143 tree vdef;
9144 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
9145 if (TREE_CODE (vdef) == SSA_NAME
9146 && !SSA_NAME_IS_DEFAULT_DEF (vdef))
9148 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
9149 if (!last_stmt
9150 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
9151 last_stmt = vstmt;
9155 /* This can happen when all children are pre-existing vectors or
9156 constants. */
9157 if (!last_stmt)
9158 last_stmt = vect_find_first_scalar_stmt_in_slp (node)->stmt;
9159 if (!last_stmt)
9161 gcc_assert (seen_vector_def);
9162 si = gsi_after_labels (as_a <bb_vec_info> (vinfo)->bbs[0]);
9164 else if (is_ctrl_altering_stmt (last_stmt))
9166 /* We split regions to vectorize at control-altering stmts
9167 with a definition, so this must be an external which
9168 we can insert at the start of the region. */
9169 si = gsi_after_labels (as_a <bb_vec_info> (vinfo)->bbs[0]);
9171 else if (is_a <bb_vec_info> (vinfo)
9172 && gimple_bb (last_stmt) != gimple_bb (stmt_info->stmt)
9173 && gimple_could_trap_p (stmt_info->stmt))
9175 /* We've constrained possibly trapping operations to all come
9176 from the same basic-block; if vectorized defs would allow earlier
9177 scheduling, still force vectorized stmts to the original block.
9178 This is only necessary for BB vectorization: for loop vect all
9179 operations are in a single BB anyway, and scalar-stmt-based
9180 placement doesn't play well with epilogue vectorization. */
9181 gcc_assert (dominated_by_p (CDI_DOMINATORS,
9182 gimple_bb (stmt_info->stmt),
9183 gimple_bb (last_stmt)));
9184 si = gsi_after_labels (gimple_bb (stmt_info->stmt));
9186 else if (is_a <gphi *> (last_stmt))
9187 si = gsi_after_labels (gimple_bb (last_stmt));
9188 else
9190 si = gsi_for_stmt (last_stmt);
9191 gsi_next (&si);
9195 /* Handle purely internal nodes. */
9196 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
9198 /* ??? The transform kind is stored in STMT_VINFO_TYPE, which might
9199 be shared by different SLP nodes (but usually it's the same
9200 operation, apart from the case where the stmt is only there to denote
9201 the actual scalar lane defs ...). So do not call vect_transform_stmt
9202 but open-code it here (partly). */
9203 bool done = vectorizable_slp_permutation (vinfo, &si, node, NULL);
9204 gcc_assert (done);
9205 stmt_vec_info slp_stmt_info;
9206 unsigned int i;
9207 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info)
9208 if (STMT_VINFO_LIVE_P (slp_stmt_info))
9210 done = vectorizable_live_operation (vinfo, slp_stmt_info, node,
9211 instance, i, true, NULL);
9212 gcc_assert (done);
9215 else
9216 vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
9219 /* Replace the scalar calls in SLP node NODE with assignments setting their lhs to zero.
9220 For loop vectorization this is done in vectorizable_call, but for SLP
9221 it needs to be deferred until the end of vect_schedule_slp, because multiple
9222 SLP instances may refer to the same scalar stmt. */
9224 static void
9225 vect_remove_slp_scalar_calls (vec_info *vinfo,
9226 slp_tree node, hash_set<slp_tree> &visited)
9228 gimple *new_stmt;
9229 gimple_stmt_iterator gsi;
9230 int i;
9231 slp_tree child;
9232 tree lhs;
9233 stmt_vec_info stmt_info;
9235 if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
9236 return;
9238 if (visited.add (node))
9239 return;
9241 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
9242 vect_remove_slp_scalar_calls (vinfo, child, visited);
9244 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
9246 gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
9247 if (!stmt || gimple_bb (stmt) == NULL)
9248 continue;
9249 if (is_pattern_stmt_p (stmt_info)
9250 || !PURE_SLP_STMT (stmt_info))
9251 continue;
9252 lhs = gimple_call_lhs (stmt);
9253 if (lhs)
9254 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
9255 else
9257 new_stmt = gimple_build_nop ();
9258 unlink_stmt_vdef (stmt_info->stmt);
9260 gsi = gsi_for_stmt (stmt);
9261 vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
9262 if (lhs)
9263 SSA_NAME_DEF_STMT (lhs) = new_stmt;
9267 static void
9268 vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
9270 hash_set<slp_tree> visited;
9271 vect_remove_slp_scalar_calls (vinfo, node, visited);
9274 /* Vectorize the instance root. */
9276 void
9277 vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
9279 gassign *rstmt = NULL;
9281 if (instance->kind == slp_inst_kind_ctor)
9283 if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1)
9285 tree vect_lhs = SLP_TREE_VEC_DEFS (node)[0];
9286 tree root_lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
9287 if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
9288 TREE_TYPE (vect_lhs)))
9289 vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
9290 vect_lhs);
9291 rstmt = gimple_build_assign (root_lhs, vect_lhs);
9293 else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1)
9295 int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
9296 tree child_def;
9297 int j;
9298 vec<constructor_elt, va_gc> *v;
9299 vec_alloc (v, nelts);
9301 /* A CTOR can handle V16HI composition from VNx8HI so we
9302 do not need to convert vector elements if the types
9303 do not match. */
9304 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (node), j, child_def)
9305 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, child_def);
9306 tree lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
9307 tree rtype
9308 = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmts[0]->stmt));
9309 tree r_constructor = build_constructor (rtype, v);
9310 rstmt = gimple_build_assign (lhs, r_constructor);
9313 else if (instance->kind == slp_inst_kind_bb_reduc)
9315 /* Largely inspired by reduction chain epilogue handling in
9316 vect_create_epilog_for_reduction. */
9317 vec<tree> vec_defs = vNULL;
9318 vect_get_slp_defs (node, &vec_defs);
9319 enum tree_code reduc_code
9320 = gimple_assign_rhs_code (instance->root_stmts[0]->stmt);
9321 /* ??? We actually have to reflect signs somewhere. */
9322 if (reduc_code == MINUS_EXPR)
9323 reduc_code = PLUS_EXPR;
9324 gimple_seq epilogue = NULL;
9325 /* We may end up with more than one vector result; reduce them
9326 to one vector. */
9327 tree vec_def = vec_defs[0];
9328 tree vectype = TREE_TYPE (vec_def);
9329 tree compute_vectype = vectype;
9330 bool pun_for_overflow_p = (ANY_INTEGRAL_TYPE_P (vectype)
9331 && TYPE_OVERFLOW_UNDEFINED (vectype)
9332 && operation_can_overflow (reduc_code));
9333 if (pun_for_overflow_p)
9335 compute_vectype = unsigned_type_for (vectype);
9336 vec_def = gimple_build (&epilogue, VIEW_CONVERT_EXPR,
9337 compute_vectype, vec_def);
9339 for (unsigned i = 1; i < vec_defs.length (); ++i)
9341 tree def = vec_defs[i];
9342 if (pun_for_overflow_p)
9343 def = gimple_build (&epilogue, VIEW_CONVERT_EXPR,
9344 compute_vectype, def);
9345 vec_def = gimple_build (&epilogue, reduc_code, compute_vectype,
9346 vec_def, def);
9348 vec_defs.release ();
9349 /* ??? Support other schemes than direct internal fn. */
9350 internal_fn reduc_fn;
9351 if (!reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
9352 || reduc_fn == IFN_LAST)
9353 gcc_unreachable ();
9354 tree scalar_def = gimple_build (&epilogue, as_combined_fn (reduc_fn),
9355 TREE_TYPE (compute_vectype), vec_def);
9356 if (!SLP_INSTANCE_REMAIN_DEFS (instance).is_empty ())
9358 tree rem_def = NULL_TREE;
9359 for (auto def : SLP_INSTANCE_REMAIN_DEFS (instance))
9361 def = gimple_convert (&epilogue, TREE_TYPE (scalar_def), def);
9362 if (!rem_def)
9363 rem_def = def;
9364 else
9365 rem_def = gimple_build (&epilogue, reduc_code,
9366 TREE_TYPE (scalar_def),
9367 rem_def, def);
9369 scalar_def = gimple_build (&epilogue, reduc_code,
9370 TREE_TYPE (scalar_def),
9371 scalar_def, rem_def);
9373 scalar_def = gimple_convert (&epilogue,
9374 TREE_TYPE (vectype), scalar_def);
9375 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
9376 gsi_insert_seq_before (&rgsi, epilogue, GSI_SAME_STMT);
9377 gimple_assign_set_rhs_from_tree (&rgsi, scalar_def);
9378 update_stmt (gsi_stmt (rgsi));
9379 return;
9381 else
9382 gcc_unreachable ();
9384 gcc_assert (rstmt);
9386 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
9387 gsi_replace (&rgsi, rstmt, true);
9390 struct slp_scc_info
9392 bool on_stack;
9393 int dfs;
9394 int lowlink;
9397 /* Schedule the SLP INSTANCE doing a DFS walk and collecting SCCs. */
9399 static void
9400 vect_schedule_scc (vec_info *vinfo, slp_tree node, slp_instance instance,
9401 hash_map<slp_tree, slp_scc_info> &scc_info,
9402 int &maxdfs, vec<slp_tree> &stack)
9404 bool existed_p;
9405 slp_scc_info *info = &scc_info.get_or_insert (node, &existed_p);
9406 gcc_assert (!existed_p);
9407 info->dfs = maxdfs;
9408 info->lowlink = maxdfs;
9409 maxdfs++;
9411 /* Leaf. */
9412 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
9414 info->on_stack = false;
9415 vect_schedule_slp_node (vinfo, node, instance);
9416 return;
9419 info->on_stack = true;
9420 stack.safe_push (node);
9422 unsigned i;
9423 slp_tree child;
9424 /* DFS recurse. */
9425 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
9427 if (!child)
9428 continue;
9429 slp_scc_info *child_info = scc_info.get (child);
9430 if (!child_info)
9432 vect_schedule_scc (vinfo, child, instance, scc_info, maxdfs, stack);
9433 /* Recursion might have re-allocated the node. */
9434 info = scc_info.get (node);
9435 child_info = scc_info.get (child);
9436 info->lowlink = MIN (info->lowlink, child_info->lowlink);
9438 else if (child_info->on_stack)
9439 info->lowlink = MIN (info->lowlink, child_info->dfs);
9441 if (info->lowlink != info->dfs)
9442 return;
9444 auto_vec<slp_tree, 4> phis_to_fixup;
9446 /* Singleton. */
9447 if (stack.last () == node)
9449 stack.pop ();
9450 info->on_stack = false;
9451 vect_schedule_slp_node (vinfo, node, instance);
9452 if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
9453 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (node)->stmt))
9454 phis_to_fixup.quick_push (node);
9456 else
9458 /* SCC. */
9459 int last_idx = stack.length () - 1;
9460 while (stack[last_idx] != node)
9461 last_idx--;
9462 /* We can break the cycle at PHIs that have at least one child
9463 already code generated. Then we could re-start the DFS walk until
9464 all nodes in the SCC are covered (we might have new entries
9465 for only back-reachable nodes). But it's simpler to just
9466 iterate and schedule those that are ready. */
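/* In the loop below a non-PHI node is ready only when none of its
   children are still on the stack, while a PHI is ready as soon as one
   child is missing (a backedge value) or already code generated; the
   PHIs scheduled this way are what break the cycle.  */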
9467 unsigned todo = stack.length () - last_idx;
9470 for (int idx = stack.length () - 1; idx >= last_idx; --idx)
9472 slp_tree entry = stack[idx];
9473 if (!entry)
9474 continue;
9475 bool phi = (SLP_TREE_CODE (entry) != VEC_PERM_EXPR
9476 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (entry)->stmt));
9477 bool ready = !phi;
9478 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry), i, child)
9479 if (!child)
9481 gcc_assert (phi);
9482 ready = true;
9483 break;
9485 else if (scc_info.get (child)->on_stack)
9487 if (!phi)
9489 ready = false;
9490 break;
9493 else
9495 if (phi)
9497 ready = true;
9498 break;
9501 if (ready)
9503 vect_schedule_slp_node (vinfo, entry, instance);
9504 scc_info.get (entry)->on_stack = false;
9505 stack[idx] = NULL;
9506 todo--;
9507 if (phi)
9508 phis_to_fixup.safe_push (entry);
9512 while (todo != 0);
9514 /* Pop the SCC. */
9515 stack.truncate (last_idx);
9518 /* Now fixup the backedge def of the vectorized PHIs in this SCC. */
9519 slp_tree phi_node;
9520 FOR_EACH_VEC_ELT (phis_to_fixup, i, phi_node)
9522 gphi *phi = as_a <gphi *> (SLP_TREE_REPRESENTATIVE (phi_node)->stmt);
9523 edge_iterator ei;
9524 edge e;
9525 FOR_EACH_EDGE (e, ei, gimple_bb (phi)->preds)
9527 unsigned dest_idx = e->dest_idx;
9528 child = SLP_TREE_CHILDREN (phi_node)[dest_idx];
9529 if (!child || SLP_TREE_DEF_TYPE (child) != vect_internal_def)
9530 continue;
9531 unsigned n = SLP_TREE_VEC_DEFS (phi_node).length ();
9532 /* Simply fill all args. */
9533 if (STMT_VINFO_DEF_TYPE (SLP_TREE_REPRESENTATIVE (phi_node))
9534 != vect_first_order_recurrence)
9535 for (unsigned i = 0; i < n; ++i)
9537 tree phidef = SLP_TREE_VEC_DEFS (phi_node)[i];
9538 gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef));
9539 add_phi_arg (phi, vect_get_slp_vect_def (child, i),
9540 e, gimple_phi_arg_location (phi, dest_idx));
9542 else
9544 /* Unless it is a first-order recurrence, which needs
9545 args filled in for both the PHI node and the permutes. */
9546 gimple *perm
9547 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (phi_node)[0]);
9548 gimple *rphi = SSA_NAME_DEF_STMT (gimple_assign_rhs1 (perm));
9549 add_phi_arg (as_a <gphi *> (rphi),
9550 vect_get_slp_vect_def (child, n - 1),
9551 e, gimple_phi_arg_location (phi, dest_idx));
9552 for (unsigned i = 0; i < n; ++i)
9554 gimple *perm
9555 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (phi_node)[i]);
9556 if (i > 0)
9557 gimple_assign_set_rhs1 (perm,
9558 vect_get_slp_vect_def (child, i - 1));
9559 gimple_assign_set_rhs2 (perm,
9560 vect_get_slp_vect_def (child, i));
9561 update_stmt (perm);
9568 /* Generate vector code for SLP_INSTANCES in the loop/basic block. */
9570 void
9571 vect_schedule_slp (vec_info *vinfo, const vec<slp_instance> &slp_instances)
9573 slp_instance instance;
9574 unsigned int i;
9576 hash_map<slp_tree, slp_scc_info> scc_info;
9577 int maxdfs = 0;
9578 FOR_EACH_VEC_ELT (slp_instances, i, instance)
9580 slp_tree node = SLP_INSTANCE_TREE (instance);
9581 if (dump_enabled_p ())
9583 dump_printf_loc (MSG_NOTE, vect_location,
9584 "Vectorizing SLP tree:\n");
9585 /* ??? Dump all? */
9586 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
9587 dump_printf_loc (MSG_NOTE, vect_location, "Root stmt: %G",
9588 SLP_INSTANCE_ROOT_STMTS (instance)[0]->stmt);
9589 vect_print_slp_graph (MSG_NOTE, vect_location,
9590 SLP_INSTANCE_TREE (instance));
9592 /* Schedule the tree of INSTANCE, scheduling SCCs in a way to
9593 have a PHI be the node breaking the cycle. */
9594 auto_vec<slp_tree> stack;
9595 if (!scc_info.get (node))
9596 vect_schedule_scc (vinfo, node, instance, scc_info, maxdfs, stack);
9598 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
9599 vectorize_slp_instance_root_stmt (node, instance);
9601 if (dump_enabled_p ())
9602 dump_printf_loc (MSG_NOTE, vect_location,
9603 "vectorizing stmts using SLP.\n");
9606 FOR_EACH_VEC_ELT (slp_instances, i, instance)
9608 slp_tree root = SLP_INSTANCE_TREE (instance);
9609 stmt_vec_info store_info;
9610 unsigned int j;
9612 /* Remove scalar call stmts. Do not do this for basic-block
9613 vectorization as not all uses may be vectorized.
9614 ??? Why should this be necessary? DCE should be able to
9615 remove the stmts itself.
9616 ??? For BB vectorization we can as well remove scalar
9617 stmts starting from the SLP tree root if they have no
9618 uses. */
9619 if (is_a <loop_vec_info> (vinfo))
9620 vect_remove_slp_scalar_calls (vinfo, root);
9622 /* Remove the vectorized stores' original scalar stmts. */
9623 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
9625 if (!STMT_VINFO_DATA_REF (store_info)
9626 || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info)))
9627 break;
9629 store_info = vect_orig_stmt (store_info);
9630 /* Free the attached stmt_vec_info and remove the stmt. */
9631 vinfo->remove_stmt (store_info);
9633 /* Invalidate SLP_TREE_REPRESENTATIVE in case we released it
9634 to not crash in vect_free_slp_tree later. */
9635 if (SLP_TREE_REPRESENTATIVE (root) == store_info)
9636 SLP_TREE_REPRESENTATIVE (root) = NULL;