gcc/tree-vect-slp.cc
1 /* SLP - Basic Block Vectorization
2 Copyright (C) 2007-2023 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #define INCLUDE_ALGORITHM
24 #include "system.h"
25 #include "coretypes.h"
26 #include "backend.h"
27 #include "target.h"
28 #include "rtl.h"
29 #include "tree.h"
30 #include "gimple.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "insn-config.h"
35 #include "recog.h" /* FIXME: for insn_data */
36 #include "fold-const.h"
37 #include "stor-layout.h"
38 #include "gimple-iterator.h"
39 #include "cfgloop.h"
40 #include "tree-vectorizer.h"
41 #include "langhooks.h"
42 #include "gimple-walk.h"
43 #include "dbgcnt.h"
44 #include "tree-vector-builder.h"
45 #include "vec-perm-indices.h"
46 #include "gimple-fold.h"
47 #include "internal-fn.h"
48 #include "dump-context.h"
49 #include "cfganal.h"
50 #include "tree-eh.h"
51 #include "tree-cfg.h"
52 #include "alloc-pool.h"
53 #include "sreal.h"
54 #include "predict.h"
56 static bool vect_transform_slp_perm_load_1 (vec_info *, slp_tree,
57 load_permutation_t &,
58 const vec<tree> &,
59 gimple_stmt_iterator *,
60 poly_uint64, bool, bool,
61 unsigned *,
62 unsigned * = nullptr,
63 bool = false);
64 static int vectorizable_slp_permutation_1 (vec_info *, gimple_stmt_iterator *,
65 slp_tree, lane_permutation_t &,
66 vec<slp_tree> &, bool);
67 static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
68 slp_tree, stmt_vector_for_cost *);
69 static void vect_print_slp_tree (dump_flags_t, dump_location_t, slp_tree);
71 static object_allocator<_slp_tree> *slp_tree_pool;
72 static slp_tree slp_first_node;
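/* Set up the object pool that backs allocation of SLP tree nodes.  */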
74 void
75 vect_slp_init (void)
77 slp_tree_pool = new object_allocator<_slp_tree> ("SLP nodes");
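/* Delete any SLP nodes that are still live and release the node pool.  */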
80 void
81 vect_slp_fini (void)
83 while (slp_first_node)
84 delete slp_first_node;
85 delete slp_tree_pool;
86 slp_tree_pool = NULL;
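/* Allocate storage for an SLP node from the pool.  */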
89 void *
90 _slp_tree::operator new (size_t n)
92 gcc_assert (n == sizeof (_slp_tree));
93 return slp_tree_pool->allocate_raw ();
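/* Return an SLP node's storage to the pool.  */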
96 void
97 _slp_tree::operator delete (void *node, size_t n)
99 gcc_assert (n == sizeof (_slp_tree));
100 slp_tree_pool->remove_raw (node);
104 /* Initialize a SLP node. */
106 _slp_tree::_slp_tree ()
108 this->prev_node = NULL;
109 if (slp_first_node)
110 slp_first_node->prev_node = this;
111 this->next_node = slp_first_node;
112 slp_first_node = this;
113 SLP_TREE_SCALAR_STMTS (this) = vNULL;
114 SLP_TREE_SCALAR_OPS (this) = vNULL;
115 SLP_TREE_VEC_DEFS (this) = vNULL;
116 SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
117 SLP_TREE_CHILDREN (this) = vNULL;
118 SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
119 SLP_TREE_LANE_PERMUTATION (this) = vNULL;
120 SLP_TREE_SIMD_CLONE_INFO (this) = vNULL;
121 SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
122 SLP_TREE_CODE (this) = ERROR_MARK;
123 SLP_TREE_VECTYPE (this) = NULL_TREE;
124 SLP_TREE_REPRESENTATIVE (this) = NULL;
125 SLP_TREE_REF_COUNT (this) = 1;
126 this->failed = NULL;
127 this->max_nunits = 1;
128 this->lanes = 0;
131 /* Tear down a SLP node. */
133 _slp_tree::~_slp_tree ()
135 if (this->prev_node)
136 this->prev_node->next_node = this->next_node;
137 else
138 slp_first_node = this->next_node;
139 if (this->next_node)
140 this->next_node->prev_node = this->prev_node;
141 SLP_TREE_CHILDREN (this).release ();
142 SLP_TREE_SCALAR_STMTS (this).release ();
143 SLP_TREE_SCALAR_OPS (this).release ();
144 SLP_TREE_VEC_DEFS (this).release ();
145 SLP_TREE_LOAD_PERMUTATION (this).release ();
146 SLP_TREE_LANE_PERMUTATION (this).release ();
147 SLP_TREE_SIMD_CLONE_INFO (this).release ();
148 if (this->failed)
149 free (failed);
152 /* Push the single SSA definition in DEF to the vector of vector defs. */
154 void
155 _slp_tree::push_vec_def (gimple *def)
157 if (gphi *phi = dyn_cast <gphi *> (def))
158 vec_defs.quick_push (gimple_phi_result (phi));
159 else
161 def_operand_p defop = single_ssa_def_operand (def, SSA_OP_ALL_DEFS);
162 vec_defs.quick_push (get_def_from_ptr (defop));
166 /* Recursively free the memory allocated for the SLP tree rooted at NODE. */
168 void
169 vect_free_slp_tree (slp_tree node)
171 int i;
172 slp_tree child;
174 if (--SLP_TREE_REF_COUNT (node) != 0)
175 return;
177 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
178 if (child)
179 vect_free_slp_tree (child);
181 /* If the node defines any SLP only patterns then those patterns are no
182 longer valid and should be removed. */
183 stmt_vec_info rep_stmt_info = SLP_TREE_REPRESENTATIVE (node);
184 if (rep_stmt_info && STMT_VINFO_SLP_VECT_ONLY_PATTERN (rep_stmt_info))
186 stmt_vec_info stmt_info = vect_orig_stmt (rep_stmt_info);
187 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
188 STMT_SLP_TYPE (stmt_info) = STMT_SLP_TYPE (rep_stmt_info);
191 delete node;
194 /* Return a location suitable for dumps related to the SLP instance. */
196 dump_user_location_t
197 _slp_instance::location () const
199 if (!root_stmts.is_empty ())
200 return root_stmts[0]->stmt;
201 else
202 return SLP_TREE_SCALAR_STMTS (root)[0]->stmt;
206 /* Free the memory allocated for the SLP instance. */
208 void
209 vect_free_slp_instance (slp_instance instance)
211 vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
212 SLP_INSTANCE_LOADS (instance).release ();
213 SLP_INSTANCE_ROOT_STMTS (instance).release ();
214 SLP_INSTANCE_REMAIN_DEFS (instance).release ();
215 instance->subgraph_entries.release ();
216 instance->cost_vec.release ();
217 free (instance);
221 /* Create an SLP node for SCALAR_STMTS. */
223 slp_tree
224 vect_create_new_slp_node (unsigned nops, tree_code code)
226 slp_tree node = new _slp_tree;
227 SLP_TREE_SCALAR_STMTS (node) = vNULL;
228 SLP_TREE_CHILDREN (node).create (nops);
229 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
230 SLP_TREE_CODE (node) = code;
231 return node;
233 /* Create an SLP node for SCALAR_STMTS. */
235 static slp_tree
236 vect_create_new_slp_node (slp_tree node,
237 vec<stmt_vec_info> scalar_stmts, unsigned nops)
239 SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
240 SLP_TREE_CHILDREN (node).create (nops);
241 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
242 SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
243 SLP_TREE_LANES (node) = scalar_stmts.length ();
244 return node;
247 /* Create an SLP node for SCALAR_STMTS. */
249 static slp_tree
250 vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
252 return vect_create_new_slp_node (new _slp_tree, scalar_stmts, nops);
255 /* Create an SLP node for OPS. */
257 static slp_tree
258 vect_create_new_slp_node (slp_tree node, vec<tree> ops)
260 SLP_TREE_SCALAR_OPS (node) = ops;
261 SLP_TREE_DEF_TYPE (node) = vect_external_def;
262 SLP_TREE_LANES (node) = ops.length ();
263 return node;
266 /* Create an SLP node for OPS. */
268 static slp_tree
269 vect_create_new_slp_node (vec<tree> ops)
271 return vect_create_new_slp_node (new _slp_tree, ops);
275 /* This structure is used in creation of an SLP tree. Each instance
276 corresponds to the same operand in a group of scalar stmts in an SLP
277 node. */
278 typedef struct _slp_oprnd_info
280 /* Def-stmts for the operands. */
281 vec<stmt_vec_info> def_stmts;
282 /* Operands. */
283 vec<tree> ops;
284 /* Information about the first statement, its vector def-type, type, the
285 operand itself in case it's constant, and an indication if it's a pattern
286 stmt and gather/scatter info. */
287 tree first_op_type;
288 enum vect_def_type first_dt;
289 bool any_pattern;
290 bool first_gs_p;
291 gather_scatter_info first_gs_info;
292 } *slp_oprnd_info;
295 /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
296 operand. */
297 static vec<slp_oprnd_info>
298 vect_create_oprnd_info (int nops, int group_size)
300 int i;
301 slp_oprnd_info oprnd_info;
302 vec<slp_oprnd_info> oprnds_info;
304 oprnds_info.create (nops);
305 for (i = 0; i < nops; i++)
307 oprnd_info = XNEW (struct _slp_oprnd_info);
308 oprnd_info->def_stmts.create (group_size);
309 oprnd_info->ops.create (group_size);
310 oprnd_info->first_dt = vect_uninitialized_def;
311 oprnd_info->first_op_type = NULL_TREE;
312 oprnd_info->any_pattern = false;
313 oprnd_info->first_gs_p = false;
314 oprnds_info.quick_push (oprnd_info);
317 return oprnds_info;
321 /* Free operands info. */
323 static void
324 vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
326 int i;
327 slp_oprnd_info oprnd_info;
329 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
331 oprnd_info->def_stmts.release ();
332 oprnd_info->ops.release ();
333 XDELETE (oprnd_info);
336 oprnds_info.release ();
339 /* Return the execution frequency of NODE (so that a higher value indicates
340 a "more important" node when optimizing for speed). */
342 static sreal
343 vect_slp_node_weight (slp_tree node)
345 stmt_vec_info stmt_info = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (node));
346 basic_block bb = gimple_bb (stmt_info->stmt);
347 return bb->count.to_sreal_scale (ENTRY_BLOCK_PTR_FOR_FN (cfun)->count);
350 /* Return true if STMTS contains a pattern statement. */
352 static bool
353 vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
355 stmt_vec_info stmt_info;
356 unsigned int i;
357 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
358 if (is_pattern_stmt_p (stmt_info))
359 return true;
360 return false;
363 /* Return true when all lanes in the external or constant NODE have
364 the same value. */
366 static bool
367 vect_slp_tree_uniform_p (slp_tree node)
369 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
370 || SLP_TREE_DEF_TYPE (node) == vect_external_def);
372 /* Pre-existing vectors. */
373 if (SLP_TREE_SCALAR_OPS (node).is_empty ())
374 return false;
376 unsigned i;
377 tree op, first = NULL_TREE;
378 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
379 if (!first)
380 first = op;
381 else if (!operand_equal_p (first, op, 0))
382 return false;
384 return true;
387 /* Find the place of the data-ref in STMT_INFO in the interleaving chain
388 that starts from FIRST_STMT_INFO. Return -1 if the data-ref is not a part
389 of the chain. */
391 int
392 vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
393 stmt_vec_info first_stmt_info)
395 stmt_vec_info next_stmt_info = first_stmt_info;
396 int result = 0;
398 if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
399 return -1;
401 do
403 if (next_stmt_info == stmt_info)
404 return result;
405 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
406 if (next_stmt_info)
407 result += DR_GROUP_GAP (next_stmt_info);
409 while (next_stmt_info);
411 return -1;
414 /* Check whether it is possible to load COUNT elements of type ELT_TYPE
415 using the method implemented by duplicate_and_interleave. Return true
416 if so, returning the number of intermediate vectors in *NVECTORS_OUT
417 (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
418 (if nonnull). */
420 bool
421 can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
422 tree elt_type, unsigned int *nvectors_out,
423 tree *vector_type_out,
424 tree *permutes)
426 tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
427 if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
428 return false;
430 machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
431 poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
432 unsigned int nvectors = 1;
433 for (;;)
435 scalar_int_mode int_mode;
436 poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
437 if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
439 /* Get the natural vector type for this SLP group size. */
440 tree int_type = build_nonstandard_integer_type
441 (GET_MODE_BITSIZE (int_mode), 1);
442 tree vector_type
443 = get_vectype_for_scalar_type (vinfo, int_type, count);
444 poly_int64 half_nelts;
445 if (vector_type
446 && VECTOR_MODE_P (TYPE_MODE (vector_type))
447 && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
448 GET_MODE_SIZE (base_vector_mode))
449 && multiple_p (GET_MODE_NUNITS (TYPE_MODE (vector_type)),
450 2, &half_nelts))
452 /* Try fusing consecutive sequences of COUNT / NVECTORS elements
453 together into elements of type INT_TYPE and using the result
454 to build NVECTORS vectors. */
455 poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
456 vec_perm_builder sel1 (nelts, 2, 3);
457 vec_perm_builder sel2 (nelts, 2, 3);
459 for (unsigned int i = 0; i < 3; ++i)
461 sel1.quick_push (i);
462 sel1.quick_push (i + nelts);
463 sel2.quick_push (half_nelts + i);
464 sel2.quick_push (half_nelts + i + nelts);
466 vec_perm_indices indices1 (sel1, 2, nelts);
467 vec_perm_indices indices2 (sel2, 2, nelts);
468 machine_mode vmode = TYPE_MODE (vector_type);
469 if (can_vec_perm_const_p (vmode, vmode, indices1)
470 && can_vec_perm_const_p (vmode, vmode, indices2))
472 if (nvectors_out)
473 *nvectors_out = nvectors;
474 if (vector_type_out)
475 *vector_type_out = vector_type;
476 if (permutes)
478 permutes[0] = vect_gen_perm_mask_checked (vector_type,
479 indices1);
480 permutes[1] = vect_gen_perm_mask_checked (vector_type,
481 indices2);
483 return true;
487 if (!multiple_p (elt_bytes, 2, &elt_bytes))
488 return false;
489 nvectors *= 2;
493 /* Return true if DTA and DTB match. */
495 static bool
496 vect_def_types_match (enum vect_def_type dta, enum vect_def_type dtb)
498 return (dta == dtb
499 || ((dta == vect_external_def || dta == vect_constant_def)
500 && (dtb == vect_external_def || dtb == vect_constant_def)));
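/* Operand-index maps returned by vect_get_operand_map; the encoding of the
   entries is described in the comment before that function.  */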
503 static const int cond_expr_maps[3][5] = {
504 { 4, -1, -2, 1, 2 },
505 { 4, -2, -1, 1, 2 },
506 { 4, -1, -2, 2, 1 }
508 static const int arg1_map[] = { 1, 1 };
509 static const int arg2_map[] = { 1, 2 };
510 static const int arg1_arg4_map[] = { 2, 1, 4 };
511 static const int arg3_arg2_map[] = { 2, 3, 2 };
512 static const int op1_op0_map[] = { 2, 1, 0 };
513 static const int off_map[] = { 1, -3 };
514 static const int off_op0_map[] = { 2, -3, 0 };
515 static const int off_arg2_map[] = { 2, -3, 2 };
516 static const int off_arg3_arg2_map[] = { 3, -3, 3, 2 };
517 static const int mask_call_maps[6][7] = {
518 { 1, 1, },
519 { 2, 1, 2, },
520 { 3, 1, 2, 3, },
521 { 4, 1, 2, 3, 4, },
522 { 5, 1, 2, 3, 4, 5, },
523 { 6, 1, 2, 3, 4, 5, 6 },
526 /* For most SLP statements, there is a one-to-one mapping between
527 gimple arguments and child nodes. If that is not true for STMT,
528 return an array that contains:
530 - the number of child nodes, followed by
531 - for each child node, the index of the argument associated with that node.
532 The special index -1 is the first operand of an embedded comparison and
533 the special index -2 is the second operand of an embedded comparison.
534 The special index -3 is the offset of a gather as analyzed by
535 vect_check_gather_scatter.
537 SWAP is as for vect_get_and_check_slp_defs. */
539 static const int *
540 vect_get_operand_map (const gimple *stmt, bool gather_scatter_p = false,
541 unsigned char swap = 0)
543 if (auto assign = dyn_cast<const gassign *> (stmt))
545 if (gimple_assign_rhs_code (assign) == COND_EXPR
546 && COMPARISON_CLASS_P (gimple_assign_rhs1 (assign)))
547 return cond_expr_maps[swap];
548 if (TREE_CODE_CLASS (gimple_assign_rhs_code (assign)) == tcc_comparison
549 && swap)
550 return op1_op0_map;
551 if (gather_scatter_p)
552 return gimple_vdef (stmt) ? off_op0_map : off_map;
554 gcc_assert (!swap);
555 if (auto call = dyn_cast<const gcall *> (stmt))
557 if (gimple_call_internal_p (call))
558 switch (gimple_call_internal_fn (call))
560 case IFN_MASK_LOAD:
561 return gather_scatter_p ? off_arg2_map : arg2_map;
563 case IFN_GATHER_LOAD:
564 return arg1_map;
566 case IFN_MASK_GATHER_LOAD:
567 return arg1_arg4_map;
569 case IFN_MASK_STORE:
570 return gather_scatter_p ? off_arg3_arg2_map : arg3_arg2_map;
572 case IFN_MASK_CALL:
574 unsigned nargs = gimple_call_num_args (call);
575 if (nargs >= 2 && nargs <= 7)
576 return mask_call_maps[nargs-2];
577 else
578 return nullptr;
581 default:
582 break;
585 return nullptr;
588 /* Return the SLP node child index for operand OP of STMT. */
590 int
591 vect_slp_child_index_for_operand (const gimple *stmt, int op)
593 const int *opmap = vect_get_operand_map (stmt);
594 if (!opmap)
595 return op;
596 for (int i = 1; i < 1 + opmap[0]; ++i)
597 if (opmap[i] == op)
598 return i - 1;
599 gcc_unreachable ();
602 /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
603 they are of a valid type and that they match the defs of the first stmt of
604 the SLP group (stored in OPRNDS_INFO). This function tries to match stmts
605 by swapping operands of STMTS[STMT_NUM] when possible. Non-zero SWAP
606 indicates swap is required for cond_expr stmts. Specifically, SWAP
607 is 1 if STMT is cond and operands of comparison need to be swapped;
608 SWAP is 2 if STMT is cond and code of comparison needs to be inverted.
610 If there was a fatal error return -1; if the error could be corrected by
611 swapping operands of father node of this one, return 1; if everything is
612 ok return 0. */
613 static int
614 vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap,
615 bool *skip_args,
616 vec<stmt_vec_info> stmts, unsigned stmt_num,
617 vec<slp_oprnd_info> *oprnds_info)
619 stmt_vec_info stmt_info = stmts[stmt_num];
620 tree oprnd;
621 unsigned int i, number_of_oprnds;
622 enum vect_def_type dt = vect_uninitialized_def;
623 slp_oprnd_info oprnd_info;
624 gather_scatter_info gs_info;
625 unsigned int gs_op = -1u;
626 unsigned int commutative_op = -1U;
627 bool first = stmt_num == 0;
629 if (!is_a<gcall *> (stmt_info->stmt)
630 && !is_a<gassign *> (stmt_info->stmt)
631 && !is_a<gphi *> (stmt_info->stmt))
632 return -1;
634 number_of_oprnds = gimple_num_args (stmt_info->stmt);
635 const int *map
636 = vect_get_operand_map (stmt_info->stmt,
637 STMT_VINFO_GATHER_SCATTER_P (stmt_info), swap);
638 if (map)
639 number_of_oprnds = *map++;
640 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
642 if (gimple_call_internal_p (stmt))
644 internal_fn ifn = gimple_call_internal_fn (stmt);
645 commutative_op = first_commutative_argument (ifn);
648 else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
650 if (commutative_tree_code (gimple_assign_rhs_code (stmt)))
651 commutative_op = 0;
654 bool swapped = (swap != 0);
655 bool backedge = false;
656 enum vect_def_type *dts = XALLOCAVEC (enum vect_def_type, number_of_oprnds);
657 for (i = 0; i < number_of_oprnds; i++)
659 oprnd_info = (*oprnds_info)[i];
660 int opno = map ? map[i] : int (i);
661 if (opno == -3)
663 gcc_assert (STMT_VINFO_GATHER_SCATTER_P (stmt_info));
664 if (!is_a <loop_vec_info> (vinfo)
665 || !vect_check_gather_scatter (stmt_info,
666 as_a <loop_vec_info> (vinfo),
667 first ? &oprnd_info->first_gs_info
668 : &gs_info))
669 return -1;
671 if (first)
673 oprnd_info->first_gs_p = true;
674 oprnd = oprnd_info->first_gs_info.offset;
676 else
678 gs_op = i;
679 oprnd = gs_info.offset;
682 else if (opno < 0)
683 oprnd = TREE_OPERAND (gimple_arg (stmt_info->stmt, 0), -1 - opno);
684 else
686 oprnd = gimple_arg (stmt_info->stmt, opno);
687 if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
689 edge e = gimple_phi_arg_edge (stmt, opno);
690 backedge = (is_a <bb_vec_info> (vinfo)
691 ? e->flags & EDGE_DFS_BACK
692 : dominated_by_p (CDI_DOMINATORS, e->src,
693 gimple_bb (stmt_info->stmt)));
696 if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
697 oprnd = TREE_OPERAND (oprnd, 0);
699 stmt_vec_info def_stmt_info;
700 if (!vect_is_simple_use (oprnd, vinfo, &dts[i], &def_stmt_info))
702 if (dump_enabled_p ())
703 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
704 "Build SLP failed: can't analyze def for %T\n",
705 oprnd);
707 return -1;
710 if (skip_args[i])
712 oprnd_info->def_stmts.quick_push (NULL);
713 oprnd_info->ops.quick_push (NULL_TREE);
714 oprnd_info->first_dt = vect_uninitialized_def;
715 continue;
718 oprnd_info->def_stmts.quick_push (def_stmt_info);
719 oprnd_info->ops.quick_push (oprnd);
721 if (def_stmt_info
722 && is_pattern_stmt_p (def_stmt_info))
724 if (STMT_VINFO_RELATED_STMT (vect_orig_stmt (def_stmt_info))
725 != def_stmt_info)
726 oprnd_info->any_pattern = true;
727 else
728 /* If we promote this to external, use the original stmt def. */
729 oprnd_info->ops.last ()
730 = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
733 /* If there's an extern def on a backedge make sure we can
734 code-generate at the region start.
735 ??? This is another case that could be fixed by adjusting
736 how we split the function but at the moment we'd have conflicting
737 goals there. */
738 if (backedge
739 && dts[i] == vect_external_def
740 && is_a <bb_vec_info> (vinfo)
741 && TREE_CODE (oprnd) == SSA_NAME
742 && !SSA_NAME_IS_DEFAULT_DEF (oprnd)
743 && !dominated_by_p (CDI_DOMINATORS,
744 as_a <bb_vec_info> (vinfo)->bbs[0],
745 gimple_bb (SSA_NAME_DEF_STMT (oprnd))))
747 if (dump_enabled_p ())
748 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
749 "Build SLP failed: extern def %T only defined "
750 "on backedge\n", oprnd);
751 return -1;
754 if (first)
756 tree type = TREE_TYPE (oprnd);
757 dt = dts[i];
758 if ((dt == vect_constant_def
759 || dt == vect_external_def)
760 && !GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
761 && (TREE_CODE (type) == BOOLEAN_TYPE
762 || !can_duplicate_and_interleave_p (vinfo, stmts.length (),
763 type)))
765 if (dump_enabled_p ())
766 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
767 "Build SLP failed: invalid type of def "
768 "for variable-length SLP %T\n", oprnd);
769 return -1;
772 /* For the swapping logic below force vect_reduction_def
773 for the reduction op in a SLP reduction group. */
774 if (!STMT_VINFO_DATA_REF (stmt_info)
775 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
776 && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
777 && def_stmt_info)
778 dts[i] = dt = vect_reduction_def;
780 /* Check the types of the definition. */
781 switch (dt)
783 case vect_external_def:
784 case vect_constant_def:
785 case vect_internal_def:
786 case vect_reduction_def:
787 case vect_induction_def:
788 case vect_nested_cycle:
789 case vect_first_order_recurrence:
790 break;
792 default:
793 /* FORNOW: Not supported. */
794 if (dump_enabled_p ())
795 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
796 "Build SLP failed: illegal type of def %T\n",
797 oprnd);
798 return -1;
801 oprnd_info->first_dt = dt;
802 oprnd_info->first_op_type = type;
805 if (first)
806 return 0;
808 /* Now match the operand definition types to that of the first stmt. */
809 for (i = 0; i < number_of_oprnds;)
811 if (skip_args[i])
813 ++i;
814 continue;
817 oprnd_info = (*oprnds_info)[i];
818 dt = dts[i];
819 stmt_vec_info def_stmt_info = oprnd_info->def_stmts[stmt_num];
820 oprnd = oprnd_info->ops[stmt_num];
821 tree type = TREE_TYPE (oprnd);
823 if (!types_compatible_p (oprnd_info->first_op_type, type))
825 if (dump_enabled_p ())
826 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
827 "Build SLP failed: different operand types\n");
828 return 1;
831 if ((gs_op == i) != oprnd_info->first_gs_p)
833 if (dump_enabled_p ())
834 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
835 "Build SLP failed: mixed gather and non-gather\n");
836 return 1;
838 else if (gs_op == i)
840 if (!operand_equal_p (oprnd_info->first_gs_info.base,
841 gs_info.base))
843 if (dump_enabled_p ())
844 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
845 "Build SLP failed: different gather base\n");
846 return 1;
848 if (oprnd_info->first_gs_info.scale != gs_info.scale)
850 if (dump_enabled_p ())
851 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
852 "Build SLP failed: different gather scale\n");
853 return 1;
857 /* Not first stmt of the group, check that the def-stmt/s match
858 the def-stmt/s of the first stmt. Allow different definition
859 types for reduction chains: the first stmt must be a
860 vect_reduction_def (a phi node), and the rest
861 end in the reduction chain. */
862 if ((!vect_def_types_match (oprnd_info->first_dt, dt)
863 && !(oprnd_info->first_dt == vect_reduction_def
864 && !STMT_VINFO_DATA_REF (stmt_info)
865 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
866 && def_stmt_info
867 && !STMT_VINFO_DATA_REF (def_stmt_info)
868 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
869 == REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
870 || (!STMT_VINFO_DATA_REF (stmt_info)
871 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
872 && ((!def_stmt_info
873 || STMT_VINFO_DATA_REF (def_stmt_info)
874 || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
875 != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
876 != (oprnd_info->first_dt != vect_reduction_def))))
878 /* Try swapping operands if we got a mismatch. For BB
879 vectorization only in case it will clearly improve things. */
880 if (i == commutative_op && !swapped
881 && (!is_a <bb_vec_info> (vinfo)
882 || (!vect_def_types_match ((*oprnds_info)[i+1]->first_dt,
883 dts[i+1])
884 && (vect_def_types_match (oprnd_info->first_dt, dts[i+1])
885 || vect_def_types_match
886 ((*oprnds_info)[i+1]->first_dt, dts[i])))))
888 if (dump_enabled_p ())
889 dump_printf_loc (MSG_NOTE, vect_location,
890 "trying swapped operands\n");
891 std::swap (dts[i], dts[i+1]);
892 std::swap ((*oprnds_info)[i]->def_stmts[stmt_num],
893 (*oprnds_info)[i+1]->def_stmts[stmt_num]);
894 std::swap ((*oprnds_info)[i]->ops[stmt_num],
895 (*oprnds_info)[i+1]->ops[stmt_num]);
896 swapped = true;
897 continue;
900 if (is_a <bb_vec_info> (vinfo)
901 && !oprnd_info->any_pattern)
903 /* Now for commutative ops we should see whether we can
904 make the other operand match. */
905 if (dump_enabled_p ())
906 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
907 "treating operand as external\n");
908 oprnd_info->first_dt = dt = vect_external_def;
910 else
912 if (dump_enabled_p ())
913 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
914 "Build SLP failed: different types\n");
915 return 1;
919 /* Make sure to demote the overall operand to external. */
920 if (dt == vect_external_def)
921 oprnd_info->first_dt = vect_external_def;
922 /* For a SLP reduction chain we want to duplicate the reduction to
923 each of the chain members. That gets us a sane SLP graph (still
924 the stmts are not 100% correct wrt the initial values). */
925 else if ((dt == vect_internal_def
926 || dt == vect_reduction_def)
927 && oprnd_info->first_dt == vect_reduction_def
928 && !STMT_VINFO_DATA_REF (stmt_info)
929 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
930 && !STMT_VINFO_DATA_REF (def_stmt_info)
931 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
932 == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
934 oprnd_info->def_stmts[stmt_num] = oprnd_info->def_stmts[0];
935 oprnd_info->ops[stmt_num] = oprnd_info->ops[0];
938 ++i;
941 /* Swap operands. */
942 if (swapped)
944 if (dump_enabled_p ())
945 dump_printf_loc (MSG_NOTE, vect_location,
946 "swapped operands to match def types in %G",
947 stmt_info->stmt);
950 return 0;
953 /* Return true if call statements CALL1 and CALL2 are similar enough
954 to be combined into the same SLP group. */
956 bool
957 compatible_calls_p (gcall *call1, gcall *call2)
959 unsigned int nargs = gimple_call_num_args (call1);
960 if (nargs != gimple_call_num_args (call2))
961 return false;
963 if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
964 return false;
966 if (gimple_call_internal_p (call1))
968 if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
969 TREE_TYPE (gimple_call_lhs (call2))))
970 return false;
971 for (unsigned int i = 0; i < nargs; ++i)
972 if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
973 TREE_TYPE (gimple_call_arg (call2, i))))
974 return false;
976 else
978 if (!operand_equal_p (gimple_call_fn (call1),
979 gimple_call_fn (call2), 0))
980 return false;
982 if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
983 return false;
986 /* Check that any unvectorized arguments are equal. */
987 if (const int *map = vect_get_operand_map (call1))
989 unsigned int nkept = *map++;
990 unsigned int mapi = 0;
991 for (unsigned int i = 0; i < nargs; ++i)
992 if (mapi < nkept && map[mapi] == int (i))
993 mapi += 1;
994 else if (!operand_equal_p (gimple_call_arg (call1, i),
995 gimple_call_arg (call2, i)))
996 return false;
999 return true;
1002 /* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
1003 caller's attempt to find the vector type in STMT_INFO with the narrowest
1004 element type. Return true if VECTYPE is nonnull and if it is valid
1005 for STMT_INFO. When returning true, update MAX_NUNITS to reflect the
1006 number of units in VECTYPE. GROUP_SIZE and MAX_NUNITS are as for
1007 vect_build_slp_tree. */
1009 static bool
1010 vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
1011 unsigned int group_size,
1012 tree vectype, poly_uint64 *max_nunits)
1014 if (!vectype)
1016 if (dump_enabled_p ())
1017 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1018 "Build SLP failed: unsupported data-type in %G\n",
1019 stmt_info->stmt);
1020 /* Fatal mismatch. */
1021 return false;
1024 /* If populating the vector type requires unrolling then fail
1025 before adjusting *max_nunits for basic-block vectorization. */
1026 if (is_a <bb_vec_info> (vinfo)
1027 && !multiple_p (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
1029 if (dump_enabled_p ())
1030 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1031 "Build SLP failed: unrolling required "
1032 "in basic block SLP\n");
1033 /* Fatal mismatch. */
1034 return false;
1037 /* In case of multiple types we need to detect the smallest type. */
1038 vect_update_max_nunits (max_nunits, vectype);
1039 return true;
1042 /* Check whether the scalar stmts STMTS are isomorphic, require data
1043 permutation or are of unsupported types of operation. Return true
1044 if they can form an SLP group, otherwise return false and indicate in *MATCHES
1045 which stmts are not isomorphic to the first one. If MATCHES[0]
1046 is false then this indicates the comparison could not be
1047 carried out or the stmts will never be vectorized by SLP.
1049 Note COND_EXPR is possibly isomorphic to another one after swapping its
1050 operands. Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
1051 the first stmt by swapping the two operands of comparison; set SWAP[i]
1052 to 2 if stmt I is isomorphic to the first stmt by inverting the code
1053 of comparison. Take A1 >= B1 ? X1 : Y1 as an example; it can be swapped
1054 to (B1 <= A1 ? X1 : Y1); or be inverted to (A1 < B1) ? Y1 : X1. */
1056 static bool
1057 vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
1058 vec<stmt_vec_info> stmts, unsigned int group_size,
1059 poly_uint64 *max_nunits, bool *matches,
1060 bool *two_operators, tree *node_vectype)
1062 unsigned int i;
1063 stmt_vec_info first_stmt_info = stmts[0];
1064 code_helper first_stmt_code = ERROR_MARK;
1065 code_helper alt_stmt_code = ERROR_MARK;
1066 code_helper rhs_code = ERROR_MARK;
1067 code_helper first_cond_code = ERROR_MARK;
1068 tree lhs;
1069 bool need_same_oprnds = false;
1070 tree vectype = NULL_TREE, first_op1 = NULL_TREE;
1071 stmt_vec_info first_load = NULL, prev_first_load = NULL;
1072 bool first_stmt_ldst_p = false, ldst_p = false;
1073 bool first_stmt_phi_p = false, phi_p = false;
1074 bool maybe_soft_fail = false;
1075 tree soft_fail_nunits_vectype = NULL_TREE;
1077 /* For every stmt in NODE find its def stmt/s. */
1078 stmt_vec_info stmt_info;
1079 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
1081 gimple *stmt = stmt_info->stmt;
1082 swap[i] = 0;
1083 matches[i] = false;
1085 if (dump_enabled_p ())
1086 dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);
1088 /* Fail to vectorize statements marked as unvectorizable, throw
1089 or are volatile. */
1090 if (!STMT_VINFO_VECTORIZABLE (stmt_info)
1091 || stmt_can_throw_internal (cfun, stmt)
1092 || gimple_has_volatile_ops (stmt))
1094 if (dump_enabled_p ())
1095 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1096 "Build SLP failed: unvectorizable statement %G",
1097 stmt);
1098 /* ??? For BB vectorization we want to commutate operands in a way
1099 to shuffle all unvectorizable defs into one operand and have
1100 the other still vectorized. The following doesn't reliably
1101 work for this though, but it's the easiest we can do here. */
1102 if (is_a <bb_vec_info> (vinfo) && i != 0)
1103 continue;
1104 /* Fatal mismatch. */
1105 matches[0] = false;
1106 return false;
1109 gcall *call_stmt = dyn_cast <gcall *> (stmt);
1110 lhs = gimple_get_lhs (stmt);
1111 if (lhs == NULL_TREE
1112 && (!call_stmt
1113 || !gimple_call_internal_p (stmt)
1114 || !internal_store_fn_p (gimple_call_internal_fn (stmt))))
1116 if (dump_enabled_p ())
1117 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1118 "Build SLP failed: not GIMPLE_ASSIGN nor "
1119 "GIMPLE_CALL %G", stmt);
1120 if (is_a <bb_vec_info> (vinfo) && i != 0)
1121 continue;
1122 /* Fatal mismatch. */
1123 matches[0] = false;
1124 return false;
1127 tree nunits_vectype;
1128 if (!vect_get_vector_types_for_stmt (vinfo, stmt_info, &vectype,
1129 &nunits_vectype, group_size))
1131 if (is_a <bb_vec_info> (vinfo) && i != 0)
1132 continue;
1133 /* Fatal mismatch. */
1134 matches[0] = false;
1135 return false;
1137 /* Record nunits required but continue analysis, producing matches[]
1138 as if nunits was not an issue. This allows splitting of groups
1139 to happen. */
1140 if (nunits_vectype
1141 && !vect_record_max_nunits (vinfo, stmt_info, group_size,
1142 nunits_vectype, max_nunits))
1144 gcc_assert (is_a <bb_vec_info> (vinfo));
1145 maybe_soft_fail = true;
1146 soft_fail_nunits_vectype = nunits_vectype;
1149 gcc_assert (vectype);
1151 if (call_stmt)
1153 combined_fn cfn = gimple_call_combined_fn (call_stmt);
1154 if (cfn != CFN_LAST && cfn != CFN_MASK_CALL)
1155 rhs_code = cfn;
1156 else
1157 rhs_code = CALL_EXPR;
1159 if (cfn == CFN_MASK_LOAD
1160 || cfn == CFN_GATHER_LOAD
1161 || cfn == CFN_MASK_GATHER_LOAD)
1162 ldst_p = true;
1163 else if (cfn == CFN_MASK_STORE)
1165 ldst_p = true;
1166 rhs_code = CFN_MASK_STORE;
1168 else if ((cfn != CFN_LAST
1169 && cfn != CFN_MASK_CALL
1170 && internal_fn_p (cfn)
1171 && !vectorizable_internal_fn_p (as_internal_fn (cfn)))
1172 || gimple_call_tail_p (call_stmt)
1173 || gimple_call_noreturn_p (call_stmt)
1174 || gimple_call_chain (call_stmt))
1176 if (dump_enabled_p ())
1177 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1178 "Build SLP failed: unsupported call type %G",
1179 (gimple *) call_stmt);
1180 if (is_a <bb_vec_info> (vinfo) && i != 0)
1181 continue;
1182 /* Fatal mismatch. */
1183 matches[0] = false;
1184 return false;
1187 else if (gimple_code (stmt) == GIMPLE_PHI)
1189 rhs_code = ERROR_MARK;
1190 phi_p = true;
1192 else
1194 rhs_code = gimple_assign_rhs_code (stmt);
1195 ldst_p = STMT_VINFO_DATA_REF (stmt_info) != nullptr;
1198 /* Check the operation. */
1199 if (i == 0)
1201 *node_vectype = vectype;
1202 first_stmt_code = rhs_code;
1203 first_stmt_ldst_p = ldst_p;
1204 first_stmt_phi_p = phi_p;
1206 /* Shift arguments should be equal in all the packed stmts for a
1207 vector shift with scalar shift operand. */
1208 if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
1209 || rhs_code == LROTATE_EXPR
1210 || rhs_code == RROTATE_EXPR)
1212 /* First see if we have a vector/vector shift. */
1213 if (!directly_supported_p (rhs_code, vectype, optab_vector))
1215 /* No vector/vector shift, try for a vector/scalar shift. */
1216 if (!directly_supported_p (rhs_code, vectype, optab_scalar))
1218 if (dump_enabled_p ())
1219 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1220 "Build SLP failed: "
1221 "op not supported by target.\n");
1222 if (is_a <bb_vec_info> (vinfo) && i != 0)
1223 continue;
1224 /* Fatal mismatch. */
1225 matches[0] = false;
1226 return false;
1228 need_same_oprnds = true;
1229 first_op1 = gimple_assign_rhs2 (stmt);
1232 else if (rhs_code == WIDEN_LSHIFT_EXPR)
1234 need_same_oprnds = true;
1235 first_op1 = gimple_assign_rhs2 (stmt);
1237 else if (!ldst_p
1238 && rhs_code == BIT_FIELD_REF)
1240 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
1241 if (!is_a <bb_vec_info> (vinfo)
1242 || TREE_CODE (vec) != SSA_NAME
1243 /* When the element types are not compatible we pun the
1244 source to the target vectype which requires equal size. */
1245 || ((!VECTOR_TYPE_P (TREE_TYPE (vec))
1246 || !types_compatible_p (TREE_TYPE (vectype),
1247 TREE_TYPE (TREE_TYPE (vec))))
1248 && !operand_equal_p (TYPE_SIZE (vectype),
1249 TYPE_SIZE (TREE_TYPE (vec)))))
1251 if (dump_enabled_p ())
1252 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1253 "Build SLP failed: "
1254 "BIT_FIELD_REF not supported\n");
1255 /* Fatal mismatch. */
1256 matches[0] = false;
1257 return false;
1260 else if (rhs_code == CFN_DIV_POW2)
1262 need_same_oprnds = true;
1263 first_op1 = gimple_call_arg (call_stmt, 1);
1266 else
1268 if (first_stmt_code != rhs_code
1269 && alt_stmt_code == ERROR_MARK)
1270 alt_stmt_code = rhs_code;
1271 if ((first_stmt_code != rhs_code
1272 && (first_stmt_code != IMAGPART_EXPR
1273 || rhs_code != REALPART_EXPR)
1274 && (first_stmt_code != REALPART_EXPR
1275 || rhs_code != IMAGPART_EXPR)
1276 /* Handle mismatches in plus/minus by computing both
1277 and merging the results. */
1278 && !((first_stmt_code == PLUS_EXPR
1279 || first_stmt_code == MINUS_EXPR)
1280 && (alt_stmt_code == PLUS_EXPR
1281 || alt_stmt_code == MINUS_EXPR)
1282 && rhs_code == alt_stmt_code)
1283 && !(first_stmt_code.is_tree_code ()
1284 && rhs_code.is_tree_code ()
1285 && (TREE_CODE_CLASS (tree_code (first_stmt_code))
1286 == tcc_comparison)
1287 && (swap_tree_comparison (tree_code (first_stmt_code))
1288 == tree_code (rhs_code)))
1289 && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
1290 && (first_stmt_code == ARRAY_REF
1291 || first_stmt_code == BIT_FIELD_REF
1292 || first_stmt_code == INDIRECT_REF
1293 || first_stmt_code == COMPONENT_REF
1294 || first_stmt_code == MEM_REF)
1295 && (rhs_code == ARRAY_REF
1296 || rhs_code == BIT_FIELD_REF
1297 || rhs_code == INDIRECT_REF
1298 || rhs_code == COMPONENT_REF
1299 || rhs_code == MEM_REF)))
1300 || (ldst_p
1301 && (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1302 != STMT_VINFO_GATHER_SCATTER_P (first_stmt_info)))
1303 || first_stmt_ldst_p != ldst_p
1304 || first_stmt_phi_p != phi_p)
1306 if (dump_enabled_p ())
1308 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1309 "Build SLP failed: different operation "
1310 "in stmt %G", stmt);
1311 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1312 "original stmt %G", first_stmt_info->stmt);
1314 /* Mismatch. */
1315 continue;
1318 if (!ldst_p
1319 && first_stmt_code == BIT_FIELD_REF
1320 && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
1321 != TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0)))
1323 if (dump_enabled_p ())
1324 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1325 "Build SLP failed: different BIT_FIELD_REF "
1326 "arguments in %G", stmt);
1327 /* Mismatch. */
1328 continue;
1331 if (call_stmt
1332 && first_stmt_code != CFN_MASK_LOAD
1333 && first_stmt_code != CFN_MASK_STORE)
1335 if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
1336 call_stmt))
1338 if (dump_enabled_p ())
1339 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1340 "Build SLP failed: different calls in %G",
1341 stmt);
1342 /* Mismatch. */
1343 continue;
1347 if ((phi_p || gimple_could_trap_p (stmt_info->stmt))
1348 && (gimple_bb (first_stmt_info->stmt)
1349 != gimple_bb (stmt_info->stmt)))
1351 if (dump_enabled_p ())
1352 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1353 "Build SLP failed: different BB for PHI "
1354 "or possibly trapping operation in %G", stmt);
1355 /* Mismatch. */
1356 continue;
1359 if (need_same_oprnds)
1361 tree other_op1 = gimple_arg (stmt, 1);
1362 if (!operand_equal_p (first_op1, other_op1, 0))
1364 if (dump_enabled_p ())
1365 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1366 "Build SLP failed: different shift "
1367 "arguments in %G", stmt);
1368 /* Mismatch. */
1369 continue;
1373 if (!types_compatible_p (vectype, *node_vectype))
1375 if (dump_enabled_p ())
1376 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1377 "Build SLP failed: different vector type "
1378 "in %G", stmt);
1379 /* Mismatch. */
1380 continue;
1384 /* Grouped store or load. */
1385 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1387 gcc_assert (ldst_p);
1388 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)))
1390 /* Store. */
1391 gcc_assert (rhs_code == CFN_MASK_STORE
1392 || REFERENCE_CLASS_P (lhs)
1393 || DECL_P (lhs));
1395 else
1397 /* Load. */
1398 first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
1399 if (prev_first_load)
1401 /* Check that there are no loads from different interleaving
1402 chains in the same node. */
1403 if (prev_first_load != first_load)
1405 if (dump_enabled_p ())
1406 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
1407 vect_location,
1408 "Build SLP failed: different "
1409 "interleaving chains in one node %G",
1410 stmt);
1411 /* Mismatch. */
1412 continue;
1415 else
1416 prev_first_load = first_load;
1419 /* Non-grouped store or load. */
1420 else if (ldst_p)
1422 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))
1423 && rhs_code != CFN_GATHER_LOAD
1424 && rhs_code != CFN_MASK_GATHER_LOAD
1425 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1426 /* Not grouped loads are handled as externals for BB
1427 vectorization. For loop vectorization we can handle
1428 splats the same way we handle single element interleaving. */
1429 && (is_a <bb_vec_info> (vinfo)
1430 || stmt_info != first_stmt_info))
1432 /* Not grouped load. */
1433 if (dump_enabled_p ())
1434 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1435 "Build SLP failed: not grouped load %G", stmt);
1437 if (i != 0)
1438 continue;
1439 /* Fatal mismatch. */
1440 matches[0] = false;
1441 return false;
1444 /* Not memory operation. */
1445 else
1447 if (!phi_p
1448 && rhs_code.is_tree_code ()
1449 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_binary
1450 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_unary
1451 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_expression
1452 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_comparison
1453 && rhs_code != VIEW_CONVERT_EXPR
1454 && rhs_code != CALL_EXPR
1455 && rhs_code != BIT_FIELD_REF)
1457 if (dump_enabled_p ())
1458 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1459 "Build SLP failed: operation unsupported %G",
1460 stmt);
1461 if (is_a <bb_vec_info> (vinfo) && i != 0)
1462 continue;
1463 /* Fatal mismatch. */
1464 matches[0] = false;
1465 return false;
1468 if (rhs_code == COND_EXPR)
1470 tree cond_expr = gimple_assign_rhs1 (stmt);
1471 enum tree_code cond_code = TREE_CODE (cond_expr);
1472 enum tree_code swap_code = ERROR_MARK;
1473 enum tree_code invert_code = ERROR_MARK;
1475 if (i == 0)
1476 first_cond_code = TREE_CODE (cond_expr);
1477 else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1479 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
1480 swap_code = swap_tree_comparison (cond_code);
1481 invert_code = invert_tree_comparison (cond_code, honor_nans);
1484 if (first_cond_code == cond_code)
1486 /* Isomorphic can be achieved by swapping. */
1487 else if (first_cond_code == swap_code)
1488 swap[i] = 1;
1489 /* Isomorphic can be achieved by inverting. */
1490 else if (first_cond_code == invert_code)
1491 swap[i] = 2;
1492 else
1494 if (dump_enabled_p ())
1495 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1496 "Build SLP failed: different"
1497 " operation %G", stmt);
1498 /* Mismatch. */
1499 continue;
1503 if (rhs_code.is_tree_code ()
1504 && TREE_CODE_CLASS ((tree_code)rhs_code) == tcc_comparison
1505 && (swap_tree_comparison ((tree_code)first_stmt_code)
1506 == (tree_code)rhs_code))
1507 swap[i] = 1;
1510 matches[i] = true;
1513 for (i = 0; i < group_size; ++i)
1514 if (!matches[i])
1515 return false;
1517 /* If we allowed a two-operation SLP node verify the target can cope
1518 with the permute we are going to use. */
1519 if (alt_stmt_code != ERROR_MARK
1520 && (!alt_stmt_code.is_tree_code ()
1521 || (TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_reference
1522 && TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_comparison)))
1524 *two_operators = true;
1527 if (maybe_soft_fail)
1529 unsigned HOST_WIDE_INT const_nunits;
1530 if (!TYPE_VECTOR_SUBPARTS
1531 (soft_fail_nunits_vectype).is_constant (&const_nunits)
1532 || const_nunits > group_size)
1533 matches[0] = false;
1534 else
1536 /* With constant vector elements simulate a mismatch at the
1537 point we need to split. */
1538 unsigned tail = group_size & (const_nunits - 1);
1539 memset (&matches[group_size - tail], 0, sizeof (bool) * tail);
1541 return false;
1544 return true;
1547 /* Traits for the hash_set to record failed SLP builds for a stmt set.
1548 Note we never remove apart from at destruction time so we do not
1549 need a special value for deleted that differs from empty. */
1550 struct bst_traits
1552 typedef vec <stmt_vec_info> value_type;
1553 typedef vec <stmt_vec_info> compare_type;
1554 static inline hashval_t hash (value_type);
1555 static inline bool equal (value_type existing, value_type candidate);
1556 static inline bool is_empty (value_type x) { return !x.exists (); }
1557 static inline bool is_deleted (value_type x) { return !x.exists (); }
1558 static const bool empty_zero_p = true;
1559 static inline void mark_empty (value_type &x) { x.release (); }
1560 static inline void mark_deleted (value_type &x) { x.release (); }
1561 static inline void remove (value_type &x) { x.release (); }
1563 inline hashval_t
1564 bst_traits::hash (value_type x)
1566 inchash::hash h;
1567 for (unsigned i = 0; i < x.length (); ++i)
1568 h.add_int (gimple_uid (x[i]->stmt));
1569 return h.end ();
1571 inline bool
1572 bst_traits::equal (value_type existing, value_type candidate)
1574 if (existing.length () != candidate.length ())
1575 return false;
1576 for (unsigned i = 0; i < existing.length (); ++i)
1577 if (existing[i] != candidate[i])
1578 return false;
1579 return true;
1582 /* ??? This was std::pair<std::pair<tree_code, vect_def_type>, tree>
1583 but then vec::insert does memmove and that's not compatible with
1584 std::pair. */
1585 struct chain_op_t
1587 chain_op_t (tree_code code_, vect_def_type dt_, tree op_)
1588 : code (code_), dt (dt_), op (op_) {}
1589 tree_code code;
1590 vect_def_type dt;
1591 tree op;
1594 /* Comparator for sorting associatable chains. */
1596 static int
1597 dt_sort_cmp (const void *op1_, const void *op2_, void *)
1599 auto *op1 = (const chain_op_t *) op1_;
1600 auto *op2 = (const chain_op_t *) op2_;
1601 if (op1->dt != op2->dt)
1602 return (int)op1->dt - (int)op2->dt;
1603 return (int)op1->code - (int)op2->code;
1606 /* Linearize the associatable expression chain at START with the
1607 associatable operation CODE (where PLUS_EXPR also allows MINUS_EXPR),
1608 filling CHAIN with the result and using WORKLIST as intermediate storage.
1609 CODE_STMT and ALT_CODE_STMT are filled with the first stmt using CODE
1610 or MINUS_EXPR. *CHAIN_STMTS if not NULL is filled with all computation
1611 stmts, starting with START. */
1613 static void
1614 vect_slp_linearize_chain (vec_info *vinfo,
1615 vec<std::pair<tree_code, gimple *> > &worklist,
1616 vec<chain_op_t> &chain,
1617 enum tree_code code, gimple *start,
1618 gimple *&code_stmt, gimple *&alt_code_stmt,
1619 vec<gimple *> *chain_stmts)
1621 /* For each lane linearize the addition/subtraction (or other
1622 uniform associatable operation) expression tree. */
1623 worklist.safe_push (std::make_pair (code, start));
1624 while (!worklist.is_empty ())
1626 auto entry = worklist.pop ();
1627 gassign *stmt = as_a <gassign *> (entry.second);
1628 enum tree_code in_code = entry.first;
1629 enum tree_code this_code = gimple_assign_rhs_code (stmt);
1630 /* Pick some stmts suitable for SLP_TREE_REPRESENTATIVE. */
1631 if (!code_stmt
1632 && gimple_assign_rhs_code (stmt) == code)
1633 code_stmt = stmt;
1634 else if (!alt_code_stmt
1635 && gimple_assign_rhs_code (stmt) == MINUS_EXPR)
1636 alt_code_stmt = stmt;
1637 if (chain_stmts)
1638 chain_stmts->safe_push (stmt);
1639 for (unsigned opnum = 1; opnum <= 2; ++opnum)
1641 tree op = gimple_op (stmt, opnum);
1642 vect_def_type dt;
1643 stmt_vec_info def_stmt_info;
1644 bool res = vect_is_simple_use (op, vinfo, &dt, &def_stmt_info);
1645 gcc_assert (res);
1646 if (dt == vect_internal_def
1647 && is_pattern_stmt_p (def_stmt_info))
1648 op = gimple_get_lhs (def_stmt_info->stmt);
1649 gimple *use_stmt;
1650 use_operand_p use_p;
1651 if (dt == vect_internal_def
1652 && single_imm_use (op, &use_p, &use_stmt)
1653 && is_gimple_assign (def_stmt_info->stmt)
1654 && (gimple_assign_rhs_code (def_stmt_info->stmt) == code
1655 || (code == PLUS_EXPR
1656 && (gimple_assign_rhs_code (def_stmt_info->stmt)
1657 == MINUS_EXPR))))
1659 tree_code op_def_code = this_code;
1660 if (op_def_code == MINUS_EXPR && opnum == 1)
1661 op_def_code = PLUS_EXPR;
1662 if (in_code == MINUS_EXPR)
1663 op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
1664 worklist.safe_push (std::make_pair (op_def_code,
1665 def_stmt_info->stmt));
1667 else
1669 tree_code op_def_code = this_code;
1670 if (op_def_code == MINUS_EXPR && opnum == 1)
1671 op_def_code = PLUS_EXPR;
1672 if (in_code == MINUS_EXPR)
1673 op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
1674 chain.safe_push (chain_op_t (op_def_code, dt, op));
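/* Map from a set of scalar stmts to the SLP node built for them (possibly
   recording a failed discovery attempt), used to share sub-trees and avoid
   re-analysis.  */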
1680 typedef hash_map <vec <stmt_vec_info>, slp_tree,
1681 simple_hashmap_traits <bst_traits, slp_tree> >
1682 scalar_stmts_to_slp_tree_map_t;
1684 static slp_tree
1685 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1686 vec<stmt_vec_info> stmts, unsigned int group_size,
1687 poly_uint64 *max_nunits,
1688 bool *matches, unsigned *limit, unsigned *tree_size,
1689 scalar_stmts_to_slp_tree_map_t *bst_map);
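/* Build an SLP node for STMTS, re-using a previously built node recorded in
   BST_MAP when possible.  Returns the node, or NULL on failure in which case
   MATCHES is filled in as for vect_build_slp_tree_2; updates MAX_NUNITS.  */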
1691 static slp_tree
1692 vect_build_slp_tree (vec_info *vinfo,
1693 vec<stmt_vec_info> stmts, unsigned int group_size,
1694 poly_uint64 *max_nunits,
1695 bool *matches, unsigned *limit, unsigned *tree_size,
1696 scalar_stmts_to_slp_tree_map_t *bst_map)
1698 if (slp_tree *leader = bst_map->get (stmts))
1700 if (dump_enabled_p ())
1701 dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
1702 !(*leader)->failed ? "" : "failed ",
1703 (void *) *leader);
1704 if (!(*leader)->failed)
1706 SLP_TREE_REF_COUNT (*leader)++;
1707 vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
1708 stmts.release ();
1709 return *leader;
1711 memcpy (matches, (*leader)->failed, sizeof (bool) * group_size);
1712 return NULL;
1715 /* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
1716 so we can pick up backedge destinations during discovery. */
1717 slp_tree res = new _slp_tree;
1718 SLP_TREE_DEF_TYPE (res) = vect_internal_def;
1719 SLP_TREE_SCALAR_STMTS (res) = stmts;
1720 bst_map->put (stmts.copy (), res);
1722 if (*limit == 0)
1724 if (dump_enabled_p ())
1725 dump_printf_loc (MSG_NOTE, vect_location,
1726 "SLP discovery limit exceeded\n");
1727 /* Mark the node invalid so we can detect those when still in use
1728 as backedge destinations. */
1729 SLP_TREE_SCALAR_STMTS (res) = vNULL;
1730 SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1731 res->failed = XNEWVEC (bool, group_size);
1732 memset (res->failed, 0, sizeof (bool) * group_size);
1733 memset (matches, 0, sizeof (bool) * group_size);
1734 return NULL;
1736 --*limit;
1738 if (dump_enabled_p ())
1739 dump_printf_loc (MSG_NOTE, vect_location,
1740 "starting SLP discovery for node %p\n", (void *) res);
1742 poly_uint64 this_max_nunits = 1;
1743 slp_tree res_ = vect_build_slp_tree_2 (vinfo, res, stmts, group_size,
1744 &this_max_nunits,
1745 matches, limit, tree_size, bst_map);
1746 if (!res_)
1748 if (dump_enabled_p ())
1749 dump_printf_loc (MSG_NOTE, vect_location,
1750 "SLP discovery for node %p failed\n", (void *) res);
1751 /* Mark the node invalid so we can detect those when still in use
1752 as backedge destinations. */
1753 SLP_TREE_SCALAR_STMTS (res) = vNULL;
1754 SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1755 res->failed = XNEWVEC (bool, group_size);
1756 if (flag_checking)
1758 unsigned i;
1759 for (i = 0; i < group_size; ++i)
1760 if (!matches[i])
1761 break;
1762 gcc_assert (i < group_size);
1764 memcpy (res->failed, matches, sizeof (bool) * group_size);
1766 else
1768 if (dump_enabled_p ())
1769 dump_printf_loc (MSG_NOTE, vect_location,
1770 "SLP discovery for node %p succeeded\n",
1771 (void *) res);
1772 gcc_assert (res_ == res);
1773 res->max_nunits = this_max_nunits;
1774 vect_update_max_nunits (max_nunits, this_max_nunits);
1775 /* Keep a reference for the bst_map use. */
1776 SLP_TREE_REF_COUNT (res)++;
1778 return res_;
1781 /* Helper for building an associated SLP node chain. */
1783 static void
1784 vect_slp_build_two_operator_nodes (slp_tree perm, tree vectype,
1785 slp_tree op0, slp_tree op1,
1786 stmt_vec_info oper1, stmt_vec_info oper2,
1787 vec<std::pair<unsigned, unsigned> > lperm)
1789 unsigned group_size = SLP_TREE_LANES (op1);
1791 slp_tree child1 = new _slp_tree;
1792 SLP_TREE_DEF_TYPE (child1) = vect_internal_def;
1793 SLP_TREE_VECTYPE (child1) = vectype;
1794 SLP_TREE_LANES (child1) = group_size;
1795 SLP_TREE_CHILDREN (child1).create (2);
1796 SLP_TREE_CHILDREN (child1).quick_push (op0);
1797 SLP_TREE_CHILDREN (child1).quick_push (op1);
1798 SLP_TREE_REPRESENTATIVE (child1) = oper1;
1800 slp_tree child2 = new _slp_tree;
1801 SLP_TREE_DEF_TYPE (child2) = vect_internal_def;
1802 SLP_TREE_VECTYPE (child2) = vectype;
1803 SLP_TREE_LANES (child2) = group_size;
1804 SLP_TREE_CHILDREN (child2).create (2);
1805 SLP_TREE_CHILDREN (child2).quick_push (op0);
1806 SLP_TREE_REF_COUNT (op0)++;
1807 SLP_TREE_CHILDREN (child2).quick_push (op1);
1808 SLP_TREE_REF_COUNT (op1)++;
1809 SLP_TREE_REPRESENTATIVE (child2) = oper2;
1811 SLP_TREE_DEF_TYPE (perm) = vect_internal_def;
1812 SLP_TREE_CODE (perm) = VEC_PERM_EXPR;
1813 SLP_TREE_VECTYPE (perm) = vectype;
1814 SLP_TREE_LANES (perm) = group_size;
1815 /* ??? We should set this NULL but that's not expected. */
1816 SLP_TREE_REPRESENTATIVE (perm) = oper1;
1817 SLP_TREE_LANE_PERMUTATION (perm) = lperm;
1818 SLP_TREE_CHILDREN (perm).quick_push (child1);
1819 SLP_TREE_CHILDREN (perm).quick_push (child2);
1822 /* Recursively build an SLP tree starting from NODE.
1823 Fail (and return NULL) if the def-stmts are not isomorphic, require
1824 data permutation or are of unsupported types of operation, recording
1825 in MATCHES which stmts caused the mismatch.
1826 Otherwise fill in NODE and its children and return NODE. */
1829 static slp_tree
1830 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1831 vec<stmt_vec_info> stmts, unsigned int group_size,
1832 poly_uint64 *max_nunits,
1833 bool *matches, unsigned *limit, unsigned *tree_size,
1834 scalar_stmts_to_slp_tree_map_t *bst_map)
1836 unsigned nops, i, this_tree_size = 0;
1837 poly_uint64 this_max_nunits = *max_nunits;
1839 matches[0] = false;
1841 stmt_vec_info stmt_info = stmts[0];
1842 if (!is_a<gcall *> (stmt_info->stmt)
1843 && !is_a<gassign *> (stmt_info->stmt)
1844 && !is_a<gphi *> (stmt_info->stmt))
1845 return NULL;
1847 nops = gimple_num_args (stmt_info->stmt);
1848 if (const int *map = vect_get_operand_map (stmt_info->stmt,
1849 STMT_VINFO_GATHER_SCATTER_P
1850 (stmt_info)))
1851 nops = map[0];
1853 /* If the SLP node is a PHI (induction or reduction), terminate
1854 the recursion. */
1855 bool *skip_args = XALLOCAVEC (bool, nops);
1856 memset (skip_args, 0, sizeof (bool) * nops);
1857 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
1858 if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
1860 tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
1861 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
1862 group_size);
1863 if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
1864 max_nunits))
1865 return NULL;
1867 vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
1868 if (def_type == vect_induction_def)
1870 /* Induction PHIs are not cycles but walk the initial
1871 value. Only for inner loops though, for outer loops
1872 we need to pick up the value from the actual PHIs
1873 to more easily support peeling and epilogue vectorization. */
1874 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1875 if (!nested_in_vect_loop_p (loop, stmt_info))
1876 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1877 else
1878 loop = loop->inner;
1879 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1881 else if (def_type == vect_reduction_def
1882 || def_type == vect_double_reduction_def
1883 || def_type == vect_nested_cycle
1884 || def_type == vect_first_order_recurrence)
1886 /* Else def types have to match. */
1887 stmt_vec_info other_info;
1888 bool all_same = true;
1889 FOR_EACH_VEC_ELT (stmts, i, other_info)
1891 if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
1892 return NULL;
1893 if (other_info != stmt_info)
1894 all_same = false;
1896 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1897 /* Reduction initial values are not explicitly represented. */
1898 if (def_type != vect_first_order_recurrence
1899 && !nested_in_vect_loop_p (loop, stmt_info))
1900 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1901 /* Reduction chain backedge defs are filled manually.
1902 ??? Need a better way to identify a SLP reduction chain PHI.
1903 Or a better overall way to SLP match those. */
1904 if (all_same && def_type == vect_reduction_def)
1905 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1907 else if (def_type != vect_internal_def)
1908 return NULL;
1912 bool two_operators = false;
1913 unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
1914 tree vectype = NULL_TREE;
1915 if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
1916 &this_max_nunits, matches, &two_operators,
1917 &vectype))
1918 return NULL;
1920 /* If the SLP node is a load, terminate the recursion unless masked. */
1921 if (STMT_VINFO_DATA_REF (stmt_info)
1922 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1924 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
1925 gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD)
1926 || gimple_call_internal_p (stmt, IFN_GATHER_LOAD)
1927 || gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD));
1928 else if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
1929 gcc_assert (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)));
1930 else
1932 *max_nunits = this_max_nunits;
1933 (*tree_size)++;
1934 node = vect_create_new_slp_node (node, stmts, 0);
1935 SLP_TREE_VECTYPE (node) = vectype;
1936 /* And compute the load permutation. Whether it is actually
1937 a permutation depends on the unrolling factor which is
1938 decided later. */
1939 vec<unsigned> load_permutation;
1940 int j;
1941 stmt_vec_info load_info;
1942 load_permutation.create (group_size);
1943 stmt_vec_info first_stmt_info
1944 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
1945 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
1947 int load_place;
1948 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1949 load_place = vect_get_place_in_interleaving_chain
1950 (load_info, first_stmt_info);
1951 else
1952 load_place = 0;
1953 gcc_assert (load_place != -1);
1954 load_permutation.safe_push (load_place);
1956 SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
1957 return node;
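/* For illustration (hypothetical, gap-less group headed by a load of a[0]):
   if the node's scalar stmts read a[1], a[0], a[3], a[2] in that lane
   order, the recorded load permutation is { 1, 0, 3, 2 }, i.e. each entry
   is the lane's position within the interleaving chain relative to
   DR_GROUP_FIRST_ELEMENT, not (yet) a vector permutation by itself.  */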
1960 else if (gimple_assign_single_p (stmt_info->stmt)
1961 && !gimple_vuse (stmt_info->stmt)
1962 && gimple_assign_rhs_code (stmt_info->stmt) == BIT_FIELD_REF)
1964 /* vect_build_slp_tree_2 determined all BIT_FIELD_REFs reference
1965 the same SSA name vector of a type compatible with vectype. */
1966 vec<std::pair<unsigned, unsigned> > lperm = vNULL;
1967 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0);
1968 stmt_vec_info estmt_info;
1969 FOR_EACH_VEC_ELT (stmts, i, estmt_info)
1971 gassign *estmt = as_a <gassign *> (estmt_info->stmt);
1972 tree bfref = gimple_assign_rhs1 (estmt);
1973 HOST_WIDE_INT lane;
1974 if (!known_eq (bit_field_size (bfref),
1975 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype))))
1976 || !constant_multiple_p (bit_field_offset (bfref),
1977 bit_field_size (bfref), &lane))
1979 lperm.release ();
1980 matches[0] = false;
1981 return NULL;
1983 lperm.safe_push (std::make_pair (0, (unsigned)lane));
1985 slp_tree vnode = vect_create_new_slp_node (vNULL);
1986 if (operand_equal_p (TYPE_SIZE (vectype), TYPE_SIZE (TREE_TYPE (vec))))
1987 /* ??? We record vectype here but we hide eventually necessary
1988 punning and instead rely on code generation to materialize
1989 VIEW_CONVERT_EXPRs as necessary. We should instead make
1990 this explicit somehow. */
1991 SLP_TREE_VECTYPE (vnode) = vectype;
1992 else
1994 /* For different size but compatible elements we can still
1995 use VEC_PERM_EXPR without punning. */
1996 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (vec))
1997 && types_compatible_p (TREE_TYPE (vectype),
1998 TREE_TYPE (TREE_TYPE (vec))));
1999 SLP_TREE_VECTYPE (vnode) = TREE_TYPE (vec);
2001 auto nunits = TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (vnode));
2002 unsigned HOST_WIDE_INT const_nunits;
2003 if (nunits.is_constant (&const_nunits))
2004 SLP_TREE_LANES (vnode) = const_nunits;
2005 SLP_TREE_VEC_DEFS (vnode).safe_push (vec);
2006 /* We are always building a permutation node even if it is an identity
2007 permute to shield the rest of the vectorizer from the odd node
2008 representing an actual vector without any scalar ops.
2009 ??? We could hide it completely by making the permute node
2010 external? */
2011 node = vect_create_new_slp_node (node, stmts, 1);
2012 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
2013 SLP_TREE_LANE_PERMUTATION (node) = lperm;
2014 SLP_TREE_VECTYPE (node) = vectype;
2015 SLP_TREE_CHILDREN (node).quick_push (vnode);
2016 return node;
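/* For illustration (hypothetical V4SF source vector v_2): a group member
   like

     x_1 = BIT_FIELD_REF <v_2, 32, 64>;

   extracts the 32-bit element at bit offset 64, i.e. lane 64 / 32 == 2, so
   the lane permutation entry pushed for it is (0, 2) - operand 0 being the
   single VNODE child that carries v_2 in its SLP_TREE_VEC_DEFS.  */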
2018 /* When discovery reaches an associatable operation see whether we can
2019 improve that to match up lanes in a way superior to the operand
2020 swapping code which at most looks at two defs.
2021 ??? For BB vectorization we cannot do the brute-force search
2022 for matching as we can succeed by means of builds from scalars
2023 and have no good way to "cost" one build against another. */
2024 else if (is_a <loop_vec_info> (vinfo)
2025 /* ??? We don't handle !vect_internal_def defs below. */
2026 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
2027 && is_gimple_assign (stmt_info->stmt)
2028 && (associative_tree_code (gimple_assign_rhs_code (stmt_info->stmt))
2029 || gimple_assign_rhs_code (stmt_info->stmt) == MINUS_EXPR)
2030 && ((FLOAT_TYPE_P (vectype) && flag_associative_math)
2031 || (INTEGRAL_TYPE_P (TREE_TYPE (vectype))
2032 && TYPE_OVERFLOW_WRAPS (TREE_TYPE (vectype)))))
2034 /* See if we have a chain of (mixed) adds or subtracts or other
2035 associatable ops. */
2036 enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
2037 if (code == MINUS_EXPR)
2038 code = PLUS_EXPR;
2039 stmt_vec_info other_op_stmt_info = NULL;
2040 stmt_vec_info op_stmt_info = NULL;
2041 unsigned chain_len = 0;
2042 auto_vec<chain_op_t> chain;
2043 auto_vec<std::pair<tree_code, gimple *> > worklist;
2044 auto_vec<vec<chain_op_t> > chains (group_size);
2045 auto_vec<slp_tree, 4> children;
2046 bool hard_fail = true;
2047 for (unsigned lane = 0; lane < group_size; ++lane)
2049 /* For each lane linearize the addition/subtraction (or other
2050 uniform associatable operation) expression tree. */
2051 gimple *op_stmt = NULL, *other_op_stmt = NULL;
2052 vect_slp_linearize_chain (vinfo, worklist, chain, code,
2053 stmts[lane]->stmt, op_stmt, other_op_stmt,
2054 NULL);
2055 if (!op_stmt_info && op_stmt)
2056 op_stmt_info = vinfo->lookup_stmt (op_stmt);
2057 if (!other_op_stmt_info && other_op_stmt)
2058 other_op_stmt_info = vinfo->lookup_stmt (other_op_stmt);
2059 if (chain.length () == 2)
2061 /* In a chain of just two elements resort to the regular
2062 operand swapping scheme. If we run into a length
2063 mismatch still hard-FAIL. */
2064 if (chain_len == 0)
2065 hard_fail = false;
2066 else
2068 matches[lane] = false;
2069 /* ??? We might want to process the other lanes, but
2070 make sure to not give false matching hints to the
2071 caller for lanes we did not process. */
2072 if (lane != group_size - 1)
2073 matches[0] = false;
2075 break;
2077 else if (chain_len == 0)
2078 chain_len = chain.length ();
2079 else if (chain.length () != chain_len)
2081 /* ??? Here we could slip in magic to compensate with
2082 neutral operands. */
2083 matches[lane] = false;
2084 if (lane != group_size - 1)
2085 matches[0] = false;
2086 break;
2088 chains.quick_push (chain.copy ());
2089 chain.truncate (0);
2091 if (chains.length () == group_size)
2093 /* We cannot yet use SLP_TREE_CODE to communicate the operation. */
2094 if (!op_stmt_info)
2096 hard_fail = false;
2097 goto out;
2099 /* Now we have a set of chains with the same length. */
2100 /* 1. pre-sort according to def_type and operation. */
2101 for (unsigned lane = 0; lane < group_size; ++lane)
2102 chains[lane].stablesort (dt_sort_cmp, vinfo);
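/* Rough illustration (hypothetical two-lane group): lane 0 computing
     x0 = (a0 - b0) + c0
   and lane 1 computing
     x1 = (a1 + c1) - b1
   both linearize to a three-entry chain of one minus and two plus terms.
   After the presort above, entry N of every lane's chain is matched up in
   step 2 below to form one SLP child per chain position, with a
   brute-force per-lane swap as fallback when the entries do not line up
   immediately.  */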
2103 if (dump_enabled_p ())
2105 dump_printf_loc (MSG_NOTE, vect_location,
2106 "pre-sorted chains of %s\n",
2107 get_tree_code_name (code));
2108 for (unsigned lane = 0; lane < group_size; ++lane)
2110 for (unsigned opnum = 0; opnum < chain_len; ++opnum)
2111 dump_printf (MSG_NOTE, "%s %T ",
2112 get_tree_code_name (chains[lane][opnum].code),
2113 chains[lane][opnum].op);
2114 dump_printf (MSG_NOTE, "\n");
2117 /* 2. try to build children nodes, associating as necessary. */
2118 for (unsigned n = 0; n < chain_len; ++n)
2120 vect_def_type dt = chains[0][n].dt;
2121 unsigned lane;
2122 for (lane = 0; lane < group_size; ++lane)
2123 if (chains[lane][n].dt != dt)
2125 if (dt == vect_constant_def
2126 && chains[lane][n].dt == vect_external_def)
2127 dt = vect_external_def;
2128 else if (dt == vect_external_def
2129 && chains[lane][n].dt == vect_constant_def)
2131 else
2132 break;
2134 if (lane != group_size)
2136 if (dump_enabled_p ())
2137 dump_printf_loc (MSG_NOTE, vect_location,
2138 "giving up on chain due to mismatched "
2139 "def types\n");
2140 matches[lane] = false;
2141 if (lane != group_size - 1)
2142 matches[0] = false;
2143 goto out;
2145 if (dt == vect_constant_def
2146 || dt == vect_external_def)
2148 /* Check whether we can build the invariant. If we can't
2149 we never will be able to. */
2150 tree type = TREE_TYPE (chains[0][n].op);
2151 if (!GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
2152 && (TREE_CODE (type) == BOOLEAN_TYPE
2153 || !can_duplicate_and_interleave_p (vinfo, group_size,
2154 type)))
2156 matches[0] = false;
2157 goto out;
2159 vec<tree> ops;
2160 ops.create (group_size);
2161 for (lane = 0; lane < group_size; ++lane)
2162 ops.quick_push (chains[lane][n].op);
2163 slp_tree child = vect_create_new_slp_node (ops);
2164 SLP_TREE_DEF_TYPE (child) = dt;
2165 children.safe_push (child);
2167 else if (dt != vect_internal_def)
2169 /* Not sure, we might need something special.
2170 gcc.dg/vect/pr96854.c,
2171 gfortran.dg/vect/fast-math-pr37021.f90
2172 and gfortran.dg/vect/pr61171.f trigger. */
2173 /* Soft-fail for now. */
2174 hard_fail = false;
2175 goto out;
2177 else
2179 vec<stmt_vec_info> op_stmts;
2180 op_stmts.create (group_size);
2181 slp_tree child = NULL;
2182 /* Brute-force our way. We have to consider a lane
2183 failing after fixing an earlier fail up in the
2184 SLP discovery recursion. So track the current
2185 permute per lane. */
2186 unsigned *perms = XALLOCAVEC (unsigned, group_size);
2187 memset (perms, 0, sizeof (unsigned) * group_size);
2190 op_stmts.truncate (0);
2191 for (lane = 0; lane < group_size; ++lane)
2192 op_stmts.quick_push
2193 (vinfo->lookup_def (chains[lane][n].op));
2194 child = vect_build_slp_tree (vinfo, op_stmts,
2195 group_size, &this_max_nunits,
2196 matches, limit,
2197 &this_tree_size, bst_map);
2198 /* ??? We're likely getting too many fatal mismatches
2199 here so maybe we want to ignore them (but then we
2200 have no idea which lanes fatally mismatched). */
2201 if (child || !matches[0])
2202 break;
2203 /* Swap another lane we have not yet matched up into
2204 lanes that did not match. If we run out of
2205 permute possibilities for a lane terminate the
2206 search. */
2207 bool term = false;
2208 for (lane = 1; lane < group_size; ++lane)
2209 if (!matches[lane])
2211 if (n + perms[lane] + 1 == chain_len)
2213 term = true;
2214 break;
2216 std::swap (chains[lane][n],
2217 chains[lane][n + perms[lane] + 1]);
2218 perms[lane]++;
2220 if (term)
2221 break;
2223 while (1);
2224 if (!child)
2226 if (dump_enabled_p ())
2227 dump_printf_loc (MSG_NOTE, vect_location,
2228 "failed to match up op %d\n", n);
2229 op_stmts.release ();
2230 if (lane != group_size - 1)
2231 matches[0] = false;
2232 else
2233 matches[lane] = false;
2234 goto out;
2236 if (dump_enabled_p ())
2238 dump_printf_loc (MSG_NOTE, vect_location,
2239 "matched up op %d to\n", n);
2240 vect_print_slp_tree (MSG_NOTE, vect_location, child);
2242 children.safe_push (child);
2245 /* 3. build SLP nodes to combine the chain. */
2246 for (unsigned lane = 0; lane < group_size; ++lane)
2247 if (chains[lane][0].code != code)
2249 /* See if there's any alternate all-PLUS entry. */
2250 unsigned n;
2251 for (n = 1; n < chain_len; ++n)
2253 for (lane = 0; lane < group_size; ++lane)
2254 if (chains[lane][n].code != code)
2255 break;
2256 if (lane == group_size)
2257 break;
2259 if (n != chain_len)
2261 /* Swap that in at first position. */
2262 std::swap (children[0], children[n]);
2263 for (lane = 0; lane < group_size; ++lane)
2264 std::swap (chains[lane][0], chains[lane][n]);
2266 else
2268 /* ??? When this triggers and we end up with two
2269 vect_constant/external_def up-front things break (ICE)
2270 spectacularly finding an insertion place for the
2271 all-constant op. We should have a fully
2272 vect_internal_def operand though(?) so we can swap
2273 that into first place and then prepend the all-zero
2274 constant. */
2275 if (dump_enabled_p ())
2276 dump_printf_loc (MSG_NOTE, vect_location,
2277 "inserting constant zero to compensate "
2278 "for (partially) negated first "
2279 "operand\n");
2280 chain_len++;
2281 for (lane = 0; lane < group_size; ++lane)
2282 chains[lane].safe_insert
2283 (0, chain_op_t (code, vect_constant_def, NULL_TREE));
2284 vec<tree> zero_ops;
2285 zero_ops.create (group_size);
2286 zero_ops.quick_push (build_zero_cst (TREE_TYPE (vectype)));
2287 for (lane = 1; lane < group_size; ++lane)
2288 zero_ops.quick_push (zero_ops[0]);
2289 slp_tree zero = vect_create_new_slp_node (zero_ops);
2290 SLP_TREE_DEF_TYPE (zero) = vect_constant_def;
2291 children.safe_insert (0, zero);
2293 break;
2295 for (unsigned i = 1; i < children.length (); ++i)
2297 slp_tree op0 = children[i - 1];
2298 slp_tree op1 = children[i];
2299 bool this_two_op = false;
2300 for (unsigned lane = 0; lane < group_size; ++lane)
2301 if (chains[lane][i].code != chains[0][i].code)
2303 this_two_op = true;
2304 break;
2306 slp_tree child;
2307 if (i == children.length () - 1)
2308 child = vect_create_new_slp_node (node, stmts, 2);
2309 else
2310 child = vect_create_new_slp_node (2, ERROR_MARK);
2311 if (this_two_op)
2313 vec<std::pair<unsigned, unsigned> > lperm;
2314 lperm.create (group_size);
2315 for (unsigned lane = 0; lane < group_size; ++lane)
2316 lperm.quick_push (std::make_pair
2317 (chains[lane][i].code != chains[0][i].code, lane));
2318 vect_slp_build_two_operator_nodes (child, vectype, op0, op1,
2319 (chains[0][i].code == code
2320 ? op_stmt_info
2321 : other_op_stmt_info),
2322 (chains[0][i].code == code
2323 ? other_op_stmt_info
2324 : op_stmt_info),
2325 lperm);
2327 else
2329 SLP_TREE_DEF_TYPE (child) = vect_internal_def;
2330 SLP_TREE_VECTYPE (child) = vectype;
2331 SLP_TREE_LANES (child) = group_size;
2332 SLP_TREE_CHILDREN (child).quick_push (op0);
2333 SLP_TREE_CHILDREN (child).quick_push (op1);
2334 SLP_TREE_REPRESENTATIVE (child)
2335 = (chains[0][i].code == code
2336 ? op_stmt_info : other_op_stmt_info);
2338 children[i] = child;
2340 *tree_size += this_tree_size + 1;
2341 *max_nunits = this_max_nunits;
2342 while (!chains.is_empty ())
2343 chains.pop ().release ();
2344 return node;
2346 out:
2347 while (!children.is_empty ())
2348 vect_free_slp_tree (children.pop ());
2349 while (!chains.is_empty ())
2350 chains.pop ().release ();
2351 /* Hard-fail, otherwise we might run into quadratic processing of the
2352 chains starting one stmt into the chain again. */
2353 if (hard_fail)
2354 return NULL;
2355 /* Fall thru to normal processing. */
2358 /* Get at the operands, verifying they are compatible. */
2359 vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
2360 slp_oprnd_info oprnd_info;
2361 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
2363 int res = vect_get_and_check_slp_defs (vinfo, swap[i], skip_args,
2364 stmts, i, &oprnds_info);
2365 if (res != 0)
2366 matches[(res == -1) ? 0 : i] = false;
2367 if (!matches[0])
2368 break;
2370 for (i = 0; i < group_size; ++i)
2371 if (!matches[i])
2373 vect_free_oprnd_info (oprnds_info);
2374 return NULL;
2376 swap = NULL;
2378 auto_vec<slp_tree, 4> children;
2380 stmt_info = stmts[0];
2382 /* Create SLP_TREE nodes for the definition node/s. */
2383 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
2385 slp_tree child;
2386 unsigned int j;
2388 /* We're skipping certain operands from processing, for example
2389 outer loop reduction initial defs. */
2390 if (skip_args[i])
2392 children.safe_push (NULL);
2393 continue;
2396 if (oprnd_info->first_dt == vect_uninitialized_def)
2398 /* COND_EXPRs eventually have one operand too many if the condition
2399 is an SSA name. */
2400 gcc_assert (i == 3 && nops == 4);
2401 continue;
2404 if (is_a <bb_vec_info> (vinfo)
2405 && oprnd_info->first_dt == vect_internal_def
2406 && !oprnd_info->any_pattern)
2408 /* For BB vectorization, if all defs are the same do not
2409 bother to continue the build along the single-lane
2410 graph but use a splat of the scalar value. */
2411 stmt_vec_info first_def = oprnd_info->def_stmts[0];
2412 for (j = 1; j < group_size; ++j)
2413 if (oprnd_info->def_stmts[j] != first_def)
2414 break;
2415 if (j == group_size
2416 /* But avoid doing this for loads where we may be
2417 able to CSE things, unless the stmt is not
2418 vectorizable. */
2419 && (!STMT_VINFO_VECTORIZABLE (first_def)
2420 || !gimple_vuse (first_def->stmt)))
2422 if (dump_enabled_p ())
2423 dump_printf_loc (MSG_NOTE, vect_location,
2424 "Using a splat of the uniform operand %G",
2425 first_def->stmt);
2426 oprnd_info->first_dt = vect_external_def;
2430 if (oprnd_info->first_dt == vect_external_def
2431 || oprnd_info->first_dt == vect_constant_def)
2433 slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
2434 SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
2435 oprnd_info->ops = vNULL;
2436 children.safe_push (invnode);
2437 continue;
2440 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2441 group_size, &this_max_nunits,
2442 matches, limit,
2443 &this_tree_size, bst_map)) != NULL)
2445 oprnd_info->def_stmts = vNULL;
2446 children.safe_push (child);
2447 continue;
2450 /* If the SLP build for operand zero failed and operand zero
2451 and one can be commuted try that for the scalar stmts
2452 that failed the match. */
2453 if (i == 0
2454 /* A first scalar stmt mismatch signals a fatal mismatch. */
2455 && matches[0]
2456 /* ??? For COND_EXPRs we can swap the comparison operands
2457 as well as the arms under some constraints. */
2458 && nops == 2
2459 && oprnds_info[1]->first_dt == vect_internal_def
2460 && is_gimple_assign (stmt_info->stmt)
2461 /* Swapping operands for reductions breaks assumptions later on. */
2462 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
2463 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
2465 /* See whether we can swap the matching or the non-matching
2466 stmt operands. */
2467 bool swap_not_matching = true;
2470 for (j = 0; j < group_size; ++j)
2472 if (matches[j] != !swap_not_matching)
2473 continue;
2474 stmt_vec_info stmt_info = stmts[j];
2475 /* Verify if we can swap operands of this stmt. */
2476 gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
2477 if (!stmt
2478 || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
2480 if (!swap_not_matching)
2481 goto fail;
2482 swap_not_matching = false;
2483 break;
2487 while (j != group_size);
2489 /* Swap mismatched definition stmts. */
2490 if (dump_enabled_p ())
2491 dump_printf_loc (MSG_NOTE, vect_location,
2492 "Re-trying with swapped operands of stmts ");
2493 for (j = 0; j < group_size; ++j)
2494 if (matches[j] == !swap_not_matching)
2496 std::swap (oprnds_info[0]->def_stmts[j],
2497 oprnds_info[1]->def_stmts[j]);
2498 std::swap (oprnds_info[0]->ops[j],
2499 oprnds_info[1]->ops[j]);
2500 if (dump_enabled_p ())
2501 dump_printf (MSG_NOTE, "%d ", j);
2503 if (dump_enabled_p ())
2504 dump_printf (MSG_NOTE, "\n");
2505 /* After swapping some operands we lost track of whether an
2506 operand has any pattern defs so be conservative here. */
2507 if (oprnds_info[0]->any_pattern || oprnds_info[1]->any_pattern)
2508 oprnds_info[0]->any_pattern = oprnds_info[1]->any_pattern = true;
2509 /* And try again with scratch 'matches' ... */
2510 bool *tem = XALLOCAVEC (bool, group_size);
2511 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2512 group_size, &this_max_nunits,
2513 tem, limit,
2514 &this_tree_size, bst_map)) != NULL)
2516 oprnd_info->def_stmts = vNULL;
2517 children.safe_push (child);
2518 continue;
2521 fail:
2523 /* If the SLP build failed and we analyze a basic-block
2524 simply treat nodes we fail to build as externally defined
2525 (and thus build vectors from the scalar defs).
2526 The cost model will reject outright expensive cases.
2527 ??? This doesn't treat cases where permutation ultimately
2528 fails (or we don't try permutation below). Ideally we'd
2529 even compute a permutation that will end up with the maximum
2530 SLP tree size... */
2531 if (is_a <bb_vec_info> (vinfo)
2532 /* ??? Rejecting patterns this way doesn't work. We'd have to
2533 do extra work to cancel the pattern so the uses see the
2534 scalar version. */
2535 && !is_pattern_stmt_p (stmt_info)
2536 && !oprnd_info->any_pattern)
2538 /* But if there's a leading vector sized set of matching stmts
2539 fail here so we can split the group. This matches the condition
2540 vect_analyze_slp_instance uses. */
2541 /* ??? We might want to split here and combine the results to support
2542 multiple vector sizes better. */
2543 for (j = 0; j < group_size; ++j)
2544 if (!matches[j])
2545 break;
2546 if (!known_ge (j, TYPE_VECTOR_SUBPARTS (vectype)))
2548 if (dump_enabled_p ())
2549 dump_printf_loc (MSG_NOTE, vect_location,
2550 "Building vector operands from scalars\n");
2551 this_tree_size++;
2552 child = vect_create_new_slp_node (oprnd_info->ops);
2553 children.safe_push (child);
2554 oprnd_info->ops = vNULL;
2555 continue;
2559 gcc_assert (child == NULL);
2560 FOR_EACH_VEC_ELT (children, j, child)
2561 if (child)
2562 vect_free_slp_tree (child);
2563 vect_free_oprnd_info (oprnds_info);
2564 return NULL;
2567 vect_free_oprnd_info (oprnds_info);
2569 /* If all children of a node are built up from uniform scalars, or if
2570 building the node requires more than one possibly expensive vector
2571 construction, just throw it away and cause it to be built from scalars.
2572 The exception is the SLP node for the vector store. */
2573 if (is_a <bb_vec_info> (vinfo)
2574 && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
2575 /* ??? Rejecting patterns this way doesn't work. We'd have to
2576 do extra work to cancel the pattern so the uses see the
2577 scalar version. */
2578 && !is_pattern_stmt_p (stmt_info))
2580 slp_tree child;
2581 unsigned j;
2582 bool all_uniform_p = true;
2583 unsigned n_vector_builds = 0;
2584 FOR_EACH_VEC_ELT (children, j, child)
2586 if (!child)
2588 else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
2589 all_uniform_p = false;
2590 else if (!vect_slp_tree_uniform_p (child))
2592 all_uniform_p = false;
2593 if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
2594 n_vector_builds++;
2597 if (all_uniform_p
2598 || n_vector_builds > 1
2599 || (n_vector_builds == children.length ()
2600 && is_a <gphi *> (stmt_info->stmt)))
2602 /* Roll back. */
2603 matches[0] = false;
2604 FOR_EACH_VEC_ELT (children, j, child)
2605 if (child)
2606 vect_free_slp_tree (child);
2608 if (dump_enabled_p ())
2609 dump_printf_loc (MSG_NOTE, vect_location,
2610 "Building parent vector operands from "
2611 "scalars instead\n");
2612 return NULL;
2616 *tree_size += this_tree_size + 1;
2617 *max_nunits = this_max_nunits;
2619 if (two_operators)
2621 /* ??? We'd likely want to either cache in bst_map something like
2622 { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
2623 the true { a+b, a+b, a+b, a+b } ... but there we don't have
2624 explicit stmts to put in so the keying on 'stmts' doesn't
2625 work (but we have the same issue with nodes that use 'ops'). */
2626 slp_tree one = new _slp_tree;
2627 slp_tree two = new _slp_tree;
2628 SLP_TREE_DEF_TYPE (one) = vect_internal_def;
2629 SLP_TREE_DEF_TYPE (two) = vect_internal_def;
2630 SLP_TREE_VECTYPE (one) = vectype;
2631 SLP_TREE_VECTYPE (two) = vectype;
2632 SLP_TREE_CHILDREN (one).safe_splice (children);
2633 SLP_TREE_CHILDREN (two).safe_splice (children);
2634 slp_tree child;
2635 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
2636 SLP_TREE_REF_COUNT (child)++;
2638 /* Here we record the original defs since this
2639 node represents the final lane configuration. */
2640 node = vect_create_new_slp_node (node, stmts, 2);
2641 SLP_TREE_VECTYPE (node) = vectype;
2642 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
2643 SLP_TREE_CHILDREN (node).quick_push (one);
2644 SLP_TREE_CHILDREN (node).quick_push (two);
2645 gassign *stmt = as_a <gassign *> (stmts[0]->stmt);
2646 enum tree_code code0 = gimple_assign_rhs_code (stmt);
2647 enum tree_code ocode = ERROR_MARK;
2648 stmt_vec_info ostmt_info;
2649 unsigned j = 0;
2650 FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
2652 gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
2653 if (gimple_assign_rhs_code (ostmt) != code0)
2655 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (1, i));
2656 ocode = gimple_assign_rhs_code (ostmt);
2657 j = i;
2659 else
2660 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (0, i));
2662 SLP_TREE_CODE (one) = code0;
2663 SLP_TREE_CODE (two) = ocode;
2664 SLP_TREE_LANES (one) = stmts.length ();
2665 SLP_TREE_LANES (two) = stmts.length ();
2666 SLP_TREE_REPRESENTATIVE (one) = stmts[0];
2667 SLP_TREE_REPRESENTATIVE (two) = stmts[j];
2668 return node;
2671 node = vect_create_new_slp_node (node, stmts, nops);
2672 SLP_TREE_VECTYPE (node) = vectype;
2673 SLP_TREE_CHILDREN (node).splice (children);
2674 return node;
2677 /* Dump a single SLP tree NODE. */
2679 static void
2680 vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
2681 slp_tree node)
2683 unsigned i, j;
2684 slp_tree child;
2685 stmt_vec_info stmt_info;
2686 tree op;
2688 dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
2689 dump_user_location_t user_loc = loc.get_user_location ();
2690 dump_printf_loc (metadata, user_loc,
2691 "node%s %p (max_nunits=" HOST_WIDE_INT_PRINT_UNSIGNED
2692 ", refcnt=%u)",
2693 SLP_TREE_DEF_TYPE (node) == vect_external_def
2694 ? " (external)"
2695 : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
2696 ? " (constant)"
2697 : ""), (void *) node,
2698 estimated_poly_value (node->max_nunits),
2699 SLP_TREE_REF_COUNT (node));
2700 if (SLP_TREE_VECTYPE (node))
2701 dump_printf (metadata, " %T", SLP_TREE_VECTYPE (node));
2702 dump_printf (metadata, "\n");
2703 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
2705 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
2706 dump_printf_loc (metadata, user_loc, "op: VEC_PERM_EXPR\n");
2707 else
2708 dump_printf_loc (metadata, user_loc, "op template: %G",
2709 SLP_TREE_REPRESENTATIVE (node)->stmt);
2711 if (SLP_TREE_SCALAR_STMTS (node).exists ())
2712 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2713 dump_printf_loc (metadata, user_loc, "\tstmt %u %G", i, stmt_info->stmt);
2714 else
2716 dump_printf_loc (metadata, user_loc, "\t{ ");
2717 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
2718 dump_printf (metadata, "%T%s ", op,
2719 i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
2720 dump_printf (metadata, "}\n");
2722 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2724 dump_printf_loc (metadata, user_loc, "\tload permutation {");
2725 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
2726 dump_printf (dump_kind, " %u", j);
2727 dump_printf (dump_kind, " }\n");
2729 if (SLP_TREE_LANE_PERMUTATION (node).exists ())
2731 dump_printf_loc (metadata, user_loc, "\tlane permutation {");
2732 for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
2733 dump_printf (dump_kind, " %u[%u]",
2734 SLP_TREE_LANE_PERMUTATION (node)[i].first,
2735 SLP_TREE_LANE_PERMUTATION (node)[i].second);
2736 dump_printf (dump_kind, " }\n");
2738 if (SLP_TREE_CHILDREN (node).is_empty ())
2739 return;
2740 dump_printf_loc (metadata, user_loc, "\tchildren");
2741 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2742 dump_printf (dump_kind, " %p", (void *)child);
2743 dump_printf (dump_kind, "\n");
2746 DEBUG_FUNCTION void
2747 debug (slp_tree node)
2749 debug_dump_context ctx;
2750 vect_print_slp_tree (MSG_NOTE,
2751 dump_location_t::from_location_t (UNKNOWN_LOCATION),
2752 node);
2755 /* Recursive helper for the dot producer below. */
2757 static void
2758 dot_slp_tree (FILE *f, slp_tree node, hash_set<slp_tree> &visited)
2760 if (visited.add (node))
2761 return;
2763 fprintf (f, "\"%p\" [label=\"", (void *)node);
2764 vect_print_slp_tree (MSG_NOTE,
2765 dump_location_t::from_location_t (UNKNOWN_LOCATION),
2766 node);
2767 fprintf (f, "\"];\n");
2770 for (slp_tree child : SLP_TREE_CHILDREN (node))
2771 fprintf (f, "\"%p\" -> \"%p\";", (void *)node, (void *)child);
2773 for (slp_tree child : SLP_TREE_CHILDREN (node))
2774 if (child)
2775 dot_slp_tree (f, child, visited);
2778 DEBUG_FUNCTION void
2779 dot_slp_tree (const char *fname, slp_tree node)
2781 FILE *f = fopen (fname, "w");
2782 fprintf (f, "digraph {\n");
2783 fflush (f);
2785 debug_dump_context ctx (f);
2786 hash_set<slp_tree> visited;
2787 dot_slp_tree (f, node, visited);
2789 fflush (f);
2790 fprintf (f, "}\n");
2791 fclose (f);
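/* Example use from a debugging session (file name picked arbitrarily):

     (gdb) call dot_slp_tree ("/tmp/slp.dot", node)

   followed by rendering the emitted digraph with Graphviz, e.g.
   "dot -Tsvg /tmp/slp.dot -o /tmp/slp.svg".  Each SLP node becomes a
   vertex labelled with its textual dump, with edges to its children.  */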
2794 /* Dump a slp tree NODE using flags specified in DUMP_KIND. */
2796 static void
2797 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2798 slp_tree node, hash_set<slp_tree> &visited)
2800 unsigned i;
2801 slp_tree child;
2803 if (visited.add (node))
2804 return;
2806 vect_print_slp_tree (dump_kind, loc, node);
2808 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2809 if (child)
2810 vect_print_slp_graph (dump_kind, loc, child, visited);
2813 static void
2814 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2815 slp_tree entry)
2817 hash_set<slp_tree> visited;
2818 vect_print_slp_graph (dump_kind, loc, entry, visited);
2821 /* Mark the tree rooted at NODE with PURE_SLP. */
2823 static void
2824 vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
2826 int i;
2827 stmt_vec_info stmt_info;
2828 slp_tree child;
2830 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2831 return;
2833 if (visited.add (node))
2834 return;
2836 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2837 STMT_SLP_TYPE (stmt_info) = pure_slp;
2839 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2840 if (child)
2841 vect_mark_slp_stmts (child, visited);
2844 static void
2845 vect_mark_slp_stmts (slp_tree node)
2847 hash_set<slp_tree> visited;
2848 vect_mark_slp_stmts (node, visited);
2851 /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */
2853 static void
2854 vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
2856 int i;
2857 stmt_vec_info stmt_info;
2858 slp_tree child;
2860 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2861 return;
2863 if (visited.add (node))
2864 return;
2866 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2868 gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
2869 || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
2870 STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
2873 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2874 if (child)
2875 vect_mark_slp_stmts_relevant (child, visited);
2878 static void
2879 vect_mark_slp_stmts_relevant (slp_tree node)
2881 hash_set<slp_tree> visited;
2882 vect_mark_slp_stmts_relevant (node, visited);
2886 /* Gather loads in the SLP graph NODE and populate the LOADS array. */
2888 static void
2889 vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
2890 hash_set<slp_tree> &visited)
2892 if (!node || visited.add (node))
2893 return;
2895 if (SLP_TREE_CHILDREN (node).length () == 0)
2897 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2898 return;
2899 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2900 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2901 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2902 loads.safe_push (node);
2904 else
2906 unsigned i;
2907 slp_tree child;
2908 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2909 vect_gather_slp_loads (loads, child, visited);
2914 /* Find the last scalar stmt in NODE. */
2916 stmt_vec_info
2917 vect_find_last_scalar_stmt_in_slp (slp_tree node)
2919 stmt_vec_info last = NULL;
2920 stmt_vec_info stmt_vinfo;
2922 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2924 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2925 last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
2928 return last;
2931 /* Find the first stmt in NODE. */
2933 stmt_vec_info
2934 vect_find_first_scalar_stmt_in_slp (slp_tree node)
2936 stmt_vec_info first = NULL;
2937 stmt_vec_info stmt_vinfo;
2939 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2941 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2942 if (!first
2943 || get_later_stmt (stmt_vinfo, first) == first)
2944 first = stmt_vinfo;
2947 return first;
2950 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
2951 two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
2952 (also containing the first GROUP1_SIZE stmts, since stores are
2953 consecutive), the second containing the remainder.
2954 Return the first stmt in the second group. */
2956 static stmt_vec_info
2957 vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
2959 gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
2960 gcc_assert (group1_size > 0);
2961 int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
2962 gcc_assert (group2_size > 0);
2963 DR_GROUP_SIZE (first_vinfo) = group1_size;
2965 stmt_vec_info stmt_info = first_vinfo;
2966 for (unsigned i = group1_size; i > 1; i--)
2968 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
2969 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2971 /* STMT is now the last element of the first group. */
2972 stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
2973 DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
2975 DR_GROUP_SIZE (group2) = group2_size;
2976 for (stmt_info = group2; stmt_info;
2977 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
2979 DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
2980 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2983 /* For the second group, the DR_GROUP_GAP is that before the original group,
2984 plus skipping over the first vector. */
2985 DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
2987 /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
2988 DR_GROUP_GAP (first_vinfo) += group2_size;
2990 if (dump_enabled_p ())
2991 dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
2992 group1_size, group2_size);
2994 return group2;
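/* Worked example of the bookkeeping above (hypothetical numbers): for a
   store group of size 6 with DR_GROUP_GAP (FIRST_VINFO) == 0, splitting
   with GROUP1_SIZE == 4 leaves groups of sizes 4 and 2 with
   DR_GROUP_GAP (group2) == 0 + 4 and, afterwards,
   DR_GROUP_GAP (first_vinfo) == 0 + 2.  */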
2997 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
2998 statements and a vector of NUNITS elements. */
3000 static poly_uint64
3001 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
3003 return exact_div (common_multiple (nunits, group_size), group_size);
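/* Worked example (hypothetical sizes): with NUNITS == 4 and GROUP_SIZE == 6
   this is common_multiple (4, 6) / 6 == 12 / 6 == 2, i.e. two copies of the
   group are needed to fill whole vectors; with GROUP_SIZE == 8 the factor
   is 8 / 8 == 1 and no unrolling is necessary.  */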
3006 /* Helper that checks to see if a node is a load node. */
3008 static inline bool
3009 vect_is_slp_load_node (slp_tree root)
3011 return SLP_TREE_DEF_TYPE (root) == vect_internal_def
3012 && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_REPRESENTATIVE (root))
3013 && DR_IS_READ (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (root)));
3017 /* Helper function of optimize_load_redistribution that performs the operation
3018 recursively. */
3020 static slp_tree
3021 optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t *bst_map,
3022 vec_info *vinfo, unsigned int group_size,
3023 hash_map<slp_tree, slp_tree> *load_map,
3024 slp_tree root)
3026 if (slp_tree *leader = load_map->get (root))
3027 return *leader;
3029 slp_tree node;
3030 unsigned i;
3032 /* For now, we don't know anything about externals so do not do anything. */
3033 if (!root || SLP_TREE_DEF_TYPE (root) != vect_internal_def)
3034 return NULL;
3035 else if (SLP_TREE_CODE (root) == VEC_PERM_EXPR)
3037 /* First convert this node into a load node and add it to the leaves
3038 list and flatten the permute from a lane to a load one. If it's
3039 unneeded it will be elided later. */
3040 vec<stmt_vec_info> stmts;
3041 stmts.create (SLP_TREE_LANES (root));
3042 lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION (root);
3043 for (unsigned j = 0; j < lane_perm.length (); j++)
3045 std::pair<unsigned, unsigned> perm = lane_perm[j];
3046 node = SLP_TREE_CHILDREN (root)[perm.first];
3048 if (!vect_is_slp_load_node (node)
3049 || SLP_TREE_CHILDREN (node).exists ())
3051 stmts.release ();
3052 goto next;
3055 stmts.quick_push (SLP_TREE_SCALAR_STMTS (node)[perm.second]);
3058 if (dump_enabled_p ())
3059 dump_printf_loc (MSG_NOTE, vect_location,
3060 "converting stmts on permute node %p\n",
3061 (void *) root);
3063 bool *matches = XALLOCAVEC (bool, group_size);
3064 poly_uint64 max_nunits = 1;
3065 unsigned tree_size = 0, limit = 1;
3066 node = vect_build_slp_tree (vinfo, stmts, group_size, &max_nunits,
3067 matches, &limit, &tree_size, bst_map);
3068 if (!node)
3069 stmts.release ();
3071 load_map->put (root, node);
3072 return node;
3075 next:
3076 load_map->put (root, NULL);
3078 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
3080 slp_tree value
3081 = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
3082 node);
3083 if (value)
3085 SLP_TREE_REF_COUNT (value)++;
3086 SLP_TREE_CHILDREN (root)[i] = value;
3087 /* ??? We know the original leaves of the replaced nodes will
3088 be referenced by bst_map, only the permutes created by
3089 pattern matching are not. */
3090 if (SLP_TREE_REF_COUNT (node) == 1)
3091 load_map->remove (node);
3092 vect_free_slp_tree (node);
3096 return NULL;
3099 /* Temporary workaround for loads not being CSEd during SLP build. This
3100 function will traverse the SLP tree rooted in ROOT and find
3101 VEC_PERM nodes that blend vectors from multiple nodes that all read from the
3102 same DR such that the final operation is equal to a permuted load. Such
3103 NODES are then directly converted into LOADS themselves. The nodes are
3104 CSEd using BST_MAP. */
3106 static void
3107 optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t *bst_map,
3108 vec_info *vinfo, unsigned int group_size,
3109 hash_map<slp_tree, slp_tree> *load_map,
3110 slp_tree root)
3112 slp_tree node;
3113 unsigned i;
3115 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
3117 slp_tree value
3118 = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
3119 node);
3120 if (value)
3122 SLP_TREE_REF_COUNT (value)++;
3123 SLP_TREE_CHILDREN (root)[i] = value;
3124 /* ??? We know the original leaves of the replaced nodes will
3125 be referenced by bst_map, only the permutes created by
3126 pattern matching are not. */
3127 if (SLP_TREE_REF_COUNT (node) == 1)
3128 load_map->remove (node);
3129 vect_free_slp_tree (node);
3134 /* Helper function of vect_match_slp_patterns.
3136 Attempts to match patterns against the slp tree rooted in REF_NODE using
3137 VINFO. Patterns are matched in post-order traversal.
3139 If matching is successful the value in REF_NODE is updated and true is
3140 returned, otherwise REF_NODE is left unchanged and false is returned. */
3142 static bool
3143 vect_match_slp_patterns_2 (slp_tree *ref_node, vec_info *vinfo,
3144 slp_tree_to_load_perm_map_t *perm_cache,
3145 slp_compat_nodes_map_t *compat_cache,
3146 hash_set<slp_tree> *visited)
3148 unsigned i;
3149 slp_tree node = *ref_node;
3150 bool found_p = false;
3151 if (!node || visited->add (node))
3152 return false;
3154 slp_tree child;
3155 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3156 found_p |= vect_match_slp_patterns_2 (&SLP_TREE_CHILDREN (node)[i],
3157 vinfo, perm_cache, compat_cache,
3158 visited);
3160 for (unsigned x = 0; x < num__slp_patterns; x++)
3162 vect_pattern *pattern
3163 = slp_patterns[x] (perm_cache, compat_cache, ref_node);
3164 if (pattern)
3166 pattern->build (vinfo);
3167 delete pattern;
3168 found_p = true;
3172 return found_p;
3175 /* Applies pattern matching to the SLP tree of INSTANCE using
3176 vec_info VINFO.
3178 Returns true if any pattern matched; the tree of INSTANCE is updated
3179 in place. Patterns are tried in order and multiple patterns may match. */
3181 static bool
3182 vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
3183 hash_set<slp_tree> *visited,
3184 slp_tree_to_load_perm_map_t *perm_cache,
3185 slp_compat_nodes_map_t *compat_cache)
3187 DUMP_VECT_SCOPE ("vect_match_slp_patterns");
3188 slp_tree *ref_node = &SLP_INSTANCE_TREE (instance);
3190 if (dump_enabled_p ())
3191 dump_printf_loc (MSG_NOTE, vect_location,
3192 "Analyzing SLP tree %p for patterns\n",
3193 (void *) SLP_INSTANCE_TREE (instance));
3195 return vect_match_slp_patterns_2 (ref_node, vinfo, perm_cache, compat_cache,
3196 visited);
3199 /* STMT_INFO is a store group of size GROUP_SIZE that we are considering
3200 splitting into two, with the first split group having size NEW_GROUP_SIZE.
3201 Return true if we could use IFN_STORE_LANES instead and if that appears
3202 to be the better approach. */
3204 static bool
3205 vect_slp_prefer_store_lanes_p (vec_info *vinfo, stmt_vec_info stmt_info,
3206 unsigned int group_size,
3207 unsigned int new_group_size)
3209 tree scalar_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
3210 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
3211 if (!vectype)
3212 return false;
3213 /* Allow the split if one of the two new groups would operate on full
3214 vectors *within* rather than across one scalar loop iteration.
3215 This is purely a heuristic, but it should work well for group
3216 sizes of 3 and 4, where the possible splits are:
3218 3->2+1: OK if the vector has exactly two elements
3219 4->2+2: Likewise
3220 4->3+1: Less clear-cut. */
3221 if (multiple_p (group_size - new_group_size, TYPE_VECTOR_SUBPARTS (vectype))
3222 || multiple_p (new_group_size, TYPE_VECTOR_SUBPARTS (vectype)))
3223 return false;
3224 return vect_store_lanes_supported (vectype, group_size, false) != IFN_LAST;
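/* Worked example (hypothetically two-element vectors): splitting a group
   of 4 as 3+1 leaves 1 and 3, neither a multiple of the 2 subparts, so
   store-lanes is preferred iff vect_store_lanes_supported reports an IFN
   for the whole group; splitting 4 as 2+2 satisfies the multiple_p check
   and the split is allowed (false is returned).  */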
3227 /* Analyze an SLP instance starting from a group of grouped stores. Call
3228 vect_build_slp_tree to build a tree of packed stmts if possible.
3229 Return FALSE if it's impossible to SLP any stmt in the loop. */
3231 static bool
3232 vect_analyze_slp_instance (vec_info *vinfo,
3233 scalar_stmts_to_slp_tree_map_t *bst_map,
3234 stmt_vec_info stmt_info, slp_instance_kind kind,
3235 unsigned max_tree_size, unsigned *limit);
3237 /* Analyze an SLP instance starting from SCALAR_STMTS which are a group
3238 of KIND. Return true if successful. */
3240 static bool
3241 vect_build_slp_instance (vec_info *vinfo,
3242 slp_instance_kind kind,
3243 vec<stmt_vec_info> &scalar_stmts,
3244 vec<stmt_vec_info> &root_stmt_infos,
3245 vec<tree> &remain,
3246 unsigned max_tree_size, unsigned *limit,
3247 scalar_stmts_to_slp_tree_map_t *bst_map,
3248 /* ??? We need stmt_info for group splitting. */
3249 stmt_vec_info stmt_info_)
3251 if (kind == slp_inst_kind_ctor)
3253 if (dump_enabled_p ())
3254 dump_printf_loc (MSG_NOTE, vect_location,
3255 "Analyzing vectorizable constructor: %G\n",
3256 root_stmt_infos[0]->stmt);
3259 if (dump_enabled_p ())
3261 dump_printf_loc (MSG_NOTE, vect_location,
3262 "Starting SLP discovery for\n");
3263 for (unsigned i = 0; i < scalar_stmts.length (); ++i)
3264 dump_printf_loc (MSG_NOTE, vect_location,
3265 " %G", scalar_stmts[i]->stmt);
3268 /* When a BB reduction doesn't have an even number of lanes
3269 strip it down, treating the remaining lane as scalar.
3270 ??? Selecting the optimal set of lanes to vectorize would be nice
3271 but SLP build for all lanes will fail quickly because we think
3272 we're going to need unrolling. */
3273 if (kind == slp_inst_kind_bb_reduc
3274 && (scalar_stmts.length () & 1))
3275 remain.safe_insert (0, gimple_get_lhs (scalar_stmts.pop ()->stmt));
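/* For illustration (hypothetical 5-lane BB reduction): the last scalar
   stmt is popped and its lhs recorded at the front of REMAIN, so SLP
   discovery runs on the even 4-lane group while the leftover def is kept
   to be handled as a scalar, per the comment above.  */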
3277 /* Build the tree for the SLP instance. */
3278 unsigned int group_size = scalar_stmts.length ();
3279 bool *matches = XALLOCAVEC (bool, group_size);
3280 poly_uint64 max_nunits = 1;
3281 unsigned tree_size = 0;
3282 unsigned i;
3283 slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
3284 &max_nunits, matches, limit,
3285 &tree_size, bst_map);
3286 if (node != NULL)
3288 /* Calculate the unrolling factor based on the smallest type. */
3289 poly_uint64 unrolling_factor
3290 = calculate_unrolling_factor (max_nunits, group_size);
3292 if (maybe_ne (unrolling_factor, 1U)
3293 && is_a <bb_vec_info> (vinfo))
3295 unsigned HOST_WIDE_INT const_max_nunits;
3296 if (!max_nunits.is_constant (&const_max_nunits)
3297 || const_max_nunits > group_size)
3299 if (dump_enabled_p ())
3300 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3301 "Build SLP failed: store group "
3302 "size not a multiple of the vector size "
3303 "in basic block SLP\n");
3304 vect_free_slp_tree (node);
3305 return false;
3307 /* Fatal mismatch. */
3308 if (dump_enabled_p ())
3309 dump_printf_loc (MSG_NOTE, vect_location,
3310 "SLP discovery succeeded but node needs "
3311 "splitting\n");
3312 memset (matches, true, group_size);
3313 matches[group_size / const_max_nunits * const_max_nunits] = false;
3314 vect_free_slp_tree (node);
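/* Worked example (hypothetical sizes): with GROUP_SIZE == 7 and
   CONST_MAX_NUNITS == 4 the index 7 / 4 * 4 == 4 is marked as mismatched
   above, so the splitting code below breaks the group after the first
   full vector's worth of lanes.  */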
3316 else
3318 /* Create a new SLP instance. */
3319 slp_instance new_instance = XNEW (class _slp_instance);
3320 SLP_INSTANCE_TREE (new_instance) = node;
3321 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
3322 SLP_INSTANCE_LOADS (new_instance) = vNULL;
3323 SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
3324 SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
3325 SLP_INSTANCE_KIND (new_instance) = kind;
3326 new_instance->reduc_phis = NULL;
3327 new_instance->cost_vec = vNULL;
3328 new_instance->subgraph_entries = vNULL;
3330 if (dump_enabled_p ())
3331 dump_printf_loc (MSG_NOTE, vect_location,
3332 "SLP size %u vs. limit %u.\n",
3333 tree_size, max_tree_size);
3335 /* Fixup SLP reduction chains. */
3336 if (kind == slp_inst_kind_reduc_chain)
3338 /* If this is a reduction chain with a conversion in front
3339 amend the SLP tree with a node for that. */
3340 gimple *scalar_def
3341 = vect_orig_stmt (scalar_stmts[group_size - 1])->stmt;
3342 if (STMT_VINFO_DEF_TYPE (scalar_stmts[0]) != vect_reduction_def)
3344 /* Get at the conversion stmt - we know it's the single use
3345 of the last stmt of the reduction chain. */
3346 use_operand_p use_p;
3347 bool r = single_imm_use (gimple_assign_lhs (scalar_def),
3348 &use_p, &scalar_def);
3349 gcc_assert (r);
3350 stmt_vec_info next_info = vinfo->lookup_stmt (scalar_def);
3351 next_info = vect_stmt_to_vectorize (next_info);
3352 scalar_stmts = vNULL;
3353 scalar_stmts.create (group_size);
3354 for (unsigned i = 0; i < group_size; ++i)
3355 scalar_stmts.quick_push (next_info);
3356 slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
3357 SLP_TREE_VECTYPE (conv) = STMT_VINFO_VECTYPE (next_info);
3358 SLP_TREE_CHILDREN (conv).quick_push (node);
3359 SLP_INSTANCE_TREE (new_instance) = conv;
3360 /* We also have to fake this conversion stmt as SLP reduction
3361 group so we don't have to mess with too much code
3362 elsewhere. */
3363 REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
3364 REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
3366 /* Fill the backedge child of the PHI SLP node. The
3367 general matching code cannot find it because the
3368 scalar code does not reflect how we vectorize the
3369 reduction. */
3370 use_operand_p use_p;
3371 imm_use_iterator imm_iter;
3372 class loop *loop = LOOP_VINFO_LOOP (as_a <loop_vec_info> (vinfo));
3373 FOR_EACH_IMM_USE_FAST (use_p, imm_iter,
3374 gimple_get_lhs (scalar_def))
3375 /* There are exactly two non-debug uses, the reduction
3376 PHI and the loop-closed PHI node. */
3377 if (!is_gimple_debug (USE_STMT (use_p))
3378 && gimple_bb (USE_STMT (use_p)) == loop->header)
3380 auto_vec<stmt_vec_info, 64> phis (group_size);
3381 stmt_vec_info phi_info
3382 = vinfo->lookup_stmt (USE_STMT (use_p));
3383 for (unsigned i = 0; i < group_size; ++i)
3384 phis.quick_push (phi_info);
3385 slp_tree *phi_node = bst_map->get (phis);
3386 unsigned dest_idx = loop_latch_edge (loop)->dest_idx;
3387 SLP_TREE_CHILDREN (*phi_node)[dest_idx]
3388 = SLP_INSTANCE_TREE (new_instance);
3389 SLP_INSTANCE_TREE (new_instance)->refcnt++;
3393 vinfo->slp_instances.safe_push (new_instance);
3395 /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
3396 the number of scalar stmts in the root in a few places.
3397 Verify that assumption holds. */
3398 gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
3399 .length () == group_size);
3401 if (dump_enabled_p ())
3403 dump_printf_loc (MSG_NOTE, vect_location,
3404 "Final SLP tree for instance %p:\n",
3405 (void *) new_instance);
3406 vect_print_slp_graph (MSG_NOTE, vect_location,
3407 SLP_INSTANCE_TREE (new_instance));
3410 return true;
3413 else
3415 /* Failed to SLP. */
3416 /* Free the allocated memory. */
3417 scalar_stmts.release ();
3420 stmt_vec_info stmt_info = stmt_info_;
3421 /* Try to break the group up into pieces. */
3422 if (kind == slp_inst_kind_store)
3424 /* ??? We could delay all the actual splitting of store-groups
3425 until after SLP discovery of the original group completed.
3426 Then we can recurse to vect_build_slp_instance directly. */
3427 for (i = 0; i < group_size; i++)
3428 if (!matches[i])
3429 break;
3431 /* For basic block SLP, try to break the group up into multiples of
3432 a vector size. */
3433 if (is_a <bb_vec_info> (vinfo)
3434 && (i > 1 && i < group_size))
3436 tree scalar_type
3437 = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
3438 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
3439 1 << floor_log2 (i));
3440 unsigned HOST_WIDE_INT const_nunits;
3441 if (vectype
3442 && TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits))
3444 /* Split into two groups at the first vector boundary. */
3445 gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
3446 unsigned group1_size = i & ~(const_nunits - 1);
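/* E.g. (hypothetical numbers) with i == 5 matching stmts and
   const_nunits == 4 this rounds down to group1_size == 4, the largest
   whole-vector prefix.  */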
3448 if (dump_enabled_p ())
3449 dump_printf_loc (MSG_NOTE, vect_location,
3450 "Splitting SLP group at stmt %u\n", i);
3451 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
3452 group1_size);
3453 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
3454 kind, max_tree_size,
3455 limit);
3456 /* Split the rest at the failure point and possibly
3457 re-analyze the remaining matching part if it has
3458 at least two lanes. */
3459 if (group1_size < i
3460 && (i + 1 < group_size
3461 || i - group1_size > 1))
3463 stmt_vec_info rest2 = rest;
3464 rest = vect_split_slp_store_group (rest, i - group1_size);
3465 if (i - group1_size > 1)
3466 res |= vect_analyze_slp_instance (vinfo, bst_map, rest2,
3467 kind, max_tree_size,
3468 limit);
3470 /* Re-analyze the non-matching tail if it has at least
3471 two lanes. */
3472 if (i + 1 < group_size)
3473 res |= vect_analyze_slp_instance (vinfo, bst_map,
3474 rest, kind, max_tree_size,
3475 limit);
3476 return res;
3480 /* For loop vectorization split into arbitrary pieces of size > 1. */
3481 if (is_a <loop_vec_info> (vinfo)
3482 && (i > 1 && i < group_size)
3483 && !vect_slp_prefer_store_lanes_p (vinfo, stmt_info, group_size, i))
3485 unsigned group1_size = i;
3487 if (dump_enabled_p ())
3488 dump_printf_loc (MSG_NOTE, vect_location,
3489 "Splitting SLP group at stmt %u\n", i);
3491 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
3492 group1_size);
3493 /* Loop vectorization cannot handle gaps in stores, make sure
3494 the split group appears as strided. */
3495 STMT_VINFO_STRIDED_P (rest) = 1;
3496 DR_GROUP_GAP (rest) = 0;
3497 STMT_VINFO_STRIDED_P (stmt_info) = 1;
3498 DR_GROUP_GAP (stmt_info) = 0;
3500 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
3501 kind, max_tree_size, limit);
3502 if (i + 1 < group_size)
3503 res |= vect_analyze_slp_instance (vinfo, bst_map,
3504 rest, kind, max_tree_size, limit);
3506 return res;
3509 /* Even though the first vector did not all match, we might be able to SLP
3510 (some) of the remainder. FORNOW ignore this possibility. */
3513 /* Failed to SLP. */
3514 if (dump_enabled_p ())
3515 dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
3516 return false;
3520 /* Analyze an SLP instance starting from a group of grouped stores. Call
3521 vect_build_slp_tree to build a tree of packed stmts if possible.
3522 Return FALSE if it's impossible to SLP any stmt in the loop. */
3524 static bool
3525 vect_analyze_slp_instance (vec_info *vinfo,
3526 scalar_stmts_to_slp_tree_map_t *bst_map,
3527 stmt_vec_info stmt_info,
3528 slp_instance_kind kind,
3529 unsigned max_tree_size, unsigned *limit)
3531 unsigned int i;
3532 vec<stmt_vec_info> scalar_stmts;
3534 if (is_a <bb_vec_info> (vinfo))
3535 vect_location = stmt_info->stmt;
3537 stmt_vec_info next_info = stmt_info;
3538 if (kind == slp_inst_kind_store)
3540 /* Collect the stores and store them in scalar_stmts. */
3541 scalar_stmts.create (DR_GROUP_SIZE (stmt_info));
3542 while (next_info)
3544 scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
3545 next_info = DR_GROUP_NEXT_ELEMENT (next_info);
3548 else if (kind == slp_inst_kind_reduc_chain)
3550 /* Collect the reduction stmts and store them in scalar_stmts. */
3551 scalar_stmts.create (REDUC_GROUP_SIZE (stmt_info));
3552 while (next_info)
3554 scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
3555 next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
3557 /* Mark the first element of the reduction chain as reduction to properly
3558 transform the node. In the reduction analysis phase only the last
3559 element of the chain is marked as reduction. */
3560 STMT_VINFO_DEF_TYPE (stmt_info)
3561 = STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
3562 STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
3563 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
3565 else if (kind == slp_inst_kind_reduc_group)
3567 /* Collect reduction statements. */
3568 const vec<stmt_vec_info> &reductions
3569 = as_a <loop_vec_info> (vinfo)->reductions;
3570 scalar_stmts.create (reductions.length ());
3571 for (i = 0; reductions.iterate (i, &next_info); i++)
3572 if ((STMT_VINFO_RELEVANT_P (next_info)
3573 || STMT_VINFO_LIVE_P (next_info))
3574 /* ??? Make sure we didn't skip a conversion around a reduction
3575 path. In that case we'd have to reverse engineer that conversion
3576 stmt following the chain using reduc_idx and from the PHI
3577 using reduc_def. */
3578 && STMT_VINFO_DEF_TYPE (next_info) == vect_reduction_def)
3579 scalar_stmts.quick_push (next_info);
3580 /* If less than two were relevant/live there's nothing to SLP. */
3581 if (scalar_stmts.length () < 2)
3582 return false;
3584 else
3585 gcc_unreachable ();
3587 vec<stmt_vec_info> roots = vNULL;
3588 vec<tree> remain = vNULL;
3589 /* Build the tree for the SLP instance. */
3590 bool res = vect_build_slp_instance (vinfo, kind, scalar_stmts,
3591 roots, remain,
3592 max_tree_size, limit, bst_map,
3593 kind == slp_inst_kind_store
3594 ? stmt_info : NULL);
3596 /* ??? If this is slp_inst_kind_store and the above succeeded here's
3597 where we should do store group splitting. */
3599 return res;
3602 /* Check if there are stmts in the loop that can be vectorized using SLP.
3603 Build SLP trees of packed scalar stmts if SLP is possible. */
3605 opt_result
3606 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
3608 unsigned int i;
3609 stmt_vec_info first_element;
3610 slp_instance instance;
3612 DUMP_VECT_SCOPE ("vect_analyze_slp");
3614 unsigned limit = max_tree_size;
3616 scalar_stmts_to_slp_tree_map_t *bst_map
3617 = new scalar_stmts_to_slp_tree_map_t ();
3619 /* Find SLP sequences starting from groups of grouped stores. */
3620 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
3621 vect_analyze_slp_instance (vinfo, bst_map, first_element,
3622 slp_inst_kind_store, max_tree_size, &limit);
3624 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
3626 for (unsigned i = 0; i < bb_vinfo->roots.length (); ++i)
3628 vect_location = bb_vinfo->roots[i].roots[0]->stmt;
3629 if (vect_build_slp_instance (bb_vinfo, bb_vinfo->roots[i].kind,
3630 bb_vinfo->roots[i].stmts,
3631 bb_vinfo->roots[i].roots,
3632 bb_vinfo->roots[i].remain,
3633 max_tree_size, &limit, bst_map, NULL))
3635 bb_vinfo->roots[i].stmts = vNULL;
3636 bb_vinfo->roots[i].roots = vNULL;
3637 bb_vinfo->roots[i].remain = vNULL;
3642 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3644 /* Find SLP sequences starting from reduction chains. */
3645 FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
3646 if (! STMT_VINFO_RELEVANT_P (first_element)
3647 && ! STMT_VINFO_LIVE_P (first_element))
3649 else if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
3650 slp_inst_kind_reduc_chain,
3651 max_tree_size, &limit))
3653 /* Dissolve reduction chain group. */
3654 stmt_vec_info vinfo = first_element;
3655 stmt_vec_info last = NULL;
3656 while (vinfo)
3658 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
3659 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
3660 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
3661 last = vinfo;
3662 vinfo = next;
3664 STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
3665 /* It can still be vectorized as part of an SLP reduction. */
3666 loop_vinfo->reductions.safe_push (last);
3669 /* Find SLP sequences starting from groups of reductions. */
3670 if (loop_vinfo->reductions.length () > 1)
3671 vect_analyze_slp_instance (vinfo, bst_map, loop_vinfo->reductions[0],
3672 slp_inst_kind_reduc_group, max_tree_size,
3673 &limit);
3676 hash_set<slp_tree> visited_patterns;
3677 slp_tree_to_load_perm_map_t perm_cache;
3678 slp_compat_nodes_map_t compat_cache;
3680 /* See if any patterns can be found in the SLP tree. */
3681 bool pattern_found = false;
3682 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3683 pattern_found |= vect_match_slp_patterns (instance, vinfo,
3684 &visited_patterns, &perm_cache,
3685 &compat_cache);
3687 /* If any were found optimize permutations of loads. */
3688 if (pattern_found)
3690 hash_map<slp_tree, slp_tree> load_map;
3691 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3693 slp_tree root = SLP_INSTANCE_TREE (instance);
3694 optimize_load_redistribution (bst_map, vinfo, SLP_TREE_LANES (root),
3695 &load_map, root);
3701 /* The map keeps a reference on the SLP nodes built; release that. */
3702 for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
3703 it != bst_map->end (); ++it)
3704 if ((*it).second)
3705 vect_free_slp_tree ((*it).second);
3706 delete bst_map;
3708 if (pattern_found && dump_enabled_p ())
3710 dump_printf_loc (MSG_NOTE, vect_location,
3711 "Pattern matched SLP tree\n");
3712 hash_set<slp_tree> visited;
3713 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3714 vect_print_slp_graph (MSG_NOTE, vect_location,
3715 SLP_INSTANCE_TREE (instance), visited);
3718 return opt_result::success ();
3721 /* Estimates the cost of inserting layout changes into the SLP graph.
3722 It can also say that the insertion is impossible. */
3724 struct slpg_layout_cost
3726 slpg_layout_cost () = default;
3727 slpg_layout_cost (sreal, bool);
3729 static slpg_layout_cost impossible () { return { sreal::max (), 0 }; }
3730 bool is_possible () const { return depth != sreal::max (); }
3732 bool operator== (const slpg_layout_cost &) const;
3733 bool operator!= (const slpg_layout_cost &) const;
3735 bool is_better_than (const slpg_layout_cost &, bool) const;
3737 void add_parallel_cost (const slpg_layout_cost &);
3738 void add_serial_cost (const slpg_layout_cost &);
3739 void split (unsigned int);
3741 /* The longest sequence of layout changes needed during any traversal
3742 of the partition dag, weighted by execution frequency.
3744 This is the most important metric when optimizing for speed, since
3745 it helps to ensure that we keep the number of operations on
3746 critical paths to a minimum. */
3747 sreal depth = 0;
3749 /* An estimate of the total number of operations needed. It is weighted by
3750 execution frequency when optimizing for speed but not when optimizing for
3751 size. In order to avoid double-counting, a node with a fanout of N will
3752 distribute 1/N of its total cost to each successor.
3754 This is the most important metric when optimizing for size, since
3755 it helps to keep the total number of operations to a minimum. */
3756 sreal total = 0;
3759 /* Construct costs for a node with weight WEIGHT. A higher weight
3760 indicates more frequent execution. IS_FOR_SIZE is true if we are
3761 optimizing for size rather than speed. */
3763 slpg_layout_cost::slpg_layout_cost (sreal weight, bool is_for_size)
3764 : depth (weight), total (is_for_size && weight > 0 ? 1 : weight)
3768 bool
3769 slpg_layout_cost::operator== (const slpg_layout_cost &other) const
3771 return depth == other.depth && total == other.total;
3774 bool
3775 slpg_layout_cost::operator!= (const slpg_layout_cost &other) const
3777 return !operator== (other);
3780 /* Return true if these costs are better than OTHER. IS_FOR_SIZE is
3781 true if we are optimizing for size rather than speed. */
3783 bool
3784 slpg_layout_cost::is_better_than (const slpg_layout_cost &other,
3785 bool is_for_size) const
3787 if (is_for_size)
3789 if (total != other.total)
3790 return total < other.total;
3791 return depth < other.depth;
3793 else
3795 if (depth != other.depth)
3796 return depth < other.depth;
3797 return total < other.total;
3801 /* Increase the costs to account for something with cost INPUT_COST
3802 happening in parallel with the current costs. */
3804 void
3805 slpg_layout_cost::add_parallel_cost (const slpg_layout_cost &input_cost)
3807 depth = std::max (depth, input_cost.depth);
3808 total += input_cost.total;
3811 /* Increase the costs to account for something with cost INPUT_COST
3812 happening in series with the current costs. */
3814 void
3815 slpg_layout_cost::add_serial_cost (const slpg_layout_cost &other)
3817 depth += other.depth;
3818 total += other.total;
3821 /* Split the total cost among TIMES successors or predecessors. */
3823 void
3824 slpg_layout_cost::split (unsigned int times)
3826 if (times > 1)
3827 total /= times;
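/* To illustrate how the two metrics combine, below is a minimal standalone
   sketch of the same cost algebra, using double instead of sreal and
   hypothetical "toy_" names: serial composition adds both metrics, parallel
   composition adds the totals but takes the maximum depth, and splitting
   divides the total among several consumers.  */

#include <algorithm>
#include <cassert>

struct toy_layout_cost
{
  double depth, total;

  void add_serial (const toy_layout_cost &c)
  { depth += c.depth; total += c.total; }

  void add_parallel (const toy_layout_cost &c)
  { depth = std::max (depth, c.depth); total += c.total; }

  void split (unsigned int times)
  { if (times > 1) total /= times; }
};

/* Two layout changes of weight 4 happening in parallel, followed by one of
   weight 2 in series, give depth 4 + 2 = 6 but total 4 + 4 + 2 = 10.  */

static void
toy_layout_cost_example (void)
{
  toy_layout_cost a = { 4, 4 }, b = { 4, 4 }, c = { 2, 2 }, acc = { 0, 0 };
  acc.add_parallel (a);
  acc.add_parallel (b);
  acc.add_serial (c);
  assert (acc.depth == 6 && acc.total == 10);
}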
3830 /* Information about one node in the SLP graph, for use during
3831 vect_optimize_slp_pass. */
3833 struct slpg_vertex
3835 slpg_vertex (slp_tree node_) : node (node_) {}
3837 /* The node itself. */
3838 slp_tree node;
3840 /* Which partition the node belongs to, or -1 if none. Nodes outside of
3841 partitions are flexible; they can have whichever layout consumers
3842 want them to have. */
3843 int partition = -1;
3845 /* The number of nodes that directly use the result of this one
3846 (i.e. the number of nodes that count this one as a child). */
3847 unsigned int out_degree = 0;
3849 /* The execution frequency of the node. */
3850 sreal weight = 0;
3852 /* The total execution frequency of all nodes that directly use the
3853 result of this one. */
3854 sreal out_weight = 0;
3857 /* Information about one partition of the SLP graph, for use during
3858 vect_optimize_slp_pass. */
3860 struct slpg_partition_info
3862 /* The nodes in the partition occupy indices [NODE_BEGIN, NODE_END)
3863 of m_partitioned_nodes. */
3864 unsigned int node_begin = 0;
3865 unsigned int node_end = 0;
3867 /* Which layout we've chosen to use for this partition, or -1 if
3868 we haven't picked one yet. */
3869 int layout = -1;
3871 /* The number of predecessors and successors in the partition dag.
3872 The predecessors always have lower partition numbers and the
3873 successors always have higher partition numbers.
3875 Note that the directions of these edges are not necessarily the
3876 same as in the data flow graph. For example, if an SCC has separate
3877 partitions for an inner loop and an outer loop, the inner loop's
3878 partition will have at least two incoming edges from the outer loop's
3879 partition: one for a live-in value and one for a live-out value.
3880 In data flow terms, one of these edges would also be from the outer loop
3881 to the inner loop, but the other would be in the opposite direction. */
3882 unsigned int in_degree = 0;
3883 unsigned int out_degree = 0;
3886 /* Information about the costs of using a particular layout for a
3887 particular partition. It can also say that the combination is
3888 impossible. */
3890 struct slpg_partition_layout_costs
3892 bool is_possible () const { return internal_cost.is_possible (); }
3893 void mark_impossible () { internal_cost = slpg_layout_cost::impossible (); }
3895 /* The costs inherited from predecessor partitions. */
3896 slpg_layout_cost in_cost;
3898 /* The inherent cost of the layout within the node itself. For example,
3899 this is nonzero for a load if choosing a particular layout would require
3900 the load to permute the loaded elements. It is nonzero for a
3901 VEC_PERM_EXPR if the permutation cannot be eliminated or converted
3902 to full-vector moves. */
3903 slpg_layout_cost internal_cost;
3905 /* The costs inherited from successor partitions. */
3906 slpg_layout_cost out_cost;
3909 /* This class tries to optimize the layout of vectors in order to avoid
3910 unnecessary shuffling. At the moment, the set of possible layouts are
3911 restricted to bijective permutations.
3913 The goal of the pass depends on whether we're optimizing for size or
3914 for speed. When optimizing for size, the goal is to reduce the overall
3915 number of layout changes (including layout changes implied by things
3916 like load permutations). When optimizing for speed, the goal is to
3917 reduce the maximum latency attributable to layout changes on any
3918 non-cyclical path through the data flow graph.
3920 For example, when optimizing a loop nest for speed, we will prefer
3921 to make layout changes outside of a loop rather than inside of a loop,
3922 and will prefer to make layout changes in parallel rather than serially,
3923 even if that increases the overall number of layout changes.
3925 The high-level procedure is:
3927 (1) Build a graph in which edges go from uses (parents) to definitions
3928 (children).
3930 (2) Divide the graph into a dag of strongly-connected components (SCCs).
3932 (3) When optimizing for speed, partition the nodes in each SCC based
3933 on their containing cfg loop. When optimizing for size, treat
3934 each SCC as a single partition.
3936 This gives us a dag of partitions. The goal is now to assign a
3937 layout to each partition.
3939 (4) Construct a set of vector layouts that are worth considering.
3940 Record which nodes must keep their current layout.
3942 (5) Perform a forward walk over the partition dag (from loads to stores)
3943 accumulating the "forward" cost of using each layout. When visiting
3944 each partition, assign a tentative choice of layout to the partition
3945 and use that choice when calculating the cost of using a different
3946 layout in successor partitions.
3948 (6) Perform a backward walk over the partition dag (from stores to loads),
3949 accumulating the "backward" cost of using each layout. When visiting
3950 each partition, make a final choice of layout for that partition based
3951 on the accumulated forward costs (from (5)) and backward costs
3952 (from (6)).
3954 (7) Apply the chosen layouts to the SLP graph.
3956 For example, consider the SLP statements:
3958 S1: a_1 = load
3959 loop:
3960 S2: a_2 = PHI<a_1, a_3>
3961 S3: b_1 = load
3962 S4: a_3 = a_2 + b_1
3963 exit:
3964 S5: a_4 = PHI<a_3>
3965 S6: store a_4
3967 S2 and S4 form an SCC and are part of the same loop. Every other
3968 statement is in a singleton SCC. In this example there is a one-to-one
3969 mapping between SCCs and partitions and the partition dag looks like this:
3971       S1     S3
3972        \     /
3973        S2+S4
3974          |
3975         S5
3976          |
3977         S6
3979 S2, S3 and S4 will have a higher execution frequency than the other
3980 statements, so when optimizing for speed, the goal is to avoid any
3981 layout changes:
3983 - within S3
3984 - within S2+S4
3985 - on the S3->S2+S4 edge
3987 For example, if S3 was originally a reversing load, the goal of the
3988 pass is to make it an unreversed load and change the layout on the
3989 S1->S2+S4 and S2+S4->S5 edges to compensate. (Changing the layout
3990 on S1->S2+S4 and S5->S6 would also be acceptable.)
3992 The difference between SCCs and partitions becomes important if we
3993 add an outer loop:
3995 S1: a_1 = ...
3996 loop1:
3997 S2: a_2 = PHI<a_1, a_6>
3998 S3: b_1 = load
3999 S4: a_3 = a_2 + b_1
4000 loop2:
4001 S5: a_4 = PHI<a_3, a_5>
4002 S6: c_1 = load
4003 S7: a_5 = a_4 + c_1
4004 exit2:
4005 S8: a_6 = PHI<a_5>
4006 S9: store a_6
4007 exit1:
4009 Here, S2, S4, S5, S7 and S8 form a single SCC. However, when optimizing
4010 for speed, we usually do not want restrictions in the outer loop to "infect"
4011 the decision for the inner loop. For example, if an outer-loop node
4012 in the SCC contains a statement with a fixed layout, that should not
4013 prevent the inner loop from using a different layout. Conversely,
4014 the inner loop should not dictate a layout to the outer loop: if the
4015 outer loop does a lot of computation, then it may not be efficient to
4016 do all of that computation in the inner loop's preferred layout.
4018 So when optimizing for speed, we partition the SCC into S2+S4+S8 (outer)
4019 and S5+S7 (inner). We also try to arrange partitions so that:
4021 - the partition for an outer loop comes before the partition for
4022 an inner loop
4024 - if a sibling loop A dominates a sibling loop B, A's partition
4025 comes before B's
4027 This gives the following partition dag for the example above:
4029       S1        S3
4030        \        /
4031       S2+S4+S8   S6
4032        |   \\    /
4033        |   S5+S7
4034        |
4035       S9
4037 There are two edges from S2+S4+S8 to S5+S7: one for the edge S4->S5 and
4038 one for a reversal of the edge S7->S8.
4040 The backward walk picks a layout for S5+S7 before S2+S4+S8. The choice
4041 for S2+S4+S8 therefore has to balance the cost of using the outer loop's
4042 preferred layout against the cost of changing the layout on entry to the
4043 inner loop (S4->S5) and on exit from the inner loop (S7->S8 reversed).
4045 Although this works well when optimizing for speed, it has the downside
4046 when optimizing for size that the choice of layout for S5+S7 is completely
4047 independent of S9, which lessens the chance of reducing the overall number
4048 of permutations. We therefore do not partition SCCs when optimizing
4049 for size.
4051 To give a concrete example of the difference between optimizing
4052 for size and speed, consider:
4054 a[0] = (b[1] << c[3]) - d[1];
4055 a[1] = (b[0] << c[2]) - d[0];
4056 a[2] = (b[3] << c[1]) - d[3];
4057 a[3] = (b[2] << c[0]) - d[2];
4059 There are three different layouts here: one for a, one for b and d,
4060 and one for c. When optimizing for speed it is better to permute each
4061 of b, c and d into the order required by a, since those permutations
4062 happen in parallel. But when optimizing for size, it is better to:
4064 - permute c into the same order as b
4065 - do the arithmetic
4066 - permute the result into the order required by a
4068 This gives 2 permutations rather than 3. */
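/* As a concrete illustration of the "2 permutations rather than 3" choice
   above, here is a minimal standalone scalar sketch, using std::array and
   hypothetical "toy_" names rather than anything in this file.  The "size"
   strategy permutes c into b's (and d's) order, does the arithmetic in that
   order, and then permutes the result once into the order required by a.  */

#include <array>

typedef std::array<int, 4> toy_vec4;

/* Apply PERM to V, i.e. result[i] = V[PERM[i]].  */

static toy_vec4
toy_permute (const toy_vec4 &v, const toy_vec4 &perm)
{
  toy_vec4 r;
  for (int i = 0; i < 4; ++i)
    r[i] = v[perm[i]];
  return r;
}

static toy_vec4
toy_size_strategy (toy_vec4 b, toy_vec4 c, toy_vec4 d)
{
  /* One input permute: bring c into the order in which b and d are
     already laid out.  */
  c = toy_permute (c, { 2, 3, 0, 1 });
  toy_vec4 r;
  for (int i = 0; i < 4; ++i)
    r[i] = (b[i] << c[i]) - d[i];
  /* One output permute: produce the order required by a.  */
  return toy_permute (r, { 1, 0, 3, 2 });
}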
4070 class vect_optimize_slp_pass
4072 public:
4073 vect_optimize_slp_pass (vec_info *vinfo) : m_vinfo (vinfo) {}
4074 void run ();
4076 private:
4077 /* Graph building. */
4078 struct loop *containing_loop (slp_tree);
4079 bool is_cfg_latch_edge (graph_edge *);
4080 void build_vertices (hash_set<slp_tree> &, slp_tree);
4081 void build_vertices ();
4082 void build_graph ();
4084 /* Partitioning. */
4085 void create_partitions ();
4086 template<typename T> void for_each_partition_edge (unsigned int, T);
4088 /* Layout selection. */
4089 bool is_compatible_layout (slp_tree, unsigned int);
4090 int change_layout_cost (slp_tree, unsigned int, unsigned int);
4091 slpg_partition_layout_costs &partition_layout_costs (unsigned int,
4092 unsigned int);
4093 void change_vec_perm_layout (slp_tree, lane_permutation_t &,
4094 int, unsigned int);
4095 int internal_node_cost (slp_tree, int, unsigned int);
4096 void start_choosing_layouts ();
4098 /* Cost propagation. */
4099 slpg_layout_cost edge_layout_cost (graph_edge *, unsigned int,
4100 unsigned int, unsigned int);
4101 slpg_layout_cost total_in_cost (unsigned int);
4102 slpg_layout_cost forward_cost (graph_edge *, unsigned int, unsigned int);
4103 slpg_layout_cost backward_cost (graph_edge *, unsigned int, unsigned int);
4104 void forward_pass ();
4105 void backward_pass ();
4107 /* Rematerialization. */
4108 slp_tree get_result_with_layout (slp_tree, unsigned int);
4109 void materialize ();
4111 /* Clean-up. */
4112 void remove_redundant_permutations ();
4114 void dump ();
4116 vec_info *m_vinfo;
4118 /* True if we should optimize the graph for size, false if we should
4119 optimize it for speed. (It wouldn't be easy to make this decision
4120 more locally.) */
4121 bool m_optimize_size;
4123 /* A graph of all SLP nodes, with edges leading from uses to definitions.
4124 In other words, a node's predecessors are its slp_tree parents and
4125 a node's successors are its slp_tree children. */
4126 graph *m_slpg = nullptr;
4128 /* The vertices of M_SLPG, indexed by slp_tree::vertex. */
4129 auto_vec<slpg_vertex> m_vertices;
4131 /* The list of all leaves of M_SLPG, such as external definitions, constants,
4132 and loads. */
4133 auto_vec<int> m_leafs;
4135 /* This array has one entry for every vector layout that we're considering.
4136 Element 0 is null and indicates "no change". Other entries describe
4137 permutations that are inherent in the current graph and that we would
4138 like to reverse if possible.
4140 For example, a permutation { 1, 2, 3, 0 } means that something has
4141 effectively been permuted in that way, such as a load group
4142 { a[1], a[2], a[3], a[0] } (viewed as a permutation of a[0:3]).
4143 We'd then like to apply the reverse permutation { 3, 0, 1, 2 }
4144 in order to put things "back" in order. */
4145 auto_vec<vec<unsigned> > m_perms;
4147 /* A partitioning of the nodes for which a layout must be chosen.
4148 Each partition represents an <SCC, cfg loop> pair; that is,
4149 nodes in different SCCs belong to different partitions, and nodes
4150 within an SCC can be further partitioned according to a containing
4151 cfg loop. Partition <SCC1, L1> comes before <SCC2, L2> if:
4153 - SCC1 != SCC2 and SCC1 is a predecessor of SCC2 in a forward walk
4154 from leaves (such as loads) to roots (such as stores).
4156 - SCC1 == SCC2 and L1's header strictly dominates L2's header. */
4157 auto_vec<slpg_partition_info> m_partitions;
4159 /* The list of all nodes for which a layout must be chosen. Nodes for
4160 partition P come before the nodes for partition P+1. Nodes within a
4161 partition are in reverse postorder. */
4162 auto_vec<unsigned int> m_partitioned_nodes;
4164 /* Index P * num-layouts + L contains the cost of using layout L
4165 for partition P. */
4166 auto_vec<slpg_partition_layout_costs> m_partition_layout_costs;
4168 /* Index N * num-layouts + L, if nonnull, is a node that provides the
4169 original output of node N adjusted to have layout L. */
4170 auto_vec<slp_tree> m_node_layouts;
4173 /* Fill the vertices and leafs vector with all nodes in the SLP graph.
4174 Also record whether we should optimize anything for speed rather
4175 than size. */
4177 void
4178 vect_optimize_slp_pass::build_vertices (hash_set<slp_tree> &visited,
4179 slp_tree node)
4181 unsigned i;
4182 slp_tree child;
4184 if (visited.add (node))
4185 return;
4187 if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
4189 basic_block bb = gimple_bb (vect_orig_stmt (rep)->stmt);
4190 if (optimize_bb_for_speed_p (bb))
4191 m_optimize_size = false;
4194 node->vertex = m_vertices.length ();
4195 m_vertices.safe_push (slpg_vertex (node));
4197 bool leaf = true;
4198 bool force_leaf = false;
4199 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4200 if (child)
4202 leaf = false;
4203 build_vertices (visited, child);
4205 else
4206 force_leaf = true;
4207 /* Since SLP discovery works along use-def edges all cycles have an
4208 entry - but there's the exception of cycles where we do not handle
4209 the entry explicitly (but with a NULL SLP node), like some reductions
4210 and inductions. Force those SLP PHIs to act as leafs to make them
4211 backwards reachable. */
4212 if (leaf || force_leaf)
4213 m_leafs.safe_push (node->vertex);
4216 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
4218 void
4219 vect_optimize_slp_pass::build_vertices ()
4221 hash_set<slp_tree> visited;
4222 unsigned i;
4223 slp_instance instance;
4224 FOR_EACH_VEC_ELT (m_vinfo->slp_instances, i, instance)
4225 build_vertices (visited, SLP_INSTANCE_TREE (instance));
4228 /* Apply (reverse) bijective PERM to VEC. */
4230 template <class T>
4231 static void
4232 vect_slp_permute (vec<unsigned> perm,
4233 vec<T> &vec, bool reverse)
4235 auto_vec<T, 64> saved;
4236 saved.create (vec.length ());
4237 for (unsigned i = 0; i < vec.length (); ++i)
4238 saved.quick_push (vec[i]);
4240 if (reverse)
4242 for (unsigned i = 0; i < vec.length (); ++i)
4243 vec[perm[i]] = saved[i];
4244 for (unsigned i = 0; i < vec.length (); ++i)
4245 gcc_assert (vec[perm[i]] == saved[i]);
4247 else
4249 for (unsigned i = 0; i < vec.length (); ++i)
4250 vec[i] = saved[perm[i]];
4251 for (unsigned i = 0; i < vec.length (); ++i)
4252 gcc_assert (vec[i] == saved[perm[i]]);
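/* For example, given the layout permutation { 1, 2, 3, 0 } described for
   m_perms above, a forward application reorders { a, b, c, d } into
   { b, c, d, a } and a reverse application undoes that.  Below is a minimal
   standalone sketch of the same indexing, using std::vector and a
   hypothetical "toy_" name instead of the in-place update above.  */

#include <vector>

static std::vector<int>
toy_apply_perm (const std::vector<int> &v,
		const std::vector<unsigned> &perm, bool reverse)
{
  std::vector<int> r (v.size ());
  for (unsigned int i = 0; i < v.size (); ++i)
    if (reverse)
      r[perm[i]] = v[i];	/* Undo the permutation.  */
    else
      r[i] = v[perm[i]];	/* Apply the permutation.  */
  return r;
}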
4256 /* Return the cfg loop that contains NODE. */
4258 struct loop *
4259 vect_optimize_slp_pass::containing_loop (slp_tree node)
4261 stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
4262 if (!rep)
4263 return ENTRY_BLOCK_PTR_FOR_FN (cfun)->loop_father;
4264 return gimple_bb (vect_orig_stmt (rep)->stmt)->loop_father;
4267 /* Return true if UD (an edge from a use to a definition) is associated
4268 with a loop latch edge in the cfg. */
4270 bool
4271 vect_optimize_slp_pass::is_cfg_latch_edge (graph_edge *ud)
4273 slp_tree use = m_vertices[ud->src].node;
4274 slp_tree def = m_vertices[ud->dest].node;
4275 if (SLP_TREE_DEF_TYPE (use) != vect_internal_def
4276 || SLP_TREE_DEF_TYPE (def) != vect_internal_def)
4277 return false;
4279 stmt_vec_info use_rep = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (use));
4280 return (is_a<gphi *> (use_rep->stmt)
4281 && bb_loop_header_p (gimple_bb (use_rep->stmt))
4282 && containing_loop (def) == containing_loop (use));
4285 /* Build the graph. Mark edges that correspond to cfg loop latch edges with
4286 a nonnull data field. */
4288 void
4289 vect_optimize_slp_pass::build_graph ()
4291 m_optimize_size = true;
4292 build_vertices ();
4294 m_slpg = new_graph (m_vertices.length ());
4295 for (slpg_vertex &v : m_vertices)
4296 for (slp_tree child : SLP_TREE_CHILDREN (v.node))
4297 if (child)
4299 graph_edge *ud = add_edge (m_slpg, v.node->vertex, child->vertex);
4300 if (is_cfg_latch_edge (ud))
4301 ud->data = this;
4305 /* Return true if E corresponds to a loop latch edge in the cfg. */
4307 static bool
4308 skip_cfg_latch_edges (graph_edge *e)
4310 return e->data;
4313 /* Create the node partitions. */
4315 void
4316 vect_optimize_slp_pass::create_partitions ()
4318 /* Calculate a postorder of the graph, ignoring edges that correspond
4319 to natural latch edges in the cfg. Reading the vector from the end
4320 to the beginning gives the reverse postorder. */
4321 auto_vec<int> initial_rpo;
4322 graphds_dfs (m_slpg, &m_leafs[0], m_leafs.length (), &initial_rpo,
4323 false, NULL, skip_cfg_latch_edges);
4324 gcc_assert (initial_rpo.length () == m_vertices.length ());
4326 /* Calculate the strongly connected components of the graph. */
4327 auto_vec<int> scc_grouping;
4328 unsigned int num_sccs = graphds_scc (m_slpg, NULL, NULL, &scc_grouping);
4330 /* Create a new index order in which all nodes from the same SCC are
4331 consecutive. Use scc_pos to record the index of the first node in
4332 each SCC. */
4333 auto_vec<unsigned int> scc_pos (num_sccs);
4334 int last_component = -1;
4335 unsigned int node_count = 0;
4336 for (unsigned int node_i : scc_grouping)
4338 if (last_component != m_slpg->vertices[node_i].component)
4340 last_component = m_slpg->vertices[node_i].component;
4341 gcc_assert (last_component == int (scc_pos.length ()));
4342 scc_pos.quick_push (node_count);
4344 node_count += 1;
4346 gcc_assert (node_count == initial_rpo.length ()
4347 && last_component + 1 == int (num_sccs));
4349 /* Use m_partitioned_nodes to group nodes into SCC order, with the nodes
4350 inside each SCC following the RPO we calculated above. The fact that
4351 we ignored natural latch edges when calculating the RPO should ensure
4352 that, for natural loop nests:
4354 - the first node that we encounter in a cfg loop is the loop header phi
4355 - the loop header phis are in dominance order
4357 Arranging for this is an optimization (see below) rather than a
4358 correctness issue. Unnatural loops with a tangled mess of backedges
4359 will still work correctly, but might give poorer results.
4361 Also update scc_pos so that it gives 1 + the index of the last node
4362 in the SCC. */
4363 m_partitioned_nodes.safe_grow (node_count);
4364 for (unsigned int old_i = initial_rpo.length (); old_i-- > 0;)
4366 unsigned int node_i = initial_rpo[old_i];
4367 unsigned int new_i = scc_pos[m_slpg->vertices[node_i].component]++;
4368 m_partitioned_nodes[new_i] = node_i;
4371 /* When optimizing for speed, partition each SCC based on the containing
4372 cfg loop. The order we constructed above should ensure that, for natural
4373 cfg loops, we'll create sub-SCC partitions for outer loops before
4374 the corresponding sub-SCC partitions for inner loops. Similarly,
4375 when one sibling loop A dominates another sibling loop B, we should
4376 create a sub-SCC partition for A before a sub-SCC partition for B.
4378 As above, nothing depends for correctness on whether this achieves
4379 a natural nesting, but we should get better results when it does. */
4380 m_partitions.reserve (m_vertices.length ());
4381 unsigned int next_partition_i = 0;
4382 hash_map<struct loop *, int> loop_partitions;
4383 unsigned int rpo_begin = 0;
4384 unsigned int num_partitioned_nodes = 0;
4385 for (unsigned int rpo_end : scc_pos)
4387 loop_partitions.empty ();
4388 unsigned int partition_i = next_partition_i;
4389 for (unsigned int rpo_i = rpo_begin; rpo_i < rpo_end; ++rpo_i)
4391 /* Handle externals and constants optimistically throughout.
4392 But treat existing vectors as fixed since we do not handle
4393 permuting them. */
4394 unsigned int node_i = m_partitioned_nodes[rpo_i];
4395 auto &vertex = m_vertices[node_i];
4396 if ((SLP_TREE_DEF_TYPE (vertex.node) == vect_external_def
4397 && !SLP_TREE_VEC_DEFS (vertex.node).exists ())
4398 || SLP_TREE_DEF_TYPE (vertex.node) == vect_constant_def)
4399 vertex.partition = -1;
4400 else
4402 bool existed;
4403 if (m_optimize_size)
4404 existed = next_partition_i > partition_i;
4405 else
4407 struct loop *loop = containing_loop (vertex.node);
4408 auto &entry = loop_partitions.get_or_insert (loop, &existed);
4409 if (!existed)
4410 entry = next_partition_i;
4411 partition_i = entry;
4413 if (!existed)
4415 m_partitions.quick_push (slpg_partition_info ());
4416 next_partition_i += 1;
4418 vertex.partition = partition_i;
4419 num_partitioned_nodes += 1;
4420 m_partitions[partition_i].node_end += 1;
4423 rpo_begin = rpo_end;
4426 /* Assign ranges of consecutive node indices to each partition,
4427 in partition order. Start with node_end being the same as
4428 node_begin so that the next loop can use it as a counter. */
4429 unsigned int node_begin = 0;
4430 for (auto &partition : m_partitions)
4432 partition.node_begin = node_begin;
4433 node_begin += partition.node_end;
4434 partition.node_end = partition.node_begin;
4436 gcc_assert (node_begin == num_partitioned_nodes);
4438 /* Finally build the list of nodes in partition order. */
4439 m_partitioned_nodes.truncate (num_partitioned_nodes);
4440 for (unsigned int node_i = 0; node_i < m_vertices.length (); ++node_i)
4442 int partition_i = m_vertices[node_i].partition;
4443 if (partition_i >= 0)
4445 unsigned int order_i = m_partitions[partition_i].node_end++;
4446 m_partitioned_nodes[order_i] = node_i;
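/* The scc_pos scheme above is in effect a counting sort by SCC: it first
   records the start offset of each group and then bumps the offset as nodes
   are dropped into place.  Below is a minimal standalone sketch of that
   bucketing idea, using std::vector and hypothetical "toy_" names; the real
   code additionally walks the RPO backwards so that nodes within each SCC
   end up in reverse postorder.  */

#include <vector>

static std::vector<int>
toy_group_by_component (const std::vector<int> &order,
			const std::vector<int> &component,
			unsigned int num_groups)
{
  /* Start offsets: count the members of each group, then prefix-sum.  */
  std::vector<unsigned int> pos (num_groups + 1, 0);
  for (int n : order)
    pos[component[n] + 1] += 1;
  for (unsigned int g = 1; g <= num_groups; ++g)
    pos[g] += pos[g - 1];

  /* Place each node at the next free slot of its group, keeping the
     relative order of ORDER within each group.  */
  std::vector<int> grouped (order.size ());
  for (int n : order)
    grouped[pos[component[n]]++] = n;
  return grouped;
}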
4451 /* Look for edges from earlier partitions into node NODE_I and edges from
4452 node NODE_I into later partitions. Call:
4454 FN (ud, other_node_i)
4456 for each such use-to-def edge ud, where other_node_i is the node at the
4457 other end of the edge. */
4459 template<typename T>
4460 void
4461 vect_optimize_slp_pass::for_each_partition_edge (unsigned int node_i, T fn)
4463 int partition_i = m_vertices[node_i].partition;
4464 for (graph_edge *pred = m_slpg->vertices[node_i].pred;
4465 pred; pred = pred->pred_next)
4467 int src_partition_i = m_vertices[pred->src].partition;
4468 if (src_partition_i >= 0 && src_partition_i != partition_i)
4469 fn (pred, pred->src);
4471 for (graph_edge *succ = m_slpg->vertices[node_i].succ;
4472 succ; succ = succ->succ_next)
4474 int dest_partition_i = m_vertices[succ->dest].partition;
4475 if (dest_partition_i >= 0 && dest_partition_i != partition_i)
4476 fn (succ, succ->dest);
4480 /* Return true if layout LAYOUT_I is compatible with the number of SLP lanes
4481 that NODE would operate on. This test is independent of NODE's actual
4482 operation. */
4484 bool
4485 vect_optimize_slp_pass::is_compatible_layout (slp_tree node,
4486 unsigned int layout_i)
4488 if (layout_i == 0)
4489 return true;
4491 if (SLP_TREE_LANES (node) != m_perms[layout_i].length ())
4492 return false;
4494 return true;
4497 /* Return the cost (in arbitrary units) of going from layout FROM_LAYOUT_I
4498 to layout TO_LAYOUT_I for a node like NODE. Return -1 if either of the
4499 layouts is incompatible with NODE or if the change is not possible for
4500 some other reason.
4502 The properties taken from NODE include the number of lanes and the
4503 vector type. The actual operation doesn't matter. */
4506 vect_optimize_slp_pass::change_layout_cost (slp_tree node,
4507 unsigned int from_layout_i,
4508 unsigned int to_layout_i)
4510 if (!is_compatible_layout (node, from_layout_i)
4511 || !is_compatible_layout (node, to_layout_i))
4512 return -1;
4514 if (from_layout_i == to_layout_i)
4515 return 0;
4517 auto_vec<slp_tree, 1> children (1);
4518 children.quick_push (node);
4519 auto_lane_permutation_t perm (SLP_TREE_LANES (node));
4520 if (from_layout_i > 0)
4521 for (unsigned int i : m_perms[from_layout_i])
4522 perm.quick_push ({ 0, i });
4523 else
4524 for (unsigned int i = 0; i < SLP_TREE_LANES (node); ++i)
4525 perm.quick_push ({ 0, i });
4526 if (to_layout_i > 0)
4527 vect_slp_permute (m_perms[to_layout_i], perm, true);
4528 auto count = vectorizable_slp_permutation_1 (m_vinfo, nullptr, node, perm,
4529 children, false);
4530 if (count >= 0)
4531 return MAX (count, 1);
4533 /* ??? In principle we could try changing via layout 0, giving two
4534 layout changes rather than 1. Doing that would require
4535 corresponding support in get_result_with_layout. */
4536 return -1;
4539 /* Return the costs of assigning layout LAYOUT_I to partition PARTITION_I. */
4541 inline slpg_partition_layout_costs &
4542 vect_optimize_slp_pass::partition_layout_costs (unsigned int partition_i,
4543 unsigned int layout_i)
4545 return m_partition_layout_costs[partition_i * m_perms.length () + layout_i];
4548 /* Change PERM in one of two ways:
4550 - if IN_LAYOUT_I < 0, accept input operand I in the layout that has been
4551 chosen for child I of NODE.
4553 - if IN_LAYOUT_I >= 0, accept all input operands with that layout.
4555 In both cases, arrange for the output to have layout OUT_LAYOUT_I. */
4557 void
4558 vect_optimize_slp_pass::
4559 change_vec_perm_layout (slp_tree node, lane_permutation_t &perm,
4560 int in_layout_i, unsigned int out_layout_i)
4562 for (auto &entry : perm)
4564 int this_in_layout_i = in_layout_i;
4565 if (this_in_layout_i < 0)
4567 slp_tree in_node = SLP_TREE_CHILDREN (node)[entry.first];
4568 unsigned int in_partition_i = m_vertices[in_node->vertex].partition;
4569 this_in_layout_i = m_partitions[in_partition_i].layout;
4571 if (this_in_layout_i > 0)
4572 entry.second = m_perms[this_in_layout_i][entry.second];
4574 if (out_layout_i > 0)
4575 vect_slp_permute (m_perms[out_layout_i], perm, true);
4578 /* Check whether the target allows NODE to be rearranged so that the node's
4579 output has layout OUT_LAYOUT_I. Return the cost of the change if so,
4580 in the same arbitrary units as for change_layout_cost. Return -1 otherwise.
4582 If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I < 0, also check whether
4583 NODE can adapt to the layout changes that have (perhaps provisionally)
4584 been chosen for NODE's children, so that no extra permutations are
4585 needed on either the input or the output of NODE.
4587 If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I >= 0, instead assume
4588 that all inputs will be forced into layout IN_LAYOUT_I beforehand.
4590 IN_LAYOUT_I has no meaning for other types of node.
4592 Keeping the node as-is is always valid. If the target doesn't appear
4593 to support the node as-is, but might realistically support other layouts,
4594 then layout 0 instead has the cost of a worst-case permutation. On the
4595 one hand, this ensures that every node has at least one valid layout,
4596 avoiding what would otherwise be an awkward special case. On the other hand,
4597 it still encourages the pass to change an invalid pre-existing layout
4598 choice into a valid one. */
4601 vect_optimize_slp_pass::internal_node_cost (slp_tree node, int in_layout_i,
4602 unsigned int out_layout_i)
4604 const int fallback_cost = 1;
4606 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
4608 auto_lane_permutation_t tmp_perm;
4609 tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
4611 /* Check that the child nodes support the chosen layout. Checking
4612 the first child is enough, since any second child would have the
4613 same shape. */
4614 auto first_child = SLP_TREE_CHILDREN (node)[0];
4615 if (in_layout_i > 0
4616 && !is_compatible_layout (first_child, in_layout_i))
4617 return -1;
4619 change_vec_perm_layout (node, tmp_perm, in_layout_i, out_layout_i);
4620 int count = vectorizable_slp_permutation_1 (m_vinfo, nullptr,
4621 node, tmp_perm,
4622 SLP_TREE_CHILDREN (node),
4623 false);
4624 if (count < 0)
4626 if (in_layout_i == 0 && out_layout_i == 0)
4628 /* Use the fallback cost if the node could in principle support
4629 some nonzero layout for both the inputs and the outputs.
4630 Otherwise assume that the node will be rejected later
4631 and rebuilt from scalars. */
4632 if (SLP_TREE_LANES (node) == SLP_TREE_LANES (first_child))
4633 return fallback_cost;
4634 return 0;
4636 return -1;
4639 /* We currently have no way of telling whether the new layout is cheaper
4640 or more expensive than the old one. But at least in principle,
4641 it should be worth making zero permutations (whole-vector shuffles)
4642 cheaper than real permutations, in case the pass is able to remove
4643 the latter. */
4644 return count == 0 ? 0 : 1;
4647 stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
4648 if (rep
4649 && STMT_VINFO_DATA_REF (rep)
4650 && DR_IS_READ (STMT_VINFO_DATA_REF (rep))
4651 && SLP_TREE_LOAD_PERMUTATION (node).exists ())
4653 auto_load_permutation_t tmp_perm;
4654 tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
4655 if (out_layout_i > 0)
4656 vect_slp_permute (m_perms[out_layout_i], tmp_perm, true);
4658 poly_uint64 vf = 1;
4659 if (auto loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo))
4660 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
4661 unsigned int n_perms;
4662 if (!vect_transform_slp_perm_load_1 (m_vinfo, node, tmp_perm, vNULL,
4663 nullptr, vf, true, false, &n_perms))
4665 auto rep = SLP_TREE_REPRESENTATIVE (node);
4666 if (out_layout_i == 0)
4668 /* Use the fallback cost if the load is an N-to-N permutation.
4669 Otherwise assume that the node will be rejected later
4670 and rebuilt from scalars. */
4671 if (STMT_VINFO_GROUPED_ACCESS (rep)
4672 && (DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (rep))
4673 == SLP_TREE_LANES (node)))
4674 return fallback_cost;
4675 return 0;
4677 return -1;
4680 /* See the comment above the corresponding VEC_PERM_EXPR handling. */
4681 return n_perms == 0 ? 0 : 1;
4684 return 0;
4687 /* Decide which element layouts we should consider using. Calculate the
4688 weights associated with inserting layout changes on partition edges.
4689 Also mark partitions that cannot change layout, by setting their
4690 layout to zero. */
4692 void
4693 vect_optimize_slp_pass::start_choosing_layouts ()
4695 /* Used to assign unique permutation indices. */
4696 using perm_hash = unbounded_hashmap_traits<
4697 vec_free_hash_base<int_hash_base<unsigned>>,
4698 int_hash<int, -1, -2>
4700 hash_map<vec<unsigned>, int, perm_hash> layout_ids;
4702 /* Layout 0 is "no change". */
4703 m_perms.safe_push (vNULL);
4705 /* Create layouts from existing permutations. */
4706 auto_load_permutation_t tmp_perm;
4707 for (unsigned int node_i : m_partitioned_nodes)
4709 /* Leafs also double as entries to the reverse graph. Allow the
4710 layout of those to be changed. */
4711 auto &vertex = m_vertices[node_i];
4712 auto &partition = m_partitions[vertex.partition];
4713 if (!m_slpg->vertices[node_i].succ)
4714 partition.layout = 0;
4716 /* Loads and VEC_PERM_EXPRs are the only things generating permutes. */
4717 slp_tree node = vertex.node;
4718 stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node);
4719 slp_tree child;
4720 unsigned HOST_WIDE_INT imin, imax = 0;
4721 bool any_permute = false;
4722 tmp_perm.truncate (0);
4723 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
4725 /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the node
4726 unpermuted, record a layout that reverses this permutation.
4728 We would need more work to cope with loads that are internally
4729 permuted and also have inputs (such as masks for
4730 IFN_MASK_LOADs). */
4731 gcc_assert (partition.layout == 0 && !m_slpg->vertices[node_i].succ);
4732 if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
4734 partition.layout = -1;
4735 continue;
4737 dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt);
4738 imin = DR_GROUP_SIZE (dr_stmt) + 1;
4739 tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
4741 else if (SLP_TREE_CODE (node) == VEC_PERM_EXPR
4742 && SLP_TREE_CHILDREN (node).length () == 1
4743 && (child = SLP_TREE_CHILDREN (node)[0])
4744 && (TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (child))
4745 .is_constant (&imin)))
4747 /* If the child has the same vector size as this node,
4748 reversing the permutation can make the permutation a no-op.
4749 In other cases it can change a true permutation into a
4750 full-vector extract. */
4751 tmp_perm.reserve (SLP_TREE_LANES (node));
4752 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4753 tmp_perm.quick_push (SLP_TREE_LANE_PERMUTATION (node)[j].second);
4755 else
4756 continue;
4758 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4760 unsigned idx = tmp_perm[j];
4761 imin = MIN (imin, idx);
4762 imax = MAX (imax, idx);
4763 if (idx - tmp_perm[0] != j)
4764 any_permute = true;
4766 /* If the span doesn't match we'd disrupt VF computation, avoid
4767 that for now. */
4768 if (imax - imin + 1 != SLP_TREE_LANES (node))
4769 continue;
4770 /* If there's no permute there's no need to split one out. In this case
4771 we can consider turning a load into a permuted load, if that
4772 turns out to be cheaper than alternatives. */
4773 if (!any_permute)
4775 partition.layout = -1;
4776 continue;
4779 /* For now only handle true permutes, like
4780 vect_attempt_slp_rearrange_stmts did. This allows us to be lazy
4781 when permuting constants and invariants keeping the permute
4782 bijective. */
4783 auto_sbitmap load_index (SLP_TREE_LANES (node));
4784 bitmap_clear (load_index);
4785 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4786 bitmap_set_bit (load_index, tmp_perm[j] - imin);
4787 unsigned j;
4788 for (j = 0; j < SLP_TREE_LANES (node); ++j)
4789 if (!bitmap_bit_p (load_index, j))
4790 break;
4791 if (j != SLP_TREE_LANES (node))
4792 continue;
4794 vec<unsigned> perm = vNULL;
4795 perm.safe_grow (SLP_TREE_LANES (node), true);
4796 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4797 perm[j] = tmp_perm[j] - imin;
4799 if (int (m_perms.length ()) >= param_vect_max_layout_candidates)
4801 /* Continue to use existing layouts, but don't add any more. */
4802 int *entry = layout_ids.get (perm);
4803 partition.layout = entry ? *entry : 0;
4804 perm.release ();
4806 else
4808 bool existed;
4809 int &layout_i = layout_ids.get_or_insert (perm, &existed);
4810 if (existed)
4811 perm.release ();
4812 else
4814 layout_i = m_perms.length ();
4815 m_perms.safe_push (perm);
4817 partition.layout = layout_i;
4821 /* Initially assume that every layout is possible and has zero cost
4822 in every partition. */
4823 m_partition_layout_costs.safe_grow_cleared (m_partitions.length ()
4824 * m_perms.length ());
4826 /* We have to mark outgoing permutations facing non-associating-reduction
4827 graph entries that are not represented as to be materialized.
4828 slp_inst_kind_bb_reduc currently only covers associatable reductions. */
4829 for (slp_instance instance : m_vinfo->slp_instances)
4830 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor)
4832 unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex;
4833 m_partitions[m_vertices[node_i].partition].layout = 0;
4835 else if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_reduc_chain)
4837 stmt_vec_info stmt_info
4838 = SLP_TREE_REPRESENTATIVE (SLP_INSTANCE_TREE (instance));
4839 stmt_vec_info reduc_info = info_for_reduction (m_vinfo, stmt_info);
4840 if (needs_fold_left_reduction_p (TREE_TYPE
4841 (gimple_get_lhs (stmt_info->stmt)),
4842 STMT_VINFO_REDUC_CODE (reduc_info)))
4844 unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex;
4845 m_partitions[m_vertices[node_i].partition].layout = 0;
4849 /* Check which layouts each node and partition can handle. Calculate the
4850 weights associated with inserting layout changes on edges. */
4851 for (unsigned int node_i : m_partitioned_nodes)
4853 auto &vertex = m_vertices[node_i];
4854 auto &partition = m_partitions[vertex.partition];
4855 slp_tree node = vertex.node;
4857 if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
4859 vertex.weight = vect_slp_node_weight (node);
4861 /* We do not handle stores with a permutation, so all
4862 incoming permutations must have been materialized.
4864 We also don't handle masked grouped loads, which lack a
4865 permutation vector. In this case the memory locations
4866 form an implicit second input to the loads, on top of the
4867 explicit mask input, and the memory input's layout cannot
4868 be changed.
4870 On the other hand, we do support permuting gather loads and
4871 masked gather loads, where each scalar load is independent
4872 of the others. This can be useful if the address/index input
4873 benefits from permutation. */
4874 if (STMT_VINFO_DATA_REF (rep)
4875 && STMT_VINFO_GROUPED_ACCESS (rep)
4876 && !SLP_TREE_LOAD_PERMUTATION (node).exists ())
4877 partition.layout = 0;
4879 /* We cannot change the layout of an operation that does not
4880 work on each lane independently. Note this is an explicit
4881 negative list since that's much shorter than the respective
4882 positive one, but it's critical to keep maintaining it. */
4883 if (is_gimple_call (STMT_VINFO_STMT (rep)))
4884 switch (gimple_call_combined_fn (STMT_VINFO_STMT (rep)))
4886 case CFN_COMPLEX_ADD_ROT90:
4887 case CFN_COMPLEX_ADD_ROT270:
4888 case CFN_COMPLEX_MUL:
4889 case CFN_COMPLEX_MUL_CONJ:
4890 case CFN_VEC_ADDSUB:
4891 case CFN_VEC_FMADDSUB:
4892 case CFN_VEC_FMSUBADD:
4893 partition.layout = 0;
4894 default:;
4898 auto process_edge = [&](graph_edge *ud, unsigned int other_node_i)
4900 auto &other_vertex = m_vertices[other_node_i];
4902 /* Count the number of edges from earlier partitions and the number
4903 of edges to later partitions. */
4904 if (other_vertex.partition < vertex.partition)
4905 partition.in_degree += 1;
4906 else
4907 partition.out_degree += 1;
4909 /* If the current node uses the result of OTHER_NODE_I, accumulate
4910 the effects of that. */
4911 if (ud->src == int (node_i))
4913 other_vertex.out_weight += vertex.weight;
4914 other_vertex.out_degree += 1;
4917 for_each_partition_edge (node_i, process_edge);
4921 /* Return the incoming costs for node NODE_I, assuming that each input keeps
4922 its current (provisional) choice of layout. The inputs do not necessarily
4923 have the same layout as each other. */
4925 slpg_layout_cost
4926 vect_optimize_slp_pass::total_in_cost (unsigned int node_i)
4928 auto &vertex = m_vertices[node_i];
4929 slpg_layout_cost cost;
4930 auto add_cost = [&](graph_edge *, unsigned int other_node_i)
4932 auto &other_vertex = m_vertices[other_node_i];
4933 if (other_vertex.partition < vertex.partition)
4935 auto &other_partition = m_partitions[other_vertex.partition];
4936 auto &other_costs = partition_layout_costs (other_vertex.partition,
4937 other_partition.layout);
4938 slpg_layout_cost this_cost = other_costs.in_cost;
4939 this_cost.add_serial_cost (other_costs.internal_cost);
4940 this_cost.split (other_partition.out_degree);
4941 cost.add_parallel_cost (this_cost);
4944 for_each_partition_edge (node_i, add_cost);
4945 return cost;
4948 /* Return the cost of switching between layout LAYOUT1_I (at node NODE1_I)
4949 and layout LAYOUT2_I on cross-partition use-to-def edge UD. Return
4950 slpg_layout_cost::impossible () if the change isn't possible. */
4952 slpg_layout_cost
4953 vect_optimize_slp_pass::
4954 edge_layout_cost (graph_edge *ud, unsigned int node1_i, unsigned int layout1_i,
4955 unsigned int layout2_i)
4957 auto &def_vertex = m_vertices[ud->dest];
4958 auto &use_vertex = m_vertices[ud->src];
4959 auto def_layout_i = ud->dest == int (node1_i) ? layout1_i : layout2_i;
4960 auto use_layout_i = ud->dest == int (node1_i) ? layout2_i : layout1_i;
4961 auto factor = change_layout_cost (def_vertex.node, def_layout_i,
4962 use_layout_i);
4963 if (factor < 0)
4964 return slpg_layout_cost::impossible ();
4966 /* We have a choice of putting the layout change at the site of the
4967 definition or at the site of the use. Prefer the former when
4968 optimizing for size or when the execution frequency of the
4969 definition is no greater than the combined execution frequencies of
4970 the uses. When putting the layout change at the site of the definition,
4971 divvy up the cost among all consumers. */
4972 if (m_optimize_size || def_vertex.weight <= def_vertex.out_weight)
4974 slpg_layout_cost cost = { def_vertex.weight * factor, m_optimize_size };
4975 cost.split (def_vertex.out_degree);
4976 return cost;
4978 return { use_vertex.weight * factor, m_optimize_size };
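/* For example, if a definition executed 10 times feeds two uses executed 50
   times each, changing the layout once at the definition (weight 10, split
   two ways) is cheaper than changing it at either use (weight 50).  Below is
   a minimal standalone sketch of that placement choice for the "total"
   metric, using double weights and hypothetical "toy_" names and ignoring
   the optimize-for-size normalization.  */

static double
toy_edge_change_cost (double def_weight, double def_out_weight,
		      unsigned int def_out_degree, double use_weight,
		      double factor)
{
  if (def_weight <= def_out_weight)
    {
      /* Change the layout at the definition and share the cost among
	 all of its consumers.  */
      double cost = def_weight * factor;
      if (def_out_degree > 1)
	cost /= def_out_degree;
      return cost;
    }
  /* Otherwise change the layout at this use only.  */
  return use_weight * factor;
}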
4981 /* UD represents a use-def link between FROM_NODE_I and a node in a later
4982 partition; FROM_NODE_I could be the definition node or the use node.
4983 The node at the other end of the link wants to use layout TO_LAYOUT_I.
4984 Return the cost of any necessary fix-ups on edge UD, or return
4985 slpg_layout_cost::impossible () if the change isn't possible.
4987 At this point, FROM_NODE_I's partition has chosen the cheapest
4988 layout based on the information available so far, but this choice
4989 is only provisional. */
4991 slpg_layout_cost
4992 vect_optimize_slp_pass::forward_cost (graph_edge *ud, unsigned int from_node_i,
4993 unsigned int to_layout_i)
4995 auto &from_vertex = m_vertices[from_node_i];
4996 unsigned int from_partition_i = from_vertex.partition;
4997 slpg_partition_info &from_partition = m_partitions[from_partition_i];
4998 gcc_assert (from_partition.layout >= 0);
5000 /* First calculate the cost on the assumption that FROM_PARTITION sticks
5001 with its current layout preference. */
5002 slpg_layout_cost cost = slpg_layout_cost::impossible ();
5003 auto edge_cost = edge_layout_cost (ud, from_node_i,
5004 from_partition.layout, to_layout_i);
5005 if (edge_cost.is_possible ())
5007 auto &from_costs = partition_layout_costs (from_partition_i,
5008 from_partition.layout);
5009 cost = from_costs.in_cost;
5010 cost.add_serial_cost (from_costs.internal_cost);
5011 cost.split (from_partition.out_degree);
5012 cost.add_serial_cost (edge_cost);
5015 /* Take the minimum of that cost and the cost that applies if
5016 FROM_PARTITION instead switches to TO_LAYOUT_I. */
5017 auto &direct_layout_costs = partition_layout_costs (from_partition_i,
5018 to_layout_i);
5019 if (direct_layout_costs.is_possible ())
5021 slpg_layout_cost direct_cost = direct_layout_costs.in_cost;
5022 direct_cost.add_serial_cost (direct_layout_costs.internal_cost);
5023 direct_cost.split (from_partition.out_degree);
5024 if (!cost.is_possible ()
5025 || direct_cost.is_better_than (cost, m_optimize_size))
5026 cost = direct_cost;
5029 return cost;
5032 /* UD represents a use-def link between TO_NODE_I and a node in an earlier
5033 partition; TO_NODE_I could be the definition node or the use node.
5034 The node at the other end of the link wants to use layout FROM_LAYOUT_I;
5035 return the cost of any necessary fix-ups on edge UD, or
5036 slpg_layout_cost::impossible () if the choice cannot be made.
5038 At this point, TO_NODE_I's partition has a fixed choice of layout. */
5040 slpg_layout_cost
5041 vect_optimize_slp_pass::backward_cost (graph_edge *ud, unsigned int to_node_i,
5042 unsigned int from_layout_i)
5044 auto &to_vertex = m_vertices[to_node_i];
5045 unsigned int to_partition_i = to_vertex.partition;
5046 slpg_partition_info &to_partition = m_partitions[to_partition_i];
5047 gcc_assert (to_partition.layout >= 0);
5049 /* If TO_NODE_I is a VEC_PERM_EXPR consumer, see whether it can be
5050 adjusted for this input having layout FROM_LAYOUT_I. Assume that
5051 any other inputs keep their current choice of layout. */
5052 auto &to_costs = partition_layout_costs (to_partition_i,
5053 to_partition.layout);
5054 if (ud->src == int (to_node_i)
5055 && SLP_TREE_CODE (to_vertex.node) == VEC_PERM_EXPR)
5057 auto &from_partition = m_partitions[m_vertices[ud->dest].partition];
5058 auto old_layout = from_partition.layout;
5059 from_partition.layout = from_layout_i;
5060 int factor = internal_node_cost (to_vertex.node, -1,
5061 to_partition.layout);
5062 from_partition.layout = old_layout;
5063 if (factor >= 0)
5065 slpg_layout_cost cost = to_costs.out_cost;
5066 cost.add_serial_cost ({ to_vertex.weight * factor,
5067 m_optimize_size });
5068 cost.split (to_partition.in_degree);
5069 return cost;
5073 /* Compute the cost if we insert any necessary layout change on edge UD. */
5074 auto edge_cost = edge_layout_cost (ud, to_node_i,
5075 to_partition.layout, from_layout_i);
5076 if (edge_cost.is_possible ())
5078 slpg_layout_cost cost = to_costs.out_cost;
5079 cost.add_serial_cost (to_costs.internal_cost);
5080 cost.split (to_partition.in_degree);
5081 cost.add_serial_cost (edge_cost);
5082 return cost;
5085 return slpg_layout_cost::impossible ();
5088 /* Make a forward pass through the partitions, accumulating input costs.
5089 Make a tentative (provisional) choice of layout for each partition,
5090 ensuring that this choice still allows later partitions to keep
5091 their original layout. */
5093 void
5094 vect_optimize_slp_pass::forward_pass ()
5096 for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
5097 ++partition_i)
5099 auto &partition = m_partitions[partition_i];
5101 /* If the partition consists of a single VEC_PERM_EXPR, precompute
5102 the incoming cost that would apply if every predecessor partition
5103 keeps its current layout. This is used within the loop below. */
5104 slpg_layout_cost in_cost;
5105 slp_tree single_node = nullptr;
5106 if (partition.node_end == partition.node_begin + 1)
5108 unsigned int node_i = m_partitioned_nodes[partition.node_begin];
5109 single_node = m_vertices[node_i].node;
5110 if (SLP_TREE_CODE (single_node) == VEC_PERM_EXPR)
5111 in_cost = total_in_cost (node_i);
5114 /* Go through the possible layouts. Decide which ones are valid
5115 for this partition and record which of the valid layouts has
5116 the lowest cost. */
5117 unsigned int min_layout_i = 0;
5118 slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
5119 for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
5121 auto &layout_costs = partition_layout_costs (partition_i, layout_i);
5122 if (!layout_costs.is_possible ())
5123 continue;
5125 /* If the recorded layout is already 0 then the layout cannot
5126 change. */
5127 if (partition.layout == 0 && layout_i != 0)
5129 layout_costs.mark_impossible ();
5130 continue;
5133 bool is_possible = true;
5134 for (unsigned int order_i = partition.node_begin;
5135 order_i < partition.node_end; ++order_i)
5137 unsigned int node_i = m_partitioned_nodes[order_i];
5138 auto &vertex = m_vertices[node_i];
5140 /* Reject the layout if it is individually incompatible
5141 with any node in the partition. */
5142 if (!is_compatible_layout (vertex.node, layout_i))
5144 is_possible = false;
5145 break;
5148 auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
5150 auto &other_vertex = m_vertices[other_node_i];
5151 if (other_vertex.partition < vertex.partition)
5153 /* Accumulate the incoming costs from earlier
5154 partitions, plus the cost of any layout changes
5155 on UD itself. */
5156 auto cost = forward_cost (ud, other_node_i, layout_i);
5157 if (!cost.is_possible ())
5158 is_possible = false;
5159 else
5160 layout_costs.in_cost.add_parallel_cost (cost);
5162 else
5163 /* Reject the layout if it would make layout 0 impossible
5164 for later partitions. This amounts to testing that the
5165 target supports reversing the layout change on edges
5166 to later partitions.
5168 In principle, it might be possible to push a layout
5169 change all the way down a graph, so that it never
5170 needs to be reversed and so that the target doesn't
5171 need to support the reverse operation. But it would
5172 be awkward to bail out if we hit a partition that
5173 does not support the new layout, especially since
5174 we are not dealing with a lattice. */
5175 is_possible &= edge_layout_cost (ud, other_node_i, 0,
5176 layout_i).is_possible ();
5178 for_each_partition_edge (node_i, add_cost);
5180 /* Accumulate the cost of using LAYOUT_I within NODE,
5181 both for the inputs and the outputs. */
5182 int factor = internal_node_cost (vertex.node, layout_i,
5183 layout_i);
5184 if (factor < 0)
5186 is_possible = false;
5187 break;
5189 else if (factor)
5190 layout_costs.internal_cost.add_serial_cost
5191 ({ vertex.weight * factor, m_optimize_size });
5193 if (!is_possible)
5195 layout_costs.mark_impossible ();
5196 continue;
5199 /* Combine the incoming and partition-internal costs. */
5200 slpg_layout_cost combined_cost = layout_costs.in_cost;
5201 combined_cost.add_serial_cost (layout_costs.internal_cost);
5203 /* If this partition consists of a single VEC_PERM_EXPR, see
5204 if the VEC_PERM_EXPR can be changed to support output layout
5205 LAYOUT_I while keeping all the provisional choices of input
5206 layout. */
5207 if (single_node
5208 && SLP_TREE_CODE (single_node) == VEC_PERM_EXPR)
5210 int factor = internal_node_cost (single_node, -1, layout_i);
5211 if (factor >= 0)
5213 auto weight = m_vertices[single_node->vertex].weight;
5214 slpg_layout_cost internal_cost
5215 = { weight * factor, m_optimize_size };
5217 slpg_layout_cost alt_cost = in_cost;
5218 alt_cost.add_serial_cost (internal_cost);
5219 if (alt_cost.is_better_than (combined_cost, m_optimize_size))
5221 combined_cost = alt_cost;
5222 layout_costs.in_cost = in_cost;
5223 layout_costs.internal_cost = internal_cost;
5228 /* Record the layout with the lowest cost. Prefer layout 0 in
5229 the event of a tie between it and another layout. */
5230 if (!min_layout_cost.is_possible ()
5231 || combined_cost.is_better_than (min_layout_cost,
5232 m_optimize_size))
5234 min_layout_i = layout_i;
5235 min_layout_cost = combined_cost;
5239 /* This loop's handling of earlier partitions should ensure that
5240 choosing the original layout for the current partition is no
5241 less valid than it was in the original graph, even with the
5242 provisional layout choices for those earlier partitions. */
5243 gcc_assert (min_layout_cost.is_possible ());
5244 partition.layout = min_layout_i;
5248 /* Make a backward pass through the partitions, accumulating output costs.
5249 Make a final choice of layout for each partition. */
5251 void
5252 vect_optimize_slp_pass::backward_pass ()
5254 for (unsigned int partition_i = m_partitions.length (); partition_i-- > 0;)
5256 auto &partition = m_partitions[partition_i];
5258 unsigned int min_layout_i = 0;
5259 slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
5260 for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
5262 auto &layout_costs = partition_layout_costs (partition_i, layout_i);
5263 if (!layout_costs.is_possible ())
5264 continue;
5266 /* Accumulate the costs from successor partitions. */
5267 bool is_possible = true;
5268 for (unsigned int order_i = partition.node_begin;
5269 order_i < partition.node_end; ++order_i)
5271 unsigned int node_i = m_partitioned_nodes[order_i];
5272 auto &vertex = m_vertices[node_i];
5273 auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
5275 auto &other_vertex = m_vertices[other_node_i];
5276 auto &other_partition = m_partitions[other_vertex.partition];
5277 if (other_vertex.partition > vertex.partition)
5279 /* Accumulate the incoming costs from later
5280 partitions, plus the cost of any layout changes
5281 on UD itself. */
5282 auto cost = backward_cost (ud, other_node_i, layout_i);
5283 if (!cost.is_possible ())
5284 is_possible = false;
5285 else
5286 layout_costs.out_cost.add_parallel_cost (cost);
5288 else
5289 /* Make sure that earlier partitions can (if necessary
5290 or beneficial) keep the layout that they chose in
5291 the forward pass. This ensures that there is at
5292 least one valid choice of layout. */
5293 is_possible &= edge_layout_cost (ud, other_node_i,
5294 other_partition.layout,
5295 layout_i).is_possible ();
5297 for_each_partition_edge (node_i, add_cost);
5299 if (!is_possible)
5301 layout_costs.mark_impossible ();
5302 continue;
5305 /* Locally combine the costs from the forward and backward passes.
5306 (This combined cost is not passed on, since that would lead
5307 to double counting.) */
5308 slpg_layout_cost combined_cost = layout_costs.in_cost;
5309 combined_cost.add_serial_cost (layout_costs.internal_cost);
5310 combined_cost.add_serial_cost (layout_costs.out_cost);
5312 /* Record the layout with the lowest cost. Prefer layout 0 in
5313 the event of a tie between it and another layout. */
5314 if (!min_layout_cost.is_possible ()
5315 || combined_cost.is_better_than (min_layout_cost,
5316 m_optimize_size))
5318 min_layout_i = layout_i;
5319 min_layout_cost = combined_cost;
5323 gcc_assert (min_layout_cost.is_possible ());
5324 partition.layout = min_layout_i;
5328 /* Return a node that applies layout TO_LAYOUT_I to the original form of NODE.
5329 NODE already has the layout that was selected for its partition. */
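/* Results are cached in m_node_layouts, indexed by the node's vertex number
   and TO_LAYOUT_I, so each layout of a given node is materialized at most
   once. */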
5331 slp_tree
5332 vect_optimize_slp_pass::get_result_with_layout (slp_tree node,
5333 unsigned int to_layout_i)
5335 unsigned int result_i = node->vertex * m_perms.length () + to_layout_i;
5336 slp_tree result = m_node_layouts[result_i];
5337 if (result)
5338 return result;
5340 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
5341 || (SLP_TREE_DEF_TYPE (node) == vect_external_def
5342 /* We can't permute vector defs in place. */
5343 && SLP_TREE_VEC_DEFS (node).is_empty ()))
5345 /* If the vector is uniform or unchanged, there's nothing to do. */
5346 if (to_layout_i == 0 || vect_slp_tree_uniform_p (node))
5347 result = node;
5348 else
5350 auto scalar_ops = SLP_TREE_SCALAR_OPS (node).copy ();
5351 result = vect_create_new_slp_node (scalar_ops);
5352 vect_slp_permute (m_perms[to_layout_i], scalar_ops, true);
5355 else
5357 unsigned int partition_i = m_vertices[node->vertex].partition;
5358 unsigned int from_layout_i = m_partitions[partition_i].layout;
5359 if (from_layout_i == to_layout_i)
5360 return node;
5362 /* If NODE is itself a VEC_PERM_EXPR, try to create a parallel
5363 permutation instead of a serial one. Leave the new permutation
5364 in TMP_PERM on success. */
5365 auto_lane_permutation_t tmp_perm;
5366 unsigned int num_inputs = 1;
5367 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5369 tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
5370 if (from_layout_i != 0)
5371 vect_slp_permute (m_perms[from_layout_i], tmp_perm, false);
5372 if (to_layout_i != 0)
5373 vect_slp_permute (m_perms[to_layout_i], tmp_perm, true);
5374 if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
5375 tmp_perm,
5376 SLP_TREE_CHILDREN (node),
5377 false) >= 0)
5378 num_inputs = SLP_TREE_CHILDREN (node).length ();
5379 else
5380 tmp_perm.truncate (0);
5383 if (dump_enabled_p ())
5385 if (tmp_perm.length () > 0)
5386 dump_printf_loc (MSG_NOTE, vect_location,
5387 "duplicating permutation node %p with"
5388 " layout %d\n",
5389 (void *) node, to_layout_i);
5390 else
5391 dump_printf_loc (MSG_NOTE, vect_location,
5392 "inserting permutation node in place of %p\n",
5393 (void *) node);
5396 unsigned int num_lanes = SLP_TREE_LANES (node);
5397 result = vect_create_new_slp_node (num_inputs, VEC_PERM_EXPR);
5398 if (SLP_TREE_SCALAR_STMTS (node).length ())
5400 auto &stmts = SLP_TREE_SCALAR_STMTS (result);
5401 stmts.safe_splice (SLP_TREE_SCALAR_STMTS (node));
5402 if (from_layout_i != 0)
5403 vect_slp_permute (m_perms[from_layout_i], stmts, false);
5404 if (to_layout_i != 0)
5405 vect_slp_permute (m_perms[to_layout_i], stmts, true);
5407 SLP_TREE_REPRESENTATIVE (result) = SLP_TREE_REPRESENTATIVE (node);
5408 SLP_TREE_LANES (result) = num_lanes;
5409 SLP_TREE_VECTYPE (result) = SLP_TREE_VECTYPE (node);
5410 result->vertex = -1;
5412 auto &lane_perm = SLP_TREE_LANE_PERMUTATION (result);
5413 if (tmp_perm.length ())
5415 lane_perm.safe_splice (tmp_perm);
5416 SLP_TREE_CHILDREN (result).safe_splice (SLP_TREE_CHILDREN (node));
5418 else
5420 lane_perm.create (num_lanes);
5421 for (unsigned j = 0; j < num_lanes; ++j)
5422 lane_perm.quick_push ({ 0, j });
5423 if (from_layout_i != 0)
5424 vect_slp_permute (m_perms[from_layout_i], lane_perm, false);
5425 if (to_layout_i != 0)
5426 vect_slp_permute (m_perms[to_layout_i], lane_perm, true);
5427 SLP_TREE_CHILDREN (result).safe_push (node);
5429 for (slp_tree child : SLP_TREE_CHILDREN (result))
5430 child->refcnt++;
5432 m_node_layouts[result_i] = result;
5433 return result;
5436 /* Apply the chosen vector layouts to the SLP graph. */
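/* This works in three steps: first the scalar stmts and the load and lane
   permutations of every partitioned node are rewritten to match the layout
   chosen for its partition, then load permutations that are no longer
   necessary are elided, and finally each child is replaced with a variant
   that has the required layout via get_result_with_layout. */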
5438 void
5439 vect_optimize_slp_pass::materialize ()
5441 /* We no longer need the costs, so avoid having two O(N * P) arrays
5442 live at the same time. */
5443 m_partition_layout_costs.release ();
5444 m_node_layouts.safe_grow_cleared (m_vertices.length () * m_perms.length ());
5446 auto_sbitmap fully_folded (m_vertices.length ());
5447 bitmap_clear (fully_folded);
5448 for (unsigned int node_i : m_partitioned_nodes)
5450 auto &vertex = m_vertices[node_i];
5451 slp_tree node = vertex.node;
5452 int layout_i = m_partitions[vertex.partition].layout;
5453 gcc_assert (layout_i >= 0);
5455 /* Rearrange the scalar statements to match the chosen layout. */
5456 if (layout_i > 0)
5457 vect_slp_permute (m_perms[layout_i],
5458 SLP_TREE_SCALAR_STMTS (node), true);
5460 /* Update load and lane permutations. */
5461 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5463 /* First try to absorb the input vector layouts. If that fails,
5464 force the inputs to have layout LAYOUT_I too. We checked that
5465 that was possible before deciding to use nonzero output layouts.
5466 (Note that at this stage we don't really have any guarantee that
5467 the target supports the original VEC_PERM_EXPR.) */
5468 auto &perm = SLP_TREE_LANE_PERMUTATION (node);
5469 auto_lane_permutation_t tmp_perm;
5470 tmp_perm.safe_splice (perm);
5471 change_vec_perm_layout (node, tmp_perm, -1, layout_i);
5472 if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
5473 tmp_perm,
5474 SLP_TREE_CHILDREN (node),
5475 false) >= 0)
5477 if (dump_enabled_p ()
5478 && !std::equal (tmp_perm.begin (), tmp_perm.end (),
5479 perm.begin ()))
5480 dump_printf_loc (MSG_NOTE, vect_location,
5481 "absorbing input layouts into %p\n",
5482 (void *) node);
5483 std::copy (tmp_perm.begin (), tmp_perm.end (), perm.begin ());
5484 bitmap_set_bit (fully_folded, node_i);
5486 else
5488 /* Not MSG_MISSED because it would make no sense to users. */
5489 if (dump_enabled_p ())
5490 dump_printf_loc (MSG_NOTE, vect_location,
5491 "failed to absorb input layouts into %p\n",
5492 (void *) node);
5493 change_vec_perm_layout (nullptr, perm, layout_i, layout_i);
5496 else
5498 gcc_assert (!SLP_TREE_LANE_PERMUTATION (node).exists ());
5499 auto &load_perm = SLP_TREE_LOAD_PERMUTATION (node);
5500 if (layout_i > 0)
5501 /* ??? When we handle non-bijective permutes the idea
5502 is that we can force the load-permutation to be
5503 { min, min + 1, min + 2, ... max }. But then the
5504 scalar defs might no longer match the lane content
5505 which means wrong-code with live lane vectorization.
5506 So we possibly have to have NULL entries for those. */
5507 vect_slp_permute (m_perms[layout_i], load_perm, true);
5511 /* Do this before any nodes disappear, since it involves a walk
5512 over the leaves. */
5513 remove_redundant_permutations ();
5515 /* Replace each child with a correctly laid-out version. */
5516 for (unsigned int node_i : m_partitioned_nodes)
5518 /* Skip nodes that have already been handled above. */
5519 if (bitmap_bit_p (fully_folded, node_i))
5520 continue;
5522 auto &vertex = m_vertices[node_i];
5523 int in_layout_i = m_partitions[vertex.partition].layout;
5524 gcc_assert (in_layout_i >= 0);
5526 unsigned j;
5527 slp_tree child;
5528 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (vertex.node), j, child)
5530 if (!child)
5531 continue;
5533 slp_tree new_child = get_result_with_layout (child, in_layout_i);
5534 if (new_child != child)
5536 vect_free_slp_tree (child);
5537 SLP_TREE_CHILDREN (vertex.node)[j] = new_child;
5538 new_child->refcnt += 1;
5544 /* Elide load permutations that are not necessary. Such permutations might
5545 be pre-existing, rather than created by the layout optimizations. */
5547 void
5548 vect_optimize_slp_pass::remove_redundant_permutations ()
5550 for (unsigned int node_i : m_leafs)
5552 slp_tree node = m_vertices[node_i].node;
5553 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
5554 continue;
5556 /* In basic block vectorization we allow any subchain of an interleaving
5557 chain.
5558 FORNOW: not in loop SLP because of realignment complications. */
5559 if (is_a <bb_vec_info> (m_vinfo))
5561 bool subchain_p = true;
5562 stmt_vec_info next_load_info = NULL;
5563 stmt_vec_info load_info;
5564 unsigned j;
5565 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
5567 if (j != 0
5568 && (next_load_info != load_info
5569 || DR_GROUP_GAP (load_info) != 1))
5571 subchain_p = false;
5572 break;
5574 next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
5576 if (subchain_p)
5578 SLP_TREE_LOAD_PERMUTATION (node).release ();
5579 continue;
5582 else
5584 loop_vec_info loop_vinfo = as_a<loop_vec_info> (m_vinfo);
5585 stmt_vec_info load_info;
5586 bool this_load_permuted = false;
5587 unsigned j;
5588 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
5589 if (SLP_TREE_LOAD_PERMUTATION (node)[j] != j)
5591 this_load_permuted = true;
5592 break;
5594 /* When this isn't a grouped access we know it's a single-element,
5595 contiguous access. */
5596 if (!STMT_VINFO_GROUPED_ACCESS (SLP_TREE_SCALAR_STMTS (node)[0]))
5598 if (!this_load_permuted
5599 && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
5600 || SLP_TREE_LANES (node) == 1))
5601 SLP_TREE_LOAD_PERMUTATION (node).release ();
5602 continue;
5604 stmt_vec_info first_stmt_info
5605 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
5606 if (!this_load_permuted
5607 /* The load requires permutation when unrolling exposes
5608 a gap either because the group is larger than the SLP
5609 group-size or because there is a gap between the groups. */
5610 && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
5611 || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
5612 && DR_GROUP_GAP (first_stmt_info) == 0)))
5614 SLP_TREE_LOAD_PERMUTATION (node).release ();
5615 continue;
5621 /* Print the partition graph and layout information to the dump file. */
5623 void
5624 vect_optimize_slp_pass::dump ()
5626 dump_printf_loc (MSG_NOTE, vect_location,
5627 "SLP optimize permutations:\n");
5628 for (unsigned int layout_i = 1; layout_i < m_perms.length (); ++layout_i)
5630 dump_printf_loc (MSG_NOTE, vect_location, " %d: { ", layout_i);
5631 const char *sep = "";
5632 for (unsigned int idx : m_perms[layout_i])
5634 dump_printf (MSG_NOTE, "%s%d", sep, idx);
5635 sep = ", ";
5637 dump_printf (MSG_NOTE, " }\n");
5639 dump_printf_loc (MSG_NOTE, vect_location,
5640 "SLP optimize partitions:\n");
5641 for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
5642 ++partition_i)
5644 auto &partition = m_partitions[partition_i];
5645 dump_printf_loc (MSG_NOTE, vect_location, " -------------\n");
5646 dump_printf_loc (MSG_NOTE, vect_location,
5647 " partition %d (layout %d):\n",
5648 partition_i, partition.layout);
5649 dump_printf_loc (MSG_NOTE, vect_location, " nodes:\n");
5650 for (unsigned int order_i = partition.node_begin;
5651 order_i < partition.node_end; ++order_i)
5653 auto &vertex = m_vertices[m_partitioned_nodes[order_i]];
5654 dump_printf_loc (MSG_NOTE, vect_location, " - %p:\n",
5655 (void *) vertex.node);
5656 dump_printf_loc (MSG_NOTE, vect_location,
5657 " weight: %f\n",
5658 vertex.weight.to_double ());
5659 if (vertex.out_degree)
5660 dump_printf_loc (MSG_NOTE, vect_location,
5661 " out weight: %f (degree %d)\n",
5662 vertex.out_weight.to_double (),
5663 vertex.out_degree);
5664 if (SLP_TREE_CODE (vertex.node) == VEC_PERM_EXPR)
5665 dump_printf_loc (MSG_NOTE, vect_location,
5666 " op: VEC_PERM_EXPR\n");
5667 else if (auto rep = SLP_TREE_REPRESENTATIVE (vertex.node))
5668 dump_printf_loc (MSG_NOTE, vect_location,
5669 " op template: %G", rep->stmt);
5671 dump_printf_loc (MSG_NOTE, vect_location, " edges:\n");
5672 for (unsigned int order_i = partition.node_begin;
5673 order_i < partition.node_end; ++order_i)
5675 unsigned int node_i = m_partitioned_nodes[order_i];
5676 auto &vertex = m_vertices[node_i];
5677 auto print_edge = [&](graph_edge *, unsigned int other_node_i)
5679 auto &other_vertex = m_vertices[other_node_i];
5680 if (other_vertex.partition < vertex.partition)
5681 dump_printf_loc (MSG_NOTE, vect_location,
5682 " - %p [%d] --> %p\n",
5683 (void *) other_vertex.node,
5684 other_vertex.partition,
5685 (void *) vertex.node);
5686 else
5687 dump_printf_loc (MSG_NOTE, vect_location,
5688 " - %p --> [%d] %p\n",
5689 (void *) vertex.node,
5690 other_vertex.partition,
5691 (void *) other_vertex.node);
5693 for_each_partition_edge (node_i, print_edge);
5696 for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
5698 auto &layout_costs = partition_layout_costs (partition_i, layout_i);
5699 if (layout_costs.is_possible ())
5701 dump_printf_loc (MSG_NOTE, vect_location,
5702 " layout %d:%s\n", layout_i,
5703 partition.layout == int (layout_i)
5704 ? " (*)" : "");
5705 slpg_layout_cost combined_cost = layout_costs.in_cost;
5706 combined_cost.add_serial_cost (layout_costs.internal_cost);
5707 combined_cost.add_serial_cost (layout_costs.out_cost);
5708 #define TEMPLATE "{depth: %f, total: %f}"
5709 dump_printf_loc (MSG_NOTE, vect_location,
5710 " " TEMPLATE "\n",
5711 layout_costs.in_cost.depth.to_double (),
5712 layout_costs.in_cost.total.to_double ());
5713 dump_printf_loc (MSG_NOTE, vect_location,
5714 " + " TEMPLATE "\n",
5715 layout_costs.internal_cost.depth.to_double (),
5716 layout_costs.internal_cost.total.to_double ());
5717 dump_printf_loc (MSG_NOTE, vect_location,
5718 " + " TEMPLATE "\n",
5719 layout_costs.out_cost.depth.to_double (),
5720 layout_costs.out_cost.total.to_double ());
5721 dump_printf_loc (MSG_NOTE, vect_location,
5722 " = " TEMPLATE "\n",
5723 combined_cost.depth.to_double (),
5724 combined_cost.total.to_double ());
5725 #undef TEMPLATE
5727 else
5728 dump_printf_loc (MSG_NOTE, vect_location,
5729 " layout %d: rejected\n", layout_i);
5734 /* Main entry point for the SLP graph optimization pass. */
5736 void
5737 vect_optimize_slp_pass::run ()
5739 build_graph ();
5740 create_partitions ();
5741 start_choosing_layouts ();
5742 if (m_perms.length () > 1)
5744 forward_pass ();
5745 backward_pass ();
5746 if (dump_enabled_p ())
5747 dump ();
5748 materialize ();
5749 while (!m_perms.is_empty ())
5750 m_perms.pop ().release ();
5752 else
5753 remove_redundant_permutations ();
5754 free_graph (m_slpg);
5757 /* Optimize the SLP graph of VINFO. */
5759 void
5760 vect_optimize_slp (vec_info *vinfo)
5762 if (vinfo->slp_instances.is_empty ())
5763 return;
5764 vect_optimize_slp_pass (vinfo).run ();
5767 /* Gather loads reachable from the individual SLP graph entries. */
5769 void
5770 vect_gather_slp_loads (vec_info *vinfo)
5772 unsigned i;
5773 slp_instance instance;
5774 FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
5776 hash_set<slp_tree> visited;
5777 vect_gather_slp_loads (SLP_INSTANCE_LOADS (instance),
5778 SLP_INSTANCE_TREE (instance), visited);
5783 /* For each possible SLP instance decide whether to SLP it and calculate overall
5784 unrolling factor needed to SLP the loop. Return TRUE if decided to SLP at
5785 least one instance. */
5787 bool
5788 vect_make_slp_decision (loop_vec_info loop_vinfo)
5790 unsigned int i;
5791 poly_uint64 unrolling_factor = 1;
5792 const vec<slp_instance> &slp_instances
5793 = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
5794 slp_instance instance;
5795 int decided_to_slp = 0;
5797 DUMP_VECT_SCOPE ("vect_make_slp_decision");
5799 FOR_EACH_VEC_ELT (slp_instances, i, instance)
5801 /* FORNOW: SLP if you can. */
5802 /* All unroll factors have the form:
5804 GET_MODE_SIZE (vinfo->vector_mode) * X
5806 for some rational X, so they must have a common multiple. */
5807 unrolling_factor
5808 = force_common_multiple (unrolling_factor,
5809 SLP_INSTANCE_UNROLLING_FACTOR (instance));
5811 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
5812 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
5813 loop-based vectorization. Such stmts will be marked as HYBRID. */
5814 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
5815 decided_to_slp++;
5818 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
5820 if (decided_to_slp && dump_enabled_p ())
5822 dump_printf_loc (MSG_NOTE, vect_location,
5823 "Decided to SLP %d instances. Unrolling factor ",
5824 decided_to_slp);
5825 dump_dec (MSG_NOTE, unrolling_factor);
5826 dump_printf (MSG_NOTE, "\n");
5829 return (decided_to_slp > 0);
5832 /* Private data for vect_detect_hybrid_slp. */
5833 struct vdhs_data
5835 loop_vec_info loop_vinfo;
5836 vec<stmt_vec_info> *worklist;
5839 /* Walker for walk_gimple_op. */
5841 static tree
5842 vect_detect_hybrid_slp (tree *tp, int *, void *data)
5844 walk_stmt_info *wi = (walk_stmt_info *)data;
5845 vdhs_data *dat = (vdhs_data *)wi->info;
5847 if (wi->is_lhs)
5848 return NULL_TREE;
5850 stmt_vec_info def_stmt_info = dat->loop_vinfo->lookup_def (*tp);
5851 if (!def_stmt_info)
5852 return NULL_TREE;
5853 def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
5854 if (PURE_SLP_STMT (def_stmt_info))
5856 if (dump_enabled_p ())
5857 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
5858 def_stmt_info->stmt);
5859 STMT_SLP_TYPE (def_stmt_info) = hybrid;
5860 dat->worklist->safe_push (def_stmt_info);
5863 return NULL_TREE;
5866 /* Check whether STMT_INFO is consumed by SLP indirectly and mark it pure_slp
5867 if so, otherwise push it to WORKLIST. */
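/* Concretely, the stmt is kept loop_vect (pushed to WORKLIST) if any of its
   defs is used outside the loop or by a stmt that is not pure SLP, or if it
   has no defs at all (e.g. a store); only when every use is covered by SLP
   is it marked pure_slp here. */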
5869 static void
5870 maybe_push_to_hybrid_worklist (vec_info *vinfo,
5871 vec<stmt_vec_info> &worklist,
5872 stmt_vec_info stmt_info)
5874 if (dump_enabled_p ())
5875 dump_printf_loc (MSG_NOTE, vect_location,
5876 "Processing hybrid candidate : %G", stmt_info->stmt);
5877 stmt_vec_info orig_info = vect_orig_stmt (stmt_info);
5878 imm_use_iterator iter2;
5879 ssa_op_iter iter1;
5880 use_operand_p use_p;
5881 def_operand_p def_p;
5882 bool any_def = false;
5883 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_info->stmt, iter1, SSA_OP_DEF)
5885 any_def = true;
5886 FOR_EACH_IMM_USE_FAST (use_p, iter2, DEF_FROM_PTR (def_p))
5888 if (is_gimple_debug (USE_STMT (use_p)))
5889 continue;
5890 stmt_vec_info use_info = vinfo->lookup_stmt (USE_STMT (use_p));
5892 /* An out-of-loop use means this is a loop_vect sink. */
5892 if (!use_info)
5894 if (dump_enabled_p ())
5895 dump_printf_loc (MSG_NOTE, vect_location,
5896 "Found loop_vect sink: %G", stmt_info->stmt);
5897 worklist.safe_push (stmt_info);
5898 return;
5900 else if (!STMT_SLP_TYPE (vect_stmt_to_vectorize (use_info)))
5902 if (dump_enabled_p ())
5903 dump_printf_loc (MSG_NOTE, vect_location,
5904 "Found loop_vect use: %G", use_info->stmt);
5905 worklist.safe_push (stmt_info);
5906 return;
5910 /* No def means this is a loop_vect sink. */
5911 if (!any_def)
5913 if (dump_enabled_p ())
5914 dump_printf_loc (MSG_NOTE, vect_location,
5915 "Found loop_vect sink: %G", stmt_info->stmt);
5916 worklist.safe_push (stmt_info);
5917 return;
5919 if (dump_enabled_p ())
5920 dump_printf_loc (MSG_NOTE, vect_location,
5921 "Marked SLP consumed stmt pure: %G", stmt_info->stmt);
5922 STMT_SLP_TYPE (stmt_info) = pure_slp;
5925 /* Find stmts that must be both vectorized and SLPed. */
5927 void
5928 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
5930 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
5932 /* All stmts participating in SLP are marked pure_slp, all other
5933 stmts are loop_vect.
5934 First collect all loop_vect stmts into a worklist.
5935 SLP patterns cause not all original scalar stmts to appear in
5936 SLP_TREE_SCALAR_STMTS and thus not all of them are marked pure_slp.
5937 Rectify this here by doing a backward walk over the IL, considering
5938 stmts as loop_vect only when they are used by a loop_vect stmt, and
5939 otherwise marking them as pure_slp. */
5940 auto_vec<stmt_vec_info> worklist;
5941 for (int i = LOOP_VINFO_LOOP (loop_vinfo)->num_nodes - 1; i >= 0; --i)
5943 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
5944 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
5945 gsi_next (&gsi))
5947 gphi *phi = gsi.phi ();
5948 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (phi);
5949 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
5950 maybe_push_to_hybrid_worklist (loop_vinfo,
5951 worklist, stmt_info);
5953 for (gimple_stmt_iterator gsi = gsi_last_bb (bb); !gsi_end_p (gsi);
5954 gsi_prev (&gsi))
5956 gimple *stmt = gsi_stmt (gsi);
5957 if (is_gimple_debug (stmt))
5958 continue;
5959 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
5960 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
5962 for (gimple_stmt_iterator gsi2
5963 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
5964 !gsi_end_p (gsi2); gsi_next (&gsi2))
5966 stmt_vec_info patt_info
5967 = loop_vinfo->lookup_stmt (gsi_stmt (gsi2));
5968 if (!STMT_SLP_TYPE (patt_info)
5969 && STMT_VINFO_RELEVANT (patt_info))
5970 maybe_push_to_hybrid_worklist (loop_vinfo,
5971 worklist, patt_info);
5973 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
5975 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
5976 maybe_push_to_hybrid_worklist (loop_vinfo,
5977 worklist, stmt_info);
5981 /* Now that we have a worklist of non-SLP stmts, follow use->def chains and
5982 mark any SLP vectorized stmt as hybrid.
5983 ??? We're visiting def stmts N times (once for each non-SLP and
5984 once for each hybrid-SLP use). */
5985 walk_stmt_info wi;
5986 vdhs_data dat;
5987 dat.worklist = &worklist;
5988 dat.loop_vinfo = loop_vinfo;
5989 memset (&wi, 0, sizeof (wi));
5990 wi.info = (void *)&dat;
5991 while (!worklist.is_empty ())
5993 stmt_vec_info stmt_info = worklist.pop ();
5994 /* Since SSA operands are not set up for pattern stmts we need
5995 to use walk_gimple_op. */
5996 wi.is_lhs = 0;
5997 walk_gimple_op (stmt_info->stmt, vect_detect_hybrid_slp, &wi);
5998 /* For gather/scatter make sure to walk the offset operand, that
5999 can be a scaling and conversion away. */
6000 gather_scatter_info gs_info;
6001 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
6002 && vect_check_gather_scatter (stmt_info, loop_vinfo, &gs_info))
6004 int dummy;
6005 vect_detect_hybrid_slp (&gs_info.offset, &dummy, &wi);
6011 /* Initialize a bb_vec_info struct for the statements in BBS basic blocks. */
6013 _bb_vec_info::_bb_vec_info (vec<basic_block> _bbs, vec_info_shared *shared)
6014 : vec_info (vec_info::bb, shared),
6015 bbs (_bbs),
6016 roots (vNULL)
6018 for (unsigned i = 0; i < bbs.length (); ++i)
6020 if (i != 0)
6021 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
6022 gsi_next (&si))
6024 gphi *phi = si.phi ();
6025 gimple_set_uid (phi, 0);
6026 add_stmt (phi);
6028 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
6029 !gsi_end_p (gsi); gsi_next (&gsi))
6031 gimple *stmt = gsi_stmt (gsi);
6032 gimple_set_uid (stmt, 0);
6033 if (is_gimple_debug (stmt))
6034 continue;
6035 add_stmt (stmt);
6041 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
6042 stmts in the basic blocks. */
6044 _bb_vec_info::~_bb_vec_info ()
6046 /* Reset region marker. */
6047 for (unsigned i = 0; i < bbs.length (); ++i)
6049 if (i != 0)
6050 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
6051 gsi_next (&si))
6053 gphi *phi = si.phi ();
6054 gimple_set_uid (phi, -1);
6056 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
6057 !gsi_end_p (gsi); gsi_next (&gsi))
6059 gimple *stmt = gsi_stmt (gsi);
6060 gimple_set_uid (stmt, -1);
6064 for (unsigned i = 0; i < roots.length (); ++i)
6066 roots[i].stmts.release ();
6067 roots[i].roots.release ();
6068 roots[i].remain.release ();
6070 roots.release ();
6073 /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
6074 given that child nodes have already been processed, and that
6075 their def types currently match their SLP node's def type. */
6077 static bool
6078 vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
6079 slp_instance node_instance,
6080 stmt_vector_for_cost *cost_vec)
6082 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
6084 /* Calculate the number of vector statements to be created for the
6085 scalar stmts in this node. For SLP reductions it is equal to the
6086 number of vector statements in the children (which has already been
6087 calculated by the recursive call). Otherwise it is the number of
6088 scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
6089 VF divided by the number of elements in a vector. */
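/* For example, with a group size of 4, a vectorization factor of 8 and a
   4-element vector type the latter case gives (4 * 8) / 4 = 8 vector
   statements. */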
6090 if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
6091 && !STMT_VINFO_DATA_REF (stmt_info)
6092 && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6094 for (unsigned i = 0; i < SLP_TREE_CHILDREN (node).length (); ++i)
6095 if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node)[i]) == vect_internal_def)
6097 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
6098 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[i]);
6099 break;
6102 else
6104 poly_uint64 vf;
6105 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
6106 vf = loop_vinfo->vectorization_factor;
6107 else
6108 vf = 1;
6109 unsigned int group_size = SLP_TREE_LANES (node);
6110 tree vectype = SLP_TREE_VECTYPE (node);
6111 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
6112 = vect_get_num_vectors (vf * group_size, vectype);
6115 /* Handle purely internal nodes. */
6116 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
6118 if (!vectorizable_slp_permutation (vinfo, NULL, node, cost_vec))
6119 return false;
6121 stmt_vec_info slp_stmt_info;
6122 unsigned int i;
6123 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info)
6125 if (STMT_VINFO_LIVE_P (slp_stmt_info)
6126 && !vectorizable_live_operation (vinfo, slp_stmt_info, node,
6127 node_instance, i,
6128 false, cost_vec))
6129 return false;
6131 return true;
6134 bool dummy;
6135 return vect_analyze_stmt (vinfo, stmt_info, &dummy,
6136 node, node_instance, cost_vec);
6139 /* Try to build NODE from scalars, returning true on success.
6140 NODE_INSTANCE is the SLP instance that contains NODE. */
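/* "Building from scalars" means demoting NODE to vect_external_def and
   recording the scalar lhs of each of its stmts as SLP_TREE_SCALAR_OPS, so
   that referring nodes construct the vector from those defs instead. */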
6142 static bool
6143 vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
6144 slp_instance node_instance)
6146 stmt_vec_info stmt_info;
6147 unsigned int i;
6149 if (!is_a <bb_vec_info> (vinfo)
6150 || node == SLP_INSTANCE_TREE (node_instance)
6151 || !SLP_TREE_SCALAR_STMTS (node).exists ()
6152 || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node))
6153 /* Force the mask use to be built from scalars instead. */
6154 || VECTOR_BOOLEAN_TYPE_P (SLP_TREE_VECTYPE (node)))
6155 return false;
6157 if (dump_enabled_p ())
6158 dump_printf_loc (MSG_NOTE, vect_location,
6159 "Building vector operands of %p from scalars instead\n",
6160 (void *) node);
6162 /* Don't remove and free the child nodes here, since they could be
6163 referenced by other structures. The analysis and scheduling phases
6164 (need to) ignore child nodes of anything that isn't vect_internal_def. */
6165 unsigned int group_size = SLP_TREE_LANES (node);
6166 SLP_TREE_DEF_TYPE (node) = vect_external_def;
6167 /* Invariants get their vector type from the uses. */
6168 SLP_TREE_VECTYPE (node) = NULL_TREE;
6169 SLP_TREE_SCALAR_OPS (node).safe_grow (group_size, true);
6170 SLP_TREE_LOAD_PERMUTATION (node).release ();
6171 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6173 tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
6174 SLP_TREE_SCALAR_OPS (node)[i] = lhs;
6176 return true;
6179 /* Return true if all elements of the slice are the same. */
6180 bool
6181 vect_scalar_ops_slice::all_same_p () const
6183 for (unsigned int i = 1; i < length; ++i)
6184 if (!operand_equal_p (op (0), op (i)))
6185 return false;
6186 return true;
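/* Hash a slice of scalar operands by iteratively hashing each element. */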
6189 hashval_t
6190 vect_scalar_ops_slice_hash::hash (const value_type &s)
6192 hashval_t hash = 0;
6193 for (unsigned i = 0; i < s.length; ++i)
6194 hash = iterative_hash_expr (s.op (i), hash);
6195 return hash;
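/* Return true if two slices have the same length and pairwise equal
   operands. */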
6198 bool
6199 vect_scalar_ops_slice_hash::equal (const value_type &s1,
6200 const compare_type &s2)
6202 if (s1.length != s2.length)
6203 return false;
6204 for (unsigned i = 0; i < s1.length; ++i)
6205 if (!operand_equal_p (s1.op (i), s2.op (i)))
6206 return false;
6207 return true;
6210 /* Compute the prologue cost for invariant or constant operands represented
6211 by NODE. */
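/* Roughly: a vector of constants is costed as a load from the constant
   pool, a uniform invariant as a single scalar_to_vec splat and any other
   invariant vector as a vec_construct, once for each distinct vector that
   needs to be built. */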
6213 static void
6214 vect_prologue_cost_for_slp (slp_tree node,
6215 stmt_vector_for_cost *cost_vec)
6217 /* There's a special case of an existing vector, which costs nothing. */
6218 if (SLP_TREE_SCALAR_OPS (node).length () == 0
6219 && !SLP_TREE_VEC_DEFS (node).is_empty ())
6220 return;
6221 /* Without looking at the actual initializer a vector of
6222 constants can be implemented as load from the constant pool.
6223 When all elements are the same we can use a splat. */
6224 tree vectype = SLP_TREE_VECTYPE (node);
6225 unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
6226 unsigned HOST_WIDE_INT const_nunits;
6227 unsigned nelt_limit;
6228 auto ops = &SLP_TREE_SCALAR_OPS (node);
6229 auto_vec<unsigned int> starts (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
6230 if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
6231 && ! multiple_p (const_nunits, group_size))
6233 nelt_limit = const_nunits;
6234 hash_set<vect_scalar_ops_slice_hash> vector_ops;
6235 for (unsigned int i = 0; i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); ++i)
6236 if (!vector_ops.add ({ ops, i * const_nunits, const_nunits }))
6237 starts.quick_push (i * const_nunits);
6239 else
6241 /* If either the vector has variable length or the vectors
6242 are composed of repeated whole groups we only need to
6243 cost construction once. All vectors will be the same. */
6244 nelt_limit = group_size;
6245 starts.quick_push (0);
6247 /* ??? We're just tracking whether vectors in a single node are the same.
6248 Ideally we'd do something more global. */
6249 bool passed = false;
6250 for (unsigned int start : starts)
6252 vect_cost_for_stmt kind;
6253 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def)
6254 kind = vector_load;
6255 else if (vect_scalar_ops_slice { ops, start, nelt_limit }.all_same_p ())
6256 kind = scalar_to_vec;
6257 else
6258 kind = vec_construct;
6259 /* The target cost hook has no idea which part of the SLP node
6260 we are costing so avoid passing it down more than once. Pass
6261 it to the first vec_construct or scalar_to_vec part since for those
6262 the x86 backend tries to account for GPR to XMM register moves. */
6263 record_stmt_cost (cost_vec, 1, kind,
6264 (kind != vector_load && !passed) ? node : nullptr,
6265 vectype, 0, vect_prologue);
6266 if (kind != vector_load)
6267 passed = true;
6271 /* Analyze statements contained in SLP tree NODE after recursively analyzing
6272 the subtree. NODE_INSTANCE contains NODE and VINFO contains NODE_INSTANCE.
6274 Return true if the operations are supported. */
6276 static bool
6277 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
6278 slp_instance node_instance,
6279 hash_set<slp_tree> &visited_set,
6280 vec<slp_tree> &visited_vec,
6281 stmt_vector_for_cost *cost_vec)
6283 int i, j;
6284 slp_tree child;
6286 /* Assume we can code-generate all invariants. */
6287 if (!node
6288 || SLP_TREE_DEF_TYPE (node) == vect_constant_def
6289 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
6290 return true;
6292 if (SLP_TREE_DEF_TYPE (node) == vect_uninitialized_def)
6294 if (dump_enabled_p ())
6295 dump_printf_loc (MSG_NOTE, vect_location,
6296 "Failed cyclic SLP reference in %p\n", (void *) node);
6297 return false;
6299 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_internal_def);
6301 /* If we already analyzed the exact same set of scalar stmts we're done.
6302 We share the generated vector stmts for those. */
6303 if (visited_set.add (node))
6304 return true;
6305 visited_vec.safe_push (node);
6307 bool res = true;
6308 unsigned visited_rec_start = visited_vec.length ();
6309 unsigned cost_vec_rec_start = cost_vec->length ();
6310 bool seen_non_constant_child = false;
6311 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6313 res = vect_slp_analyze_node_operations (vinfo, child, node_instance,
6314 visited_set, visited_vec,
6315 cost_vec);
6316 if (!res)
6317 break;
6318 if (child && SLP_TREE_DEF_TYPE (child) != vect_constant_def)
6319 seen_non_constant_child = true;
6321 /* We're having difficulties scheduling nodes with just constant
6322 operands and no scalar stmts since we then cannot compute a stmt
6323 insertion place. */
6324 if (!seen_non_constant_child && SLP_TREE_SCALAR_STMTS (node).is_empty ())
6326 if (dump_enabled_p ())
6327 dump_printf_loc (MSG_NOTE, vect_location,
6328 "Cannot vectorize all-constant op node %p\n",
6329 (void *) node);
6330 res = false;
6333 if (res)
6334 res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
6335 cost_vec);
6336 /* If analysis failed we have to pop all recursive visited nodes
6337 plus ourselves. */
6338 if (!res)
6340 while (visited_vec.length () >= visited_rec_start)
6341 visited_set.remove (visited_vec.pop ());
6342 cost_vec->truncate (cost_vec_rec_start);
6345 /* When the node can be vectorized, cost invariant nodes it references.
6346 This is not done in DFS order to allow the referring node's
6347 vectorizable_* calls to nail down the invariant nodes' vector type
6348 and possibly unshare it if it needs a different vector type than
6349 other referrers. */
6350 if (res)
6351 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
6352 if (child
6353 && (SLP_TREE_DEF_TYPE (child) == vect_constant_def
6354 || SLP_TREE_DEF_TYPE (child) == vect_external_def)
6355 /* Perform usual caching; note that code generation still
6356 code-gens these nodes multiple times, but we expect
6357 to CSE them later. */
6358 && !visited_set.add (child))
6360 visited_vec.safe_push (child);
6361 /* ??? After auditing more code paths make a "default"
6362 and push the vector type from NODE to all children
6363 if it is not already set. */
6364 /* Compute the number of vectors to be generated. */
6365 tree vector_type = SLP_TREE_VECTYPE (child);
6366 if (!vector_type)
6368 /* For shifts with a scalar argument we don't need
6369 to cost or code-generate anything.
6370 ??? Represent this more explicitly. */
6371 gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
6372 == shift_vec_info_type)
6373 && j == 1);
6374 continue;
6376 unsigned group_size = SLP_TREE_LANES (child);
6377 poly_uint64 vf = 1;
6378 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
6379 vf = loop_vinfo->vectorization_factor;
6380 SLP_TREE_NUMBER_OF_VEC_STMTS (child)
6381 = vect_get_num_vectors (vf * group_size, vector_type);
6382 /* And cost them. */
6383 vect_prologue_cost_for_slp (child, cost_vec);
6386 /* If this node or any of its children can't be vectorized, try pruning
6387 the tree here rather than felling the whole thing. */
6388 if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
6390 /* We'll need to revisit this for invariant costing and for
6391 setting the number of vectorized stmts. */
6392 res = true;
6395 return res;
6398 /* Mark lanes of NODE that are live outside of the basic-block vectorized
6399 region and that can be vectorized using vectorizable_live_operation
6400 with STMT_VINFO_LIVE_P. Live operations that are not handled will cause
6401 the scalar code computing them to be retained. */
6403 static void
6404 vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
6405 slp_instance instance,
6406 stmt_vector_for_cost *cost_vec,
6407 hash_set<stmt_vec_info> &svisited,
6408 hash_set<slp_tree> &visited)
6410 if (visited.add (node))
6411 return;
6413 unsigned i;
6414 stmt_vec_info stmt_info;
6415 stmt_vec_info last_stmt = vect_find_last_scalar_stmt_in_slp (node);
6416 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6418 if (svisited.contains (stmt_info))
6419 continue;
6420 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
6421 if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)
6422 && STMT_VINFO_RELATED_STMT (orig_stmt_info) != stmt_info)
6423 /* Only the pattern root stmt computes the original scalar value. */
6424 continue;
6425 bool mark_visited = true;
6426 gimple *orig_stmt = orig_stmt_info->stmt;
6427 ssa_op_iter op_iter;
6428 def_operand_p def_p;
6429 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
6431 imm_use_iterator use_iter;
6432 gimple *use_stmt;
6433 stmt_vec_info use_stmt_info;
6434 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
6435 if (!is_gimple_debug (use_stmt))
6437 use_stmt_info = bb_vinfo->lookup_stmt (use_stmt);
6438 if (!use_stmt_info
6439 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
6441 STMT_VINFO_LIVE_P (stmt_info) = true;
6442 if (vectorizable_live_operation (bb_vinfo, stmt_info,
6443 node, instance, i,
6444 false, cost_vec))
6445 /* ??? So we know we can vectorize the live stmt
6446 from one SLP node. If we cannot do so from all
6447 or none consistently we'd have to record which
6448 SLP node (and lane) we want to use for the live
6449 operation. So make sure we can code-generate
6450 from all nodes. */
6451 mark_visited = false;
6452 else
6453 STMT_VINFO_LIVE_P (stmt_info) = false;
6454 break;
6457 /* We have to verify whether we can insert the lane extract
6458 before all uses. The following is a conservative approximation.
6459 We cannot put this into vectorizable_live_operation because
6460 iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
6461 doesn't work.
6462 Note that while the fact that we emit code for loads at the
6463 first load should make this a non-problem, leaves we construct
6464 from scalars are vectorized after the last scalar def.
6465 ??? If we'd actually compute the insert location during
6466 analysis we could use something less conservative than the last
6467 scalar stmt in the node for the dominance check. */
6468 /* ??? What remains is "live" uses in vector CTORs in the same
6469 SLP graph which is where those uses can end up code-generated
6470 right after their definition instead of close to their original
6471 use. But that would restrict us to code-generate lane-extracts
6472 from the latest stmt in a node. So we compensate for this
6473 during code-generation, simply not replacing uses for those
6474 hopefully rare cases. */
6475 if (STMT_VINFO_LIVE_P (stmt_info))
6476 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
6477 if (!is_gimple_debug (use_stmt)
6478 && (!(use_stmt_info = bb_vinfo->lookup_stmt (use_stmt))
6479 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
6480 && !vect_stmt_dominates_stmt_p (last_stmt->stmt, use_stmt))
6482 if (dump_enabled_p ())
6483 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6484 "Cannot determine insertion place for "
6485 "lane extract\n");
6486 STMT_VINFO_LIVE_P (stmt_info) = false;
6487 mark_visited = true;
6490 if (mark_visited)
6491 svisited.add (stmt_info);
6494 slp_tree child;
6495 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6496 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
6497 vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance,
6498 cost_vec, svisited, visited);
6501 /* Determine whether we can vectorize the reduction epilogue for INSTANCE. */
6503 static bool
6504 vectorizable_bb_reduc_epilogue (slp_instance instance,
6505 stmt_vector_for_cost *cost_vec)
6507 gassign *stmt = as_a <gassign *> (instance->root_stmts[0]->stmt);
6508 enum tree_code reduc_code = gimple_assign_rhs_code (stmt);
6509 if (reduc_code == MINUS_EXPR)
6510 reduc_code = PLUS_EXPR;
6511 internal_fn reduc_fn;
6512 tree vectype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (instance));
6513 if (!vectype
6514 || !reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
6515 || reduc_fn == IFN_LAST
6516 || !direct_internal_fn_supported_p (reduc_fn, vectype, OPTIMIZE_FOR_BOTH)
6517 || !useless_type_conversion_p (TREE_TYPE (gimple_assign_lhs (stmt)),
6518 TREE_TYPE (vectype)))
6520 if (dump_enabled_p ())
6521 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6522 "not vectorized: basic block reduction epilogue "
6523 "operation unsupported.\n");
6524 return false;
6527 /* There's no way to cost a horizontal vector reduction via REDUC_FN so
6528 cost log2 vector operations plus shuffles and one extraction. */
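/* For example, for an 8-element vector this records 3 vector_stmt and
   3 vec_perm costs plus one vec_to_scalar cost for the final extraction. */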
6529 unsigned steps = floor_log2 (vect_nunits_for_cost (vectype));
6530 record_stmt_cost (cost_vec, steps, vector_stmt, instance->root_stmts[0],
6531 vectype, 0, vect_body);
6532 record_stmt_cost (cost_vec, steps, vec_perm, instance->root_stmts[0],
6533 vectype, 0, vect_body);
6534 record_stmt_cost (cost_vec, 1, vec_to_scalar, instance->root_stmts[0],
6535 vectype, 0, vect_body);
6537 /* Since we replace all stmts of a possibly longer scalar reduction
6538 chain, account for the extra scalar stmts for that. */
6539 record_stmt_cost (cost_vec, instance->remain_defs.length (), scalar_stmt,
6540 instance->root_stmts[0], 0, vect_body);
6541 return true;
6544 /* Prune from ROOTS all stmts that are computed as part of lanes of NODE
6545 and recurse to children. */
6547 static void
6548 vect_slp_prune_covered_roots (slp_tree node, hash_set<stmt_vec_info> &roots,
6549 hash_set<slp_tree> &visited)
6551 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def
6552 || visited.add (node))
6553 return;
6555 stmt_vec_info stmt;
6556 unsigned i;
6557 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
6558 roots.remove (vect_orig_stmt (stmt));
6560 slp_tree child;
6561 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6562 if (child)
6563 vect_slp_prune_covered_roots (child, roots, visited);
6566 /* Analyze statements in SLP instances of VINFO. Return true if the
6567 operations are supported. */
6569 bool
6570 vect_slp_analyze_operations (vec_info *vinfo)
6572 slp_instance instance;
6573 int i;
6575 DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
6577 hash_set<slp_tree> visited;
6578 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
6580 auto_vec<slp_tree> visited_vec;
6581 stmt_vector_for_cost cost_vec;
6582 cost_vec.create (2);
6583 if (is_a <bb_vec_info> (vinfo))
6584 vect_location = instance->location ();
6585 if (!vect_slp_analyze_node_operations (vinfo,
6586 SLP_INSTANCE_TREE (instance),
6587 instance, visited, visited_vec,
6588 &cost_vec)
6589 /* CTOR instances require vectorized defs for the SLP tree root. */
6590 || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor
6591 && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
6592 != vect_internal_def
6593 /* Make sure we vectorized with the expected type. */
6594 || !useless_type_conversion_p
6595 (TREE_TYPE (TREE_TYPE (gimple_assign_rhs1
6596 (instance->root_stmts[0]->stmt))),
6597 TREE_TYPE (SLP_TREE_VECTYPE
6598 (SLP_INSTANCE_TREE (instance))))))
6599 /* Check we can vectorize the reduction. */
6600 || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_bb_reduc
6601 && !vectorizable_bb_reduc_epilogue (instance, &cost_vec)))
6603 slp_tree node = SLP_INSTANCE_TREE (instance);
6604 stmt_vec_info stmt_info;
6605 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
6606 stmt_info = SLP_INSTANCE_ROOT_STMTS (instance)[0];
6607 else
6608 stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
6609 if (dump_enabled_p ())
6610 dump_printf_loc (MSG_NOTE, vect_location,
6611 "removing SLP instance operations starting from: %G",
6612 stmt_info->stmt);
6613 vect_free_slp_instance (instance);
6614 vinfo->slp_instances.ordered_remove (i);
6615 cost_vec.release ();
6616 while (!visited_vec.is_empty ())
6617 visited.remove (visited_vec.pop ());
6619 else
6621 i++;
6622 if (loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo))
6624 add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
6625 cost_vec.release ();
6627 else
6628 /* For BB vectorization remember the SLP graph entry
6629 cost for later. */
6630 instance->cost_vec = cost_vec;
6634 /* Now look for SLP instances with a root that are covered by other
6635 instances and remove them. */
6636 hash_set<stmt_vec_info> roots;
6637 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
6638 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
6639 roots.add (SLP_INSTANCE_ROOT_STMTS (instance)[0]);
6640 if (!roots.is_empty ())
6642 visited.empty ();
6643 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
6644 vect_slp_prune_covered_roots (SLP_INSTANCE_TREE (instance), roots,
6645 visited);
6646 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
6647 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ()
6648 && !roots.contains (SLP_INSTANCE_ROOT_STMTS (instance)[0]))
6650 stmt_vec_info root = SLP_INSTANCE_ROOT_STMTS (instance)[0];
6651 if (dump_enabled_p ())
6652 dump_printf_loc (MSG_NOTE, vect_location,
6653 "removing SLP instance operations starting "
6654 "from: %G", root->stmt);
6655 vect_free_slp_instance (instance);
6656 vinfo->slp_instances.ordered_remove (i);
6658 else
6659 ++i;
6662 /* Compute vectorizable live stmts. */
6663 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
6665 hash_set<stmt_vec_info> svisited;
6666 hash_set<slp_tree> visited;
6667 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
6669 vect_location = instance->location ();
6670 vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
6671 instance, &instance->cost_vec, svisited,
6672 visited);
6676 return !vinfo->slp_instances.is_empty ();
6679 /* Get the SLP instance leader from INSTANCE_LEADER thereby transitively
6680 closing the eventual chain. */
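/* This is effectively a union-find "find" with path compression: the
   entries visited along the chain are rewritten to point directly at the
   ultimate leader before it is returned. */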
6682 static slp_instance
6683 get_ultimate_leader (slp_instance instance,
6684 hash_map<slp_instance, slp_instance> &instance_leader)
6686 auto_vec<slp_instance *, 8> chain;
6687 slp_instance *tem;
6688 while (*(tem = instance_leader.get (instance)) != instance)
6690 chain.safe_push (tem);
6691 instance = *tem;
6693 while (!chain.is_empty ())
6694 *chain.pop () = instance;
6695 return instance;
6698 namespace {
6699 /* Subroutine of vect_bb_partition_graph_r. Map KEY to INSTANCE in
6700 KEY_TO_INSTANCE, making INSTANCE the leader of any previous mapping
6701 for KEY. Return true if KEY was already in KEY_TO_INSTANCE.
6703 INSTANCE_LEADER is as for get_ultimate_leader. */
6705 template<typename T>
6706 bool
6707 vect_map_to_instance (slp_instance instance, T key,
6708 hash_map<T, slp_instance> &key_to_instance,
6709 hash_map<slp_instance, slp_instance> &instance_leader)
6711 bool existed_p;
6712 slp_instance &key_instance = key_to_instance.get_or_insert (key, &existed_p);
6713 if (!existed_p)
6715 else if (key_instance != instance)
6717 /* If we're running into a previously marked key make us the
6718 leader of the current ultimate leader. This keeps the
6719 leader chain acyclic and works even when the current instance
6720 connects two previously independent graph parts. */
6721 slp_instance key_leader
6722 = get_ultimate_leader (key_instance, instance_leader);
6723 if (key_leader != instance)
6724 instance_leader.put (key_leader, instance);
6726 key_instance = instance;
6727 return existed_p;
6731 /* Worker of vect_bb_partition_graph, recurse on NODE. */
6733 static void
6734 vect_bb_partition_graph_r (bb_vec_info bb_vinfo,
6735 slp_instance instance, slp_tree node,
6736 hash_map<stmt_vec_info, slp_instance> &stmt_to_instance,
6737 hash_map<slp_tree, slp_instance> &node_to_instance,
6738 hash_map<slp_instance, slp_instance> &instance_leader)
6740 stmt_vec_info stmt_info;
6741 unsigned i;
6743 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6744 vect_map_to_instance (instance, stmt_info, stmt_to_instance,
6745 instance_leader);
6747 if (vect_map_to_instance (instance, node, node_to_instance,
6748 instance_leader))
6749 return;
6751 slp_tree child;
6752 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6753 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
6754 vect_bb_partition_graph_r (bb_vinfo, instance, child, stmt_to_instance,
6755 node_to_instance, instance_leader);
6758 /* Partition the SLP graph into pieces that can be costed independently. */
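/* Each scalar stmt and each SLP node is mapped to the instance that first
   reaches it; when two instances turn out to share a stmt or node, their
   leader entries are merged so that connected instances end up in the same
   subgraph. */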
6760 static void
6761 vect_bb_partition_graph (bb_vec_info bb_vinfo)
6763 DUMP_VECT_SCOPE ("vect_bb_partition_graph");
6765 /* First walk the SLP graph assigning each involved scalar stmt a
6766 corresponding SLP graph entry and upon visiting a previously
6767 marked stmt, make the stmt's leader the current SLP graph entry. */
6768 hash_map<stmt_vec_info, slp_instance> stmt_to_instance;
6769 hash_map<slp_tree, slp_instance> node_to_instance;
6770 hash_map<slp_instance, slp_instance> instance_leader;
6771 slp_instance instance;
6772 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
6774 instance_leader.put (instance, instance);
6775 vect_bb_partition_graph_r (bb_vinfo,
6776 instance, SLP_INSTANCE_TREE (instance),
6777 stmt_to_instance, node_to_instance,
6778 instance_leader);
6781 /* Then collect entries to each independent subgraph. */
6782 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
6784 slp_instance leader = get_ultimate_leader (instance, instance_leader);
6785 leader->subgraph_entries.safe_push (instance);
6786 if (dump_enabled_p ()
6787 && leader != instance)
6788 dump_printf_loc (MSG_NOTE, vect_location,
6789 "instance %p is leader of %p\n",
6790 (void *) leader, (void *) instance);
6794 /* Compute the set of scalar stmts participating in internal and external
6795 nodes. */
6797 static void
6798 vect_slp_gather_vectorized_scalar_stmts (vec_info *vinfo, slp_tree node,
6799 hash_set<slp_tree> &visited,
6800 hash_set<stmt_vec_info> &vstmts,
6801 hash_set<stmt_vec_info> &estmts)
6803 int i;
6804 stmt_vec_info stmt_info;
6805 slp_tree child;
6807 if (visited.add (node))
6808 return;
6810 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
6812 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6813 vstmts.add (stmt_info);
6815 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6816 if (child)
6817 vect_slp_gather_vectorized_scalar_stmts (vinfo, child, visited,
6818 vstmts, estmts);
6820 else
6821 for (tree def : SLP_TREE_SCALAR_OPS (node))
6823 stmt_vec_info def_stmt = vinfo->lookup_def (def);
6824 if (def_stmt)
6825 estmts.add (def_stmt);
6830 /* Compute the scalar cost of the SLP node NODE and its children
6831 and record it in COST_VEC. Do not account defs that are marked in LIFE and
6832 update LIFE according to uses of NODE. */
6834 static void
6835 vect_bb_slp_scalar_cost (vec_info *vinfo,
6836 slp_tree node, vec<bool, va_heap> *life,
6837 stmt_vector_for_cost *cost_vec,
6838 hash_set<stmt_vec_info> &vectorized_scalar_stmts,
6839 hash_set<slp_tree> &visited)
6841 unsigned i;
6842 stmt_vec_info stmt_info;
6843 slp_tree child;
6845 if (visited.add (node))
6846 return;
6848 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6850 ssa_op_iter op_iter;
6851 def_operand_p def_p;
6853 if ((*life)[i])
6854 continue;
6856 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
6857 gimple *orig_stmt = orig_stmt_info->stmt;
6859 /* If there is a non-vectorized use of the defs then the scalar
6860 stmt is kept live, in which case we do not account it or any
6861 required defs in the SLP children in the scalar cost. This
6862 way we make the vectorization more costly when compared to
6863 the scalar cost. */
6864 if (!STMT_VINFO_LIVE_P (stmt_info))
6866 auto_vec<gimple *, 8> worklist;
6867 hash_set<gimple *> *worklist_visited = NULL;
6868 worklist.quick_push (orig_stmt);
6871 gimple *work_stmt = worklist.pop ();
6872 FOR_EACH_PHI_OR_STMT_DEF (def_p, work_stmt, op_iter, SSA_OP_DEF)
6874 imm_use_iterator use_iter;
6875 gimple *use_stmt;
6876 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter,
6877 DEF_FROM_PTR (def_p))
6878 if (!is_gimple_debug (use_stmt))
6880 stmt_vec_info use_stmt_info
6881 = vinfo->lookup_stmt (use_stmt);
6882 if (!use_stmt_info
6883 || !vectorized_scalar_stmts.contains (use_stmt_info))
6885 if (use_stmt_info
6886 && STMT_VINFO_IN_PATTERN_P (use_stmt_info))
6888 /* For stmts participating in patterns we have
6889 to check their uses recursively. */
6890 if (!worklist_visited)
6891 worklist_visited = new hash_set<gimple *> ();
6892 if (!worklist_visited->add (use_stmt))
6893 worklist.safe_push (use_stmt);
6894 continue;
6896 (*life)[i] = true;
6897 goto next_lane;
6902 while (!worklist.is_empty ());
6903 next_lane:
6904 if (worklist_visited)
6905 delete worklist_visited;
6906 if ((*life)[i])
6907 continue;
6910 /* Count scalar stmts only once. */
6911 if (gimple_visited_p (orig_stmt))
6912 continue;
6913 gimple_set_visited (orig_stmt, true);
6915 vect_cost_for_stmt kind;
6916 if (STMT_VINFO_DATA_REF (orig_stmt_info))
6918 if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info)))
6919 kind = scalar_load;
6920 else
6921 kind = scalar_store;
6923 else if (vect_nop_conversion_p (orig_stmt_info))
6924 continue;
6925 /* For single-argument PHIs assume coalescing which means zero cost
6926 for the scalar and the vector PHIs. This avoids artificially
6927 favoring the vector path (but may pessimize it in some cases). */
6928 else if (is_a <gphi *> (orig_stmt_info->stmt)
6929 && gimple_phi_num_args
6930 (as_a <gphi *> (orig_stmt_info->stmt)) == 1)
6931 continue;
6932 else
6933 kind = scalar_stmt;
6934 record_stmt_cost (cost_vec, 1, kind, orig_stmt_info,
6935 SLP_TREE_VECTYPE (node), 0, vect_body);
6938 auto_vec<bool, 20> subtree_life;
6939 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6941 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
6943 /* Do not directly pass LIFE to the recursive call, copy it to
6944 confine changes in the callee to the current child/subtree. */
6945 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
6947 subtree_life.safe_grow_cleared (SLP_TREE_LANES (child), true);
6948 for (unsigned j = 0;
6949 j < SLP_TREE_LANE_PERMUTATION (node).length (); ++j)
6951 auto perm = SLP_TREE_LANE_PERMUTATION (node)[j];
6952 if (perm.first == i)
6953 subtree_life[perm.second] = (*life)[j];
6956 else
6958 gcc_assert (SLP_TREE_LANES (node) == SLP_TREE_LANES (child));
6959 subtree_life.safe_splice (*life);
6961 vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
6962 vectorized_scalar_stmts, visited);
6963 subtree_life.truncate (0);
6968 /* Comparator for the loop-index sorted cost vectors. */
6970 static int
6971 li_cost_vec_cmp (const void *a_, const void *b_)
6973 auto *a = (const std::pair<unsigned, stmt_info_for_cost *> *)a_;
6974 auto *b = (const std::pair<unsigned, stmt_info_for_cost *> *)b_;
6975 if (a->first < b->first)
6976 return -1;
6977 else if (a->first == b->first)
6978 return 0;
6979 return 1;
6982 /* Check if vectorization of the basic block is profitable for the
6983 subgraph denoted by SLP_INSTANCES. */
6985 static bool
6986 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
6987 vec<slp_instance> slp_instances,
6988 loop_p orig_loop)
6990 slp_instance instance;
6991 int i;
6992 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
6993 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
6995 if (dump_enabled_p ())
6997 dump_printf_loc (MSG_NOTE, vect_location, "Costing subgraph: \n");
6998 hash_set<slp_tree> visited;
6999 FOR_EACH_VEC_ELT (slp_instances, i, instance)
7000 vect_print_slp_graph (MSG_NOTE, vect_location,
7001 SLP_INSTANCE_TREE (instance), visited);
7004 /* Compute the set of scalar stmts we know will go away 'locally' when
7005 vectorizing. This used to be tracked with just PURE_SLP_STMT but that's
7006 not accurate for nodes promoted extern late or for scalar stmts that
7007 are used both in extern defs and in vectorized defs. */
7008 hash_set<stmt_vec_info> vectorized_scalar_stmts;
7009 hash_set<stmt_vec_info> scalar_stmts_in_externs;
7010 hash_set<slp_tree> visited;
7011 FOR_EACH_VEC_ELT (slp_instances, i, instance)
7013 vect_slp_gather_vectorized_scalar_stmts (bb_vinfo,
7014 SLP_INSTANCE_TREE (instance),
7015 visited,
7016 vectorized_scalar_stmts,
7017 scalar_stmts_in_externs);
7018 for (stmt_vec_info rstmt : SLP_INSTANCE_ROOT_STMTS (instance))
7019 vectorized_scalar_stmts.add (rstmt);
7021 /* Scalar stmts used as defs in external nodes need to be preserved, so
7022 remove them from vectorized_scalar_stmts. */
7023 for (stmt_vec_info stmt : scalar_stmts_in_externs)
7024 vectorized_scalar_stmts.remove (stmt);
7026 /* Calculate scalar cost and sum the cost for the vector stmts
7027 previously collected. */
7028 stmt_vector_for_cost scalar_costs = vNULL;
7029 stmt_vector_for_cost vector_costs = vNULL;
7030 visited.empty ();
7031 FOR_EACH_VEC_ELT (slp_instances, i, instance)
7033 auto_vec<bool, 20> life;
7034 life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)),
7035 true);
7036 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
7037 record_stmt_cost (&scalar_costs,
7038 SLP_INSTANCE_ROOT_STMTS (instance).length (),
7039 scalar_stmt,
7040 SLP_INSTANCE_ROOT_STMTS (instance)[0], 0, vect_body);
7041 vect_bb_slp_scalar_cost (bb_vinfo,
7042 SLP_INSTANCE_TREE (instance),
7043 &life, &scalar_costs, vectorized_scalar_stmts,
7044 visited);
7045 vector_costs.safe_splice (instance->cost_vec);
7046 instance->cost_vec.release ();
7049 if (dump_enabled_p ())
7050 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
7052 /* When costing non-loop vectorization we need to consider each covered
7053 loop independently and make sure vectorization is profitable. For
7054 now we assume a loop may not be entered or may be executed an arbitrary
7055 number of iterations (??? static information can provide more
7056 precise info here) which means we can simply cost each containing
7057 loop's stmts separately. */
7059 /* First produce cost vectors sorted by loop index. */
7060 auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
7061 li_scalar_costs (scalar_costs.length ());
7062 auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
7063 li_vector_costs (vector_costs.length ());
7064 stmt_info_for_cost *cost;
7065 FOR_EACH_VEC_ELT (scalar_costs, i, cost)
7067 unsigned l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
7068 li_scalar_costs.quick_push (std::make_pair (l, cost));
7070 /* Use an arbitrary used loop as a fallback in case the first vector_costs
7071 entry does not have a stmt_info associated with it. */
7072 unsigned l = li_scalar_costs[0].first;
7073 FOR_EACH_VEC_ELT (vector_costs, i, cost)
7075 /* We inherit the loop number from the previous COST; invariants, externals
7076 and extracts immediately follow the cost for the related stmt. */
7077 if (cost->stmt_info)
7078 l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
7079 li_vector_costs.quick_push (std::make_pair (l, cost));
7081 li_scalar_costs.qsort (li_cost_vec_cmp);
7082 li_vector_costs.qsort (li_cost_vec_cmp);
7084 /* Now cost the portions individually. */
7085 unsigned vi = 0;
7086 unsigned si = 0;
7087 bool profitable = true;
7088 while (si < li_scalar_costs.length ()
7089 && vi < li_vector_costs.length ())
7091 unsigned sl = li_scalar_costs[si].first;
7092 unsigned vl = li_vector_costs[vi].first;
7093 if (sl != vl)
7095 if (dump_enabled_p ())
7096 dump_printf_loc (MSG_NOTE, vect_location,
7097 "Scalar %d and vector %d loop part do not "
7098 "match up, skipping scalar part\n", sl, vl);
7099 /* Skip the scalar part, assuming zero cost on the vector side. */
7102 si++;
7104 while (si < li_scalar_costs.length ()
7105 && li_scalar_costs[si].first == sl);
7106 continue;
7109 class vector_costs *scalar_target_cost_data = init_cost (bb_vinfo, true);
7112 add_stmt_cost (scalar_target_cost_data, li_scalar_costs[si].second);
7113 si++;
7115 while (si < li_scalar_costs.length ()
7116 && li_scalar_costs[si].first == sl);
7117 unsigned dummy;
7118 finish_cost (scalar_target_cost_data, nullptr,
7119 &dummy, &scalar_cost, &dummy);
7121 /* Complete the target-specific vector cost calculation. */
7122 class vector_costs *vect_target_cost_data = init_cost (bb_vinfo, false);
7125 add_stmt_cost (vect_target_cost_data, li_vector_costs[vi].second);
7126 vi++;
7128 while (vi < li_vector_costs.length ()
7129 && li_vector_costs[vi].first == vl);
7130 finish_cost (vect_target_cost_data, scalar_target_cost_data,
7131 &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost);
7132 delete scalar_target_cost_data;
7133 delete vect_target_cost_data;
7135 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
7137 if (dump_enabled_p ())
7139 dump_printf_loc (MSG_NOTE, vect_location,
7140 "Cost model analysis for part in loop %d:\n", sl);
7141 dump_printf (MSG_NOTE, " Vector cost: %d\n",
7142 vec_inside_cost + vec_outside_cost);
7143 dump_printf (MSG_NOTE, " Scalar cost: %d\n", scalar_cost);
7146 /* Vectorization is profitable if its cost is no more than the cost of the
7147 scalar version. Note that we err on the vector side for equal cost because
7148 the cost estimate is otherwise quite pessimistic (constant uses are
7149 free on the scalar side but cost a load on the vector side for
7150 example). */
7151 if (vec_outside_cost + vec_inside_cost > scalar_cost)
7153 profitable = false;
7154 break;
7157 if (profitable && vi < li_vector_costs.length ())
7159 if (dump_enabled_p ())
7160 dump_printf_loc (MSG_NOTE, vect_location,
7161 "Excess vector cost for part in loop %d:\n",
7162 li_vector_costs[vi].first);
7163 profitable = false;
7166 /* Unset visited flag. This is delayed when the subgraph is profitable
7167 and we process the loop for remaining unvectorized if-converted code. */
7168 if (!orig_loop || !profitable)
7169 FOR_EACH_VEC_ELT (scalar_costs, i, cost)
7170 gimple_set_visited (cost->stmt_info->stmt, false);
7172 scalar_costs.release ();
7173 vector_costs.release ();
7175 return profitable;
7178 /* qsort comparator for lane defs. */
7180 static int
7181 vld_cmp (const void *a_, const void *b_)
7183 auto *a = (const std::pair<unsigned, tree> *)a_;
7184 auto *b = (const std::pair<unsigned, tree> *)b_;
7185 return a->first - b->first;
7188 /* Return true if USE_STMT is a vector lane insert into VEC and set
7189 *THIS_LANE to the lane number that is set. */
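/* For illustration (schematic GIMPLE, made-up SSA names): with 32-bit
   vector elements, inserting s_2 into lane 1 of vec_1 looks like

     vec_3 = BIT_INSERT_EXPR <vec_1, s_2, 32>;

   and *THIS_LANE is recovered as bit position / element size,
   here 32 / 32 == 1. */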
7191 static bool
7192 vect_slp_is_lane_insert (gimple *use_stmt, tree vec, unsigned *this_lane)
7194 gassign *use_ass = dyn_cast <gassign *> (use_stmt);
7195 if (!use_ass
7196 || gimple_assign_rhs_code (use_ass) != BIT_INSERT_EXPR
7197 || (vec
7198 ? gimple_assign_rhs1 (use_ass) != vec
7199 : ((vec = gimple_assign_rhs1 (use_ass)), false))
7200 || !useless_type_conversion_p (TREE_TYPE (TREE_TYPE (vec)),
7201 TREE_TYPE (gimple_assign_rhs2 (use_ass)))
7202 || !constant_multiple_p
7203 (tree_to_poly_uint64 (gimple_assign_rhs3 (use_ass)),
7204 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (TREE_TYPE (vec)))),
7205 this_lane))
7206 return false;
7207 return true;
7210 /* Find any vectorizable constructors, lane-insert chains and reduction
7211 chains in the region and record them as SLP roots. */
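/* For illustration (schematic GIMPLE, made-up SSA names), the shapes
   recognized below are

     _5 = {a_1, b_2, c_3, d_4};                    <- CONSTRUCTOR

     v_6 = BIT_INSERT_EXPR <v_5, a_1, 0>;
     v_7 = BIT_INSERT_EXPR <v_6, b_2, 32>;         <- lane-insert chain

     t_8 = a_1 + b_2;
     s_9 = t_8 + c_3;                              <- reduction chain

   each recorded as an SLP root in bb_vinfo->roots. */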
7213 static void
7214 vect_slp_check_for_roots (bb_vec_info bb_vinfo)
7216 for (unsigned i = 0; i < bb_vinfo->bbs.length (); ++i)
7217 for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[i]);
7218 !gsi_end_p (gsi); gsi_next (&gsi))
7220 gassign *assign = dyn_cast<gassign *> (gsi_stmt (gsi));
7221 if (!assign)
7222 continue;
7224 tree rhs = gimple_assign_rhs1 (assign);
7225 enum tree_code code = gimple_assign_rhs_code (assign);
7226 use_operand_p use_p;
7227 gimple *use_stmt;
7228 if (code == CONSTRUCTOR)
7230 if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
7231 || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
7232 CONSTRUCTOR_NELTS (rhs))
7233 || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
7234 || uniform_vector_p (rhs))
7235 continue;
7237 unsigned j;
7238 tree val;
7239 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
7240 if (TREE_CODE (val) != SSA_NAME
7241 || !bb_vinfo->lookup_def (val))
7242 break;
7243 if (j != CONSTRUCTOR_NELTS (rhs))
7244 continue;
7246 vec<stmt_vec_info> roots = vNULL;
7247 roots.safe_push (bb_vinfo->lookup_stmt (assign));
7248 vec<stmt_vec_info> stmts;
7249 stmts.create (CONSTRUCTOR_NELTS (rhs));
7250 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
7251 stmts.quick_push
7252 (vect_stmt_to_vectorize (bb_vinfo->lookup_def (val)));
7253 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
7254 stmts, roots));
7256 else if (code == BIT_INSERT_EXPR
7257 && VECTOR_TYPE_P (TREE_TYPE (rhs))
7258 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).is_constant ()
7259 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).to_constant () > 1
7260 && integer_zerop (gimple_assign_rhs3 (assign))
7261 && useless_type_conversion_p
7262 (TREE_TYPE (TREE_TYPE (rhs)),
7263 TREE_TYPE (gimple_assign_rhs2 (assign)))
7264 && bb_vinfo->lookup_def (gimple_assign_rhs2 (assign)))
7266 /* We start matching at an insert to lane zero, but since the
7267 inserts need not be ordered we have to search both
7268 the def and the use chains. */
7269 tree vectype = TREE_TYPE (rhs);
7270 unsigned nlanes = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
7271 auto_vec<std::pair<unsigned, tree> > lane_defs (nlanes);
7272 auto_sbitmap lanes (nlanes);
7273 bitmap_clear (lanes);
7274 bitmap_set_bit (lanes, 0);
7275 tree def = gimple_assign_lhs (assign);
7276 lane_defs.quick_push
7277 (std::make_pair (0, gimple_assign_rhs2 (assign)));
7278 unsigned lanes_found = 1;
7279 /* Start with the use chains, the last stmt will be the root. */
7280 stmt_vec_info last = bb_vinfo->lookup_stmt (assign);
7281 vec<stmt_vec_info> roots = vNULL;
7282 roots.safe_push (last);
7285 use_operand_p use_p;
7286 gimple *use_stmt;
7287 if (!single_imm_use (def, &use_p, &use_stmt))
7288 break;
7289 unsigned this_lane;
7290 if (!bb_vinfo->lookup_stmt (use_stmt)
7291 || !vect_slp_is_lane_insert (use_stmt, def, &this_lane)
7292 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (use_stmt)))
7293 break;
7294 if (bitmap_bit_p (lanes, this_lane))
7295 break;
7296 lanes_found++;
7297 bitmap_set_bit (lanes, this_lane);
7298 gassign *use_ass = as_a <gassign *> (use_stmt);
7299 lane_defs.quick_push (std::make_pair
7300 (this_lane, gimple_assign_rhs2 (use_ass)));
7301 last = bb_vinfo->lookup_stmt (use_ass);
7302 roots.safe_push (last);
7303 def = gimple_assign_lhs (use_ass);
7305 while (lanes_found < nlanes);
7306 if (roots.length () > 1)
7307 std::swap(roots[0], roots[roots.length () - 1]);
7308 if (lanes_found < nlanes)
7310 /* Now search the def chain. */
7311 def = gimple_assign_rhs1 (assign);
7314 if (TREE_CODE (def) != SSA_NAME
7315 || !has_single_use (def))
7316 break;
7317 gimple *def_stmt = SSA_NAME_DEF_STMT (def);
7318 unsigned this_lane;
7319 if (!bb_vinfo->lookup_stmt (def_stmt)
7320 || !vect_slp_is_lane_insert (def_stmt,
7321 NULL_TREE, &this_lane)
7322 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (def_stmt)))
7323 break;
7324 if (bitmap_bit_p (lanes, this_lane))
7325 break;
7326 lanes_found++;
7327 bitmap_set_bit (lanes, this_lane);
7328 lane_defs.quick_push (std::make_pair
7329 (this_lane,
7330 gimple_assign_rhs2 (def_stmt)));
7331 roots.safe_push (bb_vinfo->lookup_stmt (def_stmt));
7332 def = gimple_assign_rhs1 (def_stmt);
7334 while (lanes_found < nlanes);
7336 if (lanes_found == nlanes)
7338 /* Sort lane_defs by lane index and register the root. */
7339 lane_defs.qsort (vld_cmp);
7340 vec<stmt_vec_info> stmts;
7341 stmts.create (nlanes);
7342 for (unsigned i = 0; i < nlanes; ++i)
7343 stmts.quick_push (bb_vinfo->lookup_def (lane_defs[i].second));
7344 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
7345 stmts, roots));
7347 else
7348 roots.release ();
7350 else if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
7351 && (associative_tree_code (code) || code == MINUS_EXPR)
7352 /* ??? This pessimizes a two-element reduction. PR54400.
7353 ??? In-order reduction could be handled if we only
7354 traverse one operand chain in vect_slp_linearize_chain. */
7355 && !needs_fold_left_reduction_p (TREE_TYPE (rhs), code)
7356 /* Ops with constants at the tail can be stripped here. */
7357 && TREE_CODE (rhs) == SSA_NAME
7358 && TREE_CODE (gimple_assign_rhs2 (assign)) == SSA_NAME
7359 /* Should be the chain end. */
7360 && (!single_imm_use (gimple_assign_lhs (assign),
7361 &use_p, &use_stmt)
7362 || !is_gimple_assign (use_stmt)
7363 || (gimple_assign_rhs_code (use_stmt) != code
7364 && ((code != PLUS_EXPR && code != MINUS_EXPR)
7365 || (gimple_assign_rhs_code (use_stmt)
7366 != (code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR))))))
7368 /* We start the match at the end of a possible association
7369 chain. */
7370 auto_vec<chain_op_t> chain;
7371 auto_vec<std::pair<tree_code, gimple *> > worklist;
7372 auto_vec<gimple *> chain_stmts;
7373 gimple *code_stmt = NULL, *alt_code_stmt = NULL;
7374 if (code == MINUS_EXPR)
7375 code = PLUS_EXPR;
7376 internal_fn reduc_fn;
7377 if (!reduction_fn_for_scalar_code (code, &reduc_fn)
7378 || reduc_fn == IFN_LAST)
7379 continue;
7380 vect_slp_linearize_chain (bb_vinfo, worklist, chain, code, assign,
7381 /* ??? */
7382 code_stmt, alt_code_stmt, &chain_stmts);
7383 if (chain.length () > 1)
7385 /* Sort the chain according to def_type and operation. */
7386 chain.sort (dt_sort_cmp, bb_vinfo);
7387 /* ??? Now we'd want to strip externals and constants
7388 but record those to be handled in the epilogue. */
7389 /* ??? For now do not allow mixing ops or externs/constants. */
7390 bool invalid = false;
7391 unsigned remain_cnt = 0;
7392 for (unsigned i = 0; i < chain.length (); ++i)
7394 if (chain[i].code != code)
7396 invalid = true;
7397 break;
7399 if (chain[i].dt != vect_internal_def)
7400 remain_cnt++;
7402 if (!invalid && chain.length () - remain_cnt > 1)
7404 vec<stmt_vec_info> stmts;
7405 vec<tree> remain = vNULL;
7406 stmts.create (chain.length ());
7407 if (remain_cnt > 0)
7408 remain.create (remain_cnt);
7409 for (unsigned i = 0; i < chain.length (); ++i)
7411 if (chain[i].dt == vect_internal_def)
7412 stmts.quick_push (bb_vinfo->lookup_def (chain[i].op));
7413 else
7414 remain.quick_push (chain[i].op);
7416 vec<stmt_vec_info> roots;
7417 roots.create (chain_stmts.length ());
7418 for (unsigned i = 0; i < chain_stmts.length (); ++i)
7419 roots.quick_push (bb_vinfo->lookup_stmt (chain_stmts[i]));
7420 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_bb_reduc,
7421 stmts, roots, remain));
7428 /* Walk the grouped store chains and replace entries with their
7429 pattern variant if any. */
7431 static void
7432 vect_fixup_store_groups_with_patterns (vec_info *vinfo)
7434 stmt_vec_info first_element;
7435 unsigned i;
7437 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
7439 /* We also have CTORs in this array. */
7440 if (!STMT_VINFO_GROUPED_ACCESS (first_element))
7441 continue;
7442 if (STMT_VINFO_IN_PATTERN_P (first_element))
7444 stmt_vec_info orig = first_element;
7445 first_element = STMT_VINFO_RELATED_STMT (first_element);
7446 DR_GROUP_FIRST_ELEMENT (first_element) = first_element;
7447 DR_GROUP_SIZE (first_element) = DR_GROUP_SIZE (orig);
7448 DR_GROUP_GAP (first_element) = DR_GROUP_GAP (orig);
7449 DR_GROUP_NEXT_ELEMENT (first_element) = DR_GROUP_NEXT_ELEMENT (orig);
7450 vinfo->grouped_stores[i] = first_element;
7452 stmt_vec_info prev = first_element;
7453 while (DR_GROUP_NEXT_ELEMENT (prev))
7455 stmt_vec_info elt = DR_GROUP_NEXT_ELEMENT (prev);
7456 if (STMT_VINFO_IN_PATTERN_P (elt))
7458 stmt_vec_info orig = elt;
7459 elt = STMT_VINFO_RELATED_STMT (elt);
7460 DR_GROUP_NEXT_ELEMENT (prev) = elt;
7461 DR_GROUP_GAP (elt) = DR_GROUP_GAP (orig);
7462 DR_GROUP_NEXT_ELEMENT (elt) = DR_GROUP_NEXT_ELEMENT (orig);
7464 DR_GROUP_FIRST_ELEMENT (elt) = first_element;
7465 prev = elt;
7470 /* Check if the region described by BB_VINFO can be vectorized, returning
7471 true if so. When returning false, set FATAL to true if the same failure
7472 would prevent vectorization at other vector sizes, false if it is still
7473 worth trying other sizes. N_STMTS is the number of statements in the
7474 region. */
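/* Descriptive outline of the steps below: analyze the data references
   and their accesses, detect SLP roots, run pattern recognition, build
   and optimize the SLP trees, verify alignment and dependences per
   instance, analyze the operations and finally partition the SLP graph
   into independently costable subgraphs. */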
7476 static bool
7477 vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
7478 vec<int> *dataref_groups)
7480 DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
7482 slp_instance instance;
7483 int i;
7484 poly_uint64 min_vf = 2;
7486 /* The first group of checks is independent of the vector size. */
7487 fatal = true;
7489 /* Analyze the data references. */
7491 if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
7493 if (dump_enabled_p ())
7494 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7495 "not vectorized: unhandled data-ref in basic "
7496 "block.\n");
7497 return false;
7500 if (!vect_analyze_data_ref_accesses (bb_vinfo, dataref_groups))
7502 if (dump_enabled_p ())
7503 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7504 "not vectorized: unhandled data access in "
7505 "basic block.\n");
7506 return false;
7509 vect_slp_check_for_roots (bb_vinfo);
7511 /* If there are no grouped stores and no constructors in the region
7512 there is no need to continue with pattern recog as vect_analyze_slp
7513 will fail anyway. */
7514 if (bb_vinfo->grouped_stores.is_empty ()
7515 && bb_vinfo->roots.is_empty ())
7517 if (dump_enabled_p ())
7518 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7519 "not vectorized: no grouped stores in "
7520 "basic block.\n");
7521 return false;
7524 /* The rest of the analysis below depends on the vector size in some way. */
7525 fatal = false;
7527 vect_pattern_recog (bb_vinfo);
7529 /* Update store groups from pattern processing. */
7530 vect_fixup_store_groups_with_patterns (bb_vinfo);
7532 /* Check the SLP opportunities in the basic block, analyze and build SLP
7533 trees. */
7534 if (!vect_analyze_slp (bb_vinfo, n_stmts))
7536 if (dump_enabled_p ())
7538 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7539 "Failed to SLP the basic block.\n");
7540 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7541 "not vectorized: failed to find SLP opportunities "
7542 "in basic block.\n");
7544 return false;
7547 /* Optimize permutations. */
7548 vect_optimize_slp (bb_vinfo);
7550 /* Gather the loads reachable from the SLP graph entries. */
7551 vect_gather_slp_loads (bb_vinfo);
7553 vect_record_base_alignments (bb_vinfo);
7555 /* Analyze and verify the alignment of data references and the
7556 dependence in the SLP instances. */
7557 for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
7559 vect_location = instance->location ();
7560 if (! vect_slp_analyze_instance_alignment (bb_vinfo, instance)
7561 || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
7563 slp_tree node = SLP_INSTANCE_TREE (instance);
7564 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
7565 if (dump_enabled_p ())
7566 dump_printf_loc (MSG_NOTE, vect_location,
7567 "removing SLP instance operations starting from: %G",
7568 stmt_info->stmt);
7569 vect_free_slp_instance (instance);
7570 BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
7571 continue;
7574 /* Mark all the statements that we want to vectorize as pure SLP and
7575 relevant. */
7576 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
7577 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
7578 unsigned j;
7579 stmt_vec_info root;
7580 /* Likewise consider instance root stmts as vectorized. */
7581 FOR_EACH_VEC_ELT (SLP_INSTANCE_ROOT_STMTS (instance), j, root)
7582 STMT_SLP_TYPE (root) = pure_slp;
7584 i++;
7586 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
7587 return false;
7589 if (!vect_slp_analyze_operations (bb_vinfo))
7591 if (dump_enabled_p ())
7592 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7593 "not vectorized: bad operation in basic block.\n");
7594 return false;
7597 vect_bb_partition_graph (bb_vinfo);
7599 return true;
7602 /* Subroutine of vect_slp_bb. Try to vectorize the statements for all
7603 basic blocks in BBS, returning true on success.
7604 The region has N_STMTS statements and has the datarefs given by DATAREFS. */
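/* Descriptive note: the analysis below is first done with an
   autodetected vector mode (VOIDmode) and, as long as the failure was
   not fatal, re-tried with each mode from
   targetm.vectorize.autovectorize_vector_modes, skipping modes that
   would merely repeat an earlier analysis. */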
7606 static bool
7607 vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
7608 vec<int> *dataref_groups, unsigned int n_stmts,
7609 loop_p orig_loop)
7611 bb_vec_info bb_vinfo;
7612 auto_vector_modes vector_modes;
7614 /* Autodetect first vector size we try. */
7615 machine_mode next_vector_mode = VOIDmode;
7616 targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
7617 unsigned int mode_i = 0;
7619 vec_info_shared shared;
7621 machine_mode autodetected_vector_mode = VOIDmode;
7622 while (1)
7624 bool vectorized = false;
7625 bool fatal = false;
7626 bb_vinfo = new _bb_vec_info (bbs, &shared);
7628 bool first_time_p = shared.datarefs.is_empty ();
7629 BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
7630 if (first_time_p)
7631 bb_vinfo->shared->save_datarefs ();
7632 else
7633 bb_vinfo->shared->check_datarefs ();
7634 bb_vinfo->vector_mode = next_vector_mode;
7636 if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups))
7638 if (dump_enabled_p ())
7640 dump_printf_loc (MSG_NOTE, vect_location,
7641 "***** Analysis succeeded with vector mode"
7642 " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
7643 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
7646 bb_vinfo->shared->check_datarefs ();
7648 auto_vec<slp_instance> profitable_subgraphs;
7649 for (slp_instance instance : BB_VINFO_SLP_INSTANCES (bb_vinfo))
7651 if (instance->subgraph_entries.is_empty ())
7652 continue;
7654 dump_user_location_t saved_vect_location = vect_location;
7655 vect_location = instance->location ();
7656 if (!unlimited_cost_model (NULL)
7657 && !vect_bb_vectorization_profitable_p
7658 (bb_vinfo, instance->subgraph_entries, orig_loop))
7660 if (dump_enabled_p ())
7661 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7662 "not vectorized: vectorization is not "
7663 "profitable.\n");
7664 vect_location = saved_vect_location;
7665 continue;
7668 vect_location = saved_vect_location;
7669 if (!dbg_cnt (vect_slp))
7670 continue;
7672 profitable_subgraphs.safe_push (instance);
7675 /* When we're vectorizing an if-converted loop body make sure
7676 we vectorized all if-converted code. */
7677 if (!profitable_subgraphs.is_empty ()
7678 && orig_loop)
7680 gcc_assert (bb_vinfo->bbs.length () == 1);
7681 for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[0]);
7682 !gsi_end_p (gsi); gsi_next (&gsi))
7684 /* The costing above left us with DCEable vectorized scalar
7685 stmts having the visited flag set on profitable
7686 subgraphs. Do the delayed clearing of the flag here. */
7687 if (gimple_visited_p (gsi_stmt (gsi)))
7689 gimple_set_visited (gsi_stmt (gsi), false);
7690 continue;
7692 if (flag_vect_cost_model == VECT_COST_MODEL_UNLIMITED)
7693 continue;
7695 if (gassign *ass = dyn_cast <gassign *> (gsi_stmt (gsi)))
7696 if (gimple_assign_rhs_code (ass) == COND_EXPR)
7698 if (!profitable_subgraphs.is_empty ()
7699 && dump_enabled_p ())
7700 dump_printf_loc (MSG_NOTE, vect_location,
7701 "not profitable because of "
7702 "unprofitable if-converted scalar "
7703 "code\n");
7704 profitable_subgraphs.truncate (0);
7709 /* Finally schedule the profitable subgraphs. */
7710 for (slp_instance instance : profitable_subgraphs)
7712 if (!vectorized && dump_enabled_p ())
7713 dump_printf_loc (MSG_NOTE, vect_location,
7714 "Basic block will be vectorized "
7715 "using SLP\n");
7716 vectorized = true;
7718 /* Dump before scheduling as store vectorization will remove
7719 the original stores and mess with the instance tree
7720 so querying its location will eventually ICE. */
7721 if (flag_checking)
7722 for (slp_instance sub : instance->subgraph_entries)
7723 gcc_assert (SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (sub)));
7724 unsigned HOST_WIDE_INT bytes;
7725 if (dump_enabled_p ())
7726 for (slp_instance sub : instance->subgraph_entries)
7728 tree vtype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (sub));
7729 if (GET_MODE_SIZE (TYPE_MODE (vtype)).is_constant (&bytes))
7730 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
7731 sub->location (),
7732 "basic block part vectorized using %wu "
7733 "byte vectors\n", bytes);
7734 else
7735 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
7736 sub->location (),
7737 "basic block part vectorized using "
7738 "variable length vectors\n");
7741 dump_user_location_t saved_vect_location = vect_location;
7742 vect_location = instance->location ();
7744 vect_schedule_slp (bb_vinfo, instance->subgraph_entries);
7746 vect_location = saved_vect_location;
7749 else
7751 if (dump_enabled_p ())
7752 dump_printf_loc (MSG_NOTE, vect_location,
7753 "***** Analysis failed with vector mode %s\n",
7754 GET_MODE_NAME (bb_vinfo->vector_mode));
7757 if (mode_i == 0)
7758 autodetected_vector_mode = bb_vinfo->vector_mode;
7760 if (!fatal)
7761 while (mode_i < vector_modes.length ()
7762 && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
7764 if (dump_enabled_p ())
7765 dump_printf_loc (MSG_NOTE, vect_location,
7766 "***** The result for vector mode %s would"
7767 " be the same\n",
7768 GET_MODE_NAME (vector_modes[mode_i]));
7769 mode_i += 1;
7772 delete bb_vinfo;
7774 if (mode_i < vector_modes.length ()
7775 && VECTOR_MODE_P (autodetected_vector_mode)
7776 && (related_vector_mode (vector_modes[mode_i],
7777 GET_MODE_INNER (autodetected_vector_mode))
7778 == autodetected_vector_mode)
7779 && (related_vector_mode (autodetected_vector_mode,
7780 GET_MODE_INNER (vector_modes[mode_i]))
7781 == vector_modes[mode_i]))
7783 if (dump_enabled_p ())
7784 dump_printf_loc (MSG_NOTE, vect_location,
7785 "***** Skipping vector mode %s, which would"
7786 " repeat the analysis for %s\n",
7787 GET_MODE_NAME (vector_modes[mode_i]),
7788 GET_MODE_NAME (autodetected_vector_mode));
7789 mode_i += 1;
7792 if (vectorized
7793 || mode_i == vector_modes.length ()
7794 || autodetected_vector_mode == VOIDmode
7795 /* If vect_slp_analyze_bb_1 signaled that analysis for all
7796 vector sizes will fail do not bother iterating. */
7797 || fatal)
7798 return vectorized;
7800 /* Try the next biggest vector size. */
7801 next_vector_mode = vector_modes[mode_i++];
7802 if (dump_enabled_p ())
7803 dump_printf_loc (MSG_NOTE, vect_location,
7804 "***** Re-trying analysis with vector mode %s\n",
7805 GET_MODE_NAME (next_vector_mode));
7810 /* Main entry for the BB vectorizer. Analyze and transform BBS, returns
7811 true if anything in the basic-block was vectorized. */
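/* Descriptive note: the loop below collects the data references of the
   region and assigns them group ids; the id is bumped whenever a stmt
   without a recognized data reference intervenes and whenever a new BB
   starts, so only datarefs not separated by such a boundary share a
   group. */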
7813 static bool
7814 vect_slp_bbs (const vec<basic_block> &bbs, loop_p orig_loop)
7816 vec<data_reference_p> datarefs = vNULL;
7817 auto_vec<int> dataref_groups;
7818 int insns = 0;
7819 int current_group = 0;
7821 for (unsigned i = 0; i < bbs.length (); i++)
7823 basic_block bb = bbs[i];
7824 for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
7825 gsi_next (&gsi))
7827 gimple *stmt = gsi_stmt (gsi);
7828 if (is_gimple_debug (stmt))
7829 continue;
7831 insns++;
7833 if (gimple_location (stmt) != UNKNOWN_LOCATION)
7834 vect_location = stmt;
7836 if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs,
7837 &dataref_groups, current_group))
7838 ++current_group;
7840 /* New BBs always start a new DR group. */
7841 ++current_group;
7844 return vect_slp_region (bbs, datarefs, &dataref_groups, insns, orig_loop);
7847 /* Special entry for the BB vectorizer. Analyze and transform a single
7848 if-converted BB with ORIG_LOOP's body being the not-if-converted
7849 representation. Returns true if anything in the basic-block was
7850 vectorized. */
7852 bool
7853 vect_slp_if_converted_bb (basic_block bb, loop_p orig_loop)
7855 auto_vec<basic_block> bbs;
7856 bbs.safe_push (bb);
7857 return vect_slp_bbs (bbs, orig_loop);
7860 /* Main entry for the BB vectorizer. Analyze and transform the function FUN,
7861 returning true if anything in any of its basic blocks was vectorized. */
7863 bool
7864 vect_slp_function (function *fun)
7866 bool r = false;
7867 int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (fun));
7868 auto_bitmap exit_bbs;
7869 bitmap_set_bit (exit_bbs, EXIT_BLOCK);
7870 edge entry = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (fun));
7871 unsigned n = rev_post_order_and_mark_dfs_back_seme (fun, entry, exit_bbs,
7872 true, rpo, NULL);
7874 /* For the moment split the function into pieces to avoid making
7875 the iteration on the vector mode moot. Split at points we know
7876 to not handle well, which are CFG merges (SLP discovery doesn't
7877 handle non-loop-header PHIs) and loop exits. Since pattern
7878 recog requires reverse iteration to visit uses before defs,
7879 simply chop the RPO into pieces. */
7880 auto_vec<basic_block> bbs;
7881 for (unsigned i = 0; i < n; i++)
7883 basic_block bb = BASIC_BLOCK_FOR_FN (fun, rpo[i]);
7884 bool split = false;
7886 /* Split when a BB is not dominated by the first block. */
7887 if (!bbs.is_empty ()
7888 && !dominated_by_p (CDI_DOMINATORS, bb, bbs[0]))
7890 if (dump_enabled_p ())
7891 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7892 "splitting region at dominance boundary bb%d\n",
7893 bb->index);
7894 split = true;
7896 /* Split when the loop determined by the first block
7897 is exited. This is because we eventually insert
7898 invariants at region begin. */
7899 else if (!bbs.is_empty ()
7900 && bbs[0]->loop_father != bb->loop_father
7901 && !flow_loop_nested_p (bbs[0]->loop_father, bb->loop_father))
7903 if (dump_enabled_p ())
7904 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7905 "splitting region at loop %d exit at bb%d\n",
7906 bbs[0]->loop_father->num, bb->index);
7907 split = true;
7909 else if (!bbs.is_empty ()
7910 && bb->loop_father->header == bb
7911 && bb->loop_father->dont_vectorize)
7913 if (dump_enabled_p ())
7914 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7915 "splitting region at dont-vectorize loop %d "
7916 "entry at bb%d\n",
7917 bb->loop_father->num, bb->index);
7918 split = true;
7921 if (split && !bbs.is_empty ())
7923 r |= vect_slp_bbs (bbs, NULL);
7924 bbs.truncate (0);
7927 if (bbs.is_empty ())
7929 /* We need to be able to insert at the head of the region, which
7930 we cannot do for a region starting with a returns-twice call. */
7931 if (gcall *first = safe_dyn_cast <gcall *> (first_stmt (bb)))
7932 if (gimple_call_flags (first) & ECF_RETURNS_TWICE)
7934 if (dump_enabled_p ())
7935 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7936 "skipping bb%d as start of region as it "
7937 "starts with returns-twice call\n",
7938 bb->index);
7939 continue;
7941 /* If the loop this BB belongs to is marked as not to be vectorized
7942 honor that also for BB vectorization. */
7943 if (bb->loop_father->dont_vectorize)
7944 continue;
7947 bbs.safe_push (bb);
7949 /* When a stmt ends this block and defines a value, inserting
7950 after it (for a vector containing its definition) would require
7951 inserting on edges. Avoid this for now. */
7952 if (gimple *last = *gsi_last_bb (bb))
7953 if (gimple_get_lhs (last)
7954 && is_ctrl_altering_stmt (last))
7956 if (dump_enabled_p ())
7957 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7958 "splitting region at control altering "
7959 "definition %G", last);
7960 r |= vect_slp_bbs (bbs, NULL);
7961 bbs.truncate (0);
7965 if (!bbs.is_empty ())
7966 r |= vect_slp_bbs (bbs, NULL);
7968 free (rpo);
7970 return r;
7973 /* Build a variable-length vector in which the elements in ELTS are repeated
7974 to fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
7975 RESULTS and add any new instructions to SEQ.
7977 The approach we use is:
7979 (1) Find a vector mode VM with integer elements of mode IM.
7981 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
7982 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
7983 from small vectors to IM.
7985 (3) Duplicate each ELTS'[I] into a vector of mode VM.
7987 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
7988 correct byte contents.
7990 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
7992 We try to find the largest IM for which this sequence works, in order
7993 to cut down on the number of interleaves. */
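/* A concrete example (assuming a target where the chosen modes exist):
   for ELTS = { a, b, c, d } of 32-bit ints, can_duplicate_and_interleave_p
   may pick IM = DImode and NVECTORS = 2. { a, b } and { c, d } are then
   view-converted to two DImode values ab and cd, each is duplicated
   across a vector of DImode elements, and one layer of interleaving
   VEC_PERM_EXPRs yields vectors with byte contents ab cd ab cd ...,
   which view-convert to the required { a, b, c, d, a, b, c, d, ... }. */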
7995 void
7996 duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
7997 const vec<tree> &elts, unsigned int nresults,
7998 vec<tree> &results)
8000 unsigned int nelts = elts.length ();
8001 tree element_type = TREE_TYPE (vector_type);
8003 /* (1) Find a vector mode VM with integer elements of mode IM. */
8004 unsigned int nvectors = 1;
8005 tree new_vector_type;
8006 tree permutes[2];
8007 if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
8008 &nvectors, &new_vector_type,
8009 permutes))
8010 gcc_unreachable ();
8012 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
8013 unsigned int partial_nelts = nelts / nvectors;
8014 tree partial_vector_type = build_vector_type (element_type, partial_nelts);
8016 tree_vector_builder partial_elts;
8017 auto_vec<tree, 32> pieces (nvectors * 2);
8018 pieces.quick_grow_cleared (nvectors * 2);
8019 for (unsigned int i = 0; i < nvectors; ++i)
8021 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
8022 ELTS' has mode IM. */
8023 partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
8024 for (unsigned int j = 0; j < partial_nelts; ++j)
8025 partial_elts.quick_push (elts[i * partial_nelts + j]);
8026 tree t = gimple_build_vector (seq, &partial_elts);
8027 t = gimple_build (seq, VIEW_CONVERT_EXPR,
8028 TREE_TYPE (new_vector_type), t);
8030 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
8031 pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
8034 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
8035 correct byte contents.
8037 Conceptually, we need to repeat the following operation log2(nvectors)
8038 times, where hi_start = nvectors / 2:
8040 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
8041 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
8043 However, if each input repeats every N elements and the VF is
8044 a multiple of N * 2, the HI result is the same as the LO result.
8045 This will be true for the first N1 iterations of the outer loop,
8046 followed by N2 iterations for which both the LO and HI results
8047 are needed. I.e.:
8049 N1 + N2 = log2(nvectors)
8051 Each "N1 iteration" doubles the number of redundant vectors and the
8052 effect of the process as a whole is to have a sequence of nvectors/2**N1
8053 vectors that repeats 2**N1 times. Rather than generate these redundant
8054 vectors, we halve the number of vectors for each N1 iteration. */
8055 unsigned int in_start = 0;
8056 unsigned int out_start = nvectors;
8057 unsigned int new_nvectors = nvectors;
8058 for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
8060 unsigned int hi_start = new_nvectors / 2;
8061 unsigned int out_i = 0;
8062 for (unsigned int in_i = 0; in_i < new_nvectors; ++in_i)
8064 if ((in_i & 1) != 0
8065 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
8066 2 * in_repeat))
8067 continue;
8069 tree output = make_ssa_name (new_vector_type);
8070 tree input1 = pieces[in_start + (in_i / 2)];
8071 tree input2 = pieces[in_start + (in_i / 2) + hi_start];
8072 gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
8073 input1, input2,
8074 permutes[in_i & 1]);
8075 gimple_seq_add_stmt (seq, stmt);
8076 pieces[out_start + out_i] = output;
8077 out_i += 1;
8079 std::swap (in_start, out_start);
8080 new_nvectors = out_i;
8083 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
8084 results.reserve (nresults);
8085 for (unsigned int i = 0; i < nresults; ++i)
8086 if (i < new_nvectors)
8087 results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
8088 pieces[in_start + i]));
8089 else
8090 results.quick_push (results[i - new_nvectors]);
8094 /* For constant and loop invariant defs in OP_NODE this function creates
8095 vector defs that will be used in the vectorized stmts and stores them
8096 to SLP_TREE_VEC_DEFS of OP_NODE. */
8098 static void
8099 vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
8101 unsigned HOST_WIDE_INT nunits;
8102 tree vec_cst;
8103 unsigned j, number_of_places_left_in_vector;
8104 tree vector_type;
8105 tree vop;
8106 int group_size = op_node->ops.length ();
8107 unsigned int vec_num, i;
8108 unsigned number_of_copies = 1;
8109 bool constant_p;
8110 gimple_seq ctor_seq = NULL;
8111 auto_vec<tree, 16> permute_results;
8113 /* We always want SLP_TREE_VECTYPE (op_node) here correctly set. */
8114 vector_type = SLP_TREE_VECTYPE (op_node);
8116 unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node);
8117 SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
8118 auto_vec<tree> voprnds (number_of_vectors);
8120 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
8121 created vectors. It is greater than 1 if unrolling is performed.
8123 For example, we have two scalar operands, s1 and s2 (e.g., group of
8124 strided accesses of size two), while NUNITS is four (i.e., four scalars
8125 of this type can be packed in a vector). The output vector will contain
8126 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
8127 will be 2).
8129 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
8130 containing the operands.
8132 For example, NUNITS is four as before, and the group size is 8
8133 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
8134 {s5, s6, s7, s8}. */
8136 /* When using duplicate_and_interleave, we just need one element for
8137 each scalar statement. */
8138 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
8139 nunits = group_size;
8141 number_of_copies = nunits * number_of_vectors / group_size;
8143 number_of_places_left_in_vector = nunits;
8144 constant_p = true;
8145 tree_vector_builder elts (vector_type, nunits, 1);
8146 elts.quick_grow (nunits);
8147 stmt_vec_info insert_after = NULL;
8148 for (j = 0; j < number_of_copies; j++)
8150 tree op;
8151 for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
8153 /* Create 'vect_ = {op0,op1,...,opn}'. */
8154 number_of_places_left_in_vector--;
8155 tree orig_op = op;
8156 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
8158 if (CONSTANT_CLASS_P (op))
8160 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
8162 /* Can't use VIEW_CONVERT_EXPR for booleans because
8163 of possibly different sizes of scalar value and
8164 vector element. */
8165 if (integer_zerop (op))
8166 op = build_int_cst (TREE_TYPE (vector_type), 0);
8167 else if (integer_onep (op))
8168 op = build_all_ones_cst (TREE_TYPE (vector_type));
8169 else
8170 gcc_unreachable ();
8172 else
8173 op = fold_unary (VIEW_CONVERT_EXPR,
8174 TREE_TYPE (vector_type), op);
8175 gcc_assert (op && CONSTANT_CLASS_P (op));
8177 else
8179 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
8180 gimple *init_stmt;
8181 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
8183 tree true_val
8184 = build_all_ones_cst (TREE_TYPE (vector_type));
8185 tree false_val
8186 = build_zero_cst (TREE_TYPE (vector_type));
8187 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
8188 init_stmt = gimple_build_assign (new_temp, COND_EXPR,
8189 op, true_val,
8190 false_val);
8192 else
8194 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
8195 op);
8196 init_stmt
8197 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
8198 op);
8200 gimple_seq_add_stmt (&ctor_seq, init_stmt);
8201 op = new_temp;
8204 elts[number_of_places_left_in_vector] = op;
8205 if (!CONSTANT_CLASS_P (op))
8206 constant_p = false;
8207 /* For BB vectorization we have to compute an insert location
8208 when a def is inside the analyzed region since we cannot
8209 simply insert at the BB start in this case. */
8210 stmt_vec_info opdef;
8211 if (TREE_CODE (orig_op) == SSA_NAME
8212 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
8213 && is_a <bb_vec_info> (vinfo)
8214 && (opdef = vinfo->lookup_def (orig_op)))
8216 if (!insert_after)
8217 insert_after = opdef;
8218 else
8219 insert_after = get_later_stmt (insert_after, opdef);
8222 if (number_of_places_left_in_vector == 0)
8224 if (constant_p
8225 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
8226 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
8227 vec_cst = gimple_build_vector (&ctor_seq, &elts);
8228 else
8230 if (permute_results.is_empty ())
8231 duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
8232 elts, number_of_vectors,
8233 permute_results);
8234 vec_cst = permute_results[number_of_vectors - j - 1];
8236 if (!gimple_seq_empty_p (ctor_seq))
8238 if (insert_after)
8240 gimple_stmt_iterator gsi;
8241 if (gimple_code (insert_after->stmt) == GIMPLE_PHI)
8243 gsi = gsi_after_labels (gimple_bb (insert_after->stmt));
8244 gsi_insert_seq_before (&gsi, ctor_seq,
8245 GSI_CONTINUE_LINKING);
8247 else if (!stmt_ends_bb_p (insert_after->stmt))
8249 gsi = gsi_for_stmt (insert_after->stmt);
8250 gsi_insert_seq_after (&gsi, ctor_seq,
8251 GSI_CONTINUE_LINKING);
8253 else
8255 /* When we want to insert after a def whose
8256 defining stmt throws, insert on the fallthru
8257 edge. */
8258 edge e = find_fallthru_edge
8259 (gimple_bb (insert_after->stmt)->succs);
8260 basic_block new_bb
8261 = gsi_insert_seq_on_edge_immediate (e, ctor_seq);
8262 gcc_assert (!new_bb);
8265 else
8266 vinfo->insert_seq_on_entry (NULL, ctor_seq);
8267 ctor_seq = NULL;
8269 voprnds.quick_push (vec_cst);
8270 insert_after = NULL;
8271 number_of_places_left_in_vector = nunits;
8272 constant_p = true;
8273 elts.new_vector (vector_type, nunits, 1);
8274 elts.quick_grow (nunits);
8279 /* Since the vectors are created in the reverse order, we should invert
8280 them. */
8281 vec_num = voprnds.length ();
8282 for (j = vec_num; j != 0; j--)
8284 vop = voprnds[j - 1];
8285 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
8288 /* In case that VF is greater than the unrolling factor needed for the SLP
8289 group of stmts, NUMBER_OF_VECTORS to be created is greater than
8290 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
8291 to replicate the vectors. */
8292 while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ())
8293 for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (i, &vop) && i < vec_num;
8294 i++)
8295 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
8298 /* Get the Ith vectorized definition from SLP_NODE. */
8300 tree
8301 vect_get_slp_vect_def (slp_tree slp_node, unsigned i)
8303 return SLP_TREE_VEC_DEFS (slp_node)[i];
8306 /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS. */
8308 void
8309 vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs)
8311 vec_defs->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
8312 vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node));
8315 /* Get N vectorized definitions for SLP_NODE. */
8317 void
8318 vect_get_slp_defs (vec_info *,
8319 slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
8321 if (n == -1U)
8322 n = SLP_TREE_CHILDREN (slp_node).length ();
8324 for (unsigned i = 0; i < n; ++i)
8326 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
8327 vec<tree> vec_defs = vNULL;
8328 vect_get_slp_defs (child, &vec_defs);
8329 vec_oprnds->quick_push (vec_defs);
8333 /* A subroutine of vect_transform_slp_perm_load with two extra arguments:
8334 - PERM gives the permutation that the caller wants to use for NODE,
8335 which might be different from SLP_LOAD_PERMUTATION.
8336 - DUMP_P controls whether the function dumps information. */
8338 static bool
8339 vect_transform_slp_perm_load_1 (vec_info *vinfo, slp_tree node,
8340 load_permutation_t &perm,
8341 const vec<tree> &dr_chain,
8342 gimple_stmt_iterator *gsi, poly_uint64 vf,
8343 bool analyze_only, bool dump_p,
8344 unsigned *n_perms, unsigned int *n_loads,
8345 bool dce_chain)
8347 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
8348 int vec_index = 0;
8349 tree vectype = SLP_TREE_VECTYPE (node);
8350 unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
8351 unsigned int mask_element;
8352 unsigned dr_group_size;
8353 machine_mode mode;
8355 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
8356 dr_group_size = 1;
8357 else
8359 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
8360 dr_group_size = DR_GROUP_SIZE (stmt_info);
8363 mode = TYPE_MODE (vectype);
8364 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8365 unsigned int nstmts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
8367 /* Initialize the vect stmts of NODE to properly insert the generated
8368 stmts later. */
8369 if (! analyze_only)
8370 for (unsigned i = SLP_TREE_VEC_DEFS (node).length (); i < nstmts; i++)
8371 SLP_TREE_VEC_DEFS (node).quick_push (NULL_TREE);
8373 /* Generate permutation masks for every NODE. Number of masks for each NODE
8374 is equal to GROUP_SIZE.
8375 E.g., we have a group of three nodes with three loads from the same
8376 location in each node, and the vector size is 4. I.e., we have an
8377 a0b0c0a1b1c1... sequence and we need to create the following vectors:
8378 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
8379 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
8382 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
8383 The last mask is illegal since we assume two operands for permute
8384 operation, and the mask element values can't be outside that range.
8385 Hence, the last mask must be converted into {2,5,5,5}.
8386 For the first two permutations we need the first and the second input
8387 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
8388 we need the second and the third vectors: {b1,c1,a2,b2} and
8389 {c2,a3,b3,c3}. */
8391 int vect_stmts_counter = 0;
8392 unsigned int index = 0;
8393 int first_vec_index = -1;
8394 int second_vec_index = -1;
8395 bool noop_p = true;
8396 *n_perms = 0;
8398 vec_perm_builder mask;
8399 unsigned int nelts_to_build;
8400 unsigned int nvectors_per_build;
8401 unsigned int in_nlanes;
8402 bool repeating_p = (group_size == dr_group_size
8403 && multiple_p (nunits, group_size));
8404 if (repeating_p)
8406 /* A single vector contains a whole number of copies of the node, so:
8407 (a) all permutes can use the same mask; and
8408 (b) the permutes only need a single vector input. */
8409 mask.new_vector (nunits, group_size, 3);
8410 nelts_to_build = mask.encoded_nelts ();
8411 /* It's possible to obtain zero nstmts during analyze_only, so make
8412 it at least one to ensure the later computation for n_perms
8413 proceeds. */
8414 nvectors_per_build = nstmts > 0 ? nstmts : 1;
8415 in_nlanes = dr_group_size * 3;
8417 else
8419 /* We need to construct a separate mask for each vector statement. */
8420 unsigned HOST_WIDE_INT const_nunits, const_vf;
8421 if (!nunits.is_constant (&const_nunits)
8422 || !vf.is_constant (&const_vf))
8423 return false;
8424 mask.new_vector (const_nunits, const_nunits, 1);
8425 nelts_to_build = const_vf * group_size;
8426 nvectors_per_build = 1;
8427 in_nlanes = const_vf * dr_group_size;
8429 auto_sbitmap used_in_lanes (in_nlanes);
8430 bitmap_clear (used_in_lanes);
8431 auto_bitmap used_defs;
8433 unsigned int count = mask.encoded_nelts ();
8434 mask.quick_grow (count);
8435 vec_perm_indices indices;
8437 for (unsigned int j = 0; j < nelts_to_build; j++)
8439 unsigned int iter_num = j / group_size;
8440 unsigned int stmt_num = j % group_size;
8441 unsigned int i = (iter_num * dr_group_size + perm[stmt_num]);
8442 bitmap_set_bit (used_in_lanes, i);
8443 if (repeating_p)
8445 first_vec_index = 0;
8446 mask_element = i;
8448 else
8450 /* Enforced before the loop when !repeating_p. */
8451 unsigned int const_nunits = nunits.to_constant ();
8452 vec_index = i / const_nunits;
8453 mask_element = i % const_nunits;
8454 if (vec_index == first_vec_index
8455 || first_vec_index == -1)
8457 first_vec_index = vec_index;
8459 else if (vec_index == second_vec_index
8460 || second_vec_index == -1)
8462 second_vec_index = vec_index;
8463 mask_element += const_nunits;
8465 else
8467 if (dump_p)
8468 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8469 "permutation requires at "
8470 "least three vectors %G",
8471 stmt_info->stmt);
8472 gcc_assert (analyze_only);
8473 return false;
8476 gcc_assert (mask_element < 2 * const_nunits);
8479 if (mask_element != index)
8480 noop_p = false;
8481 mask[index++] = mask_element;
8483 if (index == count)
8485 if (!noop_p)
8487 indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
8488 if (!can_vec_perm_const_p (mode, mode, indices))
8490 if (dump_p)
8492 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8493 "unsupported vect permute { ");
8494 for (i = 0; i < count; ++i)
8496 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
8497 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
8499 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
8501 gcc_assert (analyze_only);
8502 return false;
8505 tree mask_vec = NULL_TREE;
8506 if (!analyze_only)
8507 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
8509 if (second_vec_index == -1)
8510 second_vec_index = first_vec_index;
8512 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
8514 ++*n_perms;
8515 if (analyze_only)
8516 continue;
8517 /* Generate the permute statement if necessary. */
8518 tree first_vec = dr_chain[first_vec_index + ri];
8519 tree second_vec = dr_chain[second_vec_index + ri];
8520 gassign *stmt = as_a<gassign *> (stmt_info->stmt);
8521 tree perm_dest
8522 = vect_create_destination_var (gimple_assign_lhs (stmt),
8523 vectype);
8524 perm_dest = make_ssa_name (perm_dest);
8525 gimple *perm_stmt
8526 = gimple_build_assign (perm_dest, VEC_PERM_EXPR, first_vec,
8527 second_vec, mask_vec);
8528 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
8529 gsi);
8530 if (dce_chain)
8532 bitmap_set_bit (used_defs, first_vec_index + ri);
8533 bitmap_set_bit (used_defs, second_vec_index + ri);
8536 /* Store the vector statement in NODE. */
8537 SLP_TREE_VEC_DEFS (node)[vect_stmts_counter++] = perm_dest;
8540 else if (!analyze_only)
8542 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
8544 tree first_vec = dr_chain[first_vec_index + ri];
8545 /* If mask was NULL_TREE generate the requested
8546 identity transform. */
8547 if (dce_chain)
8548 bitmap_set_bit (used_defs, first_vec_index + ri);
8550 /* Store the vector statement in NODE. */
8551 SLP_TREE_VEC_DEFS (node)[vect_stmts_counter++] = first_vec;
8555 index = 0;
8556 first_vec_index = -1;
8557 second_vec_index = -1;
8558 noop_p = true;
8562 if (n_loads)
8564 if (repeating_p)
8565 *n_loads = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
8566 else
8568 /* Enforced above when !repeating_p. */
8569 unsigned int const_nunits = nunits.to_constant ();
8570 *n_loads = 0;
8571 bool load_seen = false;
8572 for (unsigned i = 0; i < in_nlanes; ++i)
8574 if (i % const_nunits == 0)
8576 if (load_seen)
8577 *n_loads += 1;
8578 load_seen = false;
8580 if (bitmap_bit_p (used_in_lanes, i))
8581 load_seen = true;
8583 if (load_seen)
8584 *n_loads += 1;
8588 if (dce_chain)
8589 for (unsigned i = 0; i < dr_chain.length (); ++i)
8590 if (!bitmap_bit_p (used_defs, i))
8592 gimple *stmt = SSA_NAME_DEF_STMT (dr_chain[i]);
8593 gimple_stmt_iterator rgsi = gsi_for_stmt (stmt);
8594 gsi_remove (&rgsi, true);
8595 release_defs (stmt);
8598 return true;
8601 /* Generate vector permute statements from a list of loads in DR_CHAIN.
8602 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
8603 permute statements for the SLP node NODE. Store the number of vector
8604 permute instructions in *N_PERMS and the number of vector load
8605 instructions in *N_LOADS. If DCE_CHAIN is true, remove all definitions
8606 that were not needed. */
8608 bool
8609 vect_transform_slp_perm_load (vec_info *vinfo,
8610 slp_tree node, const vec<tree> &dr_chain,
8611 gimple_stmt_iterator *gsi, poly_uint64 vf,
8612 bool analyze_only, unsigned *n_perms,
8613 unsigned int *n_loads, bool dce_chain)
8615 return vect_transform_slp_perm_load_1 (vinfo, node,
8616 SLP_TREE_LOAD_PERMUTATION (node),
8617 dr_chain, gsi, vf, analyze_only,
8618 dump_enabled_p (), n_perms, n_loads,
8619 dce_chain);
8622 /* Produce the next vector result for SLP permutation NODE by adding a vector
8623 statement at GSI. If MASK_VEC is nonnull, add:
8625 <new SSA name> = VEC_PERM_EXPR <FIRST_DEF, SECOND_DEF, MASK_VEC>
8627 otherwise add:
8629 <new SSA name> = FIRST_DEF. */
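/* Descriptive note on IDENTITY_OFFSET (not covered by the comment
   above): it is only used in the no-mask case when the type of
   FIRST_DEF differs from the node's vector type and gives the element
   offset of the BIT_FIELD_REF lowpart extract generated instead. */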
8631 static void
8632 vect_add_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
8633 slp_tree node, tree first_def, tree second_def,
8634 tree mask_vec, poly_uint64 identity_offset)
8636 tree vectype = SLP_TREE_VECTYPE (node);
8638 /* ??? We SLP match existing vector element extracts but
8639 allow punning which we need to re-instantiate at uses
8640 but have no good way of explicitly representing. */
8641 if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)), TYPE_SIZE (vectype))
8642 && !types_compatible_p (TREE_TYPE (first_def), vectype))
8644 gassign *conv_stmt
8645 = gimple_build_assign (make_ssa_name (vectype),
8646 build1 (VIEW_CONVERT_EXPR, vectype, first_def));
8647 vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
8648 first_def = gimple_assign_lhs (conv_stmt);
8650 gassign *perm_stmt;
8651 tree perm_dest = make_ssa_name (vectype);
8652 if (mask_vec)
8654 if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)),
8655 TYPE_SIZE (vectype))
8656 && !types_compatible_p (TREE_TYPE (second_def), vectype))
8658 gassign *conv_stmt
8659 = gimple_build_assign (make_ssa_name (vectype),
8660 build1 (VIEW_CONVERT_EXPR,
8661 vectype, second_def));
8662 vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
8663 second_def = gimple_assign_lhs (conv_stmt);
8665 perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
8666 first_def, second_def,
8667 mask_vec);
8669 else if (!types_compatible_p (TREE_TYPE (first_def), vectype))
8671 /* For identity permutes we still need to handle the case
8672 of offsetted extracts or concats. */
8673 unsigned HOST_WIDE_INT c;
8674 auto first_def_nunits
8675 = TYPE_VECTOR_SUBPARTS (TREE_TYPE (first_def));
8676 if (known_le (TYPE_VECTOR_SUBPARTS (vectype), first_def_nunits))
8678 unsigned HOST_WIDE_INT elsz
8679 = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (TREE_TYPE (first_def))));
8680 tree lowpart = build3 (BIT_FIELD_REF, vectype, first_def,
8681 TYPE_SIZE (vectype),
8682 bitsize_int (identity_offset * elsz));
8683 perm_stmt = gimple_build_assign (perm_dest, lowpart);
8685 else if (constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype),
8686 first_def_nunits, &c) && c == 2)
8688 tree ctor = build_constructor_va (vectype, 2, NULL_TREE, first_def,
8689 NULL_TREE, second_def);
8690 perm_stmt = gimple_build_assign (perm_dest, ctor);
8692 else
8693 gcc_unreachable ();
8695 else
8697 /* We need a copy here in case the def was external. */
8698 perm_stmt = gimple_build_assign (perm_dest, first_def);
8700 vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
8701 /* Store the vector statement in NODE. */
8702 node->push_vec_def (perm_stmt);
8705 /* Subroutine of vectorizable_slp_permutation. Check whether the target
8706 can perform permutation PERM on the (1 or 2) input nodes in CHILDREN.
8707 If GSI is nonnull, emit the permutation there.
8709 When GSI is null, the only purpose of NODE is to give properties
8710 of the result, such as the vector type and number of SLP lanes.
8711 The node does not need to be a VEC_PERM_EXPR.
8713 If the target supports the operation, return the number of individual
8714 VEC_PERM_EXPRs needed, otherwise return -1. Print information to the
8715 dump file if DUMP_P is true. */
8717 static int
8718 vectorizable_slp_permutation_1 (vec_info *vinfo, gimple_stmt_iterator *gsi,
8719 slp_tree node, lane_permutation_t &perm,
8720 vec<slp_tree> &children, bool dump_p)
8722 tree vectype = SLP_TREE_VECTYPE (node);
8724 /* ??? We currently only support all same vector input types
8725 while the SLP IL should really do a concat + select and thus accept
8726 arbitrary mismatches. */
8727 slp_tree child;
8728 unsigned i;
8729 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8730 bool repeating_p = multiple_p (nunits, SLP_TREE_LANES (node));
8731 tree op_vectype = NULL_TREE;
8732 FOR_EACH_VEC_ELT (children, i, child)
8733 if (SLP_TREE_VECTYPE (child))
8735 op_vectype = SLP_TREE_VECTYPE (child);
8736 break;
8738 if (!op_vectype)
8739 op_vectype = vectype;
8740 FOR_EACH_VEC_ELT (children, i, child)
8742 if ((SLP_TREE_DEF_TYPE (child) != vect_internal_def
8743 && !vect_maybe_update_slp_op_vectype (child, op_vectype))
8744 || !types_compatible_p (SLP_TREE_VECTYPE (child), op_vectype)
8745 || !types_compatible_p (TREE_TYPE (vectype), TREE_TYPE (op_vectype)))
8747 if (dump_p)
8748 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8749 "Unsupported vector types in lane permutation\n");
8750 return -1;
8752 if (SLP_TREE_LANES (child) != SLP_TREE_LANES (node))
8753 repeating_p = false;
8756 gcc_assert (perm.length () == SLP_TREE_LANES (node));
8757 if (dump_p)
8759 dump_printf_loc (MSG_NOTE, vect_location,
8760 "vectorizing permutation");
8761 for (unsigned i = 0; i < perm.length (); ++i)
8762 dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
8763 if (repeating_p)
8764 dump_printf (MSG_NOTE, " (repeat %d)\n", SLP_TREE_LANES (node));
8765 dump_printf (MSG_NOTE, "\n");
8768 /* REPEATING_P is true if every output vector is guaranteed to use the
8769 same permute vector. We can handle that case for both variable-length
8770 and constant-length vectors, but we only handle other cases for
8771 constant-length vectors.
8773 Set:
8775 - NPATTERNS and NELTS_PER_PATTERN to the encoding of the permute
8776 mask vector that we want to build.
8778 - NCOPIES to the number of copies of PERM that we need in order
8779 to build the necessary permute mask vectors.
8781 - NOUTPUTS_PER_MASK to the number of output vectors we want to create
8782 for each permute mask vector. This is only relevant when GSI is
8783 nonnull. */
8784 uint64_t npatterns;
8785 unsigned nelts_per_pattern;
8786 uint64_t ncopies;
8787 unsigned noutputs_per_mask;
8788 if (repeating_p)
8790 /* We need a single permute mask vector that has the form:
8792 { X1, ..., Xn, X1 + n, ..., Xn + n, X1 + 2n, ..., Xn + 2n, ... }
8794 In other words, the original n-element permute in PERM is
8795 "unrolled" to fill a full vector. The stepped vector encoding
8796 that we use for permutes requires 3n elements. */
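/* Illustrative instance: with n == 2 lanes and the swap { op0[1], op0[0] }
   the mask is encoded with 2 patterns of 3 elements each as
   { 1, 0, 3, 2, 5, 4 }, which the stepped encoding extends with a step
   of 2 to fill vectors of any length.  */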
8797 npatterns = SLP_TREE_LANES (node);
8798 nelts_per_pattern = ncopies = 3;
8799 noutputs_per_mask = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
8801 else
8803 /* Calculate every element of every permute mask vector explicitly,
8804 instead of relying on the pattern described above. */
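/* For instance (illustrative), with a constant V4SI output type and a
   loop vectorization factor of 2 this yields NPATTERNS = 4,
   NELTS_PER_PATTERN = 1, NCOPIES = 2 and NOUTPUTS_PER_MASK = 1.  */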
8805 if (!nunits.is_constant (&npatterns))
8806 return -1;
8807 nelts_per_pattern = ncopies = 1;
8808 if (loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo))
8809 if (!LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&ncopies))
8810 return -1;
8811 noutputs_per_mask = 1;
8813 unsigned olanes = ncopies * SLP_TREE_LANES (node);
8814 gcc_assert (repeating_p || multiple_p (olanes, nunits));
8816 /* Compute the { { SLP operand, vector index}, lane } permutation sequence
8817 from the { SLP operand, scalar lane } permutation as recorded in the
8818 SLP node as intermediate step. This part should already work
8819 with SLP children with arbitrary number of lanes. */
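/* For example (illustrative), with V4SI children an accumulated scalar
   lane 6 of operand 0 maps to { { 0, 6 / 4 }, 6 % 4 }, i.e. vector 1,
   lane 2 of that operand.  */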
8820 auto_vec<std::pair<std::pair<unsigned, unsigned>, unsigned> > vperm;
8821 auto_vec<unsigned> active_lane;
8822 vperm.create (olanes);
8823 active_lane.safe_grow_cleared (children.length (), true);
8824 for (unsigned i = 0; i < ncopies; ++i)
8826 for (unsigned pi = 0; pi < perm.length (); ++pi)
8828 std::pair<unsigned, unsigned> p = perm[pi];
8829 tree vtype = SLP_TREE_VECTYPE (children[p.first]);
8830 if (repeating_p)
8831 vperm.quick_push ({{p.first, 0}, p.second + active_lane[p.first]});
8832 else
8834 /* We checked above that the vectors are constant-length. */
8835 unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype).to_constant ();
8836 unsigned vi = (active_lane[p.first] + p.second) / vnunits;
8837 unsigned vl = (active_lane[p.first] + p.second) % vnunits;
8838 vperm.quick_push ({{p.first, vi}, vl});
8841 /* Advance to the next group. */
8842 for (unsigned j = 0; j < children.length (); ++j)
8843 active_lane[j] += SLP_TREE_LANES (children[j]);
8846 if (dump_p)
8848 dump_printf_loc (MSG_NOTE, vect_location,
8849 "vectorizing permutation");
8850 for (unsigned i = 0; i < perm.length (); ++i)
8851 dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
8852 if (repeating_p)
8853 dump_printf (MSG_NOTE, " (repeat %d)\n", SLP_TREE_LANES (node));
8854 dump_printf (MSG_NOTE, "\n");
8855 dump_printf_loc (MSG_NOTE, vect_location, "as");
8856 for (unsigned i = 0; i < vperm.length (); ++i)
8858 if (i != 0
8859 && (repeating_p
8860 ? multiple_p (i, npatterns)
8861 : multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype))))
8862 dump_printf (MSG_NOTE, ",");
8863 dump_printf (MSG_NOTE, " vops%u[%u][%u]",
8864 vperm[i].first.first, vperm[i].first.second,
8865 vperm[i].second);
8867 dump_printf (MSG_NOTE, "\n");
8870 /* We can only handle two-vector permutes, everything else should
8871 be lowered on the SLP level. The following is closely inspired
8872 by vect_transform_slp_perm_load and is supposed to eventually
8873 replace it.
8874 ??? As intermediate step do code-gen in the SLP tree representation
8875 somehow? */
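/* The loop below gathers mask elements until a full encoded mask of
   COUNT elements is built, remembers the (at most two) input vectors it
   refers to, queries the target via can_vec_perm_const_p (identity
   permutes are handled separately) and, when GSI is nonnull, emits the
   VEC_PERM_EXPRs for it.  */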
8876 std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U);
8877 std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U);
8878 unsigned int index = 0;
8879 poly_uint64 mask_element;
8880 vec_perm_builder mask;
8881 mask.new_vector (nunits, npatterns, nelts_per_pattern);
8882 unsigned int count = mask.encoded_nelts ();
8883 mask.quick_grow (count);
8884 vec_perm_indices indices;
8885 unsigned nperms = 0;
8886 for (unsigned i = 0; i < vperm.length (); ++i)
8888 mask_element = vperm[i].second;
8889 if (first_vec.first == -1U
8890 || first_vec == vperm[i].first)
8891 first_vec = vperm[i].first;
8892 else if (second_vec.first == -1U
8893 || second_vec == vperm[i].first)
8895 second_vec = vperm[i].first;
8896 mask_element += nunits;
8898 else
8900 if (dump_p)
8901 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8902 "permutation requires at "
8903 "least three vectors\n");
8904 gcc_assert (!gsi);
8905 return -1;
8908 mask[index++] = mask_element;
8910 if (index == count)
8912 indices.new_vector (mask, second_vec.first == -1U ? 1 : 2,
8913 TYPE_VECTOR_SUBPARTS (op_vectype));
8914 bool identity_p = (indices.series_p (0, 1, mask[0], 1)
8915 && constant_multiple_p (mask[0], nunits));
8916 machine_mode vmode = TYPE_MODE (vectype);
8917 machine_mode op_vmode = TYPE_MODE (op_vectype);
8918 unsigned HOST_WIDE_INT c;
8919 if ((!identity_p
8920 && !can_vec_perm_const_p (vmode, op_vmode, indices))
8921 || (identity_p
8922 && !known_le (nunits,
8923 TYPE_VECTOR_SUBPARTS (op_vectype))
8924 && (!constant_multiple_p (nunits,
8925 TYPE_VECTOR_SUBPARTS (op_vectype),
8926 &c) || c != 2)))
8928 if (dump_p)
8930 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
8931 vect_location,
8932 "unsupported vect permute { ");
8933 for (i = 0; i < count; ++i)
8935 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
8936 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
8938 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
8940 gcc_assert (!gsi);
8941 return -1;
8944 if (!identity_p)
8945 nperms++;
8946 if (gsi)
8948 if (second_vec.first == -1U)
8949 second_vec = first_vec;
8951 slp_tree
8952 first_node = children[first_vec.first],
8953 second_node = children[second_vec.first];
8955 tree mask_vec = NULL_TREE;
8956 if (!identity_p)
8957 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
8959 for (unsigned int vi = 0; vi < noutputs_per_mask; ++vi)
8961 tree first_def
8962 = vect_get_slp_vect_def (first_node,
8963 first_vec.second + vi);
8964 tree second_def
8965 = vect_get_slp_vect_def (second_node,
8966 second_vec.second + vi);
8967 vect_add_slp_permutation (vinfo, gsi, node, first_def,
8968 second_def, mask_vec, mask[0]);
8972 index = 0;
8973 first_vec = std::make_pair (-1U, -1U);
8974 second_vec = std::make_pair (-1U, -1U);
8978 return nperms;
8981 /* Vectorize the SLP permutations in NODE as specified
8982 in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
8983 child number and lane number.
8984 Interleaving of two two-lane two-child SLP subtrees (not supported):
8985 [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
8986 A blend of two four-lane two-child SLP subtrees:
8987 [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
8988 Highpart of a four-lane one-child SLP subtree (not supported):
8989 [ { 0, 2 }, { 0, 3 } ]
8990 Currently only a subset of these is supported by the code generation below. */
8992 static bool
8993 vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
8994 slp_tree node, stmt_vector_for_cost *cost_vec)
8996 tree vectype = SLP_TREE_VECTYPE (node);
8997 lane_permutation_t &perm = SLP_TREE_LANE_PERMUTATION (node);
8998 int nperms = vectorizable_slp_permutation_1 (vinfo, gsi, node, perm,
8999 SLP_TREE_CHILDREN (node),
9000 dump_enabled_p ());
9001 if (nperms < 0)
9002 return false;
9004 if (!gsi)
9005 record_stmt_cost (cost_vec, nperms, vec_perm, node, vectype, 0, vect_body);
9007 return true;
9010 /* Vectorize SLP NODE. */
9012 static void
9013 vect_schedule_slp_node (vec_info *vinfo,
9014 slp_tree node, slp_instance instance)
9016 gimple_stmt_iterator si;
9017 int i;
9018 slp_tree child;
9020 /* For existing vectors there's nothing to do. */
9021 if (SLP_TREE_DEF_TYPE (node) == vect_external_def
9022 && SLP_TREE_VEC_DEFS (node).exists ())
9023 return;
9025 gcc_assert (SLP_TREE_VEC_DEFS (node).is_empty ());
9027 /* Vectorize externals and constants. */
9028 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
9029 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
9031 /* ??? vectorizable_shift can end up using a scalar operand which is
9032 currently denoted as !SLP_TREE_VECTYPE. No need to vectorize the
9033 node in this case. */
9034 if (!SLP_TREE_VECTYPE (node))
9035 return;
9037 vect_create_constant_vectors (vinfo, node);
9038 return;
9041 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
9043 gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
9044 SLP_TREE_VEC_DEFS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
9046 if (dump_enabled_p ())
9047 dump_printf_loc (MSG_NOTE, vect_location,
9048 "------>vectorizing SLP node starting from: %G",
9049 stmt_info->stmt);
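/* Pick the insertion point: loads and stores are anchored at their
   scalar stmts, PHIs need no insertion iterator, and everything else is
   emitted right after the latest of its children's vectorized defs.  */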
9051 if (STMT_VINFO_DATA_REF (stmt_info)
9052 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
9054 /* Vectorized loads go before the first scalar load to make the
9055 result ready early; vectorized stores go before the last scalar
9056 stmt, which is where all uses are ready. */
9057 stmt_vec_info last_stmt_info = NULL;
9058 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
9059 last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
9060 else /* DR_IS_WRITE */
9061 last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
9062 si = gsi_for_stmt (last_stmt_info->stmt);
9064 else if ((STMT_VINFO_TYPE (stmt_info) == cycle_phi_info_type
9065 || STMT_VINFO_TYPE (stmt_info) == induc_vec_info_type
9066 || STMT_VINFO_TYPE (stmt_info) == phi_info_type)
9067 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
9069 /* For PHI node vectorization we do not use the insertion iterator. */
9070 si = gsi_none ();
9072 else
9074 /* Emit other stmts after the children's vectorized defs, which is
9075 the earliest position possible. */
9076 gimple *last_stmt = NULL;
9077 bool seen_vector_def = false;
9078 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
9079 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
9081 /* For fold-left reductions we are retaining the scalar
9082 reduction PHI, but we still have SLP_TREE_NUM_VEC_STMTS
9083 set, so the representation isn't perfect. Resort to the
9084 last scalar def here. */
9085 if (SLP_TREE_VEC_DEFS (child).is_empty ())
9087 gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child))
9088 == cycle_phi_info_type);
9089 gphi *phi = as_a <gphi *>
9090 (vect_find_last_scalar_stmt_in_slp (child)->stmt);
9091 if (!last_stmt
9092 || vect_stmt_dominates_stmt_p (last_stmt, phi))
9093 last_stmt = phi;
9095 /* We emit all vectorized stmts of a child at the same place, so
9096 the last def pushed is also the last one in the IL.
9097 ??? Unless we have a load permutation applied and that
9098 happens to re-use an earlier generated load. */
9099 unsigned j;
9100 tree vdef;
9101 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
9103 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
9104 if (!last_stmt
9105 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
9106 last_stmt = vstmt;
9109 else if (!SLP_TREE_VECTYPE (child))
9111 /* For externals used unvectorized we look at all their scalar defs. */
9112 unsigned j;
9113 tree def;
9114 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child), j, def)
9115 if (TREE_CODE (def) == SSA_NAME
9116 && !SSA_NAME_IS_DEFAULT_DEF (def))
9118 gimple *stmt = SSA_NAME_DEF_STMT (def);
9119 if (!last_stmt
9120 || vect_stmt_dominates_stmt_p (last_stmt, stmt))
9121 last_stmt = stmt;
9124 else
9126 /* For externals we have to look at all defs since their
9127 insertion place is decided per vector. But beware
9128 of pre-existing vectors where we need to make sure
9129 we do not insert before the region boundary. */
9130 if (SLP_TREE_SCALAR_OPS (child).is_empty ()
9131 && !vinfo->lookup_def (SLP_TREE_VEC_DEFS (child)[0]))
9132 seen_vector_def = true;
9133 else
9135 unsigned j;
9136 tree vdef;
9137 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
9138 if (TREE_CODE (vdef) == SSA_NAME
9139 && !SSA_NAME_IS_DEFAULT_DEF (vdef))
9141 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
9142 if (!last_stmt
9143 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
9144 last_stmt = vstmt;
9148 /* This can happen when all children are pre-existing vectors or
9149 constants. */
9150 if (!last_stmt)
9151 last_stmt = vect_find_first_scalar_stmt_in_slp (node)->stmt;
9152 if (!last_stmt)
9154 gcc_assert (seen_vector_def);
9155 si = gsi_after_labels (as_a <bb_vec_info> (vinfo)->bbs[0]);
9157 else if (is_ctrl_altering_stmt (last_stmt))
9159 /* We split regions to vectorize at control altering stmts
9160 with a definition so this must be an external which
9161 we can insert at the start of the region. */
9162 si = gsi_after_labels (as_a <bb_vec_info> (vinfo)->bbs[0]);
9164 else if (is_a <bb_vec_info> (vinfo)
9165 && gimple_bb (last_stmt) != gimple_bb (stmt_info->stmt)
9166 && gimple_could_trap_p (stmt_info->stmt))
9168 /* We've constrained possibly trapping operations to all come
9169 from the same basic block; even if vectorized defs would allow
9170 earlier scheduling, still force the vectorized stmts into the
9171 original block. This is only necessary for BB vectorization
9172 since for loop vectorization all operations are in a single BB
9173 and scalar-stmt-based placement doesn't play well with epilogue vectorization. */
9174 gcc_assert (dominated_by_p (CDI_DOMINATORS,
9175 gimple_bb (stmt_info->stmt),
9176 gimple_bb (last_stmt)));
9177 si = gsi_after_labels (gimple_bb (stmt_info->stmt));
9179 else if (is_a <gphi *> (last_stmt))
9180 si = gsi_after_labels (gimple_bb (last_stmt));
9181 else
9183 si = gsi_for_stmt (last_stmt);
9184 gsi_next (&si);
9188 /* Handle purely internal nodes. */
9189 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
9191 /* ??? The transform kind is stored in STMT_VINFO_TYPE, which might
9192 be shared with different SLP nodes (but usually it's the same
9193 operation, apart from the case where the stmt is only there to denote
9194 the actual scalar lane defs ...). So do not call vect_transform_stmt
9195 but open-code it here (partly). */
9196 bool done = vectorizable_slp_permutation (vinfo, &si, node, NULL);
9197 gcc_assert (done);
9198 stmt_vec_info slp_stmt_info;
9199 unsigned int i;
9200 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info)
9201 if (STMT_VINFO_LIVE_P (slp_stmt_info))
9203 done = vectorizable_live_operation (vinfo, slp_stmt_info, node,
9204 instance, i, true, NULL);
9205 gcc_assert (done);
9208 else
9209 vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
9212 /* Replace scalar calls from SLP node NODE with assignments setting their
9213 lhs to zero. For loop vectorization this is done in vectorizable_call,
9214 but for SLP it needs to be deferred until the end of vect_schedule_slp,
9215 because multiple SLP instances may refer to the same scalar stmt. */
9217 static void
9218 vect_remove_slp_scalar_calls (vec_info *vinfo,
9219 slp_tree node, hash_set<slp_tree> &visited)
9221 gimple *new_stmt;
9222 gimple_stmt_iterator gsi;
9223 int i;
9224 slp_tree child;
9225 tree lhs;
9226 stmt_vec_info stmt_info;
9228 if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
9229 return;
9231 if (visited.add (node))
9232 return;
9234 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
9235 vect_remove_slp_scalar_calls (vinfo, child, visited);
9237 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
9239 gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
9240 if (!stmt || gimple_bb (stmt) == NULL)
9241 continue;
9242 if (is_pattern_stmt_p (stmt_info)
9243 || !PURE_SLP_STMT (stmt_info))
9244 continue;
9245 lhs = gimple_call_lhs (stmt);
9246 if (lhs)
9247 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
9248 else
9250 new_stmt = gimple_build_nop ();
9251 unlink_stmt_vdef (stmt_info->stmt);
9253 gsi = gsi_for_stmt (stmt);
9254 vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
9255 if (lhs)
9256 SSA_NAME_DEF_STMT (lhs) = new_stmt;
9260 static void
9261 vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
9263 hash_set<slp_tree> visited;
9264 vect_remove_slp_scalar_calls (vinfo, node, visited);
9267 /* Vectorize the instance root. */
9269 void
9270 vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
9272 gassign *rstmt = NULL;
9274 if (instance->kind == slp_inst_kind_ctor)
9276 if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1)
9278 tree vect_lhs = SLP_TREE_VEC_DEFS (node)[0];
9279 tree root_lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
9280 if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
9281 TREE_TYPE (vect_lhs)))
9282 vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
9283 vect_lhs);
9284 rstmt = gimple_build_assign (root_lhs, vect_lhs);
9286 else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1)
9288 int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
9289 tree child_def;
9290 int j;
9291 vec<constructor_elt, va_gc> *v;
9292 vec_alloc (v, nelts);
9294 /* A CTOR can handle V16HI composition from VNx8HI so we
9295 do not need to convert vector elements if the types
9296 do not match. */
9297 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (node), j, child_def)
9298 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, child_def);
9299 tree lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
9300 tree rtype
9301 = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmts[0]->stmt));
9302 tree r_constructor = build_constructor (rtype, v);
9303 rstmt = gimple_build_assign (lhs, r_constructor);
9306 else if (instance->kind == slp_inst_kind_bb_reduc)
9308 /* Largely inspired by reduction chain epilogue handling in
9309 vect_create_epilog_for_reduction. */
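/* In short: the vector defs are accumulated with the reduction
   operation (possibly punned to an unsigned type to avoid introducing
   undefined overflow), reduced to a scalar with the matching reduction
   internal function, and any remaining scalar defs of the instance are
   folded in at the end.  */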
9310 vec<tree> vec_defs = vNULL;
9311 vect_get_slp_defs (node, &vec_defs);
9312 enum tree_code reduc_code
9313 = gimple_assign_rhs_code (instance->root_stmts[0]->stmt);
9314 /* ??? We actually have to reflect signs somewhere. */
9315 if (reduc_code == MINUS_EXPR)
9316 reduc_code = PLUS_EXPR;
9317 gimple_seq epilogue = NULL;
9318 /* We may end up with more than one vector result; reduce them
9319 to a single vector. */
9320 tree vec_def = vec_defs[0];
9321 tree vectype = TREE_TYPE (vec_def);
9322 tree compute_vectype = vectype;
9323 bool pun_for_overflow_p = (ANY_INTEGRAL_TYPE_P (vectype)
9324 && TYPE_OVERFLOW_UNDEFINED (vectype)
9325 && operation_can_overflow (reduc_code));
9326 if (pun_for_overflow_p)
9328 compute_vectype = unsigned_type_for (vectype);
9329 vec_def = gimple_build (&epilogue, VIEW_CONVERT_EXPR,
9330 compute_vectype, vec_def);
9332 for (unsigned i = 1; i < vec_defs.length (); ++i)
9334 tree def = vec_defs[i];
9335 if (pun_for_overflow_p)
9336 def = gimple_build (&epilogue, VIEW_CONVERT_EXPR,
9337 compute_vectype, def);
9338 vec_def = gimple_build (&epilogue, reduc_code, compute_vectype,
9339 vec_def, def);
9341 vec_defs.release ();
9342 /* ??? Support other schemes than direct internal fn. */
9343 internal_fn reduc_fn;
9344 if (!reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
9345 || reduc_fn == IFN_LAST)
9346 gcc_unreachable ();
9347 tree scalar_def = gimple_build (&epilogue, as_combined_fn (reduc_fn),
9348 TREE_TYPE (compute_vectype), vec_def);
9349 if (!SLP_INSTANCE_REMAIN_DEFS (instance).is_empty ())
9351 tree rem_def = NULL_TREE;
9352 for (auto def : SLP_INSTANCE_REMAIN_DEFS (instance))
9354 def = gimple_convert (&epilogue, TREE_TYPE (scalar_def), def);
9355 if (!rem_def)
9356 rem_def = def;
9357 else
9358 rem_def = gimple_build (&epilogue, reduc_code,
9359 TREE_TYPE (scalar_def),
9360 rem_def, def);
9362 scalar_def = gimple_build (&epilogue, reduc_code,
9363 TREE_TYPE (scalar_def),
9364 scalar_def, rem_def);
9366 scalar_def = gimple_convert (&epilogue,
9367 TREE_TYPE (vectype), scalar_def);
9368 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
9369 gsi_insert_seq_before (&rgsi, epilogue, GSI_SAME_STMT);
9370 gimple_assign_set_rhs_from_tree (&rgsi, scalar_def);
9371 update_stmt (gsi_stmt (rgsi));
9372 return;
9374 else
9375 gcc_unreachable ();
9377 gcc_assert (rstmt);
9379 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
9380 gsi_replace (&rgsi, rstmt, true);
9383 struct slp_scc_info
9385 bool on_stack;
9386 int dfs;
9387 int lowlink;
9390 /* Schedule the SLP INSTANCE doing a DFS walk and collecting SCCs. */
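/* This is a Tarjan-style walk: DFS numbers and lowlinks live in SCC_INFO,
   nodes stay on STACK while their SCC is still open, and a node whose
   lowlink equals its own DFS number closes and schedules the SCC it
   roots.  */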
9392 static void
9393 vect_schedule_scc (vec_info *vinfo, slp_tree node, slp_instance instance,
9394 hash_map<slp_tree, slp_scc_info> &scc_info,
9395 int &maxdfs, vec<slp_tree> &stack)
9397 bool existed_p;
9398 slp_scc_info *info = &scc_info.get_or_insert (node, &existed_p);
9399 gcc_assert (!existed_p);
9400 info->dfs = maxdfs;
9401 info->lowlink = maxdfs;
9402 maxdfs++;
9404 /* Leaf. */
9405 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
9407 info->on_stack = false;
9408 vect_schedule_slp_node (vinfo, node, instance);
9409 return;
9412 info->on_stack = true;
9413 stack.safe_push (node);
9415 unsigned i;
9416 slp_tree child;
9417 /* DFS recurse. */
9418 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
9420 if (!child)
9421 continue;
9422 slp_scc_info *child_info = scc_info.get (child);
9423 if (!child_info)
9425 vect_schedule_scc (vinfo, child, instance, scc_info, maxdfs, stack);
9426 /* Recursion might have re-allocated the SCC info entries, so re-fetch them. */
9427 info = scc_info.get (node);
9428 child_info = scc_info.get (child);
9429 info->lowlink = MIN (info->lowlink, child_info->lowlink);
9431 else if (child_info->on_stack)
9432 info->lowlink = MIN (info->lowlink, child_info->dfs);
9434 if (info->lowlink != info->dfs)
9435 return;
9437 auto_vec<slp_tree, 4> phis_to_fixup;
9439 /* Singleton. */
9440 if (stack.last () == node)
9442 stack.pop ();
9443 info->on_stack = false;
9444 vect_schedule_slp_node (vinfo, node, instance);
9445 if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
9446 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (node)->stmt))
9447 phis_to_fixup.quick_push (node);
9449 else
9451 /* SCC. */
9452 int last_idx = stack.length () - 1;
9453 while (stack[last_idx] != node)
9454 last_idx--;
9455 /* We can break the cycle at PHIs that have at least one
9456 code-generated child. Then we could re-start the DFS walk until
9457 all nodes in the SCC are covered (we might have new entries
9458 for only back-reachable nodes). But it's simpler to just
9459 iterate and schedule those that are ready. */
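/* A PHI counts as ready as soon as one of its children is already
   scheduled (or missing), while a non-PHI is ready only once none of
   its children remain on the stack.  */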
9460 unsigned todo = stack.length () - last_idx;
9463 for (int idx = stack.length () - 1; idx >= last_idx; --idx)
9465 slp_tree entry = stack[idx];
9466 if (!entry)
9467 continue;
9468 bool phi = (SLP_TREE_CODE (entry) != VEC_PERM_EXPR
9469 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (entry)->stmt));
9470 bool ready = !phi;
9471 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry), i, child)
9472 if (!child)
9474 gcc_assert (phi);
9475 ready = true;
9476 break;
9478 else if (scc_info.get (child)->on_stack)
9480 if (!phi)
9482 ready = false;
9483 break;
9486 else
9488 if (phi)
9490 ready = true;
9491 break;
9494 if (ready)
9496 vect_schedule_slp_node (vinfo, entry, instance);
9497 scc_info.get (entry)->on_stack = false;
9498 stack[idx] = NULL;
9499 todo--;
9500 if (phi)
9501 phis_to_fixup.safe_push (entry);
9505 while (todo != 0);
9507 /* Pop the SCC. */
9508 stack.truncate (last_idx);
9511 /* Now fix up the backedge defs of the vectorized PHIs in this SCC. */
9512 slp_tree phi_node;
9513 FOR_EACH_VEC_ELT (phis_to_fixup, i, phi_node)
9515 gphi *phi = as_a <gphi *> (SLP_TREE_REPRESENTATIVE (phi_node)->stmt);
9516 edge_iterator ei;
9517 edge e;
9518 FOR_EACH_EDGE (e, ei, gimple_bb (phi)->preds)
9520 unsigned dest_idx = e->dest_idx;
9521 child = SLP_TREE_CHILDREN (phi_node)[dest_idx];
9522 if (!child || SLP_TREE_DEF_TYPE (child) != vect_internal_def)
9523 continue;
9524 unsigned n = SLP_TREE_VEC_DEFS (phi_node).length ();
9525 /* Simply fill all args. */
9526 if (STMT_VINFO_DEF_TYPE (SLP_TREE_REPRESENTATIVE (phi_node))
9527 != vect_first_order_recurrence)
9528 for (unsigned i = 0; i < n; ++i)
9530 tree phidef = SLP_TREE_VEC_DEFS (phi_node)[i];
9531 gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef));
9532 add_phi_arg (phi, vect_get_slp_vect_def (child, i),
9533 e, gimple_phi_arg_location (phi, dest_idx));
9535 else
9537 /* Unless it is a first-order recurrence, which needs
9538 args filled in for both the PHI node and the permutes. */
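/* The vectorized PHI receives the last vector def of the child on this
   edge, while each permute stmt gets the previous and the current
   vector def as its two operands.  */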
9539 gimple *perm
9540 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (phi_node)[0]);
9541 gimple *rphi = SSA_NAME_DEF_STMT (gimple_assign_rhs1 (perm));
9542 add_phi_arg (as_a <gphi *> (rphi),
9543 vect_get_slp_vect_def (child, n - 1),
9544 e, gimple_phi_arg_location (phi, dest_idx));
9545 for (unsigned i = 0; i < n; ++i)
9547 gimple *perm
9548 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (phi_node)[i]);
9549 if (i > 0)
9550 gimple_assign_set_rhs1 (perm,
9551 vect_get_slp_vect_def (child, i - 1));
9552 gimple_assign_set_rhs2 (perm,
9553 vect_get_slp_vect_def (child, i));
9554 update_stmt (perm);
9561 /* Generate vector code for SLP_INSTANCES in the loop/basic block. */
9563 void
9564 vect_schedule_slp (vec_info *vinfo, const vec<slp_instance> &slp_instances)
9566 slp_instance instance;
9567 unsigned int i;
9569 hash_map<slp_tree, slp_scc_info> scc_info;
9570 int maxdfs = 0;
9571 FOR_EACH_VEC_ELT (slp_instances, i, instance)
9573 slp_tree node = SLP_INSTANCE_TREE (instance);
9574 if (dump_enabled_p ())
9576 dump_printf_loc (MSG_NOTE, vect_location,
9577 "Vectorizing SLP tree:\n");
9578 /* ??? Dump all? */
9579 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
9580 dump_printf_loc (MSG_NOTE, vect_location, "Root stmt: %G",
9581 SLP_INSTANCE_ROOT_STMTS (instance)[0]->stmt);
9582 vect_print_slp_graph (MSG_NOTE, vect_location,
9583 SLP_INSTANCE_TREE (instance));
9585 /* Schedule the tree of INSTANCE, scheduling SCCs in a way that
9586 has a PHI be the node breaking the cycle. */
9587 auto_vec<slp_tree> stack;
9588 if (!scc_info.get (node))
9589 vect_schedule_scc (vinfo, node, instance, scc_info, maxdfs, stack);
9591 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
9592 vectorize_slp_instance_root_stmt (node, instance);
9594 if (dump_enabled_p ())
9595 dump_printf_loc (MSG_NOTE, vect_location,
9596 "vectorizing stmts using SLP.\n");
9599 FOR_EACH_VEC_ELT (slp_instances, i, instance)
9601 slp_tree root = SLP_INSTANCE_TREE (instance);
9602 stmt_vec_info store_info;
9603 unsigned int j;
9605 /* Remove scalar call stmts. Do not do this for basic-block
9606 vectorization as not all uses may be vectorized.
9607 ??? Why should this be necessary? DCE should be able to
9608 remove the stmts itself.
9609 ??? For BB vectorization we can as well remove scalar
9610 stmts starting from the SLP tree root if they have no
9611 uses. */
9612 if (is_a <loop_vec_info> (vinfo))
9613 vect_remove_slp_scalar_calls (vinfo, root);
9615 /* Remove the original scalar stmts of vectorized stores. */
9616 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
9618 if (!STMT_VINFO_DATA_REF (store_info)
9619 || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info)))
9620 break;
9622 store_info = vect_orig_stmt (store_info);
9623 /* Free the attached stmt_vec_info and remove the stmt. */
9624 vinfo->remove_stmt (store_info);
9626 /* Invalidate SLP_TREE_REPRESENTATIVE in case we released it,
9627 so we do not crash in vect_free_slp_tree later. */
9628 if (SLP_TREE_REPRESENTATIVE (root) == store_info)
9629 SLP_TREE_REPRESENTATIVE (root) = NULL;