Fix ICE with SLP and -fdbg-cnt
gcc/tree-vect-slp.cc
1 /* SLP - Basic Block Vectorization
2 Copyright (C) 2007-2023 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #define INCLUDE_ALGORITHM
24 #include "system.h"
25 #include "coretypes.h"
26 #include "backend.h"
27 #include "target.h"
28 #include "rtl.h"
29 #include "tree.h"
30 #include "gimple.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "insn-config.h"
35 #include "recog.h" /* FIXME: for insn_data */
36 #include "fold-const.h"
37 #include "stor-layout.h"
38 #include "gimple-iterator.h"
39 #include "cfgloop.h"
40 #include "tree-vectorizer.h"
41 #include "langhooks.h"
42 #include "gimple-walk.h"
43 #include "dbgcnt.h"
44 #include "tree-vector-builder.h"
45 #include "vec-perm-indices.h"
46 #include "gimple-fold.h"
47 #include "internal-fn.h"
48 #include "dump-context.h"
49 #include "cfganal.h"
50 #include "tree-eh.h"
51 #include "tree-cfg.h"
52 #include "alloc-pool.h"
53 #include "sreal.h"
54 #include "predict.h"
56 static bool vect_transform_slp_perm_load_1 (vec_info *, slp_tree,
57 load_permutation_t &,
58 const vec<tree> &,
59 gimple_stmt_iterator *,
60 poly_uint64, bool, bool,
61 unsigned *,
62 unsigned * = nullptr,
63 bool = false);
64 static int vectorizable_slp_permutation_1 (vec_info *, gimple_stmt_iterator *,
65 slp_tree, lane_permutation_t &,
66 vec<slp_tree> &, bool);
67 static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
68 slp_tree, stmt_vector_for_cost *);
69 static void vect_print_slp_tree (dump_flags_t, dump_location_t, slp_tree);
71 static object_allocator<_slp_tree> *slp_tree_pool;
72 static slp_tree slp_first_node;
74 void
75 vect_slp_init (void)
77 slp_tree_pool = new object_allocator<_slp_tree> ("SLP nodes");
80 void
81 vect_slp_fini (void)
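/* The destructor unlinks a node from the slp_first_node list, so deleting
   the list head repeatedly releases every SLP node that is still live.  */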
83 while (slp_first_node)
84 delete slp_first_node;
85 delete slp_tree_pool;
86 slp_tree_pool = NULL;
89 void *
90 _slp_tree::operator new (size_t n)
92 gcc_assert (n == sizeof (_slp_tree));
93 return slp_tree_pool->allocate_raw ();
96 void
97 _slp_tree::operator delete (void *node, size_t n)
99 gcc_assert (n == sizeof (_slp_tree));
100 slp_tree_pool->remove_raw (node);
104 /* Initialize a SLP node. */
106 _slp_tree::_slp_tree ()
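/* Chain the new node at the head of the global slp_first_node list so that
   vect_slp_fini can reclaim any nodes still live at the end of
   vectorization.  */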
108 this->prev_node = NULL;
109 if (slp_first_node)
110 slp_first_node->prev_node = this;
111 this->next_node = slp_first_node;
112 slp_first_node = this;
113 SLP_TREE_SCALAR_STMTS (this) = vNULL;
114 SLP_TREE_SCALAR_OPS (this) = vNULL;
115 SLP_TREE_VEC_DEFS (this) = vNULL;
116 SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
117 SLP_TREE_CHILDREN (this) = vNULL;
118 SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
119 SLP_TREE_LANE_PERMUTATION (this) = vNULL;
120 SLP_TREE_SIMD_CLONE_INFO (this) = vNULL;
121 SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
122 SLP_TREE_CODE (this) = ERROR_MARK;
123 SLP_TREE_VECTYPE (this) = NULL_TREE;
124 SLP_TREE_REPRESENTATIVE (this) = NULL;
125 SLP_TREE_REF_COUNT (this) = 1;
126 this->failed = NULL;
127 this->max_nunits = 1;
128 this->lanes = 0;
131 /* Tear down a SLP node. */
133 _slp_tree::~_slp_tree ()
135 if (this->prev_node)
136 this->prev_node->next_node = this->next_node;
137 else
138 slp_first_node = this->next_node;
139 if (this->next_node)
140 this->next_node->prev_node = this->prev_node;
141 SLP_TREE_CHILDREN (this).release ();
142 SLP_TREE_SCALAR_STMTS (this).release ();
143 SLP_TREE_SCALAR_OPS (this).release ();
144 SLP_TREE_VEC_DEFS (this).release ();
145 SLP_TREE_LOAD_PERMUTATION (this).release ();
146 SLP_TREE_LANE_PERMUTATION (this).release ();
147 SLP_TREE_SIMD_CLONE_INFO (this).release ();
148 if (this->failed)
149 free (failed);
152 /* Push the single SSA definition in DEF to the vector of vector defs. */
154 void
155 _slp_tree::push_vec_def (gimple *def)
157 if (gphi *phi = dyn_cast <gphi *> (def))
158 vec_defs.quick_push (gimple_phi_result (phi));
159 else
161 def_operand_p defop = single_ssa_def_operand (def, SSA_OP_ALL_DEFS);
162 vec_defs.quick_push (get_def_from_ptr (defop));
166 /* Recursively free the memory allocated for the SLP tree rooted at NODE. */
168 void
169 vect_free_slp_tree (slp_tree node)
171 int i;
172 slp_tree child;
174 if (--SLP_TREE_REF_COUNT (node) != 0)
175 return;
177 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
178 if (child)
179 vect_free_slp_tree (child);
181 /* If the node defines any SLP only patterns then those patterns are no
182 longer valid and should be removed. */
183 stmt_vec_info rep_stmt_info = SLP_TREE_REPRESENTATIVE (node);
184 if (rep_stmt_info && STMT_VINFO_SLP_VECT_ONLY_PATTERN (rep_stmt_info))
186 stmt_vec_info stmt_info = vect_orig_stmt (rep_stmt_info);
187 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
188 STMT_SLP_TYPE (stmt_info) = STMT_SLP_TYPE (rep_stmt_info);
191 delete node;
194 /* Return a location suitable for dumps related to the SLP instance. */
196 dump_user_location_t
197 _slp_instance::location () const
199 if (!root_stmts.is_empty ())
200 return root_stmts[0]->stmt;
201 else
202 return SLP_TREE_SCALAR_STMTS (root)[0]->stmt;
206 /* Free the memory allocated for the SLP instance. */
208 void
209 vect_free_slp_instance (slp_instance instance)
211 vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
212 SLP_INSTANCE_LOADS (instance).release ();
213 SLP_INSTANCE_ROOT_STMTS (instance).release ();
214 SLP_INSTANCE_REMAIN_DEFS (instance).release ();
215 instance->subgraph_entries.release ();
216 instance->cost_vec.release ();
217 free (instance);
221 /* Create an SLP node for SCALAR_STMTS. */
223 slp_tree
224 vect_create_new_slp_node (unsigned nops, tree_code code)
226 slp_tree node = new _slp_tree;
227 SLP_TREE_SCALAR_STMTS (node) = vNULL;
228 SLP_TREE_CHILDREN (node).create (nops);
229 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
230 SLP_TREE_CODE (node) = code;
231 return node;
233 /* Create an SLP node for SCALAR_STMTS. */
235 static slp_tree
236 vect_create_new_slp_node (slp_tree node,
237 vec<stmt_vec_info> scalar_stmts, unsigned nops)
239 SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
240 SLP_TREE_CHILDREN (node).create (nops);
241 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
242 SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
243 SLP_TREE_LANES (node) = scalar_stmts.length ();
244 return node;
247 /* Create an SLP node for SCALAR_STMTS. */
249 static slp_tree
250 vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
252 return vect_create_new_slp_node (new _slp_tree, scalar_stmts, nops);
255 /* Create an SLP node for OPS. */
257 static slp_tree
258 vect_create_new_slp_node (slp_tree node, vec<tree> ops)
260 SLP_TREE_SCALAR_OPS (node) = ops;
261 SLP_TREE_DEF_TYPE (node) = vect_external_def;
262 SLP_TREE_LANES (node) = ops.length ();
263 return node;
266 /* Create an SLP node for OPS. */
268 static slp_tree
269 vect_create_new_slp_node (vec<tree> ops)
271 return vect_create_new_slp_node (new _slp_tree, ops);
275 /* This structure is used in creation of an SLP tree. Each instance
276 corresponds to the same operand in a group of scalar stmts in an SLP
277 node. */
278 typedef struct _slp_oprnd_info
280 /* Def-stmts for the operands. */
281 vec<stmt_vec_info> def_stmts;
282 /* Operands. */
283 vec<tree> ops;
284 /* Information about the first statement: its vector def-type, its type, the
285 operand itself in case it's constant, an indication whether it's a pattern
286 stmt, and gather/scatter info. */
287 tree first_op_type;
288 enum vect_def_type first_dt;
289 bool any_pattern;
290 bool first_gs_p;
291 gather_scatter_info first_gs_info;
292 } *slp_oprnd_info;
295 /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
296 operand. */
297 static vec<slp_oprnd_info>
298 vect_create_oprnd_info (int nops, int group_size)
300 int i;
301 slp_oprnd_info oprnd_info;
302 vec<slp_oprnd_info> oprnds_info;
304 oprnds_info.create (nops);
305 for (i = 0; i < nops; i++)
307 oprnd_info = XNEW (struct _slp_oprnd_info);
308 oprnd_info->def_stmts.create (group_size);
309 oprnd_info->ops.create (group_size);
310 oprnd_info->first_dt = vect_uninitialized_def;
311 oprnd_info->first_op_type = NULL_TREE;
312 oprnd_info->any_pattern = false;
313 oprnd_info->first_gs_p = false;
314 oprnds_info.quick_push (oprnd_info);
317 return oprnds_info;
321 /* Free operands info. */
323 static void
324 vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
326 int i;
327 slp_oprnd_info oprnd_info;
329 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
331 oprnd_info->def_stmts.release ();
332 oprnd_info->ops.release ();
333 XDELETE (oprnd_info);
336 oprnds_info.release ();
339 /* Return the execution frequency of NODE (so that a higher value indicates
340 a "more important" node when optimizing for speed). */
342 static sreal
343 vect_slp_node_weight (slp_tree node)
345 stmt_vec_info stmt_info = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (node));
346 basic_block bb = gimple_bb (stmt_info->stmt);
347 return bb->count.to_sreal_scale (ENTRY_BLOCK_PTR_FOR_FN (cfun)->count);
350 /* Return true if STMTS contains a pattern statement. */
352 static bool
353 vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
355 stmt_vec_info stmt_info;
356 unsigned int i;
357 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
358 if (is_pattern_stmt_p (stmt_info))
359 return true;
360 return false;
363 /* Return true when all lanes in the external or constant NODE have
364 the same value. */
366 static bool
367 vect_slp_tree_uniform_p (slp_tree node)
369 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
370 || SLP_TREE_DEF_TYPE (node) == vect_external_def);
372 /* Pre-existing vectors. */
373 if (SLP_TREE_SCALAR_OPS (node).is_empty ())
374 return false;
376 unsigned i;
377 tree op, first = NULL_TREE;
378 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
379 if (!first)
380 first = op;
381 else if (!operand_equal_p (first, op, 0))
382 return false;
384 return true;
387 /* Find the place of the data-ref in STMT_INFO in the interleaving chain
388 that starts from FIRST_STMT_INFO. Return -1 if the data-ref is not a part
389 of the chain. */
392 vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
393 stmt_vec_info first_stmt_info)
395 stmt_vec_info next_stmt_info = first_stmt_info;
396 int result = 0;
398 if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
399 return -1;
403 if (next_stmt_info == stmt_info)
404 return result;
405 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
406 if (next_stmt_info)
407 result += DR_GROUP_GAP (next_stmt_info);
409 while (next_stmt_info);
411 return -1;
414 /* Check whether it is possible to load COUNT elements of type ELT_TYPE
415 using the method implemented by duplicate_and_interleave. Return true
416 if so, returning the number of intermediate vectors in *NVECTORS_OUT
417 (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
418 (if nonnull). */
420 bool
421 can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
422 tree elt_type, unsigned int *nvectors_out,
423 tree *vector_type_out,
424 tree *permutes)
426 tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
427 if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
428 return false;
430 machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
431 poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
432 unsigned int nvectors = 1;
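/* Each iteration of the loop below halves the number of scalar bytes fused
   into a single integer element and doubles NVECTORS, until the target can
   permute the resulting vector type or the element size no longer divides
   evenly.  */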
433 for (;;)
435 scalar_int_mode int_mode;
436 poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
437 if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
439 /* Get the natural vector type for this SLP group size. */
440 tree int_type = build_nonstandard_integer_type
441 (GET_MODE_BITSIZE (int_mode), 1);
442 tree vector_type
443 = get_vectype_for_scalar_type (vinfo, int_type, count);
444 poly_int64 half_nelts;
445 if (vector_type
446 && VECTOR_MODE_P (TYPE_MODE (vector_type))
447 && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
448 GET_MODE_SIZE (base_vector_mode))
449 && multiple_p (GET_MODE_NUNITS (TYPE_MODE (vector_type)),
450 2, &half_nelts))
452 /* Try fusing consecutive sequences of COUNT / NVECTORS elements
453 together into elements of type INT_TYPE and using the result
454 to build NVECTORS vectors. */
455 poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
456 vec_perm_builder sel1 (nelts, 2, 3);
457 vec_perm_builder sel2 (nelts, 2, 3);
459 for (unsigned int i = 0; i < 3; ++i)
461 sel1.quick_push (i);
462 sel1.quick_push (i + nelts);
463 sel2.quick_push (half_nelts + i);
464 sel2.quick_push (half_nelts + i + nelts);
466 vec_perm_indices indices1 (sel1, 2, nelts);
467 vec_perm_indices indices2 (sel2, 2, nelts);
468 machine_mode vmode = TYPE_MODE (vector_type);
469 if (can_vec_perm_const_p (vmode, vmode, indices1)
470 && can_vec_perm_const_p (vmode, vmode, indices2))
472 if (nvectors_out)
473 *nvectors_out = nvectors;
474 if (vector_type_out)
475 *vector_type_out = vector_type;
476 if (permutes)
478 permutes[0] = vect_gen_perm_mask_checked (vector_type,
479 indices1);
480 permutes[1] = vect_gen_perm_mask_checked (vector_type,
481 indices2);
483 return true;
487 if (!multiple_p (elt_bytes, 2, &elt_bytes))
488 return false;
489 nvectors *= 2;
493 /* Return true if DTA and DTB match. */
495 static bool
496 vect_def_types_match (enum vect_def_type dta, enum vect_def_type dtb)
498 return (dta == dtb
499 || ((dta == vect_external_def || dta == vect_constant_def)
500 && (dtb == vect_external_def || dtb == vect_constant_def)));
503 static const int cond_expr_maps[3][5] = {
504 { 4, -1, -2, 1, 2 },
505 { 4, -2, -1, 1, 2 },
506 { 4, -1, -2, 2, 1 }
508 static const int arg1_map[] = { 1, 1 };
509 static const int arg2_map[] = { 1, 2 };
510 static const int arg1_arg4_map[] = { 2, 1, 4 };
511 static const int arg3_arg2_map[] = { 2, 3, 2 };
512 static const int op1_op0_map[] = { 2, 1, 0 };
513 static const int off_map[] = { 1, -3 };
514 static const int off_op0_map[] = { 2, -3, 0 };
515 static const int off_arg2_map[] = { 2, -3, 2 };
516 static const int off_arg3_arg2_map[] = { 3, -3, 3, 2 };
517 static const int mask_call_maps[6][7] = {
518 { 1, 1, },
519 { 2, 1, 2, },
520 { 3, 1, 2, 3, },
521 { 4, 1, 2, 3, 4, },
522 { 5, 1, 2, 3, 4, 5, },
523 { 6, 1, 2, 3, 4, 5, 6 },
526 /* For most SLP statements, there is a one-to-one mapping between
527 gimple arguments and child nodes. If that is not true for STMT,
528 return an array that contains:
530 - the number of child nodes, followed by
531 - for each child node, the index of the argument associated with that node.
532 The special index -1 is the first operand of an embedded comparison and
533 the special index -2 is the second operand of an embedded comparison.
534 The special index -3 is the offset of a gather as analyzed by
535 vect_check_gather_scatter.
537 SWAP is as for vect_get_and_check_slp_defs. */
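/* For example, arg2_map above is { 1, 2 }: a single child node which
   corresponds to call argument 2 (the mask operand of IFN_MASK_LOAD).
   Likewise the three cond_expr_maps describe SWAP values 0, 1 and 2: the
   COND_EXPR taken as-is, with the operands of the embedded comparison
   swapped, and with the true/false arms swapped (comparison code
   inverted).  */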
539 static const int *
540 vect_get_operand_map (const gimple *stmt, bool gather_scatter_p = false,
541 unsigned char swap = 0)
543 if (auto assign = dyn_cast<const gassign *> (stmt))
545 if (gimple_assign_rhs_code (assign) == COND_EXPR
546 && COMPARISON_CLASS_P (gimple_assign_rhs1 (assign)))
547 return cond_expr_maps[swap];
548 if (TREE_CODE_CLASS (gimple_assign_rhs_code (assign)) == tcc_comparison
549 && swap)
550 return op1_op0_map;
551 if (gather_scatter_p)
552 return (TREE_CODE (gimple_assign_lhs (assign)) != SSA_NAME
553 ? off_op0_map : off_map);
555 gcc_assert (!swap);
556 if (auto call = dyn_cast<const gcall *> (stmt))
558 if (gimple_call_internal_p (call))
559 switch (gimple_call_internal_fn (call))
561 case IFN_MASK_LOAD:
562 return gather_scatter_p ? off_arg2_map : arg2_map;
564 case IFN_GATHER_LOAD:
565 return arg1_map;
567 case IFN_MASK_GATHER_LOAD:
568 case IFN_MASK_LEN_GATHER_LOAD:
569 return arg1_arg4_map;
571 case IFN_MASK_STORE:
572 return gather_scatter_p ? off_arg3_arg2_map : arg3_arg2_map;
574 case IFN_MASK_CALL:
576 unsigned nargs = gimple_call_num_args (call);
577 if (nargs >= 2 && nargs <= 7)
578 return mask_call_maps[nargs-2];
579 else
580 return nullptr;
583 default:
584 break;
587 return nullptr;
590 /* Return the SLP node child index for operand OP of STMT. */
593 vect_slp_child_index_for_operand (const gimple *stmt, int op,
594 bool gather_scatter_p)
596 const int *opmap = vect_get_operand_map (stmt, gather_scatter_p);
597 if (!opmap)
598 return op;
599 for (int i = 1; i < 1 + opmap[0]; ++i)
600 if (opmap[i] == op)
601 return i - 1;
602 gcc_unreachable ();
605 /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
606 they are of a valid type and that they match the defs of the first stmt of
607 the SLP group (stored in OPRNDS_INFO). This function tries to match stmts
608 by swapping operands of STMTS[STMT_NUM] when possible. Non-zero SWAP
609 indicates swap is required for cond_expr stmts. Specifically, SWAP
610 is 1 if STMT is a COND_EXPR and the operands of its comparison need to be swapped;
611 SWAP is 2 if STMT is a COND_EXPR and the code of its comparison needs to be inverted.
613 If there was a fatal error return -1; if the error could be corrected by
614 swapping operands of father node of this one, return 1; if everything is
615 ok return 0. */
616 static int
617 vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap,
618 bool *skip_args,
619 vec<stmt_vec_info> stmts, unsigned stmt_num,
620 vec<slp_oprnd_info> *oprnds_info)
622 stmt_vec_info stmt_info = stmts[stmt_num];
623 tree oprnd;
624 unsigned int i, number_of_oprnds;
625 enum vect_def_type dt = vect_uninitialized_def;
626 slp_oprnd_info oprnd_info;
627 gather_scatter_info gs_info;
628 unsigned int gs_op = -1u;
629 unsigned int commutative_op = -1U;
630 bool first = stmt_num == 0;
632 if (!is_a<gcall *> (stmt_info->stmt)
633 && !is_a<gassign *> (stmt_info->stmt)
634 && !is_a<gphi *> (stmt_info->stmt))
635 return -1;
637 number_of_oprnds = gimple_num_args (stmt_info->stmt);
638 const int *map
639 = vect_get_operand_map (stmt_info->stmt,
640 STMT_VINFO_GATHER_SCATTER_P (stmt_info), swap);
641 if (map)
642 number_of_oprnds = *map++;
643 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
645 if (gimple_call_internal_p (stmt))
647 internal_fn ifn = gimple_call_internal_fn (stmt);
648 commutative_op = first_commutative_argument (ifn);
651 else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
653 if (commutative_tree_code (gimple_assign_rhs_code (stmt)))
654 commutative_op = 0;
657 bool swapped = (swap != 0);
658 bool backedge = false;
659 enum vect_def_type *dts = XALLOCAVEC (enum vect_def_type, number_of_oprnds);
660 for (i = 0; i < number_of_oprnds; i++)
662 oprnd_info = (*oprnds_info)[i];
663 int opno = map ? map[i] : int (i);
664 if (opno == -3)
666 gcc_assert (STMT_VINFO_GATHER_SCATTER_P (stmt_info));
667 if (!is_a <loop_vec_info> (vinfo)
668 || !vect_check_gather_scatter (stmt_info,
669 as_a <loop_vec_info> (vinfo),
670 first ? &oprnd_info->first_gs_info
671 : &gs_info))
672 return -1;
674 if (first)
676 oprnd_info->first_gs_p = true;
677 oprnd = oprnd_info->first_gs_info.offset;
679 else
681 gs_op = i;
682 oprnd = gs_info.offset;
685 else if (opno < 0)
686 oprnd = TREE_OPERAND (gimple_arg (stmt_info->stmt, 0), -1 - opno);
687 else
689 oprnd = gimple_arg (stmt_info->stmt, opno);
690 if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
692 edge e = gimple_phi_arg_edge (stmt, opno);
693 backedge = (is_a <bb_vec_info> (vinfo)
694 ? e->flags & EDGE_DFS_BACK
695 : dominated_by_p (CDI_DOMINATORS, e->src,
696 gimple_bb (stmt_info->stmt)));
699 if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
700 oprnd = TREE_OPERAND (oprnd, 0);
702 stmt_vec_info def_stmt_info;
703 if (!vect_is_simple_use (oprnd, vinfo, &dts[i], &def_stmt_info))
705 if (dump_enabled_p ())
706 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
707 "Build SLP failed: can't analyze def for %T\n",
708 oprnd);
710 return -1;
713 if (skip_args[i])
715 oprnd_info->def_stmts.quick_push (NULL);
716 oprnd_info->ops.quick_push (NULL_TREE);
717 oprnd_info->first_dt = vect_uninitialized_def;
718 continue;
721 oprnd_info->def_stmts.quick_push (def_stmt_info);
722 oprnd_info->ops.quick_push (oprnd);
724 if (def_stmt_info
725 && is_pattern_stmt_p (def_stmt_info))
727 if (STMT_VINFO_RELATED_STMT (vect_orig_stmt (def_stmt_info))
728 != def_stmt_info)
729 oprnd_info->any_pattern = true;
730 else
731 /* If we promote this to external, use the original stmt def. */
732 oprnd_info->ops.last ()
733 = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
736 /* If there's an extern def on a backedge, make sure we can
737 code-generate at the region start.
738 ??? This is another case that could be fixed by adjusting
739 how we split the function but at the moment we'd have conflicting
740 goals there. */
741 if (backedge
742 && dts[i] == vect_external_def
743 && is_a <bb_vec_info> (vinfo)
744 && TREE_CODE (oprnd) == SSA_NAME
745 && !SSA_NAME_IS_DEFAULT_DEF (oprnd)
746 && !dominated_by_p (CDI_DOMINATORS,
747 as_a <bb_vec_info> (vinfo)->bbs[0],
748 gimple_bb (SSA_NAME_DEF_STMT (oprnd))))
750 if (dump_enabled_p ())
751 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
752 "Build SLP failed: extern def %T only defined "
753 "on backedge\n", oprnd);
754 return -1;
757 if (first)
759 tree type = TREE_TYPE (oprnd);
760 dt = dts[i];
761 if ((dt == vect_constant_def
762 || dt == vect_external_def)
763 && !GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
764 && TREE_CODE (type) != BOOLEAN_TYPE
765 && !can_duplicate_and_interleave_p (vinfo, stmts.length (), type))
767 if (dump_enabled_p ())
768 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
769 "Build SLP failed: invalid type of def "
770 "for variable-length SLP %T\n", oprnd);
771 return -1;
774 /* For the swapping logic below force vect_reduction_def
775 for the reduction op in a SLP reduction group. */
776 if (!STMT_VINFO_DATA_REF (stmt_info)
777 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
778 && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
779 && def_stmt_info)
780 dts[i] = dt = vect_reduction_def;
782 /* Check the types of the definition. */
783 switch (dt)
785 case vect_external_def:
786 case vect_constant_def:
787 case vect_internal_def:
788 case vect_reduction_def:
789 case vect_induction_def:
790 case vect_nested_cycle:
791 case vect_first_order_recurrence:
792 break;
794 default:
795 /* FORNOW: Not supported. */
796 if (dump_enabled_p ())
797 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
798 "Build SLP failed: illegal type of def %T\n",
799 oprnd);
800 return -1;
803 oprnd_info->first_dt = dt;
804 oprnd_info->first_op_type = type;
807 if (first)
808 return 0;
810 /* Now match the operand definition types to that of the first stmt. */
811 for (i = 0; i < number_of_oprnds;)
813 if (skip_args[i])
815 ++i;
816 continue;
819 oprnd_info = (*oprnds_info)[i];
820 dt = dts[i];
821 stmt_vec_info def_stmt_info = oprnd_info->def_stmts[stmt_num];
822 oprnd = oprnd_info->ops[stmt_num];
823 tree type = TREE_TYPE (oprnd);
825 if (!types_compatible_p (oprnd_info->first_op_type, type))
827 if (dump_enabled_p ())
828 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
829 "Build SLP failed: different operand types\n");
830 return 1;
833 if ((gs_op == i) != oprnd_info->first_gs_p)
835 if (dump_enabled_p ())
836 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
837 "Build SLP failed: mixed gather and non-gather\n");
838 return 1;
840 else if (gs_op == i)
842 if (!operand_equal_p (oprnd_info->first_gs_info.base,
843 gs_info.base))
845 if (dump_enabled_p ())
846 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
847 "Build SLP failed: different gather base\n");
848 return 1;
850 if (oprnd_info->first_gs_info.scale != gs_info.scale)
852 if (dump_enabled_p ())
853 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
854 "Build SLP failed: different gather scale\n");
855 return 1;
859 /* Not first stmt of the group, check that the def-stmt/s match
860 the def-stmt/s of the first stmt. Allow different definition
861 types for reduction chains: the first stmt must be a
862 vect_reduction_def (a phi node), and the rest
863 must end up in the reduction chain. */
864 if ((!vect_def_types_match (oprnd_info->first_dt, dt)
865 && !(oprnd_info->first_dt == vect_reduction_def
866 && !STMT_VINFO_DATA_REF (stmt_info)
867 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
868 && def_stmt_info
869 && !STMT_VINFO_DATA_REF (def_stmt_info)
870 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
871 == REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
872 || (!STMT_VINFO_DATA_REF (stmt_info)
873 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
874 && ((!def_stmt_info
875 || STMT_VINFO_DATA_REF (def_stmt_info)
876 || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
877 != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
878 != (oprnd_info->first_dt != vect_reduction_def))))
880 /* Try swapping operands if we got a mismatch. For BB
881 vectorization do this only in case it will clearly improve things. */
882 if (i == commutative_op && !swapped
883 && (!is_a <bb_vec_info> (vinfo)
884 || (!vect_def_types_match ((*oprnds_info)[i+1]->first_dt,
885 dts[i+1])
886 && (vect_def_types_match (oprnd_info->first_dt, dts[i+1])
887 || vect_def_types_match
888 ((*oprnds_info)[i+1]->first_dt, dts[i])))))
890 if (dump_enabled_p ())
891 dump_printf_loc (MSG_NOTE, vect_location,
892 "trying swapped operands\n");
893 std::swap (dts[i], dts[i+1]);
894 std::swap ((*oprnds_info)[i]->def_stmts[stmt_num],
895 (*oprnds_info)[i+1]->def_stmts[stmt_num]);
896 std::swap ((*oprnds_info)[i]->ops[stmt_num],
897 (*oprnds_info)[i+1]->ops[stmt_num]);
898 swapped = true;
899 continue;
902 if (is_a <bb_vec_info> (vinfo)
903 && !oprnd_info->any_pattern)
905 /* Now for commutative ops we should see whether we can
906 make the other operand match. */
907 if (dump_enabled_p ())
908 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
909 "treating operand as external\n");
910 oprnd_info->first_dt = dt = vect_external_def;
912 else
914 if (dump_enabled_p ())
915 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
916 "Build SLP failed: different types\n");
917 return 1;
921 /* Make sure to demote the overall operand to external. */
922 if (dt == vect_external_def)
923 oprnd_info->first_dt = vect_external_def;
924 /* For a SLP reduction chain we want to duplicate the reduction to
925 each of the chain members. That gets us a sane SLP graph (though
926 the stmts are not 100% correct wrt the initial values). */
927 else if ((dt == vect_internal_def
928 || dt == vect_reduction_def)
929 && oprnd_info->first_dt == vect_reduction_def
930 && !STMT_VINFO_DATA_REF (stmt_info)
931 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
932 && !STMT_VINFO_DATA_REF (def_stmt_info)
933 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
934 == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
936 oprnd_info->def_stmts[stmt_num] = oprnd_info->def_stmts[0];
937 oprnd_info->ops[stmt_num] = oprnd_info->ops[0];
940 ++i;
943 /* Swap operands. */
944 if (swapped)
946 if (dump_enabled_p ())
947 dump_printf_loc (MSG_NOTE, vect_location,
948 "swapped operands to match def types in %G",
949 stmt_info->stmt);
952 return 0;
955 /* Return true if call statements CALL1 and CALL2 are similar enough
956 to be combined into the same SLP group. */
958 bool
959 compatible_calls_p (gcall *call1, gcall *call2)
961 unsigned int nargs = gimple_call_num_args (call1);
962 if (nargs != gimple_call_num_args (call2))
963 return false;
965 if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
966 return false;
968 if (gimple_call_internal_p (call1))
970 if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
971 TREE_TYPE (gimple_call_lhs (call2))))
972 return false;
973 for (unsigned int i = 0; i < nargs; ++i)
974 if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
975 TREE_TYPE (gimple_call_arg (call2, i))))
976 return false;
978 else
980 if (!operand_equal_p (gimple_call_fn (call1),
981 gimple_call_fn (call2), 0))
982 return false;
984 if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
985 return false;
988 /* Check that any unvectorized arguments are equal. */
989 if (const int *map = vect_get_operand_map (call1))
991 unsigned int nkept = *map++;
992 unsigned int mapi = 0;
993 for (unsigned int i = 0; i < nargs; ++i)
994 if (mapi < nkept && map[mapi] == int (i))
995 mapi += 1;
996 else if (!operand_equal_p (gimple_call_arg (call1, i),
997 gimple_call_arg (call2, i)))
998 return false;
1001 return true;
1004 /* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
1005 caller's attempt to find the vector type in STMT_INFO with the narrowest
1006 element type. Return true if VECTYPE is nonnull and if it is valid
1007 for STMT_INFO. When returning true, update MAX_NUNITS to reflect the
1008 number of units in VECTYPE. GROUP_SIZE and MAX_NUNITS are as for
1009 vect_build_slp_tree. */
1011 static bool
1012 vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
1013 unsigned int group_size,
1014 tree vectype, poly_uint64 *max_nunits)
1016 if (!vectype)
1018 if (dump_enabled_p ())
1019 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1020 "Build SLP failed: unsupported data-type in %G\n",
1021 stmt_info->stmt);
1022 /* Fatal mismatch. */
1023 return false;
1026 /* If populating the vector type requires unrolling then fail
1027 before adjusting *max_nunits for basic-block vectorization. */
1028 if (is_a <bb_vec_info> (vinfo)
1029 && !multiple_p (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
1031 if (dump_enabled_p ())
1032 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1033 "Build SLP failed: unrolling required "
1034 "in basic block SLP\n");
1035 /* Fatal mismatch. */
1036 return false;
1039 /* In case of multiple types we need to detect the smallest type. */
1040 vect_update_max_nunits (max_nunits, vectype);
1041 return true;
1044 /* Verify whether the scalar stmts STMTS are isomorphic to each other, whether
1045 they require a data permutation and whether they are of unsupported types of
1046 operation. Return true if the stmts form a usable group, otherwise return false and indicate in *MATCHES
1047 which stmts are not isomorphic to the first one. If MATCHES[0]
1048 is false then this indicates the comparison could not be
1049 carried out or the stmts will never be vectorized by SLP.
1051 Note COND_EXPR is possibly isomorphic to another one after swapping its
1052 operands. Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
1053 the first stmt by swapping the two operands of comparison; set SWAP[i]
1054 to 2 if stmt I is isomorphic to the first stmt by inverting the code
1055 of comparison. Take A1 >= B1 ? X1 : Y1 as an example: it can be swapped
1056 to (B1 <= A1 ? X1 : Y1); or be inverted to (A1 < B1) ? Y1 : X1. */
1058 static bool
1059 vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
1060 vec<stmt_vec_info> stmts, unsigned int group_size,
1061 poly_uint64 *max_nunits, bool *matches,
1062 bool *two_operators, tree *node_vectype)
1064 unsigned int i;
1065 stmt_vec_info first_stmt_info = stmts[0];
1066 code_helper first_stmt_code = ERROR_MARK;
1067 code_helper alt_stmt_code = ERROR_MARK;
1068 code_helper rhs_code = ERROR_MARK;
1069 code_helper first_cond_code = ERROR_MARK;
1070 tree lhs;
1071 bool need_same_oprnds = false;
1072 tree vectype = NULL_TREE, first_op1 = NULL_TREE;
1073 stmt_vec_info first_load = NULL, prev_first_load = NULL;
1074 bool first_stmt_ldst_p = false, ldst_p = false;
1075 bool first_stmt_phi_p = false, phi_p = false;
1076 bool maybe_soft_fail = false;
1077 tree soft_fail_nunits_vectype = NULL_TREE;
1079 /* For every stmt in NODE find its def stmt/s. */
1080 stmt_vec_info stmt_info;
1081 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
1083 gimple *stmt = stmt_info->stmt;
1084 swap[i] = 0;
1085 matches[i] = false;
1087 if (dump_enabled_p ())
1088 dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);
1090 /* Fail to vectorize statements marked as unvectorizable, that can throw
1091 internally or that have volatile operands. */
1092 if (!STMT_VINFO_VECTORIZABLE (stmt_info)
1093 || stmt_can_throw_internal (cfun, stmt)
1094 || gimple_has_volatile_ops (stmt))
1096 if (dump_enabled_p ())
1097 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1098 "Build SLP failed: unvectorizable statement %G",
1099 stmt);
1100 /* ??? For BB vectorization we want to commute operands in a way
1101 that shuffles all unvectorizable defs into one operand and keeps
1102 the other still vectorizable. The following doesn't reliably
1103 work for this, but it's the easiest we can do here. */
1104 if (is_a <bb_vec_info> (vinfo) && i != 0)
1105 continue;
1106 /* Fatal mismatch. */
1107 matches[0] = false;
1108 return false;
1111 gcall *call_stmt = dyn_cast <gcall *> (stmt);
1112 lhs = gimple_get_lhs (stmt);
1113 if (lhs == NULL_TREE
1114 && (!call_stmt
1115 || !gimple_call_internal_p (stmt)
1116 || !internal_store_fn_p (gimple_call_internal_fn (stmt))))
1118 if (dump_enabled_p ())
1119 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1120 "Build SLP failed: not GIMPLE_ASSIGN nor "
1121 "GIMPLE_CALL %G", stmt);
1122 if (is_a <bb_vec_info> (vinfo) && i != 0)
1123 continue;
1124 /* Fatal mismatch. */
1125 matches[0] = false;
1126 return false;
1129 tree nunits_vectype;
1130 if (!vect_get_vector_types_for_stmt (vinfo, stmt_info, &vectype,
1131 &nunits_vectype, group_size))
1133 if (is_a <bb_vec_info> (vinfo) && i != 0)
1134 continue;
1135 /* Fatal mismatch. */
1136 matches[0] = false;
1137 return false;
1139 /* Record nunits required but continue analysis, producing matches[]
1140 as if nunits were not an issue. This allows splitting of groups
1141 to happen. */
1142 if (nunits_vectype
1143 && !vect_record_max_nunits (vinfo, stmt_info, group_size,
1144 nunits_vectype, max_nunits))
1146 gcc_assert (is_a <bb_vec_info> (vinfo));
1147 maybe_soft_fail = true;
1148 soft_fail_nunits_vectype = nunits_vectype;
1151 gcc_assert (vectype);
1153 if (call_stmt)
1155 combined_fn cfn = gimple_call_combined_fn (call_stmt);
1156 if (cfn != CFN_LAST && cfn != CFN_MASK_CALL)
1157 rhs_code = cfn;
1158 else
1159 rhs_code = CALL_EXPR;
1161 if (cfn == CFN_MASK_LOAD
1162 || cfn == CFN_GATHER_LOAD
1163 || cfn == CFN_MASK_GATHER_LOAD
1164 || cfn == CFN_MASK_LEN_GATHER_LOAD)
1165 ldst_p = true;
1166 else if (cfn == CFN_MASK_STORE)
1168 ldst_p = true;
1169 rhs_code = CFN_MASK_STORE;
1171 else if ((cfn != CFN_LAST
1172 && cfn != CFN_MASK_CALL
1173 && internal_fn_p (cfn)
1174 && !vectorizable_internal_fn_p (as_internal_fn (cfn)))
1175 || gimple_call_tail_p (call_stmt)
1176 || gimple_call_noreturn_p (call_stmt)
1177 || gimple_call_chain (call_stmt))
1179 if (dump_enabled_p ())
1180 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1181 "Build SLP failed: unsupported call type %G",
1182 (gimple *) call_stmt);
1183 if (is_a <bb_vec_info> (vinfo) && i != 0)
1184 continue;
1185 /* Fatal mismatch. */
1186 matches[0] = false;
1187 return false;
1190 else if (gimple_code (stmt) == GIMPLE_PHI)
1192 rhs_code = ERROR_MARK;
1193 phi_p = true;
1195 else
1197 rhs_code = gimple_assign_rhs_code (stmt);
1198 ldst_p = STMT_VINFO_DATA_REF (stmt_info) != nullptr;
1201 /* Check the operation. */
1202 if (i == 0)
1204 *node_vectype = vectype;
1205 first_stmt_code = rhs_code;
1206 first_stmt_ldst_p = ldst_p;
1207 first_stmt_phi_p = phi_p;
1209 /* Shift arguments should be equal in all the packed stmts for a
1210 vector shift with a scalar shift operand. */
1211 if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
1212 || rhs_code == LROTATE_EXPR
1213 || rhs_code == RROTATE_EXPR)
1215 /* First see if we have a vector/vector shift. */
1216 if (!directly_supported_p (rhs_code, vectype, optab_vector))
1218 /* No vector/vector shift, try for a vector/scalar shift. */
1219 if (!directly_supported_p (rhs_code, vectype, optab_scalar))
1221 if (dump_enabled_p ())
1222 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1223 "Build SLP failed: "
1224 "op not supported by target.\n");
1225 if (is_a <bb_vec_info> (vinfo) && i != 0)
1226 continue;
1227 /* Fatal mismatch. */
1228 matches[0] = false;
1229 return false;
1231 need_same_oprnds = true;
1232 first_op1 = gimple_assign_rhs2 (stmt);
1235 else if (rhs_code == WIDEN_LSHIFT_EXPR)
1237 need_same_oprnds = true;
1238 first_op1 = gimple_assign_rhs2 (stmt);
1240 else if (!ldst_p
1241 && rhs_code == BIT_FIELD_REF)
1243 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
1244 if (!is_a <bb_vec_info> (vinfo)
1245 || TREE_CODE (vec) != SSA_NAME
1246 /* When the element types are not compatible we pun the
1247 source to the target vectype, which requires equal size. */
1248 || ((!VECTOR_TYPE_P (TREE_TYPE (vec))
1249 || !types_compatible_p (TREE_TYPE (vectype),
1250 TREE_TYPE (TREE_TYPE (vec))))
1251 && !operand_equal_p (TYPE_SIZE (vectype),
1252 TYPE_SIZE (TREE_TYPE (vec)))))
1254 if (dump_enabled_p ())
1255 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1256 "Build SLP failed: "
1257 "BIT_FIELD_REF not supported\n");
1258 /* Fatal mismatch. */
1259 matches[0] = false;
1260 return false;
1263 else if (rhs_code == CFN_DIV_POW2)
1265 need_same_oprnds = true;
1266 first_op1 = gimple_call_arg (call_stmt, 1);
1269 else
1271 if (first_stmt_code != rhs_code
1272 && alt_stmt_code == ERROR_MARK)
1273 alt_stmt_code = rhs_code;
1274 if ((first_stmt_code != rhs_code
1275 && (first_stmt_code != IMAGPART_EXPR
1276 || rhs_code != REALPART_EXPR)
1277 && (first_stmt_code != REALPART_EXPR
1278 || rhs_code != IMAGPART_EXPR)
1279 /* Handle mismatches in plus/minus by computing both
1280 and merging the results. */
1281 && !((first_stmt_code == PLUS_EXPR
1282 || first_stmt_code == MINUS_EXPR)
1283 && (alt_stmt_code == PLUS_EXPR
1284 || alt_stmt_code == MINUS_EXPR)
1285 && rhs_code == alt_stmt_code)
1286 && !(first_stmt_code.is_tree_code ()
1287 && rhs_code.is_tree_code ()
1288 && (TREE_CODE_CLASS (tree_code (first_stmt_code))
1289 == tcc_comparison)
1290 && (swap_tree_comparison (tree_code (first_stmt_code))
1291 == tree_code (rhs_code)))
1292 && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
1293 && (first_stmt_code == ARRAY_REF
1294 || first_stmt_code == BIT_FIELD_REF
1295 || first_stmt_code == INDIRECT_REF
1296 || first_stmt_code == COMPONENT_REF
1297 || first_stmt_code == MEM_REF)
1298 && (rhs_code == ARRAY_REF
1299 || rhs_code == BIT_FIELD_REF
1300 || rhs_code == INDIRECT_REF
1301 || rhs_code == COMPONENT_REF
1302 || rhs_code == MEM_REF)))
1303 || (ldst_p
1304 && (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1305 != STMT_VINFO_GROUPED_ACCESS (first_stmt_info)))
1306 || (ldst_p
1307 && (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1308 != STMT_VINFO_GATHER_SCATTER_P (first_stmt_info)))
1309 || first_stmt_ldst_p != ldst_p
1310 || first_stmt_phi_p != phi_p)
1312 if (dump_enabled_p ())
1314 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1315 "Build SLP failed: different operation "
1316 "in stmt %G", stmt);
1317 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1318 "original stmt %G", first_stmt_info->stmt);
1320 /* Mismatch. */
1321 continue;
1324 if (!ldst_p
1325 && first_stmt_code == BIT_FIELD_REF
1326 && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
1327 != TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0)))
1329 if (dump_enabled_p ())
1330 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1331 "Build SLP failed: different BIT_FIELD_REF "
1332 "arguments in %G", stmt);
1333 /* Mismatch. */
1334 continue;
1337 if (call_stmt
1338 && first_stmt_code != CFN_MASK_LOAD
1339 && first_stmt_code != CFN_MASK_STORE)
1341 if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
1342 call_stmt))
1344 if (dump_enabled_p ())
1345 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1346 "Build SLP failed: different calls in %G",
1347 stmt);
1348 /* Mismatch. */
1349 continue;
1353 if ((phi_p || gimple_could_trap_p (stmt_info->stmt))
1354 && (gimple_bb (first_stmt_info->stmt)
1355 != gimple_bb (stmt_info->stmt)))
1357 if (dump_enabled_p ())
1358 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1359 "Build SLP failed: different BB for PHI "
1360 "or possibly trapping operation in %G", stmt);
1361 /* Mismatch. */
1362 continue;
1365 if (need_same_oprnds)
1367 tree other_op1 = gimple_arg (stmt, 1);
1368 if (!operand_equal_p (first_op1, other_op1, 0))
1370 if (dump_enabled_p ())
1371 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1372 "Build SLP failed: different shift "
1373 "arguments in %G", stmt);
1374 /* Mismatch. */
1375 continue;
1379 if (!types_compatible_p (vectype, *node_vectype))
1381 if (dump_enabled_p ())
1382 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1383 "Build SLP failed: different vector type "
1384 "in %G", stmt);
1385 /* Mismatch. */
1386 continue;
1390 /* Grouped store or load. */
1391 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1393 gcc_assert (ldst_p);
1394 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)))
1396 /* Store. */
1397 gcc_assert (rhs_code == CFN_MASK_STORE
1398 || REFERENCE_CLASS_P (lhs)
1399 || DECL_P (lhs));
1401 else
1403 /* Load. */
1404 first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
1405 if (prev_first_load)
1407 /* Check that there are no loads from different interleaving
1408 chains in the same node. */
1409 if (prev_first_load != first_load)
1411 if (dump_enabled_p ())
1412 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
1413 vect_location,
1414 "Build SLP failed: different "
1415 "interleaving chains in one node %G",
1416 stmt);
1417 /* Mismatch. */
1418 continue;
1421 else
1422 prev_first_load = first_load;
1425 /* Non-grouped store or load. */
1426 else if (ldst_p)
1428 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))
1429 && rhs_code != CFN_GATHER_LOAD
1430 && rhs_code != CFN_MASK_GATHER_LOAD
1431 && rhs_code != CFN_MASK_LEN_GATHER_LOAD
1432 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1433 /* Non-grouped loads are handled as externals for BB
1434 vectorization. For loop vectorization we can handle
1435 splats the same way we handle single-element interleaving. */
1436 && (is_a <bb_vec_info> (vinfo)
1437 || stmt_info != first_stmt_info))
1439 /* Not grouped load. */
1440 if (dump_enabled_p ())
1441 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1442 "Build SLP failed: not grouped load %G", stmt);
1444 if (i != 0)
1445 continue;
1446 /* Fatal mismatch. */
1447 matches[0] = false;
1448 return false;
1451 /* Not memory operation. */
1452 else
1454 if (!phi_p
1455 && rhs_code.is_tree_code ()
1456 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_binary
1457 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_unary
1458 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_expression
1459 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_comparison
1460 && rhs_code != VIEW_CONVERT_EXPR
1461 && rhs_code != CALL_EXPR
1462 && rhs_code != BIT_FIELD_REF)
1464 if (dump_enabled_p ())
1465 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1466 "Build SLP failed: operation unsupported %G",
1467 stmt);
1468 if (is_a <bb_vec_info> (vinfo) && i != 0)
1469 continue;
1470 /* Fatal mismatch. */
1471 matches[0] = false;
1472 return false;
1475 if (rhs_code == COND_EXPR)
1477 tree cond_expr = gimple_assign_rhs1 (stmt);
1478 enum tree_code cond_code = TREE_CODE (cond_expr);
1479 enum tree_code swap_code = ERROR_MARK;
1480 enum tree_code invert_code = ERROR_MARK;
1482 if (i == 0)
1483 first_cond_code = TREE_CODE (cond_expr);
1484 else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1486 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
1487 swap_code = swap_tree_comparison (cond_code);
1488 invert_code = invert_tree_comparison (cond_code, honor_nans);
1491 if (first_cond_code == cond_code)
1493 /* Isomorphism can be achieved by swapping. */
1494 else if (first_cond_code == swap_code)
1495 swap[i] = 1;
1496 /* Isomorphism can be achieved by inverting. */
1497 else if (first_cond_code == invert_code)
1498 swap[i] = 2;
1499 else
1501 if (dump_enabled_p ())
1502 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1503 "Build SLP failed: different"
1504 " operation %G", stmt);
1505 /* Mismatch. */
1506 continue;
1510 if (rhs_code.is_tree_code ()
1511 && TREE_CODE_CLASS ((tree_code)rhs_code) == tcc_comparison
1512 && (swap_tree_comparison ((tree_code)first_stmt_code)
1513 == (tree_code)rhs_code))
1514 swap[i] = 1;
1517 matches[i] = true;
1520 for (i = 0; i < group_size; ++i)
1521 if (!matches[i])
1522 return false;
1524 /* If we allowed a two-operation SLP node verify the target can cope
1525 with the permute we are going to use. */
1526 if (alt_stmt_code != ERROR_MARK
1527 && (!alt_stmt_code.is_tree_code ()
1528 || (TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_reference
1529 && TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_comparison)))
1531 *two_operators = true;
1534 if (maybe_soft_fail)
1536 unsigned HOST_WIDE_INT const_nunits;
1537 if (!TYPE_VECTOR_SUBPARTS
1538 (soft_fail_nunits_vectype).is_constant (&const_nunits)
1539 || const_nunits > group_size)
1540 matches[0] = false;
1541 else
1543 /* With constant vector elements simulate a mismatch at the
1544 point we need to split. */
1545 unsigned tail = group_size & (const_nunits - 1);
1546 memset (&matches[group_size - tail], 0, sizeof (bool) * tail);
1548 return false;
1551 return true;
1554 /* Traits for the hash_set to record failed SLP builds for a stmt set.
1555 Note we never remove apart from at destruction time so we do not
1556 need a special value for deleted that differs from empty. */
1557 struct bst_traits
1559 typedef vec <stmt_vec_info> value_type;
1560 typedef vec <stmt_vec_info> compare_type;
1561 static inline hashval_t hash (value_type);
1562 static inline bool equal (value_type existing, value_type candidate);
1563 static inline bool is_empty (value_type x) { return !x.exists (); }
1564 static inline bool is_deleted (value_type x) { return !x.exists (); }
1565 static const bool empty_zero_p = true;
1566 static inline void mark_empty (value_type &x) { x.release (); }
1567 static inline void mark_deleted (value_type &x) { x.release (); }
1568 static inline void remove (value_type &x) { x.release (); }
1570 inline hashval_t
1571 bst_traits::hash (value_type x)
1573 inchash::hash h;
1574 for (unsigned i = 0; i < x.length (); ++i)
1575 h.add_int (gimple_uid (x[i]->stmt));
1576 return h.end ();
1578 inline bool
1579 bst_traits::equal (value_type existing, value_type candidate)
1581 if (existing.length () != candidate.length ())
1582 return false;
1583 for (unsigned i = 0; i < existing.length (); ++i)
1584 if (existing[i] != candidate[i])
1585 return false;
1586 return true;
1589 /* ??? This was std::pair<std::pair<tree_code, vect_def_type>, tree>
1590 but then vec::insert does memmove and that's not compatible with
1591 std::pair. */
1592 struct chain_op_t
1594 chain_op_t (tree_code code_, vect_def_type dt_, tree op_)
1595 : code (code_), dt (dt_), op (op_) {}
1596 tree_code code;
1597 vect_def_type dt;
1598 tree op;
1601 /* Comparator for sorting associatable chains. */
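/* Sort primarily by definition type so that operands of the same kind
   (constants, externals, internal defs) group together, and secondarily by
   operation code.  */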
1603 static int
1604 dt_sort_cmp (const void *op1_, const void *op2_, void *)
1606 auto *op1 = (const chain_op_t *) op1_;
1607 auto *op2 = (const chain_op_t *) op2_;
1608 if (op1->dt != op2->dt)
1609 return (int)op1->dt - (int)op2->dt;
1610 return (int)op1->code - (int)op2->code;
1613 /* Linearize the associatable expression chain at START with the
1614 associatable operation CODE (where PLUS_EXPR also allows MINUS_EXPR),
1615 filling CHAIN with the result and using WORKLIST as intermediate storage.
1616 CODE_STMT and ALT_CODE_STMT are filled with the first stmt using CODE
1617 or MINUS_EXPR. *CHAIN_STMTS if not NULL is filled with all computation
1618 stmts, starting with START. */
1620 static void
1621 vect_slp_linearize_chain (vec_info *vinfo,
1622 vec<std::pair<tree_code, gimple *> > &worklist,
1623 vec<chain_op_t> &chain,
1624 enum tree_code code, gimple *start,
1625 gimple *&code_stmt, gimple *&alt_code_stmt,
1626 vec<gimple *> *chain_stmts)
1628 /* For each lane linearize the addition/subtraction (or other
1629 uniform associatable operation) expression tree. */
1630 worklist.safe_push (std::make_pair (code, start));
1631 while (!worklist.is_empty ())
1633 auto entry = worklist.pop ();
1634 gassign *stmt = as_a <gassign *> (entry.second);
1635 enum tree_code in_code = entry.first;
1636 enum tree_code this_code = gimple_assign_rhs_code (stmt);
1637 /* Pick some stmts suitable for SLP_TREE_REPRESENTATIVE. */
1638 if (!code_stmt
1639 && gimple_assign_rhs_code (stmt) == code)
1640 code_stmt = stmt;
1641 else if (!alt_code_stmt
1642 && gimple_assign_rhs_code (stmt) == MINUS_EXPR)
1643 alt_code_stmt = stmt;
1644 if (chain_stmts)
1645 chain_stmts->safe_push (stmt);
1646 for (unsigned opnum = 1; opnum <= 2; ++opnum)
1648 tree op = gimple_op (stmt, opnum);
1649 vect_def_type dt;
1650 stmt_vec_info def_stmt_info;
1651 bool res = vect_is_simple_use (op, vinfo, &dt, &def_stmt_info);
1652 gcc_assert (res);
1653 if (dt == vect_internal_def
1654 && is_pattern_stmt_p (def_stmt_info))
1655 op = gimple_get_lhs (def_stmt_info->stmt);
1656 gimple *use_stmt;
1657 use_operand_p use_p;
1658 if (dt == vect_internal_def
1659 && single_imm_use (op, &use_p, &use_stmt)
1660 && is_gimple_assign (def_stmt_info->stmt)
1661 && (gimple_assign_rhs_code (def_stmt_info->stmt) == code
1662 || (code == PLUS_EXPR
1663 && (gimple_assign_rhs_code (def_stmt_info->stmt)
1664 == MINUS_EXPR))))
1666 tree_code op_def_code = this_code;
1667 if (op_def_code == MINUS_EXPR && opnum == 1)
1668 op_def_code = PLUS_EXPR;
1669 if (in_code == MINUS_EXPR)
1670 op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
1671 worklist.safe_push (std::make_pair (op_def_code,
1672 def_stmt_info->stmt));
1674 else
1676 tree_code op_def_code = this_code;
1677 if (op_def_code == MINUS_EXPR && opnum == 1)
1678 op_def_code = PLUS_EXPR;
1679 if (in_code == MINUS_EXPR)
1680 op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
1681 chain.safe_push (chain_op_t (op_def_code, dt, op));
1687 typedef hash_map <vec <stmt_vec_info>, slp_tree,
1688 simple_hashmap_traits <bst_traits, slp_tree> >
1689 scalar_stmts_to_slp_tree_map_t;
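/* The map is keyed by the vector of scalar stmts and caches the result of
   SLP discovery for that stmt set: either the already built node to re-use
   (with its reference count bumped) or a node marked failed whose ->failed
   array records the per-lane match information.  */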
1691 static slp_tree
1692 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1693 vec<stmt_vec_info> stmts, unsigned int group_size,
1694 poly_uint64 *max_nunits,
1695 bool *matches, unsigned *limit, unsigned *tree_size,
1696 scalar_stmts_to_slp_tree_map_t *bst_map);
1698 static slp_tree
1699 vect_build_slp_tree (vec_info *vinfo,
1700 vec<stmt_vec_info> stmts, unsigned int group_size,
1701 poly_uint64 *max_nunits,
1702 bool *matches, unsigned *limit, unsigned *tree_size,
1703 scalar_stmts_to_slp_tree_map_t *bst_map)
1705 if (slp_tree *leader = bst_map->get (stmts))
1707 if (dump_enabled_p ())
1708 dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
1709 !(*leader)->failed ? "" : "failed ",
1710 (void *) *leader);
1711 if (!(*leader)->failed)
1713 SLP_TREE_REF_COUNT (*leader)++;
1714 vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
1715 stmts.release ();
1716 return *leader;
1718 memcpy (matches, (*leader)->failed, sizeof (bool) * group_size);
1719 return NULL;
1722 /* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
1723 so we can pick up backedge destinations during discovery. */
1724 slp_tree res = new _slp_tree;
1725 SLP_TREE_DEF_TYPE (res) = vect_internal_def;
1726 SLP_TREE_SCALAR_STMTS (res) = stmts;
1727 bst_map->put (stmts.copy (), res);
1729 if (*limit == 0)
1731 if (dump_enabled_p ())
1732 dump_printf_loc (MSG_NOTE, vect_location,
1733 "SLP discovery limit exceeded\n");
1734 /* Mark the node invalid so we can detect those when still in use
1735 as backedge destinations. */
1736 SLP_TREE_SCALAR_STMTS (res) = vNULL;
1737 SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1738 res->failed = XNEWVEC (bool, group_size);
1739 memset (res->failed, 0, sizeof (bool) * group_size);
1740 memset (matches, 0, sizeof (bool) * group_size);
1741 return NULL;
1743 --*limit;
1745 if (dump_enabled_p ())
1746 dump_printf_loc (MSG_NOTE, vect_location,
1747 "starting SLP discovery for node %p\n", (void *) res);
1749 poly_uint64 this_max_nunits = 1;
1750 slp_tree res_ = vect_build_slp_tree_2 (vinfo, res, stmts, group_size,
1751 &this_max_nunits,
1752 matches, limit, tree_size, bst_map);
1753 if (!res_)
1755 if (dump_enabled_p ())
1756 dump_printf_loc (MSG_NOTE, vect_location,
1757 "SLP discovery for node %p failed\n", (void *) res);
1758 /* Mark the node invalid so we can detect those when still in use
1759 as backedge destinations. */
1760 SLP_TREE_SCALAR_STMTS (res) = vNULL;
1761 SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1762 res->failed = XNEWVEC (bool, group_size);
1763 if (flag_checking)
1765 unsigned i;
1766 for (i = 0; i < group_size; ++i)
1767 if (!matches[i])
1768 break;
1769 gcc_assert (i < group_size);
1771 memcpy (res->failed, matches, sizeof (bool) * group_size);
1773 else
1775 if (dump_enabled_p ())
1776 dump_printf_loc (MSG_NOTE, vect_location,
1777 "SLP discovery for node %p succeeded\n",
1778 (void *) res);
1779 gcc_assert (res_ == res);
1780 res->max_nunits = this_max_nunits;
1781 vect_update_max_nunits (max_nunits, this_max_nunits);
1782 /* Keep a reference for the bst_map use. */
1783 SLP_TREE_REF_COUNT (res)++;
1785 return res_;
1788 /* Helper for building an associated SLP node chain. */
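/* PERM becomes a VEC_PERM_EXPR node with two new children, each having OP0
   and OP1 as operands; the first child uses OPER1 as its representative, the
   second OPER2, and LPERM selects for every lane which of the two results
   to take.  */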
1790 static void
1791 vect_slp_build_two_operator_nodes (slp_tree perm, tree vectype,
1792 slp_tree op0, slp_tree op1,
1793 stmt_vec_info oper1, stmt_vec_info oper2,
1794 vec<std::pair<unsigned, unsigned> > lperm)
1796 unsigned group_size = SLP_TREE_LANES (op1);
1798 slp_tree child1 = new _slp_tree;
1799 SLP_TREE_DEF_TYPE (child1) = vect_internal_def;
1800 SLP_TREE_VECTYPE (child1) = vectype;
1801 SLP_TREE_LANES (child1) = group_size;
1802 SLP_TREE_CHILDREN (child1).create (2);
1803 SLP_TREE_CHILDREN (child1).quick_push (op0);
1804 SLP_TREE_CHILDREN (child1).quick_push (op1);
1805 SLP_TREE_REPRESENTATIVE (child1) = oper1;
1807 slp_tree child2 = new _slp_tree;
1808 SLP_TREE_DEF_TYPE (child2) = vect_internal_def;
1809 SLP_TREE_VECTYPE (child2) = vectype;
1810 SLP_TREE_LANES (child2) = group_size;
1811 SLP_TREE_CHILDREN (child2).create (2);
1812 SLP_TREE_CHILDREN (child2).quick_push (op0);
1813 SLP_TREE_REF_COUNT (op0)++;
1814 SLP_TREE_CHILDREN (child2).quick_push (op1);
1815 SLP_TREE_REF_COUNT (op1)++;
1816 SLP_TREE_REPRESENTATIVE (child2) = oper2;
1818 SLP_TREE_DEF_TYPE (perm) = vect_internal_def;
1819 SLP_TREE_CODE (perm) = VEC_PERM_EXPR;
1820 SLP_TREE_VECTYPE (perm) = vectype;
1821 SLP_TREE_LANES (perm) = group_size;
1822 /* ??? We should set this to NULL but that's not expected. */
1823 SLP_TREE_REPRESENTATIVE (perm) = oper1;
1824 SLP_TREE_LANE_PERMUTATION (perm) = lperm;
1825 SLP_TREE_CHILDREN (perm).quick_push (child1);
1826 SLP_TREE_CHILDREN (perm).quick_push (child2);
1829 /* Recursively build an SLP tree starting from NODE.
1830 Fail (and return NULL) if def-stmts are not
1831 isomorphic, require data permutation or are of unsupported types of
1832 operation. Otherwise, return the SLP node built for STMTS. */
1836 static slp_tree
1837 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1838 vec<stmt_vec_info> stmts, unsigned int group_size,
1839 poly_uint64 *max_nunits,
1840 bool *matches, unsigned *limit, unsigned *tree_size,
1841 scalar_stmts_to_slp_tree_map_t *bst_map)
1843 unsigned nops, i, this_tree_size = 0;
1844 poly_uint64 this_max_nunits = *max_nunits;
1846 matches[0] = false;
1848 stmt_vec_info stmt_info = stmts[0];
1849 if (!is_a<gcall *> (stmt_info->stmt)
1850 && !is_a<gassign *> (stmt_info->stmt)
1851 && !is_a<gphi *> (stmt_info->stmt))
1852 return NULL;
1854 nops = gimple_num_args (stmt_info->stmt);
1855 if (const int *map = vect_get_operand_map (stmt_info->stmt,
1856 STMT_VINFO_GATHER_SCATTER_P
1857 (stmt_info)))
1858 nops = map[0];
1860 /* If the SLP node is a PHI (induction or reduction), terminate
1861 the recursion. */
1862 bool *skip_args = XALLOCAVEC (bool, nops);
1863 memset (skip_args, 0, sizeof (bool) * nops);
1864 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
1865 if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
1867 tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
1868 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
1869 group_size);
1870 if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
1871 max_nunits))
1872 return NULL;
1874 vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
1875 if (def_type == vect_induction_def)
1877 /* Induction PHIs are not cycles but walk the initial
1878 value. Only for inner loops though, for outer loops
1879 we need to pick up the value from the actual PHIs
1880 to more easily support peeling and epilogue vectorization. */
1881 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1882 if (!nested_in_vect_loop_p (loop, stmt_info))
1883 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1884 else
1885 loop = loop->inner;
1886 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1888 else if (def_type == vect_reduction_def
1889 || def_type == vect_double_reduction_def
1890 || def_type == vect_nested_cycle
1891 || def_type == vect_first_order_recurrence)
1893 /* Else def types have to match. */
1894 stmt_vec_info other_info;
1895 bool all_same = true;
1896 FOR_EACH_VEC_ELT (stmts, i, other_info)
1898 if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
1899 return NULL;
1900 if (other_info != stmt_info)
1901 all_same = false;
1903 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1904 /* Reduction initial values are not explicitly represented. */
1905 if (def_type != vect_first_order_recurrence
1906 && !nested_in_vect_loop_p (loop, stmt_info))
1907 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1908 /* Reduction chain backedge defs are filled manually.
1909 ??? Need a better way to identify a SLP reduction chain PHI.
1910 Or a better overall way to SLP match those. */
1911 if (all_same && def_type == vect_reduction_def)
1912 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1914 else if (def_type != vect_internal_def)
1915 return NULL;
1919 bool two_operators = false;
1920 unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
1921 tree vectype = NULL_TREE;
1922 if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
1923 &this_max_nunits, matches, &two_operators,
1924 &vectype))
1925 return NULL;
1927 /* If the SLP node is a load, terminate the recursion unless masked. */
1928 if (STMT_VINFO_DATA_REF (stmt_info)
1929 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1931 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
1932 gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD)
1933 || gimple_call_internal_p (stmt, IFN_GATHER_LOAD)
1934 || gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD)
1935 || gimple_call_internal_p (stmt, IFN_MASK_LEN_GATHER_LOAD));
1936 else if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
1937 gcc_assert (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)));
1938 else
1940 *max_nunits = this_max_nunits;
1941 (*tree_size)++;
1942 node = vect_create_new_slp_node (node, stmts, 0);
1943 SLP_TREE_VECTYPE (node) = vectype;
1944 /* And compute the load permutation. Whether it is actually
1945 a permutation depends on the unrolling factor which is
1946 decided later. */
1947 vec<unsigned> load_permutation;
1948 int j;
1949 stmt_vec_info load_info;
1950 load_permutation.create (group_size);
1951 stmt_vec_info first_stmt_info
1952 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
1953 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
1955 int load_place;
1956 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1957 load_place = vect_get_place_in_interleaving_chain
1958 (load_info, first_stmt_info);
1959 else
1960 load_place = 0;
1961 gcc_assert (load_place != -1);
1962 load_permutation.safe_push (load_place);
1964 SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
1965 return node;
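/* Illustrative example (editor's note): if the interleaving group loads
   a[0] ... a[3] and the scalar stmts of this node read a[2], a[0], a[3],
   a[1] in lane order, the load permutation recorded above is
   { 2, 0, 3, 1 }.  Whether that really requires a permute is only known
   once the unrolling factor has been decided.  */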
1968 else if (gimple_assign_single_p (stmt_info->stmt)
1969 && !gimple_vuse (stmt_info->stmt)
1970 && gimple_assign_rhs_code (stmt_info->stmt) == BIT_FIELD_REF)
1972 /* vect_build_slp_tree_1 determined all BIT_FIELD_REFs reference
1973 the same SSA name vector with a type compatible with vectype. */
1974 vec<std::pair<unsigned, unsigned> > lperm = vNULL;
1975 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0);
1976 stmt_vec_info estmt_info;
1977 FOR_EACH_VEC_ELT (stmts, i, estmt_info)
1979 gassign *estmt = as_a <gassign *> (estmt_info->stmt);
1980 tree bfref = gimple_assign_rhs1 (estmt);
1981 HOST_WIDE_INT lane;
1982 if (!known_eq (bit_field_size (bfref),
1983 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype))))
1984 || !constant_multiple_p (bit_field_offset (bfref),
1985 bit_field_size (bfref), &lane))
1987 lperm.release ();
1988 matches[0] = false;
1989 return NULL;
1991 lperm.safe_push (std::make_pair (0, (unsigned)lane));
1993 slp_tree vnode = vect_create_new_slp_node (vNULL);
1994 if (operand_equal_p (TYPE_SIZE (vectype), TYPE_SIZE (TREE_TYPE (vec))))
1995 /* ??? We record vectype here but we hide eventually necessary
1996 punning and instead rely on code generation to materialize
1997 VIEW_CONVERT_EXPRs as necessary. We instead should make
1998 this explicit somehow. */
1999 SLP_TREE_VECTYPE (vnode) = vectype;
2000 else
2002 /* For different size but compatible elements we can still
2003 use VEC_PERM_EXPR without punning. */
2004 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (vec))
2005 && types_compatible_p (TREE_TYPE (vectype),
2006 TREE_TYPE (TREE_TYPE (vec))));
2007 SLP_TREE_VECTYPE (vnode) = TREE_TYPE (vec);
2009 auto nunits = TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (vnode));
2010 unsigned HOST_WIDE_INT const_nunits;
2011 if (nunits.is_constant (&const_nunits))
2012 SLP_TREE_LANES (vnode) = const_nunits;
2013 SLP_TREE_VEC_DEFS (vnode).safe_push (vec);
2014 /* We are always building a permutation node even if it is an identity
2015 permute to shield the rest of the vectorizer from the odd node
2016 representing an actual vector without any scalar ops.
2017 ??? We could hide it completely by making the permute node
2018 external? */
2019 node = vect_create_new_slp_node (node, stmts, 1);
2020 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
2021 SLP_TREE_LANE_PERMUTATION (node) = lperm;
2022 SLP_TREE_VECTYPE (node) = vectype;
2023 SLP_TREE_CHILDREN (node).quick_push (vnode);
2024 return node;
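/* Illustrative example (editor's note): with a V4SI vectype (32-bit
   elements), a lane stmt like _1 = BIT_FIELD_REF <v_2, 32, 64> extracts
   lane 64/32 = 2 of v_2, so the lane permutation entry pushed above is
   (0, 2) and the single child VNODE carries v_2 as its vector def.  */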
2026 /* When discovery reaches an associative operation, see whether we can
2027 improve that to match up lanes in a way superior to the operand
2028 swapping code which at most looks at two defs.
2029 ??? For BB vectorization we cannot do the brute-force search
2030 for matching as we can succeed by means of builds from scalars
2031 and have no good way to "cost" one build against another. */
2032 else if (is_a <loop_vec_info> (vinfo)
2033 /* ??? We don't handle !vect_internal_def defs below. */
2034 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
2035 && is_gimple_assign (stmt_info->stmt)
2036 && (associative_tree_code (gimple_assign_rhs_code (stmt_info->stmt))
2037 || gimple_assign_rhs_code (stmt_info->stmt) == MINUS_EXPR)
2038 && ((FLOAT_TYPE_P (vectype) && flag_associative_math)
2039 || (INTEGRAL_TYPE_P (TREE_TYPE (vectype))
2040 && TYPE_OVERFLOW_WRAPS (TREE_TYPE (vectype)))))
2042 /* See if we have a chain of (mixed) adds or subtracts or other
2043 associative ops. */
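/* Illustrative example (editor's note): for two lanes
   x = a + b - c and y = d + e - f, the linearization below yields the
   per-lane chains { (+,a), (+,b), (-,c) } and { (+,d), (+,e), (-,f) };
   a child SLP node is then built for each chain position across all
   lanes and the children are combined left to right.  */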
2044 enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
2045 if (code == MINUS_EXPR)
2046 code = PLUS_EXPR;
2047 stmt_vec_info other_op_stmt_info = NULL;
2048 stmt_vec_info op_stmt_info = NULL;
2049 unsigned chain_len = 0;
2050 auto_vec<chain_op_t> chain;
2051 auto_vec<std::pair<tree_code, gimple *> > worklist;
2052 auto_vec<vec<chain_op_t> > chains (group_size);
2053 auto_vec<slp_tree, 4> children;
2054 bool hard_fail = true;
2055 for (unsigned lane = 0; lane < group_size; ++lane)
2057 /* For each lane linearize the addition/subtraction (or other
2058 uniform associative operation) expression tree. */
2059 gimple *op_stmt = NULL, *other_op_stmt = NULL;
2060 vect_slp_linearize_chain (vinfo, worklist, chain, code,
2061 stmts[lane]->stmt, op_stmt, other_op_stmt,
2062 NULL);
2063 if (!op_stmt_info && op_stmt)
2064 op_stmt_info = vinfo->lookup_stmt (op_stmt);
2065 if (!other_op_stmt_info && other_op_stmt)
2066 other_op_stmt_info = vinfo->lookup_stmt (other_op_stmt);
2067 if (chain.length () == 2)
2069 /* In a chain of just two elements resort to the regular
2070 operand swapping scheme. If we run into a length
2071 mismatch still hard-FAIL. */
2072 if (chain_len == 0)
2073 hard_fail = false;
2074 else
2076 matches[lane] = false;
2077 /* ??? We might want to process the other lanes, but
2078 make sure to not give false matching hints to the
2079 caller for lanes we did not process. */
2080 if (lane != group_size - 1)
2081 matches[0] = false;
2083 break;
2085 else if (chain_len == 0)
2086 chain_len = chain.length ();
2087 else if (chain.length () != chain_len)
2089 /* ??? Here we could slip in magic to compensate with
2090 neutral operands. */
2091 matches[lane] = false;
2092 if (lane != group_size - 1)
2093 matches[0] = false;
2094 break;
2096 chains.quick_push (chain.copy ());
2097 chain.truncate (0);
2099 if (chains.length () == group_size)
2101 /* We cannot yet use SLP_TREE_CODE to communicate the operation. */
2102 if (!op_stmt_info)
2104 hard_fail = false;
2105 goto out;
2107 /* Now we have a set of chains with the same length. */
2108 /* 1. pre-sort according to def_type and operation. */
2109 for (unsigned lane = 0; lane < group_size; ++lane)
2110 chains[lane].stablesort (dt_sort_cmp, vinfo);
2111 if (dump_enabled_p ())
2113 dump_printf_loc (MSG_NOTE, vect_location,
2114 "pre-sorted chains of %s\n",
2115 get_tree_code_name (code));
2116 for (unsigned lane = 0; lane < group_size; ++lane)
2118 for (unsigned opnum = 0; opnum < chain_len; ++opnum)
2119 dump_printf (MSG_NOTE, "%s %T ",
2120 get_tree_code_name (chains[lane][opnum].code),
2121 chains[lane][opnum].op);
2122 dump_printf (MSG_NOTE, "\n");
2125 /* 2. try to build children nodes, associating as necessary. */
2126 for (unsigned n = 0; n < chain_len; ++n)
2128 vect_def_type dt = chains[0][n].dt;
2129 unsigned lane;
2130 for (lane = 0; lane < group_size; ++lane)
2131 if (chains[lane][n].dt != dt)
2133 if (dt == vect_constant_def
2134 && chains[lane][n].dt == vect_external_def)
2135 dt = vect_external_def;
2136 else if (dt == vect_external_def
2137 && chains[lane][n].dt == vect_constant_def)
2139 else
2140 break;
2142 if (lane != group_size)
2144 if (dump_enabled_p ())
2145 dump_printf_loc (MSG_NOTE, vect_location,
2146 "giving up on chain due to mismatched "
2147 "def types\n");
2148 matches[lane] = false;
2149 if (lane != group_size - 1)
2150 matches[0] = false;
2151 goto out;
2153 if (dt == vect_constant_def
2154 || dt == vect_external_def)
2156 /* Check whether we can build the invariant. If we can't,
2157 we never will be able to. */
2158 tree type = TREE_TYPE (chains[0][n].op);
2159 if (!GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
2160 && (TREE_CODE (type) == BOOLEAN_TYPE
2161 || !can_duplicate_and_interleave_p (vinfo, group_size,
2162 type)))
2164 matches[0] = false;
2165 goto out;
2167 vec<tree> ops;
2168 ops.create (group_size);
2169 for (lane = 0; lane < group_size; ++lane)
2170 ops.quick_push (chains[lane][n].op);
2171 slp_tree child = vect_create_new_slp_node (ops);
2172 SLP_TREE_DEF_TYPE (child) = dt;
2173 children.safe_push (child);
2175 else if (dt != vect_internal_def)
2177 /* Not sure, we might need something special here.
2178 gcc.dg/vect/pr96854.c,
2179 gfortran.dg/vect/fast-math-pr37021.f90
2180 and gfortran.dg/vect/pr61171.f trigger. */
2181 /* Soft-fail for now. */
2182 hard_fail = false;
2183 goto out;
2185 else
2187 vec<stmt_vec_info> op_stmts;
2188 op_stmts.create (group_size);
2189 slp_tree child = NULL;
2190 /* Brute-force our way. We have to consider a lane
2191 failing after fixing an earlier failure higher up in the
2192 SLP discovery recursion. So track the current
2193 permute per lane. */
2194 unsigned *perms = XALLOCAVEC (unsigned, group_size);
2195 memset (perms, 0, sizeof (unsigned) * group_size);
2198 op_stmts.truncate (0);
2199 for (lane = 0; lane < group_size; ++lane)
2200 op_stmts.quick_push
2201 (vinfo->lookup_def (chains[lane][n].op));
2202 child = vect_build_slp_tree (vinfo, op_stmts,
2203 group_size, &this_max_nunits,
2204 matches, limit,
2205 &this_tree_size, bst_map);
2206 /* ??? We're likely getting too many fatal mismatches
2207 here so maybe we want to ignore them (but then we
2208 have no idea which lanes fatally mismatched). */
2209 if (child || !matches[0])
2210 break;
2211 /* Swap another lane we have not yet matched up into
2212 lanes that did not match. If we run out of
2213 permute possibilities for a lane terminate the
2214 search. */
2215 bool term = false;
2216 for (lane = 1; lane < group_size; ++lane)
2217 if (!matches[lane])
2219 if (n + perms[lane] + 1 == chain_len)
2221 term = true;
2222 break;
2224 std::swap (chains[lane][n],
2225 chains[lane][n + perms[lane] + 1]);
2226 perms[lane]++;
2228 if (term)
2229 break;
2231 while (1);
2232 if (!child)
2234 if (dump_enabled_p ())
2235 dump_printf_loc (MSG_NOTE, vect_location,
2236 "failed to match up op %d\n", n);
2237 op_stmts.release ();
2238 if (lane != group_size - 1)
2239 matches[0] = false;
2240 else
2241 matches[lane] = false;
2242 goto out;
2244 if (dump_enabled_p ())
2246 dump_printf_loc (MSG_NOTE, vect_location,
2247 "matched up op %d to\n", n);
2248 vect_print_slp_tree (MSG_NOTE, vect_location, child);
2250 children.safe_push (child);
2253 /* 3. build SLP nodes to combine the chain. */
2254 for (unsigned lane = 0; lane < group_size; ++lane)
2255 if (chains[lane][0].code != code)
2257 /* See if there's any alternate all-PLUS entry. */
2258 unsigned n;
2259 for (n = 1; n < chain_len; ++n)
2261 for (lane = 0; lane < group_size; ++lane)
2262 if (chains[lane][n].code != code)
2263 break;
2264 if (lane == group_size)
2265 break;
2267 if (n != chain_len)
2269 /* Swap that in at first position. */
2270 std::swap (children[0], children[n]);
2271 for (lane = 0; lane < group_size; ++lane)
2272 std::swap (chains[lane][0], chains[lane][n]);
2274 else
2276 /* ??? When this triggers and we end up with two
2277 vect_constant/external_def up-front, things break (ICE)
2278 spectacularly finding an insertion place for the
2279 all-constant op. We should have a fully
2280 vect_internal_def operand though(?) so we can swap
2281 that into first place and then prepend the all-zero
2282 constant. */
2283 if (dump_enabled_p ())
2284 dump_printf_loc (MSG_NOTE, vect_location,
2285 "inserting constant zero to compensate "
2286 "for (partially) negated first "
2287 "operand\n");
2288 chain_len++;
2289 for (lane = 0; lane < group_size; ++lane)
2290 chains[lane].safe_insert
2291 (0, chain_op_t (code, vect_constant_def, NULL_TREE));
2292 vec<tree> zero_ops;
2293 zero_ops.create (group_size);
2294 zero_ops.quick_push (build_zero_cst (TREE_TYPE (vectype)));
2295 for (lane = 1; lane < group_size; ++lane)
2296 zero_ops.quick_push (zero_ops[0]);
2297 slp_tree zero = vect_create_new_slp_node (zero_ops);
2298 SLP_TREE_DEF_TYPE (zero) = vect_constant_def;
2299 children.safe_insert (0, zero);
2301 break;
2303 for (unsigned i = 1; i < children.length (); ++i)
2305 slp_tree op0 = children[i - 1];
2306 slp_tree op1 = children[i];
2307 bool this_two_op = false;
2308 for (unsigned lane = 0; lane < group_size; ++lane)
2309 if (chains[lane][i].code != chains[0][i].code)
2311 this_two_op = true;
2312 break;
2314 slp_tree child;
2315 if (i == children.length () - 1)
2316 child = vect_create_new_slp_node (node, stmts, 2);
2317 else
2318 child = vect_create_new_slp_node (2, ERROR_MARK);
2319 if (this_two_op)
2321 vec<std::pair<unsigned, unsigned> > lperm;
2322 lperm.create (group_size);
2323 for (unsigned lane = 0; lane < group_size; ++lane)
2324 lperm.quick_push (std::make_pair
2325 (chains[lane][i].code != chains[0][i].code, lane));
2326 vect_slp_build_two_operator_nodes (child, vectype, op0, op1,
2327 (chains[0][i].code == code
2328 ? op_stmt_info
2329 : other_op_stmt_info),
2330 (chains[0][i].code == code
2331 ? other_op_stmt_info
2332 : op_stmt_info),
2333 lperm);
2335 else
2337 SLP_TREE_DEF_TYPE (child) = vect_internal_def;
2338 SLP_TREE_VECTYPE (child) = vectype;
2339 SLP_TREE_LANES (child) = group_size;
2340 SLP_TREE_CHILDREN (child).quick_push (op0);
2341 SLP_TREE_CHILDREN (child).quick_push (op1);
2342 SLP_TREE_REPRESENTATIVE (child)
2343 = (chains[0][i].code == code
2344 ? op_stmt_info : other_op_stmt_info);
2346 children[i] = child;
2348 *tree_size += this_tree_size + 1;
2349 *max_nunits = this_max_nunits;
2350 while (!chains.is_empty ())
2351 chains.pop ().release ();
2352 return node;
2354 out:
2355 while (!children.is_empty ())
2356 vect_free_slp_tree (children.pop ());
2357 while (!chains.is_empty ())
2358 chains.pop ().release ();
2359 /* Hard-fail, otherwise we might run into quadratic processing of the
2360 chains starting one stmt into the chain again. */
2361 if (hard_fail)
2362 return NULL;
2363 /* Fall thru to normal processing. */
2366 /* Get at the operands, verifying they are compatible. */
2367 vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
2368 slp_oprnd_info oprnd_info;
2369 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
2371 int res = vect_get_and_check_slp_defs (vinfo, swap[i], skip_args,
2372 stmts, i, &oprnds_info);
2373 if (res != 0)
2374 matches[(res == -1) ? 0 : i] = false;
2375 if (!matches[0])
2376 break;
2378 for (i = 0; i < group_size; ++i)
2379 if (!matches[i])
2381 vect_free_oprnd_info (oprnds_info);
2382 return NULL;
2384 swap = NULL;
2386 auto_vec<slp_tree, 4> children;
2388 stmt_info = stmts[0];
2390 /* Create SLP_TREE nodes for the definition node/s. */
2391 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
2393 slp_tree child;
2394 unsigned int j;
2396 /* We're skipping certain operands from processing, for example
2397 outer loop reduction initial defs. */
2398 if (skip_args[i])
2400 children.safe_push (NULL);
2401 continue;
2404 if (oprnd_info->first_dt == vect_uninitialized_def)
2406 /* COND_EXPRs may have one operand too many if the condition
2407 is an SSA name. */
2408 gcc_assert (i == 3 && nops == 4);
2409 continue;
2412 if (is_a <bb_vec_info> (vinfo)
2413 && oprnd_info->first_dt == vect_internal_def
2414 && !oprnd_info->any_pattern)
2416 /* For BB vectorization, if all defs are the same do not
2417 bother to continue the build along the single-lane
2418 graph but use a splat of the scalar value. */
2419 stmt_vec_info first_def = oprnd_info->def_stmts[0];
2420 for (j = 1; j < group_size; ++j)
2421 if (oprnd_info->def_stmts[j] != first_def)
2422 break;
2423 if (j == group_size
2424 /* But avoid doing this for loads where we may be
2425 able to CSE things, unless the stmt is not
2426 vectorizable. */
2427 && (!STMT_VINFO_VECTORIZABLE (first_def)
2428 || !gimple_vuse (first_def->stmt)))
2430 if (dump_enabled_p ())
2431 dump_printf_loc (MSG_NOTE, vect_location,
2432 "Using a splat of the uniform operand %G",
2433 first_def->stmt);
2434 oprnd_info->first_dt = vect_external_def;
2438 if (oprnd_info->first_dt == vect_external_def
2439 || oprnd_info->first_dt == vect_constant_def)
2441 slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
2442 SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
2443 oprnd_info->ops = vNULL;
2444 children.safe_push (invnode);
2445 continue;
2448 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2449 group_size, &this_max_nunits,
2450 matches, limit,
2451 &this_tree_size, bst_map)) != NULL)
2453 oprnd_info->def_stmts = vNULL;
2454 children.safe_push (child);
2455 continue;
2458 /* If the SLP build for operand zero failed and operand zero
2459 and one can be commuted, try that for the scalar stmts
2460 that failed the match. */
2461 if (i == 0
2462 /* A first scalar stmt mismatch signals a fatal mismatch. */
2463 && matches[0]
2464 /* ??? For COND_EXPRs we can swap the comparison operands
2465 as well as the arms under some constraints. */
2466 && nops == 2
2467 && oprnds_info[1]->first_dt == vect_internal_def
2468 && is_gimple_assign (stmt_info->stmt)
2469 /* Swapping operands for reductions breaks assumptions later on. */
2470 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
2471 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
2473 /* See whether we can swap the matching or the non-matching
2474 stmt operands. */
2475 bool swap_not_matching = true;
2478 for (j = 0; j < group_size; ++j)
2480 if (matches[j] != !swap_not_matching)
2481 continue;
2482 stmt_vec_info stmt_info = stmts[j];
2483 /* Verify if we can swap operands of this stmt. */
2484 gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
2485 if (!stmt
2486 || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
2488 if (!swap_not_matching)
2489 goto fail;
2490 swap_not_matching = false;
2491 break;
2495 while (j != group_size);
2497 /* Swap mismatched definition stmts. */
2498 if (dump_enabled_p ())
2499 dump_printf_loc (MSG_NOTE, vect_location,
2500 "Re-trying with swapped operands of stmts ");
2501 for (j = 0; j < group_size; ++j)
2502 if (matches[j] == !swap_not_matching)
2504 std::swap (oprnds_info[0]->def_stmts[j],
2505 oprnds_info[1]->def_stmts[j]);
2506 std::swap (oprnds_info[0]->ops[j],
2507 oprnds_info[1]->ops[j]);
2508 if (dump_enabled_p ())
2509 dump_printf (MSG_NOTE, "%d ", j);
2511 if (dump_enabled_p ())
2512 dump_printf (MSG_NOTE, "\n");
2513 /* After swapping some operands we lost track whether an
2514 operand has any pattern defs so be conservative here. */
2515 if (oprnds_info[0]->any_pattern || oprnds_info[1]->any_pattern)
2516 oprnds_info[0]->any_pattern = oprnds_info[1]->any_pattern = true;
2517 /* And try again with scratch 'matches' ... */
2518 bool *tem = XALLOCAVEC (bool, group_size);
2519 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2520 group_size, &this_max_nunits,
2521 tem, limit,
2522 &this_tree_size, bst_map)) != NULL)
2524 oprnd_info->def_stmts = vNULL;
2525 children.safe_push (child);
2526 continue;
2529 fail:
2531 /* If the SLP build failed and we analyze a basic-block
2532 simply treat nodes we fail to build as externally defined
2533 (and thus build vectors from the scalar defs).
2534 The cost model will reject outright expensive cases.
2535 ??? This doesn't treat cases where permutation ultimately
2536 fails (or we don't try permutation below). Ideally we'd
2537 even compute a permutation that will end up with the maximum
2538 SLP tree size... */
2539 if (is_a <bb_vec_info> (vinfo)
2540 /* ??? Rejecting patterns this way doesn't work. We'd have to
2541 do extra work to cancel the pattern so the uses see the
2542 scalar version. */
2543 && !is_pattern_stmt_p (stmt_info)
2544 && !oprnd_info->any_pattern)
2546 /* But if there's a leading vector-sized set of matching stmts,
2547 fail here so we can split the group. This matches the condition
2548 vect_analyze_slp_instance uses. */
2549 /* ??? We might want to split here and combine the results to support
2550 multiple vector sizes better. */
2551 for (j = 0; j < group_size; ++j)
2552 if (!matches[j])
2553 break;
2554 if (!known_ge (j, TYPE_VECTOR_SUBPARTS (vectype)))
2556 if (dump_enabled_p ())
2557 dump_printf_loc (MSG_NOTE, vect_location,
2558 "Building vector operands from scalars\n");
2559 this_tree_size++;
2560 child = vect_create_new_slp_node (oprnd_info->ops);
2561 children.safe_push (child);
2562 oprnd_info->ops = vNULL;
2563 continue;
2567 gcc_assert (child == NULL);
2568 FOR_EACH_VEC_ELT (children, j, child)
2569 if (child)
2570 vect_free_slp_tree (child);
2571 vect_free_oprnd_info (oprnds_info);
2572 return NULL;
2575 vect_free_oprnd_info (oprnds_info);
2577 /* If all children of this node are built up from uniform scalars,
2578 or if it needs more than one possibly expensive vector construction,
2579 then just throw the node away, causing it to be built up from
2580 scalars instead. The exception is the SLP node for the vector store. */
2581 if (is_a <bb_vec_info> (vinfo)
2582 && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
2583 /* ??? Rejecting patterns this way doesn't work. We'd have to
2584 do extra work to cancel the pattern so the uses see the
2585 scalar version. */
2586 && !is_pattern_stmt_p (stmt_info))
2588 slp_tree child;
2589 unsigned j;
2590 bool all_uniform_p = true;
2591 unsigned n_vector_builds = 0;
2592 FOR_EACH_VEC_ELT (children, j, child)
2594 if (!child)
2596 else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
2597 all_uniform_p = false;
2598 else if (!vect_slp_tree_uniform_p (child))
2600 all_uniform_p = false;
2601 if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
2602 n_vector_builds++;
2605 if (all_uniform_p
2606 || n_vector_builds > 1
2607 || (n_vector_builds == children.length ()
2608 && is_a <gphi *> (stmt_info->stmt)))
2610 /* Roll back. */
2611 matches[0] = false;
2612 FOR_EACH_VEC_ELT (children, j, child)
2613 if (child)
2614 vect_free_slp_tree (child);
2616 if (dump_enabled_p ())
2617 dump_printf_loc (MSG_NOTE, vect_location,
2618 "Building parent vector operands from "
2619 "scalars instead\n");
2620 return NULL;
2624 *tree_size += this_tree_size + 1;
2625 *max_nunits = this_max_nunits;
2627 if (two_operators)
2629 /* ??? We'd likely want to either cache in bst_map something like
2630 { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
2631 the true { a+b, a+b, a+b, a+b } ... but there we don't have
2632 explicit stmts to put in so the keying on 'stmts' doesn't
2633 work (but we have the same issue with nodes that use 'ops'). */
2634 slp_tree one = new _slp_tree;
2635 slp_tree two = new _slp_tree;
2636 SLP_TREE_DEF_TYPE (one) = vect_internal_def;
2637 SLP_TREE_DEF_TYPE (two) = vect_internal_def;
2638 SLP_TREE_VECTYPE (one) = vectype;
2639 SLP_TREE_VECTYPE (two) = vectype;
2640 SLP_TREE_CHILDREN (one).safe_splice (children);
2641 SLP_TREE_CHILDREN (two).safe_splice (children);
2642 slp_tree child;
2643 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
2644 SLP_TREE_REF_COUNT (child)++;
2646 /* Here we record the original defs since this
2647 node represents the final lane configuration. */
2648 node = vect_create_new_slp_node (node, stmts, 2);
2649 SLP_TREE_VECTYPE (node) = vectype;
2650 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
2651 SLP_TREE_CHILDREN (node).quick_push (one);
2652 SLP_TREE_CHILDREN (node).quick_push (two);
2653 gassign *stmt = as_a <gassign *> (stmts[0]->stmt);
2654 enum tree_code code0 = gimple_assign_rhs_code (stmt);
2655 enum tree_code ocode = ERROR_MARK;
2656 stmt_vec_info ostmt_info;
2657 unsigned j = 0;
2658 FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
2660 gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
2661 if (gimple_assign_rhs_code (ostmt) != code0)
2663 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (1, i));
2664 ocode = gimple_assign_rhs_code (ostmt);
2665 j = i;
2667 else
2668 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (0, i));
2670 SLP_TREE_CODE (one) = code0;
2671 SLP_TREE_CODE (two) = ocode;
2672 SLP_TREE_LANES (one) = stmts.length ();
2673 SLP_TREE_LANES (two) = stmts.length ();
2674 SLP_TREE_REPRESENTATIVE (one) = stmts[0];
2675 SLP_TREE_REPRESENTATIVE (two) = stmts[j];
2676 return node;
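/* Illustrative example (editor's note): for the group
   { a0 + b0, a1 - b1, a2 + b2, a3 - b3 } node ONE gets PLUS_EXPR, node
   TWO gets MINUS_EXPR and the lane permutation built above is
   { (0,0), (1,1), (0,2), (1,3) }, i.e. even lanes come from ONE and odd
   lanes from TWO, a blend a target may be able to match to an
   addsub-style instruction.  */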
2679 node = vect_create_new_slp_node (node, stmts, nops);
2680 SLP_TREE_VECTYPE (node) = vectype;
2681 SLP_TREE_CHILDREN (node).splice (children);
2682 return node;
2685 /* Dump a single SLP tree NODE. */
2687 static void
2688 vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
2689 slp_tree node)
2691 unsigned i, j;
2692 slp_tree child;
2693 stmt_vec_info stmt_info;
2694 tree op;
2696 dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
2697 dump_user_location_t user_loc = loc.get_user_location ();
2698 dump_printf_loc (metadata, user_loc,
2699 "node%s %p (max_nunits=" HOST_WIDE_INT_PRINT_UNSIGNED
2700 ", refcnt=%u)",
2701 SLP_TREE_DEF_TYPE (node) == vect_external_def
2702 ? " (external)"
2703 : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
2704 ? " (constant)"
2705 : ""), (void *) node,
2706 estimated_poly_value (node->max_nunits),
2707 SLP_TREE_REF_COUNT (node));
2708 if (SLP_TREE_VECTYPE (node))
2709 dump_printf (metadata, " %T", SLP_TREE_VECTYPE (node));
2710 dump_printf (metadata, "\n");
2711 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
2713 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
2714 dump_printf_loc (metadata, user_loc, "op: VEC_PERM_EXPR\n");
2715 else
2716 dump_printf_loc (metadata, user_loc, "op template: %G",
2717 SLP_TREE_REPRESENTATIVE (node)->stmt);
2719 if (SLP_TREE_SCALAR_STMTS (node).exists ())
2720 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2721 dump_printf_loc (metadata, user_loc, "\tstmt %u %G", i, stmt_info->stmt);
2722 else
2724 dump_printf_loc (metadata, user_loc, "\t{ ");
2725 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
2726 dump_printf (metadata, "%T%s ", op,
2727 i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
2728 dump_printf (metadata, "}\n");
2730 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2732 dump_printf_loc (metadata, user_loc, "\tload permutation {");
2733 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
2734 dump_printf (dump_kind, " %u", j);
2735 dump_printf (dump_kind, " }\n");
2737 if (SLP_TREE_LANE_PERMUTATION (node).exists ())
2739 dump_printf_loc (metadata, user_loc, "\tlane permutation {");
2740 for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
2741 dump_printf (dump_kind, " %u[%u]",
2742 SLP_TREE_LANE_PERMUTATION (node)[i].first,
2743 SLP_TREE_LANE_PERMUTATION (node)[i].second);
2744 dump_printf (dump_kind, " }\n");
2746 if (SLP_TREE_CHILDREN (node).is_empty ())
2747 return;
2748 dump_printf_loc (metadata, user_loc, "\tchildren");
2749 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2750 dump_printf (dump_kind, " %p", (void *)child);
2751 dump_printf (dump_kind, "\n");
2754 DEBUG_FUNCTION void
2755 debug (slp_tree node)
2757 debug_dump_context ctx;
2758 vect_print_slp_tree (MSG_NOTE,
2759 dump_location_t::from_location_t (UNKNOWN_LOCATION),
2760 node);
2763 /* Recursive helper for the dot producer below. */
2765 static void
2766 dot_slp_tree (FILE *f, slp_tree node, hash_set<slp_tree> &visited)
2768 if (visited.add (node))
2769 return;
2771 fprintf (f, "\"%p\" [label=\"", (void *)node);
2772 vect_print_slp_tree (MSG_NOTE,
2773 dump_location_t::from_location_t (UNKNOWN_LOCATION),
2774 node);
2775 fprintf (f, "\"];\n");
2778 for (slp_tree child : SLP_TREE_CHILDREN (node))
2779 fprintf (f, "\"%p\" -> \"%p\";", (void *)node, (void *)child);
2781 for (slp_tree child : SLP_TREE_CHILDREN (node))
2782 if (child)
2783 dot_slp_tree (f, child, visited);
2786 DEBUG_FUNCTION void
2787 dot_slp_tree (const char *fname, slp_tree node)
2789 FILE *f = fopen (fname, "w");
2790 fprintf (f, "digraph {\n");
2791 fflush (f);
2793 debug_dump_context ctx (f);
2794 hash_set<slp_tree> visited;
2795 dot_slp_tree (f, node, visited);
2797 fflush (f);
2798 fprintf (f, "}\n");
2799 fclose (f);
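/* Usage sketch (editor's note): the dumper above can be invoked from a
   debugger session, e.g.
     (gdb) call dot_slp_tree ("/tmp/slp.dot", node)
   and the resulting file rendered with Graphviz:
     dot -Tpng /tmp/slp.dot -o slp.png  */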
2802 /* Dump an SLP tree NODE using flags specified in DUMP_KIND. */
2804 static void
2805 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2806 slp_tree node, hash_set<slp_tree> &visited)
2808 unsigned i;
2809 slp_tree child;
2811 if (visited.add (node))
2812 return;
2814 vect_print_slp_tree (dump_kind, loc, node);
2816 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2817 if (child)
2818 vect_print_slp_graph (dump_kind, loc, child, visited);
2821 static void
2822 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2823 slp_tree entry)
2825 hash_set<slp_tree> visited;
2826 vect_print_slp_graph (dump_kind, loc, entry, visited);
2829 /* Mark the tree rooted at NODE with PURE_SLP. */
2831 static void
2832 vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
2834 int i;
2835 stmt_vec_info stmt_info;
2836 slp_tree child;
2838 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2839 return;
2841 if (visited.add (node))
2842 return;
2844 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2845 STMT_SLP_TYPE (stmt_info) = pure_slp;
2847 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2848 if (child)
2849 vect_mark_slp_stmts (child, visited);
2852 static void
2853 vect_mark_slp_stmts (slp_tree node)
2855 hash_set<slp_tree> visited;
2856 vect_mark_slp_stmts (node, visited);
2859 /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */
2861 static void
2862 vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
2864 int i;
2865 stmt_vec_info stmt_info;
2866 slp_tree child;
2868 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2869 return;
2871 if (visited.add (node))
2872 return;
2874 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2876 gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
2877 || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
2878 STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
2881 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2882 if (child)
2883 vect_mark_slp_stmts_relevant (child, visited);
2886 static void
2887 vect_mark_slp_stmts_relevant (slp_tree node)
2889 hash_set<slp_tree> visited;
2890 vect_mark_slp_stmts_relevant (node, visited);
2894 /* Gather loads in the SLP graph NODE and populate the LOADS array. */
2896 static void
2897 vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
2898 hash_set<slp_tree> &visited)
2900 if (!node || visited.add (node))
2901 return;
2903 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2904 return;
2906 if (SLP_TREE_CODE (node) != VEC_PERM_EXPR)
2908 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
2909 if (STMT_VINFO_DATA_REF (stmt_info)
2910 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2911 loads.safe_push (node);
2914 unsigned i;
2915 slp_tree child;
2916 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2917 vect_gather_slp_loads (loads, child, visited);
2921 /* Find the last scalar stmt in NODE. */
2923 stmt_vec_info
2924 vect_find_last_scalar_stmt_in_slp (slp_tree node)
2926 stmt_vec_info last = NULL;
2927 stmt_vec_info stmt_vinfo;
2929 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2931 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2932 last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
2935 return last;
2938 /* Find the first stmt in NODE. */
2940 stmt_vec_info
2941 vect_find_first_scalar_stmt_in_slp (slp_tree node)
2943 stmt_vec_info first = NULL;
2944 stmt_vec_info stmt_vinfo;
2946 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2948 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2949 if (!first
2950 || get_later_stmt (stmt_vinfo, first) == first)
2951 first = stmt_vinfo;
2954 return first;
2957 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
2958 two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
2959 (also containing the first GROUP1_SIZE stmts, since stores are
2960 consecutive), the second containing the remainder.
2961 Return the first stmt in the second group. */
2963 static stmt_vec_info
2964 vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
2966 gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
2967 gcc_assert (group1_size > 0);
2968 int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
2969 gcc_assert (group2_size > 0);
2970 DR_GROUP_SIZE (first_vinfo) = group1_size;
2972 stmt_vec_info stmt_info = first_vinfo;
2973 for (unsigned i = group1_size; i > 1; i--)
2975 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
2976 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2978 /* STMT is now the last element of the first group. */
2979 stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
2980 DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
2982 DR_GROUP_SIZE (group2) = group2_size;
2983 for (stmt_info = group2; stmt_info;
2984 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
2986 DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
2987 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2990 /* For the second group, the DR_GROUP_GAP is that before the original group,
2991 plus skipping over the first vector. */
2992 DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
2994 /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
2995 DR_GROUP_GAP (first_vinfo) += group2_size;
2997 if (dump_enabled_p ())
2998 dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
2999 group1_size, group2_size);
3001 return group2;
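/* Worked example (editor's note): splitting a store group of size 8
   with GROUP1_SIZE 3 and original gap G leaves the first group with
   size 3 and gap G + 5, while the second group of size 5 starts with
   gap G + 3, exactly the adjustments performed above.  */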
3004 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
3005 statements and a vector of NUNITS elements. */
3007 static poly_uint64
3008 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
3010 return exact_div (common_multiple (nunits, group_size), group_size);
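/* Worked example (editor's note): with NUNITS 4 and GROUP_SIZE 6 the
   least common multiple is 12, so the instance is unrolled by a factor
   of 12 / 6 = 2; when GROUP_SIZE is already a multiple of NUNITS the
   factor is 1.  */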
3013 /* Helper that checks to see if a node is a load node. */
3015 static inline bool
3016 vect_is_slp_load_node (slp_tree root)
3018 return SLP_TREE_DEF_TYPE (root) == vect_internal_def
3019 && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_REPRESENTATIVE (root))
3020 && DR_IS_READ (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (root)));
3024 /* Helper function of optimize_load_redistribution that performs the operation
3025 recursively. */
3027 static slp_tree
3028 optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t *bst_map,
3029 vec_info *vinfo, unsigned int group_size,
3030 hash_map<slp_tree, slp_tree> *load_map,
3031 slp_tree root)
3033 if (slp_tree *leader = load_map->get (root))
3034 return *leader;
3036 slp_tree node;
3037 unsigned i;
3039 /* For now, we don't know anything about externals so do not do anything. */
3040 if (!root || SLP_TREE_DEF_TYPE (root) != vect_internal_def)
3041 return NULL;
3042 else if (SLP_TREE_CODE (root) == VEC_PERM_EXPR)
3044 /* First convert this node into a load node and add it to the leaves
3045 list and flatten the permute from a lane to a load one. If it's
3046 unneeded it will be elided later. */
3047 vec<stmt_vec_info> stmts;
3048 stmts.create (SLP_TREE_LANES (root));
3049 lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION (root);
3050 for (unsigned j = 0; j < lane_perm.length (); j++)
3052 std::pair<unsigned, unsigned> perm = lane_perm[j];
3053 node = SLP_TREE_CHILDREN (root)[perm.first];
3055 if (!vect_is_slp_load_node (node)
3056 || SLP_TREE_CHILDREN (node).exists ())
3058 stmts.release ();
3059 goto next;
3062 stmts.quick_push (SLP_TREE_SCALAR_STMTS (node)[perm.second]);
3065 if (dump_enabled_p ())
3066 dump_printf_loc (MSG_NOTE, vect_location,
3067 "converting stmts on permute node %p\n",
3068 (void *) root);
3070 bool *matches = XALLOCAVEC (bool, group_size);
3071 poly_uint64 max_nunits = 1;
3072 unsigned tree_size = 0, limit = 1;
3073 node = vect_build_slp_tree (vinfo, stmts, group_size, &max_nunits,
3074 matches, &limit, &tree_size, bst_map);
3075 if (!node)
3076 stmts.release ();
3078 load_map->put (root, node);
3079 return node;
3082 next:
3083 load_map->put (root, NULL);
3085 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
3087 slp_tree value
3088 = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
3089 node);
3090 if (value)
3092 SLP_TREE_REF_COUNT (value)++;
3093 SLP_TREE_CHILDREN (root)[i] = value;
3094 /* ??? We know the original leaves of the replaced nodes will
3095 be referenced by bst_map, only the permutes created by
3096 pattern matching are not. */
3097 if (SLP_TREE_REF_COUNT (node) == 1)
3098 load_map->remove (node);
3099 vect_free_slp_tree (node);
3103 return NULL;
3106 /* Temporary workaround for loads not being CSEd during SLP build. This
3107 function will traverse the SLP tree rooted in ROOT for INSTANCE and find
3108 VEC_PERM nodes that blend vectors from multiple nodes that all read from the
3109 same DR such that the final operation is equal to a permuted load. Such
3110 NODES are then directly converted into LOADS themselves. The nodes are
3111 CSEd using BST_MAP. */
3113 static void
3114 optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t *bst_map,
3115 vec_info *vinfo, unsigned int group_size,
3116 hash_map<slp_tree, slp_tree> *load_map,
3117 slp_tree root)
3119 slp_tree node;
3120 unsigned i;
3122 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
3124 slp_tree value
3125 = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
3126 node);
3127 if (value)
3129 SLP_TREE_REF_COUNT (value)++;
3130 SLP_TREE_CHILDREN (root)[i] = value;
3131 /* ??? We know the original leaves of the replaced nodes will
3132 be referenced by bst_map, only the permutes created by
3133 pattern matching are not. */
3134 if (SLP_TREE_REF_COUNT (node) == 1)
3135 load_map->remove (node);
3136 vect_free_slp_tree (node);
3141 /* Helper function of vect_match_slp_patterns.
3143 Attempts to match patterns against the SLP tree rooted in REF_NODE using
3144 VINFO. Patterns are matched in post-order traversal.
3146 If matching is successful the tree referenced by REF_NODE is updated in
3147 place and true is returned, otherwise it is left unchanged and false is returned. */
3149 static bool
3150 vect_match_slp_patterns_2 (slp_tree *ref_node, vec_info *vinfo,
3151 slp_tree_to_load_perm_map_t *perm_cache,
3152 slp_compat_nodes_map_t *compat_cache,
3153 hash_set<slp_tree> *visited)
3155 unsigned i;
3156 slp_tree node = *ref_node;
3157 bool found_p = false;
3158 if (!node || visited->add (node))
3159 return false;
3161 slp_tree child;
3162 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3163 found_p |= vect_match_slp_patterns_2 (&SLP_TREE_CHILDREN (node)[i],
3164 vinfo, perm_cache, compat_cache,
3165 visited);
3167 for (unsigned x = 0; x < num__slp_patterns; x++)
3169 vect_pattern *pattern
3170 = slp_patterns[x] (perm_cache, compat_cache, ref_node);
3171 if (pattern)
3173 pattern->build (vinfo);
3174 delete pattern;
3175 found_p = true;
3179 return found_p;
3182 /* Applies pattern matching to the SLP tree of INSTANCE using vec_info
3183 VINFO.
3185 Returns true if any pattern matched; the tree is modified in place.
3186 Patterns are tried in order and multiple patterns may match. */
3188 static bool
3189 vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
3190 hash_set<slp_tree> *visited,
3191 slp_tree_to_load_perm_map_t *perm_cache,
3192 slp_compat_nodes_map_t *compat_cache)
3194 DUMP_VECT_SCOPE ("vect_match_slp_patterns");
3195 slp_tree *ref_node = &SLP_INSTANCE_TREE (instance);
3197 if (dump_enabled_p ())
3198 dump_printf_loc (MSG_NOTE, vect_location,
3199 "Analyzing SLP tree %p for patterns\n",
3200 (void *) SLP_INSTANCE_TREE (instance));
3202 return vect_match_slp_patterns_2 (ref_node, vinfo, perm_cache, compat_cache,
3203 visited);
3206 /* STMT_INFO is a store group of size GROUP_SIZE that we are considering
3207 splitting into two, with the first split group having size NEW_GROUP_SIZE.
3208 Return true if we could use IFN_STORE_LANES instead and if that appears
3209 to be the better approach. */
3211 static bool
3212 vect_slp_prefer_store_lanes_p (vec_info *vinfo, stmt_vec_info stmt_info,
3213 unsigned int group_size,
3214 unsigned int new_group_size)
3216 tree scalar_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
3217 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
3218 if (!vectype)
3219 return false;
3220 /* Allow the split if one of the two new groups would operate on full
3221 vectors *within* rather than across one scalar loop iteration.
3222 This is purely a heuristic, but it should work well for group
3223 sizes of 3 and 4, where the possible splits are:
3225 3->2+1: OK if the vector has exactly two elements
3226 4->2+2: Likewise
3227 4->3+1: Less clear-cut. */
3228 if (multiple_p (group_size - new_group_size, TYPE_VECTOR_SUBPARTS (vectype))
3229 || multiple_p (new_group_size, TYPE_VECTOR_SUBPARTS (vectype)))
3230 return false;
3231 return vect_store_lanes_supported (vectype, group_size, false) != IFN_LAST;
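/* Worked example (editor's note): for a 3->2+1 split with a two-element
   vector type, NEW_GROUP_SIZE 2 is a multiple of the vector length, so
   the function returns false and the split is allowed; for a 4->3+1
   split with the same vector type neither 3 nor 1 is such a multiple,
   so the decision falls through to whether IFN_STORE_LANES is supported
   for the group.  */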
3234 /* Analyze an SLP instance starting from a group of grouped stores. Call
3235 vect_build_slp_tree to build a tree of packed stmts if possible.
3236 Return FALSE if it's impossible to SLP any stmt in the loop. */
3238 static bool
3239 vect_analyze_slp_instance (vec_info *vinfo,
3240 scalar_stmts_to_slp_tree_map_t *bst_map,
3241 stmt_vec_info stmt_info, slp_instance_kind kind,
3242 unsigned max_tree_size, unsigned *limit);
3244 /* Analyze an SLP instance starting from SCALAR_STMTS which are a group
3245 of KIND. Return true if successful. */
3247 static bool
3248 vect_build_slp_instance (vec_info *vinfo,
3249 slp_instance_kind kind,
3250 vec<stmt_vec_info> &scalar_stmts,
3251 vec<stmt_vec_info> &root_stmt_infos,
3252 vec<tree> &remain,
3253 unsigned max_tree_size, unsigned *limit,
3254 scalar_stmts_to_slp_tree_map_t *bst_map,
3255 /* ??? We need stmt_info for group splitting. */
3256 stmt_vec_info stmt_info_)
3258 if (kind == slp_inst_kind_ctor)
3260 if (dump_enabled_p ())
3261 dump_printf_loc (MSG_NOTE, vect_location,
3262 "Analyzing vectorizable constructor: %G\n",
3263 root_stmt_infos[0]->stmt);
3266 if (dump_enabled_p ())
3268 dump_printf_loc (MSG_NOTE, vect_location,
3269 "Starting SLP discovery for\n");
3270 for (unsigned i = 0; i < scalar_stmts.length (); ++i)
3271 dump_printf_loc (MSG_NOTE, vect_location,
3272 " %G", scalar_stmts[i]->stmt);
3275 /* When a BB reduction doesn't have an even number of lanes
3276 strip it down, treating the remaining lane as scalar.
3277 ??? Selecting the optimal set of lanes to vectorize would be nice
3278 but SLP build for all lanes will fail quickly because we think
3279 we're going to need unrolling. */
3280 if (kind == slp_inst_kind_bb_reduc
3281 && (scalar_stmts.length () & 1))
3282 remain.safe_insert (0, gimple_get_lhs (scalar_stmts.pop ()->stmt));
3284 /* Build the tree for the SLP instance. */
3285 unsigned int group_size = scalar_stmts.length ();
3286 bool *matches = XALLOCAVEC (bool, group_size);
3287 poly_uint64 max_nunits = 1;
3288 unsigned tree_size = 0;
3289 unsigned i;
3290 slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
3291 &max_nunits, matches, limit,
3292 &tree_size, bst_map);
3293 if (node != NULL)
3295 /* Calculate the unrolling factor based on the smallest type. */
3296 poly_uint64 unrolling_factor
3297 = calculate_unrolling_factor (max_nunits, group_size);
3299 if (maybe_ne (unrolling_factor, 1U)
3300 && is_a <bb_vec_info> (vinfo))
3302 unsigned HOST_WIDE_INT const_max_nunits;
3303 if (!max_nunits.is_constant (&const_max_nunits)
3304 || const_max_nunits > group_size)
3306 if (dump_enabled_p ())
3307 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3308 "Build SLP failed: store group "
3309 "size not a multiple of the vector size "
3310 "in basic block SLP\n");
3311 vect_free_slp_tree (node);
3312 return false;
3314 /* Fatal mismatch. */
3315 if (dump_enabled_p ())
3316 dump_printf_loc (MSG_NOTE, vect_location,
3317 "SLP discovery succeeded but node needs "
3318 "splitting\n");
3319 memset (matches, true, group_size);
3320 matches[group_size / const_max_nunits * const_max_nunits] = false;
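/* Editor's note (illustrative): e.g. with GROUP_SIZE 7 and
   CONST_MAX_NUNITS 4 the index is 7 / 4 * 4 = 4, so lane 4 is marked
   mismatched and the group-splitting code below splits off the first
   full vector of four lanes.  */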
3321 vect_free_slp_tree (node);
3323 else
3325 /* Create a new SLP instance. */
3326 slp_instance new_instance = XNEW (class _slp_instance);
3327 SLP_INSTANCE_TREE (new_instance) = node;
3328 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
3329 SLP_INSTANCE_LOADS (new_instance) = vNULL;
3330 SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
3331 SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
3332 SLP_INSTANCE_KIND (new_instance) = kind;
3333 new_instance->reduc_phis = NULL;
3334 new_instance->cost_vec = vNULL;
3335 new_instance->subgraph_entries = vNULL;
3337 if (dump_enabled_p ())
3338 dump_printf_loc (MSG_NOTE, vect_location,
3339 "SLP size %u vs. limit %u.\n",
3340 tree_size, max_tree_size);
3342 /* Fixup SLP reduction chains. */
3343 if (kind == slp_inst_kind_reduc_chain)
3345 /* If this is a reduction chain with a conversion in front
3346 amend the SLP tree with a node for that. */
3347 gimple *scalar_def
3348 = vect_orig_stmt (scalar_stmts[group_size - 1])->stmt;
3349 if (STMT_VINFO_DEF_TYPE (scalar_stmts[0]) != vect_reduction_def)
3351 /* Get at the conversion stmt - we know it's the single use
3352 of the last stmt of the reduction chain. */
3353 use_operand_p use_p;
3354 bool r = single_imm_use (gimple_assign_lhs (scalar_def),
3355 &use_p, &scalar_def);
3356 gcc_assert (r);
3357 stmt_vec_info next_info = vinfo->lookup_stmt (scalar_def);
3358 next_info = vect_stmt_to_vectorize (next_info);
3359 scalar_stmts = vNULL;
3360 scalar_stmts.create (group_size);
3361 for (unsigned i = 0; i < group_size; ++i)
3362 scalar_stmts.quick_push (next_info);
3363 slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
3364 SLP_TREE_VECTYPE (conv) = STMT_VINFO_VECTYPE (next_info);
3365 SLP_TREE_CHILDREN (conv).quick_push (node);
3366 SLP_INSTANCE_TREE (new_instance) = conv;
3367 /* We also have to fake this conversion stmt as SLP reduction
3368 group so we don't have to mess with too much code
3369 elsewhere. */
3370 REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
3371 REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
3373 /* Fill the backedge child of the PHI SLP node. The
3374 general matching code cannot find it because the
3375 scalar code does not reflect how we vectorize the
3376 reduction. */
3377 use_operand_p use_p;
3378 imm_use_iterator imm_iter;
3379 class loop *loop = LOOP_VINFO_LOOP (as_a <loop_vec_info> (vinfo));
3380 FOR_EACH_IMM_USE_FAST (use_p, imm_iter,
3381 gimple_get_lhs (scalar_def))
3382 /* There are exactly two non-debug uses, the reduction
3383 PHI and the loop-closed PHI node. */
3384 if (!is_gimple_debug (USE_STMT (use_p))
3385 && gimple_bb (USE_STMT (use_p)) == loop->header)
3387 auto_vec<stmt_vec_info, 64> phis (group_size);
3388 stmt_vec_info phi_info
3389 = vinfo->lookup_stmt (USE_STMT (use_p));
3390 for (unsigned i = 0; i < group_size; ++i)
3391 phis.quick_push (phi_info);
3392 slp_tree *phi_node = bst_map->get (phis);
3393 unsigned dest_idx = loop_latch_edge (loop)->dest_idx;
3394 SLP_TREE_CHILDREN (*phi_node)[dest_idx]
3395 = SLP_INSTANCE_TREE (new_instance);
3396 SLP_INSTANCE_TREE (new_instance)->refcnt++;
3400 vinfo->slp_instances.safe_push (new_instance);
3402 /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
3403 the number of scalar stmts in the root in a few places.
3404 Verify that assumption holds. */
3405 gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
3406 .length () == group_size);
3408 if (dump_enabled_p ())
3410 dump_printf_loc (MSG_NOTE, vect_location,
3411 "Final SLP tree for instance %p:\n",
3412 (void *) new_instance);
3413 vect_print_slp_graph (MSG_NOTE, vect_location,
3414 SLP_INSTANCE_TREE (new_instance));
3417 return true;
3420 else
3422 /* Failed to SLP. */
3423 /* Free the allocated memory. */
3424 scalar_stmts.release ();
3427 stmt_vec_info stmt_info = stmt_info_;
3428 /* Try to break the group up into pieces. */
3429 if (kind == slp_inst_kind_store)
3431 /* ??? We could delay all the actual splitting of store-groups
3432 until after SLP discovery of the original group completed.
3433 Then we can recurse to vect_build_slp_instance directly. */
3434 for (i = 0; i < group_size; i++)
3435 if (!matches[i])
3436 break;
3438 /* For basic block SLP, try to break the group up into multiples of
3439 a vector size. */
3440 if (is_a <bb_vec_info> (vinfo)
3441 && (i > 1 && i < group_size))
3443 tree scalar_type
3444 = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
3445 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
3446 1 << floor_log2 (i));
3447 unsigned HOST_WIDE_INT const_nunits;
3448 if (vectype
3449 && TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits))
3451 /* Split into two groups at the first vector boundary. */
3452 gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
3453 unsigned group1_size = i & ~(const_nunits - 1);
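/* Editor's note: the masking rounds I down to a multiple of
   CONST_NUNITS, e.g. I = 5 with 4-element vectors gives
   GROUP1_SIZE 4.  */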
3455 if (dump_enabled_p ())
3456 dump_printf_loc (MSG_NOTE, vect_location,
3457 "Splitting SLP group at stmt %u\n", i);
3458 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
3459 group1_size);
3460 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
3461 kind, max_tree_size,
3462 limit);
3463 /* Split the rest at the failure point and possibly
3464 re-analyze the remaining matching part if it has
3465 at least two lanes. */
3466 if (group1_size < i
3467 && (i + 1 < group_size
3468 || i - group1_size > 1))
3470 stmt_vec_info rest2 = rest;
3471 rest = vect_split_slp_store_group (rest, i - group1_size);
3472 if (i - group1_size > 1)
3473 res |= vect_analyze_slp_instance (vinfo, bst_map, rest2,
3474 kind, max_tree_size,
3475 limit);
3477 /* Re-analyze the non-matching tail if it has at least
3478 two lanes. */
3479 if (i + 1 < group_size)
3480 res |= vect_analyze_slp_instance (vinfo, bst_map,
3481 rest, kind, max_tree_size,
3482 limit);
3483 return res;
3487 /* For loop vectorization split into arbitrary pieces of size > 1. */
3488 if (is_a <loop_vec_info> (vinfo)
3489 && (i > 1 && i < group_size)
3490 && !vect_slp_prefer_store_lanes_p (vinfo, stmt_info, group_size, i))
3492 unsigned group1_size = i;
3494 if (dump_enabled_p ())
3495 dump_printf_loc (MSG_NOTE, vect_location,
3496 "Splitting SLP group at stmt %u\n", i);
3498 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
3499 group1_size);
3500 /* Loop vectorization cannot handle gaps in stores, make sure
3501 the split group appears as strided. */
3502 STMT_VINFO_STRIDED_P (rest) = 1;
3503 DR_GROUP_GAP (rest) = 0;
3504 STMT_VINFO_STRIDED_P (stmt_info) = 1;
3505 DR_GROUP_GAP (stmt_info) = 0;
3507 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
3508 kind, max_tree_size, limit);
3509 if (i + 1 < group_size)
3510 res |= vect_analyze_slp_instance (vinfo, bst_map,
3511 rest, kind, max_tree_size, limit);
3513 return res;
3516 /* Even though the first vector did not all match, we might be able to SLP
3517 (some) of the remainder. FORNOW ignore this possibility. */
3520 /* Failed to SLP. */
3521 if (dump_enabled_p ())
3522 dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
3523 return false;
3527 /* Analyze an SLP instance starting from a group of grouped stores. Call
3528 vect_build_slp_tree to build a tree of packed stmts if possible.
3529 Return FALSE if it's impossible to SLP any stmt in the loop. */
3531 static bool
3532 vect_analyze_slp_instance (vec_info *vinfo,
3533 scalar_stmts_to_slp_tree_map_t *bst_map,
3534 stmt_vec_info stmt_info,
3535 slp_instance_kind kind,
3536 unsigned max_tree_size, unsigned *limit)
3538 unsigned int i;
3539 vec<stmt_vec_info> scalar_stmts;
3541 if (is_a <bb_vec_info> (vinfo))
3542 vect_location = stmt_info->stmt;
3544 stmt_vec_info next_info = stmt_info;
3545 if (kind == slp_inst_kind_store)
3547 /* Collect the stores and store them in scalar_stmts. */
3548 scalar_stmts.create (DR_GROUP_SIZE (stmt_info));
3549 while (next_info)
3551 scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
3552 next_info = DR_GROUP_NEXT_ELEMENT (next_info);
3555 else if (kind == slp_inst_kind_reduc_chain)
3557 /* Collect the reduction stmts and store them in scalar_stmts. */
3558 scalar_stmts.create (REDUC_GROUP_SIZE (stmt_info));
3559 while (next_info)
3561 scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
3562 next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
3564 /* Mark the first element of the reduction chain as reduction to properly
3565 transform the node. In the reduction analysis phase only the last
3566 element of the chain is marked as reduction. */
3567 STMT_VINFO_DEF_TYPE (stmt_info)
3568 = STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
3569 STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
3570 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
3572 else if (kind == slp_inst_kind_reduc_group)
3574 /* Collect reduction statements. */
3575 const vec<stmt_vec_info> &reductions
3576 = as_a <loop_vec_info> (vinfo)->reductions;
3577 scalar_stmts.create (reductions.length ());
3578 for (i = 0; reductions.iterate (i, &next_info); i++)
3579 if ((STMT_VINFO_RELEVANT_P (next_info)
3580 || STMT_VINFO_LIVE_P (next_info))
3581 /* ??? Make sure we didn't skip a conversion around a reduction
3582 path. In that case we'd have to reverse engineer that conversion
3583 stmt following the chain using reduc_idx and from the PHI
3584 using reduc_def. */
3585 && STMT_VINFO_DEF_TYPE (next_info) == vect_reduction_def)
3586 scalar_stmts.quick_push (next_info);
3587 /* If less than two were relevant/live there's nothing to SLP. */
3588 if (scalar_stmts.length () < 2)
3589 return false;
3591 else
3592 gcc_unreachable ();
3594 vec<stmt_vec_info> roots = vNULL;
3595 vec<tree> remain = vNULL;
3596 /* Build the tree for the SLP instance. */
3597 bool res = vect_build_slp_instance (vinfo, kind, scalar_stmts,
3598 roots, remain,
3599 max_tree_size, limit, bst_map,
3600 kind == slp_inst_kind_store
3601 ? stmt_info : NULL);
3603 /* ??? If this is slp_inst_kind_store and the above succeeded here's
3604 where we should do store group splitting. */
3606 return res;
3609 /* Check if there are stmts in the loop that can be vectorized using SLP. Build SLP
3610 trees of packed scalar stmts if SLP is possible. */
3612 opt_result
3613 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
3615 unsigned int i;
3616 stmt_vec_info first_element;
3617 slp_instance instance;
3619 DUMP_VECT_SCOPE ("vect_analyze_slp");
3621 unsigned limit = max_tree_size;
3623 scalar_stmts_to_slp_tree_map_t *bst_map
3624 = new scalar_stmts_to_slp_tree_map_t ();
3626 /* Find SLP sequences starting from groups of grouped stores. */
3627 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
3628 vect_analyze_slp_instance (vinfo, bst_map, first_element,
3629 slp_inst_kind_store, max_tree_size, &limit);
3631 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
3633 for (unsigned i = 0; i < bb_vinfo->roots.length (); ++i)
3635 vect_location = bb_vinfo->roots[i].roots[0]->stmt;
3636 if (vect_build_slp_instance (bb_vinfo, bb_vinfo->roots[i].kind,
3637 bb_vinfo->roots[i].stmts,
3638 bb_vinfo->roots[i].roots,
3639 bb_vinfo->roots[i].remain,
3640 max_tree_size, &limit, bst_map, NULL))
3642 bb_vinfo->roots[i].stmts = vNULL;
3643 bb_vinfo->roots[i].roots = vNULL;
3644 bb_vinfo->roots[i].remain = vNULL;
3649 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3651 /* Find SLP sequences starting from reduction chains. */
3652 FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
3653 if (! STMT_VINFO_RELEVANT_P (first_element)
3654 && ! STMT_VINFO_LIVE_P (first_element))
3656 else if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
3657 slp_inst_kind_reduc_chain,
3658 max_tree_size, &limit))
3660 /* Dissolve reduction chain group. */
3661 stmt_vec_info vinfo = first_element;
3662 stmt_vec_info last = NULL;
3663 while (vinfo)
3665 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
3666 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
3667 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
3668 last = vinfo;
3669 vinfo = next;
3671 STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
3672 /* It can still be vectorized as part of an SLP reduction. */
3673 loop_vinfo->reductions.safe_push (last);
3676 /* Find SLP sequences starting from groups of reductions. */
3677 if (loop_vinfo->reductions.length () > 1)
3678 vect_analyze_slp_instance (vinfo, bst_map, loop_vinfo->reductions[0],
3679 slp_inst_kind_reduc_group, max_tree_size,
3680 &limit);
3683 hash_set<slp_tree> visited_patterns;
3684 slp_tree_to_load_perm_map_t perm_cache;
3685 slp_compat_nodes_map_t compat_cache;
3687 /* See if any patterns can be found in the SLP tree. */
3688 bool pattern_found = false;
3689 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3690 pattern_found |= vect_match_slp_patterns (instance, vinfo,
3691 &visited_patterns, &perm_cache,
3692 &compat_cache);
3694 /* If any were found optimize permutations of loads. */
3695 if (pattern_found)
3697 hash_map<slp_tree, slp_tree> load_map;
3698 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3700 slp_tree root = SLP_INSTANCE_TREE (instance);
3701 optimize_load_redistribution (bst_map, vinfo, SLP_TREE_LANES (root),
3702 &load_map, root);
3708 /* The map keeps a reference to the SLP nodes built; release that. */
3709 for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
3710 it != bst_map->end (); ++it)
3711 if ((*it).second)
3712 vect_free_slp_tree ((*it).second);
3713 delete bst_map;
3715 if (pattern_found && dump_enabled_p ())
3717 dump_printf_loc (MSG_NOTE, vect_location,
3718 "Pattern matched SLP tree\n");
3719 hash_set<slp_tree> visited;
3720 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3721 vect_print_slp_graph (MSG_NOTE, vect_location,
3722 SLP_INSTANCE_TREE (instance), visited);
3725 return opt_result::success ();
3728 /* Estimates the cost of inserting layout changes into the SLP graph.
3729 It can also say that the insertion is impossible. */
3731 struct slpg_layout_cost
3733 slpg_layout_cost () = default;
3734 slpg_layout_cost (sreal, bool);
3736 static slpg_layout_cost impossible () { return { sreal::max (), 0 }; }
3737 bool is_possible () const { return depth != sreal::max (); }
3739 bool operator== (const slpg_layout_cost &) const;
3740 bool operator!= (const slpg_layout_cost &) const;
3742 bool is_better_than (const slpg_layout_cost &, bool) const;
3744 void add_parallel_cost (const slpg_layout_cost &);
3745 void add_serial_cost (const slpg_layout_cost &);
3746 void split (unsigned int);
3748 /* The longest sequence of layout changes needed during any traversal
3749 of the partition dag, weighted by execution frequency.
3751 This is the most important metric when optimizing for speed, since
3752 it helps to ensure that we keep the number of operations on
3753 critical paths to a minimum. */
3754 sreal depth = 0;
3756 /* An estimate of the total number of operations needed. It is weighted by
3757 execution frequency when optimizing for speed but not when optimizing for
3758 size. In order to avoid double-counting, a node with a fanout of N will
3759 distribute 1/N of its total cost to each successor.
3761 This is the most important metric when optimizing for size, since
3762 it helps to keep the total number of operations to a minimum. */
3763 sreal total = 0;
3766 /* Construct costs for a node with weight WEIGHT. A higher weight
3767 indicates more frequent execution. IS_FOR_SIZE is true if we are
3768 optimizing for size rather than speed. */
3770 slpg_layout_cost::slpg_layout_cost (sreal weight, bool is_for_size)
3771 : depth (weight), total (is_for_size && weight > 0 ? 1 : weight)
3775 bool
3776 slpg_layout_cost::operator== (const slpg_layout_cost &other) const
3778 return depth == other.depth && total == other.total;
3781 bool
3782 slpg_layout_cost::operator!= (const slpg_layout_cost &other) const
3784 return !operator== (other);
3787 /* Return true if these costs are better than OTHER. IS_FOR_SIZE is
3788 true if we are optimizing for size rather than speed. */
3790 bool
3791 slpg_layout_cost::is_better_than (const slpg_layout_cost &other,
3792 bool is_for_size) const
3794 if (is_for_size)
3796 if (total != other.total)
3797 return total < other.total;
3798 return depth < other.depth;
3800 else
3802 if (depth != other.depth)
3803 return depth < other.depth;
3804 return total < other.total;
3808 /* Increase the costs to account for something with cost INPUT_COST
3809 happening in parallel with the current costs. */
3811 void
3812 slpg_layout_cost::add_parallel_cost (const slpg_layout_cost &input_cost)
3814 depth = std::max (depth, input_cost.depth);
3815 total += input_cost.total;
3818 /* Increase the costs to account for something with cost INPUT_COST
3819 happening in series with the current costs. */
3821 void
3822 slpg_layout_cost::add_serial_cost (const slpg_layout_cost &other)
3824 depth += other.depth;
3825 total += other.total;
3828 /* Split the total cost among TIMES successors or predecessors. */
3830 void
3831 slpg_layout_cost::split (unsigned int times)
3833 if (times > 1)
3834 total /= times;
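/* A worked example of how these costs combine (illustrative numbers only,
   assuming we are optimizing for speed so that TOTAL is frequency-weighted
   like DEPTH).  Starting each time from A = { depth 2, total 2 } and
   B = { depth 3, total 3 }:

     A.add_parallel_cost (B)  ->  A = { depth max (2, 3) = 3, total 2 + 3 = 5 }
     A.add_serial_cost (B)    ->  A = { depth 2 + 3 = 5,      total 2 + 3 = 5 }
     A.split (2)              ->  A = { depth 2,              total 2 / 2 = 1 }

   Only TOTAL is divided by split; DEPTH tracks the longest path and is not
   reduced by spreading the cost over several successors.  */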
3837 /* Information about one node in the SLP graph, for use during
3838 vect_optimize_slp_pass. */
3840 struct slpg_vertex
3842 slpg_vertex (slp_tree node_) : node (node_) {}
3844 /* The node itself. */
3845 slp_tree node;
3847 /* Which partition the node belongs to, or -1 if none. Nodes outside of
3848 partitions are flexible; they can have whichever layout consumers
3849 want them to have. */
3850 int partition = -1;
3852 /* The number of nodes that directly use the result of this one
3853 (i.e. the number of nodes that count this one as a child). */
3854 unsigned int out_degree = 0;
3856 /* The execution frequency of the node. */
3857 sreal weight = 0;
3859 /* The total execution frequency of all nodes that directly use the
3860 result of this one. */
3861 sreal out_weight = 0;
3864 /* Information about one partition of the SLP graph, for use during
3865 vect_optimize_slp_pass. */
3867 struct slpg_partition_info
3869 /* The nodes in the partition occupy indices [NODE_BEGIN, NODE_END)
3870 of m_partitioned_nodes. */
3871 unsigned int node_begin = 0;
3872 unsigned int node_end = 0;
3874 /* Which layout we've chosen to use for this partition, or -1 if
3875 we haven't picked one yet. */
3876 int layout = -1;
3878 /* The number of predecessors and successors in the partition dag.
3879 The predecessors always have lower partition numbers and the
3880 successors always have higher partition numbers.
3882 Note that the directions of these edges are not necessarily the
3883 same as in the data flow graph. For example, if an SCC has separate
3884 partitions for an inner loop and an outer loop, the inner loop's
3885 partition will have at least two incoming edges from the outer loop's
3886 partition: one for a live-in value and one for a live-out value.
3887 In data flow terms, one of these edges would also be from the outer loop
3888 to the inner loop, but the other would be in the opposite direction. */
3889 unsigned int in_degree = 0;
3890 unsigned int out_degree = 0;
3893 /* Information about the costs of using a particular layout for a
3894 particular partition. It can also say that the combination is
3895 impossible. */
3897 struct slpg_partition_layout_costs
3899 bool is_possible () const { return internal_cost.is_possible (); }
3900 void mark_impossible () { internal_cost = slpg_layout_cost::impossible (); }
3902 /* The costs inherited from predecessor partitions. */
3903 slpg_layout_cost in_cost;
3905 /* The inherent cost of the layout within the node itself. For example,
3906 this is nonzero for a load if choosing a particular layout would require
3907 the load to permute the loaded elements. It is nonzero for a
3908 VEC_PERM_EXPR if the permutation cannot be eliminated or converted
3909 to full-vector moves. */
3910 slpg_layout_cost internal_cost;
3912 /* The costs inherited from successor partitions. */
3913 slpg_layout_cost out_cost;
3916 /* This class tries to optimize the layout of vectors in order to avoid
3917 unnecessary shuffling. At the moment, the set of possible layouts is
3918 restricted to bijective permutations.
3920 The goal of the pass depends on whether we're optimizing for size or
3921 for speed. When optimizing for size, the goal is to reduce the overall
3922 number of layout changes (including layout changes implied by things
3923 like load permutations). When optimizing for speed, the goal is to
3924 reduce the maximum latency attributable to layout changes on any
3925 non-cyclical path through the data flow graph.
3927 For example, when optimizing a loop nest for speed, we will prefer
3928 to make layout changes outside of a loop rather than inside of a loop,
3929 and will prefer to make layout changes in parallel rather than serially,
3930 even if that increases the overall number of layout changes.
3932 The high-level procedure is:
3934 (1) Build a graph in which edges go from uses (parents) to definitions
3935 (children).
3937 (2) Divide the graph into a dag of strongly-connected components (SCCs).
3939 (3) When optimizing for speed, partition the nodes in each SCC based
3940 on their containing cfg loop. When optimizing for size, treat
3941 each SCC as a single partition.
3943 This gives us a dag of partitions. The goal is now to assign a
3944 layout to each partition.
3946 (4) Construct a set of vector layouts that are worth considering.
3947 Record which nodes must keep their current layout.
3949 (5) Perform a forward walk over the partition dag (from loads to stores)
3950 accumulating the "forward" cost of using each layout. When visiting
3951 each partition, assign a tentative choice of layout to the partition
3952 and use that choice when calculating the cost of using a different
3953 layout in successor partitions.
3955 (6) Perform a backward walk over the partition dag (from stores to loads),
3956 accumulating the "backward" cost of using each layout. When visiting
3957 each partition, make a final choice of layout for that partition based
3958 on the accumulated forward costs (from (5)) and backward costs
3959 (from (6)).
3961 (7) Apply the chosen layouts to the SLP graph.
3963 For example, consider the SLP statements:
3965 S1: a_1 = load
3966 loop:
3967 S2: a_2 = PHI<a_1, a_3>
3968 S3: b_1 = load
3969 S4: a_3 = a_2 + b_1
3970 exit:
3971 S5: a_4 = PHI<a_3>
3972 S6: store a_4
3974 S2 and S4 form an SCC and are part of the same loop. Every other
3975 statement is in a singleton SCC. In this example there is a one-to-one
3976 mapping between SCCs and partitions and the partition dag looks like this:
3978    S1     S3
3979     \     /
3980      S2+S4
3981        |
3982       S5
3983        |
3984       S6
3986 S2, S3 and S4 will have a higher execution frequency than the other
3987 statements, so when optimizing for speed, the goal is to avoid any
3988 layout changes:
3990 - within S3
3991 - within S2+S4
3992 - on the S3->S2+S4 edge
3994 For example, if S3 was originally a reversing load, the goal of the
3995 pass is to make it an unreversed load and change the layout on the
3996 S1->S2+S4 and S2+S4->S5 edges to compensate. (Changing the layout
3997 on S1->S2+S4 and S5->S6 would also be acceptable.)
3999 The difference between SCCs and partitions becomes important if we
4000 add an outer loop:
4002 S1: a_1 = ...
4003 loop1:
4004 S2: a_2 = PHI<a_1, a_6>
4005 S3: b_1 = load
4006 S4: a_3 = a_2 + b_1
4007 loop2:
4008 S5: a_4 = PHI<a_3, a_5>
4009 S6: c_1 = load
4010 S7: a_5 = a_4 + c_1
4011 exit2:
4012 S8: a_6 = PHI<a_5>
4013 S9: store a_6
4014 exit1:
4016 Here, S2, S4, S5, S7 and S8 form a single SCC. However, when optimizing
4017 for speed, we usually do not want restrictions in the outer loop to "infect"
4018 the decision for the inner loop. For example, if an outer-loop node
4019 in the SCC contains a statement with a fixed layout, that should not
4020 prevent the inner loop from using a different layout. Conversely,
4021 the inner loop should not dictate a layout to the outer loop: if the
4022 outer loop does a lot of computation, then it may not be efficient to
4023 do all of that computation in the inner loop's preferred layout.
4025 So when optimizing for speed, we partition the SCC into S2+S4+S8 (outer)
4026 and S5+S7 (inner). We also try to arrange partitions so that:
4028 - the partition for an outer loop comes before the partition for
4029 an inner loop
4031 - if a sibling loop A dominates a sibling loop B, A's partition
4032 comes before B's
4034 This gives the following partition dag for the example above:
4036    S1        S3
4037     \        /
4038      S2+S4+S8   S6
4039        |   \\    /
4040        |    S5+S7
4041        |
4042       S9
4044 There are two edges from S2+S4+S8 to S5+S7: one for the edge S4->S5 and
4045 one for a reversal of the edge S7->S8.
4047 The backward walk picks a layout for S5+S7 before S2+S4+S8. The choice
4048 for S2+S4+S8 therefore has to balance the cost of using the outer loop's
4049 preferred layout against the cost of changing the layout on entry to the
4050 inner loop (S4->S5) and on exit from the inner loop (S7->S8 reversed).
4052 Although this works well when optimizing for speed, it has the downside
4053 when optimizing for size that the choice of layout for S5+S7 is completely
4054 independent of S9, which lessens the chance of reducing the overall number
4055 of permutations. We therefore do not partition SCCs when optimizing
4056 for size.
4058 To give a concrete example of the difference between optimizing
4059 for size and speed, consider:
4061 a[0] = (b[1] << c[3]) - d[1];
4062 a[1] = (b[0] << c[2]) - d[0];
4063 a[2] = (b[3] << c[1]) - d[3];
4064 a[3] = (b[2] << c[0]) - d[2];
4066 There are three different layouts here: one for a, one for b and d,
4067 and one for c. When optimizing for speed it is better to permute each
4068 of b, c and d into the order required by a, since those permutations
4069 happen in parallel. But when optimizing for size, it is better to:
4071 - permute c into the same order as b
4072 - do the arithmetic
4073 - permute the result into the order required by a
4075 This gives 2 permutations rather than 3. */
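/* A sketch of the two strategies for the a/b/c/d example above (editorial
   pseudo-code with illustrative temporaries b', c', d' and t; the pass
   itself expresses these permutations as VEC_PERM_EXPR nodes):

     speed:  b' = permute (b);  c' = permute (c);  d' = permute (d);
	     a = (b' << c') - d';
	     three permutations, but independent of each other, so the
	     layout-change depth is 1.

     size:   c' = permute (c);		// match b's and d's order
	     t = (b << c') - d;
	     a = permute (t);		// match a's order
	     two permutations, executed serially, so depth 2 but a lower
	     total count.  */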
4077 class vect_optimize_slp_pass
4079 public:
4080 vect_optimize_slp_pass (vec_info *vinfo) : m_vinfo (vinfo) {}
4081 void run ();
4083 private:
4084 /* Graph building. */
4085 struct loop *containing_loop (slp_tree);
4086 bool is_cfg_latch_edge (graph_edge *);
4087 void build_vertices (hash_set<slp_tree> &, slp_tree);
4088 void build_vertices ();
4089 void build_graph ();
4091 /* Partitioning. */
4092 void create_partitions ();
4093 template<typename T> void for_each_partition_edge (unsigned int, T);
4095 /* Layout selection. */
4096 bool is_compatible_layout (slp_tree, unsigned int);
4097 int change_layout_cost (slp_tree, unsigned int, unsigned int);
4098 slpg_partition_layout_costs &partition_layout_costs (unsigned int,
4099 unsigned int);
4100 void change_vec_perm_layout (slp_tree, lane_permutation_t &,
4101 int, unsigned int);
4102 int internal_node_cost (slp_tree, int, unsigned int);
4103 void start_choosing_layouts ();
4105 /* Cost propagation. */
4106 slpg_layout_cost edge_layout_cost (graph_edge *, unsigned int,
4107 unsigned int, unsigned int);
4108 slpg_layout_cost total_in_cost (unsigned int);
4109 slpg_layout_cost forward_cost (graph_edge *, unsigned int, unsigned int);
4110 slpg_layout_cost backward_cost (graph_edge *, unsigned int, unsigned int);
4111 void forward_pass ();
4112 void backward_pass ();
4114 /* Rematerialization. */
4115 slp_tree get_result_with_layout (slp_tree, unsigned int);
4116 void materialize ();
4118 /* Clean-up. */
4119 void remove_redundant_permutations ();
4121 void dump ();
4123 vec_info *m_vinfo;
4125 /* True if we should optimize the graph for size, false if we should
4126 optimize it for speed. (It wouldn't be easy to make this decision
4127 more locally.) */
4128 bool m_optimize_size;
4130 /* A graph of all SLP nodes, with edges leading from uses to definitions.
4131 In other words, a node's predecessors are its slp_tree parents and
4132 a node's successors are its slp_tree children. */
4133 graph *m_slpg = nullptr;
4135 /* The vertices of M_SLPG, indexed by slp_tree::vertex. */
4136 auto_vec<slpg_vertex> m_vertices;
4138 /* The list of all leaves of M_SLPG, such as external definitions, constants,
4139 and loads. */
4140 auto_vec<int> m_leafs;
4142 /* This array has one entry for every vector layout that we're considering.
4143 Element 0 is null and indicates "no change". Other entries describe
4144 permutations that are inherent in the current graph and that we would
4145 like to reverse if possible.
4147 For example, a permutation { 1, 2, 3, 0 } means that something has
4148 effectively been permuted in that way, such as a load group
4149 { a[1], a[2], a[3], a[0] } (viewed as a permutation of a[0:3]).
4150 We'd then like to apply the reverse permutation { 3, 0, 1, 2 }
4151 in order to put things "back" in order. */
4152 auto_vec<vec<unsigned> > m_perms;
4154 /* A partitioning of the nodes for which a layout must be chosen.
4155 Each partition represents an <SCC, cfg loop> pair; that is,
4156 nodes in different SCCs belong to different partitions, and nodes
4157 within an SCC can be further partitioned according to a containing
4158 cfg loop. Partition <SCC1, L1> comes before <SCC2, L2> if:
4160 - SCC1 != SCC2 and SCC1 is a predecessor of SCC2 in a forward walk
4161 from leaves (such as loads) to roots (such as stores).
4163 - SCC1 == SCC2 and L1's header strictly dominates L2's header. */
4164 auto_vec<slpg_partition_info> m_partitions;
4166 /* The list of all nodes for which a layout must be chosen. Nodes for
4167 partition P come before the nodes for partition P+1. Nodes within a
4168 partition are in reverse postorder. */
4169 auto_vec<unsigned int> m_partitioned_nodes;
4171 /* Index P * num-layouts + L contains the cost of using layout L
4172 for partition P. */
4173 auto_vec<slpg_partition_layout_costs> m_partition_layout_costs;
4175 /* Index N * num-layouts + L, if nonnull, is a node that provides the
4176 original output of node N adjusted to have layout L. */
4177 auto_vec<slp_tree> m_node_layouts;
4180 /* Fill the vertices and leafs vector with all nodes in the SLP graph.
4181 Also record whether we should optimize anything for speed rather
4182 than size. */
4184 void
4185 vect_optimize_slp_pass::build_vertices (hash_set<slp_tree> &visited,
4186 slp_tree node)
4188 unsigned i;
4189 slp_tree child;
4191 if (visited.add (node))
4192 return;
4194 if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
4196 basic_block bb = gimple_bb (vect_orig_stmt (rep)->stmt);
4197 if (optimize_bb_for_speed_p (bb))
4198 m_optimize_size = false;
4201 node->vertex = m_vertices.length ();
4202 m_vertices.safe_push (slpg_vertex (node));
4204 bool leaf = true;
4205 bool force_leaf = false;
4206 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4207 if (child)
4209 leaf = false;
4210 build_vertices (visited, child);
4212 else
4213 force_leaf = true;
4214 /* Since SLP discovery works along use-def edges, all cycles have an
4215 entry - but there's the exception of cycles where we do not handle
4216 the entry explicitly (but with a NULL SLP node), like some reductions
4217 and inductions. Force those SLP PHIs to act as leaves to make them
4218 backwards reachable. */
4219 if (leaf || force_leaf)
4220 m_leafs.safe_push (node->vertex);
4223 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
4225 void
4226 vect_optimize_slp_pass::build_vertices ()
4228 hash_set<slp_tree> visited;
4229 unsigned i;
4230 slp_instance instance;
4231 FOR_EACH_VEC_ELT (m_vinfo->slp_instances, i, instance)
4232 build_vertices (visited, SLP_INSTANCE_TREE (instance));
4235 /* Apply (reverse) bijective PERM to VEC. */
4237 template <class T>
4238 static void
4239 vect_slp_permute (vec<unsigned> perm,
4240 vec<T> &vec, bool reverse)
4242 auto_vec<T, 64> saved;
4243 saved.create (vec.length ());
4244 for (unsigned i = 0; i < vec.length (); ++i)
4245 saved.quick_push (vec[i]);
4247 if (reverse)
4249 for (unsigned i = 0; i < vec.length (); ++i)
4250 vec[perm[i]] = saved[i];
4251 for (unsigned i = 0; i < vec.length (); ++i)
4252 gcc_assert (vec[perm[i]] == saved[i]);
4254 else
4256 for (unsigned i = 0; i < vec.length (); ++i)
4257 vec[i] = saved[perm[i]];
4258 for (unsigned i = 0; i < vec.length (); ++i)
4259 gcc_assert (vec[i] == saved[perm[i]]);
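/* For example (illustrative values), with PERM = { 1, 2, 3, 0 } and
   VEC = { a, b, c, d }:

     vect_slp_permute (perm, vec, false)  ->  VEC = { b, c, d, a }
	(vec[i] = saved[perm[i]], i.e. element I is taken from lane PERM[I])

     vect_slp_permute (perm, vec, true)   ->  VEC = { d, a, b, c }
	(vec[perm[i]] = saved[i], the inverse of the above).  */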
4263 /* Return the cfg loop that contains NODE. */
4265 struct loop *
4266 vect_optimize_slp_pass::containing_loop (slp_tree node)
4268 stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
4269 if (!rep)
4270 return ENTRY_BLOCK_PTR_FOR_FN (cfun)->loop_father;
4271 return gimple_bb (vect_orig_stmt (rep)->stmt)->loop_father;
4274 /* Return true if UD (an edge from a use to a definition) is associated
4275 with a loop latch edge in the cfg. */
4277 bool
4278 vect_optimize_slp_pass::is_cfg_latch_edge (graph_edge *ud)
4280 slp_tree use = m_vertices[ud->src].node;
4281 slp_tree def = m_vertices[ud->dest].node;
4282 if (SLP_TREE_DEF_TYPE (use) != vect_internal_def
4283 || SLP_TREE_DEF_TYPE (def) != vect_internal_def)
4284 return false;
4286 stmt_vec_info use_rep = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (use));
4287 return (is_a<gphi *> (use_rep->stmt)
4288 && bb_loop_header_p (gimple_bb (use_rep->stmt))
4289 && containing_loop (def) == containing_loop (use));
4292 /* Build the graph. Mark edges that correspond to cfg loop latch edges with
4293 a nonnull data field. */
4295 void
4296 vect_optimize_slp_pass::build_graph ()
4298 m_optimize_size = true;
4299 build_vertices ();
4301 m_slpg = new_graph (m_vertices.length ());
4302 for (slpg_vertex &v : m_vertices)
4303 for (slp_tree child : SLP_TREE_CHILDREN (v.node))
4304 if (child)
4306 graph_edge *ud = add_edge (m_slpg, v.node->vertex, child->vertex);
4307 if (is_cfg_latch_edge (ud))
4308 ud->data = this;
4312 /* Return true if E corresponds to a loop latch edge in the cfg. */
4314 static bool
4315 skip_cfg_latch_edges (graph_edge *e)
4317 return e->data;
4320 /* Create the node partitions. */
4322 void
4323 vect_optimize_slp_pass::create_partitions ()
4325 /* Calculate a postorder of the graph, ignoring edges that correspond
4326 to natural latch edges in the cfg. Reading the vector from the end
4327 to the beginning gives the reverse postorder. */
4328 auto_vec<int> initial_rpo;
4329 graphds_dfs (m_slpg, &m_leafs[0], m_leafs.length (), &initial_rpo,
4330 false, NULL, skip_cfg_latch_edges);
4331 gcc_assert (initial_rpo.length () == m_vertices.length ());
4333 /* Calculate the strongly connected components of the graph. */
4334 auto_vec<int> scc_grouping;
4335 unsigned int num_sccs = graphds_scc (m_slpg, NULL, NULL, &scc_grouping);
4337 /* Create a new index order in which all nodes from the same SCC are
4338 consecutive. Use scc_pos to record the index of the first node in
4339 each SCC. */
4340 auto_vec<unsigned int> scc_pos (num_sccs);
4341 int last_component = -1;
4342 unsigned int node_count = 0;
4343 for (unsigned int node_i : scc_grouping)
4345 if (last_component != m_slpg->vertices[node_i].component)
4347 last_component = m_slpg->vertices[node_i].component;
4348 gcc_assert (last_component == int (scc_pos.length ()));
4349 scc_pos.quick_push (node_count);
4351 node_count += 1;
4353 gcc_assert (node_count == initial_rpo.length ()
4354 && last_component + 1 == int (num_sccs));
4356 /* Use m_partitioned_nodes to group nodes into SCC order, with the nodes
4357 inside each SCC following the RPO we calculated above. The fact that
4358 we ignored natural latch edges when calculating the RPO should ensure
4359 that, for natural loop nests:
4361 - the first node that we encounter in a cfg loop is the loop header phi
4362 - the loop header phis are in dominance order
4364 Arranging for this is an optimization (see below) rather than a
4365 correctness issue. Unnatural loops with a tangled mess of backedges
4366 will still work correctly, but might give poorer results.
4368 Also update scc_pos so that it gives 1 + the index of the last node
4369 in the SCC. */
4370 m_partitioned_nodes.safe_grow (node_count);
4371 for (unsigned int old_i = initial_rpo.length (); old_i-- > 0;)
4373 unsigned int node_i = initial_rpo[old_i];
4374 unsigned int new_i = scc_pos[m_slpg->vertices[node_i].component]++;
4375 m_partitioned_nodes[new_i] = node_i;
4378 /* When optimizing for speed, partition each SCC based on the containing
4379 cfg loop. The order we constructed above should ensure that, for natural
4380 cfg loops, we'll create sub-SCC partitions for outer loops before
4381 the corresponding sub-SCC partitions for inner loops. Similarly,
4382 when one sibling loop A dominates another sibling loop B, we should
4383 create a sub-SCC partition for A before a sub-SCC partition for B.
4385 As above, nothing depends for correctness on whether this achieves
4386 a natural nesting, but we should get better results when it does. */
4387 m_partitions.reserve (m_vertices.length ());
4388 unsigned int next_partition_i = 0;
4389 hash_map<struct loop *, int> loop_partitions;
4390 unsigned int rpo_begin = 0;
4391 unsigned int num_partitioned_nodes = 0;
4392 for (unsigned int rpo_end : scc_pos)
4394 loop_partitions.empty ();
4395 unsigned int partition_i = next_partition_i;
4396 for (unsigned int rpo_i = rpo_begin; rpo_i < rpo_end; ++rpo_i)
4398 /* Handle externals and constants optimistically throughout.
4399 But treat existing vectors as fixed since we do not handle
4400 permuting them. */
4401 unsigned int node_i = m_partitioned_nodes[rpo_i];
4402 auto &vertex = m_vertices[node_i];
4403 if ((SLP_TREE_DEF_TYPE (vertex.node) == vect_external_def
4404 && !SLP_TREE_VEC_DEFS (vertex.node).exists ())
4405 || SLP_TREE_DEF_TYPE (vertex.node) == vect_constant_def)
4406 vertex.partition = -1;
4407 else
4409 bool existed;
4410 if (m_optimize_size)
4411 existed = next_partition_i > partition_i;
4412 else
4414 struct loop *loop = containing_loop (vertex.node);
4415 auto &entry = loop_partitions.get_or_insert (loop, &existed);
4416 if (!existed)
4417 entry = next_partition_i;
4418 partition_i = entry;
4420 if (!existed)
4422 m_partitions.quick_push (slpg_partition_info ());
4423 next_partition_i += 1;
4425 vertex.partition = partition_i;
4426 num_partitioned_nodes += 1;
4427 m_partitions[partition_i].node_end += 1;
4430 rpo_begin = rpo_end;
4433 /* Assign ranges of consecutive node indices to each partition,
4434 in partition order. Start with node_end being the same as
4435 node_begin so that the next loop can use it as a counter. */
4436 unsigned int node_begin = 0;
4437 for (auto &partition : m_partitions)
4439 partition.node_begin = node_begin;
4440 node_begin += partition.node_end;
4441 partition.node_end = partition.node_begin;
4443 gcc_assert (node_begin == num_partitioned_nodes);
4445 /* Finally build the list of nodes in partition order. */
4446 m_partitioned_nodes.truncate (num_partitioned_nodes);
4447 for (unsigned int node_i = 0; node_i < m_vertices.length (); ++node_i)
4449 int partition_i = m_vertices[node_i].partition;
4450 if (partition_i >= 0)
4452 unsigned int order_i = m_partitions[partition_i].node_end++;
4453 m_partitioned_nodes[order_i] = node_i;
4458 /* Look for edges from earlier partitions into node NODE_I and edges from
4459 node NODE_I into later partitions. Call:
4461 FN (ud, other_node_i)
4463 for each such use-to-def edge ud, where other_node_i is the node at the
4464 other end of the edge. */
4466 template<typename T>
4467 void
4468 vect_optimize_slp_pass::for_each_partition_edge (unsigned int node_i, T fn)
4470 int partition_i = m_vertices[node_i].partition;
4471 for (graph_edge *pred = m_slpg->vertices[node_i].pred;
4472 pred; pred = pred->pred_next)
4474 int src_partition_i = m_vertices[pred->src].partition;
4475 if (src_partition_i >= 0 && src_partition_i != partition_i)
4476 fn (pred, pred->src);
4478 for (graph_edge *succ = m_slpg->vertices[node_i].succ;
4479 succ; succ = succ->succ_next)
4481 int dest_partition_i = m_vertices[succ->dest].partition;
4482 if (dest_partition_i >= 0 && dest_partition_i != partition_i)
4483 fn (succ, succ->dest);
4487 /* Return true if layout LAYOUT_I is compatible with the number of SLP lanes
4488 that NODE would operate on. This test is independent of NODE's actual
4489 operation. */
4491 bool
4492 vect_optimize_slp_pass::is_compatible_layout (slp_tree node,
4493 unsigned int layout_i)
4495 if (layout_i == 0)
4496 return true;
4498 if (SLP_TREE_LANES (node) != m_perms[layout_i].length ())
4499 return false;
4501 return true;
4504 /* Return the cost (in arbitrary units) of going from layout FROM_LAYOUT_I
4505 to layout TO_LAYOUT_I for a node like NODE. Return -1 if either of the
4506 layouts is incompatible with NODE or if the change is not possible for
4507 some other reason.
4509 The properties taken from NODE include the number of lanes and the
4510 vector type. The actual operation doesn't matter. */
4513 vect_optimize_slp_pass::change_layout_cost (slp_tree node,
4514 unsigned int from_layout_i,
4515 unsigned int to_layout_i)
4517 if (!is_compatible_layout (node, from_layout_i)
4518 || !is_compatible_layout (node, to_layout_i))
4519 return -1;
4521 if (from_layout_i == to_layout_i)
4522 return 0;
4524 auto_vec<slp_tree, 1> children (1);
4525 children.quick_push (node);
4526 auto_lane_permutation_t perm (SLP_TREE_LANES (node));
4527 if (from_layout_i > 0)
4528 for (unsigned int i : m_perms[from_layout_i])
4529 perm.quick_push ({ 0, i });
4530 else
4531 for (unsigned int i = 0; i < SLP_TREE_LANES (node); ++i)
4532 perm.quick_push ({ 0, i });
4533 if (to_layout_i > 0)
4534 vect_slp_permute (m_perms[to_layout_i], perm, true);
4535 auto count = vectorizable_slp_permutation_1 (m_vinfo, nullptr, node, perm,
4536 children, false);
4537 if (count >= 0)
4538 return MAX (count, 1);
4540 /* ??? In principle we could try changing via layout 0, giving two
4541 layout changes rather than 1. Doing that would require
4542 corresponding support in get_result_with_layout. */
4543 return -1;
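/* For example (illustrative only): changing a four-lane NODE from layout
   { 1, 2, 3, 0 } to layout 0 builds the single-input lane permutation

     { (0,1), (0,2), (0,3), (0,0) }

   and asks vectorizable_slp_permutation_1 (with a null gsi, so nothing is
   emitted) whether the target can perform it; the number of vector permutes
   it reports, clamped to at least 1, becomes the cost.  */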
4546 /* Return the costs of assigning layout LAYOUT_I to partition PARTITION_I. */
4548 inline slpg_partition_layout_costs &
4549 vect_optimize_slp_pass::partition_layout_costs (unsigned int partition_i,
4550 unsigned int layout_i)
4552 return m_partition_layout_costs[partition_i * m_perms.length () + layout_i];
4555 /* Change PERM in one of two ways:
4557 - if IN_LAYOUT_I < 0, accept input operand I in the layout that has been
4558 chosen for child I of NODE.
4560 - if IN_LAYOUT_I >= 0, accept all input operands with that layout.
4562 In both cases, arrange for the output to have layout OUT_LAYOUT_I. */
4564 void
4565 vect_optimize_slp_pass::
4566 change_vec_perm_layout (slp_tree node, lane_permutation_t &perm,
4567 int in_layout_i, unsigned int out_layout_i)
4569 for (auto &entry : perm)
4571 int this_in_layout_i = in_layout_i;
4572 if (this_in_layout_i < 0)
4574 slp_tree in_node = SLP_TREE_CHILDREN (node)[entry.first];
4575 unsigned int in_partition_i = m_vertices[in_node->vertex].partition;
4576 this_in_layout_i = m_partitions[in_partition_i].layout;
4578 if (this_in_layout_i > 0)
4579 entry.second = m_perms[this_in_layout_i][entry.second];
4581 if (out_layout_i > 0)
4582 vect_slp_permute (m_perms[out_layout_i], perm, true);
4585 /* Check whether the target allows NODE to be rearranged so that the node's
4586 output has layout OUT_LAYOUT_I. Return the cost of the change if so,
4587 in the same arbitrary units as for change_layout_cost. Return -1 otherwise.
4589 If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I < 0, also check whether
4590 NODE can adapt to the layout changes that have (perhaps provisionally)
4591 been chosen for NODE's children, so that no extra permutations are
4592 needed on either the input or the output of NODE.
4594 If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I >= 0, instead assume
4595 that all inputs will be forced into layout IN_LAYOUT_I beforehand.
4597 IN_LAYOUT_I has no meaning for other types of node.
4599 Keeping the node as-is is always valid. If the target doesn't appear
4600 to support the node as-is, but might realistically support other layouts,
4601 then layout 0 instead has the cost of a worst-case permutation. On the
4602 one hand, this ensures that every node has at least one valid layout,
4603 avoiding what would otherwise be an awkward special case. On the other,
4604 it still encourages the pass to change an invalid pre-existing layout
4605 choice into a valid one. */
4608 vect_optimize_slp_pass::internal_node_cost (slp_tree node, int in_layout_i,
4609 unsigned int out_layout_i)
4611 const int fallback_cost = 1;
4613 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
4615 auto_lane_permutation_t tmp_perm;
4616 tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
4618 /* Check that the child nodes support the chosen layout. Checking
4619 the first child is enough, since any second child would have the
4620 same shape. */
4621 auto first_child = SLP_TREE_CHILDREN (node)[0];
4622 if (in_layout_i > 0
4623 && !is_compatible_layout (first_child, in_layout_i))
4624 return -1;
4626 change_vec_perm_layout (node, tmp_perm, in_layout_i, out_layout_i);
4627 int count = vectorizable_slp_permutation_1 (m_vinfo, nullptr,
4628 node, tmp_perm,
4629 SLP_TREE_CHILDREN (node),
4630 false);
4631 if (count < 0)
4633 if (in_layout_i == 0 && out_layout_i == 0)
4635 /* Use the fallback cost if the node could in principle support
4636 some nonzero layout for both the inputs and the outputs.
4637 Otherwise assume that the node will be rejected later
4638 and rebuilt from scalars. */
4639 if (SLP_TREE_LANES (node) == SLP_TREE_LANES (first_child))
4640 return fallback_cost;
4641 return 0;
4643 return -1;
4646 /* We currently have no way of telling whether the new layout is cheaper
4647 or more expensive than the old one. But at least in principle,
4648 it should be worth making zero permutations (whole-vector shuffles)
4649 cheaper than real permutations, in case the pass is able to remove
4650 the latter. */
4651 return count == 0 ? 0 : 1;
4654 stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
4655 if (rep
4656 && STMT_VINFO_DATA_REF (rep)
4657 && DR_IS_READ (STMT_VINFO_DATA_REF (rep))
4658 && SLP_TREE_LOAD_PERMUTATION (node).exists ())
4660 auto_load_permutation_t tmp_perm;
4661 tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
4662 if (out_layout_i > 0)
4663 vect_slp_permute (m_perms[out_layout_i], tmp_perm, true);
4665 poly_uint64 vf = 1;
4666 if (auto loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo))
4667 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
4668 unsigned int n_perms;
4669 if (!vect_transform_slp_perm_load_1 (m_vinfo, node, tmp_perm, vNULL,
4670 nullptr, vf, true, false, &n_perms))
4672 auto rep = SLP_TREE_REPRESENTATIVE (node);
4673 if (out_layout_i == 0)
4675 /* Use the fallback cost if the load is an N-to-N permutation.
4676 Otherwise assume that the node will be rejected later
4677 and rebuilt from scalars. */
4678 if (STMT_VINFO_GROUPED_ACCESS (rep)
4679 && (DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (rep))
4680 == SLP_TREE_LANES (node)))
4681 return fallback_cost;
4682 return 0;
4684 return -1;
4687 /* See the comment above the corresponding VEC_PERM_EXPR handling. */
4688 return n_perms == 0 ? 0 : 1;
4691 return 0;
4694 /* Decide which element layouts we should consider using. Calculate the
4695 weights associated with inserting layout changes on partition edges.
4696 Also mark partitions that cannot change layout, by setting their
4697 layout to zero. */
4699 void
4700 vect_optimize_slp_pass::start_choosing_layouts ()
4702 /* Used to assign unique permutation indices. */
4703 using perm_hash = unbounded_hashmap_traits<
4704 vec_free_hash_base<int_hash_base<unsigned>>,
4705 int_hash<int, -1, -2>
4707 hash_map<vec<unsigned>, int, perm_hash> layout_ids;
4709 /* Layout 0 is "no change". */
4710 m_perms.safe_push (vNULL);
4712 /* Create layouts from existing permutations. */
4713 auto_load_permutation_t tmp_perm;
4714 for (unsigned int node_i : m_partitioned_nodes)
4716 /* Leaves also double as entries to the reverse graph. Allow the
4717 layout of those to be changed. */
4718 auto &vertex = m_vertices[node_i];
4719 auto &partition = m_partitions[vertex.partition];
4720 if (!m_slpg->vertices[node_i].succ)
4721 partition.layout = 0;
4723 /* Loads and VEC_PERM_EXPRs are the only things generating permutes. */
4724 slp_tree node = vertex.node;
4725 stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node);
4726 slp_tree child;
4727 unsigned HOST_WIDE_INT imin, imax = 0;
4728 bool any_permute = false;
4729 tmp_perm.truncate (0);
4730 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
4732 /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the node
4733 unpermuted, record a layout that reverses this permutation.
4735 We would need more work to cope with loads that are internally
4736 permuted and also have inputs (such as masks for
4737 IFN_MASK_LOADs). */
4738 gcc_assert (partition.layout == 0 && !m_slpg->vertices[node_i].succ);
4739 if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
4741 partition.layout = -1;
4742 continue;
4744 dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt);
4745 imin = DR_GROUP_SIZE (dr_stmt) + 1;
4746 tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
4748 else if (SLP_TREE_CODE (node) == VEC_PERM_EXPR
4749 && SLP_TREE_CHILDREN (node).length () == 1
4750 && (child = SLP_TREE_CHILDREN (node)[0])
4751 && (TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (child))
4752 .is_constant (&imin)))
4754 /* If the child has the same vector size as this node,
4755 reversing the permutation can make the permutation a no-op.
4756 In other cases it can change a true permutation into a
4757 full-vector extract. */
4758 tmp_perm.reserve (SLP_TREE_LANES (node));
4759 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4760 tmp_perm.quick_push (SLP_TREE_LANE_PERMUTATION (node)[j].second);
4762 else
4763 continue;
4765 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4767 unsigned idx = tmp_perm[j];
4768 imin = MIN (imin, idx);
4769 imax = MAX (imax, idx);
4770 if (idx - tmp_perm[0] != j)
4771 any_permute = true;
4773 /* If the span doesn't match we'd disrupt VF computation; avoid
4774 that for now. */
4775 if (imax - imin + 1 != SLP_TREE_LANES (node))
4776 continue;
4777 /* If there's no permute no need to split one out. In this case
4778 we can consider turning a load into a permuted load, if that
4779 turns out to be cheaper than alternatives. */
4780 if (!any_permute)
4782 partition.layout = -1;
4783 continue;
4786 /* For now only handle true permutes, like
4787 vect_attempt_slp_rearrange_stmts did. This allows us to be lazy
4788 when permuting constants and invariants, keeping the permute
4789 bijective. */
4790 auto_sbitmap load_index (SLP_TREE_LANES (node));
4791 bitmap_clear (load_index);
4792 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4793 bitmap_set_bit (load_index, tmp_perm[j] - imin);
4794 unsigned j;
4795 for (j = 0; j < SLP_TREE_LANES (node); ++j)
4796 if (!bitmap_bit_p (load_index, j))
4797 break;
4798 if (j != SLP_TREE_LANES (node))
4799 continue;
4801 vec<unsigned> perm = vNULL;
4802 perm.safe_grow (SLP_TREE_LANES (node), true);
4803 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4804 perm[j] = tmp_perm[j] - imin;
4806 if (int (m_perms.length ()) >= param_vect_max_layout_candidates)
4808 /* Continue to use existing layouts, but don't add any more. */
4809 int *entry = layout_ids.get (perm);
4810 partition.layout = entry ? *entry : 0;
4811 perm.release ();
4813 else
4815 bool existed;
4816 int &layout_i = layout_ids.get_or_insert (perm, &existed);
4817 if (existed)
4818 perm.release ();
4819 else
4821 layout_i = m_perms.length ();
4822 m_perms.safe_push (perm);
4824 partition.layout = layout_i;
4828 /* Initially assume that every layout is possible and has zero cost
4829 in every partition. */
4830 m_partition_layout_costs.safe_grow_cleared (m_partitions.length ()
4831 * m_perms.length ());
4833 /* We have to mark outgoing permutations facing non-associating-reduction
4834 graph entries that are not represented as to be materialized.
4835 slp_inst_kind_bb_reduc currently only covers associatable reductions. */
4836 for (slp_instance instance : m_vinfo->slp_instances)
4837 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor)
4839 unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex;
4840 m_partitions[m_vertices[node_i].partition].layout = 0;
4842 else if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_reduc_chain)
4844 stmt_vec_info stmt_info
4845 = SLP_TREE_REPRESENTATIVE (SLP_INSTANCE_TREE (instance));
4846 stmt_vec_info reduc_info = info_for_reduction (m_vinfo, stmt_info);
4847 if (needs_fold_left_reduction_p (TREE_TYPE
4848 (gimple_get_lhs (stmt_info->stmt)),
4849 STMT_VINFO_REDUC_CODE (reduc_info)))
4851 unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex;
4852 m_partitions[m_vertices[node_i].partition].layout = 0;
4856 /* Check which layouts each node and partition can handle. Calculate the
4857 weights associated with inserting layout changes on edges. */
4858 for (unsigned int node_i : m_partitioned_nodes)
4860 auto &vertex = m_vertices[node_i];
4861 auto &partition = m_partitions[vertex.partition];
4862 slp_tree node = vertex.node;
4864 if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
4866 vertex.weight = vect_slp_node_weight (node);
4868 /* We do not handle stores with a permutation, so all
4869 incoming permutations must have been materialized.
4871 We also don't handle masked grouped loads, which lack a
4872 permutation vector. In this case the memory locations
4873 form an implicit second input to the loads, on top of the
4874 explicit mask input, and the memory input's layout cannot
4875 be changed.
4877 On the other hand, we do support permuting gather loads and
4878 masked gather loads, where each scalar load is independent
4879 of the others. This can be useful if the address/index input
4880 benefits from permutation. */
4881 if (STMT_VINFO_DATA_REF (rep)
4882 && STMT_VINFO_GROUPED_ACCESS (rep)
4883 && !SLP_TREE_LOAD_PERMUTATION (node).exists ())
4884 partition.layout = 0;
4886 /* We cannot change the layout of an operation that is
4887 not independent of its lanes. Note this is an explicit
4888 negative list since that's much shorter than the respective
4889 positive one, but it's critical to keep maintaining it. */
4890 if (is_gimple_call (STMT_VINFO_STMT (rep)))
4891 switch (gimple_call_combined_fn (STMT_VINFO_STMT (rep)))
4893 case CFN_COMPLEX_ADD_ROT90:
4894 case CFN_COMPLEX_ADD_ROT270:
4895 case CFN_COMPLEX_MUL:
4896 case CFN_COMPLEX_MUL_CONJ:
4897 case CFN_VEC_ADDSUB:
4898 case CFN_VEC_FMADDSUB:
4899 case CFN_VEC_FMSUBADD:
4900 partition.layout = 0;
4901 default:;
4905 auto process_edge = [&](graph_edge *ud, unsigned int other_node_i)
4907 auto &other_vertex = m_vertices[other_node_i];
4909 /* Count the number of edges from earlier partitions and the number
4910 of edges to later partitions. */
4911 if (other_vertex.partition < vertex.partition)
4912 partition.in_degree += 1;
4913 else
4914 partition.out_degree += 1;
4916 /* If the current node uses the result of OTHER_NODE_I, accumulate
4917 the effects of that. */
4918 if (ud->src == int (node_i))
4920 other_vertex.out_weight += vertex.weight;
4921 other_vertex.out_degree += 1;
4924 for_each_partition_edge (node_i, process_edge);
4928 /* Return the incoming costs for node NODE_I, assuming that each input keeps
4929 its current (provisional) choice of layout. The inputs do not necessarily
4930 have the same layout as each other. */
4932 slpg_layout_cost
4933 vect_optimize_slp_pass::total_in_cost (unsigned int node_i)
4935 auto &vertex = m_vertices[node_i];
4936 slpg_layout_cost cost;
4937 auto add_cost = [&](graph_edge *, unsigned int other_node_i)
4939 auto &other_vertex = m_vertices[other_node_i];
4940 if (other_vertex.partition < vertex.partition)
4942 auto &other_partition = m_partitions[other_vertex.partition];
4943 auto &other_costs = partition_layout_costs (other_vertex.partition,
4944 other_partition.layout);
4945 slpg_layout_cost this_cost = other_costs.in_cost;
4946 this_cost.add_serial_cost (other_costs.internal_cost);
4947 this_cost.split (other_partition.out_degree);
4948 cost.add_parallel_cost (this_cost);
4951 for_each_partition_edge (node_i, add_cost);
4952 return cost;
4955 /* Return the cost of switching between layout LAYOUT1_I (at node NODE1_I)
4956 and layout LAYOUT2_I on cross-partition use-to-def edge UD. Return
4957 slpg_layout_cost::impossible () if the change isn't possible. */
4959 slpg_layout_cost
4960 vect_optimize_slp_pass::
4961 edge_layout_cost (graph_edge *ud, unsigned int node1_i, unsigned int layout1_i,
4962 unsigned int layout2_i)
4964 auto &def_vertex = m_vertices[ud->dest];
4965 auto &use_vertex = m_vertices[ud->src];
4966 auto def_layout_i = ud->dest == int (node1_i) ? layout1_i : layout2_i;
4967 auto use_layout_i = ud->dest == int (node1_i) ? layout2_i : layout1_i;
4968 auto factor = change_layout_cost (def_vertex.node, def_layout_i,
4969 use_layout_i);
4970 if (factor < 0)
4971 return slpg_layout_cost::impossible ();
4973 /* We have a choice of putting the layout change at the site of the
4974 definition or at the site of the use. Prefer the former when
4975 optimizing for size or when the execution frequency of the
4976 definition is no greater than the combined execution frequencies of
4977 the uses. When putting the layout change at the site of the definition,
4978 divvy up the cost among all consumers. */
4979 if (m_optimize_size || def_vertex.weight <= def_vertex.out_weight)
4981 slpg_layout_cost cost = { def_vertex.weight * factor, m_optimize_size };
4982 cost.split (def_vertex.out_degree);
4983 return cost;
4985 return { use_vertex.weight * factor, m_optimize_size };
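/* For example (illustrative weights, FACTOR 1, optimizing for speed):
   if the definition has weight 10 (inside a loop) and its uses have a
   combined out_weight of 2 (outside the loop), the change is costed at the
   use, giving { depth 1, total 1 } for a use of weight 1.  If instead the
   definition has weight 1, out_weight 10 and out_degree 2, the change is
   costed at the definition and shared between the two consumers, giving
   { depth 1, total 0.5 }.  */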
4988 /* UD represents a use-def link between FROM_NODE_I and a node in a later
4989 partition; FROM_NODE_I could be the definition node or the use node.
4990 The node at the other end of the link wants to use layout TO_LAYOUT_I.
4991 Return the cost of any necessary fix-ups on edge UD, or return
4992 slpg_layout_cost::impossible () if the change isn't possible.
4994 At this point, FROM_NODE_I's partition has chosen the cheapest
4995 layout based on the information available so far, but this choice
4996 is only provisional. */
4998 slpg_layout_cost
4999 vect_optimize_slp_pass::forward_cost (graph_edge *ud, unsigned int from_node_i,
5000 unsigned int to_layout_i)
5002 auto &from_vertex = m_vertices[from_node_i];
5003 unsigned int from_partition_i = from_vertex.partition;
5004 slpg_partition_info &from_partition = m_partitions[from_partition_i];
5005 gcc_assert (from_partition.layout >= 0);
5007 /* First calculate the cost on the assumption that FROM_PARTITION sticks
5008 with its current layout preference. */
5009 slpg_layout_cost cost = slpg_layout_cost::impossible ();
5010 auto edge_cost = edge_layout_cost (ud, from_node_i,
5011 from_partition.layout, to_layout_i);
5012 if (edge_cost.is_possible ())
5014 auto &from_costs = partition_layout_costs (from_partition_i,
5015 from_partition.layout);
5016 cost = from_costs.in_cost;
5017 cost.add_serial_cost (from_costs.internal_cost);
5018 cost.split (from_partition.out_degree);
5019 cost.add_serial_cost (edge_cost);
5022 /* Take the minimum of that cost and the cost that applies if
5023 FROM_PARTITION instead switches to TO_LAYOUT_I. */
5024 auto &direct_layout_costs = partition_layout_costs (from_partition_i,
5025 to_layout_i);
5026 if (direct_layout_costs.is_possible ())
5028 slpg_layout_cost direct_cost = direct_layout_costs.in_cost;
5029 direct_cost.add_serial_cost (direct_layout_costs.internal_cost);
5030 direct_cost.split (from_partition.out_degree);
5031 if (!cost.is_possible ()
5032 || direct_cost.is_better_than (cost, m_optimize_size))
5033 cost = direct_cost;
5036 return cost;
5039 /* UD represents a use-def link between TO_NODE_I and a node in an earlier
5040 partition; TO_NODE_I could be the definition node or the use node.
5041 The node at the other end of the link wants to use layout FROM_LAYOUT_I;
5042 return the cost of any necessary fix-ups on edge UD, or
5043 slpg_layout_cost::impossible () if the choice cannot be made.
5045 At this point, TO_NODE_I's partition has a fixed choice of layout. */
5047 slpg_layout_cost
5048 vect_optimize_slp_pass::backward_cost (graph_edge *ud, unsigned int to_node_i,
5049 unsigned int from_layout_i)
5051 auto &to_vertex = m_vertices[to_node_i];
5052 unsigned int to_partition_i = to_vertex.partition;
5053 slpg_partition_info &to_partition = m_partitions[to_partition_i];
5054 gcc_assert (to_partition.layout >= 0);
5056 /* If TO_NODE_I is a VEC_PERM_EXPR consumer, see whether it can be
5057 adjusted for this input having layout FROM_LAYOUT_I. Assume that
5058 any other inputs keep their current choice of layout. */
5059 auto &to_costs = partition_layout_costs (to_partition_i,
5060 to_partition.layout);
5061 if (ud->src == int (to_node_i)
5062 && SLP_TREE_CODE (to_vertex.node) == VEC_PERM_EXPR)
5064 auto &from_partition = m_partitions[m_vertices[ud->dest].partition];
5065 auto old_layout = from_partition.layout;
5066 from_partition.layout = from_layout_i;
5067 int factor = internal_node_cost (to_vertex.node, -1,
5068 to_partition.layout);
5069 from_partition.layout = old_layout;
5070 if (factor >= 0)
5072 slpg_layout_cost cost = to_costs.out_cost;
5073 cost.add_serial_cost ({ to_vertex.weight * factor,
5074 m_optimize_size });
5075 cost.split (to_partition.in_degree);
5076 return cost;
5080 /* Compute the cost if we insert any necessary layout change on edge UD. */
5081 auto edge_cost = edge_layout_cost (ud, to_node_i,
5082 to_partition.layout, from_layout_i);
5083 if (edge_cost.is_possible ())
5085 slpg_layout_cost cost = to_costs.out_cost;
5086 cost.add_serial_cost (to_costs.internal_cost);
5087 cost.split (to_partition.in_degree);
5088 cost.add_serial_cost (edge_cost);
5089 return cost;
5092 return slpg_layout_cost::impossible ();
5095 /* Make a forward pass through the partitions, accumulating input costs.
5096 Make a tentative (provisional) choice of layout for each partition,
5097 ensuring that this choice still allows later partitions to keep
5098 their original layout. */
5100 void
5101 vect_optimize_slp_pass::forward_pass ()
5103 for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
5104 ++partition_i)
5106 auto &partition = m_partitions[partition_i];
5108 /* If the partition consists of a single VEC_PERM_EXPR, precompute
5109 the incoming cost that would apply if every predecessor partition
5110 keeps its current layout. This is used within the loop below. */
5111 slpg_layout_cost in_cost;
5112 slp_tree single_node = nullptr;
5113 if (partition.node_end == partition.node_begin + 1)
5115 unsigned int node_i = m_partitioned_nodes[partition.node_begin];
5116 single_node = m_vertices[node_i].node;
5117 if (SLP_TREE_CODE (single_node) == VEC_PERM_EXPR)
5118 in_cost = total_in_cost (node_i);
5121 /* Go through the possible layouts. Decide which ones are valid
5122 for this partition and record which of the valid layouts has
5123 the lowest cost. */
5124 unsigned int min_layout_i = 0;
5125 slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
5126 for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
5128 auto &layout_costs = partition_layout_costs (partition_i, layout_i);
5129 if (!layout_costs.is_possible ())
5130 continue;
5132 /* If the recorded layout is already 0 then the layout cannot
5133 change. */
5134 if (partition.layout == 0 && layout_i != 0)
5136 layout_costs.mark_impossible ();
5137 continue;
5140 bool is_possible = true;
5141 for (unsigned int order_i = partition.node_begin;
5142 order_i < partition.node_end; ++order_i)
5144 unsigned int node_i = m_partitioned_nodes[order_i];
5145 auto &vertex = m_vertices[node_i];
5147 /* Reject the layout if it is individually incompatible
5148 with any node in the partition. */
5149 if (!is_compatible_layout (vertex.node, layout_i))
5151 is_possible = false;
5152 break;
5155 auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
5157 auto &other_vertex = m_vertices[other_node_i];
5158 if (other_vertex.partition < vertex.partition)
5160 /* Accumulate the incoming costs from earlier
5161 partitions, plus the cost of any layout changes
5162 on UD itself. */
5163 auto cost = forward_cost (ud, other_node_i, layout_i);
5164 if (!cost.is_possible ())
5165 is_possible = false;
5166 else
5167 layout_costs.in_cost.add_parallel_cost (cost);
5169 else
5170 /* Reject the layout if it would make layout 0 impossible
5171 for later partitions. This amounts to testing that the
5172 target supports reversing the layout change on edges
5173 to later partitions.
5175 In principle, it might be possible to push a layout
5176 change all the way down a graph, so that it never
5177 needs to be reversed and so that the target doesn't
5178 need to support the reverse operation. But it would
5179 be awkward to bail out if we hit a partition that
5180 does not support the new layout, especially since
5181 we are not dealing with a lattice. */
5182 is_possible &= edge_layout_cost (ud, other_node_i, 0,
5183 layout_i).is_possible ();
5185 for_each_partition_edge (node_i, add_cost);
5187 /* Accumulate the cost of using LAYOUT_I within NODE,
5188 both for the inputs and the outputs. */
5189 int factor = internal_node_cost (vertex.node, layout_i,
5190 layout_i);
5191 if (factor < 0)
5193 is_possible = false;
5194 break;
5196 else if (factor)
5197 layout_costs.internal_cost.add_serial_cost
5198 ({ vertex.weight * factor, m_optimize_size });
5200 if (!is_possible)
5202 layout_costs.mark_impossible ();
5203 continue;
5206 /* Combine the incoming and partition-internal costs. */
5207 slpg_layout_cost combined_cost = layout_costs.in_cost;
5208 combined_cost.add_serial_cost (layout_costs.internal_cost);
5210 /* If this partition consists of a single VEC_PERM_EXPR, see
5211 if the VEC_PERM_EXPR can be changed to support output layout
5212 LAYOUT_I while keeping all the provisional choices of input
5213 layout. */
5214 if (single_node
5215 && SLP_TREE_CODE (single_node) == VEC_PERM_EXPR)
5217 int factor = internal_node_cost (single_node, -1, layout_i);
5218 if (factor >= 0)
5220 auto weight = m_vertices[single_node->vertex].weight;
5221 slpg_layout_cost internal_cost
5222 = { weight * factor, m_optimize_size };
5224 slpg_layout_cost alt_cost = in_cost;
5225 alt_cost.add_serial_cost (internal_cost);
5226 if (alt_cost.is_better_than (combined_cost, m_optimize_size))
5228 combined_cost = alt_cost;
5229 layout_costs.in_cost = in_cost;
5230 layout_costs.internal_cost = internal_cost;
5235 /* Record the layout with the lowest cost. Prefer layout 0 in
5236 the event of a tie between it and another layout. */
5237 if (!min_layout_cost.is_possible ()
5238 || combined_cost.is_better_than (min_layout_cost,
5239 m_optimize_size))
5241 min_layout_i = layout_i;
5242 min_layout_cost = combined_cost;
5246 /* This loop's handling of earlier partitions should ensure that
5247 choosing the original layout for the current partition is no
5248 less valid than it was in the original graph, even with the
5249 provisional layout choices for those earlier partitions. */
5250 gcc_assert (min_layout_cost.is_possible ());
5251 partition.layout = min_layout_i;
5255 /* Make a backward pass through the partitions, accumulating output costs.
5256 Make a final choice of layout for each partition. */
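/* Roughly: the forward pass above chose a provisional layout for each
   partition using only costs flowing in from earlier partitions; this
   pass adds the costs flowing back from later partitions and makes the
   final choice, while checking (via edge_layout_cost) that earlier
   partitions can keep the layouts they chose in the forward pass.  */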
5258 void
5259 vect_optimize_slp_pass::backward_pass ()
5261 for (unsigned int partition_i = m_partitions.length (); partition_i-- > 0;)
5263 auto &partition = m_partitions[partition_i];
5265 unsigned int min_layout_i = 0;
5266 slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
5267 for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
5269 auto &layout_costs = partition_layout_costs (partition_i, layout_i);
5270 if (!layout_costs.is_possible ())
5271 continue;
5273 /* Accumulate the costs from successor partitions. */
5274 bool is_possible = true;
5275 for (unsigned int order_i = partition.node_begin;
5276 order_i < partition.node_end; ++order_i)
5278 unsigned int node_i = m_partitioned_nodes[order_i];
5279 auto &vertex = m_vertices[node_i];
5280 auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
5282 auto &other_vertex = m_vertices[other_node_i];
5283 auto &other_partition = m_partitions[other_vertex.partition];
5284 if (other_vertex.partition > vertex.partition)
5286 /* Accumulate the incoming costs from later
5287 partitions, plus the cost of any layout changes
5288 on UD itself. */
5289 auto cost = backward_cost (ud, other_node_i, layout_i);
5290 if (!cost.is_possible ())
5291 is_possible = false;
5292 else
5293 layout_costs.out_cost.add_parallel_cost (cost);
5295 else
5296 /* Make sure that earlier partitions can (if necessary
5297 or beneficial) keep the layout that they chose in
5298 the forward pass. This ensures that there is at
5299 least one valid choice of layout. */
5300 is_possible &= edge_layout_cost (ud, other_node_i,
5301 other_partition.layout,
5302 layout_i).is_possible ();
5304 for_each_partition_edge (node_i, add_cost);
5306 if (!is_possible)
5308 layout_costs.mark_impossible ();
5309 continue;
5312 /* Locally combine the costs from the forward and backward passes.
5313 (This combined cost is not passed on, since that would lead
5314 to double counting.) */
5315 slpg_layout_cost combined_cost = layout_costs.in_cost;
5316 combined_cost.add_serial_cost (layout_costs.internal_cost);
5317 combined_cost.add_serial_cost (layout_costs.out_cost);
5319 /* Record the layout with the lowest cost. Prefer layout 0 in
5320 the event of a tie between it and another layout. */
5321 if (!min_layout_cost.is_possible ()
5322 || combined_cost.is_better_than (min_layout_cost,
5323 m_optimize_size))
5325 min_layout_i = layout_i;
5326 min_layout_cost = combined_cost;
5330 gcc_assert (min_layout_cost.is_possible ());
5331 partition.layout = min_layout_i;
5335 /* Return a node that applies layout TO_LAYOUT_I to the original form of NODE.
5336 NODE already has the layout that was selected for its partition. */
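/* Results are cached in m_node_layouts, indexed by
   node->vertex * m_perms.length () + to_layout_i, so each (node, layout)
   pair is materialized at most once.  */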
5338 slp_tree
5339 vect_optimize_slp_pass::get_result_with_layout (slp_tree node,
5340 unsigned int to_layout_i)
5342 unsigned int result_i = node->vertex * m_perms.length () + to_layout_i;
5343 slp_tree result = m_node_layouts[result_i];
5344 if (result)
5345 return result;
5347 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
5348 || (SLP_TREE_DEF_TYPE (node) == vect_external_def
5349 /* We can't permute vector defs in place. */
5350 && SLP_TREE_VEC_DEFS (node).is_empty ()))
5352 /* If the vector is uniform or unchanged, there's nothing to do. */
5353 if (to_layout_i == 0 || vect_slp_tree_uniform_p (node))
5354 result = node;
5355 else
5357 auto scalar_ops = SLP_TREE_SCALAR_OPS (node).copy ();
5358 result = vect_create_new_slp_node (scalar_ops);
5359 vect_slp_permute (m_perms[to_layout_i], scalar_ops, true);
5362 else
5364 unsigned int partition_i = m_vertices[node->vertex].partition;
5365 unsigned int from_layout_i = m_partitions[partition_i].layout;
5366 if (from_layout_i == to_layout_i)
5367 return node;
5369 /* If NODE is itself a VEC_PERM_EXPR, try to create a parallel
5370 permutation instead of a serial one. Leave the new permutation
5371 in TMP_PERM on success. */
5372 auto_lane_permutation_t tmp_perm;
5373 unsigned int num_inputs = 1;
5374 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5376 tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
5377 if (from_layout_i != 0)
5378 vect_slp_permute (m_perms[from_layout_i], tmp_perm, false);
5379 if (to_layout_i != 0)
5380 vect_slp_permute (m_perms[to_layout_i], tmp_perm, true);
5381 if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
5382 tmp_perm,
5383 SLP_TREE_CHILDREN (node),
5384 false) >= 0)
5385 num_inputs = SLP_TREE_CHILDREN (node).length ();
5386 else
5387 tmp_perm.truncate (0);
5390 if (dump_enabled_p ())
5392 if (tmp_perm.length () > 0)
5393 dump_printf_loc (MSG_NOTE, vect_location,
5394 "duplicating permutation node %p with"
5395 " layout %d\n",
5396 (void *) node, to_layout_i);
5397 else
5398 dump_printf_loc (MSG_NOTE, vect_location,
5399 "inserting permutation node in place of %p\n",
5400 (void *) node);
5403 unsigned int num_lanes = SLP_TREE_LANES (node);
5404 result = vect_create_new_slp_node (num_inputs, VEC_PERM_EXPR);
5405 if (SLP_TREE_SCALAR_STMTS (node).length ())
5407 auto &stmts = SLP_TREE_SCALAR_STMTS (result);
5408 stmts.safe_splice (SLP_TREE_SCALAR_STMTS (node));
5409 if (from_layout_i != 0)
5410 vect_slp_permute (m_perms[from_layout_i], stmts, false);
5411 if (to_layout_i != 0)
5412 vect_slp_permute (m_perms[to_layout_i], stmts, true);
5414 SLP_TREE_REPRESENTATIVE (result) = SLP_TREE_REPRESENTATIVE (node);
5415 SLP_TREE_LANES (result) = num_lanes;
5416 SLP_TREE_VECTYPE (result) = SLP_TREE_VECTYPE (node);
5417 result->vertex = -1;
5419 auto &lane_perm = SLP_TREE_LANE_PERMUTATION (result);
5420 if (tmp_perm.length ())
5422 lane_perm.safe_splice (tmp_perm);
5423 SLP_TREE_CHILDREN (result).safe_splice (SLP_TREE_CHILDREN (node));
5425 else
5427 lane_perm.create (num_lanes);
5428 for (unsigned j = 0; j < num_lanes; ++j)
5429 lane_perm.quick_push ({ 0, j });
5430 if (from_layout_i != 0)
5431 vect_slp_permute (m_perms[from_layout_i], lane_perm, false);
5432 if (to_layout_i != 0)
5433 vect_slp_permute (m_perms[to_layout_i], lane_perm, true);
5434 SLP_TREE_CHILDREN (result).safe_push (node);
5436 for (slp_tree child : SLP_TREE_CHILDREN (result))
5437 child->refcnt++;
5439 m_node_layouts[result_i] = result;
5440 return result;
5443 /* Apply the chosen vector layouts to the SLP graph. */
5445 void
5446 vect_optimize_slp_pass::materialize ()
5448 /* We no longer need the costs, so avoid having two O(N * P) arrays
5449 live at the same time. */
5450 m_partition_layout_costs.release ();
5451 m_node_layouts.safe_grow_cleared (m_vertices.length () * m_perms.length ());
5453 auto_sbitmap fully_folded (m_vertices.length ());
5454 bitmap_clear (fully_folded);
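/* FULLY_FOLDED collects the VEC_PERM nodes below whose lane permutation
   absorbed the input layouts; the second loop further down skips them so
   that their children are not replaced with re-laid-out copies.  */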
5455 for (unsigned int node_i : m_partitioned_nodes)
5457 auto &vertex = m_vertices[node_i];
5458 slp_tree node = vertex.node;
5459 int layout_i = m_partitions[vertex.partition].layout;
5460 gcc_assert (layout_i >= 0);
5462 /* Rearrange the scalar statements to match the chosen layout. */
5463 if (layout_i > 0)
5464 vect_slp_permute (m_perms[layout_i],
5465 SLP_TREE_SCALAR_STMTS (node), true);
5467 /* Update load and lane permutations. */
5468 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5470 /* First try to absorb the input vector layouts. If that fails,
5471 force the inputs to have layout LAYOUT_I too. We checked that
5472 that was possible before deciding to use nonzero output layouts.
5473 (Note that at this stage we don't really have any guarantee that
5474 the target supports the original VEC_PERM_EXPR.) */
5475 auto &perm = SLP_TREE_LANE_PERMUTATION (node);
5476 auto_lane_permutation_t tmp_perm;
5477 tmp_perm.safe_splice (perm);
5478 change_vec_perm_layout (node, tmp_perm, -1, layout_i);
5479 if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
5480 tmp_perm,
5481 SLP_TREE_CHILDREN (node),
5482 false) >= 0)
5484 if (dump_enabled_p ()
5485 && !std::equal (tmp_perm.begin (), tmp_perm.end (),
5486 perm.begin ()))
5487 dump_printf_loc (MSG_NOTE, vect_location,
5488 "absorbing input layouts into %p\n",
5489 (void *) node);
5490 std::copy (tmp_perm.begin (), tmp_perm.end (), perm.begin ());
5491 bitmap_set_bit (fully_folded, node_i);
5493 else
5495 /* Not MSG_MISSED because it would make no sense to users. */
5496 if (dump_enabled_p ())
5497 dump_printf_loc (MSG_NOTE, vect_location,
5498 "failed to absorb input layouts into %p\n",
5499 (void *) node);
5500 change_vec_perm_layout (nullptr, perm, layout_i, layout_i);
5503 else
5505 gcc_assert (!SLP_TREE_LANE_PERMUTATION (node).exists ());
5506 auto &load_perm = SLP_TREE_LOAD_PERMUTATION (node);
5507 if (layout_i > 0)
5508 /* ??? When we handle non-bijective permutes the idea
5509 is that we can force the load-permutation to be
5510 { min, min + 1, min + 2, ... max }. But then the
5511 scalar defs might no longer match the lane content,
5512 which means wrong-code with live lane vectorization.
5513 So we may need NULL entries for those. */
5514 vect_slp_permute (m_perms[layout_i], load_perm, true);
5518 /* Do this before any nodes disappear, since it involves a walk
5519 over the leaves. */
5520 remove_redundant_permutations ();
5522 /* Replace each child with a correctly laid-out version. */
5523 for (unsigned int node_i : m_partitioned_nodes)
5525 /* Skip nodes that have already been handled above. */
5526 if (bitmap_bit_p (fully_folded, node_i))
5527 continue;
5529 auto &vertex = m_vertices[node_i];
5530 int in_layout_i = m_partitions[vertex.partition].layout;
5531 gcc_assert (in_layout_i >= 0);
5533 unsigned j;
5534 slp_tree child;
5535 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (vertex.node), j, child)
5537 if (!child)
5538 continue;
5540 slp_tree new_child = get_result_with_layout (child, in_layout_i);
5541 if (new_child != child)
5543 vect_free_slp_tree (child);
5544 SLP_TREE_CHILDREN (vertex.node)[j] = new_child;
5545 new_child->refcnt += 1;
5551 /* Elide load permutations that are not necessary. Such permutations might
5552 be pre-existing, rather than created by the layout optimizations. */
5554 void
5555 vect_optimize_slp_pass::remove_redundant_permutations ()
5557 for (unsigned int node_i : m_leafs)
5559 slp_tree node = m_vertices[node_i].node;
5560 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
5561 continue;
5563 /* In basic block vectorization we allow any subchain of an interleaving
5564 chain.
5565 FORNOW: not in loop SLP because of realignment complications. */
5566 if (is_a <bb_vec_info> (m_vinfo))
5568 bool subchain_p = true;
5569 stmt_vec_info next_load_info = NULL;
5570 stmt_vec_info load_info;
5571 unsigned j;
5572 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
5574 if (j != 0
5575 && (next_load_info != load_info
5576 || DR_GROUP_GAP (load_info) != 1))
5578 subchain_p = false;
5579 break;
5581 next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
5583 if (subchain_p)
5585 SLP_TREE_LOAD_PERMUTATION (node).release ();
5586 continue;
5589 else
5591 loop_vec_info loop_vinfo = as_a<loop_vec_info> (m_vinfo);
5592 stmt_vec_info load_info;
5593 bool this_load_permuted = false;
5594 unsigned j;
5595 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
5596 if (SLP_TREE_LOAD_PERMUTATION (node)[j] != j)
5598 this_load_permuted = true;
5599 break;
5601 /* When this isn't a grouped access we know it's a single element
5602 and contiguous. */
5603 if (!STMT_VINFO_GROUPED_ACCESS (SLP_TREE_SCALAR_STMTS (node)[0]))
5605 if (!this_load_permuted
5606 && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
5607 || SLP_TREE_LANES (node) == 1))
5608 SLP_TREE_LOAD_PERMUTATION (node).release ();
5609 continue;
5611 stmt_vec_info first_stmt_info
5612 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
5613 if (!this_load_permuted
5614 /* The load requires permutation when unrolling exposes
5615 a gap either because the group is larger than the SLP
5616 group-size or because there is a gap between the groups. */
5617 && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
5618 || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
5619 && DR_GROUP_GAP (first_stmt_info) == 0)))
5621 SLP_TREE_LOAD_PERMUTATION (node).release ();
5622 continue;
5628 /* Print the partition graph and layout information to the dump file. */
5630 void
5631 vect_optimize_slp_pass::dump ()
5633 dump_printf_loc (MSG_NOTE, vect_location,
5634 "SLP optimize permutations:\n");
5635 for (unsigned int layout_i = 1; layout_i < m_perms.length (); ++layout_i)
5637 dump_printf_loc (MSG_NOTE, vect_location, " %d: { ", layout_i);
5638 const char *sep = "";
5639 for (unsigned int idx : m_perms[layout_i])
5641 dump_printf (MSG_NOTE, "%s%d", sep, idx);
5642 sep = ", ";
5644 dump_printf (MSG_NOTE, " }\n");
5646 dump_printf_loc (MSG_NOTE, vect_location,
5647 "SLP optimize partitions:\n");
5648 for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
5649 ++partition_i)
5651 auto &partition = m_partitions[partition_i];
5652 dump_printf_loc (MSG_NOTE, vect_location, " -------------\n");
5653 dump_printf_loc (MSG_NOTE, vect_location,
5654 " partition %d (layout %d):\n",
5655 partition_i, partition.layout);
5656 dump_printf_loc (MSG_NOTE, vect_location, " nodes:\n");
5657 for (unsigned int order_i = partition.node_begin;
5658 order_i < partition.node_end; ++order_i)
5660 auto &vertex = m_vertices[m_partitioned_nodes[order_i]];
5661 dump_printf_loc (MSG_NOTE, vect_location, " - %p:\n",
5662 (void *) vertex.node);
5663 dump_printf_loc (MSG_NOTE, vect_location,
5664 " weight: %f\n",
5665 vertex.weight.to_double ());
5666 if (vertex.out_degree)
5667 dump_printf_loc (MSG_NOTE, vect_location,
5668 " out weight: %f (degree %d)\n",
5669 vertex.out_weight.to_double (),
5670 vertex.out_degree);
5671 if (SLP_TREE_CODE (vertex.node) == VEC_PERM_EXPR)
5672 dump_printf_loc (MSG_NOTE, vect_location,
5673 " op: VEC_PERM_EXPR\n");
5674 else if (auto rep = SLP_TREE_REPRESENTATIVE (vertex.node))
5675 dump_printf_loc (MSG_NOTE, vect_location,
5676 " op template: %G", rep->stmt);
5678 dump_printf_loc (MSG_NOTE, vect_location, " edges:\n");
5679 for (unsigned int order_i = partition.node_begin;
5680 order_i < partition.node_end; ++order_i)
5682 unsigned int node_i = m_partitioned_nodes[order_i];
5683 auto &vertex = m_vertices[node_i];
5684 auto print_edge = [&](graph_edge *, unsigned int other_node_i)
5686 auto &other_vertex = m_vertices[other_node_i];
5687 if (other_vertex.partition < vertex.partition)
5688 dump_printf_loc (MSG_NOTE, vect_location,
5689 " - %p [%d] --> %p\n",
5690 (void *) other_vertex.node,
5691 other_vertex.partition,
5692 (void *) vertex.node);
5693 else
5694 dump_printf_loc (MSG_NOTE, vect_location,
5695 " - %p --> [%d] %p\n",
5696 (void *) vertex.node,
5697 other_vertex.partition,
5698 (void *) other_vertex.node);
5700 for_each_partition_edge (node_i, print_edge);
5703 for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
5705 auto &layout_costs = partition_layout_costs (partition_i, layout_i);
5706 if (layout_costs.is_possible ())
5708 dump_printf_loc (MSG_NOTE, vect_location,
5709 " layout %d:%s\n", layout_i,
5710 partition.layout == int (layout_i)
5711 ? " (*)" : "");
5712 slpg_layout_cost combined_cost = layout_costs.in_cost;
5713 combined_cost.add_serial_cost (layout_costs.internal_cost);
5714 combined_cost.add_serial_cost (layout_costs.out_cost);
5715 #define TEMPLATE "{depth: %f, total: %f}"
5716 dump_printf_loc (MSG_NOTE, vect_location,
5717 " " TEMPLATE "\n",
5718 layout_costs.in_cost.depth.to_double (),
5719 layout_costs.in_cost.total.to_double ());
5720 dump_printf_loc (MSG_NOTE, vect_location,
5721 " + " TEMPLATE "\n",
5722 layout_costs.internal_cost.depth.to_double (),
5723 layout_costs.internal_cost.total.to_double ());
5724 dump_printf_loc (MSG_NOTE, vect_location,
5725 " + " TEMPLATE "\n",
5726 layout_costs.out_cost.depth.to_double (),
5727 layout_costs.out_cost.total.to_double ());
5728 dump_printf_loc (MSG_NOTE, vect_location,
5729 " = " TEMPLATE "\n",
5730 combined_cost.depth.to_double (),
5731 combined_cost.total.to_double ());
5732 #undef TEMPLATE
5734 else
5735 dump_printf_loc (MSG_NOTE, vect_location,
5736 " layout %d: rejected\n", layout_i);
5741 /* Main entry point for the SLP graph optimization pass. */
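/* In outline: build_graph and create_partitions set up the SLP graph and
   its partitions, start_choosing_layouts collects the candidate layouts,
   and only when more than one layout exists do we run the forward and
   backward passes and materialize the result; otherwise we just remove
   redundant load permutations.  */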
5743 void
5744 vect_optimize_slp_pass::run ()
5746 build_graph ();
5747 create_partitions ();
5748 start_choosing_layouts ();
5749 if (m_perms.length () > 1)
5751 forward_pass ();
5752 backward_pass ();
5753 if (dump_enabled_p ())
5754 dump ();
5755 materialize ();
5756 while (!m_perms.is_empty ())
5757 m_perms.pop ().release ();
5759 else
5760 remove_redundant_permutations ();
5761 free_graph (m_slpg);
5764 /* Optimize the SLP graph of VINFO. */
5766 void
5767 vect_optimize_slp (vec_info *vinfo)
5769 if (vinfo->slp_instances.is_empty ())
5770 return;
5771 vect_optimize_slp_pass (vinfo).run ();
5774 /* Gather loads reachable from the individual SLP graph entries. */
5776 void
5777 vect_gather_slp_loads (vec_info *vinfo)
5779 unsigned i;
5780 slp_instance instance;
5781 FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
5783 hash_set<slp_tree> visited;
5784 vect_gather_slp_loads (SLP_INSTANCE_LOADS (instance),
5785 SLP_INSTANCE_TREE (instance), visited);
5790 /* For each possible SLP instance decide whether to SLP it and calculate overall
5791 unrolling factor needed to SLP the loop. Return TRUE if decided to SLP at
5792 least one instance. */
5794 bool
5795 vect_make_slp_decision (loop_vec_info loop_vinfo)
5797 unsigned int i;
5798 poly_uint64 unrolling_factor = 1;
5799 const vec<slp_instance> &slp_instances
5800 = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
5801 slp_instance instance;
5802 int decided_to_slp = 0;
5804 DUMP_VECT_SCOPE ("vect_make_slp_decision");
5806 FOR_EACH_VEC_ELT (slp_instances, i, instance)
5808 /* FORNOW: SLP if you can. */
5809 /* All unroll factors have the form:
5811 GET_MODE_SIZE (vinfo->vector_mode) * X
5813 for some rational X, so they must have a common multiple. */
5814 unrolling_factor
5815 = force_common_multiple (unrolling_factor,
5816 SLP_INSTANCE_UNROLLING_FACTOR (instance));
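/* For example (illustrative): instance unrolling factors of 2 and 3 would
   combine here to 6, force_common_multiple presumably computing the least
   common multiple of the two poly_uint64 values.  */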
5818 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
5819 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
5820 loop-based vectorization. Such stmts will be marked as HYBRID. */
5821 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
5822 decided_to_slp++;
5825 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
5827 if (decided_to_slp && dump_enabled_p ())
5829 dump_printf_loc (MSG_NOTE, vect_location,
5830 "Decided to SLP %d instances. Unrolling factor ",
5831 decided_to_slp);
5832 dump_dec (MSG_NOTE, unrolling_factor);
5833 dump_printf (MSG_NOTE, "\n");
5836 return (decided_to_slp > 0);
5839 /* Private data for vect_detect_hybrid_slp. */
5840 struct vdhs_data
5842 loop_vec_info loop_vinfo;
5843 vec<stmt_vec_info> *worklist;
5846 /* Walker for walk_gimple_op. */
5848 static tree
5849 vect_detect_hybrid_slp (tree *tp, int *, void *data)
5851 walk_stmt_info *wi = (walk_stmt_info *)data;
5852 vdhs_data *dat = (vdhs_data *)wi->info;
5854 if (wi->is_lhs)
5855 return NULL_TREE;
5857 stmt_vec_info def_stmt_info = dat->loop_vinfo->lookup_def (*tp);
5858 if (!def_stmt_info)
5859 return NULL_TREE;
5860 def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
5861 if (PURE_SLP_STMT (def_stmt_info))
5863 if (dump_enabled_p ())
5864 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
5865 def_stmt_info->stmt);
5866 STMT_SLP_TYPE (def_stmt_info) = hybrid;
5867 dat->worklist->safe_push (def_stmt_info);
5870 return NULL_TREE;
5873 /* Check whether STMT_INFO is consumed by SLP indirectly and mark it
5874 pure_slp if so; otherwise push it to WORKLIST. */
5876 static void
5877 maybe_push_to_hybrid_worklist (vec_info *vinfo,
5878 vec<stmt_vec_info> &worklist,
5879 stmt_vec_info stmt_info)
5881 if (dump_enabled_p ())
5882 dump_printf_loc (MSG_NOTE, vect_location,
5883 "Processing hybrid candidate : %G", stmt_info->stmt);
5884 stmt_vec_info orig_info = vect_orig_stmt (stmt_info);
5885 imm_use_iterator iter2;
5886 ssa_op_iter iter1;
5887 use_operand_p use_p;
5888 def_operand_p def_p;
5889 bool any_def = false;
5890 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_info->stmt, iter1, SSA_OP_DEF)
5892 any_def = true;
5893 FOR_EACH_IMM_USE_FAST (use_p, iter2, DEF_FROM_PTR (def_p))
5895 if (is_gimple_debug (USE_STMT (use_p)))
5896 continue;
5897 stmt_vec_info use_info = vinfo->lookup_stmt (USE_STMT (use_p));
5898 /* An out-of-loop use means this is a loop_vect sink. */
5899 if (!use_info)
5901 if (dump_enabled_p ())
5902 dump_printf_loc (MSG_NOTE, vect_location,
5903 "Found loop_vect sink: %G", stmt_info->stmt);
5904 worklist.safe_push (stmt_info);
5905 return;
5907 else if (!STMT_SLP_TYPE (vect_stmt_to_vectorize (use_info)))
5909 if (dump_enabled_p ())
5910 dump_printf_loc (MSG_NOTE, vect_location,
5911 "Found loop_vect use: %G", use_info->stmt);
5912 worklist.safe_push (stmt_info);
5913 return;
5917 /* No def means this is a loop_vect sink. */
5918 if (!any_def)
5920 if (dump_enabled_p ())
5921 dump_printf_loc (MSG_NOTE, vect_location,
5922 "Found loop_vect sink: %G", stmt_info->stmt);
5923 worklist.safe_push (stmt_info);
5924 return;
5926 if (dump_enabled_p ())
5927 dump_printf_loc (MSG_NOTE, vect_location,
5928 "Marked SLP consumed stmt pure: %G", stmt_info->stmt);
5929 STMT_SLP_TYPE (stmt_info) = pure_slp;
5932 /* Find stmts that must be both vectorized and SLPed. */
5934 void
5935 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
5937 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
5939 /* All stmts participating in SLP are marked pure_slp, all other
5940 stmts are loop_vect.
5941 First collect all loop_vect stmts into a worklist.
5942 SLP patterns cause not all original scalar stmts to appear in
5943 SLP_TREE_SCALAR_STMTS and thus not all of them are marked pure_slp.
5944 Rectify this here and do a backward walk over the IL, only considering
5945 stmts as loop_vect when they are used by a loop_vect stmt, and otherwise
5946 marking them as pure_slp. */
5947 auto_vec<stmt_vec_info> worklist;
5948 for (int i = LOOP_VINFO_LOOP (loop_vinfo)->num_nodes - 1; i >= 0; --i)
5950 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
5951 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
5952 gsi_next (&gsi))
5954 gphi *phi = gsi.phi ();
5955 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (phi);
5956 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
5957 maybe_push_to_hybrid_worklist (loop_vinfo,
5958 worklist, stmt_info);
5960 for (gimple_stmt_iterator gsi = gsi_last_bb (bb); !gsi_end_p (gsi);
5961 gsi_prev (&gsi))
5963 gimple *stmt = gsi_stmt (gsi);
5964 if (is_gimple_debug (stmt))
5965 continue;
5966 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
5967 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
5969 for (gimple_stmt_iterator gsi2
5970 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
5971 !gsi_end_p (gsi2); gsi_next (&gsi2))
5973 stmt_vec_info patt_info
5974 = loop_vinfo->lookup_stmt (gsi_stmt (gsi2));
5975 if (!STMT_SLP_TYPE (patt_info)
5976 && STMT_VINFO_RELEVANT (patt_info))
5977 maybe_push_to_hybrid_worklist (loop_vinfo,
5978 worklist, patt_info);
5980 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
5982 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
5983 maybe_push_to_hybrid_worklist (loop_vinfo,
5984 worklist, stmt_info);
5988 /* Now we have a worklist of non-SLP stmts, follow use->def chains and
5989 mark any SLP vectorized stmt as hybrid.
5990 ??? We're visiting def stmts N times (once for each non-SLP and
5991 once for each hybrid-SLP use). */
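/* The walker below marks each still pure-SLP def reached from a loop_vect
   use as hybrid and pushes it back onto the worklist, so the marking
   propagates transitively along use->def chains.  */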
5992 walk_stmt_info wi;
5993 vdhs_data dat;
5994 dat.worklist = &worklist;
5995 dat.loop_vinfo = loop_vinfo;
5996 memset (&wi, 0, sizeof (wi));
5997 wi.info = (void *)&dat;
5998 while (!worklist.is_empty ())
6000 stmt_vec_info stmt_info = worklist.pop ();
6001 /* Since SSA operands are not set up for pattern stmts we need
6002 to use walk_gimple_op. */
6003 wi.is_lhs = 0;
6004 walk_gimple_op (stmt_info->stmt, vect_detect_hybrid_slp, &wi);
6005 /* For gather/scatter make sure to walk the offset operand, that
6006 can be a scaling and conversion away. */
6007 gather_scatter_info gs_info;
6008 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
6009 && vect_check_gather_scatter (stmt_info, loop_vinfo, &gs_info))
6011 int dummy;
6012 vect_detect_hybrid_slp (&gs_info.offset, &dummy, &wi);
6018 /* Initialize a bb_vec_info struct for the statements in BBS basic blocks. */
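/* A uid of 0 appears to serve as the "inside the region" marker here;
   the destructor below resets the uids to -1.  */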
6020 _bb_vec_info::_bb_vec_info (vec<basic_block> _bbs, vec_info_shared *shared)
6021 : vec_info (vec_info::bb, shared),
6022 bbs (_bbs),
6023 roots (vNULL)
6025 for (unsigned i = 0; i < bbs.length (); ++i)
6027 if (i != 0)
6028 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
6029 gsi_next (&si))
6031 gphi *phi = si.phi ();
6032 gimple_set_uid (phi, 0);
6033 add_stmt (phi);
6035 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
6036 !gsi_end_p (gsi); gsi_next (&gsi))
6038 gimple *stmt = gsi_stmt (gsi);
6039 gimple_set_uid (stmt, 0);
6040 if (is_gimple_debug (stmt))
6041 continue;
6042 add_stmt (stmt);
6048 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
6049 stmts in the basic block. */
6051 _bb_vec_info::~_bb_vec_info ()
6053 /* Reset region marker. */
6054 for (unsigned i = 0; i < bbs.length (); ++i)
6056 if (i != 0)
6057 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
6058 gsi_next (&si))
6060 gphi *phi = si.phi ();
6061 gimple_set_uid (phi, -1);
6063 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
6064 !gsi_end_p (gsi); gsi_next (&gsi))
6066 gimple *stmt = gsi_stmt (gsi);
6067 gimple_set_uid (stmt, -1);
6071 for (unsigned i = 0; i < roots.length (); ++i)
6073 roots[i].stmts.release ();
6074 roots[i].roots.release ();
6075 roots[i].remain.release ();
6077 roots.release ();
6080 /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
6081 given that child nodes have already been processed, and that
6082 their def types currently match their SLP node's def type. */
6084 static bool
6085 vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
6086 slp_instance node_instance,
6087 stmt_vector_for_cost *cost_vec)
6089 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
6091 /* Calculate the number of vector statements to be created for the
6092 scalar stmts in this node. For SLP reductions it is equal to the
6093 number of vector statements in the children (which has already been
6094 calculated by the recursive call). Otherwise it is the number of
6095 scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
6096 VF divided by the number of elements in a vector. */
6097 if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
6098 && !STMT_VINFO_DATA_REF (stmt_info)
6099 && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6101 for (unsigned i = 0; i < SLP_TREE_CHILDREN (node).length (); ++i)
6102 if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node)[i]) == vect_internal_def)
6104 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
6105 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[i]);
6106 break;
6109 else
6111 poly_uint64 vf;
6112 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
6113 vf = loop_vinfo->vectorization_factor;
6114 else
6115 vf = 1;
6116 unsigned int group_size = SLP_TREE_LANES (node);
6117 tree vectype = SLP_TREE_VECTYPE (node);
6118 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
6119 = vect_get_num_vectors (vf * group_size, vectype);
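/* For example (illustrative): with group_size == 4, VF == 8 and a vector
   type holding 8 elements this yields (4 * 8) / 8 == 4 vector stmts.  */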
6122 /* Handle purely internal nodes. */
6123 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
6125 if (!vectorizable_slp_permutation (vinfo, NULL, node, cost_vec))
6126 return false;
6128 stmt_vec_info slp_stmt_info;
6129 unsigned int i;
6130 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info)
6132 if (STMT_VINFO_LIVE_P (slp_stmt_info)
6133 && !vectorizable_live_operation (vinfo, slp_stmt_info, node,
6134 node_instance, i,
6135 false, cost_vec))
6136 return false;
6138 return true;
6141 bool dummy;
6142 return vect_analyze_stmt (vinfo, stmt_info, &dummy,
6143 node, node_instance, cost_vec);
6146 /* Try to build NODE from scalars, returning true on success.
6147 NODE_INSTANCE is the SLP instance that contains NODE. */
6149 static bool
6150 vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
6151 slp_instance node_instance)
6153 stmt_vec_info stmt_info;
6154 unsigned int i;
6156 if (!is_a <bb_vec_info> (vinfo)
6157 || node == SLP_INSTANCE_TREE (node_instance)
6158 || !SLP_TREE_SCALAR_STMTS (node).exists ()
6159 || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node))
6160 /* Force the mask use to be built from scalars instead. */
6161 || VECTOR_BOOLEAN_TYPE_P (SLP_TREE_VECTYPE (node)))
6162 return false;
6164 if (dump_enabled_p ())
6165 dump_printf_loc (MSG_NOTE, vect_location,
6166 "Building vector operands of %p from scalars instead\n",
6167 (void *) node);
6169 /* Don't remove and free the child nodes here, since they could be
6170 referenced by other structures. The analysis and scheduling phases
6171 (need to) ignore child nodes of anything that isn't vect_internal_def. */
6172 unsigned int group_size = SLP_TREE_LANES (node);
6173 SLP_TREE_DEF_TYPE (node) = vect_external_def;
6174 /* Invariants get their vector type from the uses. */
6175 SLP_TREE_VECTYPE (node) = NULL_TREE;
6176 SLP_TREE_SCALAR_OPS (node).safe_grow (group_size, true);
6177 SLP_TREE_LOAD_PERMUTATION (node).release ();
6178 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6180 tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
6181 SLP_TREE_SCALAR_OPS (node)[i] = lhs;
6183 return true;
6186 /* Return true if all elements of the slice are the same. */
6187 bool
6188 vect_scalar_ops_slice::all_same_p () const
6190 for (unsigned int i = 1; i < length; ++i)
6191 if (!operand_equal_p (op (0), op (i)))
6192 return false;
6193 return true;
6196 hashval_t
6197 vect_scalar_ops_slice_hash::hash (const value_type &s)
6199 hashval_t hash = 0;
6200 for (unsigned i = 0; i < s.length; ++i)
6201 hash = iterative_hash_expr (s.op (i), hash);
6202 return hash;
6205 bool
6206 vect_scalar_ops_slice_hash::equal (const value_type &s1,
6207 const compare_type &s2)
6209 if (s1.length != s2.length)
6210 return false;
6211 for (unsigned i = 0; i < s1.length; ++i)
6212 if (!operand_equal_p (s1.op (i), s2.op (i)))
6213 return false;
6214 return true;
6217 /* Compute the prologue cost for invariant or constant operands represented
6218 by NODE. */
6220 static void
6221 vect_prologue_cost_for_slp (slp_tree node,
6222 stmt_vector_for_cost *cost_vec)
6224 /* There's a special case of an existing vector, which costs nothing. */
6225 if (SLP_TREE_SCALAR_OPS (node).length () == 0
6226 && !SLP_TREE_VEC_DEFS (node).is_empty ())
6227 return;
6228 /* Without looking at the actual initializer a vector of
6229 constants can be implemented as a load from the constant pool.
6230 When all elements are the same we can use a splat. */
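/* Concretely, the loop below records one prologue cost per vector that
   needs constructing: vector_load for constant defs, scalar_to_vec when
   all elements of the slice are the same (a splat), and vec_construct
   otherwise.  */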
6231 tree vectype = SLP_TREE_VECTYPE (node);
6232 unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
6233 unsigned HOST_WIDE_INT const_nunits;
6234 unsigned nelt_limit;
6235 auto ops = &SLP_TREE_SCALAR_OPS (node);
6236 auto_vec<unsigned int> starts (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
6237 if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
6238 && ! multiple_p (const_nunits, group_size))
6240 nelt_limit = const_nunits;
6241 hash_set<vect_scalar_ops_slice_hash> vector_ops;
6242 for (unsigned int i = 0; i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); ++i)
6243 if (!vector_ops.add ({ ops, i * const_nunits, const_nunits }))
6244 starts.quick_push (i * const_nunits);
6246 else
6248 /* If either the vector has variable length or the vectors
6249 are composed of repeated whole groups we only need to
6250 cost construction once. All vectors will be the same. */
6251 nelt_limit = group_size;
6252 starts.quick_push (0);
6254 /* ??? We're just tracking whether vectors in a single node are the same.
6255 Ideally we'd do something more global. */
6256 bool passed = false;
6257 for (unsigned int start : starts)
6259 vect_cost_for_stmt kind;
6260 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def)
6261 kind = vector_load;
6262 else if (vect_scalar_ops_slice { ops, start, nelt_limit }.all_same_p ())
6263 kind = scalar_to_vec;
6264 else
6265 kind = vec_construct;
6266 /* The target cost hook has no idea which part of the SLP node
6267 we are costing so avoid passing it down more than once. Pass
6268 it to the first vec_construct or scalar_to_vec part since for those
6269 the x86 backend tries to account for GPR to XMM register moves. */
6270 record_stmt_cost (cost_vec, 1, kind,
6271 (kind != vector_load && !passed) ? node : nullptr,
6272 vectype, 0, vect_prologue);
6273 if (kind != vector_load)
6274 passed = true;
6278 /* Analyze statements contained in SLP tree NODE after recursively analyzing
6279 the subtree. NODE_INSTANCE contains NODE and VINFO contains INSTANCE.
6281 Return true if the operations are supported. */
6283 static bool
6284 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
6285 slp_instance node_instance,
6286 hash_set<slp_tree> &visited_set,
6287 vec<slp_tree> &visited_vec,
6288 stmt_vector_for_cost *cost_vec)
6290 int i, j;
6291 slp_tree child;
6293 /* Assume we can code-generate all invariants. */
6294 if (!node
6295 || SLP_TREE_DEF_TYPE (node) == vect_constant_def
6296 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
6297 return true;
6299 if (SLP_TREE_DEF_TYPE (node) == vect_uninitialized_def)
6301 if (dump_enabled_p ())
6302 dump_printf_loc (MSG_NOTE, vect_location,
6303 "Failed cyclic SLP reference in %p\n", (void *) node);
6304 return false;
6306 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_internal_def);
6308 /* If we already analyzed the exact same set of scalar stmts we're done.
6309 We share the generated vector stmts for those. */
6310 if (visited_set.add (node))
6311 return true;
6312 visited_vec.safe_push (node);
6314 bool res = true;
6315 unsigned visited_rec_start = visited_vec.length ();
6316 unsigned cost_vec_rec_start = cost_vec->length ();
6317 bool seen_non_constant_child = false;
6318 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6320 res = vect_slp_analyze_node_operations (vinfo, child, node_instance,
6321 visited_set, visited_vec,
6322 cost_vec);
6323 if (!res)
6324 break;
6325 if (child && SLP_TREE_DEF_TYPE (child) != vect_constant_def)
6326 seen_non_constant_child = true;
6328 /* We have difficulty scheduling nodes with just constant
6329 operands and no scalar stmts, since we then cannot compute a stmt
6330 insertion place. */
6331 if (!seen_non_constant_child && SLP_TREE_SCALAR_STMTS (node).is_empty ())
6333 if (dump_enabled_p ())
6334 dump_printf_loc (MSG_NOTE, vect_location,
6335 "Cannot vectorize all-constant op node %p\n",
6336 (void *) node);
6337 res = false;
6340 if (res)
6341 res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
6342 cost_vec);
6343 /* If analysis failed we have to pop all recursive visited nodes
6344 plus ourselves. */
6345 if (!res)
6347 while (visited_vec.length () >= visited_rec_start)
6348 visited_set.remove (visited_vec.pop ());
6349 cost_vec->truncate (cost_vec_rec_start);
6352 /* When the node can be vectorized, cost the invariant nodes it references.
6353 This is not done in DFS order so that the referring node's
6354 vectorizable_* calls can nail down the invariant nodes' vector type
6355 and possibly unshare it if it needs a different vector type than
6356 other referrers. */
6357 if (res)
6358 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
6359 if (child
6360 && (SLP_TREE_DEF_TYPE (child) == vect_constant_def
6361 || SLP_TREE_DEF_TYPE (child) == vect_external_def)
6362 /* Perform usual caching, note code-generation still
6363 code-gens these nodes multiple times but we expect
6364 to CSE them later. */
6365 && !visited_set.add (child))
6367 visited_vec.safe_push (child);
6368 /* ??? After auditing more code paths make a "default"
6369 and push the vector type from NODE to all children
6370 if it is not already set. */
6371 /* Compute the number of vectors to be generated. */
6372 tree vector_type = SLP_TREE_VECTYPE (child);
6373 if (!vector_type)
6375 /* For shifts with a scalar argument we don't need
6376 to cost or code-generate anything.
6377 ??? Represent this more explicitly. */
6378 gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
6379 == shift_vec_info_type)
6380 && j == 1);
6381 continue;
6383 unsigned group_size = SLP_TREE_LANES (child);
6384 poly_uint64 vf = 1;
6385 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
6386 vf = loop_vinfo->vectorization_factor;
6387 SLP_TREE_NUMBER_OF_VEC_STMTS (child)
6388 = vect_get_num_vectors (vf * group_size, vector_type);
6389 /* And cost them. */
6390 vect_prologue_cost_for_slp (child, cost_vec);
6393 /* If this node or any of its children can't be vectorized, try pruning
6394 the tree here rather than felling the whole thing. */
6395 if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
6397 /* We'll need to revisit this for invariant costing and for setting
6398 the number of vectorized stmts. */
6399 res = true;
6402 return res;
6405 /* Mark lanes of NODE that are live outside of the basic-block vectorized
6406 region and that can be vectorized using vectorizable_live_operation
6407 with STMT_VINFO_LIVE_P. Live operations that are not handled cause
6408 the scalar code computing them to be retained. */
6410 static void
6411 vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
6412 slp_instance instance,
6413 stmt_vector_for_cost *cost_vec,
6414 hash_set<stmt_vec_info> &svisited,
6415 hash_set<slp_tree> &visited)
6417 if (visited.add (node))
6418 return;
6420 unsigned i;
6421 stmt_vec_info stmt_info;
6422 stmt_vec_info last_stmt = vect_find_last_scalar_stmt_in_slp (node);
6423 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6425 if (svisited.contains (stmt_info))
6426 continue;
6427 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
6428 if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)
6429 && STMT_VINFO_RELATED_STMT (orig_stmt_info) != stmt_info)
6430 /* Only the pattern root stmt computes the original scalar value. */
6431 continue;
6432 bool mark_visited = true;
6433 gimple *orig_stmt = orig_stmt_info->stmt;
6434 ssa_op_iter op_iter;
6435 def_operand_p def_p;
6436 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
6438 imm_use_iterator use_iter;
6439 gimple *use_stmt;
6440 stmt_vec_info use_stmt_info;
6441 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
6442 if (!is_gimple_debug (use_stmt))
6444 use_stmt_info = bb_vinfo->lookup_stmt (use_stmt);
6445 if (!use_stmt_info
6446 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
6448 STMT_VINFO_LIVE_P (stmt_info) = true;
6449 if (vectorizable_live_operation (bb_vinfo, stmt_info,
6450 node, instance, i,
6451 false, cost_vec))
6452 /* ??? So we know we can vectorize the live stmt
6453 from one SLP node. If we cannot do so from all
6454 or none consistently we'd have to record which
6455 SLP node (and lane) we want to use for the live
6456 operation. So make sure we can code-generate
6457 from all nodes. */
6458 mark_visited = false;
6459 else
6460 STMT_VINFO_LIVE_P (stmt_info) = false;
6461 break;
6464 /* We have to verify whether we can insert the lane extract
6465 before all uses. The following is a conservative approximation.
6466 We cannot put this into vectorizable_live_operation because
6467 iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
6468 doesn't work.
6469 Note that while the fact that we emit code for loads at the
6470 first load should make this a non-problem, leaves we construct
6471 from scalars are vectorized after the last scalar def.
6472 ??? If we'd actually compute the insert location during
6473 analysis we could use sth less conservative than the last
6474 scalar stmt in the node for the dominance check. */
6475 /* ??? What remains is "live" uses in vector CTORs in the same
6476 SLP graph which is where those uses can end up code-generated
6477 right after their definition instead of close to their original
6478 use. But that would restrict us to code-generate lane-extracts
6479 from the latest stmt in a node. So we compensate for this
6480 during code-generation, simply not replacing uses for those
6481 hopefully rare cases. */
6482 if (STMT_VINFO_LIVE_P (stmt_info))
6483 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
6484 if (!is_gimple_debug (use_stmt)
6485 && (!(use_stmt_info = bb_vinfo->lookup_stmt (use_stmt))
6486 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
6487 && !vect_stmt_dominates_stmt_p (last_stmt->stmt, use_stmt))
6489 if (dump_enabled_p ())
6490 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6491 "Cannot determine insertion place for "
6492 "lane extract\n");
6493 STMT_VINFO_LIVE_P (stmt_info) = false;
6494 mark_visited = true;
6497 if (mark_visited)
6498 svisited.add (stmt_info);
6501 slp_tree child;
6502 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6503 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
6504 vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance,
6505 cost_vec, svisited, visited);
6508 /* Determine whether we can vectorize the reduction epilogue for INSTANCE. */
6510 static bool
6511 vectorizable_bb_reduc_epilogue (slp_instance instance,
6512 stmt_vector_for_cost *cost_vec)
6514 gassign *stmt = as_a <gassign *> (instance->root_stmts[0]->stmt);
6515 enum tree_code reduc_code = gimple_assign_rhs_code (stmt);
6516 if (reduc_code == MINUS_EXPR)
6517 reduc_code = PLUS_EXPR;
6518 internal_fn reduc_fn;
6519 tree vectype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (instance));
6520 if (!vectype
6521 || !reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
6522 || reduc_fn == IFN_LAST
6523 || !direct_internal_fn_supported_p (reduc_fn, vectype, OPTIMIZE_FOR_BOTH)
6524 || !useless_type_conversion_p (TREE_TYPE (gimple_assign_lhs (stmt)),
6525 TREE_TYPE (vectype)))
6527 if (dump_enabled_p ())
6528 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6529 "not vectorized: basic block reduction epilogue "
6530 "operation unsupported.\n");
6531 return false;
6534 /* There's no way to cost a horizontal vector reduction via REDUC_FN, so
6535 cost log2 vector operations plus shuffles and one extraction. */
6536 unsigned steps = floor_log2 (vect_nunits_for_cost (vectype));
6537 record_stmt_cost (cost_vec, steps, vector_stmt, instance->root_stmts[0],
6538 vectype, 0, vect_body);
6539 record_stmt_cost (cost_vec, steps, vec_perm, instance->root_stmts[0],
6540 vectype, 0, vect_body);
6541 record_stmt_cost (cost_vec, 1, vec_to_scalar, instance->root_stmts[0],
6542 vectype, 0, vect_body);
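/* For example (illustrative): a 4-lane vector gives
   steps == floor_log2 (4) == 2, so we cost 2 vector_stmts, 2 vec_perms
   and a single vec_to_scalar extraction.  */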
6544 /* Since we replace all stmts of a possibly longer scalar reduction
6545 chain, account for the extra scalar stmts for that. */
6546 record_stmt_cost (cost_vec, instance->remain_defs.length (), scalar_stmt,
6547 instance->root_stmts[0], 0, vect_body);
6548 return true;
6551 /* Prune from ROOTS all stmts that are computed as part of lanes of NODE
6552 and recurse to children. */
6554 static void
6555 vect_slp_prune_covered_roots (slp_tree node, hash_set<stmt_vec_info> &roots,
6556 hash_set<slp_tree> &visited)
6558 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def
6559 || visited.add (node))
6560 return;
6562 stmt_vec_info stmt;
6563 unsigned i;
6564 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
6565 roots.remove (vect_orig_stmt (stmt));
6567 slp_tree child;
6568 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6569 if (child)
6570 vect_slp_prune_covered_roots (child, roots, visited);
6573 /* Analyze statements in SLP instances of VINFO. Return true if the
6574 operations are supported. */
6576 bool
6577 vect_slp_analyze_operations (vec_info *vinfo)
6579 slp_instance instance;
6580 int i;
6582 DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
6584 hash_set<slp_tree> visited;
6585 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
6587 auto_vec<slp_tree> visited_vec;
6588 stmt_vector_for_cost cost_vec;
6589 cost_vec.create (2);
6590 if (is_a <bb_vec_info> (vinfo))
6591 vect_location = instance->location ();
6592 if (!vect_slp_analyze_node_operations (vinfo,
6593 SLP_INSTANCE_TREE (instance),
6594 instance, visited, visited_vec,
6595 &cost_vec)
6596 /* CTOR instances require vectorized defs for the SLP tree root. */
6597 || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor
6598 && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
6599 != vect_internal_def
6600 /* Make sure we vectorized with the expected type. */
6601 || !useless_type_conversion_p
6602 (TREE_TYPE (TREE_TYPE (gimple_assign_rhs1
6603 (instance->root_stmts[0]->stmt))),
6604 TREE_TYPE (SLP_TREE_VECTYPE
6605 (SLP_INSTANCE_TREE (instance))))))
6606 /* Check we can vectorize the reduction. */
6607 || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_bb_reduc
6608 && !vectorizable_bb_reduc_epilogue (instance, &cost_vec)))
6610 slp_tree node = SLP_INSTANCE_TREE (instance);
6611 stmt_vec_info stmt_info;
6612 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
6613 stmt_info = SLP_INSTANCE_ROOT_STMTS (instance)[0];
6614 else
6615 stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
6616 if (dump_enabled_p ())
6617 dump_printf_loc (MSG_NOTE, vect_location,
6618 "removing SLP instance operations starting from: %G",
6619 stmt_info->stmt);
6620 vect_free_slp_instance (instance);
6621 vinfo->slp_instances.ordered_remove (i);
6622 cost_vec.release ();
6623 while (!visited_vec.is_empty ())
6624 visited.remove (visited_vec.pop ());
6626 else
6628 i++;
6629 if (loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo))
6631 add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
6632 cost_vec.release ();
6634 else
6635 /* For BB vectorization remember the SLP graph entry
6636 cost for later. */
6637 instance->cost_vec = cost_vec;
6641 /* Now look for SLP instances with a root that are covered by other
6642 instances and remove them. */
6643 hash_set<stmt_vec_info> roots;
6644 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
6645 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
6646 roots.add (SLP_INSTANCE_ROOT_STMTS (instance)[0]);
6647 if (!roots.is_empty ())
6649 visited.empty ();
6650 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
6651 vect_slp_prune_covered_roots (SLP_INSTANCE_TREE (instance), roots,
6652 visited);
6653 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
6654 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ()
6655 && !roots.contains (SLP_INSTANCE_ROOT_STMTS (instance)[0]))
6657 stmt_vec_info root = SLP_INSTANCE_ROOT_STMTS (instance)[0];
6658 if (dump_enabled_p ())
6659 dump_printf_loc (MSG_NOTE, vect_location,
6660 "removing SLP instance operations starting "
6661 "from: %G", root->stmt);
6662 vect_free_slp_instance (instance);
6663 vinfo->slp_instances.ordered_remove (i);
6665 else
6666 ++i;
6669 /* Compute vectorizable live stmts. */
6670 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
6672 hash_set<stmt_vec_info> svisited;
6673 hash_set<slp_tree> visited;
6674 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
6676 vect_location = instance->location ();
6677 vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
6678 instance, &instance->cost_vec, svisited,
6679 visited);
6683 return !vinfo->slp_instances.is_empty ();
6686 /* Get the ultimate SLP instance leader of INSTANCE from INSTANCE_LEADER,
6687 transitively compressing the leader chain along the way. */
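/* For example (illustrative): with a leader chain A -> B -> C -> C this
   returns C and, union-find style, rewrites the entries for A and B to
   point directly at C.  */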
6689 static slp_instance
6690 get_ultimate_leader (slp_instance instance,
6691 hash_map<slp_instance, slp_instance> &instance_leader)
6693 auto_vec<slp_instance *, 8> chain;
6694 slp_instance *tem;
6695 while (*(tem = instance_leader.get (instance)) != instance)
6697 chain.safe_push (tem);
6698 instance = *tem;
6700 while (!chain.is_empty ())
6701 *chain.pop () = instance;
6702 return instance;
6705 namespace {
6706 /* Subroutine of vect_bb_partition_graph_r. Map KEY to INSTANCE in
6707 KEY_TO_INSTANCE, making INSTANCE the leader of any previous mapping
6708 for KEY. Return true if KEY was already in KEY_TO_INSTANCE.
6710 INSTANCE_LEADER is as for get_ultimate_leader. */
6712 template<typename T>
6713 bool
6714 vect_map_to_instance (slp_instance instance, T key,
6715 hash_map<T, slp_instance> &key_to_instance,
6716 hash_map<slp_instance, slp_instance> &instance_leader)
6718 bool existed_p;
6719 slp_instance &key_instance = key_to_instance.get_or_insert (key, &existed_p);
6720 if (!existed_p)
6722 else if (key_instance != instance)
6724 /* If we run into a previously marked key, make us the leader
6725 of its current ultimate leader. This keeps the
6726 leader chain acyclic and works even when the current instance
6727 connects two previously independent graph parts. */
6728 slp_instance key_leader
6729 = get_ultimate_leader (key_instance, instance_leader);
6730 if (key_leader != instance)
6731 instance_leader.put (key_leader, instance);
6733 key_instance = instance;
6734 return existed_p;
6738 /* Worker of vect_bb_partition_graph, recurse on NODE. */
6740 static void
6741 vect_bb_partition_graph_r (bb_vec_info bb_vinfo,
6742 slp_instance instance, slp_tree node,
6743 hash_map<stmt_vec_info, slp_instance> &stmt_to_instance,
6744 hash_map<slp_tree, slp_instance> &node_to_instance,
6745 hash_map<slp_instance, slp_instance> &instance_leader)
6747 stmt_vec_info stmt_info;
6748 unsigned i;
6750 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6751 vect_map_to_instance (instance, stmt_info, stmt_to_instance,
6752 instance_leader);
6754 if (vect_map_to_instance (instance, node, node_to_instance,
6755 instance_leader))
6756 return;
6758 slp_tree child;
6759 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6760 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
6761 vect_bb_partition_graph_r (bb_vinfo, instance, child, stmt_to_instance,
6762 node_to_instance, instance_leader);
6765 /* Partition the SLP graph into pieces that can be costed independently. */
6767 static void
6768 vect_bb_partition_graph (bb_vec_info bb_vinfo)
6770 DUMP_VECT_SCOPE ("vect_bb_partition_graph");
6772 /* First walk the SLP graph, assigning each involved scalar stmt a
6773 corresponding SLP graph entry, and upon visiting a previously
6774 marked stmt, make that stmt's leader the current SLP graph entry. */
6775 hash_map<stmt_vec_info, slp_instance> stmt_to_instance;
6776 hash_map<slp_tree, slp_instance> node_to_instance;
6777 hash_map<slp_instance, slp_instance> instance_leader;
6778 slp_instance instance;
6779 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
6781 instance_leader.put (instance, instance);
6782 vect_bb_partition_graph_r (bb_vinfo,
6783 instance, SLP_INSTANCE_TREE (instance),
6784 stmt_to_instance, node_to_instance,
6785 instance_leader);
6788 /* Then collect entries to each independent subgraph. */
6789 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
6791 slp_instance leader = get_ultimate_leader (instance, instance_leader);
6792 leader->subgraph_entries.safe_push (instance);
6793 if (dump_enabled_p ()
6794 && leader != instance)
6795 dump_printf_loc (MSG_NOTE, vect_location,
6796 "instance %p is leader of %p\n",
6797 (void *) leader, (void *) instance);
6801 /* Compute the set of scalar stmts participating in internal and external
6802 nodes. */
6804 static void
6805 vect_slp_gather_vectorized_scalar_stmts (vec_info *vinfo, slp_tree node,
6806 hash_set<slp_tree> &visited,
6807 hash_set<stmt_vec_info> &vstmts,
6808 hash_set<stmt_vec_info> &estmts)
6810 int i;
6811 stmt_vec_info stmt_info;
6812 slp_tree child;
6814 if (visited.add (node))
6815 return;
6817 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
6819 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6820 vstmts.add (stmt_info);
6822 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6823 if (child)
6824 vect_slp_gather_vectorized_scalar_stmts (vinfo, child, visited,
6825 vstmts, estmts);
6827 else
6828 for (tree def : SLP_TREE_SCALAR_OPS (node))
6830 stmt_vec_info def_stmt = vinfo->lookup_def (def);
6831 if (def_stmt)
6832 estmts.add (def_stmt);
6837 /* Compute the scalar cost of the SLP node NODE and its children,
6838 recording it in COST_VEC. Do not account defs that are marked in
6839 LIFE, and update LIFE according to uses of NODE. */
6841 static void
6842 vect_bb_slp_scalar_cost (vec_info *vinfo,
6843 slp_tree node, vec<bool, va_heap> *life,
6844 stmt_vector_for_cost *cost_vec,
6845 hash_set<stmt_vec_info> &vectorized_scalar_stmts,
6846 hash_set<slp_tree> &visited)
6848 unsigned i;
6849 stmt_vec_info stmt_info;
6850 slp_tree child;
6852 if (visited.add (node))
6853 return;
6855 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6857 ssa_op_iter op_iter;
6858 def_operand_p def_p;
6860 if ((*life)[i])
6861 continue;
6863 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
6864 gimple *orig_stmt = orig_stmt_info->stmt;
6866 /* If there is a non-vectorized use of the defs then the scalar
6867 stmt is kept live in which case we do not account it or any
6868 required defs in the SLP children in the scalar cost. This
6869 way we make the vectorization more costly when compared to
6870 the scalar cost. */
6871 if (!STMT_VINFO_LIVE_P (stmt_info))
6873 auto_vec<gimple *, 8> worklist;
6874 hash_set<gimple *> *worklist_visited = NULL;
6875 worklist.quick_push (orig_stmt);
6878 gimple *work_stmt = worklist.pop ();
6879 FOR_EACH_PHI_OR_STMT_DEF (def_p, work_stmt, op_iter, SSA_OP_DEF)
6881 imm_use_iterator use_iter;
6882 gimple *use_stmt;
6883 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter,
6884 DEF_FROM_PTR (def_p))
6885 if (!is_gimple_debug (use_stmt))
6887 stmt_vec_info use_stmt_info
6888 = vinfo->lookup_stmt (use_stmt);
6889 if (!use_stmt_info
6890 || !vectorized_scalar_stmts.contains (use_stmt_info))
6892 if (use_stmt_info
6893 && STMT_VINFO_IN_PATTERN_P (use_stmt_info))
6895 /* For stmts participating in patterns we have
6896 to check its uses recursively. */
6897 if (!worklist_visited)
6898 worklist_visited = new hash_set<gimple *> ();
6899 if (!worklist_visited->add (use_stmt))
6900 worklist.safe_push (use_stmt);
6901 continue;
6903 (*life)[i] = true;
6904 goto next_lane;
6909 while (!worklist.is_empty ());
6910 next_lane:
6911 if (worklist_visited)
6912 delete worklist_visited;
6913 if ((*life)[i])
6914 continue;
6917 /* Count scalar stmts only once. */
6918 if (gimple_visited_p (orig_stmt))
6919 continue;
6920 gimple_set_visited (orig_stmt, true);
6922 vect_cost_for_stmt kind;
6923 if (STMT_VINFO_DATA_REF (orig_stmt_info))
6925 if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info)))
6926 kind = scalar_load;
6927 else
6928 kind = scalar_store;
6930 else if (vect_nop_conversion_p (orig_stmt_info))
6931 continue;
6932 /* For single-argument PHIs assume coalescing which means zero cost
6933 for the scalar and the vector PHIs. This avoids artificially
6934 favoring the vector path (but may pessimize it in some cases). */
6935 else if (is_a <gphi *> (orig_stmt_info->stmt)
6936 && gimple_phi_num_args
6937 (as_a <gphi *> (orig_stmt_info->stmt)) == 1)
6938 continue;
6939 else
6940 kind = scalar_stmt;
6941 record_stmt_cost (cost_vec, 1, kind, orig_stmt_info,
6942 SLP_TREE_VECTYPE (node), 0, vect_body);
6945 auto_vec<bool, 20> subtree_life;
6946 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6948 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
6950 /* Do not directly pass LIFE to the recursive call, copy it to
6951 confine changes in the callee to the current child/subtree. */
6952 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
6954 subtree_life.safe_grow_cleared (SLP_TREE_LANES (child), true);
6955 for (unsigned j = 0;
6956 j < SLP_TREE_LANE_PERMUTATION (node).length (); ++j)
6958 auto perm = SLP_TREE_LANE_PERMUTATION (node)[j];
6959 if (perm.first == i)
6960 subtree_life[perm.second] = (*life)[j];
6963 else
6965 gcc_assert (SLP_TREE_LANES (node) == SLP_TREE_LANES (child));
6966 subtree_life.safe_splice (*life);
6968 vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
6969 vectorized_scalar_stmts, visited);
6970 subtree_life.truncate (0);
6975 /* Comparator for the loop-index sorted cost vectors. */
6977 static int
6978 li_cost_vec_cmp (const void *a_, const void *b_)
6980 auto *a = (const std::pair<unsigned, stmt_info_for_cost *> *)a_;
6981 auto *b = (const std::pair<unsigned, stmt_info_for_cost *> *)b_;
6982 if (a->first < b->first)
6983 return -1;
6984 else if (a->first == b->first)
6985 return 0;
6986 return 1;
6989 /* Check if vectorization of the basic block is profitable for the
6990 subgraph denoted by SLP_INSTANCES. */
6992 static bool
6993 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
6994 vec<slp_instance> slp_instances,
6995 loop_p orig_loop)
6997 slp_instance instance;
6998 int i;
6999 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
7000 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
7002 if (dump_enabled_p ())
7004 dump_printf_loc (MSG_NOTE, vect_location, "Costing subgraph: \n");
7005 hash_set<slp_tree> visited;
7006 FOR_EACH_VEC_ELT (slp_instances, i, instance)
7007 vect_print_slp_graph (MSG_NOTE, vect_location,
7008 SLP_INSTANCE_TREE (instance), visited);
7011 /* Compute the set of scalar stmts we know will go away 'locally' when
7012 vectorizing. This used to be tracked with just PURE_SLP_STMT but that's
7013 not accurate for nodes promoted extern late or for scalar stmts that
7014 are used both in extern defs and in vectorized defs. */
7015 hash_set<stmt_vec_info> vectorized_scalar_stmts;
7016 hash_set<stmt_vec_info> scalar_stmts_in_externs;
7017 hash_set<slp_tree> visited;
7018 FOR_EACH_VEC_ELT (slp_instances, i, instance)
7020 vect_slp_gather_vectorized_scalar_stmts (bb_vinfo,
7021 SLP_INSTANCE_TREE (instance),
7022 visited,
7023 vectorized_scalar_stmts,
7024 scalar_stmts_in_externs);
7025 for (stmt_vec_info rstmt : SLP_INSTANCE_ROOT_STMTS (instance))
7026 vectorized_scalar_stmts.add (rstmt);
7028 /* Scalar stmts used as defs in external nodes need to be preserved, so
7029 remove them from vectorized_scalar_stmts. */
7030 for (stmt_vec_info stmt : scalar_stmts_in_externs)
7031 vectorized_scalar_stmts.remove (stmt);
7033 /* Calculate scalar cost and sum the cost for the vector stmts
7034 previously collected. */
7035 stmt_vector_for_cost scalar_costs = vNULL;
7036 stmt_vector_for_cost vector_costs = vNULL;
7037 visited.empty ();
7038 FOR_EACH_VEC_ELT (slp_instances, i, instance)
7040 auto_vec<bool, 20> life;
7041 life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)),
7042 true);
7043 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
7044 record_stmt_cost (&scalar_costs,
7045 SLP_INSTANCE_ROOT_STMTS (instance).length (),
7046 scalar_stmt,
7047 SLP_INSTANCE_ROOT_STMTS (instance)[0], 0, vect_body);
7048 vect_bb_slp_scalar_cost (bb_vinfo,
7049 SLP_INSTANCE_TREE (instance),
7050 &life, &scalar_costs, vectorized_scalar_stmts,
7051 visited);
7052 vector_costs.safe_splice (instance->cost_vec);
7053 instance->cost_vec.release ();
7056 if (dump_enabled_p ())
7057 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
7059 /* When costing non-loop vectorization we need to consider each covered
7060 loop independently and make sure vectorization is profitable. For
7061 now we assume a loop may not be entered or may be executed an
7062 arbitrary number of iterations (??? static information can provide
7063 more precise info here), which means we can simply cost each
7064 containing loop's stmts separately. */
7066 /* First produce cost vectors sorted by loop index. */
7067 auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
7068 li_scalar_costs (scalar_costs.length ());
7069 auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
7070 li_vector_costs (vector_costs.length ());
7071 stmt_info_for_cost *cost;
7072 FOR_EACH_VEC_ELT (scalar_costs, i, cost)
7074 unsigned l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
7075 li_scalar_costs.quick_push (std::make_pair (l, cost));
7077 /* Use an arbitrary used loop as fallback in case the first vector_costs
7078 entry does not have a stmt_info associated with it. */
7079 unsigned l = li_scalar_costs[0].first;
7080 FOR_EACH_VEC_ELT (vector_costs, i, cost)
7082 /* We inherit the loop from the previous COST; invariants, externals
7083 and extracts immediately follow the cost for the related stmt. */
7084 if (cost->stmt_info)
7085 l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
7086 li_vector_costs.quick_push (std::make_pair (l, cost));
7088 li_scalar_costs.qsort (li_cost_vec_cmp);
7089 li_vector_costs.qsort (li_cost_vec_cmp);
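/* Illustrative sketch of the partitioning (made-up loop numbers): if the
   scalar entries were tagged with loop numbers {1, 1, 2} and the vector
   entries with {1, 2}, the loop below compares the two scalar costs of
   loop 1 against the single vector cost of loop 1, then loop 2 against
   loop 2; a scalar loop number without a vector counterpart is skipped
   as described in the mismatch case further down.  */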
7091 /* Now cost the portions individually. */
7092 unsigned vi = 0;
7093 unsigned si = 0;
7094 bool profitable = true;
7095 while (si < li_scalar_costs.length ()
7096 && vi < li_vector_costs.length ())
7098 unsigned sl = li_scalar_costs[si].first;
7099 unsigned vl = li_vector_costs[vi].first;
7100 if (sl != vl)
7102 if (dump_enabled_p ())
7103 dump_printf_loc (MSG_NOTE, vect_location,
7104 "Scalar %d and vector %d loop part do not "
7105 "match up, skipping scalar part\n", sl, vl);
7106 /* Skip the scalar part, assuming zero cost on the vector side. */
7109 si++;
7111 while (si < li_scalar_costs.length ()
7112 && li_scalar_costs[si].first == sl);
7113 continue;
7116 class vector_costs *scalar_target_cost_data = init_cost (bb_vinfo, true);
7119 add_stmt_cost (scalar_target_cost_data, li_scalar_costs[si].second);
7120 si++;
7122 while (si < li_scalar_costs.length ()
7123 && li_scalar_costs[si].first == sl);
7124 unsigned dummy;
7125 finish_cost (scalar_target_cost_data, nullptr,
7126 &dummy, &scalar_cost, &dummy);
7128 /* Complete the target-specific vector cost calculation. */
7129 class vector_costs *vect_target_cost_data = init_cost (bb_vinfo, false);
7132 add_stmt_cost (vect_target_cost_data, li_vector_costs[vi].second);
7133 vi++;
7135 while (vi < li_vector_costs.length ()
7136 && li_vector_costs[vi].first == vl);
7137 finish_cost (vect_target_cost_data, scalar_target_cost_data,
7138 &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost);
7139 delete scalar_target_cost_data;
7140 delete vect_target_cost_data;
7142 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
7144 if (dump_enabled_p ())
7146 dump_printf_loc (MSG_NOTE, vect_location,
7147 "Cost model analysis for part in loop %d:\n", sl);
7148 dump_printf (MSG_NOTE, " Vector cost: %d\n",
7149 vec_inside_cost + vec_outside_cost);
7150 dump_printf (MSG_NOTE, " Scalar cost: %d\n", scalar_cost);
7153 /* Vectorization is profitable if its cost is not more than the cost of
7154 the scalar version. Note that we err on the vector side for equal
7155 cost because the cost estimate is otherwise quite pessimistic
7156 (constant uses are free on the scalar side but cost a load on the
7157 vector side for example). */
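/* As a made-up numeric example of the test below: with scalar_cost == 6,
   vec_inside_cost == 4 and vec_prologue_cost == vec_epilogue_cost == 1
   the vector side sums to 6, which is not greater than the scalar cost,
   so this part stays profitable; bumping vec_inside_cost to 5 makes the
   sum 7 > 6 and the whole subgraph is given up on.  */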
7158 if (vec_outside_cost + vec_inside_cost > scalar_cost)
7160 profitable = false;
7161 break;
7164 if (profitable && vi < li_vector_costs.length ())
7166 if (dump_enabled_p ())
7167 dump_printf_loc (MSG_NOTE, vect_location,
7168 "Excess vector cost for part in loop %d:\n",
7169 li_vector_costs[vi].first);
7170 profitable = false;
7173 /* Unset visited flag. This is delayed when the subgraph is profitable
7174 and we process the loop for remaining unvectorized if-converted code. */
7175 if (!orig_loop || !profitable)
7176 FOR_EACH_VEC_ELT (scalar_costs, i, cost)
7177 gimple_set_visited (cost->stmt_info->stmt, false);
7179 scalar_costs.release ();
7180 vector_costs.release ();
7182 return profitable;
7185 /* qsort comparator for lane defs. */
7187 static int
7188 vld_cmp (const void *a_, const void *b_)
7190 auto *a = (const std::pair<unsigned, tree> *)a_;
7191 auto *b = (const std::pair<unsigned, tree> *)b_;
7192 return a->first - b->first;
7195 /* Return true if USE_STMT is a vector lane insert into VEC and set
7196 *THIS_LANE to the lane number that is set. */
7198 static bool
7199 vect_slp_is_lane_insert (gimple *use_stmt, tree vec, unsigned *this_lane)
7201 gassign *use_ass = dyn_cast <gassign *> (use_stmt);
7202 if (!use_ass
7203 || gimple_assign_rhs_code (use_ass) != BIT_INSERT_EXPR
7204 || (vec
7205 ? gimple_assign_rhs1 (use_ass) != vec
7206 : ((vec = gimple_assign_rhs1 (use_ass)), false))
7207 || !useless_type_conversion_p (TREE_TYPE (TREE_TYPE (vec)),
7208 TREE_TYPE (gimple_assign_rhs2 (use_ass)))
7209 || !constant_multiple_p
7210 (tree_to_poly_uint64 (gimple_assign_rhs3 (use_ass)),
7211 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (TREE_TYPE (vec)))),
7212 this_lane))
7213 return false;
7214 return true;
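/* Illustrative example (hypothetical SSA names, V4SImode destination):
   the statement

     vec_1 = BIT_INSERT_EXPR <vec_0, val_2, 64>;

   inserts a 32-bit element at bit position 64, so the function above
   returns true and sets *THIS_LANE to 2.  A bit position that is not a
   multiple of the element size fails the constant_multiple_p check and
   makes it return false.  */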
7217 /* Find any vectorizable constructors and other SLP roots in the basic
7218 blocks of BB_VINFO and record them in its roots array. */
7220 static void
7221 vect_slp_check_for_roots (bb_vec_info bb_vinfo)
7223 for (unsigned i = 0; i < bb_vinfo->bbs.length (); ++i)
7224 for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[i]);
7225 !gsi_end_p (gsi); gsi_next (&gsi))
7227 gassign *assign = dyn_cast<gassign *> (gsi_stmt (gsi));
7228 if (!assign)
7229 continue;
7231 tree rhs = gimple_assign_rhs1 (assign);
7232 enum tree_code code = gimple_assign_rhs_code (assign);
7233 use_operand_p use_p;
7234 gimple *use_stmt;
7235 if (code == CONSTRUCTOR)
7237 if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
7238 || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
7239 CONSTRUCTOR_NELTS (rhs))
7240 || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
7241 || uniform_vector_p (rhs))
7242 continue;
7244 unsigned j;
7245 tree val;
7246 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
7247 if (TREE_CODE (val) != SSA_NAME
7248 || !bb_vinfo->lookup_def (val))
7249 break;
7250 if (j != CONSTRUCTOR_NELTS (rhs))
7251 continue;
7253 vec<stmt_vec_info> roots = vNULL;
7254 roots.safe_push (bb_vinfo->lookup_stmt (assign));
7255 vec<stmt_vec_info> stmts;
7256 stmts.create (CONSTRUCTOR_NELTS (rhs));
7257 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
7258 stmts.quick_push
7259 (vect_stmt_to_vectorize (bb_vinfo->lookup_def (val)));
7260 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
7261 stmts, roots));
7263 else if (code == BIT_INSERT_EXPR
7264 && VECTOR_TYPE_P (TREE_TYPE (rhs))
7265 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).is_constant ()
7266 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).to_constant () > 1
7267 && integer_zerop (gimple_assign_rhs3 (assign))
7268 && useless_type_conversion_p
7269 (TREE_TYPE (TREE_TYPE (rhs)),
7270 TREE_TYPE (gimple_assign_rhs2 (assign)))
7271 && bb_vinfo->lookup_def (gimple_assign_rhs2 (assign)))
7273 /* We start matching on an insert to lane zero, but since the
7274 inserts need not be ordered we have to search both
7275 the def and the use chains. */
7276 tree vectype = TREE_TYPE (rhs);
7277 unsigned nlanes = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
7278 auto_vec<std::pair<unsigned, tree> > lane_defs (nlanes);
7279 auto_sbitmap lanes (nlanes);
7280 bitmap_clear (lanes);
7281 bitmap_set_bit (lanes, 0);
7282 tree def = gimple_assign_lhs (assign);
7283 lane_defs.quick_push
7284 (std::make_pair (0, gimple_assign_rhs2 (assign)));
7285 unsigned lanes_found = 1;
7286 /* Start with the use chains, the last stmt will be the root. */
7287 stmt_vec_info last = bb_vinfo->lookup_stmt (assign);
7288 vec<stmt_vec_info> roots = vNULL;
7289 roots.safe_push (last);
7292 use_operand_p use_p;
7293 gimple *use_stmt;
7294 if (!single_imm_use (def, &use_p, &use_stmt))
7295 break;
7296 unsigned this_lane;
7297 if (!bb_vinfo->lookup_stmt (use_stmt)
7298 || !vect_slp_is_lane_insert (use_stmt, def, &this_lane)
7299 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (use_stmt)))
7300 break;
7301 if (bitmap_bit_p (lanes, this_lane))
7302 break;
7303 lanes_found++;
7304 bitmap_set_bit (lanes, this_lane);
7305 gassign *use_ass = as_a <gassign *> (use_stmt);
7306 lane_defs.quick_push (std::make_pair
7307 (this_lane, gimple_assign_rhs2 (use_ass)));
7308 last = bb_vinfo->lookup_stmt (use_ass);
7309 roots.safe_push (last);
7310 def = gimple_assign_lhs (use_ass);
7312 while (lanes_found < nlanes);
7313 if (roots.length () > 1)
7314 std::swap (roots[0], roots[roots.length () - 1]);
7315 if (lanes_found < nlanes)
7317 /* Now search the def chain. */
7318 def = gimple_assign_rhs1 (assign);
7321 if (TREE_CODE (def) != SSA_NAME
7322 || !has_single_use (def))
7323 break;
7324 gimple *def_stmt = SSA_NAME_DEF_STMT (def);
7325 unsigned this_lane;
7326 if (!bb_vinfo->lookup_stmt (def_stmt)
7327 || !vect_slp_is_lane_insert (def_stmt,
7328 NULL_TREE, &this_lane)
7329 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (def_stmt)))
7330 break;
7331 if (bitmap_bit_p (lanes, this_lane))
7332 break;
7333 lanes_found++;
7334 bitmap_set_bit (lanes, this_lane);
7335 lane_defs.quick_push (std::make_pair
7336 (this_lane,
7337 gimple_assign_rhs2 (def_stmt)));
7338 roots.safe_push (bb_vinfo->lookup_stmt (def_stmt));
7339 def = gimple_assign_rhs1 (def_stmt);
7341 while (lanes_found < nlanes);
7343 if (lanes_found == nlanes)
7345 /* Sort lane_defs by lane index and register the root. */
7346 lane_defs.qsort (vld_cmp);
7347 vec<stmt_vec_info> stmts;
7348 stmts.create (nlanes);
7349 for (unsigned i = 0; i < nlanes; ++i)
7350 stmts.quick_push (bb_vinfo->lookup_def (lane_defs[i].second));
7351 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
7352 stmts, roots));
7354 else
7355 roots.release ();
7357 else if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
7358 && (associative_tree_code (code) || code == MINUS_EXPR)
7359 /* ??? This pessimizes a two-element reduction. PR54400.
7360 ??? In-order reduction could be handled if we only
7361 traverse one operand chain in vect_slp_linearize_chain. */
7362 && !needs_fold_left_reduction_p (TREE_TYPE (rhs), code)
7363 /* Ops with constants at the tail can be stripped here. */
7364 && TREE_CODE (rhs) == SSA_NAME
7365 && TREE_CODE (gimple_assign_rhs2 (assign)) == SSA_NAME
7366 /* Should be the chain end. */
7367 && (!single_imm_use (gimple_assign_lhs (assign),
7368 &use_p, &use_stmt)
7369 || !is_gimple_assign (use_stmt)
7370 || (gimple_assign_rhs_code (use_stmt) != code
7371 && ((code != PLUS_EXPR && code != MINUS_EXPR)
7372 || (gimple_assign_rhs_code (use_stmt)
7373 != (code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR))))))
7375 /* We start the match at the end of a possible association
7376 chain. */
7377 auto_vec<chain_op_t> chain;
7378 auto_vec<std::pair<tree_code, gimple *> > worklist;
7379 auto_vec<gimple *> chain_stmts;
7380 gimple *code_stmt = NULL, *alt_code_stmt = NULL;
7381 if (code == MINUS_EXPR)
7382 code = PLUS_EXPR;
7383 internal_fn reduc_fn;
7384 if (!reduction_fn_for_scalar_code (code, &reduc_fn)
7385 || reduc_fn == IFN_LAST)
7386 continue;
7387 vect_slp_linearize_chain (bb_vinfo, worklist, chain, code, assign,
7388 /* ??? */
7389 code_stmt, alt_code_stmt, &chain_stmts);
7390 if (chain.length () > 1)
7392 /* Sort the chain according to def_type and operation. */
7393 chain.sort (dt_sort_cmp, bb_vinfo);
7394 /* ??? Now we'd want to strip externals and constants
7395 but record those to be handled in the epilogue. */
7396 /* ??? For now do not allow mixing ops or externs/constants. */
7397 bool invalid = false;
7398 unsigned remain_cnt = 0;
7399 for (unsigned i = 0; i < chain.length (); ++i)
7401 if (chain[i].code != code)
7403 invalid = true;
7404 break;
7406 if (chain[i].dt != vect_internal_def)
7407 remain_cnt++;
7409 if (!invalid && chain.length () - remain_cnt > 1)
7411 vec<stmt_vec_info> stmts;
7412 vec<tree> remain = vNULL;
7413 stmts.create (chain.length ());
7414 if (remain_cnt > 0)
7415 remain.create (remain_cnt);
7416 for (unsigned i = 0; i < chain.length (); ++i)
7418 if (chain[i].dt == vect_internal_def)
7419 stmts.quick_push (bb_vinfo->lookup_def (chain[i].op));
7420 else
7421 remain.quick_push (chain[i].op);
7423 vec<stmt_vec_info> roots;
7424 roots.create (chain_stmts.length ());
7425 for (unsigned i = 0; i < chain_stmts.length (); ++i)
7426 roots.quick_push (bb_vinfo->lookup_stmt (chain_stmts[i]));
7427 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_bb_reduc,
7428 stmts, roots, remain));
7435 /* Walk the grouped store chains and replace entries with their
7436 pattern variant if any. */
7438 static void
7439 vect_fixup_store_groups_with_patterns (vec_info *vinfo)
7441 stmt_vec_info first_element;
7442 unsigned i;
7444 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
7446 /* We also have CTORs in this array. */
7447 if (!STMT_VINFO_GROUPED_ACCESS (first_element))
7448 continue;
7449 if (STMT_VINFO_IN_PATTERN_P (first_element))
7451 stmt_vec_info orig = first_element;
7452 first_element = STMT_VINFO_RELATED_STMT (first_element);
7453 DR_GROUP_FIRST_ELEMENT (first_element) = first_element;
7454 DR_GROUP_SIZE (first_element) = DR_GROUP_SIZE (orig);
7455 DR_GROUP_GAP (first_element) = DR_GROUP_GAP (orig);
7456 DR_GROUP_NEXT_ELEMENT (first_element) = DR_GROUP_NEXT_ELEMENT (orig);
7457 vinfo->grouped_stores[i] = first_element;
7459 stmt_vec_info prev = first_element;
7460 while (DR_GROUP_NEXT_ELEMENT (prev))
7462 stmt_vec_info elt = DR_GROUP_NEXT_ELEMENT (prev);
7463 if (STMT_VINFO_IN_PATTERN_P (elt))
7465 stmt_vec_info orig = elt;
7466 elt = STMT_VINFO_RELATED_STMT (elt);
7467 DR_GROUP_NEXT_ELEMENT (prev) = elt;
7468 DR_GROUP_GAP (elt) = DR_GROUP_GAP (orig);
7469 DR_GROUP_NEXT_ELEMENT (elt) = DR_GROUP_NEXT_ELEMENT (orig);
7471 DR_GROUP_FIRST_ELEMENT (elt) = first_element;
7472 prev = elt;
7477 /* Check if the region described by BB_VINFO can be vectorized, returning
7478 true if so. When returning false, set FATAL to true if the same failure
7479 would prevent vectorization at other vector sizes, false if it is still
7480 worth trying other sizes. N_STMTS is the number of statements in the
7481 region. */
7483 static bool
7484 vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
7485 vec<int> *dataref_groups)
7487 DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
7489 slp_instance instance;
7490 int i;
7491 poly_uint64 min_vf = 2;
7493 /* The first group of checks is independent of the vector size. */
7494 fatal = true;
7496 /* Analyze the data references. */
7498 if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
7500 if (dump_enabled_p ())
7501 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7502 "not vectorized: unhandled data-ref in basic "
7503 "block.\n");
7504 return false;
7507 if (!vect_analyze_data_ref_accesses (bb_vinfo, dataref_groups))
7509 if (dump_enabled_p ())
7510 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7511 "not vectorized: unhandled data access in "
7512 "basic block.\n");
7513 return false;
7516 vect_slp_check_for_roots (bb_vinfo);
7518 /* If there are no grouped stores and no constructors in the region
7519 there is no need to continue with pattern recog as vect_analyze_slp
7520 will fail anyway. */
7521 if (bb_vinfo->grouped_stores.is_empty ()
7522 && bb_vinfo->roots.is_empty ())
7524 if (dump_enabled_p ())
7525 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7526 "not vectorized: no grouped stores in "
7527 "basic block.\n");
7528 return false;
7531 /* The rest of the analysis below depends on the vector size in some way. */
7532 fatal = false;
7534 vect_pattern_recog (bb_vinfo);
7536 /* Update store groups from pattern processing. */
7537 vect_fixup_store_groups_with_patterns (bb_vinfo);
7539 /* Check the SLP opportunities in the basic block, analyze and build SLP
7540 trees. */
7541 if (!vect_analyze_slp (bb_vinfo, n_stmts))
7543 if (dump_enabled_p ())
7545 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7546 "Failed to SLP the basic block.\n");
7547 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7548 "not vectorized: failed to find SLP opportunities "
7549 "in basic block.\n");
7551 return false;
7554 /* Optimize permutations. */
7555 vect_optimize_slp (bb_vinfo);
7557 /* Gather the loads reachable from the SLP graph entries. */
7558 vect_gather_slp_loads (bb_vinfo);
7560 vect_record_base_alignments (bb_vinfo);
7562 /* Analyze and verify the alignment of data references and the
7563 dependence in the SLP instances. */
7564 for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
7566 vect_location = instance->location ();
7567 if (! vect_slp_analyze_instance_alignment (bb_vinfo, instance)
7568 || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
7570 slp_tree node = SLP_INSTANCE_TREE (instance);
7571 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
7572 if (dump_enabled_p ())
7573 dump_printf_loc (MSG_NOTE, vect_location,
7574 "removing SLP instance operations starting from: %G",
7575 stmt_info->stmt);
7576 vect_free_slp_instance (instance);
7577 BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
7578 continue;
7581 /* Mark all the statements that we want to vectorize as pure SLP and
7582 relevant. */
7583 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
7584 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
7585 unsigned j;
7586 stmt_vec_info root;
7587 /* Likewise consider instance root stmts as vectorized. */
7588 FOR_EACH_VEC_ELT (SLP_INSTANCE_ROOT_STMTS (instance), j, root)
7589 STMT_SLP_TYPE (root) = pure_slp;
7591 i++;
7593 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
7594 return false;
7596 if (!vect_slp_analyze_operations (bb_vinfo))
7598 if (dump_enabled_p ())
7599 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7600 "not vectorized: bad operation in basic block.\n");
7601 return false;
7604 vect_bb_partition_graph (bb_vinfo);
7606 return true;
7609 /* Subroutine of vect_slp_bbs. Try to vectorize the statements for all
7610 basic blocks in BBS, returning true on success.
7611 The region has N_STMTS statements and has the datarefs given by DATAREFS. */
7613 static bool
7614 vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
7615 vec<int> *dataref_groups, unsigned int n_stmts,
7616 loop_p orig_loop)
7618 bb_vec_info bb_vinfo;
7619 auto_vector_modes vector_modes;
7621 /* Autodetect first vector size we try. */
7622 machine_mode next_vector_mode = VOIDmode;
7623 targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
7624 unsigned int mode_i = 0;
7626 vec_info_shared shared;
7628 machine_mode autodetected_vector_mode = VOIDmode;
7629 while (1)
7631 bool vectorized = false;
7632 bool fatal = false;
7633 bb_vinfo = new _bb_vec_info (bbs, &shared);
7635 bool first_time_p = shared.datarefs.is_empty ();
7636 BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
7637 if (first_time_p)
7638 bb_vinfo->shared->save_datarefs ();
7639 else
7640 bb_vinfo->shared->check_datarefs ();
7641 bb_vinfo->vector_mode = next_vector_mode;
7643 if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups))
7645 if (dump_enabled_p ())
7647 dump_printf_loc (MSG_NOTE, vect_location,
7648 "***** Analysis succeeded with vector mode"
7649 " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
7650 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
7653 bb_vinfo->shared->check_datarefs ();
7655 bool force_clear = false;
7656 auto_vec<slp_instance> profitable_subgraphs;
7657 for (slp_instance instance : BB_VINFO_SLP_INSTANCES (bb_vinfo))
7659 if (instance->subgraph_entries.is_empty ())
7660 continue;
7662 dump_user_location_t saved_vect_location = vect_location;
7663 vect_location = instance->location ();
7664 if (!unlimited_cost_model (NULL)
7665 && !vect_bb_vectorization_profitable_p
7666 (bb_vinfo, instance->subgraph_entries, orig_loop))
7668 if (dump_enabled_p ())
7669 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7670 "not vectorized: vectorization is not "
7671 "profitable.\n");
7672 vect_location = saved_vect_location;
7673 continue;
7676 vect_location = saved_vect_location;
7677 if (!dbg_cnt (vect_slp))
7679 force_clear = true;
7680 continue;
7683 profitable_subgraphs.safe_push (instance);
7686 /* When we're vectorizing an if-converted loop body make sure
7687 we vectorized all if-converted code. */
7688 if ((!profitable_subgraphs.is_empty () || force_clear) && orig_loop)
7690 gcc_assert (bb_vinfo->bbs.length () == 1);
7691 for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[0]);
7692 !gsi_end_p (gsi); gsi_next (&gsi))
7694 /* The costing above left us with DCEable vectorized scalar
7695 stmts having the visited flag set on profitable
7696 subgraphs. Do the delayed clearing of the flag here. */
7697 if (gimple_visited_p (gsi_stmt (gsi)))
7699 gimple_set_visited (gsi_stmt (gsi), false);
7700 continue;
7702 if (flag_vect_cost_model == VECT_COST_MODEL_UNLIMITED)
7703 continue;
7705 if (gassign *ass = dyn_cast <gassign *> (gsi_stmt (gsi)))
7706 if (gimple_assign_rhs_code (ass) == COND_EXPR)
7708 if (!profitable_subgraphs.is_empty ()
7709 && dump_enabled_p ())
7710 dump_printf_loc (MSG_NOTE, vect_location,
7711 "not profitable because of "
7712 "unprofitable if-converted scalar "
7713 "code\n");
7714 profitable_subgraphs.truncate (0);
7719 /* Finally schedule the profitable subgraphs. */
7720 for (slp_instance instance : profitable_subgraphs)
7722 if (!vectorized && dump_enabled_p ())
7723 dump_printf_loc (MSG_NOTE, vect_location,
7724 "Basic block will be vectorized "
7725 "using SLP\n");
7726 vectorized = true;
7728 /* Dump before scheduling as store vectorization will remove
7729 the original stores and mess with the instance tree
7730 so querying its location will eventually ICE. */
7731 if (flag_checking)
7732 for (slp_instance sub : instance->subgraph_entries)
7733 gcc_assert (SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (sub)));
7734 unsigned HOST_WIDE_INT bytes;
7735 if (dump_enabled_p ())
7736 for (slp_instance sub : instance->subgraph_entries)
7738 tree vtype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (sub));
7739 if (GET_MODE_SIZE (TYPE_MODE (vtype)).is_constant (&bytes))
7740 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
7741 sub->location (),
7742 "basic block part vectorized using %wu "
7743 "byte vectors\n", bytes);
7744 else
7745 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
7746 sub->location (),
7747 "basic block part vectorized using "
7748 "variable length vectors\n");
7751 dump_user_location_t saved_vect_location = vect_location;
7752 vect_location = instance->location ();
7754 vect_schedule_slp (bb_vinfo, instance->subgraph_entries);
7756 vect_location = saved_vect_location;
7759 else
7761 if (dump_enabled_p ())
7762 dump_printf_loc (MSG_NOTE, vect_location,
7763 "***** Analysis failed with vector mode %s\n",
7764 GET_MODE_NAME (bb_vinfo->vector_mode));
7767 if (mode_i == 0)
7768 autodetected_vector_mode = bb_vinfo->vector_mode;
7770 if (!fatal)
7771 while (mode_i < vector_modes.length ()
7772 && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
7774 if (dump_enabled_p ())
7775 dump_printf_loc (MSG_NOTE, vect_location,
7776 "***** The result for vector mode %s would"
7777 " be the same\n",
7778 GET_MODE_NAME (vector_modes[mode_i]));
7779 mode_i += 1;
7782 delete bb_vinfo;
7784 if (mode_i < vector_modes.length ()
7785 && VECTOR_MODE_P (autodetected_vector_mode)
7786 && (related_vector_mode (vector_modes[mode_i],
7787 GET_MODE_INNER (autodetected_vector_mode))
7788 == autodetected_vector_mode)
7789 && (related_vector_mode (autodetected_vector_mode,
7790 GET_MODE_INNER (vector_modes[mode_i]))
7791 == vector_modes[mode_i]))
7793 if (dump_enabled_p ())
7794 dump_printf_loc (MSG_NOTE, vect_location,
7795 "***** Skipping vector mode %s, which would"
7796 " repeat the analysis for %s\n",
7797 GET_MODE_NAME (vector_modes[mode_i]),
7798 GET_MODE_NAME (autodetected_vector_mode));
7799 mode_i += 1;
7802 if (vectorized
7803 || mode_i == vector_modes.length ()
7804 || autodetected_vector_mode == VOIDmode
7805 /* If vect_slp_analyze_bb_1 signaled that analysis for all
7806 vector sizes will fail do not bother iterating. */
7807 || fatal)
7808 return vectorized;
7810 /* Try the next biggest vector size. */
7811 next_vector_mode = vector_modes[mode_i++];
7812 if (dump_enabled_p ())
7813 dump_printf_loc (MSG_NOTE, vect_location,
7814 "***** Re-trying analysis with vector mode %s\n",
7815 GET_MODE_NAME (next_vector_mode));
7820 /* Main entry for the BB vectorizer. Analyze and transform BBS, returns
7821 true if anything in the basic-block was vectorized. */
7823 static bool
7824 vect_slp_bbs (const vec<basic_block> &bbs, loop_p orig_loop)
7826 vec<data_reference_p> datarefs = vNULL;
7827 auto_vec<int> dataref_groups;
7828 int insns = 0;
7829 int current_group = 0;
7831 for (unsigned i = 0; i < bbs.length (); i++)
7833 basic_block bb = bbs[i];
7834 for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
7835 gsi_next (&gsi))
7837 gimple *stmt = gsi_stmt (gsi);
7838 if (is_gimple_debug (stmt))
7839 continue;
7841 insns++;
7843 if (gimple_location (stmt) != UNKNOWN_LOCATION)
7844 vect_location = stmt;
7846 if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs,
7847 &dataref_groups, current_group))
7848 ++current_group;
7850 /* New BBs always start a new DR group. */
7851 ++current_group;
7854 return vect_slp_region (bbs, datarefs, &dataref_groups, insns, orig_loop);
7857 /* Special entry for the BB vectorizer. Analyze and transform a single
7858 if-converted BB with ORIG_LOOP's body being the not-if-converted
7859 representation. Returns true if anything in the basic-block was
7860 vectorized. */
7862 bool
7863 vect_slp_if_converted_bb (basic_block bb, loop_p orig_loop)
7865 auto_vec<basic_block> bbs;
7866 bbs.safe_push (bb);
7867 return vect_slp_bbs (bbs, orig_loop);
7870 /* Main entry for the BB vectorizer. Analyze and transform the basic
7871 blocks of FUN, returning true if anything was vectorized. */
7873 bool
7874 vect_slp_function (function *fun)
7876 bool r = false;
7877 int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (fun));
7878 auto_bitmap exit_bbs;
7879 bitmap_set_bit (exit_bbs, EXIT_BLOCK);
7880 edge entry = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (fun));
7881 unsigned n = rev_post_order_and_mark_dfs_back_seme (fun, entry, exit_bbs,
7882 true, rpo, NULL);
7884 /* For the moment split the function into pieces to avoid making
7885 the iteration on the vector mode moot. Split at points we know
7886 to not handle well which is CFG merges (SLP discovery doesn't
7887 handle non-loop-header PHIs) and loop exits. Since pattern
7888 recog requires reverse iteration to visit uses before defs
7889 simply chop RPO into pieces. */
7890 auto_vec<basic_block> bbs;
7891 for (unsigned i = 0; i < n; i++)
7893 basic_block bb = BASIC_BLOCK_FOR_FN (fun, rpo[i]);
7894 bool split = false;
7896 /* Split when a BB is not dominated by the first block. */
7897 if (!bbs.is_empty ()
7898 && !dominated_by_p (CDI_DOMINATORS, bb, bbs[0]))
7900 if (dump_enabled_p ())
7901 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7902 "splitting region at dominance boundary bb%d\n",
7903 bb->index);
7904 split = true;
7906 /* Split when the loop determined by the first block
7907 is exited. This is because we eventually insert
7908 invariants at region begin. */
7909 else if (!bbs.is_empty ()
7910 && bbs[0]->loop_father != bb->loop_father
7911 && !flow_loop_nested_p (bbs[0]->loop_father, bb->loop_father))
7913 if (dump_enabled_p ())
7914 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7915 "splitting region at loop %d exit at bb%d\n",
7916 bbs[0]->loop_father->num, bb->index);
7917 split = true;
7919 else if (!bbs.is_empty ()
7920 && bb->loop_father->header == bb
7921 && bb->loop_father->dont_vectorize)
7923 if (dump_enabled_p ())
7924 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7925 "splitting region at dont-vectorize loop %d "
7926 "entry at bb%d\n",
7927 bb->loop_father->num, bb->index);
7928 split = true;
7931 if (split && !bbs.is_empty ())
7933 r |= vect_slp_bbs (bbs, NULL);
7934 bbs.truncate (0);
7937 if (bbs.is_empty ())
7939 /* We need to be able to insert at the head of the region, which
7940 we cannot do for a region starting with a returns-twice call. */
7941 if (gcall *first = safe_dyn_cast <gcall *> (first_stmt (bb)))
7942 if (gimple_call_flags (first) & ECF_RETURNS_TWICE)
7944 if (dump_enabled_p ())
7945 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7946 "skipping bb%d as start of region as it "
7947 "starts with returns-twice call\n",
7948 bb->index);
7949 continue;
7951 /* If the loop this BB belongs to is marked as not to be vectorized
7952 honor that also for BB vectorization. */
7953 if (bb->loop_father->dont_vectorize)
7954 continue;
7957 bbs.safe_push (bb);
7959 /* When the stmt ending this block defines a value, inserting
7960 a vector containing its definition after it would require
7961 inserting on edges. Avoid this for now. */
7962 if (gimple *last = *gsi_last_bb (bb))
7963 if (gimple_get_lhs (last)
7964 && is_ctrl_altering_stmt (last))
7966 if (dump_enabled_p ())
7967 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7968 "splitting region at control altering "
7969 "definition %G", last);
7970 r |= vect_slp_bbs (bbs, NULL);
7971 bbs.truncate (0);
7975 if (!bbs.is_empty ())
7976 r |= vect_slp_bbs (bbs, NULL);
7978 free (rpo);
7980 return r;
7983 /* Build a variable-length vector in which the elements in ELTS are repeated
7984 to fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
7985 RESULTS and add any new instructions to SEQ.
7987 The approach we use is:
7989 (1) Find a vector mode VM with integer elements of mode IM.
7991 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
7992 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
7993 from small vectors to IM.
7995 (3) Duplicate each ELTS'[I] into a vector of mode VM.
7997 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
7998 correct byte contents.
8000 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
8002 We try to find the largest IM for which this sequence works, in order
8003 to cut down on the number of interleaves. */
8005 void
8006 duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
8007 const vec<tree> &elts, unsigned int nresults,
8008 vec<tree> &results)
8010 unsigned int nelts = elts.length ();
8011 tree element_type = TREE_TYPE (vector_type);
8013 /* (1) Find a vector mode VM with integer elements of mode IM. */
8014 unsigned int nvectors = 1;
8015 tree new_vector_type;
8016 tree permutes[2];
8017 if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
8018 &nvectors, &new_vector_type,
8019 permutes))
8020 gcc_unreachable ();
8022 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
8023 unsigned int partial_nelts = nelts / nvectors;
8024 tree partial_vector_type = build_vector_type (element_type, partial_nelts);
8026 tree_vector_builder partial_elts;
8027 auto_vec<tree, 32> pieces (nvectors * 2);
8028 pieces.quick_grow_cleared (nvectors * 2);
8029 for (unsigned int i = 0; i < nvectors; ++i)
8031 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
8032 ELTS' has mode IM. */
8033 partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
8034 for (unsigned int j = 0; j < partial_nelts; ++j)
8035 partial_elts.quick_push (elts[i * partial_nelts + j]);
8036 tree t = gimple_build_vector (seq, &partial_elts);
8037 t = gimple_build (seq, VIEW_CONVERT_EXPR,
8038 TREE_TYPE (new_vector_type), t);
8040 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
8041 pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
8044 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
8045 correct byte contents.
8047 Conceptually, we need to repeat the following operation log2(nvectors)
8048 times, where hi_start = nvectors / 2:
8050 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
8051 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
8053 However, if each input repeats every N elements and the VF is
8054 a multiple of N * 2, the HI result is the same as the LO result.
8055 This will be true for the first N1 iterations of the outer loop,
8056 followed by N2 iterations for which both the LO and HI results
8057 are needed. I.e.:
8059 N1 + N2 = log2(nvectors)
8061 Each "N1 iteration" doubles the number of redundant vectors and the
8062 effect of the process as a whole is to have a sequence of nvectors/2**N1
8063 vectors that repeats 2**N1 times. Rather than generate these redundant
8064 vectors, we halve the number of vectors for each N1 iteration. */
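/* A worked instance of the counts above (illustrative only): for
   nvectors == 4 there are log2(4) == 2 iterations.  With N1 == 1 and
   N2 == 1 the first iteration keeps only the LO results, halving the
   number of vectors from 4 to 2, and the second computes both LO and
   HI, ending with nvectors/2**N1 == 2 distinct vectors that
   conceptually repeat 2**N1 == 2 times.  */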
8065 unsigned int in_start = 0;
8066 unsigned int out_start = nvectors;
8067 unsigned int new_nvectors = nvectors;
8068 for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
8070 unsigned int hi_start = new_nvectors / 2;
8071 unsigned int out_i = 0;
8072 for (unsigned int in_i = 0; in_i < new_nvectors; ++in_i)
8074 if ((in_i & 1) != 0
8075 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
8076 2 * in_repeat))
8077 continue;
8079 tree output = make_ssa_name (new_vector_type);
8080 tree input1 = pieces[in_start + (in_i / 2)];
8081 tree input2 = pieces[in_start + (in_i / 2) + hi_start];
8082 gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
8083 input1, input2,
8084 permutes[in_i & 1]);
8085 gimple_seq_add_stmt (seq, stmt);
8086 pieces[out_start + out_i] = output;
8087 out_i += 1;
8089 std::swap (in_start, out_start);
8090 new_nvectors = out_i;
8093 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
8094 results.reserve (nresults);
8095 for (unsigned int i = 0; i < nresults; ++i)
8096 if (i < new_nvectors)
8097 results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
8098 pieces[in_start + i]));
8099 else
8100 results.quick_push (results[i - new_nvectors]);
8104 /* For constant and loop invariant defs in OP_NODE this function creates
8105 vector defs that will be used in the vectorized stmts and stores them
8106 to SLP_TREE_VEC_DEFS of OP_NODE. */
8108 static void
8109 vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
8111 unsigned HOST_WIDE_INT nunits;
8112 tree vec_cst;
8113 unsigned j, number_of_places_left_in_vector;
8114 tree vector_type;
8115 tree vop;
8116 int group_size = op_node->ops.length ();
8117 unsigned int vec_num, i;
8118 unsigned number_of_copies = 1;
8119 bool constant_p;
8120 gimple_seq ctor_seq = NULL;
8121 auto_vec<tree, 16> permute_results;
8123 /* We always want SLP_TREE_VECTYPE (op_node) here correctly set. */
8124 vector_type = SLP_TREE_VECTYPE (op_node);
8126 unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node);
8127 SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
8128 auto_vec<tree> voprnds (number_of_vectors);
8130 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
8131 created vectors. It is greater than 1 if unrolling is performed.
8133 For example, we have two scalar operands, s1 and s2 (e.g., group of
8134 strided accesses of size two), while NUNITS is four (i.e., four scalars
8135 of this type can be packed in a vector). The output vector will contain
8136 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
8137 will be 2).
8139 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
8140 containing the operands.
8142 For example, NUNITS is four as before, and the group size is 8
8143 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
8144 {s5, s6, s7, s8}. */
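/* The same examples in terms of the computation below (assuming
   SLP_TREE_NUMBER_OF_VEC_STMTS is 1 resp. 2): with NUNITS == 4,
   GROUP_SIZE == 2 and one vector to create, NUMBER_OF_COPIES is
   4 * 1 / 2 == 2, giving {s1, s2, s1, s2}; with GROUP_SIZE == 8 and two
   vectors it is 4 * 2 / 8 == 1, giving {s1, s2, s3, s4} and
   {s5, s6, s7, s8}.  */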
8146 /* When using duplicate_and_interleave, we just need one element for
8147 each scalar statement. */
8148 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
8149 nunits = group_size;
8151 number_of_copies = nunits * number_of_vectors / group_size;
8153 number_of_places_left_in_vector = nunits;
8154 constant_p = true;
8155 tree_vector_builder elts (vector_type, nunits, 1);
8156 elts.quick_grow (nunits);
8157 stmt_vec_info insert_after = NULL;
8158 for (j = 0; j < number_of_copies; j++)
8160 tree op;
8161 for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
8163 /* Create 'vect_ = {op0,op1,...,opn}'. */
8164 number_of_places_left_in_vector--;
8165 tree orig_op = op;
8166 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
8168 if (CONSTANT_CLASS_P (op))
8170 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
8172 /* Can't use VIEW_CONVERT_EXPR for booleans because
8173 of possibly different sizes of scalar value and
8174 vector element. */
8175 if (integer_zerop (op))
8176 op = build_int_cst (TREE_TYPE (vector_type), 0);
8177 else if (integer_onep (op))
8178 op = build_all_ones_cst (TREE_TYPE (vector_type));
8179 else
8180 gcc_unreachable ();
8182 else
8183 op = fold_unary (VIEW_CONVERT_EXPR,
8184 TREE_TYPE (vector_type), op);
8185 gcc_assert (op && CONSTANT_CLASS_P (op));
8187 else
8189 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
8190 gimple *init_stmt;
8191 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
8193 tree true_val
8194 = build_all_ones_cst (TREE_TYPE (vector_type));
8195 tree false_val
8196 = build_zero_cst (TREE_TYPE (vector_type));
8197 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
8198 init_stmt = gimple_build_assign (new_temp, COND_EXPR,
8199 op, true_val,
8200 false_val);
8202 else
8204 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
8205 op);
8206 init_stmt
8207 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
8208 op);
8210 gimple_seq_add_stmt (&ctor_seq, init_stmt);
8211 op = new_temp;
8214 elts[number_of_places_left_in_vector] = op;
8215 if (!CONSTANT_CLASS_P (op))
8216 constant_p = false;
8217 /* For BB vectorization we have to compute an insert location
8218 when a def is inside the analyzed region since we cannot
8219 simply insert at the BB start in this case. */
8220 stmt_vec_info opdef;
8221 if (TREE_CODE (orig_op) == SSA_NAME
8222 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
8223 && is_a <bb_vec_info> (vinfo)
8224 && (opdef = vinfo->lookup_def (orig_op)))
8226 if (!insert_after)
8227 insert_after = opdef;
8228 else
8229 insert_after = get_later_stmt (insert_after, opdef);
8232 if (number_of_places_left_in_vector == 0)
8234 if (constant_p
8235 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
8236 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
8237 vec_cst = gimple_build_vector (&ctor_seq, &elts);
8238 else
8240 if (permute_results.is_empty ())
8241 duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
8242 elts, number_of_vectors,
8243 permute_results);
8244 vec_cst = permute_results[number_of_vectors - j - 1];
8246 if (!gimple_seq_empty_p (ctor_seq))
8248 if (insert_after)
8250 gimple_stmt_iterator gsi;
8251 if (gimple_code (insert_after->stmt) == GIMPLE_PHI)
8253 gsi = gsi_after_labels (gimple_bb (insert_after->stmt));
8254 gsi_insert_seq_before (&gsi, ctor_seq,
8255 GSI_CONTINUE_LINKING);
8257 else if (!stmt_ends_bb_p (insert_after->stmt))
8259 gsi = gsi_for_stmt (insert_after->stmt);
8260 gsi_insert_seq_after (&gsi, ctor_seq,
8261 GSI_CONTINUE_LINKING);
8263 else
8265 /* When we want to insert after a def whose
8266 defining stmt throws, insert on the fallthru
8267 edge. */
8268 edge e = find_fallthru_edge
8269 (gimple_bb (insert_after->stmt)->succs);
8270 basic_block new_bb
8271 = gsi_insert_seq_on_edge_immediate (e, ctor_seq);
8272 gcc_assert (!new_bb);
8275 else
8276 vinfo->insert_seq_on_entry (NULL, ctor_seq);
8277 ctor_seq = NULL;
8279 voprnds.quick_push (vec_cst);
8280 insert_after = NULL;
8281 number_of_places_left_in_vector = nunits;
8282 constant_p = true;
8283 elts.new_vector (vector_type, nunits, 1);
8284 elts.quick_grow (nunits);
8289 /* Since the vectors are created in the reverse order, we should invert
8290 them. */
8291 vec_num = voprnds.length ();
8292 for (j = vec_num; j != 0; j--)
8294 vop = voprnds[j - 1];
8295 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
8298 /* When VF is greater than the unrolling factor needed for the SLP
8299 group of stmts, the NUMBER_OF_VECTORS to be created is greater than
8300 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
8301 to replicate the vectors. */
8302 while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ())
8303 for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (i, &vop) && i < vec_num;
8304 i++)
8305 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
8308 /* Get the Ith vectorized definition from SLP_NODE. */
8310 tree
8311 vect_get_slp_vect_def (slp_tree slp_node, unsigned i)
8313 return SLP_TREE_VEC_DEFS (slp_node)[i];
8316 /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS. */
8318 void
8319 vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs)
8321 vec_defs->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
8322 vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node));
8325 /* Get N vectorized definitions for SLP_NODE. */
8327 void
8328 vect_get_slp_defs (vec_info *,
8329 slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
8331 if (n == -1U)
8332 n = SLP_TREE_CHILDREN (slp_node).length ();
8334 for (unsigned i = 0; i < n; ++i)
8336 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
8337 vec<tree> vec_defs = vNULL;
8338 vect_get_slp_defs (child, &vec_defs);
8339 vec_oprnds->quick_push (vec_defs);
8343 /* A subroutine of vect_transform_slp_perm_load with two extra arguments:
8344 - PERM gives the permutation that the caller wants to use for NODE,
8345 which might be different from SLP_LOAD_PERMUTATION.
8346 - DUMP_P controls whether the function dumps information. */
8348 static bool
8349 vect_transform_slp_perm_load_1 (vec_info *vinfo, slp_tree node,
8350 load_permutation_t &perm,
8351 const vec<tree> &dr_chain,
8352 gimple_stmt_iterator *gsi, poly_uint64 vf,
8353 bool analyze_only, bool dump_p,
8354 unsigned *n_perms, unsigned int *n_loads,
8355 bool dce_chain)
8357 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
8358 int vec_index = 0;
8359 tree vectype = SLP_TREE_VECTYPE (node);
8360 unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
8361 unsigned int mask_element;
8362 unsigned dr_group_size;
8363 machine_mode mode;
8365 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
8366 dr_group_size = 1;
8367 else
8369 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
8370 dr_group_size = DR_GROUP_SIZE (stmt_info);
8373 mode = TYPE_MODE (vectype);
8374 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8375 unsigned int nstmts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
8377 /* Initialize the vect stmts of NODE to properly insert the generated
8378 stmts later. */
8379 if (! analyze_only)
8380 for (unsigned i = SLP_TREE_VEC_DEFS (node).length (); i < nstmts; i++)
8381 SLP_TREE_VEC_DEFS (node).quick_push (NULL_TREE);
8383 /* Generate permutation masks for every NODE. Number of masks for each NODE
8384 is equal to GROUP_SIZE.
8385 E.g., we have a group of three nodes with three loads from the same
8386 location in each node, and the vector size is 4. I.e., we have an
8387 a0b0c0a1b1c1... sequence and we need to create the following vectors:
8388 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
8389 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
8392 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
8393 The last mask is illegal since we assume two operands for permute
8394 operation, and the mask element values can't be outside that range.
8395 Hence, the last mask must be converted into {2,5,5,5}.
8396 For the first two permutations we need the first and the second input
8397 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
8398 we need the second and the third vectors: {b1,c1,a2,b2} and
8399 {c2,a3,b3,c3}. */
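/* Restating the last mask of the example above in the code's terms
   (const_nunits == 4): element 6 lives in vector 6 / 4 == 1 at lane
   6 % 4 == 2 and element 9 in vector 9 / 4 == 2 at lane 1; vector 1
   becomes the first permute input (mask value 2) and vector 2 the
   second (mask value 1 + 4 == 5), which yields {2,5,5,5}.  */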
8401 int vect_stmts_counter = 0;
8402 unsigned int index = 0;
8403 int first_vec_index = -1;
8404 int second_vec_index = -1;
8405 bool noop_p = true;
8406 *n_perms = 0;
8408 vec_perm_builder mask;
8409 unsigned int nelts_to_build;
8410 unsigned int nvectors_per_build;
8411 unsigned int in_nlanes;
8412 bool repeating_p = (group_size == dr_group_size
8413 && multiple_p (nunits, group_size));
8414 if (repeating_p)
8416 /* A single vector contains a whole number of copies of the node, so:
8417 (a) all permutes can use the same mask; and
8418 (b) the permutes only need a single vector input. */
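/* Illustrative example: for GROUP_SIZE == DR_GROUP_SIZE == 2 and a
   permutation swapping the two lanes, the loop over NELTS_TO_BUILD below
   fills the GROUP_SIZE * 3 == 6 encoded elements {1,0, 3,2, 5,4} and the
   stepped encoding extends that to however many lanes one vector holds.  */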
8419 mask.new_vector (nunits, group_size, 3);
8420 nelts_to_build = mask.encoded_nelts ();
8421 /* It's possible to obtain zero nstmts during analyze_only, so make
8422 it at least one to ensure the later computation for n_perms
8423 proceeds. */
8424 nvectors_per_build = nstmts > 0 ? nstmts : 1;
8425 in_nlanes = dr_group_size * 3;
8427 else
8429 /* We need to construct a separate mask for each vector statement. */
8430 unsigned HOST_WIDE_INT const_nunits, const_vf;
8431 if (!nunits.is_constant (&const_nunits)
8432 || !vf.is_constant (&const_vf))
8433 return false;
8434 mask.new_vector (const_nunits, const_nunits, 1);
8435 nelts_to_build = const_vf * group_size;
8436 nvectors_per_build = 1;
8437 in_nlanes = const_vf * dr_group_size;
8439 auto_sbitmap used_in_lanes (in_nlanes);
8440 bitmap_clear (used_in_lanes);
8441 auto_bitmap used_defs;
8443 unsigned int count = mask.encoded_nelts ();
8444 mask.quick_grow (count);
8445 vec_perm_indices indices;
8447 for (unsigned int j = 0; j < nelts_to_build; j++)
8449 unsigned int iter_num = j / group_size;
8450 unsigned int stmt_num = j % group_size;
8451 unsigned int i = (iter_num * dr_group_size + perm[stmt_num]);
8452 bitmap_set_bit (used_in_lanes, i);
8453 if (repeating_p)
8455 first_vec_index = 0;
8456 mask_element = i;
8458 else
8460 /* Enforced before the loop when !repeating_p. */
8461 unsigned int const_nunits = nunits.to_constant ();
8462 vec_index = i / const_nunits;
8463 mask_element = i % const_nunits;
8464 if (vec_index == first_vec_index
8465 || first_vec_index == -1)
8467 first_vec_index = vec_index;
8469 else if (vec_index == second_vec_index
8470 || second_vec_index == -1)
8472 second_vec_index = vec_index;
8473 mask_element += const_nunits;
8475 else
8477 if (dump_p)
8478 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8479 "permutation requires at "
8480 "least three vectors %G",
8481 stmt_info->stmt);
8482 gcc_assert (analyze_only);
8483 return false;
8486 gcc_assert (mask_element < 2 * const_nunits);
8489 if (mask_element != index)
8490 noop_p = false;
8491 mask[index++] = mask_element;
8493 if (index == count)
8495 if (!noop_p)
8497 indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
8498 if (!can_vec_perm_const_p (mode, mode, indices))
8500 if (dump_p)
8502 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8503 "unsupported vect permute { ");
8504 for (i = 0; i < count; ++i)
8506 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
8507 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
8509 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
8511 gcc_assert (analyze_only);
8512 return false;
8515 tree mask_vec = NULL_TREE;
8516 if (!analyze_only)
8517 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
8519 if (second_vec_index == -1)
8520 second_vec_index = first_vec_index;
8522 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
8524 ++*n_perms;
8525 if (analyze_only)
8526 continue;
8527 /* Generate the permute statement if necessary. */
8528 tree first_vec = dr_chain[first_vec_index + ri];
8529 tree second_vec = dr_chain[second_vec_index + ri];
8530 gassign *stmt = as_a<gassign *> (stmt_info->stmt);
8531 tree perm_dest
8532 = vect_create_destination_var (gimple_assign_lhs (stmt),
8533 vectype);
8534 perm_dest = make_ssa_name (perm_dest);
8535 gimple *perm_stmt
8536 = gimple_build_assign (perm_dest, VEC_PERM_EXPR, first_vec,
8537 second_vec, mask_vec);
8538 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
8539 gsi);
8540 if (dce_chain)
8542 bitmap_set_bit (used_defs, first_vec_index + ri);
8543 bitmap_set_bit (used_defs, second_vec_index + ri);
8546 /* Store the vector statement in NODE. */
8547 SLP_TREE_VEC_DEFS (node)[vect_stmts_counter++] = perm_dest;
8550 else if (!analyze_only)
8552 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
8554 tree first_vec = dr_chain[first_vec_index + ri];
8555 /* If mask was NULL_TREE generate the requested
8556 identity transform. */
8557 if (dce_chain)
8558 bitmap_set_bit (used_defs, first_vec_index + ri);
8560 /* Store the vector statement in NODE. */
8561 SLP_TREE_VEC_DEFS (node)[vect_stmts_counter++] = first_vec;
8565 index = 0;
8566 first_vec_index = -1;
8567 second_vec_index = -1;
8568 noop_p = true;
8572 if (n_loads)
8574 if (repeating_p)
8575 *n_loads = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
8576 else
8578 /* Enforced above when !repeating_p. */
8579 unsigned int const_nunits = nunits.to_constant ();
8580 *n_loads = 0;
8581 bool load_seen = false;
8582 for (unsigned i = 0; i < in_nlanes; ++i)
8584 if (i % const_nunits == 0)
8586 if (load_seen)
8587 *n_loads += 1;
8588 load_seen = false;
8590 if (bitmap_bit_p (used_in_lanes, i))
8591 load_seen = true;
8593 if (load_seen)
8594 *n_loads += 1;
8598 if (dce_chain)
8599 for (unsigned i = 0; i < dr_chain.length (); ++i)
8600 if (!bitmap_bit_p (used_defs, i))
8602 gimple *stmt = SSA_NAME_DEF_STMT (dr_chain[i]);
8603 gimple_stmt_iterator rgsi = gsi_for_stmt (stmt);
8604 gsi_remove (&rgsi, true);
8605 release_defs (stmt);
8608 return true;
8611 /* Generate vector permute statements from a list of loads in DR_CHAIN.
8612 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
8613 permute statements for the SLP node NODE. Store the number of vector
8614 permute instructions in *N_PERMS and the number of vector load
8615 instructions in *N_LOADS. If DCE_CHAIN is true, remove all definitions
8616 that were not needed. */
8618 bool
8619 vect_transform_slp_perm_load (vec_info *vinfo,
8620 slp_tree node, const vec<tree> &dr_chain,
8621 gimple_stmt_iterator *gsi, poly_uint64 vf,
8622 bool analyze_only, unsigned *n_perms,
8623 unsigned int *n_loads, bool dce_chain)
8625 return vect_transform_slp_perm_load_1 (vinfo, node,
8626 SLP_TREE_LOAD_PERMUTATION (node),
8627 dr_chain, gsi, vf, analyze_only,
8628 dump_enabled_p (), n_perms, n_loads,
8629 dce_chain);
8632 /* Produce the next vector result for SLP permutation NODE by adding a vector
8633 statement at GSI. If MASK_VEC is nonnull, add:
8635 <new SSA name> = VEC_PERM_EXPR <FIRST_DEF, SECOND_DEF, MASK_VEC>
8637 otherwise add:
8639 <new SSA name> = FIRST_DEF. */
8641 static void
8642 vect_add_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
8643 slp_tree node, tree first_def, tree second_def,
8644 tree mask_vec, poly_uint64 identity_offset)
8646 tree vectype = SLP_TREE_VECTYPE (node);
8648 /* ??? We SLP match existing vector element extracts but
8649 allow punning which we need to re-instantiate at uses
8650 but have no good way of explicitly representing. */
8651 if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)), TYPE_SIZE (vectype))
8652 && !types_compatible_p (TREE_TYPE (first_def), vectype))
8654 gassign *conv_stmt
8655 = gimple_build_assign (make_ssa_name (vectype),
8656 build1 (VIEW_CONVERT_EXPR, vectype, first_def));
8657 vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
8658 first_def = gimple_assign_lhs (conv_stmt);
8660 gassign *perm_stmt;
8661 tree perm_dest = make_ssa_name (vectype);
8662 if (mask_vec)
8664 if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)),
8665 TYPE_SIZE (vectype))
8666 && !types_compatible_p (TREE_TYPE (second_def), vectype))
8668 gassign *conv_stmt
8669 = gimple_build_assign (make_ssa_name (vectype),
8670 build1 (VIEW_CONVERT_EXPR,
8671 vectype, second_def));
8672 vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
8673 second_def = gimple_assign_lhs (conv_stmt);
8675 perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
8676 first_def, second_def,
8677 mask_vec);
8679 else if (!types_compatible_p (TREE_TYPE (first_def), vectype))
8681 /* For identity permutes we still need to handle the case
8682 of offsetted extracts or concats. */
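/* Illustrative example (the types are assumptions): extracting a V4SI
   vectype from a V8SI FIRST_DEF with IDENTITY_OFFSET == 4 and 32-bit
   elements emits BIT_FIELD_REF <first_def, 128, 128>, i.e. the upper
   half; in the opposite direction, when vectype has twice the lanes of
   FIRST_DEF, a two-element CONSTRUCTOR concatenating FIRST_DEF and
   SECOND_DEF is emitted instead.  */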
8683 unsigned HOST_WIDE_INT c;
8684 auto first_def_nunits
8685 = TYPE_VECTOR_SUBPARTS (TREE_TYPE (first_def));
8686 if (known_le (TYPE_VECTOR_SUBPARTS (vectype), first_def_nunits))
8688 unsigned HOST_WIDE_INT elsz
8689 = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (TREE_TYPE (first_def))));
8690 tree lowpart = build3 (BIT_FIELD_REF, vectype, first_def,
8691 TYPE_SIZE (vectype),
8692 bitsize_int (identity_offset * elsz));
8693 perm_stmt = gimple_build_assign (perm_dest, lowpart);
8695 else if (constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype),
8696 first_def_nunits, &c) && c == 2)
8698 tree ctor = build_constructor_va (vectype, 2, NULL_TREE, first_def,
8699 NULL_TREE, second_def);
8700 perm_stmt = gimple_build_assign (perm_dest, ctor);
8702 else
8703 gcc_unreachable ();
8705 else
8707 /* We need a copy here in case the def was external. */
8708 perm_stmt = gimple_build_assign (perm_dest, first_def);
8710 vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
8711 /* Store the vector statement in NODE. */
8712 node->push_vec_def (perm_stmt);
8715 /* Subroutine of vectorizable_slp_permutation. Check whether the target
8716 can perform permutation PERM on the (1 or 2) input nodes in CHILDREN.
8717 If GSI is nonnull, emit the permutation there.
8719 When GSI is null, the only purpose of NODE is to give properties
8720 of the result, such as the vector type and number of SLP lanes.
8721 The node does not need to be a VEC_PERM_EXPR.
8723 If the target supports the operation, return the number of individual
8724 VEC_PERM_EXPRs needed, otherwise return -1. Print information to the
8725 dump file if DUMP_P is true. */
8727 static int
8728 vectorizable_slp_permutation_1 (vec_info *vinfo, gimple_stmt_iterator *gsi,
8729 slp_tree node, lane_permutation_t &perm,
8730 vec<slp_tree> &children, bool dump_p)
8732 tree vectype = SLP_TREE_VECTYPE (node);
8734 /* ??? We currently only support inputs that all have the same vector
8735 type, while the SLP IL should really do a concat + select and thus
8736 accept arbitrary mismatches. */
8737 slp_tree child;
8738 unsigned i;
8739 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8740 bool repeating_p = multiple_p (nunits, SLP_TREE_LANES (node));
8741 tree op_vectype = NULL_TREE;
8742 FOR_EACH_VEC_ELT (children, i, child)
8743 if (SLP_TREE_VECTYPE (child))
8745 op_vectype = SLP_TREE_VECTYPE (child);
8746 break;
8748 if (!op_vectype)
8749 op_vectype = vectype;
8750 FOR_EACH_VEC_ELT (children, i, child)
8752 if ((SLP_TREE_DEF_TYPE (child) != vect_internal_def
8753 && !vect_maybe_update_slp_op_vectype (child, op_vectype))
8754 || !types_compatible_p (SLP_TREE_VECTYPE (child), op_vectype)
8755 || !types_compatible_p (TREE_TYPE (vectype), TREE_TYPE (op_vectype)))
8757 if (dump_p)
8758 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8759 "Unsupported vector types in lane permutation\n");
8760 return -1;
8762 if (SLP_TREE_LANES (child) != SLP_TREE_LANES (node))
8763 repeating_p = false;
8766 gcc_assert (perm.length () == SLP_TREE_LANES (node));
8767 if (dump_p)
8769 dump_printf_loc (MSG_NOTE, vect_location,
8770 "vectorizing permutation");
8771 for (unsigned i = 0; i < perm.length (); ++i)
8772 dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
8773 if (repeating_p)
8774 dump_printf (MSG_NOTE, " (repeat %d)\n", SLP_TREE_LANES (node));
8775 dump_printf (MSG_NOTE, "\n");
8778 /* REPEATING_P is true if every output vector is guaranteed to use the
8779 same permute vector. We can handle that case for both variable-length
8780 and constant-length vectors, but we only handle other cases for
8781 constant-length vectors.
8783 Set:
8785 - NPATTERNS and NELTS_PER_PATTERN to the encoding of the permute
8786 mask vector that we want to build.
8788 - NCOPIES to the number of copies of PERM that we need in order
8789 to build the necessary permute mask vectors.
8791 - NOUTPUTS_PER_MASK to the number of output vectors we want to create
8792 for each permute mask vector. This is only relevant when GSI is
8793 nonnull. */
8794 uint64_t npatterns;
8795 unsigned nelts_per_pattern;
8796 uint64_t ncopies;
8797 unsigned noutputs_per_mask;
8798 if (repeating_p)
8800 /* We need a single permute mask vector that has the form:
8802 { X1, ..., Xn, X1 + n, ..., Xn + n, X1 + 2n, ..., Xn + 2n, ... }
8804 In other words, the original n-element permute in PERM is
8805 "unrolled" to fill a full vector. The stepped vector encoding
8806 that we use for permutes requires 3n elements. */
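/* For illustration (hypothetical values): a 2-lane node with
PERM = { op0[1], op0[0] } gives npatterns = 2 and nelts_per_pattern = 3,
i.e. the encoded elements { 1, 0, 3, 2, 5, 4 }, which for, say,
8-element vectors expand to the full mask { 1, 0, 3, 2, 5, 4, 7, 6 }. */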
8807 npatterns = SLP_TREE_LANES (node);
8808 nelts_per_pattern = ncopies = 3;
8809 noutputs_per_mask = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
8811 else
8813 /* Calculate every element of every permute mask vector explicitly,
8814 instead of relying on the pattern described above. */
8815 if (!nunits.is_constant (&npatterns))
8816 return -1;
8817 nelts_per_pattern = ncopies = 1;
8818 if (loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo))
8819 if (!LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&ncopies))
8820 return -1;
8821 noutputs_per_mask = 1;
8823 unsigned olanes = ncopies * SLP_TREE_LANES (node);
8824 gcc_assert (repeating_p || multiple_p (olanes, nunits));
8826 /* Compute the { { SLP operand, vector index }, lane } permutation sequence
8827 from the { SLP operand, scalar lane } permutation as recorded in the
8828 SLP node as an intermediate step. This part should already work
8829 with SLP children with an arbitrary number of lanes. */
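/* As an illustration (hypothetical numbers): with V4SI child vectors a
reference to scalar lane 6 of a child (active_lane + p.second == 6)
becomes { { child, 1 }, 2 }, i.e. lane 2 of that child's second
vector def. */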
8830 auto_vec<std::pair<std::pair<unsigned, unsigned>, unsigned> > vperm;
8831 auto_vec<unsigned> active_lane;
8832 vperm.create (olanes);
8833 active_lane.safe_grow_cleared (children.length (), true);
8834 for (unsigned i = 0; i < ncopies; ++i)
8836 for (unsigned pi = 0; pi < perm.length (); ++pi)
8838 std::pair<unsigned, unsigned> p = perm[pi];
8839 tree vtype = SLP_TREE_VECTYPE (children[p.first]);
8840 if (repeating_p)
8841 vperm.quick_push ({{p.first, 0}, p.second + active_lane[p.first]});
8842 else
8844 /* We checked above that the vectors are constant-length. */
8845 unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype).to_constant ();
8846 unsigned vi = (active_lane[p.first] + p.second) / vnunits;
8847 unsigned vl = (active_lane[p.first] + p.second) % vnunits;
8848 vperm.quick_push ({{p.first, vi}, vl});
8851 /* Advance to the next group. */
8852 for (unsigned j = 0; j < children.length (); ++j)
8853 active_lane[j] += SLP_TREE_LANES (children[j]);
8856 if (dump_p)
8858 dump_printf_loc (MSG_NOTE, vect_location,
8859 "vectorizing permutation");
8860 for (unsigned i = 0; i < perm.length (); ++i)
8861 dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
8862 if (repeating_p)
8863 dump_printf (MSG_NOTE, " (repeat %d)\n", SLP_TREE_LANES (node));
8864 dump_printf (MSG_NOTE, "\n");
8865 dump_printf_loc (MSG_NOTE, vect_location, "as");
8866 for (unsigned i = 0; i < vperm.length (); ++i)
8868 if (i != 0
8869 && (repeating_p
8870 ? multiple_p (i, npatterns)
8871 : multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype))))
8872 dump_printf (MSG_NOTE, ",");
8873 dump_printf (MSG_NOTE, " vops%u[%u][%u]",
8874 vperm[i].first.first, vperm[i].first.second,
8875 vperm[i].second);
8877 dump_printf (MSG_NOTE, "\n");
8880 /* We can only handle two-vector permutes; everything else should
8881 be lowered on the SLP level. The following is closely inspired
8882 by vect_transform_slp_perm_load and is supposed to eventually
8883 replace it.
8884 ??? As an intermediate step, do code-gen in the SLP tree
8885 representation somehow? */
8886 std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U);
8887 std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U);
8888 unsigned int index = 0;
8889 poly_uint64 mask_element;
8890 vec_perm_builder mask;
8891 mask.new_vector (nunits, npatterns, nelts_per_pattern);
8892 unsigned int count = mask.encoded_nelts ();
8893 mask.quick_grow (count);
8894 vec_perm_indices indices;
8895 unsigned nperms = 0;
8896 for (unsigned i = 0; i < vperm.length (); ++i)
8898 mask_element = vperm[i].second;
8899 if (first_vec.first == -1U
8900 || first_vec == vperm[i].first)
8901 first_vec = vperm[i].first;
8902 else if (second_vec.first == -1U
8903 || second_vec == vperm[i].first)
8905 second_vec = vperm[i].first;
8906 mask_element += nunits;
8908 else
8910 if (dump_p)
8911 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8912 "permutation requires at "
8913 "least three vectors\n");
8914 gcc_assert (!gsi);
8915 return -1;
8918 mask[index++] = mask_element;
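/* Once all encoded elements of the current mask are filled in, check
whether the permute is supported (or is an identity) and, when GSI
is nonnull, emit the permute statements for it. */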
8920 if (index == count)
8922 indices.new_vector (mask, second_vec.first == -1U ? 1 : 2,
8923 TYPE_VECTOR_SUBPARTS (op_vectype));
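/* An identity mask is the series { k * nunits, k * nunits + 1, ... } for
some constant k, e.g. { 0, 1, 2, 3 } or { 4, 5, 6, 7 } with nunits == 4
(illustrative values); such a mask just selects an aligned chunk of the
input and is handled by vect_add_slp_permutation as an extract, concat
or plain copy rather than a VEC_PERM_EXPR. */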
8924 bool identity_p = (indices.series_p (0, 1, mask[0], 1)
8925 && constant_multiple_p (mask[0], nunits));
8926 machine_mode vmode = TYPE_MODE (vectype);
8927 machine_mode op_vmode = TYPE_MODE (op_vectype);
8928 unsigned HOST_WIDE_INT c;
8929 if ((!identity_p
8930 && !can_vec_perm_const_p (vmode, op_vmode, indices))
8931 || (identity_p
8932 && !known_le (nunits,
8933 TYPE_VECTOR_SUBPARTS (op_vectype))
8934 && (!constant_multiple_p (nunits,
8935 TYPE_VECTOR_SUBPARTS (op_vectype),
8936 &c) || c != 2)))
8938 if (dump_p)
8940 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
8941 vect_location,
8942 "unsupported vect permute { ");
8943 for (i = 0; i < count; ++i)
8945 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
8946 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
8948 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
8950 gcc_assert (!gsi);
8951 return -1;
8954 if (!identity_p)
8955 nperms++;
8956 if (gsi)
8958 if (second_vec.first == -1U)
8959 second_vec = first_vec;
8961 slp_tree
8962 first_node = children[first_vec.first],
8963 second_node = children[second_vec.first];
8965 tree mask_vec = NULL_TREE;
8966 if (!identity_p)
8967 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
8969 for (unsigned int vi = 0; vi < noutputs_per_mask; ++vi)
8971 tree first_def
8972 = vect_get_slp_vect_def (first_node,
8973 first_vec.second + vi);
8974 tree second_def
8975 = vect_get_slp_vect_def (second_node,
8976 second_vec.second + vi);
8977 vect_add_slp_permutation (vinfo, gsi, node, first_def,
8978 second_def, mask_vec, mask[0]);
8982 index = 0;
8983 first_vec = std::make_pair (-1U, -1U);
8984 second_vec = std::make_pair (-1U, -1U);
8988 return nperms;
8991 /* Vectorize the SLP permutations in NODE as specified
8992 in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
8993 child number and lane number.
8994 Interleaving of two two-lane two-child SLP subtrees (not supported):
8995 [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
8996 A blend of two four-lane two-child SLP subtrees:
8997 [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
8998 Highpart of a four-lane one-child SLP subtree (not supported):
8999 [ { 0, 2 }, { 0, 3 } ]
9000 Currently only a subset of these is supported by the code generation below. */
9002 static bool
9003 vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
9004 slp_tree node, stmt_vector_for_cost *cost_vec)
9006 tree vectype = SLP_TREE_VECTYPE (node);
9007 lane_permutation_t &perm = SLP_TREE_LANE_PERMUTATION (node);
9008 int nperms = vectorizable_slp_permutation_1 (vinfo, gsi, node, perm,
9009 SLP_TREE_CHILDREN (node),
9010 dump_enabled_p ());
9011 if (nperms < 0)
9012 return false;
9014 if (!gsi)
9015 record_stmt_cost (cost_vec, nperms, vec_perm, node, vectype, 0, vect_body);
9017 return true;
9020 /* Vectorize SLP NODE. */
9022 static void
9023 vect_schedule_slp_node (vec_info *vinfo,
9024 slp_tree node, slp_instance instance)
9026 gimple_stmt_iterator si;
9027 int i;
9028 slp_tree child;
9030 /* For existing vectors there's nothing to do. */
9031 if (SLP_TREE_DEF_TYPE (node) == vect_external_def
9032 && SLP_TREE_VEC_DEFS (node).exists ())
9033 return;
9035 gcc_assert (SLP_TREE_VEC_DEFS (node).is_empty ());
9037 /* Vectorize externals and constants. */
9038 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
9039 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
9041 /* ??? vectorizable_shift can end up using a scalar operand which is
9042 currently denoted as !SLP_TREE_VECTYPE. No need to vectorize the
9043 node in this case. */
9044 if (!SLP_TREE_VECTYPE (node))
9045 return;
9047 vect_create_constant_vectors (vinfo, node);
9048 return;
9051 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
9053 gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
9054 SLP_TREE_VEC_DEFS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
9056 if (dump_enabled_p ())
9057 dump_printf_loc (MSG_NOTE, vect_location,
9058 "------>vectorizing SLP node starting from: %G",
9059 stmt_info->stmt);
9061 if (STMT_VINFO_DATA_REF (stmt_info)
9062 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
9064 /* Vectorized loads go before the first scalar load to make it
9065 ready early; vectorized stores go before the last scalar
9066 stmt, which is where all uses are ready. */
9067 stmt_vec_info last_stmt_info = NULL;
9068 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
9069 last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
9070 else /* DR_IS_WRITE */
9071 last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
9072 si = gsi_for_stmt (last_stmt_info->stmt);
9074 else if ((STMT_VINFO_TYPE (stmt_info) == cycle_phi_info_type
9075 || STMT_VINFO_TYPE (stmt_info) == induc_vec_info_type
9076 || STMT_VINFO_TYPE (stmt_info) == phi_info_type)
9077 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
9079 /* For PHI node vectorization we do not use the insertion iterator. */
9080 si = gsi_none ();
9082 else
9084 /* Emit other stmts after the children's vectorized defs, which is
9085 the earliest possible place. */
9086 gimple *last_stmt = NULL;
9087 if (auto loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
9088 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
9089 || LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
9091 /* But avoid scheduling internal defs outside of the loop when
9092 we might have only implicitly tracked loop mask/len defs. */
9093 gimple_stmt_iterator si
9094 = gsi_after_labels (LOOP_VINFO_LOOP (loop_vinfo)->header);
9095 last_stmt = *si;
9097 bool seen_vector_def = false;
9098 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
9099 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
9101 /* For fold-left reductions we are retaining the scalar
9102 reduction PHI but we still have SLP_TREE_NUM_VEC_STMTS
9103 set so the representation isn't perfect. Resort to the
9104 last scalar def here. */
9105 if (SLP_TREE_VEC_DEFS (child).is_empty ())
9107 gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child))
9108 == cycle_phi_info_type);
9109 gphi *phi = as_a <gphi *>
9110 (vect_find_last_scalar_stmt_in_slp (child)->stmt);
9111 if (!last_stmt
9112 || vect_stmt_dominates_stmt_p (last_stmt, phi))
9113 last_stmt = phi;
9115 /* We emit all vectorized stmts of a child in the same place, so
9116 the last def in SLP_TREE_VEC_DEFS is also the last one emitted.
9117 ??? Unless a load permutation is applied and it chooses to
9118 re-use an earlier generated load. */
9119 unsigned j;
9120 tree vdef;
9121 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
9123 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
9124 if (!last_stmt
9125 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
9126 last_stmt = vstmt;
9129 else if (!SLP_TREE_VECTYPE (child))
9131 /* For externals without a vector type we use the scalar defs unvectorized, so look at all of them. */
9132 unsigned j;
9133 tree def;
9134 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child), j, def)
9135 if (TREE_CODE (def) == SSA_NAME
9136 && !SSA_NAME_IS_DEFAULT_DEF (def))
9138 gimple *stmt = SSA_NAME_DEF_STMT (def);
9139 if (!last_stmt
9140 || vect_stmt_dominates_stmt_p (last_stmt, stmt))
9141 last_stmt = stmt;
9144 else
9146 /* For externals we have to look at all defs since their
9147 insertion place is decided per vector. But beware
9148 of pre-existing vectors where we need to make sure
9149 we do not insert before the region boundary. */
9150 if (SLP_TREE_SCALAR_OPS (child).is_empty ()
9151 && !vinfo->lookup_def (SLP_TREE_VEC_DEFS (child)[0]))
9152 seen_vector_def = true;
9153 else
9155 unsigned j;
9156 tree vdef;
9157 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
9158 if (TREE_CODE (vdef) == SSA_NAME
9159 && !SSA_NAME_IS_DEFAULT_DEF (vdef))
9161 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
9162 if (!last_stmt
9163 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
9164 last_stmt = vstmt;
9168 /* This can happen when all children are pre-existing vectors or
9169 constants. */
9170 if (!last_stmt)
9171 last_stmt = vect_find_first_scalar_stmt_in_slp (node)->stmt;
9172 if (!last_stmt)
9174 gcc_assert (seen_vector_def);
9175 si = gsi_after_labels (as_a <bb_vec_info> (vinfo)->bbs[0]);
9177 else if (is_ctrl_altering_stmt (last_stmt))
9179 /* We split regions to vectorize at control-altering stmts
9180 with a definition, so this must be an external which
9181 we can insert at the start of the region. */
9182 si = gsi_after_labels (as_a <bb_vec_info> (vinfo)->bbs[0]);
9184 else if (is_a <bb_vec_info> (vinfo)
9185 && gimple_bb (last_stmt) != gimple_bb (stmt_info->stmt)
9186 && gimple_could_trap_p (stmt_info->stmt))
9188 /* We've constrained possibly trapping operations to all come
9189 from the same basic-block; if vectorized defs would allow earlier
9190 scheduling, still force the vectorized stmts into the original block.
9191 This is only necessary for BB vectorization since for loop vect
9192 all operations are in a single BB and scalar-stmt-based
9193 placement doesn't play well with epilogue vectorization. */
9194 gcc_assert (dominated_by_p (CDI_DOMINATORS,
9195 gimple_bb (stmt_info->stmt),
9196 gimple_bb (last_stmt)));
9197 si = gsi_after_labels (gimple_bb (stmt_info->stmt));
9199 else if (is_a <gphi *> (last_stmt))
9200 si = gsi_after_labels (gimple_bb (last_stmt));
9201 else
9203 si = gsi_for_stmt (last_stmt);
9204 gsi_next (&si);
9208 /* Handle purely internal nodes. */
9209 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
9211 /* ??? The transform kind is stored in STMT_VINFO_TYPE which might
9212 be shared with different SLP nodes (but usually it's the same
9213 operation, apart from the case where the stmt is only there to denote
9214 the actual scalar lane defs ...). So do not call vect_transform_stmt
9215 but open-code it here (partly). */
9216 bool done = vectorizable_slp_permutation (vinfo, &si, node, NULL);
9217 gcc_assert (done);
9218 stmt_vec_info slp_stmt_info;
9219 unsigned int i;
9220 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info)
9221 if (STMT_VINFO_LIVE_P (slp_stmt_info))
9223 done = vectorizable_live_operation (vinfo, slp_stmt_info, node,
9224 instance, i, true, NULL);
9225 gcc_assert (done);
9228 else
9229 vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
9232 /* Replace the scalar calls in SLP node NODE with assignments setting their
9233 lhs to zero. For loop vectorization this is done in vectorizable_call, but
9234 for SLP it needs to be deferred until the end of vect_schedule_slp, because
9235 multiple SLP instances may refer to the same scalar stmt. */
9237 static void
9238 vect_remove_slp_scalar_calls (vec_info *vinfo,
9239 slp_tree node, hash_set<slp_tree> &visited)
9241 gimple *new_stmt;
9242 gimple_stmt_iterator gsi;
9243 int i;
9244 slp_tree child;
9245 tree lhs;
9246 stmt_vec_info stmt_info;
9248 if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
9249 return;
9251 if (visited.add (node))
9252 return;
9254 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
9255 vect_remove_slp_scalar_calls (vinfo, child, visited);
9257 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
9259 gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
9260 if (!stmt || gimple_bb (stmt) == NULL)
9261 continue;
9262 if (is_pattern_stmt_p (stmt_info)
9263 || !PURE_SLP_STMT (stmt_info))
9264 continue;
9265 lhs = gimple_call_lhs (stmt);
9266 if (lhs)
9267 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
9268 else
9270 new_stmt = gimple_build_nop ();
9271 unlink_stmt_vdef (stmt_info->stmt);
9273 gsi = gsi_for_stmt (stmt);
9274 vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
9275 if (lhs)
9276 SSA_NAME_DEF_STMT (lhs) = new_stmt;
9280 static void
9281 vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
9283 hash_set<slp_tree> visited;
9284 vect_remove_slp_scalar_calls (vinfo, node, visited);
9287 /* Vectorize the instance root. */
9289 void
9290 vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
9292 gassign *rstmt = NULL;
9294 if (instance->kind == slp_inst_kind_ctor)
9296 if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1)
9298 tree vect_lhs = SLP_TREE_VEC_DEFS (node)[0];
9299 tree root_lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
9300 if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
9301 TREE_TYPE (vect_lhs)))
9302 vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
9303 vect_lhs);
9304 rstmt = gimple_build_assign (root_lhs, vect_lhs);
9306 else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1)
9308 int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
9309 tree child_def;
9310 int j;
9311 vec<constructor_elt, va_gc> *v;
9312 vec_alloc (v, nelts);
9314 /* A CTOR can handle V16HI composition from VNx8HI so we
9315 do not need to convert vector elements if the types
9316 do not match. */
9317 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (node), j, child_def)
9318 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, child_def);
9319 tree lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
9320 tree rtype
9321 = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmts[0]->stmt));
9322 tree r_constructor = build_constructor (rtype, v);
9323 rstmt = gimple_build_assign (lhs, r_constructor);
9326 else if (instance->kind == slp_inst_kind_bb_reduc)
9328 /* Largely inspired by reduction chain epilogue handling in
9329 vect_create_epilog_for_reduction. */
9330 vec<tree> vec_defs = vNULL;
9331 vect_get_slp_defs (node, &vec_defs);
9332 enum tree_code reduc_code
9333 = gimple_assign_rhs_code (instance->root_stmts[0]->stmt);
9334 /* ??? We actually have to reflect signs somewhere. */
9335 if (reduc_code == MINUS_EXPR)
9336 reduc_code = PLUS_EXPR;
9337 gimple_seq epilogue = NULL;
9338 /* We may end up with more than one vector result; reduce them
9339 to one vector. */
9340 tree vec_def = vec_defs[0];
9341 tree vectype = TREE_TYPE (vec_def);
9342 tree compute_vectype = vectype;
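/* Reassociating a signed reduction with undefined overflow below could
introduce intermediate overflows, so compute in the corresponding
unsigned type and convert the result back. */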
9343 bool pun_for_overflow_p = (ANY_INTEGRAL_TYPE_P (vectype)
9344 && TYPE_OVERFLOW_UNDEFINED (vectype)
9345 && operation_can_overflow (reduc_code));
9346 if (pun_for_overflow_p)
9348 compute_vectype = unsigned_type_for (vectype);
9349 vec_def = gimple_build (&epilogue, VIEW_CONVERT_EXPR,
9350 compute_vectype, vec_def);
9352 for (unsigned i = 1; i < vec_defs.length (); ++i)
9354 tree def = vec_defs[i];
9355 if (pun_for_overflow_p)
9356 def = gimple_build (&epilogue, VIEW_CONVERT_EXPR,
9357 compute_vectype, def);
9358 vec_def = gimple_build (&epilogue, reduc_code, compute_vectype,
9359 vec_def, def);
9361 vec_defs.release ();
9362 /* ??? Support other schemes than direct internal fn. */
9363 internal_fn reduc_fn;
9364 if (!reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
9365 || reduc_fn == IFN_LAST)
9366 gcc_unreachable ();
9367 tree scalar_def = gimple_build (&epilogue, as_combined_fn (reduc_fn),
9368 TREE_TYPE (compute_vectype), vec_def);
9369 if (!SLP_INSTANCE_REMAIN_DEFS (instance).is_empty ())
9371 tree rem_def = NULL_TREE;
9372 for (auto def : SLP_INSTANCE_REMAIN_DEFS (instance))
9374 def = gimple_convert (&epilogue, TREE_TYPE (scalar_def), def);
9375 if (!rem_def)
9376 rem_def = def;
9377 else
9378 rem_def = gimple_build (&epilogue, reduc_code,
9379 TREE_TYPE (scalar_def),
9380 rem_def, def);
9382 scalar_def = gimple_build (&epilogue, reduc_code,
9383 TREE_TYPE (scalar_def),
9384 scalar_def, rem_def);
9386 scalar_def = gimple_convert (&epilogue,
9387 TREE_TYPE (vectype), scalar_def);
9388 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
9389 gsi_insert_seq_before (&rgsi, epilogue, GSI_SAME_STMT);
9390 gimple_assign_set_rhs_from_tree (&rgsi, scalar_def);
9391 update_stmt (gsi_stmt (rgsi));
9392 return;
9394 else
9395 gcc_unreachable ();
9397 gcc_assert (rstmt);
9399 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
9400 gsi_replace (&rgsi, rstmt, true);
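/* Per-node state for the Tarjan-style SCC walk in vect_schedule_scc:
DFS is the DFS pre-order number, LOWLINK the smallest DFS number
reachable from this node, and ON_STACK whether the node is still on
the SCC stack. */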
9403 struct slp_scc_info
9405 bool on_stack;
9406 int dfs;
9407 int lowlink;
9410 /* Schedule the SLP INSTANCE doing a DFS walk and collecting SCCs. */
9412 static void
9413 vect_schedule_scc (vec_info *vinfo, slp_tree node, slp_instance instance,
9414 hash_map<slp_tree, slp_scc_info> &scc_info,
9415 int &maxdfs, vec<slp_tree> &stack)
9417 bool existed_p;
9418 slp_scc_info *info = &scc_info.get_or_insert (node, &existed_p);
9419 gcc_assert (!existed_p);
9420 info->dfs = maxdfs;
9421 info->lowlink = maxdfs;
9422 maxdfs++;
9424 /* Leaf. */
9425 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
9427 info->on_stack = false;
9428 vect_schedule_slp_node (vinfo, node, instance);
9429 return;
9432 info->on_stack = true;
9433 stack.safe_push (node);
9435 unsigned i;
9436 slp_tree child;
9437 /* DFS recurse. */
9438 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
9440 if (!child)
9441 continue;
9442 slp_scc_info *child_info = scc_info.get (child);
9443 if (!child_info)
9445 vect_schedule_scc (vinfo, child, instance, scc_info, maxdfs, stack);
9446 /* Recursion might have re-allocated the hash map, invalidating the pointers; re-fetch them. */
9447 info = scc_info.get (node);
9448 child_info = scc_info.get (child);
9449 info->lowlink = MIN (info->lowlink, child_info->lowlink);
9451 else if (child_info->on_stack)
9452 info->lowlink = MIN (info->lowlink, child_info->dfs);
9454 if (info->lowlink != info->dfs)
9455 return;
9457 auto_vec<slp_tree, 4> phis_to_fixup;
9459 /* Singleton. */
9460 if (stack.last () == node)
9462 stack.pop ();
9463 info->on_stack = false;
9464 vect_schedule_slp_node (vinfo, node, instance);
9465 if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
9466 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (node)->stmt))
9467 phis_to_fixup.quick_push (node);
9469 else
9471 /* SCC. */
9472 int last_idx = stack.length () - 1;
9473 while (stack[last_idx] != node)
9474 last_idx--;
9475 /* We can break the cycle at PHIs that have at least one
9476 code-generated child. Then we could re-start the DFS walk until
9477 all nodes in the SCC are covered (we might have new entries
9478 for only back-reachable nodes). But it's simpler to just
9479 iterate and schedule those that are ready. */
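/* For example, in a reduction cycle the PHI becomes ready first (its
backedge argument is filled in by the fixup loop below), which in
turn makes the statements feeding the backedge ready in a later
iteration. */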
9480 unsigned todo = stack.length () - last_idx;
9483 for (int idx = stack.length () - 1; idx >= last_idx; --idx)
9485 slp_tree entry = stack[idx];
9486 if (!entry)
9487 continue;
9488 bool phi = (SLP_TREE_CODE (entry) != VEC_PERM_EXPR
9489 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (entry)->stmt));
9490 bool ready = !phi;
9491 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry), i, child)
9492 if (!child)
9494 gcc_assert (phi);
9495 ready = true;
9496 break;
9498 else if (scc_info.get (child)->on_stack)
9500 if (!phi)
9502 ready = false;
9503 break;
9506 else
9508 if (phi)
9510 ready = true;
9511 break;
9514 if (ready)
9516 vect_schedule_slp_node (vinfo, entry, instance);
9517 scc_info.get (entry)->on_stack = false;
9518 stack[idx] = NULL;
9519 todo--;
9520 if (phi)
9521 phis_to_fixup.safe_push (entry);
9525 while (todo != 0);
9527 /* Pop the SCC. */
9528 stack.truncate (last_idx);
9531 /* Now fixup the backedge def of the vectorized PHIs in this SCC. */
9532 slp_tree phi_node;
9533 FOR_EACH_VEC_ELT (phis_to_fixup, i, phi_node)
9535 gphi *phi = as_a <gphi *> (SLP_TREE_REPRESENTATIVE (phi_node)->stmt);
9536 edge_iterator ei;
9537 edge e;
9538 FOR_EACH_EDGE (e, ei, gimple_bb (phi)->preds)
9540 unsigned dest_idx = e->dest_idx;
9541 child = SLP_TREE_CHILDREN (phi_node)[dest_idx];
9542 if (!child || SLP_TREE_DEF_TYPE (child) != vect_internal_def)
9543 continue;
9544 unsigned n = SLP_TREE_VEC_DEFS (phi_node).length ();
9545 /* Simply fill all args. */
9546 if (STMT_VINFO_DEF_TYPE (SLP_TREE_REPRESENTATIVE (phi_node))
9547 != vect_first_order_recurrence)
9548 for (unsigned i = 0; i < n; ++i)
9550 tree phidef = SLP_TREE_VEC_DEFS (phi_node)[i];
9551 gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef));
9552 add_phi_arg (phi, vect_get_slp_vect_def (child, i),
9553 e, gimple_phi_arg_location (phi, dest_idx));
9555 else
9557 /* Unless it is a first order recurrence which needs
9558 args filled in for both the PHI node and the permutes. */
9559 gimple *perm
9560 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (phi_node)[0]);
9561 gimple *rphi = SSA_NAME_DEF_STMT (gimple_assign_rhs1 (perm));
9562 add_phi_arg (as_a <gphi *> (rphi),
9563 vect_get_slp_vect_def (child, n - 1),
9564 e, gimple_phi_arg_location (phi, dest_idx));
9565 for (unsigned i = 0; i < n; ++i)
9567 gimple *perm
9568 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (phi_node)[i]);
9569 if (i > 0)
9570 gimple_assign_set_rhs1 (perm,
9571 vect_get_slp_vect_def (child, i - 1));
9572 gimple_assign_set_rhs2 (perm,
9573 vect_get_slp_vect_def (child, i));
9574 update_stmt (perm);
9581 /* Generate vector code for SLP_INSTANCES in the loop/basic block. */
9583 void
9584 vect_schedule_slp (vec_info *vinfo, const vec<slp_instance> &slp_instances)
9586 slp_instance instance;
9587 unsigned int i;
9589 hash_map<slp_tree, slp_scc_info> scc_info;
9590 int maxdfs = 0;
9591 FOR_EACH_VEC_ELT (slp_instances, i, instance)
9593 slp_tree node = SLP_INSTANCE_TREE (instance);
9594 if (dump_enabled_p ())
9596 dump_printf_loc (MSG_NOTE, vect_location,
9597 "Vectorizing SLP tree:\n");
9598 /* ??? Dump all? */
9599 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
9600 dump_printf_loc (MSG_NOTE, vect_location, "Root stmt: %G",
9601 SLP_INSTANCE_ROOT_STMTS (instance)[0]->stmt);
9602 vect_print_slp_graph (MSG_NOTE, vect_location,
9603 SLP_INSTANCE_TREE (instance));
9605 /* Schedule the tree of INSTANCE, scheduling SCCs such that a
9606 PHI is the node breaking the cycle. */
9607 auto_vec<slp_tree> stack;
9608 if (!scc_info.get (node))
9609 vect_schedule_scc (vinfo, node, instance, scc_info, maxdfs, stack);
9611 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
9612 vectorize_slp_instance_root_stmt (node, instance);
9614 if (dump_enabled_p ())
9615 dump_printf_loc (MSG_NOTE, vect_location,
9616 "vectorizing stmts using SLP.\n");
9619 FOR_EACH_VEC_ELT (slp_instances, i, instance)
9621 slp_tree root = SLP_INSTANCE_TREE (instance);
9622 stmt_vec_info store_info;
9623 unsigned int j;
9625 /* Remove scalar call stmts. Do not do this for basic-block
9626 vectorization as not all uses may be vectorized.
9627 ??? Why should this be necessary? DCE should be able to
9628 remove the stmts itself.
9629 ??? For BB vectorization we can as well remove scalar
9630 stmts starting from the SLP tree root if they have no
9631 uses. */
9632 if (is_a <loop_vec_info> (vinfo))
9633 vect_remove_slp_scalar_calls (vinfo, root);
9635 /* Remove the original scalar stmts of vectorized stores. */
9636 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
9638 if (!STMT_VINFO_DATA_REF (store_info)
9639 || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info)))
9640 break;
9642 store_info = vect_orig_stmt (store_info);
9643 /* Free the attached stmt_vec_info and remove the stmt. */
9644 vinfo->remove_stmt (store_info);
9646 /* Invalidate SLP_TREE_REPRESENTATIVE in case we released it,
9647 so we do not crash in vect_free_slp_tree later. */
9648 if (SLP_TREE_REPRESENTATIVE (root) == store_info)
9649 SLP_TREE_REPRESENTATIVE (root) = NULL;