gcc/tree-vect-slp.cc
1 /* SLP - Basic Block Vectorization
2 Copyright (C) 2007-2022 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #define INCLUDE_ALGORITHM
24 #include "system.h"
25 #include "coretypes.h"
26 #include "backend.h"
27 #include "target.h"
28 #include "rtl.h"
29 #include "tree.h"
30 #include "gimple.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "insn-config.h"
35 #include "recog.h" /* FIXME: for insn_data */
36 #include "fold-const.h"
37 #include "stor-layout.h"
38 #include "gimple-iterator.h"
39 #include "cfgloop.h"
40 #include "tree-vectorizer.h"
41 #include "langhooks.h"
42 #include "gimple-walk.h"
43 #include "dbgcnt.h"
44 #include "tree-vector-builder.h"
45 #include "vec-perm-indices.h"
46 #include "gimple-fold.h"
47 #include "internal-fn.h"
48 #include "dump-context.h"
49 #include "cfganal.h"
50 #include "tree-eh.h"
51 #include "tree-cfg.h"
52 #include "alloc-pool.h"
53 #include "sreal.h"
54 #include "predict.h"
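/* Forward declarations for helpers defined later in this file.  */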
56 static bool vect_transform_slp_perm_load_1 (vec_info *, slp_tree,
57 load_permutation_t &,
58 const vec<tree> &,
59 gimple_stmt_iterator *,
60 poly_uint64, bool, bool,
61 unsigned *,
62 unsigned * = nullptr,
63 bool = false);
64 static int vectorizable_slp_permutation_1 (vec_info *, gimple_stmt_iterator *,
65 slp_tree, lane_permutation_t &,
66 vec<slp_tree> &, bool);
67 static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
68 slp_tree, stmt_vector_for_cost *);
69 static void vect_print_slp_tree (dump_flags_t, dump_location_t, slp_tree);
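/* SLP nodes are allocated from this pool and additionally chained into a
   list headed by slp_first_node so that vect_slp_fini can release any
   nodes that are still live when SLP analysis finishes.  */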
71 static object_allocator<_slp_tree> *slp_tree_pool;
72 static slp_tree slp_first_node;
74 void
75 vect_slp_init (void)
77 slp_tree_pool = new object_allocator<_slp_tree> ("SLP nodes");
80 void
81 vect_slp_fini (void)
83 while (slp_first_node)
84 delete slp_first_node;
85 delete slp_tree_pool;
86 slp_tree_pool = NULL;
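/* Allocate and release SLP nodes from the dedicated object pool rather
   than the general heap.  */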
89 void *
90 _slp_tree::operator new (size_t n)
92 gcc_assert (n == sizeof (_slp_tree));
93 return slp_tree_pool->allocate_raw ();
96 void
97 _slp_tree::operator delete (void *node, size_t n)
99 gcc_assert (n == sizeof (_slp_tree));
100 slp_tree_pool->remove_raw (node);
104 /* Initialize a SLP node. */
106 _slp_tree::_slp_tree ()
108 this->prev_node = NULL;
109 if (slp_first_node)
110 slp_first_node->prev_node = this;
111 this->next_node = slp_first_node;
112 slp_first_node = this;
113 SLP_TREE_SCALAR_STMTS (this) = vNULL;
114 SLP_TREE_SCALAR_OPS (this) = vNULL;
115 SLP_TREE_VEC_STMTS (this) = vNULL;
116 SLP_TREE_VEC_DEFS (this) = vNULL;
117 SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
118 SLP_TREE_CHILDREN (this) = vNULL;
119 SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
120 SLP_TREE_LANE_PERMUTATION (this) = vNULL;
121 SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
122 SLP_TREE_CODE (this) = ERROR_MARK;
123 SLP_TREE_VECTYPE (this) = NULL_TREE;
124 SLP_TREE_REPRESENTATIVE (this) = NULL;
125 SLP_TREE_REF_COUNT (this) = 1;
126 this->failed = NULL;
127 this->max_nunits = 1;
128 this->lanes = 0;
131 /* Tear down a SLP node. */
133 _slp_tree::~_slp_tree ()
135 if (this->prev_node)
136 this->prev_node->next_node = this->next_node;
137 else
138 slp_first_node = this->next_node;
139 if (this->next_node)
140 this->next_node->prev_node = this->prev_node;
141 SLP_TREE_CHILDREN (this).release ();
142 SLP_TREE_SCALAR_STMTS (this).release ();
143 SLP_TREE_SCALAR_OPS (this).release ();
144 SLP_TREE_VEC_STMTS (this).release ();
145 SLP_TREE_VEC_DEFS (this).release ();
146 SLP_TREE_LOAD_PERMUTATION (this).release ();
147 SLP_TREE_LANE_PERMUTATION (this).release ();
148 if (this->failed)
149 free (failed);
152 /* Recursively free the memory allocated for the SLP tree rooted at NODE. */
154 void
155 vect_free_slp_tree (slp_tree node)
157 int i;
158 slp_tree child;
160 if (--SLP_TREE_REF_COUNT (node) != 0)
161 return;
163 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
164 if (child)
165 vect_free_slp_tree (child);
167 /* If the node defines any SLP-only patterns then those patterns are no
168 longer valid and should be removed. */
169 stmt_vec_info rep_stmt_info = SLP_TREE_REPRESENTATIVE (node);
170 if (rep_stmt_info && STMT_VINFO_SLP_VECT_ONLY_PATTERN (rep_stmt_info))
172 stmt_vec_info stmt_info = vect_orig_stmt (rep_stmt_info);
173 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
174 STMT_SLP_TYPE (stmt_info) = STMT_SLP_TYPE (rep_stmt_info);
177 delete node;
180 /* Return a location suitable for dumps related to the SLP instance. */
182 dump_user_location_t
183 _slp_instance::location () const
185 if (!root_stmts.is_empty ())
186 return root_stmts[0]->stmt;
187 else
188 return SLP_TREE_SCALAR_STMTS (root)[0]->stmt;
192 /* Free the memory allocated for the SLP instance. */
194 void
195 vect_free_slp_instance (slp_instance instance)
197 vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
198 SLP_INSTANCE_LOADS (instance).release ();
199 SLP_INSTANCE_ROOT_STMTS (instance).release ();
200 instance->subgraph_entries.release ();
201 instance->cost_vec.release ();
202 free (instance);
206 /* Create an SLP node without scalar statements, for operation CODE with NOPS children. */
208 slp_tree
209 vect_create_new_slp_node (unsigned nops, tree_code code)
211 slp_tree node = new _slp_tree;
212 SLP_TREE_SCALAR_STMTS (node) = vNULL;
213 SLP_TREE_CHILDREN (node).create (nops);
214 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
215 SLP_TREE_CODE (node) = code;
216 return node;
218 /* Create an SLP node for SCALAR_STMTS. */
220 static slp_tree
221 vect_create_new_slp_node (slp_tree node,
222 vec<stmt_vec_info> scalar_stmts, unsigned nops)
224 SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
225 SLP_TREE_CHILDREN (node).create (nops);
226 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
227 SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
228 SLP_TREE_LANES (node) = scalar_stmts.length ();
229 return node;
232 /* Create an SLP node for SCALAR_STMTS. */
234 static slp_tree
235 vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
237 return vect_create_new_slp_node (new _slp_tree, scalar_stmts, nops);
240 /* Create an SLP node for OPS. */
242 static slp_tree
243 vect_create_new_slp_node (slp_tree node, vec<tree> ops)
245 SLP_TREE_SCALAR_OPS (node) = ops;
246 SLP_TREE_DEF_TYPE (node) = vect_external_def;
247 SLP_TREE_LANES (node) = ops.length ();
248 return node;
251 /* Create an SLP node for OPS. */
253 static slp_tree
254 vect_create_new_slp_node (vec<tree> ops)
256 return vect_create_new_slp_node (new _slp_tree, ops);
260 /* This structure is used in creation of an SLP tree. Each instance
261 corresponds to the same operand in a group of scalar stmts in an SLP
262 node. */
263 typedef struct _slp_oprnd_info
265 /* Def-stmts for the operands. */
266 vec<stmt_vec_info> def_stmts;
267 /* Operands. */
268 vec<tree> ops;
269 /* Information about the first statement, its vector def-type, type, the
270 operand itself in case it's constant, and an indication if it's a pattern
271 stmt. */
272 tree first_op_type;
273 enum vect_def_type first_dt;
274 bool any_pattern;
275 } *slp_oprnd_info;
278 /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
279 operand. */
280 static vec<slp_oprnd_info>
281 vect_create_oprnd_info (int nops, int group_size)
283 int i;
284 slp_oprnd_info oprnd_info;
285 vec<slp_oprnd_info> oprnds_info;
287 oprnds_info.create (nops);
288 for (i = 0; i < nops; i++)
290 oprnd_info = XNEW (struct _slp_oprnd_info);
291 oprnd_info->def_stmts.create (group_size);
292 oprnd_info->ops.create (group_size);
293 oprnd_info->first_dt = vect_uninitialized_def;
294 oprnd_info->first_op_type = NULL_TREE;
295 oprnd_info->any_pattern = false;
296 oprnds_info.quick_push (oprnd_info);
299 return oprnds_info;
303 /* Free operands info. */
305 static void
306 vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
308 int i;
309 slp_oprnd_info oprnd_info;
311 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
313 oprnd_info->def_stmts.release ();
314 oprnd_info->ops.release ();
315 XDELETE (oprnd_info);
318 oprnds_info.release ();
321 /* Return the execution frequency of NODE (so that a higher value indicates
322 a "more important" node when optimizing for speed). */
324 static sreal
325 vect_slp_node_weight (slp_tree node)
327 stmt_vec_info stmt_info = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (node));
328 basic_block bb = gimple_bb (stmt_info->stmt);
329 return bb->count.to_sreal_scale (ENTRY_BLOCK_PTR_FOR_FN (cfun)->count);
332 /* Return true if STMTS contains a pattern statement. */
334 static bool
335 vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
337 stmt_vec_info stmt_info;
338 unsigned int i;
339 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
340 if (is_pattern_stmt_p (stmt_info))
341 return true;
342 return false;
345 /* Return true when all lanes in the external or constant NODE have
346 the same value. */
348 static bool
349 vect_slp_tree_uniform_p (slp_tree node)
351 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
352 || SLP_TREE_DEF_TYPE (node) == vect_external_def);
354 /* Pre-existing vectors. */
355 if (SLP_TREE_SCALAR_OPS (node).is_empty ())
356 return false;
358 unsigned i;
359 tree op, first = NULL_TREE;
360 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
361 if (!first)
362 first = op;
363 else if (!operand_equal_p (first, op, 0))
364 return false;
366 return true;
369 /* Find the place of the data-ref in STMT_INFO in the interleaving chain
370 that starts from FIRST_STMT_INFO. Return -1 if the data-ref is not a part
371 of the chain. */
373 int
374 vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
375 stmt_vec_info first_stmt_info)
377 stmt_vec_info next_stmt_info = first_stmt_info;
378 int result = 0;
380 if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
381 return -1;
383 do
385 if (next_stmt_info == stmt_info)
386 return result;
387 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
388 if (next_stmt_info)
389 result += DR_GROUP_GAP (next_stmt_info);
391 while (next_stmt_info);
393 return -1;
396 /* Check whether it is possible to load COUNT elements of type ELT_TYPE
397 using the method implemented by duplicate_and_interleave. Return true
398 if so, returning the number of intermediate vectors in *NVECTORS_OUT
399 (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
400 (if nonnull). */
402 bool
403 can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
404 tree elt_type, unsigned int *nvectors_out,
405 tree *vector_type_out,
406 tree *permutes)
408 tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
409 if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
410 return false;
412 machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
413 poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
414 unsigned int nvectors = 1;
415 for (;;)
417 scalar_int_mode int_mode;
418 poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
419 if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
421 /* Get the natural vector type for this SLP group size. */
422 tree int_type = build_nonstandard_integer_type
423 (GET_MODE_BITSIZE (int_mode), 1);
424 tree vector_type
425 = get_vectype_for_scalar_type (vinfo, int_type, count);
426 if (vector_type
427 && VECTOR_MODE_P (TYPE_MODE (vector_type))
428 && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
429 GET_MODE_SIZE (base_vector_mode)))
431 /* Try fusing consecutive sequences of COUNT / NVECTORS elements
432 together into elements of type INT_TYPE and using the result
433 to build NVECTORS vectors. */
434 poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
435 vec_perm_builder sel1 (nelts, 2, 3);
436 vec_perm_builder sel2 (nelts, 2, 3);
437 poly_int64 half_nelts = exact_div (nelts, 2);
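/* sel1 interleaves the low halves of the two input vectors, sel2 the
   high halves; these are the permutations checked here and returned
   via PERMUTES for use by duplicate_and_interleave.  */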
438 for (unsigned int i = 0; i < 3; ++i)
440 sel1.quick_push (i);
441 sel1.quick_push (i + nelts);
442 sel2.quick_push (half_nelts + i);
443 sel2.quick_push (half_nelts + i + nelts);
445 vec_perm_indices indices1 (sel1, 2, nelts);
446 vec_perm_indices indices2 (sel2, 2, nelts);
447 machine_mode vmode = TYPE_MODE (vector_type);
448 if (can_vec_perm_const_p (vmode, vmode, indices1)
449 && can_vec_perm_const_p (vmode, vmode, indices2))
451 if (nvectors_out)
452 *nvectors_out = nvectors;
453 if (vector_type_out)
454 *vector_type_out = vector_type;
455 if (permutes)
457 permutes[0] = vect_gen_perm_mask_checked (vector_type,
458 indices1);
459 permutes[1] = vect_gen_perm_mask_checked (vector_type,
460 indices2);
462 return true;
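/* Fused elements of this size do not work; halve the element size,
   double the number of intermediate vectors and retry.  */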
466 if (!multiple_p (elt_bytes, 2, &elt_bytes))
467 return false;
468 nvectors *= 2;
472 /* Return true if DTA and DTB match. */
474 static bool
475 vect_def_types_match (enum vect_def_type dta, enum vect_def_type dtb)
477 return (dta == dtb
478 || ((dta == vect_external_def || dta == vect_constant_def)
479 && (dtb == vect_external_def || dtb == vect_constant_def)));
482 static const int cond_expr_maps[3][5] = {
483 { 4, -1, -2, 1, 2 },
484 { 4, -2, -1, 1, 2 },
485 { 4, -1, -2, 2, 1 }
487 static const int arg1_map[] = { 1, 1 };
488 static const int arg2_map[] = { 1, 2 };
489 static const int arg1_arg4_map[] = { 2, 1, 4 };
490 static const int op1_op0_map[] = { 2, 1, 0 };
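/* As an illustration, cond_expr_maps[0] = { 4, -1, -2, 1, 2 } says that a
   COND_EXPR with an embedded comparison has four children: the two operands
   of the comparison (-1 and -2) followed by gimple arguments 1 and 2, the
   then and else values (see vect_get_operand_map below).  */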
492 /* For most SLP statements, there is a one-to-one mapping between
493 gimple arguments and child nodes. If that is not true for STMT,
494 return an array that contains:
496 - the number of child nodes, followed by
497 - for each child node, the index of the argument associated with that node.
498 The special index -1 denotes the first operand of an embedded comparison and
499 the special index -2 denotes the second operand of an embedded comparison.
501 SWAP is as for vect_get_and_check_slp_defs. */
503 static const int *
504 vect_get_operand_map (const gimple *stmt, unsigned char swap = 0)
506 if (auto assign = dyn_cast<const gassign *> (stmt))
508 if (gimple_assign_rhs_code (assign) == COND_EXPR
509 && COMPARISON_CLASS_P (gimple_assign_rhs1 (assign)))
510 return cond_expr_maps[swap];
511 if (TREE_CODE_CLASS (gimple_assign_rhs_code (assign)) == tcc_comparison
512 && swap)
513 return op1_op0_map;
515 gcc_assert (!swap);
516 if (auto call = dyn_cast<const gcall *> (stmt))
518 if (gimple_call_internal_p (call))
519 switch (gimple_call_internal_fn (call))
521 case IFN_MASK_LOAD:
522 return arg2_map;
524 case IFN_GATHER_LOAD:
525 return arg1_map;
527 case IFN_MASK_GATHER_LOAD:
528 return arg1_arg4_map;
530 default:
531 break;
534 return nullptr;
537 /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
538 they are of a valid type and that they match the defs of the first stmt of
539 the SLP group (stored in OPRNDS_INFO). This function tries to match stmts
540 by swapping operands of STMTS[STMT_NUM] when possible. Non-zero SWAP
541 indicates that a swap is required for cond_expr stmts. Specifically, SWAP
542 is 1 if STMT is a COND_EXPR and the operands of the comparison need to be swapped;
543 SWAP is 2 if STMT is a COND_EXPR and the code of the comparison needs to be inverted.
545 If there was a fatal error return -1; if the error could be corrected by
546 swapping operands of the parent node of this one, return 1; if everything
547 is ok return 0. */
548 static int
549 vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap,
550 bool *skip_args,
551 vec<stmt_vec_info> stmts, unsigned stmt_num,
552 vec<slp_oprnd_info> *oprnds_info)
554 stmt_vec_info stmt_info = stmts[stmt_num];
555 tree oprnd;
556 unsigned int i, number_of_oprnds;
557 enum vect_def_type dt = vect_uninitialized_def;
558 slp_oprnd_info oprnd_info;
559 unsigned int commutative_op = -1U;
560 bool first = stmt_num == 0;
562 if (!is_a<gcall *> (stmt_info->stmt)
563 && !is_a<gassign *> (stmt_info->stmt)
564 && !is_a<gphi *> (stmt_info->stmt))
565 return -1;
567 number_of_oprnds = gimple_num_args (stmt_info->stmt);
568 const int *map = vect_get_operand_map (stmt_info->stmt, swap);
569 if (map)
570 number_of_oprnds = *map++;
571 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
573 if (gimple_call_internal_p (stmt))
575 internal_fn ifn = gimple_call_internal_fn (stmt);
576 commutative_op = first_commutative_argument (ifn);
579 else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
581 if (commutative_tree_code (gimple_assign_rhs_code (stmt)))
582 commutative_op = 0;
585 bool swapped = (swap != 0);
586 bool backedge = false;
587 enum vect_def_type *dts = XALLOCAVEC (enum vect_def_type, number_of_oprnds);
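/* Record the def type of each operand; the swapping logic below may
   reorder these entries together with the operands themselves.  */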
588 for (i = 0; i < number_of_oprnds; i++)
590 int opno = map ? map[i] : int (i);
591 if (opno < 0)
592 oprnd = TREE_OPERAND (gimple_arg (stmt_info->stmt, 0), -1 - opno);
593 else
595 oprnd = gimple_arg (stmt_info->stmt, opno);
596 if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
597 backedge = dominated_by_p (CDI_DOMINATORS,
598 gimple_phi_arg_edge (stmt, opno)->src,
599 gimple_bb (stmt_info->stmt));
601 if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
602 oprnd = TREE_OPERAND (oprnd, 0);
604 oprnd_info = (*oprnds_info)[i];
606 stmt_vec_info def_stmt_info;
607 if (!vect_is_simple_use (oprnd, vinfo, &dts[i], &def_stmt_info))
609 if (dump_enabled_p ())
610 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
611 "Build SLP failed: can't analyze def for %T\n",
612 oprnd);
614 return -1;
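/* Operands flagged in SKIP_ARGS are not analyzed; record placeholder
   entries for them.  */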
617 if (skip_args[i])
619 oprnd_info->def_stmts.quick_push (NULL);
620 oprnd_info->ops.quick_push (NULL_TREE);
621 oprnd_info->first_dt = vect_uninitialized_def;
622 continue;
625 oprnd_info->def_stmts.quick_push (def_stmt_info);
626 oprnd_info->ops.quick_push (oprnd);
628 if (def_stmt_info
629 && is_pattern_stmt_p (def_stmt_info))
631 if (STMT_VINFO_RELATED_STMT (vect_orig_stmt (def_stmt_info))
632 != def_stmt_info)
633 oprnd_info->any_pattern = true;
634 else
635 /* If we promote this to external use the original stmt def. */
636 oprnd_info->ops.last ()
637 = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
640 /* If there's an extern def on a backedge make sure we can
641 code-generate at the region start.
642 ??? This is another case that could be fixed by adjusting
643 how we split the function but at the moment we'd have conflicting
644 goals there. */
645 if (backedge
646 && dts[i] == vect_external_def
647 && is_a <bb_vec_info> (vinfo)
648 && TREE_CODE (oprnd) == SSA_NAME
649 && !SSA_NAME_IS_DEFAULT_DEF (oprnd)
650 && !dominated_by_p (CDI_DOMINATORS,
651 as_a <bb_vec_info> (vinfo)->bbs[0],
652 gimple_bb (SSA_NAME_DEF_STMT (oprnd))))
654 if (dump_enabled_p ())
655 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
656 "Build SLP failed: extern def %T only defined "
657 "on backedge\n", oprnd);
658 return -1;
661 if (first)
663 tree type = TREE_TYPE (oprnd);
664 dt = dts[i];
665 if ((dt == vect_constant_def
666 || dt == vect_external_def)
667 && !GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
668 && (TREE_CODE (type) == BOOLEAN_TYPE
669 || !can_duplicate_and_interleave_p (vinfo, stmts.length (),
670 type)))
672 if (dump_enabled_p ())
673 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
674 "Build SLP failed: invalid type of def "
675 "for variable-length SLP %T\n", oprnd);
676 return -1;
679 /* For the swapping logic below force vect_reduction_def
680 for the reduction op in a SLP reduction group. */
681 if (!STMT_VINFO_DATA_REF (stmt_info)
682 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
683 && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
684 && def_stmt_info)
685 dts[i] = dt = vect_reduction_def;
687 /* Check the types of the definition. */
688 switch (dt)
690 case vect_external_def:
691 case vect_constant_def:
692 case vect_internal_def:
693 case vect_reduction_def:
694 case vect_induction_def:
695 case vect_nested_cycle:
696 break;
698 default:
699 /* FORNOW: Not supported. */
700 if (dump_enabled_p ())
701 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
702 "Build SLP failed: illegal type of def %T\n",
703 oprnd);
704 return -1;
707 oprnd_info->first_dt = dt;
708 oprnd_info->first_op_type = type;
711 if (first)
712 return 0;
714 /* Now match the operand definition types to that of the first stmt. */
715 for (i = 0; i < number_of_oprnds;)
717 if (skip_args[i])
719 ++i;
720 continue;
723 oprnd_info = (*oprnds_info)[i];
724 dt = dts[i];
725 stmt_vec_info def_stmt_info = oprnd_info->def_stmts[stmt_num];
726 oprnd = oprnd_info->ops[stmt_num];
727 tree type = TREE_TYPE (oprnd);
729 if (!types_compatible_p (oprnd_info->first_op_type, type))
731 if (dump_enabled_p ())
732 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
733 "Build SLP failed: different operand types\n");
734 return 1;
737 /* Not the first stmt of the group; check that the def-stmt/s match
738 the def-stmt/s of the first stmt. Allow different definition
739 types for reduction chains: the first stmt must be a
740 vect_reduction_def (a phi node), and the rest
741 must belong to the same reduction chain. */
742 if ((!vect_def_types_match (oprnd_info->first_dt, dt)
743 && !(oprnd_info->first_dt == vect_reduction_def
744 && !STMT_VINFO_DATA_REF (stmt_info)
745 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
746 && def_stmt_info
747 && !STMT_VINFO_DATA_REF (def_stmt_info)
748 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
749 == REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
750 || (!STMT_VINFO_DATA_REF (stmt_info)
751 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
752 && ((!def_stmt_info
753 || STMT_VINFO_DATA_REF (def_stmt_info)
754 || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
755 != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
756 != (oprnd_info->first_dt != vect_reduction_def))))
758 /* Try swapping operands if we got a mismatch. For BB
759 vectorization only in case it will clearly improve things. */
760 if (i == commutative_op && !swapped
761 && (!is_a <bb_vec_info> (vinfo)
762 || (!vect_def_types_match ((*oprnds_info)[i+1]->first_dt,
763 dts[i+1])
764 && (vect_def_types_match (oprnd_info->first_dt, dts[i+1])
765 || vect_def_types_match
766 ((*oprnds_info)[i+1]->first_dt, dts[i])))))
768 if (dump_enabled_p ())
769 dump_printf_loc (MSG_NOTE, vect_location,
770 "trying swapped operands\n");
771 std::swap (dts[i], dts[i+1]);
772 std::swap ((*oprnds_info)[i]->def_stmts[stmt_num],
773 (*oprnds_info)[i+1]->def_stmts[stmt_num]);
774 std::swap ((*oprnds_info)[i]->ops[stmt_num],
775 (*oprnds_info)[i+1]->ops[stmt_num]);
776 swapped = true;
777 continue;
780 if (is_a <bb_vec_info> (vinfo)
781 && !oprnd_info->any_pattern)
783 /* Now for commutative ops we should see whether we can
784 make the other operand match as well. */
785 if (dump_enabled_p ())
786 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
787 "treating operand as external\n");
788 oprnd_info->first_dt = dt = vect_external_def;
790 else
792 if (dump_enabled_p ())
793 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
794 "Build SLP failed: different types\n");
795 return 1;
799 /* Make sure to demote the overall operand to external. */
800 if (dt == vect_external_def)
801 oprnd_info->first_dt = vect_external_def;
802 /* For a SLP reduction chain we want to duplicate the reduction to
803 each of the chain members. That gets us a sane SLP graph (though
804 the stmts are not 100% correct wrt the initial values). */
805 else if ((dt == vect_internal_def
806 || dt == vect_reduction_def)
807 && oprnd_info->first_dt == vect_reduction_def
808 && !STMT_VINFO_DATA_REF (stmt_info)
809 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
810 && !STMT_VINFO_DATA_REF (def_stmt_info)
811 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
812 == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
814 oprnd_info->def_stmts[stmt_num] = oprnd_info->def_stmts[0];
815 oprnd_info->ops[stmt_num] = oprnd_info->ops[0];
818 ++i;
821 /* Swap operands. */
822 if (swapped)
824 if (dump_enabled_p ())
825 dump_printf_loc (MSG_NOTE, vect_location,
826 "swapped operands to match def types in %G",
827 stmt_info->stmt);
830 return 0;
833 /* Return true if call statements CALL1 and CALL2 are similar enough
834 to be combined into the same SLP group. */
836 bool
837 compatible_calls_p (gcall *call1, gcall *call2)
839 unsigned int nargs = gimple_call_num_args (call1);
840 if (nargs != gimple_call_num_args (call2))
841 return false;
843 if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
844 return false;
846 if (gimple_call_internal_p (call1))
848 if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
849 TREE_TYPE (gimple_call_lhs (call2))))
850 return false;
851 for (unsigned int i = 0; i < nargs; ++i)
852 if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
853 TREE_TYPE (gimple_call_arg (call2, i))))
854 return false;
856 else
858 if (!operand_equal_p (gimple_call_fn (call1),
859 gimple_call_fn (call2), 0))
860 return false;
862 if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
863 return false;
866 /* Check that any unvectorized arguments are equal. */
867 if (const int *map = vect_get_operand_map (call1))
869 unsigned int nkept = *map++;
870 unsigned int mapi = 0;
871 for (unsigned int i = 0; i < nargs; ++i)
872 if (mapi < nkept && map[mapi] == int (i))
873 mapi += 1;
874 else if (!operand_equal_p (gimple_call_arg (call1, i),
875 gimple_call_arg (call2, i)))
876 return false;
879 return true;
882 /* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
883 caller's attempt to find the vector type in STMT_INFO with the narrowest
884 element type. Return true if VECTYPE is nonnull and if it is valid
885 for STMT_INFO. When returning true, update MAX_NUNITS to reflect the
886 number of units in VECTYPE. GROUP_SIZE and MAX_NUNITS are as for
887 vect_build_slp_tree. */
889 static bool
890 vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
891 unsigned int group_size,
892 tree vectype, poly_uint64 *max_nunits)
894 if (!vectype)
896 if (dump_enabled_p ())
897 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
898 "Build SLP failed: unsupported data-type in %G\n",
899 stmt_info->stmt);
900 /* Fatal mismatch. */
901 return false;
904 /* If populating the vector type requires unrolling then fail
905 before adjusting *max_nunits for basic-block vectorization. */
906 if (is_a <bb_vec_info> (vinfo)
907 && !multiple_p (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
909 if (dump_enabled_p ())
910 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
911 "Build SLP failed: unrolling required "
912 "in basic block SLP\n");
913 /* Fatal mismatch. */
914 return false;
917 /* In case of multiple types we need to detect the smallest type. */
918 vect_update_max_nunits (max_nunits, vectype);
919 return true;
922 /* Verify whether the scalar stmts STMTS are isomorphic, whether they require
923 data permutation and whether they use unsupported types of operation. Return
924 true if they can form an SLP group, otherwise return false and indicate in *MATCHES
925 which stmts are not isomorphic to the first one. If MATCHES[0]
926 is false then this indicates the comparison could not be
927 carried out or the stmts will never be vectorized by SLP.
929 Note a COND_EXPR is possibly isomorphic to another one after swapping its
930 operands. Set SWAP[i] to 1 if stmt I is a COND_EXPR and isomorphic to
931 the first stmt by swapping the two operands of the comparison; set SWAP[i]
932 to 2 if stmt I is isomorphic to the first stmt by inverting the code
933 of the comparison. Take A1 >= B1 ? X1 : Y1 as an example; it can be swapped
934 to (B1 <= A1 ? X1 : Y1) or be inverted to (A1 < B1) ? Y1 : X1. */
936 static bool
937 vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
938 vec<stmt_vec_info> stmts, unsigned int group_size,
939 poly_uint64 *max_nunits, bool *matches,
940 bool *two_operators, tree *node_vectype)
942 unsigned int i;
943 stmt_vec_info first_stmt_info = stmts[0];
944 code_helper first_stmt_code = ERROR_MARK;
945 code_helper alt_stmt_code = ERROR_MARK;
946 code_helper rhs_code = ERROR_MARK;
947 code_helper first_cond_code = ERROR_MARK;
948 tree lhs;
949 bool need_same_oprnds = false;
950 tree vectype = NULL_TREE, first_op1 = NULL_TREE;
951 stmt_vec_info first_load = NULL, prev_first_load = NULL;
952 bool first_stmt_load_p = false, load_p = false;
953 bool first_stmt_phi_p = false, phi_p = false;
954 bool maybe_soft_fail = false;
955 tree soft_fail_nunits_vectype = NULL_TREE;
957 /* For every stmt in NODE find its def stmt/s. */
958 stmt_vec_info stmt_info;
959 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
961 gimple *stmt = stmt_info->stmt;
962 swap[i] = 0;
963 matches[i] = false;
965 if (dump_enabled_p ())
966 dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);
968 /* Fail to vectorize statements that are marked as unvectorizable, that
969 can throw, or that have volatile operands. */
970 if (!STMT_VINFO_VECTORIZABLE (stmt_info)
971 || stmt_can_throw_internal (cfun, stmt)
972 || gimple_has_volatile_ops (stmt))
974 if (dump_enabled_p ())
975 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
976 "Build SLP failed: unvectorizable statement %G",
977 stmt);
978 /* ??? For BB vectorization we want to commute operands in a way
979 that shuffles all unvectorizable defs into one operand and leaves
980 the other still vectorizable. The following doesn't reliably
981 achieve this, but it's the easiest we can do here. */
982 if (is_a <bb_vec_info> (vinfo) && i != 0)
983 continue;
984 /* Fatal mismatch. */
985 matches[0] = false;
986 return false;
989 lhs = gimple_get_lhs (stmt);
990 if (lhs == NULL_TREE)
992 if (dump_enabled_p ())
993 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
994 "Build SLP failed: not GIMPLE_ASSIGN nor "
995 "GIMPLE_CALL %G", stmt);
996 if (is_a <bb_vec_info> (vinfo) && i != 0)
997 continue;
998 /* Fatal mismatch. */
999 matches[0] = false;
1000 return false;
1003 tree nunits_vectype;
1004 if (!vect_get_vector_types_for_stmt (vinfo, stmt_info, &vectype,
1005 &nunits_vectype, group_size))
1007 if (is_a <bb_vec_info> (vinfo) && i != 0)
1008 continue;
1009 /* Fatal mismatch. */
1010 matches[0] = false;
1011 return false;
1013 /* Record nunits required but continue analysis, producing matches[]
1014 as if nunits was not an issue. This allows splitting of groups
1015 to happen. */
1016 if (nunits_vectype
1017 && !vect_record_max_nunits (vinfo, stmt_info, group_size,
1018 nunits_vectype, max_nunits))
1020 gcc_assert (is_a <bb_vec_info> (vinfo));
1021 maybe_soft_fail = true;
1022 soft_fail_nunits_vectype = nunits_vectype;
1025 gcc_assert (vectype);
1027 gcall *call_stmt = dyn_cast <gcall *> (stmt);
1028 if (call_stmt)
1030 combined_fn cfn = gimple_call_combined_fn (call_stmt);
1031 if (cfn != CFN_LAST)
1032 rhs_code = cfn;
1033 else
1034 rhs_code = CALL_EXPR;
1036 if (cfn == CFN_MASK_LOAD
1037 || cfn == CFN_GATHER_LOAD
1038 || cfn == CFN_MASK_GATHER_LOAD)
1039 load_p = true;
1040 else if ((internal_fn_p (cfn)
1041 && !vectorizable_internal_fn_p (as_internal_fn (cfn)))
1042 || gimple_call_tail_p (call_stmt)
1043 || gimple_call_noreturn_p (call_stmt)
1044 || gimple_call_chain (call_stmt))
1046 if (dump_enabled_p ())
1047 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1048 "Build SLP failed: unsupported call type %G",
1049 (gimple *) call_stmt);
1050 if (is_a <bb_vec_info> (vinfo) && i != 0)
1051 continue;
1052 /* Fatal mismatch. */
1053 matches[0] = false;
1054 return false;
1057 else if (gimple_code (stmt) == GIMPLE_PHI)
1059 rhs_code = ERROR_MARK;
1060 phi_p = true;
1062 else
1064 rhs_code = gimple_assign_rhs_code (stmt);
1065 load_p = gimple_vuse (stmt);
1068 /* Check the operation. */
1069 if (i == 0)
1071 *node_vectype = vectype;
1072 first_stmt_code = rhs_code;
1073 first_stmt_load_p = load_p;
1074 first_stmt_phi_p = phi_p;
1076 /* Shift arguments should be equal in all the packed stmts for a
1077 vector shift with scalar shift operand. */
1078 if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
1079 || rhs_code == LROTATE_EXPR
1080 || rhs_code == RROTATE_EXPR)
1082 /* First see if we have a vector/vector shift. */
1083 if (!directly_supported_p (rhs_code, vectype, optab_vector))
1085 /* No vector/vector shift, try for a vector/scalar shift. */
1086 if (!directly_supported_p (rhs_code, vectype, optab_scalar))
1088 if (dump_enabled_p ())
1089 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1090 "Build SLP failed: "
1091 "op not supported by target.\n");
1092 if (is_a <bb_vec_info> (vinfo) && i != 0)
1093 continue;
1094 /* Fatal mismatch. */
1095 matches[0] = false;
1096 return false;
1098 need_same_oprnds = true;
1099 first_op1 = gimple_assign_rhs2 (stmt);
1102 else if (rhs_code == WIDEN_LSHIFT_EXPR)
1104 need_same_oprnds = true;
1105 first_op1 = gimple_assign_rhs2 (stmt);
1107 else if (!load_p
1108 && rhs_code == BIT_FIELD_REF)
1110 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
1111 if (!is_a <bb_vec_info> (vinfo)
1112 || TREE_CODE (vec) != SSA_NAME
1113 /* When the element types are not compatible we pun the
1114 source to the target vectype which requires equal size. */
1115 || ((!VECTOR_TYPE_P (TREE_TYPE (vec))
1116 || !types_compatible_p (TREE_TYPE (vectype),
1117 TREE_TYPE (TREE_TYPE (vec))))
1118 && !operand_equal_p (TYPE_SIZE (vectype),
1119 TYPE_SIZE (TREE_TYPE (vec)))))
1121 if (dump_enabled_p ())
1122 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1123 "Build SLP failed: "
1124 "BIT_FIELD_REF not supported\n");
1125 /* Fatal mismatch. */
1126 matches[0] = false;
1127 return false;
1130 else if (rhs_code == CFN_DIV_POW2)
1132 need_same_oprnds = true;
1133 first_op1 = gimple_call_arg (call_stmt, 1);
1136 else
1138 if (first_stmt_code != rhs_code
1139 && alt_stmt_code == ERROR_MARK)
1140 alt_stmt_code = rhs_code;
1141 if ((first_stmt_code != rhs_code
1142 && (first_stmt_code != IMAGPART_EXPR
1143 || rhs_code != REALPART_EXPR)
1144 && (first_stmt_code != REALPART_EXPR
1145 || rhs_code != IMAGPART_EXPR)
1146 /* Handle mismatches in plus/minus by computing both
1147 and merging the results. */
1148 && !((first_stmt_code == PLUS_EXPR
1149 || first_stmt_code == MINUS_EXPR)
1150 && (alt_stmt_code == PLUS_EXPR
1151 || alt_stmt_code == MINUS_EXPR)
1152 && rhs_code == alt_stmt_code)
1153 && !(first_stmt_code.is_tree_code ()
1154 && rhs_code.is_tree_code ()
1155 && (TREE_CODE_CLASS (tree_code (first_stmt_code))
1156 == tcc_comparison)
1157 && (swap_tree_comparison (tree_code (first_stmt_code))
1158 == tree_code (rhs_code)))
1159 && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
1160 && (first_stmt_code == ARRAY_REF
1161 || first_stmt_code == BIT_FIELD_REF
1162 || first_stmt_code == INDIRECT_REF
1163 || first_stmt_code == COMPONENT_REF
1164 || first_stmt_code == MEM_REF)
1165 && (rhs_code == ARRAY_REF
1166 || rhs_code == BIT_FIELD_REF
1167 || rhs_code == INDIRECT_REF
1168 || rhs_code == COMPONENT_REF
1169 || rhs_code == MEM_REF)))
1170 || first_stmt_load_p != load_p
1171 || first_stmt_phi_p != phi_p)
1173 if (dump_enabled_p ())
1175 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1176 "Build SLP failed: different operation "
1177 "in stmt %G", stmt);
1178 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1179 "original stmt %G", first_stmt_info->stmt);
1181 /* Mismatch. */
1182 continue;
1185 if (!load_p
1186 && first_stmt_code == BIT_FIELD_REF
1187 && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
1188 != TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0)))
1190 if (dump_enabled_p ())
1191 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1192 "Build SLP failed: different BIT_FIELD_REF "
1193 "arguments in %G", stmt);
1194 /* Mismatch. */
1195 continue;
1198 if (call_stmt && first_stmt_code != CFN_MASK_LOAD)
1200 if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
1201 call_stmt))
1203 if (dump_enabled_p ())
1204 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1205 "Build SLP failed: different calls in %G",
1206 stmt);
1207 /* Mismatch. */
1208 continue;
1212 if ((phi_p || gimple_could_trap_p (stmt_info->stmt))
1213 && (gimple_bb (first_stmt_info->stmt)
1214 != gimple_bb (stmt_info->stmt)))
1216 if (dump_enabled_p ())
1217 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1218 "Build SLP failed: different BB for PHI "
1219 "or possibly trapping operation in %G", stmt);
1220 /* Mismatch. */
1221 continue;
1224 if (need_same_oprnds)
1226 tree other_op1 = gimple_arg (stmt, 1);
1227 if (!operand_equal_p (first_op1, other_op1, 0))
1229 if (dump_enabled_p ())
1230 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1231 "Build SLP failed: different shift "
1232 "arguments in %G", stmt);
1233 /* Mismatch. */
1234 continue;
1238 if (!types_compatible_p (vectype, *node_vectype))
1240 if (dump_enabled_p ())
1241 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1242 "Build SLP failed: different vector type "
1243 "in %G", stmt);
1244 /* Mismatch. */
1245 continue;
1249 /* Grouped store or load. */
1250 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1252 if (REFERENCE_CLASS_P (lhs))
1254 /* Store. */
1257 else
1259 /* Load. */
1260 first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
1261 if (prev_first_load)
1263 /* Check that there are no loads from different interleaving
1264 chains in the same node. */
1265 if (prev_first_load != first_load)
1267 if (dump_enabled_p ())
1268 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
1269 vect_location,
1270 "Build SLP failed: different "
1271 "interleaving chains in one node %G",
1272 stmt);
1273 /* Mismatch. */
1274 continue;
1277 else
1278 prev_first_load = first_load;
1280 } /* Grouped access. */
1281 else
1283 if (load_p
1284 && rhs_code != CFN_GATHER_LOAD
1285 && rhs_code != CFN_MASK_GATHER_LOAD)
1287 /* Non-grouped load. */
1288 if (dump_enabled_p ())
1289 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1290 "Build SLP failed: not grouped load %G", stmt);
1292 /* FORNOW: Non-grouped loads are not supported. */
1293 if (is_a <bb_vec_info> (vinfo) && i != 0)
1294 continue;
1295 /* Fatal mismatch. */
1296 matches[0] = false;
1297 return false;
1300 /* Not memory operation. */
1301 if (!phi_p
1302 && rhs_code.is_tree_code ()
1303 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_binary
1304 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_unary
1305 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_expression
1306 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_comparison
1307 && rhs_code != VIEW_CONVERT_EXPR
1308 && rhs_code != CALL_EXPR
1309 && rhs_code != BIT_FIELD_REF)
1311 if (dump_enabled_p ())
1312 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1313 "Build SLP failed: operation unsupported %G",
1314 stmt);
1315 if (is_a <bb_vec_info> (vinfo) && i != 0)
1316 continue;
1317 /* Fatal mismatch. */
1318 matches[0] = false;
1319 return false;
1322 if (rhs_code == COND_EXPR)
1324 tree cond_expr = gimple_assign_rhs1 (stmt);
1325 enum tree_code cond_code = TREE_CODE (cond_expr);
1326 enum tree_code swap_code = ERROR_MARK;
1327 enum tree_code invert_code = ERROR_MARK;
1329 if (i == 0)
1330 first_cond_code = TREE_CODE (cond_expr);
1331 else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1333 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
1334 swap_code = swap_tree_comparison (cond_code);
1335 invert_code = invert_tree_comparison (cond_code, honor_nans);
1338 if (first_cond_code == cond_code)
1340 /* Isomorphism can be achieved by swapping. */
1341 else if (first_cond_code == swap_code)
1342 swap[i] = 1;
1343 /* Isomorphism can be achieved by inverting. */
1344 else if (first_cond_code == invert_code)
1345 swap[i] = 2;
1346 else
1348 if (dump_enabled_p ())
1349 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1350 "Build SLP failed: different"
1351 " operation %G", stmt);
1352 /* Mismatch. */
1353 continue;
1357 if (rhs_code.is_tree_code ()
1358 && TREE_CODE_CLASS ((tree_code)rhs_code) == tcc_comparison
1359 && (swap_tree_comparison ((tree_code)first_stmt_code)
1360 == (tree_code)rhs_code))
1361 swap[i] = 1;
1364 matches[i] = true;
1367 for (i = 0; i < group_size; ++i)
1368 if (!matches[i])
1369 return false;
1371 /* If we allowed a two-operation SLP node verify the target can cope
1372 with the permute we are going to use. */
1373 if (alt_stmt_code != ERROR_MARK
1374 && (!alt_stmt_code.is_tree_code ()
1375 || (TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_reference
1376 && TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_comparison)))
1378 *two_operators = true;
1381 if (maybe_soft_fail)
1383 unsigned HOST_WIDE_INT const_nunits;
1384 if (!TYPE_VECTOR_SUBPARTS
1385 (soft_fail_nunits_vectype).is_constant (&const_nunits)
1386 || const_nunits > group_size)
1387 matches[0] = false;
1388 else
1390 /* With constant vector elements simulate a mismatch at the
1391 point we need to split. */
1392 unsigned tail = group_size & (const_nunits - 1);
1393 memset (&matches[group_size - tail], 0, sizeof (bool) * tail);
1395 return false;
1398 return true;
1401 /* Traits for the hash_map used to record failed SLP builds for a stmt set.
1402 Note we never remove apart from at destruction time so we do not
1403 need a special value for deleted that differs from empty. */
1404 struct bst_traits
1406 typedef vec <stmt_vec_info> value_type;
1407 typedef vec <stmt_vec_info> compare_type;
1408 static inline hashval_t hash (value_type);
1409 static inline bool equal (value_type existing, value_type candidate);
1410 static inline bool is_empty (value_type x) { return !x.exists (); }
1411 static inline bool is_deleted (value_type x) { return !x.exists (); }
1412 static const bool empty_zero_p = true;
1413 static inline void mark_empty (value_type &x) { x.release (); }
1414 static inline void mark_deleted (value_type &x) { x.release (); }
1415 static inline void remove (value_type &x) { x.release (); }
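/* Hash a group of scalar stmts by the UIDs of their gimple statements.  */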
1417 inline hashval_t
1418 bst_traits::hash (value_type x)
1420 inchash::hash h;
1421 for (unsigned i = 0; i < x.length (); ++i)
1422 h.add_int (gimple_uid (x[i]->stmt));
1423 return h.end ();
1425 inline bool
1426 bst_traits::equal (value_type existing, value_type candidate)
1428 if (existing.length () != candidate.length ())
1429 return false;
1430 for (unsigned i = 0; i < existing.length (); ++i)
1431 if (existing[i] != candidate[i])
1432 return false;
1433 return true;
1436 /* ??? This was std::pair<std::pair<tree_code, vect_def_type>, tree>
1437 but then vec::insert does memmove and that's not compatible with
1438 std::pair. */
1439 struct chain_op_t
1441 chain_op_t (tree_code code_, vect_def_type dt_, tree op_)
1442 : code (code_), dt (dt_), op (op_) {}
1443 tree_code code;
1444 vect_def_type dt;
1445 tree op;
1448 /* Comparator for sorting associatable chains. */
1450 static int
1451 dt_sort_cmp (const void *op1_, const void *op2_, void *)
1453 auto *op1 = (const chain_op_t *) op1_;
1454 auto *op2 = (const chain_op_t *) op2_;
1455 if (op1->dt != op2->dt)
1456 return (int)op1->dt - (int)op2->dt;
1457 return (int)op1->code - (int)op2->code;
1460 /* Linearize the associatable expression chain at START with the
1461 associatable operation CODE (where PLUS_EXPR also allows MINUS_EXPR),
1462 filling CHAIN with the result and using WORKLIST as intermediate storage.
1463 CODE_STMT and ALT_CODE_STMT are filled with the first stmt using CODE
1464 or MINUS_EXPR. *CHAIN_STMTS if not NULL is filled with all computation
1465 stmts, starting with START. */
1467 static void
1468 vect_slp_linearize_chain (vec_info *vinfo,
1469 vec<std::pair<tree_code, gimple *> > &worklist,
1470 vec<chain_op_t> &chain,
1471 enum tree_code code, gimple *start,
1472 gimple *&code_stmt, gimple *&alt_code_stmt,
1473 vec<gimple *> *chain_stmts)
1475 /* For each lane linearize the addition/subtraction (or other
1476 uniform associatable operation) expression tree. */
1477 worklist.safe_push (std::make_pair (code, start));
1478 while (!worklist.is_empty ())
1480 auto entry = worklist.pop ();
1481 gassign *stmt = as_a <gassign *> (entry.second);
1482 enum tree_code in_code = entry.first;
1483 enum tree_code this_code = gimple_assign_rhs_code (stmt);
1484 /* Pick some stmts suitable for SLP_TREE_REPRESENTATIVE. */
1485 if (!code_stmt
1486 && gimple_assign_rhs_code (stmt) == code)
1487 code_stmt = stmt;
1488 else if (!alt_code_stmt
1489 && gimple_assign_rhs_code (stmt) == MINUS_EXPR)
1490 alt_code_stmt = stmt;
1491 if (chain_stmts)
1492 chain_stmts->safe_push (stmt);
1493 for (unsigned opnum = 1; opnum <= 2; ++opnum)
1495 tree op = gimple_op (stmt, opnum);
1496 vect_def_type dt;
1497 stmt_vec_info def_stmt_info;
1498 bool res = vect_is_simple_use (op, vinfo, &dt, &def_stmt_info);
1499 gcc_assert (res);
1500 if (dt == vect_internal_def
1501 && is_pattern_stmt_p (def_stmt_info))
1502 op = gimple_get_lhs (def_stmt_info->stmt);
1503 gimple *use_stmt;
1504 use_operand_p use_p;
1505 if (dt == vect_internal_def
1506 && single_imm_use (op, &use_p, &use_stmt)
1507 && is_gimple_assign (def_stmt_info->stmt)
1508 && (gimple_assign_rhs_code (def_stmt_info->stmt) == code
1509 || (code == PLUS_EXPR
1510 && (gimple_assign_rhs_code (def_stmt_info->stmt)
1511 == MINUS_EXPR))))
1513 tree_code op_def_code = this_code;
1514 if (op_def_code == MINUS_EXPR && opnum == 1)
1515 op_def_code = PLUS_EXPR;
1516 if (in_code == MINUS_EXPR)
1517 op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
1518 worklist.safe_push (std::make_pair (op_def_code,
1519 def_stmt_info->stmt));
1521 else
1523 tree_code op_def_code = this_code;
1524 if (op_def_code == MINUS_EXPR && opnum == 1)
1525 op_def_code = PLUS_EXPR;
1526 if (in_code == MINUS_EXPR)
1527 op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
1528 chain.safe_push (chain_op_t (op_def_code, dt, op));
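/* Map from a vector of scalar stmts to the SLP node discovered for it
   (or to a node marked as failed), used to share already built subtrees
   and to avoid repeating failed discovery attempts.  */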
1534 typedef hash_map <vec <stmt_vec_info>, slp_tree,
1535 simple_hashmap_traits <bst_traits, slp_tree> >
1536 scalar_stmts_to_slp_tree_map_t;
1538 static slp_tree
1539 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1540 vec<stmt_vec_info> stmts, unsigned int group_size,
1541 poly_uint64 *max_nunits,
1542 bool *matches, unsigned *limit, unsigned *tree_size,
1543 scalar_stmts_to_slp_tree_map_t *bst_map);
1545 static slp_tree
1546 vect_build_slp_tree (vec_info *vinfo,
1547 vec<stmt_vec_info> stmts, unsigned int group_size,
1548 poly_uint64 *max_nunits,
1549 bool *matches, unsigned *limit, unsigned *tree_size,
1550 scalar_stmts_to_slp_tree_map_t *bst_map)
1552 if (slp_tree *leader = bst_map->get (stmts))
1554 if (dump_enabled_p ())
1555 dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
1556 !(*leader)->failed ? "" : "failed ",
1557 (void *) *leader);
1558 if (!(*leader)->failed)
1560 SLP_TREE_REF_COUNT (*leader)++;
1561 vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
1562 stmts.release ();
1563 return *leader;
1565 memcpy (matches, (*leader)->failed, sizeof (bool) * group_size);
1566 return NULL;
1569 /* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
1570 so we can pick up backedge destinations during discovery. */
1571 slp_tree res = new _slp_tree;
1572 SLP_TREE_DEF_TYPE (res) = vect_internal_def;
1573 SLP_TREE_SCALAR_STMTS (res) = stmts;
1574 bst_map->put (stmts.copy (), res);
1576 if (*limit == 0)
1578 if (dump_enabled_p ())
1579 dump_printf_loc (MSG_NOTE, vect_location,
1580 "SLP discovery limit exceeded\n");
1581 /* Mark the node invalid so we can detect those when still in use
1582 as backedge destinations. */
1583 SLP_TREE_SCALAR_STMTS (res) = vNULL;
1584 SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1585 res->failed = XNEWVEC (bool, group_size);
1586 memset (res->failed, 0, sizeof (bool) * group_size);
1587 memset (matches, 0, sizeof (bool) * group_size);
1588 return NULL;
1590 --*limit;
1592 if (dump_enabled_p ())
1593 dump_printf_loc (MSG_NOTE, vect_location,
1594 "starting SLP discovery for node %p\n", (void *) res);
1596 poly_uint64 this_max_nunits = 1;
1597 slp_tree res_ = vect_build_slp_tree_2 (vinfo, res, stmts, group_size,
1598 &this_max_nunits,
1599 matches, limit, tree_size, bst_map);
1600 if (!res_)
1602 if (dump_enabled_p ())
1603 dump_printf_loc (MSG_NOTE, vect_location,
1604 "SLP discovery for node %p failed\n", (void *) res);
1605 /* Mark the node invalid so we can detect those when still in use
1606 as backedge destinations. */
1607 SLP_TREE_SCALAR_STMTS (res) = vNULL;
1608 SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1609 res->failed = XNEWVEC (bool, group_size);
1610 if (flag_checking)
1612 unsigned i;
1613 for (i = 0; i < group_size; ++i)
1614 if (!matches[i])
1615 break;
1616 gcc_assert (i < group_size);
1618 memcpy (res->failed, matches, sizeof (bool) * group_size);
1620 else
1622 if (dump_enabled_p ())
1623 dump_printf_loc (MSG_NOTE, vect_location,
1624 "SLP discovery for node %p succeeded\n",
1625 (void *) res);
1626 gcc_assert (res_ == res);
1627 res->max_nunits = this_max_nunits;
1628 vect_update_max_nunits (max_nunits, this_max_nunits);
1629 /* Keep a reference for the bst_map use. */
1630 SLP_TREE_REF_COUNT (res)++;
1632 return res_;
1635 /* Helper for building an associated SLP node chain. */
1637 static void
1638 vect_slp_build_two_operator_nodes (slp_tree perm, tree vectype,
1639 slp_tree op0, slp_tree op1,
1640 stmt_vec_info oper1, stmt_vec_info oper2,
1641 vec<std::pair<unsigned, unsigned> > lperm)
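/* Build two internal nodes computing OPER1 respectively OPER2 on the
   shared operands OP0 and OP1 and blend their lanes into PERM with a
   VEC_PERM_EXPR according to LPERM.  */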
1643 unsigned group_size = SLP_TREE_LANES (op1);
1645 slp_tree child1 = new _slp_tree;
1646 SLP_TREE_DEF_TYPE (child1) = vect_internal_def;
1647 SLP_TREE_VECTYPE (child1) = vectype;
1648 SLP_TREE_LANES (child1) = group_size;
1649 SLP_TREE_CHILDREN (child1).create (2);
1650 SLP_TREE_CHILDREN (child1).quick_push (op0);
1651 SLP_TREE_CHILDREN (child1).quick_push (op1);
1652 SLP_TREE_REPRESENTATIVE (child1) = oper1;
1654 slp_tree child2 = new _slp_tree;
1655 SLP_TREE_DEF_TYPE (child2) = vect_internal_def;
1656 SLP_TREE_VECTYPE (child2) = vectype;
1657 SLP_TREE_LANES (child2) = group_size;
1658 SLP_TREE_CHILDREN (child2).create (2);
1659 SLP_TREE_CHILDREN (child2).quick_push (op0);
1660 SLP_TREE_REF_COUNT (op0)++;
1661 SLP_TREE_CHILDREN (child2).quick_push (op1);
1662 SLP_TREE_REF_COUNT (op1)++;
1663 SLP_TREE_REPRESENTATIVE (child2) = oper2;
1665 SLP_TREE_DEF_TYPE (perm) = vect_internal_def;
1666 SLP_TREE_CODE (perm) = VEC_PERM_EXPR;
1667 SLP_TREE_VECTYPE (perm) = vectype;
1668 SLP_TREE_LANES (perm) = group_size;
1669 /* ??? We should set this NULL but that's not expected. */
1670 SLP_TREE_REPRESENTATIVE (perm) = oper1;
1671 SLP_TREE_LANE_PERMUTATION (perm) = lperm;
1672 SLP_TREE_CHILDREN (perm).quick_push (child1);
1673 SLP_TREE_CHILDREN (perm).quick_push (child2);
1676 /* Recursively build an SLP tree starting from NODE.
1677 Fail (and return NULL) if def-stmts are not
1678 isomorphic, require data permutation or are of unsupported types of
1679 operation. Otherwise, return the built SLP node. MATCHES indicates
1680 which stmts in the group matched the first one when discovery
1681 fails. */
1683 static slp_tree
1684 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1685 vec<stmt_vec_info> stmts, unsigned int group_size,
1686 poly_uint64 *max_nunits,
1687 bool *matches, unsigned *limit, unsigned *tree_size,
1688 scalar_stmts_to_slp_tree_map_t *bst_map)
1690 unsigned nops, i, this_tree_size = 0;
1691 poly_uint64 this_max_nunits = *max_nunits;
1693 matches[0] = false;
1695 stmt_vec_info stmt_info = stmts[0];
1696 if (!is_a<gcall *> (stmt_info->stmt)
1697 && !is_a<gassign *> (stmt_info->stmt)
1698 && !is_a<gphi *> (stmt_info->stmt))
1699 return NULL;
1701 nops = gimple_num_args (stmt_info->stmt);
1702 if (const int *map = vect_get_operand_map (stmt_info->stmt))
1703 nops = map[0];
1705 /* If the SLP node is a PHI (induction or reduction), terminate
1706 the recursion. */
1707 bool *skip_args = XALLOCAVEC (bool, nops);
1708 memset (skip_args, 0, sizeof (bool) * nops);
1709 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
1710 if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
1712 tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
1713 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
1714 group_size);
1715 if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
1716 max_nunits))
1717 return NULL;
1719 vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
1720 if (def_type == vect_induction_def)
1722 /* Induction PHIs are not cycles but walk the initial
1723 value. Only for inner loops though; for outer loops
1724 we need to pick up the value from the actual PHIs
1725 to more easily support peeling and epilogue vectorization. */
1726 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1727 if (!nested_in_vect_loop_p (loop, stmt_info))
1728 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1729 else
1730 loop = loop->inner;
1731 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1733 else if (def_type == vect_reduction_def
1734 || def_type == vect_double_reduction_def
1735 || def_type == vect_nested_cycle)
1737 /* Else def types have to match. */
1738 stmt_vec_info other_info;
1739 bool all_same = true;
1740 FOR_EACH_VEC_ELT (stmts, i, other_info)
1742 if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
1743 return NULL;
1744 if (other_info != stmt_info)
1745 all_same = false;
1747 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1748 /* Reduction initial values are not explicitly represented. */
1749 if (!nested_in_vect_loop_p (loop, stmt_info))
1750 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1751 /* Reduction chain backedge defs are filled manually.
1752 ??? Need a better way to identify a SLP reduction chain PHI.
1753 Or a better overall way to SLP match those. */
1754 if (all_same && def_type == vect_reduction_def)
1755 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1757 else if (def_type != vect_internal_def)
1758 return NULL;
1762 bool two_operators = false;
1763 unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
1764 tree vectype = NULL_TREE;
1765 if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
1766 &this_max_nunits, matches, &two_operators,
1767 &vectype))
1768 return NULL;
1770 /* If the SLP node is a load, terminate the recursion unless masked. */
1771 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1772 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1774 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
1775 gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD)
1776 || gimple_call_internal_p (stmt, IFN_GATHER_LOAD)
1777 || gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD));
1778 else
1780 *max_nunits = this_max_nunits;
1781 (*tree_size)++;
1782 node = vect_create_new_slp_node (node, stmts, 0);
1783 SLP_TREE_VECTYPE (node) = vectype;
1784 /* And compute the load permutation. Whether it is actually
1785 a permutation depends on the unrolling factor which is
1786 decided later. */
1787 vec<unsigned> load_permutation;
1788 int j;
1789 stmt_vec_info load_info;
1790 load_permutation.create (group_size);
1791 stmt_vec_info first_stmt_info
1792 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
1793 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
1795 int load_place = vect_get_place_in_interleaving_chain
1796 (load_info, first_stmt_info);
1797 gcc_assert (load_place != -1);
1798 load_permutation.safe_push (load_place);
1800 SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
1801 return node;
1804 else if (gimple_assign_single_p (stmt_info->stmt)
1805 && !gimple_vuse (stmt_info->stmt)
1806 && gimple_assign_rhs_code (stmt_info->stmt) == BIT_FIELD_REF)
1808 /* vect_build_slp_tree_2 determined all BIT_FIELD_REFs reference
1809 the same SSA name vector of a compatible type to vectype. */
1810 vec<std::pair<unsigned, unsigned> > lperm = vNULL;
1811 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0);
1812 stmt_vec_info estmt_info;
1813 FOR_EACH_VEC_ELT (stmts, i, estmt_info)
1815 gassign *estmt = as_a <gassign *> (estmt_info->stmt);
1816 tree bfref = gimple_assign_rhs1 (estmt);
1817 HOST_WIDE_INT lane;
1818 if (!known_eq (bit_field_size (bfref),
1819 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype))))
1820 || !constant_multiple_p (bit_field_offset (bfref),
1821 bit_field_size (bfref), &lane))
1823 lperm.release ();
1824 matches[0] = false;
1825 return NULL;
1827 lperm.safe_push (std::make_pair (0, (unsigned)lane));
1829 slp_tree vnode = vect_create_new_slp_node (vNULL);
1830 if (operand_equal_p (TYPE_SIZE (vectype), TYPE_SIZE (TREE_TYPE (vec))))
1831 /* ??? We record vectype here but we hide eventually necessary
1832 punning and instead rely on code generation to materialize
1833 VIEW_CONVERT_EXPRs as necessary. We should instead make
1834 this explicit somehow. */
1835 SLP_TREE_VECTYPE (vnode) = vectype;
1836 else
1838 /* For different size but compatible elements we can still
1839 use VEC_PERM_EXPR without punning. */
1840 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (vec))
1841 && types_compatible_p (TREE_TYPE (vectype),
1842 TREE_TYPE (TREE_TYPE (vec))));
1843 SLP_TREE_VECTYPE (vnode) = TREE_TYPE (vec);
1845 auto nunits = TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (vnode));
1846 unsigned HOST_WIDE_INT const_nunits;
1847 if (nunits.is_constant (&const_nunits))
1848 SLP_TREE_LANES (vnode) = const_nunits;
1849 SLP_TREE_VEC_DEFS (vnode).safe_push (vec);
1850 /* We always build a permutation node, even if it is an identity
1851 permute, to shield the rest of the vectorizer from the odd node
1852 representing an actual vector without any scalar ops.
1853 ??? We could hide it completely by making the permute node
1854 external? */
1855 node = vect_create_new_slp_node (node, stmts, 1);
1856 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
1857 SLP_TREE_LANE_PERMUTATION (node) = lperm;
1858 SLP_TREE_VECTYPE (node) = vectype;
1859 SLP_TREE_CHILDREN (node).quick_push (vnode);
1860 return node;
1862 /* When discovery reaches an associatable operation, see whether we can
1863 improve that to match up lanes in a way superior to the operand
1864 swapping code which at most looks at two defs.
1865 ??? For BB vectorization we cannot do the brute-force search
1866 for matching as we can succeed by means of builds from scalars
1867 and have no good way to "cost" one build against another. */
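/* For example (hypothetical lanes), a two-lane group
     x0 = (a + b) - c;   x1 = (d - e) + f;
   linearizes to the chains { +a, +b, -c } and { +d, -e, +f } over
   PLUS_EXPR.  The code below matches the chain elements up column-wise
   across lanes, permuting elements within a lane where a column fails
   to match, and uses a two-operator (mixed plus/minus) node for any
   column whose sign differs between lanes.  */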
1868 else if (is_a <loop_vec_info> (vinfo)
1869 /* ??? We don't handle !vect_internal_def defs below. */
1870 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
1871 && is_gimple_assign (stmt_info->stmt)
1872 && (associative_tree_code (gimple_assign_rhs_code (stmt_info->stmt))
1873 || gimple_assign_rhs_code (stmt_info->stmt) == MINUS_EXPR)
1874 && ((FLOAT_TYPE_P (vectype) && flag_associative_math)
1875 || (INTEGRAL_TYPE_P (TREE_TYPE (vectype))
1876 && TYPE_OVERFLOW_WRAPS (TREE_TYPE (vectype)))))
1878 /* See if we have a chain of (mixed) adds or subtracts or other
1879 associatable ops. */
1880 enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
1881 if (code == MINUS_EXPR)
1882 code = PLUS_EXPR;
1883 stmt_vec_info other_op_stmt_info = NULL;
1884 stmt_vec_info op_stmt_info = NULL;
1885 unsigned chain_len = 0;
1886 auto_vec<chain_op_t> chain;
1887 auto_vec<std::pair<tree_code, gimple *> > worklist;
1888 auto_vec<vec<chain_op_t> > chains (group_size);
1889 auto_vec<slp_tree, 4> children;
1890 bool hard_fail = true;
1891 for (unsigned lane = 0; lane < group_size; ++lane)
1893 /* For each lane linearize the addition/subtraction (or other
1894 uniform associatable operation) expression tree. */
1895 gimple *op_stmt = NULL, *other_op_stmt = NULL;
1896 vect_slp_linearize_chain (vinfo, worklist, chain, code,
1897 stmts[lane]->stmt, op_stmt, other_op_stmt,
1898 NULL);
1899 if (!op_stmt_info && op_stmt)
1900 op_stmt_info = vinfo->lookup_stmt (op_stmt);
1901 if (!other_op_stmt_info && other_op_stmt)
1902 other_op_stmt_info = vinfo->lookup_stmt (other_op_stmt);
1903 if (chain.length () == 2)
1905 /* In a chain of just two elements resort to the regular
1906 operand swapping scheme. If we run into a length
1907 mismatch still hard-FAIL. */
1908 if (chain_len == 0)
1909 hard_fail = false;
1910 else
1912 matches[lane] = false;
1913 /* ??? We might want to process the other lanes, but
1914 make sure to not give false matching hints to the
1915 caller for lanes we did not process. */
1916 if (lane != group_size - 1)
1917 matches[0] = false;
1919 break;
1921 else if (chain_len == 0)
1922 chain_len = chain.length ();
1923 else if (chain.length () != chain_len)
1925 /* ??? Here we could slip in magic to compensate with
1926 neutral operands. */
1927 matches[lane] = false;
1928 if (lane != group_size - 1)
1929 matches[0] = false;
1930 break;
1932 chains.quick_push (chain.copy ());
1933 chain.truncate (0);
1935 if (chains.length () == group_size)
1937 /* We cannot yet use SLP_TREE_CODE to communicate the operation. */
1938 if (!op_stmt_info)
1940 hard_fail = false;
1941 goto out;
1943 /* Now we have a set of chains with the same length. */
1944 /* 1. pre-sort according to def_type and operation. */
1945 for (unsigned lane = 0; lane < group_size; ++lane)
1946 chains[lane].stablesort (dt_sort_cmp, vinfo);
1947 if (dump_enabled_p ())
1949 dump_printf_loc (MSG_NOTE, vect_location,
1950 "pre-sorted chains of %s\n",
1951 get_tree_code_name (code));
1952 for (unsigned lane = 0; lane < group_size; ++lane)
1954 for (unsigned opnum = 0; opnum < chain_len; ++opnum)
1955 dump_printf (MSG_NOTE, "%s %T ",
1956 get_tree_code_name (chains[lane][opnum].code),
1957 chains[lane][opnum].op);
1958 dump_printf (MSG_NOTE, "\n");
1961 /* 2. try to build children nodes, associating as necessary. */
1962 for (unsigned n = 0; n < chain_len; ++n)
1964 vect_def_type dt = chains[0][n].dt;
1965 unsigned lane;
1966 for (lane = 0; lane < group_size; ++lane)
1967 if (chains[lane][n].dt != dt)
1969 if (dt == vect_constant_def
1970 && chains[lane][n].dt == vect_external_def)
1971 dt = vect_external_def;
1972 else if (dt == vect_external_def
1973 && chains[lane][n].dt == vect_constant_def)
1975 else
1976 break;
1978 if (lane != group_size)
1980 if (dump_enabled_p ())
1981 dump_printf_loc (MSG_NOTE, vect_location,
1982 "giving up on chain due to mismatched "
1983 "def types\n");
1984 matches[lane] = false;
1985 if (lane != group_size - 1)
1986 matches[0] = false;
1987 goto out;
1989 if (dt == vect_constant_def
1990 || dt == vect_external_def)
1992 /* Check whether we can build the invariant. If we can't
1993 we never will be able to. */
1994 tree type = TREE_TYPE (chains[0][n].op);
1995 if (!GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
1996 && (TREE_CODE (type) == BOOLEAN_TYPE
1997 || !can_duplicate_and_interleave_p (vinfo, group_size,
1998 type)))
2000 matches[0] = false;
2001 goto out;
2003 vec<tree> ops;
2004 ops.create (group_size);
2005 for (lane = 0; lane < group_size; ++lane)
2006 ops.quick_push (chains[lane][n].op);
2007 slp_tree child = vect_create_new_slp_node (ops);
2008 SLP_TREE_DEF_TYPE (child) = dt;
2009 children.safe_push (child);
2011 else if (dt != vect_internal_def)
2013 /* Not sure, we might need something special.
2014 gcc.dg/vect/pr96854.c,
2015 gfortran.dg/vect/fast-math-pr37021.f90
2016 and gfortran.dg/vect/pr61171.f trigger. */
2017 /* Soft-fail for now. */
2018 hard_fail = false;
2019 goto out;
2021 else
2023 vec<stmt_vec_info> op_stmts;
2024 op_stmts.create (group_size);
2025 slp_tree child = NULL;
2026 /* Brute-force our way. We have to consider a lane
2027 failing after fixing an earlier fail up in the
2028 SLP discovery recursion. So track the current
2029 permute per lane. */
2030 unsigned *perms = XALLOCAVEC (unsigned, group_size);
2031 memset (perms, 0, sizeof (unsigned) * group_size);
2034 op_stmts.truncate (0);
2035 for (lane = 0; lane < group_size; ++lane)
2036 op_stmts.quick_push
2037 (vinfo->lookup_def (chains[lane][n].op));
2038 child = vect_build_slp_tree (vinfo, op_stmts,
2039 group_size, &this_max_nunits,
2040 matches, limit,
2041 &this_tree_size, bst_map);
2042 /* ??? We're likely getting too many fatal mismatches
2043 here so maybe we want to ignore them (but then we
2044 have no idea which lanes fatally mismatched). */
2045 if (child || !matches[0])
2046 break;
2047 /* Swap another lane we have not yet matched up into
2048 lanes that did not match. If we run out of
2049 permute possibilities for a lane terminate the
2050 search. */
2051 bool term = false;
2052 for (lane = 1; lane < group_size; ++lane)
2053 if (!matches[lane])
2055 if (n + perms[lane] + 1 == chain_len)
2057 term = true;
2058 break;
2060 std::swap (chains[lane][n],
2061 chains[lane][n + perms[lane] + 1]);
2062 perms[lane]++;
2064 if (term)
2065 break;
2067 while (1);
2068 if (!child)
2070 if (dump_enabled_p ())
2071 dump_printf_loc (MSG_NOTE, vect_location,
2072 "failed to match up op %d\n", n);
2073 op_stmts.release ();
2074 if (lane != group_size - 1)
2075 matches[0] = false;
2076 else
2077 matches[lane] = false;
2078 goto out;
2080 if (dump_enabled_p ())
2082 dump_printf_loc (MSG_NOTE, vect_location,
2083 "matched up op %d to\n", n);
2084 vect_print_slp_tree (MSG_NOTE, vect_location, child);
2086 children.safe_push (child);
2089 /* 3. build SLP nodes to combine the chain. */
2090 for (unsigned lane = 0; lane < group_size; ++lane)
2091 if (chains[lane][0].code != code)
2093 /* See if there's any alternate all-PLUS entry. */
2094 unsigned n;
2095 for (n = 1; n < chain_len; ++n)
2097 for (lane = 0; lane < group_size; ++lane)
2098 if (chains[lane][n].code != code)
2099 break;
2100 if (lane == group_size)
2101 break;
2103 if (n != chain_len)
2105 /* Swap that in at first position. */
2106 std::swap (children[0], children[n]);
2107 for (lane = 0; lane < group_size; ++lane)
2108 std::swap (chains[lane][0], chains[lane][n]);
2110 else
2112 /* ??? When this triggers and we end up with two
2113 vect_constant/external_def operands up front, things break (ICE)
2114 spectacularly when finding an insertion place for the
2115 all-constant op. We should have a fully
2116 vect_internal_def operand though(?), so we can swap
2117 that into first place and then prepend the all-zero
2118 constant. */
2119 if (dump_enabled_p ())
2120 dump_printf_loc (MSG_NOTE, vect_location,
2121 "inserting constant zero to compensate "
2122 "for (partially) negated first "
2123 "operand\n");
2124 chain_len++;
2125 for (lane = 0; lane < group_size; ++lane)
2126 chains[lane].safe_insert
2127 (0, chain_op_t (code, vect_constant_def, NULL_TREE));
2128 vec<tree> zero_ops;
2129 zero_ops.create (group_size);
2130 zero_ops.quick_push (build_zero_cst (TREE_TYPE (vectype)));
2131 for (lane = 1; lane < group_size; ++lane)
2132 zero_ops.quick_push (zero_ops[0]);
2133 slp_tree zero = vect_create_new_slp_node (zero_ops);
2134 SLP_TREE_DEF_TYPE (zero) = vect_constant_def;
2135 children.safe_insert (0, zero);
2137 break;
2139 for (unsigned i = 1; i < children.length (); ++i)
2141 slp_tree op0 = children[i - 1];
2142 slp_tree op1 = children[i];
2143 bool this_two_op = false;
2144 for (unsigned lane = 0; lane < group_size; ++lane)
2145 if (chains[lane][i].code != chains[0][i].code)
2147 this_two_op = true;
2148 break;
2150 slp_tree child;
2151 if (i == children.length () - 1)
2152 child = vect_create_new_slp_node (node, stmts, 2);
2153 else
2154 child = vect_create_new_slp_node (2, ERROR_MARK);
2155 if (this_two_op)
2157 vec<std::pair<unsigned, unsigned> > lperm;
2158 lperm.create (group_size);
2159 for (unsigned lane = 0; lane < group_size; ++lane)
2160 lperm.quick_push (std::make_pair
2161 (chains[lane][i].code != chains[0][i].code, lane));
2162 vect_slp_build_two_operator_nodes (child, vectype, op0, op1,
2163 (chains[0][i].code == code
2164 ? op_stmt_info
2165 : other_op_stmt_info),
2166 (chains[0][i].code == code
2167 ? other_op_stmt_info
2168 : op_stmt_info),
2169 lperm);
2171 else
2173 SLP_TREE_DEF_TYPE (child) = vect_internal_def;
2174 SLP_TREE_VECTYPE (child) = vectype;
2175 SLP_TREE_LANES (child) = group_size;
2176 SLP_TREE_CHILDREN (child).quick_push (op0);
2177 SLP_TREE_CHILDREN (child).quick_push (op1);
2178 SLP_TREE_REPRESENTATIVE (child)
2179 = (chains[0][i].code == code
2180 ? op_stmt_info : other_op_stmt_info);
2182 children[i] = child;
2184 *tree_size += this_tree_size + 1;
2185 *max_nunits = this_max_nunits;
2186 while (!chains.is_empty ())
2187 chains.pop ().release ();
2188 return node;
2190 out:
2191 while (!children.is_empty ())
2192 vect_free_slp_tree (children.pop ());
2193 while (!chains.is_empty ())
2194 chains.pop ().release ();
2195 /* Hard-fail, otherwise we might run into quadratic processing of the
2196 chains, starting one stmt further into the chain again. */
2197 if (hard_fail)
2198 return NULL;
2199 /* Fall thru to normal processing. */
2202 /* Get at the operands, verifying they are compatible. */
2203 vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
2204 slp_oprnd_info oprnd_info;
2205 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
2207 int res = vect_get_and_check_slp_defs (vinfo, swap[i], skip_args,
2208 stmts, i, &oprnds_info);
2209 if (res != 0)
2210 matches[(res == -1) ? 0 : i] = false;
2211 if (!matches[0])
2212 break;
2214 for (i = 0; i < group_size; ++i)
2215 if (!matches[i])
2217 vect_free_oprnd_info (oprnds_info);
2218 return NULL;
2220 swap = NULL;
2222 auto_vec<slp_tree, 4> children;
2224 stmt_info = stmts[0];
2226 /* Create SLP_TREE nodes for the definition node/s. */
2227 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
2229 slp_tree child;
2230 unsigned int j;
2232 /* We're skipping certain operands from processing, for example
2233 outer loop reduction initial defs. */
2234 if (skip_args[i])
2236 children.safe_push (NULL);
2237 continue;
2240 if (oprnd_info->first_dt == vect_uninitialized_def)
2242 /* COND_EXPRs eventually have one operand too many if the condition
2243 is an SSA name. */
2244 gcc_assert (i == 3 && nops == 4);
2245 continue;
2248 if (is_a <bb_vec_info> (vinfo)
2249 && oprnd_info->first_dt == vect_internal_def
2250 && !oprnd_info->any_pattern)
2252 /* For BB vectorization, if all defs are the same do not
2253 bother to continue the build along the single-lane
2254 graph but use a splat of the scalar value. */
2255 stmt_vec_info first_def = oprnd_info->def_stmts[0];
2256 for (j = 1; j < group_size; ++j)
2257 if (oprnd_info->def_stmts[j] != first_def)
2258 break;
2259 if (j == group_size
2260 /* But avoid doing this for loads where we may be
2261 able to CSE things, unless the stmt is not
2262 vectorizable. */
2263 && (!STMT_VINFO_VECTORIZABLE (first_def)
2264 || !gimple_vuse (first_def->stmt)))
2266 if (dump_enabled_p ())
2267 dump_printf_loc (MSG_NOTE, vect_location,
2268 "Using a splat of the uniform operand %G",
2269 first_def->stmt);
2270 oprnd_info->first_dt = vect_external_def;
2274 if (oprnd_info->first_dt == vect_external_def
2275 || oprnd_info->first_dt == vect_constant_def)
2277 slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
2278 SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
2279 oprnd_info->ops = vNULL;
2280 children.safe_push (invnode);
2281 continue;
2284 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2285 group_size, &this_max_nunits,
2286 matches, limit,
2287 &this_tree_size, bst_map)) != NULL)
2289 oprnd_info->def_stmts = vNULL;
2290 children.safe_push (child);
2291 continue;
2294 /* If the SLP build for operand zero failed and operands zero
2295 and one can be commuted, try that for the scalar stmts
2296 that failed the match. */
2297 if (i == 0
2298 /* A first scalar stmt mismatch signals a fatal mismatch. */
2299 && matches[0]
2300 /* ??? For COND_EXPRs we can swap the comparison operands
2301 as well as the arms under some constraints. */
2302 && nops == 2
2303 && oprnds_info[1]->first_dt == vect_internal_def
2304 && is_gimple_assign (stmt_info->stmt)
2305 /* Swapping operands for reductions breaks assumptions later on. */
2306 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
2307 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
2309 /* See whether we can swap the matching or the non-matching
2310 stmt operands. */
2311 bool swap_not_matching = true;
2314 for (j = 0; j < group_size; ++j)
2316 if (matches[j] != !swap_not_matching)
2317 continue;
2318 stmt_vec_info stmt_info = stmts[j];
2319 /* Verify if we can swap operands of this stmt. */
2320 gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
2321 if (!stmt
2322 || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
2324 if (!swap_not_matching)
2325 goto fail;
2326 swap_not_matching = false;
2327 break;
2331 while (j != group_size);
2333 /* Swap mismatched definition stmts. */
2334 if (dump_enabled_p ())
2335 dump_printf_loc (MSG_NOTE, vect_location,
2336 "Re-trying with swapped operands of stmts ");
2337 for (j = 0; j < group_size; ++j)
2338 if (matches[j] == !swap_not_matching)
2340 std::swap (oprnds_info[0]->def_stmts[j],
2341 oprnds_info[1]->def_stmts[j]);
2342 std::swap (oprnds_info[0]->ops[j],
2343 oprnds_info[1]->ops[j]);
2344 if (dump_enabled_p ())
2345 dump_printf (MSG_NOTE, "%d ", j);
2347 if (dump_enabled_p ())
2348 dump_printf (MSG_NOTE, "\n");
2349 /* After swapping some operands we lost track of whether an
2350 operand has any pattern defs, so be conservative here. */
2351 if (oprnds_info[0]->any_pattern || oprnds_info[1]->any_pattern)
2352 oprnds_info[0]->any_pattern = oprnds_info[1]->any_pattern = true;
2353 /* And try again with scratch 'matches' ... */
2354 bool *tem = XALLOCAVEC (bool, group_size);
2355 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2356 group_size, &this_max_nunits,
2357 tem, limit,
2358 &this_tree_size, bst_map)) != NULL)
2360 oprnd_info->def_stmts = vNULL;
2361 children.safe_push (child);
2362 continue;
2365 fail:
2367 /* If the SLP build failed and we analyze a basic-block
2368 simply treat nodes we fail to build as externally defined
2369 (and thus build vectors from the scalar defs).
2370 The cost model will reject outright expensive cases.
2371 ??? This doesn't treat cases where permutation ultimately
2372 fails (or we don't try permutation below). Ideally we'd
2373 even compute a permutation that will end up with the maximum
2374 SLP tree size... */
2375 if (is_a <bb_vec_info> (vinfo)
2376 /* ??? Rejecting patterns this way doesn't work. We'd have to
2377 do extra work to cancel the pattern so the uses see the
2378 scalar version. */
2379 && !is_pattern_stmt_p (stmt_info)
2380 && !oprnd_info->any_pattern)
2382 /* But if there's a leading vector-sized set of matching stmts,
2383 fail here so we can split the group. This matches the condition
2384 vect_analyze_slp_instance uses. */
2385 /* ??? We might want to split here and combine the results to support
2386 multiple vector sizes better. */
2387 for (j = 0; j < group_size; ++j)
2388 if (!matches[j])
2389 break;
2390 if (!known_ge (j, TYPE_VECTOR_SUBPARTS (vectype)))
2392 if (dump_enabled_p ())
2393 dump_printf_loc (MSG_NOTE, vect_location,
2394 "Building vector operands from scalars\n");
2395 this_tree_size++;
2396 child = vect_create_new_slp_node (oprnd_info->ops);
2397 children.safe_push (child);
2398 oprnd_info->ops = vNULL;
2399 continue;
2403 gcc_assert (child == NULL);
2404 FOR_EACH_VEC_ELT (children, j, child)
2405 if (child)
2406 vect_free_slp_tree (child);
2407 vect_free_oprnd_info (oprnds_info);
2408 return NULL;
2411 vect_free_oprnd_info (oprnds_info);
2413 /* If all children of the node are built up from uniform scalars, or
2414 if it requires more than one possibly expensive vector construction,
2415 then just throw the node away, causing it to be built up from scalars.
2416 The exception is the SLP node for the vector store. */
2417 if (is_a <bb_vec_info> (vinfo)
2418 && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
2419 /* ??? Rejecting patterns this way doesn't work. We'd have to
2420 do extra work to cancel the pattern so the uses see the
2421 scalar version. */
2422 && !is_pattern_stmt_p (stmt_info))
2424 slp_tree child;
2425 unsigned j;
2426 bool all_uniform_p = true;
2427 unsigned n_vector_builds = 0;
2428 FOR_EACH_VEC_ELT (children, j, child)
2430 if (!child)
2432 else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
2433 all_uniform_p = false;
2434 else if (!vect_slp_tree_uniform_p (child))
2436 all_uniform_p = false;
2437 if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
2438 n_vector_builds++;
2441 if (all_uniform_p
2442 || n_vector_builds > 1
2443 || (n_vector_builds == children.length ()
2444 && is_a <gphi *> (stmt_info->stmt)))
2446 /* Roll back. */
2447 matches[0] = false;
2448 FOR_EACH_VEC_ELT (children, j, child)
2449 if (child)
2450 vect_free_slp_tree (child);
2452 if (dump_enabled_p ())
2453 dump_printf_loc (MSG_NOTE, vect_location,
2454 "Building parent vector operands from "
2455 "scalars instead\n");
2456 return NULL;
2460 *tree_size += this_tree_size + 1;
2461 *max_nunits = this_max_nunits;
2463 if (two_operators)
2465 /* ??? We'd likely want to either cache in bst_map something like
2466 { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
2467 the true { a+b, a+b, a+b, a+b } ... but there we don't have
2468 explicit stmts to put in so the keying on 'stmts' doesn't
2469 work (but we have the same issue with nodes that use 'ops'). */
2470 slp_tree one = new _slp_tree;
2471 slp_tree two = new _slp_tree;
2472 SLP_TREE_DEF_TYPE (one) = vect_internal_def;
2473 SLP_TREE_DEF_TYPE (two) = vect_internal_def;
2474 SLP_TREE_VECTYPE (one) = vectype;
2475 SLP_TREE_VECTYPE (two) = vectype;
2476 SLP_TREE_CHILDREN (one).safe_splice (children);
2477 SLP_TREE_CHILDREN (two).safe_splice (children);
2478 slp_tree child;
2479 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
2480 SLP_TREE_REF_COUNT (child)++;
2482 /* Here we record the original defs since this
2483 node represents the final lane configuration. */
2484 node = vect_create_new_slp_node (node, stmts, 2);
2485 SLP_TREE_VECTYPE (node) = vectype;
2486 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
2487 SLP_TREE_CHILDREN (node).quick_push (one);
2488 SLP_TREE_CHILDREN (node).quick_push (two);
2489 gassign *stmt = as_a <gassign *> (stmts[0]->stmt);
2490 enum tree_code code0 = gimple_assign_rhs_code (stmt);
2491 enum tree_code ocode = ERROR_MARK;
2492 stmt_vec_info ostmt_info;
2493 unsigned j = 0;
2494 FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
2496 gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
2497 if (gimple_assign_rhs_code (ostmt) != code0)
2499 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (1, i));
2500 ocode = gimple_assign_rhs_code (ostmt);
2501 j = i;
2503 else
2504 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (0, i));
2506 SLP_TREE_CODE (one) = code0;
2507 SLP_TREE_CODE (two) = ocode;
2508 SLP_TREE_LANES (one) = stmts.length ();
2509 SLP_TREE_LANES (two) = stmts.length ();
2510 SLP_TREE_REPRESENTATIVE (one) = stmts[0];
2511 SLP_TREE_REPRESENTATIVE (two) = stmts[j];
2512 return node;
2515 node = vect_create_new_slp_node (node, stmts, nops);
2516 SLP_TREE_VECTYPE (node) = vectype;
2517 SLP_TREE_CHILDREN (node).splice (children);
2518 return node;
2521 /* Dump a single SLP tree NODE. */
2523 static void
2524 vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
2525 slp_tree node)
2527 unsigned i, j;
2528 slp_tree child;
2529 stmt_vec_info stmt_info;
2530 tree op;
2532 dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
2533 dump_user_location_t user_loc = loc.get_user_location ();
2534 dump_printf_loc (metadata, user_loc,
2535 "node%s %p (max_nunits=" HOST_WIDE_INT_PRINT_UNSIGNED
2536 ", refcnt=%u)",
2537 SLP_TREE_DEF_TYPE (node) == vect_external_def
2538 ? " (external)"
2539 : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
2540 ? " (constant)"
2541 : ""), (void *) node,
2542 estimated_poly_value (node->max_nunits),
2543 SLP_TREE_REF_COUNT (node));
2544 if (SLP_TREE_VECTYPE (node))
2545 dump_printf (metadata, " %T", SLP_TREE_VECTYPE (node));
2546 dump_printf (metadata, "\n");
2547 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
2549 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
2550 dump_printf_loc (metadata, user_loc, "op: VEC_PERM_EXPR\n");
2551 else
2552 dump_printf_loc (metadata, user_loc, "op template: %G",
2553 SLP_TREE_REPRESENTATIVE (node)->stmt);
2555 if (SLP_TREE_SCALAR_STMTS (node).exists ())
2556 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2557 dump_printf_loc (metadata, user_loc, "\tstmt %u %G", i, stmt_info->stmt);
2558 else
2560 dump_printf_loc (metadata, user_loc, "\t{ ");
2561 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
2562 dump_printf (metadata, "%T%s ", op,
2563 i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
2564 dump_printf (metadata, "}\n");
2566 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2568 dump_printf_loc (metadata, user_loc, "\tload permutation {");
2569 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
2570 dump_printf (dump_kind, " %u", j);
2571 dump_printf (dump_kind, " }\n");
2573 if (SLP_TREE_LANE_PERMUTATION (node).exists ())
2575 dump_printf_loc (metadata, user_loc, "\tlane permutation {");
2576 for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
2577 dump_printf (dump_kind, " %u[%u]",
2578 SLP_TREE_LANE_PERMUTATION (node)[i].first,
2579 SLP_TREE_LANE_PERMUTATION (node)[i].second);
2580 dump_printf (dump_kind, " }\n");
2582 if (SLP_TREE_CHILDREN (node).is_empty ())
2583 return;
2584 dump_printf_loc (metadata, user_loc, "\tchildren");
2585 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2586 dump_printf (dump_kind, " %p", (void *)child);
2587 dump_printf (dump_kind, "\n");
2590 DEBUG_FUNCTION void
2591 debug (slp_tree node)
2593 debug_dump_context ctx;
2594 vect_print_slp_tree (MSG_NOTE,
2595 dump_location_t::from_location_t (UNKNOWN_LOCATION),
2596 node);
2599 /* Recursive helper for the dot producer below. */
2601 static void
2602 dot_slp_tree (FILE *f, slp_tree node, hash_set<slp_tree> &visited)
2604 if (visited.add (node))
2605 return;
2607 fprintf (f, "\"%p\" [label=\"", (void *)node);
2608 vect_print_slp_tree (MSG_NOTE,
2609 dump_location_t::from_location_t (UNKNOWN_LOCATION),
2610 node);
2611 fprintf (f, "\"];\n");
2614 for (slp_tree child : SLP_TREE_CHILDREN (node))
2615 fprintf (f, "\"%p\" -> \"%p\";", (void *)node, (void *)child);
2617 for (slp_tree child : SLP_TREE_CHILDREN (node))
2618 if (child)
2619 dot_slp_tree (f, child, visited);
2622 DEBUG_FUNCTION void
2623 dot_slp_tree (const char *fname, slp_tree node)
2625 FILE *f = fopen (fname, "w");
2626 fprintf (f, "digraph {\n");
2627 fflush (f);
2629 debug_dump_context ctx (f);
2630 hash_set<slp_tree> visited;
2631 dot_slp_tree (f, node, visited);
2633 fflush (f);
2634 fprintf (f, "}\n");
2635 fclose (f);
2638 /* Dump a slp tree NODE using flags specified in DUMP_KIND. */
2640 static void
2641 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2642 slp_tree node, hash_set<slp_tree> &visited)
2644 unsigned i;
2645 slp_tree child;
2647 if (visited.add (node))
2648 return;
2650 vect_print_slp_tree (dump_kind, loc, node);
2652 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2653 if (child)
2654 vect_print_slp_graph (dump_kind, loc, child, visited);
2657 static void
2658 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2659 slp_tree entry)
2661 hash_set<slp_tree> visited;
2662 vect_print_slp_graph (dump_kind, loc, entry, visited);
2665 /* Mark the tree rooted at NODE with PURE_SLP. */
2667 static void
2668 vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
2670 int i;
2671 stmt_vec_info stmt_info;
2672 slp_tree child;
2674 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2675 return;
2677 if (visited.add (node))
2678 return;
2680 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2681 STMT_SLP_TYPE (stmt_info) = pure_slp;
2683 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2684 if (child)
2685 vect_mark_slp_stmts (child, visited);
2688 static void
2689 vect_mark_slp_stmts (slp_tree node)
2691 hash_set<slp_tree> visited;
2692 vect_mark_slp_stmts (node, visited);
2695 /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */
2697 static void
2698 vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
2700 int i;
2701 stmt_vec_info stmt_info;
2702 slp_tree child;
2704 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2705 return;
2707 if (visited.add (node))
2708 return;
2710 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2712 gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
2713 || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
2714 STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
2717 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2718 if (child)
2719 vect_mark_slp_stmts_relevant (child, visited);
2722 static void
2723 vect_mark_slp_stmts_relevant (slp_tree node)
2725 hash_set<slp_tree> visited;
2726 vect_mark_slp_stmts_relevant (node, visited);
2730 /* Gather loads in the SLP graph NODE and populate the LOADS array. */
2732 static void
2733 vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
2734 hash_set<slp_tree> &visited)
2736 if (!node || visited.add (node))
2737 return;
2739 if (SLP_TREE_CHILDREN (node).length () == 0)
2741 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2742 return;
2743 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2744 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2745 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2746 loads.safe_push (node);
2748 else
2750 unsigned i;
2751 slp_tree child;
2752 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2753 vect_gather_slp_loads (loads, child, visited);
2758 /* Find the last scalar stmt in NODE. */
2760 stmt_vec_info
2761 vect_find_last_scalar_stmt_in_slp (slp_tree node)
2763 stmt_vec_info last = NULL;
2764 stmt_vec_info stmt_vinfo;
2766 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2768 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2769 last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
2772 return last;
2775 /* Find the first stmt in NODE. */
2777 stmt_vec_info
2778 vect_find_first_scalar_stmt_in_slp (slp_tree node)
2780 stmt_vec_info first = NULL;
2781 stmt_vec_info stmt_vinfo;
2783 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2785 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2786 if (!first
2787 || get_later_stmt (stmt_vinfo, first) == first)
2788 first = stmt_vinfo;
2791 return first;
2794 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
2795 two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
2796 (also containing the first GROUP1_SIZE stmts, since stores are
2797 consecutive), the second containing the remainder.
2798 Return the first stmt in the second group. */
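/* For example (hypothetical group), splitting a store group of eight
   stmts s0..s7 with GROUP1_SIZE == 4 yields groups { s0..s3 } and
   { s4..s7 }.  The second group's DR_GROUP_GAP becomes the original
   gap plus 4 (to skip over the first group) and the first group's gap
   is increased by 4 (to skip over the second), so each split group
   still strides over the whole original group per scalar iteration.  */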
2800 static stmt_vec_info
2801 vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
2803 gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
2804 gcc_assert (group1_size > 0);
2805 int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
2806 gcc_assert (group2_size > 0);
2807 DR_GROUP_SIZE (first_vinfo) = group1_size;
2809 stmt_vec_info stmt_info = first_vinfo;
2810 for (unsigned i = group1_size; i > 1; i--)
2812 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
2813 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2815 /* STMT is now the last element of the first group. */
2816 stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
2817 DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
2819 DR_GROUP_SIZE (group2) = group2_size;
2820 for (stmt_info = group2; stmt_info;
2821 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
2823 DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
2824 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2827 /* For the second group, the DR_GROUP_GAP is that before the original group,
2828 plus skipping over the first group. */
2829 DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
2831 /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
2832 DR_GROUP_GAP (first_vinfo) += group2_size;
2834 if (dump_enabled_p ())
2835 dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
2836 group1_size, group2_size);
2838 return group2;
2841 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
2842 statements and a vector of NUNITS elements. */
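/* For example, with NUNITS == 4 and GROUP_SIZE == 6 the common multiple
   is 12 and the unrolling factor is 2; with NUNITS == 4 and
   GROUP_SIZE == 8 the factor is 1, i.e. no unrolling is needed.  */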
2844 static poly_uint64
2845 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
2847 return exact_div (common_multiple (nunits, group_size), group_size);
2850 /* Helper that checks to see if a node is a load node. */
2852 static inline bool
2853 vect_is_slp_load_node (slp_tree root)
2855 return SLP_TREE_DEF_TYPE (root) == vect_internal_def
2856 && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_REPRESENTATIVE (root))
2857 && DR_IS_READ (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (root)));
2861 /* Helper function of optimize_load_redistribution that performs the operation
2862 recursively. */
2864 static slp_tree
2865 optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t *bst_map,
2866 vec_info *vinfo, unsigned int group_size,
2867 hash_map<slp_tree, slp_tree> *load_map,
2868 slp_tree root)
2870 if (slp_tree *leader = load_map->get (root))
2871 return *leader;
2873 slp_tree node;
2874 unsigned i;
2876 /* For now, we don't know anything about externals so do not do anything. */
2877 if (!root || SLP_TREE_DEF_TYPE (root) != vect_internal_def)
2878 return NULL;
2879 else if (SLP_TREE_CODE (root) == VEC_PERM_EXPR)
2881 /* First convert this node into a load node and add it to the leaves
2882 list, flattening the permute from a lane to a load one. If it's
2883 unneeded it will be elided later. */
2884 vec<stmt_vec_info> stmts;
2885 stmts.create (SLP_TREE_LANES (root));
2886 lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION (root);
2887 for (unsigned j = 0; j < lane_perm.length (); j++)
2889 std::pair<unsigned, unsigned> perm = lane_perm[j];
2890 node = SLP_TREE_CHILDREN (root)[perm.first];
2892 if (!vect_is_slp_load_node (node)
2893 || SLP_TREE_CHILDREN (node).exists ())
2895 stmts.release ();
2896 goto next;
2899 stmts.quick_push (SLP_TREE_SCALAR_STMTS (node)[perm.second]);
2902 if (dump_enabled_p ())
2903 dump_printf_loc (MSG_NOTE, vect_location,
2904 "converting stmts on permute node %p\n",
2905 (void *) root);
2907 bool *matches = XALLOCAVEC (bool, group_size);
2908 poly_uint64 max_nunits = 1;
2909 unsigned tree_size = 0, limit = 1;
2910 node = vect_build_slp_tree (vinfo, stmts, group_size, &max_nunits,
2911 matches, &limit, &tree_size, bst_map);
2912 if (!node)
2913 stmts.release ();
2915 load_map->put (root, node);
2916 return node;
2919 next:
2920 load_map->put (root, NULL);
2922 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
2924 slp_tree value
2925 = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
2926 node);
2927 if (value)
2929 SLP_TREE_REF_COUNT (value)++;
2930 SLP_TREE_CHILDREN (root)[i] = value;
2931 /* ??? We know the original leaves of the replaced nodes will
2932 be referenced by bst_map, only the permutes created by
2933 pattern matching are not. */
2934 if (SLP_TREE_REF_COUNT (node) == 1)
2935 load_map->remove (node);
2936 vect_free_slp_tree (node);
2940 return NULL;
2943 /* Temporary workaround for loads not being CSEd during SLP build. This
2944 function will traverse the SLP tree rooted in ROOT for INSTANCE and find
2945 VEC_PERM nodes that blend vectors from multiple nodes that all read from the
2946 same DR such that the final operation is equal to a permuted load. Such
2947 NODES are then directly converted into LOADS themselves. The nodes are
2948 CSEd using BST_MAP. */
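/* A minimal illustration (hypothetical nodes): a VEC_PERM node with lane
   permutation { (0,1), (1,0) } over two load nodes { a[0], a[1] } and
   { a[2], a[3] } from the same DR gathers { a[1], a[2] }, so it can be
   rebuilt as a single load node with load permutation { 1, 2 }.  */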
2950 static void
2951 optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t *bst_map,
2952 vec_info *vinfo, unsigned int group_size,
2953 hash_map<slp_tree, slp_tree> *load_map,
2954 slp_tree root)
2956 slp_tree node;
2957 unsigned i;
2959 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
2961 slp_tree value
2962 = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
2963 node);
2964 if (value)
2966 SLP_TREE_REF_COUNT (value)++;
2967 SLP_TREE_CHILDREN (root)[i] = value;
2968 /* ??? We know the original leaves of the replaced nodes will
2969 be referenced by bst_map, only the permutes created by
2970 pattern matching are not. */
2971 if (SLP_TREE_REF_COUNT (node) == 1)
2972 load_map->remove (node);
2973 vect_free_slp_tree (node);
2978 /* Helper function of vect_match_slp_patterns.
2980 Attempts to match patterns against the slp tree rooted in REF_NODE using
2981 VINFO. Patterns are matched in post-order traversal.
2983 If matching is successful the value in REF_NODE is updated in place and
2984 true is returned, otherwise REF_NODE is left unchanged and false is returned. */
2986 static bool
2987 vect_match_slp_patterns_2 (slp_tree *ref_node, vec_info *vinfo,
2988 slp_tree_to_load_perm_map_t *perm_cache,
2989 slp_compat_nodes_map_t *compat_cache,
2990 hash_set<slp_tree> *visited)
2992 unsigned i;
2993 slp_tree node = *ref_node;
2994 bool found_p = false;
2995 if (!node || visited->add (node))
2996 return false;
2998 slp_tree child;
2999 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3000 found_p |= vect_match_slp_patterns_2 (&SLP_TREE_CHILDREN (node)[i],
3001 vinfo, perm_cache, compat_cache,
3002 visited);
3004 for (unsigned x = 0; x < num__slp_patterns; x++)
3006 vect_pattern *pattern
3007 = slp_patterns[x] (perm_cache, compat_cache, ref_node);
3008 if (pattern)
3010 pattern->build (vinfo);
3011 delete pattern;
3012 found_p = true;
3016 return found_p;
3019 /* Applies pattern matching to the given SLP tree rooted in REF_NODE using
3020 vec_info VINFO.
3022 The tree is modified in place; true is returned if any pattern matched.
3023 Patterns are tried in order and multiple patterns may match. */
3025 static bool
3026 vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
3027 hash_set<slp_tree> *visited,
3028 slp_tree_to_load_perm_map_t *perm_cache,
3029 slp_compat_nodes_map_t *compat_cache)
3031 DUMP_VECT_SCOPE ("vect_match_slp_patterns");
3032 slp_tree *ref_node = &SLP_INSTANCE_TREE (instance);
3034 if (dump_enabled_p ())
3035 dump_printf_loc (MSG_NOTE, vect_location,
3036 "Analyzing SLP tree %p for patterns\n",
3037 (void *) SLP_INSTANCE_TREE (instance));
3039 return vect_match_slp_patterns_2 (ref_node, vinfo, perm_cache, compat_cache,
3040 visited);
3043 /* STMT_INFO is a store group of size GROUP_SIZE that we are considering
3044 splitting into two, with the first split group having size NEW_GROUP_SIZE.
3045 Return true if we could use IFN_STORE_LANES instead and if that appears
3046 to be the better approach. */
3048 static bool
3049 vect_slp_prefer_store_lanes_p (vec_info *vinfo, stmt_vec_info stmt_info,
3050 unsigned int group_size,
3051 unsigned int new_group_size)
3053 tree scalar_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
3054 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
3055 if (!vectype)
3056 return false;
3057 /* Allow the split if one of the two new groups would operate on full
3058 vectors *within* rather than across one scalar loop iteration.
3059 This is purely a heuristic, but it should work well for group
3060 sizes of 3 and 4, where the possible splits are:
3062 3->2+1: OK if the vector has exactly two elements
3063 4->2+2: Likewise
3064 4->3+1: Less clear-cut. */
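/* For example, with a two-element vector a 4->2+2 split leaves both new
   groups operating on full vectors, so the split is allowed (we return
   false); for a 4->3+1 split neither 3 nor 1 is a multiple of 2, so we
   instead ask whether IFN_STORE_LANES is supported for the whole group
   of 4.  */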
3065 if (multiple_p (group_size - new_group_size, TYPE_VECTOR_SUBPARTS (vectype))
3066 || multiple_p (new_group_size, TYPE_VECTOR_SUBPARTS (vectype)))
3067 return false;
3068 return vect_store_lanes_supported (vectype, group_size, false);
3071 /* Analyze an SLP instance starting from a group of grouped stores. Call
3072 vect_build_slp_tree to build a tree of packed stmts if possible.
3073 Return FALSE if it's impossible to SLP any stmt in the loop. */
3075 static bool
3076 vect_analyze_slp_instance (vec_info *vinfo,
3077 scalar_stmts_to_slp_tree_map_t *bst_map,
3078 stmt_vec_info stmt_info, slp_instance_kind kind,
3079 unsigned max_tree_size, unsigned *limit);
3081 /* Analyze an SLP instance starting from SCALAR_STMTS which are a group
3082 of KIND. Return true if successful. */
3084 static bool
3085 vect_build_slp_instance (vec_info *vinfo,
3086 slp_instance_kind kind,
3087 vec<stmt_vec_info> &scalar_stmts,
3088 vec<stmt_vec_info> &root_stmt_infos,
3089 unsigned max_tree_size, unsigned *limit,
3090 scalar_stmts_to_slp_tree_map_t *bst_map,
3091 /* ??? We need stmt_info for group splitting. */
3092 stmt_vec_info stmt_info_)
3094 if (dump_enabled_p ())
3096 dump_printf_loc (MSG_NOTE, vect_location,
3097 "Starting SLP discovery for\n");
3098 for (unsigned i = 0; i < scalar_stmts.length (); ++i)
3099 dump_printf_loc (MSG_NOTE, vect_location,
3100 " %G", scalar_stmts[i]->stmt);
3103 /* Build the tree for the SLP instance. */
3104 unsigned int group_size = scalar_stmts.length ();
3105 bool *matches = XALLOCAVEC (bool, group_size);
3106 poly_uint64 max_nunits = 1;
3107 unsigned tree_size = 0;
3108 unsigned i;
3109 slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
3110 &max_nunits, matches, limit,
3111 &tree_size, bst_map);
3112 if (node != NULL)
3114 /* Calculate the unrolling factor based on the smallest type. */
3115 poly_uint64 unrolling_factor
3116 = calculate_unrolling_factor (max_nunits, group_size);
3118 if (maybe_ne (unrolling_factor, 1U)
3119 && is_a <bb_vec_info> (vinfo))
3121 unsigned HOST_WIDE_INT const_max_nunits;
3122 if (!max_nunits.is_constant (&const_max_nunits)
3123 || const_max_nunits > group_size)
3125 if (dump_enabled_p ())
3126 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3127 "Build SLP failed: store group "
3128 "size not a multiple of the vector size "
3129 "in basic block SLP\n");
3130 vect_free_slp_tree (node);
3131 return false;
3133 /* Fatal mismatch. */
3134 if (dump_enabled_p ())
3135 dump_printf_loc (MSG_NOTE, vect_location,
3136 "SLP discovery succeeded but node needs "
3137 "splitting\n");
3138 memset (matches, true, group_size);
3139 matches[group_size / const_max_nunits * const_max_nunits] = false;
3140 vect_free_slp_tree (node);
3142 else
3144 /* Create a new SLP instance. */
3145 slp_instance new_instance = XNEW (class _slp_instance);
3146 SLP_INSTANCE_TREE (new_instance) = node;
3147 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
3148 SLP_INSTANCE_LOADS (new_instance) = vNULL;
3149 SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
3150 SLP_INSTANCE_KIND (new_instance) = kind;
3151 new_instance->reduc_phis = NULL;
3152 new_instance->cost_vec = vNULL;
3153 new_instance->subgraph_entries = vNULL;
3155 if (dump_enabled_p ())
3156 dump_printf_loc (MSG_NOTE, vect_location,
3157 "SLP size %u vs. limit %u.\n",
3158 tree_size, max_tree_size);
3160 /* Fixup SLP reduction chains. */
3161 if (kind == slp_inst_kind_reduc_chain)
3163 /* If this is a reduction chain with a conversion in front
3164 amend the SLP tree with a node for that. */
3165 gimple *scalar_def
3166 = vect_orig_stmt (scalar_stmts[group_size - 1])->stmt;
3167 if (STMT_VINFO_DEF_TYPE (scalar_stmts[0]) != vect_reduction_def)
3169 /* Get at the conversion stmt - we know it's the single use
3170 of the last stmt of the reduction chain. */
3171 use_operand_p use_p;
3172 bool r = single_imm_use (gimple_assign_lhs (scalar_def),
3173 &use_p, &scalar_def);
3174 gcc_assert (r);
3175 stmt_vec_info next_info = vinfo->lookup_stmt (scalar_def);
3176 next_info = vect_stmt_to_vectorize (next_info);
3177 scalar_stmts = vNULL;
3178 scalar_stmts.create (group_size);
3179 for (unsigned i = 0; i < group_size; ++i)
3180 scalar_stmts.quick_push (next_info);
3181 slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
3182 SLP_TREE_VECTYPE (conv) = STMT_VINFO_VECTYPE (next_info);
3183 SLP_TREE_CHILDREN (conv).quick_push (node);
3184 SLP_INSTANCE_TREE (new_instance) = conv;
3185 /* We also have to fake this conversion stmt as SLP reduction
3186 group so we don't have to mess with too much code
3187 elsewhere. */
3188 REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
3189 REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
3191 /* Fill the backedge child of the PHI SLP node. The
3192 general matching code cannot find it because the
3193 scalar code does not reflect how we vectorize the
3194 reduction. */
3195 use_operand_p use_p;
3196 imm_use_iterator imm_iter;
3197 class loop *loop = LOOP_VINFO_LOOP (as_a <loop_vec_info> (vinfo));
3198 FOR_EACH_IMM_USE_FAST (use_p, imm_iter,
3199 gimple_get_lhs (scalar_def))
3200 /* There are exactly two non-debug uses, the reduction
3201 PHI and the loop-closed PHI node. */
3202 if (!is_gimple_debug (USE_STMT (use_p))
3203 && gimple_bb (USE_STMT (use_p)) == loop->header)
3205 auto_vec<stmt_vec_info, 64> phis (group_size);
3206 stmt_vec_info phi_info
3207 = vinfo->lookup_stmt (USE_STMT (use_p));
3208 for (unsigned i = 0; i < group_size; ++i)
3209 phis.quick_push (phi_info);
3210 slp_tree *phi_node = bst_map->get (phis);
3211 unsigned dest_idx = loop_latch_edge (loop)->dest_idx;
3212 SLP_TREE_CHILDREN (*phi_node)[dest_idx]
3213 = SLP_INSTANCE_TREE (new_instance);
3214 SLP_INSTANCE_TREE (new_instance)->refcnt++;
3218 vinfo->slp_instances.safe_push (new_instance);
3220 /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
3221 the number of scalar stmts in the root in a few places.
3222 Verify that assumption holds. */
3223 gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
3224 .length () == group_size);
3226 if (dump_enabled_p ())
3228 dump_printf_loc (MSG_NOTE, vect_location,
3229 "Final SLP tree for instance %p:\n",
3230 (void *) new_instance);
3231 vect_print_slp_graph (MSG_NOTE, vect_location,
3232 SLP_INSTANCE_TREE (new_instance));
3235 return true;
3238 else
3240 /* Failed to SLP. */
3241 /* Free the allocated memory. */
3242 scalar_stmts.release ();
3245 stmt_vec_info stmt_info = stmt_info_;
3246 /* Try to break the group up into pieces. */
3247 if (kind == slp_inst_kind_store)
3249 /* ??? We could delay all the actual splitting of store-groups
3250 until after SLP discovery of the original group completed.
3251 Then we can recurse to vect_build_slp_instance directly. */
3252 for (i = 0; i < group_size; i++)
3253 if (!matches[i])
3254 break;
3256 /* For basic block SLP, try to break the group up into multiples of
3257 a vector size. */
3258 if (is_a <bb_vec_info> (vinfo)
3259 && (i > 1 && i < group_size))
3261 tree scalar_type
3262 = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
3263 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
3264 1 << floor_log2 (i));
3265 unsigned HOST_WIDE_INT const_nunits;
3266 if (vectype
3267 && TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits))
3269 /* Split into two groups at the first vector boundary. */
3270 gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
3271 unsigned group1_size = i & ~(const_nunits - 1);
3273 if (dump_enabled_p ())
3274 dump_printf_loc (MSG_NOTE, vect_location,
3275 "Splitting SLP group at stmt %u\n", i);
3276 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
3277 group1_size);
3278 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
3279 kind, max_tree_size,
3280 limit);
3281 /* Split the rest at the failure point and possibly
3282 re-analyze the remaining matching part if it has
3283 at least two lanes. */
3284 if (group1_size < i
3285 && (i + 1 < group_size
3286 || i - group1_size > 1))
3288 stmt_vec_info rest2 = rest;
3289 rest = vect_split_slp_store_group (rest, i - group1_size);
3290 if (i - group1_size > 1)
3291 res |= vect_analyze_slp_instance (vinfo, bst_map, rest2,
3292 kind, max_tree_size,
3293 limit);
3295 /* Re-analyze the non-matching tail if it has at least
3296 two lanes. */
3297 if (i + 1 < group_size)
3298 res |= vect_analyze_slp_instance (vinfo, bst_map,
3299 rest, kind, max_tree_size,
3300 limit);
3301 return res;
3305 /* For loop vectorization split into arbitrary pieces of size > 1. */
3306 if (is_a <loop_vec_info> (vinfo)
3307 && (i > 1 && i < group_size)
3308 && !vect_slp_prefer_store_lanes_p (vinfo, stmt_info, group_size, i))
3310 unsigned group1_size = i;
3312 if (dump_enabled_p ())
3313 dump_printf_loc (MSG_NOTE, vect_location,
3314 "Splitting SLP group at stmt %u\n", i);
3316 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
3317 group1_size);
3318 /* Loop vectorization cannot handle gaps in stores; make sure
3319 the split group appears as strided. */
3320 STMT_VINFO_STRIDED_P (rest) = 1;
3321 DR_GROUP_GAP (rest) = 0;
3322 STMT_VINFO_STRIDED_P (stmt_info) = 1;
3323 DR_GROUP_GAP (stmt_info) = 0;
3325 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
3326 kind, max_tree_size, limit);
3327 if (i + 1 < group_size)
3328 res |= vect_analyze_slp_instance (vinfo, bst_map,
3329 rest, kind, max_tree_size, limit);
3331 return res;
3334 /* Even though the first vector did not all match, we might be able to SLP
3335 (some) of the remainder. FORNOW ignore this possibility. */
3338 /* Failed to SLP. */
3339 if (dump_enabled_p ())
3340 dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
3341 return false;
3345 /* Analyze an SLP instance starting from a group of grouped stores. Call
3346 vect_build_slp_tree to build a tree of packed stmts if possible.
3347 Return FALSE if it's impossible to SLP any stmt in the loop. */
3349 static bool
3350 vect_analyze_slp_instance (vec_info *vinfo,
3351 scalar_stmts_to_slp_tree_map_t *bst_map,
3352 stmt_vec_info stmt_info,
3353 slp_instance_kind kind,
3354 unsigned max_tree_size, unsigned *limit)
3356 unsigned int i;
3357 vec<stmt_vec_info> scalar_stmts;
3359 if (is_a <bb_vec_info> (vinfo))
3360 vect_location = stmt_info->stmt;
3362 stmt_vec_info next_info = stmt_info;
3363 if (kind == slp_inst_kind_store)
3365 /* Collect the stores and store them in scalar_stmts. */
3366 scalar_stmts.create (DR_GROUP_SIZE (stmt_info));
3367 while (next_info)
3369 scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
3370 next_info = DR_GROUP_NEXT_ELEMENT (next_info);
3373 else if (kind == slp_inst_kind_reduc_chain)
3375 /* Collect the reduction stmts and store them in scalar_stmts. */
3376 scalar_stmts.create (REDUC_GROUP_SIZE (stmt_info));
3377 while (next_info)
3379 scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
3380 next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
3382 /* Mark the first element of the reduction chain as reduction to properly
3383 transform the node. In the reduction analysis phase only the last
3384 element of the chain is marked as reduction. */
3385 STMT_VINFO_DEF_TYPE (stmt_info)
3386 = STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
3387 STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
3388 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
3390 else if (kind == slp_inst_kind_ctor)
3392 tree rhs = gimple_assign_rhs1 (stmt_info->stmt);
3393 tree val;
3394 scalar_stmts.create (CONSTRUCTOR_NELTS (rhs));
3395 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), i, val)
3397 stmt_vec_info def_info = vinfo->lookup_def (val);
3398 def_info = vect_stmt_to_vectorize (def_info);
3399 scalar_stmts.quick_push (def_info);
3401 if (dump_enabled_p ())
3402 dump_printf_loc (MSG_NOTE, vect_location,
3403 "Analyzing vectorizable constructor: %G\n",
3404 stmt_info->stmt);
3406 else if (kind == slp_inst_kind_reduc_group)
3408 /* Collect reduction statements. */
3409 const vec<stmt_vec_info> &reductions
3410 = as_a <loop_vec_info> (vinfo)->reductions;
3411 scalar_stmts.create (reductions.length ());
3412 for (i = 0; reductions.iterate (i, &next_info); i++)
3413 if ((STMT_VINFO_RELEVANT_P (next_info)
3414 || STMT_VINFO_LIVE_P (next_info))
3415 /* ??? Make sure we didn't skip a conversion around a reduction
3416 path. In that case we'd have to reverse engineer that conversion
3417 stmt following the chain using reduc_idx and from the PHI
3418 using reduc_def. */
3419 && STMT_VINFO_DEF_TYPE (next_info) == vect_reduction_def)
3420 scalar_stmts.quick_push (next_info);
3421 /* If fewer than two were relevant/live, there's nothing to SLP. */
3422 if (scalar_stmts.length () < 2)
3423 return false;
3425 else
3426 gcc_unreachable ();
3428 vec<stmt_vec_info> roots = vNULL;
3429 if (kind == slp_inst_kind_ctor)
3431 roots.create (1);
3432 roots.quick_push (stmt_info);
3434 /* Build the tree for the SLP instance. */
3435 bool res = vect_build_slp_instance (vinfo, kind, scalar_stmts,
3436 roots,
3437 max_tree_size, limit, bst_map,
3438 kind == slp_inst_kind_store
3439 ? stmt_info : NULL);
3440 if (!res)
3441 roots.release ();
3443 /* ??? If this is slp_inst_kind_store and the above succeeded here's
3444 where we should do store group splitting. */
3446 return res;
3449 /* Check if there are stmts in the loop that can be vectorized using SLP.
3450 Build SLP trees of packed scalar stmts if SLP is possible. */
3452 opt_result
3453 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
3455 unsigned int i;
3456 stmt_vec_info first_element;
3457 slp_instance instance;
3459 DUMP_VECT_SCOPE ("vect_analyze_slp");
3461 unsigned limit = max_tree_size;
3463 scalar_stmts_to_slp_tree_map_t *bst_map
3464 = new scalar_stmts_to_slp_tree_map_t ();
3466 /* Find SLP sequences starting from groups of grouped stores. */
3467 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
3468 vect_analyze_slp_instance (vinfo, bst_map, first_element,
3469 STMT_VINFO_GROUPED_ACCESS (first_element)
3470 ? slp_inst_kind_store : slp_inst_kind_ctor,
3471 max_tree_size, &limit);
3473 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
3475 for (unsigned i = 0; i < bb_vinfo->roots.length (); ++i)
3477 vect_location = bb_vinfo->roots[i].roots[0]->stmt;
3478 if (vect_build_slp_instance (bb_vinfo, bb_vinfo->roots[i].kind,
3479 bb_vinfo->roots[i].stmts,
3480 bb_vinfo->roots[i].roots,
3481 max_tree_size, &limit, bst_map, NULL))
3483 bb_vinfo->roots[i].stmts = vNULL;
3484 bb_vinfo->roots[i].roots = vNULL;
3489 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3491 /* Find SLP sequences starting from reduction chains. */
3492 FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
3493 if (! STMT_VINFO_RELEVANT_P (first_element)
3494 && ! STMT_VINFO_LIVE_P (first_element))
3496 else if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
3497 slp_inst_kind_reduc_chain,
3498 max_tree_size, &limit))
3500 /* Dissolve reduction chain group. */
3501 stmt_vec_info vinfo = first_element;
3502 stmt_vec_info last = NULL;
3503 while (vinfo)
3505 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
3506 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
3507 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
3508 last = vinfo;
3509 vinfo = next;
3511 STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
3512 /* It can be still vectorized as part of an SLP reduction. */
3513 loop_vinfo->reductions.safe_push (last);
3516 /* Find SLP sequences starting from groups of reductions. */
3517 if (loop_vinfo->reductions.length () > 1)
3518 vect_analyze_slp_instance (vinfo, bst_map, loop_vinfo->reductions[0],
3519 slp_inst_kind_reduc_group, max_tree_size,
3520 &limit);
3523 hash_set<slp_tree> visited_patterns;
3524 slp_tree_to_load_perm_map_t perm_cache;
3525 slp_compat_nodes_map_t compat_cache;
3527 /* See if any patterns can be found in the SLP tree. */
3528 bool pattern_found = false;
3529 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3530 pattern_found |= vect_match_slp_patterns (instance, vinfo,
3531 &visited_patterns, &perm_cache,
3532 &compat_cache);
3534 /* If any were found optimize permutations of loads. */
3535 if (pattern_found)
3537 hash_map<slp_tree, slp_tree> load_map;
3538 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3540 slp_tree root = SLP_INSTANCE_TREE (instance);
3541 optimize_load_redistribution (bst_map, vinfo, SLP_TREE_LANES (root),
3542 &load_map, root);
3548 /* The map keeps a reference on SLP nodes built, release that. */
3549 for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
3550 it != bst_map->end (); ++it)
3551 if ((*it).second)
3552 vect_free_slp_tree ((*it).second);
3553 delete bst_map;
3555 if (pattern_found && dump_enabled_p ())
3557 dump_printf_loc (MSG_NOTE, vect_location,
3558 "Pattern matched SLP tree\n");
3559 hash_set<slp_tree> visited;
3560 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3561 vect_print_slp_graph (MSG_NOTE, vect_location,
3562 SLP_INSTANCE_TREE (instance), visited);
3565 return opt_result::success ();
3568 /* Estimates the cost of inserting layout changes into the SLP graph.
3569 It can also say that the insertion is impossible. */
3571 struct slpg_layout_cost
3573 slpg_layout_cost () = default;
3574 slpg_layout_cost (sreal, bool);
3576 static slpg_layout_cost impossible () { return { sreal::max (), 0 }; }
3577 bool is_possible () const { return depth != sreal::max (); }
3579 bool operator== (const slpg_layout_cost &) const;
3580 bool operator!= (const slpg_layout_cost &) const;
3582 bool is_better_than (const slpg_layout_cost &, bool) const;
3584 void add_parallel_cost (const slpg_layout_cost &);
3585 void add_serial_cost (const slpg_layout_cost &);
3586 void split (unsigned int);
3588 /* The longest sequence of layout changes needed during any traversal
3589 of the partition dag, weighted by execution frequency.
3591 This is the most important metric when optimizing for speed, since
3592 it helps to ensure that we keep the number of operations on
3593 critical paths to a minimum. */
3594 sreal depth = 0;
3596 /* An estimate of the total number of operations needed. It is weighted by
3597 execution frequency when optimizing for speed but not when optimizing for
3598 size. In order to avoid double-counting, a node with a fanout of N will
3599 distribute 1/N of its total cost to each successor.
3601 This is the most important metric when optimizing for size, since
3602 it helps to keep the total number of operations to a minimum. */
3603 sreal total = 0;
3606 /* Construct costs for a node with weight WEIGHT. A higher weight
3607 indicates more frequent execution. IS_FOR_SIZE is true if we are
3608 optimizing for size rather than speed. */
3610 slpg_layout_cost::slpg_layout_cost (sreal weight, bool is_for_size)
3611 : depth (weight), total (is_for_size && weight > 0 ? 1 : weight)
3615 bool
3616 slpg_layout_cost::operator== (const slpg_layout_cost &other) const
3618 return depth == other.depth && total == other.total;
3621 bool
3622 slpg_layout_cost::operator!= (const slpg_layout_cost &other) const
3624 return !operator== (other);
3627 /* Return true if these costs are better than OTHER. IS_FOR_SIZE is
3628 true if we are optimizing for size rather than speed. */
3630 bool
3631 slpg_layout_cost::is_better_than (const slpg_layout_cost &other,
3632 bool is_for_size) const
3634 if (is_for_size)
3636 if (total != other.total)
3637 return total < other.total;
3638 return depth < other.depth;
3640 else
3642 if (depth != other.depth)
3643 return depth < other.depth;
3644 return total < other.total;
3648 /* Increase the costs to account for something with cost INPUT_COST
3649 happening in parallel with the current costs. */
3651 void
3652 slpg_layout_cost::add_parallel_cost (const slpg_layout_cost &input_cost)
3654 depth = std::max (depth, input_cost.depth);
3655 total += input_cost.total;
3658 /* Increase the costs to account for something with cost INPUT_COST
3659 happening in series with the current costs. */
3661 void
3662 slpg_layout_cost::add_serial_cost (const slpg_layout_cost &other)
3664 depth += other.depth;
3665 total += other.total;
3668 /* Split the total cost among TIMES successors or predecessors. */
3670 void
3671 slpg_layout_cost::split (unsigned int times)
3673 if (times > 1)
3674 total /= times;
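/* For illustration, a hypothetical combination of two costs using the
   operations above (the weights are made up):

     slpg_layout_cost a (sreal (2), false);   // depth 2, total 2
     slpg_layout_cost b (sreal (3), false);   // depth 3, total 3
     a.add_parallel_cost (b);                 // now depth 3, total 5
     a.add_serial_cost (b);                   // now depth 6, total 8
     a.split (2);                             // now depth 6, total 4

   Parallel combination takes the maximum depth but sums the totals,
   serial combination sums both, and split only divides the total.  */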
3677 /* Information about one node in the SLP graph, for use during
3678 vect_optimize_slp_pass. */
3680 struct slpg_vertex
3682 slpg_vertex (slp_tree node_) : node (node_) {}
3684 /* The node itself. */
3685 slp_tree node;
3687 /* Which partition the node belongs to, or -1 if none. Nodes outside of
3688 partitions are flexible; they can have whichever layout consumers
3689 want them to have. */
3690 int partition = -1;
3692 /* The number of nodes that directly use the result of this one
3693 (i.e. the number of nodes that count this one as a child). */
3694 unsigned int out_degree = 0;
3696 /* The execution frequency of the node. */
3697 sreal weight = 0;
3699 /* The total execution frequency of all nodes that directly use the
3700 result of this one. */
3701 sreal out_weight = 0;
3704 /* Information about one partition of the SLP graph, for use during
3705 vect_optimize_slp_pass. */
3707 struct slpg_partition_info
3709 /* The nodes in the partition occupy indices [NODE_BEGIN, NODE_END)
3710 of m_partitioned_nodes. */
3711 unsigned int node_begin = 0;
3712 unsigned int node_end = 0;
3714 /* Which layout we've chosen to use for this partition, or -1 if
3715 we haven't picked one yet. */
3716 int layout = -1;
3718 /* The number of predecessors and successors in the partition dag.
3719 The predecessors always have lower partition numbers and the
3720 successors always have higher partition numbers.
3722 Note that the directions of these edges are not necessarily the
3723 same as in the data flow graph. For example, if an SCC has separate
3724 partitions for an inner loop and an outer loop, the inner loop's
3725 partition will have at least two incoming edges from the outer loop's
3726 partition: one for a live-in value and one for a live-out value.
3727 In data flow terms, one of these edges would also be from the outer loop
3728 to the inner loop, but the other would be in the opposite direction. */
3729 unsigned int in_degree = 0;
3730 unsigned int out_degree = 0;
3733 /* Information about the costs of using a particular layout for a
3734 particular partition. It can also say that the combination is
3735 impossible. */
3737 struct slpg_partition_layout_costs
3739 bool is_possible () const { return internal_cost.is_possible (); }
3740 void mark_impossible () { internal_cost = slpg_layout_cost::impossible (); }
3742 /* The costs inherited from predecessor partitions. */
3743 slpg_layout_cost in_cost;
3745 /* The inherent cost of the layout within the node itself. For example,
3746 this is nonzero for a load if choosing a particular layout would require
3747 the load to permute the loaded elements. It is nonzero for a
3748 VEC_PERM_EXPR if the permutation cannot be eliminated or converted
3749 to full-vector moves. */
3750 slpg_layout_cost internal_cost;
3752 /* The costs inherited from successor partitions. */
3753 slpg_layout_cost out_cost;
3756 /* This class tries to optimize the layout of vectors in order to avoid
3757 unnecessary shuffling. At the moment, the set of possible layouts is
3758 restricted to bijective permutations.
3760 The goal of the pass depends on whether we're optimizing for size or
3761 for speed. When optimizing for size, the goal is to reduce the overall
3762 number of layout changes (including layout changes implied by things
3763 like load permutations). When optimizing for speed, the goal is to
3764 reduce the maximum latency attributable to layout changes on any
3765 non-cyclical path through the data flow graph.
3767 For example, when optimizing a loop nest for speed, we will prefer
3768 to make layout changes outside of a loop rather than inside of a loop,
3769 and will prefer to make layout changes in parallel rather than serially,
3770 even if that increases the overall number of layout changes.
3772 The high-level procedure is:
3774 (1) Build a graph in which edges go from uses (parents) to definitions
3775 (children).
3777 (2) Divide the graph into a dag of strongly-connected components (SCCs).
3779 (3) When optimizing for speed, partition the nodes in each SCC based
3780 on their containing cfg loop. When optimizing for size, treat
3781 each SCC as a single partition.
3783 This gives us a dag of partitions. The goal is now to assign a
3784 layout to each partition.
3786 (4) Construct a set of vector layouts that are worth considering.
3787 Record which nodes must keep their current layout.
3789 (5) Perform a forward walk over the partition dag (from loads to stores)
3790 accumulating the "forward" cost of using each layout. When visiting
3791 each partition, assign a tentative choice of layout to the partition
3792 and use that choice when calculating the cost of using a different
3793 layout in successor partitions.
3795 (6) Perform a backward walk over the partition dag (from stores to loads),
3796 accumulating the "backward" cost of using each layout. When visiting
3797 each partition, make a final choice of layout for that partition based
3798 on the accumulated forward costs (from (5)) and backward costs
3799 (from (6)).
3801 (7) Apply the chosen layouts to the SLP graph.
3803 For example, consider the SLP statements:
3805 S1: a_1 = load
3806 loop:
3807 S2: a_2 = PHI<a_1, a_3>
3808 S3: b_1 = load
3809 S4: a_3 = a_2 + b_1
3810 exit:
3811 S5: a_4 = PHI<a_3>
3812 S6: store a_4
3814 S2 and S4 form an SCC and are part of the same loop. Every other
3815 statement is in a singleton SCC. In this example there is a one-to-one
3816 mapping between SCCs and partitions and the partition dag looks like this:
3818 S1 S3
3820 S2+S4
3826 S2, S3 and S4 will have a higher execution frequency than the other
3827 statements, so when optimizing for speed, the goal is to avoid any
3828 layout changes:
3830 - within S3
3831 - within S2+S4
3832 - on the S3->S2+S4 edge
3834 For example, if S3 was originally a reversing load, the goal of the
3835 pass is to make it an unreversed load and change the layout on the
3836 S1->S2+S4 and S2+S4->S5 edges to compensate. (Changing the layout
3837 on S1->S2+S4 and S5->S6 would also be acceptable.)
3839 The difference between SCCs and partitions becomes important if we
3840 add an outer loop:
3842 S1: a_1 = ...
3843 loop1:
3844 S2: a_2 = PHI<a_1, a_6>
3845 S3: b_1 = load
3846 S4: a_3 = a_2 + b_1
3847 loop2:
3848 S5: a_4 = PHI<a_3, a_5>
3849 S6: c_1 = load
3850 S7: a_5 = a_4 + c_1
3851 exit2:
3852 S8: a_6 = PHI<a_5>
3853 S9: store a_6
3854 exit1:
3856 Here, S2, S4, S5, S7 and S8 form a single SCC. However, when optimizing
3857 for speed, we usually do not want restrictions in the outer loop to "infect"
3858 the decision for the inner loop. For example, if an outer-loop node
3859 in the SCC contains a statement with a fixed layout, that should not
3860 prevent the inner loop from using a different layout. Conversely,
3861 the inner loop should not dictate a layout to the outer loop: if the
3862 outer loop does a lot of computation, then it may not be efficient to
3863 do all of that computation in the inner loop's preferred layout.
3865 So when optimizing for speed, we partition the SCC into S2+S4+S8 (outer)
3866 and S5+S7 (inner). We also try to arrange partitions so that:
3868 - the partition for an outer loop comes before the partition for
3869 an inner loop
3871 - if a sibling loop A dominates a sibling loop B, A's partition
3872 comes before B's
3874 This gives the following partition dag for the example above:
3876 S1 S3
3878 S2+S4+S8 S6
3879 | \\ /
3880 | S5+S7
3884 There are two edges from S2+S4+S8 to S5+S7: one for the edge S4->S5 and
3885 one for a reversal of the edge S7->S8.
3887 The backward walk picks a layout for S5+S7 before S2+S4+S8. The choice
3888 for S2+S4+S8 therefore has to balance the cost of using the outer loop's
3889 preferred layout against the cost of changing the layout on entry to the
3890 inner loop (S4->S5) and on exit from the inner loop (S7->S8 reversed).
3892 Although this works well when optimizing for speed, it has the downside
3893 when optimizing for size that the choice of layout for S5+S7 is completely
3894 independent of S9, which lessens the chance of reducing the overall number
3895 of permutations. We therefore do not partition SCCs when optimizing
3896 for size.
3898 To give a concrete example of the difference between optimizing
3899 for size and speed, consider:
3901 a[0] = (b[1] << c[3]) - d[1];
3902 a[1] = (b[0] << c[2]) - d[0];
3903 a[2] = (b[3] << c[1]) - d[3];
3904 a[3] = (b[2] << c[0]) - d[2];
3906 There are three different layouts here: one for a, one for b and d,
3907 and one for c. When optimizing for speed it is better to permute each
3908 of b, c and d into the order required by a, since those permutations
3909 happen in parallel. But when optimizing for size, it is better to:
3911 - permute c into the same order as b
3912 - do the arithmetic
3913 - permute the result into the order required by a
3915 This gives 2 permutations rather than 3. */
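/* Sketched in hypothetical pseudo-code ("permute" below is just shorthand
   for a lane permutation, not a real API), the two strategies for the
   example above are:

     // Optimizing for speed: three permutations, done in parallel.
     bp = permute (b, { 1, 0, 3, 2 });
     cp = permute (c, { 3, 2, 1, 0 });
     dp = permute (d, { 1, 0, 3, 2 });
     a = (bp << cp) - dp;

     // Optimizing for size: permute c to match b's order, do the
     // arithmetic, then permute the result into the order required by a.
     cp = permute (c, { 2, 3, 0, 1 });
     t = (b << cp) - d;
     a = permute (t, { 1, 0, 3, 2 });  */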
3917 class vect_optimize_slp_pass
3919 public:
3920 vect_optimize_slp_pass (vec_info *vinfo) : m_vinfo (vinfo) {}
3921 void run ();
3923 private:
3924 /* Graph building. */
3925 struct loop *containing_loop (slp_tree);
3926 bool is_cfg_latch_edge (graph_edge *);
3927 void build_vertices (hash_set<slp_tree> &, slp_tree);
3928 void build_vertices ();
3929 void build_graph ();
3931 /* Partitioning. */
3932 void create_partitions ();
3933 template<typename T> void for_each_partition_edge (unsigned int, T);
3935 /* Layout selection. */
3936 bool is_compatible_layout (slp_tree, unsigned int);
3937 int change_layout_cost (slp_tree, unsigned int, unsigned int);
3938 slpg_partition_layout_costs &partition_layout_costs (unsigned int,
3939 unsigned int);
3940 void change_vec_perm_layout (slp_tree, lane_permutation_t &,
3941 int, unsigned int);
3942 int internal_node_cost (slp_tree, int, unsigned int);
3943 void start_choosing_layouts ();
3945 /* Cost propagation. */
3946 slpg_layout_cost edge_layout_cost (graph_edge *, unsigned int,
3947 unsigned int, unsigned int);
3948 slpg_layout_cost total_in_cost (unsigned int);
3949 slpg_layout_cost forward_cost (graph_edge *, unsigned int, unsigned int);
3950 slpg_layout_cost backward_cost (graph_edge *, unsigned int, unsigned int);
3951 void forward_pass ();
3952 void backward_pass ();
3954 /* Rematerialization. */
3955 slp_tree get_result_with_layout (slp_tree, unsigned int);
3956 void materialize ();
3958 /* Clean-up. */
3959 void remove_redundant_permutations ();
3961 void dump ();
3963 vec_info *m_vinfo;
3965 /* True if we should optimize the graph for size, false if we should
3966 optimize it for speed. (It wouldn't be easy to make this decision
3967 more locally.) */
3968 bool m_optimize_size;
3970 /* A graph of all SLP nodes, with edges leading from uses to definitions.
3971 In other words, a node's predecessors are its slp_tree parents and
3972 a node's successors are its slp_tree children. */
3973 graph *m_slpg = nullptr;
3975 /* The vertices of M_SLPG, indexed by slp_tree::vertex. */
3976 auto_vec<slpg_vertex> m_vertices;
3978 /* The list of all leaves of M_SLPG, such as external definitions, constants,
3979 and loads. */
3980 auto_vec<int> m_leafs;
3982 /* This array has one entry for every vector layout that we're considering.
3983 Element 0 is null and indicates "no change". Other entries describe
3984 permutations that are inherent in the current graph and that we would
3985 like to reverse if possible.
3987 For example, a permutation { 1, 2, 3, 0 } means that something has
3988 effectively been permuted in that way, such as a load group
3989 { a[1], a[2], a[3], a[0] } (viewed as a permutation of a[0:3]).
3990 We'd then like to apply the reverse permutation { 3, 0, 1, 2 }
3991 in order to put things "back" in order. */
3992 auto_vec<vec<unsigned> > m_perms;
3994 /* A partitioning of the nodes for which a layout must be chosen.
3995 Each partition represents an <SCC, cfg loop> pair; that is,
3996 nodes in different SCCs belong to different partitions, and nodes
3997 within an SCC can be further partitioned according to a containing
3998 cfg loop. Partition <SCC1, L1> comes before <SCC2, L2> if:
4000 - SCC1 != SCC2 and SCC1 is a predecessor of SCC2 in a forward walk
4001 from leaves (such as loads) to roots (such as stores).
4003 - SCC1 == SCC2 and L1's header strictly dominates L2's header. */
4004 auto_vec<slpg_partition_info> m_partitions;
4006 /* The list of all nodes for which a layout must be chosen. Nodes for
4007 partition P come before the nodes for partition P+1. Nodes within a
4008 partition are in reverse postorder. */
4009 auto_vec<unsigned int> m_partitioned_nodes;
4011 /* Index P * num-layouts + L contains the cost of using layout L
4012 for partition P. */
4013 auto_vec<slpg_partition_layout_costs> m_partition_layout_costs;
4015 /* Index N * num-layouts + L, if nonnull, is a node that provides the
4016 original output of node N adjusted to have layout L. */
4017 auto_vec<slp_tree> m_node_layouts;
4020 /* Fill the vertices and leafs vector with all nodes in the SLP graph.
4021 Also record whether we should optimize anything for speed rather
4022 than size. */
4024 void
4025 vect_optimize_slp_pass::build_vertices (hash_set<slp_tree> &visited,
4026 slp_tree node)
4028 unsigned i;
4029 slp_tree child;
4031 if (visited.add (node))
4032 return;
4034 if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
4036 basic_block bb = gimple_bb (vect_orig_stmt (rep)->stmt);
4037 if (optimize_bb_for_speed_p (bb))
4038 m_optimize_size = false;
4041 node->vertex = m_vertices.length ();
4042 m_vertices.safe_push (slpg_vertex (node));
4044 bool leaf = true;
4045 bool force_leaf = false;
4046 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4047 if (child)
4049 leaf = false;
4050 build_vertices (visited, child);
4052 else
4053 force_leaf = true;
4054 /* Since SLP discovery works along use-def edges, all cycles have an
4055 entry - but there's the exception of cycles where we do not handle
4056 the entry explicitly (but with a NULL SLP node), like some reductions
4057 and inductions. Force those SLP PHIs to act as leafs to make them
4058 backwards reachable. */
4059 if (leaf || force_leaf)
4060 m_leafs.safe_push (node->vertex);
4063 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
4065 void
4066 vect_optimize_slp_pass::build_vertices ()
4068 hash_set<slp_tree> visited;
4069 unsigned i;
4070 slp_instance instance;
4071 FOR_EACH_VEC_ELT (m_vinfo->slp_instances, i, instance)
4072 build_vertices (visited, SLP_INSTANCE_TREE (instance));
4075 /* Apply (reverse) bijective PERM to VEC. */
4077 template <class T>
4078 static void
4079 vect_slp_permute (vec<unsigned> perm,
4080 vec<T> &vec, bool reverse)
4082 auto_vec<T, 64> saved;
4083 saved.create (vec.length ());
4084 for (unsigned i = 0; i < vec.length (); ++i)
4085 saved.quick_push (vec[i]);
4087 if (reverse)
4089 for (unsigned i = 0; i < vec.length (); ++i)
4090 vec[perm[i]] = saved[i];
4091 for (unsigned i = 0; i < vec.length (); ++i)
4092 gcc_assert (vec[perm[i]] == saved[i]);
4094 else
4096 for (unsigned i = 0; i < vec.length (); ++i)
4097 vec[i] = saved[perm[i]];
4098 for (unsigned i = 0; i < vec.length (); ++i)
4099 gcc_assert (vec[i] == saved[perm[i]]);
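/* A small worked example of the helper above (values are made up):
   with perm = { 1, 2, 3, 0 } and vec = { a, b, c, d },

     vect_slp_permute (perm, vec, false);   // vec becomes { b, c, d, a }
     vect_slp_permute (perm, vec, true);    // vec is back to { a, b, c, d }

   so applying the permutation forwards and then in reverse restores the
   original order, as expected for a bijective permutation.  */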
4103 /* Return the cfg loop that contains NODE. */
4105 struct loop *
4106 vect_optimize_slp_pass::containing_loop (slp_tree node)
4108 stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
4109 if (!rep)
4110 return ENTRY_BLOCK_PTR_FOR_FN (cfun)->loop_father;
4111 return gimple_bb (vect_orig_stmt (rep)->stmt)->loop_father;
4114 /* Return true if UD (an edge from a use to a definition) is associated
4115 with a loop latch edge in the cfg. */
4117 bool
4118 vect_optimize_slp_pass::is_cfg_latch_edge (graph_edge *ud)
4120 slp_tree use = m_vertices[ud->src].node;
4121 slp_tree def = m_vertices[ud->dest].node;
4122 if (SLP_TREE_DEF_TYPE (use) != vect_internal_def
4123 || SLP_TREE_DEF_TYPE (def) != vect_internal_def)
4124 return false;
4126 stmt_vec_info use_rep = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (use));
4127 return (is_a<gphi *> (use_rep->stmt)
4128 && bb_loop_header_p (gimple_bb (use_rep->stmt))
4129 && containing_loop (def) == containing_loop (use));
4132 /* Build the graph. Mark edges that correspond to cfg loop latch edges with
4133 a nonnull data field. */
4135 void
4136 vect_optimize_slp_pass::build_graph ()
4138 m_optimize_size = true;
4139 build_vertices ();
4141 m_slpg = new_graph (m_vertices.length ());
4142 for (slpg_vertex &v : m_vertices)
4143 for (slp_tree child : SLP_TREE_CHILDREN (v.node))
4144 if (child)
4146 graph_edge *ud = add_edge (m_slpg, v.node->vertex, child->vertex);
4147 if (is_cfg_latch_edge (ud))
4148 ud->data = this;
4152 /* Return true if E corresponds to a loop latch edge in the cfg. */
4154 static bool
4155 skip_cfg_latch_edges (graph_edge *e)
4157 return e->data;
4160 /* Create the node partitions. */
4162 void
4163 vect_optimize_slp_pass::create_partitions ()
4165 /* Calculate a postorder of the graph, ignoring edges that correspond
4166 to natural latch edges in the cfg. Reading the vector from the end
4167 to the beginning gives the reverse postorder. */
4168 auto_vec<int> initial_rpo;
4169 graphds_dfs (m_slpg, &m_leafs[0], m_leafs.length (), &initial_rpo,
4170 false, NULL, skip_cfg_latch_edges);
4171 gcc_assert (initial_rpo.length () == m_vertices.length ());
4173 /* Calculate the strongly connected components of the graph. */
4174 auto_vec<int> scc_grouping;
4175 unsigned int num_sccs = graphds_scc (m_slpg, NULL, NULL, &scc_grouping);
4177 /* Create a new index order in which all nodes from the same SCC are
4178 consecutive. Use scc_pos to record the index of the first node in
4179 each SCC. */
4180 auto_vec<unsigned int> scc_pos (num_sccs);
4181 int last_component = -1;
4182 unsigned int node_count = 0;
4183 for (unsigned int node_i : scc_grouping)
4185 if (last_component != m_slpg->vertices[node_i].component)
4187 last_component = m_slpg->vertices[node_i].component;
4188 gcc_assert (last_component == int (scc_pos.length ()));
4189 scc_pos.quick_push (node_count);
4191 node_count += 1;
4193 gcc_assert (node_count == initial_rpo.length ()
4194 && last_component + 1 == int (num_sccs));
4196 /* Use m_partitioned_nodes to group nodes into SCC order, with the nodes
4197 inside each SCC following the RPO we calculated above. The fact that
4198 we ignored natural latch edges when calculating the RPO should ensure
4199 that, for natural loop nests:
4201 - the first node that we encounter in a cfg loop is the loop header phi
4202 - the loop header phis are in dominance order
4204 Arranging for this is an optimization (see below) rather than a
4205 correctness issue. Unnatural loops with a tangled mess of backedges
4206 will still work correctly, but might give poorer results.
4208 Also update scc_pos so that it gives 1 + the index of the last node
4209 in the SCC. */
4210 m_partitioned_nodes.safe_grow (node_count);
4211 for (unsigned int old_i = initial_rpo.length (); old_i-- > 0;)
4213 unsigned int node_i = initial_rpo[old_i];
4214 unsigned int new_i = scc_pos[m_slpg->vertices[node_i].component]++;
4215 m_partitioned_nodes[new_i] = node_i;
4218 /* When optimizing for speed, partition each SCC based on the containing
4219 cfg loop. The order we constructed above should ensure that, for natural
4220 cfg loops, we'll create sub-SCC partitions for outer loops before
4221 the corresponding sub-SCC partitions for inner loops. Similarly,
4222 when one sibling loop A dominates another sibling loop B, we should
4223 create a sub-SCC partition for A before a sub-SCC partition for B.
4225 As above, nothing depends for correctness on whether this achieves
4226 a natural nesting, but we should get better results when it does. */
4227 m_partitions.reserve (m_vertices.length ());
4228 unsigned int next_partition_i = 0;
4229 hash_map<struct loop *, int> loop_partitions;
4230 unsigned int rpo_begin = 0;
4231 unsigned int num_partitioned_nodes = 0;
4232 for (unsigned int rpo_end : scc_pos)
4234 loop_partitions.empty ();
4235 unsigned int partition_i = next_partition_i;
4236 for (unsigned int rpo_i = rpo_begin; rpo_i < rpo_end; ++rpo_i)
4238 /* Handle externals and constants optimistically throughout.
4239 But treat existing vectors as fixed since we do not handle
4240 permuting them. */
4241 unsigned int node_i = m_partitioned_nodes[rpo_i];
4242 auto &vertex = m_vertices[node_i];
4243 if ((SLP_TREE_DEF_TYPE (vertex.node) == vect_external_def
4244 && !SLP_TREE_VEC_DEFS (vertex.node).exists ())
4245 || SLP_TREE_DEF_TYPE (vertex.node) == vect_constant_def)
4246 vertex.partition = -1;
4247 else
4249 bool existed;
4250 if (m_optimize_size)
4251 existed = next_partition_i > partition_i;
4252 else
4254 struct loop *loop = containing_loop (vertex.node);
4255 auto &entry = loop_partitions.get_or_insert (loop, &existed);
4256 if (!existed)
4257 entry = next_partition_i;
4258 partition_i = entry;
4260 if (!existed)
4262 m_partitions.quick_push (slpg_partition_info ());
4263 next_partition_i += 1;
4265 vertex.partition = partition_i;
4266 num_partitioned_nodes += 1;
4267 m_partitions[partition_i].node_end += 1;
4270 rpo_begin = rpo_end;
4273 /* Assign ranges of consecutive node indices to each partition,
4274 in partition order. Start with node_end being the same as
4275 node_begin so that the next loop can use it as a counter. */
4276 unsigned int node_begin = 0;
4277 for (auto &partition : m_partitions)
4279 partition.node_begin = node_begin;
4280 node_begin += partition.node_end;
4281 partition.node_end = partition.node_begin;
4283 gcc_assert (node_begin == num_partitioned_nodes);
4285 /* Finally build the list of nodes in partition order. */
4286 m_partitioned_nodes.truncate (num_partitioned_nodes);
4287 for (unsigned int node_i = 0; node_i < m_vertices.length (); ++node_i)
4289 int partition_i = m_vertices[node_i].partition;
4290 if (partition_i >= 0)
4292 unsigned int order_i = m_partitions[partition_i].node_end++;
4293 m_partitioned_nodes[order_i] = node_i;
4298 /* Look for edges from earlier partitions into node NODE_I and edges from
4299 node NODE_I into later partitions. Call:
4301 FN (ud, other_node_i)
4303 for each such use-to-def edge ud, where other_node_i is the node at the
4304 other end of the edge. */
4306 template<typename T>
4307 void
4308 vect_optimize_slp_pass::for_each_partition_edge (unsigned int node_i, T fn)
4310 int partition_i = m_vertices[node_i].partition;
4311 for (graph_edge *pred = m_slpg->vertices[node_i].pred;
4312 pred; pred = pred->pred_next)
4314 int src_partition_i = m_vertices[pred->src].partition;
4315 if (src_partition_i >= 0 && src_partition_i != partition_i)
4316 fn (pred, pred->src);
4318 for (graph_edge *succ = m_slpg->vertices[node_i].succ;
4319 succ; succ = succ->succ_next)
4321 int dest_partition_i = m_vertices[succ->dest].partition;
4322 if (dest_partition_i >= 0 && dest_partition_i != partition_i)
4323 fn (succ, succ->dest);
4327 /* Return true if layout LAYOUT_I is compatible with the number of SLP lanes
4328 that NODE would operate on. This test is independent of NODE's actual
4329 operation. */
4331 bool
4332 vect_optimize_slp_pass::is_compatible_layout (slp_tree node,
4333 unsigned int layout_i)
4335 if (layout_i == 0)
4336 return true;
4338 if (SLP_TREE_LANES (node) != m_perms[layout_i].length ())
4339 return false;
4341 return true;
4344 /* Return the cost (in arbitrary units) of going from layout FROM_LAYOUT_I
4345 to layout TO_LAYOUT_I for a node like NODE. Return -1 if either of the
4346 layouts is incompatible with NODE or if the change is not possible for
4347 some other reason.
4349 The properties taken from NODE include the number of lanes and the
4350 vector type. The actual operation doesn't matter. */
4353 vect_optimize_slp_pass::change_layout_cost (slp_tree node,
4354 unsigned int from_layout_i,
4355 unsigned int to_layout_i)
4357 if (!is_compatible_layout (node, from_layout_i)
4358 || !is_compatible_layout (node, to_layout_i))
4359 return -1;
4361 if (from_layout_i == to_layout_i)
4362 return 0;
4364 auto_vec<slp_tree, 1> children (1);
4365 children.quick_push (node);
4366 auto_lane_permutation_t perm (SLP_TREE_LANES (node));
4367 if (from_layout_i > 0)
4368 for (unsigned int i : m_perms[from_layout_i])
4369 perm.quick_push ({ 0, i });
4370 else
4371 for (unsigned int i = 0; i < SLP_TREE_LANES (node); ++i)
4372 perm.quick_push ({ 0, i });
4373 if (to_layout_i > 0)
4374 vect_slp_permute (m_perms[to_layout_i], perm, true);
4375 auto count = vectorizable_slp_permutation_1 (m_vinfo, nullptr, node, perm,
4376 children, false);
4377 if (count >= 0)
4378 return MAX (count, 1);
4380 /* ??? In principle we could try changing via layout 0, giving two
4381 layout changes rather than 1. Doing that would require
4382 corresponding support in get_result_with_layout. */
4383 return -1;
4386 /* Return the costs of assigning layout LAYOUT_I to partition PARTITION_I. */
4388 inline slpg_partition_layout_costs &
4389 vect_optimize_slp_pass::partition_layout_costs (unsigned int partition_i,
4390 unsigned int layout_i)
4392 return m_partition_layout_costs[partition_i * m_perms.length () + layout_i];
4395 /* Change PERM in one of two ways:
4397 - if IN_LAYOUT_I < 0, accept input operand I in the layout that has been
4398 chosen for child I of NODE.
4400 - if IN_LAYOUT_I >= 0, accept all input operands with that layout.
4402 In both cases, arrange for the output to have layout OUT_LAYOUT_I. */
4404 void
4405 vect_optimize_slp_pass::
4406 change_vec_perm_layout (slp_tree node, lane_permutation_t &perm,
4407 int in_layout_i, unsigned int out_layout_i)
4409 for (auto &entry : perm)
4411 int this_in_layout_i = in_layout_i;
4412 if (this_in_layout_i < 0)
4414 slp_tree in_node = SLP_TREE_CHILDREN (node)[entry.first];
4415 unsigned int in_partition_i = m_vertices[in_node->vertex].partition;
4416 this_in_layout_i = m_partitions[in_partition_i].layout;
4418 if (this_in_layout_i > 0)
4419 entry.second = m_perms[this_in_layout_i][entry.second];
4421 if (out_layout_i > 0)
4422 vect_slp_permute (m_perms[out_layout_i], perm, true);
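/* For example (hypothetical layout): if the child feeding an entry has been
   given layout { 1, 2, 3, 0 }, its original lane 0 now lives in lane 1, so
   an entry that previously selected lane 0 is rewritten to select lane 1
   (and similarly for the other lanes).  */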
4425 /* Check whether the target allows NODE to be rearranged so that the node's
4426 output has layout OUT_LAYOUT_I. Return the cost of the change if so,
4427 in the same arbitrary units as for change_layout_cost. Return -1 otherwise.
4429 If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I < 0, also check whether
4430 NODE can adapt to the layout changes that have (perhaps provisionally)
4431 been chosen for NODE's children, so that no extra permutations are
4432 needed on either the input or the output of NODE.
4434 If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I >= 0, instead assume
4435 that all inputs will be forced into layout IN_LAYOUT_I beforehand.
4437 IN_LAYOUT_I has no meaning for other types of node.
4439 Keeping the node as-is is always valid. If the target doesn't appear
4440 to support the node as-is, but might realistically support other layouts,
4441 then layout 0 instead has the cost of a worst-case permutation. On the
4442 one hand, this ensures that every node has at least one valid layout,
4443 avoiding what would otherwise be an awkward special case. On the other,
4444 it still encourages the pass to change an invalid pre-existing layout
4445 choice into a valid one. */
4448 vect_optimize_slp_pass::internal_node_cost (slp_tree node, int in_layout_i,
4449 unsigned int out_layout_i)
4451 const int fallback_cost = 1;
4453 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
4455 auto_lane_permutation_t tmp_perm;
4456 tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
4458 /* Check that the child nodes support the chosen layout. Checking
4459 the first child is enough, since any second child would have the
4460 same shape. */
4461 auto first_child = SLP_TREE_CHILDREN (node)[0];
4462 if (in_layout_i > 0
4463 && !is_compatible_layout (first_child, in_layout_i))
4464 return -1;
4466 change_vec_perm_layout (node, tmp_perm, in_layout_i, out_layout_i);
4467 int count = vectorizable_slp_permutation_1 (m_vinfo, nullptr,
4468 node, tmp_perm,
4469 SLP_TREE_CHILDREN (node),
4470 false);
4471 if (count < 0)
4473 if (in_layout_i == 0 && out_layout_i == 0)
4475 /* Use the fallback cost if the node could in principle support
4476 some nonzero layout for both the inputs and the outputs.
4477 Otherwise assume that the node will be rejected later
4478 and rebuilt from scalars. */
4479 if (SLP_TREE_LANES (node) == SLP_TREE_LANES (first_child))
4480 return fallback_cost;
4481 return 0;
4483 return -1;
4486 /* We currently have no way of telling whether the new layout is cheaper
4487 or more expensive than the old one. But at least in principle,
4488 it should be worth making zero permutations (whole-vector shuffles)
4489 cheaper than real permutations, in case the pass is able to remove
4490 the latter. */
4491 return count == 0 ? 0 : 1;
4494 stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
4495 if (rep
4496 && STMT_VINFO_DATA_REF (rep)
4497 && DR_IS_READ (STMT_VINFO_DATA_REF (rep))
4498 && SLP_TREE_LOAD_PERMUTATION (node).exists ())
4500 auto_load_permutation_t tmp_perm;
4501 tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
4502 if (out_layout_i > 0)
4503 vect_slp_permute (m_perms[out_layout_i], tmp_perm, true);
4505 poly_uint64 vf = 1;
4506 if (auto loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo))
4507 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
4508 unsigned int n_perms;
4509 if (!vect_transform_slp_perm_load_1 (m_vinfo, node, tmp_perm, vNULL,
4510 nullptr, vf, true, false, &n_perms))
4512 auto rep = SLP_TREE_REPRESENTATIVE (node);
4513 if (out_layout_i == 0)
4515 /* Use the fallback cost if the load is an N-to-N permutation.
4516 Otherwise assume that the node will be rejected later
4517 and rebuilt from scalars. */
4518 if (STMT_VINFO_GROUPED_ACCESS (rep)
4519 && (DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (rep))
4520 == SLP_TREE_LANES (node)))
4521 return fallback_cost;
4522 return 0;
4524 return -1;
4527 /* See the comment above the corresponding VEC_PERM_EXPR handling. */
4528 return n_perms == 0 ? 0 : 1;
4531 return 0;
4534 /* Decide which element layouts we should consider using. Calculate the
4535 weights associated with inserting layout changes on partition edges.
4536 Also mark partitions that cannot change layout, by setting their
4537 layout to zero. */
4539 void
4540 vect_optimize_slp_pass::start_choosing_layouts ()
4542 /* Used to assign unique permutation indices. */
4543 using perm_hash = unbounded_hashmap_traits<
4544 vec_free_hash_base<int_hash_base<unsigned>>,
4545 int_hash<int, -1, -2>
4547 hash_map<vec<unsigned>, int, perm_hash> layout_ids;
4549 /* Layout 0 is "no change". */
4550 m_perms.safe_push (vNULL);
4552 /* Create layouts from existing permutations. */
4553 auto_load_permutation_t tmp_perm;
4554 for (unsigned int node_i : m_partitioned_nodes)
4556 /* Leafs also double as entries to the reverse graph. Allow the
4557 layout of those to be changed. */
4558 auto &vertex = m_vertices[node_i];
4559 auto &partition = m_partitions[vertex.partition];
4560 if (!m_slpg->vertices[node_i].succ)
4561 partition.layout = 0;
4563 /* Loads and VEC_PERM_EXPRs are the only things generating permutes. */
4564 slp_tree node = vertex.node;
4565 stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node);
4566 slp_tree child;
4567 unsigned HOST_WIDE_INT imin, imax = 0;
4568 bool any_permute = false;
4569 tmp_perm.truncate (0);
4570 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
4572 /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the node
4573 unpermuted, record a layout that reverses this permutation.
4575 We would need more work to cope with loads that are internally
4576 permuted and also have inputs (such as masks for
4577 IFN_MASK_LOADs). */
4578 gcc_assert (partition.layout == 0 && !m_slpg->vertices[node_i].succ);
4579 if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
4580 continue;
4581 dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt);
4582 imin = DR_GROUP_SIZE (dr_stmt) + 1;
4583 tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
4585 else if (SLP_TREE_CODE (node) == VEC_PERM_EXPR
4586 && SLP_TREE_CHILDREN (node).length () == 1
4587 && (child = SLP_TREE_CHILDREN (node)[0])
4588 && (TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (child))
4589 .is_constant (&imin)))
4591 /* If the child has the same vector size as this node,
4592 reversing the permutation can make the permutation a no-op.
4593 In other cases it can change a true permutation into a
4594 full-vector extract. */
4595 tmp_perm.reserve (SLP_TREE_LANES (node));
4596 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4597 tmp_perm.quick_push (SLP_TREE_LANE_PERMUTATION (node)[j].second);
4599 else
4600 continue;
4602 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4604 unsigned idx = tmp_perm[j];
4605 imin = MIN (imin, idx);
4606 imax = MAX (imax, idx);
4607 if (idx - tmp_perm[0] != j)
4608 any_permute = true;
4610 /* If the span doesn't match we'd disrupt VF computation, so avoid
4611 that for now. */
4612 if (imax - imin + 1 != SLP_TREE_LANES (node))
4613 continue;
4614 /* If there's no permute, there is no need to split one out. In this case
4615 we can consider turning a load into a permuted load, if that
4616 turns out to be cheaper than alternatives. */
4617 if (!any_permute)
4619 partition.layout = -1;
4620 continue;
4623 /* For now only handle true permutes, like
4624 vect_attempt_slp_rearrange_stmts did. This allows us to be lazy
4625 when permuting constants and invariants, keeping the permute
4626 bijective. */
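/* For instance, a load permutation like { 1, 1, 4, 3 } (a hypothetical
   example) spans a contiguous range of four group elements but is not
   bijective, since element 2 is never loaded, so no layout is created
   for it. */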
4627 auto_sbitmap load_index (SLP_TREE_LANES (node));
4628 bitmap_clear (load_index);
4629 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4630 bitmap_set_bit (load_index, tmp_perm[j] - imin);
4631 unsigned j;
4632 for (j = 0; j < SLP_TREE_LANES (node); ++j)
4633 if (!bitmap_bit_p (load_index, j))
4634 break;
4635 if (j != SLP_TREE_LANES (node))
4636 continue;
4638 vec<unsigned> perm = vNULL;
4639 perm.safe_grow (SLP_TREE_LANES (node), true);
4640 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4641 perm[j] = tmp_perm[j] - imin;
4643 if (int (m_perms.length ()) >= param_vect_max_layout_candidates)
4645 /* Continue to use existing layouts, but don't add any more. */
4646 int *entry = layout_ids.get (perm);
4647 partition.layout = entry ? *entry : 0;
4648 perm.release ();
4650 else
4652 bool existed;
4653 int &layout_i = layout_ids.get_or_insert (perm, &existed);
4654 if (existed)
4655 perm.release ();
4656 else
4658 layout_i = m_perms.length ();
4659 m_perms.safe_push (perm);
4661 partition.layout = layout_i;
4665 /* Initially assume that every layout is possible and has zero cost
4666 in every partition. */
4667 m_partition_layout_costs.safe_grow_cleared (m_partitions.length ()
4668 * m_perms.length ());
4670 /* Outgoing permutations facing non-reduction graph entries that are
4671 not represented in the graph have to be marked for materialization. */
4672 for (slp_instance instance : m_vinfo->slp_instances)
4673 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor)
4675 unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex;
4676 m_partitions[m_vertices[node_i].partition].layout = 0;
4679 /* Check which layouts each node and partition can handle. Calculate the
4680 weights associated with inserting layout changes on edges. */
4681 for (unsigned int node_i : m_partitioned_nodes)
4683 auto &vertex = m_vertices[node_i];
4684 auto &partition = m_partitions[vertex.partition];
4685 slp_tree node = vertex.node;
4687 if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
4689 vertex.weight = vect_slp_node_weight (node);
4691 /* We do not handle stores with a permutation, so all
4692 incoming permutations must have been materialized.
4694 We also don't handle masked grouped loads, which lack a
4695 permutation vector. In this case the memory locations
4696 form an implicit second input to the loads, on top of the
4697 explicit mask input, and the memory input's layout cannot
4698 be changed.
4700 On the other hand, we do support permuting gather loads and
4701 masked gather loads, where each scalar load is independent
4702 of the others. This can be useful if the address/index input
4703 benefits from permutation. */
4704 if (STMT_VINFO_DATA_REF (rep)
4705 && STMT_VINFO_GROUPED_ACCESS (rep)
4706 && !SLP_TREE_LOAD_PERMUTATION (node).exists ())
4707 partition.layout = 0;
4709 /* We cannot change the layout of an operation that is
4710 not independent of lanes. Note this is an explicit
4711 negative list since that's much shorter than the respective
4712 positive one, but it's critical to keep maintaining it. */
4713 if (is_gimple_call (STMT_VINFO_STMT (rep)))
4714 switch (gimple_call_combined_fn (STMT_VINFO_STMT (rep)))
4716 case CFN_COMPLEX_ADD_ROT90:
4717 case CFN_COMPLEX_ADD_ROT270:
4718 case CFN_COMPLEX_MUL:
4719 case CFN_COMPLEX_MUL_CONJ:
4720 case CFN_VEC_ADDSUB:
4721 case CFN_VEC_FMADDSUB:
4722 case CFN_VEC_FMSUBADD:
4723 partition.layout = 0;
4724 default:;
4728 auto process_edge = [&](graph_edge *ud, unsigned int other_node_i)
4730 auto &other_vertex = m_vertices[other_node_i];
4732 /* Count the number of edges from earlier partitions and the number
4733 of edges to later partitions. */
4734 if (other_vertex.partition < vertex.partition)
4735 partition.in_degree += 1;
4736 else
4737 partition.out_degree += 1;
4739 /* If the current node uses the result of OTHER_NODE_I, accumulate
4740 the effects of that. */
4741 if (ud->src == int (node_i))
4743 other_vertex.out_weight += vertex.weight;
4744 other_vertex.out_degree += 1;
4747 for_each_partition_edge (node_i, process_edge);
4751 /* Return the incoming costs for node NODE_I, assuming that each input keeps
4752 its current (provisional) choice of layout. The inputs do not necessarily
4753 have the same layout as each other. */
4755 slpg_layout_cost
4756 vect_optimize_slp_pass::total_in_cost (unsigned int node_i)
4758 auto &vertex = m_vertices[node_i];
4759 slpg_layout_cost cost;
4760 auto add_cost = [&](graph_edge *, unsigned int other_node_i)
4762 auto &other_vertex = m_vertices[other_node_i];
4763 if (other_vertex.partition < vertex.partition)
4765 auto &other_partition = m_partitions[other_vertex.partition];
4766 auto &other_costs = partition_layout_costs (other_vertex.partition,
4767 other_partition.layout);
4768 slpg_layout_cost this_cost = other_costs.in_cost;
4769 this_cost.add_serial_cost (other_costs.internal_cost);
4770 this_cost.split (other_partition.out_degree);
4771 cost.add_parallel_cost (this_cost);
4774 for_each_partition_edge (node_i, add_cost);
4775 return cost;
4778 /* Return the cost of switching between layout LAYOUT1_I (at node NODE1_I)
4779 and layout LAYOUT2_I on cross-partition use-to-def edge UD. Return
4780 slpg_layout_cost::impossible () if the change isn't possible. */
4782 slpg_layout_cost
4783 vect_optimize_slp_pass::
4784 edge_layout_cost (graph_edge *ud, unsigned int node1_i, unsigned int layout1_i,
4785 unsigned int layout2_i)
4787 auto &def_vertex = m_vertices[ud->dest];
4788 auto &use_vertex = m_vertices[ud->src];
4789 auto def_layout_i = ud->dest == int (node1_i) ? layout1_i : layout2_i;
4790 auto use_layout_i = ud->dest == int (node1_i) ? layout2_i : layout1_i;
4791 auto factor = change_layout_cost (def_vertex.node, def_layout_i,
4792 use_layout_i);
4793 if (factor < 0)
4794 return slpg_layout_cost::impossible ();
4796 /* We have a choice of putting the layout change at the site of the
4797 definition or at the site of the use. Prefer the former when
4798 optimizing for size or when the execution frequency of the
4799 definition is no greater than the combined execution frequencies of
4800 the uses. When putting the layout change at the site of the definition,
4801 divvy up the cost among all consumers. */
4802 if (m_optimize_size || def_vertex.weight <= def_vertex.out_weight)
4804 slpg_layout_cost cost = { def_vertex.weight * factor, m_optimize_size };
4805 cost.split (def_vertex.out_degree);
4806 return cost;
4808 return { use_vertex.weight * factor, m_optimize_size };
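/* A hypothetical example of the trade-off above: suppose the definition
   has weight 2, three consumers of weight 4 each (so out_weight 12 and
   out_degree 3), and the layout change has factor 1.  When optimizing for
   speed the change is placed at the definition and each edge is charged
   { depth 2, total 2/3 }.  If the definition instead had weight 20, the
   change would be placed at the use and this edge would be charged
   { depth 4, total 4 }.  */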
4811 /* UD represents a use-def link between FROM_NODE_I and a node in a later
4812 partition; FROM_NODE_I could be the definition node or the use node.
4813 The node at the other end of the link wants to use layout TO_LAYOUT_I.
4814 Return the cost of any necessary fix-ups on edge UD, or return
4815 slpg_layout_cost::impossible () if the change isn't possible.
4817 At this point, FROM_NODE_I's partition has chosen the cheapest
4818 layout based on the information available so far, but this choice
4819 is only provisional. */
4821 slpg_layout_cost
4822 vect_optimize_slp_pass::forward_cost (graph_edge *ud, unsigned int from_node_i,
4823 unsigned int to_layout_i)
4825 auto &from_vertex = m_vertices[from_node_i];
4826 unsigned int from_partition_i = from_vertex.partition;
4827 slpg_partition_info &from_partition = m_partitions[from_partition_i];
4828 gcc_assert (from_partition.layout >= 0);
4830 /* First calculate the cost on the assumption that FROM_PARTITION sticks
4831 with its current layout preference. */
4832 slpg_layout_cost cost = slpg_layout_cost::impossible ();
4833 auto edge_cost = edge_layout_cost (ud, from_node_i,
4834 from_partition.layout, to_layout_i);
4835 if (edge_cost.is_possible ())
4837 auto &from_costs = partition_layout_costs (from_partition_i,
4838 from_partition.layout);
4839 cost = from_costs.in_cost;
4840 cost.add_serial_cost (from_costs.internal_cost);
4841 cost.split (from_partition.out_degree);
4842 cost.add_serial_cost (edge_cost);
4845 /* Take the minimum of that cost and the cost that applies if
4846 FROM_PARTITION instead switches to TO_LAYOUT_I. */
4847 auto &direct_layout_costs = partition_layout_costs (from_partition_i,
4848 to_layout_i);
4849 if (direct_layout_costs.is_possible ())
4851 slpg_layout_cost direct_cost = direct_layout_costs.in_cost;
4852 direct_cost.add_serial_cost (direct_layout_costs.internal_cost);
4853 direct_cost.split (from_partition.out_degree);
4854 if (!cost.is_possible ()
4855 || direct_cost.is_better_than (cost, m_optimize_size))
4856 cost = direct_cost;
4859 return cost;
4862 /* UD represents a use-def link between TO_NODE_I and a node in an earlier
4863 partition; TO_NODE_I could be the definition node or the use node.
4864 The node at the other end of the link wants to use layout FROM_LAYOUT_I;
4865 return the cost of any necessary fix-ups on edge UD, or
4866 slpg_layout_cost::impossible () if the choice cannot be made.
4868 At this point, TO_NODE_I's partition has a fixed choice of layout. */
4870 slpg_layout_cost
4871 vect_optimize_slp_pass::backward_cost (graph_edge *ud, unsigned int to_node_i,
4872 unsigned int from_layout_i)
4874 auto &to_vertex = m_vertices[to_node_i];
4875 unsigned int to_partition_i = to_vertex.partition;
4876 slpg_partition_info &to_partition = m_partitions[to_partition_i];
4877 gcc_assert (to_partition.layout >= 0);
4879 /* If TO_NODE_I is a VEC_PERM_EXPR consumer, see whether it can be
4880 adjusted for this input having layout FROM_LAYOUT_I. Assume that
4881 any other inputs keep their current choice of layout. */
4882 auto &to_costs = partition_layout_costs (to_partition_i,
4883 to_partition.layout);
4884 if (ud->src == int (to_node_i)
4885 && SLP_TREE_CODE (to_vertex.node) == VEC_PERM_EXPR)
4887 auto &from_partition = m_partitions[m_vertices[ud->dest].partition];
4888 auto old_layout = from_partition.layout;
4889 from_partition.layout = from_layout_i;
4890 int factor = internal_node_cost (to_vertex.node, -1,
4891 to_partition.layout);
4892 from_partition.layout = old_layout;
4893 if (factor >= 0)
4895 slpg_layout_cost cost = to_costs.out_cost;
4896 cost.add_serial_cost ({ to_vertex.weight * factor,
4897 m_optimize_size });
4898 cost.split (to_partition.in_degree);
4899 return cost;
4903 /* Compute the cost if we insert any necessary layout change on edge UD. */
4904 auto edge_cost = edge_layout_cost (ud, to_node_i,
4905 to_partition.layout, from_layout_i);
4906 if (edge_cost.is_possible ())
4908 slpg_layout_cost cost = to_costs.out_cost;
4909 cost.add_serial_cost (to_costs.internal_cost);
4910 cost.split (to_partition.in_degree);
4911 cost.add_serial_cost (edge_cost);
4912 return cost;
4915 return slpg_layout_cost::impossible ();
4918 /* Make a forward pass through the partitions, accumulating input costs.
4919 Make a tentative (provisional) choice of layout for each partition,
4920 ensuring that this choice still allows later partitions to keep
4921 their original layout. */
4923 void
4924 vect_optimize_slp_pass::forward_pass ()
4926 for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
4927 ++partition_i)
4929 auto &partition = m_partitions[partition_i];
4931 /* If the partition consists of a single VEC_PERM_EXPR, precompute
4932 the incoming cost that would apply if every predecessor partition
4933 keeps its current layout. This is used within the loop below. */
4934 slpg_layout_cost in_cost;
4935 slp_tree single_node = nullptr;
4936 if (partition.node_end == partition.node_begin + 1)
4938 unsigned int node_i = m_partitioned_nodes[partition.node_begin];
4939 single_node = m_vertices[node_i].node;
4940 if (SLP_TREE_CODE (single_node) == VEC_PERM_EXPR)
4941 in_cost = total_in_cost (node_i);
4944 /* Go through the possible layouts. Decide which ones are valid
4945 for this partition and record which of the valid layouts has
4946 the lowest cost. */
4947 unsigned int min_layout_i = 0;
4948 slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
4949 for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
4951 auto &layout_costs = partition_layout_costs (partition_i, layout_i);
4952 if (!layout_costs.is_possible ())
4953 continue;
4955 /* If the recorded layout is already 0 then the layout cannot
4956 change. */
4957 if (partition.layout == 0 && layout_i != 0)
4959 layout_costs.mark_impossible ();
4960 continue;
4963 bool is_possible = true;
4964 for (unsigned int order_i = partition.node_begin;
4965 order_i < partition.node_end; ++order_i)
4967 unsigned int node_i = m_partitioned_nodes[order_i];
4968 auto &vertex = m_vertices[node_i];
4970 /* Reject the layout if it is individually incompatible
4971 with any node in the partition. */
4972 if (!is_compatible_layout (vertex.node, layout_i))
4974 is_possible = false;
4975 break;
4978 auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
4980 auto &other_vertex = m_vertices[other_node_i];
4981 if (other_vertex.partition < vertex.partition)
4983 /* Accumulate the incoming costs from earlier
4984 partitions, plus the cost of any layout changes
4985 on UD itself. */
4986 auto cost = forward_cost (ud, other_node_i, layout_i);
4987 if (!cost.is_possible ())
4988 is_possible = false;
4989 else
4990 layout_costs.in_cost.add_parallel_cost (cost);
4992 else
4993 /* Reject the layout if it would make layout 0 impossible
4994 for later partitions. This amounts to testing that the
4995 target supports reversing the layout change on edges
4996 to later partitions.
4998 In principle, it might be possible to push a layout
4999 change all the way down a graph, so that it never
5000 needs to be reversed and so that the target doesn't
5001 need to support the reverse operation. But it would
5002 be awkward to bail out if we hit a partition that
5003 does not support the new layout, especially since
5004 we are not dealing with a lattice. */
5005 is_possible &= edge_layout_cost (ud, other_node_i, 0,
5006 layout_i).is_possible ();
5008 for_each_partition_edge (node_i, add_cost);
5010 /* Accumulate the cost of using LAYOUT_I within NODE,
5011 both for the inputs and the outputs. */
5012 int factor = internal_node_cost (vertex.node, layout_i,
5013 layout_i);
5014 if (factor < 0)
5016 is_possible = false;
5017 break;
5019 else if (factor)
5020 layout_costs.internal_cost.add_serial_cost
5021 ({ vertex.weight * factor, m_optimize_size });
5023 if (!is_possible)
5025 layout_costs.mark_impossible ();
5026 continue;
5029 /* Combine the incoming and partition-internal costs. */
5030 slpg_layout_cost combined_cost = layout_costs.in_cost;
5031 combined_cost.add_serial_cost (layout_costs.internal_cost);
5033 /* If this partition consists of a single VEC_PERM_EXPR, see
5034 if the VEC_PERM_EXPR can be changed to support output layout
5035 LAYOUT_I while keeping all the provisional choices of input
5036 layout. */
5037 if (single_node
5038 && SLP_TREE_CODE (single_node) == VEC_PERM_EXPR)
5040 int factor = internal_node_cost (single_node, -1, layout_i);
5041 if (factor >= 0)
5043 auto weight = m_vertices[single_node->vertex].weight;
5044 slpg_layout_cost internal_cost
5045 = { weight * factor, m_optimize_size };
5047 slpg_layout_cost alt_cost = in_cost;
5048 alt_cost.add_serial_cost (internal_cost);
5049 if (alt_cost.is_better_than (combined_cost, m_optimize_size))
5051 combined_cost = alt_cost;
5052 layout_costs.in_cost = in_cost;
5053 layout_costs.internal_cost = internal_cost;
5058 /* Record the layout with the lowest cost. Prefer layout 0 in
5059 the event of a tie between it and another layout. */
5060 if (!min_layout_cost.is_possible ()
5061 || combined_cost.is_better_than (min_layout_cost,
5062 m_optimize_size))
5064 min_layout_i = layout_i;
5065 min_layout_cost = combined_cost;
5069 /* This loop's handling of earlier partitions should ensure that
5070 choosing the original layout for the current partition is no
5071 less valid than it was in the original graph, even with the
5072 provisional layout choices for those earlier partitions. */
5073 gcc_assert (min_layout_cost.is_possible ());
5074 partition.layout = min_layout_i;
5078 /* Make a backward pass through the partitions, accumulating output costs.
5079 Make a final choice of layout for each partition. */
5081 void
5082 vect_optimize_slp_pass::backward_pass ()
5084 for (unsigned int partition_i = m_partitions.length (); partition_i-- > 0;)
5086 auto &partition = m_partitions[partition_i];
5088 unsigned int min_layout_i = 0;
5089 slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
5090 for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
5092 auto &layout_costs = partition_layout_costs (partition_i, layout_i);
5093 if (!layout_costs.is_possible ())
5094 continue;
5096 /* Accumulate the costs from successor partitions. */
5097 bool is_possible = true;
5098 for (unsigned int order_i = partition.node_begin;
5099 order_i < partition.node_end; ++order_i)
5101 unsigned int node_i = m_partitioned_nodes[order_i];
5102 auto &vertex = m_vertices[node_i];
5103 auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
5105 auto &other_vertex = m_vertices[other_node_i];
5106 auto &other_partition = m_partitions[other_vertex.partition];
5107 if (other_vertex.partition > vertex.partition)
5109 /* Accumulate the incoming costs from later
5110 partitions, plus the cost of any layout changes
5111 on UD itself. */
5112 auto cost = backward_cost (ud, other_node_i, layout_i);
5113 if (!cost.is_possible ())
5114 is_possible = false;
5115 else
5116 layout_costs.out_cost.add_parallel_cost (cost);
5118 else
5119 /* Make sure that earlier partitions can (if necessary
5120 or beneficial) keep the layout that they chose in
5121 the forward pass. This ensures that there is at
5122 least one valid choice of layout. */
5123 is_possible &= edge_layout_cost (ud, other_node_i,
5124 other_partition.layout,
5125 layout_i).is_possible ();
5127 for_each_partition_edge (node_i, add_cost);
5129 if (!is_possible)
5131 layout_costs.mark_impossible ();
5132 continue;
5135 /* Locally combine the costs from the forward and backward passes.
5136 (This combined cost is not passed on, since that would lead
5137 to double counting.) */
5138 slpg_layout_cost combined_cost = layout_costs.in_cost;
5139 combined_cost.add_serial_cost (layout_costs.internal_cost);
5140 combined_cost.add_serial_cost (layout_costs.out_cost);
5142 /* Record the layout with the lowest cost. Prefer layout 0 in
5143 the event of a tie between it and another layout. */
5144 if (!min_layout_cost.is_possible ()
5145 || combined_cost.is_better_than (min_layout_cost,
5146 m_optimize_size))
5148 min_layout_i = layout_i;
5149 min_layout_cost = combined_cost;
5153 gcc_assert (min_layout_cost.is_possible ());
5154 partition.layout = min_layout_i;
5158 /* Return a node that applies layout TO_LAYOUT_I to the original form of NODE.
5159 NODE already has the layout that was selected for its partition. */
5161 slp_tree
5162 vect_optimize_slp_pass::get_result_with_layout (slp_tree node,
5163 unsigned int to_layout_i)
5165 unsigned int result_i = node->vertex * m_perms.length () + to_layout_i;
5166 slp_tree result = m_node_layouts[result_i];
5167 if (result)
5168 return result;
5170 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
5171 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
5173 /* If the vector is uniform or unchanged, there's nothing to do. */
5174 if (to_layout_i == 0 || vect_slp_tree_uniform_p (node))
5175 result = node;
5176 else
5178 auto scalar_ops = SLP_TREE_SCALAR_OPS (node).copy ();
5179 result = vect_create_new_slp_node (scalar_ops);
5180 vect_slp_permute (m_perms[to_layout_i], scalar_ops, true);
5183 else
5185 unsigned int partition_i = m_vertices[node->vertex].partition;
5186 unsigned int from_layout_i = m_partitions[partition_i].layout;
5187 if (from_layout_i == to_layout_i)
5188 return node;
5190 /* If NODE is itself a VEC_PERM_EXPR, try to create a parallel
5191 permutation instead of a serial one. Leave the new permutation
5192 in TMP_PERM on success. */
5193 auto_lane_permutation_t tmp_perm;
5194 unsigned int num_inputs = 1;
5195 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5197 tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
5198 if (from_layout_i != 0)
5199 vect_slp_permute (m_perms[from_layout_i], tmp_perm, false);
5200 if (to_layout_i != 0)
5201 vect_slp_permute (m_perms[to_layout_i], tmp_perm, true);
5202 if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
5203 tmp_perm,
5204 SLP_TREE_CHILDREN (node),
5205 false) >= 0)
5206 num_inputs = SLP_TREE_CHILDREN (node).length ();
5207 else
5208 tmp_perm.truncate (0);
5211 if (dump_enabled_p ())
5213 if (tmp_perm.length () > 0)
5214 dump_printf_loc (MSG_NOTE, vect_location,
5215 "duplicating permutation node %p with"
5216 " layout %d\n",
5217 (void *) node, to_layout_i);
5218 else
5219 dump_printf_loc (MSG_NOTE, vect_location,
5220 "inserting permutation node in place of %p\n",
5221 (void *) node);
5224 unsigned int num_lanes = SLP_TREE_LANES (node);
5225 result = vect_create_new_slp_node (num_inputs, VEC_PERM_EXPR);
5226 if (SLP_TREE_SCALAR_STMTS (node).length ())
5228 auto &stmts = SLP_TREE_SCALAR_STMTS (result);
5229 stmts.safe_splice (SLP_TREE_SCALAR_STMTS (node));
5230 if (from_layout_i != 0)
5231 vect_slp_permute (m_perms[from_layout_i], stmts, false);
5232 if (to_layout_i != 0)
5233 vect_slp_permute (m_perms[to_layout_i], stmts, true);
5235 SLP_TREE_REPRESENTATIVE (result) = SLP_TREE_REPRESENTATIVE (node);
5236 SLP_TREE_LANES (result) = num_lanes;
5237 SLP_TREE_VECTYPE (result) = SLP_TREE_VECTYPE (node);
5238 result->vertex = -1;
5240 auto &lane_perm = SLP_TREE_LANE_PERMUTATION (result);
5241 if (tmp_perm.length ())
5243 lane_perm.safe_splice (tmp_perm);
5244 SLP_TREE_CHILDREN (result).safe_splice (SLP_TREE_CHILDREN (node));
5246 else
5248 lane_perm.create (num_lanes);
5249 for (unsigned j = 0; j < num_lanes; ++j)
5250 lane_perm.quick_push ({ 0, j });
5251 if (from_layout_i != 0)
5252 vect_slp_permute (m_perms[from_layout_i], lane_perm, false);
5253 if (to_layout_i != 0)
5254 vect_slp_permute (m_perms[to_layout_i], lane_perm, true);
5255 SLP_TREE_CHILDREN (result).safe_push (node);
5257 for (slp_tree child : SLP_TREE_CHILDREN (result))
5258 child->refcnt++;
5260 m_node_layouts[result_i] = result;
5261 return result;
5264 /* Apply the chosen vector layouts to the SLP graph. */
5266 void
5267 vect_optimize_slp_pass::materialize ()
5269 /* We no longer need the costs, so avoid having two O(N * P) arrays
5270 live at the same time. */
5271 m_partition_layout_costs.release ();
5272 m_node_layouts.safe_grow_cleared (m_vertices.length () * m_perms.length ());
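/* Set of nodes whose lane permutation could fully absorb the chosen
   input layouts; their children need no rewriting in the second walk
   below.  */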
5274 auto_sbitmap fully_folded (m_vertices.length ());
5275 bitmap_clear (fully_folded);
5276 for (unsigned int node_i : m_partitioned_nodes)
5278 auto &vertex = m_vertices[node_i];
5279 slp_tree node = vertex.node;
5280 int layout_i = m_partitions[vertex.partition].layout;
5281 gcc_assert (layout_i >= 0);
5283 /* Rearrange the scalar statements to match the chosen layout. */
5284 if (layout_i > 0)
5285 vect_slp_permute (m_perms[layout_i],
5286 SLP_TREE_SCALAR_STMTS (node), true);
5288 /* Update load and lane permutations. */
5289 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5291 /* First try to absorb the input vector layouts. If that fails,
5292 force the inputs to have layout LAYOUT_I too. We checked that
5293 that was possible before deciding to use nonzero output layouts.
5294 (Note that at this stage we don't really have any guarantee that
5295 the target supports the original VEC_PERM_EXPR.) */
5296 auto &perm = SLP_TREE_LANE_PERMUTATION (node);
5297 auto_lane_permutation_t tmp_perm;
5298 tmp_perm.safe_splice (perm);
5299 change_vec_perm_layout (node, tmp_perm, -1, layout_i);
5300 if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
5301 tmp_perm,
5302 SLP_TREE_CHILDREN (node),
5303 false) >= 0)
5305 if (dump_enabled_p ()
5306 && !std::equal (tmp_perm.begin (), tmp_perm.end (),
5307 perm.begin ()))
5308 dump_printf_loc (MSG_NOTE, vect_location,
5309 "absorbing input layouts into %p\n",
5310 (void *) node);
5311 std::copy (tmp_perm.begin (), tmp_perm.end (), perm.begin ());
5312 bitmap_set_bit (fully_folded, node_i);
5314 else
5316 /* Not MSG_MISSED because it would make no sense to users. */
5317 if (dump_enabled_p ())
5318 dump_printf_loc (MSG_NOTE, vect_location,
5319 "failed to absorb input layouts into %p\n",
5320 (void *) node);
5321 change_vec_perm_layout (nullptr, perm, layout_i, layout_i);
5324 else
5326 gcc_assert (!SLP_TREE_LANE_PERMUTATION (node).exists ());
5327 auto &load_perm = SLP_TREE_LOAD_PERMUTATION (node);
5328 if (layout_i > 0)
5329 /* ??? When we handle non-bijective permutes the idea
5330 is that we can force the load-permutation to be
5331 { min, min + 1, min + 2, ... max }. But then the
5332 scalar defs might no longer match the lane content
5333 which means wrong-code with live lane vectorization.
5334 So we possibly have to have NULL entries for those. */
5335 vect_slp_permute (m_perms[layout_i], load_perm, true);
5339 /* Do this before any nodes disappear, since it involves a walk
5340 over the leaves. */
5341 remove_redundant_permutations ();
5343 /* Replace each child with a correctly laid-out version. */
5344 for (unsigned int node_i : m_partitioned_nodes)
5346 /* Skip nodes that have already been handled above. */
5347 if (bitmap_bit_p (fully_folded, node_i))
5348 continue;
5350 auto &vertex = m_vertices[node_i];
5351 int in_layout_i = m_partitions[vertex.partition].layout;
5352 gcc_assert (in_layout_i >= 0);
5354 unsigned j;
5355 slp_tree child;
5356 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (vertex.node), j, child)
5358 if (!child)
5359 continue;
5361 slp_tree new_child = get_result_with_layout (child, in_layout_i);
5362 if (new_child != child)
5364 vect_free_slp_tree (child);
5365 SLP_TREE_CHILDREN (vertex.node)[j] = new_child;
5366 new_child->refcnt += 1;
5372 /* Elide load permutations that are not necessary. Such permutations might
5373 be pre-existing, rather than created by the layout optimizations. */
5375 void
5376 vect_optimize_slp_pass::remove_redundant_permutations ()
5378 for (unsigned int node_i : m_leafs)
5380 slp_tree node = m_vertices[node_i].node;
5381 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
5382 continue;
5384 /* In basic block vectorization we allow any subchain of an interleaving
5385 chain.
5386 FORNOW: not in loop SLP because of realignment complications. */
5387 if (is_a <bb_vec_info> (m_vinfo))
5389 bool subchain_p = true;
5390 stmt_vec_info next_load_info = NULL;
5391 stmt_vec_info load_info;
5392 unsigned j;
5393 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
5395 if (j != 0
5396 && (next_load_info != load_info
5397 || DR_GROUP_GAP (load_info) != 1))
5399 subchain_p = false;
5400 break;
5402 next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
5404 if (subchain_p)
5406 SLP_TREE_LOAD_PERMUTATION (node).release ();
5407 continue;
5410 else
5412 loop_vec_info loop_vinfo = as_a<loop_vec_info> (m_vinfo);
5413 stmt_vec_info load_info;
5414 bool this_load_permuted = false;
5415 unsigned j;
5416 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
5417 if (SLP_TREE_LOAD_PERMUTATION (node)[j] != j)
5419 this_load_permuted = true;
5420 break;
5422 stmt_vec_info first_stmt_info
5423 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
5424 if (!this_load_permuted
5425 /* The load requires permutation when unrolling exposes
5426 a gap either because the group is larger than the SLP
5427 group-size or because there is a gap between the groups. */
5428 && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
5429 || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
5430 && DR_GROUP_GAP (first_stmt_info) == 0)))
5432 SLP_TREE_LOAD_PERMUTATION (node).release ();
5433 continue;
5439 /* Print the partition graph and layout information to the dump file. */
5441 void
5442 vect_optimize_slp_pass::dump ()
5444 dump_printf_loc (MSG_NOTE, vect_location,
5445 "SLP optimize permutations:\n");
5446 for (unsigned int layout_i = 1; layout_i < m_perms.length (); ++layout_i)
5448 dump_printf_loc (MSG_NOTE, vect_location, " %d: { ", layout_i);
5449 const char *sep = "";
5450 for (unsigned int idx : m_perms[layout_i])
5452 dump_printf (MSG_NOTE, "%s%d", sep, idx);
5453 sep = ", ";
5455 dump_printf (MSG_NOTE, " }\n");
5457 dump_printf_loc (MSG_NOTE, vect_location,
5458 "SLP optimize partitions:\n");
5459 for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
5460 ++partition_i)
5462 auto &partition = m_partitions[partition_i];
5463 dump_printf_loc (MSG_NOTE, vect_location, " -------------\n");
5464 dump_printf_loc (MSG_NOTE, vect_location,
5465 " partition %d (layout %d):\n",
5466 partition_i, partition.layout);
5467 dump_printf_loc (MSG_NOTE, vect_location, " nodes:\n");
5468 for (unsigned int order_i = partition.node_begin;
5469 order_i < partition.node_end; ++order_i)
5471 auto &vertex = m_vertices[m_partitioned_nodes[order_i]];
5472 dump_printf_loc (MSG_NOTE, vect_location, " - %p:\n",
5473 (void *) vertex.node);
5474 dump_printf_loc (MSG_NOTE, vect_location,
5475 " weight: %f\n",
5476 vertex.weight.to_double ());
5477 if (vertex.out_degree)
5478 dump_printf_loc (MSG_NOTE, vect_location,
5479 " out weight: %f (degree %d)\n",
5480 vertex.out_weight.to_double (),
5481 vertex.out_degree);
5482 if (SLP_TREE_CODE (vertex.node) == VEC_PERM_EXPR)
5483 dump_printf_loc (MSG_NOTE, vect_location,
5484 " op: VEC_PERM_EXPR\n");
5485 else if (auto rep = SLP_TREE_REPRESENTATIVE (vertex.node))
5486 dump_printf_loc (MSG_NOTE, vect_location,
5487 " op template: %G", rep->stmt);
5489 dump_printf_loc (MSG_NOTE, vect_location, " edges:\n");
5490 for (unsigned int order_i = partition.node_begin;
5491 order_i < partition.node_end; ++order_i)
5493 unsigned int node_i = m_partitioned_nodes[order_i];
5494 auto &vertex = m_vertices[node_i];
5495 auto print_edge = [&](graph_edge *, unsigned int other_node_i)
5497 auto &other_vertex = m_vertices[other_node_i];
5498 if (other_vertex.partition < vertex.partition)
5499 dump_printf_loc (MSG_NOTE, vect_location,
5500 " - %p [%d] --> %p\n",
5501 (void *) other_vertex.node,
5502 other_vertex.partition,
5503 (void *) vertex.node);
5504 else
5505 dump_printf_loc (MSG_NOTE, vect_location,
5506 " - %p --> [%d] %p\n",
5507 (void *) vertex.node,
5508 other_vertex.partition,
5509 (void *) other_vertex.node);
5511 for_each_partition_edge (node_i, print_edge);
5514 for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
5516 auto &layout_costs = partition_layout_costs (partition_i, layout_i);
5517 if (layout_costs.is_possible ())
5519 dump_printf_loc (MSG_NOTE, vect_location,
5520 " layout %d:%s\n", layout_i,
5521 partition.layout == int (layout_i)
5522 ? " (*)" : "");
5523 slpg_layout_cost combined_cost = layout_costs.in_cost;
5524 combined_cost.add_serial_cost (layout_costs.internal_cost);
5525 combined_cost.add_serial_cost (layout_costs.out_cost);
5526 #define TEMPLATE "{depth: %f, total: %f}"
5527 dump_printf_loc (MSG_NOTE, vect_location,
5528 " " TEMPLATE "\n",
5529 layout_costs.in_cost.depth.to_double (),
5530 layout_costs.in_cost.total.to_double ());
5531 dump_printf_loc (MSG_NOTE, vect_location,
5532 " + " TEMPLATE "\n",
5533 layout_costs.internal_cost.depth.to_double (),
5534 layout_costs.internal_cost.total.to_double ());
5535 dump_printf_loc (MSG_NOTE, vect_location,
5536 " + " TEMPLATE "\n",
5537 layout_costs.out_cost.depth.to_double (),
5538 layout_costs.out_cost.total.to_double ());
5539 dump_printf_loc (MSG_NOTE, vect_location,
5540 " = " TEMPLATE "\n",
5541 combined_cost.depth.to_double (),
5542 combined_cost.total.to_double ());
5543 #undef TEMPLATE
5545 else
5546 dump_printf_loc (MSG_NOTE, vect_location,
5547 " layout %d: rejected\n", layout_i);
5552 /* Main entry point for the SLP graph optimization pass. */
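/* The pass builds the SLP dependence graph, partitions it and, when more
   than one candidate layout exists, chooses a layout per partition via the
   forward and backward passes before materializing the result; otherwise
   it merely removes redundant load permutations.  */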
5554 void
5555 vect_optimize_slp_pass::run ()
5557 build_graph ();
5558 create_partitions ();
5559 start_choosing_layouts ();
5560 if (m_perms.length () > 1)
5562 forward_pass ();
5563 backward_pass ();
5564 if (dump_enabled_p ())
5565 dump ();
5566 materialize ();
5567 while (!m_perms.is_empty ())
5568 m_perms.pop ().release ();
5570 else
5571 remove_redundant_permutations ();
5572 free_graph (m_slpg);
5575 /* Optimize the SLP graph of VINFO. */
5577 void
5578 vect_optimize_slp (vec_info *vinfo)
5580 if (vinfo->slp_instances.is_empty ())
5581 return;
5582 vect_optimize_slp_pass (vinfo).run ();
5585 /* Gather loads reachable from the individual SLP graph entries. */
5587 void
5588 vect_gather_slp_loads (vec_info *vinfo)
5590 unsigned i;
5591 slp_instance instance;
5592 FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
5594 hash_set<slp_tree> visited;
5595 vect_gather_slp_loads (SLP_INSTANCE_LOADS (instance),
5596 SLP_INSTANCE_TREE (instance), visited);
5601 /* For each possible SLP instance decide whether to SLP it and calculate overall
5602 unrolling factor needed to SLP the loop. Return TRUE if decided to SLP at
5603 least one instance. */
5605 bool
5606 vect_make_slp_decision (loop_vec_info loop_vinfo)
5608 unsigned int i;
5609 poly_uint64 unrolling_factor = 1;
5610 const vec<slp_instance> &slp_instances
5611 = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
5612 slp_instance instance;
5613 int decided_to_slp = 0;
5615 DUMP_VECT_SCOPE ("vect_make_slp_decision");
5617 FOR_EACH_VEC_ELT (slp_instances, i, instance)
5619 /* FORNOW: SLP if you can. */
5620 /* All unroll factors have the form:
5622 GET_MODE_SIZE (vinfo->vector_mode) * X
5624 for some rational X, so they must have a common multiple. */
5625 unrolling_factor
5626 = force_common_multiple (unrolling_factor,
5627 SLP_INSTANCE_UNROLLING_FACTOR (instance));
5629 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
5630 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
5631 loop-based vectorization. Such stmts will be marked as HYBRID. */
5632 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
5633 decided_to_slp++;
5636 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
5638 if (decided_to_slp && dump_enabled_p ())
5640 dump_printf_loc (MSG_NOTE, vect_location,
5641 "Decided to SLP %d instances. Unrolling factor ",
5642 decided_to_slp);
5643 dump_dec (MSG_NOTE, unrolling_factor);
5644 dump_printf (MSG_NOTE, "\n");
5647 return (decided_to_slp > 0);
5650 /* Private data for vect_detect_hybrid_slp. */
5651 struct vdhs_data
5653 loop_vec_info loop_vinfo;
5654 vec<stmt_vec_info> *worklist;
5657 /* Walker for walk_gimple_op. */
5659 static tree
5660 vect_detect_hybrid_slp (tree *tp, int *, void *data)
5662 walk_stmt_info *wi = (walk_stmt_info *)data;
5663 vdhs_data *dat = (vdhs_data *)wi->info;
5665 if (wi->is_lhs)
5666 return NULL_TREE;
5668 stmt_vec_info def_stmt_info = dat->loop_vinfo->lookup_def (*tp);
5669 if (!def_stmt_info)
5670 return NULL_TREE;
5671 def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
5672 if (PURE_SLP_STMT (def_stmt_info))
5674 if (dump_enabled_p ())
5675 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
5676 def_stmt_info->stmt);
5677 STMT_SLP_TYPE (def_stmt_info) = hybrid;
5678 dat->worklist->safe_push (def_stmt_info);
5681 return NULL_TREE;
5684 /* Check whether STMT_INFO is (indirectly) consumed only by SLP and mark it
5685 pure_slp if so; otherwise push it to WORKLIST. */
5687 static void
5688 maybe_push_to_hybrid_worklist (vec_info *vinfo,
5689 vec<stmt_vec_info> &worklist,
5690 stmt_vec_info stmt_info)
5692 if (dump_enabled_p ())
5693 dump_printf_loc (MSG_NOTE, vect_location,
5694 "Processing hybrid candidate : %G", stmt_info->stmt);
5695 stmt_vec_info orig_info = vect_orig_stmt (stmt_info);
5696 imm_use_iterator iter2;
5697 ssa_op_iter iter1;
5698 use_operand_p use_p;
5699 def_operand_p def_p;
5700 bool any_def = false;
5701 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_info->stmt, iter1, SSA_OP_DEF)
5703 any_def = true;
5704 FOR_EACH_IMM_USE_FAST (use_p, iter2, DEF_FROM_PTR (def_p))
5706 if (is_gimple_debug (USE_STMT (use_p)))
5707 continue;
5708 stmt_vec_info use_info = vinfo->lookup_stmt (USE_STMT (use_p));
5710 /* An out-of-loop use means this is a loop_vect sink. */
5710 if (!use_info)
5712 if (dump_enabled_p ())
5713 dump_printf_loc (MSG_NOTE, vect_location,
5714 "Found loop_vect sink: %G", stmt_info->stmt);
5715 worklist.safe_push (stmt_info);
5716 return;
5718 else if (!STMT_SLP_TYPE (vect_stmt_to_vectorize (use_info)))
5720 if (dump_enabled_p ())
5721 dump_printf_loc (MSG_NOTE, vect_location,
5722 "Found loop_vect use: %G", use_info->stmt);
5723 worklist.safe_push (stmt_info);
5724 return;
5728 /* No def means this is a loop_vect sink. */
5729 if (!any_def)
5731 if (dump_enabled_p ())
5732 dump_printf_loc (MSG_NOTE, vect_location,
5733 "Found loop_vect sink: %G", stmt_info->stmt);
5734 worklist.safe_push (stmt_info);
5735 return;
5737 if (dump_enabled_p ())
5738 dump_printf_loc (MSG_NOTE, vect_location,
5739 "Marked SLP consumed stmt pure: %G", stmt_info->stmt);
5740 STMT_SLP_TYPE (stmt_info) = pure_slp;
5743 /* Find stmts that must be both vectorized and SLPed. */
5745 void
5746 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
5748 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
5750 /* All stmts participating in SLP are marked pure_slp, all other
5751 stmts are loop_vect.
5752 First collect all loop_vect stmts into a worklist.
5753 With SLP patterns not all original scalar stmts appear in
5754 SLP_TREE_SCALAR_STMTS and thus not all of them are marked pure_slp.
5755 Rectify this here and do a backward walk over the IL only considering
5756 stmts as loop_vect when they are used by a loop_vect stmt and otherwise
5757 mark them as pure_slp. */
5758 auto_vec<stmt_vec_info> worklist;
5759 for (int i = LOOP_VINFO_LOOP (loop_vinfo)->num_nodes - 1; i >= 0; --i)
5761 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
5762 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
5763 gsi_next (&gsi))
5765 gphi *phi = gsi.phi ();
5766 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (phi);
5767 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
5768 maybe_push_to_hybrid_worklist (loop_vinfo,
5769 worklist, stmt_info);
5771 for (gimple_stmt_iterator gsi = gsi_last_bb (bb); !gsi_end_p (gsi);
5772 gsi_prev (&gsi))
5774 gimple *stmt = gsi_stmt (gsi);
5775 if (is_gimple_debug (stmt))
5776 continue;
5777 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
5778 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
5780 for (gimple_stmt_iterator gsi2
5781 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
5782 !gsi_end_p (gsi2); gsi_next (&gsi2))
5784 stmt_vec_info patt_info
5785 = loop_vinfo->lookup_stmt (gsi_stmt (gsi2));
5786 if (!STMT_SLP_TYPE (patt_info)
5787 && STMT_VINFO_RELEVANT (patt_info))
5788 maybe_push_to_hybrid_worklist (loop_vinfo,
5789 worklist, patt_info);
5791 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
5793 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
5794 maybe_push_to_hybrid_worklist (loop_vinfo,
5795 worklist, stmt_info);
5799 /* Now we have a worklist of non-SLP stmts, follow use->def chains and
5800 mark any SLP vectorized stmt as hybrid.
5801 ??? We're visiting def stmts N times (once for each non-SLP and
5802 once for each hybrid-SLP use). */
5803 walk_stmt_info wi;
5804 vdhs_data dat;
5805 dat.worklist = &worklist;
5806 dat.loop_vinfo = loop_vinfo;
5807 memset (&wi, 0, sizeof (wi));
5808 wi.info = (void *)&dat;
5809 while (!worklist.is_empty ())
5811 stmt_vec_info stmt_info = worklist.pop ();
5812 /* Since SSA operands are not set up for pattern stmts we need
5813 to use walk_gimple_op. */
5814 wi.is_lhs = 0;
5815 walk_gimple_op (stmt_info->stmt, vect_detect_hybrid_slp, &wi);
5816 /* For gather/scatter make sure to walk the offset operand, that
5817 can be a scaling and conversion away. */
5818 gather_scatter_info gs_info;
5819 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
5820 && vect_check_gather_scatter (stmt_info, loop_vinfo, &gs_info))
5822 int dummy;
5823 vect_detect_hybrid_slp (&gs_info.offset, &dummy, &wi);
5829 /* Initialize a bb_vec_info struct for the statements in BBS basic blocks. */
5831 _bb_vec_info::_bb_vec_info (vec<basic_block> _bbs, vec_info_shared *shared)
5832 : vec_info (vec_info::bb, shared),
5833 bbs (_bbs),
5834 roots (vNULL)
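/* Assign region statements a uid of zero and register the non-debug ones
   with the vec_info; PHIs in the first (entry) block of the region are
   skipped entirely.  */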
5836 for (unsigned i = 0; i < bbs.length (); ++i)
5838 if (i != 0)
5839 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
5840 gsi_next (&si))
5842 gphi *phi = si.phi ();
5843 gimple_set_uid (phi, 0);
5844 add_stmt (phi);
5846 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
5847 !gsi_end_p (gsi); gsi_next (&gsi))
5849 gimple *stmt = gsi_stmt (gsi);
5850 gimple_set_uid (stmt, 0);
5851 if (is_gimple_debug (stmt))
5852 continue;
5853 add_stmt (stmt);
5859 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
5860 stmts in the basic block. */
5862 _bb_vec_info::~_bb_vec_info ()
5864 /* Reset region marker. */
5865 for (unsigned i = 0; i < bbs.length (); ++i)
5867 if (i != 0)
5868 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
5869 gsi_next (&si))
5871 gphi *phi = si.phi ();
5872 gimple_set_uid (phi, -1);
5874 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
5875 !gsi_end_p (gsi); gsi_next (&gsi))
5877 gimple *stmt = gsi_stmt (gsi);
5878 gimple_set_uid (stmt, -1);
5882 for (unsigned i = 0; i < roots.length (); ++i)
5884 roots[i].stmts.release ();
5885 roots[i].roots.release ();
5887 roots.release ();
5890 /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
5891 given that child nodes have already been processed, and that
5892 their def types currently match their SLP node's def type. */
5894 static bool
5895 vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
5896 slp_instance node_instance,
5897 stmt_vector_for_cost *cost_vec)
5899 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
5901 /* Calculate the number of vector statements to be created for the
5902 scalar stmts in this node. For SLP reductions it is equal to the
5903 number of vector statements in the children (which has already been
5904 calculated by the recursive call). Otherwise it is the number of
5905 scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
5906 VF divided by the number of elements in a vector. */
5907 if (!STMT_VINFO_DATA_REF (stmt_info)
5908 && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5910 for (unsigned i = 0; i < SLP_TREE_CHILDREN (node).length (); ++i)
5911 if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node)[i]) == vect_internal_def)
5913 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
5914 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[i]);
5915 break;
5918 else
5920 poly_uint64 vf;
5921 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
5922 vf = loop_vinfo->vectorization_factor;
5923 else
5924 vf = 1;
5925 unsigned int group_size = SLP_TREE_LANES (node);
5926 tree vectype = SLP_TREE_VECTYPE (node);
5927 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
5928 = vect_get_num_vectors (vf * group_size, vectype);
5931 /* Handle purely internal nodes. */
5932 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5933 return vectorizable_slp_permutation (vinfo, NULL, node, cost_vec);
5935 gcc_assert (STMT_SLP_TYPE (stmt_info) != loop_vect);
5937 bool dummy;
5938 return vect_analyze_stmt (vinfo, stmt_info, &dummy,
5939 node, node_instance, cost_vec);
5942 /* Try to build NODE from scalars, returning true on success.
5943 NODE_INSTANCE is the SLP instance that contains NODE. */
5945 static bool
5946 vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
5947 slp_instance node_instance)
5949 stmt_vec_info stmt_info;
5950 unsigned int i;
5952 if (!is_a <bb_vec_info> (vinfo)
5953 || node == SLP_INSTANCE_TREE (node_instance)
5954 || !SLP_TREE_SCALAR_STMTS (node).exists ()
5955 || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node))
5956 /* Force the mask use to be built from scalars instead. */
5957 || VECTOR_BOOLEAN_TYPE_P (SLP_TREE_VECTYPE (node)))
5958 return false;
5960 if (dump_enabled_p ())
5961 dump_printf_loc (MSG_NOTE, vect_location,
5962 "Building vector operands of %p from scalars instead\n",
5963 (void *) node);
5965 /* Don't remove and free the child nodes here, since they could be
5966 referenced by other structures. The analysis and scheduling phases
5967 (need to) ignore child nodes of anything that isn't vect_internal_def. */
5968 unsigned int group_size = SLP_TREE_LANES (node);
5969 SLP_TREE_DEF_TYPE (node) = vect_external_def;
5970 /* Invariants get their vector type from the uses. */
5971 SLP_TREE_VECTYPE (node) = NULL_TREE;
5972 SLP_TREE_SCALAR_OPS (node).safe_grow (group_size, true);
5973 SLP_TREE_LOAD_PERMUTATION (node).release ();
5974 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
5976 tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
5977 SLP_TREE_SCALAR_OPS (node)[i] = lhs;
5979 return true;
5982 /* Return true if all elements of the slice are the same. */
5983 bool
5984 vect_scalar_ops_slice::all_same_p () const
5986 for (unsigned int i = 1; i < length; ++i)
5987 if (!operand_equal_p (op (0), op (i)))
5988 return false;
5989 return true;
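/* Hash a slice of scalar operands by combining the hash of each element.  */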
5992 hashval_t
5993 vect_scalar_ops_slice_hash::hash (const value_type &s)
5995 hashval_t hash = 0;
5996 for (unsigned i = 0; i < s.length; ++i)
5997 hash = iterative_hash_expr (s.op (i), hash);
5998 return hash;
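/* Return true if slices S1 and S2 have the same length and pairwise equal
   operands.  */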
6001 bool
6002 vect_scalar_ops_slice_hash::equal (const value_type &s1,
6003 const compare_type &s2)
6005 if (s1.length != s2.length)
6006 return false;
6007 for (unsigned i = 0; i < s1.length; ++i)
6008 if (!operand_equal_p (s1.op (i), s2.op (i)))
6009 return false;
6010 return true;
6013 /* Compute the prologue cost for invariant or constant operands represented
6014 by NODE. */
6016 static void
6017 vect_prologue_cost_for_slp (slp_tree node,
6018 stmt_vector_for_cost *cost_vec)
6020 /* There's a special case of an existing vector, that costs nothing. */
6021 if (SLP_TREE_SCALAR_OPS (node).length () == 0
6022 && !SLP_TREE_VEC_DEFS (node).is_empty ())
6023 return;
6024 /* Without looking at the actual initializer a vector of
6025 constants can be implemented as load from the constant pool.
6026 When all elements are the same we can use a splat. */
6027 tree vectype = SLP_TREE_VECTYPE (node);
6028 unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
6029 unsigned HOST_WIDE_INT const_nunits;
6030 unsigned nelt_limit;
6031 auto ops = &SLP_TREE_SCALAR_OPS (node);
6032 auto_vec<unsigned int> starts (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
6033 if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
6034 && ! multiple_p (const_nunits, group_size))
6036 nelt_limit = const_nunits;
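/* Hash each vector-sized slice of the scalar operands so that identical
   constant vectors within the node are costed only once.  */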
6037 hash_set<vect_scalar_ops_slice_hash> vector_ops;
6038 for (unsigned int i = 0; i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); ++i)
6039 if (!vector_ops.add ({ ops, i * const_nunits, const_nunits }))
6040 starts.quick_push (i * const_nunits);
6042 else
6044 /* If either the vector has variable length or the vectors
6045 are composed of repeated whole groups we only need to
6046 cost construction once. All vectors will be the same. */
6047 nelt_limit = group_size;
6048 starts.quick_push (0);
6050 /* ??? We're just tracking whether vectors in a single node are the same.
6051 Ideally we'd do something more global. */
6052 for (unsigned int start : starts)
6054 vect_cost_for_stmt kind;
6055 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def)
6056 kind = vector_load;
6057 else if (vect_scalar_ops_slice { ops, start, nelt_limit }.all_same_p ())
6058 kind = scalar_to_vec;
6059 else
6060 kind = vec_construct;
6061 record_stmt_cost (cost_vec, 1, kind, node, vectype, 0, vect_prologue);
6065 /* Analyze statements contained in SLP tree NODE after recursively analyzing
6066 the subtree. NODE_INSTANCE contains NODE and VINFO contains INSTANCE.
6068 Return true if the operations are supported. */
6070 static bool
6071 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
6072 slp_instance node_instance,
6073 hash_set<slp_tree> &visited_set,
6074 vec<slp_tree> &visited_vec,
6075 stmt_vector_for_cost *cost_vec)
6077 int i, j;
6078 slp_tree child;
6080 /* Assume we can code-generate all invariants. */
6081 if (!node
6082 || SLP_TREE_DEF_TYPE (node) == vect_constant_def
6083 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
6084 return true;
6086 if (SLP_TREE_DEF_TYPE (node) == vect_uninitialized_def)
6088 if (dump_enabled_p ())
6089 dump_printf_loc (MSG_NOTE, vect_location,
6090 "Failed cyclic SLP reference in %p\n", (void *) node);
6091 return false;
6093 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_internal_def);
6095 /* If we already analyzed the exact same set of scalar stmts we're done.
6096 We share the generated vector stmts for those. */
6097 if (visited_set.add (node))
6098 return true;
6099 visited_vec.safe_push (node);
6101 bool res = true;
6102 unsigned visited_rec_start = visited_vec.length ();
6103 unsigned cost_vec_rec_start = cost_vec->length ();
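/* VISITED_REC_START and COST_VEC_REC_START mark rollback points in case
   analysis of this node or its children fails below.  */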
6104 bool seen_non_constant_child = false;
6105 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6107 res = vect_slp_analyze_node_operations (vinfo, child, node_instance,
6108 visited_set, visited_vec,
6109 cost_vec);
6110 if (!res)
6111 break;
6112 if (child && SLP_TREE_DEF_TYPE (child) != vect_constant_def)
6113 seen_non_constant_child = true;
6115 /* We're having difficulties scheduling nodes with just constant
6116 operands and no scalar stmts since we then cannot compute a stmt
6117 insertion place. */
6118 if (!seen_non_constant_child && SLP_TREE_SCALAR_STMTS (node).is_empty ())
6120 if (dump_enabled_p ())
6121 dump_printf_loc (MSG_NOTE, vect_location,
6122 "Cannot vectorize all-constant op node %p\n",
6123 (void *) node);
6124 res = false;
6127 if (res)
6128 res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
6129 cost_vec);
6130 /* If analysis failed we have to pop all recursive visited nodes
6131 plus ourselves. */
6132 if (!res)
6134 while (visited_vec.length () >= visited_rec_start)
6135 visited_set.remove (visited_vec.pop ());
6136 cost_vec->truncate (cost_vec_rec_start);
6139 /* When the node can be vectorized, cost the invariant nodes it references.
6140 This is not done in DFS order so that the referring node's
6141 vectorizable_* calls can nail down the invariant node's vector type
6142 and possibly unshare it if it needs a different vector type than
6143 other referrers. */
6144 if (res)
6145 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
6146 if (child
6147 && (SLP_TREE_DEF_TYPE (child) == vect_constant_def
6148 || SLP_TREE_DEF_TYPE (child) == vect_external_def)
6149 /* Perform usual caching, note code-generation still
6150 code-gens these nodes multiple times but we expect
6151 to CSE them later. */
6152 && !visited_set.add (child))
6154 visited_vec.safe_push (child);
6155 /* ??? After auditing more code paths make a "default"
6156 and push the vector type from NODE to all children
6157 if it is not already set. */
6158 /* Compute the number of vectors to be generated. */
6159 tree vector_type = SLP_TREE_VECTYPE (child);
6160 if (!vector_type)
6162 /* For shifts with a scalar argument we don't need
6163 to cost or code-generate anything.
6164 ??? Represent this more explicitly. */
6165 gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
6166 == shift_vec_info_type)
6167 && j == 1);
6168 continue;
6170 unsigned group_size = SLP_TREE_LANES (child);
6171 poly_uint64 vf = 1;
6172 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
6173 vf = loop_vinfo->vectorization_factor;
6174 SLP_TREE_NUMBER_OF_VEC_STMTS (child)
6175 = vect_get_num_vectors (vf * group_size, vector_type);
6176 /* And cost them. */
6177 vect_prologue_cost_for_slp (child, cost_vec);
6180 /* If this node or any of its children can't be vectorized, try pruning
6181 the tree here rather than felling the whole thing. */
6182 if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
6184 /* We'll need to revisit this for invariant costing and number
6185 of vectorized stmt setting. */
6186 res = true;
6189 return res;
6192 /* Mark lanes of NODE that are live outside of the basic-block vectorized
6193 region and that can be vectorized using vectorizable_live_operation
6194 with STMT_VINFO_LIVE_P. Live operations that cannot be handled will cause
6195 the scalar code computing them to be retained. */
6197 static void
6198 vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
6199 slp_instance instance,
6200 stmt_vector_for_cost *cost_vec,
6201 hash_set<stmt_vec_info> &svisited,
6202 hash_set<slp_tree> &visited)
6204 if (visited.add (node))
6205 return;
6207 unsigned i;
6208 stmt_vec_info stmt_info;
6209 stmt_vec_info last_stmt = vect_find_last_scalar_stmt_in_slp (node);
6210 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6212 if (svisited.contains (stmt_info))
6213 continue;
6214 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
6215 if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)
6216 && STMT_VINFO_RELATED_STMT (orig_stmt_info) != stmt_info)
6217 /* Only the pattern root stmt computes the original scalar value. */
6218 continue;
6219 bool mark_visited = true;
6220 gimple *orig_stmt = orig_stmt_info->stmt;
6221 ssa_op_iter op_iter;
6222 def_operand_p def_p;
6223 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
6225 imm_use_iterator use_iter;
6226 gimple *use_stmt;
6227 stmt_vec_info use_stmt_info;
6228 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
6229 if (!is_gimple_debug (use_stmt))
6231 use_stmt_info = bb_vinfo->lookup_stmt (use_stmt);
6232 if (!use_stmt_info
6233 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
6235 STMT_VINFO_LIVE_P (stmt_info) = true;
6236 if (vectorizable_live_operation (bb_vinfo, stmt_info,
6237 NULL, node, instance, i,
6238 false, cost_vec))
6239 /* ??? So we know we can vectorize the live stmt
6240 from one SLP node. If we cannot do so from all
6241 or none consistently we'd have to record which
6242 SLP node (and lane) we want to use for the live
6243 operation. So make sure we can code-generate
6244 from all nodes. */
6245 mark_visited = false;
6246 else
6247 STMT_VINFO_LIVE_P (stmt_info) = false;
6248 break;
6251 /* We have to verify whether we can insert the lane extract
6252 before all uses. The following is a conservative approximation.
6253 We cannot put this into vectorizable_live_operation because
6254 iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
6255 doesn't work.
6256 Note that while emitting code for loads at the first load should
6257 make this a non-problem, leaves we construct from scalars are
6258 vectorized after the last scalar def.
6259 ??? If we'd actually compute the insert location during
6260 analysis we could use sth less conservative than the last
6261 scalar stmt in the node for the dominance check. */
6262 /* ??? What remains is "live" uses in vector CTORs in the same
6263 SLP graph which is where those uses can end up code-generated
6264 right after their definition instead of close to their original
6265 use. But that would restrict us to code-generate lane-extracts
6266 from the latest stmt in a node. So we compensate for this
6267 during code-generation, simply not replacing uses for those
6268 hopefully rare cases. */
6269 if (STMT_VINFO_LIVE_P (stmt_info))
6270 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
6271 if (!is_gimple_debug (use_stmt)
6272 && (!(use_stmt_info = bb_vinfo->lookup_stmt (use_stmt))
6273 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
6274 && !vect_stmt_dominates_stmt_p (last_stmt->stmt, use_stmt))
6276 if (dump_enabled_p ())
6277 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6278 "Cannot determine insertion place for "
6279 "lane extract\n");
6280 STMT_VINFO_LIVE_P (stmt_info) = false;
6281 mark_visited = true;
6284 if (mark_visited)
6285 svisited.add (stmt_info);
6288 slp_tree child;
6289 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6290 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
6291 vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance,
6292 cost_vec, svisited, visited);
6295 /* Determine whether we can vectorize the reduction epilogue for INSTANCE. */
6297 static bool
6298 vectorizable_bb_reduc_epilogue (slp_instance instance,
6299 stmt_vector_for_cost *cost_vec)
6301 gassign *stmt = as_a <gassign *> (instance->root_stmts[0]->stmt);
6302 enum tree_code reduc_code = gimple_assign_rhs_code (stmt);
6303 if (reduc_code == MINUS_EXPR)
6304 reduc_code = PLUS_EXPR;
6305 internal_fn reduc_fn;
6306 tree vectype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (instance));
6307 if (!vectype
6308 || !reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
6309 || reduc_fn == IFN_LAST
6310 || !direct_internal_fn_supported_p (reduc_fn, vectype, OPTIMIZE_FOR_BOTH)
6311 || !useless_type_conversion_p (TREE_TYPE (gimple_assign_lhs (stmt)),
6312 TREE_TYPE (vectype)))
6313 return false;
6315 /* There's no way to cost a horizontal vector reduction via REDUC_FN so
6316 cost log2 vector operations plus shuffles and one extraction. */
6317 unsigned steps = floor_log2 (vect_nunits_for_cost (vectype));
6318 record_stmt_cost (cost_vec, steps, vector_stmt, instance->root_stmts[0],
6319 vectype, 0, vect_body);
6320 record_stmt_cost (cost_vec, steps, vec_perm, instance->root_stmts[0],
6321 vectype, 0, vect_body);
6322 record_stmt_cost (cost_vec, 1, vec_to_scalar, instance->root_stmts[0],
6323 vectype, 0, vect_body);
6324 return true;
6327 /* Prune from ROOTS all stmts that are computed as part of lanes of NODE
6328 and recurse to children. */
6330 static void
6331 vect_slp_prune_covered_roots (slp_tree node, hash_set<stmt_vec_info> &roots,
6332 hash_set<slp_tree> &visited)
6334 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def
6335 || visited.add (node))
6336 return;
6338 stmt_vec_info stmt;
6339 unsigned i;
6340 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
6341 roots.remove (vect_orig_stmt (stmt));
6343 slp_tree child;
6344 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6345 if (child)
6346 vect_slp_prune_covered_roots (child, roots, visited);
6349 /* Analyze statements in SLP instances of VINFO. Return true if the
6350 operations are supported. */
6352 bool
6353 vect_slp_analyze_operations (vec_info *vinfo)
6355 slp_instance instance;
6356 int i;
6358 DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
6360 hash_set<slp_tree> visited;
6361 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
6363 auto_vec<slp_tree> visited_vec;
6364 stmt_vector_for_cost cost_vec;
6365 cost_vec.create (2);
6366 if (is_a <bb_vec_info> (vinfo))
6367 vect_location = instance->location ();
6368 if (!vect_slp_analyze_node_operations (vinfo,
6369 SLP_INSTANCE_TREE (instance),
6370 instance, visited, visited_vec,
6371 &cost_vec)
6372 /* CTOR instances require vectorized defs for the SLP tree root. */
6373 || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor
6374 && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
6375 != vect_internal_def
6376 /* Make sure we vectorized with the expected type. */
6377 || !useless_type_conversion_p
6378 (TREE_TYPE (TREE_TYPE (gimple_assign_rhs1
6379 (instance->root_stmts[0]->stmt))),
6380 TREE_TYPE (SLP_TREE_VECTYPE
6381 (SLP_INSTANCE_TREE (instance))))))
6382 /* Check we can vectorize the reduction. */
6383 || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_bb_reduc
6384 && !vectorizable_bb_reduc_epilogue (instance, &cost_vec)))
6386 slp_tree node = SLP_INSTANCE_TREE (instance);
6387 stmt_vec_info stmt_info;
6388 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
6389 stmt_info = SLP_INSTANCE_ROOT_STMTS (instance)[0];
6390 else
6391 stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
6392 if (dump_enabled_p ())
6393 dump_printf_loc (MSG_NOTE, vect_location,
6394 "removing SLP instance operations starting from: %G",
6395 stmt_info->stmt);
6396 vect_free_slp_instance (instance);
6397 vinfo->slp_instances.ordered_remove (i);
6398 cost_vec.release ();
6399 while (!visited_vec.is_empty ())
6400 visited.remove (visited_vec.pop ());
6402 else
6404 i++;
6405 if (loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo))
6407 add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
6408 cost_vec.release ();
6410 else
6411 /* For BB vectorization remember the SLP graph entry
6412 cost for later. */
6413 instance->cost_vec = cost_vec;
6417 /* Now look for SLP instances with a root that are covered by other
6418 instances and remove them. */
6419 hash_set<stmt_vec_info> roots;
6420 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
6421 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
6422 roots.add (SLP_INSTANCE_ROOT_STMTS (instance)[0]);
6423 if (!roots.is_empty ())
6425 visited.empty ();
6426 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
6427 vect_slp_prune_covered_roots (SLP_INSTANCE_TREE (instance), roots,
6428 visited);
6429 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
6430 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ()
6431 && !roots.contains (SLP_INSTANCE_ROOT_STMTS (instance)[0]))
6433 stmt_vec_info root = SLP_INSTANCE_ROOT_STMTS (instance)[0];
6434 if (dump_enabled_p ())
6435 dump_printf_loc (MSG_NOTE, vect_location,
6436 "removing SLP instance operations starting "
6437 "from: %G", root->stmt);
6438 vect_free_slp_instance (instance);
6439 vinfo->slp_instances.ordered_remove (i);
6441 else
6442 ++i;
6445 /* Compute vectorizable live stmts. */
6446 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
6448 hash_set<stmt_vec_info> svisited;
6449 hash_set<slp_tree> visited;
6450 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
6452 vect_location = instance->location ();
6453 vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
6454 instance, &instance->cost_vec, svisited,
6455 visited);
6459 return !vinfo->slp_instances.is_empty ();
6462 /* Get the ultimate SLP instance leader of INSTANCE from INSTANCE_LEADER,
6463 transitively collapsing any chain of leaders along the way. */
6465 static slp_instance
6466 get_ultimate_leader (slp_instance instance,
6467 hash_map<slp_instance, slp_instance> &instance_leader)
6469 auto_vec<slp_instance *, 8> chain;
6470 slp_instance *tem;
6471 while (*(tem = instance_leader.get (instance)) != instance)
6473 chain.safe_push (tem);
6474 instance = *tem;
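/* Path-compress: make every visited entry point directly at the ultimate
   leader, so a chain A -> B -> C becomes A -> C and B -> C.  */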
6476 while (!chain.is_empty ())
6477 *chain.pop () = instance;
6478 return instance;
6481 namespace {
6482 /* Subroutine of vect_bb_partition_graph_r. Map KEY to INSTANCE in
6483 KEY_TO_INSTANCE, making INSTANCE the leader of any previous mapping
6484 for KEY. Return true if KEY was already in KEY_TO_INSTANCE.
6486 INSTANCE_LEADER is as for get_ultimate_leader. */
6488 template<typename T>
6489 bool
6490 vect_map_to_instance (slp_instance instance, T key,
6491 hash_map<T, slp_instance> &key_to_instance,
6492 hash_map<slp_instance, slp_instance> &instance_leader)
6494 bool existed_p;
6495 slp_instance &key_instance = key_to_instance.get_or_insert (key, &existed_p);
6496 if (!existed_p)
6498 else if (key_instance != instance)
6500 /* If we're running into a previously marked key make us the
6501 leader of the current ultimate leader. This keeps the
6502 leader chain acyclic and works even when the current instance
6503 connects two previously independent graph parts. */
6504 slp_instance key_leader
6505 = get_ultimate_leader (key_instance, instance_leader);
6506 if (key_leader != instance)
6507 instance_leader.put (key_leader, instance);
6509 key_instance = instance;
6510 return existed_p;
6514 /* Worker of vect_bb_partition_graph, recurse on NODE. */
6516 static void
6517 vect_bb_partition_graph_r (bb_vec_info bb_vinfo,
6518 slp_instance instance, slp_tree node,
6519 hash_map<stmt_vec_info, slp_instance> &stmt_to_instance,
6520 hash_map<slp_tree, slp_instance> &node_to_instance,
6521 hash_map<slp_instance, slp_instance> &instance_leader)
6523 stmt_vec_info stmt_info;
6524 unsigned i;
6526 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6527 vect_map_to_instance (instance, stmt_info, stmt_to_instance,
6528 instance_leader);
6530 if (vect_map_to_instance (instance, node, node_to_instance,
6531 instance_leader))
6532 return;
6534 slp_tree child;
6535 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6536 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
6537 vect_bb_partition_graph_r (bb_vinfo, instance, child, stmt_to_instance,
6538 node_to_instance, instance_leader);
6541 /* Partition the SLP graph into pieces that can be costed independently. */
6543 static void
6544 vect_bb_partition_graph (bb_vec_info bb_vinfo)
6546 DUMP_VECT_SCOPE ("vect_bb_partition_graph");
6548 /* First walk the SLP graph assigning each involved scalar stmt a
6549 corresponding SLP graph entry and upon visiting a previously
6550 marked stmt, make the stmt's leader the current SLP graph entry. */
6551 hash_map<stmt_vec_info, slp_instance> stmt_to_instance;
6552 hash_map<slp_tree, slp_instance> node_to_instance;
6553 hash_map<slp_instance, slp_instance> instance_leader;
6554 slp_instance instance;
6555 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
6557 instance_leader.put (instance, instance);
6558 vect_bb_partition_graph_r (bb_vinfo,
6559 instance, SLP_INSTANCE_TREE (instance),
6560 stmt_to_instance, node_to_instance,
6561 instance_leader);
6564 /* Then collect entries to each independent subgraph. */
6565 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
6567 slp_instance leader = get_ultimate_leader (instance, instance_leader);
6568 leader->subgraph_entries.safe_push (instance);
6569 if (dump_enabled_p ()
6570 && leader != instance)
6571 dump_printf_loc (MSG_NOTE, vect_location,
6572 "instance %p is leader of %p\n",
6573 (void *) leader, (void *) instance);
6577 /* Compute the set of scalar stmts participating in internal and external
6578 nodes. */
6580 static void
6581 vect_slp_gather_vectorized_scalar_stmts (vec_info *vinfo, slp_tree node,
6582 hash_set<slp_tree> &visited,
6583 hash_set<stmt_vec_info> &vstmts,
6584 hash_set<stmt_vec_info> &estmts)
6586 int i;
6587 stmt_vec_info stmt_info;
6588 slp_tree child;
6590 if (visited.add (node))
6591 return;
6593 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
6595 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6596 vstmts.add (stmt_info);
6598 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6599 if (child)
6600 vect_slp_gather_vectorized_scalar_stmts (vinfo, child, visited,
6601 vstmts, estmts);
6603 else
6604 for (tree def : SLP_TREE_SCALAR_OPS (node))
6606 stmt_vec_info def_stmt = vinfo->lookup_def (def);
6607 if (def_stmt)
6608 estmts.add (def_stmt);
6613 /* Compute the scalar cost of the SLP node NODE and its children
6614 and return it. Do not account defs that are marked in LIFE and
6615 update LIFE according to uses of NODE. */
6617 static void
6618 vect_bb_slp_scalar_cost (vec_info *vinfo,
6619 slp_tree node, vec<bool, va_heap> *life,
6620 stmt_vector_for_cost *cost_vec,
6621 hash_set<stmt_vec_info> &vectorized_scalar_stmts,
6622 hash_set<slp_tree> &visited)
6624 unsigned i;
6625 stmt_vec_info stmt_info;
6626 slp_tree child;
6628 if (visited.add (node))
6629 return;
6631 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6633 ssa_op_iter op_iter;
6634 def_operand_p def_p;
6636 if ((*life)[i])
6637 continue;
6639 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
6640 gimple *orig_stmt = orig_stmt_info->stmt;
6642 /* If there is a non-vectorized use of the defs then the scalar
6643 stmt is kept live in which case we do not account it or any
6644 required defs in the SLP children in the scalar cost. This
6645 way we make the vectorization more costly when compared to
6646 the scalar cost. */
6647 if (!STMT_VINFO_LIVE_P (stmt_info))
6649 auto_vec<gimple *, 8> worklist;
6650 hash_set<gimple *> *worklist_visited = NULL;
6651 worklist.quick_push (orig_stmt);
6654 gimple *work_stmt = worklist.pop ();
6655 FOR_EACH_PHI_OR_STMT_DEF (def_p, work_stmt, op_iter, SSA_OP_DEF)
6657 imm_use_iterator use_iter;
6658 gimple *use_stmt;
6659 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter,
6660 DEF_FROM_PTR (def_p))
6661 if (!is_gimple_debug (use_stmt))
6663 stmt_vec_info use_stmt_info
6664 = vinfo->lookup_stmt (use_stmt);
6665 if (!use_stmt_info
6666 || !vectorized_scalar_stmts.contains (use_stmt_info))
6668 if (use_stmt_info
6669 && STMT_VINFO_IN_PATTERN_P (use_stmt_info))
6671 /* For stmts participating in patterns we have
6672 to check its uses recursively. */
6673 if (!worklist_visited)
6674 worklist_visited = new hash_set<gimple *> ();
6675 if (!worklist_visited->add (use_stmt))
6676 worklist.safe_push (use_stmt);
6677 continue;
6679 (*life)[i] = true;
6680 goto next_lane;
6685 while (!worklist.is_empty ());
6686 next_lane:
6687 if (worklist_visited)
6688 delete worklist_visited;
6689 if ((*life)[i])
6690 continue;
6693 /* Count scalar stmts only once. */
6694 if (gimple_visited_p (orig_stmt))
6695 continue;
6696 gimple_set_visited (orig_stmt, true);
6698 vect_cost_for_stmt kind;
6699 if (STMT_VINFO_DATA_REF (orig_stmt_info))
6701 if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info)))
6702 kind = scalar_load;
6703 else
6704 kind = scalar_store;
6706 else if (vect_nop_conversion_p (orig_stmt_info))
6707 continue;
6708 /* For single-argument PHIs assume coalescing which means zero cost
6709 for the scalar and the vector PHIs. This avoids artificially
6710 favoring the vector path (but may pessimize it in some cases). */
6711 else if (is_a <gphi *> (orig_stmt_info->stmt)
6712 && gimple_phi_num_args
6713 (as_a <gphi *> (orig_stmt_info->stmt)) == 1)
6714 continue;
6715 else
6716 kind = scalar_stmt;
6717 record_stmt_cost (cost_vec, 1, kind, orig_stmt_info,
6718 SLP_TREE_VECTYPE (node), 0, vect_body);
6721 auto_vec<bool, 20> subtree_life;
6722 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6724 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
6726 /* Do not directly pass LIFE to the recursive call, copy it to
6727 confine changes in the callee to the current child/subtree. */
6728 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
6730 subtree_life.safe_grow_cleared (SLP_TREE_LANES (child), true);
6731 for (unsigned j = 0;
6732 j < SLP_TREE_LANE_PERMUTATION (node).length (); ++j)
6734 auto perm = SLP_TREE_LANE_PERMUTATION (node)[j];
6735 if (perm.first == i)
6736 subtree_life[perm.second] = (*life)[j];
6739 else
6741 gcc_assert (SLP_TREE_LANES (node) == SLP_TREE_LANES (child));
6742 subtree_life.safe_splice (*life);
6744 vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
6745 vectorized_scalar_stmts, visited);
6746 subtree_life.truncate (0);
6751 /* Comparator for the loop-index sorted cost vectors. */
6753 static int
6754 li_cost_vec_cmp (const void *a_, const void *b_)
6756 auto *a = (const std::pair<unsigned, stmt_info_for_cost *> *)a_;
6757 auto *b = (const std::pair<unsigned, stmt_info_for_cost *> *)b_;
6758 if (a->first < b->first)
6759 return -1;
6760 else if (a->first == b->first)
6761 return 0;
6762 return 1;
6765 /* Check if vectorization of the basic block is profitable for the
6766 subgraph denoted by SLP_INSTANCES. */
6768 static bool
6769 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
6770 vec<slp_instance> slp_instances,
6771 loop_p orig_loop)
6773 slp_instance instance;
6774 int i;
6775 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
6776 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
6778 if (dump_enabled_p ())
6780 dump_printf_loc (MSG_NOTE, vect_location, "Costing subgraph: \n");
6781 hash_set<slp_tree> visited;
6782 FOR_EACH_VEC_ELT (slp_instances, i, instance)
6783 vect_print_slp_graph (MSG_NOTE, vect_location,
6784 SLP_INSTANCE_TREE (instance), visited);
6787 /* Compute the set of scalar stmts we know will go away 'locally' when
6788 vectorizing. This used to be tracked with just PURE_SLP_STMT but that's
6789 not accurate for nodes promoted extern late or for scalar stmts that
6790 are used both in extern defs and in vectorized defs. */
6791 hash_set<stmt_vec_info> vectorized_scalar_stmts;
6792 hash_set<stmt_vec_info> scalar_stmts_in_externs;
6793 hash_set<slp_tree> visited;
6794 FOR_EACH_VEC_ELT (slp_instances, i, instance)
6796 vect_slp_gather_vectorized_scalar_stmts (bb_vinfo,
6797 SLP_INSTANCE_TREE (instance),
6798 visited,
6799 vectorized_scalar_stmts,
6800 scalar_stmts_in_externs);
6801 for (stmt_vec_info rstmt : SLP_INSTANCE_ROOT_STMTS (instance))
6802 vectorized_scalar_stmts.add (rstmt);
6804 /* Scalar stmts used as defs in external nodes need to be preserved, so
6805 remove them from vectorized_scalar_stmts. */
6806 for (stmt_vec_info stmt : scalar_stmts_in_externs)
6807 vectorized_scalar_stmts.remove (stmt);
6809 /* Calculate scalar cost and sum the cost for the vector stmts
6810 previously collected. */
6811 stmt_vector_for_cost scalar_costs = vNULL;
6812 stmt_vector_for_cost vector_costs = vNULL;
6813 visited.empty ();
6814 FOR_EACH_VEC_ELT (slp_instances, i, instance)
6816 auto_vec<bool, 20> life;
6817 life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)),
6818 true);
6819 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
6820 record_stmt_cost (&scalar_costs,
6821 SLP_INSTANCE_ROOT_STMTS (instance).length (),
6822 scalar_stmt,
6823 SLP_INSTANCE_ROOT_STMTS (instance)[0], 0, vect_body);
6824 vect_bb_slp_scalar_cost (bb_vinfo,
6825 SLP_INSTANCE_TREE (instance),
6826 &life, &scalar_costs, vectorized_scalar_stmts,
6827 visited);
6828 vector_costs.safe_splice (instance->cost_vec);
6829 instance->cost_vec.release ();
6832 if (dump_enabled_p ())
6833 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
6835 /* When costing non-loop vectorization we need to consider each covered
6836 loop independently and make sure vectorization is profitable. For
6837 now we assume a loop may not be entered or may be executed an arbitrary
6838 number of iterations (??? static information can provide more
6839 precise info here), which means we can simply cost each containing
6840 loop's stmts separately. */
6842 /* First produce cost vectors sorted by loop index. */
6843 auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
6844 li_scalar_costs (scalar_costs.length ());
6845 auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
6846 li_vector_costs (vector_costs.length ());
6847 stmt_info_for_cost *cost;
6848 FOR_EACH_VEC_ELT (scalar_costs, i, cost)
6850 unsigned l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
6851 li_scalar_costs.quick_push (std::make_pair (l, cost));
6853 /* Use an arbitrary loop from the scalar costs as fallback in case the first
6854 vector_costs entry does not have a stmt_info associated with it. */
6855 unsigned l = li_scalar_costs[0].first;
6856 FOR_EACH_VEC_ELT (vector_costs, i, cost)
6858 /* We inherit the loop from the previous COST; invariants, externals and
6859 extracts immediately follow the cost for the related stmt. */
6860 if (cost->stmt_info)
6861 l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
6862 li_vector_costs.quick_push (std::make_pair (l, cost));
6864 li_scalar_costs.qsort (li_cost_vec_cmp);
6865 li_vector_costs.qsort (li_cost_vec_cmp);
6867 /* Now cost the portions individually. */
6868 unsigned vi = 0;
6869 unsigned si = 0;
6870 bool profitable = true;
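/* Walk the two loop-index sorted cost vectors in parallel, finishing and
   comparing the scalar and vector cost of each loop's part in turn.  */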
6871 while (si < li_scalar_costs.length ()
6872 && vi < li_vector_costs.length ())
6874 unsigned sl = li_scalar_costs[si].first;
6875 unsigned vl = li_vector_costs[vi].first;
6876 if (sl != vl)
6878 if (dump_enabled_p ())
6879 dump_printf_loc (MSG_NOTE, vect_location,
6880 "Scalar %d and vector %d loop part do not "
6881 "match up, skipping scalar part\n", sl, vl);
6882 /* Skip the scalar part, assuming zero cost on the vector side. */
6885 si++;
6887 while (si < li_scalar_costs.length ()
6888 && li_scalar_costs[si].first == sl);
6889 continue;
6892 class vector_costs *scalar_target_cost_data = init_cost (bb_vinfo, true);
6895 add_stmt_cost (scalar_target_cost_data, li_scalar_costs[si].second);
6896 si++;
6898 while (si < li_scalar_costs.length ()
6899 && li_scalar_costs[si].first == sl);
6900 unsigned dummy;
6901 finish_cost (scalar_target_cost_data, nullptr,
6902 &dummy, &scalar_cost, &dummy);
6904 /* Complete the target-specific vector cost calculation. */
6905 class vector_costs *vect_target_cost_data = init_cost (bb_vinfo, false);
6908 add_stmt_cost (vect_target_cost_data, li_vector_costs[vi].second);
6909 vi++;
6911 while (vi < li_vector_costs.length ()
6912 && li_vector_costs[vi].first == vl);
6913 finish_cost (vect_target_cost_data, scalar_target_cost_data,
6914 &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost);
6915 delete scalar_target_cost_data;
6916 delete vect_target_cost_data;
6918 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
6920 if (dump_enabled_p ())
6922 dump_printf_loc (MSG_NOTE, vect_location,
6923 "Cost model analysis for part in loop %d:\n", sl);
6924 dump_printf (MSG_NOTE, " Vector cost: %d\n",
6925 vec_inside_cost + vec_outside_cost);
6926 dump_printf (MSG_NOTE, " Scalar cost: %d\n", scalar_cost);
6929 /* Vectorization is profitable if its cost is no more than the cost of the
6930 scalar version. Note that we err on the vector side for equal cost because
6931 the cost estimate is otherwise quite pessimistic (constant uses are
6932 free on the scalar side but cost a load on the vector side for
6933 example). */
6934 if (vec_outside_cost + vec_inside_cost > scalar_cost)
6936 profitable = false;
6937 break;
6940 if (profitable && vi < li_vector_costs.length ())
6942 if (dump_enabled_p ())
6943 dump_printf_loc (MSG_NOTE, vect_location,
6944 "Excess vector cost for part in loop %d:\n",
6945 li_vector_costs[vi].first);
6946 profitable = false;
6949 /* Unset visited flag. This is delayed when the subgraph is profitable
6950 and we process the loop for remaining unvectorized if-converted code. */
6951 if (!orig_loop || !profitable)
6952 FOR_EACH_VEC_ELT (scalar_costs, i, cost)
6953 gimple_set_visited (cost->stmt_info->stmt, false);
6955 scalar_costs.release ();
6956 vector_costs.release ();
6958 return profitable;
6961 /* qsort comparator for lane defs. */
6963 static int
6964 vld_cmp (const void *a_, const void *b_)
6966 auto *a = (const std::pair<unsigned, tree> *)a_;
6967 auto *b = (const std::pair<unsigned, tree> *)b_;
6968 return a->first - b->first;
6971 /* Return true if USE_STMT is a vector lane insert into VEC and set
6972 *THIS_LANE to the lane number that is set. */
6974 static bool
6975 vect_slp_is_lane_insert (gimple *use_stmt, tree vec, unsigned *this_lane)
6977 gassign *use_ass = dyn_cast <gassign *> (use_stmt);
6978 if (!use_ass
6979 || gimple_assign_rhs_code (use_ass) != BIT_INSERT_EXPR
6980 || (vec
6981 ? gimple_assign_rhs1 (use_ass) != vec
6982 : ((vec = gimple_assign_rhs1 (use_ass)), false))
6983 || !useless_type_conversion_p (TREE_TYPE (TREE_TYPE (vec)),
6984 TREE_TYPE (gimple_assign_rhs2 (use_ass)))
6985 || !constant_multiple_p
6986 (tree_to_poly_uint64 (gimple_assign_rhs3 (use_ass)),
6987 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (TREE_TYPE (vec)))),
6988 this_lane))
6989 return false;
6990 return true;
6993 /* Find any vectorizable constructors and add them to the grouped_store
6994 array. */
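/* A vectorizable constructor is a statement like
     v_5 = {_1, _2, _3, _4};
   where the vector type has as many lanes as the constructor has elements
   and each element is an SSA name defined inside the region.  Chains of
   BIT_INSERT_EXPRs filling all lanes of a vector and associatable scalar
   reduction chains are detected here as well and recorded as SLP roots.  */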
6996 static void
6997 vect_slp_check_for_constructors (bb_vec_info bb_vinfo)
6999 for (unsigned i = 0; i < bb_vinfo->bbs.length (); ++i)
7000 for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[i]);
7001 !gsi_end_p (gsi); gsi_next (&gsi))
7003 gassign *assign = dyn_cast<gassign *> (gsi_stmt (gsi));
7004 if (!assign)
7005 continue;
7007 tree rhs = gimple_assign_rhs1 (assign);
7008 enum tree_code code = gimple_assign_rhs_code (assign);
7009 use_operand_p use_p;
7010 gimple *use_stmt;
7011 if (code == CONSTRUCTOR)
7013 if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
7014 || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
7015 CONSTRUCTOR_NELTS (rhs))
7016 || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
7017 || uniform_vector_p (rhs))
7018 continue;
7020 unsigned j;
7021 tree val;
7022 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
7023 if (TREE_CODE (val) != SSA_NAME
7024 || !bb_vinfo->lookup_def (val))
7025 break;
7026 if (j != CONSTRUCTOR_NELTS (rhs))
7027 continue;
7029 stmt_vec_info stmt_info = bb_vinfo->lookup_stmt (assign);
7030 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
7032 else if (code == BIT_INSERT_EXPR
7033 && VECTOR_TYPE_P (TREE_TYPE (rhs))
7034 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).is_constant ()
7035 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).to_constant () > 1
7036 && integer_zerop (gimple_assign_rhs3 (assign))
7037 && useless_type_conversion_p
7038 (TREE_TYPE (TREE_TYPE (rhs)),
7039 TREE_TYPE (gimple_assign_rhs2 (assign)))
7040 && bb_vinfo->lookup_def (gimple_assign_rhs2 (assign)))
7042 	    /* We start matching at an insert into lane zero, but since the
7043 	       inserts need not be ordered we have to search both
7044 	       the def and the use chains.  */
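	    /* The chain being matched looks roughly like
		 v_1 = BIT_INSERT_EXPR <v_0, a_0, 0>;
		 v_2 = BIT_INSERT_EXPR <v_1, a_1, 32>;
		 ...
	       with one insert per lane (here for 32-bit elements), though
	       the lanes may be written in any order.  */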
7045 tree vectype = TREE_TYPE (rhs);
7046 unsigned nlanes = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
7047 auto_vec<std::pair<unsigned, tree> > lane_defs (nlanes);
7048 auto_sbitmap lanes (nlanes);
7049 bitmap_clear (lanes);
7050 bitmap_set_bit (lanes, 0);
7051 tree def = gimple_assign_lhs (assign);
7052 lane_defs.quick_push
7053 (std::make_pair (0, gimple_assign_rhs2 (assign)));
7054 unsigned lanes_found = 1;
7055 /* Start with the use chains, the last stmt will be the root. */
7056 stmt_vec_info last = bb_vinfo->lookup_stmt (assign);
7057 vec<stmt_vec_info> roots = vNULL;
7058 roots.safe_push (last);
7061 use_operand_p use_p;
7062 gimple *use_stmt;
7063 if (!single_imm_use (def, &use_p, &use_stmt))
7064 break;
7065 unsigned this_lane;
7066 if (!bb_vinfo->lookup_stmt (use_stmt)
7067 || !vect_slp_is_lane_insert (use_stmt, def, &this_lane)
7068 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (use_stmt)))
7069 break;
7070 if (bitmap_bit_p (lanes, this_lane))
7071 break;
7072 lanes_found++;
7073 bitmap_set_bit (lanes, this_lane);
7074 gassign *use_ass = as_a <gassign *> (use_stmt);
7075 lane_defs.quick_push (std::make_pair
7076 (this_lane, gimple_assign_rhs2 (use_ass)));
7077 last = bb_vinfo->lookup_stmt (use_ass);
7078 roots.safe_push (last);
7079 def = gimple_assign_lhs (use_ass);
7081 while (lanes_found < nlanes);
7082 if (roots.length () > 1)
7083 std::swap(roots[0], roots[roots.length () - 1]);
7084 if (lanes_found < nlanes)
7086 /* Now search the def chain. */
7087 def = gimple_assign_rhs1 (assign);
7090 if (TREE_CODE (def) != SSA_NAME
7091 || !has_single_use (def))
7092 break;
7093 gimple *def_stmt = SSA_NAME_DEF_STMT (def);
7094 unsigned this_lane;
7095 if (!bb_vinfo->lookup_stmt (def_stmt)
7096 || !vect_slp_is_lane_insert (def_stmt,
7097 NULL_TREE, &this_lane)
7098 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (def_stmt)))
7099 break;
7100 if (bitmap_bit_p (lanes, this_lane))
7101 break;
7102 lanes_found++;
7103 bitmap_set_bit (lanes, this_lane);
7104 lane_defs.quick_push (std::make_pair
7105 (this_lane,
7106 gimple_assign_rhs2 (def_stmt)));
7107 roots.safe_push (bb_vinfo->lookup_stmt (def_stmt));
7108 def = gimple_assign_rhs1 (def_stmt);
7110 while (lanes_found < nlanes);
7112 if (lanes_found == nlanes)
7114 	      /* Sort lane_defs by lane index and register the root.  */
7115 lane_defs.qsort (vld_cmp);
7116 vec<stmt_vec_info> stmts;
7117 stmts.create (nlanes);
7118 for (unsigned i = 0; i < nlanes; ++i)
7119 stmts.quick_push (bb_vinfo->lookup_def (lane_defs[i].second));
7120 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
7121 stmts, roots));
7123 else
7124 roots.release ();
7126 else if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
7127 && (associative_tree_code (code) || code == MINUS_EXPR)
7128 /* ??? The flag_associative_math and TYPE_OVERFLOW_WRAPS
7129 checks pessimize a two-element reduction. PR54400.
7130 ??? In-order reduction could be handled if we only
7131 traverse one operand chain in vect_slp_linearize_chain. */
7132 && ((FLOAT_TYPE_P (TREE_TYPE (rhs)) && flag_associative_math)
7133 || (INTEGRAL_TYPE_P (TREE_TYPE (rhs))
7134 && TYPE_OVERFLOW_WRAPS (TREE_TYPE (rhs))))
7135 /* Ops with constants at the tail can be stripped here. */
7136 && TREE_CODE (rhs) == SSA_NAME
7137 && TREE_CODE (gimple_assign_rhs2 (assign)) == SSA_NAME
7138 /* Should be the chain end. */
7139 && (!single_imm_use (gimple_assign_lhs (assign),
7140 &use_p, &use_stmt)
7141 || !is_gimple_assign (use_stmt)
7142 || (gimple_assign_rhs_code (use_stmt) != code
7143 && ((code != PLUS_EXPR && code != MINUS_EXPR)
7144 || (gimple_assign_rhs_code (use_stmt)
7145 != (code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR))))))
7147 /* We start the match at the end of a possible association
7148 chain. */
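	  /* E.g. for
	       x_1 = a_1 + b_1;
	       x_2 = x_1 + c_1;
	       x_3 = x_2 + d_1;
	     the match starts at the chain end x_3 and the linearized chain
	     consists of the leaf operands a_1, b_1, c_1 and d_1, whose
	     definitions become the scalar lanes of a BB reduction.  */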
7149 auto_vec<chain_op_t> chain;
7150 auto_vec<std::pair<tree_code, gimple *> > worklist;
7151 auto_vec<gimple *> chain_stmts;
7152 gimple *code_stmt = NULL, *alt_code_stmt = NULL;
7153 if (code == MINUS_EXPR)
7154 code = PLUS_EXPR;
7155 internal_fn reduc_fn;
7156 if (!reduction_fn_for_scalar_code (code, &reduc_fn)
7157 || reduc_fn == IFN_LAST)
7158 continue;
7159 vect_slp_linearize_chain (bb_vinfo, worklist, chain, code, assign,
7160 /* ??? */
7161 code_stmt, alt_code_stmt, &chain_stmts);
7162 if (chain.length () > 1)
7164 /* Sort the chain according to def_type and operation. */
7165 chain.sort (dt_sort_cmp, bb_vinfo);
7166 /* ??? Now we'd want to strip externals and constants
7167 but record those to be handled in the epilogue. */
7168 /* ??? For now do not allow mixing ops or externs/constants. */
7169 bool invalid = false;
7170 for (unsigned i = 0; i < chain.length (); ++i)
7171 if (chain[i].dt != vect_internal_def
7172 || chain[i].code != code)
7173 invalid = true;
7174 if (!invalid)
7176 vec<stmt_vec_info> stmts;
7177 stmts.create (chain.length ());
7178 for (unsigned i = 0; i < chain.length (); ++i)
7179 stmts.quick_push (bb_vinfo->lookup_def (chain[i].op));
7180 vec<stmt_vec_info> roots;
7181 roots.create (chain_stmts.length ());
7182 for (unsigned i = 0; i < chain_stmts.length (); ++i)
7183 roots.quick_push (bb_vinfo->lookup_stmt (chain_stmts[i]));
7184 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_bb_reduc,
7185 stmts, roots));
7192 /* Walk the grouped store chains and replace entries with their
7193 pattern variant if any. */
7195 static void
7196 vect_fixup_store_groups_with_patterns (vec_info *vinfo)
7198 stmt_vec_info first_element;
7199 unsigned i;
7201 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
7203 /* We also have CTORs in this array. */
7204 if (!STMT_VINFO_GROUPED_ACCESS (first_element))
7205 continue;
7206 if (STMT_VINFO_IN_PATTERN_P (first_element))
7208 stmt_vec_info orig = first_element;
7209 first_element = STMT_VINFO_RELATED_STMT (first_element);
7210 DR_GROUP_FIRST_ELEMENT (first_element) = first_element;
7211 DR_GROUP_SIZE (first_element) = DR_GROUP_SIZE (orig);
7212 DR_GROUP_GAP (first_element) = DR_GROUP_GAP (orig);
7213 DR_GROUP_NEXT_ELEMENT (first_element) = DR_GROUP_NEXT_ELEMENT (orig);
7214 vinfo->grouped_stores[i] = first_element;
7216 stmt_vec_info prev = first_element;
7217 while (DR_GROUP_NEXT_ELEMENT (prev))
7219 stmt_vec_info elt = DR_GROUP_NEXT_ELEMENT (prev);
7220 if (STMT_VINFO_IN_PATTERN_P (elt))
7222 stmt_vec_info orig = elt;
7223 elt = STMT_VINFO_RELATED_STMT (elt);
7224 DR_GROUP_NEXT_ELEMENT (prev) = elt;
7225 DR_GROUP_GAP (elt) = DR_GROUP_GAP (orig);
7226 DR_GROUP_NEXT_ELEMENT (elt) = DR_GROUP_NEXT_ELEMENT (orig);
7228 DR_GROUP_FIRST_ELEMENT (elt) = first_element;
7229 prev = elt;
7234 /* Check if the region described by BB_VINFO can be vectorized, returning
7235 true if so. When returning false, set FATAL to true if the same failure
7236 would prevent vectorization at other vector sizes, false if it is still
7237 worth trying other sizes. N_STMTS is the number of statements in the
7238 region. */
7240 static bool
7241 vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
7242 vec<int> *dataref_groups)
7244 DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
7246 slp_instance instance;
7247 int i;
7248 poly_uint64 min_vf = 2;
7250 /* The first group of checks is independent of the vector size. */
7251 fatal = true;
7253 /* Analyze the data references. */
7255 if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
7257 if (dump_enabled_p ())
7258 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7259 "not vectorized: unhandled data-ref in basic "
7260 "block.\n");
7261 return false;
7264 if (!vect_analyze_data_ref_accesses (bb_vinfo, dataref_groups))
7266 if (dump_enabled_p ())
7267 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7268 "not vectorized: unhandled data access in "
7269 "basic block.\n");
7270 return false;
7273 vect_slp_check_for_constructors (bb_vinfo);
7275 /* If there are no grouped stores and no constructors in the region
7276 there is no need to continue with pattern recog as vect_analyze_slp
7277 will fail anyway. */
7278 if (bb_vinfo->grouped_stores.is_empty ()
7279 && bb_vinfo->roots.is_empty ())
7281 if (dump_enabled_p ())
7282 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7283 "not vectorized: no grouped stores in "
7284 "basic block.\n");
7285 return false;
7288   /* The rest of the analysis below depends on the vector size in some way.  */
7289 fatal = false;
7291 vect_pattern_recog (bb_vinfo);
7293 /* Update store groups from pattern processing. */
7294 vect_fixup_store_groups_with_patterns (bb_vinfo);
7296 /* Check the SLP opportunities in the basic block, analyze and build SLP
7297 trees. */
7298 if (!vect_analyze_slp (bb_vinfo, n_stmts))
7300 if (dump_enabled_p ())
7302 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7303 "Failed to SLP the basic block.\n");
7304 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7305 "not vectorized: failed to find SLP opportunities "
7306 "in basic block.\n");
7308 return false;
7311 /* Optimize permutations. */
7312 vect_optimize_slp (bb_vinfo);
7314 /* Gather the loads reachable from the SLP graph entries. */
7315 vect_gather_slp_loads (bb_vinfo);
7317 vect_record_base_alignments (bb_vinfo);
7319 /* Analyze and verify the alignment of data references and the
7320 dependence in the SLP instances. */
7321 for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
7323 vect_location = instance->location ();
7324 if (! vect_slp_analyze_instance_alignment (bb_vinfo, instance)
7325 || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
7327 slp_tree node = SLP_INSTANCE_TREE (instance);
7328 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
7329 if (dump_enabled_p ())
7330 dump_printf_loc (MSG_NOTE, vect_location,
7331 "removing SLP instance operations starting from: %G",
7332 stmt_info->stmt);
7333 vect_free_slp_instance (instance);
7334 BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
7335 continue;
7338 /* Mark all the statements that we want to vectorize as pure SLP and
7339 relevant. */
7340 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
7341 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
7342 unsigned j;
7343 stmt_vec_info root;
7344 /* Likewise consider instance root stmts as vectorized. */
7345 FOR_EACH_VEC_ELT (SLP_INSTANCE_ROOT_STMTS (instance), j, root)
7346 STMT_SLP_TYPE (root) = pure_slp;
7348 i++;
7350 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
7351 return false;
7353 if (!vect_slp_analyze_operations (bb_vinfo))
7355 if (dump_enabled_p ())
7356 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7357 "not vectorized: bad operation in basic block.\n");
7358 return false;
7361 vect_bb_partition_graph (bb_vinfo);
7363 return true;
7366 /* Subroutine of vect_slp_bb. Try to vectorize the statements for all
7367 basic blocks in BBS, returning true on success.
7368 The region has N_STMTS statements and has the datarefs given by DATAREFS. */
7370 static bool
7371 vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
7372 vec<int> *dataref_groups, unsigned int n_stmts,
7373 loop_p orig_loop)
7375 bb_vec_info bb_vinfo;
7376 auto_vector_modes vector_modes;
7378 /* Autodetect first vector size we try. */
7379 machine_mode next_vector_mode = VOIDmode;
7380 targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
7381 unsigned int mode_i = 0;
7383 vec_info_shared shared;
7385 machine_mode autodetected_vector_mode = VOIDmode;
7386 while (1)
7388 bool vectorized = false;
7389 bool fatal = false;
7390 bb_vinfo = new _bb_vec_info (bbs, &shared);
7392 bool first_time_p = shared.datarefs.is_empty ();
7393 BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
7394 if (first_time_p)
7395 bb_vinfo->shared->save_datarefs ();
7396 else
7397 bb_vinfo->shared->check_datarefs ();
7398 bb_vinfo->vector_mode = next_vector_mode;
7400 if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups))
7402 if (dump_enabled_p ())
7404 dump_printf_loc (MSG_NOTE, vect_location,
7405 "***** Analysis succeeded with vector mode"
7406 " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
7407 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
7410 bb_vinfo->shared->check_datarefs ();
7412 auto_vec<slp_instance> profitable_subgraphs;
7413 for (slp_instance instance : BB_VINFO_SLP_INSTANCES (bb_vinfo))
7415 if (instance->subgraph_entries.is_empty ())
7416 continue;
7418 vect_location = instance->location ();
7419 if (!unlimited_cost_model (NULL)
7420 && !vect_bb_vectorization_profitable_p
7421 (bb_vinfo, instance->subgraph_entries, orig_loop))
7423 if (dump_enabled_p ())
7424 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7425 "not vectorized: vectorization is not "
7426 "profitable.\n");
7427 continue;
7430 if (!dbg_cnt (vect_slp))
7431 continue;
7433 profitable_subgraphs.safe_push (instance);
7436 /* When we're vectorizing an if-converted loop body make sure
7437 we vectorized all if-converted code. */
7438 if (!profitable_subgraphs.is_empty ()
7439 && orig_loop)
7441 gcc_assert (bb_vinfo->bbs.length () == 1);
7442 for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[0]);
7443 !gsi_end_p (gsi); gsi_next (&gsi))
7445 /* The costing above left us with DCEable vectorized scalar
7446 stmts having the visited flag set on profitable
7447 subgraphs. Do the delayed clearing of the flag here. */
7448 if (gimple_visited_p (gsi_stmt (gsi)))
7450 gimple_set_visited (gsi_stmt (gsi), false);
7451 continue;
7453 if (flag_vect_cost_model == VECT_COST_MODEL_UNLIMITED)
7454 continue;
7456 if (gassign *ass = dyn_cast <gassign *> (gsi_stmt (gsi)))
7457 if (gimple_assign_rhs_code (ass) == COND_EXPR)
7459 if (!profitable_subgraphs.is_empty ()
7460 && dump_enabled_p ())
7461 dump_printf_loc (MSG_NOTE, vect_location,
7462 "not profitable because of "
7463 "unprofitable if-converted scalar "
7464 "code\n");
7465 profitable_subgraphs.truncate (0);
7470 /* Finally schedule the profitable subgraphs. */
7471 for (slp_instance instance : profitable_subgraphs)
7473 if (!vectorized && dump_enabled_p ())
7474 dump_printf_loc (MSG_NOTE, vect_location,
7475 "Basic block will be vectorized "
7476 "using SLP\n");
7477 vectorized = true;
7479 vect_schedule_slp (bb_vinfo, instance->subgraph_entries);
7481 unsigned HOST_WIDE_INT bytes;
7482 if (dump_enabled_p ())
7484 if (GET_MODE_SIZE
7485 (bb_vinfo->vector_mode).is_constant (&bytes))
7486 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
7487 "basic block part vectorized using %wu "
7488 "byte vectors\n", bytes);
7489 else
7490 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
7491 "basic block part vectorized using "
7492 "variable length vectors\n");
7496 else
7498 if (dump_enabled_p ())
7499 dump_printf_loc (MSG_NOTE, vect_location,
7500 "***** Analysis failed with vector mode %s\n",
7501 GET_MODE_NAME (bb_vinfo->vector_mode));
7504 if (mode_i == 0)
7505 autodetected_vector_mode = bb_vinfo->vector_mode;
7507 if (!fatal)
7508 while (mode_i < vector_modes.length ()
7509 && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
7511 if (dump_enabled_p ())
7512 dump_printf_loc (MSG_NOTE, vect_location,
7513 "***** The result for vector mode %s would"
7514 " be the same\n",
7515 GET_MODE_NAME (vector_modes[mode_i]));
7516 mode_i += 1;
7519 delete bb_vinfo;
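      /* Skip following modes that describe the same vectors as the mode we
	 autodetected: when each mode is the related vector mode of the
	 other's element mode, re-analyzing could only repeat the result.  */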
7521 if (mode_i < vector_modes.length ()
7522 && VECTOR_MODE_P (autodetected_vector_mode)
7523 && (related_vector_mode (vector_modes[mode_i],
7524 GET_MODE_INNER (autodetected_vector_mode))
7525 == autodetected_vector_mode)
7526 && (related_vector_mode (autodetected_vector_mode,
7527 GET_MODE_INNER (vector_modes[mode_i]))
7528 == vector_modes[mode_i]))
7530 if (dump_enabled_p ())
7531 dump_printf_loc (MSG_NOTE, vect_location,
7532 "***** Skipping vector mode %s, which would"
7533 " repeat the analysis for %s\n",
7534 GET_MODE_NAME (vector_modes[mode_i]),
7535 GET_MODE_NAME (autodetected_vector_mode));
7536 mode_i += 1;
7539 if (vectorized
7540 || mode_i == vector_modes.length ()
7541 || autodetected_vector_mode == VOIDmode
7542 /* If vect_slp_analyze_bb_1 signaled that analysis for all
7543 vector sizes will fail do not bother iterating. */
7544 || fatal)
7545 return vectorized;
7547 /* Try the next biggest vector size. */
7548 next_vector_mode = vector_modes[mode_i++];
7549 if (dump_enabled_p ())
7550 dump_printf_loc (MSG_NOTE, vect_location,
7551 "***** Re-trying analysis with vector mode %s\n",
7552 GET_MODE_NAME (next_vector_mode));
7557 /* Main entry for the BB vectorizer. Analyze and transform BBS, returns
7558 true if anything in the basic-block was vectorized. */
7560 static bool
7561 vect_slp_bbs (const vec<basic_block> &bbs, loop_p orig_loop)
7563 vec<data_reference_p> datarefs = vNULL;
7564 auto_vec<int> dataref_groups;
7565 int insns = 0;
7566 int current_group = 0;
7568 for (unsigned i = 0; i < bbs.length (); i++)
7570 basic_block bb = bbs[i];
7571 for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
7572 gsi_next (&gsi))
7574 gimple *stmt = gsi_stmt (gsi);
7575 if (is_gimple_debug (stmt))
7576 continue;
7578 insns++;
7580 if (gimple_location (stmt) != UNKNOWN_LOCATION)
7581 vect_location = stmt;
7583 if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs,
7584 &dataref_groups, current_group))
7585 ++current_group;
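	  /* If the data references of STMT could not be analyzed and recorded,
	     start a new group so that accesses are never grouped across it.  */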
7587 /* New BBs always start a new DR group. */
7588 ++current_group;
7591 return vect_slp_region (bbs, datarefs, &dataref_groups, insns, orig_loop);
7594 /* Special entry for the BB vectorizer. Analyze and transform a single
7595 if-converted BB with ORIG_LOOPs body being the not if-converted
7596 representation. Returns true if anything in the basic-block was
7597 vectorized. */
7599 bool
7600 vect_slp_if_converted_bb (basic_block bb, loop_p orig_loop)
7602 auto_vec<basic_block> bbs;
7603 bbs.safe_push (bb);
7604 return vect_slp_bbs (bbs, orig_loop);
7607 /* Main entry for the BB vectorizer.  Analyze and transform all basic blocks
7608    in FUN, returning true if anything was vectorized.  */
7610 bool
7611 vect_slp_function (function *fun)
7613 bool r = false;
7614 int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (fun));
7615 unsigned n = pre_and_rev_post_order_compute_fn (fun, NULL, rpo, false);
7617 /* For the moment split the function into pieces to avoid making
7618 the iteration on the vector mode moot. Split at points we know
7619 to not handle well, which are CFG merges (SLP discovery doesn't
7620 handle non-loop-header PHIs) and loop exits. Since pattern
7621 recog requires reverse iteration to visit uses before defs
7622 simply chop RPO into pieces. */
7623 auto_vec<basic_block> bbs;
7624 for (unsigned i = 0; i < n; i++)
7626 basic_block bb = BASIC_BLOCK_FOR_FN (fun, rpo[i]);
7627 bool split = false;
7629 /* Split when a BB is not dominated by the first block. */
7630 if (!bbs.is_empty ()
7631 && !dominated_by_p (CDI_DOMINATORS, bb, bbs[0]))
7633 if (dump_enabled_p ())
7634 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7635 "splitting region at dominance boundary bb%d\n",
7636 bb->index);
7637 split = true;
7639 /* Split when the loop determined by the first block
7640 is exited. This is because we eventually insert
7641 invariants at region begin. */
7642 else if (!bbs.is_empty ()
7643 && bbs[0]->loop_father != bb->loop_father
7644 && !flow_loop_nested_p (bbs[0]->loop_father, bb->loop_father))
7646 if (dump_enabled_p ())
7647 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7648 "splitting region at loop %d exit at bb%d\n",
7649 bbs[0]->loop_father->num, bb->index);
7650 split = true;
7653 if (split && !bbs.is_empty ())
7655 r |= vect_slp_bbs (bbs, NULL);
7656 bbs.truncate (0);
7657 bbs.quick_push (bb);
7659 else
7660 bbs.safe_push (bb);
7662 	  /* When the stmt ending this block defines a value, inserting a
7663 	     vector containing its definition after the stmt would require
7664 	     inserting on edges.  Avoid this for now.  */
7665 if (gimple *last = last_stmt (bb))
7666 if (gimple_get_lhs (last)
7667 && is_ctrl_altering_stmt (last))
7669 if (dump_enabled_p ())
7670 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7671 "splitting region at control altering "
7672 "definition %G", last);
7673 r |= vect_slp_bbs (bbs, NULL);
7674 bbs.truncate (0);
7678 if (!bbs.is_empty ())
7679 r |= vect_slp_bbs (bbs, NULL);
7681 free (rpo);
7683 return r;
7686 /* Build a variable-length vector in which the elements in ELTS are repeated
7687    to fill NRESULTS vectors of type VECTOR_TYPE.  Store the vectors in
7688 RESULTS and add any new instructions to SEQ.
7690 The approach we use is:
7692 (1) Find a vector mode VM with integer elements of mode IM.
7694 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
7695 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
7696 from small vectors to IM.
7698 (3) Duplicate each ELTS'[I] into a vector of mode VM.
7700 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
7701 correct byte contents.
7703 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
7705 We try to find the largest IM for which this sequence works, in order
7706 to cut down on the number of interleaves. */
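/* As a minimal illustration, assume NELTS == 2 and that the chosen IM is
   exactly twice the element size (the NVECTORS == 1 case): step (2)
   view-converts the two elements { A, B } into a single IM value AB,
   step (3) splats it to { AB, AB, ... }, no interleaving is required and
   step (5) view-converts the result back, giving { A, B, A, B, ... }.  */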
7708 void
7709 duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
7710 const vec<tree> &elts, unsigned int nresults,
7711 vec<tree> &results)
7713 unsigned int nelts = elts.length ();
7714 tree element_type = TREE_TYPE (vector_type);
7716 /* (1) Find a vector mode VM with integer elements of mode IM. */
7717 unsigned int nvectors = 1;
7718 tree new_vector_type;
7719 tree permutes[2];
7720 if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
7721 &nvectors, &new_vector_type,
7722 permutes))
7723 gcc_unreachable ();
7725 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
7726 unsigned int partial_nelts = nelts / nvectors;
7727 tree partial_vector_type = build_vector_type (element_type, partial_nelts);
7729 tree_vector_builder partial_elts;
7730 auto_vec<tree, 32> pieces (nvectors * 2);
7731 pieces.quick_grow_cleared (nvectors * 2);
7732 for (unsigned int i = 0; i < nvectors; ++i)
7734 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
7735 ELTS' has mode IM. */
7736 partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
7737 for (unsigned int j = 0; j < partial_nelts; ++j)
7738 partial_elts.quick_push (elts[i * partial_nelts + j]);
7739 tree t = gimple_build_vector (seq, &partial_elts);
7740 t = gimple_build (seq, VIEW_CONVERT_EXPR,
7741 TREE_TYPE (new_vector_type), t);
7743 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
7744 pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
7747 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
7748 correct byte contents.
7750 Conceptually, we need to repeat the following operation log2(nvectors)
7751 times, where hi_start = nvectors / 2:
7753 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
7754 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
7756 However, if each input repeats every N elements and the VF is
7757 a multiple of N * 2, the HI result is the same as the LO result.
7758 This will be true for the first N1 iterations of the outer loop,
7759 followed by N2 iterations for which both the LO and HI results
7760 are needed. I.e.:
7762 N1 + N2 = log2(nvectors)
7764 Each "N1 iteration" doubles the number of redundant vectors and the
7765 effect of the process as a whole is to have a sequence of nvectors/2**N1
7766 vectors that repeats 2**N1 times. Rather than generate these redundant
7767 vectors, we halve the number of vectors for each N1 iteration. */
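  /* For example, with NVECTORS == 2 the single round interleaves pieces[0]
     and pieces[1] using the low-part permute; because both inputs are
     splats, the high-part interleave is only emitted when the vector length
     is not known to be even, as it would otherwise just repeat the low-part
     result.  */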
7768 unsigned int in_start = 0;
7769 unsigned int out_start = nvectors;
7770 unsigned int new_nvectors = nvectors;
7771 for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
7773 unsigned int hi_start = new_nvectors / 2;
7774 unsigned int out_i = 0;
7775 for (unsigned int in_i = 0; in_i < new_nvectors; ++in_i)
7777 if ((in_i & 1) != 0
7778 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
7779 2 * in_repeat))
7780 continue;
7782 tree output = make_ssa_name (new_vector_type);
7783 tree input1 = pieces[in_start + (in_i / 2)];
7784 tree input2 = pieces[in_start + (in_i / 2) + hi_start];
7785 gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
7786 input1, input2,
7787 permutes[in_i & 1]);
7788 gimple_seq_add_stmt (seq, stmt);
7789 pieces[out_start + out_i] = output;
7790 out_i += 1;
7792 std::swap (in_start, out_start);
7793 new_nvectors = out_i;
7796 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
7797 results.reserve (nresults);
7798 for (unsigned int i = 0; i < nresults; ++i)
7799 if (i < new_nvectors)
7800 results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
7801 pieces[in_start + i]));
7802 else
7803 results.quick_push (results[i - new_nvectors]);
7807 /* For constant and loop invariant defs in OP_NODE this function creates
7808 vector defs that will be used in the vectorized stmts and stores them
7809 to SLP_TREE_VEC_DEFS of OP_NODE. */
7811 static void
7812 vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
7814 unsigned HOST_WIDE_INT nunits;
7815 tree vec_cst;
7816 unsigned j, number_of_places_left_in_vector;
7817 tree vector_type;
7818 tree vop;
7819 int group_size = op_node->ops.length ();
7820 unsigned int vec_num, i;
7821 unsigned number_of_copies = 1;
7822 bool constant_p;
7823 gimple_seq ctor_seq = NULL;
7824 auto_vec<tree, 16> permute_results;
7826 /* We always want SLP_TREE_VECTYPE (op_node) here correctly set. */
7827 vector_type = SLP_TREE_VECTYPE (op_node);
7829 unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node);
7830 SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
7831 auto_vec<tree> voprnds (number_of_vectors);
7833 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
7834 created vectors. It is greater than 1 if unrolling is performed.
7836 For example, we have two scalar operands, s1 and s2 (e.g., group of
7837 strided accesses of size two), while NUNITS is four (i.e., four scalars
7838 of this type can be packed in a vector). The output vector will contain
7839 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
7840 will be 2).
7842 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
7843 containing the operands.
7845 For example, NUNITS is four as before, and the group size is 8
7846 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
7847 {s5, s6, s7, s8}. */
7849 /* When using duplicate_and_interleave, we just need one element for
7850 each scalar statement. */
7851 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
7852 nunits = group_size;
7854 number_of_copies = nunits * number_of_vectors / group_size;
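  /* E.g. for the {s1, s2} group above with NUNITS == 4 and a single vector
     to create this gives NUMBER_OF_COPIES == 4 * 1 / 2 == 2, matching the
     {s1, s2, s1, s2} result.  */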
7856 number_of_places_left_in_vector = nunits;
7857 constant_p = true;
7858 tree_vector_builder elts (vector_type, nunits, 1);
7859 elts.quick_grow (nunits);
7860 stmt_vec_info insert_after = NULL;
7861 for (j = 0; j < number_of_copies; j++)
7863 tree op;
7864 for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
7866 /* Create 'vect_ = {op0,op1,...,opn}'. */
7867 number_of_places_left_in_vector--;
7868 tree orig_op = op;
7869 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
7871 if (CONSTANT_CLASS_P (op))
7873 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
7875 /* Can't use VIEW_CONVERT_EXPR for booleans because
7876 of possibly different sizes of scalar value and
7877 vector element. */
7878 if (integer_zerop (op))
7879 op = build_int_cst (TREE_TYPE (vector_type), 0);
7880 else if (integer_onep (op))
7881 op = build_all_ones_cst (TREE_TYPE (vector_type));
7882 else
7883 gcc_unreachable ();
7885 else
7886 op = fold_unary (VIEW_CONVERT_EXPR,
7887 TREE_TYPE (vector_type), op);
7888 gcc_assert (op && CONSTANT_CLASS_P (op));
7890 else
7892 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
7893 gimple *init_stmt;
7894 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
7896 tree true_val
7897 = build_all_ones_cst (TREE_TYPE (vector_type));
7898 tree false_val
7899 = build_zero_cst (TREE_TYPE (vector_type));
7900 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
7901 init_stmt = gimple_build_assign (new_temp, COND_EXPR,
7902 op, true_val,
7903 false_val);
7905 else
7907 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
7908 op);
7909 init_stmt
7910 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
7911 op);
7913 gimple_seq_add_stmt (&ctor_seq, init_stmt);
7914 op = new_temp;
7917 elts[number_of_places_left_in_vector] = op;
7918 if (!CONSTANT_CLASS_P (op))
7919 constant_p = false;
7920 /* For BB vectorization we have to compute an insert location
7921 when a def is inside the analyzed region since we cannot
7922 simply insert at the BB start in this case. */
7923 stmt_vec_info opdef;
7924 if (TREE_CODE (orig_op) == SSA_NAME
7925 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
7926 && is_a <bb_vec_info> (vinfo)
7927 && (opdef = vinfo->lookup_def (orig_op)))
7929 if (!insert_after)
7930 insert_after = opdef;
7931 else
7932 insert_after = get_later_stmt (insert_after, opdef);
7935 if (number_of_places_left_in_vector == 0)
7937 if (constant_p
7938 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
7939 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
7940 vec_cst = gimple_build_vector (&ctor_seq, &elts);
7941 else
7943 if (permute_results.is_empty ())
7944 duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
7945 elts, number_of_vectors,
7946 permute_results);
7947 vec_cst = permute_results[number_of_vectors - j - 1];
7949 if (!gimple_seq_empty_p (ctor_seq))
7951 if (insert_after)
7953 gimple_stmt_iterator gsi;
7954 if (gimple_code (insert_after->stmt) == GIMPLE_PHI)
7956 gsi = gsi_after_labels (gimple_bb (insert_after->stmt));
7957 gsi_insert_seq_before (&gsi, ctor_seq,
7958 GSI_CONTINUE_LINKING);
7960 else if (!stmt_ends_bb_p (insert_after->stmt))
7962 gsi = gsi_for_stmt (insert_after->stmt);
7963 gsi_insert_seq_after (&gsi, ctor_seq,
7964 GSI_CONTINUE_LINKING);
7966 else
7968 		      /* When we want to insert after a def whose
7969 			 defining stmt throws, insert on the
7970 			 fallthru edge instead.  */
7971 edge e = find_fallthru_edge
7972 (gimple_bb (insert_after->stmt)->succs);
7973 basic_block new_bb
7974 = gsi_insert_seq_on_edge_immediate (e, ctor_seq);
7975 gcc_assert (!new_bb);
7978 else
7979 vinfo->insert_seq_on_entry (NULL, ctor_seq);
7980 ctor_seq = NULL;
7982 voprnds.quick_push (vec_cst);
7983 insert_after = NULL;
7984 number_of_places_left_in_vector = nunits;
7985 constant_p = true;
7986 elts.new_vector (vector_type, nunits, 1);
7987 elts.quick_grow (nunits);
7992   /* Since the vectors were created in reverse order, push them in reverse
7993      so the defs end up in the original order.  */
7994 vec_num = voprnds.length ();
7995 for (j = vec_num; j != 0; j--)
7997 vop = voprnds[j - 1];
7998 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
8001   /* In case VF is greater than the unrolling factor needed for the SLP group
8002      of stmts, the NUMBER_OF_VECTORS to create is greater than the number of
8003      distinct vectors built above, and hence we have to replicate those until
8004      NUMBER_OF_VECTORS defs exist.  */
8005 while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ())
8006 for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (i, &vop) && i < vec_num;
8007 i++)
8008 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
8011 /* Get the Ith vectorized definition from SLP_NODE. */
8013 tree
8014 vect_get_slp_vect_def (slp_tree slp_node, unsigned i)
8016 if (SLP_TREE_VEC_STMTS (slp_node).exists ())
8017 return gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[i]);
8018 else
8019 return SLP_TREE_VEC_DEFS (slp_node)[i];
8022 /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS. */
8024 void
8025 vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs)
8027 vec_defs->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
8028 if (SLP_TREE_DEF_TYPE (slp_node) == vect_internal_def)
8030 unsigned j;
8031 gimple *vec_def_stmt;
8032 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), j, vec_def_stmt)
8033 vec_defs->quick_push (gimple_get_lhs (vec_def_stmt));
8035 else
8036 vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node));
8039 /* Get N vectorized definitions for SLP_NODE. */
8041 void
8042 vect_get_slp_defs (vec_info *,
8043 slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
8045 if (n == -1U)
8046 n = SLP_TREE_CHILDREN (slp_node).length ();
8048 for (unsigned i = 0; i < n; ++i)
8050 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
8051 vec<tree> vec_defs = vNULL;
8052 vect_get_slp_defs (child, &vec_defs);
8053 vec_oprnds->quick_push (vec_defs);
8057 /* A subroutine of vect_transform_slp_perm_load with two extra arguments:
8058 - PERM gives the permutation that the caller wants to use for NODE,
8059 which might be different from SLP_LOAD_PERMUTATION.
8060 - DUMP_P controls whether the function dumps information. */
8062 static bool
8063 vect_transform_slp_perm_load_1 (vec_info *vinfo, slp_tree node,
8064 load_permutation_t &perm,
8065 const vec<tree> &dr_chain,
8066 gimple_stmt_iterator *gsi, poly_uint64 vf,
8067 bool analyze_only, bool dump_p,
8068 unsigned *n_perms, unsigned int *n_loads,
8069 bool dce_chain)
8071 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
8072 int vec_index = 0;
8073 tree vectype = SLP_TREE_VECTYPE (node);
8074 unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
8075 unsigned int mask_element;
8076 machine_mode mode;
8078 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
8079 return false;
8081 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
8083 mode = TYPE_MODE (vectype);
8084 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8086 /* Initialize the vect stmts of NODE to properly insert the generated
8087 stmts later. */
8088 if (! analyze_only)
8089 for (unsigned i = SLP_TREE_VEC_STMTS (node).length ();
8090 i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
8091 SLP_TREE_VEC_STMTS (node).quick_push (NULL);
8093 /* Generate permutation masks for every NODE. Number of masks for each NODE
8094 is equal to GROUP_SIZE.
8095 E.g., we have a group of three nodes with three loads from the same
8096 location in each node, and the vector size is 4. I.e., we have a
8097 a0b0c0a1b1c1... sequence and we need to create the following vectors:
8098 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
8099 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
8102 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
8103 The last mask is illegal since we assume two operands for the permute
8104 operation, and the mask element values can't be outside that range.
8105 Hence, the last mask must be converted into {2,5,5,5}.
8106 For the first two permutations we need the first and the second input
8107 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
8108 we need the second and the third vectors: {b1,c1,a2,b2} and
8109 {c2,a3,b3,c3}. */
8111 int vect_stmts_counter = 0;
8112 unsigned int index = 0;
8113 int first_vec_index = -1;
8114 int second_vec_index = -1;
8115 bool noop_p = true;
8116 *n_perms = 0;
8118 vec_perm_builder mask;
8119 unsigned int nelts_to_build;
8120 unsigned int nvectors_per_build;
8121 unsigned int in_nlanes;
8122 bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info)
8123 && multiple_p (nunits, group_size));
8124 if (repeating_p)
8126 /* A single vector contains a whole number of copies of the node, so:
8127 (a) all permutes can use the same mask; and
8128 (b) the permutes only need a single vector input. */
8129 mask.new_vector (nunits, group_size, 3);
8130 nelts_to_build = mask.encoded_nelts ();
8131 nvectors_per_build = SLP_TREE_VEC_STMTS (node).length ();
8132 in_nlanes = DR_GROUP_SIZE (stmt_info) * 3;
8134 else
8136 /* We need to construct a separate mask for each vector statement. */
8137 unsigned HOST_WIDE_INT const_nunits, const_vf;
8138 if (!nunits.is_constant (&const_nunits)
8139 || !vf.is_constant (&const_vf))
8140 return false;
8141 mask.new_vector (const_nunits, const_nunits, 1);
8142 nelts_to_build = const_vf * group_size;
8143 nvectors_per_build = 1;
8144 in_nlanes = const_vf * DR_GROUP_SIZE (stmt_info);
8146 auto_sbitmap used_in_lanes (in_nlanes);
8147 bitmap_clear (used_in_lanes);
8148 auto_bitmap used_defs;
8150 unsigned int count = mask.encoded_nelts ();
8151 mask.quick_grow (count);
8152 vec_perm_indices indices;
8154 for (unsigned int j = 0; j < nelts_to_build; j++)
8156 unsigned int iter_num = j / group_size;
8157 unsigned int stmt_num = j % group_size;
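	  /* I is the source lane, counted across the concatenation of the
	     loaded vectors in DR_CHAIN, that supplies element STMT_NUM of
	     output copy ITER_NUM.  */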
8158 unsigned int i = (iter_num * DR_GROUP_SIZE (stmt_info) + perm[stmt_num]);
8159 bitmap_set_bit (used_in_lanes, i);
8160 if (repeating_p)
8162 first_vec_index = 0;
8163 mask_element = i;
8165 else
8167 /* Enforced before the loop when !repeating_p. */
8168 unsigned int const_nunits = nunits.to_constant ();
8169 vec_index = i / const_nunits;
8170 mask_element = i % const_nunits;
8171 if (vec_index == first_vec_index
8172 || first_vec_index == -1)
8174 first_vec_index = vec_index;
8176 else if (vec_index == second_vec_index
8177 || second_vec_index == -1)
8179 second_vec_index = vec_index;
8180 mask_element += const_nunits;
8182 else
8184 if (dump_p)
8185 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8186 "permutation requires at "
8187 "least three vectors %G",
8188 stmt_info->stmt);
8189 gcc_assert (analyze_only);
8190 return false;
8193 gcc_assert (mask_element < 2 * const_nunits);
8196 if (mask_element != index)
8197 noop_p = false;
8198 mask[index++] = mask_element;
8200 if (index == count && !noop_p)
8202 indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
8203 if (!can_vec_perm_const_p (mode, mode, indices))
8205 if (dump_p)
8207 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
8208 vect_location,
8209 "unsupported vect permute { ");
8210 for (i = 0; i < count; ++i)
8212 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
8213 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
8215 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
8217 gcc_assert (analyze_only);
8218 return false;
8221 ++*n_perms;
8224 if (index == count)
8226 if (!analyze_only)
8228 tree mask_vec = NULL_TREE;
8230 if (! noop_p)
8231 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
8233 if (second_vec_index == -1)
8234 second_vec_index = first_vec_index;
8236 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
8238 /* Generate the permute statement if necessary. */
8239 tree first_vec = dr_chain[first_vec_index + ri];
8240 tree second_vec = dr_chain[second_vec_index + ri];
8241 gimple *perm_stmt;
8242 if (! noop_p)
8244 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
8245 tree perm_dest
8246 = vect_create_destination_var (gimple_assign_lhs (stmt),
8247 vectype);
8248 perm_dest = make_ssa_name (perm_dest);
8249 perm_stmt
8250 = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
8251 first_vec, second_vec,
8252 mask_vec);
8253 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
8254 gsi);
8255 if (dce_chain)
8257 bitmap_set_bit (used_defs, first_vec_index + ri);
8258 bitmap_set_bit (used_defs, second_vec_index + ri);
8261 else
8263 /* If mask was NULL_TREE generate the requested
8264 identity transform. */
8265 perm_stmt = SSA_NAME_DEF_STMT (first_vec);
8266 if (dce_chain)
8267 bitmap_set_bit (used_defs, first_vec_index + ri);
8270 /* Store the vector statement in NODE. */
8271 SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++] = perm_stmt;
8275 index = 0;
8276 first_vec_index = -1;
8277 second_vec_index = -1;
8278 noop_p = true;
8282 if (n_loads)
8284 if (repeating_p)
8285 *n_loads = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
8286 else
8288 /* Enforced above when !repeating_p. */
8289 unsigned int const_nunits = nunits.to_constant ();
8290 *n_loads = 0;
8291 bool load_seen = false;
8292 for (unsigned i = 0; i < in_nlanes; ++i)
8294 if (i % const_nunits == 0)
8296 if (load_seen)
8297 *n_loads += 1;
8298 load_seen = false;
8300 if (bitmap_bit_p (used_in_lanes, i))
8301 load_seen = true;
8303 if (load_seen)
8304 *n_loads += 1;
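  /* If requested, release the vectors in DR_CHAIN whose definitions were
     not used by any generated permute.  */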
8308 if (dce_chain)
8309 for (unsigned i = 0; i < dr_chain.length (); ++i)
8310 if (!bitmap_bit_p (used_defs, i))
8312 gimple *stmt = SSA_NAME_DEF_STMT (dr_chain[i]);
8313 gimple_stmt_iterator rgsi = gsi_for_stmt (stmt);
8314 gsi_remove (&rgsi, true);
8315 release_defs (stmt);
8318 return true;
8321 /* Generate vector permute statements from a list of loads in DR_CHAIN.
8322 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
8323 permute statements for the SLP node NODE. Store the number of vector
8324 permute instructions in *N_PERMS and the number of vector load
8325 instructions in *N_LOADS. If DCE_CHAIN is true, remove all definitions
8326 that were not needed. */
8328 bool
8329 vect_transform_slp_perm_load (vec_info *vinfo,
8330 slp_tree node, const vec<tree> &dr_chain,
8331 gimple_stmt_iterator *gsi, poly_uint64 vf,
8332 bool analyze_only, unsigned *n_perms,
8333 unsigned int *n_loads, bool dce_chain)
8335 return vect_transform_slp_perm_load_1 (vinfo, node,
8336 SLP_TREE_LOAD_PERMUTATION (node),
8337 dr_chain, gsi, vf, analyze_only,
8338 dump_enabled_p (), n_perms, n_loads,
8339 dce_chain);
8342 /* Produce the next vector result for SLP permutation NODE by adding a vector
8343 statement at GSI. If MASK_VEC is nonnull, add:
8345 <new SSA name> = VEC_PERM_EXPR <FIRST_DEF, SECOND_DEF, MASK_VEC>
8347 otherwise add:
8349 <new SSA name> = FIRST_DEF. */
8351 static void
8352 vect_add_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
8353 slp_tree node, tree first_def, tree second_def,
8354 tree mask_vec)
8356 tree vectype = SLP_TREE_VECTYPE (node);
8358 /* ??? We SLP match existing vector element extracts but
8359 allow punning which we need to re-instantiate at uses
8360 but have no good way of explicitly representing. */
8361 if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)), TYPE_SIZE (vectype))
8362 && !types_compatible_p (TREE_TYPE (first_def), vectype))
8364 gassign *conv_stmt
8365 = gimple_build_assign (make_ssa_name (vectype),
8366 build1 (VIEW_CONVERT_EXPR, vectype, first_def));
8367 vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
8368 first_def = gimple_assign_lhs (conv_stmt);
8370 gassign *perm_stmt;
8371 tree perm_dest = make_ssa_name (vectype);
8372 if (mask_vec)
8374 if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)),
8375 TYPE_SIZE (vectype))
8376 && !types_compatible_p (TREE_TYPE (second_def), vectype))
8378 gassign *conv_stmt
8379 = gimple_build_assign (make_ssa_name (vectype),
8380 build1 (VIEW_CONVERT_EXPR,
8381 vectype, second_def));
8382 vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
8383 second_def = gimple_assign_lhs (conv_stmt);
8385 perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
8386 first_def, second_def,
8387 mask_vec);
8389 else if (!types_compatible_p (TREE_TYPE (first_def), vectype))
8391 /* For identity permutes we still need to handle the case
8392 of lowpart extracts or concats. */
8393 unsigned HOST_WIDE_INT c;
8394 auto first_def_nunits
8395 = TYPE_VECTOR_SUBPARTS (TREE_TYPE (first_def));
8396 if (known_le (TYPE_VECTOR_SUBPARTS (vectype), first_def_nunits))
8398 tree lowpart = build3 (BIT_FIELD_REF, vectype, first_def,
8399 TYPE_SIZE (vectype), bitsize_zero_node);
8400 perm_stmt = gimple_build_assign (perm_dest, lowpart);
8402 else if (constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype),
8403 first_def_nunits, &c) && c == 2)
8405 tree ctor = build_constructor_va (vectype, 2, NULL_TREE, first_def,
8406 NULL_TREE, second_def);
8407 perm_stmt = gimple_build_assign (perm_dest, ctor);
8409 else
8410 gcc_unreachable ();
8412 else
8414 /* We need a copy here in case the def was external. */
8415 perm_stmt = gimple_build_assign (perm_dest, first_def);
8417 vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
8418 /* Store the vector statement in NODE. */
8419 SLP_TREE_VEC_STMTS (node).quick_push (perm_stmt);
8422 /* Subroutine of vectorizable_slp_permutation. Check whether the target
8423 can perform permutation PERM on the (1 or 2) input nodes in CHILDREN.
8424 If GSI is nonnull, emit the permutation there.
8426 When GSI is null, the only purpose of NODE is to give properties
8427 of the result, such as the vector type and number of SLP lanes.
8428 The node does not need to be a VEC_PERM_EXPR.
8430 If the target supports the operation, return the number of individual
8431 VEC_PERM_EXPRs needed, otherwise return -1. Print information to the
8432 dump file if DUMP_P is true. */
8434 static int
8435 vectorizable_slp_permutation_1 (vec_info *vinfo, gimple_stmt_iterator *gsi,
8436 slp_tree node, lane_permutation_t &perm,
8437 vec<slp_tree> &children, bool dump_p)
8439 tree vectype = SLP_TREE_VECTYPE (node);
8441 /* ??? We currently only support all same vector input types
8442 while the SLP IL should really do a concat + select and thus accept
8443 arbitrary mismatches. */
8444 slp_tree child;
8445 unsigned i;
8446 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8447 bool repeating_p = multiple_p (nunits, SLP_TREE_LANES (node));
8448 tree op_vectype = NULL_TREE;
8449 FOR_EACH_VEC_ELT (children, i, child)
8450 if (SLP_TREE_VECTYPE (child))
8452 op_vectype = SLP_TREE_VECTYPE (child);
8453 break;
8455 if (!op_vectype)
8456 op_vectype = vectype;
8457 FOR_EACH_VEC_ELT (children, i, child)
8459 if ((SLP_TREE_DEF_TYPE (child) != vect_internal_def
8460 && !vect_maybe_update_slp_op_vectype (child, op_vectype))
8461 || !types_compatible_p (SLP_TREE_VECTYPE (child), op_vectype)
8462 || !types_compatible_p (TREE_TYPE (vectype), TREE_TYPE (op_vectype)))
8464 if (dump_p)
8465 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8466 "Unsupported vector types in lane permutation\n");
8467 return -1;
8469 if (SLP_TREE_LANES (child) != SLP_TREE_LANES (node))
8470 repeating_p = false;
8473 gcc_assert (perm.length () == SLP_TREE_LANES (node));
8474 if (dump_p)
8476 dump_printf_loc (MSG_NOTE, vect_location,
8477 "vectorizing permutation");
8478 for (unsigned i = 0; i < perm.length (); ++i)
8479 dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
8480 if (repeating_p)
8481 dump_printf (MSG_NOTE, " (repeat %d)\n", SLP_TREE_LANES (node));
8482 dump_printf (MSG_NOTE, "\n");
8485 /* REPEATING_P is true if every output vector is guaranteed to use the
8486 same permute vector. We can handle that case for both variable-length
8487 and constant-length vectors, but we only handle other cases for
8488 constant-length vectors.
8490 Set:
8492 - NPATTERNS and NELTS_PER_PATTERN to the encoding of the permute
8493 mask vector that we want to build.
8495 - NCOPIES to the number of copies of PERM that we need in order
8496 to build the necessary permute mask vectors.
8498 - NOUTPUTS_PER_MASK to the number of output vectors we want to create
8499 for each permute mask vector. This is only relevant when GSI is
8500 nonnull. */
8501 uint64_t npatterns;
8502 unsigned nelts_per_pattern;
8503 uint64_t ncopies;
8504 unsigned noutputs_per_mask;
8505 if (repeating_p)
8507 /* We need a single permute mask vector that has the form:
8509 { X1, ..., Xn, X1 + n, ..., Xn + n, X1 + 2n, ..., Xn + 2n, ... }
8511 In other words, the original n-element permute in PERM is
8512 "unrolled" to fill a full vector. The stepped vector encoding
8513 that we use for permutes requires 3n elements. */
8514 npatterns = SLP_TREE_LANES (node);
8515 nelts_per_pattern = ncopies = 3;
8516 noutputs_per_mask = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
8518 else
8520 /* Calculate every element of every permute mask vector explicitly,
8521 instead of relying on the pattern described above. */
8522 if (!nunits.is_constant (&npatterns))
8523 return -1;
8524 nelts_per_pattern = ncopies = 1;
8525 if (loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo))
8526 if (!LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&ncopies))
8527 return -1;
8528 noutputs_per_mask = 1;
8530 unsigned olanes = ncopies * SLP_TREE_LANES (node);
8531 gcc_assert (repeating_p || multiple_p (olanes, nunits));
8533 /* Compute the { { SLP operand, vector index}, lane } permutation sequence
8534 from the { SLP operand, scalar lane } permutation as recorded in the
8535 SLP node as an intermediate step.  This part should already work
8536 with SLP children with an arbitrary number of lanes.  */
8537 auto_vec<std::pair<std::pair<unsigned, unsigned>, unsigned> > vperm;
8538 auto_vec<unsigned> active_lane;
8539 vperm.create (olanes);
8540 active_lane.safe_grow_cleared (children.length (), true);
8541 for (unsigned i = 0; i < ncopies; ++i)
8543 for (unsigned pi = 0; pi < perm.length (); ++pi)
8545 std::pair<unsigned, unsigned> p = perm[pi];
8546 tree vtype = SLP_TREE_VECTYPE (children[p.first]);
8547 if (repeating_p)
8548 vperm.quick_push ({{p.first, 0}, p.second + active_lane[p.first]});
8549 else
8551 /* We checked above that the vectors are constant-length. */
8552 unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype).to_constant ();
8553 unsigned vi = (active_lane[p.first] + p.second) / vnunits;
8554 unsigned vl = (active_lane[p.first] + p.second) % vnunits;
8555 vperm.quick_push ({{p.first, vi}, vl});
8558 /* Advance to the next group. */
8559 for (unsigned j = 0; j < children.length (); ++j)
8560 active_lane[j] += SLP_TREE_LANES (children[j]);
8563 if (dump_p)
8565 dump_printf_loc (MSG_NOTE, vect_location,
8566 "vectorizing permutation");
8567 for (unsigned i = 0; i < perm.length (); ++i)
8568 dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
8569 if (repeating_p)
8570 dump_printf (MSG_NOTE, " (repeat %d)\n", SLP_TREE_LANES (node));
8571 dump_printf (MSG_NOTE, "\n");
8572 dump_printf_loc (MSG_NOTE, vect_location, "as");
8573 for (unsigned i = 0; i < vperm.length (); ++i)
8575 if (i != 0
8576 && (repeating_p
8577 ? multiple_p (i, npatterns)
8578 : multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype))))
8579 dump_printf (MSG_NOTE, ",");
8580 dump_printf (MSG_NOTE, " vops%u[%u][%u]",
8581 vperm[i].first.first, vperm[i].first.second,
8582 vperm[i].second);
8584 dump_printf (MSG_NOTE, "\n");
8587 /* We can only handle two-vector permutes; everything else should
8588 be lowered on the SLP level. The following is closely inspired
8589 by vect_transform_slp_perm_load and is supposed to eventually
8590 replace it.
8591 ??? As intermediate step do code-gen in the SLP tree representation
8592 somehow? */
8593 std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U);
8594 std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U);
8595 unsigned int index = 0;
8596 poly_uint64 mask_element;
8597 vec_perm_builder mask;
8598 mask.new_vector (nunits, npatterns, nelts_per_pattern);
8599 unsigned int count = mask.encoded_nelts ();
8600 mask.quick_grow (count);
8601 vec_perm_indices indices;
8602 unsigned nperms = 0;
8603 for (unsigned i = 0; i < vperm.length (); ++i)
8605 mask_element = vperm[i].second;
8606 if (first_vec.first == -1U
8607 || first_vec == vperm[i].first)
8608 first_vec = vperm[i].first;
8609 else if (second_vec.first == -1U
8610 || second_vec == vperm[i].first)
8612 second_vec = vperm[i].first;
8613 mask_element += nunits;
8615 else
8617 if (dump_p)
8618 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8619 "permutation requires at "
8620 "least three vectors\n");
8621 gcc_assert (!gsi);
8622 return -1;
8625 mask[index++] = mask_element;
8627 if (index == count)
8629 indices.new_vector (mask, second_vec.first == -1U ? 1 : 2,
8630 TYPE_VECTOR_SUBPARTS (op_vectype));
8631 bool identity_p = indices.series_p (0, 1, 0, 1);
8632 machine_mode vmode = TYPE_MODE (vectype);
8633 machine_mode op_vmode = TYPE_MODE (op_vectype);
8634 unsigned HOST_WIDE_INT c;
8635 if ((!identity_p
8636 && !can_vec_perm_const_p (vmode, op_vmode, indices))
8637 || (identity_p
8638 && !known_le (nunits,
8639 TYPE_VECTOR_SUBPARTS (op_vectype))
8640 && (!constant_multiple_p (nunits,
8641 TYPE_VECTOR_SUBPARTS (op_vectype),
8642 &c) || c != 2)))
8644 if (dump_p)
8646 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
8647 vect_location,
8648 "unsupported vect permute { ");
8649 for (i = 0; i < count; ++i)
8651 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
8652 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
8654 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
8656 gcc_assert (!gsi);
8657 return -1;
8660 if (!identity_p)
8661 nperms++;
8662 if (gsi)
8664 if (second_vec.first == -1U)
8665 second_vec = first_vec;
8667 slp_tree
8668 first_node = children[first_vec.first],
8669 second_node = children[second_vec.first];
8671 tree mask_vec = NULL_TREE;
8672 if (!identity_p)
8673 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
8675 for (unsigned int vi = 0; vi < noutputs_per_mask; ++vi)
8677 tree first_def
8678 = vect_get_slp_vect_def (first_node,
8679 first_vec.second + vi);
8680 tree second_def
8681 = vect_get_slp_vect_def (second_node,
8682 second_vec.second + vi);
8683 vect_add_slp_permutation (vinfo, gsi, node, first_def,
8684 second_def, mask_vec);
8688 index = 0;
8689 first_vec = std::make_pair (-1U, -1U);
8690 second_vec = std::make_pair (-1U, -1U);
8694 return nperms;
8697 /* Vectorize the SLP permutations in NODE as specified
8698 in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
8699 child number and lane number.
8700 Interleaving of two two-lane two-child SLP subtrees (not supported):
8701 [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
8702 A blend of two four-lane two-child SLP subtrees:
8703 [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
8704 Highpart of a four-lane one-child SLP subtree (not supported):
8705 [ { 0, 2 }, { 0, 3 } ]
8706 Currently only a subset of these is supported by the code generation below.  */
8708 static bool
8709 vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
8710 slp_tree node, stmt_vector_for_cost *cost_vec)
8712 tree vectype = SLP_TREE_VECTYPE (node);
8713 lane_permutation_t &perm = SLP_TREE_LANE_PERMUTATION (node);
8714 int nperms = vectorizable_slp_permutation_1 (vinfo, gsi, node, perm,
8715 SLP_TREE_CHILDREN (node),
8716 dump_enabled_p ());
8717 if (nperms < 0)
8718 return false;
8720 if (!gsi)
8721 record_stmt_cost (cost_vec, nperms, vec_perm, node, vectype, 0, vect_body);
8723 return true;
8726 /* Vectorize SLP NODE. */
8728 static void
8729 vect_schedule_slp_node (vec_info *vinfo,
8730 slp_tree node, slp_instance instance)
8732 gimple_stmt_iterator si;
8733 int i;
8734 slp_tree child;
8736 /* For existing vectors there's nothing to do. */
8737 if (SLP_TREE_VEC_DEFS (node).exists ())
8738 return;
8740 gcc_assert (SLP_TREE_VEC_STMTS (node).is_empty ());
8742 /* Vectorize externals and constants. */
8743 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
8744 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
8746 /* ??? vectorizable_shift can end up using a scalar operand which is
8747 currently denoted as !SLP_TREE_VECTYPE. No need to vectorize the
8748 node in this case. */
8749 if (!SLP_TREE_VECTYPE (node))
8750 return;
8752 vect_create_constant_vectors (vinfo, node);
8753 return;
8756 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
8758 gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
8759 SLP_TREE_VEC_STMTS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
8761 if (dump_enabled_p ())
8762 dump_printf_loc (MSG_NOTE, vect_location,
8763 "------>vectorizing SLP node starting from: %G",
8764 stmt_info->stmt);
8766 if (STMT_VINFO_DATA_REF (stmt_info)
8767 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
8769 /* Vectorized loads go before the first scalar load to make it
8770 ready early; vectorized stores go before the last scalar
8771 stmt, which is where all uses are ready. */
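/* For example (assuming a grouped access visited in order): the vector load
   for a[0], a[1], a[2], a[3] is emitted just before the scalar load of a[0],
   while the corresponding vector store is emitted just before the last of
   the scalar stores.  */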
8772 stmt_vec_info last_stmt_info = NULL;
8773 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
8774 last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
8775 else /* DR_IS_WRITE */
8776 last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
8777 si = gsi_for_stmt (last_stmt_info->stmt);
8779 else if ((STMT_VINFO_TYPE (stmt_info) == cycle_phi_info_type
8780 || STMT_VINFO_TYPE (stmt_info) == induc_vec_info_type
8781 || STMT_VINFO_TYPE (stmt_info) == phi_info_type)
8782 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
8784 /* For PHI node vectorization we do not use the insertion iterator. */
8785 si = gsi_none ();
8787 else
8789 /* Emit other stmts after the children's vectorized defs, which is
8790 the earliest possible place. */
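/* To find that place, walk all children and track the dominance-wise latest
   def statement among their vectorized (or, for externals, scalar) defs;
   the new stmts are then inserted right after it, modulo the special cases
   handled below.  */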
8791 gimple *last_stmt = NULL;
8792 bool seen_vector_def = false;
8793 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
8794 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
8796 /* For fold-left reductions we retain the scalar
8797 reduction PHI, but SLP_TREE_NUM_VEC_STMTS is still
8798 set, so the representation isn't perfect. Resort to the
8799 last scalar def here. */
8800 if (SLP_TREE_VEC_STMTS (child).is_empty ())
8802 gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child))
8803 == cycle_phi_info_type);
8804 gphi *phi = as_a <gphi *>
8805 (vect_find_last_scalar_stmt_in_slp (child)->stmt);
8806 if (!last_stmt
8807 || vect_stmt_dominates_stmt_p (last_stmt, phi))
8808 last_stmt = phi;
8810 /* We emit all vectorized stmts at the same place, so the last
8811 entry in SLP_TREE_VEC_STMTS is also the last one emitted.
8812 ??? Unless a load permutation is applied and it chooses
8813 to re-use an earlier generated load. */
8814 unsigned j;
8815 gimple *vstmt;
8816 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (child), j, vstmt)
8817 if (!last_stmt
8818 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
8819 last_stmt = vstmt;
8821 else if (!SLP_TREE_VECTYPE (child))
8823 /* Externals without a vector type are used unvectorized, so look at all their scalar defs. */
8824 unsigned j;
8825 tree def;
8826 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child), j, def)
8827 if (TREE_CODE (def) == SSA_NAME
8828 && !SSA_NAME_IS_DEFAULT_DEF (def))
8830 gimple *stmt = SSA_NAME_DEF_STMT (def);
8831 if (!last_stmt
8832 || vect_stmt_dominates_stmt_p (last_stmt, stmt))
8833 last_stmt = stmt;
8836 else
8838 /* For externals we have to look at all defs since their
8839 insertion place is decided per vector. But beware
8840 of pre-existing vectors where we need to make sure
8841 we do not insert before the region boundary. */
8842 if (SLP_TREE_SCALAR_OPS (child).is_empty ()
8843 && !vinfo->lookup_def (SLP_TREE_VEC_DEFS (child)[0]))
8844 seen_vector_def = true;
8845 else
8847 unsigned j;
8848 tree vdef;
8849 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
8850 if (TREE_CODE (vdef) == SSA_NAME
8851 && !SSA_NAME_IS_DEFAULT_DEF (vdef))
8853 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
8854 if (!last_stmt
8855 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
8856 last_stmt = vstmt;
8860 /* This can happen when all children are pre-existing vectors or
8861 constants. */
8862 if (!last_stmt)
8863 last_stmt = vect_find_first_scalar_stmt_in_slp (node)->stmt;
8864 if (!last_stmt)
8866 gcc_assert (seen_vector_def);
8867 si = gsi_after_labels (as_a <bb_vec_info> (vinfo)->bbs[0]);
8869 else if (is_ctrl_altering_stmt (last_stmt))
8871 /* We split regions to vectorize at control-altering stmts
8872 with a definition, so this must be an external which
8873 we can insert at the start of the region. */
8874 si = gsi_after_labels (as_a <bb_vec_info> (vinfo)->bbs[0]);
8876 else if (is_a <bb_vec_info> (vinfo)
8877 && gimple_bb (last_stmt) != gimple_bb (stmt_info->stmt)
8878 && gimple_could_trap_p (stmt_info->stmt))
8880 /* We've constrained possibly trapping operations to all come
8881 from the same basic-block; even if vectorized defs would allow
8882 earlier scheduling, still force the vectorized stmts into the
8883 original block. This is only necessary for BB vectorization since
8884 for loop vectorization all operations are in a single BB and scalar
8885 stmt based placement doesn't play well with epilogue vectorization. */
8886 gcc_assert (dominated_by_p (CDI_DOMINATORS,
8887 gimple_bb (stmt_info->stmt),
8888 gimple_bb (last_stmt)));
8889 si = gsi_after_labels (gimple_bb (stmt_info->stmt));
8891 else if (is_a <gphi *> (last_stmt))
8892 si = gsi_after_labels (gimple_bb (last_stmt));
8893 else
8895 si = gsi_for_stmt (last_stmt);
8896 gsi_next (&si);
8900 bool done_p = false;
8902 /* Handle purely internal nodes. */
8903 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
8905 /* ??? The transform kind is stored in STMT_VINFO_TYPE which might
8906 be shared between different SLP nodes (but usually it's the same
8907 operation, except when the stmt is only there to denote
8908 the actual scalar lane defs ...). So do not call vect_transform_stmt
8909 but open-code it here (partly). */
8910 bool done = vectorizable_slp_permutation (vinfo, &si, node, NULL);
8911 gcc_assert (done);
8912 done_p = true;
8914 if (!done_p)
8915 vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
8918 /* Replace scalar calls from SLP node NODE by setting their lhs to zero.
8919 For loop vectorization this is done in vectorizable_call, but for SLP
8920 it needs to be deferred until the end of vect_schedule_slp, because multiple
8921 SLP instances may refer to the same scalar stmt. */
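/* For illustration (hypothetical GIMPLE): a vectorized scalar call such as
     _5 = sqrtf (x_3);
   is replaced by
     _5 = 0.0;
   the dummy assignment is then trivially removable once no scalar uses of
   _5 remain.  */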
8923 static void
8924 vect_remove_slp_scalar_calls (vec_info *vinfo,
8925 slp_tree node, hash_set<slp_tree> &visited)
8927 gimple *new_stmt;
8928 gimple_stmt_iterator gsi;
8929 int i;
8930 slp_tree child;
8931 tree lhs;
8932 stmt_vec_info stmt_info;
8934 if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
8935 return;
8937 if (visited.add (node))
8938 return;
8940 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
8941 vect_remove_slp_scalar_calls (vinfo, child, visited);
8943 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
8945 gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
8946 if (!stmt || gimple_bb (stmt) == NULL)
8947 continue;
8948 if (is_pattern_stmt_p (stmt_info)
8949 || !PURE_SLP_STMT (stmt_info))
8950 continue;
8951 lhs = gimple_call_lhs (stmt);
8952 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
8953 gsi = gsi_for_stmt (stmt);
8954 vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
8955 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
8959 static void
8960 vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
8962 hash_set<slp_tree> visited;
8963 vect_remove_slp_scalar_calls (vinfo, node, visited);
8966 /* Vectorize the instance root. */
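/* For the constructor case, e.g. (illustrative GIMPLE) a root stmt
     x_1 = {a_2, b_3, c_4, d_5};
   whose elements were vectorized into a single vector def vect_v is
   replaced by
     x_1 = vect_v;
   possibly wrapped in a VIEW_CONVERT_EXPR when the vector types differ.  */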
8968 void
8969 vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
8971 gassign *rstmt = NULL;
8973 if (instance->kind == slp_inst_kind_ctor)
8975 if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1)
8977 gimple *child_stmt = SLP_TREE_VEC_STMTS (node)[0];
8978 tree vect_lhs = gimple_get_lhs (child_stmt);
8979 tree root_lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
8980 if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
8981 TREE_TYPE (vect_lhs)))
8982 vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
8983 vect_lhs);
8984 rstmt = gimple_build_assign (root_lhs, vect_lhs);
8986 else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1)
8988 int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
8989 gimple *child_stmt;
8990 int j;
8991 vec<constructor_elt, va_gc> *v;
8992 vec_alloc (v, nelts);
8994 /* A CTOR can handle composing a V16HI from VNx8HI elements, so we
8995 do not need to convert the vector elements even if the types
8996 do not match. */
8997 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt)
8998 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE,
8999 gimple_get_lhs (child_stmt));
9000 tree lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
9001 tree rtype
9002 = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmts[0]->stmt));
9003 tree r_constructor = build_constructor (rtype, v);
9004 rstmt = gimple_build_assign (lhs, r_constructor);
9007 else if (instance->kind == slp_inst_kind_bb_reduc)
9009 /* Largely inspired by reduction chain epilogue handling in
9010 vect_create_epilog_for_reduction. */
9011 vec<tree> vec_defs = vNULL;
9012 vect_get_slp_defs (node, &vec_defs);
9013 enum tree_code reduc_code
9014 = gimple_assign_rhs_code (instance->root_stmts[0]->stmt);
9015 /* ??? We actually have to reflect signs somewhere. */
9016 if (reduc_code == MINUS_EXPR)
9017 reduc_code = PLUS_EXPR;
9018 gimple_seq epilogue = NULL;
9019 /* We may end up with more than one vector result; reduce them
9020 to a single vector. */
9021 tree vec_def = vec_defs[0];
9022 for (unsigned i = 1; i < vec_defs.length (); ++i)
9023 vec_def = gimple_build (&epilogue, reduc_code, TREE_TYPE (vec_def),
9024 vec_def, vec_defs[i]);
9025 vec_defs.release ();
9026 /* ??? Support schemes other than a direct internal fn. */
9027 internal_fn reduc_fn;
9028 if (!reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
9029 || reduc_fn == IFN_LAST)
9030 gcc_unreachable ();
9031 tree scalar_def = gimple_build (&epilogue, as_combined_fn (reduc_fn),
9032 TREE_TYPE (TREE_TYPE (vec_def)), vec_def);
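/* For illustration (hypothetical GIMPLE): a PLUS_EXPR bb reduction over two
   vector defs v0 and v1 builds
     tmp_1 = v0 + v1;
     scalar_2 = .REDUC_PLUS (tmp_1);
   with the sequence inserted before the original root stmt, whose rhs is
   then replaced by the scalar result below.  */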
9034 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
9035 gsi_insert_seq_before (&rgsi, epilogue, GSI_SAME_STMT);
9036 gimple_assign_set_rhs_from_tree (&rgsi, scalar_def);
9037 update_stmt (gsi_stmt (rgsi));
9038 return;
9040 else
9041 gcc_unreachable ();
9043 gcc_assert (rstmt);
9045 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
9046 gsi_replace (&rgsi, rstmt, true);
9049 struct slp_scc_info
9051 bool on_stack;
9052 int dfs;
9053 int lowlink;
9056 /* Schedule the SLP INSTANCE doing a DFS walk and collecting SCCs. */
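/* Descriptive note (added): this is essentially Tarjan's SCC algorithm; each
   node gets a DFS index and a lowlink, nodes are kept on an explicit stack,
   and a node whose lowlink equals its DFS index is the root of an SCC which
   is then scheduled as a whole.  */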
9058 static void
9059 vect_schedule_scc (vec_info *vinfo, slp_tree node, slp_instance instance,
9060 hash_map<slp_tree, slp_scc_info> &scc_info,
9061 int &maxdfs, vec<slp_tree> &stack)
9063 bool existed_p;
9064 slp_scc_info *info = &scc_info.get_or_insert (node, &existed_p);
9065 gcc_assert (!existed_p);
9066 info->dfs = maxdfs;
9067 info->lowlink = maxdfs;
9068 maxdfs++;
9070 /* Leaf. */
9071 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
9073 info->on_stack = false;
9074 vect_schedule_slp_node (vinfo, node, instance);
9075 return;
9078 info->on_stack = true;
9079 stack.safe_push (node);
9081 unsigned i;
9082 slp_tree child;
9083 /* DFS recurse. */
9084 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
9086 if (!child)
9087 continue;
9088 slp_scc_info *child_info = scc_info.get (child);
9089 if (!child_info)
9091 vect_schedule_scc (vinfo, child, instance, scc_info, maxdfs, stack);
9092 /* Recursion might have re-allocated the hash map, so re-fetch the entries. */
9093 info = scc_info.get (node);
9094 child_info = scc_info.get (child);
9095 info->lowlink = MIN (info->lowlink, child_info->lowlink);
9097 else if (child_info->on_stack)
9098 info->lowlink = MIN (info->lowlink, child_info->dfs);
9100 if (info->lowlink != info->dfs)
9101 return;
9103 auto_vec<slp_tree, 4> phis_to_fixup;
9105 /* Singleton. */
9106 if (stack.last () == node)
9108 stack.pop ();
9109 info->on_stack = false;
9110 vect_schedule_slp_node (vinfo, node, instance);
9111 if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
9112 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (node)->stmt))
9113 phis_to_fixup.quick_push (node);
9115 else
9117 /* SCC. */
9118 int last_idx = stack.length () - 1;
9119 while (stack[last_idx] != node)
9120 last_idx--;
9121 /* We can break the cycle at PHIs which have at least one child
9122 already code generated. Then we could re-start the DFS walk until
9123 all nodes in the SCC are covered (we might have new entries
9124 for only back-reachable nodes). But it's simpler to just
9125 iterate and schedule those that are ready. */
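/* For example, in a simple reduction cycle  PHI -> add -> PHI  the PHI is
   ready first (one of its children, the preheader def, is already off the
   stack); scheduling it makes the add ready, and the PHI's backedge
   argument is filled in by the fixup loop further below.  */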
9126 unsigned todo = stack.length () - last_idx;
9129 for (int idx = stack.length () - 1; idx >= last_idx; --idx)
9131 slp_tree entry = stack[idx];
9132 if (!entry)
9133 continue;
9134 bool phi = (SLP_TREE_CODE (entry) != VEC_PERM_EXPR
9135 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (entry)->stmt));
9136 bool ready = !phi;
9137 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry), i, child)
9138 if (!child)
9140 gcc_assert (phi);
9141 ready = true;
9142 break;
9144 else if (scc_info.get (child)->on_stack)
9146 if (!phi)
9148 ready = false;
9149 break;
9152 else
9154 if (phi)
9156 ready = true;
9157 break;
9160 if (ready)
9162 vect_schedule_slp_node (vinfo, entry, instance);
9163 scc_info.get (entry)->on_stack = false;
9164 stack[idx] = NULL;
9165 todo--;
9166 if (phi)
9167 phis_to_fixup.safe_push (entry);
9171 while (todo != 0);
9173 /* Pop the SCC. */
9174 stack.truncate (last_idx);
9177 /* Now fixup the backedge def of the vectorized PHIs in this SCC. */
9178 slp_tree phi_node;
9179 FOR_EACH_VEC_ELT (phis_to_fixup, i, phi_node)
9181 gphi *phi = as_a <gphi *> (SLP_TREE_REPRESENTATIVE (phi_node)->stmt);
9182 edge_iterator ei;
9183 edge e;
9184 FOR_EACH_EDGE (e, ei, gimple_bb (phi)->preds)
9186 unsigned dest_idx = e->dest_idx;
9187 child = SLP_TREE_CHILDREN (phi_node)[dest_idx];
9188 if (!child || SLP_TREE_DEF_TYPE (child) != vect_internal_def)
9189 continue;
9190 /* Simply fill all args. */
9191 for (unsigned i = 0; i < SLP_TREE_VEC_STMTS (phi_node).length (); ++i)
9192 add_phi_arg (as_a <gphi *> (SLP_TREE_VEC_STMTS (phi_node)[i]),
9193 vect_get_slp_vect_def (child, i),
9194 e, gimple_phi_arg_location (phi, dest_idx));
9199 /* Generate vector code for SLP_INSTANCES in the loop/basic block. */
9201 void
9202 vect_schedule_slp (vec_info *vinfo, const vec<slp_instance> &slp_instances)
9204 slp_instance instance;
9205 unsigned int i;
9207 hash_map<slp_tree, slp_scc_info> scc_info;
9208 int maxdfs = 0;
9209 FOR_EACH_VEC_ELT (slp_instances, i, instance)
9211 slp_tree node = SLP_INSTANCE_TREE (instance);
9212 if (dump_enabled_p ())
9214 dump_printf_loc (MSG_NOTE, vect_location,
9215 "Vectorizing SLP tree:\n");
9216 /* ??? Dump all? */
9217 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
9218 dump_printf_loc (MSG_NOTE, vect_location, "Root stmt: %G",
9219 SLP_INSTANCE_ROOT_STMTS (instance)[0]->stmt);
9220 vect_print_slp_graph (MSG_NOTE, vect_location,
9221 SLP_INSTANCE_TREE (instance));
9223 /* Schedule the tree of INSTANCE, scheduling SCCs in a way that
9224 has a PHI be the node breaking the cycle. */
9225 auto_vec<slp_tree> stack;
9226 if (!scc_info.get (node))
9227 vect_schedule_scc (vinfo, node, instance, scc_info, maxdfs, stack);
9229 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
9230 vectorize_slp_instance_root_stmt (node, instance);
9232 if (dump_enabled_p ())
9233 dump_printf_loc (MSG_NOTE, vect_location,
9234 "vectorizing stmts using SLP.\n");
9237 FOR_EACH_VEC_ELT (slp_instances, i, instance)
9239 slp_tree root = SLP_INSTANCE_TREE (instance);
9240 stmt_vec_info store_info;
9241 unsigned int j;
9243 /* Remove scalar call stmts. Do not do this for basic-block
9244 vectorization as not all uses may be vectorized.
9245 ??? Why should this be necessary? DCE should be able to
9246 remove the stmts itself.
9247 ??? For BB vectorization we can as well remove scalar
9248 stmts starting from the SLP tree root if they have no
9249 uses. */
9250 if (is_a <loop_vec_info> (vinfo))
9251 vect_remove_slp_scalar_calls (vinfo, root);
9253 /* Remove the original scalar stmts of vectorized stores. */
9254 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
9256 if (!STMT_VINFO_DATA_REF (store_info)
9257 || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info)))
9258 break;
9260 store_info = vect_orig_stmt (store_info);
9261 /* Free the attached stmt_vec_info and remove the stmt. */
9262 vinfo->remove_stmt (store_info);
9264 /* Invalidate SLP_TREE_REPRESENTATIVE in case we released it,
9265 so that we do not crash in vect_free_slp_tree later. */
9266 if (SLP_TREE_REPRESENTATIVE (root) == store_info)
9267 SLP_TREE_REPRESENTATIVE (root) = NULL;