gcc/tree-vect-slp.cc
1 /* SLP - Basic Block Vectorization
2 Copyright (C) 2007-2023 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #define INCLUDE_ALGORITHM
24 #include "system.h"
25 #include "coretypes.h"
26 #include "backend.h"
27 #include "target.h"
28 #include "rtl.h"
29 #include "tree.h"
30 #include "gimple.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "insn-config.h"
35 #include "recog.h" /* FIXME: for insn_data */
36 #include "fold-const.h"
37 #include "stor-layout.h"
38 #include "gimple-iterator.h"
39 #include "cfgloop.h"
40 #include "tree-vectorizer.h"
41 #include "langhooks.h"
42 #include "gimple-walk.h"
43 #include "dbgcnt.h"
44 #include "tree-vector-builder.h"
45 #include "vec-perm-indices.h"
46 #include "gimple-fold.h"
47 #include "internal-fn.h"
48 #include "dump-context.h"
49 #include "cfganal.h"
50 #include "tree-eh.h"
51 #include "tree-cfg.h"
52 #include "alloc-pool.h"
53 #include "sreal.h"
54 #include "predict.h"
56 static bool vect_transform_slp_perm_load_1 (vec_info *, slp_tree,
57 load_permutation_t &,
58 const vec<tree> &,
59 gimple_stmt_iterator *,
60 poly_uint64, bool, bool,
61 unsigned *,
62 unsigned * = nullptr,
63 bool = false);
64 static int vectorizable_slp_permutation_1 (vec_info *, gimple_stmt_iterator *,
65 slp_tree, lane_permutation_t &,
66 vec<slp_tree> &, bool);
67 static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
68 slp_tree, stmt_vector_for_cost *);
69 static void vect_print_slp_tree (dump_flags_t, dump_location_t, slp_tree);
71 static object_allocator<_slp_tree> *slp_tree_pool;
72 static slp_tree slp_first_node;
74 void
75 vect_slp_init (void)
77 slp_tree_pool = new object_allocator<_slp_tree> ("SLP nodes");
80 void
81 vect_slp_fini (void)
83 while (slp_first_node)
84 delete slp_first_node;
85 delete slp_tree_pool;
86 slp_tree_pool = NULL;
89 void *
90 _slp_tree::operator new (size_t n)
92 gcc_assert (n == sizeof (_slp_tree));
93 return slp_tree_pool->allocate_raw ();
96 void
97 _slp_tree::operator delete (void *node, size_t n)
99 gcc_assert (n == sizeof (_slp_tree));
100 slp_tree_pool->remove_raw (node);
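/* SLP nodes are carved from a dedicated object_allocator pool and, in
   addition, every live node is chained into a doubly-linked list headed by
   slp_first_node (see the constructor and destructor below).  vect_slp_fini
   therefore only needs to walk that list to reclaim nodes still alive when
   the pass finishes, before releasing the pool itself.  */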
104 /* Initialize a SLP node. */
106 _slp_tree::_slp_tree ()
108 this->prev_node = NULL;
109 if (slp_first_node)
110 slp_first_node->prev_node = this;
111 this->next_node = slp_first_node;
112 slp_first_node = this;
113 SLP_TREE_SCALAR_STMTS (this) = vNULL;
114 SLP_TREE_SCALAR_OPS (this) = vNULL;
115 SLP_TREE_VEC_STMTS (this) = vNULL;
116 SLP_TREE_VEC_DEFS (this) = vNULL;
117 SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
118 SLP_TREE_CHILDREN (this) = vNULL;
119 SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
120 SLP_TREE_LANE_PERMUTATION (this) = vNULL;
121 SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
122 SLP_TREE_CODE (this) = ERROR_MARK;
123 SLP_TREE_VECTYPE (this) = NULL_TREE;
124 SLP_TREE_REPRESENTATIVE (this) = NULL;
125 SLP_TREE_REF_COUNT (this) = 1;
126 this->failed = NULL;
127 this->max_nunits = 1;
128 this->lanes = 0;
131 /* Tear down a SLP node. */
133 _slp_tree::~_slp_tree ()
135 if (this->prev_node)
136 this->prev_node->next_node = this->next_node;
137 else
138 slp_first_node = this->next_node;
139 if (this->next_node)
140 this->next_node->prev_node = this->prev_node;
141 SLP_TREE_CHILDREN (this).release ();
142 SLP_TREE_SCALAR_STMTS (this).release ();
143 SLP_TREE_SCALAR_OPS (this).release ();
144 SLP_TREE_VEC_STMTS (this).release ();
145 SLP_TREE_VEC_DEFS (this).release ();
146 SLP_TREE_LOAD_PERMUTATION (this).release ();
147 SLP_TREE_LANE_PERMUTATION (this).release ();
148 if (this->failed)
149 free (failed);
152 /* Recursively free the memory allocated for the SLP tree rooted at NODE. */
154 void
155 vect_free_slp_tree (slp_tree node)
157 int i;
158 slp_tree child;
160 if (--SLP_TREE_REF_COUNT (node) != 0)
161 return;
163 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
164 if (child)
165 vect_free_slp_tree (child);
167 /* If the node defines any SLP only patterns then those patterns are no
168 longer valid and should be removed. */
169 stmt_vec_info rep_stmt_info = SLP_TREE_REPRESENTATIVE (node);
170 if (rep_stmt_info && STMT_VINFO_SLP_VECT_ONLY_PATTERN (rep_stmt_info))
172 stmt_vec_info stmt_info = vect_orig_stmt (rep_stmt_info);
173 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
174 STMT_SLP_TYPE (stmt_info) = STMT_SLP_TYPE (rep_stmt_info);
177 delete node;
180 /* Return a location suitable for dumps related to the SLP instance. */
182 dump_user_location_t
183 _slp_instance::location () const
185 if (!root_stmts.is_empty ())
186 return root_stmts[0]->stmt;
187 else
188 return SLP_TREE_SCALAR_STMTS (root)[0]->stmt;
192 /* Free the memory allocated for the SLP instance. */
194 void
195 vect_free_slp_instance (slp_instance instance)
197 vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
198 SLP_INSTANCE_LOADS (instance).release ();
199 SLP_INSTANCE_ROOT_STMTS (instance).release ();
200 instance->subgraph_entries.release ();
201 instance->cost_vec.release ();
202 free (instance);
206 /* Create an SLP node for SCALAR_STMTS. */
208 slp_tree
209 vect_create_new_slp_node (unsigned nops, tree_code code)
211 slp_tree node = new _slp_tree;
212 SLP_TREE_SCALAR_STMTS (node) = vNULL;
213 SLP_TREE_CHILDREN (node).create (nops);
214 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
215 SLP_TREE_CODE (node) = code;
216 return node;
218 /* Create an SLP node for SCALAR_STMTS. */
220 static slp_tree
221 vect_create_new_slp_node (slp_tree node,
222 vec<stmt_vec_info> scalar_stmts, unsigned nops)
224 SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
225 SLP_TREE_CHILDREN (node).create (nops);
226 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
227 SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
228 SLP_TREE_LANES (node) = scalar_stmts.length ();
229 return node;
232 /* Create an SLP node for SCALAR_STMTS. */
234 static slp_tree
235 vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
237 return vect_create_new_slp_node (new _slp_tree, scalar_stmts, nops);
240 /* Create an SLP node for OPS. */
242 static slp_tree
243 vect_create_new_slp_node (slp_tree node, vec<tree> ops)
245 SLP_TREE_SCALAR_OPS (node) = ops;
246 SLP_TREE_DEF_TYPE (node) = vect_external_def;
247 SLP_TREE_LANES (node) = ops.length ();
248 return node;
251 /* Create an SLP node for OPS. */
253 static slp_tree
254 vect_create_new_slp_node (vec<tree> ops)
256 return vect_create_new_slp_node (new _slp_tree, ops);
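/* The overloads above cover the two common ways discovery creates nodes:
   an internal-def node wrapping a group of scalar stmts and an external-def
   node wrapping invariant operands.  A rough usage sketch (the local names
   are illustrative only):

     slp_tree inner = vect_create_new_slp_node (scalar_stmts, nops);
     // vect_internal_def, one lane per scalar stmt, NOPS child slots

     slp_tree ext = vect_create_new_slp_node (invariant_ops);
     // vect_external_def, built up from the scalar operands  */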
260 /* This structure is used in creation of an SLP tree. Each instance
261 corresponds to the same operand in a group of scalar stmts in an SLP
262 node. */
263 typedef struct _slp_oprnd_info
265 /* Def-stmts for the operands. */
266 vec<stmt_vec_info> def_stmts;
267 /* Operands. */
268 vec<tree> ops;
269 /* Information about the first statement, its vector def-type, type, the
270 operand itself in case it's constant, and an indication if it's a pattern
271 stmt. */
272 tree first_op_type;
273 enum vect_def_type first_dt;
274 bool any_pattern;
275 } *slp_oprnd_info;
278 /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
279 operand. */
280 static vec<slp_oprnd_info>
281 vect_create_oprnd_info (int nops, int group_size)
283 int i;
284 slp_oprnd_info oprnd_info;
285 vec<slp_oprnd_info> oprnds_info;
287 oprnds_info.create (nops);
288 for (i = 0; i < nops; i++)
290 oprnd_info = XNEW (struct _slp_oprnd_info);
291 oprnd_info->def_stmts.create (group_size);
292 oprnd_info->ops.create (group_size);
293 oprnd_info->first_dt = vect_uninitialized_def;
294 oprnd_info->first_op_type = NULL_TREE;
295 oprnd_info->any_pattern = false;
296 oprnds_info.quick_push (oprnd_info);
299 return oprnds_info;
303 /* Free operands info. */
305 static void
306 vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
308 int i;
309 slp_oprnd_info oprnd_info;
311 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
313 oprnd_info->def_stmts.release ();
314 oprnd_info->ops.release ();
315 XDELETE (oprnd_info);
318 oprnds_info.release ();
321 /* Return the execution frequency of NODE (so that a higher value indicates
322 a "more important" node when optimizing for speed). */
324 static sreal
325 vect_slp_node_weight (slp_tree node)
327 stmt_vec_info stmt_info = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (node));
328 basic_block bb = gimple_bb (stmt_info->stmt);
329 return bb->count.to_sreal_scale (ENTRY_BLOCK_PTR_FOR_FN (cfun)->count);
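/* In other words the weight is the profile count of the block containing
   the node's representative stmt scaled by the function entry count, so a
   node in a block expected to execute four times per invocation weighs
   roughly four times as much as straight-line code at the entry.  */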
332 /* Return true if STMTS contains a pattern statement. */
334 static bool
335 vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
337 stmt_vec_info stmt_info;
338 unsigned int i;
339 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
340 if (is_pattern_stmt_p (stmt_info))
341 return true;
342 return false;
345 /* Return true when all lanes in the external or constant NODE have
346 the same value. */
348 static bool
349 vect_slp_tree_uniform_p (slp_tree node)
351 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
352 || SLP_TREE_DEF_TYPE (node) == vect_external_def);
354 /* Pre-existing vectors. */
355 if (SLP_TREE_SCALAR_OPS (node).is_empty ())
356 return false;
358 unsigned i;
359 tree op, first = NULL_TREE;
360 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
361 if (!first)
362 first = op;
363 else if (!operand_equal_p (first, op, 0))
364 return false;
366 return true;
369 /* Find the place of the data-ref in STMT_INFO in the interleaving chain
370 that starts from FIRST_STMT_INFO. Return -1 if the data-ref is not a part
371 of the chain. */
373 int
374 vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
375 stmt_vec_info first_stmt_info)
377 stmt_vec_info next_stmt_info = first_stmt_info;
378 int result = 0;
380 if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
381 return -1;
385 if (next_stmt_info == stmt_info)
386 return result;
387 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
388 if (next_stmt_info)
389 result += DR_GROUP_GAP (next_stmt_info);
391 while (next_stmt_info);
393 return -1;
396 /* Check whether it is possible to load COUNT elements of type ELT_TYPE
397 using the method implemented by duplicate_and_interleave. Return true
398 if so, returning the number of intermediate vectors in *NVECTORS_OUT
399 (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
400 (if nonnull). */
402 bool
403 can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
404 tree elt_type, unsigned int *nvectors_out,
405 tree *vector_type_out,
406 tree *permutes)
408 tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
409 if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
410 return false;
412 machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
413 poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
414 unsigned int nvectors = 1;
415 for (;;)
417 scalar_int_mode int_mode;
418 poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
419 if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
421 /* Get the natural vector type for this SLP group size. */
422 tree int_type = build_nonstandard_integer_type
423 (GET_MODE_BITSIZE (int_mode), 1);
424 tree vector_type
425 = get_vectype_for_scalar_type (vinfo, int_type, count);
426 if (vector_type
427 && VECTOR_MODE_P (TYPE_MODE (vector_type))
428 && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
429 GET_MODE_SIZE (base_vector_mode)))
431 /* Try fusing consecutive sequences of COUNT / NVECTORS elements
432 together into elements of type INT_TYPE and using the result
433 to build NVECTORS vectors. */
434 poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
435 vec_perm_builder sel1 (nelts, 2, 3);
436 vec_perm_builder sel2 (nelts, 2, 3);
437 poly_int64 half_nelts = exact_div (nelts, 2);
438 for (unsigned int i = 0; i < 3; ++i)
440 sel1.quick_push (i);
441 sel1.quick_push (i + nelts);
442 sel2.quick_push (half_nelts + i);
443 sel2.quick_push (half_nelts + i + nelts);
445 vec_perm_indices indices1 (sel1, 2, nelts);
446 vec_perm_indices indices2 (sel2, 2, nelts);
447 machine_mode vmode = TYPE_MODE (vector_type);
448 if (can_vec_perm_const_p (vmode, vmode, indices1)
449 && can_vec_perm_const_p (vmode, vmode, indices2))
451 if (nvectors_out)
452 *nvectors_out = nvectors;
453 if (vector_type_out)
454 *vector_type_out = vector_type;
455 if (permutes)
457 permutes[0] = vect_gen_perm_mask_checked (vector_type,
458 indices1);
459 permutes[1] = vect_gen_perm_mask_checked (vector_type,
460 indices2);
462 return true;
466 if (!multiple_p (elt_bytes, 2, &elt_bytes))
467 return false;
468 nvectors *= 2;
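/* In the loop above SEL1 interleaves the low halves of two input vectors
   ({ 0, nelts, 1, nelts + 1, ... }) and SEL2 the high halves
   ({ half, half + nelts, half + 1, ... }), each encoded as two stepped
   patterns of three elements.  If the required integer mode or either
   permute is unavailable, ELT_BYTES is halved and NVECTORS doubled, i.e.
   the next attempt fuses half as many scalar elements per integer
   element.  */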
472 /* Return true if DTA and DTB match. */
474 static bool
475 vect_def_types_match (enum vect_def_type dta, enum vect_def_type dtb)
477 return (dta == dtb
478 || ((dta == vect_external_def || dta == vect_constant_def)
479 && (dtb == vect_external_def || dtb == vect_constant_def)));
482 static const int cond_expr_maps[3][5] = {
483 { 4, -1, -2, 1, 2 },
484 { 4, -2, -1, 1, 2 },
485 { 4, -1, -2, 2, 1 }
487 static const int arg1_map[] = { 1, 1 };
488 static const int arg2_map[] = { 1, 2 };
489 static const int arg1_arg4_map[] = { 2, 1, 4 };
490 static const int op1_op0_map[] = { 2, 1, 0 };
492 /* For most SLP statements, there is a one-to-one mapping between
493 gimple arguments and child nodes. If that is not true for STMT,
494 return an array that contains:
496 - the number of child nodes, followed by
497 - for each child node, the index of the argument associated with that node.
498 The special index -1 is the first operand of an embedded comparison and
499 the special index -2 is the second operand of an embedded comparison.
501 SWAP is as for vect_get_and_check_slp_defs. */
503 static const int *
504 vect_get_operand_map (const gimple *stmt, unsigned char swap = 0)
506 if (auto assign = dyn_cast<const gassign *> (stmt))
508 if (gimple_assign_rhs_code (assign) == COND_EXPR
509 && COMPARISON_CLASS_P (gimple_assign_rhs1 (assign)))
510 return cond_expr_maps[swap];
511 if (TREE_CODE_CLASS (gimple_assign_rhs_code (assign)) == tcc_comparison
512 && swap)
513 return op1_op0_map;
515 gcc_assert (!swap);
516 if (auto call = dyn_cast<const gcall *> (stmt))
518 if (gimple_call_internal_p (call))
519 switch (gimple_call_internal_fn (call))
521 case IFN_MASK_LOAD:
522 return arg2_map;
524 case IFN_GATHER_LOAD:
525 return arg1_map;
527 case IFN_MASK_GATHER_LOAD:
528 return arg1_arg4_map;
530 default:
531 break;
534 return nullptr;
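/* For example, for IFN_MASK_LOAD the table arg2_map = { 1, 2 } says the
   stmt has a single SLP child and that child corresponds to call argument
   2 (the mask); the remaining call arguments are not represented as SLP
   children and are checked for equality separately (see
   compatible_calls_p).  */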
537 /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
538 they are of a valid type and that they match the defs of the first stmt of
539 the SLP group (stored in OPRNDS_INFO). This function tries to match stmts
540 by swapping operands of STMTS[STMT_NUM] when possible. Non-zero SWAP
541 indicates swap is required for cond_expr stmts. Specifically, SWAP
542 is 1 if STMT is cond and operands of comparison need to be swapped;
543 SWAP is 2 if STMT is cond and code of comparison needs to be inverted.
545 If there was a fatal error return -1; if the error could be corrected by
546 swapping operands of father node of this one, return 1; if everything is
547 ok return 0. */
548 static int
549 vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap,
550 bool *skip_args,
551 vec<stmt_vec_info> stmts, unsigned stmt_num,
552 vec<slp_oprnd_info> *oprnds_info)
554 stmt_vec_info stmt_info = stmts[stmt_num];
555 tree oprnd;
556 unsigned int i, number_of_oprnds;
557 enum vect_def_type dt = vect_uninitialized_def;
558 slp_oprnd_info oprnd_info;
559 unsigned int commutative_op = -1U;
560 bool first = stmt_num == 0;
562 if (!is_a<gcall *> (stmt_info->stmt)
563 && !is_a<gassign *> (stmt_info->stmt)
564 && !is_a<gphi *> (stmt_info->stmt))
565 return -1;
567 number_of_oprnds = gimple_num_args (stmt_info->stmt);
568 const int *map = vect_get_operand_map (stmt_info->stmt, swap);
569 if (map)
570 number_of_oprnds = *map++;
571 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
573 if (gimple_call_internal_p (stmt))
575 internal_fn ifn = gimple_call_internal_fn (stmt);
576 commutative_op = first_commutative_argument (ifn);
579 else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
581 if (commutative_tree_code (gimple_assign_rhs_code (stmt)))
582 commutative_op = 0;
585 bool swapped = (swap != 0);
586 bool backedge = false;
587 enum vect_def_type *dts = XALLOCAVEC (enum vect_def_type, number_of_oprnds);
588 for (i = 0; i < number_of_oprnds; i++)
590 int opno = map ? map[i] : int (i);
591 if (opno < 0)
592 oprnd = TREE_OPERAND (gimple_arg (stmt_info->stmt, 0), -1 - opno);
593 else
595 oprnd = gimple_arg (stmt_info->stmt, opno);
596 if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
597 backedge = dominated_by_p (CDI_DOMINATORS,
598 gimple_phi_arg_edge (stmt, opno)->src,
599 gimple_bb (stmt_info->stmt));
601 if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
602 oprnd = TREE_OPERAND (oprnd, 0);
604 oprnd_info = (*oprnds_info)[i];
606 stmt_vec_info def_stmt_info;
607 if (!vect_is_simple_use (oprnd, vinfo, &dts[i], &def_stmt_info))
609 if (dump_enabled_p ())
610 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
611 "Build SLP failed: can't analyze def for %T\n",
612 oprnd);
614 return -1;
617 if (skip_args[i])
619 oprnd_info->def_stmts.quick_push (NULL);
620 oprnd_info->ops.quick_push (NULL_TREE);
621 oprnd_info->first_dt = vect_uninitialized_def;
622 continue;
625 oprnd_info->def_stmts.quick_push (def_stmt_info);
626 oprnd_info->ops.quick_push (oprnd);
628 if (def_stmt_info
629 && is_pattern_stmt_p (def_stmt_info))
631 if (STMT_VINFO_RELATED_STMT (vect_orig_stmt (def_stmt_info))
632 != def_stmt_info)
633 oprnd_info->any_pattern = true;
634 else
635 /* If we promote this to external use the original stmt def. */
636 oprnd_info->ops.last ()
637 = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
640 /* If there's an extern def on a backedge make sure we can
641 code-generate at the region start.
642 ??? This is another case that could be fixed by adjusting
643 how we split the function but at the moment we'd have conflicting
644 goals there. */
645 if (backedge
646 && dts[i] == vect_external_def
647 && is_a <bb_vec_info> (vinfo)
648 && TREE_CODE (oprnd) == SSA_NAME
649 && !SSA_NAME_IS_DEFAULT_DEF (oprnd)
650 && !dominated_by_p (CDI_DOMINATORS,
651 as_a <bb_vec_info> (vinfo)->bbs[0],
652 gimple_bb (SSA_NAME_DEF_STMT (oprnd))))
654 if (dump_enabled_p ())
655 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
656 "Build SLP failed: extern def %T only defined "
657 "on backedge\n", oprnd);
658 return -1;
661 if (first)
663 tree type = TREE_TYPE (oprnd);
664 dt = dts[i];
665 if ((dt == vect_constant_def
666 || dt == vect_external_def)
667 && !GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
668 && (TREE_CODE (type) == BOOLEAN_TYPE
669 || !can_duplicate_and_interleave_p (vinfo, stmts.length (),
670 type)))
672 if (dump_enabled_p ())
673 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
674 "Build SLP failed: invalid type of def "
675 "for variable-length SLP %T\n", oprnd);
676 return -1;
679 /* For the swapping logic below force vect_reduction_def
680 for the reduction op in a SLP reduction group. */
681 if (!STMT_VINFO_DATA_REF (stmt_info)
682 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
683 && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
684 && def_stmt_info)
685 dts[i] = dt = vect_reduction_def;
687 /* Check the types of the definition. */
688 switch (dt)
690 case vect_external_def:
691 case vect_constant_def:
692 case vect_internal_def:
693 case vect_reduction_def:
694 case vect_induction_def:
695 case vect_nested_cycle:
696 case vect_first_order_recurrence:
697 break;
699 default:
700 /* FORNOW: Not supported. */
701 if (dump_enabled_p ())
702 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
703 "Build SLP failed: illegal type of def %T\n",
704 oprnd);
705 return -1;
708 oprnd_info->first_dt = dt;
709 oprnd_info->first_op_type = type;
712 if (first)
713 return 0;
715 /* Now match the operand definition types to that of the first stmt. */
716 for (i = 0; i < number_of_oprnds;)
718 if (skip_args[i])
720 ++i;
721 continue;
724 oprnd_info = (*oprnds_info)[i];
725 dt = dts[i];
726 stmt_vec_info def_stmt_info = oprnd_info->def_stmts[stmt_num];
727 oprnd = oprnd_info->ops[stmt_num];
728 tree type = TREE_TYPE (oprnd);
730 if (!types_compatible_p (oprnd_info->first_op_type, type))
732 if (dump_enabled_p ())
733 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
734 "Build SLP failed: different operand types\n");
735 return 1;
738 /* Not first stmt of the group, check that the def-stmt/s match
739 the def-stmt/s of the first stmt. Allow different definition
740 types for reduction chains: the first stmt must be a
741 vect_reduction_def (a phi node), and the rest
742 end in the reduction chain. */
743 if ((!vect_def_types_match (oprnd_info->first_dt, dt)
744 && !(oprnd_info->first_dt == vect_reduction_def
745 && !STMT_VINFO_DATA_REF (stmt_info)
746 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
747 && def_stmt_info
748 && !STMT_VINFO_DATA_REF (def_stmt_info)
749 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
750 == REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
751 || (!STMT_VINFO_DATA_REF (stmt_info)
752 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
753 && ((!def_stmt_info
754 || STMT_VINFO_DATA_REF (def_stmt_info)
755 || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
756 != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
757 != (oprnd_info->first_dt != vect_reduction_def))))
759 /* Try swapping operands if we got a mismatch. For BB
760 vectorization only in case it will clearly improve things. */
761 if (i == commutative_op && !swapped
762 && (!is_a <bb_vec_info> (vinfo)
763 || (!vect_def_types_match ((*oprnds_info)[i+1]->first_dt,
764 dts[i+1])
765 && (vect_def_types_match (oprnd_info->first_dt, dts[i+1])
766 || vect_def_types_match
767 ((*oprnds_info)[i+1]->first_dt, dts[i])))))
769 if (dump_enabled_p ())
770 dump_printf_loc (MSG_NOTE, vect_location,
771 "trying swapped operands\n");
772 std::swap (dts[i], dts[i+1]);
773 std::swap ((*oprnds_info)[i]->def_stmts[stmt_num],
774 (*oprnds_info)[i+1]->def_stmts[stmt_num]);
775 std::swap ((*oprnds_info)[i]->ops[stmt_num],
776 (*oprnds_info)[i+1]->ops[stmt_num]);
777 swapped = true;
778 continue;
781 if (is_a <bb_vec_info> (vinfo)
782 && !oprnd_info->any_pattern)
784 /* Now for commutative ops we should see whether we can
785 make the other operand match. */
786 if (dump_enabled_p ())
787 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
788 "treating operand as external\n");
789 oprnd_info->first_dt = dt = vect_external_def;
791 else
793 if (dump_enabled_p ())
794 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
795 "Build SLP failed: different types\n");
796 return 1;
800 /* Make sure to demote the overall operand to external. */
801 if (dt == vect_external_def)
802 oprnd_info->first_dt = vect_external_def;
803 /* For a SLP reduction chain we want to duplicate the reduction to
804 each of the chain members. That gets us a sane SLP graph (still
805 the stmts are not 100% correct wrt the initial values). */
806 else if ((dt == vect_internal_def
807 || dt == vect_reduction_def)
808 && oprnd_info->first_dt == vect_reduction_def
809 && !STMT_VINFO_DATA_REF (stmt_info)
810 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
811 && !STMT_VINFO_DATA_REF (def_stmt_info)
812 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
813 == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
815 oprnd_info->def_stmts[stmt_num] = oprnd_info->def_stmts[0];
816 oprnd_info->ops[stmt_num] = oprnd_info->ops[0];
819 ++i;
822 /* Swap operands. */
823 if (swapped)
825 if (dump_enabled_p ())
826 dump_printf_loc (MSG_NOTE, vect_location,
827 "swapped operands to match def types in %G",
828 stmt_info->stmt);
831 return 0;
834 /* Return true if call statements CALL1 and CALL2 are similar enough
835 to be combined into the same SLP group. */
837 bool
838 compatible_calls_p (gcall *call1, gcall *call2)
840 unsigned int nargs = gimple_call_num_args (call1);
841 if (nargs != gimple_call_num_args (call2))
842 return false;
844 if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
845 return false;
847 if (gimple_call_internal_p (call1))
849 if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
850 TREE_TYPE (gimple_call_lhs (call2))))
851 return false;
852 for (unsigned int i = 0; i < nargs; ++i)
853 if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
854 TREE_TYPE (gimple_call_arg (call2, i))))
855 return false;
857 else
859 if (!operand_equal_p (gimple_call_fn (call1),
860 gimple_call_fn (call2), 0))
861 return false;
863 if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
864 return false;
867 /* Check that any unvectorized arguments are equal. */
868 if (const int *map = vect_get_operand_map (call1))
870 unsigned int nkept = *map++;
871 unsigned int mapi = 0;
872 for (unsigned int i = 0; i < nargs; ++i)
873 if (mapi < nkept && map[mapi] == int (i))
874 mapi += 1;
875 else if (!operand_equal_p (gimple_call_arg (call1, i),
876 gimple_call_arg (call2, i)))
877 return false;
880 return true;
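/* Note that call arguments which vect_get_operand_map does not expose as
   SLP children (e.g. the base pointer and scale of a gather) must be
   operand_equal_p across the whole group, since a single vector call has
   to serve every lane.  */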
883 /* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
884 caller's attempt to find the vector type in STMT_INFO with the narrowest
885 element type. Return true if VECTYPE is nonnull and if it is valid
886 for STMT_INFO. When returning true, update MAX_NUNITS to reflect the
887 number of units in VECTYPE. GROUP_SIZE and MAX_NUNITS are as for
888 vect_build_slp_tree. */
890 static bool
891 vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
892 unsigned int group_size,
893 tree vectype, poly_uint64 *max_nunits)
895 if (!vectype)
897 if (dump_enabled_p ())
898 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
899 "Build SLP failed: unsupported data-type in %G\n",
900 stmt_info->stmt);
901 /* Fatal mismatch. */
902 return false;
905 /* If populating the vector type requires unrolling then fail
906 before adjusting *max_nunits for basic-block vectorization. */
907 if (is_a <bb_vec_info> (vinfo)
908 && !multiple_p (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
910 if (dump_enabled_p ())
911 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
912 "Build SLP failed: unrolling required "
913 "in basic block SLP\n");
914 /* Fatal mismatch. */
915 return false;
918 /* In case of multiple types we need to detect the smallest type. */
919 vect_update_max_nunits (max_nunits, vectype);
920 return true;
923 /* Verify whether the scalar stmts STMTS are isomorphic to each other, do
924 not require data permutation and are not of unsupported types of operation.
925 Return true if so, otherwise return false and indicate in *MATCHES
926 which stmts are not isomorphic to the first one. If MATCHES[0]
927 is false then this indicates the comparison could not be
928 carried out or the stmts will never be vectorized by SLP.
930 Note COND_EXPR is possibly isomorphic to another one after swapping its
931 operands. Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
932 the first stmt by swapping the two operands of comparison; set SWAP[i]
933 to 2 if stmt I is isomorphic to the first stmt by inverting the code
934 of comparison. Take A1 >= B1 ? X1 : Y1 as an example: it can be swapped
935 to (B1 <= A1 ? X1 : Y1); or be inverted to (A1 < B1) ? Y1 : X1. */
937 static bool
938 vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
939 vec<stmt_vec_info> stmts, unsigned int group_size,
940 poly_uint64 *max_nunits, bool *matches,
941 bool *two_operators, tree *node_vectype)
943 unsigned int i;
944 stmt_vec_info first_stmt_info = stmts[0];
945 code_helper first_stmt_code = ERROR_MARK;
946 code_helper alt_stmt_code = ERROR_MARK;
947 code_helper rhs_code = ERROR_MARK;
948 code_helper first_cond_code = ERROR_MARK;
949 tree lhs;
950 bool need_same_oprnds = false;
951 tree vectype = NULL_TREE, first_op1 = NULL_TREE;
952 stmt_vec_info first_load = NULL, prev_first_load = NULL;
953 bool first_stmt_load_p = false, load_p = false;
954 bool first_stmt_phi_p = false, phi_p = false;
955 bool maybe_soft_fail = false;
956 tree soft_fail_nunits_vectype = NULL_TREE;
958 /* For every stmt in NODE find its def stmt/s. */
959 stmt_vec_info stmt_info;
960 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
962 gimple *stmt = stmt_info->stmt;
963 swap[i] = 0;
964 matches[i] = false;
966 if (dump_enabled_p ())
967 dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);
969 /* Fail to vectorize statements marked as unvectorizable, throw
970 or are volatile. */
971 if (!STMT_VINFO_VECTORIZABLE (stmt_info)
972 || stmt_can_throw_internal (cfun, stmt)
973 || gimple_has_volatile_ops (stmt))
975 if (dump_enabled_p ())
976 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
977 "Build SLP failed: unvectorizable statement %G",
978 stmt);
979 /* ??? For BB vectorization we want to commutate operands in a way
980 to shuffle all unvectorizable defs into one operand and have
981 the other still vectorized. The following doesn't reliably
982 work for this though but it's the easiest we can do here. */
983 if (is_a <bb_vec_info> (vinfo) && i != 0)
984 continue;
985 /* Fatal mismatch. */
986 matches[0] = false;
987 return false;
990 lhs = gimple_get_lhs (stmt);
991 if (lhs == NULL_TREE)
993 if (dump_enabled_p ())
994 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
995 "Build SLP failed: not GIMPLE_ASSIGN nor "
996 "GIMPLE_CALL %G", stmt);
997 if (is_a <bb_vec_info> (vinfo) && i != 0)
998 continue;
999 /* Fatal mismatch. */
1000 matches[0] = false;
1001 return false;
1004 tree nunits_vectype;
1005 if (!vect_get_vector_types_for_stmt (vinfo, stmt_info, &vectype,
1006 &nunits_vectype, group_size))
1008 if (is_a <bb_vec_info> (vinfo) && i != 0)
1009 continue;
1010 /* Fatal mismatch. */
1011 matches[0] = false;
1012 return false;
1014 /* Record nunits required but continue analysis, producing matches[]
1015 as if nunits was not an issue. This allows splitting of groups
1016 to happen. */
1017 if (nunits_vectype
1018 && !vect_record_max_nunits (vinfo, stmt_info, group_size,
1019 nunits_vectype, max_nunits))
1021 gcc_assert (is_a <bb_vec_info> (vinfo));
1022 maybe_soft_fail = true;
1023 soft_fail_nunits_vectype = nunits_vectype;
1026 gcc_assert (vectype);
1028 gcall *call_stmt = dyn_cast <gcall *> (stmt);
1029 if (call_stmt)
1031 combined_fn cfn = gimple_call_combined_fn (call_stmt);
1032 if (cfn != CFN_LAST)
1033 rhs_code = cfn;
1034 else
1035 rhs_code = CALL_EXPR;
1037 if (cfn == CFN_MASK_LOAD
1038 || cfn == CFN_GATHER_LOAD
1039 || cfn == CFN_MASK_GATHER_LOAD)
1040 load_p = true;
1041 else if ((internal_fn_p (cfn)
1042 && !vectorizable_internal_fn_p (as_internal_fn (cfn)))
1043 || gimple_call_tail_p (call_stmt)
1044 || gimple_call_noreturn_p (call_stmt)
1045 || gimple_call_chain (call_stmt))
1047 if (dump_enabled_p ())
1048 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1049 "Build SLP failed: unsupported call type %G",
1050 (gimple *) call_stmt);
1051 if (is_a <bb_vec_info> (vinfo) && i != 0)
1052 continue;
1053 /* Fatal mismatch. */
1054 matches[0] = false;
1055 return false;
1058 else if (gimple_code (stmt) == GIMPLE_PHI)
1060 rhs_code = ERROR_MARK;
1061 phi_p = true;
1063 else
1065 rhs_code = gimple_assign_rhs_code (stmt);
1066 load_p = gimple_vuse (stmt);
1069 /* Check the operation. */
1070 if (i == 0)
1072 *node_vectype = vectype;
1073 first_stmt_code = rhs_code;
1074 first_stmt_load_p = load_p;
1075 first_stmt_phi_p = phi_p;
1077 /* Shift arguments should be equal in all the packed stmts for a
1078 vector shift with scalar shift operand. */
1079 if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
1080 || rhs_code == LROTATE_EXPR
1081 || rhs_code == RROTATE_EXPR)
1083 /* First see if we have a vector/vector shift. */
1084 if (!directly_supported_p (rhs_code, vectype, optab_vector))
1086 /* No vector/vector shift, try for a vector/scalar shift. */
1087 if (!directly_supported_p (rhs_code, vectype, optab_scalar))
1089 if (dump_enabled_p ())
1090 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1091 "Build SLP failed: "
1092 "op not supported by target.\n");
1093 if (is_a <bb_vec_info> (vinfo) && i != 0)
1094 continue;
1095 /* Fatal mismatch. */
1096 matches[0] = false;
1097 return false;
1099 need_same_oprnds = true;
1100 first_op1 = gimple_assign_rhs2 (stmt);
1103 else if (rhs_code == WIDEN_LSHIFT_EXPR)
1105 need_same_oprnds = true;
1106 first_op1 = gimple_assign_rhs2 (stmt);
1108 else if (!load_p
1109 && rhs_code == BIT_FIELD_REF)
1111 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
1112 if (!is_a <bb_vec_info> (vinfo)
1113 || TREE_CODE (vec) != SSA_NAME
1114 /* When the element types are not compatible we pun the
1115 source to the target vectype which requires equal size. */
1116 || ((!VECTOR_TYPE_P (TREE_TYPE (vec))
1117 || !types_compatible_p (TREE_TYPE (vectype),
1118 TREE_TYPE (TREE_TYPE (vec))))
1119 && !operand_equal_p (TYPE_SIZE (vectype),
1120 TYPE_SIZE (TREE_TYPE (vec)))))
1122 if (dump_enabled_p ())
1123 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1124 "Build SLP failed: "
1125 "BIT_FIELD_REF not supported\n");
1126 /* Fatal mismatch. */
1127 matches[0] = false;
1128 return false;
1131 else if (rhs_code == CFN_DIV_POW2)
1133 need_same_oprnds = true;
1134 first_op1 = gimple_call_arg (call_stmt, 1);
1137 else
1139 if (first_stmt_code != rhs_code
1140 && alt_stmt_code == ERROR_MARK)
1141 alt_stmt_code = rhs_code;
1142 if ((first_stmt_code != rhs_code
1143 && (first_stmt_code != IMAGPART_EXPR
1144 || rhs_code != REALPART_EXPR)
1145 && (first_stmt_code != REALPART_EXPR
1146 || rhs_code != IMAGPART_EXPR)
1147 /* Handle mismatches in plus/minus by computing both
1148 and merging the results. */
1149 && !((first_stmt_code == PLUS_EXPR
1150 || first_stmt_code == MINUS_EXPR)
1151 && (alt_stmt_code == PLUS_EXPR
1152 || alt_stmt_code == MINUS_EXPR)
1153 && rhs_code == alt_stmt_code)
1154 && !(first_stmt_code.is_tree_code ()
1155 && rhs_code.is_tree_code ()
1156 && (TREE_CODE_CLASS (tree_code (first_stmt_code))
1157 == tcc_comparison)
1158 && (swap_tree_comparison (tree_code (first_stmt_code))
1159 == tree_code (rhs_code)))
1160 && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
1161 && (first_stmt_code == ARRAY_REF
1162 || first_stmt_code == BIT_FIELD_REF
1163 || first_stmt_code == INDIRECT_REF
1164 || first_stmt_code == COMPONENT_REF
1165 || first_stmt_code == MEM_REF)
1166 && (rhs_code == ARRAY_REF
1167 || rhs_code == BIT_FIELD_REF
1168 || rhs_code == INDIRECT_REF
1169 || rhs_code == COMPONENT_REF
1170 || rhs_code == MEM_REF)))
1171 || first_stmt_load_p != load_p
1172 || first_stmt_phi_p != phi_p)
1174 if (dump_enabled_p ())
1176 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1177 "Build SLP failed: different operation "
1178 "in stmt %G", stmt);
1179 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1180 "original stmt %G", first_stmt_info->stmt);
1182 /* Mismatch. */
1183 continue;
1186 if (!load_p
1187 && first_stmt_code == BIT_FIELD_REF
1188 && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
1189 != TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0)))
1191 if (dump_enabled_p ())
1192 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1193 "Build SLP failed: different BIT_FIELD_REF "
1194 "arguments in %G", stmt);
1195 /* Mismatch. */
1196 continue;
1199 if (call_stmt && first_stmt_code != CFN_MASK_LOAD)
1201 if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
1202 call_stmt))
1204 if (dump_enabled_p ())
1205 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1206 "Build SLP failed: different calls in %G",
1207 stmt);
1208 /* Mismatch. */
1209 continue;
1213 if ((phi_p || gimple_could_trap_p (stmt_info->stmt))
1214 && (gimple_bb (first_stmt_info->stmt)
1215 != gimple_bb (stmt_info->stmt)))
1217 if (dump_enabled_p ())
1218 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1219 "Build SLP failed: different BB for PHI "
1220 "or possibly trapping operation in %G", stmt);
1221 /* Mismatch. */
1222 continue;
1225 if (need_same_oprnds)
1227 tree other_op1 = gimple_arg (stmt, 1);
1228 if (!operand_equal_p (first_op1, other_op1, 0))
1230 if (dump_enabled_p ())
1231 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1232 "Build SLP failed: different shift "
1233 "arguments in %G", stmt);
1234 /* Mismatch. */
1235 continue;
1239 if (!types_compatible_p (vectype, *node_vectype))
1241 if (dump_enabled_p ())
1242 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1243 "Build SLP failed: different vector type "
1244 "in %G", stmt);
1245 /* Mismatch. */
1246 continue;
1250 /* Grouped store or load. */
1251 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1253 if (REFERENCE_CLASS_P (lhs))
1255 /* Store. */
1258 else
1260 /* Load. */
1261 first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
1262 if (prev_first_load)
1264 /* Check that there are no loads from different interleaving
1265 chains in the same node. */
1266 if (prev_first_load != first_load)
1268 if (dump_enabled_p ())
1269 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
1270 vect_location,
1271 "Build SLP failed: different "
1272 "interleaving chains in one node %G",
1273 stmt);
1274 /* Mismatch. */
1275 continue;
1278 else
1279 prev_first_load = first_load;
1281 } /* Grouped access. */
1282 else
1284 if (load_p
1285 && rhs_code != CFN_GATHER_LOAD
1286 && rhs_code != CFN_MASK_GATHER_LOAD)
1288 /* Not grouped load. */
1289 if (dump_enabled_p ())
1290 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1291 "Build SLP failed: not grouped load %G", stmt);
1293 /* FORNOW: Not grouped loads are not supported. */
1294 if (is_a <bb_vec_info> (vinfo) && i != 0)
1295 continue;
1296 /* Fatal mismatch. */
1297 matches[0] = false;
1298 return false;
1301 /* Not memory operation. */
1302 if (!phi_p
1303 && rhs_code.is_tree_code ()
1304 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_binary
1305 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_unary
1306 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_expression
1307 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_comparison
1308 && rhs_code != VIEW_CONVERT_EXPR
1309 && rhs_code != CALL_EXPR
1310 && rhs_code != BIT_FIELD_REF)
1312 if (dump_enabled_p ())
1313 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1314 "Build SLP failed: operation unsupported %G",
1315 stmt);
1316 if (is_a <bb_vec_info> (vinfo) && i != 0)
1317 continue;
1318 /* Fatal mismatch. */
1319 matches[0] = false;
1320 return false;
1323 if (rhs_code == COND_EXPR)
1325 tree cond_expr = gimple_assign_rhs1 (stmt);
1326 enum tree_code cond_code = TREE_CODE (cond_expr);
1327 enum tree_code swap_code = ERROR_MARK;
1328 enum tree_code invert_code = ERROR_MARK;
1330 if (i == 0)
1331 first_cond_code = TREE_CODE (cond_expr);
1332 else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1334 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
1335 swap_code = swap_tree_comparison (cond_code);
1336 invert_code = invert_tree_comparison (cond_code, honor_nans);
1339 if (first_cond_code == cond_code)
1341 /* Isomorphic can be achieved by swapping. */
1342 else if (first_cond_code == swap_code)
1343 swap[i] = 1;
1344 /* Isomorphic can be achieved by inverting. */
1345 else if (first_cond_code == invert_code)
1346 swap[i] = 2;
1347 else
1349 if (dump_enabled_p ())
1350 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1351 "Build SLP failed: different"
1352 " operation %G", stmt);
1353 /* Mismatch. */
1354 continue;
1358 if (rhs_code.is_tree_code ()
1359 && TREE_CODE_CLASS ((tree_code)rhs_code) == tcc_comparison
1360 && (swap_tree_comparison ((tree_code)first_stmt_code)
1361 == (tree_code)rhs_code))
1362 swap[i] = 1;
1365 matches[i] = true;
1368 for (i = 0; i < group_size; ++i)
1369 if (!matches[i])
1370 return false;
1372 /* If we allowed a two-operation SLP node verify the target can cope
1373 with the permute we are going to use. */
1374 if (alt_stmt_code != ERROR_MARK
1375 && (!alt_stmt_code.is_tree_code ()
1376 || (TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_reference
1377 && TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_comparison)))
1379 *two_operators = true;
1382 if (maybe_soft_fail)
1384 unsigned HOST_WIDE_INT const_nunits;
1385 if (!TYPE_VECTOR_SUBPARTS
1386 (soft_fail_nunits_vectype).is_constant (&const_nunits)
1387 || const_nunits > group_size)
1388 matches[0] = false;
1389 else
1391 /* With constant vector elements simulate a mismatch at the
1392 point we need to split. */
1393 unsigned tail = group_size & (const_nunits - 1);
1394 memset (&matches[group_size - tail], 0, sizeof (bool) * tail);
1396 return false;
1399 return true;
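/* Note the "soft fail" handling above: when only the number of vector
   units is the problem (a basic-block group needing more lanes than one
   vector provides), matches[] is instead set up to simulate a mismatch at
   a vector-size boundary so the caller can split the group rather than
   discard it entirely.  */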
1402 /* Traits for the hash_set to record failed SLP builds for a stmt set.
1403 Note we never remove entries except at destruction time so we do not
1404 need a special value for deleted that differs from empty. */
1405 struct bst_traits
1407 typedef vec <stmt_vec_info> value_type;
1408 typedef vec <stmt_vec_info> compare_type;
1409 static inline hashval_t hash (value_type);
1410 static inline bool equal (value_type existing, value_type candidate);
1411 static inline bool is_empty (value_type x) { return !x.exists (); }
1412 static inline bool is_deleted (value_type x) { return !x.exists (); }
1413 static const bool empty_zero_p = true;
1414 static inline void mark_empty (value_type &x) { x.release (); }
1415 static inline void mark_deleted (value_type &x) { x.release (); }
1416 static inline void remove (value_type &x) { x.release (); }
1418 inline hashval_t
1419 bst_traits::hash (value_type x)
1421 inchash::hash h;
1422 for (unsigned i = 0; i < x.length (); ++i)
1423 h.add_int (gimple_uid (x[i]->stmt));
1424 return h.end ();
1426 inline bool
1427 bst_traits::equal (value_type existing, value_type candidate)
1429 if (existing.length () != candidate.length ())
1430 return false;
1431 for (unsigned i = 0; i < existing.length (); ++i)
1432 if (existing[i] != candidate[i])
1433 return false;
1434 return true;
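/* These traits back the scalar_stmts_to_slp_tree_map_t used during
   discovery: a candidate group is keyed by its vector of scalar stmts and
   hashed via their gimple UIDs, so re-encountering the same stmts in the
   same lane order reuses the previously built (or previously failed) SLP
   node instead of recursing again.  */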
1437 /* ??? This was std::pair<std::pair<tree_code, vect_def_type>, tree>
1438 but then vec::insert does memmove and that's not compatible with
1439 std::pair. */
1440 struct chain_op_t
1442 chain_op_t (tree_code code_, vect_def_type dt_, tree op_)
1443 : code (code_), dt (dt_), op (op_) {}
1444 tree_code code;
1445 vect_def_type dt;
1446 tree op;
1449 /* Comparator for sorting associatable chains. */
1451 static int
1452 dt_sort_cmp (const void *op1_, const void *op2_, void *)
1454 auto *op1 = (const chain_op_t *) op1_;
1455 auto *op2 = (const chain_op_t *) op2_;
1456 if (op1->dt != op2->dt)
1457 return (int)op1->dt - (int)op2->dt;
1458 return (int)op1->code - (int)op2->code;
1461 /* Linearize the associatable expression chain at START with the
1462 associatable operation CODE (where PLUS_EXPR also allows MINUS_EXPR),
1463 filling CHAIN with the result and using WORKLIST as intermediate storage.
1464 CODE_STMT and ALT_CODE_STMT are filled with the first stmt using CODE
1465 or MINUS_EXPR. *CHAIN_STMTS if not NULL is filled with all computation
1466 stmts, starting with START. */
1468 static void
1469 vect_slp_linearize_chain (vec_info *vinfo,
1470 vec<std::pair<tree_code, gimple *> > &worklist,
1471 vec<chain_op_t> &chain,
1472 enum tree_code code, gimple *start,
1473 gimple *&code_stmt, gimple *&alt_code_stmt,
1474 vec<gimple *> *chain_stmts)
1476 /* For each lane linearize the addition/subtraction (or other
1477 uniform associatable operation) expression tree. */
1478 worklist.safe_push (std::make_pair (code, start));
1479 while (!worklist.is_empty ())
1481 auto entry = worklist.pop ();
1482 gassign *stmt = as_a <gassign *> (entry.second);
1483 enum tree_code in_code = entry.first;
1484 enum tree_code this_code = gimple_assign_rhs_code (stmt);
1485 /* Pick some stmts suitable for SLP_TREE_REPRESENTATIVE. */
1486 if (!code_stmt
1487 && gimple_assign_rhs_code (stmt) == code)
1488 code_stmt = stmt;
1489 else if (!alt_code_stmt
1490 && gimple_assign_rhs_code (stmt) == MINUS_EXPR)
1491 alt_code_stmt = stmt;
1492 if (chain_stmts)
1493 chain_stmts->safe_push (stmt);
1494 for (unsigned opnum = 1; opnum <= 2; ++opnum)
1496 tree op = gimple_op (stmt, opnum);
1497 vect_def_type dt;
1498 stmt_vec_info def_stmt_info;
1499 bool res = vect_is_simple_use (op, vinfo, &dt, &def_stmt_info);
1500 gcc_assert (res);
1501 if (dt == vect_internal_def
1502 && is_pattern_stmt_p (def_stmt_info))
1503 op = gimple_get_lhs (def_stmt_info->stmt);
1504 gimple *use_stmt;
1505 use_operand_p use_p;
1506 if (dt == vect_internal_def
1507 && single_imm_use (op, &use_p, &use_stmt)
1508 && is_gimple_assign (def_stmt_info->stmt)
1509 && (gimple_assign_rhs_code (def_stmt_info->stmt) == code
1510 || (code == PLUS_EXPR
1511 && (gimple_assign_rhs_code (def_stmt_info->stmt)
1512 == MINUS_EXPR))))
1514 tree_code op_def_code = this_code;
1515 if (op_def_code == MINUS_EXPR && opnum == 1)
1516 op_def_code = PLUS_EXPR;
1517 if (in_code == MINUS_EXPR)
1518 op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
1519 worklist.safe_push (std::make_pair (op_def_code,
1520 def_stmt_info->stmt));
1522 else
1524 tree_code op_def_code = this_code;
1525 if (op_def_code == MINUS_EXPR && opnum == 1)
1526 op_def_code = PLUS_EXPR;
1527 if (in_code == MINUS_EXPR)
1528 op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
1529 chain.safe_push (chain_op_t (op_def_code, dt, op));
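/* As an example, a lane computing r = (x - y) + z is linearized with
   CODE == PLUS_EXPR into the chain { +z, +x, -y }: subtraction is turned
   into per-operand PLUS/MINUS codes, and reaching a subexpression through
   the second operand of a minus flips the signs recorded below it.  */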
1535 typedef hash_map <vec <stmt_vec_info>, slp_tree,
1536 simple_hashmap_traits <bst_traits, slp_tree> >
1537 scalar_stmts_to_slp_tree_map_t;
1539 static slp_tree
1540 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1541 vec<stmt_vec_info> stmts, unsigned int group_size,
1542 poly_uint64 *max_nunits,
1543 bool *matches, unsigned *limit, unsigned *tree_size,
1544 scalar_stmts_to_slp_tree_map_t *bst_map);
1546 static slp_tree
1547 vect_build_slp_tree (vec_info *vinfo,
1548 vec<stmt_vec_info> stmts, unsigned int group_size,
1549 poly_uint64 *max_nunits,
1550 bool *matches, unsigned *limit, unsigned *tree_size,
1551 scalar_stmts_to_slp_tree_map_t *bst_map)
1553 if (slp_tree *leader = bst_map->get (stmts))
1555 if (dump_enabled_p ())
1556 dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
1557 !(*leader)->failed ? "" : "failed ",
1558 (void *) *leader);
1559 if (!(*leader)->failed)
1561 SLP_TREE_REF_COUNT (*leader)++;
1562 vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
1563 stmts.release ();
1564 return *leader;
1566 memcpy (matches, (*leader)->failed, sizeof (bool) * group_size);
1567 return NULL;
1570 /* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
1571 so we can pick up backedge destinations during discovery. */
1572 slp_tree res = new _slp_tree;
1573 SLP_TREE_DEF_TYPE (res) = vect_internal_def;
1574 SLP_TREE_SCALAR_STMTS (res) = stmts;
1575 bst_map->put (stmts.copy (), res);
1577 if (*limit == 0)
1579 if (dump_enabled_p ())
1580 dump_printf_loc (MSG_NOTE, vect_location,
1581 "SLP discovery limit exceeded\n");
1582 /* Mark the node invalid so we can detect those when still in use
1583 as backedge destinations. */
1584 SLP_TREE_SCALAR_STMTS (res) = vNULL;
1585 SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1586 res->failed = XNEWVEC (bool, group_size);
1587 memset (res->failed, 0, sizeof (bool) * group_size);
1588 memset (matches, 0, sizeof (bool) * group_size);
1589 return NULL;
1591 --*limit;
1593 if (dump_enabled_p ())
1594 dump_printf_loc (MSG_NOTE, vect_location,
1595 "starting SLP discovery for node %p\n", (void *) res);
1597 poly_uint64 this_max_nunits = 1;
1598 slp_tree res_ = vect_build_slp_tree_2 (vinfo, res, stmts, group_size,
1599 &this_max_nunits,
1600 matches, limit, tree_size, bst_map);
1601 if (!res_)
1603 if (dump_enabled_p ())
1604 dump_printf_loc (MSG_NOTE, vect_location,
1605 "SLP discovery for node %p failed\n", (void *) res);
1606 /* Mark the node invalid so we can detect those when still in use
1607 as backedge destinations. */
1608 SLP_TREE_SCALAR_STMTS (res) = vNULL;
1609 SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1610 res->failed = XNEWVEC (bool, group_size);
1611 if (flag_checking)
1613 unsigned i;
1614 for (i = 0; i < group_size; ++i)
1615 if (!matches[i])
1616 break;
1617 gcc_assert (i < group_size);
1619 memcpy (res->failed, matches, sizeof (bool) * group_size);
1621 else
1623 if (dump_enabled_p ())
1624 dump_printf_loc (MSG_NOTE, vect_location,
1625 "SLP discovery for node %p succeeded\n",
1626 (void *) res);
1627 gcc_assert (res_ == res);
1628 res->max_nunits = this_max_nunits;
1629 vect_update_max_nunits (max_nunits, this_max_nunits);
1630 /* Keep a reference for the bst_map use. */
1631 SLP_TREE_REF_COUNT (res)++;
1633 return res_;
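/* Both outcomes are cached in BST_MAP: a successful build leaves the node
   there with an extra reference, while a failure (or hitting the discovery
   limit) keeps a stub whose ->failed array records the matches[] state so
   a later attempt on the same stmt group can replay the failure without
   redoing the recursion.  */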
1636 /* Helper for building an associated SLP node chain. */
1638 static void
1639 vect_slp_build_two_operator_nodes (slp_tree perm, tree vectype,
1640 slp_tree op0, slp_tree op1,
1641 stmt_vec_info oper1, stmt_vec_info oper2,
1642 vec<std::pair<unsigned, unsigned> > lperm)
1644 unsigned group_size = SLP_TREE_LANES (op1);
1646 slp_tree child1 = new _slp_tree;
1647 SLP_TREE_DEF_TYPE (child1) = vect_internal_def;
1648 SLP_TREE_VECTYPE (child1) = vectype;
1649 SLP_TREE_LANES (child1) = group_size;
1650 SLP_TREE_CHILDREN (child1).create (2);
1651 SLP_TREE_CHILDREN (child1).quick_push (op0);
1652 SLP_TREE_CHILDREN (child1).quick_push (op1);
1653 SLP_TREE_REPRESENTATIVE (child1) = oper1;
1655 slp_tree child2 = new _slp_tree;
1656 SLP_TREE_DEF_TYPE (child2) = vect_internal_def;
1657 SLP_TREE_VECTYPE (child2) = vectype;
1658 SLP_TREE_LANES (child2) = group_size;
1659 SLP_TREE_CHILDREN (child2).create (2);
1660 SLP_TREE_CHILDREN (child2).quick_push (op0);
1661 SLP_TREE_REF_COUNT (op0)++;
1662 SLP_TREE_CHILDREN (child2).quick_push (op1);
1663 SLP_TREE_REF_COUNT (op1)++;
1664 SLP_TREE_REPRESENTATIVE (child2) = oper2;
1666 SLP_TREE_DEF_TYPE (perm) = vect_internal_def;
1667 SLP_TREE_CODE (perm) = VEC_PERM_EXPR;
1668 SLP_TREE_VECTYPE (perm) = vectype;
1669 SLP_TREE_LANES (perm) = group_size;
1670 /* ??? We should set this NULL but that's not expected. */
1671 SLP_TREE_REPRESENTATIVE (perm) = oper1;
1672 SLP_TREE_LANE_PERMUTATION (perm) = lperm;
1673 SLP_TREE_CHILDREN (perm).quick_push (child1);
1674 SLP_TREE_CHILDREN (perm).quick_push (child2);
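/* The resulting subgraph computes both operations over all lanes: CHILD1
   and CHILD2 share the operand nodes OP0/OP1 but use OPER1 resp. OPER2 as
   representative, and PERM is a VEC_PERM_EXPR whose lane permutation LPERM
   selects, per lane, the result of whichever child performs that lane's
   original operation.  */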
1677 /* Recursively build an SLP tree starting from NODE.
1678 Fail (and return a value not equal to zero) if def-stmts are not
1679 isomorphic, require data permutation or are of unsupported types of
1680 operation. Otherwise, return 0.
1681 The value returned is the depth in the SLP tree where a mismatch
1682 was found. */
1684 static slp_tree
1685 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1686 vec<stmt_vec_info> stmts, unsigned int group_size,
1687 poly_uint64 *max_nunits,
1688 bool *matches, unsigned *limit, unsigned *tree_size,
1689 scalar_stmts_to_slp_tree_map_t *bst_map)
1691 unsigned nops, i, this_tree_size = 0;
1692 poly_uint64 this_max_nunits = *max_nunits;
1694 matches[0] = false;
1696 stmt_vec_info stmt_info = stmts[0];
1697 if (!is_a<gcall *> (stmt_info->stmt)
1698 && !is_a<gassign *> (stmt_info->stmt)
1699 && !is_a<gphi *> (stmt_info->stmt))
1700 return NULL;
1702 nops = gimple_num_args (stmt_info->stmt);
1703 if (const int *map = vect_get_operand_map (stmt_info->stmt))
1704 nops = map[0];
1706 /* If the SLP node is a PHI (induction or reduction), terminate
1707 the recursion. */
1708 bool *skip_args = XALLOCAVEC (bool, nops);
1709 memset (skip_args, 0, sizeof (bool) * nops);
1710 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
1711 if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
1713 tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
1714 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
1715 group_size);
1716 if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
1717 max_nunits))
1718 return NULL;
1720 vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
1721 if (def_type == vect_induction_def)
1723 /* Induction PHIs are not cycles but walk the initial
1724 value. Only for inner loops though, for outer loops
1725 we need to pick up the value from the actual PHIs
1726 to more easily support peeling and epilogue vectorization. */
1727 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1728 if (!nested_in_vect_loop_p (loop, stmt_info))
1729 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1730 else
1731 loop = loop->inner;
1732 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1734 else if (def_type == vect_reduction_def
1735 || def_type == vect_double_reduction_def
1736 || def_type == vect_nested_cycle
1737 || def_type == vect_first_order_recurrence)
1739 /* Else def types have to match. */
1740 stmt_vec_info other_info;
1741 bool all_same = true;
1742 FOR_EACH_VEC_ELT (stmts, i, other_info)
1744 if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
1745 return NULL;
1746 if (other_info != stmt_info)
1747 all_same = false;
1749 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1750 /* Reduction initial values are not explicitly represented. */
1751 if (def_type != vect_first_order_recurrence
1752 && !nested_in_vect_loop_p (loop, stmt_info))
1753 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1754 /* Reduction chain backedge defs are filled manually.
1755 ??? Need a better way to identify a SLP reduction chain PHI.
1756 Or a better overall way to SLP match those. */
1757 if (all_same && def_type == vect_reduction_def)
1758 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1760 else if (def_type != vect_internal_def)
1761 return NULL;
1765 bool two_operators = false;
1766 unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
1767 tree vectype = NULL_TREE;
1768 if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
1769 &this_max_nunits, matches, &two_operators,
1770 &vectype))
1771 return NULL;
1773 /* If the SLP node is a load, terminate the recursion unless masked. */
1774 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1775 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1777 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
1778 gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD)
1779 || gimple_call_internal_p (stmt, IFN_GATHER_LOAD)
1780 || gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD));
1781 else
1783 *max_nunits = this_max_nunits;
1784 (*tree_size)++;
1785 node = vect_create_new_slp_node (node, stmts, 0);
1786 SLP_TREE_VECTYPE (node) = vectype;
1787 /* And compute the load permutation. Whether it is actually
1788 a permutation depends on the unrolling factor which is
1789 decided later. */
1790 vec<unsigned> load_permutation;
1791 int j;
1792 stmt_vec_info load_info;
1793 load_permutation.create (group_size);
1794 stmt_vec_info first_stmt_info
1795 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
1796 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
1798 int load_place = vect_get_place_in_interleaving_chain
1799 (load_info, first_stmt_info);
1800 gcc_assert (load_place != -1);
1801 load_permutation.safe_push (load_place);
1803 SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
1804 return node;
1807 else if (gimple_assign_single_p (stmt_info->stmt)
1808 && !gimple_vuse (stmt_info->stmt)
1809 && gimple_assign_rhs_code (stmt_info->stmt) == BIT_FIELD_REF)
1811 /* vect_build_slp_tree_2 determined all BIT_FIELD_REFs reference
1812 the same SSA name vector of a compatible type to vectype. */
1813 vec<std::pair<unsigned, unsigned> > lperm = vNULL;
1814 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0);
1815 stmt_vec_info estmt_info;
1816 FOR_EACH_VEC_ELT (stmts, i, estmt_info)
1818 gassign *estmt = as_a <gassign *> (estmt_info->stmt);
1819 tree bfref = gimple_assign_rhs1 (estmt);
1820 HOST_WIDE_INT lane;
1821 if (!known_eq (bit_field_size (bfref),
1822 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype))))
1823 || !constant_multiple_p (bit_field_offset (bfref),
1824 bit_field_size (bfref), &lane))
1826 lperm.release ();
1827 matches[0] = false;
1828 return NULL;
1830 lperm.safe_push (std::make_pair (0, (unsigned)lane));
1832 slp_tree vnode = vect_create_new_slp_node (vNULL);
1833 if (operand_equal_p (TYPE_SIZE (vectype), TYPE_SIZE (TREE_TYPE (vec))))
1834 /* ??? We record vectype here but hide the eventually necessary
1835 punning, instead relying on code generation to materialize
1836 VIEW_CONVERT_EXPRs as needed. We should instead make
1837 this explicit somehow. */
1838 SLP_TREE_VECTYPE (vnode) = vectype;
1839 else
1841 /* For different size but compatible elements we can still
1842 use VEC_PERM_EXPR without punning. */
1843 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (vec))
1844 && types_compatible_p (TREE_TYPE (vectype),
1845 TREE_TYPE (TREE_TYPE (vec))));
1846 SLP_TREE_VECTYPE (vnode) = TREE_TYPE (vec);
1848 auto nunits = TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (vnode));
1849 unsigned HOST_WIDE_INT const_nunits;
1850 if (nunits.is_constant (&const_nunits))
1851 SLP_TREE_LANES (vnode) = const_nunits;
1852 SLP_TREE_VEC_DEFS (vnode).safe_push (vec);
1853 /* We are always building a permutation node even if it is an identity
1854 permute to shield the rest of the vectorizer from the odd node
1855 representing an actual vector without any scalar ops.
1856 ??? We could hide it completely by making the permute node
1857 external? */
1858 node = vect_create_new_slp_node (node, stmts, 1);
1859 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
1860 SLP_TREE_LANE_PERMUTATION (node) = lperm;
1861 SLP_TREE_VECTYPE (node) = vectype;
1862 SLP_TREE_CHILDREN (node).quick_push (vnode);
1863 return node;
1865 /* When discovery reaches an associatable operation see whether we can
1866 improve that to match up lanes in a way superior to the operand
1867 swapping code which at most looks at two defs.
1868 ??? For BB vectorization we cannot do the brute-force search
1869 for matching as we can succeed by means of builds from scalars
1870 and have no good way to "cost" one build against another. */
1871 else if (is_a <loop_vec_info> (vinfo)
1872 /* ??? We don't handle !vect_internal_def defs below. */
1873 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
1874 && is_gimple_assign (stmt_info->stmt)
1875 && (associative_tree_code (gimple_assign_rhs_code (stmt_info->stmt))
1876 || gimple_assign_rhs_code (stmt_info->stmt) == MINUS_EXPR)
1877 && ((FLOAT_TYPE_P (vectype) && flag_associative_math)
1878 || (INTEGRAL_TYPE_P (TREE_TYPE (vectype))
1879 && TYPE_OVERFLOW_WRAPS (TREE_TYPE (vectype)))))
1881 /* See if we have a chain of (mixed) adds or subtracts or other
1882 associatable ops. */
1883 enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
1884 if (code == MINUS_EXPR)
1885 code = PLUS_EXPR;
1886 stmt_vec_info other_op_stmt_info = NULL;
1887 stmt_vec_info op_stmt_info = NULL;
1888 unsigned chain_len = 0;
1889 auto_vec<chain_op_t> chain;
1890 auto_vec<std::pair<tree_code, gimple *> > worklist;
1891 auto_vec<vec<chain_op_t> > chains (group_size);
1892 auto_vec<slp_tree, 4> children;
1893 bool hard_fail = true;
1894 for (unsigned lane = 0; lane < group_size; ++lane)
1896 /* For each lane linearize the addition/subtraction (or other
1897 uniform associatable operation) expression tree. */
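/* As a rough example, a lane computing (a - b) + c is linearized to the
chain { +a, -b, +c } with CODE == PLUS_EXPR. */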
1898 gimple *op_stmt = NULL, *other_op_stmt = NULL;
1899 vect_slp_linearize_chain (vinfo, worklist, chain, code,
1900 stmts[lane]->stmt, op_stmt, other_op_stmt,
1901 NULL);
1902 if (!op_stmt_info && op_stmt)
1903 op_stmt_info = vinfo->lookup_stmt (op_stmt);
1904 if (!other_op_stmt_info && other_op_stmt)
1905 other_op_stmt_info = vinfo->lookup_stmt (other_op_stmt);
1906 if (chain.length () == 2)
1908 /* In a chain of just two elements resort to the regular
1909 operand swapping scheme. If we run into a length
1910 mismatch still hard-FAIL. */
1911 if (chain_len == 0)
1912 hard_fail = false;
1913 else
1915 matches[lane] = false;
1916 /* ??? We might want to process the other lanes, but
1917 make sure to not give false matching hints to the
1918 caller for lanes we did not process. */
1919 if (lane != group_size - 1)
1920 matches[0] = false;
1922 break;
1924 else if (chain_len == 0)
1925 chain_len = chain.length ();
1926 else if (chain.length () != chain_len)
1928 /* ??? Here we could slip in magic to compensate with
1929 neutral operands. */
1930 matches[lane] = false;
1931 if (lane != group_size - 1)
1932 matches[0] = false;
1933 break;
1935 chains.quick_push (chain.copy ());
1936 chain.truncate (0);
1938 if (chains.length () == group_size)
1940 /* We cannot yet use SLP_TREE_CODE to communicate the operation. */
1941 if (!op_stmt_info)
1943 hard_fail = false;
1944 goto out;
1946 /* Now we have a set of chains with the same length. */
1947 /* 1. pre-sort according to def_type and operation. */
1948 for (unsigned lane = 0; lane < group_size; ++lane)
1949 chains[lane].stablesort (dt_sort_cmp, vinfo);
1950 if (dump_enabled_p ())
1952 dump_printf_loc (MSG_NOTE, vect_location,
1953 "pre-sorted chains of %s\n",
1954 get_tree_code_name (code));
1955 for (unsigned lane = 0; lane < group_size; ++lane)
1957 for (unsigned opnum = 0; opnum < chain_len; ++opnum)
1958 dump_printf (MSG_NOTE, "%s %T ",
1959 get_tree_code_name (chains[lane][opnum].code),
1960 chains[lane][opnum].op);
1961 dump_printf (MSG_NOTE, "\n");
1964 /* 2. try to build children nodes, associating as necessary. */
1965 for (unsigned n = 0; n < chain_len; ++n)
1967 vect_def_type dt = chains[0][n].dt;
1968 unsigned lane;
1969 for (lane = 0; lane < group_size; ++lane)
1970 if (chains[lane][n].dt != dt)
1972 if (dt == vect_constant_def
1973 && chains[lane][n].dt == vect_external_def)
1974 dt = vect_external_def;
1975 else if (dt == vect_external_def
1976 && chains[lane][n].dt == vect_constant_def)
1978 else
1979 break;
1981 if (lane != group_size)
1983 if (dump_enabled_p ())
1984 dump_printf_loc (MSG_NOTE, vect_location,
1985 "giving up on chain due to mismatched "
1986 "def types\n");
1987 matches[lane] = false;
1988 if (lane != group_size - 1)
1989 matches[0] = false;
1990 goto out;
1992 if (dt == vect_constant_def
1993 || dt == vect_external_def)
1995 /* Check whether we can build the invariant. If we can't,
1996 we never will be able to. */
1997 tree type = TREE_TYPE (chains[0][n].op);
1998 if (!GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
1999 && (TREE_CODE (type) == BOOLEAN_TYPE
2000 || !can_duplicate_and_interleave_p (vinfo, group_size,
2001 type)))
2003 matches[0] = false;
2004 goto out;
2006 vec<tree> ops;
2007 ops.create (group_size);
2008 for (lane = 0; lane < group_size; ++lane)
2009 ops.quick_push (chains[lane][n].op);
2010 slp_tree child = vect_create_new_slp_node (ops);
2011 SLP_TREE_DEF_TYPE (child) = dt;
2012 children.safe_push (child);
2014 else if (dt != vect_internal_def)
2016 /* Not sure; we might need something special.
2017 gcc.dg/vect/pr96854.c,
2018 gfortran.dg/vect/fast-math-pr37021.f90
2019 and gfortran.dg/vect/pr61171.f trigger. */
2020 /* Soft-fail for now. */
2021 hard_fail = false;
2022 goto out;
2024 else
2026 vec<stmt_vec_info> op_stmts;
2027 op_stmts.create (group_size);
2028 slp_tree child = NULL;
2029 /* Brute-force our way. We have to consider a lane
2030 failing after fixing an earlier fail up in the
2031 SLP discovery recursion. So track the current
2032 permute per lane. */
2033 unsigned *perms = XALLOCAVEC (unsigned, group_size);
2034 memset (perms, 0, sizeof (unsigned) * group_size);
2037 op_stmts.truncate (0);
2038 for (lane = 0; lane < group_size; ++lane)
2039 op_stmts.quick_push
2040 (vinfo->lookup_def (chains[lane][n].op));
2041 child = vect_build_slp_tree (vinfo, op_stmts,
2042 group_size, &this_max_nunits,
2043 matches, limit,
2044 &this_tree_size, bst_map);
2045 /* ??? We're likely getting too many fatal mismatches
2046 here so maybe we want to ignore them (but then we
2047 have no idea which lanes fatally mismatched). */
2048 if (child || !matches[0])
2049 break;
2050 /* Swap another lane we have not yet matched up into
2051 lanes that did not match. If we run out of
2052 permute possibilities for a lane terminate the
2053 search. */
2054 bool term = false;
2055 for (lane = 1; lane < group_size; ++lane)
2056 if (!matches[lane])
2058 if (n + perms[lane] + 1 == chain_len)
2060 term = true;
2061 break;
2063 std::swap (chains[lane][n],
2064 chains[lane][n + perms[lane] + 1]);
2065 perms[lane]++;
2067 if (term)
2068 break;
2070 while (1);
2071 if (!child)
2073 if (dump_enabled_p ())
2074 dump_printf_loc (MSG_NOTE, vect_location,
2075 "failed to match up op %d\n", n);
2076 op_stmts.release ();
2077 if (lane != group_size - 1)
2078 matches[0] = false;
2079 else
2080 matches[lane] = false;
2081 goto out;
2083 if (dump_enabled_p ())
2085 dump_printf_loc (MSG_NOTE, vect_location,
2086 "matched up op %d to\n", n);
2087 vect_print_slp_tree (MSG_NOTE, vect_location, child);
2089 children.safe_push (child);
2092 /* 3. build SLP nodes to combine the chain. */
2093 for (unsigned lane = 0; lane < group_size; ++lane)
2094 if (chains[lane][0].code != code)
2096 /* See if there's any alternate all-PLUS entry. */
2097 unsigned n;
2098 for (n = 1; n < chain_len; ++n)
2100 for (lane = 0; lane < group_size; ++lane)
2101 if (chains[lane][n].code != code)
2102 break;
2103 if (lane == group_size)
2104 break;
2106 if (n != chain_len)
2108 /* Swap that in at first position. */
2109 std::swap (children[0], children[n]);
2110 for (lane = 0; lane < group_size; ++lane)
2111 std::swap (chains[lane][0], chains[lane][n]);
2113 else
2115 /* ??? When this triggers and we end up with two
2116 vect_constant/external_def operands up front, things break
2117 (ICE) spectacularly when finding an insertion place for the
2118 all-constant op. We should have a fully
2119 vect_internal_def operand though(?) so we can swap
2120 that into first place and then prepend the all-zero
2121 constant. */
2122 if (dump_enabled_p ())
2123 dump_printf_loc (MSG_NOTE, vect_location,
2124 "inserting constant zero to compensate "
2125 "for (partially) negated first "
2126 "operand\n");
2127 chain_len++;
2128 for (lane = 0; lane < group_size; ++lane)
2129 chains[lane].safe_insert
2130 (0, chain_op_t (code, vect_constant_def, NULL_TREE));
2131 vec<tree> zero_ops;
2132 zero_ops.create (group_size);
2133 zero_ops.quick_push (build_zero_cst (TREE_TYPE (vectype)));
2134 for (lane = 1; lane < group_size; ++lane)
2135 zero_ops.quick_push (zero_ops[0]);
2136 slp_tree zero = vect_create_new_slp_node (zero_ops);
2137 SLP_TREE_DEF_TYPE (zero) = vect_constant_def;
2138 children.safe_insert (0, zero);
2140 break;
2142 for (unsigned i = 1; i < children.length (); ++i)
2144 slp_tree op0 = children[i - 1];
2145 slp_tree op1 = children[i];
2146 bool this_two_op = false;
2147 for (unsigned lane = 0; lane < group_size; ++lane)
2148 if (chains[lane][i].code != chains[0][i].code)
2150 this_two_op = true;
2151 break;
2153 slp_tree child;
2154 if (i == children.length () - 1)
2155 child = vect_create_new_slp_node (node, stmts, 2);
2156 else
2157 child = vect_create_new_slp_node (2, ERROR_MARK);
2158 if (this_two_op)
2160 vec<std::pair<unsigned, unsigned> > lperm;
2161 lperm.create (group_size);
2162 for (unsigned lane = 0; lane < group_size; ++lane)
2163 lperm.quick_push (std::make_pair
2164 (chains[lane][i].code != chains[0][i].code, lane));
2165 vect_slp_build_two_operator_nodes (child, vectype, op0, op1,
2166 (chains[0][i].code == code
2167 ? op_stmt_info
2168 : other_op_stmt_info),
2169 (chains[0][i].code == code
2170 ? other_op_stmt_info
2171 : op_stmt_info),
2172 lperm);
2174 else
2176 SLP_TREE_DEF_TYPE (child) = vect_internal_def;
2177 SLP_TREE_VECTYPE (child) = vectype;
2178 SLP_TREE_LANES (child) = group_size;
2179 SLP_TREE_CHILDREN (child).quick_push (op0);
2180 SLP_TREE_CHILDREN (child).quick_push (op1);
2181 SLP_TREE_REPRESENTATIVE (child)
2182 = (chains[0][i].code == code
2183 ? op_stmt_info : other_op_stmt_info);
2185 children[i] = child;
2187 *tree_size += this_tree_size + 1;
2188 *max_nunits = this_max_nunits;
2189 while (!chains.is_empty ())
2190 chains.pop ().release ();
2191 return node;
2193 out:
2194 while (!children.is_empty ())
2195 vect_free_slp_tree (children.pop ());
2196 while (!chains.is_empty ())
2197 chains.pop ().release ();
2198 /* Hard-fail, otherwise we might run into quadratic processing of the
2199 chains by starting again one stmt into the chain. */
2200 if (hard_fail)
2201 return NULL;
2202 /* Fall thru to normal processing. */
2205 /* Get at the operands, verifying they are compatible. */
2206 vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
2207 slp_oprnd_info oprnd_info;
2208 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
2210 int res = vect_get_and_check_slp_defs (vinfo, swap[i], skip_args,
2211 stmts, i, &oprnds_info);
2212 if (res != 0)
2213 matches[(res == -1) ? 0 : i] = false;
2214 if (!matches[0])
2215 break;
2217 for (i = 0; i < group_size; ++i)
2218 if (!matches[i])
2220 vect_free_oprnd_info (oprnds_info);
2221 return NULL;
2223 swap = NULL;
2225 auto_vec<slp_tree, 4> children;
2227 stmt_info = stmts[0];
2229 /* Create SLP_TREE nodes for the definition node/s. */
2230 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
2232 slp_tree child;
2233 unsigned int j;
2235 /* We're skipping certain operands from processing, for example
2236 outer loop reduction initial defs. */
2237 if (skip_args[i])
2239 children.safe_push (NULL);
2240 continue;
2243 if (oprnd_info->first_dt == vect_uninitialized_def)
2245 /* COND_EXPRs eventually have one operand too many if the condition
2246 is an SSA name. */
2247 gcc_assert (i == 3 && nops == 4);
2248 continue;
2251 if (is_a <bb_vec_info> (vinfo)
2252 && oprnd_info->first_dt == vect_internal_def
2253 && !oprnd_info->any_pattern)
2255 /* For BB vectorization, if all defs are the same do not
2256 bother to continue the build along the single-lane
2257 graph but use a splat of the scalar value. */
2258 stmt_vec_info first_def = oprnd_info->def_stmts[0];
2259 for (j = 1; j < group_size; ++j)
2260 if (oprnd_info->def_stmts[j] != first_def)
2261 break;
2262 if (j == group_size
2263 /* But avoid doing this for loads where we may be
2264 able to CSE things, unless the stmt is not
2265 vectorizable. */
2266 && (!STMT_VINFO_VECTORIZABLE (first_def)
2267 || !gimple_vuse (first_def->stmt)))
2269 if (dump_enabled_p ())
2270 dump_printf_loc (MSG_NOTE, vect_location,
2271 "Using a splat of the uniform operand %G",
2272 first_def->stmt);
2273 oprnd_info->first_dt = vect_external_def;
2277 if (oprnd_info->first_dt == vect_external_def
2278 || oprnd_info->first_dt == vect_constant_def)
2280 slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
2281 SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
2282 oprnd_info->ops = vNULL;
2283 children.safe_push (invnode);
2284 continue;
2287 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2288 group_size, &this_max_nunits,
2289 matches, limit,
2290 &this_tree_size, bst_map)) != NULL)
2292 oprnd_info->def_stmts = vNULL;
2293 children.safe_push (child);
2294 continue;
2297 /* If the SLP build for operand zero failed and operand zero
2298 and one can be commutated try that for the scalar stmts
2299 that failed the match. */
2300 if (i == 0
2301 /* A first scalar stmt mismatch signals a fatal mismatch. */
2302 && matches[0]
2303 /* ??? For COND_EXPRs we can swap the comparison operands
2304 as well as the arms under some constraints. */
2305 && nops == 2
2306 && oprnds_info[1]->first_dt == vect_internal_def
2307 && is_gimple_assign (stmt_info->stmt)
2308 /* Swapping operands for reductions breaks assumptions later on. */
2309 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
2310 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
2312 /* See whether we can swap the matching or the non-matching
2313 stmt operands. */
2314 bool swap_not_matching = true;
2317 for (j = 0; j < group_size; ++j)
2319 if (matches[j] != !swap_not_matching)
2320 continue;
2321 stmt_vec_info stmt_info = stmts[j];
2322 /* Verify if we can swap operands of this stmt. */
2323 gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
2324 if (!stmt
2325 || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
2327 if (!swap_not_matching)
2328 goto fail;
2329 swap_not_matching = false;
2330 break;
2334 while (j != group_size);
2336 /* Swap mismatched definition stmts. */
2337 if (dump_enabled_p ())
2338 dump_printf_loc (MSG_NOTE, vect_location,
2339 "Re-trying with swapped operands of stmts ");
2340 for (j = 0; j < group_size; ++j)
2341 if (matches[j] == !swap_not_matching)
2343 std::swap (oprnds_info[0]->def_stmts[j],
2344 oprnds_info[1]->def_stmts[j]);
2345 std::swap (oprnds_info[0]->ops[j],
2346 oprnds_info[1]->ops[j]);
2347 if (dump_enabled_p ())
2348 dump_printf (MSG_NOTE, "%d ", j);
2350 if (dump_enabled_p ())
2351 dump_printf (MSG_NOTE, "\n");
2352 /* After swapping some operands we lost track of whether an
2353 operand has any pattern defs, so be conservative here. */
2354 if (oprnds_info[0]->any_pattern || oprnds_info[1]->any_pattern)
2355 oprnds_info[0]->any_pattern = oprnds_info[1]->any_pattern = true;
2356 /* And try again with scratch 'matches' ... */
2357 bool *tem = XALLOCAVEC (bool, group_size);
2358 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2359 group_size, &this_max_nunits,
2360 tem, limit,
2361 &this_tree_size, bst_map)) != NULL)
2363 oprnd_info->def_stmts = vNULL;
2364 children.safe_push (child);
2365 continue;
2368 fail:
2370 /* If the SLP build failed and we are analyzing a basic block,
2371 simply treat nodes we fail to build as externally defined
2372 (and thus build vectors from the scalar defs).
2373 The cost model will reject outright expensive cases.
2374 ??? This doesn't handle cases where permutation ultimately
2375 fails (or we don't try permutation below). Ideally we'd
2376 even compute a permutation that will end up with the maximum
2377 SLP tree size... */
2378 if (is_a <bb_vec_info> (vinfo)
2379 /* ??? Rejecting patterns this way doesn't work. We'd have to
2380 do extra work to cancel the pattern so the uses see the
2381 scalar version. */
2382 && !is_pattern_stmt_p (stmt_info)
2383 && !oprnd_info->any_pattern)
2385 /* But if there's a leading vector sized set of matching stmts
2386 fail here so we can split the group. This matches the condition
2387 vect_analyze_slp_instance uses. */
2388 /* ??? We might want to split here and combine the results to support
2389 multiple vector sizes better. */
2390 for (j = 0; j < group_size; ++j)
2391 if (!matches[j])
2392 break;
2393 if (!known_ge (j, TYPE_VECTOR_SUBPARTS (vectype)))
2395 if (dump_enabled_p ())
2396 dump_printf_loc (MSG_NOTE, vect_location,
2397 "Building vector operands from scalars\n");
2398 this_tree_size++;
2399 child = vect_create_new_slp_node (oprnd_info->ops);
2400 children.safe_push (child);
2401 oprnd_info->ops = vNULL;
2402 continue;
2406 gcc_assert (child == NULL);
2407 FOR_EACH_VEC_ELT (children, j, child)
2408 if (child)
2409 vect_free_slp_tree (child);
2410 vect_free_oprnd_info (oprnds_info);
2411 return NULL;
2414 vect_free_oprnd_info (oprnds_info);
2416 /* If all children of a node are built up from uniform scalars, or if
2417 building it requires more than one possibly expensive vector
2418 construction, just throw the node away so it gets built up from
2419 scalars instead. The exception is the SLP node for the vector store. */
2420 if (is_a <bb_vec_info> (vinfo)
2421 && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
2422 /* ??? Rejecting patterns this way doesn't work. We'd have to
2423 do extra work to cancel the pattern so the uses see the
2424 scalar version. */
2425 && !is_pattern_stmt_p (stmt_info))
2427 slp_tree child;
2428 unsigned j;
2429 bool all_uniform_p = true;
2430 unsigned n_vector_builds = 0;
2431 FOR_EACH_VEC_ELT (children, j, child)
2433 if (!child)
2435 else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
2436 all_uniform_p = false;
2437 else if (!vect_slp_tree_uniform_p (child))
2439 all_uniform_p = false;
2440 if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
2441 n_vector_builds++;
2444 if (all_uniform_p
2445 || n_vector_builds > 1
2446 || (n_vector_builds == children.length ()
2447 && is_a <gphi *> (stmt_info->stmt)))
2449 /* Roll back. */
2450 matches[0] = false;
2451 FOR_EACH_VEC_ELT (children, j, child)
2452 if (child)
2453 vect_free_slp_tree (child);
2455 if (dump_enabled_p ())
2456 dump_printf_loc (MSG_NOTE, vect_location,
2457 "Building parent vector operands from "
2458 "scalars instead\n");
2459 return NULL;
2463 *tree_size += this_tree_size + 1;
2464 *max_nunits = this_max_nunits;
2466 if (two_operators)
2468 /* ??? We'd likely want to either cache in bst_map sth like
2469 { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
2470 the true { a+b, a+b, a+b, a+b } ... but there we don't have
2471 explicit stmts to put in so the keying on 'stmts' doesn't
2472 work (but we have the same issue with nodes that use 'ops'). */
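/* For example, for the group { a0+b0, a1-b1, a2+b2, a3-b3 } node ONE gets
the PLUS form and node TWO the MINUS form of the statement; the
VEC_PERM_EXPR node built below then selects lane i from ONE or TWO
depending on each original statement's code. */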
2473 slp_tree one = new _slp_tree;
2474 slp_tree two = new _slp_tree;
2475 SLP_TREE_DEF_TYPE (one) = vect_internal_def;
2476 SLP_TREE_DEF_TYPE (two) = vect_internal_def;
2477 SLP_TREE_VECTYPE (one) = vectype;
2478 SLP_TREE_VECTYPE (two) = vectype;
2479 SLP_TREE_CHILDREN (one).safe_splice (children);
2480 SLP_TREE_CHILDREN (two).safe_splice (children);
2481 slp_tree child;
2482 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
2483 SLP_TREE_REF_COUNT (child)++;
2485 /* Here we record the original defs since this
2486 node represents the final lane configuration. */
2487 node = vect_create_new_slp_node (node, stmts, 2);
2488 SLP_TREE_VECTYPE (node) = vectype;
2489 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
2490 SLP_TREE_CHILDREN (node).quick_push (one);
2491 SLP_TREE_CHILDREN (node).quick_push (two);
2492 gassign *stmt = as_a <gassign *> (stmts[0]->stmt);
2493 enum tree_code code0 = gimple_assign_rhs_code (stmt);
2494 enum tree_code ocode = ERROR_MARK;
2495 stmt_vec_info ostmt_info;
2496 unsigned j = 0;
2497 FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
2499 gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
2500 if (gimple_assign_rhs_code (ostmt) != code0)
2502 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (1, i));
2503 ocode = gimple_assign_rhs_code (ostmt);
2504 j = i;
2506 else
2507 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (0, i));
2509 SLP_TREE_CODE (one) = code0;
2510 SLP_TREE_CODE (two) = ocode;
2511 SLP_TREE_LANES (one) = stmts.length ();
2512 SLP_TREE_LANES (two) = stmts.length ();
2513 SLP_TREE_REPRESENTATIVE (one) = stmts[0];
2514 SLP_TREE_REPRESENTATIVE (two) = stmts[j];
2515 return node;
2518 node = vect_create_new_slp_node (node, stmts, nops);
2519 SLP_TREE_VECTYPE (node) = vectype;
2520 SLP_TREE_CHILDREN (node).splice (children);
2521 return node;
2524 /* Dump a single SLP tree NODE. */
2526 static void
2527 vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
2528 slp_tree node)
2530 unsigned i, j;
2531 slp_tree child;
2532 stmt_vec_info stmt_info;
2533 tree op;
2535 dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
2536 dump_user_location_t user_loc = loc.get_user_location ();
2537 dump_printf_loc (metadata, user_loc,
2538 "node%s %p (max_nunits=" HOST_WIDE_INT_PRINT_UNSIGNED
2539 ", refcnt=%u)",
2540 SLP_TREE_DEF_TYPE (node) == vect_external_def
2541 ? " (external)"
2542 : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
2543 ? " (constant)"
2544 : ""), (void *) node,
2545 estimated_poly_value (node->max_nunits),
2546 SLP_TREE_REF_COUNT (node));
2547 if (SLP_TREE_VECTYPE (node))
2548 dump_printf (metadata, " %T", SLP_TREE_VECTYPE (node));
2549 dump_printf (metadata, "\n");
2550 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
2552 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
2553 dump_printf_loc (metadata, user_loc, "op: VEC_PERM_EXPR\n");
2554 else
2555 dump_printf_loc (metadata, user_loc, "op template: %G",
2556 SLP_TREE_REPRESENTATIVE (node)->stmt);
2558 if (SLP_TREE_SCALAR_STMTS (node).exists ())
2559 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2560 dump_printf_loc (metadata, user_loc, "\tstmt %u %G", i, stmt_info->stmt);
2561 else
2563 dump_printf_loc (metadata, user_loc, "\t{ ");
2564 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
2565 dump_printf (metadata, "%T%s ", op,
2566 i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
2567 dump_printf (metadata, "}\n");
2569 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2571 dump_printf_loc (metadata, user_loc, "\tload permutation {");
2572 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
2573 dump_printf (dump_kind, " %u", j);
2574 dump_printf (dump_kind, " }\n");
2576 if (SLP_TREE_LANE_PERMUTATION (node).exists ())
2578 dump_printf_loc (metadata, user_loc, "\tlane permutation {");
2579 for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
2580 dump_printf (dump_kind, " %u[%u]",
2581 SLP_TREE_LANE_PERMUTATION (node)[i].first,
2582 SLP_TREE_LANE_PERMUTATION (node)[i].second);
2583 dump_printf (dump_kind, " }\n");
2585 if (SLP_TREE_CHILDREN (node).is_empty ())
2586 return;
2587 dump_printf_loc (metadata, user_loc, "\tchildren");
2588 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2589 dump_printf (dump_kind, " %p", (void *)child);
2590 dump_printf (dump_kind, "\n");
2593 DEBUG_FUNCTION void
2594 debug (slp_tree node)
2596 debug_dump_context ctx;
2597 vect_print_slp_tree (MSG_NOTE,
2598 dump_location_t::from_location_t (UNKNOWN_LOCATION),
2599 node);
2602 /* Recursive helper for the dot producer below. */
2604 static void
2605 dot_slp_tree (FILE *f, slp_tree node, hash_set<slp_tree> &visited)
2607 if (visited.add (node))
2608 return;
2610 fprintf (f, "\"%p\" [label=\"", (void *)node);
2611 vect_print_slp_tree (MSG_NOTE,
2612 dump_location_t::from_location_t (UNKNOWN_LOCATION),
2613 node);
2614 fprintf (f, "\"];\n");
2617 for (slp_tree child : SLP_TREE_CHILDREN (node))
2618 fprintf (f, "\"%p\" -> \"%p\";", (void *)node, (void *)child);
2620 for (slp_tree child : SLP_TREE_CHILDREN (node))
2621 if (child)
2622 dot_slp_tree (f, child, visited);
2625 DEBUG_FUNCTION void
2626 dot_slp_tree (const char *fname, slp_tree node)
2628 FILE *f = fopen (fname, "w");
2629 fprintf (f, "digraph {\n");
2630 fflush (f);
2632 debug_dump_context ctx (f);
2633 hash_set<slp_tree> visited;
2634 dot_slp_tree (f, node, visited);
2636 fflush (f);
2637 fprintf (f, "}\n");
2638 fclose (f);
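/* Meant to be invoked from the debugger; the resulting file is a graphviz
digraph that can be rendered with e.g. dot -Tpdf. */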
2641 /* Dump a slp tree NODE using flags specified in DUMP_KIND. */
2643 static void
2644 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2645 slp_tree node, hash_set<slp_tree> &visited)
2647 unsigned i;
2648 slp_tree child;
2650 if (visited.add (node))
2651 return;
2653 vect_print_slp_tree (dump_kind, loc, node);
2655 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2656 if (child)
2657 vect_print_slp_graph (dump_kind, loc, child, visited);
2660 static void
2661 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2662 slp_tree entry)
2664 hash_set<slp_tree> visited;
2665 vect_print_slp_graph (dump_kind, loc, entry, visited);
2668 /* Mark the tree rooted at NODE with PURE_SLP. */
2670 static void
2671 vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
2673 int i;
2674 stmt_vec_info stmt_info;
2675 slp_tree child;
2677 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2678 return;
2680 if (visited.add (node))
2681 return;
2683 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2684 STMT_SLP_TYPE (stmt_info) = pure_slp;
2686 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2687 if (child)
2688 vect_mark_slp_stmts (child, visited);
2691 static void
2692 vect_mark_slp_stmts (slp_tree node)
2694 hash_set<slp_tree> visited;
2695 vect_mark_slp_stmts (node, visited);
2698 /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */
2700 static void
2701 vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
2703 int i;
2704 stmt_vec_info stmt_info;
2705 slp_tree child;
2707 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2708 return;
2710 if (visited.add (node))
2711 return;
2713 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2715 gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
2716 || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
2717 STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
2720 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2721 if (child)
2722 vect_mark_slp_stmts_relevant (child, visited);
2725 static void
2726 vect_mark_slp_stmts_relevant (slp_tree node)
2728 hash_set<slp_tree> visited;
2729 vect_mark_slp_stmts_relevant (node, visited);
2733 /* Gather loads in the SLP graph NODE and populate the INST loads array. */
2735 static void
2736 vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
2737 hash_set<slp_tree> &visited)
2739 if (!node || visited.add (node))
2740 return;
2742 if (SLP_TREE_CHILDREN (node).length () == 0)
2744 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2745 return;
2746 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2747 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2748 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2749 loads.safe_push (node);
2751 else
2753 unsigned i;
2754 slp_tree child;
2755 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2756 vect_gather_slp_loads (loads, child, visited);
2761 /* Find the last scalar stmt in NODE. */
2763 stmt_vec_info
2764 vect_find_last_scalar_stmt_in_slp (slp_tree node)
2766 stmt_vec_info last = NULL;
2767 stmt_vec_info stmt_vinfo;
2769 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2771 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2772 last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
2775 return last;
2778 /* Find the first stmt in NODE. */
2780 stmt_vec_info
2781 vect_find_first_scalar_stmt_in_slp (slp_tree node)
2783 stmt_vec_info first = NULL;
2784 stmt_vec_info stmt_vinfo;
2786 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2788 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2789 if (!first
2790 || get_later_stmt (stmt_vinfo, first) == first)
2791 first = stmt_vinfo;
2794 return first;
2797 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
2798 two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
2799 (also containing the first GROUP1_SIZE stmts, since stores are
2800 consecutive), the second containing the remainder.
2801 Return the first stmt in the second group. */
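/* For example, splitting a group of 7 stores with GROUP1_SIZE == 4 leaves
stmts 0..3 in the first group and makes stmt 4 the head of a new group of
3; both groups' gaps are bumped below so each still steps over the stmts
now owned by the other group. */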
2803 static stmt_vec_info
2804 vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
2806 gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
2807 gcc_assert (group1_size > 0);
2808 int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
2809 gcc_assert (group2_size > 0);
2810 DR_GROUP_SIZE (first_vinfo) = group1_size;
2812 stmt_vec_info stmt_info = first_vinfo;
2813 for (unsigned i = group1_size; i > 1; i--)
2815 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
2816 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2818 /* STMT is now the last element of the first group. */
2819 stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
2820 DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
2822 DR_GROUP_SIZE (group2) = group2_size;
2823 for (stmt_info = group2; stmt_info;
2824 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
2826 DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
2827 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2830 /* For the second group, the DR_GROUP_GAP is the gap before the original
2831 group plus the first group which it now has to skip over. */
2832 DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
2834 /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
2835 DR_GROUP_GAP (first_vinfo) += group2_size;
2837 if (dump_enabled_p ())
2838 dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
2839 group1_size, group2_size);
2841 return group2;
2844 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
2845 statements and a vector of NUNITS elements. */
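/* For example, with 4-element vectors and a group of 6 stmts the least
common multiple is 12, so the SLP instance needs an unrolling factor
of 2. */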
2847 static poly_uint64
2848 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
2850 return exact_div (common_multiple (nunits, group_size), group_size);
2853 /* Helper that checks to see if a node is a load node. */
2855 static inline bool
2856 vect_is_slp_load_node (slp_tree root)
2858 return SLP_TREE_DEF_TYPE (root) == vect_internal_def
2859 && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_REPRESENTATIVE (root))
2860 && DR_IS_READ (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (root)));
2864 /* Helper function of optimize_load_redistribution that performs the operation
2865 recursively. */
2867 static slp_tree
2868 optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t *bst_map,
2869 vec_info *vinfo, unsigned int group_size,
2870 hash_map<slp_tree, slp_tree> *load_map,
2871 slp_tree root)
2873 if (slp_tree *leader = load_map->get (root))
2874 return *leader;
2876 slp_tree node;
2877 unsigned i;
2879 /* For now, we don't know anything about externals so do not do anything. */
2880 if (!root || SLP_TREE_DEF_TYPE (root) != vect_internal_def)
2881 return NULL;
2882 else if (SLP_TREE_CODE (root) == VEC_PERM_EXPR)
2884 /* First convert this node into a load node and add it to the leaves
2885 list and flatten the permute from a lane to a load one. If it's
2886 unneeded it will be elided later. */
2887 vec<stmt_vec_info> stmts;
2888 stmts.create (SLP_TREE_LANES (root));
2889 lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION (root);
2890 for (unsigned j = 0; j < lane_perm.length (); j++)
2892 std::pair<unsigned, unsigned> perm = lane_perm[j];
2893 node = SLP_TREE_CHILDREN (root)[perm.first];
2895 if (!vect_is_slp_load_node (node)
2896 || SLP_TREE_CHILDREN (node).exists ())
2898 stmts.release ();
2899 goto next;
2902 stmts.quick_push (SLP_TREE_SCALAR_STMTS (node)[perm.second]);
2905 if (dump_enabled_p ())
2906 dump_printf_loc (MSG_NOTE, vect_location,
2907 "converting stmts on permute node %p\n",
2908 (void *) root);
2910 bool *matches = XALLOCAVEC (bool, group_size);
2911 poly_uint64 max_nunits = 1;
2912 unsigned tree_size = 0, limit = 1;
2913 node = vect_build_slp_tree (vinfo, stmts, group_size, &max_nunits,
2914 matches, &limit, &tree_size, bst_map);
2915 if (!node)
2916 stmts.release ();
2918 load_map->put (root, node);
2919 return node;
2922 next:
2923 load_map->put (root, NULL);
2925 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
2927 slp_tree value
2928 = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
2929 node);
2930 if (value)
2932 SLP_TREE_REF_COUNT (value)++;
2933 SLP_TREE_CHILDREN (root)[i] = value;
2934 /* ??? We know the original leaves of the replaced nodes will
2935 be referenced by bst_map, only the permutes created by
2936 pattern matching are not. */
2937 if (SLP_TREE_REF_COUNT (node) == 1)
2938 load_map->remove (node);
2939 vect_free_slp_tree (node);
2943 return NULL;
2946 /* Temporary workaround for loads not being CSEd during SLP build. This
2947 function will traverse the SLP tree rooted in ROOT for INSTANCE and find
2948 VEC_PERM nodes that blend vectors from multiple nodes that all read from the
2949 same DR such that the final operation is equal to a permuted load. Such
2950 NODES are then directly converted into LOADS themselves. The nodes are
2951 CSEd using BST_MAP. */
2953 static void
2954 optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t *bst_map,
2955 vec_info *vinfo, unsigned int group_size,
2956 hash_map<slp_tree, slp_tree> *load_map,
2957 slp_tree root)
2959 slp_tree node;
2960 unsigned i;
2962 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
2964 slp_tree value
2965 = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
2966 node);
2967 if (value)
2969 SLP_TREE_REF_COUNT (value)++;
2970 SLP_TREE_CHILDREN (root)[i] = value;
2971 /* ??? We know the original leaves of the replaced nodes will
2972 be referenced by bst_map, only the permutes created by
2973 pattern matching are not. */
2974 if (SLP_TREE_REF_COUNT (node) == 1)
2975 load_map->remove (node);
2976 vect_free_slp_tree (node);
2981 /* Helper function of vect_match_slp_patterns.
2983 Attempts to match patterns against the slp tree rooted in REF_NODE using
2984 VINFO. Patterns are matched in post-order traversal.
2986 If matching is successful the value in REF_NODE is updated in place; the
2987 return value says whether any pattern was matched. */
2989 static bool
2990 vect_match_slp_patterns_2 (slp_tree *ref_node, vec_info *vinfo,
2991 slp_tree_to_load_perm_map_t *perm_cache,
2992 slp_compat_nodes_map_t *compat_cache,
2993 hash_set<slp_tree> *visited)
2995 unsigned i;
2996 slp_tree node = *ref_node;
2997 bool found_p = false;
2998 if (!node || visited->add (node))
2999 return false;
3001 slp_tree child;
3002 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3003 found_p |= vect_match_slp_patterns_2 (&SLP_TREE_CHILDREN (node)[i],
3004 vinfo, perm_cache, compat_cache,
3005 visited);
3007 for (unsigned x = 0; x < num__slp_patterns; x++)
3009 vect_pattern *pattern
3010 = slp_patterns[x] (perm_cache, compat_cache, ref_node);
3011 if (pattern)
3013 pattern->build (vinfo);
3014 delete pattern;
3015 found_p = true;
3019 return found_p;
3022 /* Applies pattern matching to the given SLP tree rooted in REF_NODE using
3023 vec_info VINFO.
3025 The tree is modified in place. Patterns are tried in order and multiple
3026 patterns may match; return true if any did. */
3028 static bool
3029 vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
3030 hash_set<slp_tree> *visited,
3031 slp_tree_to_load_perm_map_t *perm_cache,
3032 slp_compat_nodes_map_t *compat_cache)
3034 DUMP_VECT_SCOPE ("vect_match_slp_patterns");
3035 slp_tree *ref_node = &SLP_INSTANCE_TREE (instance);
3037 if (dump_enabled_p ())
3038 dump_printf_loc (MSG_NOTE, vect_location,
3039 "Analyzing SLP tree %p for patterns\n",
3040 (void *) SLP_INSTANCE_TREE (instance));
3042 return vect_match_slp_patterns_2 (ref_node, vinfo, perm_cache, compat_cache,
3043 visited);
3046 /* STMT_INFO is a store group of size GROUP_SIZE that we are considering
3047 splitting into two, with the first split group having size NEW_GROUP_SIZE.
3048 Return true if we could use IFN_STORE_LANES instead and if that appears
3049 to be the better approach. */
3051 static bool
3052 vect_slp_prefer_store_lanes_p (vec_info *vinfo, stmt_vec_info stmt_info,
3053 unsigned int group_size,
3054 unsigned int new_group_size)
3056 tree scalar_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
3057 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
3058 if (!vectype)
3059 return false;
3060 /* Allow the split if one of the two new groups would operate on full
3061 vectors *within* rather than across one scalar loop iteration.
3062 This is purely a heuristic, but it should work well for group
3063 sizes of 3 and 4, where the possible splits are:
3065 3->2+1: OK if the vector has exactly two elements
3066 4->2+2: Likewise
3067 4->3+1: Less clear-cut. */
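/* In other words, only prefer IFN_STORE_LANES if neither resulting piece
would operate on whole vectors; e.g. a 3->2+1 split is kept with
2-element vectors but loses to store-lanes with 4-element vectors
(when the target supports them). */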
3068 if (multiple_p (group_size - new_group_size, TYPE_VECTOR_SUBPARTS (vectype))
3069 || multiple_p (new_group_size, TYPE_VECTOR_SUBPARTS (vectype)))
3070 return false;
3071 return vect_store_lanes_supported (vectype, group_size, false);
3074 /* Analyze an SLP instance starting from a group of grouped stores. Call
3075 vect_build_slp_tree to build a tree of packed stmts if possible.
3076 Return FALSE if it's impossible to SLP any stmt in the loop. */
3078 static bool
3079 vect_analyze_slp_instance (vec_info *vinfo,
3080 scalar_stmts_to_slp_tree_map_t *bst_map,
3081 stmt_vec_info stmt_info, slp_instance_kind kind,
3082 unsigned max_tree_size, unsigned *limit);
3084 /* Analyze an SLP instance starting from SCALAR_STMTS which are a group
3085 of KIND. Return true if successful. */
3087 static bool
3088 vect_build_slp_instance (vec_info *vinfo,
3089 slp_instance_kind kind,
3090 vec<stmt_vec_info> &scalar_stmts,
3091 vec<stmt_vec_info> &root_stmt_infos,
3092 unsigned max_tree_size, unsigned *limit,
3093 scalar_stmts_to_slp_tree_map_t *bst_map,
3094 /* ??? We need stmt_info for group splitting. */
3095 stmt_vec_info stmt_info_)
3097 if (dump_enabled_p ())
3099 dump_printf_loc (MSG_NOTE, vect_location,
3100 "Starting SLP discovery for\n");
3101 for (unsigned i = 0; i < scalar_stmts.length (); ++i)
3102 dump_printf_loc (MSG_NOTE, vect_location,
3103 " %G", scalar_stmts[i]->stmt);
3106 /* Build the tree for the SLP instance. */
3107 unsigned int group_size = scalar_stmts.length ();
3108 bool *matches = XALLOCAVEC (bool, group_size);
3109 poly_uint64 max_nunits = 1;
3110 unsigned tree_size = 0;
3111 unsigned i;
3112 slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
3113 &max_nunits, matches, limit,
3114 &tree_size, bst_map);
3115 if (node != NULL)
3117 /* Calculate the unrolling factor based on the smallest type. */
3118 poly_uint64 unrolling_factor
3119 = calculate_unrolling_factor (max_nunits, group_size);
3121 if (maybe_ne (unrolling_factor, 1U)
3122 && is_a <bb_vec_info> (vinfo))
3124 unsigned HOST_WIDE_INT const_max_nunits;
3125 if (!max_nunits.is_constant (&const_max_nunits)
3126 || const_max_nunits > group_size)
3128 if (dump_enabled_p ())
3129 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3130 "Build SLP failed: store group "
3131 "size not a multiple of the vector size "
3132 "in basic block SLP\n");
3133 vect_free_slp_tree (node);
3134 return false;
3136 /* Fatal mismatch. */
3137 if (dump_enabled_p ())
3138 dump_printf_loc (MSG_NOTE, vect_location,
3139 "SLP discovery succeeded but node needs "
3140 "splitting\n");
3141 memset (matches, true, group_size);
3142 matches[group_size / const_max_nunits * const_max_nunits] = false;
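/* The entry just cleared is the first stmt not covered by whole vectors,
so the group-splitting code below will break the group there. */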
3143 vect_free_slp_tree (node);
3145 else
3147 /* Create a new SLP instance. */
3148 slp_instance new_instance = XNEW (class _slp_instance);
3149 SLP_INSTANCE_TREE (new_instance) = node;
3150 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
3151 SLP_INSTANCE_LOADS (new_instance) = vNULL;
3152 SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
3153 SLP_INSTANCE_KIND (new_instance) = kind;
3154 new_instance->reduc_phis = NULL;
3155 new_instance->cost_vec = vNULL;
3156 new_instance->subgraph_entries = vNULL;
3158 if (dump_enabled_p ())
3159 dump_printf_loc (MSG_NOTE, vect_location,
3160 "SLP size %u vs. limit %u.\n",
3161 tree_size, max_tree_size);
3163 /* Fixup SLP reduction chains. */
3164 if (kind == slp_inst_kind_reduc_chain)
3166 /* If this is a reduction chain with a conversion in front
3167 amend the SLP tree with a node for that. */
3168 gimple *scalar_def
3169 = vect_orig_stmt (scalar_stmts[group_size - 1])->stmt;
3170 if (STMT_VINFO_DEF_TYPE (scalar_stmts[0]) != vect_reduction_def)
3172 /* Get at the conversion stmt - we know it's the single use
3173 of the last stmt of the reduction chain. */
3174 use_operand_p use_p;
3175 bool r = single_imm_use (gimple_assign_lhs (scalar_def),
3176 &use_p, &scalar_def);
3177 gcc_assert (r);
3178 stmt_vec_info next_info = vinfo->lookup_stmt (scalar_def);
3179 next_info = vect_stmt_to_vectorize (next_info);
3180 scalar_stmts = vNULL;
3181 scalar_stmts.create (group_size);
3182 for (unsigned i = 0; i < group_size; ++i)
3183 scalar_stmts.quick_push (next_info);
3184 slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
3185 SLP_TREE_VECTYPE (conv) = STMT_VINFO_VECTYPE (next_info);
3186 SLP_TREE_CHILDREN (conv).quick_push (node);
3187 SLP_INSTANCE_TREE (new_instance) = conv;
3188 /* We also have to fake this conversion stmt as SLP reduction
3189 group so we don't have to mess with too much code
3190 elsewhere. */
3191 REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
3192 REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
3194 /* Fill the backedge child of the PHI SLP node. The
3195 general matching code cannot find it because the
3196 scalar code does not reflect how we vectorize the
3197 reduction. */
3198 use_operand_p use_p;
3199 imm_use_iterator imm_iter;
3200 class loop *loop = LOOP_VINFO_LOOP (as_a <loop_vec_info> (vinfo));
3201 FOR_EACH_IMM_USE_FAST (use_p, imm_iter,
3202 gimple_get_lhs (scalar_def))
3203 /* There are exactly two non-debug uses, the reduction
3204 PHI and the loop-closed PHI node. */
3205 if (!is_gimple_debug (USE_STMT (use_p))
3206 && gimple_bb (USE_STMT (use_p)) == loop->header)
3208 auto_vec<stmt_vec_info, 64> phis (group_size);
3209 stmt_vec_info phi_info
3210 = vinfo->lookup_stmt (USE_STMT (use_p));
3211 for (unsigned i = 0; i < group_size; ++i)
3212 phis.quick_push (phi_info);
3213 slp_tree *phi_node = bst_map->get (phis);
3214 unsigned dest_idx = loop_latch_edge (loop)->dest_idx;
3215 SLP_TREE_CHILDREN (*phi_node)[dest_idx]
3216 = SLP_INSTANCE_TREE (new_instance);
3217 SLP_INSTANCE_TREE (new_instance)->refcnt++;
3221 vinfo->slp_instances.safe_push (new_instance);
3223 /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
3224 the number of scalar stmts in the root in a few places.
3225 Verify that assumption holds. */
3226 gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
3227 .length () == group_size);
3229 if (dump_enabled_p ())
3231 dump_printf_loc (MSG_NOTE, vect_location,
3232 "Final SLP tree for instance %p:\n",
3233 (void *) new_instance);
3234 vect_print_slp_graph (MSG_NOTE, vect_location,
3235 SLP_INSTANCE_TREE (new_instance));
3238 return true;
3241 else
3243 /* Failed to SLP. */
3244 /* Free the allocated memory. */
3245 scalar_stmts.release ();
3248 stmt_vec_info stmt_info = stmt_info_;
3249 /* Try to break the group up into pieces. */
3250 if (kind == slp_inst_kind_store)
3252 /* ??? We could delay all the actual splitting of store-groups
3253 until after SLP discovery of the original group completed.
3254 Then we can recurse to vect_build_slp_instance directly. */
3255 for (i = 0; i < group_size; i++)
3256 if (!matches[i])
3257 break;
3259 /* For basic block SLP, try to break the group up into multiples of
3260 a vector size. */
3261 if (is_a <bb_vec_info> (vinfo)
3262 && (i > 1 && i < group_size))
3264 tree scalar_type
3265 = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
3266 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
3267 1 << floor_log2 (i));
3268 unsigned HOST_WIDE_INT const_nunits;
3269 if (vectype
3270 && TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits))
3272 /* Split into two groups at the first vector boundary. */
3273 gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
3274 unsigned group1_size = i & ~(const_nunits - 1);
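/* I.e. round the number of matching stmts down to a whole number of
vectors; e.g. with const_nunits == 4 and the first mismatch at stmt 6
the first group gets 4 stmts. */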
3276 if (dump_enabled_p ())
3277 dump_printf_loc (MSG_NOTE, vect_location,
3278 "Splitting SLP group at stmt %u\n", i);
3279 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
3280 group1_size);
3281 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
3282 kind, max_tree_size,
3283 limit);
3284 /* Split the rest at the failure point and possibly
3285 re-analyze the remaining matching part if it has
3286 at least two lanes. */
3287 if (group1_size < i
3288 && (i + 1 < group_size
3289 || i - group1_size > 1))
3291 stmt_vec_info rest2 = rest;
3292 rest = vect_split_slp_store_group (rest, i - group1_size);
3293 if (i - group1_size > 1)
3294 res |= vect_analyze_slp_instance (vinfo, bst_map, rest2,
3295 kind, max_tree_size,
3296 limit);
3298 /* Re-analyze the non-matching tail if it has at least
3299 two lanes. */
3300 if (i + 1 < group_size)
3301 res |= vect_analyze_slp_instance (vinfo, bst_map,
3302 rest, kind, max_tree_size,
3303 limit);
3304 return res;
3308 /* For loop vectorization split into arbitrary pieces of size > 1. */
3309 if (is_a <loop_vec_info> (vinfo)
3310 && (i > 1 && i < group_size)
3311 && !vect_slp_prefer_store_lanes_p (vinfo, stmt_info, group_size, i))
3313 unsigned group1_size = i;
3315 if (dump_enabled_p ())
3316 dump_printf_loc (MSG_NOTE, vect_location,
3317 "Splitting SLP group at stmt %u\n", i);
3319 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
3320 group1_size);
3321 /* Loop vectorization cannot handle gaps in stores, make sure
3322 the split group appears as strided. */
3323 STMT_VINFO_STRIDED_P (rest) = 1;
3324 DR_GROUP_GAP (rest) = 0;
3325 STMT_VINFO_STRIDED_P (stmt_info) = 1;
3326 DR_GROUP_GAP (stmt_info) = 0;
3328 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
3329 kind, max_tree_size, limit);
3330 if (i + 1 < group_size)
3331 res |= vect_analyze_slp_instance (vinfo, bst_map,
3332 rest, kind, max_tree_size, limit);
3334 return res;
3337 /* Even though the first vector did not all match, we might be able to SLP
3338 (some) of the remainder. FORNOW ignore this possibility. */
3341 /* Failed to SLP. */
3342 if (dump_enabled_p ())
3343 dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
3344 return false;
3348 /* Analyze an SLP instance starting from a group of grouped stores. Call
3349 vect_build_slp_tree to build a tree of packed stmts if possible.
3350 Return FALSE if it's impossible to SLP any stmt in the loop. */
3352 static bool
3353 vect_analyze_slp_instance (vec_info *vinfo,
3354 scalar_stmts_to_slp_tree_map_t *bst_map,
3355 stmt_vec_info stmt_info,
3356 slp_instance_kind kind,
3357 unsigned max_tree_size, unsigned *limit)
3359 unsigned int i;
3360 vec<stmt_vec_info> scalar_stmts;
3362 if (is_a <bb_vec_info> (vinfo))
3363 vect_location = stmt_info->stmt;
3365 stmt_vec_info next_info = stmt_info;
3366 if (kind == slp_inst_kind_store)
3368 /* Collect the stores and store them in scalar_stmts. */
3369 scalar_stmts.create (DR_GROUP_SIZE (stmt_info));
3370 while (next_info)
3372 scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
3373 next_info = DR_GROUP_NEXT_ELEMENT (next_info);
3376 else if (kind == slp_inst_kind_reduc_chain)
3378 /* Collect the reduction stmts and store them in scalar_stmts. */
3379 scalar_stmts.create (REDUC_GROUP_SIZE (stmt_info));
3380 while (next_info)
3382 scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
3383 next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
3385 /* Mark the first element of the reduction chain as reduction to properly
3386 transform the node. In the reduction analysis phase only the last
3387 element of the chain is marked as reduction. */
3388 STMT_VINFO_DEF_TYPE (stmt_info)
3389 = STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
3390 STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
3391 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
3393 else if (kind == slp_inst_kind_ctor)
3395 tree rhs = gimple_assign_rhs1 (stmt_info->stmt);
3396 tree val;
3397 scalar_stmts.create (CONSTRUCTOR_NELTS (rhs));
3398 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), i, val)
3400 stmt_vec_info def_info = vinfo->lookup_def (val);
3401 def_info = vect_stmt_to_vectorize (def_info);
3402 scalar_stmts.quick_push (def_info);
3404 if (dump_enabled_p ())
3405 dump_printf_loc (MSG_NOTE, vect_location,
3406 "Analyzing vectorizable constructor: %G\n",
3407 stmt_info->stmt);
3409 else if (kind == slp_inst_kind_reduc_group)
3411 /* Collect reduction statements. */
3412 const vec<stmt_vec_info> &reductions
3413 = as_a <loop_vec_info> (vinfo)->reductions;
3414 scalar_stmts.create (reductions.length ());
3415 for (i = 0; reductions.iterate (i, &next_info); i++)
3416 if ((STMT_VINFO_RELEVANT_P (next_info)
3417 || STMT_VINFO_LIVE_P (next_info))
3418 /* ??? Make sure we didn't skip a conversion around a reduction
3419 path. In that case we'd have to reverse engineer that conversion
3420 stmt following the chain using reduc_idx and from the PHI
3421 using reduc_def. */
3422 && STMT_VINFO_DEF_TYPE (next_info) == vect_reduction_def)
3423 scalar_stmts.quick_push (next_info);
3424 /* If less than two were relevant/live there's nothing to SLP. */
3425 if (scalar_stmts.length () < 2)
3426 return false;
3428 else
3429 gcc_unreachable ();
3431 vec<stmt_vec_info> roots = vNULL;
3432 if (kind == slp_inst_kind_ctor)
3434 roots.create (1);
3435 roots.quick_push (stmt_info);
3437 /* Build the tree for the SLP instance. */
3438 bool res = vect_build_slp_instance (vinfo, kind, scalar_stmts,
3439 roots,
3440 max_tree_size, limit, bst_map,
3441 kind == slp_inst_kind_store
3442 ? stmt_info : NULL);
3443 if (!res)
3444 roots.release ();
3446 /* ??? If this is slp_inst_kind_store and the above succeeded here's
3447 where we should do store group splitting. */
3449 return res;
3452 /* Check if there are stmts in the loop that can be vectorized using SLP.
3453 Build SLP trees of packed scalar stmts if SLP is possible. */
3455 opt_result
3456 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
3458 unsigned int i;
3459 stmt_vec_info first_element;
3460 slp_instance instance;
3462 DUMP_VECT_SCOPE ("vect_analyze_slp");
3464 unsigned limit = max_tree_size;
3466 scalar_stmts_to_slp_tree_map_t *bst_map
3467 = new scalar_stmts_to_slp_tree_map_t ();
3469 /* Find SLP sequences starting from groups of grouped stores. */
3470 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
3471 vect_analyze_slp_instance (vinfo, bst_map, first_element,
3472 STMT_VINFO_GROUPED_ACCESS (first_element)
3473 ? slp_inst_kind_store : slp_inst_kind_ctor,
3474 max_tree_size, &limit);
3476 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
3478 for (unsigned i = 0; i < bb_vinfo->roots.length (); ++i)
3480 vect_location = bb_vinfo->roots[i].roots[0]->stmt;
3481 if (vect_build_slp_instance (bb_vinfo, bb_vinfo->roots[i].kind,
3482 bb_vinfo->roots[i].stmts,
3483 bb_vinfo->roots[i].roots,
3484 max_tree_size, &limit, bst_map, NULL))
3486 bb_vinfo->roots[i].stmts = vNULL;
3487 bb_vinfo->roots[i].roots = vNULL;
3492 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3494 /* Find SLP sequences starting from reduction chains. */
3495 FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
3496 if (! STMT_VINFO_RELEVANT_P (first_element)
3497 && ! STMT_VINFO_LIVE_P (first_element))
3499 else if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
3500 slp_inst_kind_reduc_chain,
3501 max_tree_size, &limit))
3503 /* Dissolve reduction chain group. */
3504 stmt_vec_info vinfo = first_element;
3505 stmt_vec_info last = NULL;
3506 while (vinfo)
3508 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
3509 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
3510 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
3511 last = vinfo;
3512 vinfo = next;
3514 STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
3515 /* It can still be vectorized as part of an SLP reduction. */
3516 loop_vinfo->reductions.safe_push (last);
3519 /* Find SLP sequences starting from groups of reductions. */
3520 if (loop_vinfo->reductions.length () > 1)
3521 vect_analyze_slp_instance (vinfo, bst_map, loop_vinfo->reductions[0],
3522 slp_inst_kind_reduc_group, max_tree_size,
3523 &limit);
3526 hash_set<slp_tree> visited_patterns;
3527 slp_tree_to_load_perm_map_t perm_cache;
3528 slp_compat_nodes_map_t compat_cache;
3530 /* See if any patterns can be found in the SLP tree. */
3531 bool pattern_found = false;
3532 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3533 pattern_found |= vect_match_slp_patterns (instance, vinfo,
3534 &visited_patterns, &perm_cache,
3535 &compat_cache);
3537 /* If any were found optimize permutations of loads. */
3538 if (pattern_found)
3540 hash_map<slp_tree, slp_tree> load_map;
3541 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3543 slp_tree root = SLP_INSTANCE_TREE (instance);
3544 optimize_load_redistribution (bst_map, vinfo, SLP_TREE_LANES (root),
3545 &load_map, root);
3551 /* The map keeps a reference on SLP nodes built, release that. */
3552 for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
3553 it != bst_map->end (); ++it)
3554 if ((*it).second)
3555 vect_free_slp_tree ((*it).second);
3556 delete bst_map;
3558 if (pattern_found && dump_enabled_p ())
3560 dump_printf_loc (MSG_NOTE, vect_location,
3561 "Pattern matched SLP tree\n");
3562 hash_set<slp_tree> visited;
3563 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3564 vect_print_slp_graph (MSG_NOTE, vect_location,
3565 SLP_INSTANCE_TREE (instance), visited);
3568 return opt_result::success ();
3571 /* Estimates the cost of inserting layout changes into the SLP graph.
3572 It can also say that the insertion is impossible. */
3574 struct slpg_layout_cost
3576 slpg_layout_cost () = default;
3577 slpg_layout_cost (sreal, bool);
3579 static slpg_layout_cost impossible () { return { sreal::max (), 0 }; }
3580 bool is_possible () const { return depth != sreal::max (); }
3582 bool operator== (const slpg_layout_cost &) const;
3583 bool operator!= (const slpg_layout_cost &) const;
3585 bool is_better_than (const slpg_layout_cost &, bool) const;
3587 void add_parallel_cost (const slpg_layout_cost &);
3588 void add_serial_cost (const slpg_layout_cost &);
3589 void split (unsigned int);
3591 /* The longest sequence of layout changes needed during any traversal
3592 of the partition dag, weighted by execution frequency.
3594 This is the most important metric when optimizing for speed, since
3595 it helps to ensure that we keep the number of operations on
3596 critical paths to a minimum. */
3597 sreal depth = 0;
3599 /* An estimate of the total number of operations needed. It is weighted by
3600 execution frequency when optimizing for speed but not when optimizing for
3601 size. In order to avoid double-counting, a node with a fanout of N will
3602 distribute 1/N of its total cost to each successor.
3604 This is the most important metric when optimizing for size, since
3605 it helps to keep the total number of operations to a minimum. */
3606 sreal total = 0;
3609 /* Construct costs for a node with weight WEIGHT. A higher weight
3610 indicates more frequent execution. IS_FOR_SIZE is true if we are
3611 optimizing for size rather than speed. */
3613 slpg_layout_cost::slpg_layout_cost (sreal weight, bool is_for_size)
3614 : depth (weight), total (is_for_size && weight > 0 ? 1 : weight)
3618 bool
3619 slpg_layout_cost::operator== (const slpg_layout_cost &other) const
3621 return depth == other.depth && total == other.total;
3624 bool
3625 slpg_layout_cost::operator!= (const slpg_layout_cost &other) const
3627 return !operator== (other);
3630 /* Return true if these costs are better than OTHER. IS_FOR_SIZE is
3631 true if we are optimizing for size rather than speed. */
3633 bool
3634 slpg_layout_cost::is_better_than (const slpg_layout_cost &other,
3635 bool is_for_size) const
3637 if (is_for_size)
3639 if (total != other.total)
3640 return total < other.total;
3641 return depth < other.depth;
3643 else
3645 if (depth != other.depth)
3646 return depth < other.depth;
3647 return total < other.total;
3651 /* Increase the costs to account for something with cost INPUT_COST
3652 happening in parallel with the current costs. */
3654 void
3655 slpg_layout_cost::add_parallel_cost (const slpg_layout_cost &input_cost)
3657 depth = std::max (depth, input_cost.depth);
3658 total += input_cost.total;
3661 /* Increase the costs to account for something with cost INPUT_COST
3662 happening in series with the current costs. */
3664 void
3665 slpg_layout_cost::add_serial_cost (const slpg_layout_cost &other)
3667 depth += other.depth;
3668 total += other.total;
3671 /* Split the total cost among TIMES successors or predecessors. */
3673 void
3674 slpg_layout_cost::split (unsigned int times)
3676 if (times > 1)
3677 total /= times;
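
/* Illustrative sketch (not part of the original source): a tiny worked
   example of how the cost components above combine.  The function name is
   made up; only slpg_layout_cost members defined above are used.  Two
   unit-factor layout changes of weight 4 combined in parallel keep the
   depth at 4 but sum the totals, whereas combining them serially doubles
   the depth; splitting among two successors halves only the total, since
   the critical-path depth is unchanged.  */

static void
slpg_layout_cost_example ()
{
  slpg_layout_cost a (4, /*is_for_size=*/false);   /* depth 4, total 4 */
  slpg_layout_cost b (4, /*is_for_size=*/false);

  slpg_layout_cost parallel = a;
  parallel.add_parallel_cost (b);   /* depth 4, total 8 */

  slpg_layout_cost serial = a;
  serial.add_serial_cost (b);       /* depth 8, total 8 */
  serial.split (2);                 /* depth 8, total 4 */

  /* When optimizing for speed, the parallel combination wins because its
     depth is lower; when optimizing for size, the split serial combination
     wins because its total is lower.  */
  gcc_assert (parallel.is_better_than (serial, /*is_for_size=*/false));
  gcc_assert (serial.is_better_than (parallel, /*is_for_size=*/true));
}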
3680 /* Information about one node in the SLP graph, for use during
3681 vect_optimize_slp_pass. */
3683 struct slpg_vertex
3685 slpg_vertex (slp_tree node_) : node (node_) {}
3687 /* The node itself. */
3688 slp_tree node;
3690 /* Which partition the node belongs to, or -1 if none. Nodes outside of
3691 partitions are flexible; they can have whichever layout consumers
3692 want them to have. */
3693 int partition = -1;
3695 /* The number of nodes that directly use the result of this one
3696 (i.e. the number of nodes that count this one as a child). */
3697 unsigned int out_degree = 0;
3699 /* The execution frequency of the node. */
3700 sreal weight = 0;
3702 /* The total execution frequency of all nodes that directly use the
3703 result of this one. */
3704 sreal out_weight = 0;
3707 /* Information about one partition of the SLP graph, for use during
3708 vect_optimize_slp_pass. */
3710 struct slpg_partition_info
3712 /* The nodes in the partition occupy indices [NODE_BEGIN, NODE_END)
3713 of m_partitioned_nodes. */
3714 unsigned int node_begin = 0;
3715 unsigned int node_end = 0;
3717 /* Which layout we've chosen to use for this partition, or -1 if
3718 we haven't picked one yet. */
3719 int layout = -1;
3721 /* The number of predecessors and successors in the partition dag.
3722 The predecessors always have lower partition numbers and the
3723 successors always have higher partition numbers.
3725 Note that the directions of these edges are not necessarily the
3726 same as in the data flow graph. For example, if an SCC has separate
3727 partitions for an inner loop and an outer loop, the inner loop's
3728 partition will have at least two incoming edges from the outer loop's
3729 partition: one for a live-in value and one for a live-out value.
3730 In data flow terms, one of these edges would also be from the outer loop
3731 to the inner loop, but the other would be in the opposite direction. */
3732 unsigned int in_degree = 0;
3733 unsigned int out_degree = 0;
3736 /* Information about the costs of using a particular layout for a
3737 particular partition. It can also say that the combination is
3738 impossible. */
3740 struct slpg_partition_layout_costs
3742 bool is_possible () const { return internal_cost.is_possible (); }
3743 void mark_impossible () { internal_cost = slpg_layout_cost::impossible (); }
3745 /* The costs inherited from predecessor partitions. */
3746 slpg_layout_cost in_cost;
3748 /* The inherent cost of the layout within the node itself. For example,
3749 this is nonzero for a load if choosing a particular layout would require
3750 the load to permute the loaded elements. It is nonzero for a
3751 VEC_PERM_EXPR if the permutation cannot be eliminated or converted
3752 to full-vector moves. */
3753 slpg_layout_cost internal_cost;
3755 /* The costs inherited from successor partitions. */
3756 slpg_layout_cost out_cost;
3759 /* This class tries to optimize the layout of vectors in order to avoid
3760 unnecessary shuffling. At the moment, the set of possible layouts is
3761 restricted to bijective permutations.
3763 The goal of the pass depends on whether we're optimizing for size or
3764 for speed. When optimizing for size, the goal is to reduce the overall
3765 number of layout changes (including layout changes implied by things
3766 like load permutations). When optimizing for speed, the goal is to
3767 reduce the maximum latency attributable to layout changes on any
3768 non-cyclical path through the data flow graph.
3770 For example, when optimizing a loop nest for speed, we will prefer
3771 to make layout changes outside of a loop rather than inside of a loop,
3772 and will prefer to make layout changes in parallel rather than serially,
3773 even if that increases the overall number of layout changes.
3775 The high-level procedure is:
3777 (1) Build a graph in which edges go from uses (parents) to definitions
3778 (children).
3780 (2) Divide the graph into a dag of strongly-connected components (SCCs).
3782 (3) When optimizing for speed, partition the nodes in each SCC based
3783 on their containing cfg loop. When optimizing for size, treat
3784 each SCC as a single partition.
3786 This gives us a dag of partitions. The goal is now to assign a
3787 layout to each partition.
3789 (4) Construct a set of vector layouts that are worth considering.
3790 Record which nodes must keep their current layout.
3792 (5) Perform a forward walk over the partition dag (from loads to stores)
3793 accumulating the "forward" cost of using each layout. When visiting
3794 each partition, assign a tentative choice of layout to the partition
3795 and use that choice when calculating the cost of using a different
3796 layout in successor partitions.
3798 (6) Perform a backward walk over the partition dag (from stores to loads),
3799 accumulating the "backward" cost of using each layout. When visiting
3800 each partition, make a final choice of layout for that partition based
3801 on the accumulated forward costs (from (5)) and backward costs
3802 (from (6)).
3804 (7) Apply the chosen layouts to the SLP graph.
3806 For example, consider the SLP statements:
3808 S1: a_1 = load
3809 loop:
3810 S2: a_2 = PHI<a_1, a_3>
3811 S3: b_1 = load
3812 S4: a_3 = a_2 + b_1
3813 exit:
3814 S5: a_4 = PHI<a_3>
3815 S6: store a_4
3817 S2 and S4 form an SCC and are part of the same loop. Every other
3818 statement is in a singleton SCC. In this example there is a one-to-one
3819 mapping between SCCs and partitions, and the partition dag looks like this:
         S1     S3
          \     /
           S2+S4
             |
            S5
             |
            S6
3829 S2, S3 and S4 will have a higher execution frequency than the other
3830 statements, so when optimizing for speed, the goal is to avoid any
3831 layout changes:
3833 - within S3
3834 - within S2+S4
3835 - on the S3->S2+S4 edge
3837 For example, if S3 was originally a reversing load, the goal of the
3838 pass is to make it an unreversed load and change the layout on the
3839 S1->S2+S4 and S2+S4->S5 edges to compensate. (Changing the layout
3840 on S1->S2+S4 and S5->S6 would also be acceptable.)
3842 The difference between SCCs and partitions becomes important if we
3843 add an outer loop:
3845 S1: a_1 = ...
3846 loop1:
3847 S2: a_2 = PHI<a_1, a_6>
3848 S3: b_1 = load
3849 S4: a_3 = a_2 + b_1
3850 loop2:
3851 S5: a_4 = PHI<a_3, a_5>
3852 S6: c_1 = load
3853 S7: a_5 = a_4 + c_1
3854 exit2:
3855 S8: a_6 = PHI<a_5>
3856 S9: store a_6
3857 exit1:
3859 Here, S2, S4, S5, S7 and S8 form a single SCC. However, when optimizing
3860 for speed, we usually do not want restrictions in the outer loop to "infect"
3861 the decision for the inner loop. For example, if an outer-loop node
3862 in the SCC contains a statement with a fixed layout, that should not
3863 prevent the inner loop from using a different layout. Conversely,
3864 the inner loop should not dictate a layout to the outer loop: if the
3865 outer loop does a lot of computation, then it may not be efficient to
3866 do all of that computation in the inner loop's preferred layout.
3868 So when optimizing for speed, we partition the SCC into S2+S4+S8 (outer)
3869 and S5+S7 (inner). We also try to arrange partitions so that:
3871 - the partition for an outer loop comes before the partition for
3872 an inner loop
3874 - if a sibling loop A dominates a sibling loop B, A's partition
3875 comes before B's
3877 This gives the following partition dag for the example above:
         S1         S3
           \       /
          S2+S4+S8    S6
           |   \\    /
           |    S5+S7
           |
          S9
3887 There are two edges from S2+S4+S8 to S5+S7: one for the edge S4->S5 and
3888 one for a reversal of the edge S7->S8.
3890 The backward walk picks a layout for S5+S7 before S2+S4+S8. The choice
3891 for S2+S4+S8 therefore has to balance the cost of using the outer loop's
3892 preferred layout against the cost of changing the layout on entry to the
3893 inner loop (S4->S5) and on exit from the inner loop (S7->S8 reversed).
3895 Although this works well when optimizing for speed, it has the downside
3896 when optimizing for size that the choice of layout for S5+S7 is completely
3897 independent of S9, which lessens the chance of reducing the overall number
3898 of permutations. We therefore do not partition SCCs when optimizing
3899 for size.
3901 To give a concrete example of the difference between optimizing
3902 for size and speed, consider:
3904 a[0] = (b[1] << c[3]) - d[1];
3905 a[1] = (b[0] << c[2]) - d[0];
3906 a[2] = (b[3] << c[1]) - d[3];
3907 a[3] = (b[2] << c[0]) - d[2];
3909 There are three different layouts here: one for a, one for b and d,
3910 and one for c. When optimizing for speed it is better to permute each
3911 of b, c and d into the order required by a, since those permutations
3912 happen in parallel. But when optimizing for size, it is better to:
3914 - permute c into the same order as b
3915 - do the arithmetic
3916 - permute the result into the order required by a
3918 This gives 2 permutations rather than 3. */
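
/* Illustrative sketch (not part of the original source): the shift example
   above, expressed with the slpg_layout_cost model defined earlier.  The
   function name is made up and each permutation is modelled as a
   unit-weight, unit-factor cost; this is only meant to show why the two
   strategies trade off depth against total.  */

static void
vect_slp_layout_strategy_example (bool is_for_size)
{
  slpg_layout_cost permute (1, is_for_size);

  /* Speed-friendly strategy: permute each of b, c and d into the order
     required by a.  The three permutations are independent of each other,
     so they combine in parallel: depth 1, total 3.  */
  slpg_layout_cost three_parallel = permute;
  three_parallel.add_parallel_cost (permute);
  three_parallel.add_parallel_cost (permute);

  /* Size-friendly strategy: permute c into b's order, do the arithmetic,
     then permute the result into a's order.  The two permutations lie on
     the same path, so they combine serially: depth 2, total 2.  */
  slpg_layout_cost two_serial = permute;
  two_serial.add_serial_cost (permute);

  if (is_for_size)
    gcc_assert (two_serial.is_better_than (three_parallel, is_for_size));
  else
    gcc_assert (three_parallel.is_better_than (two_serial, is_for_size));
}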
3920 class vect_optimize_slp_pass
3922 public:
3923 vect_optimize_slp_pass (vec_info *vinfo) : m_vinfo (vinfo) {}
3924 void run ();
3926 private:
3927 /* Graph building. */
3928 struct loop *containing_loop (slp_tree);
3929 bool is_cfg_latch_edge (graph_edge *);
3930 void build_vertices (hash_set<slp_tree> &, slp_tree);
3931 void build_vertices ();
3932 void build_graph ();
3934 /* Partitioning. */
3935 void create_partitions ();
3936 template<typename T> void for_each_partition_edge (unsigned int, T);
3938 /* Layout selection. */
3939 bool is_compatible_layout (slp_tree, unsigned int);
3940 int change_layout_cost (slp_tree, unsigned int, unsigned int);
3941 slpg_partition_layout_costs &partition_layout_costs (unsigned int,
3942 unsigned int);
3943 void change_vec_perm_layout (slp_tree, lane_permutation_t &,
3944 int, unsigned int);
3945 int internal_node_cost (slp_tree, int, unsigned int);
3946 void start_choosing_layouts ();
3948 /* Cost propagation. */
3949 slpg_layout_cost edge_layout_cost (graph_edge *, unsigned int,
3950 unsigned int, unsigned int);
3951 slpg_layout_cost total_in_cost (unsigned int);
3952 slpg_layout_cost forward_cost (graph_edge *, unsigned int, unsigned int);
3953 slpg_layout_cost backward_cost (graph_edge *, unsigned int, unsigned int);
3954 void forward_pass ();
3955 void backward_pass ();
3957 /* Rematerialization. */
3958 slp_tree get_result_with_layout (slp_tree, unsigned int);
3959 void materialize ();
3961 /* Clean-up. */
3962 void remove_redundant_permutations ();
3964 void dump ();
3966 vec_info *m_vinfo;
3968 /* True if we should optimize the graph for size, false if we should
3969 optimize it for speed. (It wouldn't be easy to make this decision
3970 more locally.) */
3971 bool m_optimize_size;
3973 /* A graph of all SLP nodes, with edges leading from uses to definitions.
3974 In other words, a node's predecessors are its slp_tree parents and
3975 a node's successors are its slp_tree children. */
3976 graph *m_slpg = nullptr;
3978 /* The vertices of M_SLPG, indexed by slp_tree::vertex. */
3979 auto_vec<slpg_vertex> m_vertices;
3981 /* The list of all leaves of M_SLPG, such as external definitions, constants,
3982 and loads. */
3983 auto_vec<int> m_leafs;
3985 /* This array has one entry for every vector layout that we're considering.
3986 Element 0 is null and indicates "no change". Other entries describe
3987 permutations that are inherent in the current graph and that we would
3988 like to reverse if possible.
3990 For example, a permutation { 1, 2, 3, 0 } means that something has
3991 effectively been permuted in that way, such as a load group
3992 { a[1], a[2], a[3], a[0] } (viewed as a permutation of a[0:3]).
3993 We'd then like to apply the reverse permutation { 3, 0, 1, 2 }
3994 in order to put things "back" in order. */
3995 auto_vec<vec<unsigned> > m_perms;
3997 /* A partitioning of the nodes for which a layout must be chosen.
3998 Each partition represents an <SCC, cfg loop> pair; that is,
3999 nodes in different SCCs belong to different partitions, and nodes
4000 within an SCC can be further partitioned according to a containing
4001 cfg loop. Partition <SCC1, L1> comes before <SCC2, L2> if:
4003 - SCC1 != SCC2 and SCC1 is a predecessor of SCC2 in a forward walk
4004 from leaves (such as loads) to roots (such as stores).
4006 - SCC1 == SCC2 and L1's header strictly dominates L2's header. */
4007 auto_vec<slpg_partition_info> m_partitions;
4009 /* The list of all nodes for which a layout must be chosen. Nodes for
4010 partition P come before the nodes for partition P+1. Nodes within a
4011 partition are in reverse postorder. */
4012 auto_vec<unsigned int> m_partitioned_nodes;
4014 /* Index P * num-layouts + L contains the cost of using layout L
4015 for partition P. */
4016 auto_vec<slpg_partition_layout_costs> m_partition_layout_costs;
4018 /* Index N * num-layouts + L, if nonnull, is a node that provides the
4019 original output of node N adjusted to have layout L. */
4020 auto_vec<slp_tree> m_node_layouts;
4023 /* Fill the vertices and leafs vector with all nodes in the SLP graph.
4024 Also record whether we should optimize anything for speed rather
4025 than size. */
4027 void
4028 vect_optimize_slp_pass::build_vertices (hash_set<slp_tree> &visited,
4029 slp_tree node)
4031 unsigned i;
4032 slp_tree child;
4034 if (visited.add (node))
4035 return;
4037 if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
4039 basic_block bb = gimple_bb (vect_orig_stmt (rep)->stmt);
4040 if (optimize_bb_for_speed_p (bb))
4041 m_optimize_size = false;
4044 node->vertex = m_vertices.length ();
4045 m_vertices.safe_push (slpg_vertex (node));
4047 bool leaf = true;
4048 bool force_leaf = false;
4049 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4050 if (child)
4052 leaf = false;
4053 build_vertices (visited, child);
4055 else
4056 force_leaf = true;
4057 /* Since SLP discovery works along use-def edges, all cycles have an
4058 entry - but there's the exception of cycles where we do not handle
4059 the entry explicitly (but with a NULL SLP node), like some reductions
4060 and inductions. Force those SLP PHIs to act as leafs to make them
4061 backwards reachable. */
4062 if (leaf || force_leaf)
4063 m_leafs.safe_push (node->vertex);
4066 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
4068 void
4069 vect_optimize_slp_pass::build_vertices ()
4071 hash_set<slp_tree> visited;
4072 unsigned i;
4073 slp_instance instance;
4074 FOR_EACH_VEC_ELT (m_vinfo->slp_instances, i, instance)
4075 build_vertices (visited, SLP_INSTANCE_TREE (instance));
4078 /* Apply bijective PERM to VEC, or its reverse if REVERSE is true. */
4080 template <class T>
4081 static void
4082 vect_slp_permute (vec<unsigned> perm,
4083 vec<T> &vec, bool reverse)
4085 auto_vec<T, 64> saved;
4086 saved.create (vec.length ());
4087 for (unsigned i = 0; i < vec.length (); ++i)
4088 saved.quick_push (vec[i]);
4090 if (reverse)
4092 for (unsigned i = 0; i < vec.length (); ++i)
4093 vec[perm[i]] = saved[i];
4094 for (unsigned i = 0; i < vec.length (); ++i)
4095 gcc_assert (vec[perm[i]] == saved[i]);
4097 else
4099 for (unsigned i = 0; i < vec.length (); ++i)
4100 vec[i] = saved[perm[i]];
4101 for (unsigned i = 0; i < vec.length (); ++i)
4102 gcc_assert (vec[i] == saved[perm[i]]);
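
/* Illustrative sketch (not part of the original source): applying the
   layout { 1, 2, 3, 0 } used as an example in the m_perms comment above.
   The function name is made up; it only exercises vect_slp_permute.  */

static void
vect_slp_permute_example ()
{
  vec<unsigned> perm = vNULL;
  perm.safe_grow (4, true);
  for (unsigned i = 0; i < 4; ++i)
    perm[i] = (i + 1) & 3;              /* { 1, 2, 3, 0 } */

  vec<int> data = vNULL;
  for (int i = 10; i < 14; ++i)
    data.safe_push (i);                 /* { 10, 11, 12, 13 } */

  /* Forward: data[i] = old_data[perm[i]], giving { 11, 12, 13, 10 }.  */
  vect_slp_permute (perm, data, false);
  gcc_assert (data[0] == 11 && data[3] == 10);

  /* Reverse: data[perm[i]] = old_data[i], i.e. the inverse permutation
     { 3, 0, 1, 2 }, which restores { 10, 11, 12, 13 }.  */
  vect_slp_permute (perm, data, true);
  gcc_assert (data[0] == 10 && data[3] == 13);

  data.release ();
  perm.release ();
}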
4106 /* Return the cfg loop that contains NODE. */
4108 struct loop *
4109 vect_optimize_slp_pass::containing_loop (slp_tree node)
4111 stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
4112 if (!rep)
4113 return ENTRY_BLOCK_PTR_FOR_FN (cfun)->loop_father;
4114 return gimple_bb (vect_orig_stmt (rep)->stmt)->loop_father;
4117 /* Return true if UD (an edge from a use to a definition) is associated
4118 with a loop latch edge in the cfg. */
4120 bool
4121 vect_optimize_slp_pass::is_cfg_latch_edge (graph_edge *ud)
4123 slp_tree use = m_vertices[ud->src].node;
4124 slp_tree def = m_vertices[ud->dest].node;
4125 if (SLP_TREE_DEF_TYPE (use) != vect_internal_def
4126 || SLP_TREE_DEF_TYPE (def) != vect_internal_def)
4127 return false;
4129 stmt_vec_info use_rep = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (use));
4130 return (is_a<gphi *> (use_rep->stmt)
4131 && bb_loop_header_p (gimple_bb (use_rep->stmt))
4132 && containing_loop (def) == containing_loop (use));
4135 /* Build the graph. Mark edges that correspond to cfg loop latch edges with
4136 a nonnull data field. */
4138 void
4139 vect_optimize_slp_pass::build_graph ()
4141 m_optimize_size = true;
4142 build_vertices ();
4144 m_slpg = new_graph (m_vertices.length ());
4145 for (slpg_vertex &v : m_vertices)
4146 for (slp_tree child : SLP_TREE_CHILDREN (v.node))
4147 if (child)
4149 graph_edge *ud = add_edge (m_slpg, v.node->vertex, child->vertex);
4150 if (is_cfg_latch_edge (ud))
4151 ud->data = this;
4155 /* Return true if E corresponds to a loop latch edge in the cfg. */
4157 static bool
4158 skip_cfg_latch_edges (graph_edge *e)
4160 return e->data;
4163 /* Create the node partitions. */
4165 void
4166 vect_optimize_slp_pass::create_partitions ()
4168 /* Calculate a postorder of the graph, ignoring edges that correspond
4169 to natural latch edges in the cfg. Reading the vector from the end
4170 to the beginning gives the reverse postorder. */
4171 auto_vec<int> initial_rpo;
4172 graphds_dfs (m_slpg, &m_leafs[0], m_leafs.length (), &initial_rpo,
4173 false, NULL, skip_cfg_latch_edges);
4174 gcc_assert (initial_rpo.length () == m_vertices.length ());
4176 /* Calculate the strongly connected components of the graph. */
4177 auto_vec<int> scc_grouping;
4178 unsigned int num_sccs = graphds_scc (m_slpg, NULL, NULL, &scc_grouping);
4180 /* Create a new index order in which all nodes from the same SCC are
4181 consecutive. Use scc_pos to record the index of the first node in
4182 each SCC. */
4183 auto_vec<unsigned int> scc_pos (num_sccs);
4184 int last_component = -1;
4185 unsigned int node_count = 0;
4186 for (unsigned int node_i : scc_grouping)
4188 if (last_component != m_slpg->vertices[node_i].component)
4190 last_component = m_slpg->vertices[node_i].component;
4191 gcc_assert (last_component == int (scc_pos.length ()));
4192 scc_pos.quick_push (node_count);
4194 node_count += 1;
4196 gcc_assert (node_count == initial_rpo.length ()
4197 && last_component + 1 == int (num_sccs));
4199 /* Use m_partitioned_nodes to group nodes into SCC order, with the nodes
4200 inside each SCC following the RPO we calculated above. The fact that
4201 we ignored natural latch edges when calculating the RPO should ensure
4202 that, for natural loop nests:
4204 - the first node that we encounter in a cfg loop is the loop header phi
4205 - the loop header phis are in dominance order
4207 Arranging for this is an optimization (see below) rather than a
4208 correctness issue. Unnatural loops with a tangled mess of backedges
4209 will still work correctly, but might give poorer results.
4211 Also update scc_pos so that it gives 1 + the index of the last node
4212 in the SCC. */
4213 m_partitioned_nodes.safe_grow (node_count);
4214 for (unsigned int old_i = initial_rpo.length (); old_i-- > 0;)
4216 unsigned int node_i = initial_rpo[old_i];
4217 unsigned int new_i = scc_pos[m_slpg->vertices[node_i].component]++;
4218 m_partitioned_nodes[new_i] = node_i;
4221 /* When optimizing for speed, partition each SCC based on the containing
4222 cfg loop. The order we constructed above should ensure that, for natural
4223 cfg loops, we'll create sub-SCC partitions for outer loops before
4224 the corresponding sub-SCC partitions for inner loops. Similarly,
4225 when one sibling loop A dominates another sibling loop B, we should
4226 create a sub-SCC partition for A before a sub-SCC partition for B.
4228 As above, nothing depends for correctness on whether this achieves
4229 a natural nesting, but we should get better results when it does. */
4230 m_partitions.reserve (m_vertices.length ());
4231 unsigned int next_partition_i = 0;
4232 hash_map<struct loop *, int> loop_partitions;
4233 unsigned int rpo_begin = 0;
4234 unsigned int num_partitioned_nodes = 0;
4235 for (unsigned int rpo_end : scc_pos)
4237 loop_partitions.empty ();
4238 unsigned int partition_i = next_partition_i;
4239 for (unsigned int rpo_i = rpo_begin; rpo_i < rpo_end; ++rpo_i)
4241 /* Handle externals and constants optimistically throughout.
4242 But treat existing vectors as fixed since we do not handle
4243 permuting them. */
4244 unsigned int node_i = m_partitioned_nodes[rpo_i];
4245 auto &vertex = m_vertices[node_i];
4246 if ((SLP_TREE_DEF_TYPE (vertex.node) == vect_external_def
4247 && !SLP_TREE_VEC_DEFS (vertex.node).exists ())
4248 || SLP_TREE_DEF_TYPE (vertex.node) == vect_constant_def)
4249 vertex.partition = -1;
4250 else
4252 bool existed;
4253 if (m_optimize_size)
4254 existed = next_partition_i > partition_i;
4255 else
4257 struct loop *loop = containing_loop (vertex.node);
4258 auto &entry = loop_partitions.get_or_insert (loop, &existed);
4259 if (!existed)
4260 entry = next_partition_i;
4261 partition_i = entry;
4263 if (!existed)
4265 m_partitions.quick_push (slpg_partition_info ());
4266 next_partition_i += 1;
4268 vertex.partition = partition_i;
4269 num_partitioned_nodes += 1;
4270 m_partitions[partition_i].node_end += 1;
4273 rpo_begin = rpo_end;
4276 /* Assign ranges of consecutive node indices to each partition,
4277 in partition order. Start with node_end being the same as
4278 node_begin so that the next loop can use it as a counter. */
4279 unsigned int node_begin = 0;
4280 for (auto &partition : m_partitions)
4282 partition.node_begin = node_begin;
4283 node_begin += partition.node_end;
4284 partition.node_end = partition.node_begin;
4286 gcc_assert (node_begin == num_partitioned_nodes);
4288 /* Finally build the list of nodes in partition order. */
4289 m_partitioned_nodes.truncate (num_partitioned_nodes);
4290 for (unsigned int node_i = 0; node_i < m_vertices.length (); ++node_i)
4292 int partition_i = m_vertices[node_i].partition;
4293 if (partition_i >= 0)
4295 unsigned int order_i = m_partitions[partition_i].node_end++;
4296 m_partitioned_nodes[order_i] = node_i;
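
/* Illustrative sketch (not part of the original source): the bucket
   placement used above, on plain arrays.  SCC_POS starts out holding the
   first slot owned by each component and is bumped as nodes are placed,
   so afterwards it holds one past the last slot of each component (the
   RPO_END boundaries consumed by the partitioning loop).  All names and
   numbers here are hypothetical.  */

static void
slpg_scc_bucket_example ()
{
  /* Nodes 0 and 2 form component 0, nodes 1 and 3 form component 1;
     POSTORDER is a postorder of the nodes (read backwards for RPO).  */
  const unsigned component[4] = { 0, 1, 0, 1 };
  const unsigned postorder[4] = { 2, 3, 0, 1 };

  /* Component 0 owns slots [0, 2), component 1 owns slots [2, 4).  */
  unsigned scc_pos[2] = { 0, 2 };

  unsigned placed[4];
  for (unsigned i = 4; i-- > 0;)
    placed[scc_pos[component[postorder[i]]]++] = postorder[i];

  /* Nodes are now grouped by component, each group in reverse postorder,
     and scc_pos gives the end of each group.  */
  gcc_assert (placed[0] == 0 && placed[1] == 2
	      && placed[2] == 1 && placed[3] == 3);
  gcc_assert (scc_pos[0] == 2 && scc_pos[1] == 4);
}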
4301 /* Look for edges from earlier partitions into node NODE_I and edges from
4302 node NODE_I into later partitions. Call:
4304 FN (ud, other_node_i)
4306 for each such use-to-def edge ud, where other_node_i is the node at the
4307 other end of the edge. */
4309 template<typename T>
4310 void
4311 vect_optimize_slp_pass::for_each_partition_edge (unsigned int node_i, T fn)
4313 int partition_i = m_vertices[node_i].partition;
4314 for (graph_edge *pred = m_slpg->vertices[node_i].pred;
4315 pred; pred = pred->pred_next)
4317 int src_partition_i = m_vertices[pred->src].partition;
4318 if (src_partition_i >= 0 && src_partition_i != partition_i)
4319 fn (pred, pred->src);
4321 for (graph_edge *succ = m_slpg->vertices[node_i].succ;
4322 succ; succ = succ->succ_next)
4324 int dest_partition_i = m_vertices[succ->dest].partition;
4325 if (dest_partition_i >= 0 && dest_partition_i != partition_i)
4326 fn (succ, succ->dest);
4330 /* Return true if layout LAYOUT_I is compatible with the number of SLP lanes
4331 that NODE would operate on. This test is independent of NODE's actual
4332 operation. */
4334 bool
4335 vect_optimize_slp_pass::is_compatible_layout (slp_tree node,
4336 unsigned int layout_i)
4338 if (layout_i == 0)
4339 return true;
4341 if (SLP_TREE_LANES (node) != m_perms[layout_i].length ())
4342 return false;
4344 return true;
4347 /* Return the cost (in arbitrary units) of going from layout FROM_LAYOUT_I
4348 to layout TO_LAYOUT_I for a node like NODE. Return -1 if either of the
4349 layouts is incompatible with NODE or if the change is not possible for
4350 some other reason.
4352 The properties taken from NODE include the number of lanes and the
4353 vector type. The actual operation doesn't matter. */
4356 vect_optimize_slp_pass::change_layout_cost (slp_tree node,
4357 unsigned int from_layout_i,
4358 unsigned int to_layout_i)
4360 if (!is_compatible_layout (node, from_layout_i)
4361 || !is_compatible_layout (node, to_layout_i))
4362 return -1;
4364 if (from_layout_i == to_layout_i)
4365 return 0;
4367 auto_vec<slp_tree, 1> children (1);
4368 children.quick_push (node);
4369 auto_lane_permutation_t perm (SLP_TREE_LANES (node));
4370 if (from_layout_i > 0)
4371 for (unsigned int i : m_perms[from_layout_i])
4372 perm.quick_push ({ 0, i });
4373 else
4374 for (unsigned int i = 0; i < SLP_TREE_LANES (node); ++i)
4375 perm.quick_push ({ 0, i });
4376 if (to_layout_i > 0)
4377 vect_slp_permute (m_perms[to_layout_i], perm, true);
4378 auto count = vectorizable_slp_permutation_1 (m_vinfo, nullptr, node, perm,
4379 children, false);
4380 if (count >= 0)
4381 return MAX (count, 1);
4383 /* ??? In principle we could try changing via layout 0, giving two
4384 layout changes rather than 1. Doing that would require
4385 corresponding support in get_result_with_layout. */
4386 return -1;
4389 /* Return the costs of assigning layout LAYOUT_I to partition PARTITION_I. */
4391 inline slpg_partition_layout_costs &
4392 vect_optimize_slp_pass::partition_layout_costs (unsigned int partition_i,
4393 unsigned int layout_i)
4395 return m_partition_layout_costs[partition_i * m_perms.length () + layout_i];
4398 /* Change PERM in one of two ways:
4400 - if IN_LAYOUT_I < 0, accept input operand I in the layout that has been
4401 chosen for child I of NODE.
4403 - if IN_LAYOUT_I >= 0, accept all input operands with that layout.
4405 In both cases, arrange for the output to have layout OUT_LAYOUT_I. */
4407 void
4408 vect_optimize_slp_pass::
4409 change_vec_perm_layout (slp_tree node, lane_permutation_t &perm,
4410 int in_layout_i, unsigned int out_layout_i)
4412 for (auto &entry : perm)
4414 int this_in_layout_i = in_layout_i;
4415 if (this_in_layout_i < 0)
4417 slp_tree in_node = SLP_TREE_CHILDREN (node)[entry.first];
4418 unsigned int in_partition_i = m_vertices[in_node->vertex].partition;
4419 this_in_layout_i = m_partitions[in_partition_i].layout;
4421 if (this_in_layout_i > 0)
4422 entry.second = m_perms[this_in_layout_i][entry.second];
4424 if (out_layout_i > 0)
4425 vect_slp_permute (m_perms[out_layout_i], perm, true);
4428 /* Check whether the target allows NODE to be rearranged so that the node's
4429 output has layout OUT_LAYOUT_I. Return the cost of the change if so,
4430 in the same arbitrary units as for change_layout_cost. Return -1 otherwise.
4432 If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I < 0, also check whether
4433 NODE can adapt to the layout changes that have (perhaps provisionally)
4434 been chosen for NODE's children, so that no extra permutations are
4435 needed on either the input or the output of NODE.
4437 If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I >= 0, instead assume
4438 that all inputs will be forced into layout IN_LAYOUT_I beforehand.
4440 IN_LAYOUT_I has no meaning for other types of node.
4442 Keeping the node as-is is always valid. If the target doesn't appear
4443 to support the node as-is, but might realistically support other layouts,
4444 then layout 0 instead has the cost of a worst-case permutation. On the
4445 one hand, this ensures that every node has at least one valid layout,
4446 avoiding what would otherwise be an awkward special case. On the other,
4447 it still encourages the pass to change an invalid pre-existing layout
4448 choice into a valid one. */
4451 vect_optimize_slp_pass::internal_node_cost (slp_tree node, int in_layout_i,
4452 unsigned int out_layout_i)
4454 const int fallback_cost = 1;
4456 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
4458 auto_lane_permutation_t tmp_perm;
4459 tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
4461 /* Check that the child nodes support the chosen layout. Checking
4462 the first child is enough, since any second child would have the
4463 same shape. */
4464 auto first_child = SLP_TREE_CHILDREN (node)[0];
4465 if (in_layout_i > 0
4466 && !is_compatible_layout (first_child, in_layout_i))
4467 return -1;
4469 change_vec_perm_layout (node, tmp_perm, in_layout_i, out_layout_i);
4470 int count = vectorizable_slp_permutation_1 (m_vinfo, nullptr,
4471 node, tmp_perm,
4472 SLP_TREE_CHILDREN (node),
4473 false);
4474 if (count < 0)
4476 if (in_layout_i == 0 && out_layout_i == 0)
4478 /* Use the fallback cost if the node could in principle support
4479 some nonzero layout for both the inputs and the outputs.
4480 Otherwise assume that the node will be rejected later
4481 and rebuilt from scalars. */
4482 if (SLP_TREE_LANES (node) == SLP_TREE_LANES (first_child))
4483 return fallback_cost;
4484 return 0;
4486 return -1;
4489 /* We currently have no way of telling whether the new layout is cheaper
4490 or more expensive than the old one. But at least in principle,
4491 it should be worth making zero permutations (whole-vector shuffles)
4492 cheaper than real permutations, in case the pass is able to remove
4493 the latter. */
4494 return count == 0 ? 0 : 1;
4497 stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
4498 if (rep
4499 && STMT_VINFO_DATA_REF (rep)
4500 && DR_IS_READ (STMT_VINFO_DATA_REF (rep))
4501 && SLP_TREE_LOAD_PERMUTATION (node).exists ())
4503 auto_load_permutation_t tmp_perm;
4504 tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
4505 if (out_layout_i > 0)
4506 vect_slp_permute (m_perms[out_layout_i], tmp_perm, true);
4508 poly_uint64 vf = 1;
4509 if (auto loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo))
4510 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
4511 unsigned int n_perms;
4512 if (!vect_transform_slp_perm_load_1 (m_vinfo, node, tmp_perm, vNULL,
4513 nullptr, vf, true, false, &n_perms))
4515 auto rep = SLP_TREE_REPRESENTATIVE (node);
4516 if (out_layout_i == 0)
4518 /* Use the fallback cost if the load is an N-to-N permutation.
4519 Otherwise assume that the node will be rejected later
4520 and rebuilt from scalars. */
4521 if (STMT_VINFO_GROUPED_ACCESS (rep)
4522 && (DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (rep))
4523 == SLP_TREE_LANES (node)))
4524 return fallback_cost;
4525 return 0;
4527 return -1;
4530 /* See the comment above the corresponding VEC_PERM_EXPR handling. */
4531 return n_perms == 0 ? 0 : 1;
4534 return 0;
4537 /* Decide which element layouts we should consider using. Calculate the
4538 weights associated with inserting layout changes on partition edges.
4539 Also mark partitions that cannot change layout, by setting their
4540 layout to zero. */
4542 void
4543 vect_optimize_slp_pass::start_choosing_layouts ()
4545 /* Used to assign unique permutation indices. */
4546 using perm_hash = unbounded_hashmap_traits<
4547 vec_free_hash_base<int_hash_base<unsigned>>,
4548 int_hash<int, -1, -2>
4550 hash_map<vec<unsigned>, int, perm_hash> layout_ids;
4552 /* Layout 0 is "no change". */
4553 m_perms.safe_push (vNULL);
4555 /* Create layouts from existing permutations. */
4556 auto_load_permutation_t tmp_perm;
4557 for (unsigned int node_i : m_partitioned_nodes)
4559 /* Leafs also double as entries to the reverse graph. Allow the
4560 layout of those to be changed. */
4561 auto &vertex = m_vertices[node_i];
4562 auto &partition = m_partitions[vertex.partition];
4563 if (!m_slpg->vertices[node_i].succ)
4564 partition.layout = 0;
4566 /* Loads and VEC_PERM_EXPRs are the only things generating permutes. */
4567 slp_tree node = vertex.node;
4568 stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node);
4569 slp_tree child;
4570 unsigned HOST_WIDE_INT imin, imax = 0;
4571 bool any_permute = false;
4572 tmp_perm.truncate (0);
4573 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
4575 /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the node
4576 unpermuted, record a layout that reverses this permutation.
4578 We would need more work to cope with loads that are internally
4579 permuted and also have inputs (such as masks for
4580 IFN_MASK_LOADs). */
4581 gcc_assert (partition.layout == 0 && !m_slpg->vertices[node_i].succ);
4582 if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
4583 continue;
4584 dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt);
4585 imin = DR_GROUP_SIZE (dr_stmt) + 1;
4586 tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
4588 else if (SLP_TREE_CODE (node) == VEC_PERM_EXPR
4589 && SLP_TREE_CHILDREN (node).length () == 1
4590 && (child = SLP_TREE_CHILDREN (node)[0])
4591 && (TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (child))
4592 .is_constant (&imin)))
4594 /* If the child has the same vector size as this node,
4595 reversing the permutation can make the permutation a no-op.
4596 In other cases it can change a true permutation into a
4597 full-vector extract. */
4598 tmp_perm.reserve (SLP_TREE_LANES (node));
4599 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4600 tmp_perm.quick_push (SLP_TREE_LANE_PERMUTATION (node)[j].second);
4602 else
4603 continue;
4605 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4607 unsigned idx = tmp_perm[j];
4608 imin = MIN (imin, idx);
4609 imax = MAX (imax, idx);
4610 if (idx - tmp_perm[0] != j)
4611 any_permute = true;
4613 /* If the span doesn't match, we'd disrupt VF computation; avoid
4614 that for now. */
4615 if (imax - imin + 1 != SLP_TREE_LANES (node))
4616 continue;
4617 /* If there's no permute, there's no need to split one out. In this case
4618 we can consider turning a load into a permuted load, if that
4619 turns out to be cheaper than alternatives. */
4620 if (!any_permute)
4622 partition.layout = -1;
4623 continue;
4626 /* For now only handle true permutes, like
4627 vect_attempt_slp_rearrange_stmts did. This allows us to be lazy
4628 when permuting constants and invariants, keeping the permute
4629 bijective. */
4630 auto_sbitmap load_index (SLP_TREE_LANES (node));
4631 bitmap_clear (load_index);
4632 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4633 bitmap_set_bit (load_index, tmp_perm[j] - imin);
4634 unsigned j;
4635 for (j = 0; j < SLP_TREE_LANES (node); ++j)
4636 if (!bitmap_bit_p (load_index, j))
4637 break;
4638 if (j != SLP_TREE_LANES (node))
4639 continue;
4641 vec<unsigned> perm = vNULL;
4642 perm.safe_grow (SLP_TREE_LANES (node), true);
4643 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4644 perm[j] = tmp_perm[j] - imin;
4646 if (int (m_perms.length ()) >= param_vect_max_layout_candidates)
4648 /* Continue to use existing layouts, but don't add any more. */
4649 int *entry = layout_ids.get (perm);
4650 partition.layout = entry ? *entry : 0;
4651 perm.release ();
4653 else
4655 bool existed;
4656 int &layout_i = layout_ids.get_or_insert (perm, &existed);
4657 if (existed)
4658 perm.release ();
4659 else
4661 layout_i = m_perms.length ();
4662 m_perms.safe_push (perm);
4664 partition.layout = layout_i;
4668 /* Initially assume that every layout is possible and has zero cost
4669 in every partition. */
4670 m_partition_layout_costs.safe_grow_cleared (m_partitions.length ()
4671 * m_perms.length ());
4673 /* Outgoing permutations facing non-reduction graph entries that are not
4674 themselves represented have to be materialized, so fix those to layout 0. */
4675 for (slp_instance instance : m_vinfo->slp_instances)
4676 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor)
4678 unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex;
4679 m_partitions[m_vertices[node_i].partition].layout = 0;
4682 /* Check which layouts each node and partition can handle. Calculate the
4683 weights associated with inserting layout changes on edges. */
4684 for (unsigned int node_i : m_partitioned_nodes)
4686 auto &vertex = m_vertices[node_i];
4687 auto &partition = m_partitions[vertex.partition];
4688 slp_tree node = vertex.node;
4690 if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
4692 vertex.weight = vect_slp_node_weight (node);
4694 /* We do not handle stores with a permutation, so all
4695 incoming permutations must have been materialized.
4697 We also don't handle masked grouped loads, which lack a
4698 permutation vector. In this case the memory locations
4699 form an implicit second input to the loads, on top of the
4700 explicit mask input, and the memory input's layout cannot
4701 be changed.
4703 On the other hand, we do support permuting gather loads and
4704 masked gather loads, where each scalar load is independent
4705 of the others. This can be useful if the address/index input
4706 benefits from permutation. */
4707 if (STMT_VINFO_DATA_REF (rep)
4708 && STMT_VINFO_GROUPED_ACCESS (rep)
4709 && !SLP_TREE_LOAD_PERMUTATION (node).exists ())
4710 partition.layout = 0;
4712 /* We cannot change the layout of an operation that is
4713 not independent of lanes. Note this is an explicit
4714 negative list since that's much shorter than the respective
4715 positive one, but it's critical to keep maintaining it. */
4716 if (is_gimple_call (STMT_VINFO_STMT (rep)))
4717 switch (gimple_call_combined_fn (STMT_VINFO_STMT (rep)))
4719 case CFN_COMPLEX_ADD_ROT90:
4720 case CFN_COMPLEX_ADD_ROT270:
4721 case CFN_COMPLEX_MUL:
4722 case CFN_COMPLEX_MUL_CONJ:
4723 case CFN_VEC_ADDSUB:
4724 case CFN_VEC_FMADDSUB:
4725 case CFN_VEC_FMSUBADD:
4726 partition.layout = 0;
4727 default:;
4731 auto process_edge = [&](graph_edge *ud, unsigned int other_node_i)
4733 auto &other_vertex = m_vertices[other_node_i];
4735 /* Count the number of edges from earlier partitions and the number
4736 of edges to later partitions. */
4737 if (other_vertex.partition < vertex.partition)
4738 partition.in_degree += 1;
4739 else
4740 partition.out_degree += 1;
4742 /* If the current node uses the result of OTHER_NODE_I, accumulate
4743 the effects of that. */
4744 if (ud->src == int (node_i))
4746 other_vertex.out_weight += vertex.weight;
4747 other_vertex.out_degree += 1;
4750 for_each_partition_edge (node_i, process_edge);
4754 /* Return the incoming costs for node NODE_I, assuming that each input keeps
4755 its current (provisional) choice of layout. The inputs do not necessarily
4756 have the same layout as each other. */
4758 slpg_layout_cost
4759 vect_optimize_slp_pass::total_in_cost (unsigned int node_i)
4761 auto &vertex = m_vertices[node_i];
4762 slpg_layout_cost cost;
4763 auto add_cost = [&](graph_edge *, unsigned int other_node_i)
4765 auto &other_vertex = m_vertices[other_node_i];
4766 if (other_vertex.partition < vertex.partition)
4768 auto &other_partition = m_partitions[other_vertex.partition];
4769 auto &other_costs = partition_layout_costs (other_vertex.partition,
4770 other_partition.layout);
4771 slpg_layout_cost this_cost = other_costs.in_cost;
4772 this_cost.add_serial_cost (other_costs.internal_cost);
4773 this_cost.split (other_partition.out_degree);
4774 cost.add_parallel_cost (this_cost);
4777 for_each_partition_edge (node_i, add_cost);
4778 return cost;
4781 /* Return the cost of switching between layout LAYOUT1_I (at node NODE1_I)
4782 and layout LAYOUT2_I on cross-partition use-to-def edge UD. Return
4783 slpg_layout_cost::impossible () if the change isn't possible. */
4785 slpg_layout_cost
4786 vect_optimize_slp_pass::
4787 edge_layout_cost (graph_edge *ud, unsigned int node1_i, unsigned int layout1_i,
4788 unsigned int layout2_i)
4790 auto &def_vertex = m_vertices[ud->dest];
4791 auto &use_vertex = m_vertices[ud->src];
4792 auto def_layout_i = ud->dest == int (node1_i) ? layout1_i : layout2_i;
4793 auto use_layout_i = ud->dest == int (node1_i) ? layout2_i : layout1_i;
4794 auto factor = change_layout_cost (def_vertex.node, def_layout_i,
4795 use_layout_i);
4796 if (factor < 0)
4797 return slpg_layout_cost::impossible ();
4799 /* We have a choice of putting the layout change at the site of the
4800 definition or at the site of the use. Prefer the former when
4801 optimizing for size or when the execution frequency of the
4802 definition is no greater than the combined execution frequencies of
4803 the uses. When putting the layout change at the site of the definition,
4804 divvy up the cost among all consumers. */
4805 if (m_optimize_size || def_vertex.weight <= def_vertex.out_weight)
4807 slpg_layout_cost cost = { def_vertex.weight * factor, m_optimize_size };
4808 cost.split (def_vertex.out_degree);
4809 return cost;
4811 return { use_vertex.weight * factor, m_optimize_size };
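
/* Illustrative sketch (not part of the original source): a numeric instance
   of the placement choice above, with hypothetical weights.  The definition
   executes twice per invocation, each of its three consumers executes four
   times, and the layout change has a factor of 1.  The function name is
   made up.  */

static void
slpg_edge_layout_cost_example ()
{
  sreal def_weight = 2;
  sreal use_weight = 4;
  unsigned int out_degree = 3;
  sreal out_weight = use_weight * 3;    /* Combined weight of the uses.  */

  /* DEF_WEIGHT <= OUT_WEIGHT, so it is cheaper to permute once at the
     definition and divide that cost among the consumers than to permute
     separately at each use.  */
  gcc_assert (def_weight <= out_weight);

  slpg_layout_cost at_def = { def_weight, false };
  at_def.split (out_degree);            /* depth 2, total 2/3 per edge */

  slpg_layout_cost at_use = { use_weight, false };   /* depth 4, total 4 */
  gcc_assert (at_def.is_better_than (at_use, /*is_for_size=*/false));
}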
4814 /* UD represents a use-def link between FROM_NODE_I and a node in a later
4815 partition; FROM_NODE_I could be the definition node or the use node.
4816 The node at the other end of the link wants to use layout TO_LAYOUT_I.
4817 Return the cost of any necessary fix-ups on edge UD, or return
4818 slpg_layout_cost::impossible () if the change isn't possible.
4820 At this point, FROM_NODE_I's partition has chosen the cheapest
4821 layout based on the information available so far, but this choice
4822 is only provisional. */
4824 slpg_layout_cost
4825 vect_optimize_slp_pass::forward_cost (graph_edge *ud, unsigned int from_node_i,
4826 unsigned int to_layout_i)
4828 auto &from_vertex = m_vertices[from_node_i];
4829 unsigned int from_partition_i = from_vertex.partition;
4830 slpg_partition_info &from_partition = m_partitions[from_partition_i];
4831 gcc_assert (from_partition.layout >= 0);
4833 /* First calculate the cost on the assumption that FROM_PARTITION sticks
4834 with its current layout preference. */
4835 slpg_layout_cost cost = slpg_layout_cost::impossible ();
4836 auto edge_cost = edge_layout_cost (ud, from_node_i,
4837 from_partition.layout, to_layout_i);
4838 if (edge_cost.is_possible ())
4840 auto &from_costs = partition_layout_costs (from_partition_i,
4841 from_partition.layout);
4842 cost = from_costs.in_cost;
4843 cost.add_serial_cost (from_costs.internal_cost);
4844 cost.split (from_partition.out_degree);
4845 cost.add_serial_cost (edge_cost);
4848 /* Take the minimum of that cost and the cost that applies if
4849 FROM_PARTITION instead switches to TO_LAYOUT_I. */
4850 auto &direct_layout_costs = partition_layout_costs (from_partition_i,
4851 to_layout_i);
4852 if (direct_layout_costs.is_possible ())
4854 slpg_layout_cost direct_cost = direct_layout_costs.in_cost;
4855 direct_cost.add_serial_cost (direct_layout_costs.internal_cost);
4856 direct_cost.split (from_partition.out_degree);
4857 if (!cost.is_possible ()
4858 || direct_cost.is_better_than (cost, m_optimize_size))
4859 cost = direct_cost;
4862 return cost;
4865 /* UD represents a use-def link between TO_NODE_I and a node in an earlier
4866 partition; TO_NODE_I could be the definition node or the use node.
4867 The node at the other end of the link wants to use layout FROM_LAYOUT_I;
4868 return the cost of any necessary fix-ups on edge UD, or
4869 slpg_layout_cost::impossible () if the choice cannot be made.
4871 At this point, TO_NODE_I's partition has a fixed choice of layout. */
4873 slpg_layout_cost
4874 vect_optimize_slp_pass::backward_cost (graph_edge *ud, unsigned int to_node_i,
4875 unsigned int from_layout_i)
4877 auto &to_vertex = m_vertices[to_node_i];
4878 unsigned int to_partition_i = to_vertex.partition;
4879 slpg_partition_info &to_partition = m_partitions[to_partition_i];
4880 gcc_assert (to_partition.layout >= 0);
4882 /* If TO_NODE_I is a VEC_PERM_EXPR consumer, see whether it can be
4883 adjusted for this input having layout FROM_LAYOUT_I. Assume that
4884 any other inputs keep their current choice of layout. */
4885 auto &to_costs = partition_layout_costs (to_partition_i,
4886 to_partition.layout);
4887 if (ud->src == int (to_node_i)
4888 && SLP_TREE_CODE (to_vertex.node) == VEC_PERM_EXPR)
4890 auto &from_partition = m_partitions[m_vertices[ud->dest].partition];
4891 auto old_layout = from_partition.layout;
4892 from_partition.layout = from_layout_i;
4893 int factor = internal_node_cost (to_vertex.node, -1,
4894 to_partition.layout);
4895 from_partition.layout = old_layout;
4896 if (factor >= 0)
4898 slpg_layout_cost cost = to_costs.out_cost;
4899 cost.add_serial_cost ({ to_vertex.weight * factor,
4900 m_optimize_size });
4901 cost.split (to_partition.in_degree);
4902 return cost;
4906 /* Compute the cost if we insert any necessary layout change on edge UD. */
4907 auto edge_cost = edge_layout_cost (ud, to_node_i,
4908 to_partition.layout, from_layout_i);
4909 if (edge_cost.is_possible ())
4911 slpg_layout_cost cost = to_costs.out_cost;
4912 cost.add_serial_cost (to_costs.internal_cost);
4913 cost.split (to_partition.in_degree);
4914 cost.add_serial_cost (edge_cost);
4915 return cost;
4918 return slpg_layout_cost::impossible ();
4921 /* Make a forward pass through the partitions, accumulating input costs.
4922 Make a tentative (provisional) choice of layout for each partition,
4923 ensuring that this choice still allows later partitions to keep
4924 their original layout. */
4926 void
4927 vect_optimize_slp_pass::forward_pass ()
4929 for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
4930 ++partition_i)
4932 auto &partition = m_partitions[partition_i];
4934 /* If the partition consists of a single VEC_PERM_EXPR, precompute
4935 the incoming cost that would apply if every predecessor partition
4936 keeps its current layout. This is used within the loop below. */
4937 slpg_layout_cost in_cost;
4938 slp_tree single_node = nullptr;
4939 if (partition.node_end == partition.node_begin + 1)
4941 unsigned int node_i = m_partitioned_nodes[partition.node_begin];
4942 single_node = m_vertices[node_i].node;
4943 if (SLP_TREE_CODE (single_node) == VEC_PERM_EXPR)
4944 in_cost = total_in_cost (node_i);
4947 /* Go through the possible layouts. Decide which ones are valid
4948 for this partition and record which of the valid layouts has
4949 the lowest cost. */
4950 unsigned int min_layout_i = 0;
4951 slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
4952 for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
4954 auto &layout_costs = partition_layout_costs (partition_i, layout_i);
4955 if (!layout_costs.is_possible ())
4956 continue;
4958 /* If the recorded layout is already 0 then the layout cannot
4959 change. */
4960 if (partition.layout == 0 && layout_i != 0)
4962 layout_costs.mark_impossible ();
4963 continue;
4966 bool is_possible = true;
4967 for (unsigned int order_i = partition.node_begin;
4968 order_i < partition.node_end; ++order_i)
4970 unsigned int node_i = m_partitioned_nodes[order_i];
4971 auto &vertex = m_vertices[node_i];
4973 /* Reject the layout if it is individually incompatible
4974 with any node in the partition. */
4975 if (!is_compatible_layout (vertex.node, layout_i))
4977 is_possible = false;
4978 break;
4981 auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
4983 auto &other_vertex = m_vertices[other_node_i];
4984 if (other_vertex.partition < vertex.partition)
4986 /* Accumulate the incoming costs from earlier
4987 partitions, plus the cost of any layout changes
4988 on UD itself. */
4989 auto cost = forward_cost (ud, other_node_i, layout_i);
4990 if (!cost.is_possible ())
4991 is_possible = false;
4992 else
4993 layout_costs.in_cost.add_parallel_cost (cost);
4995 else
4996 /* Reject the layout if it would make layout 0 impossible
4997 for later partitions. This amounts to testing that the
4998 target supports reversing the layout change on edges
4999 to later partitions.
5001 In principle, it might be possible to push a layout
5002 change all the way down a graph, so that it never
5003 needs to be reversed and so that the target doesn't
5004 need to support the reverse operation. But it would
5005 be awkward to bail out if we hit a partition that
5006 does not support the new layout, especially since
5007 we are not dealing with a lattice. */
5008 is_possible &= edge_layout_cost (ud, other_node_i, 0,
5009 layout_i).is_possible ();
5011 for_each_partition_edge (node_i, add_cost);
5013 /* Accumulate the cost of using LAYOUT_I within NODE,
5014 both for the inputs and the outputs. */
5015 int factor = internal_node_cost (vertex.node, layout_i,
5016 layout_i);
5017 if (factor < 0)
5019 is_possible = false;
5020 break;
5022 else if (factor)
5023 layout_costs.internal_cost.add_serial_cost
5024 ({ vertex.weight * factor, m_optimize_size });
5026 if (!is_possible)
5028 layout_costs.mark_impossible ();
5029 continue;
5032 /* Combine the incoming and partition-internal costs. */
5033 slpg_layout_cost combined_cost = layout_costs.in_cost;
5034 combined_cost.add_serial_cost (layout_costs.internal_cost);
5036 /* If this partition consists of a single VEC_PERM_EXPR, see
5037 if the VEC_PERM_EXPR can be changed to support output layout
5038 LAYOUT_I while keeping all the provisional choices of input
5039 layout. */
5040 if (single_node
5041 && SLP_TREE_CODE (single_node) == VEC_PERM_EXPR)
5043 int factor = internal_node_cost (single_node, -1, layout_i);
5044 if (factor >= 0)
5046 auto weight = m_vertices[single_node->vertex].weight;
5047 slpg_layout_cost internal_cost
5048 = { weight * factor, m_optimize_size };
5050 slpg_layout_cost alt_cost = in_cost;
5051 alt_cost.add_serial_cost (internal_cost);
5052 if (alt_cost.is_better_than (combined_cost, m_optimize_size))
5054 combined_cost = alt_cost;
5055 layout_costs.in_cost = in_cost;
5056 layout_costs.internal_cost = internal_cost;
5061 /* Record the layout with the lowest cost. Prefer layout 0 in
5062 the event of a tie between it and another layout. */
5063 if (!min_layout_cost.is_possible ()
5064 || combined_cost.is_better_than (min_layout_cost,
5065 m_optimize_size))
5067 min_layout_i = layout_i;
5068 min_layout_cost = combined_cost;
5072 /* This loop's handling of earlier partitions should ensure that
5073 choosing the original layout for the current partition is no
5074 less valid than it was in the original graph, even with the
5075 provisional layout choices for those earlier partitions. */
5076 gcc_assert (min_layout_cost.is_possible ());
5077 partition.layout = min_layout_i;
5081 /* Make a backward pass through the partitions, accumulating output costs.
5082 Make a final choice of layout for each partition. */
5084 void
5085 vect_optimize_slp_pass::backward_pass ()
5087 for (unsigned int partition_i = m_partitions.length (); partition_i-- > 0;)
5089 auto &partition = m_partitions[partition_i];
5091 unsigned int min_layout_i = 0;
5092 slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
5093 for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
5095 auto &layout_costs = partition_layout_costs (partition_i, layout_i);
5096 if (!layout_costs.is_possible ())
5097 continue;
5099 /* Accumulate the costs from successor partitions. */
5100 bool is_possible = true;
5101 for (unsigned int order_i = partition.node_begin;
5102 order_i < partition.node_end; ++order_i)
5104 unsigned int node_i = m_partitioned_nodes[order_i];
5105 auto &vertex = m_vertices[node_i];
5106 auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
5108 auto &other_vertex = m_vertices[other_node_i];
5109 auto &other_partition = m_partitions[other_vertex.partition];
5110 if (other_vertex.partition > vertex.partition)
5112 /* Accumulate the costs from later
5113 partitions, plus the cost of any layout changes
5114 on UD itself. */
5115 auto cost = backward_cost (ud, other_node_i, layout_i);
5116 if (!cost.is_possible ())
5117 is_possible = false;
5118 else
5119 layout_costs.out_cost.add_parallel_cost (cost);
5121 else
5122 /* Make sure that earlier partitions can (if necessary
5123 or beneficial) keep the layout that they chose in
5124 the forward pass. This ensures that there is at
5125 least one valid choice of layout. */
5126 is_possible &= edge_layout_cost (ud, other_node_i,
5127 other_partition.layout,
5128 layout_i).is_possible ();
5130 for_each_partition_edge (node_i, add_cost);
5132 if (!is_possible)
5134 layout_costs.mark_impossible ();
5135 continue;
5138 /* Locally combine the costs from the forward and backward passes.
5139 (This combined cost is not passed on, since that would lead
5140 to double counting.) */
5141 slpg_layout_cost combined_cost = layout_costs.in_cost;
5142 combined_cost.add_serial_cost (layout_costs.internal_cost);
5143 combined_cost.add_serial_cost (layout_costs.out_cost);
5145 /* Record the layout with the lowest cost. Prefer layout 0 in
5146 the event of a tie between it and another layout. */
5147 if (!min_layout_cost.is_possible ()
5148 || combined_cost.is_better_than (min_layout_cost,
5149 m_optimize_size))
5151 min_layout_i = layout_i;
5152 min_layout_cost = combined_cost;
5156 gcc_assert (min_layout_cost.is_possible ());
5157 partition.layout = min_layout_i;
5161 /* Return a node that applies layout TO_LAYOUT_I to the original form of NODE.
5162 NODE already has the layout that was selected for its partition. */
5164 slp_tree
5165 vect_optimize_slp_pass::get_result_with_layout (slp_tree node,
5166 unsigned int to_layout_i)
5168 unsigned int result_i = node->vertex * m_perms.length () + to_layout_i;
5169 slp_tree result = m_node_layouts[result_i];
5170 if (result)
5171 return result;
5173 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
5174 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
5176 /* If the vector is uniform or unchanged, there's nothing to do. */
5177 if (to_layout_i == 0 || vect_slp_tree_uniform_p (node))
5178 result = node;
5179 else
5181 auto scalar_ops = SLP_TREE_SCALAR_OPS (node).copy ();
5182 result = vect_create_new_slp_node (scalar_ops);
5183 vect_slp_permute (m_perms[to_layout_i], scalar_ops, true);
5186 else
5188 unsigned int partition_i = m_vertices[node->vertex].partition;
5189 unsigned int from_layout_i = m_partitions[partition_i].layout;
5190 if (from_layout_i == to_layout_i)
5191 return node;
5193 /* If NODE is itself a VEC_PERM_EXPR, try to create a parallel
5194 permutation instead of a serial one. Leave the new permutation
5195 in TMP_PERM on success. */
5196 auto_lane_permutation_t tmp_perm;
5197 unsigned int num_inputs = 1;
5198 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5200 tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
5201 if (from_layout_i != 0)
5202 vect_slp_permute (m_perms[from_layout_i], tmp_perm, false);
5203 if (to_layout_i != 0)
5204 vect_slp_permute (m_perms[to_layout_i], tmp_perm, true);
5205 if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
5206 tmp_perm,
5207 SLP_TREE_CHILDREN (node),
5208 false) >= 0)
5209 num_inputs = SLP_TREE_CHILDREN (node).length ();
5210 else
5211 tmp_perm.truncate (0);
5214 if (dump_enabled_p ())
5216 if (tmp_perm.length () > 0)
5217 dump_printf_loc (MSG_NOTE, vect_location,
5218 "duplicating permutation node %p with"
5219 " layout %d\n",
5220 (void *) node, to_layout_i);
5221 else
5222 dump_printf_loc (MSG_NOTE, vect_location,
5223 "inserting permutation node in place of %p\n",
5224 (void *) node);
5227 unsigned int num_lanes = SLP_TREE_LANES (node);
5228 result = vect_create_new_slp_node (num_inputs, VEC_PERM_EXPR);
5229 if (SLP_TREE_SCALAR_STMTS (node).length ())
5231 auto &stmts = SLP_TREE_SCALAR_STMTS (result);
5232 stmts.safe_splice (SLP_TREE_SCALAR_STMTS (node));
5233 if (from_layout_i != 0)
5234 vect_slp_permute (m_perms[from_layout_i], stmts, false);
5235 if (to_layout_i != 0)
5236 vect_slp_permute (m_perms[to_layout_i], stmts, true);
5238 SLP_TREE_REPRESENTATIVE (result) = SLP_TREE_REPRESENTATIVE (node);
5239 SLP_TREE_LANES (result) = num_lanes;
5240 SLP_TREE_VECTYPE (result) = SLP_TREE_VECTYPE (node);
5241 result->vertex = -1;
5243 auto &lane_perm = SLP_TREE_LANE_PERMUTATION (result);
5244 if (tmp_perm.length ())
5246 lane_perm.safe_splice (tmp_perm);
5247 SLP_TREE_CHILDREN (result).safe_splice (SLP_TREE_CHILDREN (node));
5249 else
5251 lane_perm.create (num_lanes);
5252 for (unsigned j = 0; j < num_lanes; ++j)
5253 lane_perm.quick_push ({ 0, j });
5254 if (from_layout_i != 0)
5255 vect_slp_permute (m_perms[from_layout_i], lane_perm, false);
5256 if (to_layout_i != 0)
5257 vect_slp_permute (m_perms[to_layout_i], lane_perm, true);
5258 SLP_TREE_CHILDREN (result).safe_push (node);
5260 for (slp_tree child : SLP_TREE_CHILDREN (result))
5261 child->refcnt++;
5263 m_node_layouts[result_i] = result;
5264 return result;
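/* Editor's note: illustrative sketch, not GCC code.  A "layout" here is just
   a vector of lane indices; apply_perm_sketch shows one direction of what
   vect_slp_permute does above to scalar stmts, lane permutations and load
   permutations (the real helper can also apply the inverse).  */
#include <vector>

template <typename T>
static std::vector<T>
apply_perm_sketch (const std::vector<unsigned> &perm,
		   const std::vector<T> &elems)
{
  std::vector<T> result (elems.size ());
  for (unsigned i = 0; i < perm.size (); ++i)
    result[i] = elems[perm[i]];	/* Lane I of the result takes lane PERM[I].  */
  return result;
}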
5267 /* Apply the chosen vector layouts to the SLP graph. */
5269 void
5270 vect_optimize_slp_pass::materialize ()
5272 /* We no longer need the costs, so avoid having two O(N * P) arrays
5273 live at the same time. */
5274 m_partition_layout_costs.release ();
5275 m_node_layouts.safe_grow_cleared (m_vertices.length () * m_perms.length ());
5277 auto_sbitmap fully_folded (m_vertices.length ());
5278 bitmap_clear (fully_folded);
5279 for (unsigned int node_i : m_partitioned_nodes)
5281 auto &vertex = m_vertices[node_i];
5282 slp_tree node = vertex.node;
5283 int layout_i = m_partitions[vertex.partition].layout;
5284 gcc_assert (layout_i >= 0);
5286 /* Rearrange the scalar statements to match the chosen layout. */
5287 if (layout_i > 0)
5288 vect_slp_permute (m_perms[layout_i],
5289 SLP_TREE_SCALAR_STMTS (node), true);
5291 /* Update load and lane permutations. */
5292 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5294 /* First try to absorb the input vector layouts. If that fails,
5295 force the inputs to have layout LAYOUT_I too. We checked that
5296 that was possible before deciding to use nonzero output layouts.
5297 (Note that at this stage we don't really have any guarantee that
5298 the target supports the original VEC_PERM_EXPR.) */
5299 auto &perm = SLP_TREE_LANE_PERMUTATION (node);
5300 auto_lane_permutation_t tmp_perm;
5301 tmp_perm.safe_splice (perm);
5302 change_vec_perm_layout (node, tmp_perm, -1, layout_i);
5303 if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
5304 tmp_perm,
5305 SLP_TREE_CHILDREN (node),
5306 false) >= 0)
5308 if (dump_enabled_p ()
5309 && !std::equal (tmp_perm.begin (), tmp_perm.end (),
5310 perm.begin ()))
5311 dump_printf_loc (MSG_NOTE, vect_location,
5312 "absorbing input layouts into %p\n",
5313 (void *) node);
5314 std::copy (tmp_perm.begin (), tmp_perm.end (), perm.begin ());
5315 bitmap_set_bit (fully_folded, node_i);
5317 else
5319 /* Not MSG_MISSED because it would make no sense to users. */
5320 if (dump_enabled_p ())
5321 dump_printf_loc (MSG_NOTE, vect_location,
5322 "failed to absorb input layouts into %p\n",
5323 (void *) node);
5324 change_vec_perm_layout (nullptr, perm, layout_i, layout_i);
5327 else
5329 gcc_assert (!SLP_TREE_LANE_PERMUTATION (node).exists ());
5330 auto &load_perm = SLP_TREE_LOAD_PERMUTATION (node);
5331 if (layout_i > 0)
5332 /* ??? When we handle non-bijective permutes the idea
5333 is that we can force the load-permutation to be
5334 { min, min + 1, min + 2, ... max }. But then the
5335 scalar defs might no longer match the lane content
5336 which means wrong-code with live lane vectorization.
5337 So we possibly have to have NULL entries for those. */
5338 vect_slp_permute (m_perms[layout_i], load_perm, true);
5342 /* Do this before any nodes disappear, since it involves a walk
5343 over the leaves. */
5344 remove_redundant_permutations ();
5346 /* Replace each child with a correctly laid-out version. */
5347 for (unsigned int node_i : m_partitioned_nodes)
5349 /* Skip nodes that have already been handled above. */
5350 if (bitmap_bit_p (fully_folded, node_i))
5351 continue;
5353 auto &vertex = m_vertices[node_i];
5354 int in_layout_i = m_partitions[vertex.partition].layout;
5355 gcc_assert (in_layout_i >= 0);
5357 unsigned j;
5358 slp_tree child;
5359 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (vertex.node), j, child)
5361 if (!child)
5362 continue;
5364 slp_tree new_child = get_result_with_layout (child, in_layout_i);
5365 if (new_child != child)
5367 vect_free_slp_tree (child);
5368 SLP_TREE_CHILDREN (vertex.node)[j] = new_child;
5369 new_child->refcnt += 1;
5375 /* Elide load permutations that are not necessary. Such permutations might
5376 be pre-existing, rather than created by the layout optimizations. */
5378 void
5379 vect_optimize_slp_pass::remove_redundant_permutations ()
5381 for (unsigned int node_i : m_leafs)
5383 slp_tree node = m_vertices[node_i].node;
5384 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
5385 continue;
5387 /* In basic block vectorization we allow any subchain of an interleaving
5388 chain.
5389 FORNOW: not in loop SLP because of realignment complications. */
5390 if (is_a <bb_vec_info> (m_vinfo))
5392 bool subchain_p = true;
5393 stmt_vec_info next_load_info = NULL;
5394 stmt_vec_info load_info;
5395 unsigned j;
5396 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
5398 if (j != 0
5399 && (next_load_info != load_info
5400 || DR_GROUP_GAP (load_info) != 1))
5402 subchain_p = false;
5403 break;
5405 next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
5407 if (subchain_p)
5409 SLP_TREE_LOAD_PERMUTATION (node).release ();
5410 continue;
5413 else
5415 loop_vec_info loop_vinfo = as_a<loop_vec_info> (m_vinfo);
5416 stmt_vec_info load_info;
5417 bool this_load_permuted = false;
5418 unsigned j;
5419 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
5420 if (SLP_TREE_LOAD_PERMUTATION (node)[j] != j)
5422 this_load_permuted = true;
5423 break;
5425 stmt_vec_info first_stmt_info
5426 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
5427 if (!this_load_permuted
5428 /* The load requires permutation when unrolling exposes
5429 a gap either because the group is larger than the SLP
5430 group-size or because there is a gap between the groups. */
5431 && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
5432 || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
5433 && DR_GROUP_GAP (first_stmt_info) == 0)))
5435 SLP_TREE_LOAD_PERMUTATION (node).release ();
5436 continue;
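/* Editor's note: illustrative sketch, not GCC code.  For loop SLP the test
   above reduces to checking whether the load permutation is the identity,
   in which case it can be released.  */
#include <vector>

static bool
load_perm_is_identity_sketch (const std::vector<unsigned> &load_perm)
{
  for (unsigned j = 0; j < load_perm.size (); ++j)
    if (load_perm[j] != j)
      return false;	/* Some lane is permuted; keep the permutation.  */
  return true;
}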
5442 /* Print the partition graph and layout information to the dump file. */
5444 void
5445 vect_optimize_slp_pass::dump ()
5447 dump_printf_loc (MSG_NOTE, vect_location,
5448 "SLP optimize permutations:\n");
5449 for (unsigned int layout_i = 1; layout_i < m_perms.length (); ++layout_i)
5451 dump_printf_loc (MSG_NOTE, vect_location, " %d: { ", layout_i);
5452 const char *sep = "";
5453 for (unsigned int idx : m_perms[layout_i])
5455 dump_printf (MSG_NOTE, "%s%d", sep, idx);
5456 sep = ", ";
5458 dump_printf (MSG_NOTE, " }\n");
5460 dump_printf_loc (MSG_NOTE, vect_location,
5461 "SLP optimize partitions:\n");
5462 for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
5463 ++partition_i)
5465 auto &partition = m_partitions[partition_i];
5466 dump_printf_loc (MSG_NOTE, vect_location, " -------------\n");
5467 dump_printf_loc (MSG_NOTE, vect_location,
5468 " partition %d (layout %d):\n",
5469 partition_i, partition.layout);
5470 dump_printf_loc (MSG_NOTE, vect_location, " nodes:\n");
5471 for (unsigned int order_i = partition.node_begin;
5472 order_i < partition.node_end; ++order_i)
5474 auto &vertex = m_vertices[m_partitioned_nodes[order_i]];
5475 dump_printf_loc (MSG_NOTE, vect_location, " - %p:\n",
5476 (void *) vertex.node);
5477 dump_printf_loc (MSG_NOTE, vect_location,
5478 " weight: %f\n",
5479 vertex.weight.to_double ());
5480 if (vertex.out_degree)
5481 dump_printf_loc (MSG_NOTE, vect_location,
5482 " out weight: %f (degree %d)\n",
5483 vertex.out_weight.to_double (),
5484 vertex.out_degree);
5485 if (SLP_TREE_CODE (vertex.node) == VEC_PERM_EXPR)
5486 dump_printf_loc (MSG_NOTE, vect_location,
5487 " op: VEC_PERM_EXPR\n");
5488 else if (auto rep = SLP_TREE_REPRESENTATIVE (vertex.node))
5489 dump_printf_loc (MSG_NOTE, vect_location,
5490 " op template: %G", rep->stmt);
5492 dump_printf_loc (MSG_NOTE, vect_location, " edges:\n");
5493 for (unsigned int order_i = partition.node_begin;
5494 order_i < partition.node_end; ++order_i)
5496 unsigned int node_i = m_partitioned_nodes[order_i];
5497 auto &vertex = m_vertices[node_i];
5498 auto print_edge = [&](graph_edge *, unsigned int other_node_i)
5500 auto &other_vertex = m_vertices[other_node_i];
5501 if (other_vertex.partition < vertex.partition)
5502 dump_printf_loc (MSG_NOTE, vect_location,
5503 " - %p [%d] --> %p\n",
5504 (void *) other_vertex.node,
5505 other_vertex.partition,
5506 (void *) vertex.node);
5507 else
5508 dump_printf_loc (MSG_NOTE, vect_location,
5509 " - %p --> [%d] %p\n",
5510 (void *) vertex.node,
5511 other_vertex.partition,
5512 (void *) other_vertex.node);
5514 for_each_partition_edge (node_i, print_edge);
5517 for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
5519 auto &layout_costs = partition_layout_costs (partition_i, layout_i);
5520 if (layout_costs.is_possible ())
5522 dump_printf_loc (MSG_NOTE, vect_location,
5523 " layout %d:%s\n", layout_i,
5524 partition.layout == int (layout_i)
5525 ? " (*)" : "");
5526 slpg_layout_cost combined_cost = layout_costs.in_cost;
5527 combined_cost.add_serial_cost (layout_costs.internal_cost);
5528 combined_cost.add_serial_cost (layout_costs.out_cost);
5529 #define TEMPLATE "{depth: %f, total: %f}"
5530 dump_printf_loc (MSG_NOTE, vect_location,
5531 " " TEMPLATE "\n",
5532 layout_costs.in_cost.depth.to_double (),
5533 layout_costs.in_cost.total.to_double ());
5534 dump_printf_loc (MSG_NOTE, vect_location,
5535 " + " TEMPLATE "\n",
5536 layout_costs.internal_cost.depth.to_double (),
5537 layout_costs.internal_cost.total.to_double ());
5538 dump_printf_loc (MSG_NOTE, vect_location,
5539 " + " TEMPLATE "\n",
5540 layout_costs.out_cost.depth.to_double (),
5541 layout_costs.out_cost.total.to_double ());
5542 dump_printf_loc (MSG_NOTE, vect_location,
5543 " = " TEMPLATE "\n",
5544 combined_cost.depth.to_double (),
5545 combined_cost.total.to_double ());
5546 #undef TEMPLATE
5548 else
5549 dump_printf_loc (MSG_NOTE, vect_location,
5550 " layout %d: rejected\n", layout_i);
5555 /* Main entry point for the SLP graph optimization pass. */
5557 void
5558 vect_optimize_slp_pass::run ()
5560 build_graph ();
5561 create_partitions ();
5562 start_choosing_layouts ();
5563 if (m_perms.length () > 1)
5565 forward_pass ();
5566 backward_pass ();
5567 if (dump_enabled_p ())
5568 dump ();
5569 materialize ();
5570 while (!m_perms.is_empty ())
5571 m_perms.pop ().release ();
5573 else
5574 remove_redundant_permutations ();
5575 free_graph (m_slpg);
5578 /* Optimize the SLP graph of VINFO. */
5580 void
5581 vect_optimize_slp (vec_info *vinfo)
5583 if (vinfo->slp_instances.is_empty ())
5584 return;
5585 vect_optimize_slp_pass (vinfo).run ();
5588 /* Gather loads reachable from the individual SLP graph entries. */
5590 void
5591 vect_gather_slp_loads (vec_info *vinfo)
5593 unsigned i;
5594 slp_instance instance;
5595 FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
5597 hash_set<slp_tree> visited;
5598 vect_gather_slp_loads (SLP_INSTANCE_LOADS (instance),
5599 SLP_INSTANCE_TREE (instance), visited);
5604 /* For each possible SLP instance decide whether to SLP it and calculate the overall
5605 unrolling factor needed to SLP the loop. Return TRUE if decided to SLP at
5606 least one instance. */
5608 bool
5609 vect_make_slp_decision (loop_vec_info loop_vinfo)
5611 unsigned int i;
5612 poly_uint64 unrolling_factor = 1;
5613 const vec<slp_instance> &slp_instances
5614 = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
5615 slp_instance instance;
5616 int decided_to_slp = 0;
5618 DUMP_VECT_SCOPE ("vect_make_slp_decision");
5620 FOR_EACH_VEC_ELT (slp_instances, i, instance)
5622 /* FORNOW: SLP if you can. */
5623 /* All unroll factors have the form:
5625 GET_MODE_SIZE (vinfo->vector_mode) * X
5627 for some rational X, so they must have a common multiple. */
5628 unrolling_factor
5629 = force_common_multiple (unrolling_factor,
5630 SLP_INSTANCE_UNROLLING_FACTOR (instance));
5632 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
5633 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
5634 loop-based vectorization. Such stmts will be marked as HYBRID. */
5635 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
5636 decided_to_slp++;
5639 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
5641 if (decided_to_slp && dump_enabled_p ())
5643 dump_printf_loc (MSG_NOTE, vect_location,
5644 "Decided to SLP %d instances. Unrolling factor ",
5645 decided_to_slp);
5646 dump_dec (MSG_NOTE, unrolling_factor);
5647 dump_printf (MSG_NOTE, "\n");
5650 return (decided_to_slp > 0);
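/* Editor's note: illustrative sketch, not GCC code.  The combined unrolling
   factor is a common multiple of the per-instance factors; with plain
   integers (instead of poly_uint64 and force_common_multiple) that is simply
   the least common multiple.  */
#include <numeric>
#include <vector>

static unsigned
combined_unroll_factor_sketch (const std::vector<unsigned> &factors)
{
  unsigned uf = 1;
  for (unsigned f : factors)
    uf = std::lcm (uf, f);	/* Accumulate a common multiple.  */
  return uf;
}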
5653 /* Private data for vect_detect_hybrid_slp. */
5654 struct vdhs_data
5656 loop_vec_info loop_vinfo;
5657 vec<stmt_vec_info> *worklist;
5660 /* Walker for walk_gimple_op. */
5662 static tree
5663 vect_detect_hybrid_slp (tree *tp, int *, void *data)
5665 walk_stmt_info *wi = (walk_stmt_info *)data;
5666 vdhs_data *dat = (vdhs_data *)wi->info;
5668 if (wi->is_lhs)
5669 return NULL_TREE;
5671 stmt_vec_info def_stmt_info = dat->loop_vinfo->lookup_def (*tp);
5672 if (!def_stmt_info)
5673 return NULL_TREE;
5674 def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
5675 if (PURE_SLP_STMT (def_stmt_info))
5677 if (dump_enabled_p ())
5678 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
5679 def_stmt_info->stmt);
5680 STMT_SLP_TYPE (def_stmt_info) = hybrid;
5681 dat->worklist->safe_push (def_stmt_info);
5684 return NULL_TREE;
5687 /* Check whether STMT_INFO is consumed by SLP indirectly and mark it pure_slp
5688 if so, otherwise push it to WORKLIST. */
5690 static void
5691 maybe_push_to_hybrid_worklist (vec_info *vinfo,
5692 vec<stmt_vec_info> &worklist,
5693 stmt_vec_info stmt_info)
5695 if (dump_enabled_p ())
5696 dump_printf_loc (MSG_NOTE, vect_location,
5697 "Processing hybrid candidate : %G", stmt_info->stmt);
5698 stmt_vec_info orig_info = vect_orig_stmt (stmt_info);
5699 imm_use_iterator iter2;
5700 ssa_op_iter iter1;
5701 use_operand_p use_p;
5702 def_operand_p def_p;
5703 bool any_def = false;
5704 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_info->stmt, iter1, SSA_OP_DEF)
5706 any_def = true;
5707 FOR_EACH_IMM_USE_FAST (use_p, iter2, DEF_FROM_PTR (def_p))
5709 if (is_gimple_debug (USE_STMT (use_p)))
5710 continue;
5711 stmt_vec_info use_info = vinfo->lookup_stmt (USE_STMT (use_p));
5712 /* An out-of-loop use means this is a loop_vect sink. */
5713 if (!use_info)
5715 if (dump_enabled_p ())
5716 dump_printf_loc (MSG_NOTE, vect_location,
5717 "Found loop_vect sink: %G", stmt_info->stmt);
5718 worklist.safe_push (stmt_info);
5719 return;
5721 else if (!STMT_SLP_TYPE (vect_stmt_to_vectorize (use_info)))
5723 if (dump_enabled_p ())
5724 dump_printf_loc (MSG_NOTE, vect_location,
5725 "Found loop_vect use: %G", use_info->stmt);
5726 worklist.safe_push (stmt_info);
5727 return;
5731 /* No def means this is a loop_vect sink. */
5732 if (!any_def)
5734 if (dump_enabled_p ())
5735 dump_printf_loc (MSG_NOTE, vect_location,
5736 "Found loop_vect sink: %G", stmt_info->stmt);
5737 worklist.safe_push (stmt_info);
5738 return;
5740 if (dump_enabled_p ())
5741 dump_printf_loc (MSG_NOTE, vect_location,
5742 "Marked SLP consumed stmt pure: %G", stmt_info->stmt);
5743 STMT_SLP_TYPE (stmt_info) = pure_slp;
5746 /* Find stmts that must be both vectorized and SLPed. */
5748 void
5749 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
5751 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
5753 /* All stmts participating in SLP are marked pure_slp, all other
5754 stmts are loop_vect.
5755 First collect all loop_vect stmts into a worklist.
5756 SLP patterns cause not all original scalar stmts to appear in
5757 SLP_TREE_SCALAR_STMTS and thus not all of them are marked pure_slp.
5758 Rectify this here and do a backward walk over the IL only considering
5759 stmts as loop_vect when they are used by a loop_vect stmt and otherwise
5760 mark them as pure_slp. */
5761 auto_vec<stmt_vec_info> worklist;
5762 for (int i = LOOP_VINFO_LOOP (loop_vinfo)->num_nodes - 1; i >= 0; --i)
5764 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
5765 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
5766 gsi_next (&gsi))
5768 gphi *phi = gsi.phi ();
5769 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (phi);
5770 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
5771 maybe_push_to_hybrid_worklist (loop_vinfo,
5772 worklist, stmt_info);
5774 for (gimple_stmt_iterator gsi = gsi_last_bb (bb); !gsi_end_p (gsi);
5775 gsi_prev (&gsi))
5777 gimple *stmt = gsi_stmt (gsi);
5778 if (is_gimple_debug (stmt))
5779 continue;
5780 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
5781 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
5783 for (gimple_stmt_iterator gsi2
5784 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
5785 !gsi_end_p (gsi2); gsi_next (&gsi2))
5787 stmt_vec_info patt_info
5788 = loop_vinfo->lookup_stmt (gsi_stmt (gsi2));
5789 if (!STMT_SLP_TYPE (patt_info)
5790 && STMT_VINFO_RELEVANT (patt_info))
5791 maybe_push_to_hybrid_worklist (loop_vinfo,
5792 worklist, patt_info);
5794 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
5796 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
5797 maybe_push_to_hybrid_worklist (loop_vinfo,
5798 worklist, stmt_info);
5802 /* Now we have a worklist of non-SLP stmts, follow use->def chains and
5803 mark any SLP vectorized stmt as hybrid.
5804 ??? We're visiting def stmts N times (once for each non-SLP and
5805 once for each hybrid-SLP use). */
5806 walk_stmt_info wi;
5807 vdhs_data dat;
5808 dat.worklist = &worklist;
5809 dat.loop_vinfo = loop_vinfo;
5810 memset (&wi, 0, sizeof (wi));
5811 wi.info = (void *)&dat;
5812 while (!worklist.is_empty ())
5814 stmt_vec_info stmt_info = worklist.pop ();
5815 /* Since SSA operands are not set up for pattern stmts we need
5816 to use walk_gimple_op. */
5817 wi.is_lhs = 0;
5818 walk_gimple_op (stmt_info->stmt, vect_detect_hybrid_slp, &wi);
5819 /* For gather/scatter make sure to walk the offset operand, that
5820 can be a scaling and conversion away. */
5821 gather_scatter_info gs_info;
5822 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
5823 && vect_check_gather_scatter (stmt_info, loop_vinfo, &gs_info))
5825 int dummy;
5826 vect_detect_hybrid_slp (&gs_info.offset, &dummy, &wi);
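/* Editor's note: illustrative sketch, not GCC code.  Hybrid detection above
   is a worklist fixpoint: starting from loop_vect stmts, every pure_slp def
   they use becomes hybrid and is queued so that its own defs are visited in
   turn.  Stmt ids and the use->def map below are invented for the example.  */
#include <map>
#include <vector>

enum slp_kind_sketch { LOOP_VECT, PURE_SLP, HYBRID };

static void
mark_hybrid_sketch (std::map<int, slp_kind_sketch> &kind,
		    const std::map<int, std::vector<int> > &defs_used_by,
		    std::vector<int> worklist)
{
  while (!worklist.empty ())
    {
      int stmt = worklist.back ();
      worklist.pop_back ();
      auto it = defs_used_by.find (stmt);
      if (it == defs_used_by.end ())
	continue;
      for (int def : it->second)
	if (kind[def] == PURE_SLP)
	  {
	    kind[def] = HYBRID;		/* SLP stmt also needed by loop vect.  */
	    worklist.push_back (def);	/* Follow its own defs as well.  */
	  }
    }
}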
5832 /* Initialize a bb_vec_info struct for the statements in BBS basic blocks. */
5834 _bb_vec_info::_bb_vec_info (vec<basic_block> _bbs, vec_info_shared *shared)
5835 : vec_info (vec_info::bb, shared),
5836 bbs (_bbs),
5837 roots (vNULL)
5839 for (unsigned i = 0; i < bbs.length (); ++i)
5841 if (i != 0)
5842 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
5843 gsi_next (&si))
5845 gphi *phi = si.phi ();
5846 gimple_set_uid (phi, 0);
5847 add_stmt (phi);
5849 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
5850 !gsi_end_p (gsi); gsi_next (&gsi))
5852 gimple *stmt = gsi_stmt (gsi);
5853 gimple_set_uid (stmt, 0);
5854 if (is_gimple_debug (stmt))
5855 continue;
5856 add_stmt (stmt);
5862 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
5863 stmts in the basic block. */
5865 _bb_vec_info::~_bb_vec_info ()
5867 /* Reset region marker. */
5868 for (unsigned i = 0; i < bbs.length (); ++i)
5870 if (i != 0)
5871 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
5872 gsi_next (&si))
5874 gphi *phi = si.phi ();
5875 gimple_set_uid (phi, -1);
5877 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
5878 !gsi_end_p (gsi); gsi_next (&gsi))
5880 gimple *stmt = gsi_stmt (gsi);
5881 gimple_set_uid (stmt, -1);
5885 for (unsigned i = 0; i < roots.length (); ++i)
5887 roots[i].stmts.release ();
5888 roots[i].roots.release ();
5890 roots.release ();
5893 /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
5894 given that child nodes have already been processed, and that
5895 their def types currently match their SLP node's def type. */
5897 static bool
5898 vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
5899 slp_instance node_instance,
5900 stmt_vector_for_cost *cost_vec)
5902 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
5904 /* Calculate the number of vector statements to be created for the
5905 scalar stmts in this node. For SLP reductions it is equal to the
5906 number of vector statements in the children (which has already been
5907 calculated by the recursive call). Otherwise it is the number of
5908 scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
5909 VF divided by the number of elements in a vector. */
5910 if (!STMT_VINFO_DATA_REF (stmt_info)
5911 && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5913 for (unsigned i = 0; i < SLP_TREE_CHILDREN (node).length (); ++i)
5914 if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node)[i]) == vect_internal_def)
5916 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
5917 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[i]);
5918 break;
5921 else
5923 poly_uint64 vf;
5924 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
5925 vf = loop_vinfo->vectorization_factor;
5926 else
5927 vf = 1;
5928 unsigned int group_size = SLP_TREE_LANES (node);
5929 tree vectype = SLP_TREE_VECTYPE (node);
5930 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
5931 = vect_get_num_vectors (vf * group_size, vectype);
5934 /* Handle purely internal nodes. */
5935 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5937 if (!vectorizable_slp_permutation (vinfo, NULL, node, cost_vec))
5938 return false;
5940 stmt_vec_info slp_stmt_info;
5941 unsigned int i;
5942 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info)
5944 if (STMT_VINFO_LIVE_P (slp_stmt_info)
5945 && !vectorizable_live_operation (vinfo,
5946 slp_stmt_info, NULL, node,
5947 node_instance, i,
5948 false, cost_vec))
5949 return false;
5951 return true;
5954 gcc_assert (STMT_SLP_TYPE (stmt_info) != loop_vect);
5956 bool dummy;
5957 return vect_analyze_stmt (vinfo, stmt_info, &dummy,
5958 node, node_instance, cost_vec);
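/* Editor's note: illustrative sketch, not GCC code.  For the non-reduction
   case above the number of vector stmts is the total number of scalar lanes
   (VF times the group size) divided by the number of lanes per vector; the
   vectorizer arranges for this division to be exact.  Plain integers stand
   in for poly_uint64 here.  */
static unsigned
num_vec_stmts_sketch (unsigned vf, unsigned group_size, unsigned nunits)
{
  return vf * group_size / nunits;	/* Exact by construction.  */
}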
5961 /* Try to build NODE from scalars, returning true on success.
5962 NODE_INSTANCE is the SLP instance that contains NODE. */
5964 static bool
5965 vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
5966 slp_instance node_instance)
5968 stmt_vec_info stmt_info;
5969 unsigned int i;
5971 if (!is_a <bb_vec_info> (vinfo)
5972 || node == SLP_INSTANCE_TREE (node_instance)
5973 || !SLP_TREE_SCALAR_STMTS (node).exists ()
5974 || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node))
5975 /* Force the mask use to be built from scalars instead. */
5976 || VECTOR_BOOLEAN_TYPE_P (SLP_TREE_VECTYPE (node)))
5977 return false;
5979 if (dump_enabled_p ())
5980 dump_printf_loc (MSG_NOTE, vect_location,
5981 "Building vector operands of %p from scalars instead\n",
5982 (void *) node);
5984 /* Don't remove and free the child nodes here, since they could be
5985 referenced by other structures. The analysis and scheduling phases
5986 (need to) ignore child nodes of anything that isn't vect_internal_def. */
5987 unsigned int group_size = SLP_TREE_LANES (node);
5988 SLP_TREE_DEF_TYPE (node) = vect_external_def;
5989 /* Invariants get their vector type from the uses. */
5990 SLP_TREE_VECTYPE (node) = NULL_TREE;
5991 SLP_TREE_SCALAR_OPS (node).safe_grow (group_size, true);
5992 SLP_TREE_LOAD_PERMUTATION (node).release ();
5993 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
5995 tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
5996 SLP_TREE_SCALAR_OPS (node)[i] = lhs;
5998 return true;
6001 /* Return true if all elements of the slice are the same. */
6002 bool
6003 vect_scalar_ops_slice::all_same_p () const
6005 for (unsigned int i = 1; i < length; ++i)
6006 if (!operand_equal_p (op (0), op (i)))
6007 return false;
6008 return true;
6011 hashval_t
6012 vect_scalar_ops_slice_hash::hash (const value_type &s)
6014 hashval_t hash = 0;
6015 for (unsigned i = 0; i < s.length; ++i)
6016 hash = iterative_hash_expr (s.op (i), hash);
6017 return hash;
6020 bool
6021 vect_scalar_ops_slice_hash::equal (const value_type &s1,
6022 const compare_type &s2)
6024 if (s1.length != s2.length)
6025 return false;
6026 for (unsigned i = 0; i < s1.length; ++i)
6027 if (!operand_equal_p (s1.op (i), s2.op (i)))
6028 return false;
6029 return true;
6032 /* Compute the prologue cost for invariant or constant operands represented
6033 by NODE. */
6035 static void
6036 vect_prologue_cost_for_slp (slp_tree node,
6037 stmt_vector_for_cost *cost_vec)
6039 /* There's a special case of an existing vector, that costs nothing. */
6040 if (SLP_TREE_SCALAR_OPS (node).length () == 0
6041 && !SLP_TREE_VEC_DEFS (node).is_empty ())
6042 return;
6043 /* Without looking at the actual initializer a vector of
6044 constants can be implemented as a load from the constant pool.
6045 When all elements are the same we can use a splat. */
6046 tree vectype = SLP_TREE_VECTYPE (node);
6047 unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
6048 unsigned HOST_WIDE_INT const_nunits;
6049 unsigned nelt_limit;
6050 auto ops = &SLP_TREE_SCALAR_OPS (node);
6051 auto_vec<unsigned int> starts (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
6052 if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
6053 && ! multiple_p (const_nunits, group_size))
6055 nelt_limit = const_nunits;
6056 hash_set<vect_scalar_ops_slice_hash> vector_ops;
6057 for (unsigned int i = 0; i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); ++i)
6058 if (!vector_ops.add ({ ops, i * const_nunits, const_nunits }))
6059 starts.quick_push (i * const_nunits);
6061 else
6063 /* If either the vector has variable length or the vectors
6064 are composed of repeated whole groups we only need to
6065 cost construction once. All vectors will be the same. */
6066 nelt_limit = group_size;
6067 starts.quick_push (0);
6069 /* ??? We're just tracking whether vectors in a single node are the same.
6070 Ideally we'd do something more global. */
6071 for (unsigned int start : starts)
6073 vect_cost_for_stmt kind;
6074 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def)
6075 kind = vector_load;
6076 else if (vect_scalar_ops_slice { ops, start, nelt_limit }.all_same_p ())
6077 kind = scalar_to_vec;
6078 else
6079 kind = vec_construct;
6080 record_stmt_cost (cost_vec, 1, kind, node, vectype, 0, vect_prologue);
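/* Editor's note: illustrative sketch, not GCC code.  When the number of lanes
   per vector is a compile-time constant, identical slices of the scalar
   operands only need their construction costed once; a std::set of slices
   stands in for the vect_scalar_ops_slice hash set used above.  */
#include <set>
#include <string>
#include <vector>

static std::vector<unsigned>
unique_slice_starts_sketch (const std::vector<std::string> &ops,
			    unsigned nunits)
{
  std::set<std::vector<std::string> > seen;
  std::vector<unsigned> starts;
  for (unsigned start = 0; start + nunits <= ops.size (); start += nunits)
    {
      std::vector<std::string> slice (ops.begin () + start,
				      ops.begin () + start + nunits);
      if (seen.insert (slice).second)	/* First occurrence of this slice.  */
	starts.push_back (start);	/* Cost its construction once.  */
    }
  return starts;
}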
6084 /* Analyze statements contained in SLP tree NODE after recursively analyzing
6085 the subtree. NODE_INSTANCE contains NODE and VINFO contains INSTANCE.
6087 Return true if the operations are supported. */
6089 static bool
6090 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
6091 slp_instance node_instance,
6092 hash_set<slp_tree> &visited_set,
6093 vec<slp_tree> &visited_vec,
6094 stmt_vector_for_cost *cost_vec)
6096 int i, j;
6097 slp_tree child;
6099 /* Assume we can code-generate all invariants. */
6100 if (!node
6101 || SLP_TREE_DEF_TYPE (node) == vect_constant_def
6102 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
6103 return true;
6105 if (SLP_TREE_DEF_TYPE (node) == vect_uninitialized_def)
6107 if (dump_enabled_p ())
6108 dump_printf_loc (MSG_NOTE, vect_location,
6109 "Failed cyclic SLP reference in %p\n", (void *) node);
6110 return false;
6112 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_internal_def);
6114 /* If we already analyzed the exact same set of scalar stmts we're done.
6115 We share the generated vector stmts for those. */
6116 if (visited_set.add (node))
6117 return true;
6118 visited_vec.safe_push (node);
6120 bool res = true;
6121 unsigned visited_rec_start = visited_vec.length ();
6122 unsigned cost_vec_rec_start = cost_vec->length ();
6123 bool seen_non_constant_child = false;
6124 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6126 res = vect_slp_analyze_node_operations (vinfo, child, node_instance,
6127 visited_set, visited_vec,
6128 cost_vec);
6129 if (!res)
6130 break;
6131 if (child && SLP_TREE_DEF_TYPE (child) != vect_constant_def)
6132 seen_non_constant_child = true;
6134 /* We're having difficulties scheduling nodes with just constant
6135 operands and no scalar stmts since we then cannot compute a stmt
6136 insertion place. */
6137 if (!seen_non_constant_child && SLP_TREE_SCALAR_STMTS (node).is_empty ())
6139 if (dump_enabled_p ())
6140 dump_printf_loc (MSG_NOTE, vect_location,
6141 "Cannot vectorize all-constant op node %p\n",
6142 (void *) node);
6143 res = false;
6146 if (res)
6147 res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
6148 cost_vec);
6149 /* If analysis failed we have to pop all recursive visited nodes
6150 plus ourselves. */
6151 if (!res)
6153 while (visited_vec.length () >= visited_rec_start)
6154 visited_set.remove (visited_vec.pop ());
6155 cost_vec->truncate (cost_vec_rec_start);
6158 /* When the node can be vectorized cost invariant nodes it references.
6159 This is not done in DFS order to allow the referring node's
6160 vectorizable_* calls to nail down the invariant node's vector type
6161 and possibly unshare it if it needs a different vector type than
6162 other referrers. */
6163 if (res)
6164 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
6165 if (child
6166 && (SLP_TREE_DEF_TYPE (child) == vect_constant_def
6167 || SLP_TREE_DEF_TYPE (child) == vect_external_def)
6168 /* Perform usual caching, note code-generation still
6169 code-gens these nodes multiple times but we expect
6170 to CSE them later. */
6171 && !visited_set.add (child))
6173 visited_vec.safe_push (child);
6174 /* ??? After auditing more code paths make a "default"
6175 and push the vector type from NODE to all children
6176 if it is not already set. */
6177 /* Compute the number of vectors to be generated. */
6178 tree vector_type = SLP_TREE_VECTYPE (child);
6179 if (!vector_type)
6181 /* For shifts with a scalar argument we don't need
6182 to cost or code-generate anything.
6183 ??? Represent this more explicitly. */
6184 gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
6185 == shift_vec_info_type)
6186 && j == 1);
6187 continue;
6189 unsigned group_size = SLP_TREE_LANES (child);
6190 poly_uint64 vf = 1;
6191 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
6192 vf = loop_vinfo->vectorization_factor;
6193 SLP_TREE_NUMBER_OF_VEC_STMTS (child)
6194 = vect_get_num_vectors (vf * group_size, vector_type);
6195 /* And cost them. */
6196 vect_prologue_cost_for_slp (child, cost_vec);
6199 /* If this node or any of its children can't be vectorized, try pruning
6200 the tree here rather than felling the whole thing. */
6201 if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
6203 /* We'll need to revisit this for invariant costing and number
6204 of vectorized stmt setting. */
6205 res = true;
6208 return res;
6211 /* Mark lanes of NODE that are live outside of the basic-block vectorized
6212 region and that can be vectorized using vectorizable_live_operation
6213 with STMT_VINFO_LIVE_P. Live operations that are not handled will cause
6214 the scalar code computing them to be retained. */
6216 static void
6217 vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
6218 slp_instance instance,
6219 stmt_vector_for_cost *cost_vec,
6220 hash_set<stmt_vec_info> &svisited,
6221 hash_set<slp_tree> &visited)
6223 if (visited.add (node))
6224 return;
6226 unsigned i;
6227 stmt_vec_info stmt_info;
6228 stmt_vec_info last_stmt = vect_find_last_scalar_stmt_in_slp (node);
6229 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6231 if (svisited.contains (stmt_info))
6232 continue;
6233 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
6234 if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)
6235 && STMT_VINFO_RELATED_STMT (orig_stmt_info) != stmt_info)
6236 /* Only the pattern root stmt computes the original scalar value. */
6237 continue;
6238 bool mark_visited = true;
6239 gimple *orig_stmt = orig_stmt_info->stmt;
6240 ssa_op_iter op_iter;
6241 def_operand_p def_p;
6242 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
6244 imm_use_iterator use_iter;
6245 gimple *use_stmt;
6246 stmt_vec_info use_stmt_info;
6247 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
6248 if (!is_gimple_debug (use_stmt))
6250 use_stmt_info = bb_vinfo->lookup_stmt (use_stmt);
6251 if (!use_stmt_info
6252 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
6254 STMT_VINFO_LIVE_P (stmt_info) = true;
6255 if (vectorizable_live_operation (bb_vinfo, stmt_info,
6256 NULL, node, instance, i,
6257 false, cost_vec))
6258 /* ??? So we know we can vectorize the live stmt
6259 from one SLP node. If we cannot do so from all
6260 or none consistently we'd have to record which
6261 SLP node (and lane) we want to use for the live
6262 operation. So make sure we can code-generate
6263 from all nodes. */
6264 mark_visited = false;
6265 else
6266 STMT_VINFO_LIVE_P (stmt_info) = false;
6267 break;
6270 /* We have to verify whether we can insert the lane extract
6271 before all uses. The following is a conservative approximation.
6272 We cannot put this into vectorizable_live_operation because
6273 iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
6274 doesn't work.
6275 Note that while the fact that we emit code for loads at the
6276 first load should make this a non-problem, leafs we construct
6277 from scalars are vectorized after the last scalar def.
6278 ??? If we'd actually compute the insert location during
6279 analysis we could use something less conservative than the last
6280 scalar stmt in the node for the dominance check. */
6281 /* ??? What remains is "live" uses in vector CTORs in the same
6282 SLP graph which is where those uses can end up code-generated
6283 right after their definition instead of close to their original
6284 use. But that would restrict us to code-generate lane-extracts
6285 from the latest stmt in a node. So we compensate for this
6286 during code-generation, simply not replacing uses for those
6287 hopefully rare cases. */
6288 if (STMT_VINFO_LIVE_P (stmt_info))
6289 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
6290 if (!is_gimple_debug (use_stmt)
6291 && (!(use_stmt_info = bb_vinfo->lookup_stmt (use_stmt))
6292 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
6293 && !vect_stmt_dominates_stmt_p (last_stmt->stmt, use_stmt))
6295 if (dump_enabled_p ())
6296 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6297 "Cannot determine insertion place for "
6298 "lane extract\n");
6299 STMT_VINFO_LIVE_P (stmt_info) = false;
6300 mark_visited = true;
6303 if (mark_visited)
6304 svisited.add (stmt_info);
6307 slp_tree child;
6308 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6309 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
6310 vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance,
6311 cost_vec, svisited, visited);
6314 /* Determine whether we can vectorize the reduction epilogue for INSTANCE. */
6316 static bool
6317 vectorizable_bb_reduc_epilogue (slp_instance instance,
6318 stmt_vector_for_cost *cost_vec)
6320 gassign *stmt = as_a <gassign *> (instance->root_stmts[0]->stmt);
6321 enum tree_code reduc_code = gimple_assign_rhs_code (stmt);
6322 if (reduc_code == MINUS_EXPR)
6323 reduc_code = PLUS_EXPR;
6324 internal_fn reduc_fn;
6325 tree vectype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (instance));
6326 if (!vectype
6327 || !reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
6328 || reduc_fn == IFN_LAST
6329 || !direct_internal_fn_supported_p (reduc_fn, vectype, OPTIMIZE_FOR_BOTH)
6330 || !useless_type_conversion_p (TREE_TYPE (gimple_assign_lhs (stmt)),
6331 TREE_TYPE (vectype)))
6332 return false;
6334 /* There's no way to cost a horizontal vector reduction via REDUC_FN so
6335 cost log2 vector operations plus shuffles and one extraction. */
6336 unsigned steps = floor_log2 (vect_nunits_for_cost (vectype));
6337 record_stmt_cost (cost_vec, steps, vector_stmt, instance->root_stmts[0],
6338 vectype, 0, vect_body);
6339 record_stmt_cost (cost_vec, steps, vec_perm, instance->root_stmts[0],
6340 vectype, 0, vect_body);
6341 record_stmt_cost (cost_vec, 1, vec_to_scalar, instance->root_stmts[0],
6342 vectype, 0, vect_body);
6343 return true;
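/* Editor's note: illustrative sketch, not GCC code.  A horizontal reduction
   of N lanes needs roughly floor(log2(N)) shuffle-and-op steps plus one
   final extraction, which is why the epilogue above records that many
   vector_stmt and vec_perm costs and a single vec_to_scalar.  */
static unsigned
reduc_epilogue_steps_sketch (unsigned nunits)
{
  unsigned steps = 0;
  while (nunits > 1)
    {
      nunits /= 2;	/* Each step halves the number of live lanes.  */
      steps++;
    }
  return steps;		/* floor_log2 of the original lane count.  */
}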
6346 /* Prune from ROOTS all stmts that are computed as part of lanes of NODE
6347 and recurse to children. */
6349 static void
6350 vect_slp_prune_covered_roots (slp_tree node, hash_set<stmt_vec_info> &roots,
6351 hash_set<slp_tree> &visited)
6353 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def
6354 || visited.add (node))
6355 return;
6357 stmt_vec_info stmt;
6358 unsigned i;
6359 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
6360 roots.remove (vect_orig_stmt (stmt));
6362 slp_tree child;
6363 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6364 if (child)
6365 vect_slp_prune_covered_roots (child, roots, visited);
6368 /* Analyze statements in SLP instances of VINFO. Return true if the
6369 operations are supported. */
6371 bool
6372 vect_slp_analyze_operations (vec_info *vinfo)
6374 slp_instance instance;
6375 int i;
6377 DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
6379 hash_set<slp_tree> visited;
6380 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
6382 auto_vec<slp_tree> visited_vec;
6383 stmt_vector_for_cost cost_vec;
6384 cost_vec.create (2);
6385 if (is_a <bb_vec_info> (vinfo))
6386 vect_location = instance->location ();
6387 if (!vect_slp_analyze_node_operations (vinfo,
6388 SLP_INSTANCE_TREE (instance),
6389 instance, visited, visited_vec,
6390 &cost_vec)
6391 /* CTOR instances require vectorized defs for the SLP tree root. */
6392 || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor
6393 && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
6394 != vect_internal_def
6395 /* Make sure we vectorized with the expected type. */
6396 || !useless_type_conversion_p
6397 (TREE_TYPE (TREE_TYPE (gimple_assign_rhs1
6398 (instance->root_stmts[0]->stmt))),
6399 TREE_TYPE (SLP_TREE_VECTYPE
6400 (SLP_INSTANCE_TREE (instance))))))
6401 /* Check we can vectorize the reduction. */
6402 || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_bb_reduc
6403 && !vectorizable_bb_reduc_epilogue (instance, &cost_vec)))
6405 slp_tree node = SLP_INSTANCE_TREE (instance);
6406 stmt_vec_info stmt_info;
6407 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
6408 stmt_info = SLP_INSTANCE_ROOT_STMTS (instance)[0];
6409 else
6410 stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
6411 if (dump_enabled_p ())
6412 dump_printf_loc (MSG_NOTE, vect_location,
6413 "removing SLP instance operations starting from: %G",
6414 stmt_info->stmt);
6415 vect_free_slp_instance (instance);
6416 vinfo->slp_instances.ordered_remove (i);
6417 cost_vec.release ();
6418 while (!visited_vec.is_empty ())
6419 visited.remove (visited_vec.pop ());
6421 else
6423 i++;
6424 if (loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo))
6426 add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
6427 cost_vec.release ();
6429 else
6430 /* For BB vectorization remember the SLP graph entry
6431 cost for later. */
6432 instance->cost_vec = cost_vec;
6436 /* Now look for SLP instances with a root that are covered by other
6437 instances and remove them. */
6438 hash_set<stmt_vec_info> roots;
6439 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
6440 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
6441 roots.add (SLP_INSTANCE_ROOT_STMTS (instance)[0]);
6442 if (!roots.is_empty ())
6444 visited.empty ();
6445 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
6446 vect_slp_prune_covered_roots (SLP_INSTANCE_TREE (instance), roots,
6447 visited);
6448 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
6449 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ()
6450 && !roots.contains (SLP_INSTANCE_ROOT_STMTS (instance)[0]))
6452 stmt_vec_info root = SLP_INSTANCE_ROOT_STMTS (instance)[0];
6453 if (dump_enabled_p ())
6454 dump_printf_loc (MSG_NOTE, vect_location,
6455 "removing SLP instance operations starting "
6456 "from: %G", root->stmt);
6457 vect_free_slp_instance (instance);
6458 vinfo->slp_instances.ordered_remove (i);
6460 else
6461 ++i;
6464 /* Compute vectorizable live stmts. */
6465 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
6467 hash_set<stmt_vec_info> svisited;
6468 hash_set<slp_tree> visited;
6469 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
6471 vect_location = instance->location ();
6472 vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
6473 instance, &instance->cost_vec, svisited,
6474 visited);
6478 return !vinfo->slp_instances.is_empty ();
6481 /* Get the SLP instance leader from INSTANCE_LEADER thereby transitively
6482 closing any leader chain encountered on the way. */
6484 static slp_instance
6485 get_ultimate_leader (slp_instance instance,
6486 hash_map<slp_instance, slp_instance> &instance_leader)
6488 auto_vec<slp_instance *, 8> chain;
6489 slp_instance *tem;
6490 while (*(tem = instance_leader.get (instance)) != instance)
6492 chain.safe_push (tem);
6493 instance = *tem;
6495 while (!chain.is_empty ())
6496 *chain.pop () = instance;
6497 return instance;
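/* Editor's note: illustrative sketch, not GCC code.  get_ultimate_leader is
   the "find" half of a union-find over SLP instances, compressing the chain
   on the way out so later lookups are cheap.  Integer ids and a vector stand
   in for slp_instance and the hash_map.  */
#include <vector>

static int
find_leader_sketch (std::vector<int> &leader, int instance)
{
  int root = instance;
  while (leader[root] != root)
    root = leader[root];		/* Walk up the leader chain.  */
  while (leader[instance] != root)
    {
      int next = leader[instance];
      leader[instance] = root;		/* Path compression.  */
      instance = next;
    }
  return root;
}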
6500 namespace {
6501 /* Subroutine of vect_bb_partition_graph_r. Map KEY to INSTANCE in
6502 KEY_TO_INSTANCE, making INSTANCE the leader of any previous mapping
6503 for KEY. Return true if KEY was already in KEY_TO_INSTANCE.
6505 INSTANCE_LEADER is as for get_ultimate_leader. */
6507 template<typename T>
6508 bool
6509 vect_map_to_instance (slp_instance instance, T key,
6510 hash_map<T, slp_instance> &key_to_instance,
6511 hash_map<slp_instance, slp_instance> &instance_leader)
6513 bool existed_p;
6514 slp_instance &key_instance = key_to_instance.get_or_insert (key, &existed_p);
6515 if (!existed_p)
6517 else if (key_instance != instance)
6519 /* If we're running into a previously marked key make us the
6520 leader of the current ultimate leader. This keeps the
6521 leader chain acyclic and works even when the current instance
6522 connects two previously independent graph parts. */
6523 slp_instance key_leader
6524 = get_ultimate_leader (key_instance, instance_leader);
6525 if (key_leader != instance)
6526 instance_leader.put (key_leader, instance);
6528 key_instance = instance;
6529 return existed_p;
6533 /* Worker of vect_bb_partition_graph, recurse on NODE. */
6535 static void
6536 vect_bb_partition_graph_r (bb_vec_info bb_vinfo,
6537 slp_instance instance, slp_tree node,
6538 hash_map<stmt_vec_info, slp_instance> &stmt_to_instance,
6539 hash_map<slp_tree, slp_instance> &node_to_instance,
6540 hash_map<slp_instance, slp_instance> &instance_leader)
6542 stmt_vec_info stmt_info;
6543 unsigned i;
6545 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6546 vect_map_to_instance (instance, stmt_info, stmt_to_instance,
6547 instance_leader);
6549 if (vect_map_to_instance (instance, node, node_to_instance,
6550 instance_leader))
6551 return;
6553 slp_tree child;
6554 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6555 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
6556 vect_bb_partition_graph_r (bb_vinfo, instance, child, stmt_to_instance,
6557 node_to_instance, instance_leader);
6560 /* Partition the SLP graph into pieces that can be costed independently. */
6562 static void
6563 vect_bb_partition_graph (bb_vec_info bb_vinfo)
6565 DUMP_VECT_SCOPE ("vect_bb_partition_graph");
6567 /* First walk the SLP graph assigning each involved scalar stmt a
6568 corresponding SLP graph entry and upon visiting a previously
6569 marked stmt, make the stmt's leader the current SLP graph entry. */
6570 hash_map<stmt_vec_info, slp_instance> stmt_to_instance;
6571 hash_map<slp_tree, slp_instance> node_to_instance;
6572 hash_map<slp_instance, slp_instance> instance_leader;
6573 slp_instance instance;
6574 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
6576 instance_leader.put (instance, instance);
6577 vect_bb_partition_graph_r (bb_vinfo,
6578 instance, SLP_INSTANCE_TREE (instance),
6579 stmt_to_instance, node_to_instance,
6580 instance_leader);
6583 /* Then collect entries to each independent subgraph. */
6584 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
6586 slp_instance leader = get_ultimate_leader (instance, instance_leader);
6587 leader->subgraph_entries.safe_push (instance);
6588 if (dump_enabled_p ()
6589 && leader != instance)
6590 dump_printf_loc (MSG_NOTE, vect_location,
6591 "instance %p is leader of %p\n",
6592 (void *) leader, (void *) instance);
6596 /* Compute the set of scalar stmts participating in internal and external
6597 nodes. */
6599 static void
6600 vect_slp_gather_vectorized_scalar_stmts (vec_info *vinfo, slp_tree node,
6601 hash_set<slp_tree> &visited,
6602 hash_set<stmt_vec_info> &vstmts,
6603 hash_set<stmt_vec_info> &estmts)
6605 int i;
6606 stmt_vec_info stmt_info;
6607 slp_tree child;
6609 if (visited.add (node))
6610 return;
6612 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
6614 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6615 vstmts.add (stmt_info);
6617 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6618 if (child)
6619 vect_slp_gather_vectorized_scalar_stmts (vinfo, child, visited,
6620 vstmts, estmts);
6622 else
6623 for (tree def : SLP_TREE_SCALAR_OPS (node))
6625 stmt_vec_info def_stmt = vinfo->lookup_def (def);
6626 if (def_stmt)
6627 estmts.add (def_stmt);
6632 /* Compute the scalar cost of the SLP node NODE and its children
6633 and record it in COST_VEC. Do not account defs that are marked in LIFE and
6634 update LIFE according to uses of NODE. */
6636 static void
6637 vect_bb_slp_scalar_cost (vec_info *vinfo,
6638 slp_tree node, vec<bool, va_heap> *life,
6639 stmt_vector_for_cost *cost_vec,
6640 hash_set<stmt_vec_info> &vectorized_scalar_stmts,
6641 hash_set<slp_tree> &visited)
6643 unsigned i;
6644 stmt_vec_info stmt_info;
6645 slp_tree child;
6647 if (visited.add (node))
6648 return;
6650 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6652 ssa_op_iter op_iter;
6653 def_operand_p def_p;
6655 if ((*life)[i])
6656 continue;
6658 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
6659 gimple *orig_stmt = orig_stmt_info->stmt;
6661 /* If there is a non-vectorized use of the defs then the scalar
6662 stmt is kept live in which case we do not account it or any
6663 required defs in the SLP children in the scalar cost. This
6664 way we make the vectorization more costly when compared to
6665 the scalar cost. */
6666 if (!STMT_VINFO_LIVE_P (stmt_info))
6668 auto_vec<gimple *, 8> worklist;
6669 hash_set<gimple *> *worklist_visited = NULL;
6670 worklist.quick_push (orig_stmt);
6673 gimple *work_stmt = worklist.pop ();
6674 FOR_EACH_PHI_OR_STMT_DEF (def_p, work_stmt, op_iter, SSA_OP_DEF)
6676 imm_use_iterator use_iter;
6677 gimple *use_stmt;
6678 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter,
6679 DEF_FROM_PTR (def_p))
6680 if (!is_gimple_debug (use_stmt))
6682 stmt_vec_info use_stmt_info
6683 = vinfo->lookup_stmt (use_stmt);
6684 if (!use_stmt_info
6685 || !vectorized_scalar_stmts.contains (use_stmt_info))
6687 if (use_stmt_info
6688 && STMT_VINFO_IN_PATTERN_P (use_stmt_info))
6690 /* For stmts participating in patterns we have
6691 to check its uses recursively. */
6692 if (!worklist_visited)
6693 worklist_visited = new hash_set<gimple *> ();
6694 if (!worklist_visited->add (use_stmt))
6695 worklist.safe_push (use_stmt);
6696 continue;
6698 (*life)[i] = true;
6699 goto next_lane;
6704 while (!worklist.is_empty ());
6705 next_lane:
6706 if (worklist_visited)
6707 delete worklist_visited;
6708 if ((*life)[i])
6709 continue;
6712 /* Count scalar stmts only once. */
6713 if (gimple_visited_p (orig_stmt))
6714 continue;
6715 gimple_set_visited (orig_stmt, true);
6717 vect_cost_for_stmt kind;
6718 if (STMT_VINFO_DATA_REF (orig_stmt_info))
6720 if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info)))
6721 kind = scalar_load;
6722 else
6723 kind = scalar_store;
6725 else if (vect_nop_conversion_p (orig_stmt_info))
6726 continue;
6727 /* For single-argument PHIs assume coalescing which means zero cost
6728 for the scalar and the vector PHIs. This avoids artificially
6729 favoring the vector path (but may pessimize it in some cases). */
6730 else if (is_a <gphi *> (orig_stmt_info->stmt)
6731 && gimple_phi_num_args
6732 (as_a <gphi *> (orig_stmt_info->stmt)) == 1)
6733 continue;
6734 else
6735 kind = scalar_stmt;
6736 record_stmt_cost (cost_vec, 1, kind, orig_stmt_info,
6737 SLP_TREE_VECTYPE (node), 0, vect_body);
6740 auto_vec<bool, 20> subtree_life;
6741 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6743 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
6745 /* Do not directly pass LIFE to the recursive call, copy it to
6746 confine changes in the callee to the current child/subtree. */
6747 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
6749 subtree_life.safe_grow_cleared (SLP_TREE_LANES (child), true);
6750 for (unsigned j = 0;
6751 j < SLP_TREE_LANE_PERMUTATION (node).length (); ++j)
6753 auto perm = SLP_TREE_LANE_PERMUTATION (node)[j];
6754 if (perm.first == i)
6755 subtree_life[perm.second] = (*life)[j];
6758 else
6760 gcc_assert (SLP_TREE_LANES (node) == SLP_TREE_LANES (child));
6761 subtree_life.safe_splice (*life);
6763 vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
6764 vectorized_scalar_stmts, visited);
6765 subtree_life.truncate (0);
6770 /* Comparator for the loop-index sorted cost vectors. */
6772 static int
6773 li_cost_vec_cmp (const void *a_, const void *b_)
6775 auto *a = (const std::pair<unsigned, stmt_info_for_cost *> *)a_;
6776 auto *b = (const std::pair<unsigned, stmt_info_for_cost *> *)b_;
6777 if (a->first < b->first)
6778 return -1;
6779 else if (a->first == b->first)
6780 return 0;
6781 return 1;
6784 /* Check if vectorization of the basic block is profitable for the
6785 subgraph denoted by SLP_INSTANCES. */
6787 static bool
6788 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
6789 vec<slp_instance> slp_instances,
6790 loop_p orig_loop)
6792 slp_instance instance;
6793 int i;
6794 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
6795 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
6797 if (dump_enabled_p ())
6799 dump_printf_loc (MSG_NOTE, vect_location, "Costing subgraph: \n");
6800 hash_set<slp_tree> visited;
6801 FOR_EACH_VEC_ELT (slp_instances, i, instance)
6802 vect_print_slp_graph (MSG_NOTE, vect_location,
6803 SLP_INSTANCE_TREE (instance), visited);
6806 /* Compute the set of scalar stmts we know will go away 'locally' when
6807 vectorizing. This used to be tracked with just PURE_SLP_STMT but that's
6808 not accurate for nodes promoted extern late or for scalar stmts that
6809 are used both in extern defs and in vectorized defs. */
6810 hash_set<stmt_vec_info> vectorized_scalar_stmts;
6811 hash_set<stmt_vec_info> scalar_stmts_in_externs;
6812 hash_set<slp_tree> visited;
6813 FOR_EACH_VEC_ELT (slp_instances, i, instance)
6815 vect_slp_gather_vectorized_scalar_stmts (bb_vinfo,
6816 SLP_INSTANCE_TREE (instance),
6817 visited,
6818 vectorized_scalar_stmts,
6819 scalar_stmts_in_externs);
6820 for (stmt_vec_info rstmt : SLP_INSTANCE_ROOT_STMTS (instance))
6821 vectorized_scalar_stmts.add (rstmt);
6823 /* Scalar stmts used as defs in external nodes need to be preserved, so
6824 remove them from vectorized_scalar_stmts. */
6825 for (stmt_vec_info stmt : scalar_stmts_in_externs)
6826 vectorized_scalar_stmts.remove (stmt);
6828 /* Calculate scalar cost and sum the cost for the vector stmts
6829 previously collected. */
6830 stmt_vector_for_cost scalar_costs = vNULL;
6831 stmt_vector_for_cost vector_costs = vNULL;
6832 visited.empty ();
6833 FOR_EACH_VEC_ELT (slp_instances, i, instance)
6835 auto_vec<bool, 20> life;
6836 life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)),
6837 true);
6838 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
6839 record_stmt_cost (&scalar_costs,
6840 SLP_INSTANCE_ROOT_STMTS (instance).length (),
6841 scalar_stmt,
6842 SLP_INSTANCE_ROOT_STMTS (instance)[0], 0, vect_body);
6843 vect_bb_slp_scalar_cost (bb_vinfo,
6844 SLP_INSTANCE_TREE (instance),
6845 &life, &scalar_costs, vectorized_scalar_stmts,
6846 visited);
6847 vector_costs.safe_splice (instance->cost_vec);
6848 instance->cost_vec.release ();
6851 if (dump_enabled_p ())
6852 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
6854 /* When costing non-loop vectorization we need to consider each covered
6855 loop independently and make sure vectorization is profitable. For
6856 now we assume a loop may be not entered or executed an arbitrary
6857 number of iterations (??? static information can provide more
6858 precise info here) which means we can simply cost each containing
6859 loop's stmts separately. */
6861 /* First produce cost vectors sorted by loop index. */
6862 auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
6863 li_scalar_costs (scalar_costs.length ());
6864 auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
6865 li_vector_costs (vector_costs.length ());
6866 stmt_info_for_cost *cost;
6867 FOR_EACH_VEC_ELT (scalar_costs, i, cost)
6869 unsigned l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
6870 li_scalar_costs.quick_push (std::make_pair (l, cost));
6872 /* Use a random used loop as fallback in case the first vector_costs
6873 entry does not have a stmt_info associated with it. */
6874 unsigned l = li_scalar_costs[0].first;
6875 FOR_EACH_VEC_ELT (vector_costs, i, cost)
6877 /* We inherit the loop from the previous COST; invariants, externals and
6878 extracts immediately follow the cost for the related stmt. */
6879 if (cost->stmt_info)
6880 l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
6881 li_vector_costs.quick_push (std::make_pair (l, cost));
6883 li_scalar_costs.qsort (li_cost_vec_cmp);
6884 li_vector_costs.qsort (li_cost_vec_cmp);
6886 /* Now cost the portions individually. */
6887 unsigned vi = 0;
6888 unsigned si = 0;
6889 bool profitable = true;
6890 while (si < li_scalar_costs.length ()
6891 && vi < li_vector_costs.length ())
6893 unsigned sl = li_scalar_costs[si].first;
6894 unsigned vl = li_vector_costs[vi].first;
6895 if (sl != vl)
6897 if (dump_enabled_p ())
6898 dump_printf_loc (MSG_NOTE, vect_location,
6899 "Scalar %d and vector %d loop part do not "
6900 "match up, skipping scalar part\n", sl, vl);
6901 /* Skip the scalar part, assuming zero cost on the vector side. */
6904 si++;
6906 while (si < li_scalar_costs.length ()
6907 && li_scalar_costs[si].first == sl);
6908 continue;
6911 class vector_costs *scalar_target_cost_data = init_cost (bb_vinfo, true);
6914 add_stmt_cost (scalar_target_cost_data, li_scalar_costs[si].second);
6915 si++;
6917 while (si < li_scalar_costs.length ()
6918 && li_scalar_costs[si].first == sl);
6919 unsigned dummy;
6920 finish_cost (scalar_target_cost_data, nullptr,
6921 &dummy, &scalar_cost, &dummy);
6923 /* Complete the target-specific vector cost calculation. */
6924 class vector_costs *vect_target_cost_data = init_cost (bb_vinfo, false);
6927 add_stmt_cost (vect_target_cost_data, li_vector_costs[vi].second);
6928 vi++;
6930 while (vi < li_vector_costs.length ()
6931 && li_vector_costs[vi].first == vl);
6932 finish_cost (vect_target_cost_data, scalar_target_cost_data,
6933 &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost);
6934 delete scalar_target_cost_data;
6935 delete vect_target_cost_data;
6937 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
6939 if (dump_enabled_p ())
6941 dump_printf_loc (MSG_NOTE, vect_location,
6942 "Cost model analysis for part in loop %d:\n", sl);
6943 dump_printf (MSG_NOTE, " Vector cost: %d\n",
6944 vec_inside_cost + vec_outside_cost);
6945 dump_printf (MSG_NOTE, " Scalar cost: %d\n", scalar_cost);
6948 /* Vectorization is profitable if its cost is no more than the cost of the
6949 scalar version. Note that we err on the vector side for equal cost because
6950 the cost estimate is otherwise quite pessimistic (constant uses are
6951 free on the scalar side but cost a load on the vector side for
6952 example). */
6953 if (vec_outside_cost + vec_inside_cost > scalar_cost)
6955 profitable = false;
6956 break;
6959 if (profitable && vi < li_vector_costs.length ())
6961 if (dump_enabled_p ())
6962 dump_printf_loc (MSG_NOTE, vect_location,
6963 "Excess vector cost for part in loop %d:\n",
6964 li_vector_costs[vi].first);
6965 profitable = false;
6968 /* Unset visited flag. This is delayed when the subgraph is profitable
6969 and we process the loop for remaining unvectorized if-converted code. */
6970 if (!orig_loop || !profitable)
6971 FOR_EACH_VEC_ELT (scalar_costs, i, cost)
6972 gimple_set_visited (cost->stmt_info->stmt, false);
6974 scalar_costs.release ();
6975 vector_costs.release ();
6977 return profitable;
6980 /* qsort comparator for lane defs. */
6982 static int
6983 vld_cmp (const void *a_, const void *b_)
6985 auto *a = (const std::pair<unsigned, tree> *)a_;
6986 auto *b = (const std::pair<unsigned, tree> *)b_;
6987 return a->first - b->first;
6990 /* Return true if USE_STMT is a vector lane insert into VEC and set
6991 *THIS_LANE to the lane number that is set. */
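/* For illustration (hypothetical GIMPLE), with VEC of type V4SI the stmt
     vec_5 = BIT_INSERT_EXPR <vec_4, val_3, 64>;
   is such a lane insert: bit position 64 is two times the 32-bit element
   size, so *THIS_LANE is set to 2.  */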
6993 static bool
6994 vect_slp_is_lane_insert (gimple *use_stmt, tree vec, unsigned *this_lane)
6996 gassign *use_ass = dyn_cast <gassign *> (use_stmt);
6997 if (!use_ass
6998 || gimple_assign_rhs_code (use_ass) != BIT_INSERT_EXPR
6999 || (vec
7000 ? gimple_assign_rhs1 (use_ass) != vec
7001 : ((vec = gimple_assign_rhs1 (use_ass)), false))
7002 || !useless_type_conversion_p (TREE_TYPE (TREE_TYPE (vec)),
7003 TREE_TYPE (gimple_assign_rhs2 (use_ass)))
7004 || !constant_multiple_p
7005 (tree_to_poly_uint64 (gimple_assign_rhs3 (use_ass)),
7006 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (TREE_TYPE (vec)))),
7007 this_lane))
7008 return false;
7009 return true;
7012 /* Find any vectorizable constructors and add them to the grouped_store
7013 array. */
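/* For illustration (hypothetical GIMPLE), a statement like
     x_5 = {a_1, b_2, c_3, d_4};
   whose CONSTRUCTOR has one SSA-name element per lane of its vector type
   is recorded below as a grouped-store root.  */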
7015 static void
7016 vect_slp_check_for_constructors (bb_vec_info bb_vinfo)
7018 for (unsigned i = 0; i < bb_vinfo->bbs.length (); ++i)
7019 for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[i]);
7020 !gsi_end_p (gsi); gsi_next (&gsi))
7022 gassign *assign = dyn_cast<gassign *> (gsi_stmt (gsi));
7023 if (!assign)
7024 continue;
7026 tree rhs = gimple_assign_rhs1 (assign);
7027 enum tree_code code = gimple_assign_rhs_code (assign);
7028 use_operand_p use_p;
7029 gimple *use_stmt;
7030 if (code == CONSTRUCTOR)
7032 if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
7033 || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
7034 CONSTRUCTOR_NELTS (rhs))
7035 || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
7036 || uniform_vector_p (rhs))
7037 continue;
7039 unsigned j;
7040 tree val;
7041 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
7042 if (TREE_CODE (val) != SSA_NAME
7043 || !bb_vinfo->lookup_def (val))
7044 break;
7045 if (j != CONSTRUCTOR_NELTS (rhs))
7046 continue;
7048 stmt_vec_info stmt_info = bb_vinfo->lookup_stmt (assign);
7049 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
7051 else if (code == BIT_INSERT_EXPR
7052 && VECTOR_TYPE_P (TREE_TYPE (rhs))
7053 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).is_constant ()
7054 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).to_constant () > 1
7055 && integer_zerop (gimple_assign_rhs3 (assign))
7056 && useless_type_conversion_p
7057 (TREE_TYPE (TREE_TYPE (rhs)),
7058 TREE_TYPE (gimple_assign_rhs2 (assign)))
7059 && bb_vinfo->lookup_def (gimple_assign_rhs2 (assign)))
7061 /* We start matching at an insert to lane zero, but since the
7062 inserts need not be ordered we have to search both
7063 the def and the use chains. */
7064 tree vectype = TREE_TYPE (rhs);
7065 unsigned nlanes = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
7066 auto_vec<std::pair<unsigned, tree> > lane_defs (nlanes);
7067 auto_sbitmap lanes (nlanes);
7068 bitmap_clear (lanes);
7069 bitmap_set_bit (lanes, 0);
7070 tree def = gimple_assign_lhs (assign);
7071 lane_defs.quick_push
7072 (std::make_pair (0, gimple_assign_rhs2 (assign)));
7073 unsigned lanes_found = 1;
7074 /* Start with the use chains, the last stmt will be the root. */
7075 stmt_vec_info last = bb_vinfo->lookup_stmt (assign);
7076 vec<stmt_vec_info> roots = vNULL;
7077 roots.safe_push (last);
7080 use_operand_p use_p;
7081 gimple *use_stmt;
7082 if (!single_imm_use (def, &use_p, &use_stmt))
7083 break;
7084 unsigned this_lane;
7085 if (!bb_vinfo->lookup_stmt (use_stmt)
7086 || !vect_slp_is_lane_insert (use_stmt, def, &this_lane)
7087 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (use_stmt)))
7088 break;
7089 if (bitmap_bit_p (lanes, this_lane))
7090 break;
7091 lanes_found++;
7092 bitmap_set_bit (lanes, this_lane);
7093 gassign *use_ass = as_a <gassign *> (use_stmt);
7094 lane_defs.quick_push (std::make_pair
7095 (this_lane, gimple_assign_rhs2 (use_ass)));
7096 last = bb_vinfo->lookup_stmt (use_ass);
7097 roots.safe_push (last);
7098 def = gimple_assign_lhs (use_ass);
7100 while (lanes_found < nlanes);
7101 if (roots.length () > 1)
7102 std::swap(roots[0], roots[roots.length () - 1]);
7103 if (lanes_found < nlanes)
7105 /* Now search the def chain. */
7106 def = gimple_assign_rhs1 (assign);
7109 if (TREE_CODE (def) != SSA_NAME
7110 || !has_single_use (def))
7111 break;
7112 gimple *def_stmt = SSA_NAME_DEF_STMT (def);
7113 unsigned this_lane;
7114 if (!bb_vinfo->lookup_stmt (def_stmt)
7115 || !vect_slp_is_lane_insert (def_stmt,
7116 NULL_TREE, &this_lane)
7117 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (def_stmt)))
7118 break;
7119 if (bitmap_bit_p (lanes, this_lane))
7120 break;
7121 lanes_found++;
7122 bitmap_set_bit (lanes, this_lane);
7123 lane_defs.quick_push (std::make_pair
7124 (this_lane,
7125 gimple_assign_rhs2 (def_stmt)));
7126 roots.safe_push (bb_vinfo->lookup_stmt (def_stmt));
7127 def = gimple_assign_rhs1 (def_stmt);
7129 while (lanes_found < nlanes);
7131 if (lanes_found == nlanes)
7133 /* Sort lane_defs by the lane index and register the root. */
7134 lane_defs.qsort (vld_cmp);
7135 vec<stmt_vec_info> stmts;
7136 stmts.create (nlanes);
7137 for (unsigned i = 0; i < nlanes; ++i)
7138 stmts.quick_push (bb_vinfo->lookup_def (lane_defs[i].second));
7139 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
7140 stmts, roots));
7142 else
7143 roots.release ();
7145 else if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
7146 && (associative_tree_code (code) || code == MINUS_EXPR)
7147 /* ??? The flag_associative_math and TYPE_OVERFLOW_WRAPS
7148 checks pessimize a two-element reduction. PR54400.
7149 ??? In-order reduction could be handled if we only
7150 traverse one operand chain in vect_slp_linearize_chain. */
7151 && ((FLOAT_TYPE_P (TREE_TYPE (rhs)) && flag_associative_math)
7152 || (INTEGRAL_TYPE_P (TREE_TYPE (rhs))
7153 && TYPE_OVERFLOW_WRAPS (TREE_TYPE (rhs))))
7154 /* Ops with constants at the tail can be stripped here. */
7155 && TREE_CODE (rhs) == SSA_NAME
7156 && TREE_CODE (gimple_assign_rhs2 (assign)) == SSA_NAME
7157 /* Should be the chain end. */
7158 && (!single_imm_use (gimple_assign_lhs (assign),
7159 &use_p, &use_stmt)
7160 || !is_gimple_assign (use_stmt)
7161 || (gimple_assign_rhs_code (use_stmt) != code
7162 && ((code != PLUS_EXPR && code != MINUS_EXPR)
7163 || (gimple_assign_rhs_code (use_stmt)
7164 != (code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR))))))
7166 /* We start the match at the end of a possible association
7167 chain. */
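/* For illustration (hypothetical GIMPLE), for the chain
     _1 = a_4 + b_5;
     _2 = _1 + c_6;
     _3 = _2 + d_7;
   the match starts at the statement defining _3 and
   vect_slp_linearize_chain collects { a_4, b_5, c_6, d_7 } as the
   candidate BB reduction.  */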
7168 auto_vec<chain_op_t> chain;
7169 auto_vec<std::pair<tree_code, gimple *> > worklist;
7170 auto_vec<gimple *> chain_stmts;
7171 gimple *code_stmt = NULL, *alt_code_stmt = NULL;
7172 if (code == MINUS_EXPR)
7173 code = PLUS_EXPR;
7174 internal_fn reduc_fn;
7175 if (!reduction_fn_for_scalar_code (code, &reduc_fn)
7176 || reduc_fn == IFN_LAST)
7177 continue;
7178 vect_slp_linearize_chain (bb_vinfo, worklist, chain, code, assign,
7179 /* ??? */
7180 code_stmt, alt_code_stmt, &chain_stmts);
7181 if (chain.length () > 1)
7183 /* Sort the chain according to def_type and operation. */
7184 chain.sort (dt_sort_cmp, bb_vinfo);
7185 /* ??? Now we'd want to strip externals and constants
7186 but record those to be handled in the epilogue. */
7187 /* ??? For now do not allow mixing ops or externs/constants. */
7188 bool invalid = false;
7189 for (unsigned i = 0; i < chain.length (); ++i)
7190 if (chain[i].dt != vect_internal_def
7191 || chain[i].code != code)
7192 invalid = true;
7193 if (!invalid)
7195 vec<stmt_vec_info> stmts;
7196 stmts.create (chain.length ());
7197 for (unsigned i = 0; i < chain.length (); ++i)
7198 stmts.quick_push (bb_vinfo->lookup_def (chain[i].op));
7199 vec<stmt_vec_info> roots;
7200 roots.create (chain_stmts.length ());
7201 for (unsigned i = 0; i < chain_stmts.length (); ++i)
7202 roots.quick_push (bb_vinfo->lookup_stmt (chain_stmts[i]));
7203 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_bb_reduc,
7204 stmts, roots));
7211 /* Walk the grouped store chains and replace entries with their
7212 pattern variant if any. */
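/* For illustration, if the second store of a group was replaced by a
   pattern statement during pattern recognition, the walk below redirects
   DR_GROUP_NEXT_ELEMENT of its predecessor to the pattern variant and
   copies the group linkage and gap information over, so later analysis
   only sees the pattern statements.  */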
7214 static void
7215 vect_fixup_store_groups_with_patterns (vec_info *vinfo)
7217 stmt_vec_info first_element;
7218 unsigned i;
7220 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
7222 /* We also have CTORs in this array. */
7223 if (!STMT_VINFO_GROUPED_ACCESS (first_element))
7224 continue;
7225 if (STMT_VINFO_IN_PATTERN_P (first_element))
7227 stmt_vec_info orig = first_element;
7228 first_element = STMT_VINFO_RELATED_STMT (first_element);
7229 DR_GROUP_FIRST_ELEMENT (first_element) = first_element;
7230 DR_GROUP_SIZE (first_element) = DR_GROUP_SIZE (orig);
7231 DR_GROUP_GAP (first_element) = DR_GROUP_GAP (orig);
7232 DR_GROUP_NEXT_ELEMENT (first_element) = DR_GROUP_NEXT_ELEMENT (orig);
7233 vinfo->grouped_stores[i] = first_element;
7235 stmt_vec_info prev = first_element;
7236 while (DR_GROUP_NEXT_ELEMENT (prev))
7238 stmt_vec_info elt = DR_GROUP_NEXT_ELEMENT (prev);
7239 if (STMT_VINFO_IN_PATTERN_P (elt))
7241 stmt_vec_info orig = elt;
7242 elt = STMT_VINFO_RELATED_STMT (elt);
7243 DR_GROUP_NEXT_ELEMENT (prev) = elt;
7244 DR_GROUP_GAP (elt) = DR_GROUP_GAP (orig);
7245 DR_GROUP_NEXT_ELEMENT (elt) = DR_GROUP_NEXT_ELEMENT (orig);
7247 DR_GROUP_FIRST_ELEMENT (elt) = first_element;
7248 prev = elt;
7253 /* Check if the region described by BB_VINFO can be vectorized, returning
7254 true if so. When returning false, set FATAL to true if the same failure
7255 would prevent vectorization at other vector sizes, false if it is still
7256 worth trying other sizes. N_STMTS is the number of statements in the
7257 region. */
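/* In outline, the analysis below: analyzes the data references and their
   accesses, detects vectorizable constructors and reduction chains, runs
   pattern recognition, builds and optimizes the SLP trees, checks
   alignment and dependences per instance, analyzes the operations and
   finally partitions the SLP graph into independently costed subgraphs.  */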
7259 static bool
7260 vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
7261 vec<int> *dataref_groups)
7263 DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
7265 slp_instance instance;
7266 int i;
7267 poly_uint64 min_vf = 2;
7269 /* The first group of checks is independent of the vector size. */
7270 fatal = true;
7272 /* Analyze the data references. */
7274 if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
7276 if (dump_enabled_p ())
7277 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7278 "not vectorized: unhandled data-ref in basic "
7279 "block.\n");
7280 return false;
7283 if (!vect_analyze_data_ref_accesses (bb_vinfo, dataref_groups))
7285 if (dump_enabled_p ())
7286 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7287 "not vectorized: unhandled data access in "
7288 "basic block.\n");
7289 return false;
7292 vect_slp_check_for_constructors (bb_vinfo);
7294 /* If there are no grouped stores and no constructors in the region
7295 there is no need to continue with pattern recog as vect_analyze_slp
7296 will fail anyway. */
7297 if (bb_vinfo->grouped_stores.is_empty ()
7298 && bb_vinfo->roots.is_empty ())
7300 if (dump_enabled_p ())
7301 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7302 "not vectorized: no grouped stores in "
7303 "basic block.\n");
7304 return false;
7307 /* The rest of the analysis below depends on the vector size in some way. */
7308 fatal = false;
7310 vect_pattern_recog (bb_vinfo);
7312 /* Update store groups from pattern processing. */
7313 vect_fixup_store_groups_with_patterns (bb_vinfo);
7315 /* Check the SLP opportunities in the basic block, analyze and build SLP
7316 trees. */
7317 if (!vect_analyze_slp (bb_vinfo, n_stmts))
7319 if (dump_enabled_p ())
7321 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7322 "Failed to SLP the basic block.\n");
7323 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7324 "not vectorized: failed to find SLP opportunities "
7325 "in basic block.\n");
7327 return false;
7330 /* Optimize permutations. */
7331 vect_optimize_slp (bb_vinfo);
7333 /* Gather the loads reachable from the SLP graph entries. */
7334 vect_gather_slp_loads (bb_vinfo);
7336 vect_record_base_alignments (bb_vinfo);
7338 /* Analyze and verify the alignment of data references and the
7339 dependence in the SLP instances. */
7340 for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
7342 vect_location = instance->location ();
7343 if (! vect_slp_analyze_instance_alignment (bb_vinfo, instance)
7344 || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
7346 slp_tree node = SLP_INSTANCE_TREE (instance);
7347 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
7348 if (dump_enabled_p ())
7349 dump_printf_loc (MSG_NOTE, vect_location,
7350 "removing SLP instance operations starting from: %G",
7351 stmt_info->stmt);
7352 vect_free_slp_instance (instance);
7353 BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
7354 continue;
7357 /* Mark all the statements that we want to vectorize as pure SLP and
7358 relevant. */
7359 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
7360 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
7361 unsigned j;
7362 stmt_vec_info root;
7363 /* Likewise consider instance root stmts as vectorized. */
7364 FOR_EACH_VEC_ELT (SLP_INSTANCE_ROOT_STMTS (instance), j, root)
7365 STMT_SLP_TYPE (root) = pure_slp;
7367 i++;
7369 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
7370 return false;
7372 if (!vect_slp_analyze_operations (bb_vinfo))
7374 if (dump_enabled_p ())
7375 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7376 "not vectorized: bad operation in basic block.\n");
7377 return false;
7380 vect_bb_partition_graph (bb_vinfo);
7382 return true;
7385 /* Subroutine of vect_slp_bb. Try to vectorize the statements for all
7386 basic blocks in BBS, returning true on success.
7387 The region has N_STMTS statements and has the datarefs given by DATAREFS. */
7389 static bool
7390 vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
7391 vec<int> *dataref_groups, unsigned int n_stmts,
7392 loop_p orig_loop)
7394 bb_vec_info bb_vinfo;
7395 auto_vector_modes vector_modes;
7397 /* Autodetect first vector size we try. */
7398 machine_mode next_vector_mode = VOIDmode;
7399 targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
7400 unsigned int mode_i = 0;
7402 vec_info_shared shared;
7404 machine_mode autodetected_vector_mode = VOIDmode;
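/* Illustrative note: if the target advertises, say, 256-bit, 128-bit and
   64-bit vector modes in vector_modes, the loop below re-runs the region
   analysis with each mode in turn, skipping modes that would give the
   same result, until a mode vectorizes, the analysis fails fatally or
   the list is exhausted.  */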
7405 while (1)
7407 bool vectorized = false;
7408 bool fatal = false;
7409 bb_vinfo = new _bb_vec_info (bbs, &shared);
7411 bool first_time_p = shared.datarefs.is_empty ();
7412 BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
7413 if (first_time_p)
7414 bb_vinfo->shared->save_datarefs ();
7415 else
7416 bb_vinfo->shared->check_datarefs ();
7417 bb_vinfo->vector_mode = next_vector_mode;
7419 if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups))
7421 if (dump_enabled_p ())
7423 dump_printf_loc (MSG_NOTE, vect_location,
7424 "***** Analysis succeeded with vector mode"
7425 " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
7426 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
7429 bb_vinfo->shared->check_datarefs ();
7431 auto_vec<slp_instance> profitable_subgraphs;
7432 for (slp_instance instance : BB_VINFO_SLP_INSTANCES (bb_vinfo))
7434 if (instance->subgraph_entries.is_empty ())
7435 continue;
7437 vect_location = instance->location ();
7438 if (!unlimited_cost_model (NULL)
7439 && !vect_bb_vectorization_profitable_p
7440 (bb_vinfo, instance->subgraph_entries, orig_loop))
7442 if (dump_enabled_p ())
7443 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7444 "not vectorized: vectorization is not "
7445 "profitable.\n");
7446 continue;
7449 if (!dbg_cnt (vect_slp))
7450 continue;
7452 profitable_subgraphs.safe_push (instance);
7455 /* When we're vectorizing an if-converted loop body make sure
7456 we vectorized all if-converted code. */
7457 if (!profitable_subgraphs.is_empty ()
7458 && orig_loop)
7460 gcc_assert (bb_vinfo->bbs.length () == 1);
7461 for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[0]);
7462 !gsi_end_p (gsi); gsi_next (&gsi))
7464 /* The costing above left us with DCEable vectorized scalar
7465 stmts having the visited flag set on profitable
7466 subgraphs. Do the delayed clearing of the flag here. */
7467 if (gimple_visited_p (gsi_stmt (gsi)))
7469 gimple_set_visited (gsi_stmt (gsi), false);
7470 continue;
7472 if (flag_vect_cost_model == VECT_COST_MODEL_UNLIMITED)
7473 continue;
7475 if (gassign *ass = dyn_cast <gassign *> (gsi_stmt (gsi)))
7476 if (gimple_assign_rhs_code (ass) == COND_EXPR)
7478 if (!profitable_subgraphs.is_empty ()
7479 && dump_enabled_p ())
7480 dump_printf_loc (MSG_NOTE, vect_location,
7481 "not profitable because of "
7482 "unprofitable if-converted scalar "
7483 "code\n");
7484 profitable_subgraphs.truncate (0);
7489 /* Finally schedule the profitable subgraphs. */
7490 for (slp_instance instance : profitable_subgraphs)
7492 if (!vectorized && dump_enabled_p ())
7493 dump_printf_loc (MSG_NOTE, vect_location,
7494 "Basic block will be vectorized "
7495 "using SLP\n");
7496 vectorized = true;
7498 vect_schedule_slp (bb_vinfo, instance->subgraph_entries);
7500 unsigned HOST_WIDE_INT bytes;
7501 if (dump_enabled_p ())
7503 if (GET_MODE_SIZE
7504 (bb_vinfo->vector_mode).is_constant (&bytes))
7505 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
7506 "basic block part vectorized using %wu "
7507 "byte vectors\n", bytes);
7508 else
7509 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
7510 "basic block part vectorized using "
7511 "variable length vectors\n");
7515 else
7517 if (dump_enabled_p ())
7518 dump_printf_loc (MSG_NOTE, vect_location,
7519 "***** Analysis failed with vector mode %s\n",
7520 GET_MODE_NAME (bb_vinfo->vector_mode));
7523 if (mode_i == 0)
7524 autodetected_vector_mode = bb_vinfo->vector_mode;
7526 if (!fatal)
7527 while (mode_i < vector_modes.length ()
7528 && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
7530 if (dump_enabled_p ())
7531 dump_printf_loc (MSG_NOTE, vect_location,
7532 "***** The result for vector mode %s would"
7533 " be the same\n",
7534 GET_MODE_NAME (vector_modes[mode_i]));
7535 mode_i += 1;
7538 delete bb_vinfo;
7540 if (mode_i < vector_modes.length ()
7541 && VECTOR_MODE_P (autodetected_vector_mode)
7542 && (related_vector_mode (vector_modes[mode_i],
7543 GET_MODE_INNER (autodetected_vector_mode))
7544 == autodetected_vector_mode)
7545 && (related_vector_mode (autodetected_vector_mode,
7546 GET_MODE_INNER (vector_modes[mode_i]))
7547 == vector_modes[mode_i]))
7549 if (dump_enabled_p ())
7550 dump_printf_loc (MSG_NOTE, vect_location,
7551 "***** Skipping vector mode %s, which would"
7552 " repeat the analysis for %s\n",
7553 GET_MODE_NAME (vector_modes[mode_i]),
7554 GET_MODE_NAME (autodetected_vector_mode));
7555 mode_i += 1;
7558 if (vectorized
7559 || mode_i == vector_modes.length ()
7560 || autodetected_vector_mode == VOIDmode
7561 /* If vect_slp_analyze_bb_1 signaled that analysis for all
7562 vector sizes will fail do not bother iterating. */
7563 || fatal)
7564 return vectorized;
7566 /* Try the next biggest vector size. */
7567 next_vector_mode = vector_modes[mode_i++];
7568 if (dump_enabled_p ())
7569 dump_printf_loc (MSG_NOTE, vect_location,
7570 "***** Re-trying analysis with vector mode %s\n",
7571 GET_MODE_NAME (next_vector_mode));
7576 /* Main entry for the BB vectorizer. Analyze and transform the blocks in BBS,
7577 returning true if anything in them was vectorized. */
7579 static bool
7580 vect_slp_bbs (const vec<basic_block> &bbs, loop_p orig_loop)
7582 vec<data_reference_p> datarefs = vNULL;
7583 auto_vec<int> dataref_groups;
7584 int insns = 0;
7585 int current_group = 0;
7587 for (unsigned i = 0; i < bbs.length (); i++)
7589 basic_block bb = bbs[i];
7590 for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
7591 gsi_next (&gsi))
7593 gimple *stmt = gsi_stmt (gsi);
7594 if (is_gimple_debug (stmt))
7595 continue;
7597 insns++;
7599 if (gimple_location (stmt) != UNKNOWN_LOCATION)
7600 vect_location = stmt;
7602 if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs,
7603 &dataref_groups, current_group))
7604 ++current_group;
7606 /* New BBs always start a new DR group. */
7607 ++current_group;
7610 return vect_slp_region (bbs, datarefs, &dataref_groups, insns, orig_loop);
7613 /* Special entry for the BB vectorizer. Analyze and transform a single
7614 if-converted BB, with ORIG_LOOP's body being the non-if-converted
7615 representation. Returns true if anything in the basic block was
7616 vectorized. */
7618 bool
7619 vect_slp_if_converted_bb (basic_block bb, loop_p orig_loop)
7621 auto_vec<basic_block> bbs;
7622 bbs.safe_push (bb);
7623 return vect_slp_bbs (bbs, orig_loop);
7626 /* Main entry for the BB vectorizer. Analyze and transform the basic blocks
7627 of FUN, returning true if anything in them was vectorized. */
7629 bool
7630 vect_slp_function (function *fun)
7632 bool r = false;
7633 int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (fun));
7634 unsigned n = pre_and_rev_post_order_compute_fn (fun, NULL, rpo, false);
7636 /* For the moment split the function into pieces to avoid making
7637 the iteration on the vector mode moot. Split at points we know
7638 we do not handle well, namely CFG merges (SLP discovery doesn't
7639 handle non-loop-header PHIs) and loop exits. Since pattern
7640 recog requires reverse iteration to visit uses before defs,
7641 simply chop the RPO into pieces. */
7642 auto_vec<basic_block> bbs;
7643 for (unsigned i = 0; i < n; i++)
7645 basic_block bb = BASIC_BLOCK_FOR_FN (fun, rpo[i]);
7646 bool split = false;
7648 /* Split when a BB is not dominated by the first block. */
7649 if (!bbs.is_empty ()
7650 && !dominated_by_p (CDI_DOMINATORS, bb, bbs[0]))
7652 if (dump_enabled_p ())
7653 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7654 "splitting region at dominance boundary bb%d\n",
7655 bb->index);
7656 split = true;
7658 /* Split when the loop determined by the first block
7659 is exited. This is because we eventually insert
7660 invariants at region begin. */
7661 else if (!bbs.is_empty ()
7662 && bbs[0]->loop_father != bb->loop_father
7663 && !flow_loop_nested_p (bbs[0]->loop_father, bb->loop_father))
7665 if (dump_enabled_p ())
7666 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7667 "splitting region at loop %d exit at bb%d\n",
7668 bbs[0]->loop_father->num, bb->index);
7669 split = true;
7672 if (split && !bbs.is_empty ())
7674 r |= vect_slp_bbs (bbs, NULL);
7675 bbs.truncate (0);
7676 bbs.quick_push (bb);
7678 else
7679 bbs.safe_push (bb);
7681 /* When we have a stmt ending this block and defining a
7682 value we have to insert on edges when inserting after it for
7683 a vector containing its definition. Avoid this for now. */
7684 if (gimple *last = last_stmt (bb))
7685 if (gimple_get_lhs (last)
7686 && is_ctrl_altering_stmt (last))
7688 if (dump_enabled_p ())
7689 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7690 "splitting region at control altering "
7691 "definition %G", last);
7692 r |= vect_slp_bbs (bbs, NULL);
7693 bbs.truncate (0);
7697 if (!bbs.is_empty ())
7698 r |= vect_slp_bbs (bbs, NULL);
7700 free (rpo);
7702 return r;
7705 /* Build a variable-length vector in which the elements in ELTS are repeated
7706 to fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
7707 RESULTS and add any new instructions to SEQ.
7709 The approach we use is:
7711 (1) Find a vector mode VM with integer elements of mode IM.
7713 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
7714 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
7715 from small vectors to IM.
7717 (3) Duplicate each ELTS'[I] into a vector of mode VM.
7719 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
7720 correct byte contents.
7722 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
7724 We try to find the largest IM for which this sequence works, in order
7725 to cut down on the number of interleaves. */
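/* For illustration (hypothetical types): suppose ELTS is {a, b} with
   32-bit integer elements and VECTOR_TYPE is a variable-length VNx4SI
   vector.  With IM = DImode, step (2) view-converts {a, b} to a single
   64-bit value, step (3) duplicates it across a VNx2DI vector, and since
   the byte contents already repeat a,b,a,b,... no interleaving
   VEC_PERM_EXPRs are needed before the final VIEW_CONVERT_EXPR back to
   VNx4SI in step (5).  */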
7727 void
7728 duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
7729 const vec<tree> &elts, unsigned int nresults,
7730 vec<tree> &results)
7732 unsigned int nelts = elts.length ();
7733 tree element_type = TREE_TYPE (vector_type);
7735 /* (1) Find a vector mode VM with integer elements of mode IM. */
7736 unsigned int nvectors = 1;
7737 tree new_vector_type;
7738 tree permutes[2];
7739 if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
7740 &nvectors, &new_vector_type,
7741 permutes))
7742 gcc_unreachable ();
7744 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
7745 unsigned int partial_nelts = nelts / nvectors;
7746 tree partial_vector_type = build_vector_type (element_type, partial_nelts);
7748 tree_vector_builder partial_elts;
7749 auto_vec<tree, 32> pieces (nvectors * 2);
7750 pieces.quick_grow_cleared (nvectors * 2);
7751 for (unsigned int i = 0; i < nvectors; ++i)
7753 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
7754 ELTS' has mode IM. */
7755 partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
7756 for (unsigned int j = 0; j < partial_nelts; ++j)
7757 partial_elts.quick_push (elts[i * partial_nelts + j]);
7758 tree t = gimple_build_vector (seq, &partial_elts);
7759 t = gimple_build (seq, VIEW_CONVERT_EXPR,
7760 TREE_TYPE (new_vector_type), t);
7762 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
7763 pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
7766 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
7767 correct byte contents.
7769 Conceptually, we need to repeat the following operation log2(nvectors)
7770 times, where hi_start = nvectors / 2:
7772 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
7773 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
7775 However, if each input repeats every N elements and the VF is
7776 a multiple of N * 2, the HI result is the same as the LO result.
7777 This will be true for the first N1 iterations of the outer loop,
7778 followed by N2 iterations for which both the LO and HI results
7779 are needed. I.e.:
7781 N1 + N2 = log2(nvectors)
7783 Each "N1 iteration" doubles the number of redundant vectors and the
7784 effect of the process as a whole is to have a sequence of nvectors/2**N1
7785 vectors that repeats 2**N1 times. Rather than generate these redundant
7786 vectors, we halve the number of vectors for each N1 iteration. */
7787 unsigned int in_start = 0;
7788 unsigned int out_start = nvectors;
7789 unsigned int new_nvectors = nvectors;
7790 for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
7792 unsigned int hi_start = new_nvectors / 2;
7793 unsigned int out_i = 0;
7794 for (unsigned int in_i = 0; in_i < new_nvectors; ++in_i)
7796 if ((in_i & 1) != 0
7797 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
7798 2 * in_repeat))
7799 continue;
7801 tree output = make_ssa_name (new_vector_type);
7802 tree input1 = pieces[in_start + (in_i / 2)];
7803 tree input2 = pieces[in_start + (in_i / 2) + hi_start];
7804 gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
7805 input1, input2,
7806 permutes[in_i & 1]);
7807 gimple_seq_add_stmt (seq, stmt);
7808 pieces[out_start + out_i] = output;
7809 out_i += 1;
7811 std::swap (in_start, out_start);
7812 new_nvectors = out_i;
7815 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
7816 results.reserve (nresults);
7817 for (unsigned int i = 0; i < nresults; ++i)
7818 if (i < new_nvectors)
7819 results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
7820 pieces[in_start + i]));
7821 else
7822 results.quick_push (results[i - new_nvectors]);
7826 /* For constant and loop invariant defs in OP_NODE this function creates
7827 vector defs that will be used in the vectorized stmts and stores them
7828 to SLP_TREE_VEC_DEFS of OP_NODE. */
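/* For illustration (hypothetical values), if OP_NODE has the four constant
   operands {1, 2, 3, 4}, SLP_TREE_VECTYPE is V4SI and one vector stmt is
   needed, a single vector def { 1, 2, 3, 4 } is built and pushed to
   SLP_TREE_VEC_DEFS (OP_NODE).  */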
7830 static void
7831 vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
7833 unsigned HOST_WIDE_INT nunits;
7834 tree vec_cst;
7835 unsigned j, number_of_places_left_in_vector;
7836 tree vector_type;
7837 tree vop;
7838 int group_size = op_node->ops.length ();
7839 unsigned int vec_num, i;
7840 unsigned number_of_copies = 1;
7841 bool constant_p;
7842 gimple_seq ctor_seq = NULL;
7843 auto_vec<tree, 16> permute_results;
7845 /* We always want SLP_TREE_VECTYPE (op_node) here correctly set. */
7846 vector_type = SLP_TREE_VECTYPE (op_node);
7848 unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node);
7849 SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
7850 auto_vec<tree> voprnds (number_of_vectors);
7852 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
7853 created vectors. It is greater than 1 if unrolling is performed.
7855 For example, we have two scalar operands, s1 and s2 (e.g., group of
7856 strided accesses of size two), while NUNITS is four (i.e., four scalars
7857 of this type can be packed in a vector). The output vector will contain
7858 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
7859 will be 2).
7861 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
7862 containing the operands.
7864 For example, NUNITS is four as before, and the group size is 8
7865 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
7866 {s5, s6, s7, s8}. */
7868 /* When using duplicate_and_interleave, we just need one element for
7869 each scalar statement. */
7870 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
7871 nunits = group_size;
7873 number_of_copies = nunits * number_of_vectors / group_size;
7875 number_of_places_left_in_vector = nunits;
7876 constant_p = true;
7877 tree_vector_builder elts (vector_type, nunits, 1);
7878 elts.quick_grow (nunits);
7879 stmt_vec_info insert_after = NULL;
7880 for (j = 0; j < number_of_copies; j++)
7882 tree op;
7883 for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
7885 /* Create 'vect_ = {op0,op1,...,opn}'. */
7886 number_of_places_left_in_vector--;
7887 tree orig_op = op;
7888 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
7890 if (CONSTANT_CLASS_P (op))
7892 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
7894 /* Can't use VIEW_CONVERT_EXPR for booleans because
7895 of possibly different sizes of scalar value and
7896 vector element. */
7897 if (integer_zerop (op))
7898 op = build_int_cst (TREE_TYPE (vector_type), 0);
7899 else if (integer_onep (op))
7900 op = build_all_ones_cst (TREE_TYPE (vector_type));
7901 else
7902 gcc_unreachable ();
7904 else
7905 op = fold_unary (VIEW_CONVERT_EXPR,
7906 TREE_TYPE (vector_type), op);
7907 gcc_assert (op && CONSTANT_CLASS_P (op));
7909 else
7911 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
7912 gimple *init_stmt;
7913 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
7915 tree true_val
7916 = build_all_ones_cst (TREE_TYPE (vector_type));
7917 tree false_val
7918 = build_zero_cst (TREE_TYPE (vector_type));
7919 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
7920 init_stmt = gimple_build_assign (new_temp, COND_EXPR,
7921 op, true_val,
7922 false_val);
7924 else
7926 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
7927 op);
7928 init_stmt
7929 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
7930 op);
7932 gimple_seq_add_stmt (&ctor_seq, init_stmt);
7933 op = new_temp;
7936 elts[number_of_places_left_in_vector] = op;
7937 if (!CONSTANT_CLASS_P (op))
7938 constant_p = false;
7939 /* For BB vectorization we have to compute an insert location
7940 when a def is inside the analyzed region since we cannot
7941 simply insert at the BB start in this case. */
7942 stmt_vec_info opdef;
7943 if (TREE_CODE (orig_op) == SSA_NAME
7944 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
7945 && is_a <bb_vec_info> (vinfo)
7946 && (opdef = vinfo->lookup_def (orig_op)))
7948 if (!insert_after)
7949 insert_after = opdef;
7950 else
7951 insert_after = get_later_stmt (insert_after, opdef);
7954 if (number_of_places_left_in_vector == 0)
7956 if (constant_p
7957 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
7958 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
7959 vec_cst = gimple_build_vector (&ctor_seq, &elts);
7960 else
7962 if (permute_results.is_empty ())
7963 duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
7964 elts, number_of_vectors,
7965 permute_results);
7966 vec_cst = permute_results[number_of_vectors - j - 1];
7968 if (!gimple_seq_empty_p (ctor_seq))
7970 if (insert_after)
7972 gimple_stmt_iterator gsi;
7973 if (gimple_code (insert_after->stmt) == GIMPLE_PHI)
7975 gsi = gsi_after_labels (gimple_bb (insert_after->stmt));
7976 gsi_insert_seq_before (&gsi, ctor_seq,
7977 GSI_CONTINUE_LINKING);
7979 else if (!stmt_ends_bb_p (insert_after->stmt))
7981 gsi = gsi_for_stmt (insert_after->stmt);
7982 gsi_insert_seq_after (&gsi, ctor_seq,
7983 GSI_CONTINUE_LINKING);
7985 else
7987 /* When we want to insert after a def whose
7988 defining stmt throws, insert on the fallthru
7989 edge. */
7990 edge e = find_fallthru_edge
7991 (gimple_bb (insert_after->stmt)->succs);
7992 basic_block new_bb
7993 = gsi_insert_seq_on_edge_immediate (e, ctor_seq);
7994 gcc_assert (!new_bb);
7997 else
7998 vinfo->insert_seq_on_entry (NULL, ctor_seq);
7999 ctor_seq = NULL;
8001 voprnds.quick_push (vec_cst);
8002 insert_after = NULL;
8003 number_of_places_left_in_vector = nunits;
8004 constant_p = true;
8005 elts.new_vector (vector_type, nunits, 1);
8006 elts.quick_grow (nunits);
8011 /* Since the vectors are created in the reverse order, we should invert
8012 them. */
8013 vec_num = voprnds.length ();
8014 for (j = vec_num; j != 0; j--)
8016 vop = voprnds[j - 1];
8017 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
8020 /* In case that VF is greater than the unrolling factor needed for the SLP
8021 group of stmts, NUMBER_OF_VECTORS to be created is greater than
8022 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
8023 to replicate the vectors. */
8024 while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ())
8025 for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (i, &vop) && i < vec_num;
8026 i++)
8027 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
8030 /* Get the Ith vectorized definition from SLP_NODE. */
8032 tree
8033 vect_get_slp_vect_def (slp_tree slp_node, unsigned i)
8035 if (SLP_TREE_VEC_STMTS (slp_node).exists ())
8036 return gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[i]);
8037 else
8038 return SLP_TREE_VEC_DEFS (slp_node)[i];
8041 /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS. */
8043 void
8044 vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs)
8046 vec_defs->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
8047 if (SLP_TREE_DEF_TYPE (slp_node) == vect_internal_def)
8049 unsigned j;
8050 gimple *vec_def_stmt;
8051 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), j, vec_def_stmt)
8052 vec_defs->quick_push (gimple_get_lhs (vec_def_stmt));
8054 else
8055 vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node));
8058 /* Get N vectorized definitions for SLP_NODE. */
8060 void
8061 vect_get_slp_defs (vec_info *,
8062 slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
8064 if (n == -1U)
8065 n = SLP_TREE_CHILDREN (slp_node).length ();
8067 for (unsigned i = 0; i < n; ++i)
8069 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
8070 vec<tree> vec_defs = vNULL;
8071 vect_get_slp_defs (child, &vec_defs);
8072 vec_oprnds->quick_push (vec_defs);
8076 /* A subroutine of vect_transform_slp_perm_load with two extra arguments:
8077 - PERM gives the permutation that the caller wants to use for NODE,
8078 which might be different from SLP_LOAD_PERMUTATION.
8079 - DUMP_P controls whether the function dumps information. */
8081 static bool
8082 vect_transform_slp_perm_load_1 (vec_info *vinfo, slp_tree node,
8083 load_permutation_t &perm,
8084 const vec<tree> &dr_chain,
8085 gimple_stmt_iterator *gsi, poly_uint64 vf,
8086 bool analyze_only, bool dump_p,
8087 unsigned *n_perms, unsigned int *n_loads,
8088 bool dce_chain)
8090 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
8091 int vec_index = 0;
8092 tree vectype = SLP_TREE_VECTYPE (node);
8093 unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
8094 unsigned int mask_element;
8095 machine_mode mode;
8097 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
8098 return false;
8100 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
8102 mode = TYPE_MODE (vectype);
8103 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8105 /* Initialize the vect stmts of NODE to properly insert the generated
8106 stmts later. */
8107 if (! analyze_only)
8108 for (unsigned i = SLP_TREE_VEC_STMTS (node).length ();
8109 i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
8110 SLP_TREE_VEC_STMTS (node).quick_push (NULL);
8112 /* Generate permutation masks for every NODE. Number of masks for each NODE
8113 is equal to GROUP_SIZE.
8114 E.g., we have a group of three nodes with three loads from the same
8115 location in each node, and the vector size is 4. I.e., we have an
8116 a0b0c0a1b1c1... sequence and we need to create the following vectors:
8117 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
8118 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
8121 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
8122 The last mask is illegal since we assume two operands for permute
8123 operation, and the mask element values can't be outside that range.
8124 Hence, the last mask must be converted into {2,5,5,5}.
8125 For the first two permutations we need the first and the second input
8126 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
8127 we need the second and the third vectors: {b1,c1,a2,b2} and
8128 {c2,a3,b3,c3}. */
8130 int vect_stmts_counter = 0;
8131 unsigned int index = 0;
8132 int first_vec_index = -1;
8133 int second_vec_index = -1;
8134 bool noop_p = true;
8135 *n_perms = 0;
8137 vec_perm_builder mask;
8138 unsigned int nelts_to_build;
8139 unsigned int nvectors_per_build;
8140 unsigned int in_nlanes;
8141 bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info)
8142 && multiple_p (nunits, group_size));
8143 if (repeating_p)
8145 /* A single vector contains a whole number of copies of the node, so:
8146 (a) all permutes can use the same mask; and
8147 (b) the permutes only need a single vector input. */
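/* For illustration (hypothetical values): with a group of two loads,
   PERM = {1, 0} and an 8-lane vector, every output vector uses the single
   mask {1, 0, 3, 2, 5, 4, 7, 6}, built below from group_size patterns of
   three elements each.  */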
8148 mask.new_vector (nunits, group_size, 3);
8149 nelts_to_build = mask.encoded_nelts ();
8150 nvectors_per_build = SLP_TREE_VEC_STMTS (node).length ();
8151 in_nlanes = DR_GROUP_SIZE (stmt_info) * 3;
8153 else
8155 /* We need to construct a separate mask for each vector statement. */
8156 unsigned HOST_WIDE_INT const_nunits, const_vf;
8157 if (!nunits.is_constant (&const_nunits)
8158 || !vf.is_constant (&const_vf))
8159 return false;
8160 mask.new_vector (const_nunits, const_nunits, 1);
8161 nelts_to_build = const_vf * group_size;
8162 nvectors_per_build = 1;
8163 in_nlanes = const_vf * DR_GROUP_SIZE (stmt_info);
8165 auto_sbitmap used_in_lanes (in_nlanes);
8166 bitmap_clear (used_in_lanes);
8167 auto_bitmap used_defs;
8169 unsigned int count = mask.encoded_nelts ();
8170 mask.quick_grow (count);
8171 vec_perm_indices indices;
8173 for (unsigned int j = 0; j < nelts_to_build; j++)
8175 unsigned int iter_num = j / group_size;
8176 unsigned int stmt_num = j % group_size;
8177 unsigned int i = (iter_num * DR_GROUP_SIZE (stmt_info) + perm[stmt_num]);
8178 bitmap_set_bit (used_in_lanes, i);
8179 if (repeating_p)
8181 first_vec_index = 0;
8182 mask_element = i;
8184 else
8186 /* Enforced before the loop when !repeating_p. */
8187 unsigned int const_nunits = nunits.to_constant ();
8188 vec_index = i / const_nunits;
8189 mask_element = i % const_nunits;
8190 if (vec_index == first_vec_index
8191 || first_vec_index == -1)
8193 first_vec_index = vec_index;
8195 else if (vec_index == second_vec_index
8196 || second_vec_index == -1)
8198 second_vec_index = vec_index;
8199 mask_element += const_nunits;
8201 else
8203 if (dump_p)
8204 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8205 "permutation requires at "
8206 "least three vectors %G",
8207 stmt_info->stmt);
8208 gcc_assert (analyze_only);
8209 return false;
8212 gcc_assert (mask_element < 2 * const_nunits);
8215 if (mask_element != index)
8216 noop_p = false;
8217 mask[index++] = mask_element;
8219 if (index == count && !noop_p)
8221 indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
8222 if (!can_vec_perm_const_p (mode, mode, indices))
8224 if (dump_p)
8226 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
8227 vect_location,
8228 "unsupported vect permute { ");
8229 for (i = 0; i < count; ++i)
8231 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
8232 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
8234 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
8236 gcc_assert (analyze_only);
8237 return false;
8240 ++*n_perms;
8243 if (index == count)
8245 if (!analyze_only)
8247 tree mask_vec = NULL_TREE;
8249 if (! noop_p)
8250 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
8252 if (second_vec_index == -1)
8253 second_vec_index = first_vec_index;
8255 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
8257 /* Generate the permute statement if necessary. */
8258 tree first_vec = dr_chain[first_vec_index + ri];
8259 tree second_vec = dr_chain[second_vec_index + ri];
8260 gimple *perm_stmt;
8261 if (! noop_p)
8263 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
8264 tree perm_dest
8265 = vect_create_destination_var (gimple_assign_lhs (stmt),
8266 vectype);
8267 perm_dest = make_ssa_name (perm_dest);
8268 perm_stmt
8269 = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
8270 first_vec, second_vec,
8271 mask_vec);
8272 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
8273 gsi);
8274 if (dce_chain)
8276 bitmap_set_bit (used_defs, first_vec_index + ri);
8277 bitmap_set_bit (used_defs, second_vec_index + ri);
8280 else
8282 /* If mask was NULL_TREE generate the requested
8283 identity transform. */
8284 perm_stmt = SSA_NAME_DEF_STMT (first_vec);
8285 if (dce_chain)
8286 bitmap_set_bit (used_defs, first_vec_index + ri);
8289 /* Store the vector statement in NODE. */
8290 SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++] = perm_stmt;
8294 index = 0;
8295 first_vec_index = -1;
8296 second_vec_index = -1;
8297 noop_p = true;
8301 if (n_loads)
8303 if (repeating_p)
8304 *n_loads = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
8305 else
8307 /* Enforced above when !repeating_p. */
8308 unsigned int const_nunits = nunits.to_constant ();
8309 *n_loads = 0;
8310 bool load_seen = false;
8311 for (unsigned i = 0; i < in_nlanes; ++i)
8313 if (i % const_nunits == 0)
8315 if (load_seen)
8316 *n_loads += 1;
8317 load_seen = false;
8319 if (bitmap_bit_p (used_in_lanes, i))
8320 load_seen = true;
8322 if (load_seen)
8323 *n_loads += 1;
8327 if (dce_chain)
8328 for (unsigned i = 0; i < dr_chain.length (); ++i)
8329 if (!bitmap_bit_p (used_defs, i))
8331 gimple *stmt = SSA_NAME_DEF_STMT (dr_chain[i]);
8332 gimple_stmt_iterator rgsi = gsi_for_stmt (stmt);
8333 gsi_remove (&rgsi, true);
8334 release_defs (stmt);
8337 return true;
8340 /* Generate vector permute statements from a list of loads in DR_CHAIN.
8341 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
8342 permute statements for the SLP node NODE. Store the number of vector
8343 permute instructions in *N_PERMS and the number of vector load
8344 instructions in *N_LOADS. If DCE_CHAIN is true, remove all definitions
8345 that were not needed. */
8347 bool
8348 vect_transform_slp_perm_load (vec_info *vinfo,
8349 slp_tree node, const vec<tree> &dr_chain,
8350 gimple_stmt_iterator *gsi, poly_uint64 vf,
8351 bool analyze_only, unsigned *n_perms,
8352 unsigned int *n_loads, bool dce_chain)
8354 return vect_transform_slp_perm_load_1 (vinfo, node,
8355 SLP_TREE_LOAD_PERMUTATION (node),
8356 dr_chain, gsi, vf, analyze_only,
8357 dump_enabled_p (), n_perms, n_loads,
8358 dce_chain);
8361 /* Produce the next vector result for SLP permutation NODE by adding a vector
8362 statement at GSI. If MASK_VEC is nonnull, add:
8364 <new SSA name> = VEC_PERM_EXPR <FIRST_DEF, SECOND_DEF, MASK_VEC>
8366 otherwise add:
8368 <new SSA name> = FIRST_DEF. */
8370 static void
8371 vect_add_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
8372 slp_tree node, tree first_def, tree second_def,
8373 tree mask_vec)
8375 tree vectype = SLP_TREE_VECTYPE (node);
8377 /* ??? We SLP match existing vector element extracts but
8378 allow punning which we need to re-instantiate at uses
8379 but have no good way of explicitly representing. */
8380 if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)), TYPE_SIZE (vectype))
8381 && !types_compatible_p (TREE_TYPE (first_def), vectype))
8383 gassign *conv_stmt
8384 = gimple_build_assign (make_ssa_name (vectype),
8385 build1 (VIEW_CONVERT_EXPR, vectype, first_def));
8386 vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
8387 first_def = gimple_assign_lhs (conv_stmt);
8389 gassign *perm_stmt;
8390 tree perm_dest = make_ssa_name (vectype);
8391 if (mask_vec)
8393 if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)),
8394 TYPE_SIZE (vectype))
8395 && !types_compatible_p (TREE_TYPE (second_def), vectype))
8397 gassign *conv_stmt
8398 = gimple_build_assign (make_ssa_name (vectype),
8399 build1 (VIEW_CONVERT_EXPR,
8400 vectype, second_def));
8401 vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
8402 second_def = gimple_assign_lhs (conv_stmt);
8404 perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
8405 first_def, second_def,
8406 mask_vec);
8408 else if (!types_compatible_p (TREE_TYPE (first_def), vectype))
8410 /* For identity permutes we still need to handle the case
8411 of lowpart extracts or concats. */
8412 unsigned HOST_WIDE_INT c;
8413 auto first_def_nunits
8414 = TYPE_VECTOR_SUBPARTS (TREE_TYPE (first_def));
8415 if (known_le (TYPE_VECTOR_SUBPARTS (vectype), first_def_nunits))
8417 tree lowpart = build3 (BIT_FIELD_REF, vectype, first_def,
8418 TYPE_SIZE (vectype), bitsize_zero_node);
8419 perm_stmt = gimple_build_assign (perm_dest, lowpart);
8421 else if (constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype),
8422 first_def_nunits, &c) && c == 2)
8424 tree ctor = build_constructor_va (vectype, 2, NULL_TREE, first_def,
8425 NULL_TREE, second_def);
8426 perm_stmt = gimple_build_assign (perm_dest, ctor);
8428 else
8429 gcc_unreachable ();
8431 else
8433 /* We need a copy here in case the def was external. */
8434 perm_stmt = gimple_build_assign (perm_dest, first_def);
8436 vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
8437 /* Store the vector statement in NODE. */
8438 SLP_TREE_VEC_STMTS (node).quick_push (perm_stmt);
8441 /* Subroutine of vectorizable_slp_permutation. Check whether the target
8442 can perform permutation PERM on the (1 or 2) input nodes in CHILDREN.
8443 If GSI is nonnull, emit the permutation there.
8445 When GSI is null, the only purpose of NODE is to give properties
8446 of the result, such as the vector type and number of SLP lanes.
8447 The node does not need to be a VEC_PERM_EXPR.
8449 If the target supports the operation, return the number of individual
8450 VEC_PERM_EXPRs needed, otherwise return -1. Print information to the
8451 dump file if DUMP_P is true. */
8453 static int
8454 vectorizable_slp_permutation_1 (vec_info *vinfo, gimple_stmt_iterator *gsi,
8455 slp_tree node, lane_permutation_t &perm,
8456 vec<slp_tree> &children, bool dump_p)
8458 tree vectype = SLP_TREE_VECTYPE (node);
8460 /* ??? We currently only support all same vector input types
8461 while the SLP IL should really do a concat + select and thus accept
8462 arbitrary mismatches. */
8463 slp_tree child;
8464 unsigned i;
8465 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8466 bool repeating_p = multiple_p (nunits, SLP_TREE_LANES (node));
8467 tree op_vectype = NULL_TREE;
8468 FOR_EACH_VEC_ELT (children, i, child)
8469 if (SLP_TREE_VECTYPE (child))
8471 op_vectype = SLP_TREE_VECTYPE (child);
8472 break;
8474 if (!op_vectype)
8475 op_vectype = vectype;
8476 FOR_EACH_VEC_ELT (children, i, child)
8478 if ((SLP_TREE_DEF_TYPE (child) != vect_internal_def
8479 && !vect_maybe_update_slp_op_vectype (child, op_vectype))
8480 || !types_compatible_p (SLP_TREE_VECTYPE (child), op_vectype)
8481 || !types_compatible_p (TREE_TYPE (vectype), TREE_TYPE (op_vectype)))
8483 if (dump_p)
8484 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8485 "Unsupported vector types in lane permutation\n");
8486 return -1;
8488 if (SLP_TREE_LANES (child) != SLP_TREE_LANES (node))
8489 repeating_p = false;
8492 gcc_assert (perm.length () == SLP_TREE_LANES (node));
8493 if (dump_p)
8495 dump_printf_loc (MSG_NOTE, vect_location,
8496 "vectorizing permutation");
8497 for (unsigned i = 0; i < perm.length (); ++i)
8498 dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
8499 if (repeating_p)
8500 dump_printf (MSG_NOTE, " (repeat %d)\n", SLP_TREE_LANES (node));
8501 dump_printf (MSG_NOTE, "\n");
8504 /* REPEATING_P is true if every output vector is guaranteed to use the
8505 same permute vector. We can handle that case for both variable-length
8506 and constant-length vectors, but we only handle other cases for
8507 constant-length vectors.
8509 Set:
8511 - NPATTERNS and NELTS_PER_PATTERN to the encoding of the permute
8512 mask vector that we want to build.
8514 - NCOPIES to the number of copies of PERM that we need in order
8515 to build the necessary permute mask vectors.
8517 - NOUTPUTS_PER_MASK to the number of output vectors we want to create
8518 for each permute mask vector. This is only relevant when GSI is
8519 nonnull. */
8520 uint64_t npatterns;
8521 unsigned nelts_per_pattern;
8522 uint64_t ncopies;
8523 unsigned noutputs_per_mask;
8524 if (repeating_p)
8526 /* We need a single permute mask vector that has the form:
8528 { X1, ..., Xn, X1 + n, ..., Xn + n, X1 + 2n, ..., Xn + 2n, ... }
8530 In other words, the original n-element permute in PERM is
8531 "unrolled" to fill a full vector. The stepped vector encoding
8532 that we use for permutes requires 3n elements. */
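/* For illustration (hypothetical values): a two-lane node whose permute
   swaps the lanes of a single child gives n = 2 and the unrolled mask
   { 1, 0, 3, 2, 5, 4, ... }, encoded below as npatterns = 2 with
   nelts_per_pattern = 3.  */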
8533 npatterns = SLP_TREE_LANES (node);
8534 nelts_per_pattern = ncopies = 3;
8535 noutputs_per_mask = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
8537 else
8539 /* Calculate every element of every permute mask vector explicitly,
8540 instead of relying on the pattern described above. */
8541 if (!nunits.is_constant (&npatterns))
8542 return -1;
8543 nelts_per_pattern = ncopies = 1;
8544 if (loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo))
8545 if (!LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&ncopies))
8546 return -1;
8547 noutputs_per_mask = 1;
8549 unsigned olanes = ncopies * SLP_TREE_LANES (node);
8550 gcc_assert (repeating_p || multiple_p (olanes, nunits));
8552 /* Compute the { { SLP operand, vector index}, lane } permutation sequence
8553 from the { SLP operand, scalar lane } permutation as recorded in the
8554 SLP node as an intermediate step. This part should already work
8555 with SLP children with an arbitrary number of lanes. */
8556 auto_vec<std::pair<std::pair<unsigned, unsigned>, unsigned> > vperm;
8557 auto_vec<unsigned> active_lane;
8558 vperm.create (olanes);
8559 active_lane.safe_grow_cleared (children.length (), true);
8560 for (unsigned i = 0; i < ncopies; ++i)
8562 for (unsigned pi = 0; pi < perm.length (); ++pi)
8564 std::pair<unsigned, unsigned> p = perm[pi];
8565 tree vtype = SLP_TREE_VECTYPE (children[p.first]);
8566 if (repeating_p)
8567 vperm.quick_push ({{p.first, 0}, p.second + active_lane[p.first]});
8568 else
8570 /* We checked above that the vectors are constant-length. */
8571 unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype).to_constant ();
8572 unsigned vi = (active_lane[p.first] + p.second) / vnunits;
8573 unsigned vl = (active_lane[p.first] + p.second) % vnunits;
8574 vperm.quick_push ({{p.first, vi}, vl});
8577 /* Advance to the next group. */
8578 for (unsigned j = 0; j < children.length (); ++j)
8579 active_lane[j] += SLP_TREE_LANES (children[j]);
8582 if (dump_p)
8584 dump_printf_loc (MSG_NOTE, vect_location,
8585 "vectorizing permutation");
8586 for (unsigned i = 0; i < perm.length (); ++i)
8587 dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
8588 if (repeating_p)
8589 dump_printf (MSG_NOTE, " (repeat %d)\n", SLP_TREE_LANES (node));
8590 dump_printf (MSG_NOTE, "\n");
8591 dump_printf_loc (MSG_NOTE, vect_location, "as");
8592 for (unsigned i = 0; i < vperm.length (); ++i)
8594 if (i != 0
8595 && (repeating_p
8596 ? multiple_p (i, npatterns)
8597 : multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype))))
8598 dump_printf (MSG_NOTE, ",");
8599 dump_printf (MSG_NOTE, " vops%u[%u][%u]",
8600 vperm[i].first.first, vperm[i].first.second,
8601 vperm[i].second);
8603 dump_printf (MSG_NOTE, "\n");
8606 /* We can only handle two-vector permutes, everything else should
8607 be lowered on the SLP level. The following is closely inspired
8608 by vect_transform_slp_perm_load and is supposed to eventually
8609 replace it.
8610 ??? As intermediate step do code-gen in the SLP tree representation
8611 somehow? */
8612 std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U);
8613 std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U);
8614 unsigned int index = 0;
8615 poly_uint64 mask_element;
8616 vec_perm_builder mask;
8617 mask.new_vector (nunits, npatterns, nelts_per_pattern);
8618 unsigned int count = mask.encoded_nelts ();
8619 mask.quick_grow (count);
8620 vec_perm_indices indices;
8621 unsigned nperms = 0;
8622 for (unsigned i = 0; i < vperm.length (); ++i)
8624 mask_element = vperm[i].second;
8625 if (first_vec.first == -1U
8626 || first_vec == vperm[i].first)
8627 first_vec = vperm[i].first;
8628 else if (second_vec.first == -1U
8629 || second_vec == vperm[i].first)
8631 second_vec = vperm[i].first;
8632 mask_element += nunits;
8634 else
8636 if (dump_p)
8637 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8638 "permutation requires at "
8639 "least three vectors\n");
8640 gcc_assert (!gsi);
8641 return -1;
8644 mask[index++] = mask_element;
8646 if (index == count)
8648 indices.new_vector (mask, second_vec.first == -1U ? 1 : 2,
8649 TYPE_VECTOR_SUBPARTS (op_vectype));
8650 bool identity_p = indices.series_p (0, 1, 0, 1);
8651 machine_mode vmode = TYPE_MODE (vectype);
8652 machine_mode op_vmode = TYPE_MODE (op_vectype);
8653 unsigned HOST_WIDE_INT c;
8654 if ((!identity_p
8655 && !can_vec_perm_const_p (vmode, op_vmode, indices))
8656 || (identity_p
8657 && !known_le (nunits,
8658 TYPE_VECTOR_SUBPARTS (op_vectype))
8659 && (!constant_multiple_p (nunits,
8660 TYPE_VECTOR_SUBPARTS (op_vectype),
8661 &c) || c != 2)))
8663 if (dump_p)
8665 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
8666 vect_location,
8667 "unsupported vect permute { ");
8668 for (i = 0; i < count; ++i)
8670 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
8671 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
8673 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
8675 gcc_assert (!gsi);
8676 return -1;
8679 if (!identity_p)
8680 nperms++;
8681 if (gsi)
8683 if (second_vec.first == -1U)
8684 second_vec = first_vec;
8686 slp_tree
8687 first_node = children[first_vec.first],
8688 second_node = children[second_vec.first];
8690 tree mask_vec = NULL_TREE;
8691 if (!identity_p)
8692 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
8694 for (unsigned int vi = 0; vi < noutputs_per_mask; ++vi)
8696 tree first_def
8697 = vect_get_slp_vect_def (first_node,
8698 first_vec.second + vi);
8699 tree second_def
8700 = vect_get_slp_vect_def (second_node,
8701 second_vec.second + vi);
8702 vect_add_slp_permutation (vinfo, gsi, node, first_def,
8703 second_def, mask_vec);
8707 index = 0;
8708 first_vec = std::make_pair (-1U, -1U);
8709 second_vec = std::make_pair (-1U, -1U);
8713 return nperms;
8716 /* Vectorize the SLP permutations in NODE as specified
8717 in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
8718 child number and lane number.
8719 Interleaving of two two-lane two-child SLP subtrees (not supported):
8720 [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
8721 A blend of two four-lane two-child SLP subtrees:
8722 [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
8723 Highpart of a four-lane one-child SLP subtree (not supported):
8724 [ { 0, 2 }, { 0, 3 } ]
8725 Currently only a subset of these is supported by the code generation below. */
8727 static bool
8728 vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
8729 slp_tree node, stmt_vector_for_cost *cost_vec)
8731 tree vectype = SLP_TREE_VECTYPE (node);
8732 lane_permutation_t &perm = SLP_TREE_LANE_PERMUTATION (node);
8733 int nperms = vectorizable_slp_permutation_1 (vinfo, gsi, node, perm,
8734 SLP_TREE_CHILDREN (node),
8735 dump_enabled_p ());
8736 if (nperms < 0)
8737 return false;
8739 if (!gsi)
8740 record_stmt_cost (cost_vec, nperms, vec_perm, node, vectype, 0, vect_body);
8742 return true;
8745 /* Vectorize SLP NODE. */
8747 static void
8748 vect_schedule_slp_node (vec_info *vinfo,
8749 slp_tree node, slp_instance instance)
8751 gimple_stmt_iterator si;
8752 int i;
8753 slp_tree child;
8755 /* For existing vectors there's nothing to do. */
8756 if (SLP_TREE_VEC_DEFS (node).exists ())
8757 return;
8759 gcc_assert (SLP_TREE_VEC_STMTS (node).is_empty ());
8761 /* Vectorize externals and constants. */
8762 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
8763 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
8765 /* ??? vectorizable_shift can end up using a scalar operand which is
8766 currently denoted as !SLP_TREE_VECTYPE. No need to vectorize the
8767 node in this case. */
8768 if (!SLP_TREE_VECTYPE (node))
8769 return;
8771 vect_create_constant_vectors (vinfo, node);
8772 return;
8775 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
8777 gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
8778 SLP_TREE_VEC_STMTS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
8780 if (dump_enabled_p ())
8781 dump_printf_loc (MSG_NOTE, vect_location,
8782 "------>vectorizing SLP node starting from: %G",
8783 stmt_info->stmt);
8785 if (STMT_VINFO_DATA_REF (stmt_info)
8786 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
8788 /* Vectorized loads go before the first scalar load to make the
8789 result available early; vectorized stores go before the last scalar
8790 stmt, which is where all the operands they use are ready. */
8791 stmt_vec_info last_stmt_info = NULL;
8792 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
8793 last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
8794 else /* DR_IS_WRITE */
8795 last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
8796 si = gsi_for_stmt (last_stmt_info->stmt);
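/* Illustration (editorial sketch with made-up scalar code): for the
   load group
     x0 = a[0];  x1 = a[1];  x2 = a[2];  x3 = a[3];
   the vector load is inserted before the "x0 = a[0]" statement so its
   result is available to every scalar use being replaced; for a store
   group the vector store is inserted before the last scalar store,
   the first point at which all vectorized operands are available.  */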
8798 else if ((STMT_VINFO_TYPE (stmt_info) == cycle_phi_info_type
8799 || STMT_VINFO_TYPE (stmt_info) == induc_vec_info_type
8800 || STMT_VINFO_TYPE (stmt_info) == phi_info_type)
8801 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
8803 /* For PHI node vectorization we do not use the insertion iterator. */
8804 si = gsi_none ();
8806 else
8808 /* Emit other stmts right after their children's vectorized defs,
8809 which is the earliest possible insertion point. */
8810 gimple *last_stmt = NULL;
8811 bool seen_vector_def = false;
8812 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
8813 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
8815 /* For fold-left reductions we are retaining the scalar
8816 reduction PHI but we still have SLP_TREE_NUMBER_OF_VEC_STMTS
8817 set, so the representation isn't perfect. Resort to the
8818 last scalar def here. */
8819 if (SLP_TREE_VEC_STMTS (child).is_empty ())
8821 gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child))
8822 == cycle_phi_info_type);
8823 gphi *phi = as_a <gphi *>
8824 (vect_find_last_scalar_stmt_in_slp (child)->stmt);
8825 if (!last_stmt
8826 || vect_stmt_dominates_stmt_p (last_stmt, phi))
8827 last_stmt = phi;
8829 /* We are emitting all vectorized stmts in the same place, so
8830 the last one emitted is also the last in dominance order.
8831 ??? Unless a load permutation is applied that ends up
8832 re-using an earlier generated load. */
8833 unsigned j;
8834 gimple *vstmt;
8835 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (child), j, vstmt)
8836 if (!last_stmt
8837 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
8838 last_stmt = vstmt;
8840 else if (!SLP_TREE_VECTYPE (child))
8842 /* For externals without a vector type we look at the unvectorized scalar defs. */
8843 unsigned j;
8844 tree def;
8845 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child), j, def)
8846 if (TREE_CODE (def) == SSA_NAME
8847 && !SSA_NAME_IS_DEFAULT_DEF (def))
8849 gimple *stmt = SSA_NAME_DEF_STMT (def);
8850 if (!last_stmt
8851 || vect_stmt_dominates_stmt_p (last_stmt, stmt))
8852 last_stmt = stmt;
8855 else
8857 /* For externals we have to look at all defs since their
8858 insertion place is decided per vector. But beware
8859 of pre-existing vectors where we need to make sure
8860 we do not insert before the region boundary. */
8861 if (SLP_TREE_SCALAR_OPS (child).is_empty ()
8862 && !vinfo->lookup_def (SLP_TREE_VEC_DEFS (child)[0]))
8863 seen_vector_def = true;
8864 else
8866 unsigned j;
8867 tree vdef;
8868 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
8869 if (TREE_CODE (vdef) == SSA_NAME
8870 && !SSA_NAME_IS_DEFAULT_DEF (vdef))
8872 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
8873 if (!last_stmt
8874 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
8875 last_stmt = vstmt;
8879 /* This can happen when all children are pre-existing vectors or
8880 constants. */
8881 if (!last_stmt)
8882 last_stmt = vect_find_first_scalar_stmt_in_slp (node)->stmt;
8883 if (!last_stmt)
8885 gcc_assert (seen_vector_def);
8886 si = gsi_after_labels (as_a <bb_vec_info> (vinfo)->bbs[0]);
8888 else if (is_ctrl_altering_stmt (last_stmt))
8890 /* We split regions to vectorize at control altering stmts
8891 with a definition so this must be an external which
8892 we can insert at the start of the region. */
8893 si = gsi_after_labels (as_a <bb_vec_info> (vinfo)->bbs[0]);
8895 else if (is_a <bb_vec_info> (vinfo)
8896 && gimple_bb (last_stmt) != gimple_bb (stmt_info->stmt)
8897 && gimple_could_trap_p (stmt_info->stmt))
8899 /* We've constrained possibly trapping operations to all come
8900 from the same basic-block; even if vectorized defs would allow
8901 earlier scheduling, still force the vectorized stmts into the
8902 original block. This is only necessary for BB vectorization,
8903 since for loop vect all operations are in a single BB and scalar
8904 stmt based placement doesn't play well with epilogue vectorization. */
8905 gcc_assert (dominated_by_p (CDI_DOMINATORS,
8906 gimple_bb (stmt_info->stmt),
8907 gimple_bb (last_stmt)));
8908 si = gsi_after_labels (gimple_bb (stmt_info->stmt));
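/* Example (editorial sketch): two possibly trapping divisions grouped
   from the same basic block, say
     t0 = x0 / y0;  t1 = x1 / y1;
   are emitted as a single vector division in that same block even when
   their vectorized operands are defined in a dominating block, so the
   vector division cannot trap on a path where the scalar divisions
   would not have executed.  */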
8910 else if (is_a <gphi *> (last_stmt))
8911 si = gsi_after_labels (gimple_bb (last_stmt));
8912 else
8914 si = gsi_for_stmt (last_stmt);
8915 gsi_next (&si);
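/* Placement sketch (illustrative only): if the node has two internal
   children whose vector defs are
     vd1 = ...;   in bb2
     vd2 = ...;   in bb3, where bb2 dominates bb3
   then LAST_STMT ends up being the definition of vd2 and the new
   vector statement is inserted immediately after it, the earliest
   point at which all child defs are available.  */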
8919 /* Handle purely internal nodes. */
8920 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
8922 /* ??? The transform kind is stored in STMT_VINFO_TYPE which might
8923 be shared between different SLP nodes (though it is usually the same
8924 operation, except when the stmt is only there to denote
8925 the actual scalar lane defs ...). So do not call vect_transform_stmt
8926 but open-code it here (partly). */
8927 bool done = vectorizable_slp_permutation (vinfo, &si, node, NULL);
8928 gcc_assert (done);
8929 stmt_vec_info slp_stmt_info;
8930 unsigned int i;
8931 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info)
8932 if (STMT_VINFO_LIVE_P (slp_stmt_info))
8934 done = vectorizable_live_operation (vinfo,
8935 slp_stmt_info, &si, node,
8936 instance, i, true, NULL);
8937 gcc_assert (done);
8940 else
8941 vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
8944 /* Replace scalar calls from SLP node NODE by setting their lhs to zero.
8945 For loop vectorization this is done in vectorizable_call, but for SLP
8946 it needs to be deferred until the end of vect_schedule_slp, because
8947 multiple SLP instances may refer to the same scalar stmt. */
8949 static void
8950 vect_remove_slp_scalar_calls (vec_info *vinfo,
8951 slp_tree node, hash_set<slp_tree> &visited)
8953 gimple *new_stmt;
8954 gimple_stmt_iterator gsi;
8955 int i;
8956 slp_tree child;
8957 tree lhs;
8958 stmt_vec_info stmt_info;
8960 if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
8961 return;
8963 if (visited.add (node))
8964 return;
8966 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
8967 vect_remove_slp_scalar_calls (vinfo, child, visited);
8969 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
8971 gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
8972 if (!stmt || gimple_bb (stmt) == NULL)
8973 continue;
8974 if (is_pattern_stmt_p (stmt_info)
8975 || !PURE_SLP_STMT (stmt_info))
8976 continue;
8977 lhs = gimple_call_lhs (stmt);
8978 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
8979 gsi = gsi_for_stmt (stmt);
8980 vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
8981 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
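/* Illustration (editorial sketch with made-up GIMPLE): a pure-SLP
   scalar call such as
     _5 = sqrtf (x_3);
   is rewritten to
     _5 = 0.0;
   once the vectorized call provides the lane values; the zero
   assignment merely keeps the IL consistent and is trivially dead
   afterwards.  */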
8985 static void
8986 vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
8988 hash_set<slp_tree> visited;
8989 vect_remove_slp_scalar_calls (vinfo, node, visited);
8992 /* Vectorize the instance root. */
8994 void
8995 vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
8997 gassign *rstmt = NULL;
8999 if (instance->kind == slp_inst_kind_ctor)
9001 if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1)
9003 gimple *child_stmt = SLP_TREE_VEC_STMTS (node)[0];
9004 tree vect_lhs = gimple_get_lhs (child_stmt);
9005 tree root_lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
9006 if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
9007 TREE_TYPE (vect_lhs)))
9008 vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
9009 vect_lhs);
9010 rstmt = gimple_build_assign (root_lhs, vect_lhs);
9012 else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1)
9014 int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
9015 gimple *child_stmt;
9016 int j;
9017 vec<constructor_elt, va_gc> *v;
9018 vec_alloc (v, nelts);
9020 /* A CTOR can handle V16HI composition from VNx8HI so we
9021 do not need to convert vector elements if the types
9022 do not match. */
9023 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt)
9024 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE,
9025 gimple_get_lhs (child_stmt));
9026 tree lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
9027 tree rtype
9028 = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmts[0]->stmt));
9029 tree r_constructor = build_constructor (rtype, v);
9030 rstmt = gimple_build_assign (lhs, r_constructor);
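/* Sketch of the multi-vector constructor case (the types are
   illustrative assumptions): if the root constructor built a V8SI
   value and the SLP node produced two V4SI defs vd0 and vd1, the root
   statement is replaced by
     lhs = { vd0, vd1 };
   i.e. a CONSTRUCTOR whose elements are the vectorized defs.  */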
9033 else if (instance->kind == slp_inst_kind_bb_reduc)
9035 /* Largely inspired by reduction chain epilogue handling in
9036 vect_create_epilog_for_reduction. */
9037 vec<tree> vec_defs = vNULL;
9038 vect_get_slp_defs (node, &vec_defs);
9039 enum tree_code reduc_code
9040 = gimple_assign_rhs_code (instance->root_stmts[0]->stmt);
9041 /* ??? We actually have to reflect signs somewhere. */
9042 if (reduc_code == MINUS_EXPR)
9043 reduc_code = PLUS_EXPR;
9044 gimple_seq epilogue = NULL;
9045 /* We may end up with more than one vector result, reduce them
9046 to one vector. */
9047 tree vec_def = vec_defs[0];
9048 for (unsigned i = 1; i < vec_defs.length (); ++i)
9049 vec_def = gimple_build (&epilogue, reduc_code, TREE_TYPE (vec_def),
9050 vec_def, vec_defs[i]);
9051 vec_defs.release ();
9052 /* ??? Support other schemes than direct internal fn. */
9053 internal_fn reduc_fn;
9054 if (!reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
9055 || reduc_fn == IFN_LAST)
9056 gcc_unreachable ();
9057 tree scalar_def = gimple_build (&epilogue, as_combined_fn (reduc_fn),
9058 TREE_TYPE (TREE_TYPE (vec_def)), vec_def);
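/* Worked example (editorial sketch): for a PLUS_EXPR reduction over
   two V4SI defs vd0 and vd1 the epilogue built above is
     tmp_6 = vd0 + vd1;
     res_7 = .REDUC_PLUS (tmp_6);
   and res_7 then replaces the rhs of the original root statement.  */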
9060 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
9061 gsi_insert_seq_before (&rgsi, epilogue, GSI_SAME_STMT);
9062 gimple_assign_set_rhs_from_tree (&rgsi, scalar_def);
9063 update_stmt (gsi_stmt (rgsi));
9064 return;
9066 else
9067 gcc_unreachable ();
9069 gcc_assert (rstmt);
9071 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
9072 gsi_replace (&rgsi, rstmt, true);
9075 struct slp_scc_info
9077 bool on_stack;
9078 int dfs;
9079 int lowlink;
9082 /* Schedule the SLP INSTANCE doing a DFS walk and collecting SCCs. */
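/* The walk below is a Tarjan-style SCC computation over the SLP graph.
   A minimal standalone sketch in plain C++ (not GCC code; unvisited
   nodes are assumed to start with dfs == -1) that mirrors the
   dfs/lowlink/on_stack bookkeeping:

     #include <vector>
     #include <algorithm>

     struct scc_info { bool on_stack; int dfs; int lowlink; };

     static void
     tarjan (const std::vector<std::vector<int>> &graph, int v,
             std::vector<scc_info> &info, std::vector<int> &stack,
             int &maxdfs, std::vector<std::vector<int>> &sccs)
     {
       info[v] = { true, maxdfs, maxdfs };
       ++maxdfs;
       stack.push_back (v);
       for (int succ : graph[v])
         if (info[succ].dfs == -1)
           {
             tarjan (graph, succ, info, stack, maxdfs, sccs);
             info[v].lowlink = std::min (info[v].lowlink,
                                         info[succ].lowlink);
           }
         else if (info[succ].on_stack)
           info[v].lowlink = std::min (info[v].lowlink, info[succ].dfs);
       if (info[v].lowlink != info[v].dfs)
         return;
       std::vector<int> scc;   // v is the root of an SCC; pop it.
       int u;
       do
         {
           u = stack.back ();
           stack.pop_back ();
           info[u].on_stack = false;
           scc.push_back (u);
         }
       while (u != v);
       sccs.push_back (scc);
     }

   vect_schedule_scc follows the same pattern but schedules leafs and
   singletons immediately and breaks SCCs at ready PHIs instead of
   popping them wholesale.  */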
9084 static void
9085 vect_schedule_scc (vec_info *vinfo, slp_tree node, slp_instance instance,
9086 hash_map<slp_tree, slp_scc_info> &scc_info,
9087 int &maxdfs, vec<slp_tree> &stack)
9089 bool existed_p;
9090 slp_scc_info *info = &scc_info.get_or_insert (node, &existed_p);
9091 gcc_assert (!existed_p);
9092 info->dfs = maxdfs;
9093 info->lowlink = maxdfs;
9094 maxdfs++;
9096 /* Leaf. */
9097 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
9099 info->on_stack = false;
9100 vect_schedule_slp_node (vinfo, node, instance);
9101 return;
9104 info->on_stack = true;
9105 stack.safe_push (node);
9107 unsigned i;
9108 slp_tree child;
9109 /* DFS recurse. */
9110 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
9112 if (!child)
9113 continue;
9114 slp_scc_info *child_info = scc_info.get (child);
9115 if (!child_info)
9117 vect_schedule_scc (vinfo, child, instance, scc_info, maxdfs, stack);
9119 /* Recursion might have re-allocated the scc_info hash map, so re-fetch the entries. */
9119 info = scc_info.get (node);
9120 child_info = scc_info.get (child);
9121 info->lowlink = MIN (info->lowlink, child_info->lowlink);
9123 else if (child_info->on_stack)
9124 info->lowlink = MIN (info->lowlink, child_info->dfs);
9126 if (info->lowlink != info->dfs)
9127 return;
9129 auto_vec<slp_tree, 4> phis_to_fixup;
9131 /* Singleton. */
9132 if (stack.last () == node)
9134 stack.pop ();
9135 info->on_stack = false;
9136 vect_schedule_slp_node (vinfo, node, instance);
9137 if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
9138 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (node)->stmt))
9139 phis_to_fixup.quick_push (node);
9141 else
9143 /* SCC. */
9144 int last_idx = stack.length () - 1;
9145 while (stack[last_idx] != node)
9146 last_idx--;
9147 /* We can break the cycle at PHIs that have at least one child
9148 code generated. Then we could re-start the DFS walk until
9149 all nodes in the SCC are covered (we might have new entries
9150 for only back-reachable nodes). But it's simpler to just
9151 iterate and schedule those that are ready. */
9152 unsigned todo = stack.length () - last_idx;
9155 for (int idx = stack.length () - 1; idx >= last_idx; --idx)
9157 slp_tree entry = stack[idx];
9158 if (!entry)
9159 continue;
9160 bool phi = (SLP_TREE_CODE (entry) != VEC_PERM_EXPR
9161 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (entry)->stmt));
9162 bool ready = !phi;
9163 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry), i, child)
9164 if (!child)
9166 gcc_assert (phi);
9167 ready = true;
9168 break;
9170 else if (scc_info.get (child)->on_stack)
9172 if (!phi)
9174 ready = false;
9175 break;
9178 else
9180 if (phi)
9182 ready = true;
9183 break;
9186 if (ready)
9188 vect_schedule_slp_node (vinfo, entry, instance);
9189 scc_info.get (entry)->on_stack = false;
9190 stack[idx] = NULL;
9191 todo--;
9192 if (phi)
9193 phis_to_fixup.safe_push (entry);
9197 while (todo != 0);
9199 /* Pop the SCC. */
9200 stack.truncate (last_idx);
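/* Example of the iteration above (editorial sketch): for a reduction
   cycle  PHI -> add -> PHI  both nodes form one SCC.  The PHI is
   "ready" because it has a child that is not on the stack (its
   preheader def), so it is scheduled first and taken off the stack,
   which in turn makes the add ready on the next iteration.  The PHI's
   backedge argument is filled in afterwards by the fixup loop
   below.  */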
9203 /* Now fixup the backedge def of the vectorized PHIs in this SCC. */
9204 slp_tree phi_node;
9205 FOR_EACH_VEC_ELT (phis_to_fixup, i, phi_node)
9207 gphi *phi = as_a <gphi *> (SLP_TREE_REPRESENTATIVE (phi_node)->stmt);
9208 edge_iterator ei;
9209 edge e;
9210 FOR_EACH_EDGE (e, ei, gimple_bb (phi)->preds)
9212 unsigned dest_idx = e->dest_idx;
9213 child = SLP_TREE_CHILDREN (phi_node)[dest_idx];
9214 if (!child || SLP_TREE_DEF_TYPE (child) != vect_internal_def)
9215 continue;
9216 unsigned n = SLP_TREE_VEC_STMTS (phi_node).length ();
9217 /* Simply fill all args. */
9218 if (STMT_VINFO_DEF_TYPE (SLP_TREE_REPRESENTATIVE (phi_node))
9219 != vect_first_order_recurrence)
9220 for (unsigned i = 0; i < n; ++i)
9221 add_phi_arg (as_a <gphi *> (SLP_TREE_VEC_STMTS (phi_node)[i]),
9222 vect_get_slp_vect_def (child, i),
9223 e, gimple_phi_arg_location (phi, dest_idx));
9224 else
9226 /* Unless it is a first order recurrence which needs
9227 args filled in for both the PHI node and the permutes. */
9228 gimple *perm = SLP_TREE_VEC_STMTS (phi_node)[0];
9229 gimple *rphi = SSA_NAME_DEF_STMT (gimple_assign_rhs1 (perm));
9230 add_phi_arg (as_a <gphi *> (rphi),
9231 vect_get_slp_vect_def (child, n - 1),
9232 e, gimple_phi_arg_location (phi, dest_idx));
9233 for (unsigned i = 0; i < n; ++i)
9235 gimple *perm = SLP_TREE_VEC_STMTS (phi_node)[i];
9236 if (i > 0)
9237 gimple_assign_set_rhs1 (perm,
9238 vect_get_slp_vect_def (child, i - 1));
9239 gimple_assign_set_rhs2 (perm,
9240 vect_get_slp_vect_def (child, i));
9241 update_stmt (perm);
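/* Backedge fixup sketch (editorial; the names are made up): for an
   ordinary reduction PHI with two vector copies the loop above simply
   adds the latch arguments, e.g.
     vphi_0 = PHI <vinit_0 (preheader), vdef_0 (latch)>
     vphi_1 = PHI <vinit_1 (preheader), vdef_1 (latch)>
   For a first-order recurrence the latch argument of the single
   vectorized PHI is the last vector def, and each permute statement i
   is updated so that its first input is vector def i-1 and its second
   input is vector def i, e.g.
     perm_1 = VEC_PERM_EXPR <vdef_0, vdef_1, sel>;
   while perm_0 keeps the PHI result as its first input.  */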
9248 /* Generate vector code for SLP_INSTANCES in the loop/basic block. */
9250 void
9251 vect_schedule_slp (vec_info *vinfo, const vec<slp_instance> &slp_instances)
9253 slp_instance instance;
9254 unsigned int i;
9256 hash_map<slp_tree, slp_scc_info> scc_info;
9257 int maxdfs = 0;
9258 FOR_EACH_VEC_ELT (slp_instances, i, instance)
9260 slp_tree node = SLP_INSTANCE_TREE (instance);
9261 if (dump_enabled_p ())
9263 dump_printf_loc (MSG_NOTE, vect_location,
9264 "Vectorizing SLP tree:\n");
9265 /* ??? Dump all? */
9266 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
9267 dump_printf_loc (MSG_NOTE, vect_location, "Root stmt: %G",
9268 SLP_INSTANCE_ROOT_STMTS (instance)[0]->stmt);
9269 vect_print_slp_graph (MSG_NOTE, vect_location,
9270 SLP_INSTANCE_TREE (instance));
9272 /* Schedule the tree of INSTANCE, scheduling SCCs in a way to
9273 have a PHI be the node breaking the cycle. */
9274 auto_vec<slp_tree> stack;
9275 if (!scc_info.get (node))
9276 vect_schedule_scc (vinfo, node, instance, scc_info, maxdfs, stack);
9278 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
9279 vectorize_slp_instance_root_stmt (node, instance);
9281 if (dump_enabled_p ())
9282 dump_printf_loc (MSG_NOTE, vect_location,
9283 "vectorizing stmts using SLP.\n");
9286 FOR_EACH_VEC_ELT (slp_instances, i, instance)
9288 slp_tree root = SLP_INSTANCE_TREE (instance);
9289 stmt_vec_info store_info;
9290 unsigned int j;
9292 /* Remove scalar call stmts. Do not do this for basic-block
9293 vectorization as not all uses may be vectorized.
9294 ??? Why should this be necessary? DCE should be able to
9295 remove the stmts itself.
9296 ??? For BB vectorization we can as well remove scalar
9297 stmts starting from the SLP tree root if they have no
9298 uses. */
9299 if (is_a <loop_vec_info> (vinfo))
9300 vect_remove_slp_scalar_calls (vinfo, root);
9302 /* Remove the vectorized stores' original scalar stmts. */
9303 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
9305 if (!STMT_VINFO_DATA_REF (store_info)
9306 || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info)))
9307 break;
9309 store_info = vect_orig_stmt (store_info);
9310 /* Free the attached stmt_vec_info and remove the stmt. */
9311 vinfo->remove_stmt (store_info);
9313 /* Invalidate SLP_TREE_REPRESENTATIVE in case we released it,
9314 so that vect_free_slp_tree does not crash on it later. */
9315 if (SLP_TREE_REPRESENTATIVE (root) == store_info)
9316 SLP_TREE_REPRESENTATIVE (root) = NULL;