/* SLP - Basic Block Vectorization
   Copyright (C) 2007-2023 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
#define INCLUDE_ALGORITHM
#include "coretypes.h"
#include "tree-pass.h"
#include "optabs-tree.h"
#include "insn-config.h"
#include "recog.h"   /* FIXME: for insn_data */
#include "fold-const.h"
#include "stor-layout.h"
#include "gimple-iterator.h"
#include "tree-vectorizer.h"
#include "langhooks.h"
#include "gimple-walk.h"
#include "tree-vector-builder.h"
#include "vec-perm-indices.h"
#include "gimple-fold.h"
#include "internal-fn.h"
#include "dump-context.h"
#include "alloc-pool.h"
static bool vect_transform_slp_perm_load_1 (vec_info *, slp_tree,
                                            gimple_stmt_iterator *,
                                            poly_uint64, bool, bool,
static int vectorizable_slp_permutation_1 (vec_info *, gimple_stmt_iterator *,
                                           slp_tree, lane_permutation_t &,
                                           vec<slp_tree> &, bool);
static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
                                          slp_tree, stmt_vector_for_cost *);
static void vect_print_slp_tree (dump_flags_t, dump_location_t, slp_tree);
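
/* Object allocator used for all SLP nodes, plus the head of a list linking
   every currently allocated node (maintained by the _slp_tree constructor
   and destructor below) so that remaining nodes can be freed in one pass.  */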
static object_allocator<_slp_tree> *slp_tree_pool;
static slp_tree slp_first_node;

  slp_tree_pool = new object_allocator<_slp_tree> ("SLP nodes");

  while (slp_first_node)
    delete slp_first_node;
_slp_tree::operator new (size_t n)
  gcc_assert (n == sizeof (_slp_tree));
  return slp_tree_pool->allocate_raw ();

_slp_tree::operator delete (void *node, size_t n)
  gcc_assert (n == sizeof (_slp_tree));
  slp_tree_pool->remove_raw (node);
/* Initialize a SLP node.  */

_slp_tree::_slp_tree ()
  this->prev_node = NULL;
    slp_first_node->prev_node = this;
  this->next_node = slp_first_node;
  slp_first_node = this;
  SLP_TREE_SCALAR_STMTS (this) = vNULL;
  SLP_TREE_SCALAR_OPS (this) = vNULL;
  SLP_TREE_VEC_DEFS (this) = vNULL;
  SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
  SLP_TREE_CHILDREN (this) = vNULL;
  SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
  SLP_TREE_LANE_PERMUTATION (this) = vNULL;
  SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
  SLP_TREE_CODE (this) = ERROR_MARK;
  SLP_TREE_VECTYPE (this) = NULL_TREE;
  SLP_TREE_REPRESENTATIVE (this) = NULL;
  SLP_TREE_REF_COUNT (this) = 1;
  this->max_nunits = 1;
/* Tear down a SLP node.  */

_slp_tree::~_slp_tree ()
    this->prev_node->next_node = this->next_node;
    slp_first_node = this->next_node;
    this->next_node->prev_node = this->prev_node;
  SLP_TREE_CHILDREN (this).release ();
  SLP_TREE_SCALAR_STMTS (this).release ();
  SLP_TREE_SCALAR_OPS (this).release ();
  SLP_TREE_VEC_DEFS (this).release ();
  SLP_TREE_LOAD_PERMUTATION (this).release ();
  SLP_TREE_LANE_PERMUTATION (this).release ();
/* Push the single SSA definition in DEF to the vector of vector defs.  */

_slp_tree::push_vec_def (gimple *def)
  if (gphi *phi = dyn_cast <gphi *> (def))
    vec_defs.quick_push (gimple_phi_result (phi));
      def_operand_p defop = single_ssa_def_operand (def, SSA_OP_ALL_DEFS);
      vec_defs.quick_push (get_def_from_ptr (defop));
/* Recursively free the memory allocated for the SLP tree rooted at NODE.  */

vect_free_slp_tree (slp_tree node)
  if (--SLP_TREE_REF_COUNT (node) != 0)

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_free_slp_tree (child);

  /* If the node defines any SLP only patterns then those patterns are no
     longer valid and should be removed.  */
  stmt_vec_info rep_stmt_info = SLP_TREE_REPRESENTATIVE (node);
  if (rep_stmt_info && STMT_VINFO_SLP_VECT_ONLY_PATTERN (rep_stmt_info))
      stmt_vec_info stmt_info = vect_orig_stmt (rep_stmt_info);
      STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
      STMT_SLP_TYPE (stmt_info) = STMT_SLP_TYPE (rep_stmt_info);
/* Return a location suitable for dumpings related to the SLP instance.  */

_slp_instance::location () const
  if (!root_stmts.is_empty ())
    return root_stmts[0]->stmt;
    return SLP_TREE_SCALAR_STMTS (root)[0]->stmt;
/* Free the memory allocated for the SLP instance.  */

vect_free_slp_instance (slp_instance instance)
  vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
  SLP_INSTANCE_LOADS (instance).release ();
  SLP_INSTANCE_ROOT_STMTS (instance).release ();
  SLP_INSTANCE_REMAIN_DEFS (instance).release ();
  instance->subgraph_entries.release ();
  instance->cost_vec.release ();
/* Create an SLP node for SCALAR_STMTS.  */

vect_create_new_slp_node (unsigned nops, tree_code code)
  slp_tree node = new _slp_tree;
  SLP_TREE_SCALAR_STMTS (node) = vNULL;
  SLP_TREE_CHILDREN (node).create (nops);
  SLP_TREE_DEF_TYPE (node) = vect_internal_def;
  SLP_TREE_CODE (node) = code;
/* Create an SLP node for SCALAR_STMTS.  */

vect_create_new_slp_node (slp_tree node,
                          vec<stmt_vec_info> scalar_stmts, unsigned nops)
  SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
  SLP_TREE_CHILDREN (node).create (nops);
  SLP_TREE_DEF_TYPE (node) = vect_internal_def;
  SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
  SLP_TREE_LANES (node) = scalar_stmts.length ();
/* Create an SLP node for SCALAR_STMTS.  */

vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
  return vect_create_new_slp_node (new _slp_tree, scalar_stmts, nops);
/* Create an SLP node for OPS.  */

vect_create_new_slp_node (slp_tree node, vec<tree> ops)
  SLP_TREE_SCALAR_OPS (node) = ops;
  SLP_TREE_DEF_TYPE (node) = vect_external_def;
  SLP_TREE_LANES (node) = ops.length ();
/* Create an SLP node for OPS.  */

vect_create_new_slp_node (vec<tree> ops)
  return vect_create_new_slp_node (new _slp_tree, ops);
/* This structure is used in creation of an SLP tree.  Each instance
   corresponds to the same operand in a group of scalar stmts in an SLP
   node.  */
typedef struct _slp_oprnd_info
  /* Def-stmts for the operands.  */
  vec<stmt_vec_info> def_stmts;

  /* Information about the first statement, its vector def-type, type, the
     operand itself in case it's constant, and an indication if it's a pattern
     stmt.  */
  enum vect_def_type first_dt;
/* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
   operand.  */

static vec<slp_oprnd_info>
vect_create_oprnd_info (int nops, int group_size)
  slp_oprnd_info oprnd_info;
  vec<slp_oprnd_info> oprnds_info;

  oprnds_info.create (nops);
  for (i = 0; i < nops; i++)
      oprnd_info = XNEW (struct _slp_oprnd_info);
      oprnd_info->def_stmts.create (group_size);
      oprnd_info->ops.create (group_size);
      oprnd_info->first_dt = vect_uninitialized_def;
      oprnd_info->first_op_type = NULL_TREE;
      oprnd_info->any_pattern = false;
      oprnds_info.quick_push (oprnd_info);
/* Free operands info.  */

vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
  slp_oprnd_info oprnd_info;

  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
      oprnd_info->def_stmts.release ();
      oprnd_info->ops.release ();
      XDELETE (oprnd_info);

  oprnds_info.release ();
/* Return the execution frequency of NODE (so that a higher value indicates
   a "more important" node when optimizing for speed).  */

vect_slp_node_weight (slp_tree node)
  stmt_vec_info stmt_info = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (node));
  basic_block bb = gimple_bb (stmt_info->stmt);
  return bb->count.to_sreal_scale (ENTRY_BLOCK_PTR_FOR_FN (cfun)->count);
/* Return true if STMTS contains a pattern statement.  */

vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
  stmt_vec_info stmt_info;

  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    if (is_pattern_stmt_p (stmt_info))
/* Return true when all lanes in the external or constant NODE have
   the same value.  */

vect_slp_tree_uniform_p (slp_tree node)
  gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
              || SLP_TREE_DEF_TYPE (node) == vect_external_def);

  /* Pre-existing vectors.  */
  if (SLP_TREE_SCALAR_OPS (node).is_empty ())

  tree op, first = NULL_TREE;
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
      else if (!operand_equal_p (first, op, 0))
/* Find the place of the data-ref in STMT_INFO in the interleaving chain
   that starts from FIRST_STMT_INFO.  Return -1 if the data-ref is not a part

vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
                                      stmt_vec_info first_stmt_info)
  stmt_vec_info next_stmt_info = first_stmt_info;

  if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))

      if (next_stmt_info == stmt_info)
      next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
        result += DR_GROUP_GAP (next_stmt_info);
  while (next_stmt_info);
/* Check whether it is possible to load COUNT elements of type ELT_TYPE
   using the method implemented by duplicate_and_interleave.  Return true
   if so, returning the number of intermediate vectors in *NVECTORS_OUT
   (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT

can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
                                tree elt_type, unsigned int *nvectors_out,
                                tree *vector_type_out,
  tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
  if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))

  machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
  poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
  unsigned int nvectors = 1;

      scalar_int_mode int_mode;
      poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
      if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
          /* Get the natural vector type for this SLP group size.  */
          tree int_type = build_nonstandard_integer_type
            (GET_MODE_BITSIZE (int_mode), 1);
            = get_vectype_for_scalar_type (vinfo, int_type, count);
          poly_int64 half_nelts;
              && VECTOR_MODE_P (TYPE_MODE (vector_type))
              && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
                           GET_MODE_SIZE (base_vector_mode))
              && multiple_p (GET_MODE_NUNITS (TYPE_MODE (vector_type)),
              /* Try fusing consecutive sequences of COUNT / NVECTORS elements
                 together into elements of type INT_TYPE and using the result
                 to build NVECTORS vectors.  */
              poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
              vec_perm_builder sel1 (nelts, 2, 3);
              vec_perm_builder sel2 (nelts, 2, 3);

              for (unsigned int i = 0; i < 3; ++i)
                  sel1.quick_push (i + nelts);
                  sel2.quick_push (half_nelts + i);
                  sel2.quick_push (half_nelts + i + nelts);
              vec_perm_indices indices1 (sel1, 2, nelts);
              vec_perm_indices indices2 (sel2, 2, nelts);
              machine_mode vmode = TYPE_MODE (vector_type);
              if (can_vec_perm_const_p (vmode, vmode, indices1)
                  && can_vec_perm_const_p (vmode, vmode, indices2))
                      *nvectors_out = nvectors;
                      *vector_type_out = vector_type;
                      permutes[0] = vect_gen_perm_mask_checked (vector_type,
                      permutes[1] = vect_gen_perm_mask_checked (vector_type,

      if (!multiple_p (elt_bytes, 2, &elt_bytes))
/* Return true if DTA and DTB match.  */

vect_def_types_match (enum vect_def_type dta, enum vect_def_type dtb)
          || ((dta == vect_external_def || dta == vect_constant_def)
              && (dtb == vect_external_def || dtb == vect_constant_def)));
static const int cond_expr_maps[3][5] = {
static const int arg1_map[] = { 1, 1 };
static const int arg2_map[] = { 1, 2 };
static const int arg1_arg4_map[] = { 2, 1, 4 };
static const int arg3_arg2_map[] = { 2, 3, 2 };
static const int op1_op0_map[] = { 2, 1, 0 };
/* For most SLP statements, there is a one-to-one mapping between
   gimple arguments and child nodes.  If that is not true for STMT,
   return an array that contains:

   - the number of child nodes, followed by
   - for each child node, the index of the argument associated with that node.
     The special index -1 is the first operand of an embedded comparison and
     the special index -2 is the second operand of an embedded comparison.

   SWAP is as for vect_get_and_check_slp_defs.  */
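
/* For example, arg1_arg4_map above, returned for IFN_MASK_GATHER_LOAD below,
   encodes two child nodes taken from call arguments 1 and 4.  */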
vect_get_operand_map (const gimple *stmt, unsigned char swap = 0)
  if (auto assign = dyn_cast <const gassign *> (stmt))
      if (gimple_assign_rhs_code (assign) == COND_EXPR
          && COMPARISON_CLASS_P (gimple_assign_rhs1 (assign)))
        return cond_expr_maps[swap];
      if (TREE_CODE_CLASS (gimple_assign_rhs_code (assign)) == tcc_comparison
  if (auto call = dyn_cast <const gcall *> (stmt))
      if (gimple_call_internal_p (call))
        switch (gimple_call_internal_fn (call))
            case IFN_GATHER_LOAD:
            case IFN_MASK_GATHER_LOAD:
              return arg1_arg4_map;
              return arg3_arg2_map;

/* Return the SLP node child index for operand OP of STMT.  */

vect_slp_child_index_for_operand (const gimple *stmt, int op)
  const int *opmap = vect_get_operand_map (stmt);
  for (int i = 1; i < 1 + opmap[0]; ++i)
/* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
   they are of a valid type and that they match the defs of the first stmt of
   the SLP group (stored in OPRNDS_INFO).  This function tries to match stmts
   by swapping operands of STMTS[STMT_NUM] when possible.  Non-zero SWAP
   indicates swap is required for cond_expr stmts.  Specifically, SWAP
   is 1 if STMT is cond and operands of comparison need to be swapped;
   SWAP is 2 if STMT is cond and code of comparison needs to be inverted.

   If there was a fatal error return -1; if the error could be corrected by
   swapping operands of father node of this one, return 1; if everything is

vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap,
                             vec<stmt_vec_info> stmts, unsigned stmt_num,
                             vec<slp_oprnd_info> *oprnds_info)
  stmt_vec_info stmt_info = stmts[stmt_num];
  unsigned int i, number_of_oprnds;
  enum vect_def_type dt = vect_uninitialized_def;
  slp_oprnd_info oprnd_info;
  unsigned int commutative_op = -1U;
  bool first = stmt_num == 0;

  if (!is_a <gcall *> (stmt_info->stmt)
      && !is_a <gassign *> (stmt_info->stmt)
      && !is_a <gphi *> (stmt_info->stmt))

  number_of_oprnds = gimple_num_args (stmt_info->stmt);
  const int *map = vect_get_operand_map (stmt_info->stmt, swap);
    number_of_oprnds = *map++;
  if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
      if (gimple_call_internal_p (stmt))
          internal_fn ifn = gimple_call_internal_fn (stmt);
          commutative_op = first_commutative_argument (ifn);
  else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
      if (commutative_tree_code (gimple_assign_rhs_code (stmt)))

  bool swapped = (swap != 0);
  bool backedge = false;
  enum vect_def_type *dts = XALLOCAVEC (enum vect_def_type, number_of_oprnds);
  for (i = 0; i < number_of_oprnds; i++)
      int opno = map ? map[i] : int (i);
        oprnd = TREE_OPERAND (gimple_arg (stmt_info->stmt, 0), -1 - opno);
          oprnd = gimple_arg (stmt_info->stmt, opno);
          if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
              edge e = gimple_phi_arg_edge (stmt, opno);
              backedge = (is_a <bb_vec_info> (vinfo)
                          ? e->flags & EDGE_DFS_BACK
                          : dominated_by_p (CDI_DOMINATORS, e->src,
                                            gimple_bb (stmt_info->stmt)));
      if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
        oprnd = TREE_OPERAND (oprnd, 0);

      oprnd_info = (*oprnds_info)[i];

      stmt_vec_info def_stmt_info;
      if (!vect_is_simple_use (oprnd, vinfo, &dts[i], &def_stmt_info))
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "Build SLP failed: can't analyze def for %T\n",

          oprnd_info->def_stmts.quick_push (NULL);
          oprnd_info->ops.quick_push (NULL_TREE);
          oprnd_info->first_dt = vect_uninitialized_def;

      oprnd_info->def_stmts.quick_push (def_stmt_info);
      oprnd_info->ops.quick_push (oprnd);

          && is_pattern_stmt_p (def_stmt_info))
          if (STMT_VINFO_RELATED_STMT (vect_orig_stmt (def_stmt_info))
            oprnd_info->any_pattern = true;
              /* If we promote this to external use the original stmt def.  */
              oprnd_info->ops.last ()
                = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);

      /* If there's an extern def on a backedge make sure we can
         code-generate at the region start.
         ??? This is another case that could be fixed by adjusting
         how we split the function but at the moment we'd have conflicting
          && dts[i] == vect_external_def
          && is_a <bb_vec_info> (vinfo)
          && TREE_CODE (oprnd) == SSA_NAME
          && !SSA_NAME_IS_DEFAULT_DEF (oprnd)
          && !dominated_by_p (CDI_DOMINATORS,
                              as_a <bb_vec_info> (vinfo)->bbs[0],
                              gimple_bb (SSA_NAME_DEF_STMT (oprnd))))
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "Build SLP failed: extern def %T only defined "
                             "on backedge\n", oprnd);

      tree type = TREE_TYPE (oprnd);

      if ((dt == vect_constant_def
           || dt == vect_external_def)
          && !GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
          && (TREE_CODE (type) == BOOLEAN_TYPE
              || !can_duplicate_and_interleave_p (vinfo, stmts.length (),
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "Build SLP failed: invalid type of def "
                             "for variable-length SLP %T\n", oprnd);

      /* For the swapping logic below force vect_reduction_def
         for the reduction op in a SLP reduction group.  */
      if (!STMT_VINFO_DATA_REF (stmt_info)
          && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
          && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
        dts[i] = dt = vect_reduction_def;

      /* Check the types of the definition.  */
        case vect_external_def:
        case vect_constant_def:
        case vect_internal_def:
        case vect_reduction_def:
        case vect_induction_def:
        case vect_nested_cycle:
        case vect_first_order_recurrence:

          /* FORNOW: Not supported.  */
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "Build SLP failed: illegal type of def %T\n",

          oprnd_info->first_dt = dt;
          oprnd_info->first_op_type = type;
  /* Now match the operand definition types to that of the first stmt.  */
  for (i = 0; i < number_of_oprnds;)

      oprnd_info = (*oprnds_info)[i];

      stmt_vec_info def_stmt_info = oprnd_info->def_stmts[stmt_num];
      oprnd = oprnd_info->ops[stmt_num];
      tree type = TREE_TYPE (oprnd);

      if (!types_compatible_p (oprnd_info->first_op_type, type))
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "Build SLP failed: different operand types\n");

      /* Not first stmt of the group, check that the def-stmt/s match
         the def-stmt/s of the first stmt.  Allow different definition
         types for reduction chains: the first stmt must be a
         vect_reduction_def (a phi node), and the rest
         end in the reduction chain.  */
      if ((!vect_def_types_match (oprnd_info->first_dt, dt)
           && !(oprnd_info->first_dt == vect_reduction_def
                && !STMT_VINFO_DATA_REF (stmt_info)
                && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
                && !STMT_VINFO_DATA_REF (def_stmt_info)
                && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
                    == REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
          || (!STMT_VINFO_DATA_REF (stmt_info)
              && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
                  || STMT_VINFO_DATA_REF (def_stmt_info)
                  || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
                      != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
                 != (oprnd_info->first_dt != vect_reduction_def))))
          /* Try swapping operands if we got a mismatch.  For BB
             vectorization only in case it will clearly improve things.  */
          if (i == commutative_op && !swapped
              && (!is_a <bb_vec_info> (vinfo)
                  || (!vect_def_types_match ((*oprnds_info)[i+1]->first_dt,
                      && (vect_def_types_match (oprnd_info->first_dt, dts[i+1])
                          || vect_def_types_match
                               ((*oprnds_info)[i+1]->first_dt, dts[i])))))
              if (dump_enabled_p ())
                dump_printf_loc (MSG_NOTE, vect_location,
                                 "trying swapped operands\n");
              std::swap (dts[i], dts[i+1]);
              std::swap ((*oprnds_info)[i]->def_stmts[stmt_num],
                         (*oprnds_info)[i+1]->def_stmts[stmt_num]);
              std::swap ((*oprnds_info)[i]->ops[stmt_num],
                         (*oprnds_info)[i+1]->ops[stmt_num]);

          if (is_a <bb_vec_info> (vinfo)
              && !oprnd_info->any_pattern)
              /* Now for commutative ops we should see whether we can
                 make the other operand matching.  */
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "treating operand as external\n");
              oprnd_info->first_dt = dt = vect_external_def;

              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: different types\n");

      /* Make sure to demote the overall operand to external.  */
      if (dt == vect_external_def)
        oprnd_info->first_dt = vect_external_def;
      /* For a SLP reduction chain we want to duplicate the reduction to
         each of the chain members.  That gets us a sane SLP graph (still
         the stmts are not 100% correct wrt the initial values).  */
      else if ((dt == vect_internal_def
                || dt == vect_reduction_def)
               && oprnd_info->first_dt == vect_reduction_def
               && !STMT_VINFO_DATA_REF (stmt_info)
               && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
               && !STMT_VINFO_DATA_REF (def_stmt_info)
               && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
                   == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
          oprnd_info->def_stmts[stmt_num] = oprnd_info->def_stmts[0];
          oprnd_info->ops[stmt_num] = oprnd_info->ops[0];

      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "swapped operands to match def types in %G",
/* Return true if call statements CALL1 and CALL2 are similar enough
   to be combined into the same SLP group.  */

compatible_calls_p (gcall *call1, gcall *call2)
  unsigned int nargs = gimple_call_num_args (call1);
  if (nargs != gimple_call_num_args (call2))

  if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))

  if (gimple_call_internal_p (call1))
      if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
                               TREE_TYPE (gimple_call_lhs (call2))))
      for (unsigned int i = 0; i < nargs; ++i)
        if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
                                 TREE_TYPE (gimple_call_arg (call2, i))))
      if (!operand_equal_p (gimple_call_fn (call1),
                            gimple_call_fn (call2), 0))
      if (gimple_call_fntype (call1) != gimple_call_fntype (call2))

  /* Check that any unvectorized arguments are equal.  */
  if (const int *map = vect_get_operand_map (call1))
      unsigned int nkept = *map++;
      unsigned int mapi = 0;
      for (unsigned int i = 0; i < nargs; ++i)
        if (mapi < nkept && map[mapi] == int (i))
        else if (!operand_equal_p (gimple_call_arg (call1, i),
                                   gimple_call_arg (call2, i)))
/* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
   caller's attempt to find the vector type in STMT_INFO with the narrowest
   element type.  Return true if VECTYPE is nonnull and if it is valid
   for STMT_INFO.  When returning true, update MAX_NUNITS to reflect the
   number of units in VECTYPE.  GROUP_SIZE and MAX_NUNITS are as for
   vect_build_slp_tree.  */

vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
                        unsigned int group_size,
                        tree vectype, poly_uint64 *max_nunits)
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "Build SLP failed: unsupported data-type in %G\n",
      /* Fatal mismatch.  */

  /* If populating the vector type requires unrolling then fail
     before adjusting *max_nunits for basic-block vectorization.  */
  if (is_a <bb_vec_info> (vinfo)
      && !multiple_p (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "Build SLP failed: unrolling required "
                         "in basic block SLP\n");
      /* Fatal mismatch.  */

  /* In case of multiple types we need to detect the smallest type.  */
  vect_update_max_nunits (max_nunits, vectype);
/* Verify if the scalar stmts STMTS are isomorphic, require data
   permutation or are of unsupported types of operation.  Return
   true if they are, otherwise return false and indicate in *MATCHES
   which stmts are not isomorphic to the first one.  If MATCHES[0]
   is false then this indicates the comparison could not be
   carried out or the stmts will never be vectorized by SLP.

   Note COND_EXPR is possibly isomorphic to another one after swapping its
   operands.  Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
   the first stmt by swapping the two operands of comparison; set SWAP[i]
   to 2 if stmt I is isomorphic to the first stmt by inverting the code
   of comparison.  Take A1 >= B1 ? X1 : Y1 as an example, it can be swapped
   to (B1 <= A1 ? X1 : Y1); or be inverted to (A1 < B1) ? Y1 : X1.  */

vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
                       vec<stmt_vec_info> stmts, unsigned int group_size,
                       poly_uint64 *max_nunits, bool *matches,
                       bool *two_operators, tree *node_vectype)
  stmt_vec_info first_stmt_info = stmts[0];
  code_helper first_stmt_code = ERROR_MARK;
  code_helper alt_stmt_code = ERROR_MARK;
  code_helper rhs_code = ERROR_MARK;
  code_helper first_cond_code = ERROR_MARK;
  bool need_same_oprnds = false;
  tree vectype = NULL_TREE, first_op1 = NULL_TREE;
  stmt_vec_info first_load = NULL, prev_first_load = NULL;
  bool first_stmt_ldst_p = false, ldst_p = false;
  bool first_stmt_phi_p = false, phi_p = false;
  bool maybe_soft_fail = false;
  tree soft_fail_nunits_vectype = NULL_TREE;
  /* For every stmt in NODE find its def stmt/s.  */
  stmt_vec_info stmt_info;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
      gimple *stmt = stmt_info->stmt;
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);

      /* Fail to vectorize statements marked as unvectorizable, throw
      if (!STMT_VINFO_VECTORIZABLE (stmt_info)
          || stmt_can_throw_internal (cfun, stmt)
          || gimple_has_volatile_ops (stmt))
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "Build SLP failed: unvectorizable statement %G",
          /* ??? For BB vectorization we want to commutate operands in a way
             to shuffle all unvectorizable defs into one operand and have
             the other still vectorized.  The following doesn't reliably
             work for this though but it's the easiest we can do here.  */
          if (is_a <bb_vec_info> (vinfo) && i != 0)
          /* Fatal mismatch.  */

      gcall *call_stmt = dyn_cast <gcall *> (stmt);
      lhs = gimple_get_lhs (stmt);
      if (lhs == NULL_TREE
              || !gimple_call_internal_p (stmt)
              || !internal_store_fn_p (gimple_call_internal_fn (stmt))))
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "Build SLP failed: not GIMPLE_ASSIGN nor "
                             "GIMPLE_CALL %G", stmt);
          if (is_a <bb_vec_info> (vinfo) && i != 0)
          /* Fatal mismatch.  */

      tree nunits_vectype;
      if (!vect_get_vector_types_for_stmt (vinfo, stmt_info, &vectype,
                                           &nunits_vectype, group_size))
          if (is_a <bb_vec_info> (vinfo) && i != 0)
          /* Fatal mismatch.  */
      /* Record nunits required but continue analysis, producing matches[]
         as if nunits was not an issue.  This allows splitting of groups
          && !vect_record_max_nunits (vinfo, stmt_info, group_size,
                                      nunits_vectype, max_nunits))
          gcc_assert (is_a <bb_vec_info> (vinfo));
          maybe_soft_fail = true;
          soft_fail_nunits_vectype = nunits_vectype;

      gcc_assert (vectype);
          combined_fn cfn = gimple_call_combined_fn (call_stmt);
          if (cfn != CFN_LAST)
            rhs_code = CALL_EXPR;

          if (cfn == CFN_MASK_LOAD
              || cfn == CFN_GATHER_LOAD
              || cfn == CFN_MASK_GATHER_LOAD)
          else if (cfn == CFN_MASK_STORE)
              rhs_code = CFN_MASK_STORE;
          else if ((internal_fn_p (cfn)
                    && !vectorizable_internal_fn_p (as_internal_fn (cfn)))
                   || gimple_call_tail_p (call_stmt)
                   || gimple_call_noreturn_p (call_stmt)
                   || gimple_call_chain (call_stmt))
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: unsupported call type %G",
                                 (gimple *) call_stmt);
              if (is_a <bb_vec_info> (vinfo) && i != 0)
              /* Fatal mismatch.  */
      else if (gimple_code (stmt) == GIMPLE_PHI)
          rhs_code = ERROR_MARK;
          rhs_code = gimple_assign_rhs_code (stmt);
          ldst_p = STMT_VINFO_DATA_REF (stmt_info) != nullptr;

      /* Check the operation.  */
          *node_vectype = vectype;
          first_stmt_code = rhs_code;
          first_stmt_ldst_p = ldst_p;
          first_stmt_phi_p = phi_p;

          /* Shift arguments should be equal in all the packed stmts for a
             vector shift with scalar shift operand.  */
          if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
              || rhs_code == LROTATE_EXPR
              || rhs_code == RROTATE_EXPR)
              /* First see if we have a vector/vector shift.  */
              if (!directly_supported_p (rhs_code, vectype, optab_vector))
                  /* No vector/vector shift, try for a vector/scalar shift.  */
                  if (!directly_supported_p (rhs_code, vectype, optab_scalar))
                      if (dump_enabled_p ())
                        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                         "Build SLP failed: "
                                         "op not supported by target.\n");
                      if (is_a <bb_vec_info> (vinfo) && i != 0)
                      /* Fatal mismatch.  */
              need_same_oprnds = true;
              first_op1 = gimple_assign_rhs2 (stmt);
          else if (rhs_code == WIDEN_LSHIFT_EXPR)
              need_same_oprnds = true;
              first_op1 = gimple_assign_rhs2 (stmt);
                   && rhs_code == BIT_FIELD_REF)
              tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
              if (!is_a <bb_vec_info> (vinfo)
                  || TREE_CODE (vec) != SSA_NAME
                  /* When the element types are not compatible we pun the
                     source to the target vectype which requires equal size.  */
                  || ((!VECTOR_TYPE_P (TREE_TYPE (vec))
                       || !types_compatible_p (TREE_TYPE (vectype),
                                               TREE_TYPE (TREE_TYPE (vec))))
                      && !operand_equal_p (TYPE_SIZE (vectype),
                                           TYPE_SIZE (TREE_TYPE (vec)))))
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "Build SLP failed: "
                                     "BIT_FIELD_REF not supported\n");
                  /* Fatal mismatch.  */
          else if (rhs_code == CFN_DIV_POW2)
              need_same_oprnds = true;
              first_op1 = gimple_call_arg (call_stmt, 1);
          if (first_stmt_code != rhs_code
              && alt_stmt_code == ERROR_MARK)
            alt_stmt_code = rhs_code;
          if ((first_stmt_code != rhs_code
               && (first_stmt_code != IMAGPART_EXPR
                   || rhs_code != REALPART_EXPR)
               && (first_stmt_code != REALPART_EXPR
                   || rhs_code != IMAGPART_EXPR)
               /* Handle mismatches in plus/minus by computing both
                  and merging the results.  */
               && !((first_stmt_code == PLUS_EXPR
                     || first_stmt_code == MINUS_EXPR)
                    && (alt_stmt_code == PLUS_EXPR
                        || alt_stmt_code == MINUS_EXPR)
                    && rhs_code == alt_stmt_code)
               && !(first_stmt_code.is_tree_code ()
                    && rhs_code.is_tree_code ()
                    && (TREE_CODE_CLASS (tree_code (first_stmt_code))
                    && (swap_tree_comparison (tree_code (first_stmt_code))
                        == tree_code (rhs_code)))
               && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
                    && (first_stmt_code == ARRAY_REF
                        || first_stmt_code == BIT_FIELD_REF
                        || first_stmt_code == INDIRECT_REF
                        || first_stmt_code == COMPONENT_REF
                        || first_stmt_code == MEM_REF)
                    && (rhs_code == ARRAY_REF
                        || rhs_code == BIT_FIELD_REF
                        || rhs_code == INDIRECT_REF
                        || rhs_code == COMPONENT_REF
                        || rhs_code == MEM_REF)))
              || first_stmt_ldst_p != ldst_p
              || first_stmt_phi_p != phi_p)
              if (dump_enabled_p ())
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: different operation "
                                   "in stmt %G", stmt);
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "original stmt %G", first_stmt_info->stmt);

              && first_stmt_code == BIT_FIELD_REF
              && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
                  != TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0)))
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: different BIT_FIELD_REF "
                                 "arguments in %G", stmt);

              && first_stmt_code != CFN_MASK_LOAD
              && first_stmt_code != CFN_MASK_STORE)
              if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "Build SLP failed: different calls in %G",

          if ((phi_p || gimple_could_trap_p (stmt_info->stmt))
              && (gimple_bb (first_stmt_info->stmt)
                  != gimple_bb (stmt_info->stmt)))
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: different BB for PHI "
                                 "or possibly trapping operation in %G", stmt);
          if (need_same_oprnds)
              tree other_op1 = gimple_arg (stmt, 1);
              if (!operand_equal_p (first_op1, other_op1, 0))
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "Build SLP failed: different shift "
                                     "arguments in %G", stmt);
          if (!types_compatible_p (vectype, *node_vectype))
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: different vector type "

      /* Grouped store or load.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
          gcc_assert (ldst_p);
          if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)))
              gcc_assert (rhs_code == CFN_MASK_STORE
                          || REFERENCE_CLASS_P (lhs)
              first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
              if (prev_first_load)
                  /* Check that there are no loads from different interleaving
                     chains in the same node.  */
                  if (prev_first_load != first_load)
                      if (dump_enabled_p ())
                        dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                         "Build SLP failed: different "
                                         "interleaving chains in one node %G",
                prev_first_load = first_load;
      /* Non-grouped store or load.  */
          if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))
              && rhs_code != CFN_GATHER_LOAD
              && rhs_code != CFN_MASK_GATHER_LOAD
              /* Not grouped loads are handled as externals for BB
                 vectorization.  For loop vectorization we can handle
                 splats the same we handle single element interleaving.  */
              && (is_a <bb_vec_info> (vinfo)
                  || stmt_info != first_stmt_info
                  || STMT_VINFO_GATHER_SCATTER_P (stmt_info)))
              /* Not grouped load.  */
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: not grouped load %G", stmt);
              /* Fatal mismatch.  */
      /* Not memory operation.  */
          && rhs_code.is_tree_code ()
          && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_binary
          && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_unary
          && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_expression
          && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_comparison
          && rhs_code != VIEW_CONVERT_EXPR
          && rhs_code != CALL_EXPR
          && rhs_code != BIT_FIELD_REF)
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "Build SLP failed: operation unsupported %G",
          if (is_a <bb_vec_info> (vinfo) && i != 0)
          /* Fatal mismatch.  */
      if (rhs_code == COND_EXPR)
          tree cond_expr = gimple_assign_rhs1 (stmt);
          enum tree_code cond_code = TREE_CODE (cond_expr);
          enum tree_code swap_code = ERROR_MARK;
          enum tree_code invert_code = ERROR_MARK;

            first_cond_code = TREE_CODE (cond_expr);
          else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
              bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
              swap_code = swap_tree_comparison (cond_code);
              invert_code = invert_tree_comparison (cond_code, honor_nans);

          if (first_cond_code == cond_code)
          /* Isomorphic can be achieved by swapping.  */
          else if (first_cond_code == swap_code)
          /* Isomorphic can be achieved by inverting.  */
          else if (first_cond_code == invert_code)
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: different"
                                 " operation %G", stmt);

      if (rhs_code.is_tree_code ()
          && TREE_CODE_CLASS ((tree_code)rhs_code) == tcc_comparison
          && (swap_tree_comparison ((tree_code)first_stmt_code)
              == (tree_code)rhs_code))

  for (i = 0; i < group_size; ++i)

  /* If we allowed a two-operation SLP node verify the target can cope
     with the permute we are going to use.  */
  if (alt_stmt_code != ERROR_MARK
      && (!alt_stmt_code.is_tree_code ()
          || (TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_reference
              && TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_comparison)))
      *two_operators = true;

  if (maybe_soft_fail)
      unsigned HOST_WIDE_INT const_nunits;
      if (!TYPE_VECTOR_SUBPARTS
             (soft_fail_nunits_vectype).is_constant (&const_nunits)
          || const_nunits > group_size)
          /* With constant vector elements simulate a mismatch at the
             point we need to split.  */
          unsigned tail = group_size & (const_nunits - 1);
          memset (&matches[group_size - tail], 0, sizeof (bool) * tail);
/* Traits for the hash_set to record failed SLP builds for a stmt set.
   Note we never remove apart from at destruction time so we do not
   need a special value for deleted that differs from empty.  */
  typedef vec <stmt_vec_info> value_type;
  typedef vec <stmt_vec_info> compare_type;
  static inline hashval_t hash (value_type);
  static inline bool equal (value_type existing, value_type candidate);
  static inline bool is_empty (value_type x) { return !x.exists (); }
  static inline bool is_deleted (value_type x) { return !x.exists (); }
  static const bool empty_zero_p = true;
  static inline void mark_empty (value_type &x) { x.release (); }
  static inline void mark_deleted (value_type &x) { x.release (); }
  static inline void remove (value_type &x) { x.release (); }
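
/* Hash an SLP stmt group by the UIDs of its scalar stmts.  */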
bst_traits::hash (value_type x)
  for (unsigned i = 0; i < x.length (); ++i)
    h.add_int (gimple_uid (x[i]->stmt));

bst_traits::equal (value_type existing, value_type candidate)
  if (existing.length () != candidate.length ())
  for (unsigned i = 0; i < existing.length (); ++i)
    if (existing[i] != candidate[i])
/* ??? This was std::pair<std::pair<tree_code, vect_def_type>, tree>
   but then vec::insert does memmove and that's not compatible with
  chain_op_t (tree_code code_, vect_def_type dt_, tree op_)
      : code (code_), dt (dt_), op (op_) {}

/* Comparator for sorting associatable chains.  */

dt_sort_cmp (const void *op1_, const void *op2_, void *)
  auto *op1 = (const chain_op_t *) op1_;
  auto *op2 = (const chain_op_t *) op2_;
  if (op1->dt != op2->dt)
    return (int)op1->dt - (int)op2->dt;
  return (int)op1->code - (int)op2->code;
/* Linearize the associatable expression chain at START with the
   associatable operation CODE (where PLUS_EXPR also allows MINUS_EXPR),
   filling CHAIN with the result and using WORKLIST as intermediate storage.
   CODE_STMT and ALT_CODE_STMT are filled with the first stmt using CODE
   or MINUS_EXPR.  *CHAIN_STMTS if not NULL is filled with all computation
   stmts, starting with START.  */
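/* For example, linearizing the lane expression (a + b) - c with CODE being
   PLUS_EXPR yields the chain entries { +a, +b, -c } (in some order).  */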
vect_slp_linearize_chain (vec_info *vinfo,
                          vec<std::pair<tree_code, gimple *> > &worklist,
                          vec<chain_op_t> &chain,
                          enum tree_code code, gimple *start,
                          gimple *&code_stmt, gimple *&alt_code_stmt,
                          vec<gimple *> *chain_stmts)
  /* For each lane linearize the addition/subtraction (or other
     uniform associatable operation) expression tree.  */
  worklist.safe_push (std::make_pair (code, start));
  while (!worklist.is_empty ())
      auto entry = worklist.pop ();
      gassign *stmt = as_a <gassign *> (entry.second);
      enum tree_code in_code = entry.first;
      enum tree_code this_code = gimple_assign_rhs_code (stmt);
      /* Pick some stmts suitable for SLP_TREE_REPRESENTATIVE.  */
          && gimple_assign_rhs_code (stmt) == code)
      else if (!alt_code_stmt
               && gimple_assign_rhs_code (stmt) == MINUS_EXPR)
        alt_code_stmt = stmt;
        chain_stmts->safe_push (stmt);
      for (unsigned opnum = 1; opnum <= 2; ++opnum)
          tree op = gimple_op (stmt, opnum);
          stmt_vec_info def_stmt_info;
          bool res = vect_is_simple_use (op, vinfo, &dt, &def_stmt_info);
          if (dt == vect_internal_def
              && is_pattern_stmt_p (def_stmt_info))
            op = gimple_get_lhs (def_stmt_info->stmt);
          use_operand_p use_p;
          if (dt == vect_internal_def
              && single_imm_use (op, &use_p, &use_stmt)
              && is_gimple_assign (def_stmt_info->stmt)
              && (gimple_assign_rhs_code (def_stmt_info->stmt) == code
                  || (code == PLUS_EXPR
                      && (gimple_assign_rhs_code (def_stmt_info->stmt)
              tree_code op_def_code = this_code;
              if (op_def_code == MINUS_EXPR && opnum == 1)
                op_def_code = PLUS_EXPR;
              if (in_code == MINUS_EXPR)
                op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
              worklist.safe_push (std::make_pair (op_def_code,
                                                  def_stmt_info->stmt));
              tree_code op_def_code = this_code;
              if (op_def_code == MINUS_EXPR && opnum == 1)
                op_def_code = PLUS_EXPR;
              if (in_code == MINUS_EXPR)
                op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
              chain.safe_push (chain_op_t (op_def_code, dt, op));
typedef hash_map <vec <stmt_vec_info>, slp_tree,
                  simple_hashmap_traits <bst_traits, slp_tree> >
  scalar_stmts_to_slp_tree_map_t;

vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
                       vec<stmt_vec_info> stmts, unsigned int group_size,
                       poly_uint64 *max_nunits,
                       bool *matches, unsigned *limit, unsigned *tree_size,
                       scalar_stmts_to_slp_tree_map_t *bst_map);
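
/* Build an SLP tree for the stmt group STMTS, re-using a node already
   recorded in BST_MAP (or its recorded failure) if there is one and
   otherwise seeding a stub node, doing discovery via vect_build_slp_tree_2
   and recording the result in BST_MAP.  */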
vect_build_slp_tree (vec_info *vinfo,
                     vec<stmt_vec_info> stmts, unsigned int group_size,
                     poly_uint64 *max_nunits,
                     bool *matches, unsigned *limit, unsigned *tree_size,
                     scalar_stmts_to_slp_tree_map_t *bst_map)
  if (slp_tree *leader = bst_map->get (stmts))
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
                         !(*leader)->failed ? "" : "failed ",
      if (!(*leader)->failed)
          SLP_TREE_REF_COUNT (*leader)++;
          vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
      memcpy (matches, (*leader)->failed, sizeof (bool) * group_size);

  /* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
     so we can pick up backedge destinations during discovery.  */
  slp_tree res = new _slp_tree;
  SLP_TREE_DEF_TYPE (res) = vect_internal_def;
  SLP_TREE_SCALAR_STMTS (res) = stmts;
  bst_map->put (stmts.copy (), res);

      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "SLP discovery limit exceeded\n");
      /* Mark the node invalid so we can detect those when still in use
         as backedge destinations.  */
      SLP_TREE_SCALAR_STMTS (res) = vNULL;
      SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
      res->failed = XNEWVEC (bool, group_size);
      memset (res->failed, 0, sizeof (bool) * group_size);
      memset (matches, 0, sizeof (bool) * group_size);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "starting SLP discovery for node %p\n", (void *) res);

  poly_uint64 this_max_nunits = 1;
  slp_tree res_ = vect_build_slp_tree_2 (vinfo, res, stmts, group_size,
                                         matches, limit, tree_size, bst_map);
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "SLP discovery for node %p failed\n", (void *) res);
      /* Mark the node invalid so we can detect those when still in use
         as backedge destinations.  */
      SLP_TREE_SCALAR_STMTS (res) = vNULL;
      SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
      res->failed = XNEWVEC (bool, group_size);
          for (i = 0; i < group_size; ++i)
          gcc_assert (i < group_size);
      memcpy (res->failed, matches, sizeof (bool) * group_size);
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "SLP discovery for node %p succeeded\n",
      gcc_assert (res_ == res);
      res->max_nunits = this_max_nunits;
      vect_update_max_nunits (max_nunits, this_max_nunits);
      /* Keep a reference for the bst_map use.  */
      SLP_TREE_REF_COUNT (res)++;
/* Helper for building an associated SLP node chain.  */

vect_slp_build_two_operator_nodes (slp_tree perm, tree vectype,
                                   slp_tree op0, slp_tree op1,
                                   stmt_vec_info oper1, stmt_vec_info oper2,
                                   vec<std::pair<unsigned, unsigned> > lperm)
  unsigned group_size = SLP_TREE_LANES (op1);

  slp_tree child1 = new _slp_tree;
  SLP_TREE_DEF_TYPE (child1) = vect_internal_def;
  SLP_TREE_VECTYPE (child1) = vectype;
  SLP_TREE_LANES (child1) = group_size;
  SLP_TREE_CHILDREN (child1).create (2);
  SLP_TREE_CHILDREN (child1).quick_push (op0);
  SLP_TREE_CHILDREN (child1).quick_push (op1);
  SLP_TREE_REPRESENTATIVE (child1) = oper1;

  slp_tree child2 = new _slp_tree;
  SLP_TREE_DEF_TYPE (child2) = vect_internal_def;
  SLP_TREE_VECTYPE (child2) = vectype;
  SLP_TREE_LANES (child2) = group_size;
  SLP_TREE_CHILDREN (child2).create (2);
  SLP_TREE_CHILDREN (child2).quick_push (op0);
  SLP_TREE_REF_COUNT (op0)++;
  SLP_TREE_CHILDREN (child2).quick_push (op1);
  SLP_TREE_REF_COUNT (op1)++;
  SLP_TREE_REPRESENTATIVE (child2) = oper2;

  SLP_TREE_DEF_TYPE (perm) = vect_internal_def;
  SLP_TREE_CODE (perm) = VEC_PERM_EXPR;
  SLP_TREE_VECTYPE (perm) = vectype;
  SLP_TREE_LANES (perm) = group_size;
  /* ??? We should set this NULL but that's not expected.  */
  SLP_TREE_REPRESENTATIVE (perm) = oper1;
  SLP_TREE_LANE_PERMUTATION (perm) = lperm;
  SLP_TREE_CHILDREN (perm).quick_push (child1);
  SLP_TREE_CHILDREN (perm).quick_push (child2);
/* Recursively build an SLP tree starting from NODE.
   Fail (and return a value not equal to zero) if def-stmts are not
   isomorphic, require data permutation or are of unsupported types of
   operation.  Otherwise, return 0.
   The value returned is the depth in the SLP tree where a mismatch

vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
                       vec<stmt_vec_info> stmts, unsigned int group_size,
                       poly_uint64 *max_nunits,
                       bool *matches, unsigned *limit, unsigned *tree_size,
                       scalar_stmts_to_slp_tree_map_t *bst_map)
  unsigned nops, i, this_tree_size = 0;
  poly_uint64 this_max_nunits = *max_nunits;

  stmt_vec_info stmt_info = stmts[0];
  if (!is_a <gcall *> (stmt_info->stmt)
      && !is_a <gassign *> (stmt_info->stmt)
      && !is_a <gphi *> (stmt_info->stmt))

  nops = gimple_num_args (stmt_info->stmt);
  if (const int *map = vect_get_operand_map (stmt_info->stmt))

  /* If the SLP node is a PHI (induction or reduction), terminate
  bool *skip_args = XALLOCAVEC (bool, nops);
  memset (skip_args, 0, sizeof (bool) * nops);
  if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
    if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
        tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
        tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
        if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,

        vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
        if (def_type == vect_induction_def)
            /* Induction PHIs are not cycles but walk the initial
               value.  Only for inner loops though, for outer loops
               we need to pick up the value from the actual PHIs
               to more easily support peeling and epilogue vectorization.  */
            class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
            if (!nested_in_vect_loop_p (loop, stmt_info))
              skip_args[loop_preheader_edge (loop)->dest_idx] = true;
              skip_args[loop_latch_edge (loop)->dest_idx] = true;
        else if (def_type == vect_reduction_def
                 || def_type == vect_double_reduction_def
                 || def_type == vect_nested_cycle
                 || def_type == vect_first_order_recurrence)
            /* Else def types have to match.  */
            stmt_vec_info other_info;
            bool all_same = true;
            FOR_EACH_VEC_ELT (stmts, i, other_info)
                if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
                if (other_info != stmt_info)
            class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
            /* Reduction initial values are not explicitly represented.  */
            if (def_type != vect_first_order_recurrence
                && !nested_in_vect_loop_p (loop, stmt_info))
              skip_args[loop_preheader_edge (loop)->dest_idx] = true;
            /* Reduction chain backedge defs are filled manually.
               ??? Need a better way to identify a SLP reduction chain PHI.
               Or a better overall way to SLP match those.  */
            if (all_same && def_type == vect_reduction_def)
              skip_args[loop_latch_edge (loop)->dest_idx] = true;
        else if (def_type != vect_internal_def)
  bool two_operators = false;
  unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
  tree vectype = NULL_TREE;
  if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
                              &this_max_nunits, matches, &two_operators,

  /* If the SLP node is a load, terminate the recursion unless masked.  */
  if (STMT_VINFO_DATA_REF (stmt_info)
      && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
      if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
        gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD)
                    || gimple_call_internal_p (stmt, IFN_GATHER_LOAD)
                    || gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD));

          *max_nunits = this_max_nunits;
          node = vect_create_new_slp_node (node, stmts, 0);
          SLP_TREE_VECTYPE (node) = vectype;
          /* And compute the load permutation.  Whether it is actually
             a permutation depends on the unrolling factor which is
          vec<unsigned> load_permutation;
          stmt_vec_info load_info;
          load_permutation.create (group_size);
          stmt_vec_info first_stmt_info
            = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
          FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
              if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
                load_place = vect_get_place_in_interleaving_chain
                               (load_info, first_stmt_info);
              gcc_assert (load_place != -1);
              load_permutation.safe_push (load_place);
          SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
  else if (gimple_assign_single_p (stmt_info->stmt)
           && !gimple_vuse (stmt_info->stmt)
           && gimple_assign_rhs_code (stmt_info->stmt) == BIT_FIELD_REF)
      /* vect_build_slp_tree_2 determined all BIT_FIELD_REFs reference
         the same SSA name vector of a compatible type to vectype.  */
      vec<std::pair<unsigned, unsigned> > lperm = vNULL;
      tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0);
      stmt_vec_info estmt_info;
      FOR_EACH_VEC_ELT (stmts, i, estmt_info)
          gassign *estmt = as_a <gassign *> (estmt_info->stmt);
          tree bfref = gimple_assign_rhs1 (estmt);
          if (!known_eq (bit_field_size (bfref),
                         tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype))))
              || !constant_multiple_p (bit_field_offset (bfref),
                                       bit_field_size (bfref), &lane))
          lperm.safe_push (std::make_pair (0, (unsigned)lane));

      slp_tree vnode = vect_create_new_slp_node (vNULL);
      if (operand_equal_p (TYPE_SIZE (vectype), TYPE_SIZE (TREE_TYPE (vec))))
        /* ??? We record vectype here but we hide eventually necessary
           punning and instead rely on code generation to materialize
           VIEW_CONVERT_EXPRs as necessary.  We instead should make
           this explicit somehow.  */
        SLP_TREE_VECTYPE (vnode) = vectype;
          /* For different size but compatible elements we can still
             use VEC_PERM_EXPR without punning.  */
          gcc_assert (VECTOR_TYPE_P (TREE_TYPE (vec))
                      && types_compatible_p (TREE_TYPE (vectype),
                                             TREE_TYPE (TREE_TYPE (vec))));
          SLP_TREE_VECTYPE (vnode) = TREE_TYPE (vec);

      auto nunits = TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (vnode));
      unsigned HOST_WIDE_INT const_nunits;
      if (nunits.is_constant (&const_nunits))
        SLP_TREE_LANES (vnode) = const_nunits;
      SLP_TREE_VEC_DEFS (vnode).safe_push (vec);
      /* We are always building a permutation node even if it is an identity
         permute to shield the rest of the vectorizer from the odd node
         representing an actual vector without any scalar ops.
         ??? We could hide it completely with making the permute node
      node = vect_create_new_slp_node (node, stmts, 1);
      SLP_TREE_CODE (node) = VEC_PERM_EXPR;
      SLP_TREE_LANE_PERMUTATION (node) = lperm;
      SLP_TREE_VECTYPE (node) = vectype;
      SLP_TREE_CHILDREN (node).quick_push (vnode);
  /* When discovery reaches an associatable operation see whether we can
     improve that to match up lanes in a way superior to the operand
     swapping code which at most looks at two defs.
     ??? For BB vectorization we cannot do the brute-force search
     for matching as we can succeed by means of builds from scalars
     and have no good way to "cost" one build against another.  */
  else if (is_a <loop_vec_info> (vinfo)
           /* ??? We don't handle !vect_internal_def defs below.  */
           && STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
           && is_gimple_assign (stmt_info->stmt)
           && (associative_tree_code (gimple_assign_rhs_code (stmt_info->stmt))
               || gimple_assign_rhs_code (stmt_info->stmt) == MINUS_EXPR)
           && ((FLOAT_TYPE_P (vectype) && flag_associative_math)
               || (INTEGRAL_TYPE_P (TREE_TYPE (vectype))
                   && TYPE_OVERFLOW_WRAPS (TREE_TYPE (vectype)))))
      /* See if we have a chain of (mixed) adds or subtracts or other
         associatable ops.  */
      enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
      if (code == MINUS_EXPR)
      stmt_vec_info other_op_stmt_info = NULL;
      stmt_vec_info op_stmt_info = NULL;
      unsigned chain_len = 0;
      auto_vec<chain_op_t> chain;
      auto_vec<std::pair<tree_code, gimple *> > worklist;
      auto_vec<vec<chain_op_t> > chains (group_size);
      auto_vec<slp_tree, 4> children;
      bool hard_fail = true;
      for (unsigned lane = 0; lane < group_size; ++lane)
          /* For each lane linearize the addition/subtraction (or other
             uniform associatable operation) expression tree.  */
          gimple *op_stmt = NULL, *other_op_stmt = NULL;
          vect_slp_linearize_chain (vinfo, worklist, chain, code,
                                    stmts[lane]->stmt, op_stmt, other_op_stmt,
          if (!op_stmt_info && op_stmt)
            op_stmt_info = vinfo->lookup_stmt (op_stmt);
          if (!other_op_stmt_info && other_op_stmt)
            other_op_stmt_info = vinfo->lookup_stmt (other_op_stmt);
          if (chain.length () == 2)
              /* In a chain of just two elements resort to the regular
                 operand swapping scheme.  If we run into a length
                 mismatch still hard-FAIL.  */
                  matches[lane] = false;
                  /* ??? We might want to process the other lanes, but
                     make sure to not give false matching hints to the
                     caller for lanes we did not process.  */
                  if (lane != group_size - 1)
          else if (chain_len == 0)
            chain_len = chain.length ();
          else if (chain.length () != chain_len)
              /* ??? Here we could slip in magic to compensate with
                 neutral operands.  */
              matches[lane] = false;
              if (lane != group_size - 1)
          chains.quick_push (chain.copy ());

      if (chains.length () == group_size)
2003 /* We cannot yet use SLP_TREE_CODE to communicate the operation. */
2009 /* Now we have a set of chains with the same length. */
2010 /* 1. pre-sort according to def_type and operation. */
2011 for (unsigned lane
= 0; lane
< group_size
; ++lane
)
2012 chains
[lane
].stablesort (dt_sort_cmp
, vinfo
);
2013 if (dump_enabled_p ())
2015 dump_printf_loc (MSG_NOTE
, vect_location
,
2016 "pre-sorted chains of %s\n",
2017 get_tree_code_name (code
));
2018 for (unsigned lane
= 0; lane
< group_size
; ++lane
)
2020 for (unsigned opnum
= 0; opnum
< chain_len
; ++opnum
)
2021 dump_printf (MSG_NOTE
, "%s %T ",
2022 get_tree_code_name (chains
[lane
][opnum
].code
),
2023 chains
[lane
][opnum
].op
);
2024 dump_printf (MSG_NOTE
, "\n");
2027 /* 2. try to build children nodes, associating as necessary. */
2028 for (unsigned n
= 0; n
< chain_len
; ++n
)
2030 vect_def_type dt
= chains
[0][n
].dt
;
2032 for (lane
= 0; lane
< group_size
; ++lane
)
2033 if (chains
[lane
][n
].dt
!= dt
)
2035 if (dt
== vect_constant_def
2036 && chains
[lane
][n
].dt
== vect_external_def
)
2037 dt
= vect_external_def
;
2038 else if (dt
== vect_external_def
2039 && chains
[lane
][n
].dt
== vect_constant_def
)
2044 if (lane
!= group_size
)
2046 if (dump_enabled_p ())
2047 dump_printf_loc (MSG_NOTE
, vect_location
,
2048 "giving up on chain due to mismatched "
2050 matches
[lane
] = false;
2051 if (lane
!= group_size
- 1)
2055 if (dt
== vect_constant_def
2056 || dt
== vect_external_def
)
2058 /* Check whether we can build the invariant. If we can't
2059 we never will be able to. */
2060 tree type
= TREE_TYPE (chains
[0][n
].op
);
2061 if (!GET_MODE_SIZE (vinfo
->vector_mode
).is_constant ()
2062 && (TREE_CODE (type
) == BOOLEAN_TYPE
2063 || !can_duplicate_and_interleave_p (vinfo
, group_size
,
2070 ops
.create (group_size
);
2071 for (lane
= 0; lane
< group_size
; ++lane
)
2072 ops
.quick_push (chains
[lane
][n
].op
);
2073 slp_tree child
= vect_create_new_slp_node (ops
);
2074 SLP_TREE_DEF_TYPE (child
) = dt
;
2075 children
.safe_push (child
);
2077 else if (dt
!= vect_internal_def
)
2079 /* Not sure, we might need sth special.
2080 gcc.dg/vect/pr96854.c,
2081 gfortran.dg/vect/fast-math-pr37021.f90
2082 and gfortran.dg/vect/pr61171.f trigger. */
2083 /* Soft-fail for now. */
2089 vec
<stmt_vec_info
> op_stmts
;
2090 op_stmts
.create (group_size
);
2091 slp_tree child
= NULL
;
2092 /* Brute-force our way. We have to consider a lane
2093 failing after fixing an earlier fail up in the
2094 SLP discovery recursion. So track the current
2095 permute per lane. */
2096 unsigned *perms
= XALLOCAVEC (unsigned, group_size
);
2097 memset (perms
, 0, sizeof (unsigned) * group_size
);
2100 op_stmts
.truncate (0);
2101 for (lane
= 0; lane
< group_size
; ++lane
)
2103 (vinfo
->lookup_def (chains
[lane
][n
].op
));
2104 child
= vect_build_slp_tree (vinfo
, op_stmts
,
2105 group_size
, &this_max_nunits
,
2107 &this_tree_size
, bst_map
);
2108 /* ??? We're likely getting too many fatal mismatches
2109 here so maybe we want to ignore them (but then we
2110 have no idea which lanes fatally mismatched). */
2111 if (child
|| !matches
[0])
2113 /* Swap another lane we have not yet matched up into
2114 lanes that did not match. If we run out of
2115 permute possibilities for a lane terminate the
2118 for (lane
= 1; lane
< group_size
; ++lane
)
2121 if (n
+ perms
[lane
] + 1 == chain_len
)
2126 std::swap (chains
[lane
][n
],
2127 chains
[lane
][n
+ perms
[lane
] + 1]);
2136 if (dump_enabled_p ())
2137 dump_printf_loc (MSG_NOTE
, vect_location
,
2138 "failed to match up op %d\n", n
);
2139 op_stmts
.release ();
2140 if (lane
!= group_size
- 1)
2143 matches
[lane
] = false;
2146 if (dump_enabled_p ())
2148 dump_printf_loc (MSG_NOTE
, vect_location
,
2149 "matched up op %d to\n", n
);
2150 vect_print_slp_tree (MSG_NOTE
, vect_location
, child
);
2152 children
.safe_push (child
);
2155 /* 3. build SLP nodes to combine the chain. */
2156 for (unsigned lane
= 0; lane
< group_size
; ++lane
)
2157 if (chains
[lane
][0].code
!= code
)
2159 /* See if there's any alternate all-PLUS entry. */
2161 for (n
= 1; n
< chain_len
; ++n
)
2163 for (lane
= 0; lane
< group_size
; ++lane
)
2164 if (chains
[lane
][n
].code
!= code
)
2166 if (lane
== group_size
)
2171 /* Swap that in at first position. */
2172 std::swap (children
[0], children
[n
]);
2173 for (lane
= 0; lane
< group_size
; ++lane
)
2174 std::swap (chains
[lane
][0], chains
[lane
][n
]);
2178 /* ??? When this triggers and we end up with two
2179 vect_constant/external_def up-front things break (ICE)
2180 spectacularly finding an insertion place for the
2181 all-constant op. We should have a fully
2182 vect_internal_def operand though(?) so we can swap
2183 that into first place and then prepend the all-zero
2185 if (dump_enabled_p ())
2186 dump_printf_loc (MSG_NOTE
, vect_location
,
2187 "inserting constant zero to compensate "
2188 "for (partially) negated first "
2191 for (lane
= 0; lane
< group_size
; ++lane
)
2192 chains
[lane
].safe_insert
2193 (0, chain_op_t (code
, vect_constant_def
, NULL_TREE
));
2195 zero_ops
.create (group_size
);
2196 zero_ops
.quick_push (build_zero_cst (TREE_TYPE (vectype
)));
2197 for (lane
= 1; lane
< group_size
; ++lane
)
2198 zero_ops
.quick_push (zero_ops
[0]);
2199 slp_tree zero
= vect_create_new_slp_node (zero_ops
);
2200 SLP_TREE_DEF_TYPE (zero
) = vect_constant_def
;
2201 children
.safe_insert (0, zero
);
2205 for (unsigned i
= 1; i
< children
.length (); ++i
)
2207 slp_tree op0
= children
[i
- 1];
2208 slp_tree op1
= children
[i
];
2209 bool this_two_op
= false;
2210 for (unsigned lane
= 0; lane
< group_size
; ++lane
)
2211 if (chains
[lane
][i
].code
!= chains
[0][i
].code
)
2217 if (i
== children
.length () - 1)
2218 child
= vect_create_new_slp_node (node
, stmts
, 2);
2220 child
= vect_create_new_slp_node (2, ERROR_MARK
);
2223 vec
<std::pair
<unsigned, unsigned> > lperm
;
2224 lperm
.create (group_size
);
2225 for (unsigned lane
= 0; lane
< group_size
; ++lane
)
2226 lperm
.quick_push (std::make_pair
2227 (chains
[lane
][i
].code
!= chains
[0][i
].code
, lane
));
2228 vect_slp_build_two_operator_nodes (child
, vectype
, op0
, op1
,
2229 (chains
[0][i
].code
== code
2231 : other_op_stmt_info
),
2232 (chains
[0][i
].code
== code
2233 ? other_op_stmt_info
2239 SLP_TREE_DEF_TYPE (child
) = vect_internal_def
;
2240 SLP_TREE_VECTYPE (child
) = vectype
;
2241 SLP_TREE_LANES (child
) = group_size
;
2242 SLP_TREE_CHILDREN (child
).quick_push (op0
);
2243 SLP_TREE_CHILDREN (child
).quick_push (op1
);
2244 SLP_TREE_REPRESENTATIVE (child
)
2245 = (chains
[0][i
].code
== code
2246 ? op_stmt_info
: other_op_stmt_info
);
2248 children
[i
] = child
;
2250 *tree_size
+= this_tree_size
+ 1;
2251 *max_nunits
= this_max_nunits
;
2252 while (!chains
.is_empty ())
2253 chains
.pop ().release ();
2257 while (!children
.is_empty ())
2258 vect_free_slp_tree (children
.pop ());
2259 while (!chains
.is_empty ())
2260 chains
.pop ().release ();
2261 /* Hard-fail, otherwise we might run into quadratic processing of the
2262 chains starting one stmt into the chain again. */
2265 /* Fall thru to normal processing. */
2268 /* Get at the operands, verifying they are compatible. */
2269 vec
<slp_oprnd_info
> oprnds_info
= vect_create_oprnd_info (nops
, group_size
);
2270 slp_oprnd_info oprnd_info
;
2271 FOR_EACH_VEC_ELT (stmts
, i
, stmt_info
)
2273 int res
= vect_get_and_check_slp_defs (vinfo
, swap
[i
], skip_args
,
2274 stmts
, i
, &oprnds_info
);
2276 matches
[(res
== -1) ? 0 : i
] = false;
2280 for (i
= 0; i
< group_size
; ++i
)
2283 vect_free_oprnd_info (oprnds_info
);
2288 auto_vec
<slp_tree
, 4> children
;
2290 stmt_info
= stmts
[0];
2292 /* Create SLP_TREE nodes for the definition node/s. */
2293 FOR_EACH_VEC_ELT (oprnds_info
, i
, oprnd_info
)
2298 /* We're skipping certain operands from processing, for example
2299 outer loop reduction initial defs. */
2302 children
.safe_push (NULL
);
2306 if (oprnd_info
->first_dt
== vect_uninitialized_def
)
2308 /* COND_EXPR have one too many eventually if the condition
2310 gcc_assert (i
== 3 && nops
== 4);
2314 if (is_a
<bb_vec_info
> (vinfo
)
2315 && oprnd_info
->first_dt
== vect_internal_def
2316 && !oprnd_info
->any_pattern
)
2318 /* For BB vectorization, if all defs are the same do not
2319 bother to continue the build along the single-lane
2320 graph but use a splat of the scalar value. */
2321 stmt_vec_info first_def
= oprnd_info
->def_stmts
[0];
2322 for (j
= 1; j
< group_size
; ++j
)
2323 if (oprnd_info
->def_stmts
[j
] != first_def
)
2326 /* But avoid doing this for loads where we may be
2327 able to CSE things, unless the stmt is not
2329 && (!STMT_VINFO_VECTORIZABLE (first_def
)
2330 || !gimple_vuse (first_def
->stmt
)))
2332 if (dump_enabled_p ())
2333 dump_printf_loc (MSG_NOTE
, vect_location
,
2334 "Using a splat of the uniform operand %G",
2336 oprnd_info
->first_dt
= vect_external_def
;
2340 if (oprnd_info
->first_dt
== vect_external_def
2341 || oprnd_info
->first_dt
== vect_constant_def
)
2343 slp_tree invnode
= vect_create_new_slp_node (oprnd_info
->ops
);
2344 SLP_TREE_DEF_TYPE (invnode
) = oprnd_info
->first_dt
;
2345 oprnd_info
->ops
= vNULL
;
2346 children
.safe_push (invnode
);
2350 if ((child
= vect_build_slp_tree (vinfo
, oprnd_info
->def_stmts
,
2351 group_size
, &this_max_nunits
,
2353 &this_tree_size
, bst_map
)) != NULL
)
2355 oprnd_info
->def_stmts
= vNULL
;
2356 children
.safe_push (child
);
2360 /* If the SLP build for operand zero failed and operand zero
2361 and one can be commutated try that for the scalar stmts
2362 that failed the match. */
2364 /* A first scalar stmt mismatch signals a fatal mismatch. */
2366 /* ??? For COND_EXPRs we can swap the comparison operands
2367 as well as the arms under some constraints. */
2369 && oprnds_info
[1]->first_dt
== vect_internal_def
2370 && is_gimple_assign (stmt_info
->stmt
)
2371 /* Swapping operands for reductions breaks assumptions later on. */
2372 && STMT_VINFO_DEF_TYPE (stmt_info
) != vect_reduction_def
2373 && STMT_VINFO_DEF_TYPE (stmt_info
) != vect_double_reduction_def
)
2375 /* See whether we can swap the matching or the non-matching
2377 bool swap_not_matching
= true;
2380 for (j
= 0; j
< group_size
; ++j
)
2382 if (matches
[j
] != !swap_not_matching
)
2384 stmt_vec_info stmt_info
= stmts
[j
];
2385 /* Verify if we can swap operands of this stmt. */
2386 gassign
*stmt
= dyn_cast
<gassign
*> (stmt_info
->stmt
);
2388 || !commutative_tree_code (gimple_assign_rhs_code (stmt
)))
2390 if (!swap_not_matching
)
2392 swap_not_matching
= false;
2397 while (j
!= group_size
);
2399 /* Swap mismatched definition stmts. */
2400 if (dump_enabled_p ())
2401 dump_printf_loc (MSG_NOTE
, vect_location
,
2402 "Re-trying with swapped operands of stmts ");
2403 for (j
= 0; j
< group_size
; ++j
)
2404 if (matches
[j
] == !swap_not_matching
)
2406 std::swap (oprnds_info
[0]->def_stmts
[j
],
2407 oprnds_info
[1]->def_stmts
[j
]);
2408 std::swap (oprnds_info
[0]->ops
[j
],
2409 oprnds_info
[1]->ops
[j
]);
2410 if (dump_enabled_p ())
2411 dump_printf (MSG_NOTE
, "%d ", j
);
2413 if (dump_enabled_p ())
2414 dump_printf (MSG_NOTE
, "\n");
2415 /* After swapping some operands we lost track whether an
2416 operand has any pattern defs so be conservative here. */
2417 if (oprnds_info
[0]->any_pattern
|| oprnds_info
[1]->any_pattern
)
2418 oprnds_info
[0]->any_pattern
= oprnds_info
[1]->any_pattern
= true;
2419 /* And try again with scratch 'matches' ... */
2420 bool *tem
= XALLOCAVEC (bool, group_size
);
2421 if ((child
= vect_build_slp_tree (vinfo
, oprnd_info
->def_stmts
,
2422 group_size
, &this_max_nunits
,
2424 &this_tree_size
, bst_map
)) != NULL
)
2426 oprnd_info
->def_stmts
= vNULL
;
2427 children
.safe_push (child
);
2433 /* If the SLP build failed and we analyze a basic-block
2434 simply treat nodes we fail to build as externally defined
2435 (and thus build vectors from the scalar defs).
2436 The cost model will reject outright expensive cases.
2437 ??? This doesn't treat cases where permutation ultimatively
2438 fails (or we don't try permutation below). Ideally we'd
2439 even compute a permutation that will end up with the maximum
2441 if (is_a
<bb_vec_info
> (vinfo
)
2442 /* ??? Rejecting patterns this way doesn't work. We'd have to
2443 do extra work to cancel the pattern so the uses see the
2445 && !is_pattern_stmt_p (stmt_info
)
2446 && !oprnd_info
->any_pattern
)
2448 /* But if there's a leading vector sized set of matching stmts
2449 fail here so we can split the group. This matches the condition
2450 vect_analyze_slp_instance uses. */
2451 /* ??? We might want to split here and combine the results to support
2452 multiple vector sizes better. */
2453 for (j
= 0; j
< group_size
; ++j
)
2456 if (!known_ge (j
, TYPE_VECTOR_SUBPARTS (vectype
)))
2458 if (dump_enabled_p ())
2459 dump_printf_loc (MSG_NOTE
, vect_location
,
2460 "Building vector operands from scalars\n");
2462 child
= vect_create_new_slp_node (oprnd_info
->ops
);
2463 children
.safe_push (child
);
2464 oprnd_info
->ops
= vNULL
;
2469 gcc_assert (child
== NULL
);
2470 FOR_EACH_VEC_ELT (children
, j
, child
)
2472 vect_free_slp_tree (child
);
2473 vect_free_oprnd_info (oprnds_info
);
2477 vect_free_oprnd_info (oprnds_info
);
2479 /* If we have all children of a child built up from uniform scalars
2480 or does more than one possibly expensive vector construction then
2481 just throw that away, causing it built up from scalars.
2482 The exception is the SLP node for the vector store. */
2483 if (is_a
<bb_vec_info
> (vinfo
)
2484 && !STMT_VINFO_GROUPED_ACCESS (stmt_info
)
2485 /* ??? Rejecting patterns this way doesn't work. We'd have to
2486 do extra work to cancel the pattern so the uses see the
2488 && !is_pattern_stmt_p (stmt_info
))
2492 bool all_uniform_p
= true;
2493 unsigned n_vector_builds
= 0;
2494 FOR_EACH_VEC_ELT (children
, j
, child
)
2498 else if (SLP_TREE_DEF_TYPE (child
) == vect_internal_def
)
2499 all_uniform_p
= false;
2500 else if (!vect_slp_tree_uniform_p (child
))
2502 all_uniform_p
= false;
2503 if (SLP_TREE_DEF_TYPE (child
) == vect_external_def
)
2508 || n_vector_builds
> 1
2509 || (n_vector_builds
== children
.length ()
2510 && is_a
<gphi
*> (stmt_info
->stmt
)))
2514 FOR_EACH_VEC_ELT (children
, j
, child
)
2516 vect_free_slp_tree (child
);
2518 if (dump_enabled_p ())
2519 dump_printf_loc (MSG_NOTE
, vect_location
,
2520 "Building parent vector operands from "
2521 "scalars instead\n");
2526 *tree_size
+= this_tree_size
+ 1;
2527 *max_nunits
= this_max_nunits
;
2531 /* ??? We'd likely want to either cache in bst_map sth like
2532 { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
2533 the true { a+b, a+b, a+b, a+b } ... but there we don't have
2534 explicit stmts to put in so the keying on 'stmts' doesn't
2535 work (but we have the same issue with nodes that use 'ops'). */
2536 slp_tree one
= new _slp_tree
;
2537 slp_tree two
= new _slp_tree
;
2538 SLP_TREE_DEF_TYPE (one
) = vect_internal_def
;
2539 SLP_TREE_DEF_TYPE (two
) = vect_internal_def
;
2540 SLP_TREE_VECTYPE (one
) = vectype
;
2541 SLP_TREE_VECTYPE (two
) = vectype
;
2542 SLP_TREE_CHILDREN (one
).safe_splice (children
);
2543 SLP_TREE_CHILDREN (two
).safe_splice (children
);
2545 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two
), i
, child
)
2546 SLP_TREE_REF_COUNT (child
)++;
2548 /* Here we record the original defs since this
2549 node represents the final lane configuration. */
2550 node
= vect_create_new_slp_node (node
, stmts
, 2);
2551 SLP_TREE_VECTYPE (node
) = vectype
;
2552 SLP_TREE_CODE (node
) = VEC_PERM_EXPR
;
2553 SLP_TREE_CHILDREN (node
).quick_push (one
);
2554 SLP_TREE_CHILDREN (node
).quick_push (two
);
2555 gassign
*stmt
= as_a
<gassign
*> (stmts
[0]->stmt
);
2556 enum tree_code code0
= gimple_assign_rhs_code (stmt
);
2557 enum tree_code ocode
= ERROR_MARK
;
2558 stmt_vec_info ostmt_info
;
2560 FOR_EACH_VEC_ELT (stmts
, i
, ostmt_info
)
2562 gassign
*ostmt
= as_a
<gassign
*> (ostmt_info
->stmt
);
2563 if (gimple_assign_rhs_code (ostmt
) != code0
)
2565 SLP_TREE_LANE_PERMUTATION (node
).safe_push (std::make_pair (1, i
));
2566 ocode
= gimple_assign_rhs_code (ostmt
);
2570 SLP_TREE_LANE_PERMUTATION (node
).safe_push (std::make_pair (0, i
));
2572 SLP_TREE_CODE (one
) = code0
;
2573 SLP_TREE_CODE (two
) = ocode
;
2574 SLP_TREE_LANES (one
) = stmts
.length ();
2575 SLP_TREE_LANES (two
) = stmts
.length ();
2576 SLP_TREE_REPRESENTATIVE (one
) = stmts
[0];
2577 SLP_TREE_REPRESENTATIVE (two
) = stmts
[j
];
2581 node
= vect_create_new_slp_node (node
, stmts
, nops
);
2582 SLP_TREE_VECTYPE (node
) = vectype
;
2583 SLP_TREE_CHILDREN (node
).splice (children
);
/* Dump a single SLP tree NODE.  */

static void
vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
		     slp_tree node)
{
  unsigned i, j;
  slp_tree child;
  stmt_vec_info stmt_info;
  tree op;

  dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
  dump_user_location_t user_loc = loc.get_user_location ();
  dump_printf_loc (metadata, user_loc,
		   "node%s %p (max_nunits=" HOST_WIDE_INT_PRINT_UNSIGNED
		   ", refcnt=%u)",
		   SLP_TREE_DEF_TYPE (node) == vect_external_def
		   ? " (external)"
		   : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
		      ? " (constant)"
		      : ""), (void *) node,
		   estimated_poly_value (node->max_nunits),
		   SLP_TREE_REF_COUNT (node));
  if (SLP_TREE_VECTYPE (node))
    dump_printf (metadata, " %T", SLP_TREE_VECTYPE (node));
  dump_printf (metadata, "\n");
  if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
    {
      if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
	dump_printf_loc (metadata, user_loc, "op: VEC_PERM_EXPR\n");
      else
	dump_printf_loc (metadata, user_loc, "op template: %G",
			 SLP_TREE_REPRESENTATIVE (node)->stmt);
    }
  if (SLP_TREE_SCALAR_STMTS (node).exists ())
    FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
      dump_printf_loc (metadata, user_loc, "\tstmt %u %G", i, stmt_info->stmt);
  else
    {
      dump_printf_loc (metadata, user_loc, "\t{ ");
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
	dump_printf (metadata, "%T%s ", op,
		     i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
      dump_printf (metadata, "}\n");
    }
  if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
    {
      dump_printf_loc (metadata, user_loc, "\tload permutation {");
      FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
	dump_printf (dump_kind, " %u", j);
      dump_printf (dump_kind, " }\n");
    }
  if (SLP_TREE_LANE_PERMUTATION (node).exists ())
    {
      dump_printf_loc (metadata, user_loc, "\tlane permutation {");
      for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
	dump_printf (dump_kind, " %u[%u]",
		     SLP_TREE_LANE_PERMUTATION (node)[i].first,
		     SLP_TREE_LANE_PERMUTATION (node)[i].second);
      dump_printf (dump_kind, " }\n");
    }
  if (SLP_TREE_CHILDREN (node).is_empty ())
    return;
  dump_printf_loc (metadata, user_loc, "\tchildren");
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    dump_printf (dump_kind, " %p", (void *)child);
  dump_printf (dump_kind, "\n");
}

DEBUG_FUNCTION void
debug (slp_tree node)
{
  debug_dump_context ctx;
  vect_print_slp_tree (MSG_NOTE,
		       dump_location_t::from_location_t (UNKNOWN_LOCATION),
		       node);
}
/* Recursive helper for the dot producer below.  */

static void
dot_slp_tree (FILE *f, slp_tree node, hash_set<slp_tree> &visited)
{
  if (visited.add (node))
    return;

  fprintf (f, "\"%p\" [label=\"", (void *)node);
  vect_print_slp_tree (MSG_NOTE,
		       dump_location_t::from_location_t (UNKNOWN_LOCATION),
		       node);
  fprintf (f, "\"];\n");

  for (slp_tree child : SLP_TREE_CHILDREN (node))
    fprintf (f, "\"%p\" -> \"%p\";", (void *)node, (void *)child);

  for (slp_tree child : SLP_TREE_CHILDREN (node))
    if (child)
      dot_slp_tree (f, child, visited);
}

DEBUG_FUNCTION void
dot_slp_tree (const char *fname, slp_tree node)
{
  FILE *f = fopen (fname, "w");
  fprintf (f, "digraph {\n");
  fflush (f);
    {
      debug_dump_context ctx (f);
      hash_set<slp_tree> visited;
      dot_slp_tree (f, node, visited);
    }
  fflush (f);
  fprintf (f, "}\n");
  fclose (f);
}
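/* Usage note: dot_slp_tree above is meant for interactive debugging; e.g.
   from gdb one might call
     dot_slp_tree ("/tmp/slp.dot", node)
   (the file name is just an example) and render the result with graphviz
   to inspect the SLP graph.  */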
/* Dump a slp tree NODE using flags specified in DUMP_KIND.  */

static void
vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
		      slp_tree node, hash_set<slp_tree> &visited)
{
  unsigned i;
  slp_tree child;

  if (visited.add (node))
    return;

  vect_print_slp_tree (dump_kind, loc, node);

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (child)
      vect_print_slp_graph (dump_kind, loc, child, visited);
}

static void
vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
		      slp_tree entry)
{
  hash_set<slp_tree> visited;
  vect_print_slp_graph (dump_kind, loc, entry, visited);
}
/* Mark the tree rooted at NODE with PURE_SLP.  */

static void
vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
{
  int i;
  stmt_vec_info stmt_info;
  slp_tree child;

  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return;

  if (visited.add (node))
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    STMT_SLP_TYPE (stmt_info) = pure_slp;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (child)
      vect_mark_slp_stmts (child, visited);
}

static void
vect_mark_slp_stmts (slp_tree node)
{
  hash_set<slp_tree> visited;
  vect_mark_slp_stmts (node, visited);
}
/* Mark the statements of the tree rooted at NODE as relevant (vect_used).  */

static void
vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
{
  int i;
  stmt_vec_info stmt_info;
  slp_tree child;

  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return;

  if (visited.add (node))
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    {
      gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
		  || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
      STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
    }

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (child)
      vect_mark_slp_stmts_relevant (child, visited);
}

static void
vect_mark_slp_stmts_relevant (slp_tree node)
{
  hash_set<slp_tree> visited;
  vect_mark_slp_stmts_relevant (node, visited);
}
/* Gather loads in the SLP graph NODE and populate the INST loads array.  */

static void
vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
		       hash_set<slp_tree> &visited)
{
  if (!node || visited.add (node))
    return;

  if (SLP_TREE_CHILDREN (node).length () == 0)
    {
      if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
	return;
      stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
	  && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
	loads.safe_push (node);
    }
  else
    {
      unsigned i;
      slp_tree child;
      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
	vect_gather_slp_loads (loads, child, visited);
    }
}
/* Find the last store in SLP INSTANCE.  */

stmt_vec_info
vect_find_last_scalar_stmt_in_slp (slp_tree node)
{
  stmt_vec_info last = NULL;
  stmt_vec_info stmt_vinfo;

  for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
    {
      stmt_vinfo = vect_orig_stmt (stmt_vinfo);
      last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
    }

  return last;
}
/* Find the first stmt in NODE.  */

stmt_vec_info
vect_find_first_scalar_stmt_in_slp (slp_tree node)
{
  stmt_vec_info first = NULL;
  stmt_vec_info stmt_vinfo;

  for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
    {
      stmt_vinfo = vect_orig_stmt (stmt_vinfo);
      if (!first
	  || get_later_stmt (stmt_vinfo, first) == first)
	first = stmt_vinfo;
    }

  return first;
}
/* Splits a group of stores, currently beginning at FIRST_VINFO, into
   two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
   (also containing the first GROUP1_SIZE stmts, since stores are
   consecutive), the second containing the remainder.
   Return the first stmt in the second group.  */

static stmt_vec_info
vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
{
  gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
  gcc_assert (group1_size > 0);
  int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
  gcc_assert (group2_size > 0);
  DR_GROUP_SIZE (first_vinfo) = group1_size;

  stmt_vec_info stmt_info = first_vinfo;
  for (unsigned i = group1_size; i > 1; i--)
    {
      stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
      gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
    }
  /* STMT is now the last element of the first group.  */
  stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
  DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;

  DR_GROUP_SIZE (group2) = group2_size;
  for (stmt_info = group2; stmt_info;
       stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
    {
      DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
      gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
    }

  /* For the second group, the DR_GROUP_GAP is that before the original group,
     plus skipping over the first vector.  */
  DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;

  /* DR_GROUP_GAP of the first group now has to skip over the second group too.  */
  DR_GROUP_GAP (first_vinfo) += group2_size;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
		     group1_size, group2_size);

  return group2;
}
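/* For example (illustration only): splitting a store group of size 7 with
   GROUP1_SIZE == 4 leaves the first group with DR_GROUP_SIZE 4 and its gap
   enlarged by the 3 stmts now owned by the second group, while the second
   group of size 3 gets a gap equal to the original group's gap plus 4 so
   that it skips over the first group.  */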
/* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
   statements and a vector of NUNITS elements.  */

static poly_uint64
calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
{
  return exact_div (common_multiple (nunits, group_size), group_size);
}
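/* For example, NUNITS == 4 and GROUP_SIZE == 6 give a common multiple of 12
   and thus an unrolling factor of 2; when GROUP_SIZE is already a multiple
   of NUNITS the factor is 1.  */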
/* Helper that checks to see if a node is a load node.  */

static bool
vect_is_slp_load_node (slp_tree root)
{
  return SLP_TREE_DEF_TYPE (root) == vect_internal_def
	 && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_REPRESENTATIVE (root))
	 && DR_IS_READ (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (root)));
}
2927 /* Helper function of optimize_load_redistribution that performs the operation
2931 optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t
*bst_map
,
2932 vec_info
*vinfo
, unsigned int group_size
,
2933 hash_map
<slp_tree
, slp_tree
> *load_map
,
2936 if (slp_tree
*leader
= load_map
->get (root
))
2942 /* For now, we don't know anything about externals so do not do anything. */
2943 if (!root
|| SLP_TREE_DEF_TYPE (root
) != vect_internal_def
)
2945 else if (SLP_TREE_CODE (root
) == VEC_PERM_EXPR
)
2947 /* First convert this node into a load node and add it to the leaves
2948 list and flatten the permute from a lane to a load one. If it's
2949 unneeded it will be elided later. */
2950 vec
<stmt_vec_info
> stmts
;
2951 stmts
.create (SLP_TREE_LANES (root
));
2952 lane_permutation_t lane_perm
= SLP_TREE_LANE_PERMUTATION (root
);
2953 for (unsigned j
= 0; j
< lane_perm
.length (); j
++)
2955 std::pair
<unsigned, unsigned> perm
= lane_perm
[j
];
2956 node
= SLP_TREE_CHILDREN (root
)[perm
.first
];
2958 if (!vect_is_slp_load_node (node
)
2959 || SLP_TREE_CHILDREN (node
).exists ())
2965 stmts
.quick_push (SLP_TREE_SCALAR_STMTS (node
)[perm
.second
]);
2968 if (dump_enabled_p ())
2969 dump_printf_loc (MSG_NOTE
, vect_location
,
2970 "converting stmts on permute node %p\n",
2973 bool *matches
= XALLOCAVEC (bool, group_size
);
2974 poly_uint64 max_nunits
= 1;
2975 unsigned tree_size
= 0, limit
= 1;
2976 node
= vect_build_slp_tree (vinfo
, stmts
, group_size
, &max_nunits
,
2977 matches
, &limit
, &tree_size
, bst_map
);
2981 load_map
->put (root
, node
);
2986 load_map
->put (root
, NULL
);
2988 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root
), i
, node
)
2991 = optimize_load_redistribution_1 (bst_map
, vinfo
, group_size
, load_map
,
2995 SLP_TREE_REF_COUNT (value
)++;
2996 SLP_TREE_CHILDREN (root
)[i
] = value
;
2997 /* ??? We know the original leafs of the replaced nodes will
2998 be referenced by bst_map, only the permutes created by
2999 pattern matching are not. */
3000 if (SLP_TREE_REF_COUNT (node
) == 1)
3001 load_map
->remove (node
);
3002 vect_free_slp_tree (node
);
3009 /* Temporary workaround for loads not being CSEd during SLP build. This
3010 function will traverse the SLP tree rooted in ROOT for INSTANCE and find
3011 VEC_PERM nodes that blend vectors from multiple nodes that all read from the
3012 same DR such that the final operation is equal to a permuted load. Such
3013 NODES are then directly converted into LOADS themselves. The nodes are
3014 CSEd using BST_MAP. */
3017 optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t
*bst_map
,
3018 vec_info
*vinfo
, unsigned int group_size
,
3019 hash_map
<slp_tree
, slp_tree
> *load_map
,
3025 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root
), i
, node
)
3028 = optimize_load_redistribution_1 (bst_map
, vinfo
, group_size
, load_map
,
3032 SLP_TREE_REF_COUNT (value
)++;
3033 SLP_TREE_CHILDREN (root
)[i
] = value
;
3034 /* ??? We know the original leafs of the replaced nodes will
3035 be referenced by bst_map, only the permutes created by
3036 pattern matching are not. */
3037 if (SLP_TREE_REF_COUNT (node
) == 1)
3038 load_map
->remove (node
);
3039 vect_free_slp_tree (node
);
/* Helper function of vect_match_slp_patterns.

   Attempts to match patterns against the slp tree rooted in REF_NODE using
   VINFO.  Patterns are matched in post-order traversal.

   If matching is successful the value in REF_NODE is updated and returned, if
   not then it is returned unchanged.  */

static bool
vect_match_slp_patterns_2 (slp_tree *ref_node, vec_info *vinfo,
			   slp_tree_to_load_perm_map_t *perm_cache,
			   slp_compat_nodes_map_t *compat_cache,
			   hash_set<slp_tree> *visited)
{
  unsigned i;
  slp_tree node = *ref_node;
  bool found_p = false;
  if (!node || visited->add (node))
    return false;

  slp_tree child;
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    found_p |= vect_match_slp_patterns_2 (&SLP_TREE_CHILDREN (node)[i],
					  vinfo, perm_cache, compat_cache,
					  visited);

  for (unsigned x = 0; x < num__slp_patterns; x++)
    {
      vect_pattern *pattern
	= slp_patterns[x] (perm_cache, compat_cache, ref_node);
      if (pattern)
	{
	  pattern->build (vinfo);
	  delete pattern;
	  found_p = true;
	}
    }

  return found_p;
}

/* Applies pattern matching to the given SLP tree rooted in REF_NODE using
   vec_info VINFO.

   The modified tree is returned.  Patterns are tried in order and multiple
   patterns may match.  */

static bool
vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
			 hash_set<slp_tree> *visited,
			 slp_tree_to_load_perm_map_t *perm_cache,
			 slp_compat_nodes_map_t *compat_cache)
{
  DUMP_VECT_SCOPE ("vect_match_slp_patterns");
  slp_tree *ref_node = &SLP_INSTANCE_TREE (instance);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "Analyzing SLP tree %p for patterns\n",
		     (void *) SLP_INSTANCE_TREE (instance));

  return vect_match_slp_patterns_2 (ref_node, vinfo, perm_cache, compat_cache,
				    visited);
}
/* STMT_INFO is a store group of size GROUP_SIZE that we are considering
   splitting into two, with the first split group having size NEW_GROUP_SIZE.
   Return true if we could use IFN_STORE_LANES instead and if that appears
   to be the better approach.  */

static bool
vect_slp_prefer_store_lanes_p (vec_info *vinfo, stmt_vec_info stmt_info,
			       unsigned int group_size,
			       unsigned int new_group_size)
{
  tree scalar_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
  tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
  if (!vectype)
    return false;

  /* Allow the split if one of the two new groups would operate on full
     vectors *within* rather than across one scalar loop iteration.
     This is purely a heuristic, but it should work well for group
     sizes of 3 and 4, where the possible splits are:

       3->2+1:  OK if the vector has exactly two elements
       4->2+2:  Likewise
       4->3+1:  Less clear-cut.  */
  if (multiple_p (group_size - new_group_size, TYPE_VECTOR_SUBPARTS (vectype))
      || multiple_p (new_group_size, TYPE_VECTOR_SUBPARTS (vectype)))
    return false;

  return vect_store_lanes_supported (vectype, group_size, false) != IFN_LAST;
}
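/* For example, with a two-element vector type a 3->2+1 split has
   NEW_GROUP_SIZE == 2, a multiple of the vector size, so the split is
   allowed and store-lanes is not preferred; a 4->3+1 split of the same type
   satisfies neither multiple_p test and the answer then depends on whether
   IFN_STORE_LANES is supported for the whole group.  */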
/* Analyze an SLP instance starting from a group of grouped stores.  Call
   vect_build_slp_tree to build a tree of packed stmts if possible.
   Return FALSE if it's impossible to SLP any stmt in the loop.  */

static bool
vect_analyze_slp_instance (vec_info *vinfo,
			   scalar_stmts_to_slp_tree_map_t *bst_map,
			   stmt_vec_info stmt_info, slp_instance_kind kind,
			   unsigned max_tree_size, unsigned *limit);

/* Analyze an SLP instance starting from SCALAR_STMTS which are a group
   of KIND.  Return true if successful.  */

static bool
vect_build_slp_instance (vec_info *vinfo,
			 slp_instance_kind kind,
			 vec<stmt_vec_info> &scalar_stmts,
			 vec<stmt_vec_info> &root_stmt_infos,
			 vec<tree> &remain,
			 unsigned max_tree_size, unsigned *limit,
			 scalar_stmts_to_slp_tree_map_t *bst_map,
			 /* ???  We need stmt_info for group splitting.  */
			 stmt_vec_info stmt_info_)
{
3161 if (kind
== slp_inst_kind_ctor
)
3163 if (dump_enabled_p ())
3164 dump_printf_loc (MSG_NOTE
, vect_location
,
3165 "Analyzing vectorizable constructor: %G\n",
3166 root_stmt_infos
[0]->stmt
);
3169 if (dump_enabled_p ())
3171 dump_printf_loc (MSG_NOTE
, vect_location
,
3172 "Starting SLP discovery for\n");
3173 for (unsigned i
= 0; i
< scalar_stmts
.length (); ++i
)
3174 dump_printf_loc (MSG_NOTE
, vect_location
,
3175 " %G", scalar_stmts
[i
]->stmt
);
3178 /* When a BB reduction doesn't have an even number of lanes
3179 strip it down, treating the remaining lane as scalar.
3180 ??? Selecting the optimal set of lanes to vectorize would be nice
3181 but SLP build for all lanes will fail quickly because we think
3182 we're going to need unrolling. */
3183 if (kind
== slp_inst_kind_bb_reduc
3184 && (scalar_stmts
.length () & 1))
3185 remain
.safe_insert (0, gimple_get_lhs (scalar_stmts
.pop ()->stmt
));
3187 /* Build the tree for the SLP instance. */
3188 unsigned int group_size
= scalar_stmts
.length ();
3189 bool *matches
= XALLOCAVEC (bool, group_size
);
3190 poly_uint64 max_nunits
= 1;
3191 unsigned tree_size
= 0;
3193 slp_tree node
= vect_build_slp_tree (vinfo
, scalar_stmts
, group_size
,
3194 &max_nunits
, matches
, limit
,
3195 &tree_size
, bst_map
);
3198 /* Calculate the unrolling factor based on the smallest type. */
3199 poly_uint64 unrolling_factor
3200 = calculate_unrolling_factor (max_nunits
, group_size
);
3202 if (maybe_ne (unrolling_factor
, 1U)
3203 && is_a
<bb_vec_info
> (vinfo
))
3205 unsigned HOST_WIDE_INT const_max_nunits
;
3206 if (!max_nunits
.is_constant (&const_max_nunits
)
3207 || const_max_nunits
> group_size
)
3209 if (dump_enabled_p ())
3210 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3211 "Build SLP failed: store group "
3212 "size not a multiple of the vector size "
3213 "in basic block SLP\n");
3214 vect_free_slp_tree (node
);
3217 /* Fatal mismatch. */
3218 if (dump_enabled_p ())
3219 dump_printf_loc (MSG_NOTE
, vect_location
,
3220 "SLP discovery succeeded but node needs "
3222 memset (matches
, true, group_size
);
3223 matches
[group_size
/ const_max_nunits
* const_max_nunits
] = false;
3224 vect_free_slp_tree (node
);
3228 /* Create a new SLP instance. */
3229 slp_instance new_instance
= XNEW (class _slp_instance
);
3230 SLP_INSTANCE_TREE (new_instance
) = node
;
3231 SLP_INSTANCE_UNROLLING_FACTOR (new_instance
) = unrolling_factor
;
3232 SLP_INSTANCE_LOADS (new_instance
) = vNULL
;
3233 SLP_INSTANCE_ROOT_STMTS (new_instance
) = root_stmt_infos
;
3234 SLP_INSTANCE_REMAIN_DEFS (new_instance
) = remain
;
3235 SLP_INSTANCE_KIND (new_instance
) = kind
;
3236 new_instance
->reduc_phis
= NULL
;
3237 new_instance
->cost_vec
= vNULL
;
3238 new_instance
->subgraph_entries
= vNULL
;
3240 if (dump_enabled_p ())
3241 dump_printf_loc (MSG_NOTE
, vect_location
,
3242 "SLP size %u vs. limit %u.\n",
3243 tree_size
, max_tree_size
);
3245 /* Fixup SLP reduction chains. */
3246 if (kind
== slp_inst_kind_reduc_chain
)
3248 /* If this is a reduction chain with a conversion in front
3249 amend the SLP tree with a node for that. */
3251 = vect_orig_stmt (scalar_stmts
[group_size
- 1])->stmt
;
3252 if (STMT_VINFO_DEF_TYPE (scalar_stmts
[0]) != vect_reduction_def
)
3254 /* Get at the conversion stmt - we know it's the single use
3255 of the last stmt of the reduction chain. */
3256 use_operand_p use_p
;
3257 bool r
= single_imm_use (gimple_assign_lhs (scalar_def
),
3258 &use_p
, &scalar_def
);
3260 stmt_vec_info next_info
= vinfo
->lookup_stmt (scalar_def
);
3261 next_info
= vect_stmt_to_vectorize (next_info
);
3262 scalar_stmts
= vNULL
;
3263 scalar_stmts
.create (group_size
);
3264 for (unsigned i
= 0; i
< group_size
; ++i
)
3265 scalar_stmts
.quick_push (next_info
);
3266 slp_tree conv
= vect_create_new_slp_node (scalar_stmts
, 1);
3267 SLP_TREE_VECTYPE (conv
) = STMT_VINFO_VECTYPE (next_info
);
3268 SLP_TREE_CHILDREN (conv
).quick_push (node
);
3269 SLP_INSTANCE_TREE (new_instance
) = conv
;
3270 /* We also have to fake this conversion stmt as SLP reduction
3271 group so we don't have to mess with too much code
3273 REDUC_GROUP_FIRST_ELEMENT (next_info
) = next_info
;
3274 REDUC_GROUP_NEXT_ELEMENT (next_info
) = NULL
;
3276 /* Fill the backedge child of the PHI SLP node. The
3277 general matching code cannot find it because the
3278 scalar code does not reflect how we vectorize the
3280 use_operand_p use_p
;
3281 imm_use_iterator imm_iter
;
3282 class loop
*loop
= LOOP_VINFO_LOOP (as_a
<loop_vec_info
> (vinfo
));
3283 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
,
3284 gimple_get_lhs (scalar_def
))
3285 /* There are exactly two non-debug uses, the reduction
3286 PHI and the loop-closed PHI node. */
3287 if (!is_gimple_debug (USE_STMT (use_p
))
3288 && gimple_bb (USE_STMT (use_p
)) == loop
->header
)
3290 auto_vec
<stmt_vec_info
, 64> phis (group_size
);
3291 stmt_vec_info phi_info
3292 = vinfo
->lookup_stmt (USE_STMT (use_p
));
3293 for (unsigned i
= 0; i
< group_size
; ++i
)
3294 phis
.quick_push (phi_info
);
3295 slp_tree
*phi_node
= bst_map
->get (phis
);
3296 unsigned dest_idx
= loop_latch_edge (loop
)->dest_idx
;
3297 SLP_TREE_CHILDREN (*phi_node
)[dest_idx
]
3298 = SLP_INSTANCE_TREE (new_instance
);
3299 SLP_INSTANCE_TREE (new_instance
)->refcnt
++;
3303 vinfo
->slp_instances
.safe_push (new_instance
);
3305 /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
3306 the number of scalar stmts in the root in a few places.
3307 Verify that assumption holds. */
3308 gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance
))
3309 .length () == group_size
);
3311 if (dump_enabled_p ())
3313 dump_printf_loc (MSG_NOTE
, vect_location
,
3314 "Final SLP tree for instance %p:\n",
3315 (void *) new_instance
);
3316 vect_print_slp_graph (MSG_NOTE
, vect_location
,
3317 SLP_INSTANCE_TREE (new_instance
));
3325 /* Failed to SLP. */
3326 /* Free the allocated memory. */
3327 scalar_stmts
.release ();
3330 stmt_vec_info stmt_info
= stmt_info_
;
3331 /* Try to break the group up into pieces. */
3332 if (kind
== slp_inst_kind_store
)
3334 /* ??? We could delay all the actual splitting of store-groups
3335 until after SLP discovery of the original group completed.
3336 Then we can recurse to vect_build_slp_instance directly. */
3337 for (i
= 0; i
< group_size
; i
++)
3341 /* For basic block SLP, try to break the group up into multiples of
3343 if (is_a
<bb_vec_info
> (vinfo
)
3344 && (i
> 1 && i
< group_size
))
3347 = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info
)));
3348 tree vectype
= get_vectype_for_scalar_type (vinfo
, scalar_type
,
3349 1 << floor_log2 (i
));
3350 unsigned HOST_WIDE_INT const_nunits
;
3352 && TYPE_VECTOR_SUBPARTS (vectype
).is_constant (&const_nunits
))
3354 /* Split into two groups at the first vector boundary. */
3355 gcc_assert ((const_nunits
& (const_nunits
- 1)) == 0);
3356 unsigned group1_size
= i
& ~(const_nunits
- 1);
3358 if (dump_enabled_p ())
3359 dump_printf_loc (MSG_NOTE
, vect_location
,
3360 "Splitting SLP group at stmt %u\n", i
);
3361 stmt_vec_info rest
= vect_split_slp_store_group (stmt_info
,
3363 bool res
= vect_analyze_slp_instance (vinfo
, bst_map
, stmt_info
,
3364 kind
, max_tree_size
,
3366 /* Split the rest at the failure point and possibly
3367 re-analyze the remaining matching part if it has
3368 at least two lanes. */
3370 && (i
+ 1 < group_size
3371 || i
- group1_size
> 1))
3373 stmt_vec_info rest2
= rest
;
3374 rest
= vect_split_slp_store_group (rest
, i
- group1_size
);
3375 if (i
- group1_size
> 1)
3376 res
|= vect_analyze_slp_instance (vinfo
, bst_map
, rest2
,
3377 kind
, max_tree_size
,
3380 /* Re-analyze the non-matching tail if it has at least
3382 if (i
+ 1 < group_size
)
3383 res
|= vect_analyze_slp_instance (vinfo
, bst_map
,
3384 rest
, kind
, max_tree_size
,
3390 /* For loop vectorization split into arbitrary pieces of size > 1. */
3391 if (is_a
<loop_vec_info
> (vinfo
)
3392 && (i
> 1 && i
< group_size
)
3393 && !vect_slp_prefer_store_lanes_p (vinfo
, stmt_info
, group_size
, i
))
3395 unsigned group1_size
= i
;
3397 if (dump_enabled_p ())
3398 dump_printf_loc (MSG_NOTE
, vect_location
,
3399 "Splitting SLP group at stmt %u\n", i
);
3401 stmt_vec_info rest
= vect_split_slp_store_group (stmt_info
,
3403 /* Loop vectorization cannot handle gaps in stores, make sure
3404 the split group appears as strided. */
3405 STMT_VINFO_STRIDED_P (rest
) = 1;
3406 DR_GROUP_GAP (rest
) = 0;
3407 STMT_VINFO_STRIDED_P (stmt_info
) = 1;
3408 DR_GROUP_GAP (stmt_info
) = 0;
3410 bool res
= vect_analyze_slp_instance (vinfo
, bst_map
, stmt_info
,
3411 kind
, max_tree_size
, limit
);
3412 if (i
+ 1 < group_size
)
3413 res
|= vect_analyze_slp_instance (vinfo
, bst_map
,
3414 rest
, kind
, max_tree_size
, limit
);
3419 /* Even though the first vector did not all match, we might be able to SLP
3420 (some) of the remainder. FORNOW ignore this possibility. */
3423 /* Failed to SLP. */
3424 if (dump_enabled_p ())
3425 dump_printf_loc (MSG_NOTE
, vect_location
, "SLP discovery failed\n");
3430 /* Analyze an SLP instance starting from a group of grouped stores. Call
3431 vect_build_slp_tree to build a tree of packed stmts if possible.
3432 Return FALSE if it's impossible to SLP any stmt in the loop. */
3435 vect_analyze_slp_instance (vec_info
*vinfo
,
3436 scalar_stmts_to_slp_tree_map_t
*bst_map
,
3437 stmt_vec_info stmt_info
,
3438 slp_instance_kind kind
,
3439 unsigned max_tree_size
, unsigned *limit
)
3442 vec
<stmt_vec_info
> scalar_stmts
;
3444 if (is_a
<bb_vec_info
> (vinfo
))
3445 vect_location
= stmt_info
->stmt
;
3447 stmt_vec_info next_info
= stmt_info
;
3448 if (kind
== slp_inst_kind_store
)
3450 /* Collect the stores and store them in scalar_stmts. */
3451 scalar_stmts
.create (DR_GROUP_SIZE (stmt_info
));
3454 scalar_stmts
.quick_push (vect_stmt_to_vectorize (next_info
));
3455 next_info
= DR_GROUP_NEXT_ELEMENT (next_info
);
3458 else if (kind
== slp_inst_kind_reduc_chain
)
3460 /* Collect the reduction stmts and store them in scalar_stmts. */
3461 scalar_stmts
.create (REDUC_GROUP_SIZE (stmt_info
));
3464 scalar_stmts
.quick_push (vect_stmt_to_vectorize (next_info
));
3465 next_info
= REDUC_GROUP_NEXT_ELEMENT (next_info
);
3467 /* Mark the first element of the reduction chain as reduction to properly
3468 transform the node. In the reduction analysis phase only the last
3469 element of the chain is marked as reduction. */
3470 STMT_VINFO_DEF_TYPE (stmt_info
)
3471 = STMT_VINFO_DEF_TYPE (scalar_stmts
.last ());
3472 STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info
))
3473 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts
.last ()));
3475 else if (kind
== slp_inst_kind_reduc_group
)
3477 /* Collect reduction statements. */
3478 const vec
<stmt_vec_info
> &reductions
3479 = as_a
<loop_vec_info
> (vinfo
)->reductions
;
3480 scalar_stmts
.create (reductions
.length ());
3481 for (i
= 0; reductions
.iterate (i
, &next_info
); i
++)
3482 if ((STMT_VINFO_RELEVANT_P (next_info
)
3483 || STMT_VINFO_LIVE_P (next_info
))
3484 /* ??? Make sure we didn't skip a conversion around a reduction
3485 path. In that case we'd have to reverse engineer that conversion
3486 stmt following the chain using reduc_idx and from the PHI
3488 && STMT_VINFO_DEF_TYPE (next_info
) == vect_reduction_def
)
3489 scalar_stmts
.quick_push (next_info
);
3490 /* If less than two were relevant/live there's nothing to SLP. */
3491 if (scalar_stmts
.length () < 2)
3497 vec
<stmt_vec_info
> roots
= vNULL
;
3498 vec
<tree
> remain
= vNULL
;
3499 /* Build the tree for the SLP instance. */
3500 bool res
= vect_build_slp_instance (vinfo
, kind
, scalar_stmts
,
3502 max_tree_size
, limit
, bst_map
,
3503 kind
== slp_inst_kind_store
3504 ? stmt_info
: NULL
);
3506 /* ??? If this is slp_inst_kind_store and the above succeeded here's
3507 where we should do store group splitting. */
3512 /* Check if there are stmts in the loop can be vectorized using SLP. Build SLP
3513 trees of packed scalar stmts if SLP is possible. */
3516 vect_analyze_slp (vec_info
*vinfo
, unsigned max_tree_size
)
3519 stmt_vec_info first_element
;
3520 slp_instance instance
;
3522 DUMP_VECT_SCOPE ("vect_analyze_slp");
3524 unsigned limit
= max_tree_size
;
3526 scalar_stmts_to_slp_tree_map_t
*bst_map
3527 = new scalar_stmts_to_slp_tree_map_t ();
3529 /* Find SLP sequences starting from groups of grouped stores. */
3530 FOR_EACH_VEC_ELT (vinfo
->grouped_stores
, i
, first_element
)
3531 vect_analyze_slp_instance (vinfo
, bst_map
, first_element
,
3532 slp_inst_kind_store
, max_tree_size
, &limit
);
3534 if (bb_vec_info bb_vinfo
= dyn_cast
<bb_vec_info
> (vinfo
))
3536 for (unsigned i
= 0; i
< bb_vinfo
->roots
.length (); ++i
)
3538 vect_location
= bb_vinfo
->roots
[i
].roots
[0]->stmt
;
3539 if (vect_build_slp_instance (bb_vinfo
, bb_vinfo
->roots
[i
].kind
,
3540 bb_vinfo
->roots
[i
].stmts
,
3541 bb_vinfo
->roots
[i
].roots
,
3542 bb_vinfo
->roots
[i
].remain
,
3543 max_tree_size
, &limit
, bst_map
, NULL
))
3545 bb_vinfo
->roots
[i
].stmts
= vNULL
;
3546 bb_vinfo
->roots
[i
].roots
= vNULL
;
3547 bb_vinfo
->roots
[i
].remain
= vNULL
;
3552 if (loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
))
3554 /* Find SLP sequences starting from reduction chains. */
3555 FOR_EACH_VEC_ELT (loop_vinfo
->reduction_chains
, i
, first_element
)
3556 if (! STMT_VINFO_RELEVANT_P (first_element
)
3557 && ! STMT_VINFO_LIVE_P (first_element
))
3559 else if (! vect_analyze_slp_instance (vinfo
, bst_map
, first_element
,
3560 slp_inst_kind_reduc_chain
,
3561 max_tree_size
, &limit
))
3563 /* Dissolve reduction chain group. */
3564 stmt_vec_info vinfo
= first_element
;
3565 stmt_vec_info last
= NULL
;
3568 stmt_vec_info next
= REDUC_GROUP_NEXT_ELEMENT (vinfo
);
3569 REDUC_GROUP_FIRST_ELEMENT (vinfo
) = NULL
;
3570 REDUC_GROUP_NEXT_ELEMENT (vinfo
) = NULL
;
3574 STMT_VINFO_DEF_TYPE (first_element
) = vect_internal_def
;
3575 /* It can be still vectorized as part of an SLP reduction. */
3576 loop_vinfo
->reductions
.safe_push (last
);
3579 /* Find SLP sequences starting from groups of reductions. */
3580 if (loop_vinfo
->reductions
.length () > 1)
3581 vect_analyze_slp_instance (vinfo
, bst_map
, loop_vinfo
->reductions
[0],
3582 slp_inst_kind_reduc_group
, max_tree_size
,
3586 hash_set
<slp_tree
> visited_patterns
;
3587 slp_tree_to_load_perm_map_t perm_cache
;
3588 slp_compat_nodes_map_t compat_cache
;
3590 /* See if any patterns can be found in the SLP tree. */
3591 bool pattern_found
= false;
3592 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo
), i
, instance
)
3593 pattern_found
|= vect_match_slp_patterns (instance
, vinfo
,
3594 &visited_patterns
, &perm_cache
,
3597 /* If any were found optimize permutations of loads. */
3600 hash_map
<slp_tree
, slp_tree
> load_map
;
3601 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo
), i
, instance
)
3603 slp_tree root
= SLP_INSTANCE_TREE (instance
);
3604 optimize_load_redistribution (bst_map
, vinfo
, SLP_TREE_LANES (root
),
3611 /* The map keeps a reference on SLP nodes built, release that. */
3612 for (scalar_stmts_to_slp_tree_map_t::iterator it
= bst_map
->begin ();
3613 it
!= bst_map
->end (); ++it
)
3615 vect_free_slp_tree ((*it
).second
);
3618 if (pattern_found
&& dump_enabled_p ())
3620 dump_printf_loc (MSG_NOTE
, vect_location
,
3621 "Pattern matched SLP tree\n");
3622 hash_set
<slp_tree
> visited
;
3623 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo
), i
, instance
)
3624 vect_print_slp_graph (MSG_NOTE
, vect_location
,
3625 SLP_INSTANCE_TREE (instance
), visited
);
3628 return opt_result::success ();
/* Estimates the cost of inserting layout changes into the SLP graph.
   It can also say that the insertion is impossible.  */

struct slpg_layout_cost
{
  slpg_layout_cost () = default;
  slpg_layout_cost (sreal, bool);

  static slpg_layout_cost impossible () { return { sreal::max (), 0 }; }
  bool is_possible () const { return depth != sreal::max (); }

  bool operator== (const slpg_layout_cost &) const;
  bool operator!= (const slpg_layout_cost &) const;

  bool is_better_than (const slpg_layout_cost &, bool) const;

  void add_parallel_cost (const slpg_layout_cost &);
  void add_serial_cost (const slpg_layout_cost &);
  void split (unsigned int);

  /* The longest sequence of layout changes needed during any traversal
     of the partition dag, weighted by execution frequency.

     This is the most important metric when optimizing for speed, since
     it helps to ensure that we keep the number of operations on
     critical paths to a minimum.  */
  sreal depth = 0;

  /* An estimate of the total number of operations needed.  It is weighted by
     execution frequency when optimizing for speed but not when optimizing for
     size.  In order to avoid double-counting, a node with a fanout of N will
     distribute 1/N of its total cost to each successor.

     This is the most important metric when optimizing for size, since
     it helps to keep the total number of operations to a minimum.  */
  sreal total = 0;
};

/* Construct costs for a node with weight WEIGHT.  A higher weight
   indicates more frequent execution.  IS_FOR_SIZE is true if we are
   optimizing for size rather than speed.  */

slpg_layout_cost::slpg_layout_cost (sreal weight, bool is_for_size)
  : depth (weight), total (is_for_size && weight > 0 ? 1 : weight)
{
}

bool
slpg_layout_cost::operator== (const slpg_layout_cost &other) const
{
  return depth == other.depth && total == other.total;
}

bool
slpg_layout_cost::operator!= (const slpg_layout_cost &other) const
{
  return !operator== (other);
}

/* Return true if these costs are better than OTHER.  IS_FOR_SIZE is
   true if we are optimizing for size rather than speed.  */

bool
slpg_layout_cost::is_better_than (const slpg_layout_cost &other,
				  bool is_for_size) const
{
  if (is_for_size)
    {
      if (total != other.total)
	return total < other.total;
      return depth < other.depth;
    }
  else
    {
      if (depth != other.depth)
	return depth < other.depth;
      return total < other.total;
    }
}

/* Increase the costs to account for something with cost INPUT_COST
   happening in parallel with the current costs.  */

void
slpg_layout_cost::add_parallel_cost (const slpg_layout_cost &input_cost)
{
  depth = std::max (depth, input_cost.depth);
  total += input_cost.total;
}

/* Increase the costs to account for something with cost INPUT_COST
   happening in series with the current costs.  */

void
slpg_layout_cost::add_serial_cost (const slpg_layout_cost &other)
{
  depth += other.depth;
  total += other.total;
}

/* Split the total cost among TIMES successors or predecessors.  */

void
slpg_layout_cost::split (unsigned int times)
{
  if (times > 1)
    total /= times;
}
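/* For example, combining {depth 2, total 3} with {depth 1, total 1} in
   parallel gives {depth 2, total 4}, while combining them in series gives
   {depth 3, total 4}; a subsequent split (2) halves the total so that a
   node feeding two consumers is not double-counted.  */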
3740 /* Information about one node in the SLP graph, for use during
3741 vect_optimize_slp_pass. */
3745 slpg_vertex (slp_tree node_
) : node (node_
) {}
3747 /* The node itself. */
3750 /* Which partition the node belongs to, or -1 if none. Nodes outside of
3751 partitions are flexible; they can have whichever layout consumers
3752 want them to have. */
3755 /* The number of nodes that directly use the result of this one
3756 (i.e. the number of nodes that count this one as a child). */
3757 unsigned int out_degree
= 0;
3759 /* The execution frequency of the node. */
3762 /* The total execution frequency of all nodes that directly use the
3763 result of this one. */
3764 sreal out_weight
= 0;
3767 /* Information about one partition of the SLP graph, for use during
3768 vect_optimize_slp_pass. */
3770 struct slpg_partition_info
3772 /* The nodes in the partition occupy indices [NODE_BEGIN, NODE_END)
3773 of m_partitioned_nodes. */
3774 unsigned int node_begin
= 0;
3775 unsigned int node_end
= 0;
3777 /* Which layout we've chosen to use for this partition, or -1 if
3778 we haven't picked one yet. */
3781 /* The number of predecessors and successors in the partition dag.
3782 The predecessors always have lower partition numbers and the
3783 successors always have higher partition numbers.
3785 Note that the directions of these edges are not necessarily the
3786 same as in the data flow graph. For example, if an SCC has separate
3787 partitions for an inner loop and an outer loop, the inner loop's
3788 partition will have at least two incoming edges from the outer loop's
3789 partition: one for a live-in value and one for a live-out value.
3790 In data flow terms, one of these edges would also be from the outer loop
3791 to the inner loop, but the other would be in the opposite direction. */
3792 unsigned int in_degree
= 0;
3793 unsigned int out_degree
= 0;
3796 /* Information about the costs of using a particular layout for a
3797 particular partition. It can also say that the combination is
3800 struct slpg_partition_layout_costs
3802 bool is_possible () const { return internal_cost
.is_possible (); }
3803 void mark_impossible () { internal_cost
= slpg_layout_cost::impossible (); }
3805 /* The costs inherited from predecessor partitions. */
3806 slpg_layout_cost in_cost
;
3808 /* The inherent cost of the layout within the node itself. For example,
3809 this is nonzero for a load if choosing a particular layout would require
3810 the load to permute the loaded elements. It is nonzero for a
3811 VEC_PERM_EXPR if the permutation cannot be eliminated or converted
3812 to full-vector moves. */
3813 slpg_layout_cost internal_cost
;
3815 /* The costs inherited from successor partitions. */
3816 slpg_layout_cost out_cost
;
/* This class tries to optimize the layout of vectors in order to avoid
   unnecessary shuffling.  At the moment, the set of possible layouts is
   restricted to bijective permutations.

   The goal of the pass depends on whether we're optimizing for size or
   for speed.  When optimizing for size, the goal is to reduce the overall
   number of layout changes (including layout changes implied by things
   like load permutations).  When optimizing for speed, the goal is to
   reduce the maximum latency attributable to layout changes on any
   non-cyclical path through the data flow graph.

   For example, when optimizing a loop nest for speed, we will prefer
   to make layout changes outside of a loop rather than inside of a loop,
   and will prefer to make layout changes in parallel rather than serially,
   even if that increases the overall number of layout changes.

   The high-level procedure is:

   (1) Build a graph in which edges go from uses (parents) to definitions
       (children).

   (2) Divide the graph into a dag of strongly-connected components (SCCs).

   (3) When optimizing for speed, partition the nodes in each SCC based
       on their containing cfg loop.  When optimizing for size, treat
       each SCC as a single partition.

       This gives us a dag of partitions.  The goal is now to assign a
       layout to each partition.

   (4) Construct a set of vector layouts that are worth considering.
       Record which nodes must keep their current layout.

   (5) Perform a forward walk over the partition dag (from loads to stores)
       accumulating the "forward" cost of using each layout.  When visiting
       each partition, assign a tentative choice of layout to the partition
       and use that choice when calculating the cost of using a different
       layout in successor partitions.

   (6) Perform a backward walk over the partition dag (from stores to loads),
       accumulating the "backward" cost of using each layout.  When visiting
       each partition, make a final choice of layout for that partition based
       on the accumulated forward costs (from (5)) and backward costs
       (from (6)).

   (7) Apply the chosen layouts to the SLP graph.

   For example, consider the SLP statements:

   S2: a_2 = PHI<a_1, a_3>

   S2 and S4 form an SCC and are part of the same loop.  Every other
   statement is in a singleton SCC.  In this example there is a one-to-one
   mapping between SCCs and partitions and the partition dag looks like this:

   S2, S3 and S4 will have a higher execution frequency than the other
   statements, so when optimizing for speed, the goal is to avoid any
   layout changes:

   - on the S3->S2+S4 edge

   For example, if S3 was originally a reversing load, the goal of the
   pass is to make it an unreversed load and change the layout on the
   S1->S2+S4 and S2+S4->S5 edges to compensate.  (Changing the layout
   on S1->S2+S4 and S5->S6 would also be acceptable.)

   The difference between SCCs and partitions becomes important if we
   add an outer loop:

   S2: a_2 = PHI<a_1, a_6>
   S5: a_4 = PHI<a_3, a_5>

   Here, S2, S4, S5, S7 and S8 form a single SCC.  However, when optimizing
   for speed, we usually do not want restrictions in the outer loop to "infect"
   the decision for the inner loop.  For example, if an outer-loop node
   in the SCC contains a statement with a fixed layout, that should not
   prevent the inner loop from using a different layout.  Conversely,
   the inner loop should not dictate a layout to the outer loop: if the
   outer loop does a lot of computation, then it may not be efficient to
   do all of that computation in the inner loop's preferred layout.

   So when optimizing for speed, we partition the SCC into S2+S4+S8 (outer)
   and S5+S7 (inner).  We also try to arrange partitions so that:

   - the partition for an outer loop comes before the partitions for
     its inner loops

   - if a sibling loop A dominates a sibling loop B, A's partition
     comes before B's

   This gives the following partition dag for the example above:

   There are two edges from S2+S4+S8 to S5+S7: one for the edge S4->S5 and
   one for a reversal of the edge S7->S8.

   The backward walk picks a layout for S5+S7 before S2+S4+S8.  The choice
   for S2+S4+S8 therefore has to balance the cost of using the outer loop's
   preferred layout against the cost of changing the layout on entry to the
   inner loop (S4->S5) and on exit from the inner loop (S7->S8 reversed).

   Although this works well when optimizing for speed, it has the downside
   when optimizing for size that the choice of layout for S5+S7 is completely
   independent of S9, which lessens the chance of reducing the overall number
   of permutations.  We therefore do not partition SCCs when optimizing
   for size.

   To give a concrete example of the difference between optimizing
   for size and speed, consider:

   a[0] = (b[1] << c[3]) - d[1];
   a[1] = (b[0] << c[2]) - d[0];
   a[2] = (b[3] << c[1]) - d[3];
   a[3] = (b[2] << c[0]) - d[2];

   There are three different layouts here: one for a, one for b and d,
   and one for c.  When optimizing for speed it is better to permute each
   of b, c and d into the order required by a, since those permutations
   happen in parallel.  But when optimizing for size, it is better to:

   - permute c into the same order as b

   - do the arithmetic

   - permute the result into the order required by a

   This gives 2 permutations rather than 3.  */
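/* To make that trade-off concrete (taking each permutation as having
   cost 1): the "speed" choice issues three permutations that can execute
   in parallel, adding 3 to the total cost but only 1 to the critical path;
   the "size" choice issues two serially-dependent permutations, adding 2
   to the total but 2 to the critical path.  This is the depth/total split
   tracked by slpg_layout_cost.  */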
class vect_optimize_slp_pass
{
public:
  vect_optimize_slp_pass (vec_info *vinfo) : m_vinfo (vinfo) {}
  void run ();

private:
  /* Graph building.  */
  struct loop *containing_loop (slp_tree);
  bool is_cfg_latch_edge (graph_edge *);
  void build_vertices (hash_set<slp_tree> &, slp_tree);
  void build_vertices ();
  void build_graph ();

  /* Partitioning.  */
  void create_partitions ();
  template<typename T> void for_each_partition_edge (unsigned int, T);
  /* Layout selection.  */
  bool is_compatible_layout (slp_tree, unsigned int);
  int change_layout_cost (slp_tree, unsigned int, unsigned int);
  slpg_partition_layout_costs &partition_layout_costs (unsigned int,
						       unsigned int);
  void change_vec_perm_layout (slp_tree, lane_permutation_t &,
			       int, unsigned int);
  int internal_node_cost (slp_tree, int, unsigned int);
  void start_choosing_layouts ();

  /* Cost propagation.  */
  slpg_layout_cost edge_layout_cost (graph_edge *, unsigned int,
				     unsigned int, unsigned int);
  slpg_layout_cost total_in_cost (unsigned int);
  slpg_layout_cost forward_cost (graph_edge *, unsigned int, unsigned int);
  slpg_layout_cost backward_cost (graph_edge *, unsigned int, unsigned int);
  void forward_pass ();
  void backward_pass ();

  /* Rematerialization.  */
  slp_tree get_result_with_layout (slp_tree, unsigned int);
  void materialize ();

  /* Clean-up.  */
  void remove_redundant_permutations ();

  void dump ();

  vec_info *m_vinfo;
  /* True if we should optimize the graph for size, false if we should
     optimize it for speed.  (It wouldn't be easy to make this decision
     more locally.)  */
  bool m_optimize_size;

  /* A graph of all SLP nodes, with edges leading from uses to definitions.
     In other words, a node's predecessors are its slp_tree parents and
     a node's successors are its slp_tree children.  */
  graph *m_slpg = nullptr;

  /* The vertices of M_SLPG, indexed by slp_tree::vertex.  */
  auto_vec<slpg_vertex> m_vertices;

  /* The list of all leaves of M_SLPG, such as external definitions, constants,
     and loads.  */
  auto_vec<int> m_leafs;

  /* This array has one entry for every vector layout that we're considering.
     Element 0 is null and indicates "no change".  Other entries describe
     permutations that are inherent in the current graph and that we would
     like to reverse if possible.

     For example, a permutation { 1, 2, 3, 0 } means that something has
     effectively been permuted in that way, such as a load group
     { a[1], a[2], a[3], a[0] } (viewed as a permutation of a[0:3]).
     We'd then like to apply the reverse permutation { 3, 0, 1, 2 }
     in order to put things "back" in order.  */
  auto_vec<vec<unsigned> > m_perms;

  /* A partitioning of the nodes for which a layout must be chosen.
     Each partition represents an <SCC, cfg loop> pair; that is,
     nodes in different SCCs belong to different partitions, and nodes
     within an SCC can be further partitioned according to a containing
     cfg loop.  Partition <SCC1, L1> comes before <SCC2, L2> if:

     - SCC1 != SCC2 and SCC1 is a predecessor of SCC2 in a forward walk
       from leaves (such as loads) to roots (such as stores).

     - SCC1 == SCC2 and L1's header strictly dominates L2's header.  */
  auto_vec<slpg_partition_info> m_partitions;

  /* The list of all nodes for which a layout must be chosen.  Nodes for
     partition P come before the nodes for partition P+1.  Nodes within a
     partition are in reverse postorder.  */
  auto_vec<unsigned int> m_partitioned_nodes;

  /* Index P * num-layouts + L contains the cost of using layout L
     for partition P.  */
  auto_vec<slpg_partition_layout_costs> m_partition_layout_costs;

  /* Index N * num-layouts + L, if nonnull, is a node that provides the
     original output of node N adjusted to have layout L.  */
  auto_vec<slp_tree> m_node_layouts;
};
/* Fill the vertices and leafs vector with all nodes in the SLP graph.
   Also record whether we should optimize anything for speed rather
   than size.  */

void
vect_optimize_slp_pass::build_vertices (hash_set<slp_tree> &visited,
					slp_tree node)
{
  unsigned i;
  slp_tree child;

  if (visited.add (node))
    return;

  if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
    {
      basic_block bb = gimple_bb (vect_orig_stmt (rep)->stmt);
      if (optimize_bb_for_speed_p (bb))
	m_optimize_size = false;
    }

  node->vertex = m_vertices.length ();
  m_vertices.safe_push (slpg_vertex (node));

  bool leaf = true;
  bool force_leaf = false;
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (child)
      {
	leaf = false;
	build_vertices (visited, child);
      }
    else
      force_leaf = true;
  /* Since SLP discovery works along use-def edges all cycles have an
     entry - but there's the exception of cycles where we do not handle
     the entry explicitly (but with a NULL SLP node), like some reductions
     and inductions.  Force those SLP PHIs to act as leafs to make them
     backwards reachable.  */
  if (leaf || force_leaf)
    m_leafs.safe_push (node->vertex);
}

/* Fill the vertices and leafs vector with all nodes in the SLP graph.  */

void
vect_optimize_slp_pass::build_vertices ()
{
  hash_set<slp_tree> visited;
  unsigned i;
  slp_instance instance;
  FOR_EACH_VEC_ELT (m_vinfo->slp_instances, i, instance)
    build_vertices (visited, SLP_INSTANCE_TREE (instance));
}
/* Apply (reverse) bijective PERM to VEC.  */

template <class T>
static void
vect_slp_permute (vec<unsigned> perm,
		  vec<T> &vec, bool reverse)
{
  auto_vec<T, 64> saved;
  saved.create (vec.length ());
  for (unsigned i = 0; i < vec.length (); ++i)
    saved.quick_push (vec[i]);

  if (reverse)
    {
      for (unsigned i = 0; i < vec.length (); ++i)
	vec[perm[i]] = saved[i];
      for (unsigned i = 0; i < vec.length (); ++i)
	gcc_assert (vec[perm[i]] == saved[i]);
    }
  else
    {
      for (unsigned i = 0; i < vec.length (); ++i)
	vec[i] = saved[perm[i]];
      for (unsigned i = 0; i < vec.length (); ++i)
	gcc_assert (vec[i] == saved[perm[i]]);
    }
}
/* Return the cfg loop that contains NODE.  */

struct loop *
vect_optimize_slp_pass::containing_loop (slp_tree node)
{
  stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
  if (!rep)
    return ENTRY_BLOCK_PTR_FOR_FN (cfun)->loop_father;
  return gimple_bb (vect_orig_stmt (rep)->stmt)->loop_father;
}
/* Return true if UD (an edge from a use to a definition) is associated
   with a loop latch edge in the cfg.  */

bool
vect_optimize_slp_pass::is_cfg_latch_edge (graph_edge *ud)
{
  slp_tree use = m_vertices[ud->src].node;
  slp_tree def = m_vertices[ud->dest].node;
  if (SLP_TREE_DEF_TYPE (use) != vect_internal_def
      || SLP_TREE_DEF_TYPE (def) != vect_internal_def)
    return false;

  stmt_vec_info use_rep = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (use));
  return (is_a<gphi *> (use_rep->stmt)
	  && bb_loop_header_p (gimple_bb (use_rep->stmt))
	  && containing_loop (def) == containing_loop (use));
}
/* Build the graph.  Mark edges that correspond to cfg loop latch edges with
   a nonnull data field.  */

void
vect_optimize_slp_pass::build_graph ()
{
  m_optimize_size = true;
  build_vertices ();

  m_slpg = new_graph (m_vertices.length ());
  for (slpg_vertex &v : m_vertices)
    for (slp_tree child : SLP_TREE_CHILDREN (v.node))
      if (child)
	{
	  graph_edge *ud = add_edge (m_slpg, v.node->vertex, child->vertex);
	  if (is_cfg_latch_edge (ud))
	    ud->data = this;
	}
}
/* Return true if E corresponds to a loop latch edge in the cfg.  */

static bool
skip_cfg_latch_edges (graph_edge *e)
{
  return e->data;
}
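/* This predicate is passed to graphds_dfs in create_partitions below, so
   that the initial postorder is computed as if the cfg loop latch edges
   were not present.  */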
4223 /* Create the node partitions. */
4226 vect_optimize_slp_pass::create_partitions ()
4228 /* Calculate a postorder of the graph, ignoring edges that correspond
4229 to natural latch edges in the cfg. Reading the vector from the end
4230 to the beginning gives the reverse postorder. */
4231 auto_vec
<int> initial_rpo
;
4232 graphds_dfs (m_slpg
, &m_leafs
[0], m_leafs
.length (), &initial_rpo
,
4233 false, NULL
, skip_cfg_latch_edges
);
4234 gcc_assert (initial_rpo
.length () == m_vertices
.length ());
4236 /* Calculate the strongly connected components of the graph. */
4237 auto_vec
<int> scc_grouping
;
4238 unsigned int num_sccs
= graphds_scc (m_slpg
, NULL
, NULL
, &scc_grouping
);
4240 /* Create a new index order in which all nodes from the same SCC are
     consecutive.  Use scc_pos to record the index of the first node in
     each SCC.  */
4243 auto_vec
<unsigned int> scc_pos (num_sccs
);
4244 int last_component
= -1;
4245 unsigned int node_count
= 0;
4246 for (unsigned int node_i
: scc_grouping
)
4248 if (last_component
!= m_slpg
->vertices
[node_i
].component
)
4250 last_component
= m_slpg
->vertices
[node_i
].component
;
4251 gcc_assert (last_component
== int (scc_pos
.length ()));
4252 scc_pos
.quick_push (node_count
);
4256 gcc_assert (node_count
== initial_rpo
.length ()
4257 && last_component
+ 1 == int (num_sccs
));
4259 /* Use m_partitioned_nodes to group nodes into SCC order, with the nodes
4260 inside each SCC following the RPO we calculated above. The fact that
4261 we ignored natural latch edges when calculating the RPO should ensure
4262 that, for natural loop nests:
4264 - the first node that we encounter in a cfg loop is the loop header phi
4265 - the loop header phis are in dominance order
4267 Arranging for this is an optimization (see below) rather than a
4268 correctness issue. Unnatural loops with a tangled mess of backedges
4269 will still work correctly, but might give poorer results.
     Also update scc_pos so that it gives 1 + the index of the last node
     in each SCC.  */
4273 m_partitioned_nodes
.safe_grow (node_count
);
4274 for (unsigned int old_i
= initial_rpo
.length (); old_i
-- > 0;)
4276 unsigned int node_i
= initial_rpo
[old_i
];
4277 unsigned int new_i
= scc_pos
[m_slpg
->vertices
[node_i
].component
]++;
4278 m_partitioned_nodes
[new_i
] = node_i
;
4281 /* When optimizing for speed, partition each SCC based on the containing
4282 cfg loop. The order we constructed above should ensure that, for natural
4283 cfg loops, we'll create sub-SCC partitions for outer loops before
4284 the corresponding sub-SCC partitions for inner loops. Similarly,
4285 when one sibling loop A dominates another sibling loop B, we should
4286 create a sub-SCC partition for A before a sub-SCC partition for B.
4288 As above, nothing depends for correctness on whether this achieves
4289 a natural nesting, but we should get better results when it does. */
4290 m_partitions
.reserve (m_vertices
.length ());
4291 unsigned int next_partition_i
= 0;
4292 hash_map
<struct loop
*, int> loop_partitions
;
4293 unsigned int rpo_begin
= 0;
4294 unsigned int num_partitioned_nodes
= 0;
4295 for (unsigned int rpo_end
: scc_pos
)
4297 loop_partitions
.empty ();
4298 unsigned int partition_i
= next_partition_i
;
4299 for (unsigned int rpo_i
= rpo_begin
; rpo_i
< rpo_end
; ++rpo_i
)
	  /* Handle externals and constants optimistically throughout.
	     But treat existing vectors as fixed since we do not handle
	     permuting them.  */
4304 unsigned int node_i
= m_partitioned_nodes
[rpo_i
];
4305 auto &vertex
= m_vertices
[node_i
];
4306 if ((SLP_TREE_DEF_TYPE (vertex
.node
) == vect_external_def
4307 && !SLP_TREE_VEC_DEFS (vertex
.node
).exists ())
4308 || SLP_TREE_DEF_TYPE (vertex
.node
) == vect_constant_def
)
4309 vertex
.partition
= -1;
4313 if (m_optimize_size
)
4314 existed
= next_partition_i
> partition_i
;
4317 struct loop
*loop
= containing_loop (vertex
.node
);
4318 auto &entry
= loop_partitions
.get_or_insert (loop
, &existed
);
4320 entry
= next_partition_i
;
4321 partition_i
= entry
;
4325 m_partitions
.quick_push (slpg_partition_info ());
4326 next_partition_i
+= 1;
4328 vertex
.partition
= partition_i
;
4329 num_partitioned_nodes
+= 1;
4330 m_partitions
[partition_i
].node_end
+= 1;
4333 rpo_begin
= rpo_end
;
4336 /* Assign ranges of consecutive node indices to each partition,
4337 in partition order. Start with node_end being the same as
4338 node_begin so that the next loop can use it as a counter. */
4339 unsigned int node_begin
= 0;
4340 for (auto &partition
: m_partitions
)
4342 partition
.node_begin
= node_begin
;
4343 node_begin
+= partition
.node_end
;
4344 partition
.node_end
= partition
.node_begin
;
4346 gcc_assert (node_begin
== num_partitioned_nodes
);
4348 /* Finally build the list of nodes in partition order. */
4349 m_partitioned_nodes
.truncate (num_partitioned_nodes
);
4350 for (unsigned int node_i
= 0; node_i
< m_vertices
.length (); ++node_i
)
4352 int partition_i
= m_vertices
[node_i
].partition
;
4353 if (partition_i
>= 0)
4355 unsigned int order_i
= m_partitions
[partition_i
].node_end
++;
4356 m_partitioned_nodes
[order_i
] = node_i
;
4361 /* Look for edges from earlier partitions into node NODE_I and edges from
4362 node NODE_I into later partitions. Call:
4364 FN (ud, other_node_i)
4366 for each such use-to-def edge ud, where other_node_i is the node at the
4367 other end of the edge. */
4369 template<typename T
>
4371 vect_optimize_slp_pass::for_each_partition_edge (unsigned int node_i
, T fn
)
4373 int partition_i
= m_vertices
[node_i
].partition
;
4374 for (graph_edge
*pred
= m_slpg
->vertices
[node_i
].pred
;
4375 pred
; pred
= pred
->pred_next
)
4377 int src_partition_i
= m_vertices
[pred
->src
].partition
;
4378 if (src_partition_i
>= 0 && src_partition_i
!= partition_i
)
4379 fn (pred
, pred
->src
);
4381 for (graph_edge
*succ
= m_slpg
->vertices
[node_i
].succ
;
4382 succ
; succ
= succ
->succ_next
)
4384 int dest_partition_i
= m_vertices
[succ
->dest
].partition
;
4385 if (dest_partition_i
>= 0 && dest_partition_i
!= partition_i
)
4386 fn (succ
, succ
->dest
);
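/* Callers pass FN as a lambda of the form
     [&] (graph_edge *ud, unsigned int other_node_i) { ... };
   see the process_edge, add_cost and print_edge lambdas later in this
   file for examples.  */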
/* Return true if layout LAYOUT_I is compatible with the number of SLP lanes
   that NODE would operate on.  This test is independent of NODE's actual
   operation.  */

bool
vect_optimize_slp_pass::is_compatible_layout (slp_tree node,
					      unsigned int layout_i)
{
  if (layout_i == 0)
    return true;

  if (SLP_TREE_LANES (node) != m_perms[layout_i].length ())
    return false;

  return true;
}
/* Return the cost (in arbitrary units) of going from layout FROM_LAYOUT_I
   to layout TO_LAYOUT_I for a node like NODE.  Return -1 if either of the
   layouts is incompatible with NODE or if the change is not possible for
   some other reason.

   The properties taken from NODE include the number of lanes and the
   vector type.  The actual operation doesn't matter.  */

int
vect_optimize_slp_pass::change_layout_cost (slp_tree node,
					    unsigned int from_layout_i,
					    unsigned int to_layout_i)
{
  if (!is_compatible_layout (node, from_layout_i)
      || !is_compatible_layout (node, to_layout_i))
    return -1;

  if (from_layout_i == to_layout_i)
    return 0;

  auto_vec<slp_tree, 1> children (1);
  children.quick_push (node);
  auto_lane_permutation_t perm (SLP_TREE_LANES (node));
  if (from_layout_i > 0)
    for (unsigned int i : m_perms[from_layout_i])
      perm.quick_push ({ 0, i });
  else
    for (unsigned int i = 0; i < SLP_TREE_LANES (node); ++i)
      perm.quick_push ({ 0, i });
  if (to_layout_i > 0)
    vect_slp_permute (m_perms[to_layout_i], perm, true);
  auto count = vectorizable_slp_permutation_1 (m_vinfo, nullptr, node, perm,
					       children, false);
  if (count >= 0)
    return MAX (count, 1);

  /* ??? In principle we could try changing via layout 0, giving two
     layout changes rather than 1.  Doing that would require
     corresponding support in get_result_with_layout.  */
  return -1;
}
/* Return the costs of assigning layout LAYOUT_I to partition PARTITION_I.  */

inline slpg_partition_layout_costs &
vect_optimize_slp_pass::partition_layout_costs (unsigned int partition_i,
						unsigned int layout_i)
{
  return m_partition_layout_costs[partition_i * m_perms.length ()
				  + layout_i];
}
/* Change PERM in one of two ways:

   - if IN_LAYOUT_I < 0, accept input operand I in the layout that has been
     chosen for child I of NODE.

   - if IN_LAYOUT_I >= 0, accept all input operands with that layout.

   In both cases, arrange for the output to have layout OUT_LAYOUT_I.  */
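/* For example, suppose entry (0, 2) in PERM selects lane 2 of child 0 and
   the layout chosen for that child has value 0 at index 2 of its
   permutation vector: the entry becomes (0, 0).  PERM as a whole is then
   permuted so that the output ends up in layout OUT_LAYOUT_I.  (The lane
   numbers here are purely illustrative.)  */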
4468 vect_optimize_slp_pass::
4469 change_vec_perm_layout (slp_tree node
, lane_permutation_t
&perm
,
4470 int in_layout_i
, unsigned int out_layout_i
)
4472 for (auto &entry
: perm
)
4474 int this_in_layout_i
= in_layout_i
;
4475 if (this_in_layout_i
< 0)
4477 slp_tree in_node
= SLP_TREE_CHILDREN (node
)[entry
.first
];
4478 unsigned int in_partition_i
= m_vertices
[in_node
->vertex
].partition
;
4479 this_in_layout_i
= m_partitions
[in_partition_i
].layout
;
4481 if (this_in_layout_i
> 0)
4482 entry
.second
= m_perms
[this_in_layout_i
][entry
.second
];
4484 if (out_layout_i
> 0)
4485 vect_slp_permute (m_perms
[out_layout_i
], perm
, true);
4488 /* Check whether the target allows NODE to be rearranged so that the node's
4489 output has layout OUT_LAYOUT_I. Return the cost of the change if so,
4490 in the same arbitrary units as for change_layout_cost. Return -1 otherwise.
4492 If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I < 0, also check whether
4493 NODE can adapt to the layout changes that have (perhaps provisionally)
4494 been chosen for NODE's children, so that no extra permutations are
4495 needed on either the input or the output of NODE.
4497 If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I >= 0, instead assume
4498 that all inputs will be forced into layout IN_LAYOUT_I beforehand.
4500 IN_LAYOUT_I has no meaning for other types of node.
4502 Keeping the node as-is is always valid. If the target doesn't appear
4503 to support the node as-is, but might realistically support other layouts,
4504 then layout 0 instead has the cost of a worst-case permutation. On the
4505 one hand, this ensures that every node has at least one valid layout,
4506 avoiding what would otherwise be an awkward special case. On the other,
4507 it still encourages the pass to change an invalid pre-existing layout
4508 choice into a valid one. */
4511 vect_optimize_slp_pass::internal_node_cost (slp_tree node
, int in_layout_i
,
4512 unsigned int out_layout_i
)
4514 const int fallback_cost
= 1;
4516 if (SLP_TREE_CODE (node
) == VEC_PERM_EXPR
)
4518 auto_lane_permutation_t tmp_perm
;
4519 tmp_perm
.safe_splice (SLP_TREE_LANE_PERMUTATION (node
));
4521 /* Check that the child nodes support the chosen layout. Checking
	 the first child is enough, since any second child would have the
	 same shape.  */
4524 auto first_child
= SLP_TREE_CHILDREN (node
)[0];
4526 && !is_compatible_layout (first_child
, in_layout_i
))
4529 change_vec_perm_layout (node
, tmp_perm
, in_layout_i
, out_layout_i
);
4530 int count
= vectorizable_slp_permutation_1 (m_vinfo
, nullptr,
4532 SLP_TREE_CHILDREN (node
),
4536 if (in_layout_i
== 0 && out_layout_i
== 0)
4538 /* Use the fallback cost if the node could in principle support
4539 some nonzero layout for both the inputs and the outputs.
4540 Otherwise assume that the node will be rejected later
4541 and rebuilt from scalars. */
4542 if (SLP_TREE_LANES (node
) == SLP_TREE_LANES (first_child
))
4543 return fallback_cost
;
4549 /* We currently have no way of telling whether the new layout is cheaper
4550 or more expensive than the old one. But at least in principle,
4551 it should be worth making zero permutations (whole-vector shuffles)
	 cheaper than real permutations, in case the pass is able to remove
	 the latter.  */
4554 return count
== 0 ? 0 : 1;
4557 stmt_vec_info rep
= SLP_TREE_REPRESENTATIVE (node
);
4559 && STMT_VINFO_DATA_REF (rep
)
4560 && DR_IS_READ (STMT_VINFO_DATA_REF (rep
))
4561 && SLP_TREE_LOAD_PERMUTATION (node
).exists ())
4563 auto_load_permutation_t tmp_perm
;
4564 tmp_perm
.safe_splice (SLP_TREE_LOAD_PERMUTATION (node
));
4565 if (out_layout_i
> 0)
4566 vect_slp_permute (m_perms
[out_layout_i
], tmp_perm
, true);
4569 if (auto loop_vinfo
= dyn_cast
<loop_vec_info
> (m_vinfo
))
4570 vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
4571 unsigned int n_perms
;
4572 if (!vect_transform_slp_perm_load_1 (m_vinfo
, node
, tmp_perm
, vNULL
,
4573 nullptr, vf
, true, false, &n_perms
))
4575 auto rep
= SLP_TREE_REPRESENTATIVE (node
);
4576 if (out_layout_i
== 0)
4578 /* Use the fallback cost if the load is an N-to-N permutation.
4579 Otherwise assume that the node will be rejected later
4580 and rebuilt from scalars. */
4581 if (STMT_VINFO_GROUPED_ACCESS (rep
)
4582 && (DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (rep
))
4583 == SLP_TREE_LANES (node
)))
4584 return fallback_cost
;
4590 /* See the comment above the corresponding VEC_PERM_EXPR handling. */
4591 return n_perms
== 0 ? 0 : 1;
/* Decide which element layouts we should consider using.  Calculate the
   weights associated with inserting layout changes on partition edges.
   Also mark partitions that cannot change layout, by setting their
   layout to 0.  */
4603 vect_optimize_slp_pass::start_choosing_layouts ()
4605 /* Used to assign unique permutation indices. */
4606 using perm_hash
= unbounded_hashmap_traits
<
4607 vec_free_hash_base
<int_hash_base
<unsigned>>,
4608 int_hash
<int, -1, -2>
4610 hash_map
<vec
<unsigned>, int, perm_hash
> layout_ids
;
4612 /* Layout 0 is "no change". */
4613 m_perms
.safe_push (vNULL
);
4615 /* Create layouts from existing permutations. */
4616 auto_load_permutation_t tmp_perm
;
4617 for (unsigned int node_i
: m_partitioned_nodes
)
4619 /* Leafs also double as entries to the reverse graph. Allow the
4620 layout of those to be changed. */
4621 auto &vertex
= m_vertices
[node_i
];
4622 auto &partition
= m_partitions
[vertex
.partition
];
4623 if (!m_slpg
->vertices
[node_i
].succ
)
4624 partition
.layout
= 0;
4626 /* Loads and VEC_PERM_EXPRs are the only things generating permutes. */
4627 slp_tree node
= vertex
.node
;
4628 stmt_vec_info dr_stmt
= SLP_TREE_REPRESENTATIVE (node
);
4630 unsigned HOST_WIDE_INT imin
, imax
= 0;
4631 bool any_permute
= false;
4632 tmp_perm
.truncate (0);
4633 if (SLP_TREE_LOAD_PERMUTATION (node
).exists ())
	  /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the node
	     unpermuted, record a layout that reverses this permutation.

	     We would need more work to cope with loads that are internally
	     permuted and also have inputs (such as masks for
	     masked loads).  */
4641 gcc_assert (partition
.layout
== 0 && !m_slpg
->vertices
[node_i
].succ
);
4642 if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt
))
4644 partition
.layout
= -1;
4647 dr_stmt
= DR_GROUP_FIRST_ELEMENT (dr_stmt
);
4648 imin
= DR_GROUP_SIZE (dr_stmt
) + 1;
4649 tmp_perm
.safe_splice (SLP_TREE_LOAD_PERMUTATION (node
));
4651 else if (SLP_TREE_CODE (node
) == VEC_PERM_EXPR
4652 && SLP_TREE_CHILDREN (node
).length () == 1
4653 && (child
= SLP_TREE_CHILDREN (node
)[0])
4654 && (TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (child
))
4655 .is_constant (&imin
)))
4657 /* If the child has the same vector size as this node,
4658 reversing the permutation can make the permutation a no-op.
4659 In other cases it can change a true permutation into a
4660 full-vector extract. */
4661 tmp_perm
.reserve (SLP_TREE_LANES (node
));
4662 for (unsigned j
= 0; j
< SLP_TREE_LANES (node
); ++j
)
4663 tmp_perm
.quick_push (SLP_TREE_LANE_PERMUTATION (node
)[j
].second
);
4668 for (unsigned j
= 0; j
< SLP_TREE_LANES (node
); ++j
)
4670 unsigned idx
= tmp_perm
[j
];
4671 imin
= MIN (imin
, idx
);
4672 imax
= MAX (imax
, idx
);
4673 if (idx
- tmp_perm
[0] != j
)
      /* If the span doesn't match we'd disrupt VF computation, avoid
	 it.  */
4678 if (imax
- imin
+ 1 != SLP_TREE_LANES (node
))
4680 /* If there's no permute no need to split one out. In this case
4681 we can consider turning a load into a permuted load, if that
4682 turns out to be cheaper than alternatives. */
4685 partition
.layout
= -1;
      /* For now only handle true permutes, like
	 vect_attempt_slp_rearrange_stmts did.  This allows us to be lazy
	 when permuting constants and invariants keeping the permute
	 bijective.  */
4693 auto_sbitmap
load_index (SLP_TREE_LANES (node
));
4694 bitmap_clear (load_index
);
4695 for (unsigned j
= 0; j
< SLP_TREE_LANES (node
); ++j
)
4696 bitmap_set_bit (load_index
, tmp_perm
[j
] - imin
);
4698 for (j
= 0; j
< SLP_TREE_LANES (node
); ++j
)
4699 if (!bitmap_bit_p (load_index
, j
))
4701 if (j
!= SLP_TREE_LANES (node
))
4704 vec
<unsigned> perm
= vNULL
;
4705 perm
.safe_grow (SLP_TREE_LANES (node
), true);
4706 for (unsigned j
= 0; j
< SLP_TREE_LANES (node
); ++j
)
4707 perm
[j
] = tmp_perm
[j
] - imin
;
4709 if (int (m_perms
.length ()) >= param_vect_max_layout_candidates
)
4711 /* Continue to use existing layouts, but don't add any more. */
4712 int *entry
= layout_ids
.get (perm
);
4713 partition
.layout
= entry
? *entry
: 0;
4719 int &layout_i
= layout_ids
.get_or_insert (perm
, &existed
);
4724 layout_i
= m_perms
.length ();
4725 m_perms
.safe_push (perm
);
4727 partition
.layout
= layout_i
;
4731 /* Initially assume that every layout is possible and has zero cost
4732 in every partition. */
4733 m_partition_layout_costs
.safe_grow_cleared (m_partitions
.length ()
4734 * m_perms
.length ());
4736 /* We have to mark outgoing permutations facing non-associating-reduction
4737 graph entries that are not represented as to be materialized.
4738 slp_inst_kind_bb_reduc currently only covers associatable reductions. */
4739 for (slp_instance instance
: m_vinfo
->slp_instances
)
4740 if (SLP_INSTANCE_KIND (instance
) == slp_inst_kind_ctor
)
4742 unsigned int node_i
= SLP_INSTANCE_TREE (instance
)->vertex
;
4743 m_partitions
[m_vertices
[node_i
].partition
].layout
= 0;
4745 else if (SLP_INSTANCE_KIND (instance
) == slp_inst_kind_reduc_chain
)
4747 stmt_vec_info stmt_info
4748 = SLP_TREE_REPRESENTATIVE (SLP_INSTANCE_TREE (instance
));
4749 stmt_vec_info reduc_info
= info_for_reduction (m_vinfo
, stmt_info
);
4750 if (needs_fold_left_reduction_p (TREE_TYPE
4751 (gimple_get_lhs (stmt_info
->stmt
)),
4752 STMT_VINFO_REDUC_CODE (reduc_info
)))
4754 unsigned int node_i
= SLP_INSTANCE_TREE (instance
)->vertex
;
4755 m_partitions
[m_vertices
[node_i
].partition
].layout
= 0;
4759 /* Check which layouts each node and partition can handle. Calculate the
4760 weights associated with inserting layout changes on edges. */
4761 for (unsigned int node_i
: m_partitioned_nodes
)
4763 auto &vertex
= m_vertices
[node_i
];
4764 auto &partition
= m_partitions
[vertex
.partition
];
4765 slp_tree node
= vertex
.node
;
4767 if (stmt_vec_info rep
= SLP_TREE_REPRESENTATIVE (node
))
4769 vertex
.weight
= vect_slp_node_weight (node
);
4771 /* We do not handle stores with a permutation, so all
4772 incoming permutations must have been materialized.
4774 We also don't handle masked grouped loads, which lack a
4775 permutation vector. In this case the memory locations
4776 form an implicit second input to the loads, on top of the
4777 explicit mask input, and the memory input's layout cannot
4780 On the other hand, we do support permuting gather loads and
4781 masked gather loads, where each scalar load is independent
4782 of the others. This can be useful if the address/index input
4783 benefits from permutation. */
4784 if (STMT_VINFO_DATA_REF (rep
)
4785 && STMT_VINFO_GROUPED_ACCESS (rep
)
4786 && !SLP_TREE_LOAD_PERMUTATION (node
).exists ())
4787 partition
.layout
= 0;
	/* We cannot change the layout of an operation that is
	   not independent of lanes.  Note this is an explicit
	   negative list since that's much shorter than the respective
	   positive one but it's critical to keep maintaining it.  */
4793 if (is_gimple_call (STMT_VINFO_STMT (rep
)))
4794 switch (gimple_call_combined_fn (STMT_VINFO_STMT (rep
)))
4796 case CFN_COMPLEX_ADD_ROT90
:
4797 case CFN_COMPLEX_ADD_ROT270
:
4798 case CFN_COMPLEX_MUL
:
4799 case CFN_COMPLEX_MUL_CONJ
:
4800 case CFN_VEC_ADDSUB
:
4801 case CFN_VEC_FMADDSUB
:
4802 case CFN_VEC_FMSUBADD
:
4803 partition
.layout
= 0;
4808 auto process_edge
= [&](graph_edge
*ud
, unsigned int other_node_i
)
4810 auto &other_vertex
= m_vertices
[other_node_i
];
4812 /* Count the number of edges from earlier partitions and the number
4813 of edges to later partitions. */
4814 if (other_vertex
.partition
< vertex
.partition
)
4815 partition
.in_degree
+= 1;
4817 partition
.out_degree
+= 1;
4819 /* If the current node uses the result of OTHER_NODE_I, accumulate
4820 the effects of that. */
4821 if (ud
->src
== int (node_i
))
4823 other_vertex
.out_weight
+= vertex
.weight
;
4824 other_vertex
.out_degree
+= 1;
4827 for_each_partition_edge (node_i
, process_edge
);
4831 /* Return the incoming costs for node NODE_I, assuming that each input keeps
4832 its current (provisional) choice of layout. The inputs do not necessarily
4833 have the same layout as each other. */
4836 vect_optimize_slp_pass::total_in_cost (unsigned int node_i
)
4838 auto &vertex
= m_vertices
[node_i
];
4839 slpg_layout_cost cost
;
4840 auto add_cost
= [&](graph_edge
*, unsigned int other_node_i
)
4842 auto &other_vertex
= m_vertices
[other_node_i
];
4843 if (other_vertex
.partition
< vertex
.partition
)
4845 auto &other_partition
= m_partitions
[other_vertex
.partition
];
4846 auto &other_costs
= partition_layout_costs (other_vertex
.partition
,
4847 other_partition
.layout
);
4848 slpg_layout_cost this_cost
= other_costs
.in_cost
;
4849 this_cost
.add_serial_cost (other_costs
.internal_cost
);
4850 this_cost
.split (other_partition
.out_degree
);
4851 cost
.add_parallel_cost (this_cost
);
4854 for_each_partition_edge (node_i
, add_cost
);
4858 /* Return the cost of switching between layout LAYOUT1_I (at node NODE1_I)
4859 and layout LAYOUT2_I on cross-partition use-to-def edge UD. Return
4860 slpg_layout_cost::impossible () if the change isn't possible. */
4863 vect_optimize_slp_pass::
4864 edge_layout_cost (graph_edge
*ud
, unsigned int node1_i
, unsigned int layout1_i
,
4865 unsigned int layout2_i
)
4867 auto &def_vertex
= m_vertices
[ud
->dest
];
4868 auto &use_vertex
= m_vertices
[ud
->src
];
4869 auto def_layout_i
= ud
->dest
== int (node1_i
) ? layout1_i
: layout2_i
;
4870 auto use_layout_i
= ud
->dest
== int (node1_i
) ? layout2_i
: layout1_i
;
4871 auto factor
= change_layout_cost (def_vertex
.node
, def_layout_i
,
4874 return slpg_layout_cost::impossible ();
4876 /* We have a choice of putting the layout change at the site of the
4877 definition or at the site of the use. Prefer the former when
4878 optimizing for size or when the execution frequency of the
4879 definition is no greater than the combined execution frequencies of
4880 the uses. When putting the layout change at the site of the definition,
4881 divvy up the cost among all consumers. */
4882 if (m_optimize_size
|| def_vertex
.weight
<= def_vertex
.out_weight
)
4884 slpg_layout_cost cost
= { def_vertex
.weight
* factor
, m_optimize_size
};
4885 cost
.split (def_vertex
.out_degree
);
4888 return { use_vertex
.weight
* factor
, m_optimize_size
};
4891 /* UD represents a use-def link between FROM_NODE_I and a node in a later
4892 partition; FROM_NODE_I could be the definition node or the use node.
4893 The node at the other end of the link wants to use layout TO_LAYOUT_I.
4894 Return the cost of any necessary fix-ups on edge UD, or return
4895 slpg_layout_cost::impossible () if the change isn't possible.
4897 At this point, FROM_NODE_I's partition has chosen the cheapest
4898 layout based on the information available so far, but this choice
4899 is only provisional. */
4902 vect_optimize_slp_pass::forward_cost (graph_edge
*ud
, unsigned int from_node_i
,
4903 unsigned int to_layout_i
)
4905 auto &from_vertex
= m_vertices
[from_node_i
];
4906 unsigned int from_partition_i
= from_vertex
.partition
;
4907 slpg_partition_info
&from_partition
= m_partitions
[from_partition_i
];
4908 gcc_assert (from_partition
.layout
>= 0);
4910 /* First calculate the cost on the assumption that FROM_PARTITION sticks
4911 with its current layout preference. */
4912 slpg_layout_cost cost
= slpg_layout_cost::impossible ();
4913 auto edge_cost
= edge_layout_cost (ud
, from_node_i
,
4914 from_partition
.layout
, to_layout_i
);
4915 if (edge_cost
.is_possible ())
4917 auto &from_costs
= partition_layout_costs (from_partition_i
,
4918 from_partition
.layout
);
4919 cost
= from_costs
.in_cost
;
4920 cost
.add_serial_cost (from_costs
.internal_cost
);
4921 cost
.split (from_partition
.out_degree
);
4922 cost
.add_serial_cost (edge_cost
);
4925 /* Take the minimum of that cost and the cost that applies if
4926 FROM_PARTITION instead switches to TO_LAYOUT_I. */
4927 auto &direct_layout_costs
= partition_layout_costs (from_partition_i
,
4929 if (direct_layout_costs
.is_possible ())
4931 slpg_layout_cost direct_cost
= direct_layout_costs
.in_cost
;
4932 direct_cost
.add_serial_cost (direct_layout_costs
.internal_cost
);
4933 direct_cost
.split (from_partition
.out_degree
);
4934 if (!cost
.is_possible ()
4935 || direct_cost
.is_better_than (cost
, m_optimize_size
))
4942 /* UD represents a use-def link between TO_NODE_I and a node in an earlier
4943 partition; TO_NODE_I could be the definition node or the use node.
4944 The node at the other end of the link wants to use layout FROM_LAYOUT_I;
4945 return the cost of any necessary fix-ups on edge UD, or
4946 slpg_layout_cost::impossible () if the choice cannot be made.
4948 At this point, TO_NODE_I's partition has a fixed choice of layout. */
4951 vect_optimize_slp_pass::backward_cost (graph_edge
*ud
, unsigned int to_node_i
,
4952 unsigned int from_layout_i
)
4954 auto &to_vertex
= m_vertices
[to_node_i
];
4955 unsigned int to_partition_i
= to_vertex
.partition
;
4956 slpg_partition_info
&to_partition
= m_partitions
[to_partition_i
];
4957 gcc_assert (to_partition
.layout
>= 0);
4959 /* If TO_NODE_I is a VEC_PERM_EXPR consumer, see whether it can be
4960 adjusted for this input having layout FROM_LAYOUT_I. Assume that
4961 any other inputs keep their current choice of layout. */
4962 auto &to_costs
= partition_layout_costs (to_partition_i
,
4963 to_partition
.layout
);
4964 if (ud
->src
== int (to_node_i
)
4965 && SLP_TREE_CODE (to_vertex
.node
) == VEC_PERM_EXPR
)
4967 auto &from_partition
= m_partitions
[m_vertices
[ud
->dest
].partition
];
4968 auto old_layout
= from_partition
.layout
;
4969 from_partition
.layout
= from_layout_i
;
4970 int factor
= internal_node_cost (to_vertex
.node
, -1,
4971 to_partition
.layout
);
4972 from_partition
.layout
= old_layout
;
4975 slpg_layout_cost cost
= to_costs
.out_cost
;
4976 cost
.add_serial_cost ({ to_vertex
.weight
* factor
,
4978 cost
.split (to_partition
.in_degree
);
4983 /* Compute the cost if we insert any necessary layout change on edge UD. */
4984 auto edge_cost
= edge_layout_cost (ud
, to_node_i
,
4985 to_partition
.layout
, from_layout_i
);
4986 if (edge_cost
.is_possible ())
4988 slpg_layout_cost cost
= to_costs
.out_cost
;
4989 cost
.add_serial_cost (to_costs
.internal_cost
);
4990 cost
.split (to_partition
.in_degree
);
4991 cost
.add_serial_cost (edge_cost
);
4995 return slpg_layout_cost::impossible ();
4998 /* Make a forward pass through the partitions, accumulating input costs.
4999 Make a tentative (provisional) choice of layout for each partition,
5000 ensuring that this choice still allows later partitions to keep
5001 their original layout. */
5004 vect_optimize_slp_pass::forward_pass ()
5006 for (unsigned int partition_i
= 0; partition_i
< m_partitions
.length ();
5009 auto &partition
= m_partitions
[partition_i
];
5011 /* If the partition consists of a single VEC_PERM_EXPR, precompute
5012 the incoming cost that would apply if every predecessor partition
5013 keeps its current layout. This is used within the loop below. */
5014 slpg_layout_cost in_cost
;
5015 slp_tree single_node
= nullptr;
5016 if (partition
.node_end
== partition
.node_begin
+ 1)
5018 unsigned int node_i
= m_partitioned_nodes
[partition
.node_begin
];
5019 single_node
= m_vertices
[node_i
].node
;
5020 if (SLP_TREE_CODE (single_node
) == VEC_PERM_EXPR
)
5021 in_cost
= total_in_cost (node_i
);
5024 /* Go through the possible layouts. Decide which ones are valid
5025 for this partition and record which of the valid layouts has
5027 unsigned int min_layout_i
= 0;
5028 slpg_layout_cost min_layout_cost
= slpg_layout_cost::impossible ();
5029 for (unsigned int layout_i
= 0; layout_i
< m_perms
.length (); ++layout_i
)
5031 auto &layout_costs
= partition_layout_costs (partition_i
, layout_i
);
5032 if (!layout_costs
.is_possible ())
	  /* If the recorded layout is already 0 then the layout cannot
	     be changed.  */
5037 if (partition
.layout
== 0 && layout_i
!= 0)
5039 layout_costs
.mark_impossible ();
5043 bool is_possible
= true;
5044 for (unsigned int order_i
= partition
.node_begin
;
5045 order_i
< partition
.node_end
; ++order_i
)
5047 unsigned int node_i
= m_partitioned_nodes
[order_i
];
5048 auto &vertex
= m_vertices
[node_i
];
5050 /* Reject the layout if it is individually incompatible
5051 with any node in the partition. */
5052 if (!is_compatible_layout (vertex
.node
, layout_i
))
5054 is_possible
= false;
5058 auto add_cost
= [&](graph_edge
*ud
, unsigned int other_node_i
)
5060 auto &other_vertex
= m_vertices
[other_node_i
];
5061 if (other_vertex
.partition
< vertex
.partition
)
		/* Accumulate the incoming costs from earlier
		   partitions, plus the cost of any layout changes
		   on UD itself.  */
5066 auto cost
= forward_cost (ud
, other_node_i
, layout_i
);
5067 if (!cost
.is_possible ())
5068 is_possible
= false;
5070 layout_costs
.in_cost
.add_parallel_cost (cost
);
5073 /* Reject the layout if it would make layout 0 impossible
5074 for later partitions. This amounts to testing that the
5075 target supports reversing the layout change on edges
5076 to later partitions.
5078 In principle, it might be possible to push a layout
5079 change all the way down a graph, so that it never
5080 needs to be reversed and so that the target doesn't
5081 need to support the reverse operation. But it would
5082 be awkward to bail out if we hit a partition that
5083 does not support the new layout, especially since
5084 we are not dealing with a lattice. */
5085 is_possible
&= edge_layout_cost (ud
, other_node_i
, 0,
5086 layout_i
).is_possible ();
5088 for_each_partition_edge (node_i
, add_cost
);
5090 /* Accumulate the cost of using LAYOUT_I within NODE,
5091 both for the inputs and the outputs. */
5092 int factor
= internal_node_cost (vertex
.node
, layout_i
,
5096 is_possible
= false;
5100 layout_costs
.internal_cost
.add_serial_cost
5101 ({ vertex
.weight
* factor
, m_optimize_size
});
5105 layout_costs
.mark_impossible ();
5109 /* Combine the incoming and partition-internal costs. */
5110 slpg_layout_cost combined_cost
= layout_costs
.in_cost
;
5111 combined_cost
.add_serial_cost (layout_costs
.internal_cost
);
5113 /* If this partition consists of a single VEC_PERM_EXPR, see
5114 if the VEC_PERM_EXPR can be changed to support output layout
	     LAYOUT_I while keeping all the provisional choices of input
	     layout.  */
5118 && SLP_TREE_CODE (single_node
) == VEC_PERM_EXPR
)
5120 int factor
= internal_node_cost (single_node
, -1, layout_i
);
5123 auto weight
= m_vertices
[single_node
->vertex
].weight
;
5124 slpg_layout_cost internal_cost
5125 = { weight
* factor
, m_optimize_size
};
5127 slpg_layout_cost alt_cost
= in_cost
;
5128 alt_cost
.add_serial_cost (internal_cost
);
5129 if (alt_cost
.is_better_than (combined_cost
, m_optimize_size
))
5131 combined_cost
= alt_cost
;
5132 layout_costs
.in_cost
= in_cost
;
5133 layout_costs
.internal_cost
= internal_cost
;
5138 /* Record the layout with the lowest cost. Prefer layout 0 in
5139 the event of a tie between it and another layout. */
5140 if (!min_layout_cost
.is_possible ()
5141 || combined_cost
.is_better_than (min_layout_cost
,
5144 min_layout_i
= layout_i
;
5145 min_layout_cost
= combined_cost
;
5149 /* This loop's handling of earlier partitions should ensure that
5150 choosing the original layout for the current partition is no
5151 less valid than it was in the original graph, even with the
5152 provisional layout choices for those earlier partitions. */
5153 gcc_assert (min_layout_cost
.is_possible ());
5154 partition
.layout
= min_layout_i
;
5158 /* Make a backward pass through the partitions, accumulating output costs.
5159 Make a final choice of layout for each partition. */
5162 vect_optimize_slp_pass::backward_pass ()
5164 for (unsigned int partition_i
= m_partitions
.length (); partition_i
-- > 0;)
5166 auto &partition
= m_partitions
[partition_i
];
5168 unsigned int min_layout_i
= 0;
5169 slpg_layout_cost min_layout_cost
= slpg_layout_cost::impossible ();
5170 for (unsigned int layout_i
= 0; layout_i
< m_perms
.length (); ++layout_i
)
5172 auto &layout_costs
= partition_layout_costs (partition_i
, layout_i
);
5173 if (!layout_costs
.is_possible ())
5176 /* Accumulate the costs from successor partitions. */
5177 bool is_possible
= true;
5178 for (unsigned int order_i
= partition
.node_begin
;
5179 order_i
< partition
.node_end
; ++order_i
)
5181 unsigned int node_i
= m_partitioned_nodes
[order_i
];
5182 auto &vertex
= m_vertices
[node_i
];
5183 auto add_cost
= [&](graph_edge
*ud
, unsigned int other_node_i
)
5185 auto &other_vertex
= m_vertices
[other_node_i
];
5186 auto &other_partition
= m_partitions
[other_vertex
.partition
];
5187 if (other_vertex
.partition
> vertex
.partition
)
		/* Accumulate the incoming costs from later
		   partitions, plus the cost of any layout changes
		   on UD itself.  */
5192 auto cost
= backward_cost (ud
, other_node_i
, layout_i
);
5193 if (!cost
.is_possible ())
5194 is_possible
= false;
5196 layout_costs
.out_cost
.add_parallel_cost (cost
);
5199 /* Make sure that earlier partitions can (if necessary
5200 or beneficial) keep the layout that they chose in
5201 the forward pass. This ensures that there is at
5202 least one valid choice of layout. */
5203 is_possible
&= edge_layout_cost (ud
, other_node_i
,
5204 other_partition
.layout
,
5205 layout_i
).is_possible ();
5207 for_each_partition_edge (node_i
, add_cost
);
5211 layout_costs
.mark_impossible ();
5215 /* Locally combine the costs from the forward and backward passes.
5216 (This combined cost is not passed on, since that would lead
5217 to double counting.) */
5218 slpg_layout_cost combined_cost
= layout_costs
.in_cost
;
5219 combined_cost
.add_serial_cost (layout_costs
.internal_cost
);
5220 combined_cost
.add_serial_cost (layout_costs
.out_cost
);
5222 /* Record the layout with the lowest cost. Prefer layout 0 in
5223 the event of a tie between it and another layout. */
5224 if (!min_layout_cost
.is_possible ()
5225 || combined_cost
.is_better_than (min_layout_cost
,
5228 min_layout_i
= layout_i
;
5229 min_layout_cost
= combined_cost
;
5233 gcc_assert (min_layout_cost
.is_possible ());
5234 partition
.layout
= min_layout_i
;
5238 /* Return a node that applies layout TO_LAYOUT_I to the original form of NODE.
5239 NODE already has the layout that was selected for its partition. */
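/* Three cases are handled below: constant and external defs are adjusted by
   permuting their scalar operands directly; an existing VEC_PERM_EXPR can
   sometimes absorb the change into its own lane permutation; otherwise a
   new single-input VEC_PERM_EXPR is layered on top of NODE.  Results are
   cached in m_node_layouts so that each (node, layout) pair is only built
   once.  */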
5242 vect_optimize_slp_pass::get_result_with_layout (slp_tree node
,
5243 unsigned int to_layout_i
)
5245 unsigned int result_i
= node
->vertex
* m_perms
.length () + to_layout_i
;
5246 slp_tree result
= m_node_layouts
[result_i
];
5250 if (SLP_TREE_DEF_TYPE (node
) == vect_constant_def
5251 || (SLP_TREE_DEF_TYPE (node
) == vect_external_def
5252 /* We can't permute vector defs in place. */
5253 && SLP_TREE_VEC_DEFS (node
).is_empty ()))
5255 /* If the vector is uniform or unchanged, there's nothing to do. */
5256 if (to_layout_i
== 0 || vect_slp_tree_uniform_p (node
))
5260 auto scalar_ops
= SLP_TREE_SCALAR_OPS (node
).copy ();
5261 result
= vect_create_new_slp_node (scalar_ops
);
5262 vect_slp_permute (m_perms
[to_layout_i
], scalar_ops
, true);
5267 unsigned int partition_i
= m_vertices
[node
->vertex
].partition
;
5268 unsigned int from_layout_i
= m_partitions
[partition_i
].layout
;
5269 if (from_layout_i
== to_layout_i
)
5272 /* If NODE is itself a VEC_PERM_EXPR, try to create a parallel
5273 permutation instead of a serial one. Leave the new permutation
5274 in TMP_PERM on success. */
5275 auto_lane_permutation_t tmp_perm
;
5276 unsigned int num_inputs
= 1;
5277 if (SLP_TREE_CODE (node
) == VEC_PERM_EXPR
)
5279 tmp_perm
.safe_splice (SLP_TREE_LANE_PERMUTATION (node
));
5280 if (from_layout_i
!= 0)
5281 vect_slp_permute (m_perms
[from_layout_i
], tmp_perm
, false);
5282 if (to_layout_i
!= 0)
5283 vect_slp_permute (m_perms
[to_layout_i
], tmp_perm
, true);
5284 if (vectorizable_slp_permutation_1 (m_vinfo
, nullptr, node
,
5286 SLP_TREE_CHILDREN (node
),
5288 num_inputs
= SLP_TREE_CHILDREN (node
).length ();
5290 tmp_perm
.truncate (0);
5293 if (dump_enabled_p ())
5295 if (tmp_perm
.length () > 0)
5296 dump_printf_loc (MSG_NOTE
, vect_location
,
5297 "duplicating permutation node %p with"
5299 (void *) node
, to_layout_i
);
5301 dump_printf_loc (MSG_NOTE
, vect_location
,
5302 "inserting permutation node in place of %p\n",
5306 unsigned int num_lanes
= SLP_TREE_LANES (node
);
5307 result
= vect_create_new_slp_node (num_inputs
, VEC_PERM_EXPR
);
5308 if (SLP_TREE_SCALAR_STMTS (node
).length ())
5310 auto &stmts
= SLP_TREE_SCALAR_STMTS (result
);
5311 stmts
.safe_splice (SLP_TREE_SCALAR_STMTS (node
));
5312 if (from_layout_i
!= 0)
5313 vect_slp_permute (m_perms
[from_layout_i
], stmts
, false);
5314 if (to_layout_i
!= 0)
5315 vect_slp_permute (m_perms
[to_layout_i
], stmts
, true);
5317 SLP_TREE_REPRESENTATIVE (result
) = SLP_TREE_REPRESENTATIVE (node
);
5318 SLP_TREE_LANES (result
) = num_lanes
;
5319 SLP_TREE_VECTYPE (result
) = SLP_TREE_VECTYPE (node
);
5320 result
->vertex
= -1;
5322 auto &lane_perm
= SLP_TREE_LANE_PERMUTATION (result
);
5323 if (tmp_perm
.length ())
5325 lane_perm
.safe_splice (tmp_perm
);
5326 SLP_TREE_CHILDREN (result
).safe_splice (SLP_TREE_CHILDREN (node
));
5330 lane_perm
.create (num_lanes
);
5331 for (unsigned j
= 0; j
< num_lanes
; ++j
)
5332 lane_perm
.quick_push ({ 0, j
});
5333 if (from_layout_i
!= 0)
5334 vect_slp_permute (m_perms
[from_layout_i
], lane_perm
, false);
5335 if (to_layout_i
!= 0)
5336 vect_slp_permute (m_perms
[to_layout_i
], lane_perm
, true);
5337 SLP_TREE_CHILDREN (result
).safe_push (node
);
5339 for (slp_tree child
: SLP_TREE_CHILDREN (result
))
5342 m_node_layouts
[result_i
] = result
;
5346 /* Apply the chosen vector layouts to the SLP graph. */
5349 vect_optimize_slp_pass::materialize ()
5351 /* We no longer need the costs, so avoid having two O(N * P) arrays
5352 live at the same time. */
5353 m_partition_layout_costs
.release ();
5354 m_node_layouts
.safe_grow_cleared (m_vertices
.length () * m_perms
.length ());
5356 auto_sbitmap
fully_folded (m_vertices
.length ());
5357 bitmap_clear (fully_folded
);
5358 for (unsigned int node_i
: m_partitioned_nodes
)
5360 auto &vertex
= m_vertices
[node_i
];
5361 slp_tree node
= vertex
.node
;
5362 int layout_i
= m_partitions
[vertex
.partition
].layout
;
5363 gcc_assert (layout_i
>= 0);
5365 /* Rearrange the scalar statements to match the chosen layout. */
5367 vect_slp_permute (m_perms
[layout_i
],
5368 SLP_TREE_SCALAR_STMTS (node
), true);
5370 /* Update load and lane permutations. */
5371 if (SLP_TREE_CODE (node
) == VEC_PERM_EXPR
)
5373 /* First try to absorb the input vector layouts. If that fails,
5374 force the inputs to have layout LAYOUT_I too. We checked that
5375 that was possible before deciding to use nonzero output layouts.
5376 (Note that at this stage we don't really have any guarantee that
5377 the target supports the original VEC_PERM_EXPR.) */
5378 auto &perm
= SLP_TREE_LANE_PERMUTATION (node
);
5379 auto_lane_permutation_t tmp_perm
;
5380 tmp_perm
.safe_splice (perm
);
5381 change_vec_perm_layout (node
, tmp_perm
, -1, layout_i
);
5382 if (vectorizable_slp_permutation_1 (m_vinfo
, nullptr, node
,
5384 SLP_TREE_CHILDREN (node
),
5387 if (dump_enabled_p ()
5388 && !std::equal (tmp_perm
.begin (), tmp_perm
.end (),
5390 dump_printf_loc (MSG_NOTE
, vect_location
,
5391 "absorbing input layouts into %p\n",
5393 std::copy (tmp_perm
.begin (), tmp_perm
.end (), perm
.begin ());
5394 bitmap_set_bit (fully_folded
, node_i
);
5398 /* Not MSG_MISSED because it would make no sense to users. */
5399 if (dump_enabled_p ())
5400 dump_printf_loc (MSG_NOTE
, vect_location
,
5401 "failed to absorb input layouts into %p\n",
5403 change_vec_perm_layout (nullptr, perm
, layout_i
, layout_i
);
5408 gcc_assert (!SLP_TREE_LANE_PERMUTATION (node
).exists ());
5409 auto &load_perm
= SLP_TREE_LOAD_PERMUTATION (node
);
5411 /* ??? When we handle non-bijective permutes the idea
5412 is that we can force the load-permutation to be
5413 { min, min + 1, min + 2, ... max }. But then the
5414 scalar defs might no longer match the lane content
5415 which means wrong-code with live lane vectorization.
5416 So we possibly have to have NULL entries for those. */
5417 vect_slp_permute (m_perms
[layout_i
], load_perm
, true);
  /* Do this before any nodes disappear, since it involves a walk
     over the leaves.  */
5423 remove_redundant_permutations ();
5425 /* Replace each child with a correctly laid-out version. */
5426 for (unsigned int node_i
: m_partitioned_nodes
)
5428 /* Skip nodes that have already been handled above. */
5429 if (bitmap_bit_p (fully_folded
, node_i
))
5432 auto &vertex
= m_vertices
[node_i
];
5433 int in_layout_i
= m_partitions
[vertex
.partition
].layout
;
5434 gcc_assert (in_layout_i
>= 0);
5438 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (vertex
.node
), j
, child
)
5443 slp_tree new_child
= get_result_with_layout (child
, in_layout_i
);
5444 if (new_child
!= child
)
5446 vect_free_slp_tree (child
);
5447 SLP_TREE_CHILDREN (vertex
.node
)[j
] = new_child
;
5448 new_child
->refcnt
+= 1;
5454 /* Elide load permutations that are not necessary. Such permutations might
5455 be pre-existing, rather than created by the layout optimizations. */
5458 vect_optimize_slp_pass::remove_redundant_permutations ()
5460 for (unsigned int node_i
: m_leafs
)
5462 slp_tree node
= m_vertices
[node_i
].node
;
5463 if (!SLP_TREE_LOAD_PERMUTATION (node
).exists ())
      /* In basic block vectorization we allow any subchain of an interleaving
	 chain.
	 FORNOW: not in loop SLP because of realignment complications.  */
5469 if (is_a
<bb_vec_info
> (m_vinfo
))
5471 bool subchain_p
= true;
5472 stmt_vec_info next_load_info
= NULL
;
5473 stmt_vec_info load_info
;
5475 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), j
, load_info
)
5478 && (next_load_info
!= load_info
5479 || DR_GROUP_GAP (load_info
) != 1))
5484 next_load_info
= DR_GROUP_NEXT_ELEMENT (load_info
);
5488 SLP_TREE_LOAD_PERMUTATION (node
).release ();
5494 loop_vec_info loop_vinfo
= as_a
<loop_vec_info
> (m_vinfo
);
5495 stmt_vec_info load_info
;
5496 bool this_load_permuted
= false;
5498 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), j
, load_info
)
5499 if (SLP_TREE_LOAD_PERMUTATION (node
)[j
] != j
)
5501 this_load_permuted
= true;
	  /* When this isn't a grouped access we know it's single element
	     interleaving.  */
5506 if (!STMT_VINFO_GROUPED_ACCESS (SLP_TREE_SCALAR_STMTS (node
)[0]))
5508 if (!this_load_permuted
5509 && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo
), 1U)
5510 || SLP_TREE_LANES (node
) == 1))
5511 SLP_TREE_LOAD_PERMUTATION (node
).release ();
5514 stmt_vec_info first_stmt_info
5515 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node
)[0]);
5516 if (!this_load_permuted
5517 /* The load requires permutation when unrolling exposes
5518 a gap either because the group is larger than the SLP
5519 group-size or because there is a gap between the groups. */
5520 && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo
), 1U)
5521 || ((SLP_TREE_LANES (node
) == DR_GROUP_SIZE (first_stmt_info
))
5522 && DR_GROUP_GAP (first_stmt_info
) == 0)))
5524 SLP_TREE_LOAD_PERMUTATION (node
).release ();
5531 /* Print the partition graph and layout information to the dump file. */
5534 vect_optimize_slp_pass::dump ()
5536 dump_printf_loc (MSG_NOTE
, vect_location
,
5537 "SLP optimize permutations:\n");
5538 for (unsigned int layout_i
= 1; layout_i
< m_perms
.length (); ++layout_i
)
5540 dump_printf_loc (MSG_NOTE
, vect_location
, " %d: { ", layout_i
);
5541 const char *sep
= "";
5542 for (unsigned int idx
: m_perms
[layout_i
])
5544 dump_printf (MSG_NOTE
, "%s%d", sep
, idx
);
5547 dump_printf (MSG_NOTE
, " }\n");
5549 dump_printf_loc (MSG_NOTE
, vect_location
,
5550 "SLP optimize partitions:\n");
5551 for (unsigned int partition_i
= 0; partition_i
< m_partitions
.length ();
5554 auto &partition
= m_partitions
[partition_i
];
5555 dump_printf_loc (MSG_NOTE
, vect_location
, " -------------\n");
5556 dump_printf_loc (MSG_NOTE
, vect_location
,
5557 " partition %d (layout %d):\n",
5558 partition_i
, partition
.layout
);
5559 dump_printf_loc (MSG_NOTE
, vect_location
, " nodes:\n");
5560 for (unsigned int order_i
= partition
.node_begin
;
5561 order_i
< partition
.node_end
; ++order_i
)
5563 auto &vertex
= m_vertices
[m_partitioned_nodes
[order_i
]];
5564 dump_printf_loc (MSG_NOTE
, vect_location
, " - %p:\n",
5565 (void *) vertex
.node
);
5566 dump_printf_loc (MSG_NOTE
, vect_location
,
5568 vertex
.weight
.to_double ());
5569 if (vertex
.out_degree
)
5570 dump_printf_loc (MSG_NOTE
, vect_location
,
5571 " out weight: %f (degree %d)\n",
5572 vertex
.out_weight
.to_double (),
5574 if (SLP_TREE_CODE (vertex
.node
) == VEC_PERM_EXPR
)
5575 dump_printf_loc (MSG_NOTE
, vect_location
,
5576 " op: VEC_PERM_EXPR\n");
5577 else if (auto rep
= SLP_TREE_REPRESENTATIVE (vertex
.node
))
5578 dump_printf_loc (MSG_NOTE
, vect_location
,
5579 " op template: %G", rep
->stmt
);
5581 dump_printf_loc (MSG_NOTE
, vect_location
, " edges:\n");
5582 for (unsigned int order_i
= partition
.node_begin
;
5583 order_i
< partition
.node_end
; ++order_i
)
5585 unsigned int node_i
= m_partitioned_nodes
[order_i
];
5586 auto &vertex
= m_vertices
[node_i
];
5587 auto print_edge
= [&](graph_edge
*, unsigned int other_node_i
)
5589 auto &other_vertex
= m_vertices
[other_node_i
];
5590 if (other_vertex
.partition
< vertex
.partition
)
5591 dump_printf_loc (MSG_NOTE
, vect_location
,
5592 " - %p [%d] --> %p\n",
5593 (void *) other_vertex
.node
,
5594 other_vertex
.partition
,
5595 (void *) vertex
.node
);
5597 dump_printf_loc (MSG_NOTE
, vect_location
,
5598 " - %p --> [%d] %p\n",
5599 (void *) vertex
.node
,
5600 other_vertex
.partition
,
5601 (void *) other_vertex
.node
);
5603 for_each_partition_edge (node_i
, print_edge
);
5606 for (unsigned int layout_i
= 0; layout_i
< m_perms
.length (); ++layout_i
)
5608 auto &layout_costs
= partition_layout_costs (partition_i
, layout_i
);
5609 if (layout_costs
.is_possible ())
5611 dump_printf_loc (MSG_NOTE
, vect_location
,
5612 " layout %d:%s\n", layout_i
,
5613 partition
.layout
== int (layout_i
)
5615 slpg_layout_cost combined_cost
= layout_costs
.in_cost
;
5616 combined_cost
.add_serial_cost (layout_costs
.internal_cost
);
5617 combined_cost
.add_serial_cost (layout_costs
.out_cost
);
5618 #define TEMPLATE "{depth: %f, total: %f}"
5619 dump_printf_loc (MSG_NOTE
, vect_location
,
5621 layout_costs
.in_cost
.depth
.to_double (),
5622 layout_costs
.in_cost
.total
.to_double ());
5623 dump_printf_loc (MSG_NOTE
, vect_location
,
5624 " + " TEMPLATE
"\n",
5625 layout_costs
.internal_cost
.depth
.to_double (),
5626 layout_costs
.internal_cost
.total
.to_double ());
5627 dump_printf_loc (MSG_NOTE
, vect_location
,
5628 " + " TEMPLATE
"\n",
5629 layout_costs
.out_cost
.depth
.to_double (),
5630 layout_costs
.out_cost
.total
.to_double ());
5631 dump_printf_loc (MSG_NOTE
, vect_location
,
5632 " = " TEMPLATE
"\n",
5633 combined_cost
.depth
.to_double (),
5634 combined_cost
.total
.to_double ());
5638 dump_printf_loc (MSG_NOTE
, vect_location
,
5639 " layout %d: rejected\n", layout_i
);
5644 /* Main entry point for the SLP graph optimization pass. */
5647 vect_optimize_slp_pass::run ()
5650 create_partitions ();
5651 start_choosing_layouts ();
5652 if (m_perms
.length () > 1)
5656 if (dump_enabled_p ())
5659 while (!m_perms
.is_empty ())
5660 m_perms
.pop ().release ();
5663 remove_redundant_permutations ();
5664 free_graph (m_slpg
);
/* Optimize the SLP graph of VINFO.  */

void
vect_optimize_slp (vec_info *vinfo)
{
  if (vinfo->slp_instances.is_empty ())
    return;
  vect_optimize_slp_pass (vinfo).run ();
}
/* Gather loads reachable from the individual SLP graph entries.  */

void
vect_gather_slp_loads (vec_info *vinfo)
{
  unsigned i;
  slp_instance instance;
  FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
    {
      hash_set<slp_tree> visited;
      vect_gather_slp_loads (SLP_INSTANCE_LOADS (instance),
			     SLP_INSTANCE_TREE (instance), visited);
    }
}
5693 /* For each possible SLP instance decide whether to SLP it and calculate overall
5694 unrolling factor needed to SLP the loop. Return TRUE if decided to SLP at
5695 least one instance. */
5698 vect_make_slp_decision (loop_vec_info loop_vinfo
)
5701 poly_uint64 unrolling_factor
= 1;
5702 const vec
<slp_instance
> &slp_instances
5703 = LOOP_VINFO_SLP_INSTANCES (loop_vinfo
);
5704 slp_instance instance
;
5705 int decided_to_slp
= 0;
5707 DUMP_VECT_SCOPE ("vect_make_slp_decision");
5709 FOR_EACH_VEC_ELT (slp_instances
, i
, instance
)
5711 /* FORNOW: SLP if you can. */
5712 /* All unroll factors have the form:
5714 GET_MODE_SIZE (vinfo->vector_mode) * X
5716 for some rational X, so they must have a common multiple. */
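/* For example (illustrative numbers only): unroll factors of 2 and 3
   combine to a common multiple of 6.  */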
5718 = force_common_multiple (unrolling_factor
,
5719 SLP_INSTANCE_UNROLLING_FACTOR (instance
));
5721 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
5722 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
5723 loop-based vectorization. Such stmts will be marked as HYBRID. */
5724 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance
));
5728 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo
) = unrolling_factor
;
5730 if (decided_to_slp
&& dump_enabled_p ())
5732 dump_printf_loc (MSG_NOTE
, vect_location
,
5733 "Decided to SLP %d instances. Unrolling factor ",
5735 dump_dec (MSG_NOTE
, unrolling_factor
);
5736 dump_printf (MSG_NOTE
, "\n");
5739 return (decided_to_slp
> 0);
5742 /* Private data for vect_detect_hybrid_slp. */
5745 loop_vec_info loop_vinfo
;
5746 vec
<stmt_vec_info
> *worklist
;
5749 /* Walker for walk_gimple_op. */
5752 vect_detect_hybrid_slp (tree
*tp
, int *, void *data
)
5754 walk_stmt_info
*wi
= (walk_stmt_info
*)data
;
5755 vdhs_data
*dat
= (vdhs_data
*)wi
->info
;
5760 stmt_vec_info def_stmt_info
= dat
->loop_vinfo
->lookup_def (*tp
);
5763 def_stmt_info
= vect_stmt_to_vectorize (def_stmt_info
);
5764 if (PURE_SLP_STMT (def_stmt_info
))
5766 if (dump_enabled_p ())
5767 dump_printf_loc (MSG_NOTE
, vect_location
, "marking hybrid: %G",
5768 def_stmt_info
->stmt
);
5769 STMT_SLP_TYPE (def_stmt_info
) = hybrid
;
5770 dat
->worklist
->safe_push (def_stmt_info
);
/* Look if STMT_INFO is consumed by SLP indirectly and mark it pure_slp
   if so, otherwise push it to WORKLIST.  */
5780 maybe_push_to_hybrid_worklist (vec_info
*vinfo
,
5781 vec
<stmt_vec_info
> &worklist
,
5782 stmt_vec_info stmt_info
)
5784 if (dump_enabled_p ())
5785 dump_printf_loc (MSG_NOTE
, vect_location
,
5786 "Processing hybrid candidate : %G", stmt_info
->stmt
);
5787 stmt_vec_info orig_info
= vect_orig_stmt (stmt_info
);
5788 imm_use_iterator iter2
;
5790 use_operand_p use_p
;
5791 def_operand_p def_p
;
5792 bool any_def
= false;
5793 FOR_EACH_PHI_OR_STMT_DEF (def_p
, orig_info
->stmt
, iter1
, SSA_OP_DEF
)
5796 FOR_EACH_IMM_USE_FAST (use_p
, iter2
, DEF_FROM_PTR (def_p
))
5798 if (is_gimple_debug (USE_STMT (use_p
)))
5800 stmt_vec_info use_info
= vinfo
->lookup_stmt (USE_STMT (use_p
));
/* An out-of-loop use means this is a loop_vect sink.  */
5804 if (dump_enabled_p ())
5805 dump_printf_loc (MSG_NOTE
, vect_location
,
5806 "Found loop_vect sink: %G", stmt_info
->stmt
);
5807 worklist
.safe_push (stmt_info
);
5810 else if (!STMT_SLP_TYPE (vect_stmt_to_vectorize (use_info
)))
5812 if (dump_enabled_p ())
5813 dump_printf_loc (MSG_NOTE
, vect_location
,
5814 "Found loop_vect use: %G", use_info
->stmt
);
5815 worklist
.safe_push (stmt_info
);
/* No def means this is a loop_vect sink.  */
5823 if (dump_enabled_p ())
5824 dump_printf_loc (MSG_NOTE
, vect_location
,
5825 "Found loop_vect sink: %G", stmt_info
->stmt
);
5826 worklist
.safe_push (stmt_info
);
5829 if (dump_enabled_p ())
5830 dump_printf_loc (MSG_NOTE
, vect_location
,
5831 "Marked SLP consumed stmt pure: %G", stmt_info
->stmt
);
5832 STMT_SLP_TYPE (stmt_info
) = pure_slp
;
5835 /* Find stmts that must be both vectorized and SLPed. */
5838 vect_detect_hybrid_slp (loop_vec_info loop_vinfo
)
5840 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
5842 /* All stmts participating in SLP are marked pure_slp, all other
5843 stmts are loop_vect.
5844 First collect all loop_vect stmts into a worklist.
5845 SLP patterns cause not all original scalar stmts to appear in
5846 SLP_TREE_SCALAR_STMTS and thus not all of them are marked pure_slp.
5847 Rectify this here and do a backward walk over the IL only considering
5848 stmts as loop_vect when they are used by a loop_vect stmt and otherwise
5849 mark them as pure_slp. */
5850 auto_vec
<stmt_vec_info
> worklist
;
5851 for (int i
= LOOP_VINFO_LOOP (loop_vinfo
)->num_nodes
- 1; i
>= 0; --i
)
5853 basic_block bb
= LOOP_VINFO_BBS (loop_vinfo
)[i
];
5854 for (gphi_iterator gsi
= gsi_start_phis (bb
); !gsi_end_p (gsi
);
5857 gphi
*phi
= gsi
.phi ();
5858 stmt_vec_info stmt_info
= loop_vinfo
->lookup_stmt (phi
);
5859 if (!STMT_SLP_TYPE (stmt_info
) && STMT_VINFO_RELEVANT (stmt_info
))
5860 maybe_push_to_hybrid_worklist (loop_vinfo
,
5861 worklist
, stmt_info
);
5863 for (gimple_stmt_iterator gsi
= gsi_last_bb (bb
); !gsi_end_p (gsi
);
5866 gimple
*stmt
= gsi_stmt (gsi
);
5867 if (is_gimple_debug (stmt
))
5869 stmt_vec_info stmt_info
= loop_vinfo
->lookup_stmt (stmt
);
5870 if (STMT_VINFO_IN_PATTERN_P (stmt_info
))
5872 for (gimple_stmt_iterator gsi2
5873 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info
));
5874 !gsi_end_p (gsi2
); gsi_next (&gsi2
))
5876 stmt_vec_info patt_info
5877 = loop_vinfo
->lookup_stmt (gsi_stmt (gsi2
));
5878 if (!STMT_SLP_TYPE (patt_info
)
5879 && STMT_VINFO_RELEVANT (patt_info
))
5880 maybe_push_to_hybrid_worklist (loop_vinfo
,
5881 worklist
, patt_info
);
5883 stmt_info
= STMT_VINFO_RELATED_STMT (stmt_info
);
5885 if (!STMT_SLP_TYPE (stmt_info
) && STMT_VINFO_RELEVANT (stmt_info
))
5886 maybe_push_to_hybrid_worklist (loop_vinfo
,
5887 worklist
, stmt_info
);
5891 /* Now we have a worklist of non-SLP stmts, follow use->def chains and
5892 mark any SLP vectorized stmt as hybrid.
5893 ??? We're visiting def stmts N times (once for each non-SLP and
5894 once for each hybrid-SLP use). */
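/* Set up the operand walk: stash the worklist and LOOP_VINFO in the
   walk_stmt_info so the vect_detect_hybrid_slp callback can reach them.  */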
5897 dat
.worklist
= &worklist
;
5898 dat
.loop_vinfo
= loop_vinfo
;
5899 memset (&wi
, 0, sizeof (wi
));
5900 wi
.info
= (void *)&dat
;
5901 while (!worklist
.is_empty ())
5903 stmt_vec_info stmt_info
= worklist
.pop ();
5904 /* Since SSA operands are not set up for pattern stmts we need
5905 to use walk_gimple_op. */
5907 walk_gimple_op (stmt_info
->stmt
, vect_detect_hybrid_slp
, &wi
);
5908 /* For gather/scatter make sure to walk the offset operand, that
5909 can be a scaling and conversion away. */
5910 gather_scatter_info gs_info
;
5911 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info
)
5912 && vect_check_gather_scatter (stmt_info
, loop_vinfo
, &gs_info
))
5915 vect_detect_hybrid_slp (&gs_info
.offset
, &dummy
, &wi
);
5921 /* Initialize a bb_vec_info struct for the statements in BBS basic blocks. */
5923 _bb_vec_info::_bb_vec_info (vec
<basic_block
> _bbs
, vec_info_shared
*shared
)
5924 : vec_info (vec_info::bb
, shared
),
5928 for (unsigned i
= 0; i
< bbs
.length (); ++i
)
5931 for (gphi_iterator si
= gsi_start_phis (bbs
[i
]); !gsi_end_p (si
);
5934 gphi
*phi
= si
.phi ();
5935 gimple_set_uid (phi
, 0);
5938 for (gimple_stmt_iterator gsi
= gsi_start_bb (bbs
[i
]);
5939 !gsi_end_p (gsi
); gsi_next (&gsi
))
5941 gimple
*stmt
= gsi_stmt (gsi
);
5942 gimple_set_uid (stmt
, 0);
5943 if (is_gimple_debug (stmt
))
5951 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
5952 stmts in the basic block. */
5954 _bb_vec_info::~_bb_vec_info ()
5956 /* Reset region marker. */
5957 for (unsigned i
= 0; i
< bbs
.length (); ++i
)
5960 for (gphi_iterator si
= gsi_start_phis (bbs
[i
]); !gsi_end_p (si
);
5963 gphi
*phi
= si
.phi ();
5964 gimple_set_uid (phi
, -1);
5966 for (gimple_stmt_iterator gsi
= gsi_start_bb (bbs
[i
]);
5967 !gsi_end_p (gsi
); gsi_next (&gsi
))
5969 gimple
*stmt
= gsi_stmt (gsi
);
5970 gimple_set_uid (stmt
, -1);
5974 for (unsigned i
= 0; i
< roots
.length (); ++i
)
5976 roots
[i
].stmts
.release ();
5977 roots
[i
].roots
.release ();
5978 roots
[i
].remain
.release ();
/* Subroutine of vect_slp_analyze_node_operations.  Handle the root of NODE,
   given that its child nodes have already been processed and that
   their def types currently match their SLP node's def type.  */
5988 vect_slp_analyze_node_operations_1 (vec_info
*vinfo
, slp_tree node
,
5989 slp_instance node_instance
,
5990 stmt_vector_for_cost
*cost_vec
)
5992 stmt_vec_info stmt_info
= SLP_TREE_REPRESENTATIVE (node
);
5994 /* Calculate the number of vector statements to be created for the
5995 scalar stmts in this node. For SLP reductions it is equal to the
5996 number of vector statements in the children (which has already been
5997 calculated by the recursive call). Otherwise it is the number of
5998 scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
5999 VF divided by the number of elements in a vector. */
6000 if (SLP_TREE_CODE (node
) != VEC_PERM_EXPR
6001 && !STMT_VINFO_DATA_REF (stmt_info
)
6002 && REDUC_GROUP_FIRST_ELEMENT (stmt_info
))
6004 for (unsigned i
= 0; i
< SLP_TREE_CHILDREN (node
).length (); ++i
)
6005 if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node
)[i
]) == vect_internal_def
)
6007 SLP_TREE_NUMBER_OF_VEC_STMTS (node
)
6008 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node
)[i
]);
6015 if (loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
))
6016 vf
= loop_vinfo
->vectorization_factor
;
6019 unsigned int group_size
= SLP_TREE_LANES (node
);
6020 tree vectype
= SLP_TREE_VECTYPE (node
);
6021 SLP_TREE_NUMBER_OF_VEC_STMTS (node
)
6022 = vect_get_num_vectors (vf
* group_size
, vectype
);
6025 /* Handle purely internal nodes. */
6026 if (SLP_TREE_CODE (node
) == VEC_PERM_EXPR
)
6028 if (!vectorizable_slp_permutation (vinfo
, NULL
, node
, cost_vec
))
6031 stmt_vec_info slp_stmt_info
;
6033 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, slp_stmt_info
)
6035 if (STMT_VINFO_LIVE_P (slp_stmt_info
)
6036 && !vectorizable_live_operation (vinfo
, slp_stmt_info
, node
,
6045 return vect_analyze_stmt (vinfo
, stmt_info
, &dummy
,
6046 node
, node_instance
, cost_vec
);
6049 /* Try to build NODE from scalars, returning true on success.
6050 NODE_INSTANCE is the SLP instance that contains NODE. */
6053 vect_slp_convert_to_external (vec_info
*vinfo
, slp_tree node
,
6054 slp_instance node_instance
)
6056 stmt_vec_info stmt_info
;
6059 if (!is_a
<bb_vec_info
> (vinfo
)
6060 || node
== SLP_INSTANCE_TREE (node_instance
)
6061 || !SLP_TREE_SCALAR_STMTS (node
).exists ()
6062 || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node
))
6063 /* Force the mask use to be built from scalars instead. */
6064 || VECTOR_BOOLEAN_TYPE_P (SLP_TREE_VECTYPE (node
)))
6067 if (dump_enabled_p ())
6068 dump_printf_loc (MSG_NOTE
, vect_location
,
6069 "Building vector operands of %p from scalars instead\n",
6072 /* Don't remove and free the child nodes here, since they could be
6073 referenced by other structures. The analysis and scheduling phases
6074 (need to) ignore child nodes of anything that isn't vect_internal_def. */
6075 unsigned int group_size
= SLP_TREE_LANES (node
);
6076 SLP_TREE_DEF_TYPE (node
) = vect_external_def
;
6077 /* Invariants get their vector type from the uses. */
6078 SLP_TREE_VECTYPE (node
) = NULL_TREE
;
6079 SLP_TREE_SCALAR_OPS (node
).safe_grow (group_size
, true);
6080 SLP_TREE_LOAD_PERMUTATION (node
).release ();
6081 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
6083 tree lhs
= gimple_get_lhs (vect_orig_stmt (stmt_info
)->stmt
);
6084 SLP_TREE_SCALAR_OPS (node
)[i
] = lhs
;
/* Return true if all elements of the slice are the same.  */

bool
vect_scalar_ops_slice::all_same_p () const
{
  for (unsigned int i = 1; i < length; ++i)
    if (!operand_equal_p (op (0), op (i)))
      return false;
  return true;
}
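
/* Return a hash value for the scalar-ops slice S, combining the hash of
   each of its operands.  */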
hashval_t
vect_scalar_ops_slice_hash::hash (const value_type &s)
{
  hashval_t hash = 0;
  for (unsigned i = 0; i < s.length; ++i)
    hash = iterative_hash_expr (s.op (i), hash);
  return hash;
}
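
/* Return true if slices S1 and S2 contain the same scalar operands.  */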
bool
vect_scalar_ops_slice_hash::equal (const value_type &s1,
				   const compare_type &s2)
{
  if (s1.length != s2.length)
    return false;
  for (unsigned i = 0; i < s1.length; ++i)
    if (!operand_equal_p (s1.op (i), s2.op (i)))
      return false;
  return true;
}
/* Compute the prologue cost for invariant or constant operands
   represented by NODE.  */
6124 vect_prologue_cost_for_slp (slp_tree node
,
6125 stmt_vector_for_cost
*cost_vec
)
6127 /* There's a special case of an existing vector, that costs nothing. */
6128 if (SLP_TREE_SCALAR_OPS (node
).length () == 0
6129 && !SLP_TREE_VEC_DEFS (node
).is_empty ())
6131 /* Without looking at the actual initializer a vector of
6132 constants can be implemented as load from the constant pool.
6133 When all elements are the same we can use a splat. */
6134 tree vectype
= SLP_TREE_VECTYPE (node
);
6135 unsigned group_size
= SLP_TREE_SCALAR_OPS (node
).length ();
6136 unsigned HOST_WIDE_INT const_nunits
;
6137 unsigned nelt_limit
;
6138 auto ops
= &SLP_TREE_SCALAR_OPS (node
);
6139 auto_vec
<unsigned int> starts (SLP_TREE_NUMBER_OF_VEC_STMTS (node
));
6140 if (TYPE_VECTOR_SUBPARTS (vectype
).is_constant (&const_nunits
)
6141 && ! multiple_p (const_nunits
, group_size
))
6143 nelt_limit
= const_nunits
;
6144 hash_set
<vect_scalar_ops_slice_hash
> vector_ops
;
6145 for (unsigned int i
= 0; i
< SLP_TREE_NUMBER_OF_VEC_STMTS (node
); ++i
)
6146 if (!vector_ops
.add ({ ops
, i
* const_nunits
, const_nunits
}))
6147 starts
.quick_push (i
* const_nunits
);
6151 /* If either the vector has variable length or the vectors
6152 are composed of repeated whole groups we only need to
6153 cost construction once. All vectors will be the same. */
6154 nelt_limit
= group_size
;
6155 starts
.quick_push (0);
6157 /* ??? We're just tracking whether vectors in a single node are the same.
6158 Ideally we'd do something more global. */
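/* Cost each recorded vector start once: a constant-pool load, a splat
   or a full construction depending on its operands.  */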
6159 bool passed
= false;
6160 for (unsigned int start
: starts
)
6162 vect_cost_for_stmt kind
;
6163 if (SLP_TREE_DEF_TYPE (node
) == vect_constant_def
)
6165 else if (vect_scalar_ops_slice
{ ops
, start
, nelt_limit
}.all_same_p ())
6166 kind
= scalar_to_vec
;
6168 kind
= vec_construct
;
6169 /* The target cost hook has no idea which part of the SLP node
6170 we are costing so avoid passing it down more than once. Pass
6171 it to the first vec_construct or scalar_to_vec part since for those
6172 the x86 backend tries to account for GPR to XMM register moves. */
6173 record_stmt_cost (cost_vec
, 1, kind
,
6174 (kind
!= vector_load
&& !passed
) ? node
: nullptr,
6175 vectype
, 0, vect_prologue
);
6176 if (kind
!= vector_load
)
6181 /* Analyze statements contained in SLP tree NODE after recursively analyzing
6182 the subtree. NODE_INSTANCE contains NODE and VINFO contains INSTANCE.
6184 Return true if the operations are supported. */
6187 vect_slp_analyze_node_operations (vec_info
*vinfo
, slp_tree node
,
6188 slp_instance node_instance
,
6189 hash_set
<slp_tree
> &visited_set
,
6190 vec
<slp_tree
> &visited_vec
,
6191 stmt_vector_for_cost
*cost_vec
)
6196 /* Assume we can code-generate all invariants. */
6198 || SLP_TREE_DEF_TYPE (node
) == vect_constant_def
6199 || SLP_TREE_DEF_TYPE (node
) == vect_external_def
)
6202 if (SLP_TREE_DEF_TYPE (node
) == vect_uninitialized_def
)
6204 if (dump_enabled_p ())
6205 dump_printf_loc (MSG_NOTE
, vect_location
,
6206 "Failed cyclic SLP reference in %p\n", (void *) node
);
6209 gcc_assert (SLP_TREE_DEF_TYPE (node
) == vect_internal_def
);
6211 /* If we already analyzed the exact same set of scalar stmts we're done.
6212 We share the generated vector stmts for those. */
6213 if (visited_set
.add (node
))
6215 visited_vec
.safe_push (node
);
6218 unsigned visited_rec_start
= visited_vec
.length ();
6219 unsigned cost_vec_rec_start
= cost_vec
->length ();
6220 bool seen_non_constant_child
= false;
6221 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
6223 res
= vect_slp_analyze_node_operations (vinfo
, child
, node_instance
,
6224 visited_set
, visited_vec
,
6228 if (child
&& SLP_TREE_DEF_TYPE (child
) != vect_constant_def
)
6229 seen_non_constant_child
= true;
/* We're having difficulties scheduling nodes with just constant
   operands and no scalar stmts since we then cannot compute a stmt
   insertion place.  */
6234 if (!seen_non_constant_child
&& SLP_TREE_SCALAR_STMTS (node
).is_empty ())
6236 if (dump_enabled_p ())
6237 dump_printf_loc (MSG_NOTE
, vect_location
,
6238 "Cannot vectorize all-constant op node %p\n",
6244 res
= vect_slp_analyze_node_operations_1 (vinfo
, node
, node_instance
,
6246 /* If analysis failed we have to pop all recursive visited nodes
6250 while (visited_vec
.length () >= visited_rec_start
)
6251 visited_set
.remove (visited_vec
.pop ());
6252 cost_vec
->truncate (cost_vec_rec_start
);
/* When the node can be vectorized cost invariant nodes it references.
   This is not done in DFS order to allow the referring node's
   vectorizable_* calls to nail down the invariant nodes' vector type
   and possibly unshare it if it needs a different vector type than
   the rest.  */
6261 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), j
, child
)
6263 && (SLP_TREE_DEF_TYPE (child
) == vect_constant_def
6264 || SLP_TREE_DEF_TYPE (child
) == vect_external_def
)
6265 /* Perform usual caching, note code-generation still
6266 code-gens these nodes multiple times but we expect
6267 to CSE them later. */
6268 && !visited_set
.add (child
))
6270 visited_vec
.safe_push (child
);
6271 /* ??? After auditing more code paths make a "default"
6272 and push the vector type from NODE to all children
6273 if it is not already set. */
6274 /* Compute the number of vectors to be generated. */
6275 tree vector_type
= SLP_TREE_VECTYPE (child
);
/* For shifts with a scalar argument we don't need
   to cost or code-generate anything.
   ??? Represent this more explicitly.  */
6281 gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node
))
6282 == shift_vec_info_type
)
6286 unsigned group_size
= SLP_TREE_LANES (child
);
6288 if (loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
))
6289 vf
= loop_vinfo
->vectorization_factor
;
6290 SLP_TREE_NUMBER_OF_VEC_STMTS (child
)
6291 = vect_get_num_vectors (vf
* group_size
, vector_type
);
6292 /* And cost them. */
6293 vect_prologue_cost_for_slp (child
, cost_vec
);
6296 /* If this node or any of its children can't be vectorized, try pruning
6297 the tree here rather than felling the whole thing. */
6298 if (!res
&& vect_slp_convert_to_external (vinfo
, node
, node_instance
))
6300 /* We'll need to revisit this for invariant costing and number
6301 of vectorized stmt setting. */
/* Mark lanes of NODE that are live outside of the basic-block vectorized
   region and that can be vectorized using vectorizable_live_operation
   with STMT_VINFO_LIVE_P.  Live operations that are not handled will cause
   the scalar code computing them to be retained.  */
6314 vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo
, slp_tree node
,
6315 slp_instance instance
,
6316 stmt_vector_for_cost
*cost_vec
,
6317 hash_set
<stmt_vec_info
> &svisited
,
6318 hash_set
<slp_tree
> &visited
)
6320 if (visited
.add (node
))
6324 stmt_vec_info stmt_info
;
6325 stmt_vec_info last_stmt
= vect_find_last_scalar_stmt_in_slp (node
);
6326 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
6328 if (svisited
.contains (stmt_info
))
6330 stmt_vec_info orig_stmt_info
= vect_orig_stmt (stmt_info
);
6331 if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info
)
6332 && STMT_VINFO_RELATED_STMT (orig_stmt_info
) != stmt_info
)
6333 /* Only the pattern root stmt computes the original scalar value. */
6335 bool mark_visited
= true;
6336 gimple
*orig_stmt
= orig_stmt_info
->stmt
;
6337 ssa_op_iter op_iter
;
6338 def_operand_p def_p
;
6339 FOR_EACH_PHI_OR_STMT_DEF (def_p
, orig_stmt
, op_iter
, SSA_OP_DEF
)
6341 imm_use_iterator use_iter
;
6343 stmt_vec_info use_stmt_info
;
6344 FOR_EACH_IMM_USE_STMT (use_stmt
, use_iter
, DEF_FROM_PTR (def_p
))
6345 if (!is_gimple_debug (use_stmt
))
6347 use_stmt_info
= bb_vinfo
->lookup_stmt (use_stmt
);
6349 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info
)))
6351 STMT_VINFO_LIVE_P (stmt_info
) = true;
6352 if (vectorizable_live_operation (bb_vinfo
, stmt_info
,
/* ??? So we know we can vectorize the live stmt
   from one SLP node.  If we cannot do so from all
   or none consistently we'd have to record which
   SLP node (and lane) we want to use for the live
   operation.  So make sure we can code-generate
   from all nodes.  */
6361 mark_visited
= false;
6363 STMT_VINFO_LIVE_P (stmt_info
) = false;
/* We have to verify whether we can insert the lane extract
   before all uses.  The following is a conservative approximation.
   We cannot put this into vectorizable_live_operation because
   iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
   doesn't work.
   Note that while the fact that we emit code for loads at the
   first load should make this a non-problem, leaves we construct
   from scalars are vectorized after the last scalar def.
   ??? If we'd actually compute the insert location during
   analysis we could use something less conservative than the last
   scalar stmt in the node for the dominance check.  */
6378 /* ??? What remains is "live" uses in vector CTORs in the same
6379 SLP graph which is where those uses can end up code-generated
6380 right after their definition instead of close to their original
6381 use. But that would restrict us to code-generate lane-extracts
6382 from the latest stmt in a node. So we compensate for this
6383 during code-generation, simply not replacing uses for those
6384 hopefully rare cases. */
6385 if (STMT_VINFO_LIVE_P (stmt_info
))
6386 FOR_EACH_IMM_USE_STMT (use_stmt
, use_iter
, DEF_FROM_PTR (def_p
))
6387 if (!is_gimple_debug (use_stmt
)
6388 && (!(use_stmt_info
= bb_vinfo
->lookup_stmt (use_stmt
))
6389 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info
)))
6390 && !vect_stmt_dominates_stmt_p (last_stmt
->stmt
, use_stmt
))
6392 if (dump_enabled_p ())
6393 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6394 "Cannot determine insertion place for "
6396 STMT_VINFO_LIVE_P (stmt_info
) = false;
6397 mark_visited
= true;
6401 svisited
.add (stmt_info
);
6405 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
6406 if (child
&& SLP_TREE_DEF_TYPE (child
) == vect_internal_def
)
6407 vect_bb_slp_mark_live_stmts (bb_vinfo
, child
, instance
,
6408 cost_vec
, svisited
, visited
);
6411 /* Determine whether we can vectorize the reduction epilogue for INSTANCE. */
6414 vectorizable_bb_reduc_epilogue (slp_instance instance
,
6415 stmt_vector_for_cost
*cost_vec
)
6417 gassign
*stmt
= as_a
<gassign
*> (instance
->root_stmts
[0]->stmt
);
6418 enum tree_code reduc_code
= gimple_assign_rhs_code (stmt
);
6419 if (reduc_code
== MINUS_EXPR
)
6420 reduc_code
= PLUS_EXPR
;
6421 internal_fn reduc_fn
;
6422 tree vectype
= SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (instance
));
6424 || !reduction_fn_for_scalar_code (reduc_code
, &reduc_fn
)
6425 || reduc_fn
== IFN_LAST
6426 || !direct_internal_fn_supported_p (reduc_fn
, vectype
, OPTIMIZE_FOR_BOTH
)
6427 || !useless_type_conversion_p (TREE_TYPE (gimple_assign_lhs (stmt
)),
6428 TREE_TYPE (vectype
)))
6430 if (dump_enabled_p ())
6431 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6432 "not vectorized: basic block reduction epilogue "
6433 "operation unsupported.\n");
6437 /* There's no way to cost a horizontal vector reduction via REDUC_FN so
6438 cost log2 vector operations plus shuffles and one extraction. */
6439 unsigned steps
= floor_log2 (vect_nunits_for_cost (vectype
));
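/* E.g. a vector of 8 elements gives 3 such steps (illustrative numbers).  */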
6440 record_stmt_cost (cost_vec
, steps
, vector_stmt
, instance
->root_stmts
[0],
6441 vectype
, 0, vect_body
);
6442 record_stmt_cost (cost_vec
, steps
, vec_perm
, instance
->root_stmts
[0],
6443 vectype
, 0, vect_body
);
6444 record_stmt_cost (cost_vec
, 1, vec_to_scalar
, instance
->root_stmts
[0],
6445 vectype
, 0, vect_body
);
6447 /* Since we replace all stmts of a possibly longer scalar reduction
6448 chain account for the extra scalar stmts for that. */
6449 record_stmt_cost (cost_vec
, instance
->remain_defs
.length (), scalar_stmt
,
6450 instance
->root_stmts
[0], 0, vect_body
);
6454 /* Prune from ROOTS all stmts that are computed as part of lanes of NODE
6455 and recurse to children. */
6458 vect_slp_prune_covered_roots (slp_tree node
, hash_set
<stmt_vec_info
> &roots
,
6459 hash_set
<slp_tree
> &visited
)
6461 if (SLP_TREE_DEF_TYPE (node
) != vect_internal_def
6462 || visited
.add (node
))
6467 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt
)
6468 roots
.remove (vect_orig_stmt (stmt
));
6471 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
6473 vect_slp_prune_covered_roots (child
, roots
, visited
);
6476 /* Analyze statements in SLP instances of VINFO. Return true if the
6477 operations are supported. */
6480 vect_slp_analyze_operations (vec_info
*vinfo
)
6482 slp_instance instance
;
6485 DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
6487 hash_set
<slp_tree
> visited
;
6488 for (i
= 0; vinfo
->slp_instances
.iterate (i
, &instance
); )
6490 auto_vec
<slp_tree
> visited_vec
;
6491 stmt_vector_for_cost cost_vec
;
6492 cost_vec
.create (2);
6493 if (is_a
<bb_vec_info
> (vinfo
))
6494 vect_location
= instance
->location ();
6495 if (!vect_slp_analyze_node_operations (vinfo
,
6496 SLP_INSTANCE_TREE (instance
),
6497 instance
, visited
, visited_vec
,
6499 /* CTOR instances require vectorized defs for the SLP tree root. */
6500 || (SLP_INSTANCE_KIND (instance
) == slp_inst_kind_ctor
6501 && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance
))
6502 != vect_internal_def
6503 /* Make sure we vectorized with the expected type. */
6504 || !useless_type_conversion_p
6505 (TREE_TYPE (TREE_TYPE (gimple_assign_rhs1
6506 (instance
->root_stmts
[0]->stmt
))),
6507 TREE_TYPE (SLP_TREE_VECTYPE
6508 (SLP_INSTANCE_TREE (instance
))))))
6509 /* Check we can vectorize the reduction. */
6510 || (SLP_INSTANCE_KIND (instance
) == slp_inst_kind_bb_reduc
6511 && !vectorizable_bb_reduc_epilogue (instance
, &cost_vec
)))
6513 slp_tree node
= SLP_INSTANCE_TREE (instance
);
6514 stmt_vec_info stmt_info
;
6515 if (!SLP_INSTANCE_ROOT_STMTS (instance
).is_empty ())
6516 stmt_info
= SLP_INSTANCE_ROOT_STMTS (instance
)[0];
6518 stmt_info
= SLP_TREE_SCALAR_STMTS (node
)[0];
6519 if (dump_enabled_p ())
6520 dump_printf_loc (MSG_NOTE
, vect_location
,
6521 "removing SLP instance operations starting from: %G",
6523 vect_free_slp_instance (instance
);
6524 vinfo
->slp_instances
.ordered_remove (i
);
6525 cost_vec
.release ();
6526 while (!visited_vec
.is_empty ())
6527 visited
.remove (visited_vec
.pop ());
6532 if (loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
))
6534 add_stmt_costs (loop_vinfo
->vector_costs
, &cost_vec
);
6535 cost_vec
.release ();
/* For BB vectorization remember the SLP graph entry
   cost for later.  */
6540 instance
->cost_vec
= cost_vec
;
6544 /* Now look for SLP instances with a root that are covered by other
6545 instances and remove them. */
6546 hash_set
<stmt_vec_info
> roots
;
6547 for (i
= 0; vinfo
->slp_instances
.iterate (i
, &instance
); ++i
)
6548 if (!SLP_INSTANCE_ROOT_STMTS (instance
).is_empty ())
6549 roots
.add (SLP_INSTANCE_ROOT_STMTS (instance
)[0]);
6550 if (!roots
.is_empty ())
6553 for (i
= 0; vinfo
->slp_instances
.iterate (i
, &instance
); ++i
)
6554 vect_slp_prune_covered_roots (SLP_INSTANCE_TREE (instance
), roots
,
6556 for (i
= 0; vinfo
->slp_instances
.iterate (i
, &instance
); )
6557 if (!SLP_INSTANCE_ROOT_STMTS (instance
).is_empty ()
6558 && !roots
.contains (SLP_INSTANCE_ROOT_STMTS (instance
)[0]))
6560 stmt_vec_info root
= SLP_INSTANCE_ROOT_STMTS (instance
)[0];
6561 if (dump_enabled_p ())
6562 dump_printf_loc (MSG_NOTE
, vect_location
,
6563 "removing SLP instance operations starting "
6564 "from: %G", root
->stmt
);
6565 vect_free_slp_instance (instance
);
6566 vinfo
->slp_instances
.ordered_remove (i
);
6572 /* Compute vectorizable live stmts. */
6573 if (bb_vec_info bb_vinfo
= dyn_cast
<bb_vec_info
> (vinfo
))
6575 hash_set
<stmt_vec_info
> svisited
;
6576 hash_set
<slp_tree
> visited
;
6577 for (i
= 0; vinfo
->slp_instances
.iterate (i
, &instance
); ++i
)
6579 vect_location
= instance
->location ();
6580 vect_bb_slp_mark_live_stmts (bb_vinfo
, SLP_INSTANCE_TREE (instance
),
6581 instance
, &instance
->cost_vec
, svisited
,
6586 return !vinfo
->slp_instances
.is_empty ();
6589 /* Get the SLP instance leader from INSTANCE_LEADER thereby transitively
6590 closing the eventual chain. */
static slp_instance
get_ultimate_leader (slp_instance instance,
		     hash_map<slp_instance, slp_instance> &instance_leader)
{
  auto_vec<slp_instance *, 8> chain;
  slp_instance *tem;
  while (*(tem = instance_leader.get (instance)) != instance)
    {
      chain.safe_push (tem);
      instance = *tem;
    }
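  /* Path-compress the chain: make every visited entry point directly at
     the ultimate leader.  */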
  while (!chain.is_empty ())
    *chain.pop () = instance;
  return instance;
}
6609 /* Subroutine of vect_bb_partition_graph_r. Map KEY to INSTANCE in
6610 KEY_TO_INSTANCE, making INSTANCE the leader of any previous mapping
6611 for KEY. Return true if KEY was already in KEY_TO_INSTANCE.
6613 INSTANCE_LEADER is as for get_ultimate_leader. */
6615 template<typename T
>
6617 vect_map_to_instance (slp_instance instance
, T key
,
6618 hash_map
<T
, slp_instance
> &key_to_instance
,
6619 hash_map
<slp_instance
, slp_instance
> &instance_leader
)
6622 slp_instance
&key_instance
= key_to_instance
.get_or_insert (key
, &existed_p
);
6625 else if (key_instance
!= instance
)
6627 /* If we're running into a previously marked key make us the
6628 leader of the current ultimate leader. This keeps the
6629 leader chain acyclic and works even when the current instance
6630 connects two previously independent graph parts. */
6631 slp_instance key_leader
6632 = get_ultimate_leader (key_instance
, instance_leader
);
6633 if (key_leader
!= instance
)
6634 instance_leader
.put (key_leader
, instance
);
6636 key_instance
= instance
;
6641 /* Worker of vect_bb_partition_graph, recurse on NODE. */
6644 vect_bb_partition_graph_r (bb_vec_info bb_vinfo
,
6645 slp_instance instance
, slp_tree node
,
6646 hash_map
<stmt_vec_info
, slp_instance
> &stmt_to_instance
,
6647 hash_map
<slp_tree
, slp_instance
> &node_to_instance
,
6648 hash_map
<slp_instance
, slp_instance
> &instance_leader
)
6650 stmt_vec_info stmt_info
;
6653 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
6654 vect_map_to_instance (instance
, stmt_info
, stmt_to_instance
,
6657 if (vect_map_to_instance (instance
, node
, node_to_instance
,
6662 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
6663 if (child
&& SLP_TREE_DEF_TYPE (child
) == vect_internal_def
)
6664 vect_bb_partition_graph_r (bb_vinfo
, instance
, child
, stmt_to_instance
,
6665 node_to_instance
, instance_leader
);
6668 /* Partition the SLP graph into pieces that can be costed independently. */
6671 vect_bb_partition_graph (bb_vec_info bb_vinfo
)
6673 DUMP_VECT_SCOPE ("vect_bb_partition_graph");
6675 /* First walk the SLP graph assigning each involved scalar stmt a
6676 corresponding SLP graph entry and upon visiting a previously
6677 marked stmt, make the stmts leader the current SLP graph entry. */
6678 hash_map
<stmt_vec_info
, slp_instance
> stmt_to_instance
;
6679 hash_map
<slp_tree
, slp_instance
> node_to_instance
;
6680 hash_map
<slp_instance
, slp_instance
> instance_leader
;
6681 slp_instance instance
;
6682 for (unsigned i
= 0; bb_vinfo
->slp_instances
.iterate (i
, &instance
); ++i
)
6684 instance_leader
.put (instance
, instance
);
6685 vect_bb_partition_graph_r (bb_vinfo
,
6686 instance
, SLP_INSTANCE_TREE (instance
),
6687 stmt_to_instance
, node_to_instance
,
6691 /* Then collect entries to each independent subgraph. */
6692 for (unsigned i
= 0; bb_vinfo
->slp_instances
.iterate (i
, &instance
); ++i
)
6694 slp_instance leader
= get_ultimate_leader (instance
, instance_leader
);
6695 leader
->subgraph_entries
.safe_push (instance
);
6696 if (dump_enabled_p ()
6697 && leader
!= instance
)
6698 dump_printf_loc (MSG_NOTE
, vect_location
,
6699 "instance %p is leader of %p\n",
6700 (void *) leader
, (void *) instance
);
/* Compute the set of scalar stmts participating in internal and external
   nodes.  */
6708 vect_slp_gather_vectorized_scalar_stmts (vec_info
*vinfo
, slp_tree node
,
6709 hash_set
<slp_tree
> &visited
,
6710 hash_set
<stmt_vec_info
> &vstmts
,
6711 hash_set
<stmt_vec_info
> &estmts
)
6714 stmt_vec_info stmt_info
;
6717 if (visited
.add (node
))
6720 if (SLP_TREE_DEF_TYPE (node
) == vect_internal_def
)
6722 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
6723 vstmts
.add (stmt_info
);
6725 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
6727 vect_slp_gather_vectorized_scalar_stmts (vinfo
, child
, visited
,
6731 for (tree def
: SLP_TREE_SCALAR_OPS (node
))
6733 stmt_vec_info def_stmt
= vinfo
->lookup_def (def
);
6735 estmts
.add (def_stmt
);
6740 /* Compute the scalar cost of the SLP node NODE and its children
6741 and return it. Do not account defs that are marked in LIFE and
6742 update LIFE according to uses of NODE. */
6745 vect_bb_slp_scalar_cost (vec_info
*vinfo
,
6746 slp_tree node
, vec
<bool, va_heap
> *life
,
6747 stmt_vector_for_cost
*cost_vec
,
6748 hash_set
<stmt_vec_info
> &vectorized_scalar_stmts
,
6749 hash_set
<slp_tree
> &visited
)
6752 stmt_vec_info stmt_info
;
6755 if (visited
.add (node
))
6758 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
6760 ssa_op_iter op_iter
;
6761 def_operand_p def_p
;
6766 stmt_vec_info orig_stmt_info
= vect_orig_stmt (stmt_info
);
6767 gimple
*orig_stmt
= orig_stmt_info
->stmt
;
/* If there is a non-vectorized use of the defs then the scalar
   stmt is kept live in which case we do not account it or any
   required defs in the SLP children in the scalar cost.  This
   way we make the vectorization more costly when compared to
   the scalar cost.  */
6774 if (!STMT_VINFO_LIVE_P (stmt_info
))
6776 auto_vec
<gimple
*, 8> worklist
;
6777 hash_set
<gimple
*> *worklist_visited
= NULL
;
6778 worklist
.quick_push (orig_stmt
);
6781 gimple
*work_stmt
= worklist
.pop ();
6782 FOR_EACH_PHI_OR_STMT_DEF (def_p
, work_stmt
, op_iter
, SSA_OP_DEF
)
6784 imm_use_iterator use_iter
;
6786 FOR_EACH_IMM_USE_STMT (use_stmt
, use_iter
,
6787 DEF_FROM_PTR (def_p
))
6788 if (!is_gimple_debug (use_stmt
))
6790 stmt_vec_info use_stmt_info
6791 = vinfo
->lookup_stmt (use_stmt
);
6793 || !vectorized_scalar_stmts
.contains (use_stmt_info
))
6796 && STMT_VINFO_IN_PATTERN_P (use_stmt_info
))
6798 /* For stmts participating in patterns we have
6799 to check its uses recursively. */
6800 if (!worklist_visited
)
6801 worklist_visited
= new hash_set
<gimple
*> ();
6802 if (!worklist_visited
->add (use_stmt
))
6803 worklist
.safe_push (use_stmt
);
6812 while (!worklist
.is_empty ());
6814 if (worklist_visited
)
6815 delete worklist_visited
;
6820 /* Count scalar stmts only once. */
6821 if (gimple_visited_p (orig_stmt
))
6823 gimple_set_visited (orig_stmt
, true);
6825 vect_cost_for_stmt kind
;
6826 if (STMT_VINFO_DATA_REF (orig_stmt_info
))
6828 if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info
)))
6831 kind
= scalar_store
;
6833 else if (vect_nop_conversion_p (orig_stmt_info
))
6835 /* For single-argument PHIs assume coalescing which means zero cost
6836 for the scalar and the vector PHIs. This avoids artificially
6837 favoring the vector path (but may pessimize it in some cases). */
6838 else if (is_a
<gphi
*> (orig_stmt_info
->stmt
)
6839 && gimple_phi_num_args
6840 (as_a
<gphi
*> (orig_stmt_info
->stmt
)) == 1)
6844 record_stmt_cost (cost_vec
, 1, kind
, orig_stmt_info
,
6845 SLP_TREE_VECTYPE (node
), 0, vect_body
);
6848 auto_vec
<bool, 20> subtree_life
;
6849 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
6851 if (child
&& SLP_TREE_DEF_TYPE (child
) == vect_internal_def
)
6853 /* Do not directly pass LIFE to the recursive call, copy it to
6854 confine changes in the callee to the current child/subtree. */
6855 if (SLP_TREE_CODE (node
) == VEC_PERM_EXPR
)
6857 subtree_life
.safe_grow_cleared (SLP_TREE_LANES (child
), true);
6858 for (unsigned j
= 0;
6859 j
< SLP_TREE_LANE_PERMUTATION (node
).length (); ++j
)
6861 auto perm
= SLP_TREE_LANE_PERMUTATION (node
)[j
];
6862 if (perm
.first
== i
)
6863 subtree_life
[perm
.second
] = (*life
)[j
];
6868 gcc_assert (SLP_TREE_LANES (node
) == SLP_TREE_LANES (child
));
6869 subtree_life
.safe_splice (*life
);
6871 vect_bb_slp_scalar_cost (vinfo
, child
, &subtree_life
, cost_vec
,
6872 vectorized_scalar_stmts
, visited
);
6873 subtree_life
.truncate (0);
/* Comparator for the loop-index sorted cost vectors.  */

static int
li_cost_vec_cmp (const void *a_, const void *b_)
{
  auto *a = (const std::pair<unsigned, stmt_info_for_cost *> *)a_;
  auto *b = (const std::pair<unsigned, stmt_info_for_cost *> *)b_;
  if (a->first < b->first)
    return -1;
  else if (a->first == b->first)
    return 0;
  return 1;
}
6892 /* Check if vectorization of the basic block is profitable for the
6893 subgraph denoted by SLP_INSTANCES. */
6896 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo
,
6897 vec
<slp_instance
> slp_instances
,
6900 slp_instance instance
;
6902 unsigned int vec_inside_cost
= 0, vec_outside_cost
= 0, scalar_cost
= 0;
6903 unsigned int vec_prologue_cost
= 0, vec_epilogue_cost
= 0;
6905 if (dump_enabled_p ())
6907 dump_printf_loc (MSG_NOTE
, vect_location
, "Costing subgraph: \n");
6908 hash_set
<slp_tree
> visited
;
6909 FOR_EACH_VEC_ELT (slp_instances
, i
, instance
)
6910 vect_print_slp_graph (MSG_NOTE
, vect_location
,
6911 SLP_INSTANCE_TREE (instance
), visited
);
6914 /* Compute the set of scalar stmts we know will go away 'locally' when
6915 vectorizing. This used to be tracked with just PURE_SLP_STMT but that's
6916 not accurate for nodes promoted extern late or for scalar stmts that
6917 are used both in extern defs and in vectorized defs. */
6918 hash_set
<stmt_vec_info
> vectorized_scalar_stmts
;
6919 hash_set
<stmt_vec_info
> scalar_stmts_in_externs
;
6920 hash_set
<slp_tree
> visited
;
6921 FOR_EACH_VEC_ELT (slp_instances
, i
, instance
)
6923 vect_slp_gather_vectorized_scalar_stmts (bb_vinfo
,
6924 SLP_INSTANCE_TREE (instance
),
6926 vectorized_scalar_stmts
,
6927 scalar_stmts_in_externs
);
6928 for (stmt_vec_info rstmt
: SLP_INSTANCE_ROOT_STMTS (instance
))
6929 vectorized_scalar_stmts
.add (rstmt
);
/* Scalar stmts used as defs in external nodes need to be preserved, so
   remove them from vectorized_scalar_stmts.  */
6933 for (stmt_vec_info stmt
: scalar_stmts_in_externs
)
6934 vectorized_scalar_stmts
.remove (stmt
);
6936 /* Calculate scalar cost and sum the cost for the vector stmts
6937 previously collected. */
6938 stmt_vector_for_cost scalar_costs
= vNULL
;
6939 stmt_vector_for_cost vector_costs
= vNULL
;
6941 FOR_EACH_VEC_ELT (slp_instances
, i
, instance
)
6943 auto_vec
<bool, 20> life
;
6944 life
.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance
)),
6946 if (!SLP_INSTANCE_ROOT_STMTS (instance
).is_empty ())
6947 record_stmt_cost (&scalar_costs
,
6948 SLP_INSTANCE_ROOT_STMTS (instance
).length (),
6950 SLP_INSTANCE_ROOT_STMTS (instance
)[0], 0, vect_body
);
6951 vect_bb_slp_scalar_cost (bb_vinfo
,
6952 SLP_INSTANCE_TREE (instance
),
6953 &life
, &scalar_costs
, vectorized_scalar_stmts
,
6955 vector_costs
.safe_splice (instance
->cost_vec
);
6956 instance
->cost_vec
.release ();
6959 if (dump_enabled_p ())
6960 dump_printf_loc (MSG_NOTE
, vect_location
, "Cost model analysis: \n");
6962 /* When costing non-loop vectorization we need to consider each covered
6963 loop independently and make sure vectorization is profitable. For
6964 now we assume a loop may be not entered or executed an arbitrary
6965 number of iterations (??? static information can provide more
6966 precise info here) which means we can simply cost each containing
6967 loops stmts separately. */
6969 /* First produce cost vectors sorted by loop index. */
6970 auto_vec
<std::pair
<unsigned, stmt_info_for_cost
*> >
6971 li_scalar_costs (scalar_costs
.length ());
6972 auto_vec
<std::pair
<unsigned, stmt_info_for_cost
*> >
6973 li_vector_costs (vector_costs
.length ());
6974 stmt_info_for_cost
*cost
;
6975 FOR_EACH_VEC_ELT (scalar_costs
, i
, cost
)
6977 unsigned l
= gimple_bb (cost
->stmt_info
->stmt
)->loop_father
->num
;
6978 li_scalar_costs
.quick_push (std::make_pair (l
, cost
));
6980 /* Use a random used loop as fallback in case the first vector_costs
6981 entry does not have a stmt_info associated with it. */
6982 unsigned l
= li_scalar_costs
[0].first
;
6983 FOR_EACH_VEC_ELT (vector_costs
, i
, cost
)
6985 /* We inherit from the previous COST, invariants, externals and
6986 extracts immediately follow the cost for the related stmt. */
6987 if (cost
->stmt_info
)
6988 l
= gimple_bb (cost
->stmt_info
->stmt
)->loop_father
->num
;
6989 li_vector_costs
.quick_push (std::make_pair (l
, cost
));
6991 li_scalar_costs
.qsort (li_cost_vec_cmp
);
6992 li_vector_costs
.qsort (li_cost_vec_cmp
);
6994 /* Now cost the portions individually. */
6997 bool profitable
= true;
6998 while (si
< li_scalar_costs
.length ()
6999 && vi
< li_vector_costs
.length ())
7001 unsigned sl
= li_scalar_costs
[si
].first
;
7002 unsigned vl
= li_vector_costs
[vi
].first
;
7005 if (dump_enabled_p ())
7006 dump_printf_loc (MSG_NOTE
, vect_location
,
7007 "Scalar %d and vector %d loop part do not "
7008 "match up, skipping scalar part\n", sl
, vl
);
7009 /* Skip the scalar part, assuming zero cost on the vector side. */
7014 while (si
< li_scalar_costs
.length ()
7015 && li_scalar_costs
[si
].first
== sl
);
7019 class vector_costs
*scalar_target_cost_data
= init_cost (bb_vinfo
, true);
7022 add_stmt_cost (scalar_target_cost_data
, li_scalar_costs
[si
].second
);
7025 while (si
< li_scalar_costs
.length ()
7026 && li_scalar_costs
[si
].first
== sl
);
7028 finish_cost (scalar_target_cost_data
, nullptr,
7029 &dummy
, &scalar_cost
, &dummy
);
7031 /* Complete the target-specific vector cost calculation. */
7032 class vector_costs
*vect_target_cost_data
= init_cost (bb_vinfo
, false);
7035 add_stmt_cost (vect_target_cost_data
, li_vector_costs
[vi
].second
);
7038 while (vi
< li_vector_costs
.length ()
7039 && li_vector_costs
[vi
].first
== vl
);
7040 finish_cost (vect_target_cost_data
, scalar_target_cost_data
,
7041 &vec_prologue_cost
, &vec_inside_cost
, &vec_epilogue_cost
);
7042 delete scalar_target_cost_data
;
7043 delete vect_target_cost_data
;
7045 vec_outside_cost
= vec_prologue_cost
+ vec_epilogue_cost
;
7047 if (dump_enabled_p ())
7049 dump_printf_loc (MSG_NOTE
, vect_location
,
7050 "Cost model analysis for part in loop %d:\n", sl
);
7051 dump_printf (MSG_NOTE
, " Vector cost: %d\n",
7052 vec_inside_cost
+ vec_outside_cost
);
7053 dump_printf (MSG_NOTE
, " Scalar cost: %d\n", scalar_cost
);
/* Vectorization is profitable if its cost is not more than the cost of
   the scalar version.  Note that we err on the vector side for equal cost
   because the cost estimate is otherwise quite pessimistic (constant uses
   are free on the scalar side but cost a load on the vector side, for
   example).  */
7061 if (vec_outside_cost
+ vec_inside_cost
> scalar_cost
)
7067 if (profitable
&& vi
< li_vector_costs
.length ())
7069 if (dump_enabled_p ())
7070 dump_printf_loc (MSG_NOTE
, vect_location
,
7071 "Excess vector cost for part in loop %d:\n",
7072 li_vector_costs
[vi
].first
);
7076 /* Unset visited flag. This is delayed when the subgraph is profitable
7077 and we process the loop for remaining unvectorized if-converted code. */
7078 if (!orig_loop
|| !profitable
)
7079 FOR_EACH_VEC_ELT (scalar_costs
, i
, cost
)
7080 gimple_set_visited (cost
->stmt_info
->stmt
, false);
7082 scalar_costs
.release ();
7083 vector_costs
.release ();
/* qsort comparator for lane defs.  */

static int
vld_cmp (const void *a_, const void *b_)
{
  auto *a = (const std::pair<unsigned, tree> *)a_;
  auto *b = (const std::pair<unsigned, tree> *)b_;
  return a->first - b->first;
}
7098 /* Return true if USE_STMT is a vector lane insert into VEC and set
7099 *THIS_LANE to the lane number that is set. */
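/* Such an insert is a BIT_INSERT_EXPR whose inserted value has the
   vector element type and whose bit position is a constant multiple of
   the element size.  */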
7102 vect_slp_is_lane_insert (gimple
*use_stmt
, tree vec
, unsigned *this_lane
)
7104 gassign
*use_ass
= dyn_cast
<gassign
*> (use_stmt
);
7106 || gimple_assign_rhs_code (use_ass
) != BIT_INSERT_EXPR
7108 ? gimple_assign_rhs1 (use_ass
) != vec
7109 : ((vec
= gimple_assign_rhs1 (use_ass
)), false))
7110 || !useless_type_conversion_p (TREE_TYPE (TREE_TYPE (vec
)),
7111 TREE_TYPE (gimple_assign_rhs2 (use_ass
)))
7112 || !constant_multiple_p
7113 (tree_to_poly_uint64 (gimple_assign_rhs3 (use_ass
)),
7114 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (TREE_TYPE (vec
)))),
/* Find any vectorizable constructors and add them to the grouped_store
   array.  */
7124 vect_slp_check_for_roots (bb_vec_info bb_vinfo
)
7126 for (unsigned i
= 0; i
< bb_vinfo
->bbs
.length (); ++i
)
7127 for (gimple_stmt_iterator gsi
= gsi_start_bb (bb_vinfo
->bbs
[i
]);
7128 !gsi_end_p (gsi
); gsi_next (&gsi
))
7130 gassign
*assign
= dyn_cast
<gassign
*> (gsi_stmt (gsi
));
7134 tree rhs
= gimple_assign_rhs1 (assign
);
7135 enum tree_code code
= gimple_assign_rhs_code (assign
);
7136 use_operand_p use_p
;
7138 if (code
== CONSTRUCTOR
)
7140 if (!VECTOR_TYPE_P (TREE_TYPE (rhs
))
7141 || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs
)),
7142 CONSTRUCTOR_NELTS (rhs
))
7143 || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs
, 0)->value
))
7144 || uniform_vector_p (rhs
))
7149 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs
), j
, val
)
7150 if (TREE_CODE (val
) != SSA_NAME
7151 || !bb_vinfo
->lookup_def (val
))
7153 if (j
!= CONSTRUCTOR_NELTS (rhs
))
7156 vec
<stmt_vec_info
> roots
= vNULL
;
7157 roots
.safe_push (bb_vinfo
->lookup_stmt (assign
));
7158 vec
<stmt_vec_info
> stmts
;
7159 stmts
.create (CONSTRUCTOR_NELTS (rhs
));
7160 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs
), j
, val
)
7162 (vect_stmt_to_vectorize (bb_vinfo
->lookup_def (val
)));
7163 bb_vinfo
->roots
.safe_push (slp_root (slp_inst_kind_ctor
,
7166 else if (code
== BIT_INSERT_EXPR
7167 && VECTOR_TYPE_P (TREE_TYPE (rhs
))
7168 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs
)).is_constant ()
7169 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs
)).to_constant () > 1
7170 && integer_zerop (gimple_assign_rhs3 (assign
))
7171 && useless_type_conversion_p
7172 (TREE_TYPE (TREE_TYPE (rhs
)),
7173 TREE_TYPE (gimple_assign_rhs2 (assign
)))
7174 && bb_vinfo
->lookup_def (gimple_assign_rhs2 (assign
)))
7176 /* We start to match on insert to lane zero but since the
7177 inserts need not be ordered we'd have to search both
7178 the def and the use chains. */
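/* Track the lanes covered so far and the scalar def feeding each lane;
   the matched insert seeds lane zero.  */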
7179 tree vectype
= TREE_TYPE (rhs
);
7180 unsigned nlanes
= TYPE_VECTOR_SUBPARTS (vectype
).to_constant ();
7181 auto_vec
<std::pair
<unsigned, tree
> > lane_defs (nlanes
);
7182 auto_sbitmap
lanes (nlanes
);
7183 bitmap_clear (lanes
);
7184 bitmap_set_bit (lanes
, 0);
7185 tree def
= gimple_assign_lhs (assign
);
7186 lane_defs
.quick_push
7187 (std::make_pair (0, gimple_assign_rhs2 (assign
)));
7188 unsigned lanes_found
= 1;
7189 /* Start with the use chains, the last stmt will be the root. */
7190 stmt_vec_info last
= bb_vinfo
->lookup_stmt (assign
);
7191 vec
<stmt_vec_info
> roots
= vNULL
;
7192 roots
.safe_push (last
);
7195 use_operand_p use_p
;
7197 if (!single_imm_use (def
, &use_p
, &use_stmt
))
7200 if (!bb_vinfo
->lookup_stmt (use_stmt
)
7201 || !vect_slp_is_lane_insert (use_stmt
, def
, &this_lane
)
7202 || !bb_vinfo
->lookup_def (gimple_assign_rhs2 (use_stmt
)))
7204 if (bitmap_bit_p (lanes
, this_lane
))
7207 bitmap_set_bit (lanes
, this_lane
);
7208 gassign
*use_ass
= as_a
<gassign
*> (use_stmt
);
7209 lane_defs
.quick_push (std::make_pair
7210 (this_lane
, gimple_assign_rhs2 (use_ass
)));
7211 last
= bb_vinfo
->lookup_stmt (use_ass
);
7212 roots
.safe_push (last
);
7213 def
= gimple_assign_lhs (use_ass
);
7215 while (lanes_found
< nlanes
);
7216 if (roots
.length () > 1)
7217 std::swap(roots
[0], roots
[roots
.length () - 1]);
7218 if (lanes_found
< nlanes
)
7220 /* Now search the def chain. */
7221 def
= gimple_assign_rhs1 (assign
);
7224 if (TREE_CODE (def
) != SSA_NAME
7225 || !has_single_use (def
))
7227 gimple
*def_stmt
= SSA_NAME_DEF_STMT (def
);
7229 if (!bb_vinfo
->lookup_stmt (def_stmt
)
7230 || !vect_slp_is_lane_insert (def_stmt
,
7231 NULL_TREE
, &this_lane
)
7232 || !bb_vinfo
->lookup_def (gimple_assign_rhs2 (def_stmt
)))
7234 if (bitmap_bit_p (lanes
, this_lane
))
7237 bitmap_set_bit (lanes
, this_lane
);
7238 lane_defs
.quick_push (std::make_pair
7240 gimple_assign_rhs2 (def_stmt
)));
7241 roots
.safe_push (bb_vinfo
->lookup_stmt (def_stmt
));
7242 def
= gimple_assign_rhs1 (def_stmt
);
7244 while (lanes_found
< nlanes
);
7246 if (lanes_found
== nlanes
)
/* Sort lane_defs by lane index and register the root.  */
7249 lane_defs
.qsort (vld_cmp
);
7250 vec
<stmt_vec_info
> stmts
;
7251 stmts
.create (nlanes
);
7252 for (unsigned i
= 0; i
< nlanes
; ++i
)
7253 stmts
.quick_push (bb_vinfo
->lookup_def (lane_defs
[i
].second
));
7254 bb_vinfo
->roots
.safe_push (slp_root (slp_inst_kind_ctor
,
7260 else if (!VECTOR_TYPE_P (TREE_TYPE (rhs
))
7261 && (associative_tree_code (code
) || code
== MINUS_EXPR
)
7262 /* ??? This pessimizes a two-element reduction. PR54400.
7263 ??? In-order reduction could be handled if we only
7264 traverse one operand chain in vect_slp_linearize_chain. */
7265 && !needs_fold_left_reduction_p (TREE_TYPE (rhs
), code
)
7266 /* Ops with constants at the tail can be stripped here. */
7267 && TREE_CODE (rhs
) == SSA_NAME
7268 && TREE_CODE (gimple_assign_rhs2 (assign
)) == SSA_NAME
7269 /* Should be the chain end. */
7270 && (!single_imm_use (gimple_assign_lhs (assign
),
7272 || !is_gimple_assign (use_stmt
)
7273 || (gimple_assign_rhs_code (use_stmt
) != code
7274 && ((code
!= PLUS_EXPR
&& code
!= MINUS_EXPR
)
7275 || (gimple_assign_rhs_code (use_stmt
)
7276 != (code
== PLUS_EXPR
? MINUS_EXPR
: PLUS_EXPR
))))))
/* We start the match at the end of a possible association
   chain.  */
7280 auto_vec
<chain_op_t
> chain
;
7281 auto_vec
<std::pair
<tree_code
, gimple
*> > worklist
;
7282 auto_vec
<gimple
*> chain_stmts
;
7283 gimple
*code_stmt
= NULL
, *alt_code_stmt
= NULL
;
7284 if (code
== MINUS_EXPR
)
7286 internal_fn reduc_fn
;
7287 if (!reduction_fn_for_scalar_code (code
, &reduc_fn
)
7288 || reduc_fn
== IFN_LAST
)
7290 vect_slp_linearize_chain (bb_vinfo
, worklist
, chain
, code
, assign
,
7292 code_stmt
, alt_code_stmt
, &chain_stmts
);
7293 if (chain
.length () > 1)
7295 /* Sort the chain according to def_type and operation. */
7296 chain
.sort (dt_sort_cmp
, bb_vinfo
);
7297 /* ??? Now we'd want to strip externals and constants
7298 but record those to be handled in the epilogue. */
7299 /* ??? For now do not allow mixing ops or externs/constants. */
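/* Count chain entries that are not internal defs; they are kept as
   scalar operands in the REMAIN vector.  */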
7300 bool invalid
= false;
7301 unsigned remain_cnt
= 0;
7302 for (unsigned i
= 0; i
< chain
.length (); ++i
)
7304 if (chain
[i
].code
!= code
)
7309 if (chain
[i
].dt
!= vect_internal_def
)
7312 if (!invalid
&& chain
.length () - remain_cnt
> 1)
7314 vec
<stmt_vec_info
> stmts
;
7315 vec
<tree
> remain
= vNULL
;
7316 stmts
.create (chain
.length ());
7318 remain
.create (remain_cnt
);
7319 for (unsigned i
= 0; i
< chain
.length (); ++i
)
7321 if (chain
[i
].dt
== vect_internal_def
)
7322 stmts
.quick_push (bb_vinfo
->lookup_def (chain
[i
].op
));
7324 remain
.quick_push (chain
[i
].op
);
7326 vec
<stmt_vec_info
> roots
;
7327 roots
.create (chain_stmts
.length ());
7328 for (unsigned i
= 0; i
< chain_stmts
.length (); ++i
)
7329 roots
.quick_push (bb_vinfo
->lookup_stmt (chain_stmts
[i
]));
7330 bb_vinfo
->roots
.safe_push (slp_root (slp_inst_kind_bb_reduc
,
7331 stmts
, roots
, remain
));
7338 /* Walk the grouped store chains and replace entries with their
7339 pattern variant if any. */
7342 vect_fixup_store_groups_with_patterns (vec_info
*vinfo
)
7344 stmt_vec_info first_element
;
7347 FOR_EACH_VEC_ELT (vinfo
->grouped_stores
, i
, first_element
)
7349 /* We also have CTORs in this array. */
7350 if (!STMT_VINFO_GROUPED_ACCESS (first_element
))
7352 if (STMT_VINFO_IN_PATTERN_P (first_element
))
7354 stmt_vec_info orig
= first_element
;
7355 first_element
= STMT_VINFO_RELATED_STMT (first_element
);
7356 DR_GROUP_FIRST_ELEMENT (first_element
) = first_element
;
7357 DR_GROUP_SIZE (first_element
) = DR_GROUP_SIZE (orig
);
7358 DR_GROUP_GAP (first_element
) = DR_GROUP_GAP (orig
);
7359 DR_GROUP_NEXT_ELEMENT (first_element
) = DR_GROUP_NEXT_ELEMENT (orig
);
7360 vinfo
->grouped_stores
[i
] = first_element
;
7362 stmt_vec_info prev
= first_element
;
7363 while (DR_GROUP_NEXT_ELEMENT (prev
))
7365 stmt_vec_info elt
= DR_GROUP_NEXT_ELEMENT (prev
);
7366 if (STMT_VINFO_IN_PATTERN_P (elt
))
7368 stmt_vec_info orig
= elt
;
7369 elt
= STMT_VINFO_RELATED_STMT (elt
);
7370 DR_GROUP_NEXT_ELEMENT (prev
) = elt
;
7371 DR_GROUP_GAP (elt
) = DR_GROUP_GAP (orig
);
7372 DR_GROUP_NEXT_ELEMENT (elt
) = DR_GROUP_NEXT_ELEMENT (orig
);
7374 DR_GROUP_FIRST_ELEMENT (elt
) = first_element
;
/* Check if the region described by BB_VINFO can be vectorized, returning
   true if so.  When returning false, set FATAL to true if the same failure
   would prevent vectorization at other vector sizes, false if it is still
   worth trying other sizes.  N_STMTS is the number of statements in the
   region.  */
7387 vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo
, int n_stmts
, bool &fatal
,
7388 vec
<int> *dataref_groups
)
7390 DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
7392 slp_instance instance
;
7394 poly_uint64 min_vf
= 2;
7396 /* The first group of checks is independent of the vector size. */
7399 /* Analyze the data references. */
7401 if (!vect_analyze_data_refs (bb_vinfo
, &min_vf
, NULL
))
7403 if (dump_enabled_p ())
7404 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7405 "not vectorized: unhandled data-ref in basic "
7410 if (!vect_analyze_data_ref_accesses (bb_vinfo
, dataref_groups
))
7412 if (dump_enabled_p ())
7413 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7414 "not vectorized: unhandled data access in "
7419 vect_slp_check_for_roots (bb_vinfo
);
7421 /* If there are no grouped stores and no constructors in the region
7422 there is no need to continue with pattern recog as vect_analyze_slp
7423 will fail anyway. */
7424 if (bb_vinfo
->grouped_stores
.is_empty ()
7425 && bb_vinfo
->roots
.is_empty ())
7427 if (dump_enabled_p ())
7428 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7429 "not vectorized: no grouped stores in "
/* While the rest of the analysis below depends on the vector size
   in some way.  */
7437 vect_pattern_recog (bb_vinfo
);
7439 /* Update store groups from pattern processing. */
7440 vect_fixup_store_groups_with_patterns (bb_vinfo
);
/* Check the SLP opportunities in the basic block, analyze and build
   SLP trees.  */
7444 if (!vect_analyze_slp (bb_vinfo
, n_stmts
))
7446 if (dump_enabled_p ())
7448 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7449 "Failed to SLP the basic block.\n");
7450 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7451 "not vectorized: failed to find SLP opportunities "
7452 "in basic block.\n");
7457 /* Optimize permutations. */
7458 vect_optimize_slp (bb_vinfo
);
7460 /* Gather the loads reachable from the SLP graph entries. */
7461 vect_gather_slp_loads (bb_vinfo
);
7463 vect_record_base_alignments (bb_vinfo
);
7465 /* Analyze and verify the alignment of data references and the
7466 dependence in the SLP instances. */
7467 for (i
= 0; BB_VINFO_SLP_INSTANCES (bb_vinfo
).iterate (i
, &instance
); )
7469 vect_location
= instance
->location ();
7470 if (! vect_slp_analyze_instance_alignment (bb_vinfo
, instance
)
7471 || ! vect_slp_analyze_instance_dependence (bb_vinfo
, instance
))
7473 slp_tree node
= SLP_INSTANCE_TREE (instance
);
7474 stmt_vec_info stmt_info
= SLP_TREE_SCALAR_STMTS (node
)[0];
7475 if (dump_enabled_p ())
7476 dump_printf_loc (MSG_NOTE
, vect_location
,
7477 "removing SLP instance operations starting from: %G",
7479 vect_free_slp_instance (instance
);
7480 BB_VINFO_SLP_INSTANCES (bb_vinfo
).ordered_remove (i
);
7484 /* Mark all the statements that we want to vectorize as pure SLP and
7486 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance
));
7487 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance
));
7490 /* Likewise consider instance root stmts as vectorized. */
7491 FOR_EACH_VEC_ELT (SLP_INSTANCE_ROOT_STMTS (instance
), j
, root
)
7492 STMT_SLP_TYPE (root
) = pure_slp
;
7496 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo
).length ())
7499 if (!vect_slp_analyze_operations (bb_vinfo
))
7501 if (dump_enabled_p ())
7502 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7503 "not vectorized: bad operation in basic block.\n");
7507 vect_bb_partition_graph (bb_vinfo
);
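
/* Illustrative sketch (not part of the vectorizer): the instance loop above
   only advances the index when an instance survives, so ordered_remove does
   not skip the element that slides into the freed slot.  Below is the same
   iterate-or-remove idiom on a std::vector, with a hypothetical 'keep'
   predicate standing in for the alignment/dependence checks.  */
#if 0
#include <vector>

static void
filter_in_place (std::vector<int> &xs, bool (*keep) (int))
{
  for (unsigned i = 0; i < xs.size (); )
    {
      if (!keep (xs[i]))
	{
	  /* Like ordered_remove: erase and re-test the element that now
	     occupies slot I.  */
	  xs.erase (xs.begin () + i);
	  continue;
	}
      ++i;
    }
}
#endif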
/* Subroutine of vect_slp_bb.  Try to vectorize the statements for all
   basic blocks in BBS, returning true on success.
   The region has N_STMTS statements and has the datarefs given by
   DATAREFS.  */

static bool
vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
		 vec<int> *dataref_groups, unsigned int n_stmts,
		 loop_p orig_loop)
{
  bb_vec_info bb_vinfo;
  auto_vector_modes vector_modes;

  /* Autodetect first vector size we try.  */
  machine_mode next_vector_mode = VOIDmode;
  targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
  unsigned int mode_i = 0;

  vec_info_shared shared;

  machine_mode autodetected_vector_mode = VOIDmode;
  while (1)
    {
      bool vectorized = false;
      bool fatal = false;
      bb_vinfo = new _bb_vec_info (bbs, &shared);

      bool first_time_p = shared.datarefs.is_empty ();
      BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
      if (first_time_p)
	bb_vinfo->shared->save_datarefs ();
      else
	bb_vinfo->shared->check_datarefs ();
      bb_vinfo->vector_mode = next_vector_mode;

      if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups))
	{
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_NOTE, vect_location,
			       "***** Analysis succeeded with vector mode"
			       " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
	      dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
	    }

	  bb_vinfo->shared->check_datarefs ();

	  auto_vec<slp_instance> profitable_subgraphs;
	  for (slp_instance instance : BB_VINFO_SLP_INSTANCES (bb_vinfo))
	    {
	      if (instance->subgraph_entries.is_empty ())
		continue;

	      dump_user_location_t saved_vect_location = vect_location;
	      vect_location = instance->location ();
	      if (!unlimited_cost_model (NULL)
		  && !vect_bb_vectorization_profitable_p
			(bb_vinfo, instance->subgraph_entries, orig_loop))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "not vectorized: vectorization is not "
				     "profitable.\n");
		  vect_location = saved_vect_location;
		  continue;
		}

	      vect_location = saved_vect_location;
	      if (!dbg_cnt (vect_slp))
		continue;

	      profitable_subgraphs.safe_push (instance);
	    }

	  /* When we're vectorizing an if-converted loop body make sure
	     we vectorized all if-converted code.  */
	  if (!profitable_subgraphs.is_empty ()
	      && orig_loop)
	    {
	      gcc_assert (bb_vinfo->bbs.length () == 1);
	      for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[0]);
		   !gsi_end_p (gsi); gsi_next (&gsi))
		{
		  /* The costing above left us with DCEable vectorized scalar
		     stmts having the visited flag set on profitable
		     subgraphs.  Do the delayed clearing of the flag here.  */
		  if (gimple_visited_p (gsi_stmt (gsi)))
		    {
		      gimple_set_visited (gsi_stmt (gsi), false);
		      continue;
		    }
		  if (flag_vect_cost_model == VECT_COST_MODEL_UNLIMITED)
		    continue;

		  if (gassign *ass = dyn_cast <gassign *> (gsi_stmt (gsi)))
		    if (gimple_assign_rhs_code (ass) == COND_EXPR)
		      {
			if (!profitable_subgraphs.is_empty ()
			    && dump_enabled_p ())
			  dump_printf_loc (MSG_NOTE, vect_location,
					   "not profitable because of "
					   "unprofitable if-converted scalar "
					   "code\n");
			profitable_subgraphs.truncate (0);
			break;
		      }
		}
	    }

	  /* Finally schedule the profitable subgraphs.  */
	  for (slp_instance instance : profitable_subgraphs)
	    {
	      if (!vectorized && dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "Basic block will be vectorized "
				 "using SLP\n");
	      vectorized = true;

	      /* Dump before scheduling as store vectorization will remove
		 the original stores and mess with the instance tree
		 so querying its location will eventually ICE.  */
	      for (slp_instance sub : instance->subgraph_entries)
		gcc_assert (SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (sub)));
	      unsigned HOST_WIDE_INT bytes;
	      if (dump_enabled_p ())
		for (slp_instance sub : instance->subgraph_entries)
		  {
		    tree vtype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (sub));
		    if (GET_MODE_SIZE (TYPE_MODE (vtype)).is_constant (&bytes))
		      dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
				       sub->location (),
				       "basic block part vectorized using %wu "
				       "byte vectors\n", bytes);
		    else
		      dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
				       sub->location (),
				       "basic block part vectorized using "
				       "variable length vectors\n");
		  }

	      dump_user_location_t saved_vect_location = vect_location;
	      vect_location = instance->location ();

	      vect_schedule_slp (bb_vinfo, instance->subgraph_entries);

	      vect_location = saved_vect_location;
	    }
	}
      else
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "***** Analysis failed with vector mode %s\n",
			     GET_MODE_NAME (bb_vinfo->vector_mode));
	}

      if (mode_i == 0)
	autodetected_vector_mode = bb_vinfo->vector_mode;

      while (mode_i < vector_modes.length ()
	     && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "***** The result for vector mode %s would"
			     " be the same\n",
			     GET_MODE_NAME (vector_modes[mode_i]));
	  mode_i++;
	}

      delete bb_vinfo;

      if (mode_i < vector_modes.length ()
	  && VECTOR_MODE_P (autodetected_vector_mode)
	  && (related_vector_mode (vector_modes[mode_i],
				   GET_MODE_INNER (autodetected_vector_mode))
	      == autodetected_vector_mode)
	  && (related_vector_mode (autodetected_vector_mode,
				   GET_MODE_INNER (vector_modes[mode_i]))
	      == vector_modes[mode_i]))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "***** Skipping vector mode %s, which would"
			     " repeat the analysis for %s\n",
			     GET_MODE_NAME (vector_modes[mode_i]),
			     GET_MODE_NAME (autodetected_vector_mode));
	  mode_i++;
	}

      if (vectorized
	  || mode_i == vector_modes.length ()
	  || autodetected_vector_mode == VOIDmode
	  /* If vect_slp_analyze_bb_1 signaled that analysis for all
	     vector sizes will fail do not bother iterating.  */
	  || fatal)
	return vectorized;

      /* Try the next biggest vector size.  */
      next_vector_mode = vector_modes[mode_i++];
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "***** Re-trying analysis with vector mode %s\n",
			 GET_MODE_NAME (next_vector_mode));
    }
}
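
/* Illustrative sketch (not part of the vectorizer): the driver above retries
   the whole analysis once per candidate vector mode and skips candidates
   that would reproduce the result it already has.  The standalone snippet
   below shows that retry shape with ints standing in for machine modes and
   hypothetical 'analyze' / 'same_result_p' callbacks in place of
   vect_slp_analyze_bb_1 and vect_chooses_same_modes_p.  */
#if 0
#include <vector>

static bool
try_modes (const std::vector<int> &modes,
	   bool (*analyze) (int mode, bool *fatal),
	   bool (*same_result_p) (int tried, int candidate))
{
  int next_mode = -1;		/* -1: let the first attempt autodetect.  */
  unsigned mode_i = 0;
  while (true)
    {
      bool fatal = false;
      bool ok = analyze (next_mode, &fatal);
      /* Skip candidates that would just repeat this attempt.  */
      while (mode_i < modes.size ()
	     && same_result_p (next_mode, modes[mode_i]))
	++mode_i;
      if (ok || fatal || mode_i == modes.size ())
	return ok;
      next_mode = modes[mode_i++];	/* Retry with the next candidate.  */
    }
}
#endif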
/* Main entry for the BB vectorizer.  Analyze and transform BBS, returns
   true if anything in the basic-block was vectorized.  */

static bool
vect_slp_bbs (const vec<basic_block> &bbs, loop_p orig_loop)
{
  vec<data_reference_p> datarefs = vNULL;
  auto_vec<int> dataref_groups;
  int insns = 0;
  int current_group = 0;

  for (unsigned i = 0; i < bbs.length (); i++)
    {
      basic_block bb = bbs[i];
      for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
	   gsi_next (&gsi))
	{
	  gimple *stmt = gsi_stmt (gsi);
	  if (is_gimple_debug (stmt))
	    continue;

	  insns++;

	  if (gimple_location (stmt) != UNKNOWN_LOCATION)
	    vect_location = stmt;

	  if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs,
					      &dataref_groups, current_group))
	    ++current_group;
	}
      /* New BBs always start a new DR group.  */
      ++current_group;
    }

  return vect_slp_region (bbs, datarefs, &dataref_groups, insns, orig_loop);
}

/* Special entry for the BB vectorizer.  Analyze and transform a single
   if-converted BB with ORIG_LOOPs body being the not if-converted
   representation.  Returns true if anything in the basic-block was
   vectorized.  */

bool
vect_slp_if_converted_bb (basic_block bb, loop_p orig_loop)
{
  auto_vec<basic_block> bbs;
  bbs.safe_push (bb);
  return vect_slp_bbs (bbs, orig_loop);
}
/* Main entry for the BB vectorizer.  Analyze and transform the basic blocks
   of FUN, returns true if anything in the function was vectorized.  */

bool
vect_slp_function (function *fun)
{
  bool r = false;
  int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (fun));
  auto_bitmap exit_bbs;
  bitmap_set_bit (exit_bbs, EXIT_BLOCK);
  edge entry = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (fun));
  unsigned n = rev_post_order_and_mark_dfs_back_seme (fun, entry, exit_bbs,
						      true, rpo, NULL);

  /* For the moment split the function into pieces to avoid making
     the iteration on the vector mode moot.  Split at points we know
     to not handle well which is CFG merges (SLP discovery doesn't
     handle non-loop-header PHIs) and loop exits.  Since pattern
     recog requires reverse iteration to visit uses before defs
     simply chop RPO into pieces.  */
  auto_vec<basic_block> bbs;
  for (unsigned i = 0; i < n; i++)
    {
      basic_block bb = BASIC_BLOCK_FOR_FN (fun, rpo[i]);
      bool split = false;

      /* Split when a BB is not dominated by the first block.  */
      if (!bbs.is_empty ()
	  && !dominated_by_p (CDI_DOMINATORS, bb, bbs[0]))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "splitting region at dominance boundary bb%d\n",
			     bb->index);
	  split = true;
	}
      /* Split when the loop determined by the first block
	 is exited.  This is because we eventually insert
	 invariants at region begin.  */
      else if (!bbs.is_empty ()
	       && bbs[0]->loop_father != bb->loop_father
	       && !flow_loop_nested_p (bbs[0]->loop_father, bb->loop_father))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "splitting region at loop %d exit at bb%d\n",
			     bbs[0]->loop_father->num, bb->index);
	  split = true;
	}
      else if (!bbs.is_empty ()
	       && bb->loop_father->header == bb
	       && bb->loop_father->dont_vectorize)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "splitting region at dont-vectorize loop %d "
			     "header at bb%d\n",
			     bb->loop_father->num, bb->index);
	  split = true;
	}

      if (split && !bbs.is_empty ())
	{
	  r |= vect_slp_bbs (bbs, NULL);
	  bbs.truncate (0);
	}

      if (bbs.is_empty ())
	{
	  /* We need to be able to insert at the head of the region which
	     we cannot for region starting with a returns-twice call.  */
	  if (gcall *first = safe_dyn_cast <gcall *> (first_stmt (bb)))
	    if (gimple_call_flags (first) & ECF_RETURNS_TWICE)
	      {
		if (dump_enabled_p ())
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "skipping bb%d as start of region as it "
				   "starts with returns-twice call\n",
				   bb->index);
		continue;
	      }
	  /* If the loop this BB belongs to is marked as not to be vectorized
	     honor that also for BB vectorization.  */
	  if (bb->loop_father->dont_vectorize)
	    continue;
	}

      bbs.safe_push (bb);

      /* When we have a stmt ending this block and defining a
	 value we have to insert on edges when inserting after it for
	 a vector containing its definition.  Avoid this for now.  */
      if (gimple *last = *gsi_last_bb (bb))
	if (gimple_get_lhs (last)
	    && is_ctrl_altering_stmt (last))
	  {
	    if (dump_enabled_p ())
	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			       "splitting region at control altering "
			       "definition %G", last);
	    r |= vect_slp_bbs (bbs, NULL);
	    bbs.truncate (0);
	  }
    }

  if (!bbs.is_empty ())
    r |= vect_slp_bbs (bbs, NULL);

  free (rpo);
  return r;
}
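
/* Illustrative sketch (not part of the vectorizer): the region formation
   above is just "walk blocks in RPO, flush the accumulated region whenever
   a split condition fires, flush the remainder at the end".  The standalone
   snippet below shows that accumulation shape, with ints for block ids and
   hypothetical 'must_split' / 'vectorize_region' callbacks in place of the
   dominance/loop checks and vect_slp_bbs.  */
#if 0
#include <vector>

static bool
split_and_vectorize (const std::vector<int> &rpo,
		     bool (*must_split) (const std::vector<int> &region,
					 int bb),
		     bool (*vectorize_region) (const std::vector<int> &region))
{
  bool any = false;
  std::vector<int> region;
  for (int bb : rpo)
    {
      if (!region.empty () && must_split (region, bb))
	{
	  any |= vectorize_region (region);
	  region.clear ();
	}
      region.push_back (bb);
    }
  if (!region.empty ())
    any |= vectorize_region (region);
  return any;
}
#endif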
/* Build a variable-length vector in which the elements in ELTS are repeated
   to fill NRESULTS vectors of type VECTOR_TYPE.  Store the vectors in
   RESULTS and add any new instructions to SEQ.

   The approach we use is:

   (1) Find a vector mode VM with integer elements of mode IM.

   (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
       ELTS' has mode IM.  This involves creating NELTS' VIEW_CONVERT_EXPRs
       from small vectors to IM.

   (3) Duplicate each ELTS'[I] into a vector of mode VM.

   (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
       correct byte contents.

   (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.

   We try to find the largest IM for which this sequence works, in order
   to cut down on the number of interleaves.  */

void
duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
			  const vec<tree> &elts, unsigned int nresults,
			  vec<tree> &results)
{
  unsigned int nelts = elts.length ();
  tree element_type = TREE_TYPE (vector_type);

  /* (1) Find a vector mode VM with integer elements of mode IM.  */
  unsigned int nvectors = 1;
  tree new_vector_type;
  tree permutes[2];
  if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
				       &nvectors, &new_vector_type,
				       permutes))
    gcc_unreachable ();

  /* Get a vector type that holds ELTS[0:NELTS/NELTS'].  */
  unsigned int partial_nelts = nelts / nvectors;
  tree partial_vector_type = build_vector_type (element_type, partial_nelts);

  tree_vector_builder partial_elts;
  auto_vec<tree, 32> pieces (nvectors * 2);
  pieces.quick_grow_cleared (nvectors * 2);
  for (unsigned int i = 0; i < nvectors; ++i)
    {
      /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
	     ELTS' has mode IM.  */
      partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
      for (unsigned int j = 0; j < partial_nelts; ++j)
	partial_elts.quick_push (elts[i * partial_nelts + j]);
      tree t = gimple_build_vector (seq, &partial_elts);
      t = gimple_build (seq, VIEW_CONVERT_EXPR,
			TREE_TYPE (new_vector_type), t);

      /* (3) Duplicate each ELTS'[I] into a vector of mode VM.  */
      pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
    }

  /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
	 correct byte contents.

     Conceptually, we need to repeat the following operation log2(nvectors)
     times, where hi_start = nvectors / 2:

	out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
	out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);

     However, if each input repeats every N elements and the VF is
     a multiple of N * 2, the HI result is the same as the LO result.
     This will be true for the first N1 iterations of the outer loop,
     followed by N2 iterations for which both the LO and HI results
     are needed.  I.e.:

	N1 + N2 = log2(nvectors)

     Each "N1 iteration" doubles the number of redundant vectors and the
     effect of the process as a whole is to have a sequence of nvectors/2**N1
     vectors that repeats 2**N1 times.  Rather than generate these redundant
     vectors, we halve the number of vectors for each N1 iteration.  */
  unsigned int in_start = 0;
  unsigned int out_start = nvectors;
  unsigned int new_nvectors = nvectors;
  for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
    {
      unsigned int hi_start = new_nvectors / 2;
      unsigned int out_i = 0;
      for (unsigned int in_i = 0; in_i < new_nvectors; ++in_i)
	{
	  if ((in_i & 1) != 0
	      && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
			     2 * in_repeat))
	    continue;

	  tree output = make_ssa_name (new_vector_type);
	  tree input1 = pieces[in_start + (in_i / 2)];
	  tree input2 = pieces[in_start + (in_i / 2) + hi_start];
	  gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
					       input1, input2,
					       permutes[in_i & 1]);
	  gimple_seq_add_stmt (seq, stmt);
	  pieces[out_start + out_i] = output;
	  out_i += 1;
	}
      std::swap (in_start, out_start);
      new_nvectors = out_i;
    }

  /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type.  */
  results.reserve (nresults);
  for (unsigned int i = 0; i < nresults; ++i)
    if (i < new_nvectors)
      results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
					pieces[in_start + i]));
    else
      results.quick_push (results[i - new_nvectors]);
}
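
/* Illustrative sketch (not part of the vectorizer): steps (3) and (4) above
   can be seen on plain integers - broadcast each double-wide chunk of ELTS,
   then interleave low halves so the chunks alternate, which makes the
   original scalar sequence repeat across the full vector.  The standalone
   snippet below assumes nelts splits into exactly two chunks (nvectors == 2)
   and uses int pairs in place of the wide integer-mode elements created by
   the VIEW_CONVERT step.  */
#if 0
#include <cstdio>
#include <utility>
#include <vector>

typedef std::pair<int, int> chunk;	/* Two scalars viewed as one element. */

int
main ()
{
  int elts[4] = { 10, 11, 12, 13 };	/* ELTS, nelts == 4.  */
  const unsigned vm_lanes = 4;		/* Lanes of the wide mode VM.  */

  /* (2)+(3): view-convert pairs of scalars to wide elements and broadcast
     each one.  */
  chunk c0 (elts[0], elts[1]), c1 (elts[2], elts[3]);
  std::vector<chunk> piece0 (vm_lanes, c0), piece1 (vm_lanes, c1);

  /* (4): the "lo" interleave of the two broadcasts alternates the chunks.  */
  std::vector<chunk> lo;
  for (unsigned i = 0; i < vm_lanes / 2; ++i)
    {
      lo.push_back (piece0[i]);
      lo.push_back (piece1[i]);
    }

  /* (5): viewed as scalars again this is 10 11 12 13 10 11 12 13 - ELTS
     repeated to fill the result vector.  */
  for (const chunk &c : lo)
    printf ("%d %d ", c.first, c.second);
  printf ("\n");
  return 0;
}
#endif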
/* For constant and loop invariant defs in OP_NODE this function creates
   vector defs that will be used in the vectorized stmts and stores them
   to SLP_TREE_VEC_DEFS of OP_NODE.  */

static void
vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
{
  unsigned HOST_WIDE_INT nunits;
  tree vec_cst;
  unsigned j, number_of_places_left_in_vector;
  tree vector_type;
  tree vop;
  int group_size = op_node->ops.length ();
  unsigned int vec_num, i;
  unsigned number_of_copies = 1;
  bool constant_p;
  gimple_seq ctor_seq = NULL;
  auto_vec<tree, 16> permute_results;

  /* We always want SLP_TREE_VECTYPE (op_node) here correctly set.  */
  vector_type = SLP_TREE_VECTYPE (op_node);

  unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node);
  SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
  auto_vec<tree> voprnds (number_of_vectors);

  /* NUMBER_OF_COPIES is the number of times we need to use the same values in
     created vectors.  It is greater than 1 if unrolling is performed.

     For example, we have two scalar operands, s1 and s2 (e.g., group of
     strided accesses of size two), while NUNITS is four (i.e., four scalars
     of this type can be packed in a vector).  The output vector will contain
     two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
     is two.)

     If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
     containing the operands.

     For example, NUNITS is four as before, and the group size is 8
     (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
     {s5, s6, s7, s8}.  */

  /* When using duplicate_and_interleave, we just need one element for
     each scalar statement.  */
  if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
    nunits = group_size;

  number_of_copies = nunits * number_of_vectors / group_size;

  number_of_places_left_in_vector = nunits;
  constant_p = true;
  tree_vector_builder elts (vector_type, nunits, 1);
  elts.quick_grow (nunits);
  stmt_vec_info insert_after = NULL;
  for (j = 0; j < number_of_copies; j++)
    {
      tree op;
      for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
	{
	  /* Create 'vect_ = {op0,op1,...,opn}'.  */
	  number_of_places_left_in_vector--;
	  tree orig_op = op;
	  if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
	    {
	      if (CONSTANT_CLASS_P (op))
		{
		  if (VECTOR_BOOLEAN_TYPE_P (vector_type))
		    {
		      /* Can't use VIEW_CONVERT_EXPR for booleans because
			 of possibly different sizes of scalar value and
			 vector element.  */
		      if (integer_zerop (op))
			op = build_int_cst (TREE_TYPE (vector_type), 0);
		      else if (integer_onep (op))
			op = build_all_ones_cst (TREE_TYPE (vector_type));
		      else
			gcc_unreachable ();
		    }
		  else
		    op = fold_unary (VIEW_CONVERT_EXPR,
				     TREE_TYPE (vector_type), op);
		  gcc_assert (op && CONSTANT_CLASS_P (op));
		}
	      else
		{
		  tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
		  gimple *init_stmt;
		  if (VECTOR_BOOLEAN_TYPE_P (vector_type))
		    {
		      tree true_val
			= build_all_ones_cst (TREE_TYPE (vector_type));
		      tree false_val
			= build_zero_cst (TREE_TYPE (vector_type));
		      gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
		      init_stmt = gimple_build_assign (new_temp, COND_EXPR,
						       op, true_val,
						       false_val);
		    }
		  else
		    {
		      op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
				   op);
		      init_stmt
			= gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
					       op);
		    }
		  gimple_seq_add_stmt (&ctor_seq, init_stmt);
		  op = new_temp;
		}
	    }
	  elts[number_of_places_left_in_vector] = op;
	  if (!CONSTANT_CLASS_P (op))
	    constant_p = false;
	  /* For BB vectorization we have to compute an insert location
	     when a def is inside the analyzed region since we cannot
	     simply insert at the BB start in this case.  */
	  stmt_vec_info opdef;
	  if (TREE_CODE (orig_op) == SSA_NAME
	      && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
	      && is_a <bb_vec_info> (vinfo)
	      && (opdef = vinfo->lookup_def (orig_op)))
	    {
	      if (!insert_after)
		insert_after = opdef;
	      else
		insert_after = get_later_stmt (insert_after, opdef);
	    }

	  if (number_of_places_left_in_vector == 0)
	    {
	      if (constant_p
		  ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
		  : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
		vec_cst = gimple_build_vector (&ctor_seq, &elts);
	      else
		{
		  if (permute_results.is_empty ())
		    duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
					      elts, number_of_vectors,
					      permute_results);
		  vec_cst = permute_results[number_of_vectors - j - 1];
		}
	      if (!gimple_seq_empty_p (ctor_seq))
		{
		  if (insert_after)
		    {
		      gimple_stmt_iterator gsi;
		      if (gimple_code (insert_after->stmt) == GIMPLE_PHI)
			{
			  gsi = gsi_after_labels (gimple_bb (insert_after->stmt));
			  gsi_insert_seq_before (&gsi, ctor_seq,
						 GSI_CONTINUE_LINKING);
			}
		      else if (!stmt_ends_bb_p (insert_after->stmt))
			{
			  gsi = gsi_for_stmt (insert_after->stmt);
			  gsi_insert_seq_after (&gsi, ctor_seq,
						GSI_CONTINUE_LINKING);
			}
		      else
			{
			  /* When we want to insert after a def where the
			     defining stmt throws then insert on the fallthru
			     edge.  */
			  edge e = find_fallthru_edge
				     (gimple_bb (insert_after->stmt)->succs);
			  basic_block new_bb
			    = gsi_insert_seq_on_edge_immediate (e, ctor_seq);
			  gcc_assert (!new_bb);
			}
		    }
		  else
		    vinfo->insert_seq_on_entry (NULL, ctor_seq);
		  ctor_seq = NULL;
		}
	      voprnds.quick_push (vec_cst);
	      insert_after = NULL;
	      number_of_places_left_in_vector = nunits;
	      constant_p = true;
	      elts.new_vector (vector_type, nunits, 1);
	      elts.quick_grow (nunits);
	    }
	}
    }

  /* Since the vectors are created in the reverse order, we should invert
     them.  */
  vec_num = voprnds.length ();
  for (j = vec_num; j != 0; j--)
    {
      vop = voprnds[j - 1];
      SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
    }

  /* In case that VF is greater than the unrolling factor needed for the SLP
     group of stmts, NUMBER_OF_VECTORS to be created is greater than
     NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
     to replicate the vectors.  */
  while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ())
    for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (i, &vop) && i < vec_num;
	 i++)
      SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
}
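
/* Illustrative sketch (not part of the vectorizer): the fill order above -
   operands walked last-to-first, slots filled from the back, finished
   vectors collected and then reversed - is what makes the comment's
   {s1, s2, s1, s2} example come out in source order.  The standalone
   snippet below mirrors only that bookkeeping, with ints for the scalar
   operands.  */
#if 0
#include <algorithm>
#include <cstdio>
#include <vector>

static std::vector<std::vector<int>>
build_constant_vectors (const std::vector<int> &ops, unsigned nunits,
			unsigned number_of_vectors)
{
  unsigned group_size = ops.size ();
  unsigned number_of_copies = nunits * number_of_vectors / group_size;
  unsigned places_left = nunits;
  std::vector<int> elts (nunits);
  std::vector<std::vector<int>> voprnds;

  for (unsigned j = 0; j < number_of_copies; j++)
    for (unsigned i = group_size; i-- > 0; )
      {
	elts[--places_left] = ops[i];	/* Fill from the back.  */
	if (places_left == 0)
	  {
	    voprnds.push_back (elts);	/* One vector is complete.  */
	    places_left = nunits;
	  }
      }

  /* The vectors were produced in reverse order; invert them.  */
  std::reverse (voprnds.begin (), voprnds.end ());
  return voprnds;
}

int
main ()
{
  /* ops {1, 2}, nunits 4, one vector -> prints 1 2 1 2.  */
  for (int x : build_constant_vectors ({1, 2}, 4, 1)[0])
    printf ("%d ", x);
  printf ("\n");
  return 0;
}
#endif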
/* Get the Ith vectorized definition from SLP_NODE.  */

tree
vect_get_slp_vect_def (slp_tree slp_node, unsigned i)
{
  return SLP_TREE_VEC_DEFS (slp_node)[i];
}

/* Get the vectorized definitions of SLP_NODE in *VEC_DEFS.  */

void
vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs)
{
  vec_defs->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
  vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node));
}

/* Get N vectorized definitions for SLP_NODE.  */

void
vect_get_slp_defs (vec_info *,
		   slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
{
  if (n == -1U)
    n = SLP_TREE_CHILDREN (slp_node).length ();

  for (unsigned i = 0; i < n; ++i)
    {
      slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
      vec<tree> vec_defs = vNULL;
      vect_get_slp_defs (child, &vec_defs);
      vec_oprnds->quick_push (vec_defs);
    }
}
/* A subroutine of vect_transform_slp_perm_load with two extra arguments:
   - PERM gives the permutation that the caller wants to use for NODE,
     which might be different from SLP_LOAD_PERMUTATION.
   - DUMP_P controls whether the function dumps information.  */

static bool
vect_transform_slp_perm_load_1 (vec_info *vinfo, slp_tree node,
				load_permutation_t &perm,
				const vec<tree> &dr_chain,
				gimple_stmt_iterator *gsi, poly_uint64 vf,
				bool analyze_only, bool dump_p,
				unsigned *n_perms, unsigned int *n_loads,
				bool dce_chain)
{
  stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
  int vec_index = 0;
  tree vectype = SLP_TREE_VECTYPE (node);
  unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
  unsigned int mask_element;
  unsigned dr_group_size;
  machine_mode mode;

  if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
    dr_group_size = 1;
  else
    {
      stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
      dr_group_size = DR_GROUP_SIZE (stmt_info);
    }

  mode = TYPE_MODE (vectype);
  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
  unsigned int nstmts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);

  /* Initialize the vect stmts of NODE to properly insert the generated
     stmts later.  */
  if (! analyze_only)
    for (unsigned i = SLP_TREE_VEC_DEFS (node).length (); i < nstmts; i++)
      SLP_TREE_VEC_DEFS (node).quick_push (NULL_TREE);

  /* Generate permutation masks for every NODE. Number of masks for each NODE
     is equal to GROUP_SIZE.
     E.g., we have a group of three nodes with three loads from the same
     location in each node, and the vector size is 4. I.e., we have a
     a0b0c0a1b1c1... sequence and we need to create the following vectors:
     for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
     for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
     ...

     The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
     The last mask is illegal since we assume two operands for permute
     operation, and the mask element values can't be outside that range.
     Hence, the last mask must be converted into {2,5,5,5}.
     For the first two permutations we need the first and the second input
     vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
     we need the second and the third vectors: {b1,c1,a2,b2} and
     {c2,a3,b3,c3}.  */

  int vect_stmts_counter = 0;
  unsigned int index = 0;
  int first_vec_index = -1;
  int second_vec_index = -1;
  bool noop_p = true;
  *n_perms = 0;

  vec_perm_builder mask;
  unsigned int nelts_to_build;
  unsigned int nvectors_per_build;
  unsigned int in_nlanes;
  bool repeating_p = (group_size == dr_group_size
		      && multiple_p (nunits, group_size));
  if (repeating_p)
    {
      /* A single vector contains a whole number of copies of the node, so:
	 (a) all permutes can use the same mask; and
	 (b) the permutes only need a single vector input.  */
      mask.new_vector (nunits, group_size, 3);
      nelts_to_build = mask.encoded_nelts ();
      /* It's possible to obtain zero nstmts during analyze_only, so make
	 it at least one to ensure the later computation for n_perms
	 proceed.  */
      nvectors_per_build = nstmts > 0 ? nstmts : 1;
      in_nlanes = dr_group_size * 3;
    }
  else
    {
      /* We need to construct a separate mask for each vector statement.  */
      unsigned HOST_WIDE_INT const_nunits, const_vf;
      if (!nunits.is_constant (&const_nunits)
	  || !vf.is_constant (&const_vf))
	return false;
      mask.new_vector (const_nunits, const_nunits, 1);
      nelts_to_build = const_vf * group_size;
      nvectors_per_build = 1;
      in_nlanes = const_vf * dr_group_size;
    }
  auto_sbitmap used_in_lanes (in_nlanes);
  bitmap_clear (used_in_lanes);
  auto_bitmap used_defs;

  unsigned int count = mask.encoded_nelts ();
  mask.quick_grow (count);
  vec_perm_indices indices;

  for (unsigned int j = 0; j < nelts_to_build; j++)
    {
      unsigned int iter_num = j / group_size;
      unsigned int stmt_num = j % group_size;
      unsigned int i = (iter_num * dr_group_size + perm[stmt_num]);
      bitmap_set_bit (used_in_lanes, i);
      if (repeating_p)
	{
	  first_vec_index = 0;
	  mask_element = i;
	}
      else
	{
	  /* Enforced before the loop when !repeating_p.  */
	  unsigned int const_nunits = nunits.to_constant ();
	  vec_index = i / const_nunits;
	  mask_element = i % const_nunits;
	  if (vec_index == first_vec_index
	      || first_vec_index == -1)
	    first_vec_index = vec_index;
	  else if (vec_index == second_vec_index
		   || second_vec_index == -1)
	    {
	      second_vec_index = vec_index;
	      mask_element += const_nunits;
	    }
	  else
	    {
	      if (dump_p)
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "permutation requires at "
				 "least three vectors %G",
				 stmt_info->stmt);
	      gcc_assert (analyze_only);
	      return false;
	    }

	  gcc_assert (mask_element < 2 * const_nunits);
	}

      if (mask_element != index)
	noop_p = false;
      mask[index++] = mask_element;

      if (index == count)
	{
	  if (!noop_p)
	    {
	      indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
	      if (!can_vec_perm_const_p (mode, mode, indices))
		{
		  if (dump_p)
		    {
		      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				       "unsupported vect permute { ");
		      for (i = 0; i < count; ++i)
			{
			  dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
			  dump_printf (MSG_MISSED_OPTIMIZATION, " ");
			}
		      dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
		    }
		  gcc_assert (analyze_only);
		  return false;
		}

	      tree mask_vec = NULL_TREE;
	      if (!analyze_only)
		mask_vec = vect_gen_perm_mask_checked (vectype, indices);

	      if (second_vec_index == -1)
		second_vec_index = first_vec_index;

	      for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
		{
		  ++*n_perms;
		  if (analyze_only)
		    continue;
		  /* Generate the permute statement if necessary.  */
		  tree first_vec = dr_chain[first_vec_index + ri];
		  tree second_vec = dr_chain[second_vec_index + ri];
		  gassign *stmt = as_a<gassign *> (stmt_info->stmt);
		  tree perm_dest
		    = vect_create_destination_var (gimple_assign_lhs (stmt),
						   vectype);
		  perm_dest = make_ssa_name (perm_dest);
		  gassign *perm_stmt
		    = gimple_build_assign (perm_dest, VEC_PERM_EXPR, first_vec,
					   second_vec, mask_vec);
		  vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
					       gsi);
		  if (dce_chain)
		    {
		      bitmap_set_bit (used_defs, first_vec_index + ri);
		      bitmap_set_bit (used_defs, second_vec_index + ri);
		    }

		  /* Store the vector statement in NODE.  */
		  SLP_TREE_VEC_DEFS (node)[vect_stmts_counter++] = perm_dest;
		}
	    }
	  else if (!analyze_only)
	    {
	      for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
		{
		  tree first_vec = dr_chain[first_vec_index + ri];
		  /* If mask was NULL_TREE generate the requested
		     identity transform.  */
		  if (dce_chain)
		    bitmap_set_bit (used_defs, first_vec_index + ri);

		  /* Store the vector statement in NODE.  */
		  SLP_TREE_VEC_DEFS (node)[vect_stmts_counter++] = first_vec;
		}
	    }

	  index = 0;
	  first_vec_index = -1;
	  second_vec_index = -1;
	  noop_p = true;
	}
    }

  if (n_loads)
    {
      if (repeating_p)
	*n_loads = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
      else
	{
	  /* Enforced above when !repeating_p.  */
	  unsigned int const_nunits = nunits.to_constant ();
	  *n_loads = 0;
	  bool load_seen = false;
	  for (unsigned i = 0; i < in_nlanes; ++i)
	    {
	      if (i % const_nunits == 0)
		{
		  if (load_seen)
		    *n_loads += 1;
		  load_seen = false;
		}
	      if (bitmap_bit_p (used_in_lanes, i))
		load_seen = true;
	    }
	  if (load_seen)
	    *n_loads += 1;
	}
    }

  if (dce_chain)
    for (unsigned i = 0; i < dr_chain.length (); ++i)
      if (!bitmap_bit_p (used_defs, i))
	{
	  gimple *stmt = SSA_NAME_DEF_STMT (dr_chain[i]);
	  gimple_stmt_iterator rgsi = gsi_for_stmt (stmt);
	  gsi_remove (&rgsi, true);
	  release_defs (stmt);
	}

  return true;
}
/* Generate vector permute statements from a list of loads in DR_CHAIN.
   If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
   permute statements for the SLP node NODE.  Store the number of vector
   permute instructions in *N_PERMS and the number of vector load
   instructions in *N_LOADS.  If DCE_CHAIN is true, remove all definitions
   that were not needed.  */

bool
vect_transform_slp_perm_load (vec_info *vinfo,
			      slp_tree node, const vec<tree> &dr_chain,
			      gimple_stmt_iterator *gsi, poly_uint64 vf,
			      bool analyze_only, unsigned *n_perms,
			      unsigned int *n_loads, bool dce_chain)
{
  return vect_transform_slp_perm_load_1 (vinfo, node,
					 SLP_TREE_LOAD_PERMUTATION (node),
					 dr_chain, gsi, vf, analyze_only,
					 dump_enabled_p (), n_perms, n_loads,
					 dce_chain);
}
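
/* Illustrative sketch (not part of the vectorizer): the index arithmetic in
   vect_transform_slp_perm_load_1 - i = iter * DR_GROUP_SIZE + perm[lane],
   then vec_index = i / nunits and mask_element = i % nunits, plus nunits for
   the second input - can be checked against the a0b0c0... example in that
   function's comment.  The standalone snippet below recomputes the three
   masks for the a's and applies the two-input adjustment.  */
#if 0
#include <cstdio>
#include <vector>

int
main ()
{
  const unsigned group_size = 3;	/* SLP lanes.  */
  const unsigned dr_group_size = 3;	/* a, b, c interleaved in memory.  */
  const unsigned nunits = 4;		/* Vector lanes.  */
  const unsigned vf = 4;
  const unsigned perm[group_size] = { 0, 0, 0 };  /* All lanes load 'a'.  */

  std::vector<unsigned> mask;
  int first_vec = -1, second_vec = -1;
  for (unsigned j = 0; j < vf * group_size; ++j)
    {
      unsigned i = (j / group_size) * dr_group_size + perm[j % group_size];
      unsigned vec_index = i / nunits;
      unsigned elt = i % nunits;
      if (first_vec == -1 || (int) vec_index == first_vec)
	first_vec = vec_index;
      else if (second_vec == -1 || (int) vec_index == second_vec)
	{
	  /* Second input vector: bias the element into [nunits, 2*nunits). */
	  second_vec = vec_index;
	  elt += nunits;
	}
      else
	{
	  printf ("permutation requires at least three vectors\n");
	  return 1;
	}
      mask.push_back (elt);
      if (mask.size () == nunits)
	{
	  /* Prints {0 0 0 3}, {3 3 6 6} and {2 5 5 5} - the last one is the
	     "converted" form of {6 9 9 9} from the comment.  */
	  printf ("inputs %d/%d mask {", first_vec,
		  second_vec == -1 ? first_vec : second_vec);
	  for (unsigned m : mask)
	    printf (" %u", m);
	  printf (" }\n");
	  mask.clear ();
	  first_vec = second_vec = -1;
	}
    }
  return 0;
}
#endif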
/* Produce the next vector result for SLP permutation NODE by adding a vector
   statement at GSI.  If MASK_VEC is nonnull, add:

      <new SSA name> = VEC_PERM_EXPR <FIRST_DEF, SECOND_DEF, MASK_VEC>

   otherwise add:

      <new SSA name> = FIRST_DEF.  */

static void
vect_add_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
			  slp_tree node, tree first_def, tree second_def,
			  tree mask_vec, poly_uint64 identity_offset)
{
  tree vectype = SLP_TREE_VECTYPE (node);

  /* ??? We SLP match existing vector element extracts but
     allow punning which we need to re-instantiate at uses
     but have no good way of explicitly representing.  */
  if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)), TYPE_SIZE (vectype))
      && !types_compatible_p (TREE_TYPE (first_def), vectype))
    {
      gassign *conv_stmt
	= gimple_build_assign (make_ssa_name (vectype),
			       build1 (VIEW_CONVERT_EXPR, vectype, first_def));
      vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
      first_def = gimple_assign_lhs (conv_stmt);
    }
  gassign *perm_stmt;
  tree perm_dest = make_ssa_name (vectype);
  if (mask_vec)
    {
      if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)),
			   TYPE_SIZE (vectype))
	  && !types_compatible_p (TREE_TYPE (second_def), vectype))
	{
	  gassign *conv_stmt
	    = gimple_build_assign (make_ssa_name (vectype),
				   build1 (VIEW_CONVERT_EXPR,
					   vectype, second_def));
	  vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
	  second_def = gimple_assign_lhs (conv_stmt);
	}
      perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
				       first_def, second_def,
				       mask_vec);
    }
  else if (!types_compatible_p (TREE_TYPE (first_def), vectype))
    {
      /* For identity permutes we still need to handle the case
	 of offsetted extracts or concats.  */
      unsigned HOST_WIDE_INT c;
      auto first_def_nunits
	= TYPE_VECTOR_SUBPARTS (TREE_TYPE (first_def));
      if (known_le (TYPE_VECTOR_SUBPARTS (vectype), first_def_nunits))
	{
	  unsigned HOST_WIDE_INT elsz
	    = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (TREE_TYPE (first_def))));
	  tree lowpart = build3 (BIT_FIELD_REF, vectype, first_def,
				 TYPE_SIZE (vectype),
				 bitsize_int (identity_offset * elsz));
	  perm_stmt = gimple_build_assign (perm_dest, lowpart);
	}
      else if (constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype),
				    first_def_nunits, &c) && c == 2)
	{
	  tree ctor = build_constructor_va (vectype, 2, NULL_TREE, first_def,
					    NULL_TREE, second_def);
	  perm_stmt = gimple_build_assign (perm_dest, ctor);
	}
      else
	gcc_unreachable ();
    }
  else
    {
      /* We need a copy here in case the def was external.  */
      perm_stmt = gimple_build_assign (perm_dest, first_def);
    }
  vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
  /* Store the vector statement in NODE.  */
  node->push_vec_def (perm_stmt);
}
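
/* Illustrative sketch (not part of the vectorizer): for identity permutes
   whose input vector type differs from the node's type, the code above
   either extracts an offsetted lowpart (input has at least as many lanes)
   or concatenates two inputs (output has exactly twice the lanes).  The
   standalone snippet below shows that case split on plain int arrays.  */
#if 0
#include <cstdio>
#include <vector>

static std::vector<int>
identity_permute (const std::vector<int> &first,
		  const std::vector<int> &second,
		  unsigned out_lanes, unsigned offset)
{
  if (out_lanes <= first.size ())
    /* Offsetted lowpart extract (the BIT_FIELD_REF case).  */
    return std::vector<int> (first.begin () + offset,
			     first.begin () + offset + out_lanes);
  if (out_lanes == 2 * first.size ())
    {
      /* Concat of the two inputs (the CONSTRUCTOR case).  */
      std::vector<int> r (first);
      r.insert (r.end (), second.begin (), second.end ());
      return r;
    }
  /* Anything else is not handled by vect_add_slp_permutation.  */
  return {};
}

int
main ()
{
  std::vector<int> a = { 0, 1, 2, 3 }, b = { 4, 5, 6, 7 };
  for (int x : identity_permute (a, b, 2, 1))	/* Extract: 1 2.  */
    printf ("%d ", x);
  printf ("\n");
  for (int x : identity_permute (a, b, 8, 0))	/* Concat: 0..7.  */
    printf ("%d ", x);
  printf ("\n");
  return 0;
}
#endif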
/* Subroutine of vectorizable_slp_permutation.  Check whether the target
   can perform permutation PERM on the (1 or 2) input nodes in CHILDREN.
   If GSI is nonnull, emit the permutation there.

   When GSI is null, the only purpose of NODE is to give properties
   of the result, such as the vector type and number of SLP lanes.
   The node does not need to be a VEC_PERM_EXPR.

   If the target supports the operation, return the number of individual
   VEC_PERM_EXPRs needed, otherwise return -1.  Print information to the
   dump file if DUMP_P is true.  */

static int
vectorizable_slp_permutation_1 (vec_info *vinfo, gimple_stmt_iterator *gsi,
				slp_tree node, lane_permutation_t &perm,
				vec<slp_tree> &children, bool dump_p)
{
  tree vectype = SLP_TREE_VECTYPE (node);

  /* ??? We currently only support all same vector input types
     while the SLP IL should really do a concat + select and thus accept
     arbitrary mismatches.  */
  slp_tree child;
  unsigned i;
  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
  bool repeating_p = multiple_p (nunits, SLP_TREE_LANES (node));
  tree op_vectype = NULL_TREE;
  FOR_EACH_VEC_ELT (children, i, child)
    if (SLP_TREE_VECTYPE (child))
      {
	op_vectype = SLP_TREE_VECTYPE (child);
	break;
      }
  if (!op_vectype)
    op_vectype = vectype;
  FOR_EACH_VEC_ELT (children, i, child)
    {
      if ((SLP_TREE_DEF_TYPE (child) != vect_internal_def
	   && !vect_maybe_update_slp_op_vectype (child, op_vectype))
	  || !types_compatible_p (SLP_TREE_VECTYPE (child), op_vectype)
	  || !types_compatible_p (TREE_TYPE (vectype), TREE_TYPE (op_vectype)))
	{
	  if (dump_p)
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Unsupported vector types in lane permutation\n");
	  return -1;
	}
      if (SLP_TREE_LANES (child) != SLP_TREE_LANES (node))
	repeating_p = false;
    }

  gcc_assert (perm.length () == SLP_TREE_LANES (node));
  if (dump_p)
    {
      dump_printf_loc (MSG_NOTE, vect_location,
		       "vectorizing permutation");
      for (unsigned i = 0; i < perm.length (); ++i)
	dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
      if (repeating_p)
	dump_printf (MSG_NOTE, " (repeat %d)\n", SLP_TREE_LANES (node));
      dump_printf (MSG_NOTE, "\n");
    }

  /* REPEATING_P is true if every output vector is guaranteed to use the
     same permute vector.  We can handle that case for both variable-length
     and constant-length vectors, but we only handle other cases for
     constant-length vectors.

     Set:

     - NPATTERNS and NELTS_PER_PATTERN to the encoding of the permute
       mask vector that we want to build.

     - NCOPIES to the number of copies of PERM that we need in order
       to build the necessary permute mask vectors.

     - NOUTPUTS_PER_MASK to the number of output vectors we want to create
       for each permute mask vector.  This is only relevant when GSI is
       nonnull.  */
  unsigned npatterns;
  unsigned nelts_per_pattern;
  unsigned ncopies;
  unsigned noutputs_per_mask;
  if (repeating_p)
    {
      /* We need a single permute mask vector that has the form:

	   { X1, ..., Xn, X1 + n, ..., Xn + n, X1 + 2n, ..., Xn + 2n, ... }

	 In other words, the original n-element permute in PERM is
	 "unrolled" to fill a full vector.  The stepped vector encoding
	 that we use for permutes requires 3n elements.  */
      npatterns = SLP_TREE_LANES (node);
      nelts_per_pattern = ncopies = 3;
      noutputs_per_mask = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
    }
  else
    {
      /* Calculate every element of every permute mask vector explicitly,
	 instead of relying on the pattern described above.  */
      if (!nunits.is_constant (&npatterns))
	return -1;
      nelts_per_pattern = ncopies = 1;
      if (loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo))
	if (!LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&ncopies))
	  return -1;
      noutputs_per_mask = 1;
    }
  unsigned olanes = ncopies * SLP_TREE_LANES (node);
  gcc_assert (repeating_p || multiple_p (olanes, nunits));

  /* Compute the { { SLP operand, vector index}, lane } permutation sequence
     from the { SLP operand, scalar lane } permutation as recorded in the
     SLP node as intermediate step.  This part should already work
     with SLP children with arbitrary number of lanes.  */
  auto_vec<std::pair<std::pair<unsigned, unsigned>, unsigned> > vperm;
  auto_vec<unsigned> active_lane;
  vperm.create (olanes);
  active_lane.safe_grow_cleared (children.length (), true);
  for (unsigned i = 0; i < ncopies; ++i)
    {
      for (unsigned pi = 0; pi < perm.length (); ++pi)
	{
	  std::pair<unsigned, unsigned> p = perm[pi];
	  tree vtype = SLP_TREE_VECTYPE (children[p.first]);
	  if (repeating_p)
	    vperm.quick_push ({{p.first, 0}, p.second + active_lane[p.first]});
	  else
	    {
	      /* We checked above that the vectors are constant-length.  */
	      unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype).to_constant ();
	      unsigned vi = (active_lane[p.first] + p.second) / vnunits;
	      unsigned vl = (active_lane[p.first] + p.second) % vnunits;
	      vperm.quick_push ({{p.first, vi}, vl});
	    }
	}
      /* Advance to the next group.  */
      for (unsigned j = 0; j < children.length (); ++j)
	active_lane[j] += SLP_TREE_LANES (children[j]);
    }

  if (dump_p)
    {
      dump_printf_loc (MSG_NOTE, vect_location,
		       "vectorizing permutation");
      for (unsigned i = 0; i < perm.length (); ++i)
	dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
      if (repeating_p)
	dump_printf (MSG_NOTE, " (repeat %d)\n", SLP_TREE_LANES (node));
      dump_printf (MSG_NOTE, "\n");
      dump_printf_loc (MSG_NOTE, vect_location, "as");
      for (unsigned i = 0; i < vperm.length (); ++i)
	{
	  if (i != 0
	      && (repeating_p
		  ? multiple_p (i, npatterns)
		  : multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype))))
	    dump_printf (MSG_NOTE, ",");
	  dump_printf (MSG_NOTE, " vops%u[%u][%u]",
		       vperm[i].first.first, vperm[i].first.second,
		       vperm[i].second);
	}
      dump_printf (MSG_NOTE, "\n");
    }

  /* We can only handle two-vector permutes, everything else should
     be lowered on the SLP level.  The following is closely inspired
     by vect_transform_slp_perm_load and is supposed to eventually
     replace it.
     ??? As intermediate step do code-gen in the SLP tree representation
     somehow?  */
  std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U);
  std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U);
  unsigned int index = 0;
  poly_uint64 mask_element;
  vec_perm_builder mask;
  mask.new_vector (nunits, npatterns, nelts_per_pattern);
  unsigned int count = mask.encoded_nelts ();
  mask.quick_grow (count);
  vec_perm_indices indices;
  unsigned nperms = 0;
  for (unsigned i = 0; i < vperm.length (); ++i)
    {
      mask_element = vperm[i].second;
      if (first_vec.first == -1U
	  || first_vec == vperm[i].first)
	first_vec = vperm[i].first;
      else if (second_vec.first == -1U
	       || second_vec == vperm[i].first)
	{
	  second_vec = vperm[i].first;
	  mask_element += nunits;
	}
      else
	{
	  if (dump_p)
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "permutation requires at "
			     "least three vectors\n");
	  gcc_assert (!gsi);
	  return -1;
	}

      mask[index++] = mask_element;

      if (index == count)
	{
	  indices.new_vector (mask, second_vec.first == -1U ? 1 : 2,
			      TYPE_VECTOR_SUBPARTS (op_vectype));
	  bool identity_p = (indices.series_p (0, 1, mask[0], 1)
			     && constant_multiple_p (mask[0], nunits));
	  machine_mode vmode = TYPE_MODE (vectype);
	  machine_mode op_vmode = TYPE_MODE (op_vectype);
	  unsigned HOST_WIDE_INT c;
	  if ((!identity_p
	       && !can_vec_perm_const_p (vmode, op_vmode, indices))
	      || (identity_p
		  && !known_le (nunits,
				TYPE_VECTOR_SUBPARTS (op_vectype))
		  && (!constant_multiple_p (nunits,
					    TYPE_VECTOR_SUBPARTS (op_vectype),
					    &c) || c != 2)))
	    {
	      if (dump_p)
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION,
				   vect_location,
				   "unsupported vect permute { ");
		  for (i = 0; i < count; ++i)
		    {
		      dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
		      dump_printf (MSG_MISSED_OPTIMIZATION, " ");
		    }
		  dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
		}
	      gcc_assert (!gsi);
	      return -1;
	    }

	  if (!identity_p)
	    nperms++;
	  if (gsi)
	    {
	      if (second_vec.first == -1U)
		second_vec = first_vec;

	      slp_tree
		first_node = children[first_vec.first],
		second_node = children[second_vec.first];

	      tree mask_vec = NULL_TREE;
	      if (!identity_p)
		mask_vec = vect_gen_perm_mask_checked (vectype, indices);

	      for (unsigned int vi = 0; vi < noutputs_per_mask; ++vi)
		{
		  tree first_def
		    = vect_get_slp_vect_def (first_node,
					     first_vec.second + vi);
		  tree second_def
		    = vect_get_slp_vect_def (second_node,
					     second_vec.second + vi);
		  vect_add_slp_permutation (vinfo, gsi, node, first_def,
					    second_def, mask_vec, mask[0]);
		}
	    }

	  index = 0;
	  first_vec = std::make_pair (-1U, -1U);
	  second_vec = std::make_pair (-1U, -1U);
	}
    }

  return nperms;
}
/* Vectorize the SLP permutations in NODE as specified
   in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
   child number and lane number.
   Interleaving of two two-lane two-child SLP subtrees (not supported):
     [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
   A blend of two four-lane two-child SLP subtrees:
     [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
   Highpart of a four-lane one-child SLP subtree (not supported):
     [ { 0, 2 }, { 0, 3 } ]
   Where currently only a subset is supported by code generating below.  */

static bool
vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
			      slp_tree node, stmt_vector_for_cost *cost_vec)
{
  tree vectype = SLP_TREE_VECTYPE (node);
  lane_permutation_t &perm = SLP_TREE_LANE_PERMUTATION (node);
  int nperms = vectorizable_slp_permutation_1 (vinfo, gsi, node, perm,
					       SLP_TREE_CHILDREN (node),
					       dump_enabled_p ());
  if (nperms < 0)
    return false;

  if (!gsi)
    record_stmt_cost (cost_vec, nperms, vec_perm, node, vectype, 0, vect_body);

  return true;
}
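
/* Illustrative sketch (not part of the vectorizer): the first half of
   vectorizable_slp_permutation_1 only rewrites each { operand, scalar lane }
   pair into { operand, vector index, vector lane } using the operand's
   vector length.  The standalone snippet below shows that rewrite for the
   constant-length, single-copy case, using the blend example from the
   comment above.  */
#if 0
#include <cstdio>
#include <utility>

int
main ()
{
  /* The "blend of two four-lane two-child subtrees" example.  */
  std::pair<unsigned, unsigned> perm[] =
    { { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } };
  const unsigned child_nunits[] = { 4, 4 };	/* Lanes per child vector.  */

  for (const auto &p : perm)
    {
      unsigned vi = p.second / child_nunits[p.first];
      unsigned vl = p.second % child_nunits[p.first];
      /* Prints vops0[0][0] vops1[0][1] vops0[0][2] vops1[0][3], matching the
	 "as ..." dump format used above.  */
      printf (" vops%u[%u][%u]", p.first, vi, vl);
    }
  printf ("\n");
  return 0;
}
#endif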
/* Vectorize SLP NODE.  */

static void
vect_schedule_slp_node (vec_info *vinfo,
			slp_tree node, slp_instance instance)
{
  gimple_stmt_iterator si;
  int i;
  slp_tree child;

  /* For existing vectors there's nothing to do.  */
  if (SLP_TREE_DEF_TYPE (node) == vect_external_def
      && SLP_TREE_VEC_DEFS (node).exists ())
    return;

  gcc_assert (SLP_TREE_VEC_DEFS (node).is_empty ());

  /* Vectorize externals and constants.  */
  if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
      || SLP_TREE_DEF_TYPE (node) == vect_external_def)
    {
      /* ??? vectorizable_shift can end up using a scalar operand which is
	 currently denoted as !SLP_TREE_VECTYPE.  No need to vectorize the
	 node in this case.  */
      if (!SLP_TREE_VECTYPE (node))
	return;

      vect_create_constant_vectors (vinfo, node);
      return;
    }

  stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);

  gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
  SLP_TREE_VEC_DEFS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "------>vectorizing SLP node starting from: %G",
		     stmt_info->stmt);

  if (STMT_VINFO_DATA_REF (stmt_info)
      && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
    {
      /* Vectorized loads go before the first scalar load to make it
	 ready early, vectorized stores go before the last scalar
	 stmt which is where all uses are ready.  */
      stmt_vec_info last_stmt_info = NULL;
      if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
	last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
      else /* DR_IS_WRITE */
	last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
      si = gsi_for_stmt (last_stmt_info->stmt);
    }
  else if ((STMT_VINFO_TYPE (stmt_info) == cycle_phi_info_type
	    || STMT_VINFO_TYPE (stmt_info) == induc_vec_info_type
	    || STMT_VINFO_TYPE (stmt_info) == phi_info_type)
	   && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
    {
      /* For PHI node vectorization we do not use the insertion iterator.  */
      si = gsi_none ();
    }
  else
    {
      /* Emit other stmts after the children vectorized defs which is
	 earliest possible.  */
      gimple *last_stmt = NULL;
      bool seen_vector_def = false;
      unsigned j;
      tree vdef, def;
      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
	if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
	  {
	    /* For fold-left reductions we are retaining the scalar
	       reduction PHI but we still have SLP_TREE_NUM_VEC_STMTS
	       set so the representation isn't perfect.  Resort to the
	       last scalar def here.  */
	    if (SLP_TREE_VEC_DEFS (child).is_empty ())
	      {
		gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child))
			    == cycle_phi_info_type);
		gphi *phi = as_a <gphi *>
			      (vect_find_last_scalar_stmt_in_slp (child)->stmt);
		if (!last_stmt
		    || vect_stmt_dominates_stmt_p (last_stmt, phi))
		  last_stmt = phi;
	      }
	    /* We are emitting all vectorized stmts in the same place and
	       the last one is the last.
	       ??? Unless we have a load permutation applied and that
	       figures to re-use an earlier generated load.  */
	    FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
	      {
		gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
		if (!last_stmt
		    || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
		  last_stmt = vstmt;
	      }
	  }
	else if (!SLP_TREE_VECTYPE (child))
	  {
	    /* For externals we use unvectorized at all scalar defs.  */
	    FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child), j, def)
	      if (TREE_CODE (def) == SSA_NAME
		  && !SSA_NAME_IS_DEFAULT_DEF (def))
		{
		  gimple *stmt = SSA_NAME_DEF_STMT (def);
		  if (!last_stmt
		      || vect_stmt_dominates_stmt_p (last_stmt, stmt))
		    last_stmt = stmt;
		}
	  }
	else
	  {
	    /* For externals we have to look at all defs since their
	       insertion place is decided per vector.  But beware
	       of pre-existing vectors where we need to make sure
	       we do not insert before the region boundary.  */
	    if (SLP_TREE_SCALAR_OPS (child).is_empty ()
		&& !vinfo->lookup_def (SLP_TREE_VEC_DEFS (child)[0]))
	      seen_vector_def = true;
	    else
	      FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
		if (TREE_CODE (vdef) == SSA_NAME
		    && !SSA_NAME_IS_DEFAULT_DEF (vdef))
		  {
		    gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
		    if (!last_stmt
			|| vect_stmt_dominates_stmt_p (last_stmt, vstmt))
		      last_stmt = vstmt;
		  }
	  }
      /* This can happen when all children are pre-existing vectors or
	 constants.  */
      if (!last_stmt)
	last_stmt = vect_find_first_scalar_stmt_in_slp (node)->stmt;
      if (!last_stmt)
	{
	  gcc_assert (seen_vector_def);
	  si = gsi_after_labels (as_a <bb_vec_info> (vinfo)->bbs[0]);
	}
      else if (is_ctrl_altering_stmt (last_stmt))
	{
	  /* We split regions to vectorize at control altering stmts
	     with a definition so this must be an external which
	     we can insert at the start of the region.  */
	  si = gsi_after_labels (as_a <bb_vec_info> (vinfo)->bbs[0]);
	}
      else if (is_a <bb_vec_info> (vinfo)
	       && gimple_bb (last_stmt) != gimple_bb (stmt_info->stmt)
	       && gimple_could_trap_p (stmt_info->stmt))
	{
	  /* We've constrained possibly trapping operations to all come
	     from the same basic-block, if vectorized defs would allow earlier
	     scheduling still force vectorized stmts to the original block.
	     This is only necessary for BB vectorization since for loop vect
	     all operations are in a single BB and scalar stmt based
	     placement doesn't play well with epilogue vectorization.  */
	  gcc_assert (dominated_by_p (CDI_DOMINATORS,
				      gimple_bb (stmt_info->stmt),
				      gimple_bb (last_stmt)));
	  si = gsi_after_labels (gimple_bb (stmt_info->stmt));
	}
      else if (is_a <gphi *> (last_stmt))
	si = gsi_after_labels (gimple_bb (last_stmt));
      else
	{
	  si = gsi_for_stmt (last_stmt);
	  gsi_next (&si);
	}
    }

  /* Handle purely internal nodes.  */
  if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
    {
      /* ??? the transform kind is stored to STMT_VINFO_TYPE which might
	 be shared with different SLP nodes (but usually it's the same
	 operation apart from the case the stmt is only there for denoting
	 the actual scalar lane defs ...).  So do not call vect_transform_stmt
	 but open-code it here (partly).  */
      bool done = vectorizable_slp_permutation (vinfo, &si, node, NULL);
      gcc_assert (done);
      stmt_vec_info slp_stmt_info;
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info)
	if (STMT_VINFO_LIVE_P (slp_stmt_info))
	  {
	    done = vectorizable_live_operation (vinfo, slp_stmt_info, node,
						instance, i, true, NULL);
	    gcc_assert (done);
	  }
    }
  else
    vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
}
/* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
   For loop vectorization this is done in vectorizable_call, but for SLP
   it needs to be deferred until end of vect_schedule_slp, because multiple
   SLP instances may refer to the same scalar stmt.  */

static void
vect_remove_slp_scalar_calls (vec_info *vinfo,
			      slp_tree node, hash_set<slp_tree> &visited)
{
  gimple *new_stmt;
  gimple_stmt_iterator gsi;
  int i;
  slp_tree child;
  tree lhs;
  stmt_vec_info stmt_info;

  if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return;

  if (visited.add (node))
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_remove_slp_scalar_calls (vinfo, child, visited);

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    {
      gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
      if (!stmt || gimple_bb (stmt) == NULL)
	continue;
      if (is_pattern_stmt_p (stmt_info)
	  || !PURE_SLP_STMT (stmt_info))
	continue;
      lhs = gimple_call_lhs (stmt);
      if (lhs)
	new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
      else
	{
	  new_stmt = gimple_build_nop ();
	  unlink_stmt_vdef (stmt_info->stmt);
	}
      gsi = gsi_for_stmt (stmt);
      vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
      if (lhs)
	SSA_NAME_DEF_STMT (lhs) = new_stmt;
    }
}

static void
vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
{
  hash_set<slp_tree> visited;
  vect_remove_slp_scalar_calls (vinfo, node, visited);
}
/* Vectorize the instance root.  */

static void
vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
{
  gassign *rstmt = NULL;

  if (instance->kind == slp_inst_kind_ctor)
    {
      if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1)
	{
	  tree vect_lhs = SLP_TREE_VEC_DEFS (node)[0];
	  tree root_lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
	  if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
					  TREE_TYPE (vect_lhs)))
	    vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
			       vect_lhs);
	  rstmt = gimple_build_assign (root_lhs, vect_lhs);
	}
      else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1)
	{
	  int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
	  tree child_def;
	  int j;
	  vec<constructor_elt, va_gc> *v;
	  vec_alloc (v, nelts);

	  /* A CTOR can handle V16HI composition from VNx8HI so we
	     do not need to convert vector elements if the types
	     do not match.  */
	  FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (node), j, child_def)
	    CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, child_def);
	  tree lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
	  tree rtype
	    = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmts[0]->stmt));
	  tree r_constructor = build_constructor (rtype, v);
	  rstmt = gimple_build_assign (lhs, r_constructor);
	}
    }
  else if (instance->kind == slp_inst_kind_bb_reduc)
    {
      /* Largely inspired by reduction chain epilogue handling in
	 vect_create_epilog_for_reduction.  */
      vec<tree> vec_defs = vNULL;
      vect_get_slp_defs (node, &vec_defs);
      enum tree_code reduc_code
	= gimple_assign_rhs_code (instance->root_stmts[0]->stmt);
      /* ??? We actually have to reflect signs somewhere.  */
      if (reduc_code == MINUS_EXPR)
	reduc_code = PLUS_EXPR;
      gimple_seq epilogue = NULL;
      /* We may end up with more than one vector result, reduce them
	 to one vector.  */
      tree vec_def = vec_defs[0];
      tree vectype = TREE_TYPE (vec_def);
      tree compute_vectype = vectype;
      bool pun_for_overflow_p = (ANY_INTEGRAL_TYPE_P (vectype)
				 && TYPE_OVERFLOW_UNDEFINED (vectype)
				 && operation_can_overflow (reduc_code));
      if (pun_for_overflow_p)
	{
	  compute_vectype = unsigned_type_for (vectype);
	  vec_def = gimple_build (&epilogue, VIEW_CONVERT_EXPR,
				  compute_vectype, vec_def);
	}
      for (unsigned i = 1; i < vec_defs.length (); ++i)
	{
	  tree def = vec_defs[i];
	  if (pun_for_overflow_p)
	    def = gimple_build (&epilogue, VIEW_CONVERT_EXPR,
				compute_vectype, def);
	  vec_def = gimple_build (&epilogue, reduc_code, compute_vectype,
				  vec_def, def);
	}
      vec_defs.release ();
      /* ??? Support other schemes than direct internal fn.  */
      internal_fn reduc_fn;
      if (!reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
	  || reduc_fn == IFN_LAST)
	gcc_unreachable ();
      tree scalar_def = gimple_build (&epilogue, as_combined_fn (reduc_fn),
				      TREE_TYPE (compute_vectype), vec_def);
      if (!SLP_INSTANCE_REMAIN_DEFS (instance).is_empty ())
	{
	  tree rem_def = NULL_TREE;
	  for (auto def : SLP_INSTANCE_REMAIN_DEFS (instance))
	    {
	      def = gimple_convert (&epilogue, TREE_TYPE (scalar_def), def);
	      if (!rem_def)
		rem_def = def;
	      else
		rem_def = gimple_build (&epilogue, reduc_code,
					TREE_TYPE (scalar_def),
					rem_def, def);
	    }
	  scalar_def = gimple_build (&epilogue, reduc_code,
				     TREE_TYPE (scalar_def),
				     scalar_def, rem_def);
	}
      scalar_def = gimple_convert (&epilogue,
				   TREE_TYPE (vectype), scalar_def);
      gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
      gsi_insert_seq_before (&rgsi, epilogue, GSI_SAME_STMT);
      gimple_assign_set_rhs_from_tree (&rgsi, scalar_def);
      update_stmt (gsi_stmt (rgsi));
      return;
    }
  else
    gcc_unreachable ();

  gcc_assert (rstmt);

  gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
  gsi_replace (&rgsi, rstmt, true);
}
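
/* Illustrative sketch (not part of the vectorizer): the bb_reduc epilogue
   above has three stages - combine the vector defs element-wise, reduce the
   surviving vector's lanes to a scalar, then fold in the scalar "remain"
   defs.  The standalone snippet below shows that shape for a plus-reduction
   over ints, with the lane reduction written out in place of IFN_REDUC_PLUS.  */
#if 0
#include <cstdio>
#include <vector>

static int
bb_reduc_epilogue (const std::vector<std::vector<int>> &vec_defs,
		   const std::vector<int> &remain)
{
  /* (1) Reduce the vector defs to a single vector, element-wise.  */
  std::vector<int> acc = vec_defs[0];
  for (unsigned i = 1; i < vec_defs.size (); ++i)
    for (unsigned l = 0; l < acc.size (); ++l)
      acc[l] += vec_defs[i][l];

  /* (2) Reduce the lanes of that vector to a scalar.  */
  int scalar = 0;
  for (int l : acc)
    scalar += l;

  /* (3) Fold in the scalar defs that were left out of the SLP graph.  */
  for (int r : remain)
    scalar += r;
  return scalar;
}

int
main ()
{
  /* (1+2+3+4) + (5+6+7+8) + 100 = 136.  */
  printf ("%d\n", bb_reduc_epilogue ({{1, 2, 3, 4}, {5, 6, 7, 8}}, {100}));
  return 0;
}
#endif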
9300 /* Schedule the SLP INSTANCE doing a DFS walk and collecting SCCs. */
9303 vect_schedule_scc (vec_info
*vinfo
, slp_tree node
, slp_instance instance
,
9304 hash_map
<slp_tree
, slp_scc_info
> &scc_info
,
9305 int &maxdfs
, vec
<slp_tree
> &stack
)
9308 slp_scc_info
*info
= &scc_info
.get_or_insert (node
, &existed_p
);
9309 gcc_assert (!existed_p
);
9311 info
->lowlink
= maxdfs
;
9315 if (SLP_TREE_DEF_TYPE (node
) != vect_internal_def
)
9317 info
->on_stack
= false;
9318 vect_schedule_slp_node (vinfo
, node
, instance
);
9322 info
->on_stack
= true;
9323 stack
.safe_push (node
);
9328 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
9332 slp_scc_info
*child_info
= scc_info
.get (child
);
9335 vect_schedule_scc (vinfo
, child
, instance
, scc_info
, maxdfs
, stack
);
9336 /* Recursion might have re-allocated the node. */
9337 info
= scc_info
.get (node
);
9338 child_info
= scc_info
.get (child
);
9339 info
->lowlink
= MIN (info
->lowlink
, child_info
->lowlink
);
9341 else if (child_info
->on_stack
)
9342 info
->lowlink
= MIN (info
->lowlink
, child_info
->dfs
);
9344 if (info
->lowlink
!= info
->dfs
)
9347 auto_vec
<slp_tree
, 4> phis_to_fixup
;
  /* Singleton.  */
  if (stack.last () == node)
    {
      stack.pop ();
      info->on_stack = false;
      vect_schedule_slp_node (vinfo, node, instance);
      if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
          && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (node)->stmt))
        phis_to_fixup.quick_push (node);
    }
  else
    {
      /* SCC.  */
      int last_idx = stack.length () - 1;
      while (stack[last_idx] != node)
        last_idx--;
      /* We can break the cycle at PHIs who have at least one child
         code generated.  Then we could re-start the DFS walk until
         all nodes in the SCC are covered (we might have new entries
         for only back-reachable nodes).  But it's simpler to just
         iterate and schedule those that are ready.  */
      unsigned todo = stack.length () - last_idx;
      do
        {
          for (int idx = stack.length () - 1; idx >= last_idx; --idx)
            {
              slp_tree entry = stack[idx];
              if (!entry)
                continue;
              bool phi = (SLP_TREE_CODE (entry) != VEC_PERM_EXPR
                          && is_a <gphi *> (SLP_TREE_REPRESENTATIVE
                                              (entry)->stmt));
              bool ready = !phi;
              FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry), i, child)
                if (!child)
                  {
                    gcc_assert (phi);
                    ready = true;
                    break;
                  }
                else if (scc_info.get (child)->on_stack)
                  {
                    if (!phi)
                      {
                        ready = false;
                        break;
                      }
                  }
                else
                  {
                    if (phi)
                      {
                        ready = true;
                        break;
                      }
                  }
              if (ready)
                {
                  vect_schedule_slp_node (vinfo, entry, instance);
                  scc_info.get (entry)->on_stack = false;
                  stack[idx] = NULL;
                  todo--;
                  if (phi)
                    phis_to_fixup.safe_push (entry);
                }
            }
        }
      while (todo != 0);

      /* Pop the SCC.  */
      stack.truncate (last_idx);
    }
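  /* Illustrative note (not part of the original sources): the vector PHIs
     collected in phis_to_fixup were emitted before some of their operands,
     namely those defined by nodes inside the SCC, existed.  The loop below
     walks the incoming edges of each scalar PHI and adds the corresponding
     vector defs now, skipping operands that are not vectorized internal
     defs.  */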
  /* Now fixup the backedge def of the vectorized PHIs in this SCC.  */
  slp_tree phi_node;
  FOR_EACH_VEC_ELT (phis_to_fixup, i, phi_node)
    {
      gphi *phi = as_a <gphi *> (SLP_TREE_REPRESENTATIVE (phi_node)->stmt);
      edge_iterator ei;
      edge e;
      FOR_EACH_EDGE (e, ei, gimple_bb (phi)->preds)
        {
          unsigned dest_idx = e->dest_idx;
          child = SLP_TREE_CHILDREN (phi_node)[dest_idx];
          if (!child || SLP_TREE_DEF_TYPE (child) != vect_internal_def)
            continue;
          unsigned n = SLP_TREE_VEC_DEFS (phi_node).length ();
          /* Simply fill all args.  */
          if (STMT_VINFO_DEF_TYPE (SLP_TREE_REPRESENTATIVE (phi_node))
              != vect_first_order_recurrence)
            for (unsigned i = 0; i < n; ++i)
              {
                tree phidef = SLP_TREE_VEC_DEFS (phi_node)[i];
                gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef));
                add_phi_arg (phi, vect_get_slp_vect_def (child, i),
                             e, gimple_phi_arg_location (phi, dest_idx));
              }
          else
            {
              /* Unless it is a first order recurrence which needs
                 args filled in for both the PHI node and the permutes.  */
              gimple *perm
                = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (phi_node)[0]);
              gimple *rphi = SSA_NAME_DEF_STMT (gimple_assign_rhs1 (perm));
              add_phi_arg (as_a <gphi *> (rphi),
                           vect_get_slp_vect_def (child, n - 1),
                           e, gimple_phi_arg_location (phi, dest_idx));
              for (unsigned i = 0; i < n; ++i)
                {
                  gimple *perm
                    = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (phi_node)[i]);
                  if (i > 0)
                    gimple_assign_set_rhs1 (perm,
                                            vect_get_slp_vect_def (child,
                                                                   i - 1));
                  gimple_assign_set_rhs2 (perm,
                                          vect_get_slp_vect_def (child, i));
                  update_stmt (perm);
                }
            }
        }
    }
}
/* Generate vector code for SLP_INSTANCES in the loop/basic block.  */

void
vect_schedule_slp (vec_info *vinfo, const vec<slp_instance> &slp_instances)
{
  slp_instance instance;
  unsigned int i;

  hash_map<slp_tree, slp_scc_info> scc_info;
  int maxdfs = 0;
  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      slp_tree node = SLP_INSTANCE_TREE (instance);
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location,
                           "Vectorizing SLP tree:\n");
          if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
            dump_printf_loc (MSG_NOTE, vect_location, "Root stmt: %G",
                             SLP_INSTANCE_ROOT_STMTS (instance)[0]->stmt);
          vect_print_slp_graph (MSG_NOTE, vect_location,
                                SLP_INSTANCE_TREE (instance));
        }

      /* Schedule the tree of INSTANCE, scheduling SCCs in a way to
         have a PHI be the node breaking the cycle.  */
      auto_vec<slp_tree> stack;
      if (!scc_info.get (node))
        vect_schedule_scc (vinfo, node, instance, scc_info, maxdfs, stack);

      if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
        vectorize_slp_instance_root_stmt (node, instance);

      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "vectorizing stmts using SLP.\n");
    }
  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      slp_tree root = SLP_INSTANCE_TREE (instance);
      stmt_vec_info store_info;
      unsigned int j;

      /* Remove scalar call stmts.  Do not do this for basic-block
         vectorization as not all uses may be vectorized.
         ???  Why should this be necessary?  DCE should be able to
         remove the stmts itself.
         ???  For BB vectorization we can as well remove scalar
         stmts starting from the SLP tree root if they have no
         uses.  */
      if (is_a <loop_vec_info> (vinfo))
        vect_remove_slp_scalar_calls (vinfo, root);

      /* Remove the original scalar stmts of vectorized stores.  */
      for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
        {
          if (!STMT_VINFO_DATA_REF (store_info)
              || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info)))
            break;

          store_info = vect_orig_stmt (store_info);
          /* Free the attached stmt_vec_info and remove the stmt.  */
          vinfo->remove_stmt (store_info);

          /* Invalidate SLP_TREE_REPRESENTATIVE in case we released it
             to not crash in vect_free_slp_tree later.  */
          if (SLP_TREE_REPRESENTATIVE (root) == store_info)
            SLP_TREE_REPRESENTATIVE (root) = NULL;