1 /* SLP - Basic Block Vectorization
2 Copyright (C) 2007-2022 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
24 #include "coretypes.h"
30 #include "tree-pass.h"
32 #include "optabs-tree.h"
33 #include "insn-config.h"
34 #include "recog.h" /* FIXME: for insn_data */
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "gimple-iterator.h"
39 #include "tree-vectorizer.h"
40 #include "langhooks.h"
41 #include "gimple-walk.h"
43 #include "tree-vector-builder.h"
44 #include "vec-perm-indices.h"
45 #include "gimple-fold.h"
46 #include "internal-fn.h"
47 #include "dump-context.h"
51 #include "alloc-pool.h"
53 static bool vectorizable_slp_permutation (vec_info
*, gimple_stmt_iterator
*,
54 slp_tree
, stmt_vector_for_cost
*);
55 static void vect_print_slp_tree (dump_flags_t
, dump_location_t
, slp_tree
);
57 static object_allocator
<_slp_tree
> *slp_tree_pool
;
58 static slp_tree slp_first_node
;
63 slp_tree_pool
= new object_allocator
<_slp_tree
> ("SLP nodes");
69 while (slp_first_node
)
70 delete slp_first_node
;
76 _slp_tree::operator new (size_t n
)
78 gcc_assert (n
== sizeof (_slp_tree
));
79 return slp_tree_pool
->allocate_raw ();
83 _slp_tree::operator delete (void *node
, size_t n
)
85 gcc_assert (n
== sizeof (_slp_tree
));
86 slp_tree_pool
->remove_raw (node
);
90 /* Initialize a SLP node. */
92 _slp_tree::_slp_tree ()
94 this->prev_node
= NULL
;
96 slp_first_node
->prev_node
= this;
97 this->next_node
= slp_first_node
;
98 slp_first_node
= this;
99 SLP_TREE_SCALAR_STMTS (this) = vNULL
;
100 SLP_TREE_SCALAR_OPS (this) = vNULL
;
101 SLP_TREE_VEC_STMTS (this) = vNULL
;
102 SLP_TREE_VEC_DEFS (this) = vNULL
;
103 SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
104 SLP_TREE_CHILDREN (this) = vNULL
;
105 SLP_TREE_LOAD_PERMUTATION (this) = vNULL
;
106 SLP_TREE_LANE_PERMUTATION (this) = vNULL
;
107 SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def
;
108 SLP_TREE_CODE (this) = ERROR_MARK
;
109 SLP_TREE_VECTYPE (this) = NULL_TREE
;
110 SLP_TREE_REPRESENTATIVE (this) = NULL
;
111 SLP_TREE_REF_COUNT (this) = 1;
113 this->max_nunits
= 1;
117 /* Tear down a SLP node. */
119 _slp_tree::~_slp_tree ()
122 this->prev_node
->next_node
= this->next_node
;
124 slp_first_node
= this->next_node
;
126 this->next_node
->prev_node
= this->prev_node
;
127 SLP_TREE_CHILDREN (this).release ();
128 SLP_TREE_SCALAR_STMTS (this).release ();
129 SLP_TREE_SCALAR_OPS (this).release ();
130 SLP_TREE_VEC_STMTS (this).release ();
131 SLP_TREE_VEC_DEFS (this).release ();
132 SLP_TREE_LOAD_PERMUTATION (this).release ();
133 SLP_TREE_LANE_PERMUTATION (this).release ();
138 /* Recursively free the memory allocated for the SLP tree rooted at NODE. */
141 vect_free_slp_tree (slp_tree node
)
146 if (--SLP_TREE_REF_COUNT (node
) != 0)
149 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
151 vect_free_slp_tree (child
);
153 /* If the node defines any SLP only patterns then those patterns are no
154 longer valid and should be removed. */
155 stmt_vec_info rep_stmt_info
= SLP_TREE_REPRESENTATIVE (node
);
156 if (rep_stmt_info
&& STMT_VINFO_SLP_VECT_ONLY_PATTERN (rep_stmt_info
))
158 stmt_vec_info stmt_info
= vect_orig_stmt (rep_stmt_info
);
159 STMT_VINFO_IN_PATTERN_P (stmt_info
) = false;
160 STMT_SLP_TYPE (stmt_info
) = STMT_SLP_TYPE (rep_stmt_info
);
166 /* Return a location suitable for dumpings related to the SLP instance. */
169 _slp_instance::location () const
171 if (!root_stmts
.is_empty ())
172 return root_stmts
[0]->stmt
;
174 return SLP_TREE_SCALAR_STMTS (root
)[0]->stmt
;
178 /* Free the memory allocated for the SLP instance. */
181 vect_free_slp_instance (slp_instance instance
)
183 vect_free_slp_tree (SLP_INSTANCE_TREE (instance
));
184 SLP_INSTANCE_LOADS (instance
).release ();
185 SLP_INSTANCE_ROOT_STMTS (instance
).release ();
186 instance
->subgraph_entries
.release ();
187 instance
->cost_vec
.release ();
192 /* Create an SLP node for SCALAR_STMTS. */
195 vect_create_new_slp_node (unsigned nops
, tree_code code
)
197 slp_tree node
= new _slp_tree
;
198 SLP_TREE_SCALAR_STMTS (node
) = vNULL
;
199 SLP_TREE_CHILDREN (node
).create (nops
);
200 SLP_TREE_DEF_TYPE (node
) = vect_internal_def
;
201 SLP_TREE_CODE (node
) = code
;
204 /* Create an SLP node for SCALAR_STMTS. */
207 vect_create_new_slp_node (slp_tree node
,
208 vec
<stmt_vec_info
> scalar_stmts
, unsigned nops
)
210 SLP_TREE_SCALAR_STMTS (node
) = scalar_stmts
;
211 SLP_TREE_CHILDREN (node
).create (nops
);
212 SLP_TREE_DEF_TYPE (node
) = vect_internal_def
;
213 SLP_TREE_REPRESENTATIVE (node
) = scalar_stmts
[0];
214 SLP_TREE_LANES (node
) = scalar_stmts
.length ();
218 /* Create an SLP node for SCALAR_STMTS. */
221 vect_create_new_slp_node (vec
<stmt_vec_info
> scalar_stmts
, unsigned nops
)
223 return vect_create_new_slp_node (new _slp_tree
, scalar_stmts
, nops
);
226 /* Create an SLP node for OPS. */
229 vect_create_new_slp_node (slp_tree node
, vec
<tree
> ops
)
231 SLP_TREE_SCALAR_OPS (node
) = ops
;
232 SLP_TREE_DEF_TYPE (node
) = vect_external_def
;
233 SLP_TREE_LANES (node
) = ops
.length ();
237 /* Create an SLP node for OPS. */
240 vect_create_new_slp_node (vec
<tree
> ops
)
242 return vect_create_new_slp_node (new _slp_tree
, ops
);
246 /* This structure is used in creation of an SLP tree. Each instance
247 corresponds to the same operand in a group of scalar stmts in an SLP
249 typedef struct _slp_oprnd_info
251 /* Def-stmts for the operands. */
252 vec
<stmt_vec_info
> def_stmts
;
255 /* Information about the first statement, its vector def-type, type, the
256 operand itself in case it's constant, and an indication if it's a pattern
259 enum vect_def_type first_dt
;
264 /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
266 static vec
<slp_oprnd_info
>
267 vect_create_oprnd_info (int nops
, int group_size
)
270 slp_oprnd_info oprnd_info
;
271 vec
<slp_oprnd_info
> oprnds_info
;
273 oprnds_info
.create (nops
);
274 for (i
= 0; i
< nops
; i
++)
276 oprnd_info
= XNEW (struct _slp_oprnd_info
);
277 oprnd_info
->def_stmts
.create (group_size
);
278 oprnd_info
->ops
.create (group_size
);
279 oprnd_info
->first_dt
= vect_uninitialized_def
;
280 oprnd_info
->first_op_type
= NULL_TREE
;
281 oprnd_info
->any_pattern
= false;
282 oprnds_info
.quick_push (oprnd_info
);
289 /* Free operands info. */
292 vect_free_oprnd_info (vec
<slp_oprnd_info
> &oprnds_info
)
295 slp_oprnd_info oprnd_info
;
297 FOR_EACH_VEC_ELT (oprnds_info
, i
, oprnd_info
)
299 oprnd_info
->def_stmts
.release ();
300 oprnd_info
->ops
.release ();
301 XDELETE (oprnd_info
);
304 oprnds_info
.release ();
308 /* Return true if STMTS contains a pattern statement. */
311 vect_contains_pattern_stmt_p (vec
<stmt_vec_info
> stmts
)
313 stmt_vec_info stmt_info
;
315 FOR_EACH_VEC_ELT (stmts
, i
, stmt_info
)
316 if (is_pattern_stmt_p (stmt_info
))
321 /* Return true when all lanes in the external or constant NODE have
325 vect_slp_tree_uniform_p (slp_tree node
)
327 gcc_assert (SLP_TREE_DEF_TYPE (node
) == vect_constant_def
328 || SLP_TREE_DEF_TYPE (node
) == vect_external_def
);
330 /* Pre-exsting vectors. */
331 if (SLP_TREE_SCALAR_OPS (node
).is_empty ())
335 tree op
, first
= NULL_TREE
;
336 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node
), i
, op
)
339 else if (!operand_equal_p (first
, op
, 0))
345 /* Find the place of the data-ref in STMT_INFO in the interleaving chain
346 that starts from FIRST_STMT_INFO. Return -1 if the data-ref is not a part
350 vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info
,
351 stmt_vec_info first_stmt_info
)
353 stmt_vec_info next_stmt_info
= first_stmt_info
;
356 if (first_stmt_info
!= DR_GROUP_FIRST_ELEMENT (stmt_info
))
361 if (next_stmt_info
== stmt_info
)
363 next_stmt_info
= DR_GROUP_NEXT_ELEMENT (next_stmt_info
);
365 result
+= DR_GROUP_GAP (next_stmt_info
);
367 while (next_stmt_info
);
372 /* Check whether it is possible to load COUNT elements of type ELT_TYPE
373 using the method implemented by duplicate_and_interleave. Return true
374 if so, returning the number of intermediate vectors in *NVECTORS_OUT
375 (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
379 can_duplicate_and_interleave_p (vec_info
*vinfo
, unsigned int count
,
380 tree elt_type
, unsigned int *nvectors_out
,
381 tree
*vector_type_out
,
384 tree base_vector_type
= get_vectype_for_scalar_type (vinfo
, elt_type
, count
);
385 if (!base_vector_type
|| !VECTOR_MODE_P (TYPE_MODE (base_vector_type
)))
388 machine_mode base_vector_mode
= TYPE_MODE (base_vector_type
);
389 poly_int64 elt_bytes
= count
* GET_MODE_UNIT_SIZE (base_vector_mode
);
390 unsigned int nvectors
= 1;
393 scalar_int_mode int_mode
;
394 poly_int64 elt_bits
= elt_bytes
* BITS_PER_UNIT
;
395 if (int_mode_for_size (elt_bits
, 1).exists (&int_mode
))
397 /* Get the natural vector type for this SLP group size. */
398 tree int_type
= build_nonstandard_integer_type
399 (GET_MODE_BITSIZE (int_mode
), 1);
401 = get_vectype_for_scalar_type (vinfo
, int_type
, count
);
403 && VECTOR_MODE_P (TYPE_MODE (vector_type
))
404 && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type
)),
405 GET_MODE_SIZE (base_vector_mode
)))
407 /* Try fusing consecutive sequences of COUNT / NVECTORS elements
408 together into elements of type INT_TYPE and using the result
409 to build NVECTORS vectors. */
410 poly_uint64 nelts
= GET_MODE_NUNITS (TYPE_MODE (vector_type
));
411 vec_perm_builder
sel1 (nelts
, 2, 3);
412 vec_perm_builder
sel2 (nelts
, 2, 3);
413 poly_int64 half_nelts
= exact_div (nelts
, 2);
414 for (unsigned int i
= 0; i
< 3; ++i
)
417 sel1
.quick_push (i
+ nelts
);
418 sel2
.quick_push (half_nelts
+ i
);
419 sel2
.quick_push (half_nelts
+ i
+ nelts
);
421 vec_perm_indices
indices1 (sel1
, 2, nelts
);
422 vec_perm_indices
indices2 (sel2
, 2, nelts
);
423 machine_mode vmode
= TYPE_MODE (vector_type
);
424 if (can_vec_perm_const_p (vmode
, vmode
, indices1
)
425 && can_vec_perm_const_p (vmode
, vmode
, indices2
))
428 *nvectors_out
= nvectors
;
430 *vector_type_out
= vector_type
;
433 permutes
[0] = vect_gen_perm_mask_checked (vector_type
,
435 permutes
[1] = vect_gen_perm_mask_checked (vector_type
,
442 if (!multiple_p (elt_bytes
, 2, &elt_bytes
))
448 /* Return true if DTA and DTB match. */
451 vect_def_types_match (enum vect_def_type dta
, enum vect_def_type dtb
)
454 || ((dta
== vect_external_def
|| dta
== vect_constant_def
)
455 && (dtb
== vect_external_def
|| dtb
== vect_constant_def
)));
458 static const int cond_expr_maps
[3][5] = {
463 static const int arg1_map
[] = { 1, 1 };
464 static const int arg2_map
[] = { 1, 2 };
465 static const int arg1_arg4_map
[] = { 2, 1, 4 };
466 static const int op1_op0_map
[] = { 2, 1, 0 };
468 /* For most SLP statements, there is a one-to-one mapping between
469 gimple arguments and child nodes. If that is not true for STMT,
470 return an array that contains:
472 - the number of child nodes, followed by
473 - for each child node, the index of the argument associated with that node.
474 The special index -1 is the first operand of an embedded comparison and
475 the special index -2 is the second operand of an embedded comparison.
477 SWAP is as for vect_get_and_check_slp_defs. */
480 vect_get_operand_map (const gimple
*stmt
, unsigned char swap
= 0)
482 if (auto assign
= dyn_cast
<const gassign
*> (stmt
))
484 if (gimple_assign_rhs_code (assign
) == COND_EXPR
485 && COMPARISON_CLASS_P (gimple_assign_rhs1 (assign
)))
486 return cond_expr_maps
[swap
];
487 if (TREE_CODE_CLASS (gimple_assign_rhs_code (assign
)) == tcc_comparison
492 if (auto call
= dyn_cast
<const gcall
*> (stmt
))
494 if (gimple_call_internal_p (call
))
495 switch (gimple_call_internal_fn (call
))
500 case IFN_GATHER_LOAD
:
503 case IFN_MASK_GATHER_LOAD
:
504 return arg1_arg4_map
;
513 /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
514 they are of a valid type and that they match the defs of the first stmt of
515 the SLP group (stored in OPRNDS_INFO). This function tries to match stmts
516 by swapping operands of STMTS[STMT_NUM] when possible. Non-zero SWAP
517 indicates swap is required for cond_expr stmts. Specifically, SWAP
518 is 1 if STMT is cond and operands of comparison need to be swapped;
519 SWAP is 2 if STMT is cond and code of comparison needs to be inverted.
521 If there was a fatal error return -1; if the error could be corrected by
522 swapping operands of father node of this one, return 1; if everything is
525 vect_get_and_check_slp_defs (vec_info
*vinfo
, unsigned char swap
,
527 vec
<stmt_vec_info
> stmts
, unsigned stmt_num
,
528 vec
<slp_oprnd_info
> *oprnds_info
)
530 stmt_vec_info stmt_info
= stmts
[stmt_num
];
532 unsigned int i
, number_of_oprnds
;
533 enum vect_def_type dt
= vect_uninitialized_def
;
534 slp_oprnd_info oprnd_info
;
535 unsigned int commutative_op
= -1U;
536 bool first
= stmt_num
== 0;
538 if (!is_a
<gcall
*> (stmt_info
->stmt
)
539 && !is_a
<gassign
*> (stmt_info
->stmt
)
540 && !is_a
<gphi
*> (stmt_info
->stmt
))
543 number_of_oprnds
= gimple_num_args (stmt_info
->stmt
);
544 const int *map
= vect_get_operand_map (stmt_info
->stmt
, swap
);
546 number_of_oprnds
= *map
++;
547 if (gcall
*stmt
= dyn_cast
<gcall
*> (stmt_info
->stmt
))
549 if (gimple_call_internal_p (stmt
))
551 internal_fn ifn
= gimple_call_internal_fn (stmt
);
552 commutative_op
= first_commutative_argument (ifn
);
555 else if (gassign
*stmt
= dyn_cast
<gassign
*> (stmt_info
->stmt
))
557 if (commutative_tree_code (gimple_assign_rhs_code (stmt
)))
561 bool swapped
= (swap
!= 0);
562 bool backedge
= false;
563 enum vect_def_type
*dts
= XALLOCAVEC (enum vect_def_type
, number_of_oprnds
);
564 for (i
= 0; i
< number_of_oprnds
; i
++)
566 int opno
= map
? map
[i
] : int (i
);
568 oprnd
= TREE_OPERAND (gimple_arg (stmt_info
->stmt
, 0), -1 - opno
);
571 oprnd
= gimple_arg (stmt_info
->stmt
, opno
);
572 if (gphi
*stmt
= dyn_cast
<gphi
*> (stmt_info
->stmt
))
573 backedge
= dominated_by_p (CDI_DOMINATORS
,
574 gimple_phi_arg_edge (stmt
, opno
)->src
,
575 gimple_bb (stmt_info
->stmt
));
577 if (TREE_CODE (oprnd
) == VIEW_CONVERT_EXPR
)
578 oprnd
= TREE_OPERAND (oprnd
, 0);
580 oprnd_info
= (*oprnds_info
)[i
];
582 stmt_vec_info def_stmt_info
;
583 if (!vect_is_simple_use (oprnd
, vinfo
, &dts
[i
], &def_stmt_info
))
585 if (dump_enabled_p ())
586 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
587 "Build SLP failed: can't analyze def for %T\n",
595 oprnd_info
->def_stmts
.quick_push (NULL
);
596 oprnd_info
->ops
.quick_push (NULL_TREE
);
597 oprnd_info
->first_dt
= vect_uninitialized_def
;
601 oprnd_info
->def_stmts
.quick_push (def_stmt_info
);
602 oprnd_info
->ops
.quick_push (oprnd
);
605 && is_pattern_stmt_p (def_stmt_info
))
607 if (STMT_VINFO_RELATED_STMT (vect_orig_stmt (def_stmt_info
))
609 oprnd_info
->any_pattern
= true;
611 /* If we promote this to external use the original stmt def. */
612 oprnd_info
->ops
.last ()
613 = gimple_get_lhs (vect_orig_stmt (def_stmt_info
)->stmt
);
616 /* If there's a extern def on a backedge make sure we can
617 code-generate at the region start.
618 ??? This is another case that could be fixed by adjusting
619 how we split the function but at the moment we'd have conflicting
622 && dts
[i
] == vect_external_def
623 && is_a
<bb_vec_info
> (vinfo
)
624 && TREE_CODE (oprnd
) == SSA_NAME
625 && !SSA_NAME_IS_DEFAULT_DEF (oprnd
)
626 && !dominated_by_p (CDI_DOMINATORS
,
627 as_a
<bb_vec_info
> (vinfo
)->bbs
[0],
628 gimple_bb (SSA_NAME_DEF_STMT (oprnd
))))
630 if (dump_enabled_p ())
631 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
632 "Build SLP failed: extern def %T only defined "
633 "on backedge\n", oprnd
);
639 tree type
= TREE_TYPE (oprnd
);
641 if ((dt
== vect_constant_def
642 || dt
== vect_external_def
)
643 && !GET_MODE_SIZE (vinfo
->vector_mode
).is_constant ()
644 && (TREE_CODE (type
) == BOOLEAN_TYPE
645 || !can_duplicate_and_interleave_p (vinfo
, stmts
.length (),
648 if (dump_enabled_p ())
649 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
650 "Build SLP failed: invalid type of def "
651 "for variable-length SLP %T\n", oprnd
);
655 /* For the swapping logic below force vect_reduction_def
656 for the reduction op in a SLP reduction group. */
657 if (!STMT_VINFO_DATA_REF (stmt_info
)
658 && REDUC_GROUP_FIRST_ELEMENT (stmt_info
)
659 && (int)i
== STMT_VINFO_REDUC_IDX (stmt_info
)
661 dts
[i
] = dt
= vect_reduction_def
;
663 /* Check the types of the definition. */
666 case vect_external_def
:
667 case vect_constant_def
:
668 case vect_internal_def
:
669 case vect_reduction_def
:
670 case vect_induction_def
:
671 case vect_nested_cycle
:
675 /* FORNOW: Not supported. */
676 if (dump_enabled_p ())
677 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
678 "Build SLP failed: illegal type of def %T\n",
683 oprnd_info
->first_dt
= dt
;
684 oprnd_info
->first_op_type
= type
;
690 /* Now match the operand definition types to that of the first stmt. */
691 for (i
= 0; i
< number_of_oprnds
;)
699 oprnd_info
= (*oprnds_info
)[i
];
701 stmt_vec_info def_stmt_info
= oprnd_info
->def_stmts
[stmt_num
];
702 oprnd
= oprnd_info
->ops
[stmt_num
];
703 tree type
= TREE_TYPE (oprnd
);
705 if (!types_compatible_p (oprnd_info
->first_op_type
, type
))
707 if (dump_enabled_p ())
708 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
709 "Build SLP failed: different operand types\n");
713 /* Not first stmt of the group, check that the def-stmt/s match
714 the def-stmt/s of the first stmt. Allow different definition
715 types for reduction chains: the first stmt must be a
716 vect_reduction_def (a phi node), and the rest
717 end in the reduction chain. */
718 if ((!vect_def_types_match (oprnd_info
->first_dt
, dt
)
719 && !(oprnd_info
->first_dt
== vect_reduction_def
720 && !STMT_VINFO_DATA_REF (stmt_info
)
721 && REDUC_GROUP_FIRST_ELEMENT (stmt_info
)
723 && !STMT_VINFO_DATA_REF (def_stmt_info
)
724 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info
)
725 == REDUC_GROUP_FIRST_ELEMENT (stmt_info
))))
726 || (!STMT_VINFO_DATA_REF (stmt_info
)
727 && REDUC_GROUP_FIRST_ELEMENT (stmt_info
)
729 || STMT_VINFO_DATA_REF (def_stmt_info
)
730 || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info
)
731 != REDUC_GROUP_FIRST_ELEMENT (stmt_info
)))
732 != (oprnd_info
->first_dt
!= vect_reduction_def
))))
734 /* Try swapping operands if we got a mismatch. For BB
735 vectorization only in case it will clearly improve things. */
736 if (i
== commutative_op
&& !swapped
737 && (!is_a
<bb_vec_info
> (vinfo
)
738 || (!vect_def_types_match ((*oprnds_info
)[i
+1]->first_dt
,
740 && (vect_def_types_match (oprnd_info
->first_dt
, dts
[i
+1])
741 || vect_def_types_match
742 ((*oprnds_info
)[i
+1]->first_dt
, dts
[i
])))))
744 if (dump_enabled_p ())
745 dump_printf_loc (MSG_NOTE
, vect_location
,
746 "trying swapped operands\n");
747 std::swap (dts
[i
], dts
[i
+1]);
748 std::swap ((*oprnds_info
)[i
]->def_stmts
[stmt_num
],
749 (*oprnds_info
)[i
+1]->def_stmts
[stmt_num
]);
750 std::swap ((*oprnds_info
)[i
]->ops
[stmt_num
],
751 (*oprnds_info
)[i
+1]->ops
[stmt_num
]);
756 if (is_a
<bb_vec_info
> (vinfo
)
757 && !oprnd_info
->any_pattern
)
759 /* Now for commutative ops we should see whether we can
760 make the other operand matching. */
761 if (dump_enabled_p ())
762 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
763 "treating operand as external\n");
764 oprnd_info
->first_dt
= dt
= vect_external_def
;
768 if (dump_enabled_p ())
769 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
770 "Build SLP failed: different types\n");
775 /* Make sure to demote the overall operand to external. */
776 if (dt
== vect_external_def
)
777 oprnd_info
->first_dt
= vect_external_def
;
778 /* For a SLP reduction chain we want to duplicate the reduction to
779 each of the chain members. That gets us a sane SLP graph (still
780 the stmts are not 100% correct wrt the initial values). */
781 else if ((dt
== vect_internal_def
782 || dt
== vect_reduction_def
)
783 && oprnd_info
->first_dt
== vect_reduction_def
784 && !STMT_VINFO_DATA_REF (stmt_info
)
785 && REDUC_GROUP_FIRST_ELEMENT (stmt_info
)
786 && !STMT_VINFO_DATA_REF (def_stmt_info
)
787 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info
)
788 == REDUC_GROUP_FIRST_ELEMENT (stmt_info
)))
790 oprnd_info
->def_stmts
[stmt_num
] = oprnd_info
->def_stmts
[0];
791 oprnd_info
->ops
[stmt_num
] = oprnd_info
->ops
[0];
800 if (dump_enabled_p ())
801 dump_printf_loc (MSG_NOTE
, vect_location
,
802 "swapped operands to match def types in %G",
809 /* Return true if call statements CALL1 and CALL2 are similar enough
810 to be combined into the same SLP group. */
813 compatible_calls_p (gcall
*call1
, gcall
*call2
)
815 unsigned int nargs
= gimple_call_num_args (call1
);
816 if (nargs
!= gimple_call_num_args (call2
))
819 if (gimple_call_combined_fn (call1
) != gimple_call_combined_fn (call2
))
822 if (gimple_call_internal_p (call1
))
824 if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1
)),
825 TREE_TYPE (gimple_call_lhs (call2
))))
827 for (unsigned int i
= 0; i
< nargs
; ++i
)
828 if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1
, i
)),
829 TREE_TYPE (gimple_call_arg (call2
, i
))))
834 if (!operand_equal_p (gimple_call_fn (call1
),
835 gimple_call_fn (call2
), 0))
838 if (gimple_call_fntype (call1
) != gimple_call_fntype (call2
))
842 /* Check that any unvectorized arguments are equal. */
843 if (const int *map
= vect_get_operand_map (call1
))
845 unsigned int nkept
= *map
++;
846 unsigned int mapi
= 0;
847 for (unsigned int i
= 0; i
< nargs
; ++i
)
848 if (mapi
< nkept
&& map
[mapi
] == int (i
))
850 else if (!operand_equal_p (gimple_call_arg (call1
, i
),
851 gimple_call_arg (call2
, i
)))
858 /* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
859 caller's attempt to find the vector type in STMT_INFO with the narrowest
860 element type. Return true if VECTYPE is nonnull and if it is valid
861 for STMT_INFO. When returning true, update MAX_NUNITS to reflect the
862 number of units in VECTYPE. GROUP_SIZE and MAX_NUNITS are as for
863 vect_build_slp_tree. */
866 vect_record_max_nunits (vec_info
*vinfo
, stmt_vec_info stmt_info
,
867 unsigned int group_size
,
868 tree vectype
, poly_uint64
*max_nunits
)
872 if (dump_enabled_p ())
873 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
874 "Build SLP failed: unsupported data-type in %G\n",
876 /* Fatal mismatch. */
880 /* If populating the vector type requires unrolling then fail
881 before adjusting *max_nunits for basic-block vectorization. */
882 if (is_a
<bb_vec_info
> (vinfo
)
883 && !multiple_p (group_size
, TYPE_VECTOR_SUBPARTS (vectype
)))
885 if (dump_enabled_p ())
886 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
887 "Build SLP failed: unrolling required "
888 "in basic block SLP\n");
889 /* Fatal mismatch. */
893 /* In case of multiple types we need to detect the smallest type. */
894 vect_update_max_nunits (max_nunits
, vectype
);
898 /* Verify if the scalar stmts STMTS are isomorphic, require data
899 permutation or are of unsupported types of operation. Return
900 true if they are, otherwise return false and indicate in *MATCHES
901 which stmts are not isomorphic to the first one. If MATCHES[0]
902 is false then this indicates the comparison could not be
903 carried out or the stmts will never be vectorized by SLP.
905 Note COND_EXPR is possibly isomorphic to another one after swapping its
906 operands. Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
907 the first stmt by swapping the two operands of comparison; set SWAP[i]
908 to 2 if stmt I is isormorphic to the first stmt by inverting the code
909 of comparison. Take A1 >= B1 ? X1 : Y1 as an exmple, it can be swapped
910 to (B1 <= A1 ? X1 : Y1); or be inverted to (A1 < B1) ? Y1 : X1. */
913 vect_build_slp_tree_1 (vec_info
*vinfo
, unsigned char *swap
,
914 vec
<stmt_vec_info
> stmts
, unsigned int group_size
,
915 poly_uint64
*max_nunits
, bool *matches
,
916 bool *two_operators
, tree
*node_vectype
)
919 stmt_vec_info first_stmt_info
= stmts
[0];
920 code_helper first_stmt_code
= ERROR_MARK
;
921 code_helper alt_stmt_code
= ERROR_MARK
;
922 code_helper rhs_code
= ERROR_MARK
;
923 code_helper first_cond_code
= ERROR_MARK
;
925 bool need_same_oprnds
= false;
926 tree vectype
= NULL_TREE
, first_op1
= NULL_TREE
;
927 stmt_vec_info first_load
= NULL
, prev_first_load
= NULL
;
928 bool first_stmt_load_p
= false, load_p
= false;
929 bool first_stmt_phi_p
= false, phi_p
= false;
930 bool maybe_soft_fail
= false;
931 tree soft_fail_nunits_vectype
= NULL_TREE
;
933 /* For every stmt in NODE find its def stmt/s. */
934 stmt_vec_info stmt_info
;
935 FOR_EACH_VEC_ELT (stmts
, i
, stmt_info
)
937 gimple
*stmt
= stmt_info
->stmt
;
941 if (dump_enabled_p ())
942 dump_printf_loc (MSG_NOTE
, vect_location
, "Build SLP for %G", stmt
);
944 /* Fail to vectorize statements marked as unvectorizable, throw
946 if (!STMT_VINFO_VECTORIZABLE (stmt_info
)
947 || stmt_can_throw_internal (cfun
, stmt
)
948 || gimple_has_volatile_ops (stmt
))
950 if (dump_enabled_p ())
951 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
952 "Build SLP failed: unvectorizable statement %G",
954 /* ??? For BB vectorization we want to commutate operands in a way
955 to shuffle all unvectorizable defs into one operand and have
956 the other still vectorized. The following doesn't reliably
957 work for this though but it's the easiest we can do here. */
958 if (is_a
<bb_vec_info
> (vinfo
) && i
!= 0)
960 /* Fatal mismatch. */
965 lhs
= gimple_get_lhs (stmt
);
966 if (lhs
== NULL_TREE
)
968 if (dump_enabled_p ())
969 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
970 "Build SLP failed: not GIMPLE_ASSIGN nor "
971 "GIMPLE_CALL %G", stmt
);
972 if (is_a
<bb_vec_info
> (vinfo
) && i
!= 0)
974 /* Fatal mismatch. */
980 if (!vect_get_vector_types_for_stmt (vinfo
, stmt_info
, &vectype
,
981 &nunits_vectype
, group_size
))
983 if (is_a
<bb_vec_info
> (vinfo
) && i
!= 0)
985 /* Fatal mismatch. */
989 /* Record nunits required but continue analysis, producing matches[]
990 as if nunits was not an issue. This allows splitting of groups
993 && !vect_record_max_nunits (vinfo
, stmt_info
, group_size
,
994 nunits_vectype
, max_nunits
))
996 gcc_assert (is_a
<bb_vec_info
> (vinfo
));
997 maybe_soft_fail
= true;
998 soft_fail_nunits_vectype
= nunits_vectype
;
1001 gcc_assert (vectype
);
1003 gcall
*call_stmt
= dyn_cast
<gcall
*> (stmt
);
1006 combined_fn cfn
= gimple_call_combined_fn (call_stmt
);
1007 if (cfn
!= CFN_LAST
)
1010 rhs_code
= CALL_EXPR
;
1012 if (cfn
== CFN_MASK_LOAD
1013 || cfn
== CFN_GATHER_LOAD
1014 || cfn
== CFN_MASK_GATHER_LOAD
)
1016 else if ((internal_fn_p (cfn
)
1017 && !vectorizable_internal_fn_p (as_internal_fn (cfn
)))
1018 || gimple_call_tail_p (call_stmt
)
1019 || gimple_call_noreturn_p (call_stmt
)
1020 || gimple_call_chain (call_stmt
))
1022 if (dump_enabled_p ())
1023 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1024 "Build SLP failed: unsupported call type %G",
1026 if (is_a
<bb_vec_info
> (vinfo
) && i
!= 0)
1028 /* Fatal mismatch. */
1033 else if (gimple_code (stmt
) == GIMPLE_PHI
)
1035 rhs_code
= ERROR_MARK
;
1040 rhs_code
= gimple_assign_rhs_code (stmt
);
1041 load_p
= gimple_vuse (stmt
);
1044 /* Check the operation. */
1047 *node_vectype
= vectype
;
1048 first_stmt_code
= rhs_code
;
1049 first_stmt_load_p
= load_p
;
1050 first_stmt_phi_p
= phi_p
;
1052 /* Shift arguments should be equal in all the packed stmts for a
1053 vector shift with scalar shift operand. */
1054 if (rhs_code
== LSHIFT_EXPR
|| rhs_code
== RSHIFT_EXPR
1055 || rhs_code
== LROTATE_EXPR
1056 || rhs_code
== RROTATE_EXPR
)
1058 /* First see if we have a vector/vector shift. */
1059 if (!directly_supported_p (rhs_code
, vectype
, optab_vector
))
1061 /* No vector/vector shift, try for a vector/scalar shift. */
1062 if (!directly_supported_p (rhs_code
, vectype
, optab_scalar
))
1064 if (dump_enabled_p ())
1065 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1066 "Build SLP failed: "
1067 "op not supported by target.\n");
1068 if (is_a
<bb_vec_info
> (vinfo
) && i
!= 0)
1070 /* Fatal mismatch. */
1074 need_same_oprnds
= true;
1075 first_op1
= gimple_assign_rhs2 (stmt
);
1078 else if (rhs_code
== WIDEN_LSHIFT_EXPR
)
1080 need_same_oprnds
= true;
1081 first_op1
= gimple_assign_rhs2 (stmt
);
1084 && rhs_code
== BIT_FIELD_REF
)
1086 tree vec
= TREE_OPERAND (gimple_assign_rhs1 (stmt
), 0);
1087 if (!is_a
<bb_vec_info
> (vinfo
)
1088 || TREE_CODE (vec
) != SSA_NAME
1089 /* When the element types are not compatible we pun the
1090 source to the target vectype which requires equal size. */
1091 || ((!VECTOR_TYPE_P (TREE_TYPE (vec
))
1092 || !types_compatible_p (TREE_TYPE (vectype
),
1093 TREE_TYPE (TREE_TYPE (vec
))))
1094 && !operand_equal_p (TYPE_SIZE (vectype
),
1095 TYPE_SIZE (TREE_TYPE (vec
)))))
1097 if (dump_enabled_p ())
1098 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1099 "Build SLP failed: "
1100 "BIT_FIELD_REF not supported\n");
1101 /* Fatal mismatch. */
1106 else if (rhs_code
== CFN_DIV_POW2
)
1108 need_same_oprnds
= true;
1109 first_op1
= gimple_call_arg (call_stmt
, 1);
1114 if (first_stmt_code
!= rhs_code
1115 && alt_stmt_code
== ERROR_MARK
)
1116 alt_stmt_code
= rhs_code
;
1117 if ((first_stmt_code
!= rhs_code
1118 && (first_stmt_code
!= IMAGPART_EXPR
1119 || rhs_code
!= REALPART_EXPR
)
1120 && (first_stmt_code
!= REALPART_EXPR
1121 || rhs_code
!= IMAGPART_EXPR
)
1122 /* Handle mismatches in plus/minus by computing both
1123 and merging the results. */
1124 && !((first_stmt_code
== PLUS_EXPR
1125 || first_stmt_code
== MINUS_EXPR
)
1126 && (alt_stmt_code
== PLUS_EXPR
1127 || alt_stmt_code
== MINUS_EXPR
)
1128 && rhs_code
== alt_stmt_code
)
1129 && !(first_stmt_code
.is_tree_code ()
1130 && rhs_code
.is_tree_code ()
1131 && (TREE_CODE_CLASS (tree_code (first_stmt_code
))
1133 && (swap_tree_comparison (tree_code (first_stmt_code
))
1134 == tree_code (rhs_code
)))
1135 && !(STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1136 && (first_stmt_code
== ARRAY_REF
1137 || first_stmt_code
== BIT_FIELD_REF
1138 || first_stmt_code
== INDIRECT_REF
1139 || first_stmt_code
== COMPONENT_REF
1140 || first_stmt_code
== MEM_REF
)
1141 && (rhs_code
== ARRAY_REF
1142 || rhs_code
== BIT_FIELD_REF
1143 || rhs_code
== INDIRECT_REF
1144 || rhs_code
== COMPONENT_REF
1145 || rhs_code
== MEM_REF
)))
1146 || first_stmt_load_p
!= load_p
1147 || first_stmt_phi_p
!= phi_p
)
1149 if (dump_enabled_p ())
1151 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1152 "Build SLP failed: different operation "
1153 "in stmt %G", stmt
);
1154 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1155 "original stmt %G", first_stmt_info
->stmt
);
1162 && first_stmt_code
== BIT_FIELD_REF
1163 && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info
->stmt
), 0)
1164 != TREE_OPERAND (gimple_assign_rhs1 (stmt_info
->stmt
), 0)))
1166 if (dump_enabled_p ())
1167 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1168 "Build SLP failed: different BIT_FIELD_REF "
1169 "arguments in %G", stmt
);
1174 if (call_stmt
&& first_stmt_code
!= CFN_MASK_LOAD
)
1176 if (!compatible_calls_p (as_a
<gcall
*> (stmts
[0]->stmt
),
1179 if (dump_enabled_p ())
1180 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1181 "Build SLP failed: different calls in %G",
1188 if ((phi_p
|| gimple_could_trap_p (stmt_info
->stmt
))
1189 && (gimple_bb (first_stmt_info
->stmt
)
1190 != gimple_bb (stmt_info
->stmt
)))
1192 if (dump_enabled_p ())
1193 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1194 "Build SLP failed: different BB for PHI "
1195 "or possibly trapping operation in %G", stmt
);
1200 if (need_same_oprnds
)
1202 tree other_op1
= gimple_arg (stmt
, 1);
1203 if (!operand_equal_p (first_op1
, other_op1
, 0))
1205 if (dump_enabled_p ())
1206 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1207 "Build SLP failed: different shift "
1208 "arguments in %G", stmt
);
1214 if (!types_compatible_p (vectype
, *node_vectype
))
1216 if (dump_enabled_p ())
1217 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1218 "Build SLP failed: different vector type "
1225 /* Grouped store or load. */
1226 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1228 if (REFERENCE_CLASS_P (lhs
))
1236 first_load
= DR_GROUP_FIRST_ELEMENT (stmt_info
);
1237 if (prev_first_load
)
1239 /* Check that there are no loads from different interleaving
1240 chains in the same node. */
1241 if (prev_first_load
!= first_load
)
1243 if (dump_enabled_p ())
1244 dump_printf_loc (MSG_MISSED_OPTIMIZATION
,
1246 "Build SLP failed: different "
1247 "interleaving chains in one node %G",
1254 prev_first_load
= first_load
;
1256 } /* Grouped access. */
1260 && rhs_code
!= CFN_GATHER_LOAD
1261 && rhs_code
!= CFN_MASK_GATHER_LOAD
)
1263 /* Not grouped load. */
1264 if (dump_enabled_p ())
1265 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1266 "Build SLP failed: not grouped load %G", stmt
);
1268 /* FORNOW: Not grouped loads are not supported. */
1269 if (is_a
<bb_vec_info
> (vinfo
) && i
!= 0)
1271 /* Fatal mismatch. */
1276 /* Not memory operation. */
1278 && rhs_code
.is_tree_code ()
1279 && TREE_CODE_CLASS (tree_code (rhs_code
)) != tcc_binary
1280 && TREE_CODE_CLASS (tree_code (rhs_code
)) != tcc_unary
1281 && TREE_CODE_CLASS (tree_code (rhs_code
)) != tcc_expression
1282 && TREE_CODE_CLASS (tree_code (rhs_code
)) != tcc_comparison
1283 && rhs_code
!= VIEW_CONVERT_EXPR
1284 && rhs_code
!= CALL_EXPR
1285 && rhs_code
!= BIT_FIELD_REF
)
1287 if (dump_enabled_p ())
1288 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1289 "Build SLP failed: operation unsupported %G",
1291 if (is_a
<bb_vec_info
> (vinfo
) && i
!= 0)
1293 /* Fatal mismatch. */
1298 if (rhs_code
== COND_EXPR
)
1300 tree cond_expr
= gimple_assign_rhs1 (stmt
);
1301 enum tree_code cond_code
= TREE_CODE (cond_expr
);
1302 enum tree_code swap_code
= ERROR_MARK
;
1303 enum tree_code invert_code
= ERROR_MARK
;
1306 first_cond_code
= TREE_CODE (cond_expr
);
1307 else if (TREE_CODE_CLASS (cond_code
) == tcc_comparison
)
1309 bool honor_nans
= HONOR_NANS (TREE_OPERAND (cond_expr
, 0));
1310 swap_code
= swap_tree_comparison (cond_code
);
1311 invert_code
= invert_tree_comparison (cond_code
, honor_nans
);
1314 if (first_cond_code
== cond_code
)
1316 /* Isomorphic can be achieved by swapping. */
1317 else if (first_cond_code
== swap_code
)
1319 /* Isomorphic can be achieved by inverting. */
1320 else if (first_cond_code
== invert_code
)
1324 if (dump_enabled_p ())
1325 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1326 "Build SLP failed: different"
1327 " operation %G", stmt
);
1333 if (rhs_code
.is_tree_code ()
1334 && TREE_CODE_CLASS ((tree_code
)rhs_code
) == tcc_comparison
1335 && (swap_tree_comparison ((tree_code
)first_stmt_code
)
1336 == (tree_code
)rhs_code
))
1343 for (i
= 0; i
< group_size
; ++i
)
1347 /* If we allowed a two-operation SLP node verify the target can cope
1348 with the permute we are going to use. */
1349 if (alt_stmt_code
!= ERROR_MARK
1350 && (!alt_stmt_code
.is_tree_code ()
1351 || (TREE_CODE_CLASS (tree_code (alt_stmt_code
)) != tcc_reference
1352 && TREE_CODE_CLASS (tree_code (alt_stmt_code
)) != tcc_comparison
)))
1354 *two_operators
= true;
1357 if (maybe_soft_fail
)
1359 unsigned HOST_WIDE_INT const_nunits
;
1360 if (!TYPE_VECTOR_SUBPARTS
1361 (soft_fail_nunits_vectype
).is_constant (&const_nunits
)
1362 || const_nunits
> group_size
)
1366 /* With constant vector elements simulate a mismatch at the
1367 point we need to split. */
1368 unsigned tail
= group_size
& (const_nunits
- 1);
1369 memset (&matches
[group_size
- tail
], 0, sizeof (bool) * tail
);
1377 /* Traits for the hash_set to record failed SLP builds for a stmt set.
1378 Note we never remove apart from at destruction time so we do not
1379 need a special value for deleted that differs from empty. */
1382 typedef vec
<stmt_vec_info
> value_type
;
1383 typedef vec
<stmt_vec_info
> compare_type
;
1384 static inline hashval_t
hash (value_type
);
1385 static inline bool equal (value_type existing
, value_type candidate
);
1386 static inline bool is_empty (value_type x
) { return !x
.exists (); }
1387 static inline bool is_deleted (value_type x
) { return !x
.exists (); }
1388 static const bool empty_zero_p
= true;
1389 static inline void mark_empty (value_type
&x
) { x
.release (); }
1390 static inline void mark_deleted (value_type
&x
) { x
.release (); }
1391 static inline void remove (value_type
&x
) { x
.release (); }
1394 bst_traits::hash (value_type x
)
1397 for (unsigned i
= 0; i
< x
.length (); ++i
)
1398 h
.add_int (gimple_uid (x
[i
]->stmt
));
1402 bst_traits::equal (value_type existing
, value_type candidate
)
1404 if (existing
.length () != candidate
.length ())
1406 for (unsigned i
= 0; i
< existing
.length (); ++i
)
1407 if (existing
[i
] != candidate
[i
])
1412 /* ??? This was std::pair<std::pair<tree_code, vect_def_type>, tree>
1413 but then vec::insert does memmove and that's not compatible with
1417 chain_op_t (tree_code code_
, vect_def_type dt_
, tree op_
)
1418 : code (code_
), dt (dt_
), op (op_
) {}
1424 /* Comparator for sorting associatable chains. */
1427 dt_sort_cmp (const void *op1_
, const void *op2_
, void *)
1429 auto *op1
= (const chain_op_t
*) op1_
;
1430 auto *op2
= (const chain_op_t
*) op2_
;
1431 if (op1
->dt
!= op2
->dt
)
1432 return (int)op1
->dt
- (int)op2
->dt
;
1433 return (int)op1
->code
- (int)op2
->code
;
1436 /* Linearize the associatable expression chain at START with the
1437 associatable operation CODE (where PLUS_EXPR also allows MINUS_EXPR),
1438 filling CHAIN with the result and using WORKLIST as intermediate storage.
1439 CODE_STMT and ALT_CODE_STMT are filled with the first stmt using CODE
1440 or MINUS_EXPR. *CHAIN_STMTS if not NULL is filled with all computation
1441 stmts, starting with START. */
1444 vect_slp_linearize_chain (vec_info
*vinfo
,
1445 vec
<std::pair
<tree_code
, gimple
*> > &worklist
,
1446 vec
<chain_op_t
> &chain
,
1447 enum tree_code code
, gimple
*start
,
1448 gimple
*&code_stmt
, gimple
*&alt_code_stmt
,
1449 vec
<gimple
*> *chain_stmts
)
1451 /* For each lane linearize the addition/subtraction (or other
1452 uniform associatable operation) expression tree. */
1453 worklist
.safe_push (std::make_pair (code
, start
));
1454 while (!worklist
.is_empty ())
1456 auto entry
= worklist
.pop ();
1457 gassign
*stmt
= as_a
<gassign
*> (entry
.second
);
1458 enum tree_code in_code
= entry
.first
;
1459 enum tree_code this_code
= gimple_assign_rhs_code (stmt
);
1460 /* Pick some stmts suitable for SLP_TREE_REPRESENTATIVE. */
1462 && gimple_assign_rhs_code (stmt
) == code
)
1464 else if (!alt_code_stmt
1465 && gimple_assign_rhs_code (stmt
) == MINUS_EXPR
)
1466 alt_code_stmt
= stmt
;
1468 chain_stmts
->safe_push (stmt
);
1469 for (unsigned opnum
= 1; opnum
<= 2; ++opnum
)
1471 tree op
= gimple_op (stmt
, opnum
);
1473 stmt_vec_info def_stmt_info
;
1474 bool res
= vect_is_simple_use (op
, vinfo
, &dt
, &def_stmt_info
);
1476 if (dt
== vect_internal_def
1477 && is_pattern_stmt_p (def_stmt_info
))
1478 op
= gimple_get_lhs (def_stmt_info
->stmt
);
1480 use_operand_p use_p
;
1481 if (dt
== vect_internal_def
1482 && single_imm_use (op
, &use_p
, &use_stmt
)
1483 && is_gimple_assign (def_stmt_info
->stmt
)
1484 && (gimple_assign_rhs_code (def_stmt_info
->stmt
) == code
1485 || (code
== PLUS_EXPR
1486 && (gimple_assign_rhs_code (def_stmt_info
->stmt
)
1489 tree_code op_def_code
= this_code
;
1490 if (op_def_code
== MINUS_EXPR
&& opnum
== 1)
1491 op_def_code
= PLUS_EXPR
;
1492 if (in_code
== MINUS_EXPR
)
1493 op_def_code
= op_def_code
== PLUS_EXPR
? MINUS_EXPR
: PLUS_EXPR
;
1494 worklist
.safe_push (std::make_pair (op_def_code
,
1495 def_stmt_info
->stmt
));
1499 tree_code op_def_code
= this_code
;
1500 if (op_def_code
== MINUS_EXPR
&& opnum
== 1)
1501 op_def_code
= PLUS_EXPR
;
1502 if (in_code
== MINUS_EXPR
)
1503 op_def_code
= op_def_code
== PLUS_EXPR
? MINUS_EXPR
: PLUS_EXPR
;
1504 chain
.safe_push (chain_op_t (op_def_code
, dt
, op
));
1510 typedef hash_map
<vec
<stmt_vec_info
>, slp_tree
,
1511 simple_hashmap_traits
<bst_traits
, slp_tree
> >
1512 scalar_stmts_to_slp_tree_map_t
;
1515 vect_build_slp_tree_2 (vec_info
*vinfo
, slp_tree node
,
1516 vec
<stmt_vec_info
> stmts
, unsigned int group_size
,
1517 poly_uint64
*max_nunits
,
1518 bool *matches
, unsigned *limit
, unsigned *tree_size
,
1519 scalar_stmts_to_slp_tree_map_t
*bst_map
);
1522 vect_build_slp_tree (vec_info
*vinfo
,
1523 vec
<stmt_vec_info
> stmts
, unsigned int group_size
,
1524 poly_uint64
*max_nunits
,
1525 bool *matches
, unsigned *limit
, unsigned *tree_size
,
1526 scalar_stmts_to_slp_tree_map_t
*bst_map
)
1528 if (slp_tree
*leader
= bst_map
->get (stmts
))
1530 if (dump_enabled_p ())
1531 dump_printf_loc (MSG_NOTE
, vect_location
, "re-using %sSLP tree %p\n",
1532 !(*leader
)->failed
? "" : "failed ", *leader
);
1533 if (!(*leader
)->failed
)
1535 SLP_TREE_REF_COUNT (*leader
)++;
1536 vect_update_max_nunits (max_nunits
, (*leader
)->max_nunits
);
1540 memcpy (matches
, (*leader
)->failed
, sizeof (bool) * group_size
);
1544 /* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
1545 so we can pick up backedge destinations during discovery. */
1546 slp_tree res
= new _slp_tree
;
1547 SLP_TREE_DEF_TYPE (res
) = vect_internal_def
;
1548 SLP_TREE_SCALAR_STMTS (res
) = stmts
;
1549 bst_map
->put (stmts
.copy (), res
);
1553 if (dump_enabled_p ())
1554 dump_printf_loc (MSG_NOTE
, vect_location
,
1555 "SLP discovery limit exceeded\n");
1556 /* Mark the node invalid so we can detect those when still in use
1557 as backedge destinations. */
1558 SLP_TREE_SCALAR_STMTS (res
) = vNULL
;
1559 SLP_TREE_DEF_TYPE (res
) = vect_uninitialized_def
;
1560 res
->failed
= XNEWVEC (bool, group_size
);
1561 memset (res
->failed
, 0, sizeof (bool) * group_size
);
1562 memset (matches
, 0, sizeof (bool) * group_size
);
1567 if (dump_enabled_p ())
1568 dump_printf_loc (MSG_NOTE
, vect_location
,
1569 "starting SLP discovery for node %p\n", res
);
1571 poly_uint64 this_max_nunits
= 1;
1572 slp_tree res_
= vect_build_slp_tree_2 (vinfo
, res
, stmts
, group_size
,
1574 matches
, limit
, tree_size
, bst_map
);
1577 if (dump_enabled_p ())
1578 dump_printf_loc (MSG_NOTE
, vect_location
,
1579 "SLP discovery for node %p failed\n", res
);
1580 /* Mark the node invalid so we can detect those when still in use
1581 as backedge destinations. */
1582 SLP_TREE_SCALAR_STMTS (res
) = vNULL
;
1583 SLP_TREE_DEF_TYPE (res
) = vect_uninitialized_def
;
1584 res
->failed
= XNEWVEC (bool, group_size
);
1588 for (i
= 0; i
< group_size
; ++i
)
1591 gcc_assert (i
< group_size
);
1593 memcpy (res
->failed
, matches
, sizeof (bool) * group_size
);
1597 if (dump_enabled_p ())
1598 dump_printf_loc (MSG_NOTE
, vect_location
,
1599 "SLP discovery for node %p succeeded\n", res
);
1600 gcc_assert (res_
== res
);
1601 res
->max_nunits
= this_max_nunits
;
1602 vect_update_max_nunits (max_nunits
, this_max_nunits
);
1603 /* Keep a reference for the bst_map use. */
1604 SLP_TREE_REF_COUNT (res
)++;
1609 /* Helper for building an associated SLP node chain. */
1612 vect_slp_build_two_operator_nodes (slp_tree perm
, tree vectype
,
1613 slp_tree op0
, slp_tree op1
,
1614 stmt_vec_info oper1
, stmt_vec_info oper2
,
1615 vec
<std::pair
<unsigned, unsigned> > lperm
)
1617 unsigned group_size
= SLP_TREE_LANES (op1
);
1619 slp_tree child1
= new _slp_tree
;
1620 SLP_TREE_DEF_TYPE (child1
) = vect_internal_def
;
1621 SLP_TREE_VECTYPE (child1
) = vectype
;
1622 SLP_TREE_LANES (child1
) = group_size
;
1623 SLP_TREE_CHILDREN (child1
).create (2);
1624 SLP_TREE_CHILDREN (child1
).quick_push (op0
);
1625 SLP_TREE_CHILDREN (child1
).quick_push (op1
);
1626 SLP_TREE_REPRESENTATIVE (child1
) = oper1
;
1628 slp_tree child2
= new _slp_tree
;
1629 SLP_TREE_DEF_TYPE (child2
) = vect_internal_def
;
1630 SLP_TREE_VECTYPE (child2
) = vectype
;
1631 SLP_TREE_LANES (child2
) = group_size
;
1632 SLP_TREE_CHILDREN (child2
).create (2);
1633 SLP_TREE_CHILDREN (child2
).quick_push (op0
);
1634 SLP_TREE_REF_COUNT (op0
)++;
1635 SLP_TREE_CHILDREN (child2
).quick_push (op1
);
1636 SLP_TREE_REF_COUNT (op1
)++;
1637 SLP_TREE_REPRESENTATIVE (child2
) = oper2
;
1639 SLP_TREE_DEF_TYPE (perm
) = vect_internal_def
;
1640 SLP_TREE_CODE (perm
) = VEC_PERM_EXPR
;
1641 SLP_TREE_VECTYPE (perm
) = vectype
;
1642 SLP_TREE_LANES (perm
) = group_size
;
1643 /* ??? We should set this NULL but that's not expected. */
1644 SLP_TREE_REPRESENTATIVE (perm
) = oper1
;
1645 SLP_TREE_LANE_PERMUTATION (perm
) = lperm
;
1646 SLP_TREE_CHILDREN (perm
).quick_push (child1
);
1647 SLP_TREE_CHILDREN (perm
).quick_push (child2
);
1650 /* Recursively build an SLP tree starting from NODE.
1651 Fail (and return a value not equal to zero) if def-stmts are not
1652 isomorphic, require data permutation or are of unsupported types of
1653 operation. Otherwise, return 0.
1654 The value returned is the depth in the SLP tree where a mismatch
1658 vect_build_slp_tree_2 (vec_info
*vinfo
, slp_tree node
,
1659 vec
<stmt_vec_info
> stmts
, unsigned int group_size
,
1660 poly_uint64
*max_nunits
,
1661 bool *matches
, unsigned *limit
, unsigned *tree_size
,
1662 scalar_stmts_to_slp_tree_map_t
*bst_map
)
1664 unsigned nops
, i
, this_tree_size
= 0;
1665 poly_uint64 this_max_nunits
= *max_nunits
;
1669 stmt_vec_info stmt_info
= stmts
[0];
1670 if (!is_a
<gcall
*> (stmt_info
->stmt
)
1671 && !is_a
<gassign
*> (stmt_info
->stmt
)
1672 && !is_a
<gphi
*> (stmt_info
->stmt
))
1675 nops
= gimple_num_args (stmt_info
->stmt
);
1676 if (const int *map
= vect_get_operand_map (stmt_info
->stmt
))
1679 /* If the SLP node is a PHI (induction or reduction), terminate
1681 bool *skip_args
= XALLOCAVEC (bool, nops
);
1682 memset (skip_args
, 0, sizeof (bool) * nops
);
1683 if (loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
))
1684 if (gphi
*stmt
= dyn_cast
<gphi
*> (stmt_info
->stmt
))
1686 tree scalar_type
= TREE_TYPE (PHI_RESULT (stmt
));
1687 tree vectype
= get_vectype_for_scalar_type (vinfo
, scalar_type
,
1689 if (!vect_record_max_nunits (vinfo
, stmt_info
, group_size
, vectype
,
1693 vect_def_type def_type
= STMT_VINFO_DEF_TYPE (stmt_info
);
1694 if (def_type
== vect_induction_def
)
1696 /* Induction PHIs are not cycles but walk the initial
1697 value. Only for inner loops through, for outer loops
1698 we need to pick up the value from the actual PHIs
1699 to more easily support peeling and epilogue vectorization. */
1700 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1701 if (!nested_in_vect_loop_p (loop
, stmt_info
))
1702 skip_args
[loop_preheader_edge (loop
)->dest_idx
] = true;
1705 skip_args
[loop_latch_edge (loop
)->dest_idx
] = true;
1707 else if (def_type
== vect_reduction_def
1708 || def_type
== vect_double_reduction_def
1709 || def_type
== vect_nested_cycle
)
1711 /* Else def types have to match. */
1712 stmt_vec_info other_info
;
1713 bool all_same
= true;
1714 FOR_EACH_VEC_ELT (stmts
, i
, other_info
)
1716 if (STMT_VINFO_DEF_TYPE (other_info
) != def_type
)
1718 if (other_info
!= stmt_info
)
1721 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1722 /* Reduction initial values are not explicitely represented. */
1723 if (!nested_in_vect_loop_p (loop
, stmt_info
))
1724 skip_args
[loop_preheader_edge (loop
)->dest_idx
] = true;
1725 /* Reduction chain backedge defs are filled manually.
1726 ??? Need a better way to identify a SLP reduction chain PHI.
1727 Or a better overall way to SLP match those. */
1728 if (all_same
&& def_type
== vect_reduction_def
)
1729 skip_args
[loop_latch_edge (loop
)->dest_idx
] = true;
1731 else if (def_type
!= vect_internal_def
)
1736 bool two_operators
= false;
1737 unsigned char *swap
= XALLOCAVEC (unsigned char, group_size
);
1738 tree vectype
= NULL_TREE
;
1739 if (!vect_build_slp_tree_1 (vinfo
, swap
, stmts
, group_size
,
1740 &this_max_nunits
, matches
, &two_operators
,
1744 /* If the SLP node is a load, terminate the recursion unless masked. */
1745 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1746 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info
)))
1748 if (gcall
*stmt
= dyn_cast
<gcall
*> (stmt_info
->stmt
))
1749 gcc_assert (gimple_call_internal_p (stmt
, IFN_MASK_LOAD
)
1750 || gimple_call_internal_p (stmt
, IFN_GATHER_LOAD
)
1751 || gimple_call_internal_p (stmt
, IFN_MASK_GATHER_LOAD
));
1754 *max_nunits
= this_max_nunits
;
1756 node
= vect_create_new_slp_node (node
, stmts
, 0);
1757 SLP_TREE_VECTYPE (node
) = vectype
;
1758 /* And compute the load permutation. Whether it is actually
1759 a permutation depends on the unrolling factor which is
1761 vec
<unsigned> load_permutation
;
1763 stmt_vec_info load_info
;
1764 load_permutation
.create (group_size
);
1765 stmt_vec_info first_stmt_info
1766 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node
)[0]);
1767 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), j
, load_info
)
1769 int load_place
= vect_get_place_in_interleaving_chain
1770 (load_info
, first_stmt_info
);
1771 gcc_assert (load_place
!= -1);
1772 load_permutation
.safe_push (load_place
);
1774 SLP_TREE_LOAD_PERMUTATION (node
) = load_permutation
;
1778 else if (gimple_assign_single_p (stmt_info
->stmt
)
1779 && !gimple_vuse (stmt_info
->stmt
)
1780 && gimple_assign_rhs_code (stmt_info
->stmt
) == BIT_FIELD_REF
)
1782 /* vect_build_slp_tree_2 determined all BIT_FIELD_REFs reference
1783 the same SSA name vector of a compatible type to vectype. */
1784 vec
<std::pair
<unsigned, unsigned> > lperm
= vNULL
;
1785 tree vec
= TREE_OPERAND (gimple_assign_rhs1 (stmt_info
->stmt
), 0);
1786 stmt_vec_info estmt_info
;
1787 FOR_EACH_VEC_ELT (stmts
, i
, estmt_info
)
1789 gassign
*estmt
= as_a
<gassign
*> (estmt_info
->stmt
);
1790 tree bfref
= gimple_assign_rhs1 (estmt
);
1792 if (!known_eq (bit_field_size (bfref
),
1793 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype
))))
1794 || !constant_multiple_p (bit_field_offset (bfref
),
1795 bit_field_size (bfref
), &lane
))
1801 lperm
.safe_push (std::make_pair (0, (unsigned)lane
));
1803 slp_tree vnode
= vect_create_new_slp_node (vNULL
);
1804 if (operand_equal_p (TYPE_SIZE (vectype
), TYPE_SIZE (TREE_TYPE (vec
))))
1805 /* ??? We record vectype here but we hide eventually necessary
1806 punning and instead rely on code generation to materialize
1807 VIEW_CONVERT_EXPRs as necessary. We instead should make
1808 this explicit somehow. */
1809 SLP_TREE_VECTYPE (vnode
) = vectype
;
1812 /* For different size but compatible elements we can still
1813 use VEC_PERM_EXPR without punning. */
1814 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (vec
))
1815 && types_compatible_p (TREE_TYPE (vectype
),
1816 TREE_TYPE (TREE_TYPE (vec
))));
1817 SLP_TREE_VECTYPE (vnode
) = TREE_TYPE (vec
);
1819 SLP_TREE_VEC_DEFS (vnode
).safe_push (vec
);
1820 /* We are always building a permutation node even if it is an identity
1821 permute to shield the rest of the vectorizer from the odd node
1822 representing an actual vector without any scalar ops.
1823 ??? We could hide it completely with making the permute node
1825 node
= vect_create_new_slp_node (node
, stmts
, 1);
1826 SLP_TREE_CODE (node
) = VEC_PERM_EXPR
;
1827 SLP_TREE_LANE_PERMUTATION (node
) = lperm
;
1828 SLP_TREE_VECTYPE (node
) = vectype
;
1829 SLP_TREE_CHILDREN (node
).quick_push (vnode
);
1832 /* When discovery reaches an associatable operation see whether we can
1833 improve that to match up lanes in a way superior to the operand
1834 swapping code which at most looks at two defs.
1835 ??? For BB vectorization we cannot do the brute-force search
1836 for matching as we can succeed by means of builds from scalars
1837 and have no good way to "cost" one build against another. */
1838 else if (is_a
<loop_vec_info
> (vinfo
)
1839 /* ??? We don't handle !vect_internal_def defs below. */
1840 && STMT_VINFO_DEF_TYPE (stmt_info
) == vect_internal_def
1841 && is_gimple_assign (stmt_info
->stmt
)
1842 && (associative_tree_code (gimple_assign_rhs_code (stmt_info
->stmt
))
1843 || gimple_assign_rhs_code (stmt_info
->stmt
) == MINUS_EXPR
)
1844 && ((FLOAT_TYPE_P (vectype
) && flag_associative_math
)
1845 || (INTEGRAL_TYPE_P (TREE_TYPE (vectype
))
1846 && TYPE_OVERFLOW_WRAPS (TREE_TYPE (vectype
)))))
1848 /* See if we have a chain of (mixed) adds or subtracts or other
1849 associatable ops. */
1850 enum tree_code code
= gimple_assign_rhs_code (stmt_info
->stmt
);
1851 if (code
== MINUS_EXPR
)
1853 stmt_vec_info other_op_stmt_info
= NULL
;
1854 stmt_vec_info op_stmt_info
= NULL
;
1855 unsigned chain_len
= 0;
1856 auto_vec
<chain_op_t
> chain
;
1857 auto_vec
<std::pair
<tree_code
, gimple
*> > worklist
;
1858 auto_vec
<vec
<chain_op_t
> > chains (group_size
);
1859 auto_vec
<slp_tree
, 4> children
;
1860 bool hard_fail
= true;
1861 for (unsigned lane
= 0; lane
< group_size
; ++lane
)
1863 /* For each lane linearize the addition/subtraction (or other
1864 uniform associatable operation) expression tree. */
1865 gimple
*op_stmt
= NULL
, *other_op_stmt
= NULL
;
1866 vect_slp_linearize_chain (vinfo
, worklist
, chain
, code
,
1867 stmts
[lane
]->stmt
, op_stmt
, other_op_stmt
,
1869 if (!op_stmt_info
&& op_stmt
)
1870 op_stmt_info
= vinfo
->lookup_stmt (op_stmt
);
1871 if (!other_op_stmt_info
&& other_op_stmt
)
1872 other_op_stmt_info
= vinfo
->lookup_stmt (other_op_stmt
);
1873 if (chain
.length () == 2)
1875 /* In a chain of just two elements resort to the regular
1876 operand swapping scheme. If we run into a length
1877 mismatch still hard-FAIL. */
1882 matches
[lane
] = false;
1883 /* ??? We might want to process the other lanes, but
1884 make sure to not give false matching hints to the
1885 caller for lanes we did not process. */
1886 if (lane
!= group_size
- 1)
1891 else if (chain_len
== 0)
1892 chain_len
= chain
.length ();
1893 else if (chain
.length () != chain_len
)
1895 /* ??? Here we could slip in magic to compensate with
1896 neutral operands. */
1897 matches
[lane
] = false;
1898 if (lane
!= group_size
- 1)
1902 chains
.quick_push (chain
.copy ());
1905 if (chains
.length () == group_size
)
1907 /* We cannot yet use SLP_TREE_CODE to communicate the operation. */
1913 /* Now we have a set of chains with the same length. */
1914 /* 1. pre-sort according to def_type and operation. */
1915 for (unsigned lane
= 0; lane
< group_size
; ++lane
)
1916 chains
[lane
].stablesort (dt_sort_cmp
, vinfo
);
1917 if (dump_enabled_p ())
1919 dump_printf_loc (MSG_NOTE
, vect_location
,
1920 "pre-sorted chains of %s\n",
1921 get_tree_code_name (code
));
1922 for (unsigned lane
= 0; lane
< group_size
; ++lane
)
1924 for (unsigned opnum
= 0; opnum
< chain_len
; ++opnum
)
1925 dump_printf (MSG_NOTE
, "%s %T ",
1926 get_tree_code_name (chains
[lane
][opnum
].code
),
1927 chains
[lane
][opnum
].op
);
1928 dump_printf (MSG_NOTE
, "\n");
1931 /* 2. try to build children nodes, associating as necessary. */
1932 for (unsigned n
= 0; n
< chain_len
; ++n
)
1934 vect_def_type dt
= chains
[0][n
].dt
;
1936 for (lane
= 0; lane
< group_size
; ++lane
)
1937 if (chains
[lane
][n
].dt
!= dt
)
1939 if (dt
== vect_constant_def
1940 && chains
[lane
][n
].dt
== vect_external_def
)
1941 dt
= vect_external_def
;
1942 else if (dt
== vect_external_def
1943 && chains
[lane
][n
].dt
== vect_constant_def
)
1948 if (lane
!= group_size
)
1950 if (dump_enabled_p ())
1951 dump_printf_loc (MSG_NOTE
, vect_location
,
1952 "giving up on chain due to mismatched "
1954 matches
[lane
] = false;
1955 if (lane
!= group_size
- 1)
1959 if (dt
== vect_constant_def
1960 || dt
== vect_external_def
)
1962 /* Check whether we can build the invariant. If we can't
1963 we never will be able to. */
1964 tree type
= TREE_TYPE (chains
[0][n
].op
);
1965 if (!GET_MODE_SIZE (vinfo
->vector_mode
).is_constant ()
1966 && (TREE_CODE (type
) == BOOLEAN_TYPE
1967 || !can_duplicate_and_interleave_p (vinfo
, group_size
,
1974 ops
.create (group_size
);
1975 for (lane
= 0; lane
< group_size
; ++lane
)
1976 ops
.quick_push (chains
[lane
][n
].op
);
1977 slp_tree child
= vect_create_new_slp_node (ops
);
1978 SLP_TREE_DEF_TYPE (child
) = dt
;
1979 children
.safe_push (child
);
1981 else if (dt
!= vect_internal_def
)
1983 /* Not sure, we might need sth special.
1984 gcc.dg/vect/pr96854.c,
1985 gfortran.dg/vect/fast-math-pr37021.f90
1986 and gfortran.dg/vect/pr61171.f trigger. */
1987 /* Soft-fail for now. */
1993 vec
<stmt_vec_info
> op_stmts
;
1994 op_stmts
.create (group_size
);
1995 slp_tree child
= NULL
;
1996 /* Brute-force our way. We have to consider a lane
1997 failing after fixing an earlier fail up in the
1998 SLP discovery recursion. So track the current
1999 permute per lane. */
2000 unsigned *perms
= XALLOCAVEC (unsigned, group_size
);
2001 memset (perms
, 0, sizeof (unsigned) * group_size
);
2004 op_stmts
.truncate (0);
2005 for (lane
= 0; lane
< group_size
; ++lane
)
2007 (vinfo
->lookup_def (chains
[lane
][n
].op
));
2008 child
= vect_build_slp_tree (vinfo
, op_stmts
,
2009 group_size
, &this_max_nunits
,
2011 &this_tree_size
, bst_map
);
2012 /* ??? We're likely getting too many fatal mismatches
2013 here so maybe we want to ignore them (but then we
2014 have no idea which lanes fatally mismatched). */
2015 if (child
|| !matches
[0])
2017 /* Swap another lane we have not yet matched up into
2018 lanes that did not match. If we run out of
2019 permute possibilities for a lane terminate the
2022 for (lane
= 1; lane
< group_size
; ++lane
)
2025 if (n
+ perms
[lane
] + 1 == chain_len
)
2030 std::swap (chains
[lane
][n
],
2031 chains
[lane
][n
+ perms
[lane
] + 1]);
2040 if (dump_enabled_p ())
2041 dump_printf_loc (MSG_NOTE
, vect_location
,
2042 "failed to match up op %d\n", n
);
2043 op_stmts
.release ();
2044 if (lane
!= group_size
- 1)
2047 matches
[lane
] = false;
2050 if (dump_enabled_p ())
2052 dump_printf_loc (MSG_NOTE
, vect_location
,
2053 "matched up op %d to\n", n
);
2054 vect_print_slp_tree (MSG_NOTE
, vect_location
, child
);
2056 children
.safe_push (child
);
2059 /* 3. build SLP nodes to combine the chain. */
2060 for (unsigned lane
= 0; lane
< group_size
; ++lane
)
2061 if (chains
[lane
][0].code
!= code
)
2063 /* See if there's any alternate all-PLUS entry. */
2065 for (n
= 1; n
< chain_len
; ++n
)
2067 for (lane
= 0; lane
< group_size
; ++lane
)
2068 if (chains
[lane
][n
].code
!= code
)
2070 if (lane
== group_size
)
2075 /* Swap that in at first position. */
2076 std::swap (children
[0], children
[n
]);
2077 for (lane
= 0; lane
< group_size
; ++lane
)
2078 std::swap (chains
[lane
][0], chains
[lane
][n
]);
2082 /* ??? When this triggers and we end up with two
2083 vect_constant/external_def up-front things break (ICE)
2084 spectacularly finding an insertion place for the
2085 all-constant op. We should have a fully
2086 vect_internal_def operand though(?) so we can swap
2087 that into first place and then prepend the all-zero
2089 if (dump_enabled_p ())
2090 dump_printf_loc (MSG_NOTE
, vect_location
,
2091 "inserting constant zero to compensate "
2092 "for (partially) negated first "
2095 for (lane
= 0; lane
< group_size
; ++lane
)
2096 chains
[lane
].safe_insert
2097 (0, chain_op_t (code
, vect_constant_def
, NULL_TREE
));
2099 zero_ops
.create (group_size
);
2100 zero_ops
.quick_push (build_zero_cst (TREE_TYPE (vectype
)));
2101 for (lane
= 1; lane
< group_size
; ++lane
)
2102 zero_ops
.quick_push (zero_ops
[0]);
2103 slp_tree zero
= vect_create_new_slp_node (zero_ops
);
2104 SLP_TREE_DEF_TYPE (zero
) = vect_constant_def
;
2105 children
.safe_insert (0, zero
);
2109 for (unsigned i
= 1; i
< children
.length (); ++i
)
2111 slp_tree op0
= children
[i
- 1];
2112 slp_tree op1
= children
[i
];
2113 bool this_two_op
= false;
2114 for (unsigned lane
= 0; lane
< group_size
; ++lane
)
2115 if (chains
[lane
][i
].code
!= chains
[0][i
].code
)
2121 if (i
== children
.length () - 1)
2122 child
= vect_create_new_slp_node (node
, stmts
, 2);
2124 child
= vect_create_new_slp_node (2, ERROR_MARK
);
2127 vec
<std::pair
<unsigned, unsigned> > lperm
;
2128 lperm
.create (group_size
);
2129 for (unsigned lane
= 0; lane
< group_size
; ++lane
)
2130 lperm
.quick_push (std::make_pair
2131 (chains
[lane
][i
].code
!= chains
[0][i
].code
, lane
));
2132 vect_slp_build_two_operator_nodes (child
, vectype
, op0
, op1
,
2133 (chains
[0][i
].code
== code
2135 : other_op_stmt_info
),
2136 (chains
[0][i
].code
== code
2137 ? other_op_stmt_info
2143 SLP_TREE_DEF_TYPE (child
) = vect_internal_def
;
2144 SLP_TREE_VECTYPE (child
) = vectype
;
2145 SLP_TREE_LANES (child
) = group_size
;
2146 SLP_TREE_CHILDREN (child
).quick_push (op0
);
2147 SLP_TREE_CHILDREN (child
).quick_push (op1
);
2148 SLP_TREE_REPRESENTATIVE (child
)
2149 = (chains
[0][i
].code
== code
2150 ? op_stmt_info
: other_op_stmt_info
);
2152 children
[i
] = child
;
2154 *tree_size
+= this_tree_size
+ 1;
2155 *max_nunits
= this_max_nunits
;
2156 while (!chains
.is_empty ())
2157 chains
.pop ().release ();
2161 while (!children
.is_empty ())
2162 vect_free_slp_tree (children
.pop ());
2163 while (!chains
.is_empty ())
2164 chains
.pop ().release ();
2165 /* Hard-fail, otherwise we might run into quadratic processing of the
2166 chains starting one stmt into the chain again. */
2169 /* Fall thru to normal processing. */
2172 /* Get at the operands, verifying they are compatible. */
2173 vec
<slp_oprnd_info
> oprnds_info
= vect_create_oprnd_info (nops
, group_size
);
2174 slp_oprnd_info oprnd_info
;
2175 FOR_EACH_VEC_ELT (stmts
, i
, stmt_info
)
2177 int res
= vect_get_and_check_slp_defs (vinfo
, swap
[i
], skip_args
,
2178 stmts
, i
, &oprnds_info
);
2180 matches
[(res
== -1) ? 0 : i
] = false;
2184 for (i
= 0; i
< group_size
; ++i
)
2187 vect_free_oprnd_info (oprnds_info
);
2192 auto_vec
<slp_tree
, 4> children
;
2194 stmt_info
= stmts
[0];
2196 /* Create SLP_TREE nodes for the definition node/s. */
2197 FOR_EACH_VEC_ELT (oprnds_info
, i
, oprnd_info
)
2202 /* We're skipping certain operands from processing, for example
2203 outer loop reduction initial defs. */
2206 children
.safe_push (NULL
);
2210 if (oprnd_info
->first_dt
== vect_uninitialized_def
)
2212 /* COND_EXPR have one too many eventually if the condition
2214 gcc_assert (i
== 3 && nops
== 4);
2218 if (is_a
<bb_vec_info
> (vinfo
)
2219 && oprnd_info
->first_dt
== vect_internal_def
2220 && !oprnd_info
->any_pattern
)
2222 /* For BB vectorization, if all defs are the same do not
2223 bother to continue the build along the single-lane
2224 graph but use a splat of the scalar value. */
2225 stmt_vec_info first_def
= oprnd_info
->def_stmts
[0];
2226 for (j
= 1; j
< group_size
; ++j
)
2227 if (oprnd_info
->def_stmts
[j
] != first_def
)
2230 /* But avoid doing this for loads where we may be
2231 able to CSE things, unless the stmt is not
2233 && (!STMT_VINFO_VECTORIZABLE (first_def
)
2234 || !gimple_vuse (first_def
->stmt
)))
2236 if (dump_enabled_p ())
2237 dump_printf_loc (MSG_NOTE
, vect_location
,
2238 "Using a splat of the uniform operand %G",
2240 oprnd_info
->first_dt
= vect_external_def
;
2244 if (oprnd_info
->first_dt
== vect_external_def
2245 || oprnd_info
->first_dt
== vect_constant_def
)
2247 slp_tree invnode
= vect_create_new_slp_node (oprnd_info
->ops
);
2248 SLP_TREE_DEF_TYPE (invnode
) = oprnd_info
->first_dt
;
2249 oprnd_info
->ops
= vNULL
;
2250 children
.safe_push (invnode
);
2254 if ((child
= vect_build_slp_tree (vinfo
, oprnd_info
->def_stmts
,
2255 group_size
, &this_max_nunits
,
2257 &this_tree_size
, bst_map
)) != NULL
)
2259 oprnd_info
->def_stmts
= vNULL
;
2260 children
.safe_push (child
);
2264 /* If the SLP build for operand zero failed and operand zero
2265 and one can be commutated try that for the scalar stmts
2266 that failed the match. */
2268 /* A first scalar stmt mismatch signals a fatal mismatch. */
2270 /* ??? For COND_EXPRs we can swap the comparison operands
2271 as well as the arms under some constraints. */
2273 && oprnds_info
[1]->first_dt
== vect_internal_def
2274 && is_gimple_assign (stmt_info
->stmt
)
2275 /* Swapping operands for reductions breaks assumptions later on. */
2276 && STMT_VINFO_DEF_TYPE (stmt_info
) != vect_reduction_def
2277 && STMT_VINFO_DEF_TYPE (stmt_info
) != vect_double_reduction_def
)
2279 /* See whether we can swap the matching or the non-matching
2281 bool swap_not_matching
= true;
2284 for (j
= 0; j
< group_size
; ++j
)
2286 if (matches
[j
] != !swap_not_matching
)
2288 stmt_vec_info stmt_info
= stmts
[j
];
2289 /* Verify if we can swap operands of this stmt. */
2290 gassign
*stmt
= dyn_cast
<gassign
*> (stmt_info
->stmt
);
2292 || !commutative_tree_code (gimple_assign_rhs_code (stmt
)))
2294 if (!swap_not_matching
)
2296 swap_not_matching
= false;
2301 while (j
!= group_size
);
2303 /* Swap mismatched definition stmts. */
2304 if (dump_enabled_p ())
2305 dump_printf_loc (MSG_NOTE
, vect_location
,
2306 "Re-trying with swapped operands of stmts ");
2307 for (j
= 0; j
< group_size
; ++j
)
2308 if (matches
[j
] == !swap_not_matching
)
2310 std::swap (oprnds_info
[0]->def_stmts
[j
],
2311 oprnds_info
[1]->def_stmts
[j
]);
2312 std::swap (oprnds_info
[0]->ops
[j
],
2313 oprnds_info
[1]->ops
[j
]);
2314 if (dump_enabled_p ())
2315 dump_printf (MSG_NOTE
, "%d ", j
);
2317 if (dump_enabled_p ())
2318 dump_printf (MSG_NOTE
, "\n");
2319 /* After swapping some operands we lost track whether an
2320 operand has any pattern defs so be conservative here. */
2321 if (oprnds_info
[0]->any_pattern
|| oprnds_info
[1]->any_pattern
)
2322 oprnds_info
[0]->any_pattern
= oprnds_info
[1]->any_pattern
= true;
2323 /* And try again with scratch 'matches' ... */
2324 bool *tem
= XALLOCAVEC (bool, group_size
);
2325 if ((child
= vect_build_slp_tree (vinfo
, oprnd_info
->def_stmts
,
2326 group_size
, &this_max_nunits
,
2328 &this_tree_size
, bst_map
)) != NULL
)
2330 oprnd_info
->def_stmts
= vNULL
;
2331 children
.safe_push (child
);
2337 /* If the SLP build failed and we analyze a basic-block
2338 simply treat nodes we fail to build as externally defined
2339 (and thus build vectors from the scalar defs).
2340 The cost model will reject outright expensive cases.
2341 ??? This doesn't treat cases where permutation ultimatively
2342 fails (or we don't try permutation below). Ideally we'd
2343 even compute a permutation that will end up with the maximum
2345 if (is_a
<bb_vec_info
> (vinfo
)
2346 /* ??? Rejecting patterns this way doesn't work. We'd have to
2347 do extra work to cancel the pattern so the uses see the
2349 && !is_pattern_stmt_p (stmt_info
)
2350 && !oprnd_info
->any_pattern
)
2352 /* But if there's a leading vector sized set of matching stmts
2353 fail here so we can split the group. This matches the condition
2354 vect_analyze_slp_instance uses. */
2355 /* ??? We might want to split here and combine the results to support
2356 multiple vector sizes better. */
2357 for (j
= 0; j
< group_size
; ++j
)
2360 if (!known_ge (j
, TYPE_VECTOR_SUBPARTS (vectype
)))
2362 if (dump_enabled_p ())
2363 dump_printf_loc (MSG_NOTE
, vect_location
,
2364 "Building vector operands from scalars\n");
2366 child
= vect_create_new_slp_node (oprnd_info
->ops
);
2367 children
.safe_push (child
);
2368 oprnd_info
->ops
= vNULL
;
2373 gcc_assert (child
== NULL
);
2374 FOR_EACH_VEC_ELT (children
, j
, child
)
2376 vect_free_slp_tree (child
);
2377 vect_free_oprnd_info (oprnds_info
);
2381 vect_free_oprnd_info (oprnds_info
);
2383 /* If we have all children of a child built up from uniform scalars
2384 or does more than one possibly expensive vector construction then
2385 just throw that away, causing it built up from scalars.
2386 The exception is the SLP node for the vector store. */
2387 if (is_a
<bb_vec_info
> (vinfo
)
2388 && !STMT_VINFO_GROUPED_ACCESS (stmt_info
)
2389 /* ??? Rejecting patterns this way doesn't work. We'd have to
2390 do extra work to cancel the pattern so the uses see the
2392 && !is_pattern_stmt_p (stmt_info
))
2396 bool all_uniform_p
= true;
2397 unsigned n_vector_builds
= 0;
2398 FOR_EACH_VEC_ELT (children
, j
, child
)
2402 else if (SLP_TREE_DEF_TYPE (child
) == vect_internal_def
)
2403 all_uniform_p
= false;
2404 else if (!vect_slp_tree_uniform_p (child
))
2406 all_uniform_p
= false;
2407 if (SLP_TREE_DEF_TYPE (child
) == vect_external_def
)
2412 || n_vector_builds
> 1
2413 || (n_vector_builds
== children
.length ()
2414 && is_a
<gphi
*> (stmt_info
->stmt
)))
2418 FOR_EACH_VEC_ELT (children
, j
, child
)
2420 vect_free_slp_tree (child
);
2422 if (dump_enabled_p ())
2423 dump_printf_loc (MSG_NOTE
, vect_location
,
2424 "Building parent vector operands from "
2425 "scalars instead\n");
2430 *tree_size
+= this_tree_size
+ 1;
2431 *max_nunits
= this_max_nunits
;
2435 /* ??? We'd likely want to either cache in bst_map sth like
2436 { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
2437 the true { a+b, a+b, a+b, a+b } ... but there we don't have
2438 explicit stmts to put in so the keying on 'stmts' doesn't
2439 work (but we have the same issue with nodes that use 'ops'). */
2440 slp_tree one
= new _slp_tree
;
2441 slp_tree two
= new _slp_tree
;
2442 SLP_TREE_DEF_TYPE (one
) = vect_internal_def
;
2443 SLP_TREE_DEF_TYPE (two
) = vect_internal_def
;
2444 SLP_TREE_VECTYPE (one
) = vectype
;
2445 SLP_TREE_VECTYPE (two
) = vectype
;
2446 SLP_TREE_CHILDREN (one
).safe_splice (children
);
2447 SLP_TREE_CHILDREN (two
).safe_splice (children
);
2449 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two
), i
, child
)
2450 SLP_TREE_REF_COUNT (child
)++;
2452 /* Here we record the original defs since this
2453 node represents the final lane configuration. */
2454 node
= vect_create_new_slp_node (node
, stmts
, 2);
2455 SLP_TREE_VECTYPE (node
) = vectype
;
2456 SLP_TREE_CODE (node
) = VEC_PERM_EXPR
;
2457 SLP_TREE_CHILDREN (node
).quick_push (one
);
2458 SLP_TREE_CHILDREN (node
).quick_push (two
);
2459 gassign
*stmt
= as_a
<gassign
*> (stmts
[0]->stmt
);
2460 enum tree_code code0
= gimple_assign_rhs_code (stmt
);
2461 enum tree_code ocode
= ERROR_MARK
;
2462 stmt_vec_info ostmt_info
;
2464 FOR_EACH_VEC_ELT (stmts
, i
, ostmt_info
)
2466 gassign
*ostmt
= as_a
<gassign
*> (ostmt_info
->stmt
);
2467 if (gimple_assign_rhs_code (ostmt
) != code0
)
2469 SLP_TREE_LANE_PERMUTATION (node
).safe_push (std::make_pair (1, i
));
2470 ocode
= gimple_assign_rhs_code (ostmt
);
2474 SLP_TREE_LANE_PERMUTATION (node
).safe_push (std::make_pair (0, i
));
2476 SLP_TREE_CODE (one
) = code0
;
2477 SLP_TREE_CODE (two
) = ocode
;
2478 SLP_TREE_LANES (one
) = stmts
.length ();
2479 SLP_TREE_LANES (two
) = stmts
.length ();
2480 SLP_TREE_REPRESENTATIVE (one
) = stmts
[0];
2481 SLP_TREE_REPRESENTATIVE (two
) = stmts
[j
];
2485 node
= vect_create_new_slp_node (node
, stmts
, nops
);
2486 SLP_TREE_VECTYPE (node
) = vectype
;
2487 SLP_TREE_CHILDREN (node
).splice (children
);
2491 /* Dump a single SLP tree NODE. */
2494 vect_print_slp_tree (dump_flags_t dump_kind
, dump_location_t loc
,
2499 stmt_vec_info stmt_info
;
2502 dump_metadata_t
metadata (dump_kind
, loc
.get_impl_location ());
2503 dump_user_location_t user_loc
= loc
.get_user_location ();
2504 dump_printf_loc (metadata
, user_loc
, "node%s %p (max_nunits=%u, refcnt=%u)",
2505 SLP_TREE_DEF_TYPE (node
) == vect_external_def
2507 : (SLP_TREE_DEF_TYPE (node
) == vect_constant_def
2510 estimated_poly_value (node
->max_nunits
),
2511 SLP_TREE_REF_COUNT (node
));
2512 if (SLP_TREE_VECTYPE (node
))
2513 dump_printf (metadata
, " %T", SLP_TREE_VECTYPE (node
));
2514 dump_printf (metadata
, "\n");
2515 if (SLP_TREE_DEF_TYPE (node
) == vect_internal_def
)
2517 if (SLP_TREE_CODE (node
) == VEC_PERM_EXPR
)
2518 dump_printf_loc (metadata
, user_loc
, "op: VEC_PERM_EXPR\n");
2520 dump_printf_loc (metadata
, user_loc
, "op template: %G",
2521 SLP_TREE_REPRESENTATIVE (node
)->stmt
);
2523 if (SLP_TREE_SCALAR_STMTS (node
).exists ())
2524 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
2525 dump_printf_loc (metadata
, user_loc
, "\tstmt %u %G", i
, stmt_info
->stmt
);
2528 dump_printf_loc (metadata
, user_loc
, "\t{ ");
2529 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node
), i
, op
)
2530 dump_printf (metadata
, "%T%s ", op
,
2531 i
< SLP_TREE_SCALAR_OPS (node
).length () - 1 ? "," : "");
2532 dump_printf (metadata
, "}\n");
2534 if (SLP_TREE_LOAD_PERMUTATION (node
).exists ())
2536 dump_printf_loc (metadata
, user_loc
, "\tload permutation {");
2537 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node
), i
, j
)
2538 dump_printf (dump_kind
, " %u", j
);
2539 dump_printf (dump_kind
, " }\n");
2541 if (SLP_TREE_LANE_PERMUTATION (node
).exists ())
2543 dump_printf_loc (metadata
, user_loc
, "\tlane permutation {");
2544 for (i
= 0; i
< SLP_TREE_LANE_PERMUTATION (node
).length (); ++i
)
2545 dump_printf (dump_kind
, " %u[%u]",
2546 SLP_TREE_LANE_PERMUTATION (node
)[i
].first
,
2547 SLP_TREE_LANE_PERMUTATION (node
)[i
].second
);
2548 dump_printf (dump_kind
, " }\n");
2550 if (SLP_TREE_CHILDREN (node
).is_empty ())
2552 dump_printf_loc (metadata
, user_loc
, "\tchildren");
2553 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
2554 dump_printf (dump_kind
, " %p", (void *)child
);
2555 dump_printf (dump_kind
, "\n");
2559 debug (slp_tree node
)
2561 debug_dump_context ctx
;
2562 vect_print_slp_tree (MSG_NOTE
,
2563 dump_location_t::from_location_t (UNKNOWN_LOCATION
),
2567 /* Recursive helper for the dot producer below. */
2570 dot_slp_tree (FILE *f
, slp_tree node
, hash_set
<slp_tree
> &visited
)
2572 if (visited
.add (node
))
2575 fprintf (f
, "\"%p\" [label=\"", (void *)node
);
2576 vect_print_slp_tree (MSG_NOTE
,
2577 dump_location_t::from_location_t (UNKNOWN_LOCATION
),
2579 fprintf (f
, "\"];\n");
2582 for (slp_tree child
: SLP_TREE_CHILDREN (node
))
2583 fprintf (f
, "\"%p\" -> \"%p\";", (void *)node
, (void *)child
);
2585 for (slp_tree child
: SLP_TREE_CHILDREN (node
))
2587 dot_slp_tree (f
, child
, visited
);
2591 dot_slp_tree (const char *fname
, slp_tree node
)
2593 FILE *f
= fopen (fname
, "w");
2594 fprintf (f
, "digraph {\n");
2597 debug_dump_context
ctx (f
);
2598 hash_set
<slp_tree
> visited
;
2599 dot_slp_tree (f
, node
, visited
);
2606 /* Dump a slp tree NODE using flags specified in DUMP_KIND. */
2609 vect_print_slp_graph (dump_flags_t dump_kind
, dump_location_t loc
,
2610 slp_tree node
, hash_set
<slp_tree
> &visited
)
2615 if (visited
.add (node
))
2618 vect_print_slp_tree (dump_kind
, loc
, node
);
2620 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
2622 vect_print_slp_graph (dump_kind
, loc
, child
, visited
);
2626 vect_print_slp_graph (dump_flags_t dump_kind
, dump_location_t loc
,
2629 hash_set
<slp_tree
> visited
;
2630 vect_print_slp_graph (dump_kind
, loc
, entry
, visited
);
2633 /* Mark the tree rooted at NODE with PURE_SLP. */
2636 vect_mark_slp_stmts (slp_tree node
, hash_set
<slp_tree
> &visited
)
2639 stmt_vec_info stmt_info
;
2642 if (SLP_TREE_DEF_TYPE (node
) != vect_internal_def
)
2645 if (visited
.add (node
))
2648 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
2649 STMT_SLP_TYPE (stmt_info
) = pure_slp
;
2651 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
2653 vect_mark_slp_stmts (child
, visited
);
2657 vect_mark_slp_stmts (slp_tree node
)
2659 hash_set
<slp_tree
> visited
;
2660 vect_mark_slp_stmts (node
, visited
);
2663 /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */
2666 vect_mark_slp_stmts_relevant (slp_tree node
, hash_set
<slp_tree
> &visited
)
2669 stmt_vec_info stmt_info
;
2672 if (SLP_TREE_DEF_TYPE (node
) != vect_internal_def
)
2675 if (visited
.add (node
))
2678 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
2680 gcc_assert (!STMT_VINFO_RELEVANT (stmt_info
)
2681 || STMT_VINFO_RELEVANT (stmt_info
) == vect_used_in_scope
);
2682 STMT_VINFO_RELEVANT (stmt_info
) = vect_used_in_scope
;
2685 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
2687 vect_mark_slp_stmts_relevant (child
, visited
);
2691 vect_mark_slp_stmts_relevant (slp_tree node
)
2693 hash_set
<slp_tree
> visited
;
2694 vect_mark_slp_stmts_relevant (node
, visited
);
2698 /* Gather loads in the SLP graph NODE and populate the INST loads array. */
2701 vect_gather_slp_loads (vec
<slp_tree
> &loads
, slp_tree node
,
2702 hash_set
<slp_tree
> &visited
)
2704 if (!node
|| visited
.add (node
))
2707 if (SLP_TREE_CHILDREN (node
).length () == 0)
2709 if (SLP_TREE_DEF_TYPE (node
) != vect_internal_def
)
2711 stmt_vec_info stmt_info
= SLP_TREE_SCALAR_STMTS (node
)[0];
2712 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
2713 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info
)))
2714 loads
.safe_push (node
);
2720 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
2721 vect_gather_slp_loads (loads
, child
, visited
);
2726 /* Find the last store in SLP INSTANCE. */
2729 vect_find_last_scalar_stmt_in_slp (slp_tree node
)
2731 stmt_vec_info last
= NULL
;
2732 stmt_vec_info stmt_vinfo
;
2734 for (int i
= 0; SLP_TREE_SCALAR_STMTS (node
).iterate (i
, &stmt_vinfo
); i
++)
2736 stmt_vinfo
= vect_orig_stmt (stmt_vinfo
);
2737 last
= last
? get_later_stmt (stmt_vinfo
, last
) : stmt_vinfo
;
2743 /* Find the first stmt in NODE. */
2746 vect_find_first_scalar_stmt_in_slp (slp_tree node
)
2748 stmt_vec_info first
= NULL
;
2749 stmt_vec_info stmt_vinfo
;
2751 for (int i
= 0; SLP_TREE_SCALAR_STMTS (node
).iterate (i
, &stmt_vinfo
); i
++)
2753 stmt_vinfo
= vect_orig_stmt (stmt_vinfo
);
2755 || get_later_stmt (stmt_vinfo
, first
) == first
)
2762 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
2763 two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
2764 (also containing the first GROUP1_SIZE stmts, since stores are
2765 consecutive), the second containing the remainder.
2766 Return the first stmt in the second group. */
2768 static stmt_vec_info
2769 vect_split_slp_store_group (stmt_vec_info first_vinfo
, unsigned group1_size
)
2771 gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo
) == first_vinfo
);
2772 gcc_assert (group1_size
> 0);
2773 int group2_size
= DR_GROUP_SIZE (first_vinfo
) - group1_size
;
2774 gcc_assert (group2_size
> 0);
2775 DR_GROUP_SIZE (first_vinfo
) = group1_size
;
2777 stmt_vec_info stmt_info
= first_vinfo
;
2778 for (unsigned i
= group1_size
; i
> 1; i
--)
2780 stmt_info
= DR_GROUP_NEXT_ELEMENT (stmt_info
);
2781 gcc_assert (DR_GROUP_GAP (stmt_info
) == 1);
2783 /* STMT is now the last element of the first group. */
2784 stmt_vec_info group2
= DR_GROUP_NEXT_ELEMENT (stmt_info
);
2785 DR_GROUP_NEXT_ELEMENT (stmt_info
) = 0;
2787 DR_GROUP_SIZE (group2
) = group2_size
;
2788 for (stmt_info
= group2
; stmt_info
;
2789 stmt_info
= DR_GROUP_NEXT_ELEMENT (stmt_info
))
2791 DR_GROUP_FIRST_ELEMENT (stmt_info
) = group2
;
2792 gcc_assert (DR_GROUP_GAP (stmt_info
) == 1);
2795 /* For the second group, the DR_GROUP_GAP is that before the original group,
2796 plus skipping over the first vector. */
2797 DR_GROUP_GAP (group2
) = DR_GROUP_GAP (first_vinfo
) + group1_size
;
2799 /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
2800 DR_GROUP_GAP (first_vinfo
) += group2_size
;
2802 if (dump_enabled_p ())
2803 dump_printf_loc (MSG_NOTE
, vect_location
, "Split group into %d and %d\n",
2804 group1_size
, group2_size
);
2809 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
2810 statements and a vector of NUNITS elements. */
2813 calculate_unrolling_factor (poly_uint64 nunits
, unsigned int group_size
)
2815 return exact_div (common_multiple (nunits
, group_size
), group_size
);
2818 /* Helper that checks to see if a node is a load node. */
2821 vect_is_slp_load_node (slp_tree root
)
2823 return SLP_TREE_DEF_TYPE (root
) == vect_internal_def
2824 && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_REPRESENTATIVE (root
))
2825 && DR_IS_READ (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (root
)));
2829 /* Helper function of optimize_load_redistribution that performs the operation
2833 optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t
*bst_map
,
2834 vec_info
*vinfo
, unsigned int group_size
,
2835 hash_map
<slp_tree
, slp_tree
> *load_map
,
2838 if (slp_tree
*leader
= load_map
->get (root
))
2844 /* For now, we don't know anything about externals so do not do anything. */
2845 if (!root
|| SLP_TREE_DEF_TYPE (root
) != vect_internal_def
)
2847 else if (SLP_TREE_CODE (root
) == VEC_PERM_EXPR
)
2849 /* First convert this node into a load node and add it to the leaves
2850 list and flatten the permute from a lane to a load one. If it's
2851 unneeded it will be elided later. */
2852 vec
<stmt_vec_info
> stmts
;
2853 stmts
.create (SLP_TREE_LANES (root
));
2854 lane_permutation_t lane_perm
= SLP_TREE_LANE_PERMUTATION (root
);
2855 for (unsigned j
= 0; j
< lane_perm
.length (); j
++)
2857 std::pair
<unsigned, unsigned> perm
= lane_perm
[j
];
2858 node
= SLP_TREE_CHILDREN (root
)[perm
.first
];
2860 if (!vect_is_slp_load_node (node
)
2861 || SLP_TREE_CHILDREN (node
).exists ())
2867 stmts
.quick_push (SLP_TREE_SCALAR_STMTS (node
)[perm
.second
]);
2870 if (dump_enabled_p ())
2871 dump_printf_loc (MSG_NOTE
, vect_location
,
2872 "converting stmts on permute node %p\n", root
);
2874 bool *matches
= XALLOCAVEC (bool, group_size
);
2875 poly_uint64 max_nunits
= 1;
2876 unsigned tree_size
= 0, limit
= 1;
2877 node
= vect_build_slp_tree (vinfo
, stmts
, group_size
, &max_nunits
,
2878 matches
, &limit
, &tree_size
, bst_map
);
2882 load_map
->put (root
, node
);
2887 load_map
->put (root
, NULL
);
2889 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root
), i
, node
)
2892 = optimize_load_redistribution_1 (bst_map
, vinfo
, group_size
, load_map
,
2896 SLP_TREE_REF_COUNT (value
)++;
2897 SLP_TREE_CHILDREN (root
)[i
] = value
;
2898 /* ??? We know the original leafs of the replaced nodes will
2899 be referenced by bst_map, only the permutes created by
2900 pattern matching are not. */
2901 if (SLP_TREE_REF_COUNT (node
) == 1)
2902 load_map
->remove (node
);
2903 vect_free_slp_tree (node
);
2910 /* Temporary workaround for loads not being CSEd during SLP build. This
2911 function will traverse the SLP tree rooted in ROOT for INSTANCE and find
2912 VEC_PERM nodes that blend vectors from multiple nodes that all read from the
2913 same DR such that the final operation is equal to a permuted load. Such
2914 NODES are then directly converted into LOADS themselves. The nodes are
2915 CSEd using BST_MAP. */
2918 optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t
*bst_map
,
2919 vec_info
*vinfo
, unsigned int group_size
,
2920 hash_map
<slp_tree
, slp_tree
> *load_map
,
2926 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root
), i
, node
)
2929 = optimize_load_redistribution_1 (bst_map
, vinfo
, group_size
, load_map
,
2933 SLP_TREE_REF_COUNT (value
)++;
2934 SLP_TREE_CHILDREN (root
)[i
] = value
;
2935 /* ??? We know the original leafs of the replaced nodes will
2936 be referenced by bst_map, only the permutes created by
2937 pattern matching are not. */
2938 if (SLP_TREE_REF_COUNT (node
) == 1)
2939 load_map
->remove (node
);
2940 vect_free_slp_tree (node
);
2945 /* Helper function of vect_match_slp_patterns.
2947 Attempts to match patterns against the slp tree rooted in REF_NODE using
2948 VINFO. Patterns are matched in post-order traversal.
2950 If matching is successful the value in REF_NODE is updated and returned, if
2951 not then it is returned unchanged. */
2954 vect_match_slp_patterns_2 (slp_tree
*ref_node
, vec_info
*vinfo
,
2955 slp_tree_to_load_perm_map_t
*perm_cache
,
2956 slp_compat_nodes_map_t
*compat_cache
,
2957 hash_set
<slp_tree
> *visited
)
2960 slp_tree node
= *ref_node
;
2961 bool found_p
= false;
2962 if (!node
|| visited
->add (node
))
2966 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
2967 found_p
|= vect_match_slp_patterns_2 (&SLP_TREE_CHILDREN (node
)[i
],
2968 vinfo
, perm_cache
, compat_cache
,
2971 for (unsigned x
= 0; x
< num__slp_patterns
; x
++)
2973 vect_pattern
*pattern
2974 = slp_patterns
[x
] (perm_cache
, compat_cache
, ref_node
);
2977 pattern
->build (vinfo
);
2986 /* Applies pattern matching to the given SLP tree rooted in REF_NODE using
2989 The modified tree is returned. Patterns are tried in order and multiple
2990 patterns may match. */
2993 vect_match_slp_patterns (slp_instance instance
, vec_info
*vinfo
,
2994 hash_set
<slp_tree
> *visited
,
2995 slp_tree_to_load_perm_map_t
*perm_cache
,
2996 slp_compat_nodes_map_t
*compat_cache
)
2998 DUMP_VECT_SCOPE ("vect_match_slp_patterns");
2999 slp_tree
*ref_node
= &SLP_INSTANCE_TREE (instance
);
3001 if (dump_enabled_p ())
3002 dump_printf_loc (MSG_NOTE
, vect_location
,
3003 "Analyzing SLP tree %p for patterns\n",
3004 SLP_INSTANCE_TREE (instance
));
3006 return vect_match_slp_patterns_2 (ref_node
, vinfo
, perm_cache
, compat_cache
,
3010 /* STMT_INFO is a store group of size GROUP_SIZE that we are considering
3011 splitting into two, with the first split group having size NEW_GROUP_SIZE.
3012 Return true if we could use IFN_STORE_LANES instead and if that appears
3013 to be the better approach. */
3016 vect_slp_prefer_store_lanes_p (vec_info
*vinfo
, stmt_vec_info stmt_info
,
3017 unsigned int group_size
,
3018 unsigned int new_group_size
)
3020 tree scalar_type
= TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info
)));
3021 tree vectype
= get_vectype_for_scalar_type (vinfo
, scalar_type
);
3024 /* Allow the split if one of the two new groups would operate on full
3025 vectors *within* rather than across one scalar loop iteration.
3026 This is purely a heuristic, but it should work well for group
3027 sizes of 3 and 4, where the possible splits are:
3029 3->2+1: OK if the vector has exactly two elements
3031 4->3+1: Less clear-cut. */
3032 if (multiple_p (group_size
- new_group_size
, TYPE_VECTOR_SUBPARTS (vectype
))
3033 || multiple_p (new_group_size
, TYPE_VECTOR_SUBPARTS (vectype
)))
3035 return vect_store_lanes_supported (vectype
, group_size
, false);
3038 /* Analyze an SLP instance starting from a group of grouped stores. Call
3039 vect_build_slp_tree to build a tree of packed stmts if possible.
3040 Return FALSE if it's impossible to SLP any stmt in the loop. */
3043 vect_analyze_slp_instance (vec_info
*vinfo
,
3044 scalar_stmts_to_slp_tree_map_t
*bst_map
,
3045 stmt_vec_info stmt_info
, slp_instance_kind kind
,
3046 unsigned max_tree_size
, unsigned *limit
);
3048 /* Analyze an SLP instance starting from SCALAR_STMTS which are a group
3049 of KIND. Return true if successful. */
3052 vect_build_slp_instance (vec_info
*vinfo
,
3053 slp_instance_kind kind
,
3054 vec
<stmt_vec_info
> &scalar_stmts
,
3055 vec
<stmt_vec_info
> &root_stmt_infos
,
3056 unsigned max_tree_size
, unsigned *limit
,
3057 scalar_stmts_to_slp_tree_map_t
*bst_map
,
3058 /* ??? We need stmt_info for group splitting. */
3059 stmt_vec_info stmt_info_
)
3061 if (dump_enabled_p ())
3063 dump_printf_loc (MSG_NOTE
, vect_location
,
3064 "Starting SLP discovery for\n");
3065 for (unsigned i
= 0; i
< scalar_stmts
.length (); ++i
)
3066 dump_printf_loc (MSG_NOTE
, vect_location
,
3067 " %G", scalar_stmts
[i
]->stmt
);
3070 /* Build the tree for the SLP instance. */
3071 unsigned int group_size
= scalar_stmts
.length ();
3072 bool *matches
= XALLOCAVEC (bool, group_size
);
3073 poly_uint64 max_nunits
= 1;
3074 unsigned tree_size
= 0;
3076 slp_tree node
= vect_build_slp_tree (vinfo
, scalar_stmts
, group_size
,
3077 &max_nunits
, matches
, limit
,
3078 &tree_size
, bst_map
);
3081 /* Calculate the unrolling factor based on the smallest type. */
3082 poly_uint64 unrolling_factor
3083 = calculate_unrolling_factor (max_nunits
, group_size
);
3085 if (maybe_ne (unrolling_factor
, 1U)
3086 && is_a
<bb_vec_info
> (vinfo
))
3088 unsigned HOST_WIDE_INT const_max_nunits
;
3089 if (!max_nunits
.is_constant (&const_max_nunits
)
3090 || const_max_nunits
> group_size
)
3092 if (dump_enabled_p ())
3093 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3094 "Build SLP failed: store group "
3095 "size not a multiple of the vector size "
3096 "in basic block SLP\n");
3097 vect_free_slp_tree (node
);
3100 /* Fatal mismatch. */
3101 if (dump_enabled_p ())
3102 dump_printf_loc (MSG_NOTE
, vect_location
,
3103 "SLP discovery succeeded but node needs "
3105 memset (matches
, true, group_size
);
3106 matches
[group_size
/ const_max_nunits
* const_max_nunits
] = false;
3107 vect_free_slp_tree (node
);
3111 /* Create a new SLP instance. */
3112 slp_instance new_instance
= XNEW (class _slp_instance
);
3113 SLP_INSTANCE_TREE (new_instance
) = node
;
3114 SLP_INSTANCE_UNROLLING_FACTOR (new_instance
) = unrolling_factor
;
3115 SLP_INSTANCE_LOADS (new_instance
) = vNULL
;
3116 SLP_INSTANCE_ROOT_STMTS (new_instance
) = root_stmt_infos
;
3117 SLP_INSTANCE_KIND (new_instance
) = kind
;
3118 new_instance
->reduc_phis
= NULL
;
3119 new_instance
->cost_vec
= vNULL
;
3120 new_instance
->subgraph_entries
= vNULL
;
3122 if (dump_enabled_p ())
3123 dump_printf_loc (MSG_NOTE
, vect_location
,
3124 "SLP size %u vs. limit %u.\n",
3125 tree_size
, max_tree_size
);
3127 /* Fixup SLP reduction chains. */
3128 if (kind
== slp_inst_kind_reduc_chain
)
3130 /* If this is a reduction chain with a conversion in front
3131 amend the SLP tree with a node for that. */
3133 = vect_orig_stmt (scalar_stmts
[group_size
- 1])->stmt
;
3134 if (STMT_VINFO_DEF_TYPE (scalar_stmts
[0]) != vect_reduction_def
)
3136 /* Get at the conversion stmt - we know it's the single use
3137 of the last stmt of the reduction chain. */
3138 use_operand_p use_p
;
3139 bool r
= single_imm_use (gimple_assign_lhs (scalar_def
),
3140 &use_p
, &scalar_def
);
3142 stmt_vec_info next_info
= vinfo
->lookup_stmt (scalar_def
);
3143 next_info
= vect_stmt_to_vectorize (next_info
);
3144 scalar_stmts
= vNULL
;
3145 scalar_stmts
.create (group_size
);
3146 for (unsigned i
= 0; i
< group_size
; ++i
)
3147 scalar_stmts
.quick_push (next_info
);
3148 slp_tree conv
= vect_create_new_slp_node (scalar_stmts
, 1);
3149 SLP_TREE_VECTYPE (conv
) = STMT_VINFO_VECTYPE (next_info
);
3150 SLP_TREE_CHILDREN (conv
).quick_push (node
);
3151 SLP_INSTANCE_TREE (new_instance
) = conv
;
3152 /* We also have to fake this conversion stmt as SLP reduction
3153 group so we don't have to mess with too much code
3155 REDUC_GROUP_FIRST_ELEMENT (next_info
) = next_info
;
3156 REDUC_GROUP_NEXT_ELEMENT (next_info
) = NULL
;
3158 /* Fill the backedge child of the PHI SLP node. The
3159 general matching code cannot find it because the
3160 scalar code does not reflect how we vectorize the
3162 use_operand_p use_p
;
3163 imm_use_iterator imm_iter
;
3164 class loop
*loop
= LOOP_VINFO_LOOP (as_a
<loop_vec_info
> (vinfo
));
3165 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
,
3166 gimple_get_lhs (scalar_def
))
3167 /* There are exactly two non-debug uses, the reduction
3168 PHI and the loop-closed PHI node. */
3169 if (!is_gimple_debug (USE_STMT (use_p
))
3170 && gimple_bb (USE_STMT (use_p
)) == loop
->header
)
3172 auto_vec
<stmt_vec_info
, 64> phis (group_size
);
3173 stmt_vec_info phi_info
3174 = vinfo
->lookup_stmt (USE_STMT (use_p
));
3175 for (unsigned i
= 0; i
< group_size
; ++i
)
3176 phis
.quick_push (phi_info
);
3177 slp_tree
*phi_node
= bst_map
->get (phis
);
3178 unsigned dest_idx
= loop_latch_edge (loop
)->dest_idx
;
3179 SLP_TREE_CHILDREN (*phi_node
)[dest_idx
]
3180 = SLP_INSTANCE_TREE (new_instance
);
3181 SLP_INSTANCE_TREE (new_instance
)->refcnt
++;
3185 vinfo
->slp_instances
.safe_push (new_instance
);
3187 /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
3188 the number of scalar stmts in the root in a few places.
3189 Verify that assumption holds. */
3190 gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance
))
3191 .length () == group_size
);
3193 if (dump_enabled_p ())
3195 dump_printf_loc (MSG_NOTE
, vect_location
,
3196 "Final SLP tree for instance %p:\n", new_instance
);
3197 vect_print_slp_graph (MSG_NOTE
, vect_location
,
3198 SLP_INSTANCE_TREE (new_instance
));
3206 /* Failed to SLP. */
3207 /* Free the allocated memory. */
3208 scalar_stmts
.release ();
3211 stmt_vec_info stmt_info
= stmt_info_
;
3212 /* Try to break the group up into pieces. */
3213 if (kind
== slp_inst_kind_store
)
3215 /* ??? We could delay all the actual splitting of store-groups
3216 until after SLP discovery of the original group completed.
3217 Then we can recurse to vect_build_slp_instance directly. */
3218 for (i
= 0; i
< group_size
; i
++)
3222 /* For basic block SLP, try to break the group up into multiples of
3224 if (is_a
<bb_vec_info
> (vinfo
)
3225 && (i
> 1 && i
< group_size
))
3228 = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info
)));
3229 tree vectype
= get_vectype_for_scalar_type (vinfo
, scalar_type
,
3230 1 << floor_log2 (i
));
3231 unsigned HOST_WIDE_INT const_nunits
;
3233 && TYPE_VECTOR_SUBPARTS (vectype
).is_constant (&const_nunits
))
3235 /* Split into two groups at the first vector boundary. */
3236 gcc_assert ((const_nunits
& (const_nunits
- 1)) == 0);
3237 unsigned group1_size
= i
& ~(const_nunits
- 1);
3239 if (dump_enabled_p ())
3240 dump_printf_loc (MSG_NOTE
, vect_location
,
3241 "Splitting SLP group at stmt %u\n", i
);
3242 stmt_vec_info rest
= vect_split_slp_store_group (stmt_info
,
3244 bool res
= vect_analyze_slp_instance (vinfo
, bst_map
, stmt_info
,
3245 kind
, max_tree_size
,
3247 /* Split the rest at the failure point and possibly
3248 re-analyze the remaining matching part if it has
3249 at least two lanes. */
3251 && (i
+ 1 < group_size
3252 || i
- group1_size
> 1))
3254 stmt_vec_info rest2
= rest
;
3255 rest
= vect_split_slp_store_group (rest
, i
- group1_size
);
3256 if (i
- group1_size
> 1)
3257 res
|= vect_analyze_slp_instance (vinfo
, bst_map
, rest2
,
3258 kind
, max_tree_size
,
3261 /* Re-analyze the non-matching tail if it has at least
3263 if (i
+ 1 < group_size
)
3264 res
|= vect_analyze_slp_instance (vinfo
, bst_map
,
3265 rest
, kind
, max_tree_size
,
3271 /* For loop vectorization split into arbitrary pieces of size > 1. */
3272 if (is_a
<loop_vec_info
> (vinfo
)
3273 && (i
> 1 && i
< group_size
)
3274 && !vect_slp_prefer_store_lanes_p (vinfo
, stmt_info
, group_size
, i
))
3276 unsigned group1_size
= i
;
3278 if (dump_enabled_p ())
3279 dump_printf_loc (MSG_NOTE
, vect_location
,
3280 "Splitting SLP group at stmt %u\n", i
);
3282 stmt_vec_info rest
= vect_split_slp_store_group (stmt_info
,
3284 /* Loop vectorization cannot handle gaps in stores, make sure
3285 the split group appears as strided. */
3286 STMT_VINFO_STRIDED_P (rest
) = 1;
3287 DR_GROUP_GAP (rest
) = 0;
3288 STMT_VINFO_STRIDED_P (stmt_info
) = 1;
3289 DR_GROUP_GAP (stmt_info
) = 0;
3291 bool res
= vect_analyze_slp_instance (vinfo
, bst_map
, stmt_info
,
3292 kind
, max_tree_size
, limit
);
3293 if (i
+ 1 < group_size
)
3294 res
|= vect_analyze_slp_instance (vinfo
, bst_map
,
3295 rest
, kind
, max_tree_size
, limit
);
3300 /* Even though the first vector did not all match, we might be able to SLP
3301 (some) of the remainder. FORNOW ignore this possibility. */
3304 /* Failed to SLP. */
3305 if (dump_enabled_p ())
3306 dump_printf_loc (MSG_NOTE
, vect_location
, "SLP discovery failed\n");
3311 /* Analyze an SLP instance starting from a group of grouped stores. Call
3312 vect_build_slp_tree to build a tree of packed stmts if possible.
3313 Return FALSE if it's impossible to SLP any stmt in the loop. */
3316 vect_analyze_slp_instance (vec_info
*vinfo
,
3317 scalar_stmts_to_slp_tree_map_t
*bst_map
,
3318 stmt_vec_info stmt_info
,
3319 slp_instance_kind kind
,
3320 unsigned max_tree_size
, unsigned *limit
)
3323 vec
<stmt_vec_info
> scalar_stmts
;
3325 if (is_a
<bb_vec_info
> (vinfo
))
3326 vect_location
= stmt_info
->stmt
;
3328 stmt_vec_info next_info
= stmt_info
;
3329 if (kind
== slp_inst_kind_store
)
3331 /* Collect the stores and store them in scalar_stmts. */
3332 scalar_stmts
.create (DR_GROUP_SIZE (stmt_info
));
3335 scalar_stmts
.quick_push (vect_stmt_to_vectorize (next_info
));
3336 next_info
= DR_GROUP_NEXT_ELEMENT (next_info
);
3339 else if (kind
== slp_inst_kind_reduc_chain
)
3341 /* Collect the reduction stmts and store them in scalar_stmts. */
3342 scalar_stmts
.create (REDUC_GROUP_SIZE (stmt_info
));
3345 scalar_stmts
.quick_push (vect_stmt_to_vectorize (next_info
));
3346 next_info
= REDUC_GROUP_NEXT_ELEMENT (next_info
);
3348 /* Mark the first element of the reduction chain as reduction to properly
3349 transform the node. In the reduction analysis phase only the last
3350 element of the chain is marked as reduction. */
3351 STMT_VINFO_DEF_TYPE (stmt_info
)
3352 = STMT_VINFO_DEF_TYPE (scalar_stmts
.last ());
3353 STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info
))
3354 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts
.last ()));
3356 else if (kind
== slp_inst_kind_ctor
)
3358 tree rhs
= gimple_assign_rhs1 (stmt_info
->stmt
);
3360 scalar_stmts
.create (CONSTRUCTOR_NELTS (rhs
));
3361 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs
), i
, val
)
3363 stmt_vec_info def_info
= vinfo
->lookup_def (val
);
3364 def_info
= vect_stmt_to_vectorize (def_info
);
3365 scalar_stmts
.quick_push (def_info
);
3367 if (dump_enabled_p ())
3368 dump_printf_loc (MSG_NOTE
, vect_location
,
3369 "Analyzing vectorizable constructor: %G\n",
3372 else if (kind
== slp_inst_kind_reduc_group
)
3374 /* Collect reduction statements. */
3375 const vec
<stmt_vec_info
> &reductions
3376 = as_a
<loop_vec_info
> (vinfo
)->reductions
;
3377 scalar_stmts
.create (reductions
.length ());
3378 for (i
= 0; reductions
.iterate (i
, &next_info
); i
++)
3379 if ((STMT_VINFO_RELEVANT_P (next_info
)
3380 || STMT_VINFO_LIVE_P (next_info
))
3381 /* ??? Make sure we didn't skip a conversion around a reduction
3382 path. In that case we'd have to reverse engineer that conversion
3383 stmt following the chain using reduc_idx and from the PHI
3385 && STMT_VINFO_DEF_TYPE (next_info
) == vect_reduction_def
)
3386 scalar_stmts
.quick_push (next_info
);
3387 /* If less than two were relevant/live there's nothing to SLP. */
3388 if (scalar_stmts
.length () < 2)
3394 vec
<stmt_vec_info
> roots
= vNULL
;
3395 if (kind
== slp_inst_kind_ctor
)
3398 roots
.quick_push (stmt_info
);
3400 /* Build the tree for the SLP instance. */
3401 bool res
= vect_build_slp_instance (vinfo
, kind
, scalar_stmts
,
3403 max_tree_size
, limit
, bst_map
,
3404 kind
== slp_inst_kind_store
3405 ? stmt_info
: NULL
);
3409 /* ??? If this is slp_inst_kind_store and the above succeeded here's
3410 where we should do store group splitting. */
3415 /* Check if there are stmts in the loop that can be vectorized using SLP.
3416 Build SLP trees of packed scalar stmts if SLP is possible. */
3419 vect_analyze_slp (vec_info
*vinfo
, unsigned max_tree_size
)
3422 stmt_vec_info first_element
;
3423 slp_instance instance
;
3425 DUMP_VECT_SCOPE ("vect_analyze_slp");
3427 unsigned limit
= max_tree_size
;
3429 scalar_stmts_to_slp_tree_map_t
*bst_map
3430 = new scalar_stmts_to_slp_tree_map_t ();
3432 /* Find SLP sequences starting from groups of grouped stores. */
3433 FOR_EACH_VEC_ELT (vinfo
->grouped_stores
, i
, first_element
)
3434 vect_analyze_slp_instance (vinfo
, bst_map
, first_element
,
3435 STMT_VINFO_GROUPED_ACCESS (first_element
)
3436 ? slp_inst_kind_store
: slp_inst_kind_ctor
,
3437 max_tree_size
, &limit
);
3439 if (bb_vec_info bb_vinfo
= dyn_cast
<bb_vec_info
> (vinfo
))
3441 for (unsigned i
= 0; i
< bb_vinfo
->roots
.length (); ++i
)
3443 vect_location
= bb_vinfo
->roots
[i
].roots
[0]->stmt
;
3444 if (vect_build_slp_instance (bb_vinfo
, bb_vinfo
->roots
[i
].kind
,
3445 bb_vinfo
->roots
[i
].stmts
,
3446 bb_vinfo
->roots
[i
].roots
,
3447 max_tree_size
, &limit
, bst_map
, NULL
))
3449 bb_vinfo
->roots
[i
].stmts
= vNULL
;
3450 bb_vinfo
->roots
[i
].roots
= vNULL
;
3455 if (loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
))
3457 /* Find SLP sequences starting from reduction chains. */
3458 FOR_EACH_VEC_ELT (loop_vinfo
->reduction_chains
, i
, first_element
)
3459 if (! STMT_VINFO_RELEVANT_P (first_element
)
3460 && ! STMT_VINFO_LIVE_P (first_element
))
3462 else if (! vect_analyze_slp_instance (vinfo
, bst_map
, first_element
,
3463 slp_inst_kind_reduc_chain
,
3464 max_tree_size
, &limit
))
3466 /* Dissolve reduction chain group. */
3467 stmt_vec_info vinfo
= first_element
;
3468 stmt_vec_info last
= NULL
;
3471 stmt_vec_info next
= REDUC_GROUP_NEXT_ELEMENT (vinfo
);
3472 REDUC_GROUP_FIRST_ELEMENT (vinfo
) = NULL
;
3473 REDUC_GROUP_NEXT_ELEMENT (vinfo
) = NULL
;
3477 STMT_VINFO_DEF_TYPE (first_element
) = vect_internal_def
;
3478 /* It can be still vectorized as part of an SLP reduction. */
3479 loop_vinfo
->reductions
.safe_push (last
);
3482 /* Find SLP sequences starting from groups of reductions. */
3483 if (loop_vinfo
->reductions
.length () > 1)
3484 vect_analyze_slp_instance (vinfo
, bst_map
, loop_vinfo
->reductions
[0],
3485 slp_inst_kind_reduc_group
, max_tree_size
,
3489 hash_set
<slp_tree
> visited_patterns
;
3490 slp_tree_to_load_perm_map_t perm_cache
;
3491 slp_compat_nodes_map_t compat_cache
;
3493 /* See if any patterns can be found in the SLP tree. */
3494 bool pattern_found
= false;
3495 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo
), i
, instance
)
3496 pattern_found
|= vect_match_slp_patterns (instance
, vinfo
,
3497 &visited_patterns
, &perm_cache
,
3500 /* If any were found optimize permutations of loads. */
3503 hash_map
<slp_tree
, slp_tree
> load_map
;
3504 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo
), i
, instance
)
3506 slp_tree root
= SLP_INSTANCE_TREE (instance
);
3507 optimize_load_redistribution (bst_map
, vinfo
, SLP_TREE_LANES (root
),
3514 /* The map keeps a reference on SLP nodes built, release that. */
3515 for (scalar_stmts_to_slp_tree_map_t::iterator it
= bst_map
->begin ();
3516 it
!= bst_map
->end (); ++it
)
3518 vect_free_slp_tree ((*it
).second
);
3521 if (pattern_found
&& dump_enabled_p ())
3523 dump_printf_loc (MSG_NOTE
, vect_location
,
3524 "Pattern matched SLP tree\n");
3525 hash_set
<slp_tree
> visited
;
3526 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo
), i
, instance
)
3527 vect_print_slp_graph (MSG_NOTE
, vect_location
,
3528 SLP_INSTANCE_TREE (instance
), visited
);
3531 return opt_result::success ();
3536 slpg_vertex (slp_tree node_
)
3537 : node (node_
), perm_in (-1), perm_out (-1) {}
3539 int get_perm_materialized () const
3540 { return perm_in
!= perm_out
? perm_in
: 0; }
3543 /* The common permutation on the incoming lanes (towards SLP children). */
3545 /* The permutation on the outgoing lanes (towards SLP parents). When
3546 the node is a materialization point for a permute this differs
3547 from perm_in (and is then usually zero). Materialization happens
3548 on the input side. */
3552 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
/* Recursive worker: depth-first walk from NODE, assigning each node a
   vertex id (its index into VERTICES), pushing a slpg_vertex for it and
   recording leaf vertices (including forced leafs, see below) in LEAFS.
   VISITED guards against re-registering shared subtrees.
   NOTE(review): several interior lines (locals, the `leaf' computation,
   braces) were dropped by the extraction of this chunk.  */
3555 vect_slp_build_vertices (hash_set
<slp_tree
> &visited
, slp_tree node
,
3556 vec
<slpg_vertex
> &vertices
, vec
<int> &leafs
)
/* A node reachable from several parents gets exactly one vertex.  */
3561 if (visited
.add (node
))
/* The vertex id is the node's position in VERTICES.  */
3564 node
->vertex
= vertices
.length ();
3565 vertices
.safe_push (slpg_vertex (node
));
3568 bool force_leaf
= false;
/* Recurse into all SLP children first.  */
3569 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
3573 vect_slp_build_vertices (visited
, child
, vertices
, leafs
);
3577 /* Since SLP discovery works along use-def edges all cycles have an
3578 entry - but there's the exception of cycles where we do not handle
3579 the entry explicitly (but with a NULL SLP node), like some reductions
3580 and inductions. Force those SLP PHIs to act as leafs to make them
3581 backwards reachable. */
3582 if (leaf
|| force_leaf
)
3583 leafs
.safe_push (node
->vertex
);
3586 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
/* Entry point: walk every SLP instance of INFO and delegate to the
   recursive worker above.  A single VISITED set is shared across all
   instances, so a node reachable from multiple instances is registered
   only once.  */
3589 vect_slp_build_vertices (vec_info
*info
, vec
<slpg_vertex
> &vertices
,
3592 hash_set
<slp_tree
> visited
;
3594 slp_instance instance
;
3595 FOR_EACH_VEC_ELT (info
->slp_instances
, i
, instance
)
3596 vect_slp_build_vertices (visited
, SLP_INSTANCE_TREE (instance
), vertices
,
3600 /* Apply (reverse) bijective PERM to VEC. */
/* Copies VEC into SAVED, then rewrites VEC either by scattering
   (vec[perm[i]] = saved[i]) or by gathering (vec[i] = saved[perm[i]]);
   each direction is followed by an assert re-checking the writes.
   NOTE(review): the `if (reverse)'/`else' lines selecting between the
   two directions were dropped by the extraction - confirm against the
   full file which branch REVERSE selects.  */
3604 vect_slp_permute (vec
<unsigned> perm
,
3605 vec
<T
> &vec
, bool reverse
)
/* Snapshot the current contents; 64 inline elements avoid heap
   allocation for typical lane counts.  */
3607 auto_vec
<T
, 64> saved
;
3608 saved
.create (vec
.length ());
3609 for (unsigned i
= 0; i
< vec
.length (); ++i
)
3610 saved
.quick_push (vec
[i
]);
/* Scatter: destination index is permuted.  */
3614 for (unsigned i
= 0; i
< vec
.length (); ++i
)
3615 vec
[perm
[i
]] = saved
[i
];
/* Verify every slot was written as intended (PERM must be bijective).  */
3616 for (unsigned i
= 0; i
< vec
.length (); ++i
)
3617 gcc_assert (vec
[perm
[i
]] == saved
[i
]);
/* Gather: source index is permuted.  */
3621 for (unsigned i
= 0; i
< vec
.length (); ++i
)
3622 vec
[i
] = saved
[perm
[i
]];
3623 for (unsigned i
= 0; i
< vec
.length (); ++i
)
3624 gcc_assert (vec
[i
] == saved
[perm
[i
]]);
3628 /* Return whether permutations PERM_A and PERM_B as recorded in the
3629 PERMS vector are equal. */
/* PERM_A/PERM_B are indices into PERMS; -1 denotes "no/any permute" and
   compares equal only to itself.  Equal indices are trivially equal;
   otherwise the two recorded permutations must have the same length and
   identical contents (raw memcmp over the unsigned lane indices).  */
3632 vect_slp_perms_eq (const vec
<vec
<unsigned> > &perms
,
3633 int perm_a
, int perm_b
)
3635 return (perm_a
== perm_b
3636 || (perm_a
!= -1 && perm_b
!= -1
3637 && perms
[perm_a
].length () == perms
[perm_b
].length ()
3638 && memcmp (&perms
[perm_a
][0], &perms
[perm_b
][0],
3639 sizeof (unsigned) * perms
[perm_a
].length ()) == 0));
3642 /* Optimize the SLP graph of VINFO: build a vertex graph over all SLP
nodes, propagate load permutations backwards through it, pick
materialization points for them, and finally elide load permutations
that turn out to be unnecessary. */
3645 vect_optimize_slp (vec_info
*vinfo
)
3647 if (vinfo
->slp_instances
.is_empty ())
3652 auto_vec
<slpg_vertex
> vertices
;
3653 auto_vec
<int> leafs
;
3654 vect_slp_build_vertices (vinfo
, vertices
, leafs
);
3656 struct graph
*slpg
= new_graph (vertices
.length ());
3657 for (slpg_vertex
&v
: vertices
)
3658 for (slp_tree child
: SLP_TREE_CHILDREN (v
.node
))
3660 add_edge (slpg
, v
.node
->vertex
, child
->vertex
);
3662 /* Compute (reverse) postorder on the inverted graph. */
3664 graphds_dfs (slpg
, &leafs
[0], leafs
.length (), &ipo
, false, NULL
, NULL
);
3666 auto_vec
<vec
<unsigned> > perms
;
3667 perms
.safe_push (vNULL
); /* zero is no permute */
3669 /* Produce initial permutations. */
3670 for (i
= 0; i
< leafs
.length (); ++i
)
3673 slp_tree node
= vertices
[idx
].node
;
3675 /* Handle externals and constants optimistically throughout the
3676 iteration. But treat existing vectors as fixed since we
3677 do not handle permuting them below. */
3678 if ((SLP_TREE_DEF_TYPE (node
) == vect_external_def
3679 && !SLP_TREE_VEC_DEFS (node
).exists ())
3680 || SLP_TREE_DEF_TYPE (node
) == vect_constant_def
)
3683 /* Leafs do not change across iterations. Note leafs also double
3684 as entries to the reverse graph. */
3685 if (!slpg
->vertices
[idx
].succ
)
3687 vertices
[idx
].perm_in
= 0;
3688 vertices
[idx
].perm_out
= 0;
3691 /* Loads are the only thing generating permutes. */
3692 if (!SLP_TREE_LOAD_PERMUTATION (node
).exists ())
3695 /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the
3696 node unpermuted, record this permute. */
3697 stmt_vec_info dr_stmt
= SLP_TREE_REPRESENTATIVE (node
);
3698 if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt
))
3700 dr_stmt
= DR_GROUP_FIRST_ELEMENT (dr_stmt
);
3701 unsigned imin
= DR_GROUP_SIZE (dr_stmt
) + 1, imax
= 0;
3702 bool any_permute
= false;
3703 for (unsigned j
= 0; j
< SLP_TREE_LANES (node
); ++j
)
3705 unsigned idx
= SLP_TREE_LOAD_PERMUTATION (node
)[j
];
3706 imin
= MIN (imin
, idx
);
3707 imax
= MAX (imax
, idx
);
3708 if (idx
- SLP_TREE_LOAD_PERMUTATION (node
)[0] != j
)
3711 /* If there's no permute no need to split one out. */
3714 /* If the span doesn't match we'd disrupt VF computation, avoid
3716 if (imax
- imin
+ 1 != SLP_TREE_LANES (node
))
3719 /* For now only handle true permutes, like
3720 vect_attempt_slp_rearrange_stmts did. This allows us to be lazy
3721 when permuting constants and invariants keeping the permute
3723 auto_sbitmap
load_index (SLP_TREE_LANES (node
));
3724 bitmap_clear (load_index
);
3725 for (unsigned j
= 0; j
< SLP_TREE_LANES (node
); ++j
)
3726 bitmap_set_bit (load_index
, SLP_TREE_LOAD_PERMUTATION (node
)[j
] - imin
);
3728 for (j
= 0; j
< SLP_TREE_LANES (node
); ++j
)
3729 if (!bitmap_bit_p (load_index
, j
))
3731 if (j
!= SLP_TREE_LANES (node
))
3734 vec
<unsigned> perm
= vNULL
;
3735 perm
.safe_grow (SLP_TREE_LANES (node
), true);
3736 for (unsigned j
= 0; j
< SLP_TREE_LANES (node
); ++j
)
3737 perm
[j
] = SLP_TREE_LOAD_PERMUTATION (node
)[j
] - imin
;
3738 perms
.safe_push (perm
);
3739 vertices
[idx
].perm_in
= perms
.length () - 1;
3740 vertices
[idx
].perm_out
= perms
.length () - 1;
3743 /* In addition to the above we have to mark outgoing permutes facing
3744 non-reduction graph entries that are not represented as to be
3746 for (slp_instance instance
: vinfo
->slp_instances
)
3747 if (SLP_INSTANCE_KIND (instance
) == slp_inst_kind_ctor
)
3749 /* Just setting perm_out isn't enough for the propagation to
3751 vertices
[SLP_INSTANCE_TREE (instance
)->vertex
].perm_in
= 0;
3752 vertices
[SLP_INSTANCE_TREE (instance
)->vertex
].perm_out
= 0;
3755 /* Propagate permutes along the graph and compute materialization points. */
3757 bool do_materialization
= false;
3758 unsigned iteration
= 0;
3764 if (dump_enabled_p ())
3765 dump_printf_loc (MSG_NOTE
, vect_location
,
3766 "SLP optimize iteration %d\n", iteration
);
3768 for (i
= vertices
.length (); i
> 0 ; --i
)
3771 slp_tree node
= vertices
[idx
].node
;
3773 /* Handle externals and constants optimistically throughout the
3775 if (SLP_TREE_DEF_TYPE (node
) == vect_external_def
3776 || SLP_TREE_DEF_TYPE (node
) == vect_constant_def
)
3779 /* We still eventually have failed backedge SLP nodes in the
3780 graph, those are only cancelled when analyzing operations.
3781 Simply treat them as transparent ops, propagating permutes
3783 if (SLP_TREE_DEF_TYPE (node
) == vect_internal_def
)
3785 /* We do not handle stores with a permutation, so all
3786 incoming permutes must have been materialized. */
3787 stmt_vec_info rep
= SLP_TREE_REPRESENTATIVE (node
);
3788 if (STMT_VINFO_DATA_REF (rep
)
3789 && DR_IS_WRITE (STMT_VINFO_DATA_REF (rep
)))
3791 /* ??? We're forcing materialization in place
3792 of the child here, we'd need special handling
3793 in materialization to leave perm_in -1 here. */
3794 vertices
[idx
].perm_in
= 0;
3795 vertices
[idx
].perm_out
= 0;
3797 /* We cannot move a permute across an operation that is
3798 not independent on lanes. Note this is an explicit
3799 negative list since that's much shorter than the respective
3800 positive one but it's critical to keep maintaining it. */
3801 if (is_gimple_call (STMT_VINFO_STMT (rep
)))
3802 switch (gimple_call_combined_fn (STMT_VINFO_STMT (rep
)))
3804 case CFN_COMPLEX_ADD_ROT90
:
3805 case CFN_COMPLEX_ADD_ROT270
:
3806 case CFN_COMPLEX_MUL
:
3807 case CFN_COMPLEX_MUL_CONJ
:
3808 case CFN_VEC_ADDSUB
:
3809 case CFN_VEC_FMADDSUB
:
3810 case CFN_VEC_FMSUBADD
:
3811 vertices
[idx
].perm_in
= 0;
3812 vertices
[idx
].perm_out
= 0;
3817 if (!slpg
->vertices
[idx
].succ
)
3818 /* Pick up pre-computed leaf values. */
3822 bool any_succ_perm_out_m1
= false;
3823 int perm_in
= vertices
[idx
].perm_in
;
3824 for (graph_edge
*succ
= slpg
->vertices
[idx
].succ
;
3825 succ
; succ
= succ
->succ_next
)
3827 int succ_idx
= succ
->dest
;
3828 int succ_perm
= vertices
[succ_idx
].perm_out
;
3829 /* Handle unvisited (and constant) nodes optimistically. */
3830 /* ??? But for constants once we want to handle
3831 non-bijective permutes we have to verify the permute,
3832 when unifying lanes, will not unify different constants.
3833 For example see gcc.dg/vect/bb-slp-14.c for a case
3834 that would break. */
3835 if (succ_perm
== -1)
3837 /* When we handled a non-leaf optimistically, note
3838 that so we can adjust its outgoing permute below. */
3839 slp_tree succ_node
= vertices
[succ_idx
].node
;
3840 if (SLP_TREE_DEF_TYPE (succ_node
) != vect_external_def
3841 && SLP_TREE_DEF_TYPE (succ_node
) != vect_constant_def
)
3842 any_succ_perm_out_m1
= true;
3846 perm_in
= succ_perm
;
3847 else if (succ_perm
== 0
3848 || !vect_slp_perms_eq (perms
, perm_in
, succ_perm
))
3855 /* Adjust any incoming permutes we treated optimistically. */
3856 if (perm_in
!= -1 && any_succ_perm_out_m1
)
3858 for (graph_edge
*succ
= slpg
->vertices
[idx
].succ
;
3859 succ
; succ
= succ
->succ_next
)
3861 slp_tree succ_node
= vertices
[succ
->dest
].node
;
3862 if (vertices
[succ
->dest
].perm_out
== -1
3863 && SLP_TREE_DEF_TYPE (succ_node
) != vect_external_def
3864 && SLP_TREE_DEF_TYPE (succ_node
) != vect_constant_def
)
3866 vertices
[succ
->dest
].perm_out
= perm_in
;
3867 /* And ensure this propagates. */
3868 if (vertices
[succ
->dest
].perm_in
== -1)
3869 vertices
[succ
->dest
].perm_in
= perm_in
;
3875 if (!vect_slp_perms_eq (perms
, perm_in
,
3876 vertices
[idx
].perm_in
))
3878 /* Make sure we eventually converge. */
3879 gcc_checking_assert (vertices
[idx
].perm_in
== -1
3881 vertices
[idx
].perm_in
= perm_in
;
3883 /* While we can handle VEC_PERM nodes as transparent
3884 pass-through they can be a cheap materialization
3885 point as well. In addition they can act as source
3886 of a random permutation as well.
3887 The following ensures that former materialization
3888 points that now have zero incoming permutes no
3889 longer appear as such and that former "any" permutes
3890 get pass-through. We keep VEC_PERM nodes optimistic
3891 as "any" outgoing permute though. */
3892 if (vertices
[idx
].perm_out
!= 0
3893 && SLP_TREE_CODE (node
) != VEC_PERM_EXPR
)
3894 vertices
[idx
].perm_out
= perm_in
;
3899 /* Elide pruning at materialization points in the first
3901 if (!do_materialization
)
3904 int perm
= vertices
[idx
].perm_out
;
3905 if (perm
== 0 || perm
== -1)
3908 /* Decide on permute materialization. Look whether there's
3909 a use (pred) edge that is permuted differently than us.
3910 In that case mark ourselves so the permutation is applied. */
3911 bool all_preds_permuted
= slpg
->vertices
[idx
].pred
!= NULL
;
3912 if (all_preds_permuted
)
3913 for (graph_edge
*pred
= slpg
->vertices
[idx
].pred
;
3914 pred
; pred
= pred
->pred_next
)
3916 int pred_perm
= vertices
[pred
->src
].perm_in
;
3917 gcc_checking_assert (pred_perm
!= -1);
3918 if (!vect_slp_perms_eq (perms
, perm
, pred_perm
))
3920 all_preds_permuted
= false;
3924 if (!all_preds_permuted
)
3926 vertices
[idx
].perm_out
= 0;
3931 /* If the initial propagation converged, switch on materialization
3932 and re-propagate. */
3933 if (!changed
&& !do_materialization
)
3935 do_materialization
= true;
3940 statistics_histogram_event (cfun
, "SLP optimize perm iterations", iteration
);
3943 for (i
= 0; i
< vertices
.length (); ++i
)
3945 int perm_in
= vertices
[i
].perm_in
;
3946 slp_tree node
= vertices
[i
].node
;
3948 /* First permute invariant/external original successors, we handle
3949 those optimistically during propagation and duplicate them if
3950 they are used with different permutations. */
3954 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), j
, child
)
3957 || (SLP_TREE_DEF_TYPE (child
) != vect_constant_def
3958 && SLP_TREE_DEF_TYPE (child
) != vect_external_def
))
3961 /* If the vector is uniform there's nothing to do. */
3962 if (vect_slp_tree_uniform_p (child
))
3965 /* We can end up sharing some externals via two_operator
3966 handling. Be prepared to unshare those. */
3967 if (child
->refcnt
!= 1)
3969 gcc_assert (slpg
->vertices
[child
->vertex
].pred
->pred_next
);
3970 SLP_TREE_CHILDREN (node
)[j
] = child
3971 = vect_create_new_slp_node
3972 (SLP_TREE_SCALAR_OPS (child
).copy ());
3974 vect_slp_permute (perms
[perm_in
],
3975 SLP_TREE_SCALAR_OPS (child
), true);
3978 if (SLP_TREE_CODE (node
) == VEC_PERM_EXPR
)
3980 /* Apply the common permutes to the input vectors. */
3983 /* If the node is already a permute node we can apply
3984 the permutation to the lane selection, effectively
3985 materializing it on the incoming vectors. */
3986 if (dump_enabled_p ())
3987 dump_printf_loc (MSG_NOTE
, vect_location
,
3988 "simplifying permute node %p\n",
3990 for (unsigned k
= 0;
3991 k
< SLP_TREE_LANE_PERMUTATION (node
).length (); ++k
)
3992 SLP_TREE_LANE_PERMUTATION (node
)[k
].second
3993 = perms
[perm_in
][SLP_TREE_LANE_PERMUTATION (node
)[k
].second
];
3995 /* Apply the anticipated output permute to the permute and
3997 int perm_out
= vertices
[i
].perm_out
;
4000 vect_slp_permute (perms
[perm_out
],
4001 SLP_TREE_SCALAR_STMTS (node
), true);
4002 vect_slp_permute (perms
[perm_out
],
4003 SLP_TREE_LANE_PERMUTATION (node
), true);
4006 else if (vertices
[i
].get_perm_materialized () != 0)
4008 if (SLP_TREE_LOAD_PERMUTATION (node
).exists ())
4009 /* For loads simply drop the permutation, the load permutation
4010 already performs the desired permutation. */
4012 else if (SLP_TREE_LANE_PERMUTATION (node
).exists ())
4016 if (dump_enabled_p ())
4017 dump_printf_loc (MSG_NOTE
, vect_location
,
4018 "inserting permute node in place of %p\n",
4021 /* Make a copy of NODE and in-place change it to a
4022 VEC_PERM node to permute the lanes of the copy. */
4023 slp_tree copy
= new _slp_tree
;
4024 SLP_TREE_CHILDREN (copy
) = SLP_TREE_CHILDREN (node
);
4025 SLP_TREE_CHILDREN (node
) = vNULL
;
4026 SLP_TREE_SCALAR_STMTS (copy
)
4027 = SLP_TREE_SCALAR_STMTS (node
).copy ();
4028 vect_slp_permute (perms
[perm_in
],
4029 SLP_TREE_SCALAR_STMTS (copy
), true);
4030 gcc_assert (!SLP_TREE_SCALAR_OPS (node
).exists ());
4031 SLP_TREE_REPRESENTATIVE (copy
) = SLP_TREE_REPRESENTATIVE (node
);
4032 gcc_assert (!SLP_TREE_LOAD_PERMUTATION (node
).exists ());
4033 SLP_TREE_LANE_PERMUTATION (copy
)
4034 = SLP_TREE_LANE_PERMUTATION (node
);
4035 SLP_TREE_LANE_PERMUTATION (node
) = vNULL
;
4036 SLP_TREE_VECTYPE (copy
) = SLP_TREE_VECTYPE (node
);
4038 copy
->max_nunits
= node
->max_nunits
;
4039 SLP_TREE_DEF_TYPE (copy
) = SLP_TREE_DEF_TYPE (node
);
4040 SLP_TREE_LANES (copy
) = SLP_TREE_LANES (node
);
4041 SLP_TREE_CODE (copy
) = SLP_TREE_CODE (node
);
4043 /* Now turn NODE into a VEC_PERM. */
4044 SLP_TREE_CHILDREN (node
).safe_push (copy
);
4045 SLP_TREE_LANE_PERMUTATION (node
).create (SLP_TREE_LANES (node
));
4046 for (unsigned j
= 0; j
< SLP_TREE_LANES (node
); ++j
)
4047 SLP_TREE_LANE_PERMUTATION (node
)
4048 .quick_push (std::make_pair (0, perms
[perm_in
][j
]));
4049 SLP_TREE_CODE (node
) = VEC_PERM_EXPR
;
4052 else if (perm_in
> 0) /* perm_in == perm_out */
4054 /* Apply the reverse permutation to our stmts. */
4055 vect_slp_permute (perms
[perm_in
],
4056 SLP_TREE_SCALAR_STMTS (node
), true);
4057 /* And to the lane/load permutation, which we can simply
4058 make regular by design. */
4059 if (SLP_TREE_LOAD_PERMUTATION (node
).exists ())
4061 gcc_assert (!SLP_TREE_LANE_PERMUTATION (node
).exists ());
4062 /* ??? When we handle non-bijective permutes the idea
4063 is that we can force the load-permutation to be
4064 { min, min + 1, min + 2, ... max }. But then the
4065 scalar defs might no longer match the lane content
4066 which means wrong-code with live lane vectorization.
4067 So we possibly have to have NULL entries for those. */
4068 vect_slp_permute (perms
[perm_in
],
4069 SLP_TREE_LOAD_PERMUTATION (node
), true);
4071 else if (SLP_TREE_LANE_PERMUTATION (node
).exists ())
4076 /* Elide any permutations at BB reduction roots. */
4077 if (is_a
<bb_vec_info
> (vinfo
))
4079 for (slp_instance instance
: vinfo
->slp_instances
)
4081 if (SLP_INSTANCE_KIND (instance
) != slp_inst_kind_bb_reduc
)
4083 slp_tree old
= SLP_INSTANCE_TREE (instance
);
4084 if (SLP_TREE_CODE (old
) == VEC_PERM_EXPR
4085 && SLP_TREE_CHILDREN (old
).length () == 1)
4087 slp_tree child
= SLP_TREE_CHILDREN (old
)[0];
4088 if (SLP_TREE_DEF_TYPE (child
) == vect_external_def
)
4090 /* Preserve the special VEC_PERM we use to shield existing
4091 vector defs from the rest. But make it a no-op. */
4093 for (std::pair
<unsigned, unsigned> &p
4094 : SLP_TREE_LANE_PERMUTATION (old
))
4099 SLP_INSTANCE_TREE (instance
) = child
;
4100 SLP_TREE_REF_COUNT (child
)++;
4101 vect_free_slp_tree (old
);
4104 else if (SLP_TREE_LOAD_PERMUTATION (old
).exists ()
4105 && SLP_TREE_REF_COUNT (old
) == 1
4106 && vertices
[old
->vertex
].get_perm_materialized () != 0)
4108 /* ??? For loads the situation is more complex since
4109 we can't modify the permute in place in case the
4110 node is used multiple times. In fact for loads this
4111 should be somehow handled in the propagation engine. */
4112 /* Apply the reverse permutation to our stmts. */
4113 int perm
= vertices
[old
->vertex
].get_perm_materialized ();
4114 vect_slp_permute (perms
[perm
],
4115 SLP_TREE_SCALAR_STMTS (old
), true);
4116 vect_slp_permute (perms
[perm
],
4117 SLP_TREE_LOAD_PERMUTATION (old
), true);
4122 /* Free the perms vector used for propagation. */
4123 while (!perms
.is_empty ())
4124 perms
.pop ().release ();
4128 /* Now elide load permutations that are not necessary. */
4129 for (i
= 0; i
< leafs
.length (); ++i
)
4131 node
= vertices
[leafs
[i
]].node
;
4132 if (!SLP_TREE_LOAD_PERMUTATION (node
).exists ())
4135 /* In basic block vectorization we allow any subchain of an interleaving
4137 FORNOW: not in loop SLP because of realignment complications. */
4138 if (is_a
<bb_vec_info
> (vinfo
))
4140 bool subchain_p
= true;
4141 stmt_vec_info next_load_info
= NULL
;
4142 stmt_vec_info load_info
;
4144 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), j
, load_info
)
4147 && (next_load_info
!= load_info
4148 || DR_GROUP_GAP (load_info
) != 1))
4153 next_load_info
= DR_GROUP_NEXT_ELEMENT (load_info
);
4157 SLP_TREE_LOAD_PERMUTATION (node
).release ();
4163 stmt_vec_info load_info
;
4164 bool this_load_permuted
= false;
4166 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), j
, load_info
)
4167 if (SLP_TREE_LOAD_PERMUTATION (node
)[j
] != j
)
4169 this_load_permuted
= true;
4172 stmt_vec_info first_stmt_info
4173 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node
)[0]);
4174 if (!this_load_permuted
4175 /* The load requires permutation when unrolling exposes
4176 a gap either because the group is larger than the SLP
4177 group-size or because there is a gap between the groups. */
4178 && (known_eq (LOOP_VINFO_VECT_FACTOR
4179 (as_a
<loop_vec_info
> (vinfo
)), 1U)
4180 || ((SLP_TREE_LANES (node
) == DR_GROUP_SIZE (first_stmt_info
))
4181 && DR_GROUP_GAP (first_stmt_info
) == 0)))
4183 SLP_TREE_LOAD_PERMUTATION (node
).release ();
4190 /* Gather loads reachable from the individual SLP graph entries. */
/* For each SLP instance of VINFO collect the load nodes reachable from
   its tree into SLP_INSTANCE_LOADS.  A fresh VISITED set per instance
   means a load shared between instances is recorded in each instance's
   load list.  */
4193 vect_gather_slp_loads (vec_info
*vinfo
)
4196 slp_instance instance
;
4197 FOR_EACH_VEC_ELT (vinfo
->slp_instances
, i
, instance
)
4199 hash_set
<slp_tree
> visited
;
4200 vect_gather_slp_loads (SLP_INSTANCE_LOADS (instance
),
4201 SLP_INSTANCE_TREE (instance
), visited
);
4206 /* For each possible SLP instance decide whether to SLP it and calculate overall
4207 unrolling factor needed to SLP the loop. Return TRUE if decided to SLP at
4208 least one instance. */
/* NOTE(review): interior lines (the loop braces and the
   `decided_to_slp' increment, per the jump in the embedded numbering)
   were dropped by the extraction of this chunk.  */
4211 vect_make_slp_decision (loop_vec_info loop_vinfo
)
/* Combined unrolling factor across all instances, starts neutral.  */
4214 poly_uint64 unrolling_factor
= 1;
4215 const vec
<slp_instance
> &slp_instances
4216 = LOOP_VINFO_SLP_INSTANCES (loop_vinfo
);
4217 slp_instance instance
;
4218 int decided_to_slp
= 0;
4220 DUMP_VECT_SCOPE ("vect_make_slp_decision");
4222 FOR_EACH_VEC_ELT (slp_instances
, i
, instance
)
4224 /* FORNOW: SLP if you can. */
4225 /* All unroll factors have the form:
4227 GET_MODE_SIZE (vinfo->vector_mode) * X
4229 for some rational X, so they must have a common multiple. */
4231 = force_common_multiple (unrolling_factor
,
4232 SLP_INSTANCE_UNROLLING_FACTOR (instance
));
4234 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
4235 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
4236 loop-based vectorization. Such stmts will be marked as HYBRID. */
4237 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance
));
/* Record the combined factor on the loop for later VF computation.  */
4241 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo
) = unrolling_factor
;
4243 if (decided_to_slp
&& dump_enabled_p ())
4245 dump_printf_loc (MSG_NOTE
, vect_location
,
4246 "Decided to SLP %d instances. Unrolling factor ",
4248 dump_dec (MSG_NOTE
, unrolling_factor
);
4249 dump_printf (MSG_NOTE
, "\n");
4252 return (decided_to_slp
> 0);
4255 /* Private data for vect_detect_hybrid_slp. */
/* NOTE(review): the `struct vdhs_data' header and closing brace were
   lost in extraction; only the two members remain visible.  */
/* Loop being analyzed; used below to look up stmt_vec_infos for SSA
   defs via lookup_def.  */
4258 loop_vec_info loop_vinfo
;
/* Worklist the walker pushes newly-hybrid stmts onto for further
   processing.  */
4259 vec
<stmt_vec_info
> *worklist
;
4262 /* Walker for walk_gimple_op. */
/* Operand callback: *TP is an operand of a stmt known to be loop-vect.
   If it is defined by a pure-SLP stmt, demote that def to hybrid and
   queue it so its own operands get walked too.  DATA is the
   walk_stmt_info whose `info' carries our vdhs_data.  */
4265 vect_detect_hybrid_slp (tree
*tp
, int *, void *data
)
4267 walk_stmt_info
*wi
= (walk_stmt_info
*)data
;
4268 vdhs_data
*dat
= (vdhs_data
*)wi
->info
;
/* Find the defining stmt of the operand inside the loop, if any.  */
4273 stmt_vec_info def_stmt_info
= dat
->loop_vinfo
->lookup_def (*tp
)
;
/* Use the pattern replacement if the def stmt was pattern-matched.  */
4276 def_stmt_info
= vect_stmt_to_vectorize (def_stmt_info
);
4277 if (PURE_SLP_STMT (def_stmt_info
))
4279 if (dump_enabled_p ())
4280 dump_printf_loc (MSG_NOTE
, vect_location
, "marking hybrid: %G",
4281 def_stmt_info
->stmt
);
/* The def feeds a non-SLP use, so it must be vectorized both ways.  */
4282 STMT_SLP_TYPE (def_stmt_info
) = hybrid
;
4283 dat
->worklist
->safe_push (def_stmt_info
);
4289 /* Look if STMT_INFO is consumed by SLP indirectly and mark it pure_slp
4290 if so, otherwise pushing it to WORKLIST. */
/* A stmt not yet marked SLP is only truly loop-vect if some use of one
   of its defs escapes SLP: an out-of-loop use, a use in a non-SLP stmt,
   or no def at all (a sink).  In any of those cases it is pushed onto
   WORKLIST for the backward hybrid walk; otherwise every consumer is
   SLP and the stmt is marked pure_slp.
   NOTE(review): several lines (iter1 declaration, continue/return
   statements, braces) were dropped by the extraction of this chunk.  */
4293 maybe_push_to_hybrid_worklist (vec_info
*vinfo
,
4294 vec
<stmt_vec_info
> &worklist
,
4295 stmt_vec_info stmt_info
)
4297 if (dump_enabled_p ())
4298 dump_printf_loc (MSG_NOTE
, vect_location
,
4299 "Processing hybrid candidate : %G", stmt_info
->stmt
);
/* Walk defs/uses on the original stmt - SSA operands are not set up on
   pattern stmts.  */
4300 stmt_vec_info orig_info
= vect_orig_stmt (stmt_info
);
4301 imm_use_iterator iter2
;
4303 use_operand_p use_p
;
4304 def_operand_p def_p
;
4305 bool any_def
= false;
4306 FOR_EACH_PHI_OR_STMT_DEF (def_p
, orig_info
->stmt
, iter1
, SSA_OP_DEF
)
4309 FOR_EACH_IMM_USE_FAST (use_p
, iter2
, DEF_FROM_PTR (def_p
))
/* Debug uses do not influence vectorization decisions.  */
4311 if (is_gimple_debug (USE_STMT (use_p
)))
4313 stmt_vec_info use_info
= vinfo
->lookup_stmt (USE_STMT (use_p
));
4314 /* An out-of-loop use means this is a loop_vect sink. */
4317 if (dump_enabled_p ())
4318 dump_printf_loc (MSG_NOTE
, vect_location
,
4319 "Found loop_vect sink: %G", stmt_info
->stmt
);
4320 worklist
.safe_push (stmt_info
);
/* A consumer that is not SLP keeps this stmt loop-vect as well.  */
4323 else if (!STMT_SLP_TYPE (vect_stmt_to_vectorize (use_info
)))
4325 if (dump_enabled_p ())
4326 dump_printf_loc (MSG_NOTE
, vect_location
,
4327 "Found loop_vect use: %G", use_info
->stmt
);
4328 worklist
.safe_push (stmt_info
);
4333 /* No def means this is a loop_vect sink. */
4336 if (dump_enabled_p ())
4337 dump_printf_loc (MSG_NOTE
, vect_location
,
4338 "Found loop_vect sink: %G", stmt_info
->stmt
);
4339 worklist
.safe_push (stmt_info
);
/* All consumers are SLP: rectify the mark missed by pattern stmts.  */
4342 if (dump_enabled_p ())
4343 dump_printf_loc (MSG_NOTE
, vect_location
,
4344 "Marked SLP consumed stmt pure: %G", stmt_info
->stmt
);
4345 STMT_SLP_TYPE (stmt_info
) = pure_slp
;
4348 /* Find stmts that must be both vectorized and SLPed ("hybrid" stmts):
collect the remaining loop_vect stmts into a worklist and mark every
pure-SLP def they consume as hybrid. */
4351 vect_detect_hybrid_slp (loop_vec_info loop_vinfo
)
4353 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
4355 /* All stmts participating in SLP are marked pure_slp, all other
4356 stmts are loop_vect.
4357 First collect all loop_vect stmts into a worklist.
4358 SLP patterns cause not all original scalar stmts to appear in
4359 SLP_TREE_SCALAR_STMTS and thus not all of them are marked pure_slp.
4360 Rectify this here and do a backward walk over the IL only considering
4361 stmts as loop_vect when they are used by a loop_vect stmt and otherwise
4362 mark them as pure_slp. */
4363 auto_vec
<stmt_vec_info
> worklist
;
4364 for (int i
= LOOP_VINFO_LOOP (loop_vinfo
)->num_nodes
- 1; i
>= 0; --i
)
4366 basic_block bb
= LOOP_VINFO_BBS (loop_vinfo
)[i
];
4367 for (gphi_iterator gsi
= gsi_start_phis (bb
); !gsi_end_p (gsi
);
4370 gphi
*phi
= gsi
.phi ();
4371 stmt_vec_info stmt_info
= loop_vinfo
->lookup_stmt (phi
);
4372 if (!STMT_SLP_TYPE (stmt_info
) && STMT_VINFO_RELEVANT (stmt_info
))
4373 maybe_push_to_hybrid_worklist (loop_vinfo
,
4374 worklist
, stmt_info
);
4376 for (gimple_stmt_iterator gsi
= gsi_last_bb (bb
); !gsi_end_p (gsi
);
4379 gimple
*stmt
= gsi_stmt (gsi
);
4380 if (is_gimple_debug (stmt
))
4382 stmt_vec_info stmt_info
= loop_vinfo
->lookup_stmt (stmt
);
4383 if (STMT_VINFO_IN_PATTERN_P (stmt_info
))
4385 for (gimple_stmt_iterator gsi2
4386 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info
));
4387 !gsi_end_p (gsi2
); gsi_next (&gsi2
))
4389 stmt_vec_info patt_info
4390 = loop_vinfo
->lookup_stmt (gsi_stmt (gsi2
));
4391 if (!STMT_SLP_TYPE (patt_info
)
4392 && STMT_VINFO_RELEVANT (patt_info
))
4393 maybe_push_to_hybrid_worklist (loop_vinfo
,
4394 worklist
, patt_info
);
4396 stmt_info
= STMT_VINFO_RELATED_STMT (stmt_info
);
4398 if (!STMT_SLP_TYPE (stmt_info
) && STMT_VINFO_RELEVANT (stmt_info
))
4399 maybe_push_to_hybrid_worklist (loop_vinfo
,
4400 worklist
, stmt_info
);
4404 /* Now we have a worklist of non-SLP stmts, follow use->def chains and
4405 mark any SLP vectorized stmt as hybrid.
4406 ??? We're visiting def stmts N times (once for each non-SLP and
4407 once for each hybrid-SLP use). */
4410 dat
.worklist
= &worklist
;
4411 dat
.loop_vinfo
= loop_vinfo
;
4412 memset (&wi
, 0, sizeof (wi
));
4413 wi
.info
= (void *)&dat
;
4414 while (!worklist
.is_empty ())
4416 stmt_vec_info stmt_info
= worklist
.pop ();
4417 /* Since SSA operands are not set up for pattern stmts we need
4418 to use walk_gimple_op. */
4420 walk_gimple_op (stmt_info
->stmt
, vect_detect_hybrid_slp
, &wi
);
/* Initialize a bb_vec_info struct for the statements in BBS basic blocks.
   Marks every stmt in the region with uid 0 so region membership can later
   be tested cheaply via gimple_uid.  */

_bb_vec_info::_bb_vec_info (vec<basic_block> _bbs, vec_info_shared *shared)
  : vec_info (vec_info::bb, shared),
    bbs (_bbs),
    roots (vNULL)
{
  for (unsigned i = 0; i < bbs.length (); ++i)
    {
      /* NOTE(review): the guard skipping the PHIs of the region entry
	 block was reconstructed from the dropped source lines -- confirm
	 against upstream.  */
      if (i != 0)
	for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
	     gsi_next (&si))
	  {
	    gphi *phi = si.phi ();
	    gimple_set_uid (phi, 0);
	  }
      for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
	   !gsi_end_p (gsi); gsi_next (&gsi))
	{
	  gimple *stmt = gsi_stmt (gsi);
	  gimple_set_uid (stmt, 0);
	  /* The debug-stmt check is a no-op here (nothing follows the
	     continue); kept to mirror the original code.  */
	  if (is_gimple_debug (stmt))
	    continue;
	}
    }
}
/* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
   stmts in the basic block.  */

_bb_vec_info::~_bb_vec_info ()
{
  /* Reset region marker.  Restores uid -1 (the "not in a vect region"
     value) on every PHI and stmt that the constructor marked with 0.  */
  for (unsigned i = 0; i < bbs.length (); ++i)
    {
      /* NOTE(review): guard reconstructed to mirror the constructor --
	 confirm against upstream.  */
      if (i != 0)
	for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
	     gsi_next (&si))
	  {
	    gphi *phi = si.phi ();
	    gimple_set_uid (phi, -1);
	  }
      for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
	   !gsi_end_p (gsi); gsi_next (&gsi))
	{
	  gimple *stmt = gsi_stmt (gsi);
	  gimple_set_uid (stmt, -1);
	}
    }

  /* Release the scalar stmt vectors of the recorded SLP instance roots.  */
  for (unsigned i = 0; i < roots.length (); ++i)
    {
      roots[i].stmts.release ();
      roots[i].roots.release ();
    }
  roots.release ();
}
/* Subroutine of vect_slp_analyze_node_operations.  Handle the root of NODE,
   given then that child nodes have already been processed, and that
   their def types currently match their SLP node's def type.
   Records costs into COST_VEC; returns true if NODE's operation can be
   vectorized.  */

static bool
vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
				    slp_instance node_instance,
				    stmt_vector_for_cost *cost_vec)
{
  stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);

  /* Calculate the number of vector statements to be created for the
     scalar stmts in this node.  For SLP reductions it is equal to the
     number of vector statements in the children (which has already been
     calculated by the recursive call).  Otherwise it is the number of
     scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
     VF divided by the number of elements in a vector.  */
  if (!STMT_VINFO_DATA_REF (stmt_info)
      && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
    {
      /* Copy the count from the first internal-def child.  */
      for (unsigned i = 0; i < SLP_TREE_CHILDREN (node).length (); ++i)
	if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node)[i])
	    == vect_internal_def)
	  {
	    SLP_TREE_NUMBER_OF_VEC_STMTS (node)
	      = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[i]);
	    break;
	  }
    }
  else
    {
      /* NOTE(review): the VF declaration and BB-vectorization fallback
	 (vf = 1) were dropped by the extraction; reconstructed --
	 confirm against upstream.  */
      poly_uint64 vf;
      if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
	vf = loop_vinfo->vectorization_factor;
      else
	vf = 1;
      unsigned int group_size = SLP_TREE_LANES (node);
      tree vectype = SLP_TREE_VECTYPE (node);
      SLP_TREE_NUMBER_OF_VEC_STMTS (node)
	= vect_get_num_vectors (vf * group_size, vectype);
    }

  /* Handle purely internal nodes.  */
  if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
    return vectorizable_slp_permutation (vinfo, NULL, node, cost_vec);

  gcc_assert (STMT_SLP_TYPE (stmt_info) != loop_vect);

  bool dummy;
  return vect_analyze_stmt (vinfo, stmt_info, &dummy,
			    node, node_instance, cost_vec);
}
/* Try to build NODE from scalars, returning true on success.
   NODE_INSTANCE is the SLP instance that contains NODE.
   On success NODE is demoted to vect_external_def and its scalar
   operands are recorded in SLP_TREE_SCALAR_OPS.  */

static bool
vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
			      slp_instance node_instance)
{
  stmt_vec_info stmt_info;
  unsigned int i;

  /* Only BB vectorization non-root nodes with recorded scalar stmts and
     a non-boolean vector type qualify.  */
  if (!is_a <bb_vec_info> (vinfo)
      || node == SLP_INSTANCE_TREE (node_instance)
      || !SLP_TREE_SCALAR_STMTS (node).exists ()
      || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node))
      /* Force the mask use to be built from scalars instead.  */
      || VECTOR_BOOLEAN_TYPE_P (SLP_TREE_VECTYPE (node)))
    return false;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "Building vector operands of %p from scalars instead\n",
		     node);

  /* Don't remove and free the child nodes here, since they could be
     referenced by other structures.  The analysis and scheduling phases
     (need to) ignore child nodes of anything that isn't vect_internal_def.  */
  unsigned int group_size = SLP_TREE_LANES (node);
  SLP_TREE_DEF_TYPE (node) = vect_external_def;
  /* Invariants get their vector type from the uses.  */
  SLP_TREE_VECTYPE (node) = NULL_TREE;
  SLP_TREE_SCALAR_OPS (node).safe_grow (group_size, true);
  SLP_TREE_LOAD_PERMUTATION (node).release ();
  /* Record the scalar defs (the original stmts' lhs) as operands.  */
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    {
      tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
      SLP_TREE_SCALAR_OPS (node)[i] = lhs;
    }
  return true;
}
4577 /* Return true if all elements of the slice are the same. */
4579 vect_scalar_ops_slice::all_same_p () const
4581 for (unsigned int i
= 1; i
< length
; ++i
)
4582 if (!operand_equal_p (op (0), op (i
)))
4588 vect_scalar_ops_slice_hash::hash (const value_type
&s
)
4591 for (unsigned i
= 0; i
< s
.length
; ++i
)
4592 hash
= iterative_hash_expr (s
.op (i
), hash
);
4597 vect_scalar_ops_slice_hash::equal (const value_type
&s1
,
4598 const compare_type
&s2
)
4600 if (s1
.length
!= s2
.length
)
4602 for (unsigned i
= 0; i
< s1
.length
; ++i
)
4603 if (!operand_equal_p (s1
.op (i
), s2
.op (i
)))
/* Compute the prologue cost for invariant or constant operands represented
   by NODE, recording the cost entries into COST_VEC.  */

static void
vect_prologue_cost_for_slp (slp_tree node,
			    stmt_vector_for_cost *cost_vec)
{
  /* There's a special case of an existing vector, that costs nothing.  */
  if (SLP_TREE_SCALAR_OPS (node).length () == 0
      && !SLP_TREE_VEC_DEFS (node).is_empty ())
    return;
  /* Without looking at the actual initializer a vector of
     constants can be implemented as load from the constant pool.
     When all elements are the same we can use a splat.  */
  tree vectype = SLP_TREE_VECTYPE (node);
  unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
  unsigned HOST_WIDE_INT const_nunits;
  unsigned nelt_limit;
  auto ops = &SLP_TREE_SCALAR_OPS (node);
  auto_vec<unsigned int> starts (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
  if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
      && ! multiple_p (const_nunits, group_size))
    {
      /* Fixed-length vectors whose lanes do not repeat whole groups:
	 cost each distinct vector of operands once.  */
      nelt_limit = const_nunits;
      hash_set<vect_scalar_ops_slice_hash> vector_ops;
      for (unsigned int i = 0; i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); ++i)
	if (!vector_ops.add ({ ops, i * const_nunits, const_nunits }))
	  starts.quick_push (i * const_nunits);
    }
  else
    {
      /* If either the vector has variable length or the vectors
	 are composed of repeated whole groups we only need to
	 cost construction once.  All vectors will be the same.  */
      nelt_limit = group_size;
      starts.quick_push (0);
    }
  /* ???  We're just tracking whether vectors in a single node are the same.
     Ideally we'd do something more global.  */
  for (unsigned int start : starts)
    {
      vect_cost_for_stmt kind;
      if (SLP_TREE_DEF_TYPE (node) == vect_constant_def)
	/* NOTE(review): this arm's cost kind was on a dropped line;
	   reconstructed as a constant-pool load -- confirm upstream.  */
	kind = vector_load;
      else if (vect_scalar_ops_slice { ops, start, nelt_limit }.all_same_p ())
	kind = scalar_to_vec;
      else
	kind = vec_construct;
      record_stmt_cost (cost_vec, 1, kind, node, vectype, 0, vect_prologue);
    }
}
/* Analyze statements contained in SLP tree NODE after recursively analyzing
   the subtree.  NODE_INSTANCE contains NODE and VINFO contains INSTANCE.

   VISITED_SET/VISITED_VEC implement sharing: a node analyzed once is not
   re-analyzed, and VISITED_VEC records insertion order so a failed
   subtree's entries can be rolled back.

   Return true if the operations are supported.  */

static bool
vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
				  slp_instance node_instance,
				  hash_set<slp_tree> &visited_set,
				  vec<slp_tree> &visited_vec,
				  stmt_vector_for_cost *cost_vec)
{
  int i, j;
  slp_tree child;

  /* Assume we can code-generate all invariants.  */
  if (!node
      || SLP_TREE_DEF_TYPE (node) == vect_constant_def
      || SLP_TREE_DEF_TYPE (node) == vect_external_def)
    return true;

  if (SLP_TREE_DEF_TYPE (node) == vect_uninitialized_def)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Failed cyclic SLP reference in %p\n", node);
      return false;
    }
  gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_internal_def);

  /* If we already analyzed the exact same set of scalar stmts we're done.
     We share the generated vector stmts for those.  */
  if (visited_set.add (node))
    return true;
  visited_vec.safe_push (node);

  bool res = true;
  unsigned visited_rec_start = visited_vec.length ();
  unsigned cost_vec_rec_start = cost_vec->length ();
  bool seen_non_constant_child = false;
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    {
      res = vect_slp_analyze_node_operations (vinfo, child, node_instance,
					      visited_set, visited_vec,
					      cost_vec);
      if (!res)
	break;
      if (child && SLP_TREE_DEF_TYPE (child) != vect_constant_def)
	seen_non_constant_child = true;
    }
  /* We're having difficulties scheduling nodes with just constant
     operands and no scalar stmts since we then cannot compute a stmt
     insertion place.  */
  if (!seen_non_constant_child && SLP_TREE_SCALAR_STMTS (node).is_empty ())
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Cannot vectorize all-constant op node %p\n", node);
      res = false;
    }

  if (res)
    res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
					      cost_vec);
  /* If analysis failed we have to pop all recursive visited nodes
     plus ourselves and undo any recorded costs.  */
  if (!res)
    {
      while (visited_vec.length () >= visited_rec_start)
	visited_set.remove (visited_vec.pop ());
      cost_vec->truncate (cost_vec_rec_start);
    }

  /* When the node can be vectorized cost invariant nodes it references.
     This is not done in DFS order to allow the refering node
     vectorizable_* calls to nail down the invariant nodes vector type
     and possibly unshare it if it needs a different vector type than
     the first use.  */
  else
    FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
      if (child
	  && (SLP_TREE_DEF_TYPE (child) == vect_constant_def
	      || SLP_TREE_DEF_TYPE (child) == vect_external_def)
	  /* Perform usual caching, note code-generation still
	     code-gens these nodes multiple times but we expect
	     to CSE them later.  */
	  && !visited_set.add (child))
	{
	  visited_vec.safe_push (child);
	  /* ???  After auditing more code paths make a "default"
	     and push the vector type from NODE to all children
	     if it is not already set.  */
	  /* Compute the number of vectors to be generated.  */
	  tree vector_type = SLP_TREE_VECTYPE (child);
	  if (!vector_type)
	    {
	      /* For shifts with a scalar argument we don't need
		 to cost or code-generate anything.
		 ???  Represent this more explicitely.  */
	      gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
			   == shift_vec_info_type)
			  && j == 1);
	      continue;
	    }
	  unsigned group_size = SLP_TREE_LANES (child);
	  poly_uint64 vf = 1;
	  if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
	    vf = loop_vinfo->vectorization_factor;
	  SLP_TREE_NUMBER_OF_VEC_STMTS (child)
	    = vect_get_num_vectors (vf * group_size, vector_type);
	  /* And cost them.  */
	  vect_prologue_cost_for_slp (child, cost_vec);
	}

  /* If this node or any of its children can't be vectorized, try pruning
     the tree here rather than felling the whole thing.  */
  if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
    {
      /* We'll need to revisit this for invariant costing and number
	 of vectorized stmt setting.  */
      res = true;
    }

  return res;
}
/* Mark lanes of NODE that are live outside of the basic-block vectorized
   region and that can be vectorized using vectorizable_live_operation
   with STMT_VINFO_LIVE_P.  Not handled live operations will cause the
   scalar code computing it to be retained.

   SVISITED guards against processing a scalar stmt twice across nodes,
   VISITED guards against revisiting SLP nodes.  Costs for the lane
   extracts go into COST_VEC.  */

static void
vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
			     slp_instance instance,
			     stmt_vector_for_cost *cost_vec,
			     hash_set<stmt_vec_info> &svisited,
			     hash_set<slp_tree> &visited)
{
  if (visited.add (node))
    return;

  unsigned i;
  stmt_vec_info stmt_info;
  stmt_vec_info last_stmt = vect_find_last_scalar_stmt_in_slp (node);
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    {
      if (svisited.contains (stmt_info))
	continue;
      stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
      if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)
	  && STMT_VINFO_RELATED_STMT (orig_stmt_info) != stmt_info)
	/* Only the pattern root stmt computes the original scalar value.  */
	continue;
      bool mark_visited = true;
      gimple *orig_stmt = orig_stmt_info->stmt;
      ssa_op_iter op_iter;
      def_operand_p def_p;
      FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
	{
	  imm_use_iterator use_iter;
	  gimple *use_stmt;
	  stmt_vec_info use_stmt_info;
	  /* A def is live if it has a use outside of the pure-SLP
	     vectorized stmts.  */
	  FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
	    if (!is_gimple_debug (use_stmt))
	      {
		use_stmt_info = bb_vinfo->lookup_stmt (use_stmt);
		if (!use_stmt_info
		    || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
		  {
		    STMT_VINFO_LIVE_P (stmt_info) = true;
		    if (vectorizable_live_operation (bb_vinfo, stmt_info,
						     NULL, node, instance, i,
						     false, cost_vec))
		      /* ???  So we know we can vectorize the live stmt
			 from one SLP node.  If we cannot do so from all
			 or none consistently we'd have to record which
			 SLP node (and lane) we want to use for the live
			 operation.  So make sure we can code-generate
			 from all nodes.  */
		      mark_visited = false;
		    else
		      STMT_VINFO_LIVE_P (stmt_info) = false;
		    break;
		  }
	      }

	  /* We have to verify whether we can insert the lane extract
	     before all uses.  The following is a conservative approximation.
	     We cannot put this into vectorizable_live_operation because
	     iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
	     doesn't work.
	     Note that while the fact that we emit code for loads at the
	     first load should make this a non-problem leafs we construct
	     from scalars are vectorized after the last scalar def.
	     ???  If we'd actually compute the insert location during
	     analysis we could use sth less conservative than the last
	     scalar stmt in the node for the dominance check.  */
	  /* ???  What remains is "live" uses in vector CTORs in the same
	     SLP graph which is where those uses can end up code-generated
	     right after their definition instead of close to their original
	     use.  But that would restrict us to code-generate lane-extracts
	     from the latest stmt in a node.  So we compensate for this
	     during code-generation, simply not replacing uses for those
	     hopefully rare cases.  */
	  if (STMT_VINFO_LIVE_P (stmt_info))
	    FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
	      if (!is_gimple_debug (use_stmt)
		  && (!(use_stmt_info = bb_vinfo->lookup_stmt (use_stmt))
		      || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
		  && !vect_stmt_dominates_stmt_p (last_stmt->stmt, use_stmt))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Cannot determine insertion place for "
				     "lane extract\n");
		  STMT_VINFO_LIVE_P (stmt_info) = false;
		  mark_visited = true;
		}
	}
      if (mark_visited)
	svisited.add (stmt_info);
    }

  /* Recurse to internal children.  */
  slp_tree child;
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
      vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance,
				   cost_vec, svisited, visited);
}
/* Determine whether we can vectorize the reduction epilogue for INSTANCE.
   On success record the epilogue costs into COST_VEC and return true.  */

static bool
vectorizable_bb_reduc_epilogue (slp_instance instance,
				stmt_vector_for_cost *cost_vec)
{
  gassign *stmt = as_a <gassign *> (instance->root_stmts[0]->stmt);
  enum tree_code reduc_code = gimple_assign_rhs_code (stmt);
  /* A subtraction reduction is handled by negating the non-first lanes,
     so check support for the corresponding addition.  */
  if (reduc_code == MINUS_EXPR)
    reduc_code = PLUS_EXPR;
  internal_fn reduc_fn;
  tree vectype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (instance));
  if (!vectype
      || !reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
      || reduc_fn == IFN_LAST
      || !direct_internal_fn_supported_p (reduc_fn, vectype, OPTIMIZE_FOR_BOTH)
      || !useless_type_conversion_p (TREE_TYPE (gimple_assign_lhs (stmt)),
				     TREE_TYPE (vectype)))
    return false;

  /* There's no way to cost a horizontal vector reduction via REDUC_FN so
     cost log2 vector operations plus shuffles and one extraction.  */
  unsigned steps = floor_log2 (vect_nunits_for_cost (vectype));
  record_stmt_cost (cost_vec, steps, vector_stmt, instance->root_stmts[0],
		    vectype, 0, vect_body);
  record_stmt_cost (cost_vec, steps, vec_perm, instance->root_stmts[0],
		    vectype, 0, vect_body);
  record_stmt_cost (cost_vec, 1, vec_to_scalar, instance->root_stmts[0],
		    vectype, 0, vect_body);
  return true;
}
4921 /* Prune from ROOTS all stmts that are computed as part of lanes of NODE
4922 and recurse to children. */
4925 vect_slp_prune_covered_roots (slp_tree node
, hash_set
<stmt_vec_info
> &roots
,
4926 hash_set
<slp_tree
> &visited
)
4928 if (SLP_TREE_DEF_TYPE (node
) != vect_internal_def
4929 || visited
.add (node
))
4934 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt
)
4935 roots
.remove (vect_orig_stmt (stmt
));
4938 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
4940 vect_slp_prune_covered_roots (child
, roots
, visited
);
/* Analyze statements in SLP instances of VINFO.  Return true if the
   operations are supported.  Instances that cannot be vectorized are
   removed from VINFO->slp_instances.  */

static bool
vect_slp_analyze_operations (vec_info *vinfo)
{
  slp_instance instance;
  int i;

  DUMP_VECT_SCOPE ("vect_slp_analyze_operations");

  hash_set<slp_tree> visited;
  /* Note: no ++i in the loop header; the index only advances when the
     instance is kept, since removal shifts the vector.  */
  for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
    {
      auto_vec<slp_tree> visited_vec;
      stmt_vector_for_cost cost_vec;
      cost_vec.create (2);
      if (is_a <bb_vec_info> (vinfo))
	vect_location = instance->location ();
      if (!vect_slp_analyze_node_operations (vinfo,
					     SLP_INSTANCE_TREE (instance),
					     instance, visited, visited_vec,
					     &cost_vec)
	  /* CTOR instances require vectorized defs for the SLP tree root.  */
	  || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor
	      && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
		  != vect_internal_def
		  /* Make sure we vectorized with the expected type.  */
		  || !useless_type_conversion_p
			(TREE_TYPE (TREE_TYPE (gimple_assign_rhs1
					   (instance->root_stmts[0]->stmt))),
			 TREE_TYPE (SLP_TREE_VECTYPE
					   (SLP_INSTANCE_TREE (instance))))))
	  /* Check we can vectorize the reduction.  */
	  || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_bb_reduc
	      && !vectorizable_bb_reduc_epilogue (instance, &cost_vec)))
	{
	  /* Analysis failed: report, free the instance and undo the
	     visited-set entries added for this instance.  */
	  slp_tree node = SLP_INSTANCE_TREE (instance);
	  stmt_vec_info stmt_info;
	  if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
	    stmt_info = SLP_INSTANCE_ROOT_STMTS (instance)[0];
	  else
	    stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "removing SLP instance operations starting "
			     "from: %G", stmt_info->stmt);
	  vect_free_slp_instance (instance);
	  vinfo->slp_instances.ordered_remove (i);
	  cost_vec.release ();
	  while (!visited_vec.is_empty ())
	    visited.remove (visited_vec.pop ());
	  continue;
	}

      i++;
      if (loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo))
	{
	  add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
	  cost_vec.release ();
	}
      else
	/* For BB vectorization remember the SLP graph entry
	   cost for later.  */
	instance->cost_vec = cost_vec;
    }

  /* Now look for SLP instances with a root that are covered by other
     instances and remove them.  */
  hash_set<stmt_vec_info> roots;
  for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
    if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
      roots.add (SLP_INSTANCE_ROOT_STMTS (instance)[0]);
  if (!roots.is_empty ())
    {
      hash_set<slp_tree> visited;
      for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
	vect_slp_prune_covered_roots (SLP_INSTANCE_TREE (instance), roots,
				      visited);
      for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
	if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ()
	    && !roots.contains (SLP_INSTANCE_ROOT_STMTS (instance)[0]))
	  {
	    stmt_vec_info root = SLP_INSTANCE_ROOT_STMTS (instance)[0];
	    if (dump_enabled_p ())
	      dump_printf_loc (MSG_NOTE, vect_location,
			       "removing SLP instance operations starting "
			       "from: %G", root->stmt);
	    vect_free_slp_instance (instance);
	    vinfo->slp_instances.ordered_remove (i);
	  }
	else
	  ++i;
    }

  /* Compute vectorizable live stmts.  */
  if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
    {
      hash_set<stmt_vec_info> svisited;
      hash_set<slp_tree> visited;
      for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
	{
	  vect_location = instance->location ();
	  vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
				       instance, &instance->cost_vec, svisited,
				       visited);
	}
    }

  return !vinfo->slp_instances.is_empty ();
}
5056 /* Get the SLP instance leader from INSTANCE_LEADER thereby transitively
5057 closing the eventual chain. */
5060 get_ultimate_leader (slp_instance instance
,
5061 hash_map
<slp_instance
, slp_instance
> &instance_leader
)
5063 auto_vec
<slp_instance
*, 8> chain
;
5065 while (*(tem
= instance_leader
.get (instance
)) != instance
)
5067 chain
.safe_push (tem
);
5070 while (!chain
.is_empty ())
5071 *chain
.pop () = instance
;
/* Worker of vect_bb_partition_graph, recurse on NODE.
   Associates every scalar stmt of NODE with INSTANCE in STMT_TO_INSTANCE
   and merges previously independent graph parts via INSTANCE_LEADER.  */

static void
vect_bb_partition_graph_r (bb_vec_info bb_vinfo,
			   slp_instance instance, slp_tree node,
			   hash_map<stmt_vec_info, slp_instance> &stmt_to_instance,
			   hash_map<slp_instance, slp_instance> &instance_leader,
			   hash_set<slp_tree> &visited)
{
  stmt_vec_info stmt_info;
  unsigned i;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    {
      bool existed_p;
      slp_instance &stmt_instance
	= stmt_to_instance.get_or_insert (stmt_info, &existed_p);
      if (!existed_p)
	;  /* First time we see this stmt; just claim it below.  */
      else if (stmt_instance != instance)
	{
	  /* If we're running into a previously marked stmt make us the
	     leader of the current ultimate leader.  This keeps the
	     leader chain acyclic and works even when the current instance
	     connects two previously independent graph parts.  */
	  slp_instance stmt_leader
	    = get_ultimate_leader (stmt_instance, instance_leader);
	  if (stmt_leader != instance)
	    instance_leader.put (stmt_leader, instance);
	}
      stmt_instance = instance;
    }

  /* Nodes without scalar stmts are always walked so their stmt-less
     children still get partitioned; others stop on re-visit.  */
  if (!SLP_TREE_SCALAR_STMTS (node).is_empty () && visited.add (node))
    return;

  slp_tree child;
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
      vect_bb_partition_graph_r (bb_vinfo, instance, child, stmt_to_instance,
				 instance_leader, visited);
}
/* Partition the SLP graph into pieces that can be costed independently.
   Fills each leader instance's subgraph_entries with the instances of
   its subgraph.  */

static void
vect_bb_partition_graph (bb_vec_info bb_vinfo)
{
  DUMP_VECT_SCOPE ("vect_bb_partition_graph");

  /* First walk the SLP graph assigning each involved scalar stmt a
     corresponding SLP graph entry and upon visiting a previously
     marked stmt, make the stmts leader the current SLP graph entry.  */
  hash_map<stmt_vec_info, slp_instance> stmt_to_instance;
  hash_map<slp_instance, slp_instance> instance_leader;
  hash_set<slp_tree> visited;
  slp_instance instance;
  for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
    {
      /* Every instance starts as its own leader.  */
      instance_leader.put (instance, instance);
      vect_bb_partition_graph_r (bb_vinfo,
				 instance, SLP_INSTANCE_TREE (instance),
				 stmt_to_instance, instance_leader,
				 visited);
    }

  /* Then collect entries to each independent subgraph.  */
  for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
    {
      slp_instance leader = get_ultimate_leader (instance, instance_leader);
      leader->subgraph_entries.safe_push (instance);
      if (dump_enabled_p ()
	  && leader != instance)
	dump_printf_loc (MSG_NOTE, vect_location,
			 "instance %p is leader of %p\n",
			 leader, instance);
    }
}
/* Compute the set of scalar stmts participating in internal and external
   nodes.  Internal-node scalar stmts go to VSTMTS; defs feeding external
   (invariant) nodes go to ESTMTS.  VISITED avoids rewalking shared
   subtrees.  */

static void
vect_slp_gather_vectorized_scalar_stmts (vec_info *vinfo, slp_tree node,
					 hash_set<slp_tree> &visited,
					 hash_set<stmt_vec_info> &vstmts,
					 hash_set<stmt_vec_info> &estmts)
{
  int i;
  stmt_vec_info stmt_info;
  slp_tree child;

  if (visited.add (node))
    return;

  if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
    {
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
	vstmts.add (stmt_info);

      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
	if (child)
	  vect_slp_gather_vectorized_scalar_stmts (vinfo, child, visited,
						   vstmts, estmts);
    }
  else
    /* External node: record the in-region defs of its scalar operands.  */
    for (tree def : SLP_TREE_SCALAR_OPS (node))
      {
	stmt_vec_info def_stmt = vinfo->lookup_def (def);
	if (def_stmt)
	  estmts.add (def_stmt);
      }
}
/* Compute the scalar cost of the SLP node NODE and its children
   and return it.  Do not account defs that are marked in LIFE and
   update LIFE according to uses of NODE.
   VECTORIZED_SCALAR_STMTS is the set of stmts that will be replaced by
   vector code; a def with a use outside that set must stay live as a
   scalar and is therefore not counted.  */

static void
vect_bb_slp_scalar_cost (vec_info *vinfo,
			 slp_tree node, vec<bool, va_heap> *life,
			 stmt_vector_for_cost *cost_vec,
			 hash_set<stmt_vec_info> &vectorized_scalar_stmts,
			 hash_set<slp_tree> &visited)
{
  unsigned i;
  stmt_vec_info stmt_info;
  slp_tree child;

  if (visited.add (node))
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    {
      ssa_op_iter op_iter;
      def_operand_p def_p;

      /* Lane already known live: nothing to account.  */
      if ((*life)[i])
	continue;

      stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
      gimple *orig_stmt = orig_stmt_info->stmt;

      /* If there is a non-vectorized use of the defs then the scalar
	 stmt is kept live in which case we do not account it or any
	 required defs in the SLP children in the scalar cost.  This
	 way we make the vectorization more costly when compared to
	 the scalar cost.  */
      if (!STMT_VINFO_LIVE_P (stmt_info))
	{
	  auto_vec<gimple *, 8> worklist;
	  hash_set<gimple *> *worklist_visited = NULL;
	  worklist.quick_push (orig_stmt);
	  do
	    {
	      gimple *work_stmt = worklist.pop ();
	      FOR_EACH_PHI_OR_STMT_DEF (def_p, work_stmt, op_iter, SSA_OP_DEF)
		{
		  imm_use_iterator use_iter;
		  gimple *use_stmt;
		  FOR_EACH_IMM_USE_STMT (use_stmt, use_iter,
					 DEF_FROM_PTR (def_p))
		    if (!is_gimple_debug (use_stmt))
		      {
			stmt_vec_info use_stmt_info
			  = vinfo->lookup_stmt (use_stmt);
			if (!use_stmt_info
			    || !vectorized_scalar_stmts.contains (use_stmt_info))
			  {
			    if (use_stmt_info
				&& STMT_VINFO_IN_PATTERN_P (use_stmt_info))
			      {
				/* For stmts participating in patterns we have
				   to check its uses recursively.  */
				if (!worklist_visited)
				  worklist_visited = new hash_set<gimple *> ();
				if (!worklist_visited->add (use_stmt))
				  worklist.safe_push (use_stmt);
				continue;
			      }
			    /* Found a non-vectorized use: the lane stays
			       live; stop scanning this lane.  */
			    (*life)[i] = true;
			    goto next_lane;
			  }
		      }
		}
	    }
	  while (!worklist.is_empty ());
next_lane:
	  if (worklist_visited)
	    delete worklist_visited;
	  if ((*life)[i])
	    continue;
	}

      /* Count scalar stmts only once.  */
      if (gimple_visited_p (orig_stmt))
	continue;
      gimple_set_visited (orig_stmt, true);

      vect_cost_for_stmt kind;
      if (STMT_VINFO_DATA_REF (orig_stmt_info))
	{
	  if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info)))
	    kind = scalar_load;
	  else
	    kind = scalar_store;
	}
      else if (vect_nop_conversion_p (orig_stmt_info))
	continue;
      /* For single-argument PHIs assume coalescing which means zero cost
	 for the scalar and the vector PHIs.  This avoids artificially
	 favoring the vector path (but may pessimize it in some cases).  */
      else if (is_a <gphi *> (orig_stmt_info->stmt)
	       && gimple_phi_num_args
		    (as_a <gphi *> (orig_stmt_info->stmt)) == 1)
	continue;
      else
	kind = scalar_stmt;
      record_stmt_cost (cost_vec, 1, kind, orig_stmt_info,
			SLP_TREE_VECTYPE (node), 0, vect_body);
    }

  auto_vec<bool, 20> subtree_life;
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    {
      if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
	{
	  /* Do not directly pass LIFE to the recursive call, copy it to
	     confine changes in the callee to the current child/subtree.  */
	  if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
	    {
	      /* Map lane liveness through the lane permutation.  */
	      subtree_life.safe_grow_cleared (SLP_TREE_LANES (child), true);
	      for (unsigned j = 0;
		   j < SLP_TREE_LANE_PERMUTATION (node).length (); ++j)
		{
		  auto perm = SLP_TREE_LANE_PERMUTATION (node)[j];
		  if (perm.first == i)
		    subtree_life[perm.second] = (*life)[j];
		}
	    }
	  else
	    {
	      gcc_assert (SLP_TREE_LANES (node) == SLP_TREE_LANES (child));
	      subtree_life.safe_splice (*life);
	    }
	  vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
				   vectorized_scalar_stmts, visited);
	  subtree_life.truncate (0);
	}
    }
}
5328 /* Comparator for the loop-index sorted cost vectors. */
5331 li_cost_vec_cmp (const void *a_
, const void *b_
)
5333 auto *a
= (const std::pair
<unsigned, stmt_info_for_cost
*> *)a_
;
5334 auto *b
= (const std::pair
<unsigned, stmt_info_for_cost
*> *)b_
;
5335 if (a
->first
< b
->first
)
5337 else if (a
->first
== b
->first
)
5342 /* Check if vectorization of the basic block is profitable for the
5343 subgraph denoted by SLP_INSTANCES. */
5346 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo
,
5347 vec
<slp_instance
> slp_instances
,
5350 slp_instance instance
;
5352 unsigned int vec_inside_cost
= 0, vec_outside_cost
= 0, scalar_cost
= 0;
5353 unsigned int vec_prologue_cost
= 0, vec_epilogue_cost
= 0;
5355 if (dump_enabled_p ())
5357 dump_printf_loc (MSG_NOTE
, vect_location
, "Costing subgraph: \n");
5358 hash_set
<slp_tree
> visited
;
5359 FOR_EACH_VEC_ELT (slp_instances
, i
, instance
)
5360 vect_print_slp_graph (MSG_NOTE
, vect_location
,
5361 SLP_INSTANCE_TREE (instance
), visited
);
5364 /* Compute the set of scalar stmts we know will go away 'locally' when
5365 vectorizing. This used to be tracked with just PURE_SLP_STMT but that's
5366 not accurate for nodes promoted extern late or for scalar stmts that
5367 are used both in extern defs and in vectorized defs. */
5368 hash_set
<stmt_vec_info
> vectorized_scalar_stmts
;
5369 hash_set
<stmt_vec_info
> scalar_stmts_in_externs
;
5370 hash_set
<slp_tree
> visited
;
5371 FOR_EACH_VEC_ELT (slp_instances
, i
, instance
)
5373 vect_slp_gather_vectorized_scalar_stmts (bb_vinfo
,
5374 SLP_INSTANCE_TREE (instance
),
5376 vectorized_scalar_stmts
,
5377 scalar_stmts_in_externs
);
5378 for (stmt_vec_info rstmt
: SLP_INSTANCE_ROOT_STMTS (instance
))
5379 vectorized_scalar_stmts
.add (rstmt
);
5381 /* Scalar stmts used as defs in external nodes need to be preseved, so
5382 remove them from vectorized_scalar_stmts. */
5383 for (stmt_vec_info stmt
: scalar_stmts_in_externs
)
5384 vectorized_scalar_stmts
.remove (stmt
);
5386 /* Calculate scalar cost and sum the cost for the vector stmts
5387 previously collected. */
5388 stmt_vector_for_cost scalar_costs
= vNULL
;
5389 stmt_vector_for_cost vector_costs
= vNULL
;
5391 FOR_EACH_VEC_ELT (slp_instances
, i
, instance
)
5393 auto_vec
<bool, 20> life
;
5394 life
.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance
)),
5396 if (!SLP_INSTANCE_ROOT_STMTS (instance
).is_empty ())
5397 record_stmt_cost (&scalar_costs
,
5398 SLP_INSTANCE_ROOT_STMTS (instance
).length (),
5400 SLP_INSTANCE_ROOT_STMTS (instance
)[0], 0, vect_body
);
5401 vect_bb_slp_scalar_cost (bb_vinfo
,
5402 SLP_INSTANCE_TREE (instance
),
5403 &life
, &scalar_costs
, vectorized_scalar_stmts
,
5405 vector_costs
.safe_splice (instance
->cost_vec
);
5406 instance
->cost_vec
.release ();
5409 if (dump_enabled_p ())
5410 dump_printf_loc (MSG_NOTE
, vect_location
, "Cost model analysis: \n");
5412 /* When costing non-loop vectorization we need to consider each covered
5413 loop independently and make sure vectorization is profitable. For
5414 now we assume a loop may be not entered or executed an arbitrary
5415 number of iterations (??? static information can provide more
5416 precise info here) which means we can simply cost each containing
5417 loops stmts separately. */
5419 /* First produce cost vectors sorted by loop index. */
5420 auto_vec
<std::pair
<unsigned, stmt_info_for_cost
*> >
5421 li_scalar_costs (scalar_costs
.length ());
5422 auto_vec
<std::pair
<unsigned, stmt_info_for_cost
*> >
5423 li_vector_costs (vector_costs
.length ());
5424 stmt_info_for_cost
*cost
;
5425 FOR_EACH_VEC_ELT (scalar_costs
, i
, cost
)
5427 unsigned l
= gimple_bb (cost
->stmt_info
->stmt
)->loop_father
->num
;
5428 li_scalar_costs
.quick_push (std::make_pair (l
, cost
));
5430 /* Use a random used loop as fallback in case the first vector_costs
5431 entry does not have a stmt_info associated with it. */
5432 unsigned l
= li_scalar_costs
[0].first
;
5433 FOR_EACH_VEC_ELT (vector_costs
, i
, cost
)
5435 /* We inherit from the previous COST, invariants, externals and
5436 extracts immediately follow the cost for the related stmt. */
5437 if (cost
->stmt_info
)
5438 l
= gimple_bb (cost
->stmt_info
->stmt
)->loop_father
->num
;
5439 li_vector_costs
.quick_push (std::make_pair (l
, cost
));
5441 li_scalar_costs
.qsort (li_cost_vec_cmp
);
5442 li_vector_costs
.qsort (li_cost_vec_cmp
);
5444 /* Now cost the portions individually. */
5447 bool profitable
= true;
5448 while (si
< li_scalar_costs
.length ()
5449 && vi
< li_vector_costs
.length ())
5451 unsigned sl
= li_scalar_costs
[si
].first
;
5452 unsigned vl
= li_vector_costs
[vi
].first
;
5455 if (dump_enabled_p ())
5456 dump_printf_loc (MSG_NOTE
, vect_location
,
5457 "Scalar %d and vector %d loop part do not "
5458 "match up, skipping scalar part\n", sl
, vl
);
5459 /* Skip the scalar part, assuming zero cost on the vector side. */
5464 while (si
< li_scalar_costs
.length ()
5465 && li_scalar_costs
[si
].first
== sl
);
5469 class vector_costs
*scalar_target_cost_data
= init_cost (bb_vinfo
, true);
5472 add_stmt_cost (scalar_target_cost_data
, li_scalar_costs
[si
].second
);
5475 while (si
< li_scalar_costs
.length ()
5476 && li_scalar_costs
[si
].first
== sl
);
5478 finish_cost (scalar_target_cost_data
, nullptr,
5479 &dummy
, &scalar_cost
, &dummy
);
5481 /* Complete the target-specific vector cost calculation. */
5482 class vector_costs
*vect_target_cost_data
= init_cost (bb_vinfo
, false);
5485 add_stmt_cost (vect_target_cost_data
, li_vector_costs
[vi
].second
);
5488 while (vi
< li_vector_costs
.length ()
5489 && li_vector_costs
[vi
].first
== vl
);
5490 finish_cost (vect_target_cost_data
, scalar_target_cost_data
,
5491 &vec_prologue_cost
, &vec_inside_cost
, &vec_epilogue_cost
);
5492 delete scalar_target_cost_data
;
5493 delete vect_target_cost_data
;
5495 vec_outside_cost
= vec_prologue_cost
+ vec_epilogue_cost
;
5497 if (dump_enabled_p ())
5499 dump_printf_loc (MSG_NOTE
, vect_location
,
5500 "Cost model analysis for part in loop %d:\n", sl
);
5501 dump_printf (MSG_NOTE
, " Vector cost: %d\n",
5502 vec_inside_cost
+ vec_outside_cost
);
5503 dump_printf (MSG_NOTE
, " Scalar cost: %d\n", scalar_cost
);
5506 /* Vectorization is profitable if its cost is more than the cost of scalar
5507 version. Note that we err on the vector side for equal cost because
5508 the cost estimate is otherwise quite pessimistic (constant uses are
5509 free on the scalar side but cost a load on the vector side for
5511 if (vec_outside_cost
+ vec_inside_cost
> scalar_cost
)
5517 if (profitable
&& vi
< li_vector_costs
.length ())
5519 if (dump_enabled_p ())
5520 dump_printf_loc (MSG_NOTE
, vect_location
,
5521 "Excess vector cost for part in loop %d:\n",
5522 li_vector_costs
[vi
].first
);
5526 /* Unset visited flag. This is delayed when the subgraph is profitable
5527 and we process the loop for remaining unvectorized if-converted code. */
5528 if (!orig_loop
|| !profitable
)
5529 FOR_EACH_VEC_ELT (scalar_costs
, i
, cost
)
5530 gimple_set_visited (cost
->stmt_info
->stmt
, false);
5532 scalar_costs
.release ();
5533 vector_costs
.release ();
5538 /* qsort comparator for lane defs. */
5541 vld_cmp (const void *a_
, const void *b_
)
5543 auto *a
= (const std::pair
<unsigned, tree
> *)a_
;
5544 auto *b
= (const std::pair
<unsigned, tree
> *)b_
;
5545 return a
->first
- b
->first
;
5548 /* Return true if USE_STMT is a vector lane insert into VEC and set
5549 *THIS_LANE to the lane number that is set. */
5552 vect_slp_is_lane_insert (gimple
*use_stmt
, tree vec
, unsigned *this_lane
)
5554 gassign
*use_ass
= dyn_cast
<gassign
*> (use_stmt
);
5556 || gimple_assign_rhs_code (use_ass
) != BIT_INSERT_EXPR
5558 ? gimple_assign_rhs1 (use_ass
) != vec
5559 : ((vec
= gimple_assign_rhs1 (use_ass
)), false))
5560 || !useless_type_conversion_p (TREE_TYPE (TREE_TYPE (vec
)),
5561 TREE_TYPE (gimple_assign_rhs2 (use_ass
)))
5562 || !constant_multiple_p
5563 (tree_to_poly_uint64 (gimple_assign_rhs3 (use_ass
)),
5564 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (TREE_TYPE (vec
)))),
5570 /* Find any vectorizable constructors and add them to the grouped_store
5574 vect_slp_check_for_constructors (bb_vec_info bb_vinfo
)
5576 for (unsigned i
= 0; i
< bb_vinfo
->bbs
.length (); ++i
)
5577 for (gimple_stmt_iterator gsi
= gsi_start_bb (bb_vinfo
->bbs
[i
]);
5578 !gsi_end_p (gsi
); gsi_next (&gsi
))
5580 gassign
*assign
= dyn_cast
<gassign
*> (gsi_stmt (gsi
));
5584 tree rhs
= gimple_assign_rhs1 (assign
);
5585 enum tree_code code
= gimple_assign_rhs_code (assign
);
5586 use_operand_p use_p
;
5588 if (code
== CONSTRUCTOR
)
5590 if (!VECTOR_TYPE_P (TREE_TYPE (rhs
))
5591 || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs
)),
5592 CONSTRUCTOR_NELTS (rhs
))
5593 || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs
, 0)->value
))
5594 || uniform_vector_p (rhs
))
5599 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs
), j
, val
)
5600 if (TREE_CODE (val
) != SSA_NAME
5601 || !bb_vinfo
->lookup_def (val
))
5603 if (j
!= CONSTRUCTOR_NELTS (rhs
))
5606 stmt_vec_info stmt_info
= bb_vinfo
->lookup_stmt (assign
);
5607 BB_VINFO_GROUPED_STORES (bb_vinfo
).safe_push (stmt_info
);
5609 else if (code
== BIT_INSERT_EXPR
5610 && VECTOR_TYPE_P (TREE_TYPE (rhs
))
5611 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs
)).is_constant ()
5612 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs
)).to_constant () > 1
5613 && integer_zerop (gimple_assign_rhs3 (assign
))
5614 && useless_type_conversion_p
5615 (TREE_TYPE (TREE_TYPE (rhs
)),
5616 TREE_TYPE (gimple_assign_rhs2 (assign
)))
5617 && bb_vinfo
->lookup_def (gimple_assign_rhs2 (assign
)))
5619 /* We start to match on insert to lane zero but since the
5620 inserts need not be ordered we'd have to search both
5621 the def and the use chains. */
5622 tree vectype
= TREE_TYPE (rhs
);
5623 unsigned nlanes
= TYPE_VECTOR_SUBPARTS (vectype
).to_constant ();
5624 auto_vec
<std::pair
<unsigned, tree
> > lane_defs (nlanes
);
5625 auto_sbitmap
lanes (nlanes
);
5626 bitmap_clear (lanes
);
5627 bitmap_set_bit (lanes
, 0);
5628 tree def
= gimple_assign_lhs (assign
);
5629 lane_defs
.quick_push
5630 (std::make_pair (0, gimple_assign_rhs2 (assign
)));
5631 unsigned lanes_found
= 1;
5632 /* Start with the use chains, the last stmt will be the root. */
5633 stmt_vec_info last
= bb_vinfo
->lookup_stmt (assign
);
5634 vec
<stmt_vec_info
> roots
= vNULL
;
5635 roots
.safe_push (last
);
5638 use_operand_p use_p
;
5640 if (!single_imm_use (def
, &use_p
, &use_stmt
))
5643 if (!bb_vinfo
->lookup_stmt (use_stmt
)
5644 || !vect_slp_is_lane_insert (use_stmt
, def
, &this_lane
)
5645 || !bb_vinfo
->lookup_def (gimple_assign_rhs2 (use_stmt
)))
5647 if (bitmap_bit_p (lanes
, this_lane
))
5650 bitmap_set_bit (lanes
, this_lane
);
5651 gassign
*use_ass
= as_a
<gassign
*> (use_stmt
);
5652 lane_defs
.quick_push (std::make_pair
5653 (this_lane
, gimple_assign_rhs2 (use_ass
)));
5654 last
= bb_vinfo
->lookup_stmt (use_ass
);
5655 roots
.safe_push (last
);
5656 def
= gimple_assign_lhs (use_ass
);
5658 while (lanes_found
< nlanes
);
5659 if (roots
.length () > 1)
5660 std::swap(roots
[0], roots
[roots
.length () - 1]);
5661 if (lanes_found
< nlanes
)
5663 /* Now search the def chain. */
5664 def
= gimple_assign_rhs1 (assign
);
5667 if (TREE_CODE (def
) != SSA_NAME
5668 || !has_single_use (def
))
5670 gimple
*def_stmt
= SSA_NAME_DEF_STMT (def
);
5672 if (!bb_vinfo
->lookup_stmt (def_stmt
)
5673 || !vect_slp_is_lane_insert (def_stmt
,
5674 NULL_TREE
, &this_lane
)
5675 || !bb_vinfo
->lookup_def (gimple_assign_rhs2 (def_stmt
)))
5677 if (bitmap_bit_p (lanes
, this_lane
))
5680 bitmap_set_bit (lanes
, this_lane
);
5681 lane_defs
.quick_push (std::make_pair
5683 gimple_assign_rhs2 (def_stmt
)));
5684 roots
.safe_push (bb_vinfo
->lookup_stmt (def_stmt
));
5685 def
= gimple_assign_rhs1 (def_stmt
);
5687 while (lanes_found
< nlanes
);
5689 if (lanes_found
== nlanes
)
5691 /* Sort lane_defs after the lane index and register the root. */
5692 lane_defs
.qsort (vld_cmp
);
5693 vec
<stmt_vec_info
> stmts
;
5694 stmts
.create (nlanes
);
5695 for (unsigned i
= 0; i
< nlanes
; ++i
)
5696 stmts
.quick_push (bb_vinfo
->lookup_def (lane_defs
[i
].second
));
5697 bb_vinfo
->roots
.safe_push (slp_root (slp_inst_kind_ctor
,
5703 else if (!VECTOR_TYPE_P (TREE_TYPE (rhs
))
5704 && (associative_tree_code (code
) || code
== MINUS_EXPR
)
5705 /* ??? The flag_associative_math and TYPE_OVERFLOW_WRAPS
5706 checks pessimize a two-element reduction. PR54400.
5707 ??? In-order reduction could be handled if we only
5708 traverse one operand chain in vect_slp_linearize_chain. */
5709 && ((FLOAT_TYPE_P (TREE_TYPE (rhs
)) && flag_associative_math
)
5710 || (INTEGRAL_TYPE_P (TREE_TYPE (rhs
))
5711 && TYPE_OVERFLOW_WRAPS (TREE_TYPE (rhs
))))
5712 /* Ops with constants at the tail can be stripped here. */
5713 && TREE_CODE (rhs
) == SSA_NAME
5714 && TREE_CODE (gimple_assign_rhs2 (assign
)) == SSA_NAME
5715 /* Should be the chain end. */
5716 && (!single_imm_use (gimple_assign_lhs (assign
),
5718 || !is_gimple_assign (use_stmt
)
5719 || (gimple_assign_rhs_code (use_stmt
) != code
5720 && ((code
!= PLUS_EXPR
&& code
!= MINUS_EXPR
)
5721 || (gimple_assign_rhs_code (use_stmt
)
5722 != (code
== PLUS_EXPR
? MINUS_EXPR
: PLUS_EXPR
))))))
5724 /* We start the match at the end of a possible association
5726 auto_vec
<chain_op_t
> chain
;
5727 auto_vec
<std::pair
<tree_code
, gimple
*> > worklist
;
5728 auto_vec
<gimple
*> chain_stmts
;
5729 gimple
*code_stmt
= NULL
, *alt_code_stmt
= NULL
;
5730 if (code
== MINUS_EXPR
)
5732 internal_fn reduc_fn
;
5733 if (!reduction_fn_for_scalar_code (code
, &reduc_fn
)
5734 || reduc_fn
== IFN_LAST
)
5736 vect_slp_linearize_chain (bb_vinfo
, worklist
, chain
, code
, assign
,
5738 code_stmt
, alt_code_stmt
, &chain_stmts
);
5739 if (chain
.length () > 1)
5741 /* Sort the chain according to def_type and operation. */
5742 chain
.sort (dt_sort_cmp
, bb_vinfo
);
5743 /* ??? Now we'd want to strip externals and constants
5744 but record those to be handled in the epilogue. */
5745 /* ??? For now do not allow mixing ops or externs/constants. */
5746 bool invalid
= false;
5747 for (unsigned i
= 0; i
< chain
.length (); ++i
)
5748 if (chain
[i
].dt
!= vect_internal_def
5749 || chain
[i
].code
!= code
)
5753 vec
<stmt_vec_info
> stmts
;
5754 stmts
.create (chain
.length ());
5755 for (unsigned i
= 0; i
< chain
.length (); ++i
)
5756 stmts
.quick_push (bb_vinfo
->lookup_def (chain
[i
].op
));
5757 vec
<stmt_vec_info
> roots
;
5758 roots
.create (chain_stmts
.length ());
5759 for (unsigned i
= 0; i
< chain_stmts
.length (); ++i
)
5760 roots
.quick_push (bb_vinfo
->lookup_stmt (chain_stmts
[i
]));
5761 bb_vinfo
->roots
.safe_push (slp_root (slp_inst_kind_bb_reduc
,
5769 /* Walk the grouped store chains and replace entries with their
5770 pattern variant if any. */
5773 vect_fixup_store_groups_with_patterns (vec_info
*vinfo
)
5775 stmt_vec_info first_element
;
5778 FOR_EACH_VEC_ELT (vinfo
->grouped_stores
, i
, first_element
)
5780 /* We also have CTORs in this array. */
5781 if (!STMT_VINFO_GROUPED_ACCESS (first_element
))
5783 if (STMT_VINFO_IN_PATTERN_P (first_element
))
5785 stmt_vec_info orig
= first_element
;
5786 first_element
= STMT_VINFO_RELATED_STMT (first_element
);
5787 DR_GROUP_FIRST_ELEMENT (first_element
) = first_element
;
5788 DR_GROUP_SIZE (first_element
) = DR_GROUP_SIZE (orig
);
5789 DR_GROUP_GAP (first_element
) = DR_GROUP_GAP (orig
);
5790 DR_GROUP_NEXT_ELEMENT (first_element
) = DR_GROUP_NEXT_ELEMENT (orig
);
5791 vinfo
->grouped_stores
[i
] = first_element
;
5793 stmt_vec_info prev
= first_element
;
5794 while (DR_GROUP_NEXT_ELEMENT (prev
))
5796 stmt_vec_info elt
= DR_GROUP_NEXT_ELEMENT (prev
);
5797 if (STMT_VINFO_IN_PATTERN_P (elt
))
5799 stmt_vec_info orig
= elt
;
5800 elt
= STMT_VINFO_RELATED_STMT (elt
);
5801 DR_GROUP_NEXT_ELEMENT (prev
) = elt
;
5802 DR_GROUP_GAP (elt
) = DR_GROUP_GAP (orig
);
5803 DR_GROUP_NEXT_ELEMENT (elt
) = DR_GROUP_NEXT_ELEMENT (orig
);
5805 DR_GROUP_FIRST_ELEMENT (elt
) = first_element
;
5811 /* Check if the region described by BB_VINFO can be vectorized, returning
5812 true if so. When returning false, set FATAL to true if the same failure
5813 would prevent vectorization at other vector sizes, false if it is still
5814 worth trying other sizes. N_STMTS is the number of statements in the
5818 vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo
, int n_stmts
, bool &fatal
,
5819 vec
<int> *dataref_groups
)
5821 DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
5823 slp_instance instance
;
5825 poly_uint64 min_vf
= 2;
5827 /* The first group of checks is independent of the vector size. */
5830 /* Analyze the data references. */
5832 if (!vect_analyze_data_refs (bb_vinfo
, &min_vf
, NULL
))
5834 if (dump_enabled_p ())
5835 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5836 "not vectorized: unhandled data-ref in basic "
5841 if (!vect_analyze_data_ref_accesses (bb_vinfo
, dataref_groups
))
5843 if (dump_enabled_p ())
5844 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5845 "not vectorized: unhandled data access in "
5850 vect_slp_check_for_constructors (bb_vinfo
);
5852 /* If there are no grouped stores and no constructors in the region
5853 there is no need to continue with pattern recog as vect_analyze_slp
5854 will fail anyway. */
5855 if (bb_vinfo
->grouped_stores
.is_empty ()
5856 && bb_vinfo
->roots
.is_empty ())
5858 if (dump_enabled_p ())
5859 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5860 "not vectorized: no grouped stores in "
5865 /* While the rest of the analysis below depends on it in some way. */
5868 vect_pattern_recog (bb_vinfo
);
5870 /* Update store groups from pattern processing. */
5871 vect_fixup_store_groups_with_patterns (bb_vinfo
);
5873 /* Check the SLP opportunities in the basic block, analyze and build SLP
5875 if (!vect_analyze_slp (bb_vinfo
, n_stmts
))
5877 if (dump_enabled_p ())
5879 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5880 "Failed to SLP the basic block.\n");
5881 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5882 "not vectorized: failed to find SLP opportunities "
5883 "in basic block.\n");
5888 /* Optimize permutations. */
5889 vect_optimize_slp (bb_vinfo
);
5891 /* Gather the loads reachable from the SLP graph entries. */
5892 vect_gather_slp_loads (bb_vinfo
);
5894 vect_record_base_alignments (bb_vinfo
);
5896 /* Analyze and verify the alignment of data references and the
5897 dependence in the SLP instances. */
5898 for (i
= 0; BB_VINFO_SLP_INSTANCES (bb_vinfo
).iterate (i
, &instance
); )
5900 vect_location
= instance
->location ();
5901 if (! vect_slp_analyze_instance_alignment (bb_vinfo
, instance
)
5902 || ! vect_slp_analyze_instance_dependence (bb_vinfo
, instance
))
5904 slp_tree node
= SLP_INSTANCE_TREE (instance
);
5905 stmt_vec_info stmt_info
= SLP_TREE_SCALAR_STMTS (node
)[0];
5906 if (dump_enabled_p ())
5907 dump_printf_loc (MSG_NOTE
, vect_location
,
5908 "removing SLP instance operations starting from: %G",
5910 vect_free_slp_instance (instance
);
5911 BB_VINFO_SLP_INSTANCES (bb_vinfo
).ordered_remove (i
);
5915 /* Mark all the statements that we want to vectorize as pure SLP and
5917 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance
));
5918 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance
));
5921 /* Likewise consider instance root stmts as vectorized. */
5922 FOR_EACH_VEC_ELT (SLP_INSTANCE_ROOT_STMTS (instance
), j
, root
)
5923 STMT_SLP_TYPE (root
) = pure_slp
;
5927 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo
).length ())
5930 if (!vect_slp_analyze_operations (bb_vinfo
))
5932 if (dump_enabled_p ())
5933 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5934 "not vectorized: bad operation in basic block.\n");
5938 vect_bb_partition_graph (bb_vinfo
);
5943 /* Subroutine of vect_slp_bb. Try to vectorize the statements for all
5944 basic blocks in BBS, returning true on success.
5945 The region has N_STMTS statements and has the datarefs given by DATAREFS. */
5948 vect_slp_region (vec
<basic_block
> bbs
, vec
<data_reference_p
> datarefs
,
5949 vec
<int> *dataref_groups
, unsigned int n_stmts
,
5952 bb_vec_info bb_vinfo
;
5953 auto_vector_modes vector_modes
;
5955 /* Autodetect first vector size we try. */
5956 machine_mode next_vector_mode
= VOIDmode
;
5957 targetm
.vectorize
.autovectorize_vector_modes (&vector_modes
, false);
5958 unsigned int mode_i
= 0;
5960 vec_info_shared shared
;
5962 machine_mode autodetected_vector_mode
= VOIDmode
;
5965 bool vectorized
= false;
5967 bb_vinfo
= new _bb_vec_info (bbs
, &shared
);
5969 bool first_time_p
= shared
.datarefs
.is_empty ();
5970 BB_VINFO_DATAREFS (bb_vinfo
) = datarefs
;
5972 bb_vinfo
->shared
->save_datarefs ();
5974 bb_vinfo
->shared
->check_datarefs ();
5975 bb_vinfo
->vector_mode
= next_vector_mode
;
5977 if (vect_slp_analyze_bb_1 (bb_vinfo
, n_stmts
, fatal
, dataref_groups
))
5979 if (dump_enabled_p ())
5981 dump_printf_loc (MSG_NOTE
, vect_location
,
5982 "***** Analysis succeeded with vector mode"
5983 " %s\n", GET_MODE_NAME (bb_vinfo
->vector_mode
));
5984 dump_printf_loc (MSG_NOTE
, vect_location
, "SLPing BB part\n");
5987 bb_vinfo
->shared
->check_datarefs ();
5989 auto_vec
<slp_instance
> profitable_subgraphs
;
5990 for (slp_instance instance
: BB_VINFO_SLP_INSTANCES (bb_vinfo
))
5992 if (instance
->subgraph_entries
.is_empty ())
5995 vect_location
= instance
->location ();
5996 if (!unlimited_cost_model (NULL
)
5997 && !vect_bb_vectorization_profitable_p
5998 (bb_vinfo
, instance
->subgraph_entries
, orig_loop
))
6000 if (dump_enabled_p ())
6001 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6002 "not vectorized: vectorization is not "
6007 if (!dbg_cnt (vect_slp
))
6010 profitable_subgraphs
.safe_push (instance
);
6013 /* When we're vectorizing an if-converted loop body make sure
6014 we vectorized all if-converted code. */
6015 if (!profitable_subgraphs
.is_empty ()
6018 gcc_assert (bb_vinfo
->bbs
.length () == 1);
6019 for (gimple_stmt_iterator gsi
= gsi_start_bb (bb_vinfo
->bbs
[0]);
6020 !gsi_end_p (gsi
); gsi_next (&gsi
))
6022 /* The costing above left us with DCEable vectorized scalar
6023 stmts having the visited flag set on profitable
6024 subgraphs. Do the delayed clearing of the flag here. */
6025 if (gimple_visited_p (gsi_stmt (gsi
)))
6027 gimple_set_visited (gsi_stmt (gsi
), false);
6030 if (flag_vect_cost_model
== VECT_COST_MODEL_UNLIMITED
)
6033 if (gassign
*ass
= dyn_cast
<gassign
*> (gsi_stmt (gsi
)))
6034 if (gimple_assign_rhs_code (ass
) == COND_EXPR
)
6036 if (!profitable_subgraphs
.is_empty ()
6037 && dump_enabled_p ())
6038 dump_printf_loc (MSG_NOTE
, vect_location
,
6039 "not profitable because of "
6040 "unprofitable if-converted scalar "
6042 profitable_subgraphs
.truncate (0);
6047 /* Finally schedule the profitable subgraphs. */
6048 for (slp_instance instance
: profitable_subgraphs
)
6050 if (!vectorized
&& dump_enabled_p ())
6051 dump_printf_loc (MSG_NOTE
, vect_location
,
6052 "Basic block will be vectorized "
6056 vect_schedule_slp (bb_vinfo
, instance
->subgraph_entries
);
6058 unsigned HOST_WIDE_INT bytes
;
6059 if (dump_enabled_p ())
6062 (bb_vinfo
->vector_mode
).is_constant (&bytes
))
6063 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS
, vect_location
,
6064 "basic block part vectorized using %wu "
6065 "byte vectors\n", bytes
);
6067 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS
, vect_location
,
6068 "basic block part vectorized using "
6069 "variable length vectors\n");
6075 if (dump_enabled_p ())
6076 dump_printf_loc (MSG_NOTE
, vect_location
,
6077 "***** Analysis failed with vector mode %s\n",
6078 GET_MODE_NAME (bb_vinfo
->vector_mode
));
6082 autodetected_vector_mode
= bb_vinfo
->vector_mode
;
6085 while (mode_i
< vector_modes
.length ()
6086 && vect_chooses_same_modes_p (bb_vinfo
, vector_modes
[mode_i
]))
6088 if (dump_enabled_p ())
6089 dump_printf_loc (MSG_NOTE
, vect_location
,
6090 "***** The result for vector mode %s would"
6092 GET_MODE_NAME (vector_modes
[mode_i
]));
6098 if (mode_i
< vector_modes
.length ()
6099 && VECTOR_MODE_P (autodetected_vector_mode
)
6100 && (related_vector_mode (vector_modes
[mode_i
],
6101 GET_MODE_INNER (autodetected_vector_mode
))
6102 == autodetected_vector_mode
)
6103 && (related_vector_mode (autodetected_vector_mode
,
6104 GET_MODE_INNER (vector_modes
[mode_i
]))
6105 == vector_modes
[mode_i
]))
6107 if (dump_enabled_p ())
6108 dump_printf_loc (MSG_NOTE
, vect_location
,
6109 "***** Skipping vector mode %s, which would"
6110 " repeat the analysis for %s\n",
6111 GET_MODE_NAME (vector_modes
[mode_i
]),
6112 GET_MODE_NAME (autodetected_vector_mode
));
6117 || mode_i
== vector_modes
.length ()
6118 || autodetected_vector_mode
== VOIDmode
6119 /* If vect_slp_analyze_bb_1 signaled that analysis for all
6120 vector sizes will fail do not bother iterating. */
6124 /* Try the next biggest vector size. */
6125 next_vector_mode
= vector_modes
[mode_i
++];
6126 if (dump_enabled_p ())
6127 dump_printf_loc (MSG_NOTE
, vect_location
,
6128 "***** Re-trying analysis with vector mode %s\n",
6129 GET_MODE_NAME (next_vector_mode
));
6134 /* Main entry for the BB vectorizer. Analyze and transform BBS, returns
6135 true if anything in the basic-block was vectorized. */
6138 vect_slp_bbs (const vec
<basic_block
> &bbs
, loop_p orig_loop
)
6140 vec
<data_reference_p
> datarefs
= vNULL
;
6141 auto_vec
<int> dataref_groups
;
6143 int current_group
= 0;
6145 for (unsigned i
= 0; i
< bbs
.length (); i
++)
6147 basic_block bb
= bbs
[i
];
6148 for (gimple_stmt_iterator gsi
= gsi_after_labels (bb
); !gsi_end_p (gsi
);
6151 gimple
*stmt
= gsi_stmt (gsi
);
6152 if (is_gimple_debug (stmt
))
6157 if (gimple_location (stmt
) != UNKNOWN_LOCATION
)
6158 vect_location
= stmt
;
6160 if (!vect_find_stmt_data_reference (NULL
, stmt
, &datarefs
,
6161 &dataref_groups
, current_group
))
6164 /* New BBs always start a new DR group. */
6168 return vect_slp_region (bbs
, datarefs
, &dataref_groups
, insns
, orig_loop
);
6171 /* Special entry for the BB vectorizer. Analyze and transform a single
6172 if-converted BB with ORIG_LOOPs body being the not if-converted
6173 representation. Returns true if anything in the basic-block was
6177 vect_slp_if_converted_bb (basic_block bb
, loop_p orig_loop
)
6179 auto_vec
<basic_block
> bbs
;
6181 return vect_slp_bbs (bbs
, orig_loop
);
6184 /* Main entry for the BB vectorizer. Analyze and transform BB, returns
6185 true if anything in the basic-block was vectorized. */
6188 vect_slp_function (function
*fun
)
6191 int *rpo
= XNEWVEC (int, n_basic_blocks_for_fn (fun
));
6192 unsigned n
= pre_and_rev_post_order_compute_fn (fun
, NULL
, rpo
, false);
6194 /* For the moment split the function into pieces to avoid making
6195 the iteration on the vector mode moot. Split at points we know
6196 to not handle well which is CFG merges (SLP discovery doesn't
6197 handle non-loop-header PHIs) and loop exits. Since pattern
6198 recog requires reverse iteration to visit uses before defs
6199 simply chop RPO into pieces. */
6200 auto_vec
<basic_block
> bbs
;
6201 for (unsigned i
= 0; i
< n
; i
++)
6203 basic_block bb
= BASIC_BLOCK_FOR_FN (fun
, rpo
[i
]);
6206 /* Split when a BB is not dominated by the first block. */
6207 if (!bbs
.is_empty ()
6208 && !dominated_by_p (CDI_DOMINATORS
, bb
, bbs
[0]))
6210 if (dump_enabled_p ())
6211 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6212 "splitting region at dominance boundary bb%d\n",
6216 /* Split when the loop determined by the first block
6217 is exited. This is because we eventually insert
6218 invariants at region begin. */
6219 else if (!bbs
.is_empty ()
6220 && bbs
[0]->loop_father
!= bb
->loop_father
6221 && !flow_loop_nested_p (bbs
[0]->loop_father
, bb
->loop_father
))
6223 if (dump_enabled_p ())
6224 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6225 "splitting region at loop %d exit at bb%d\n",
6226 bbs
[0]->loop_father
->num
, bb
->index
);
6230 if (split
&& !bbs
.is_empty ())
6232 r
|= vect_slp_bbs (bbs
, NULL
);
6234 bbs
.quick_push (bb
);
6239 /* When we have a stmt ending this block and defining a
6240 value we have to insert on edges when inserting after it for
6241 a vector containing its definition. Avoid this for now. */
6242 if (gimple
*last
= last_stmt (bb
))
6243 if (gimple_get_lhs (last
)
6244 && is_ctrl_altering_stmt (last
))
6246 if (dump_enabled_p ())
6247 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6248 "splitting region at control altering "
6249 "definition %G", last
);
6250 r
|= vect_slp_bbs (bbs
, NULL
);
6255 if (!bbs
.is_empty ())
6256 r
|= vect_slp_bbs (bbs
, NULL
);
6263 /* Build a variable-length vector in which the elements in ELTS are repeated
6264 to a fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
6265 RESULTS and add any new instructions to SEQ.
6267 The approach we use is:
6269 (1) Find a vector mode VM with integer elements of mode IM.
6271 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
6272 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
6273 from small vectors to IM.
6275 (3) Duplicate each ELTS'[I] into a vector of mode VM.
6277 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
6278 correct byte contents.
6280 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
6282 We try to find the largest IM for which this sequence works, in order
6283 to cut down on the number of interleaves. */
6286 duplicate_and_interleave (vec_info
*vinfo
, gimple_seq
*seq
, tree vector_type
,
6287 const vec
<tree
> &elts
, unsigned int nresults
,
6290 unsigned int nelts
= elts
.length ();
6291 tree element_type
= TREE_TYPE (vector_type
);
6293 /* (1) Find a vector mode VM with integer elements of mode IM. */
6294 unsigned int nvectors
= 1;
6295 tree new_vector_type
;
6297 if (!can_duplicate_and_interleave_p (vinfo
, nelts
, element_type
,
6298 &nvectors
, &new_vector_type
,
6302 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
6303 unsigned int partial_nelts
= nelts
/ nvectors
;
6304 tree partial_vector_type
= build_vector_type (element_type
, partial_nelts
);
6306 tree_vector_builder partial_elts
;
6307 auto_vec
<tree
, 32> pieces (nvectors
* 2);
6308 pieces
.quick_grow_cleared (nvectors
* 2);
6309 for (unsigned int i
= 0; i
< nvectors
; ++i
)
6311 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
6312 ELTS' has mode IM. */
6313 partial_elts
.new_vector (partial_vector_type
, partial_nelts
, 1);
6314 for (unsigned int j
= 0; j
< partial_nelts
; ++j
)
6315 partial_elts
.quick_push (elts
[i
* partial_nelts
+ j
]);
6316 tree t
= gimple_build_vector (seq
, &partial_elts
);
6317 t
= gimple_build (seq
, VIEW_CONVERT_EXPR
,
6318 TREE_TYPE (new_vector_type
), t
);
6320 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
6321 pieces
[i
] = gimple_build_vector_from_val (seq
, new_vector_type
, t
);
6324 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
6325 correct byte contents.
6327 Conceptually, we need to repeat the following operation log2(nvectors)
6328 times, where hi_start = nvectors / 2:
6330 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
6331 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
6333 However, if each input repeats every N elements and the VF is
6334 a multiple of N * 2, the HI result is the same as the LO result.
6335 This will be true for the first N1 iterations of the outer loop,
6336 followed by N2 iterations for which both the LO and HI results
6339 N1 + N2 = log2(nvectors)
6341 Each "N1 iteration" doubles the number of redundant vectors and the
6342 effect of the process as a whole is to have a sequence of nvectors/2**N1
6343 vectors that repeats 2**N1 times. Rather than generate these redundant
6344 vectors, we halve the number of vectors for each N1 iteration. */
6345 unsigned int in_start
= 0;
6346 unsigned int out_start
= nvectors
;
6347 unsigned int new_nvectors
= nvectors
;
6348 for (unsigned int in_repeat
= 1; in_repeat
< nvectors
; in_repeat
*= 2)
6350 unsigned int hi_start
= new_nvectors
/ 2;
6351 unsigned int out_i
= 0;
6352 for (unsigned int in_i
= 0; in_i
< new_nvectors
; ++in_i
)
6355 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type
),
6359 tree output
= make_ssa_name (new_vector_type
);
6360 tree input1
= pieces
[in_start
+ (in_i
/ 2)];
6361 tree input2
= pieces
[in_start
+ (in_i
/ 2) + hi_start
];
6362 gassign
*stmt
= gimple_build_assign (output
, VEC_PERM_EXPR
,
6364 permutes
[in_i
& 1]);
6365 gimple_seq_add_stmt (seq
, stmt
);
6366 pieces
[out_start
+ out_i
] = output
;
6369 std::swap (in_start
, out_start
);
6370 new_nvectors
= out_i
;
6373 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
6374 results
.reserve (nresults
);
6375 for (unsigned int i
= 0; i
< nresults
; ++i
)
6376 if (i
< new_nvectors
)
6377 results
.quick_push (gimple_build (seq
, VIEW_CONVERT_EXPR
, vector_type
,
6378 pieces
[in_start
+ i
]));
6380 results
.quick_push (results
[i
- new_nvectors
]);
/* NOTE(review): this extract is lossy -- the original file's line numbers
   are embedded in the text and several source lines (declarations of OP,
   ORIG_OP, VEC_CST, VOP, some else-branches and braces) were dropped by
   the extraction.  Text below is kept byte-identical; consult the complete
   tree-vect-slp.cc before editing any logic here.  */
6384 /* For constant and loop invariant defs in OP_NODE this function creates
6385 vector defs that will be used in the vectorized stmts and stores them
6386 to SLP_TREE_VEC_DEFS of OP_NODE. */
6389 vect_create_constant_vectors (vec_info
*vinfo
, slp_tree op_node
)
6391 unsigned HOST_WIDE_INT nunits
;
6393 unsigned j
, number_of_places_left_in_vector
;
6396 int group_size
= op_node
->ops
.length ();
6397 unsigned int vec_num
, i
;
6398 unsigned number_of_copies
= 1;
6400 gimple_seq ctor_seq
= NULL
;
6401 auto_vec
<tree
, 16> permute_results
;
6403 /* We always want SLP_TREE_VECTYPE (op_node) here correctly set. */
6404 vector_type
= SLP_TREE_VECTYPE (op_node
);
6406 unsigned int number_of_vectors
= SLP_TREE_NUMBER_OF_VEC_STMTS (op_node
);
6407 SLP_TREE_VEC_DEFS (op_node
).create (number_of_vectors
);
6408 auto_vec
<tree
> voprnds (number_of_vectors
);
6410 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
6411 created vectors. It is greater than 1 if unrolling is performed.
6413 For example, we have two scalar operands, s1 and s2 (e.g., group of
6414 strided accesses of size two), while NUNITS is four (i.e., four scalars
6415 of this type can be packed in a vector). The output vector will contain
6416 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
6419 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
6420 containing the operands.
6422 For example, NUNITS is four as before, and the group size is 8
6423 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
6424 {s5, s6, s7, s8}. */
6426 /* When using duplicate_and_interleave, we just need one element for
6427 each scalar statement. */
6428 if (!TYPE_VECTOR_SUBPARTS (vector_type
).is_constant (&nunits
))
6429 nunits
= group_size
;
6431 number_of_copies
= nunits
* number_of_vectors
/ group_size
;
6433 number_of_places_left_in_vector
= nunits
;
6435 tree_vector_builder
elts (vector_type
, nunits
, 1);
6436 elts
.quick_grow (nunits
);
6437 stmt_vec_info insert_after
= NULL
;
/* Fill ELTS from the last group element backwards; elements are placed
   at decreasing NUMBER_OF_PLACES_LEFT_IN_VECTOR below.  */
6438 for (j
= 0; j
< number_of_copies
; j
++)
6441 for (i
= group_size
- 1; op_node
->ops
.iterate (i
, &op
); i
--)
6443 /* Create 'vect_ = {op0,op1,...,opn}'. */
6444 number_of_places_left_in_vector
--;
6446 if (!types_compatible_p (TREE_TYPE (vector_type
), TREE_TYPE (op
)))
6448 if (CONSTANT_CLASS_P (op
))
6450 if (VECTOR_BOOLEAN_TYPE_P (vector_type
))
6452 /* Can't use VIEW_CONVERT_EXPR for booleans because
6453 of possibly different sizes of scalar value and
6455 if (integer_zerop (op
))
6456 op
= build_int_cst (TREE_TYPE (vector_type
), 0);
6457 else if (integer_onep (op
))
6458 op
= build_all_ones_cst (TREE_TYPE (vector_type
));
6463 op
= fold_unary (VIEW_CONVERT_EXPR
,
6464 TREE_TYPE (vector_type
), op
);
6465 gcc_assert (op
&& CONSTANT_CLASS_P (op
));
6469 tree new_temp
= make_ssa_name (TREE_TYPE (vector_type
));
6471 if (VECTOR_BOOLEAN_TYPE_P (vector_type
))
6474 = build_all_ones_cst (TREE_TYPE (vector_type
));
6476 = build_zero_cst (TREE_TYPE (vector_type
));
6477 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op
)));
6478 init_stmt
= gimple_build_assign (new_temp
, COND_EXPR
,
6484 op
= build1 (VIEW_CONVERT_EXPR
, TREE_TYPE (vector_type
),
6487 = gimple_build_assign (new_temp
, VIEW_CONVERT_EXPR
,
6490 gimple_seq_add_stmt (&ctor_seq
, init_stmt
);
6494 elts
[number_of_places_left_in_vector
] = op
;
6495 if (!CONSTANT_CLASS_P (op
))
6497 /* For BB vectorization we have to compute an insert location
6498 when a def is inside the analyzed region since we cannot
6499 simply insert at the BB start in this case. */
6500 stmt_vec_info opdef
;
6501 if (TREE_CODE (orig_op
) == SSA_NAME
6502 && !SSA_NAME_IS_DEFAULT_DEF (orig_op
)
6503 && is_a
<bb_vec_info
> (vinfo
)
6504 && (opdef
= vinfo
->lookup_def (orig_op
)))
6507 insert_after
= opdef
;
6509 insert_after
= get_later_stmt (insert_after
, opdef
);
/* A full vector worth of elements has been collected: materialize it,
   either directly or via duplicate_and_interleave for VLA vectors.  */
6512 if (number_of_places_left_in_vector
== 0)
6515 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type
), nunits
)
6516 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type
), nunits
))
6517 vec_cst
= gimple_build_vector (&ctor_seq
, &elts
);
6520 if (permute_results
.is_empty ())
6521 duplicate_and_interleave (vinfo
, &ctor_seq
, vector_type
,
6522 elts
, number_of_vectors
,
6524 vec_cst
= permute_results
[number_of_vectors
- j
- 1];
6526 if (!gimple_seq_empty_p (ctor_seq
))
6530 gimple_stmt_iterator gsi
;
6531 if (gimple_code (insert_after
->stmt
) == GIMPLE_PHI
)
6533 gsi
= gsi_after_labels (gimple_bb (insert_after
->stmt
));
6534 gsi_insert_seq_before (&gsi
, ctor_seq
,
6535 GSI_CONTINUE_LINKING
);
6537 else if (!stmt_ends_bb_p (insert_after
->stmt
))
6539 gsi
= gsi_for_stmt (insert_after
->stmt
);
6540 gsi_insert_seq_after (&gsi
, ctor_seq
,
6541 GSI_CONTINUE_LINKING
);
6545 /* When we want to insert after a def where the
6546 defining stmt throws then insert on the fallthru
6548 edge e
= find_fallthru_edge
6549 (gimple_bb (insert_after
->stmt
)->succs
);
6551 = gsi_insert_seq_on_edge_immediate (e
, ctor_seq
);
6552 gcc_assert (!new_bb
);
6556 vinfo
->insert_seq_on_entry (NULL
, ctor_seq
);
6559 voprnds
.quick_push (vec_cst
);
6560 insert_after
= NULL
;
6561 number_of_places_left_in_vector
= nunits
;
6563 elts
.new_vector (vector_type
, nunits
, 1);
6564 elts
.quick_grow (nunits
);
6569 /* Since the vectors are created in the reverse order, we should invert
6571 vec_num
= voprnds
.length ();
6572 for (j
= vec_num
; j
!= 0; j
--)
6574 vop
= voprnds
[j
- 1];
6575 SLP_TREE_VEC_DEFS (op_node
).quick_push (vop
);
6578 /* In case that VF is greater than the unrolling factor needed for the SLP
6579 group of stmts, NUMBER_OF_VECTORS to be created is greater than
6580 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
6581 to replicate the vectors. */
6582 while (number_of_vectors
> SLP_TREE_VEC_DEFS (op_node
).length ())
6583 for (i
= 0; SLP_TREE_VEC_DEFS (op_node
).iterate (i
, &vop
) && i
< vec_num
;
6585 SLP_TREE_VEC_DEFS (op_node
).quick_push (vop
);
/* NOTE(review): lossy extract -- the return type line and braces were
   dropped; text is kept byte-identical.  Visible behavior: prefer the LHS
   of the I'th vectorized stmt when VEC_STMTS exists, otherwise fall back
   to the I'th entry of VEC_DEFS.  */
6588 /* Get the Ith vectorized definition from SLP_NODE. */
6591 vect_get_slp_vect_def (slp_tree slp_node
, unsigned i
)
6593 if (SLP_TREE_VEC_STMTS (slp_node
).exists ())
6594 return gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node
)[i
]);
6596 return SLP_TREE_VEC_DEFS (slp_node
)[i
];
/* NOTE(review): lossy extract -- return type, braces and the declaration
   of J were dropped; text kept byte-identical.  Visible behavior: for an
   internal def the LHS of every vectorized stmt is pushed into *VEC_DEFS,
   otherwise the node's VEC_DEFS vector is spliced in wholesale.  */
6599 /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS. */
6602 vect_get_slp_defs (slp_tree slp_node
, vec
<tree
> *vec_defs
)
6604 vec_defs
->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node
));
6605 if (SLP_TREE_DEF_TYPE (slp_node
) == vect_internal_def
)
6608 gimple
*vec_def_stmt
;
6609 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node
), j
, vec_def_stmt
)
6610 vec_defs
->quick_push (gimple_get_lhs (vec_def_stmt
));
6613 vec_defs
->splice (SLP_TREE_VEC_DEFS (slp_node
));
/* NOTE(review): lossy extract -- return type, braces and the code that
   defaults N (presumably when the caller passes 0 -- TODO confirm against
   the full file) were dropped; text kept byte-identical.  Visible
   behavior: collects per-child def vectors into *VEC_OPRNDS by calling
   the single-node vect_get_slp_defs overload for each child.  */
6616 /* Get N vectorized definitions for SLP_NODE. */
6619 vect_get_slp_defs (vec_info
*,
6620 slp_tree slp_node
, vec
<vec
<tree
> > *vec_oprnds
, unsigned n
)
6623 n
= SLP_TREE_CHILDREN (slp_node
).length ();
6625 for (unsigned i
= 0; i
< n
; ++i
)
6627 slp_tree child
= SLP_TREE_CHILDREN (slp_node
)[i
];
6628 vec
<tree
> vec_defs
= vNULL
;
6629 vect_get_slp_defs (child
, &vec_defs
);
6630 vec_oprnds
->quick_push (vec_defs
);
/* NOTE(review): lossy extract -- embedded line numbers, wrapped tokens,
   and numerous dropped lines (declarations of MODE, VEC_INDEX, NOOP_P,
   PERM_DEST, PERM_STMT, several braces/else branches and early returns).
   Text kept byte-identical; consult the complete tree-vect-slp.cc before
   editing logic.  */
6634 /* Generate vector permute statements from a list of loads in DR_CHAIN.
6635 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
6636 permute statements for the SLP node NODE. Store the number of vector
6637 permute instructions in *N_PERMS and the number of vector load
6638 instructions in *N_LOADS. If DCE_CHAIN is true, remove all definitions
6639 that were not needed. */
6642 vect_transform_slp_perm_load (vec_info
*vinfo
,
6643 slp_tree node
, const vec
<tree
> &dr_chain
,
6644 gimple_stmt_iterator
*gsi
, poly_uint64 vf
,
6645 bool analyze_only
, unsigned *n_perms
,
6646 unsigned int *n_loads
, bool dce_chain
)
6648 stmt_vec_info stmt_info
= SLP_TREE_SCALAR_STMTS (node
)[0];
6650 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
6651 unsigned int group_size
= SLP_TREE_SCALAR_STMTS (node
).length ();
6652 unsigned int mask_element
;
6655 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info
))
6658 stmt_info
= DR_GROUP_FIRST_ELEMENT (stmt_info
);
6660 mode
= TYPE_MODE (vectype
);
6661 poly_uint64 nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
6663 /* Initialize the vect stmts of NODE to properly insert the generated
6666 for (unsigned i
= SLP_TREE_VEC_STMTS (node
).length ();
6667 i
< SLP_TREE_NUMBER_OF_VEC_STMTS (node
); i
++)
6668 SLP_TREE_VEC_STMTS (node
).quick_push (NULL
);
6670 /* Generate permutation masks for every NODE. Number of masks for each NODE
6671 is equal to GROUP_SIZE.
6672 E.g., we have a group of three nodes with three loads from the same
6673 location in each node, and the vector size is 4. I.e., we have a
6674 a0b0c0a1b1c1... sequence and we need to create the following vectors:
6675 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
6676 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
6679 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
6680 The last mask is illegal since we assume two operands for permute
6681 operation, and the mask element values can't be outside that range.
6682 Hence, the last mask must be converted into {2,5,5,5}.
6683 For the first two permutations we need the first and the second input
6684 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
6685 we need the second and the third vectors: {b1,c1,a2,b2} and
6688 int vect_stmts_counter
= 0;
6689 unsigned int index
= 0;
6690 int first_vec_index
= -1;
6691 int second_vec_index
= -1;
6695 vec_perm_builder mask
;
6696 unsigned int nelts_to_build
;
6697 unsigned int nvectors_per_build
;
6698 unsigned int in_nlanes
;
6699 bool repeating_p
= (group_size
== DR_GROUP_SIZE (stmt_info
)
6700 && multiple_p (nunits
, group_size
));
6703 /* A single vector contains a whole number of copies of the node, so:
6704 (a) all permutes can use the same mask; and
6705 (b) the permutes only need a single vector input. */
6706 mask
.new_vector (nunits
, group_size
, 3);
6707 nelts_to_build
= mask
.encoded_nelts ();
6708 nvectors_per_build
= SLP_TREE_VEC_STMTS (node
).length ();
6709 in_nlanes
= DR_GROUP_SIZE (stmt_info
) * 3;
6713 /* We need to construct a separate mask for each vector statement. */
6714 unsigned HOST_WIDE_INT const_nunits
, const_vf
;
6715 if (!nunits
.is_constant (&const_nunits
)
6716 || !vf
.is_constant (&const_vf
))
6718 mask
.new_vector (const_nunits
, const_nunits
, 1);
6719 nelts_to_build
= const_vf
* group_size
;
6720 nvectors_per_build
= 1;
6721 in_nlanes
= const_vf
* DR_GROUP_SIZE (stmt_info
);
6723 auto_sbitmap
used_in_lanes (in_nlanes
);
6724 bitmap_clear (used_in_lanes
);
6725 auto_bitmap used_defs
;
6727 unsigned int count
= mask
.encoded_nelts ();
6728 mask
.quick_grow (count
);
6729 vec_perm_indices indices
;
/* Main mask-building loop: map each scalar lane through the node's load
   permutation and assign it to the first or second input vector.  */
6731 for (unsigned int j
= 0; j
< nelts_to_build
; j
++)
6733 unsigned int iter_num
= j
/ group_size
;
6734 unsigned int stmt_num
= j
% group_size
;
6735 unsigned int i
= (iter_num
* DR_GROUP_SIZE (stmt_info
)
6736 + SLP_TREE_LOAD_PERMUTATION (node
)[stmt_num
]);
6737 bitmap_set_bit (used_in_lanes
, i
);
6740 first_vec_index
= 0;
6745 /* Enforced before the loop when !repeating_p. */
6746 unsigned int const_nunits
= nunits
.to_constant ();
6747 vec_index
= i
/ const_nunits
;
6748 mask_element
= i
% const_nunits
;
6749 if (vec_index
== first_vec_index
6750 || first_vec_index
== -1)
6752 first_vec_index
= vec_index
;
6754 else if (vec_index
== second_vec_index
6755 || second_vec_index
== -1)
6757 second_vec_index
= vec_index
;
6758 mask_element
+= const_nunits
;
6762 if (dump_enabled_p ())
6763 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6764 "permutation requires at "
6765 "least three vectors %G",
6767 gcc_assert (analyze_only
);
6771 gcc_assert (mask_element
< 2 * const_nunits
);
6774 if (mask_element
!= index
)
6776 mask
[index
++] = mask_element
;
6778 if (index
== count
&& !noop_p
)
6780 indices
.new_vector (mask
, second_vec_index
== -1 ? 1 : 2, nunits
);
6781 if (!can_vec_perm_const_p (mode
, mode
, indices
))
6783 if (dump_enabled_p ())
6785 dump_printf_loc (MSG_MISSED_OPTIMIZATION
,
6787 "unsupported vect permute { ");
6788 for (i
= 0; i
< count
; ++i
)
6790 dump_dec (MSG_MISSED_OPTIMIZATION
, mask
[i
]);
6791 dump_printf (MSG_MISSED_OPTIMIZATION
, " ");
6793 dump_printf (MSG_MISSED_OPTIMIZATION
, "}\n");
6795 gcc_assert (analyze_only
);
6806 tree mask_vec
= NULL_TREE
;
6809 mask_vec
= vect_gen_perm_mask_checked (vectype
, indices
);
6811 if (second_vec_index
== -1)
6812 second_vec_index
= first_vec_index
;
6814 for (unsigned int ri
= 0; ri
< nvectors_per_build
; ++ri
)
6816 /* Generate the permute statement if necessary. */
6817 tree first_vec
= dr_chain
[first_vec_index
+ ri
];
6818 tree second_vec
= dr_chain
[second_vec_index
+ ri
];
6822 gassign
*stmt
= as_a
<gassign
*> (stmt_info
->stmt
);
6824 = vect_create_destination_var (gimple_assign_lhs (stmt
),
6826 perm_dest
= make_ssa_name (perm_dest
);
6828 = gimple_build_assign (perm_dest
, VEC_PERM_EXPR
,
6829 first_vec
, second_vec
,
6831 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
,
6835 bitmap_set_bit (used_defs
, first_vec_index
+ ri
);
6836 bitmap_set_bit (used_defs
, second_vec_index
+ ri
);
6841 /* If mask was NULL_TREE generate the requested
6842 identity transform. */
6843 perm_stmt
= SSA_NAME_DEF_STMT (first_vec
);
6845 bitmap_set_bit (used_defs
, first_vec_index
+ ri
);
6848 /* Store the vector statement in NODE. */
6849 SLP_TREE_VEC_STMTS (node
)[vect_stmts_counter
++] = perm_stmt
;
6854 first_vec_index
= -1;
6855 second_vec_index
= -1;
6863 *n_loads
= SLP_TREE_NUMBER_OF_VEC_STMTS (node
);
6866 /* Enforced above when !repeating_p. */
6867 unsigned int const_nunits
= nunits
.to_constant ();
6869 bool load_seen
= false;
6870 for (unsigned i
= 0; i
< in_nlanes
; ++i
)
6872 if (i
% const_nunits
== 0)
6878 if (bitmap_bit_p (used_in_lanes
, i
))
/* DCE of unused loads in DR_CHAIN (reached when DCE_CHAIN -- the guard
   line was dropped by the extraction; confirm against the full file).  */
6887 for (unsigned i
= 0; i
< dr_chain
.length (); ++i
)
6888 if (!bitmap_bit_p (used_defs
, i
))
6890 gimple
*stmt
= SSA_NAME_DEF_STMT (dr_chain
[i
]);
6891 gimple_stmt_iterator rgsi
= gsi_for_stmt (stmt
);
6892 gsi_remove (&rgsi
, true);
6893 release_defs (stmt
);
/* NOTE(review): lossy extract -- the MASK_VEC parameter line, the
   declarations of CONV_STMT/PERM_STMT and several braces/else branches
   were dropped.  Text kept byte-identical.  */
6899 /* Produce the next vector result for SLP permutation NODE by adding a vector
6900 statement at GSI. If MASK_VEC is nonnull, add:
6902 <new SSA name> = VEC_PERM_EXPR <FIRST_DEF, SECOND_DEF, MASK_VEC>
6906 <new SSA name> = FIRST_DEF. */
6909 vect_add_slp_permutation (vec_info
*vinfo
, gimple_stmt_iterator
*gsi
,
6910 slp_tree node
, tree first_def
, tree second_def
,
6913 tree vectype
= SLP_TREE_VECTYPE (node
);
6915 /* ??? We SLP match existing vector element extracts but
6916 allow punning which we need to re-instantiate at uses
6917 but have no good way of explicitly representing. */
6918 if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def
)), TYPE_SIZE (vectype
))
6919 && !types_compatible_p (TREE_TYPE (first_def
), vectype
))
6922 = gimple_build_assign (make_ssa_name (vectype
),
6923 build1 (VIEW_CONVERT_EXPR
, vectype
, first_def
));
6924 vect_finish_stmt_generation (vinfo
, NULL
, conv_stmt
, gsi
);
6925 first_def
= gimple_assign_lhs (conv_stmt
);
6928 tree perm_dest
= make_ssa_name (vectype
);
6931 if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def
)),
6932 TYPE_SIZE (vectype
))
6933 && !types_compatible_p (TREE_TYPE (second_def
), vectype
))
6936 = gimple_build_assign (make_ssa_name (vectype
),
6937 build1 (VIEW_CONVERT_EXPR
,
6938 vectype
, second_def
));
6939 vect_finish_stmt_generation (vinfo
, NULL
, conv_stmt
, gsi
);
6940 second_def
= gimple_assign_lhs (conv_stmt
);
6942 perm_stmt
= gimple_build_assign (perm_dest
, VEC_PERM_EXPR
,
6943 first_def
, second_def
,
6946 else if (!types_compatible_p (TREE_TYPE (first_def
), vectype
))
6948 /* For identity permutes we still need to handle the case
6949 of lowpart extracts or concats. */
6950 unsigned HOST_WIDE_INT c
;
6951 auto first_def_nunits
6952 = TYPE_VECTOR_SUBPARTS (TREE_TYPE (first_def
));
6953 if (known_le (TYPE_VECTOR_SUBPARTS (vectype
), first_def_nunits
))
6955 tree lowpart
= build3 (BIT_FIELD_REF
, vectype
, first_def
,
6956 TYPE_SIZE (vectype
), bitsize_zero_node
);
6957 perm_stmt
= gimple_build_assign (perm_dest
, lowpart
);
6959 else if (constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype
),
6960 first_def_nunits
, &c
) && c
== 2)
6962 tree ctor
= build_constructor_va (vectype
, 2, NULL_TREE
, first_def
,
6963 NULL_TREE
, second_def
);
6964 perm_stmt
= gimple_build_assign (perm_dest
, ctor
);
6971 /* We need a copy here in case the def was external. */
6972 perm_stmt
= gimple_build_assign (perm_dest
, first_def
);
6974 vect_finish_stmt_generation (vinfo
, NULL
, perm_stmt
, gsi
);
6975 /* Store the vector statement in NODE. */
6976 SLP_TREE_VEC_STMTS (node
).quick_push (perm_stmt
);
/* NOTE(review): lossy extract -- embedded line numbers, wrapped tokens,
   and many dropped lines (declarations of I, CHILD, NPATTERNS, NCOPIES,
   FIRST_NODE/SECOND_NODE, FIRST_DEF/SECOND_DEF, the GSI/analysis-only
   branches, several returns and braces).  Text kept byte-identical;
   consult the complete tree-vect-slp.cc before editing logic.  */
6979 /* Vectorize the SLP permutations in NODE as specified
6980 in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
6981 child number and lane number.
6982 Interleaving of two two-lane two-child SLP subtrees (not supported):
6983 [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
6984 A blend of two four-lane two-child SLP subtrees:
6985 [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
6986 Highpart of a four-lane one-child SLP subtree (not supported):
6987 [ { 0, 2 }, { 0, 3 } ]
6988 Where currently only a subset is supported by code generating below. */
6991 vectorizable_slp_permutation (vec_info
*vinfo
, gimple_stmt_iterator
*gsi
,
6992 slp_tree node
, stmt_vector_for_cost
*cost_vec
)
6994 tree vectype
= SLP_TREE_VECTYPE (node
);
6996 /* ??? We currently only support all same vector input types
6997 while the SLP IL should really do a concat + select and thus accept
6998 arbitrary mismatches. */
7001 poly_uint64 nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
7002 bool repeating_p
= multiple_p (nunits
, SLP_TREE_LANES (node
));
7003 tree op_vectype
= NULL_TREE
;
7004 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
7005 if (SLP_TREE_VECTYPE (child
))
7007 op_vectype
= SLP_TREE_VECTYPE (child
);
7011 op_vectype
= vectype
;
7012 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
7014 if ((SLP_TREE_DEF_TYPE (child
) != vect_internal_def
7015 && !vect_maybe_update_slp_op_vectype (child
, op_vectype
))
7016 || !types_compatible_p (SLP_TREE_VECTYPE (child
), op_vectype
)
7017 || !types_compatible_p (TREE_TYPE (vectype
), TREE_TYPE (op_vectype
)))
7019 if (dump_enabled_p ())
7020 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7021 "Unsupported vector types in lane permutation\n");
7024 if (SLP_TREE_LANES (child
) != SLP_TREE_LANES (node
))
7025 repeating_p
= false;
7028 vec
<std::pair
<unsigned, unsigned> > &perm
= SLP_TREE_LANE_PERMUTATION (node
);
7029 gcc_assert (perm
.length () == SLP_TREE_LANES (node
));
7030 if (dump_enabled_p ())
7032 dump_printf_loc (MSG_NOTE
, vect_location
,
7033 "vectorizing permutation");
7034 for (unsigned i
= 0; i
< perm
.length (); ++i
)
7035 dump_printf (MSG_NOTE
, " op%u[%u]", perm
[i
].first
, perm
[i
].second
);
7037 dump_printf (MSG_NOTE
, " (repeat %d)\n", SLP_TREE_LANES (node
));
7038 dump_printf (MSG_NOTE
, "\n");
7041 /* REPEATING_P is true if every output vector is guaranteed to use the
7042 same permute vector. We can handle that case for both variable-length
7043 and constant-length vectors, but we only handle other cases for
7044 constant-length vectors.
7048 - NPATTERNS and NELTS_PER_PATTERN to the encoding of the permute
7049 mask vector that we want to build.
7051 - NCOPIES to the number of copies of PERM that we need in order
7052 to build the necessary permute mask vectors.
7054 - NOUTPUTS_PER_MASK to the number of output vectors we want to create
7055 for each permute mask vector. This is only relevant when GSI is
7058 unsigned nelts_per_pattern
;
7060 unsigned noutputs_per_mask
;
7063 /* We need a single permute mask vector that has the form:
7065 { X1, ..., Xn, X1 + n, ..., Xn + n, X1 + 2n, ..., Xn + 2n, ... }
7067 In other words, the original n-element permute in PERM is
7068 "unrolled" to fill a full vector. The stepped vector encoding
7069 that we use for permutes requires 3n elements. */
7070 npatterns
= SLP_TREE_LANES (node
);
7071 nelts_per_pattern
= ncopies
= 3;
7072 noutputs_per_mask
= SLP_TREE_NUMBER_OF_VEC_STMTS (node
);
7076 /* Calculate every element of every permute mask vector explicitly,
7077 instead of relying on the pattern described above. */
7078 if (!nunits
.is_constant (&npatterns
))
7080 nelts_per_pattern
= ncopies
= 1;
7081 if (loop_vec_info linfo
= dyn_cast
<loop_vec_info
> (vinfo
))
7082 if (!LOOP_VINFO_VECT_FACTOR (linfo
).is_constant (&ncopies
))
7084 noutputs_per_mask
= 1;
7086 unsigned olanes
= ncopies
* SLP_TREE_LANES (node
);
7087 gcc_assert (repeating_p
|| multiple_p (olanes
, nunits
));
7089 /* Compute the { { SLP operand, vector index}, lane } permutation sequence
7090 from the { SLP operand, scalar lane } permutation as recorded in the
7091 SLP node as intermediate step. This part should already work
7092 with SLP children with arbitrary number of lanes. */
7093 auto_vec
<std::pair
<std::pair
<unsigned, unsigned>, unsigned> > vperm
;
7094 auto_vec
<unsigned> active_lane
;
7095 vperm
.create (olanes
);
7096 active_lane
.safe_grow_cleared (SLP_TREE_CHILDREN (node
).length (), true);
7097 for (unsigned i
= 0; i
< ncopies
; ++i
)
7099 for (unsigned pi
= 0; pi
< perm
.length (); ++pi
)
7101 std::pair
<unsigned, unsigned> p
= perm
[pi
];
7102 tree vtype
= SLP_TREE_VECTYPE (SLP_TREE_CHILDREN (node
)[p
.first
]);
7104 vperm
.quick_push ({{p
.first
, 0}, p
.second
+ active_lane
[p
.first
]});
7107 /* We checked above that the vectors are constant-length. */
7108 unsigned vnunits
= TYPE_VECTOR_SUBPARTS (vtype
).to_constant ();
7109 unsigned vi
= (active_lane
[p
.first
] + p
.second
) / vnunits
;
7110 unsigned vl
= (active_lane
[p
.first
] + p
.second
) % vnunits
;
7111 vperm
.quick_push ({{p
.first
, vi
}, vl
});
7114 /* Advance to the next group. */
7115 for (unsigned j
= 0; j
< SLP_TREE_CHILDREN (node
).length (); ++j
)
7116 active_lane
[j
] += SLP_TREE_LANES (SLP_TREE_CHILDREN (node
)[j
]);
7119 if (dump_enabled_p ())
7121 dump_printf_loc (MSG_NOTE
, vect_location
, "as");
7122 for (unsigned i
= 0; i
< vperm
.length (); ++i
)
7126 ? multiple_p (i
, npatterns
)
7127 : multiple_p (i
, TYPE_VECTOR_SUBPARTS (vectype
))))
7128 dump_printf (MSG_NOTE
, ",");
7129 dump_printf (MSG_NOTE
, " vops%u[%u][%u]",
7130 vperm
[i
].first
.first
, vperm
[i
].first
.second
,
7133 dump_printf (MSG_NOTE
, "\n");
7136 /* We can only handle two-vector permutes, everything else should
7137 be lowered on the SLP level. The following is closely inspired
7138 by vect_transform_slp_perm_load and is supposed to eventually
7140 ??? As intermediate step do code-gen in the SLP tree representation
7142 std::pair
<unsigned, unsigned> first_vec
= std::make_pair (-1U, -1U);
7143 std::pair
<unsigned, unsigned> second_vec
= std::make_pair (-1U, -1U);
7144 unsigned int index
= 0;
7145 poly_uint64 mask_element
;
7146 vec_perm_builder mask
;
7147 mask
.new_vector (nunits
, npatterns
, nelts_per_pattern
);
7148 unsigned int count
= mask
.encoded_nelts ();
7149 mask
.quick_grow (count
);
7150 vec_perm_indices indices
;
7151 unsigned nperms
= 0;
7152 for (unsigned i
= 0; i
< vperm
.length (); ++i
)
7154 mask_element
= vperm
[i
].second
;
7155 if (first_vec
.first
== -1U
7156 || first_vec
== vperm
[i
].first
)
7157 first_vec
= vperm
[i
].first
;
7158 else if (second_vec
.first
== -1U
7159 || second_vec
== vperm
[i
].first
)
7161 second_vec
= vperm
[i
].first
;
7162 mask_element
+= nunits
;
7166 if (dump_enabled_p ())
7167 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7168 "permutation requires at "
7169 "least three vectors\n");
7174 mask
[index
++] = mask_element
;
7178 indices
.new_vector (mask
, second_vec
.first
== -1U ? 1 : 2,
7179 TYPE_VECTOR_SUBPARTS (op_vectype
));
7180 bool identity_p
= indices
.series_p (0, 1, 0, 1);
7181 machine_mode vmode
= TYPE_MODE (vectype
);
7182 machine_mode op_vmode
= TYPE_MODE (op_vectype
);
7183 unsigned HOST_WIDE_INT c
;
7185 && !can_vec_perm_const_p (vmode
, op_vmode
, indices
))
7187 && !known_le (nunits
,
7188 TYPE_VECTOR_SUBPARTS (op_vectype
))
7189 && (!constant_multiple_p (nunits
,
7190 TYPE_VECTOR_SUBPARTS (op_vectype
),
7193 if (dump_enabled_p ())
7195 dump_printf_loc (MSG_MISSED_OPTIMIZATION
,
7197 "unsupported vect permute { ");
7198 for (i
= 0; i
< count
; ++i
)
7200 dump_dec (MSG_MISSED_OPTIMIZATION
, mask
[i
]);
7201 dump_printf (MSG_MISSED_OPTIMIZATION
, " ");
7203 dump_printf (MSG_MISSED_OPTIMIZATION
, "}\n");
7213 if (second_vec
.first
== -1U)
7214 second_vec
= first_vec
;
7217 first_node
= SLP_TREE_CHILDREN (node
)[first_vec
.first
],
7218 second_node
= SLP_TREE_CHILDREN (node
)[second_vec
.first
];
7220 tree mask_vec
= NULL_TREE
;
7222 mask_vec
= vect_gen_perm_mask_checked (vectype
, indices
);
7224 for (unsigned int vi
= 0; vi
< noutputs_per_mask
; ++vi
)
7227 = vect_get_slp_vect_def (first_node
,
7228 first_vec
.second
+ vi
);
7230 = vect_get_slp_vect_def (second_node
,
7231 second_vec
.second
+ vi
);
7232 vect_add_slp_permutation (vinfo
, gsi
, node
, first_def
,
7233 second_def
, mask_vec
);
7238 first_vec
= std::make_pair (-1U, -1U);
7239 second_vec
= std::make_pair (-1U, -1U);
/* Analysis-only path: account the permutes in the cost vector.  */
7244 record_stmt_cost (cost_vec
, nperms
, vec_perm
, node
, vectype
, 0, vect_body
);
/* NOTE(review): lossy extract -- embedded line numbers, wrapped tokens,
   dropped declarations (I, CHILD, J, VSTMT, DEF, VDEF), dropped early
   returns and braces.  Text kept byte-identical; consult the complete
   tree-vect-slp.cc before editing logic.  */
7249 /* Vectorize SLP NODE. */
7252 vect_schedule_slp_node (vec_info
*vinfo
,
7253 slp_tree node
, slp_instance instance
)
7255 gimple_stmt_iterator si
;
7259 /* For existing vectors there's nothing to do. */
7260 if (SLP_TREE_VEC_DEFS (node
).exists ())
7263 gcc_assert (SLP_TREE_VEC_STMTS (node
).is_empty ());
7265 /* Vectorize externals and constants. */
7266 if (SLP_TREE_DEF_TYPE (node
) == vect_constant_def
7267 || SLP_TREE_DEF_TYPE (node
) == vect_external_def
)
7269 /* ??? vectorizable_shift can end up using a scalar operand which is
7270 currently denoted as !SLP_TREE_VECTYPE. No need to vectorize the
7271 node in this case. */
7272 if (!SLP_TREE_VECTYPE (node
))
7275 vect_create_constant_vectors (vinfo
, node
);
7279 stmt_vec_info stmt_info
= SLP_TREE_REPRESENTATIVE (node
);
7281 gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node
) != 0);
7282 SLP_TREE_VEC_STMTS (node
).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node
));
7284 if (dump_enabled_p ())
7285 dump_printf_loc (MSG_NOTE
, vect_location
,
7286 "------>vectorizing SLP node starting from: %G",
7289 if (STMT_VINFO_DATA_REF (stmt_info
)
7290 && SLP_TREE_CODE (node
) != VEC_PERM_EXPR
)
7292 /* Vectorized loads go before the first scalar load to make it
7293 ready early, vectorized stores go before the last scalar
7294 stmt which is where all uses are ready. */
7295 stmt_vec_info last_stmt_info
= NULL
;
7296 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info
)))
7297 last_stmt_info
= vect_find_first_scalar_stmt_in_slp (node
);
7298 else /* DR_IS_WRITE */
7299 last_stmt_info
= vect_find_last_scalar_stmt_in_slp (node
);
7300 si
= gsi_for_stmt (last_stmt_info
->stmt
);
7302 else if ((STMT_VINFO_TYPE (stmt_info
) == cycle_phi_info_type
7303 || STMT_VINFO_TYPE (stmt_info
) == induc_vec_info_type
7304 || STMT_VINFO_TYPE (stmt_info
) == phi_info_type
)
7305 && SLP_TREE_CODE (node
) != VEC_PERM_EXPR
)
7307 /* For PHI node vectorization we do not use the insertion iterator. */
7312 /* Emit other stmts after the children vectorized defs which is
7313 earliest possible. */
7314 gimple
*last_stmt
= NULL
;
7315 bool seen_vector_def
= false;
7316 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
7317 if (SLP_TREE_DEF_TYPE (child
) == vect_internal_def
)
7319 /* For fold-left reductions we are retaining the scalar
7320 reduction PHI but we still have SLP_TREE_NUM_VEC_STMTS
7321 set so the representation isn't perfect. Resort to the
7322 last scalar def here. */
7323 if (SLP_TREE_VEC_STMTS (child
).is_empty ())
7325 gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child
))
7326 == cycle_phi_info_type
);
7327 gphi
*phi
= as_a
<gphi
*>
7328 (vect_find_last_scalar_stmt_in_slp (child
)->stmt
);
7330 || vect_stmt_dominates_stmt_p (last_stmt
, phi
))
7333 /* We are emitting all vectorized stmts in the same place and
7334 the last one is the last.
7335 ??? Unless we have a load permutation applied and that
7336 figures to re-use an earlier generated load. */
7339 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (child
), j
, vstmt
)
7341 || vect_stmt_dominates_stmt_p (last_stmt
, vstmt
))
7344 else if (!SLP_TREE_VECTYPE (child
))
7346 /* For externals we use unvectorized at all scalar defs. */
7349 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child
), j
, def
)
7350 if (TREE_CODE (def
) == SSA_NAME
7351 && !SSA_NAME_IS_DEFAULT_DEF (def
))
7353 gimple
*stmt
= SSA_NAME_DEF_STMT (def
);
7355 || vect_stmt_dominates_stmt_p (last_stmt
, stmt
))
7361 /* For externals we have to look at all defs since their
7362 insertion place is decided per vector. But beware
7363 of pre-existing vectors where we need to make sure
7364 we do not insert before the region boundary. */
7365 if (SLP_TREE_SCALAR_OPS (child
).is_empty ()
7366 && !vinfo
->lookup_def (SLP_TREE_VEC_DEFS (child
)[0]))
7367 seen_vector_def
= true;
7372 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child
), j
, vdef
)
7373 if (TREE_CODE (vdef
) == SSA_NAME
7374 && !SSA_NAME_IS_DEFAULT_DEF (vdef
))
7376 gimple
*vstmt
= SSA_NAME_DEF_STMT (vdef
);
7378 || vect_stmt_dominates_stmt_p (last_stmt
, vstmt
))
7383 /* This can happen when all children are pre-existing vectors or
7386 last_stmt
= vect_find_first_scalar_stmt_in_slp (node
)->stmt
;
7389 gcc_assert (seen_vector_def
);
7390 si
= gsi_after_labels (as_a
<bb_vec_info
> (vinfo
)->bbs
[0]);
7392 else if (is_ctrl_altering_stmt (last_stmt
))
7394 /* We split regions to vectorize at control altering stmts
7395 with a definition so this must be an external which
7396 we can insert at the start of the region. */
7397 si
= gsi_after_labels (as_a
<bb_vec_info
> (vinfo
)->bbs
[0]);
7399 else if (is_a
<bb_vec_info
> (vinfo
)
7400 && gimple_bb (last_stmt
) != gimple_bb (stmt_info
->stmt
)
7401 && gimple_could_trap_p (stmt_info
->stmt
))
7403 /* We've constrained possibly trapping operations to all come
7404 from the same basic-block, if vectorized defs would allow earlier
7405 scheduling still force vectorized stmts to the original block.
7406 This is only necessary for BB vectorization since for loop vect
7407 all operations are in a single BB and scalar stmt based
7408 placement doesn't play well with epilogue vectorization. */
7409 gcc_assert (dominated_by_p (CDI_DOMINATORS
,
7410 gimple_bb (stmt_info
->stmt
),
7411 gimple_bb (last_stmt
)));
7412 si
= gsi_after_labels (gimple_bb (stmt_info
->stmt
));
7414 else if (is_a
<gphi
*> (last_stmt
))
7415 si
= gsi_after_labels (gimple_bb (last_stmt
));
7418 si
= gsi_for_stmt (last_stmt
);
7423 bool done_p
= false;
7425 /* Handle purely internal nodes. */
7426 if (SLP_TREE_CODE (node
) == VEC_PERM_EXPR
)
7428 /* ??? the transform kind is stored to STMT_VINFO_TYPE which might
7429 be shared with different SLP nodes (but usually it's the same
7430 operation apart from the case the stmt is only there for denoting
7431 the actual scalar lane defs ...). So do not call vect_transform_stmt
7432 but open-code it here (partly). */
7433 bool done
= vectorizable_slp_permutation (vinfo
, &si
, node
, NULL
);
7438 vect_transform_stmt (vinfo
, stmt_info
, &si
, node
, instance
);
/* NOTE(review): lossy extract -- return type, declarations of LHS and
   NEW_STMT, some braces and continue statements were dropped.  Text kept
   byte-identical.  Visible behavior: DFS over internal SLP nodes (with a
   visited set), replacing each pure-SLP scalar call with LHS = 0.  */
7441 /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
7442 For loop vectorization this is done in vectorizable_call, but for SLP
7443 it needs to be deferred until end of vect_schedule_slp, because multiple
7444 SLP instances may refer to the same scalar stmt. */
7447 vect_remove_slp_scalar_calls (vec_info
*vinfo
,
7448 slp_tree node
, hash_set
<slp_tree
> &visited
)
7451 gimple_stmt_iterator gsi
;
7455 stmt_vec_info stmt_info
;
7457 if (!node
|| SLP_TREE_DEF_TYPE (node
) != vect_internal_def
)
7460 if (visited
.add (node
))
7463 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
7464 vect_remove_slp_scalar_calls (vinfo
, child
, visited
);
7466 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
7468 gcall
*stmt
= dyn_cast
<gcall
*> (stmt_info
->stmt
);
7469 if (!stmt
|| gimple_bb (stmt
) == NULL
)
7471 if (is_pattern_stmt_p (stmt_info
)
7472 || !PURE_SLP_STMT (stmt_info
))
7474 lhs
= gimple_call_lhs (stmt
);
7475 new_stmt
= gimple_build_assign (lhs
, build_zero_cst (TREE_TYPE (lhs
)));
7476 gsi
= gsi_for_stmt (stmt
);
7477 vinfo
->replace_stmt (&gsi
, stmt_info
, new_stmt
);
7478 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt
)) = new_stmt
;
/* Convenience overload: seeds an empty visited set and delegates to the
   recursive worker above.  NOTE(review): lossy extract -- return type and
   braces were dropped; text kept byte-identical.  */
7483 vect_remove_slp_scalar_calls (vec_info
*vinfo
, slp_tree node
)
7485 hash_set
<slp_tree
> visited
;
7486 vect_remove_slp_scalar_calls (vinfo
, node
, visited
);
/* NOTE(review): lossy extract -- return type, declarations (J, CHILD_STMT,
   RTYPE), gcc_unreachable/else arms and braces were dropped.  Text kept
   byte-identical.  Visible behavior: for a constructor instance build a
   single assign or a CONSTRUCTOR from the vectorized defs; for a BB
   reduction reduce the vector defs pairwise and finish with a direct
   internal-fn reduction, then splice the epilogue before the root stmt.  */
7489 /* Vectorize the instance root. */
7492 vectorize_slp_instance_root_stmt (slp_tree node
, slp_instance instance
)
7494 gassign
*rstmt
= NULL
;
7496 if (instance
->kind
== slp_inst_kind_ctor
)
7498 if (SLP_TREE_NUMBER_OF_VEC_STMTS (node
) == 1)
7500 gimple
*child_stmt
= SLP_TREE_VEC_STMTS (node
)[0];
7501 tree vect_lhs
= gimple_get_lhs (child_stmt
);
7502 tree root_lhs
= gimple_get_lhs (instance
->root_stmts
[0]->stmt
);
7503 if (!useless_type_conversion_p (TREE_TYPE (root_lhs
),
7504 TREE_TYPE (vect_lhs
)))
7505 vect_lhs
= build1 (VIEW_CONVERT_EXPR
, TREE_TYPE (root_lhs
),
7507 rstmt
= gimple_build_assign (root_lhs
, vect_lhs
);
7509 else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node
) > 1)
7511 int nelts
= SLP_TREE_NUMBER_OF_VEC_STMTS (node
);
7514 vec
<constructor_elt
, va_gc
> *v
;
7515 vec_alloc (v
, nelts
);
7517 /* A CTOR can handle V16HI composition from VNx8HI so we
7518 do not need to convert vector elements if the types
7520 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node
), j
, child_stmt
)
7521 CONSTRUCTOR_APPEND_ELT (v
, NULL_TREE
,
7522 gimple_get_lhs (child_stmt
));
7523 tree lhs
= gimple_get_lhs (instance
->root_stmts
[0]->stmt
);
7525 = TREE_TYPE (gimple_assign_rhs1 (instance
->root_stmts
[0]->stmt
));
7526 tree r_constructor
= build_constructor (rtype
, v
);
7527 rstmt
= gimple_build_assign (lhs
, r_constructor
);
7530 else if (instance
->kind
== slp_inst_kind_bb_reduc
)
7532 /* Largely inspired by reduction chain epilogue handling in
7533 vect_create_epilog_for_reduction. */
7534 vec
<tree
> vec_defs
= vNULL
;
7535 vect_get_slp_defs (node
, &vec_defs
);
7536 enum tree_code reduc_code
7537 = gimple_assign_rhs_code (instance
->root_stmts
[0]->stmt
);
7538 /* ??? We actually have to reflect signs somewhere. */
7539 if (reduc_code
== MINUS_EXPR
)
7540 reduc_code
= PLUS_EXPR
;
7541 gimple_seq epilogue
= NULL
;
7542 /* We may end up with more than one vector result, reduce them
7544 tree vec_def
= vec_defs
[0];
7545 for (unsigned i
= 1; i
< vec_defs
.length (); ++i
)
7546 vec_def
= gimple_build (&epilogue
, reduc_code
, TREE_TYPE (vec_def
),
7547 vec_def
, vec_defs
[i
]);
7548 vec_defs
.release ();
7549 /* ??? Support other schemes than direct internal fn. */
7550 internal_fn reduc_fn
;
7551 if (!reduction_fn_for_scalar_code (reduc_code
, &reduc_fn
)
7552 || reduc_fn
== IFN_LAST
)
7554 tree scalar_def
= gimple_build (&epilogue
, as_combined_fn (reduc_fn
),
7555 TREE_TYPE (TREE_TYPE (vec_def
)), vec_def
);
7557 gimple_stmt_iterator rgsi
= gsi_for_stmt (instance
->root_stmts
[0]->stmt
);
7558 gsi_insert_seq_before (&rgsi
, epilogue
, GSI_SAME_STMT
);
7559 gimple_assign_set_rhs_from_tree (&rgsi
, scalar_def
);
7560 update_stmt (gsi_stmt (rgsi
));
7568 gimple_stmt_iterator rgsi
= gsi_for_stmt (instance
->root_stmts
[0]->stmt
);
7569 gsi_replace (&rgsi
, rstmt
, true);
7579 /* Schedule the SLP INSTANCE doing a DFS walk and collecting SCCs. */
/* NOTE(review): lossy extraction -- braces and several statements are
   missing (e.g. original 7585-7586, 7589 (the dfs-number assignment),
   7591-7593 (maxdfs increment), 7598-7600 (early return), 7624-7625,
   7661-7684 inner-loop bodies).  Tokens below are verbatim; comments
   describe only the visible code.  */
7582 vect_schedule_scc (vec_info
*vinfo
, slp_tree node
, slp_instance instance
,
7583 hash_map
<slp_tree
, slp_scc_info
> &scc_info
,
7584 int &maxdfs
, vec
<slp_tree
> &stack
)
/* Tarjan-style bookkeeping: NODE must not have been visited yet; it
   gets a fresh dfs/lowlink number from MAXDFS.  */
7587 slp_scc_info
*info
= &scc_info
.get_or_insert (node
, &existed_p
);
7588 gcc_assert (!existed_p
);
7590 info
->lowlink
= maxdfs
;
/* Non-internal defs (externals/constants) cannot be part of a cycle:
   code-generate them immediately without pushing on the SCC stack.  */
7594 if (SLP_TREE_DEF_TYPE (node
) != vect_internal_def
)
7596 info
->on_stack
= false;
7597 vect_schedule_slp_node (vinfo
, node
, instance
);
7601 info
->on_stack
= true;
7602 stack
.safe_push (node
);
/* DFS into the children, propagating lowlink from newly visited
   children and from children still on the stack (back edges).  */
7607 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
7611 slp_scc_info
*child_info
= scc_info
.get (child
);
7614 vect_schedule_scc (vinfo
, child
, instance
, scc_info
, maxdfs
, stack
);
7615 /* Recursion might have re-allocated the node. */
7616 info
= scc_info
.get (node
);
7617 child_info
= scc_info
.get (child
);
7618 info
->lowlink
= MIN (info
->lowlink
, child_info
->lowlink
);
7620 else if (child_info
->on_stack
)
7621 info
->lowlink
= MIN (info
->lowlink
, child_info
->dfs
);
/* Not the root of an SCC when lowlink differs from our own dfs number
   (missing body at original 7624 -- presumably an early return).  */
7623 if (info
->lowlink
!= info
->dfs
)
/* PHIs scheduled in this SCC whose backedge arguments still need to be
   filled in after all members are code-generated.  */
7626 auto_vec
<slp_tree
, 4> phis_to_fixup
;
/* Singleton SCC: NODE itself is on top of the stack -- pop and
   schedule it directly.  */
7629 if (stack
.last () == node
)
7632 info
->on_stack
= false;
7633 vect_schedule_slp_node (vinfo
, node
, instance
);
7634 if (SLP_TREE_CODE (node
) != VEC_PERM_EXPR
7635 && is_a
<gphi
*> (SLP_TREE_REPRESENTATIVE (node
)->stmt
))
7636 phis_to_fixup
.quick_push (node
);
/* Non-trivial SCC: the members are stack[last_idx..] -- find the
   start index of this SCC on the stack.  */
7641 int last_idx
= stack
.length () - 1;
7642 while (stack
[last_idx
] != node
)
7644 /* We can break the cycle at PHIs who have at least one child
7645 code generated. Then we could re-start the DFS walk until
7646 all nodes in the SCC are covered (we might have new entries
7647 for only back-reachable nodes). But it's simpler to just
7648 iterate and schedule those that are ready. */
7649 unsigned todo
= stack
.length () - last_idx
;
/* Scan SCC members from the stack top, scheduling each ENTRY that is
   ready (a PHI, or all non-backedge children already generated).
   NOTE(review): the readiness checks and loop-termination logic at
   original 7661-7684 were dropped by extraction.  */
7652 for (int idx
= stack
.length () - 1; idx
>= last_idx
; --idx
)
7654 slp_tree entry
= stack
[idx
];
7657 bool phi
= (SLP_TREE_CODE (entry
) != VEC_PERM_EXPR
7658 && is_a
<gphi
*> (SLP_TREE_REPRESENTATIVE (entry
)->stmt
));
7660 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry
), i
, child
)
7667 else if (scc_info
.get (child
)->on_stack
)
7685 vect_schedule_slp_node (vinfo
, entry
, instance
);
7686 scc_info
.get (entry
)->on_stack
= false;
7690 phis_to_fixup
.safe_push (entry
);
/* All SCC members generated -- pop the whole SCC off the stack.  */
7697 stack
.truncate (last_idx
);
7700 /* Now fixup the backedge def of the vectorized PHIs in this SCC. */
7702 FOR_EACH_VEC_ELT (phis_to_fixup
, i
, phi_node
)
7704 gphi
*phi
= as_a
<gphi
*> (SLP_TREE_REPRESENTATIVE (phi_node
)->stmt
);
/* For each incoming edge of the scalar PHI, add the corresponding
   vectorized def as argument to every vector PHI copy.  */
7707 FOR_EACH_EDGE (e
, ei
, gimple_bb (phi
)->preds
)
7709 unsigned dest_idx
= e
->dest_idx
;
7710 child
= SLP_TREE_CHILDREN (phi_node
)[dest_idx
];
/* Skip args whose defs are external/constant (missing `continue`
   at original 7712 -- dropped by extraction).  */
7711 if (!child
|| SLP_TREE_DEF_TYPE (child
) != vect_internal_def
)
7713 /* Simply fill all args. */
7714 for (unsigned i
= 0; i
< SLP_TREE_VEC_STMTS (phi_node
).length (); ++i
)
7715 add_phi_arg (as_a
<gphi
*> (SLP_TREE_VEC_STMTS (phi_node
)[i
]),
7716 vect_get_slp_vect_def (child
, i
),
7717 e
, gimple_phi_arg_location (phi
, dest_idx
));
7722 /* Generate vector code for SLP_INSTANCES in the loop/basic block. */
7725 vect_schedule_slp (vec_info
*vinfo
, const vec
<slp_instance
> &slp_instances
)
7727 slp_instance instance
;
7730 hash_map
<slp_tree
, slp_scc_info
> scc_info
;
7732 FOR_EACH_VEC_ELT (slp_instances
, i
, instance
)
7734 slp_tree node
= SLP_INSTANCE_TREE (instance
);
7735 if (dump_enabled_p ())
7737 dump_printf_loc (MSG_NOTE
, vect_location
,
7738 "Vectorizing SLP tree:\n");
7740 if (!SLP_INSTANCE_ROOT_STMTS (instance
).is_empty ())
7741 dump_printf_loc (MSG_NOTE
, vect_location
, "Root stmt: %G",
7742 SLP_INSTANCE_ROOT_STMTS (instance
)[0]->stmt
);
7743 vect_print_slp_graph (MSG_NOTE
, vect_location
,
7744 SLP_INSTANCE_TREE (instance
));
7746 /* Schedule the tree of INSTANCE, scheduling SCCs in a way to
7747 have a PHI be the node breaking the cycle. */
7748 auto_vec
<slp_tree
> stack
;
7749 if (!scc_info
.get (node
))
7750 vect_schedule_scc (vinfo
, node
, instance
, scc_info
, maxdfs
, stack
);
7752 if (!SLP_INSTANCE_ROOT_STMTS (instance
).is_empty ())
7753 vectorize_slp_instance_root_stmt (node
, instance
);
7755 if (dump_enabled_p ())
7756 dump_printf_loc (MSG_NOTE
, vect_location
,
7757 "vectorizing stmts using SLP.\n");
7760 FOR_EACH_VEC_ELT (slp_instances
, i
, instance
)
7762 slp_tree root
= SLP_INSTANCE_TREE (instance
);
7763 stmt_vec_info store_info
;
7766 /* Remove scalar call stmts. Do not do this for basic-block
7767 vectorization as not all uses may be vectorized.
7768 ??? Why should this be necessary? DCE should be able to
7769 remove the stmts itself.
7770 ??? For BB vectorization we can as well remove scalar
7771 stmts starting from the SLP tree root if they have no
7773 if (is_a
<loop_vec_info
> (vinfo
))
7774 vect_remove_slp_scalar_calls (vinfo
, root
);
7776 /* Remove vectorized stores original scalar stmts. */
7777 for (j
= 0; SLP_TREE_SCALAR_STMTS (root
).iterate (j
, &store_info
); j
++)
7779 if (!STMT_VINFO_DATA_REF (store_info
)
7780 || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info
)))
7783 store_info
= vect_orig_stmt (store_info
);
7784 /* Free the attached stmt_vec_info and remove the stmt. */
7785 vinfo
->remove_stmt (store_info
);
7787 /* Invalidate SLP_TREE_REPRESENTATIVE in case we released it
7788 to not crash in vect_free_slp_tree later. */
7789 if (SLP_TREE_REPRESENTATIVE (root
) == store_info
)
7790 SLP_TREE_REPRESENTATIVE (root
) = NULL
;