/* SLP - Basic Block Vectorization
   Copyright (C) 2007-2022 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
#include "coretypes.h"
#include "tree-pass.h"
#include "optabs-tree.h"
#include "insn-config.h"
#include "recog.h"		/* FIXME: for insn_data */
#include "fold-const.h"
#include "stor-layout.h"
#include "gimple-iterator.h"
#include "tree-vectorizer.h"
#include "langhooks.h"
#include "gimple-walk.h"
#include "tree-vector-builder.h"
#include "vec-perm-indices.h"
#include "gimple-fold.h"
#include "internal-fn.h"
#include "dump-context.h"
#include "alloc-pool.h"
static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
					  slp_tree, stmt_vector_for_cost *);
static void vect_print_slp_tree (dump_flags_t, dump_location_t, slp_tree);

static object_allocator<_slp_tree> *slp_tree_pool;
static slp_tree slp_first_node;
/* Initialize the SLP tree allocation pool.  */

void
vect_slp_init (void)
{
  slp_tree_pool = new object_allocator<_slp_tree> ("SLP nodes");
}

/* Tear down the SLP tree allocation pool, deleting any nodes still live.  */

void
vect_slp_fini (void)
{
  while (slp_first_node)
    delete slp_first_node;
  delete slp_tree_pool;
  slp_tree_pool = NULL;
}

void *
_slp_tree::operator new (size_t n)
{
  gcc_assert (n == sizeof (_slp_tree));
  return slp_tree_pool->allocate_raw ();
}

void
_slp_tree::operator delete (void *node, size_t n)
{
  gcc_assert (n == sizeof (_slp_tree));
  slp_tree_pool->remove_raw (node);
}
/* Initialize an SLP node.  */

_slp_tree::_slp_tree ()
{
  this->prev_node = NULL;
  if (slp_first_node)
    slp_first_node->prev_node = this;
  this->next_node = slp_first_node;
  slp_first_node = this;
  SLP_TREE_SCALAR_STMTS (this) = vNULL;
  SLP_TREE_SCALAR_OPS (this) = vNULL;
  SLP_TREE_VEC_STMTS (this) = vNULL;
  SLP_TREE_VEC_DEFS (this) = vNULL;
  SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
  SLP_TREE_CHILDREN (this) = vNULL;
  SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
  SLP_TREE_LANE_PERMUTATION (this) = vNULL;
  SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
  SLP_TREE_CODE (this) = ERROR_MARK;
  SLP_TREE_VECTYPE (this) = NULL_TREE;
  SLP_TREE_REPRESENTATIVE (this) = NULL;
  SLP_TREE_REF_COUNT (this) = 1;
  this->max_nunits = 1;
}
/* Tear down an SLP node.  */

_slp_tree::~_slp_tree ()
{
  if (this->prev_node)
    this->prev_node->next_node = this->next_node;
  else
    slp_first_node = this->next_node;
  if (this->next_node)
    this->next_node->prev_node = this->prev_node;
  SLP_TREE_CHILDREN (this).release ();
  SLP_TREE_SCALAR_STMTS (this).release ();
  SLP_TREE_SCALAR_OPS (this).release ();
  SLP_TREE_VEC_STMTS (this).release ();
  SLP_TREE_VEC_DEFS (this).release ();
  SLP_TREE_LOAD_PERMUTATION (this).release ();
  SLP_TREE_LANE_PERMUTATION (this).release ();
}
/* Recursively free the memory allocated for the SLP tree rooted at NODE.  */

void
vect_free_slp_tree (slp_tree node)
{
  int i;
  slp_tree child;

  if (--SLP_TREE_REF_COUNT (node) != 0)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (child)
      vect_free_slp_tree (child);

  /* If the node defines any SLP only patterns then those patterns are no
     longer valid and should be removed.  */
  stmt_vec_info rep_stmt_info = SLP_TREE_REPRESENTATIVE (node);
  if (rep_stmt_info && STMT_VINFO_SLP_VECT_ONLY_PATTERN (rep_stmt_info))
    {
      stmt_vec_info stmt_info = vect_orig_stmt (rep_stmt_info);
      STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
      STMT_SLP_TYPE (stmt_info) = STMT_SLP_TYPE (rep_stmt_info);
    }

  delete node;
}
/* Return a location suitable for dumps related to the SLP instance.  */

dump_user_location_t
_slp_instance::location () const
{
  if (!root_stmts.is_empty ())
    return root_stmts[0]->stmt;
  else
    return SLP_TREE_SCALAR_STMTS (root)[0]->stmt;
}
/* Free the memory allocated for the SLP instance.  */

void
vect_free_slp_instance (slp_instance instance)
{
  vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
  SLP_INSTANCE_LOADS (instance).release ();
  SLP_INSTANCE_ROOT_STMTS (instance).release ();
  instance->subgraph_entries.release ();
  instance->cost_vec.release ();
  free (instance);
}
/* Create an SLP node for SCALAR_STMTS.  */

static slp_tree
vect_create_new_slp_node (unsigned nops, tree_code code)
{
  slp_tree node = new _slp_tree;
  SLP_TREE_SCALAR_STMTS (node) = vNULL;
  SLP_TREE_CHILDREN (node).create (nops);
  SLP_TREE_DEF_TYPE (node) = vect_internal_def;
  SLP_TREE_CODE (node) = code;
  return node;
}

/* Create an SLP node for SCALAR_STMTS.  */

static slp_tree
vect_create_new_slp_node (slp_tree node,
			  vec<stmt_vec_info> scalar_stmts, unsigned nops)
{
  SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
  SLP_TREE_CHILDREN (node).create (nops);
  SLP_TREE_DEF_TYPE (node) = vect_internal_def;
  SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
  SLP_TREE_LANES (node) = scalar_stmts.length ();
  return node;
}

/* Create an SLP node for SCALAR_STMTS.  */

static slp_tree
vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
{
  return vect_create_new_slp_node (new _slp_tree, scalar_stmts, nops);
}

/* Create an SLP node for OPS.  */

static slp_tree
vect_create_new_slp_node (slp_tree node, vec<tree> ops)
{
  SLP_TREE_SCALAR_OPS (node) = ops;
  SLP_TREE_DEF_TYPE (node) = vect_external_def;
  SLP_TREE_LANES (node) = ops.length ();
  return node;
}

/* Create an SLP node for OPS.  */

static slp_tree
vect_create_new_slp_node (vec<tree> ops)
{
  return vect_create_new_slp_node (new _slp_tree, ops);
}
/* This structure is used in creation of an SLP tree.  Each instance
   corresponds to the same operand in a group of scalar stmts in an SLP
   node.  */
typedef struct _slp_oprnd_info
{
  /* Def-stmts for the operands.  */
  vec<stmt_vec_info> def_stmts;
  /* Operands.  */
  vec<tree> ops;
  /* Information about the first statement, its vector def-type, type, the
     operand itself in case it's constant, and an indication if it's a pattern
     stmt.  */
  enum vect_def_type first_dt;
  tree first_op_type;
  bool any_pattern;
} *slp_oprnd_info;
/* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
   operand.  */
static vec<slp_oprnd_info>
vect_create_oprnd_info (int nops, int group_size)
{
  int i;
  slp_oprnd_info oprnd_info;
  vec<slp_oprnd_info> oprnds_info;

  oprnds_info.create (nops);
  for (i = 0; i < nops; i++)
    {
      oprnd_info = XNEW (struct _slp_oprnd_info);
      oprnd_info->def_stmts.create (group_size);
      oprnd_info->ops.create (group_size);
      oprnd_info->first_dt = vect_uninitialized_def;
      oprnd_info->first_op_type = NULL_TREE;
      oprnd_info->any_pattern = false;
      oprnds_info.quick_push (oprnd_info);
    }

  return oprnds_info;
}
/* Free operands info.  */

static void
vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
{
  int i;
  slp_oprnd_info oprnd_info;

  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      oprnd_info->def_stmts.release ();
      oprnd_info->ops.release ();
      XDELETE (oprnd_info);
    }

  oprnds_info.release ();
}
/* Return true if STMTS contains a pattern statement.  */

static bool
vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
{
  stmt_vec_info stmt_info;
  unsigned int i;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    if (is_pattern_stmt_p (stmt_info))
      return true;
  return false;
}
/* Return true when all lanes in the external or constant NODE have
   the same value.  */

static bool
vect_slp_tree_uniform_p (slp_tree node)
{
  gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
	      || SLP_TREE_DEF_TYPE (node) == vect_external_def);

  /* Pre-existing vectors.  */
  if (SLP_TREE_SCALAR_OPS (node).is_empty ())
    return false;

  unsigned i;
  tree op, first = NULL_TREE;
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
    if (!first)
      first = op;
    else if (!operand_equal_p (first, op, 0))
      return false;

  return true;
}
/* Find the place of the data-ref in STMT_INFO in the interleaving chain
   that starts from FIRST_STMT_INFO.  Return -1 if the data-ref is not a part
   of the chain.  */

int
vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
				      stmt_vec_info first_stmt_info)
{
  stmt_vec_info next_stmt_info = first_stmt_info;
  int result = 0;

  if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
    return -1;

  do
    {
      if (next_stmt_info == stmt_info)
	return result;
      next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
      if (next_stmt_info)
	result += DR_GROUP_GAP (next_stmt_info);
    }
  while (next_stmt_info);

  return -1;
}
/* Check whether it is possible to load COUNT elements of type ELT_TYPE
   using the method implemented by duplicate_and_interleave.  Return true
   if so, returning the number of intermediate vectors in *NVECTORS_OUT
   (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
   (if nonnull).  */

bool
can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
				tree elt_type, unsigned int *nvectors_out,
				tree *vector_type_out,
				tree *permutes)
{
  tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
  if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
    return false;

  machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
  poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
  unsigned int nvectors = 1;
  for (;;)
    {
      scalar_int_mode int_mode;
      poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
      if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
	{
	  /* Get the natural vector type for this SLP group size.  */
	  tree int_type = build_nonstandard_integer_type
	    (GET_MODE_BITSIZE (int_mode), 1);
	  tree vector_type
	    = get_vectype_for_scalar_type (vinfo, int_type, count);
	  if (vector_type
	      && VECTOR_MODE_P (TYPE_MODE (vector_type))
	      && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
			   GET_MODE_SIZE (base_vector_mode)))
	    {
	      /* Try fusing consecutive sequences of COUNT / NVECTORS elements
		 together into elements of type INT_TYPE and using the result
		 to build NVECTORS vectors.  */
	      poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
	      vec_perm_builder sel1 (nelts, 2, 3);
	      vec_perm_builder sel2 (nelts, 2, 3);
	      poly_int64 half_nelts = exact_div (nelts, 2);
	      for (unsigned int i = 0; i < 3; ++i)
		{
		  sel1.quick_push (i);
		  sel1.quick_push (i + nelts);
		  sel2.quick_push (half_nelts + i);
		  sel2.quick_push (half_nelts + i + nelts);
		}
	      vec_perm_indices indices1 (sel1, 2, nelts);
	      vec_perm_indices indices2 (sel2, 2, nelts);
	      if (can_vec_perm_const_p (TYPE_MODE (vector_type), indices1)
		  && can_vec_perm_const_p (TYPE_MODE (vector_type), indices2))
		{
		  if (nvectors_out)
		    *nvectors_out = nvectors;
		  if (vector_type_out)
		    *vector_type_out = vector_type;
		  if (permutes)
		    {
		      permutes[0] = vect_gen_perm_mask_checked (vector_type,
								indices1);
		      permutes[1] = vect_gen_perm_mask_checked (vector_type,
								indices2);
		    }
		  return true;
		}
	    }
	}
      if (!multiple_p (elt_bytes, 2, &elt_bytes))
	return false;
      nvectors *= 2;
    }
}
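/* Editor's note (illustrative, not in the original sources): with 128-bit
   vectors and 32-bit elements, COUNT == 2 gives ELT_BYTES == 8 on the first
   iteration; a 64-bit integer mode exists, so the function succeeds with
   NVECTORS == 1 and a two-element 64-bit-int vector type, provided the
   target supports the two permutes.  For COUNT == 3 no 96-, 48- or 24-bit
   integer mode exists and ELT_BYTES eventually fails the multiple_p test,
   so the function returns false.  */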
/* Return true if DTA and DTB match.  */

static bool
vect_def_types_match (enum vect_def_type dta, enum vect_def_type dtb)
{
  return (dta == dtb
	  || ((dta == vect_external_def || dta == vect_constant_def)
	      && (dtb == vect_external_def || dtb == vect_constant_def)));
}
static const int cond_expr_maps[3][5] = {
  { 4, -1, -2, 1, 2 },
  { 4, -2, -1, 1, 2 },
  { 4, -1, -2, 2, 1 }
};
static const int arg1_map[] = { 1, 1 };
static const int arg2_map[] = { 1, 2 };
static const int arg1_arg4_map[] = { 2, 1, 4 };

/* For most SLP statements, there is a one-to-one mapping between
   gimple arguments and child nodes.  If that is not true for STMT,
   return an array that contains:

   - the number of child nodes, followed by
   - for each child node, the index of the argument associated with that node.
     The special index -1 is the first operand of an embedded comparison and
     the special index -2 is the second operand of an embedded comparison.

   SWAP is as for vect_get_and_check_slp_defs.  */

static const int *
vect_get_operand_map (const gimple *stmt, unsigned char swap = 0)
{
  if (auto assign = dyn_cast<const gassign *> (stmt))
    {
      if (gimple_assign_rhs_code (assign) == COND_EXPR
	  && COMPARISON_CLASS_P (gimple_assign_rhs1 (assign)))
	return cond_expr_maps[swap];
    }
  if (auto call = dyn_cast<const gcall *> (stmt))
    {
      if (gimple_call_internal_p (call))
	switch (gimple_call_internal_fn (call))
	  {
	  case IFN_MASK_LOAD:
	    return arg2_map;

	  case IFN_GATHER_LOAD:
	    return arg1_map;

	  case IFN_MASK_GATHER_LOAD:
	    return arg1_arg4_map;

	  default:
	    break;
	  }
    }
  return nullptr;
}
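/* Editor's note (illustrative, not in the original sources, and relying on
   the map contents shown above): a COND_EXPR "x = a < b ? c : d" with
   SWAP == 0 yields { 4, -1, -2, 1, 2 }: four child nodes built from the
   comparison operands a and b (special indices -1 and -2) and the value
   operands c and d (arguments 1 and 2).  SWAP == 1 visits the comparison
   operands in the swapped order.  */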
/* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
   they are of a valid type and that they match the defs of the first stmt of
   the SLP group (stored in OPRNDS_INFO).  This function tries to match stmts
   by swapping operands of STMTS[STMT_NUM] when possible.  Non-zero SWAP
   indicates swap is required for cond_expr stmts.  Specifically, SWAP
   is 1 if STMT is cond and operands of comparison need to be swapped;
   SWAP is 2 if STMT is cond and code of comparison needs to be inverted.

   If there was a fatal error return -1; if the error could be corrected by
   swapping operands of father node of this one, return 1; if everything is
   ok return 0.  */

static int
vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap,
			     bool *skip_args,
			     vec<stmt_vec_info> stmts, unsigned stmt_num,
			     vec<slp_oprnd_info> *oprnds_info)
{
  stmt_vec_info stmt_info = stmts[stmt_num];
  tree oprnd;
  unsigned int i, number_of_oprnds;
  enum vect_def_type dt = vect_uninitialized_def;
  slp_oprnd_info oprnd_info;
  unsigned int commutative_op = -1U;
  bool first = stmt_num == 0;

  if (!is_a<gcall *> (stmt_info->stmt)
      && !is_a<gassign *> (stmt_info->stmt)
      && !is_a<gphi *> (stmt_info->stmt))
    return -1;

  number_of_oprnds = gimple_num_args (stmt_info->stmt);
  const int *map = vect_get_operand_map (stmt_info->stmt, swap);
  if (map)
    number_of_oprnds = *map++;
  if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
    {
      if (gimple_call_internal_p (stmt))
	{
	  internal_fn ifn = gimple_call_internal_fn (stmt);
	  commutative_op = first_commutative_argument (ifn);
	}
    }
  else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
    {
      if (commutative_tree_code (gimple_assign_rhs_code (stmt)))
	commutative_op = 0;
    }

  bool swapped = (swap != 0);
  bool backedge = false;
  enum vect_def_type *dts = XALLOCAVEC (enum vect_def_type, number_of_oprnds);
  for (i = 0; i < number_of_oprnds; i++)
    {
      int opno = map ? map[i] : int (i);
      if (opno < 0)
	oprnd = TREE_OPERAND (gimple_arg (stmt_info->stmt, 0), -1 - opno);
      else
	{
	  oprnd = gimple_arg (stmt_info->stmt, opno);
	  if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
	    backedge = dominated_by_p (CDI_DOMINATORS,
				       gimple_phi_arg_edge (stmt, opno)->src,
				       gimple_bb (stmt_info->stmt));
	}
      if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
	oprnd = TREE_OPERAND (oprnd, 0);

      oprnd_info = (*oprnds_info)[i];

      stmt_vec_info def_stmt_info;
      if (!vect_is_simple_use (oprnd, vinfo, &dts[i], &def_stmt_info))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: can't analyze def for %T\n",
			     oprnd);
	  return -1;
	}

      if (skip_args[i])
	{
	  oprnd_info->def_stmts.quick_push (NULL);
	  oprnd_info->ops.quick_push (NULL_TREE);
	  oprnd_info->first_dt = vect_uninitialized_def;
	  continue;
	}

      dt = dts[i];
      oprnd_info->def_stmts.quick_push (def_stmt_info);
      oprnd_info->ops.quick_push (oprnd);

      if (def_stmt_info
	  && is_pattern_stmt_p (def_stmt_info))
	{
	  if (STMT_VINFO_RELATED_STMT (vect_orig_stmt (def_stmt_info))
	      == def_stmt_info)
	    oprnd_info->any_pattern = true;
	  else
	    /* If we promote this to external use the original stmt def.  */
	    oprnd_info->ops.last ()
	      = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
	}

      /* If there's an extern def on a backedge make sure we can
	 code-generate at the region start.
	 ??? This is another case that could be fixed by adjusting
	 how we split the function but at the moment we'd have conflicting
	 goals there.  */
      if (backedge
	  && dts[i] == vect_external_def
	  && is_a <bb_vec_info> (vinfo)
	  && TREE_CODE (oprnd) == SSA_NAME
	  && !SSA_NAME_IS_DEFAULT_DEF (oprnd)
	  && !dominated_by_p (CDI_DOMINATORS,
			      as_a <bb_vec_info> (vinfo)->bbs[0],
			      gimple_bb (SSA_NAME_DEF_STMT (oprnd))))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: extern def %T only defined "
			     "on backedge\n", oprnd);
	  return -1;
	}

      if (first)
	{
	  tree type = TREE_TYPE (oprnd);

	  if ((dt == vect_constant_def
	       || dt == vect_external_def)
	      && !GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
	      && (TREE_CODE (type) == BOOLEAN_TYPE
		  || !can_duplicate_and_interleave_p (vinfo, stmts.length (),
						      type)))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: invalid type of def "
				 "for variable-length SLP %T\n", oprnd);
	      return -1;
	    }

	  /* For the swapping logic below force vect_reduction_def
	     for the reduction op in a SLP reduction group.  */
	  if (!STMT_VINFO_DATA_REF (stmt_info)
	      && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
	      && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
	      && def_stmt_info)
	    dts[i] = dt = vect_reduction_def;

	  /* Check the types of the definition.  */
	  switch (dt)
	    {
	    case vect_external_def:
	    case vect_constant_def:
	    case vect_internal_def:
	    case vect_reduction_def:
	    case vect_induction_def:
	    case vect_nested_cycle:
	      break;

	    default:
	      /* FORNOW: Not supported.  */
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: illegal type of def %T\n",
				 oprnd);
	      return -1;
	    }

	  oprnd_info->first_dt = dt;
	  oprnd_info->first_op_type = type;
	}
    }
  if (first)
    return 0;

  /* Now match the operand definition types to that of the first stmt.  */
  for (i = 0; i < number_of_oprnds;)
    {
      if (skip_args[i])
	{
	  ++i;
	  continue;
	}

      oprnd_info = (*oprnds_info)[i];
      dt = dts[i];
      stmt_vec_info def_stmt_info = oprnd_info->def_stmts[stmt_num];
      oprnd = oprnd_info->ops[stmt_num];
      tree type = TREE_TYPE (oprnd);

      if (!types_compatible_p (oprnd_info->first_op_type, type))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: different operand types\n");
	  return 1;
	}

      /* Not first stmt of the group, check that the def-stmt/s match
	 the def-stmt/s of the first stmt.  Allow different definition
	 types for reduction chains: the first stmt must be a
	 vect_reduction_def (a phi node), and the rest
	 end in the reduction chain.  */
      if ((!vect_def_types_match (oprnd_info->first_dt, dt)
	   && !(oprnd_info->first_dt == vect_reduction_def
		&& !STMT_VINFO_DATA_REF (stmt_info)
		&& REDUC_GROUP_FIRST_ELEMENT (stmt_info)
		&& def_stmt_info
		&& !STMT_VINFO_DATA_REF (def_stmt_info)
		&& (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
		    == REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
	  || (!STMT_VINFO_DATA_REF (stmt_info)
	      && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
	      && ((!def_stmt_info
		   || STMT_VINFO_DATA_REF (def_stmt_info)
		   || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
		       != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
		  != (oprnd_info->first_dt != vect_reduction_def))))
	{
	  /* Try swapping operands if we got a mismatch.  For BB
	     vectorization only in case it will clearly improve things.  */
	  if (i == commutative_op && !swapped
	      && (!is_a <bb_vec_info> (vinfo)
		  || (!vect_def_types_match ((*oprnds_info)[i+1]->first_dt,
					     dts[i+1])
		      && (vect_def_types_match (oprnd_info->first_dt, dts[i+1])
			  || vect_def_types_match
			       ((*oprnds_info)[i+1]->first_dt, dts[i])))))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "trying swapped operands\n");
	      std::swap (dts[i], dts[i+1]);
	      std::swap ((*oprnds_info)[i]->def_stmts[stmt_num],
			 (*oprnds_info)[i+1]->def_stmts[stmt_num]);
	      std::swap ((*oprnds_info)[i]->ops[stmt_num],
			 (*oprnds_info)[i+1]->ops[stmt_num]);
	      swapped = true;
	      continue;
	    }

	  if (is_a <bb_vec_info> (vinfo)
	      && !oprnd_info->any_pattern)
	    {
	      /* Now for commutative ops we should see whether we can
		 make the other operand matching.  */
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "treating operand as external\n");
	      oprnd_info->first_dt = dt = vect_external_def;
	    }
	  else
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: different types\n");
	      return 1;
	    }
	}

      /* Make sure to demote the overall operand to external.  */
      if (dt == vect_external_def)
	oprnd_info->first_dt = vect_external_def;
      /* For a SLP reduction chain we want to duplicate the reduction to
	 each of the chain members.  That gets us a sane SLP graph (still
	 the stmts are not 100% correct wrt the initial values).  */
      else if ((dt == vect_internal_def
		|| dt == vect_reduction_def)
	       && oprnd_info->first_dt == vect_reduction_def
	       && !STMT_VINFO_DATA_REF (stmt_info)
	       && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
	       && !STMT_VINFO_DATA_REF (def_stmt_info)
	       && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
		   == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
	{
	  oprnd_info->def_stmts[stmt_num] = oprnd_info->def_stmts[0];
	  oprnd_info->ops[stmt_num] = oprnd_info->ops[0];
	}

      ++i;
    }

  /* Swap operands.  */
  if (swapped)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "swapped operands to match def types in %G",
			 stmt_info->stmt);
    }

  return 0;
}
/* Return true if call statements CALL1 and CALL2 are similar enough
   to be combined into the same SLP group.  */

static bool
compatible_calls_p (gcall *call1, gcall *call2)
{
  unsigned int nargs = gimple_call_num_args (call1);
  if (nargs != gimple_call_num_args (call2))
    return false;

  if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
    return false;

  if (gimple_call_internal_p (call1))
    {
      if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
			       TREE_TYPE (gimple_call_lhs (call2))))
	return false;
      for (unsigned int i = 0; i < nargs; ++i)
	if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
				 TREE_TYPE (gimple_call_arg (call2, i))))
	  return false;
    }
  else
    {
      if (!operand_equal_p (gimple_call_fn (call1),
			    gimple_call_fn (call2), 0))
	return false;

      if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
	return false;
    }

  /* Check that any unvectorized arguments are equal.  */
  if (const int *map = vect_get_operand_map (call1))
    {
      unsigned int nkept = *map++;
      unsigned int mapi = 0;
      for (unsigned int i = 0; i < nargs; ++i)
	if (mapi < nkept && map[mapi] == int (i))
	  mapi += 1;
	else if (!operand_equal_p (gimple_call_arg (call1, i),
				   gimple_call_arg (call2, i)))
	  return false;
    }

  return true;
}
/* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
   caller's attempt to find the vector type in STMT_INFO with the narrowest
   element type.  Return true if VECTYPE is nonnull and if it is valid
   for STMT_INFO.  When returning true, update MAX_NUNITS to reflect the
   number of units in VECTYPE.  GROUP_SIZE and MAX_NUNITS are as for
   vect_build_slp_tree.  */

static bool
vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
			unsigned int group_size,
			tree vectype, poly_uint64 *max_nunits)
{
  if (!vectype)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "Build SLP failed: unsupported data-type in %G\n",
			 stmt_info->stmt);
      /* Fatal mismatch.  */
      return false;
    }

  /* If populating the vector type requires unrolling then fail
     before adjusting *max_nunits for basic-block vectorization.  */
  if (is_a <bb_vec_info> (vinfo)
      && !multiple_p (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "Build SLP failed: unrolling required "
			 "in basic block SLP\n");
      /* Fatal mismatch.  */
      return false;
    }

  /* In case of multiple types we need to detect the smallest type.  */
  vect_update_max_nunits (max_nunits, vectype);
  return true;
}
/* Verify if the scalar stmts STMTS are isomorphic, require data
   permutation or are of unsupported types of operation.  Return
   true if they are, otherwise return false and indicate in *MATCHES
   which stmts are not isomorphic to the first one.  If MATCHES[0]
   is false then this indicates the comparison could not be
   carried out or the stmts will never be vectorized by SLP.

   Note COND_EXPR is possibly isomorphic to another one after swapping its
   operands.  Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
   the first stmt by swapping the two operands of comparison; set SWAP[i]
   to 2 if stmt I is isomorphic to the first stmt by inverting the code
   of comparison.  Take A1 >= B1 ? X1 : Y1 as an example, it can be swapped
   to (B1 <= A1 ? X1 : Y1); or be inverted to (A1 < B1) ? Y1 : X1.  */

static bool
vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
		       vec<stmt_vec_info> stmts, unsigned int group_size,
		       poly_uint64 *max_nunits, bool *matches,
		       bool *two_operators, tree *node_vectype)
{
  unsigned int i;
  stmt_vec_info first_stmt_info = stmts[0];
  code_helper first_stmt_code = ERROR_MARK;
  code_helper alt_stmt_code = ERROR_MARK;
  code_helper rhs_code = ERROR_MARK;
  code_helper first_cond_code = ERROR_MARK;
  tree lhs;
  bool need_same_oprnds = false;
  tree vectype = NULL_TREE, first_op1 = NULL_TREE;
  stmt_vec_info first_load = NULL, prev_first_load = NULL;
  bool first_stmt_load_p = false, load_p = false;
  bool first_stmt_phi_p = false, phi_p = false;
  bool maybe_soft_fail = false;
  tree soft_fail_nunits_vectype = NULL_TREE;

  /* For every stmt in NODE find its def stmt/s.  */
  stmt_vec_info stmt_info;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    {
      gimple *stmt = stmt_info->stmt;
      swap[i] = 0;
      matches[i] = false;

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);

      /* Fail to vectorize statements marked as unvectorizable, throw
	 or are volatile.  */
      if (!STMT_VINFO_VECTORIZABLE (stmt_info)
	  || stmt_can_throw_internal (cfun, stmt)
	  || gimple_has_volatile_ops (stmt))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: unvectorizable statement %G",
			     stmt);
	  /* ??? For BB vectorization we want to commutate operands in a way
	     to shuffle all unvectorizable defs into one operand and have
	     the other still vectorized.  The following doesn't reliably
	     work for this though but it's the easiest we can do here.  */
	  if (is_a <bb_vec_info> (vinfo) && i != 0)
	    continue;
	  /* Fatal mismatch.  */
	  matches[0] = false;
	  return false;
	}

      lhs = gimple_get_lhs (stmt);
      if (lhs == NULL_TREE)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: not GIMPLE_ASSIGN nor "
			     "GIMPLE_CALL %G", stmt);
	  if (is_a <bb_vec_info> (vinfo) && i != 0)
	    continue;
	  /* Fatal mismatch.  */
	  matches[0] = false;
	  return false;
	}

      tree nunits_vectype;
      if (!vect_get_vector_types_for_stmt (vinfo, stmt_info, &vectype,
					   &nunits_vectype, group_size))
	{
	  if (is_a <bb_vec_info> (vinfo) && i != 0)
	    continue;
	  /* Fatal mismatch.  */
	  matches[0] = false;
	  return false;
	}
      /* Record nunits required but continue analysis, producing matches[]
	 as if nunits was not an issue.  This allows splitting of groups
	 to happen.  */
      if (nunits_vectype
	  && !vect_record_max_nunits (vinfo, stmt_info, group_size,
				      nunits_vectype, max_nunits))
	{
	  gcc_assert (is_a <bb_vec_info> (vinfo));
	  maybe_soft_fail = true;
	  soft_fail_nunits_vectype = nunits_vectype;
	}

      gcc_assert (vectype);

      gcall *call_stmt = dyn_cast <gcall *> (stmt);
      if (call_stmt)
	{
	  combined_fn cfn = gimple_call_combined_fn (call_stmt);
	  if (cfn != CFN_LAST)
	    rhs_code = cfn;
	  else
	    rhs_code = CALL_EXPR;

	  if (cfn == CFN_MASK_LOAD
	      || cfn == CFN_GATHER_LOAD
	      || cfn == CFN_MASK_GATHER_LOAD)
	    load_p = true;
	  else if ((internal_fn_p (cfn)
		    && !vectorizable_internal_fn_p (as_internal_fn (cfn)))
		   || gimple_call_tail_p (call_stmt)
		   || gimple_call_noreturn_p (call_stmt)
		   || gimple_call_chain (call_stmt))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: unsupported call type %G",
				 call_stmt);
	      if (is_a <bb_vec_info> (vinfo) && i != 0)
		continue;
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }
	}
      else if (gimple_code (stmt) == GIMPLE_PHI)
	{
	  rhs_code = ERROR_MARK;
	  phi_p = true;
	}
      else
	{
	  rhs_code = gimple_assign_rhs_code (stmt);
	  load_p = gimple_vuse (stmt);
	}

      /* Check the operation.  */
      if (i == 0)
	{
	  *node_vectype = vectype;
	  first_stmt_code = rhs_code;
	  first_stmt_load_p = load_p;
	  first_stmt_phi_p = phi_p;

	  /* Shift arguments should be equal in all the packed stmts for a
	     vector shift with scalar shift operand.  */
	  if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
	      || rhs_code == LROTATE_EXPR
	      || rhs_code == RROTATE_EXPR)
	    {
	      /* First see if we have a vector/vector shift.  */
	      if (!directly_supported_p (rhs_code, vectype, optab_vector))
		{
		  /* No vector/vector shift, try for a vector/scalar shift.  */
		  if (!directly_supported_p (rhs_code, vectype, optab_scalar))
		    {
		      if (dump_enabled_p ())
			dump_printf_loc (MSG_MISSED_OPTIMIZATION,
					 vect_location,
					 "Build SLP failed: "
					 "op not supported by target.\n");
		      if (is_a <bb_vec_info> (vinfo) && i != 0)
			continue;
		      /* Fatal mismatch.  */
		      matches[0] = false;
		      return false;
		    }
		  need_same_oprnds = true;
		  first_op1 = gimple_assign_rhs2 (stmt);
		}
	    }
	  else if (rhs_code == WIDEN_LSHIFT_EXPR)
	    {
	      need_same_oprnds = true;
	      first_op1 = gimple_assign_rhs2 (stmt);
	    }
	  else if (!load_p
		   && rhs_code == BIT_FIELD_REF)
	    {
	      tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
	      if (!is_a <bb_vec_info> (vinfo)
		  || TREE_CODE (vec) != SSA_NAME
		  || !operand_equal_p (TYPE_SIZE (vectype),
				       TYPE_SIZE (TREE_TYPE (vec))))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: "
				     "BIT_FIELD_REF not supported\n");
		  /* Fatal mismatch.  */
		  matches[0] = false;
		  return false;
		}
	    }
	  else if (rhs_code == CFN_DIV_POW2)
	    {
	      need_same_oprnds = true;
	      first_op1 = gimple_call_arg (call_stmt, 1);
	    }
	}
      else
	{
	  if (first_stmt_code != rhs_code
	      && alt_stmt_code == ERROR_MARK)
	    alt_stmt_code = rhs_code;
	  if ((first_stmt_code != rhs_code
	       && (first_stmt_code != IMAGPART_EXPR
		   || rhs_code != REALPART_EXPR)
	       && (first_stmt_code != REALPART_EXPR
		   || rhs_code != IMAGPART_EXPR)
	       /* Handle mismatches in plus/minus by computing both
		  and merging the results.  */
	       && !((first_stmt_code == PLUS_EXPR
		     || first_stmt_code == MINUS_EXPR)
		    && (alt_stmt_code == PLUS_EXPR
			|| alt_stmt_code == MINUS_EXPR)
		    && rhs_code == alt_stmt_code)
	       && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
		    && (first_stmt_code == ARRAY_REF
			|| first_stmt_code == BIT_FIELD_REF
			|| first_stmt_code == INDIRECT_REF
			|| first_stmt_code == COMPONENT_REF
			|| first_stmt_code == MEM_REF)
		    && (rhs_code == ARRAY_REF
			|| rhs_code == BIT_FIELD_REF
			|| rhs_code == INDIRECT_REF
			|| rhs_code == COMPONENT_REF
			|| rhs_code == MEM_REF)))
	      || first_stmt_load_p != load_p
	      || first_stmt_phi_p != phi_p)
	    {
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "Build SLP failed: different operation "
				   "in stmt %G", stmt);
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "original stmt %G", first_stmt_info->stmt);
		}
	      /* Mismatch.  */
	      continue;
	    }

	  if (!load_p
	      && first_stmt_code == BIT_FIELD_REF
	      && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
		  != TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0)))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: different BIT_FIELD_REF "
				 "arguments in %G", stmt);
	      /* Mismatch.  */
	      continue;
	    }

	  if (call_stmt && first_stmt_code != CFN_MASK_LOAD)
	    {
	      if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
				       call_stmt))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: different calls in %G",
				     stmt);
		  /* Mismatch.  */
		  continue;
		}
	    }

	  if ((phi_p || gimple_could_trap_p (stmt_info->stmt))
	      && (gimple_bb (first_stmt_info->stmt)
		  != gimple_bb (stmt_info->stmt)))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: different BB for PHI "
				 "or possibly trapping operation in %G", stmt);
	      /* Mismatch.  */
	      continue;
	    }

	  if (need_same_oprnds)
	    {
	      tree other_op1 = gimple_arg (stmt, 1);
	      if (!operand_equal_p (first_op1, other_op1, 0))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: different shift "
				     "arguments in %G", stmt);
		  /* Mismatch.  */
		  continue;
		}
	    }

	  if (!types_compatible_p (vectype, *node_vectype))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: different vector type "
				 "in %G", stmt);
	      /* Mismatch.  */
	      continue;
	    }
	}

      /* Grouped store or load.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
	{
	  if (REFERENCE_CLASS_P (lhs))
	    {
	      /* Store.  */
	      ;
	    }
	  else
	    {
	      /* Load.  */
	      first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
	      if (prev_first_load)
		{
		  /* Check that there are no loads from different interleaving
		     chains in the same node.  */
		  if (prev_first_load != first_load)
		    {
		      if (dump_enabled_p ())
			dump_printf_loc (MSG_MISSED_OPTIMIZATION,
					 vect_location,
					 "Build SLP failed: different "
					 "interleaving chains in one node %G",
					 stmt);
		      /* Mismatch.  */
		      continue;
		    }
		}
	      else
		prev_first_load = first_load;
	    }
	} /* Grouped access.  */
      else
	{
	  if (load_p
	      && rhs_code != CFN_GATHER_LOAD
	      && rhs_code != CFN_MASK_GATHER_LOAD)
	    {
	      /* Not grouped load.  */
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: not grouped load %G", stmt);

	      /* FORNOW: Not grouped loads are not supported.  */
	      if (is_a <bb_vec_info> (vinfo) && i != 0)
		continue;
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }

	  /* Not memory operation.  */
	  if (!phi_p
	      && rhs_code.is_tree_code ()
	      && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_binary
	      && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_unary
	      && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_expression
	      && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_comparison
	      && rhs_code != VIEW_CONVERT_EXPR
	      && rhs_code != CALL_EXPR
	      && rhs_code != BIT_FIELD_REF)
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: operation unsupported %G",
				 stmt);
	      if (is_a <bb_vec_info> (vinfo) && i != 0)
		continue;
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }

	  if (rhs_code == COND_EXPR)
	    {
	      tree cond_expr = gimple_assign_rhs1 (stmt);
	      enum tree_code cond_code = TREE_CODE (cond_expr);
	      enum tree_code swap_code = ERROR_MARK;
	      enum tree_code invert_code = ERROR_MARK;

	      if (i == 0)
		first_cond_code = TREE_CODE (cond_expr);
	      else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
		{
		  bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
		  swap_code = swap_tree_comparison (cond_code);
		  invert_code = invert_tree_comparison (cond_code, honor_nans);
		}

	      if (first_cond_code == cond_code)
		;
	      /* Isomorphic can be achieved by swapping.  */
	      else if (first_cond_code == swap_code)
		swap[i] = 1;
	      /* Isomorphic can be achieved by inverting.  */
	      else if (first_cond_code == invert_code)
		swap[i] = 2;
	      else
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: different"
				     " operation %G", stmt);
		  /* Mismatch.  */
		  continue;
		}
	    }
	}

      matches[i] = true;
    }

  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      return false;

  /* If we allowed a two-operation SLP node verify the target can cope
     with the permute we are going to use.  */
  if (alt_stmt_code != ERROR_MARK
      && (!alt_stmt_code.is_tree_code ()
	  || TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_reference))
    {
      *two_operators = true;
    }

  if (maybe_soft_fail)
    {
      unsigned HOST_WIDE_INT const_nunits;
      if (!TYPE_VECTOR_SUBPARTS
	    (soft_fail_nunits_vectype).is_constant (&const_nunits)
	  || const_nunits > group_size)
	matches[0] = false;
      else
	{
	  /* With constant vector elements simulate a mismatch at the
	     point we need to split.  */
	  unsigned tail = group_size & (const_nunits - 1);
	  memset (&matches[group_size - tail], 0, sizeof (bool) * tail);
	}
      return false;
    }

  return true;
}
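/* Editor's note on the soft-fail handling above (illustrative, not in the
   original sources): with GROUP_SIZE == 7 and CONST_NUNITS == 4 the
   computation yields TAIL == 3, so matches[4..6] are cleared and the caller
   can split off the first four lanes, which do fill a vector.  */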
/* Traits for the hash_set to record failed SLP builds for a stmt set.
   Note we never remove apart from at destruction time so we do not
   need a special value for deleted that differs from empty.  */
struct bst_traits
{
  typedef vec<stmt_vec_info> value_type;
  typedef vec<stmt_vec_info> compare_type;
  static inline hashval_t hash (value_type);
  static inline bool equal (value_type existing, value_type candidate);
  static inline bool is_empty (value_type x) { return !x.exists (); }
  static inline bool is_deleted (value_type x) { return !x.exists (); }
  static const bool empty_zero_p = true;
  static inline void mark_empty (value_type &x) { x.release (); }
  static inline void mark_deleted (value_type &x) { x.release (); }
  static inline void remove (value_type &x) { x.release (); }
};

inline hashval_t
bst_traits::hash (value_type x)
{
  inchash::hash h;
  for (unsigned i = 0; i < x.length (); ++i)
    h.add_int (gimple_uid (x[i]->stmt));
  return h.end ();
}

inline bool
bst_traits::equal (value_type existing, value_type candidate)
{
  if (existing.length () != candidate.length ())
    return false;
  for (unsigned i = 0; i < existing.length (); ++i)
    if (existing[i] != candidate[i])
      return false;
  return true;
}
/* ??? This was std::pair<std::pair<tree_code, vect_def_type>, tree>
   but then vec::insert does memmove and that's not compatible with
   std::pair.  */
struct chain_op_t
{
  chain_op_t (tree_code code_, vect_def_type dt_, tree op_)
      : code (code_), dt (dt_), op (op_) {}
  tree_code code;
  vect_def_type dt;
  tree op;
};

/* Comparator for sorting associatable chains.  */

static int
dt_sort_cmp (const void *op1_, const void *op2_, void *)
{
  auto *op1 = (const chain_op_t *) op1_;
  auto *op2 = (const chain_op_t *) op2_;
  if (op1->dt != op2->dt)
    return (int)op1->dt - (int)op2->dt;
  return (int)op1->code - (int)op2->code;
}
/* Linearize the associatable expression chain at START with the
   associatable operation CODE (where PLUS_EXPR also allows MINUS_EXPR),
   filling CHAIN with the result and using WORKLIST as intermediate storage.
   CODE_STMT and ALT_CODE_STMT are filled with the first stmt using CODE
   or MINUS_EXPR.  *CHAIN_STMTS if not NULL is filled with all computation
   stmts, starting with START.  */

static void
vect_slp_linearize_chain (vec_info *vinfo,
			  vec<std::pair<tree_code, gimple *> > &worklist,
			  vec<chain_op_t> &chain,
			  enum tree_code code, gimple *start,
			  gimple *&code_stmt, gimple *&alt_code_stmt,
			  vec<gimple *> *chain_stmts)
{
  /* For each lane linearize the addition/subtraction (or other
     uniform associatable operation) expression tree.  */
  worklist.safe_push (std::make_pair (code, start));
  while (!worklist.is_empty ())
    {
      auto entry = worklist.pop ();
      gassign *stmt = as_a <gassign *> (entry.second);
      enum tree_code in_code = entry.first;
      enum tree_code this_code = gimple_assign_rhs_code (stmt);
      /* Pick some stmts suitable for SLP_TREE_REPRESENTATIVE.  */
      if (!code_stmt
	  && gimple_assign_rhs_code (stmt) == code)
	code_stmt = stmt;
      else if (!alt_code_stmt
	       && gimple_assign_rhs_code (stmt) == MINUS_EXPR)
	alt_code_stmt = stmt;
      if (chain_stmts)
	chain_stmts->safe_push (stmt);
      for (unsigned opnum = 1; opnum <= 2; ++opnum)
	{
	  tree op = gimple_op (stmt, opnum);
	  enum vect_def_type dt;
	  stmt_vec_info def_stmt_info;
	  bool res = vect_is_simple_use (op, vinfo, &dt, &def_stmt_info);
	  gcc_assert (res);
	  if (dt == vect_internal_def
	      && is_pattern_stmt_p (def_stmt_info))
	    op = gimple_get_lhs (def_stmt_info->stmt);
	  gimple *use_stmt;
	  use_operand_p use_p;
	  if (dt == vect_internal_def
	      && single_imm_use (op, &use_p, &use_stmt)
	      && is_gimple_assign (def_stmt_info->stmt)
	      && (gimple_assign_rhs_code (def_stmt_info->stmt) == code
		  || (code == PLUS_EXPR
		      && (gimple_assign_rhs_code (def_stmt_info->stmt)
			  == MINUS_EXPR))))
	    {
	      tree_code op_def_code = this_code;
	      if (op_def_code == MINUS_EXPR && opnum == 1)
		op_def_code = PLUS_EXPR;
	      if (in_code == MINUS_EXPR)
		op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
	      worklist.safe_push (std::make_pair (op_def_code,
						  def_stmt_info->stmt));
	    }
	  else
	    {
	      tree_code op_def_code = this_code;
	      if (op_def_code == MINUS_EXPR && opnum == 1)
		op_def_code = PLUS_EXPR;
	      if (in_code == MINUS_EXPR)
		op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
	      chain.safe_push (chain_op_t (op_def_code, dt, op));
	    }
	}
    }
}
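/* Editor's note (illustrative, not in the original sources): linearizing
   x = (a - b) + c with CODE == PLUS_EXPR adds the entries (PLUS_EXPR, c),
   (PLUS_EXPR, a) and (MINUS_EXPR, b) to CHAIN, assuming a, b and c are not
   themselves defined by further single-use additions in the region; the
   per-lane chains are sorted and matched up by the caller afterwards.  */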
typedef hash_map <vec <stmt_vec_info>, slp_tree,
		  simple_hashmap_traits <bst_traits, slp_tree> >
  scalar_stmts_to_slp_tree_map_t;

static slp_tree
vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
		       vec<stmt_vec_info> stmts, unsigned int group_size,
		       poly_uint64 *max_nunits,
		       bool *matches, unsigned *limit, unsigned *tree_size,
		       scalar_stmts_to_slp_tree_map_t *bst_map);

static slp_tree
vect_build_slp_tree (vec_info *vinfo,
		     vec<stmt_vec_info> stmts, unsigned int group_size,
		     poly_uint64 *max_nunits,
		     bool *matches, unsigned *limit, unsigned *tree_size,
		     scalar_stmts_to_slp_tree_map_t *bst_map)
{
  if (slp_tree *leader = bst_map->get (stmts))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
			 !(*leader)->failed ? "" : "failed ", *leader);
      if (!(*leader)->failed)
	{
	  SLP_TREE_REF_COUNT (*leader)++;
	  vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
	  return *leader;
	}
      memcpy (matches, (*leader)->failed, sizeof (bool) * group_size);
      return NULL;
    }

  /* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
     so we can pick up backedge destinations during discovery.  */
  slp_tree res = new _slp_tree;
  SLP_TREE_DEF_TYPE (res) = vect_internal_def;
  SLP_TREE_SCALAR_STMTS (res) = stmts;
  bst_map->put (stmts.copy (), res);

  if (*limit == 0)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "SLP discovery limit exceeded\n");
      /* Mark the node invalid so we can detect those when still in use
	 as backedge destinations.  */
      SLP_TREE_SCALAR_STMTS (res) = vNULL;
      SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
      res->failed = XNEWVEC (bool, group_size);
      memset (res->failed, 0, sizeof (bool) * group_size);
      memset (matches, 0, sizeof (bool) * group_size);
      return NULL;
    }
  --*limit;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "starting SLP discovery for node %p\n", res);

  poly_uint64 this_max_nunits = 1;
  slp_tree res_ = vect_build_slp_tree_2 (vinfo, res, stmts, group_size,
					 &this_max_nunits,
					 matches, limit, tree_size, bst_map);
  if (!res_)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "SLP discovery for node %p failed\n", res);
      /* Mark the node invalid so we can detect those when still in use
	 as backedge destinations.  */
      SLP_TREE_SCALAR_STMTS (res) = vNULL;
      SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
      res->failed = XNEWVEC (bool, group_size);
      if (flag_checking)
	{
	  unsigned i;
	  for (i = 0; i < group_size; ++i)
	    if (!matches[i])
	      break;
	  gcc_assert (i < group_size);
	}
      memcpy (res->failed, matches, sizeof (bool) * group_size);
    }
  else
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "SLP discovery for node %p succeeded\n", res);
      gcc_assert (res_ == res);
      res->max_nunits = this_max_nunits;
      vect_update_max_nunits (max_nunits, this_max_nunits);
      /* Keep a reference for the bst_map use.  */
      SLP_TREE_REF_COUNT (res)++;
    }
  return res_;
}
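/* Editor's note (illustrative, not in the original sources): because both
   successful and failed discoveries are recorded in BST_MAP above, two SLP
   instances whose operands share the same statement set re-use the cached
   node (or the cached failure) instead of re-running discovery, which keeps
   the total work bounded by *LIMIT.  */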
/* Helper for building an associated SLP node chain.  */

static void
vect_slp_build_two_operator_nodes (slp_tree perm, tree vectype,
				   slp_tree op0, slp_tree op1,
				   stmt_vec_info oper1, stmt_vec_info oper2,
				   vec<std::pair<unsigned, unsigned> > lperm)
{
  unsigned group_size = SLP_TREE_LANES (op1);

  slp_tree child1 = new _slp_tree;
  SLP_TREE_DEF_TYPE (child1) = vect_internal_def;
  SLP_TREE_VECTYPE (child1) = vectype;
  SLP_TREE_LANES (child1) = group_size;
  SLP_TREE_CHILDREN (child1).create (2);
  SLP_TREE_CHILDREN (child1).quick_push (op0);
  SLP_TREE_CHILDREN (child1).quick_push (op1);
  SLP_TREE_REPRESENTATIVE (child1) = oper1;

  slp_tree child2 = new _slp_tree;
  SLP_TREE_DEF_TYPE (child2) = vect_internal_def;
  SLP_TREE_VECTYPE (child2) = vectype;
  SLP_TREE_LANES (child2) = group_size;
  SLP_TREE_CHILDREN (child2).create (2);
  SLP_TREE_CHILDREN (child2).quick_push (op0);
  SLP_TREE_REF_COUNT (op0)++;
  SLP_TREE_CHILDREN (child2).quick_push (op1);
  SLP_TREE_REF_COUNT (op1)++;
  SLP_TREE_REPRESENTATIVE (child2) = oper2;

  SLP_TREE_DEF_TYPE (perm) = vect_internal_def;
  SLP_TREE_CODE (perm) = VEC_PERM_EXPR;
  SLP_TREE_VECTYPE (perm) = vectype;
  SLP_TREE_LANES (perm) = group_size;
  /* ??? We should set this NULL but that's not expected.  */
  SLP_TREE_REPRESENTATIVE (perm) = oper1;
  SLP_TREE_LANE_PERMUTATION (perm) = lperm;
  SLP_TREE_CHILDREN (perm).quick_push (child1);
  SLP_TREE_CHILDREN (perm).quick_push (child2);
}
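/* Editor's note (illustrative, not in the original sources): for an
   addsub-style group where half the lanes add and the other half subtract,
   the helper above creates one child computing all-PLUS results, one
   computing all-MINUS results, and a VEC_PERM_EXPR parent whose lane
   permutation picks, per lane, which child supplies the result.  */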
/* Recursively build an SLP tree starting from NODE.
   Fail (and return a value not equal to zero) if def-stmts are not
   isomorphic, require data permutation or are of unsupported types of
   operation.  Otherwise, return 0.
   The value returned is the depth in the SLP tree where a mismatch
   was found.  */

static slp_tree
vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
		       vec<stmt_vec_info> stmts, unsigned int group_size,
		       poly_uint64 *max_nunits,
		       bool *matches, unsigned *limit, unsigned *tree_size,
		       scalar_stmts_to_slp_tree_map_t *bst_map)
{
  unsigned nops, i, this_tree_size = 0;
  poly_uint64 this_max_nunits = *max_nunits;

  matches[0] = false;

  stmt_vec_info stmt_info = stmts[0];
  if (!is_a<gcall *> (stmt_info->stmt)
      && !is_a<gassign *> (stmt_info->stmt)
      && !is_a<gphi *> (stmt_info->stmt))
    return NULL;

  nops = gimple_num_args (stmt_info->stmt);
  if (const int *map = vect_get_operand_map (stmt_info->stmt))
    nops = map[0];

  /* If the SLP node is a PHI (induction or reduction), terminate
     the recursion.  */
  bool *skip_args = XALLOCAVEC (bool, nops);
  memset (skip_args, 0, sizeof (bool) * nops);
  if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
    if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
      {
	tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
	tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
						    group_size);
	if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
				     max_nunits))
	  return NULL;

	vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
	if (def_type == vect_induction_def)
	  {
	    /* Induction PHIs are not cycles but walk the initial
	       value.  Only for inner loops though, for outer loops
	       we need to pick up the value from the actual PHIs
	       to more easily support peeling and epilogue vectorization.  */
	    class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
	    if (!nested_in_vect_loop_p (loop, stmt_info))
	      skip_args[loop_preheader_edge (loop)->dest_idx] = true;
	    else
	      loop = loop->inner;
	    skip_args[loop_latch_edge (loop)->dest_idx] = true;
	  }
	else if (def_type == vect_reduction_def
		 || def_type == vect_double_reduction_def
		 || def_type == vect_nested_cycle)
	  {
	    /* Else def types have to match.  */
	    stmt_vec_info other_info;
	    bool all_same = true;
	    FOR_EACH_VEC_ELT (stmts, i, other_info)
	      {
		if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
		  return NULL;
		if (other_info != stmt_info)
		  all_same = false;
	      }
	    class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
	    /* Reduction initial values are not explicitly represented.  */
	    if (!nested_in_vect_loop_p (loop, stmt_info))
	      skip_args[loop_preheader_edge (loop)->dest_idx] = true;
	    /* Reduction chain backedge defs are filled manually.
	       ??? Need a better way to identify a SLP reduction chain PHI.
	       Or a better overall way to SLP match those.  */
	    if (all_same && def_type == vect_reduction_def)
	      skip_args[loop_latch_edge (loop)->dest_idx] = true;
	  }
	else if (def_type != vect_internal_def)
	  return NULL;
      }
  bool two_operators = false;
  unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
  tree vectype = NULL_TREE;
  if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
			      &this_max_nunits, matches, &two_operators,
			      &vectype))
    return NULL;

  /* If the SLP node is a load, terminate the recursion unless masked.  */
  if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
      && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
    {
      if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
	gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD)
		    || gimple_call_internal_p (stmt, IFN_GATHER_LOAD)
		    || gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD));
      else
	{
	  *max_nunits = this_max_nunits;
	  (*tree_size)++;
	  node = vect_create_new_slp_node (node, stmts, 0);
	  SLP_TREE_VECTYPE (node) = vectype;
	  /* And compute the load permutation.  Whether it is actually
	     a permutation depends on the unrolling factor which is
	     decided later.  */
	  vec<unsigned> load_permutation;
	  int j;
	  stmt_vec_info load_info;
	  load_permutation.create (group_size);
	  stmt_vec_info first_stmt_info
	    = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
	  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
	    {
	      int load_place = vect_get_place_in_interleaving_chain
		  (load_info, first_stmt_info);
	      gcc_assert (load_place != -1);
	      load_permutation.safe_push (load_place);
	    }
	  SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
	  return node;
	}
    }
  else if (gimple_assign_single_p (stmt_info->stmt)
	   && !gimple_vuse (stmt_info->stmt)
	   && gimple_assign_rhs_code (stmt_info->stmt) == BIT_FIELD_REF)
    {
      /* vect_build_slp_tree_2 determined all BIT_FIELD_REFs reference
	 the same SSA name vector of a compatible type to vectype.  */
      vec<std::pair<unsigned, unsigned> > lperm = vNULL;
      tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0);
      stmt_vec_info estmt_info;
      FOR_EACH_VEC_ELT (stmts, i, estmt_info)
	{
	  gassign *estmt = as_a <gassign *> (estmt_info->stmt);
	  tree bfref = gimple_assign_rhs1 (estmt);
	  HOST_WIDE_INT lane;
	  if (!known_eq (bit_field_size (bfref),
			 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype))))
	      || !constant_multiple_p (bit_field_offset (bfref),
				       bit_field_size (bfref), &lane))
	    {
	      lperm.release ();
	      matches[0] = false;
	      return NULL;
	    }
	  lperm.safe_push (std::make_pair (0, (unsigned)lane));
	}
      slp_tree vnode = vect_create_new_slp_node (vNULL);
      /* ??? We record vectype here but we hide eventually necessary
	 punning and instead rely on code generation to materialize
	 VIEW_CONVERT_EXPRs as necessary.  We instead should make
	 this explicit somehow.  */
      SLP_TREE_VECTYPE (vnode) = vectype;
      SLP_TREE_VEC_DEFS (vnode).safe_push (vec);
      /* We are always building a permutation node even if it is an identity
	 permute to shield the rest of the vectorizer from the odd node
	 representing an actual vector without any scalar ops.
	 ??? We could hide it completely with making the permute node
	 external?  */
      node = vect_create_new_slp_node (node, stmts, 1);
      SLP_TREE_CODE (node) = VEC_PERM_EXPR;
      SLP_TREE_LANE_PERMUTATION (node) = lperm;
      SLP_TREE_VECTYPE (node) = vectype;
      SLP_TREE_CHILDREN (node).quick_push (vnode);
      return node;
    }
  /* When discovery reaches an associatable operation see whether we can
     improve that to match up lanes in a way superior to the operand
     swapping code which at most looks at two defs.
     ??? For BB vectorization we cannot do the brute-force search
     for matching as we can succeed by means of builds from scalars
     and have no good way to "cost" one build against another.  */
  else if (is_a <loop_vec_info> (vinfo)
	   /* ??? We don't handle !vect_internal_def defs below.  */
	   && STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
	   && is_gimple_assign (stmt_info->stmt)
	   && (associative_tree_code (gimple_assign_rhs_code (stmt_info->stmt))
	       || gimple_assign_rhs_code (stmt_info->stmt) == MINUS_EXPR)
	   && ((FLOAT_TYPE_P (vectype) && flag_associative_math)
	       || (INTEGRAL_TYPE_P (TREE_TYPE (vectype))
		   && TYPE_OVERFLOW_WRAPS (TREE_TYPE (vectype)))))
    {
      /* See if we have a chain of (mixed) adds or subtracts or other
	 associatable ops.  */
      enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
      if (code == MINUS_EXPR)
	code = PLUS_EXPR;
      stmt_vec_info other_op_stmt_info = NULL;
      stmt_vec_info op_stmt_info = NULL;
      unsigned chain_len = 0;
      auto_vec<chain_op_t> chain;
      auto_vec<std::pair<tree_code, gimple *> > worklist;
      auto_vec<vec<chain_op_t> > chains (group_size);
      auto_vec<slp_tree, 4> children;
      bool hard_fail = true;
      for (unsigned lane = 0; lane < group_size; ++lane)
	{
	  /* For each lane linearize the addition/subtraction (or other
	     uniform associatable operation) expression tree.  */
	  gimple *op_stmt = NULL, *other_op_stmt = NULL;
	  vect_slp_linearize_chain (vinfo, worklist, chain, code,
				    stmts[lane]->stmt, op_stmt, other_op_stmt,
				    NULL);
	  if (!op_stmt_info && op_stmt)
	    op_stmt_info = vinfo->lookup_stmt (op_stmt);
	  if (!other_op_stmt_info && other_op_stmt)
	    other_op_stmt_info = vinfo->lookup_stmt (other_op_stmt);
	  if (chain.length () == 2)
	    {
	      /* In a chain of just two elements resort to the regular
		 operand swapping scheme.  If we run into a length
		 mismatch still hard-FAIL.  */
	      if (chain_len == 0)
		hard_fail = false;
	      else
		{
		  matches[lane] = false;
		  /* ??? We might want to process the other lanes, but
		     make sure to not give false matching hints to the
		     caller for lanes we did not process.  */
		  if (lane != group_size - 1)
		    matches[0] = false;
		}
	      break;
	    }
	  else if (chain_len == 0)
	    chain_len = chain.length ();
	  else if (chain.length () != chain_len)
	    {
	      /* ??? Here we could slip in magic to compensate with
		 neutral operands.  */
	      matches[lane] = false;
	      if (lane != group_size - 1)
		matches[0] = false;
	      break;
	    }
	  chains.quick_push (chain.copy ());
	  chain.truncate (0);
	}
      if (chains.length () == group_size)
	{
	  /* We cannot yet use SLP_TREE_CODE to communicate the operation.  */
	  if (!op_stmt_info)
	    {
	      hard_fail = false;
	      goto out;
	    }
	  /* Now we have a set of chains with the same length.  */
	  /* 1. pre-sort according to def_type and operation.  */
	  for (unsigned lane = 0; lane < group_size; ++lane)
	    chains[lane].stablesort (dt_sort_cmp, vinfo);
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_NOTE, vect_location,
			       "pre-sorted chains of %s\n",
			       get_tree_code_name (code));
	      for (unsigned lane = 0; lane < group_size; ++lane)
		{
		  for (unsigned opnum = 0; opnum < chain_len; ++opnum)
		    dump_printf (MSG_NOTE, "%s %T ",
				 get_tree_code_name (chains[lane][opnum].code),
				 chains[lane][opnum].op);
		  dump_printf (MSG_NOTE, "\n");
		}
	    }
	  /* 2. try to build children nodes, associating as necessary.  */
	  for (unsigned n = 0; n < chain_len; ++n)
	    {
	      vect_def_type dt = chains[0][n].dt;
	      unsigned lane;
	      for (lane = 0; lane < group_size; ++lane)
		if (chains[lane][n].dt != dt)
		  {
		    if (dt == vect_constant_def
			&& chains[lane][n].dt == vect_external_def)
		      dt = vect_external_def;
		    else if (dt == vect_external_def
			     && chains[lane][n].dt == vect_constant_def)
		      ;
		    else
		      break;
		  }
	      if (lane != group_size)
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_NOTE, vect_location,
				     "giving up on chain due to mismatched "
				     "def types\n");
		  matches[lane] = false;
		  if (lane != group_size - 1)
		    matches[0] = false;
		  goto out;
		}
	      if (dt == vect_constant_def
		  || dt == vect_external_def)
		{
		  /* Check whether we can build the invariant.  If we can't
		     we never will be able to.  */
		  tree type = TREE_TYPE (chains[0][n].op);
		  if (!GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
		      && (TREE_CODE (type) == BOOLEAN_TYPE
			  || !can_duplicate_and_interleave_p (vinfo,
							      group_size,
							      type)))
		    {
		      matches[0] = false;
		      goto out;
		    }
		  vec<tree> ops;
		  ops.create (group_size);
		  for (lane = 0; lane < group_size; ++lane)
		    ops.quick_push (chains[lane][n].op);
		  slp_tree child = vect_create_new_slp_node (ops);
		  SLP_TREE_DEF_TYPE (child) = dt;
		  children.safe_push (child);
		}
	      else if (dt != vect_internal_def)
		{
		  /* Not sure, we might need sth special.
		     gcc.dg/vect/pr96854.c,
		     gfortran.dg/vect/fast-math-pr37021.f90
		     and gfortran.dg/vect/pr61171.f trigger.  */
		  /* Soft-fail for now.  */
		  hard_fail = false;
		  goto out;
		}
	      else
		{
		  vec<stmt_vec_info> op_stmts;
		  op_stmts.create (group_size);
		  slp_tree child = NULL;
		  /* Brute-force our way.  We have to consider a lane
		     failing after fixing an earlier fail up in the
		     SLP discovery recursion.  So track the current
		     permute per lane.  */
		  unsigned *perms = XALLOCAVEC (unsigned, group_size);
		  memset (perms, 0, sizeof (unsigned) * group_size);
		  do
		    {
		      op_stmts.truncate (0);
		      for (lane = 0; lane < group_size; ++lane)
			op_stmts.quick_push
			  (vinfo->lookup_def (chains[lane][n].op));
		      child = vect_build_slp_tree (vinfo, op_stmts,
						   group_size, &this_max_nunits,
						   matches, limit,
						   &this_tree_size, bst_map);
		      /* ??? We're likely getting too many fatal mismatches
			 here so maybe we want to ignore them (but then we
			 have no idea which lanes fatally mismatched).  */
		      if (child || !matches[0])
			break;
		      /* Swap another lane we have not yet matched up into
			 lanes that did not match.  If we run out of
			 permute possibilities for a lane terminate the
			 search.  */
		      bool term = false;
		      for (lane = 1; lane < group_size; ++lane)
			if (!matches[lane])
			  {
			    if (n + perms[lane] + 1 == chain_len)
			      {
				term = true;
				break;
			      }
			    std::swap (chains[lane][n],
				       chains[lane][n + perms[lane] + 1]);
			    perms[lane]++;
			  }
		      if (term)
			break;
		    }
		  while (1);
		  if (!child)
		    {
		      if (dump_enabled_p ())
			dump_printf_loc (MSG_NOTE, vect_location,
					 "failed to match up op %d\n", n);
		      op_stmts.release ();
		      if (lane != group_size - 1)
			matches[0] = false;
		      else
			matches[lane] = false;
		      goto out;
		    }
		  if (dump_enabled_p ())
		    {
		      dump_printf_loc (MSG_NOTE, vect_location,
				       "matched up op %d to\n", n);
		      vect_print_slp_tree (MSG_NOTE, vect_location, child);
		    }
		  children.safe_push (child);
		}
	    }
2027 for (unsigned lane
= 0; lane
< group_size
; ++lane
)
2028 if (chains
[lane
][0].code
!= code
)
2030 /* See if there's any alternate all-PLUS entry. */
2032 for (n
= 1; n
< chain_len
; ++n
)
2034 for (lane
= 0; lane
< group_size
; ++lane
)
2035 if (chains
[lane
][n
].code
!= code
)
2037 if (lane
== group_size
)
2042 /* Swap that in at first position. */
2043 std::swap (children
[0], children
[n
]);
2044 for (lane
= 0; lane
< group_size
; ++lane
)
2045 std::swap (chains
[lane
][0], chains
[lane
][n
]);
2049 /* ??? When this triggers and we end up with two
2050 vect_constant/external_def up-front things break (ICE)
2051 spectacularly finding an insertion place for the
2052 all-constant op. We should have a fully
2053 vect_internal_def operand though(?) so we can swap
2054 that into first place and then prepend the all-zero
2056 if (dump_enabled_p ())
2057 dump_printf_loc (MSG_NOTE
, vect_location
,
2058 "inserting constant zero to compensate "
2059 "for (partially) negated first "
2062 for (lane
= 0; lane
< group_size
; ++lane
)
2063 chains
[lane
].safe_insert
2064 (0, chain_op_t (code
, vect_constant_def
, NULL_TREE
));
2066 zero_ops
.create (group_size
);
2067 zero_ops
.quick_push (build_zero_cst (TREE_TYPE (vectype
)));
2068 for (lane
= 1; lane
< group_size
; ++lane
)
2069 zero_ops
.quick_push (zero_ops
[0]);
2070 slp_tree zero
= vect_create_new_slp_node (zero_ops
);
2071 SLP_TREE_DEF_TYPE (zero
) = vect_constant_def
;
2072 children
.safe_insert (0, zero
);
2076 for (unsigned i
= 1; i
< children
.length (); ++i
)
2078 slp_tree op0
= children
[i
- 1];
2079 slp_tree op1
= children
[i
];
2080 bool this_two_op
= false;
2081 for (unsigned lane
= 0; lane
< group_size
; ++lane
)
2082 if (chains
[lane
][i
].code
!= chains
[0][i
].code
)
2088 if (i
== children
.length () - 1)
2089 child
= vect_create_new_slp_node (node
, stmts
, 2);
2091 child
= vect_create_new_slp_node (2, ERROR_MARK
);
2094 vec
<std::pair
<unsigned, unsigned> > lperm
;
2095 lperm
.create (group_size
);
2096 for (unsigned lane
= 0; lane
< group_size
; ++lane
)
2097 lperm
.quick_push (std::make_pair
2098 (chains
[lane
][i
].code
!= chains
[0][i
].code
, lane
));
2099 vect_slp_build_two_operator_nodes (child
, vectype
, op0
, op1
,
2100 (chains
[0][i
].code
== code
2102 : other_op_stmt_info
),
2103 (chains
[0][i
].code
== code
2104 ? other_op_stmt_info
2110 SLP_TREE_DEF_TYPE (child
) = vect_internal_def
;
2111 SLP_TREE_VECTYPE (child
) = vectype
;
2112 SLP_TREE_LANES (child
) = group_size
;
2113 SLP_TREE_CHILDREN (child
).quick_push (op0
);
2114 SLP_TREE_CHILDREN (child
).quick_push (op1
);
2115 SLP_TREE_REPRESENTATIVE (child
)
2116 = (chains
[0][i
].code
== code
2117 ? op_stmt_info
: other_op_stmt_info
);
2119 children
[i
] = child
;
2121 *tree_size
+= this_tree_size
+ 1;
2122 *max_nunits
= this_max_nunits
;
2123 while (!chains
.is_empty ())
2124 chains
.pop ().release ();
2128 while (!children
.is_empty ())
2129 vect_free_slp_tree (children
.pop ());
2130 while (!chains
.is_empty ())
2131 chains
.pop ().release ();
2132 /* Hard-fail, otherwise we might run into quadratic processing of the
2133 chains starting one stmt into the chain again. */
2136 /* Fall thru to normal processing. */
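  /* Illustrative sketch (not part of the original sources): for a group of
     two scalar stmts such as

	 x0 = a0 + b0 - c0;
	 x1 = a1 + b1 - c1;

     the association code above decomposes each lane into the chain
     { (PLUS, a), (PLUS, b), (MINUS, c) } and then combines the per-operand
     child nodes pairwise, emitting a two-operator node via
     vect_slp_build_two_operator_nodes whenever the operation codes differ
     between lanes.  The names a0/b0/c0 are made up for the example only.  */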
  /* Get at the operands, verifying they are compatible.  */
  vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
  slp_oprnd_info oprnd_info;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
      int res = vect_get_and_check_slp_defs (vinfo, swap[i], skip_args,
					     stmts, i, &oprnds_info);
	  matches[(res == -1) ? 0 : i] = false;
      for (i = 0; i < group_size; ++i)
      vect_free_oprnd_info (oprnds_info);

  auto_vec<slp_tree, 4> children;

  stmt_info = stmts[0];

  /* Create SLP_TREE nodes for the definition node/s.  */
  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
      /* We're skipping certain operands from processing, for example
	 outer loop reduction initial defs.  */
	  children.safe_push (NULL);
      if (oprnd_info->first_dt == vect_uninitialized_def)
	  /* COND_EXPR have one too many eventually if the condition
	  gcc_assert (i == 3 && nops == 4);
      if (is_a <bb_vec_info> (vinfo)
	  && oprnd_info->first_dt == vect_internal_def
	  && !oprnd_info->any_pattern)
	  /* For BB vectorization, if all defs are the same do not
	     bother to continue the build along the single-lane
	     graph but use a splat of the scalar value.  */
	  stmt_vec_info first_def = oprnd_info->def_stmts[0];
	  for (j = 1; j < group_size; ++j)
	    if (oprnd_info->def_stmts[j] != first_def)
	      /* But avoid doing this for loads where we may be
		 able to CSE things, unless the stmt is not
	      && (!STMT_VINFO_VECTORIZABLE (first_def)
		  || !gimple_vuse (first_def->stmt)))
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "Using a splat of the uniform operand %G",
	      oprnd_info->first_dt = vect_external_def;
      if (oprnd_info->first_dt == vect_external_def
	  || oprnd_info->first_dt == vect_constant_def)
	  slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
	  SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
	  oprnd_info->ops = vNULL;
	  children.safe_push (invnode);
      if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
					group_size, &this_max_nunits,
					&this_tree_size, bst_map)) != NULL)
	  oprnd_info->def_stmts = vNULL;
	  children.safe_push (child);
      /* If the SLP build for operand zero failed and operand zero
	 and one can be commutated try that for the scalar stmts
	 that failed the match.  */
	  /* A first scalar stmt mismatch signals a fatal mismatch.  */
	  /* ??? For COND_EXPRs we can swap the comparison operands
	     as well as the arms under some constraints.  */
	  && oprnds_info[1]->first_dt == vect_internal_def
	  && is_gimple_assign (stmt_info->stmt)
	  /* Swapping operands for reductions breaks assumptions later on.  */
	  && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
	  && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
	  /* See whether we can swap the matching or the non-matching
	  bool swap_not_matching = true;
	      for (j = 0; j < group_size; ++j)
		  if (matches[j] != !swap_not_matching)
		  stmt_vec_info stmt_info = stmts[j];
		  /* Verify if we can swap operands of this stmt.  */
		  gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
		      || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
		      if (!swap_not_matching)
		      swap_not_matching = false;
	  while (j != group_size);

	  /* Swap mismatched definition stmts.  */
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Re-trying with swapped operands of stmts ");
	  for (j = 0; j < group_size; ++j)
	    if (matches[j] == !swap_not_matching)
		std::swap (oprnds_info[0]->def_stmts[j],
			   oprnds_info[1]->def_stmts[j]);
		std::swap (oprnds_info[0]->ops[j],
			   oprnds_info[1]->ops[j]);
		if (dump_enabled_p ())
		  dump_printf (MSG_NOTE, "%d ", j);
	  if (dump_enabled_p ())
	    dump_printf (MSG_NOTE, "\n");
	  /* After swapping some operands we lost track whether an
	     operand has any pattern defs so be conservative here.  */
	  if (oprnds_info[0]->any_pattern || oprnds_info[1]->any_pattern)
	    oprnds_info[0]->any_pattern = oprnds_info[1]->any_pattern = true;
	  /* And try again with scratch 'matches' ...  */
	  bool *tem = XALLOCAVEC (bool, group_size);
	  if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
					    group_size, &this_max_nunits,
					    &this_tree_size, bst_map)) != NULL)
	      oprnd_info->def_stmts = vNULL;
	      children.safe_push (child);

      /* If the SLP build failed and we analyze a basic-block
	 simply treat nodes we fail to build as externally defined
	 (and thus build vectors from the scalar defs).
	 The cost model will reject outright expensive cases.
	 ??? This doesn't treat cases where permutation ultimately
	 fails (or we don't try permutation below).  Ideally we'd
	 even compute a permutation that will end up with the maximum
      if (is_a <bb_vec_info> (vinfo)
	  /* ??? Rejecting patterns this way doesn't work.  We'd have to
	     do extra work to cancel the pattern so the uses see the
	  && !is_pattern_stmt_p (stmt_info)
	  && !oprnd_info->any_pattern)
	  /* But if there's a leading vector sized set of matching stmts
	     fail here so we can split the group.  This matches the condition
	     vect_analyze_slp_instance uses.  */
	  /* ??? We might want to split here and combine the results to support
	     multiple vector sizes better.  */
	  for (j = 0; j < group_size; ++j)
	  if (!known_ge (j, TYPE_VECTOR_SUBPARTS (vectype)))
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "Building vector operands from scalars\n");
	      child = vect_create_new_slp_node (oprnd_info->ops);
	      children.safe_push (child);
	      oprnd_info->ops = vNULL;

      gcc_assert (child == NULL);
      FOR_EACH_VEC_ELT (children, j, child)
	  vect_free_slp_tree (child);
      vect_free_oprnd_info (oprnds_info);

  vect_free_oprnd_info (oprnds_info);

  /* If all children of a node are built up from uniform scalars, or the node
     needs more than one possibly expensive vector construction, then just
     throw that away and cause it to be built up from scalars.
     The exception is the SLP node for the vector store.  */
  if (is_a <bb_vec_info> (vinfo)
      && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
      /* ??? Rejecting patterns this way doesn't work.  We'd have to
	 do extra work to cancel the pattern so the uses see the
      && !is_pattern_stmt_p (stmt_info))
      bool all_uniform_p = true;
      unsigned n_vector_builds = 0;
      FOR_EACH_VEC_ELT (children, j, child)
	  else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
	    all_uniform_p = false;
	  else if (!vect_slp_tree_uniform_p (child))
	      all_uniform_p = false;
	      if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
	  || n_vector_builds > 1
	  || (n_vector_builds == children.length ()
	      && is_a <gphi *> (stmt_info->stmt)))
	  FOR_EACH_VEC_ELT (children, j, child)
	      vect_free_slp_tree (child);
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Building parent vector operands from "
			     "scalars instead\n");

  *tree_size += this_tree_size + 1;
  *max_nunits = this_max_nunits;

      /* ??? We'd likely want to either cache in bst_map sth like
	 { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
	 the true { a+b, a+b, a+b, a+b } ... but there we don't have
	 explicit stmts to put in so the keying on 'stmts' doesn't
	 work (but we have the same issue with nodes that use 'ops').  */
      slp_tree one = new _slp_tree;
      slp_tree two = new _slp_tree;
      SLP_TREE_DEF_TYPE (one) = vect_internal_def;
      SLP_TREE_DEF_TYPE (two) = vect_internal_def;
      SLP_TREE_VECTYPE (one) = vectype;
      SLP_TREE_VECTYPE (two) = vectype;
      SLP_TREE_CHILDREN (one).safe_splice (children);
      SLP_TREE_CHILDREN (two).safe_splice (children);
      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
	SLP_TREE_REF_COUNT (child)++;

      /* Here we record the original defs since this
	 node represents the final lane configuration.  */
      node = vect_create_new_slp_node (node, stmts, 2);
      SLP_TREE_VECTYPE (node) = vectype;
      SLP_TREE_CODE (node) = VEC_PERM_EXPR;
      SLP_TREE_CHILDREN (node).quick_push (one);
      SLP_TREE_CHILDREN (node).quick_push (two);
      gassign *stmt = as_a <gassign *> (stmts[0]->stmt);
      enum tree_code code0 = gimple_assign_rhs_code (stmt);
      enum tree_code ocode = ERROR_MARK;
      stmt_vec_info ostmt_info;
      FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
	  gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
	  if (gimple_assign_rhs_code (ostmt) != code0)
	      SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (1, i));
	      ocode = gimple_assign_rhs_code (ostmt);
	    SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (0, i));
      SLP_TREE_CODE (one) = code0;
      SLP_TREE_CODE (two) = ocode;
      SLP_TREE_LANES (one) = stmts.length ();
      SLP_TREE_LANES (two) = stmts.length ();
      SLP_TREE_REPRESENTATIVE (one) = stmts[0];
      SLP_TREE_REPRESENTATIVE (two) = stmts[j];

  node = vect_create_new_slp_node (node, stmts, nops);
  SLP_TREE_VECTYPE (node) = vectype;
  SLP_TREE_CHILDREN (node).splice (children);
/* Dump a single SLP tree NODE.  */

vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
  stmt_vec_info stmt_info;
  dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
  dump_user_location_t user_loc = loc.get_user_location ();
  dump_printf_loc (metadata, user_loc, "node%s %p (max_nunits=%u, refcnt=%u)",
		   SLP_TREE_DEF_TYPE (node) == vect_external_def
		   : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
		   estimated_poly_value (node->max_nunits),
		   SLP_TREE_REF_COUNT (node));
  if (SLP_TREE_VECTYPE (node))
    dump_printf (metadata, " %T", SLP_TREE_VECTYPE (node));
  dump_printf (metadata, "\n");
  if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
      if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
	dump_printf_loc (metadata, user_loc, "op: VEC_PERM_EXPR\n");
	dump_printf_loc (metadata, user_loc, "op template: %G",
			 SLP_TREE_REPRESENTATIVE (node)->stmt);
  if (SLP_TREE_SCALAR_STMTS (node).exists ())
    FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
      dump_printf_loc (metadata, user_loc, "\tstmt %u %G", i, stmt_info->stmt);
      dump_printf_loc (metadata, user_loc, "\t{ ");
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
	dump_printf (metadata, "%T%s ", op,
		     i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
      dump_printf (metadata, "}\n");
  if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
      dump_printf_loc (metadata, user_loc, "\tload permutation {");
      FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
	dump_printf (dump_kind, " %u", j);
      dump_printf (dump_kind, " }\n");
  if (SLP_TREE_LANE_PERMUTATION (node).exists ())
      dump_printf_loc (metadata, user_loc, "\tlane permutation {");
      for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
	dump_printf (dump_kind, " %u[%u]",
		     SLP_TREE_LANE_PERMUTATION (node)[i].first,
		     SLP_TREE_LANE_PERMUTATION (node)[i].second);
      dump_printf (dump_kind, " }\n");
  if (SLP_TREE_CHILDREN (node).is_empty ())
  dump_printf_loc (metadata, user_loc, "\tchildren");
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    dump_printf (dump_kind, " %p", (void *)child);
  dump_printf (dump_kind, "\n");

debug (slp_tree node)
  debug_dump_context ctx;
  vect_print_slp_tree (MSG_NOTE,
		       dump_location_t::from_location_t (UNKNOWN_LOCATION),

/* Recursive helper for the dot producer below.  */

dot_slp_tree (FILE *f, slp_tree node, hash_set<slp_tree> &visited)
  if (visited.add (node))

  fprintf (f, "\"%p\" [label=\"", (void *)node);
  vect_print_slp_tree (MSG_NOTE,
		       dump_location_t::from_location_t (UNKNOWN_LOCATION),
  fprintf (f, "\"];\n");

  for (slp_tree child : SLP_TREE_CHILDREN (node))
    fprintf (f, "\"%p\" -> \"%p\";", (void *)node, (void *)child);

  for (slp_tree child : SLP_TREE_CHILDREN (node))
      dot_slp_tree (f, child, visited);

dot_slp_tree (const char *fname, slp_tree node)
  FILE *f = fopen (fname, "w");
  fprintf (f, "digraph {\n");
  debug_dump_context ctx (f);
  hash_set<slp_tree> visited;
  dot_slp_tree (f, node, visited);
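/* Illustrative sketch (not part of the original sources): dot_slp_tree
   writes a graphviz description of the SLP graph, roughly of the form

       digraph {
	 "0x2f9b2e0" [label="node 0x2f9b2e0 (max_nunits=4, refcnt=1) ..."];
	 "0x2f9b2e0" -> "0x2f9b3a0";
	 ...
       }

   where each node label is produced by vect_print_slp_tree redirected
   into the file via debug_dump_context.  The pointer values shown are
   arbitrary.  */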
/* Dump a slp tree NODE using flags specified in DUMP_KIND.  */

vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
		      slp_tree node, hash_set<slp_tree> &visited)
  if (visited.add (node))

  vect_print_slp_tree (dump_kind, loc, node);

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
      vect_print_slp_graph (dump_kind, loc, child, visited);

vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
  hash_set<slp_tree> visited;
  vect_print_slp_graph (dump_kind, loc, entry, visited);

/* Mark the tree rooted at NODE with PURE_SLP.  */

vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
  stmt_vec_info stmt_info;

  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)

  if (visited.add (node))

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    STMT_SLP_TYPE (stmt_info) = pure_slp;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
      vect_mark_slp_stmts (child, visited);

vect_mark_slp_stmts (slp_tree node)
  hash_set<slp_tree> visited;
  vect_mark_slp_stmts (node, visited);

/* Mark the statements of the tree rooted at NODE as relevant (vect_used).  */

vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
  stmt_vec_info stmt_info;

  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)

  if (visited.add (node))

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
      gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
		  || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
      STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
      vect_mark_slp_stmts_relevant (child, visited);

vect_mark_slp_stmts_relevant (slp_tree node)
  hash_set<slp_tree> visited;
  vect_mark_slp_stmts_relevant (node, visited);

/* Gather loads in the SLP graph NODE and populate the INST loads array.  */

vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
		       hash_set<slp_tree> &visited)
  if (!node || visited.add (node))

  if (SLP_TREE_CHILDREN (node).length () == 0)
      if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
      stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
	  && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
	loads.safe_push (node);
      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
	vect_gather_slp_loads (loads, child, visited);

/* Find the last store in SLP INSTANCE.  */

vect_find_last_scalar_stmt_in_slp (slp_tree node)
  stmt_vec_info last = NULL;
  stmt_vec_info stmt_vinfo;

  for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
      stmt_vinfo = vect_orig_stmt (stmt_vinfo);
      last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;

/* Find the first stmt in NODE.  */

vect_find_first_scalar_stmt_in_slp (slp_tree node)
  stmt_vec_info first = NULL;
  stmt_vec_info stmt_vinfo;

  for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
      stmt_vinfo = vect_orig_stmt (stmt_vinfo);
	  || get_later_stmt (stmt_vinfo, first) == first)
/* Splits a group of stores, currently beginning at FIRST_VINFO, into
   two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
   (also containing the first GROUP1_SIZE stmts, since stores are
   consecutive), the second containing the remainder.
   Return the first stmt in the second group.  */

static stmt_vec_info
vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
  gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
  gcc_assert (group1_size > 0);
  int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
  gcc_assert (group2_size > 0);
  DR_GROUP_SIZE (first_vinfo) = group1_size;

  stmt_vec_info stmt_info = first_vinfo;
  for (unsigned i = group1_size; i > 1; i--)
      stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
      gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
  /* STMT is now the last element of the first group.  */
  stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
  DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;

  DR_GROUP_SIZE (group2) = group2_size;
  for (stmt_info = group2; stmt_info;
       stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
      DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
      gcc_assert (DR_GROUP_GAP (stmt_info) == 1);

  /* For the second group, the DR_GROUP_GAP is that before the original group,
     plus skipping over the first vector.  */
  DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;

  /* DR_GROUP_GAP of the first group now has to skip over the second group too.  */
  DR_GROUP_GAP (first_vinfo) += group2_size;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
		     group1_size, group2_size);
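/* Illustrative example (not from the original sources): splitting a store
   group of size 6 at GROUP1_SIZE == 4 leaves the first group with
   DR_GROUP_SIZE == 4 and the second with DR_GROUP_SIZE == 2.  Assuming the
   original group had DR_GROUP_GAP == 0, the second group then gets
   DR_GROUP_GAP == 0 + 4 (it has to skip over the first group's elements)
   and the first group gets DR_GROUP_GAP == 0 + 2 (it has to skip over the
   second group), so each split group still advances by the full original
   group size per scalar iteration.  */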
/* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
   statements and a vector of NUNITS elements.  */

calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
  return exact_div (common_multiple (nunits, group_size), group_size);
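/* Illustrative example (not from the original sources): with NUNITS == 4
   and GROUP_SIZE == 6 the least common multiple is 12, so the instance has
   to be unrolled by exact_div (12, 6) == 2 to fill whole vectors; with
   GROUP_SIZE == 8 the result is exact_div (8, 8) == 1 and no unrolling is
   needed.  */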
/* Helper that checks to see if a node is a load node.  */

vect_is_slp_load_node (slp_tree root)
  return SLP_TREE_DEF_TYPE (root) == vect_internal_def
	 && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_REPRESENTATIVE (root))
	 && DR_IS_READ (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (root)));

/* Helper function of optimize_load_redistribution that performs the operation

optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t *bst_map,
				vec_info *vinfo, unsigned int group_size,
				hash_map<slp_tree, slp_tree> *load_map,
  if (slp_tree *leader = load_map->get (root))

  /* For now, we don't know anything about externals so do not do anything.  */
  if (!root || SLP_TREE_DEF_TYPE (root) != vect_internal_def)
  else if (SLP_TREE_CODE (root) == VEC_PERM_EXPR)
      /* First convert this node into a load node and add it to the leaves
	 list and flatten the permute from a lane to a load one.  If it's
	 unneeded it will be elided later.  */
      vec<stmt_vec_info> stmts;
      stmts.create (SLP_TREE_LANES (root));
      lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION (root);
      for (unsigned j = 0; j < lane_perm.length (); j++)
	  std::pair<unsigned, unsigned> perm = lane_perm[j];
	  node = SLP_TREE_CHILDREN (root)[perm.first];

	  if (!vect_is_slp_load_node (node)
	      || SLP_TREE_CHILDREN (node).exists ())

	  stmts.quick_push (SLP_TREE_SCALAR_STMTS (node)[perm.second]);

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "converting stmts on permute node %p\n", root);

      bool *matches = XALLOCAVEC (bool, group_size);
      poly_uint64 max_nunits = 1;
      unsigned tree_size = 0, limit = 1;
      node = vect_build_slp_tree (vinfo, stmts, group_size, &max_nunits,
				  matches, &limit, &tree_size, bst_map);

      load_map->put (root, node);

  load_map->put (root, NULL);

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i, node)
	= optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
	  SLP_TREE_REF_COUNT (value)++;
	  SLP_TREE_CHILDREN (root)[i] = value;
	  /* ??? We know the original leafs of the replaced nodes will
	     be referenced by bst_map, only the permutes created by
	     pattern matching are not.  */
	  if (SLP_TREE_REF_COUNT (node) == 1)
	    load_map->remove (node);
	  vect_free_slp_tree (node);
/* Temporary workaround for loads not being CSEd during SLP build.  This
   function will traverse the SLP tree rooted in ROOT for INSTANCE and find
   VEC_PERM nodes that blend vectors from multiple nodes that all read from the
   same DR such that the final operation is equal to a permuted load.  Such
   NODES are then directly converted into LOADS themselves.  The nodes are
   CSEd using BST_MAP.  */

optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t *bst_map,
			      vec_info *vinfo, unsigned int group_size,
			      hash_map<slp_tree, slp_tree> *load_map,
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i, node)
	= optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
	  SLP_TREE_REF_COUNT (value)++;
	  SLP_TREE_CHILDREN (root)[i] = value;
	  /* ??? We know the original leafs of the replaced nodes will
	     be referenced by bst_map, only the permutes created by
	     pattern matching are not.  */
	  if (SLP_TREE_REF_COUNT (node) == 1)
	    load_map->remove (node);
	  vect_free_slp_tree (node);
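/* Illustrative sketch (not part of the original sources): given a VEC_PERM
   node whose lanes select { a[1] from load node L1, a[0] from load node L2 }
   where both L1 and L2 read from the same data reference a, the helpers
   above rebuild the node directly as a single load node with scalar stmts
   { a[1], a[0] }, i.e. a permuted load, and CSE it through BST_MAP.  The
   array name a is made up for the example only.  */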
/* Helper function of vect_match_slp_patterns.

   Attempts to match patterns against the slp tree rooted in REF_NODE using
   VINFO.  Patterns are matched in post-order traversal.

   If matching is successful the value in REF_NODE is updated and returned, if
   not then it is returned unchanged.  */

vect_match_slp_patterns_2 (slp_tree *ref_node, vec_info *vinfo,
			   slp_tree_to_load_perm_map_t *perm_cache,
			   slp_compat_nodes_map_t *compat_cache,
			   hash_set<slp_tree> *visited)
  slp_tree node = *ref_node;
  bool found_p = false;
  if (!node || visited->add (node))

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    found_p |= vect_match_slp_patterns_2 (&SLP_TREE_CHILDREN (node)[i],
					  vinfo, perm_cache, compat_cache,

  for (unsigned x = 0; x < num__slp_patterns; x++)
      vect_pattern *pattern
	= slp_patterns[x] (perm_cache, compat_cache, ref_node);
	  pattern->build (vinfo);

/* Applies pattern matching to the given SLP tree rooted in REF_NODE using
   The modified tree is returned.  Patterns are tried in order and multiple
   patterns may match.  */

vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
			 hash_set<slp_tree> *visited,
			 slp_tree_to_load_perm_map_t *perm_cache,
			 slp_compat_nodes_map_t *compat_cache)
  DUMP_VECT_SCOPE ("vect_match_slp_patterns");
  slp_tree *ref_node = &SLP_INSTANCE_TREE (instance);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "Analyzing SLP tree %p for patterns\n",
		     SLP_INSTANCE_TREE (instance));

  return vect_match_slp_patterns_2 (ref_node, vinfo, perm_cache, compat_cache,
/* STMT_INFO is a store group of size GROUP_SIZE that we are considering
   splitting into two, with the first split group having size NEW_GROUP_SIZE.
   Return true if we could use IFN_STORE_LANES instead and if that appears
   to be the better approach.  */

vect_slp_prefer_store_lanes_p (vec_info *vinfo, stmt_vec_info stmt_info,
			       unsigned int group_size,
			       unsigned int new_group_size)
  tree scalar_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
  tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);

  /* Allow the split if one of the two new groups would operate on full
     vectors *within* rather than across one scalar loop iteration.
     This is purely a heuristic, but it should work well for group
     sizes of 3 and 4, where the possible splits are:

       3->2+1:  OK if the vector has exactly two elements
       4->3+1:  Less clear-cut.  */
  if (multiple_p (group_size - new_group_size, TYPE_VECTOR_SUBPARTS (vectype))
      || multiple_p (new_group_size, TYPE_VECTOR_SUBPARTS (vectype)))
  return vect_store_lanes_supported (vectype, group_size, false);
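/* Illustrative example (not from the original sources): for a group of
   GROUP_SIZE == 4 stores of 32-bit elements and a 64-bit vector type
   (TYPE_VECTOR_SUBPARTS == 2), a 3+1 split satisfies neither multiple_p
   test (neither 1 nor 3 is a multiple of 2), so the function prefers
   IFN_STORE_LANES if the target supports it for this vector type; a 2+2
   split passes the test and the split is allowed instead.  */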
/* Analyze an SLP instance starting from a group of grouped stores.  Call
   vect_build_slp_tree to build a tree of packed stmts if possible.
   Return FALSE if it's impossible to SLP any stmt in the loop.  */

vect_analyze_slp_instance (vec_info *vinfo,
			   scalar_stmts_to_slp_tree_map_t *bst_map,
			   stmt_vec_info stmt_info, slp_instance_kind kind,
			   unsigned max_tree_size, unsigned *limit);

/* Analyze an SLP instance starting from SCALAR_STMTS which are a group
   of KIND.  Return true if successful.  */

vect_build_slp_instance (vec_info *vinfo,
			 slp_instance_kind kind,
			 vec<stmt_vec_info> &scalar_stmts,
			 vec<stmt_vec_info> &root_stmt_infos,
			 unsigned max_tree_size, unsigned *limit,
			 scalar_stmts_to_slp_tree_map_t *bst_map,
			 /* ??? We need stmt_info for group splitting.  */
			 stmt_vec_info stmt_info_)
  if (dump_enabled_p ())
      dump_printf_loc (MSG_NOTE, vect_location,
		       "Starting SLP discovery for\n");
      for (unsigned i = 0; i < scalar_stmts.length (); ++i)
	dump_printf_loc (MSG_NOTE, vect_location,
			 "  %G", scalar_stmts[i]->stmt);

  /* Build the tree for the SLP instance.  */
  unsigned int group_size = scalar_stmts.length ();
  bool *matches = XALLOCAVEC (bool, group_size);
  poly_uint64 max_nunits = 1;
  unsigned tree_size = 0;
  slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
				       &max_nunits, matches, limit,
				       &tree_size, bst_map);

      /* Calculate the unrolling factor based on the smallest type.  */
      poly_uint64 unrolling_factor
	= calculate_unrolling_factor (max_nunits, group_size);

      if (maybe_ne (unrolling_factor, 1U)
	  && is_a <bb_vec_info> (vinfo))
	  unsigned HOST_WIDE_INT const_max_nunits;
	  if (!max_nunits.is_constant (&const_max_nunits)
	      || const_max_nunits > group_size)
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: store group "
				 "size not a multiple of the vector size "
				 "in basic block SLP\n");
	      vect_free_slp_tree (node);
	  /* Fatal mismatch.  */
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "SLP discovery succeeded but node needs "
	  memset (matches, true, group_size);
	  matches[group_size / const_max_nunits * const_max_nunits] = false;
	  vect_free_slp_tree (node);

      /* Create a new SLP instance.  */
      slp_instance new_instance = XNEW (class _slp_instance);
      SLP_INSTANCE_TREE (new_instance) = node;
      SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
      SLP_INSTANCE_LOADS (new_instance) = vNULL;
      SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
      SLP_INSTANCE_KIND (new_instance) = kind;
      new_instance->reduc_phis = NULL;
      new_instance->cost_vec = vNULL;
      new_instance->subgraph_entries = vNULL;

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "SLP size %u vs. limit %u.\n",
			 tree_size, max_tree_size);

      /* Fixup SLP reduction chains.  */
      if (kind == slp_inst_kind_reduc_chain)
	  /* If this is a reduction chain with a conversion in front
	     amend the SLP tree with a node for that.  */
	    = vect_orig_stmt (scalar_stmts[group_size - 1])->stmt;
	  if (STMT_VINFO_DEF_TYPE (scalar_stmts[0]) != vect_reduction_def)
	      /* Get at the conversion stmt - we know it's the single use
		 of the last stmt of the reduction chain.  */
	      use_operand_p use_p;
	      bool r = single_imm_use (gimple_assign_lhs (scalar_def),
				       &use_p, &scalar_def);
	      stmt_vec_info next_info = vinfo->lookup_stmt (scalar_def);
	      next_info = vect_stmt_to_vectorize (next_info);
	      scalar_stmts = vNULL;
	      scalar_stmts.create (group_size);
	      for (unsigned i = 0; i < group_size; ++i)
		scalar_stmts.quick_push (next_info);
	      slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
	      SLP_TREE_VECTYPE (conv) = STMT_VINFO_VECTYPE (next_info);
	      SLP_TREE_CHILDREN (conv).quick_push (node);
	      SLP_INSTANCE_TREE (new_instance) = conv;
	      /* We also have to fake this conversion stmt as SLP reduction
		 group so we don't have to mess with too much code
	      REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
	      REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
	  /* Fill the backedge child of the PHI SLP node.  The
	     general matching code cannot find it because the
	     scalar code does not reflect how we vectorize the
	  use_operand_p use_p;
	  imm_use_iterator imm_iter;
	  class loop *loop = LOOP_VINFO_LOOP (as_a <loop_vec_info> (vinfo));
	  FOR_EACH_IMM_USE_FAST (use_p, imm_iter,
				 gimple_get_lhs (scalar_def))
	    /* There are exactly two non-debug uses, the reduction
	       PHI and the loop-closed PHI node.  */
	    if (!is_gimple_debug (USE_STMT (use_p))
		&& gimple_bb (USE_STMT (use_p)) == loop->header)
		auto_vec<stmt_vec_info, 64> phis (group_size);
		stmt_vec_info phi_info
		  = vinfo->lookup_stmt (USE_STMT (use_p));
		for (unsigned i = 0; i < group_size; ++i)
		  phis.quick_push (phi_info);
		slp_tree *phi_node = bst_map->get (phis);
		unsigned dest_idx = loop_latch_edge (loop)->dest_idx;
		SLP_TREE_CHILDREN (*phi_node)[dest_idx]
		  = SLP_INSTANCE_TREE (new_instance);
		SLP_INSTANCE_TREE (new_instance)->refcnt++;

      vinfo->slp_instances.safe_push (new_instance);

      /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
	 the number of scalar stmts in the root in a few places.
	 Verify that assumption holds.  */
      gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
		    .length () == group_size);

      if (dump_enabled_p ())
	  dump_printf_loc (MSG_NOTE, vect_location,
			   "Final SLP tree for instance %p:\n", new_instance);
	  vect_print_slp_graph (MSG_NOTE, vect_location,
				SLP_INSTANCE_TREE (new_instance));

  /* Failed to SLP.  */
  /* Free the allocated memory.  */
  scalar_stmts.release ();

  stmt_vec_info stmt_info = stmt_info_;
  /* Try to break the group up into pieces.  */
  if (kind == slp_inst_kind_store)
      /* ??? We could delay all the actual splitting of store-groups
	 until after SLP discovery of the original group completed.
	 Then we can recurse to vect_build_slp_instance directly.  */
      for (i = 0; i < group_size; i++)

      /* For basic block SLP, try to break the group up into multiples of
      if (is_a <bb_vec_info> (vinfo)
	  && (i > 1 && i < group_size))
	    = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
	  tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
						      1 << floor_log2 (i));
	  unsigned HOST_WIDE_INT const_nunits;
	      && TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits))
	      /* Split into two groups at the first vector boundary.  */
	      gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
	      unsigned group1_size = i & ~(const_nunits - 1);

	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "Splitting SLP group at stmt %u\n", i);
	      stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
	      bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
						    kind, max_tree_size,
	      /* Split the rest at the failure point and possibly
		 re-analyze the remaining matching part if it has
		 at least two lanes.  */
		  && (i + 1 < group_size
		      || i - group1_size > 1))
		  stmt_vec_info rest2 = rest;
		  rest = vect_split_slp_store_group (rest, i - group1_size);
		  if (i - group1_size > 1)
		    res |= vect_analyze_slp_instance (vinfo, bst_map, rest2,
						      kind, max_tree_size,
	      /* Re-analyze the non-matching tail if it has at least
	      if (i + 1 < group_size)
		res |= vect_analyze_slp_instance (vinfo, bst_map,
						  rest, kind, max_tree_size,

      /* For loop vectorization split into arbitrary pieces of size > 1.  */
      if (is_a <loop_vec_info> (vinfo)
	  && (i > 1 && i < group_size)
	  && !vect_slp_prefer_store_lanes_p (vinfo, stmt_info, group_size, i))
	  unsigned group1_size = i;

	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Splitting SLP group at stmt %u\n", i);

	  stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
	  /* Loop vectorization cannot handle gaps in stores, make sure
	     the split group appears as strided.  */
	  STMT_VINFO_STRIDED_P (rest) = 1;
	  DR_GROUP_GAP (rest) = 0;
	  STMT_VINFO_STRIDED_P (stmt_info) = 1;
	  DR_GROUP_GAP (stmt_info) = 0;

	  bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
						kind, max_tree_size, limit);
	  if (i + 1 < group_size)
	    res |= vect_analyze_slp_instance (vinfo, bst_map,
					      rest, kind, max_tree_size, limit);

      /* Even though the first vector did not all match, we might be able to SLP
	 (some) of the remainder.  FORNOW ignore this possibility.  */

  /* Failed to SLP.  */
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
/* Analyze an SLP instance starting from a group of grouped stores.  Call
   vect_build_slp_tree to build a tree of packed stmts if possible.
   Return FALSE if it's impossible to SLP any stmt in the loop.  */

vect_analyze_slp_instance (vec_info *vinfo,
			   scalar_stmts_to_slp_tree_map_t *bst_map,
			   stmt_vec_info stmt_info,
			   slp_instance_kind kind,
			   unsigned max_tree_size, unsigned *limit)
  vec<stmt_vec_info> scalar_stmts;

  if (is_a <bb_vec_info> (vinfo))
    vect_location = stmt_info->stmt;

  stmt_vec_info next_info = stmt_info;
  if (kind == slp_inst_kind_store)
      /* Collect the stores and store them in scalar_stmts.  */
      scalar_stmts.create (DR_GROUP_SIZE (stmt_info));
	  scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
	  next_info = DR_GROUP_NEXT_ELEMENT (next_info);
  else if (kind == slp_inst_kind_reduc_chain)
      /* Collect the reduction stmts and store them in scalar_stmts.  */
      scalar_stmts.create (REDUC_GROUP_SIZE (stmt_info));
	  scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
	  next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
      /* Mark the first element of the reduction chain as reduction to properly
	 transform the node.  In the reduction analysis phase only the last
	 element of the chain is marked as reduction.  */
      STMT_VINFO_DEF_TYPE (stmt_info)
	= STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
      STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
	= STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
  else if (kind == slp_inst_kind_ctor)
      tree rhs = gimple_assign_rhs1 (stmt_info->stmt);
      scalar_stmts.create (CONSTRUCTOR_NELTS (rhs));
      FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), i, val)
	  stmt_vec_info def_info = vinfo->lookup_def (val);
	  def_info = vect_stmt_to_vectorize (def_info);
	  scalar_stmts.quick_push (def_info);
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Analyzing vectorizable constructor: %G\n",
  else if (kind == slp_inst_kind_reduc_group)
      /* Collect reduction statements.  */
      const vec<stmt_vec_info> &reductions
	= as_a <loop_vec_info> (vinfo)->reductions;
      scalar_stmts.create (reductions.length ());
      for (i = 0; reductions.iterate (i, &next_info); i++)
	if ((STMT_VINFO_RELEVANT_P (next_info)
	     || STMT_VINFO_LIVE_P (next_info))
	    /* ??? Make sure we didn't skip a conversion around a reduction
	       path.  In that case we'd have to reverse engineer that conversion
	       stmt following the chain using reduc_idx and from the PHI
	    && STMT_VINFO_DEF_TYPE (next_info) == vect_reduction_def)
	  scalar_stmts.quick_push (next_info);
      /* If less than two were relevant/live there's nothing to SLP.  */
      if (scalar_stmts.length () < 2)

  vec<stmt_vec_info> roots = vNULL;
  if (kind == slp_inst_kind_ctor)
      roots.quick_push (stmt_info);
  /* Build the tree for the SLP instance.  */
  bool res = vect_build_slp_instance (vinfo, kind, scalar_stmts,
				      max_tree_size, limit, bst_map,
				      kind == slp_inst_kind_store
				      ? stmt_info : NULL);

  /* ??? If this is slp_inst_kind_store and the above succeeded here's
     where we should do store group splitting.  */
/* Check if there are stmts in the loop that can be vectorized using SLP.
   Build SLP trees of packed scalar stmts if SLP is possible.  */

vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
  stmt_vec_info first_element;
  slp_instance instance;

  DUMP_VECT_SCOPE ("vect_analyze_slp");

  unsigned limit = max_tree_size;

  scalar_stmts_to_slp_tree_map_t *bst_map
    = new scalar_stmts_to_slp_tree_map_t ();

  /* Find SLP sequences starting from groups of grouped stores.  */
  FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
    vect_analyze_slp_instance (vinfo, bst_map, first_element,
			       STMT_VINFO_GROUPED_ACCESS (first_element)
			       ? slp_inst_kind_store : slp_inst_kind_ctor,
			       max_tree_size, &limit);

  if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
      for (unsigned i = 0; i < bb_vinfo->roots.length (); ++i)
	  vect_location = bb_vinfo->roots[i].roots[0]->stmt;
	  if (vect_build_slp_instance (bb_vinfo, bb_vinfo->roots[i].kind,
				       bb_vinfo->roots[i].stmts,
				       bb_vinfo->roots[i].roots,
				       max_tree_size, &limit, bst_map, NULL))
	      bb_vinfo->roots[i].stmts = vNULL;
	      bb_vinfo->roots[i].roots = vNULL;

  if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
      /* Find SLP sequences starting from reduction chains.  */
      FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
	if (! STMT_VINFO_RELEVANT_P (first_element)
	    && ! STMT_VINFO_LIVE_P (first_element))
	else if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
					      slp_inst_kind_reduc_chain,
					      max_tree_size, &limit))
	    /* Dissolve reduction chain group.  */
	    stmt_vec_info vinfo = first_element;
	    stmt_vec_info last = NULL;
		stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
		REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
		REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
	    STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
	    /* It can be still vectorized as part of an SLP reduction.  */
	    loop_vinfo->reductions.safe_push (last);

      /* Find SLP sequences starting from groups of reductions.  */
      if (loop_vinfo->reductions.length () > 1)
	vect_analyze_slp_instance (vinfo, bst_map, loop_vinfo->reductions[0],
				   slp_inst_kind_reduc_group, max_tree_size,

  hash_set<slp_tree> visited_patterns;
  slp_tree_to_load_perm_map_t perm_cache;
  slp_compat_nodes_map_t compat_cache;

  /* See if any patterns can be found in the SLP tree.  */
  bool pattern_found = false;
  FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
    pattern_found |= vect_match_slp_patterns (instance, vinfo,
					      &visited_patterns, &perm_cache,

  /* If any were found optimize permutations of loads.  */
      hash_map<slp_tree, slp_tree> load_map;
      FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
	  slp_tree root = SLP_INSTANCE_TREE (instance);
	  optimize_load_redistribution (bst_map, vinfo, SLP_TREE_LANES (root),

  /* The map keeps a reference on SLP nodes built, release that.  */
  for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
       it != bst_map->end (); ++it)
      vect_free_slp_tree ((*it).second);

  if (pattern_found && dump_enabled_p ())
      dump_printf_loc (MSG_NOTE, vect_location,
		       "Pattern matched SLP tree\n");
      hash_set<slp_tree> visited;
      FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
	vect_print_slp_graph (MSG_NOTE, vect_location,
			      SLP_INSTANCE_TREE (instance), visited);

  return opt_result::success ();
  slpg_vertex (slp_tree node_)
    : node (node_), perm_in (-1), perm_out (-1) {}

  int get_perm_materialized () const
    { return perm_in != perm_out ? perm_in : 0; }

  /* The common permutation on the incoming lanes (towards SLP children).  */
  /* The permutation on the outgoing lanes (towards SLP parents).  When
     the node is a materialization point for a permute this differs
     from perm_in (and is then usually zero).  Materialization happens
     on the input side.  */

/* Fill the vertices and leafs vector with all nodes in the SLP graph.  */

vect_slp_build_vertices (hash_set<slp_tree> &visited, slp_tree node,
			 vec<slpg_vertex> &vertices, vec<int> &leafs)
  if (visited.add (node))

  node->vertex = vertices.length ();
  vertices.safe_push (slpg_vertex (node));

  bool force_leaf = false;
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
	vect_slp_build_vertices (visited, child, vertices, leafs);
  /* Since SLP discovery works along use-def edges all cycles have an
     entry - but there's the exception of cycles where we do not handle
     the entry explicitly (but with a NULL SLP node), like some reductions
     and inductions.  Force those SLP PHIs to act as leafs to make them
     backwards reachable.  */
  if (leaf || force_leaf)
    leafs.safe_push (node->vertex);

/* Fill the vertices and leafs vector with all nodes in the SLP graph.  */

vect_slp_build_vertices (vec_info *info, vec<slpg_vertex> &vertices,
  hash_set<slp_tree> visited;
  slp_instance instance;
  FOR_EACH_VEC_ELT (info->slp_instances, i, instance)
    vect_slp_build_vertices (visited, SLP_INSTANCE_TREE (instance), vertices,
/* Apply (reverse) bijective PERM to VEC.  */

vect_slp_permute (vec<unsigned> perm,
		  vec<T> &vec, bool reverse)
  auto_vec<T, 64> saved;
  saved.create (vec.length ());
  for (unsigned i = 0; i < vec.length (); ++i)
    saved.quick_push (vec[i]);

      for (unsigned i = 0; i < vec.length (); ++i)
	vec[perm[i]] = saved[i];
      for (unsigned i = 0; i < vec.length (); ++i)
	gcc_assert (vec[perm[i]] == saved[i]);

      for (unsigned i = 0; i < vec.length (); ++i)
	vec[i] = saved[perm[i]];
      for (unsigned i = 0; i < vec.length (); ++i)
	gcc_assert (vec[i] == saved[perm[i]]);
/* Return whether permutations PERM_A and PERM_B as recorded in the
   PERMS vector are equal.  */

vect_slp_perms_eq (const vec<vec<unsigned> > &perms,
		   int perm_a, int perm_b)
  return (perm_a == perm_b
	  || (perm_a != -1 && perm_b != -1
	      && perms[perm_a].length () == perms[perm_b].length ()
	      && memcmp (&perms[perm_a][0], &perms[perm_b][0],
			 sizeof (unsigned) * perms[perm_a].length ()) == 0));
3609 /* Optimize the SLP graph of VINFO. */
3612 vect_optimize_slp (vec_info
*vinfo
)
3614 if (vinfo
->slp_instances
.is_empty ())
3619 auto_vec
<slpg_vertex
> vertices
;
3620 auto_vec
<int> leafs
;
3621 vect_slp_build_vertices (vinfo
, vertices
, leafs
);
3623 struct graph
*slpg
= new_graph (vertices
.length ());
3624 for (slpg_vertex
&v
: vertices
)
3625 for (slp_tree child
: SLP_TREE_CHILDREN (v
.node
))
3627 add_edge (slpg
, v
.node
->vertex
, child
->vertex
);
3629 /* Compute (reverse) postorder on the inverted graph. */
3631 graphds_dfs (slpg
, &leafs
[0], leafs
.length (), &ipo
, false, NULL
, NULL
);
3633 auto_vec
<vec
<unsigned> > perms
;
3634 perms
.safe_push (vNULL
); /* zero is no permute */
3636 /* Produce initial permutations. */
3637 for (i
= 0; i
< leafs
.length (); ++i
)
3640 slp_tree node
= vertices
[idx
].node
;
3642 /* Handle externals and constants optimistically throughout the
3643 iteration. But treat existing vectors as fixed since we
3644 do not handle permuting them below. */
3645 if ((SLP_TREE_DEF_TYPE (node
) == vect_external_def
3646 && !SLP_TREE_VEC_DEFS (node
).exists ())
3647 || SLP_TREE_DEF_TYPE (node
) == vect_constant_def
)
3650 /* Leafs do not change across iterations. Note leafs also double
3651 as entries to the reverse graph. */
3652 if (!slpg
->vertices
[idx
].succ
)
3654 vertices
[idx
].perm_in
= 0;
3655 vertices
[idx
].perm_out
= 0;
3658 /* Loads are the only thing generating permutes. */
3659 if (!SLP_TREE_LOAD_PERMUTATION (node
).exists ())
3662 /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the
3663 node unpermuted, record this permute. */
3664 stmt_vec_info dr_stmt
= SLP_TREE_REPRESENTATIVE (node
);
3665 if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt
))
3667 dr_stmt
= DR_GROUP_FIRST_ELEMENT (dr_stmt
);
3668 unsigned imin
= DR_GROUP_SIZE (dr_stmt
) + 1, imax
= 0;
3669 bool any_permute
= false;
3670 for (unsigned j
= 0; j
< SLP_TREE_LANES (node
); ++j
)
3672 unsigned idx
= SLP_TREE_LOAD_PERMUTATION (node
)[j
];
3673 imin
= MIN (imin
, idx
);
3674 imax
= MAX (imax
, idx
);
3675 if (idx
- SLP_TREE_LOAD_PERMUTATION (node
)[0] != j
)
3678 /* If there's no permute no need to split one out. */
3681 /* If the span doesn't match we'd disrupt VF computation, avoid
3683 if (imax
- imin
+ 1 != SLP_TREE_LANES (node
))
3686 /* For now only handle true permutes, like
3687 vect_attempt_slp_rearrange_stmts did. This allows us to be lazy
3688 when permuting constants and invariants keeping the permute
3690 auto_sbitmap
load_index (SLP_TREE_LANES (node
));
3691 bitmap_clear (load_index
);
3692 for (unsigned j
= 0; j
< SLP_TREE_LANES (node
); ++j
)
3693 bitmap_set_bit (load_index
, SLP_TREE_LOAD_PERMUTATION (node
)[j
] - imin
);
3695 for (j
= 0; j
< SLP_TREE_LANES (node
); ++j
)
3696 if (!bitmap_bit_p (load_index
, j
))
3698 if (j
!= SLP_TREE_LANES (node
))
3701 vec
<unsigned> perm
= vNULL
;
3702 perm
.safe_grow (SLP_TREE_LANES (node
), true);
3703 for (unsigned j
= 0; j
< SLP_TREE_LANES (node
); ++j
)
3704 perm
[j
] = SLP_TREE_LOAD_PERMUTATION (node
)[j
] - imin
;
3705 perms
.safe_push (perm
);
3706 vertices
[idx
].perm_in
= perms
.length () - 1;
3707 vertices
[idx
].perm_out
= perms
.length () - 1;
3710 /* In addition to the above we have to mark outgoing permutes facing
3711 non-reduction graph entries that are not represented as to be
3713 for (slp_instance instance
: vinfo
->slp_instances
)
3714 if (SLP_INSTANCE_KIND (instance
) == slp_inst_kind_ctor
)
3716 /* Just setting perm_out isn't enough for the propagation to
3718 vertices
[SLP_INSTANCE_TREE (instance
)->vertex
].perm_in
= 0;
3719 vertices
[SLP_INSTANCE_TREE (instance
)->vertex
].perm_out
= 0;
3722 /* Propagate permutes along the graph and compute materialization points. */
3724 bool do_materialization
= false;
3725 unsigned iteration
= 0;
3731 if (dump_enabled_p ())
3732 dump_printf_loc (MSG_NOTE
, vect_location
,
3733 "SLP optimize iteration %d\n", iteration
);
3735 for (i
= vertices
.length (); i
> 0 ; --i
)
3738 slp_tree node
= vertices
[idx
].node
;
3740 /* Handle externals and constants optimistically throughout the
3742 if (SLP_TREE_DEF_TYPE (node
) == vect_external_def
3743 || SLP_TREE_DEF_TYPE (node
) == vect_constant_def
)
3746 /* We still eventually have failed backedge SLP nodes in the
3747 graph, those are only cancelled when analyzing operations.
3748 Simply treat them as transparent ops, propagating permutes
3750 if (SLP_TREE_DEF_TYPE (node
) == vect_internal_def
)
3752 /* We do not handle stores with a permutation, so all
3753 incoming permutes must have been materialized. */
3754 stmt_vec_info rep
= SLP_TREE_REPRESENTATIVE (node
);
3755 if (STMT_VINFO_DATA_REF (rep
)
3756 && DR_IS_WRITE (STMT_VINFO_DATA_REF (rep
)))
3758 /* ??? We're forcing materialization in place
3759 of the child here, we'd need special handling
3760 in materialization to leave perm_in -1 here. */
3761 vertices
[idx
].perm_in
= 0;
3762 vertices
[idx
].perm_out
= 0;
3764 /* We cannot move a permute across an operation that is
3765 not independent on lanes. Note this is an explicit
3766 negative list since that's much shorter than the respective
3767 positive one but it's critical to keep maintaining it. */
3768 if (is_gimple_call (STMT_VINFO_STMT (rep
)))
3769 switch (gimple_call_combined_fn (STMT_VINFO_STMT (rep
)))
3771 case CFN_COMPLEX_ADD_ROT90
:
3772 case CFN_COMPLEX_ADD_ROT270
:
3773 case CFN_COMPLEX_MUL
:
3774 case CFN_COMPLEX_MUL_CONJ
:
3775 case CFN_VEC_ADDSUB
:
3776 case CFN_VEC_FMADDSUB
:
3777 case CFN_VEC_FMSUBADD
:
3778 vertices
[idx
].perm_in
= 0;
3779 vertices
[idx
].perm_out
= 0;
3784 if (!slpg
->vertices
[idx
].succ
)
3785 /* Pick up pre-computed leaf values. */
3789 bool any_succ_perm_out_m1
= false;
3790 int perm_in
= vertices
[idx
].perm_in
;
3791 for (graph_edge
*succ
= slpg
->vertices
[idx
].succ
;
3792 succ
; succ
= succ
->succ_next
)
3794 int succ_idx
= succ
->dest
;
3795 int succ_perm
= vertices
[succ_idx
].perm_out
;
3796 /* Handle unvisited (and constant) nodes optimistically. */
3797 /* ??? But for constants once we want to handle
3798 non-bijective permutes we have to verify the permute,
3799 when unifying lanes, will not unify different constants.
3800 For example see gcc.dg/vect/bb-slp-14.c for a case
3801 that would break. */
3802 if (succ_perm
== -1)
3804 /* When we handled a non-leaf optimistically, note
3805 that so we can adjust its outgoing permute below. */
3806 slp_tree succ_node
= vertices
[succ_idx
].node
;
3807 if (SLP_TREE_DEF_TYPE (succ_node
) != vect_external_def
3808 && SLP_TREE_DEF_TYPE (succ_node
) != vect_constant_def
)
3809 any_succ_perm_out_m1
= true;
3813 perm_in
= succ_perm
;
3814 else if (succ_perm
== 0
3815 || !vect_slp_perms_eq (perms
, perm_in
, succ_perm
))
3822 /* Adjust any incoming permutes we treated optimistically. */
3823 if (perm_in
!= -1 && any_succ_perm_out_m1
)
3825 for (graph_edge
*succ
= slpg
->vertices
[idx
].succ
;
3826 succ
; succ
= succ
->succ_next
)
3828 slp_tree succ_node
= vertices
[succ
->dest
].node
;
3829 if (vertices
[succ
->dest
].perm_out
== -1
3830 && SLP_TREE_DEF_TYPE (succ_node
) != vect_external_def
3831 && SLP_TREE_DEF_TYPE (succ_node
) != vect_constant_def
)
3833 vertices
[succ
->dest
].perm_out
= perm_in
;
3834 /* And ensure this propagates. */
3835 if (vertices
[succ
->dest
].perm_in
== -1)
3836 vertices
[succ
->dest
].perm_in
= perm_in
;
3842 if (!vect_slp_perms_eq (perms
, perm_in
,
3843 vertices
[idx
].perm_in
))
3845 /* Make sure we eventually converge. */
3846 gcc_checking_assert (vertices
[idx
].perm_in
== -1
3848 vertices
[idx
].perm_in
= perm_in
;
3850 /* While we can handle VEC_PERM nodes as transparent
3851 pass-through they can be a cheap materialization
3852 point as well. In addition they can act as source
3853 of a random permutation as well.
3854 The following ensures that former materialization
3855 points that now have zero incoming permutes no
3856 longer appear as such and that former "any" permutes
3857 get pass-through. We keep VEC_PERM nodes optimistic
3858 as "any" outgoing permute though. */
3859 if (vertices
[idx
].perm_out
!= 0
3860 && SLP_TREE_CODE (node
) != VEC_PERM_EXPR
)
3861 vertices
[idx
].perm_out
= perm_in
;
3866 /* Elide pruning at materialization points in the first
3868 if (!do_materialization
)
3871 int perm
= vertices
[idx
].perm_out
;
3872 if (perm
== 0 || perm
== -1)
3875 /* Decide on permute materialization. Look whether there's
3876 a use (pred) edge that is permuted differently than us.
3877 In that case mark ourselves so the permutation is applied. */
3878 bool all_preds_permuted
= slpg
->vertices
[idx
].pred
!= NULL
;
3879 if (all_preds_permuted
)
3880 for (graph_edge
*pred
= slpg
->vertices
[idx
].pred
;
3881 pred
; pred
= pred
->pred_next
)
3883 int pred_perm
= vertices
[pred
->src
].perm_in
;
3884 gcc_checking_assert (pred_perm
!= -1);
3885 if (!vect_slp_perms_eq (perms
, perm
, pred_perm
))
3887 all_preds_permuted
= false;
3891 if (!all_preds_permuted
)
3893 vertices
[idx
].perm_out
= 0;
3898 /* If the initial propagation converged, switch on materialization
3899 and re-propagate. */
3900 if (!changed
&& !do_materialization
)
3902 do_materialization
= true;
3907 statistics_histogram_event (cfun
, "SLP optimize perm iterations", iteration
);
  for (i = 0; i < vertices.length (); ++i)
      int perm_in = vertices[i].perm_in;
      slp_tree node = vertices[i].node;

      /* First permute invariant/external original successors, we handle
	 those optimistically during propagation and duplicate them if
	 they are used with different permutations.  */
      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
	      || (SLP_TREE_DEF_TYPE (child) != vect_constant_def
		  && SLP_TREE_DEF_TYPE (child) != vect_external_def))

	  /* If the vector is uniform there's nothing to do.  */
	  if (vect_slp_tree_uniform_p (child))

	  /* We can end up sharing some externals via two_operator
	     handling.  Be prepared to unshare those.  */
	  if (child->refcnt != 1)
	      gcc_assert (slpg->vertices[child->vertex].pred->pred_next);
	      SLP_TREE_CHILDREN (node)[j] = child
		= vect_create_new_slp_node
		    (SLP_TREE_SCALAR_OPS (child).copy ());
	  vect_slp_permute (perms[perm_in],
			    SLP_TREE_SCALAR_OPS (child), true);

      if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
	  /* Apply the common permutes to the input vectors.  */
	      /* If the node is already a permute node we can apply
		 the permutation to the lane selection, effectively
		 materializing it on the incoming vectors.  */
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "simplifying permute node %p\n",
	      for (unsigned k = 0;
		   k < SLP_TREE_LANE_PERMUTATION (node).length (); ++k)
		SLP_TREE_LANE_PERMUTATION (node)[k].second
		  = perms[perm_in][SLP_TREE_LANE_PERMUTATION (node)[k].second];

	  /* Apply the anticipated output permute to the permute and ...  */
	  int perm_out = vertices[i].perm_out;
	  vect_slp_permute (perms[perm_out],
			    SLP_TREE_SCALAR_STMTS (node), true);
	  vect_slp_permute (perms[perm_out],
			    SLP_TREE_LANE_PERMUTATION (node), true);
      else if (vertices[i].get_perm_materialized () != 0)
	  if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
	    /* For loads simply drop the permutation, the load permutation
	       already performs the desired permutation.  */
	  else if (SLP_TREE_LANE_PERMUTATION (node).exists ())
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "inserting permute node in place of %p\n",
	      /* Make a copy of NODE and in-place change it to a
		 VEC_PERM node to permute the lanes of the copy.  */
	      slp_tree copy = new _slp_tree;
	      SLP_TREE_CHILDREN (copy) = SLP_TREE_CHILDREN (node);
	      SLP_TREE_CHILDREN (node) = vNULL;
	      SLP_TREE_SCALAR_STMTS (copy)
		= SLP_TREE_SCALAR_STMTS (node).copy ();
	      vect_slp_permute (perms[perm_in],
				SLP_TREE_SCALAR_STMTS (copy), true);
	      gcc_assert (!SLP_TREE_SCALAR_OPS (node).exists ());
	      SLP_TREE_REPRESENTATIVE (copy) = SLP_TREE_REPRESENTATIVE (node);
	      gcc_assert (!SLP_TREE_LOAD_PERMUTATION (node).exists ());
	      SLP_TREE_LANE_PERMUTATION (copy)
		= SLP_TREE_LANE_PERMUTATION (node);
	      SLP_TREE_LANE_PERMUTATION (node) = vNULL;
	      SLP_TREE_VECTYPE (copy) = SLP_TREE_VECTYPE (node);
	      copy->max_nunits = node->max_nunits;
	      SLP_TREE_DEF_TYPE (copy) = SLP_TREE_DEF_TYPE (node);
	      SLP_TREE_LANES (copy) = SLP_TREE_LANES (node);
	      SLP_TREE_CODE (copy) = SLP_TREE_CODE (node);

	      /* Now turn NODE into a VEC_PERM.  */
	      SLP_TREE_CHILDREN (node).safe_push (copy);
	      SLP_TREE_LANE_PERMUTATION (node).create (SLP_TREE_LANES (node));
	      for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
		SLP_TREE_LANE_PERMUTATION (node)
		  .quick_push (std::make_pair (0, perms[perm_in][j]));
	      SLP_TREE_CODE (node) = VEC_PERM_EXPR;
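	      /* Net effect of the above: the original NODE keeps its
		 place in the graph but now only selects lanes from the
		 new COPY, while COPY carries the original operation with
		 its scalar stmts permuted into the propagated order.
		 Users of NODE thus see the anticipated lane order
		 without being rewritten themselves.  */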
      else if (perm_in > 0) /* perm_in == perm_out */
	  /* Apply the reverse permutation to our stmts.  */
	  vect_slp_permute (perms[perm_in],
			    SLP_TREE_SCALAR_STMTS (node), true);
	  /* And to the lane/load permutation, which we can simply
	     make regular by design.  */
	  if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
	      gcc_assert (!SLP_TREE_LANE_PERMUTATION (node).exists ());
	      /* ??? When we handle non-bijective permutes the idea
		 is that we can force the load-permutation to be
		 { min, min + 1, min + 2, ... max }.  But then the
		 scalar defs might no longer match the lane content
		 which means wrong-code with live lane vectorization.
		 So we possibly have to have NULL entries for those.  */
	      vect_slp_permute (perms[perm_in],
				SLP_TREE_LOAD_PERMUTATION (node), true);
	  else if (SLP_TREE_LANE_PERMUTATION (node).exists ())

  /* Elide any permutations at BB reduction roots.  */
  if (is_a <bb_vec_info> (vinfo))
      for (slp_instance instance : vinfo->slp_instances)
	  if (SLP_INSTANCE_KIND (instance) != slp_inst_kind_bb_reduc)
	  slp_tree old = SLP_INSTANCE_TREE (instance);
	  if (SLP_TREE_CODE (old) == VEC_PERM_EXPR
	      && SLP_TREE_CHILDREN (old).length () == 1)
	      slp_tree child = SLP_TREE_CHILDREN (old)[0];
	      if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
		  /* Preserve the special VEC_PERM we use to shield existing
		     vector defs from the rest.  But make it a no-op.  */
		  for (std::pair<unsigned, unsigned> &p
			 : SLP_TREE_LANE_PERMUTATION (old))
	      SLP_INSTANCE_TREE (instance) = child;
	      SLP_TREE_REF_COUNT (child)++;
	      vect_free_slp_tree (old);
	  else if (SLP_TREE_LOAD_PERMUTATION (old).exists ()
		   && SLP_TREE_REF_COUNT (old) == 1
		   && vertices[old->vertex].get_perm_materialized () != 0)
	      /* ??? For loads the situation is more complex since
		 we can't modify the permute in place in case the
		 node is used multiple times.  In fact for loads this
		 should be somehow handled in the propagation engine.  */
	      /* Apply the reverse permutation to our stmts.  */
	      int perm = vertices[old->vertex].get_perm_materialized ();
	      vect_slp_permute (perms[perm],
				SLP_TREE_SCALAR_STMTS (old), true);
	      vect_slp_permute (perms[perm],
				SLP_TREE_LOAD_PERMUTATION (old), true);

  /* Free the perms vector used for propagation.  */
  while (!perms.is_empty ())
    perms.pop ().release ();
  /* Now elide load permutations that are not necessary.  */
  for (i = 0; i < leafs.length (); ++i)
      node = vertices[leafs[i]].node;
      if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())

      /* In basic block vectorization we allow any subchain of an interleaving
	 chain.
	 FORNOW: not in loop SLP because of realignment complications.  */
      if (is_a <bb_vec_info> (vinfo))
	  bool subchain_p = true;
	  stmt_vec_info next_load_info = NULL;
	  stmt_vec_info load_info;
	  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
		  && (next_load_info != load_info
		      || DR_GROUP_GAP (load_info) != 1))
	      next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
	      SLP_TREE_LOAD_PERMUTATION (node).release ();

	  stmt_vec_info load_info;
	  bool this_load_permuted = false;
	  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
	    if (SLP_TREE_LOAD_PERMUTATION (node)[j] != j)
		this_load_permuted = true;
	  stmt_vec_info first_stmt_info
	    = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
	  if (!this_load_permuted
	      /* The load requires permutation when unrolling exposes
		 a gap either because the group is larger than the SLP
		 group-size or because there is a gap between the groups.  */
	      && (known_eq (LOOP_VINFO_VECT_FACTOR
			      (as_a <loop_vec_info> (vinfo)), 1U)
		  || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
		      && DR_GROUP_GAP (first_stmt_info) == 0)))
	      SLP_TREE_LOAD_PERMUTATION (node).release ();
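  /* At this point any load permutation that is still recorded on a leaf
     is one that could not be elided and will have to be honored when the
     load is code generated.  */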
/* Gather loads reachable from the individual SLP graph entries.  */

vect_gather_slp_loads (vec_info *vinfo)
  slp_instance instance;
  FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
      hash_set<slp_tree> visited;
      vect_gather_slp_loads (SLP_INSTANCE_LOADS (instance),
			     SLP_INSTANCE_TREE (instance), visited);
/* For each possible SLP instance decide whether to SLP it and calculate overall
   unrolling factor needed to SLP the loop.  Return TRUE if decided to SLP at
   least one instance.  */

vect_make_slp_decision (loop_vec_info loop_vinfo)
  poly_uint64 unrolling_factor = 1;
  const vec<slp_instance> &slp_instances
    = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
  slp_instance instance;
  int decided_to_slp = 0;

  DUMP_VECT_SCOPE ("vect_make_slp_decision");

  FOR_EACH_VEC_ELT (slp_instances, i, instance)
      /* FORNOW: SLP if you can.  */
      /* All unroll factors have the form:

	   GET_MODE_SIZE (vinfo->vector_mode) * X

	 for some rational X, so they must have a common multiple.  */
	= force_common_multiple (unrolling_factor,
				 SLP_INSTANCE_UNROLLING_FACTOR (instance));

      /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts.  Later we
	 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
	 loop-based vectorization.  Such stmts will be marked as HYBRID.  */
      vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));

  LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;

  if (decided_to_slp && dump_enabled_p ())
      dump_printf_loc (MSG_NOTE, vect_location,
		       "Decided to SLP %d instances.  Unrolling factor ",
      dump_dec (MSG_NOTE, unrolling_factor);
      dump_printf (MSG_NOTE, "\n");

  return (decided_to_slp > 0);
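/* Illustrative example (not taken from a particular testcase): with a
   group of 2 scalar stmts and 4-lane vectors an instance needs an
   unrolling factor of 2; force_common_multiple over all instances then
   yields the least factor that satisfies every one of them.  */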
/* Private data for vect_detect_hybrid_slp.  */
  loop_vec_info loop_vinfo;
  vec<stmt_vec_info> *worklist;

/* Walker for walk_gimple_op.  */

vect_detect_hybrid_slp (tree *tp, int *, void *data)
  walk_stmt_info *wi = (walk_stmt_info *)data;
  vdhs_data *dat = (vdhs_data *)wi->info;

  stmt_vec_info def_stmt_info = dat->loop_vinfo->lookup_def (*tp);
  def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
  if (PURE_SLP_STMT (def_stmt_info))
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
			 def_stmt_info->stmt);
      STMT_SLP_TYPE (def_stmt_info) = hybrid;
      dat->worklist->safe_push (def_stmt_info);
/* Look if STMT_INFO is consumed by SLP indirectly and mark it pure_slp
   if so, otherwise push it to WORKLIST.  */

maybe_push_to_hybrid_worklist (vec_info *vinfo,
			       vec<stmt_vec_info> &worklist,
			       stmt_vec_info stmt_info)
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "Processing hybrid candidate : %G", stmt_info->stmt);
  stmt_vec_info orig_info = vect_orig_stmt (stmt_info);
  imm_use_iterator iter2;
  use_operand_p use_p;
  def_operand_p def_p;
  bool any_def = false;
  FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_info->stmt, iter1, SSA_OP_DEF)
      FOR_EACH_IMM_USE_FAST (use_p, iter2, DEF_FROM_PTR (def_p))
	  if (is_gimple_debug (USE_STMT (use_p)))
	  stmt_vec_info use_info = vinfo->lookup_stmt (USE_STMT (use_p));
	  /* An out-of-loop use means this is a loop_vect sink.  */
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "Found loop_vect sink: %G", stmt_info->stmt);
	      worklist.safe_push (stmt_info);
	  else if (!STMT_SLP_TYPE (vect_stmt_to_vectorize (use_info)))
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "Found loop_vect use: %G", use_info->stmt);
	      worklist.safe_push (stmt_info);
  /* No def means this is a loop_vect sink.  */
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Found loop_vect sink: %G", stmt_info->stmt);
      worklist.safe_push (stmt_info);
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "Marked SLP consumed stmt pure: %G", stmt_info->stmt);
  STMT_SLP_TYPE (stmt_info) = pure_slp;
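/* In short: a stmt all of whose uses are inside pure SLP nodes can be
   marked pure_slp itself; everything else is queued on WORKLIST as a
   loop_vect seed for the hybrid detection walk below.  */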
/* Find stmts that must be both vectorized and SLPed.  */

vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
  DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");

  /* All stmts participating in SLP are marked pure_slp, all other
     stmts are loop_vect.
     First collect all loop_vect stmts into a worklist.
     SLP patterns cause not all original scalar stmts to appear in
     SLP_TREE_SCALAR_STMTS and thus not all of them are marked pure_slp.
     Rectify this here and do a backward walk over the IL only considering
     stmts as loop_vect when they are used by a loop_vect stmt and otherwise
     mark them as pure_slp.  */
  auto_vec<stmt_vec_info> worklist;
  for (int i = LOOP_VINFO_LOOP (loop_vinfo)->num_nodes - 1; i >= 0; --i)
      basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
      for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
	  gphi *phi = gsi.phi ();
	  stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (phi);
	  if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
	    maybe_push_to_hybrid_worklist (loop_vinfo,
					   worklist, stmt_info);
      for (gimple_stmt_iterator gsi = gsi_last_bb (bb); !gsi_end_p (gsi);
	  gimple *stmt = gsi_stmt (gsi);
	  if (is_gimple_debug (stmt))
	  stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
	  if (STMT_VINFO_IN_PATTERN_P (stmt_info))
	      for (gimple_stmt_iterator gsi2
		     = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
		   !gsi_end_p (gsi2); gsi_next (&gsi2))
		  stmt_vec_info patt_info
		    = loop_vinfo->lookup_stmt (gsi_stmt (gsi2));
		  if (!STMT_SLP_TYPE (patt_info)
		      && STMT_VINFO_RELEVANT (patt_info))
		    maybe_push_to_hybrid_worklist (loop_vinfo,
						   worklist, patt_info);
	      stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
	  if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
	    maybe_push_to_hybrid_worklist (loop_vinfo,
					   worklist, stmt_info);

  /* Now we have a worklist of non-SLP stmts, follow use->def chains and
     mark any SLP vectorized stmt as hybrid.
     ??? We're visiting def stmts N times (once for each non-SLP and
     once for each hybrid-SLP use).  */
  dat.worklist = &worklist;
  dat.loop_vinfo = loop_vinfo;
  memset (&wi, 0, sizeof (wi));
  wi.info = (void *)&dat;
  while (!worklist.is_empty ())
      stmt_vec_info stmt_info = worklist.pop ();
      /* Since SSA operands are not set up for pattern stmts we need
	 to use walk_gimple_op.  */
      walk_gimple_op (stmt_info->stmt, vect_detect_hybrid_slp, &wi);
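  /* The walk above goes from non-SLP (loop_vect) uses backwards to their
     defining stmts; any def that was pure_slp is downgraded to hybrid and
     itself queued, so the property propagates transitively through the
     use->def chains.  */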
/* Initialize a bb_vec_info struct for the statements in BBS basic blocks.  */

_bb_vec_info::_bb_vec_info (vec<basic_block> _bbs, vec_info_shared *shared)
  : vec_info (vec_info::bb, shared),
  for (unsigned i = 0; i < bbs.length (); ++i)
      for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
	  gphi *phi = si.phi ();
	  gimple_set_uid (phi, 0);
      for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
	   !gsi_end_p (gsi); gsi_next (&gsi))
	  gimple *stmt = gsi_stmt (gsi);
	  gimple_set_uid (stmt, 0);
	  if (is_gimple_debug (stmt))

/* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
   stmts in the basic block.  */

_bb_vec_info::~_bb_vec_info ()
  /* Reset region marker.  */
  for (unsigned i = 0; i < bbs.length (); ++i)
      for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
	  gphi *phi = si.phi ();
	  gimple_set_uid (phi, -1);
      for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
	   !gsi_end_p (gsi); gsi_next (&gsi))
	  gimple *stmt = gsi_stmt (gsi);
	  gimple_set_uid (stmt, -1);

  for (unsigned i = 0; i < roots.length (); ++i)
      roots[i].stmts.release ();
      roots[i].roots.release ();
/* Subroutine of vect_slp_analyze_node_operations.  Handle the root of NODE,
   given that child nodes have already been processed, and that
   their def types currently match their SLP node's def type.  */

vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
				    slp_instance node_instance,
				    stmt_vector_for_cost *cost_vec)
  stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);

  /* Calculate the number of vector statements to be created for the
     scalar stmts in this node.  For SLP reductions it is equal to the
     number of vector statements in the children (which has already been
     calculated by the recursive call).  Otherwise it is the number of
     scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
     VF divided by the number of elements in a vector.  */
  if (!STMT_VINFO_DATA_REF (stmt_info)
      && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
      for (unsigned i = 0; i < SLP_TREE_CHILDREN (node).length (); ++i)
	if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node)[i]) == vect_internal_def)
	    SLP_TREE_NUMBER_OF_VEC_STMTS (node)
	      = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[i]);
      if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
	vf = loop_vinfo->vectorization_factor;
      unsigned int group_size = SLP_TREE_LANES (node);
      tree vectype = SLP_TREE_VECTYPE (node);
      SLP_TREE_NUMBER_OF_VEC_STMTS (node)
	= vect_get_num_vectors (vf * group_size, vectype);

  /* Handle purely internal nodes.  */
  if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
    return vectorizable_slp_permutation (vinfo, NULL, node, cost_vec);

  gcc_assert (STMT_SLP_TYPE (stmt_info) != loop_vect);
  return vect_analyze_stmt (vinfo, stmt_info, &dummy,
			    node, node_instance, cost_vec);
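/* Illustrative numbers for the computation above: with a group size of
   4, a vectorization factor of 2 and 4-lane vectors this yields
   vect_get_num_vectors (2 * 4, V4SI) == 2 vector statements for the
   node.  */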
/* Try to build NODE from scalars, returning true on success.
   NODE_INSTANCE is the SLP instance that contains NODE.  */

vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
			      slp_instance node_instance)
  stmt_vec_info stmt_info;

  if (!is_a <bb_vec_info> (vinfo)
      || node == SLP_INSTANCE_TREE (node_instance)
      || !SLP_TREE_SCALAR_STMTS (node).exists ()
      || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node)))

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "Building vector operands of %p from scalars instead\n",
		     node);

  /* Don't remove and free the child nodes here, since they could be
     referenced by other structures.  The analysis and scheduling phases
     (need to) ignore child nodes of anything that isn't vect_internal_def.  */
  unsigned int group_size = SLP_TREE_LANES (node);
  SLP_TREE_DEF_TYPE (node) = vect_external_def;
  SLP_TREE_SCALAR_OPS (node).safe_grow (group_size, true);
  SLP_TREE_LOAD_PERMUTATION (node).release ();
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
      tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
      SLP_TREE_SCALAR_OPS (node)[i] = lhs;
/* Return true if all elements of the slice are the same.  */

vect_scalar_ops_slice::all_same_p () const
  for (unsigned int i = 1; i < length; ++i)
    if (!operand_equal_p (op (0), op (i)))

vect_scalar_ops_slice_hash::hash (const value_type &s)
  for (unsigned i = 0; i < s.length; ++i)
    hash = iterative_hash_expr (s.op (i), hash);

vect_scalar_ops_slice_hash::equal (const value_type &s1,
				   const compare_type &s2)
  if (s1.length != s2.length)
  for (unsigned i = 0; i < s1.length; ++i)
    if (!operand_equal_p (s1.op (i), s2.op (i)))
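/* These hash/equality helpers let vect_prologue_cost_for_slp below treat
   two vector constants built from identical slices of scalar operands as
   a single construction when costing, instead of charging each one.  */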
/* Compute the prologue cost for invariant or constant operands represented
   by NODE.  */

vect_prologue_cost_for_slp (slp_tree node,
			    stmt_vector_for_cost *cost_vec)
  /* There's a special case of an existing vector, that costs nothing.  */
  if (SLP_TREE_SCALAR_OPS (node).length () == 0
      && !SLP_TREE_VEC_DEFS (node).is_empty ())
  /* Without looking at the actual initializer a vector of
     constants can be implemented as load from the constant pool.
     When all elements are the same we can use a splat.  */
  tree vectype = SLP_TREE_VECTYPE (node);
  unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
  unsigned HOST_WIDE_INT const_nunits;
  unsigned nelt_limit;
  auto ops = &SLP_TREE_SCALAR_OPS (node);
  auto_vec<unsigned int> starts (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
  if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
      && ! multiple_p (const_nunits, group_size))
      nelt_limit = const_nunits;
      hash_set<vect_scalar_ops_slice_hash> vector_ops;
      for (unsigned int i = 0; i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); ++i)
	if (!vector_ops.add ({ ops, i * const_nunits, const_nunits }))
	  starts.quick_push (i * const_nunits);
      /* If either the vector has variable length or the vectors
	 are composed of repeated whole groups we only need to
	 cost construction once.  All vectors will be the same.  */
      nelt_limit = group_size;
      starts.quick_push (0);
  /* ??? We're just tracking whether vectors in a single node are the same.
     Ideally we'd do something more global.  */
  for (unsigned int start : starts)
      vect_cost_for_stmt kind;
      if (SLP_TREE_DEF_TYPE (node) == vect_constant_def)
      else if (vect_scalar_ops_slice { ops, start, nelt_limit }.all_same_p ())
	kind = scalar_to_vec;
	kind = vec_construct;
      record_stmt_cost (cost_vec, 1, kind, node, vectype, 0, vect_prologue);
/* Analyze statements contained in SLP tree NODE after recursively analyzing
   the subtree.  NODE_INSTANCE contains NODE and VINFO contains INSTANCE.

   Return true if the operations are supported.  */

vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
				  slp_instance node_instance,
				  hash_set<slp_tree> &visited_set,
				  vec<slp_tree> &visited_vec,
				  stmt_vector_for_cost *cost_vec)
  /* Assume we can code-generate all invariants.  */
      || SLP_TREE_DEF_TYPE (node) == vect_constant_def
      || SLP_TREE_DEF_TYPE (node) == vect_external_def)

  if (SLP_TREE_DEF_TYPE (node) == vect_uninitialized_def)
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Failed cyclic SLP reference in %p\n", node);

  gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_internal_def);

  /* If we already analyzed the exact same set of scalar stmts we're done.
     We share the generated vector stmts for those.  */
  if (visited_set.add (node))

  visited_vec.safe_push (node);

  unsigned visited_rec_start = visited_vec.length ();
  unsigned cost_vec_rec_start = cost_vec->length ();
  bool seen_non_constant_child = false;
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
      res = vect_slp_analyze_node_operations (vinfo, child, node_instance,
					      visited_set, visited_vec,
      if (child && SLP_TREE_DEF_TYPE (child) != vect_constant_def)
	seen_non_constant_child = true;
  /* We're having difficulties scheduling nodes with just constant
     operands and no scalar stmts since we then cannot compute a stmt
     ...  */
  if (!seen_non_constant_child && SLP_TREE_SCALAR_STMTS (node).is_empty ())
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Cannot vectorize all-constant op node %p\n", node);

  res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
  /* If analysis failed we have to pop all recursive visited nodes
     ...  */
  while (visited_vec.length () >= visited_rec_start)
    visited_set.remove (visited_vec.pop ());
  cost_vec->truncate (cost_vec_rec_start);

  /* When the node can be vectorized cost invariant nodes it references.
     This is not done in DFS order to allow the referring node
     vectorizable_* calls to nail down the invariant nodes vector type
     and possibly unshare it if it needs a different vector type than
     ...  */
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
	&& (SLP_TREE_DEF_TYPE (child) == vect_constant_def
	    || SLP_TREE_DEF_TYPE (child) == vect_external_def)
	/* Perform usual caching, note code-generation still
	   code-gens these nodes multiple times but we expect
	   to CSE them later.  */
	&& !visited_set.add (child))
      visited_vec.safe_push (child);
      /* ??? After auditing more code paths make a "default"
	 and push the vector type from NODE to all children
	 if it is not already set.  */
      /* Compute the number of vectors to be generated.  */
      tree vector_type = SLP_TREE_VECTYPE (child);
	  /* For shifts with a scalar argument we don't need
	     to cost or code-generate anything.
	     ??? Represent this more explicitly.  */
	  gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
		       == shift_vec_info_type)
      unsigned group_size = SLP_TREE_LANES (child);
      if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
	vf = loop_vinfo->vectorization_factor;
      SLP_TREE_NUMBER_OF_VEC_STMTS (child)
	= vect_get_num_vectors (vf * group_size, vector_type);
      /* And cost them.  */
      vect_prologue_cost_for_slp (child, cost_vec);

  /* If this node or any of its children can't be vectorized, try pruning
     the tree here rather than felling the whole thing.  */
  if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
    /* We'll need to revisit this for invariant costing and number
       of vectorized stmt setting.  */
/* Mark lanes of NODE that are live outside of the basic-block vectorized
   region and that can be vectorized using vectorizable_live_operation
   with STMT_VINFO_LIVE_P.  Not handled live operations will cause the
   scalar code computing it to be retained.  */

vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
			     slp_instance instance,
			     stmt_vector_for_cost *cost_vec,
			     hash_set<stmt_vec_info> &svisited,
			     hash_set<slp_tree> &visited)
  if (visited.add (node))

  stmt_vec_info stmt_info;
  stmt_vec_info last_stmt = vect_find_last_scalar_stmt_in_slp (node);
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
      if (svisited.contains (stmt_info))
      stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
      if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)
	  && STMT_VINFO_RELATED_STMT (orig_stmt_info) != stmt_info)
	/* Only the pattern root stmt computes the original scalar value.  */
      bool mark_visited = true;
      gimple *orig_stmt = orig_stmt_info->stmt;
      ssa_op_iter op_iter;
      def_operand_p def_p;
      FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
	  imm_use_iterator use_iter;
	  stmt_vec_info use_stmt_info;
	  FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
	    if (!is_gimple_debug (use_stmt))
		use_stmt_info = bb_vinfo->lookup_stmt (use_stmt);
		    || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
		    STMT_VINFO_LIVE_P (stmt_info) = true;
		    if (vectorizable_live_operation (bb_vinfo, stmt_info,
						     NULL, node, instance, i,
		      /* ??? So we know we can vectorize the live stmt
			 from one SLP node.  If we cannot do so from all
			 or none consistently we'd have to record which
			 SLP node (and lane) we want to use for the live
			 operation.  So make sure we can code-generate
			 ...  */
			mark_visited = false;
			STMT_VINFO_LIVE_P (stmt_info) = false;

	  /* We have to verify whether we can insert the lane extract
	     before all uses.  The following is a conservative approximation.
	     We cannot put this into vectorizable_live_operation because
	     iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
	     ...
	     Note that while the fact that we emit code for loads at the
	     first load should make this a non-problem leafs we construct
	     from scalars are vectorized after the last scalar def.
	     ??? If we'd actually compute the insert location during
	     analysis we could use sth less conservative than the last
	     scalar stmt in the node for the dominance check.  */
	  /* ??? What remains is "live" uses in vector CTORs in the same
	     SLP graph which is where those uses can end up code-generated
	     right after their definition instead of close to their original
	     use.  But that would restrict us to code-generate lane-extracts
	     from the latest stmt in a node.  So we compensate for this
	     during code-generation, simply not replacing uses for those
	     hopefully rare cases.  */
	  if (STMT_VINFO_LIVE_P (stmt_info))
	    FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
	      if (!is_gimple_debug (use_stmt)
		  && (!(use_stmt_info = bb_vinfo->lookup_stmt (use_stmt))
		      || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
		  && !vect_stmt_dominates_stmt_p (last_stmt->stmt, use_stmt))
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Cannot determine insertion place for "
		  STMT_VINFO_LIVE_P (stmt_info) = false;
		  mark_visited = true;

	svisited.add (stmt_info);

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
      vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance,
				   cost_vec, svisited, visited);
/* Determine whether we can vectorize the reduction epilogue for INSTANCE.  */

vectorizable_bb_reduc_epilogue (slp_instance instance,
				stmt_vector_for_cost *cost_vec)
  gassign *stmt = as_a <gassign *> (instance->root_stmts[0]->stmt);
  enum tree_code reduc_code = gimple_assign_rhs_code (stmt);
  if (reduc_code == MINUS_EXPR)
    reduc_code = PLUS_EXPR;
  internal_fn reduc_fn;
  tree vectype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (instance));
  if (!reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
      || reduc_fn == IFN_LAST
      || !direct_internal_fn_supported_p (reduc_fn, vectype, OPTIMIZE_FOR_BOTH)
      || !useless_type_conversion_p (TREE_TYPE (gimple_assign_lhs (stmt)),
				     TREE_TYPE (vectype)))

  /* There's no way to cost a horizontal vector reduction via REDUC_FN so
     cost log2 vector operations plus shuffles and one extraction.  */
  unsigned steps = floor_log2 (vect_nunits_for_cost (vectype));
  record_stmt_cost (cost_vec, steps, vector_stmt, instance->root_stmts[0],
		    vectype, 0, vect_body);
  record_stmt_cost (cost_vec, steps, vec_perm, instance->root_stmts[0],
		    vectype, 0, vect_body);
  record_stmt_cost (cost_vec, 1, vec_to_scalar, instance->root_stmts[0],
		    vectype, 0, vect_body);
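/* As a rough example: for an 8-lane vector the code above accounts
   floor_log2 (8) == 3 vector operations plus 3 shuffles and one final
   element extraction for the reduction epilogue.  */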
/* Prune from ROOTS all stmts that are computed as part of lanes of NODE
   and recurse to children.  */

vect_slp_prune_covered_roots (slp_tree node, hash_set<stmt_vec_info> &roots,
			      hash_set<slp_tree> &visited)
  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def
      || visited.add (node))

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    roots.remove (vect_orig_stmt (stmt));

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
      vect_slp_prune_covered_roots (child, roots, visited);
/* Analyze statements in SLP instances of VINFO.  Return true if the
   operations are supported.  */

vect_slp_analyze_operations (vec_info *vinfo)
  slp_instance instance;

  DUMP_VECT_SCOPE ("vect_slp_analyze_operations");

  hash_set<slp_tree> visited;
  for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
      auto_vec<slp_tree> visited_vec;
      stmt_vector_for_cost cost_vec;
      cost_vec.create (2);
      if (is_a <bb_vec_info> (vinfo))
	vect_location = instance->location ();
      if (!vect_slp_analyze_node_operations (vinfo,
					     SLP_INSTANCE_TREE (instance),
					     instance, visited, visited_vec,
	  /* CTOR instances require vectorized defs for the SLP tree root.  */
	  || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor
	      && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
		  != vect_internal_def
		  /* Make sure we vectorized with the expected type.  */
		  || !useless_type_conversion_p
			(TREE_TYPE (TREE_TYPE (gimple_assign_rhs1
						 (instance->root_stmts[0]->stmt))),
			 TREE_TYPE (SLP_TREE_VECTYPE
				      (SLP_INSTANCE_TREE (instance))))))
	  /* Check we can vectorize the reduction.  */
	  || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_bb_reduc
	      && !vectorizable_bb_reduc_epilogue (instance, &cost_vec)))
	  slp_tree node = SLP_INSTANCE_TREE (instance);
	  stmt_vec_info stmt_info;
	  if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
	    stmt_info = SLP_INSTANCE_ROOT_STMTS (instance)[0];
	    stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "removing SLP instance operations starting from: %G",
	  vect_free_slp_instance (instance);
	  vinfo->slp_instances.ordered_remove (i);
	  cost_vec.release ();
	  while (!visited_vec.is_empty ())
	    visited.remove (visited_vec.pop ());

      if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
	  add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
	  cost_vec.release ();
	/* For BB vectorization remember the SLP graph entry
	   ...  */
	instance->cost_vec = cost_vec;

  /* Now look for SLP instances with a root that are covered by other
     instances and remove them.  */
  hash_set<stmt_vec_info> roots;
  for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
    if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
      roots.add (SLP_INSTANCE_ROOT_STMTS (instance)[0]);
  if (!roots.is_empty ())
      for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
	vect_slp_prune_covered_roots (SLP_INSTANCE_TREE (instance), roots,
      for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
	if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ()
	    && !roots.contains (SLP_INSTANCE_ROOT_STMTS (instance)[0]))
	    stmt_vec_info root = SLP_INSTANCE_ROOT_STMTS (instance)[0];
	    if (dump_enabled_p ())
	      dump_printf_loc (MSG_NOTE, vect_location,
			       "removing SLP instance operations starting "
			       "from: %G", root->stmt);
	    vect_free_slp_instance (instance);
	    vinfo->slp_instances.ordered_remove (i);

  /* Compute vectorizable live stmts.  */
  if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
      hash_set<stmt_vec_info> svisited;
      hash_set<slp_tree> visited;
      for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
	  vect_location = instance->location ();
	  vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
				       instance, &instance->cost_vec, svisited,

  return !vinfo->slp_instances.is_empty ();
/* Get the SLP instance leader from INSTANCE_LEADER thereby transitively
   closing the eventual chain.  */

get_ultimate_leader (slp_instance instance,
		     hash_map<slp_instance, slp_instance> &instance_leader)
  auto_vec<slp_instance *, 8> chain;
  while (*(tem = instance_leader.get (instance)) != instance)
      chain.safe_push (tem);
  while (!chain.is_empty ())
    *chain.pop () = instance;
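/* This is essentially union-find with path compression: the chain of
   recorded leaders is followed to its end and every visited entry is
   re-pointed directly at the final leader.  */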
/* Worker of vect_bb_partition_graph, recurse on NODE.  */

vect_bb_partition_graph_r (bb_vec_info bb_vinfo,
			   slp_instance instance, slp_tree node,
			   hash_map<stmt_vec_info, slp_instance> &stmt_to_instance,
			   hash_map<slp_instance, slp_instance> &instance_leader,
			   hash_set<slp_tree> &visited)
  stmt_vec_info stmt_info;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
      slp_instance &stmt_instance
	= stmt_to_instance.get_or_insert (stmt_info, &existed_p);
      else if (stmt_instance != instance)
	  /* If we're running into a previously marked stmt make us the
	     leader of the current ultimate leader.  This keeps the
	     leader chain acyclic and works even when the current instance
	     connects two previously independent graph parts.  */
	  slp_instance stmt_leader
	    = get_ultimate_leader (stmt_instance, instance_leader);
	  if (stmt_leader != instance)
	    instance_leader.put (stmt_leader, instance);
      stmt_instance = instance;

  if (!SLP_TREE_SCALAR_STMTS (node).is_empty () && visited.add (node))

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
      vect_bb_partition_graph_r (bb_vinfo, instance, child, stmt_to_instance,
				 instance_leader, visited);
/* Partition the SLP graph into pieces that can be costed independently.  */

vect_bb_partition_graph (bb_vec_info bb_vinfo)
  DUMP_VECT_SCOPE ("vect_bb_partition_graph");

  /* First walk the SLP graph assigning each involved scalar stmt a
     corresponding SLP graph entry and upon visiting a previously
     marked stmt, make the stmt's leader the current SLP graph entry.  */
  hash_map<stmt_vec_info, slp_instance> stmt_to_instance;
  hash_map<slp_instance, slp_instance> instance_leader;
  hash_set<slp_tree> visited;
  slp_instance instance;
  for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
      instance_leader.put (instance, instance);
      vect_bb_partition_graph_r (bb_vinfo,
				 instance, SLP_INSTANCE_TREE (instance),
				 stmt_to_instance, instance_leader,

  /* Then collect entries to each independent subgraph.  */
  for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
      slp_instance leader = get_ultimate_leader (instance, instance_leader);
      leader->subgraph_entries.safe_push (instance);
      if (dump_enabled_p ()
	  && leader != instance)
	dump_printf_loc (MSG_NOTE, vect_location,
			 "instance %p is leader of %p\n",
/* Compute the set of scalar stmts participating in internal and external
   nodes.  */

vect_slp_gather_vectorized_scalar_stmts (vec_info *vinfo, slp_tree node,
					 hash_set<slp_tree> &visited,
					 hash_set<stmt_vec_info> &vstmts,
					 hash_set<stmt_vec_info> &estmts)
  stmt_vec_info stmt_info;

  if (visited.add (node))

  if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
	vstmts.add (stmt_info);

      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
	  vect_slp_gather_vectorized_scalar_stmts (vinfo, child, visited,

    for (tree def : SLP_TREE_SCALAR_OPS (node))
	stmt_vec_info def_stmt = vinfo->lookup_def (def);
	  estmts.add (def_stmt);
/* Compute the scalar cost of the SLP node NODE and its children
   and return it.  Do not account defs that are marked in LIFE and
   update LIFE according to uses of NODE.  */

vect_bb_slp_scalar_cost (vec_info *vinfo,
			 slp_tree node, vec<bool, va_heap> *life,
			 stmt_vector_for_cost *cost_vec,
			 hash_set<stmt_vec_info> &vectorized_scalar_stmts,
			 hash_set<slp_tree> &visited)
  stmt_vec_info stmt_info;

  if (visited.add (node))

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
      ssa_op_iter op_iter;
      def_operand_p def_p;

      stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
      gimple *orig_stmt = orig_stmt_info->stmt;

      /* If there is a non-vectorized use of the defs then the scalar
	 stmt is kept live in which case we do not account it or any
	 required defs in the SLP children in the scalar cost.  This
	 way we make the vectorization more costly when compared to
	 ...  */
      if (!STMT_VINFO_LIVE_P (stmt_info))
	  FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
	      imm_use_iterator use_iter;
	      FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
		if (!is_gimple_debug (use_stmt))
		    stmt_vec_info use_stmt_info = vinfo->lookup_stmt (use_stmt);
			|| !vectorized_scalar_stmts.contains (use_stmt_info))

      /* Count scalar stmts only once.  */
      if (gimple_visited_p (orig_stmt))
      gimple_set_visited (orig_stmt, true);

      vect_cost_for_stmt kind;
      if (STMT_VINFO_DATA_REF (orig_stmt_info))
	  if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info)))
	    kind = scalar_store;
      else if (vect_nop_conversion_p (orig_stmt_info))
      /* For single-argument PHIs assume coalescing which means zero cost
	 for the scalar and the vector PHIs.  This avoids artificially
	 favoring the vector path (but may pessimize it in some cases).  */
      else if (is_a <gphi *> (orig_stmt_info->stmt)
	       && gimple_phi_num_args
		    (as_a <gphi *> (orig_stmt_info->stmt)) == 1)
      record_stmt_cost (cost_vec, 1, kind, orig_stmt_info,
			SLP_TREE_VECTYPE (node), 0, vect_body);

  auto_vec<bool, 20> subtree_life;
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
      if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
	  /* Do not directly pass LIFE to the recursive call, copy it to
	     confine changes in the callee to the current child/subtree.  */
	  if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
	      subtree_life.safe_grow_cleared (SLP_TREE_LANES (child), true);
	      for (unsigned j = 0;
		   j < SLP_TREE_LANE_PERMUTATION (node).length (); ++j)
		  auto perm = SLP_TREE_LANE_PERMUTATION (node)[j];
		  if (perm.first == i)
		    subtree_life[perm.second] = (*life)[j];
	      gcc_assert (SLP_TREE_LANES (node) == SLP_TREE_LANES (child));
	      subtree_life.safe_splice (*life);
	  vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
				   vectorized_scalar_stmts, visited);
	  subtree_life.truncate (0);
/* Comparator for the loop-index sorted cost vectors.  */

li_cost_vec_cmp (const void *a_, const void *b_)
  auto *a = (const std::pair<unsigned, stmt_info_for_cost *> *)a_;
  auto *b = (const std::pair<unsigned, stmt_info_for_cost *> *)b_;
  if (a->first < b->first)
  else if (a->first == b->first)
/* Check if vectorization of the basic block is profitable for the
   subgraph denoted by SLP_INSTANCES.  */

vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
				    vec<slp_instance> slp_instances,
  slp_instance instance;
  unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
  unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;

  if (dump_enabled_p ())
      dump_printf_loc (MSG_NOTE, vect_location, "Costing subgraph: \n");
      hash_set<slp_tree> visited;
      FOR_EACH_VEC_ELT (slp_instances, i, instance)
	vect_print_slp_graph (MSG_NOTE, vect_location,
			      SLP_INSTANCE_TREE (instance), visited);

  /* Compute the set of scalar stmts we know will go away 'locally' when
     vectorizing.  This used to be tracked with just PURE_SLP_STMT but that's
     not accurate for nodes promoted extern late or for scalar stmts that
     are used both in extern defs and in vectorized defs.  */
  hash_set<stmt_vec_info> vectorized_scalar_stmts;
  hash_set<stmt_vec_info> scalar_stmts_in_externs;
  hash_set<slp_tree> visited;
  FOR_EACH_VEC_ELT (slp_instances, i, instance)
      vect_slp_gather_vectorized_scalar_stmts (bb_vinfo,
					       SLP_INSTANCE_TREE (instance),
					       vectorized_scalar_stmts,
					       scalar_stmts_in_externs);
      for (stmt_vec_info rstmt : SLP_INSTANCE_ROOT_STMTS (instance))
	vectorized_scalar_stmts.add (rstmt);
  /* Scalar stmts used as defs in external nodes need to be preserved, so
     remove them from vectorized_scalar_stmts.  */
  for (stmt_vec_info stmt : scalar_stmts_in_externs)
    vectorized_scalar_stmts.remove (stmt);

  /* Calculate scalar cost and sum the cost for the vector stmts
     previously collected.  */
  stmt_vector_for_cost scalar_costs = vNULL;
  stmt_vector_for_cost vector_costs = vNULL;
  FOR_EACH_VEC_ELT (slp_instances, i, instance)
      auto_vec<bool, 20> life;
      life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)),
      if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
	record_stmt_cost (&scalar_costs,
			  SLP_INSTANCE_ROOT_STMTS (instance).length (),
			  SLP_INSTANCE_ROOT_STMTS (instance)[0], 0, vect_body);
      vect_bb_slp_scalar_cost (bb_vinfo,
			       SLP_INSTANCE_TREE (instance),
			       &life, &scalar_costs, vectorized_scalar_stmts,
      vector_costs.safe_splice (instance->cost_vec);
      instance->cost_vec.release ();

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");

  /* When costing non-loop vectorization we need to consider each covered
     loop independently and make sure vectorization is profitable.  For
     now we assume a loop may be not entered or executed an arbitrary
     number of iterations (??? static information can provide more
     precise info here) which means we can simply cost each containing
     loops stmts separately.  */

  /* First produce cost vectors sorted by loop index.  */
  auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
    li_scalar_costs (scalar_costs.length ());
  auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
    li_vector_costs (vector_costs.length ());
  stmt_info_for_cost *cost;
  FOR_EACH_VEC_ELT (scalar_costs, i, cost)
      unsigned l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
      li_scalar_costs.quick_push (std::make_pair (l, cost));
  /* Use a random used loop as fallback in case the first vector_costs
     entry does not have a stmt_info associated with it.  */
  unsigned l = li_scalar_costs[0].first;
  FOR_EACH_VEC_ELT (vector_costs, i, cost)
      /* We inherit from the previous COST, invariants, externals and
	 extracts immediately follow the cost for the related stmt.  */
      if (cost->stmt_info)
	l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
      li_vector_costs.quick_push (std::make_pair (l, cost));
  li_scalar_costs.qsort (li_cost_vec_cmp);
  li_vector_costs.qsort (li_cost_vec_cmp);

  /* Now cost the portions individually.  */
  bool profitable = true;
  while (si < li_scalar_costs.length ()
	 && vi < li_vector_costs.length ())
      unsigned sl = li_scalar_costs[si].first;
      unsigned vl = li_vector_costs[vi].first;
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Scalar %d and vector %d loop part do not "
			     "match up, skipping scalar part\n", sl, vl);
	  /* Skip the scalar part, assuming zero cost on the vector side.  */
	  while (si < li_scalar_costs.length ()
		 && li_scalar_costs[si].first == sl);

      class vector_costs *scalar_target_cost_data = init_cost (bb_vinfo, true);
	  add_stmt_cost (scalar_target_cost_data, li_scalar_costs[si].second);
      while (si < li_scalar_costs.length ()
	     && li_scalar_costs[si].first == sl);
      finish_cost (scalar_target_cost_data, nullptr,
		   &dummy, &scalar_cost, &dummy);

      /* Complete the target-specific vector cost calculation.  */
      class vector_costs *vect_target_cost_data = init_cost (bb_vinfo, false);
	  add_stmt_cost (vect_target_cost_data, li_vector_costs[vi].second);
      while (vi < li_vector_costs.length ()
	     && li_vector_costs[vi].first == vl);
      finish_cost (vect_target_cost_data, scalar_target_cost_data,
		   &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost);
      delete scalar_target_cost_data;
      delete vect_target_cost_data;

      vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;

      if (dump_enabled_p ())
	  dump_printf_loc (MSG_NOTE, vect_location,
			   "Cost model analysis for part in loop %d:\n", sl);
	  dump_printf (MSG_NOTE, " Vector cost: %d\n",
		       vec_inside_cost + vec_outside_cost);
	  dump_printf (MSG_NOTE, " Scalar cost: %d\n", scalar_cost);

      /* Vectorization is profitable if its cost is not more than the cost
	 of the scalar version.  Note that we err on the vector side for
	 equal cost because the cost estimate is otherwise quite pessimistic
	 (constant uses are free on the scalar side but cost a load on the
	 vector side for ...).  */
      if (vec_outside_cost + vec_inside_cost > scalar_cost)

  if (profitable && vi < li_vector_costs.length ())
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Excess vector cost for part in loop %d:\n",
			 li_vector_costs[vi].first);

  /* Unset visited flag.  This is delayed when the subgraph is profitable
     and we process the loop for remaining unvectorized if-converted code.  */
  if (!orig_loop || !profitable)
    FOR_EACH_VEC_ELT (scalar_costs, i, cost)
      gimple_set_visited (cost->stmt_info->stmt, false);

  scalar_costs.release ();
  vector_costs.release ();
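/* The costing above walks the two loop-index sorted cost vectors in
   lock-step, finishing one target cost calculation per loop, so a
   subgraph spanning several loops must be profitable in each of them
   for the whole subgraph to be considered profitable.  */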
/* qsort comparator for lane defs.  */

vld_cmp (const void *a_, const void *b_)
  auto *a = (const std::pair<unsigned, tree> *)a_;
  auto *b = (const std::pair<unsigned, tree> *)b_;
  return a->first - b->first;
/* Return true if USE_STMT is a vector lane insert into VEC and set
   *THIS_LANE to the lane number that is set.  */

vect_slp_is_lane_insert (gimple *use_stmt, tree vec, unsigned *this_lane)
  gassign *use_ass = dyn_cast <gassign *> (use_stmt);
      || gimple_assign_rhs_code (use_ass) != BIT_INSERT_EXPR
	  ? gimple_assign_rhs1 (use_ass) != vec
	  : ((vec = gimple_assign_rhs1 (use_ass)), false))
      || !useless_type_conversion_p (TREE_TYPE (TREE_TYPE (vec)),
				     TREE_TYPE (gimple_assign_rhs2 (use_ass)))
      || !constant_multiple_p
	    (tree_to_poly_uint64 (gimple_assign_rhs3 (use_ass)),
	     tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (TREE_TYPE (vec)))),
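/* Illustrative example: for a V4SF destination the stmt
     v_2 = BIT_INSERT_EXPR <v_1, s_3, 64>;
   inserts s_3 at bit position 64, i.e. *THIS_LANE == 2 since each
   element is 32 bits wide.  */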
/* Find any vectorizable constructors and add them to the grouped_store
   array.  */

vect_slp_check_for_constructors (bb_vec_info bb_vinfo)
  for (unsigned i = 0; i < bb_vinfo->bbs.length (); ++i)
    for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[i]);
	 !gsi_end_p (gsi); gsi_next (&gsi))
	gassign *assign = dyn_cast <gassign *> (gsi_stmt (gsi));

	tree rhs = gimple_assign_rhs1 (assign);
	enum tree_code code = gimple_assign_rhs_code (assign);
	use_operand_p use_p;
	if (code == CONSTRUCTOR)
	    if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
		|| maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
			     CONSTRUCTOR_NELTS (rhs))
		|| VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
		|| uniform_vector_p (rhs))

	    FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
	      if (TREE_CODE (val) != SSA_NAME
		  || !bb_vinfo->lookup_def (val))
	    if (j != CONSTRUCTOR_NELTS (rhs))

	    stmt_vec_info stmt_info = bb_vinfo->lookup_stmt (assign);
	    BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
	else if (code == BIT_INSERT_EXPR
		 && VECTOR_TYPE_P (TREE_TYPE (rhs))
		 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).is_constant ()
		 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).to_constant () > 1
		 && integer_zerop (gimple_assign_rhs3 (assign))
		 && useless_type_conversion_p
		      (TREE_TYPE (TREE_TYPE (rhs)),
		       TREE_TYPE (gimple_assign_rhs2 (assign)))
		 && bb_vinfo->lookup_def (gimple_assign_rhs2 (assign)))
	    /* We start to match on insert to lane zero but since the
	       inserts need not be ordered we'd have to search both
	       the def and the use chains.  */
	    tree vectype = TREE_TYPE (rhs);
	    unsigned nlanes = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
	    auto_vec<std::pair<unsigned, tree> > lane_defs (nlanes);
	    auto_sbitmap lanes (nlanes);
	    bitmap_clear (lanes);
	    bitmap_set_bit (lanes, 0);
	    tree def = gimple_assign_lhs (assign);
	    lane_defs.quick_push
		       (std::make_pair (0, gimple_assign_rhs2 (assign)));
	    unsigned lanes_found = 1;
	    /* Start with the use chains, the last stmt will be the root.  */
	    stmt_vec_info last = bb_vinfo->lookup_stmt (assign);
	    vec<stmt_vec_info> roots = vNULL;
	    roots.safe_push (last);
		use_operand_p use_p;
		if (!single_imm_use (def, &use_p, &use_stmt))
		if (!bb_vinfo->lookup_stmt (use_stmt)
		    || !vect_slp_is_lane_insert (use_stmt, def, &this_lane)
		    || !bb_vinfo->lookup_def (gimple_assign_rhs2 (use_stmt)))
		if (bitmap_bit_p (lanes, this_lane))
		bitmap_set_bit (lanes, this_lane);
		gassign *use_ass = as_a <gassign *> (use_stmt);
		lane_defs.quick_push (std::make_pair
					(this_lane, gimple_assign_rhs2 (use_ass)));
		last = bb_vinfo->lookup_stmt (use_ass);
		roots.safe_push (last);
		def = gimple_assign_lhs (use_ass);
	    while (lanes_found < nlanes);
	    if (roots.length () > 1)
	      std::swap (roots[0], roots[roots.length () - 1]);
	    if (lanes_found < nlanes)
		/* Now search the def chain.  */
		def = gimple_assign_rhs1 (assign);
		    if (TREE_CODE (def) != SSA_NAME
			|| !has_single_use (def))
		    gimple *def_stmt = SSA_NAME_DEF_STMT (def);
		    if (!bb_vinfo->lookup_stmt (def_stmt)
			|| !vect_slp_is_lane_insert (def_stmt,
						     NULL_TREE, &this_lane)
			|| !bb_vinfo->lookup_def (gimple_assign_rhs2 (def_stmt)))
		    if (bitmap_bit_p (lanes, this_lane))
		    bitmap_set_bit (lanes, this_lane);
		    lane_defs.quick_push (std::make_pair
					    gimple_assign_rhs2 (def_stmt)));
		    roots.safe_push (bb_vinfo->lookup_stmt (def_stmt));
		    def = gimple_assign_rhs1 (def_stmt);
		while (lanes_found < nlanes);
	    if (lanes_found == nlanes)
		/* Sort lane_defs after the lane index and register the root.  */
		lane_defs.qsort (vld_cmp);
		vec<stmt_vec_info> stmts;
		stmts.create (nlanes);
		for (unsigned i = 0; i < nlanes; ++i)
		  stmts.quick_push (bb_vinfo->lookup_def (lane_defs[i].second));
		bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
	else if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
		 && (associative_tree_code (code) || code == MINUS_EXPR)
		 /* ??? The flag_associative_math and TYPE_OVERFLOW_WRAPS
		    checks pessimize a two-element reduction.  PR54400.
		    ??? In-order reduction could be handled if we only
		    traverse one operand chain in vect_slp_linearize_chain.  */
		 && ((FLOAT_TYPE_P (TREE_TYPE (rhs)) && flag_associative_math)
		     || (INTEGRAL_TYPE_P (TREE_TYPE (rhs))
			 && TYPE_OVERFLOW_WRAPS (TREE_TYPE (rhs))))
		 /* Ops with constants at the tail can be stripped here.  */
		 && TREE_CODE (rhs) == SSA_NAME
		 && TREE_CODE (gimple_assign_rhs2 (assign)) == SSA_NAME
		 /* Should be the chain end.  */
		 && (!single_imm_use (gimple_assign_lhs (assign),
		     || !is_gimple_assign (use_stmt)
		     || (gimple_assign_rhs_code (use_stmt) != code
			 && ((code != PLUS_EXPR && code != MINUS_EXPR)
			     || (gimple_assign_rhs_code (use_stmt)
				 != (code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR))))))
	    /* We start the match at the end of a possible association
	       chain.  */
	    auto_vec<chain_op_t> chain;
	    auto_vec<std::pair<tree_code, gimple *> > worklist;
	    auto_vec<gimple *> chain_stmts;
	    gimple *code_stmt = NULL, *alt_code_stmt = NULL;
	    if (code == MINUS_EXPR)
	    internal_fn reduc_fn;
	    if (!reduction_fn_for_scalar_code (code, &reduc_fn)
		|| reduc_fn == IFN_LAST)
	    vect_slp_linearize_chain (bb_vinfo, worklist, chain, code, assign,
				      code_stmt, alt_code_stmt, &chain_stmts);
	    if (chain.length () > 1)
		/* Sort the chain according to def_type and operation.  */
		chain.sort (dt_sort_cmp, bb_vinfo);
		/* ??? Now we'd want to strip externals and constants
		   but record those to be handled in the epilogue.  */
		/* ??? For now do not allow mixing ops or externs/constants.  */
		bool invalid = false;
		for (unsigned i = 0; i < chain.length (); ++i)
		  if (chain[i].dt != vect_internal_def
		      || chain[i].code != code)
		vec<stmt_vec_info> stmts;
		stmts.create (chain.length ());
		for (unsigned i = 0; i < chain.length (); ++i)
		  stmts.quick_push (bb_vinfo->lookup_def (chain[i].op));
		vec<stmt_vec_info> roots;
		roots.create (chain_stmts.length ());
		for (unsigned i = 0; i < chain_stmts.length (); ++i)
		  roots.quick_push (bb_vinfo->lookup_stmt (chain_stmts[i]));
		bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_bb_reduc,
/* Walk the grouped store chains and replace entries with their
   pattern variant if any.  */

vect_fixup_store_groups_with_patterns (vec_info *vinfo)
  stmt_vec_info first_element;

  FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
      /* We also have CTORs in this array.  */
      if (!STMT_VINFO_GROUPED_ACCESS (first_element))
      if (STMT_VINFO_IN_PATTERN_P (first_element))
	  stmt_vec_info orig = first_element;
	  first_element = STMT_VINFO_RELATED_STMT (first_element);
	  DR_GROUP_FIRST_ELEMENT (first_element) = first_element;
	  DR_GROUP_SIZE (first_element) = DR_GROUP_SIZE (orig);
	  DR_GROUP_GAP (first_element) = DR_GROUP_GAP (orig);
	  DR_GROUP_NEXT_ELEMENT (first_element) = DR_GROUP_NEXT_ELEMENT (orig);
	  vinfo->grouped_stores[i] = first_element;
      stmt_vec_info prev = first_element;
      while (DR_GROUP_NEXT_ELEMENT (prev))
	  stmt_vec_info elt = DR_GROUP_NEXT_ELEMENT (prev);
	  if (STMT_VINFO_IN_PATTERN_P (elt))
	      stmt_vec_info orig = elt;
	      elt = STMT_VINFO_RELATED_STMT (elt);
	      DR_GROUP_NEXT_ELEMENT (prev) = elt;
	      DR_GROUP_GAP (elt) = DR_GROUP_GAP (orig);
	      DR_GROUP_NEXT_ELEMENT (elt) = DR_GROUP_NEXT_ELEMENT (orig);
	  DR_GROUP_FIRST_ELEMENT (elt) = first_element;
/* Check if the region described by BB_VINFO can be vectorized, returning
   true if so.  When returning false, set FATAL to true if the same failure
   would prevent vectorization at other vector sizes, false if it is still
   worth trying other sizes.  N_STMTS is the number of statements in the
   region.  */

static bool
vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
		       vec<int> *dataref_groups)
{
  DUMP_VECT_SCOPE ("vect_slp_analyze_bb");

  slp_instance instance;
  int i;
  poly_uint64 min_vf = 2;

  /* The first group of checks is independent of the vector size.  */
  fatal = true;

  /* Analyze the data references.  */

  if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "not vectorized: unhandled data-ref in basic "
			 "block.\n");
      return false;
    }

  if (!vect_analyze_data_ref_accesses (bb_vinfo, dataref_groups))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "not vectorized: unhandled data access in "
			 "basic block.\n");
      return false;
    }

  vect_slp_check_for_constructors (bb_vinfo);

  /* If there are no grouped stores and no constructors in the region
     there is no need to continue with pattern recog as vect_analyze_slp
     will fail anyway.  */
  if (bb_vinfo->grouped_stores.is_empty ()
      && bb_vinfo->roots.is_empty ())
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "not vectorized: no grouped stores in "
			 "basic block.\n");
      return false;
    }

  /* While the rest of the analysis below depends on it in some way.  */
  fatal = false;

  vect_pattern_recog (bb_vinfo);

  /* Update store groups from pattern processing.  */
  vect_fixup_store_groups_with_patterns (bb_vinfo);

  /* Check the SLP opportunities in the basic block, analyze and build SLP
     trees.  */
  if (!vect_analyze_slp (bb_vinfo, n_stmts))
    {
      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			   "Failed to SLP the basic block.\n");
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			   "not vectorized: failed to find SLP opportunities "
			   "in basic block.\n");
	}
      return false;
    }

  /* Optimize permutations.  */
  vect_optimize_slp (bb_vinfo);

  /* Gather the loads reachable from the SLP graph entries.  */
  vect_gather_slp_loads (bb_vinfo);

  vect_record_base_alignments (bb_vinfo);

  /* Analyze and verify the alignment of data references and the
     dependence in the SLP instances.  */
  for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
    {
      vect_location = instance->location ();
      if (! vect_slp_analyze_instance_alignment (bb_vinfo, instance)
	  || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
	{
	  slp_tree node = SLP_INSTANCE_TREE (instance);
	  stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "removing SLP instance operations starting from: %G",
			     stmt_info->stmt);
	  vect_free_slp_instance (instance);
	  BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
	  continue;
	}

      /* Mark all the statements that we want to vectorize as pure SLP and
	 relevant.  */
      vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
      vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));

      unsigned j;
      stmt_vec_info root;
      /* Likewise consider instance root stmts as vectorized.  */
      FOR_EACH_VEC_ELT (SLP_INSTANCE_ROOT_STMTS (instance), j, root)
	STMT_SLP_TYPE (root) = pure_slp;

      i++;
    }
  if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
    return false;

  if (!vect_slp_analyze_operations (bb_vinfo))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "not vectorized: bad operation in basic block.\n");
      return false;
    }

  vect_bb_partition_graph (bb_vinfo);

  return true;
}
/* Subroutine of vect_slp_bb.  Try to vectorize the statements for all
   basic blocks in BBS, returning true on success.
   The region has N_STMTS statements and has the datarefs given by DATAREFS.  */
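/* Illustrative sketch (the mode list is only an assumed example): on a
   target whose autovectorize_vector_modes hook returns, say,
   { V16QImode, V8QImode, V4QImode }, the loop below first analyzes the
   region with an auto-detected vector mode, then re-tries the analysis
   with each listed mode, skipping modes that vect_chooses_same_modes_p
   or the related_vector_mode checks show would merely repeat a previous
   analysis, and stops once a mode vectorizes or a fatal failure is seen.  */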
static bool
vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
		 vec<int> *dataref_groups, unsigned int n_stmts,
		 loop_p orig_loop)
{
  bb_vec_info bb_vinfo;
  auto_vector_modes vector_modes;

  /* Autodetect first vector size we try.  */
  machine_mode next_vector_mode = VOIDmode;
  targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
  unsigned int mode_i = 0;

  vec_info_shared shared;

  machine_mode autodetected_vector_mode = VOIDmode;
  while (1)
    {
      bool vectorized = false;
      bool fatal = false;
      bb_vinfo = new _bb_vec_info (bbs, &shared);

      bool first_time_p = shared.datarefs.is_empty ();
      BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
      if (first_time_p)
	bb_vinfo->shared->save_datarefs ();
      else
	bb_vinfo->shared->check_datarefs ();
      bb_vinfo->vector_mode = next_vector_mode;

      if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups))
	{
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_NOTE, vect_location,
			       "***** Analysis succeeded with vector mode"
			       " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
	      dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
	    }

	  bb_vinfo->shared->check_datarefs ();

	  auto_vec<slp_instance> profitable_subgraphs;
	  for (slp_instance instance : BB_VINFO_SLP_INSTANCES (bb_vinfo))
	    {
	      if (instance->subgraph_entries.is_empty ())
		continue;

	      vect_location = instance->location ();
	      if (!unlimited_cost_model (NULL)
		  && !vect_bb_vectorization_profitable_p
			(bb_vinfo, instance->subgraph_entries, orig_loop))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "not vectorized: vectorization is not "
				     "profitable.\n");
		  continue;
		}

	      if (!dbg_cnt (vect_slp))
		continue;

	      profitable_subgraphs.safe_push (instance);
	    }

	  /* When we're vectorizing an if-converted loop body make sure
	     we vectorized all if-converted code.  */
	  if (!profitable_subgraphs.is_empty ()
	      && orig_loop)
	    {
	      gcc_assert (bb_vinfo->bbs.length () == 1);
	      for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[0]);
		   !gsi_end_p (gsi); gsi_next (&gsi))
		{
		  /* The costing above left us with DCEable vectorized scalar
		     stmts having the visited flag set on profitable
		     subgraphs.  Do the delayed clearing of the flag here.  */
		  if (gimple_visited_p (gsi_stmt (gsi)))
		    {
		      gimple_set_visited (gsi_stmt (gsi), false);
		      continue;
		    }
		  if (flag_vect_cost_model == VECT_COST_MODEL_UNLIMITED)
		    continue;

		  if (gassign *ass = dyn_cast <gassign *> (gsi_stmt (gsi)))
		    if (gimple_assign_rhs_code (ass) == COND_EXPR)
		      {
			if (!profitable_subgraphs.is_empty ()
			    && dump_enabled_p ())
			  dump_printf_loc (MSG_NOTE, vect_location,
					   "not profitable because of "
					   "unprofitable if-converted scalar "
					   "code\n");
			profitable_subgraphs.truncate (0);
			break;
		      }
		}
	    }

	  /* Finally schedule the profitable subgraphs.  */
	  for (slp_instance instance : profitable_subgraphs)
	    {
	      if (!vectorized && dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "Basic block will be vectorized "
				 "using SLP\n");
	      vectorized = true;

	      vect_schedule_slp (bb_vinfo, instance->subgraph_entries);

	      unsigned HOST_WIDE_INT bytes;
	      if (dump_enabled_p ())
		{
		  if (GET_MODE_SIZE
			(bb_vinfo->vector_mode).is_constant (&bytes))
		    dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
				     "basic block part vectorized using %wu "
				     "byte vectors\n", bytes);
		  else
		    dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
				     "basic block part vectorized using "
				     "variable length vectors\n");
		}
	    }
	}
      else if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "***** Analysis failed with vector mode %s\n",
			 GET_MODE_NAME (bb_vinfo->vector_mode));

      if (mode_i == 0)
	autodetected_vector_mode = bb_vinfo->vector_mode;

      while (mode_i < vector_modes.length ()
	     && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "***** The result for vector mode %s would"
			     " be the same\n",
			     GET_MODE_NAME (vector_modes[mode_i]));
	  mode_i += 1;
	}

      delete bb_vinfo;

      if (mode_i < vector_modes.length ()
	  && VECTOR_MODE_P (autodetected_vector_mode)
	  && (related_vector_mode (vector_modes[mode_i],
				   GET_MODE_INNER (autodetected_vector_mode))
	      == autodetected_vector_mode)
	  && (related_vector_mode (autodetected_vector_mode,
				   GET_MODE_INNER (vector_modes[mode_i]))
	      == vector_modes[mode_i]))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "***** Skipping vector mode %s, which would"
			     " repeat the analysis for %s\n",
			     GET_MODE_NAME (vector_modes[mode_i]),
			     GET_MODE_NAME (autodetected_vector_mode));
	  mode_i += 1;
	}

      if (vectorized
	  || mode_i == vector_modes.length ()
	  || autodetected_vector_mode == VOIDmode
	  /* If vect_slp_analyze_bb_1 signaled that analysis for all
	     vector sizes will fail do not bother iterating.  */
	  || fatal)
	return vectorized;

      /* Try the next biggest vector size.  */
      next_vector_mode = vector_modes[mode_i++];
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "***** Re-trying analysis with vector mode %s\n",
			 GET_MODE_NAME (next_vector_mode));
    }
}
/* Main entry for the BB vectorizer.  Analyze and transform BBS, returns
   true if anything in the basic-block was vectorized.  */

static bool
vect_slp_bbs (const vec<basic_block> &bbs, loop_p orig_loop)
{
  vec<data_reference_p> datarefs = vNULL;
  auto_vec<int> dataref_groups;
  int insns = 0;
  int current_group = 0;

  for (unsigned i = 0; i < bbs.length (); i++)
    {
      basic_block bb = bbs[i];
      for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
	   gsi_next (&gsi))
	{
	  gimple *stmt = gsi_stmt (gsi);
	  if (is_gimple_debug (stmt))
	    continue;

	  insns++;

	  if (gimple_location (stmt) != UNKNOWN_LOCATION)
	    vect_location = stmt;

	  if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs,
					      &dataref_groups, current_group))
	    ++current_group;
	}
      /* New BBs always start a new DR group.  */
      ++current_group;
    }

  return vect_slp_region (bbs, datarefs, &dataref_groups, insns, orig_loop);
}
/* Special entry for the BB vectorizer.  Analyze and transform a single
   if-converted BB with ORIG_LOOPs body being the not if-converted
   representation.  Returns true if anything in the basic-block was
   vectorized.  */

bool
vect_slp_if_converted_bb (basic_block bb, loop_p orig_loop)
{
  auto_vec<basic_block> bbs;
  bbs.safe_push (bb);
  return vect_slp_bbs (bbs, orig_loop);
}
/* Main entry for the BB vectorizer.  Analyze and transform BB, returns
   true if anything in the basic-block was vectorized.  */

bool
vect_slp_function (function *fun)
{
  bool r = false;
  int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (fun));
  unsigned n = pre_and_rev_post_order_compute_fn (fun, NULL, rpo, false);

  /* For the moment split the function into pieces to avoid making
     the iteration on the vector mode moot.  Split at points we know
     to not handle well which is CFG merges (SLP discovery doesn't
     handle non-loop-header PHIs) and loop exits.  Since pattern
     recog requires reverse iteration to visit uses before defs
     simply chop RPO into pieces.  */
  auto_vec<basic_block> bbs;
  for (unsigned i = 0; i < n; i++)
    {
      basic_block bb = BASIC_BLOCK_FOR_FN (fun, rpo[i]);
      bool split = false;

      /* Split when a BB is not dominated by the first block.  */
      if (!bbs.is_empty ()
	  && !dominated_by_p (CDI_DOMINATORS, bb, bbs[0]))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "splitting region at dominance boundary bb%d\n",
			     bb->index);
	  split = true;
	}
      /* Split when the loop determined by the first block
	 is exited.  This is because we eventually insert
	 invariants at region begin.  */
      else if (!bbs.is_empty ()
	       && bbs[0]->loop_father != bb->loop_father
	       && !flow_loop_nested_p (bbs[0]->loop_father, bb->loop_father))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "splitting region at loop %d exit at bb%d\n",
			     bbs[0]->loop_father->num, bb->index);
	  split = true;
	}

      if (split && !bbs.is_empty ())
	{
	  r |= vect_slp_bbs (bbs, NULL);
	  bbs.truncate (0);
	}
      bbs.quick_push (bb);

      /* When we have a stmt ending this block and defining a
	 value we have to insert on edges when inserting after it for
	 a vector containing its definition.  Avoid this for now.  */
      if (gimple *last = last_stmt (bb))
	if (gimple_get_lhs (last)
	    && is_ctrl_altering_stmt (last))
	  {
	    if (dump_enabled_p ())
	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			       "splitting region at control altering "
			       "definition %G", last);
	    r |= vect_slp_bbs (bbs, NULL);
	    bbs.truncate (0);
	  }
    }

  if (!bbs.is_empty ())
    r |= vect_slp_bbs (bbs, NULL);

  free (rpo);

  return r;
}
/* Build a variable-length vector in which the elements in ELTS are repeated
   to fill NRESULTS vectors of type VECTOR_TYPE.  Store the vectors in
   RESULTS and add any new instructions to SEQ.

   The approach we use is:

   (1) Find a vector mode VM with integer elements of mode IM.

   (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
       ELTS' has mode IM.  This involves creating NELTS' VIEW_CONVERT_EXPRs
       from small vectors to IM.

   (3) Duplicate each ELTS'[I] into a vector of mode VM.

   (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
       correct byte contents.

   (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.

   We try to find the largest IM for which this sequence works, in order
   to cut down on the number of interleaves.  */
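/* A minimal worked example (illustrative only, assuming a target for which
   can_duplicate_and_interleave_p picks a 32-bit integer mode IM for 16-bit
   elements): for ELTS = {a, b, c, d} we get NVECTORS = 2 and the partial
   vectors {a,b} and {c,d} are view-converted to 32-bit values AB and CD
   (step 2).  Step 3 builds the duplicated vectors {AB,AB,...} and
   {CD,CD,...}, step 4 interleaves them into {AB,CD,AB,CD,...}, and step 5
   view-converts the result back to VECTOR_TYPE, i.e. {a,b,c,d,a,b,c,d,...}.  */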
void
duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
			  const vec<tree> &elts, unsigned int nresults,
			  vec<tree> &results)
{
  unsigned int nelts = elts.length ();
  tree element_type = TREE_TYPE (vector_type);

  /* (1) Find a vector mode VM with integer elements of mode IM.  */
  unsigned int nvectors = 1;
  tree new_vector_type;
  tree permutes[2];
  if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
				       &nvectors, &new_vector_type,
				       permutes))
    gcc_unreachable ();

  /* Get a vector type that holds ELTS[0:NELTS/NELTS'].  */
  unsigned int partial_nelts = nelts / nvectors;
  tree partial_vector_type = build_vector_type (element_type, partial_nelts);

  tree_vector_builder partial_elts;
  auto_vec<tree, 32> pieces (nvectors * 2);
  pieces.quick_grow_cleared (nvectors * 2);
  for (unsigned int i = 0; i < nvectors; ++i)
    {
      /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
	     ELTS' has mode IM.  */
      partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
      for (unsigned int j = 0; j < partial_nelts; ++j)
	partial_elts.quick_push (elts[i * partial_nelts + j]);
      tree t = gimple_build_vector (seq, &partial_elts);
      t = gimple_build (seq, VIEW_CONVERT_EXPR,
			TREE_TYPE (new_vector_type), t);

      /* (3) Duplicate each ELTS'[I] into a vector of mode VM.  */
      pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
    }

  /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
	 correct byte contents.

     Conceptually, we need to repeat the following operation log2(nvectors)
     times, where hi_start = nvectors / 2:

	out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
	out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);

     However, if each input repeats every N elements and the VF is
     a multiple of N * 2, the HI result is the same as the LO result.
     This will be true for the first N1 iterations of the outer loop,
     followed by N2 iterations for which both the LO and HI results
     are needed, i.e.:

	N1 + N2 = log2(nvectors)

     Each "N1 iteration" doubles the number of redundant vectors and the
     effect of the process as a whole is to have a sequence of nvectors/2**N1
     vectors that repeats 2**N1 times.  Rather than generate these redundant
     vectors, we halve the number of vectors for each N1 iteration.  */
  unsigned int in_start = 0;
  unsigned int out_start = nvectors;
  unsigned int new_nvectors = nvectors;
  for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
    {
      unsigned int hi_start = new_nvectors / 2;
      unsigned int out_i = 0;
      for (unsigned int in_i = 0; in_i < new_nvectors; ++in_i)
	{
	  if ((in_i & 1) != 0
	      && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
			     2 * in_repeat))
	    continue;

	  tree output = make_ssa_name (new_vector_type);
	  tree input1 = pieces[in_start + (in_i / 2)];
	  tree input2 = pieces[in_start + (in_i / 2) + hi_start];
	  gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
					       input1, input2,
					       permutes[in_i & 1]);
	  gimple_seq_add_stmt (seq, stmt);
	  pieces[out_start + out_i] = output;
	  out_i += 1;
	}
      std::swap (in_start, out_start);
      new_nvectors = out_i;
    }

  /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type.  */
  results.reserve (nresults);
  for (unsigned int i = 0; i < nresults; ++i)
    if (i < new_nvectors)
      results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
					pieces[in_start + i]));
    else
      results.quick_push (results[i - new_nvectors]);
}
6322 /* For constant and loop invariant defs in OP_NODE this function creates
6323 vector defs that will be used in the vectorized stmts and stores them
6324 to SLP_TREE_VEC_DEFS of OP_NODE. */
6327 vect_create_constant_vectors (vec_info
*vinfo
, slp_tree op_node
)
6329 unsigned HOST_WIDE_INT nunits
;
6331 unsigned j
, number_of_places_left_in_vector
;
6334 int group_size
= op_node
->ops
.length ();
6335 unsigned int vec_num
, i
;
6336 unsigned number_of_copies
= 1;
6338 gimple_seq ctor_seq
= NULL
;
6339 auto_vec
<tree
, 16> permute_results
;
6341 /* We always want SLP_TREE_VECTYPE (op_node) here correctly set. */
6342 vector_type
= SLP_TREE_VECTYPE (op_node
);
6344 unsigned int number_of_vectors
= SLP_TREE_NUMBER_OF_VEC_STMTS (op_node
);
6345 SLP_TREE_VEC_DEFS (op_node
).create (number_of_vectors
);
6346 auto_vec
<tree
> voprnds (number_of_vectors
);
6348 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
6349 created vectors. It is greater than 1 if unrolling is performed.
6351 For example, we have two scalar operands, s1 and s2 (e.g., group of
6352 strided accesses of size two), while NUNITS is four (i.e., four scalars
6353 of this type can be packed in a vector). The output vector will contain
6354 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
6357 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
6358 containing the operands.
6360 For example, NUNITS is four as before, and the group size is 8
6361 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
6362 {s5, s6, s7, s8}. */
6364 /* When using duplicate_and_interleave, we just need one element for
6365 each scalar statement. */
6366 if (!TYPE_VECTOR_SUBPARTS (vector_type
).is_constant (&nunits
))
6367 nunits
= group_size
;
6369 number_of_copies
= nunits
* number_of_vectors
/ group_size
;
6371 number_of_places_left_in_vector
= nunits
;
6373 tree_vector_builder
elts (vector_type
, nunits
, 1);
6374 elts
.quick_grow (nunits
);
6375 stmt_vec_info insert_after
= NULL
;
6376 for (j
= 0; j
< number_of_copies
; j
++)
6379 for (i
= group_size
- 1; op_node
->ops
.iterate (i
, &op
); i
--)
6381 /* Create 'vect_ = {op0,op1,...,opn}'. */
6382 number_of_places_left_in_vector
--;
6384 if (!types_compatible_p (TREE_TYPE (vector_type
), TREE_TYPE (op
)))
6386 if (CONSTANT_CLASS_P (op
))
6388 if (VECTOR_BOOLEAN_TYPE_P (vector_type
))
6390 /* Can't use VIEW_CONVERT_EXPR for booleans because
6391 of possibly different sizes of scalar value and
6393 if (integer_zerop (op
))
6394 op
= build_int_cst (TREE_TYPE (vector_type
), 0);
6395 else if (integer_onep (op
))
6396 op
= build_all_ones_cst (TREE_TYPE (vector_type
));
6401 op
= fold_unary (VIEW_CONVERT_EXPR
,
6402 TREE_TYPE (vector_type
), op
);
6403 gcc_assert (op
&& CONSTANT_CLASS_P (op
));
6407 tree new_temp
= make_ssa_name (TREE_TYPE (vector_type
));
6409 if (VECTOR_BOOLEAN_TYPE_P (vector_type
))
6412 = build_all_ones_cst (TREE_TYPE (vector_type
));
6414 = build_zero_cst (TREE_TYPE (vector_type
));
6415 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op
)));
6416 init_stmt
= gimple_build_assign (new_temp
, COND_EXPR
,
6422 op
= build1 (VIEW_CONVERT_EXPR
, TREE_TYPE (vector_type
),
6425 = gimple_build_assign (new_temp
, VIEW_CONVERT_EXPR
,
6428 gimple_seq_add_stmt (&ctor_seq
, init_stmt
);
6432 elts
[number_of_places_left_in_vector
] = op
;
6433 if (!CONSTANT_CLASS_P (op
))
6435 /* For BB vectorization we have to compute an insert location
6436 when a def is inside the analyzed region since we cannot
6437 simply insert at the BB start in this case. */
6438 stmt_vec_info opdef
;
6439 if (TREE_CODE (orig_op
) == SSA_NAME
6440 && !SSA_NAME_IS_DEFAULT_DEF (orig_op
)
6441 && is_a
<bb_vec_info
> (vinfo
)
6442 && (opdef
= vinfo
->lookup_def (orig_op
)))
6445 insert_after
= opdef
;
6447 insert_after
= get_later_stmt (insert_after
, opdef
);
6450 if (number_of_places_left_in_vector
== 0)
6453 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type
), nunits
)
6454 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type
), nunits
))
6455 vec_cst
= gimple_build_vector (&ctor_seq
, &elts
);
6458 if (permute_results
.is_empty ())
6459 duplicate_and_interleave (vinfo
, &ctor_seq
, vector_type
,
6460 elts
, number_of_vectors
,
6462 vec_cst
= permute_results
[number_of_vectors
- j
- 1];
6464 if (!gimple_seq_empty_p (ctor_seq
))
6468 gimple_stmt_iterator gsi
;
6469 if (gimple_code (insert_after
->stmt
) == GIMPLE_PHI
)
6471 gsi
= gsi_after_labels (gimple_bb (insert_after
->stmt
));
6472 gsi_insert_seq_before (&gsi
, ctor_seq
,
6473 GSI_CONTINUE_LINKING
);
6475 else if (!stmt_ends_bb_p (insert_after
->stmt
))
6477 gsi
= gsi_for_stmt (insert_after
->stmt
);
6478 gsi_insert_seq_after (&gsi
, ctor_seq
,
6479 GSI_CONTINUE_LINKING
);
6483 /* When we want to insert after a def where the
6484 defining stmt throws then insert on the fallthru
6486 edge e
= find_fallthru_edge
6487 (gimple_bb (insert_after
->stmt
)->succs
);
6489 = gsi_insert_seq_on_edge_immediate (e
, ctor_seq
);
6490 gcc_assert (!new_bb
);
6494 vinfo
->insert_seq_on_entry (NULL
, ctor_seq
);
6497 voprnds
.quick_push (vec_cst
);
6498 insert_after
= NULL
;
6499 number_of_places_left_in_vector
= nunits
;
6501 elts
.new_vector (vector_type
, nunits
, 1);
6502 elts
.quick_grow (nunits
);
6507 /* Since the vectors are created in the reverse order, we should invert
6509 vec_num
= voprnds
.length ();
6510 for (j
= vec_num
; j
!= 0; j
--)
6512 vop
= voprnds
[j
- 1];
6513 SLP_TREE_VEC_DEFS (op_node
).quick_push (vop
);
6516 /* In case that VF is greater than the unrolling factor needed for the SLP
6517 group of stmts, NUMBER_OF_VECTORS to be created is greater than
6518 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
6519 to replicate the vectors. */
6520 while (number_of_vectors
> SLP_TREE_VEC_DEFS (op_node
).length ())
6521 for (i
= 0; SLP_TREE_VEC_DEFS (op_node
).iterate (i
, &vop
) && i
< vec_num
;
6523 SLP_TREE_VEC_DEFS (op_node
).quick_push (vop
);
/* Get the Ith vectorized definition from SLP_NODE.  */
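/* Note added for clarity: internal SLP nodes record their vectorized
   statements in SLP_TREE_VEC_STMTS, while constants, externals and
   pre-existing vector defs are recorded as trees in SLP_TREE_VEC_DEFS,
   which is why both representations are handled below.  */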
tree
vect_get_slp_vect_def (slp_tree slp_node, unsigned i)
{
  if (SLP_TREE_VEC_STMTS (slp_node).exists ())
    return gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[i]);
  else
    return SLP_TREE_VEC_DEFS (slp_node)[i];
}
/* Get the vectorized definitions of SLP_NODE in *VEC_DEFS.  */

void
vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs)
{
  vec_defs->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
  if (SLP_TREE_DEF_TYPE (slp_node) == vect_internal_def)
    {
      unsigned j;
      gimple *vec_def_stmt;
      FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), j, vec_def_stmt)
	vec_defs->quick_push (gimple_get_lhs (vec_def_stmt));
    }
  else
    vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node));
}
/* Get N vectorized definitions for SLP_NODE.  */

void
vect_get_slp_defs (vec_info *,
		   slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
{
  if (n == -1U)
    n = SLP_TREE_CHILDREN (slp_node).length ();

  for (unsigned i = 0; i < n; ++i)
    {
      slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
      vec<tree> vec_defs = vNULL;
      vect_get_slp_defs (child, &vec_defs);
      vec_oprnds->quick_push (vec_defs);
    }
}
6572 /* Generate vector permute statements from a list of loads in DR_CHAIN.
6573 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
6574 permute statements for the SLP node NODE. Store the number of vector
6575 permute instructions in *N_PERMS and the number of vector load
6576 instructions in *N_LOADS. If DCE_CHAIN is true, remove all definitions
6577 that were not needed. */
6580 vect_transform_slp_perm_load (vec_info
*vinfo
,
6581 slp_tree node
, const vec
<tree
> &dr_chain
,
6582 gimple_stmt_iterator
*gsi
, poly_uint64 vf
,
6583 bool analyze_only
, unsigned *n_perms
,
6584 unsigned int *n_loads
, bool dce_chain
)
6586 stmt_vec_info stmt_info
= SLP_TREE_SCALAR_STMTS (node
)[0];
6588 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
6589 unsigned int group_size
= SLP_TREE_SCALAR_STMTS (node
).length ();
6590 unsigned int mask_element
;
6593 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info
))
6596 stmt_info
= DR_GROUP_FIRST_ELEMENT (stmt_info
);
6598 mode
= TYPE_MODE (vectype
);
6599 poly_uint64 nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
6601 /* Initialize the vect stmts of NODE to properly insert the generated
6604 for (unsigned i
= SLP_TREE_VEC_STMTS (node
).length ();
6605 i
< SLP_TREE_NUMBER_OF_VEC_STMTS (node
); i
++)
6606 SLP_TREE_VEC_STMTS (node
).quick_push (NULL
);
6608 /* Generate permutation masks for every NODE. Number of masks for each NODE
6609 is equal to GROUP_SIZE.
6610 E.g., we have a group of three nodes with three loads from the same
6611 location in each node, and the vector size is 4. I.e., we have a
6612 a0b0c0a1b1c1... sequence and we need to create the following vectors:
6613 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
6614 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
6617 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
6618 The last mask is illegal since we assume two operands for permute
6619 operation, and the mask element values can't be outside that range.
6620 Hence, the last mask must be converted into {2,5,5,5}.
6621 For the first two permutations we need the first and the second input
6622 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
6623 we need the second and the third vectors: {b1,c1,a2,b2} and
6626 int vect_stmts_counter
= 0;
6627 unsigned int index
= 0;
6628 int first_vec_index
= -1;
6629 int second_vec_index
= -1;
6633 vec_perm_builder mask
;
6634 unsigned int nelts_to_build
;
6635 unsigned int nvectors_per_build
;
6636 unsigned int in_nlanes
;
6637 bool repeating_p
= (group_size
== DR_GROUP_SIZE (stmt_info
)
6638 && multiple_p (nunits
, group_size
));
6641 /* A single vector contains a whole number of copies of the node, so:
6642 (a) all permutes can use the same mask; and
6643 (b) the permutes only need a single vector input. */
6644 mask
.new_vector (nunits
, group_size
, 3);
6645 nelts_to_build
= mask
.encoded_nelts ();
6646 nvectors_per_build
= SLP_TREE_VEC_STMTS (node
).length ();
6647 in_nlanes
= DR_GROUP_SIZE (stmt_info
) * 3;
6651 /* We need to construct a separate mask for each vector statement. */
6652 unsigned HOST_WIDE_INT const_nunits
, const_vf
;
6653 if (!nunits
.is_constant (&const_nunits
)
6654 || !vf
.is_constant (&const_vf
))
6656 mask
.new_vector (const_nunits
, const_nunits
, 1);
6657 nelts_to_build
= const_vf
* group_size
;
6658 nvectors_per_build
= 1;
6659 in_nlanes
= const_vf
* DR_GROUP_SIZE (stmt_info
);
6661 auto_sbitmap
used_in_lanes (in_nlanes
);
6662 bitmap_clear (used_in_lanes
);
6663 auto_bitmap used_defs
;
6665 unsigned int count
= mask
.encoded_nelts ();
6666 mask
.quick_grow (count
);
6667 vec_perm_indices indices
;
6669 for (unsigned int j
= 0; j
< nelts_to_build
; j
++)
6671 unsigned int iter_num
= j
/ group_size
;
6672 unsigned int stmt_num
= j
% group_size
;
6673 unsigned int i
= (iter_num
* DR_GROUP_SIZE (stmt_info
)
6674 + SLP_TREE_LOAD_PERMUTATION (node
)[stmt_num
]);
6675 bitmap_set_bit (used_in_lanes
, i
);
6678 first_vec_index
= 0;
6683 /* Enforced before the loop when !repeating_p. */
6684 unsigned int const_nunits
= nunits
.to_constant ();
6685 vec_index
= i
/ const_nunits
;
6686 mask_element
= i
% const_nunits
;
6687 if (vec_index
== first_vec_index
6688 || first_vec_index
== -1)
6690 first_vec_index
= vec_index
;
6692 else if (vec_index
== second_vec_index
6693 || second_vec_index
== -1)
6695 second_vec_index
= vec_index
;
6696 mask_element
+= const_nunits
;
6700 if (dump_enabled_p ())
6701 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6702 "permutation requires at "
6703 "least three vectors %G",
6705 gcc_assert (analyze_only
);
6709 gcc_assert (mask_element
< 2 * const_nunits
);
6712 if (mask_element
!= index
)
6714 mask
[index
++] = mask_element
;
6716 if (index
== count
&& !noop_p
)
6718 indices
.new_vector (mask
, second_vec_index
== -1 ? 1 : 2, nunits
);
6719 if (!can_vec_perm_const_p (mode
, indices
))
6721 if (dump_enabled_p ())
6723 dump_printf_loc (MSG_MISSED_OPTIMIZATION
,
6725 "unsupported vect permute { ");
6726 for (i
= 0; i
< count
; ++i
)
6728 dump_dec (MSG_MISSED_OPTIMIZATION
, mask
[i
]);
6729 dump_printf (MSG_MISSED_OPTIMIZATION
, " ");
6731 dump_printf (MSG_MISSED_OPTIMIZATION
, "}\n");
6733 gcc_assert (analyze_only
);
6744 tree mask_vec
= NULL_TREE
;
6747 mask_vec
= vect_gen_perm_mask_checked (vectype
, indices
);
6749 if (second_vec_index
== -1)
6750 second_vec_index
= first_vec_index
;
6752 for (unsigned int ri
= 0; ri
< nvectors_per_build
; ++ri
)
6754 /* Generate the permute statement if necessary. */
6755 tree first_vec
= dr_chain
[first_vec_index
+ ri
];
6756 tree second_vec
= dr_chain
[second_vec_index
+ ri
];
6760 gassign
*stmt
= as_a
<gassign
*> (stmt_info
->stmt
);
6762 = vect_create_destination_var (gimple_assign_lhs (stmt
),
6764 perm_dest
= make_ssa_name (perm_dest
);
6766 = gimple_build_assign (perm_dest
, VEC_PERM_EXPR
,
6767 first_vec
, second_vec
,
6769 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
,
6773 bitmap_set_bit (used_defs
, first_vec_index
+ ri
);
6774 bitmap_set_bit (used_defs
, second_vec_index
+ ri
);
6779 /* If mask was NULL_TREE generate the requested
6780 identity transform. */
6781 perm_stmt
= SSA_NAME_DEF_STMT (first_vec
);
6783 bitmap_set_bit (used_defs
, first_vec_index
+ ri
);
6786 /* Store the vector statement in NODE. */
6787 SLP_TREE_VEC_STMTS (node
)[vect_stmts_counter
++] = perm_stmt
;
6792 first_vec_index
= -1;
6793 second_vec_index
= -1;
6801 *n_loads
= SLP_TREE_NUMBER_OF_VEC_STMTS (node
);
6804 /* Enforced above when !repeating_p. */
6805 unsigned int const_nunits
= nunits
.to_constant ();
6807 bool load_seen
= false;
6808 for (unsigned i
= 0; i
< in_nlanes
; ++i
)
6810 if (i
% const_nunits
== 0)
6816 if (bitmap_bit_p (used_in_lanes
, i
))
6825 for (unsigned i
= 0; i
< dr_chain
.length (); ++i
)
6826 if (!bitmap_bit_p (used_defs
, i
))
6828 gimple
*stmt
= SSA_NAME_DEF_STMT (dr_chain
[i
]);
6829 gimple_stmt_iterator rgsi
= gsi_for_stmt (stmt
);
6830 gsi_remove (&rgsi
, true);
6831 release_defs (stmt
);
/* Produce the next vector result for SLP permutation NODE by adding a vector
   statement at GSI.  If MASK_VEC is nonnull, add:

      <new SSA name> = VEC_PERM_EXPR <FIRST_DEF, SECOND_DEF, MASK_VEC>

   otherwise add:

      <new SSA name> = FIRST_DEF.  */

static void
vect_add_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
			  slp_tree node, tree first_def, tree second_def,
			  tree mask_vec)
{
  tree vectype = SLP_TREE_VECTYPE (node);

  /* ???  We SLP match existing vector element extracts but
     allow punning which we need to re-instantiate at uses
     but have no good way of explicitly representing.  */
  if (!types_compatible_p (TREE_TYPE (first_def), vectype))
    {
      gassign *conv_stmt
	= gimple_build_assign (make_ssa_name (vectype),
			       build1 (VIEW_CONVERT_EXPR, vectype, first_def));
      vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
      first_def = gimple_assign_lhs (conv_stmt);
    }
  gassign *perm_stmt;
  tree perm_dest = make_ssa_name (vectype);
  if (mask_vec)
    {
      if (!types_compatible_p (TREE_TYPE (second_def), vectype))
	{
	  gassign *conv_stmt
	    = gimple_build_assign (make_ssa_name (vectype),
				   build1 (VIEW_CONVERT_EXPR,
					   vectype, second_def));
	  vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
	  second_def = gimple_assign_lhs (conv_stmt);
	}
      perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
				       first_def, second_def,
				       mask_vec);
    }
  else
    /* We need a copy here in case the def was external.  */
    perm_stmt = gimple_build_assign (perm_dest, first_def);
  vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
  /* Store the vector statement in NODE.  */
  SLP_TREE_VEC_STMTS (node).quick_push (perm_stmt);
}
6889 /* Vectorize the SLP permutations in NODE as specified
6890 in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
6891 child number and lane number.
6892 Interleaving of two two-lane two-child SLP subtrees (not supported):
6893 [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
6894 A blend of two four-lane two-child SLP subtrees:
6895 [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
6896 Highpart of a four-lane one-child SLP subtree (not supported):
6897 [ { 0, 2 }, { 0, 3 } ]
6898 Where currently only a subset is supported by code generating below. */
6901 vectorizable_slp_permutation (vec_info
*vinfo
, gimple_stmt_iterator
*gsi
,
6902 slp_tree node
, stmt_vector_for_cost
*cost_vec
)
6904 tree vectype
= SLP_TREE_VECTYPE (node
);
6906 /* ??? We currently only support all same vector input and output types
6907 while the SLP IL should really do a concat + select and thus accept
6908 arbitrary mismatches. */
6911 poly_uint64 nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
6912 bool repeating_p
= multiple_p (nunits
, SLP_TREE_LANES (node
));
6913 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
6915 if (!vect_maybe_update_slp_op_vectype (child
, vectype
)
6916 || !types_compatible_p (SLP_TREE_VECTYPE (child
), vectype
))
6918 if (dump_enabled_p ())
6919 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6920 "Unsupported lane permutation\n");
6923 if (SLP_TREE_LANES (child
) != SLP_TREE_LANES (node
))
6924 repeating_p
= false;
6927 vec
<std::pair
<unsigned, unsigned> > &perm
= SLP_TREE_LANE_PERMUTATION (node
);
6928 gcc_assert (perm
.length () == SLP_TREE_LANES (node
));
6929 if (dump_enabled_p ())
6931 dump_printf_loc (MSG_NOTE
, vect_location
,
6932 "vectorizing permutation");
6933 for (unsigned i
= 0; i
< perm
.length (); ++i
)
6934 dump_printf (MSG_NOTE
, " op%u[%u]", perm
[i
].first
, perm
[i
].second
);
6936 dump_printf (MSG_NOTE
, " (repeat %d)\n", SLP_TREE_LANES (node
));
6937 dump_printf (MSG_NOTE
, "\n");
6940 /* REPEATING_P is true if every output vector is guaranteed to use the
6941 same permute vector. We can handle that case for both variable-length
6942 and constant-length vectors, but we only handle other cases for
6943 constant-length vectors.
6947 - NPATTERNS and NELTS_PER_PATTERN to the encoding of the permute
6948 mask vector that we want to build.
6950 - NCOPIES to the number of copies of PERM that we need in order
6951 to build the necessary permute mask vectors.
6953 - NOUTPUTS_PER_MASK to the number of output vectors we want to create
6954 for each permute mask vector. This is only relevant when GSI is
6957 unsigned nelts_per_pattern
;
6959 unsigned noutputs_per_mask
;
6962 /* We need a single permute mask vector that has the form:
6964 { X1, ..., Xn, X1 + n, ..., Xn + n, X1 + 2n, ..., Xn + 2n, ... }
6966 In other words, the original n-element permute in PERM is
6967 "unrolled" to fill a full vector. The stepped vector encoding
6968 that we use for permutes requires 3n elements. */
6969 npatterns
= SLP_TREE_LANES (node
);
6970 nelts_per_pattern
= ncopies
= 3;
6971 noutputs_per_mask
= SLP_TREE_NUMBER_OF_VEC_STMTS (node
);
6975 /* Calculate every element of every permute mask vector explicitly,
6976 instead of relying on the pattern described above. */
6977 if (!nunits
.is_constant (&npatterns
))
6979 nelts_per_pattern
= ncopies
= 1;
6980 if (loop_vec_info linfo
= dyn_cast
<loop_vec_info
> (vinfo
))
6981 if (!LOOP_VINFO_VECT_FACTOR (linfo
).is_constant (&ncopies
))
6983 noutputs_per_mask
= 1;
6985 unsigned olanes
= ncopies
* SLP_TREE_LANES (node
);
6986 gcc_assert (repeating_p
|| multiple_p (olanes
, nunits
));
6988 /* Compute the { { SLP operand, vector index}, lane } permutation sequence
6989 from the { SLP operand, scalar lane } permutation as recorded in the
6990 SLP node as intermediate step. This part should already work
6991 with SLP children with arbitrary number of lanes. */
6992 auto_vec
<std::pair
<std::pair
<unsigned, unsigned>, unsigned> > vperm
;
6993 auto_vec
<unsigned> active_lane
;
6994 vperm
.create (olanes
);
6995 active_lane
.safe_grow_cleared (SLP_TREE_CHILDREN (node
).length (), true);
6996 for (unsigned i
= 0; i
< ncopies
; ++i
)
6998 for (unsigned pi
= 0; pi
< perm
.length (); ++pi
)
7000 std::pair
<unsigned, unsigned> p
= perm
[pi
];
7001 tree vtype
= SLP_TREE_VECTYPE (SLP_TREE_CHILDREN (node
)[p
.first
]);
7003 vperm
.quick_push ({{p
.first
, 0}, p
.second
+ active_lane
[p
.first
]});
7006 /* We checked above that the vectors are constant-length. */
7007 unsigned vnunits
= TYPE_VECTOR_SUBPARTS (vtype
).to_constant ();
7008 unsigned vi
= (active_lane
[p
.first
] + p
.second
) / vnunits
;
7009 unsigned vl
= (active_lane
[p
.first
] + p
.second
) % vnunits
;
7010 vperm
.quick_push ({{p
.first
, vi
}, vl
});
7013 /* Advance to the next group. */
7014 for (unsigned j
= 0; j
< SLP_TREE_CHILDREN (node
).length (); ++j
)
7015 active_lane
[j
] += SLP_TREE_LANES (SLP_TREE_CHILDREN (node
)[j
]);
7018 if (dump_enabled_p ())
7020 dump_printf_loc (MSG_NOTE
, vect_location
, "as");
7021 for (unsigned i
= 0; i
< vperm
.length (); ++i
)
7025 ? multiple_p (i
, npatterns
)
7026 : multiple_p (i
, TYPE_VECTOR_SUBPARTS (vectype
))))
7027 dump_printf (MSG_NOTE
, ",");
7028 dump_printf (MSG_NOTE
, " vops%u[%u][%u]",
7029 vperm
[i
].first
.first
, vperm
[i
].first
.second
,
7032 dump_printf (MSG_NOTE
, "\n");
7035 /* We can only handle two-vector permutes, everything else should
7036 be lowered on the SLP level. The following is closely inspired
7037 by vect_transform_slp_perm_load and is supposed to eventually
7039 ??? As intermediate step do code-gen in the SLP tree representation
7041 std::pair
<unsigned, unsigned> first_vec
= std::make_pair (-1U, -1U);
7042 std::pair
<unsigned, unsigned> second_vec
= std::make_pair (-1U, -1U);
7043 unsigned int index
= 0;
7044 poly_uint64 mask_element
;
7045 vec_perm_builder mask
;
7046 mask
.new_vector (nunits
, npatterns
, nelts_per_pattern
);
7047 unsigned int count
= mask
.encoded_nelts ();
7048 mask
.quick_grow (count
);
7049 vec_perm_indices indices
;
7050 unsigned nperms
= 0;
7051 for (unsigned i
= 0; i
< vperm
.length (); ++i
)
7053 mask_element
= vperm
[i
].second
;
7054 if (first_vec
.first
== -1U
7055 || first_vec
== vperm
[i
].first
)
7056 first_vec
= vperm
[i
].first
;
7057 else if (second_vec
.first
== -1U
7058 || second_vec
== vperm
[i
].first
)
7060 second_vec
= vperm
[i
].first
;
7061 mask_element
+= nunits
;
7065 if (dump_enabled_p ())
7066 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7067 "permutation requires at "
7068 "least three vectors\n");
7073 mask
[index
++] = mask_element
;
7077 indices
.new_vector (mask
, second_vec
.first
== -1U ? 1 : 2, nunits
);
7078 bool identity_p
= indices
.series_p (0, 1, 0, 1);
7080 && !can_vec_perm_const_p (TYPE_MODE (vectype
), indices
))
7082 if (dump_enabled_p ())
7084 dump_printf_loc (MSG_MISSED_OPTIMIZATION
,
7086 "unsupported vect permute { ");
7087 for (i
= 0; i
< count
; ++i
)
7089 dump_dec (MSG_MISSED_OPTIMIZATION
, mask
[i
]);
7090 dump_printf (MSG_MISSED_OPTIMIZATION
, " ");
7092 dump_printf (MSG_MISSED_OPTIMIZATION
, "}\n");
7102 if (second_vec
.first
== -1U)
7103 second_vec
= first_vec
;
7106 first_node
= SLP_TREE_CHILDREN (node
)[first_vec
.first
],
7107 second_node
= SLP_TREE_CHILDREN (node
)[second_vec
.first
];
7109 tree mask_vec
= NULL_TREE
;
7111 mask_vec
= vect_gen_perm_mask_checked (vectype
, indices
);
7113 for (unsigned int vi
= 0; vi
< noutputs_per_mask
; ++vi
)
7116 = vect_get_slp_vect_def (first_node
,
7117 first_vec
.second
+ vi
);
7119 = vect_get_slp_vect_def (second_node
,
7120 second_vec
.second
+ vi
);
7121 vect_add_slp_permutation (vinfo
, gsi
, node
, first_def
,
7122 second_def
, mask_vec
);
7127 first_vec
= std::make_pair (-1U, -1U);
7128 second_vec
= std::make_pair (-1U, -1U);
7133 record_stmt_cost (cost_vec
, nperms
, vec_perm
, node
, vectype
, 0, vect_body
);
7138 /* Vectorize SLP NODE. */
7141 vect_schedule_slp_node (vec_info
*vinfo
,
7142 slp_tree node
, slp_instance instance
)
7144 gimple_stmt_iterator si
;
7148 /* For existing vectors there's nothing to do. */
7149 if (SLP_TREE_VEC_DEFS (node
).exists ())
7152 gcc_assert (SLP_TREE_VEC_STMTS (node
).is_empty ());
7154 /* Vectorize externals and constants. */
7155 if (SLP_TREE_DEF_TYPE (node
) == vect_constant_def
7156 || SLP_TREE_DEF_TYPE (node
) == vect_external_def
)
7158 /* ??? vectorizable_shift can end up using a scalar operand which is
7159 currently denoted as !SLP_TREE_VECTYPE. No need to vectorize the
7160 node in this case. */
7161 if (!SLP_TREE_VECTYPE (node
))
7164 vect_create_constant_vectors (vinfo
, node
);
7168 stmt_vec_info stmt_info
= SLP_TREE_REPRESENTATIVE (node
);
7170 gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node
) != 0);
7171 SLP_TREE_VEC_STMTS (node
).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node
));
7173 if (dump_enabled_p ())
7174 dump_printf_loc (MSG_NOTE
, vect_location
,
7175 "------>vectorizing SLP node starting from: %G",
7178 if (STMT_VINFO_DATA_REF (stmt_info
)
7179 && SLP_TREE_CODE (node
) != VEC_PERM_EXPR
)
7181 /* Vectorized loads go before the first scalar load to make it
7182 ready early, vectorized stores go before the last scalar
7183 stmt which is where all uses are ready. */
7184 stmt_vec_info last_stmt_info
= NULL
;
7185 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info
)))
7186 last_stmt_info
= vect_find_first_scalar_stmt_in_slp (node
);
7187 else /* DR_IS_WRITE */
7188 last_stmt_info
= vect_find_last_scalar_stmt_in_slp (node
);
7189 si
= gsi_for_stmt (last_stmt_info
->stmt
);
7191 else if ((STMT_VINFO_TYPE (stmt_info
) == cycle_phi_info_type
7192 || STMT_VINFO_TYPE (stmt_info
) == induc_vec_info_type
7193 || STMT_VINFO_TYPE (stmt_info
) == phi_info_type
)
7194 && SLP_TREE_CODE (node
) != VEC_PERM_EXPR
)
7196 /* For PHI node vectorization we do not use the insertion iterator. */
7201 /* Emit other stmts after the children vectorized defs which is
7202 earliest possible. */
7203 gimple
*last_stmt
= NULL
;
7204 bool seen_vector_def
= false;
7205 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
7206 if (SLP_TREE_DEF_TYPE (child
) == vect_internal_def
)
7208 /* For fold-left reductions we are retaining the scalar
7209 reduction PHI but we still have SLP_TREE_NUM_VEC_STMTS
7210 set so the representation isn't perfect. Resort to the
7211 last scalar def here. */
7212 if (SLP_TREE_VEC_STMTS (child
).is_empty ())
7214 gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child
))
7215 == cycle_phi_info_type
);
7216 gphi
*phi
= as_a
<gphi
*>
7217 (vect_find_last_scalar_stmt_in_slp (child
)->stmt
);
7219 || vect_stmt_dominates_stmt_p (last_stmt
, phi
))
7222 /* We are emitting all vectorized stmts in the same place and
7223 the last one is the last.
7224 ??? Unless we have a load permutation applied and that
7225 figures to re-use an earlier generated load. */
7228 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (child
), j
, vstmt
)
7230 || vect_stmt_dominates_stmt_p (last_stmt
, vstmt
))
7233 else if (!SLP_TREE_VECTYPE (child
))
7235 /* For externals we use unvectorized at all scalar defs. */
7238 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child
), j
, def
)
7239 if (TREE_CODE (def
) == SSA_NAME
7240 && !SSA_NAME_IS_DEFAULT_DEF (def
))
7242 gimple
*stmt
= SSA_NAME_DEF_STMT (def
);
7244 || vect_stmt_dominates_stmt_p (last_stmt
, stmt
))
7250 /* For externals we have to look at all defs since their
7251 insertion place is decided per vector. But beware
7252 of pre-existing vectors where we need to make sure
7253 we do not insert before the region boundary. */
7254 if (SLP_TREE_SCALAR_OPS (child
).is_empty ()
7255 && !vinfo
->lookup_def (SLP_TREE_VEC_DEFS (child
)[0]))
7256 seen_vector_def
= true;
7261 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child
), j
, vdef
)
7262 if (TREE_CODE (vdef
) == SSA_NAME
7263 && !SSA_NAME_IS_DEFAULT_DEF (vdef
))
7265 gimple
*vstmt
= SSA_NAME_DEF_STMT (vdef
);
7267 || vect_stmt_dominates_stmt_p (last_stmt
, vstmt
))
7272 /* This can happen when all children are pre-existing vectors or
7275 last_stmt
= vect_find_first_scalar_stmt_in_slp (node
)->stmt
;
7278 gcc_assert (seen_vector_def
);
7279 si
= gsi_after_labels (as_a
<bb_vec_info
> (vinfo
)->bbs
[0]);
7281 else if (is_a
<bb_vec_info
> (vinfo
)
7282 && gimple_bb (last_stmt
) != gimple_bb (stmt_info
->stmt
)
7283 && gimple_could_trap_p (stmt_info
->stmt
))
7285 /* We've constrained possibly trapping operations to all come
7286 from the same basic-block, if vectorized defs would allow earlier
7287 scheduling still force vectorized stmts to the original block.
7288 This is only necessary for BB vectorization since for loop vect
7289 all operations are in a single BB and scalar stmt based
7290 placement doesn't play well with epilogue vectorization. */
7291 gcc_assert (dominated_by_p (CDI_DOMINATORS
,
7292 gimple_bb (stmt_info
->stmt
),
7293 gimple_bb (last_stmt
)));
7294 si
= gsi_after_labels (gimple_bb (stmt_info
->stmt
));
7296 else if (is_a
<gphi
*> (last_stmt
))
7297 si
= gsi_after_labels (gimple_bb (last_stmt
));
7300 si
= gsi_for_stmt (last_stmt
);
7305 bool done_p
= false;
7307 /* Handle purely internal nodes. */
7308 if (SLP_TREE_CODE (node
) == VEC_PERM_EXPR
)
7310 /* ??? the transform kind is stored to STMT_VINFO_TYPE which might
7311 be shared with different SLP nodes (but usually it's the same
7312 operation apart from the case the stmt is only there for denoting
7313 the actual scalar lane defs ...). So do not call vect_transform_stmt
7314 but open-code it here (partly). */
7315 bool done
= vectorizable_slp_permutation (vinfo
, &si
, node
, NULL
);
7320 vect_transform_stmt (vinfo
, stmt_info
, &si
, node
, instance
);
/* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
   For loop vectorization this is done in vectorizable_call, but for SLP
   it needs to be deferred until end of vect_schedule_slp, because multiple
   SLP instances may refer to the same scalar stmt.  */

static void
vect_remove_slp_scalar_calls (vec_info *vinfo,
			      slp_tree node, hash_set<slp_tree> &visited)
{
  gimple *new_stmt;
  gimple_stmt_iterator gsi;
  int i;
  slp_tree child;
  tree lhs;
  stmt_vec_info stmt_info;

  if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return;

  if (visited.add (node))
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_remove_slp_scalar_calls (vinfo, child, visited);

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    {
      gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
      if (!stmt || gimple_bb (stmt) == NULL)
	continue;
      if (is_pattern_stmt_p (stmt_info)
	  || !PURE_SLP_STMT (stmt_info))
	continue;
      lhs = gimple_call_lhs (stmt);
      new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
      gsi = gsi_for_stmt (stmt);
      vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
      SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
    }
}

static void
vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
{
  hash_set<slp_tree> visited;
  vect_remove_slp_scalar_calls (vinfo, node, visited);
}
/* Vectorize the instance root.  */

static void
vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
{
  gassign *rstmt = NULL;

  if (instance->kind == slp_inst_kind_ctor)
    {
      if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1)
	{
	  gimple *child_stmt = SLP_TREE_VEC_STMTS (node)[0];
	  tree vect_lhs = gimple_get_lhs (child_stmt);
	  tree root_lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
	  if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
					  TREE_TYPE (vect_lhs)))
	    vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
			       vect_lhs);
	  rstmt = gimple_build_assign (root_lhs, vect_lhs);
	}
      else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1)
	{
	  int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
	  gimple *child_stmt;
	  int j;
	  vec<constructor_elt, va_gc> *v;
	  vec_alloc (v, nelts);

	  /* A CTOR can handle V16HI composition from VNx8HI so we
	     do not need to convert vector elements if the types
	     are compatible.  */
	  FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt)
	    CONSTRUCTOR_APPEND_ELT (v, NULL_TREE,
				    gimple_get_lhs (child_stmt));
	  tree lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
	  tree rtype
	    = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmts[0]->stmt));
	  tree r_constructor = build_constructor (rtype, v);
	  rstmt = gimple_build_assign (lhs, r_constructor);
	}
    }
  else if (instance->kind == slp_inst_kind_bb_reduc)
    {
      /* Largely inspired by reduction chain epilogue handling in
	 vect_create_epilog_for_reduction.  */
      vec<tree> vec_defs = vNULL;
      vect_get_slp_defs (node, &vec_defs);
      enum tree_code reduc_code
	= gimple_assign_rhs_code (instance->root_stmts[0]->stmt);
      /* ???  We actually have to reflect signs somewhere.  */
      if (reduc_code == MINUS_EXPR)
	reduc_code = PLUS_EXPR;
      gimple_seq epilogue = NULL;
      /* We may end up with more than one vector result, reduce them
	 to one vector.  */
      tree vec_def = vec_defs[0];
      for (unsigned i = 1; i < vec_defs.length (); ++i)
	vec_def = gimple_build (&epilogue, reduc_code, TREE_TYPE (vec_def),
				vec_def, vec_defs[i]);
      vec_defs.release ();
      /* ???  Support other schemes than direct internal fn.  */
      internal_fn reduc_fn;
      if (!reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
	  || reduc_fn == IFN_LAST)
	gcc_unreachable ();
      tree scalar_def = gimple_build (&epilogue, as_combined_fn (reduc_fn),
				      TREE_TYPE (TREE_TYPE (vec_def)), vec_def);

      gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
      gsi_insert_seq_before (&rgsi, epilogue, GSI_SAME_STMT);
      gimple_assign_set_rhs_from_tree (&rgsi, scalar_def);
      update_stmt (gsi_stmt (rgsi));
      return;
    }
  else
    gcc_unreachable ();

  gcc_assert (rstmt);

  gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
  gsi_replace (&rgsi, rstmt, true);
}
7461 /* Schedule the SLP INSTANCE doing a DFS walk and collecting SCCs. */
7464 vect_schedule_scc (vec_info
*vinfo
, slp_tree node
, slp_instance instance
,
7465 hash_map
<slp_tree
, slp_scc_info
> &scc_info
,
7466 int &maxdfs
, vec
<slp_tree
> &stack
)
7469 slp_scc_info
*info
= &scc_info
.get_or_insert (node
, &existed_p
);
7470 gcc_assert (!existed_p
);
7472 info
->lowlink
= maxdfs
;
7476 if (SLP_TREE_DEF_TYPE (node
) != vect_internal_def
)
7478 info
->on_stack
= false;
7479 vect_schedule_slp_node (vinfo
, node
, instance
);
7483 info
->on_stack
= true;
7484 stack
.safe_push (node
);
7489 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
7493 slp_scc_info
*child_info
= scc_info
.get (child
);
7496 vect_schedule_scc (vinfo
, child
, instance
, scc_info
, maxdfs
, stack
);
7497 /* Recursion might have re-allocated the node. */
7498 info
= scc_info
.get (node
);
7499 child_info
= scc_info
.get (child
);
7500 info
->lowlink
= MIN (info
->lowlink
, child_info
->lowlink
);
7502 else if (child_info
->on_stack
)
7503 info
->lowlink
= MIN (info
->lowlink
, child_info
->dfs
);
7505 if (info
->lowlink
!= info
->dfs
)
7508 auto_vec
<slp_tree
, 4> phis_to_fixup
;
7511 if (stack
.last () == node
)
7514 info
->on_stack
= false;
7515 vect_schedule_slp_node (vinfo
, node
, instance
);
7516 if (SLP_TREE_CODE (node
) != VEC_PERM_EXPR
7517 && is_a
<gphi
*> (SLP_TREE_REPRESENTATIVE (node
)->stmt
))
7518 phis_to_fixup
.quick_push (node
);
7523 int last_idx
= stack
.length () - 1;
7524 while (stack
[last_idx
] != node
)
7526 /* We can break the cycle at PHIs who have at least one child
7527 code generated. Then we could re-start the DFS walk until
7528 all nodes in the SCC are covered (we might have new entries
7529 for only back-reachable nodes). But it's simpler to just
7530 iterate and schedule those that are ready. */
7531 unsigned todo
= stack
.length () - last_idx
;
7534 for (int idx
= stack
.length () - 1; idx
>= last_idx
; --idx
)
7536 slp_tree entry
= stack
[idx
];
7539 bool phi
= (SLP_TREE_CODE (entry
) != VEC_PERM_EXPR
7540 && is_a
<gphi
*> (SLP_TREE_REPRESENTATIVE (entry
)->stmt
));
7542 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry
), i
, child
)
7549 else if (scc_info
.get (child
)->on_stack
)
7567 vect_schedule_slp_node (vinfo
, entry
, instance
);
7568 scc_info
.get (entry
)->on_stack
= false;
7572 phis_to_fixup
.safe_push (entry
);
7579 stack
.truncate (last_idx
);
7582 /* Now fixup the backedge def of the vectorized PHIs in this SCC. */
7584 FOR_EACH_VEC_ELT (phis_to_fixup
, i
, phi_node
)
7586 gphi
*phi
= as_a
<gphi
*> (SLP_TREE_REPRESENTATIVE (phi_node
)->stmt
);
7589 FOR_EACH_EDGE (e
, ei
, gimple_bb (phi
)->preds
)
7591 unsigned dest_idx
= e
->dest_idx
;
7592 child
= SLP_TREE_CHILDREN (phi_node
)[dest_idx
];
7593 if (!child
|| SLP_TREE_DEF_TYPE (child
) != vect_internal_def
)
7595 /* Simply fill all args. */
7596 for (unsigned i
= 0; i
< SLP_TREE_VEC_STMTS (phi_node
).length (); ++i
)
7597 add_phi_arg (as_a
<gphi
*> (SLP_TREE_VEC_STMTS (phi_node
)[i
]),
7598 vect_get_slp_vect_def (child
, i
),
7599 e
, gimple_phi_arg_location (phi
, dest_idx
));
/* Generate vector code for SLP_INSTANCES in the loop/basic block.  */

void
vect_schedule_slp (vec_info *vinfo, const vec<slp_instance> &slp_instances)
{
  slp_instance instance;
  unsigned int i;

  hash_map<slp_tree, slp_scc_info> scc_info;
  int maxdfs = 0;
  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      slp_tree node = SLP_INSTANCE_TREE (instance);
      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_NOTE, vect_location,
			   "Vectorizing SLP tree:\n");
	  if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
	    dump_printf_loc (MSG_NOTE, vect_location, "Root stmt: %G",
			     SLP_INSTANCE_ROOT_STMTS (instance)[0]->stmt);
	  vect_print_slp_graph (MSG_NOTE, vect_location,
				SLP_INSTANCE_TREE (instance));
	}
      /* Schedule the tree of INSTANCE, scheduling SCCs in a way to
	 have a PHI be the node breaking the cycle.  */
      auto_vec<slp_tree> stack;
      if (!scc_info.get (node))
	vect_schedule_scc (vinfo, node, instance, scc_info, maxdfs, stack);

      if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
	vectorize_slp_instance_root_stmt (node, instance);

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "vectorizing stmts using SLP.\n");
    }

  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      slp_tree root = SLP_INSTANCE_TREE (instance);
      stmt_vec_info store_info;
      unsigned int j;

      /* Remove scalar call stmts.  Do not do this for basic-block
	 vectorization as not all uses may be vectorized.
	 ???  Why should this be necessary?  DCE should be able to
	 remove the stmts itself.
	 ???  For BB vectorization we can as well remove scalar
	 stmts starting from the SLP tree root if they have no
	 uses.  */
      if (is_a <loop_vec_info> (vinfo))
	vect_remove_slp_scalar_calls (vinfo, root);

      /* Remove vectorized stores original scalar stmts.  */
      for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
	{
	  if (!STMT_VINFO_DATA_REF (store_info)
	      || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info)))
	    break;

	  store_info = vect_orig_stmt (store_info);
	  /* Free the attached stmt_vec_info and remove the stmt.  */
	  vinfo->remove_stmt (store_info);

	  /* Invalidate SLP_TREE_REPRESENTATIVE in case we released it
	     to not crash in vect_free_slp_tree later.  */
	  if (SLP_TREE_REPRESENTATIVE (root) == store_info)
	    SLP_TREE_REPRESENTATIVE (root) = NULL;