1 /* SLP - Basic Block Vectorization
2 Copyright (C) 2007-2020 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
24 #include "coretypes.h"
30 #include "tree-pass.h"
32 #include "optabs-tree.h"
33 #include "insn-config.h"
34 #include "recog.h" /* FIXME: for insn_data */
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "gimple-iterator.h"
39 #include "tree-vectorizer.h"
40 #include "langhooks.h"
41 #include "gimple-walk.h"
43 #include "tree-vector-builder.h"
44 #include "vec-perm-indices.h"
45 #include "gimple-fold.h"
46 #include "internal-fn.h"
47 #include "dump-context.h"
50 static bool vectorizable_slp_permutation (vec_info
*, gimple_stmt_iterator
*,
51 slp_tree
, stmt_vector_for_cost
*);
53 /* Initialize a SLP node. */
55 _slp_tree::_slp_tree ()
57 SLP_TREE_SCALAR_STMTS (this) = vNULL
;
58 SLP_TREE_SCALAR_OPS (this) = vNULL
;
59 SLP_TREE_VEC_STMTS (this) = vNULL
;
60 SLP_TREE_VEC_DEFS (this) = vNULL
;
61 SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
62 SLP_TREE_CHILDREN (this) = vNULL
;
63 SLP_TREE_LOAD_PERMUTATION (this) = vNULL
;
64 SLP_TREE_LANE_PERMUTATION (this) = vNULL
;
65 SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def
;
66 SLP_TREE_CODE (this) = ERROR_MARK
;
67 SLP_TREE_VECTYPE (this) = NULL_TREE
;
68 SLP_TREE_REPRESENTATIVE (this) = NULL
;
74 /* Tear down a SLP node. */
76 _slp_tree::~_slp_tree ()
78 SLP_TREE_CHILDREN (this).release ();
79 SLP_TREE_SCALAR_STMTS (this).release ();
80 SLP_TREE_SCALAR_OPS (this).release ();
81 SLP_TREE_VEC_STMTS (this).release ();
82 SLP_TREE_VEC_DEFS (this).release ();
83 SLP_TREE_LOAD_PERMUTATION (this).release ();
84 SLP_TREE_LANE_PERMUTATION (this).release ();
87 /* Recursively free the memory allocated for the SLP tree rooted at NODE.
88 FINAL_P is true if we have vectorized the instance or if we have
89 made a final decision not to vectorize the statements in any way. */
92 vect_free_slp_tree (slp_tree node
, bool final_p
)
97 if (--node
->refcnt
!= 0)
100 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
101 vect_free_slp_tree (child
, final_p
);
103 /* Don't update STMT_VINFO_NUM_SLP_USES if it isn't relevant.
104 Some statements might no longer exist, after having been
105 removed by vect_transform_stmt. Updating the remaining
106 statements would be redundant. */
109 stmt_vec_info stmt_info
;
110 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
112 gcc_assert (STMT_VINFO_NUM_SLP_USES (stmt_info
) > 0);
113 STMT_VINFO_NUM_SLP_USES (stmt_info
)--;
120 /* Free the memory allocated for the SLP instance. FINAL_P is true if we
121 have vectorized the instance or if we have made a final decision not
122 to vectorize the statements in any way. */
125 vect_free_slp_instance (slp_instance instance
, bool final_p
)
127 vect_free_slp_tree (SLP_INSTANCE_TREE (instance
), final_p
);
128 SLP_INSTANCE_LOADS (instance
).release ();
129 instance
->subgraph_entries
.release ();
130 instance
->cost_vec
.release ();
135 /* Create an SLP node for SCALAR_STMTS. */
138 vect_create_new_slp_node (vec
<stmt_vec_info
> scalar_stmts
, unsigned nops
)
140 slp_tree node
= new _slp_tree
;
141 SLP_TREE_SCALAR_STMTS (node
) = scalar_stmts
;
142 SLP_TREE_CHILDREN (node
).create (nops
);
143 SLP_TREE_DEF_TYPE (node
) = vect_internal_def
;
144 SLP_TREE_REPRESENTATIVE (node
) = scalar_stmts
[0];
145 SLP_TREE_LANES (node
) = scalar_stmts
.length ();
148 stmt_vec_info stmt_info
;
149 FOR_EACH_VEC_ELT (scalar_stmts
, i
, stmt_info
)
150 STMT_VINFO_NUM_SLP_USES (stmt_info
)++;
155 /* Create an SLP node for OPS. */
158 vect_create_new_slp_node (vec
<tree
> ops
)
160 slp_tree node
= new _slp_tree
;
161 SLP_TREE_SCALAR_OPS (node
) = ops
;
162 SLP_TREE_DEF_TYPE (node
) = vect_external_def
;
163 SLP_TREE_LANES (node
) = ops
.length ();
168 /* This structure is used in creation of an SLP tree. Each instance
169 corresponds to the same operand in a group of scalar stmts in an SLP
171 typedef struct _slp_oprnd_info
173 /* Def-stmts for the operands. */
174 vec
<stmt_vec_info
> def_stmts
;
177 /* Information about the first statement, its vector def-type, type, the
178 operand itself in case it's constant, and an indication if it's a pattern
181 enum vect_def_type first_dt
;
186 /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
188 static vec
<slp_oprnd_info
>
189 vect_create_oprnd_info (int nops
, int group_size
)
192 slp_oprnd_info oprnd_info
;
193 vec
<slp_oprnd_info
> oprnds_info
;
195 oprnds_info
.create (nops
);
196 for (i
= 0; i
< nops
; i
++)
198 oprnd_info
= XNEW (struct _slp_oprnd_info
);
199 oprnd_info
->def_stmts
.create (group_size
);
200 oprnd_info
->ops
.create (group_size
);
201 oprnd_info
->first_dt
= vect_uninitialized_def
;
202 oprnd_info
->first_op_type
= NULL_TREE
;
203 oprnd_info
->any_pattern
= false;
204 oprnds_info
.quick_push (oprnd_info
);
211 /* Free operands info. */
214 vect_free_oprnd_info (vec
<slp_oprnd_info
> &oprnds_info
)
217 slp_oprnd_info oprnd_info
;
219 FOR_EACH_VEC_ELT (oprnds_info
, i
, oprnd_info
)
221 oprnd_info
->def_stmts
.release ();
222 oprnd_info
->ops
.release ();
223 XDELETE (oprnd_info
);
226 oprnds_info
.release ();
230 /* Return true if STMTS contains a pattern statement. */
233 vect_contains_pattern_stmt_p (vec
<stmt_vec_info
> stmts
)
235 stmt_vec_info stmt_info
;
237 FOR_EACH_VEC_ELT (stmts
, i
, stmt_info
)
238 if (is_pattern_stmt_p (stmt_info
))
243 /* Return true when all lanes in the external or constant NODE have
247 vect_slp_tree_uniform_p (slp_tree node
)
249 gcc_assert (SLP_TREE_DEF_TYPE (node
) == vect_constant_def
250 || SLP_TREE_DEF_TYPE (node
) == vect_external_def
);
252 /* Pre-exsting vectors. */
253 if (SLP_TREE_SCALAR_OPS (node
).is_empty ())
257 tree op
, first
= NULL_TREE
;
258 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node
), i
, op
)
261 else if (!operand_equal_p (first
, op
, 0))
267 /* Find the place of the data-ref in STMT_INFO in the interleaving chain
268 that starts from FIRST_STMT_INFO. Return -1 if the data-ref is not a part
272 vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info
,
273 stmt_vec_info first_stmt_info
)
275 stmt_vec_info next_stmt_info
= first_stmt_info
;
278 if (first_stmt_info
!= DR_GROUP_FIRST_ELEMENT (stmt_info
))
283 if (next_stmt_info
== stmt_info
)
285 next_stmt_info
= DR_GROUP_NEXT_ELEMENT (next_stmt_info
);
287 result
+= DR_GROUP_GAP (next_stmt_info
);
289 while (next_stmt_info
);
294 /* Check whether it is possible to load COUNT elements of type ELT_TYPE
295 using the method implemented by duplicate_and_interleave. Return true
296 if so, returning the number of intermediate vectors in *NVECTORS_OUT
297 (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
301 can_duplicate_and_interleave_p (vec_info
*vinfo
, unsigned int count
,
302 tree elt_type
, unsigned int *nvectors_out
,
303 tree
*vector_type_out
,
306 tree base_vector_type
= get_vectype_for_scalar_type (vinfo
, elt_type
, count
);
307 if (!base_vector_type
|| !VECTOR_MODE_P (TYPE_MODE (base_vector_type
)))
310 machine_mode base_vector_mode
= TYPE_MODE (base_vector_type
);
311 poly_int64 elt_bytes
= count
* GET_MODE_UNIT_SIZE (base_vector_mode
);
312 unsigned int nvectors
= 1;
315 scalar_int_mode int_mode
;
316 poly_int64 elt_bits
= elt_bytes
* BITS_PER_UNIT
;
317 if (int_mode_for_size (elt_bits
, 1).exists (&int_mode
))
319 /* Get the natural vector type for this SLP group size. */
320 tree int_type
= build_nonstandard_integer_type
321 (GET_MODE_BITSIZE (int_mode
), 1);
323 = get_vectype_for_scalar_type (vinfo
, int_type
, count
);
325 && VECTOR_MODE_P (TYPE_MODE (vector_type
))
326 && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type
)),
327 GET_MODE_SIZE (base_vector_mode
)))
329 /* Try fusing consecutive sequences of COUNT / NVECTORS elements
330 together into elements of type INT_TYPE and using the result
331 to build NVECTORS vectors. */
332 poly_uint64 nelts
= GET_MODE_NUNITS (TYPE_MODE (vector_type
));
333 vec_perm_builder
sel1 (nelts
, 2, 3);
334 vec_perm_builder
sel2 (nelts
, 2, 3);
335 poly_int64 half_nelts
= exact_div (nelts
, 2);
336 for (unsigned int i
= 0; i
< 3; ++i
)
339 sel1
.quick_push (i
+ nelts
);
340 sel2
.quick_push (half_nelts
+ i
);
341 sel2
.quick_push (half_nelts
+ i
+ nelts
);
343 vec_perm_indices
indices1 (sel1
, 2, nelts
);
344 vec_perm_indices
indices2 (sel2
, 2, nelts
);
345 if (can_vec_perm_const_p (TYPE_MODE (vector_type
), indices1
)
346 && can_vec_perm_const_p (TYPE_MODE (vector_type
), indices2
))
349 *nvectors_out
= nvectors
;
351 *vector_type_out
= vector_type
;
354 permutes
[0] = vect_gen_perm_mask_checked (vector_type
,
356 permutes
[1] = vect_gen_perm_mask_checked (vector_type
,
363 if (!multiple_p (elt_bytes
, 2, &elt_bytes
))
369 /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
370 they are of a valid type and that they match the defs of the first stmt of
371 the SLP group (stored in OPRNDS_INFO). This function tries to match stmts
372 by swapping operands of STMTS[STMT_NUM] when possible. Non-zero *SWAP
373 indicates swap is required for cond_expr stmts. Specifically, *SWAP
374 is 1 if STMT is cond and operands of comparison need to be swapped;
375 *SWAP is 2 if STMT is cond and code of comparison needs to be inverted.
376 If there is any operand swap in this function, *SWAP is set to non-zero
378 If there was a fatal error return -1; if the error could be corrected by
379 swapping operands of father node of this one, return 1; if everything is
382 vect_get_and_check_slp_defs (vec_info
*vinfo
, unsigned char *swap
,
383 vec
<stmt_vec_info
> stmts
, unsigned stmt_num
,
384 vec
<slp_oprnd_info
> *oprnds_info
)
386 stmt_vec_info stmt_info
= stmts
[stmt_num
];
388 unsigned int i
, number_of_oprnds
;
389 enum vect_def_type dt
= vect_uninitialized_def
;
390 slp_oprnd_info oprnd_info
;
391 int first_op_idx
= 1;
392 unsigned int commutative_op
= -1U;
393 bool first_op_cond
= false;
394 bool first
= stmt_num
== 0;
396 if (gcall
*stmt
= dyn_cast
<gcall
*> (stmt_info
->stmt
))
398 number_of_oprnds
= gimple_call_num_args (stmt
);
400 if (gimple_call_internal_p (stmt
))
402 internal_fn ifn
= gimple_call_internal_fn (stmt
);
403 commutative_op
= first_commutative_argument (ifn
);
405 /* Masked load, only look at mask. */
406 if (ifn
== IFN_MASK_LOAD
)
408 number_of_oprnds
= 1;
409 /* Mask operand index. */
414 else if (gassign
*stmt
= dyn_cast
<gassign
*> (stmt_info
->stmt
))
416 enum tree_code code
= gimple_assign_rhs_code (stmt
);
417 number_of_oprnds
= gimple_num_ops (stmt
) - 1;
418 /* Swap can only be done for cond_expr if asked to, otherwise we
419 could result in different comparison code to the first stmt. */
420 if (code
== COND_EXPR
421 && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt
)))
423 first_op_cond
= true;
427 commutative_op
= commutative_tree_code (code
) ? 0U : -1U;
432 bool swapped
= (*swap
!= 0);
433 gcc_assert (!swapped
|| first_op_cond
);
434 for (i
= 0; i
< number_of_oprnds
; i
++)
439 /* Map indicating how operands of cond_expr should be swapped. */
440 int maps
[3][4] = {{0, 1, 2, 3}, {1, 0, 2, 3}, {0, 1, 3, 2}};
441 int *map
= maps
[*swap
];
444 oprnd
= TREE_OPERAND (gimple_op (stmt_info
->stmt
,
445 first_op_idx
), map
[i
]);
447 oprnd
= gimple_op (stmt_info
->stmt
, map
[i
]);
450 oprnd
= gimple_op (stmt_info
->stmt
, first_op_idx
+ (swapped
? !i
: i
));
451 if (TREE_CODE (oprnd
) == VIEW_CONVERT_EXPR
)
452 oprnd
= TREE_OPERAND (oprnd
, 0);
454 oprnd_info
= (*oprnds_info
)[i
];
456 stmt_vec_info def_stmt_info
;
457 if (!vect_is_simple_use (oprnd
, vinfo
, &dt
, &def_stmt_info
))
459 if (dump_enabled_p ())
460 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
461 "Build SLP failed: can't analyze def for %T\n",
467 if (def_stmt_info
&& is_pattern_stmt_p (def_stmt_info
))
468 oprnd_info
->any_pattern
= true;
472 /* For the swapping logic below force vect_reduction_def
473 for the reduction op in a SLP reduction group. */
474 if (!STMT_VINFO_DATA_REF (stmt_info
)
475 && REDUC_GROUP_FIRST_ELEMENT (stmt_info
)
476 && (int)i
== STMT_VINFO_REDUC_IDX (stmt_info
)
478 dt
= vect_reduction_def
;
479 oprnd_info
->first_dt
= dt
;
480 oprnd_info
->first_op_type
= TREE_TYPE (oprnd
);
484 /* Not first stmt of the group, check that the def-stmt/s match
485 the def-stmt/s of the first stmt. Allow different definition
486 types for reduction chains: the first stmt must be a
487 vect_reduction_def (a phi node), and the rest
488 end in the reduction chain. */
489 tree type
= TREE_TYPE (oprnd
);
490 if ((oprnd_info
->first_dt
!= dt
491 && !(oprnd_info
->first_dt
== vect_reduction_def
492 && !STMT_VINFO_DATA_REF (stmt_info
)
493 && REDUC_GROUP_FIRST_ELEMENT (stmt_info
)
495 && !STMT_VINFO_DATA_REF (def_stmt_info
)
496 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info
)
497 == REDUC_GROUP_FIRST_ELEMENT (stmt_info
)))
498 && !((oprnd_info
->first_dt
== vect_external_def
499 || oprnd_info
->first_dt
== vect_constant_def
)
500 && (dt
== vect_external_def
501 || dt
== vect_constant_def
)))
502 || !types_compatible_p (oprnd_info
->first_op_type
, type
)
503 || (!STMT_VINFO_DATA_REF (stmt_info
)
504 && REDUC_GROUP_FIRST_ELEMENT (stmt_info
)
506 || STMT_VINFO_DATA_REF (def_stmt_info
)
507 || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info
)
508 != REDUC_GROUP_FIRST_ELEMENT (stmt_info
)))
509 != (oprnd_info
->first_dt
!= vect_reduction_def
))))
511 /* Try swapping operands if we got a mismatch. */
512 if (i
== commutative_op
&& !swapped
)
514 if (dump_enabled_p ())
515 dump_printf_loc (MSG_NOTE
, vect_location
,
516 "trying swapped operands\n");
521 if (dump_enabled_p ())
522 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
523 "Build SLP failed: different types\n");
527 if ((dt
== vect_constant_def
528 || dt
== vect_external_def
)
529 && !GET_MODE_SIZE (vinfo
->vector_mode
).is_constant ()
530 && (TREE_CODE (type
) == BOOLEAN_TYPE
531 || !can_duplicate_and_interleave_p (vinfo
, stmts
.length (),
534 if (dump_enabled_p ())
535 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
536 "Build SLP failed: invalid type of def "
537 "for variable-length SLP %T\n", oprnd
);
542 /* Check the types of the definitions. */
545 case vect_external_def
:
546 /* Make sure to demote the overall operand to external. */
547 oprnd_info
->first_dt
= vect_external_def
;
549 case vect_constant_def
:
550 oprnd_info
->def_stmts
.quick_push (NULL
);
551 oprnd_info
->ops
.quick_push (oprnd
);
554 case vect_internal_def
:
555 case vect_reduction_def
:
556 if (oprnd_info
->first_dt
== vect_reduction_def
557 && !STMT_VINFO_DATA_REF (stmt_info
)
558 && REDUC_GROUP_FIRST_ELEMENT (stmt_info
)
559 && !STMT_VINFO_DATA_REF (def_stmt_info
)
560 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info
)
561 == REDUC_GROUP_FIRST_ELEMENT (stmt_info
)))
563 /* For a SLP reduction chain we want to duplicate the
564 reduction to each of the chain members. That gets
565 us a sane SLP graph (still the stmts are not 100%
566 correct wrt the initial values). */
568 oprnd_info
->def_stmts
.quick_push (oprnd_info
->def_stmts
[0]);
569 oprnd_info
->ops
.quick_push (oprnd_info
->ops
[0]);
573 case vect_induction_def
:
574 oprnd_info
->def_stmts
.quick_push (def_stmt_info
);
575 oprnd_info
->ops
.quick_push (oprnd
);
579 /* FORNOW: Not supported. */
580 if (dump_enabled_p ())
581 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
582 "Build SLP failed: illegal type of def %T\n",
592 if (dump_enabled_p ())
593 dump_printf_loc (MSG_NOTE
, vect_location
,
594 "swapped operands to match def types in %G",
602 /* Try to assign vector type VECTYPE to STMT_INFO for BB vectorization.
603 Return true if we can, meaning that this choice doesn't conflict with
604 existing SLP nodes that use STMT_INFO. */
607 vect_update_shared_vectype (stmt_vec_info stmt_info
, tree vectype
)
609 tree old_vectype
= STMT_VINFO_VECTYPE (stmt_info
);
610 if (old_vectype
&& useless_type_conversion_p (vectype
, old_vectype
))
613 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
614 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info
)))
616 /* We maintain the invariant that if any statement in the group is
617 used, all other members of the group have the same vector type. */
618 stmt_vec_info first_info
= DR_GROUP_FIRST_ELEMENT (stmt_info
);
619 stmt_vec_info member_info
= first_info
;
620 for (; member_info
; member_info
= DR_GROUP_NEXT_ELEMENT (member_info
))
621 if (STMT_VINFO_NUM_SLP_USES (member_info
) > 0
622 || is_pattern_stmt_p (member_info
))
627 for (member_info
= first_info
; member_info
;
628 member_info
= DR_GROUP_NEXT_ELEMENT (member_info
))
629 STMT_VINFO_VECTYPE (member_info
) = vectype
;
633 else if (STMT_VINFO_NUM_SLP_USES (stmt_info
) == 0
634 && !is_pattern_stmt_p (stmt_info
))
636 STMT_VINFO_VECTYPE (stmt_info
) = vectype
;
640 if (dump_enabled_p ())
642 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
643 "Build SLP failed: incompatible vector"
644 " types for: %G", stmt_info
->stmt
);
645 dump_printf_loc (MSG_NOTE
, vect_location
,
646 " old vector type: %T\n", old_vectype
);
647 dump_printf_loc (MSG_NOTE
, vect_location
,
648 " new vector type: %T\n", vectype
);
653 /* Return true if call statements CALL1 and CALL2 are similar enough
654 to be combined into the same SLP group. */
657 compatible_calls_p (gcall
*call1
, gcall
*call2
)
659 unsigned int nargs
= gimple_call_num_args (call1
);
660 if (nargs
!= gimple_call_num_args (call2
))
663 if (gimple_call_combined_fn (call1
) != gimple_call_combined_fn (call2
))
666 if (gimple_call_internal_p (call1
))
668 if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1
)),
669 TREE_TYPE (gimple_call_lhs (call2
))))
671 for (unsigned int i
= 0; i
< nargs
; ++i
)
672 if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1
, i
)),
673 TREE_TYPE (gimple_call_arg (call2
, i
))))
678 if (!operand_equal_p (gimple_call_fn (call1
),
679 gimple_call_fn (call2
), 0))
682 if (gimple_call_fntype (call1
) != gimple_call_fntype (call2
))
688 /* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
689 caller's attempt to find the vector type in STMT_INFO with the narrowest
690 element type. Return true if VECTYPE is nonnull and if it is valid
691 for STMT_INFO. When returning true, update MAX_NUNITS to reflect the
692 number of units in VECTYPE. GROUP_SIZE and MAX_NUNITS are as for
693 vect_build_slp_tree. */
696 vect_record_max_nunits (vec_info
*vinfo
, stmt_vec_info stmt_info
,
697 unsigned int group_size
,
698 tree vectype
, poly_uint64
*max_nunits
)
702 if (dump_enabled_p ())
703 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
704 "Build SLP failed: unsupported data-type in %G\n",
706 /* Fatal mismatch. */
710 /* If populating the vector type requires unrolling then fail
711 before adjusting *max_nunits for basic-block vectorization. */
712 poly_uint64 nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
713 unsigned HOST_WIDE_INT const_nunits
;
714 if (is_a
<bb_vec_info
> (vinfo
)
715 && (!nunits
.is_constant (&const_nunits
)
716 || const_nunits
> group_size
))
718 if (dump_enabled_p ())
719 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
720 "Build SLP failed: unrolling required "
721 "in basic block SLP\n");
722 /* Fatal mismatch. */
726 /* In case of multiple types we need to detect the smallest type. */
727 vect_update_max_nunits (max_nunits
, vectype
);
731 /* Verify if the scalar stmts STMTS are isomorphic, require data
732 permutation or are of unsupported types of operation. Return
733 true if they are, otherwise return false and indicate in *MATCHES
734 which stmts are not isomorphic to the first one. If MATCHES[0]
735 is false then this indicates the comparison could not be
736 carried out or the stmts will never be vectorized by SLP.
738 Note COND_EXPR is possibly isomorphic to another one after swapping its
739 operands. Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
740 the first stmt by swapping the two operands of comparison; set SWAP[i]
741 to 2 if stmt I is isormorphic to the first stmt by inverting the code
742 of comparison. Take A1 >= B1 ? X1 : Y1 as an exmple, it can be swapped
743 to (B1 <= A1 ? X1 : Y1); or be inverted to (A1 < B1) ? Y1 : X1. */
746 vect_build_slp_tree_1 (vec_info
*vinfo
, unsigned char *swap
,
747 vec
<stmt_vec_info
> stmts
, unsigned int group_size
,
748 poly_uint64
*max_nunits
, bool *matches
,
749 bool *two_operators
, tree
*node_vectype
)
752 stmt_vec_info first_stmt_info
= stmts
[0];
753 enum tree_code first_stmt_code
= ERROR_MARK
;
754 enum tree_code alt_stmt_code
= ERROR_MARK
;
755 enum tree_code rhs_code
= ERROR_MARK
;
756 enum tree_code first_cond_code
= ERROR_MARK
;
758 bool need_same_oprnds
= false;
759 tree vectype
= NULL_TREE
, first_op1
= NULL_TREE
;
762 machine_mode optab_op2_mode
;
763 machine_mode vec_mode
;
764 stmt_vec_info first_load
= NULL
, prev_first_load
= NULL
;
765 bool first_stmt_load_p
= false, load_p
= false;
767 /* For every stmt in NODE find its def stmt/s. */
768 stmt_vec_info stmt_info
;
769 FOR_EACH_VEC_ELT (stmts
, i
, stmt_info
)
771 gimple
*stmt
= stmt_info
->stmt
;
775 if (dump_enabled_p ())
776 dump_printf_loc (MSG_NOTE
, vect_location
, "Build SLP for %G", stmt
);
778 /* Fail to vectorize statements marked as unvectorizable. */
779 if (!STMT_VINFO_VECTORIZABLE (stmt_info
))
781 if (dump_enabled_p ())
782 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
783 "Build SLP failed: unvectorizable statement %G",
785 /* Fatal mismatch. */
790 lhs
= gimple_get_lhs (stmt
);
791 if (lhs
== NULL_TREE
)
793 if (dump_enabled_p ())
794 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
795 "Build SLP failed: not GIMPLE_ASSIGN nor "
796 "GIMPLE_CALL %G", stmt
);
797 /* Fatal mismatch. */
803 if (!vect_get_vector_types_for_stmt (vinfo
, stmt_info
, &vectype
,
804 &nunits_vectype
, group_size
)
806 && !vect_record_max_nunits (vinfo
, stmt_info
, group_size
,
807 nunits_vectype
, max_nunits
)))
809 /* Fatal mismatch. */
814 gcc_assert (vectype
);
816 if (is_a
<bb_vec_info
> (vinfo
)
817 && !vect_update_shared_vectype (stmt_info
, vectype
))
820 gcall
*call_stmt
= dyn_cast
<gcall
*> (stmt
);
823 rhs_code
= CALL_EXPR
;
825 if (gimple_call_internal_p (stmt
, IFN_MASK_LOAD
))
827 else if ((gimple_call_internal_p (call_stmt
)
828 && (!vectorizable_internal_fn_p
829 (gimple_call_internal_fn (call_stmt
))))
830 || gimple_call_tail_p (call_stmt
)
831 || gimple_call_noreturn_p (call_stmt
)
832 || !gimple_call_nothrow_p (call_stmt
)
833 || gimple_call_chain (call_stmt
))
835 if (dump_enabled_p ())
836 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
837 "Build SLP failed: unsupported call type %G",
839 /* Fatal mismatch. */
846 rhs_code
= gimple_assign_rhs_code (stmt
);
847 load_p
= gimple_vuse (stmt
);
850 /* Check the operation. */
853 *node_vectype
= vectype
;
854 first_stmt_code
= rhs_code
;
855 first_stmt_load_p
= load_p
;
857 /* Shift arguments should be equal in all the packed stmts for a
858 vector shift with scalar shift operand. */
859 if (rhs_code
== LSHIFT_EXPR
|| rhs_code
== RSHIFT_EXPR
860 || rhs_code
== LROTATE_EXPR
861 || rhs_code
== RROTATE_EXPR
)
863 vec_mode
= TYPE_MODE (vectype
);
865 /* First see if we have a vector/vector shift. */
866 optab
= optab_for_tree_code (rhs_code
, vectype
,
870 || optab_handler (optab
, vec_mode
) == CODE_FOR_nothing
)
872 /* No vector/vector shift, try for a vector/scalar shift. */
873 optab
= optab_for_tree_code (rhs_code
, vectype
,
878 if (dump_enabled_p ())
879 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
880 "Build SLP failed: no optab.\n");
881 /* Fatal mismatch. */
885 icode
= (int) optab_handler (optab
, vec_mode
);
886 if (icode
== CODE_FOR_nothing
)
888 if (dump_enabled_p ())
889 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
891 "op not supported by target.\n");
892 /* Fatal mismatch. */
896 optab_op2_mode
= insn_data
[icode
].operand
[2].mode
;
897 if (!VECTOR_MODE_P (optab_op2_mode
))
899 need_same_oprnds
= true;
900 first_op1
= gimple_assign_rhs2 (stmt
);
904 else if (rhs_code
== WIDEN_LSHIFT_EXPR
)
906 need_same_oprnds
= true;
907 first_op1
= gimple_assign_rhs2 (stmt
);
910 && rhs_code
== BIT_FIELD_REF
)
912 tree vec
= TREE_OPERAND (gimple_assign_rhs1 (stmt
), 0);
913 if (TREE_CODE (vec
) != SSA_NAME
914 || !types_compatible_p (vectype
, TREE_TYPE (vec
)))
916 if (dump_enabled_p ())
917 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
919 "BIT_FIELD_REF not supported\n");
920 /* Fatal mismatch. */
926 && gimple_call_internal_p (call_stmt
, IFN_DIV_POW2
))
928 need_same_oprnds
= true;
929 first_op1
= gimple_call_arg (call_stmt
, 1);
934 if (first_stmt_code
!= rhs_code
935 && alt_stmt_code
== ERROR_MARK
)
936 alt_stmt_code
= rhs_code
;
937 if ((first_stmt_code
!= rhs_code
938 && (first_stmt_code
!= IMAGPART_EXPR
939 || rhs_code
!= REALPART_EXPR
)
940 && (first_stmt_code
!= REALPART_EXPR
941 || rhs_code
!= IMAGPART_EXPR
)
942 /* Handle mismatches in plus/minus by computing both
943 and merging the results. */
944 && !((first_stmt_code
== PLUS_EXPR
945 || first_stmt_code
== MINUS_EXPR
)
946 && (alt_stmt_code
== PLUS_EXPR
947 || alt_stmt_code
== MINUS_EXPR
)
948 && rhs_code
== alt_stmt_code
)
949 && !(STMT_VINFO_GROUPED_ACCESS (stmt_info
)
950 && (first_stmt_code
== ARRAY_REF
951 || first_stmt_code
== BIT_FIELD_REF
952 || first_stmt_code
== INDIRECT_REF
953 || first_stmt_code
== COMPONENT_REF
954 || first_stmt_code
== MEM_REF
)))
955 || first_stmt_load_p
!= load_p
)
957 if (dump_enabled_p ())
959 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
960 "Build SLP failed: different operation "
962 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
963 "original stmt %G", first_stmt_info
->stmt
);
969 if (need_same_oprnds
)
971 tree other_op1
= (call_stmt
972 ? gimple_call_arg (call_stmt
, 1)
973 : gimple_assign_rhs2 (stmt
));
974 if (!operand_equal_p (first_op1
, other_op1
, 0))
976 if (dump_enabled_p ())
977 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
978 "Build SLP failed: different shift "
979 "arguments in %G", stmt
);
985 && first_stmt_code
== BIT_FIELD_REF
986 && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info
->stmt
), 0)
987 != TREE_OPERAND (gimple_assign_rhs1 (stmt_info
->stmt
), 0)))
989 if (dump_enabled_p ())
990 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
991 "Build SLP failed: different BIT_FIELD_REF "
992 "arguments in %G", stmt
);
997 if (!load_p
&& rhs_code
== CALL_EXPR
)
999 if (!compatible_calls_p (as_a
<gcall
*> (stmts
[0]->stmt
),
1000 as_a
<gcall
*> (stmt
)))
1002 if (dump_enabled_p ())
1003 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1004 "Build SLP failed: different calls in %G",
1012 /* Grouped store or load. */
1013 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1015 if (REFERENCE_CLASS_P (lhs
))
1023 first_load
= DR_GROUP_FIRST_ELEMENT (stmt_info
);
1024 if (prev_first_load
)
1026 /* Check that there are no loads from different interleaving
1027 chains in the same node. */
1028 if (prev_first_load
!= first_load
)
1030 if (dump_enabled_p ())
1031 dump_printf_loc (MSG_MISSED_OPTIMIZATION
,
1033 "Build SLP failed: different "
1034 "interleaving chains in one node %G",
1041 prev_first_load
= first_load
;
1043 } /* Grouped access. */
1048 /* Not grouped load. */
1049 if (dump_enabled_p ())
1050 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1051 "Build SLP failed: not grouped load %G", stmt
);
1053 /* FORNOW: Not grouped loads are not supported. */
1054 /* Fatal mismatch. */
1059 /* Not memory operation. */
1060 if (TREE_CODE_CLASS (rhs_code
) != tcc_binary
1061 && TREE_CODE_CLASS (rhs_code
) != tcc_unary
1062 && TREE_CODE_CLASS (rhs_code
) != tcc_expression
1063 && TREE_CODE_CLASS (rhs_code
) != tcc_comparison
1064 && rhs_code
!= VIEW_CONVERT_EXPR
1065 && rhs_code
!= CALL_EXPR
1066 && rhs_code
!= BIT_FIELD_REF
)
1068 if (dump_enabled_p ())
1069 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1070 "Build SLP failed: operation unsupported %G",
1072 /* Fatal mismatch. */
1077 if (rhs_code
== COND_EXPR
)
1079 tree cond_expr
= gimple_assign_rhs1 (stmt
);
1080 enum tree_code cond_code
= TREE_CODE (cond_expr
);
1081 enum tree_code swap_code
= ERROR_MARK
;
1082 enum tree_code invert_code
= ERROR_MARK
;
1085 first_cond_code
= TREE_CODE (cond_expr
);
1086 else if (TREE_CODE_CLASS (cond_code
) == tcc_comparison
)
1088 bool honor_nans
= HONOR_NANS (TREE_OPERAND (cond_expr
, 0));
1089 swap_code
= swap_tree_comparison (cond_code
);
1090 invert_code
= invert_tree_comparison (cond_code
, honor_nans
);
1093 if (first_cond_code
== cond_code
)
1095 /* Isomorphic can be achieved by swapping. */
1096 else if (first_cond_code
== swap_code
)
1098 /* Isomorphic can be achieved by inverting. */
1099 else if (first_cond_code
== invert_code
)
1103 if (dump_enabled_p ())
1104 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1105 "Build SLP failed: different"
1106 " operation %G", stmt
);
1116 for (i
= 0; i
< group_size
; ++i
)
1120 /* If we allowed a two-operation SLP node verify the target can cope
1121 with the permute we are going to use. */
1122 if (alt_stmt_code
!= ERROR_MARK
1123 && TREE_CODE_CLASS (alt_stmt_code
) != tcc_reference
)
1125 *two_operators
= true;
1131 /* Traits for the hash_set to record failed SLP builds for a stmt set.
1132 Note we never remove apart from at destruction time so we do not
1133 need a special value for deleted that differs from empty. */
1136 typedef vec
<stmt_vec_info
> value_type
;
1137 typedef vec
<stmt_vec_info
> compare_type
;
1138 static inline hashval_t
hash (value_type
);
1139 static inline bool equal (value_type existing
, value_type candidate
);
1140 static inline bool is_empty (value_type x
) { return !x
.exists (); }
1141 static inline bool is_deleted (value_type x
) { return !x
.exists (); }
1142 static const bool empty_zero_p
= true;
1143 static inline void mark_empty (value_type
&x
) { x
.release (); }
1144 static inline void mark_deleted (value_type
&x
) { x
.release (); }
1145 static inline void remove (value_type
&x
) { x
.release (); }
1148 bst_traits::hash (value_type x
)
1151 for (unsigned i
= 0; i
< x
.length (); ++i
)
1152 h
.add_int (gimple_uid (x
[i
]->stmt
));
1156 bst_traits::equal (value_type existing
, value_type candidate
)
1158 if (existing
.length () != candidate
.length ())
1160 for (unsigned i
= 0; i
< existing
.length (); ++i
)
1161 if (existing
[i
] != candidate
[i
])
1166 typedef hash_map
<vec
<gimple
*>, slp_tree
,
1167 simple_hashmap_traits
<bst_traits
, slp_tree
> >
1168 scalar_stmts_to_slp_tree_map_t
;
1171 vect_build_slp_tree_2 (vec_info
*vinfo
,
1172 vec
<stmt_vec_info
> stmts
, unsigned int group_size
,
1173 poly_uint64
*max_nunits
,
1174 bool *matches
, unsigned *npermutes
, unsigned *tree_size
,
1175 scalar_stmts_to_slp_tree_map_t
*bst_map
);
1178 vect_build_slp_tree (vec_info
*vinfo
,
1179 vec
<stmt_vec_info
> stmts
, unsigned int group_size
,
1180 poly_uint64
*max_nunits
,
1181 bool *matches
, unsigned *npermutes
, unsigned *tree_size
,
1182 scalar_stmts_to_slp_tree_map_t
*bst_map
)
1184 if (slp_tree
*leader
= bst_map
->get (stmts
))
1186 if (dump_enabled_p ())
1187 dump_printf_loc (MSG_NOTE
, vect_location
, "re-using %sSLP tree %p\n",
1188 *leader
? "" : "failed ", *leader
);
1191 (*leader
)->refcnt
++;
1192 vect_update_max_nunits (max_nunits
, (*leader
)->max_nunits
);
1196 poly_uint64 this_max_nunits
= 1;
1197 slp_tree res
= vect_build_slp_tree_2 (vinfo
, stmts
, group_size
,
1199 matches
, npermutes
, tree_size
, bst_map
);
1202 res
->max_nunits
= this_max_nunits
;
1203 vect_update_max_nunits (max_nunits
, this_max_nunits
);
1204 /* Keep a reference for the bst_map use. */
1207 bst_map
->put (stmts
.copy (), res
);
1211 /* Recursively build an SLP tree starting from NODE.
1212 Fail (and return a value not equal to zero) if def-stmts are not
1213 isomorphic, require data permutation or are of unsupported types of
1214 operation. Otherwise, return 0.
1215 The value returned is the depth in the SLP tree where a mismatch
1219 vect_build_slp_tree_2 (vec_info
*vinfo
,
1220 vec
<stmt_vec_info
> stmts
, unsigned int group_size
,
1221 poly_uint64
*max_nunits
,
1222 bool *matches
, unsigned *npermutes
, unsigned *tree_size
,
1223 scalar_stmts_to_slp_tree_map_t
*bst_map
)
1225 unsigned nops
, i
, this_tree_size
= 0;
1226 poly_uint64 this_max_nunits
= *max_nunits
;
1231 stmt_vec_info stmt_info
= stmts
[0];
1232 if (gcall
*stmt
= dyn_cast
<gcall
*> (stmt_info
->stmt
))
1233 nops
= gimple_call_num_args (stmt
);
1234 else if (gassign
*stmt
= dyn_cast
<gassign
*> (stmt_info
->stmt
))
1236 nops
= gimple_num_ops (stmt
) - 1;
1237 if (gimple_assign_rhs_code (stmt
) == COND_EXPR
)
1240 else if (is_a
<gphi
*> (stmt_info
->stmt
))
1245 /* If the SLP node is a PHI (induction or reduction), terminate
1247 if (gphi
*stmt
= dyn_cast
<gphi
*> (stmt_info
->stmt
))
1249 tree scalar_type
= TREE_TYPE (PHI_RESULT (stmt
));
1250 tree vectype
= get_vectype_for_scalar_type (vinfo
, scalar_type
);
1251 if (!vect_record_max_nunits (vinfo
, stmt_info
, group_size
, vectype
,
1255 vect_def_type def_type
= STMT_VINFO_DEF_TYPE (stmt_info
);
1256 /* Induction from different IVs is not supported. */
1257 if (def_type
== vect_induction_def
)
1259 stmt_vec_info other_info
;
1260 FOR_EACH_VEC_ELT (stmts
, i
, other_info
)
1261 if (stmt_info
!= other_info
)
1264 else if (def_type
== vect_reduction_def
1265 || def_type
== vect_double_reduction_def
1266 || def_type
== vect_nested_cycle
)
1268 /* Else def types have to match. */
1269 stmt_vec_info other_info
;
1270 FOR_EACH_VEC_ELT (stmts
, i
, other_info
)
1271 if (STMT_VINFO_DEF_TYPE (other_info
) != def_type
)
1277 node
= vect_create_new_slp_node (stmts
, 0);
1278 SLP_TREE_VECTYPE (node
) = vectype
;
1283 bool two_operators
= false;
1284 unsigned char *swap
= XALLOCAVEC (unsigned char, group_size
);
1285 tree vectype
= NULL_TREE
;
1286 if (!vect_build_slp_tree_1 (vinfo
, swap
, stmts
, group_size
,
1287 &this_max_nunits
, matches
, &two_operators
,
1291 /* If the SLP node is a load, terminate the recursion unless masked. */
1292 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1293 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info
)))
1295 if (gcall
*stmt
= dyn_cast
<gcall
*> (stmt_info
->stmt
))
1298 gcc_assert (gimple_call_internal_p (stmt
, IFN_MASK_LOAD
));
1303 *max_nunits
= this_max_nunits
;
1305 node
= vect_create_new_slp_node (stmts
, 0);
1306 SLP_TREE_VECTYPE (node
) = vectype
;
1307 /* And compute the load permutation. Whether it is actually
1308 a permutation depends on the unrolling factor which is
1310 vec
<unsigned> load_permutation
;
1312 stmt_vec_info load_info
;
1313 load_permutation
.create (group_size
);
1314 stmt_vec_info first_stmt_info
1315 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node
)[0]);
1316 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), j
, load_info
)
1318 int load_place
= vect_get_place_in_interleaving_chain
1319 (load_info
, first_stmt_info
);
1320 gcc_assert (load_place
!= -1);
1321 load_permutation
.safe_push (load_place
);
1323 SLP_TREE_LOAD_PERMUTATION (node
) = load_permutation
;
1327 else if (gimple_assign_single_p (stmt_info
->stmt
)
1328 && !gimple_vuse (stmt_info
->stmt
)
1329 && gimple_assign_rhs_code (stmt_info
->stmt
) == BIT_FIELD_REF
)
1331 /* vect_build_slp_tree_2 determined all BIT_FIELD_REFs reference
1332 the same SSA name vector of a compatible type to vectype. */
1333 vec
<std::pair
<unsigned, unsigned> > lperm
= vNULL
;
1334 tree vec
= TREE_OPERAND (gimple_assign_rhs1 (stmt_info
->stmt
), 0);
1335 stmt_vec_info estmt_info
;
1336 FOR_EACH_VEC_ELT (stmts
, i
, estmt_info
)
1338 gassign
*estmt
= as_a
<gassign
*> (estmt_info
->stmt
);
1339 tree bfref
= gimple_assign_rhs1 (estmt
);
1341 if (!known_eq (bit_field_size (bfref
),
1342 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype
))))
1343 || !constant_multiple_p (bit_field_offset (bfref
),
1344 bit_field_size (bfref
), &lane
))
1349 lperm
.safe_push (std::make_pair (0, (unsigned)lane
));
1351 slp_tree vnode
= vect_create_new_slp_node (vNULL
);
1352 SLP_TREE_VECTYPE (vnode
) = TREE_TYPE (vec
);
1353 SLP_TREE_VEC_DEFS (vnode
).safe_push (vec
);
1354 /* We are always building a permutation node even if it is an identity
1355 permute to shield the rest of the vectorizer from the odd node
1356 representing an actual vector without any scalar ops.
1357 ??? We could hide it completely with making the permute node
1359 node
= vect_create_new_slp_node (stmts
, 1);
1360 SLP_TREE_CODE (node
) = VEC_PERM_EXPR
;
1361 SLP_TREE_LANE_PERMUTATION (node
) = lperm
;
1362 SLP_TREE_VECTYPE (node
) = vectype
;
1363 SLP_TREE_CHILDREN (node
).quick_push (vnode
);
1367 /* Get at the operands, verifying they are compatible. */
1368 vec
<slp_oprnd_info
> oprnds_info
= vect_create_oprnd_info (nops
, group_size
);
1369 slp_oprnd_info oprnd_info
;
1370 FOR_EACH_VEC_ELT (stmts
, i
, stmt_info
)
1372 int res
= vect_get_and_check_slp_defs (vinfo
, &swap
[i
],
1373 stmts
, i
, &oprnds_info
);
1375 matches
[(res
== -1) ? 0 : i
] = false;
1379 for (i
= 0; i
< group_size
; ++i
)
1382 vect_free_oprnd_info (oprnds_info
);
1386 auto_vec
<slp_tree
, 4> children
;
1388 stmt_info
= stmts
[0];
1390 /* Create SLP_TREE nodes for the definition node/s. */
1391 FOR_EACH_VEC_ELT (oprnds_info
, i
, oprnd_info
)
1396 if (oprnd_info
->first_dt
== vect_uninitialized_def
)
1398 /* COND_EXPR have one too many eventually if the condition
1400 gcc_assert (i
== 3 && nops
== 4);
1404 if (oprnd_info
->first_dt
!= vect_internal_def
1405 && oprnd_info
->first_dt
!= vect_reduction_def
1406 && oprnd_info
->first_dt
!= vect_induction_def
)
1408 slp_tree invnode
= vect_create_new_slp_node (oprnd_info
->ops
);
1409 SLP_TREE_DEF_TYPE (invnode
) = oprnd_info
->first_dt
;
1410 oprnd_info
->ops
= vNULL
;
1411 children
.safe_push (invnode
);
1415 if ((child
= vect_build_slp_tree (vinfo
, oprnd_info
->def_stmts
,
1416 group_size
, &this_max_nunits
,
1418 &this_tree_size
, bst_map
)) != NULL
)
1420 oprnd_info
->def_stmts
= vNULL
;
1421 children
.safe_push (child
);
1425 /* If the SLP build failed fatally and we analyze a basic-block
1426 simply treat nodes we fail to build as externally defined
1427 (and thus build vectors from the scalar defs).
1428 The cost model will reject outright expensive cases.
1429 ??? This doesn't treat cases where permutation ultimatively
1430 fails (or we don't try permutation below). Ideally we'd
1431 even compute a permutation that will end up with the maximum
1433 if (is_a
<bb_vec_info
> (vinfo
)
1435 /* ??? Rejecting patterns this way doesn't work. We'd have to
1436 do extra work to cancel the pattern so the uses see the
1438 && !is_pattern_stmt_p (stmt_info
)
1439 && !oprnd_info
->any_pattern
)
1441 if (dump_enabled_p ())
1442 dump_printf_loc (MSG_NOTE
, vect_location
,
1443 "Building vector operands from scalars\n");
1445 child
= vect_create_new_slp_node (oprnd_info
->ops
);
1446 children
.safe_push (child
);
1447 oprnd_info
->ops
= vNULL
;
1448 oprnd_info
->def_stmts
= vNULL
;
1452 /* If the SLP build for operand zero failed and operand zero
1453 and one can be commutated try that for the scalar stmts
1454 that failed the match. */
1456 /* A first scalar stmt mismatch signals a fatal mismatch. */
1458 /* ??? For COND_EXPRs we can swap the comparison operands
1459 as well as the arms under some constraints. */
1461 && oprnds_info
[1]->first_dt
== vect_internal_def
1462 && is_gimple_assign (stmt_info
->stmt
)
1463 /* Swapping operands for reductions breaks assumptions later on. */
1464 && STMT_VINFO_DEF_TYPE (stmt_info
) != vect_reduction_def
1465 && STMT_VINFO_DEF_TYPE (stmt_info
) != vect_double_reduction_def
1466 /* Do so only if the number of not successful permutes was nor more
1467 than a cut-ff as re-trying the recursive match on
1468 possibly each level of the tree would expose exponential
1472 /* See whether we can swap the matching or the non-matching
1474 bool swap_not_matching
= true;
1477 for (j
= 0; j
< group_size
; ++j
)
1479 if (matches
[j
] != !swap_not_matching
)
1481 stmt_vec_info stmt_info
= stmts
[j
];
1482 /* Verify if we can swap operands of this stmt. */
1483 gassign
*stmt
= dyn_cast
<gassign
*> (stmt_info
->stmt
);
1485 || !commutative_tree_code (gimple_assign_rhs_code (stmt
)))
1487 if (!swap_not_matching
)
1489 swap_not_matching
= false;
1494 while (j
!= group_size
);
1496 /* Swap mismatched definition stmts. */
1497 if (dump_enabled_p ())
1498 dump_printf_loc (MSG_NOTE
, vect_location
,
1499 "Re-trying with swapped operands of stmts ");
1500 for (j
= 0; j
< group_size
; ++j
)
1501 if (matches
[j
] == !swap_not_matching
)
1503 std::swap (oprnds_info
[0]->def_stmts
[j
],
1504 oprnds_info
[1]->def_stmts
[j
]);
1505 std::swap (oprnds_info
[0]->ops
[j
],
1506 oprnds_info
[1]->ops
[j
]);
1507 if (dump_enabled_p ())
1508 dump_printf (MSG_NOTE
, "%d ", j
);
1510 if (dump_enabled_p ())
1511 dump_printf (MSG_NOTE
, "\n");
1512 /* And try again with scratch 'matches' ... */
1513 bool *tem
= XALLOCAVEC (bool, group_size
);
1514 if ((child
= vect_build_slp_tree (vinfo
, oprnd_info
->def_stmts
,
1515 group_size
, &this_max_nunits
,
1517 &this_tree_size
, bst_map
)) != NULL
)
1519 oprnd_info
->def_stmts
= vNULL
;
1520 children
.safe_push (child
);
1528 gcc_assert (child
== NULL
);
1529 FOR_EACH_VEC_ELT (children
, j
, child
)
1530 vect_free_slp_tree (child
, false);
1531 vect_free_oprnd_info (oprnds_info
);
1535 vect_free_oprnd_info (oprnds_info
);
1537 /* If we have all children of a child built up from uniform scalars
1538 then just throw that away, causing it built up from scalars.
1539 The exception is the SLP node for the vector store. */
1540 if (is_a
<bb_vec_info
> (vinfo
)
1541 && !STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1542 /* ??? Rejecting patterns this way doesn't work. We'd have to
1543 do extra work to cancel the pattern so the uses see the
1545 && !is_pattern_stmt_p (stmt_info
))
1549 FOR_EACH_VEC_ELT (children
, j
, child
)
1550 if (SLP_TREE_DEF_TYPE (child
) == vect_internal_def
1551 || !vect_slp_tree_uniform_p (child
))
1557 FOR_EACH_VEC_ELT (children
, j
, child
)
1558 vect_free_slp_tree (child
, false);
1560 if (dump_enabled_p ())
1561 dump_printf_loc (MSG_NOTE
, vect_location
,
1562 "Building parent vector operands from "
1563 "scalars instead\n");
1568 *tree_size
+= this_tree_size
+ 1;
1569 *max_nunits
= this_max_nunits
;
1573 /* ??? We'd likely want to either cache in bst_map sth like
1574 { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
1575 the true { a+b, a+b, a+b, a+b } ... but there we don't have
1576 explicit stmts to put in so the keying on 'stmts' doesn't
1577 work (but we have the same issue with nodes that use 'ops'). */
1578 slp_tree one
= new _slp_tree
;
1579 slp_tree two
= new _slp_tree
;
1580 SLP_TREE_DEF_TYPE (one
) = vect_internal_def
;
1581 SLP_TREE_DEF_TYPE (two
) = vect_internal_def
;
1582 SLP_TREE_VECTYPE (one
) = vectype
;
1583 SLP_TREE_VECTYPE (two
) = vectype
;
1584 SLP_TREE_CHILDREN (one
).safe_splice (children
);
1585 SLP_TREE_CHILDREN (two
).safe_splice (children
);
1587 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two
), i
, child
)
1590 /* Here we record the original defs since this
1591 node represents the final lane configuration. */
1592 node
= vect_create_new_slp_node (stmts
, 2);
1593 SLP_TREE_VECTYPE (node
) = vectype
;
1594 SLP_TREE_CODE (node
) = VEC_PERM_EXPR
;
1595 SLP_TREE_CHILDREN (node
).quick_push (one
);
1596 SLP_TREE_CHILDREN (node
).quick_push (two
);
1597 gassign
*stmt
= as_a
<gassign
*> (stmts
[0]->stmt
);
1598 enum tree_code code0
= gimple_assign_rhs_code (stmt
);
1599 enum tree_code ocode
= ERROR_MARK
;
1600 stmt_vec_info ostmt_info
;
1602 FOR_EACH_VEC_ELT (stmts
, i
, ostmt_info
)
1604 gassign
*ostmt
= as_a
<gassign
*> (ostmt_info
->stmt
);
1605 if (gimple_assign_rhs_code (ostmt
) != code0
)
1607 SLP_TREE_LANE_PERMUTATION (node
).safe_push (std::make_pair (1, i
));
1608 ocode
= gimple_assign_rhs_code (ostmt
);
1612 SLP_TREE_LANE_PERMUTATION (node
).safe_push (std::make_pair (0, i
));
1614 SLP_TREE_CODE (one
) = code0
;
1615 SLP_TREE_CODE (two
) = ocode
;
1616 SLP_TREE_LANES (one
) = stmts
.length ();
1617 SLP_TREE_LANES (two
) = stmts
.length ();
1618 SLP_TREE_REPRESENTATIVE (one
) = stmts
[0];
1619 SLP_TREE_REPRESENTATIVE (two
) = stmts
[j
];
1623 node
= vect_create_new_slp_node (stmts
, nops
);
1624 SLP_TREE_VECTYPE (node
) = vectype
;
1625 SLP_TREE_CHILDREN (node
).splice (children
);
1629 /* Dump a single SLP tree NODE. */
1632 vect_print_slp_tree (dump_flags_t dump_kind
, dump_location_t loc
,
1637 stmt_vec_info stmt_info
;
1640 dump_metadata_t
metadata (dump_kind
, loc
.get_impl_location ());
1641 dump_user_location_t user_loc
= loc
.get_user_location ();
1642 dump_printf_loc (metadata
, user_loc
, "node%s %p (max_nunits=%u, refcnt=%u)\n",
1643 SLP_TREE_DEF_TYPE (node
) == vect_external_def
1645 : (SLP_TREE_DEF_TYPE (node
) == vect_constant_def
1648 estimated_poly_value (node
->max_nunits
), node
->refcnt
);
1649 if (SLP_TREE_SCALAR_STMTS (node
).exists ())
1650 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
1651 dump_printf_loc (metadata
, user_loc
, "\tstmt %u %G", i
, stmt_info
->stmt
);
1654 dump_printf_loc (metadata
, user_loc
, "\t{ ");
1655 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node
), i
, op
)
1656 dump_printf (metadata
, "%T%s ", op
,
1657 i
< SLP_TREE_SCALAR_OPS (node
).length () - 1 ? "," : "");
1658 dump_printf (metadata
, "}\n");
1660 if (SLP_TREE_LOAD_PERMUTATION (node
).exists ())
1662 dump_printf_loc (metadata
, user_loc
, "\tload permutation {");
1663 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node
), i
, j
)
1664 dump_printf (dump_kind
, " %u", j
);
1665 dump_printf (dump_kind
, " }\n");
1667 if (SLP_TREE_LANE_PERMUTATION (node
).exists ())
1669 dump_printf_loc (metadata
, user_loc
, "\tlane permutation {");
1670 for (i
= 0; i
< SLP_TREE_LANE_PERMUTATION (node
).length (); ++i
)
1671 dump_printf (dump_kind
, " %u[%u]",
1672 SLP_TREE_LANE_PERMUTATION (node
)[i
].first
,
1673 SLP_TREE_LANE_PERMUTATION (node
)[i
].second
);
1674 dump_printf (dump_kind
, " }\n");
1676 if (SLP_TREE_CHILDREN (node
).is_empty ())
1678 dump_printf_loc (metadata
, user_loc
, "\tchildren");
1679 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
1680 dump_printf (dump_kind
, " %p", (void *)child
);
1681 dump_printf (dump_kind
, "\n");
1685 debug (slp_tree node
)
1687 debug_dump_context ctx
;
1688 vect_print_slp_tree (MSG_NOTE
,
1689 dump_location_t::from_location_t (UNKNOWN_LOCATION
),
1693 /* Dump a slp tree NODE using flags specified in DUMP_KIND. */
1696 vect_print_slp_graph (dump_flags_t dump_kind
, dump_location_t loc
,
1697 slp_tree node
, hash_set
<slp_tree
> &visited
)
1702 if (visited
.add (node
))
1705 vect_print_slp_tree (dump_kind
, loc
, node
);
1707 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
1708 vect_print_slp_graph (dump_kind
, loc
, child
, visited
);
1712 vect_print_slp_graph (dump_flags_t dump_kind
, dump_location_t loc
,
1715 hash_set
<slp_tree
> visited
;
1716 vect_print_slp_graph (dump_kind
, loc
, entry
, visited
);
1719 /* Mark the tree rooted at NODE with PURE_SLP. */
1722 vect_mark_slp_stmts (slp_tree node
, hash_set
<slp_tree
> &visited
)
1725 stmt_vec_info stmt_info
;
1728 if (SLP_TREE_DEF_TYPE (node
) != vect_internal_def
)
1731 if (visited
.add (node
))
1734 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
1735 STMT_SLP_TYPE (stmt_info
) = pure_slp
;
1737 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
1738 vect_mark_slp_stmts (child
, visited
);
1742 vect_mark_slp_stmts (slp_tree node
)
1744 hash_set
<slp_tree
> visited
;
1745 vect_mark_slp_stmts (node
, visited
);
1748 /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */
1751 vect_mark_slp_stmts_relevant (slp_tree node
, hash_set
<slp_tree
> &visited
)
1754 stmt_vec_info stmt_info
;
1757 if (SLP_TREE_DEF_TYPE (node
) != vect_internal_def
)
1760 if (visited
.add (node
))
1763 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
1765 gcc_assert (!STMT_VINFO_RELEVANT (stmt_info
)
1766 || STMT_VINFO_RELEVANT (stmt_info
) == vect_used_in_scope
);
1767 STMT_VINFO_RELEVANT (stmt_info
) = vect_used_in_scope
;
1770 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
1771 vect_mark_slp_stmts_relevant (child
, visited
);
1775 vect_mark_slp_stmts_relevant (slp_tree node
)
1777 hash_set
<slp_tree
> visited
;
1778 vect_mark_slp_stmts_relevant (node
, visited
);
1781 /* Copy the SLP subtree rooted at NODE. */
1784 slp_copy_subtree (slp_tree node
, hash_map
<slp_tree
, slp_tree
> &map
)
1789 slp_tree
©_ref
= map
.get_or_insert (node
, &existed_p
);
1793 copy_ref
= new _slp_tree
;
1794 slp_tree copy
= copy_ref
;
1795 SLP_TREE_DEF_TYPE (copy
) = SLP_TREE_DEF_TYPE (node
);
1796 SLP_TREE_VECTYPE (copy
) = SLP_TREE_VECTYPE (node
);
1797 SLP_TREE_REPRESENTATIVE (copy
) = SLP_TREE_REPRESENTATIVE (node
);
1798 SLP_TREE_LANES (copy
) = SLP_TREE_LANES (node
);
1799 copy
->max_nunits
= node
->max_nunits
;
1801 if (SLP_TREE_SCALAR_STMTS (node
).exists ())
1803 SLP_TREE_SCALAR_STMTS (copy
) = SLP_TREE_SCALAR_STMTS (node
).copy ();
1804 stmt_vec_info stmt_info
;
1805 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
1806 STMT_VINFO_NUM_SLP_USES (stmt_info
)++;
1808 if (SLP_TREE_SCALAR_OPS (node
).exists ())
1809 SLP_TREE_SCALAR_OPS (copy
) = SLP_TREE_SCALAR_OPS (node
).copy ();
1810 if (SLP_TREE_LOAD_PERMUTATION (node
).exists ())
1811 SLP_TREE_LOAD_PERMUTATION (copy
) = SLP_TREE_LOAD_PERMUTATION (node
).copy ();
1812 if (SLP_TREE_LANE_PERMUTATION (node
).exists ())
1813 SLP_TREE_LANE_PERMUTATION (copy
) = SLP_TREE_LANE_PERMUTATION (node
).copy ();
1814 if (SLP_TREE_CHILDREN (node
).exists ())
1815 SLP_TREE_CHILDREN (copy
) = SLP_TREE_CHILDREN (node
).copy ();
1816 gcc_assert (!SLP_TREE_VEC_STMTS (node
).exists ());
1819 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (copy
), i
, child
)
1821 SLP_TREE_CHILDREN (copy
)[i
] = slp_copy_subtree (child
, map
);
1822 SLP_TREE_CHILDREN (copy
)[i
]->refcnt
++;
1827 /* Rearrange the statements of NODE according to PERMUTATION. */
1830 vect_slp_rearrange_stmts (slp_tree node
, unsigned int group_size
,
1831 vec
<unsigned> permutation
,
1832 hash_set
<slp_tree
> &visited
)
1837 if (visited
.add (node
))
1840 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
1841 vect_slp_rearrange_stmts (child
, group_size
, permutation
, visited
);
1843 if (SLP_TREE_SCALAR_STMTS (node
).exists ())
1845 gcc_assert (group_size
== SLP_TREE_SCALAR_STMTS (node
).length ());
1846 vec
<stmt_vec_info
> tmp_stmts
;
1847 tmp_stmts
.create (group_size
);
1848 tmp_stmts
.quick_grow (group_size
);
1849 stmt_vec_info stmt_info
;
1850 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
1851 tmp_stmts
[permutation
[i
]] = stmt_info
;
1852 SLP_TREE_SCALAR_STMTS (node
).release ();
1853 SLP_TREE_SCALAR_STMTS (node
) = tmp_stmts
;
1855 if (SLP_TREE_SCALAR_OPS (node
).exists ())
1857 gcc_assert (group_size
== SLP_TREE_SCALAR_OPS (node
).length ());
1859 tmp_ops
.create (group_size
);
1860 tmp_ops
.quick_grow (group_size
);
1862 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node
), i
, op
)
1863 tmp_ops
[permutation
[i
]] = op
;
1864 SLP_TREE_SCALAR_OPS (node
).release ();
1865 SLP_TREE_SCALAR_OPS (node
) = tmp_ops
;
1867 if (SLP_TREE_LANE_PERMUTATION (node
).exists ())
1869 gcc_assert (group_size
== SLP_TREE_LANE_PERMUTATION (node
).length ());
1870 for (i
= 0; i
< group_size
; ++i
)
1871 SLP_TREE_LANE_PERMUTATION (node
)[i
].second
1872 = permutation
[SLP_TREE_LANE_PERMUTATION (node
)[i
].second
];
1877 /* Attempt to reorder stmts in a reduction chain so that we don't
1878 require any load permutation. Return true if that was possible,
1879 otherwise return false. */
1882 vect_attempt_slp_rearrange_stmts (slp_instance slp_instn
)
1886 slp_tree node
, load
;
1888 if (SLP_INSTANCE_LOADS (slp_instn
).is_empty ())
1891 /* Compare all the permutation sequences to the first one. We know
1892 that at least one load is permuted. */
1893 node
= SLP_INSTANCE_LOADS (slp_instn
)[0];
1894 if (!SLP_TREE_LOAD_PERMUTATION (node
).exists ())
1896 unsigned int group_size
= SLP_TREE_LOAD_PERMUTATION (node
).length ();
1897 for (i
= 1; SLP_INSTANCE_LOADS (slp_instn
).iterate (i
, &load
); ++i
)
1899 if (!SLP_TREE_LOAD_PERMUTATION (load
).exists ()
1900 || SLP_TREE_LOAD_PERMUTATION (load
).length () != group_size
)
1902 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (load
), j
, lidx
)
1903 if (lidx
!= SLP_TREE_LOAD_PERMUTATION (node
)[j
])
1907 /* Check that the loads in the first sequence are different and there
1908 are no gaps between them and that there is an actual permutation. */
1909 bool any_permute
= false;
1910 auto_sbitmap
load_index (group_size
);
1911 bitmap_clear (load_index
);
1912 FOR_EACH_VEC_ELT (node
->load_permutation
, i
, lidx
)
1916 if (lidx
>= group_size
)
1918 if (bitmap_bit_p (load_index
, lidx
))
1921 bitmap_set_bit (load_index
, lidx
);
1925 for (i
= 0; i
< group_size
; i
++)
1926 if (!bitmap_bit_p (load_index
, i
))
1929 /* This permutation is valid for reduction. Since the order of the
1930 statements in the nodes is not important unless they are memory
1931 accesses, we can rearrange the statements in all the nodes
1932 according to the order of the loads. */
1934 /* We have to unshare the SLP tree we modify. */
1935 hash_map
<slp_tree
, slp_tree
> map
;
1936 slp_tree unshared
= slp_copy_subtree (SLP_INSTANCE_TREE (slp_instn
), map
);
1937 vect_free_slp_tree (SLP_INSTANCE_TREE (slp_instn
), false);
1939 SLP_INSTANCE_TREE (slp_instn
) = unshared
;
1940 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn
), i
, node
)
1941 SLP_INSTANCE_LOADS (slp_instn
)[i
] = *map
.get (node
);
1942 node
= SLP_INSTANCE_LOADS (slp_instn
)[0];
1944 /* Do the actual re-arrangement. */
1945 hash_set
<slp_tree
> visited
;
1946 vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn
), group_size
,
1947 node
->load_permutation
, visited
);
1949 /* We are done, no actual permutations need to be generated. */
1950 poly_uint64 unrolling_factor
= SLP_INSTANCE_UNROLLING_FACTOR (slp_instn
);
1951 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn
), i
, node
)
1953 stmt_vec_info first_stmt_info
= SLP_TREE_SCALAR_STMTS (node
)[0];
1954 first_stmt_info
= DR_GROUP_FIRST_ELEMENT (first_stmt_info
);
1955 /* But we have to keep those permutations that are required because
1956 of handling of gaps. */
1957 if (known_eq (unrolling_factor
, 1U)
1958 || (group_size
== DR_GROUP_SIZE (first_stmt_info
)
1959 && DR_GROUP_GAP (first_stmt_info
) == 0))
1960 SLP_TREE_LOAD_PERMUTATION (node
).release ();
1962 for (j
= 0; j
< SLP_TREE_LOAD_PERMUTATION (node
).length (); ++j
)
1963 SLP_TREE_LOAD_PERMUTATION (node
)[j
] = j
;
1969 /* Gather loads in the SLP graph NODE and populate the INST loads array. */
1972 vect_gather_slp_loads (vec
<slp_tree
> &loads
, slp_tree node
,
1973 hash_set
<slp_tree
> &visited
)
1975 if (visited
.add (node
))
1978 if (SLP_TREE_CHILDREN (node
).length () == 0)
1980 if (SLP_TREE_DEF_TYPE (node
) != vect_internal_def
)
1982 stmt_vec_info stmt_info
= SLP_TREE_SCALAR_STMTS (node
)[0];
1983 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1984 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info
)))
1985 loads
.safe_push (node
);
1991 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
1992 vect_gather_slp_loads (loads
, child
, visited
);
1997 vect_gather_slp_loads (slp_instance inst
, slp_tree node
)
1999 hash_set
<slp_tree
> visited
;
2000 vect_gather_slp_loads (SLP_INSTANCE_LOADS (inst
), node
, visited
);
2004 /* Find the last store in SLP INSTANCE. */
2007 vect_find_last_scalar_stmt_in_slp (slp_tree node
)
2009 stmt_vec_info last
= NULL
;
2010 stmt_vec_info stmt_vinfo
;
2012 for (int i
= 0; SLP_TREE_SCALAR_STMTS (node
).iterate (i
, &stmt_vinfo
); i
++)
2014 stmt_vinfo
= vect_orig_stmt (stmt_vinfo
);
2015 last
= last
? get_later_stmt (stmt_vinfo
, last
) : stmt_vinfo
;
2021 /* Find the first stmt in NODE. */
2024 vect_find_first_scalar_stmt_in_slp (slp_tree node
)
2026 stmt_vec_info first
= NULL
;
2027 stmt_vec_info stmt_vinfo
;
2029 for (int i
= 0; SLP_TREE_SCALAR_STMTS (node
).iterate (i
, &stmt_vinfo
); i
++)
2031 stmt_vinfo
= vect_orig_stmt (stmt_vinfo
);
2033 || get_later_stmt (stmt_vinfo
, first
) == first
)
2040 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
2041 two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
2042 (also containing the first GROUP1_SIZE stmts, since stores are
2043 consecutive), the second containing the remainder.
2044 Return the first stmt in the second group. */
2046 static stmt_vec_info
2047 vect_split_slp_store_group (stmt_vec_info first_vinfo
, unsigned group1_size
)
2049 gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo
) == first_vinfo
);
2050 gcc_assert (group1_size
> 0);
2051 int group2_size
= DR_GROUP_SIZE (first_vinfo
) - group1_size
;
2052 gcc_assert (group2_size
> 0);
2053 DR_GROUP_SIZE (first_vinfo
) = group1_size
;
2055 stmt_vec_info stmt_info
= first_vinfo
;
2056 for (unsigned i
= group1_size
; i
> 1; i
--)
2058 stmt_info
= DR_GROUP_NEXT_ELEMENT (stmt_info
);
2059 gcc_assert (DR_GROUP_GAP (stmt_info
) == 1);
2061 /* STMT is now the last element of the first group. */
2062 stmt_vec_info group2
= DR_GROUP_NEXT_ELEMENT (stmt_info
);
2063 DR_GROUP_NEXT_ELEMENT (stmt_info
) = 0;
2065 DR_GROUP_SIZE (group2
) = group2_size
;
2066 for (stmt_info
= group2
; stmt_info
;
2067 stmt_info
= DR_GROUP_NEXT_ELEMENT (stmt_info
))
2069 DR_GROUP_FIRST_ELEMENT (stmt_info
) = group2
;
2070 gcc_assert (DR_GROUP_GAP (stmt_info
) == 1);
2073 /* For the second group, the DR_GROUP_GAP is that before the original group,
2074 plus skipping over the first vector. */
2075 DR_GROUP_GAP (group2
) = DR_GROUP_GAP (first_vinfo
) + group1_size
;
2077 /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
2078 DR_GROUP_GAP (first_vinfo
) += group2_size
;
2080 if (dump_enabled_p ())
2081 dump_printf_loc (MSG_NOTE
, vect_location
, "Split group into %d and %d\n",
2082 group1_size
, group2_size
);
2087 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
2088 statements and a vector of NUNITS elements. */
2091 calculate_unrolling_factor (poly_uint64 nunits
, unsigned int group_size
)
2093 return exact_div (common_multiple (nunits
, group_size
), group_size
);
2096 /* Analyze an SLP instance starting from a group of grouped stores. Call
2097 vect_build_slp_tree to build a tree of packed stmts if possible.
2098 Return FALSE if it's impossible to SLP any stmt in the loop. */
2101 vect_analyze_slp_instance (vec_info
*vinfo
,
2102 scalar_stmts_to_slp_tree_map_t
*bst_map
,
2103 stmt_vec_info stmt_info
, unsigned max_tree_size
)
2105 slp_instance new_instance
;
2107 unsigned int group_size
;
2108 tree vectype
, scalar_type
= NULL_TREE
;
2110 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
2111 vec
<stmt_vec_info
> scalar_stmts
;
2112 bool constructor
= false;
2114 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
2116 scalar_type
= TREE_TYPE (DR_REF (dr
));
2117 group_size
= DR_GROUP_SIZE (stmt_info
);
2118 vectype
= get_vectype_for_scalar_type (vinfo
, scalar_type
, group_size
);
2120 else if (!dr
&& REDUC_GROUP_FIRST_ELEMENT (stmt_info
))
2122 gcc_assert (is_a
<loop_vec_info
> (vinfo
));
2123 vectype
= STMT_VINFO_VECTYPE (stmt_info
);
2124 group_size
= REDUC_GROUP_SIZE (stmt_info
);
2126 else if (is_gimple_assign (stmt_info
->stmt
)
2127 && gimple_assign_rhs_code (stmt_info
->stmt
) == CONSTRUCTOR
)
2129 vectype
= TREE_TYPE (gimple_assign_rhs1 (stmt_info
->stmt
));
2130 group_size
= CONSTRUCTOR_NELTS (gimple_assign_rhs1 (stmt_info
->stmt
));
2135 gcc_assert (is_a
<loop_vec_info
> (vinfo
));
2136 vectype
= STMT_VINFO_VECTYPE (stmt_info
);
2137 group_size
= as_a
<loop_vec_info
> (vinfo
)->reductions
.length ();
2142 if (dump_enabled_p ())
2143 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2144 "Build SLP failed: unsupported data-type %T\n",
2149 poly_uint64 nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
2151 /* Create a node (a root of the SLP tree) for the packed grouped stores. */
2152 scalar_stmts
.create (group_size
);
2153 stmt_vec_info next_info
= stmt_info
;
2154 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
2156 /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS. */
2159 scalar_stmts
.safe_push (vect_stmt_to_vectorize (next_info
));
2160 next_info
= DR_GROUP_NEXT_ELEMENT (next_info
);
2163 else if (!dr
&& REDUC_GROUP_FIRST_ELEMENT (stmt_info
))
2165 /* Collect the reduction stmts and store them in
2166 SLP_TREE_SCALAR_STMTS. */
2169 scalar_stmts
.safe_push (vect_stmt_to_vectorize (next_info
));
2170 next_info
= REDUC_GROUP_NEXT_ELEMENT (next_info
);
2172 /* Mark the first element of the reduction chain as reduction to properly
2173 transform the node. In the reduction analysis phase only the last
2174 element of the chain is marked as reduction. */
2175 STMT_VINFO_DEF_TYPE (stmt_info
)
2176 = STMT_VINFO_DEF_TYPE (scalar_stmts
.last ());
2177 STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info
))
2178 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts
.last ()));
2180 else if (constructor
)
2182 tree rhs
= gimple_assign_rhs1 (stmt_info
->stmt
);
2184 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs
), i
, val
)
2186 if (TREE_CODE (val
) == SSA_NAME
)
2188 gimple
* def
= SSA_NAME_DEF_STMT (val
);
2189 stmt_vec_info def_info
= vinfo
->lookup_stmt (def
);
2190 /* Value is defined in another basic block. */
2193 def_info
= vect_stmt_to_vectorize (def_info
);
2194 scalar_stmts
.safe_push (def_info
);
2199 if (dump_enabled_p ())
2200 dump_printf_loc (MSG_NOTE
, vect_location
,
2201 "Analyzing vectorizable constructor: %G\n",
2206 /* Collect reduction statements. */
2207 vec
<stmt_vec_info
> reductions
= as_a
<loop_vec_info
> (vinfo
)->reductions
;
2208 for (i
= 0; reductions
.iterate (i
, &next_info
); i
++)
2209 scalar_stmts
.safe_push (next_info
);
2212 if (dump_enabled_p ())
2214 dump_printf_loc (MSG_NOTE
, vect_location
,
2215 "Starting SLP discovery for\n");
2216 for (i
= 0; i
< scalar_stmts
.length (); ++i
)
2217 dump_printf_loc (MSG_NOTE
, vect_location
,
2218 " %G", scalar_stmts
[i
]->stmt
);
2221 /* Build the tree for the SLP instance. */
2222 bool *matches
= XALLOCAVEC (bool, group_size
);
2223 unsigned npermutes
= 0;
2224 poly_uint64 max_nunits
= nunits
;
2225 unsigned tree_size
= 0;
2226 node
= vect_build_slp_tree (vinfo
, scalar_stmts
, group_size
,
2227 &max_nunits
, matches
, &npermutes
,
2228 &tree_size
, bst_map
);
2231 /* Calculate the unrolling factor based on the smallest type. */
2232 poly_uint64 unrolling_factor
2233 = calculate_unrolling_factor (max_nunits
, group_size
);
2235 if (maybe_ne (unrolling_factor
, 1U)
2236 && is_a
<bb_vec_info
> (vinfo
))
2238 unsigned HOST_WIDE_INT const_max_nunits
;
2239 if (!max_nunits
.is_constant (&const_max_nunits
)
2240 || const_max_nunits
> group_size
)
2242 if (dump_enabled_p ())
2243 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2244 "Build SLP failed: store group "
2245 "size not a multiple of the vector size "
2246 "in basic block SLP\n");
2247 vect_free_slp_tree (node
, false);
2250 /* Fatal mismatch. */
2251 if (dump_enabled_p ())
2252 dump_printf_loc (MSG_NOTE
, vect_location
,
2253 "SLP discovery succeeded but node needs "
2256 matches
[group_size
/ const_max_nunits
* const_max_nunits
] = false;
2257 vect_free_slp_tree (node
, false);
2261 /* Create a new SLP instance. */
2262 new_instance
= XNEW (class _slp_instance
);
2263 SLP_INSTANCE_TREE (new_instance
) = node
;
2264 SLP_INSTANCE_UNROLLING_FACTOR (new_instance
) = unrolling_factor
;
2265 SLP_INSTANCE_LOADS (new_instance
) = vNULL
;
2266 SLP_INSTANCE_ROOT_STMT (new_instance
) = constructor
? stmt_info
: NULL
;
2267 new_instance
->reduc_phis
= NULL
;
2268 new_instance
->cost_vec
= vNULL
;
2269 new_instance
->subgraph_entries
= vNULL
;
2271 vect_gather_slp_loads (new_instance
, node
);
2272 if (dump_enabled_p ())
2273 dump_printf_loc (MSG_NOTE
, vect_location
,
2274 "SLP size %u vs. limit %u.\n",
2275 tree_size
, max_tree_size
);
2277 /* Check whether any load is possibly permuted. */
2279 bool loads_permuted
= false;
2280 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (new_instance
), i
, load_node
)
2282 if (!SLP_TREE_LOAD_PERMUTATION (load_node
).exists ())
2285 stmt_vec_info load_info
;
2286 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node
), j
, load_info
)
2287 if (SLP_TREE_LOAD_PERMUTATION (load_node
)[j
] != j
)
2289 loads_permuted
= true;
2294 /* If the loads and stores can be handled with load/store-lane
2295 instructions do not generate this SLP instance. */
2296 if (is_a
<loop_vec_info
> (vinfo
)
2298 && dr
&& vect_store_lanes_supported (vectype
, group_size
, false))
2301 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (new_instance
), i
, load_node
)
2303 stmt_vec_info stmt_vinfo
= DR_GROUP_FIRST_ELEMENT
2304 (SLP_TREE_SCALAR_STMTS (load_node
)[0]);
2305 /* Use SLP for strided accesses (or if we can't load-lanes). */
2306 if (STMT_VINFO_STRIDED_P (stmt_vinfo
)
2307 || ! vect_load_lanes_supported
2308 (STMT_VINFO_VECTYPE (stmt_vinfo
),
2309 DR_GROUP_SIZE (stmt_vinfo
), false))
2312 if (i
== SLP_INSTANCE_LOADS (new_instance
).length ())
2314 if (dump_enabled_p ())
2315 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2316 "Built SLP cancelled: can use "
2317 "load/store-lanes\n");
2318 vect_free_slp_instance (new_instance
, false);
2323 /* If this is a reduction chain with a conversion in front
2324 amend the SLP tree with a node for that. */
2326 && REDUC_GROUP_FIRST_ELEMENT (stmt_info
)
2327 && STMT_VINFO_DEF_TYPE (stmt_info
) != vect_reduction_def
)
2329 /* Get at the conversion stmt - we know it's the single use
2330 of the last stmt of the reduction chain. */
2331 gimple
*tem
= vect_orig_stmt (scalar_stmts
[group_size
- 1])->stmt
;
2332 use_operand_p use_p
;
2334 bool r
= single_imm_use (gimple_assign_lhs (tem
),
2337 next_info
= vinfo
->lookup_stmt (use_stmt
);
2338 next_info
= vect_stmt_to_vectorize (next_info
);
2339 scalar_stmts
= vNULL
;
2340 scalar_stmts
.create (group_size
);
2341 for (unsigned i
= 0; i
< group_size
; ++i
)
2342 scalar_stmts
.quick_push (next_info
);
2343 slp_tree conv
= vect_create_new_slp_node (scalar_stmts
, 1);
2344 SLP_TREE_VECTYPE (conv
) = STMT_VINFO_VECTYPE (next_info
);
2345 SLP_TREE_CHILDREN (conv
).quick_push (node
);
2346 SLP_INSTANCE_TREE (new_instance
) = conv
;
2347 /* We also have to fake this conversion stmt as SLP reduction
2348 group so we don't have to mess with too much code
2350 REDUC_GROUP_FIRST_ELEMENT (next_info
) = next_info
;
2351 REDUC_GROUP_NEXT_ELEMENT (next_info
) = NULL
;
2354 vinfo
->slp_instances
.safe_push (new_instance
);
2356 /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
2357 the number of scalar stmts in the root in a few places.
2358 Verify that assumption holds. */
2359 gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance
))
2360 .length () == group_size
);
2362 if (dump_enabled_p ())
2364 dump_printf_loc (MSG_NOTE
, vect_location
,
2365 "Final SLP tree for instance:\n");
2366 vect_print_slp_graph (MSG_NOTE
, vect_location
,
2367 SLP_INSTANCE_TREE (new_instance
));
2375 /* Failed to SLP. */
2376 /* Free the allocated memory. */
2377 scalar_stmts
.release ();
2380 /* For basic block SLP, try to break the group up into multiples of the
2382 unsigned HOST_WIDE_INT const_nunits
;
2383 if (is_a
<bb_vec_info
> (vinfo
)
2384 && STMT_VINFO_GROUPED_ACCESS (stmt_info
)
2385 && DR_GROUP_FIRST_ELEMENT (stmt_info
)
2386 && nunits
.is_constant (&const_nunits
))
2388 /* We consider breaking the group only on VF boundaries from the existing
2390 for (i
= 0; i
< group_size
; i
++)
2391 if (!matches
[i
]) break;
2393 if (i
>= const_nunits
&& i
< group_size
)
2395 /* Split into two groups at the first vector boundary before i. */
2396 gcc_assert ((const_nunits
& (const_nunits
- 1)) == 0);
2397 unsigned group1_size
= i
& ~(const_nunits
- 1);
2399 if (dump_enabled_p ())
2400 dump_printf_loc (MSG_NOTE
, vect_location
,
2401 "Splitting SLP group at stmt %u\n", i
);
2402 stmt_vec_info rest
= vect_split_slp_store_group (stmt_info
,
2404 bool res
= vect_analyze_slp_instance (vinfo
, bst_map
, stmt_info
,
2406 /* If the first non-match was in the middle of a vector,
2407 skip the rest of that vector. */
2408 if (group1_size
< i
)
2410 i
= group1_size
+ const_nunits
;
2412 rest
= vect_split_slp_store_group (rest
, const_nunits
);
2415 res
|= vect_analyze_slp_instance (vinfo
, bst_map
,
2416 rest
, max_tree_size
);
2419 /* Even though the first vector did not all match, we might be able to SLP
2420 (some) of the remainder. FORNOW ignore this possibility. */
2423 /* Failed to SLP. */
2424 if (dump_enabled_p ())
2425 dump_printf_loc (MSG_NOTE
, vect_location
, "SLP discovery failed\n");
2430 /* Check if there are stmts in the loop can be vectorized using SLP. Build SLP
2431 trees of packed scalar stmts if SLP is possible. */
/* NOTE(review): this extract is line-mangled and missing interior lines
   (gaps in the embedded original numbering); code is preserved verbatim,
   only comments are added.  */
2434 vect_analyze_slp (vec_info
*vinfo
, unsigned max_tree_size
)
2437 stmt_vec_info first_element
;
2439 DUMP_VECT_SCOPE ("vect_analyze_slp");
2441 scalar_stmts_to_slp_tree_map_t
*bst_map
2442 = new scalar_stmts_to_slp_tree_map_t ();
2444 /* Find SLP sequences starting from groups of grouped stores. */
2445 FOR_EACH_VEC_ELT (vinfo
->grouped_stores
, i
, first_element
)
2446 vect_analyze_slp_instance (vinfo
, bst_map
, first_element
, max_tree_size
);
/* For loop vectorization, additionally seed SLP discovery from reduction
   chains and from groups of reductions.  */
2448 if (loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
))
2450 if (loop_vinfo
->reduction_chains
.length () > 0)
2452 /* Find SLP sequences starting from reduction chains. */
2453 FOR_EACH_VEC_ELT (loop_vinfo
->reduction_chains
, i
, first_element
)
2454 if (! vect_analyze_slp_instance (vinfo
, bst_map
, first_element
,
2457 /* Dissolve reduction chain group. */
2458 stmt_vec_info vinfo
= first_element
;
2459 stmt_vec_info last
= NULL
;
2462 stmt_vec_info next
= REDUC_GROUP_NEXT_ELEMENT (vinfo
);
2463 REDUC_GROUP_FIRST_ELEMENT (vinfo
) = NULL
;
2464 REDUC_GROUP_NEXT_ELEMENT (vinfo
) = NULL
;
2468 STMT_VINFO_DEF_TYPE (first_element
) = vect_internal_def
;
2469 /* It can be still vectorized as part of an SLP reduction. */
2470 loop_vinfo
->reductions
.safe_push (last
);
2474 /* Find SLP sequences starting from groups of reductions. */
2475 if (loop_vinfo
->reductions
.length () > 1)
2476 vect_analyze_slp_instance (vinfo
, bst_map
, loop_vinfo
->reductions
[0],
2480 /* The map keeps a reference on SLP nodes built, release that. */
2481 for (scalar_stmts_to_slp_tree_map_t::iterator it
= bst_map
->begin ();
2482 it
!= bst_map
->end (); ++it
)
2484 vect_free_slp_tree ((*it
).second
, false);
2487 /* Optimize permutations in SLP reductions. */
2488 slp_instance instance
;
2489 FOR_EACH_VEC_ELT (vinfo
->slp_instances
, i
, instance
)
2491 slp_tree node
= SLP_INSTANCE_TREE (instance
);
2492 stmt_vec_info stmt_info
= SLP_TREE_SCALAR_STMTS (node
)[0];
2493 /* Reduction (there are no data-refs in the root).
2494 In reduction chain the order of the loads is not important. */
2495 if (!STMT_VINFO_DATA_REF (stmt_info
)
2496 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info
))
2497 vect_attempt_slp_rearrange_stmts (instance
);
2500 /* Gather all loads in the SLP graph. */
2501 hash_set
<slp_tree
> visited
;
2502 FOR_EACH_VEC_ELT (vinfo
->slp_instances
, i
, instance
)
2503 vect_gather_slp_loads (vinfo
->slp_loads
, SLP_INSTANCE_TREE (instance
),
2506 return opt_result::success ();
/* Simplify load permutations on the gathered SLP loads: in BB SLP a
   permutation that is a contiguous subchain of an interleaving group is
   dropped; in loop SLP an identity permutation with no exposed gap is
   dropped.  NOTE(review): extract is missing interior lines; code kept
   verbatim.  */
2510 vect_optimize_slp (vec_info
*vinfo
)
2514 FOR_EACH_VEC_ELT (vinfo
->slp_loads
, i
, node
)
2516 if (!SLP_TREE_LOAD_PERMUTATION (node
).exists ())
2519 /* In basic block vectorization we allow any subchain of an interleaving
2521 FORNOW: not in loop SLP because of realignment complications. */
2522 if (is_a
<bb_vec_info
> (vinfo
))
2524 bool subchain_p
= true;
2525 stmt_vec_info next_load_info
= NULL
;
2526 stmt_vec_info load_info
;
2528 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), j
, load_info
)
2531 && (next_load_info
!= load_info
2532 || DR_GROUP_GAP (load_info
) != 1))
2537 next_load_info
= DR_GROUP_NEXT_ELEMENT (load_info
);
2541 SLP_TREE_LOAD_PERMUTATION (node
).release ();
2547 stmt_vec_info load_info
;
2548 bool this_load_permuted
= false;
2550 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), j
, load_info
)
2551 if (SLP_TREE_LOAD_PERMUTATION (node
)[j
] != j
)
2553 this_load_permuted
= true;
2556 stmt_vec_info first_stmt_info
2557 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node
)[0]);
2558 if (!this_load_permuted
2559 /* The load requires permutation when unrolling exposes
2560 a gap either because the group is larger than the SLP
2561 group-size or because there is a gap between the groups. */
2562 && (known_eq (LOOP_VINFO_VECT_FACTOR (as_a
<loop_vec_info
> (vinfo
)), 1U)
2563 || ((SLP_TREE_LANES (node
) == DR_GROUP_SIZE (first_stmt_info
))
2564 && DR_GROUP_GAP (first_stmt_info
) == 0)))
2566 SLP_TREE_LOAD_PERMUTATION (node
).release ();
2574 /* For each possible SLP instance decide whether to SLP it and calculate overall
2575 unrolling factor needed to SLP the loop. Return TRUE if decided to SLP at
2576 least one instance. */
/* NOTE(review): the increment of decided_to_slp and the unrolling_factor
   assignment LHS appear to be on lines missing from this extract.  */
2579 vect_make_slp_decision (loop_vec_info loop_vinfo
)
2582 poly_uint64 unrolling_factor
= 1;
2583 vec
<slp_instance
> slp_instances
= LOOP_VINFO_SLP_INSTANCES (loop_vinfo
);
2584 slp_instance instance
;
2585 int decided_to_slp
= 0;
2587 DUMP_VECT_SCOPE ("vect_make_slp_decision");
2589 FOR_EACH_VEC_ELT (slp_instances
, i
, instance
)
2591 /* FORNOW: SLP if you can. */
2592 /* All unroll factors have the form:
2594 GET_MODE_SIZE (vinfo->vector_mode) * X
2596 for some rational X, so they must have a common multiple. */
2598 = force_common_multiple (unrolling_factor
,
2599 SLP_INSTANCE_UNROLLING_FACTOR (instance
));
2601 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
2602 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
2603 loop-based vectorization. Such stmts will be marked as HYBRID. */
2604 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance
));
2608 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo
) = unrolling_factor
;
2610 if (decided_to_slp
&& dump_enabled_p ())
2612 dump_printf_loc (MSG_NOTE
, vect_location
,
2613 "Decided to SLP %d instances. Unrolling factor ",
2615 dump_dec (MSG_NOTE
, unrolling_factor
);
2616 dump_printf (MSG_NOTE
, "\n");
2619 return (decided_to_slp
> 0);
2622 /* Private data for vect_detect_hybrid_slp. */
/* NOTE(review): the enclosing 'struct vdhs_data' declaration line is
   missing from this extract; only the two members are visible.  */
/* The loop being analyzed; used to look up stmt_vec_infos of SSA defs.  */
2625 loop_vec_info loop_vinfo
;
/* Worklist of stmts whose operands still need to be walked.  */
2626 vec
<stmt_vec_info
> *worklist
;
2629 /* Walker for walk_gimple_op. */
/* Marks a pure-SLP def stmt reached from a non-SLP use as 'hybrid' and
   queues it so its own operands get walked too.  */
2632 vect_detect_hybrid_slp (tree
*tp
, int *, void *data
)
2634 walk_stmt_info
*wi
= (walk_stmt_info
*)data
;
2635 vdhs_data
*dat
= (vdhs_data
*)wi
->info
;
2640 stmt_vec_info def_stmt_info
= dat
->loop_vinfo
->lookup_def (*tp
);
2643 def_stmt_info
= vect_stmt_to_vectorize (def_stmt_info
);
2644 if (PURE_SLP_STMT (def_stmt_info
))
2646 if (dump_enabled_p ())
2647 dump_printf_loc (MSG_NOTE
, vect_location
, "marking hybrid: %G",
2648 def_stmt_info
->stmt
);
2649 STMT_SLP_TYPE (def_stmt_info
) = hybrid
;
2650 dat
->worklist
->safe_push (def_stmt_info
);
2656 /* Find stmts that must be both vectorized and SLPed. */
/* NOTE(review): extract is missing interior lines (e.g. the 'vdhs_data dat;'
   and 'walk_stmt_info wi;' declarations referenced below); code kept
   verbatim, only comments added.  */
2659 vect_detect_hybrid_slp (loop_vec_info loop_vinfo
)
2661 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
2663 /* All stmts participating in SLP are marked pure_slp, all other
2664 stmts are loop_vect.
2665 First collect all loop_vect stmts into a worklist. */
2666 auto_vec
<stmt_vec_info
> worklist
;
2667 for (unsigned i
= 0; i
< LOOP_VINFO_LOOP (loop_vinfo
)->num_nodes
; ++i
)
2669 basic_block bb
= LOOP_VINFO_BBS (loop_vinfo
)[i
];
2670 for (gphi_iterator gsi
= gsi_start_phis (bb
); !gsi_end_p (gsi
);
2673 gphi
*phi
= gsi
.phi ();
2674 stmt_vec_info stmt_info
= loop_vinfo
->lookup_stmt (phi
);
2675 if (!STMT_SLP_TYPE (stmt_info
) && STMT_VINFO_RELEVANT (stmt_info
))
2676 worklist
.safe_push (stmt_info
);
2678 for (gimple_stmt_iterator gsi
= gsi_start_bb (bb
); !gsi_end_p (gsi
);
2681 gimple
*stmt
= gsi_stmt (gsi
);
2682 if (is_gimple_debug (stmt
))
2684 stmt_vec_info stmt_info
= loop_vinfo
->lookup_stmt (stmt
);
/* Pattern stmts: scan the pattern def sequence and redirect to the
   related (pattern) stmt before the relevance check.  */
2685 if (STMT_VINFO_IN_PATTERN_P (stmt_info
))
2687 for (gimple_stmt_iterator gsi2
2688 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info
));
2689 !gsi_end_p (gsi2
); gsi_next (&gsi2
))
2691 stmt_vec_info patt_info
2692 = loop_vinfo
->lookup_stmt (gsi_stmt (gsi2
));
2693 if (!STMT_SLP_TYPE (patt_info
)
2694 && STMT_VINFO_RELEVANT (patt_info
))
2695 worklist
.safe_push (patt_info
);
2697 stmt_info
= STMT_VINFO_RELATED_STMT (stmt_info
);
2699 if (!STMT_SLP_TYPE (stmt_info
) && STMT_VINFO_RELEVANT (stmt_info
))
2700 worklist
.safe_push (stmt_info
);
2704 /* Now we have a worklist of non-SLP stmts, follow use->def chains and
2705 mark any SLP vectorized stmt as hybrid.
2706 ??? We're visiting def stmts N times (once for each non-SLP and
2707 once for each hybrid-SLP use). */
2710 dat
.worklist
= &worklist
;
2711 dat
.loop_vinfo
= loop_vinfo
;
2712 memset (&wi
, 0, sizeof (wi
));
2713 wi
.info
= (void *)&dat
;
2714 while (!worklist
.is_empty ())
2716 stmt_vec_info stmt_info
= worklist
.pop ();
2717 /* Since SSA operands are not set up for pattern stmts we need
2718 to use walk_gimple_op. */
2720 walk_gimple_op (stmt_info
->stmt
, vect_detect_hybrid_slp
, &wi
);
2725 /* Initialize a bb_vec_info struct for the statements between
2726 REGION_BEGIN_IN (inclusive) and REGION_END_IN (exclusive). */
/* Marks region stmts with uid 0 (stmts outside keep uid -1, see the
   destructor); debug stmts get special handling on lines missing from
   this extract.  */
2728 _bb_vec_info::_bb_vec_info (gimple_stmt_iterator region_begin_in
,
2729 gimple_stmt_iterator region_end_in
,
2730 vec_info_shared
*shared
)
2731 : vec_info (vec_info::bb
, init_cost (NULL
), shared
),
2732 bb (gsi_bb (region_begin_in
)),
2733 region_begin (region_begin_in
),
2734 region_end (region_end_in
)
2736 for (gimple
*stmt
: this->region_stmts ())
2738 gimple_set_uid (stmt
, 0);
2739 if (is_gimple_debug (stmt
))
2748 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
2749 stmts in the basic block. */
2751 _bb_vec_info::~_bb_vec_info ()
2753 for (gimple
*stmt
: this->region_stmts ())
2754 /* Reset region marker. */
2755 gimple_set_uid (stmt
, -1);
2760 /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
2761 given then that child nodes have already been processed, and that
2762 their def types currently match their SLP node's def type. */
/* NOTE(review): extract is missing interior lines (e.g. the 'vf' and
   'dummy' declarations used below); code kept verbatim.  */
2765 vect_slp_analyze_node_operations_1 (vec_info
*vinfo
, slp_tree node
,
2766 slp_instance node_instance
,
2767 stmt_vector_for_cost
*cost_vec
)
2769 stmt_vec_info stmt_info
= SLP_TREE_REPRESENTATIVE (node
);
2770 gcc_assert (STMT_SLP_TYPE (stmt_info
) != loop_vect
);
2772 /* Calculate the number of vector statements to be created for the
2773 scalar stmts in this node. For SLP reductions it is equal to the
2774 number of vector statements in the children (which has already been
2775 calculated by the recursive call). Otherwise it is the number of
2776 scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
2777 VF divided by the number of elements in a vector. */
2778 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info
)
2779 && REDUC_GROUP_FIRST_ELEMENT (stmt_info
))
2781 for (unsigned i
= 0; i
< SLP_TREE_CHILDREN (node
).length (); ++i
)
2782 if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node
)[i
]) == vect_internal_def
)
2784 SLP_TREE_NUMBER_OF_VEC_STMTS (node
)
2785 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node
)[i
]);
2792 if (loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
))
2793 vf
= loop_vinfo
->vectorization_factor
;
2796 unsigned int group_size
= SLP_TREE_LANES (node
);
2797 tree vectype
= SLP_TREE_VECTYPE (node
);
2798 SLP_TREE_NUMBER_OF_VEC_STMTS (node
)
2799 = vect_get_num_vectors (vf
* group_size
, vectype
);
2802 /* Handle purely internal nodes. */
2803 if (SLP_TREE_CODE (node
) == VEC_PERM_EXPR
)
2804 return vectorizable_slp_permutation (vinfo
, NULL
, node
, cost_vec
);
2807 return vect_analyze_stmt (vinfo
, stmt_info
, &dummy
,
2808 node
, node_instance
, cost_vec
);
2811 /* Try to build NODE from scalars, returning true on success.
2812 NODE_INSTANCE is the SLP instance that contains NODE. */
/* Only applies to BB SLP, never to the instance root, and never to
   nodes containing pattern stmts; on success flips the node to
   vect_external_def and records the scalar lhs of each lane.  */
2815 vect_slp_convert_to_external (vec_info
*vinfo
, slp_tree node
,
2816 slp_instance node_instance
)
2818 stmt_vec_info stmt_info
;
2821 if (!is_a
<bb_vec_info
> (vinfo
)
2822 || node
== SLP_INSTANCE_TREE (node_instance
)
2823 || !SLP_TREE_SCALAR_STMTS (node
).exists ()
2824 || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node
)))
2827 if (dump_enabled_p ())
2828 dump_printf_loc (MSG_NOTE
, vect_location
,
2829 "Building vector operands from scalars instead\n");
2831 /* Don't remove and free the child nodes here, since they could be
2832 referenced by other structures. The analysis and scheduling phases
2833 (need to) ignore child nodes of anything that isn't vect_internal_def. */
2834 unsigned int group_size
= SLP_TREE_LANES (node
);
2835 SLP_TREE_DEF_TYPE (node
) = vect_external_def
;
2836 SLP_TREE_SCALAR_OPS (node
).safe_grow (group_size
, true);
2837 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
2839 tree lhs
= gimple_get_lhs (vect_orig_stmt (stmt_info
)->stmt
);
2840 SLP_TREE_SCALAR_OPS (node
)[i
] = lhs
;
2845 /* Compute the prologue cost for invariant or constant operands represented
/* NOTE(review): the rest of this header comment and several interior
   lines (e.g. the 'nelt' declaration/increment) are missing from this
   extract; code kept verbatim.  */
2849 vect_prologue_cost_for_slp (slp_tree node
,
2850 stmt_vector_for_cost
*cost_vec
)
2852 /* There's a special case of an existing vector, that costs nothing. */
2853 if (SLP_TREE_SCALAR_OPS (node
).length () == 0
2854 && !SLP_TREE_VEC_DEFS (node
).is_empty ())
2856 /* Without looking at the actual initializer a vector of
2857 constants can be implemented as load from the constant pool.
2858 When all elements are the same we can use a splat. */
2859 tree vectype
= SLP_TREE_VECTYPE (node
);
2860 unsigned group_size
= SLP_TREE_SCALAR_OPS (node
).length ();
2861 unsigned num_vects_to_check
;
2862 unsigned HOST_WIDE_INT const_nunits
;
2863 unsigned nelt_limit
;
2864 if (TYPE_VECTOR_SUBPARTS (vectype
).is_constant (&const_nunits
)
2865 && ! multiple_p (const_nunits
, group_size
))
2867 num_vects_to_check
= SLP_TREE_NUMBER_OF_VEC_STMTS (node
);
2868 nelt_limit
= const_nunits
;
2872 /* If either the vector has variable length or the vectors
2873 are composed of repeated whole groups we only need to
2874 cost construction once. All vectors will be the same. */
2875 num_vects_to_check
= 1;
2876 nelt_limit
= group_size
;
2878 tree elt
= NULL_TREE
;
2880 for (unsigned j
= 0; j
< num_vects_to_check
* nelt_limit
; ++j
)
2882 unsigned si
= j
% group_size
;
2884 elt
= SLP_TREE_SCALAR_OPS (node
)[si
];
2885 /* ??? We're just tracking whether all operands of a single
2886 vector initializer are the same, ideally we'd check if
2887 we emitted the same one already. */
2888 else if (elt
!= SLP_TREE_SCALAR_OPS (node
)[si
])
2891 if (nelt
== nelt_limit
)
2893 record_stmt_cost (cost_vec
, 1,
2894 SLP_TREE_DEF_TYPE (node
) == vect_external_def
2895 ? (elt
? scalar_to_vec
: vec_construct
)
2897 NULL
, vectype
, 0, vect_prologue
);
2903 /* Analyze statements contained in SLP tree NODE after recursively analyzing
2904 the subtree. NODE_INSTANCE contains NODE and VINFO contains INSTANCE.
2906 Return true if the operations are supported. */
/* NOTE(review): extract is missing interior lines (e.g. 'i'/'j'/'child'/
   'res'/'vf' declarations used below); code kept verbatim.  */
2909 vect_slp_analyze_node_operations (vec_info
*vinfo
, slp_tree node
,
2910 slp_instance node_instance
,
2911 hash_set
<slp_tree
> &visited
,
2912 hash_set
<slp_tree
> &lvisited
,
2913 stmt_vector_for_cost
*cost_vec
)
2918 /* Assume we can code-generate all invariants. */
2919 if (SLP_TREE_DEF_TYPE (node
) != vect_internal_def
)
2922 /* If we already analyzed the exact same set of scalar stmts we're done.
2923 We share the generated vector stmts for those.
2924 The SLP graph is acyclic so not caching whether we failed or succeeded
2925 doesn't result in any issue since we throw away the lvisited set
2927 if (visited
.contains (node
)
2928 || lvisited
.contains (node
))
/* Recurse into children first (post-order analysis).  */
2932 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
2934 res
= vect_slp_analyze_node_operations (vinfo
, child
, node_instance
,
2935 visited
, lvisited
, cost_vec
);
2942 res
= vect_slp_analyze_node_operations_1 (vinfo
, node
, node_instance
,
2945 lvisited
.add (node
);
2948 /* When the node can be vectorized cost invariant nodes it references.
2949 This is not done in DFS order to allow the refering node
2950 vectorizable_* calls to nail down the invariant nodes vector type
2951 and possibly unshare it if it needs a different vector type than
2954 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), j
, child
)
2955 if ((SLP_TREE_DEF_TYPE (child
) == vect_constant_def
2956 || SLP_TREE_DEF_TYPE (child
) == vect_external_def
)
2957 /* Perform usual caching, note code-generation still
2958 code-gens these nodes multiple times but we expect
2959 to CSE them later. */
2960 && !visited
.contains (child
)
2961 && !lvisited
.add (child
))
2963 /* ??? After auditing more code paths make a "default"
2964 and push the vector type from NODE to all children
2965 if it is not already set. */
2966 /* Compute the number of vectors to be generated. */
2967 tree vector_type
= SLP_TREE_VECTYPE (child
);
2970 /* For shifts with a scalar argument we don't need
2971 to cost or code-generate anything.
2972 ??? Represent this more explicitely. */
2973 gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node
))
2974 == shift_vec_info_type
)
2978 unsigned group_size
= SLP_TREE_LANES (child
);
2980 if (loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
))
2981 vf
= loop_vinfo
->vectorization_factor
;
2982 SLP_TREE_NUMBER_OF_VEC_STMTS (child
)
2983 = vect_get_num_vectors (vf
* group_size
, vector_type
);
2984 /* And cost them. */
2985 vect_prologue_cost_for_slp (child
, cost_vec
);
2988 /* If this node or any of its children can't be vectorized, try pruning
2989 the tree here rather than felling the whole thing. */
2990 if (!res
&& vect_slp_convert_to_external (vinfo
, node
, node_instance
))
2992 /* We'll need to revisit this for invariant costing and number
2993 of vectorized stmt setting. */
3001 /* Mark lanes of NODE that are live outside of the basic-block vectorized
3002 region and that can be vectorized using vectorizable_live_operation
3003 with STMT_VINFO_LIVE_P. Not handled live operations will cause the
3004 scalar code computing it to be retained. */
/* NOTE(review): extract is missing interior lines (e.g. 'i', 'use_stmt',
   'child' declarations and some control-flow lines); code kept verbatim.  */
3007 vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo
, slp_tree node
,
3008 slp_instance instance
,
3009 stmt_vector_for_cost
*cost_vec
,
3010 hash_set
<stmt_vec_info
> &svisited
)
3013 stmt_vec_info stmt_info
;
3014 stmt_vec_info last_stmt
= vect_find_last_scalar_stmt_in_slp (node
);
3015 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
3017 stmt_vec_info orig_stmt_info
= vect_orig_stmt (stmt_info
);
3018 if (svisited
.contains (orig_stmt_info
))
3020 bool mark_visited
= true;
3021 gimple
*orig_stmt
= orig_stmt_info
->stmt
;
3022 ssa_op_iter op_iter
;
3023 def_operand_p def_p
;
3024 FOR_EACH_SSA_DEF_OPERAND (def_p
, orig_stmt
, op_iter
, SSA_OP_DEF
)
3026 imm_use_iterator use_iter
;
3028 stmt_vec_info use_stmt_info
;
3029 FOR_EACH_IMM_USE_STMT (use_stmt
, use_iter
, DEF_FROM_PTR (def_p
))
3030 if (!is_gimple_debug (use_stmt
))
3032 use_stmt_info
= bb_vinfo
->lookup_stmt (use_stmt
);
3034 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info
)))
3036 STMT_VINFO_LIVE_P (stmt_info
) = true;
3037 if (vectorizable_live_operation (bb_vinfo
, stmt_info
,
3038 NULL
, node
, instance
, i
,
3040 /* ??? So we know we can vectorize the live stmt
3041 from one SLP node. If we cannot do so from all
3042 or none consistently we'd have to record which
3043 SLP node (and lane) we want to use for the live
3044 operation. So make sure we can code-generate
3046 mark_visited
= false;
3048 STMT_VINFO_LIVE_P (stmt_info
) = false;
3049 BREAK_FROM_IMM_USE_STMT (use_iter
);
3052 /* We have to verify whether we can insert the lane extract
3053 before all uses. The following is a conservative approximation.
3054 We cannot put this into vectorizable_live_operation because
3055 iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
3057 Note that while the fact that we emit code for loads at the
3058 first load should make this a non-problem leafs we construct
3059 from scalars are vectorized after the last scalar def.
3060 ??? If we'd actually compute the insert location during
3061 analysis we could use sth less conservative than the last
3062 scalar stmt in the node for the dominance check. */
3063 /* ??? What remains is "live" uses in vector CTORs in the same
3064 SLP graph which is where those uses can end up code-generated
3065 right after their definition instead of close to their original
3066 use. But that would restrict us to code-generate lane-extracts
3067 from the latest stmt in a node. So we compensate for this
3068 during code-generation, simply not replacing uses for those
3069 hopefully rare cases. */
3070 if (STMT_VINFO_LIVE_P (stmt_info
))
3071 FOR_EACH_IMM_USE_STMT (use_stmt
, use_iter
, DEF_FROM_PTR (def_p
))
3072 if (!is_gimple_debug (use_stmt
)
3073 && (!(use_stmt_info
= bb_vinfo
->lookup_stmt (use_stmt
))
3074 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info
)))
3075 && !vect_stmt_dominates_stmt_p (last_stmt
->stmt
, use_stmt
))
3077 if (dump_enabled_p ())
3078 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3079 "Cannot determine insertion place for "
3081 STMT_VINFO_LIVE_P (stmt_info
) = false;
3082 mark_visited
= true;
3086 svisited
.add (orig_stmt_info
);
/* Recurse on internal-def children only.  */
3090 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
3091 if (SLP_TREE_DEF_TYPE (child
) == vect_internal_def
)
3092 vect_bb_slp_mark_live_stmts (bb_vinfo
, child
, instance
,
3093 cost_vec
, svisited
);
3096 /* Analyze statements in SLP instances of VINFO. Return true if the
3097 operations are supported. */
/* NOTE(review): extract is missing interior lines (e.g. the 'i'
   declaration and loop-increment/visited-merge lines); code kept
   verbatim.  */
3100 vect_slp_analyze_operations (vec_info
*vinfo
)
3102 slp_instance instance
;
3105 DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
3107 hash_set
<slp_tree
> visited
;
3108 for (i
= 0; vinfo
->slp_instances
.iterate (i
, &instance
); )
3110 hash_set
<slp_tree
> lvisited
;
3111 stmt_vector_for_cost cost_vec
;
3112 cost_vec
.create (2);
3113 if (!vect_slp_analyze_node_operations (vinfo
,
3114 SLP_INSTANCE_TREE (instance
),
3115 instance
, visited
, lvisited
,
3117 /* Instances with a root stmt require vectorized defs for the
3119 || (SLP_INSTANCE_ROOT_STMT (instance
)
3120 && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance
))
3121 != vect_internal_def
)))
/* Analysis failed: drop the instance without advancing 'i'.  */
3123 slp_tree node
= SLP_INSTANCE_TREE (instance
);
3124 stmt_vec_info stmt_info
= SLP_TREE_SCALAR_STMTS (node
)[0];
3125 if (dump_enabled_p ())
3126 dump_printf_loc (MSG_NOTE
, vect_location
,
3127 "removing SLP instance operations starting from: %G",
3129 vect_free_slp_instance (instance
, false);
3130 vinfo
->slp_instances
.ordered_remove (i
);
3131 cost_vec
.release ();
3135 for (hash_set
<slp_tree
>::iterator x
= lvisited
.begin();
3136 x
!= lvisited
.end(); ++x
)
3140 /* Remember the SLP graph entry cost for later. */
3141 instance
->cost_vec
= cost_vec
;
3145 /* Compute vectorizable live stmts. */
3146 if (bb_vec_info bb_vinfo
= dyn_cast
<bb_vec_info
> (vinfo
))
3148 hash_set
<stmt_vec_info
> svisited
;
3149 for (i
= 0; vinfo
->slp_instances
.iterate (i
, &instance
); ++i
)
3150 vect_bb_slp_mark_live_stmts (bb_vinfo
, SLP_INSTANCE_TREE (instance
),
3151 instance
, &instance
->cost_vec
, svisited
);
3154 return !vinfo
->slp_instances
.is_empty ();
3157 /* Get the SLP instance leader from INSTANCE_LEADER thereby transitively
3158 closing the eventual chain. */
/* Path-compression: remember every slot walked, then rewrite them all to
   point at the final leader.  */
3161 get_ultimate_leader (slp_instance instance
,
3162 hash_map
<slp_instance
, slp_instance
> &instance_leader
)
3164 auto_vec
<slp_instance
*, 8> chain
;
3166 while (*(tem
= instance_leader
.get (instance
)) != instance
)
3168 chain
.safe_push (tem
);
3171 while (!chain
.is_empty ())
3172 *chain
.pop () = instance
;
3176 /* Worker of vect_bb_partition_graph, recurse on NODE. */
/* NOTE(review): extract is missing interior lines (e.g. 'i'/'existed_p'/
   'child' declarations); code kept verbatim.  */
3179 vect_bb_partition_graph_r (bb_vec_info bb_vinfo
,
3180 slp_instance instance
, slp_tree node
,
3181 hash_map
<stmt_vec_info
, slp_instance
> &stmt_to_instance
,
3182 hash_map
<slp_instance
, slp_instance
> &instance_leader
)
3184 stmt_vec_info stmt_info
;
3187 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
3190 slp_instance
&stmt_instance
3191 = stmt_to_instance
.get_or_insert (stmt_info
, &existed_p
)
;
3194 else if (stmt_instance
!= instance
)
3196 /* If we're running into a previously marked stmt make us the
3197 leader of the current ultimate leader. This keeps the
3198 leader chain acyclic and works even when the current instance
3199 connects two previously independent graph parts. */
3200 slp_instance stmt_leader
3201 = get_ultimate_leader (stmt_instance
, instance_leader
);
3202 if (stmt_leader
!= instance
)
3203 instance_leader
.put (stmt_leader
, instance
);
3205 stmt_instance
= instance
;
3207 /* If not all stmts had been visited we have to recurse on children. */
3212 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
3213 if (SLP_TREE_DEF_TYPE (child
) == vect_internal_def
)
3214 vect_bb_partition_graph_r (bb_vinfo
, instance
, child
, stmt_to_instance
,
3218 /* Partition the SLP graph into pieces that can be costed independently. */
3221 vect_bb_partition_graph (bb_vec_info bb_vinfo
)
3223 DUMP_VECT_SCOPE ("vect_bb_partition_graph");
3225 /* First walk the SLP graph assigning each involved scalar stmt a
3226 corresponding SLP graph entry and upon visiting a previously
3227 marked stmt, make the stmts leader the current SLP graph entry. */
3228 hash_map
<stmt_vec_info
, slp_instance
> stmt_to_instance
;
3229 hash_map
<slp_instance
, slp_instance
> instance_leader
;
3230 slp_instance instance
;
3231 for (unsigned i
= 0; bb_vinfo
->slp_instances
.iterate (i
, &instance
); ++i
)
/* Each instance starts as its own leader; the recursive walk merges
   leaders when instances share scalar stmts.  */
3233 instance_leader
.put (instance
, instance
);
3234 vect_bb_partition_graph_r (bb_vinfo
,
3235 instance
, SLP_INSTANCE_TREE (instance
),
3236 stmt_to_instance
, instance_leader
);
3239 /* Then collect entries to each independent subgraph. */
3240 for (unsigned i
= 0; bb_vinfo
->slp_instances
.iterate (i
, &instance
); ++i
)
3242 slp_instance leader
= get_ultimate_leader (instance
, instance_leader
);
3243 leader
->subgraph_entries
.safe_push (instance
);
3244 if (dump_enabled_p ()
3245 && leader
!= instance
)
3246 dump_printf_loc (MSG_NOTE
, vect_location
,
3247 "instance %p is leader of %p\n",
3252 /* Compute the scalar cost of the SLP node NODE and its children
3253 and return it. Do not account defs that are marked in LIFE and
3254 update LIFE according to uses of NODE. */
/* NOTE(review): extract is missing interior lines (e.g. 'i'/'use_stmt'/
   'child'/'kind' assignments for loads); code kept verbatim.  */
3257 vect_bb_slp_scalar_cost (vec_info
*vinfo
,
3258 slp_tree node
, vec
<bool, va_heap
> *life
,
3259 stmt_vector_for_cost
*cost_vec
,
3260 hash_set
<slp_tree
> &visited
)
3263 stmt_vec_info stmt_info
;
3266 if (visited
.add (node
))
3269 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
3271 ssa_op_iter op_iter
;
3272 def_operand_p def_p
;
3277 stmt_vec_info orig_stmt_info
= vect_orig_stmt (stmt_info
);
3278 gimple
*orig_stmt
= orig_stmt_info
->stmt
;
3280 /* If there is a non-vectorized use of the defs then the scalar
3281 stmt is kept live in which case we do not account it or any
3282 required defs in the SLP children in the scalar cost. This
3283 way we make the vectorization more costly when compared to
3285 if (!STMT_VINFO_LIVE_P (stmt_info
))
3287 FOR_EACH_SSA_DEF_OPERAND (def_p
, orig_stmt
, op_iter
, SSA_OP_DEF
)
3289 imm_use_iterator use_iter
;
3291 FOR_EACH_IMM_USE_STMT (use_stmt
, use_iter
, DEF_FROM_PTR (def_p
))
3292 if (!is_gimple_debug (use_stmt
))
3294 stmt_vec_info use_stmt_info
= vinfo
->lookup_stmt (use_stmt
);
3297 (vect_stmt_to_vectorize (use_stmt_info
)))
3300 BREAK_FROM_IMM_USE_STMT (use_iter
);
3308 /* Count scalar stmts only once. */
3309 if (gimple_visited_p (orig_stmt
))
3311 gimple_set_visited (orig_stmt
, true);
3313 vect_cost_for_stmt kind
;
3314 if (STMT_VINFO_DATA_REF (orig_stmt_info
))
3316 if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info
)))
3319 kind
= scalar_store
;
3321 else if (vect_nop_conversion_p (orig_stmt_info
))
3325 record_stmt_cost (cost_vec
, 1, kind
, orig_stmt_info
, 0, vect_body
);
3328 auto_vec
<bool, 20> subtree_life
;
3329 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
3331 if (SLP_TREE_DEF_TYPE (child
) == vect_internal_def
)
3333 /* Do not directly pass LIFE to the recursive call, copy it to
3334 confine changes in the callee to the current child/subtree. */
3335 if (SLP_TREE_CODE (node
) == VEC_PERM_EXPR
)
/* VEC_PERM: map LIFE through the lane permutation into the child's
   lane numbering.  */
3337 subtree_life
.safe_grow_cleared (SLP_TREE_LANES (child
), true);
3338 for (unsigned j
= 0;
3339 j
< SLP_TREE_LANE_PERMUTATION (node
).length (); ++j
)
3341 auto perm
= SLP_TREE_LANE_PERMUTATION (node
)[j
];
3342 if (perm
.first
== i
)
3343 subtree_life
[perm
.second
] = (*life
)[j
];
3348 gcc_assert (SLP_TREE_LANES (node
) == SLP_TREE_LANES (child
));
3349 subtree_life
.safe_splice (*life
);
3351 vect_bb_slp_scalar_cost (vinfo
, child
, &subtree_life
, cost_vec
,
3353 subtree_life
.truncate (0);
3358 /* Check if vectorization of the basic block is profitable for the
3359 subgraph denoted by SLP_INSTANCES. */
3362 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo
,
3363 vec
<slp_instance
> slp_instances
)
3365 slp_instance instance
;
3367 unsigned int vec_inside_cost
= 0, vec_outside_cost
= 0, scalar_cost
= 0;
3368 unsigned int vec_prologue_cost
= 0, vec_epilogue_cost
= 0;
3370 void *vect_target_cost_data
= init_cost (NULL
);
3372 /* Calculate scalar cost and sum the cost for the vector stmts
3373 previously collected. */
3374 stmt_vector_for_cost scalar_costs
;
3375 scalar_costs
.create (0);
3376 hash_set
<slp_tree
> visited
;
3377 FOR_EACH_VEC_ELT (slp_instances
, i
, instance
)
3379 auto_vec
<bool, 20> life
;
3380 life
.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance
)),
3382 vect_bb_slp_scalar_cost (bb_vinfo
,
3383 SLP_INSTANCE_TREE (instance
),
3384 &life
, &scalar_costs
, visited
);
3385 add_stmt_costs (bb_vinfo
, vect_target_cost_data
, &instance
->cost_vec
);
3386 instance
->cost_vec
.release ();
3388 /* Unset visited flag. */
3389 stmt_info_for_cost
*si
;
3390 FOR_EACH_VEC_ELT (scalar_costs
, i
, si
)
3391 gimple_set_visited (si
->stmt_info
->stmt
, false);
3393 void *scalar_target_cost_data
= init_cost (NULL
);
3394 add_stmt_costs (bb_vinfo
, scalar_target_cost_data
, &scalar_costs
);
3395 scalar_costs
.release ();
3397 finish_cost (scalar_target_cost_data
, &dummy
, &scalar_cost
, &dummy
);
3398 destroy_cost_data (scalar_target_cost_data
);
3400 /* Complete the target-specific vector cost calculation. */
3401 finish_cost (vect_target_cost_data
, &vec_prologue_cost
,
3402 &vec_inside_cost
, &vec_epilogue_cost
);
3403 destroy_cost_data (vect_target_cost_data
);
3405 vec_outside_cost
= vec_prologue_cost
+ vec_epilogue_cost
;
3407 if (dump_enabled_p ())
3409 dump_printf_loc (MSG_NOTE
, vect_location
, "Cost model analysis: \n");
3410 dump_printf (MSG_NOTE
, " Vector inside of basic block cost: %d\n",
3412 dump_printf (MSG_NOTE
, " Vector prologue cost: %d\n", vec_prologue_cost
);
3413 dump_printf (MSG_NOTE
, " Vector epilogue cost: %d\n", vec_epilogue_cost
);
3414 dump_printf (MSG_NOTE
, " Scalar cost of basic block: %d\n", scalar_cost
);
3417 /* Vectorization is profitable if its cost is more than the cost of scalar
3418 version. Note that we err on the vector side for equal cost because
3419 the cost estimate is otherwise quite pessimistic (constant uses are
3420 free on the scalar side but cost a load on the vector side for
3422 if (vec_outside_cost
+ vec_inside_cost
> scalar_cost
)
3428 /* For each SLP subgraph determine profitability and remove parts not so.
3429 Returns true if any profitable to vectorize subgraph remains. */
3432 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo
)
3434 slp_instance instance
;
3437 auto_vec
<slp_instance
> subgraphs (BB_VINFO_SLP_INSTANCES (bb_vinfo
).length ());
3438 FOR_EACH_VEC_ELT (BB_VINFO_SLP_INSTANCES (bb_vinfo
), i
, instance
)
3439 if (!instance
->subgraph_entries
.is_empty ())
3440 subgraphs
.quick_push (instance
);
3441 BB_VINFO_SLP_INSTANCES (bb_vinfo
).truncate (0);
3442 for (i
= 0; i
< subgraphs
.length ();)
3444 instance
= subgraphs
[i
];
3445 if (!vect_bb_vectorization_profitable_p (bb_vinfo
,
3446 instance
->subgraph_entries
))
3448 /* ??? We need to think of providing better dump/opt-report
3450 if (dump_enabled_p ())
3452 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3453 "not vectorized: vectorization is not "
3458 FOR_EACH_VEC_ELT (instance
->subgraph_entries
, j
, entry
)
3459 if (entry
!= instance
)
3460 vect_free_slp_instance (entry
, false);
3461 vect_free_slp_instance (instance
, false);
3462 subgraphs
.ordered_remove (i
);
3468 FOR_EACH_VEC_ELT (instance
->subgraph_entries
, j
, entry
)
3469 BB_VINFO_SLP_INSTANCES (bb_vinfo
).safe_push (entry
);
3473 return !BB_VINFO_SLP_INSTANCES (bb_vinfo
).is_empty ();
3476 /* Find any vectorizable constructors and add them to the grouped_store
3480 vect_slp_check_for_constructors (bb_vec_info bb_vinfo
)
3482 for (gimple
*stmt
: bb_vinfo
->region_stmts ())
3484 gassign
*assign
= dyn_cast
<gassign
*> (stmt
);
3485 if (!assign
|| gimple_assign_rhs_code (assign
) != CONSTRUCTOR
)
3488 tree rhs
= gimple_assign_rhs1 (assign
);
3489 if (!VECTOR_TYPE_P (TREE_TYPE (rhs
))
3490 || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs
)),
3491 CONSTRUCTOR_NELTS (rhs
))
3492 || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs
, 0)->value
))
3493 || uniform_vector_p (rhs
))
3496 stmt_vec_info stmt_info
= bb_vinfo
->lookup_stmt (assign
);
3497 BB_VINFO_GROUPED_STORES (bb_vinfo
).safe_push (stmt_info
);
3501 /* Check if the region described by BB_VINFO can be vectorized, returning
3502 true if so. When returning false, set FATAL to true if the same failure
3503 would prevent vectorization at other vector sizes, false if it is still
3504 worth trying other sizes. N_STMTS is the number of statements in the
3508 vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo
, int n_stmts
, bool &fatal
,
3509 vec
<int> *dataref_groups
)
3511 DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
3513 slp_instance instance
;
3515 poly_uint64 min_vf
= 2;
3517 /* The first group of checks is independent of the vector size. */
3520 /* Analyze the data references. */
3522 if (!vect_analyze_data_refs (bb_vinfo
, &min_vf
, NULL
))
3524 if (dump_enabled_p ())
3525 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3526 "not vectorized: unhandled data-ref in basic "
3531 if (!vect_analyze_data_ref_accesses (bb_vinfo
, dataref_groups
))
3533 if (dump_enabled_p ())
3534 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3535 "not vectorized: unhandled data access in "
3540 vect_slp_check_for_constructors (bb_vinfo
);
3542 /* If there are no grouped stores and no constructors in the region
3543 there is no need to continue with pattern recog as vect_analyze_slp
3544 will fail anyway. */
3545 if (bb_vinfo
->grouped_stores
.is_empty ())
3547 if (dump_enabled_p ())
3548 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3549 "not vectorized: no grouped stores in "
3554 /* While the rest of the analysis below depends on it in some way. */
3557 vect_pattern_recog (bb_vinfo
);
3559 /* Check the SLP opportunities in the basic block, analyze and build SLP
3561 if (!vect_analyze_slp (bb_vinfo
, n_stmts
))
3563 if (dump_enabled_p ())
3565 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3566 "Failed to SLP the basic block.\n");
3567 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3568 "not vectorized: failed to find SLP opportunities "
3569 "in basic block.\n");
3574 /* Optimize permutations. */
3575 vect_optimize_slp (bb_vinfo
);
3577 vect_record_base_alignments (bb_vinfo
);
3579 /* Analyze and verify the alignment of data references and the
3580 dependence in the SLP instances. */
3581 for (i
= 0; BB_VINFO_SLP_INSTANCES (bb_vinfo
).iterate (i
, &instance
); )
3583 if (! vect_slp_analyze_instance_alignment (bb_vinfo
, instance
)
3584 || ! vect_slp_analyze_instance_dependence (bb_vinfo
, instance
))
3586 slp_tree node
= SLP_INSTANCE_TREE (instance
);
3587 stmt_vec_info stmt_info
= SLP_TREE_SCALAR_STMTS (node
)[0];
3588 if (dump_enabled_p ())
3589 dump_printf_loc (MSG_NOTE
, vect_location
,
3590 "removing SLP instance operations starting from: %G",
3592 vect_free_slp_instance (instance
, false);
3593 BB_VINFO_SLP_INSTANCES (bb_vinfo
).ordered_remove (i
);
3597 /* Mark all the statements that we want to vectorize as pure SLP and
3599 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance
));
3600 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance
));
3601 if (SLP_INSTANCE_ROOT_STMT (instance
))
3602 STMT_SLP_TYPE (SLP_INSTANCE_ROOT_STMT (instance
)) = pure_slp
;
3606 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo
).length ())
3609 if (!vect_slp_analyze_operations (bb_vinfo
))
3611 if (dump_enabled_p ())
3612 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3613 "not vectorized: bad operation in basic block.\n");
3617 vect_bb_partition_graph (bb_vinfo
);
3619 /* Cost model: check if the vectorization is worthwhile. */
3620 if (!unlimited_cost_model (NULL
)
3621 && !vect_bb_vectorization_profitable_p (bb_vinfo
))
3623 if (dump_enabled_p ())
3624 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3625 "not vectorized: vectorization is not "
3630 if (dump_enabled_p ())
3631 dump_printf_loc (MSG_NOTE
, vect_location
,
3632 "Basic block will be vectorized using SLP\n");
3636 /* Subroutine of vect_slp_bb. Try to vectorize the statements between
3637 REGION_BEGIN (inclusive) and REGION_END (exclusive), returning true
3638 on success. The region has N_STMTS statements and has the datarefs
3639 given by DATAREFS. */
3642 vect_slp_region (gimple_stmt_iterator region_begin
,
3643 gimple_stmt_iterator region_end
,
3644 vec
<data_reference_p
> datarefs
,
3645 vec
<int> *dataref_groups
,
3646 unsigned int n_stmts
)
3648 bb_vec_info bb_vinfo
;
3649 auto_vector_modes vector_modes
;
3651 /* Autodetect first vector size we try. */
3652 machine_mode next_vector_mode
= VOIDmode
;
3653 targetm
.vectorize
.autovectorize_vector_modes (&vector_modes
, false);
3654 unsigned int mode_i
= 0;
3656 vec_info_shared shared
;
3658 machine_mode autodetected_vector_mode
= VOIDmode
;
3661 bool vectorized
= false;
3663 bb_vinfo
= new _bb_vec_info (region_begin
, region_end
, &shared
);
3665 bool first_time_p
= shared
.datarefs
.is_empty ();
3666 BB_VINFO_DATAREFS (bb_vinfo
) = datarefs
;
3668 bb_vinfo
->shared
->save_datarefs ();
3670 bb_vinfo
->shared
->check_datarefs ();
3671 bb_vinfo
->vector_mode
= next_vector_mode
;
3673 if (vect_slp_analyze_bb_1 (bb_vinfo
, n_stmts
, fatal
, dataref_groups
)
3674 && dbg_cnt (vect_slp
))
3676 if (dump_enabled_p ())
3678 dump_printf_loc (MSG_NOTE
, vect_location
,
3679 "***** Analysis succeeded with vector mode"
3680 " %s\n", GET_MODE_NAME (bb_vinfo
->vector_mode
));
3681 dump_printf_loc (MSG_NOTE
, vect_location
, "SLPing BB part\n");
3684 bb_vinfo
->shared
->check_datarefs ();
3685 vect_schedule_slp (bb_vinfo
);
3687 unsigned HOST_WIDE_INT bytes
;
3688 if (dump_enabled_p ())
3690 if (GET_MODE_SIZE (bb_vinfo
->vector_mode
).is_constant (&bytes
))
3691 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS
, vect_location
,
3692 "basic block part vectorized using %wu byte "
3693 "vectors\n", bytes
);
3695 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS
, vect_location
,
3696 "basic block part vectorized using variable "
3697 "length vectors\n");
3704 if (dump_enabled_p ())
3705 dump_printf_loc (MSG_NOTE
, vect_location
,
3706 "***** Analysis failed with vector mode %s\n",
3707 GET_MODE_NAME (bb_vinfo
->vector_mode
));
3711 autodetected_vector_mode
= bb_vinfo
->vector_mode
;
3714 while (mode_i
< vector_modes
.length ()
3715 && vect_chooses_same_modes_p (bb_vinfo
, vector_modes
[mode_i
]))
3717 if (dump_enabled_p ())
3718 dump_printf_loc (MSG_NOTE
, vect_location
,
3719 "***** The result for vector mode %s would"
3721 GET_MODE_NAME (vector_modes
[mode_i
]));
3727 if (mode_i
< vector_modes
.length ()
3728 && VECTOR_MODE_P (autodetected_vector_mode
)
3729 && (related_vector_mode (vector_modes
[mode_i
],
3730 GET_MODE_INNER (autodetected_vector_mode
))
3731 == autodetected_vector_mode
)
3732 && (related_vector_mode (autodetected_vector_mode
,
3733 GET_MODE_INNER (vector_modes
[mode_i
]))
3734 == vector_modes
[mode_i
]))
3736 if (dump_enabled_p ())
3737 dump_printf_loc (MSG_NOTE
, vect_location
,
3738 "***** Skipping vector mode %s, which would"
3739 " repeat the analysis for %s\n",
3740 GET_MODE_NAME (vector_modes
[mode_i
]),
3741 GET_MODE_NAME (autodetected_vector_mode
));
3746 || mode_i
== vector_modes
.length ()
3747 || autodetected_vector_mode
== VOIDmode
3748 /* If vect_slp_analyze_bb_1 signaled that analysis for all
3749 vector sizes will fail do not bother iterating. */
3753 /* Try the next biggest vector size. */
3754 next_vector_mode
= vector_modes
[mode_i
++];
3755 if (dump_enabled_p ())
3756 dump_printf_loc (MSG_NOTE
, vect_location
,
3757 "***** Re-trying analysis with vector mode %s\n",
3758 GET_MODE_NAME (next_vector_mode
));
3762 /* Main entry for the BB vectorizer. Analyze and transform BB, returns
3763 true if anything in the basic-block was vectorized. */
3766 vect_slp_bb (basic_block bb
)
3768 vec
<data_reference_p
> datarefs
= vNULL
;
3769 vec
<int> dataref_groups
= vNULL
;
3771 int current_group
= 0;
3772 gimple_stmt_iterator region_begin
= gsi_start_nondebug_after_labels_bb (bb
);
3773 gimple_stmt_iterator region_end
= gsi_last_bb (bb
);
3774 if (!gsi_end_p (region_end
))
3775 gsi_next (®ion_end
);
3777 for (gimple_stmt_iterator gsi
= gsi_after_labels (bb
); !gsi_end_p (gsi
);
3780 gimple
*stmt
= gsi_stmt (gsi
);
3781 if (is_gimple_debug (stmt
))
3786 if (gimple_location (stmt
) != UNKNOWN_LOCATION
)
3787 vect_location
= stmt
;
3789 if (!vect_find_stmt_data_reference (NULL
, stmt
, &datarefs
,
3790 &dataref_groups
, current_group
))
3793 if (insns
> param_slp_max_insns_in_bb
)
3795 if (dump_enabled_p ())
3796 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3797 "not vectorized: too many instructions in "
3802 return vect_slp_region (region_begin
, region_end
, datarefs
,
3803 &dataref_groups
, insns
);
3807 /* Build a variable-length vector in which the elements in ELTS are repeated
3808 to a fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
3809 RESULTS and add any new instructions to SEQ.
3811 The approach we use is:
3813 (1) Find a vector mode VM with integer elements of mode IM.
3815 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
3816 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
3817 from small vectors to IM.
3819 (3) Duplicate each ELTS'[I] into a vector of mode VM.
3821 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
3822 correct byte contents.
3824 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
3826 We try to find the largest IM for which this sequence works, in order
3827 to cut down on the number of interleaves. */
3830 duplicate_and_interleave (vec_info
*vinfo
, gimple_seq
*seq
, tree vector_type
,
3831 vec
<tree
> elts
, unsigned int nresults
,
3834 unsigned int nelts
= elts
.length ();
3835 tree element_type
= TREE_TYPE (vector_type
);
3837 /* (1) Find a vector mode VM with integer elements of mode IM. */
3838 unsigned int nvectors
= 1;
3839 tree new_vector_type
;
3841 if (!can_duplicate_and_interleave_p (vinfo
, nelts
, element_type
,
3842 &nvectors
, &new_vector_type
,
3846 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
3847 unsigned int partial_nelts
= nelts
/ nvectors
;
3848 tree partial_vector_type
= build_vector_type (element_type
, partial_nelts
);
3850 tree_vector_builder partial_elts
;
3851 auto_vec
<tree
, 32> pieces (nvectors
* 2);
3852 pieces
.quick_grow (nvectors
* 2);
3853 for (unsigned int i
= 0; i
< nvectors
; ++i
)
3855 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
3856 ELTS' has mode IM. */
3857 partial_elts
.new_vector (partial_vector_type
, partial_nelts
, 1);
3858 for (unsigned int j
= 0; j
< partial_nelts
; ++j
)
3859 partial_elts
.quick_push (elts
[i
* partial_nelts
+ j
]);
3860 tree t
= gimple_build_vector (seq
, &partial_elts
);
3861 t
= gimple_build (seq
, VIEW_CONVERT_EXPR
,
3862 TREE_TYPE (new_vector_type
), t
);
3864 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
3865 pieces
[i
] = gimple_build_vector_from_val (seq
, new_vector_type
, t
);
3868 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
3869 correct byte contents.
3871 We need to repeat the following operation log2(nvectors) times:
3873 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
3874 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
3876 However, if each input repeats every N elements and the VF is
3877 a multiple of N * 2, the HI result is the same as the LO. */
3878 unsigned int in_start
= 0;
3879 unsigned int out_start
= nvectors
;
3880 unsigned int hi_start
= nvectors
/ 2;
3881 /* A bound on the number of outputs needed to produce NRESULTS results
3882 in the final iteration. */
3883 unsigned int noutputs_bound
= nvectors
* nresults
;
3884 for (unsigned int in_repeat
= 1; in_repeat
< nvectors
; in_repeat
*= 2)
3886 noutputs_bound
/= 2;
3887 unsigned int limit
= MIN (noutputs_bound
, nvectors
);
3888 for (unsigned int i
= 0; i
< limit
; ++i
)
3891 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type
),
3894 pieces
[out_start
+ i
] = pieces
[out_start
+ i
- 1];
3898 tree output
= make_ssa_name (new_vector_type
);
3899 tree input1
= pieces
[in_start
+ (i
/ 2)];
3900 tree input2
= pieces
[in_start
+ (i
/ 2) + hi_start
];
3901 gassign
*stmt
= gimple_build_assign (output
, VEC_PERM_EXPR
,
3904 gimple_seq_add_stmt (seq
, stmt
);
3905 pieces
[out_start
+ i
] = output
;
3907 std::swap (in_start
, out_start
);
3910 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
3911 results
.reserve (nresults
);
3912 for (unsigned int i
= 0; i
< nresults
; ++i
)
3914 results
.quick_push (gimple_build (seq
, VIEW_CONVERT_EXPR
, vector_type
,
3915 pieces
[in_start
+ i
]));
3917 results
.quick_push (results
[i
- nvectors
]);
3921 /* For constant and loop invariant defs in OP_NODE this function creates
3922 vector defs that will be used in the vectorized stmts and stores them
3923 to SLP_TREE_VEC_DEFS of OP_NODE. */
3926 vect_create_constant_vectors (vec_info
*vinfo
, slp_tree op_node
)
3928 unsigned HOST_WIDE_INT nunits
;
3930 unsigned j
, number_of_places_left_in_vector
;
3933 int group_size
= op_node
->ops
.length ();
3934 unsigned int vec_num
, i
;
3935 unsigned number_of_copies
= 1;
3937 gimple_seq ctor_seq
= NULL
;
3938 auto_vec
<tree
, 16> permute_results
;
3940 /* We always want SLP_TREE_VECTYPE (op_node) here correctly set. */
3941 vector_type
= SLP_TREE_VECTYPE (op_node
);
3943 unsigned int number_of_vectors
= SLP_TREE_NUMBER_OF_VEC_STMTS (op_node
);
3944 SLP_TREE_VEC_DEFS (op_node
).create (number_of_vectors
);
3945 auto_vec
<tree
> voprnds (number_of_vectors
);
3947 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
3948 created vectors. It is greater than 1 if unrolling is performed.
3950 For example, we have two scalar operands, s1 and s2 (e.g., group of
3951 strided accesses of size two), while NUNITS is four (i.e., four scalars
3952 of this type can be packed in a vector). The output vector will contain
3953 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
3956 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
3957 containing the operands.
3959 For example, NUNITS is four as before, and the group size is 8
3960 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
3961 {s5, s6, s7, s8}. */
3963 /* When using duplicate_and_interleave, we just need one element for
3964 each scalar statement. */
3965 if (!TYPE_VECTOR_SUBPARTS (vector_type
).is_constant (&nunits
))
3966 nunits
= group_size
;
3968 number_of_copies
= nunits
* number_of_vectors
/ group_size
;
3970 number_of_places_left_in_vector
= nunits
;
3972 tree_vector_builder
elts (vector_type
, nunits
, 1);
3973 elts
.quick_grow (nunits
);
3974 stmt_vec_info insert_after
= NULL
;
3975 for (j
= 0; j
< number_of_copies
; j
++)
3978 for (i
= group_size
- 1; op_node
->ops
.iterate (i
, &op
); i
--)
3980 /* Create 'vect_ = {op0,op1,...,opn}'. */
3981 number_of_places_left_in_vector
--;
3983 if (!types_compatible_p (TREE_TYPE (vector_type
), TREE_TYPE (op
)))
3985 if (CONSTANT_CLASS_P (op
))
3987 if (VECTOR_BOOLEAN_TYPE_P (vector_type
))
3989 /* Can't use VIEW_CONVERT_EXPR for booleans because
3990 of possibly different sizes of scalar value and
3992 if (integer_zerop (op
))
3993 op
= build_int_cst (TREE_TYPE (vector_type
), 0);
3994 else if (integer_onep (op
))
3995 op
= build_all_ones_cst (TREE_TYPE (vector_type
));
4000 op
= fold_unary (VIEW_CONVERT_EXPR
,
4001 TREE_TYPE (vector_type
), op
);
4002 gcc_assert (op
&& CONSTANT_CLASS_P (op
));
4006 tree new_temp
= make_ssa_name (TREE_TYPE (vector_type
));
4008 if (VECTOR_BOOLEAN_TYPE_P (vector_type
))
4011 = build_all_ones_cst (TREE_TYPE (vector_type
));
4013 = build_zero_cst (TREE_TYPE (vector_type
));
4014 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op
)));
4015 init_stmt
= gimple_build_assign (new_temp
, COND_EXPR
,
4021 op
= build1 (VIEW_CONVERT_EXPR
, TREE_TYPE (vector_type
),
4024 = gimple_build_assign (new_temp
, VIEW_CONVERT_EXPR
,
4027 gimple_seq_add_stmt (&ctor_seq
, init_stmt
);
4031 elts
[number_of_places_left_in_vector
] = op
;
4032 if (!CONSTANT_CLASS_P (op
))
4034 /* For BB vectorization we have to compute an insert location
4035 when a def is inside the analyzed region since we cannot
4036 simply insert at the BB start in this case. */
4037 stmt_vec_info opdef
;
4038 if (TREE_CODE (orig_op
) == SSA_NAME
4039 && !SSA_NAME_IS_DEFAULT_DEF (orig_op
)
4040 && is_a
<bb_vec_info
> (vinfo
)
4041 && (opdef
= vinfo
->lookup_def (orig_op
)))
4044 insert_after
= opdef
;
4046 insert_after
= get_later_stmt (insert_after
, opdef
);
4049 if (number_of_places_left_in_vector
== 0)
4052 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type
), nunits
)
4053 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type
), nunits
))
4054 vec_cst
= gimple_build_vector (&ctor_seq
, &elts
);
4057 if (permute_results
.is_empty ())
4058 duplicate_and_interleave (vinfo
, &ctor_seq
, vector_type
,
4059 elts
, number_of_vectors
,
4061 vec_cst
= permute_results
[number_of_vectors
- j
- 1];
4063 if (!gimple_seq_empty_p (ctor_seq
))
4067 gimple_stmt_iterator gsi
4068 = gsi_for_stmt (insert_after
->stmt
);
4069 gsi_insert_seq_after (&gsi
, ctor_seq
,
4070 GSI_CONTINUE_LINKING
);
4073 vinfo
->insert_seq_on_entry (NULL
, ctor_seq
);
4076 voprnds
.quick_push (vec_cst
);
4077 insert_after
= NULL
;
4078 number_of_places_left_in_vector
= nunits
;
4080 elts
.new_vector (vector_type
, nunits
, 1);
4081 elts
.quick_grow (nunits
);
4086 /* Since the vectors are created in the reverse order, we should invert
4088 vec_num
= voprnds
.length ();
4089 for (j
= vec_num
; j
!= 0; j
--)
4091 vop
= voprnds
[j
- 1];
4092 SLP_TREE_VEC_DEFS (op_node
).quick_push (vop
);
4095 /* In case that VF is greater than the unrolling factor needed for the SLP
4096 group of stmts, NUMBER_OF_VECTORS to be created is greater than
4097 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
4098 to replicate the vectors. */
4099 while (number_of_vectors
> SLP_TREE_VEC_DEFS (op_node
).length ())
4100 for (i
= 0; SLP_TREE_VEC_DEFS (op_node
).iterate (i
, &vop
) && i
< vec_num
;
4102 SLP_TREE_VEC_DEFS (op_node
).quick_push (vop
);
4105 /* Get the Ith vectorized definition from SLP_NODE. */
4108 vect_get_slp_vect_def (slp_tree slp_node
, unsigned i
)
4110 if (SLP_TREE_VEC_STMTS (slp_node
).exists ())
4111 return gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node
)[i
]);
4113 return SLP_TREE_VEC_DEFS (slp_node
)[i
];
4116 /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS. */
4119 vect_get_slp_defs (slp_tree slp_node
, vec
<tree
> *vec_defs
)
4121 vec_defs
->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node
));
4122 if (SLP_TREE_DEF_TYPE (slp_node
) == vect_internal_def
)
4125 gimple
*vec_def_stmt
;
4126 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node
), j
, vec_def_stmt
)
4127 vec_defs
->quick_push (gimple_get_lhs (vec_def_stmt
));
4130 vec_defs
->splice (SLP_TREE_VEC_DEFS (slp_node
));
4133 /* Get N vectorized definitions for SLP_NODE. */
4136 vect_get_slp_defs (vec_info
*,
4137 slp_tree slp_node
, vec
<vec
<tree
> > *vec_oprnds
, unsigned n
)
4140 n
= SLP_TREE_CHILDREN (slp_node
).length ();
4142 for (unsigned i
= 0; i
< n
; ++i
)
4144 slp_tree child
= SLP_TREE_CHILDREN (slp_node
)[i
];
4145 vec
<tree
> vec_defs
= vNULL
;
4146 vect_get_slp_defs (child
, &vec_defs
);
4147 vec_oprnds
->quick_push (vec_defs
);
4151 /* Generate vector permute statements from a list of loads in DR_CHAIN.
4152 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
4153 permute statements for the SLP node NODE. */
4156 vect_transform_slp_perm_load (vec_info
*vinfo
,
4157 slp_tree node
, vec
<tree
> dr_chain
,
4158 gimple_stmt_iterator
*gsi
, poly_uint64 vf
,
4159 bool analyze_only
, unsigned *n_perms
)
4161 stmt_vec_info stmt_info
= SLP_TREE_SCALAR_STMTS (node
)[0];
4163 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
4164 unsigned int group_size
= SLP_TREE_SCALAR_STMTS (node
).length ();
4165 unsigned int mask_element
;
4168 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info
))
4171 stmt_info
= DR_GROUP_FIRST_ELEMENT (stmt_info
);
4173 mode
= TYPE_MODE (vectype
);
4174 poly_uint64 nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
4176 /* Initialize the vect stmts of NODE to properly insert the generated
4179 for (unsigned i
= SLP_TREE_VEC_STMTS (node
).length ();
4180 i
< SLP_TREE_NUMBER_OF_VEC_STMTS (node
); i
++)
4181 SLP_TREE_VEC_STMTS (node
).quick_push (NULL
);
4183 /* Generate permutation masks for every NODE. Number of masks for each NODE
4184 is equal to GROUP_SIZE.
4185 E.g., we have a group of three nodes with three loads from the same
4186 location in each node, and the vector size is 4. I.e., we have a
4187 a0b0c0a1b1c1... sequence and we need to create the following vectors:
4188 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
4189 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
4192 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
4193 The last mask is illegal since we assume two operands for permute
4194 operation, and the mask element values can't be outside that range.
4195 Hence, the last mask must be converted into {2,5,5,5}.
4196 For the first two permutations we need the first and the second input
4197 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
4198 we need the second and the third vectors: {b1,c1,a2,b2} and
4201 int vect_stmts_counter
= 0;
4202 unsigned int index
= 0;
4203 int first_vec_index
= -1;
4204 int second_vec_index
= -1;
4208 vec_perm_builder mask
;
4209 unsigned int nelts_to_build
;
4210 unsigned int nvectors_per_build
;
4211 bool repeating_p
= (group_size
== DR_GROUP_SIZE (stmt_info
)
4212 && multiple_p (nunits
, group_size
));
4215 /* A single vector contains a whole number of copies of the node, so:
4216 (a) all permutes can use the same mask; and
4217 (b) the permutes only need a single vector input. */
4218 mask
.new_vector (nunits
, group_size
, 3);
4219 nelts_to_build
= mask
.encoded_nelts ();
4220 nvectors_per_build
= SLP_TREE_VEC_STMTS (node
).length ();
4224 /* We need to construct a separate mask for each vector statement. */
4225 unsigned HOST_WIDE_INT const_nunits
, const_vf
;
4226 if (!nunits
.is_constant (&const_nunits
)
4227 || !vf
.is_constant (&const_vf
))
4229 mask
.new_vector (const_nunits
, const_nunits
, 1);
4230 nelts_to_build
= const_vf
* group_size
;
4231 nvectors_per_build
= 1;
4234 unsigned int count
= mask
.encoded_nelts ();
4235 mask
.quick_grow (count
);
4236 vec_perm_indices indices
;
4238 for (unsigned int j
= 0; j
< nelts_to_build
; j
++)
4240 unsigned int iter_num
= j
/ group_size
;
4241 unsigned int stmt_num
= j
% group_size
;
4242 unsigned int i
= (iter_num
* DR_GROUP_SIZE (stmt_info
)
4243 + SLP_TREE_LOAD_PERMUTATION (node
)[stmt_num
]);
4246 first_vec_index
= 0;
4251 /* Enforced before the loop when !repeating_p. */
4252 unsigned int const_nunits
= nunits
.to_constant ();
4253 vec_index
= i
/ const_nunits
;
4254 mask_element
= i
% const_nunits
;
4255 if (vec_index
== first_vec_index
4256 || first_vec_index
== -1)
4258 first_vec_index
= vec_index
;
4260 else if (vec_index
== second_vec_index
4261 || second_vec_index
== -1)
4263 second_vec_index
= vec_index
;
4264 mask_element
+= const_nunits
;
4268 if (dump_enabled_p ())
4269 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4270 "permutation requires at "
4271 "least three vectors %G",
4273 gcc_assert (analyze_only
);
4277 gcc_assert (mask_element
< 2 * const_nunits
);
4280 if (mask_element
!= index
)
4282 mask
[index
++] = mask_element
;
4284 if (index
== count
&& !noop_p
)
4286 indices
.new_vector (mask
, second_vec_index
== -1 ? 1 : 2, nunits
);
4287 if (!can_vec_perm_const_p (mode
, indices
))
4289 if (dump_enabled_p ())
4291 dump_printf_loc (MSG_MISSED_OPTIMIZATION
,
4293 "unsupported vect permute { ");
4294 for (i
= 0; i
< count
; ++i
)
4296 dump_dec (MSG_MISSED_OPTIMIZATION
, mask
[i
]);
4297 dump_printf (MSG_MISSED_OPTIMIZATION
, " ");
4299 dump_printf (MSG_MISSED_OPTIMIZATION
, "}\n");
4301 gcc_assert (analyze_only
);
4312 tree mask_vec
= NULL_TREE
;
4315 mask_vec
= vect_gen_perm_mask_checked (vectype
, indices
);
4317 if (second_vec_index
== -1)
4318 second_vec_index
= first_vec_index
;
4320 for (unsigned int ri
= 0; ri
< nvectors_per_build
; ++ri
)
4322 /* Generate the permute statement if necessary. */
4323 tree first_vec
= dr_chain
[first_vec_index
+ ri
];
4324 tree second_vec
= dr_chain
[second_vec_index
+ ri
];
4328 gassign
*stmt
= as_a
<gassign
*> (stmt_info
->stmt
);
4330 = vect_create_destination_var (gimple_assign_lhs (stmt
),
4332 perm_dest
= make_ssa_name (perm_dest
);
4334 = gimple_build_assign (perm_dest
, VEC_PERM_EXPR
,
4335 first_vec
, second_vec
,
4337 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
,
4341 /* If mask was NULL_TREE generate the requested
4342 identity transform. */
4343 perm_stmt
= SSA_NAME_DEF_STMT (first_vec
);
4345 /* Store the vector statement in NODE. */
4346 SLP_TREE_VEC_STMTS (node
)[vect_stmts_counter
++] = perm_stmt
;
4351 first_vec_index
= -1;
4352 second_vec_index
= -1;
4361 /* Vectorize the SLP permutations in NODE as specified
4362 in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
4363 child number and lane number.
4364 Interleaving of two two-lane two-child SLP subtrees (not supported):
4365 [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
4366 A blend of two four-lane two-child SLP subtrees:
4367 [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
4368 Highpart of a four-lane one-child SLP subtree (not supported):
4369 [ { 0, 2 }, { 0, 3 } ]
4370 Where currently only a subset is supported by code generating below. */
4373 vectorizable_slp_permutation (vec_info
*vinfo
, gimple_stmt_iterator
*gsi
,
4374 slp_tree node
, stmt_vector_for_cost
*cost_vec
)
4376 tree vectype
= SLP_TREE_VECTYPE (node
);
4378 /* ??? We currently only support all same vector input and output types
4379 while the SLP IL should really do a concat + select and thus accept
4380 arbitrary mismatches. */
4383 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
4384 if (!types_compatible_p (SLP_TREE_VECTYPE (child
), vectype
))
4386 if (dump_enabled_p ())
4387 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4388 "Unsupported lane permutation\n");
4392 vec
<std::pair
<unsigned, unsigned> > &perm
= SLP_TREE_LANE_PERMUTATION (node
);
4393 gcc_assert (perm
.length () == SLP_TREE_LANES (node
));
4394 if (dump_enabled_p ())
4396 dump_printf_loc (MSG_NOTE
, vect_location
,
4397 "vectorizing permutation");
4398 for (unsigned i
= 0; i
< perm
.length (); ++i
)
4399 dump_printf (MSG_NOTE
, " op%u[%u]", perm
[i
].first
, perm
[i
].second
);
4400 dump_printf (MSG_NOTE
, "\n");
4403 poly_uint64 nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
4404 if (!nunits
.is_constant ())
4406 unsigned HOST_WIDE_INT vf
= 1;
4407 if (loop_vec_info linfo
= dyn_cast
<loop_vec_info
> (vinfo
))
4408 if (!LOOP_VINFO_VECT_FACTOR (linfo
).is_constant (&vf
))
4410 unsigned olanes
= vf
* SLP_TREE_LANES (node
);
4411 gcc_assert (multiple_p (olanes
, nunits
));
4413 /* Compute the { { SLP operand, vector index}, lane } permutation sequence
4414 from the { SLP operand, scalar lane } permutation as recorded in the
4415 SLP node as intermediate step. This part should already work
4416 with SLP children with arbitrary number of lanes. */
4417 auto_vec
<std::pair
<std::pair
<unsigned, unsigned>, unsigned> > vperm
;
4418 auto_vec
<unsigned> active_lane
;
4419 vperm
.create (olanes
);
4420 active_lane
.safe_grow_cleared (SLP_TREE_CHILDREN (node
).length (), true);
4421 for (unsigned i
= 0; i
< vf
; ++i
)
4423 for (unsigned pi
= 0; pi
< perm
.length (); ++pi
)
4425 std::pair
<unsigned, unsigned> p
= perm
[pi
];
4426 tree vtype
= SLP_TREE_VECTYPE (SLP_TREE_CHILDREN (node
)[p
.first
]);
4427 unsigned vnunits
= TYPE_VECTOR_SUBPARTS (vtype
).to_constant ();
4428 unsigned vi
= (active_lane
[p
.first
] + p
.second
) / vnunits
;
4429 unsigned vl
= (active_lane
[p
.first
] + p
.second
) % vnunits
;
4430 vperm
.quick_push (std::make_pair (std::make_pair (p
.first
, vi
), vl
));
4432 /* Advance to the next group. */
4433 for (unsigned j
= 0; j
< SLP_TREE_CHILDREN (node
).length (); ++j
)
4434 active_lane
[j
] += SLP_TREE_LANES (SLP_TREE_CHILDREN (node
)[j
]);
4437 if (dump_enabled_p ())
4439 dump_printf_loc (MSG_NOTE
, vect_location
, "as");
4440 for (unsigned i
= 0; i
< vperm
.length (); ++i
)
4442 if (i
!= 0 && multiple_p (i
, TYPE_VECTOR_SUBPARTS (vectype
)))
4443 dump_printf (MSG_NOTE
, ",");
4444 dump_printf (MSG_NOTE
, " vops%u[%u][%u]",
4445 vperm
[i
].first
.first
, vperm
[i
].first
.second
,
4446 vperm
[i
].first
.second
);
4448 dump_printf (MSG_NOTE
, "\n");
4451 /* We can only handle two-vector permutes, everything else should
4452 be lowered on the SLP level. The following is closely inspired
4453 by vect_transform_slp_perm_load and is supposed to eventually
4455 ??? As intermediate step do code-gen in the SLP tree representation
4457 std::pair
<unsigned, unsigned> first_vec
= std::make_pair (-1U, -1U);
4458 std::pair
<unsigned, unsigned> second_vec
= std::make_pair (-1U, -1U);
4459 unsigned int const_nunits
= nunits
.to_constant ();
4460 unsigned int index
= 0;
4461 unsigned int mask_element
;
4462 vec_perm_builder mask
;
4463 mask
.new_vector (const_nunits
, const_nunits
, 1);
4464 unsigned int count
= mask
.encoded_nelts ();
4465 mask
.quick_grow (count
);
4466 vec_perm_indices indices
;
4467 unsigned nperms
= 0;
4468 for (unsigned i
= 0; i
< vperm
.length (); ++i
)
4470 mask_element
= vperm
[i
].second
;
4471 if (first_vec
.first
== -1U
4472 || first_vec
== vperm
[i
].first
)
4473 first_vec
= vperm
[i
].first
;
4474 else if (second_vec
.first
== -1U
4475 || second_vec
== vperm
[i
].first
)
4477 second_vec
= vperm
[i
].first
;
4478 mask_element
+= const_nunits
;
4482 if (dump_enabled_p ())
4483 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4484 "permutation requires at "
4485 "least three vectors");
4490 mask
[index
++] = mask_element
;
4494 indices
.new_vector (mask
, second_vec
.first
== -1U ? 1 : 2,
4496 bool identity_p
= indices
.series_p (0, 1, 0, 1);
4498 && !can_vec_perm_const_p (TYPE_MODE (vectype
), indices
))
4500 if (dump_enabled_p ())
4502 dump_printf_loc (MSG_MISSED_OPTIMIZATION
,
4504 "unsupported vect permute { ");
4505 for (i
= 0; i
< count
; ++i
)
4507 dump_dec (MSG_MISSED_OPTIMIZATION
, mask
[i
]);
4508 dump_printf (MSG_MISSED_OPTIMIZATION
, " ");
4510 dump_printf (MSG_MISSED_OPTIMIZATION
, "}\n");
4520 if (second_vec
.first
== -1U)
4521 second_vec
= first_vec
;
4523 /* Generate the permute statement if necessary. */
4524 slp_tree first_node
= SLP_TREE_CHILDREN (node
)[first_vec
.first
];
4526 = vect_get_slp_vect_def (first_node
, first_vec
.second
);
4528 tree perm_dest
= make_ssa_name (vectype
);
4531 slp_tree second_node
4532 = SLP_TREE_CHILDREN (node
)[second_vec
.first
];
4534 = vect_get_slp_vect_def (second_node
, second_vec
.second
);
4535 tree mask_vec
= vect_gen_perm_mask_checked (vectype
, indices
);
4536 perm_stmt
= gimple_build_assign (perm_dest
, VEC_PERM_EXPR
,
4537 first_def
, second_def
,
4541 /* We need a copy here in case the def was external. */
4542 perm_stmt
= gimple_build_assign (perm_dest
, first_def
);
4543 vect_finish_stmt_generation (vinfo
, NULL
, perm_stmt
, gsi
);
4544 /* Store the vector statement in NODE. */
4545 SLP_TREE_VEC_STMTS (node
).quick_push (perm_stmt
);
4549 first_vec
= std::make_pair (-1U, -1U);
4550 second_vec
= std::make_pair (-1U, -1U);
4555 record_stmt_cost (cost_vec
, nperms
, vec_perm
, NULL
, vectype
, 0, vect_body
);
4560 /* Vectorize SLP instance tree in postorder. */
/* NOTE(review): garbled extraction -- gaps in the embedded line numbers
   (4566 -> 4570, 4643 -> 4645, ...) mean guard conditions, declarations and
   braces are missing from this view.  Code bytes untouched; comments only.
   Emits the vector statements for NODE (recursing into children first) and
   chooses the GSI insertion point for them.  */
4563 vect_schedule_slp_instance (vec_info
*vinfo
,
4564 slp_tree node
, slp_instance instance
)
4566 gimple_stmt_iterator si
;
4570 /* See if we have already vectorized the node in the graph of the
/* Memoization: a node shared between instances is code-generated once.  */
4572 if ((SLP_TREE_DEF_TYPE (node
) == vect_internal_def
4573 && SLP_TREE_VEC_STMTS (node
).exists ())
4574 || SLP_TREE_VEC_DEFS (node
).exists ())
4577 /* Vectorize externals and constants. */
4578 if (SLP_TREE_DEF_TYPE (node
) == vect_constant_def
4579 || SLP_TREE_DEF_TYPE (node
) == vect_external_def
)
4581 /* ??? vectorizable_shift can end up using a scalar operand which is
4582 currently denoted as !SLP_TREE_VECTYPE. No need to vectorize the
4583 node in this case. */
4584 if (!SLP_TREE_VECTYPE (node
))
4587 vect_create_constant_vectors (vinfo
, node
);
/* Postorder: schedule all children before this node.  */
4591 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
4592 vect_schedule_slp_instance (vinfo
, child
, instance
);
4594 gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node
) != 0);
4595 SLP_TREE_VEC_STMTS (node
).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node
));
4597 stmt_vec_info stmt_info
= SLP_TREE_REPRESENTATIVE (node
);
4598 if (dump_enabled_p ())
4599 dump_printf_loc (MSG_NOTE
, vect_location
,
4600 "------>vectorizing SLP node starting from: %G",
/* Choose the insertion point.  Case 1: data references (loads/stores)
   that are not permute nodes.  */
4603 if (STMT_VINFO_DATA_REF (stmt_info
)
4604 && SLP_TREE_CODE (node
) != VEC_PERM_EXPR
)
4606 /* Vectorized loads go before the first scalar load to make it
4607 ready early, vectorized stores go before the last scalar
4608 stmt which is where all uses are ready. */
4609 stmt_vec_info last_stmt_info
= NULL
;
4610 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info
)))
4611 last_stmt_info
= vect_find_first_scalar_stmt_in_slp (node
);
4612 else /* DR_IS_WRITE */
4613 last_stmt_info
= vect_find_last_scalar_stmt_in_slp (node
);
4614 si
= gsi_for_stmt (last_stmt_info
->stmt
);
/* Case 2: childless nodes -- reduction/induction PHIs that manage
   their own insertion.  */
4616 else if (SLP_TREE_CHILDREN (node
).is_empty ())
4618 /* This happens for reduction and induction PHIs where we do not use the
4619 insertion iterator. */
4620 gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node
))
4621 == cycle_phi_info_type
4622 || (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node
))
4623 == induc_vec_info_type
));
/* Case 3: everything else -- insert right after the dominance-wise
   latest def among all children's defs.  */
4628 /* Emit other stmts after the children vectorized defs which is
4629 earliest possible. */
4630 gimple
*last_stmt
= NULL
;
4631 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
4632 if (SLP_TREE_DEF_TYPE (child
) == vect_internal_def
)
4634 /* For fold-left reductions we are retaining the scalar
4635 reduction PHI but we still have SLP_TREE_NUM_VEC_STMTS
4636 set so the representation isn't perfect. Resort to the
4637 last scalar def here. */
4638 if (SLP_TREE_VEC_STMTS (child
).is_empty ())
4640 gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child
))
4641 == cycle_phi_info_type
)
;
4642 gphi
*phi
= as_a
<gphi
*>
4643 (vect_find_last_scalar_stmt_in_slp (child
)->stmt
);
4645 || vect_stmt_dominates_stmt_p (last_stmt
, phi
))
4648 /* We are emitting all vectorized stmts in the same place and
4649 the last one is the last.
4650 ??? Unless we have a load permutation applied and that
4651 figures to re-use an earlier generated load. */
4654 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (child
), j
, vstmt
)
4656 || vect_stmt_dominates_stmt_p (last_stmt
, vstmt
))
/* Child without a vector type: its scalar operands are used
   directly; track their defining stmts instead.  */
4659 else if (!SLP_TREE_VECTYPE (child
))
4661 /* For externals we use unvectorized at all scalar defs. */
4664 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child
), j
, def
)
4665 if (TREE_CODE (def
) == SSA_NAME
4666 && !SSA_NAME_IS_DEFAULT_DEF (def
))
4668 gimple
*stmt
= SSA_NAME_DEF_STMT (def
);
4670 || vect_stmt_dominates_stmt_p (last_stmt
, stmt
))
4676 /* For externals we have to look at all defs since their
4677 insertion place is decided per vector. But beware
4678 of pre-existing vectors where we need to make sure
4679 we do not insert before the region boundary. */
4680 if (SLP_TREE_SCALAR_OPS (child
).is_empty ()
4681 && !vinfo
->lookup_def (SLP_TREE_VEC_DEFS (child
)[0]))
4682 last_stmt
= gsi_stmt (as_a
<bb_vec_info
> (vinfo
)->region_begin
);
4687 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child
), j
, vdef
)
4688 if (TREE_CODE (vdef
) == SSA_NAME
4689 && !SSA_NAME_IS_DEFAULT_DEF (vdef
))
4691 gimple
*vstmt
= SSA_NAME_DEF_STMT (vdef
);
4693 || vect_stmt_dominates_stmt_p (last_stmt
, vstmt
))
4698 /* This can happen when all children are pre-existing vectors or
/* Fallback insertion point when no child def was found.  PHIs have no
   GSI position, so insert after the block's labels instead.  */
4701 last_stmt
= vect_find_first_scalar_stmt_in_slp (node
)->stmt
;
4702 if (is_a
<gphi
*> (last_stmt
))
4703 si
= gsi_after_labels (gimple_bb (last_stmt
));
4706 si
= gsi_for_stmt (last_stmt
);
4711 bool done_p
= false;
4713 /* Handle purely internal nodes. */
4714 if (SLP_TREE_CODE (node
) == VEC_PERM_EXPR
)
4716 /* ??? the transform kind is stored to STMT_VINFO_TYPE which might
4717 be shared with different SLP nodes (but usually it's the same
4718 operation apart from the case the stmt is only there for denoting
4719 the actual scalar lane defs ...). So do not call vect_transform_stmt
4720 but open-code it here (partly). */
4721 bool done
= vectorizable_slp_permutation (vinfo
, &si
, node
, NULL
);
4726 vect_transform_stmt (vinfo
, stmt_info
, &si
, node
, instance
);
4729 /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
4730 For loop vectorization this is done in vectorizable_call, but for SLP
4731 it needs to be deferred until end of vect_schedule_slp, because multiple
4732 SLP instances may refer to the same scalar stmt. */
/* NOTE(review): garbled extraction; several lines (loop variable decls,
   the `lhs'/`new_stmt' declarations, `continue's, braces) are missing from
   this view.  Code bytes untouched; comments only.  */
4735 vect_remove_slp_scalar_calls (vec_info
*vinfo
,
4736 slp_tree node
, hash_set
<slp_tree
> &visited
)
4739 gimple_stmt_iterator gsi
;
4743 stmt_vec_info stmt_info
;
/* Only internal nodes carry scalar stmts to rewrite.  */
4745 if (SLP_TREE_DEF_TYPE (node
) != vect_internal_def
)
/* Nodes can be shared between instances; process each only once.  */
4748 if (visited
.add (node
))
/* Recurse into children first.  */
4751 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
4752 vect_remove_slp_scalar_calls (vinfo
, child
, visited
);
4754 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
/* Only real (in-IL) call stmts are candidates; skip pattern stmts and
   stmts that are not pure-SLP.  */
4756 gcall
*stmt
= dyn_cast
<gcall
*> (stmt_info
->stmt
);
4757 if (!stmt
|| gimple_bb (stmt
) == NULL
)
4759 if (is_pattern_stmt_p (stmt_info
)
4760 || !PURE_SLP_STMT (stmt_info
))
/* Replace the call with `lhs = 0' so the (now dead) call side is gone
   while the SSA def remains valid until DCE cleans up.  */
4762 lhs
= gimple_call_lhs (stmt
);
4763 new_stmt
= gimple_build_assign (lhs
, build_zero_cst (TREE_TYPE (lhs
)));
4764 gsi
= gsi_for_stmt (stmt
);
4765 vinfo
->replace_stmt (&gsi
, stmt_info
, new_stmt
);
4766 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt
)) = new_stmt
;
/* Convenience overload: walk NODE with a fresh visited set.  (The
   return-type line and braces are among the lines dropped by the
   extraction; code bytes untouched.)  */
4771 vect_remove_slp_scalar_calls (vec_info
*vinfo
, slp_tree node
)
4773 hash_set
<slp_tree
> visited
;
4774 vect_remove_slp_scalar_calls (vinfo
, node
, visited
);
4777 /* Vectorize the instance root. */
/* NOTE(review): garbled extraction; e.g. the second operand of the build1
   at 4795 and parts of the CONSTRUCTOR_APPEND_ELT at 4811-4813 are missing
   from this view.  Code bytes untouched; comments only.
   Replaces the scalar root stmt of INSTANCE with an assignment from the
   vectorized defs of NODE.  */
4780 vectorize_slp_instance_root_stmt (slp_tree node
, slp_instance instance
)
4782 gassign
*rstmt
= NULL
;
/* One vector stmt: assign its lhs to the root lhs, inserting a
   VIEW_CONVERT_EXPR when the types differ only in representation.  */
4784 if (SLP_TREE_NUMBER_OF_VEC_STMTS (node
) == 1)
4789 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node
), j
, child_stmt
)
4791 tree vect_lhs
= gimple_get_lhs (child_stmt
);
4792 tree root_lhs
= gimple_get_lhs (instance
->root_stmt
->stmt
);
4793 if (!useless_type_conversion_p (TREE_TYPE (root_lhs
),
4794 TREE_TYPE (vect_lhs
)))
4795 vect_lhs
= build1 (VIEW_CONVERT_EXPR
, TREE_TYPE (root_lhs
),
4797 rstmt
= gimple_build_assign (root_lhs
, vect_lhs
);
/* Several vector stmts: gather their lhs values into a CONSTRUCTOR of
   the root's rhs type and assign that.  */
4801 else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node
) > 1)
4803 int nelts
= SLP_TREE_NUMBER_OF_VEC_STMTS (node
);
4806 vec
<constructor_elt
, va_gc
> *v
;
4807 vec_alloc (v
, nelts
);
4809 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node
), j
, child_stmt
)
4811 CONSTRUCTOR_APPEND_ELT (v
,
4813 gimple_get_lhs (child_stmt
));
4815 tree lhs
= gimple_get_lhs (instance
->root_stmt
->stmt
);
4816 tree rtype
= TREE_TYPE (gimple_assign_rhs1 (instance
->root_stmt
->stmt
));
4817 tree r_constructor
= build_constructor (rtype
, v
);
4818 rstmt
= gimple_build_assign (lhs
, r_constructor
);
/* Swap the replacement in for the original scalar root stmt.  */
4823 gimple_stmt_iterator rgsi
= gsi_for_stmt (instance
->root_stmt
->stmt
);
4824 gsi_replace (&rgsi
, rstmt
, true);
4827 /* Generate vector code for all SLP instances in the loop/basic block. */
4830 vect_schedule_slp (vec_info
*vinfo
)
4832 vec
<slp_instance
> slp_instances
;
4833 slp_instance instance
;
4836 slp_instances
= vinfo
->slp_instances
;
4837 FOR_EACH_VEC_ELT (slp_instances
, i
, instance
)
4839 slp_tree node
= SLP_INSTANCE_TREE (instance
);
4840 if (dump_enabled_p ())
4842 dump_printf_loc (MSG_NOTE
, vect_location
,
4843 "Vectorizing SLP tree:\n");
4844 if (SLP_INSTANCE_ROOT_STMT (instance
))
4845 dump_printf_loc (MSG_NOTE
, vect_location
, "Root stmt: %G",
4846 SLP_INSTANCE_ROOT_STMT (instance
)->stmt
);
4847 vect_print_slp_graph (MSG_NOTE
, vect_location
,
4848 SLP_INSTANCE_TREE (instance
));
4850 /* Schedule the tree of INSTANCE. */
4851 vect_schedule_slp_instance (vinfo
, node
, instance
);
4853 if (SLP_INSTANCE_ROOT_STMT (instance
))
4854 vectorize_slp_instance_root_stmt (node
, instance
);
4856 if (dump_enabled_p ())
4857 dump_printf_loc (MSG_NOTE
, vect_location
,
4858 "vectorizing stmts using SLP.\n");
4861 FOR_EACH_VEC_ELT (slp_instances
, i
, instance
)
4863 slp_tree root
= SLP_INSTANCE_TREE (instance
);
4864 stmt_vec_info store_info
;
4867 /* For reductions set the latch values of the vectorized PHIs. */
4868 if (instance
->reduc_phis
4869 && STMT_VINFO_REDUC_TYPE (SLP_TREE_REPRESENTATIVE
4870 (instance
->reduc_phis
)) != FOLD_LEFT_REDUCTION
4871 && STMT_VINFO_REDUC_TYPE (SLP_TREE_REPRESENTATIVE
4872 (instance
->reduc_phis
)) != EXTRACT_LAST_REDUCTION
)
4874 slp_tree slp_node
= root
;
4875 slp_tree phi_node
= instance
->reduc_phis
;
4876 gphi
*phi
= as_a
<gphi
*> (SLP_TREE_SCALAR_STMTS (phi_node
)[0]->stmt
);
4877 edge e
= loop_latch_edge (gimple_bb (phi
)->loop_father
);
4878 gcc_assert (SLP_TREE_VEC_STMTS (phi_node
).length ()
4879 == SLP_TREE_VEC_STMTS (slp_node
).length ());
4880 for (unsigned j
= 0; j
< SLP_TREE_VEC_STMTS (phi_node
).length (); ++j
)
4881 add_phi_arg (as_a
<gphi
*> (SLP_TREE_VEC_STMTS (phi_node
)[j
]),
4882 vect_get_slp_vect_def (slp_node
, j
),
4883 e
, gimple_phi_arg_location (phi
, e
->dest_idx
));
4886 /* Remove scalar call stmts. Do not do this for basic-block
4887 vectorization as not all uses may be vectorized.
4888 ??? Why should this be necessary? DCE should be able to
4889 remove the stmts itself.
4890 ??? For BB vectorization we can as well remove scalar
4891 stmts starting from the SLP tree root if they have no
4893 if (is_a
<loop_vec_info
> (vinfo
))
4894 vect_remove_slp_scalar_calls (vinfo
, root
);
4896 for (j
= 0; SLP_TREE_SCALAR_STMTS (root
).iterate (j
, &store_info
); j
++)
4898 if (!STMT_VINFO_DATA_REF (store_info
))
4901 if (SLP_INSTANCE_ROOT_STMT (instance
))
4904 store_info
= vect_orig_stmt (store_info
);
4905 /* Free the attached stmt_vec_info and remove the stmt. */
4906 vinfo
->remove_stmt (store_info
);