/* SLP - Basic Block Vectorization
   Copyright (C) 2007-2020 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "gimple.h"
#include "tree-pass.h"
#include "ssa.h"
#include "optabs-tree.h"
#include "insn-config.h"
#include "recog.h"		/* FIXME: for insn_data */
#include "fold-const.h"
#include "stor-layout.h"
#include "gimple-iterator.h"
#include "cfgloop.h"
#include "tree-vectorizer.h"
#include "langhooks.h"
#include "gimple-walk.h"
#include "dbgcnt.h"
#include "tree-vector-builder.h"
#include "vec-perm-indices.h"
#include "gimple-fold.h"
#include "internal-fn.h"
#include "dump-context.h"
static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
					  slp_tree, stmt_vector_for_cost *);
/* Initialize a SLP node.  */

_slp_tree::_slp_tree ()
{
  SLP_TREE_SCALAR_STMTS (this) = vNULL;
  SLP_TREE_SCALAR_OPS (this) = vNULL;
  SLP_TREE_VEC_STMTS (this) = vNULL;
  SLP_TREE_VEC_DEFS (this) = vNULL;
  SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
  SLP_TREE_CHILDREN (this) = vNULL;
  SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
  SLP_TREE_LANE_PERMUTATION (this) = vNULL;
  SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
  SLP_TREE_CODE (this) = ERROR_MARK;
  SLP_TREE_VECTYPE (this) = NULL_TREE;
  SLP_TREE_REPRESENTATIVE (this) = NULL;
  this->refcnt = 1;
  this->max_nunits = 1;
  this->lanes = 0;
}
/* Tear down a SLP node.  */

_slp_tree::~_slp_tree ()
{
  SLP_TREE_CHILDREN (this).release ();
  SLP_TREE_SCALAR_STMTS (this).release ();
  SLP_TREE_SCALAR_OPS (this).release ();
  SLP_TREE_VEC_STMTS (this).release ();
  SLP_TREE_VEC_DEFS (this).release ();
  SLP_TREE_LOAD_PERMUTATION (this).release ();
  SLP_TREE_LANE_PERMUTATION (this).release ();
}
/* Recursively free the memory allocated for the SLP tree rooted at NODE.
   FINAL_P is true if we have vectorized the instance or if we have
   made a final decision not to vectorize the statements in any way.  */

static void
vect_free_slp_tree (slp_tree node, bool final_p)
{
  int i;
  slp_tree child;

  if (--node->refcnt != 0)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_free_slp_tree (child, final_p);

  /* Don't update STMT_VINFO_NUM_SLP_USES if it isn't relevant.
     Some statements might no longer exist, after having been
     removed by vect_transform_stmt.  Updating the remaining
     statements would be redundant.  */
  if (!final_p)
    {
      stmt_vec_info stmt_info;
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
	{
	  gcc_assert (STMT_VINFO_NUM_SLP_USES (stmt_info) > 0);
	  STMT_VINFO_NUM_SLP_USES (stmt_info)--;
	}
    }

  delete node;
}
/* Free the memory allocated for the SLP instance.  FINAL_P is true if we
   have vectorized the instance or if we have made a final decision not
   to vectorize the statements in any way.  */

void
vect_free_slp_instance (slp_instance instance, bool final_p)
{
  vect_free_slp_tree (SLP_INSTANCE_TREE (instance), final_p);
  SLP_INSTANCE_LOADS (instance).release ();
  free (instance);
}
/* Create an SLP node for SCALAR_STMTS.  */

static slp_tree
vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
{
  slp_tree node = new _slp_tree;
  SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
  SLP_TREE_CHILDREN (node).create (nops);
  SLP_TREE_DEF_TYPE (node) = vect_internal_def;
  SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
  SLP_TREE_LANES (node) = scalar_stmts.length ();

  unsigned i;
  stmt_vec_info stmt_info;
  FOR_EACH_VEC_ELT (scalar_stmts, i, stmt_info)
    STMT_VINFO_NUM_SLP_USES (stmt_info)++;

  return node;
}
/* Create an SLP node for OPS.  */

static slp_tree
vect_create_new_slp_node (vec<tree> ops)
{
  slp_tree node = new _slp_tree;
  SLP_TREE_SCALAR_OPS (node) = ops;
  SLP_TREE_DEF_TYPE (node) = vect_external_def;
  SLP_TREE_LANES (node) = ops.length ();
  return node;
}
/* This structure is used in creation of an SLP tree.  Each instance
   corresponds to the same operand in a group of scalar stmts in an SLP
   node.  */
typedef struct _slp_oprnd_info
{
  /* Def-stmts for the operands.  */
  vec<stmt_vec_info> def_stmts;
  /* Operands.  */
  vec<tree> ops;
  /* Information about the first statement, its vector def-type, type, the
     operand itself in case it's constant, and an indication if it's a pattern
     stmt.  */
  enum vect_def_type first_dt;
  tree first_op_type;
  bool any_pattern;
} *slp_oprnd_info;
/* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
   operand.  */

static vec<slp_oprnd_info>
vect_create_oprnd_info (int nops, int group_size)
{
  int i;
  slp_oprnd_info oprnd_info;
  vec<slp_oprnd_info> oprnds_info;

  oprnds_info.create (nops);
  for (i = 0; i < nops; i++)
    {
      oprnd_info = XNEW (struct _slp_oprnd_info);
      oprnd_info->def_stmts.create (group_size);
      oprnd_info->ops.create (group_size);
      oprnd_info->first_dt = vect_uninitialized_def;
      oprnd_info->first_op_type = NULL_TREE;
      oprnd_info->any_pattern = false;
      oprnds_info.quick_push (oprnd_info);
    }

  return oprnds_info;
}
/* Free operands info.  */

static void
vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
{
  int i;
  slp_oprnd_info oprnd_info;

  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      oprnd_info->def_stmts.release ();
      oprnd_info->ops.release ();
      XDELETE (oprnd_info);
    }

  oprnds_info.release ();
}
/* Return true if STMTS contains a pattern statement.  */

static bool
vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
{
  stmt_vec_info stmt_info;
  unsigned int i;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    if (is_pattern_stmt_p (stmt_info))
      return true;
  return false;
}
/* Return true when all lanes in the external or constant NODE have
   the same value.  */

static bool
vect_slp_tree_uniform_p (slp_tree node)
{
  gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
	      || SLP_TREE_DEF_TYPE (node) == vect_external_def);

  unsigned i;
  tree op, first = NULL_TREE;
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
    if (!first)
      first = op;
    else if (!operand_equal_p (first, op, 0))
      return false;

  return true;
}
/* Find the place of the data-ref in STMT_INFO in the interleaving chain
   that starts from FIRST_STMT_INFO.  Return -1 if the data-ref is not a part
   of the chain.  */

int
vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
				      stmt_vec_info first_stmt_info)
{
  stmt_vec_info next_stmt_info = first_stmt_info;
  int result = 0;

  if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
    return -1;

  do
    {
      if (next_stmt_info == stmt_info)
	return result;
      next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
      if (next_stmt_info)
	result += DR_GROUP_GAP (next_stmt_info);
    }
  while (next_stmt_info);

  return -1;
}
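
/* Worked sketch (assumed input, not taken from the code above): for a
   load chain accessing a[0], a[1], a[3], the second and third elements
   carry DR_GROUP_GAP 1 and 2, so the places returned above are 0, 1
   and 3 -- the accumulated gaps account for the skipped a[2].  */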
/* Check whether it is possible to load COUNT elements of type ELT_TYPE
   using the method implemented by duplicate_and_interleave.  Return true
   if so, returning the number of intermediate vectors in *NVECTORS_OUT
   (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
   (if nonnull).  */

bool
can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
				tree elt_type, unsigned int *nvectors_out,
				tree *vector_type_out,
				tree *permutes)
{
  tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
  if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
    return false;

  machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
  poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
  unsigned int nvectors = 1;
  for (;;)
    {
      scalar_int_mode int_mode;
      poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
      if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
	{
	  /* Get the natural vector type for this SLP group size.  */
	  tree int_type = build_nonstandard_integer_type
	    (GET_MODE_BITSIZE (int_mode), 1);
	  tree vector_type
	    = get_vectype_for_scalar_type (vinfo, int_type, count);
	  if (vector_type
	      && VECTOR_MODE_P (TYPE_MODE (vector_type))
	      && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
			   GET_MODE_SIZE (base_vector_mode)))
	    {
	      /* Try fusing consecutive sequences of COUNT / NVECTORS elements
		 together into elements of type INT_TYPE and using the result
		 to build NVECTORS vectors.  */
	      poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
	      vec_perm_builder sel1 (nelts, 2, 3);
	      vec_perm_builder sel2 (nelts, 2, 3);
	      poly_int64 half_nelts = exact_div (nelts, 2);
	      for (unsigned int i = 0; i < 3; ++i)
		{
		  sel1.quick_push (i);
		  sel1.quick_push (i + nelts);
		  sel2.quick_push (half_nelts + i);
		  sel2.quick_push (half_nelts + i + nelts);
		}
	      vec_perm_indices indices1 (sel1, 2, nelts);
	      vec_perm_indices indices2 (sel2, 2, nelts);
	      if (can_vec_perm_const_p (TYPE_MODE (vector_type), indices1)
		  && can_vec_perm_const_p (TYPE_MODE (vector_type), indices2))
		{
		  if (nvectors_out)
		    *nvectors_out = nvectors;
		  if (vector_type_out)
		    *vector_type_out = vector_type;
		  if (permutes)
		    {
		      permutes[0] = vect_gen_perm_mask_checked (vector_type,
								indices1);
		      permutes[1] = vect_gen_perm_mask_checked (vector_type,
								indices2);
		    }
		  return true;
		}
	    }
	}
      if (!multiple_p (elt_bytes, 2, &elt_bytes))
	return false;
      nvectors *= 2;
    }
}
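
/* A worked sketch of the idea (the numbers are illustrative assumptions,
   not derived from any particular target): to repeat COUNT == 4 8-bit
   elements across a vector, first try fusing all four into a single
   32-bit integer and splatting that (NVECTORS == 1); if the required
   permutes are not supported, halve the fused size and build two
   vectors from 16-bit halves (NVECTORS == 2), and so on.  */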
/* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
   they are of a valid type and that they match the defs of the first stmt of
   the SLP group (stored in OPRNDS_INFO).  This function tries to match stmts
   by swapping operands of STMTS[STMT_NUM] when possible.  Non-zero *SWAP
   indicates swap is required for cond_expr stmts.  Specifically, *SWAP
   is 1 if STMT is cond and operands of comparison need to be swapped;
   *SWAP is 2 if STMT is cond and code of comparison needs to be inverted.
   If there is any operand swap in this function, *SWAP is set to non-zero
   value.
   If there was a fatal error return -1; if the error could be corrected by
   swapping operands of father node of this one, return 1; if everything is
   ok return 0.  */

static int
vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char *swap,
			     vec<stmt_vec_info> stmts, unsigned stmt_num,
			     vec<slp_oprnd_info> *oprnds_info)
{
  stmt_vec_info stmt_info = stmts[stmt_num];
  tree oprnd;
  unsigned int i, number_of_oprnds;
  enum vect_def_type dt = vect_uninitialized_def;
  slp_oprnd_info oprnd_info;
  int first_op_idx = 1;
  unsigned int commutative_op = -1U;
  bool first_op_cond = false;
  bool first = stmt_num == 0;

  if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
    {
      number_of_oprnds = gimple_call_num_args (stmt);
      first_op_idx = 3;
      if (gimple_call_internal_p (stmt))
	{
	  internal_fn ifn = gimple_call_internal_fn (stmt);
	  commutative_op = first_commutative_argument (ifn);

	  /* Masked load, only look at mask.  */
	  if (ifn == IFN_MASK_LOAD)
	    {
	      number_of_oprnds = 1;
	      /* Mask operand index.  */
	      first_op_idx = 5;
	    }
	}
    }
  else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
    {
      enum tree_code code = gimple_assign_rhs_code (stmt);
      number_of_oprnds = gimple_num_ops (stmt) - 1;
      /* Swap can only be done for cond_expr if asked to, otherwise we
	 could result in different comparison code to the first stmt.  */
      if (code == COND_EXPR
	  && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
	{
	  first_op_cond = true;
	  number_of_oprnds++;
	}
      else
	commutative_op = commutative_tree_code (code) ? 0U : -1U;
    }
  else
    return -1;

  bool swapped = (*swap != 0);
  gcc_assert (!swapped || first_op_cond);
  for (i = 0; i < number_of_oprnds; i++)
    {
again:
      if (first_op_cond)
	{
	  /* Map indicating how operands of cond_expr should be swapped.  */
	  int maps[3][4] = {{0, 1, 2, 3}, {1, 0, 2, 3}, {0, 1, 3, 2}};
	  int *map = maps[*swap];

	  if (i < 2)
	    oprnd = TREE_OPERAND (gimple_op (stmt_info->stmt,
					     first_op_idx), map[i]);
	  else
	    oprnd = gimple_op (stmt_info->stmt, map[i]);
	}
      else
	oprnd = gimple_op (stmt_info->stmt, first_op_idx + (swapped ? !i : i));
      if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
	oprnd = TREE_OPERAND (oprnd, 0);

      oprnd_info = (*oprnds_info)[i];

      stmt_vec_info def_stmt_info;
      if (!vect_is_simple_use (oprnd, vinfo, &dt, &def_stmt_info))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: can't analyze def for %T\n",
			     oprnd);

	  return -1;
	}

      if (def_stmt_info && is_pattern_stmt_p (def_stmt_info))
	oprnd_info->any_pattern = true;

      if (first)
	{
	  /* For the swapping logic below force vect_reduction_def
	     for the reduction op in a SLP reduction group.  */
	  if (!STMT_VINFO_DATA_REF (stmt_info)
	      && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
	      && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
	      && def_stmt_info)
	    dt = vect_reduction_def;
	  oprnd_info->first_dt = dt;
	  oprnd_info->first_op_type = TREE_TYPE (oprnd);
	}
      else
	{
	  /* Not first stmt of the group, check that the def-stmt/s match
	     the def-stmt/s of the first stmt.  Allow different definition
	     types for reduction chains: the first stmt must be a
	     vect_reduction_def (a phi node), and the rest
	     end in the reduction chain.  */
	  tree type = TREE_TYPE (oprnd);
	  if ((oprnd_info->first_dt != dt
	       && !(oprnd_info->first_dt == vect_reduction_def
		    && !STMT_VINFO_DATA_REF (stmt_info)
		    && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
		    && def_stmt_info
		    && !STMT_VINFO_DATA_REF (def_stmt_info)
		    && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
			== REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
	       && !((oprnd_info->first_dt == vect_external_def
		     || oprnd_info->first_dt == vect_constant_def)
		    && (dt == vect_external_def
			|| dt == vect_constant_def)))
	      || !types_compatible_p (oprnd_info->first_op_type, type)
	      || (!STMT_VINFO_DATA_REF (stmt_info)
		  && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
		  && ((!def_stmt_info
		       || STMT_VINFO_DATA_REF (def_stmt_info)
		       || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
			   != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
		      != (oprnd_info->first_dt != vect_reduction_def))))
	    {
	      /* Try swapping operands if we got a mismatch.  */
	      if (i == commutative_op && !swapped)
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_NOTE, vect_location,
				     "trying swapped operands\n");
		  swapped = true;
		  goto again;
		}

	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: different types\n");

	      return 1;
	    }
	  if ((dt == vect_constant_def
	       || dt == vect_external_def)
	      && !GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
	      && (TREE_CODE (type) == BOOLEAN_TYPE
		  || !can_duplicate_and_interleave_p (vinfo, stmts.length (),
						      type)))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: invalid type of def "
				 "for variable-length SLP %T\n", oprnd);
	      return -1;
	    }
	}

      /* Check the types of the definitions.  */
      switch (dt)
	{
	case vect_external_def:
	  /* Make sure to demote the overall operand to external.  */
	  oprnd_info->first_dt = vect_external_def;
	  /* Fallthru.  */
	case vect_constant_def:
	  oprnd_info->def_stmts.quick_push (NULL);
	  oprnd_info->ops.quick_push (oprnd);
	  break;

	case vect_internal_def:
	case vect_reduction_def:
	  if (oprnd_info->first_dt == vect_reduction_def
	      && !STMT_VINFO_DATA_REF (stmt_info)
	      && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
	      && !STMT_VINFO_DATA_REF (def_stmt_info)
	      && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
		  == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
	    {
	      /* For a SLP reduction chain we want to duplicate the
		 reduction to each of the chain members.  That gets
		 us a sane SLP graph (still the stmts are not 100%
		 correct wrt the initial values).  */
	      gcc_assert (!first);
	      oprnd_info->def_stmts.quick_push (oprnd_info->def_stmts[0]);
	      oprnd_info->ops.quick_push (oprnd_info->ops[0]);
	      break;
	    }
	  /* Fallthru.  */
	case vect_induction_def:
	  oprnd_info->def_stmts.quick_push (def_stmt_info);
	  oprnd_info->ops.quick_push (oprnd);
	  break;

	default:
	  /* FORNOW: Not supported.  */
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: illegal type of def %T\n",
			     oprnd);

	  return -1;
	}
    }

  /* Swap operands.  */
  if (swapped)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "swapped operands to match def types in %G",
			 stmt_info->stmt);
    }

  return 0;
}
/* Try to assign vector type VECTYPE to STMT_INFO for BB vectorization.
   Return true if we can, meaning that this choice doesn't conflict with
   existing SLP nodes that use STMT_INFO.  */

static bool
vect_update_shared_vectype (stmt_vec_info stmt_info, tree vectype)
{
  tree old_vectype = STMT_VINFO_VECTYPE (stmt_info);
  if (old_vectype && useless_type_conversion_p (vectype, old_vectype))
    return true;

  if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
      && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
    {
      /* We maintain the invariant that if any statement in the group is
	 used, all other members of the group have the same vector type.  */
      stmt_vec_info first_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
      stmt_vec_info member_info = first_info;
      for (; member_info; member_info = DR_GROUP_NEXT_ELEMENT (member_info))
	if (STMT_VINFO_NUM_SLP_USES (member_info) > 0
	    || is_pattern_stmt_p (member_info))
	  break;
      if (!member_info)
	{
	  for (member_info = first_info; member_info;
	       member_info = DR_GROUP_NEXT_ELEMENT (member_info))
	    STMT_VINFO_VECTYPE (member_info) = vectype;
	  return true;
	}
    }
  else if (STMT_VINFO_NUM_SLP_USES (stmt_info) == 0
	   && !is_pattern_stmt_p (stmt_info))
    {
      STMT_VINFO_VECTYPE (stmt_info) = vectype;
      return true;
    }

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
		       "Build SLP failed: incompatible vector"
		       " types for: %G", stmt_info->stmt);
      dump_printf_loc (MSG_NOTE, vect_location,
		       "  old vector type: %T\n", old_vectype);
      dump_printf_loc (MSG_NOTE, vect_location,
		       "  new vector type: %T\n", vectype);
    }
  return false;
}
/* Return true if call statements CALL1 and CALL2 are similar enough
   to be combined into the same SLP group.  */

static bool
compatible_calls_p (gcall *call1, gcall *call2)
{
  unsigned int nargs = gimple_call_num_args (call1);
  if (nargs != gimple_call_num_args (call2))
    return false;

  if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
    return false;

  if (gimple_call_internal_p (call1))
    {
      if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
			       TREE_TYPE (gimple_call_lhs (call2))))
	return false;
      for (unsigned int i = 0; i < nargs; ++i)
	if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
				 TREE_TYPE (gimple_call_arg (call2, i))))
	  return false;
    }
  else
    {
      if (!operand_equal_p (gimple_call_fn (call1),
			    gimple_call_fn (call2), 0))
	return false;

      if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
	return false;
    }
  return true;
}
/* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
   caller's attempt to find the vector type in STMT_INFO with the narrowest
   element type.  Return true if VECTYPE is nonnull and if it is valid
   for STMT_INFO.  When returning true, update MAX_NUNITS to reflect the
   number of units in VECTYPE.  GROUP_SIZE and MAX_NUNITS are as for
   vect_build_slp_tree.  */

static bool
vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
			unsigned int group_size,
			tree vectype, poly_uint64 *max_nunits)
{
  if (!vectype)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "Build SLP failed: unsupported data-type in %G\n",
			 stmt_info->stmt);
      /* Fatal mismatch.  */
      return false;
    }

  /* If populating the vector type requires unrolling then fail
     before adjusting *max_nunits for basic-block vectorization.  */
  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
  unsigned HOST_WIDE_INT const_nunits;
  if (is_a <bb_vec_info> (vinfo)
      && (!nunits.is_constant (&const_nunits)
	  || const_nunits > group_size))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "Build SLP failed: unrolling required "
			 "in basic block SLP\n");
      /* Fatal mismatch.  */
      return false;
    }

  /* In case of multiple types we need to detect the smallest type.  */
  vect_update_max_nunits (max_nunits, vectype);
  return true;
}
/* Verify if the scalar stmts STMTS are isomorphic, require data
   permutation or are of unsupported types of operation.  Return
   true if they are, otherwise return false and indicate in *MATCHES
   which stmts are not isomorphic to the first one.  If MATCHES[0]
   is false then this indicates the comparison could not be
   carried out or the stmts will never be vectorized by SLP.

   Note COND_EXPR is possibly isomorphic to another one after swapping its
   operands.  Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
   the first stmt by swapping the two operands of comparison; set SWAP[i]
   to 2 if stmt I is isomorphic to the first stmt by inverting the code
   of comparison.  Take A1 >= B1 ? X1 : Y1 as an example, it can be swapped
   to (B1 <= A1 ? X1 : Y1); or be inverted to (A1 < B1) ? Y1 : X1.  */
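
/* Illustrative sketch of the "two operators" case handled below (not
   part of the interface): the group

     x0 = a0 + b0;  x1 = a1 - b1;  x2 = a2 + b2;  x3 = a3 - b3;

   is not isomorphic, but PLUS/MINUS mismatches are tolerated and later
   resolved by computing both vector operations and blending the lanes
   with a VEC_PERM_EXPR node.  */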
static bool
vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
		       vec<stmt_vec_info> stmts, unsigned int group_size,
		       poly_uint64 *max_nunits, bool *matches,
		       bool *two_operators, tree *node_vectype)
{
  unsigned int i;
  stmt_vec_info first_stmt_info = stmts[0];
  enum tree_code first_stmt_code = ERROR_MARK;
  enum tree_code alt_stmt_code = ERROR_MARK;
  enum tree_code rhs_code = ERROR_MARK;
  enum tree_code first_cond_code = ERROR_MARK;
  tree lhs;
  bool need_same_oprnds = false;
  tree vectype = NULL_TREE, first_op1 = NULL_TREE;
  optab optab;
  int icode;
  machine_mode optab_op2_mode;
  machine_mode vec_mode;
  stmt_vec_info first_load = NULL, prev_first_load = NULL;
  bool load_p = false;

  /* For every stmt in NODE find its def stmt/s.  */
  stmt_vec_info stmt_info;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    {
      gimple *stmt = stmt_info->stmt;
      swap[i] = 0;
      matches[i] = false;

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);

      /* Fail to vectorize statements marked as unvectorizable.  */
      if (!STMT_VINFO_VECTORIZABLE (stmt_info))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: unvectorizable statement %G",
			     stmt);
	  /* Fatal mismatch.  */
	  matches[0] = false;
	  return false;
	}

      lhs = gimple_get_lhs (stmt);
      if (lhs == NULL_TREE)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: not GIMPLE_ASSIGN nor "
			     "GIMPLE_CALL %G", stmt);
	  /* Fatal mismatch.  */
	  matches[0] = false;
	  return false;
	}

      tree nunits_vectype;
      if (!vect_get_vector_types_for_stmt (vinfo, stmt_info, &vectype,
					   &nunits_vectype, group_size)
	  || (nunits_vectype
	      && !vect_record_max_nunits (vinfo, stmt_info, group_size,
					  nunits_vectype, max_nunits)))
	{
	  /* Fatal mismatch.  */
	  matches[0] = false;
	  return false;
	}

      gcc_assert (vectype);

      if (is_a <bb_vec_info> (vinfo)
	  && !vect_update_shared_vectype (stmt_info, vectype))
	continue;

      gcall *call_stmt = dyn_cast <gcall *> (stmt);
      if (call_stmt)
	{
	  rhs_code = CALL_EXPR;

	  if (gimple_call_internal_p (stmt, IFN_MASK_LOAD))
	    load_p = true;
	  else if ((gimple_call_internal_p (call_stmt)
		    && (!vectorizable_internal_fn_p
			(gimple_call_internal_fn (call_stmt))))
		   || gimple_call_tail_p (call_stmt)
		   || gimple_call_noreturn_p (call_stmt)
		   || !gimple_call_nothrow_p (call_stmt)
		   || gimple_call_chain (call_stmt))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: unsupported call type %G",
				 call_stmt);
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }
	}
      else
	{
	  rhs_code = gimple_assign_rhs_code (stmt);
	  load_p = TREE_CODE_CLASS (rhs_code) == tcc_reference;
	}

      /* Check the operation.  */
      if (i == 0)
	{
	  *node_vectype = vectype;
	  first_stmt_code = rhs_code;

	  /* Shift arguments should be equal in all the packed stmts for a
	     vector shift with scalar shift operand.  */
	  if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
	      || rhs_code == LROTATE_EXPR
	      || rhs_code == RROTATE_EXPR)
	    {
	      vec_mode = TYPE_MODE (vectype);

	      /* First see if we have a vector/vector shift.  */
	      optab = optab_for_tree_code (rhs_code, vectype, optab_vector);

	      if (!optab
		  || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
		{
		  /* No vector/vector shift, try for a vector/scalar shift.  */
		  optab = optab_for_tree_code (rhs_code, vectype,
					       optab_scalar);

		  if (!optab)
		    {
		      if (dump_enabled_p ())
			dump_printf_loc (MSG_MISSED_OPTIMIZATION,
					 vect_location,
					 "Build SLP failed: no optab.\n");
		      /* Fatal mismatch.  */
		      matches[0] = false;
		      return false;
		    }
		  icode = (int) optab_handler (optab, vec_mode);
		  if (icode == CODE_FOR_nothing)
		    {
		      if (dump_enabled_p ())
			dump_printf_loc (MSG_MISSED_OPTIMIZATION,
					 vect_location,
					 "Build SLP failed: "
					 "op not supported by target.\n");
		      /* Fatal mismatch.  */
		      matches[0] = false;
		      return false;
		    }
		  optab_op2_mode = insn_data[icode].operand[2].mode;
		  if (!VECTOR_MODE_P (optab_op2_mode))
		    {
		      need_same_oprnds = true;
		      first_op1 = gimple_assign_rhs2 (stmt);
		    }
		}
	    }
	  else if (rhs_code == WIDEN_LSHIFT_EXPR)
	    {
	      need_same_oprnds = true;
	      first_op1 = gimple_assign_rhs2 (stmt);
	    }
	  else if (call_stmt
		   && gimple_call_internal_p (call_stmt, IFN_DIV_POW2))
	    {
	      need_same_oprnds = true;
	      first_op1 = gimple_call_arg (call_stmt, 1);
	    }
	}
      else
	{
	  if (first_stmt_code != rhs_code
	      && alt_stmt_code == ERROR_MARK)
	    alt_stmt_code = rhs_code;
	  if (first_stmt_code != rhs_code
	      && (first_stmt_code != IMAGPART_EXPR
		  || rhs_code != REALPART_EXPR)
	      && (first_stmt_code != REALPART_EXPR
		  || rhs_code != IMAGPART_EXPR)
	      /* Handle mismatches in plus/minus by computing both
		 and merging the results.  */
	      && !((first_stmt_code == PLUS_EXPR
		    || first_stmt_code == MINUS_EXPR)
		   && (alt_stmt_code == PLUS_EXPR
		       || alt_stmt_code == MINUS_EXPR)
		   && rhs_code == alt_stmt_code)
	      && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
		   && (first_stmt_code == ARRAY_REF
		       || first_stmt_code == BIT_FIELD_REF
		       || first_stmt_code == INDIRECT_REF
		       || first_stmt_code == COMPONENT_REF
		       || first_stmt_code == MEM_REF)))
	    {
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "Build SLP failed: different operation "
				   "in stmt %G", stmt);
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "original stmt %G", first_stmt_info->stmt);
		}
	      /* Mismatch.  */
	      continue;
	    }

	  if (need_same_oprnds)
	    {
	      tree other_op1 = (call_stmt
				? gimple_call_arg (call_stmt, 1)
				: gimple_assign_rhs2 (stmt));
	      if (!operand_equal_p (first_op1, other_op1, 0))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: different shift "
				     "arguments in %G", stmt);
		  /* Mismatch.  */
		  continue;
		}
	    }

	  if (!load_p && rhs_code == CALL_EXPR)
	    {
	      if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
				       as_a <gcall *> (stmt)))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: different calls in %G",
				     stmt);
		  /* Mismatch.  */
		  continue;
		}
	    }
	}

      /* Grouped store or load.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
	{
	  if (REFERENCE_CLASS_P (lhs))
	    {
	      /* Store.  */
	      ;
	    }
	  else
	    {
	      /* Load.  */
	      first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
	      if (prev_first_load)
		{
		  /* Check that there are no loads from different interleaving
		     chains in the same node.  */
		  if (prev_first_load != first_load)
		    {
		      if (dump_enabled_p ())
			dump_printf_loc (MSG_MISSED_OPTIMIZATION,
					 vect_location,
					 "Build SLP failed: different "
					 "interleaving chains in one node %G",
					 stmt);
		      /* Mismatch.  */
		      continue;
		    }
		}
	      else
		prev_first_load = first_load;
	    }
	} /* Grouped access.  */
      else
	{
	  if (load_p)
	    {
	      /* Not grouped load.  */
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: not grouped load %G", stmt);

	      /* FORNOW: Not grouped loads are not supported.  */
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }

	  /* Not memory operation.  */
	  if (TREE_CODE_CLASS (rhs_code) != tcc_binary
	      && TREE_CODE_CLASS (rhs_code) != tcc_unary
	      && TREE_CODE_CLASS (rhs_code) != tcc_expression
	      && TREE_CODE_CLASS (rhs_code) != tcc_comparison
	      && rhs_code != VIEW_CONVERT_EXPR
	      && rhs_code != CALL_EXPR)
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: operation unsupported %G",
				 stmt);
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }

	  if (rhs_code == COND_EXPR)
	    {
	      tree cond_expr = gimple_assign_rhs1 (stmt);
	      enum tree_code cond_code = TREE_CODE (cond_expr);
	      enum tree_code swap_code = ERROR_MARK;
	      enum tree_code invert_code = ERROR_MARK;

	      if (i == 0)
		first_cond_code = TREE_CODE (cond_expr);
	      else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
		{
		  bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
		  swap_code = swap_tree_comparison (cond_code);
		  invert_code = invert_tree_comparison (cond_code, honor_nans);
		}

	      if (first_cond_code == cond_code)
		;
	      /* Isomorphic can be achieved by swapping.  */
	      else if (first_cond_code == swap_code)
		swap[i] = 1;
	      /* Isomorphic can be achieved by inverting.  */
	      else if (first_cond_code == invert_code)
		swap[i] = 2;
	      else
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: different"
				     " operation %G", stmt);
		  /* Mismatch.  */
		  continue;
		}
	    }
	}

      matches[i] = true;
    }

  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      return false;

  /* If we allowed a two-operation SLP node verify the target can cope
     with the permute we are going to use.  */
  if (alt_stmt_code != ERROR_MARK
      && TREE_CODE_CLASS (alt_stmt_code) != tcc_reference)
    *two_operators = true;

  return true;
}
/* Traits for the hash_set to record failed SLP builds for a stmt set.
   Note we never remove apart from at destruction time so we do not
   need a special value for deleted that differs from empty.  */
struct bst_traits
{
  typedef vec <stmt_vec_info> value_type;
  typedef vec <stmt_vec_info> compare_type;
  static inline hashval_t hash (value_type);
  static inline bool equal (value_type existing, value_type candidate);
  static inline bool is_empty (value_type x) { return !x.exists (); }
  static inline bool is_deleted (value_type x) { return !x.exists (); }
  static const bool empty_zero_p = true;
  static inline void mark_empty (value_type &x) { x.release (); }
  static inline void mark_deleted (value_type &x) { x.release (); }
  static inline void remove (value_type &x) { x.release (); }
};
inline hashval_t
bst_traits::hash (value_type x)
{
  inchash::hash h;
  for (unsigned i = 0; i < x.length (); ++i)
    h.add_int (gimple_uid (x[i]->stmt));
  return h.end ();
}
inline bool
bst_traits::equal (value_type existing, value_type candidate)
{
  if (existing.length () != candidate.length ())
    return false;
  for (unsigned i = 0; i < existing.length (); ++i)
    if (existing[i] != candidate[i])
      return false;
  return true;
}

typedef hash_map <vec <gimple *>, slp_tree,
		  simple_hashmap_traits <bst_traits, slp_tree> >
  scalar_stmts_to_slp_tree_map_t;
static slp_tree
vect_build_slp_tree_2 (vec_info *vinfo,
		       vec<stmt_vec_info> stmts, unsigned int group_size,
		       poly_uint64 *max_nunits,
		       bool *matches, unsigned *npermutes, unsigned *tree_size,
		       scalar_stmts_to_slp_tree_map_t *bst_map);
static slp_tree
vect_build_slp_tree (vec_info *vinfo,
		     vec<stmt_vec_info> stmts, unsigned int group_size,
		     poly_uint64 *max_nunits,
		     bool *matches, unsigned *npermutes, unsigned *tree_size,
		     scalar_stmts_to_slp_tree_map_t *bst_map)
{
  if (slp_tree *leader = bst_map->get (stmts))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
			 *leader ? "" : "failed ", *leader);
      if (*leader)
	{
	  (*leader)->refcnt++;
	  vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
	}
      return *leader;
    }
  poly_uint64 this_max_nunits = 1;
  slp_tree res = vect_build_slp_tree_2 (vinfo, stmts, group_size,
					&this_max_nunits,
					matches, npermutes, tree_size,
					bst_map);
  if (res)
    {
      res->max_nunits = this_max_nunits;
      vect_update_max_nunits (max_nunits, this_max_nunits);
      /* Keep a reference for the bst_map use.  */
      res->refcnt++;
    }
  bst_map->put (stmts.copy (), res);
  return res;
}
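
/* Note on the cache (an explanatory sketch): because identical stmt
   vectors hash to the same bst_map entry, a subtree such as the defs of
   t = a[0] + a[1] that feeds two different roots is built only once and
   re-used, so the "SLP tree" is really a DAG.  A recorded NULL entry
   remembers that a build already failed for that stmt set.  */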
/* Recursively build an SLP tree starting from NODE.
   Fail (and return NULL) if def-stmts are not isomorphic, require data
   permutation or are of unsupported types of operation.  Otherwise,
   return the built SLP node; *MATCHES records which lanes matched the
   first one when the build fails.  */

static slp_tree
vect_build_slp_tree_2 (vec_info *vinfo,
		       vec<stmt_vec_info> stmts, unsigned int group_size,
		       poly_uint64 *max_nunits,
		       bool *matches, unsigned *npermutes, unsigned *tree_size,
		       scalar_stmts_to_slp_tree_map_t *bst_map)
{
  unsigned nops, i, this_tree_size = 0;
  poly_uint64 this_max_nunits = *max_nunits;
  slp_tree node;

  matches[0] = false;

  stmt_vec_info stmt_info = stmts[0];
  if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
    nops = gimple_call_num_args (stmt);
  else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
	nops++;
    }
  else if (is_a <gphi *> (stmt_info->stmt))
    nops = 0;
  else
    return NULL;

  /* If the SLP node is a PHI (induction or reduction), terminate
     the recursion.  */
  if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
    {
      tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
      tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
      if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
				   max_nunits))
	return NULL;

      vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
      /* Induction from different IVs is not supported.  */
      if (def_type == vect_induction_def)
	{
	  stmt_vec_info other_info;
	  FOR_EACH_VEC_ELT (stmts, i, other_info)
	    if (stmt_info != other_info)
	      return NULL;
	}
      else if (def_type == vect_reduction_def
	       || def_type == vect_double_reduction_def
	       || def_type == vect_nested_cycle)
	{
	  /* Else def types have to match.  */
	  stmt_vec_info other_info;
	  FOR_EACH_VEC_ELT (stmts, i, other_info)
	    if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
	      return NULL;
	}
      else
	return NULL;
      (*tree_size)++;
      node = vect_create_new_slp_node (stmts, 0);
      SLP_TREE_VECTYPE (node) = vectype;
      return node;
    }

  bool two_operators = false;
  unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
  tree vectype = NULL_TREE;
  if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
			      &this_max_nunits, matches, &two_operators,
			      &vectype))
    return NULL;

  /* If the SLP node is a load, terminate the recursion unless masked.  */
  if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
      && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
    {
      if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
	{
	  /* Masked load.  */
	  gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD));
	  nops = 1;
	}
      else
	{
	  *max_nunits = this_max_nunits;
	  (*tree_size)++;
	  node = vect_create_new_slp_node (stmts, 0);
	  SLP_TREE_VECTYPE (node) = vectype;
	  /* And compute the load permutation.  Whether it is actually
	     a permutation depends on the unrolling factor which is
	     decided later.  */
	  vec<unsigned> load_permutation;
	  int j;
	  stmt_vec_info load_info;
	  load_permutation.create (group_size);
	  stmt_vec_info first_stmt_info
	    = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
	  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
	    {
	      int load_place = vect_get_place_in_interleaving_chain
		  (load_info, first_stmt_info);
	      gcc_assert (load_place != -1);
	      load_permutation.safe_push (load_place);
	    }
	  SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
	  return node;
	}
    }

  /* Get at the operands, verifying they are compatible.  */
  vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
  slp_oprnd_info oprnd_info;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    {
      int res = vect_get_and_check_slp_defs (vinfo, &swap[i],
					     stmts, i, &oprnds_info);
      if (res != 0)
	matches[(res == -1) ? 0 : i] = false;
      if (!matches[0])
	break;
    }
  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      {
	vect_free_oprnd_info (oprnds_info);
	return NULL;
      }

  auto_vec<slp_tree, 4> children;

  stmt_info = stmts[0];

  /* Create SLP_TREE nodes for the definition node/s.  */
  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      slp_tree child;
      unsigned int j;

      if (oprnd_info->first_dt == vect_uninitialized_def)
	{
	  /* COND_EXPR have one too many eventually if the condition
	     is a SSA name.  */
	  gcc_assert (i == 3 && nops == 4);
	  continue;
	}

      if (oprnd_info->first_dt != vect_internal_def
	  && oprnd_info->first_dt != vect_reduction_def
	  && oprnd_info->first_dt != vect_induction_def)
	{
	  slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
	  SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
	  oprnd_info->ops = vNULL;
	  children.safe_push (invnode);
	  continue;
	}

      if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
					group_size, &this_max_nunits,
					matches, npermutes,
					&this_tree_size, bst_map)) != NULL)
	{
	  oprnd_info->def_stmts = vNULL;
	  children.safe_push (child);
	  continue;
	}

      /* If the SLP build failed fatally and we analyze a basic-block
	 simply treat nodes we fail to build as externally defined
	 (and thus build vectors from the scalar defs).
	 The cost model will reject outright expensive cases.
	 ??? This doesn't treat cases where permutation ultimately
	 fails (or we don't try permutation below).  Ideally we'd
	 even compute a permutation that will end up with the maximum
	 SLP tree size...  */
      if (is_a <bb_vec_info> (vinfo)
	  && !matches[0]
	  /* ??? Rejecting patterns this way doesn't work.  We'd have to
	     do extra work to cancel the pattern so the uses see the
	     scalar version.  */
	  && !is_pattern_stmt_p (stmt_info)
	  && !oprnd_info->any_pattern)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Building vector operands from scalars\n");
	  this_tree_size++;
	  child = vect_create_new_slp_node (oprnd_info->ops);
	  children.safe_push (child);
	  oprnd_info->ops = vNULL;
	  oprnd_info->def_stmts = vNULL;
	  continue;
	}

      /* If the SLP build for operand zero failed and operand zero
	 and one can be commutated try that for the scalar stmts
	 that failed the match.  */
      if (i == 0
	  /* A first scalar stmt mismatch signals a fatal mismatch.  */
	  && matches[0]
	  /* ??? For COND_EXPRs we can swap the comparison operands
	     as well as the arms under some constraints.  */
	  && nops == 2
	  && oprnds_info[1]->first_dt == vect_internal_def
	  && is_gimple_assign (stmt_info->stmt)
	  /* Swapping operands for reductions breaks assumptions later on.  */
	  && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
	  && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
	  /* Do so only if the number of not successful permutes was not more
	     than a cut-off as re-trying the recursive match on
	     possibly each level of the tree would expose exponential
	     behavior.  */
	  && *npermutes < 4)
	{
	  /* See whether we can swap the matching or the non-matching
	     stmt operands.  */
	  bool swap_not_matching = true;
	  do
	    {
	      for (j = 0; j < group_size; ++j)
		{
		  if (matches[j] != !swap_not_matching)
		    continue;
		  stmt_vec_info stmt_info = stmts[j];
		  /* Verify if we can swap operands of this stmt.  */
		  gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
		  if (!stmt
		      || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
		    {
		      if (!swap_not_matching)
			goto fail;
		      swap_not_matching = false;
		      break;
		    }
		}
	    }
	  while (j != group_size);

	  /* Swap mismatched definition stmts.  */
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Re-trying with swapped operands of stmts ");
	  for (j = 0; j < group_size; ++j)
	    if (matches[j] == !swap_not_matching)
	      {
		std::swap (oprnds_info[0]->def_stmts[j],
			   oprnds_info[1]->def_stmts[j]);
		std::swap (oprnds_info[0]->ops[j],
			   oprnds_info[1]->ops[j]);
		if (dump_enabled_p ())
		  dump_printf (MSG_NOTE, "%d ", j);
	      }
	  if (dump_enabled_p ())
	    dump_printf (MSG_NOTE, "\n");
	  /* And try again with scratch 'matches' ...  */
	  bool *tem = XALLOCAVEC (bool, group_size);
	  if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
					    group_size, &this_max_nunits,
					    tem, npermutes,
					    &this_tree_size, bst_map)) != NULL)
	    {
	      oprnd_info->def_stmts = vNULL;
	      children.safe_push (child);
	      continue;
	    }

	  ++*npermutes;
	}

fail:
      gcc_assert (child == NULL);
      FOR_EACH_VEC_ELT (children, j, child)
	vect_free_slp_tree (child, false);
      vect_free_oprnd_info (oprnds_info);
      return NULL;
    }

  vect_free_oprnd_info (oprnds_info);

  /* If we have all children of a non-unary child built up from
     uniform scalars then just throw that away, causing it built up
     from scalars instead.  */
  if (nops > 1
      && is_a <bb_vec_info> (vinfo)
      /* ??? Rejecting patterns this way doesn't work.  We'd have to
	 do extra work to cancel the pattern so the uses see the
	 scalar version.  */
      && !is_pattern_stmt_p (stmt_info))
    {
      slp_tree child;
      unsigned j;
      FOR_EACH_VEC_ELT (children, j, child)
	if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
	    || !vect_slp_tree_uniform_p (child))
	  break;
      if (!child)
	{
	  /* Roll back.  */
	  FOR_EACH_VEC_ELT (children, j, child)
	    vect_free_slp_tree (child, false);

	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Building parent vector operands from "
			     "scalars instead\n");
	  return NULL;
	}
    }

  *tree_size += this_tree_size + 1;
  *max_nunits = this_max_nunits;

  if (two_operators)
    {
      /* ??? We'd likely want to either cache in bst_map sth like
	 { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
	 the true { a+b, a+b, a+b, a+b } ... but there we don't have
	 explicit stmts to put in so the keying on 'stmts' doesn't
	 work (but we have the same issue with nodes that use 'ops').  */
      slp_tree one = new _slp_tree;
      slp_tree two = new _slp_tree;
      SLP_TREE_DEF_TYPE (one) = vect_internal_def;
      SLP_TREE_DEF_TYPE (two) = vect_internal_def;
      SLP_TREE_VECTYPE (one) = vectype;
      SLP_TREE_VECTYPE (two) = vectype;
      SLP_TREE_CHILDREN (one).safe_splice (children);
      SLP_TREE_CHILDREN (two).safe_splice (children);
      slp_tree child;
      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
	child->refcnt++;

      /* Here we record the original defs since this
	 node represents the final lane configuration.  */
      node = vect_create_new_slp_node (stmts, 2);
      SLP_TREE_VECTYPE (node) = vectype;
      SLP_TREE_CODE (node) = VEC_PERM_EXPR;
      SLP_TREE_CHILDREN (node).quick_push (one);
      SLP_TREE_CHILDREN (node).quick_push (two);
      gassign *stmt = as_a <gassign *> (stmts[0]->stmt);
      enum tree_code code0 = gimple_assign_rhs_code (stmt);
      enum tree_code ocode = ERROR_MARK;
      stmt_vec_info ostmt_info;
      unsigned j = 0;
      FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
	{
	  gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
	  if (gimple_assign_rhs_code (ostmt) != code0)
	    {
	      SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (1, i));
	      ocode = gimple_assign_rhs_code (ostmt);
	      j = i;
	    }
	  else
	    SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (0, i));
	}
      SLP_TREE_CODE (one) = code0;
      SLP_TREE_CODE (two) = ocode;
      SLP_TREE_LANES (one) = stmts.length ();
      SLP_TREE_LANES (two) = stmts.length ();
      SLP_TREE_REPRESENTATIVE (one) = stmts[0];
      SLP_TREE_REPRESENTATIVE (two) = stmts[j];
      return node;
    }

  node = vect_create_new_slp_node (stmts, nops);
  SLP_TREE_VECTYPE (node) = vectype;
  SLP_TREE_CHILDREN (node).splice (children);
  return node;
}
/* Dump a single SLP tree NODE.  */

static void
vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
		     slp_tree node)
{
  unsigned i, j;
  slp_tree child;
  stmt_vec_info stmt_info;
  tree op;

  dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
  dump_user_location_t user_loc = loc.get_user_location ();
  dump_printf_loc (metadata, user_loc, "node%s %p (max_nunits=%u, refcnt=%u)\n",
		   SLP_TREE_DEF_TYPE (node) == vect_external_def
		   ? " (external)"
		   : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
		      ? " (constant)"
		      : ""), node,
		   estimated_poly_value (node->max_nunits), node->refcnt);
  if (SLP_TREE_SCALAR_STMTS (node).exists ())
    FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
      dump_printf_loc (metadata, user_loc, "\tstmt %u %G", i, stmt_info->stmt);
  else
    {
      dump_printf_loc (metadata, user_loc, "\t{ ");
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
	dump_printf (metadata, "%T%s ", op,
		     i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
      dump_printf (metadata, "}\n");
    }
  if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
    {
      dump_printf_loc (metadata, user_loc, "\tload permutation {");
      FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
	dump_printf (dump_kind, " %u", j);
      dump_printf (dump_kind, " }\n");
    }
  if (SLP_TREE_LANE_PERMUTATION (node).exists ())
    {
      dump_printf_loc (metadata, user_loc, "\tlane permutation {");
      for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
	dump_printf (dump_kind, " %u[%u]",
		     SLP_TREE_LANE_PERMUTATION (node)[i].first,
		     SLP_TREE_LANE_PERMUTATION (node)[i].second);
      dump_printf (dump_kind, " }\n");
    }
  if (SLP_TREE_CHILDREN (node).is_empty ())
    return;
  dump_printf_loc (metadata, user_loc, "\tchildren");
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    dump_printf (dump_kind, " %p", (void *)child);
  dump_printf (dump_kind, "\n");
}

DEBUG_FUNCTION void
debug (slp_tree node)
{
  debug_dump_context ctx;
  vect_print_slp_tree (MSG_NOTE,
		       dump_location_t::from_location_t (UNKNOWN_LOCATION),
		       node);
}
/* Dump a slp tree NODE using flags specified in DUMP_KIND.  */

static void
vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
		      slp_tree node, hash_set<slp_tree> &visited)
{
  unsigned i;
  slp_tree child;

  if (visited.add (node))
    return;

  vect_print_slp_tree (dump_kind, loc, node);

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_print_slp_graph (dump_kind, loc, child, visited);
}

static void
vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
		      slp_tree entry)
{
  hash_set<slp_tree> visited;
  vect_print_slp_graph (dump_kind, loc, entry, visited);
}
/* Mark the tree rooted at NODE with PURE_SLP.  */

static void
vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
{
  int i;
  stmt_vec_info stmt_info;
  slp_tree child;

  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return;

  if (visited.add (node))
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    STMT_SLP_TYPE (stmt_info) = pure_slp;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_mark_slp_stmts (child, visited);
}

static void
vect_mark_slp_stmts (slp_tree node)
{
  hash_set<slp_tree> visited;
  vect_mark_slp_stmts (node, visited);
}
/* Mark the statements of the tree rooted at NODE as relevant (vect_used).  */

static void
vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
{
  int i;
  stmt_vec_info stmt_info;
  slp_tree child;

  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return;

  if (visited.add (node))
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    {
      gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
		  || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
      STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
    }

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_mark_slp_stmts_relevant (child, visited);
}

static void
vect_mark_slp_stmts_relevant (slp_tree node)
{
  hash_set<slp_tree> visited;
  vect_mark_slp_stmts_relevant (node, visited);
}
/* Copy the SLP subtree rooted at NODE.  */

static slp_tree
slp_copy_subtree (slp_tree node, hash_map<slp_tree, slp_tree> &map)
{
  unsigned i;

  bool existed_p;
  slp_tree &copy_ref = map.get_or_insert (node, &existed_p);
  if (existed_p)
    return copy_ref;

  copy_ref = new _slp_tree;
  slp_tree copy = copy_ref;
  SLP_TREE_DEF_TYPE (copy) = SLP_TREE_DEF_TYPE (node);
  SLP_TREE_VECTYPE (copy) = SLP_TREE_VECTYPE (node);
  SLP_TREE_REPRESENTATIVE (copy) = SLP_TREE_REPRESENTATIVE (node);
  SLP_TREE_LANES (copy) = SLP_TREE_LANES (node);
  copy->max_nunits = node->max_nunits;
  copy->refcnt = 0;
  if (SLP_TREE_SCALAR_STMTS (node).exists ())
    {
      SLP_TREE_SCALAR_STMTS (copy) = SLP_TREE_SCALAR_STMTS (node).copy ();
      stmt_vec_info stmt_info;
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
	STMT_VINFO_NUM_SLP_USES (stmt_info)++;
    }
  if (SLP_TREE_SCALAR_OPS (node).exists ())
    SLP_TREE_SCALAR_OPS (copy) = SLP_TREE_SCALAR_OPS (node).copy ();
  if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
    SLP_TREE_LOAD_PERMUTATION (copy) = SLP_TREE_LOAD_PERMUTATION (node).copy ();
  if (SLP_TREE_LANE_PERMUTATION (node).exists ())
    SLP_TREE_LANE_PERMUTATION (copy) = SLP_TREE_LANE_PERMUTATION (node).copy ();
  if (SLP_TREE_CHILDREN (node).exists ())
    SLP_TREE_CHILDREN (copy) = SLP_TREE_CHILDREN (node).copy ();
  gcc_assert (!SLP_TREE_VEC_STMTS (node).exists ());
  slp_tree child;
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (copy), i, child)
    {
      SLP_TREE_CHILDREN (copy)[i] = slp_copy_subtree (child, map);
      SLP_TREE_CHILDREN (copy)[i]->refcnt++;
    }
  return copy;
}
/* Rearrange the statements of NODE according to PERMUTATION.  */

static void
vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size,
			  vec<unsigned> permutation,
			  hash_set<slp_tree> &visited)
{
  unsigned int i;
  slp_tree child;

  if (visited.add (node))
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_slp_rearrange_stmts (child, group_size, permutation, visited);

  if (SLP_TREE_SCALAR_STMTS (node).exists ())
    {
      gcc_assert (group_size == SLP_TREE_SCALAR_STMTS (node).length ());
      vec<stmt_vec_info> tmp_stmts;
      tmp_stmts.create (group_size);
      tmp_stmts.quick_grow (group_size);
      stmt_vec_info stmt_info;
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
	tmp_stmts[permutation[i]] = stmt_info;
      SLP_TREE_SCALAR_STMTS (node).release ();
      SLP_TREE_SCALAR_STMTS (node) = tmp_stmts;
    }
  if (SLP_TREE_SCALAR_OPS (node).exists ())
    {
      gcc_assert (group_size == SLP_TREE_SCALAR_OPS (node).length ());
      vec<tree> tmp_ops;
      tmp_ops.create (group_size);
      tmp_ops.quick_grow (group_size);
      tree op;
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
	tmp_ops[permutation[i]] = op;
      SLP_TREE_SCALAR_OPS (node).release ();
      SLP_TREE_SCALAR_OPS (node) = tmp_ops;
    }
  if (SLP_TREE_LANE_PERMUTATION (node).exists ())
    {
      gcc_assert (group_size == SLP_TREE_LANE_PERMUTATION (node).length ());
      for (i = 0; i < group_size; ++i)
	SLP_TREE_LANE_PERMUTATION (node)[i].second
	  = permutation[SLP_TREE_LANE_PERMUTATION (node)[i].second];
    }
}
/* Attempt to reorder stmts in a reduction chain so that we don't
   require any load permutation.  Return true if that was possible,
   otherwise return false.  */

static bool
vect_attempt_slp_rearrange_stmts (slp_instance slp_instn)
{
  unsigned int i, j;
  unsigned int lidx;
  slp_tree node, load;

  if (SLP_INSTANCE_LOADS (slp_instn).is_empty ())
    return false;

  /* Compare all the permutation sequences to the first one.  We know
     that at least one load is permuted.  */
  node = SLP_INSTANCE_LOADS (slp_instn)[0];
  if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
    return false;
  unsigned int group_size = SLP_TREE_LOAD_PERMUTATION (node).length ();
  for (i = 1; SLP_INSTANCE_LOADS (slp_instn).iterate (i, &load); ++i)
    {
      if (!SLP_TREE_LOAD_PERMUTATION (load).exists ()
	  || SLP_TREE_LOAD_PERMUTATION (load).length () != group_size)
	return false;
      FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (load), j, lidx)
	if (lidx != SLP_TREE_LOAD_PERMUTATION (node)[j])
	  return false;
    }

  /* Check that the loads in the first sequence are different and there
     are no gaps between them.  */
  auto_sbitmap load_index (group_size);
  bitmap_clear (load_index);
  FOR_EACH_VEC_ELT (node->load_permutation, i, lidx)
    {
      if (lidx >= group_size)
	return false;
      if (bitmap_bit_p (load_index, lidx))
	return false;

      bitmap_set_bit (load_index, lidx);
    }
  for (i = 0; i < group_size; i++)
    if (!bitmap_bit_p (load_index, i))
      return false;

  /* This permutation is valid for reduction.  Since the order of the
     statements in the nodes is not important unless they are memory
     accesses, we can rearrange the statements in all the nodes
     according to the order of the loads.  */

  /* We have to unshare the SLP tree we modify.  */
  hash_map<slp_tree, slp_tree> map;
  slp_tree unshared = slp_copy_subtree (SLP_INSTANCE_TREE (slp_instn), map);
  vect_free_slp_tree (SLP_INSTANCE_TREE (slp_instn), false);
  unshared->refcnt++;
  SLP_INSTANCE_TREE (slp_instn) = unshared;
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    SLP_INSTANCE_LOADS (slp_instn)[i] = *map.get (node);
  node = SLP_INSTANCE_LOADS (slp_instn)[0];

  /* Do the actual re-arrangement.  */
  hash_set<slp_tree> visited;
  vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
			    node->load_permutation, visited);

  /* We are done, no actual permutations need to be generated.  */
  poly_uint64 unrolling_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_instn);
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    {
      stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
      first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
      /* But we have to keep those permutations that are required because
	 of handling of gaps.  */
      if (known_eq (unrolling_factor, 1U)
	  || (group_size == DR_GROUP_SIZE (first_stmt_info)
	      && DR_GROUP_GAP (first_stmt_info) == 0))
	SLP_TREE_LOAD_PERMUTATION (node).release ();
      else
	for (j = 0; j < SLP_TREE_LOAD_PERMUTATION (node).length (); ++j)
	  SLP_TREE_LOAD_PERMUTATION (node)[j] = j;
    }

  return true;
}
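
/* Illustrative example (assumed input, not taken from the function
   above): if every load node of a reduction-chain instance carries the
   load permutation { 2, 0, 3, 1 }, the scalar stmts of all nodes can be
   reordered by that permutation instead, after which the loads are
   in-order and the permutation can be dropped.  */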
/* Gather loads in the SLP graph NODE and populate the INST loads array.  */

static void
vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
		       hash_set<slp_tree> &visited)
{
  if (visited.add (node))
    return;

  if (SLP_TREE_CHILDREN (node).length () == 0)
    {
      if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
	return;
      stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
	  && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
	loads.safe_push (node);
    }
  else
    {
      unsigned i;
      slp_tree child;
      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
	vect_gather_slp_loads (loads, child, visited);
    }
}

static void
vect_gather_slp_loads (slp_instance inst, slp_tree node)
{
  hash_set<slp_tree> visited;
  vect_gather_slp_loads (SLP_INSTANCE_LOADS (inst), node, visited);
}
/* Find the last store in SLP INSTANCE.  */

stmt_vec_info
vect_find_last_scalar_stmt_in_slp (slp_tree node)
{
  stmt_vec_info last = NULL;
  stmt_vec_info stmt_vinfo;

  for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
    {
      stmt_vinfo = vect_orig_stmt (stmt_vinfo);
      last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
    }

  return last;
}
1943 vect_find_first_scalar_stmt_in_slp (slp_tree node
)
1945 stmt_vec_info first
= NULL
;
1946 stmt_vec_info stmt_vinfo
;
1948 for (int i
= 0; SLP_TREE_SCALAR_STMTS (node
).iterate (i
, &stmt_vinfo
); i
++)
1950 stmt_vinfo
= vect_orig_stmt (stmt_vinfo
);
1952 || get_later_stmt (stmt_vinfo
, first
) == first
)
/* Splits a group of stores, currently beginning at FIRST_VINFO, into
   two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
   (also containing the first GROUP1_SIZE stmts, since stores are
   consecutive), the second containing the remainder.
   Return the first stmt in the second group.  */

static stmt_vec_info
vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
{
  gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
  gcc_assert (group1_size > 0);
  int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
  gcc_assert (group2_size > 0);
  DR_GROUP_SIZE (first_vinfo) = group1_size;

  stmt_vec_info stmt_info = first_vinfo;
  for (unsigned i = group1_size; i > 1; i--)
    {
      stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
      gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
    }
  /* STMT is now the last element of the first group.  */
  stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
  DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;

  DR_GROUP_SIZE (group2) = group2_size;
  for (stmt_info = group2; stmt_info;
       stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
    {
      DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
      gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
    }

  /* For the second group, the DR_GROUP_GAP is that before the original group,
     plus skipping over the first vector.  */
  DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;

  /* DR_GROUP_GAP of the first group now has to skip over the second group too.  */
  DR_GROUP_GAP (first_vinfo) += group2_size;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
		     group1_size, group2_size);

  return group2;
}
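
/* Worked example (assumed numbers): splitting a store group of size 6
   at GROUP1_SIZE == 4 yields groups of 4 and 2.  If the original group
   had DR_GROUP_GAP 0, the second group gets DR_GROUP_GAP 0 + 4 and the
   first group's gap grows by 2, so unrolled iterations of either group
   still step over all 6 stores.  */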
/* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
   statements and a vector of NUNITS elements.  */

static poly_uint64
calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
{
  return exact_div (common_multiple (nunits, group_size), group_size);
}
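
/* For example (plain arithmetic, not target-specific): with NUNITS == 4
   and GROUP_SIZE == 6, common_multiple (4, 6) == 12, so the unrolling
   factor is 12 / 6 == 2: two copies of the group exactly fill three
   vectors.  */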
/* Analyze an SLP instance starting from a group of grouped stores.  Call
   vect_build_slp_tree to build a tree of packed stmts if possible.
   Return FALSE if it's impossible to SLP any stmt in the loop.  */

static bool
vect_analyze_slp_instance (vec_info *vinfo,
                           scalar_stmts_to_slp_tree_map_t *bst_map,
                           stmt_vec_info stmt_info, unsigned max_tree_size)
{
  slp_instance new_instance;
  slp_tree node;
  unsigned int group_size;
  tree vectype, scalar_type = NULL_TREE;
  unsigned int i;
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
  vec<stmt_vec_info> scalar_stmts;
  bool constructor = false;

  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    {
      scalar_type = TREE_TYPE (DR_REF (dr));
      group_size = DR_GROUP_SIZE (stmt_info);
      vectype = get_vectype_for_scalar_type (vinfo, scalar_type, group_size);
    }
  else if (!dr && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
    {
      gcc_assert (is_a <loop_vec_info> (vinfo));
      vectype = STMT_VINFO_VECTYPE (stmt_info);
      group_size = REDUC_GROUP_SIZE (stmt_info);
    }
  else if (is_gimple_assign (stmt_info->stmt)
           && gimple_assign_rhs_code (stmt_info->stmt) == CONSTRUCTOR)
    {
      vectype = TREE_TYPE (gimple_assign_rhs1 (stmt_info->stmt));
      group_size = CONSTRUCTOR_NELTS (gimple_assign_rhs1 (stmt_info->stmt));
      constructor = true;
    }
  else
    {
      gcc_assert (is_a <loop_vec_info> (vinfo));
      vectype = STMT_VINFO_VECTYPE (stmt_info);
      group_size = as_a <loop_vec_info> (vinfo)->reductions.length ();
    }

  if (!vectype)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "Build SLP failed: unsupported data-type %T\n",
                         scalar_type);
      return false;
    }
  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);

  /* Create a node (a root of the SLP tree) for the packed grouped stores.  */
  scalar_stmts.create (group_size);
  stmt_vec_info next_info = stmt_info;
  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    {
      /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS.  */
      while (next_info)
        {
          scalar_stmts.safe_push (vect_stmt_to_vectorize (next_info));
          next_info = DR_GROUP_NEXT_ELEMENT (next_info);
        }
    }
  else if (!dr && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
    {
      /* Collect the reduction stmts and store them in
         SLP_TREE_SCALAR_STMTS.  */
      while (next_info)
        {
          scalar_stmts.safe_push (vect_stmt_to_vectorize (next_info));
          next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
        }
      /* Mark the first element of the reduction chain as reduction to properly
         transform the node.  In the reduction analysis phase only the last
         element of the chain is marked as reduction.  */
      STMT_VINFO_DEF_TYPE (stmt_info)
        = STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
      STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
        = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
    }
  else if (constructor)
    {
      tree rhs = gimple_assign_rhs1 (stmt_info->stmt);
      tree val;
      FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), i, val)
        {
          if (TREE_CODE (val) == SSA_NAME)
            {
              gimple *def = SSA_NAME_DEF_STMT (val);
              stmt_vec_info def_info = vinfo->lookup_stmt (def);
              /* Value is defined in another basic block.  */
              if (!def_info)
                return false;
              def_info = vect_stmt_to_vectorize (def_info);
              scalar_stmts.safe_push (def_info);
            }
          else
            return false;
        }
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "Analyzing vectorizable constructor: %G\n",
                         stmt_info->stmt);
    }
  else
    {
      /* Collect reduction statements.  */
      vec<stmt_vec_info> reductions = as_a <loop_vec_info> (vinfo)->reductions;
      for (i = 0; reductions.iterate (i, &next_info); i++)
        scalar_stmts.safe_push (next_info);
    }

  /* Build the tree for the SLP instance.  */
  bool *matches = XALLOCAVEC (bool, group_size);
  unsigned npermutes = 0;
  poly_uint64 max_nunits = nunits;
  unsigned tree_size = 0;
  node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
                              &max_nunits, matches, &npermutes,
                              &tree_size, bst_map);
  if (node != NULL)
    {
      /* Calculate the unrolling factor based on the smallest type.  */
      poly_uint64 unrolling_factor
        = calculate_unrolling_factor (max_nunits, group_size);

      if (maybe_ne (unrolling_factor, 1U)
          && is_a <bb_vec_info> (vinfo))
        {
          unsigned HOST_WIDE_INT const_max_nunits;
          if (!max_nunits.is_constant (&const_max_nunits)
              || const_max_nunits > group_size)
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: store group "
                                 "size not a multiple of the vector size "
                                 "in basic block SLP\n");
              vect_free_slp_tree (node, false);
              return false;
            }
          /* Fatal mismatch.  */
          matches[group_size / const_max_nunits * const_max_nunits] = false;
          vect_free_slp_tree (node, false);
        }
      else
        {
          /* Create a new SLP instance.  */
          new_instance = XNEW (class _slp_instance);
          SLP_INSTANCE_TREE (new_instance) = node;
          SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
          SLP_INSTANCE_LOADS (new_instance) = vNULL;
          SLP_INSTANCE_ROOT_STMT (new_instance) = constructor ? stmt_info : NULL;

          vect_gather_slp_loads (new_instance, node);
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "SLP size %u vs. limit %u.\n",
                             tree_size, max_tree_size);

          /* Check whether any load is possibly permuted.  */
          slp_tree load_node;
          bool loads_permuted = false;
          FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (new_instance), i, load_node)
            {
              if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
                continue;
              unsigned j;
              stmt_vec_info load_info;
              FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
                if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
                  {
                    loads_permuted = true;
                    break;
                  }
            }

          /* If the loads and stores can be handled with load/store-lane
             instructions do not generate this SLP instance.  */
          if (is_a <loop_vec_info> (vinfo)
              && loads_permuted
              && dr && vect_store_lanes_supported (vectype, group_size, false))
            {
              FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (new_instance), i, load_node)
                {
                  stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
                      (SLP_TREE_SCALAR_STMTS (load_node)[0]);
                  /* Use SLP for strided accesses (or if we can't
                     load-lanes).  */
                  if (STMT_VINFO_STRIDED_P (stmt_vinfo)
                      || ! vect_load_lanes_supported
                             (STMT_VINFO_VECTYPE (stmt_vinfo),
                              DR_GROUP_SIZE (stmt_vinfo), false))
                    break;
                }
              if (i == SLP_INSTANCE_LOADS (new_instance).length ())
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "Built SLP cancelled: can use "
                                     "load/store-lanes\n");
                  vect_free_slp_instance (new_instance, false);
                  return false;
                }
            }

          /* If this is a reduction chain with a conversion in front
             amend the SLP tree with a node for that.  */
          if (!dr
              && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
              && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def)
            {
              /* Get at the conversion stmt - we know it's the single use
                 of the last stmt of the reduction chain.  */
              gimple *tem = vect_orig_stmt (scalar_stmts[group_size - 1])->stmt;
              use_operand_p use_p;
              gimple *use_stmt;
              bool r = single_imm_use (gimple_assign_lhs (tem),
                                       &use_p, &use_stmt);
              gcc_assert (r);
              next_info = vinfo->lookup_stmt (use_stmt);
              next_info = vect_stmt_to_vectorize (next_info);
              scalar_stmts = vNULL;
              scalar_stmts.create (group_size);
              for (unsigned i = 0; i < group_size; ++i)
                scalar_stmts.quick_push (next_info);
              slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
              SLP_TREE_VECTYPE (conv) = STMT_VINFO_VECTYPE (next_info);
              SLP_TREE_CHILDREN (conv).quick_push (node);
              SLP_INSTANCE_TREE (new_instance) = conv;
              /* We also have to fake this conversion stmt as SLP reduction
                 group so we don't have to mess with too much code
                 elsewhere.  */
              REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
              REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
            }

          vinfo->slp_instances.safe_push (new_instance);

          /* ???  We've replaced the old SLP_INSTANCE_GROUP_SIZE with
             the number of scalar stmts in the root in a few places.
             Verify that assumption holds.  */
          gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
                        .length () == group_size);

          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_NOTE, vect_location,
                               "Final SLP tree for instance:\n");
              vect_print_slp_graph (MSG_NOTE, vect_location,
                                    SLP_INSTANCE_TREE (new_instance));
            }

          return true;
        }
    }
  else
    {
      /* Failed to SLP.  */
      /* Free the allocated memory.  */
      scalar_stmts.release ();
    }

  /* For basic block SLP, try to break the group up into multiples of the
     vector size.  */
  unsigned HOST_WIDE_INT const_nunits;
  if (is_a <bb_vec_info> (vinfo)
      && STMT_VINFO_GROUPED_ACCESS (stmt_info)
      && DR_GROUP_FIRST_ELEMENT (stmt_info)
      && nunits.is_constant (&const_nunits))
    {
      /* We consider breaking the group only on VF boundaries from the existing
         start.  */
      for (i = 0; i < group_size; i++)
        if (!matches[i]) break;

      if (i >= const_nunits && i < group_size)
        {
          /* Split into two groups at the first vector boundary before i.  */
          gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
          unsigned group1_size = i & ~(const_nunits - 1);

          stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
                                                           group1_size);
          bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
                                                max_tree_size);
          /* If the first non-match was in the middle of a vector,
             skip the rest of that vector.  */
          if (group1_size < i)
            {
              i = group1_size + const_nunits;
              if (i < group_size)
                rest = vect_split_slp_store_group (rest, const_nunits);
            }
          if (i < group_size)
            res |= vect_analyze_slp_instance (vinfo, bst_map,
                                              rest, max_tree_size);
          return res;
        }
      /* Even though the first vector did not all match, we might be able to SLP
         (some) of the remainder.  FORNOW ignore this possibility.  */
    }

  return false;
}
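
/* For example, with GROUP_SIZE == 7, CONST_NUNITS == 4 and the first
   mismatch at i == 5, GROUP1_SIZE becomes 5 & ~3 == 4, so the group is
   split 4 + 3 and the first half re-analyzed.  Because the mismatch fell
   in the middle of the second vector, i advances to 4 + 4 == 8 >= 7 and
   the three-stmt remainder is not analyzed any further.  */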
/* Check if there are stmts in the loop that can be vectorized using SLP.
   Build SLP trees of packed scalar stmts if SLP is possible.  */

opt_result
vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
{
  unsigned int i;
  stmt_vec_info first_element;

  DUMP_VECT_SCOPE ("vect_analyze_slp");

  scalar_stmts_to_slp_tree_map_t *bst_map
    = new scalar_stmts_to_slp_tree_map_t ();

  /* Find SLP sequences starting from groups of grouped stores.  */
  FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
    vect_analyze_slp_instance (vinfo, bst_map, first_element, max_tree_size);

  if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
    {
      if (loop_vinfo->reduction_chains.length () > 0)
        {
          /* Find SLP sequences starting from reduction chains.  */
          FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
            if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
                                             max_tree_size))
              {
                /* Dissolve reduction chain group.  */
                stmt_vec_info vinfo = first_element;
                stmt_vec_info last = NULL;
                while (vinfo)
                  {
                    stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
                    REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
                    REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
                    last = vinfo;
                    vinfo = next;
                  }
                STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
                /* It can be still vectorized as part of an SLP reduction.  */
                loop_vinfo->reductions.safe_push (last);
              }
        }

      /* Find SLP sequences starting from groups of reductions.  */
      if (loop_vinfo->reductions.length () > 1)
        vect_analyze_slp_instance (vinfo, bst_map, loop_vinfo->reductions[0],
                                   max_tree_size);
    }

  /* The map keeps a reference on SLP nodes built, release that.  */
  for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
       it != bst_map->end (); ++it)
    if ((*it).second)
      vect_free_slp_tree ((*it).second, false);
  delete bst_map;

  /* Optimize permutations in SLP reductions.  */
  slp_instance instance;
  FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
    {
      slp_tree node = SLP_INSTANCE_TREE (instance);
      stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
      /* Reduction (there are no data-refs in the root).
         In reduction chain the order of the loads is not important.  */
      if (!STMT_VINFO_DATA_REF (stmt_info)
          && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
        vect_attempt_slp_rearrange_stmts (instance);
    }

  /* Gather all loads in the SLP graph.  */
  hash_set<slp_tree> visited;
  FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
    vect_gather_slp_loads (vinfo->slp_loads, SLP_INSTANCE_TREE (instance),
                           visited);

  return opt_result::success ();
}
void
vect_optimize_slp (vec_info *vinfo)
{
  slp_tree node;
  unsigned i;
  FOR_EACH_VEC_ELT (vinfo->slp_loads, i, node)
    {
      if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
        continue;

      /* In basic block vectorization we allow any subchain of an interleaving
         chain.
         FORNOW: not in loop SLP because of realignment complications.  */
      if (is_a <bb_vec_info> (vinfo))
        {
          bool subchain_p = true;
          stmt_vec_info next_load_info = NULL;
          stmt_vec_info load_info;
          unsigned j;
          FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
            {
              if (j != 0
                  && (next_load_info != load_info
                      || DR_GROUP_GAP (load_info) != 1))
                {
                  subchain_p = false;
                  break;
                }
              next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
            }
          if (subchain_p)
            {
              SLP_TREE_LOAD_PERMUTATION (node).release ();
              continue;
            }
        }
      else
        {
          stmt_vec_info load_info;
          bool this_load_permuted = false;
          unsigned j;
          FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
            if (SLP_TREE_LOAD_PERMUTATION (node)[j] != j)
              {
                this_load_permuted = true;
                break;
              }
          stmt_vec_info first_stmt_info
            = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
          if (!this_load_permuted
              /* The load requires permutation when unrolling exposes
                 a gap either because the group is larger than the SLP
                 group-size or because there is a gap between the groups.  */
              && (known_eq (LOOP_VINFO_VECT_FACTOR
                              (as_a <loop_vec_info> (vinfo)), 1U)
                  || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
                      && DR_GROUP_GAP (first_stmt_info) == 0)))
            {
              SLP_TREE_LOAD_PERMUTATION (node).release ();
              continue;
            }
        }
    }
}
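
/* For example, in BB vectorization a node loading { b[1], b[2] } out of
   an interleaving chain b[0..3] has load permutation { 1, 2 }, but since
   the loads form a consecutive subchain the permutation is released
   above.  */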
/* For each possible SLP instance decide whether to SLP it and calculate overall
   unrolling factor needed to SLP the loop.  Return TRUE if decided to SLP at
   least one instance.  */

bool
vect_make_slp_decision (loop_vec_info loop_vinfo)
{
  unsigned int i;
  poly_uint64 unrolling_factor = 1;
  vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
  slp_instance instance;
  int decided_to_slp = 0;

  DUMP_VECT_SCOPE ("vect_make_slp_decision");

  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      /* FORNOW: SLP if you can.  */
      /* All unroll factors have the form:

           GET_MODE_SIZE (vinfo->vector_mode) * X

         for some rational X, so they must have a common multiple.  */
      unrolling_factor
        = force_common_multiple (unrolling_factor,
                                 SLP_INSTANCE_UNROLLING_FACTOR (instance));

      /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts.  Later we
         call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
         loop-based vectorization.  Such stmts will be marked as HYBRID.  */
      vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
      decided_to_slp++;
    }

  LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;

  if (decided_to_slp && dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location,
                       "Decided to SLP %d instances. Unrolling factor ",
                       decided_to_slp);
      dump_dec (MSG_NOTE, unrolling_factor);
      dump_printf (MSG_NOTE, "\n");
    }

  return (decided_to_slp > 0);
}
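
/* For example, two instances with unrolling factors 2 and 4 combine via
   force_common_multiple (2, 4) == 4, so unrolling the loop four times is
   enough for every instance to fill whole vectors.  */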
/* Private data for vect_detect_hybrid_slp.  */
struct vdhs_data
{
  loop_vec_info loop_vinfo;
  vec<stmt_vec_info> *worklist;
};

/* Walker for walk_gimple_op.  */

static tree
vect_detect_hybrid_slp (tree *tp, int *, void *data)
{
  walk_stmt_info *wi = (walk_stmt_info *)data;
  vdhs_data *dat = (vdhs_data *)wi->info;

  if (wi->is_lhs)
    return NULL_TREE;

  stmt_vec_info def_stmt_info = dat->loop_vinfo->lookup_def (*tp);
  if (!def_stmt_info)
    return NULL_TREE;
  def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
  if (PURE_SLP_STMT (def_stmt_info))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
                         def_stmt_info->stmt);
      STMT_SLP_TYPE (def_stmt_info) = hybrid;
      dat->worklist->safe_push (def_stmt_info);
    }

  return NULL_TREE;
}

/* Find stmts that must be both vectorized and SLPed.  */

void
vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
{
  DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");

  /* All stmts participating in SLP are marked pure_slp, all other
     stmts are loop_vect.
     First collect all loop_vect stmts into a worklist.  */
  auto_vec<stmt_vec_info> worklist;
  for (unsigned i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
    {
      basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
      for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
           gsi_next (&gsi))
        {
          gphi *phi = gsi.phi ();
          stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (phi);
          if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
            worklist.safe_push (stmt_info);
        }
      for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
           gsi_next (&gsi))
        {
          gimple *stmt = gsi_stmt (gsi);
          if (is_gimple_debug (stmt))
            continue;
          stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
          if (STMT_VINFO_IN_PATTERN_P (stmt_info))
            {
              for (gimple_stmt_iterator gsi2
                     = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
                   !gsi_end_p (gsi2); gsi_next (&gsi2))
                {
                  stmt_vec_info patt_info
                    = loop_vinfo->lookup_stmt (gsi_stmt (gsi2));
                  if (!STMT_SLP_TYPE (patt_info)
                      && STMT_VINFO_RELEVANT (patt_info))
                    worklist.safe_push (patt_info);
                }
              stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
            }
          if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
            worklist.safe_push (stmt_info);
        }
    }

  /* Now we have a worklist of non-SLP stmts, follow use->def chains and
     mark any SLP vectorized stmt as hybrid.
     ???  We're visiting def stmts N times (once for each non-SLP and
     once for each hybrid-SLP use).  */
  walk_stmt_info wi;
  vdhs_data dat;
  dat.worklist = &worklist;
  dat.loop_vinfo = loop_vinfo;
  memset (&wi, 0, sizeof (wi));
  wi.info = (void *)&dat;
  while (!worklist.is_empty ())
    {
      stmt_vec_info stmt_info = worklist.pop ();
      /* Since SSA operands are not set up for pattern stmts we need
         to use walk_gimple_op.  */
      walk_gimple_op (stmt_info->stmt, vect_detect_hybrid_slp, &wi);
    }
}
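
/* For example (a sketch): if a = b + c is pure SLP but the non-SLP stmt
   e = a - 1 also uses a, the walk from e reaches a's definition and
   downgrades it to hybrid, so the non-SLP use is also covered by
   loop-based vectorization.  */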
/* Initialize a bb_vec_info struct for the statements between
   REGION_BEGIN_IN (inclusive) and REGION_END_IN (exclusive).  */

_bb_vec_info::_bb_vec_info (gimple_stmt_iterator region_begin_in,
                            gimple_stmt_iterator region_end_in,
                            vec_info_shared *shared)
  : vec_info (vec_info::bb, init_cost (NULL), shared),
    bb (gsi_bb (region_begin_in)),
    region_begin (region_begin_in),
    region_end (region_end_in)
{
  for (gimple *stmt : this->region_stmts ())
    {
      gimple_set_uid (stmt, 0);
      if (is_gimple_debug (stmt))
        continue;
      add_stmt (stmt);
    }

  bb->aux = this;
}

/* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
   stmts in the basic block.  */

_bb_vec_info::~_bb_vec_info ()
{
  for (gimple *stmt : this->region_stmts ())
    /* Reset region marker.  */
    gimple_set_uid (stmt, -1);

  bb->aux = NULL;
}
/* Subroutine of vect_slp_analyze_node_operations.  Handle the root of NODE,
   given that child nodes have already been processed, and that
   their def types currently match their SLP node's def type.  */

static bool
vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
                                    slp_instance node_instance,
                                    stmt_vector_for_cost *cost_vec)
{
  stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
  gcc_assert (STMT_SLP_TYPE (stmt_info) != loop_vect);

  /* Calculate the number of vector statements to be created for the
     scalar stmts in this node.  For SLP reductions it is equal to the
     number of vector statements in the children (which has already been
     calculated by the recursive call).  Otherwise it is the number of
     scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
     VF divided by the number of elements in a vector.  */
  if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
      && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
    {
      for (unsigned i = 0; i < SLP_TREE_CHILDREN (node).length (); ++i)
        if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node)[i]) == vect_internal_def)
          {
            SLP_TREE_NUMBER_OF_VEC_STMTS (node)
              = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[i]);
            break;
          }
    }
  else
    {
      poly_uint64 vf;
      if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
        vf = loop_vinfo->vectorization_factor;
      else
        vf = 1;
      unsigned int group_size = SLP_TREE_LANES (node);
      tree vectype = SLP_TREE_VECTYPE (node);
      SLP_TREE_NUMBER_OF_VEC_STMTS (node)
        = vect_get_num_vectors (vf * group_size, vectype);
    }

  /* Handle purely internal nodes.  */
  if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
    return vectorizable_slp_permutation (vinfo, NULL, node, cost_vec);

  bool dummy;
  return vect_analyze_stmt (vinfo, stmt_info, &dummy,
                            node, node_instance, cost_vec);
}
/* Try to build NODE from scalars, returning true on success.
   NODE_INSTANCE is the SLP instance that contains NODE.  */

static bool
vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
                              slp_instance node_instance)
{
  stmt_vec_info stmt_info;
  unsigned int i;

  if (!is_a <bb_vec_info> (vinfo)
      || node == SLP_INSTANCE_TREE (node_instance)
      || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node)))
    return false;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "Building vector operands from scalars instead\n");

  /* Don't remove and free the child nodes here, since they could be
     referenced by other structures.  The analysis and scheduling phases
     (need to) ignore child nodes of anything that isn't vect_internal_def.  */
  unsigned int group_size = SLP_TREE_LANES (node);
  SLP_TREE_DEF_TYPE (node) = vect_external_def;
  SLP_TREE_SCALAR_OPS (node).safe_grow (group_size);
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    {
      tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
      SLP_TREE_SCALAR_OPS (node)[i] = lhs;
    }
  return true;
}
/* Compute the prologue cost for invariant or constant operands represented
   by NODE.  */

static void
vect_prologue_cost_for_slp (slp_tree node,
                            stmt_vector_for_cost *cost_vec)
{
  /* Without looking at the actual initializer a vector of
     constants can be implemented as load from the constant pool.
     When all elements are the same we can use a splat.  */
  tree vectype = SLP_TREE_VECTYPE (node);
  unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
  unsigned num_vects_to_check;
  unsigned HOST_WIDE_INT const_nunits;
  unsigned nelt_limit;
  if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
      && ! multiple_p (const_nunits, group_size))
    {
      num_vects_to_check = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
      nelt_limit = const_nunits;
    }
  else
    {
      /* If either the vector has variable length or the vectors
         are composed of repeated whole groups we only need to
         cost construction once.  All vectors will be the same.  */
      num_vects_to_check = 1;
      nelt_limit = group_size;
    }
  tree elt = NULL_TREE;
  unsigned nelt = 0;
  for (unsigned j = 0; j < num_vects_to_check * nelt_limit; ++j)
    {
      unsigned si = j % group_size;
      if (nelt == 0)
        elt = SLP_TREE_SCALAR_OPS (node)[si];
      /* ???  We're just tracking whether all operands of a single
         vector initializer are the same, ideally we'd check if
         we emitted the same one already.  */
      else if (elt != SLP_TREE_SCALAR_OPS (node)[si])
        elt = NULL_TREE;
      nelt++;
      if (nelt == nelt_limit)
        {
          record_stmt_cost (cost_vec, 1,
                            SLP_TREE_DEF_TYPE (node) == vect_external_def
                            ? (elt ? scalar_to_vec : vec_construct)
                            : vector_load,
                            NULL, vectype, 0, vect_prologue);
          nelt = 0;
        }
    }
}
/* Analyze statements contained in SLP tree NODE after recursively analyzing
   the subtree.  NODE_INSTANCE contains NODE and VINFO contains INSTANCE.

   Return true if the operations are supported.  */

static bool
vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
                                  slp_instance node_instance,
                                  hash_set<slp_tree> &visited,
                                  hash_set<slp_tree> &lvisited,
                                  stmt_vector_for_cost *cost_vec)
{
  int i, j;
  slp_tree child;

  /* Assume we can code-generate all invariants.  */
  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return true;

  /* If we already analyzed the exact same set of scalar stmts we're done.
     We share the generated vector stmts for those.
     The SLP graph is acyclic so not caching whether we failed or succeeded
     doesn't result in any issue since we throw away the lvisited set
     when we fail.  */
  if (visited.contains (node)
      || lvisited.add (node))
    return true;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (!vect_slp_analyze_node_operations (vinfo, child, node_instance,
                                           visited, lvisited, cost_vec))
      return false;

  bool res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
                                                 cost_vec);

  /* When the node can be vectorized cost invariant nodes it references.
     This is not done in DFS order to allow the referring node
     vectorizable_* calls to nail down the invariant nodes vector type
     and possibly unshare it if it needs a different vector type than
     the first use.  */
  if (res)
    FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
      if ((SLP_TREE_DEF_TYPE (child) == vect_constant_def
           || SLP_TREE_DEF_TYPE (child) == vect_external_def)
          /* Perform usual caching, note code-generation still
             code-gens these nodes multiple times but we expect
             to CSE them later.  */
          && !visited.contains (child)
          && !lvisited.add (child))
        {
          /* ???  After auditing more code paths make a "default"
             and push the vector type from NODE to all children
             if it is not already set.  */
          /* Compute the number of vectors to be generated.  */
          tree vector_type = SLP_TREE_VECTYPE (child);
          if (!vector_type)
            {
              /* For shifts with a scalar argument we don't need
                 to cost or code-generate anything.
                 ???  Represent this more explicitly.  */
              gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
                           == shift_vec_info_type)
                          && j == 1);
              continue;
            }
          unsigned group_size = SLP_TREE_SCALAR_OPS (child).length ();
          poly_uint64 vf = 1;
          if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
            vf = loop_vinfo->vectorization_factor;
          SLP_TREE_NUMBER_OF_VEC_STMTS (child)
            = vect_get_num_vectors (vf * group_size, vector_type);
          /* And cost them.  */
          vect_prologue_cost_for_slp (child, cost_vec);
        }

  /* If this node can't be vectorized, try pruning the tree here rather
     than felling the whole thing.  */
  if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
    {
      /* We'll need to revisit this for invariant costing and number
         of vectorized stmt setting.  */
      lvisited.remove (node);
      res = true;
    }

  return res;
}
/* Analyze statements in SLP instances of VINFO.  Return true if the
   operations are supported.  */

bool
vect_slp_analyze_operations (vec_info *vinfo)
{
  slp_instance instance;
  int i;

  DUMP_VECT_SCOPE ("vect_slp_analyze_operations");

  hash_set<slp_tree> visited;
  for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
    {
      hash_set<slp_tree> lvisited;
      stmt_vector_for_cost cost_vec;
      cost_vec.create (2);
      if (!vect_slp_analyze_node_operations (vinfo,
                                             SLP_INSTANCE_TREE (instance),
                                             instance, visited, lvisited,
                                             &cost_vec)
          /* Instances with a root stmt require vectorized defs for the
             SLP tree root.  */
          || (SLP_INSTANCE_ROOT_STMT (instance)
              && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
                  != vect_internal_def)))
        {
          slp_tree node = SLP_INSTANCE_TREE (instance);
          stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "removing SLP instance operations starting from: %G",
                             stmt_info->stmt);
          vect_free_slp_instance (instance, false);
          vinfo->slp_instances.ordered_remove (i);
          cost_vec.release ();
          continue;
        }

      for (hash_set<slp_tree>::iterator x = lvisited.begin();
           x != lvisited.end(); ++x)
        visited.add (*x);
      i++;

      add_stmt_costs (vinfo, vinfo->target_cost_data, &cost_vec);
      cost_vec.release ();
    }

  return !vinfo->slp_instances.is_empty ();
}
/* Compute the scalar cost of the SLP node NODE and its children
   and record it in COST_VEC.  Do not account defs that are marked
   in LIFE and update LIFE according to uses of NODE.  */

static void
vect_bb_slp_scalar_cost (vec_info *vinfo,
                         slp_tree node, vec<bool, va_heap> *life,
                         stmt_vector_for_cost *cost_vec,
                         hash_set<slp_tree> &visited)
{
  unsigned i;
  stmt_vec_info stmt_info;
  slp_tree child;

  if (visited.add (node))
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    {
      gimple *stmt = stmt_info->stmt;
      ssa_op_iter op_iter;
      def_operand_p def_p;

      if ((*life)[i])
        continue;

      /* If there is a non-vectorized use of the defs then the scalar
         stmt is kept live in which case we do not account it or any
         required defs in the SLP children in the scalar cost.  This
         way we make the vectorization more costly when compared to
         the scalar cost.  */
      FOR_EACH_SSA_DEF_OPERAND (def_p, stmt, op_iter, SSA_OP_DEF)
        {
          imm_use_iterator use_iter;
          gimple *use_stmt;
          FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
            if (!is_gimple_debug (use_stmt))
              {
                stmt_vec_info use_stmt_info = vinfo->lookup_stmt (use_stmt);
                if (!use_stmt_info || !PURE_SLP_STMT (use_stmt_info))
                  {
                    (*life)[i] = true;
                    BREAK_FROM_IMM_USE_STMT (use_iter);
                  }
              }
        }
      if ((*life)[i])
        continue;

      /* Count scalar stmts only once.  */
      if (gimple_visited_p (stmt))
        continue;
      gimple_set_visited (stmt, true);

      vect_cost_for_stmt kind;
      if (STMT_VINFO_DATA_REF (stmt_info))
        {
          if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
            kind = scalar_load;
          else
            kind = scalar_store;
        }
      else if (vect_nop_conversion_p (stmt_info))
        continue;
      else
        kind = scalar_stmt;
      record_stmt_cost (cost_vec, 1, kind, stmt_info, 0, vect_body);
    }

  auto_vec<bool, 20> subtree_life;
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    {
      if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
        {
          /* Do not directly pass LIFE to the recursive call, copy it to
             confine changes in the callee to the current child/subtree.  */
          if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
            {
              subtree_life.safe_grow_cleared (SLP_TREE_LANES (child));
              for (unsigned j = 0;
                   j < SLP_TREE_LANE_PERMUTATION (node).length (); ++j)
                {
                  auto perm = SLP_TREE_LANE_PERMUTATION (node)[j];
                  if (perm.first == i)
                    subtree_life[perm.second] = (*life)[j];
                }
            }
          else
            {
              gcc_assert (SLP_TREE_LANES (node) == SLP_TREE_LANES (child));
              subtree_life.safe_splice (*life);
            }
          vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
                                   visited);
          subtree_life.truncate (0);
        }
    }
}
/* Check if vectorization of the basic block is profitable.  */

static bool
vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo)
{
  vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
  slp_instance instance;
  int i;
  unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
  unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;

  /* Calculate scalar cost.  */
  stmt_vector_for_cost scalar_costs;
  scalar_costs.create (0);
  hash_set<slp_tree> visited;
  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      auto_vec<bool, 20> life;
      life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)));
      vect_bb_slp_scalar_cost (bb_vinfo,
                               SLP_INSTANCE_TREE (instance),
                               &life, &scalar_costs, visited);
    }
  /* Unset visited flag.  */
  stmt_info_for_cost *si;
  FOR_EACH_VEC_ELT (scalar_costs, i, si)
    gimple_set_visited (si->stmt_info->stmt, false);

  void *target_cost_data = init_cost (NULL);
  add_stmt_costs (bb_vinfo, target_cost_data, &scalar_costs);
  scalar_costs.release ();
  unsigned dummy;
  finish_cost (target_cost_data, &dummy, &scalar_cost, &dummy);
  destroy_cost_data (target_cost_data);

  /* Complete the target-specific cost calculation.  */
  finish_cost (BB_VINFO_TARGET_COST_DATA (bb_vinfo), &vec_prologue_cost,
               &vec_inside_cost, &vec_epilogue_cost);

  vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
      dump_printf (MSG_NOTE, "  Vector inside of basic block cost: %d\n",
                   vec_inside_cost);
      dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n", vec_prologue_cost);
      dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n", vec_epilogue_cost);
      dump_printf (MSG_NOTE, "  Scalar cost of basic block: %d\n", scalar_cost);
    }

  /* Vectorization is profitable if its cost is less than or equal to the
     cost of the scalar version.  Note that we err on the vector side for
     equal cost because the cost estimate is otherwise quite pessimistic
     (constant uses are free on the scalar side but cost a load on the
     vector side for example).  */
  if (vec_outside_cost + vec_inside_cost > scalar_cost)
    return false;

  return true;
}
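
/* For example, with a vector inside cost of 8, prologue cost of 2,
   epilogue cost of 0 and a scalar cost of 10, 8 + 2 <= 10 and the
   region is vectorized; the tie goes to the vector side as explained
   above.  */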
/* Find any vectorizable constructors and add them to the grouped_store
   array.  */

static void
vect_slp_check_for_constructors (bb_vec_info bb_vinfo)
{
  for (gimple *stmt : bb_vinfo->region_stmts ())
    {
      gassign *assign = dyn_cast<gassign *> (stmt);
      if (!assign || gimple_assign_rhs_code (assign) != CONSTRUCTOR)
        continue;

      tree rhs = gimple_assign_rhs1 (assign);
      if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
          || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
                       CONSTRUCTOR_NELTS (rhs))
          || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
          || uniform_vector_p (rhs))
        continue;

      stmt_vec_info stmt_info = bb_vinfo->lookup_stmt (assign);
      BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
    }
}
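
/* For example (a sketch), this matches a CONSTRUCTOR assignment such as

     v4si v = { a, b, c, d };

   with as many elements as vector subparts; uniform constructors like
   { a, a, a, a } are skipped, as are constructors whose elements are
   themselves vectors.  */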
/* Check if the region described by BB_VINFO can be vectorized, returning
   true if so.  When returning false, set FATAL to true if the same failure
   would prevent vectorization at other vector sizes, false if it is still
   worth trying other sizes.  N_STMTS is the number of statements in the
   region.  */

static bool
vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal)
{
  DUMP_VECT_SCOPE ("vect_slp_analyze_bb");

  slp_instance instance;
  int i;
  poly_uint64 min_vf = 2;

  /* The first group of checks is independent of the vector size.  */
  fatal = true;

  /* Analyze the data references.  */

  if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: unhandled data-ref in basic "
                         "block.\n");
      return false;
    }

  if (!vect_analyze_data_ref_accesses (bb_vinfo))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: unhandled data access in "
                         "basic block.\n");
      return false;
    }

  vect_slp_check_for_constructors (bb_vinfo);

  /* If there are no grouped stores and no constructors in the region
     there is no need to continue with pattern recog as vect_analyze_slp
     will fail anyway.  */
  if (bb_vinfo->grouped_stores.is_empty ())
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: no grouped stores in "
                         "basic block.\n");
      return false;
    }

  /* While the rest of the analysis below depends on it in some way.  */
  fatal = false;

  vect_pattern_recog (bb_vinfo);

  /* Check the SLP opportunities in the basic block, analyze and build SLP
     trees.  */
  if (!vect_analyze_slp (bb_vinfo, n_stmts))
    {
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "Failed to SLP the basic block.\n");
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "not vectorized: failed to find SLP opportunities "
                           "in basic block.\n");
        }
      return false;
    }

  /* Optimize permutations.  */
  vect_optimize_slp (bb_vinfo);

  vect_record_base_alignments (bb_vinfo);

  /* Analyze and verify the alignment of data references and the
     dependence in the SLP instances.  */
  for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
    {
      if (! vect_slp_analyze_and_verify_instance_alignment (bb_vinfo, instance)
          || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
        {
          slp_tree node = SLP_INSTANCE_TREE (instance);
          stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "removing SLP instance operations starting from: %G",
                             stmt_info->stmt);
          vect_free_slp_instance (instance, false);
          BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
          continue;
        }

      /* Mark all the statements that we want to vectorize as pure SLP and
         relevant.  */
      vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
      vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
      if (SLP_INSTANCE_ROOT_STMT (instance))
        STMT_SLP_TYPE (SLP_INSTANCE_ROOT_STMT (instance)) = pure_slp;

      i++;
    }
  if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
    return false;

  if (!vect_slp_analyze_operations (bb_vinfo))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: bad operation in basic block.\n");
      return false;
    }

  /* Cost model: check if the vectorization is worthwhile.  */
  if (!unlimited_cost_model (NULL)
      && !vect_bb_vectorization_profitable_p (bb_vinfo))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: vectorization is not "
                         "profitable.\n");
      return false;
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "Basic block will be vectorized using SLP\n");
  return true;
}
/* Subroutine of vect_slp_bb.  Try to vectorize the statements between
   REGION_BEGIN (inclusive) and REGION_END (exclusive), returning true
   on success.  The region has N_STMTS statements and has the datarefs
   given by DATAREFS.  */

static bool
vect_slp_bb_region (gimple_stmt_iterator region_begin,
                    gimple_stmt_iterator region_end,
                    vec<data_reference_p> datarefs,
                    unsigned int n_stmts)
{
  bb_vec_info bb_vinfo;
  auto_vector_modes vector_modes;

  /* Autodetect first vector size we try.  */
  machine_mode next_vector_mode = VOIDmode;
  targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
  unsigned int mode_i = 0;

  vec_info_shared shared;

  machine_mode autodetected_vector_mode = VOIDmode;
  while (1)
    {
      bool vectorized = false;
      bool fatal = false;
      bb_vinfo = new _bb_vec_info (region_begin, region_end, &shared);

      bool first_time_p = shared.datarefs.is_empty ();
      BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
      if (first_time_p)
        bb_vinfo->shared->save_datarefs ();
      else
        bb_vinfo->shared->check_datarefs ();
      bb_vinfo->vector_mode = next_vector_mode;

      if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal)
          && dbg_cnt (vect_slp))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_NOTE, vect_location,
                               "***** Analysis succeeded with vector mode"
                               " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
              dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
            }

          bb_vinfo->shared->check_datarefs ();
          vect_schedule_slp (bb_vinfo);

          unsigned HOST_WIDE_INT bytes;
          if (dump_enabled_p ())
            {
              if (GET_MODE_SIZE (bb_vinfo->vector_mode).is_constant (&bytes))
                dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
                                 "basic block part vectorized using %wu byte "
                                 "vectors\n", bytes);
              else
                dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
                                 "basic block part vectorized using variable "
                                 "length vectors\n");
            }

          vectorized = true;
        }
      else
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "***** Analysis failed with vector mode %s\n",
                             GET_MODE_NAME (bb_vinfo->vector_mode));
        }

      if (mode_i == 0)
        autodetected_vector_mode = bb_vinfo->vector_mode;

      while (mode_i < vector_modes.length ()
             && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "***** The result for vector mode %s would"
                             " be the same\n",
                             GET_MODE_NAME (vector_modes[mode_i]));
          mode_i += 1;
        }

      delete bb_vinfo;

      if (mode_i < vector_modes.length ()
          && VECTOR_MODE_P (autodetected_vector_mode)
          && (related_vector_mode (vector_modes[mode_i],
                                   GET_MODE_INNER (autodetected_vector_mode))
              == autodetected_vector_mode)
          && (related_vector_mode (autodetected_vector_mode,
                                   GET_MODE_INNER (vector_modes[mode_i]))
              == vector_modes[mode_i]))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "***** Skipping vector mode %s, which would"
                             " repeat the analysis for %s\n",
                             GET_MODE_NAME (vector_modes[mode_i]),
                             GET_MODE_NAME (autodetected_vector_mode));
          mode_i += 1;
        }

      if (vectorized
          || mode_i == vector_modes.length ()
          || autodetected_vector_mode == VOIDmode
          /* If vect_slp_analyze_bb_1 signaled that analysis for all
             vector sizes will fail do not bother iterating.  */
          || fatal)
        return vectorized;

      /* Try the next biggest vector size.  */
      next_vector_mode = vector_modes[mode_i++];
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "***** Re-trying analysis with vector mode %s\n",
                         GET_MODE_NAME (next_vector_mode));
    }
}
/* Main entry for the BB vectorizer.  Analyze and transform BB, returns
   true if anything in the basic-block was vectorized.  */

bool
vect_slp_bb (basic_block bb)
{
  gimple_stmt_iterator gsi;
  bool any_vectorized = false;

  gsi = gsi_after_labels (bb);
  while (!gsi_end_p (gsi))
    {
      gimple_stmt_iterator region_begin = gsi;
      vec<data_reference_p> datarefs = vNULL;
      int insns = 0;

      for (; !gsi_end_p (gsi); gsi_next (&gsi))
        {
          gimple *stmt = gsi_stmt (gsi);
          if (is_gimple_debug (stmt))
            {
              /* Skip leading debug stmts.  */
              if (gsi_stmt (region_begin) == stmt)
                gsi_next (&region_begin);
              continue;
            }
          insns++;

          if (gimple_location (stmt) != UNKNOWN_LOCATION)
            vect_location = stmt;

          if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs))
            break;
        }
      if (gsi_end_p (region_begin))
        break;

      /* Skip leading unhandled stmts.  */
      if (gsi_stmt (region_begin) == gsi_stmt (gsi))
        {
          gsi_next (&gsi);
          continue;
        }

      gimple_stmt_iterator region_end = gsi;

      if (insns > param_slp_max_insns_in_bb)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "not vectorized: too many instructions in "
                             "basic block.\n");
        }
      else if (vect_slp_bb_region (region_begin, region_end, datarefs, insns))
        any_vectorized = true;

      if (gsi_end_p (region_end))
        break;

      /* Skip the unhandled stmt.  */
      gsi_next (&gsi);
    }

  return any_vectorized;
}
/* Build a variable-length vector in which the elements in ELTS are repeated
   to fill NRESULTS vectors of type VECTOR_TYPE.  Store the vectors in
   RESULTS and add any new instructions to SEQ.

   The approach we use is:

   (1) Find a vector mode VM with integer elements of mode IM.

   (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
       ELTS' has mode IM.  This involves creating NELTS' VIEW_CONVERT_EXPRs
       from small vectors to IM.

   (3) Duplicate each ELTS'[I] into a vector of mode VM.

   (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
       correct byte contents.

   (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.

   We try to find the largest IM for which this sequence works, in order
   to cut down on the number of interleaves.  */

void
duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
                          vec<tree> elts, unsigned int nresults,
                          vec<tree> &results)
{
  unsigned int nelts = elts.length ();
  tree element_type = TREE_TYPE (vector_type);

  /* (1) Find a vector mode VM with integer elements of mode IM.  */
  unsigned int nvectors = 1;
  tree new_vector_type;
  tree permutes[2];
  if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
                                       &nvectors, &new_vector_type,
                                       permutes))
    gcc_unreachable ();

  /* Get a vector type that holds ELTS[0:NELTS/NELTS'].  */
  unsigned int partial_nelts = nelts / nvectors;
  tree partial_vector_type = build_vector_type (element_type, partial_nelts);

  tree_vector_builder partial_elts;
  auto_vec<tree, 32> pieces (nvectors * 2);
  pieces.quick_grow (nvectors * 2);
  for (unsigned int i = 0; i < nvectors; ++i)
    {
      /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
             ELTS' has mode IM.  */
      partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
      for (unsigned int j = 0; j < partial_nelts; ++j)
        partial_elts.quick_push (elts[i * partial_nelts + j]);
      tree t = gimple_build_vector (seq, &partial_elts);
      t = gimple_build (seq, VIEW_CONVERT_EXPR,
                        TREE_TYPE (new_vector_type), t);

      /* (3) Duplicate each ELTS'[I] into a vector of mode VM.  */
      pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
    }

  /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
         correct byte contents.

     We need to repeat the following operation log2(nvectors) times:

        out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
        out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);

     However, if each input repeats every N elements and the VF is
     a multiple of N * 2, the HI result is the same as the LO.  */
  unsigned int in_start = 0;
  unsigned int out_start = nvectors;
  unsigned int hi_start = nvectors / 2;
  /* A bound on the number of outputs needed to produce NRESULTS results
     in the final iteration.  */
  unsigned int noutputs_bound = nvectors * nresults;
  for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
    {
      noutputs_bound /= 2;
      unsigned int limit = MIN (noutputs_bound, nvectors);
      for (unsigned int i = 0; i < limit; ++i)
        {
          if ((i & 1) != 0
              && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
                             2 * in_repeat))
            {
              pieces[out_start + i] = pieces[out_start + i - 1];
              continue;
            }

          tree output = make_ssa_name (new_vector_type);
          tree input1 = pieces[in_start + (i / 2)];
          tree input2 = pieces[in_start + (i / 2) + hi_start];
          gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
                                               input1, input2,
                                               permutes[i & 1]);
          gimple_seq_add_stmt (seq, stmt);
          pieces[out_start + i] = output;
        }
      std::swap (in_start, out_start);
    }

  /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type.  */
  results.reserve (nresults);
  for (unsigned int i = 0; i < nresults; ++i)
    if (i < nvectors)
      results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
                                        pieces[in_start + i]));
    else
      results.quick_push (results[i - nvectors]);
}
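
/* A worked example: to fill variable-length vectors of int with
   ELTS == { A, B, C, D } when IM can hold two ints (NVECTORS == 2),
   steps (2)-(3) build the wide duplicates { AB, AB, ... } and
   { CD, CD, ... }; the single interleave of step (4) then yields
   { AB, CD, AB, CD, ... }, which step (5) view-converts back to
   { A, B, C, D, A, B, C, D, ... }.  */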
/* For constant and loop invariant defs in OP_NODE this function creates
   vector defs that will be used in the vectorized stmts and stores them
   to SLP_TREE_VEC_DEFS of OP_NODE.  */

static void
vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
{
  unsigned HOST_WIDE_INT nunits;
  tree vec_cst;
  unsigned j, number_of_places_left_in_vector;
  tree vector_type;
  tree vop;
  int group_size = op_node->ops.length ();
  unsigned int vec_num, i;
  unsigned number_of_copies = 1;
  bool constant_p;
  gimple_seq ctor_seq = NULL;
  auto_vec<tree, 16> permute_results;

  /* We always want SLP_TREE_VECTYPE (op_node) here correctly set.  */
  vector_type = SLP_TREE_VECTYPE (op_node);

  unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node);
  SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
  auto_vec<tree> voprnds (number_of_vectors);

  /* NUMBER_OF_COPIES is the number of times we need to use the same values in
     created vectors.  It is greater than 1 if unrolling is performed.

     For example, we have two scalar operands, s1 and s2 (e.g., group of
     strided accesses of size two), while NUNITS is four (i.e., four scalars
     of this type can be packed in a vector).  The output vector will contain
     two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
     will be 2).

     If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
     containing the operands.

     For example, NUNITS is four as before, and the group size is 8
     (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
     {s5, s6, s7, s8}.  */

  /* When using duplicate_and_interleave, we just need one element for
     each scalar statement.  */
  if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
    nunits = group_size;

  number_of_copies = nunits * number_of_vectors / group_size;

  number_of_places_left_in_vector = nunits;
  constant_p = true;
  tree_vector_builder elts (vector_type, nunits, 1);
  elts.quick_grow (nunits);
  stmt_vec_info insert_after = NULL;
  for (j = 0; j < number_of_copies; j++)
    {
      tree op;
      for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
        {
          /* Create 'vect_ = {op0,op1,...,opn}'.  */
          number_of_places_left_in_vector--;
          tree orig_op = op;
          if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
            {
              if (CONSTANT_CLASS_P (op))
                {
                  if (VECTOR_BOOLEAN_TYPE_P (vector_type))
                    {
                      /* Can't use VIEW_CONVERT_EXPR for booleans because
                         of possibly different sizes of scalar value and
                         vector element.  */
                      if (integer_zerop (op))
                        op = build_int_cst (TREE_TYPE (vector_type), 0);
                      else if (integer_onep (op))
                        op = build_all_ones_cst (TREE_TYPE (vector_type));
                      else
                        gcc_unreachable ();
                    }
                  else
                    op = fold_unary (VIEW_CONVERT_EXPR,
                                     TREE_TYPE (vector_type), op);
                  gcc_assert (op && CONSTANT_CLASS_P (op));
                }
              else
                {
                  tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
                  gimple *init_stmt;
                  if (VECTOR_BOOLEAN_TYPE_P (vector_type))
                    {
                      tree true_val
                        = build_all_ones_cst (TREE_TYPE (vector_type));
                      tree false_val
                        = build_zero_cst (TREE_TYPE (vector_type));
                      gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
                      init_stmt = gimple_build_assign (new_temp, COND_EXPR,
                                                       op, true_val,
                                                       false_val);
                    }
                  else
                    {
                      op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
                                   op);
                      init_stmt
                        = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
                                               op);
                    }
                  gimple_seq_add_stmt (&ctor_seq, init_stmt);
                  op = new_temp;
                }
            }
          elts[number_of_places_left_in_vector] = op;
          if (!CONSTANT_CLASS_P (op))
            constant_p = false;
          /* For BB vectorization we have to compute an insert location
             when a def is inside the analyzed region since we cannot
             simply insert at the BB start in this case.  */
          stmt_vec_info opdef;
          if (TREE_CODE (orig_op) == SSA_NAME
              && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
              && is_a <bb_vec_info> (vinfo)
              && (opdef = vinfo->lookup_def (orig_op)))
            {
              if (!insert_after)
                insert_after = opdef;
              else
                insert_after = get_later_stmt (insert_after, opdef);
            }

          if (number_of_places_left_in_vector == 0)
            {
              if (constant_p
                  ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
                  : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
                vec_cst = gimple_build_vector (&ctor_seq, &elts);
              else
                {
                  if (permute_results.is_empty ())
                    duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
                                              elts, number_of_vectors,
                                              permute_results);
                  vec_cst = permute_results[number_of_vectors - j - 1];
                }
              tree init;
              if (insert_after)
                {
                  gimple_stmt_iterator gsi = gsi_for_stmt (insert_after->stmt);
                  /* vect_init_vector inserts before.  */
                  gsi_next (&gsi);
                  init = vect_init_vector (vinfo, NULL, vec_cst,
                                           vector_type, &gsi);
                }
              else
                init = vect_init_vector (vinfo, NULL, vec_cst,
                                         vector_type, NULL);
              if (ctor_seq != NULL)
                {
                  gimple_stmt_iterator gsi
                    = gsi_for_stmt (SSA_NAME_DEF_STMT (init));
                  gsi_insert_seq_before (&gsi, ctor_seq, GSI_SAME_STMT);
                  ctor_seq = NULL;
                }
              voprnds.quick_push (init);
              insert_after = NULL;
              number_of_places_left_in_vector = nunits;
              constant_p = true;
              elts.new_vector (vector_type, nunits, 1);
              elts.quick_grow (nunits);
            }
        }
    }

  /* Since the vectors are created in the reverse order, we should invert
     them.  */
  vec_num = voprnds.length ();
  for (j = vec_num; j != 0; j--)
    {
      vop = voprnds[j - 1];
      SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
    }

  /* In case that VF is greater than the unrolling factor needed for the SLP
     group of stmts, NUMBER_OF_VECTORS to be created is greater than
     NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
     to replicate the vectors.  */
  while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ())
    for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (i, &vop) && i < vec_num;
         i++)
      SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
}
/* Get the Ith vectorized definition from SLP_NODE.  */

tree
vect_get_slp_vect_def (slp_tree slp_node, unsigned i)
{
  if (SLP_TREE_VEC_STMTS (slp_node).exists ())
    return gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[i]);
  else
    return SLP_TREE_VEC_DEFS (slp_node)[i];
}

/* Get the vectorized definitions of SLP_NODE in *VEC_DEFS.  */

void
vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs)
{
  vec_defs->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
  if (SLP_TREE_DEF_TYPE (slp_node) == vect_internal_def)
    {
      unsigned j;
      gimple *vec_def_stmt;
      FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), j, vec_def_stmt)
        vec_defs->quick_push (gimple_get_lhs (vec_def_stmt));
    }
  else
    vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node));
}
/* Get N vectorized definitions for SLP_NODE.  */

void
vect_get_slp_defs (vec_info *,
                   slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
{
  if (n == -1U)
    n = SLP_TREE_CHILDREN (slp_node).length ();

  for (unsigned i = 0; i < n; ++i)
    {
      slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
      vec<tree> vec_defs = vNULL;
      vect_get_slp_defs (child, &vec_defs);
      vec_oprnds->quick_push (vec_defs);
    }
}
3797 /* Generate vector permute statements from a list of loads in DR_CHAIN.
3798 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
3799 permute statements for the SLP node NODE. */
3802 vect_transform_slp_perm_load (vec_info
*vinfo
,
3803 slp_tree node
, vec
<tree
> dr_chain
,
3804 gimple_stmt_iterator
*gsi
, poly_uint64 vf
,
3805 bool analyze_only
, unsigned *n_perms
)
3807 stmt_vec_info stmt_info
= SLP_TREE_SCALAR_STMTS (node
)[0];
3809 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
3810 unsigned int group_size
= SLP_TREE_SCALAR_STMTS (node
).length ();
3811 unsigned int mask_element
;
3814 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info
))
3817 stmt_info
= DR_GROUP_FIRST_ELEMENT (stmt_info
);
3819 mode
= TYPE_MODE (vectype
);
3820 poly_uint64 nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
3822 /* Initialize the vect stmts of NODE to properly insert the generated
3825 for (unsigned i
= SLP_TREE_VEC_STMTS (node
).length ();
3826 i
< SLP_TREE_NUMBER_OF_VEC_STMTS (node
); i
++)
3827 SLP_TREE_VEC_STMTS (node
).quick_push (NULL
);
3829 /* Generate permutation masks for every NODE. Number of masks for each NODE
3830 is equal to GROUP_SIZE.
3831 E.g., we have a group of three nodes with three loads from the same
3832 location in each node, and the vector size is 4. I.e., we have a
3833 a0b0c0a1b1c1... sequence and we need to create the following vectors:
3834 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
3835 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
3838 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
3839 The last mask is illegal since we assume two operands for permute
3840 operation, and the mask element values can't be outside that range.
3841 Hence, the last mask must be converted into {2,5,5,5}.
3842 For the first two permutations we need the first and the second input
3843 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
3844 we need the second and the third vectors: {b1,c1,a2,b2} and
3847 int vect_stmts_counter
= 0;
3848 unsigned int index
= 0;
3849 int first_vec_index
= -1;
3850 int second_vec_index
= -1;
3854 vec_perm_builder mask
;
3855 unsigned int nelts_to_build
;
3856 unsigned int nvectors_per_build
;
3857 bool repeating_p
= (group_size
== DR_GROUP_SIZE (stmt_info
)
3858 && multiple_p (nunits
, group_size
));
3861 /* A single vector contains a whole number of copies of the node, so:
3862 (a) all permutes can use the same mask; and
3863 (b) the permutes only need a single vector input. */
3864 mask
.new_vector (nunits
, group_size
, 3);
3865 nelts_to_build
= mask
.encoded_nelts ();
3866 nvectors_per_build
= SLP_TREE_VEC_STMTS (node
).length ();
3870 /* We need to construct a separate mask for each vector statement. */
3871 unsigned HOST_WIDE_INT const_nunits
, const_vf
;
3872 if (!nunits
.is_constant (&const_nunits
)
3873 || !vf
.is_constant (&const_vf
))
3875 mask
.new_vector (const_nunits
, const_nunits
, 1);
3876 nelts_to_build
= const_vf
* group_size
;
3877 nvectors_per_build
= 1;
3880 unsigned int count
= mask
.encoded_nelts ();
3881 mask
.quick_grow (count
);
3882 vec_perm_indices indices
;
3884 for (unsigned int j
= 0; j
< nelts_to_build
; j
++)
3886 unsigned int iter_num
= j
/ group_size
;
3887 unsigned int stmt_num
= j
% group_size
;
3888 unsigned int i
= (iter_num
* DR_GROUP_SIZE (stmt_info
)
3889 + SLP_TREE_LOAD_PERMUTATION (node
)[stmt_num
]);
3892 first_vec_index
= 0;
3897 /* Enforced before the loop when !repeating_p. */
3898 unsigned int const_nunits
= nunits
.to_constant ();
3899 vec_index
= i
/ const_nunits
;
3900 mask_element
= i
% const_nunits
;
3901 if (vec_index
== first_vec_index
3902 || first_vec_index
== -1)
3904 first_vec_index
= vec_index
;
3906 else if (vec_index
== second_vec_index
3907 || second_vec_index
== -1)
3909 second_vec_index
= vec_index
;
3910 mask_element
+= const_nunits
;
3914 if (dump_enabled_p ())
3915 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3916 "permutation requires at "
3917 "least three vectors %G",
3919 gcc_assert (analyze_only
);
3923 gcc_assert (mask_element
< 2 * const_nunits
);
3926 if (mask_element
!= index
)
3928 mask
[index
++] = mask_element
;
3930 if (index
== count
&& !noop_p
)
3932 indices
.new_vector (mask
, second_vec_index
== -1 ? 1 : 2, nunits
);
3933 if (!can_vec_perm_const_p (mode
, indices
))
3935 if (dump_enabled_p ())
3937 dump_printf_loc (MSG_MISSED_OPTIMIZATION
,
3939 "unsupported vect permute { ");
3940 for (i
= 0; i
< count
; ++i
)
3942 dump_dec (MSG_MISSED_OPTIMIZATION
, mask
[i
]);
3943 dump_printf (MSG_MISSED_OPTIMIZATION
, " ");
3945 dump_printf (MSG_MISSED_OPTIMIZATION
, "}\n");
3947 gcc_assert (analyze_only
);
3958 tree mask_vec
= NULL_TREE
;
3961 mask_vec
= vect_gen_perm_mask_checked (vectype
, indices
);
3963 if (second_vec_index
== -1)
3964 second_vec_index
= first_vec_index
;

	      for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
		{
		  /* Generate the permute statement if necessary.  */
		  tree first_vec = dr_chain[first_vec_index + ri];
		  tree second_vec = dr_chain[second_vec_index + ri];
		  gimple *perm_stmt;
		  if (!noop_p)
		    {
		      gassign *stmt = as_a <gassign *> (stmt_info->stmt);
		      tree perm_dest
			= vect_create_destination_var (gimple_assign_lhs (stmt),
						       vectype);
		      perm_dest = make_ssa_name (perm_dest);
		      perm_stmt
			= gimple_build_assign (perm_dest, VEC_PERM_EXPR,
					       first_vec, second_vec,
					       mask_vec);
		      vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
						   gsi);
		    }
		  else
		    /* If mask was NULL_TREE generate the requested
		       identity transform.  */
		    perm_stmt = SSA_NAME_DEF_STMT (first_vec);

		  /* Store the vector statement in NODE.  */
		  SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++] = perm_stmt;
		}
	    }

	  index = 0;
	  first_vec_index = -1;
	  second_vec_index = -1;
	  noop_p = true;
	}
    }

  return true;
}
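/* Editorial sketch, not part of the original file: the two-vector
   mask-building scheme used by vect_transform_slp_perm_load above, reduced
   to plain arrays.  Flat element indices are split into a (vector, lane)
   pair, lanes selected from the second distinct input vector are biased by
   NUNITS, and a third distinct input vector makes the permutation
   unrepresentable.  All names below are made up for the illustration.
   E.g. the flat indices {6,9,9,9} from the comment above, with
   nunits == 4, yield the mask {2,5,5,5}.  */

static bool
slp_perm_sketch_build_mask (const unsigned *flat, unsigned n,
			    unsigned nunits, unsigned *mask_out)
{
  int first_vec = -1, second_vec = -1;
  for (unsigned j = 0; j < n; ++j)
    {
      int vec = flat[j] / nunits;
      unsigned elt = flat[j] % nunits;
      if (first_vec == -1 || vec == first_vec)
	first_vec = vec;
      else if (second_vec == -1 || vec == second_vec)
	{
	  /* Elements of the second input live at NUNITS..2*NUNITS-1 in
	     the concatenated operand of the two-input permute.  */
	  second_vec = vec;
	  elt += nunits;
	}
      else
	/* Three distinct input vectors; not representable as a single
	   two-operand permute.  */
	return false;
      mask_out[j] = elt;
    }
  return true;
}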
/* Vectorize the SLP permutations in NODE as specified
   in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
   child number and lane number.
   Interleaving of two two-lane two-child SLP subtrees (not supported):
     [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
   A blend of two four-lane two-child SLP subtrees:
     [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
   Highpart of a four-lane one-child SLP subtree (not supported):
     [ { 0, 2 }, { 0, 3 } ]
   Currently only a subset of this is supported by the code generation
   below.  */
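/* Editorial note (ours, not from the original sources): with V4 children
   the supported blend above turns into a single two-input VEC_PERM_EXPR.
   Lanes from child 0 keep their lane number and lanes from child 1 get
   const_nunits (4) added, so
     [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
   yields the permute mask { 0, 5, 2, 7 } in the code below.  */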
static bool
vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
			      slp_tree node, stmt_vector_for_cost *cost_vec)
{
  tree vectype = SLP_TREE_VECTYPE (node);

  /* ??? We currently only support all same vector input and output types
     while the SLP IL should really do a concat + select and thus accept
     arbitrary mismatches.  */
  slp_tree child;
  unsigned i;
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (!types_compatible_p (SLP_TREE_VECTYPE (child), vectype))
      {
	if (dump_enabled_p ())
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			   "Unsupported lane permutation\n");
	return false;
      }

  vec<std::pair<unsigned, unsigned> > &perm = SLP_TREE_LANE_PERMUTATION (node);
  gcc_assert (perm.length () == SLP_TREE_LANES (node));
  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location,
		       "vectorizing permutation");
      for (unsigned i = 0; i < perm.length (); ++i)
	dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
      dump_printf (MSG_NOTE, "\n");
    }

  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
  if (!nunits.is_constant ())
    return false;
  unsigned HOST_WIDE_INT vf = 1;
  if (loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo))
    if (!LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&vf))
      return false;
  unsigned olanes = vf * SLP_TREE_LANES (node);
  gcc_assert (multiple_p (olanes, nunits));

  /* Compute the { { SLP operand, vector index }, lane } permutation sequence
     from the { SLP operand, scalar lane } permutation as recorded in the
     SLP node as an intermediate step.  This part should already work
     with SLP children with an arbitrary number of lanes.  */
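  /* Editorial example (ours): for the four-lane blend above with vf == 1
     and V4 children, every scalar lane maps to vector index 0 of its
     operand, so vperm becomes
       { { 0, 0 }, 0 } { { 1, 0 }, 1 } { { 0, 0 }, 2 } { { 1, 0 }, 3 }
     i.e. vi = (active_lane + lane) / vnunits is 0 and vl is the scalar
     lane itself.  */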
  auto_vec<std::pair<std::pair<unsigned, unsigned>, unsigned> > vperm;
  auto_vec<unsigned> active_lane;
  vperm.create (olanes);
  active_lane.safe_grow_cleared (SLP_TREE_CHILDREN (node).length ());
  for (unsigned i = 0; i < vf; ++i)
    {
      for (unsigned pi = 0; pi < perm.length (); ++pi)
	{
	  std::pair<unsigned, unsigned> p = perm[pi];
	  tree vtype = SLP_TREE_VECTYPE (SLP_TREE_CHILDREN (node)[p.first]);
	  unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype).to_constant ();
	  unsigned vi = (active_lane[p.first] + p.second) / vnunits;
	  unsigned vl = (active_lane[p.first] + p.second) % vnunits;
	  vperm.quick_push (std::make_pair (std::make_pair (p.first, vi), vl));
	}
      /* Advance to the next group.  */
      for (unsigned j = 0; j < SLP_TREE_CHILDREN (node).length (); ++j)
	active_lane[j] += SLP_TREE_LANES (SLP_TREE_CHILDREN (node)[j]);
    }

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "as");
      for (unsigned i = 0; i < vperm.length (); ++i)
	{
	  if (i != 0 && multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype)))
	    dump_printf (MSG_NOTE, ",");
	  dump_printf (MSG_NOTE, " vops%u[%u][%u]",
		       vperm[i].first.first, vperm[i].first.second,
		       vperm[i].second);
	}
      dump_printf (MSG_NOTE, "\n");
    }

  /* We can only handle two-vector permutes, everything else should
     be lowered on the SLP level.  The following is closely inspired
     by vect_transform_slp_perm_load and is supposed to eventually
     replace it.
     ??? As an intermediate step, do code-gen in the SLP tree
     representation somehow?  */
  std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U);
  std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U);
  unsigned int const_nunits = nunits.to_constant ();
  unsigned int index = 0;
  unsigned int mask_element;
  vec_perm_builder mask;
  mask.new_vector (const_nunits, const_nunits, 1);
  unsigned int count = mask.encoded_nelts ();
  mask.quick_grow (count);
  vec_perm_indices indices;
  unsigned nperms = 0;
  for (unsigned i = 0; i < vperm.length (); ++i)
    {
      mask_element = vperm[i].second;
      if (first_vec.first == -1U
	  || first_vec == vperm[i].first)
	first_vec = vperm[i].first;
      else if (second_vec.first == -1U
	       || second_vec == vperm[i].first)
	{
	  second_vec = vperm[i].first;
	  mask_element += const_nunits;
	}
      else
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "permutation requires at "
			     "least three vectors");
	  gcc_assert (!gsi);
	  return false;
	}

      mask[index++] = mask_element;

      if (index == count)
	{
	  indices.new_vector (mask, second_vec.first == -1U ? 1 : 2,
			      const_nunits);
	  if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
	    {
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION,
				   vect_location,
				   "unsupported vect permute { ");
		  for (i = 0; i < count; ++i)
		    {
		      dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
		      dump_printf (MSG_MISSED_OPTIMIZATION, " ");
		    }
		  dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
		}
	      gcc_assert (!gsi);
	      return false;
	    }

	  nperms++;
	  if (gsi)
	    {
	      tree mask_vec = vect_gen_perm_mask_checked (vectype, indices);

	      if (second_vec.first == -1U)
		second_vec = first_vec;

	      /* Generate the permute statement if necessary.  */
	      slp_tree first_node = SLP_TREE_CHILDREN (node)[first_vec.first];
	      tree first_def
		= vect_get_slp_vect_def (first_node, first_vec.second);
	      slp_tree second_node = SLP_TREE_CHILDREN (node)[second_vec.first];
	      tree second_def
		= vect_get_slp_vect_def (second_node, second_vec.second);
	      tree perm_dest = make_ssa_name (vectype);
	      gassign *perm_stmt
		= gimple_build_assign (perm_dest, VEC_PERM_EXPR,
				       first_def, second_def,
				       mask_vec);
	      vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
	      /* Store the vector statement in NODE.  */
	      SLP_TREE_VEC_STMTS (node).quick_push (perm_stmt);
	    }

	  index = 0;
	  first_vec = std::make_pair (-1U, -1U);
	  second_vec = std::make_pair (-1U, -1U);
	}
    }

  if (!gsi)
    record_stmt_cost (cost_vec, nperms, vec_perm, NULL, vectype, 0, vect_body);

  return true;
}

/* Vectorize SLP instance tree in postorder.  */

static void
vect_schedule_slp_instance (vec_info *vinfo,
			    slp_tree node, slp_instance instance)
{
  gimple_stmt_iterator si;
  int i;
  slp_tree child;

  /* See if we have already vectorized the node in the graph of the
     SLP instance.  */
  if ((SLP_TREE_DEF_TYPE (node) == vect_internal_def
       && SLP_TREE_VEC_STMTS (node).exists ())
      || SLP_TREE_VEC_DEFS (node).exists ())
    return;

  /* Vectorize externals and constants.  */
  if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
      || SLP_TREE_DEF_TYPE (node) == vect_external_def)
    {
      /* ??? vectorizable_shift can end up using a scalar operand which is
	 currently denoted as !SLP_TREE_VECTYPE.  No need to vectorize the
	 node in this case.  */
      if (!SLP_TREE_VECTYPE (node))
	return;

      vect_create_constant_vectors (vinfo, node);
      return;
    }

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_schedule_slp_instance (vinfo, child, instance);

  gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
  SLP_TREE_VEC_STMTS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));

  stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "------>vectorizing SLP node starting from: %G",
		     stmt_info->stmt);

  if (STMT_VINFO_DATA_REF (stmt_info)
      && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
    {
      /* Vectorized loads go before the first scalar load to make it
	 ready early, vectorized stores go before the last scalar
	 stmt which is where all uses are ready.  */
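      /* Editorial example (ours): for a group of scalar loads L1 L2 L3 L4
	 the vector load is emitted at L1, making the value available as
	 early as possible; for a store group it is emitted at the last
	 scalar store, where every stored value is known to be ready.  */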
      stmt_vec_info last_stmt_info = NULL;
      if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
	last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
      else /* DR_IS_WRITE */
	last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
      si = gsi_for_stmt (last_stmt_info->stmt);
    }
  else if (SLP_TREE_CHILDREN (node).is_empty ())
    {
      /* This happens for reduction and induction PHIs where we do not use the
	 insertion iterator.  */
      gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
		  == cycle_phi_info_type
		  || (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
		      == induc_vec_info_type));
    }
  else
    {
      /* Emit other stmts after the children vectorized defs which is
	 earliest possible.  */
      gimple *last_stmt = NULL;
      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
	if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
	  {
	    /* For fold-left reductions we are retaining the scalar
	       reduction PHI but we still have SLP_TREE_NUM_VEC_STMTS
	       set so the representation isn't perfect.  Resort to the
	       last scalar def here.  */
	    if (SLP_TREE_VEC_STMTS (child).is_empty ())
	      {
		gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child))
			    == cycle_phi_info_type);
		gphi *phi = as_a <gphi *>
			      (vect_find_last_scalar_stmt_in_slp (child)->stmt);
		if (!last_stmt
		    || vect_stmt_dominates_stmt_p (last_stmt, phi))
		  last_stmt = phi;
	      }
	    /* We are emitting all vectorized stmts in the same place and
	       the last one is the last.
	       ??? Unless we have a load permutation applied and that
	       figures to re-use an earlier generated load.  */
	    unsigned j;
	    gimple *vstmt;
	    FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (child), j, vstmt)
	      if (!last_stmt
		  || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
		last_stmt = vstmt;
	  }
	else if (!SLP_TREE_VECTYPE (child))
	  {
	    /* For externals that remain unvectorized, look at all
	       scalar defs.  */
	    unsigned j;
	    tree def;
	    FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child), j, def)
	      if (TREE_CODE (def) == SSA_NAME
		  && !SSA_NAME_IS_DEFAULT_DEF (def))
		{
		  gimple *stmt = SSA_NAME_DEF_STMT (def);
		  if (!last_stmt
		      || vect_stmt_dominates_stmt_p (last_stmt, stmt))
		    last_stmt = stmt;
		}
	  }
	else
	  {
	    /* For externals we have to look at all defs since their
	       insertion place is decided per vector.  */
	    unsigned j;
	    tree vdef;
	    FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
	      if (TREE_CODE (vdef) == SSA_NAME)
		{
		  gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
		  if (!last_stmt
		      || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
		    last_stmt = vstmt;
		}
	  }
      if (is_a <gphi *> (last_stmt))
	si = gsi_after_labels (gimple_bb (last_stmt));
      else
	{
	  si = gsi_for_stmt (last_stmt);
	  gsi_next (&si);
	}
    }

  bool done_p = false;

  /* Handle purely internal nodes.  */
  if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
    {
      /* ??? The transform kind is stored to STMT_VINFO_TYPE which might
	 be shared with different SLP nodes (but usually it's the same
	 operation apart from the case the stmt is only there for denoting
	 the actual scalar lane defs ...).  So do not call vect_transform_stmt
	 but open-code it here (partly).  */
      bool done = vectorizable_slp_permutation (vinfo, &si, node, NULL);
      gcc_assert (done);
      done_p = true;
    }
  if (!done_p)
    vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
}

/* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
   For loop vectorization this is done in vectorizable_call, but for SLP
   it needs to be deferred until the end of vect_schedule_slp, because
   multiple SLP instances may refer to the same scalar stmt.  */
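/* Editorial example (ours, not from the original sources): a fully
   SLP-vectorized call such as
     x_1 = powf (a_2, b_3);
   gets its lhs reset to
     x_1 = 0.0f;
   so the now-dead scalar call no longer keeps its operands live; the
   trivial replacement is cleaned up by DCE later.  */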
static void
vect_remove_slp_scalar_calls (vec_info *vinfo,
			      slp_tree node, hash_set<slp_tree> &visited)
{
  gimple *new_stmt;
  gimple_stmt_iterator gsi;
  int i;
  slp_tree child;
  tree lhs;
  stmt_vec_info stmt_info;

  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return;

  if (visited.add (node))
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_remove_slp_scalar_calls (vinfo, child, visited);

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    {
      gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
      if (!stmt || gimple_bb (stmt) == NULL)
	continue;
      if (is_pattern_stmt_p (stmt_info)
	  || !PURE_SLP_STMT (stmt_info))
	continue;
      lhs = gimple_call_lhs (stmt);
      new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
      gsi = gsi_for_stmt (stmt);
      vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
      SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
    }
}

static void
vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
{
  hash_set<slp_tree> visited;
  vect_remove_slp_scalar_calls (vinfo, node, visited);
}

/* Vectorize the instance root.  */
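/* Editorial note (ours): for basic-block SLP the instance root is the
   vector CONSTRUCTOR the group was discovered from, e.g.
     x = {a_0, a_1, a_2, a_3};
   With a single vector stmt producing v_1 the root simply becomes
     x = v_1;
   (through a VIEW_CONVERT_EXPR if the types differ), and with several
   vector stmts a CONSTRUCTOR of their lhs values is built instead.  */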

void
vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
{
  gassign *rstmt = NULL;

  if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1)
    {
      gimple *child_stmt;
      int j;

      FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt)
	{
	  tree vect_lhs = gimple_get_lhs (child_stmt);
	  tree root_lhs = gimple_get_lhs (instance->root_stmt->stmt);
	  if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
					  TREE_TYPE (vect_lhs)))
	    vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
			       vect_lhs);
	  rstmt = gimple_build_assign (root_lhs, vect_lhs);
	  break;
	}
    }
  else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1)
    {
      int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
      gimple *child_stmt;
      int j;
      vec<constructor_elt, va_gc> *v;
      vec_alloc (v, nelts);

      FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt)
	CONSTRUCTOR_APPEND_ELT (v,
				NULL_TREE,
				gimple_get_lhs (child_stmt));
      tree lhs = gimple_get_lhs (instance->root_stmt->stmt);
      tree rtype = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmt->stmt));
      tree r_constructor = build_constructor (rtype, v);
      rstmt = gimple_build_assign (lhs, r_constructor);
    }

  gcc_assert (rstmt);

  gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmt->stmt);
  gsi_replace (&rgsi, rstmt, true);
}

/* Generate vector code for all SLP instances in the loop/basic block.  */

void
vect_schedule_slp (vec_info *vinfo)
{
  vec<slp_instance> slp_instances;
  slp_instance instance;
  unsigned int i;

  slp_instances = vinfo->slp_instances;
  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      slp_tree node = SLP_INSTANCE_TREE (instance);
      /* Schedule the tree of INSTANCE.  */
      vect_schedule_slp_instance (vinfo, node, instance);

      if (SLP_INSTANCE_ROOT_STMT (instance))
	vectorize_slp_instance_root_stmt (node, instance);

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "vectorizing stmts using SLP.\n");
    }

  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      slp_tree root = SLP_INSTANCE_TREE (instance);
      stmt_vec_info store_info;
      unsigned int j;

      /* Remove scalar call stmts.  Do not do this for basic-block
	 vectorization as not all uses may be vectorized.
	 ??? Why should this be necessary?  DCE should be able to
	 remove the stmts itself.
	 ??? For BB vectorization we can as well remove scalar
	 stmts starting from the SLP tree root if they have no
	 uses.  */
      if (is_a <loop_vec_info> (vinfo))
	vect_remove_slp_scalar_calls (vinfo, root);

      for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
	{
	  if (!STMT_VINFO_DATA_REF (store_info))
	    break;

	  if (SLP_INSTANCE_ROOT_STMT (instance))
	    continue;

	  store_info = vect_orig_stmt (store_info);
	  /* Free the attached stmt_vec_info and remove the stmt.  */
	  vinfo->remove_stmt (store_info);