/* SLP - Basic Block Vectorization
   Copyright (C) 2007-2014 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
#include "coretypes.h"
#include "stor-layout.h"
#include "hard-reg-set.h"
#include "basic-block.h"
#include "gimple-pretty-print.h"
#include "tree-ssa-alias.h"
#include "internal-fn.h"
#include "gimple-expr.h"
#include "gimple-iterator.h"
#include "gimple-ssa.h"
#include "tree-phinodes.h"
#include "ssa-iterators.h"
#include "stringpool.h"
#include "tree-ssanames.h"
#include "tree-pass.h"
#include "recog.h"		/* FIXME: for insn_data */
#include "tree-vectorizer.h"
#include "langhooks.h"
/* Extract the location of the basic block in the source code.
   Return the basic block location if it succeeds and UNKNOWN_LOCATION
   otherwise.  */

static source_location
find_bb_location (basic_block bb)
{
  gimple stmt = NULL;
  gimple_stmt_iterator si;

  if (!bb)
    return UNKNOWN_LOCATION;

  for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
    {
      stmt = gsi_stmt (si);
      if (gimple_location (stmt) != UNKNOWN_LOCATION)
        return gimple_location (stmt);
    }

  return UNKNOWN_LOCATION;
}
/* Recursively free the memory allocated for the SLP tree rooted at NODE.  */

static void
vect_free_slp_tree (slp_tree node)
{
  int i;
  slp_tree child;

  if (!node)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_free_slp_tree (child);

  SLP_TREE_CHILDREN (node).release ();
  SLP_TREE_SCALAR_STMTS (node).release ();
  SLP_TREE_VEC_STMTS (node).release ();
  SLP_TREE_LOAD_PERMUTATION (node).release ();

  free (node);
}
/* Free the memory allocated for the SLP instance.  */

void
vect_free_slp_instance (slp_instance instance)
{
  vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
  SLP_INSTANCE_LOADS (instance).release ();
  SLP_INSTANCE_BODY_COST_VEC (instance).release ();
  free (instance);
}
/* Create an SLP node for SCALAR_STMTS.  */

static slp_tree
vect_create_new_slp_node (vec<gimple> scalar_stmts)
{
  slp_tree node;
  gimple stmt = scalar_stmts[0];
  unsigned int nops;

  if (is_gimple_call (stmt))
    nops = gimple_call_num_args (stmt);
  else if (is_gimple_assign (stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
        nops++;
    }
  else
    return NULL;

  node = XNEW (struct _slp_tree);
  SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
  SLP_TREE_VEC_STMTS (node).create (0);
  SLP_TREE_CHILDREN (node).create (nops);
  SLP_TREE_LOAD_PERMUTATION (node) = vNULL;

  return node;
}
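/* A small illustrative example (array names are made up): the four
   isomorphic stores

       a[0] = b[0] + c[0];
       a[1] = b[1] + c[1];
       a[2] = b[2] + c[2];
       a[3] = b[3] + c[3];

   are collected into one vec<gimple> of scalar stmts and handed to
   vect_create_new_slp_node to form the root of an SLP tree.  Each
   stmt has nops == 2 operands, whose def-stmts later become the
   node's children.  */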
/* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
   operand.  */
static vec<slp_oprnd_info>
vect_create_oprnd_info (int nops, int group_size)
{
  int i;
  slp_oprnd_info oprnd_info;
  vec<slp_oprnd_info> oprnds_info;

  oprnds_info.create (nops);
  for (i = 0; i < nops; i++)
    {
      oprnd_info = XNEW (struct _slp_oprnd_info);
      oprnd_info->def_stmts.create (group_size);
      oprnd_info->first_dt = vect_uninitialized_def;
      oprnd_info->first_op_type = NULL_TREE;
      oprnd_info->first_pattern = false;
      oprnds_info.quick_push (oprnd_info);
    }

  return oprnds_info;
}
/* Free operands info.  */

static void
vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
{
  int i;
  slp_oprnd_info oprnd_info;

  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      oprnd_info->def_stmts.release ();
      XDELETE (oprnd_info);
    }

  oprnds_info.release ();
}
/* Find the place of the data-ref in STMT in the interleaving chain that starts
   from FIRST_STMT.  Return -1 if the data-ref is not a part of the chain.  */

static int
vect_get_place_in_interleaving_chain (gimple stmt, gimple first_stmt)
{
  gimple next_stmt = first_stmt;
  int result = 0;

  if (first_stmt != GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    return -1;

  do
    {
      if (next_stmt == stmt)
        return result;
      result++;
      next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
    }
  while (next_stmt);

  return -1;
}
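/* For example (illustrative), if FIRST_STMT heads the interleaving
   chain

       first_stmt:  x0 = a[0];    place 0
                    x1 = a[1];    place 1
                    x2 = a[2];    place 2

   then calling this function on the stmt "x1 = a[1]" returns 1, while
   a load belonging to a different chain yields -1.  */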
/* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
   they are of a valid type and that they match the defs of the first stmt of
   the SLP group (stored in OPRNDS_INFO).  If there was a fatal error
   return -1, if the error could be corrected by swapping operands of the
   operation return 1, if everything is ok return 0.  */

static int
vect_get_and_check_slp_defs (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                             gimple stmt, bool first,
                             vec<slp_oprnd_info> *oprnds_info)
{
  tree oprnd;
  unsigned int i, number_of_oprnds;
  tree def;
  gimple def_stmt;
  enum vect_def_type dt = vect_uninitialized_def;
  struct loop *loop = NULL;
  bool pattern = false;
  slp_oprnd_info oprnd_info;
  int first_op_idx = 1;
  bool commutative = false;
  bool first_op_cond = false;

  if (loop_vinfo)
    loop = LOOP_VINFO_LOOP (loop_vinfo);

  if (is_gimple_call (stmt))
    {
      number_of_oprnds = gimple_call_num_args (stmt);
      first_op_idx = 3;
    }
  else if (is_gimple_assign (stmt))
    {
      enum tree_code code = gimple_assign_rhs_code (stmt);
      number_of_oprnds = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
        {
          first_op_cond = true;
          commutative = true;
          number_of_oprnds++;
        }
      else
        commutative = commutative_tree_code (code);
    }
  else
    return -1;

  bool swapped = false;
  for (i = 0; i < number_of_oprnds; i++)
    {
again:
      if (first_op_cond)
        {
          if (i == 0 || i == 1)
            oprnd = TREE_OPERAND (gimple_op (stmt, first_op_idx),
                                  swapped ? !i : i);
          else
            oprnd = gimple_op (stmt, first_op_idx + i - 1);
        }
      else
        oprnd = gimple_op (stmt, first_op_idx + (swapped ? !i : i));

      oprnd_info = (*oprnds_info)[i];

      if (!vect_is_simple_use (oprnd, NULL, loop_vinfo, bb_vinfo, &def_stmt,
                               &def, &dt)
          || (!def_stmt && dt != vect_constant_def))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: can't find def for ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }

          return -1;
        }

      /* Check if DEF_STMT is a part of a pattern in LOOP and get the def stmt
         from the pattern.  Check that all the stmts of the node are in the
         same pattern.  */
      if (def_stmt && gimple_bb (def_stmt)
          && ((loop && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)))
              || (!loop && gimple_bb (def_stmt) == BB_VINFO_BB (bb_vinfo)
                  && gimple_code (def_stmt) != GIMPLE_PHI))
          && vinfo_for_stmt (def_stmt)
          && STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (def_stmt))
          && !STMT_VINFO_RELEVANT (vinfo_for_stmt (def_stmt))
          && !STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt)))
        {
          pattern = true;
          if (!first && !oprnd_info->first_pattern)
            {
              if (i == 0
                  && !swapped
                  && commutative)
                {
                  swapped = true;
                  goto again;
                }

              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: some of the stmts"
                                   " are in a pattern, and others are not ");
                  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }

              return 1;
            }

          def_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (def_stmt));
          dt = STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt));

          if (dt == vect_unknown_def_type)
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Unsupported pattern.\n");
              return -1;
            }

          switch (gimple_code (def_stmt))
            {
            case GIMPLE_PHI:
              def = gimple_phi_result (def_stmt);
              break;

            case GIMPLE_ASSIGN:
              def = gimple_assign_lhs (def_stmt);
              break;

            default:
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "unsupported defining stmt:\n");
              return -1;
            }
        }

      if (first)
        {
          oprnd_info->first_dt = dt;
          oprnd_info->first_pattern = pattern;
          oprnd_info->first_op_type = TREE_TYPE (oprnd);
        }
      else
        {
          /* Not first stmt of the group, check that the def-stmt/s match
             the def-stmt/s of the first stmt.  Allow different definition
             types for reduction chains: the first stmt must be a
             vect_reduction_def (a phi node), and the rest
             vect_internal_def.  */
          if (((oprnd_info->first_dt != dt
                && !(oprnd_info->first_dt == vect_reduction_def
                     && dt == vect_internal_def)
                && !((oprnd_info->first_dt == vect_external_def
                      || oprnd_info->first_dt == vect_constant_def)
                     && (dt == vect_external_def
                         || dt == vect_constant_def)))
               || !types_compatible_p (oprnd_info->first_op_type,
                                       TREE_TYPE (oprnd))))
            {
              /* Try swapping operands if we got a mismatch.  */
              if (i == 0
                  && !swapped
                  && commutative)
                {
                  swapped = true;
                  goto again;
                }

              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: different types\n");

              return 1;
            }
        }

      /* Check the types of the definitions.  */
      switch (dt)
        {
        case vect_constant_def:
        case vect_external_def:
        case vect_reduction_def:
          break;

        case vect_internal_def:
          oprnd_info->def_stmts.quick_push (def_stmt);
          break;

        default:
          /* FORNOW: Not supported.  */
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: illegal type of def ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, def);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }

          return -1;
        }
    }

  /* Swap operands.  */
  if (swapped)
    {
      if (first_op_cond)
        {
          tree cond = gimple_assign_rhs1 (stmt);
          swap_ssa_operands (stmt, &TREE_OPERAND (cond, 0),
                             &TREE_OPERAND (cond, 1));
          TREE_SET_CODE (cond, swap_tree_comparison (TREE_CODE (cond)));
        }
      else
        swap_ssa_operands (stmt, gimple_assign_rhs1_ptr (stmt),
                           gimple_assign_rhs2_ptr (stmt));
    }

  return 0;
}
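/* For example (illustrative), for the group

       x0 = a0 * b0;
       x1 = b1 * a1;

   the defs of the second stmt do not match the first one
   operand-by-operand, but because multiplication is commutative the
   operands of the mismatching stmt can be swapped: the swap is
   attempted via the "again" retry above, a correctable mismatch is
   signalled with return value 1 rather than the fatal -1, and the
   actual SSA operands are exchanged at the end.  */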
/* Verify if the scalar stmts STMTS are isomorphic, require data
   permutation or are of unsupported types of operation.  Return
   true if they are isomorphic, otherwise return false and indicate
   in *MATCHES which stmts are not isomorphic to the first one.  If
   MATCHES[0] is false then this indicates the comparison could not be
   carried out or the stmts will never be vectorized by SLP.  */

static bool
vect_build_slp_tree_1 (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                       vec<gimple> stmts, unsigned int group_size,
                       unsigned nops, unsigned int *max_nunits,
                       unsigned int vectorization_factor, bool *matches)
{
  unsigned int i;
  gimple stmt = stmts[0];
  enum tree_code first_stmt_code = ERROR_MARK, rhs_code = ERROR_MARK;
  enum tree_code first_cond_code = ERROR_MARK;
  tree lhs;
  bool need_same_oprnds = false;
  tree vectype, scalar_type, first_op1 = NULL_TREE;
  optab optab;
  int icode;
  enum machine_mode optab_op2_mode;
  enum machine_mode vec_mode;
  struct data_reference *first_dr;
  HOST_WIDE_INT dummy;
  gimple first_load = NULL, prev_first_load = NULL, old_first_load = NULL;
  tree cond;

  /* For every stmt in NODE find its def stmt/s.  */
  FOR_EACH_VEC_ELT (stmts, i, stmt)
    {
      matches[i] = false;

      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for ");
          dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
          dump_printf (MSG_NOTE, "\n");
        }

      /* Fail to vectorize statements marked as unvectorizable.  */
      if (!STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: unvectorizable statement ");
              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      lhs = gimple_get_lhs (stmt);
      if (lhs == NULL_TREE)
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: not GIMPLE_ASSIGN nor "
                               "GIMPLE_CALL ");
              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      if (is_gimple_assign (stmt)
          && gimple_assign_rhs_code (stmt) == COND_EXPR
          && (cond = gimple_assign_rhs1 (stmt))
          && !COMPARISON_CLASS_P (cond))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: condition is not "
                               "comparison ");
              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      scalar_type = vect_get_smallest_scalar_type (stmt, &dummy, &dummy);
      vectype = get_vectype_for_scalar_type (scalar_type);
      if (!vectype)
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: unsupported data-type ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                 scalar_type);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      /* In case of multiple types we need to detect the smallest type.  */
      if (*max_nunits < TYPE_VECTOR_SUBPARTS (vectype))
        {
          *max_nunits = TYPE_VECTOR_SUBPARTS (vectype);
          if (bb_vinfo)
            vectorization_factor = *max_nunits;
        }

      if (gcall *call_stmt = dyn_cast <gcall *> (stmt))
        {
          rhs_code = CALL_EXPR;
          if (gimple_call_internal_p (call_stmt)
              || gimple_call_tail_p (call_stmt)
              || gimple_call_noreturn_p (call_stmt)
              || !gimple_call_nothrow_p (call_stmt)
              || gimple_call_chain (call_stmt))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: unsupported call type ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                    call_stmt, 0);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }
        }
      else
        rhs_code = gimple_assign_rhs_code (stmt);

      /* Check the operation.  */
      if (i == 0)
        {
          first_stmt_code = rhs_code;

          /* Shift arguments should be equal in all the packed stmts for a
             vector shift with scalar shift operand.  */
          if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
              || rhs_code == LROTATE_EXPR
              || rhs_code == RROTATE_EXPR)
            {
              vec_mode = TYPE_MODE (vectype);

              /* First see if we have a vector/vector shift.  */
              optab = optab_for_tree_code (rhs_code, vectype,
                                           optab_vector);

              if (!optab
                  || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
                {
                  /* No vector/vector shift, try for a vector/scalar shift.  */
                  optab = optab_for_tree_code (rhs_code, vectype,
                                               optab_scalar);

                  if (!optab)
                    {
                      if (dump_enabled_p ())
                        dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                         vect_location,
                                         "Build SLP failed: no optab.\n");
                      /* Fatal mismatch.  */
                      matches[0] = false;
                      return false;
                    }
                  icode = (int) optab_handler (optab, vec_mode);
                  if (icode == CODE_FOR_nothing)
                    {
                      if (dump_enabled_p ())
                        dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                         vect_location,
                                         "Build SLP failed: "
                                         "op not supported by target.\n");
                      /* Fatal mismatch.  */
                      matches[0] = false;
                      return false;
                    }
                  optab_op2_mode = insn_data[icode].operand[2].mode;
                  if (!VECTOR_MODE_P (optab_op2_mode))
                    {
                      need_same_oprnds = true;
                      first_op1 = gimple_assign_rhs2 (stmt);
                    }
                }
            }
          else if (rhs_code == WIDEN_LSHIFT_EXPR)
            {
              need_same_oprnds = true;
              first_op1 = gimple_assign_rhs2 (stmt);
            }
        }
      else
        {
          if (first_stmt_code != rhs_code
              && (first_stmt_code != IMAGPART_EXPR
                  || rhs_code != REALPART_EXPR)
              && (first_stmt_code != REALPART_EXPR
                  || rhs_code != IMAGPART_EXPR)
              && !(STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
                   && (first_stmt_code == ARRAY_REF
                       || first_stmt_code == BIT_FIELD_REF
                       || first_stmt_code == INDIRECT_REF
                       || first_stmt_code == COMPONENT_REF
                       || first_stmt_code == MEM_REF)))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: different operation "
                                   "in stmt ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                    stmt, 0);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }
              /* Mismatch.  */
              continue;
            }

          if (need_same_oprnds
              && !operand_equal_p (first_op1, gimple_assign_rhs2 (stmt), 0))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: different shift "
                                   "arguments in ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                    stmt, 0);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }
              /* Mismatch.  */
              continue;
            }

          if (rhs_code == CALL_EXPR)
            {
              gimple first_stmt = stmts[0];
              if (gimple_call_num_args (stmt) != nops
                  || !operand_equal_p (gimple_call_fn (first_stmt),
                                       gimple_call_fn (stmt), 0)
                  || gimple_call_fntype (first_stmt)
                     != gimple_call_fntype (stmt))
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                       "Build SLP failed: different calls in ");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                        stmt, 0);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                    }
                  /* Mismatch.  */
                  continue;
                }
            }
        }

      /* Grouped store or load.  */
      if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt)))
        {
          if (REFERENCE_CLASS_P (lhs))
            {
              /* Store.  */
              ;
            }
          else
            {
              /* Load.  */
              unsigned unrolling_factor
                = least_common_multiple
                    (*max_nunits, group_size) / group_size;
              /* FORNOW: Check that there is no gap between the loads
                 and no gap between the groups when we need to load
                 multiple groups at once.
                 ??? We should enhance this to only disallow gaps
                 inside vectors.  */
              if ((unrolling_factor > 1
                   && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt
                   && GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
                  || (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) != stmt
                      && GROUP_GAP (vinfo_for_stmt (stmt)) != 1))
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                       "Build SLP failed: grouped "
                                       "loads have gaps ");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                        stmt, 0);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                    }
                  /* Fatal mismatch.  */
                  matches[0] = false;
                  return false;
                }

              /* Check that the size of interleaved loads group is not
                 greater than the SLP group size.  */
              unsigned ncopies
                = vectorization_factor / TYPE_VECTOR_SUBPARTS (vectype);
              if (loop_vinfo
                  && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt
                  && ((GROUP_SIZE (vinfo_for_stmt (stmt))
                       - GROUP_GAP (vinfo_for_stmt (stmt)))
                      > ncopies * group_size))
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                       "Build SLP failed: the number "
                                       "of interleaved loads is greater than "
                                       "the SLP group size ");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                        stmt, 0);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                    }
                  /* Fatal mismatch.  */
                  matches[0] = false;
                  return false;
                }

              old_first_load = first_load;
              first_load = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
              if (prev_first_load)
                {
                  /* Check that there are no loads from different interleaving
                     chains in the same node.  */
                  if (prev_first_load != first_load)
                    {
                      if (dump_enabled_p ())
                        {
                          dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                           vect_location,
                                           "Build SLP failed: different "
                                           "interleaving chains in one node ");
                          dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                            stmt, 0);
                          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                        }
                      /* Mismatch.  */
                      continue;
                    }
                }
              else
                prev_first_load = first_load;

              /* In some cases a group of loads is just the same load
                 repeated N times.  Only analyze its cost once.  */
              if (first_load == stmt && old_first_load != first_load)
                {
                  first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
                  if (vect_supportable_dr_alignment (first_dr, false)
                      == dr_unaligned_unsupported)
                    {
                      if (dump_enabled_p ())
                        {
                          dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                           vect_location,
                                           "Build SLP failed: unsupported "
                                           "unaligned load ");
                          dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                            stmt, 0);
                          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                        }
                      /* Fatal mismatch.  */
                      matches[0] = false;
                      return false;
                    }
                }
            }
        } /* Grouped access.  */
      else
        {
          if (TREE_CODE_CLASS (rhs_code) == tcc_reference)
            {
              /* Not grouped load.  */
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: not grouped load ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }

              /* FORNOW: Not grouped loads are not supported.  */
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }

          /* Not memory operation.  */
          if (TREE_CODE_CLASS (rhs_code) != tcc_binary
              && TREE_CODE_CLASS (rhs_code) != tcc_unary
              && rhs_code != COND_EXPR
              && rhs_code != CALL_EXPR)
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: operation");
                  dump_printf (MSG_MISSED_OPTIMIZATION, " unsupported ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }

          if (rhs_code == COND_EXPR)
            {
              tree cond_expr = gimple_assign_rhs1 (stmt);

              if (i == 0)
                first_cond_code = TREE_CODE (cond_expr);
              else if (first_cond_code != TREE_CODE (cond_expr))
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                       "Build SLP failed: different"
                                       " operation");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                        stmt, 0);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                    }
                  /* Mismatch.  */
                  continue;
                }
            }
        }

      matches[i] = true;
    }

  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      return false;

  return true;
}
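/* For example (illustrative), the two stmts

       x0 = a[0] + b[0];
       x1 = a[1] - b[1];

   are not isomorphic (PLUS_EXPR vs. MINUS_EXPR), so the second entry
   of *MATCHES stays false; whereas a REALPART_EXPR paired with an
   IMAGPART_EXPR of the same grouped access is explicitly allowed by
   the operation check above.  */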
/* Recursively build an SLP tree starting from NODE.
   Return false if the def-stmts are not isomorphic, require data
   permutation or are of unsupported types of operation; otherwise
   return true, having built the subtree rooted at *NODE.  */

static bool
vect_build_slp_tree (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                     slp_tree *node, unsigned int group_size,
                     unsigned int *max_nunits,
                     vec<slp_tree> *loads,
                     unsigned int vectorization_factor,
                     bool *matches, unsigned *npermutes, unsigned *tree_size,
                     unsigned max_tree_size)
{
  unsigned nops, i, this_npermutes = 0, this_tree_size = 0;
  gimple stmt;

  if (!matches)
    matches = XALLOCAVEC (bool, group_size);
  if (!npermutes)
    npermutes = &this_npermutes;

  stmt = SLP_TREE_SCALAR_STMTS (*node)[0];
  if (is_gimple_call (stmt))
    nops = gimple_call_num_args (stmt);
  else if (is_gimple_assign (stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
        nops++;
    }
  else
    return false;

  if (!vect_build_slp_tree_1 (loop_vinfo, bb_vinfo,
                              SLP_TREE_SCALAR_STMTS (*node), group_size, nops,
                              max_nunits, vectorization_factor, matches))
    return false;

  /* If the SLP node is a load, terminate the recursion.  */
  if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
      && DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
    {
      loads->safe_push (*node);
      return true;
    }

  /* Get at the operands, verifying they are compatible.  */
  vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
  slp_oprnd_info oprnd_info;
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (*node), i, stmt)
    switch (vect_get_and_check_slp_defs (loop_vinfo, bb_vinfo,
                                         stmt, (i == 0), &oprnds_info))
      {
      case 0:
        break;
      case -1:
        matches[0] = false;
        vect_free_oprnd_info (oprnds_info);
        return false;
      case 1:
        matches[i] = false;
        break;
      }
  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      {
        vect_free_oprnd_info (oprnds_info);
        return false;
      }

  stmt = SLP_TREE_SCALAR_STMTS (*node)[0];

  /* Create SLP_TREE nodes for the definition node/s.  */
  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      slp_tree child;
      unsigned old_nloads = loads->length ();
      unsigned old_max_nunits = *max_nunits;

      if (oprnd_info->first_dt != vect_internal_def)
        continue;

      if (++this_tree_size > max_tree_size)
        {
          vect_free_oprnd_info (oprnds_info);
          return false;
        }

      child = vect_create_new_slp_node (oprnd_info->def_stmts);
      if (!child)
        {
          vect_free_oprnd_info (oprnds_info);
          return false;
        }

      bool *matches = XALLOCAVEC (bool, group_size);
      if (vect_build_slp_tree (loop_vinfo, bb_vinfo, &child,
                               group_size, max_nunits, loads,
                               vectorization_factor, matches,
                               npermutes, &this_tree_size, max_tree_size))
        {
          oprnd_info->def_stmts = vNULL;
          SLP_TREE_CHILDREN (*node).quick_push (child);
          continue;
        }

      /* If the SLP build for operand zero failed and operand zero
         and one can be commutated try that for the scalar stmts
         that failed the match.  */
      if (i == 0
          /* A first scalar stmt mismatch signals a fatal mismatch.  */
          && matches[0]
          /* ??? For COND_EXPRs we can swap the comparison operands
             as well as the arms under some constraints.  */
          && nops == 2
          && oprnds_info[1]->first_dt == vect_internal_def
          && is_gimple_assign (stmt)
          && commutative_tree_code (gimple_assign_rhs_code (stmt))
          /* Do so only if the number of not successful permutes was not more
             than a cut-off as re-trying the recursive match on
             possibly each level of the tree would expose exponential
             behavior.  */
          && *npermutes < 4)
        {
          /* Roll back.  */
          *max_nunits = old_max_nunits;
          loads->truncate (old_nloads);
          /* Swap mismatched definition stmts.  */
          dump_printf_loc (MSG_NOTE, vect_location,
                           "Re-trying with swapped operands of stmts ");
          for (unsigned j = 0; j < group_size; ++j)
            if (!matches[j])
              {
                gimple tem = oprnds_info[0]->def_stmts[j];
                oprnds_info[0]->def_stmts[j] = oprnds_info[1]->def_stmts[j];
                oprnds_info[1]->def_stmts[j] = tem;
                dump_printf (MSG_NOTE, "%d ", j);
              }
          dump_printf (MSG_NOTE, "\n");
          /* And try again ... */
          if (vect_build_slp_tree (loop_vinfo, bb_vinfo, &child,
                                   group_size, max_nunits, loads,
                                   vectorization_factor,
                                   matches, npermutes, &this_tree_size,
                                   max_tree_size))
            {
              oprnd_info->def_stmts = vNULL;
              SLP_TREE_CHILDREN (*node).quick_push (child);
              continue;
            }

          ++*npermutes;
        }

      oprnd_info->def_stmts = vNULL;
      vect_free_slp_tree (child);
      vect_free_oprnd_info (oprnds_info);
      return false;
    }

  if (tree_size)
    *tree_size += this_tree_size;

  vect_free_oprnd_info (oprnds_info);
  return true;
}
/* Dump a slp tree NODE using flags specified in DUMP_KIND.  */

static void
vect_print_slp_tree (int dump_kind, slp_tree node)
{
  int i;
  gimple stmt;
  slp_tree child;

  if (!node)
    return;

  dump_printf (dump_kind, "node ");
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    {
      dump_printf (dump_kind, "\n\tstmt %d ", i);
      dump_gimple_stmt (dump_kind, TDF_SLIM, stmt, 0);
    }
  dump_printf (dump_kind, "\n");

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_print_slp_tree (dump_kind, child);
}
/* Mark the tree rooted at NODE with MARK (PURE_SLP or HYBRID).
   If MARK is HYBRID, it refers to a specific stmt in NODE (the stmt at index
   J).  Otherwise, MARK is PURE_SLP and J is -1, which indicates that all the
   stmts in NODE are to be marked.  */

static void
vect_mark_slp_stmts (slp_tree node, enum slp_vect_type mark, int j)
{
  int i;
  gimple stmt;
  slp_tree child;

  if (!node)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    if (j < 0 || i == j)
      STMT_SLP_TYPE (vinfo_for_stmt (stmt)) = mark;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_mark_slp_stmts (child, mark, j);
}
/* Mark the statements of the tree rooted at NODE as relevant (vect_used).  */

static void
vect_mark_slp_stmts_relevant (slp_tree node)
{
  int i;
  gimple stmt;
  stmt_vec_info stmt_info;
  slp_tree child;

  if (!node)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    {
      stmt_info = vinfo_for_stmt (stmt);
      gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
                  || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
      STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
    }

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_mark_slp_stmts_relevant (child);
}
/* Rearrange the statements of NODE according to PERMUTATION.  */

static void
vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size,
                          vec<unsigned> permutation)
{
  gimple stmt;
  vec<gimple> tmp_stmts;
  unsigned int i;
  slp_tree child;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_slp_rearrange_stmts (child, group_size, permutation);

  gcc_assert (group_size == SLP_TREE_SCALAR_STMTS (node).length ());
  tmp_stmts.create (group_size);
  tmp_stmts.quick_grow_cleared (group_size);

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    tmp_stmts[permutation[i]] = stmt;

  SLP_TREE_SCALAR_STMTS (node).release ();
  SLP_TREE_SCALAR_STMTS (node) = tmp_stmts;
}
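/* For example (illustrative), with GROUP_SIZE == 4 and PERMUTATION
   {2, 0, 3, 1} the scalar stmt at index i moves to slot
   permutation[i], so the stmts {s0, s1, s2, s3} are stored back as
   {s1, s3, s0, s2} in every node of the subtree.  */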
/* Check if the required load permutations in the SLP instance
   SLP_INSTN are supported.  */

static bool
vect_supported_load_permutation_p (slp_instance slp_instn)
{
  unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
  unsigned int i, j, k, next;
  sbitmap load_index;
  slp_tree node;
  gimple stmt, load, next_load, first_load;
  struct data_reference *dr;

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "Load permutation ");
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        if (node->load_permutation.exists ())
          FOR_EACH_VEC_ELT (node->load_permutation, j, next)
            dump_printf (MSG_NOTE, "%d ", next);
        else
          for (k = 0; k < group_size; ++k)
            dump_printf (MSG_NOTE, "%d ", k);
      dump_printf (MSG_NOTE, "\n");
    }

  /* In case of reduction every load permutation is allowed, since the order
     of the reduction statements is not important (as opposed to the case of
     grouped stores).  The only condition we need to check is that all the
     load nodes are of the same size and have the same permutation (and then
     rearrange all the nodes of the SLP instance according to this
     permutation).  */

  /* Check that all the load nodes are of the same size.  */
  /* ??? Can't we assert this? */
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    if (SLP_TREE_SCALAR_STMTS (node).length () != (unsigned) group_size)
      return false;

  node = SLP_INSTANCE_TREE (slp_instn);
  stmt = SLP_TREE_SCALAR_STMTS (node)[0];

  /* Reduction (there are no data-refs in the root).
     In reduction chain the order of the loads is important.  */
  if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))
      && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    {
      slp_tree load;
      unsigned int lidx;

      /* Compare all the permutation sequences to the first one.  We know
         that at least one load is permuted.  */
      node = SLP_INSTANCE_LOADS (slp_instn)[0];
      if (!node->load_permutation.exists ())
        return false;
      for (i = 1; SLP_INSTANCE_LOADS (slp_instn).iterate (i, &load); ++i)
        {
          if (!load->load_permutation.exists ())
            return false;
          FOR_EACH_VEC_ELT (load->load_permutation, j, lidx)
            if (lidx != node->load_permutation[j])
              return false;
        }

      /* Check that the loads in the first sequence are different and there
         are no gaps between them.  */
      load_index = sbitmap_alloc (group_size);
      bitmap_clear (load_index);
      FOR_EACH_VEC_ELT (node->load_permutation, i, lidx)
        {
          if (bitmap_bit_p (load_index, lidx))
            {
              sbitmap_free (load_index);
              return false;
            }
          bitmap_set_bit (load_index, lidx);
        }
      for (i = 0; i < group_size; i++)
        if (!bitmap_bit_p (load_index, i))
          {
            sbitmap_free (load_index);
            return false;
          }
      sbitmap_free (load_index);

      /* This permutation is valid for reduction.  Since the order of the
         statements in the nodes is not important unless they are memory
         accesses, we can rearrange the statements in all the nodes
         according to the order of the loads.  */
      vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
                                node->load_permutation);

      /* We are done, no actual permutations need to be generated.  */
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        SLP_TREE_LOAD_PERMUTATION (node).release ();
      return true;
    }

  /* In basic block vectorization we allow any subchain of an interleaving
     chain.
     FORNOW: not supported in loop SLP because of realignment complications.  */
  if (STMT_VINFO_BB_VINFO (vinfo_for_stmt (stmt)))
    {
      /* Check that for every node in the instance the loads
         form a subchain.  */
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        {
          next_load = NULL;
          FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load)
            {
              if (j != 0 && next_load != load)
                return false;
              next_load = GROUP_NEXT_ELEMENT (vinfo_for_stmt (load));
            }
        }

      /* Check that the alignment of the first load in every subchain, i.e.,
         the first statement in every load node, is supported.
         ??? This belongs in alignment checking.  */
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        {
          first_load = SLP_TREE_SCALAR_STMTS (node)[0];
          if (first_load != GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_load)))
            {
              dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_load));
              if (vect_supportable_dr_alignment (dr, false)
                  == dr_unaligned_unsupported)
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                       vect_location,
                                       "unsupported unaligned load ");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                        first_load, 0);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                    }
                  return false;
                }
            }
        }

      /* We are done, no actual permutations need to be generated.  */
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        SLP_TREE_LOAD_PERMUTATION (node).release ();
      return true;
    }

  /* FORNOW: the only supported permutation is 0..01..1.. of length equal to
     GROUP_SIZE and where each sequence of same drs is of GROUP_SIZE length as
     well (unless it's reduction).  */
  if (SLP_INSTANCE_LOADS (slp_instn).length () != group_size)
    return false;
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    if (!node->load_permutation.exists ())
      return false;

  load_index = sbitmap_alloc (group_size);
  bitmap_clear (load_index);
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    {
      unsigned int lidx = node->load_permutation[0];
      if (bitmap_bit_p (load_index, lidx))
        {
          sbitmap_free (load_index);
          return false;
        }
      bitmap_set_bit (load_index, lidx);
      FOR_EACH_VEC_ELT (node->load_permutation, j, k)
        if (k != lidx)
          {
            sbitmap_free (load_index);
            return false;
          }
    }
  for (i = 0; i < group_size; i++)
    if (!bitmap_bit_p (load_index, i))
      {
        sbitmap_free (load_index);
        return false;
      }
  sbitmap_free (load_index);

  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    if (node->load_permutation.exists ()
        && !vect_transform_slp_perm_load
              (node, vNULL, NULL,
               SLP_INSTANCE_UNROLLING_FACTOR (slp_instn), slp_instn, true))
      return false;

  return true;
}
/* Find the first load in the loop that belongs to INSTANCE.
   When loads are in several SLP nodes, there can be a case in which the first
   load does not appear in the first SLP node to be transformed, causing
   incorrect order of statements.  Since we generate all the loads together,
   they must be inserted before the first load of the SLP instance and not
   before the first load of the first node of the instance.  */

static gimple
vect_find_first_load_in_slp_instance (slp_instance instance)
{
  int i, j;
  slp_tree load_node;
  gimple first_load = NULL, load;

  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
    FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load)
      first_load = get_earlier_stmt (load, first_load);

  return first_load;
}
/* Find the last store in SLP INSTANCE.  */

static gimple
vect_find_last_store_in_slp_instance (slp_instance instance)
{
  int i;
  slp_tree node;
  gimple last_store = NULL, store;

  node = SLP_INSTANCE_TREE (instance);
  for (i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &store); i++)
    last_store = get_later_stmt (store, last_store);

  return last_store;
}
/* Compute the cost for the SLP node NODE in the SLP instance INSTANCE.  */

static void
vect_analyze_slp_cost_1 (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                         slp_instance instance, slp_tree node,
                         stmt_vector_for_cost *prologue_cost_vec,
                         unsigned ncopies_for_cost)
{
  stmt_vector_for_cost *body_cost_vec = &SLP_INSTANCE_BODY_COST_VEC (instance);
  unsigned i;
  slp_tree child;
  gimple stmt, s;
  stmt_vec_info stmt_info;
  tree lhs;
  unsigned group_size = SLP_INSTANCE_GROUP_SIZE (instance);

  /* Recurse down the SLP tree.  */
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_analyze_slp_cost_1 (loop_vinfo, bb_vinfo,
                             instance, child, prologue_cost_vec,
                             ncopies_for_cost);

  /* Look at the first scalar stmt to determine the cost.  */
  stmt = SLP_TREE_SCALAR_STMTS (node)[0];
  stmt_info = vinfo_for_stmt (stmt);
  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    {
      if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)))
        vect_model_store_cost (stmt_info, ncopies_for_cost, false,
                               vect_uninitialized_def,
                               node, prologue_cost_vec, body_cost_vec);
      else
        {
          gcc_checking_assert (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)));
          vect_model_load_cost (stmt_info, ncopies_for_cost, false,
                                node, prologue_cost_vec, body_cost_vec);
          /* If the load is permuted record the cost for the permutation.
             ??? Loads from multiple chains are let through here only
             for a single special case involving complex numbers where
             in the end no permutation is necessary.  */
          FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, s)
            if ((STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo_for_stmt (s))
                 == STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info))
                && vect_get_place_in_interleaving_chain
                     (s, STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info)) != (int) i)
              {
                record_stmt_cost (body_cost_vec, group_size, vec_perm,
                                  stmt_info, 0, vect_body);
                break;
              }
        }
    }
  else
    record_stmt_cost (body_cost_vec, ncopies_for_cost, vector_stmt,
                      stmt_info, 0, vect_body);

  /* Scan operands and account for prologue cost of constants/externals.
     ??? This over-estimates cost for multiple uses and should be
     re-engineered.  */
  lhs = gimple_get_lhs (stmt);
  for (i = 0; i < gimple_num_ops (stmt); ++i)
    {
      tree def, op = gimple_op (stmt, i);
      gimple def_stmt;
      enum vect_def_type dt;
      if (!op || op == lhs)
        continue;
      if (vect_is_simple_use (op, NULL, loop_vinfo, bb_vinfo,
                              &def_stmt, &def, &dt)
          && (dt == vect_constant_def || dt == vect_external_def))
        record_stmt_cost (prologue_cost_vec, 1, vector_stmt,
                          stmt_info, 0, vect_prologue);
    }
}
/* Compute the cost for the SLP instance INSTANCE.  */

static void
vect_analyze_slp_cost (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                       slp_instance instance, unsigned nunits)
{
  stmt_vector_for_cost body_cost_vec, prologue_cost_vec;
  unsigned ncopies_for_cost;
  stmt_info_for_cost *si;
  unsigned i;

  /* Calculate the number of vector stmts to create based on the unrolling
     factor (number of vectors is 1 if NUNITS >= GROUP_SIZE, and is
     GROUP_SIZE / NUNITS otherwise).  */
  unsigned group_size = SLP_INSTANCE_GROUP_SIZE (instance);
  ncopies_for_cost = least_common_multiple (nunits, group_size) / nunits;

  prologue_cost_vec.create (10);
  body_cost_vec.create (10);
  SLP_INSTANCE_BODY_COST_VEC (instance) = body_cost_vec;
  vect_analyze_slp_cost_1 (loop_vinfo, bb_vinfo,
                           instance, SLP_INSTANCE_TREE (instance),
                           &prologue_cost_vec, ncopies_for_cost);

  /* Record the prologue costs, which were delayed until we were
     sure that SLP was successful.  Unlike the body costs, we know
     the final values now regardless of the loop vectorization factor.  */
  void *data = (loop_vinfo ? LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
                : BB_VINFO_TARGET_COST_DATA (bb_vinfo));
  FOR_EACH_VEC_ELT (prologue_cost_vec, i, si)
    {
      struct _stmt_vec_info *stmt_info
        = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
      (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
                            si->misalign, vect_prologue);
    }

  prologue_cost_vec.release ();
}
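/* For example, with NUNITS == 4 and GROUP_SIZE == 6,
   ncopies_for_cost = least_common_multiple (4, 6) / 4 = 12 / 4 = 3:
   three vector stmts are needed to cover the unrolled group, and the
   body costs above are recorded that many times.  */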
/* Analyze an SLP instance starting from a group of grouped stores.  Call
   vect_build_slp_tree to build a tree of packed stmts if possible.
   Return FALSE if it's impossible to SLP any stmt in the loop.  */

static bool
vect_analyze_slp_instance (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                           gimple stmt, unsigned max_tree_size)
{
  slp_instance new_instance;
  slp_tree node;
  unsigned int group_size = GROUP_SIZE (vinfo_for_stmt (stmt));
  unsigned int unrolling_factor = 1, nunits;
  tree vectype, scalar_type = NULL_TREE;
  gimple next;
  unsigned int vectorization_factor = 0;
  int i;
  unsigned int max_nunits = 0;
  vec<slp_tree> loads;
  struct data_reference *dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
  vec<gimple> scalar_stmts;

  if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    {
      if (dr)
        {
          scalar_type = TREE_TYPE (DR_REF (dr));
          vectype = get_vectype_for_scalar_type (scalar_type);
        }
      else
        {
          gcc_assert (loop_vinfo);
          vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
        }

      group_size = GROUP_SIZE (vinfo_for_stmt (stmt));
    }
  else
    {
      gcc_assert (loop_vinfo);
      vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
      group_size = LOOP_VINFO_REDUCTIONS (loop_vinfo).length ();
    }

  if (!vectype)
    {
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "Build SLP failed: unsupported data-type ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, scalar_type);
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }

      return false;
    }

  nunits = TYPE_VECTOR_SUBPARTS (vectype);
  if (loop_vinfo)
    vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  else
    vectorization_factor = nunits;

  /* Calculate the unrolling factor.  */
  unrolling_factor = least_common_multiple (nunits, group_size) / group_size;
  if (unrolling_factor != 1 && !loop_vinfo)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "Build SLP failed: unrolling required in basic"
                         " block SLP\n");

      return false;
    }

  /* Create a node (a root of the SLP tree) for the packed grouped stores.  */
  scalar_stmts.create (group_size);
  next = stmt;
  if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    {
      /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS.  */
      while (next)
        {
          if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next))
              && STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next)))
            scalar_stmts.safe_push (
                  STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next)));
          else
            scalar_stmts.safe_push (next);
          next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
        }
    }
  else
    {
      /* Collect reduction statements.  */
      vec<gimple> reductions = LOOP_VINFO_REDUCTIONS (loop_vinfo);
      for (i = 0; reductions.iterate (i, &next); i++)
        scalar_stmts.safe_push (next);
    }

  node = vect_create_new_slp_node (scalar_stmts);

  loads.create (group_size);

  /* Build the tree for the SLP instance.  */
  if (vect_build_slp_tree (loop_vinfo, bb_vinfo, &node, group_size,
                           &max_nunits, &loads,
                           vectorization_factor, NULL, NULL, NULL,
                           max_tree_size))
    {
      /* Calculate the unrolling factor based on the smallest type.  */
      if (max_nunits > nunits)
        unrolling_factor = least_common_multiple (max_nunits, group_size)
                           / group_size;

      if (unrolling_factor != 1 && !loop_vinfo)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "Build SLP failed: unrolling required in basic"
                             " block SLP\n");
          vect_free_slp_tree (node);
          loads.release ();
          return false;
        }

      /* Create a new SLP instance.  */
      new_instance = XNEW (struct _slp_instance);
      SLP_INSTANCE_TREE (new_instance) = node;
      SLP_INSTANCE_GROUP_SIZE (new_instance) = group_size;
      SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
      SLP_INSTANCE_BODY_COST_VEC (new_instance) = vNULL;
      SLP_INSTANCE_LOADS (new_instance) = loads;
      SLP_INSTANCE_FIRST_LOAD_STMT (new_instance) = NULL;

      /* Compute the load permutation.  */
      slp_tree load_node;
      bool loads_permuted = false;
      FOR_EACH_VEC_ELT (loads, i, load_node)
        {
          vec<unsigned> load_permutation;
          int j;
          gimple load, first_stmt;
          bool this_load_permuted = false;
          load_permutation.create (group_size);
          first_stmt = GROUP_FIRST_ELEMENT
              (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (load_node)[0]));
          FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load)
            {
              int load_place
                = vect_get_place_in_interleaving_chain (load, first_stmt);
              gcc_assert (load_place != -1);
              if (load_place != j)
                this_load_permuted = true;
              load_permutation.safe_push (load_place);
            }
          if (!this_load_permuted)
            {
              load_permutation.release ();
              continue;
            }
          SLP_TREE_LOAD_PERMUTATION (load_node) = load_permutation;
          loads_permuted = true;
        }

      if (loads_permuted)
        {
          if (!vect_supported_load_permutation_p (new_instance))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: unsupported load "
                                   "permutation ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }
              vect_free_slp_instance (new_instance);
              return false;
            }

          SLP_INSTANCE_FIRST_LOAD_STMT (new_instance)
            = vect_find_first_load_in_slp_instance (new_instance);
        }

      /* Compute the costs of this SLP instance.  */
      vect_analyze_slp_cost (loop_vinfo, bb_vinfo,
                             new_instance, TYPE_VECTOR_SUBPARTS (vectype));

      if (loop_vinfo)
        LOOP_VINFO_SLP_INSTANCES (loop_vinfo).safe_push (new_instance);
      else
        BB_VINFO_SLP_INSTANCES (bb_vinfo).safe_push (new_instance);

      if (dump_enabled_p ())
        vect_print_slp_tree (MSG_NOTE, node);

      return true;
    }

  /* Failed to SLP.  */
  /* Free the allocated memory.  */
  vect_free_slp_tree (node);
  loads.release ();

  return false;
}
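/* For example, with NUNITS == 4 and GROUP_SIZE == 2 the unrolling
   factor computed above is least_common_multiple (4, 2) / 2 = 2: the
   group must be unrolled twice to fill a vector, which is fine in a
   loop but is rejected for basic block SLP.  */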
/* Check if there are stmts in the loop that can be vectorized using SLP.
   Build SLP trees of packed scalar stmts if SLP is possible.  */

bool
vect_analyze_slp (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                  unsigned max_tree_size)
{
  unsigned int i;
  vec<gimple> grouped_stores;
  vec<gimple> reductions = vNULL;
  vec<gimple> reduc_chains = vNULL;
  gimple first_element;
  bool ok = false;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "=== vect_analyze_slp ===\n");

  if (loop_vinfo)
    {
      grouped_stores = LOOP_VINFO_GROUPED_STORES (loop_vinfo);
      reduc_chains = LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo);
      reductions = LOOP_VINFO_REDUCTIONS (loop_vinfo);
    }
  else
    grouped_stores = BB_VINFO_GROUPED_STORES (bb_vinfo);

  /* Find SLP sequences starting from groups of grouped stores.  */
  FOR_EACH_VEC_ELT (grouped_stores, i, first_element)
    if (vect_analyze_slp_instance (loop_vinfo, bb_vinfo, first_element,
                                   max_tree_size))
      ok = true;

  if (bb_vinfo && !ok)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "Failed to SLP the basic block.\n");

      return false;
    }

  if (loop_vinfo
      && LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).length () > 0)
    {
      /* Find SLP sequences starting from reduction chains.  */
      FOR_EACH_VEC_ELT (reduc_chains, i, first_element)
        if (vect_analyze_slp_instance (loop_vinfo, bb_vinfo, first_element,
                                       max_tree_size))
          ok = true;
        else
          return false;

      /* Don't try to vectorize SLP reductions if reduction chain was
         detected.  */
      return ok;
    }

  /* Find SLP sequences starting from groups of reductions.  */
  if (loop_vinfo && LOOP_VINFO_REDUCTIONS (loop_vinfo).length () > 1
      && vect_analyze_slp_instance (loop_vinfo, bb_vinfo, reductions[0],
                                    max_tree_size))
    ok = true;

  return true;
}
/* For each possible SLP instance decide whether to SLP it and calculate overall
   unrolling factor needed to SLP the loop.  Return TRUE if decided to SLP at
   least one instance.  */

bool
vect_make_slp_decision (loop_vec_info loop_vinfo)
{
  unsigned int i, unrolling_factor = 1;
  vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
  slp_instance instance;
  int decided_to_slp = 0;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "=== vect_make_slp_decision ===\n");

  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      /* FORNOW: SLP if you can.  */
      if (unrolling_factor < SLP_INSTANCE_UNROLLING_FACTOR (instance))
        unrolling_factor = SLP_INSTANCE_UNROLLING_FACTOR (instance);

      /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts.  Later we
         call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
         loop-based vectorization.  Such stmts will be marked as HYBRID.  */
      vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1);
      decided_to_slp++;
    }

  LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;

  if (decided_to_slp && dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "Decided to SLP %d instances. Unrolling factor %d\n",
                     decided_to_slp, unrolling_factor);

  return (decided_to_slp > 0);
}
/* Find stmts that must be both vectorized and SLPed (since they feed stmts that
   can't be SLPed) in the tree rooted at NODE.  Mark such stmts as HYBRID.  */

static void
vect_detect_hybrid_slp_stmts (slp_tree node)
{
  int i;
  vec<gimple> stmts = SLP_TREE_SCALAR_STMTS (node);
  gimple stmt = stmts[0];
  imm_use_iterator imm_iter;
  gimple use_stmt;
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
  slp_tree child;
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
  struct loop *loop = NULL;
  bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_vinfo);
  basic_block bb = NULL;

  if (!node)
    return;

  if (loop_vinfo)
    loop = LOOP_VINFO_LOOP (loop_vinfo);
  else
    bb = BB_VINFO_BB (bb_vinfo);

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    if (PURE_SLP_STMT (vinfo_for_stmt (stmt))
        && TREE_CODE (gimple_op (stmt, 0)) == SSA_NAME)
      FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, gimple_op (stmt, 0))
        if (gimple_bb (use_stmt)
            && ((loop && flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
                || bb == gimple_bb (use_stmt))
            && (stmt_vinfo = vinfo_for_stmt (use_stmt))
            && !STMT_SLP_TYPE (stmt_vinfo)
            && (STMT_VINFO_RELEVANT (stmt_vinfo)
                || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_vinfo))
                || (STMT_VINFO_IN_PATTERN_P (stmt_vinfo)
                    && STMT_VINFO_RELATED_STMT (stmt_vinfo)
                    && !STMT_SLP_TYPE (vinfo_for_stmt
                          (STMT_VINFO_RELATED_STMT (stmt_vinfo)))))
            && !(gimple_code (use_stmt) == GIMPLE_PHI
                 && STMT_VINFO_DEF_TYPE (stmt_vinfo)
                      == vect_reduction_def))
          vect_mark_slp_stmts (node, hybrid, i);

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_detect_hybrid_slp_stmts (child);
}
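/* For example (illustrative), if a PURE_SLP stmt  x = a + b  in the
   SLP tree has a use  sum += x  that is handled by the loop
   vectorizer rather than by SLP, the stmt must be vectorized both as
   part of the SLP instance and by loop-based vectorization, and is
   therefore re-marked as hybrid above.  */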
/* Find stmts that must be both vectorized and SLPed.  */

void
vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
{
  unsigned int i;
  vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
  slp_instance instance;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "=== vect_detect_hybrid_slp ===\n");

  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    vect_detect_hybrid_slp_stmts (SLP_INSTANCE_TREE (instance));
}
/* Create and initialize a new bb_vec_info struct for BB, as well as
   stmt_vec_info structs for all the stmts in it.  */

static bb_vec_info
new_bb_vec_info (basic_block bb)
{
  bb_vec_info res = NULL;
  gimple_stmt_iterator gsi;

  res = (bb_vec_info) xcalloc (1, sizeof (struct _bb_vec_info));
  BB_VINFO_BB (res) = bb;

  for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
    {
      gimple stmt = gsi_stmt (gsi);
      gimple_set_uid (stmt, 0);
      set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, NULL, res));
    }

  BB_VINFO_GROUPED_STORES (res).create (10);
  BB_VINFO_SLP_INSTANCES (res).create (2);
  BB_VINFO_TARGET_COST_DATA (res) = init_cost (NULL);

  bb->aux = res;
  return res;
}
/* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
   stmts in the basic block.  */

static void
destroy_bb_vec_info (bb_vec_info bb_vinfo)
{
  vec<slp_instance> slp_instances;
  slp_instance instance;
  basic_block bb;
  gimple_stmt_iterator si;
  unsigned i;

  if (!bb_vinfo)
    return;

  bb = BB_VINFO_BB (bb_vinfo);

  for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
    {
      gimple stmt = gsi_stmt (si);
      stmt_vec_info stmt_info = vinfo_for_stmt (stmt);

      if (stmt_info)
        /* Free stmt_vec_info.  */
        free_stmt_vec_info (stmt);
    }

  vect_destroy_datarefs (NULL, bb_vinfo);
  free_dependence_relations (BB_VINFO_DDRS (bb_vinfo));
  BB_VINFO_GROUPED_STORES (bb_vinfo).release ();
  slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    vect_free_slp_instance (instance);
  BB_VINFO_SLP_INSTANCES (bb_vinfo).release ();
  destroy_cost_data (BB_VINFO_TARGET_COST_DATA (bb_vinfo));
  free (bb_vinfo);
  bb->aux = NULL;
}
/* Analyze statements contained in SLP tree node after recursively analyzing
   the subtree.  Return TRUE if the operations are supported.  */

static bool
vect_slp_analyze_node_operations (bb_vec_info bb_vinfo, slp_tree node)
{
  bool dummy;
  int i;
  gimple stmt;
  slp_tree child;

  if (!node)
    return true;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (!vect_slp_analyze_node_operations (bb_vinfo, child))
      return false;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    {
      stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
      gcc_assert (stmt_info);
      gcc_assert (PURE_SLP_STMT (stmt_info));

      if (!vect_analyze_stmt (stmt, &dummy, node))
        return false;
    }

  return true;
}
/* Analyze statements in SLP instances of the basic block.  Return TRUE if the
   operations are supported.  */

static bool
vect_slp_analyze_operations (bb_vec_info bb_vinfo)
{
  vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
  slp_instance instance;
  int i;

  for (i = 0; slp_instances.iterate (i, &instance); )
    {
      if (!vect_slp_analyze_node_operations (bb_vinfo,
                                             SLP_INSTANCE_TREE (instance)))
        {
          vect_free_slp_instance (instance);
          slp_instances.ordered_remove (i);
        }
      else
        i++;
    }

  if (!slp_instances.length ())
    return false;

  return true;
}
/* Compute the scalar cost of the SLP node NODE and its children
   and return it.  Do not account defs that are marked in LIFE and
   update LIFE according to uses of NODE.  */

static unsigned
vect_bb_slp_scalar_cost (basic_block bb,
                         slp_tree node, vec<bool, va_heap> *life)
{
  unsigned scalar_cost = 0;
  unsigned i;
  gimple stmt;
  slp_tree child;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    {
      unsigned stmt_cost;
      ssa_op_iter op_iter;
      def_operand_p def_p;
      stmt_vec_info stmt_info;

      if ((*life)[i])
        continue;

      /* If there is a non-vectorized use of the defs then the scalar
         stmt is kept live in which case we do not account it or any
         required defs in the SLP children in the scalar cost.  This
         way we make the vectorization more costly when compared to
         the scalar cost.  */
      FOR_EACH_SSA_DEF_OPERAND (def_p, stmt, op_iter, SSA_OP_DEF)
        {
          imm_use_iterator use_iter;
          gimple use_stmt;
          FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
            if (!is_gimple_debug (use_stmt)
                && (gimple_code (use_stmt) == GIMPLE_PHI
                    || gimple_bb (use_stmt) != bb
                    || !STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (use_stmt))))
              {
                (*life)[i] = true;
                BREAK_FROM_IMM_USE_STMT (use_iter);
              }
        }
      if ((*life)[i])
        continue;

      stmt_info = vinfo_for_stmt (stmt);
      if (STMT_VINFO_DATA_REF (stmt_info))
        {
          if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
            stmt_cost = vect_get_stmt_cost (scalar_load);
          else
            stmt_cost = vect_get_stmt_cost (scalar_store);
        }
      else
        stmt_cost = vect_get_stmt_cost (scalar_stmt);

      scalar_cost += stmt_cost;
    }

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    scalar_cost += vect_bb_slp_scalar_cost (bb, child, life);

  return scalar_cost;
}
/* Check if vectorization of the basic block is profitable.  */

static bool
vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo)
{
  vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
  slp_instance instance;
  int i, j;
  unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
  unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
  void *target_cost_data = BB_VINFO_TARGET_COST_DATA (bb_vinfo);
  stmt_vec_info stmt_info = NULL;
  stmt_vector_for_cost body_cost_vec;
  stmt_info_for_cost *ci;

  /* Calculate vector costs.  */
  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      body_cost_vec = SLP_INSTANCE_BODY_COST_VEC (instance);

      FOR_EACH_VEC_ELT (body_cost_vec, j, ci)
        {
          stmt_info = ci->stmt ? vinfo_for_stmt (ci->stmt) : NULL;
          (void) add_stmt_cost (target_cost_data, ci->count, ci->kind,
                                stmt_info, ci->misalign, vect_body);
        }
    }

  /* Calculate scalar cost.  */
  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      auto_vec<bool, 20> life;
      life.safe_grow_cleared (SLP_INSTANCE_GROUP_SIZE (instance));
      scalar_cost += vect_bb_slp_scalar_cost (BB_VINFO_BB (bb_vinfo),
                                              SLP_INSTANCE_TREE (instance),
                                              &life);
    }

  /* Complete the target-specific cost calculation.  */
  finish_cost (BB_VINFO_TARGET_COST_DATA (bb_vinfo), &vec_prologue_cost,
               &vec_inside_cost, &vec_epilogue_cost);

  vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
      dump_printf (MSG_NOTE, "  Vector inside of basic block cost: %d\n",
                   vec_inside_cost);
      dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n", vec_prologue_cost);
      dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n", vec_epilogue_cost);
      dump_printf (MSG_NOTE, "  Scalar cost of basic block: %d\n", scalar_cost);
    }

  /* Vectorization is profitable if its cost is less than the cost of scalar
     version.  */
  if (vec_outside_cost + vec_inside_cost >= scalar_cost)
    return false;

  return true;
}
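/* For example, with a scalar cost of 8 and vector costs of 2 (inside)
   plus 1 (prologue) and 0 (epilogue), 2 + 1 < 8 so the basic block is
   considered profitable to vectorize; had the vector sum reached 8,
   the function would return false.  */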
/* Check if the basic block can be vectorized.  */

static bb_vec_info
vect_slp_analyze_bb_1 (basic_block bb)
{
  bb_vec_info bb_vinfo;
  vec<slp_instance> slp_instances;
  slp_instance instance;
  int i;
  int min_vf = 2;
  unsigned n_stmts = 0;

  bb_vinfo = new_bb_vec_info (bb);
  if (!bb_vinfo)
    return NULL;

  if (!vect_analyze_data_refs (NULL, bb_vinfo, &min_vf, &n_stmts))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: unhandled data-ref in basic "
                         "block.\n");

      destroy_bb_vec_info (bb_vinfo);
      return NULL;
    }

  if (BB_VINFO_DATAREFS (bb_vinfo).length () < 2)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: not enough data-refs in "
                         "basic block.\n");

      destroy_bb_vec_info (bb_vinfo);
      return NULL;
    }

  if (!vect_analyze_data_ref_accesses (NULL, bb_vinfo))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: unhandled data access in "
                         "basic block.\n");

      destroy_bb_vec_info (bb_vinfo);
      return NULL;
    }

  vect_pattern_recog (NULL, bb_vinfo);

  if (!vect_analyze_data_refs_alignment (NULL, bb_vinfo))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: bad data alignment in basic "
                         "block.\n");

      destroy_bb_vec_info (bb_vinfo);
      return NULL;
    }

  /* Check the SLP opportunities in the basic block, analyze and build SLP
     trees.  */
  if (!vect_analyze_slp (NULL, bb_vinfo, n_stmts))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: failed to find SLP opportunities "
                         "in basic block.\n");

      destroy_bb_vec_info (bb_vinfo);
      return NULL;
    }

  slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);

  /* Mark all the statements that we want to vectorize as pure SLP and
     relevant.  */
  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1);
      vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
    }

  /* Mark all the statements that we do not want to vectorize.  */
  for (gimple_stmt_iterator gsi = gsi_start_bb (BB_VINFO_BB (bb_vinfo));
       !gsi_end_p (gsi); gsi_next (&gsi))
    {
      stmt_vec_info vinfo = vinfo_for_stmt (gsi_stmt (gsi));
      if (STMT_SLP_TYPE (vinfo) != pure_slp)
        STMT_VINFO_VECTORIZABLE (vinfo) = false;
    }

  /* Analyze dependences.  At this point all stmts not participating in
     vectorization have to be marked.  Dependence analysis assumes
     that we either vectorize all SLP instances or none at all.  */
  if (!vect_slp_analyze_data_ref_dependences (bb_vinfo))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: unhandled data dependence "
                         "in basic block.\n");

      destroy_bb_vec_info (bb_vinfo);
      return NULL;
    }

  if (!vect_verify_datarefs_alignment (NULL, bb_vinfo))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: unsupported alignment in basic "
                         "block.\n");
      destroy_bb_vec_info (bb_vinfo);
      return NULL;
    }

  if (!vect_slp_analyze_operations (bb_vinfo))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: bad operation in basic block.\n");

      destroy_bb_vec_info (bb_vinfo);
      return NULL;
    }

  /* Cost model: check if the vectorization is worthwhile.  */
  if (!unlimited_cost_model (NULL)
      && !vect_bb_vectorization_profitable_p (bb_vinfo))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: vectorization is not "
                         "profitable.\n");

      destroy_bb_vec_info (bb_vinfo);
      return NULL;
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "Basic block will be vectorized using SLP\n");

  return bb_vinfo;
}

bb_vec_info
vect_slp_analyze_bb (basic_block bb)
{
  bb_vec_info bb_vinfo;
  int insns = 0;
  gimple_stmt_iterator gsi;
  unsigned int vector_sizes;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "===vect_slp_analyze_bb===\n");

  for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
    {
      gimple stmt = gsi_stmt (gsi);
      if (!is_gimple_debug (stmt)
          && !gimple_nop_p (stmt)
          && gimple_code (stmt) != GIMPLE_LABEL)
        insns++;
    }

  if (insns > PARAM_VALUE (PARAM_SLP_MAX_INSNS_IN_BB))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: too many instructions in "
                         "basic block.\n");

      return NULL;
    }

  /* Autodetect first vector size we try.  */
  current_vector_size = 0;
  vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
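
  /* Illustrative example (not from the original sources): a target that
     supports 32- and 16-byte vectors reports VECTOR_SIZES == 0x30.  The
     loop below starts with the autodetected size, and on failure clears
     the bit of the size just tried and retries with the largest remaining
     one (1 << floor_log2 (0x30) == 32, then 16) until analysis succeeds
     or the mask runs out.  */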
  while (1)
    {
      bb_vinfo = vect_slp_analyze_bb_1 (bb);

      if (bb_vinfo)
        return bb_vinfo;

      destroy_bb_vec_info (bb_vinfo);

      vector_sizes &= ~current_vector_size;
      if (vector_sizes == 0
          || current_vector_size == 0)
        return NULL;

      /* Try the next biggest vector size.  */
      current_vector_size = 1 << floor_log2 (vector_sizes);
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "***** Re-trying analysis with "
                         "vector size %d\n", current_vector_size);
    }
}

/* SLP costs are calculated according to SLP instance unrolling factor (i.e.,
   the number of created vector stmts depends on the unrolling factor).
   However, the actual number of vector stmts for every SLP node depends on
   VF which is set later in vect_analyze_operations ().  Hence, SLP costs
   should be updated.  In this function we assume that the inside costs
   calculated in vect_model_xxx_cost are linear in ncopies.  */

void
vect_update_slp_costs_according_to_vf (loop_vec_info loop_vinfo)
{
  unsigned int i, j, vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
  slp_instance instance;
  stmt_vector_for_cost body_cost_vec;
  stmt_info_for_cost *si;
  void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "=== vect_update_slp_costs_according_to_vf ===\n");

  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      /* We assume that costs are linear in ncopies.  */
      int ncopies = vf / SLP_INSTANCE_UNROLLING_FACTOR (instance);
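
      /* Illustrative numbers (not from the original sources): with
         VF == 8 and an SLP unrolling factor of 4, NCOPIES == 2, so each
         cost recorded below is counted twice.  */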
      /* Record the instance's instructions in the target cost model.
         This was delayed until here because the count of instructions
         isn't known beforehand.  */
      body_cost_vec = SLP_INSTANCE_BODY_COST_VEC (instance);

      FOR_EACH_VEC_ELT (body_cost_vec, j, si)
        (void) add_stmt_cost (data, si->count * ncopies, si->kind,
                              vinfo_for_stmt (si->stmt), si->misalign,
                              vect_body);
    }
}

/* For constant and loop invariant defs of SLP_NODE this function returns
   (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts.
   OP_NUM determines if we gather defs for operand 0 or operand 1 of the RHS of
   scalar stmts.  NUMBER_OF_VECTORS is the number of vector defs to create.
   REDUC_INDEX is the index of the reduction operand in the statements, unless
   it is -1.  */

static void
vect_get_constant_vectors (tree op, slp_tree slp_node,
                           vec<tree> *vec_oprnds,
                           unsigned int op_num, unsigned int number_of_vectors,
                           int reduc_index)
{
  vec<gimple> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
  gimple stmt = stmts[0];
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
  unsigned nunits;
  tree vec_cst;
  tree *elts;
  unsigned j, number_of_places_left_in_vector;
  tree vector_type;
  tree vop;
  int group_size = stmts.length ();
  unsigned int vec_num, i;
  unsigned number_of_copies = 1;
  vec<tree> voprnds;
  voprnds.create (number_of_vectors);
  bool constant_p, is_store;
  tree neutral_op = NULL;
  enum tree_code code = gimple_expr_code (stmt);
  gimple def_stmt;
  struct loop *loop;
  gimple_seq ctor_seq = NULL;

  if (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def
      && reduc_index != -1)
    {
      op_num = reduc_index - 1;
      op = gimple_op (stmt, reduc_index);
      /* For additional copies (see the explanation of NUMBER_OF_COPIES below)
         we need either neutral operands or the original operands.  See
         get_initial_def_for_reduction() for details.  */
      switch (code)
        {
          case WIDEN_SUM_EXPR:
          case DOT_PROD_EXPR:
          case PLUS_EXPR:
          case MINUS_EXPR:
          case BIT_IOR_EXPR:
          case BIT_XOR_EXPR:
            if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (op)))
              neutral_op = build_real (TREE_TYPE (op), dconst0);
            else
              neutral_op = build_int_cst (TREE_TYPE (op), 0);

            break;

          case MULT_EXPR:
            if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (op)))
              neutral_op = build_real (TREE_TYPE (op), dconst1);
            else
              neutral_op = build_int_cst (TREE_TYPE (op), 1);

            break;

          case BIT_AND_EXPR:
            neutral_op = build_int_cst (TREE_TYPE (op), -1);
            break;

          /* For MIN/MAX we don't have an easy neutral operand but
             the initial values can be used fine here.  Only for
             a reduction chain we have to force a neutral element.  */
          case MAX_EXPR:
          case MIN_EXPR:
            if (!GROUP_FIRST_ELEMENT (stmt_vinfo))
              neutral_op = NULL;
            else
              {
                def_stmt = SSA_NAME_DEF_STMT (op);
                loop = (gimple_bb (stmt))->loop_father;
                neutral_op = PHI_ARG_DEF_FROM_EDGE (def_stmt,
                                                    loop_preheader_edge (loop));
              }
            break;

          default:
            neutral_op = NULL;
        }
    }

  if (STMT_VINFO_DATA_REF (stmt_vinfo))
    {
      is_store = true;
      op = gimple_assign_rhs1 (stmt);
    }
  else
    is_store = false;

  gcc_assert (op);

  if (CONSTANT_CLASS_P (op))
    constant_p = true;
  else
    constant_p = false;

  vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
  gcc_assert (vector_type);
  nunits = TYPE_VECTOR_SUBPARTS (vector_type);

  /* NUMBER_OF_COPIES is the number of times we need to use the same values in
     created vectors.  It is greater than 1 if unrolling is performed.

     For example, we have two scalar operands, s1 and s2 (e.g., group of
     strided accesses of size two), while NUNITS is four (i.e., four scalars
     of this type can be packed in a vector).  The output vector will contain
     two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
     will be 2).

     If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
     containing the operands.

     For example, NUNITS is four as before, and the group size is 8
     (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
     {s5, s6, s7, s8}.  */

  number_of_copies = least_common_multiple (nunits, group_size) / group_size;
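
  /* A further illustration (not from the original sources): with
     NUNITS == 4 and GROUP_SIZE == 6, least_common_multiple (4, 6) == 12,
     so NUMBER_OF_COPIES == 2 and the twelve operand slots are emitted as
     three vectors of four elements each.  */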
  number_of_places_left_in_vector = nunits;
  elts = XALLOCAVEC (tree, nunits);
  for (j = 0; j < number_of_copies; j++)
    {
      for (i = group_size - 1; stmts.iterate (i, &stmt); i--)
        {
          if (is_store)
            op = gimple_assign_rhs1 (stmt);
          else
            {
              switch (code)
                {
                  case COND_EXPR:
                    if (op_num == 0 || op_num == 1)
                      {
                        tree cond = gimple_assign_rhs1 (stmt);
                        op = TREE_OPERAND (cond, op_num);
                      }
                    else
                      {
                        if (op_num == 2)
                          op = gimple_assign_rhs2 (stmt);
                        else
                          op = gimple_assign_rhs3 (stmt);
                      }
                    break;

                  case CALL_EXPR:
                    op = gimple_call_arg (stmt, op_num);
                    break;

                  case LSHIFT_EXPR:
                  case RSHIFT_EXPR:
                  case LROTATE_EXPR:
                  case RROTATE_EXPR:
                    op = gimple_op (stmt, op_num + 1);
                    /* Unlike the other binary operators, shifts/rotates have
                       the shift count being int, instead of the same type as
                       the lhs, so make sure the scalar is the right type if
                       we are dealing with vectors of
                       long long/long/short/char.  */
                    if (op_num == 1 && TREE_CODE (op) == INTEGER_CST)
                      op = fold_convert (TREE_TYPE (vector_type), op);
                    break;

                  default:
                    op = gimple_op (stmt, op_num + 1);
                    break;
                }
            }

          if (reduc_index != -1)
            {
              loop = (gimple_bb (stmt))->loop_father;
              def_stmt = SSA_NAME_DEF_STMT (op);

              gcc_assert (loop);

              /* Get the def before the loop.  In reduction chain we have only
                 one initial value.  */
              if ((j != (number_of_copies - 1)
                   || (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
                       && i != 0))
                  && neutral_op)
                op = neutral_op;
              else
                op = PHI_ARG_DEF_FROM_EDGE (def_stmt,
                                            loop_preheader_edge (loop));
            }

          /* Create 'vect_ = {op0,op1,...,opn}'.  */
          number_of_places_left_in_vector--;
          if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
            {
              if (CONSTANT_CLASS_P (op))
                {
                  op = fold_unary (VIEW_CONVERT_EXPR,
                                   TREE_TYPE (vector_type), op);
                  gcc_assert (op && CONSTANT_CLASS_P (op));
                }
              else
                {
                  tree new_temp
                    = make_ssa_name (TREE_TYPE (vector_type), NULL);
                  gimple init_stmt;
                  op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
                               op);
                  init_stmt
                    = gimple_build_assign_with_ops (VIEW_CONVERT_EXPR,
                                                    new_temp, op, NULL_TREE);
                  gimple_seq_add_stmt (&ctor_seq, init_stmt);
                  op = new_temp;
                }
            }
          elts[number_of_places_left_in_vector] = op;
          if (!CONSTANT_CLASS_P (op))
            constant_p = false;

          if (number_of_places_left_in_vector == 0)
            {
              number_of_places_left_in_vector = nunits;

              if (constant_p)
                vec_cst = build_vector (vector_type, elts);
              else
                {
                  vec<constructor_elt, va_gc> *v;
                  unsigned k;
                  vec_alloc (v, nunits);
                  for (k = 0; k < nunits; ++k)
                    CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[k]);
                  vec_cst = build_constructor (vector_type, v);
                }
              voprnds.quick_push (vect_init_vector (stmt, vec_cst,
                                                    vector_type, NULL));
              if (ctor_seq != NULL)
                {
                  gimple init_stmt = SSA_NAME_DEF_STMT (voprnds.last ());
                  gimple_stmt_iterator gsi = gsi_for_stmt (init_stmt);
                  gsi_insert_seq_before_without_update (&gsi, ctor_seq,
                                                        GSI_SAME_STMT);
                  ctor_seq = NULL;
                }
            }
        }
    }

  /* Since the vectors are created in the reverse order, we should invert
     them.  */
  vec_num = voprnds.length ();
  for (j = vec_num; j != 0; j--)
    {
      vop = voprnds[j - 1];
      vec_oprnds->quick_push (vop);
    }

  voprnds.release ();

  /* In case that VF is greater than the unrolling factor needed for the SLP
     group of stmts, NUMBER_OF_VECTORS to be created is greater than
     NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
     to replicate the vectors.  */
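  /* For instance (illustrative, not from the original sources): if two
     vectors were created above but NUMBER_OF_VECTORS is 4, the loop below
     pushes two more entries - copies of the neutral-element vector for
     reductions, or repeats of the already created vectors otherwise.  */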
  while (number_of_vectors > vec_oprnds->length ())
    {
      tree neutral_vec = NULL;

      if (neutral_op)
        {
          if (!neutral_vec)
            neutral_vec = build_vector_from_val (vector_type, neutral_op);

          vec_oprnds->quick_push (neutral_vec);
        }
      else
        {
          for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
            vec_oprnds->quick_push (vop);
        }
    }
}

/* Get vectorized definitions from SLP_NODE that contains corresponding
   vectorized def-stmts.  */

static void
vect_get_slp_vect_defs (slp_tree slp_node, vec<tree> *vec_oprnds)
{
  tree vec_oprnd;
  gimple vec_def_stmt;
  unsigned int i;

  gcc_assert (SLP_TREE_VEC_STMTS (slp_node).exists ());

  FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), i, vec_def_stmt)
    {
      gcc_assert (vec_def_stmt);
      vec_oprnd = gimple_get_lhs (vec_def_stmt);
      vec_oprnds->quick_push (vec_oprnd);
    }
}

/* Get vectorized definitions for SLP_NODE.
   If the scalar definitions are loop invariants or constants, collect them and
   call vect_get_constant_vectors() to create vector stmts.
   Otherwise, the def-stmts must be already vectorized and the vectorized stmts
   must be stored in the corresponding child of SLP_NODE, and we call
   vect_get_slp_vect_defs () to retrieve them.  */

void
vect_get_slp_defs (vec<tree> ops, slp_tree slp_node,
                   vec<vec<tree> > *vec_oprnds, int reduc_index)
{
  gimple first_stmt;
  int number_of_vects = 0, i;
  unsigned int child_index = 0;
  HOST_WIDE_INT lhs_size_unit, rhs_size_unit;
  slp_tree child = NULL;
  vec<tree> vec_defs;
  tree oprnd;
  bool vectorized_defs;

  first_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[0];
  FOR_EACH_VEC_ELT (ops, i, oprnd)
    {
      /* For each operand we check if it has vectorized definitions in a child
         node or we need to create them (for invariants and constants).  We
         check if the LHS of the first stmt of the next child matches OPRND.
         If it does, we found the correct child.  Otherwise, we call
         vect_get_constant_vectors (), and not advance CHILD_INDEX in order
         to check this child node for the next operand.  */
      vectorized_defs = false;
      if (SLP_TREE_CHILDREN (slp_node).length () > child_index)
        {
          child = SLP_TREE_CHILDREN (slp_node)[child_index];

          /* We have to check both pattern and original def, if available.  */
          gimple first_def = SLP_TREE_SCALAR_STMTS (child)[0];
          gimple related = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first_def));

          if (operand_equal_p (oprnd, gimple_get_lhs (first_def), 0)
              || (related
                  && operand_equal_p (oprnd, gimple_get_lhs (related), 0)))
            {
              /* The number of vector defs is determined by the number of
                 vector statements in the node from which we get those
                 statements.  */
              number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (child);
              vectorized_defs = true;
              child_index++;
            }
        }

      if (!vectorized_defs)
        {
          if (i == 0)
            {
              number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
              /* Number of vector stmts was calculated according to LHS in
                 vect_schedule_slp_instance (), fix it by replacing LHS with
                 RHS, if necessary.  See vect_get_smallest_scalar_type () for
                 details.  */
              vect_get_smallest_scalar_type (first_stmt, &lhs_size_unit,
                                             &rhs_size_unit);
              if (rhs_size_unit != lhs_size_unit)
                {
                  number_of_vects *= rhs_size_unit;
                  number_of_vects /= lhs_size_unit;
                }
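
              /* E.g. (illustrative, not from the original sources): a
                 store of chars whose RHS is computed in ints has
                 LHS_SIZE_UNIT == 1 and RHS_SIZE_UNIT == 4, so
                 NUMBER_OF_VECTS is scaled up by a factor of four.  */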
            }
        }

      /* Allocate memory for vectorized defs.  */
      vec_defs = vNULL;
      vec_defs.create (number_of_vects);

      /* For reduction defs we call vect_get_constant_vectors (), since we are
         looking for initial loop invariant values.  */
      if (vectorized_defs && reduc_index == -1)
        /* The defs are already vectorized.  */
        vect_get_slp_vect_defs (child, &vec_defs);
      else
        /* Build vectors from scalar defs.  */
        vect_get_constant_vectors (oprnd, slp_node, &vec_defs, i,
                                   number_of_vects, reduc_index);

      vec_oprnds->quick_push (vec_defs);

      /* For reductions, we only need initial values.  */
      if (reduc_index != -1)
        return;
    }
}

/* Create NCOPIES permutation statements using the mask MASK_BYTES (by
   building a vector of type MASK_TYPE from it) and two input vectors placed in
   DR_CHAIN at FIRST_VEC_INDX and SECOND_VEC_INDX for the first copy and
   shifting by STRIDE elements of DR_CHAIN for every copy.
   (STRIDE is the number of vectorized stmts for NODE divided by the number of
   copies).
   VECT_STMTS_COUNTER specifies the index in the vectorized stmts of NODE, where
   the created stmts must be inserted.  */
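
/* For instance (illustrative numbers, not from the original sources): with
   4 vector stmts for NODE and NCOPIES == 2, STRIDE == 2; copy 0 reads
   DR_CHAIN[first_vec_indx] and fills slot VECT_STMTS_COUNTER, while copy 1
   reads DR_CHAIN[first_vec_indx + 2] and fills slot
   2 + VECT_STMTS_COUNTER.  */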

static inline void
vect_create_mask_and_perm (gimple stmt, gimple next_scalar_stmt,
                           tree mask, int first_vec_indx, int second_vec_indx,
                           gimple_stmt_iterator *gsi, slp_tree node,
                           tree vectype, vec<tree> dr_chain,
                           int ncopies, int vect_stmts_counter)
{
  tree perm_dest;
  gimple perm_stmt = NULL;
  stmt_vec_info next_stmt_info;
  int i, stride;
  tree first_vec, second_vec, data_ref;

  stride = SLP_TREE_NUMBER_OF_VEC_STMTS (node) / ncopies;

  /* Initialize the vect stmts of NODE to properly insert the generated
     stmts later.  */
  for (i = SLP_TREE_VEC_STMTS (node).length ();
       i < (int) SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
    SLP_TREE_VEC_STMTS (node).quick_push (NULL);

  perm_dest = vect_create_destination_var (gimple_assign_lhs (stmt), vectype);
  for (i = 0; i < ncopies; i++)
    {
      first_vec = dr_chain[first_vec_indx];
      second_vec = dr_chain[second_vec_indx];

      /* Generate the permute statement.  */
      perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, perm_dest,
                                                first_vec, second_vec, mask);
      data_ref = make_ssa_name (perm_dest, perm_stmt);
      gimple_set_lhs (perm_stmt, data_ref);
      vect_finish_stmt_generation (stmt, perm_stmt, gsi);

      /* Store the vector statement in NODE.  */
      SLP_TREE_VEC_STMTS (node)[stride * i + vect_stmts_counter] = perm_stmt;

      first_vec_indx += stride;
      second_vec_indx += stride;
    }

  /* Mark the scalar stmt as vectorized.  */
  next_stmt_info = vinfo_for_stmt (next_scalar_stmt);
  STMT_VINFO_VEC_STMT (next_stmt_info) = perm_stmt;
}

/* Given FIRST_MASK_ELEMENT - the mask element in element representation,
   return in CURRENT_MASK_ELEMENT its equivalent in target specific
   representation.  Check that the mask is valid and return FALSE if not.
   Return TRUE in NEED_NEXT_VECTOR if the permutation requires to move to
   the next vector, i.e., the current first vector is not needed.  */
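
/* As an illustration (not from the original sources): with MASK_NUNITS == 4,
   a FIRST_MASK_ELEMENT of 6 selects element 2 of the second input vector.
   A value of 8 or more refers to a third vector; that is only acceptable
   when the first vector is no longer needed, otherwise the function
   fails.  */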

static bool
vect_get_mask_element (gimple stmt, int first_mask_element, int m,
                       int mask_nunits, bool only_one_vec, int index,
                       unsigned char *mask, int *current_mask_element,
                       bool *need_next_vector, int *number_of_mask_fixes,
                       bool *mask_fixed, bool *needs_first_vector)
{
  int i;

  /* Convert to target specific representation.  */
  *current_mask_element = first_mask_element + m;
  /* Adjust the value in case it's a mask for second and third vectors.  */
  *current_mask_element -= mask_nunits * (*number_of_mask_fixes - 1);

  if (*current_mask_element < mask_nunits)
    *needs_first_vector = true;

  /* We have only one input vector to permute but the mask accesses values in
     the next vector as well.  */
  if (only_one_vec && *current_mask_element >= mask_nunits)
    {
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "permutation requires at least two vectors ");
          dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }

      return false;
    }

  /* The mask requires the next vector.  */
  if (*current_mask_element >= mask_nunits * 2)
    {
      if (*needs_first_vector || *mask_fixed)
        {
          /* We either need the first vector too or have already moved to the
             next vector.  In both cases, this permutation needs three
             vectors.  */
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "permutation requires at "
                               "least three vectors ");
              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }

          return false;
        }

      /* We move to the next vector, dropping the first one and working with
         the second and the third - we need to adjust the values of the mask
         accordingly.  */
      *current_mask_element -= mask_nunits * *number_of_mask_fixes;

      for (i = 0; i < index; i++)
        mask[i] -= mask_nunits * *number_of_mask_fixes;

      (*number_of_mask_fixes)++;
      *mask_fixed = true;
    }

  *need_next_vector = *mask_fixed;

  /* This was the last element of this mask.  Start a new one.  */
  if (index == mask_nunits - 1)
    {
      *number_of_mask_fixes = 1;
      *mask_fixed = false;
      *needs_first_vector = false;
    }

  return true;
}

/* Generate vector permute statements from a list of loads in DR_CHAIN.
   If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
   permute statements for the SLP node NODE of the SLP instance
   SLP_NODE_INSTANCE.  */

bool
vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
                              gimple_stmt_iterator *gsi, int vf,
                              slp_instance slp_node_instance, bool analyze_only)
{
  gimple stmt = SLP_TREE_SCALAR_STMTS (node)[0];
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  tree mask_element_type = NULL_TREE, mask_type;
  int i, j, k, nunits, vec_index = 0, scalar_index;
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  gimple next_scalar_stmt;
  int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
  int first_mask_element;
  int index, unroll_factor, current_mask_element, ncopies;
  unsigned char *mask;
  bool only_one_vec = false, need_next_vector = false;
  int first_vec_index, second_vec_index, orig_vec_stmts_num, vect_stmts_counter;
  int number_of_mask_fixes = 1;
  bool mask_fixed = false;
  bool needs_first_vector = false;
  enum machine_mode mode;

  mode = TYPE_MODE (vectype);

  if (!can_vec_perm_p (mode, false, NULL))
    {
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "no vect permute for ");
          dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }
      return false;
    }

  /* The generic VEC_PERM_EXPR code always uses an integral type of the
     same size as the vector element being permuted.  */
  mask_element_type = lang_hooks.types.type_for_mode
                (int_mode_for_mode (TYPE_MODE (TREE_TYPE (vectype))), 1);
  mask_type = get_vectype_for_scalar_type (mask_element_type);
  nunits = TYPE_VECTOR_SUBPARTS (vectype);
  mask = XALLOCAVEC (unsigned char, nunits);
  unroll_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);

  /* The number of vector stmts to generate based only on SLP_NODE_INSTANCE
     unrolling factor.  */
  orig_vec_stmts_num = group_size *
                SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance) / nunits;
  if (orig_vec_stmts_num == 1)
    only_one_vec = true;

  /* Number of copies is determined by the final vectorization factor
     relatively to SLP_NODE_INSTANCE unrolling factor.  */
  ncopies = vf / SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);

  if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
    return false;

  /* Generate permutation masks for every NODE. Number of masks for each NODE
     is equal to GROUP_SIZE.
     E.g., we have a group of three nodes with three loads from the same
     location in each node, and the vector size is 4. I.e., we have a
     a0b0c0a1b1c1... sequence and we need to create the following vectors:
     for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
     for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
     ...

     The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
     The last mask is illegal since we assume two operands for permute
     operation, and the mask element values can't be outside that range.
     Hence, the last mask must be converted into {2,5,5,5}.
     For the first two permutations we need the first and the second input
     vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
     we need the second and the third vectors: {b1,c1,a2,b2} and
     {c2,a3,b3,c3}.  */

  scalar_index = 0;
  index = 0;
  vect_stmts_counter = 0;
  vec_index = 0;
  first_vec_index = vec_index++;
  if (only_one_vec)
    second_vec_index = first_vec_index;
  else
    second_vec_index = vec_index++;

  for (j = 0; j < unroll_factor; j++)
    {
      for (k = 0; k < group_size; k++)
        {
          i = SLP_TREE_LOAD_PERMUTATION (node)[k];
          first_mask_element = i + j * group_size;
          if (!vect_get_mask_element (stmt, first_mask_element, 0,
                                      nunits, only_one_vec, index,
                                      mask, &current_mask_element,
                                      &need_next_vector,
                                      &number_of_mask_fixes, &mask_fixed,
                                      &needs_first_vector))
            return false;
          gcc_assert (current_mask_element < 2 * nunits);
          mask[index++] = current_mask_element;

          if (index == nunits)
            {
              index = 0;
              if (!can_vec_perm_p (mode, false, mask))
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                       vect_location,
                                       "unsupported vect permute { ");
                      for (i = 0; i < nunits; ++i)
                        dump_printf (MSG_MISSED_OPTIMIZATION, "%d ",
                                     mask[i]);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
                    }
                  return false;
                }

              if (!analyze_only)
                {
                  int l;
                  tree mask_vec, *mask_elts;
                  mask_elts = XALLOCAVEC (tree, nunits);
                  for (l = 0; l < nunits; ++l)
                    mask_elts[l] = build_int_cst (mask_element_type,
                                                  mask[l]);
                  mask_vec = build_vector (mask_type, mask_elts);

                  if (need_next_vector)
                    {
                      first_vec_index = second_vec_index;
                      second_vec_index = vec_index;
                    }

                  next_scalar_stmt
                    = SLP_TREE_SCALAR_STMTS (node)[scalar_index++];

                  vect_create_mask_and_perm (stmt, next_scalar_stmt,
                                             mask_vec, first_vec_index,
                                             second_vec_index,
                                             gsi, node, vectype, dr_chain,
                                             ncopies, vect_stmts_counter++);
                }
            }
        }
    }

  return true;
}

/* Vectorize SLP instance tree in postorder.  */

static bool
vect_schedule_slp_instance (slp_tree node, slp_instance instance,
                            unsigned int vectorization_factor)
{
  gimple stmt;
  bool grouped_store, is_store;
  gimple_stmt_iterator si;
  stmt_vec_info stmt_info;
  unsigned int vec_stmts_size, nunits, group_size;
  tree vectype;
  int i;
  slp_tree child;

  if (!node)
    return false;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_schedule_slp_instance (child, instance, vectorization_factor);

  stmt = SLP_TREE_SCALAR_STMTS (node)[0];
  stmt_info = vinfo_for_stmt (stmt);

  /* VECTYPE is the type of the destination.  */
  vectype = STMT_VINFO_VECTYPE (stmt_info);
  nunits = (unsigned int) TYPE_VECTOR_SUBPARTS (vectype);
  group_size = SLP_INSTANCE_GROUP_SIZE (instance);

  /* For each SLP instance calculate number of vector stmts to be created
     for the scalar stmts in each node of the SLP tree.  Number of vector
     elements in one vector iteration is the number of scalar elements in
     one scalar iteration (GROUP_SIZE) multiplied by VF divided by vector
     size.  */
  vec_stmts_size = (vectorization_factor * group_size) / nunits;
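
  /* E.g. (illustrative numbers, not from the original sources): with
     VECTORIZATION_FACTOR == 4, GROUP_SIZE == 2 and NUNITS == 4, the node
     gets (4 * 2) / 4 == 2 vector stmts.  */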
  if (!SLP_TREE_VEC_STMTS (node).exists ())
    {
      SLP_TREE_VEC_STMTS (node).create (vec_stmts_size);
      SLP_TREE_NUMBER_OF_VEC_STMTS (node) = vec_stmts_size;
    }

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location,
                       "------>vectorizing SLP node starting from: ");
      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
      dump_printf (MSG_NOTE, "\n");
    }

  /* Loads should be inserted before the first load.  */
  if (SLP_INSTANCE_FIRST_LOAD_STMT (instance)
      && STMT_VINFO_GROUPED_ACCESS (stmt_info)
      && !REFERENCE_CLASS_P (gimple_get_lhs (stmt))
      && SLP_TREE_LOAD_PERMUTATION (node).exists ())
    si = gsi_for_stmt (SLP_INSTANCE_FIRST_LOAD_STMT (instance));
  else if (is_pattern_stmt_p (stmt_info))
    si = gsi_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
  else
    si = gsi_for_stmt (stmt);

  /* Stores should be inserted just before the last store.  */
  if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
      && REFERENCE_CLASS_P (gimple_get_lhs (stmt)))
    {
      gimple last_store = vect_find_last_store_in_slp_instance (instance);
      if (is_pattern_stmt_p (vinfo_for_stmt (last_store)))
        last_store = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (last_store));
      si = gsi_for_stmt (last_store);
    }

  /* Mark the first element of the reduction chain as reduction to properly
     transform the node.  In the analysis phase only the last element of the
     chain is marked as reduction.  */
  if (GROUP_FIRST_ELEMENT (stmt_info) && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
      && GROUP_FIRST_ELEMENT (stmt_info) == stmt)
    {
      STMT_VINFO_DEF_TYPE (stmt_info) = vect_reduction_def;
      STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
    }

  is_store = vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
  return is_store;
}

/* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
   For loop vectorization this is done in vectorizable_call, but for SLP
   it needs to be deferred until end of vect_schedule_slp, because multiple
   SLP instances may refer to the same scalar stmt.  */

static void
vect_remove_slp_scalar_calls (slp_tree node)
{
  gimple stmt, new_stmt;
  gimple_stmt_iterator gsi;
  int i;
  slp_tree child;
  tree lhs;
  stmt_vec_info stmt_info;

  if (!node)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_remove_slp_scalar_calls (child);

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    {
      if (!is_gimple_call (stmt) || gimple_bb (stmt) == NULL)
        continue;
      stmt_info = vinfo_for_stmt (stmt);
      if (stmt_info == NULL
          || is_pattern_stmt_p (stmt_info)
          || !PURE_SLP_STMT (stmt_info))
        continue;
      lhs = gimple_call_lhs (stmt);
      new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
      set_vinfo_for_stmt (new_stmt, stmt_info);
      set_vinfo_for_stmt (stmt, NULL);
      STMT_VINFO_STMT (stmt_info) = new_stmt;
      gsi = gsi_for_stmt (stmt);
      gsi_replace (&gsi, new_stmt, false);
      SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
    }
}

/* Generate vector code for all SLP instances in the loop/basic block.  */

bool
vect_schedule_slp (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
{
  vec<slp_instance> slp_instances;
  slp_instance instance;
  unsigned int i, vf;
  bool is_store = false;

  if (loop_vinfo)
    {
      slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
      vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
    }
  else
    {
      slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
      vf = 1;
    }

  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      /* Schedule the tree of INSTANCE.  */
      is_store = vect_schedule_slp_instance (SLP_INSTANCE_TREE (instance),
                                             instance, vf);
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "vectorizing stmts using SLP.\n");
    }

  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      slp_tree root = SLP_INSTANCE_TREE (instance);
      gimple store;
      unsigned int j;
      gimple_stmt_iterator gsi;

      /* Remove scalar call stmts.  Do not do this for basic-block
         vectorization as not all uses may be vectorized.
         ???  Why should this be necessary?  DCE should be able to
         remove the stmts itself.
         ???  For BB vectorization we can as well remove scalar
         stmts starting from the SLP tree root if they have no
         uses.  */
      if (loop_vinfo)
        vect_remove_slp_scalar_calls (root);

      for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store)
                  && j < SLP_INSTANCE_GROUP_SIZE (instance); j++)
        {
          if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (store)))
            break;

          if (is_pattern_stmt_p (vinfo_for_stmt (store)))
            store = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (store));
          /* Free the attached stmt_vec_info and remove the stmt.  */
          gsi = gsi_for_stmt (store);
          unlink_stmt_vdef (store);
          gsi_remove (&gsi, true);
          release_defs (store);
          free_stmt_vec_info (store);
        }
    }

  return is_store;
}

/* Vectorize the basic block.  */

void
vect_slp_transform_bb (basic_block bb)
{
  bb_vec_info bb_vinfo = vec_info_for_bb (bb);
  gimple_stmt_iterator si;

  gcc_assert (bb_vinfo);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB\n");

  for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
    {
      gimple stmt = gsi_stmt (si);
      stmt_vec_info stmt_info;

      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location,
                           "------>SLPing statement: ");
          dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
          dump_printf (MSG_NOTE, "\n");
        }

      stmt_info = vinfo_for_stmt (stmt);
      gcc_assert (stmt_info);

      /* Schedule all the SLP instances when the first SLP stmt is reached.  */
      if (STMT_SLP_TYPE (stmt_info))
        {
          vect_schedule_slp (NULL, bb_vinfo);
          break;
        }
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "BASIC BLOCK VECTORIZED\n");

  destroy_bb_vec_info (bb_vinfo);
}