/* SLP - Basic Block Vectorization
   Copyright (C) 2007-2014 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
#include "coretypes.h"
#include "stor-layout.h"
#include "hard-reg-set.h"
#include "basic-block.h"
#include "gimple-pretty-print.h"
#include "tree-ssa-alias.h"
#include "internal-fn.h"
#include "gimple-expr.h"
#include "gimple-iterator.h"
#include "gimple-ssa.h"
#include "tree-phinodes.h"
#include "ssa-iterators.h"
#include "stringpool.h"
#include "tree-ssanames.h"
#include "tree-pass.h"
#include "recog.h"		/* FIXME: for insn_data */
#include "insn-codes.h"
#include "tree-vectorizer.h"
#include "langhooks.h"
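/* Overview: basic block SLP looks for groups of isomorphic scalar statements
   (typically the statements of a grouped store together with the statements
   that compute their operands) and packs them into vector statements without
   requiring an enclosing loop.  The analysis below builds an SLP tree per
   instance, checks operand types and load permutations, and runs a cost model
   before any code is transformed.  */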
/* Extract the location of the basic block in the source code.
   Return the basic block location if found, and UNKNOWN_LOCATION otherwise.  */

source_location
find_bb_location (basic_block bb)
{
  gimple stmt = NULL;
  gimple_stmt_iterator si;

  if (!bb)
    return UNKNOWN_LOCATION;

  for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
    {
      stmt = gsi_stmt (si);
      if (gimple_location (stmt) != UNKNOWN_LOCATION)
	return gimple_location (stmt);
    }

  return UNKNOWN_LOCATION;
}
/* Recursively free the memory allocated for the SLP tree rooted at NODE.  */

static void
vect_free_slp_tree (slp_tree node)
{
  int i;
  slp_tree child;

  if (!node)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_free_slp_tree (child);

  SLP_TREE_CHILDREN (node).release ();
  SLP_TREE_SCALAR_STMTS (node).release ();
  SLP_TREE_VEC_STMTS (node).release ();
  SLP_TREE_LOAD_PERMUTATION (node).release ();

  free (node);
}
/* Free the memory allocated for the SLP instance.  */

void
vect_free_slp_instance (slp_instance instance)
{
  vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
  SLP_INSTANCE_LOADS (instance).release ();
  SLP_INSTANCE_BODY_COST_VEC (instance).release ();
  free (instance);
}
/* Create an SLP node for SCALAR_STMTS.  */

static slp_tree
vect_create_new_slp_node (vec<gimple> scalar_stmts)
{
  slp_tree node;
  gimple stmt = scalar_stmts[0];
  unsigned int nops;

  if (is_gimple_call (stmt))
    nops = gimple_call_num_args (stmt);
  else if (is_gimple_assign (stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
      /* A COND_EXPR carries an extra operand for the embedded comparison.  */
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
	nops++;
    }
  else
    return NULL;

  node = XNEW (struct _slp_tree);
  SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
  SLP_TREE_VEC_STMTS (node).create (0);
  SLP_TREE_CHILDREN (node).create (nops);
  SLP_TREE_LOAD_PERMUTATION (node) = vNULL;

  return node;
}
/* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
   operand.  */
static vec<slp_oprnd_info>
vect_create_oprnd_info (int nops, int group_size)
{
  int i;
  slp_oprnd_info oprnd_info;
  vec<slp_oprnd_info> oprnds_info;

  oprnds_info.create (nops);
  for (i = 0; i < nops; i++)
    {
      oprnd_info = XNEW (struct _slp_oprnd_info);
      oprnd_info->def_stmts.create (group_size);
      oprnd_info->first_dt = vect_uninitialized_def;
      oprnd_info->first_op_type = NULL_TREE;
      oprnd_info->first_pattern = false;
      oprnds_info.quick_push (oprnd_info);
    }

  return oprnds_info;
}
/* Free operands info.  */

static void
vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
{
  int i;
  slp_oprnd_info oprnd_info;

  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      oprnd_info->def_stmts.release ();
      XDELETE (oprnd_info);
    }

  oprnds_info.release ();
}
/* Find the place of the data-ref in STMT in the interleaving chain that starts
   from FIRST_STMT.  Return -1 if the data-ref is not a part of the chain.  */

static int
vect_get_place_in_interleaving_chain (gimple stmt, gimple first_stmt)
{
  gimple next_stmt = first_stmt;
  int result = 0;

  if (first_stmt != GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    return -1;

  while (next_stmt)
    {
      if (next_stmt == stmt)
	return result;
      result++;
      next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
    }

  return -1;
}
215 /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
216 they are of a valid type and that they match the defs of the first stmt of
217 the SLP group (stored in OPRNDS_INFO). If there was a fatal error
218 return -1, if the error could be corrected by swapping operands of the
219 operation return 1, if everything is ok return 0. */
222 vect_get_and_check_slp_defs (loop_vec_info loop_vinfo
, bb_vec_info bb_vinfo
,
223 gimple stmt
, bool first
,
224 vec
<slp_oprnd_info
> *oprnds_info
)
227 unsigned int i
, number_of_oprnds
;
230 enum vect_def_type dt
= vect_uninitialized_def
;
231 struct loop
*loop
= NULL
;
232 bool pattern
= false;
233 slp_oprnd_info oprnd_info
;
234 int first_op_idx
= 1;
235 bool commutative
= false;
236 bool first_op_cond
= false;
239 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
241 if (is_gimple_call (stmt
))
243 number_of_oprnds
= gimple_call_num_args (stmt
);
246 else if (is_gimple_assign (stmt
))
248 enum tree_code code
= gimple_assign_rhs_code (stmt
);
249 number_of_oprnds
= gimple_num_ops (stmt
) - 1;
250 if (gimple_assign_rhs_code (stmt
) == COND_EXPR
)
252 first_op_cond
= true;
257 commutative
= commutative_tree_code (code
);
262 bool swapped
= false;
263 for (i
= 0; i
< number_of_oprnds
; i
++)
268 if (i
== 0 || i
== 1)
269 oprnd
= TREE_OPERAND (gimple_op (stmt
, first_op_idx
),
272 oprnd
= gimple_op (stmt
, first_op_idx
+ i
- 1);
275 oprnd
= gimple_op (stmt
, first_op_idx
+ (swapped
? !i
: i
));
277 oprnd_info
= (*oprnds_info
)[i
];
279 if (!vect_is_simple_use (oprnd
, NULL
, loop_vinfo
, bb_vinfo
, &def_stmt
,
281 || (!def_stmt
&& dt
!= vect_constant_def
))
283 if (dump_enabled_p ())
285 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
286 "Build SLP failed: can't find def for ");
287 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, oprnd
);
288 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
294 /* Check if DEF_STMT is a part of a pattern in LOOP and get the def stmt
295 from the pattern. Check that all the stmts of the node are in the
297 if (def_stmt
&& gimple_bb (def_stmt
)
298 && ((loop
&& flow_bb_inside_loop_p (loop
, gimple_bb (def_stmt
)))
299 || (!loop
&& gimple_bb (def_stmt
) == BB_VINFO_BB (bb_vinfo
)
300 && gimple_code (def_stmt
) != GIMPLE_PHI
))
301 && vinfo_for_stmt (def_stmt
)
302 && STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (def_stmt
))
303 && !STMT_VINFO_RELEVANT (vinfo_for_stmt (def_stmt
))
304 && !STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt
)))
307 if (!first
&& !oprnd_info
->first_pattern
)
317 if (dump_enabled_p ())
319 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
320 "Build SLP failed: some of the stmts"
321 " are in a pattern, and others are not ");
322 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, oprnd
);
323 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
329 def_stmt
= STMT_VINFO_RELATED_STMT (vinfo_for_stmt (def_stmt
));
330 dt
= STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt
));
332 if (dt
== vect_unknown_def_type
)
334 if (dump_enabled_p ())
335 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
336 "Unsupported pattern.\n");
340 switch (gimple_code (def_stmt
))
343 def
= gimple_phi_result (def_stmt
);
347 def
= gimple_assign_lhs (def_stmt
);
351 if (dump_enabled_p ())
352 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
353 "unsupported defining stmt:\n");
360 oprnd_info
->first_dt
= dt
;
361 oprnd_info
->first_pattern
= pattern
;
362 oprnd_info
->first_op_type
= TREE_TYPE (oprnd
);
366 /* Not first stmt of the group, check that the def-stmt/s match
367 the def-stmt/s of the first stmt. Allow different definition
368 types for reduction chains: the first stmt must be a
369 vect_reduction_def (a phi node), and the rest
370 vect_internal_def. */
371 if (((oprnd_info
->first_dt
!= dt
372 && !(oprnd_info
->first_dt
== vect_reduction_def
373 && dt
== vect_internal_def
)
374 && !((oprnd_info
->first_dt
== vect_external_def
375 || oprnd_info
->first_dt
== vect_constant_def
)
376 && (dt
== vect_external_def
377 || dt
== vect_constant_def
)))
378 || !types_compatible_p (oprnd_info
->first_op_type
,
381 /* Try swapping operands if we got a mismatch. */
390 if (dump_enabled_p ())
391 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
392 "Build SLP failed: different types\n");
398 /* Check the types of the definitions. */
401 case vect_constant_def
:
402 case vect_external_def
:
403 case vect_reduction_def
:
406 case vect_internal_def
:
407 oprnd_info
->def_stmts
.quick_push (def_stmt
);
411 /* FORNOW: Not supported. */
412 if (dump_enabled_p ())
414 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
415 "Build SLP failed: illegal type of def ");
416 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, def
);
417 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
429 tree cond
= gimple_assign_rhs1 (stmt
);
430 swap_ssa_operands (stmt
, &TREE_OPERAND (cond
, 0),
431 &TREE_OPERAND (cond
, 1));
432 TREE_SET_CODE (cond
, swap_tree_comparison (TREE_CODE (cond
)));
435 swap_ssa_operands (stmt
, gimple_assign_rhs1_ptr (stmt
),
436 gimple_assign_rhs2_ptr (stmt
));
443 /* Verify if the scalar stmts STMTS are isomorphic, require data
444 permutation or are of unsupported types of operation. Return
445 true if they are, otherwise return false and indicate in *MATCHES
446 which stmts are not isomorphic to the first one. If MATCHES[0]
447 is false then this indicates the comparison could not be
448 carried out or the stmts will never be vectorized by SLP. */
451 vect_build_slp_tree_1 (loop_vec_info loop_vinfo
, bb_vec_info bb_vinfo
,
452 vec
<gimple
> stmts
, unsigned int group_size
,
453 unsigned nops
, unsigned int *max_nunits
,
454 unsigned int vectorization_factor
, bool *matches
)
457 gimple stmt
= stmts
[0];
458 enum tree_code first_stmt_code
= ERROR_MARK
, rhs_code
= ERROR_MARK
;
459 enum tree_code first_cond_code
= ERROR_MARK
;
461 bool need_same_oprnds
= false;
462 tree vectype
, scalar_type
, first_op1
= NULL_TREE
;
465 machine_mode optab_op2_mode
;
466 machine_mode vec_mode
;
467 struct data_reference
*first_dr
;
469 gimple first_load
= NULL
, prev_first_load
= NULL
, old_first_load
= NULL
;
472 /* For every stmt in NODE find its def stmt/s. */
473 FOR_EACH_VEC_ELT (stmts
, i
, stmt
)
477 if (dump_enabled_p ())
479 dump_printf_loc (MSG_NOTE
, vect_location
, "Build SLP for ");
480 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, stmt
, 0);
481 dump_printf (MSG_NOTE
, "\n");
484 /* Fail to vectorize statements marked as unvectorizable. */
485 if (!STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt
)))
487 if (dump_enabled_p ())
489 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
490 "Build SLP failed: unvectorizable statement ");
491 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
492 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
494 /* Fatal mismatch. */
499 lhs
= gimple_get_lhs (stmt
);
500 if (lhs
== NULL_TREE
)
502 if (dump_enabled_p ())
504 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
505 "Build SLP failed: not GIMPLE_ASSIGN nor "
507 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
508 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
510 /* Fatal mismatch. */
515 if (is_gimple_assign (stmt
)
516 && gimple_assign_rhs_code (stmt
) == COND_EXPR
517 && (cond
= gimple_assign_rhs1 (stmt
))
518 && !COMPARISON_CLASS_P (cond
))
520 if (dump_enabled_p ())
522 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
523 "Build SLP failed: condition is not "
525 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
526 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
528 /* Fatal mismatch. */
533 scalar_type
= vect_get_smallest_scalar_type (stmt
, &dummy
, &dummy
);
534 vectype
= get_vectype_for_scalar_type (scalar_type
);
537 if (dump_enabled_p ())
539 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
540 "Build SLP failed: unsupported data-type ");
541 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
543 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
545 /* Fatal mismatch. */
550 /* In case of multiple types we need to detect the smallest type. */
551 if (*max_nunits
< TYPE_VECTOR_SUBPARTS (vectype
))
553 *max_nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
555 vectorization_factor
= *max_nunits
;
558 if (gcall
*call_stmt
= dyn_cast
<gcall
*> (stmt
))
560 rhs_code
= CALL_EXPR
;
561 if (gimple_call_internal_p (call_stmt
)
562 || gimple_call_tail_p (call_stmt
)
563 || gimple_call_noreturn_p (call_stmt
)
564 || !gimple_call_nothrow_p (call_stmt
)
565 || gimple_call_chain (call_stmt
))
567 if (dump_enabled_p ())
569 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
570 "Build SLP failed: unsupported call type ");
571 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
573 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
575 /* Fatal mismatch. */
581 rhs_code
= gimple_assign_rhs_code (stmt
);
583 /* Check the operation. */
586 first_stmt_code
= rhs_code
;
588 /* Shift arguments should be equal in all the packed stmts for a
589 vector shift with scalar shift operand. */
590 if (rhs_code
== LSHIFT_EXPR
|| rhs_code
== RSHIFT_EXPR
591 || rhs_code
== LROTATE_EXPR
592 || rhs_code
== RROTATE_EXPR
)
594 vec_mode
= TYPE_MODE (vectype
);
596 /* First see if we have a vector/vector shift. */
597 optab
= optab_for_tree_code (rhs_code
, vectype
,
601 || optab_handler (optab
, vec_mode
) == CODE_FOR_nothing
)
603 /* No vector/vector shift, try for a vector/scalar shift. */
604 optab
= optab_for_tree_code (rhs_code
, vectype
,
609 if (dump_enabled_p ())
610 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
611 "Build SLP failed: no optab.\n");
612 /* Fatal mismatch. */
616 icode
= (int) optab_handler (optab
, vec_mode
);
617 if (icode
== CODE_FOR_nothing
)
619 if (dump_enabled_p ())
620 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
622 "op not supported by target.\n");
623 /* Fatal mismatch. */
627 optab_op2_mode
= insn_data
[icode
].operand
[2].mode
;
628 if (!VECTOR_MODE_P (optab_op2_mode
))
630 need_same_oprnds
= true;
631 first_op1
= gimple_assign_rhs2 (stmt
);
635 else if (rhs_code
== WIDEN_LSHIFT_EXPR
)
637 need_same_oprnds
= true;
638 first_op1
= gimple_assign_rhs2 (stmt
);
643 if (first_stmt_code
!= rhs_code
644 && (first_stmt_code
!= IMAGPART_EXPR
645 || rhs_code
!= REALPART_EXPR
)
646 && (first_stmt_code
!= REALPART_EXPR
647 || rhs_code
!= IMAGPART_EXPR
)
648 && !(STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt
))
649 && (first_stmt_code
== ARRAY_REF
650 || first_stmt_code
== BIT_FIELD_REF
651 || first_stmt_code
== INDIRECT_REF
652 || first_stmt_code
== COMPONENT_REF
653 || first_stmt_code
== MEM_REF
)))
655 if (dump_enabled_p ())
657 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
658 "Build SLP failed: different operation "
660 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
661 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
668 && !operand_equal_p (first_op1
, gimple_assign_rhs2 (stmt
), 0))
670 if (dump_enabled_p ())
672 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
673 "Build SLP failed: different shift "
675 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
676 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
682 if (rhs_code
== CALL_EXPR
)
684 gimple first_stmt
= stmts
[0];
685 if (gimple_call_num_args (stmt
) != nops
686 || !operand_equal_p (gimple_call_fn (first_stmt
),
687 gimple_call_fn (stmt
), 0)
688 || gimple_call_fntype (first_stmt
)
689 != gimple_call_fntype (stmt
))
691 if (dump_enabled_p ())
693 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
694 "Build SLP failed: different calls in ");
695 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
697 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
705 /* Grouped store or load. */
706 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt
)))
708 if (REFERENCE_CLASS_P (lhs
))
716 unsigned unrolling_factor
717 = least_common_multiple
718 (*max_nunits
, group_size
) / group_size
;
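	      /* Worked example: with V4SI vectors (*max_nunits == 4) and a
		 group of 6 scalar stmts, least_common_multiple (4, 6) / 6 == 2,
		 i.e. two copies of the group are needed to fill whole
		 vectors.  */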
719 /* FORNOW: Check that there is no gap between the loads
720 and no gap between the groups when we need to load
721 multiple groups at once.
722 ??? We should enhance this to only disallow gaps
724 if ((unrolling_factor
> 1
725 && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) == stmt
726 && GROUP_GAP (vinfo_for_stmt (stmt
)) != 0)
727 || (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) != stmt
728 && GROUP_GAP (vinfo_for_stmt (stmt
)) != 1))
730 if (dump_enabled_p ())
732 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
733 "Build SLP failed: grouped "
735 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
737 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
739 /* Fatal mismatch. */
744 /* Check that the size of interleaved loads group is not
745 greater than the SLP group size. */
747 = vectorization_factor
/ TYPE_VECTOR_SUBPARTS (vectype
);
749 && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) == stmt
750 && ((GROUP_SIZE (vinfo_for_stmt (stmt
))
751 - GROUP_GAP (vinfo_for_stmt (stmt
)))
752 > ncopies
* group_size
))
754 if (dump_enabled_p ())
756 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
757 "Build SLP failed: the number "
758 "of interleaved loads is greater than "
759 "the SLP group size ");
760 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
762 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
764 /* Fatal mismatch. */
769 old_first_load
= first_load
;
770 first_load
= GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
));
773 /* Check that there are no loads from different interleaving
774 chains in the same node. */
775 if (prev_first_load
!= first_load
)
777 if (dump_enabled_p ())
779 dump_printf_loc (MSG_MISSED_OPTIMIZATION
,
781 "Build SLP failed: different "
782 "interleaving chains in one node ");
783 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
785 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
792 prev_first_load
= first_load
;
794 /* In some cases a group of loads is just the same load
795 repeated N times. Only analyze its cost once. */
796 if (first_load
== stmt
&& old_first_load
!= first_load
)
798 first_dr
= STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt
));
799 if (vect_supportable_dr_alignment (first_dr
, false)
800 == dr_unaligned_unsupported
)
802 if (dump_enabled_p ())
804 dump_printf_loc (MSG_MISSED_OPTIMIZATION
,
806 "Build SLP failed: unsupported "
808 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
810 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
812 /* Fatal mismatch. */
818 } /* Grouped access. */
821 if (TREE_CODE_CLASS (rhs_code
) == tcc_reference
)
823 /* Not grouped load. */
824 if (dump_enabled_p ())
826 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
827 "Build SLP failed: not grouped load ");
828 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
829 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
832 /* FORNOW: Not grouped loads are not supported. */
833 /* Fatal mismatch. */
838 /* Not memory operation. */
839 if (TREE_CODE_CLASS (rhs_code
) != tcc_binary
840 && TREE_CODE_CLASS (rhs_code
) != tcc_unary
841 && rhs_code
!= COND_EXPR
842 && rhs_code
!= CALL_EXPR
)
844 if (dump_enabled_p ())
846 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
847 "Build SLP failed: operation");
848 dump_printf (MSG_MISSED_OPTIMIZATION
, " unsupported ");
849 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
850 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
852 /* Fatal mismatch. */
857 if (rhs_code
== COND_EXPR
)
859 tree cond_expr
= gimple_assign_rhs1 (stmt
);
862 first_cond_code
= TREE_CODE (cond_expr
);
863 else if (first_cond_code
!= TREE_CODE (cond_expr
))
865 if (dump_enabled_p ())
867 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
868 "Build SLP failed: different"
870 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
872 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
883 for (i
= 0; i
< group_size
; ++i
)
890 /* Recursively build an SLP tree starting from NODE.
891 Fail (and return a value not equal to zero) if def-stmts are not
892 isomorphic, require data permutation or are of unsupported types of
893 operation. Otherwise, return 0.
894 The value returned is the depth in the SLP tree where a mismatch
898 vect_build_slp_tree (loop_vec_info loop_vinfo
, bb_vec_info bb_vinfo
,
899 slp_tree
*node
, unsigned int group_size
,
900 unsigned int *max_nunits
,
901 vec
<slp_tree
> *loads
,
902 unsigned int vectorization_factor
,
903 bool *matches
, unsigned *npermutes
, unsigned *tree_size
,
904 unsigned max_tree_size
)
906 unsigned nops
, i
, this_npermutes
= 0, this_tree_size
= 0;
910 matches
= XALLOCAVEC (bool, group_size
);
912 npermutes
= &this_npermutes
;
916 stmt
= SLP_TREE_SCALAR_STMTS (*node
)[0];
917 if (is_gimple_call (stmt
))
918 nops
= gimple_call_num_args (stmt
);
919 else if (is_gimple_assign (stmt
))
921 nops
= gimple_num_ops (stmt
) - 1;
922 if (gimple_assign_rhs_code (stmt
) == COND_EXPR
)
928 if (!vect_build_slp_tree_1 (loop_vinfo
, bb_vinfo
,
929 SLP_TREE_SCALAR_STMTS (*node
), group_size
, nops
,
930 max_nunits
, vectorization_factor
, matches
))
933 /* If the SLP node is a load, terminate the recursion. */
934 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt
))
935 && DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt
))))
937 loads
->safe_push (*node
);
941 /* Get at the operands, verifying they are compatible. */
942 vec
<slp_oprnd_info
> oprnds_info
= vect_create_oprnd_info (nops
, group_size
);
943 slp_oprnd_info oprnd_info
;
944 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (*node
), i
, stmt
)
946 switch (vect_get_and_check_slp_defs (loop_vinfo
, bb_vinfo
,
947 stmt
, (i
== 0), &oprnds_info
))
953 vect_free_oprnd_info (oprnds_info
);
960 for (i
= 0; i
< group_size
; ++i
)
963 vect_free_oprnd_info (oprnds_info
);
967 stmt
= SLP_TREE_SCALAR_STMTS (*node
)[0];
969 /* Create SLP_TREE nodes for the definition node/s. */
970 FOR_EACH_VEC_ELT (oprnds_info
, i
, oprnd_info
)
973 unsigned old_nloads
= loads
->length ();
974 unsigned old_max_nunits
= *max_nunits
;
976 if (oprnd_info
->first_dt
!= vect_internal_def
)
979 if (++this_tree_size
> max_tree_size
)
981 vect_free_oprnd_info (oprnds_info
);
985 child
= vect_create_new_slp_node (oprnd_info
->def_stmts
);
988 vect_free_oprnd_info (oprnds_info
);
992 bool *matches
= XALLOCAVEC (bool, group_size
);
993 if (vect_build_slp_tree (loop_vinfo
, bb_vinfo
, &child
,
994 group_size
, max_nunits
, loads
,
995 vectorization_factor
, matches
,
996 npermutes
, &this_tree_size
, max_tree_size
))
998 oprnd_info
->def_stmts
= vNULL
;
999 SLP_TREE_CHILDREN (*node
).quick_push (child
);
1003 /* If the SLP build for operand zero failed and operand zero
1004 and one can be commutated try that for the scalar stmts
1005 that failed the match. */
1007 /* A first scalar stmt mismatch signals a fatal mismatch. */
1009 /* ??? For COND_EXPRs we can swap the comparison operands
1010 as well as the arms under some constraints. */
1012 && oprnds_info
[1]->first_dt
== vect_internal_def
1013 && is_gimple_assign (stmt
)
1014 && commutative_tree_code (gimple_assign_rhs_code (stmt
))
	     /* Do so only if the number of unsuccessful permutes was no more
		than a cut-off, as re-trying the recursive match on
		possibly each level of the tree would expose exponential
		behavior.  */
1022 *max_nunits
= old_max_nunits
;
1023 loads
->truncate (old_nloads
);
1024 /* Swap mismatched definition stmts. */
1025 dump_printf_loc (MSG_NOTE
, vect_location
,
1026 "Re-trying with swapped operands of stmts ");
1027 for (unsigned j
= 0; j
< group_size
; ++j
)
1030 gimple tem
= oprnds_info
[0]->def_stmts
[j
];
1031 oprnds_info
[0]->def_stmts
[j
] = oprnds_info
[1]->def_stmts
[j
];
1032 oprnds_info
[1]->def_stmts
[j
] = tem
;
1033 dump_printf (MSG_NOTE
, "%d ", j
);
1035 dump_printf (MSG_NOTE
, "\n");
1036 /* And try again ... */
1037 if (vect_build_slp_tree (loop_vinfo
, bb_vinfo
, &child
,
1038 group_size
, max_nunits
, loads
,
1039 vectorization_factor
,
1040 matches
, npermutes
, &this_tree_size
,
1043 oprnd_info
->def_stmts
= vNULL
;
1044 SLP_TREE_CHILDREN (*node
).quick_push (child
);
1051 oprnd_info
->def_stmts
= vNULL
;
1052 vect_free_slp_tree (child
);
1053 vect_free_oprnd_info (oprnds_info
);
1058 *tree_size
+= this_tree_size
;
1060 vect_free_oprnd_info (oprnds_info
);
/* Dump a slp tree NODE using flags specified in DUMP_KIND.  */

static void
vect_print_slp_tree (int dump_kind, slp_tree node)
{
  int i;
  gimple stmt;
  slp_tree child;

  if (!node)
    return;

  dump_printf (dump_kind, "node ");
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    {
      dump_printf (dump_kind, "\n\tstmt %d ", i);
      dump_gimple_stmt (dump_kind, TDF_SLIM, stmt, 0);
    }
  dump_printf (dump_kind, "\n");

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_print_slp_tree (dump_kind, child);
}
/* Mark the tree rooted at NODE with MARK (PURE_SLP or HYBRID).
   If MARK is HYBRID, it refers to a specific stmt in NODE (the stmt at index
   J).  Otherwise, MARK is PURE_SLP and J is -1, which indicates that all the
   stmts in NODE are to be marked.  */

static void
vect_mark_slp_stmts (slp_tree node, enum slp_vect_type mark, int j)
{
  int i;
  gimple stmt;
  slp_tree child;

  if (!node)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    if (j < 0 || i == j)
      STMT_SLP_TYPE (vinfo_for_stmt (stmt)) = mark;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_mark_slp_stmts (child, mark, j);
}
/* Mark the statements of the tree rooted at NODE as relevant (vect_used).  */

static void
vect_mark_slp_stmts_relevant (slp_tree node)
{
  int i;
  gimple stmt;
  stmt_vec_info stmt_info;
  slp_tree child;

  if (!node)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    {
      stmt_info = vinfo_for_stmt (stmt);
      gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
		  || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
      STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
    }

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_mark_slp_stmts_relevant (child);
}
/* Rearrange the statements of NODE according to PERMUTATION.  */

static void
vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size,
			  vec<unsigned> permutation)
{
  gimple stmt;
  vec<gimple> tmp_stmts;
  unsigned int i;
  slp_tree child;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_slp_rearrange_stmts (child, group_size, permutation);

  gcc_assert (group_size == SLP_TREE_SCALAR_STMTS (node).length ());
  tmp_stmts.create (group_size);
  tmp_stmts.quick_grow_cleared (group_size);

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    tmp_stmts[permutation[i]] = stmt;

  SLP_TREE_SCALAR_STMTS (node).release ();
  SLP_TREE_SCALAR_STMTS (node) = tmp_stmts;
}
1165 /* Check if the required load permutations in the SLP instance
1166 SLP_INSTN are supported. */
1169 vect_supported_load_permutation_p (slp_instance slp_instn
)
1171 unsigned int group_size
= SLP_INSTANCE_GROUP_SIZE (slp_instn
);
1172 unsigned int i
, j
, k
, next
;
1175 gimple stmt
, load
, next_load
, first_load
;
1176 struct data_reference
*dr
;
1178 if (dump_enabled_p ())
1180 dump_printf_loc (MSG_NOTE
, vect_location
, "Load permutation ");
1181 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn
), i
, node
)
1182 if (node
->load_permutation
.exists ())
1183 FOR_EACH_VEC_ELT (node
->load_permutation
, j
, next
)
1184 dump_printf (MSG_NOTE
, "%d ", next
);
1186 for (k
= 0; k
< group_size
; ++k
)
1187 dump_printf (MSG_NOTE
, "%d ", k
);
1188 dump_printf (MSG_NOTE
, "\n");
1191 /* In case of reduction every load permutation is allowed, since the order
1192 of the reduction statements is not important (as opposed to the case of
1193 grouped stores). The only condition we need to check is that all the
1194 load nodes are of the same size and have the same permutation (and then
1195 rearrange all the nodes of the SLP instance according to this
1198 /* Check that all the load nodes are of the same size. */
1199 /* ??? Can't we assert this? */
1200 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn
), i
, node
)
1201 if (SLP_TREE_SCALAR_STMTS (node
).length () != (unsigned) group_size
)
1204 node
= SLP_INSTANCE_TREE (slp_instn
);
1205 stmt
= SLP_TREE_SCALAR_STMTS (node
)[0];
1207 /* Reduction (there are no data-refs in the root).
1208 In reduction chain the order of the loads is important. */
1209 if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt
))
1210 && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)))
1215 /* Compare all the permutation sequences to the first one. We know
1216 that at least one load is permuted. */
1217 node
= SLP_INSTANCE_LOADS (slp_instn
)[0];
1218 if (!node
->load_permutation
.exists ())
1220 for (i
= 1; SLP_INSTANCE_LOADS (slp_instn
).iterate (i
, &load
); ++i
)
1222 if (!load
->load_permutation
.exists ())
1224 FOR_EACH_VEC_ELT (load
->load_permutation
, j
, lidx
)
1225 if (lidx
!= node
->load_permutation
[j
])
1229 /* Check that the loads in the first sequence are different and there
1230 are no gaps between them. */
1231 load_index
= sbitmap_alloc (group_size
);
1232 bitmap_clear (load_index
);
1233 FOR_EACH_VEC_ELT (node
->load_permutation
, i
, lidx
)
1235 if (bitmap_bit_p (load_index
, lidx
))
1237 sbitmap_free (load_index
);
1240 bitmap_set_bit (load_index
, lidx
);
1242 for (i
= 0; i
< group_size
; i
++)
1243 if (!bitmap_bit_p (load_index
, i
))
1245 sbitmap_free (load_index
);
1248 sbitmap_free (load_index
);
1250 /* This permutation is valid for reduction. Since the order of the
1251 statements in the nodes is not important unless they are memory
1252 accesses, we can rearrange the statements in all the nodes
1253 according to the order of the loads. */
1254 vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn
), group_size
,
1255 node
->load_permutation
);
1257 /* We are done, no actual permutations need to be generated. */
1258 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn
), i
, node
)
1259 SLP_TREE_LOAD_PERMUTATION (node
).release ();
  /* In basic block vectorization we allow any subchain of an interleaving
     chain.
     FORNOW: not supported in loop SLP because of realignment complications.  */
1266 if (STMT_VINFO_BB_VINFO (vinfo_for_stmt (stmt
)))
1268 /* Check that for every node in the instance the loads
1270 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn
), i
, node
)
1273 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), j
, load
)
1275 if (j
!= 0 && next_load
!= load
)
1277 next_load
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (load
));
1281 /* Check that the alignment of the first load in every subchain, i.e.,
1282 the first statement in every load node, is supported.
1283 ??? This belongs in alignment checking. */
1284 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn
), i
, node
)
1286 first_load
= SLP_TREE_SCALAR_STMTS (node
)[0];
1287 if (first_load
!= GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_load
)))
1289 dr
= STMT_VINFO_DATA_REF (vinfo_for_stmt (first_load
));
1290 if (vect_supportable_dr_alignment (dr
, false)
1291 == dr_unaligned_unsupported
)
1293 if (dump_enabled_p ())
1295 dump_printf_loc (MSG_MISSED_OPTIMIZATION
,
1297 "unsupported unaligned load ");
1298 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
1300 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
1307 /* We are done, no actual permutations need to be generated. */
1308 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn
), i
, node
)
1309 SLP_TREE_LOAD_PERMUTATION (node
).release ();
1313 /* FORNOW: the only supported permutation is 0..01..1.. of length equal to
1314 GROUP_SIZE and where each sequence of same drs is of GROUP_SIZE length as
1315 well (unless it's reduction). */
1316 if (SLP_INSTANCE_LOADS (slp_instn
).length () != group_size
)
1318 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn
), i
, node
)
1319 if (!node
->load_permutation
.exists ())
1322 load_index
= sbitmap_alloc (group_size
);
1323 bitmap_clear (load_index
);
1324 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn
), i
, node
)
1326 unsigned int lidx
= node
->load_permutation
[0];
1327 if (bitmap_bit_p (load_index
, lidx
))
1329 sbitmap_free (load_index
);
1332 bitmap_set_bit (load_index
, lidx
);
1333 FOR_EACH_VEC_ELT (node
->load_permutation
, j
, k
)
1336 sbitmap_free (load_index
);
1340 for (i
= 0; i
< group_size
; i
++)
1341 if (!bitmap_bit_p (load_index
, i
))
1343 sbitmap_free (load_index
);
1346 sbitmap_free (load_index
);
1348 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn
), i
, node
)
1349 if (node
->load_permutation
.exists ()
1350 && !vect_transform_slp_perm_load
1352 SLP_INSTANCE_UNROLLING_FACTOR (slp_instn
), slp_instn
, true))
/* Find the first load in the loop that belongs to INSTANCE.
   When loads are in several SLP nodes, there can be a case in which the first
   load does not appear in the first SLP node to be transformed, causing
   incorrect order of statements.  Since we generate all the loads together,
   they must be inserted before the first load of the SLP instance and not
   before the first load of the first node of the instance.  */

static gimple
vect_find_first_load_in_slp_instance (slp_instance instance)
{
  int i, j;
  slp_tree load_node;
  gimple first_load = NULL, load;

  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
    FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load)
      first_load = get_earlier_stmt (load, first_load);

  return first_load;
}
/* Find the last store in SLP INSTANCE.  */

static gimple
vect_find_last_store_in_slp_instance (slp_instance instance)
{
  int i;
  slp_tree node;
  gimple last_store = NULL, store;

  node = SLP_INSTANCE_TREE (instance);
  for (i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &store); i++)
    last_store = get_later_stmt (store, last_store);

  return last_store;
}
1396 /* Compute the cost for the SLP node NODE in the SLP instance INSTANCE. */
1399 vect_analyze_slp_cost_1 (loop_vec_info loop_vinfo
, bb_vec_info bb_vinfo
,
1400 slp_instance instance
, slp_tree node
,
1401 stmt_vector_for_cost
*prologue_cost_vec
,
1402 unsigned ncopies_for_cost
)
1404 stmt_vector_for_cost
*body_cost_vec
= &SLP_INSTANCE_BODY_COST_VEC (instance
);
1409 stmt_vec_info stmt_info
;
1411 unsigned group_size
= SLP_INSTANCE_GROUP_SIZE (instance
);
1413 /* Recurse down the SLP tree. */
1414 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
1415 vect_analyze_slp_cost_1 (loop_vinfo
, bb_vinfo
,
1416 instance
, child
, prologue_cost_vec
,
1419 /* Look at the first scalar stmt to determine the cost. */
1420 stmt
= SLP_TREE_SCALAR_STMTS (node
)[0];
1421 stmt_info
= vinfo_for_stmt (stmt
);
1422 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1424 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info
)))
1425 vect_model_store_cost (stmt_info
, ncopies_for_cost
, false,
1426 vect_uninitialized_def
,
1427 node
, prologue_cost_vec
, body_cost_vec
);
1431 gcc_checking_assert (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info
)));
1432 vect_model_load_cost (stmt_info
, ncopies_for_cost
, false,
1433 node
, prologue_cost_vec
, body_cost_vec
);
1434 /* If the load is permuted record the cost for the permutation.
1435 ??? Loads from multiple chains are let through here only
1436 for a single special case involving complex numbers where
1437 in the end no permutation is necessary. */
1438 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, s
)
1439 if ((STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo_for_stmt (s
))
1440 == STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info
))
1441 && vect_get_place_in_interleaving_chain
1442 (s
, STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info
)) != i
)
1444 record_stmt_cost (body_cost_vec
, group_size
, vec_perm
,
1445 stmt_info
, 0, vect_body
);
1451 record_stmt_cost (body_cost_vec
, ncopies_for_cost
, vector_stmt
,
1452 stmt_info
, 0, vect_body
);
1454 /* Scan operands and account for prologue cost of constants/externals.
1455 ??? This over-estimates cost for multiple uses and should be
1457 lhs
= gimple_get_lhs (stmt
);
1458 for (i
= 0; i
< gimple_num_ops (stmt
); ++i
)
1460 tree def
, op
= gimple_op (stmt
, i
);
1462 enum vect_def_type dt
;
1463 if (!op
|| op
== lhs
)
1465 if (vect_is_simple_use (op
, NULL
, loop_vinfo
, bb_vinfo
,
1466 &def_stmt
, &def
, &dt
)
1467 && (dt
== vect_constant_def
|| dt
== vect_external_def
))
1468 record_stmt_cost (prologue_cost_vec
, 1, vector_stmt
,
1469 stmt_info
, 0, vect_prologue
);
1473 /* Compute the cost for the SLP instance INSTANCE. */
1476 vect_analyze_slp_cost (loop_vec_info loop_vinfo
, bb_vec_info bb_vinfo
,
1477 slp_instance instance
, unsigned nunits
)
1479 stmt_vector_for_cost body_cost_vec
, prologue_cost_vec
;
1480 unsigned ncopies_for_cost
;
1481 stmt_info_for_cost
*si
;
  /* Calculate the number of vector stmts to create based on the unrolling
     factor (number of vectors is 1 if NUNITS >= GROUP_SIZE, and is
     GROUP_SIZE / NUNITS otherwise).  */
1487 unsigned group_size
= SLP_INSTANCE_GROUP_SIZE (instance
);
1488 ncopies_for_cost
= least_common_multiple (nunits
, group_size
) / nunits
;
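  /* For example, nunits == 4 and group_size == 6 give
     least_common_multiple (4, 6) / 4 == 3: the six scalars are unrolled to
     twelve and packed into three four-element vectors.  */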
1490 prologue_cost_vec
.create (10);
1491 body_cost_vec
.create (10);
1492 SLP_INSTANCE_BODY_COST_VEC (instance
) = body_cost_vec
;
1493 vect_analyze_slp_cost_1 (loop_vinfo
, bb_vinfo
,
1494 instance
, SLP_INSTANCE_TREE (instance
),
1495 &prologue_cost_vec
, ncopies_for_cost
);
1497 /* Record the prologue costs, which were delayed until we were
1498 sure that SLP was successful. Unlike the body costs, we know
1499 the final values now regardless of the loop vectorization factor. */
1500 void *data
= (loop_vinfo
? LOOP_VINFO_TARGET_COST_DATA (loop_vinfo
)
1501 : BB_VINFO_TARGET_COST_DATA (bb_vinfo
));
1502 FOR_EACH_VEC_ELT (prologue_cost_vec
, i
, si
)
1504 struct _stmt_vec_info
*stmt_info
1505 = si
->stmt
? vinfo_for_stmt (si
->stmt
) : NULL
;
1506 (void) add_stmt_cost (data
, si
->count
, si
->kind
, stmt_info
,
1507 si
->misalign
, vect_prologue
);
1510 prologue_cost_vec
.release ();
1513 /* Analyze an SLP instance starting from a group of grouped stores. Call
1514 vect_build_slp_tree to build a tree of packed stmts if possible.
1515 Return FALSE if it's impossible to SLP any stmt in the loop. */
1518 vect_analyze_slp_instance (loop_vec_info loop_vinfo
, bb_vec_info bb_vinfo
,
1519 gimple stmt
, unsigned max_tree_size
)
1521 slp_instance new_instance
;
1523 unsigned int group_size
= GROUP_SIZE (vinfo_for_stmt (stmt
));
1524 unsigned int unrolling_factor
= 1, nunits
;
1525 tree vectype
, scalar_type
= NULL_TREE
;
1527 unsigned int vectorization_factor
= 0;
1529 unsigned int max_nunits
= 0;
1530 vec
<slp_tree
> loads
;
1531 struct data_reference
*dr
= STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt
));
1532 vec
<gimple
> scalar_stmts
;
1534 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)))
1538 scalar_type
= TREE_TYPE (DR_REF (dr
));
1539 vectype
= get_vectype_for_scalar_type (scalar_type
);
1543 gcc_assert (loop_vinfo
);
1544 vectype
= STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt
));
1547 group_size
= GROUP_SIZE (vinfo_for_stmt (stmt
));
1551 gcc_assert (loop_vinfo
);
1552 vectype
= STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt
));
1553 group_size
= LOOP_VINFO_REDUCTIONS (loop_vinfo
).length ();
1558 if (dump_enabled_p ())
1560 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1561 "Build SLP failed: unsupported data-type ");
1562 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, scalar_type
);
1563 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
1569 nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
1571 vectorization_factor
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
1573 vectorization_factor
= nunits
;
1575 /* Calculate the unrolling factor. */
1576 unrolling_factor
= least_common_multiple (nunits
, group_size
) / group_size
;
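  /* Basic-block SLP has no enclosing loop to unroll, so an unrolling factor
     other than one is only acceptable in loop-aware SLP; this is checked
     right below.  */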
1577 if (unrolling_factor
!= 1 && !loop_vinfo
)
1579 if (dump_enabled_p ())
1580 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1581 "Build SLP failed: unrolling required in basic"
1587 /* Create a node (a root of the SLP tree) for the packed grouped stores. */
1588 scalar_stmts
.create (group_size
);
1590 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)))
1592 /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS. */
1595 if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next
))
1596 && STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next
)))
1597 scalar_stmts
.safe_push (
1598 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next
)));
1600 scalar_stmts
.safe_push (next
);
1601 next
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (next
));
1606 /* Collect reduction statements. */
1607 vec
<gimple
> reductions
= LOOP_VINFO_REDUCTIONS (loop_vinfo
);
1608 for (i
= 0; reductions
.iterate (i
, &next
); i
++)
1609 scalar_stmts
.safe_push (next
);
1612 node
= vect_create_new_slp_node (scalar_stmts
);
1614 loads
.create (group_size
);
1616 /* Build the tree for the SLP instance. */
1617 if (vect_build_slp_tree (loop_vinfo
, bb_vinfo
, &node
, group_size
,
1618 &max_nunits
, &loads
,
1619 vectorization_factor
, NULL
, NULL
, NULL
,
1622 /* Calculate the unrolling factor based on the smallest type. */
1623 if (max_nunits
> nunits
)
1624 unrolling_factor
= least_common_multiple (max_nunits
, group_size
)
1627 if (unrolling_factor
!= 1 && !loop_vinfo
)
1629 if (dump_enabled_p ())
1630 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1631 "Build SLP failed: unrolling required in basic"
1633 vect_free_slp_tree (node
);
1638 /* Create a new SLP instance. */
1639 new_instance
= XNEW (struct _slp_instance
);
1640 SLP_INSTANCE_TREE (new_instance
) = node
;
1641 SLP_INSTANCE_GROUP_SIZE (new_instance
) = group_size
;
1642 SLP_INSTANCE_UNROLLING_FACTOR (new_instance
) = unrolling_factor
;
1643 SLP_INSTANCE_BODY_COST_VEC (new_instance
) = vNULL
;
1644 SLP_INSTANCE_LOADS (new_instance
) = loads
;
1645 SLP_INSTANCE_FIRST_LOAD_STMT (new_instance
) = NULL
;
1647 /* Compute the load permutation. */
1649 bool loads_permuted
= false;
1650 FOR_EACH_VEC_ELT (loads
, i
, load_node
)
1652 vec
<unsigned> load_permutation
;
1654 gimple load
, first_stmt
;
1655 bool this_load_permuted
= false;
1656 load_permutation
.create (group_size
);
1657 first_stmt
= GROUP_FIRST_ELEMENT
1658 (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (load_node
)[0]));
1659 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node
), j
, load
)
1662 = vect_get_place_in_interleaving_chain (load
, first_stmt
);
1663 gcc_assert (load_place
!= -1);
1664 if (load_place
!= j
)
1665 this_load_permuted
= true;
1666 load_permutation
.safe_push (load_place
);
1668 if (!this_load_permuted
)
1670 load_permutation
.release ();
1673 SLP_TREE_LOAD_PERMUTATION (load_node
) = load_permutation
;
1674 loads_permuted
= true;
1679 if (!vect_supported_load_permutation_p (new_instance
))
1681 if (dump_enabled_p ())
1683 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1684 "Build SLP failed: unsupported load "
1686 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
1687 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
1689 vect_free_slp_instance (new_instance
);
1693 SLP_INSTANCE_FIRST_LOAD_STMT (new_instance
)
1694 = vect_find_first_load_in_slp_instance (new_instance
);
1697 /* Compute the costs of this SLP instance. */
1698 vect_analyze_slp_cost (loop_vinfo
, bb_vinfo
,
1699 new_instance
, TYPE_VECTOR_SUBPARTS (vectype
));
1702 LOOP_VINFO_SLP_INSTANCES (loop_vinfo
).safe_push (new_instance
);
1704 BB_VINFO_SLP_INSTANCES (bb_vinfo
).safe_push (new_instance
);
1706 if (dump_enabled_p ())
1707 vect_print_slp_tree (MSG_NOTE
, node
);
1712 /* Failed to SLP. */
1713 /* Free the allocated memory. */
1714 vect_free_slp_tree (node
);
/* Check if there are stmts in the loop that can be vectorized using SLP.
   Build SLP trees of packed scalar stmts if SLP is possible.  */
1725 vect_analyze_slp (loop_vec_info loop_vinfo
, bb_vec_info bb_vinfo
,
1726 unsigned max_tree_size
)
1729 vec
<gimple
> grouped_stores
;
1730 vec
<gimple
> reductions
= vNULL
;
1731 vec
<gimple
> reduc_chains
= vNULL
;
1732 gimple first_element
;
1735 if (dump_enabled_p ())
1736 dump_printf_loc (MSG_NOTE
, vect_location
, "=== vect_analyze_slp ===\n");
1740 grouped_stores
= LOOP_VINFO_GROUPED_STORES (loop_vinfo
);
1741 reduc_chains
= LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo
);
1742 reductions
= LOOP_VINFO_REDUCTIONS (loop_vinfo
);
1745 grouped_stores
= BB_VINFO_GROUPED_STORES (bb_vinfo
);
1747 /* Find SLP sequences starting from groups of grouped stores. */
1748 FOR_EACH_VEC_ELT (grouped_stores
, i
, first_element
)
1749 if (vect_analyze_slp_instance (loop_vinfo
, bb_vinfo
, first_element
,
1753 if (bb_vinfo
&& !ok
)
1755 if (dump_enabled_p ())
1756 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1757 "Failed to SLP the basic block.\n");
1763 && LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo
).length () > 0)
1765 /* Find SLP sequences starting from reduction chains. */
1766 FOR_EACH_VEC_ELT (reduc_chains
, i
, first_element
)
1767 if (vect_analyze_slp_instance (loop_vinfo
, bb_vinfo
, first_element
,
1773 /* Don't try to vectorize SLP reductions if reduction chain was
1778 /* Find SLP sequences starting from groups of reductions. */
1779 if (loop_vinfo
&& LOOP_VINFO_REDUCTIONS (loop_vinfo
).length () > 1
1780 && vect_analyze_slp_instance (loop_vinfo
, bb_vinfo
, reductions
[0],
/* For each possible SLP instance decide whether to SLP it and calculate overall
   unrolling factor needed to SLP the loop.  Return TRUE if decided to SLP at
   least one instance.  */

bool
vect_make_slp_decision (loop_vec_info loop_vinfo)
{
  unsigned int i, unrolling_factor = 1;
  vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
  slp_instance instance;
  int decided_to_slp = 0;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "=== vect_make_slp_decision ==="
		     "\n");

  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      /* FORNOW: SLP if you can.  */
      if (unrolling_factor < SLP_INSTANCE_UNROLLING_FACTOR (instance))
	unrolling_factor = SLP_INSTANCE_UNROLLING_FACTOR (instance);

      /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts.  Later we
	 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
	 loop-based vectorization.  Such stmts will be marked as HYBRID.  */
      vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1);
      decided_to_slp++;
    }

  LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;

  if (decided_to_slp && dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "Decided to SLP %d instances. Unrolling factor %d\n",
		     decided_to_slp, unrolling_factor);

  return (decided_to_slp > 0);
}
1828 /* Find stmts that must be both vectorized and SLPed (since they feed stmts that
1829 can't be SLPed) in the tree rooted at NODE. Mark such stmts as HYBRID. */
1832 vect_detect_hybrid_slp_stmts (slp_tree node
)
1835 vec
<gimple
> stmts
= SLP_TREE_SCALAR_STMTS (node
);
1836 gimple stmt
= stmts
[0];
1837 imm_use_iterator imm_iter
;
1839 stmt_vec_info stmt_vinfo
= vinfo_for_stmt (stmt
);
1841 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_vinfo
);
1842 struct loop
*loop
= NULL
;
1843 bb_vec_info bb_vinfo
= STMT_VINFO_BB_VINFO (stmt_vinfo
);
1844 basic_block bb
= NULL
;
1850 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1852 bb
= BB_VINFO_BB (bb_vinfo
);
1854 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt
)
1855 if (PURE_SLP_STMT (vinfo_for_stmt (stmt
))
1856 && TREE_CODE (gimple_op (stmt
, 0)) == SSA_NAME
)
1857 FOR_EACH_IMM_USE_STMT (use_stmt
, imm_iter
, gimple_op (stmt
, 0))
1858 if (gimple_bb (use_stmt
)
1859 && ((loop
&& flow_bb_inside_loop_p (loop
, gimple_bb (use_stmt
)))
1860 || bb
== gimple_bb (use_stmt
))
1861 && (stmt_vinfo
= vinfo_for_stmt (use_stmt
))
1862 && !STMT_SLP_TYPE (stmt_vinfo
)
1863 && (STMT_VINFO_RELEVANT (stmt_vinfo
)
1864 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_vinfo
))
1865 || (STMT_VINFO_IN_PATTERN_P (stmt_vinfo
)
1866 && STMT_VINFO_RELATED_STMT (stmt_vinfo
)
1867 && !STMT_SLP_TYPE (vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_vinfo
)))))
1868 && !(gimple_code (use_stmt
) == GIMPLE_PHI
1869 && STMT_VINFO_DEF_TYPE (stmt_vinfo
)
1870 == vect_reduction_def
))
1871 vect_mark_slp_stmts (node
, hybrid
, i
);
1873 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
1874 vect_detect_hybrid_slp_stmts (child
);
/* Find stmts that must be both vectorized and SLPed.  */

void
vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
{
  unsigned int i;
  vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
  slp_instance instance;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "=== vect_detect_hybrid_slp ==="
		     "\n");

  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    vect_detect_hybrid_slp_stmts (SLP_INSTANCE_TREE (instance));
}
/* Create and initialize a new bb_vec_info struct for BB, as well as
   stmt_vec_info structs for all the stmts in it.  */

static bb_vec_info
new_bb_vec_info (basic_block bb)
{
  bb_vec_info res = NULL;
  gimple_stmt_iterator gsi;

  res = (bb_vec_info) xcalloc (1, sizeof (struct _bb_vec_info));
  BB_VINFO_BB (res) = bb;

  for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
    {
      gimple stmt = gsi_stmt (gsi);
      gimple_set_uid (stmt, 0);
      set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, NULL, res));
    }

  BB_VINFO_GROUPED_STORES (res).create (10);
  BB_VINFO_SLP_INSTANCES (res).create (2);
  BB_VINFO_TARGET_COST_DATA (res) = init_cost (NULL);

  bb->aux = res;
  return res;
}
/* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
   stmts in the basic block.  */

static void
destroy_bb_vec_info (bb_vec_info bb_vinfo)
{
  vec<slp_instance> slp_instances;
  slp_instance instance;
  basic_block bb;
  gimple_stmt_iterator si;
  unsigned i;

  if (!bb_vinfo)
    return;

  bb = BB_VINFO_BB (bb_vinfo);

  for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
    {
      gimple stmt = gsi_stmt (si);
      stmt_vec_info stmt_info = vinfo_for_stmt (stmt);

      if (stmt_info)
	/* Free stmt_vec_info.  */
	free_stmt_vec_info (stmt);
    }

  vect_destroy_datarefs (NULL, bb_vinfo);
  free_dependence_relations (BB_VINFO_DDRS (bb_vinfo));
  BB_VINFO_GROUPED_STORES (bb_vinfo).release ();
  slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    vect_free_slp_instance (instance);
  BB_VINFO_SLP_INSTANCES (bb_vinfo).release ();
  destroy_cost_data (BB_VINFO_TARGET_COST_DATA (bb_vinfo));

  free (bb_vinfo);
  bb->aux = NULL;
}
/* Analyze statements contained in SLP tree node after recursively analyzing
   the subtree.  Return TRUE if the operations are supported.  */

static bool
vect_slp_analyze_node_operations (bb_vec_info bb_vinfo, slp_tree node)
{
  bool dummy;
  int i;
  gimple stmt;
  slp_tree child;

  if (!node)
    return true;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (!vect_slp_analyze_node_operations (bb_vinfo, child))
      return false;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    {
      stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
      gcc_assert (stmt_info);
      gcc_assert (PURE_SLP_STMT (stmt_info));

      if (!vect_analyze_stmt (stmt, &dummy, node))
	return false;
    }

  return true;
}
/* Analyze statements in SLP instances of the basic block.  Return TRUE if the
   operations are supported.  */

static bool
vect_slp_analyze_operations (bb_vec_info bb_vinfo)
{
  vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
  slp_instance instance;
  int i;

  for (i = 0; slp_instances.iterate (i, &instance); )
    {
      if (!vect_slp_analyze_node_operations (bb_vinfo,
					     SLP_INSTANCE_TREE (instance)))
	{
	  vect_free_slp_instance (instance);
	  slp_instances.ordered_remove (i);
	}
      else
	i++;
    }

  if (!slp_instances.length ())
    return false;

  return true;
}
2025 /* Compute the scalar cost of the SLP node NODE and its children
2026 and return it. Do not account defs that are marked in LIFE and
2027 update LIFE according to uses of NODE. */
2030 vect_bb_slp_scalar_cost (basic_block bb
,
2031 slp_tree node
, vec
<bool, va_heap
> *life
)
2033 unsigned scalar_cost
= 0;
2038 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt
)
2041 ssa_op_iter op_iter
;
2042 def_operand_p def_p
;
2043 stmt_vec_info stmt_info
;
      /* If there is a non-vectorized use of the defs then the scalar
	 stmt is kept live in which case we do not account it or any
	 required defs in the SLP children in the scalar cost.  This
	 way we make the vectorization more costly when compared to
	 the scalar cost.  */
2053 FOR_EACH_SSA_DEF_OPERAND (def_p
, stmt
, op_iter
, SSA_OP_DEF
)
2055 imm_use_iterator use_iter
;
2057 FOR_EACH_IMM_USE_STMT (use_stmt
, use_iter
, DEF_FROM_PTR (def_p
))
2058 if (!is_gimple_debug (use_stmt
)
2059 && (gimple_code (use_stmt
) == GIMPLE_PHI
2060 || gimple_bb (use_stmt
) != bb
2061 || !STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (use_stmt
))))
2064 BREAK_FROM_IMM_USE_STMT (use_iter
);
2070 stmt_info
= vinfo_for_stmt (stmt
);
2071 if (STMT_VINFO_DATA_REF (stmt_info
))
2073 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info
)))
2074 stmt_cost
= vect_get_stmt_cost (scalar_load
);
2076 stmt_cost
= vect_get_stmt_cost (scalar_store
);
2079 stmt_cost
= vect_get_stmt_cost (scalar_stmt
);
2081 scalar_cost
+= stmt_cost
;
2084 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
2085 scalar_cost
+= vect_bb_slp_scalar_cost (bb
, child
, life
);
2090 /* Check if vectorization of the basic block is profitable. */
2093 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo
)
2095 vec
<slp_instance
> slp_instances
= BB_VINFO_SLP_INSTANCES (bb_vinfo
);
2096 slp_instance instance
;
2098 unsigned int vec_inside_cost
= 0, vec_outside_cost
= 0, scalar_cost
= 0;
2099 unsigned int vec_prologue_cost
= 0, vec_epilogue_cost
= 0;
2100 void *target_cost_data
= BB_VINFO_TARGET_COST_DATA (bb_vinfo
);
2101 stmt_vec_info stmt_info
= NULL
;
2102 stmt_vector_for_cost body_cost_vec
;
2103 stmt_info_for_cost
*ci
;
2105 /* Calculate vector costs. */
2106 FOR_EACH_VEC_ELT (slp_instances
, i
, instance
)
2108 body_cost_vec
= SLP_INSTANCE_BODY_COST_VEC (instance
);
2110 FOR_EACH_VEC_ELT (body_cost_vec
, j
, ci
)
2112 stmt_info
= ci
->stmt
? vinfo_for_stmt (ci
->stmt
) : NULL
;
2113 (void) add_stmt_cost (target_cost_data
, ci
->count
, ci
->kind
,
2114 stmt_info
, ci
->misalign
, vect_body
);
2118 /* Calculate scalar cost. */
2119 FOR_EACH_VEC_ELT (slp_instances
, i
, instance
)
2121 auto_vec
<bool, 20> life
;
2122 life
.safe_grow_cleared (SLP_INSTANCE_GROUP_SIZE (instance
));
2123 scalar_cost
+= vect_bb_slp_scalar_cost (BB_VINFO_BB (bb_vinfo
),
2124 SLP_INSTANCE_TREE (instance
),
2128 /* Complete the target-specific cost calculation. */
2129 finish_cost (BB_VINFO_TARGET_COST_DATA (bb_vinfo
), &vec_prologue_cost
,
2130 &vec_inside_cost
, &vec_epilogue_cost
);
2132 vec_outside_cost
= vec_prologue_cost
+ vec_epilogue_cost
;
2134 if (dump_enabled_p ())
2136 dump_printf_loc (MSG_NOTE
, vect_location
, "Cost model analysis: \n");
2137 dump_printf (MSG_NOTE
, " Vector inside of basic block cost: %d\n",
2139 dump_printf (MSG_NOTE
, " Vector prologue cost: %d\n", vec_prologue_cost
);
2140 dump_printf (MSG_NOTE
, " Vector epilogue cost: %d\n", vec_epilogue_cost
);
2141 dump_printf (MSG_NOTE
, " Scalar cost of basic block: %d\n", scalar_cost
);
  /* Vectorization is profitable if its cost is less than the cost of scalar
     version.  */
2146 if (vec_outside_cost
+ vec_inside_cost
>= scalar_cost
)
/* Check if the basic block can be vectorized.  */

static bb_vec_info
vect_slp_analyze_bb_1 (basic_block bb)
{
  bb_vec_info bb_vinfo;
  vec<slp_instance> slp_instances;
  slp_instance instance;
  int i;
  int min_vf = 2;
  unsigned n_stmts = 0;

  bb_vinfo = new_bb_vec_info (bb);
  if (!bb_vinfo)
    return NULL;

  if (!vect_analyze_data_refs (NULL, bb_vinfo, &min_vf, &n_stmts))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: unhandled data-ref in basic "
                         "block.\n");

      destroy_bb_vec_info (bb_vinfo);
      return NULL;
    }

  if (BB_VINFO_DATAREFS (bb_vinfo).length () < 2)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: not enough data-refs in "
                         "basic block.\n");

      destroy_bb_vec_info (bb_vinfo);
      return NULL;
    }

  if (!vect_analyze_data_ref_accesses (NULL, bb_vinfo))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: unhandled data access in "
                         "basic block.\n");

      destroy_bb_vec_info (bb_vinfo);
      return NULL;
    }

  vect_pattern_recog (NULL, bb_vinfo);

  if (!vect_analyze_data_refs_alignment (NULL, bb_vinfo))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: bad data alignment in basic "
                         "block.\n");

      destroy_bb_vec_info (bb_vinfo);
      return NULL;
    }

  /* Check the SLP opportunities in the basic block, analyze and build SLP
     trees.  */
  if (!vect_analyze_slp (NULL, bb_vinfo, n_stmts))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: failed to find SLP opportunities "
                         "in basic block.\n");

      destroy_bb_vec_info (bb_vinfo);
      return NULL;
    }

  slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);

  /* Mark all the statements that we want to vectorize as pure SLP and
     relevant.  */
  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1);
      vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
    }

  /* Mark all the statements that we do not want to vectorize.  */
  for (gimple_stmt_iterator gsi = gsi_start_bb (BB_VINFO_BB (bb_vinfo));
       !gsi_end_p (gsi); gsi_next (&gsi))
    {
      stmt_vec_info vinfo = vinfo_for_stmt (gsi_stmt (gsi));
      if (STMT_SLP_TYPE (vinfo) != pure_slp)
        STMT_VINFO_VECTORIZABLE (vinfo) = false;
    }

  /* Analyze dependences.  At this point all stmts not participating in
     vectorization have to be marked.  Dependence analysis assumes
     that we either vectorize all SLP instances or none at all.  */
  if (!vect_slp_analyze_data_ref_dependences (bb_vinfo))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: unhandled data dependence "
                         "in basic block.\n");

      destroy_bb_vec_info (bb_vinfo);
      return NULL;
    }

  if (!vect_verify_datarefs_alignment (NULL, bb_vinfo))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: unsupported alignment in basic "
                         "block.\n");
      destroy_bb_vec_info (bb_vinfo);
      return NULL;
    }

  if (!vect_slp_analyze_operations (bb_vinfo))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: bad operation in basic block.\n");

      destroy_bb_vec_info (bb_vinfo);
      return NULL;
    }

  /* Cost model: check if the vectorization is worthwhile.  */
  if (!unlimited_cost_model (NULL)
      && !vect_bb_vectorization_profitable_p (bb_vinfo))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: vectorization is not "
                         "profitable.\n");

      destroy_bb_vec_info (bb_vinfo);
      return NULL;
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "Basic block will be vectorized using SLP\n");

  return bb_vinfo;
}
bb_vec_info
vect_slp_analyze_bb (basic_block bb)
{
  bb_vec_info bb_vinfo;
  int insns = 0;
  gimple_stmt_iterator gsi;
  unsigned int vector_sizes;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "===vect_slp_analyze_bb===\n");

  for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
    {
      gimple stmt = gsi_stmt (gsi);
      if (!is_gimple_debug (stmt)
          && !gimple_nop_p (stmt)
          && gimple_code (stmt) != GIMPLE_LABEL)
        insns++;
    }

  if (insns > PARAM_VALUE (PARAM_SLP_MAX_INSNS_IN_BB))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: too many instructions in "
                         "basic block.\n");

      return NULL;
    }

  /* Autodetect first vector size we try.  */
  current_vector_size = 0;
  vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();

  while (1)
    {
      bb_vinfo = vect_slp_analyze_bb_1 (bb);
      if (bb_vinfo)
        return bb_vinfo;

      destroy_bb_vec_info (bb_vinfo);

      vector_sizes &= ~current_vector_size;
      if (vector_sizes == 0
          || current_vector_size == 0)
        return NULL;

      /* Try the next biggest vector size.  */
      current_vector_size = 1 << floor_log2 (vector_sizes);
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "***** Re-trying analysis with "
                         "vector size %d\n", current_vector_size);
    }
}
/* SLP costs are calculated according to SLP instance unrolling factor (i.e.,
   the number of created vector stmts depends on the unrolling factor).
   However, the actual number of vector stmts for every SLP node depends on
   VF which is set later in vect_analyze_operations ().  Hence, SLP costs
   should be updated.  In this function we assume that the inside costs
   calculated in vect_model_xxx_cost are linear in ncopies.  */
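/* For example (illustrative numbers only): with VF == 8 and an instance
   whose SLP unrolling factor is 4, NCOPIES below is 2, so every cost entry
   recorded for the instance is added to the target cost data with its
   count doubled.  */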
void
vect_update_slp_costs_according_to_vf (loop_vec_info loop_vinfo)
{
  unsigned int i, j, vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
  slp_instance instance;
  stmt_vector_for_cost body_cost_vec;
  stmt_info_for_cost *si;
  void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "=== vect_update_slp_costs_according_to_vf ===\n");

  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      /* We assume that costs are linear in ncopies.  */
      int ncopies = vf / SLP_INSTANCE_UNROLLING_FACTOR (instance);

      /* Record the instance's instructions in the target cost model.
         This was delayed until here because the count of instructions
         isn't known beforehand.  */
      body_cost_vec = SLP_INSTANCE_BODY_COST_VEC (instance);

      FOR_EACH_VEC_ELT (body_cost_vec, j, si)
        (void) add_stmt_cost (data, si->count * ncopies, si->kind,
                              vinfo_for_stmt (si->stmt), si->misalign,
                              vect_body);
    }
}
/* For constant and loop invariant defs of SLP_NODE this function returns
   (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts.
   OP_NUM determines if we gather defs for operand 0 or operand 1 of the RHS of
   scalar stmts.  NUMBER_OF_VECTORS is the number of vector defs to create.
   REDUC_INDEX is the index of the reduction operand in the statements, unless
   it is -1.  */

static void
vect_get_constant_vectors (tree op, slp_tree slp_node,
                           vec<tree> *vec_oprnds,
                           unsigned int op_num, unsigned int number_of_vectors,
                           int reduc_index)
{
  vec<gimple> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
  gimple stmt = stmts[0];
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
  unsigned nunits;
  tree vec_cst;
  tree *elts;
  unsigned j, number_of_places_left_in_vector;
  tree vector_type;
  tree vop;
  int group_size = stmts.length ();
  unsigned int vec_num, i;
  unsigned number_of_copies = 1;
  vec<tree> voprnds;
  voprnds.create (number_of_vectors);
  bool constant_p, is_store;
  tree neutral_op = NULL;
  enum tree_code code = gimple_expr_code (stmt);
  gimple def_stmt;
  struct loop *loop;
  gimple_seq ctor_seq = NULL;

  if (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def
      && reduc_index != -1)
    {
      op_num = reduc_index - 1;
      op = gimple_op (stmt, reduc_index);
      /* For additional copies (see the explanation of NUMBER_OF_COPIES below)
         we need either neutral operands or the original operands.  See
         get_initial_def_for_reduction() for details.  */
      switch (code)
        {
          case WIDEN_SUM_EXPR:
          case DOT_PROD_EXPR:
          case PLUS_EXPR:
          case MINUS_EXPR:
          case BIT_IOR_EXPR:
          case BIT_XOR_EXPR:
            if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (op)))
              neutral_op = build_real (TREE_TYPE (op), dconst0);
            else
              neutral_op = build_int_cst (TREE_TYPE (op), 0);
            break;

          case MULT_EXPR:
            if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (op)))
              neutral_op = build_real (TREE_TYPE (op), dconst1);
            else
              neutral_op = build_int_cst (TREE_TYPE (op), 1);
            break;

          case BIT_AND_EXPR:
            neutral_op = build_int_cst (TREE_TYPE (op), -1);
            break;

          /* For MIN/MAX we don't have an easy neutral operand but
             the initial values can be used fine here.  Only for
             a reduction chain we have to force a neutral element.  */
          case MAX_EXPR:
          case MIN_EXPR:
            if (!GROUP_FIRST_ELEMENT (stmt_vinfo))
              neutral_op = NULL;
            else
              {
                def_stmt = SSA_NAME_DEF_STMT (op);
                loop = (gimple_bb (stmt))->loop_father;
                neutral_op = PHI_ARG_DEF_FROM_EDGE (def_stmt,
                                                    loop_preheader_edge (loop));
              }
            break;

          default:
            neutral_op = NULL;
        }
    }

  if (STMT_VINFO_DATA_REF (stmt_vinfo))
    {
      is_store = true;
      op = gimple_assign_rhs1 (stmt);
    }
  else
    is_store = false;

  gcc_assert (op);

  if (CONSTANT_CLASS_P (op))
    constant_p = true;
  else
    constant_p = false;

  vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
  gcc_assert (vector_type);
  nunits = TYPE_VECTOR_SUBPARTS (vector_type);
  /* NUMBER_OF_COPIES is the number of times we need to use the same values in
     created vectors.  It is greater than 1 if unrolling is performed.

     For example, we have two scalar operands, s1 and s2 (e.g., group of
     strided accesses of size two), while NUNITS is four (i.e., four scalars
     of this type can be packed in a vector).  The output vector will contain
     two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
     will be 2).

     If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
     containing the operands.

     For example, NUNITS is four as before, and the group size is 8
     (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
     {s5, s6, s7, s8}.  */

  number_of_copies = least_common_multiple (nunits, group_size) / group_size;
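  /* Worked instances of the formula above (illustrative only): with
     NUNITS == 4 and GROUP_SIZE == 2, lcm (4, 2) / 2 == 2 copies of {s1, s2};
     with GROUP_SIZE == 8, lcm (4, 8) / 8 == 1, i.e. each scalar appears once
     but spread over two vectors; with GROUP_SIZE == 3, lcm (4, 3) / 3 == 4,
     so the group is repeated four times across three four-element vectors.  */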
  number_of_places_left_in_vector = nunits;
  elts = XALLOCAVEC (tree, nunits);
  for (j = 0; j < number_of_copies; j++)
    {
      for (i = group_size - 1; stmts.iterate (i, &stmt); i--)
        {
          if (is_store)
            op = gimple_assign_rhs1 (stmt);
          else
            {
              switch (code)
                {
                  case COND_EXPR:
                    if (op_num == 0 || op_num == 1)
                      {
                        tree cond = gimple_assign_rhs1 (stmt);
                        op = TREE_OPERAND (cond, op_num);
                      }
                    else
                      {
                        if (op_num == 2)
                          op = gimple_assign_rhs2 (stmt);
                        else
                          op = gimple_assign_rhs3 (stmt);
                      }
                    break;

                  case CALL_EXPR:
                    op = gimple_call_arg (stmt, op_num);
                    break;

                  case LSHIFT_EXPR:
                  case RSHIFT_EXPR:
                  case LROTATE_EXPR:
                  case RROTATE_EXPR:
                    op = gimple_op (stmt, op_num + 1);
                    /* Unlike the other binary operators, shifts/rotates have
                       the shift count being int, instead of the same type as
                       the lhs, so make sure the scalar is the right type if
                       we are dealing with vectors of
                       long long/long/short/char.  */
                    if (op_num == 1 && TREE_CODE (op) == INTEGER_CST)
                      op = fold_convert (TREE_TYPE (vector_type), op);
                    break;

                  default:
                    op = gimple_op (stmt, op_num + 1);
                    break;
                }
            }

          if (reduc_index != -1)
            {
              loop = (gimple_bb (stmt))->loop_father;
              def_stmt = SSA_NAME_DEF_STMT (op);

              gcc_assert (loop);

              /* Get the def before the loop.  In reduction chain we have only
                 one initial value.  */
              if ((j != (number_of_copies - 1)
                   || (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
                       && i != 0))
                  && neutral_op)
                op = neutral_op;
              else
                op = PHI_ARG_DEF_FROM_EDGE (def_stmt,
                                            loop_preheader_edge (loop));
            }

          /* Create 'vect_ = {op0,op1,...,opn}'.  */
          number_of_places_left_in_vector--;
          if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
            {
              if (CONSTANT_CLASS_P (op))
                {
                  op = fold_unary (VIEW_CONVERT_EXPR,
                                   TREE_TYPE (vector_type), op);
                  gcc_assert (op && CONSTANT_CLASS_P (op));
                }
              else
                {
                  tree new_temp
                    = make_ssa_name (TREE_TYPE (vector_type), NULL);
                  gimple init_stmt;
                  op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
                               op);
                  init_stmt
                    = gimple_build_assign_with_ops (VIEW_CONVERT_EXPR,
                                                    new_temp, op, NULL_TREE);
                  gimple_seq_add_stmt (&ctor_seq, init_stmt);
                  op = new_temp;
                }
            }
          elts[number_of_places_left_in_vector] = op;
          if (!CONSTANT_CLASS_P (op))
            constant_p = false;

          if (number_of_places_left_in_vector == 0)
            {
              number_of_places_left_in_vector = nunits;

              if (constant_p)
                vec_cst = build_vector (vector_type, elts);
              else
                {
                  vec<constructor_elt, va_gc> *v;
                  unsigned k;
                  vec_alloc (v, nunits);
                  for (k = 0; k < nunits; ++k)
                    CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[k]);
                  vec_cst = build_constructor (vector_type, v);
                }
              voprnds.quick_push (vect_init_vector (stmt, vec_cst,
                                                    vector_type, NULL));
              if (ctor_seq != NULL)
                {
                  gimple init_stmt = SSA_NAME_DEF_STMT (voprnds.last ());
                  gimple_stmt_iterator gsi = gsi_for_stmt (init_stmt);
                  gsi_insert_seq_before_without_update (&gsi, ctor_seq,
                                                        GSI_SAME_STMT);
                  ctor_seq = NULL;
                }
            }
        }
    }

  /* Since the vectors are created in the reverse order, we should invert
     them.  */
  vec_num = voprnds.length ();
  for (j = vec_num; j != 0; j--)
    {
      vop = voprnds[j - 1];
      vec_oprnds->quick_push (vop);
    }

  voprnds.release ();

  /* In case that VF is greater than the unrolling factor needed for the SLP
     group of stmts, NUMBER_OF_VECTORS to be created is greater than
     NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
     to replicate the vectors.  */
  while (number_of_vectors > vec_oprnds->length ())
    {
      tree neutral_vec = NULL;

      if (neutral_op)
        {
          if (!neutral_vec)
            neutral_vec = build_vector_from_val (vector_type, neutral_op);

          vec_oprnds->quick_push (neutral_vec);
        }
      else
        {
          for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
            vec_oprnds->quick_push (vop);
        }
    }
}
/* Get vectorized definitions from SLP_NODE that contains corresponding
   vectorized def-stmts.  */

static void
vect_get_slp_vect_defs (slp_tree slp_node, vec<tree> *vec_oprnds)
{
  tree vec_oprnd;
  gimple vec_def_stmt;
  unsigned int i;

  gcc_assert (SLP_TREE_VEC_STMTS (slp_node).exists ());

  FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), i, vec_def_stmt)
    {
      gcc_assert (vec_def_stmt);
      vec_oprnd = gimple_get_lhs (vec_def_stmt);
      vec_oprnds->quick_push (vec_oprnd);
    }
}
/* Get vectorized definitions for SLP_NODE.
   If the scalar definitions are loop invariants or constants, collect them and
   call vect_get_constant_vectors() to create vector stmts.
   Otherwise, the def-stmts must be already vectorized and the vectorized stmts
   must be stored in the corresponding child of SLP_NODE, and we call
   vect_get_slp_vect_defs () to retrieve them.  */

void
vect_get_slp_defs (vec<tree> ops, slp_tree slp_node,
                   vec<vec<tree> > *vec_oprnds, int reduc_index)
{
  gimple first_stmt;
  int number_of_vects = 0, i;
  unsigned int child_index = 0;
  HOST_WIDE_INT lhs_size_unit, rhs_size_unit;
  slp_tree child = NULL;
  vec<tree> vec_defs;
  tree oprnd;
  bool vectorized_defs;

  first_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[0];
  FOR_EACH_VEC_ELT (ops, i, oprnd)
    {
      /* For each operand we check if it has vectorized definitions in a child
         node or we need to create them (for invariants and constants).  We
         check if the LHS of the first stmt of the next child matches OPRND.
         If it does, we found the correct child.  Otherwise, we call
         vect_get_constant_vectors (), and not advance CHILD_INDEX in order
         to check this child node for the next operand.  */
      vectorized_defs = false;
      if (SLP_TREE_CHILDREN (slp_node).length () > child_index)
        {
          child = SLP_TREE_CHILDREN (slp_node)[child_index];

          /* We have to check both pattern and original def, if available.  */
          gimple first_def = SLP_TREE_SCALAR_STMTS (child)[0];
          gimple related = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first_def));

          if (operand_equal_p (oprnd, gimple_get_lhs (first_def), 0)
              || (related
                  && operand_equal_p (oprnd, gimple_get_lhs (related), 0)))
            {
              /* The number of vector defs is determined by the number of
                 vector statements in the node from which we get those
                 statements.  */
              number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (child);
              vectorized_defs = true;
              child_index++;
            }
        }

      if (!vectorized_defs)
        {
          if (i == 0)
            {
              number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
              /* Number of vector stmts was calculated according to LHS in
                 vect_schedule_slp_instance (), fix it by replacing LHS with
                 RHS, if necessary.  See vect_get_smallest_scalar_type () for
                 details.  */
              vect_get_smallest_scalar_type (first_stmt, &lhs_size_unit,
                                             &rhs_size_unit);
              if (rhs_size_unit != lhs_size_unit)
                {
                  number_of_vects *= rhs_size_unit;
                  number_of_vects /= lhs_size_unit;
                }
            }
        }

      /* Allocate memory for vectorized defs.  */
      vec_defs = vNULL;
      vec_defs.create (number_of_vects);

      /* For reduction defs we call vect_get_constant_vectors (), since we are
         looking for initial loop invariant values.  */
      if (vectorized_defs && reduc_index == -1)
        /* The defs are already vectorized.  */
        vect_get_slp_vect_defs (child, &vec_defs);
      else
        /* Build vectors from scalar defs.  */
        vect_get_constant_vectors (oprnd, slp_node, &vec_defs, i,
                                   number_of_vects, reduc_index);

      vec_oprnds->quick_push (vec_defs);

      /* For reductions, we only need initial values.  */
      if (reduc_index != -1)
        return;
    }
}
/* Create NCOPIES permutation statements using the mask MASK_BYTES (by
   building a vector of type MASK_TYPE from it) and two input vectors placed in
   DR_CHAIN at FIRST_VEC_INDX and SECOND_VEC_INDX for the first copy and
   shifting by STRIDE elements of DR_CHAIN for every copy.
   (STRIDE is the number of vectorized stmts for NODE divided by the number of
   copies).
   VECT_STMTS_COUNTER specifies the index in the vectorized stmts of NODE, where
   the created stmts must be inserted.  */
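/* As an illustration (hypothetical numbers): if NODE has 4 vector stmts and
   NCOPIES is 2, STRIDE is 2, so copy 0 permutes DR_CHAIN[FIRST_VEC_INDX] and
   DR_CHAIN[SECOND_VEC_INDX] and is stored at slot VECT_STMTS_COUNTER, while
   copy 1 permutes the vectors two entries further in DR_CHAIN and is stored
   at slot VECT_STMTS_COUNTER + 2.  */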
static void
vect_create_mask_and_perm (gimple stmt, gimple next_scalar_stmt,
                           tree mask, int first_vec_indx, int second_vec_indx,
                           gimple_stmt_iterator *gsi, slp_tree node,
                           tree vectype, vec<tree> dr_chain,
                           int ncopies, int vect_stmts_counter)
{
  tree perm_dest;
  gimple perm_stmt = NULL;
  stmt_vec_info next_stmt_info;
  int i, stride;
  tree first_vec, second_vec, data_ref;

  stride = SLP_TREE_NUMBER_OF_VEC_STMTS (node) / ncopies;

  /* Initialize the vect stmts of NODE to properly insert the generated
     stmts later.  */
  for (i = SLP_TREE_VEC_STMTS (node).length ();
       i < (int) SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
    SLP_TREE_VEC_STMTS (node).quick_push (NULL);

  perm_dest = vect_create_destination_var (gimple_assign_lhs (stmt), vectype);
  for (i = 0; i < ncopies; i++)
    {
      first_vec = dr_chain[first_vec_indx];
      second_vec = dr_chain[second_vec_indx];

      /* Generate the permute statement.  */
      perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, perm_dest,
                                                first_vec, second_vec, mask);
      data_ref = make_ssa_name (perm_dest, perm_stmt);
      gimple_set_lhs (perm_stmt, data_ref);
      vect_finish_stmt_generation (stmt, perm_stmt, gsi);

      /* Store the vector statement in NODE.  */
      SLP_TREE_VEC_STMTS (node)[stride * i + vect_stmts_counter] = perm_stmt;

      first_vec_indx += stride;
      second_vec_indx += stride;
    }

  /* Mark the scalar stmt as vectorized.  */
  next_stmt_info = vinfo_for_stmt (next_scalar_stmt);
  STMT_VINFO_VEC_STMT (next_stmt_info) = perm_stmt;
}
/* Given FIRST_MASK_ELEMENT - the mask element in element representation,
   return in CURRENT_MASK_ELEMENT its equivalent in target specific
   representation.  Check that the mask is valid and return FALSE if not.
   Return TRUE in NEED_NEXT_VECTOR if the permutation requires to move to
   the next vector, i.e., the current first vector is not needed.  */
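/* For instance (illustrative values): with MASK_NUNITS == 4,
   FIRST_MASK_ELEMENT == 6 and M == 0, the computed element is 6, i.e. it
   selects lane 2 of the second input vector.  A value of 8 or more falls
   outside the current pair of vectors, and the function below either shifts
   the window to the next pair of vectors or rejects the permutation.  */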
static bool
vect_get_mask_element (gimple stmt, int first_mask_element, int m,
                       int mask_nunits, bool only_one_vec, int index,
                       unsigned char *mask, int *current_mask_element,
                       bool *need_next_vector, int *number_of_mask_fixes,
                       bool *mask_fixed, bool *needs_first_vector)
{
  int i;

  /* Convert to target specific representation.  */
  *current_mask_element = first_mask_element + m;
  /* Adjust the value in case it's a mask for second and third vectors.  */
  *current_mask_element -= mask_nunits * (*number_of_mask_fixes - 1);

  if (*current_mask_element < mask_nunits)
    *needs_first_vector = true;

  /* We have only one input vector to permute but the mask accesses values in
     the next vector as well.  */
  if (only_one_vec && *current_mask_element >= mask_nunits)
    {
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "permutation requires at least two vectors ");
          dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }

      return false;
    }

  /* The mask requires the next vector.  */
  while (*current_mask_element >= mask_nunits * 2)
    {
      if (*needs_first_vector || *mask_fixed)
        {
          /* We either need the first vector too or have already moved to the
             next vector.  In both cases, this permutation needs three
             vectors.  */
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "permutation requires at "
                               "least three vectors ");
              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }

          return false;
        }

      /* We move to the next vector, dropping the first one and working with
         the second and the third - we need to adjust the values of the mask
         accordingly.  */
      *current_mask_element -= mask_nunits * *number_of_mask_fixes;

      for (i = 0; i < index; i++)
        mask[i] -= mask_nunits * *number_of_mask_fixes;

      (*number_of_mask_fixes)++;
      *mask_fixed = true;
    }

  *need_next_vector = *mask_fixed;

  /* This was the last element of this mask.  Start a new one.  */
  if (index == mask_nunits - 1)
    {
      *number_of_mask_fixes = 1;
      *mask_fixed = false;
      *needs_first_vector = false;
    }

  return true;
}
/* Generate vector permute statements from a list of loads in DR_CHAIN.
   If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
   permute statements for the SLP node NODE of the SLP instance
   SLP_NODE_INSTANCE.  */

bool
vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
                              gimple_stmt_iterator *gsi, int vf,
                              slp_instance slp_node_instance, bool analyze_only)
{
  gimple stmt = SLP_TREE_SCALAR_STMTS (node)[0];
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  tree mask_element_type = NULL_TREE, mask_type;
  int i, j, k, nunits, vec_index = 0, scalar_index;
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  gimple next_scalar_stmt;
  int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
  int first_mask_element;
  int index, unroll_factor, current_mask_element, ncopies;
  unsigned char *mask;
  bool only_one_vec = false, need_next_vector = false;
  int first_vec_index, second_vec_index, orig_vec_stmts_num, vect_stmts_counter;
  int number_of_mask_fixes = 1;
  bool mask_fixed = false;
  bool needs_first_vector = false;
  enum machine_mode mode;

  mode = TYPE_MODE (vectype);

  if (!can_vec_perm_p (mode, false, NULL))
    {
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "no vect permute for ");
          dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }
      return false;
    }

  /* The generic VEC_PERM_EXPR code always uses an integral type of the
     same size as the vector element being permuted.  */
  mask_element_type = lang_hooks.types.type_for_mode
                (int_mode_for_mode (TYPE_MODE (TREE_TYPE (vectype))), 1);
  mask_type = get_vectype_for_scalar_type (mask_element_type);
  nunits = TYPE_VECTOR_SUBPARTS (vectype);
  mask = XALLOCAVEC (unsigned char, nunits);
  unroll_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);

  /* The number of vector stmts to generate based only on SLP_NODE_INSTANCE
     unrolling factor.  */
  orig_vec_stmts_num = group_size *
                SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance) / nunits;
  if (orig_vec_stmts_num == 1)
    only_one_vec = true;

  /* Number of copies is determined by the final vectorization factor
     relatively to SLP_NODE_INSTANCE unrolling factor.  */
  ncopies = vf / SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);
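  /* E.g. (hypothetical values): an instance unrolling factor of 4 with a
     final VF of 8 gives NCOPIES == 2, so every permutation computed below
     is emitted twice, the second copy reading its input vectors STRIDE
     entries further into DR_CHAIN (see vect_create_mask_and_perm).  */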
  if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
    return false;

  /* Generate permutation masks for every NODE.  Number of masks for each NODE
     is equal to GROUP_SIZE.
     E.g., we have a group of three nodes with three loads from the same
     location in each node, and the vector size is 4.  I.e., we have a
     a0b0c0a1b1c1... sequence and we need to create the following vectors:
     for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
     for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
     ...

     The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
     The last mask is illegal since we assume two operands for permute
     operation, and the mask element values can't be outside that range.
     Hence, the last mask must be converted into {2,5,5,5}.
     For the first two permutations we need the first and the second input
     vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
     we need the second and the third vectors: {b1,c1,a2,b2} and
     {c2,a3,b3,c3}.  */

  index = 0;
  scalar_index = 0;
  vect_stmts_counter = 0;
  vec_index = 0;
  first_vec_index = vec_index++;
  if (only_one_vec)
    second_vec_index = first_vec_index;
  else
    second_vec_index = vec_index++;

  for (j = 0; j < unroll_factor; j++)
    {
      for (k = 0; k < group_size; k++)
        {
          i = SLP_TREE_LOAD_PERMUTATION (node)[k];
          first_mask_element = i + j * group_size;
          if (!vect_get_mask_element (stmt, first_mask_element, 0,
                                      nunits, only_one_vec, index,
                                      mask, &current_mask_element,
                                      &need_next_vector,
                                      &number_of_mask_fixes, &mask_fixed,
                                      &needs_first_vector))
            return false;
          gcc_assert (current_mask_element < 2 * nunits);
          mask[index++] = current_mask_element;

          if (index == nunits)
            {
              index = 0;
              if (!can_vec_perm_p (mode, false, mask))
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                       vect_location,
                                       "unsupported vect permute { ");
                      for (i = 0; i < nunits; ++i)
                        dump_printf (MSG_MISSED_OPTIMIZATION, "%d ",
                                     mask[i]);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
                    }
                  return false;
                }

              if (!analyze_only)
                {
                  int l;
                  tree mask_vec, *mask_elts;
                  mask_elts = XALLOCAVEC (tree, nunits);
                  for (l = 0; l < nunits; ++l)
                    mask_elts[l] = build_int_cst (mask_element_type,
                                                  mask[l]);
                  mask_vec = build_vector (mask_type, mask_elts);

                  if (need_next_vector)
                    {
                      first_vec_index = second_vec_index;
                      second_vec_index = vec_index;
                    }

                  next_scalar_stmt
                    = SLP_TREE_SCALAR_STMTS (node)[scalar_index++];

                  vect_create_mask_and_perm (stmt, next_scalar_stmt,
                           mask_vec, first_vec_index, second_vec_index,
                           gsi, node, vectype, dr_chain,
                           ncopies, vect_stmts_counter++);
                }
            }
        }
    }

  return true;
}
/* Vectorize SLP instance tree in postorder.  */

static bool
vect_schedule_slp_instance (slp_tree node, slp_instance instance,
                            unsigned int vectorization_factor)
{
  gimple stmt;
  bool grouped_store, is_store;
  gimple_stmt_iterator si;
  stmt_vec_info stmt_info;
  unsigned int vec_stmts_size, nunits, group_size;
  tree vectype;
  int i;
  slp_tree child;

  if (!node)
    return false;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_schedule_slp_instance (child, instance, vectorization_factor);

  stmt = SLP_TREE_SCALAR_STMTS (node)[0];
  stmt_info = vinfo_for_stmt (stmt);

  /* VECTYPE is the type of the destination.  */
  vectype = STMT_VINFO_VECTYPE (stmt_info);
  nunits = (unsigned int) TYPE_VECTOR_SUBPARTS (vectype);
  group_size = SLP_INSTANCE_GROUP_SIZE (instance);

  /* For each SLP instance calculate number of vector stmts to be created
     for the scalar stmts in each node of the SLP tree.  Number of vector
     elements in one vector iteration is the number of scalar elements in
     one scalar iteration (GROUP_SIZE) multiplied by VF divided by vector
     size.  */
  vec_stmts_size = (vectorization_factor * group_size) / nunits;
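  /* E.g. (hypothetical numbers): with GROUP_SIZE == 2, a vectorization
     factor of 4 and NUNITS == 4, vec_stmts_size is (4 * 2) / 4 == 2 vector
     stmts per node.  */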
  if (!SLP_TREE_VEC_STMTS (node).exists ())
    {
      SLP_TREE_VEC_STMTS (node).create (vec_stmts_size);
      SLP_TREE_NUMBER_OF_VEC_STMTS (node) = vec_stmts_size;
    }

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location,
                       "------>vectorizing SLP node starting from: ");
      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
      dump_printf (MSG_NOTE, "\n");
    }

  /* Loads should be inserted before the first load.  */
  if (SLP_INSTANCE_FIRST_LOAD_STMT (instance)
      && STMT_VINFO_GROUPED_ACCESS (stmt_info)
      && !REFERENCE_CLASS_P (gimple_get_lhs (stmt))
      && SLP_TREE_LOAD_PERMUTATION (node).exists ())
    si = gsi_for_stmt (SLP_INSTANCE_FIRST_LOAD_STMT (instance));
  else if (is_pattern_stmt_p (stmt_info))
    si = gsi_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
  else
    si = gsi_for_stmt (stmt);

  /* Stores should be inserted just before the last store.  */
  if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
      && REFERENCE_CLASS_P (gimple_get_lhs (stmt)))
    {
      gimple last_store = vect_find_last_store_in_slp_instance (instance);
      if (is_pattern_stmt_p (vinfo_for_stmt (last_store)))
        last_store = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (last_store));
      si = gsi_for_stmt (last_store);
    }

  /* Mark the first element of the reduction chain as reduction to properly
     transform the node.  In the analysis phase only the last element of the
     chain is marked as reduction.  */
  if (GROUP_FIRST_ELEMENT (stmt_info) && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
      && GROUP_FIRST_ELEMENT (stmt_info) == stmt)
    {
      STMT_VINFO_DEF_TYPE (stmt_info) = vect_reduction_def;
      STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
    }

  is_store = vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
  return is_store;
}
/* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
   For loop vectorization this is done in vectorizable_call, but for SLP
   it needs to be deferred until end of vect_schedule_slp, because multiple
   SLP instances may refer to the same scalar stmt.  */

static void
vect_remove_slp_scalar_calls (slp_tree node)
{
  gimple stmt, new_stmt;
  gimple_stmt_iterator gsi;
  int i;
  slp_tree child;
  tree lhs;
  stmt_vec_info stmt_info;

  if (!node)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_remove_slp_scalar_calls (child);

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    {
      if (!is_gimple_call (stmt) || gimple_bb (stmt) == NULL)
        continue;
      stmt_info = vinfo_for_stmt (stmt);
      if (stmt_info == NULL
          || is_pattern_stmt_p (stmt_info)
          || !PURE_SLP_STMT (stmt_info))
        continue;
      lhs = gimple_call_lhs (stmt);
      new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
      set_vinfo_for_stmt (new_stmt, stmt_info);
      set_vinfo_for_stmt (stmt, NULL);
      STMT_VINFO_STMT (stmt_info) = new_stmt;
      gsi = gsi_for_stmt (stmt);
      gsi_replace (&gsi, new_stmt, false);
      SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
    }
}
/* Generate vector code for all SLP instances in the loop/basic block.  */

bool
vect_schedule_slp (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
{
  vec<slp_instance> slp_instances;
  slp_instance instance;
  unsigned int i, vf;
  bool is_store = false;

  if (loop_vinfo)
    {
      slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
      vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
    }
  else
    {
      slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
      vf = 1;
    }

  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      /* Schedule the tree of INSTANCE.  */
      is_store = vect_schedule_slp_instance (SLP_INSTANCE_TREE (instance),
                                             instance, vf);
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "vectorizing stmts using SLP.\n");
    }

  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      slp_tree root = SLP_INSTANCE_TREE (instance);
      gimple store;
      unsigned int j;
      gimple_stmt_iterator gsi;

      /* Remove scalar call stmts.  Do not do this for basic-block
         vectorization as not all uses may be vectorized.
         ???  Why should this be necessary?  DCE should be able to
         remove the stmts itself.
         ???  For BB vectorization we can as well remove scalar
         stmts starting from the SLP tree root if they have no
         uses.  */
      if (loop_vinfo)
        vect_remove_slp_scalar_calls (root);

      for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store)
                  && j < SLP_INSTANCE_GROUP_SIZE (instance); j++)
        {
          if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (store)))
            break;

          if (is_pattern_stmt_p (vinfo_for_stmt (store)))
            store = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (store));
          /* Free the attached stmt_vec_info and remove the stmt.  */
          gsi = gsi_for_stmt (store);
          unlink_stmt_vdef (store);
          gsi_remove (&gsi, true);
          release_defs (store);
          free_stmt_vec_info (store);
        }
    }

  return is_store;
}
/* Vectorize the basic block.  */

void
vect_slp_transform_bb (basic_block bb)
{
  bb_vec_info bb_vinfo = vec_info_for_bb (bb);
  gimple_stmt_iterator si;

  gcc_assert (bb_vinfo);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB\n");

  for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
    {
      gimple stmt = gsi_stmt (si);
      stmt_vec_info stmt_info;

      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location,
                           "------>SLPing statement: ");
          dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
          dump_printf (MSG_NOTE, "\n");
        }

      stmt_info = vinfo_for_stmt (stmt);
      gcc_assert (stmt_info);

      /* Schedule all the SLP instances when the first SLP stmt is reached.  */
      if (STMT_SLP_TYPE (stmt_info))
        {
          vect_schedule_slp (NULL, bb_vinfo);
          break;
        }
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "BASIC BLOCK VECTORIZED\n");

  destroy_bb_vec_info (bb_vinfo);
}