1 /* SLP - Basic Block Vectorization
2 Copyright (C) 2007-2014 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
24 #include "coretypes.h"
28 #include "stor-layout.h"
35 #include "hard-reg-set.h"
38 #include "basic-block.h"
39 #include "gimple-pretty-print.h"
40 #include "tree-ssa-alias.h"
41 #include "internal-fn.h"
42 #include "gimple-expr.h"
45 #include "gimple-iterator.h"
46 #include "gimple-ssa.h"
47 #include "tree-phinodes.h"
48 #include "ssa-iterators.h"
49 #include "stringpool.h"
50 #include "tree-ssanames.h"
51 #include "tree-pass.h"
54 #include "recog.h" /* FIXME: for insn_data */
55 #include "insn-codes.h"
57 #include "tree-vectorizer.h"
58 #include "langhooks.h"
60 /* Extract the location of the basic block in the source code.
61 Return the basic block location if succeed and NULL if not. */
64 find_bb_location (basic_block bb
)
67 gimple_stmt_iterator si
;
70 return UNKNOWN_LOCATION
;
72 for (si
= gsi_start_bb (bb
); !gsi_end_p (si
); gsi_next (&si
))
75 if (gimple_location (stmt
) != UNKNOWN_LOCATION
)
76 return gimple_location (stmt
);
79 return UNKNOWN_LOCATION
;
83 /* Recursively free the memory allocated for the SLP tree rooted at NODE. */
86 vect_free_slp_tree (slp_tree node
)
94 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
95 vect_free_slp_tree (child
);
97 SLP_TREE_CHILDREN (node
).release ();
98 SLP_TREE_SCALAR_STMTS (node
).release ();
99 SLP_TREE_VEC_STMTS (node
).release ();
100 SLP_TREE_LOAD_PERMUTATION (node
).release ();
106 /* Free the memory allocated for the SLP instance. */
109 vect_free_slp_instance (slp_instance instance
)
111 vect_free_slp_tree (SLP_INSTANCE_TREE (instance
));
112 SLP_INSTANCE_LOADS (instance
).release ();
113 SLP_INSTANCE_BODY_COST_VEC (instance
).release ();
118 /* Create an SLP node for SCALAR_STMTS. */
121 vect_create_new_slp_node (vec
<gimple
> scalar_stmts
)
124 gimple stmt
= scalar_stmts
[0];
127 if (is_gimple_call (stmt
))
128 nops
= gimple_call_num_args (stmt
);
129 else if (is_gimple_assign (stmt
))
131 nops
= gimple_num_ops (stmt
) - 1;
132 if (gimple_assign_rhs_code (stmt
) == COND_EXPR
)
138 node
= XNEW (struct _slp_tree
);
139 SLP_TREE_SCALAR_STMTS (node
) = scalar_stmts
;
140 SLP_TREE_VEC_STMTS (node
).create (0);
141 SLP_TREE_CHILDREN (node
).create (nops
);
142 SLP_TREE_LOAD_PERMUTATION (node
) = vNULL
;
148 /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
150 static vec
<slp_oprnd_info
>
151 vect_create_oprnd_info (int nops
, int group_size
)
154 slp_oprnd_info oprnd_info
;
155 vec
<slp_oprnd_info
> oprnds_info
;
157 oprnds_info
.create (nops
);
158 for (i
= 0; i
< nops
; i
++)
160 oprnd_info
= XNEW (struct _slp_oprnd_info
);
161 oprnd_info
->def_stmts
.create (group_size
);
162 oprnd_info
->first_dt
= vect_uninitialized_def
;
163 oprnd_info
->first_op_type
= NULL_TREE
;
164 oprnd_info
->first_pattern
= false;
165 oprnds_info
.quick_push (oprnd_info
);
172 /* Free operands info. */
175 vect_free_oprnd_info (vec
<slp_oprnd_info
> &oprnds_info
)
178 slp_oprnd_info oprnd_info
;
180 FOR_EACH_VEC_ELT (oprnds_info
, i
, oprnd_info
)
182 oprnd_info
->def_stmts
.release ();
183 XDELETE (oprnd_info
);
186 oprnds_info
.release ();
190 /* Find the place of the data-ref in STMT in the interleaving chain that starts
191 from FIRST_STMT. Return -1 if the data-ref is not a part of the chain. */
194 vect_get_place_in_interleaving_chain (gimple stmt
, gimple first_stmt
)
196 gimple next_stmt
= first_stmt
;
199 if (first_stmt
!= GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)))
204 if (next_stmt
== stmt
)
207 next_stmt
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt
));
215 /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
216 they are of a valid type and that they match the defs of the first stmt of
217 the SLP group (stored in OPRNDS_INFO). If there was a fatal error
218 return -1, if the error could be corrected by swapping operands of the
219 operation return 1, if everything is ok return 0. */
222 vect_get_and_check_slp_defs (loop_vec_info loop_vinfo
, bb_vec_info bb_vinfo
,
223 gimple stmt
, bool first
,
224 vec
<slp_oprnd_info
> *oprnds_info
)
227 unsigned int i
, number_of_oprnds
;
230 enum vect_def_type dt
= vect_uninitialized_def
;
231 struct loop
*loop
= NULL
;
232 bool pattern
= false;
233 slp_oprnd_info oprnd_info
;
234 int first_op_idx
= 1;
235 bool commutative
= false;
236 bool first_op_cond
= false;
239 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
241 if (is_gimple_call (stmt
))
243 number_of_oprnds
= gimple_call_num_args (stmt
);
246 else if (is_gimple_assign (stmt
))
248 enum tree_code code
= gimple_assign_rhs_code (stmt
);
249 number_of_oprnds
= gimple_num_ops (stmt
) - 1;
250 if (gimple_assign_rhs_code (stmt
) == COND_EXPR
)
252 first_op_cond
= true;
257 commutative
= commutative_tree_code (code
);
262 bool swapped
= false;
263 for (i
= 0; i
< number_of_oprnds
; i
++)
268 if (i
== 0 || i
== 1)
269 oprnd
= TREE_OPERAND (gimple_op (stmt
, first_op_idx
),
272 oprnd
= gimple_op (stmt
, first_op_idx
+ i
- 1);
275 oprnd
= gimple_op (stmt
, first_op_idx
+ (swapped
? !i
: i
));
277 oprnd_info
= (*oprnds_info
)[i
];
279 if (!vect_is_simple_use (oprnd
, NULL
, loop_vinfo
, bb_vinfo
, &def_stmt
,
281 || (!def_stmt
&& dt
!= vect_constant_def
))
283 if (dump_enabled_p ())
285 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
286 "Build SLP failed: can't find def for ");
287 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, oprnd
);
288 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
294 /* Check if DEF_STMT is a part of a pattern in LOOP and get the def stmt
295 from the pattern. Check that all the stmts of the node are in the
297 if (def_stmt
&& gimple_bb (def_stmt
)
298 && ((loop
&& flow_bb_inside_loop_p (loop
, gimple_bb (def_stmt
)))
299 || (!loop
&& gimple_bb (def_stmt
) == BB_VINFO_BB (bb_vinfo
)
300 && gimple_code (def_stmt
) != GIMPLE_PHI
))
301 && vinfo_for_stmt (def_stmt
)
302 && STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (def_stmt
))
303 && !STMT_VINFO_RELEVANT (vinfo_for_stmt (def_stmt
))
304 && !STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt
)))
307 if (!first
&& !oprnd_info
->first_pattern
)
317 if (dump_enabled_p ())
319 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
320 "Build SLP failed: some of the stmts"
321 " are in a pattern, and others are not ");
322 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, oprnd
);
323 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
329 def_stmt
= STMT_VINFO_RELATED_STMT (vinfo_for_stmt (def_stmt
));
330 dt
= STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt
));
332 if (dt
== vect_unknown_def_type
)
334 if (dump_enabled_p ())
335 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
336 "Unsupported pattern.\n");
340 switch (gimple_code (def_stmt
))
343 def
= gimple_phi_result (def_stmt
);
347 def
= gimple_assign_lhs (def_stmt
);
351 if (dump_enabled_p ())
352 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
353 "unsupported defining stmt:\n");
360 oprnd_info
->first_dt
= dt
;
361 oprnd_info
->first_pattern
= pattern
;
362 oprnd_info
->first_op_type
= TREE_TYPE (oprnd
);
366 /* Not first stmt of the group, check that the def-stmt/s match
367 the def-stmt/s of the first stmt. Allow different definition
368 types for reduction chains: the first stmt must be a
369 vect_reduction_def (a phi node), and the rest
370 vect_internal_def. */
371 if (((oprnd_info
->first_dt
!= dt
372 && !(oprnd_info
->first_dt
== vect_reduction_def
373 && dt
== vect_internal_def
)
374 && !((oprnd_info
->first_dt
== vect_external_def
375 || oprnd_info
->first_dt
== vect_constant_def
)
376 && (dt
== vect_external_def
377 || dt
== vect_constant_def
)))
378 || !types_compatible_p (oprnd_info
->first_op_type
,
381 /* Try swapping operands if we got a mismatch. */
390 if (dump_enabled_p ())
391 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
392 "Build SLP failed: different types\n");
398 /* Check the types of the definitions. */
401 case vect_constant_def
:
402 case vect_external_def
:
403 case vect_reduction_def
:
406 case vect_internal_def
:
407 oprnd_info
->def_stmts
.quick_push (def_stmt
);
411 /* FORNOW: Not supported. */
412 if (dump_enabled_p ())
414 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
415 "Build SLP failed: illegal type of def ");
416 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, def
);
417 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
429 tree cond
= gimple_assign_rhs1 (stmt
);
430 swap_ssa_operands (stmt
, &TREE_OPERAND (cond
, 0),
431 &TREE_OPERAND (cond
, 1));
432 TREE_SET_CODE (cond
, swap_tree_comparison (TREE_CODE (cond
)));
435 swap_ssa_operands (stmt
, gimple_assign_rhs1_ptr (stmt
),
436 gimple_assign_rhs2_ptr (stmt
));
443 /* Verify if the scalar stmts STMTS are isomorphic, require data
444 permutation or are of unsupported types of operation. Return
445 true if they are, otherwise return false and indicate in *MATCHES
446 which stmts are not isomorphic to the first one. If MATCHES[0]
447 is false then this indicates the comparison could not be
448 carried out or the stmts will never be vectorized by SLP. */
451 vect_build_slp_tree_1 (loop_vec_info loop_vinfo
, bb_vec_info bb_vinfo
,
452 vec
<gimple
> stmts
, unsigned int group_size
,
453 unsigned nops
, unsigned int *max_nunits
,
454 unsigned int vectorization_factor
, bool *matches
)
457 gimple stmt
= stmts
[0];
458 enum tree_code first_stmt_code
= ERROR_MARK
, rhs_code
= ERROR_MARK
;
459 enum tree_code first_cond_code
= ERROR_MARK
;
461 bool need_same_oprnds
= false;
462 tree vectype
, scalar_type
, first_op1
= NULL_TREE
;
465 machine_mode optab_op2_mode
;
466 machine_mode vec_mode
;
467 struct data_reference
*first_dr
;
469 gimple first_load
= NULL
, prev_first_load
= NULL
, old_first_load
= NULL
;
472 /* For every stmt in NODE find its def stmt/s. */
473 FOR_EACH_VEC_ELT (stmts
, i
, stmt
)
477 if (dump_enabled_p ())
479 dump_printf_loc (MSG_NOTE
, vect_location
, "Build SLP for ");
480 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, stmt
, 0);
481 dump_printf (MSG_NOTE
, "\n");
484 /* Fail to vectorize statements marked as unvectorizable. */
485 if (!STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt
)))
487 if (dump_enabled_p ())
489 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
490 "Build SLP failed: unvectorizable statement ");
491 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
492 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
494 /* Fatal mismatch. */
499 lhs
= gimple_get_lhs (stmt
);
500 if (lhs
== NULL_TREE
)
502 if (dump_enabled_p ())
504 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
505 "Build SLP failed: not GIMPLE_ASSIGN nor "
507 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
508 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
510 /* Fatal mismatch. */
515 if (is_gimple_assign (stmt
)
516 && gimple_assign_rhs_code (stmt
) == COND_EXPR
517 && (cond
= gimple_assign_rhs1 (stmt
))
518 && !COMPARISON_CLASS_P (cond
))
520 if (dump_enabled_p ())
522 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
523 "Build SLP failed: condition is not "
525 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
526 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
528 /* Fatal mismatch. */
533 scalar_type
= vect_get_smallest_scalar_type (stmt
, &dummy
, &dummy
);
534 vectype
= get_vectype_for_scalar_type (scalar_type
);
537 if (dump_enabled_p ())
539 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
540 "Build SLP failed: unsupported data-type ");
541 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
543 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
545 /* Fatal mismatch. */
550 /* In case of multiple types we need to detect the smallest type. */
551 if (*max_nunits
< TYPE_VECTOR_SUBPARTS (vectype
))
553 *max_nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
555 vectorization_factor
= *max_nunits
;
558 if (is_gimple_call (stmt
))
560 rhs_code
= CALL_EXPR
;
561 if (gimple_call_internal_p (stmt
)
562 || gimple_call_tail_p (stmt
)
563 || gimple_call_noreturn_p (stmt
)
564 || !gimple_call_nothrow_p (stmt
)
565 || gimple_call_chain (stmt
))
567 if (dump_enabled_p ())
569 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
570 "Build SLP failed: unsupported call type ");
571 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
572 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
574 /* Fatal mismatch. */
580 rhs_code
= gimple_assign_rhs_code (stmt
);
582 /* Check the operation. */
585 first_stmt_code
= rhs_code
;
587 /* Shift arguments should be equal in all the packed stmts for a
588 vector shift with scalar shift operand. */
589 if (rhs_code
== LSHIFT_EXPR
|| rhs_code
== RSHIFT_EXPR
590 || rhs_code
== LROTATE_EXPR
591 || rhs_code
== RROTATE_EXPR
)
593 vec_mode
= TYPE_MODE (vectype
);
595 /* First see if we have a vector/vector shift. */
596 optab
= optab_for_tree_code (rhs_code
, vectype
,
600 || optab_handler (optab
, vec_mode
) == CODE_FOR_nothing
)
602 /* No vector/vector shift, try for a vector/scalar shift. */
603 optab
= optab_for_tree_code (rhs_code
, vectype
,
608 if (dump_enabled_p ())
609 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
610 "Build SLP failed: no optab.\n");
611 /* Fatal mismatch. */
615 icode
= (int) optab_handler (optab
, vec_mode
);
616 if (icode
== CODE_FOR_nothing
)
618 if (dump_enabled_p ())
619 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
621 "op not supported by target.\n");
622 /* Fatal mismatch. */
626 optab_op2_mode
= insn_data
[icode
].operand
[2].mode
;
627 if (!VECTOR_MODE_P (optab_op2_mode
))
629 need_same_oprnds
= true;
630 first_op1
= gimple_assign_rhs2 (stmt
);
634 else if (rhs_code
== WIDEN_LSHIFT_EXPR
)
636 need_same_oprnds
= true;
637 first_op1
= gimple_assign_rhs2 (stmt
);
642 if (first_stmt_code
!= rhs_code
643 && (first_stmt_code
!= IMAGPART_EXPR
644 || rhs_code
!= REALPART_EXPR
)
645 && (first_stmt_code
!= REALPART_EXPR
646 || rhs_code
!= IMAGPART_EXPR
)
647 && !(STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt
))
648 && (first_stmt_code
== ARRAY_REF
649 || first_stmt_code
== BIT_FIELD_REF
650 || first_stmt_code
== INDIRECT_REF
651 || first_stmt_code
== COMPONENT_REF
652 || first_stmt_code
== MEM_REF
)))
654 if (dump_enabled_p ())
656 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
657 "Build SLP failed: different operation "
659 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
660 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
667 && !operand_equal_p (first_op1
, gimple_assign_rhs2 (stmt
), 0))
669 if (dump_enabled_p ())
671 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
672 "Build SLP failed: different shift "
674 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
675 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
681 if (rhs_code
== CALL_EXPR
)
683 gimple first_stmt
= stmts
[0];
684 if (gimple_call_num_args (stmt
) != nops
685 || !operand_equal_p (gimple_call_fn (first_stmt
),
686 gimple_call_fn (stmt
), 0)
687 || gimple_call_fntype (first_stmt
)
688 != gimple_call_fntype (stmt
))
690 if (dump_enabled_p ())
692 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
693 "Build SLP failed: different calls in ");
694 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
696 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
704 /* Grouped store or load. */
705 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt
)))
707 if (REFERENCE_CLASS_P (lhs
))
715 unsigned unrolling_factor
716 = least_common_multiple
717 (*max_nunits
, group_size
) / group_size
;
718 /* FORNOW: Check that there is no gap between the loads
719 and no gap between the groups when we need to load
720 multiple groups at once.
721 ??? We should enhance this to only disallow gaps
723 if ((unrolling_factor
> 1
724 && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) == stmt
725 && GROUP_GAP (vinfo_for_stmt (stmt
)) != 0)
726 || (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) != stmt
727 && GROUP_GAP (vinfo_for_stmt (stmt
)) != 1))
729 if (dump_enabled_p ())
731 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
732 "Build SLP failed: grouped "
734 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
736 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
738 /* Fatal mismatch. */
743 /* Check that the size of interleaved loads group is not
744 greater than the SLP group size. */
746 = vectorization_factor
/ TYPE_VECTOR_SUBPARTS (vectype
);
748 && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) == stmt
749 && ((GROUP_SIZE (vinfo_for_stmt (stmt
))
750 - GROUP_GAP (vinfo_for_stmt (stmt
)))
751 > ncopies
* group_size
))
753 if (dump_enabled_p ())
755 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
756 "Build SLP failed: the number "
757 "of interleaved loads is greater than "
758 "the SLP group size ");
759 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
761 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
763 /* Fatal mismatch. */
768 old_first_load
= first_load
;
769 first_load
= GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
));
772 /* Check that there are no loads from different interleaving
773 chains in the same node. */
774 if (prev_first_load
!= first_load
)
776 if (dump_enabled_p ())
778 dump_printf_loc (MSG_MISSED_OPTIMIZATION
,
780 "Build SLP failed: different "
781 "interleaving chains in one node ");
782 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
784 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
791 prev_first_load
= first_load
;
793 /* In some cases a group of loads is just the same load
794 repeated N times. Only analyze its cost once. */
795 if (first_load
== stmt
&& old_first_load
!= first_load
)
797 first_dr
= STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt
));
798 if (vect_supportable_dr_alignment (first_dr
, false)
799 == dr_unaligned_unsupported
)
801 if (dump_enabled_p ())
803 dump_printf_loc (MSG_MISSED_OPTIMIZATION
,
805 "Build SLP failed: unsupported "
807 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
809 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
811 /* Fatal mismatch. */
817 } /* Grouped access. */
820 if (TREE_CODE_CLASS (rhs_code
) == tcc_reference
)
822 /* Not grouped load. */
823 if (dump_enabled_p ())
825 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
826 "Build SLP failed: not grouped load ");
827 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
828 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
831 /* FORNOW: Not grouped loads are not supported. */
832 /* Fatal mismatch. */
837 /* Not memory operation. */
838 if (TREE_CODE_CLASS (rhs_code
) != tcc_binary
839 && TREE_CODE_CLASS (rhs_code
) != tcc_unary
840 && rhs_code
!= COND_EXPR
841 && rhs_code
!= CALL_EXPR
)
843 if (dump_enabled_p ())
845 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
846 "Build SLP failed: operation");
847 dump_printf (MSG_MISSED_OPTIMIZATION
, " unsupported ");
848 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
849 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
851 /* Fatal mismatch. */
856 if (rhs_code
== COND_EXPR
)
858 tree cond_expr
= gimple_assign_rhs1 (stmt
);
861 first_cond_code
= TREE_CODE (cond_expr
);
862 else if (first_cond_code
!= TREE_CODE (cond_expr
))
864 if (dump_enabled_p ())
866 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
867 "Build SLP failed: different"
869 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
871 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
882 for (i
= 0; i
< group_size
; ++i
)
889 /* Recursively build an SLP tree starting from NODE.
890 Fail (and return a value not equal to zero) if def-stmts are not
891 isomorphic, require data permutation or are of unsupported types of
892 operation. Otherwise, return 0.
893 The value returned is the depth in the SLP tree where a mismatch
897 vect_build_slp_tree (loop_vec_info loop_vinfo
, bb_vec_info bb_vinfo
,
898 slp_tree
*node
, unsigned int group_size
,
899 unsigned int *max_nunits
,
900 vec
<slp_tree
> *loads
,
901 unsigned int vectorization_factor
,
902 bool *matches
, unsigned *npermutes
, unsigned *tree_size
,
903 unsigned max_tree_size
)
905 unsigned nops
, i
, this_npermutes
= 0, this_tree_size
= 0;
909 matches
= XALLOCAVEC (bool, group_size
);
911 npermutes
= &this_npermutes
;
915 stmt
= SLP_TREE_SCALAR_STMTS (*node
)[0];
916 if (is_gimple_call (stmt
))
917 nops
= gimple_call_num_args (stmt
);
918 else if (is_gimple_assign (stmt
))
920 nops
= gimple_num_ops (stmt
) - 1;
921 if (gimple_assign_rhs_code (stmt
) == COND_EXPR
)
927 if (!vect_build_slp_tree_1 (loop_vinfo
, bb_vinfo
,
928 SLP_TREE_SCALAR_STMTS (*node
), group_size
, nops
,
929 max_nunits
, vectorization_factor
, matches
))
932 /* If the SLP node is a load, terminate the recursion. */
933 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt
))
934 && DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt
))))
936 loads
->safe_push (*node
);
940 /* Get at the operands, verifying they are compatible. */
941 vec
<slp_oprnd_info
> oprnds_info
= vect_create_oprnd_info (nops
, group_size
);
942 slp_oprnd_info oprnd_info
;
943 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (*node
), i
, stmt
)
945 switch (vect_get_and_check_slp_defs (loop_vinfo
, bb_vinfo
,
946 stmt
, (i
== 0), &oprnds_info
))
952 vect_free_oprnd_info (oprnds_info
);
959 for (i
= 0; i
< group_size
; ++i
)
962 vect_free_oprnd_info (oprnds_info
);
966 stmt
= SLP_TREE_SCALAR_STMTS (*node
)[0];
968 /* Create SLP_TREE nodes for the definition node/s. */
969 FOR_EACH_VEC_ELT (oprnds_info
, i
, oprnd_info
)
972 unsigned old_nloads
= loads
->length ();
973 unsigned old_max_nunits
= *max_nunits
;
975 if (oprnd_info
->first_dt
!= vect_internal_def
)
978 if (++this_tree_size
> max_tree_size
)
980 vect_free_oprnd_info (oprnds_info
);
984 child
= vect_create_new_slp_node (oprnd_info
->def_stmts
);
987 vect_free_oprnd_info (oprnds_info
);
991 bool *matches
= XALLOCAVEC (bool, group_size
);
992 if (vect_build_slp_tree (loop_vinfo
, bb_vinfo
, &child
,
993 group_size
, max_nunits
, loads
,
994 vectorization_factor
, matches
,
995 npermutes
, &this_tree_size
, max_tree_size
))
997 oprnd_info
->def_stmts
= vNULL
;
998 SLP_TREE_CHILDREN (*node
).quick_push (child
);
1002 /* If the SLP build for operand zero failed and operand zero
1003 and one can be commutated try that for the scalar stmts
1004 that failed the match. */
1006 /* A first scalar stmt mismatch signals a fatal mismatch. */
1008 /* ??? For COND_EXPRs we can swap the comparison operands
1009 as well as the arms under some constraints. */
1011 && oprnds_info
[1]->first_dt
== vect_internal_def
1012 && is_gimple_assign (stmt
)
1013 && commutative_tree_code (gimple_assign_rhs_code (stmt
))
1014 /* Do so only if the number of not successful permutes was nor more
1015 than a cut-ff as re-trying the recursive match on
1016 possibly each level of the tree would expose exponential
1021 *max_nunits
= old_max_nunits
;
1022 loads
->truncate (old_nloads
);
1023 /* Swap mismatched definition stmts. */
1024 dump_printf_loc (MSG_NOTE
, vect_location
,
1025 "Re-trying with swapped operands of stmts ");
1026 for (unsigned j
= 0; j
< group_size
; ++j
)
1029 gimple tem
= oprnds_info
[0]->def_stmts
[j
];
1030 oprnds_info
[0]->def_stmts
[j
] = oprnds_info
[1]->def_stmts
[j
];
1031 oprnds_info
[1]->def_stmts
[j
] = tem
;
1032 dump_printf (MSG_NOTE
, "%d ", j
);
1034 dump_printf (MSG_NOTE
, "\n");
1035 /* And try again ... */
1036 if (vect_build_slp_tree (loop_vinfo
, bb_vinfo
, &child
,
1037 group_size
, max_nunits
, loads
,
1038 vectorization_factor
,
1039 matches
, npermutes
, &this_tree_size
,
1042 oprnd_info
->def_stmts
= vNULL
;
1043 SLP_TREE_CHILDREN (*node
).quick_push (child
);
1050 oprnd_info
->def_stmts
= vNULL
;
1051 vect_free_slp_tree (child
);
1052 vect_free_oprnd_info (oprnds_info
);
1057 *tree_size
+= this_tree_size
;
1059 vect_free_oprnd_info (oprnds_info
);
1063 /* Dump a slp tree NODE using flags specified in DUMP_KIND. */
1066 vect_print_slp_tree (int dump_kind
, slp_tree node
)
1075 dump_printf (dump_kind
, "node ");
1076 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt
)
1078 dump_printf (dump_kind
, "\n\tstmt %d ", i
);
1079 dump_gimple_stmt (dump_kind
, TDF_SLIM
, stmt
, 0);
1081 dump_printf (dump_kind
, "\n");
1083 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
1084 vect_print_slp_tree (dump_kind
, child
);
1088 /* Mark the tree rooted at NODE with MARK (PURE_SLP or HYBRID).
1089 If MARK is HYBRID, it refers to a specific stmt in NODE (the stmt at index
1090 J). Otherwise, MARK is PURE_SLP and J is -1, which indicates that all the
1091 stmts in NODE are to be marked. */
1094 vect_mark_slp_stmts (slp_tree node
, enum slp_vect_type mark
, int j
)
1103 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt
)
1104 if (j
< 0 || i
== j
)
1105 STMT_SLP_TYPE (vinfo_for_stmt (stmt
)) = mark
;
1107 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
1108 vect_mark_slp_stmts (child
, mark
, j
);
1112 /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */
1115 vect_mark_slp_stmts_relevant (slp_tree node
)
1119 stmt_vec_info stmt_info
;
1125 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt
)
1127 stmt_info
= vinfo_for_stmt (stmt
);
1128 gcc_assert (!STMT_VINFO_RELEVANT (stmt_info
)
1129 || STMT_VINFO_RELEVANT (stmt_info
) == vect_used_in_scope
);
1130 STMT_VINFO_RELEVANT (stmt_info
) = vect_used_in_scope
;
1133 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
1134 vect_mark_slp_stmts_relevant (child
);
1138 /* Rearrange the statements of NODE according to PERMUTATION. */
1141 vect_slp_rearrange_stmts (slp_tree node
, unsigned int group_size
,
1142 vec
<unsigned> permutation
)
1145 vec
<gimple
> tmp_stmts
;
1149 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
1150 vect_slp_rearrange_stmts (child
, group_size
, permutation
);
1152 gcc_assert (group_size
== SLP_TREE_SCALAR_STMTS (node
).length ());
1153 tmp_stmts
.create (group_size
);
1154 tmp_stmts
.quick_grow_cleared (group_size
);
1156 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt
)
1157 tmp_stmts
[permutation
[i
]] = stmt
;
1159 SLP_TREE_SCALAR_STMTS (node
).release ();
1160 SLP_TREE_SCALAR_STMTS (node
) = tmp_stmts
;
1164 /* Check if the required load permutations in the SLP instance
1165 SLP_INSTN are supported. */
1168 vect_supported_load_permutation_p (slp_instance slp_instn
)
1170 unsigned int group_size
= SLP_INSTANCE_GROUP_SIZE (slp_instn
);
1171 unsigned int i
, j
, k
, next
;
1174 gimple stmt
, load
, next_load
, first_load
;
1175 struct data_reference
*dr
;
1177 if (dump_enabled_p ())
1179 dump_printf_loc (MSG_NOTE
, vect_location
, "Load permutation ");
1180 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn
), i
, node
)
1181 if (node
->load_permutation
.exists ())
1182 FOR_EACH_VEC_ELT (node
->load_permutation
, j
, next
)
1183 dump_printf (MSG_NOTE
, "%d ", next
);
1185 for (k
= 0; k
< group_size
; ++k
)
1186 dump_printf (MSG_NOTE
, "%d ", k
);
1187 dump_printf (MSG_NOTE
, "\n");
1190 /* In case of reduction every load permutation is allowed, since the order
1191 of the reduction statements is not important (as opposed to the case of
1192 grouped stores). The only condition we need to check is that all the
1193 load nodes are of the same size and have the same permutation (and then
1194 rearrange all the nodes of the SLP instance according to this
1197 /* Check that all the load nodes are of the same size. */
1198 /* ??? Can't we assert this? */
1199 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn
), i
, node
)
1200 if (SLP_TREE_SCALAR_STMTS (node
).length () != (unsigned) group_size
)
1203 node
= SLP_INSTANCE_TREE (slp_instn
);
1204 stmt
= SLP_TREE_SCALAR_STMTS (node
)[0];
1206 /* Reduction (there are no data-refs in the root).
1207 In reduction chain the order of the loads is important. */
1208 if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt
))
1209 && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)))
1214 /* Compare all the permutation sequences to the first one. We know
1215 that at least one load is permuted. */
1216 node
= SLP_INSTANCE_LOADS (slp_instn
)[0];
1217 if (!node
->load_permutation
.exists ())
1219 for (i
= 1; SLP_INSTANCE_LOADS (slp_instn
).iterate (i
, &load
); ++i
)
1221 if (!load
->load_permutation
.exists ())
1223 FOR_EACH_VEC_ELT (load
->load_permutation
, j
, lidx
)
1224 if (lidx
!= node
->load_permutation
[j
])
1228 /* Check that the loads in the first sequence are different and there
1229 are no gaps between them. */
1230 load_index
= sbitmap_alloc (group_size
);
1231 bitmap_clear (load_index
);
1232 FOR_EACH_VEC_ELT (node
->load_permutation
, i
, lidx
)
1234 if (bitmap_bit_p (load_index
, lidx
))
1236 sbitmap_free (load_index
);
1239 bitmap_set_bit (load_index
, lidx
);
1241 for (i
= 0; i
< group_size
; i
++)
1242 if (!bitmap_bit_p (load_index
, i
))
1244 sbitmap_free (load_index
);
1247 sbitmap_free (load_index
);
1249 /* This permutation is valid for reduction. Since the order of the
1250 statements in the nodes is not important unless they are memory
1251 accesses, we can rearrange the statements in all the nodes
1252 according to the order of the loads. */
1253 vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn
), group_size
,
1254 node
->load_permutation
);
1256 /* We are done, no actual permutations need to be generated. */
1257 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn
), i
, node
)
1258 SLP_TREE_LOAD_PERMUTATION (node
).release ();
1262 /* In basic block vectorization we allow any subchain of an interleaving
1264 FORNOW: not supported in loop SLP because of realignment compications. */
1265 if (STMT_VINFO_BB_VINFO (vinfo_for_stmt (stmt
)))
1267 /* Check that for every node in the instance the loads
1269 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn
), i
, node
)
1272 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), j
, load
)
1274 if (j
!= 0 && next_load
!= load
)
1276 next_load
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (load
));
1280 /* Check that the alignment of the first load in every subchain, i.e.,
1281 the first statement in every load node, is supported.
1282 ??? This belongs in alignment checking. */
1283 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn
), i
, node
)
1285 first_load
= SLP_TREE_SCALAR_STMTS (node
)[0];
1286 if (first_load
!= GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_load
)))
1288 dr
= STMT_VINFO_DATA_REF (vinfo_for_stmt (first_load
));
1289 if (vect_supportable_dr_alignment (dr
, false)
1290 == dr_unaligned_unsupported
)
1292 if (dump_enabled_p ())
1294 dump_printf_loc (MSG_MISSED_OPTIMIZATION
,
1296 "unsupported unaligned load ");
1297 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
1299 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
1306 /* We are done, no actual permutations need to be generated. */
1307 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn
), i
, node
)
1308 SLP_TREE_LOAD_PERMUTATION (node
).release ();
1312 /* FORNOW: the only supported permutation is 0..01..1.. of length equal to
1313 GROUP_SIZE and where each sequence of same drs is of GROUP_SIZE length as
1314 well (unless it's reduction). */
1315 if (SLP_INSTANCE_LOADS (slp_instn
).length () != group_size
)
1317 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn
), i
, node
)
1318 if (!node
->load_permutation
.exists ())
1321 load_index
= sbitmap_alloc (group_size
);
1322 bitmap_clear (load_index
);
1323 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn
), i
, node
)
1325 unsigned int lidx
= node
->load_permutation
[0];
1326 if (bitmap_bit_p (load_index
, lidx
))
1328 sbitmap_free (load_index
);
1331 bitmap_set_bit (load_index
, lidx
);
1332 FOR_EACH_VEC_ELT (node
->load_permutation
, j
, k
)
1335 sbitmap_free (load_index
);
1339 for (i
= 0; i
< group_size
; i
++)
1340 if (!bitmap_bit_p (load_index
, i
))
1342 sbitmap_free (load_index
);
1345 sbitmap_free (load_index
);
1347 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn
), i
, node
)
1348 if (node
->load_permutation
.exists ()
1349 && !vect_transform_slp_perm_load
1351 SLP_INSTANCE_UNROLLING_FACTOR (slp_instn
), slp_instn
, true))
1357 /* Find the first load in the loop that belongs to INSTANCE.
1358 When loads are in several SLP nodes, there can be a case in which the first
1359 load does not appear in the first SLP node to be transformed, causing
1360 incorrect order of statements. Since we generate all the loads together,
1361 they must be inserted before the first load of the SLP instance and not
1362 before the first load of the first node of the instance. */
1365 vect_find_first_load_in_slp_instance (slp_instance instance
)
1369 gimple first_load
= NULL
, load
;
1371 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance
), i
, load_node
)
1372 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node
), j
, load
)
1373 first_load
= get_earlier_stmt (load
, first_load
);
1379 /* Find the last store in SLP INSTANCE. */
1382 vect_find_last_store_in_slp_instance (slp_instance instance
)
1386 gimple last_store
= NULL
, store
;
1388 node
= SLP_INSTANCE_TREE (instance
);
1389 for (i
= 0; SLP_TREE_SCALAR_STMTS (node
).iterate (i
, &store
); i
++)
1390 last_store
= get_later_stmt (store
, last_store
);
1395 /* Compute the cost for the SLP node NODE in the SLP instance INSTANCE. */
1398 vect_analyze_slp_cost_1 (loop_vec_info loop_vinfo
, bb_vec_info bb_vinfo
,
1399 slp_instance instance
, slp_tree node
,
1400 stmt_vector_for_cost
*prologue_cost_vec
,
1401 unsigned ncopies_for_cost
)
1403 stmt_vector_for_cost
*body_cost_vec
= &SLP_INSTANCE_BODY_COST_VEC (instance
);
1408 stmt_vec_info stmt_info
;
1410 unsigned group_size
= SLP_INSTANCE_GROUP_SIZE (instance
);
1412 /* Recurse down the SLP tree. */
1413 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
1414 vect_analyze_slp_cost_1 (loop_vinfo
, bb_vinfo
,
1415 instance
, child
, prologue_cost_vec
,
1418 /* Look at the first scalar stmt to determine the cost. */
1419 stmt
= SLP_TREE_SCALAR_STMTS (node
)[0];
1420 stmt_info
= vinfo_for_stmt (stmt
);
1421 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1423 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info
)))
1424 vect_model_store_cost (stmt_info
, ncopies_for_cost
, false,
1425 vect_uninitialized_def
,
1426 node
, prologue_cost_vec
, body_cost_vec
);
1430 gcc_checking_assert (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info
)));
1431 vect_model_load_cost (stmt_info
, ncopies_for_cost
, false,
1432 node
, prologue_cost_vec
, body_cost_vec
);
1433 /* If the load is permuted record the cost for the permutation.
1434 ??? Loads from multiple chains are let through here only
1435 for a single special case involving complex numbers where
1436 in the end no permutation is necessary. */
1437 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, s
)
1438 if ((STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo_for_stmt (s
))
1439 == STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info
))
1440 && vect_get_place_in_interleaving_chain
1441 (s
, STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info
)) != i
)
1443 record_stmt_cost (body_cost_vec
, group_size
, vec_perm
,
1444 stmt_info
, 0, vect_body
);
1450 record_stmt_cost (body_cost_vec
, ncopies_for_cost
, vector_stmt
,
1451 stmt_info
, 0, vect_body
);
1453 /* Scan operands and account for prologue cost of constants/externals.
1454 ??? This over-estimates cost for multiple uses and should be
1456 lhs
= gimple_get_lhs (stmt
);
1457 for (i
= 0; i
< gimple_num_ops (stmt
); ++i
)
1459 tree def
, op
= gimple_op (stmt
, i
);
1461 enum vect_def_type dt
;
1462 if (!op
|| op
== lhs
)
1464 if (vect_is_simple_use (op
, NULL
, loop_vinfo
, bb_vinfo
,
1465 &def_stmt
, &def
, &dt
)
1466 && (dt
== vect_constant_def
|| dt
== vect_external_def
))
1467 record_stmt_cost (prologue_cost_vec
, 1, vector_stmt
,
1468 stmt_info
, 0, vect_prologue
);
1472 /* Compute the cost for the SLP instance INSTANCE. */
1475 vect_analyze_slp_cost (loop_vec_info loop_vinfo
, bb_vec_info bb_vinfo
,
1476 slp_instance instance
, unsigned nunits
)
1478 stmt_vector_for_cost body_cost_vec
, prologue_cost_vec
;
1479 unsigned ncopies_for_cost
;
1480 stmt_info_for_cost
*si
;
1483 /* Calculate the number of vector stmts to create based on the unrolling
1484 factor (number of vectors is 1 if NUNITS >= GROUP_SIZE, and is
1485 GROUP_SIZE / NUNITS otherwise. */
1486 unsigned group_size
= SLP_INSTANCE_GROUP_SIZE (instance
);
1487 ncopies_for_cost
= least_common_multiple (nunits
, group_size
) / nunits
;
1489 prologue_cost_vec
.create (10);
1490 body_cost_vec
.create (10);
1491 SLP_INSTANCE_BODY_COST_VEC (instance
) = body_cost_vec
;
1492 vect_analyze_slp_cost_1 (loop_vinfo
, bb_vinfo
,
1493 instance
, SLP_INSTANCE_TREE (instance
),
1494 &prologue_cost_vec
, ncopies_for_cost
);
1496 /* Record the prologue costs, which were delayed until we were
1497 sure that SLP was successful. Unlike the body costs, we know
1498 the final values now regardless of the loop vectorization factor. */
1499 void *data
= (loop_vinfo
? LOOP_VINFO_TARGET_COST_DATA (loop_vinfo
)
1500 : BB_VINFO_TARGET_COST_DATA (bb_vinfo
));
1501 FOR_EACH_VEC_ELT (prologue_cost_vec
, i
, si
)
1503 struct _stmt_vec_info
*stmt_info
1504 = si
->stmt
? vinfo_for_stmt (si
->stmt
) : NULL
;
1505 (void) add_stmt_cost (data
, si
->count
, si
->kind
, stmt_info
,
1506 si
->misalign
, vect_prologue
);
1509 prologue_cost_vec
.release ();
1512 /* Analyze an SLP instance starting from a group of grouped stores. Call
1513 vect_build_slp_tree to build a tree of packed stmts if possible.
1514 Return FALSE if it's impossible to SLP any stmt in the loop. */
1517 vect_analyze_slp_instance (loop_vec_info loop_vinfo
, bb_vec_info bb_vinfo
,
1518 gimple stmt
, unsigned max_tree_size
)
1520 slp_instance new_instance
;
1522 unsigned int group_size
= GROUP_SIZE (vinfo_for_stmt (stmt
));
1523 unsigned int unrolling_factor
= 1, nunits
;
1524 tree vectype
, scalar_type
= NULL_TREE
;
1526 unsigned int vectorization_factor
= 0;
1528 unsigned int max_nunits
= 0;
1529 vec
<slp_tree
> loads
;
1530 struct data_reference
*dr
= STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt
));
1531 vec
<gimple
> scalar_stmts
;
1533 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)))
1537 scalar_type
= TREE_TYPE (DR_REF (dr
));
1538 vectype
= get_vectype_for_scalar_type (scalar_type
);
1542 gcc_assert (loop_vinfo
);
1543 vectype
= STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt
));
1546 group_size
= GROUP_SIZE (vinfo_for_stmt (stmt
));
1550 gcc_assert (loop_vinfo
);
1551 vectype
= STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt
));
1552 group_size
= LOOP_VINFO_REDUCTIONS (loop_vinfo
).length ();
1557 if (dump_enabled_p ())
1559 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1560 "Build SLP failed: unsupported data-type ");
1561 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, scalar_type
);
1562 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
1568 nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
1570 vectorization_factor
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
1572 vectorization_factor
= nunits
;
1574 /* Calculate the unrolling factor. */
1575 unrolling_factor
= least_common_multiple (nunits
, group_size
) / group_size
;
1576 if (unrolling_factor
!= 1 && !loop_vinfo
)
1578 if (dump_enabled_p ())
1579 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1580 "Build SLP failed: unrolling required in basic"
1586 /* Create a node (a root of the SLP tree) for the packed grouped stores. */
1587 scalar_stmts
.create (group_size
);
1589 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)))
1591 /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS. */
1594 if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next
))
1595 && STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next
)))
1596 scalar_stmts
.safe_push (
1597 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next
)));
1599 scalar_stmts
.safe_push (next
);
1600 next
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (next
));
1605 /* Collect reduction statements. */
1606 vec
<gimple
> reductions
= LOOP_VINFO_REDUCTIONS (loop_vinfo
);
1607 for (i
= 0; reductions
.iterate (i
, &next
); i
++)
1608 scalar_stmts
.safe_push (next
);
1611 node
= vect_create_new_slp_node (scalar_stmts
);
1613 loads
.create (group_size
);
1615 /* Build the tree for the SLP instance. */
1616 if (vect_build_slp_tree (loop_vinfo
, bb_vinfo
, &node
, group_size
,
1617 &max_nunits
, &loads
,
1618 vectorization_factor
, NULL
, NULL
, NULL
,
1621 /* Calculate the unrolling factor based on the smallest type. */
1622 if (max_nunits
> nunits
)
1623 unrolling_factor
= least_common_multiple (max_nunits
, group_size
)
1626 if (unrolling_factor
!= 1 && !loop_vinfo
)
1628 if (dump_enabled_p ())
1629 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1630 "Build SLP failed: unrolling required in basic"
1632 vect_free_slp_tree (node
);
1637 /* Create a new SLP instance. */
1638 new_instance
= XNEW (struct _slp_instance
);
1639 SLP_INSTANCE_TREE (new_instance
) = node
;
1640 SLP_INSTANCE_GROUP_SIZE (new_instance
) = group_size
;
1641 SLP_INSTANCE_UNROLLING_FACTOR (new_instance
) = unrolling_factor
;
1642 SLP_INSTANCE_BODY_COST_VEC (new_instance
) = vNULL
;
1643 SLP_INSTANCE_LOADS (new_instance
) = loads
;
1644 SLP_INSTANCE_FIRST_LOAD_STMT (new_instance
) = NULL
;
1646 /* Compute the load permutation. */
1648 bool loads_permuted
= false;
1649 FOR_EACH_VEC_ELT (loads
, i
, load_node
)
1651 vec
<unsigned> load_permutation
;
1653 gimple load
, first_stmt
;
1654 bool this_load_permuted
= false;
1655 load_permutation
.create (group_size
);
1656 first_stmt
= GROUP_FIRST_ELEMENT
1657 (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (load_node
)[0]));
1658 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node
), j
, load
)
1661 = vect_get_place_in_interleaving_chain (load
, first_stmt
);
1662 gcc_assert (load_place
!= -1);
1663 if (load_place
!= j
)
1664 this_load_permuted
= true;
1665 load_permutation
.safe_push (load_place
);
1667 if (!this_load_permuted
)
1669 load_permutation
.release ();
1672 SLP_TREE_LOAD_PERMUTATION (load_node
) = load_permutation
;
1673 loads_permuted
= true;
1678 if (!vect_supported_load_permutation_p (new_instance
))
1680 if (dump_enabled_p ())
1682 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1683 "Build SLP failed: unsupported load "
1685 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
1686 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
1688 vect_free_slp_instance (new_instance
);
1692 SLP_INSTANCE_FIRST_LOAD_STMT (new_instance
)
1693 = vect_find_first_load_in_slp_instance (new_instance
);
1696 /* Compute the costs of this SLP instance. */
1697 vect_analyze_slp_cost (loop_vinfo
, bb_vinfo
,
1698 new_instance
, TYPE_VECTOR_SUBPARTS (vectype
));
1701 LOOP_VINFO_SLP_INSTANCES (loop_vinfo
).safe_push (new_instance
);
1703 BB_VINFO_SLP_INSTANCES (bb_vinfo
).safe_push (new_instance
);
1705 if (dump_enabled_p ())
1706 vect_print_slp_tree (MSG_NOTE
, node
);
1711 /* Failed to SLP. */
1712 /* Free the allocated memory. */
1713 vect_free_slp_tree (node
);
1720 /* Check if there are stmts in the loop can be vectorized using SLP. Build SLP
1721 trees of packed scalar stmts if SLP is possible. */
1724 vect_analyze_slp (loop_vec_info loop_vinfo
, bb_vec_info bb_vinfo
,
1725 unsigned max_tree_size
)
1728 vec
<gimple
> grouped_stores
;
1729 vec
<gimple
> reductions
= vNULL
;
1730 vec
<gimple
> reduc_chains
= vNULL
;
1731 gimple first_element
;
1734 if (dump_enabled_p ())
1735 dump_printf_loc (MSG_NOTE
, vect_location
, "=== vect_analyze_slp ===\n");
1739 grouped_stores
= LOOP_VINFO_GROUPED_STORES (loop_vinfo
);
1740 reduc_chains
= LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo
);
1741 reductions
= LOOP_VINFO_REDUCTIONS (loop_vinfo
);
1744 grouped_stores
= BB_VINFO_GROUPED_STORES (bb_vinfo
);
1746 /* Find SLP sequences starting from groups of grouped stores. */
1747 FOR_EACH_VEC_ELT (grouped_stores
, i
, first_element
)
1748 if (vect_analyze_slp_instance (loop_vinfo
, bb_vinfo
, first_element
,
1752 if (bb_vinfo
&& !ok
)
1754 if (dump_enabled_p ())
1755 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1756 "Failed to SLP the basic block.\n");
1762 && LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo
).length () > 0)
1764 /* Find SLP sequences starting from reduction chains. */
1765 FOR_EACH_VEC_ELT (reduc_chains
, i
, first_element
)
1766 if (vect_analyze_slp_instance (loop_vinfo
, bb_vinfo
, first_element
,
1772 /* Don't try to vectorize SLP reductions if reduction chain was
1777 /* Find SLP sequences starting from groups of reductions. */
1778 if (loop_vinfo
&& LOOP_VINFO_REDUCTIONS (loop_vinfo
).length () > 1
1779 && vect_analyze_slp_instance (loop_vinfo
, bb_vinfo
, reductions
[0],
1787 /* For each possible SLP instance decide whether to SLP it and calculate overall
1788 unrolling factor needed to SLP the loop. Return TRUE if decided to SLP at
1789 least one instance. */
1792 vect_make_slp_decision (loop_vec_info loop_vinfo
)
1794 unsigned int i
, unrolling_factor
= 1;
1795 vec
<slp_instance
> slp_instances
= LOOP_VINFO_SLP_INSTANCES (loop_vinfo
);
1796 slp_instance instance
;
1797 int decided_to_slp
= 0;
1799 if (dump_enabled_p ())
1800 dump_printf_loc (MSG_NOTE
, vect_location
, "=== vect_make_slp_decision ==="
1803 FOR_EACH_VEC_ELT (slp_instances
, i
, instance
)
1805 /* FORNOW: SLP if you can. */
1806 if (unrolling_factor
< SLP_INSTANCE_UNROLLING_FACTOR (instance
))
1807 unrolling_factor
= SLP_INSTANCE_UNROLLING_FACTOR (instance
);
1809 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
1810 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
1811 loop-based vectorization. Such stmts will be marked as HYBRID. */
1812 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance
), pure_slp
, -1);
1816 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo
) = unrolling_factor
;
1818 if (decided_to_slp
&& dump_enabled_p ())
1819 dump_printf_loc (MSG_NOTE
, vect_location
,
1820 "Decided to SLP %d instances. Unrolling factor %d\n",
1821 decided_to_slp
, unrolling_factor
);
1823 return (decided_to_slp
> 0);
1827 /* Find stmts that must be both vectorized and SLPed (since they feed stmts that
1828 can't be SLPed) in the tree rooted at NODE. Mark such stmts as HYBRID. */
1831 vect_detect_hybrid_slp_stmts (slp_tree node
)
1834 vec
<gimple
> stmts
= SLP_TREE_SCALAR_STMTS (node
);
1835 gimple stmt
= stmts
[0];
1836 imm_use_iterator imm_iter
;
1838 stmt_vec_info stmt_vinfo
= vinfo_for_stmt (stmt
);
1840 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_vinfo
);
1841 struct loop
*loop
= NULL
;
1842 bb_vec_info bb_vinfo
= STMT_VINFO_BB_VINFO (stmt_vinfo
);
1843 basic_block bb
= NULL
;
1849 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1851 bb
= BB_VINFO_BB (bb_vinfo
);
1853 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt
)
1854 if (PURE_SLP_STMT (vinfo_for_stmt (stmt
))
1855 && TREE_CODE (gimple_op (stmt
, 0)) == SSA_NAME
)
1856 FOR_EACH_IMM_USE_STMT (use_stmt
, imm_iter
, gimple_op (stmt
, 0))
1857 if (gimple_bb (use_stmt
)
1858 && ((loop
&& flow_bb_inside_loop_p (loop
, gimple_bb (use_stmt
)))
1859 || bb
== gimple_bb (use_stmt
))
1860 && (stmt_vinfo
= vinfo_for_stmt (use_stmt
))
1861 && !STMT_SLP_TYPE (stmt_vinfo
)
1862 && (STMT_VINFO_RELEVANT (stmt_vinfo
)
1863 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_vinfo
))
1864 || (STMT_VINFO_IN_PATTERN_P (stmt_vinfo
)
1865 && STMT_VINFO_RELATED_STMT (stmt_vinfo
)
1866 && !STMT_SLP_TYPE (vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_vinfo
)))))
1867 && !(gimple_code (use_stmt
) == GIMPLE_PHI
1868 && STMT_VINFO_DEF_TYPE (stmt_vinfo
)
1869 == vect_reduction_def
))
1870 vect_mark_slp_stmts (node
, hybrid
, i
);
1872 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
1873 vect_detect_hybrid_slp_stmts (child
);
1877 /* Find stmts that must be both vectorized and SLPed. */
1880 vect_detect_hybrid_slp (loop_vec_info loop_vinfo
)
1883 vec
<slp_instance
> slp_instances
= LOOP_VINFO_SLP_INSTANCES (loop_vinfo
);
1884 slp_instance instance
;
1886 if (dump_enabled_p ())
1887 dump_printf_loc (MSG_NOTE
, vect_location
, "=== vect_detect_hybrid_slp ==="
1890 FOR_EACH_VEC_ELT (slp_instances
, i
, instance
)
1891 vect_detect_hybrid_slp_stmts (SLP_INSTANCE_TREE (instance
));
1895 /* Create and initialize a new bb_vec_info struct for BB, as well as
1896 stmt_vec_info structs for all the stmts in it. */
1899 new_bb_vec_info (basic_block bb
)
1901 bb_vec_info res
= NULL
;
1902 gimple_stmt_iterator gsi
;
1904 res
= (bb_vec_info
) xcalloc (1, sizeof (struct _bb_vec_info
));
1905 BB_VINFO_BB (res
) = bb
;
1907 for (gsi
= gsi_start_bb (bb
); !gsi_end_p (gsi
); gsi_next (&gsi
))
1909 gimple stmt
= gsi_stmt (gsi
);
1910 gimple_set_uid (stmt
, 0);
1911 set_vinfo_for_stmt (stmt
, new_stmt_vec_info (stmt
, NULL
, res
));
1914 BB_VINFO_GROUPED_STORES (res
).create (10);
1915 BB_VINFO_SLP_INSTANCES (res
).create (2);
1916 BB_VINFO_TARGET_COST_DATA (res
) = init_cost (NULL
);
1923 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
1924 stmts in the basic block. */
1927 destroy_bb_vec_info (bb_vec_info bb_vinfo
)
1929 vec
<slp_instance
> slp_instances
;
1930 slp_instance instance
;
1932 gimple_stmt_iterator si
;
1938 bb
= BB_VINFO_BB (bb_vinfo
);
1940 for (si
= gsi_start_bb (bb
); !gsi_end_p (si
); gsi_next (&si
))
1942 gimple stmt
= gsi_stmt (si
);
1943 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
1946 /* Free stmt_vec_info. */
1947 free_stmt_vec_info (stmt
);
1950 vect_destroy_datarefs (NULL
, bb_vinfo
);
1951 free_dependence_relations (BB_VINFO_DDRS (bb_vinfo
));
1952 BB_VINFO_GROUPED_STORES (bb_vinfo
).release ();
1953 slp_instances
= BB_VINFO_SLP_INSTANCES (bb_vinfo
);
1954 FOR_EACH_VEC_ELT (slp_instances
, i
, instance
)
1955 vect_free_slp_instance (instance
);
1956 BB_VINFO_SLP_INSTANCES (bb_vinfo
).release ();
1957 destroy_cost_data (BB_VINFO_TARGET_COST_DATA (bb_vinfo
));
1963 /* Analyze statements contained in SLP tree node after recursively analyzing
1964 the subtree. Return TRUE if the operations are supported. */
1967 vect_slp_analyze_node_operations (bb_vec_info bb_vinfo
, slp_tree node
)
1977 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
1978 if (!vect_slp_analyze_node_operations (bb_vinfo
, child
))
1981 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt
)
1983 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
1984 gcc_assert (stmt_info
);
1985 gcc_assert (PURE_SLP_STMT (stmt_info
));
1987 if (!vect_analyze_stmt (stmt
, &dummy
, node
))
1995 /* Analyze statements in SLP instances of the basic block. Return TRUE if the
1996 operations are supported. */
1999 vect_slp_analyze_operations (bb_vec_info bb_vinfo
)
2001 vec
<slp_instance
> slp_instances
= BB_VINFO_SLP_INSTANCES (bb_vinfo
);
2002 slp_instance instance
;
2005 for (i
= 0; slp_instances
.iterate (i
, &instance
); )
2007 if (!vect_slp_analyze_node_operations (bb_vinfo
,
2008 SLP_INSTANCE_TREE (instance
)))
2010 vect_free_slp_instance (instance
);
2011 slp_instances
.ordered_remove (i
);
2017 if (!slp_instances
.length ())
2024 /* Compute the scalar cost of the SLP node NODE and its children
2025 and return it. Do not account defs that are marked in LIFE and
2026 update LIFE according to uses of NODE. */
2029 vect_bb_slp_scalar_cost (basic_block bb
,
2030 slp_tree node
, vec
<bool, va_heap
> *life
)
2032 unsigned scalar_cost
= 0;
2037 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt
)
2040 ssa_op_iter op_iter
;
2041 def_operand_p def_p
;
2042 stmt_vec_info stmt_info
;
2047 /* If there is a non-vectorized use of the defs then the scalar
2048 stmt is kept live in which case we do not account it or any
2049 required defs in the SLP children in the scalar cost. This
2050 way we make the vectorization more costly when compared to
2052 FOR_EACH_SSA_DEF_OPERAND (def_p
, stmt
, op_iter
, SSA_OP_DEF
)
2054 imm_use_iterator use_iter
;
2056 FOR_EACH_IMM_USE_STMT (use_stmt
, use_iter
, DEF_FROM_PTR (def_p
))
2057 if (!is_gimple_debug (use_stmt
)
2058 && (gimple_code (use_stmt
) == GIMPLE_PHI
2059 || gimple_bb (use_stmt
) != bb
2060 || !STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (use_stmt
))))
2063 BREAK_FROM_IMM_USE_STMT (use_iter
);
2069 stmt_info
= vinfo_for_stmt (stmt
);
2070 if (STMT_VINFO_DATA_REF (stmt_info
))
2072 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info
)))
2073 stmt_cost
= vect_get_stmt_cost (scalar_load
);
2075 stmt_cost
= vect_get_stmt_cost (scalar_store
);
2078 stmt_cost
= vect_get_stmt_cost (scalar_stmt
);
2080 scalar_cost
+= stmt_cost
;
2083 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
2084 scalar_cost
+= vect_bb_slp_scalar_cost (bb
, child
, life
);
2089 /* Check if vectorization of the basic block is profitable. */
2092 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo
)
2094 vec
<slp_instance
> slp_instances
= BB_VINFO_SLP_INSTANCES (bb_vinfo
);
2095 slp_instance instance
;
2097 unsigned int vec_inside_cost
= 0, vec_outside_cost
= 0, scalar_cost
= 0;
2098 unsigned int vec_prologue_cost
= 0, vec_epilogue_cost
= 0;
2099 void *target_cost_data
= BB_VINFO_TARGET_COST_DATA (bb_vinfo
);
2100 stmt_vec_info stmt_info
= NULL
;
2101 stmt_vector_for_cost body_cost_vec
;
2102 stmt_info_for_cost
*ci
;
2104 /* Calculate vector costs. */
2105 FOR_EACH_VEC_ELT (slp_instances
, i
, instance
)
2107 body_cost_vec
= SLP_INSTANCE_BODY_COST_VEC (instance
);
2109 FOR_EACH_VEC_ELT (body_cost_vec
, j
, ci
)
2111 stmt_info
= ci
->stmt
? vinfo_for_stmt (ci
->stmt
) : NULL
;
2112 (void) add_stmt_cost (target_cost_data
, ci
->count
, ci
->kind
,
2113 stmt_info
, ci
->misalign
, vect_body
);
2117 /* Calculate scalar cost. */
2118 FOR_EACH_VEC_ELT (slp_instances
, i
, instance
)
2120 auto_vec
<bool, 20> life
;
2121 life
.safe_grow_cleared (SLP_INSTANCE_GROUP_SIZE (instance
));
2122 scalar_cost
+= vect_bb_slp_scalar_cost (BB_VINFO_BB (bb_vinfo
),
2123 SLP_INSTANCE_TREE (instance
),
2127 /* Complete the target-specific cost calculation. */
2128 finish_cost (BB_VINFO_TARGET_COST_DATA (bb_vinfo
), &vec_prologue_cost
,
2129 &vec_inside_cost
, &vec_epilogue_cost
);
2131 vec_outside_cost
= vec_prologue_cost
+ vec_epilogue_cost
;
2133 if (dump_enabled_p ())
2135 dump_printf_loc (MSG_NOTE
, vect_location
, "Cost model analysis: \n");
2136 dump_printf (MSG_NOTE
, " Vector inside of basic block cost: %d\n",
2138 dump_printf (MSG_NOTE
, " Vector prologue cost: %d\n", vec_prologue_cost
);
2139 dump_printf (MSG_NOTE
, " Vector epilogue cost: %d\n", vec_epilogue_cost
);
2140 dump_printf (MSG_NOTE
, " Scalar cost of basic block: %d\n", scalar_cost
);
2143 /* Vectorization is profitable if its cost is less than the cost of scalar
2145 if (vec_outside_cost
+ vec_inside_cost
>= scalar_cost
)
2151 /* Check if the basic block can be vectorized. */
2154 vect_slp_analyze_bb_1 (basic_block bb
)
2156 bb_vec_info bb_vinfo
;
2157 vec
<slp_instance
> slp_instances
;
2158 slp_instance instance
;
2161 unsigned n_stmts
= 0;
2163 bb_vinfo
= new_bb_vec_info (bb
);
2167 if (!vect_analyze_data_refs (NULL
, bb_vinfo
, &min_vf
, &n_stmts
))
2169 if (dump_enabled_p ())
2170 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2171 "not vectorized: unhandled data-ref in basic "
2174 destroy_bb_vec_info (bb_vinfo
);
2178 if (BB_VINFO_DATAREFS (bb_vinfo
).length () < 2)
2180 if (dump_enabled_p ())
2181 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2182 "not vectorized: not enough data-refs in "
2185 destroy_bb_vec_info (bb_vinfo
);
2189 if (!vect_analyze_data_ref_accesses (NULL
, bb_vinfo
))
2191 if (dump_enabled_p ())
2192 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2193 "not vectorized: unhandled data access in "
2196 destroy_bb_vec_info (bb_vinfo
);
2200 vect_pattern_recog (NULL
, bb_vinfo
);
2202 if (!vect_analyze_data_refs_alignment (NULL
, bb_vinfo
))
2204 if (dump_enabled_p ())
2205 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2206 "not vectorized: bad data alignment in basic "
2209 destroy_bb_vec_info (bb_vinfo
);
2213 /* Check the SLP opportunities in the basic block, analyze and build SLP
2215 if (!vect_analyze_slp (NULL
, bb_vinfo
, n_stmts
))
2217 if (dump_enabled_p ())
2218 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2219 "not vectorized: failed to find SLP opportunities "
2220 "in basic block.\n");
2222 destroy_bb_vec_info (bb_vinfo
);
2226 slp_instances
= BB_VINFO_SLP_INSTANCES (bb_vinfo
);
2228 /* Mark all the statements that we want to vectorize as pure SLP and
2230 FOR_EACH_VEC_ELT (slp_instances
, i
, instance
)
2232 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance
), pure_slp
, -1);
2233 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance
));
2236 /* Mark all the statements that we do not want to vectorize. */
2237 for (gimple_stmt_iterator gsi
= gsi_start_bb (BB_VINFO_BB (bb_vinfo
));
2238 !gsi_end_p (gsi
); gsi_next (&gsi
))
2240 stmt_vec_info vinfo
= vinfo_for_stmt (gsi_stmt (gsi
));
2241 if (STMT_SLP_TYPE (vinfo
) != pure_slp
)
2242 STMT_VINFO_VECTORIZABLE (vinfo
) = false;
2245 /* Analyze dependences. At this point all stmts not participating in
2246 vectorization have to be marked. Dependence analysis assumes
2247 that we either vectorize all SLP instances or none at all. */
2248 if (!vect_slp_analyze_data_ref_dependences (bb_vinfo
))
2250 if (dump_enabled_p ())
2251 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2252 "not vectorized: unhandled data dependence "
2253 "in basic block.\n");
2255 destroy_bb_vec_info (bb_vinfo
);
2259 if (!vect_verify_datarefs_alignment (NULL
, bb_vinfo
))
2261 if (dump_enabled_p ())
2262 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2263 "not vectorized: unsupported alignment in basic "
2265 destroy_bb_vec_info (bb_vinfo
);
2269 if (!vect_slp_analyze_operations (bb_vinfo
))
2271 if (dump_enabled_p ())
2272 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2273 "not vectorized: bad operation in basic block.\n");
2275 destroy_bb_vec_info (bb_vinfo
);
2279 /* Cost model: check if the vectorization is worthwhile. */
2280 if (!unlimited_cost_model (NULL
)
2281 && !vect_bb_vectorization_profitable_p (bb_vinfo
))
2283 if (dump_enabled_p ())
2284 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2285 "not vectorized: vectorization is not "
2288 destroy_bb_vec_info (bb_vinfo
);
2292 if (dump_enabled_p ())
2293 dump_printf_loc (MSG_NOTE
, vect_location
,
2294 "Basic block will be vectorized using SLP\n");
2301 vect_slp_analyze_bb (basic_block bb
)
2303 bb_vec_info bb_vinfo
;
2305 gimple_stmt_iterator gsi
;
2306 unsigned int vector_sizes
;
2308 if (dump_enabled_p ())
2309 dump_printf_loc (MSG_NOTE
, vect_location
, "===vect_slp_analyze_bb===\n");
2311 for (gsi
= gsi_start_bb (bb
); !gsi_end_p (gsi
); gsi_next (&gsi
))
2313 gimple stmt
= gsi_stmt (gsi
);
2314 if (!is_gimple_debug (stmt
)
2315 && !gimple_nop_p (stmt
)
2316 && gimple_code (stmt
) != GIMPLE_LABEL
)
2320 if (insns
> PARAM_VALUE (PARAM_SLP_MAX_INSNS_IN_BB
))
2322 if (dump_enabled_p ())
2323 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2324 "not vectorized: too many instructions in "
2330 /* Autodetect first vector size we try. */
2331 current_vector_size
= 0;
2332 vector_sizes
= targetm
.vectorize
.autovectorize_vector_sizes ();
2336 bb_vinfo
= vect_slp_analyze_bb_1 (bb
);
2340 destroy_bb_vec_info (bb_vinfo
);
2342 vector_sizes
&= ~current_vector_size
;
2343 if (vector_sizes
== 0
2344 || current_vector_size
== 0)
2347 /* Try the next biggest vector size. */
2348 current_vector_size
= 1 << floor_log2 (vector_sizes
);
2349 if (dump_enabled_p ())
2350 dump_printf_loc (MSG_NOTE
, vect_location
,
2351 "***** Re-trying analysis with "
2352 "vector size %d\n", current_vector_size
);
2357 /* SLP costs are calculated according to SLP instance unrolling factor (i.e.,
2358 the number of created vector stmts depends on the unrolling factor).
2359 However, the actual number of vector stmts for every SLP node depends on
2360 VF which is set later in vect_analyze_operations (). Hence, SLP costs
2361 should be updated. In this function we assume that the inside costs
2362 calculated in vect_model_xxx_cost are linear in ncopies. */
2365 vect_update_slp_costs_according_to_vf (loop_vec_info loop_vinfo
)
2367 unsigned int i
, j
, vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
2368 vec
<slp_instance
> slp_instances
= LOOP_VINFO_SLP_INSTANCES (loop_vinfo
);
2369 slp_instance instance
;
2370 stmt_vector_for_cost body_cost_vec
;
2371 stmt_info_for_cost
*si
;
2372 void *data
= LOOP_VINFO_TARGET_COST_DATA (loop_vinfo
);
2374 if (dump_enabled_p ())
2375 dump_printf_loc (MSG_NOTE
, vect_location
,
2376 "=== vect_update_slp_costs_according_to_vf ===\n");
2378 FOR_EACH_VEC_ELT (slp_instances
, i
, instance
)
2380 /* We assume that costs are linear in ncopies. */
2381 int ncopies
= vf
/ SLP_INSTANCE_UNROLLING_FACTOR (instance
);
2383 /* Record the instance's instructions in the target cost model.
2384 This was delayed until here because the count of instructions
2385 isn't known beforehand. */
2386 body_cost_vec
= SLP_INSTANCE_BODY_COST_VEC (instance
);
2388 FOR_EACH_VEC_ELT (body_cost_vec
, j
, si
)
2389 (void) add_stmt_cost (data
, si
->count
* ncopies
, si
->kind
,
2390 vinfo_for_stmt (si
->stmt
), si
->misalign
,
2396 /* For constant and loop invariant defs of SLP_NODE this function returns
2397 (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts.
2398 OP_NUM determines if we gather defs for operand 0 or operand 1 of the RHS of
2399 scalar stmts. NUMBER_OF_VECTORS is the number of vector defs to create.
2400 REDUC_INDEX is the index of the reduction operand in the statements, unless
2404 vect_get_constant_vectors (tree op
, slp_tree slp_node
,
2405 vec
<tree
> *vec_oprnds
,
2406 unsigned int op_num
, unsigned int number_of_vectors
,
2409 vec
<gimple
> stmts
= SLP_TREE_SCALAR_STMTS (slp_node
);
2410 gimple stmt
= stmts
[0];
2411 stmt_vec_info stmt_vinfo
= vinfo_for_stmt (stmt
);
2415 unsigned j
, number_of_places_left_in_vector
;
2418 int group_size
= stmts
.length ();
2419 unsigned int vec_num
, i
;
2420 unsigned number_of_copies
= 1;
2422 voprnds
.create (number_of_vectors
);
2423 bool constant_p
, is_store
;
2424 tree neutral_op
= NULL
;
2425 enum tree_code code
= gimple_expr_code (stmt
);
2428 gimple_seq ctor_seq
= NULL
;
2430 if (STMT_VINFO_DEF_TYPE (stmt_vinfo
) == vect_reduction_def
2431 && reduc_index
!= -1)
2433 op_num
= reduc_index
- 1;
2434 op
= gimple_op (stmt
, reduc_index
);
2435 /* For additional copies (see the explanation of NUMBER_OF_COPIES below)
2436 we need either neutral operands or the original operands. See
2437 get_initial_def_for_reduction() for details. */
2440 case WIDEN_SUM_EXPR
:
2446 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (op
)))
2447 neutral_op
= build_real (TREE_TYPE (op
), dconst0
);
2449 neutral_op
= build_int_cst (TREE_TYPE (op
), 0);
2454 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (op
)))
2455 neutral_op
= build_real (TREE_TYPE (op
), dconst1
);
2457 neutral_op
= build_int_cst (TREE_TYPE (op
), 1);
2462 neutral_op
= build_int_cst (TREE_TYPE (op
), -1);
2465 /* For MIN/MAX we don't have an easy neutral operand but
2466 the initial values can be used fine here. Only for
2467 a reduction chain we have to force a neutral element. */
2470 if (!GROUP_FIRST_ELEMENT (stmt_vinfo
))
2474 def_stmt
= SSA_NAME_DEF_STMT (op
);
2475 loop
= (gimple_bb (stmt
))->loop_father
;
2476 neutral_op
= PHI_ARG_DEF_FROM_EDGE (def_stmt
,
2477 loop_preheader_edge (loop
));
2486 if (STMT_VINFO_DATA_REF (stmt_vinfo
))
2489 op
= gimple_assign_rhs1 (stmt
);
2496 if (CONSTANT_CLASS_P (op
))
2501 vector_type
= get_vectype_for_scalar_type (TREE_TYPE (op
));
2502 gcc_assert (vector_type
);
2503 nunits
= TYPE_VECTOR_SUBPARTS (vector_type
);
2505 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
2506 created vectors. It is greater than 1 if unrolling is performed.
2508 For example, we have two scalar operands, s1 and s2 (e.g., group of
2509 strided accesses of size two), while NUNITS is four (i.e., four scalars
2510 of this type can be packed in a vector). The output vector will contain
2511 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
2514 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
2515 containing the operands.
2517 For example, NUNITS is four as before, and the group size is 8
2518 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
2519 {s5, s6, s7, s8}. */
2521 number_of_copies
= least_common_multiple (nunits
, group_size
) / group_size
;
2523 number_of_places_left_in_vector
= nunits
;
2524 elts
= XALLOCAVEC (tree
, nunits
);
2525 for (j
= 0; j
< number_of_copies
; j
++)
2527 for (i
= group_size
- 1; stmts
.iterate (i
, &stmt
); i
--)
2530 op
= gimple_assign_rhs1 (stmt
);
2536 if (op_num
== 0 || op_num
== 1)
2538 tree cond
= gimple_assign_rhs1 (stmt
);
2539 op
= TREE_OPERAND (cond
, op_num
);
2544 op
= gimple_assign_rhs2 (stmt
);
2546 op
= gimple_assign_rhs3 (stmt
);
2551 op
= gimple_call_arg (stmt
, op_num
);
2558 op
= gimple_op (stmt
, op_num
+ 1);
2559 /* Unlike the other binary operators, shifts/rotates have
2560 the shift count being int, instead of the same type as
2561 the lhs, so make sure the scalar is the right type if
2562 we are dealing with vectors of
2563 long long/long/short/char. */
2564 if (op_num
== 1 && TREE_CODE (op
) == INTEGER_CST
)
2565 op
= fold_convert (TREE_TYPE (vector_type
), op
);
2569 op
= gimple_op (stmt
, op_num
+ 1);
2574 if (reduc_index
!= -1)
2576 loop
= (gimple_bb (stmt
))->loop_father
;
2577 def_stmt
= SSA_NAME_DEF_STMT (op
);
2581 /* Get the def before the loop. In reduction chain we have only
2582 one initial value. */
2583 if ((j
!= (number_of_copies
- 1)
2584 || (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
))
2589 op
= PHI_ARG_DEF_FROM_EDGE (def_stmt
,
2590 loop_preheader_edge (loop
));
2593 /* Create 'vect_ = {op0,op1,...,opn}'. */
2594 number_of_places_left_in_vector
--;
2595 if (!types_compatible_p (TREE_TYPE (vector_type
), TREE_TYPE (op
)))
2597 if (CONSTANT_CLASS_P (op
))
2599 op
= fold_unary (VIEW_CONVERT_EXPR
,
2600 TREE_TYPE (vector_type
), op
);
2601 gcc_assert (op
&& CONSTANT_CLASS_P (op
));
2606 = make_ssa_name (TREE_TYPE (vector_type
), NULL
);
2608 op
= build1 (VIEW_CONVERT_EXPR
, TREE_TYPE (vector_type
),
2611 = gimple_build_assign_with_ops (VIEW_CONVERT_EXPR
,
2612 new_temp
, op
, NULL_TREE
);
2613 gimple_seq_add_stmt (&ctor_seq
, init_stmt
);
2617 elts
[number_of_places_left_in_vector
] = op
;
2618 if (!CONSTANT_CLASS_P (op
))
2621 if (number_of_places_left_in_vector
== 0)
2623 number_of_places_left_in_vector
= nunits
;
2626 vec_cst
= build_vector (vector_type
, elts
);
2629 vec
<constructor_elt
, va_gc
> *v
;
2631 vec_alloc (v
, nunits
);
2632 for (k
= 0; k
< nunits
; ++k
)
2633 CONSTRUCTOR_APPEND_ELT (v
, NULL_TREE
, elts
[k
]);
2634 vec_cst
= build_constructor (vector_type
, v
);
2636 voprnds
.quick_push (vect_init_vector (stmt
, vec_cst
,
2637 vector_type
, NULL
));
2638 if (ctor_seq
!= NULL
)
2640 gimple init_stmt
= SSA_NAME_DEF_STMT (voprnds
.last ());
2641 gimple_stmt_iterator gsi
= gsi_for_stmt (init_stmt
);
2642 gsi_insert_seq_before_without_update (&gsi
, ctor_seq
,
2650 /* Since the vectors are created in the reverse order, we should invert
2652 vec_num
= voprnds
.length ();
2653 for (j
= vec_num
; j
!= 0; j
--)
2655 vop
= voprnds
[j
- 1];
2656 vec_oprnds
->quick_push (vop
);
2661 /* In case that VF is greater than the unrolling factor needed for the SLP
2662 group of stmts, NUMBER_OF_VECTORS to be created is greater than
2663 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
2664 to replicate the vectors. */
2665 while (number_of_vectors
> vec_oprnds
->length ())
2667 tree neutral_vec
= NULL
;
2672 neutral_vec
= build_vector_from_val (vector_type
, neutral_op
);
2674 vec_oprnds
->quick_push (neutral_vec
);
2678 for (i
= 0; vec_oprnds
->iterate (i
, &vop
) && i
< vec_num
; i
++)
2679 vec_oprnds
->quick_push (vop
);
2685 /* Get vectorized definitions from SLP_NODE that contains corresponding
2686 vectorized def-stmts. */
2689 vect_get_slp_vect_defs (slp_tree slp_node
, vec
<tree
> *vec_oprnds
)
2692 gimple vec_def_stmt
;
2695 gcc_assert (SLP_TREE_VEC_STMTS (slp_node
).exists ());
2697 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node
), i
, vec_def_stmt
)
2699 gcc_assert (vec_def_stmt
);
2700 vec_oprnd
= gimple_get_lhs (vec_def_stmt
);
2701 vec_oprnds
->quick_push (vec_oprnd
);
2706 /* Get vectorized definitions for SLP_NODE.
2707 If the scalar definitions are loop invariants or constants, collect them and
2708 call vect_get_constant_vectors() to create vector stmts.
2709 Otherwise, the def-stmts must be already vectorized and the vectorized stmts
2710 must be stored in the corresponding child of SLP_NODE, and we call
2711 vect_get_slp_vect_defs () to retrieve them. */
2714 vect_get_slp_defs (vec
<tree
> ops
, slp_tree slp_node
,
2715 vec
<vec
<tree
> > *vec_oprnds
, int reduc_index
)
2718 int number_of_vects
= 0, i
;
2719 unsigned int child_index
= 0;
2720 HOST_WIDE_INT lhs_size_unit
, rhs_size_unit
;
2721 slp_tree child
= NULL
;
2724 bool vectorized_defs
;
2726 first_stmt
= SLP_TREE_SCALAR_STMTS (slp_node
)[0];
2727 FOR_EACH_VEC_ELT (ops
, i
, oprnd
)
2729 /* For each operand we check if it has vectorized definitions in a child
2730 node or we need to create them (for invariants and constants). We
2731 check if the LHS of the first stmt of the next child matches OPRND.
2732 If it does, we found the correct child. Otherwise, we call
2733 vect_get_constant_vectors (), and not advance CHILD_INDEX in order
2734 to check this child node for the next operand. */
2735 vectorized_defs
= false;
2736 if (SLP_TREE_CHILDREN (slp_node
).length () > child_index
)
2738 child
= SLP_TREE_CHILDREN (slp_node
)[child_index
];
2740 /* We have to check both pattern and original def, if available. */
2741 gimple first_def
= SLP_TREE_SCALAR_STMTS (child
)[0];
2742 gimple related
= STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first_def
));
2744 if (operand_equal_p (oprnd
, gimple_get_lhs (first_def
), 0)
2746 && operand_equal_p (oprnd
, gimple_get_lhs (related
), 0)))
2748 /* The number of vector defs is determined by the number of
2749 vector statements in the node from which we get those
2751 number_of_vects
= SLP_TREE_NUMBER_OF_VEC_STMTS (child
);
2752 vectorized_defs
= true;
2757 if (!vectorized_defs
)
2761 number_of_vects
= SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node
);
2762 /* Number of vector stmts was calculated according to LHS in
2763 vect_schedule_slp_instance (), fix it by replacing LHS with
2764 RHS, if necessary. See vect_get_smallest_scalar_type () for
2766 vect_get_smallest_scalar_type (first_stmt
, &lhs_size_unit
,
2768 if (rhs_size_unit
!= lhs_size_unit
)
2770 number_of_vects
*= rhs_size_unit
;
2771 number_of_vects
/= lhs_size_unit
;
2776 /* Allocate memory for vectorized defs. */
2778 vec_defs
.create (number_of_vects
);
2780 /* For reduction defs we call vect_get_constant_vectors (), since we are
2781 looking for initial loop invariant values. */
2782 if (vectorized_defs
&& reduc_index
== -1)
2783 /* The defs are already vectorized. */
2784 vect_get_slp_vect_defs (child
, &vec_defs
);
2786 /* Build vectors from scalar defs. */
2787 vect_get_constant_vectors (oprnd
, slp_node
, &vec_defs
, i
,
2788 number_of_vects
, reduc_index
);
2790 vec_oprnds
->quick_push (vec_defs
);
2792 /* For reductions, we only need initial values. */
2793 if (reduc_index
!= -1)
2799 /* Create NCOPIES permutation statements using the mask MASK_BYTES (by
2800 building a vector of type MASK_TYPE from it) and two input vectors placed in
2801 DR_CHAIN at FIRST_VEC_INDX and SECOND_VEC_INDX for the first copy and
2802 shifting by STRIDE elements of DR_CHAIN for every copy.
2803 (STRIDE is the number of vectorized stmts for NODE divided by the number of
2805 VECT_STMTS_COUNTER specifies the index in the vectorized stmts of NODE, where
2806 the created stmts must be inserted. */
2809 vect_create_mask_and_perm (gimple stmt
, gimple next_scalar_stmt
,
2810 tree mask
, int first_vec_indx
, int second_vec_indx
,
2811 gimple_stmt_iterator
*gsi
, slp_tree node
,
2812 tree vectype
, vec
<tree
> dr_chain
,
2813 int ncopies
, int vect_stmts_counter
)
2816 gimple perm_stmt
= NULL
;
2817 stmt_vec_info next_stmt_info
;
2819 tree first_vec
, second_vec
, data_ref
;
2821 stride
= SLP_TREE_NUMBER_OF_VEC_STMTS (node
) / ncopies
;
2823 /* Initialize the vect stmts of NODE to properly insert the generated
2825 for (i
= SLP_TREE_VEC_STMTS (node
).length ();
2826 i
< (int) SLP_TREE_NUMBER_OF_VEC_STMTS (node
); i
++)
2827 SLP_TREE_VEC_STMTS (node
).quick_push (NULL
);
2829 perm_dest
= vect_create_destination_var (gimple_assign_lhs (stmt
), vectype
);
2830 for (i
= 0; i
< ncopies
; i
++)
2832 first_vec
= dr_chain
[first_vec_indx
];
2833 second_vec
= dr_chain
[second_vec_indx
];
2835 /* Generate the permute statement. */
2836 perm_stmt
= gimple_build_assign_with_ops (VEC_PERM_EXPR
, perm_dest
,
2837 first_vec
, second_vec
, mask
);
2838 data_ref
= make_ssa_name (perm_dest
, perm_stmt
);
2839 gimple_set_lhs (perm_stmt
, data_ref
);
2840 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
2842 /* Store the vector statement in NODE. */
2843 SLP_TREE_VEC_STMTS (node
)[stride
* i
+ vect_stmts_counter
] = perm_stmt
;
2845 first_vec_indx
+= stride
;
2846 second_vec_indx
+= stride
;
2849 /* Mark the scalar stmt as vectorized. */
2850 next_stmt_info
= vinfo_for_stmt (next_scalar_stmt
);
2851 STMT_VINFO_VEC_STMT (next_stmt_info
) = perm_stmt
;
2855 /* Given FIRST_MASK_ELEMENT - the mask element in element representation,
2856 return in CURRENT_MASK_ELEMENT its equivalent in target specific
2857 representation. Check that the mask is valid and return FALSE if not.
2858 Return TRUE in NEED_NEXT_VECTOR if the permutation requires to move to
2859 the next vector, i.e., the current first vector is not needed. */
2862 vect_get_mask_element (gimple stmt
, int first_mask_element
, int m
,
2863 int mask_nunits
, bool only_one_vec
, int index
,
2864 unsigned char *mask
, int *current_mask_element
,
2865 bool *need_next_vector
, int *number_of_mask_fixes
,
2866 bool *mask_fixed
, bool *needs_first_vector
)
2870 /* Convert to target specific representation. */
2871 *current_mask_element
= first_mask_element
+ m
;
2872 /* Adjust the value in case it's a mask for second and third vectors. */
2873 *current_mask_element
-= mask_nunits
* (*number_of_mask_fixes
- 1);
2875 if (*current_mask_element
< mask_nunits
)
2876 *needs_first_vector
= true;
2878 /* We have only one input vector to permute but the mask accesses values in
2879 the next vector as well. */
2880 if (only_one_vec
&& *current_mask_element
>= mask_nunits
)
2882 if (dump_enabled_p ())
2884 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2885 "permutation requires at least two vectors ");
2886 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
2887 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
2893 /* The mask requires the next vector. */
2894 while (*current_mask_element
>= mask_nunits
* 2)
2896 if (*needs_first_vector
|| *mask_fixed
)
2898 /* We either need the first vector too or have already moved to the
2899 next vector. In both cases, this permutation needs three
2901 if (dump_enabled_p ())
2903 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2904 "permutation requires at "
2905 "least three vectors ");
2906 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
2907 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
2913 /* We move to the next vector, dropping the first one and working with
2914 the second and the third - we need to adjust the values of the mask
2916 *current_mask_element
-= mask_nunits
* *number_of_mask_fixes
;
2918 for (i
= 0; i
< index
; i
++)
2919 mask
[i
] -= mask_nunits
* *number_of_mask_fixes
;
2921 (*number_of_mask_fixes
)++;
2925 *need_next_vector
= *mask_fixed
;
2927 /* This was the last element of this mask. Start a new one. */
2928 if (index
== mask_nunits
- 1)
2930 *number_of_mask_fixes
= 1;
2931 *mask_fixed
= false;
2932 *needs_first_vector
= false;
2939 /* Generate vector permute statements from a list of loads in DR_CHAIN.
2940 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
2941 permute statements for the SLP node NODE of the SLP instance
2942 SLP_NODE_INSTANCE. */
2945 vect_transform_slp_perm_load (slp_tree node
, vec
<tree
> dr_chain
,
2946 gimple_stmt_iterator
*gsi
, int vf
,
2947 slp_instance slp_node_instance
, bool analyze_only
)
2949 gimple stmt
= SLP_TREE_SCALAR_STMTS (node
)[0];
2950 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
2951 tree mask_element_type
= NULL_TREE
, mask_type
;
2952 int i
, j
, k
, nunits
, vec_index
= 0, scalar_index
;
2953 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
2954 gimple next_scalar_stmt
;
2955 int group_size
= SLP_INSTANCE_GROUP_SIZE (slp_node_instance
);
2956 int first_mask_element
;
2957 int index
, unroll_factor
, current_mask_element
, ncopies
;
2958 unsigned char *mask
;
2959 bool only_one_vec
= false, need_next_vector
= false;
2960 int first_vec_index
, second_vec_index
, orig_vec_stmts_num
, vect_stmts_counter
;
2961 int number_of_mask_fixes
= 1;
2962 bool mask_fixed
= false;
2963 bool needs_first_vector
= false;
2966 mode
= TYPE_MODE (vectype
);
2968 if (!can_vec_perm_p (mode
, false, NULL
))
2970 if (dump_enabled_p ())
2972 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2973 "no vect permute for ");
2974 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
2975 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
2980 /* The generic VEC_PERM_EXPR code always uses an integral type of the
2981 same size as the vector element being permuted. */
2982 mask_element_type
= lang_hooks
.types
.type_for_mode
2983 (int_mode_for_mode (TYPE_MODE (TREE_TYPE (vectype
))), 1);
2984 mask_type
= get_vectype_for_scalar_type (mask_element_type
);
2985 nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
2986 mask
= XALLOCAVEC (unsigned char, nunits
);
2987 unroll_factor
= SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance
);
2989 /* The number of vector stmts to generate based only on SLP_NODE_INSTANCE
2990 unrolling factor. */
2991 orig_vec_stmts_num
= group_size
*
2992 SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance
) / nunits
;
2993 if (orig_vec_stmts_num
== 1)
2994 only_one_vec
= true;
2996 /* Number of copies is determined by the final vectorization factor
2997 relatively to SLP_NODE_INSTANCE unrolling factor. */
2998 ncopies
= vf
/ SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance
);
3000 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info
))
3003 /* Generate permutation masks for every NODE. Number of masks for each NODE
3004 is equal to GROUP_SIZE.
3005 E.g., we have a group of three nodes with three loads from the same
3006 location in each node, and the vector size is 4. I.e., we have a
3007 a0b0c0a1b1c1... sequence and we need to create the following vectors:
3008 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
3009 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
3012 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
3013 The last mask is illegal since we assume two operands for permute
3014 operation, and the mask element values can't be outside that range.
3015 Hence, the last mask must be converted into {2,5,5,5}.
3016 For the first two permutations we need the first and the second input
3017 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
3018 we need the second and the third vectors: {b1,c1,a2,b2} and
3024 vect_stmts_counter
= 0;
3026 first_vec_index
= vec_index
++;
3028 second_vec_index
= first_vec_index
;
3030 second_vec_index
= vec_index
++;
3032 for (j
= 0; j
< unroll_factor
; j
++)
3034 for (k
= 0; k
< group_size
; k
++)
3036 i
= SLP_TREE_LOAD_PERMUTATION (node
)[k
];
3037 first_mask_element
= i
+ j
* group_size
;
3038 if (!vect_get_mask_element (stmt
, first_mask_element
, 0,
3039 nunits
, only_one_vec
, index
,
3040 mask
, ¤t_mask_element
,
3042 &number_of_mask_fixes
, &mask_fixed
,
3043 &needs_first_vector
))
3045 gcc_assert (current_mask_element
< 2 * nunits
);
3046 mask
[index
++] = current_mask_element
;
3048 if (index
== nunits
)
3051 if (!can_vec_perm_p (mode
, false, mask
))
3053 if (dump_enabled_p ())
3055 dump_printf_loc (MSG_MISSED_OPTIMIZATION
,
3057 "unsupported vect permute { ");
3058 for (i
= 0; i
< nunits
; ++i
)
3059 dump_printf (MSG_MISSED_OPTIMIZATION
, "%d ",
3061 dump_printf (MSG_MISSED_OPTIMIZATION
, "}\n");
3069 tree mask_vec
, *mask_elts
;
3070 mask_elts
= XALLOCAVEC (tree
, nunits
);
3071 for (l
= 0; l
< nunits
; ++l
)
3072 mask_elts
[l
] = build_int_cst (mask_element_type
,
3074 mask_vec
= build_vector (mask_type
, mask_elts
);
3076 if (need_next_vector
)
3078 first_vec_index
= second_vec_index
;
3079 second_vec_index
= vec_index
;
3083 = SLP_TREE_SCALAR_STMTS (node
)[scalar_index
++];
3085 vect_create_mask_and_perm (stmt
, next_scalar_stmt
,
3086 mask_vec
, first_vec_index
, second_vec_index
,
3087 gsi
, node
, vectype
, dr_chain
,
3088 ncopies
, vect_stmts_counter
++);
3100 /* Vectorize SLP instance tree in postorder. */
3103 vect_schedule_slp_instance (slp_tree node
, slp_instance instance
,
3104 unsigned int vectorization_factor
)
3107 bool grouped_store
, is_store
;
3108 gimple_stmt_iterator si
;
3109 stmt_vec_info stmt_info
;
3110 unsigned int vec_stmts_size
, nunits
, group_size
;
3118 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
3119 vect_schedule_slp_instance (child
, instance
, vectorization_factor
);
3121 stmt
= SLP_TREE_SCALAR_STMTS (node
)[0];
3122 stmt_info
= vinfo_for_stmt (stmt
);
3124 /* VECTYPE is the type of the destination. */
3125 vectype
= STMT_VINFO_VECTYPE (stmt_info
);
3126 nunits
= (unsigned int) TYPE_VECTOR_SUBPARTS (vectype
);
3127 group_size
= SLP_INSTANCE_GROUP_SIZE (instance
);
3129 /* For each SLP instance calculate number of vector stmts to be created
3130 for the scalar stmts in each node of the SLP tree. Number of vector
3131 elements in one vector iteration is the number of scalar elements in
3132 one scalar iteration (GROUP_SIZE) multiplied by VF divided by vector
3134 vec_stmts_size
= (vectorization_factor
* group_size
) / nunits
;
3136 if (!SLP_TREE_VEC_STMTS (node
).exists ())
3138 SLP_TREE_VEC_STMTS (node
).create (vec_stmts_size
);
3139 SLP_TREE_NUMBER_OF_VEC_STMTS (node
) = vec_stmts_size
;
3142 if (dump_enabled_p ())
3144 dump_printf_loc (MSG_NOTE
,vect_location
,
3145 "------>vectorizing SLP node starting from: ");
3146 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, stmt
, 0);
3147 dump_printf (MSG_NOTE
, "\n");
3150 /* Loads should be inserted before the first load. */
3151 if (SLP_INSTANCE_FIRST_LOAD_STMT (instance
)
3152 && STMT_VINFO_GROUPED_ACCESS (stmt_info
)
3153 && !REFERENCE_CLASS_P (gimple_get_lhs (stmt
))
3154 && SLP_TREE_LOAD_PERMUTATION (node
).exists ())
3155 si
= gsi_for_stmt (SLP_INSTANCE_FIRST_LOAD_STMT (instance
));
3156 else if (is_pattern_stmt_p (stmt_info
))
3157 si
= gsi_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info
));
3159 si
= gsi_for_stmt (stmt
);
3161 /* Stores should be inserted just before the last store. */
3162 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
3163 && REFERENCE_CLASS_P (gimple_get_lhs (stmt
)))
3165 gimple last_store
= vect_find_last_store_in_slp_instance (instance
);
3166 if (is_pattern_stmt_p (vinfo_for_stmt (last_store
)))
3167 last_store
= STMT_VINFO_RELATED_STMT (vinfo_for_stmt (last_store
));
3168 si
= gsi_for_stmt (last_store
);
3171 /* Mark the first element of the reduction chain as reduction to properly
3172 transform the node. In the analysis phase only the last element of the
3173 chain is marked as reduction. */
3174 if (GROUP_FIRST_ELEMENT (stmt_info
) && !STMT_VINFO_GROUPED_ACCESS (stmt_info
)
3175 && GROUP_FIRST_ELEMENT (stmt_info
) == stmt
)
3177 STMT_VINFO_DEF_TYPE (stmt_info
) = vect_reduction_def
;
3178 STMT_VINFO_TYPE (stmt_info
) = reduc_vec_info_type
;
3181 is_store
= vect_transform_stmt (stmt
, &si
, &grouped_store
, node
, instance
);
3185 /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
3186 For loop vectorization this is done in vectorizable_call, but for SLP
3187 it needs to be deferred until end of vect_schedule_slp, because multiple
3188 SLP instances may refer to the same scalar stmt. */
3191 vect_remove_slp_scalar_calls (slp_tree node
)
3193 gimple stmt
, new_stmt
;
3194 gimple_stmt_iterator gsi
;
3198 stmt_vec_info stmt_info
;
3203 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
3204 vect_remove_slp_scalar_calls (child
);
3206 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt
)
3208 if (!is_gimple_call (stmt
) || gimple_bb (stmt
) == NULL
)
3210 stmt_info
= vinfo_for_stmt (stmt
);
3211 if (stmt_info
== NULL
3212 || is_pattern_stmt_p (stmt_info
)
3213 || !PURE_SLP_STMT (stmt_info
))
3215 lhs
= gimple_call_lhs (stmt
);
3216 new_stmt
= gimple_build_assign (lhs
, build_zero_cst (TREE_TYPE (lhs
)));
3217 set_vinfo_for_stmt (new_stmt
, stmt_info
);
3218 set_vinfo_for_stmt (stmt
, NULL
);
3219 STMT_VINFO_STMT (stmt_info
) = new_stmt
;
3220 gsi
= gsi_for_stmt (stmt
);
3221 gsi_replace (&gsi
, new_stmt
, false);
3222 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt
)) = new_stmt
;
3226 /* Generate vector code for all SLP instances in the loop/basic block. */
3229 vect_schedule_slp (loop_vec_info loop_vinfo
, bb_vec_info bb_vinfo
)
3231 vec
<slp_instance
> slp_instances
;
3232 slp_instance instance
;
3234 bool is_store
= false;
3238 slp_instances
= LOOP_VINFO_SLP_INSTANCES (loop_vinfo
);
3239 vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
3243 slp_instances
= BB_VINFO_SLP_INSTANCES (bb_vinfo
);
3247 FOR_EACH_VEC_ELT (slp_instances
, i
, instance
)
3249 /* Schedule the tree of INSTANCE. */
3250 is_store
= vect_schedule_slp_instance (SLP_INSTANCE_TREE (instance
),
3252 if (dump_enabled_p ())
3253 dump_printf_loc (MSG_NOTE
, vect_location
,
3254 "vectorizing stmts using SLP.\n");
3257 FOR_EACH_VEC_ELT (slp_instances
, i
, instance
)
3259 slp_tree root
= SLP_INSTANCE_TREE (instance
);
3262 gimple_stmt_iterator gsi
;
3264 /* Remove scalar call stmts. Do not do this for basic-block
3265 vectorization as not all uses may be vectorized.
3266 ??? Why should this be necessary? DCE should be able to
3267 remove the stmts itself.
3268 ??? For BB vectorization we can as well remove scalar
3269 stmts starting from the SLP tree root if they have no
3272 vect_remove_slp_scalar_calls (root
);
3274 for (j
= 0; SLP_TREE_SCALAR_STMTS (root
).iterate (j
, &store
)
3275 && j
< SLP_INSTANCE_GROUP_SIZE (instance
); j
++)
3277 if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (store
)))
3280 if (is_pattern_stmt_p (vinfo_for_stmt (store
)))
3281 store
= STMT_VINFO_RELATED_STMT (vinfo_for_stmt (store
));
3282 /* Free the attached stmt_vec_info and remove the stmt. */
3283 gsi
= gsi_for_stmt (store
);
3284 unlink_stmt_vdef (store
);
3285 gsi_remove (&gsi
, true);
3286 release_defs (store
);
3287 free_stmt_vec_info (store
);
3295 /* Vectorize the basic block. */
3298 vect_slp_transform_bb (basic_block bb
)
3300 bb_vec_info bb_vinfo
= vec_info_for_bb (bb
);
3301 gimple_stmt_iterator si
;
3303 gcc_assert (bb_vinfo
);
3305 if (dump_enabled_p ())
3306 dump_printf_loc (MSG_NOTE
, vect_location
, "SLPing BB\n");
3308 for (si
= gsi_start_bb (bb
); !gsi_end_p (si
); gsi_next (&si
))
3310 gimple stmt
= gsi_stmt (si
);
3311 stmt_vec_info stmt_info
;
3313 if (dump_enabled_p ())
3315 dump_printf_loc (MSG_NOTE
, vect_location
,
3316 "------>SLPing statement: ");
3317 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, stmt
, 0);
3318 dump_printf (MSG_NOTE
, "\n");
3321 stmt_info
= vinfo_for_stmt (stmt
);
3322 gcc_assert (stmt_info
);
3324 /* Schedule all the SLP instances when the first SLP stmt is reached. */
3325 if (STMT_SLP_TYPE (stmt_info
))
3327 vect_schedule_slp (NULL
, bb_vinfo
);
3332 if (dump_enabled_p ())
3333 dump_printf_loc (MSG_NOTE
, vect_location
,
3334 "BASIC BLOCK VECTORIZED\n");
3336 destroy_bb_vec_info (bb_vinfo
);