/* Data References Analysis and Manipulation Utilities for Vectorization.
   Copyright (C) 2003-2013 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "basic-block.h"
#include "gimple-pretty-print.h"
#include "gimple-ssa.h"
#include "tree-phinodes.h"
#include "ssa-iterators.h"
#include "tree-ssanames.h"
#include "tree-ssa-loop-ivopts.h"
#include "tree-ssa-loop-manip.h"
#include "tree-ssa-loop.h"
#include "tree-chrec.h"
#include "tree-scalar-evolution.h"
#include "tree-vectorizer.h"
#include "diagnostic-core.h"
/* Need to include rtl.h, expr.h, etc. for optabs.  */
#include "expr.h"
#include "optabs.h"
/* Return true if load- or store-lanes optab OPTAB is implemented for
   COUNT vectors of type VECTYPE.  NAME is the name of OPTAB.  */

static bool
vect_lanes_optab_supported_p (const char *name, convert_optab optab,
                              tree vectype, unsigned HOST_WIDE_INT count)
{
  enum machine_mode mode, array_mode;
  bool limit_p;

  mode = TYPE_MODE (vectype);
  limit_p = !targetm.array_mode_supported_p (mode, count);
  array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode),
                              MODE_INT, limit_p);

  if (array_mode == BLKmode)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
                         GET_MODE_NAME (mode), count);
      return false;
    }

  if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "cannot use %s<%s><%s>\n", name,
                         GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
      return false;
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
                     GET_MODE_NAME (mode));

  return true;
}
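
/* Illustrative sketch (not part of the original sources): a loop such as

     for (i = 0; i < n; i++)
       {
         x[i] = a[2 * i];
         y[i] = a[2 * i + 1];
       }

   reads COUNT == 2 interleaved vectors of A's element type; when the
   target provides the corresponding load-lanes optab, both vectors can be
   filled by a single lane-loading instruction instead of separate loads
   plus permutes.  */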
/* Return the smallest scalar part of STMT.
   This is used to determine the vectype of the stmt.  We generally set the
   vectype according to the type of the result (lhs).  For stmts whose
   result-type is different than the type of the arguments (e.g., demotion,
   promotion), vectype will be reset appropriately (later).  Note that we have
   to visit the smallest datatype in this function, because that determines the
   VF.  If the smallest datatype in the loop is present only as the rhs of a
   promotion operation - we'd miss it.
   Such a case, where a variable of this datatype does not appear in the lhs
   anywhere in the loop, can only occur if it's an invariant: e.g.:
   'int_x = (int) short_inv', which we'd expect to have been optimized away by
   invariant motion.  However, we cannot rely on invariant motion to always
   take invariants out of the loop, and so in the case of promotion we also
   have to check the rhs.
   LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
   types.  */

tree
vect_get_smallest_scalar_type (gimple stmt, HOST_WIDE_INT *lhs_size_unit,
                               HOST_WIDE_INT *rhs_size_unit)
{
  tree scalar_type = gimple_expr_type (stmt);
  HOST_WIDE_INT lhs, rhs;

  lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));

  if (is_gimple_assign (stmt)
      && (gimple_assign_cast_p (stmt)
          || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
          || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
          || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
    {
      tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));

      rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
      if (rhs < lhs)
        scalar_type = rhs_type;
    }

  *lhs_size_unit = lhs;
  *rhs_size_unit = rhs;
  return scalar_type;
}
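
/* Illustrative sketch (not part of the original sources): in

     short *b;
     int *a;
     for (i = 0; i < n; i++)
       a[i] += (int) b[i];

   the widened load of B is the only occurrence of the 2-byte type, yet it
   is what determines the vectorization factor, which is why the rhs type
   of the promotion is inspected here as well.  */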
/* Check if data references pointed by DR_I and DR_J are same or
   belong to same interleaving group.  Return FALSE if drs are
   different, otherwise return TRUE.  */

static bool
vect_same_range_drs (data_reference_p dr_i, data_reference_p dr_j)
{
  gimple stmt_i = DR_STMT (dr_i);
  gimple stmt_j = DR_STMT (dr_j);

  if (operand_equal_p (DR_REF (dr_i), DR_REF (dr_j), 0)
      || (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_i))
          && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_j))
          && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_i))
              == GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_j)))))
    return true;
  else
    return false;
}
/* If address ranges represented by DDR_I and DDR_J are equal,
   return TRUE, otherwise return FALSE.  */

static bool
vect_vfa_range_equal (ddr_p ddr_i, ddr_p ddr_j)
{
  if ((vect_same_range_drs (DDR_A (ddr_i), DDR_A (ddr_j))
       && vect_same_range_drs (DDR_B (ddr_i), DDR_B (ddr_j)))
      || (vect_same_range_drs (DDR_A (ddr_i), DDR_B (ddr_j))
          && vect_same_range_drs (DDR_B (ddr_i), DDR_A (ddr_j))))
    return true;
  else
    return false;
}
/* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
   tested at run-time.  Return TRUE if DDR was successfully inserted.
   Return false if versioning is not supported.  */

static bool
vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

  if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
    return false;

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location,
                       "mark for run-time aliasing test between ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
      dump_printf (MSG_NOTE, " and ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
      dump_printf (MSG_NOTE, "\n");
    }

  if (optimize_loop_nest_for_size_p (loop))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "versioning not supported when optimizing"
                         " for size.\n");
      return false;
    }

  /* FORNOW: We don't support versioning with outer-loop vectorization.  */
  if (loop->inner)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "versioning not yet supported for outer-loops.\n");
      return false;
    }

  /* FORNOW: We don't support creating runtime alias tests for non-constant
     step.  */
  if (TREE_CODE (DR_STEP (DDR_A (ddr))) != INTEGER_CST
      || TREE_CODE (DR_STEP (DDR_B (ddr))) != INTEGER_CST)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "versioning not yet supported for non-constant "
                         "step\n");
      return false;
    }

  LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
  return true;
}
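
/* Illustrative sketch (not part of the original sources): for two
   possibly aliasing data-refs P and Q, the versioned loop is guarded at
   run time by a disjointness test of roughly this shape:

     if (p + byte_span <= q || q + byte_span <= p)
       ... vectorized loop ...
     else
       ... scalar loop ...

   so every DDR recorded here adds one such comparison to the guard.  */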
/* Function vect_analyze_data_ref_dependence.

   Return TRUE if there (might) exist a dependence between a memory-reference
   DRA and a memory-reference DRB.  When versioning for alias may check a
   dependence at run-time, return FALSE.  Adjust *MAX_VF according to
   the data dependence.  */

static bool
vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
                                  loop_vec_info loop_vinfo, int *max_vf)
{
  unsigned int i;
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  struct data_reference *dra = DDR_A (ddr);
  struct data_reference *drb = DDR_B (ddr);
  stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
  stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
  lambda_vector dist_v;
  unsigned int loop_depth;

  /* In loop analysis all data references should be vectorizable.  */
  if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
      || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
    gcc_unreachable ();

  /* Independent data accesses.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
    return false;

  if (dra == drb
      || (DR_IS_READ (dra) && DR_IS_READ (drb)))
    return false;

  /* Unknown data dependence.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
    {
      /* If user asserted safelen consecutive iterations can be
         executed concurrently, assume independence.  */
      if (loop->safelen >= 2)
        {
          if (loop->safelen < *max_vf)
            *max_vf = loop->safelen;
          return false;
        }

      if (STMT_VINFO_GATHER_P (stmtinfo_a)
          || STMT_VINFO_GATHER_P (stmtinfo_b))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "versioning for alias not supported for: "
                               "can't determine dependence between ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                 DR_REF (dra));
              dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                 DR_REF (drb));
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }
          return true;
        }

      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "versioning for alias required: "
                           "can't determine dependence between ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                             DR_REF (dra));
          dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                             DR_REF (drb));
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }

      /* Add to list of ddrs that need to be tested at run-time.  */
      return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
    }

  /* Known data dependence.  */
  if (DDR_NUM_DIST_VECTS (ddr) == 0)
    {
      /* If user asserted safelen consecutive iterations can be
         executed concurrently, assume independence.  */
      if (loop->safelen >= 2)
        {
          if (loop->safelen < *max_vf)
            *max_vf = loop->safelen;
          return false;
        }

      if (STMT_VINFO_GATHER_P (stmtinfo_a)
          || STMT_VINFO_GATHER_P (stmtinfo_b))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "versioning for alias not supported for: "
                               "bad dist vector for ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                 DR_REF (dra));
              dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                 DR_REF (drb));
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }
          return true;
        }

      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "versioning for alias required: "
                           "bad dist vector for ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
          dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }
      /* Add to list of ddrs that need to be tested at run-time.  */
      return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
    }

  loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
  FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
    {
      int dist = dist_v[loop_depth];

      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "dependence distance = %d.\n", dist);

      if (dist == 0)
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_NOTE, vect_location,
                               "dependence distance == 0 between ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
              dump_printf (MSG_NOTE, " and ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
              dump_printf (MSG_NOTE, "\n");
            }

          /* When we perform grouped accesses and perform implicit CSE
             by detecting equal accesses and doing disambiguation with
             runtime alias tests like for
                .. = a[i];
                .. = a[i+1];
                a[i] = ..;
                a[i+1] = ..;
                *p = ..;
                .. = a[i];
                .. = a[i+1];
             where we will end up loading { a[i], a[i+1] } once, make
             sure that inserting group loads before the first load and
             stores after the last store will do the right thing.  */
          if ((STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
               && GROUP_SAME_DR_STMT (stmtinfo_a))
              || (STMT_VINFO_GROUPED_ACCESS (stmtinfo_b)
                  && GROUP_SAME_DR_STMT (stmtinfo_b)))
            {
              gimple earlier_stmt;
              earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
              if (DR_IS_WRITE
                    (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "READ_WRITE dependence in interleaving."
                                     "\n");
                  return true;
                }
            }

          continue;
        }

      if (dist > 0 && DDR_REVERSED_P (ddr))
        {
          /* If DDR_REVERSED_P the order of the data-refs in DDR was
             reversed (to make distance vector positive), and the actual
             distance is negative.  */
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "dependence distance negative.\n");
          continue;
        }

      if (abs (dist) >= 2
          && abs (dist) < *max_vf)
        {
          /* The dependence distance requires reduction of the maximal
             vectorization factor.  */
          *max_vf = abs (dist);
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "adjusting maximal vectorization factor to %i\n",
                             *max_vf);
        }

      if (abs (dist) >= *max_vf)
        {
          /* Dependence distance does not create dependence, as far as
             vectorization is concerned, in this case.  */
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "dependence distance >= VF.\n");
          continue;
        }

      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "not vectorized, possible dependence "
                           "between data-refs ");
          dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
          dump_printf (MSG_NOTE, " and ");
          dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
          dump_printf (MSG_NOTE, "\n");
        }

      return true;
    }

  return false;
}
/* Function vect_analyze_data_ref_dependences.

   Examine all the data references in the loop, and make sure there do not
   exist any data dependences between them.  Set *MAX_VF according to
   the maximum vectorization factor the data dependences allow.  */

bool
vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
{
  unsigned int i;
  struct data_dependence_relation *ddr;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "=== vect_analyze_data_ref_dependences ===\n");

  if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
                                &LOOP_VINFO_DDRS (loop_vinfo),
                                LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
    return false;

  FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
    if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
      return false;

  return true;
}
/* Function vect_slp_analyze_data_ref_dependence.

   Return TRUE if there (might) exist a dependence between a memory-reference
   DRA and a memory-reference DRB.  When versioning for alias may check a
   dependence at run-time, return FALSE.  Adjust *MAX_VF according to
   the data dependence.  */

static bool
vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
{
  struct data_reference *dra = DDR_A (ddr);
  struct data_reference *drb = DDR_B (ddr);
  gimple earlier_stmt;

  /* We need to check dependences of statements marked as unvectorizable
     as well, they still can prohibit vectorization.  */

  /* Independent data accesses.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
    return false;

  if (dra == drb)
    return false;

  /* Read-read is OK.  */
  if (DR_IS_READ (dra) && DR_IS_READ (drb))
    return false;

  /* If dra and drb are part of the same interleaving chain consider
     them independent.  */
  if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
      && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
          == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
    return false;

  /* Unknown data dependence.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
    {
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "can't determine dependence between ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
          dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }

      /* We do not vectorize basic blocks with write-write dependencies.  */
      if (DR_IS_WRITE (dra) && DR_IS_WRITE (drb))
        return true;

      /* Check that it's not a load-after-store dependence.  */
      earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
      if (DR_IS_WRITE (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
        return true;

      return false;
    }

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location,
                       "determined dependence between ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
      dump_printf (MSG_NOTE, " and ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
      dump_printf (MSG_NOTE, "\n");
    }

  /* Do not vectorize basic blocks with write-write dependences.  */
  if (DR_IS_WRITE (dra) && DR_IS_WRITE (drb))
    return true;

  /* Check dependence between DRA and DRB for basic block vectorization.
     If the accesses share same bases and offsets, we can compare their initial
     constant offsets to decide whether they differ or not.  In case of a read-
     write dependence we check that the load is before the store to ensure that
     vectorization will not change the order of the accesses.  */

  HOST_WIDE_INT type_size_a, type_size_b, init_a, init_b;

  /* Check that the data-refs have same bases and offsets.  If not, we can't
     determine if they are dependent.  */
  if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
      || !dr_equal_offsets_p (dra, drb))
    return true;

  /* Check the types.  */
  type_size_a = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))));
  type_size_b = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));

  if (type_size_a != type_size_b
      || !types_compatible_p (TREE_TYPE (DR_REF (dra)),
                              TREE_TYPE (DR_REF (drb))))
    return true;

  init_a = TREE_INT_CST_LOW (DR_INIT (dra));
  init_b = TREE_INT_CST_LOW (DR_INIT (drb));

  /* Two different locations - no dependence.  */
  if (init_a != init_b)
    return false;

  /* We have a read-write dependence.  Check that the load is before the store.
     When we vectorize basic blocks, vector load can be only before
     corresponding scalar load, and vector store can be only after its
     corresponding scalar store.  So the order of the accesses is preserved in
     case the load is before the store.  */
  earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
  if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
    return false;

  return true;
}
/* Function vect_slp_analyze_data_ref_dependences.

   Examine all the data references in the basic-block, and make sure there
   do not exist any data dependences between them.  Set *MAX_VF according to
   the maximum vectorization factor the data dependences allow.  */

bool
vect_slp_analyze_data_ref_dependences (bb_vec_info bb_vinfo)
{
  struct data_dependence_relation *ddr;
  unsigned int i;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "=== vect_slp_analyze_data_ref_dependences ===\n");

  if (!compute_all_dependences (BB_VINFO_DATAREFS (bb_vinfo),
                                &BB_VINFO_DDRS (bb_vinfo),
                                vNULL, true))
    return false;

  FOR_EACH_VEC_ELT (BB_VINFO_DDRS (bb_vinfo), i, ddr)
    if (vect_slp_analyze_data_ref_dependence (ddr))
      return false;

  return true;
}
/* Function vect_compute_data_ref_alignment

   Compute the misalignment of the data reference DR.

   Output:
   1. If during the misalignment computation it is found that the data reference
      cannot be vectorized then false is returned.
   2. DR_MISALIGNMENT (DR) is defined.

   FOR NOW: No analysis is actually performed.  Misalignment is calculated
   only for trivial cases.  TODO.  */

static bool
vect_compute_data_ref_alignment (struct data_reference *dr)
{
  gimple stmt = DR_STMT (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = NULL;
  tree ref = DR_REF (dr);
  tree vectype;
  tree base, base_addr;
  bool base_aligned;
  tree misalign;
  tree aligned_to, alignment;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "vect_compute_data_ref_alignment:\n");

  if (loop_vinfo)
    loop = LOOP_VINFO_LOOP (loop_vinfo);

  /* Initialize misalignment to unknown.  */
  SET_DR_MISALIGNMENT (dr, -1);

  /* Strided loads perform only component accesses, misalignment information
     is irrelevant for them.  */
  if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
    return true;

  misalign = DR_INIT (dr);
  aligned_to = DR_ALIGNED_TO (dr);
  base_addr = DR_BASE_ADDRESS (dr);
  vectype = STMT_VINFO_VECTYPE (stmt_info);

  /* In case the dataref is in an inner-loop of the loop that is being
     vectorized (LOOP), we use the base and misalignment information
     relative to the outer-loop (LOOP).  This is ok only if the misalignment
     stays the same throughout the execution of the inner-loop, which is why
     we have to check that the stride of the dataref in the inner-loop evenly
     divides by the vector size.  */
  if (loop && nested_in_vect_loop_p (loop, stmt))
    {
      tree step = DR_STEP (dr);
      HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);

      if (dr_step % GET_MODE_SIZE (TYPE_MODE (vectype)) == 0)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "inner step divides the vector-size.\n");
          misalign = STMT_VINFO_DR_INIT (stmt_info);
          aligned_to = STMT_VINFO_DR_ALIGNED_TO (stmt_info);
          base_addr = STMT_VINFO_DR_BASE_ADDRESS (stmt_info);
        }
      else
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "inner step doesn't divide the vector-size.\n");
          misalign = NULL_TREE;
        }
    }

  /* Similarly, if we're doing basic-block vectorization, we can only use
     base and misalignment information relative to an innermost loop if the
     misalignment stays the same throughout the execution of the loop.
     As above, this is the case if the stride of the dataref evenly divides
     by the vector size.  */
  else if (!loop)
    {
      tree step = DR_STEP (dr);
      HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);

      if (dr_step % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "SLP: step doesn't divide the vector-size.\n");
          misalign = NULL_TREE;
        }
    }

  base = build_fold_indirect_ref (base_addr);
  alignment = ssize_int (TYPE_ALIGN (vectype)/BITS_PER_UNIT);

  if ((aligned_to && tree_int_cst_compare (aligned_to, alignment) < 0)
      || !misalign)
    {
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "Unknown alignment for access: ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, base);
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }
      return true;
    }

  if ((DECL_P (base)
       && tree_int_cst_compare (ssize_int (DECL_ALIGN_UNIT (base)),
                                alignment) >= 0)
      || (TREE_CODE (base_addr) == SSA_NAME
          && tree_int_cst_compare (ssize_int (TYPE_ALIGN_UNIT (TREE_TYPE (
                                                  TREE_TYPE (base_addr)))),
                                   alignment) >= 0)
      || (get_pointer_alignment (base_addr) >= TYPE_ALIGN (vectype)))
    base_aligned = true;
  else
    base_aligned = false;

  if (!base_aligned)
    {
      /* Do not change the alignment of global variables here if
         flag_section_anchors is enabled as we already generated
         RTL for other functions.  Most global variables should
         have been aligned during the IPA increase_alignment pass.  */
      if (!vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype))
          || (TREE_STATIC (base) && flag_section_anchors))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_NOTE, vect_location,
                               "can't force alignment of ref: ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
              dump_printf (MSG_NOTE, "\n");
            }
          return true;
        }

      /* Force the alignment of the decl.
         NOTE: This is the only change to the code we make during
         the analysis phase, before deciding to vectorize the loop.  */
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
          dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
          dump_printf (MSG_NOTE, "\n");
        }

      ((dataref_aux *)dr->aux)->base_decl = base;
      ((dataref_aux *)dr->aux)->base_misaligned = true;
    }

  /* If this is a backward running DR then first access in the larger
     vectype actually is N-1 elements before the address in the DR.
     Adjust misalign accordingly.  */
  if (tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0)
    {
      tree offset = ssize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
      /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
         otherwise we wouldn't be here.  */
      offset = fold_build2 (MULT_EXPR, ssizetype, offset, DR_STEP (dr));
      /* PLUS because DR_STEP was negative.  */
      misalign = size_binop (PLUS_EXPR, misalign, offset);
    }

  /* Modulo alignment.  */
  misalign = size_binop (FLOOR_MOD_EXPR, misalign, alignment);

  if (!host_integerp (misalign, 1))
    {
      /* Negative or overflowed misalignment value.  */
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "unexpected misalign value\n");
      return false;
    }

  SET_DR_MISALIGNMENT (dr, TREE_INT_CST_LOW (misalign));

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                       "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
    }

  return true;
}
/* Function vect_compute_data_refs_alignment

   Compute the misalignment of data references in the loop.
   Return FALSE if a data reference is found that cannot be vectorized.  */

static bool
vect_compute_data_refs_alignment (loop_vec_info loop_vinfo,
                                  bb_vec_info bb_vinfo)
{
  vec<data_reference_p> datarefs;
  struct data_reference *dr;
  unsigned int i;

  if (loop_vinfo)
    datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
  else
    datarefs = BB_VINFO_DATAREFS (bb_vinfo);

  FOR_EACH_VEC_ELT (datarefs, i, dr)
    if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
        && !vect_compute_data_ref_alignment (dr))
      {
        if (bb_vinfo)
          {
            /* Mark unsupported statement as unvectorizable.  */
            STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
            continue;
          }
        else
          return false;
      }

  return true;
}
/* Function vect_update_misalignment_for_peel

   DR - the data reference whose misalignment is to be adjusted.
   DR_PEEL - the data reference whose misalignment is being made
             zero in the vector loop by the peel.
   NPEEL - the number of iterations in the peel loop if the misalignment
           of DR_PEEL is known at compile time.  */

static void
vect_update_misalignment_for_peel (struct data_reference *dr,
                                   struct data_reference *dr_peel, int npeel)
{
  unsigned int i;
  vec<dr_p> same_align_drs;
  struct data_reference *current_dr;
  int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
  int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
  stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
  stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));

  /* For interleaved data accesses the step in the loop must be multiplied by
     the size of the interleaving group.  */
  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
  if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
    dr_peel_size *= GROUP_SIZE (peel_stmt_info);

  /* It can be assumed that the data refs with the same alignment as dr_peel
     are aligned in the vector loop.  */
  same_align_drs
    = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
  FOR_EACH_VEC_ELT (same_align_drs, i, current_dr)
    {
      if (current_dr != dr)
        continue;
      gcc_assert (DR_MISALIGNMENT (dr) / dr_size ==
                  DR_MISALIGNMENT (dr_peel) / dr_peel_size);
      SET_DR_MISALIGNMENT (dr, 0);
      return;
    }

  if (known_alignment_for_access_p (dr)
      && known_alignment_for_access_p (dr_peel))
    {
      bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
      int misal = DR_MISALIGNMENT (dr);
      tree vectype = STMT_VINFO_VECTYPE (stmt_info);
      misal += negative ? -npeel * dr_size : npeel * dr_size;
      misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
      SET_DR_MISALIGNMENT (dr, misal);
      return;
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment to -1.\n");
  SET_DR_MISALIGNMENT (dr, -1);
}
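
/* Worked example (illustrative, not from the original sources): with
   4-byte elements, a 16-byte vector type and DR_MISALIGNMENT (dr) == 8,
   peeling NPEEL == 2 iterations gives misal = (8 + 2 * 4) & 15 == 0,
   i.e. DR ends up aligned together with DR_PEEL.  */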
/* Function vect_verify_datarefs_alignment

   Return TRUE if all data references in the loop can be
   handled with respect to alignment.  */

bool
vect_verify_datarefs_alignment (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
{
  vec<data_reference_p> datarefs;
  struct data_reference *dr;
  enum dr_alignment_support supportable_dr_alignment;
  unsigned int i;

  if (loop_vinfo)
    datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
  else
    datarefs = BB_VINFO_DATAREFS (bb_vinfo);

  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      gimple stmt = DR_STMT (dr);
      stmt_vec_info stmt_info = vinfo_for_stmt (stmt);

      if (!STMT_VINFO_RELEVANT_P (stmt_info))
        continue;

      /* For interleaving, only the alignment of the first access matters.
         Skip statements marked as not vectorizable.  */
      if ((STMT_VINFO_GROUPED_ACCESS (stmt_info)
           && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
          || !STMT_VINFO_VECTORIZABLE (stmt_info))
        continue;

      /* Strided loads perform only component accesses, alignment is
         irrelevant for them.  */
      if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
        continue;

      supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
      if (!supportable_dr_alignment)
        {
          if (dump_enabled_p ())
            {
              if (DR_IS_READ (dr))
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "not vectorized: unsupported unaligned load.");
              else
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "not vectorized: unsupported unaligned "
                                 "store.");

              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                 DR_REF (dr));
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }
          return false;
        }
      if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "Vectorizing an unaligned access.\n");
    }
  return true;
}
/* Given a memory reference EXP return whether its alignment is less
   than its size.  */

static bool
not_size_aligned (tree exp)
{
  if (!host_integerp (TYPE_SIZE (TREE_TYPE (exp)), 1))
    return true;

  return (TREE_INT_CST_LOW (TYPE_SIZE (TREE_TYPE (exp)))
          > get_object_alignment (exp));
}
/* Function vector_alignment_reachable_p

   Return true if vector alignment for DR is reachable by peeling
   a few loop iterations.  Return false otherwise.  */

static bool
vector_alignment_reachable_p (struct data_reference *dr)
{
  gimple stmt = DR_STMT (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);

  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    {
      /* For interleaved access we peel only if number of iterations in
         the prolog loop ({VF - misalignment}) is a multiple of the
         number of the interleaved accesses.  */
      int elem_size, mis_in_elements;
      int nelements = TYPE_VECTOR_SUBPARTS (vectype);

      /* FORNOW: handle only known alignment.  */
      if (!known_alignment_for_access_p (dr))
        return false;

      elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
      mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;

      if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
        return false;
    }

  /* If misalignment is known at the compile time then allow peeling
     only if natural alignment is reachable through peeling.  */
  if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
    {
      HOST_WIDE_INT elmsize =
                int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location,
                           "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
          dump_printf (MSG_NOTE,
                       ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
        }
      if (DR_MISALIGNMENT (dr) % elmsize)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "data size does not divide the misalignment.\n");
          return false;
        }
    }

  if (!known_alignment_for_access_p (dr))
    {
      tree type = TREE_TYPE (DR_REF (dr));
      bool is_packed = not_size_aligned (DR_REF (dr));
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "Unknown misalignment, is_packed = %d\n", is_packed);
      if ((TYPE_USER_ALIGN (type) && !is_packed)
          || targetm.vectorize.vector_alignment_reachable (type, is_packed))
        return true;
      else
        return false;
    }

  return true;
}
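
/* Illustrative sketch (not from the original sources): a float access
   into a 16-byte V4SF vector that is misaligned by 6 bytes can never be
   aligned by peeling whole 4-byte iterations (6 % 4 != 0), whereas a
   misalignment of 8 bytes is cured after peeling two iterations.  */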
/* Calculate the cost of the memory access represented by DR.  */

static void
vect_get_data_access_cost (struct data_reference *dr,
                           unsigned int *inside_cost,
                           unsigned int *outside_cost,
                           stmt_vector_for_cost *body_cost_vec)
{
  gimple stmt = DR_STMT (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  int ncopies = vf / nunits;

  if (DR_IS_READ (dr))
    vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
                        NULL, body_cost_vec, false);
  else
    vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "vect_get_data_access_cost: inside_cost = %d, "
                     "outside_cost = %d.\n", *inside_cost, *outside_cost);
}
/* Insert DR into peeling hash table with NPEEL as key.  */

static void
vect_peeling_hash_insert (loop_vec_info loop_vinfo, struct data_reference *dr,
                          int npeel)
{
  struct _vect_peel_info elem, *slot;
  _vect_peel_info **new_slot;
  bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);

  elem.npeel = npeel;
  slot = LOOP_VINFO_PEELING_HTAB (loop_vinfo).find (&elem);
  if (slot)
    slot->count++;
  else
    {
      slot = XNEW (struct _vect_peel_info);
      slot->npeel = npeel;
      slot->dr = dr;
      slot->count = 1;
      new_slot = LOOP_VINFO_PEELING_HTAB (loop_vinfo).find_slot (slot, INSERT);
      *new_slot = slot;
    }

  if (!supportable_dr_alignment && unlimited_cost_model ())
    slot->count += VECT_MAX_COST;
}
/* Traverse peeling hash table to find peeling option that aligns maximum
   number of data accesses.  */

int
vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
                                     _vect_peel_extended_info *max)
{
  vect_peel_info elem = *slot;

  if (elem->count > max->peel_info.count
      || (elem->count == max->peel_info.count
          && max->peel_info.npeel > elem->npeel))
    {
      max->peel_info.npeel = elem->npeel;
      max->peel_info.count = elem->count;
      max->peel_info.dr = elem->dr;
    }

  return 1;
}
/* Traverse peeling hash table and calculate cost for each peeling option.
   Find the one with the lowest cost.  */

int
vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
                                   _vect_peel_extended_info *min)
{
  vect_peel_info elem = *slot;
  int save_misalignment, dummy;
  unsigned int inside_cost = 0, outside_cost = 0, i;
  gimple stmt = DR_STMT (elem->dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
  struct data_reference *dr;
  stmt_vector_for_cost prologue_cost_vec, body_cost_vec, epilogue_cost_vec;
  int single_iter_cost;

  prologue_cost_vec.create (2);
  body_cost_vec.create (2);
  epilogue_cost_vec.create (2);

  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      stmt = DR_STMT (dr);
      stmt_info = vinfo_for_stmt (stmt);
      /* For interleaving, only the alignment of the first access
         matters.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
          && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
        continue;

      save_misalignment = DR_MISALIGNMENT (dr);
      vect_update_misalignment_for_peel (dr, elem->dr, elem->npeel);
      vect_get_data_access_cost (dr, &inside_cost, &outside_cost,
                                 &body_cost_vec);
      SET_DR_MISALIGNMENT (dr, save_misalignment);
    }

  single_iter_cost = vect_get_single_scalar_iteration_cost (loop_vinfo);
  outside_cost += vect_get_known_peeling_cost (loop_vinfo, elem->npeel,
                                               &dummy, single_iter_cost,
                                               &prologue_cost_vec,
                                               &epilogue_cost_vec);

  /* Prologue and epilogue costs are added to the target model later.
     These costs depend only on the scalar iteration cost, the
     number of peeling iterations finally chosen, and the number of
     misaligned statements.  So discard the information found here.  */
  prologue_cost_vec.release ();
  epilogue_cost_vec.release ();

  if (inside_cost < min->inside_cost
      || (inside_cost == min->inside_cost && outside_cost < min->outside_cost))
    {
      min->inside_cost = inside_cost;
      min->outside_cost = outside_cost;
      min->body_cost_vec.release ();
      min->body_cost_vec = body_cost_vec;
      min->peel_info.dr = elem->dr;
      min->peel_info.npeel = elem->npeel;
    }
  else
    body_cost_vec.release ();

  return 1;
}
/* Choose best peeling option by traversing peeling hash table and either
   choosing an option with the lowest cost (if cost model is enabled) or the
   option that aligns as many accesses as possible.  */

static struct data_reference *
vect_peeling_hash_choose_best_peeling (loop_vec_info loop_vinfo,
                                       unsigned int *npeel,
                                       stmt_vector_for_cost *body_cost_vec)
{
   struct _vect_peel_extended_info res;

   res.peel_info.dr = NULL;
   res.body_cost_vec = stmt_vector_for_cost ();

   if (!unlimited_cost_model ())
     {
       res.inside_cost = INT_MAX;
       res.outside_cost = INT_MAX;
       LOOP_VINFO_PEELING_HTAB (loop_vinfo)
           .traverse <_vect_peel_extended_info *,
                      vect_peeling_hash_get_lowest_cost> (&res);
     }
   else
     {
       res.peel_info.count = 0;
       LOOP_VINFO_PEELING_HTAB (loop_vinfo)
           .traverse <_vect_peel_extended_info *,
                      vect_peeling_hash_get_most_frequent> (&res);
     }

   *npeel = res.peel_info.npeel;
   *body_cost_vec = res.body_cost_vec;
   return res.peel_info.dr;
}
/* Function vect_enhance_data_refs_alignment

   This pass will use loop versioning and loop peeling in order to enhance
   the alignment of data references in the loop.

   FOR NOW: we assume that whatever versioning/peeling takes place, only the
   original loop is to be vectorized.  Any other loops that are created by
   the transformations performed in this pass - are not supposed to be
   vectorized.  This restriction will be relaxed.

   This pass will require a cost model to guide it whether to apply peeling
   or versioning or a combination of the two.  For example, the scheme that
   Intel uses when given a loop with several memory accesses, is as follows:
   choose one memory access ('p') whose alignment you want to force by doing
   peeling.  Then, either (1) generate a loop in which 'p' is aligned and all
   other accesses are not necessarily aligned, or (2) use loop versioning to
   generate one loop in which all accesses are aligned, and another loop in
   which only 'p' is necessarily aligned.

   ("Automatic Intra-Register Vectorization for the Intel Architecture",
   Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
   Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)

   Devising a cost model is the most critical aspect of this work.  It will
   guide us on which access to peel for, whether to use loop versioning, how
   many versions to create, etc.  The cost model will probably consist of
   generic considerations as well as target specific considerations (on
   powerpc for example, misaligned stores are more painful than misaligned
   loads).

   Here are the general steps involved in alignment enhancements:

     -- original loop, before alignment analysis:
        for (i=0; i<N; i++){
          x = q[i];                     # DR_MISALIGNMENT(q) = unknown
          p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
        }

     -- After vect_compute_data_refs_alignment:
        for (i=0; i<N; i++){
          x = q[i];                     # DR_MISALIGNMENT(q) = 3
          p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
        }

     -- Possibility 1: we do loop versioning:
     if (p is aligned) {
        for (i=0; i<N; i++){    # loop 1A
          x = q[i];                     # DR_MISALIGNMENT(q) = 3
          p[i] = y;                     # DR_MISALIGNMENT(p) = 0
        }
     }
     else {
        for (i=0; i<N; i++){    # loop 1B
          x = q[i];                     # DR_MISALIGNMENT(q) = 3
          p[i] = y;                     # DR_MISALIGNMENT(p) = unaligned
        }
     }

     -- Possibility 2: we do loop peeling:
     for (i = 0; i < 3; i++){   # (scalar loop, not to be vectorized).
        x = q[i];
        p[i] = y;
     }
     for (i = 3; i < N; i++){   # loop 2A
        x = q[i];                       # DR_MISALIGNMENT(q) = 0
        p[i] = y;                       # DR_MISALIGNMENT(p) = unknown
     }

     -- Possibility 3: combination of loop peeling and versioning:
     for (i = 0; i < 3; i++){   # (scalar loop, not to be vectorized).
        x = q[i];
        p[i] = y;
     }
     if (p is aligned) {
        for (i = 3; i<N; i++){  # loop 3A
          x = q[i];                     # DR_MISALIGNMENT(q) = 0
          p[i] = y;                     # DR_MISALIGNMENT(p) = 0
        }
     }
     else {
        for (i = 3; i<N; i++){  # loop 3B
          x = q[i];                     # DR_MISALIGNMENT(q) = 0
          p[i] = y;                     # DR_MISALIGNMENT(p) = unaligned
        }
     }

     These loops are later passed to loop_transform to be vectorized.  The
     vectorizer will use the alignment information to guide the transformation
     (whether to generate regular loads/stores, or with special handling for
     misalignment).  */

bool
vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
{
  vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  enum dr_alignment_support supportable_dr_alignment;
  struct data_reference *dr0 = NULL, *first_store = NULL;
  struct data_reference *dr;
  unsigned int i, j;
  bool do_peeling = false;
  bool do_versioning = false;
  bool stat;
  gimple stmt;
  stmt_vec_info stmt_info;
  unsigned int npeel = 0;
  bool all_misalignments_unknown = true;
  unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  unsigned possible_npeel_number = 1;
  tree vectype;
  unsigned int nelements, mis, same_align_drs_max = 0;
  stmt_vector_for_cost body_cost_vec = stmt_vector_for_cost ();

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "=== vect_enhance_data_refs_alignment ===\n");

  /* While cost model enhancements are expected in the future, the high level
     view of the code at this time is as follows:

     A) If there is a misaligned access then see if peeling to align
        this access can make all data references satisfy
        vect_supportable_dr_alignment.  If so, update data structures
        as needed and return true.

     B) If peeling wasn't possible and there is a data reference with an
        unknown misalignment that does not satisfy vect_supportable_dr_alignment
        then see if loop versioning checks can be used to make all data
        references satisfy vect_supportable_dr_alignment.  If so, update
        data structures as needed and return true.

     C) If neither peeling nor versioning were successful then return false if
        any data reference does not satisfy vect_supportable_dr_alignment.

     D) Return true (all data references satisfy vect_supportable_dr_alignment).

     Note, Possibility 3 above (which is peeling and versioning together) is not
     being done at this time.  */

  /* (1) Peeling to force alignment.  */

  /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
     Considerations:
     + How many accesses will become aligned due to the peeling
     - How many accesses will become unaligned due to the peeling,
       and the cost of misaligned accesses.
     - The cost of peeling (the extra runtime checks, the increase
       in code size).  */
  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      stmt = DR_STMT (dr);
      stmt_info = vinfo_for_stmt (stmt);

      if (!STMT_VINFO_RELEVANT_P (stmt_info))
        continue;

      /* For interleaving, only the alignment of the first access
         matters.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
          && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
        continue;

      /* For invariant accesses there is nothing to enhance.  */
      if (integer_zerop (DR_STEP (dr)))
        continue;

      /* Strided loads perform only component accesses, alignment is
         irrelevant for them.  */
      if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
        continue;

      supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
      do_peeling = vector_alignment_reachable_p (dr);
      if (do_peeling)
        {
          if (known_alignment_for_access_p (dr))
            {
              unsigned int npeel_tmp;
              bool negative = tree_int_cst_compare (DR_STEP (dr),
                                                    size_zero_node) < 0;

              /* Save info about DR in the hash table.  */
              if (!LOOP_VINFO_PEELING_HTAB (loop_vinfo).is_created ())
                LOOP_VINFO_PEELING_HTAB (loop_vinfo).create (1);

              vectype = STMT_VINFO_VECTYPE (stmt_info);
              nelements = TYPE_VECTOR_SUBPARTS (vectype);
              mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
                                                TREE_TYPE (DR_REF (dr))));
              npeel_tmp = (negative
                           ? (mis - nelements) : (nelements - mis))
                  & (nelements - 1);

              /* For multiple types, it is possible that the bigger type access
                 will have more than one peeling option.  E.g., a loop with two
                 types: one of size (vector size / 4), and the other one of
                 size (vector size / 8).  Vectorization factor will be 8.  If both
                 accesses are misaligned by 3, the first one needs one scalar
                 iteration to be aligned, and the second one needs 5.  But the
                 first one will be aligned also by peeling 5 scalar
                 iterations, and in that case both accesses will be aligned.
                 Hence, except for the immediate peeling amount, we also want
                 to try to add full vector size, while we don't exceed
                 vectorization factor.
                 We do this automatically for cost model, since we calculate cost
                 for every peeling option.  */
              if (unlimited_cost_model ())
                possible_npeel_number = vf / nelements;

              /* Handle the aligned case. We may decide to align some other
                 access, making DR unaligned.  */
              if (DR_MISALIGNMENT (dr) == 0)
                {
                  npeel_tmp = 0;
                  if (unlimited_cost_model ())
                    possible_npeel_number++;
                }

              for (j = 0; j < possible_npeel_number; j++)
                {
                  gcc_assert (npeel_tmp <= vf);
                  vect_peeling_hash_insert (loop_vinfo, dr, npeel_tmp);
                  npeel_tmp += nelements;
                }

              all_misalignments_unknown = false;
              /* Data-ref that was chosen for the case that all the
                 misalignments are unknown is not relevant anymore, since we
                 have a data-ref with known alignment.  */
              dr0 = NULL;
            }
          else
            {
              /* If we don't know any misalignment values, we prefer
                 peeling for data-ref that has the maximum number of data-refs
                 with the same alignment, unless the target prefers to align
                 stores over load.  */
              if (all_misalignments_unknown)
                {
                  unsigned same_align_drs
                    = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
                  if (!dr0
                      || same_align_drs_max < same_align_drs)
                    {
                      same_align_drs_max = same_align_drs;
                      dr0 = dr;
                    }
                  /* For data-refs with the same number of related
                     accesses prefer the one where the misalign
                     computation will be invariant in the outermost loop.  */
                  else if (same_align_drs_max == same_align_drs)
                    {
                      struct loop *ivloop0, *ivloop;
                      ivloop0 = outermost_invariant_loop_for_expr
                          (loop, DR_BASE_ADDRESS (dr0));
                      ivloop = outermost_invariant_loop_for_expr
                          (loop, DR_BASE_ADDRESS (dr));
                      if ((ivloop && !ivloop0)
                          || (ivloop && ivloop0
                              && flow_loop_nested_p (ivloop, ivloop0)))
                        dr0 = dr;
                    }

                  if (!first_store && DR_IS_WRITE (dr))
                    first_store = dr;
                }

              /* If there are both known and unknown misaligned accesses in the
                 loop, we choose peeling amount according to the known
                 accesses.  */
              if (!supportable_dr_alignment)
                {
                  dr0 = dr;
                  if (!first_store && DR_IS_WRITE (dr))
                    first_store = dr;
                }
            }
        }
      else
        {
          if (!aligned_access_p (dr))
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "vector alignment may not be reachable\n");
              break;
            }
        }
    }

  /* Check if we can possibly peel the loop.  */
  if (!vect_can_advance_ivs_p (loop_vinfo)
      || !slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
    do_peeling = false;
  if (do_peeling && all_misalignments_unknown
      && vect_supportable_dr_alignment (dr0, false))
    {
      /* Check if the target requires to prefer stores over loads, i.e., if
         misaligned stores are more expensive than misaligned loads (taking
         drs with same alignment into account).  */
      if (first_store && DR_IS_READ (dr0))
        {
          unsigned int load_inside_cost = 0, load_outside_cost = 0;
          unsigned int store_inside_cost = 0, store_outside_cost = 0;
          unsigned int load_inside_penalty = 0, load_outside_penalty = 0;
          unsigned int store_inside_penalty = 0, store_outside_penalty = 0;
          stmt_vector_for_cost dummy;
          dummy.create (2);

          vect_get_data_access_cost (dr0, &load_inside_cost, &load_outside_cost,
                                     &dummy);
          vect_get_data_access_cost (first_store, &store_inside_cost,
                                     &store_outside_cost, &dummy);

          dummy.release ();

          /* Calculate the penalty for leaving FIRST_STORE unaligned (by
             aligning the load DR0).  */
          load_inside_penalty = store_inside_cost;
          load_outside_penalty = store_outside_cost;
          for (i = 0;
               STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
                          DR_STMT (first_store))).iterate (i, &dr);
               i++)
            if (DR_IS_READ (dr))
              {
                load_inside_penalty += load_inside_cost;
                load_outside_penalty += load_outside_cost;
              }
            else
              {
                load_inside_penalty += store_inside_cost;
                load_outside_penalty += store_outside_cost;
              }

          /* Calculate the penalty for leaving DR0 unaligned (by
             aligning the FIRST_STORE).  */
          store_inside_penalty = load_inside_cost;
          store_outside_penalty = load_outside_cost;
          for (i = 0;
               STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
                      DR_STMT (dr0))).iterate (i, &dr);
               i++)
            if (DR_IS_READ (dr))
              {
                store_inside_penalty += load_inside_cost;
                store_outside_penalty += load_outside_cost;
              }
            else
              {
                store_inside_penalty += store_inside_cost;
                store_outside_penalty += store_outside_cost;
              }

          if (load_inside_penalty > store_inside_penalty
              || (load_inside_penalty == store_inside_penalty
                  && load_outside_penalty > store_outside_penalty))
            dr0 = first_store;
        }

      /* In case there are only loads with different unknown misalignments, use
         peeling only if it may help to align other accesses in the loop.  */
      if (!first_store
          && !STMT_VINFO_SAME_ALIGN_REFS (
                  vinfo_for_stmt (DR_STMT (dr0))).length ()
          && vect_supportable_dr_alignment (dr0, false)
              != dr_unaligned_supported)
        do_peeling = false;
    }
  if (do_peeling && !dr0)
    {
      /* Peeling is possible, but there is no data access that is not supported
         unless aligned.  So we try to choose the best possible peeling.  */

      /* We should get here only if there are drs with known misalignment.  */
      gcc_assert (!all_misalignments_unknown);

      /* Choose the best peeling from the hash table.  */
      dr0 = vect_peeling_hash_choose_best_peeling (loop_vinfo, &npeel,
                                                   &body_cost_vec);
      if (!dr0 || !npeel)
        do_peeling = false;
    }

  if (do_peeling)
    {
      stmt = DR_STMT (dr0);
      stmt_info = vinfo_for_stmt (stmt);
      vectype = STMT_VINFO_VECTYPE (stmt_info);
      nelements = TYPE_VECTOR_SUBPARTS (vectype);

      if (known_alignment_for_access_p (dr0))
        {
          bool negative = tree_int_cst_compare (DR_STEP (dr0),
                                                size_zero_node) < 0;
          if (!npeel)
            {
              /* Since it's known at compile time, compute the number of
                 iterations in the peeled loop (the peeling factor) for use in
                 updating DR_MISALIGNMENT values.  The peeling factor is the
                 vectorization factor minus the misalignment as an element
                 count.  */
              mis = DR_MISALIGNMENT (dr0);
              mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
              npeel = ((negative ? mis - nelements : nelements - mis)
                       & (nelements - 1));
            }

          /* For interleaved data access every iteration accesses all the
             members of the group, therefore we divide the number of iterations
             by the group size.  */
          stmt_info = vinfo_for_stmt (DR_STMT (dr0));
          if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
            npeel /= GROUP_SIZE (stmt_info);

          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "Try peeling by %d\n", npeel);
        }

      /* Ensure that all data refs can be vectorized after the peel.  */
      FOR_EACH_VEC_ELT (datarefs, i, dr)
        {
          int save_misalignment;

          if (dr == dr0)
            continue;

          stmt = DR_STMT (dr);
          stmt_info = vinfo_for_stmt (stmt);
          /* For interleaving, only the alignment of the first access
             matters.  */
          if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
              && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
            continue;

          /* Strided loads perform only component accesses, alignment is
             irrelevant for them.  */
          if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
            continue;

          save_misalignment = DR_MISALIGNMENT (dr);
          vect_update_misalignment_for_peel (dr, dr0, npeel);
          supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
          SET_DR_MISALIGNMENT (dr, save_misalignment);

          if (!supportable_dr_alignment)
            {
              do_peeling = false;
              break;
            }
        }

      if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
        {
          stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
          if (!stat)
            do_peeling = false;
          else
            {
              body_cost_vec.release ();
              return stat;
            }
        }

      if (do_peeling)
        {
          unsigned max_allowed_peel
            = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
          if (max_allowed_peel != (unsigned)-1)
            {
              unsigned max_peel = npeel;
              if (max_peel == 0)
                {
                  gimple dr_stmt = DR_STMT (dr0);
                  stmt_vec_info vinfo = vinfo_for_stmt (dr_stmt);
                  tree vtype = STMT_VINFO_VECTYPE (vinfo);
                  max_peel = TYPE_VECTOR_SUBPARTS (vtype) - 1;
                }
              if (max_peel > max_allowed_peel)
                {
                  do_peeling = false;
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_NOTE, vect_location,
                        "Disable peeling, max peels reached: %d\n", max_peel);
                }
            }
        }

      if (do_peeling)
        {
          stmt_info_for_cost *si;
          void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);

          /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
             If the misalignment of DR_i is identical to that of dr0 then set
             DR_MISALIGNMENT (DR_i) to zero.  If the misalignment of DR_i and
             dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
             by the peeling factor times the element size of DR_i (MOD the
             vectorization factor times the size).  Otherwise, the
             misalignment of DR_i must be set to unknown.  */
          FOR_EACH_VEC_ELT (datarefs, i, dr)
            if (dr != dr0)
              vect_update_misalignment_for_peel (dr, dr0, npeel);

          LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
          if (npeel)
            LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
          else
            LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) = DR_MISALIGNMENT (dr0);
          SET_DR_MISALIGNMENT (dr0, 0);
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_NOTE, vect_location,
                               "Alignment of access forced using peeling.\n");
              dump_printf_loc (MSG_NOTE, vect_location,
                               "Peeling for alignment will be applied.\n");
            }
          /* We've delayed passing the inside-loop peeling costs to the
             target cost model until we were sure peeling would happen.
             Do so now.  */
          if (body_cost_vec.exists ())
            {
              FOR_EACH_VEC_ELT (body_cost_vec, i, si)
                {
                  struct _stmt_vec_info *stmt_info
                    = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
                  (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
                                        si->misalign, vect_body);
                }
              body_cost_vec.release ();
            }

          stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
          gcc_assert (stat);
          return stat;
        }
    }

  body_cost_vec.release ();
  /* (2) Versioning to force alignment.  */

  /* Try versioning if:
     1) optimize loop for speed
     2) there is at least one unsupported misaligned data ref with an unknown
        misalignment, and
     3) all misaligned data refs with a known misalignment are supported, and
     4) the number of runtime alignment checks is within reason.  */

      optimize_loop_nest_for_speed_p (loop)
      && (!loop->inner); /* FORNOW */

      FOR_EACH_VEC_ELT (datarefs, i, dr)
          stmt = DR_STMT (dr);
          stmt_info = vinfo_for_stmt (stmt);

          /* For interleaving, only the alignment of the first access
             matters.  */
          if (aligned_access_p (dr)
              || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
                  && GROUP_FIRST_ELEMENT (stmt_info) != stmt))

          /* Strided loads perform only component accesses, alignment is
             irrelevant for them.  */
          if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))

          supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);

          if (!supportable_dr_alignment)

              if (known_alignment_for_access_p (dr)
                  || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
                     >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
                  do_versioning = false;

              stmt = DR_STMT (dr);
              vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
              gcc_assert (vectype);

              /* The rightmost bits of an aligned address must be zeros.
                 Construct the mask needed for this test.  For example,
                 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
                 mask must be 15 = 0xf.  */
              mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;

              /* FORNOW: use the same mask to test all potentially unaligned
                 references in the loop.  The vectorizer currently supports
                 a single vector size, see the reference to
                 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
                 vectorization factor is computed.  */
              gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
                          || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
              LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
              LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (

  /* Versioning requires at least one misaligned data reference.  */
  if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
    do_versioning = false;
  else if (!do_versioning)
    LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);

      vec<gimple> may_misalign_stmts
        = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);

      /* It can now be assumed that the data references in the statements
         in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
         of the loop being vectorized.  */
      FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
          stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
          dr = STMT_VINFO_DATA_REF (stmt_info);
          SET_DR_MISALIGNMENT (dr, 0);
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "Alignment of access forced using versioning.\n");

      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "Versioning for alignment will be applied.\n");

      /* Peeling and versioning can't be done together at this time.  */
      gcc_assert (! (do_peeling && do_versioning));

      stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);

  /* This point is reached if neither peeling nor versioning is being done.  */
  gcc_assert (! (do_peeling || do_versioning));

  stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
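
/* To illustrate what the versioning decided on above produces at run time
   (a sketch only, not code emitted by this function): the statements pushed
   onto LOOP_VINFO_MAY_MISALIGN_STMTS get their first addresses tested
   against LOOP_VINFO_PTR_MASK before entering the vectorized copy of the
   loop.  The helper below is hypothetical; addr1/addr2 stand for those
   addresses and mask for GET_MODE_SIZE (vector mode) - 1:

     #include <stdint.h>

     static int
     vect_all_aligned_p (uintptr_t addr1, uintptr_t addr2, uintptr_t mask)
     {
       // Aligned addresses have all of the mask bits clear.
       return ((addr1 | addr2) & mask) == 0;
     }

   When the check fails at run time, the original (unvectorized) copy of the
   loop is executed instead.  */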
/* Function vect_find_same_alignment_drs.

   Update group and alignment relations according to the chosen
   vectorization factor.  */

vect_find_same_alignment_drs (struct data_dependence_relation *ddr,
                              loop_vec_info loop_vinfo)
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  struct data_reference *dra = DDR_A (ddr);
  struct data_reference *drb = DDR_B (ddr);
  stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
  stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
  int dra_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra))));
  int drb_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb))));
  lambda_vector dist_v;
  unsigned int loop_depth;

  if (DDR_ARE_DEPENDENT (ddr) == chrec_known)

  if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)

  /* Loop-based vectorization and known data dependence.  */
  if (DDR_NUM_DIST_VECTS (ddr) == 0)

  /* Data-dependence analysis reports a distance vector of zero
     for data-references that overlap only in the first iteration
     but have different sign step (see PR45764).
     So as a sanity check require equal DR_STEP.  */
  if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))

  loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
  FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
      int dist = dist_v[loop_depth];

      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "dependence distance = %d.\n", dist);

      /* Same loop iteration.  */
          || (dist % vectorization_factor == 0 && dra_size == drb_size))
          /* Two references with distance zero have the same alignment.  */
          STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
          STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
          if (dump_enabled_p ())
              dump_printf_loc (MSG_NOTE, vect_location,
                               "accesses have the same alignment.\n");
              dump_printf (MSG_NOTE,
                           "dependence distance modulo vf == 0 between ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
              dump_printf (MSG_NOTE, " and ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
              dump_printf (MSG_NOTE, "\n");
/* Function vect_analyze_data_refs_alignment

   Analyze the alignment of the data-references in the loop.
   Return FALSE if a data reference is found that cannot be vectorized.  */

vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo,
                                  bb_vec_info bb_vinfo)
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "=== vect_analyze_data_refs_alignment ===\n");

  /* Mark groups of data references with same alignment using
     data dependence information.  */

      vec<ddr_p> ddrs = LOOP_VINFO_DDRS (loop_vinfo);
      struct data_dependence_relation *ddr;

      FOR_EACH_VEC_ELT (ddrs, i, ddr)
        vect_find_same_alignment_drs (ddr, loop_vinfo);

  if (!vect_compute_data_refs_alignment (loop_vinfo, bb_vinfo))
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: can't calculate alignment "
/* Analyze groups of accesses: check that DR belongs to a group of
   accesses of legal size, step, etc.  Detect gaps, single element
   interleaving, and other special cases.  Set grouped access info.
   Collect groups of strided stores for further use in SLP analysis.  */

vect_analyze_group_access (struct data_reference *dr)
  tree step = DR_STEP (dr);
  tree scalar_type = TREE_TYPE (DR_REF (dr));
  HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
  gimple stmt = DR_STMT (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
  HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
  HOST_WIDE_INT groupsize, last_accessed_element = 1;
  bool slp_impossible = false;
  struct loop *loop = NULL;

    loop = LOOP_VINFO_LOOP (loop_vinfo);

  /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
     size of the interleaving group (including gaps).  */
  groupsize = absu_hwi (dr_step) / type_size;

  /* Not consecutive access is possible only if it is a part of interleaving.  */
  if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
      /* Check if this DR is a part of interleaving, and is a single
         element of the group that is accessed in the loop.  */

      /* Gaps are supported only for loads.  STEP must be a multiple of the type
         size.  The size of the group must be a power of 2.  */
          && (dr_step % type_size) == 0
          && exact_log2 (groupsize) != -1)
          GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
          GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
          if (dump_enabled_p ())
              dump_printf_loc (MSG_NOTE, vect_location,
                               "Detected single element interleaving ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
              dump_printf (MSG_NOTE, " step ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
              dump_printf (MSG_NOTE, "\n");

              if (dump_enabled_p ())
                dump_printf_loc (MSG_NOTE, vect_location,
                                 "Data access with gaps requires scalar "

                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "Peeling for outer loop is not"

              LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;

      if (dump_enabled_p ())
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "not consecutive access ");
          dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");

      /* Mark the statement as unvectorizable.  */
      STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;

  if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
      /* First stmt in the interleaving chain.  Check the chain.  */
      gimple next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
      struct data_reference *data_ref = dr;
      unsigned int count = 1;
      tree prev_init = DR_INIT (data_ref);
      HOST_WIDE_INT diff, gaps = 0;
      unsigned HOST_WIDE_INT count_in_bytes;

          /* Skip same data-refs.  In case that two or more stmts share
             data-ref (supported only for loads), we vectorize only the first
             stmt, and the rest get their vectorized loads from the first
             stmt.  */
          if (!tree_int_cst_compare (DR_INIT (data_ref),
                                     DR_INIT (STMT_VINFO_DATA_REF (
                                                vinfo_for_stmt (next)))))
              if (DR_IS_WRITE (data_ref))
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "Two store stmts share the same dr.\n");

              /* For load use the same data-ref load.  */
              GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;

              next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));

          data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));

          /* All group members have the same STEP by construction.  */
          gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));

          /* Check that the distance between two accesses is equal to the type
             size.  Otherwise, we have gaps.  */
          diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
                  - TREE_INT_CST_LOW (prev_init)) / type_size;

              /* FORNOW: SLP of accesses with gaps is not supported.  */
              slp_impossible = true;
              if (DR_IS_WRITE (data_ref))
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "interleaved store with gaps\n");

          last_accessed_element += diff;

          /* Store the gap from the previous member of the group.  If there is
             no gap in the access, GROUP_GAP is always 1.  */
          GROUP_GAP (vinfo_for_stmt (next)) = diff;

          prev_init = DR_INIT (data_ref);
          next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
          /* Count the number of data-refs in the chain.  */

      /* COUNT is the number of accesses found, we multiply it by the size of
         the type to get COUNT_IN_BYTES.  */
      count_in_bytes = type_size * count;

      /* Check that the size of the interleaving (including gaps) is not
         greater than STEP.  */
          && absu_hwi (dr_step) < count_in_bytes + gaps * type_size)
          if (dump_enabled_p ())
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "interleaving size is greater than step for ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");

      /* Check that the size of the interleaving is equal to STEP for stores,
         i.e., that there are no gaps.  */
          && absu_hwi (dr_step) != count_in_bytes)
          if (DR_IS_READ (dr))
              slp_impossible = true;
              /* There is a gap after the last load in the group.  This gap is a
                 difference between the groupsize and the number of elements.
                 When there is no gap, this difference should be 0.  */
              GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - count;

              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "interleaved store with gaps\n");

      /* Check that STEP is a multiple of type size.  */
          && (dr_step % type_size) != 0)
          if (dump_enabled_p ())
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "step is not a multiple of type size: step ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, step);
              dump_printf (MSG_MISSED_OPTIMIZATION, " size ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                 TYPE_SIZE_UNIT (scalar_type));
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");

      GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "Detected interleaving of size %d\n", (int)groupsize);

      /* SLP: create an SLP data structure for every interleaving group of
         stores for further analysis in vect_analyse_slp.  */
      if (DR_IS_WRITE (dr) && !slp_impossible)
          LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
          BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);

      /* There is a gap at the end of the group.  */
      if (groupsize - last_accessed_element > 0 && loop_vinfo)
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "Data access with gaps requires scalar "

              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Peeling for outer loop is not supported\n");

          LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
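
/* A small worked example of the bookkeeping above (illustrative only, plain
   integers instead of trees): for two int loads A[4*i] and A[4*i + 2] the
   step is 16 bytes and type_size is 4, so

     static void
     group_access_example (void)
     {
       long type_size = 4, dr_step = 16;
       long groupsize = dr_step / type_size;      // 4 elements per iteration
       long init_first = 0, init_second = 8;      // DR_INITs in bytes
       long diff = (init_second - init_first) / type_size;  // 2 => a gap
       long count = 2;
       long gap_at_end = groupsize - count;       // recorded for load groups
       (void) diff; (void) gap_at_end;
     }

   matching GROUP_SIZE = 4, GROUP_GAP of the second member = 2 and, for a
   load group, GROUP_GAP of the first stmt = groupsize - count.  */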
/* Analyze the access pattern of the data-reference DR.
   In case of non-consecutive accesses call vect_analyze_group_access() to
   analyze groups of accesses.  */

vect_analyze_data_ref_access (struct data_reference *dr)
  tree step = DR_STEP (dr);
  tree scalar_type = TREE_TYPE (DR_REF (dr));
  gimple stmt = DR_STMT (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = NULL;

    loop = LOOP_VINFO_LOOP (loop_vinfo);

  if (loop_vinfo && !step)
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "bad data-ref access in loop\n");

  /* Allow invariant loads in non-nested loops.  */
  if (loop_vinfo && integer_zerop (step))
      GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
      if (nested_in_vect_loop_p (loop, stmt))
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "zero step in inner loop of nest\n");

      return DR_IS_READ (dr);

  if (loop && nested_in_vect_loop_p (loop, stmt))
      /* Interleaved accesses are not yet supported within outer-loop
         vectorization for references in the inner-loop.  */
      GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;

      /* For the rest of the analysis we use the outer-loop step.  */
      step = STMT_VINFO_DR_STEP (stmt_info);
      if (integer_zerop (step))
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "zero step in outer loop.\n");
          if (DR_IS_READ (dr))

  if (TREE_CODE (step) == INTEGER_CST)
      HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
      if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
              && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
          /* Mark that it is not interleaving.  */
          GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;

  if (loop && nested_in_vect_loop_p (loop, stmt))
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "grouped access in outer loop.\n");

  /* Assume this is a DR handled by the non-constant strided load case.  */
  if (TREE_CODE (step) != INTEGER_CST)
    return STMT_VINFO_STRIDE_LOAD_P (stmt_info);

  /* Not consecutive access - check if it's a part of an interleaving group.  */
  return vect_analyze_group_access (dr);
/* A helper function used in the comparator function to sort data
   references.  T1 and T2 are two data references to be compared.
   The function returns -1, 0, or 1.  */

compare_tree (tree t1, tree t2)
  enum tree_code code;

  if (TREE_CODE (t1) != TREE_CODE (t2))
    return TREE_CODE (t1) < TREE_CODE (t2) ? -1 : 1;

  code = TREE_CODE (t1);

    /* For const values, we can just use hash values for comparisons.  */
        hashval_t h1 = iterative_hash_expr (t1, 0);
        hashval_t h2 = iterative_hash_expr (t2, 0);
          return h1 < h2 ? -1 : 1;

      cmp = compare_tree (SSA_NAME_VAR (t1), SSA_NAME_VAR (t2));

      if (SSA_NAME_VERSION (t1) != SSA_NAME_VERSION (t2))
        return SSA_NAME_VERSION (t1) < SSA_NAME_VERSION (t2) ? -1 : 1;

      tclass = TREE_CODE_CLASS (code);

      /* For var-decl, we could compare their UIDs.  */
      if (tclass == tcc_declaration)
          if (DECL_UID (t1) != DECL_UID (t2))
            return DECL_UID (t1) < DECL_UID (t2) ? -1 : 1;

      /* For expressions with operands, compare their operands recursively.  */
      for (i = TREE_OPERAND_LENGTH (t1) - 1; i >= 0; --i)
          cmp = compare_tree (TREE_OPERAND (t1, i), TREE_OPERAND (t2, i));
/* Compare two data-references DRA and DRB to group them into chunks
   suitable for grouping.  */

dr_group_sort_cmp (const void *dra_, const void *drb_)
  data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
  data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);

  /* Stabilize sort.  */

  /* Ordering of DRs according to base.  */
  if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0))
      cmp = compare_tree (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb));

  /* And according to DR_OFFSET.  */
  if (!dr_equal_offsets_p (dra, drb))
      cmp = compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));

  /* Put reads before writes.  */
  if (DR_IS_READ (dra) != DR_IS_READ (drb))
    return DR_IS_READ (dra) ? -1 : 1;

  /* Then sort by access size.  */
  if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
                        TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0))
      cmp = compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
                          TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));

  /* And by step.  */
  if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
      cmp = compare_tree (DR_STEP (dra), DR_STEP (drb));

  /* Then sort by DR_INIT.  In case of identical DRs sort by stmt UID.  */
  cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb));

  return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
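
/* The comparator above is intended for qsort over the dataref array, as in
   the call that follows below:

     qsort (datarefs.address (), datarefs.length (),
            sizeof (data_reference_p), dr_group_sort_cmp);

   qsort is not stable, which is why the final tie-break on gimple_uid is
   needed to keep otherwise identical data refs in statement order.  */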
/* Function vect_analyze_data_ref_accesses.

   Analyze the access pattern of all the data references in the loop.

   FORNOW: the only access pattern that is considered vectorizable is a
           simple step 1 (consecutive) access.

   FORNOW: handle only arrays and pointer accesses.  */

vect_analyze_data_ref_accesses (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
  vec<data_reference_p> datarefs;
  struct data_reference *dr;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "=== vect_analyze_data_ref_accesses ===\n");

    datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
    datarefs = BB_VINFO_DATAREFS (bb_vinfo);

  if (datarefs.is_empty ())

  /* Sort the array of datarefs to make building the interleaving chains
     linear.  */
  qsort (datarefs.address (), datarefs.length (),
         sizeof (data_reference_p), dr_group_sort_cmp);

  /* Build the interleaving chains.  */
  for (i = 0; i < datarefs.length () - 1;)
      data_reference_p dra = datarefs[i];
      stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
      stmt_vec_info lastinfo = NULL;
      for (i = i + 1; i < datarefs.length (); ++i)
          data_reference_p drb = datarefs[i];
          stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));

          /* ??? Imperfect sorting (non-compatible types, non-modulo
             accesses, same accesses) can lead to a group being artificially
             split here as we don't just skip over those.  If it really
             matters we can push those to a worklist and re-iterate
             over them.  Then we can just skip ahead to the next DR here.  */

          /* Check that the data-refs have the same first location (except init)
             and they are both either stores or loads (not a load and a store).  */
          if (DR_IS_READ (dra) != DR_IS_READ (drb)
              || !operand_equal_p (DR_BASE_ADDRESS (dra),
                                   DR_BASE_ADDRESS (drb), 0)
              || !dr_equal_offsets_p (dra, drb))

          /* Check that the data-refs have the same constant size and step.  */
          tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
          tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
          if (!host_integerp (sza, 1)
              || !host_integerp (szb, 1)
              || !tree_int_cst_equal (sza, szb)
              || !host_integerp (DR_STEP (dra), 0)
              || !host_integerp (DR_STEP (drb), 0)
              || !tree_int_cst_equal (DR_STEP (dra), DR_STEP (drb)))

          /* Do not place the same access in the interleaving chain twice.  */
          if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0)

          /* Check the types are compatible.
             ??? We don't distinguish this during sorting.  */
          if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
                                   TREE_TYPE (DR_REF (drb))))

          /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb).  */
          HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
          HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
          gcc_assert (init_a < init_b);

          /* If init_b == init_a + the size of the type * k, we have an
             interleaving, and DRA is accessed before DRB.  */
          HOST_WIDE_INT type_size_a = TREE_INT_CST_LOW (sza);
          if ((init_b - init_a) % type_size_a != 0)

          /* The step (if not zero) is greater than the difference between
             data-refs' inits.  This splits groups into suitable sizes.  */
          HOST_WIDE_INT step = TREE_INT_CST_LOW (DR_STEP (dra));
          if (step != 0 && step <= (init_b - init_a))

          if (dump_enabled_p ())
              dump_printf_loc (MSG_NOTE, vect_location,
                               "Detected interleaving ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
              dump_printf (MSG_NOTE, " and ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
              dump_printf (MSG_NOTE, "\n");

          /* Link the found element into the group list.  */
          if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
              GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
              lastinfo = stmtinfo_a;
          GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
          GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
          lastinfo = stmtinfo_b;

  FOR_EACH_VEC_ELT (datarefs, i, dr)
    if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
        && !vect_analyze_data_ref_access (dr))
        if (dump_enabled_p ())
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "not vectorized: complicated access pattern.\n");

        /* Mark the statement as not vectorizable.  */
        STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
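
/* For reference, the chains built above tie the group members together via
   GROUP_FIRST_ELEMENT/GROUP_NEXT_ELEMENT.  E.g. for three stores S1, S2, S3
   to A[3*i], A[3*i + 1] and A[3*i + 2] the links end up as (a sketch):

     GROUP_FIRST_ELEMENT (S1) == GROUP_FIRST_ELEMENT (S2)
                              == GROUP_FIRST_ELEMENT (S3) == S1
     GROUP_NEXT_ELEMENT (S1) == S2
     GROUP_NEXT_ELEMENT (S2) == S3

   i.e. a singly-linked list headed by the first access of the group.  */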
/* Function vect_prune_runtime_alias_test_list.

   Prune a list of ddrs to be tested at run-time by versioning for alias.
   Return FALSE if the resulting list of ddrs is longer than allowed by
   PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE.  */

vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
    LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "=== vect_prune_runtime_alias_test_list ===\n");

  for (i = 0; i < ddrs.length (); )
      for (j = 0; j < i; j++)
          ddr_p ddr_j = ddrs[j];

          if (vect_vfa_range_equal (ddr_i, ddr_j))
              if (dump_enabled_p ())
                  dump_printf_loc (MSG_NOTE, vect_location,
                                   "found equal ranges ");
                  dump_generic_expr (MSG_NOTE, TDF_SLIM,
                                     DR_REF (DDR_A (ddr_i)));
                  dump_printf (MSG_NOTE, ", ");
                  dump_generic_expr (MSG_NOTE, TDF_SLIM,
                                     DR_REF (DDR_B (ddr_i)));
                  dump_printf (MSG_NOTE, " and ");
                  dump_generic_expr (MSG_NOTE, TDF_SLIM,
                                     DR_REF (DDR_A (ddr_j)));
                  dump_printf (MSG_NOTE, ", ");
                  dump_generic_expr (MSG_NOTE, TDF_SLIM,
                                     DR_REF (DDR_B (ddr_j)));
                  dump_printf (MSG_NOTE, "\n");

          ddrs.ordered_remove (i);

  if (ddrs.length () >
      (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
      if (dump_enabled_p ())
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "disable versioning for alias - max number of "
                           "generated checks exceeded.\n");

      LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).truncate (0);
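
/* In outline, the pruning above is a quadratic de-duplication of the
   may-alias checks; a sketch over a plain array, where ranges_equal_p and
   keep[] are hypothetical stand-ins for vect_vfa_range_equal and the vector
   removal:

     for (i = 0; i < n_checks; i++)
       for (j = 0; j < i; j++)
         if (ranges_equal_p (check[i], check[j]))
           {
             keep[i] = false;   // the same runtime test already exists for j
             break;
           }

   Only the surviving entries become runtime alias tests, and the whole list
   is dropped if it exceeds PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS.  */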
/* Check whether a non-affine read in STMT is suitable for a gather load
   and if so, return a builtin decl for that operation.  */

vect_check_gather (gimple stmt, loop_vec_info loop_vinfo, tree *basep,
                   tree *offp, int *scalep)
  HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
  tree offtype = NULL_TREE;
  tree decl, base, off;
  enum machine_mode pmode;
  int punsignedp, pvolatilep;

  /* The gather builtins need address of the form
       loop_invariant + vector * {1, 2, 4, 8}
     or
       loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
     Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
     of loop invariants/SSA_NAMEs defined in the loop, with casts,
     multiplications and additions in it.  To get a vector, we need
     a single SSA_NAME that will be defined in the loop and will
     contain everything that is not loop invariant and that can be
     vectorized.  The following code attempts to find such a preexisting
     SSA_NAME OFF and put the loop invariants into a tree BASE
     that can be gimplified before the loop.  */
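
  /* For example, for a read a[b[i]] with 'a' loop invariant and 'b[i]'
     defined in the loop, the intended decomposition is (illustrative
     values only):

        BASE  = (sizetype) a            -- gimplified before the loop
        OFF   = SSA_NAME holding b[i]   -- becomes the vector of indices
        SCALE = sizeof (*a)             -- one of 1, 2, 4, 8

     which matches the loop_invariant + sign_extend (vector) * scale form
     required by the target gather builtin.  */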
  base = get_inner_reference (DR_REF (dr), &pbitsize, &pbitpos, &off,
                              &pmode, &punsignedp, &pvolatilep, false);
  gcc_assert (base != NULL_TREE && (pbitpos % BITS_PER_UNIT) == 0);

  if (TREE_CODE (base) == MEM_REF)
      if (!integer_zerop (TREE_OPERAND (base, 1)))
          if (off == NULL_TREE)
              double_int moff = mem_ref_offset (base);
              off = double_int_to_tree (sizetype, moff);
            off = size_binop (PLUS_EXPR, off,
                              fold_convert (sizetype, TREE_OPERAND (base, 1)));
      base = TREE_OPERAND (base, 0);
    base = build_fold_addr_expr (base);

  if (off == NULL_TREE)
    off = size_zero_node;

  /* If base is not loop invariant, either off is 0, then we start with just
     the constant offset in the loop invariant BASE and continue with base
     as OFF, otherwise give up.
     We could handle that case by gimplifying the addition of base + off
     into some SSA_NAME and use that as off, but for now punt.  */
  if (!expr_invariant_in_loop_p (loop, base))
      if (!integer_zerop (off))
      base = size_int (pbitpos / BITS_PER_UNIT);
  /* Otherwise put base + constant offset into the loop invariant BASE
     and continue with OFF.  */
      base = fold_convert (sizetype, base);
      base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));

  /* OFF at this point may be either a SSA_NAME or some tree expression
     from get_inner_reference.  Try to peel off loop invariants from it
     into BASE as long as possible.  */
  while (offtype == NULL_TREE)
      enum tree_code code;
      tree op0, op1, add = NULL_TREE;

      if (TREE_CODE (off) == SSA_NAME)
          gimple def_stmt = SSA_NAME_DEF_STMT (off);

          if (expr_invariant_in_loop_p (loop, off))

          if (gimple_code (def_stmt) != GIMPLE_ASSIGN)

          op0 = gimple_assign_rhs1 (def_stmt);
          code = gimple_assign_rhs_code (def_stmt);
          op1 = gimple_assign_rhs2 (def_stmt);
          if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
          code = TREE_CODE (off);
          extract_ops_from_tree (off, &code, &op0, &op1);

        case POINTER_PLUS_EXPR:
          if (expr_invariant_in_loop_p (loop, op0))
              add = fold_convert (sizetype, add);
              add = size_binop (MULT_EXPR, add, size_int (scale));
              base = size_binop (PLUS_EXPR, base, add);
          if (expr_invariant_in_loop_p (loop, op1))
          if (expr_invariant_in_loop_p (loop, op1))
              add = fold_convert (sizetype, op1);
              add = size_binop (MINUS_EXPR, size_zero_node, add);
          if (scale == 1 && host_integerp (op1, 0))
              scale = tree_low_cst (op1, 0);
          if (!POINTER_TYPE_P (TREE_TYPE (op0))
              && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
          if (TYPE_PRECISION (TREE_TYPE (op0))
              == TYPE_PRECISION (TREE_TYPE (off)))
          if (TYPE_PRECISION (TREE_TYPE (op0))
              < TYPE_PRECISION (TREE_TYPE (off)))
              offtype = TREE_TYPE (off);

  /* If at the end OFF still isn't a SSA_NAME or isn't
     defined in the loop, punt.  */
  if (TREE_CODE (off) != SSA_NAME
      || expr_invariant_in_loop_p (loop, off))

  if (offtype == NULL_TREE)
    offtype = TREE_TYPE (off);

  decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
  if (decl == NULL_TREE)
/* Function vect_analyze_data_refs.

   Find all the data references in the loop or basic block.

   The general structure of the analysis of data refs in the vectorizer is as
   follows:
   1- vect_analyze_data_refs(loop/bb): call
      compute_data_dependences_for_loop/bb to find and analyze all data-refs
      in the loop/bb and their dependences.
   2- vect_analyze_dependences(): apply dependence testing using ddrs.
   3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
   4- vect_analyze_drs_access(): check that ref_stmt.step is ok.  */

vect_analyze_data_refs (loop_vec_info loop_vinfo,
                        bb_vec_info bb_vinfo,
  struct loop *loop = NULL;
  basic_block bb = NULL;
  vec<data_reference_p> datarefs;
  struct data_reference *dr;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "=== vect_analyze_data_refs ===\n");

      loop = LOOP_VINFO_LOOP (loop_vinfo);
      if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo))
          || find_data_references_in_loop
               (loop, &LOOP_VINFO_DATAREFS (loop_vinfo)))
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "not vectorized: loop contains function calls"
                             " or data references that cannot be analyzed\n");

      datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);

      gimple_stmt_iterator gsi;

      bb = BB_VINFO_BB (bb_vinfo);
      for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
          gimple stmt = gsi_stmt (gsi);
          if (!find_data_references_in_stmt (NULL, stmt,
                                             &BB_VINFO_DATAREFS (bb_vinfo)))
              /* Mark the rest of the basic-block as unvectorizable.  */
              for (; !gsi_end_p (gsi); gsi_next (&gsi))
                  stmt = gsi_stmt (gsi);
                  STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)) = false;

      datarefs = BB_VINFO_DATAREFS (bb_vinfo);

  /* Go through the data-refs, check that the analysis succeeded.  Update
     the pointer from the stmt_vec_info struct to DR and vectype.  */

  FOR_EACH_VEC_ELT (datarefs, i, dr)
      stmt_vec_info stmt_info;
      tree base, offset, init;
      bool gather = false;
      bool simd_lane_access = false;

      if (!dr || !DR_REF (dr))
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "not vectorized: unhandled data-ref\n");

      stmt = DR_STMT (dr);
      stmt_info = vinfo_for_stmt (stmt);

      /* Discard clobbers from the dataref vector.  We will remove
         clobber stmts during vectorization.  */
      if (gimple_clobber_p (stmt))
          if (i == datarefs.length () - 1)
          datarefs[i] = datarefs.pop ();

      /* Check that analysis of the data-ref succeeded.  */
      if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
              && !TREE_THIS_VOLATILE (DR_REF (dr))
              && targetm.vectorize.builtin_gather != NULL;
          bool maybe_simd_lane_access
            = loop_vinfo && loop->simduid;

          /* If the target supports vector gather loads, or if this might be
             a SIMD lane access, see whether they can be used.  */
              && (maybe_gather || maybe_simd_lane_access)
              && !nested_in_vect_loop_p (loop, stmt))
              struct data_reference *newdr
                = create_data_ref (NULL, loop_containing_stmt (stmt),
                                   DR_REF (dr), stmt, true);
              gcc_assert (newdr != NULL && DR_REF (newdr));
              if (DR_BASE_ADDRESS (newdr)
                  && DR_OFFSET (newdr)
                  && integer_zerop (DR_STEP (newdr)))
                  if (maybe_simd_lane_access)
                      tree off = DR_OFFSET (newdr);
                      if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
                          && TREE_CODE (off) == MULT_EXPR
                          && host_integerp (TREE_OPERAND (off, 1), 1))
                          tree step = TREE_OPERAND (off, 1);
                          off = TREE_OPERAND (off, 0);
                          if (CONVERT_EXPR_P (off)
                              && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
                                 < TYPE_PRECISION (TREE_TYPE (off)))
                            off = TREE_OPERAND (off, 0);
                          if (TREE_CODE (off) == SSA_NAME)
                              gimple def = SSA_NAME_DEF_STMT (off);
                              tree reft = TREE_TYPE (DR_REF (newdr));
                              if (gimple_call_internal_p (def)
                                  && gimple_call_internal_fn (def)
                                     == IFN_GOMP_SIMD_LANE)
                                  tree arg = gimple_call_arg (def, 0);
                                  gcc_assert (TREE_CODE (arg) == SSA_NAME);
                                  arg = SSA_NAME_VAR (arg);
                                  if (arg == loop->simduid
                                      && tree_int_cst_equal
                                           (TYPE_SIZE_UNIT (reft),
                                      DR_OFFSET (newdr) = ssize_int (0);
                                      DR_STEP (newdr) = step;
                                      DR_ALIGNED_TO (newdr)
                                        = size_int (BIGGEST_ALIGNMENT);
                                      simd_lane_access = true;
                  if (!simd_lane_access && maybe_gather)
              if (!gather && !simd_lane_access)
                free_data_ref (newdr);

          if (!gather && !simd_lane_access)
              if (dump_enabled_p ())
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "not vectorized: data ref analysis "
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");

      if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "not vectorized: base addr of dr is a "
          if (gather || simd_lane_access)

      if (TREE_THIS_VOLATILE (DR_REF (dr)))
          if (dump_enabled_p ())
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "not vectorized: volatile type ");
              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");

      if (stmt_can_throw_internal (stmt))
          if (dump_enabled_p ())
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "not vectorized: statement can throw an "
              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
          if (gather || simd_lane_access)

      if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
          && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
          if (dump_enabled_p ())
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "not vectorized: statement is bitfield "
              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
          if (gather || simd_lane_access)

      base = unshare_expr (DR_BASE_ADDRESS (dr));
      offset = unshare_expr (DR_OFFSET (dr));
      init = unshare_expr (DR_INIT (dr));

      if (is_gimple_call (stmt))
          if (dump_enabled_p ())
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "not vectorized: dr in a call ");
              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
          if (gather || simd_lane_access)

      /* Update the DR field in the stmt_vec_info struct.  */

      /* If the dataref is in an inner-loop of the loop that is considered
         for vectorization, we also want to analyze the access relative to
         the outer-loop (DR contains information only relative to the
         inner-most enclosing loop).  We do that by building a reference to
         the first location accessed by the inner-loop, and analyze it
         relative to the outer-loop.  */
      if (loop && nested_in_vect_loop_p (loop, stmt))
          tree outer_step, outer_base, outer_init;
          HOST_WIDE_INT pbitsize, pbitpos;
          enum machine_mode pmode;
          int punsignedp, pvolatilep;
          affine_iv base_iv, offset_iv;

          /* Build a reference to the first location accessed by the
             inner-loop: *(BASE+INIT).  (The first location is actually
             BASE+INIT+OFFSET, but we add OFFSET separately later).  */
          tree inner_base = build_fold_indirect_ref
                              (fold_build_pointer_plus (base, init));

          if (dump_enabled_p ())
              dump_printf_loc (MSG_NOTE, vect_location,
                               "analyze in outer-loop: ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, inner_base);
              dump_printf (MSG_NOTE, "\n");

          outer_base = get_inner_reference (inner_base, &pbitsize, &pbitpos,
                                            &poffset, &pmode, &punsignedp,
                                            &pvolatilep, false);
          gcc_assert (outer_base != NULL_TREE);

          if (pbitpos % BITS_PER_UNIT != 0)
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "failed: bit offset alignment.\n");

          outer_base = build_fold_addr_expr (outer_base);
          if (!simple_iv (loop, loop_containing_stmt (stmt), outer_base,
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "failed: evolution of base is not affine.\n");

              poffset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset,

              offset_iv.base = ssize_int (0);
              offset_iv.step = ssize_int (0);
          else if (!simple_iv (loop, loop_containing_stmt (stmt), poffset,
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "evolution of offset is not affine.\n");

          outer_init = ssize_int (pbitpos / BITS_PER_UNIT);
          split_constant_offset (base_iv.base, &base_iv.base, &dinit);
          outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
          split_constant_offset (offset_iv.base, &offset_iv.base, &dinit);
          outer_init = size_binop (PLUS_EXPR, outer_init, dinit);

          outer_step = size_binop (PLUS_EXPR,
                                   fold_convert (ssizetype, base_iv.step),
                                   fold_convert (ssizetype, offset_iv.step));

          STMT_VINFO_DR_STEP (stmt_info) = outer_step;
          /* FIXME: Use canonicalize_base_object_address (base_iv.base);  */
          STMT_VINFO_DR_BASE_ADDRESS (stmt_info) = base_iv.base;
          STMT_VINFO_DR_INIT (stmt_info) = outer_init;
          STMT_VINFO_DR_OFFSET (stmt_info) =
            fold_convert (ssizetype, offset_iv.base);
          STMT_VINFO_DR_ALIGNED_TO (stmt_info) =
            size_int (highest_pow2_factor (offset_iv.base));

          if (dump_enabled_p ())
              dump_printf_loc (MSG_NOTE, vect_location,
                               "\touter base_address: ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM,
                                 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
              dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM,
                                 STMT_VINFO_DR_OFFSET (stmt_info));
              dump_printf (MSG_NOTE,
                           "\n\touter constant offset from base address: ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM,
                                 STMT_VINFO_DR_INIT (stmt_info));
              dump_printf (MSG_NOTE, "\n\touter step: ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM,
                                 STMT_VINFO_DR_STEP (stmt_info));
              dump_printf (MSG_NOTE, "\n\touter aligned to: ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM,
                                 STMT_VINFO_DR_ALIGNED_TO (stmt_info));
              dump_printf (MSG_NOTE, "\n");

      if (STMT_VINFO_DATA_REF (stmt_info))
          if (dump_enabled_p ())
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "not vectorized: more than one data ref "
              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
          if (gather || simd_lane_access)

      STMT_VINFO_DATA_REF (stmt_info) = dr;
      if (simd_lane_access)
          STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;

      /* Set vectype for STMT.  */
      scalar_type = TREE_TYPE (DR_REF (dr));
      STMT_VINFO_VECTYPE (stmt_info) =
        get_vectype_for_scalar_type (scalar_type);
      if (!STMT_VINFO_VECTYPE (stmt_info))
          if (dump_enabled_p ())
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "not vectorized: no vectype for stmt: ");
              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
              dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");

          if (gather || simd_lane_access)
              STMT_VINFO_DATA_REF (stmt_info) = NULL;

      if (dump_enabled_p ())
          dump_printf_loc (MSG_NOTE, vect_location,
                           "got vectype for stmt: ");
          dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
          dump_generic_expr (MSG_NOTE, TDF_SLIM,
                             STMT_VINFO_VECTYPE (stmt_info));
          dump_printf (MSG_NOTE, "\n");

      /* Adjust the minimal vectorization factor according to the
         vector type.  */
      vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));

          gather = 0 != vect_check_gather (stmt, loop_vinfo, NULL, &off, NULL);
              && get_vectype_for_scalar_type (TREE_TYPE (off)) == NULL_TREE)
              STMT_VINFO_DATA_REF (stmt_info) = NULL;

              if (dump_enabled_p ())
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "not vectorized: not suitable for gather "
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");

          STMT_VINFO_GATHER_P (stmt_info) = true;
               && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
          if (nested_in_vect_loop_p (loop, stmt)
              || !DR_IS_READ (dr))
              if (dump_enabled_p ())
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "not vectorized: not suitable for strided "
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
          STMT_VINFO_STRIDE_LOAD_P (stmt_info) = true;

  /* If we stopped analysis at the first dataref we could not analyze
     when trying to vectorize a basic-block, mark the rest of the datarefs
     as not vectorizable and truncate the vector of datarefs.  That
     avoids spending useless time in analyzing their dependence.  */
  if (i != datarefs.length ())
      gcc_assert (bb_vinfo != NULL);
      for (unsigned j = i; j < datarefs.length (); ++j)
          data_reference_p dr = datarefs[j];
          STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
      datarefs.truncate (i);
/* Function vect_get_new_vect_var.

   Returns a name for a new variable.  The current naming scheme appends the
   prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
   the name of vectorizer generated variables, and appends that to NAME if
   provided.  */

vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
    case vect_simple_var:
    case vect_scalar_var:
    case vect_pointer_var:

      char* tmp = concat (prefix, "_", name, NULL);
      new_vect_var = create_tmp_reg (type, tmp);
    new_vect_var = create_tmp_reg (type, prefix);

  return new_vect_var;
/* Function vect_create_addr_base_for_vector_ref.

   Create an expression that computes the address of the first memory location
   that will be accessed for a data reference.

   STMT: The statement containing the data reference.
   NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
   OFFSET: Optional.  If supplied, it is added to the initial address.
   LOOP:   Specify relative to which loop-nest the address should be computed.
           For example, when the dataref is in an inner-loop nested in an
           outer-loop that is now being vectorized, LOOP can be either the
           outer-loop, or the inner-loop.  The first memory location accessed
           by the following dataref ('in' points to short):

           if LOOP=i_loop:	&in		(relative to i_loop)
           if LOOP=j_loop:	&in+i*2B	(relative to j_loop)

   1. Return an SSA_NAME whose value is the address of the memory location of
      the first vector of the data reference.
   2. If new_stmt_list is not NULL_TREE after return then the caller must insert
      these statement(s) which define the returned SSA_NAME.

   FORNOW: We are only handling array accesses with step 1.  */

vect_create_addr_base_for_vector_ref (gimple stmt,
                                      gimple_seq *new_stmt_list,
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
  const char *base_name;
  gimple_seq seq = NULL;
  tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);

  if (loop_vinfo && loop && loop != (gimple_bb (stmt))->loop_father)
      struct loop *outer_loop = LOOP_VINFO_LOOP (loop_vinfo);

      gcc_assert (nested_in_vect_loop_p (outer_loop, stmt));

      data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
      base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
      init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));

      data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
      base_offset = unshare_expr (DR_OFFSET (dr));
      init = unshare_expr (DR_INIT (dr));

    base_name = get_name (data_ref_base);

      base_offset = ssize_int (0);
      init = ssize_int (0);
      base_name = get_name (DR_REF (dr));

  /* Create base_offset */
  base_offset = size_binop (PLUS_EXPR,
                            fold_convert (sizetype, base_offset),
                            fold_convert (sizetype, init));

      offset = fold_build2 (MULT_EXPR, sizetype,
                            fold_convert (sizetype, offset), step);
      base_offset = fold_build2 (PLUS_EXPR, sizetype,
                                 base_offset, offset);

  /* base + base_offset */
    addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
    addr_base = build1 (ADDR_EXPR,
                        build_pointer_type (TREE_TYPE (DR_REF (dr))),
                        unshare_expr (DR_REF (dr)));

  vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
  addr_base = fold_convert (vect_ptr_type, addr_base);
  dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
  addr_base = force_gimple_operand (addr_base, &seq, false, dest);
  gimple_seq_add_seq (new_stmt_list, seq);

  if (DR_PTR_INFO (dr)
      && TREE_CODE (addr_base) == SSA_NAME)
      duplicate_ssa_name_ptr_info (addr_base, DR_PTR_INFO (dr));
        mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));

  if (dump_enabled_p ())
      dump_printf_loc (MSG_NOTE, vect_location, "created ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
      dump_printf (MSG_NOTE, "\n");
/* Function vect_create_data_ref_ptr.

   Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
   location accessed in the loop by STMT, along with the def-use update
   chain to appropriately advance the pointer through the loop iterations.
   Also set aliasing information for the pointer.  This pointer is used by
   the callers to this function to create a memory reference expression for
   vector load/store access.

   1. STMT: a stmt that references memory. Expected to be of the form
         GIMPLE_ASSIGN <name, data-ref> or
         GIMPLE_ASSIGN <data-ref, name>.
   2. AGGR_TYPE: the type of the reference, which should be either a vector
        or an array.
   3. AT_LOOP: the loop where the vector memref is to be created.
   4. OFFSET (optional): an offset to be added to the initial address accessed
        by the data-ref in STMT.
   5. BSI: location where the new stmts are to be placed if there is no loop
   6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
        pointing to the initial address.

   1. Declare a new ptr to vector_type, and have it point to the base of the
      data reference (initial address accessed by the data reference).
      For example, for vector of type V8HI, the following code is generated:

      ap = (v8hi *)initial_address;

      if OFFSET is not supplied:
         initial_address = &a[init];
      if OFFSET is supplied:
         initial_address = &a[init + OFFSET];

      Return the initial_address in INITIAL_ADDRESS.

   2. If ONLY_INIT is true, just return the initial pointer.  Otherwise, also
      update the pointer in each iteration of the loop.

      Return the increment stmt that updates the pointer in PTR_INCR.

   3. Set INV_P to true if the access pattern of the data reference in the
      vectorized loop is invariant.  Set it to false otherwise.

   4. Return the pointer.  */

vect_create_data_ref_ptr (gimple stmt, tree aggr_type, struct loop *at_loop,
                          tree offset, tree *initial_address,
                          gimple_stmt_iterator *gsi, gimple *ptr_incr,
                          bool only_init, bool *inv_p)
  const char *base_name;
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = NULL;
  bool nested_in_vect_loop = false;
  struct loop *containing_loop = NULL;
  gimple_seq new_stmt_list = NULL;
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
  gimple_stmt_iterator incr_gsi;
  tree indx_before_incr, indx_after_incr;
  bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);

  gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
              || TREE_CODE (aggr_type) == VECTOR_TYPE);

      loop = LOOP_VINFO_LOOP (loop_vinfo);
      nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
      containing_loop = (gimple_bb (stmt))->loop_father;
      pe = loop_preheader_edge (loop);

      gcc_assert (bb_vinfo);

  /* Check the step (evolution) of the load in LOOP, and record
     whether it's invariant.  */
  if (nested_in_vect_loop)
    step = STMT_VINFO_DR_STEP (stmt_info);
    step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));

  if (integer_zerop (step))

  /* Create an expression for the first address accessed by this load
     in LOOP.  */
  base_name = get_name (DR_BASE_ADDRESS (dr));

  if (dump_enabled_p ())
      tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
      dump_printf_loc (MSG_NOTE, vect_location,
                       "create %s-pointer variable to type: ",
                       get_tree_code_name (TREE_CODE (aggr_type)));
      dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
      if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
        dump_printf (MSG_NOTE, "  vectorizing an array ref: ");
      else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
        dump_printf (MSG_NOTE, "  vectorizing a vector ref: ");
      else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
        dump_printf (MSG_NOTE, "  vectorizing a record based array ref: ");
        dump_printf (MSG_NOTE, "  vectorizing a pointer ref: ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
      dump_printf (MSG_NOTE, "\n");

  /* (1) Create the new aggregate-pointer variable.
     Vector and array types inherit the alias set of their component
     type by default so we need to use a ref-all pointer if the data
     reference does not conflict with the created aggregated data
     reference because it is not addressable.  */
  bool need_ref_all = false;
  if (!alias_sets_conflict_p (get_alias_set (aggr_type),
                              get_alias_set (DR_REF (dr))))
    need_ref_all = true;
  /* Likewise for any of the data references in the stmt group.  */
  else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
      gimple orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
          stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
          struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
          if (!alias_sets_conflict_p (get_alias_set (aggr_type),
                                      get_alias_set (DR_REF (sdr))))
              need_ref_all = true;
          orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
  aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
  aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);

  /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
     vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
     def-use update cycles for the pointer: one relative to the outer-loop
     (LOOP), which is what steps (3) and (4) below do.  The other is relative
     to the inner-loop (which is the inner-most loop containing the dataref),
     and this is done by step (5) below.

     When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
     inner-most loop, and so steps (3),(4) work the same, and step (5) is
     redundant.  Steps (3),(4) create the following:

        LOOP:	vp1 = phi(vp0,vp2)

     If there is an inner-loop nested in loop, then step (5) will also be
     applied, and an additional update in the inner-loop will be created:

        LOOP:	vp1 = phi(vp0,vp2)

        inner:	vp3 = phi(vp1,vp4)
                vp4 = vp3 + inner_step  */

  /* (2) Calculate the initial address of the aggregate-pointer, and set
         the aggregate-pointer to point to it before the loop.  */

  /* Create: (&(base[init_val+offset]) in the loop preheader.  */

  new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,

      new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
      gcc_assert (!new_bb);
    gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);

  *initial_address = new_temp;

  /* Create: p = (aggr_type *) initial_base  */
  if (TREE_CODE (new_temp) != SSA_NAME
      || !useless_type_conversion_p (aggr_ptr_type, TREE_TYPE (new_temp)))
      vec_stmt = gimple_build_assign (aggr_ptr,
                                      fold_convert (aggr_ptr_type, new_temp));
      aggr_ptr_init = make_ssa_name (aggr_ptr, vec_stmt);
      /* Copy the points-to information if it exists.  */
      if (DR_PTR_INFO (dr))
        duplicate_ssa_name_ptr_info (aggr_ptr_init, DR_PTR_INFO (dr));
      gimple_assign_set_lhs (vec_stmt, aggr_ptr_init);
          new_bb = gsi_insert_on_edge_immediate (pe, vec_stmt);
          gcc_assert (!new_bb);
        gsi_insert_before (gsi, vec_stmt, GSI_SAME_STMT);
    aggr_ptr_init = new_temp;

  /* (3) Handle the updating of the aggregate-pointer inside the loop.
     This is needed when ONLY_INIT is false, and also when AT_LOOP is the
     inner-loop nested in LOOP (during outer-loop vectorization).  */

  /* No update in loop is required.  */
  if (only_init && (!loop_vinfo || at_loop == loop))
    aptr = aggr_ptr_init;
      /* The step of the aggregate pointer is the type size.  */
      tree iv_step = TYPE_SIZE_UNIT (aggr_type);
      /* One exception to the above is when the scalar step of the load in
         LOOP is zero.  In this case the step here is also zero.  */
        iv_step = size_zero_node;
      else if (tree_int_cst_sgn (step) == -1)
        iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);

      standard_iv_increment_position (loop, &incr_gsi, &insert_after);

      create_iv (aggr_ptr_init,
                 fold_convert (aggr_ptr_type, iv_step),
                 aggr_ptr, loop, &incr_gsi, insert_after,
                 &indx_before_incr, &indx_after_incr);
      incr = gsi_stmt (incr_gsi);
      set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));

      /* Copy the points-to information if it exists.  */
      if (DR_PTR_INFO (dr))
          duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
          duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));

      aptr = indx_before_incr;

  if (!nested_in_vect_loop || only_init)

  /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
         nested in LOOP, if exists.  */

  gcc_assert (nested_in_vect_loop);
      standard_iv_increment_position (containing_loop, &incr_gsi,
      create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
                 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
      incr = gsi_stmt (incr_gsi);
      set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));

      /* Copy the points-to information if it exists.  */
      if (DR_PTR_INFO (dr))
          duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
          duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));

      return indx_before_incr;
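
/* For the common case of a single (inner-most) vectorized loop, the code
   produced by the function above amounts to the following pseudo-GIMPLE
   (a sketch; the names are invented and TYPE_SIZE_UNIT (aggr_type) is
   assumed to be 16 bytes):

     vectp.0 = (v4si *) &base[init];                        step (2), preheader
   LOOP:
     vectp.1 = PHI <vectp.0 (preheader), vectp.2 (latch)>   step (3)
     ... = *vectp.1;                                        the vector access
     vectp.2 = vectp.1 + 16;                                step (3), create_iv
     goto LOOP;
*/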
/* Function bump_vector_ptr

   Increment a pointer (to a vector type) by vector-size.  If requested,
   i.e. if PTR-INCR is given, then also connect the new increment stmt
   to the existing def-use update-chain of the pointer, by modifying
   the PTR_INCR as illustrated below:

   The pointer def-use update-chain before this function:
                        DATAREF_PTR = phi (p_0, p_2)
        PTR_INCR:       p_2 = DATAREF_PTR + step

   The pointer def-use update-chain after this function:
                        DATAREF_PTR = phi (p_0, p_2)
                        NEW_DATAREF_PTR = DATAREF_PTR + BUMP
        PTR_INCR:       p_2 = NEW_DATAREF_PTR + step

   DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
   PTR_INCR - optional.  The stmt that updates the pointer in each iteration of
              the loop.  The increment amount across iterations is expected
   BSI - location where the new update stmt is to be placed.
   STMT - the original scalar memory-access stmt that is being vectorized.
   BUMP - optional.  The offset by which to bump the pointer.  If not given,
          the offset is assumed to be vector_size.

   Output: Return NEW_DATAREF_PTR as illustrated above.  */

bump_vector_ptr (tree dataref_ptr, gimple ptr_incr, gimple_stmt_iterator *gsi,
                 gimple stmt, tree bump)
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  tree update = TYPE_SIZE_UNIT (vectype);
  use_operand_p use_p;
  tree new_dataref_ptr;

  new_dataref_ptr = copy_ssa_name (dataref_ptr, NULL);
  incr_stmt = gimple_build_assign_with_ops (POINTER_PLUS_EXPR, new_dataref_ptr,
                                            dataref_ptr, update);
  vect_finish_stmt_generation (stmt, incr_stmt, gsi);

  /* Copy the points-to information if it exists.  */
  if (DR_PTR_INFO (dr))
      duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
      mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));

    return new_dataref_ptr;

  /* Update the vector-pointer's cross-iteration increment.  */
  FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
      tree use = USE_FROM_PTR (use_p);

      if (use == dataref_ptr)
        SET_USE (use_p, new_dataref_ptr);
        gcc_assert (tree_int_cst_compare (use, update) == 0);

  return new_dataref_ptr;
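
/* Typical use by the vector load/store emitters: after the first vector
   access of a group, the same pointer is re-pointed at the next vector,
   e.g. (a sketch; DATAREF_PTR and PTR_INCR come from
   vect_create_data_ref_ptr, and BUMP left as NULL_TREE selects the vector
   size):

     dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, gsi, stmt,
                                    NULL_TREE);

   Passing PTR_INCR rewires the cross-iteration increment so that it starts
   from the bumped pointer, as shown in the comment above.  */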
/* Function vect_create_destination_var.

   Create a new temporary of type VECTYPE.  */

vect_create_destination_var (tree scalar_dest, tree vectype)
  enum vect_var_kind kind;

  kind = vectype ? vect_simple_var : vect_scalar_var;
  type = vectype ? vectype : TREE_TYPE (scalar_dest);

  gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);

  name = get_name (scalar_dest);
    asprintf (&new_name, "%s_%u", name, SSA_NAME_VERSION (scalar_dest));
    asprintf (&new_name, "_%u", SSA_NAME_VERSION (scalar_dest));
  vec_dest = vect_get_new_vect_var (type, kind, new_name);

/* Function vect_grouped_store_supported.

   Returns TRUE if interleave high and interleave low permutations
   are supported, and FALSE otherwise.  */

bool
vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
{
  enum machine_mode mode = TYPE_MODE (vectype);

  /* vect_permute_store_chain requires the group size to be a power of two.  */
  if (exact_log2 (count) == -1)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "the size of the group of accesses"
			 " is not a power of 2\n");
      return false;
    }

  /* Check that the permutation is supported.  */
  if (VECTOR_MODE_P (mode))
    {
      unsigned int i, nelt = GET_MODE_NUNITS (mode);
      unsigned char *sel = XALLOCAVEC (unsigned char, nelt);

      for (i = 0; i < nelt / 2; i++)
	{
	  sel[i * 2] = i;
	  sel[i * 2 + 1] = i + nelt;
	}
      if (can_vec_perm_p (mode, false, sel))
	{
	  for (i = 0; i < nelt; i++)
	    sel[i] += nelt / 2;
	  if (can_vec_perm_p (mode, false, sel))
	    return true;
	}
    }

  if (dump_enabled_p ())
    dump_printf (MSG_MISSED_OPTIMIZATION,
		 "interleave op not supported by target.\n");
  return false;
}
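
/* Illustrative sketch (stand-alone, not referenced elsewhere): the two
   selector vectors whose availability the function above queries via
   can_vec_perm_p.  For NELT = 4 this produces {0, 4, 1, 5} for
   interleave-high and {2, 6, 3, 7} for interleave-low, matching the example
   in the vect_permute_store_chain comment below.  */

static void
build_interleave_selectors_example (unsigned char *sel_high,
				    unsigned char *sel_low,
				    unsigned int nelt)
{
  unsigned int i;

  for (i = 0; i < nelt / 2; i++)
    {
      /* interleave_high picks element i of the first vector, then element i
	 of the second vector (indices nelt .. 2*nelt-1).  */
      sel_high[i * 2] = i;
      sel_high[i * 2 + 1] = i + nelt;
      /* interleave_low does the same starting from the middle of each
	 vector, i.e. the high selector shifted by nelt/2.  */
      sel_low[i * 2] = i + nelt / 2;
      sel_low[i * 2 + 1] = i + nelt + nelt / 2;
    }
}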

/* Return TRUE if vec_store_lanes is available for COUNT vectors of
   type VECTYPE.  */

bool
vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
{
  return vect_lanes_optab_supported_p ("vec_store_lanes",
				       vec_store_lanes_optab,
				       vectype, count);
}

/* Function vect_permute_store_chain.

   Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
   a power of 2, generate interleave_high/low stmts to reorder the data
   correctly for the stores.  Return the final references for stores in
   RESULT_CHAIN.

   E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
   The input is 4 vectors each containing 8 elements.  We assign a number to
   each element, the input sequence is:

   1st vec:   0  1  2  3  4  5  6  7
   2nd vec:   8  9 10 11 12 13 14 15
   3rd vec:  16 17 18 19 20 21 22 23
   4th vec:  24 25 26 27 28 29 30 31

   The output sequence should be:

   1st vec:   0  8 16 24  1  9 17 25
   2nd vec:   2 10 18 26  3 11 19 27
   3rd vec:   4 12 20 28  5 13 21 29
   4th vec:   6 14 22 30  7 15 23 31

   i.e., we interleave the contents of the four vectors in their order.

   We use interleave_high/low instructions to create such output.  The input of
   each interleave_high/low operation is two vectors:
   1st vec    2nd vec
   0 1 2 3    4 5 6 7

   the even elements of the result vector are obtained left-to-right from the
   high/low elements of the first vector.  The odd elements of the result are
   obtained left-to-right from the high/low elements of the second vector.
   The output of interleave_high will be:   0 4 1 5
   and of interleave_low:                   2 6 3 7

   The permutation is done in log LENGTH stages.  In each stage interleave_high
   and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
   where the first argument is taken from the first half of DR_CHAIN and the
   second argument from its second half.  In our example,

   I1: interleave_high (1st vec, 3rd vec)
   I2: interleave_low  (1st vec, 3rd vec)
   I3: interleave_high (2nd vec, 4th vec)
   I4: interleave_low  (2nd vec, 4th vec)

   The output for the first stage is:

   I1:  0 16  1 17  2 18  3 19
   I2:  4 20  5 21  6 22  7 23
   I3:  8 24  9 25 10 26 11 27
   I4: 12 28 13 29 14 30 15 31

   The output of the second stage, i.e. the final result is:

   I1:  0  8 16 24  1  9 17 25
   I2:  2 10 18 26  3 11 19 27
   I3:  4 12 20 28  5 13 21 29
   I4:  6 14 22 30  7 15 23 31.  */

void
vect_permute_store_chain (vec<tree> dr_chain,
			  unsigned int length,
			  gimple stmt,
			  gimple_stmt_iterator *gsi,
			  vec<tree> *result_chain)
{
  tree vect1, vect2, high, low;
  gimple perm_stmt;
  tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
  tree perm_mask_low, perm_mask_high;
  unsigned int i, n;
  unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
  unsigned char *sel = XALLOCAVEC (unsigned char, nelt);

  result_chain->quick_grow (length);
  memcpy (result_chain->address (), dr_chain.address (),
	  length * sizeof (tree));

  for (i = 0, n = nelt / 2; i < n; i++)
    {
      sel[i * 2] = i;
      sel[i * 2 + 1] = i + nelt;
    }
  perm_mask_high = vect_gen_perm_mask (vectype, sel);
  gcc_assert (perm_mask_high != NULL);

  for (i = 0; i < nelt; i++)
    sel[i] += nelt / 2;
  perm_mask_low = vect_gen_perm_mask (vectype, sel);
  gcc_assert (perm_mask_low != NULL);

  for (i = 0, n = exact_log2 (length); i < n; i++)
    {
      for (j = 0; j < length / 2; j++)
	{
	  vect1 = dr_chain[j];
	  vect2 = dr_chain[j + length / 2];

	  /* Create interleaving stmt:
	     high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1, ...}>  */
	  high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
	  perm_stmt
	    = gimple_build_assign_with_ops (VEC_PERM_EXPR, high,
					    vect1, vect2, perm_mask_high);
	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);
	  (*result_chain)[2 * j] = high;

	  /* Create interleaving stmt:
	     low = VEC_PERM_EXPR <vect1, vect2, {nelt/2, nelt*3/2, nelt/2+1,
						 nelt*3/2+1, ...}>  */
	  low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
	  perm_stmt
	    = gimple_build_assign_with_ops (VEC_PERM_EXPR, low,
					    vect1, vect2, perm_mask_low);
	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);
	  (*result_chain)[2 * j + 1] = low;
	}
      memcpy (dr_chain.address (), result_chain->address (),
	      length * sizeof (tree));
    }
}
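
/* Illustrative sketch (stand-alone, not used by the compiler): the same
   log2(LENGTH) interleaving stages as above, applied to arrays of element
   indices instead of SSA names, sized for the 4-vector, 8-element example
   in the function comment.  Starting from {0..7}, {8..15}, {16..23},
   {24..31} the result reproduces the output sequence listed there:
   {0 8 16 24 1 9 17 25}, {2 10 18 26 3 11 19 27}, and so on.  The fixed
   bounds are assumptions made only to keep the example self-contained.  */

static void
permute_store_chain_example (unsigned int chain[4][8],
			     unsigned int length, unsigned int nelt)
{
  unsigned int tmp[4][8];
  unsigned int i, j, k, n;

  /* Number of stages == log2 (length).  */
  for (n = 0; (1u << n) < length; n++)
    ;

  for (i = 0; i < n; i++)
    {
      for (j = 0; j < length / 2; j++)
	{
	  unsigned int *vect1 = chain[j];
	  unsigned int *vect2 = chain[j + length / 2];

	  for (k = 0; k < nelt / 2; k++)
	    {
	      /* interleave_high: {v1[0], v2[0], v1[1], v2[1], ...}  */
	      tmp[2 * j][2 * k] = vect1[k];
	      tmp[2 * j][2 * k + 1] = vect2[k];
	      /* interleave_low: same, starting from the middle elements.  */
	      tmp[2 * j + 1][2 * k] = vect1[k + nelt / 2];
	      tmp[2 * j + 1][2 * k + 1] = vect2[k + nelt / 2];
	    }
	}
      memcpy (chain, tmp, sizeof (tmp));
    }
}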

/* Function vect_setup_realignment

   This function is called when vectorizing an unaligned load using
   the dr_explicit_realign[_optimized] scheme.
   This function generates the following code at the loop prolog:

      p = initial_addr;
   x  msq_init = *(floor(p));   # prolog load
      realignment_token = call target_builtin;
    loop:
   x  msq = phi (msq_init, ---)

   The stmts marked with x are generated only for the case of
   dr_explicit_realign_optimized.

   The code above sets up a new (vector) pointer, pointing to the first
   location accessed by STMT, and a "floor-aligned" load using that pointer.
   It also generates code to compute the "realignment-token" (if the relevant
   target hook was defined), and creates a phi-node at the loop-header bb
   whose arguments are the result of the prolog-load (created by this
   function) and the result of a load that takes place in the loop (to be
   created by the caller to this function).

   For the case of dr_explicit_realign_optimized:
   The caller to this function uses the phi-result (msq) to create the
   realignment code inside the loop, and sets up the missing phi argument,
   as follows:
    loop:
      msq = phi (msq_init, lsq)
      lsq = *(floor(p'));        # load in loop
      result = realign_load (msq, lsq, realignment_token);

   For the case of dr_explicit_realign:
    loop:
      msq = *(floor(p));         # load in loop
      p' = p + (VS-1);
      lsq = *(floor(p'));        # load in loop
      result = realign_load (msq, lsq, realignment_token);

   Input:
   STMT - (scalar) load stmt to be vectorized.  This load accesses
          a memory location that may be unaligned.
   BSI - place where new code is to be inserted.
   ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
                              is used.

   Output:
   REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
                       target hook, if defined.
   Return value - the result of the loop-header phi node.  */

tree
vect_setup_realignment (gimple stmt, gimple_stmt_iterator *gsi,
			tree *realignment_token,
			enum dr_alignment_support alignment_support_scheme,
			tree init_addr,
			struct loop **at_loop)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
  struct loop *loop = NULL;
  edge pe = NULL;
  tree scalar_dest = gimple_assign_lhs (stmt);
  tree vec_dest;
  gimple inc;
  tree ptr;
  tree data_ref;
  gimple new_stmt;
  basic_block new_bb;
  tree msq_init = NULL_TREE;
  tree new_temp;
  gimple phi_stmt;
  tree msq = NULL_TREE;
  gimple_seq stmts = NULL;
  bool inv_p;
  bool compute_in_loop = false;
  bool nested_in_vect_loop = false;
  struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
  struct loop *loop_for_initial_load = NULL;

  if (loop_vinfo)
    {
      loop = LOOP_VINFO_LOOP (loop_vinfo);
      nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
    }

  gcc_assert (alignment_support_scheme == dr_explicit_realign
	      || alignment_support_scheme == dr_explicit_realign_optimized);

  /* We need to generate three things:
     1. the misalignment computation
     2. the extra vector load (for the optimized realignment scheme).
     3. the phi node for the two vectors from which the realignment is
      done (for the optimized realignment scheme).  */

  /* 1. Determine where to generate the misalignment computation.

     If INIT_ADDR is NULL_TREE, this indicates that the misalignment
     calculation will be generated by this function, outside the loop (in the
     preheader).  Otherwise, INIT_ADDR had already been computed for us by the
     caller, inside the loop.

     Background: If the misalignment remains fixed throughout the iterations of
     the loop, then both realignment schemes are applicable, and also the
     misalignment computation can be done outside LOOP.  This is because we are
     vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
     are a multiple of VS (the Vector Size), and therefore the misalignment in
     different vectorized LOOP iterations is always the same.
     The problem arises only if the memory access is in an inner-loop nested
     inside LOOP, which is now being vectorized using outer-loop vectorization.
     This is the only case when the misalignment of the memory access may not
     remain fixed throughout the iterations of the inner-loop (as explained in
     detail in vect_supportable_dr_alignment).  In this case, not only is the
     optimized realignment scheme not applicable, but also the misalignment
     computation (and generation of the realignment token that is passed to
     REALIGN_LOAD) have to be done inside the loop.

     In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
     or not, which in turn determines if the misalignment is computed inside
     the inner-loop, or outside LOOP.  */

  if (init_addr != NULL_TREE || !loop_vinfo)
    {
      compute_in_loop = true;
      gcc_assert (alignment_support_scheme == dr_explicit_realign);
    }

  /* 2. Determine where to generate the extra vector load.

     For the optimized realignment scheme, instead of generating two vector
     loads in each iteration, we generate a single extra vector load in the
     preheader of the loop, and in each iteration reuse the result of the
     vector load from the previous iteration.  In case the memory access is in
     an inner-loop nested inside LOOP, which is now being vectorized using
     outer-loop vectorization, we need to determine whether this initial vector
     load should be generated at the preheader of the inner-loop, or can be
     generated at the preheader of LOOP.  If the memory access has no evolution
     in LOOP, it can be generated in the preheader of LOOP.  Otherwise, it has
     to be generated inside LOOP (in the preheader of the inner-loop).  */

  if (nested_in_vect_loop)
    {
      tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
      bool invariant_in_outerloop =
	(tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
      loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
    }
  else
    loop_for_initial_load = loop;
  if (at_loop)
    *at_loop = loop_for_initial_load;

  if (loop_for_initial_load)
    pe = loop_preheader_edge (loop_for_initial_load);

  /* 3. For the case of the optimized realignment, create the first vector
      load at the loop preheader.  */

  if (alignment_support_scheme == dr_explicit_realign_optimized)
    {
      /* Create msq_init = *(floor(p1)) in the loop preheader.  */

      gcc_assert (!compute_in_loop);
      vec_dest = vect_create_destination_var (scalar_dest, vectype);
      ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
				      NULL_TREE, &init_addr, NULL, &inc,
				      &inv_p);
      new_temp = copy_ssa_name (ptr, NULL);
      new_stmt = gimple_build_assign_with_ops
		   (BIT_AND_EXPR, new_temp, ptr,
		    build_int_cst (TREE_TYPE (ptr),
				   -(HOST_WIDE_INT) TYPE_ALIGN_UNIT (vectype)));
      new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
      gcc_assert (!new_bb);
      data_ref
	= build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
		  build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
      new_stmt = gimple_build_assign (vec_dest, data_ref);
      new_temp = make_ssa_name (vec_dest, new_stmt);
      gimple_assign_set_lhs (new_stmt, new_temp);
      if (pe)
	{
	  new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
	  gcc_assert (!new_bb);
	}
      else
	gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);

      msq_init = gimple_assign_lhs (new_stmt);
    }

  /* 4. Create realignment token using a target builtin, if available.
      It is done either inside the containing loop, or before LOOP (as
      determined above).  */

  if (targetm.vectorize.builtin_mask_for_load)
    {
      tree builtin_decl;

      /* Compute INIT_ADDR - the initial addressed accessed by this memref.  */
      if (!init_addr)
	{
	  /* Generate the INIT_ADDR computation outside LOOP.  */
	  init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
							    NULL_TREE, loop);
	  if (loop)
	    {
	      pe = loop_preheader_edge (loop);
	      new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
	      gcc_assert (!new_bb);
	    }
	  else
	    gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
	}

      builtin_decl = targetm.vectorize.builtin_mask_for_load ();
      new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
      vec_dest =
	vect_create_destination_var (scalar_dest,
				     gimple_call_return_type (new_stmt));
      new_temp = make_ssa_name (vec_dest, new_stmt);
      gimple_call_set_lhs (new_stmt, new_temp);

      if (compute_in_loop)
	gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
      else
	{
	  /* Generate the misalignment computation outside LOOP.  */
	  pe = loop_preheader_edge (loop);
	  new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
	  gcc_assert (!new_bb);
	}

      *realignment_token = gimple_call_lhs (new_stmt);

      /* The result of the CALL_EXPR to this builtin is determined from
	 the value of the parameter and no global variables are touched
	 which makes the builtin a "const" function.  Requiring the
	 builtin to have the "const" attribute makes it unnecessary
	 to call mark_call_clobbered.  */
      gcc_assert (TREE_READONLY (builtin_decl));
    }

  if (alignment_support_scheme == dr_explicit_realign)
    return msq;

  gcc_assert (!compute_in_loop);
  gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);

  /* 5. Create msq = phi <msq_init, lsq> in loop.  */

  pe = loop_preheader_edge (containing_loop);
  vec_dest = vect_create_destination_var (scalar_dest, vectype);
  msq = make_ssa_name (vec_dest, NULL);
  phi_stmt = create_phi_node (msq, containing_loop->header);
  add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);

  return msq;
}
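
/* Illustrative sketch (an added stand-alone helper, not part of the
   vectorizer): the "floor (p)" computation used for the prolog load above.
   Masking the low address bits with the negated vector alignment, as the
   BIT_AND_EXPR with -(HOST_WIDE_INT) TYPE_ALIGN_UNIT (vectype) does, rounds
   the address down to the nearest vector-aligned boundary, so the prolog
   load *(floor (p)) is always an aligned load.  */

static inline unsigned long
floor_align_example (unsigned long addr, unsigned long vector_align_bytes)
{
  /* vector_align_bytes is assumed to be a power of two.  */
  return addr & -vector_align_bytes;
}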

/* Function vect_grouped_load_supported.

   Returns TRUE if even and odd permutations are supported,
   and FALSE otherwise.  */

bool
vect_grouped_load_supported (tree vectype, unsigned HOST_WIDE_INT count)
{
  enum machine_mode mode = TYPE_MODE (vectype);

  /* vect_permute_load_chain requires the group size to be a power of two.  */
  if (exact_log2 (count) == -1)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "the size of the group of accesses"
			 " is not a power of 2\n");
      return false;
    }

  /* Check that the permutation is supported.  */
  if (VECTOR_MODE_P (mode))
    {
      unsigned int i, nelt = GET_MODE_NUNITS (mode);
      unsigned char *sel = XALLOCAVEC (unsigned char, nelt);

      for (i = 0; i < nelt; i++)
	sel[i] = i * 2;
      if (can_vec_perm_p (mode, false, sel))
	{
	  for (i = 0; i < nelt; i++)
	    sel[i] = i * 2 + 1;
	  if (can_vec_perm_p (mode, false, sel))
	    return true;
	}
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
		     "extract even/odd not supported by target\n");
  return false;
}
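
/* Illustrative sketch (not referenced elsewhere): the extract-even and
   extract-odd selectors whose availability the function above queries via
   can_vec_perm_p.  For NELT = 4 this yields {0, 2, 4, 6} (even) and
   {1, 3, 5, 7} (odd), matching the vect_permute_load_chain example below.  */

static void
build_extract_selectors_example (unsigned char *sel_even,
				 unsigned char *sel_odd,
				 unsigned int nelt)
{
  unsigned int i;

  for (i = 0; i < nelt; i++)
    {
      /* Even (resp. odd) elements of the concatenation of the two input
	 vectors, whose elements are numbered 0 .. 2*nelt-1.  */
      sel_even[i] = i * 2;
      sel_odd[i] = i * 2 + 1;
    }
}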

/* Return TRUE if vec_load_lanes is available for COUNT vectors of
   type VECTYPE.  */

bool
vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
{
  return vect_lanes_optab_supported_p ("vec_load_lanes",
				       vec_load_lanes_optab,
				       vectype, count);
}

/* Function vect_permute_load_chain.

   Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
   a power of 2, generate extract_even/odd stmts to reorder the input data
   correctly.  Return the final references for loads in RESULT_CHAIN.

   E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
   The input is 4 vectors each containing 8 elements.  We assign a number to
   each element, the input sequence is:

   1st vec:   0  1  2  3  4  5  6  7
   2nd vec:   8  9 10 11 12 13 14 15
   3rd vec:  16 17 18 19 20 21 22 23
   4th vec:  24 25 26 27 28 29 30 31

   The output sequence should be:

   1st vec:  0  4  8 12 16 20 24 28
   2nd vec:  1  5  9 13 17 21 25 29
   3rd vec:  2  6 10 14 18 22 26 30
   4th vec:  3  7 11 15 19 23 27 31

   i.e., the first output vector should contain the first elements of each
   interleaving group, etc.

   We use extract_even/odd instructions to create such output.  The input of
   each extract_even/odd operation is two vectors
   1st vec    2nd vec
   0 1 2 3    4 5 6 7

   and the output is the vector of extracted even/odd elements.  The output of
   extract_even will be:   0 2 4 6
   and of extract_odd:     1 3 5 7

   The permutation is done in log LENGTH stages.  In each stage extract_even
   and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
   their order.  In our example,

   E1: extract_even (1st vec, 2nd vec)
   E2: extract_odd  (1st vec, 2nd vec)
   E3: extract_even (3rd vec, 4th vec)
   E4: extract_odd  (3rd vec, 4th vec)

   The output for the first stage will be:

   E1:  0  2  4  6  8 10 12 14
   E2:  1  3  5  7  9 11 13 15
   E3: 16 18 20 22 24 26 28 30
   E4: 17 19 21 23 25 27 29 31

   In order to proceed and create the correct sequence for the next stage (or
   for the correct output, if the second stage is the last one, as in our
   example), we first put the output of extract_even operation and then the
   output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
   The input for the second stage is:

   1st vec (E1):  0  2  4  6  8 10 12 14
   2nd vec (E3): 16 18 20 22 24 26 28 30
   3rd vec (E2):  1  3  5  7  9 11 13 15
   4th vec (E4): 17 19 21 23 25 27 29 31

   The output of the second stage:

   E1:  0  4  8 12 16 20 24 28
   E2:  2  6 10 14 18 22 26 30
   E3:  1  5  9 13 17 21 25 29
   E4:  3  7 11 15 19 23 27 31

   And RESULT_CHAIN after reordering:

   1st vec (E1):  0  4  8 12 16 20 24 28
   2nd vec (E3):  1  5  9 13 17 21 25 29
   3rd vec (E2):  2  6 10 14 18 22 26 30
   4th vec (E4):  3  7 11 15 19 23 27 31.  */

static void
vect_permute_load_chain (vec<tree> dr_chain,
			 unsigned int length,
			 gimple stmt,
			 gimple_stmt_iterator *gsi,
			 vec<tree> *result_chain)
{
  tree data_ref, first_vect, second_vect;
  tree perm_mask_even, perm_mask_odd;
  gimple perm_stmt;
  tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
  unsigned int i, j, log_length = exact_log2 (length);
  unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
  unsigned char *sel = XALLOCAVEC (unsigned char, nelt);

  result_chain->quick_grow (length);
  memcpy (result_chain->address (), dr_chain.address (),
	  length * sizeof (tree));

  for (i = 0; i < nelt; ++i)
    sel[i] = i * 2;
  perm_mask_even = vect_gen_perm_mask (vectype, sel);
  gcc_assert (perm_mask_even != NULL);

  for (i = 0; i < nelt; ++i)
    sel[i] = i * 2 + 1;
  perm_mask_odd = vect_gen_perm_mask (vectype, sel);
  gcc_assert (perm_mask_odd != NULL);

  for (i = 0; i < log_length; i++)
    {
      for (j = 0; j < length; j += 2)
	{
	  first_vect = dr_chain[j];
	  second_vect = dr_chain[j + 1];

	  /* data_ref = permute_even (first_data_ref, second_data_ref);  */
	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
	  perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
						    first_vect, second_vect,
						    perm_mask_even);
	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);
	  (*result_chain)[j / 2] = data_ref;

	  /* data_ref = permute_odd (first_data_ref, second_data_ref);  */
	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
	  perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
						    first_vect, second_vect,
						    perm_mask_odd);
	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);
	  (*result_chain)[j / 2 + length / 2] = data_ref;
	}
      memcpy (dr_chain.address (), result_chain->address (),
	      length * sizeof (tree));
    }
}
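
/* Illustrative sketch (stand-alone, not used by the compiler): the same
   extract-even/odd stages as above, applied to arrays of element indices
   and sized for the 4-vector, 8-element example in the function comment.
   Starting from {0..7}, {8..15}, {16..23}, {24..31} it ends with
   {0 4 8 ...}, {1 5 9 ...}, {2 6 10 ...}, {3 7 11 ...}, i.e. the
   de-interleaved RESULT_CHAIN.  */

static void
permute_load_chain_example (unsigned int chain[4][8],
			    unsigned int length, unsigned int nelt)
{
  unsigned int tmp[4][8];
  unsigned int i, j, k, log_length;

  for (log_length = 0; (1u << log_length) < length; log_length++)
    ;

  for (i = 0; i < log_length; i++)
    {
      for (j = 0; j < length; j += 2)
	{
	  unsigned int *first = chain[j];
	  unsigned int *second = chain[j + 1];

	  for (k = 0; k < nelt / 2; k++)
	    {
	      /* Extracted even elements go to slot j/2 of the result chain,
		 odd elements to slot j/2 + length/2, as in the real code.  */
	      tmp[j / 2][k] = first[2 * k];
	      tmp[j / 2][k + nelt / 2] = second[2 * k];
	      tmp[j / 2 + length / 2][k] = first[2 * k + 1];
	      tmp[j / 2 + length / 2][k + nelt / 2] = second[2 * k + 1];
	    }
	}
      memcpy (chain, tmp, sizeof (tmp));
    }
}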

/* Function vect_transform_grouped_load.

   Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
   to perform their permutation and ascribe the resulting vectorized statements
   to the scalar statements.  */

void
vect_transform_grouped_load (gimple stmt, vec<tree> dr_chain, int size,
			     gimple_stmt_iterator *gsi)
{
  vec<tree> result_chain = vNULL;

  /* DR_CHAIN contains input data-refs that are a part of the interleaving.
     RESULT_CHAIN is the output of vect_permute_load_chain; it contains
     permuted vectors that are ready for vector computation.  */
  result_chain.create (size);
  vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
  vect_record_grouped_load_vectors (stmt, result_chain);
  result_chain.release ();
}

/* RESULT_CHAIN contains the output of a group of grouped loads that were
   generated as part of the vectorization of STMT.  Assign the statement
   for each vector to the associated scalar statement.  */

void
vect_record_grouped_load_vectors (gimple stmt, vec<tree> result_chain)
{
  gimple first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
  gimple next_stmt, new_stmt;
  unsigned int i, gap_count;
  tree tmp_data_ref;

  /* Put a permuted data-ref in the VECTORIZED_STMT field.
     Since we scan the chain starting from its first node, their order
     corresponds to the order of data-refs in RESULT_CHAIN.  */
  next_stmt = first_stmt;
  gap_count = 1;
  FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
    {
      if (!next_stmt)
	break;

      /* Skip the gaps.  Loads created for the gaps will be removed by dead
	 code elimination pass later.  No need to check for the first stmt in
	 the group, since it always exists.
	 GROUP_GAP is the number of steps in elements from the previous
	 access (if there is no gap GROUP_GAP is 1).  We skip loads that
	 correspond to the gaps.  */
      if (next_stmt != first_stmt
	  && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
	{
	  gap_count++;
	  continue;
	}

      while (next_stmt)
	{
	  new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
	  /* We assume that if VEC_STMT is not NULL, this is a case of multiple
	     copies, and we put the new vector statement in the first available
	     RELATED_STMT.  */
	  if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
	    STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
	  else
	    {
	      if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
		{
		  gimple prev_stmt =
		    STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
		  gimple rel_stmt =
		    STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
		  while (rel_stmt)
		    {
		      prev_stmt = rel_stmt;
		      rel_stmt =
			STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
		    }

		  STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
		    new_stmt;
		}
	    }

	  next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
	  gap_count = 1;
	  /* If NEXT_STMT accesses the same DR as the previous statement,
	     put the same TMP_DATA_REF as its vectorized statement; otherwise
	     get the next data-ref from RESULT_CHAIN.  */
	  if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
	    break;
	}
    }
}

/* Function vect_can_force_dr_alignment_p.

   Returns whether the alignment of a DECL can be forced to be aligned
   on ALIGNMENT bit boundary.  */

bool
vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
{
  if (TREE_CODE (decl) != VAR_DECL)
    return false;

  /* We cannot change alignment of common or external symbols as another
     translation unit may contain a definition with lower alignment.
     The rules of common symbol linking mean that the definition
     will override the common symbol.  The same is true for constant
     pool entries which may be shared and are not properly merged
     by LTO.  */
  if (DECL_EXTERNAL (decl)
      || DECL_COMMON (decl)
      || DECL_IN_CONSTANT_POOL (decl))
    return false;

  if (TREE_ASM_WRITTEN (decl))
    return false;

  /* Do not override the alignment as specified by the ABI when the used
     attribute is set.  */
  if (DECL_PRESERVE_P (decl))
    return false;

  /* Do not override explicit alignment set by the user when an explicit
     section name is also used.  This is a common idiom used by many
     software projects.  */
  if (DECL_SECTION_NAME (decl) != NULL_TREE
      && !DECL_HAS_IMPLICIT_SECTION_NAME_P (decl))
    return false;

  if (TREE_STATIC (decl))
    return (alignment <= MAX_OFILE_ALIGNMENT);
  else
    return (alignment <= MAX_STACK_ALIGNMENT);
}

/* Return whether the data reference DR is supported with respect to its
   alignment.
   If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
   if it is aligned, i.e., check if it is possible to vectorize it with
   different alignment.  */

enum dr_alignment_support
vect_supportable_dr_alignment (struct data_reference *dr,
			       bool check_aligned_accesses)
{
  gimple stmt = DR_STMT (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  enum machine_mode mode = TYPE_MODE (vectype);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *vect_loop = NULL;
  bool nested_in_vect_loop = false;

  if (aligned_access_p (dr) && !check_aligned_accesses)
    return dr_aligned;

  if (loop_vinfo)
    {
      vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
      nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
    }

  /* Possibly unaligned access.  */

  /* We can choose between using the implicit realignment scheme (generating
     a misaligned_move stmt) and the explicit realignment scheme (generating
     aligned loads with a REALIGN_LOAD).  There are two variants to the
     explicit realignment scheme: optimized, and unoptimized.
     We can optimize the realignment only if the step between consecutive
     vector loads is equal to the vector size.  Since the vector memory
     accesses advance in steps of VS (Vector Size) in the vectorized loop, it
     is guaranteed that the misalignment amount remains the same throughout the
     execution of the vectorized loop.  Therefore, we can create the
     "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
     at the loop preheader.

     However, in the case of outer-loop vectorization, when vectorizing a
     memory access in the inner-loop nested within the LOOP that is now being
     vectorized, while it is guaranteed that the misalignment of the
     vectorized memory access will remain the same in different outer-loop
     iterations, it is *not* guaranteed that it will remain the same throughout
     the execution of the inner-loop.  This is because the inner-loop advances
     with the original scalar step (and not in steps of VS).  If the inner-loop
     step happens to be a multiple of VS, then the misalignment remains fixed
     and we can use the optimized realignment scheme.  For example:

      for (i=0; i<N; i++)
        for (j=0; j<M; j++)
          s += a[i+j];

     When vectorizing the i-loop in the above example, the step between
     consecutive vector loads is 1, and so the misalignment does not remain
     fixed across the execution of the inner-loop, and the realignment cannot
     be optimized (as illustrated in the following pseudo vectorized loop):

      for (i=0; i<N; i+=4)
        for (j=0; j<M; j++){
          vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
                         // when j is {0,1,2,3,4,5,6,7,...} respectively.
                         // (assuming that we start from an aligned address).
        }

     We therefore have to use the unoptimized realignment scheme:

      for (i=0; i<N; i+=4)
        for (j=k; j<M; j+=4)
          vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
                         // that the misalignment of the initial address is
                         // 0).

     The loop can then be vectorized as follows:

      for (k=0; k<4; k++){
        rt = get_realignment_token (&vp[k]);
        for (i=0; i<N; i+=4){
          v1 = vp[i+k];
          for (j=k; j<M; j+=4){
            v2 = vp[i+j+VS-1];
            va = REALIGN_LOAD <v1,v2,rt>;
            vs += va;
            v1 = v2;
          }
        }
      }  */

  if (DR_IS_READ (dr))
    {
      bool is_packed = false;
      tree type = (TREE_TYPE (DR_REF (dr)));

      if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
	  && (!targetm.vectorize.builtin_mask_for_load
	      || targetm.vectorize.builtin_mask_for_load ()))
	{
	  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
	  if ((nested_in_vect_loop
	       && (TREE_INT_CST_LOW (DR_STEP (dr))
		   != GET_MODE_SIZE (TYPE_MODE (vectype))))
	      || !loop_vinfo)
	    return dr_explicit_realign;
	  else
	    return dr_explicit_realign_optimized;
	}
      if (!known_alignment_for_access_p (dr))
	is_packed = not_size_aligned (DR_REF (dr));

      if ((TYPE_USER_ALIGN (type) && !is_packed)
	  || targetm.vectorize.
		support_vector_misalignment (mode, type,
					     DR_MISALIGNMENT (dr), is_packed))
	/* Can't software pipeline the loads, but can at least do them.  */
	return dr_unaligned_supported;
    }
  else
    {
      bool is_packed = false;
      tree type = (TREE_TYPE (DR_REF (dr)));

      if (!known_alignment_for_access_p (dr))
	is_packed = not_size_aligned (DR_REF (dr));

      if ((TYPE_USER_ALIGN (type) && !is_packed)
	  || targetm.vectorize.
		support_vector_misalignment (mode, type,
					     DR_MISALIGNMENT (dr), is_packed))
	return dr_unaligned_supported;
    }

  /* Unsupported.  */
  return dr_unaligned_unsupported;
}
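
/* Illustrative sketch (stand-alone, not used by the compiler): why the
   misalignment of &vp[i+j] in the nested-loop example above cycles with j.
   For a hypothetical vector of VECTOR_ELTS elements and a scalar inner-loop
   step of 1, the offset of &vp[i+j] within a vector-sized block is
   (i + j) % VECTOR_ELTS, which takes the values 0,1,2,3,0,... as j advances,
   so no single realignment token is valid for the whole inner loop.  */

static inline unsigned int
inner_loop_misalignment_example (unsigned int i, unsigned int j,
				 unsigned int vector_elts)
{
  return (i + j) % vector_elts;
}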