1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2014 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
24 #include "coretypes.h"
28 #include "stor-layout.h"
31 #include "basic-block.h"
32 #include "gimple-pretty-print.h"
33 #include "tree-ssa-alias.h"
34 #include "internal-fn.h"
36 #include "gimple-expr.h"
40 #include "gimple-iterator.h"
41 #include "gimplify-me.h"
42 #include "gimple-ssa.h"
43 #include "tree-phinodes.h"
44 #include "ssa-iterators.h"
45 #include "stringpool.h"
46 #include "tree-ssanames.h"
47 #include "tree-ssa-loop-ivopts.h"
48 #include "tree-ssa-loop-manip.h"
49 #include "tree-ssa-loop.h"
52 #include "tree-chrec.h"
53 #include "tree-scalar-evolution.h"
54 #include "tree-vectorizer.h"
55 #include "diagnostic-core.h"
57 /* Need to include rtl.h, expr.h, etc. for optabs. */
/* NOTE(review): this extract of GCC's tree-vect-data-refs.c is damaged —
   the embedded original line numbers jump (65→66→68, 73→76, ...), so
   braces, `return` statements, the `static bool` declarator and some
   operands (e.g. mode_for_size's MODE_INT/limit_p arguments) are missing.
   Code is left byte-identical; restore from upstream before compiling.  */
61 /* Return true if load- or store-lanes optab OPTAB is implemented for
62 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
65 vect_lanes_optab_supported_p (const char *name
, convert_optab optab
,
66 tree vectype
, unsigned HOST_WIDE_INT count
)
68 enum machine_mode mode
, array_mode
;
/* Query the target whether an array mode of COUNT vectors exists; if not,
   mode_for_size below is limited (limit_p).  */
71 mode
= TYPE_MODE (vectype
);
72 limit_p
= !targetm
.array_mode_supported_p (mode
, count
);
73 array_mode
= mode_for_size (count
* GET_MODE_BITSIZE (mode
),
/* BLKmode means no integer/array mode of the required size exists —
   the lanes optab cannot be used; the missing branch presumably
   returns false (TODO confirm against upstream).  */
76 if (array_mode
== BLKmode
)
78 if (dump_enabled_p ())
79 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
80 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC
"]\n",
81 GET_MODE_NAME (mode
), count
);
/* The optab must have a handler converting between the array mode and
   the element vector mode.  */
85 if (convert_optab_handler (optab
, array_mode
, mode
) == CODE_FOR_nothing
)
87 if (dump_enabled_p ())
88 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
89 "cannot use %s<%s><%s>\n", name
,
90 GET_MODE_NAME (array_mode
), GET_MODE_NAME (mode
));
94 if (dump_enabled_p ())
95 dump_printf_loc (MSG_NOTE
, vect_location
,
96 "can use %s<%s><%s>\n", name
, GET_MODE_NAME (array_mode
),
97 GET_MODE_NAME (mode
));
/* NOTE(review): damaged extract — line numbers jump (117→121, 133→135),
   so the return type/declarator, braces and the trailing
   `return scalar_type;` are missing.  Code left byte-identical.  */
103 /* Return the smallest scalar part of STMT.
104 This is used to determine the vectype of the stmt. We generally set the
105 vectype according to the type of the result (lhs). For stmts whose
106 result-type is different than the type of the arguments (e.g., demotion,
107 promotion), vectype will be reset appropriately (later). Note that we have
108 to visit the smallest datatype in this function, because that determines the
109 VF. If the smallest datatype in the loop is present only as the rhs of a
110 promotion operation - we'd miss it.
111 Such a case, where a variable of this datatype does not appear in the lhs
112 anywhere in the loop, can only occur if it's an invariant: e.g.:
113 'int_x = (int) short_inv', which we'd expect to have been optimized away by
114 invariant motion. However, we cannot rely on invariant motion to always
115 take invariants out of the loop, and so in the case of promotion we also
116 have to check the rhs.
117 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
121 vect_get_smallest_scalar_type (gimple stmt
, HOST_WIDE_INT
*lhs_size_unit
,
122 HOST_WIDE_INT
*rhs_size_unit
)
124 tree scalar_type
= gimple_expr_type (stmt
);
125 HOST_WIDE_INT lhs
, rhs
;
/* Both sides start at the size of the expression type; a widening
   assign below may shrink the rhs side.  */
127 lhs
= rhs
= TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type
));
/* Widening operations (cast, widen-mult, widen-lshift, int->float):
   the smaller type is on the rhs.  */
129 if (is_gimple_assign (stmt
)
130 && (gimple_assign_cast_p (stmt
)
131 || gimple_assign_rhs_code (stmt
) == WIDEN_MULT_EXPR
132 || gimple_assign_rhs_code (stmt
) == WIDEN_LSHIFT_EXPR
133 || gimple_assign_rhs_code (stmt
) == FLOAT_EXPR
))
135 tree rhs_type
= TREE_TYPE (gimple_assign_rhs1 (stmt
));
/* Missing upstream guard here (line 136) presumably compares rhs < lhs
   before taking the rhs type — TODO confirm against upstream.  */
137 rhs
= TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type
));
139 scalar_type
= rhs_type
;
142 *lhs_size_unit
= lhs
;
143 *rhs_size_unit
= rhs
;
/* NOTE(review): damaged extract — missing lines (157→160, 184→188, ...)
   held the early-return branches and the final `return true;`.
   Code left byte-identical.  */
148 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
149 tested at run-time. Return TRUE if DDR was successfully inserted.
150 Return false if versioning is not supported. */
153 vect_mark_for_runtime_alias_test (ddr_p ddr
, loop_vec_info loop_vinfo
)
155 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
/* Versioning disabled by --param vect-max-version-for-alias-checks=0.  */
157 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS
) == 0)
160 if (dump_enabled_p ())
162 dump_printf_loc (MSG_NOTE
, vect_location
,
163 "mark for run-time aliasing test between ");
164 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (DDR_A (ddr
)));
165 dump_printf (MSG_NOTE
, " and ");
166 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (DDR_B (ddr
)));
167 dump_printf (MSG_NOTE
, "\n");
/* Versioning duplicates the loop body, so refuse it when optimizing
   the loop nest for size.  */
170 if (optimize_loop_nest_for_size_p (loop
))
172 if (dump_enabled_p ())
173 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
174 "versioning not supported when optimizing"
179 /* FORNOW: We don't support versioning with outer-loop vectorization. */
182 if (dump_enabled_p ())
183 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
184 "versioning not yet supported for outer-loops.\n");
188 /* FORNOW: We don't support creating runtime alias tests for non-constant
190 if (TREE_CODE (DR_STEP (DDR_A (ddr
))) != INTEGER_CST
191 || TREE_CODE (DR_STEP (DDR_B (ddr
))) != INTEGER_CST
)
193 if (dump_enabled_p ())
194 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
195 "versioning not yet supported for non-constant "
/* All checks passed: queue the DDR for a run-time alias test.  */
200 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo
).safe_push (ddr
);
/* NOTE(review): damaged extract — many physical lines are missing
   (jumps 223→225, 227→230, 231→235, 376→377, 389→398, 405→410, ...):
   braces, `return false;`/`return true;` statements, the `if (dist == 0)`
   and read-read guards, and several dump operands.  Code left
   byte-identical; restore from upstream before compiling.  */
205 /* Function vect_analyze_data_ref_dependence.
207 Return TRUE if there (might) exist a dependence between a memory-reference
208 DRA and a memory-reference DRB. When versioning for alias may check a
209 dependence at run-time, return FALSE. Adjust *MAX_VF according to
210 the data dependence. */
213 vect_analyze_data_ref_dependence (struct data_dependence_relation
*ddr
,
214 loop_vec_info loop_vinfo
, int *max_vf
)
217 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
218 struct data_reference
*dra
= DDR_A (ddr
);
219 struct data_reference
*drb
= DDR_B (ddr
);
220 stmt_vec_info stmtinfo_a
= vinfo_for_stmt (DR_STMT (dra
));
221 stmt_vec_info stmtinfo_b
= vinfo_for_stmt (DR_STMT (drb
));
222 lambda_vector dist_v
;
223 unsigned int loop_depth
;
225 /* In loop analysis all data references should be vectorizable. */
226 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a
)
227 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b
))
230 /* Independent data accesses. */
231 if (DDR_ARE_DEPENDENT (ddr
) == chrec_known
)
/* Missing line 234 presumably begins this condition with a same-DR or
   read check — TODO confirm against upstream.  */
235 || (DR_IS_READ (dra
) && DR_IS_READ (drb
)))
238 /* Even if we have an anti-dependence then, as the vectorized loop covers at
239 least two scalar iterations, there is always also a true dependence.
240 As the vectorizer does not re-order loads and stores we can ignore
241 the anti-dependence if TBAA can disambiguate both DRs similar to the
242 case with known negative distance anti-dependences (positive
243 distance anti-dependences would violate TBAA constraints). */
244 if (((DR_IS_READ (dra
) && DR_IS_WRITE (drb
))
245 || (DR_IS_WRITE (dra
) && DR_IS_READ (drb
)))
246 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra
)),
247 get_alias_set (DR_REF (drb
))))
250 /* Unknown data dependence. */
251 if (DDR_ARE_DEPENDENT (ddr
) == chrec_dont_know
)
253 /* If user asserted safelen consecutive iterations can be
254 executed concurrently, assume independence. */
255 if (loop
->safelen
>= 2)
257 if (loop
->safelen
< *max_vf
)
258 *max_vf
= loop
->safelen
;
259 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo
) = false;
/* Gather loads cannot be covered by a runtime alias check — bail out.  */
263 if (STMT_VINFO_GATHER_P (stmtinfo_a
)
264 || STMT_VINFO_GATHER_P (stmtinfo_b
))
266 if (dump_enabled_p ())
268 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
269 "versioning for alias not supported for: "
270 "can't determine dependence between ");
271 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
273 dump_printf (MSG_MISSED_OPTIMIZATION
, " and ");
274 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
276 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
281 if (dump_enabled_p ())
283 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
284 "versioning for alias required: "
285 "can't determine dependence between ");
286 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
288 dump_printf (MSG_MISSED_OPTIMIZATION
, " and ");
289 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
291 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
294 /* Add to list of ddrs that need to be tested at run-time. */
295 return !vect_mark_for_runtime_alias_test (ddr
, loop_vinfo
);
298 /* Known data dependence. */
299 if (DDR_NUM_DIST_VECTS (ddr
) == 0)
301 /* If user asserted safelen consecutive iterations can be
302 executed concurrently, assume independence. */
303 if (loop
->safelen
>= 2)
305 if (loop
->safelen
< *max_vf
)
306 *max_vf
= loop
->safelen
;
307 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo
) = false;
311 if (STMT_VINFO_GATHER_P (stmtinfo_a
)
312 || STMT_VINFO_GATHER_P (stmtinfo_b
))
314 if (dump_enabled_p ())
316 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
317 "versioning for alias not supported for: "
318 "bad dist vector for ");
319 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
321 dump_printf (MSG_MISSED_OPTIMIZATION
, " and ");
322 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
324 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
329 if (dump_enabled_p ())
331 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
332 "versioning for alias required: "
333 "bad dist vector for ");
334 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, DR_REF (dra
));
335 dump_printf (MSG_MISSED_OPTIMIZATION
, " and ");
336 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, DR_REF (drb
));
337 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
339 /* Add to list of ddrs that need to be tested at run-time. */
340 return !vect_mark_for_runtime_alias_test (ddr
, loop_vinfo
);
/* Walk each distance vector and classify the dependence by the distance
   at this loop's depth in the nest.  */
343 loop_depth
= index_in_loop_nest (loop
->num
, DDR_LOOP_NEST (ddr
));
344 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr
), i
, dist_v
)
346 int dist
= dist_v
[loop_depth
];
348 if (dump_enabled_p ())
349 dump_printf_loc (MSG_NOTE
, vect_location
,
350 "dependence distance = %d.\n", dist
);
/* Missing line ~353: `if (dist == 0)` guard for the block below —
   TODO confirm against upstream.  */
354 if (dump_enabled_p ())
356 dump_printf_loc (MSG_NOTE
, vect_location
,
357 "dependence distance == 0 between ");
358 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dra
));
359 dump_printf (MSG_NOTE
, " and ");
360 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (drb
));
/* NOTE(review): MSG_MISSED_OPTIMIZATION looks inconsistent with the
   surrounding MSG_NOTE dumps — verify against upstream.  */
361 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
364 /* When we perform grouped accesses and perform implicit CSE
365 by detecting equal accesses and doing disambiguation with
366 runtime alias tests like for
374 where we will end up loading { a[i], a[i+1] } once, make
375 sure that inserting group loads before the first load and
376 stores after the last store will do the right thing. */
377 if ((STMT_VINFO_GROUPED_ACCESS (stmtinfo_a
)
378 && GROUP_SAME_DR_STMT (stmtinfo_a
))
379 || (STMT_VINFO_GROUPED_ACCESS (stmtinfo_b
)
380 && GROUP_SAME_DR_STMT (stmtinfo_b
)))
383 earlier_stmt
= get_earlier_stmt (DR_STMT (dra
), DR_STMT (drb
));
/* Missing line 384 presumably wraps this in `if (DR_IS_WRITE (...))` —
   TODO confirm against upstream.  */
385 (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt
))))
387 if (dump_enabled_p ())
388 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
389 "READ_WRITE dependence in interleaving."
398 if (dist
> 0 && DDR_REVERSED_P (ddr
))
400 /* If DDR_REVERSED_P the order of the data-refs in DDR was
401 reversed (to make distance vector positive), and the actual
402 distance is negative. */
403 if (dump_enabled_p ())
404 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
405 "dependence distance negative.\n");
410 && abs (dist
) < *max_vf
)
412 /* The dependence distance requires reduction of the maximal
413 vectorization factor. */
414 *max_vf
= abs (dist
);
415 if (dump_enabled_p ())
416 dump_printf_loc (MSG_NOTE
, vect_location
,
417 "adjusting maximal vectorization factor to %i\n",
421 if (abs (dist
) >= *max_vf
)
423 /* Dependence distance does not create dependence, as far as
424 vectorization is concerned, in this case. */
425 if (dump_enabled_p ())
426 dump_printf_loc (MSG_NOTE
, vect_location
,
427 "dependence distance >= VF.\n");
431 if (dump_enabled_p ())
433 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
434 "not vectorized, possible dependence "
435 "between data-refs ");
436 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dra
));
437 dump_printf (MSG_NOTE
, " and ");
438 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (drb
));
439 dump_printf (MSG_NOTE
, "\n");
/* NOTE(review): damaged extract — missing lines held the declarator,
   braces, the `unsigned int i;` declaration, and the `return false;` /
   `return true;` statements.  Code left byte-identical.  */
448 /* Function vect_analyze_data_ref_dependences.
450 Examine all the data references in the loop, and make sure there do not
451 exist any data dependences between them. Set *MAX_VF according to
452 the maximum vectorization factor the data dependences allow. */
455 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo
, int *max_vf
)
458 struct data_dependence_relation
*ddr
;
460 if (dump_enabled_p ())
461 dump_printf_loc (MSG_NOTE
, vect_location
,
462 "=== vect_analyze_data_ref_dependences ===\n");
/* Assume independence until a dependence proves otherwise (cleared by
   vect_analyze_data_ref_dependence).  */
464 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo
) = true;
465 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo
),
466 &LOOP_VINFO_DDRS (loop_vinfo
),
467 LOOP_VINFO_LOOP_NEST (loop_vinfo
), true))
/* Any DDR that vect_analyze_data_ref_dependence rejects fails the
   whole analysis (the missing branch presumably returns false).  */
470 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo
), i
, ddr
)
471 if (vect_analyze_data_ref_dependence (ddr
, loop_vinfo
, max_vf
))
/* NOTE(review): damaged extract — missing lines include braces and the
   `return true;`/`return false;` statements for each branch.  Code left
   byte-identical.  Also note: this BB/SLP variant takes no MAX_VF
   parameter despite the header comment mentioning it (stale copy of the
   loop variant's comment — verify against upstream).  */
478 /* Function vect_slp_analyze_data_ref_dependence.
480 Return TRUE if there (might) exist a dependence between a memory-reference
481 DRA and a memory-reference DRB. When versioning for alias may check a
482 dependence at run-time, return FALSE. Adjust *MAX_VF according to
483 the data dependence. */
486 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation
*ddr
)
488 struct data_reference
*dra
= DDR_A (ddr
);
489 struct data_reference
*drb
= DDR_B (ddr
);
491 /* We need to check dependences of statements marked as unvectorizable
492 as well, they still can prohibit vectorization. */
494 /* Independent data accesses. */
495 if (DDR_ARE_DEPENDENT (ddr
) == chrec_known
)
501 /* Read-read is OK. */
502 if (DR_IS_READ (dra
) && DR_IS_READ (drb
))
505 /* If dra and drb are part of the same interleaving chain consider
507 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra
)))
508 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra
)))
509 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb
)))))
512 /* Unknown data dependence. */
513 if (DDR_ARE_DEPENDENT (ddr
) == chrec_dont_know
)
515 if (dump_enabled_p ())
517 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
518 "can't determine dependence between ");
519 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, DR_REF (dra
));
520 dump_printf (MSG_MISSED_OPTIMIZATION
, " and ");
521 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, DR_REF (drb
));
522 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
525 else if (dump_enabled_p ())
527 dump_printf_loc (MSG_NOTE
, vect_location
,
528 "determined dependence between ");
529 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dra
));
530 dump_printf (MSG_NOTE
, " and ");
531 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (drb
));
532 dump_printf (MSG_NOTE
, "\n");
535 /* We do not vectorize basic blocks with write-write dependencies. */
536 if (DR_IS_WRITE (dra
) && DR_IS_WRITE (drb
))
539 /* If we have a read-write dependence check that the load is before the store.
540 When we vectorize basic blocks, vector load can be only before
541 corresponding scalar load, and vector store can be only after its
542 corresponding scalar store. So the order of the accesses is preserved in
543 case the load is before the store. */
544 gimple earlier_stmt
= get_earlier_stmt (DR_STMT (dra
), DR_STMT (drb
));
545 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt
))))
547 /* That only holds for load-store pairs taking part in vectorization. */
548 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dra
)))
549 && STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (drb
))))
/* NOTE(review): damaged extract — missing lines held the declarator,
   `unsigned int i;`, the compute_all_dependences trailing arguments and
   the return statements.  Code left byte-identical.  The header below
   named the loop variant and mentioned MAX_VF, which this BB variant
   does not take; comment text corrected to match the function.  */
557 /* Function vect_slp_analyze_data_ref_dependences.
559 Examine all the data references in the basic-block, and make sure there
560 do not exist any data dependences between them. Set *MAX_VF according to
561 the maximum vectorization factor the data dependences allow. */
564 vect_slp_analyze_data_ref_dependences (bb_vec_info bb_vinfo
)
566 struct data_dependence_relation
*ddr
;
569 if (dump_enabled_p ())
570 dump_printf_loc (MSG_NOTE
, vect_location
,
571 "=== vect_slp_analyze_data_ref_dependences ===\n");
573 if (!compute_all_dependences (BB_VINFO_DATAREFS (bb_vinfo
),
574 &BB_VINFO_DDRS (bb_vinfo
),
/* Any DDR rejected by the per-DDR check fails BB vectorization
   (the missing branch presumably returns false).  */
578 FOR_EACH_VEC_ELT (BB_VINFO_DDRS (bb_vinfo
), i
, ddr
)
579 if (vect_slp_analyze_data_ref_dependence (ddr
))
/* NOTE(review): damaged extract — many missing lines (jumps 596→599,
   605→607, 691→697, 703→706, ...) held the declarator, braces, several
   declarations (misalign, vectype, base_aligned), condition heads and
   all `return` statements.  Code left byte-identical; restore from
   upstream before compiling.  */
586 /* Function vect_compute_data_ref_alignment
588 Compute the misalignment of the data reference DR.
591 1. If during the misalignment computation it is found that the data reference
592 cannot be vectorized then false is returned.
593 2. DR_MISALIGNMENT (DR) is defined.
595 FOR NOW: No analysis is actually performed. Misalignment is calculated
596 only for trivial cases. TODO. */
599 vect_compute_data_ref_alignment (struct data_reference
*dr
)
601 gimple stmt
= DR_STMT (dr
);
602 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
603 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
604 struct loop
*loop
= NULL
;
605 tree ref
= DR_REF (dr
);
607 tree base
, base_addr
;
610 tree aligned_to
, alignment
;
612 if (dump_enabled_p ())
613 dump_printf_loc (MSG_NOTE
, vect_location
,
614 "vect_compute_data_ref_alignment:\n");
617 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
619 /* Initialize misalignment to unknown. */
620 SET_DR_MISALIGNMENT (dr
, -1);
622 /* Strided loads perform only component accesses, misalignment information
623 is irrelevant for them. */
624 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info
))
627 misalign
= DR_INIT (dr
);
628 aligned_to
= DR_ALIGNED_TO (dr
);
629 base_addr
= DR_BASE_ADDRESS (dr
);
630 vectype
= STMT_VINFO_VECTYPE (stmt_info
);
632 /* In case the dataref is in an inner-loop of the loop that is being
633 vectorized (LOOP), we use the base and misalignment information
634 relative to the outer-loop (LOOP). This is ok only if the misalignment
635 stays the same throughout the execution of the inner-loop, which is why
636 we have to check that the stride of the dataref in the inner-loop evenly
637 divides by the vector size. */
638 if (loop
&& nested_in_vect_loop_p (loop
, stmt
))
640 tree step
= DR_STEP (dr
);
641 HOST_WIDE_INT dr_step
= TREE_INT_CST_LOW (step
);
643 if (dr_step
% GET_MODE_SIZE (TYPE_MODE (vectype
)) == 0)
645 if (dump_enabled_p ())
646 dump_printf_loc (MSG_NOTE
, vect_location
,
647 "inner step divides the vector-size.\n");
648 misalign
= STMT_VINFO_DR_INIT (stmt_info
);
649 aligned_to
= STMT_VINFO_DR_ALIGNED_TO (stmt_info
);
650 base_addr
= STMT_VINFO_DR_BASE_ADDRESS (stmt_info
);
654 if (dump_enabled_p ())
655 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
656 "inner step doesn't divide the vector-size.\n");
/* NULL_TREE misalign means "unknown" — DR_MISALIGNMENT stays -1.  */
657 misalign
= NULL_TREE
;
661 /* Similarly, if we're doing basic-block vectorization, we can only use
662 base and misalignment information relative to an innermost loop if the
663 misalignment stays the same throughout the execution of the loop.
664 As above, this is the case if the stride of the dataref evenly divides
665 by the vector size. */
668 tree step
= DR_STEP (dr
);
669 HOST_WIDE_INT dr_step
= TREE_INT_CST_LOW (step
);
671 if (dr_step
% GET_MODE_SIZE (TYPE_MODE (vectype
)) != 0)
673 if (dump_enabled_p ())
674 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
675 "SLP: step doesn't divide the vector-size.\n");
676 misalign
= NULL_TREE
;
/* Alignment in bytes of the vector type we want for this access.  */
680 base
= build_fold_indirect_ref (base_addr
);
681 alignment
= ssize_int (TYPE_ALIGN (vectype
)/BITS_PER_UNIT
);
683 if ((aligned_to
&& tree_int_cst_compare (aligned_to
, alignment
) < 0)
686 if (dump_enabled_p ())
688 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
689 "Unknown alignment for access: ");
690 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, base
);
691 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
/* Disjunction deciding whether the base object is already sufficiently
   aligned (DECL alignment, pointed-to type alignment for SSA_NAME
   bases, or known pointer alignment); leading lines missing.  */
697 && tree_int_cst_compare (ssize_int (DECL_ALIGN_UNIT (base
)),
699 || (TREE_CODE (base_addr
) == SSA_NAME
700 && tree_int_cst_compare (ssize_int (TYPE_ALIGN_UNIT (TREE_TYPE (
701 TREE_TYPE (base_addr
)))),
703 || (get_pointer_alignment (base_addr
) >= TYPE_ALIGN (vectype
)))
706 base_aligned
= false;
710 /* Do not change the alignment of global variables here if
711 flag_section_anchors is enabled as we already generated
712 RTL for other functions. Most global variables should
713 have been aligned during the IPA increase_alignment pass. */
714 if (!vect_can_force_dr_alignment_p (base
, TYPE_ALIGN (vectype
))
715 || (TREE_STATIC (base
) && flag_section_anchors
))
717 if (dump_enabled_p ())
719 dump_printf_loc (MSG_NOTE
, vect_location
,
720 "can't force alignment of ref: ");
721 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, ref
);
722 dump_printf (MSG_NOTE
, "\n");
727 /* Force the alignment of the decl.
728 NOTE: This is the only change to the code we make during
729 the analysis phase, before deciding to vectorize the loop. */
730 if (dump_enabled_p ())
732 dump_printf_loc (MSG_NOTE
, vect_location
, "force alignment of ");
733 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, ref
);
734 dump_printf (MSG_NOTE
, "\n");
/* Record the forced alignment on the DR's aux so the transform phase
   can apply it later.  */
737 ((dataref_aux
*)dr
->aux
)->base_decl
= base
;
738 ((dataref_aux
*)dr
->aux
)->base_misaligned
= true;
741 /* If this is a backward running DR then first access in the larger
742 vectype actually is N-1 elements before the address in the DR.
743 Adjust misalign accordingly. */
744 if (tree_int_cst_compare (DR_STEP (dr
), size_zero_node
) < 0)
746 tree offset
= ssize_int (TYPE_VECTOR_SUBPARTS (vectype
) - 1);
747 /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
748 otherwise we wouldn't be here. */
749 offset
= fold_build2 (MULT_EXPR
, ssizetype
, offset
, DR_STEP (dr
));
750 /* PLUS because DR_STEP was negative. */
751 misalign
= size_binop (PLUS_EXPR
, misalign
, offset
);
754 /* Modulo alignment. */
755 misalign
= size_binop (FLOOR_MOD_EXPR
, misalign
, alignment
);
757 if (!tree_fits_uhwi_p (misalign
))
759 /* Negative or overflowed misalignment value. */
760 if (dump_enabled_p ())
761 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
762 "unexpected misalign value\n");
766 SET_DR_MISALIGNMENT (dr
, tree_to_uhwi (misalign
));
768 if (dump_enabled_p ())
770 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
771 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr
));
772 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, ref
);
773 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
/* NOTE(review): damaged extract — missing lines held the declarator,
   `unsigned int i;`, the `if (loop_vinfo) ... else ...` selector around
   the two datarefs assignments, and the return statements.  Code left
   byte-identical.  */
780 /* Function vect_compute_data_refs_alignment
782 Compute the misalignment of data references in the loop.
783 Return FALSE if a data reference is found that cannot be vectorized. */
786 vect_compute_data_refs_alignment (loop_vec_info loop_vinfo
,
787 bb_vec_info bb_vinfo
)
789 vec
<data_reference_p
> datarefs
;
790 struct data_reference
*dr
;
/* Pick the dataref set from the loop or bb context (the guarding
   if/else lines are missing here — TODO confirm against upstream).  */
794 datarefs
= LOOP_VINFO_DATAREFS (loop_vinfo
);
796 datarefs
= BB_VINFO_DATAREFS (bb_vinfo
);
798 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
799 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr
)))
800 && !vect_compute_data_ref_alignment (dr
))
804 /* Mark unsupported statement as unvectorizable. */
805 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr
))) = false;
/* NOTE(review): damaged extract — missing lines held the declarator,
   `unsigned i;`, braces, the `same_align_drs` assignment's left-hand
   line (846 shows only `= STMT_VINFO_SAME_ALIGN_REFS...`), and
   `return` statements.  Code left byte-identical.  */
816 /* Function vect_update_misalignment_for_peel
818 DR - the data reference whose misalignment is to be adjusted.
819 DR_PEEL - the data reference whose misalignment is being made
820 zero in the vector loop by the peel.
821 NPEEL - the number of iterations in the peel loop if the misalignment
822 of DR_PEEL is known at compile time. */
825 vect_update_misalignment_for_peel (struct data_reference
*dr
,
826 struct data_reference
*dr_peel
, int npeel
)
829 vec
<dr_p
> same_align_drs
;
830 struct data_reference
*current_dr
;
831 int dr_size
= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr
))));
832 int dr_peel_size
= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel
))));
833 stmt_vec_info stmt_info
= vinfo_for_stmt (DR_STMT (dr
));
834 stmt_vec_info peel_stmt_info
= vinfo_for_stmt (DR_STMT (dr_peel
));
836 /* For interleaved data accesses the step in the loop must be multiplied by
837 the size of the interleaving group. */
838 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
839 dr_size
*= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info
)));
840 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info
))
841 dr_peel_size
*= GROUP_SIZE (peel_stmt_info
);
843 /* It can be assumed that the data refs with the same alignment as dr_peel
844 are aligned in the vector loop. */
846 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel
)));
847 FOR_EACH_VEC_ELT (same_align_drs
, i
, current_dr
)
849 if (current_dr
!= dr
)
/* DR shares DR_PEEL's alignment class, so peeling that aligns DR_PEEL
   also aligns DR (the assert checks the element-scaled misalignments
   agree).  */
851 gcc_assert (DR_MISALIGNMENT (dr
) / dr_size
==
852 DR_MISALIGNMENT (dr_peel
) / dr_peel_size
);
853 SET_DR_MISALIGNMENT (dr
, 0);
857 if (known_alignment_for_access_p (dr
)
858 && known_alignment_for_access_p (dr_peel
))
/* Known misalignments: shift DR's misalignment by the peeled bytes
   (sign depends on the DR's step direction) and wrap modulo the
   vector alignment.  */
860 bool negative
= tree_int_cst_compare (DR_STEP (dr
), size_zero_node
) < 0;
861 int misal
= DR_MISALIGNMENT (dr
);
862 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
863 misal
+= negative
? -npeel
* dr_size
: npeel
* dr_size
;
864 misal
&= (TYPE_ALIGN (vectype
) / BITS_PER_UNIT
) - 1;
865 SET_DR_MISALIGNMENT (dr
, misal
);
/* Otherwise the misalignment after peeling is unknown.  */
869 if (dump_enabled_p ())
870 dump_printf_loc (MSG_NOTE
, vect_location
, "Setting misalignment to -1.\n");
871 SET_DR_MISALIGNMENT (dr
, -1);
/* NOTE(review): damaged extract — missing lines held the declarator,
   loop-continue statements after each skip condition, the DR_IS_LOAD
   selector between the two "unsupported unaligned" dumps, and the
   return statements.  Code left byte-identical.  */
875 /* Function vect_verify_datarefs_alignment
877 Return TRUE if all data references in the loop can be
878 handled with respect to alignment. */
881 vect_verify_datarefs_alignment (loop_vec_info loop_vinfo
, bb_vec_info bb_vinfo
)
883 vec
<data_reference_p
> datarefs
;
884 struct data_reference
*dr
;
885 enum dr_alignment_support supportable_dr_alignment
;
/* Pick the dataref set from the loop or bb context (the selector
   lines are missing here — TODO confirm against upstream).  */
889 datarefs
= LOOP_VINFO_DATAREFS (loop_vinfo
);
891 datarefs
= BB_VINFO_DATAREFS (bb_vinfo
);
893 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
895 gimple stmt
= DR_STMT (dr
);
896 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
898 if (!STMT_VINFO_RELEVANT_P (stmt_info
))
901 /* For interleaving, only the alignment of the first access matters.
902 Skip statements marked as not vectorizable. */
903 if ((STMT_VINFO_GROUPED_ACCESS (stmt_info
)
904 && GROUP_FIRST_ELEMENT (stmt_info
) != stmt
)
905 || !STMT_VINFO_VECTORIZABLE (stmt_info
))
908 /* Strided loads perform only component accesses, alignment is
909 irrelevant for them. */
910 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info
))
913 supportable_dr_alignment
= vect_supportable_dr_alignment (dr
, false);
914 if (!supportable_dr_alignment
)
916 if (dump_enabled_p ())
/* Load vs store message split — the guarding if/else lines are
   missing between these two dump calls.  */
919 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
920 "not vectorized: unsupported unaligned load.");
922 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
923 "not vectorized: unsupported unaligned "
926 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
928 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
932 if (supportable_dr_alignment
!= dr_aligned
&& dump_enabled_p ())
933 dump_printf_loc (MSG_NOTE
, vect_location
,
934 "Vectorizing an unaligned access.\n");
/* NOTE(review): damaged extract — the comment's second line, the
   declarator and the early-return body are missing.  Code left
   byte-identical.  Returns whether the object's type size (in bits)
   exceeds its known object alignment, i.e. the access may straddle
   its alignment boundary.  */
939 /* Given a memory reference EXP return whether its alignment is less
943 not_size_aligned (tree exp
)
945 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp
))))
948 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp
)))
949 > get_object_alignment (exp
));
/* NOTE(review): damaged extract — missing lines held the declarator,
   braces and all `return true;`/`return false;` statements.  Code left
   byte-identical.  */
952 /* Function vector_alignment_reachable_p
954 Return true if vector alignment for DR is reachable by peeling
955 a few loop iterations. Return false otherwise. */
958 vector_alignment_reachable_p (struct data_reference
*dr
)
960 gimple stmt
= DR_STMT (dr
);
961 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
962 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
964 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
966 /* For interleaved access we peel only if number of iterations in
967 the prolog loop ({VF - misalignment}), is a multiple of the
968 number of the interleaved accesses. */
969 int elem_size
, mis_in_elements
;
970 int nelements
= TYPE_VECTOR_SUBPARTS (vectype
);
972 /* FORNOW: handle only known alignment. */
973 if (!known_alignment_for_access_p (dr
))
976 elem_size
= GET_MODE_SIZE (TYPE_MODE (vectype
)) / nelements
;
977 mis_in_elements
= DR_MISALIGNMENT (dr
) / elem_size
;
/* Prologue iteration count must be a multiple of the group size,
   otherwise peeling cannot align the whole group.  */
979 if ((nelements
- mis_in_elements
) % GROUP_SIZE (stmt_info
))
983 /* If misalignment is known at the compile time then allow peeling
984 only if natural alignment is reachable through peeling. */
985 if (known_alignment_for_access_p (dr
) && !aligned_access_p (dr
))
987 HOST_WIDE_INT elmsize
=
988 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype
)));
989 if (dump_enabled_p ())
991 dump_printf_loc (MSG_NOTE
, vect_location
,
992 "data size =" HOST_WIDE_INT_PRINT_DEC
, elmsize
);
993 dump_printf (MSG_NOTE
,
994 ". misalignment = %d.\n", DR_MISALIGNMENT (dr
));
/* Misalignment that is not a multiple of the element size can never
   be removed by peeling whole iterations.  */
996 if (DR_MISALIGNMENT (dr
) % elmsize
)
998 if (dump_enabled_p ())
999 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1000 "data size does not divide the misalignment.\n");
/* Unknown misalignment: defer to the target hook, treating accesses
   whose size exceeds their alignment as packed.  */
1005 if (!known_alignment_for_access_p (dr
))
1007 tree type
= TREE_TYPE (DR_REF (dr
));
1008 bool is_packed
= not_size_aligned (DR_REF (dr
));
1009 if (dump_enabled_p ())
1010 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1011 "Unknown misalignment, is_packed = %d\n",is_packed
);
1012 if ((TYPE_USER_ALIGN (type
) && !is_packed
)
1013 || targetm
.vectorize
.vector_alignment_reachable (type
, is_packed
))
/* NOTE(review): damaged extract — missing lines held the `static void`
   declarator, braces and the `else` between the load and store cost
   calls.  Code left byte-identical.  Accumulates into *INSIDE_COST /
   *OUTSIDE_COST for NCOPIES = VF / NUNITS vector copies of the access.  */
1023 /* Calculate the cost of the memory access represented by DR. */
1026 vect_get_data_access_cost (struct data_reference
*dr
,
1027 unsigned int *inside_cost
,
1028 unsigned int *outside_cost
,
1029 stmt_vector_for_cost
*body_cost_vec
)
1031 gimple stmt
= DR_STMT (dr
);
1032 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
1033 int nunits
= TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info
));
1034 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
1035 int vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
1036 int ncopies
= vf
/ nunits
;
1038 if (DR_IS_READ (dr
))
1039 vect_get_load_cost (dr
, ncopies
, true, inside_cost
, outside_cost
,
1040 NULL
, body_cost_vec
, false);
/* Store path (the `else` line is missing from the extract).  */
1042 vect_get_store_cost (dr
, ncopies
, inside_cost
, body_cost_vec
);
1044 if (dump_enabled_p ())
1045 dump_printf_loc (MSG_NOTE
, vect_location
,
1046 "vect_get_data_access_cost: inside_cost = %d, "
1047 "outside_cost = %d.\n", *inside_cost
, *outside_cost
);
/* NOTE(review): damaged extract — missing lines held the declarator's
   trailing `int npeel` parameter, the `elem.npeel = npeel;` setup, the
   `if (slot)` existing-entry increment, the dr/count initialization of
   the new slot and `*new_slot = slot;`.  Code left byte-identical.  */
1051 /* Insert DR into peeling hash table with NPEEL as key. */
1054 vect_peeling_hash_insert (loop_vec_info loop_vinfo
, struct data_reference
*dr
,
1057 struct _vect_peel_info elem
, *slot
;
1058 _vect_peel_info
**new_slot
;
1059 bool supportable_dr_alignment
= vect_supportable_dr_alignment (dr
, true);
1062 slot
= LOOP_VINFO_PEELING_HTAB (loop_vinfo
).find (&elem
);
/* No existing entry for this npeel: allocate and insert one.  */
1067 slot
= XNEW (struct _vect_peel_info
);
1068 slot
->npeel
= npeel
;
1071 new_slot
= LOOP_VINFO_PEELING_HTAB (loop_vinfo
).find_slot (slot
, INSERT
);
/* Penalize unsupportable alignment under the unlimited cost model so
   this peeling option is never chosen as "most frequent".  */
1075 if (!supportable_dr_alignment
1076 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo
)))
1077 slot
->count
+= VECT_MAX_COST
;
/* NOTE(review): damaged extract — missing lines held the `static int`
   declarator, braces and the `return 1;` (htab traversal callbacks
   return nonzero to continue).  Code left byte-identical.
   Keeps in *MAX the entry with the highest count, breaking ties in
   favor of the smaller npeel.  */
1081 /* Traverse peeling hash table to find peeling option that aligns maximum
1082 number of data accesses. */
1085 vect_peeling_hash_get_most_frequent (_vect_peel_info
**slot
,
1086 _vect_peel_extended_info
*max
)
1088 vect_peel_info elem
= *slot
;
1090 if (elem
->count
> max
->peel_info
.count
1091 || (elem
->count
== max
->peel_info
.count
1092 && max
->peel_info
.npeel
> elem
->npeel
))
1094 max
->peel_info
.npeel
= elem
->npeel
;
1095 max
->peel_info
.count
= elem
->count
;
1096 max
->peel_info
.dr
= elem
->dr
;
1103 /* Traverse peeling hash table and calculate cost for each peeling option.
1104 Find the one with the lowest cost. */
1107 vect_peeling_hash_get_lowest_cost (_vect_peel_info
**slot
,
1108 _vect_peel_extended_info
*min
)
1110 vect_peel_info elem
= *slot
;
1111 int save_misalignment
, dummy
;
1112 unsigned int inside_cost
= 0, outside_cost
= 0, i
;
1113 gimple stmt
= DR_STMT (elem
->dr
);
1114 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
1115 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
1116 vec
<data_reference_p
> datarefs
= LOOP_VINFO_DATAREFS (loop_vinfo
);
1117 struct data_reference
*dr
;
1118 stmt_vector_for_cost prologue_cost_vec
, body_cost_vec
, epilogue_cost_vec
;
1119 int single_iter_cost
;
1121 prologue_cost_vec
.create (2);
1122 body_cost_vec
.create (2);
1123 epilogue_cost_vec
.create (2);
1125 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
1127 stmt
= DR_STMT (dr
);
1128 stmt_info
= vinfo_for_stmt (stmt
);
1129 /* For interleaving, only the alignment of the first access
1131 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1132 && GROUP_FIRST_ELEMENT (stmt_info
) != stmt
)
1135 save_misalignment
= DR_MISALIGNMENT (dr
);
1136 vect_update_misalignment_for_peel (dr
, elem
->dr
, elem
->npeel
);
1137 vect_get_data_access_cost (dr
, &inside_cost
, &outside_cost
,
1139 SET_DR_MISALIGNMENT (dr
, save_misalignment
);
1142 single_iter_cost
= vect_get_single_scalar_iteration_cost (loop_vinfo
);
1143 outside_cost
+= vect_get_known_peeling_cost (loop_vinfo
, elem
->npeel
,
1144 &dummy
, single_iter_cost
,
1146 &epilogue_cost_vec
);
1148 /* Prologue and epilogue costs are added to the target model later.
1149 These costs depend only on the scalar iteration cost, the
1150 number of peeling iterations finally chosen, and the number of
1151 misaligned statements. So discard the information found here. */
1152 prologue_cost_vec
.release ();
1153 epilogue_cost_vec
.release ();
1155 if (inside_cost
< min
->inside_cost
1156 || (inside_cost
== min
->inside_cost
&& outside_cost
< min
->outside_cost
))
1158 min
->inside_cost
= inside_cost
;
1159 min
->outside_cost
= outside_cost
;
1160 min
->body_cost_vec
.release ();
1161 min
->body_cost_vec
= body_cost_vec
;
1162 min
->peel_info
.dr
= elem
->dr
;
1163 min
->peel_info
.npeel
= elem
->npeel
;
1166 body_cost_vec
.release ();
1172 /* Choose best peeling option by traversing peeling hash table and either
1173 choosing an option with the lowest cost (if cost model is enabled) or the
1174 option that aligns as many accesses as possible. */
1176 static struct data_reference
*
1177 vect_peeling_hash_choose_best_peeling (loop_vec_info loop_vinfo
,
1178 unsigned int *npeel
,
1179 stmt_vector_for_cost
*body_cost_vec
)
1181 struct _vect_peel_extended_info res
;
1183 res
.peel_info
.dr
= NULL
;
1184 res
.body_cost_vec
= stmt_vector_for_cost ();
1186 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo
)))
1188 res
.inside_cost
= INT_MAX
;
1189 res
.outside_cost
= INT_MAX
;
1190 LOOP_VINFO_PEELING_HTAB (loop_vinfo
)
1191 .traverse
<_vect_peel_extended_info
*,
1192 vect_peeling_hash_get_lowest_cost
> (&res
);
1196 res
.peel_info
.count
= 0;
1197 LOOP_VINFO_PEELING_HTAB (loop_vinfo
)
1198 .traverse
<_vect_peel_extended_info
*,
1199 vect_peeling_hash_get_most_frequent
> (&res
);
1202 *npeel
= res
.peel_info
.npeel
;
1203 *body_cost_vec
= res
.body_cost_vec
;
1204 return res
.peel_info
.dr
;
1208 /* Function vect_enhance_data_refs_alignment
1210 This pass will use loop versioning and loop peeling in order to enhance
1211 the alignment of data references in the loop.
1213 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1214 original loop is to be vectorized. Any other loops that are created by
1215 the transformations performed in this pass - are not supposed to be
1216 vectorized. This restriction will be relaxed.
1218 This pass will require a cost model to guide it whether to apply peeling
1219 or versioning or a combination of the two. For example, the scheme that
1220 intel uses when given a loop with several memory accesses, is as follows:
1221 choose one memory access ('p') which alignment you want to force by doing
1222 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1223 other accesses are not necessarily aligned, or (2) use loop versioning to
1224 generate one loop in which all accesses are aligned, and another loop in
1225 which only 'p' is necessarily aligned.
1227 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1228 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
1229 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1231 Devising a cost model is the most critical aspect of this work. It will
1232 guide us on which access to peel for, whether to use loop versioning, how
1233 many versions to create, etc. The cost model will probably consist of
1234 generic considerations as well as target specific considerations (on
1235 powerpc for example, misaligned stores are more painful than misaligned
1238 Here are the general steps involved in alignment enhancements:
1240 -- original loop, before alignment analysis:
1241 for (i=0; i<N; i++){
1242 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1243 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1246 -- After vect_compute_data_refs_alignment:
1247 for (i=0; i<N; i++){
1248 x = q[i]; # DR_MISALIGNMENT(q) = 3
1249 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1252 -- Possibility 1: we do loop versioning:
1254 for (i=0; i<N; i++){ # loop 1A
1255 x = q[i]; # DR_MISALIGNMENT(q) = 3
1256 p[i] = y; # DR_MISALIGNMENT(p) = 0
1260 for (i=0; i<N; i++){ # loop 1B
1261 x = q[i]; # DR_MISALIGNMENT(q) = 3
1262 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1266 -- Possibility 2: we do loop peeling:
1267 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1271 for (i = 3; i < N; i++){ # loop 2A
1272 x = q[i]; # DR_MISALIGNMENT(q) = 0
1273 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1276 -- Possibility 3: combination of loop peeling and versioning:
1277 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1282 for (i = 3; i<N; i++){ # loop 3A
1283 x = q[i]; # DR_MISALIGNMENT(q) = 0
1284 p[i] = y; # DR_MISALIGNMENT(p) = 0
1288 for (i = 3; i<N; i++){ # loop 3B
1289 x = q[i]; # DR_MISALIGNMENT(q) = 0
1290 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1294 These loops are later passed to loop_transform to be vectorized. The
1295 vectorizer will use the alignment information to guide the transformation
1296 (whether to generate regular loads/stores, or with special handling for
1300 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo
)
1302 vec
<data_reference_p
> datarefs
= LOOP_VINFO_DATAREFS (loop_vinfo
);
1303 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1304 enum dr_alignment_support supportable_dr_alignment
;
1305 struct data_reference
*dr0
= NULL
, *first_store
= NULL
;
1306 struct data_reference
*dr
;
1308 bool do_peeling
= false;
1309 bool do_versioning
= false;
1312 stmt_vec_info stmt_info
;
1313 unsigned int npeel
= 0;
1314 bool all_misalignments_unknown
= true;
1315 unsigned int vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
1316 unsigned possible_npeel_number
= 1;
1318 unsigned int nelements
, mis
, same_align_drs_max
= 0;
1319 stmt_vector_for_cost body_cost_vec
= stmt_vector_for_cost ();
1321 if (dump_enabled_p ())
1322 dump_printf_loc (MSG_NOTE
, vect_location
,
1323 "=== vect_enhance_data_refs_alignment ===\n");
1325 /* While cost model enhancements are expected in the future, the high level
1326 view of the code at this time is as follows:
1328 A) If there is a misaligned access then see if peeling to align
1329 this access can make all data references satisfy
1330 vect_supportable_dr_alignment. If so, update data structures
1331 as needed and return true.
1333 B) If peeling wasn't possible and there is a data reference with an
1334 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1335 then see if loop versioning checks can be used to make all data
1336 references satisfy vect_supportable_dr_alignment. If so, update
1337 data structures as needed and return true.
1339 C) If neither peeling nor versioning were successful then return false if
1340 any data reference does not satisfy vect_supportable_dr_alignment.
1342 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1344 Note, Possibility 3 above (which is peeling and versioning together) is not
1345 being done at this time. */
1347 /* (1) Peeling to force alignment. */
1349 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1351 + How many accesses will become aligned due to the peeling
1352 - How many accesses will become unaligned due to the peeling,
1353 and the cost of misaligned accesses.
1354 - The cost of peeling (the extra runtime checks, the increase
1357 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
1359 stmt
= DR_STMT (dr
);
1360 stmt_info
= vinfo_for_stmt (stmt
);
1362 if (!STMT_VINFO_RELEVANT_P (stmt_info
))
1365 /* For interleaving, only the alignment of the first access
1367 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1368 && GROUP_FIRST_ELEMENT (stmt_info
) != stmt
)
1371 /* For invariant accesses there is nothing to enhance. */
1372 if (integer_zerop (DR_STEP (dr
)))
1375 /* Strided loads perform only component accesses, alignment is
1376 irrelevant for them. */
1377 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info
))
1380 supportable_dr_alignment
= vect_supportable_dr_alignment (dr
, true);
1381 do_peeling
= vector_alignment_reachable_p (dr
);
1384 if (known_alignment_for_access_p (dr
))
1386 unsigned int npeel_tmp
;
1387 bool negative
= tree_int_cst_compare (DR_STEP (dr
),
1388 size_zero_node
) < 0;
1390 /* Save info about DR in the hash table. */
1391 if (!LOOP_VINFO_PEELING_HTAB (loop_vinfo
).is_created ())
1392 LOOP_VINFO_PEELING_HTAB (loop_vinfo
).create (1);
1394 vectype
= STMT_VINFO_VECTYPE (stmt_info
);
1395 nelements
= TYPE_VECTOR_SUBPARTS (vectype
);
1396 mis
= DR_MISALIGNMENT (dr
) / GET_MODE_SIZE (TYPE_MODE (
1397 TREE_TYPE (DR_REF (dr
))));
1398 npeel_tmp
= (negative
1399 ? (mis
- nelements
) : (nelements
- mis
))
1402 /* For multiple types, it is possible that the bigger type access
1403 will have more than one peeling option. E.g., a loop with two
1404 types: one of size (vector size / 4), and the other one of
1405 size (vector size / 8). Vectorization factor will 8. If both
1406 access are misaligned by 3, the first one needs one scalar
1407 iteration to be aligned, and the second one needs 5. But the
1408 the first one will be aligned also by peeling 5 scalar
1409 iterations, and in that case both accesses will be aligned.
1410 Hence, except for the immediate peeling amount, we also want
1411 to try to add full vector size, while we don't exceed
1412 vectorization factor.
1413 We do this automtically for cost model, since we calculate cost
1414 for every peeling option. */
1415 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo
)))
1416 possible_npeel_number
= vf
/nelements
;
1418 /* Handle the aligned case. We may decide to align some other
1419 access, making DR unaligned. */
1420 if (DR_MISALIGNMENT (dr
) == 0)
1423 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo
)))
1424 possible_npeel_number
++;
1427 for (j
= 0; j
< possible_npeel_number
; j
++)
1429 gcc_assert (npeel_tmp
<= vf
);
1430 vect_peeling_hash_insert (loop_vinfo
, dr
, npeel_tmp
);
1431 npeel_tmp
+= nelements
;
1434 all_misalignments_unknown
= false;
1435 /* Data-ref that was chosen for the case that all the
1436 misalignments are unknown is not relevant anymore, since we
1437 have a data-ref with known alignment. */
1442 /* If we don't know any misalignment values, we prefer
1443 peeling for data-ref that has the maximum number of data-refs
1444 with the same alignment, unless the target prefers to align
1445 stores over load. */
1446 if (all_misalignments_unknown
)
1448 unsigned same_align_drs
1449 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info
).length ();
1451 || same_align_drs_max
< same_align_drs
)
1453 same_align_drs_max
= same_align_drs
;
1456 /* For data-refs with the same number of related
1457 accesses prefer the one where the misalign
1458 computation will be invariant in the outermost loop. */
1459 else if (same_align_drs_max
== same_align_drs
)
1461 struct loop
*ivloop0
, *ivloop
;
1462 ivloop0
= outermost_invariant_loop_for_expr
1463 (loop
, DR_BASE_ADDRESS (dr0
));
1464 ivloop
= outermost_invariant_loop_for_expr
1465 (loop
, DR_BASE_ADDRESS (dr
));
1466 if ((ivloop
&& !ivloop0
)
1467 || (ivloop
&& ivloop0
1468 && flow_loop_nested_p (ivloop
, ivloop0
)))
1472 if (!first_store
&& DR_IS_WRITE (dr
))
1476 /* If there are both known and unknown misaligned accesses in the
1477 loop, we choose peeling amount according to the known
1479 if (!supportable_dr_alignment
)
1482 if (!first_store
&& DR_IS_WRITE (dr
))
1489 if (!aligned_access_p (dr
))
1491 if (dump_enabled_p ())
1492 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1493 "vector alignment may not be reachable\n");
1499 /* Check if we can possibly peel the loop. */
1500 if (!vect_can_advance_ivs_p (loop_vinfo
)
1501 || !slpeel_can_duplicate_loop_p (loop
, single_exit (loop
)))
1504 if (do_peeling
&& all_misalignments_unknown
1505 && vect_supportable_dr_alignment (dr0
, false))
1508 /* Check if the target requires to prefer stores over loads, i.e., if
1509 misaligned stores are more expensive than misaligned loads (taking
1510 drs with same alignment into account). */
1511 if (first_store
&& DR_IS_READ (dr0
))
1513 unsigned int load_inside_cost
= 0, load_outside_cost
= 0;
1514 unsigned int store_inside_cost
= 0, store_outside_cost
= 0;
1515 unsigned int load_inside_penalty
= 0, load_outside_penalty
= 0;
1516 unsigned int store_inside_penalty
= 0, store_outside_penalty
= 0;
1517 stmt_vector_for_cost dummy
;
1520 vect_get_data_access_cost (dr0
, &load_inside_cost
, &load_outside_cost
,
1522 vect_get_data_access_cost (first_store
, &store_inside_cost
,
1523 &store_outside_cost
, &dummy
);
1527 /* Calculate the penalty for leaving FIRST_STORE unaligned (by
1528 aligning the load DR0). */
1529 load_inside_penalty
= store_inside_cost
;
1530 load_outside_penalty
= store_outside_cost
;
1532 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1533 DR_STMT (first_store
))).iterate (i
, &dr
);
1535 if (DR_IS_READ (dr
))
1537 load_inside_penalty
+= load_inside_cost
;
1538 load_outside_penalty
+= load_outside_cost
;
1542 load_inside_penalty
+= store_inside_cost
;
1543 load_outside_penalty
+= store_outside_cost
;
1546 /* Calculate the penalty for leaving DR0 unaligned (by
1547 aligning the FIRST_STORE). */
1548 store_inside_penalty
= load_inside_cost
;
1549 store_outside_penalty
= load_outside_cost
;
1551 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1552 DR_STMT (dr0
))).iterate (i
, &dr
);
1554 if (DR_IS_READ (dr
))
1556 store_inside_penalty
+= load_inside_cost
;
1557 store_outside_penalty
+= load_outside_cost
;
1561 store_inside_penalty
+= store_inside_cost
;
1562 store_outside_penalty
+= store_outside_cost
;
1565 if (load_inside_penalty
> store_inside_penalty
1566 || (load_inside_penalty
== store_inside_penalty
1567 && load_outside_penalty
> store_outside_penalty
))
1571 /* In case there are only loads with different unknown misalignments, use
1572 peeling only if it may help to align other accesses in the loop. */
1574 && !STMT_VINFO_SAME_ALIGN_REFS (
1575 vinfo_for_stmt (DR_STMT (dr0
))).length ()
1576 && vect_supportable_dr_alignment (dr0
, false)
1577 != dr_unaligned_supported
)
1581 if (do_peeling
&& !dr0
)
1583 /* Peeling is possible, but there is no data access that is not supported
1584 unless aligned. So we try to choose the best possible peeling. */
1586 /* We should get here only if there are drs with known misalignment. */
1587 gcc_assert (!all_misalignments_unknown
);
1589 /* Choose the best peeling from the hash table. */
1590 dr0
= vect_peeling_hash_choose_best_peeling (loop_vinfo
, &npeel
,
1598 stmt
= DR_STMT (dr0
);
1599 stmt_info
= vinfo_for_stmt (stmt
);
1600 vectype
= STMT_VINFO_VECTYPE (stmt_info
);
1601 nelements
= TYPE_VECTOR_SUBPARTS (vectype
);
1603 if (known_alignment_for_access_p (dr0
))
1605 bool negative
= tree_int_cst_compare (DR_STEP (dr0
),
1606 size_zero_node
) < 0;
1609 /* Since it's known at compile time, compute the number of
1610 iterations in the peeled loop (the peeling factor) for use in
1611 updating DR_MISALIGNMENT values. The peeling factor is the
1612 vectorization factor minus the misalignment as an element
1614 mis
= DR_MISALIGNMENT (dr0
);
1615 mis
/= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0
))));
1616 npeel
= ((negative
? mis
- nelements
: nelements
- mis
)
1620 /* For interleaved data access every iteration accesses all the
1621 members of the group, therefore we divide the number of iterations
1622 by the group size. */
1623 stmt_info
= vinfo_for_stmt (DR_STMT (dr0
));
1624 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1625 npeel
/= GROUP_SIZE (stmt_info
);
1627 if (dump_enabled_p ())
1628 dump_printf_loc (MSG_NOTE
, vect_location
,
1629 "Try peeling by %d\n", npeel
);
1632 /* Ensure that all data refs can be vectorized after the peel. */
1633 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
1635 int save_misalignment
;
1640 stmt
= DR_STMT (dr
);
1641 stmt_info
= vinfo_for_stmt (stmt
);
1642 /* For interleaving, only the alignment of the first access
1644 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1645 && GROUP_FIRST_ELEMENT (stmt_info
) != stmt
)
1648 /* Strided loads perform only component accesses, alignment is
1649 irrelevant for them. */
1650 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info
))
1653 save_misalignment
= DR_MISALIGNMENT (dr
);
1654 vect_update_misalignment_for_peel (dr
, dr0
, npeel
);
1655 supportable_dr_alignment
= vect_supportable_dr_alignment (dr
, false);
1656 SET_DR_MISALIGNMENT (dr
, save_misalignment
);
1658 if (!supportable_dr_alignment
)
1665 if (do_peeling
&& known_alignment_for_access_p (dr0
) && npeel
== 0)
1667 stat
= vect_verify_datarefs_alignment (loop_vinfo
, NULL
);
1672 body_cost_vec
.release ();
1679 unsigned max_allowed_peel
1680 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT
);
1681 if (max_allowed_peel
!= (unsigned)-1)
1683 unsigned max_peel
= npeel
;
1686 gimple dr_stmt
= DR_STMT (dr0
);
1687 stmt_vec_info vinfo
= vinfo_for_stmt (dr_stmt
);
1688 tree vtype
= STMT_VINFO_VECTYPE (vinfo
);
1689 max_peel
= TYPE_VECTOR_SUBPARTS (vtype
) - 1;
1691 if (max_peel
> max_allowed_peel
)
1694 if (dump_enabled_p ())
1695 dump_printf_loc (MSG_NOTE
, vect_location
,
1696 "Disable peeling, max peels reached: %d\n", max_peel
);
1703 stmt_info_for_cost
*si
;
1704 void *data
= LOOP_VINFO_TARGET_COST_DATA (loop_vinfo
);
1706 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1707 If the misalignment of DR_i is identical to that of dr0 then set
1708 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
1709 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1710 by the peeling factor times the element size of DR_i (MOD the
1711 vectorization factor times the size). Otherwise, the
1712 misalignment of DR_i must be set to unknown. */
1713 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
1715 vect_update_misalignment_for_peel (dr
, dr0
, npeel
);
1717 LOOP_VINFO_UNALIGNED_DR (loop_vinfo
) = dr0
;
1719 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
) = npeel
;
1721 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
)
1722 = DR_MISALIGNMENT (dr0
);
1723 SET_DR_MISALIGNMENT (dr0
, 0);
1724 if (dump_enabled_p ())
1726 dump_printf_loc (MSG_NOTE
, vect_location
,
1727 "Alignment of access forced using peeling.\n");
1728 dump_printf_loc (MSG_NOTE
, vect_location
,
1729 "Peeling for alignment will be applied.\n");
1731 /* We've delayed passing the inside-loop peeling costs to the
1732 target cost model until we were sure peeling would happen.
1734 if (body_cost_vec
.exists ())
1736 FOR_EACH_VEC_ELT (body_cost_vec
, i
, si
)
1738 struct _stmt_vec_info
*stmt_info
1739 = si
->stmt
? vinfo_for_stmt (si
->stmt
) : NULL
;
1740 (void) add_stmt_cost (data
, si
->count
, si
->kind
, stmt_info
,
1741 si
->misalign
, vect_body
);
1743 body_cost_vec
.release ();
1746 stat
= vect_verify_datarefs_alignment (loop_vinfo
, NULL
);
1752 body_cost_vec
.release ();
1754 /* (2) Versioning to force alignment. */
1756 /* Try versioning if:
1757 1) optimize loop for speed
1758 2) there is at least one unsupported misaligned data ref with an unknown
1760 3) all misaligned data refs with a known misalignment are supported, and
1761 4) the number of runtime alignment checks is within reason. */
1764 optimize_loop_nest_for_speed_p (loop
)
1765 && (!loop
->inner
); /* FORNOW */
1769 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
1771 stmt
= DR_STMT (dr
);
1772 stmt_info
= vinfo_for_stmt (stmt
);
1774 /* For interleaving, only the alignment of the first access
1776 if (aligned_access_p (dr
)
1777 || (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1778 && GROUP_FIRST_ELEMENT (stmt_info
) != stmt
))
1781 /* Strided loads perform only component accesses, alignment is
1782 irrelevant for them. */
1783 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info
))
1786 supportable_dr_alignment
= vect_supportable_dr_alignment (dr
, false);
1788 if (!supportable_dr_alignment
)
1794 if (known_alignment_for_access_p (dr
)
1795 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).length ()
1796 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS
))
1798 do_versioning
= false;
1802 stmt
= DR_STMT (dr
);
1803 vectype
= STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt
));
1804 gcc_assert (vectype
);
1806 /* The rightmost bits of an aligned address must be zeros.
1807 Construct the mask needed for this test. For example,
1808 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
1809 mask must be 15 = 0xf. */
1810 mask
= GET_MODE_SIZE (TYPE_MODE (vectype
)) - 1;
1812 /* FORNOW: use the same mask to test all potentially unaligned
1813 references in the loop. The vectorizer currently supports
1814 a single vector size, see the reference to
1815 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
1816 vectorization factor is computed. */
1817 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo
)
1818 || LOOP_VINFO_PTR_MASK (loop_vinfo
) == mask
);
1819 LOOP_VINFO_PTR_MASK (loop_vinfo
) = mask
;
1820 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).safe_push (
1825 /* Versioning requires at least one misaligned data reference. */
1826 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo
))
1827 do_versioning
= false;
1828 else if (!do_versioning
)
1829 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).truncate (0);
1834 vec
<gimple
> may_misalign_stmts
1835 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
);
1838 /* It can now be assumed that the data references in the statements
1839 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
1840 of the loop being vectorized. */
1841 FOR_EACH_VEC_ELT (may_misalign_stmts
, i
, stmt
)
1843 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
1844 dr
= STMT_VINFO_DATA_REF (stmt_info
);
1845 SET_DR_MISALIGNMENT (dr
, 0);
1846 if (dump_enabled_p ())
1847 dump_printf_loc (MSG_NOTE
, vect_location
,
1848 "Alignment of access forced using versioning.\n");
1851 if (dump_enabled_p ())
1852 dump_printf_loc (MSG_NOTE
, vect_location
,
1853 "Versioning for alignment will be applied.\n");
1855 /* Peeling and versioning can't be done together at this time. */
1856 gcc_assert (! (do_peeling
&& do_versioning
));
1858 stat
= vect_verify_datarefs_alignment (loop_vinfo
, NULL
);
1863 /* This point is reached if neither peeling nor versioning is being done. */
1864 gcc_assert (! (do_peeling
|| do_versioning
));
1866 stat
= vect_verify_datarefs_alignment (loop_vinfo
, NULL
);
1871 /* Function vect_find_same_alignment_drs.
1873 Update group and alignment relations according to the chosen
1874 vectorization factor. */
1877 vect_find_same_alignment_drs (struct data_dependence_relation
*ddr
,
1878 loop_vec_info loop_vinfo
)
1881 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1882 int vectorization_factor
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
1883 struct data_reference
*dra
= DDR_A (ddr
);
1884 struct data_reference
*drb
= DDR_B (ddr
);
1885 stmt_vec_info stmtinfo_a
= vinfo_for_stmt (DR_STMT (dra
));
1886 stmt_vec_info stmtinfo_b
= vinfo_for_stmt (DR_STMT (drb
));
1887 int dra_size
= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra
))));
1888 int drb_size
= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb
))));
1889 lambda_vector dist_v
;
1890 unsigned int loop_depth
;
1892 if (DDR_ARE_DEPENDENT (ddr
) == chrec_known
)
1898 if (DDR_ARE_DEPENDENT (ddr
) == chrec_dont_know
)
1901 /* Loop-based vectorization and known data dependence. */
1902 if (DDR_NUM_DIST_VECTS (ddr
) == 0)
1905 /* Data-dependence analysis reports a distance vector of zero
1906 for data-references that overlap only in the first iteration
1907 but have different sign step (see PR45764).
1908 So as a sanity check require equal DR_STEP. */
1909 if (!operand_equal_p (DR_STEP (dra
), DR_STEP (drb
), 0))
1912 loop_depth
= index_in_loop_nest (loop
->num
, DDR_LOOP_NEST (ddr
));
1913 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr
), i
, dist_v
)
1915 int dist
= dist_v
[loop_depth
];
1917 if (dump_enabled_p ())
1918 dump_printf_loc (MSG_NOTE
, vect_location
,
1919 "dependence distance = %d.\n", dist
);
1921 /* Same loop iteration. */
1923 || (dist
% vectorization_factor
== 0 && dra_size
== drb_size
))
1925 /* Two references with distance zero have the same alignment. */
1926 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a
).safe_push (drb
);
1927 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b
).safe_push (dra
);
1928 if (dump_enabled_p ())
1930 dump_printf_loc (MSG_NOTE
, vect_location
,
1931 "accesses have the same alignment.\n");
1932 dump_printf (MSG_NOTE
,
1933 "dependence distance modulo vf == 0 between ");
1934 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dra
));
1935 dump_printf (MSG_NOTE
, " and ");
1936 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (drb
));
1937 dump_printf (MSG_NOTE
, "\n");
1944 /* Function vect_analyze_data_refs_alignment
1946 Analyze the alignment of the data-references in the loop.
1947 Return FALSE if a data reference is found that cannot be vectorized. */
1950 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo
,
1951 bb_vec_info bb_vinfo
)
1953 if (dump_enabled_p ())
1954 dump_printf_loc (MSG_NOTE
, vect_location
,
1955 "=== vect_analyze_data_refs_alignment ===\n");
1957 /* Mark groups of data references with same alignment using
1958 data dependence information. */
1961 vec
<ddr_p
> ddrs
= LOOP_VINFO_DDRS (loop_vinfo
);
1962 struct data_dependence_relation
*ddr
;
1965 FOR_EACH_VEC_ELT (ddrs
, i
, ddr
)
1966 vect_find_same_alignment_drs (ddr
, loop_vinfo
);
1969 if (!vect_compute_data_refs_alignment (loop_vinfo
, bb_vinfo
))
1971 if (dump_enabled_p ())
1972 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1973 "not vectorized: can't calculate alignment "
1982 /* Analyze groups of accesses: check that DR belongs to a group of
1983 accesses of legal size, step, etc. Detect gaps, single element
1984 interleaving, and other special cases. Set grouped access info.
1985 Collect groups of strided stores for further use in SLP analysis. */
1988 vect_analyze_group_access (struct data_reference
*dr
)
1990 tree step
= DR_STEP (dr
);
1991 tree scalar_type
= TREE_TYPE (DR_REF (dr
));
1992 HOST_WIDE_INT type_size
= TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type
));
1993 gimple stmt
= DR_STMT (dr
);
1994 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
1995 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
1996 bb_vec_info bb_vinfo
= STMT_VINFO_BB_VINFO (stmt_info
);
1997 HOST_WIDE_INT dr_step
= TREE_INT_CST_LOW (step
);
1998 HOST_WIDE_INT groupsize
, last_accessed_element
= 1;
1999 bool slp_impossible
= false;
2000 struct loop
*loop
= NULL
;
2003 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
2005 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2006 size of the interleaving group (including gaps). */
2007 groupsize
= absu_hwi (dr_step
) / type_size
;
2009 /* Not consecutive access is possible only if it is a part of interleaving. */
2010 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)))
2012 /* Check if it this DR is a part of interleaving, and is a single
2013 element of the group that is accessed in the loop. */
2015 /* Gaps are supported only for loads. STEP must be a multiple of the type
2016 size. The size of the group must be a power of 2. */
2018 && (dr_step
% type_size
) == 0
2020 && exact_log2 (groupsize
) != -1)
2022 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) = stmt
;
2023 GROUP_SIZE (vinfo_for_stmt (stmt
)) = groupsize
;
2024 if (dump_enabled_p ())
2026 dump_printf_loc (MSG_NOTE
, vect_location
,
2027 "Detected single element interleaving ");
2028 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dr
));
2029 dump_printf (MSG_NOTE
, " step ");
2030 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, step
);
2031 dump_printf (MSG_NOTE
, "\n");
2036 if (dump_enabled_p ())
2037 dump_printf_loc (MSG_NOTE
, vect_location
,
2038 "Data access with gaps requires scalar "
2042 if (dump_enabled_p ())
2043 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2044 "Peeling for outer loop is not"
2049 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
) = true;
2055 if (dump_enabled_p ())
2057 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2058 "not consecutive access ");
2059 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
2060 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
2065 /* Mark the statement as unvectorizable. */
2066 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr
))) = false;
2073 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) == stmt
)
2075 /* First stmt in the interleaving chain. Check the chain. */
2076 gimple next
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt
));
2077 struct data_reference
*data_ref
= dr
;
2078 unsigned int count
= 1;
2079 tree prev_init
= DR_INIT (data_ref
);
2081 HOST_WIDE_INT diff
, gaps
= 0;
2082 unsigned HOST_WIDE_INT count_in_bytes
;
2086 /* Skip same data-refs. In case that two or more stmts share
2087 data-ref (supported only for loads), we vectorize only the first
2088 stmt, and the rest get their vectorized loads from the first
2090 if (!tree_int_cst_compare (DR_INIT (data_ref
),
2091 DR_INIT (STMT_VINFO_DATA_REF (
2092 vinfo_for_stmt (next
)))))
2094 if (DR_IS_WRITE (data_ref
))
2096 if (dump_enabled_p ())
2097 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2098 "Two store stmts share the same dr.\n");
2102 /* For load use the same data-ref load. */
2103 GROUP_SAME_DR_STMT (vinfo_for_stmt (next
)) = prev
;
2106 next
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (next
));
2111 data_ref
= STMT_VINFO_DATA_REF (vinfo_for_stmt (next
));
2113 /* All group members have the same STEP by construction. */
2114 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref
), step
, 0));
2116 /* Check that the distance between two accesses is equal to the type
2117 size. Otherwise, we have gaps. */
2118 diff
= (TREE_INT_CST_LOW (DR_INIT (data_ref
))
2119 - TREE_INT_CST_LOW (prev_init
)) / type_size
;
2122 /* FORNOW: SLP of accesses with gaps is not supported. */
2123 slp_impossible
= true;
2124 if (DR_IS_WRITE (data_ref
))
2126 if (dump_enabled_p ())
2127 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2128 "interleaved store with gaps\n");
2135 last_accessed_element
+= diff
;
2137 /* Store the gap from the previous member of the group. If there is no
2138 gap in the access, GROUP_GAP is always 1. */
2139 GROUP_GAP (vinfo_for_stmt (next
)) = diff
;
2141 prev_init
= DR_INIT (data_ref
);
2142 next
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (next
));
2143 /* Count the number of data-refs in the chain. */
2147 /* COUNT is the number of accesses found, we multiply it by the size of
2148 the type to get COUNT_IN_BYTES. */
2149 count_in_bytes
= type_size
* count
;
2151 /* Check that the size of the interleaving (including gaps) is not
2152 greater than STEP. */
2154 && absu_hwi (dr_step
) < count_in_bytes
+ gaps
* type_size
)
2156 if (dump_enabled_p ())
2158 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2159 "interleaving size is greater than step for ");
2160 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
2162 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
2167 /* Check that the size of the interleaving is equal to STEP for stores,
2168 i.e., that there are no gaps. */
2170 && absu_hwi (dr_step
) != count_in_bytes
)
2172 if (DR_IS_READ (dr
))
2174 slp_impossible
= true;
2175 /* There is a gap after the last load in the group. This gap is a
2176 difference between the groupsize and the number of elements.
2177 When there is no gap, this difference should be 0. */
2178 GROUP_GAP (vinfo_for_stmt (stmt
)) = groupsize
- count
;
2182 if (dump_enabled_p ())
2183 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2184 "interleaved store with gaps\n");
2189 /* Check that STEP is a multiple of type size. */
2191 && (dr_step
% type_size
) != 0)
2193 if (dump_enabled_p ())
2195 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2196 "step is not a multiple of type size: step ");
2197 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, step
);
2198 dump_printf (MSG_MISSED_OPTIMIZATION
, " size ");
2199 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
2200 TYPE_SIZE_UNIT (scalar_type
));
2201 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
2209 GROUP_SIZE (vinfo_for_stmt (stmt
)) = groupsize
;
2210 if (dump_enabled_p ())
2211 dump_printf_loc (MSG_NOTE
, vect_location
,
2212 "Detected interleaving of size %d\n", (int)groupsize
);
2214 /* SLP: create an SLP data structure for every interleaving group of
2215 stores for further analysis in vect_analyse_slp. */
2216 if (DR_IS_WRITE (dr
) && !slp_impossible
)
2219 LOOP_VINFO_GROUPED_STORES (loop_vinfo
).safe_push (stmt
);
2221 BB_VINFO_GROUPED_STORES (bb_vinfo
).safe_push (stmt
);
2224 /* There is a gap in the end of the group. */
2225 if (groupsize
- last_accessed_element
> 0 && loop_vinfo
)
2227 if (dump_enabled_p ())
2228 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2229 "Data access with gaps requires scalar "
2233 if (dump_enabled_p ())
2234 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2235 "Peeling for outer loop is not supported\n");
2239 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
) = true;
2247 /* Analyze the access pattern of the data-reference DR.
2248 In case of non-consecutive accesses call vect_analyze_group_access() to
2249 analyze groups of accesses. */
2252 vect_analyze_data_ref_access (struct data_reference
*dr
)
2254 tree step
= DR_STEP (dr
);
2255 tree scalar_type
= TREE_TYPE (DR_REF (dr
));
2256 gimple stmt
= DR_STMT (dr
);
2257 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
2258 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
2259 struct loop
*loop
= NULL
;
2262 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
2264 if (loop_vinfo
&& !step
)
2266 if (dump_enabled_p ())
2267 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2268 "bad data-ref access in loop\n");
2272 /* Allow invariant loads in not nested loops. */
2273 if (loop_vinfo
&& integer_zerop (step
))
2275 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) = NULL
;
2276 if (nested_in_vect_loop_p (loop
, stmt
))
2278 if (dump_enabled_p ())
2279 dump_printf_loc (MSG_NOTE
, vect_location
,
2280 "zero step in inner loop of nest\n");
2283 return DR_IS_READ (dr
);
2286 if (loop
&& nested_in_vect_loop_p (loop
, stmt
))
2288 /* Interleaved accesses are not yet supported within outer-loop
2289 vectorization for references in the inner-loop. */
2290 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) = NULL
;
2292 /* For the rest of the analysis we use the outer-loop step. */
2293 step
= STMT_VINFO_DR_STEP (stmt_info
);
2294 if (integer_zerop (step
))
2296 if (dump_enabled_p ())
2297 dump_printf_loc (MSG_NOTE
, vect_location
,
2298 "zero step in outer loop.\n");
2299 if (DR_IS_READ (dr
))
2307 if (TREE_CODE (step
) == INTEGER_CST
)
2309 HOST_WIDE_INT dr_step
= TREE_INT_CST_LOW (step
);
2310 if (!tree_int_cst_compare (step
, TYPE_SIZE_UNIT (scalar_type
))
2312 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type
), -dr_step
)))
2314 /* Mark that it is not interleaving. */
2315 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) = NULL
;
2320 if (loop
&& nested_in_vect_loop_p (loop
, stmt
))
2322 if (dump_enabled_p ())
2323 dump_printf_loc (MSG_NOTE
, vect_location
,
2324 "grouped access in outer loop.\n");
2328 /* Assume this is a DR handled by non-constant strided load case. */
2329 if (TREE_CODE (step
) != INTEGER_CST
)
2330 return STMT_VINFO_STRIDE_LOAD_P (stmt_info
);
2332 /* Not consecutive access - check if it's a part of interleaving group. */
2333 return vect_analyze_group_access (dr
);
2338 /* A helper function used in the comparator function to sort data
2339 references. T1 and T2 are two data references to be compared.
2340 The function returns -1, 0, or 1. */
2343 compare_tree (tree t1
, tree t2
)
2346 enum tree_code code
;
2357 if (TREE_CODE (t1
) != TREE_CODE (t2
))
2358 return TREE_CODE (t1
) < TREE_CODE (t2
) ? -1 : 1;
2360 code
= TREE_CODE (t1
);
2363 /* For const values, we can just use hash values for comparisons. */
2371 hashval_t h1
= iterative_hash_expr (t1
, 0);
2372 hashval_t h2
= iterative_hash_expr (t2
, 0);
2374 return h1
< h2
? -1 : 1;
2379 cmp
= compare_tree (SSA_NAME_VAR (t1
), SSA_NAME_VAR (t2
));
2383 if (SSA_NAME_VERSION (t1
) != SSA_NAME_VERSION (t2
))
2384 return SSA_NAME_VERSION (t1
) < SSA_NAME_VERSION (t2
) ? -1 : 1;
2388 tclass
= TREE_CODE_CLASS (code
);
2390 /* For var-decl, we could compare their UIDs. */
2391 if (tclass
== tcc_declaration
)
2393 if (DECL_UID (t1
) != DECL_UID (t2
))
2394 return DECL_UID (t1
) < DECL_UID (t2
) ? -1 : 1;
2398 /* For expressions with operands, compare their operands recursively. */
2399 for (i
= TREE_OPERAND_LENGTH (t1
) - 1; i
>= 0; --i
)
2401 cmp
= compare_tree (TREE_OPERAND (t1
, i
), TREE_OPERAND (t2
, i
));
2411 /* Compare two data-references DRA and DRB to group them into chunks
2412 suitable for grouping. */
2415 dr_group_sort_cmp (const void *dra_
, const void *drb_
)
2417 data_reference_p dra
= *(data_reference_p
*)const_cast<void *>(dra_
);
2418 data_reference_p drb
= *(data_reference_p
*)const_cast<void *>(drb_
);
2421 /* Stabilize sort. */
2425 /* Ordering of DRs according to base. */
2426 if (!operand_equal_p (DR_BASE_ADDRESS (dra
), DR_BASE_ADDRESS (drb
), 0))
2428 cmp
= compare_tree (DR_BASE_ADDRESS (dra
), DR_BASE_ADDRESS (drb
));
2433 /* And according to DR_OFFSET. */
2434 if (!dr_equal_offsets_p (dra
, drb
))
2436 cmp
= compare_tree (DR_OFFSET (dra
), DR_OFFSET (drb
));
2441 /* Put reads before writes. */
2442 if (DR_IS_READ (dra
) != DR_IS_READ (drb
))
2443 return DR_IS_READ (dra
) ? -1 : 1;
2445 /* Then sort after access size. */
2446 if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra
))),
2447 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb
))), 0))
2449 cmp
= compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra
))),
2450 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb
))));
2455 /* And after step. */
2456 if (!operand_equal_p (DR_STEP (dra
), DR_STEP (drb
), 0))
2458 cmp
= compare_tree (DR_STEP (dra
), DR_STEP (drb
));
2463 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2464 cmp
= tree_int_cst_compare (DR_INIT (dra
), DR_INIT (drb
));
2466 return gimple_uid (DR_STMT (dra
)) < gimple_uid (DR_STMT (drb
)) ? -1 : 1;
2470 /* Function vect_analyze_data_ref_accesses.
2472 Analyze the access pattern of all the data references in the loop.
2474 FORNOW: the only access pattern that is considered vectorizable is a
2475 simple step 1 (consecutive) access.
2477 FORNOW: handle only arrays and pointer accesses. */
2480 vect_analyze_data_ref_accesses (loop_vec_info loop_vinfo
, bb_vec_info bb_vinfo
)
2483 vec
<data_reference_p
> datarefs
;
2484 struct data_reference
*dr
;
2486 if (dump_enabled_p ())
2487 dump_printf_loc (MSG_NOTE
, vect_location
,
2488 "=== vect_analyze_data_ref_accesses ===\n");
2491 datarefs
= LOOP_VINFO_DATAREFS (loop_vinfo
);
2493 datarefs
= BB_VINFO_DATAREFS (bb_vinfo
);
2495 if (datarefs
.is_empty ())
2498 /* Sort the array of datarefs to make building the interleaving chains
2499 linear. Don't modify the original vector's order, it is needed for
2500 determining what dependencies are reversed. */
2501 vec
<data_reference_p
> datarefs_copy
= datarefs
.copy ();
2502 qsort (datarefs_copy
.address (), datarefs_copy
.length (),
2503 sizeof (data_reference_p
), dr_group_sort_cmp
);
2505 /* Build the interleaving chains. */
2506 for (i
= 0; i
< datarefs_copy
.length () - 1;)
2508 data_reference_p dra
= datarefs_copy
[i
];
2509 stmt_vec_info stmtinfo_a
= vinfo_for_stmt (DR_STMT (dra
));
2510 stmt_vec_info lastinfo
= NULL
;
2511 for (i
= i
+ 1; i
< datarefs_copy
.length (); ++i
)
2513 data_reference_p drb
= datarefs_copy
[i
];
2514 stmt_vec_info stmtinfo_b
= vinfo_for_stmt (DR_STMT (drb
));
2516 /* ??? Imperfect sorting (non-compatible types, non-modulo
2517 accesses, same accesses) can lead to a group to be artificially
2518 split here as we don't just skip over those. If it really
2519 matters we can push those to a worklist and re-iterate
2520 over them. The we can just skip ahead to the next DR here. */
2522 /* Check that the data-refs have same first location (except init)
2523 and they are both either store or load (not load and store). */
2524 if (DR_IS_READ (dra
) != DR_IS_READ (drb
)
2525 || !operand_equal_p (DR_BASE_ADDRESS (dra
),
2526 DR_BASE_ADDRESS (drb
), 0)
2527 || !dr_equal_offsets_p (dra
, drb
))
2530 /* Check that the data-refs have the same constant size and step. */
2531 tree sza
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra
)));
2532 tree szb
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb
)));
2533 if (!tree_fits_uhwi_p (sza
)
2534 || !tree_fits_uhwi_p (szb
)
2535 || !tree_int_cst_equal (sza
, szb
)
2536 || !tree_fits_shwi_p (DR_STEP (dra
))
2537 || !tree_fits_shwi_p (DR_STEP (drb
))
2538 || !tree_int_cst_equal (DR_STEP (dra
), DR_STEP (drb
)))
2541 /* Do not place the same access in the interleaving chain twice. */
2542 if (tree_int_cst_compare (DR_INIT (dra
), DR_INIT (drb
)) == 0)
2545 /* Check the types are compatible.
2546 ??? We don't distinguish this during sorting. */
2547 if (!types_compatible_p (TREE_TYPE (DR_REF (dra
)),
2548 TREE_TYPE (DR_REF (drb
))))
2551 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2552 HOST_WIDE_INT init_a
= TREE_INT_CST_LOW (DR_INIT (dra
));
2553 HOST_WIDE_INT init_b
= TREE_INT_CST_LOW (DR_INIT (drb
));
2554 gcc_assert (init_a
< init_b
);
2556 /* If init_b == init_a + the size of the type * k, we have an
2557 interleaving, and DRA is accessed before DRB. */
2558 HOST_WIDE_INT type_size_a
= tree_to_uhwi (sza
);
2559 if ((init_b
- init_a
) % type_size_a
!= 0)
2562 /* The step (if not zero) is greater than the difference between
2563 data-refs' inits. This splits groups into suitable sizes. */
2564 HOST_WIDE_INT step
= tree_to_shwi (DR_STEP (dra
));
2565 if (step
!= 0 && step
<= (init_b
- init_a
))
2568 if (dump_enabled_p ())
2570 dump_printf_loc (MSG_NOTE
, vect_location
,
2571 "Detected interleaving ");
2572 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dra
));
2573 dump_printf (MSG_NOTE
, " and ");
2574 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (drb
));
2575 dump_printf (MSG_NOTE
, "\n");
2578 /* Link the found element into the group list. */
2579 if (!GROUP_FIRST_ELEMENT (stmtinfo_a
))
2581 GROUP_FIRST_ELEMENT (stmtinfo_a
) = DR_STMT (dra
);
2582 lastinfo
= stmtinfo_a
;
2584 GROUP_FIRST_ELEMENT (stmtinfo_b
) = DR_STMT (dra
);
2585 GROUP_NEXT_ELEMENT (lastinfo
) = DR_STMT (drb
);
2586 lastinfo
= stmtinfo_b
;
2590 FOR_EACH_VEC_ELT (datarefs_copy
, i
, dr
)
2591 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr
)))
2592 && !vect_analyze_data_ref_access (dr
))
2594 if (dump_enabled_p ())
2595 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2596 "not vectorized: complicated access pattern.\n");
2600 /* Mark the statement as not vectorizable. */
2601 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr
))) = false;
2606 datarefs_copy
.release ();
2611 datarefs_copy
.release ();
2616 /* Operator == between two dr_with_seg_len objects.
2618 This equality operator is used to make sure two data refs
2619 are the same one so that we will consider to combine the
2620 aliasing checks of those two pairs of data dependent data
2624 operator == (const dr_with_seg_len
& d1
,
2625 const dr_with_seg_len
& d2
)
2627 return operand_equal_p (DR_BASE_ADDRESS (d1
.dr
),
2628 DR_BASE_ADDRESS (d2
.dr
), 0)
2629 && compare_tree (d1
.offset
, d2
.offset
) == 0
2630 && compare_tree (d1
.seg_len
, d2
.seg_len
) == 0;
2633 /* Function comp_dr_with_seg_len_pair.
2635 Comparison function for sorting objects of dr_with_seg_len_pair_t
2636 so that we can combine aliasing checks in one scan. */
2639 comp_dr_with_seg_len_pair (const void *p1_
, const void *p2_
)
2641 const dr_with_seg_len_pair_t
* p1
= (const dr_with_seg_len_pair_t
*) p1_
;
2642 const dr_with_seg_len_pair_t
* p2
= (const dr_with_seg_len_pair_t
*) p2_
;
2644 const dr_with_seg_len
&p11
= p1
->first
,
2649 /* For DR pairs (a, b) and (c, d), we only consider to merge the alias checks
2650 if a and c have the same basic address snd step, and b and d have the same
2651 address and step. Therefore, if any a&c or b&d don't have the same address
2652 and step, we don't care the order of those two pairs after sorting. */
2655 if ((comp_res
= compare_tree (DR_BASE_ADDRESS (p11
.dr
),
2656 DR_BASE_ADDRESS (p21
.dr
))) != 0)
2658 if ((comp_res
= compare_tree (DR_BASE_ADDRESS (p12
.dr
),
2659 DR_BASE_ADDRESS (p22
.dr
))) != 0)
2661 if ((comp_res
= compare_tree (DR_STEP (p11
.dr
), DR_STEP (p21
.dr
))) != 0)
2663 if ((comp_res
= compare_tree (DR_STEP (p12
.dr
), DR_STEP (p22
.dr
))) != 0)
2665 if ((comp_res
= compare_tree (p11
.offset
, p21
.offset
)) != 0)
2667 if ((comp_res
= compare_tree (p12
.offset
, p22
.offset
)) != 0)
2673 template <class T
> static void
2681 /* Function vect_vfa_segment_size.
2683 Create an expression that computes the size of segment
2684 that will be accessed for a data reference. The functions takes into
2685 account that realignment loads may access one more vector.
2688 DR: The data reference.
2689 LENGTH_FACTOR: segment length to consider.
2691 Return an expression whose value is the size of segment which will be
2695 vect_vfa_segment_size (struct data_reference
*dr
, tree length_factor
)
2697 tree segment_length
;
2699 if (integer_zerop (DR_STEP (dr
)))
2700 segment_length
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr
)));
2702 segment_length
= size_binop (MULT_EXPR
,
2703 fold_convert (sizetype
, DR_STEP (dr
)),
2704 fold_convert (sizetype
, length_factor
));
2706 if (vect_supportable_dr_alignment (dr
, false)
2707 == dr_explicit_realign_optimized
)
2709 tree vector_size
= TYPE_SIZE_UNIT
2710 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr
))));
2712 segment_length
= size_binop (PLUS_EXPR
, segment_length
, vector_size
);
2714 return segment_length
;
2717 /* Function vect_prune_runtime_alias_test_list.
2719 Prune a list of ddrs to be tested at run-time by versioning for alias.
2720 Merge several alias checks into one if possible.
2721 Return FALSE if resulting list of ddrs is longer then allowed by
2722 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
2725 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo
)
2727 vec
<ddr_p
> may_alias_ddrs
=
2728 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo
);
2729 vec
<dr_with_seg_len_pair_t
>& comp_alias_ddrs
=
2730 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo
);
2731 int vect_factor
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
2732 tree scalar_loop_iters
= LOOP_VINFO_NITERS (loop_vinfo
);
2738 if (dump_enabled_p ())
2739 dump_printf_loc (MSG_NOTE
, vect_location
,
2740 "=== vect_prune_runtime_alias_test_list ===\n");
2742 if (may_alias_ddrs
.is_empty ())
2745 /* Basically, for each pair of dependent data refs store_ptr_0
2746 and load_ptr_0, we create an expression:
2748 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2749 || (load_ptr_0 + load_segment_length_0) <= store_ptr_0))
2751 for aliasing checks. However, in some cases we can decrease
2752 the number of checks by combining two checks into one. For
2753 example, suppose we have another pair of data refs store_ptr_0
2754 and load_ptr_1, and if the following condition is satisfied:
2756 load_ptr_0 < load_ptr_1 &&
2757 load_ptr_1 - load_ptr_0 - load_segment_length_0 < store_segment_length_0
2759 (this condition means, in each iteration of vectorized loop,
2760 the accessed memory of store_ptr_0 cannot be between the memory
2761 of load_ptr_0 and load_ptr_1.)
2763 we then can use only the following expression to finish the
2764 alising checks between store_ptr_0 & load_ptr_0 and
2765 store_ptr_0 & load_ptr_1:
2767 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2768 || (load_ptr_1 + load_segment_length_1 <= store_ptr_0))
2770 Note that we only consider that load_ptr_0 and load_ptr_1 have the
2771 same basic address. */
2773 comp_alias_ddrs
.create (may_alias_ddrs
.length ());
2775 /* First, we collect all data ref pairs for aliasing checks. */
2776 FOR_EACH_VEC_ELT (may_alias_ddrs
, i
, ddr
)
2778 struct data_reference
*dr_a
, *dr_b
;
2779 gimple dr_group_first_a
, dr_group_first_b
;
2780 tree segment_length_a
, segment_length_b
;
2781 gimple stmt_a
, stmt_b
;
2784 stmt_a
= DR_STMT (DDR_A (ddr
));
2785 dr_group_first_a
= GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a
));
2786 if (dr_group_first_a
)
2788 stmt_a
= dr_group_first_a
;
2789 dr_a
= STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a
));
2793 stmt_b
= DR_STMT (DDR_B (ddr
));
2794 dr_group_first_b
= GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b
));
2795 if (dr_group_first_b
)
2797 stmt_b
= dr_group_first_b
;
2798 dr_b
= STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b
));
2801 if (!operand_equal_p (DR_STEP (dr_a
), DR_STEP (dr_b
), 0))
2802 length_factor
= scalar_loop_iters
;
2804 length_factor
= size_int (vect_factor
);
2805 segment_length_a
= vect_vfa_segment_size (dr_a
, length_factor
);
2806 segment_length_b
= vect_vfa_segment_size (dr_b
, length_factor
);
2808 dr_with_seg_len_pair_t dr_with_seg_len_pair
2809 (dr_with_seg_len (dr_a
, segment_length_a
),
2810 dr_with_seg_len (dr_b
, segment_length_b
));
2812 if (compare_tree (DR_BASE_ADDRESS (dr_a
), DR_BASE_ADDRESS (dr_b
)) > 0)
2813 swap (dr_with_seg_len_pair
.first
, dr_with_seg_len_pair
.second
);
2815 comp_alias_ddrs
.safe_push (dr_with_seg_len_pair
);
2818 /* Second, we sort the collected data ref pairs so that we can scan
2819 them once to combine all possible aliasing checks. */
2820 comp_alias_ddrs
.qsort (comp_dr_with_seg_len_pair
);
2822 /* Third, we scan the sorted dr pairs and check if we can combine
2823 alias checks of two neighbouring dr pairs. */
2824 for (size_t i
= 1; i
< comp_alias_ddrs
.length (); ++i
)
2826 /* Deal with two ddrs (dr_a1, dr_b1) and (dr_a2, dr_b2). */
2827 dr_with_seg_len
*dr_a1
= &comp_alias_ddrs
[i
-1].first
,
2828 *dr_b1
= &comp_alias_ddrs
[i
-1].second
,
2829 *dr_a2
= &comp_alias_ddrs
[i
].first
,
2830 *dr_b2
= &comp_alias_ddrs
[i
].second
;
2832 /* Remove duplicate data ref pairs. */
2833 if (*dr_a1
== *dr_a2
&& *dr_b1
== *dr_b2
)
2835 if (dump_enabled_p ())
2837 dump_printf_loc (MSG_NOTE
, vect_location
,
2838 "found equal ranges ");
2839 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
2840 DR_REF (dr_a1
->dr
));
2841 dump_printf (MSG_NOTE
, ", ");
2842 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
2843 DR_REF (dr_b1
->dr
));
2844 dump_printf (MSG_NOTE
, " and ");
2845 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
2846 DR_REF (dr_a2
->dr
));
2847 dump_printf (MSG_NOTE
, ", ");
2848 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
2849 DR_REF (dr_b2
->dr
));
2850 dump_printf (MSG_NOTE
, "\n");
2853 comp_alias_ddrs
.ordered_remove (i
--);
2857 if (*dr_a1
== *dr_a2
|| *dr_b1
== *dr_b2
)
2859 /* We consider the case that DR_B1 and DR_B2 are same memrefs,
2860 and DR_A1 and DR_A2 are two consecutive memrefs. */
2861 if (*dr_a1
== *dr_a2
)
2863 swap (dr_a1
, dr_b1
);
2864 swap (dr_a2
, dr_b2
);
2867 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a1
->dr
),
2868 DR_BASE_ADDRESS (dr_a2
->dr
),
2870 || !tree_fits_shwi_p (dr_a1
->offset
)
2871 || !tree_fits_shwi_p (dr_a2
->offset
))
2874 HOST_WIDE_INT diff
= (tree_to_shwi (dr_a2
->offset
)
2875 - tree_to_shwi (dr_a1
->offset
));
2878 /* Now we check if the following condition is satisfied:
2880 DIFF - SEGMENT_LENGTH_A < SEGMENT_LENGTH_B
2882 where DIFF = DR_A2->OFFSET - DR_A1->OFFSET. However,
2883 SEGMENT_LENGTH_A or SEGMENT_LENGTH_B may not be constant so we
2884 have to make a best estimation. We can get the minimum value
2885 of SEGMENT_LENGTH_B as a constant, represented by MIN_SEG_LEN_B,
2886 then either of the following two conditions can guarantee the
2889 1: DIFF <= MIN_SEG_LEN_B
2890 2: DIFF - SEGMENT_LENGTH_A < MIN_SEG_LEN_B
2895 min_seg_len_b
= (TREE_CODE (dr_b1
->seg_len
) == INTEGER_CST
) ?
2896 TREE_INT_CST_LOW (dr_b1
->seg_len
) :
2899 if (diff
<= min_seg_len_b
2900 || (TREE_CODE (dr_a1
->seg_len
) == INTEGER_CST
2901 && diff
- (HOST_WIDE_INT
) TREE_INT_CST_LOW (dr_a1
->seg_len
) <
2904 if (dump_enabled_p ())
2906 dump_printf_loc (MSG_NOTE
, vect_location
,
2907 "merging ranges for ");
2908 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
2909 DR_REF (dr_a1
->dr
));
2910 dump_printf (MSG_NOTE
, ", ");
2911 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
2912 DR_REF (dr_b1
->dr
));
2913 dump_printf (MSG_NOTE
, " and ");
2914 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
2915 DR_REF (dr_a2
->dr
));
2916 dump_printf (MSG_NOTE
, ", ");
2917 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
2918 DR_REF (dr_b2
->dr
));
2919 dump_printf (MSG_NOTE
, "\n");
2922 dr_a1
->seg_len
= size_binop (PLUS_EXPR
,
2923 dr_a2
->seg_len
, size_int (diff
));
2924 comp_alias_ddrs
.ordered_remove (i
--);
2929 dump_printf_loc (MSG_NOTE
, vect_location
,
2930 "improved number of alias checks from %d to %d\n",
2931 may_alias_ddrs
.length (), comp_alias_ddrs
.length ());
2932 if ((int) comp_alias_ddrs
.length () >
2933 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS
))
2939 /* Check whether a non-affine read in stmt is suitable for gather load
2940 and if so, return a builtin decl for that operation. */
2943 vect_check_gather (gimple stmt
, loop_vec_info loop_vinfo
, tree
*basep
,
2944 tree
*offp
, int *scalep
)
2946 HOST_WIDE_INT scale
= 1, pbitpos
, pbitsize
;
2947 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
2948 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
2949 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
2950 tree offtype
= NULL_TREE
;
2951 tree decl
, base
, off
;
2952 enum machine_mode pmode
;
2953 int punsignedp
, pvolatilep
;
2956 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
2957 see if we can use the def stmt of the address. */
2958 if (is_gimple_call (stmt
)
2959 && gimple_call_internal_p (stmt
)
2960 && (gimple_call_internal_fn (stmt
) == IFN_MASK_LOAD
2961 || gimple_call_internal_fn (stmt
) == IFN_MASK_STORE
)
2962 && TREE_CODE (base
) == MEM_REF
2963 && TREE_CODE (TREE_OPERAND (base
, 0)) == SSA_NAME
2964 && integer_zerop (TREE_OPERAND (base
, 1))
2965 && !expr_invariant_in_loop_p (loop
, TREE_OPERAND (base
, 0)))
2967 gimple def_stmt
= SSA_NAME_DEF_STMT (TREE_OPERAND (base
, 0));
2968 if (is_gimple_assign (def_stmt
)
2969 && gimple_assign_rhs_code (def_stmt
) == ADDR_EXPR
)
2970 base
= TREE_OPERAND (gimple_assign_rhs1 (def_stmt
), 0);
2973 /* The gather builtins need address of the form
2974 loop_invariant + vector * {1, 2, 4, 8}
2976 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
2977 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
2978 of loop invariants/SSA_NAMEs defined in the loop, with casts,
2979 multiplications and additions in it. To get a vector, we need
2980 a single SSA_NAME that will be defined in the loop and will
2981 contain everything that is not loop invariant and that can be
2982 vectorized. The following code attempts to find such a preexistng
2983 SSA_NAME OFF and put the loop invariants into a tree BASE
2984 that can be gimplified before the loop. */
2985 base
= get_inner_reference (base
, &pbitsize
, &pbitpos
, &off
,
2986 &pmode
, &punsignedp
, &pvolatilep
, false);
2987 gcc_assert (base
!= NULL_TREE
&& (pbitpos
% BITS_PER_UNIT
) == 0);
2989 if (TREE_CODE (base
) == MEM_REF
)
2991 if (!integer_zerop (TREE_OPERAND (base
, 1)))
2993 if (off
== NULL_TREE
)
2995 double_int moff
= mem_ref_offset (base
);
2996 off
= double_int_to_tree (sizetype
, moff
);
2999 off
= size_binop (PLUS_EXPR
, off
,
3000 fold_convert (sizetype
, TREE_OPERAND (base
, 1)));
3002 base
= TREE_OPERAND (base
, 0);
3005 base
= build_fold_addr_expr (base
);
3007 if (off
== NULL_TREE
)
3008 off
= size_zero_node
;
3010 /* If base is not loop invariant, either off is 0, then we start with just
3011 the constant offset in the loop invariant BASE and continue with base
3012 as OFF, otherwise give up.
3013 We could handle that case by gimplifying the addition of base + off
3014 into some SSA_NAME and use that as off, but for now punt. */
3015 if (!expr_invariant_in_loop_p (loop
, base
))
3017 if (!integer_zerop (off
))
3020 base
= size_int (pbitpos
/ BITS_PER_UNIT
);
3022 /* Otherwise put base + constant offset into the loop invariant BASE
3023 and continue with OFF. */
3026 base
= fold_convert (sizetype
, base
);
3027 base
= size_binop (PLUS_EXPR
, base
, size_int (pbitpos
/ BITS_PER_UNIT
));
3030 /* OFF at this point may be either a SSA_NAME or some tree expression
3031 from get_inner_reference. Try to peel off loop invariants from it
3032 into BASE as long as possible. */
3034 while (offtype
== NULL_TREE
)
3036 enum tree_code code
;
3037 tree op0
, op1
, add
= NULL_TREE
;
3039 if (TREE_CODE (off
) == SSA_NAME
)
3041 gimple def_stmt
= SSA_NAME_DEF_STMT (off
);
3043 if (expr_invariant_in_loop_p (loop
, off
))
3046 if (gimple_code (def_stmt
) != GIMPLE_ASSIGN
)
3049 op0
= gimple_assign_rhs1 (def_stmt
);
3050 code
= gimple_assign_rhs_code (def_stmt
);
3051 op1
= gimple_assign_rhs2 (def_stmt
);
3055 if (get_gimple_rhs_class (TREE_CODE (off
)) == GIMPLE_TERNARY_RHS
)
3057 code
= TREE_CODE (off
);
3058 extract_ops_from_tree (off
, &code
, &op0
, &op1
);
3062 case POINTER_PLUS_EXPR
:
3064 if (expr_invariant_in_loop_p (loop
, op0
))
3069 add
= fold_convert (sizetype
, add
);
3071 add
= size_binop (MULT_EXPR
, add
, size_int (scale
));
3072 base
= size_binop (PLUS_EXPR
, base
, add
);
3075 if (expr_invariant_in_loop_p (loop
, op1
))
3083 if (expr_invariant_in_loop_p (loop
, op1
))
3085 add
= fold_convert (sizetype
, op1
);
3086 add
= size_binop (MINUS_EXPR
, size_zero_node
, add
);
3092 if (scale
== 1 && tree_fits_shwi_p (op1
))
3094 scale
= tree_to_shwi (op1
);
3103 if (!POINTER_TYPE_P (TREE_TYPE (op0
))
3104 && !INTEGRAL_TYPE_P (TREE_TYPE (op0
)))
3106 if (TYPE_PRECISION (TREE_TYPE (op0
))
3107 == TYPE_PRECISION (TREE_TYPE (off
)))
3112 if (TYPE_PRECISION (TREE_TYPE (op0
))
3113 < TYPE_PRECISION (TREE_TYPE (off
)))
3116 offtype
= TREE_TYPE (off
);
3127 /* If at the end OFF still isn't a SSA_NAME or isn't
3128 defined in the loop, punt. */
3129 if (TREE_CODE (off
) != SSA_NAME
3130 || expr_invariant_in_loop_p (loop
, off
))
3133 if (offtype
== NULL_TREE
)
3134 offtype
= TREE_TYPE (off
);
3136 decl
= targetm
.vectorize
.builtin_gather (STMT_VINFO_VECTYPE (stmt_info
),
3138 if (decl
== NULL_TREE
)
3150 /* Function vect_analyze_data_refs.
3152 Find all the data references in the loop or basic block.
3154 The general structure of the analysis of data refs in the vectorizer is as
3156 1- vect_analyze_data_refs(loop/bb): call
3157 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3158 in the loop/bb and their dependences.
3159 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3160 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3161 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3166 vect_analyze_data_refs (loop_vec_info loop_vinfo
,
3167 bb_vec_info bb_vinfo
,
3170 struct loop
*loop
= NULL
;
3171 basic_block bb
= NULL
;
3173 vec
<data_reference_p
> datarefs
;
3174 struct data_reference
*dr
;
3177 if (dump_enabled_p ())
3178 dump_printf_loc (MSG_NOTE
, vect_location
,
3179 "=== vect_analyze_data_refs ===\n");
3183 basic_block
*bbs
= LOOP_VINFO_BBS (loop_vinfo
);
3185 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
3186 datarefs
= LOOP_VINFO_DATAREFS (loop_vinfo
);
3187 if (!find_loop_nest (loop
, &LOOP_VINFO_LOOP_NEST (loop_vinfo
)))
3189 if (dump_enabled_p ())
3190 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3191 "not vectorized: loop contains function calls"
3192 " or data references that cannot be analyzed\n");
3196 for (i
= 0; i
< loop
->num_nodes
; i
++)
3198 gimple_stmt_iterator gsi
;
3200 for (gsi
= gsi_start_bb (bbs
[i
]); !gsi_end_p (gsi
); gsi_next (&gsi
))
3202 gimple stmt
= gsi_stmt (gsi
);
3203 if (!find_data_references_in_stmt (loop
, stmt
, &datarefs
))
3205 if (is_gimple_call (stmt
) && loop
->safelen
)
3207 tree fndecl
= gimple_call_fndecl (stmt
), op
;
3208 if (fndecl
!= NULL_TREE
)
3210 struct cgraph_node
*node
= cgraph_get_node (fndecl
);
3211 if (node
!= NULL
&& node
->simd_clones
!= NULL
)
3213 unsigned int j
, n
= gimple_call_num_args (stmt
);
3214 for (j
= 0; j
< n
; j
++)
3216 op
= gimple_call_arg (stmt
, j
);
3218 || (REFERENCE_CLASS_P (op
)
3219 && get_base_address (op
)))
3222 op
= gimple_call_lhs (stmt
);
3223 /* Ignore #pragma omp declare simd functions
3224 if they don't have data references in the
3225 call stmt itself. */
3229 || (REFERENCE_CLASS_P (op
)
3230 && get_base_address (op
)))))
3235 LOOP_VINFO_DATAREFS (loop_vinfo
) = datarefs
;
3236 if (dump_enabled_p ())
3237 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3238 "not vectorized: loop contains function "
3239 "calls or data references that cannot "
3246 LOOP_VINFO_DATAREFS (loop_vinfo
) = datarefs
;
3250 gimple_stmt_iterator gsi
;
3252 bb
= BB_VINFO_BB (bb_vinfo
);
3253 for (gsi
= gsi_start_bb (bb
); !gsi_end_p (gsi
); gsi_next (&gsi
))
3255 gimple stmt
= gsi_stmt (gsi
);
3256 if (!find_data_references_in_stmt (NULL
, stmt
,
3257 &BB_VINFO_DATAREFS (bb_vinfo
)))
3259 /* Mark the rest of the basic-block as unvectorizable. */
3260 for (; !gsi_end_p (gsi
); gsi_next (&gsi
))
3262 stmt
= gsi_stmt (gsi
);
3263 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt
)) = false;
3269 datarefs
= BB_VINFO_DATAREFS (bb_vinfo
);
3272 /* Go through the data-refs, check that the analysis succeeded. Update
3273 pointer from stmt_vec_info struct to DR and vectype. */
3275 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
3278 stmt_vec_info stmt_info
;
3279 tree base
, offset
, init
;
3280 bool gather
= false;
3281 bool simd_lane_access
= false;
3285 if (!dr
|| !DR_REF (dr
))
3287 if (dump_enabled_p ())
3288 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3289 "not vectorized: unhandled data-ref\n");
3293 stmt
= DR_STMT (dr
);
3294 stmt_info
= vinfo_for_stmt (stmt
);
3296 /* Discard clobbers from the dataref vector. We will remove
3297 clobber stmts during vectorization. */
3298 if (gimple_clobber_p (stmt
))
3301 if (i
== datarefs
.length () - 1)
3306 datarefs
.ordered_remove (i
);
3311 /* Check that analysis of the data-ref succeeded. */
3312 if (!DR_BASE_ADDRESS (dr
) || !DR_OFFSET (dr
) || !DR_INIT (dr
)
3317 && !TREE_THIS_VOLATILE (DR_REF (dr
))
3318 && targetm
.vectorize
.builtin_gather
!= NULL
;
3319 bool maybe_simd_lane_access
3320 = loop_vinfo
&& loop
->simduid
;
3322 /* If target supports vector gather loads, or if this might be
3323 a SIMD lane access, see if they can't be used. */
3325 && (maybe_gather
|| maybe_simd_lane_access
)
3326 && !nested_in_vect_loop_p (loop
, stmt
))
3328 struct data_reference
*newdr
3329 = create_data_ref (NULL
, loop_containing_stmt (stmt
),
3330 DR_REF (dr
), stmt
, true);
3331 gcc_assert (newdr
!= NULL
&& DR_REF (newdr
));
3332 if (DR_BASE_ADDRESS (newdr
)
3333 && DR_OFFSET (newdr
)
3336 && integer_zerop (DR_STEP (newdr
)))
3338 if (maybe_simd_lane_access
)
3340 tree off
= DR_OFFSET (newdr
);
3342 if (TREE_CODE (DR_INIT (newdr
)) == INTEGER_CST
3343 && TREE_CODE (off
) == MULT_EXPR
3344 && tree_fits_uhwi_p (TREE_OPERAND (off
, 1)))
3346 tree step
= TREE_OPERAND (off
, 1);
3347 off
= TREE_OPERAND (off
, 0);
3349 if (CONVERT_EXPR_P (off
)
3350 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off
,
3352 < TYPE_PRECISION (TREE_TYPE (off
)))
3353 off
= TREE_OPERAND (off
, 0);
3354 if (TREE_CODE (off
) == SSA_NAME
)
3356 gimple def
= SSA_NAME_DEF_STMT (off
);
3357 tree reft
= TREE_TYPE (DR_REF (newdr
));
3358 if (is_gimple_call (def
)
3359 && gimple_call_internal_p (def
)
3360 && (gimple_call_internal_fn (def
)
3361 == IFN_GOMP_SIMD_LANE
))
3363 tree arg
= gimple_call_arg (def
, 0);
3364 gcc_assert (TREE_CODE (arg
) == SSA_NAME
);
3365 arg
= SSA_NAME_VAR (arg
);
3366 if (arg
== loop
->simduid
3368 && tree_int_cst_equal
3369 (TYPE_SIZE_UNIT (reft
),
3372 DR_OFFSET (newdr
) = ssize_int (0);
3373 DR_STEP (newdr
) = step
;
3374 DR_ALIGNED_TO (newdr
)
3375 = size_int (BIGGEST_ALIGNMENT
);
3377 simd_lane_access
= true;
3383 if (!simd_lane_access
&& maybe_gather
)
3389 if (!gather
&& !simd_lane_access
)
3390 free_data_ref (newdr
);
3393 if (!gather
&& !simd_lane_access
)
3395 if (dump_enabled_p ())
3397 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3398 "not vectorized: data ref analysis "
3400 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3401 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
3411 if (TREE_CODE (DR_BASE_ADDRESS (dr
)) == INTEGER_CST
)
3413 if (dump_enabled_p ())
3414 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3415 "not vectorized: base addr of dr is a "
3421 if (gather
|| simd_lane_access
)
3426 if (TREE_THIS_VOLATILE (DR_REF (dr
)))
3428 if (dump_enabled_p ())
3430 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3431 "not vectorized: volatile type ");
3432 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3433 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
3442 if (stmt_can_throw_internal (stmt
))
3444 if (dump_enabled_p ())
3446 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3447 "not vectorized: statement can throw an "
3449 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3450 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
3456 if (gather
|| simd_lane_access
)
3461 if (TREE_CODE (DR_REF (dr
)) == COMPONENT_REF
3462 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr
), 1)))
3464 if (dump_enabled_p ())
3466 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3467 "not vectorized: statement is bitfield "
3469 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3470 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
3476 if (gather
|| simd_lane_access
)
3481 base
= unshare_expr (DR_BASE_ADDRESS (dr
));
3482 offset
= unshare_expr (DR_OFFSET (dr
));
3483 init
= unshare_expr (DR_INIT (dr
));
3485 if (is_gimple_call (stmt
)
3486 && (!gimple_call_internal_p (stmt
)
3487 || (gimple_call_internal_fn (stmt
) != IFN_MASK_LOAD
3488 && gimple_call_internal_fn (stmt
) != IFN_MASK_STORE
)))
3490 if (dump_enabled_p ())
3492 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3493 "not vectorized: dr in a call ");
3494 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3495 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
3501 if (gather
|| simd_lane_access
)
3506 /* Update DR field in stmt_vec_info struct. */
3508 /* If the dataref is in an inner-loop of the loop that is considered
3509 for vectorization, we also want to analyze the access relative to
3510 the outer-loop (DR contains information only relative to the
3511 inner-most enclosing loop). We do that by building a reference to the
3512 first location accessed by the inner-loop, and analyze it relative to
3514 if (loop
&& nested_in_vect_loop_p (loop
, stmt
))
3516 tree outer_step
, outer_base
, outer_init
;
3517 HOST_WIDE_INT pbitsize
, pbitpos
;
3519 enum machine_mode pmode
;
3520 int punsignedp
, pvolatilep
;
3521 affine_iv base_iv
, offset_iv
;
3524 /* Build a reference to the first location accessed by the
3525 inner-loop: *(BASE+INIT). (The first location is actually
3526 BASE+INIT+OFFSET, but we add OFFSET separately later). */
3527 tree inner_base
= build_fold_indirect_ref
3528 (fold_build_pointer_plus (base
, init
));
3530 if (dump_enabled_p ())
3532 dump_printf_loc (MSG_NOTE
, vect_location
,
3533 "analyze in outer-loop: ");
3534 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, inner_base
);
3535 dump_printf (MSG_NOTE
, "\n");
3538 outer_base
= get_inner_reference (inner_base
, &pbitsize
, &pbitpos
,
3539 &poffset
, &pmode
, &punsignedp
, &pvolatilep
, false);
3540 gcc_assert (outer_base
!= NULL_TREE
);
3542 if (pbitpos
% BITS_PER_UNIT
!= 0)
3544 if (dump_enabled_p ())
3545 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3546 "failed: bit offset alignment.\n");
3550 outer_base
= build_fold_addr_expr (outer_base
);
3551 if (!simple_iv (loop
, loop_containing_stmt (stmt
), outer_base
,
3554 if (dump_enabled_p ())
3555 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3556 "failed: evolution of base is not affine.\n");
3563 poffset
= fold_build2 (PLUS_EXPR
, TREE_TYPE (offset
), offset
,
3571 offset_iv
.base
= ssize_int (0);
3572 offset_iv
.step
= ssize_int (0);
3574 else if (!simple_iv (loop
, loop_containing_stmt (stmt
), poffset
,
3577 if (dump_enabled_p ())
3578 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3579 "evolution of offset is not affine.\n");
3583 outer_init
= ssize_int (pbitpos
/ BITS_PER_UNIT
);
3584 split_constant_offset (base_iv
.base
, &base_iv
.base
, &dinit
);
3585 outer_init
= size_binop (PLUS_EXPR
, outer_init
, dinit
);
3586 split_constant_offset (offset_iv
.base
, &offset_iv
.base
, &dinit
);
3587 outer_init
= size_binop (PLUS_EXPR
, outer_init
, dinit
);
3589 outer_step
= size_binop (PLUS_EXPR
,
3590 fold_convert (ssizetype
, base_iv
.step
),
3591 fold_convert (ssizetype
, offset_iv
.step
));
3593 STMT_VINFO_DR_STEP (stmt_info
) = outer_step
;
3594 /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
3595 STMT_VINFO_DR_BASE_ADDRESS (stmt_info
) = base_iv
.base
;
3596 STMT_VINFO_DR_INIT (stmt_info
) = outer_init
;
3597 STMT_VINFO_DR_OFFSET (stmt_info
) =
3598 fold_convert (ssizetype
, offset_iv
.base
);
3599 STMT_VINFO_DR_ALIGNED_TO (stmt_info
) =
3600 size_int (highest_pow2_factor (offset_iv
.base
));
3602 if (dump_enabled_p ())
3604 dump_printf_loc (MSG_NOTE
, vect_location
,
3605 "\touter base_address: ");
3606 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3607 STMT_VINFO_DR_BASE_ADDRESS (stmt_info
));
3608 dump_printf (MSG_NOTE
, "\n\touter offset from base address: ");
3609 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3610 STMT_VINFO_DR_OFFSET (stmt_info
));
3611 dump_printf (MSG_NOTE
,
3612 "\n\touter constant offset from base address: ");
3613 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3614 STMT_VINFO_DR_INIT (stmt_info
));
3615 dump_printf (MSG_NOTE
, "\n\touter step: ");
3616 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3617 STMT_VINFO_DR_STEP (stmt_info
));
3618 dump_printf (MSG_NOTE
, "\n\touter aligned to: ");
3619 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3620 STMT_VINFO_DR_ALIGNED_TO (stmt_info
));
3621 dump_printf (MSG_NOTE
, "\n");
3625 if (STMT_VINFO_DATA_REF (stmt_info
))
3627 if (dump_enabled_p ())
3629 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3630 "not vectorized: more than one data ref "
3632 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3633 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
3639 if (gather
|| simd_lane_access
)
3644 STMT_VINFO_DATA_REF (stmt_info
) = dr
;
3645 if (simd_lane_access
)
3647 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info
) = true;
3648 free_data_ref (datarefs
[i
]);
3652 /* Set vectype for STMT. */
3653 scalar_type
= TREE_TYPE (DR_REF (dr
));
3654 STMT_VINFO_VECTYPE (stmt_info
)
3655 = get_vectype_for_scalar_type (scalar_type
);
3656 if (!STMT_VINFO_VECTYPE (stmt_info
))
3658 if (dump_enabled_p ())
3660 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3661 "not vectorized: no vectype for stmt: ");
3662 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3663 dump_printf (MSG_MISSED_OPTIMIZATION
, " scalar_type: ");
3664 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_DETAILS
,
3666 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
3672 if (gather
|| simd_lane_access
)
3674 STMT_VINFO_DATA_REF (stmt_info
) = NULL
;
3682 if (dump_enabled_p ())
3684 dump_printf_loc (MSG_NOTE
, vect_location
,
3685 "got vectype for stmt: ");
3686 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, stmt
, 0);
3687 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3688 STMT_VINFO_VECTYPE (stmt_info
));
3689 dump_printf (MSG_NOTE
, "\n");
3693 /* Adjust the minimal vectorization factor according to the
3695 vf
= TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info
));
3703 gather
= 0 != vect_check_gather (stmt
, loop_vinfo
, NULL
, &off
, NULL
);
3705 && get_vectype_for_scalar_type (TREE_TYPE (off
)) == NULL_TREE
)
3709 STMT_VINFO_DATA_REF (stmt_info
) = NULL
;
3711 if (dump_enabled_p ())
3713 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3714 "not vectorized: not suitable for gather "
3716 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3717 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
3723 STMT_VINFO_GATHER_P (stmt_info
) = true;
3726 && TREE_CODE (DR_STEP (dr
)) != INTEGER_CST
)
3728 if (nested_in_vect_loop_p (loop
, stmt
)
3729 || !DR_IS_READ (dr
))
3731 if (dump_enabled_p ())
3733 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3734 "not vectorized: not suitable for strided "
3736 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3737 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
3741 STMT_VINFO_STRIDE_LOAD_P (stmt_info
) = true;
3745 /* If we stopped analysis at the first dataref we could not analyze
3746 when trying to vectorize a basic-block mark the rest of the datarefs
3747 as not vectorizable and truncate the vector of datarefs. That
3748 avoids spending useless time in analyzing their dependence. */
3749 if (i
!= datarefs
.length ())
3751 gcc_assert (bb_vinfo
!= NULL
);
3752 for (unsigned j
= i
; j
< datarefs
.length (); ++j
)
3754 data_reference_p dr
= datarefs
[j
];
3755 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr
))) = false;
3758 datarefs
.truncate (i
);
3765 /* Function vect_get_new_vect_var.
3767 Returns a name for a new variable. The current naming scheme appends the
3768 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
3769 the name of vectorizer generated variables, and appends that to NAME if
3773 vect_get_new_vect_var (tree type
, enum vect_var_kind var_kind
, const char *name
)
3780 case vect_simple_var
:
3783 case vect_scalar_var
:
3786 case vect_pointer_var
:
3795 char* tmp
= concat (prefix
, "_", name
, NULL
);
3796 new_vect_var
= create_tmp_reg (type
, tmp
);
3800 new_vect_var
= create_tmp_reg (type
, prefix
);
3802 return new_vect_var
;
3806 /* Function vect_create_addr_base_for_vector_ref.
3808 Create an expression that computes the address of the first memory location
3809 that will be accessed for a data reference.
3812 STMT: The statement containing the data reference.
3813 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
3814 OFFSET: Optional. If supplied, it is be added to the initial address.
3815 LOOP: Specify relative to which loop-nest should the address be computed.
3816 For example, when the dataref is in an inner-loop nested in an
3817 outer-loop that is now being vectorized, LOOP can be either the
3818 outer-loop, or the inner-loop. The first memory location accessed
3819 by the following dataref ('in' points to short):
3826 if LOOP=i_loop: &in (relative to i_loop)
3827 if LOOP=j_loop: &in+i*2B (relative to j_loop)
3830 1. Return an SSA_NAME whose value is the address of the memory location of
3831 the first vector of the data reference.
3832 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
3833 these statement(s) which define the returned SSA_NAME.
3835 FORNOW: We are only handling array accesses with step 1. */
3838 vect_create_addr_base_for_vector_ref (gimple stmt
,
3839 gimple_seq
*new_stmt_list
,
3843 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
3844 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
3846 const char *base_name
;
3849 gimple_seq seq
= NULL
;
3853 tree step
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr
)));
3854 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
3856 if (loop_vinfo
&& loop
&& loop
!= (gimple_bb (stmt
))->loop_father
)
3858 struct loop
*outer_loop
= LOOP_VINFO_LOOP (loop_vinfo
);
3860 gcc_assert (nested_in_vect_loop_p (outer_loop
, stmt
));
3862 data_ref_base
= unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info
));
3863 base_offset
= unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info
));
3864 init
= unshare_expr (STMT_VINFO_DR_INIT (stmt_info
));
3868 data_ref_base
= unshare_expr (DR_BASE_ADDRESS (dr
));
3869 base_offset
= unshare_expr (DR_OFFSET (dr
));
3870 init
= unshare_expr (DR_INIT (dr
));
3874 base_name
= get_name (data_ref_base
);
3877 base_offset
= ssize_int (0);
3878 init
= ssize_int (0);
3879 base_name
= get_name (DR_REF (dr
));
3882 /* Create base_offset */
3883 base_offset
= size_binop (PLUS_EXPR
,
3884 fold_convert (sizetype
, base_offset
),
3885 fold_convert (sizetype
, init
));
3889 offset
= fold_build2 (MULT_EXPR
, sizetype
,
3890 fold_convert (sizetype
, offset
), step
);
3891 base_offset
= fold_build2 (PLUS_EXPR
, sizetype
,
3892 base_offset
, offset
);
3895 /* base + base_offset */
3897 addr_base
= fold_build_pointer_plus (data_ref_base
, base_offset
);
3900 addr_base
= build1 (ADDR_EXPR
,
3901 build_pointer_type (TREE_TYPE (DR_REF (dr
))),
3902 unshare_expr (DR_REF (dr
)));
3905 vect_ptr_type
= build_pointer_type (STMT_VINFO_VECTYPE (stmt_info
));
3906 addr_base
= fold_convert (vect_ptr_type
, addr_base
);
3907 dest
= vect_get_new_vect_var (vect_ptr_type
, vect_pointer_var
, base_name
);
3908 addr_base
= force_gimple_operand (addr_base
, &seq
, false, dest
);
3909 gimple_seq_add_seq (new_stmt_list
, seq
);
3911 if (DR_PTR_INFO (dr
)
3912 && TREE_CODE (addr_base
) == SSA_NAME
)
3914 duplicate_ssa_name_ptr_info (addr_base
, DR_PTR_INFO (dr
));
3916 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base
));
3919 if (dump_enabled_p ())
3921 dump_printf_loc (MSG_NOTE
, vect_location
, "created ");
3922 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, addr_base
);
3923 dump_printf (MSG_NOTE
, "\n");
3930 /* Function vect_create_data_ref_ptr.
3932 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
3933 location accessed in the loop by STMT, along with the def-use update
3934 chain to appropriately advance the pointer through the loop iterations.
3935 Also set aliasing information for the pointer. This pointer is used by
3936 the callers to this function to create a memory reference expression for
3937 vector load/store access.
3940 1. STMT: a stmt that references memory. Expected to be of the form
3941 GIMPLE_ASSIGN <name, data-ref> or
3942 GIMPLE_ASSIGN <data-ref, name>.
3943 2. AGGR_TYPE: the type of the reference, which should be either a vector
3945 3. AT_LOOP: the loop where the vector memref is to be created.
3946 4. OFFSET (optional): an offset to be added to the initial address accessed
3947 by the data-ref in STMT.
3948 5. BSI: location where the new stmts are to be placed if there is no loop
3949 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
3950 pointing to the initial address.
3953 1. Declare a new ptr to vector_type, and have it point to the base of the
3954 data reference (initial addressed accessed by the data reference).
3955 For example, for vector of type V8HI, the following code is generated:
3958 ap = (v8hi *)initial_address;
3960 if OFFSET is not supplied:
3961 initial_address = &a[init];
3962 if OFFSET is supplied:
3963 initial_address = &a[init + OFFSET];
3965 Return the initial_address in INITIAL_ADDRESS.
3967 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
3968 update the pointer in each iteration of the loop.
3970 Return the increment stmt that updates the pointer in PTR_INCR.
3972 3. Set INV_P to true if the access pattern of the data reference in the
3973 vectorized loop is invariant. Set it to false otherwise.
3975 4. Return the pointer. */
3978 vect_create_data_ref_ptr (gimple stmt
, tree aggr_type
, struct loop
*at_loop
,
3979 tree offset
, tree
*initial_address
,
3980 gimple_stmt_iterator
*gsi
, gimple
*ptr_incr
,
3981 bool only_init
, bool *inv_p
)
3983 const char *base_name
;
3984 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
3985 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
3986 struct loop
*loop
= NULL
;
3987 bool nested_in_vect_loop
= false;
3988 struct loop
*containing_loop
= NULL
;
3993 gimple_seq new_stmt_list
= NULL
;
3997 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
3999 gimple_stmt_iterator incr_gsi
;
4001 tree indx_before_incr
, indx_after_incr
;
4004 bb_vec_info bb_vinfo
= STMT_VINFO_BB_VINFO (stmt_info
);
4006 gcc_assert (TREE_CODE (aggr_type
) == ARRAY_TYPE
4007 || TREE_CODE (aggr_type
) == VECTOR_TYPE
);
4011 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
4012 nested_in_vect_loop
= nested_in_vect_loop_p (loop
, stmt
);
4013 containing_loop
= (gimple_bb (stmt
))->loop_father
;
4014 pe
= loop_preheader_edge (loop
);
4018 gcc_assert (bb_vinfo
);
4023 /* Check the step (evolution) of the load in LOOP, and record
4024 whether it's invariant. */
4025 if (nested_in_vect_loop
)
4026 step
= STMT_VINFO_DR_STEP (stmt_info
);
4028 step
= DR_STEP (STMT_VINFO_DATA_REF (stmt_info
));
4030 if (integer_zerop (step
))
4035 /* Create an expression for the first address accessed by this load
4037 base_name
= get_name (DR_BASE_ADDRESS (dr
));
4039 if (dump_enabled_p ())
4041 tree dr_base_type
= TREE_TYPE (DR_BASE_OBJECT (dr
));
4042 dump_printf_loc (MSG_NOTE
, vect_location
,
4043 "create %s-pointer variable to type: ",
4044 get_tree_code_name (TREE_CODE (aggr_type
)));
4045 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, aggr_type
);
4046 if (TREE_CODE (dr_base_type
) == ARRAY_TYPE
)
4047 dump_printf (MSG_NOTE
, " vectorizing an array ref: ");
4048 else if (TREE_CODE (dr_base_type
) == VECTOR_TYPE
)
4049 dump_printf (MSG_NOTE
, " vectorizing a vector ref: ");
4050 else if (TREE_CODE (dr_base_type
) == RECORD_TYPE
)
4051 dump_printf (MSG_NOTE
, " vectorizing a record based array ref: ");
4053 dump_printf (MSG_NOTE
, " vectorizing a pointer ref: ");
4054 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_BASE_OBJECT (dr
));
4055 dump_printf (MSG_NOTE
, "\n");
4058 /* (1) Create the new aggregate-pointer variable.
4059 Vector and array types inherit the alias set of their component
4060 type by default so we need to use a ref-all pointer if the data
4061 reference does not conflict with the created aggregated data
4062 reference because it is not addressable. */
4063 bool need_ref_all
= false;
4064 if (!alias_sets_conflict_p (get_alias_set (aggr_type
),
4065 get_alias_set (DR_REF (dr
))))
4066 need_ref_all
= true;
4067 /* Likewise for any of the data references in the stmt group. */
4068 else if (STMT_VINFO_GROUP_SIZE (stmt_info
) > 1)
4070 gimple orig_stmt
= STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info
);
4073 stmt_vec_info sinfo
= vinfo_for_stmt (orig_stmt
);
4074 struct data_reference
*sdr
= STMT_VINFO_DATA_REF (sinfo
);
4075 if (!alias_sets_conflict_p (get_alias_set (aggr_type
),
4076 get_alias_set (DR_REF (sdr
))))
4078 need_ref_all
= true;
4081 orig_stmt
= STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo
);
4085 aggr_ptr_type
= build_pointer_type_for_mode (aggr_type
, ptr_mode
,
4087 aggr_ptr
= vect_get_new_vect_var (aggr_ptr_type
, vect_pointer_var
, base_name
);
4090 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4091 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4092 def-use update cycles for the pointer: one relative to the outer-loop
4093 (LOOP), which is what steps (3) and (4) below do. The other is relative
4094 to the inner-loop (which is the inner-most loop containing the dataref),
4095 and this is done be step (5) below.
4097 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4098 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4099 redundant. Steps (3),(4) create the following:
4102 LOOP: vp1 = phi(vp0,vp2)
4108 If there is an inner-loop nested in loop, then step (5) will also be
4109 applied, and an additional update in the inner-loop will be created:
4112 LOOP: vp1 = phi(vp0,vp2)
4114 inner: vp3 = phi(vp1,vp4)
4115 vp4 = vp3 + inner_step
4121 /* (2) Calculate the initial address of the aggregate-pointer, and set
4122 the aggregate-pointer to point to it before the loop. */
4124 /* Create: (&(base[init_val+offset]) in the loop preheader. */
4126 new_temp
= vect_create_addr_base_for_vector_ref (stmt
, &new_stmt_list
,
4132 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, new_stmt_list
);
4133 gcc_assert (!new_bb
);
4136 gsi_insert_seq_before (gsi
, new_stmt_list
, GSI_SAME_STMT
);
4139 *initial_address
= new_temp
;
4141 /* Create: p = (aggr_type *) initial_base */
4142 if (TREE_CODE (new_temp
) != SSA_NAME
4143 || !useless_type_conversion_p (aggr_ptr_type
, TREE_TYPE (new_temp
)))
4145 vec_stmt
= gimple_build_assign (aggr_ptr
,
4146 fold_convert (aggr_ptr_type
, new_temp
));
4147 aggr_ptr_init
= make_ssa_name (aggr_ptr
, vec_stmt
);
4148 /* Copy the points-to information if it exists. */
4149 if (DR_PTR_INFO (dr
))
4150 duplicate_ssa_name_ptr_info (aggr_ptr_init
, DR_PTR_INFO (dr
));
4151 gimple_assign_set_lhs (vec_stmt
, aggr_ptr_init
);
4154 new_bb
= gsi_insert_on_edge_immediate (pe
, vec_stmt
);
4155 gcc_assert (!new_bb
);
4158 gsi_insert_before (gsi
, vec_stmt
, GSI_SAME_STMT
);
4161 aggr_ptr_init
= new_temp
;
4163 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4164 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4165 inner-loop nested in LOOP (during outer-loop vectorization). */
4167 /* No update in loop is required. */
4168 if (only_init
&& (!loop_vinfo
|| at_loop
== loop
))
4169 aptr
= aggr_ptr_init
;
4172 /* The step of the aggregate pointer is the type size. */
4173 tree iv_step
= TYPE_SIZE_UNIT (aggr_type
);
4174 /* One exception to the above is when the scalar step of the load in
4175 LOOP is zero. In this case the step here is also zero. */
4177 iv_step
= size_zero_node
;
4178 else if (tree_int_cst_sgn (step
) == -1)
4179 iv_step
= fold_build1 (NEGATE_EXPR
, TREE_TYPE (iv_step
), iv_step
);
4181 standard_iv_increment_position (loop
, &incr_gsi
, &insert_after
);
4183 create_iv (aggr_ptr_init
,
4184 fold_convert (aggr_ptr_type
, iv_step
),
4185 aggr_ptr
, loop
, &incr_gsi
, insert_after
,
4186 &indx_before_incr
, &indx_after_incr
);
4187 incr
= gsi_stmt (incr_gsi
);
4188 set_vinfo_for_stmt (incr
, new_stmt_vec_info (incr
, loop_vinfo
, NULL
));
4190 /* Copy the points-to information if it exists. */
4191 if (DR_PTR_INFO (dr
))
4193 duplicate_ssa_name_ptr_info (indx_before_incr
, DR_PTR_INFO (dr
));
4194 duplicate_ssa_name_ptr_info (indx_after_incr
, DR_PTR_INFO (dr
));
4199 aptr
= indx_before_incr
;
4202 if (!nested_in_vect_loop
|| only_init
)
4206 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4207 nested in LOOP, if exists. */
4209 gcc_assert (nested_in_vect_loop
);
4212 standard_iv_increment_position (containing_loop
, &incr_gsi
,
4214 create_iv (aptr
, fold_convert (aggr_ptr_type
, DR_STEP (dr
)), aggr_ptr
,
4215 containing_loop
, &incr_gsi
, insert_after
, &indx_before_incr
,
4217 incr
= gsi_stmt (incr_gsi
);
4218 set_vinfo_for_stmt (incr
, new_stmt_vec_info (incr
, loop_vinfo
, NULL
));
4220 /* Copy the points-to information if it exists. */
4221 if (DR_PTR_INFO (dr
))
4223 duplicate_ssa_name_ptr_info (indx_before_incr
, DR_PTR_INFO (dr
));
4224 duplicate_ssa_name_ptr_info (indx_after_incr
, DR_PTR_INFO (dr
));
4229 return indx_before_incr
;
4236 /* Function bump_vector_ptr
4238 Increment a pointer (to a vector type) by vector-size. If requested,
4239 i.e. if PTR-INCR is given, then also connect the new increment stmt
4240 to the existing def-use update-chain of the pointer, by modifying
4241 the PTR_INCR as illustrated below:
4243 The pointer def-use update-chain before this function:
4244 DATAREF_PTR = phi (p_0, p_2)
4246 PTR_INCR: p_2 = DATAREF_PTR + step
4248 The pointer def-use update-chain after this function:
4249 DATAREF_PTR = phi (p_0, p_2)
4251 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4253 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4256 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4258 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4259 the loop. The increment amount across iterations is expected
4261 BSI - location where the new update stmt is to be placed.
4262 STMT - the original scalar memory-access stmt that is being vectorized.
4263 BUMP - optional. The offset by which to bump the pointer. If not given,
4264 the offset is assumed to be vector_size.
4266 Output: Return NEW_DATAREF_PTR as illustrated above.
4271 bump_vector_ptr (tree dataref_ptr
, gimple ptr_incr
, gimple_stmt_iterator
*gsi
,
4272 gimple stmt
, tree bump
)
4274 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
4275 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
4276 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
4277 tree update
= TYPE_SIZE_UNIT (vectype
);
4280 use_operand_p use_p
;
4281 tree new_dataref_ptr
;
4286 new_dataref_ptr
= copy_ssa_name (dataref_ptr
, NULL
);
4287 incr_stmt
= gimple_build_assign_with_ops (POINTER_PLUS_EXPR
, new_dataref_ptr
,
4288 dataref_ptr
, update
);
4289 vect_finish_stmt_generation (stmt
, incr_stmt
, gsi
);
4291 /* Copy the points-to information if it exists. */
4292 if (DR_PTR_INFO (dr
))
4294 duplicate_ssa_name_ptr_info (new_dataref_ptr
, DR_PTR_INFO (dr
));
4295 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr
));
4299 return new_dataref_ptr
;
4301 /* Update the vector-pointer's cross-iteration increment. */
4302 FOR_EACH_SSA_USE_OPERAND (use_p
, ptr_incr
, iter
, SSA_OP_USE
)
4304 tree use
= USE_FROM_PTR (use_p
);
4306 if (use
== dataref_ptr
)
4307 SET_USE (use_p
, new_dataref_ptr
);
4309 gcc_assert (tree_int_cst_compare (use
, update
) == 0);
4312 return new_dataref_ptr
;
4316 /* Function vect_create_destination_var.
4318 Create a new temporary of type VECTYPE. */
4321 vect_create_destination_var (tree scalar_dest
, tree vectype
)
4327 enum vect_var_kind kind
;
4329 kind
= vectype
? vect_simple_var
: vect_scalar_var
;
4330 type
= vectype
? vectype
: TREE_TYPE (scalar_dest
);
4332 gcc_assert (TREE_CODE (scalar_dest
) == SSA_NAME
);
4334 name
= get_name (scalar_dest
);
4336 asprintf (&new_name
, "%s_%u", name
, SSA_NAME_VERSION (scalar_dest
));
4338 asprintf (&new_name
, "_%u", SSA_NAME_VERSION (scalar_dest
));
4339 vec_dest
= vect_get_new_vect_var (type
, kind
, new_name
);
4345 /* Function vect_grouped_store_supported.
4347 Returns TRUE if interleave high and interleave low permutations
4348 are supported, and FALSE otherwise. */
4351 vect_grouped_store_supported (tree vectype
, unsigned HOST_WIDE_INT count
)
4353 enum machine_mode mode
= TYPE_MODE (vectype
);
4355 /* vect_permute_store_chain requires the group size to be a power of two. */
4356 if (exact_log2 (count
) == -1)
4358 if (dump_enabled_p ())
4359 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4360 "the size of the group of accesses"
4361 " is not a power of 2\n");
4365 /* Check that the permutation is supported. */
4366 if (VECTOR_MODE_P (mode
))
4368 unsigned int i
, nelt
= GET_MODE_NUNITS (mode
);
4369 unsigned char *sel
= XALLOCAVEC (unsigned char, nelt
);
4370 for (i
= 0; i
< nelt
/ 2; i
++)
4373 sel
[i
* 2 + 1] = i
+ nelt
;
4375 if (can_vec_perm_p (mode
, false, sel
))
4377 for (i
= 0; i
< nelt
; i
++)
4379 if (can_vec_perm_p (mode
, false, sel
))
4384 if (dump_enabled_p ())
4385 dump_printf (MSG_MISSED_OPTIMIZATION
,
4386 "interleave op not supported by target.\n");
4391 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4395 vect_store_lanes_supported (tree vectype
, unsigned HOST_WIDE_INT count
)
4397 return vect_lanes_optab_supported_p ("vec_store_lanes",
4398 vec_store_lanes_optab
,
4403 /* Function vect_permute_store_chain.
4405 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4406 a power of 2, generate interleave_high/low stmts to reorder the data
4407 correctly for the stores. Return the final references for stores in
4410 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4411 The input is 4 vectors each containing 8 elements. We assign a number to
4412 each element, the input sequence is:
4414 1st vec: 0 1 2 3 4 5 6 7
4415 2nd vec: 8 9 10 11 12 13 14 15
4416 3rd vec: 16 17 18 19 20 21 22 23
4417 4th vec: 24 25 26 27 28 29 30 31
4419 The output sequence should be:
4421 1st vec: 0 8 16 24 1 9 17 25
4422 2nd vec: 2 10 18 26 3 11 19 27
4423 3rd vec: 4 12 20 28 5 13 21 30
4424 4th vec: 6 14 22 30 7 15 23 31
4426 i.e., we interleave the contents of the four vectors in their order.
4428 We use interleave_high/low instructions to create such output. The input of
4429 each interleave_high/low operation is two vectors:
4432 the even elements of the result vector are obtained left-to-right from the
4433 high/low elements of the first vector. The odd elements of the result are
4434 obtained left-to-right from the high/low elements of the second vector.
4435 The output of interleave_high will be: 0 4 1 5
4436 and of interleave_low: 2 6 3 7
4439 The permutation is done in log LENGTH stages. In each stage interleave_high
4440 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4441 where the first argument is taken from the first half of DR_CHAIN and the
4442 second argument from its second half.
4445 I1: interleave_high (1st vec, 3rd vec)
4446 I2: interleave_low (1st vec, 3rd vec)
4447 I3: interleave_high (2nd vec, 4th vec)
4448 I4: interleave_low (2nd vec, 4th vec)
4450 The output for the first stage is:
4452 I1: 0 16 1 17 2 18 3 19
4453 I2: 4 20 5 21 6 22 7 23
4454 I3: 8 24 9 25 10 26 11 27
4455 I4: 12 28 13 29 14 30 15 31
4457 The output of the second stage, i.e. the final result is:
4459 I1: 0 8 16 24 1 9 17 25
4460 I2: 2 10 18 26 3 11 19 27
4461 I3: 4 12 20 28 5 13 21 30
4462 I4: 6 14 22 30 7 15 23 31. */
4465 vect_permute_store_chain (vec
<tree
> dr_chain
,
4466 unsigned int length
,
4468 gimple_stmt_iterator
*gsi
,
4469 vec
<tree
> *result_chain
)
4471 tree vect1
, vect2
, high
, low
;
4473 tree vectype
= STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt
));
4474 tree perm_mask_low
, perm_mask_high
;
4476 unsigned int j
, nelt
= TYPE_VECTOR_SUBPARTS (vectype
);
4477 unsigned char *sel
= XALLOCAVEC (unsigned char, nelt
);
4479 result_chain
->quick_grow (length
);
4480 memcpy (result_chain
->address (), dr_chain
.address (),
4481 length
* sizeof (tree
));
4483 for (i
= 0, n
= nelt
/ 2; i
< n
; i
++)
4486 sel
[i
* 2 + 1] = i
+ nelt
;
4488 perm_mask_high
= vect_gen_perm_mask (vectype
, sel
);
4489 gcc_assert (perm_mask_high
!= NULL
);
4491 for (i
= 0; i
< nelt
; i
++)
4493 perm_mask_low
= vect_gen_perm_mask (vectype
, sel
);
4494 gcc_assert (perm_mask_low
!= NULL
);
4496 for (i
= 0, n
= exact_log2 (length
); i
< n
; i
++)
4498 for (j
= 0; j
< length
/2; j
++)
4500 vect1
= dr_chain
[j
];
4501 vect2
= dr_chain
[j
+length
/2];
4503 /* Create interleaving stmt:
4504 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1, ...}> */
4505 high
= make_temp_ssa_name (vectype
, NULL
, "vect_inter_high");
4507 = gimple_build_assign_with_ops (VEC_PERM_EXPR
, high
,
4508 vect1
, vect2
, perm_mask_high
);
4509 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
4510 (*result_chain
)[2*j
] = high
;
4512 /* Create interleaving stmt:
4513 low = VEC_PERM_EXPR <vect1, vect2, {nelt/2, nelt*3/2, nelt/2+1,
4514 nelt*3/2+1, ...}> */
4515 low
= make_temp_ssa_name (vectype
, NULL
, "vect_inter_low");
4517 = gimple_build_assign_with_ops (VEC_PERM_EXPR
, low
,
4518 vect1
, vect2
, perm_mask_low
);
4519 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
4520 (*result_chain
)[2*j
+1] = low
;
4522 memcpy (dr_chain
.address (), result_chain
->address (),
4523 length
* sizeof (tree
));
4527 /* Function vect_setup_realignment
4529 This function is called when vectorizing an unaligned load using
4530 the dr_explicit_realign[_optimized] scheme.
4531 This function generates the following code at the loop prolog:
4534 x msq_init = *(floor(p)); # prolog load
4535 realignment_token = call target_builtin;
4537 x msq = phi (msq_init, ---)
4539 The stmts marked with x are generated only for the case of
4540 dr_explicit_realign_optimized.
4542 The code above sets up a new (vector) pointer, pointing to the first
4543 location accessed by STMT, and a "floor-aligned" load using that pointer.
4544 It also generates code to compute the "realignment-token" (if the relevant
4545 target hook was defined), and creates a phi-node at the loop-header bb
4546 whose arguments are the result of the prolog-load (created by this
4547 function) and the result of a load that takes place in the loop (to be
4548 created by the caller to this function).
4550 For the case of dr_explicit_realign_optimized:
4551 The caller to this function uses the phi-result (msq) to create the
4552 realignment code inside the loop, and sets up the missing phi argument,
4555 msq = phi (msq_init, lsq)
4556 lsq = *(floor(p')); # load in loop
4557 result = realign_load (msq, lsq, realignment_token);
4559 For the case of dr_explicit_realign:
4561 msq = *(floor(p)); # load in loop
4563 lsq = *(floor(p')); # load in loop
4564 result = realign_load (msq, lsq, realignment_token);
4567 STMT - (scalar) load stmt to be vectorized. This load accesses
4568 a memory location that may be unaligned.
4569 BSI - place where new code is to be inserted.
4570 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4574 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4575 target hook, if defined.
4576 Return value - the result of the loop-header phi node. */
4579 vect_setup_realignment (gimple stmt
, gimple_stmt_iterator
*gsi
,
4580 tree
*realignment_token
,
4581 enum dr_alignment_support alignment_support_scheme
,
4583 struct loop
**at_loop
)
4585 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
4586 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
4587 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
4588 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
4589 struct loop
*loop
= NULL
;
4591 tree scalar_dest
= gimple_assign_lhs (stmt
);
4598 tree msq_init
= NULL_TREE
;
4601 tree msq
= NULL_TREE
;
4602 gimple_seq stmts
= NULL
;
4604 bool compute_in_loop
= false;
4605 bool nested_in_vect_loop
= false;
4606 struct loop
*containing_loop
= (gimple_bb (stmt
))->loop_father
;
4607 struct loop
*loop_for_initial_load
= NULL
;
4611 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
4612 nested_in_vect_loop
= nested_in_vect_loop_p (loop
, stmt
);
4615 gcc_assert (alignment_support_scheme
== dr_explicit_realign
4616 || alignment_support_scheme
== dr_explicit_realign_optimized
);
4618 /* We need to generate three things:
4619 1. the misalignment computation
4620 2. the extra vector load (for the optimized realignment scheme).
4621 3. the phi node for the two vectors from which the realignment is
4622 done (for the optimized realignment scheme). */
4624 /* 1. Determine where to generate the misalignment computation.
4626 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4627 calculation will be generated by this function, outside the loop (in the
4628 preheader). Otherwise, INIT_ADDR had already been computed for us by the
4629 caller, inside the loop.
4631 Background: If the misalignment remains fixed throughout the iterations of
4632 the loop, then both realignment schemes are applicable, and also the
4633 misalignment computation can be done outside LOOP. This is because we are
4634 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4635 are a multiple of VS (the Vector Size), and therefore the misalignment in
4636 different vectorized LOOP iterations is always the same.
4637 The problem arises only if the memory access is in an inner-loop nested
4638 inside LOOP, which is now being vectorized using outer-loop vectorization.
4639 This is the only case when the misalignment of the memory access may not
4640 remain fixed throughout the iterations of the inner-loop (as explained in
4641 detail in vect_supportable_dr_alignment). In this case, not only is the
4642 optimized realignment scheme not applicable, but also the misalignment
4643 computation (and generation of the realignment token that is passed to
4644 REALIGN_LOAD) have to be done inside the loop.
4646 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4647 or not, which in turn determines if the misalignment is computed inside
4648 the inner-loop, or outside LOOP. */
4650 if (init_addr
!= NULL_TREE
|| !loop_vinfo
)
4652 compute_in_loop
= true;
4653 gcc_assert (alignment_support_scheme
== dr_explicit_realign
);
4657 /* 2. Determine where to generate the extra vector load.
4659 For the optimized realignment scheme, instead of generating two vector
4660 loads in each iteration, we generate a single extra vector load in the
4661 preheader of the loop, and in each iteration reuse the result of the
4662 vector load from the previous iteration. In case the memory access is in
4663 an inner-loop nested inside LOOP, which is now being vectorized using
4664 outer-loop vectorization, we need to determine whether this initial vector
4665 load should be generated at the preheader of the inner-loop, or can be
4666 generated at the preheader of LOOP. If the memory access has no evolution
4667 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
4668 to be generated inside LOOP (in the preheader of the inner-loop). */
4670 if (nested_in_vect_loop
)
4672 tree outerloop_step
= STMT_VINFO_DR_STEP (stmt_info
);
4673 bool invariant_in_outerloop
=
4674 (tree_int_cst_compare (outerloop_step
, size_zero_node
) == 0);
4675 loop_for_initial_load
= (invariant_in_outerloop
? loop
: loop
->inner
);
4678 loop_for_initial_load
= loop
;
4680 *at_loop
= loop_for_initial_load
;
4682 if (loop_for_initial_load
)
4683 pe
= loop_preheader_edge (loop_for_initial_load
);
4685 /* 3. For the case of the optimized realignment, create the first vector
4686 load at the loop preheader. */
4688 if (alignment_support_scheme
== dr_explicit_realign_optimized
)
4690 /* Create msq_init = *(floor(p1)) in the loop preheader */
4692 gcc_assert (!compute_in_loop
);
4693 vec_dest
= vect_create_destination_var (scalar_dest
, vectype
);
4694 ptr
= vect_create_data_ref_ptr (stmt
, vectype
, loop_for_initial_load
,
4695 NULL_TREE
, &init_addr
, NULL
, &inc
,
4697 new_temp
= copy_ssa_name (ptr
, NULL
);
4698 new_stmt
= gimple_build_assign_with_ops
4699 (BIT_AND_EXPR
, new_temp
, ptr
,
4700 build_int_cst (TREE_TYPE (ptr
),
4701 -(HOST_WIDE_INT
)TYPE_ALIGN_UNIT (vectype
)));
4702 new_bb
= gsi_insert_on_edge_immediate (pe
, new_stmt
);
4703 gcc_assert (!new_bb
);
4705 = build2 (MEM_REF
, TREE_TYPE (vec_dest
), new_temp
,
4706 build_int_cst (reference_alias_ptr_type (DR_REF (dr
)), 0));
4707 new_stmt
= gimple_build_assign (vec_dest
, data_ref
);
4708 new_temp
= make_ssa_name (vec_dest
, new_stmt
);
4709 gimple_assign_set_lhs (new_stmt
, new_temp
);
4712 new_bb
= gsi_insert_on_edge_immediate (pe
, new_stmt
);
4713 gcc_assert (!new_bb
);
4716 gsi_insert_before (gsi
, new_stmt
, GSI_SAME_STMT
);
4718 msq_init
= gimple_assign_lhs (new_stmt
);
4721 /* 4. Create realignment token using a target builtin, if available.
4722 It is done either inside the containing loop, or before LOOP (as
4723 determined above). */
4725 if (targetm
.vectorize
.builtin_mask_for_load
)
4729 /* Compute INIT_ADDR - the initial addressed accessed by this memref. */
4732 /* Generate the INIT_ADDR computation outside LOOP. */
4733 init_addr
= vect_create_addr_base_for_vector_ref (stmt
, &stmts
,
4737 pe
= loop_preheader_edge (loop
);
4738 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, stmts
);
4739 gcc_assert (!new_bb
);
4742 gsi_insert_seq_before (gsi
, stmts
, GSI_SAME_STMT
);
4745 builtin_decl
= targetm
.vectorize
.builtin_mask_for_load ();
4746 new_stmt
= gimple_build_call (builtin_decl
, 1, init_addr
);
4748 vect_create_destination_var (scalar_dest
,
4749 gimple_call_return_type (new_stmt
));
4750 new_temp
= make_ssa_name (vec_dest
, new_stmt
);
4751 gimple_call_set_lhs (new_stmt
, new_temp
);
4753 if (compute_in_loop
)
4754 gsi_insert_before (gsi
, new_stmt
, GSI_SAME_STMT
);
4757 /* Generate the misalignment computation outside LOOP. */
4758 pe
= loop_preheader_edge (loop
);
4759 new_bb
= gsi_insert_on_edge_immediate (pe
, new_stmt
);
4760 gcc_assert (!new_bb
);
4763 *realignment_token
= gimple_call_lhs (new_stmt
);
4765 /* The result of the CALL_EXPR to this builtin is determined from
4766 the value of the parameter and no global variables are touched
4767 which makes the builtin a "const" function. Requiring the
4768 builtin to have the "const" attribute makes it unnecessary
4769 to call mark_call_clobbered. */
4770 gcc_assert (TREE_READONLY (builtin_decl
));
4773 if (alignment_support_scheme
== dr_explicit_realign
)
4776 gcc_assert (!compute_in_loop
);
4777 gcc_assert (alignment_support_scheme
== dr_explicit_realign_optimized
);
4780 /* 5. Create msq = phi <msq_init, lsq> in loop */
4782 pe
= loop_preheader_edge (containing_loop
);
4783 vec_dest
= vect_create_destination_var (scalar_dest
, vectype
);
4784 msq
= make_ssa_name (vec_dest
, NULL
);
4785 phi_stmt
= create_phi_node (msq
, containing_loop
->header
);
4786 add_phi_arg (phi_stmt
, msq_init
, pe
, UNKNOWN_LOCATION
);
4792 /* Function vect_grouped_load_supported.
4794 Returns TRUE if even and odd permutations are supported,
4795 and FALSE otherwise. */
4798 vect_grouped_load_supported (tree vectype
, unsigned HOST_WIDE_INT count
)
4800 enum machine_mode mode
= TYPE_MODE (vectype
);
4802 /* vect_permute_load_chain requires the group size to be a power of two. */
4803 if (exact_log2 (count
) == -1)
4805 if (dump_enabled_p ())
4806 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4807 "the size of the group of accesses"
4808 " is not a power of 2\n");
4812 /* Check that the permutation is supported. */
4813 if (VECTOR_MODE_P (mode
))
4815 unsigned int i
, nelt
= GET_MODE_NUNITS (mode
);
4816 unsigned char *sel
= XALLOCAVEC (unsigned char, nelt
);
4818 for (i
= 0; i
< nelt
; i
++)
4820 if (can_vec_perm_p (mode
, false, sel
))
4822 for (i
= 0; i
< nelt
; i
++)
4824 if (can_vec_perm_p (mode
, false, sel
))
4829 if (dump_enabled_p ())
4830 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4831 "extract even/odd not supported by target\n");
4835 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
4839 vect_load_lanes_supported (tree vectype
, unsigned HOST_WIDE_INT count
)
4841 return vect_lanes_optab_supported_p ("vec_load_lanes",
4842 vec_load_lanes_optab
,
4846 /* Function vect_permute_load_chain.
4848 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
4849 a power of 2, generate extract_even/odd stmts to reorder the input data
4850 correctly. Return the final references for loads in RESULT_CHAIN.
4852 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4853 The input is 4 vectors each containing 8 elements. We assign a number to each
4854 element, the input sequence is:
4856 1st vec: 0 1 2 3 4 5 6 7
4857 2nd vec: 8 9 10 11 12 13 14 15
4858 3rd vec: 16 17 18 19 20 21 22 23
4859 4th vec: 24 25 26 27 28 29 30 31
4861 The output sequence should be:
4863 1st vec: 0 4 8 12 16 20 24 28
4864 2nd vec: 1 5 9 13 17 21 25 29
4865 3rd vec: 2 6 10 14 18 22 26 30
4866 4th vec: 3 7 11 15 19 23 27 31
4868 i.e., the first output vector should contain the first elements of each
4869 interleaving group, etc.
4871 We use extract_even/odd instructions to create such output. The input of
4872 each extract_even/odd operation is two vectors
4876 and the output is the vector of extracted even/odd elements. The output of
4877 extract_even will be: 0 2 4 6
4878 and of extract_odd: 1 3 5 7
4881 The permutation is done in log LENGTH stages. In each stage extract_even
4882 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
4883 their order. In our example,
4885 E1: extract_even (1st vec, 2nd vec)
4886 E2: extract_odd (1st vec, 2nd vec)
4887 E3: extract_even (3rd vec, 4th vec)
4888 E4: extract_odd (3rd vec, 4th vec)
4890 The output for the first stage will be:
4892 E1: 0 2 4 6 8 10 12 14
4893 E2: 1 3 5 7 9 11 13 15
4894 E3: 16 18 20 22 24 26 28 30
4895 E4: 17 19 21 23 25 27 29 31
4897 In order to proceed and create the correct sequence for the next stage (or
4898 for the correct output, if the second stage is the last one, as in our
4899 example), we first put the output of extract_even operation and then the
4900 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
4901 The input for the second stage is:
4903 1st vec (E1): 0 2 4 6 8 10 12 14
4904 2nd vec (E3): 16 18 20 22 24 26 28 30
4905 3rd vec (E2): 1 3 5 7 9 11 13 15
4906 4th vec (E4): 17 19 21 23 25 27 29 31
4908 The output of the second stage:
4910 E1: 0 4 8 12 16 20 24 28
4911 E2: 2 6 10 14 18 22 26 30
4912 E3: 1 5 9 13 17 21 25 29
4913 E4: 3 7 11 15 19 23 27 31
4915 And RESULT_CHAIN after reordering:
4917 1st vec (E1): 0 4 8 12 16 20 24 28
4918 2nd vec (E3): 1 5 9 13 17 21 25 29
4919 3rd vec (E2): 2 6 10 14 18 22 26 30
4920 4th vec (E4): 3 7 11 15 19 23 27 31. */
4923 vect_permute_load_chain (vec
<tree
> dr_chain
,
4924 unsigned int length
,
4926 gimple_stmt_iterator
*gsi
,
4927 vec
<tree
> *result_chain
)
4929 tree data_ref
, first_vect
, second_vect
;
4930 tree perm_mask_even
, perm_mask_odd
;
4932 tree vectype
= STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt
));
4933 unsigned int i
, j
, log_length
= exact_log2 (length
);
4934 unsigned nelt
= TYPE_VECTOR_SUBPARTS (vectype
);
4935 unsigned char *sel
= XALLOCAVEC (unsigned char, nelt
);
4937 result_chain
->quick_grow (length
);
4938 memcpy (result_chain
->address (), dr_chain
.address (),
4939 length
* sizeof (tree
));
4941 for (i
= 0; i
< nelt
; ++i
)
4943 perm_mask_even
= vect_gen_perm_mask (vectype
, sel
);
4944 gcc_assert (perm_mask_even
!= NULL
);
4946 for (i
= 0; i
< nelt
; ++i
)
4948 perm_mask_odd
= vect_gen_perm_mask (vectype
, sel
);
4949 gcc_assert (perm_mask_odd
!= NULL
);
4951 for (i
= 0; i
< log_length
; i
++)
4953 for (j
= 0; j
< length
; j
+= 2)
4955 first_vect
= dr_chain
[j
];
4956 second_vect
= dr_chain
[j
+1];
4958 /* data_ref = permute_even (first_data_ref, second_data_ref); */
4959 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_perm_even");
4960 perm_stmt
= gimple_build_assign_with_ops (VEC_PERM_EXPR
, data_ref
,
4961 first_vect
, second_vect
,
4963 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
4964 (*result_chain
)[j
/2] = data_ref
;
4966 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
4967 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_perm_odd");
4968 perm_stmt
= gimple_build_assign_with_ops (VEC_PERM_EXPR
, data_ref
,
4969 first_vect
, second_vect
,
4971 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
4972 (*result_chain
)[j
/2+length
/2] = data_ref
;
4974 memcpy (dr_chain
.address (), result_chain
->address (),
4975 length
* sizeof (tree
));
4980 /* Function vect_transform_grouped_load.
4982 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
4983 to perform their permutation and ascribe the result vectorized statements to
4984 the scalar statements.
4988 vect_transform_grouped_load (gimple stmt
, vec
<tree
> dr_chain
, int size
,
4989 gimple_stmt_iterator
*gsi
)
4991 vec
<tree
> result_chain
= vNULL
;
4993 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
4994 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
4995 vectors, that are ready for vector computation. */
4996 result_chain
.create (size
);
4997 vect_permute_load_chain (dr_chain
, size
, stmt
, gsi
, &result_chain
);
4998 vect_record_grouped_load_vectors (stmt
, result_chain
);
4999 result_chain
.release ();
5002 /* RESULT_CHAIN contains the output of a group of grouped loads that were
5003 generated as part of the vectorization of STMT. Assign the statement
5004 for each vector to the associated scalar statement. */
5007 vect_record_grouped_load_vectors (gimple stmt
, vec
<tree
> result_chain
)
5009 gimple first_stmt
= GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
));
5010 gimple next_stmt
, new_stmt
;
5011 unsigned int i
, gap_count
;
5014 /* Put a permuted data-ref in the VECTORIZED_STMT field.
5015 Since we scan the chain starting from it's first node, their order
5016 corresponds the order of data-refs in RESULT_CHAIN. */
5017 next_stmt
= first_stmt
;
5019 FOR_EACH_VEC_ELT (result_chain
, i
, tmp_data_ref
)
5024 /* Skip the gaps. Loads created for the gaps will be removed by dead
5025 code elimination pass later. No need to check for the first stmt in
5026 the group, since it always exists.
5027 GROUP_GAP is the number of steps in elements from the previous
5028 access (if there is no gap GROUP_GAP is 1). We skip loads that
5029 correspond to the gaps. */
5030 if (next_stmt
!= first_stmt
5031 && gap_count
< GROUP_GAP (vinfo_for_stmt (next_stmt
)))
5039 new_stmt
= SSA_NAME_DEF_STMT (tmp_data_ref
);
5040 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5041 copies, and we put the new vector statement in the first available
5043 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt
)))
5044 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt
)) = new_stmt
;
5047 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt
)))
5050 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt
));
5052 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt
));
5055 prev_stmt
= rel_stmt
;
5057 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt
));
5060 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt
)) =
5065 next_stmt
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt
));
5067 /* If NEXT_STMT accesses the same DR as the previous statement,
5068 put the same TMP_DATA_REF as its vectorized statement; otherwise
5069 get the next data-ref from RESULT_CHAIN. */
5070 if (!next_stmt
|| !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt
)))
5076 /* Function vect_force_dr_alignment_p.
5078 Returns whether the alignment of a DECL can be forced to be aligned
5079 on ALIGNMENT bit boundary. */
5082 vect_can_force_dr_alignment_p (const_tree decl
, unsigned int alignment
)
5084 if (TREE_CODE (decl
) != VAR_DECL
)
5087 /* We cannot change alignment of common or external symbols as another
5088 translation unit may contain a definition with lower alignment.
5089 The rules of common symbol linking mean that the definition
5090 will override the common symbol. The same is true for constant
5091 pool entries which may be shared and are not properly merged
5093 if (DECL_EXTERNAL (decl
)
5094 || DECL_COMMON (decl
)
5095 || DECL_IN_CONSTANT_POOL (decl
))
5098 if (TREE_ASM_WRITTEN (decl
))
5101 /* Do not override the alignment as specified by the ABI when the used
5102 attribute is set. */
5103 if (DECL_PRESERVE_P (decl
))
5106 /* Do not override explicit alignment set by the user when an explicit
5107 section name is also used. This is a common idiom used by many
5108 software projects. */
5109 if (DECL_SECTION_NAME (decl
) != NULL_TREE
5110 && !DECL_HAS_IMPLICIT_SECTION_NAME_P (decl
))
5113 if (TREE_STATIC (decl
))
5114 return (alignment
<= MAX_OFILE_ALIGNMENT
);
5116 return (alignment
<= MAX_STACK_ALIGNMENT
);
5120 /* Return whether the data reference DR is supported with respect to its
5122 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
5123 it is aligned, i.e., check if it is possible to vectorize it with different
5126 enum dr_alignment_support
5127 vect_supportable_dr_alignment (struct data_reference
*dr
,
5128 bool check_aligned_accesses
)
5130 gimple stmt
= DR_STMT (dr
);
5131 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
5132 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
5133 enum machine_mode mode
= TYPE_MODE (vectype
);
5134 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
5135 struct loop
*vect_loop
= NULL
;
5136 bool nested_in_vect_loop
= false;
5138 if (aligned_access_p (dr
) && !check_aligned_accesses
)
5141 /* For now assume all conditional loads/stores support unaligned
5142 access without any special code. */
5143 if (is_gimple_call (stmt
)
5144 && gimple_call_internal_p (stmt
)
5145 && (gimple_call_internal_fn (stmt
) == IFN_MASK_LOAD
5146 || gimple_call_internal_fn (stmt
) == IFN_MASK_STORE
))
5147 return dr_unaligned_supported
;
5151 vect_loop
= LOOP_VINFO_LOOP (loop_vinfo
);
5152 nested_in_vect_loop
= nested_in_vect_loop_p (vect_loop
, stmt
);
5155 /* Possibly unaligned access. */
5157 /* We can choose between using the implicit realignment scheme (generating
5158 a misaligned_move stmt) and the explicit realignment scheme (generating
5159 aligned loads with a REALIGN_LOAD). There are two variants to the
5160 explicit realignment scheme: optimized, and unoptimized.
5161 We can optimize the realignment only if the step between consecutive
5162 vector loads is equal to the vector size. Since the vector memory
5163 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
5164 is guaranteed that the misalignment amount remains the same throughout the
5165 execution of the vectorized loop. Therefore, we can create the
5166 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
5167 at the loop preheader.
5169 However, in the case of outer-loop vectorization, when vectorizing a
5170 memory access in the inner-loop nested within the LOOP that is now being
5171 vectorized, while it is guaranteed that the misalignment of the
5172 vectorized memory access will remain the same in different outer-loop
5173 iterations, it is *not* guaranteed that is will remain the same throughout
5174 the execution of the inner-loop. This is because the inner-loop advances
5175 with the original scalar step (and not in steps of VS). If the inner-loop
5176 step happens to be a multiple of VS, then the misalignment remains fixed
5177 and we can use the optimized realignment scheme. For example:
5183 When vectorizing the i-loop in the above example, the step between
5184 consecutive vector loads is 1, and so the misalignment does not remain
5185 fixed across the execution of the inner-loop, and the realignment cannot
5186 be optimized (as illustrated in the following pseudo vectorized loop):
5188 for (i=0; i<N; i+=4)
5189 for (j=0; j<M; j++){
5190 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
5191 // when j is {0,1,2,3,4,5,6,7,...} respectively.
5192 // (assuming that we start from an aligned address).
5195 We therefore have to use the unoptimized realignment scheme:
5197 for (i=0; i<N; i+=4)
5198 for (j=k; j<M; j+=4)
5199 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
5200 // that the misalignment of the initial address is
5203 The loop can then be vectorized as follows:
5205 for (k=0; k<4; k++){
5206 rt = get_realignment_token (&vp[k]);
5207 for (i=0; i<N; i+=4){
5209 for (j=k; j<M; j+=4){
5211 va = REALIGN_LOAD <v1,v2,rt>;
5218 if (DR_IS_READ (dr
))
5220 bool is_packed
= false;
5221 tree type
= (TREE_TYPE (DR_REF (dr
)));
5223 if (optab_handler (vec_realign_load_optab
, mode
) != CODE_FOR_nothing
5224 && (!targetm
.vectorize
.builtin_mask_for_load
5225 || targetm
.vectorize
.builtin_mask_for_load ()))
5227 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
5228 if ((nested_in_vect_loop
5229 && (TREE_INT_CST_LOW (DR_STEP (dr
))
5230 != GET_MODE_SIZE (TYPE_MODE (vectype
))))
5232 return dr_explicit_realign
;
5234 return dr_explicit_realign_optimized
;
5236 if (!known_alignment_for_access_p (dr
))
5237 is_packed
= not_size_aligned (DR_REF (dr
));
5239 if ((TYPE_USER_ALIGN (type
) && !is_packed
)
5240 || targetm
.vectorize
.
5241 support_vector_misalignment (mode
, type
,
5242 DR_MISALIGNMENT (dr
), is_packed
))
5243 /* Can't software pipeline the loads, but can at least do them. */
5244 return dr_unaligned_supported
;
5248 bool is_packed
= false;
5249 tree type
= (TREE_TYPE (DR_REF (dr
)));
5251 if (!known_alignment_for_access_p (dr
))
5252 is_packed
= not_size_aligned (DR_REF (dr
));
5254 if ((TYPE_USER_ALIGN (type
) && !is_packed
)
5255 || targetm
.vectorize
.
5256 support_vector_misalignment (mode
, type
,
5257 DR_MISALIGNMENT (dr
), is_packed
))
5258 return dr_unaligned_supported
;
5262 return dr_unaligned_unsupported
;