/* Data References Analysis and Manipulation Utilities for Vectorization.
   Copyright (C) 2003-2015 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
24 #include "coretypes.h"
33 #include "fold-const.h"
34 #include "stor-layout.h"
37 #include "gimple-pretty-print.h"
38 #include "internal-fn.h"
41 #include "gimple-iterator.h"
42 #include "gimplify-me.h"
43 #include "tree-ssa-loop-ivopts.h"
44 #include "tree-ssa-loop-manip.h"
45 #include "tree-ssa-loop.h"
47 #include "tree-chrec.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "diagnostic-core.h"
52 /* Need to include rtl.h, expr.h, etc. for optabs. */
54 #include "insn-config.h"
63 #include "insn-codes.h"
67 /* Return true if load- or store-lanes optab OPTAB is implemented for
68 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
71 vect_lanes_optab_supported_p (const char *name
, convert_optab optab
,
72 tree vectype
, unsigned HOST_WIDE_INT count
)
74 machine_mode mode
, array_mode
;
77 mode
= TYPE_MODE (vectype
);
78 limit_p
= !targetm
.array_mode_supported_p (mode
, count
);
79 array_mode
= mode_for_size (count
* GET_MODE_BITSIZE (mode
),
82 if (array_mode
== BLKmode
)
84 if (dump_enabled_p ())
85 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
86 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC
"]\n",
87 GET_MODE_NAME (mode
), count
);
91 if (convert_optab_handler (optab
, array_mode
, mode
) == CODE_FOR_nothing
)
93 if (dump_enabled_p ())
94 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
95 "cannot use %s<%s><%s>\n", name
,
96 GET_MODE_NAME (array_mode
), GET_MODE_NAME (mode
));
100 if (dump_enabled_p ())
101 dump_printf_loc (MSG_NOTE
, vect_location
,
102 "can use %s<%s><%s>\n", name
, GET_MODE_NAME (array_mode
),
103 GET_MODE_NAME (mode
));
109 /* Return the smallest scalar part of STMT.
110 This is used to determine the vectype of the stmt. We generally set the
111 vectype according to the type of the result (lhs). For stmts whose
112 result-type is different than the type of the arguments (e.g., demotion,
113 promotion), vectype will be reset appropriately (later). Note that we have
114 to visit the smallest datatype in this function, because that determines the
115 VF. If the smallest datatype in the loop is present only as the rhs of a
116 promotion operation - we'd miss it.
117 Such a case, where a variable of this datatype does not appear in the lhs
118 anywhere in the loop, can only occur if it's an invariant: e.g.:
119 'int_x = (int) short_inv', which we'd expect to have been optimized away by
120 invariant motion. However, we cannot rely on invariant motion to always
121 take invariants out of the loop, and so in the case of promotion we also
122 have to check the rhs.
123 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
127 vect_get_smallest_scalar_type (gimple stmt
, HOST_WIDE_INT
*lhs_size_unit
,
128 HOST_WIDE_INT
*rhs_size_unit
)
130 tree scalar_type
= gimple_expr_type (stmt
);
131 HOST_WIDE_INT lhs
, rhs
;
133 lhs
= rhs
= TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type
));
135 if (is_gimple_assign (stmt
)
136 && (gimple_assign_cast_p (stmt
)
137 || gimple_assign_rhs_code (stmt
) == WIDEN_MULT_EXPR
138 || gimple_assign_rhs_code (stmt
) == WIDEN_LSHIFT_EXPR
139 || gimple_assign_rhs_code (stmt
) == FLOAT_EXPR
))
141 tree rhs_type
= TREE_TYPE (gimple_assign_rhs1 (stmt
));
143 rhs
= TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type
));
145 scalar_type
= rhs_type
;
148 *lhs_size_unit
= lhs
;
149 *rhs_size_unit
= rhs
;
154 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
155 tested at run-time. Return TRUE if DDR was successfully inserted.
156 Return false if versioning is not supported. */
159 vect_mark_for_runtime_alias_test (ddr_p ddr
, loop_vec_info loop_vinfo
)
161 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
163 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS
) == 0)
166 if (dump_enabled_p ())
168 dump_printf_loc (MSG_NOTE
, vect_location
,
169 "mark for run-time aliasing test between ");
170 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (DDR_A (ddr
)));
171 dump_printf (MSG_NOTE
, " and ");
172 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (DDR_B (ddr
)));
173 dump_printf (MSG_NOTE
, "\n");
176 if (optimize_loop_nest_for_size_p (loop
))
178 if (dump_enabled_p ())
179 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
180 "versioning not supported when optimizing"
185 /* FORNOW: We don't support versioning with outer-loop vectorization. */
188 if (dump_enabled_p ())
189 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
190 "versioning not yet supported for outer-loops.\n");
194 /* FORNOW: We don't support creating runtime alias tests for non-constant
196 if (TREE_CODE (DR_STEP (DDR_A (ddr
))) != INTEGER_CST
197 || TREE_CODE (DR_STEP (DDR_B (ddr
))) != INTEGER_CST
)
199 if (dump_enabled_p ())
200 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
201 "versioning not yet supported for non-constant "
206 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo
).safe_push (ddr
);
211 /* Function vect_analyze_data_ref_dependence.
213 Return TRUE if there (might) exist a dependence between a memory-reference
214 DRA and a memory-reference DRB. When versioning for alias may check a
215 dependence at run-time, return FALSE. Adjust *MAX_VF according to
216 the data dependence. */
219 vect_analyze_data_ref_dependence (struct data_dependence_relation
*ddr
,
220 loop_vec_info loop_vinfo
, int *max_vf
)
223 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
224 struct data_reference
*dra
= DDR_A (ddr
);
225 struct data_reference
*drb
= DDR_B (ddr
);
226 stmt_vec_info stmtinfo_a
= vinfo_for_stmt (DR_STMT (dra
));
227 stmt_vec_info stmtinfo_b
= vinfo_for_stmt (DR_STMT (drb
));
228 lambda_vector dist_v
;
229 unsigned int loop_depth
;
231 /* In loop analysis all data references should be vectorizable. */
232 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a
)
233 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b
))
236 /* Independent data accesses. */
237 if (DDR_ARE_DEPENDENT (ddr
) == chrec_known
)
241 || (DR_IS_READ (dra
) && DR_IS_READ (drb
)))
244 /* Even if we have an anti-dependence then, as the vectorized loop covers at
245 least two scalar iterations, there is always also a true dependence.
246 As the vectorizer does not re-order loads and stores we can ignore
247 the anti-dependence if TBAA can disambiguate both DRs similar to the
248 case with known negative distance anti-dependences (positive
249 distance anti-dependences would violate TBAA constraints). */
250 if (((DR_IS_READ (dra
) && DR_IS_WRITE (drb
))
251 || (DR_IS_WRITE (dra
) && DR_IS_READ (drb
)))
252 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra
)),
253 get_alias_set (DR_REF (drb
))))
256 /* Unknown data dependence. */
257 if (DDR_ARE_DEPENDENT (ddr
) == chrec_dont_know
)
259 /* If user asserted safelen consecutive iterations can be
260 executed concurrently, assume independence. */
261 if (loop
->safelen
>= 2)
263 if (loop
->safelen
< *max_vf
)
264 *max_vf
= loop
->safelen
;
265 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo
) = false;
269 if (STMT_VINFO_GATHER_P (stmtinfo_a
)
270 || STMT_VINFO_GATHER_P (stmtinfo_b
))
272 if (dump_enabled_p ())
274 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
275 "versioning for alias not supported for: "
276 "can't determine dependence between ");
277 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
279 dump_printf (MSG_MISSED_OPTIMIZATION
, " and ");
280 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
282 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
287 if (dump_enabled_p ())
289 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
290 "versioning for alias required: "
291 "can't determine dependence between ");
292 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
294 dump_printf (MSG_MISSED_OPTIMIZATION
, " and ");
295 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
297 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
300 /* Add to list of ddrs that need to be tested at run-time. */
301 return !vect_mark_for_runtime_alias_test (ddr
, loop_vinfo
);
304 /* Known data dependence. */
305 if (DDR_NUM_DIST_VECTS (ddr
) == 0)
307 /* If user asserted safelen consecutive iterations can be
308 executed concurrently, assume independence. */
309 if (loop
->safelen
>= 2)
311 if (loop
->safelen
< *max_vf
)
312 *max_vf
= loop
->safelen
;
313 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo
) = false;
317 if (STMT_VINFO_GATHER_P (stmtinfo_a
)
318 || STMT_VINFO_GATHER_P (stmtinfo_b
))
320 if (dump_enabled_p ())
322 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
323 "versioning for alias not supported for: "
324 "bad dist vector for ");
325 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
327 dump_printf (MSG_MISSED_OPTIMIZATION
, " and ");
328 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
330 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
335 if (dump_enabled_p ())
337 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
338 "versioning for alias required: "
339 "bad dist vector for ");
340 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, DR_REF (dra
));
341 dump_printf (MSG_MISSED_OPTIMIZATION
, " and ");
342 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, DR_REF (drb
));
343 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
345 /* Add to list of ddrs that need to be tested at run-time. */
346 return !vect_mark_for_runtime_alias_test (ddr
, loop_vinfo
);
349 loop_depth
= index_in_loop_nest (loop
->num
, DDR_LOOP_NEST (ddr
));
350 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr
), i
, dist_v
)
352 int dist
= dist_v
[loop_depth
];
354 if (dump_enabled_p ())
355 dump_printf_loc (MSG_NOTE
, vect_location
,
356 "dependence distance = %d.\n", dist
);
360 if (dump_enabled_p ())
362 dump_printf_loc (MSG_NOTE
, vect_location
,
363 "dependence distance == 0 between ");
364 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dra
));
365 dump_printf (MSG_NOTE
, " and ");
366 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (drb
));
367 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
370 /* When we perform grouped accesses and perform implicit CSE
371 by detecting equal accesses and doing disambiguation with
372 runtime alias tests like for
380 where we will end up loading { a[i], a[i+1] } once, make
381 sure that inserting group loads before the first load and
382 stores after the last store will do the right thing.
383 Similar for groups like
387 where loads from the group interleave with the store. */
388 if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a
)
389 || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b
))
392 earlier_stmt
= get_earlier_stmt (DR_STMT (dra
), DR_STMT (drb
));
394 (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt
))))
396 if (dump_enabled_p ())
397 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
398 "READ_WRITE dependence in interleaving."
407 if (dist
> 0 && DDR_REVERSED_P (ddr
))
409 /* If DDR_REVERSED_P the order of the data-refs in DDR was
410 reversed (to make distance vector positive), and the actual
411 distance is negative. */
412 if (dump_enabled_p ())
413 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
414 "dependence distance negative.\n");
415 /* Record a negative dependence distance to later limit the
416 amount of stmt copying / unrolling we can perform.
417 Only need to handle read-after-write dependence. */
419 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b
) == 0
420 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b
) > (unsigned)dist
))
421 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b
) = dist
;
426 && abs (dist
) < *max_vf
)
428 /* The dependence distance requires reduction of the maximal
429 vectorization factor. */
430 *max_vf
= abs (dist
);
431 if (dump_enabled_p ())
432 dump_printf_loc (MSG_NOTE
, vect_location
,
433 "adjusting maximal vectorization factor to %i\n",
437 if (abs (dist
) >= *max_vf
)
439 /* Dependence distance does not create dependence, as far as
440 vectorization is concerned, in this case. */
441 if (dump_enabled_p ())
442 dump_printf_loc (MSG_NOTE
, vect_location
,
443 "dependence distance >= VF.\n");
447 if (dump_enabled_p ())
449 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
450 "not vectorized, possible dependence "
451 "between data-refs ");
452 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dra
));
453 dump_printf (MSG_NOTE
, " and ");
454 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (drb
));
455 dump_printf (MSG_NOTE
, "\n");
464 /* Function vect_analyze_data_ref_dependences.
466 Examine all the data references in the loop, and make sure there do not
467 exist any data dependences between them. Set *MAX_VF according to
468 the maximum vectorization factor the data dependences allow. */
471 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo
, int *max_vf
)
474 struct data_dependence_relation
*ddr
;
476 if (dump_enabled_p ())
477 dump_printf_loc (MSG_NOTE
, vect_location
,
478 "=== vect_analyze_data_ref_dependences ===\n");
480 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo
) = true;
481 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo
),
482 &LOOP_VINFO_DDRS (loop_vinfo
),
483 LOOP_VINFO_LOOP_NEST (loop_vinfo
), true))
486 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo
), i
, ddr
)
487 if (vect_analyze_data_ref_dependence (ddr
, loop_vinfo
, max_vf
))
494 /* Function vect_slp_analyze_data_ref_dependence.
496 Return TRUE if there (might) exist a dependence between a memory-reference
497 DRA and a memory-reference DRB. When versioning for alias may check a
498 dependence at run-time, return FALSE. Adjust *MAX_VF according to
499 the data dependence. */
502 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation
*ddr
)
504 struct data_reference
*dra
= DDR_A (ddr
);
505 struct data_reference
*drb
= DDR_B (ddr
);
507 /* We need to check dependences of statements marked as unvectorizable
508 as well, they still can prohibit vectorization. */
510 /* Independent data accesses. */
511 if (DDR_ARE_DEPENDENT (ddr
) == chrec_known
)
517 /* Read-read is OK. */
518 if (DR_IS_READ (dra
) && DR_IS_READ (drb
))
521 /* If dra and drb are part of the same interleaving chain consider
523 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra
)))
524 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra
)))
525 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb
)))))
528 /* Unknown data dependence. */
529 if (DDR_ARE_DEPENDENT (ddr
) == chrec_dont_know
)
531 if (dump_enabled_p ())
533 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
534 "can't determine dependence between ");
535 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, DR_REF (dra
));
536 dump_printf (MSG_MISSED_OPTIMIZATION
, " and ");
537 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, DR_REF (drb
));
538 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
541 else if (dump_enabled_p ())
543 dump_printf_loc (MSG_NOTE
, vect_location
,
544 "determined dependence between ");
545 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dra
));
546 dump_printf (MSG_NOTE
, " and ");
547 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (drb
));
548 dump_printf (MSG_NOTE
, "\n");
551 /* We do not vectorize basic blocks with write-write dependencies. */
552 if (DR_IS_WRITE (dra
) && DR_IS_WRITE (drb
))
555 /* If we have a read-write dependence check that the load is before the store.
556 When we vectorize basic blocks, vector load can be only before
557 corresponding scalar load, and vector store can be only after its
558 corresponding scalar store. So the order of the acceses is preserved in
559 case the load is before the store. */
560 gimple earlier_stmt
= get_earlier_stmt (DR_STMT (dra
), DR_STMT (drb
));
561 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt
))))
563 /* That only holds for load-store pairs taking part in vectorization. */
564 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dra
)))
565 && STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (drb
))))
573 /* Function vect_analyze_data_ref_dependences.
575 Examine all the data references in the basic-block, and make sure there
576 do not exist any data dependences between them. Set *MAX_VF according to
577 the maximum vectorization factor the data dependences allow. */
580 vect_slp_analyze_data_ref_dependences (bb_vec_info bb_vinfo
)
582 struct data_dependence_relation
*ddr
;
585 if (dump_enabled_p ())
586 dump_printf_loc (MSG_NOTE
, vect_location
,
587 "=== vect_slp_analyze_data_ref_dependences ===\n");
589 if (!compute_all_dependences (BB_VINFO_DATAREFS (bb_vinfo
),
590 &BB_VINFO_DDRS (bb_vinfo
),
594 FOR_EACH_VEC_ELT (BB_VINFO_DDRS (bb_vinfo
), i
, ddr
)
595 if (vect_slp_analyze_data_ref_dependence (ddr
))
602 /* Function vect_compute_data_ref_alignment
604 Compute the misalignment of the data reference DR.
607 1. If during the misalignment computation it is found that the data reference
608 cannot be vectorized then false is returned.
609 2. DR_MISALIGNMENT (DR) is defined.
611 FOR NOW: No analysis is actually performed. Misalignment is calculated
612 only for trivial cases. TODO. */
615 vect_compute_data_ref_alignment (struct data_reference
*dr
)
617 gimple stmt
= DR_STMT (dr
);
618 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
619 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
620 struct loop
*loop
= NULL
;
621 tree ref
= DR_REF (dr
);
623 tree base
, base_addr
;
625 tree misalign
= NULL_TREE
;
627 unsigned HOST_WIDE_INT alignment
;
629 if (dump_enabled_p ())
630 dump_printf_loc (MSG_NOTE
, vect_location
,
631 "vect_compute_data_ref_alignment:\n");
634 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
636 /* Initialize misalignment to unknown. */
637 SET_DR_MISALIGNMENT (dr
, -1);
639 /* Strided accesses perform only component accesses, misalignment information
640 is irrelevant for them. */
641 if (STMT_VINFO_STRIDED_P (stmt_info
)
642 && !STMT_VINFO_GROUPED_ACCESS (stmt_info
))
645 if (tree_fits_shwi_p (DR_STEP (dr
)))
646 misalign
= DR_INIT (dr
);
647 aligned_to
= DR_ALIGNED_TO (dr
);
648 base_addr
= DR_BASE_ADDRESS (dr
);
649 vectype
= STMT_VINFO_VECTYPE (stmt_info
);
651 /* In case the dataref is in an inner-loop of the loop that is being
652 vectorized (LOOP), we use the base and misalignment information
653 relative to the outer-loop (LOOP). This is ok only if the misalignment
654 stays the same throughout the execution of the inner-loop, which is why
655 we have to check that the stride of the dataref in the inner-loop evenly
656 divides by the vector size. */
657 if (loop
&& nested_in_vect_loop_p (loop
, stmt
))
659 tree step
= DR_STEP (dr
);
661 if (tree_fits_shwi_p (step
)
662 && tree_to_shwi (step
) % GET_MODE_SIZE (TYPE_MODE (vectype
)) == 0)
664 if (dump_enabled_p ())
665 dump_printf_loc (MSG_NOTE
, vect_location
,
666 "inner step divides the vector-size.\n");
667 misalign
= STMT_VINFO_DR_INIT (stmt_info
);
668 aligned_to
= STMT_VINFO_DR_ALIGNED_TO (stmt_info
);
669 base_addr
= STMT_VINFO_DR_BASE_ADDRESS (stmt_info
);
673 if (dump_enabled_p ())
674 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
675 "inner step doesn't divide the vector-size.\n");
676 misalign
= NULL_TREE
;
680 /* Similarly we can only use base and misalignment information relative to
681 an innermost loop if the misalignment stays the same throughout the
682 execution of the loop. As above, this is the case if the stride of
683 the dataref evenly divides by the vector size. */
686 tree step
= DR_STEP (dr
);
687 unsigned vf
= loop
? LOOP_VINFO_VECT_FACTOR (loop_vinfo
) : 1;
689 if (tree_fits_shwi_p (step
)
690 && ((tree_to_shwi (step
) * vf
)
691 % GET_MODE_SIZE (TYPE_MODE (vectype
)) != 0))
693 if (dump_enabled_p ())
694 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
695 "step doesn't divide the vector-size.\n");
696 misalign
= NULL_TREE
;
700 alignment
= TYPE_ALIGN_UNIT (vectype
);
702 if ((compare_tree_int (aligned_to
, alignment
) < 0)
705 if (dump_enabled_p ())
707 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
708 "Unknown alignment for access: ");
709 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, ref
);
710 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
715 /* To look at alignment of the base we have to preserve an inner MEM_REF
716 as that carries alignment information of the actual access. */
718 while (handled_component_p (base
))
719 base
= TREE_OPERAND (base
, 0);
720 if (TREE_CODE (base
) == MEM_REF
)
721 base
= build2 (MEM_REF
, TREE_TYPE (base
), base_addr
,
722 build_int_cst (TREE_TYPE (TREE_OPERAND (base
, 1)), 0));
724 if (get_object_alignment (base
) >= TYPE_ALIGN (vectype
))
727 base_aligned
= false;
731 /* Strip an inner MEM_REF to a bare decl if possible. */
732 if (TREE_CODE (base
) == MEM_REF
733 && integer_zerop (TREE_OPERAND (base
, 1))
734 && TREE_CODE (TREE_OPERAND (base
, 0)) == ADDR_EXPR
)
735 base
= TREE_OPERAND (TREE_OPERAND (base
, 0), 0);
737 if (!vect_can_force_dr_alignment_p (base
, TYPE_ALIGN (vectype
)))
739 if (dump_enabled_p ())
741 dump_printf_loc (MSG_NOTE
, vect_location
,
742 "can't force alignment of ref: ");
743 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, ref
);
744 dump_printf (MSG_NOTE
, "\n");
749 /* Force the alignment of the decl.
750 NOTE: This is the only change to the code we make during
751 the analysis phase, before deciding to vectorize the loop. */
752 if (dump_enabled_p ())
754 dump_printf_loc (MSG_NOTE
, vect_location
, "force alignment of ");
755 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, ref
);
756 dump_printf (MSG_NOTE
, "\n");
759 ((dataref_aux
*)dr
->aux
)->base_decl
= base
;
760 ((dataref_aux
*)dr
->aux
)->base_misaligned
= true;
763 /* If this is a backward running DR then first access in the larger
764 vectype actually is N-1 elements before the address in the DR.
765 Adjust misalign accordingly. */
766 if (tree_int_cst_sgn (DR_STEP (dr
)) < 0)
768 tree offset
= ssize_int (TYPE_VECTOR_SUBPARTS (vectype
) - 1);
769 /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
770 otherwise we wouldn't be here. */
771 offset
= fold_build2 (MULT_EXPR
, ssizetype
, offset
, DR_STEP (dr
));
772 /* PLUS because DR_STEP was negative. */
773 misalign
= size_binop (PLUS_EXPR
, misalign
, offset
);
776 SET_DR_MISALIGNMENT (dr
,
777 wi::mod_floor (misalign
, alignment
, SIGNED
).to_uhwi ());
779 if (dump_enabled_p ())
781 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
782 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr
));
783 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, ref
);
784 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
791 /* Function vect_compute_data_refs_alignment
793 Compute the misalignment of data references in the loop.
794 Return FALSE if a data reference is found that cannot be vectorized. */
797 vect_compute_data_refs_alignment (loop_vec_info loop_vinfo
,
798 bb_vec_info bb_vinfo
)
800 vec
<data_reference_p
> datarefs
;
801 struct data_reference
*dr
;
805 datarefs
= LOOP_VINFO_DATAREFS (loop_vinfo
);
807 datarefs
= BB_VINFO_DATAREFS (bb_vinfo
);
809 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
810 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr
)))
811 && !vect_compute_data_ref_alignment (dr
))
815 /* Mark unsupported statement as unvectorizable. */
816 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr
))) = false;
827 /* Function vect_update_misalignment_for_peel
829 DR - the data reference whose misalignment is to be adjusted.
830 DR_PEEL - the data reference whose misalignment is being made
831 zero in the vector loop by the peel.
832 NPEEL - the number of iterations in the peel loop if the misalignment
833 of DR_PEEL is known at compile time. */
836 vect_update_misalignment_for_peel (struct data_reference
*dr
,
837 struct data_reference
*dr_peel
, int npeel
)
840 vec
<dr_p
> same_align_drs
;
841 struct data_reference
*current_dr
;
842 int dr_size
= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr
))));
843 int dr_peel_size
= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel
))));
844 stmt_vec_info stmt_info
= vinfo_for_stmt (DR_STMT (dr
));
845 stmt_vec_info peel_stmt_info
= vinfo_for_stmt (DR_STMT (dr_peel
));
847 /* For interleaved data accesses the step in the loop must be multiplied by
848 the size of the interleaving group. */
849 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
850 dr_size
*= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info
)));
851 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info
))
852 dr_peel_size
*= GROUP_SIZE (peel_stmt_info
);
854 /* It can be assumed that the data refs with the same alignment as dr_peel
855 are aligned in the vector loop. */
857 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel
)));
858 FOR_EACH_VEC_ELT (same_align_drs
, i
, current_dr
)
860 if (current_dr
!= dr
)
862 gcc_assert (DR_MISALIGNMENT (dr
) / dr_size
==
863 DR_MISALIGNMENT (dr_peel
) / dr_peel_size
);
864 SET_DR_MISALIGNMENT (dr
, 0);
868 if (known_alignment_for_access_p (dr
)
869 && known_alignment_for_access_p (dr_peel
))
871 bool negative
= tree_int_cst_compare (DR_STEP (dr
), size_zero_node
) < 0;
872 int misal
= DR_MISALIGNMENT (dr
);
873 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
874 misal
+= negative
? -npeel
* dr_size
: npeel
* dr_size
;
875 misal
&= (TYPE_ALIGN (vectype
) / BITS_PER_UNIT
) - 1;
876 SET_DR_MISALIGNMENT (dr
, misal
);
880 if (dump_enabled_p ())
881 dump_printf_loc (MSG_NOTE
, vect_location
, "Setting misalignment to -1.\n");
882 SET_DR_MISALIGNMENT (dr
, -1);
886 /* Function vect_verify_datarefs_alignment
888 Return TRUE if all data references in the loop can be
889 handled with respect to alignment. */
892 vect_verify_datarefs_alignment (loop_vec_info loop_vinfo
, bb_vec_info bb_vinfo
)
894 vec
<data_reference_p
> datarefs
;
895 struct data_reference
*dr
;
896 enum dr_alignment_support supportable_dr_alignment
;
900 datarefs
= LOOP_VINFO_DATAREFS (loop_vinfo
);
902 datarefs
= BB_VINFO_DATAREFS (bb_vinfo
);
904 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
906 gimple stmt
= DR_STMT (dr
);
907 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
909 if (!STMT_VINFO_RELEVANT_P (stmt_info
))
912 /* For interleaving, only the alignment of the first access matters.
913 Skip statements marked as not vectorizable. */
914 if ((STMT_VINFO_GROUPED_ACCESS (stmt_info
)
915 && GROUP_FIRST_ELEMENT (stmt_info
) != stmt
)
916 || !STMT_VINFO_VECTORIZABLE (stmt_info
))
919 /* Strided accesses perform only component accesses, alignment is
920 irrelevant for them. */
921 if (STMT_VINFO_STRIDED_P (stmt_info
)
922 && !STMT_VINFO_GROUPED_ACCESS (stmt_info
))
925 supportable_dr_alignment
= vect_supportable_dr_alignment (dr
, false);
926 if (!supportable_dr_alignment
)
928 if (dump_enabled_p ())
931 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
932 "not vectorized: unsupported unaligned load.");
934 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
935 "not vectorized: unsupported unaligned "
938 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
940 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
944 if (supportable_dr_alignment
!= dr_aligned
&& dump_enabled_p ())
945 dump_printf_loc (MSG_NOTE
, vect_location
,
946 "Vectorizing an unaligned access.\n");
951 /* Given an memory reference EXP return whether its alignment is less
955 not_size_aligned (tree exp
)
957 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp
))))
960 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp
)))
961 > get_object_alignment (exp
));
964 /* Function vector_alignment_reachable_p
966 Return true if vector alignment for DR is reachable by peeling
967 a few loop iterations. Return false otherwise. */
970 vector_alignment_reachable_p (struct data_reference
*dr
)
972 gimple stmt
= DR_STMT (dr
);
973 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
974 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
976 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
978 /* For interleaved access we peel only if number of iterations in
979 the prolog loop ({VF - misalignment}), is a multiple of the
980 number of the interleaved accesses. */
981 int elem_size
, mis_in_elements
;
982 int nelements
= TYPE_VECTOR_SUBPARTS (vectype
);
984 /* FORNOW: handle only known alignment. */
985 if (!known_alignment_for_access_p (dr
))
988 elem_size
= GET_MODE_SIZE (TYPE_MODE (vectype
)) / nelements
;
989 mis_in_elements
= DR_MISALIGNMENT (dr
) / elem_size
;
991 if ((nelements
- mis_in_elements
) % GROUP_SIZE (stmt_info
))
995 /* If misalignment is known at the compile time then allow peeling
996 only if natural alignment is reachable through peeling. */
997 if (known_alignment_for_access_p (dr
) && !aligned_access_p (dr
))
999 HOST_WIDE_INT elmsize
=
1000 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype
)));
1001 if (dump_enabled_p ())
1003 dump_printf_loc (MSG_NOTE
, vect_location
,
1004 "data size =" HOST_WIDE_INT_PRINT_DEC
, elmsize
);
1005 dump_printf (MSG_NOTE
,
1006 ". misalignment = %d.\n", DR_MISALIGNMENT (dr
));
1008 if (DR_MISALIGNMENT (dr
) % elmsize
)
1010 if (dump_enabled_p ())
1011 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1012 "data size does not divide the misalignment.\n");
1017 if (!known_alignment_for_access_p (dr
))
1019 tree type
= TREE_TYPE (DR_REF (dr
));
1020 bool is_packed
= not_size_aligned (DR_REF (dr
));
1021 if (dump_enabled_p ())
1022 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1023 "Unknown misalignment, is_packed = %d\n",is_packed
);
1024 if ((TYPE_USER_ALIGN (type
) && !is_packed
)
1025 || targetm
.vectorize
.vector_alignment_reachable (type
, is_packed
))
1035 /* Calculate the cost of the memory access represented by DR. */
1038 vect_get_data_access_cost (struct data_reference
*dr
,
1039 unsigned int *inside_cost
,
1040 unsigned int *outside_cost
,
1041 stmt_vector_for_cost
*body_cost_vec
)
1043 gimple stmt
= DR_STMT (dr
);
1044 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
1045 int nunits
= TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info
));
1046 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
1047 int vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
1048 int ncopies
= vf
/ nunits
;
1050 if (DR_IS_READ (dr
))
1051 vect_get_load_cost (dr
, ncopies
, true, inside_cost
, outside_cost
,
1052 NULL
, body_cost_vec
, false);
1054 vect_get_store_cost (dr
, ncopies
, inside_cost
, body_cost_vec
);
1056 if (dump_enabled_p ())
1057 dump_printf_loc (MSG_NOTE
, vect_location
,
1058 "vect_get_data_access_cost: inside_cost = %d, "
1059 "outside_cost = %d.\n", *inside_cost
, *outside_cost
);
1063 /* Insert DR into peeling hash table with NPEEL as key. */
1066 vect_peeling_hash_insert (loop_vec_info loop_vinfo
, struct data_reference
*dr
,
1069 struct _vect_peel_info elem
, *slot
;
1070 _vect_peel_info
**new_slot
;
1071 bool supportable_dr_alignment
= vect_supportable_dr_alignment (dr
, true);
1074 slot
= LOOP_VINFO_PEELING_HTAB (loop_vinfo
)->find (&elem
);
1079 slot
= XNEW (struct _vect_peel_info
);
1080 slot
->npeel
= npeel
;
1084 = LOOP_VINFO_PEELING_HTAB (loop_vinfo
)->find_slot (slot
, INSERT
);
1088 if (!supportable_dr_alignment
1089 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo
)))
1090 slot
->count
+= VECT_MAX_COST
;
1094 /* Traverse peeling hash table to find peeling option that aligns maximum
1095 number of data accesses. */
1098 vect_peeling_hash_get_most_frequent (_vect_peel_info
**slot
,
1099 _vect_peel_extended_info
*max
)
1101 vect_peel_info elem
= *slot
;
1103 if (elem
->count
> max
->peel_info
.count
1104 || (elem
->count
== max
->peel_info
.count
1105 && max
->peel_info
.npeel
> elem
->npeel
))
1107 max
->peel_info
.npeel
= elem
->npeel
;
1108 max
->peel_info
.count
= elem
->count
;
1109 max
->peel_info
.dr
= elem
->dr
;
1116 /* Traverse peeling hash table and calculate cost for each peeling option.
1117 Find the one with the lowest cost. */
1120 vect_peeling_hash_get_lowest_cost (_vect_peel_info
**slot
,
1121 _vect_peel_extended_info
*min
)
1123 vect_peel_info elem
= *slot
;
1124 int save_misalignment
, dummy
;
1125 unsigned int inside_cost
= 0, outside_cost
= 0, i
;
1126 gimple stmt
= DR_STMT (elem
->dr
);
1127 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
1128 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
1129 vec
<data_reference_p
> datarefs
= LOOP_VINFO_DATAREFS (loop_vinfo
);
1130 struct data_reference
*dr
;
1131 stmt_vector_for_cost prologue_cost_vec
, body_cost_vec
, epilogue_cost_vec
;
1133 prologue_cost_vec
.create (2);
1134 body_cost_vec
.create (2);
1135 epilogue_cost_vec
.create (2);
1137 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
1139 stmt
= DR_STMT (dr
);
1140 stmt_info
= vinfo_for_stmt (stmt
);
1141 /* For interleaving, only the alignment of the first access
1143 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1144 && GROUP_FIRST_ELEMENT (stmt_info
) != stmt
)
1147 save_misalignment
= DR_MISALIGNMENT (dr
);
1148 vect_update_misalignment_for_peel (dr
, elem
->dr
, elem
->npeel
);
1149 vect_get_data_access_cost (dr
, &inside_cost
, &outside_cost
,
1151 SET_DR_MISALIGNMENT (dr
, save_misalignment
);
1154 outside_cost
+= vect_get_known_peeling_cost
1155 (loop_vinfo
, elem
->npeel
, &dummy
,
1156 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo
),
1157 &prologue_cost_vec
, &epilogue_cost_vec
);
1159 /* Prologue and epilogue costs are added to the target model later.
1160 These costs depend only on the scalar iteration cost, the
1161 number of peeling iterations finally chosen, and the number of
1162 misaligned statements. So discard the information found here. */
1163 prologue_cost_vec
.release ();
1164 epilogue_cost_vec
.release ();
1166 if (inside_cost
< min
->inside_cost
1167 || (inside_cost
== min
->inside_cost
&& outside_cost
< min
->outside_cost
))
1169 min
->inside_cost
= inside_cost
;
1170 min
->outside_cost
= outside_cost
;
1171 min
->body_cost_vec
.release ();
1172 min
->body_cost_vec
= body_cost_vec
;
1173 min
->peel_info
.dr
= elem
->dr
;
1174 min
->peel_info
.npeel
= elem
->npeel
;
1177 body_cost_vec
.release ();
1183 /* Choose best peeling option by traversing peeling hash table and either
1184 choosing an option with the lowest cost (if cost model is enabled) or the
1185 option that aligns as many accesses as possible. */
1187 static struct data_reference
*
1188 vect_peeling_hash_choose_best_peeling (loop_vec_info loop_vinfo
,
1189 unsigned int *npeel
,
1190 stmt_vector_for_cost
*body_cost_vec
)
1192 struct _vect_peel_extended_info res
;
1194 res
.peel_info
.dr
= NULL
;
1195 res
.body_cost_vec
= stmt_vector_for_cost ();
1197 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo
)))
1199 res
.inside_cost
= INT_MAX
;
1200 res
.outside_cost
= INT_MAX
;
1201 LOOP_VINFO_PEELING_HTAB (loop_vinfo
)
1202 ->traverse
<_vect_peel_extended_info
*,
1203 vect_peeling_hash_get_lowest_cost
> (&res
);
1207 res
.peel_info
.count
= 0;
1208 LOOP_VINFO_PEELING_HTAB (loop_vinfo
)
1209 ->traverse
<_vect_peel_extended_info
*,
1210 vect_peeling_hash_get_most_frequent
> (&res
);
1213 *npeel
= res
.peel_info
.npeel
;
1214 *body_cost_vec
= res
.body_cost_vec
;
1215 return res
.peel_info
.dr
;
1219 /* Function vect_enhance_data_refs_alignment
1221 This pass will use loop versioning and loop peeling in order to enhance
1222 the alignment of data references in the loop.
1224 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1225 original loop is to be vectorized. Any other loops that are created by
1226 the transformations performed in this pass - are not supposed to be
1227 vectorized. This restriction will be relaxed.
1229 This pass will require a cost model to guide it whether to apply peeling
1230 or versioning or a combination of the two. For example, the scheme that
1231 intel uses when given a loop with several memory accesses, is as follows:
1232 choose one memory access ('p') which alignment you want to force by doing
1233 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1234 other accesses are not necessarily aligned, or (2) use loop versioning to
1235 generate one loop in which all accesses are aligned, and another loop in
1236 which only 'p' is necessarily aligned.
1238 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1239 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
1240 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1242 Devising a cost model is the most critical aspect of this work. It will
1243 guide us on which access to peel for, whether to use loop versioning, how
1244 many versions to create, etc. The cost model will probably consist of
1245 generic considerations as well as target specific considerations (on
1246 powerpc for example, misaligned stores are more painful than misaligned
1249 Here are the general steps involved in alignment enhancements:
1251 -- original loop, before alignment analysis:
1252 for (i=0; i<N; i++){
1253 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1254 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1257 -- After vect_compute_data_refs_alignment:
1258 for (i=0; i<N; i++){
1259 x = q[i]; # DR_MISALIGNMENT(q) = 3
1260 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1263 -- Possibility 1: we do loop versioning:
1265 for (i=0; i<N; i++){ # loop 1A
1266 x = q[i]; # DR_MISALIGNMENT(q) = 3
1267 p[i] = y; # DR_MISALIGNMENT(p) = 0
1271 for (i=0; i<N; i++){ # loop 1B
1272 x = q[i]; # DR_MISALIGNMENT(q) = 3
1273 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1277 -- Possibility 2: we do loop peeling:
1278 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1282 for (i = 3; i < N; i++){ # loop 2A
1283 x = q[i]; # DR_MISALIGNMENT(q) = 0
1284 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1287 -- Possibility 3: combination of loop peeling and versioning:
1288 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1293 for (i = 3; i<N; i++){ # loop 3A
1294 x = q[i]; # DR_MISALIGNMENT(q) = 0
1295 p[i] = y; # DR_MISALIGNMENT(p) = 0
1299 for (i = 3; i<N; i++){ # loop 3B
1300 x = q[i]; # DR_MISALIGNMENT(q) = 0
1301 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1305 These loops are later passed to loop_transform to be vectorized. The
1306 vectorizer will use the alignment information to guide the transformation
1307 (whether to generate regular loads/stores, or with special handling for
1311 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo
)
1313 vec
<data_reference_p
> datarefs
= LOOP_VINFO_DATAREFS (loop_vinfo
);
1314 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1315 enum dr_alignment_support supportable_dr_alignment
;
1316 struct data_reference
*dr0
= NULL
, *first_store
= NULL
;
1317 struct data_reference
*dr
;
1319 bool do_peeling
= false;
1320 bool do_versioning
= false;
1323 stmt_vec_info stmt_info
;
1324 unsigned int npeel
= 0;
1325 bool all_misalignments_unknown
= true;
1326 unsigned int vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
1327 unsigned possible_npeel_number
= 1;
1329 unsigned int nelements
, mis
, same_align_drs_max
= 0;
1330 stmt_vector_for_cost body_cost_vec
= stmt_vector_for_cost ();
1332 if (dump_enabled_p ())
1333 dump_printf_loc (MSG_NOTE
, vect_location
,
1334 "=== vect_enhance_data_refs_alignment ===\n");
1336 /* While cost model enhancements are expected in the future, the high level
1337 view of the code at this time is as follows:
1339 A) If there is a misaligned access then see if peeling to align
1340 this access can make all data references satisfy
1341 vect_supportable_dr_alignment. If so, update data structures
1342 as needed and return true.
1344 B) If peeling wasn't possible and there is a data reference with an
1345 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1346 then see if loop versioning checks can be used to make all data
1347 references satisfy vect_supportable_dr_alignment. If so, update
1348 data structures as needed and return true.
1350 C) If neither peeling nor versioning were successful then return false if
1351 any data reference does not satisfy vect_supportable_dr_alignment.
1353 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1355 Note, Possibility 3 above (which is peeling and versioning together) is not
1356 being done at this time. */
1358 /* (1) Peeling to force alignment. */
1360 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1362 + How many accesses will become aligned due to the peeling
1363 - How many accesses will become unaligned due to the peeling,
1364 and the cost of misaligned accesses.
1365 - The cost of peeling (the extra runtime checks, the increase
1368 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
1370 stmt
= DR_STMT (dr
);
1371 stmt_info
= vinfo_for_stmt (stmt
);
1373 if (!STMT_VINFO_RELEVANT_P (stmt_info
))
1376 /* For interleaving, only the alignment of the first access
1378 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1379 && GROUP_FIRST_ELEMENT (stmt_info
) != stmt
)
1382 /* For invariant accesses there is nothing to enhance. */
1383 if (integer_zerop (DR_STEP (dr
)))
1386 /* Strided accesses perform only component accesses, alignment is
1387 irrelevant for them. */
1388 if (STMT_VINFO_STRIDED_P (stmt_info
)
1389 && !STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1392 supportable_dr_alignment
= vect_supportable_dr_alignment (dr
, true);
1393 do_peeling
= vector_alignment_reachable_p (dr
);
1396 if (known_alignment_for_access_p (dr
))
1398 unsigned int npeel_tmp
;
1399 bool negative
= tree_int_cst_compare (DR_STEP (dr
),
1400 size_zero_node
) < 0;
1402 /* Save info about DR in the hash table. */
1403 if (!LOOP_VINFO_PEELING_HTAB (loop_vinfo
))
1404 LOOP_VINFO_PEELING_HTAB (loop_vinfo
)
1405 = new hash_table
<peel_info_hasher
> (1);
1407 vectype
= STMT_VINFO_VECTYPE (stmt_info
);
1408 nelements
= TYPE_VECTOR_SUBPARTS (vectype
);
1409 mis
= DR_MISALIGNMENT (dr
) / GET_MODE_SIZE (TYPE_MODE (
1410 TREE_TYPE (DR_REF (dr
))));
1411 npeel_tmp
= (negative
1412 ? (mis
- nelements
) : (nelements
- mis
))
1415 /* For multiple types, it is possible that the bigger type access
1416 will have more than one peeling option. E.g., a loop with two
1417 types: one of size (vector size / 4), and the other one of
1418 size (vector size / 8). Vectorization factor will 8. If both
1419 access are misaligned by 3, the first one needs one scalar
1420 iteration to be aligned, and the second one needs 5. But the
1421 the first one will be aligned also by peeling 5 scalar
1422 iterations, and in that case both accesses will be aligned.
1423 Hence, except for the immediate peeling amount, we also want
1424 to try to add full vector size, while we don't exceed
1425 vectorization factor.
1426 We do this automtically for cost model, since we calculate cost
1427 for every peeling option. */
1428 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo
)))
1430 if (STMT_SLP_TYPE (stmt_info
))
1431 possible_npeel_number
1432 = (vf
* GROUP_SIZE (stmt_info
)) / nelements
;
1434 possible_npeel_number
= vf
/ nelements
;
1437 /* Handle the aligned case. We may decide to align some other
1438 access, making DR unaligned. */
1439 if (DR_MISALIGNMENT (dr
) == 0)
1442 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo
)))
1443 possible_npeel_number
++;
1446 for (j
= 0; j
< possible_npeel_number
; j
++)
1448 vect_peeling_hash_insert (loop_vinfo
, dr
, npeel_tmp
);
1449 npeel_tmp
+= nelements
;
1452 all_misalignments_unknown
= false;
1453 /* Data-ref that was chosen for the case that all the
1454 misalignments are unknown is not relevant anymore, since we
1455 have a data-ref with known alignment. */
1460 /* If we don't know any misalignment values, we prefer
1461 peeling for data-ref that has the maximum number of data-refs
1462 with the same alignment, unless the target prefers to align
1463 stores over load. */
1464 if (all_misalignments_unknown
)
1466 unsigned same_align_drs
1467 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info
).length ();
1469 || same_align_drs_max
< same_align_drs
)
1471 same_align_drs_max
= same_align_drs
;
1474 /* For data-refs with the same number of related
1475 accesses prefer the one where the misalign
1476 computation will be invariant in the outermost loop. */
1477 else if (same_align_drs_max
== same_align_drs
)
1479 struct loop
*ivloop0
, *ivloop
;
1480 ivloop0
= outermost_invariant_loop_for_expr
1481 (loop
, DR_BASE_ADDRESS (dr0
));
1482 ivloop
= outermost_invariant_loop_for_expr
1483 (loop
, DR_BASE_ADDRESS (dr
));
1484 if ((ivloop
&& !ivloop0
)
1485 || (ivloop
&& ivloop0
1486 && flow_loop_nested_p (ivloop
, ivloop0
)))
1490 if (!first_store
&& DR_IS_WRITE (dr
))
1494 /* If there are both known and unknown misaligned accesses in the
1495 loop, we choose peeling amount according to the known
1497 if (!supportable_dr_alignment
)
1500 if (!first_store
&& DR_IS_WRITE (dr
))
1507 if (!aligned_access_p (dr
))
1509 if (dump_enabled_p ())
1510 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1511 "vector alignment may not be reachable\n");
1517 /* Check if we can possibly peel the loop. */
1518 if (!vect_can_advance_ivs_p (loop_vinfo
)
1519 || !slpeel_can_duplicate_loop_p (loop
, single_exit (loop
)))
1523 && all_misalignments_unknown
1524 && vect_supportable_dr_alignment (dr0
, false))
1526 /* Check if the target requires to prefer stores over loads, i.e., if
1527 misaligned stores are more expensive than misaligned loads (taking
1528 drs with same alignment into account). */
1529 if (first_store
&& DR_IS_READ (dr0
))
1531 unsigned int load_inside_cost
= 0, load_outside_cost
= 0;
1532 unsigned int store_inside_cost
= 0, store_outside_cost
= 0;
1533 unsigned int load_inside_penalty
= 0, load_outside_penalty
= 0;
1534 unsigned int store_inside_penalty
= 0, store_outside_penalty
= 0;
1535 stmt_vector_for_cost dummy
;
1538 vect_get_data_access_cost (dr0
, &load_inside_cost
, &load_outside_cost
,
1540 vect_get_data_access_cost (first_store
, &store_inside_cost
,
1541 &store_outside_cost
, &dummy
);
1545 /* Calculate the penalty for leaving FIRST_STORE unaligned (by
1546 aligning the load DR0). */
1547 load_inside_penalty
= store_inside_cost
;
1548 load_outside_penalty
= store_outside_cost
;
1550 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1551 DR_STMT (first_store
))).iterate (i
, &dr
);
1553 if (DR_IS_READ (dr
))
1555 load_inside_penalty
+= load_inside_cost
;
1556 load_outside_penalty
+= load_outside_cost
;
1560 load_inside_penalty
+= store_inside_cost
;
1561 load_outside_penalty
+= store_outside_cost
;
1564 /* Calculate the penalty for leaving DR0 unaligned (by
1565 aligning the FIRST_STORE). */
1566 store_inside_penalty
= load_inside_cost
;
1567 store_outside_penalty
= load_outside_cost
;
1569 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1570 DR_STMT (dr0
))).iterate (i
, &dr
);
1572 if (DR_IS_READ (dr
))
1574 store_inside_penalty
+= load_inside_cost
;
1575 store_outside_penalty
+= load_outside_cost
;
1579 store_inside_penalty
+= store_inside_cost
;
1580 store_outside_penalty
+= store_outside_cost
;
1583 if (load_inside_penalty
> store_inside_penalty
1584 || (load_inside_penalty
== store_inside_penalty
1585 && load_outside_penalty
> store_outside_penalty
))
1589 /* In case there are only loads with different unknown misalignments, use
1590 peeling only if it may help to align other accesses in the loop or
1591 if it may help improving load bandwith when we'd end up using
1593 tree dr0_vt
= STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr0
)));
1595 && !STMT_VINFO_SAME_ALIGN_REFS (
1596 vinfo_for_stmt (DR_STMT (dr0
))).length ()
1597 && (vect_supportable_dr_alignment (dr0
, false)
1598 != dr_unaligned_supported
1599 || (builtin_vectorization_cost (vector_load
, dr0_vt
, 0)
1600 == builtin_vectorization_cost (unaligned_load
, dr0_vt
, -1))))
1604 if (do_peeling
&& !dr0
)
1606 /* Peeling is possible, but there is no data access that is not supported
1607 unless aligned. So we try to choose the best possible peeling. */
1609 /* We should get here only if there are drs with known misalignment. */
1610 gcc_assert (!all_misalignments_unknown
);
1612 /* Choose the best peeling from the hash table. */
1613 dr0
= vect_peeling_hash_choose_best_peeling (loop_vinfo
, &npeel
,
1621 stmt
= DR_STMT (dr0
);
1622 stmt_info
= vinfo_for_stmt (stmt
);
1623 vectype
= STMT_VINFO_VECTYPE (stmt_info
);
1624 nelements
= TYPE_VECTOR_SUBPARTS (vectype
);
1626 if (known_alignment_for_access_p (dr0
))
1628 bool negative
= tree_int_cst_compare (DR_STEP (dr0
),
1629 size_zero_node
) < 0;
1632 /* Since it's known at compile time, compute the number of
1633 iterations in the peeled loop (the peeling factor) for use in
1634 updating DR_MISALIGNMENT values. The peeling factor is the
1635 vectorization factor minus the misalignment as an element
1637 mis
= DR_MISALIGNMENT (dr0
);
1638 mis
/= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0
))));
1639 npeel
= ((negative
? mis
- nelements
: nelements
- mis
)
1643 /* For interleaved data access every iteration accesses all the
1644 members of the group, therefore we divide the number of iterations
1645 by the group size. */
1646 stmt_info
= vinfo_for_stmt (DR_STMT (dr0
));
1647 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1648 npeel
/= GROUP_SIZE (stmt_info
);
1650 if (dump_enabled_p ())
1651 dump_printf_loc (MSG_NOTE
, vect_location
,
1652 "Try peeling by %d\n", npeel
);
1655 /* Ensure that all data refs can be vectorized after the peel. */
1656 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
1658 int save_misalignment
;
1663 stmt
= DR_STMT (dr
);
1664 stmt_info
= vinfo_for_stmt (stmt
);
1665 /* For interleaving, only the alignment of the first access
1667 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1668 && GROUP_FIRST_ELEMENT (stmt_info
) != stmt
)
1671 /* Strided accesses perform only component accesses, alignment is
1672 irrelevant for them. */
1673 if (STMT_VINFO_STRIDED_P (stmt_info
)
1674 && !STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1677 save_misalignment
= DR_MISALIGNMENT (dr
);
1678 vect_update_misalignment_for_peel (dr
, dr0
, npeel
);
1679 supportable_dr_alignment
= vect_supportable_dr_alignment (dr
, false);
1680 SET_DR_MISALIGNMENT (dr
, save_misalignment
);
1682 if (!supportable_dr_alignment
)
1689 if (do_peeling
&& known_alignment_for_access_p (dr0
) && npeel
== 0)
1691 stat
= vect_verify_datarefs_alignment (loop_vinfo
, NULL
);
1696 body_cost_vec
.release ();
1701 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
1704 unsigned max_allowed_peel
1705 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT
);
1706 if (max_allowed_peel
!= (unsigned)-1)
1708 unsigned max_peel
= npeel
;
1711 gimple dr_stmt
= DR_STMT (dr0
);
1712 stmt_vec_info vinfo
= vinfo_for_stmt (dr_stmt
);
1713 tree vtype
= STMT_VINFO_VECTYPE (vinfo
);
1714 max_peel
= TYPE_VECTOR_SUBPARTS (vtype
) - 1;
1716 if (max_peel
> max_allowed_peel
)
1719 if (dump_enabled_p ())
1720 dump_printf_loc (MSG_NOTE
, vect_location
,
1721 "Disable peeling, max peels reached: %d\n", max_peel
);
1726 /* Cost model #2 - if peeling may result in a remaining loop not
1727 iterating enough to be vectorized then do not peel. */
1729 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
))
1732 = npeel
== 0 ? LOOP_VINFO_VECT_FACTOR (loop_vinfo
) - 1 : npeel
;
1733 if (LOOP_VINFO_INT_NITERS (loop_vinfo
)
1734 < LOOP_VINFO_VECT_FACTOR (loop_vinfo
) + max_peel
)
1740 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1741 If the misalignment of DR_i is identical to that of dr0 then set
1742 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
1743 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1744 by the peeling factor times the element size of DR_i (MOD the
1745 vectorization factor times the size). Otherwise, the
1746 misalignment of DR_i must be set to unknown. */
1747 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
1749 vect_update_misalignment_for_peel (dr
, dr0
, npeel
);
1751 LOOP_VINFO_UNALIGNED_DR (loop_vinfo
) = dr0
;
1753 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
) = npeel
;
1755 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
)
1756 = DR_MISALIGNMENT (dr0
);
1757 SET_DR_MISALIGNMENT (dr0
, 0);
1758 if (dump_enabled_p ())
1760 dump_printf_loc (MSG_NOTE
, vect_location
,
1761 "Alignment of access forced using peeling.\n");
1762 dump_printf_loc (MSG_NOTE
, vect_location
,
1763 "Peeling for alignment will be applied.\n");
1765 /* The inside-loop cost will be accounted for in vectorizable_load
1766 and vectorizable_store correctly with adjusted alignments.
1767 Drop the body_cst_vec on the floor here. */
1768 body_cost_vec
.release ();
1770 stat
= vect_verify_datarefs_alignment (loop_vinfo
, NULL
);
1776 body_cost_vec
.release ();
1778 /* (2) Versioning to force alignment. */
1780 /* Try versioning if:
1781 1) optimize loop for speed
1782 2) there is at least one unsupported misaligned data ref with an unknown
1784 3) all misaligned data refs with a known misalignment are supported, and
1785 4) the number of runtime alignment checks is within reason. */
1788 optimize_loop_nest_for_speed_p (loop
)
1789 && (!loop
->inner
); /* FORNOW */
1793 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
1795 stmt
= DR_STMT (dr
);
1796 stmt_info
= vinfo_for_stmt (stmt
);
1798 /* For interleaving, only the alignment of the first access
1800 if (aligned_access_p (dr
)
1801 || (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1802 && GROUP_FIRST_ELEMENT (stmt_info
) != stmt
))
1805 if (STMT_VINFO_STRIDED_P (stmt_info
))
1807 /* Strided loads perform only component accesses, alignment is
1808 irrelevant for them. */
1809 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1811 do_versioning
= false;
1815 supportable_dr_alignment
= vect_supportable_dr_alignment (dr
, false);
1817 if (!supportable_dr_alignment
)
1823 if (known_alignment_for_access_p (dr
)
1824 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).length ()
1825 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS
))
1827 do_versioning
= false;
1831 stmt
= DR_STMT (dr
);
1832 vectype
= STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt
));
1833 gcc_assert (vectype
);
1835 /* The rightmost bits of an aligned address must be zeros.
1836 Construct the mask needed for this test. For example,
1837 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
1838 mask must be 15 = 0xf. */
1839 mask
= GET_MODE_SIZE (TYPE_MODE (vectype
)) - 1;
1841 /* FORNOW: use the same mask to test all potentially unaligned
1842 references in the loop. The vectorizer currently supports
1843 a single vector size, see the reference to
1844 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
1845 vectorization factor is computed. */
1846 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo
)
1847 || LOOP_VINFO_PTR_MASK (loop_vinfo
) == mask
);
1848 LOOP_VINFO_PTR_MASK (loop_vinfo
) = mask
;
1849 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).safe_push (
1854 /* Versioning requires at least one misaligned data reference. */
1855 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo
))
1856 do_versioning
= false;
1857 else if (!do_versioning
)
1858 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).truncate (0);
1863 vec
<gimple
> may_misalign_stmts
1864 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
);
1867 /* It can now be assumed that the data references in the statements
1868 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
1869 of the loop being vectorized. */
1870 FOR_EACH_VEC_ELT (may_misalign_stmts
, i
, stmt
)
1872 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
1873 dr
= STMT_VINFO_DATA_REF (stmt_info
);
1874 SET_DR_MISALIGNMENT (dr
, 0);
1875 if (dump_enabled_p ())
1876 dump_printf_loc (MSG_NOTE
, vect_location
,
1877 "Alignment of access forced using versioning.\n");
1880 if (dump_enabled_p ())
1881 dump_printf_loc (MSG_NOTE
, vect_location
,
1882 "Versioning for alignment will be applied.\n");
1884 /* Peeling and versioning can't be done together at this time. */
1885 gcc_assert (! (do_peeling
&& do_versioning
));
1887 stat
= vect_verify_datarefs_alignment (loop_vinfo
, NULL
);
1892 /* This point is reached if neither peeling nor versioning is being done. */
1893 gcc_assert (! (do_peeling
|| do_versioning
));
1895 stat
= vect_verify_datarefs_alignment (loop_vinfo
, NULL
);
1900 /* Function vect_find_same_alignment_drs.
1902 Update group and alignment relations according to the chosen
1903 vectorization factor. */
1906 vect_find_same_alignment_drs (struct data_dependence_relation
*ddr
,
1907 loop_vec_info loop_vinfo
)
1910 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1911 int vectorization_factor
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
1912 struct data_reference
*dra
= DDR_A (ddr
);
1913 struct data_reference
*drb
= DDR_B (ddr
);
1914 stmt_vec_info stmtinfo_a
= vinfo_for_stmt (DR_STMT (dra
));
1915 stmt_vec_info stmtinfo_b
= vinfo_for_stmt (DR_STMT (drb
));
1916 int dra_size
= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra
))));
1917 int drb_size
= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb
))));
1918 lambda_vector dist_v
;
1919 unsigned int loop_depth
;
1921 if (DDR_ARE_DEPENDENT (ddr
) == chrec_known
)
1927 if (DDR_ARE_DEPENDENT (ddr
) == chrec_dont_know
)
1930 /* Loop-based vectorization and known data dependence. */
1931 if (DDR_NUM_DIST_VECTS (ddr
) == 0)
1934 /* Data-dependence analysis reports a distance vector of zero
1935 for data-references that overlap only in the first iteration
1936 but have different sign step (see PR45764).
1937 So as a sanity check require equal DR_STEP. */
1938 if (!operand_equal_p (DR_STEP (dra
), DR_STEP (drb
), 0))
1941 loop_depth
= index_in_loop_nest (loop
->num
, DDR_LOOP_NEST (ddr
));
1942 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr
), i
, dist_v
)
1944 int dist
= dist_v
[loop_depth
];
1946 if (dump_enabled_p ())
1947 dump_printf_loc (MSG_NOTE
, vect_location
,
1948 "dependence distance = %d.\n", dist
);
1950 /* Same loop iteration. */
1952 || (dist
% vectorization_factor
== 0 && dra_size
== drb_size
))
1954 /* Two references with distance zero have the same alignment. */
1955 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a
).safe_push (drb
);
1956 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b
).safe_push (dra
);
1957 if (dump_enabled_p ())
1959 dump_printf_loc (MSG_NOTE
, vect_location
,
1960 "accesses have the same alignment.\n");
1961 dump_printf (MSG_NOTE
,
1962 "dependence distance modulo vf == 0 between ");
1963 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dra
));
1964 dump_printf (MSG_NOTE
, " and ");
1965 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (drb
));
1966 dump_printf (MSG_NOTE
, "\n");
1973 /* Function vect_analyze_data_refs_alignment
1975 Analyze the alignment of the data-references in the loop.
1976 Return FALSE if a data reference is found that cannot be vectorized. */
1979 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo
,
1980 bb_vec_info bb_vinfo
)
1982 if (dump_enabled_p ())
1983 dump_printf_loc (MSG_NOTE
, vect_location
,
1984 "=== vect_analyze_data_refs_alignment ===\n");
1986 /* Mark groups of data references with same alignment using
1987 data dependence information. */
1990 vec
<ddr_p
> ddrs
= LOOP_VINFO_DDRS (loop_vinfo
);
1991 struct data_dependence_relation
*ddr
;
1994 FOR_EACH_VEC_ELT (ddrs
, i
, ddr
)
1995 vect_find_same_alignment_drs (ddr
, loop_vinfo
);
1998 if (!vect_compute_data_refs_alignment (loop_vinfo
, bb_vinfo
))
2000 if (dump_enabled_p ())
2001 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2002 "not vectorized: can't calculate alignment "
2011 /* Analyze groups of accesses: check that DR belongs to a group of
2012 accesses of legal size, step, etc. Detect gaps, single element
2013 interleaving, and other special cases. Set grouped access info.
2014 Collect groups of strided stores for further use in SLP analysis. */
2017 vect_analyze_group_access (struct data_reference
*dr
)
2019 tree step
= DR_STEP (dr
);
2020 tree scalar_type
= TREE_TYPE (DR_REF (dr
));
2021 HOST_WIDE_INT type_size
= TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type
));
2022 gimple stmt
= DR_STMT (dr
);
2023 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
2024 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
2025 bb_vec_info bb_vinfo
= STMT_VINFO_BB_VINFO (stmt_info
);
2026 HOST_WIDE_INT dr_step
= -1;
2027 HOST_WIDE_INT groupsize
, last_accessed_element
= 1;
2028 bool slp_impossible
= false;
2029 struct loop
*loop
= NULL
;
2032 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
2034 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2035 size of the interleaving group (including gaps). */
2036 if (tree_fits_shwi_p (step
))
2038 dr_step
= tree_to_shwi (step
);
2039 groupsize
= absu_hwi (dr_step
) / type_size
;
2044 /* Not consecutive access is possible only if it is a part of interleaving. */
2045 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)))
2047 /* Check if it this DR is a part of interleaving, and is a single
2048 element of the group that is accessed in the loop. */
2050 /* Gaps are supported only for loads. STEP must be a multiple of the type
2051 size. The size of the group must be a power of 2. */
2053 && (dr_step
% type_size
) == 0
2055 && exact_log2 (groupsize
) != -1)
2057 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) = stmt
;
2058 GROUP_SIZE (vinfo_for_stmt (stmt
)) = groupsize
;
2059 if (dump_enabled_p ())
2061 dump_printf_loc (MSG_NOTE
, vect_location
,
2062 "Detected single element interleaving ");
2063 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dr
));
2064 dump_printf (MSG_NOTE
, " step ");
2065 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, step
);
2066 dump_printf (MSG_NOTE
, "\n");
2071 if (dump_enabled_p ())
2072 dump_printf_loc (MSG_NOTE
, vect_location
,
2073 "Data access with gaps requires scalar "
2077 if (dump_enabled_p ())
2078 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2079 "Peeling for outer loop is not"
2084 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
) = true;
2090 if (dump_enabled_p ())
2092 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2093 "not consecutive access ");
2094 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
2095 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
2100 /* Mark the statement as unvectorizable. */
2101 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr
))) = false;
2108 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) == stmt
)
2110 /* First stmt in the interleaving chain. Check the chain. */
2111 gimple next
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt
));
2112 struct data_reference
*data_ref
= dr
;
2113 unsigned int count
= 1;
2114 tree prev_init
= DR_INIT (data_ref
);
2116 HOST_WIDE_INT diff
, gaps
= 0;
2120 /* Skip same data-refs. In case that two or more stmts share
2121 data-ref (supported only for loads), we vectorize only the first
2122 stmt, and the rest get their vectorized loads from the first
2124 if (!tree_int_cst_compare (DR_INIT (data_ref
),
2125 DR_INIT (STMT_VINFO_DATA_REF (
2126 vinfo_for_stmt (next
)))))
2128 if (DR_IS_WRITE (data_ref
))
2130 if (dump_enabled_p ())
2131 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2132 "Two store stmts share the same dr.\n");
2136 /* For load use the same data-ref load. */
2137 GROUP_SAME_DR_STMT (vinfo_for_stmt (next
)) = prev
;
2140 next
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (next
));
2145 data_ref
= STMT_VINFO_DATA_REF (vinfo_for_stmt (next
));
2147 /* All group members have the same STEP by construction. */
2148 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref
), step
, 0));
2150 /* Check that the distance between two accesses is equal to the type
2151 size. Otherwise, we have gaps. */
2152 diff
= (TREE_INT_CST_LOW (DR_INIT (data_ref
))
2153 - TREE_INT_CST_LOW (prev_init
)) / type_size
;
2156 /* FORNOW: SLP of accesses with gaps is not supported. */
2157 slp_impossible
= true;
2158 if (DR_IS_WRITE (data_ref
))
2160 if (dump_enabled_p ())
2161 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2162 "interleaved store with gaps\n");
2169 last_accessed_element
+= diff
;
2171 /* Store the gap from the previous member of the group. If there is no
2172 gap in the access, GROUP_GAP is always 1. */
2173 GROUP_GAP (vinfo_for_stmt (next
)) = diff
;
2175 prev_init
= DR_INIT (data_ref
);
2176 next
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (next
));
2177 /* Count the number of data-refs in the chain. */
2182 groupsize
= count
+ gaps
;
2184 /* Check that the size of the interleaving is equal to count for stores,
2185 i.e., that there are no gaps. */
2186 if (groupsize
!= count
2187 && !DR_IS_READ (dr
))
2189 if (dump_enabled_p ())
2190 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2191 "interleaved store with gaps\n");
2195 /* If there is a gap after the last load in the group it is the
2196 difference between the groupsize and the last accessed
2198 When there is no gap, this difference should be 0. */
2199 GROUP_GAP (vinfo_for_stmt (stmt
)) = groupsize
- last_accessed_element
;
2201 GROUP_SIZE (vinfo_for_stmt (stmt
)) = groupsize
;
2202 if (dump_enabled_p ())
2204 dump_printf_loc (MSG_NOTE
, vect_location
,
2205 "Detected interleaving of size %d starting with ",
2207 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, stmt
, 0);
2208 if (GROUP_GAP (vinfo_for_stmt (stmt
)) != 0)
2209 dump_printf_loc (MSG_NOTE
, vect_location
,
2210 "There is a gap of %d elements after the group\n",
2211 (int)GROUP_GAP (vinfo_for_stmt (stmt
)));
2214 /* SLP: create an SLP data structure for every interleaving group of
2215 stores for further analysis in vect_analyse_slp. */
2216 if (DR_IS_WRITE (dr
) && !slp_impossible
)
2219 LOOP_VINFO_GROUPED_STORES (loop_vinfo
).safe_push (stmt
);
2221 BB_VINFO_GROUPED_STORES (bb_vinfo
).safe_push (stmt
);
2224 /* If there is a gap in the end of the group or the group size cannot
2225 be made a multiple of the vector element count then we access excess
2226 elements in the last iteration and thus need to peel that off. */
2228 && (groupsize
- last_accessed_element
> 0
2229 || exact_log2 (groupsize
) == -1))
2232 if (dump_enabled_p ())
2233 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2234 "Data access with gaps requires scalar "
2238 if (dump_enabled_p ())
2239 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2240 "Peeling for outer loop is not supported\n");
2244 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
) = true;
2252 /* Analyze the access pattern of the data-reference DR.
2253 In case of non-consecutive accesses call vect_analyze_group_access() to
2254 analyze groups of accesses. */
/* NOTE(review): this chunk is a garbled extraction -- statements are split
   across lines, the upstream file's own line numbers are embedded in the
   text, and intervening lines (braces, else-arms, returns) are missing.
   Code left byte-identical; restore from upstream before compiling.  */
2257 vect_analyze_data_ref_access (struct data_reference
*dr
)
/* Cache the DR's step, scalar type and defining statement up front.  */
2259 tree step
= DR_STEP (dr
);
2260 tree scalar_type
= TREE_TYPE (DR_REF (dr
));
2261 gimple stmt
= DR_STMT (dr
);
2262 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
2263 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
2264 struct loop
*loop
= NULL
;
2267 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
/* A missing step inside a loop means the access cannot be analyzed.  */
2269 if (loop_vinfo
&& !step
)
2271 if (dump_enabled_p ())
2272 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2273 "bad data-ref access in loop\n");
2277 /* Allow loads with zero step in inner-loop vectorization. */
2278 if (loop_vinfo
&& integer_zerop (step
))
2280 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) = NULL
;
2281 if (!nested_in_vect_loop_p (loop
, stmt
))
2282 return DR_IS_READ (dr
);
2283 /* Allow references with zero step for outer loops marked
2284 with pragma omp simd only - it guarantees absence of
2285 loop-carried dependencies between inner loop iterations. */
2286 if (!loop
->force_vectorize
)
2288 if (dump_enabled_p ())
2289 dump_printf_loc (MSG_NOTE
, vect_location
,
2290 "zero step in inner loop of nest\n");
2295 if (loop
&& nested_in_vect_loop_p (loop
, stmt
))
2297 /* Interleaved accesses are not yet supported within outer-loop
2298 vectorization for references in the inner-loop. */
2299 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) = NULL
;
2301 /* For the rest of the analysis we use the outer-loop step. */
2302 step
= STMT_VINFO_DR_STEP (stmt_info
);
2303 if (integer_zerop (step
))
2305 if (dump_enabled_p ())
2306 dump_printf_loc (MSG_NOTE
, vect_location
,
2307 "zero step in outer loop.\n");
2308 if (DR_IS_READ (dr
))
/* A constant step equal to the scalar type size (or its negation)
   is a consecutive access, i.e. not interleaving.  */
2316 if (TREE_CODE (step
) == INTEGER_CST
)
2318 HOST_WIDE_INT dr_step
= TREE_INT_CST_LOW (step
);
2319 if (!tree_int_cst_compare (step
, TYPE_SIZE_UNIT (scalar_type
))
2321 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type
), -dr_step
)))
2323 /* Mark that it is not interleaving. */
2324 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) = NULL
;
2329 if (loop
&& nested_in_vect_loop_p (loop
, stmt
))
2331 if (dump_enabled_p ())
2332 dump_printf_loc (MSG_NOTE
, vect_location
,
2333 "grouped access in outer loop.\n");
2338 /* Assume this is a DR handled by non-constant strided load case. */
2339 if (TREE_CODE (step
) != INTEGER_CST
)
2340 return (STMT_VINFO_STRIDED_P (stmt_info
)
2341 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info
)
2342 || vect_analyze_group_access (dr
)));
2344 /* Not consecutive access - check if it's a part of interleaving group. */
2345 return vect_analyze_group_access (dr
);
2350 /* A helper function used in the comparator function to sort data
2351 references. T1 and T2 are two data references to be compared.
2352 The function returns -1, 0, or 1.
   Provides a stable total order on trees: first by TREE_CODE, then by a
   code-specific key (hash for constants, version for SSA names, UID for
   declarations, operands recursively for expressions).  */
/* NOTE(review): garbled extraction -- statement fragments with embedded
   upstream line numbers; several lines (switch head, equality returns,
   final return) are missing.  Code left byte-identical.  */
2355 compare_tree (tree t1
, tree t2
)
2358 enum tree_code code
;
/* Different tree codes order by the code value itself.  */
2369 if (TREE_CODE (t1
) != TREE_CODE (t2
))
2370 return TREE_CODE (t1
) < TREE_CODE (t2
) ? -1 : 1;
2372 code
= TREE_CODE (t1
);
2375 /* For const values, we can just use hash values for comparisons. */
2383 hashval_t h1
= iterative_hash_expr (t1
, 0);
2384 hashval_t h2
= iterative_hash_expr (t2
, 0);
2386 return h1
< h2
? -1 : 1;
/* SSA names: compare the underlying variables, then the version.  */
2391 cmp
= compare_tree (SSA_NAME_VAR (t1
), SSA_NAME_VAR (t2
));
2395 if (SSA_NAME_VERSION (t1
) != SSA_NAME_VERSION (t2
))
2396 return SSA_NAME_VERSION (t1
) < SSA_NAME_VERSION (t2
) ? -1 : 1;
2400 tclass
= TREE_CODE_CLASS (code
);
2402 /* For var-decl, we could compare their UIDs. */
2403 if (tclass
== tcc_declaration
)
2405 if (DECL_UID (t1
) != DECL_UID (t2
))
2406 return DECL_UID (t1
) < DECL_UID (t2
) ? -1 : 1;
2410 /* For expressions with operands, compare their operands recursively. */
2411 for (i
= TREE_OPERAND_LENGTH (t1
) - 1; i
>= 0; --i
)
2413 cmp
= compare_tree (TREE_OPERAND (t1
, i
), TREE_OPERAND (t2
, i
));
2423 /* Compare two data-references DRA and DRB to group them into chunks
2424 suitable for grouping.
   qsort comparator: orders by base address, offset, read-before-write,
   access size, step, DR_INIT, and finally statement UID as a tiebreak.  */
/* NOTE(review): garbled extraction -- fragments with embedded upstream
   line numbers; the early self-comparison check and several returns are
   missing.  Code left byte-identical.  */
2427 dr_group_sort_cmp (const void *dra_
, const void *drb_
)
2429 data_reference_p dra
= *(data_reference_p
*)const_cast<void *>(dra_
);
2430 data_reference_p drb
= *(data_reference_p
*)const_cast<void *>(drb_
);
2433 /* Stabilize sort. */
2437 /* Ordering of DRs according to base. */
2438 if (!operand_equal_p (DR_BASE_ADDRESS (dra
), DR_BASE_ADDRESS (drb
), 0))
2440 cmp
= compare_tree (DR_BASE_ADDRESS (dra
), DR_BASE_ADDRESS (drb
));
2445 /* And according to DR_OFFSET. */
2446 if (!dr_equal_offsets_p (dra
, drb
))
2448 cmp
= compare_tree (DR_OFFSET (dra
), DR_OFFSET (drb
));
2453 /* Put reads before writes. */
2454 if (DR_IS_READ (dra
) != DR_IS_READ (drb
))
2455 return DR_IS_READ (dra
) ? -1 : 1;
2457 /* Then sort after access size. */
2458 if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra
))),
2459 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb
))), 0))
2461 cmp
= compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra
))),
2462 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb
))));
2467 /* And after step. */
2468 if (!operand_equal_p (DR_STEP (dra
), DR_STEP (drb
), 0))
2470 cmp
= compare_tree (DR_STEP (dra
), DR_STEP (drb
));
2475 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2476 cmp
= tree_int_cst_compare (DR_INIT (dra
), DR_INIT (drb
));
2478 return gimple_uid (DR_STMT (dra
)) < gimple_uid (DR_STMT (drb
)) ? -1 : 1;
2482 /* Function vect_analyze_data_ref_accesses.
2484 Analyze the access pattern of all the data references in the loop.
2486 FORNOW: the only access pattern that is considered vectorizable is a
2487 simple step 1 (consecutive) access.
2489 FORNOW: handle only arrays and pointer accesses. */
/* NOTE(review): garbled extraction -- statements split across lines with
   the upstream file's line numbers embedded; many lines (braces, `break's,
   returns) are missing.  Code left byte-identical; restore from upstream
   before compiling.  */
2492 vect_analyze_data_ref_accesses (loop_vec_info loop_vinfo
, bb_vec_info bb_vinfo
)
2495 vec
<data_reference_p
> datarefs
;
2496 struct data_reference
*dr
;
2498 if (dump_enabled_p ())
2499 dump_printf_loc (MSG_NOTE
, vect_location
,
2500 "=== vect_analyze_data_ref_accesses ===\n");
/* Pick up the dataref vector from whichever vinfo is in use.  */
2503 datarefs
= LOOP_VINFO_DATAREFS (loop_vinfo
);
2505 datarefs
= BB_VINFO_DATAREFS (bb_vinfo
);
2507 if (datarefs
.is_empty ())
2510 /* Sort the array of datarefs to make building the interleaving chains
2511 linear. Don't modify the original vector's order, it is needed for
2512 determining what dependencies are reversed. */
2513 vec
<data_reference_p
> datarefs_copy
= datarefs
.copy ();
2514 datarefs_copy
.qsort (dr_group_sort_cmp
);
2516 /* Build the interleaving chains. */
2517 for (i
= 0; i
< datarefs_copy
.length () - 1;)
2519 data_reference_p dra
= datarefs_copy
[i
];
2520 stmt_vec_info stmtinfo_a
= vinfo_for_stmt (DR_STMT (dra
));
2521 stmt_vec_info lastinfo
= NULL
;
2522 for (i
= i
+ 1; i
< datarefs_copy
.length (); ++i
)
2524 data_reference_p drb
= datarefs_copy
[i
];
2525 stmt_vec_info stmtinfo_b
= vinfo_for_stmt (DR_STMT (drb
));
2527 /* ??? Imperfect sorting (non-compatible types, non-modulo
2528 accesses, same accesses) can lead to a group to be artificially
2529 split here as we don't just skip over those. If it really
2530 matters we can push those to a worklist and re-iterate
2531 over them. Then we can just skip ahead to the next DR here. */
2533 /* Check that the data-refs have same first location (except init)
2534 and they are both either store or load (not load and store,
2535 not masked loads or stores). */
2536 if (DR_IS_READ (dra
) != DR_IS_READ (drb
)
2537 || !operand_equal_p (DR_BASE_ADDRESS (dra
),
2538 DR_BASE_ADDRESS (drb
), 0)
2539 || !dr_equal_offsets_p (dra
, drb
)
2540 || !gimple_assign_single_p (DR_STMT (dra
))
2541 || !gimple_assign_single_p (DR_STMT (drb
)))
2544 /* Check that the data-refs have the same constant size. */
2545 tree sza
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra
)));
2546 tree szb
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb
)));
2547 if (!tree_fits_uhwi_p (sza
)
2548 || !tree_fits_uhwi_p (szb
)
2549 || !tree_int_cst_equal (sza
, szb
))
2552 /* Check that the data-refs have the same step. */
2553 if (!operand_equal_p (DR_STEP (dra
), DR_STEP (drb
), 0))
2556 /* Do not place the same access in the interleaving chain twice. */
2557 if (tree_int_cst_compare (DR_INIT (dra
), DR_INIT (drb
)) == 0)
2560 /* Check the types are compatible.
2561 ??? We don't distinguish this during sorting. */
2562 if (!types_compatible_p (TREE_TYPE (DR_REF (dra
)),
2563 TREE_TYPE (DR_REF (drb
))))
2566 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2567 HOST_WIDE_INT init_a
= TREE_INT_CST_LOW (DR_INIT (dra
));
2568 HOST_WIDE_INT init_b
= TREE_INT_CST_LOW (DR_INIT (drb
));
2569 gcc_assert (init_a
< init_b
);
2571 /* If init_b == init_a + the size of the type * k, we have an
2572 interleaving, and DRA is accessed before DRB. */
2573 HOST_WIDE_INT type_size_a
= tree_to_uhwi (sza
);
2574 if ((init_b
- init_a
) % type_size_a
!= 0)
2577 /* If we have a store, the accesses are adjacent. This splits
2578 groups into chunks we support (we don't support vectorization
2579 of stores with gaps). */
2580 if (!DR_IS_READ (dra
)
2581 && (init_b
- (HOST_WIDE_INT
) TREE_INT_CST_LOW
2582 (DR_INIT (datarefs_copy
[i
-1]))
2586 /* If the step (if not zero or non-constant) is greater than the
2587 difference between data-refs' inits this splits groups into
2589 if (tree_fits_shwi_p (DR_STEP (dra
)))
2591 HOST_WIDE_INT step
= tree_to_shwi (DR_STEP (dra
));
2592 if (step
!= 0 && step
<= (init_b
- init_a
))
2596 if (dump_enabled_p ())
2598 dump_printf_loc (MSG_NOTE
, vect_location
,
2599 "Detected interleaving ");
2600 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dra
));
2601 dump_printf (MSG_NOTE
, " and ");
2602 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (drb
));
2603 dump_printf (MSG_NOTE
, "\n");
2606 /* Link the found element into the group list. */
2607 if (!GROUP_FIRST_ELEMENT (stmtinfo_a
))
2609 GROUP_FIRST_ELEMENT (stmtinfo_a
) = DR_STMT (dra
);
2610 lastinfo
= stmtinfo_a
;
2612 GROUP_FIRST_ELEMENT (stmtinfo_b
) = DR_STMT (dra
);
2613 GROUP_NEXT_ELEMENT (lastinfo
) = DR_STMT (drb
);
2614 lastinfo
= stmtinfo_b
;
/* Second pass: run the per-DR access analysis on every vectorizable DR.  */
2618 FOR_EACH_VEC_ELT (datarefs_copy
, i
, dr
)
2619 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr
)))
2620 && !vect_analyze_data_ref_access (dr
))
2622 if (dump_enabled_p ())
2623 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2624 "not vectorized: complicated access pattern.\n");
2628 /* Mark the statement as not vectorizable. */
2629 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr
))) = false;
2634 datarefs_copy
.release ();
2639 datarefs_copy
.release ();
2644 /* Operator == between two dr_with_seg_len objects.
2646 This equality operator is used to make sure two data refs
2647 are the same one so that we will consider to combine the
2648 aliasing checks of those two pairs of data dependent data
   refs.  Equality means: same base address, same offset, and same
   segment length. */
/* NOTE(review): garbled extraction -- fragments with embedded upstream
   line numbers.  Code left byte-identical.  */
2652 operator == (const dr_with_seg_len
& d1
,
2653 const dr_with_seg_len
& d2
)
2655 return operand_equal_p (DR_BASE_ADDRESS (d1
.dr
),
2656 DR_BASE_ADDRESS (d2
.dr
), 0)
2657 && compare_tree (d1
.offset
, d2
.offset
) == 0
2658 && compare_tree (d1
.seg_len
, d2
.seg_len
) == 0;
2661 /* Function comp_dr_with_seg_len_pair.
2663 Comparison function for sorting objects of dr_with_seg_len_pair_t
2664 so that we can combine aliasing checks in one scan.
   Orders lexicographically by: first base address, second base address,
   first step, second step, first offset, second offset.  */
/* NOTE(review): garbled extraction -- fragments with embedded upstream
   line numbers; the `return comp_res' lines and final return are
   missing.  Code left byte-identical.  */
2667 comp_dr_with_seg_len_pair (const void *p1_
, const void *p2_
)
2669 const dr_with_seg_len_pair_t
* p1
= (const dr_with_seg_len_pair_t
*) p1_
;
2670 const dr_with_seg_len_pair_t
* p2
= (const dr_with_seg_len_pair_t
*) p2_
;
2672 const dr_with_seg_len
&p11
= p1
->first
,
2677 /* For DR pairs (a, b) and (c, d), we only consider to merge the alias checks
2678 if a and c have the same basic address and step, and b and d have the same
2679 address and step. Therefore, if any a&c or b&d don't have the same address
2680 and step, we don't care the order of those two pairs after sorting. */
2683 if ((comp_res
= compare_tree (DR_BASE_ADDRESS (p11
.dr
),
2684 DR_BASE_ADDRESS (p21
.dr
))) != 0)
2686 if ((comp_res
= compare_tree (DR_BASE_ADDRESS (p12
.dr
),
2687 DR_BASE_ADDRESS (p22
.dr
))) != 0)
2689 if ((comp_res
= compare_tree (DR_STEP (p11
.dr
), DR_STEP (p21
.dr
))) != 0)
2691 if ((comp_res
= compare_tree (DR_STEP (p12
.dr
), DR_STEP (p22
.dr
))) != 0)
2693 if ((comp_res
= compare_tree (p11
.offset
, p21
.offset
)) != 0)
2695 if ((comp_res
= compare_tree (p12
.offset
, p22
.offset
)) != 0)
2701 /* Function vect_vfa_segment_size.
2703 Create an expression that computes the size of segment
2704 that will be accessed for a data reference. The function takes into
2705 account that realignment loads may access one more vector.
2708 DR: The data reference.
2709 LENGTH_FACTOR: segment length to consider.
2711 Return an expression whose value is the size of segment which will be
   accessed by DR. */
/* NOTE(review): garbled extraction -- fragments with embedded upstream
   line numbers; the `else' of the first conditional is missing.  Code
   left byte-identical.  */
2715 vect_vfa_segment_size (struct data_reference
*dr
, tree length_factor
)
2717 tree segment_length
;
/* Zero-step: only a single scalar element is touched.  Otherwise the
   segment is step * length_factor.  */
2719 if (integer_zerop (DR_STEP (dr
)))
2720 segment_length
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr
)));
2722 segment_length
= size_binop (MULT_EXPR
,
2723 fold_convert (sizetype
, DR_STEP (dr
)),
2724 fold_convert (sizetype
, length_factor
));
/* Optimized realignment schemes read one extra vector; widen the
   segment accordingly.  */
2726 if (vect_supportable_dr_alignment (dr
, false)
2727 == dr_explicit_realign_optimized
)
2729 tree vector_size
= TYPE_SIZE_UNIT
2730 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr
))));
2732 segment_length
= size_binop (PLUS_EXPR
, segment_length
, vector_size
);
2734 return segment_length
;
2737 /* Function vect_prune_runtime_alias_test_list.
2739 Prune a list of ddrs to be tested at run-time by versioning for alias.
2740 Merge several alias checks into one if possible.
2741 Return FALSE if resulting list of ddrs is longer than allowed by
2742 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
/* NOTE(review): garbled extraction -- statements split across lines with
   the upstream file's line numbers embedded; many lines (braces, else
   branches, early returns) are missing.  Code left byte-identical;
   restore from upstream before compiling.  */
2745 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo
)
2747 vec
<ddr_p
> may_alias_ddrs
=
2748 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo
);
2749 vec
<dr_with_seg_len_pair_t
>& comp_alias_ddrs
=
2750 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo
);
2751 int vect_factor
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
2752 tree scalar_loop_iters
= LOOP_VINFO_NITERS (loop_vinfo
);
2758 if (dump_enabled_p ())
2759 dump_printf_loc (MSG_NOTE
, vect_location
,
2760 "=== vect_prune_runtime_alias_test_list ===\n");
2762 if (may_alias_ddrs
.is_empty ())
2765 /* Basically, for each pair of dependent data refs store_ptr_0
2766 and load_ptr_0, we create an expression:
2768 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2769 || (load_ptr_0 + load_segment_length_0) <= store_ptr_0))
2771 for aliasing checks. However, in some cases we can decrease
2772 the number of checks by combining two checks into one. For
2773 example, suppose we have another pair of data refs store_ptr_0
2774 and load_ptr_1, and if the following condition is satisfied:
2776 load_ptr_0 < load_ptr_1 &&
2777 load_ptr_1 - load_ptr_0 - load_segment_length_0 < store_segment_length_0
2779 (this condition means, in each iteration of vectorized loop,
2780 the accessed memory of store_ptr_0 cannot be between the memory
2781 of load_ptr_0 and load_ptr_1.)
2783 we then can use only the following expression to finish the
2784 aliasing checks between store_ptr_0 & load_ptr_0 and
2785 store_ptr_0 & load_ptr_1:
2787 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2788 || (load_ptr_1 + load_segment_length_1 <= store_ptr_0))
2790 Note that we only consider that load_ptr_0 and load_ptr_1 have the
2791 same basic address. */
2793 comp_alias_ddrs
.create (may_alias_ddrs
.length ());
2795 /* First, we collect all data ref pairs for aliasing checks. */
2796 FOR_EACH_VEC_ELT (may_alias_ddrs
, i
, ddr
)
2798 struct data_reference
*dr_a
, *dr_b
;
2799 gimple dr_group_first_a
, dr_group_first_b
;
2800 tree segment_length_a
, segment_length_b
;
2801 gimple stmt_a
, stmt_b
;
/* For grouped accesses use the first statement of the group as the
   representative DR on each side.  */
2804 stmt_a
= DR_STMT (DDR_A (ddr
));
2805 dr_group_first_a
= GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a
));
2806 if (dr_group_first_a
)
2808 stmt_a
= dr_group_first_a
;
2809 dr_a
= STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a
));
2813 stmt_b
= DR_STMT (DDR_B (ddr
));
2814 dr_group_first_b
= GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b
));
2815 if (dr_group_first_b
)
2817 stmt_b
= dr_group_first_b
;
2818 dr_b
= STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b
));
/* Unequal steps: be conservative and size segments by the full
   scalar iteration count; otherwise by the vectorization factor.  */
2821 if (!operand_equal_p (DR_STEP (dr_a
), DR_STEP (dr_b
), 0))
2822 length_factor
= scalar_loop_iters
;
2824 length_factor
= size_int (vect_factor
);
2825 segment_length_a
= vect_vfa_segment_size (dr_a
, length_factor
);
2826 segment_length_b
= vect_vfa_segment_size (dr_b
, length_factor
);
2828 dr_with_seg_len_pair_t dr_with_seg_len_pair
2829 (dr_with_seg_len (dr_a
, segment_length_a
),
2830 dr_with_seg_len (dr_b
, segment_length_b
));
/* Canonicalize the pair so the lower base address comes first.  */
2832 if (compare_tree (DR_BASE_ADDRESS (dr_a
), DR_BASE_ADDRESS (dr_b
)) > 0)
2833 std::swap (dr_with_seg_len_pair
.first
, dr_with_seg_len_pair
.second
);
2835 comp_alias_ddrs
.safe_push (dr_with_seg_len_pair
);
2838 /* Second, we sort the collected data ref pairs so that we can scan
2839 them once to combine all possible aliasing checks. */
2840 comp_alias_ddrs
.qsort (comp_dr_with_seg_len_pair
);
2842 /* Third, we scan the sorted dr pairs and check if we can combine
2843 alias checks of two neighbouring dr pairs. */
2844 for (size_t i
= 1; i
< comp_alias_ddrs
.length (); ++i
)
2846 /* Deal with two ddrs (dr_a1, dr_b1) and (dr_a2, dr_b2). */
2847 dr_with_seg_len
*dr_a1
= &comp_alias_ddrs
[i
-1].first
,
2848 *dr_b1
= &comp_alias_ddrs
[i
-1].second
,
2849 *dr_a2
= &comp_alias_ddrs
[i
].first
,
2850 *dr_b2
= &comp_alias_ddrs
[i
].second
;
2852 /* Remove duplicate data ref pairs. */
2853 if (*dr_a1
== *dr_a2
&& *dr_b1
== *dr_b2
)
2855 if (dump_enabled_p ())
2857 dump_printf_loc (MSG_NOTE
, vect_location
,
2858 "found equal ranges ");
2859 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
2860 DR_REF (dr_a1
->dr
));
2861 dump_printf (MSG_NOTE
, ", ");
2862 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
2863 DR_REF (dr_b1
->dr
));
2864 dump_printf (MSG_NOTE
, " and ");
2865 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
2866 DR_REF (dr_a2
->dr
));
2867 dump_printf (MSG_NOTE
, ", ");
2868 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
2869 DR_REF (dr_b2
->dr
));
2870 dump_printf (MSG_NOTE
, "\n");
2873 comp_alias_ddrs
.ordered_remove (i
--);
2877 if (*dr_a1
== *dr_a2
|| *dr_b1
== *dr_b2
)
2879 /* We consider the case that DR_B1 and DR_B2 are same memrefs,
2880 and DR_A1 and DR_A2 are two consecutive memrefs. */
2881 if (*dr_a1
== *dr_a2
)
2883 std::swap (dr_a1
, dr_b1
);
2884 std::swap (dr_a2
, dr_b2
);
2887 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a1
->dr
),
2888 DR_BASE_ADDRESS (dr_a2
->dr
),
2890 || !tree_fits_shwi_p (dr_a1
->offset
)
2891 || !tree_fits_shwi_p (dr_a2
->offset
))
2894 HOST_WIDE_INT diff
= (tree_to_shwi (dr_a2
->offset
)
2895 - tree_to_shwi (dr_a1
->offset
));
2898 /* Now we check if the following condition is satisfied:
2900 DIFF - SEGMENT_LENGTH_A < SEGMENT_LENGTH_B
2902 where DIFF = DR_A2->OFFSET - DR_A1->OFFSET. However,
2903 SEGMENT_LENGTH_A or SEGMENT_LENGTH_B may not be constant so we
2904 have to make a best estimation. We can get the minimum value
2905 of SEGMENT_LENGTH_B as a constant, represented by MIN_SEG_LEN_B,
2906 then either of the following two conditions can guarantee the
2909 1: DIFF <= MIN_SEG_LEN_B
2910 2: DIFF - SEGMENT_LENGTH_A < MIN_SEG_LEN_B
2914 HOST_WIDE_INT min_seg_len_b
= (tree_fits_shwi_p (dr_b1
->seg_len
)
2915 ? tree_to_shwi (dr_b1
->seg_len
)
2918 if (diff
<= min_seg_len_b
2919 || (tree_fits_shwi_p (dr_a1
->seg_len
)
2920 && diff
- tree_to_shwi (dr_a1
->seg_len
) < min_seg_len_b
))
2922 if (dump_enabled_p ())
2924 dump_printf_loc (MSG_NOTE
, vect_location
,
2925 "merging ranges for ");
2926 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
2927 DR_REF (dr_a1
->dr
));
2928 dump_printf (MSG_NOTE
, ", ");
2929 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
2930 DR_REF (dr_b1
->dr
));
2931 dump_printf (MSG_NOTE
, " and ");
2932 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
2933 DR_REF (dr_a2
->dr
));
2934 dump_printf (MSG_NOTE
, ", ");
2935 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
2936 DR_REF (dr_b2
->dr
));
2937 dump_printf (MSG_NOTE
, "\n");
/* Extend the first pair's segment to also cover the second, then
   drop the second pair from the list.  */
2940 dr_a1
->seg_len
= size_binop (PLUS_EXPR
,
2941 dr_a2
->seg_len
, size_int (diff
));
2942 comp_alias_ddrs
.ordered_remove (i
--);
2947 dump_printf_loc (MSG_NOTE
, vect_location
,
2948 "improved number of alias checks from %d to %d\n",
2949 may_alias_ddrs
.length (), comp_alias_ddrs
.length ());
2950 if ((int) comp_alias_ddrs
.length () >
2951 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS
))
2957 /* Check whether a non-affine read in stmt is suitable for gather load
2958 and if so, return a builtin decl for that operation.
   On success the loop-invariant base is returned through *BASEP, the
   in-loop varying offset through *OFFP and the scale through *SCALEP. */
/* NOTE(review): garbled extraction -- statements split across lines with
   the upstream file's line numbers embedded; many lines (switch head,
   case labels, breaks, returns) are missing.  Code left byte-identical;
   restore from upstream before compiling.  */
2961 vect_check_gather (gimple stmt
, loop_vec_info loop_vinfo
, tree
*basep
,
2962 tree
*offp
, int *scalep
)
2964 HOST_WIDE_INT scale
= 1, pbitpos
, pbitsize
;
2965 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
2966 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
2967 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
2968 tree offtype
= NULL_TREE
;
2969 tree decl
, base
, off
;
2971 int punsignedp
, pvolatilep
;
2974 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
2975 see if we can use the def stmt of the address. */
2976 if (is_gimple_call (stmt
)
2977 && gimple_call_internal_p (stmt
)
2978 && (gimple_call_internal_fn (stmt
) == IFN_MASK_LOAD
2979 || gimple_call_internal_fn (stmt
) == IFN_MASK_STORE
)
2980 && TREE_CODE (base
) == MEM_REF
2981 && TREE_CODE (TREE_OPERAND (base
, 0)) == SSA_NAME
2982 && integer_zerop (TREE_OPERAND (base
, 1))
2983 && !expr_invariant_in_loop_p (loop
, TREE_OPERAND (base
, 0)))
2985 gimple def_stmt
= SSA_NAME_DEF_STMT (TREE_OPERAND (base
, 0));
2986 if (is_gimple_assign (def_stmt
)
2987 && gimple_assign_rhs_code (def_stmt
) == ADDR_EXPR
)
2988 base
= TREE_OPERAND (gimple_assign_rhs1 (def_stmt
), 0);
2991 /* The gather builtins need address of the form
2992 loop_invariant + vector * {1, 2, 4, 8}
2994 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
2995 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
2996 of loop invariants/SSA_NAMEs defined in the loop, with casts,
2997 multiplications and additions in it. To get a vector, we need
2998 a single SSA_NAME that will be defined in the loop and will
2999 contain everything that is not loop invariant and that can be
3000 vectorized. The following code attempts to find such a preexisting
3001 SSA_NAME OFF and put the loop invariants into a tree BASE
3002 that can be gimplified before the loop. */
3003 base
= get_inner_reference (base
, &pbitsize
, &pbitpos
, &off
,
3004 &pmode
, &punsignedp
, &pvolatilep
, false);
3005 gcc_assert (base
!= NULL_TREE
&& (pbitpos
% BITS_PER_UNIT
) == 0);
/* For a MEM_REF base, fold its constant offset into OFF and strip
   down to the address operand.  */
3007 if (TREE_CODE (base
) == MEM_REF
)
3009 if (!integer_zerop (TREE_OPERAND (base
, 1)))
3011 if (off
== NULL_TREE
)
3013 offset_int moff
= mem_ref_offset (base
);
3014 off
= wide_int_to_tree (sizetype
, moff
);
3017 off
= size_binop (PLUS_EXPR
, off
,
3018 fold_convert (sizetype
, TREE_OPERAND (base
, 1)));
3020 base
= TREE_OPERAND (base
, 0);
3023 base
= build_fold_addr_expr (base
);
3025 if (off
== NULL_TREE
)
3026 off
= size_zero_node
;
3028 /* If base is not loop invariant, either off is 0, then we start with just
3029 the constant offset in the loop invariant BASE and continue with base
3030 as OFF, otherwise give up.
3031 We could handle that case by gimplifying the addition of base + off
3032 into some SSA_NAME and use that as off, but for now punt. */
3033 if (!expr_invariant_in_loop_p (loop
, base
))
3035 if (!integer_zerop (off
))
3038 base
= size_int (pbitpos
/ BITS_PER_UNIT
);
3040 /* Otherwise put base + constant offset into the loop invariant BASE
3041 and continue with OFF. */
3044 base
= fold_convert (sizetype
, base
);
3045 base
= size_binop (PLUS_EXPR
, base
, size_int (pbitpos
/ BITS_PER_UNIT
));
3048 /* OFF at this point may be either a SSA_NAME or some tree expression
3049 from get_inner_reference. Try to peel off loop invariants from it
3050 into BASE as long as possible. */
3052 while (offtype
== NULL_TREE
)
3054 enum tree_code code
;
3055 tree op0
, op1
, add
= NULL_TREE
;
3057 if (TREE_CODE (off
) == SSA_NAME
)
3059 gimple def_stmt
= SSA_NAME_DEF_STMT (off
);
3061 if (expr_invariant_in_loop_p (loop
, off
))
3064 if (gimple_code (def_stmt
) != GIMPLE_ASSIGN
)
3067 op0
= gimple_assign_rhs1 (def_stmt
);
3068 code
= gimple_assign_rhs_code (def_stmt
);
3069 op1
= gimple_assign_rhs2 (def_stmt
);
3073 if (get_gimple_rhs_class (TREE_CODE (off
)) == GIMPLE_TERNARY_RHS
)
3075 code
= TREE_CODE (off
);
3076 extract_ops_from_tree (off
, &code
, &op0
, &op1
);
3080 case POINTER_PLUS_EXPR
:
3082 if (expr_invariant_in_loop_p (loop
, op0
))
/* Accumulate the invariant summand (scaled) into BASE.  */
3087 add
= fold_convert (sizetype
, add
);
3089 add
= size_binop (MULT_EXPR
, add
, size_int (scale
));
3090 base
= size_binop (PLUS_EXPR
, base
, add
);
3093 if (expr_invariant_in_loop_p (loop
, op1
))
3101 if (expr_invariant_in_loop_p (loop
, op1
))
3103 add
= fold_convert (sizetype
, op1
);
3104 add
= size_binop (MINUS_EXPR
, size_zero_node
, add
);
3110 if (scale
== 1 && tree_fits_shwi_p (op1
))
3112 scale
= tree_to_shwi (op1
);
3121 if (!POINTER_TYPE_P (TREE_TYPE (op0
))
3122 && !INTEGRAL_TYPE_P (TREE_TYPE (op0
)))
3124 if (TYPE_PRECISION (TREE_TYPE (op0
))
3125 == TYPE_PRECISION (TREE_TYPE (off
)))
3130 if (TYPE_PRECISION (TREE_TYPE (op0
))
3131 < TYPE_PRECISION (TREE_TYPE (off
)))
3134 offtype
= TREE_TYPE (off
);
3145 /* If at the end OFF still isn't a SSA_NAME or isn't
3146 defined in the loop, punt. */
3147 if (TREE_CODE (off
) != SSA_NAME
3148 || expr_invariant_in_loop_p (loop
, off
))
3151 if (offtype
== NULL_TREE
)
3152 offtype
= TREE_TYPE (off
);
/* Ask the target for a gather builtin matching the vectype, offset
   type and scale.  */
3154 decl
= targetm
.vectorize
.builtin_gather (STMT_VINFO_VECTYPE (stmt_info
),
3156 if (decl
== NULL_TREE
)
3168 /* Function vect_analyze_data_refs.
3170 Find all the data references in the loop or basic block.
3172 The general structure of the analysis of data refs in the vectorizer is as
3174 1- vect_analyze_data_refs(loop/bb): call
3175 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3176 in the loop/bb and their dependences.
3177 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3178 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3179 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3184 vect_analyze_data_refs (loop_vec_info loop_vinfo
,
3185 bb_vec_info bb_vinfo
,
3186 int *min_vf
, unsigned *n_stmts
)
3188 struct loop
*loop
= NULL
;
3189 basic_block bb
= NULL
;
3191 vec
<data_reference_p
> datarefs
;
3192 struct data_reference
*dr
;
3195 if (dump_enabled_p ())
3196 dump_printf_loc (MSG_NOTE
, vect_location
,
3197 "=== vect_analyze_data_refs ===\n");
3201 basic_block
*bbs
= LOOP_VINFO_BBS (loop_vinfo
);
3203 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
3204 datarefs
= LOOP_VINFO_DATAREFS (loop_vinfo
);
3205 if (!find_loop_nest (loop
, &LOOP_VINFO_LOOP_NEST (loop_vinfo
)))
3207 if (dump_enabled_p ())
3208 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3209 "not vectorized: loop contains function calls"
3210 " or data references that cannot be analyzed\n");
3214 for (i
= 0; i
< loop
->num_nodes
; i
++)
3216 gimple_stmt_iterator gsi
;
3218 for (gsi
= gsi_start_bb (bbs
[i
]); !gsi_end_p (gsi
); gsi_next (&gsi
))
3220 gimple stmt
= gsi_stmt (gsi
);
3221 if (is_gimple_debug (stmt
))
3224 if (!find_data_references_in_stmt (loop
, stmt
, &datarefs
))
3226 if (is_gimple_call (stmt
) && loop
->safelen
)
3228 tree fndecl
= gimple_call_fndecl (stmt
), op
;
3229 if (fndecl
!= NULL_TREE
)
3231 struct cgraph_node
*node
= cgraph_node::get (fndecl
);
3232 if (node
!= NULL
&& node
->simd_clones
!= NULL
)
3234 unsigned int j
, n
= gimple_call_num_args (stmt
);
3235 for (j
= 0; j
< n
; j
++)
3237 op
= gimple_call_arg (stmt
, j
);
3239 || (REFERENCE_CLASS_P (op
)
3240 && get_base_address (op
)))
3243 op
= gimple_call_lhs (stmt
);
3244 /* Ignore #pragma omp declare simd functions
3245 if they don't have data references in the
3246 call stmt itself. */
3250 || (REFERENCE_CLASS_P (op
)
3251 && get_base_address (op
)))))
3256 LOOP_VINFO_DATAREFS (loop_vinfo
) = datarefs
;
3257 if (dump_enabled_p ())
3258 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3259 "not vectorized: loop contains function "
3260 "calls or data references that cannot "
3267 LOOP_VINFO_DATAREFS (loop_vinfo
) = datarefs
;
3271 gimple_stmt_iterator gsi
;
3273 bb
= BB_VINFO_BB (bb_vinfo
);
3274 for (gsi
= gsi_start_bb (bb
); !gsi_end_p (gsi
); gsi_next (&gsi
))
3276 gimple stmt
= gsi_stmt (gsi
);
3277 if (is_gimple_debug (stmt
))
3280 if (!find_data_references_in_stmt (NULL
, stmt
,
3281 &BB_VINFO_DATAREFS (bb_vinfo
)))
3283 /* Mark the rest of the basic-block as unvectorizable. */
3284 for (; !gsi_end_p (gsi
); gsi_next (&gsi
))
3286 stmt
= gsi_stmt (gsi
);
3287 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt
)) = false;
3293 datarefs
= BB_VINFO_DATAREFS (bb_vinfo
);
3296 /* Go through the data-refs, check that the analysis succeeded. Update
3297 pointer from stmt_vec_info struct to DR and vectype. */
3299 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
3302 stmt_vec_info stmt_info
;
3303 tree base
, offset
, init
;
3304 bool gather
= false;
3305 bool simd_lane_access
= false;
3309 if (!dr
|| !DR_REF (dr
))
3311 if (dump_enabled_p ())
3312 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3313 "not vectorized: unhandled data-ref\n");
3317 stmt
= DR_STMT (dr
);
3318 stmt_info
= vinfo_for_stmt (stmt
);
3320 /* Discard clobbers from the dataref vector. We will remove
3321 clobber stmts during vectorization. */
3322 if (gimple_clobber_p (stmt
))
3325 if (i
== datarefs
.length () - 1)
3330 datarefs
.ordered_remove (i
);
3335 /* Check that analysis of the data-ref succeeded. */
3336 if (!DR_BASE_ADDRESS (dr
) || !DR_OFFSET (dr
) || !DR_INIT (dr
)
3341 && !TREE_THIS_VOLATILE (DR_REF (dr
))
3342 && targetm
.vectorize
.builtin_gather
!= NULL
;
3343 bool maybe_simd_lane_access
3344 = loop_vinfo
&& loop
->simduid
;
3346 /* If target supports vector gather loads, or if this might be
3347 a SIMD lane access, see if they can't be used. */
3349 && (maybe_gather
|| maybe_simd_lane_access
)
3350 && !nested_in_vect_loop_p (loop
, stmt
))
3352 struct data_reference
*newdr
3353 = create_data_ref (NULL
, loop_containing_stmt (stmt
),
3354 DR_REF (dr
), stmt
, true);
3355 gcc_assert (newdr
!= NULL
&& DR_REF (newdr
));
3356 if (DR_BASE_ADDRESS (newdr
)
3357 && DR_OFFSET (newdr
)
3360 && integer_zerop (DR_STEP (newdr
)))
3362 if (maybe_simd_lane_access
)
3364 tree off
= DR_OFFSET (newdr
);
3366 if (TREE_CODE (DR_INIT (newdr
)) == INTEGER_CST
3367 && TREE_CODE (off
) == MULT_EXPR
3368 && tree_fits_uhwi_p (TREE_OPERAND (off
, 1)))
3370 tree step
= TREE_OPERAND (off
, 1);
3371 off
= TREE_OPERAND (off
, 0);
3373 if (CONVERT_EXPR_P (off
)
3374 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off
,
3376 < TYPE_PRECISION (TREE_TYPE (off
)))
3377 off
= TREE_OPERAND (off
, 0);
3378 if (TREE_CODE (off
) == SSA_NAME
)
3380 gimple def
= SSA_NAME_DEF_STMT (off
);
3381 tree reft
= TREE_TYPE (DR_REF (newdr
));
3382 if (is_gimple_call (def
)
3383 && gimple_call_internal_p (def
)
3384 && (gimple_call_internal_fn (def
)
3385 == IFN_GOMP_SIMD_LANE
))
3387 tree arg
= gimple_call_arg (def
, 0);
3388 gcc_assert (TREE_CODE (arg
) == SSA_NAME
);
3389 arg
= SSA_NAME_VAR (arg
);
3390 if (arg
== loop
->simduid
3392 && tree_int_cst_equal
3393 (TYPE_SIZE_UNIT (reft
),
3396 DR_OFFSET (newdr
) = ssize_int (0);
3397 DR_STEP (newdr
) = step
;
3398 DR_ALIGNED_TO (newdr
)
3399 = size_int (BIGGEST_ALIGNMENT
);
3401 simd_lane_access
= true;
3407 if (!simd_lane_access
&& maybe_gather
)
3413 if (!gather
&& !simd_lane_access
)
3414 free_data_ref (newdr
);
3417 if (!gather
&& !simd_lane_access
)
3419 if (dump_enabled_p ())
3421 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3422 "not vectorized: data ref analysis "
3424 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3425 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
3435 if (TREE_CODE (DR_BASE_ADDRESS (dr
)) == INTEGER_CST
)
3437 if (dump_enabled_p ())
3438 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3439 "not vectorized: base addr of dr is a "
3445 if (gather
|| simd_lane_access
)
3450 if (TREE_THIS_VOLATILE (DR_REF (dr
)))
3452 if (dump_enabled_p ())
3454 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3455 "not vectorized: volatile type ");
3456 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3457 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
3466 if (stmt_can_throw_internal (stmt
))
3468 if (dump_enabled_p ())
3470 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3471 "not vectorized: statement can throw an "
3473 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3474 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
3480 if (gather
|| simd_lane_access
)
3485 if (TREE_CODE (DR_REF (dr
)) == COMPONENT_REF
3486 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr
), 1)))
3488 if (dump_enabled_p ())
3490 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3491 "not vectorized: statement is bitfield "
3493 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3494 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
3500 if (gather
|| simd_lane_access
)
3505 base
= unshare_expr (DR_BASE_ADDRESS (dr
));
3506 offset
= unshare_expr (DR_OFFSET (dr
));
3507 init
= unshare_expr (DR_INIT (dr
));
3509 if (is_gimple_call (stmt
)
3510 && (!gimple_call_internal_p (stmt
)
3511 || (gimple_call_internal_fn (stmt
) != IFN_MASK_LOAD
3512 && gimple_call_internal_fn (stmt
) != IFN_MASK_STORE
)))
3514 if (dump_enabled_p ())
3516 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3517 "not vectorized: dr in a call ");
3518 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3519 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
3525 if (gather
|| simd_lane_access
)
3530 /* Update DR field in stmt_vec_info struct. */
3532 /* If the dataref is in an inner-loop of the loop that is considered
3533 for vectorization, we also want to analyze the access relative to
3534 the outer-loop (DR contains information only relative to the
3535 inner-most enclosing loop). We do that by building a reference to the
3536 first location accessed by the inner-loop, and analyze it relative to
3538 if (loop
&& nested_in_vect_loop_p (loop
, stmt
))
3540 tree outer_step
, outer_base
, outer_init
;
3541 HOST_WIDE_INT pbitsize
, pbitpos
;
3544 int punsignedp
, pvolatilep
;
3545 affine_iv base_iv
, offset_iv
;
3548 /* Build a reference to the first location accessed by the
3549 inner-loop: *(BASE+INIT). (The first location is actually
3550 BASE+INIT+OFFSET, but we add OFFSET separately later). */
3551 tree inner_base
= build_fold_indirect_ref
3552 (fold_build_pointer_plus (base
, init
));
3554 if (dump_enabled_p ())
3556 dump_printf_loc (MSG_NOTE
, vect_location
,
3557 "analyze in outer-loop: ");
3558 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, inner_base
);
3559 dump_printf (MSG_NOTE
, "\n");
3562 outer_base
= get_inner_reference (inner_base
, &pbitsize
, &pbitpos
,
3563 &poffset
, &pmode
, &punsignedp
, &pvolatilep
, false);
3564 gcc_assert (outer_base
!= NULL_TREE
);
3566 if (pbitpos
% BITS_PER_UNIT
!= 0)
3568 if (dump_enabled_p ())
3569 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3570 "failed: bit offset alignment.\n");
3574 outer_base
= build_fold_addr_expr (outer_base
);
3575 if (!simple_iv (loop
, loop_containing_stmt (stmt
), outer_base
,
3578 if (dump_enabled_p ())
3579 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3580 "failed: evolution of base is not affine.\n");
3587 poffset
= fold_build2 (PLUS_EXPR
, TREE_TYPE (offset
), offset
,
3595 offset_iv
.base
= ssize_int (0);
3596 offset_iv
.step
= ssize_int (0);
3598 else if (!simple_iv (loop
, loop_containing_stmt (stmt
), poffset
,
3601 if (dump_enabled_p ())
3602 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3603 "evolution of offset is not affine.\n");
3607 outer_init
= ssize_int (pbitpos
/ BITS_PER_UNIT
);
3608 split_constant_offset (base_iv
.base
, &base_iv
.base
, &dinit
);
3609 outer_init
= size_binop (PLUS_EXPR
, outer_init
, dinit
);
3610 split_constant_offset (offset_iv
.base
, &offset_iv
.base
, &dinit
);
3611 outer_init
= size_binop (PLUS_EXPR
, outer_init
, dinit
);
3613 outer_step
= size_binop (PLUS_EXPR
,
3614 fold_convert (ssizetype
, base_iv
.step
),
3615 fold_convert (ssizetype
, offset_iv
.step
));
3617 STMT_VINFO_DR_STEP (stmt_info
) = outer_step
;
3618 /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
3619 STMT_VINFO_DR_BASE_ADDRESS (stmt_info
) = base_iv
.base
;
3620 STMT_VINFO_DR_INIT (stmt_info
) = outer_init
;
3621 STMT_VINFO_DR_OFFSET (stmt_info
) =
3622 fold_convert (ssizetype
, offset_iv
.base
);
3623 STMT_VINFO_DR_ALIGNED_TO (stmt_info
) =
3624 size_int (highest_pow2_factor (offset_iv
.base
));
3626 if (dump_enabled_p ())
3628 dump_printf_loc (MSG_NOTE
, vect_location
,
3629 "\touter base_address: ");
3630 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3631 STMT_VINFO_DR_BASE_ADDRESS (stmt_info
));
3632 dump_printf (MSG_NOTE
, "\n\touter offset from base address: ");
3633 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3634 STMT_VINFO_DR_OFFSET (stmt_info
));
3635 dump_printf (MSG_NOTE
,
3636 "\n\touter constant offset from base address: ");
3637 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3638 STMT_VINFO_DR_INIT (stmt_info
));
3639 dump_printf (MSG_NOTE
, "\n\touter step: ");
3640 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3641 STMT_VINFO_DR_STEP (stmt_info
));
3642 dump_printf (MSG_NOTE
, "\n\touter aligned to: ");
3643 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3644 STMT_VINFO_DR_ALIGNED_TO (stmt_info
));
3645 dump_printf (MSG_NOTE
, "\n");
3649 if (STMT_VINFO_DATA_REF (stmt_info
))
3651 if (dump_enabled_p ())
3653 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3654 "not vectorized: more than one data ref "
3656 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3657 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
3663 if (gather
|| simd_lane_access
)
3668 STMT_VINFO_DATA_REF (stmt_info
) = dr
;
3669 if (simd_lane_access
)
3671 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info
) = true;
3672 free_data_ref (datarefs
[i
]);
3676 /* Set vectype for STMT. */
3677 scalar_type
= TREE_TYPE (DR_REF (dr
));
3678 STMT_VINFO_VECTYPE (stmt_info
)
3679 = get_vectype_for_scalar_type (scalar_type
);
3680 if (!STMT_VINFO_VECTYPE (stmt_info
))
3682 if (dump_enabled_p ())
3684 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3685 "not vectorized: no vectype for stmt: ");
3686 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3687 dump_printf (MSG_MISSED_OPTIMIZATION
, " scalar_type: ");
3688 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_DETAILS
,
3690 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
3696 if (gather
|| simd_lane_access
)
3698 STMT_VINFO_DATA_REF (stmt_info
) = NULL
;
3706 if (dump_enabled_p ())
3708 dump_printf_loc (MSG_NOTE
, vect_location
,
3709 "got vectype for stmt: ");
3710 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, stmt
, 0);
3711 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3712 STMT_VINFO_VECTYPE (stmt_info
));
3713 dump_printf (MSG_NOTE
, "\n");
3717 /* Adjust the minimal vectorization factor according to the
3719 vf
= TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info
));
3727 gather
= 0 != vect_check_gather (stmt
, loop_vinfo
, NULL
, &off
, NULL
);
3729 && get_vectype_for_scalar_type (TREE_TYPE (off
)) == NULL_TREE
)
3733 STMT_VINFO_DATA_REF (stmt_info
) = NULL
;
3735 if (dump_enabled_p ())
3737 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3738 "not vectorized: not suitable for gather "
3740 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3741 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
3747 STMT_VINFO_GATHER_P (stmt_info
) = true;
3750 && TREE_CODE (DR_STEP (dr
)) != INTEGER_CST
)
3752 if (nested_in_vect_loop_p (loop
, stmt
))
3754 if (dump_enabled_p ())
3756 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3757 "not vectorized: not suitable for strided "
3759 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3760 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
3764 STMT_VINFO_STRIDED_P (stmt_info
) = true;
3768 /* If we stopped analysis at the first dataref we could not analyze
3769 when trying to vectorize a basic-block mark the rest of the datarefs
3770 as not vectorizable and truncate the vector of datarefs. That
3771 avoids spending useless time in analyzing their dependence. */
3772 if (i
!= datarefs
.length ())
3774 gcc_assert (bb_vinfo
!= NULL
);
3775 for (unsigned j
= i
; j
< datarefs
.length (); ++j
)
3777 data_reference_p dr
= datarefs
[j
];
3778 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr
))) = false;
3781 datarefs
.truncate (i
);
3788 /* Function vect_get_new_vect_var.
3790 Returns a name for a new variable. The current naming scheme appends the
3791 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
3792 the name of vectorizer generated variables, and appends that to NAME if
/* NOTE(review): this region is a lossy extraction of GCC's
   tree-vect-data-refs.c.  Jumps in the embedded original line numbers
   (3792 -> 3796, 3809 -> 3818, ...) show that lines holding braces,
   the per-case prefix assignments and the switch default were dropped.
   Verify against the upstream file before editing.  */
3796 vect_get_new_vect_var (tree type
, enum vect_var_kind var_kind
, const char *name
)
/* Dispatch on VAR_KIND to choose the name prefix for the new
   temporary; the assignment in each case is among the dropped lines.  */
3803 case vect_simple_var
:
3806 case vect_scalar_var
:
3809 case vect_pointer_var
:
/* NAME supplied: build "<prefix>_<NAME>" and create the temporary.  */
3818 char* tmp
= concat (prefix
, "_", name
, NULL
);
3819 new_vect_var
= create_tmp_reg (type
, tmp
);
/* No NAME given: fall back to the bare prefix.  */
3823 new_vect_var
= create_tmp_reg (type
, prefix
);
3825 return new_vect_var
;
3828 /* Duplicate ptr info and set alignment/misalignment on NAME from DR. */
/* NOTE(review): lossy extraction -- the lines between original 3836 and
   3838/3840 are missing; presumably a conditional there chooses between
   the unknown-misalignment and known-alignment cases.  TODO: confirm
   against upstream tree-vect-data-refs.c.  */
3831 vect_duplicate_ssa_name_ptr_info (tree name
, data_reference
*dr
,
3832 stmt_vec_info stmt_info
)
/* Clone DR's points-to information onto NAME's SSA_NAME_PTR_INFO.  */
3834 duplicate_ssa_name_ptr_info (name
, DR_PTR_INFO (dr
));
/* Alignment is taken from the statement's vector type, misalignment
   from the data reference itself.  */
3835 unsigned int align
= TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmt_info
));
3836 int misalign
= DR_MISALIGNMENT (dr
);
3838 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name
));
3840 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name
), align
, misalign
);
3843 /* Function vect_create_addr_base_for_vector_ref.
3845 Create an expression that computes the address of the first memory location
3846 that will be accessed for a data reference.
3849 STMT: The statement containing the data reference.
3850 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
3851 OFFSET: Optional. If supplied, it will be added to the initial address.
3852 LOOP: Specify relative to which loop-nest should the address be computed.
3853 For example, when the dataref is in an inner-loop nested in an
3854 outer-loop that is now being vectorized, LOOP can be either the
3855 outer-loop, or the inner-loop. The first memory location accessed
3856 by the following dataref ('in' points to short):
3863 if LOOP=i_loop: &in (relative to i_loop)
3864 if LOOP=j_loop: &in+i*2B (relative to j_loop)
3865 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
3866 initial address. Unlike OFFSET, which is number of elements to
3867 be added, BYTE_OFFSET is measured in bytes.
3870 1. Return an SSA_NAME whose value is the address of the memory location of
3871 the first vector of the data reference.
3872 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
3873 these statement(s) which define the returned SSA_NAME.
3875 FORNOW: We are only handling array accesses with step 1. */
/* NOTE(review): lossy extraction -- several control-flow lines (braces,
   the if/else pairings noted below) were dropped; verify against the
   upstream tree-vect-data-refs.c before relying on this text.  */
3878 vect_create_addr_base_for_vector_ref (gimple stmt
,
3879 gimple_seq
*new_stmt_list
,
3884 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
3885 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
3887 const char *base_name
;
3890 gimple_seq seq
= NULL
;
/* STEP is the byte size of one scalar element of the reference.  */
3894 tree step
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr
)));
3895 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
/* Dataref in an inner-loop while vectorizing the outer loop: use the
   outer-loop-relative base/offset/init cached in STMT_INFO.  */
3897 if (loop_vinfo
&& loop
&& loop
!= (gimple_bb (stmt
))->loop_father
)
3899 struct loop
*outer_loop
= LOOP_VINFO_LOOP (loop_vinfo
);
3901 gcc_assert (nested_in_vect_loop_p (outer_loop
, stmt
));
3903 data_ref_base
= unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info
));
3904 base_offset
= unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info
));
3905 init
= unshare_expr (STMT_VINFO_DR_INIT (stmt_info
));
/* Otherwise take base/offset/init straight from the data reference.  */
3909 data_ref_base
= unshare_expr (DR_BASE_ADDRESS (dr
));
3910 base_offset
= unshare_expr (DR_OFFSET (dr
));
3911 init
= unshare_expr (DR_INIT (dr
));
3915 base_name
= get_name (data_ref_base
);
/* An if/else (dropped here) selects this alternative: zero offsets and
   derive the name from the scalar ref -- presumably the non-loop (BB)
   path; confirm upstream.  */
3918 base_offset
= ssize_int (0);
3919 init
= ssize_int (0);
3920 base_name
= get_name (DR_REF (dr
));
3923 /* Create base_offset */
3924 base_offset
= size_binop (PLUS_EXPR
,
3925 fold_convert (sizetype
, base_offset
),
3926 fold_convert (sizetype
, init
));
/* If an element-count OFFSET was supplied, scale it by the element
   size and fold it into BASE_OFFSET (enclosing guard dropped).  */
3930 offset
= fold_build2 (MULT_EXPR
, sizetype
,
3931 fold_convert (sizetype
, offset
), step
);
3932 base_offset
= fold_build2 (PLUS_EXPR
, sizetype
,
3933 base_offset
, offset
);
/* Likewise add BYTE_OFFSET, which is already in bytes.  */
3937 byte_offset
= fold_convert (sizetype
, byte_offset
);
3938 base_offset
= fold_build2 (PLUS_EXPR
, sizetype
,
3939 base_offset
, byte_offset
);
3942 /* base + base_offset */
3944 addr_base
= fold_build_pointer_plus (data_ref_base
, base_offset
);
/* Alternative branch (pairing dropped): take the address of the scalar
   reference directly.  */
3947 addr_base
= build1 (ADDR_EXPR
,
3948 build_pointer_type (TREE_TYPE (DR_REF (dr
))),
3949 unshare_expr (DR_REF (dr
)));
/* Gimplify the address into a fresh vector-pointer temporary; any new
   statements accumulate on NEW_STMT_LIST for the caller to insert.  */
3952 vect_ptr_type
= build_pointer_type (STMT_VINFO_VECTYPE (stmt_info
));
3953 dest
= vect_get_new_vect_var (vect_ptr_type
, vect_pointer_var
, base_name
);
3954 addr_base
= force_gimple_operand (addr_base
, &seq
, true, dest
);
3955 gimple_seq_add_seq (new_stmt_list
, seq
);
/* Propagate points-to info; extra offsets invalidate the alignment.  */
3957 if (DR_PTR_INFO (dr
)
3958 && TREE_CODE (addr_base
) == SSA_NAME
3959 && !SSA_NAME_PTR_INFO (addr_base
))
3961 vect_duplicate_ssa_name_ptr_info (addr_base
, dr
, stmt_info
);
3962 if (offset
|| byte_offset
)
3963 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base
));
3966 if (dump_enabled_p ())
3968 dump_printf_loc (MSG_NOTE
, vect_location
, "created ");
3969 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, addr_base
);
3970 dump_printf (MSG_NOTE
, "\n");
3977 /* Function vect_create_data_ref_ptr.
3979 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
3980 location accessed in the loop by STMT, along with the def-use update
3981 chain to appropriately advance the pointer through the loop iterations.
3982 Also set aliasing information for the pointer. This pointer is used by
3983 the callers to this function to create a memory reference expression for
3984 vector load/store access.
3987 1. STMT: a stmt that references memory. Expected to be of the form
3988 GIMPLE_ASSIGN <name, data-ref> or
3989 GIMPLE_ASSIGN <data-ref, name>.
3990 2. AGGR_TYPE: the type of the reference, which should be either a vector
3992 3. AT_LOOP: the loop where the vector memref is to be created.
3993 4. OFFSET (optional): an offset to be added to the initial address accessed
3994 by the data-ref in STMT.
3995 5. BSI: location where the new stmts are to be placed if there is no loop
3996 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
3997 pointing to the initial address.
3998 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
3999 to the initial address accessed by the data-ref in STMT. This is
4000 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4004 1. Declare a new ptr to vector_type, and have it point to the base of the
4005 data reference (initial address accessed by the data reference).
4006 For example, for vector of type V8HI, the following code is generated:
4009 ap = (v8hi *)initial_address;
4011 if OFFSET is not supplied:
4012 initial_address = &a[init];
4013 if OFFSET is supplied:
4014 initial_address = &a[init + OFFSET];
4015 if BYTE_OFFSET is supplied:
4016 initial_address = &a[init] + BYTE_OFFSET;
4018 Return the initial_address in INITIAL_ADDRESS.
4020 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4021 update the pointer in each iteration of the loop.
4023 Return the increment stmt that updates the pointer in PTR_INCR.
4025 3. Set INV_P to true if the access pattern of the data reference in the
4026 vectorized loop is invariant. Set it to false otherwise.
4028 4. Return the pointer. */
/* NOTE(review): lossy extraction -- braces, else branches and several
   statements are missing throughout this function body; verify against
   the upstream tree-vect-data-refs.c before editing.  */
4031 vect_create_data_ref_ptr (gimple stmt
, tree aggr_type
, struct loop
*at_loop
,
4032 tree offset
, tree
*initial_address
,
4033 gimple_stmt_iterator
*gsi
, gimple
*ptr_incr
,
4034 bool only_init
, bool *inv_p
, tree byte_offset
)
4036 const char *base_name
;
4037 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
4038 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
4039 struct loop
*loop
= NULL
;
4040 bool nested_in_vect_loop
= false;
4041 struct loop
*containing_loop
= NULL
;
4045 gimple_seq new_stmt_list
= NULL
;
4049 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
4051 gimple_stmt_iterator incr_gsi
;
4053 tree indx_before_incr
, indx_after_incr
;
4056 bb_vec_info bb_vinfo
= STMT_VINFO_BB_VINFO (stmt_info
);
4058 gcc_assert (TREE_CODE (aggr_type
) == ARRAY_TYPE
4059 || TREE_CODE (aggr_type
) == VECTOR_TYPE
);
/* Loop-vectorization path: record the loop being vectorized, whether
   STMT sits in a nested inner loop, and the loop preheader edge.  */
4063 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
4064 nested_in_vect_loop
= nested_in_vect_loop_p (loop
, stmt
);
4065 containing_loop
= (gimple_bb (stmt
))->loop_father
;
4066 pe
= loop_preheader_edge (loop
);
4070 gcc_assert (bb_vinfo
);
4075 /* Check the step (evolution) of the load in LOOP, and record
4076 whether it's invariant. */
4077 if (nested_in_vect_loop
)
4078 step
= STMT_VINFO_DR_STEP (stmt_info
);
4080 step
= DR_STEP (STMT_VINFO_DATA_REF (stmt_info
));
4082 if (integer_zerop (step
))
4087 /* Create an expression for the first address accessed by this load
4089 base_name
= get_name (DR_BASE_ADDRESS (dr
));
4091 if (dump_enabled_p ())
4093 tree dr_base_type
= TREE_TYPE (DR_BASE_OBJECT (dr
));
4094 dump_printf_loc (MSG_NOTE
, vect_location
,
4095 "create %s-pointer variable to type: ",
4096 get_tree_code_name (TREE_CODE (aggr_type
)));
4097 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, aggr_type
);
4098 if (TREE_CODE (dr_base_type
) == ARRAY_TYPE
)
4099 dump_printf (MSG_NOTE
, " vectorizing an array ref: ");
4100 else if (TREE_CODE (dr_base_type
) == VECTOR_TYPE
)
4101 dump_printf (MSG_NOTE
, " vectorizing a vector ref: ");
4102 else if (TREE_CODE (dr_base_type
) == RECORD_TYPE
)
4103 dump_printf (MSG_NOTE
, " vectorizing a record based array ref: ");
4105 dump_printf (MSG_NOTE
, " vectorizing a pointer ref: ");
4106 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_BASE_OBJECT (dr
));
4107 dump_printf (MSG_NOTE
, "\n");
4110 /* (1) Create the new aggregate-pointer variable.
4111 Vector and array types inherit the alias set of their component
4112 type by default so we need to use a ref-all pointer if the data
4113 reference does not conflict with the created aggregated data
4114 reference because it is not addressable. */
4115 bool need_ref_all
= false;
4116 if (!alias_sets_conflict_p (get_alias_set (aggr_type
),
4117 get_alias_set (DR_REF (dr
))))
4118 need_ref_all
= true;
4119 /* Likewise for any of the data references in the stmt group. */
4120 else if (STMT_VINFO_GROUP_SIZE (stmt_info
) > 1)
4122 gimple orig_stmt
= STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info
);
/* Walk the interleaving group (loop header dropped) checking each
   member's ref against the aggregate type's alias set.  */
4125 stmt_vec_info sinfo
= vinfo_for_stmt (orig_stmt
);
4126 struct data_reference
*sdr
= STMT_VINFO_DATA_REF (sinfo
);
4127 if (!alias_sets_conflict_p (get_alias_set (aggr_type
),
4128 get_alias_set (DR_REF (sdr
))))
4130 need_ref_all
= true;
4133 orig_stmt
= STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo
);
4137 aggr_ptr_type
= build_pointer_type_for_mode (aggr_type
, ptr_mode
,
4139 aggr_ptr
= vect_get_new_vect_var (aggr_ptr_type
, vect_pointer_var
, base_name
);
4142 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4143 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4144 def-use update cycles for the pointer: one relative to the outer-loop
4145 (LOOP), which is what steps (3) and (4) below do. The other is relative
4146 to the inner-loop (which is the inner-most loop containing the dataref),
4147 and this is done by step (5) below.
4149 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4150 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4151 redundant. Steps (3),(4) create the following:
4154 LOOP: vp1 = phi(vp0,vp2)
4160 If there is an inner-loop nested in loop, then step (5) will also be
4161 applied, and an additional update in the inner-loop will be created:
4164 LOOP: vp1 = phi(vp0,vp2)
4166 inner: vp3 = phi(vp1,vp4)
4167 vp4 = vp3 + inner_step
4173 /* (2) Calculate the initial address of the aggregate-pointer, and set
4174 the aggregate-pointer to point to it before the loop. */
4176 /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */
4178 new_temp
= vect_create_addr_base_for_vector_ref (stmt
, &new_stmt_list
,
4179 offset
, loop
, byte_offset
);
/* Loop case: emit the address computation on the preheader edge;
   BB case (pairing dropped): emit it before GSI instead.  */
4184 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, new_stmt_list
);
4185 gcc_assert (!new_bb
);
4188 gsi_insert_seq_before (gsi
, new_stmt_list
, GSI_SAME_STMT
);
4191 *initial_address
= new_temp
;
4192 aggr_ptr_init
= new_temp
;
4194 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4195 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4196 inner-loop nested in LOOP (during outer-loop vectorization). */
4198 /* No update in loop is required. */
4199 if (only_init
&& (!loop_vinfo
|| at_loop
== loop
))
4200 aptr
= aggr_ptr_init
;
4203 /* The step of the aggregate pointer is the type size. */
4204 tree iv_step
= TYPE_SIZE_UNIT (aggr_type
);
4205 /* One exception to the above is when the scalar step of the load in
4206 LOOP is zero. In this case the step here is also zero. */
4208 iv_step
= size_zero_node
;
4209 else if (tree_int_cst_sgn (step
) == -1)
4210 iv_step
= fold_build1 (NEGATE_EXPR
, TREE_TYPE (iv_step
), iv_step
);
/* Build the pointer induction variable in LOOP and tag its increment
   statement with vectorizer info.  */
4212 standard_iv_increment_position (loop
, &incr_gsi
, &insert_after
);
4214 create_iv (aggr_ptr_init
,
4215 fold_convert (aggr_ptr_type
, iv_step
),
4216 aggr_ptr
, loop
, &incr_gsi
, insert_after
,
4217 &indx_before_incr
, &indx_after_incr
);
4218 incr
= gsi_stmt (incr_gsi
);
4219 set_vinfo_for_stmt (incr
, new_stmt_vec_info (incr
, loop_vinfo
, NULL
));
4221 /* Copy the points-to information if it exists. */
4222 if (DR_PTR_INFO (dr
))
4224 vect_duplicate_ssa_name_ptr_info (indx_before_incr
, dr
, stmt_info
);
4225 vect_duplicate_ssa_name_ptr_info (indx_after_incr
, dr
, stmt_info
);
4230 aptr
= indx_before_incr
;
4233 if (!nested_in_vect_loop
|| only_init
)
4237 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4238 nested in LOOP, if exists. */
4240 gcc_assert (nested_in_vect_loop
);
/* Second IV: advance the pointer by DR_STEP inside the inner loop.  */
4243 standard_iv_increment_position (containing_loop
, &incr_gsi
,
4245 create_iv (aptr
, fold_convert (aggr_ptr_type
, DR_STEP (dr
)), aggr_ptr
,
4246 containing_loop
, &incr_gsi
, insert_after
, &indx_before_incr
,
4248 incr
= gsi_stmt (incr_gsi
);
4249 set_vinfo_for_stmt (incr
, new_stmt_vec_info (incr
, loop_vinfo
, NULL
));
4251 /* Copy the points-to information if it exists. */
4252 if (DR_PTR_INFO (dr
))
4254 vect_duplicate_ssa_name_ptr_info (indx_before_incr
, dr
, stmt_info
);
4255 vect_duplicate_ssa_name_ptr_info (indx_after_incr
, dr
, stmt_info
);
4260 return indx_before_incr
;
4267 /* Function bump_vector_ptr
4269 Increment a pointer (to a vector type) by vector-size. If requested,
4270 i.e. if PTR-INCR is given, then also connect the new increment stmt
4271 to the existing def-use update-chain of the pointer, by modifying
4272 the PTR_INCR as illustrated below:
4274 The pointer def-use update-chain before this function:
4275 DATAREF_PTR = phi (p_0, p_2)
4277 PTR_INCR: p_2 = DATAREF_PTR + step
4279 The pointer def-use update-chain after this function:
4280 DATAREF_PTR = phi (p_0, p_2)
4282 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4284 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4287 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4289 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4290 the loop. The increment amount across iterations is expected
4292 BSI - location where the new update stmt is to be placed.
4293 STMT - the original scalar memory-access stmt that is being vectorized.
4294 BUMP - optional. The offset by which to bump the pointer. If not given,
4295 the offset is assumed to be vector_size.
4297 Output: Return NEW_DATAREF_PTR as illustrated above.
/* NOTE(review): lossy extraction -- e.g. the guard before the early
   return at original line 4333 (presumably "if (!ptr_incr)") and the
   override of UPDATE by BUMP are among the dropped lines.  Confirm
   against upstream tree-vect-data-refs.c.  */
4302 bump_vector_ptr (tree dataref_ptr
, gimple ptr_incr
, gimple_stmt_iterator
*gsi
,
4303 gimple stmt
, tree bump
)
4305 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
4306 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
4307 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
/* Default bump amount: one vector's worth of bytes.  */
4308 tree update
= TYPE_SIZE_UNIT (vectype
);
4311 use_operand_p use_p
;
4312 tree new_dataref_ptr
;
/* Create the bumped pointer SSA name and the POINTER_PLUS_EXPR that
   defines it, emitted at GSI.  */
4317 if (TREE_CODE (dataref_ptr
) == SSA_NAME
)
4318 new_dataref_ptr
= copy_ssa_name (dataref_ptr
);
4320 new_dataref_ptr
= make_ssa_name (TREE_TYPE (dataref_ptr
));
4321 incr_stmt
= gimple_build_assign (new_dataref_ptr
, POINTER_PLUS_EXPR
,
4322 dataref_ptr
, update
);
4323 vect_finish_stmt_generation (stmt
, incr_stmt
, gsi
);
4325 /* Copy the points-to information if it exists. */
4326 if (DR_PTR_INFO (dr
))
4328 duplicate_ssa_name_ptr_info (new_dataref_ptr
, DR_PTR_INFO (dr
));
4329 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr
));
4333 return new_dataref_ptr
;
4335 /* Update the vector-pointer's cross-iteration increment. */
4336 FOR_EACH_SSA_USE_OPERAND (use_p
, ptr_incr
, iter
, SSA_OP_USE
)
4338 tree use
= USE_FROM_PTR (use_p
);
/* Re-point PTR_INCR's use of the old pointer at the bumped one;
   any other operand must be the (unchanged) step constant.  */
4340 if (use
== dataref_ptr
)
4341 SET_USE (use_p
, new_dataref_ptr
);
4343 gcc_assert (tree_int_cst_compare (use
, update
) == 0);
4346 return new_dataref_ptr
;
4350 /* Function vect_create_destination_var.
4352 Create a new temporary of type VECTYPE. */
/* NOTE(review): lossy extraction -- the declarations of TYPE/NAME/
   NEW_NAME/VEC_DEST, the "if (name)" guard around the two xasprintf
   calls, the free of NEW_NAME and the final return are among the
   dropped lines.  Confirm against upstream tree-vect-data-refs.c.  */
4355 vect_create_destination_var (tree scalar_dest
, tree vectype
)
4361 enum vect_var_kind kind
;
/* A NULL VECTYPE means the caller wants a scalar destination of the
   same type as SCALAR_DEST.  */
4363 kind
= vectype
? vect_simple_var
: vect_scalar_var
;
4364 type
= vectype
? vectype
: TREE_TYPE (scalar_dest
);
4366 gcc_assert (TREE_CODE (scalar_dest
) == SSA_NAME
);
/* Name the new variable after the scalar destination plus its SSA
   version, falling back to just the version number when unnamed.  */
4368 name
= get_name (scalar_dest
);
4370 new_name
= xasprintf ("%s_%u", name
, SSA_NAME_VERSION (scalar_dest
));
4372 new_name
= xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest
));
4373 vec_dest
= vect_get_new_vect_var (type
, kind
, new_name
);
4379 /* Function vect_grouped_store_supported.
4381 Returns TRUE if interleave high and interleave low permutations
4382 are supported, and FALSE otherwise. */
/* NOTE(review): lossy extraction -- braces, "return false"/"return
   true" statements and parts of the count==3 vs power-of-2 branching
   are missing below.  Also note the diagnostic strings contain the
   upstream typos "eqaul" and "permutaion"; they are runtime output,
   so they are deliberately left untouched here.  */
4385 vect_grouped_store_supported (tree vectype
, unsigned HOST_WIDE_INT count
)
4387 machine_mode mode
= TYPE_MODE (vectype
);
4389 /* vect_permute_store_chain requires the group size to be equal to 3 or
4390 be a power of two. */
4391 if (count
!= 3 && exact_log2 (count
) == -1)
4393 if (dump_enabled_p ())
4394 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4395 "the size of the group of accesses"
4396 " is not a power of 2 or not eqaul to 3\n");
4400 /* Check that the permutation is supported. */
4401 if (VECTOR_MODE_P (mode
))
4403 unsigned int i
, nelt
= GET_MODE_NUNITS (mode
);
4404 unsigned char *sel
= XALLOCAVEC (unsigned char, nelt
);
/* count == 3: build and test the two selector masks used to
   interleave three vectors (enclosing branch dropped).  */
4408 unsigned int j0
= 0, j1
= 0, j2
= 0;
4411 for (j
= 0; j
< 3; j
++)
4413 int nelt0
= ((3 - j
) * nelt
) % 3;
4414 int nelt1
= ((3 - j
) * nelt
+ 1) % 3;
4415 int nelt2
= ((3 - j
) * nelt
+ 2) % 3;
/* First mask of the j-th permutation round.  */
4416 for (i
= 0; i
< nelt
; i
++)
4418 if (3 * i
+ nelt0
< nelt
)
4419 sel
[3 * i
+ nelt0
] = j0
++;
4420 if (3 * i
+ nelt1
< nelt
)
4421 sel
[3 * i
+ nelt1
] = nelt
+ j1
++;
4422 if (3 * i
+ nelt2
< nelt
)
4423 sel
[3 * i
+ nelt2
] = 0;
4425 if (!can_vec_perm_p (mode
, false, sel
))
4427 if (dump_enabled_p ())
4428 dump_printf (MSG_MISSED_OPTIMIZATION
,
4429 "permutaion op not supported by target.\n");
/* Second mask of the j-th permutation round.  */
4433 for (i
= 0; i
< nelt
; i
++)
4435 if (3 * i
+ nelt0
< nelt
)
4436 sel
[3 * i
+ nelt0
] = 3 * i
+ nelt0
;
4437 if (3 * i
+ nelt1
< nelt
)
4438 sel
[3 * i
+ nelt1
] = 3 * i
+ nelt1
;
4439 if (3 * i
+ nelt2
< nelt
)
4440 sel
[3 * i
+ nelt2
] = nelt
+ j2
++;
4442 if (!can_vec_perm_p (mode
, false, sel
))
4444 if (dump_enabled_p ())
4445 dump_printf (MSG_MISSED_OPTIMIZATION
,
4446 "permutaion op not supported by target.\n");
4454 /* If length is not equal to 3 then only power of 2 is supported. */
4455 gcc_assert (exact_log2 (count
) != -1);
/* Power-of-2 case: test the interleave-high/low selectors (mask
   construction partially dropped).  */
4457 for (i
= 0; i
< nelt
/ 2; i
++)
4460 sel
[i
* 2 + 1] = i
+ nelt
;
4462 if (can_vec_perm_p (mode
, false, sel
))
4464 for (i
= 0; i
< nelt
; i
++)
4466 if (can_vec_perm_p (mode
, false, sel
))
4472 if (dump_enabled_p ())
4473 dump_printf (MSG_MISSED_OPTIMIZATION
,
4474 "permutaion op not supported by target.\n");
4479 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
/* NOTE(review): lossy extraction -- the end of the header comment and
   the final argument(s) of the call (presumably "vectype, count);")
   are missing.  Thin wrapper over vect_lanes_optab_supported_p for the
   vec_store_lanes optab.  */
4483 vect_store_lanes_supported (tree vectype
, unsigned HOST_WIDE_INT count
)
4485 return vect_lanes_optab_supported_p ("vec_store_lanes",
4486 vec_store_lanes_optab
,
4491 /* Function vect_permute_store_chain.
4493 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4494 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4495 the data correctly for the stores. Return the final references for stores
4498 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4499 The input is 4 vectors each containing 8 elements. We assign a number to
4500 each element, the input sequence is:
4502 1st vec: 0 1 2 3 4 5 6 7
4503 2nd vec: 8 9 10 11 12 13 14 15
4504 3rd vec: 16 17 18 19 20 21 22 23
4505 4th vec: 24 25 26 27 28 29 30 31
4507 The output sequence should be:
4509 1st vec: 0 8 16 24 1 9 17 25
4510 2nd vec: 2 10 18 26 3 11 19 27
4511 3rd vec: 4 12 20 28 5 13 21 30
4512 4th vec: 6 14 22 30 7 15 23 31
4514 i.e., we interleave the contents of the four vectors in their order.
4516 We use interleave_high/low instructions to create such output. The input of
4517 each interleave_high/low operation is two vectors:
4520 the even elements of the result vector are obtained left-to-right from the
4521 high/low elements of the first vector. The odd elements of the result are
4522 obtained left-to-right from the high/low elements of the second vector.
4523 The output of interleave_high will be: 0 4 1 5
4524 and of interleave_low: 2 6 3 7
4527 The permutation is done in log LENGTH stages. In each stage interleave_high
4528 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4529 where the first argument is taken from the first half of DR_CHAIN and the
4530 second argument from it's second half.
4533 I1: interleave_high (1st vec, 3rd vec)
4534 I2: interleave_low (1st vec, 3rd vec)
4535 I3: interleave_high (2nd vec, 4th vec)
4536 I4: interleave_low (2nd vec, 4th vec)
4538 The output for the first stage is:
4540 I1: 0 16 1 17 2 18 3 19
4541 I2: 4 20 5 21 6 22 7 23
4542 I3: 8 24 9 25 10 26 11 27
4543 I4: 12 28 13 29 14 30 15 31
4545 The output of the second stage, i.e. the final result is:
4547 I1: 0 8 16 24 1 9 17 25
4548 I2: 2 10 18 26 3 11 19 27
4549 I3: 4 12 20 28 5 13 21 30
4550 I4: 6 14 22 30 7 15 23 31. */
4553 vect_permute_store_chain (vec
<tree
> dr_chain
,
4554 unsigned int length
,
4556 gimple_stmt_iterator
*gsi
,
4557 vec
<tree
> *result_chain
)
4559 tree vect1
, vect2
, high
, low
;
4561 tree vectype
= STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt
));
4562 tree perm_mask_low
, perm_mask_high
;
4564 tree perm3_mask_low
, perm3_mask_high
;
4565 unsigned int i
, n
, log_length
= exact_log2 (length
);
4566 unsigned int j
, nelt
= TYPE_VECTOR_SUBPARTS (vectype
);
4567 unsigned char *sel
= XALLOCAVEC (unsigned char, nelt
);
4569 result_chain
->quick_grow (length
);
4570 memcpy (result_chain
->address (), dr_chain
.address (),
4571 length
* sizeof (tree
));
4575 unsigned int j0
= 0, j1
= 0, j2
= 0;
4577 for (j
= 0; j
< 3; j
++)
4579 int nelt0
= ((3 - j
) * nelt
) % 3;
4580 int nelt1
= ((3 - j
) * nelt
+ 1) % 3;
4581 int nelt2
= ((3 - j
) * nelt
+ 2) % 3;
4583 for (i
= 0; i
< nelt
; i
++)
4585 if (3 * i
+ nelt0
< nelt
)
4586 sel
[3 * i
+ nelt0
] = j0
++;
4587 if (3 * i
+ nelt1
< nelt
)
4588 sel
[3 * i
+ nelt1
] = nelt
+ j1
++;
4589 if (3 * i
+ nelt2
< nelt
)
4590 sel
[3 * i
+ nelt2
] = 0;
4592 perm3_mask_low
= vect_gen_perm_mask_checked (vectype
, sel
);
4594 for (i
= 0; i
< nelt
; i
++)
4596 if (3 * i
+ nelt0
< nelt
)
4597 sel
[3 * i
+ nelt0
] = 3 * i
+ nelt0
;
4598 if (3 * i
+ nelt1
< nelt
)
4599 sel
[3 * i
+ nelt1
] = 3 * i
+ nelt1
;
4600 if (3 * i
+ nelt2
< nelt
)
4601 sel
[3 * i
+ nelt2
] = nelt
+ j2
++;
4603 perm3_mask_high
= vect_gen_perm_mask_checked (vectype
, sel
);
4605 vect1
= dr_chain
[0];
4606 vect2
= dr_chain
[1];
4608 /* Create interleaving stmt:
4609 low = VEC_PERM_EXPR <vect1, vect2,
4610 {j, nelt, *, j + 1, nelt + j + 1, *,
4611 j + 2, nelt + j + 2, *, ...}> */
4612 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3_low");
4613 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, vect1
,
4614 vect2
, perm3_mask_low
);
4615 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
4618 vect2
= dr_chain
[2];
4619 /* Create interleaving stmt:
4620 low = VEC_PERM_EXPR <vect1, vect2,
4621 {0, 1, nelt + j, 3, 4, nelt + j + 1,
4622 6, 7, nelt + j + 2, ...}> */
4623 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3_high");
4624 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, vect1
,
4625 vect2
, perm3_mask_high
);
4626 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
4627 (*result_chain
)[j
] = data_ref
;
4632 /* If length is not equal to 3 then only power of 2 is supported. */
4633 gcc_assert (exact_log2 (length
) != -1);
4635 for (i
= 0, n
= nelt
/ 2; i
< n
; i
++)
4638 sel
[i
* 2 + 1] = i
+ nelt
;
4640 perm_mask_high
= vect_gen_perm_mask_checked (vectype
, sel
);
4642 for (i
= 0; i
< nelt
; i
++)
4644 perm_mask_low
= vect_gen_perm_mask_checked (vectype
, sel
);
4646 for (i
= 0, n
= log_length
; i
< n
; i
++)
4648 for (j
= 0; j
< length
/2; j
++)
4650 vect1
= dr_chain
[j
];
4651 vect2
= dr_chain
[j
+length
/2];
4653 /* Create interleaving stmt:
4654 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
4656 high
= make_temp_ssa_name (vectype
, NULL
, "vect_inter_high");
4657 perm_stmt
= gimple_build_assign (high
, VEC_PERM_EXPR
, vect1
,
4658 vect2
, perm_mask_high
);
4659 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
4660 (*result_chain
)[2*j
] = high
;
4662 /* Create interleaving stmt:
4663 low = VEC_PERM_EXPR <vect1, vect2,
4664 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
4666 low
= make_temp_ssa_name (vectype
, NULL
, "vect_inter_low");
4667 perm_stmt
= gimple_build_assign (low
, VEC_PERM_EXPR
, vect1
,
4668 vect2
, perm_mask_low
);
4669 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
4670 (*result_chain
)[2*j
+1] = low
;
4672 memcpy (dr_chain
.address (), result_chain
->address (),
4673 length
* sizeof (tree
));
4678 /* Function vect_setup_realignment
4680 This function is called when vectorizing an unaligned load using
4681 the dr_explicit_realign[_optimized] scheme.
4682 This function generates the following code at the loop prolog:
4685 x msq_init = *(floor(p)); # prolog load
4686 realignment_token = call target_builtin;
4688 x msq = phi (msq_init, ---)
4690 The stmts marked with x are generated only for the case of
4691 dr_explicit_realign_optimized.
4693 The code above sets up a new (vector) pointer, pointing to the first
4694 location accessed by STMT, and a "floor-aligned" load using that pointer.
4695 It also generates code to compute the "realignment-token" (if the relevant
4696 target hook was defined), and creates a phi-node at the loop-header bb
4697 whose arguments are the result of the prolog-load (created by this
4698 function) and the result of a load that takes place in the loop (to be
4699 created by the caller to this function).
4701 For the case of dr_explicit_realign_optimized:
4702 The caller to this function uses the phi-result (msq) to create the
4703 realignment code inside the loop, and sets up the missing phi argument,
4706 msq = phi (msq_init, lsq)
4707 lsq = *(floor(p')); # load in loop
4708 result = realign_load (msq, lsq, realignment_token);
4710 For the case of dr_explicit_realign:
4712 msq = *(floor(p)); # load in loop
4714 lsq = *(floor(p')); # load in loop
4715 result = realign_load (msq, lsq, realignment_token);
4718 STMT - (scalar) load stmt to be vectorized. This load accesses
4719 a memory location that may be unaligned.
4720 BSI - place where new code is to be inserted.
4721 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4725 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4726 target hook, if defined.
4727 Return value - the result of the loop-header phi node. */
4730 vect_setup_realignment (gimple stmt
, gimple_stmt_iterator
*gsi
,
4731 tree
*realignment_token
,
4732 enum dr_alignment_support alignment_support_scheme
,
4734 struct loop
**at_loop
)
4736 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
4737 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
4738 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
4739 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
4740 struct loop
*loop
= NULL
;
4742 tree scalar_dest
= gimple_assign_lhs (stmt
);
4748 tree msq_init
= NULL_TREE
;
4751 tree msq
= NULL_TREE
;
4752 gimple_seq stmts
= NULL
;
4754 bool compute_in_loop
= false;
4755 bool nested_in_vect_loop
= false;
4756 struct loop
*containing_loop
= (gimple_bb (stmt
))->loop_father
;
4757 struct loop
*loop_for_initial_load
= NULL
;
4761 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
4762 nested_in_vect_loop
= nested_in_vect_loop_p (loop
, stmt
);
4765 gcc_assert (alignment_support_scheme
== dr_explicit_realign
4766 || alignment_support_scheme
== dr_explicit_realign_optimized
);
4768 /* We need to generate three things:
4769 1. the misalignment computation
4770 2. the extra vector load (for the optimized realignment scheme).
4771 3. the phi node for the two vectors from which the realignment is
4772 done (for the optimized realignment scheme). */
4774 /* 1. Determine where to generate the misalignment computation.
4776 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4777 calculation will be generated by this function, outside the loop (in the
4778 preheader). Otherwise, INIT_ADDR had already been computed for us by the
4779 caller, inside the loop.
4781 Background: If the misalignment remains fixed throughout the iterations of
4782 the loop, then both realignment schemes are applicable, and also the
4783 misalignment computation can be done outside LOOP. This is because we are
4784 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4785 are a multiple of VS (the Vector Size), and therefore the misalignment in
4786 different vectorized LOOP iterations is always the same.
4787 The problem arises only if the memory access is in an inner-loop nested
4788 inside LOOP, which is now being vectorized using outer-loop vectorization.
4789 This is the only case when the misalignment of the memory access may not
4790 remain fixed throughout the iterations of the inner-loop (as explained in
4791 detail in vect_supportable_dr_alignment). In this case, not only is the
4792 optimized realignment scheme not applicable, but also the misalignment
4793 computation (and generation of the realignment token that is passed to
4794 REALIGN_LOAD) have to be done inside the loop.
4796 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4797 or not, which in turn determines if the misalignment is computed inside
4798 the inner-loop, or outside LOOP. */
4800 if (init_addr
!= NULL_TREE
|| !loop_vinfo
)
4802 compute_in_loop
= true;
4803 gcc_assert (alignment_support_scheme
== dr_explicit_realign
);
4807 /* 2. Determine where to generate the extra vector load.
4809 For the optimized realignment scheme, instead of generating two vector
4810 loads in each iteration, we generate a single extra vector load in the
4811 preheader of the loop, and in each iteration reuse the result of the
4812 vector load from the previous iteration. In case the memory access is in
4813 an inner-loop nested inside LOOP, which is now being vectorized using
4814 outer-loop vectorization, we need to determine whether this initial vector
4815 load should be generated at the preheader of the inner-loop, or can be
4816 generated at the preheader of LOOP. If the memory access has no evolution
4817 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
4818 to be generated inside LOOP (in the preheader of the inner-loop). */
4820 if (nested_in_vect_loop
)
4822 tree outerloop_step
= STMT_VINFO_DR_STEP (stmt_info
);
4823 bool invariant_in_outerloop
=
4824 (tree_int_cst_compare (outerloop_step
, size_zero_node
) == 0);
4825 loop_for_initial_load
= (invariant_in_outerloop
? loop
: loop
->inner
);
4828 loop_for_initial_load
= loop
;
4830 *at_loop
= loop_for_initial_load
;
4832 if (loop_for_initial_load
)
4833 pe
= loop_preheader_edge (loop_for_initial_load
);
4835 /* 3. For the case of the optimized realignment, create the first vector
4836 load at the loop preheader. */
4838 if (alignment_support_scheme
== dr_explicit_realign_optimized
)
4840 /* Create msq_init = *(floor(p1)) in the loop preheader */
4843 gcc_assert (!compute_in_loop
);
4844 vec_dest
= vect_create_destination_var (scalar_dest
, vectype
);
4845 ptr
= vect_create_data_ref_ptr (stmt
, vectype
, loop_for_initial_load
,
4846 NULL_TREE
, &init_addr
, NULL
, &inc
,
4848 if (TREE_CODE (ptr
) == SSA_NAME
)
4849 new_temp
= copy_ssa_name (ptr
);
4851 new_temp
= make_ssa_name (TREE_TYPE (ptr
));
4852 new_stmt
= gimple_build_assign
4853 (new_temp
, BIT_AND_EXPR
, ptr
,
4854 build_int_cst (TREE_TYPE (ptr
),
4855 -(HOST_WIDE_INT
)TYPE_ALIGN_UNIT (vectype
)));
4856 new_bb
= gsi_insert_on_edge_immediate (pe
, new_stmt
);
4857 gcc_assert (!new_bb
);
4859 = build2 (MEM_REF
, TREE_TYPE (vec_dest
), new_temp
,
4860 build_int_cst (reference_alias_ptr_type (DR_REF (dr
)), 0));
4861 new_stmt
= gimple_build_assign (vec_dest
, data_ref
);
4862 new_temp
= make_ssa_name (vec_dest
, new_stmt
);
4863 gimple_assign_set_lhs (new_stmt
, new_temp
);
4866 new_bb
= gsi_insert_on_edge_immediate (pe
, new_stmt
);
4867 gcc_assert (!new_bb
);
4870 gsi_insert_before (gsi
, new_stmt
, GSI_SAME_STMT
);
4872 msq_init
= gimple_assign_lhs (new_stmt
);
4875 /* 4. Create realignment token using a target builtin, if available.
4876 It is done either inside the containing loop, or before LOOP (as
4877 determined above). */
4879 if (targetm
.vectorize
.builtin_mask_for_load
)
4884 /* Compute INIT_ADDR - the initial addressed accessed by this memref. */
4887 /* Generate the INIT_ADDR computation outside LOOP. */
4888 init_addr
= vect_create_addr_base_for_vector_ref (stmt
, &stmts
,
4892 pe
= loop_preheader_edge (loop
);
4893 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, stmts
);
4894 gcc_assert (!new_bb
);
4897 gsi_insert_seq_before (gsi
, stmts
, GSI_SAME_STMT
);
4900 builtin_decl
= targetm
.vectorize
.builtin_mask_for_load ();
4901 new_stmt
= gimple_build_call (builtin_decl
, 1, init_addr
);
4903 vect_create_destination_var (scalar_dest
,
4904 gimple_call_return_type (new_stmt
));
4905 new_temp
= make_ssa_name (vec_dest
, new_stmt
);
4906 gimple_call_set_lhs (new_stmt
, new_temp
);
4908 if (compute_in_loop
)
4909 gsi_insert_before (gsi
, new_stmt
, GSI_SAME_STMT
);
4912 /* Generate the misalignment computation outside LOOP. */
4913 pe
= loop_preheader_edge (loop
);
4914 new_bb
= gsi_insert_on_edge_immediate (pe
, new_stmt
);
4915 gcc_assert (!new_bb
);
4918 *realignment_token
= gimple_call_lhs (new_stmt
);
4920 /* The result of the CALL_EXPR to this builtin is determined from
4921 the value of the parameter and no global variables are touched
4922 which makes the builtin a "const" function. Requiring the
4923 builtin to have the "const" attribute makes it unnecessary
4924 to call mark_call_clobbered. */
4925 gcc_assert (TREE_READONLY (builtin_decl
));
4928 if (alignment_support_scheme
== dr_explicit_realign
)
4931 gcc_assert (!compute_in_loop
);
4932 gcc_assert (alignment_support_scheme
== dr_explicit_realign_optimized
);
4935 /* 5. Create msq = phi <msq_init, lsq> in loop */
4937 pe
= loop_preheader_edge (containing_loop
);
4938 vec_dest
= vect_create_destination_var (scalar_dest
, vectype
);
4939 msq
= make_ssa_name (vec_dest
);
4940 phi_stmt
= create_phi_node (msq
, containing_loop
->header
);
4941 add_phi_arg (phi_stmt
, msq_init
, pe
, UNKNOWN_LOCATION
);
4947 /* Function vect_grouped_load_supported.
4949 Returns TRUE if even and odd permutations are supported,
4950 and FALSE otherwise. */
4953 vect_grouped_load_supported (tree vectype
, unsigned HOST_WIDE_INT count
)
4955 machine_mode mode
= TYPE_MODE (vectype
);
4957 /* vect_permute_load_chain requires the group size to be equal to 3 or
4958 be a power of two. */
4959 if (count
!= 3 && exact_log2 (count
) == -1)
4961 if (dump_enabled_p ())
4962 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4963 "the size of the group of accesses"
4964 " is not a power of 2 or not equal to 3\n");
4968 /* Check that the permutation is supported. */
4969 if (VECTOR_MODE_P (mode
))
4971 unsigned int i
, j
, nelt
= GET_MODE_NUNITS (mode
);
4972 unsigned char *sel
= XALLOCAVEC (unsigned char, nelt
);
4977 for (k
= 0; k
< 3; k
++)
4979 for (i
= 0; i
< nelt
; i
++)
4980 if (3 * i
+ k
< 2 * nelt
)
4984 if (!can_vec_perm_p (mode
, false, sel
))
4986 if (dump_enabled_p ())
4987 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4988 "shuffle of 3 loads is not supported by"
4992 for (i
= 0, j
= 0; i
< nelt
; i
++)
4993 if (3 * i
+ k
< 2 * nelt
)
4996 sel
[i
] = nelt
+ ((nelt
+ k
) % 3) + 3 * (j
++);
4997 if (!can_vec_perm_p (mode
, false, sel
))
4999 if (dump_enabled_p ())
5000 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5001 "shuffle of 3 loads is not supported by"
5010 /* If length is not equal to 3 then only power of 2 is supported. */
5011 gcc_assert (exact_log2 (count
) != -1);
5012 for (i
= 0; i
< nelt
; i
++)
5014 if (can_vec_perm_p (mode
, false, sel
))
5016 for (i
= 0; i
< nelt
; i
++)
5018 if (can_vec_perm_p (mode
, false, sel
))
5024 if (dump_enabled_p ())
5025 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5026 "extract even/odd not supported by target\n");
5030 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
5034 vect_load_lanes_supported (tree vectype
, unsigned HOST_WIDE_INT count
)
5036 return vect_lanes_optab_supported_p ("vec_load_lanes",
5037 vec_load_lanes_optab
,
5041 /* Function vect_permute_load_chain.
5043 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5044 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5045 the input data correctly. Return the final references for loads in
5048 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5049 The input is 4 vectors each containing 8 elements. We assign a number to each
5050 element, the input sequence is:
5052 1st vec: 0 1 2 3 4 5 6 7
5053 2nd vec: 8 9 10 11 12 13 14 15
5054 3rd vec: 16 17 18 19 20 21 22 23
5055 4th vec: 24 25 26 27 28 29 30 31
5057 The output sequence should be:
5059 1st vec: 0 4 8 12 16 20 24 28
5060 2nd vec: 1 5 9 13 17 21 25 29
5061 3rd vec: 2 6 10 14 18 22 26 30
5062 4th vec: 3 7 11 15 19 23 27 31
5064 i.e., the first output vector should contain the first elements of each
5065 interleaving group, etc.
5067 We use extract_even/odd instructions to create such output. The input of
5068 each extract_even/odd operation is two vectors
5072 and the output is the vector of extracted even/odd elements. The output of
5073 extract_even will be: 0 2 4 6
5074 and of extract_odd: 1 3 5 7
5077 The permutation is done in log LENGTH stages. In each stage extract_even
5078 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5079 their order. In our example,
5081 E1: extract_even (1st vec, 2nd vec)
5082 E2: extract_odd (1st vec, 2nd vec)
5083 E3: extract_even (3rd vec, 4th vec)
5084 E4: extract_odd (3rd vec, 4th vec)
5086 The output for the first stage will be:
5088 E1: 0 2 4 6 8 10 12 14
5089 E2: 1 3 5 7 9 11 13 15
5090 E3: 16 18 20 22 24 26 28 30
5091 E4: 17 19 21 23 25 27 29 31
5093 In order to proceed and create the correct sequence for the next stage (or
5094 for the correct output, if the second stage is the last one, as in our
5095 example), we first put the output of extract_even operation and then the
5096 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5097 The input for the second stage is:
5099 1st vec (E1): 0 2 4 6 8 10 12 14
5100 2nd vec (E3): 16 18 20 22 24 26 28 30
5101 3rd vec (E2): 1 3 5 7 9 11 13 15
5102 4th vec (E4): 17 19 21 23 25 27 29 31
5104 The output of the second stage:
5106 E1: 0 4 8 12 16 20 24 28
5107 E2: 2 6 10 14 18 22 26 30
5108 E3: 1 5 9 13 17 21 25 29
5109 E4: 3 7 11 15 19 23 27 31
5111 And RESULT_CHAIN after reordering:
5113 1st vec (E1): 0 4 8 12 16 20 24 28
5114 2nd vec (E3): 1 5 9 13 17 21 25 29
5115 3rd vec (E2): 2 6 10 14 18 22 26 30
5116 4th vec (E4): 3 7 11 15 19 23 27 31. */
5119 vect_permute_load_chain (vec
<tree
> dr_chain
,
5120 unsigned int length
,
5122 gimple_stmt_iterator
*gsi
,
5123 vec
<tree
> *result_chain
)
5125 tree data_ref
, first_vect
, second_vect
;
5126 tree perm_mask_even
, perm_mask_odd
;
5127 tree perm3_mask_low
, perm3_mask_high
;
5129 tree vectype
= STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt
));
5130 unsigned int i
, j
, log_length
= exact_log2 (length
);
5131 unsigned nelt
= TYPE_VECTOR_SUBPARTS (vectype
);
5132 unsigned char *sel
= XALLOCAVEC (unsigned char, nelt
);
5134 result_chain
->quick_grow (length
);
5135 memcpy (result_chain
->address (), dr_chain
.address (),
5136 length
* sizeof (tree
));
5142 for (k
= 0; k
< 3; k
++)
5144 for (i
= 0; i
< nelt
; i
++)
5145 if (3 * i
+ k
< 2 * nelt
)
5149 perm3_mask_low
= vect_gen_perm_mask_checked (vectype
, sel
);
5151 for (i
= 0, j
= 0; i
< nelt
; i
++)
5152 if (3 * i
+ k
< 2 * nelt
)
5155 sel
[i
] = nelt
+ ((nelt
+ k
) % 3) + 3 * (j
++);
5157 perm3_mask_high
= vect_gen_perm_mask_checked (vectype
, sel
);
5159 first_vect
= dr_chain
[0];
5160 second_vect
= dr_chain
[1];
5162 /* Create interleaving stmt (low part of):
5163 low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5165 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3_low");
5166 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, first_vect
,
5167 second_vect
, perm3_mask_low
);
5168 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5170 /* Create interleaving stmt (high part of):
5171 high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5173 first_vect
= data_ref
;
5174 second_vect
= dr_chain
[2];
5175 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3_high");
5176 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, first_vect
,
5177 second_vect
, perm3_mask_high
);
5178 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5179 (*result_chain
)[k
] = data_ref
;
5184 /* If length is not equal to 3 then only power of 2 is supported. */
5185 gcc_assert (exact_log2 (length
) != -1);
5187 for (i
= 0; i
< nelt
; ++i
)
5189 perm_mask_even
= vect_gen_perm_mask_checked (vectype
, sel
);
5191 for (i
= 0; i
< nelt
; ++i
)
5193 perm_mask_odd
= vect_gen_perm_mask_checked (vectype
, sel
);
5195 for (i
= 0; i
< log_length
; i
++)
5197 for (j
= 0; j
< length
; j
+= 2)
5199 first_vect
= dr_chain
[j
];
5200 second_vect
= dr_chain
[j
+1];
5202 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5203 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_perm_even");
5204 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5205 first_vect
, second_vect
,
5207 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5208 (*result_chain
)[j
/2] = data_ref
;
5210 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5211 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_perm_odd");
5212 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5213 first_vect
, second_vect
,
5215 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5216 (*result_chain
)[j
/2+length
/2] = data_ref
;
5218 memcpy (dr_chain
.address (), result_chain
->address (),
5219 length
* sizeof (tree
));
5224 /* Function vect_shift_permute_load_chain.
5226 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5227 sequence of stmts to reorder the input data accordingly.
5228 Return the final references for loads in RESULT_CHAIN.
5229 Return true if successed, false otherwise.
5231 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5232 The input is 3 vectors each containing 8 elements. We assign a
5233 number to each element, the input sequence is:
5235 1st vec: 0 1 2 3 4 5 6 7
5236 2nd vec: 8 9 10 11 12 13 14 15
5237 3rd vec: 16 17 18 19 20 21 22 23
5239 The output sequence should be:
5241 1st vec: 0 3 6 9 12 15 18 21
5242 2nd vec: 1 4 7 10 13 16 19 22
5243 3rd vec: 2 5 8 11 14 17 20 23
5245 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5247 First we shuffle all 3 vectors to get correct elements order:
5249 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
5250 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
5251 3rd vec: (16 19 22) (17 20 23) (18 21)
5253 Next we unite and shift vector 3 times:
5256 shift right by 6 the concatenation of:
5257 "1st vec" and "2nd vec"
5258 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5259 "2nd vec" and "3rd vec"
5260 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5261 "3rd vec" and "1st vec"
5262 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
5265 So that now new vectors are:
5267 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
5268 2nd vec: (10 13) (16 19 22) (17 20 23)
5269 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
5272 shift right by 5 the concatenation of:
5273 "1st vec" and "3rd vec"
5274 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
5275 "2nd vec" and "1st vec"
5276 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
5277 "3rd vec" and "2nd vec"
5278 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
5281 So that now new vectors are:
5283 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
5284 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
5285 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
5288 shift right by 5 the concatenation of:
5289 "1st vec" and "1st vec"
5290 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
5291 shift right by 3 the concatenation of:
5292 "2nd vec" and "2nd vec"
5293 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
5296 So that now all vectors are READY:
5297 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
5298 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
5299 3rd vec: ( 1 4 7) (10 13) (16 19 22)
5301 This algorithm is faster than one in vect_permute_load_chain if:
5302 1. "shift of a concatination" is faster than general permutation.
5304 2. The TARGET machine can't execute vector instructions in parallel.
5305 This is because each step of the algorithm depends on previous.
5306 The algorithm in vect_permute_load_chain is much more parallel.
5308 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
5312 vect_shift_permute_load_chain (vec
<tree
> dr_chain
,
5313 unsigned int length
,
5315 gimple_stmt_iterator
*gsi
,
5316 vec
<tree
> *result_chain
)
5318 tree vect
[3], vect_shift
[3], data_ref
, first_vect
, second_vect
;
5319 tree perm2_mask1
, perm2_mask2
, perm3_mask
;
5320 tree select_mask
, shift1_mask
, shift2_mask
, shift3_mask
, shift4_mask
;
5323 tree vectype
= STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt
));
5325 unsigned nelt
= TYPE_VECTOR_SUBPARTS (vectype
);
5326 unsigned char *sel
= XALLOCAVEC (unsigned char, nelt
);
5327 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
5328 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
5330 result_chain
->quick_grow (length
);
5331 memcpy (result_chain
->address (), dr_chain
.address (),
5332 length
* sizeof (tree
));
5334 if (exact_log2 (length
) != -1 && LOOP_VINFO_VECT_FACTOR (loop_vinfo
) > 4)
5336 unsigned int j
, log_length
= exact_log2 (length
);
5337 for (i
= 0; i
< nelt
/ 2; ++i
)
5339 for (i
= 0; i
< nelt
/ 2; ++i
)
5340 sel
[nelt
/ 2 + i
] = i
* 2 + 1;
5341 if (!can_vec_perm_p (TYPE_MODE (vectype
), false, sel
))
5343 if (dump_enabled_p ())
5344 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5345 "shuffle of 2 fields structure is not \
5346 supported by target\n");
5349 perm2_mask1
= vect_gen_perm_mask_checked (vectype
, sel
);
5351 for (i
= 0; i
< nelt
/ 2; ++i
)
5353 for (i
= 0; i
< nelt
/ 2; ++i
)
5354 sel
[nelt
/ 2 + i
] = i
* 2;
5355 if (!can_vec_perm_p (TYPE_MODE (vectype
), false, sel
))
5357 if (dump_enabled_p ())
5358 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5359 "shuffle of 2 fields structure is not \
5360 supported by target\n");
5363 perm2_mask2
= vect_gen_perm_mask_checked (vectype
, sel
);
5365 /* Generating permutation constant to shift all elements.
5366 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
5367 for (i
= 0; i
< nelt
; i
++)
5368 sel
[i
] = nelt
/ 2 + i
;
5369 if (!can_vec_perm_p (TYPE_MODE (vectype
), false, sel
))
5371 if (dump_enabled_p ())
5372 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5373 "shift permutation is not supported by target\n");
5376 shift1_mask
= vect_gen_perm_mask_checked (vectype
, sel
);
5378 /* Generating permutation constant to select vector from 2.
5379 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
5380 for (i
= 0; i
< nelt
/ 2; i
++)
5382 for (i
= nelt
/ 2; i
< nelt
; i
++)
5384 if (!can_vec_perm_p (TYPE_MODE (vectype
), false, sel
))
5386 if (dump_enabled_p ())
5387 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5388 "select is not supported by target\n");
5391 select_mask
= vect_gen_perm_mask_checked (vectype
, sel
);
5393 for (i
= 0; i
< log_length
; i
++)
5395 for (j
= 0; j
< length
; j
+= 2)
5397 first_vect
= dr_chain
[j
];
5398 second_vect
= dr_chain
[j
+ 1];
5400 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle2");
5401 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5402 first_vect
, first_vect
,
5404 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5407 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle2");
5408 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5409 second_vect
, second_vect
,
5411 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5414 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shift");
5415 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5416 vect
[0], vect
[1], shift1_mask
);
5417 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5418 (*result_chain
)[j
/2 + length
/2] = data_ref
;
5420 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_select");
5421 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5422 vect
[0], vect
[1], select_mask
);
5423 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5424 (*result_chain
)[j
/2] = data_ref
;
5426 memcpy (dr_chain
.address (), result_chain
->address (),
5427 length
* sizeof (tree
));
5431 if (length
== 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo
) > 2)
5433 unsigned int k
= 0, l
= 0;
5435 /* Generating permutation constant to get all elements in rigth order.
5436 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
5437 for (i
= 0; i
< nelt
; i
++)
5439 if (3 * k
+ (l
% 3) >= nelt
)
5442 l
+= (3 - (nelt
% 3));
5444 sel
[i
] = 3 * k
+ (l
% 3);
5447 if (!can_vec_perm_p (TYPE_MODE (vectype
), false, sel
))
5449 if (dump_enabled_p ())
5450 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5451 "shuffle of 3 fields structure is not \
5452 supported by target\n");
5455 perm3_mask
= vect_gen_perm_mask_checked (vectype
, sel
);
5457 /* Generating permutation constant to shift all elements.
5458 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
5459 for (i
= 0; i
< nelt
; i
++)
5460 sel
[i
] = 2 * (nelt
/ 3) + (nelt
% 3) + i
;
5461 if (!can_vec_perm_p (TYPE_MODE (vectype
), false, sel
))
5463 if (dump_enabled_p ())
5464 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5465 "shift permutation is not supported by target\n");
5468 shift1_mask
= vect_gen_perm_mask_checked (vectype
, sel
);
5470 /* Generating permutation constant to shift all elements.
5471 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5472 for (i
= 0; i
< nelt
; i
++)
5473 sel
[i
] = 2 * (nelt
/ 3) + 1 + i
;
5474 if (!can_vec_perm_p (TYPE_MODE (vectype
), false, sel
))
5476 if (dump_enabled_p ())
5477 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5478 "shift permutation is not supported by target\n");
5481 shift2_mask
= vect_gen_perm_mask_checked (vectype
, sel
);
5483 /* Generating permutation constant to shift all elements.
5484 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
5485 for (i
= 0; i
< nelt
; i
++)
5486 sel
[i
] = (nelt
/ 3) + (nelt
% 3) / 2 + i
;
5487 if (!can_vec_perm_p (TYPE_MODE (vectype
), false, sel
))
5489 if (dump_enabled_p ())
5490 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5491 "shift permutation is not supported by target\n");
5494 shift3_mask
= vect_gen_perm_mask_checked (vectype
, sel
);
5496 /* Generating permutation constant to shift all elements.
5497 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5498 for (i
= 0; i
< nelt
; i
++)
5499 sel
[i
] = 2 * (nelt
/ 3) + (nelt
% 3) / 2 + i
;
5500 if (!can_vec_perm_p (TYPE_MODE (vectype
), false, sel
))
5502 if (dump_enabled_p ())
5503 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5504 "shift permutation is not supported by target\n");
5507 shift4_mask
= vect_gen_perm_mask_checked (vectype
, sel
);
5509 for (k
= 0; k
< 3; k
++)
5511 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3");
5512 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5513 dr_chain
[k
], dr_chain
[k
],
5515 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5519 for (k
= 0; k
< 3; k
++)
5521 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shift1");
5522 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5523 vect
[k
% 3], vect
[(k
+ 1) % 3],
5525 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5526 vect_shift
[k
] = data_ref
;
5529 for (k
= 0; k
< 3; k
++)
5531 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shift2");
5532 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5533 vect_shift
[(4 - k
) % 3],
5534 vect_shift
[(3 - k
) % 3],
5536 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5540 (*result_chain
)[3 - (nelt
% 3)] = vect
[2];
5542 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shift3");
5543 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, vect
[0],
5544 vect
[0], shift3_mask
);
5545 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5546 (*result_chain
)[nelt
% 3] = data_ref
;
5548 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shift4");
5549 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, vect
[1],
5550 vect
[1], shift4_mask
);
5551 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5552 (*result_chain
)[0] = data_ref
;
5558 /* Function vect_transform_grouped_load.
5560 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5561 to perform their permutation and ascribe the result vectorized statements to
5562 the scalar statements.
5566 vect_transform_grouped_load (gimple stmt
, vec
<tree
> dr_chain
, int size
,
5567 gimple_stmt_iterator
*gsi
)
5570 vec
<tree
> result_chain
= vNULL
;
5572 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5573 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
5574 vectors, that are ready for vector computation. */
5575 result_chain
.create (size
);
5577 /* If reassociation width for vector type is 2 or greater target machine can
5578 execute 2 or more vector instructions in parallel. Otherwise try to
5579 get chain for loads group using vect_shift_permute_load_chain. */
5580 mode
= TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt
)));
5581 if (targetm
.sched
.reassociation_width (VEC_PERM_EXPR
, mode
) > 1
5582 || exact_log2 (size
) != -1
5583 || !vect_shift_permute_load_chain (dr_chain
, size
, stmt
,
5584 gsi
, &result_chain
))
5585 vect_permute_load_chain (dr_chain
, size
, stmt
, gsi
, &result_chain
);
5586 vect_record_grouped_load_vectors (stmt
, result_chain
);
5587 result_chain
.release ();
5590 /* RESULT_CHAIN contains the output of a group of grouped loads that were
5591 generated as part of the vectorization of STMT. Assign the statement
5592 for each vector to the associated scalar statement. */
5595 vect_record_grouped_load_vectors (gimple stmt
, vec
<tree
> result_chain
)
5597 gimple first_stmt
= GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
));
5598 gimple next_stmt
, new_stmt
;
5599 unsigned int i
, gap_count
;
5602 /* Put a permuted data-ref in the VECTORIZED_STMT field.
5603 Since we scan the chain starting from it's first node, their order
5604 corresponds the order of data-refs in RESULT_CHAIN. */
5605 next_stmt
= first_stmt
;
5607 FOR_EACH_VEC_ELT (result_chain
, i
, tmp_data_ref
)
5612 /* Skip the gaps. Loads created for the gaps will be removed by dead
5613 code elimination pass later. No need to check for the first stmt in
5614 the group, since it always exists.
5615 GROUP_GAP is the number of steps in elements from the previous
5616 access (if there is no gap GROUP_GAP is 1). We skip loads that
5617 correspond to the gaps. */
5618 if (next_stmt
!= first_stmt
5619 && gap_count
< GROUP_GAP (vinfo_for_stmt (next_stmt
)))
5627 new_stmt
= SSA_NAME_DEF_STMT (tmp_data_ref
);
5628 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5629 copies, and we put the new vector statement in the first available
5631 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt
)))
5632 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt
)) = new_stmt
;
5635 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt
)))
5638 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt
));
5640 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt
));
5643 prev_stmt
= rel_stmt
;
5645 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt
));
5648 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt
)) =
5653 next_stmt
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt
));
5655 /* If NEXT_STMT accesses the same DR as the previous statement,
5656 put the same TMP_DATA_REF as its vectorized statement; otherwise
5657 get the next data-ref from RESULT_CHAIN. */
5658 if (!next_stmt
|| !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt
)))
5664 /* Function vect_force_dr_alignment_p.
5666 Returns whether the alignment of a DECL can be forced to be aligned
5667 on ALIGNMENT bit boundary. */
5670 vect_can_force_dr_alignment_p (const_tree decl
, unsigned int alignment
)
5672 if (TREE_CODE (decl
) != VAR_DECL
)
5675 if (decl_in_symtab_p (decl
)
5676 && !symtab_node::get (decl
)->can_increase_alignment_p ())
5679 if (TREE_STATIC (decl
))
5680 return (alignment
<= MAX_OFILE_ALIGNMENT
);
5682 return (alignment
<= MAX_STACK_ALIGNMENT
);
5686 /* Return whether the data reference DR is supported with respect to its
5688 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
5689 it is aligned, i.e., check if it is possible to vectorize it with different
5692 enum dr_alignment_support
5693 vect_supportable_dr_alignment (struct data_reference
*dr
,
5694 bool check_aligned_accesses
)
5696 gimple stmt
= DR_STMT (dr
);
5697 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
5698 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
5699 machine_mode mode
= TYPE_MODE (vectype
);
5700 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
5701 struct loop
*vect_loop
= NULL
;
5702 bool nested_in_vect_loop
= false;
5704 if (aligned_access_p (dr
) && !check_aligned_accesses
)
5707 /* For now assume all conditional loads/stores support unaligned
5708 access without any special code. */
5709 if (is_gimple_call (stmt
)
5710 && gimple_call_internal_p (stmt
)
5711 && (gimple_call_internal_fn (stmt
) == IFN_MASK_LOAD
5712 || gimple_call_internal_fn (stmt
) == IFN_MASK_STORE
))
5713 return dr_unaligned_supported
;
5717 vect_loop
= LOOP_VINFO_LOOP (loop_vinfo
);
5718 nested_in_vect_loop
= nested_in_vect_loop_p (vect_loop
, stmt
);
5721 /* Possibly unaligned access. */
5723 /* We can choose between using the implicit realignment scheme (generating
5724 a misaligned_move stmt) and the explicit realignment scheme (generating
5725 aligned loads with a REALIGN_LOAD). There are two variants to the
5726 explicit realignment scheme: optimized, and unoptimized.
5727 We can optimize the realignment only if the step between consecutive
5728 vector loads is equal to the vector size. Since the vector memory
5729 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
5730 is guaranteed that the misalignment amount remains the same throughout the
5731 execution of the vectorized loop. Therefore, we can create the
5732 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
5733 at the loop preheader.
5735 However, in the case of outer-loop vectorization, when vectorizing a
5736 memory access in the inner-loop nested within the LOOP that is now being
5737 vectorized, while it is guaranteed that the misalignment of the
5738 vectorized memory access will remain the same in different outer-loop
5739 iterations, it is *not* guaranteed that is will remain the same throughout
5740 the execution of the inner-loop. This is because the inner-loop advances
5741 with the original scalar step (and not in steps of VS). If the inner-loop
5742 step happens to be a multiple of VS, then the misalignment remains fixed
5743 and we can use the optimized realignment scheme. For example:
5749 When vectorizing the i-loop in the above example, the step between
5750 consecutive vector loads is 1, and so the misalignment does not remain
5751 fixed across the execution of the inner-loop, and the realignment cannot
5752 be optimized (as illustrated in the following pseudo vectorized loop):
5754 for (i=0; i<N; i+=4)
5755 for (j=0; j<M; j++){
5756 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
5757 // when j is {0,1,2,3,4,5,6,7,...} respectively.
5758 // (assuming that we start from an aligned address).
5761 We therefore have to use the unoptimized realignment scheme:
5763 for (i=0; i<N; i+=4)
5764 for (j=k; j<M; j+=4)
5765 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
5766 // that the misalignment of the initial address is
5769 The loop can then be vectorized as follows:
5771 for (k=0; k<4; k++){
5772 rt = get_realignment_token (&vp[k]);
5773 for (i=0; i<N; i+=4){
5775 for (j=k; j<M; j+=4){
5777 va = REALIGN_LOAD <v1,v2,rt>;
5784 if (DR_IS_READ (dr
))
5786 bool is_packed
= false;
5787 tree type
= (TREE_TYPE (DR_REF (dr
)));
5789 if (optab_handler (vec_realign_load_optab
, mode
) != CODE_FOR_nothing
5790 && (!targetm
.vectorize
.builtin_mask_for_load
5791 || targetm
.vectorize
.builtin_mask_for_load ()))
5793 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
5794 if ((nested_in_vect_loop
5795 && (TREE_INT_CST_LOW (DR_STEP (dr
))
5796 != GET_MODE_SIZE (TYPE_MODE (vectype
))))
5798 return dr_explicit_realign
;
5800 return dr_explicit_realign_optimized
;
5802 if (!known_alignment_for_access_p (dr
))
5803 is_packed
= not_size_aligned (DR_REF (dr
));
5805 if ((TYPE_USER_ALIGN (type
) && !is_packed
)
5806 || targetm
.vectorize
.
5807 support_vector_misalignment (mode
, type
,
5808 DR_MISALIGNMENT (dr
), is_packed
))
5809 /* Can't software pipeline the loads, but can at least do them. */
5810 return dr_unaligned_supported
;
5814 bool is_packed
= false;
5815 tree type
= (TREE_TYPE (DR_REF (dr
)));
5817 if (!known_alignment_for_access_p (dr
))
5818 is_packed
= not_size_aligned (DR_REF (dr
));
5820 if ((TYPE_USER_ALIGN (type
) && !is_packed
)
5821 || targetm
.vectorize
.
5822 support_vector_misalignment (mode
, type
,
5823 DR_MISALIGNMENT (dr
), is_packed
))
5824 return dr_unaligned_supported
;
5828 return dr_unaligned_unsupported
;