/* Data References Analysis and Manipulation Utilities for Vectorization.
   Copyright (C) 2003-2014 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
#include "coretypes.h"
#include "stor-layout.h"
#include "basic-block.h"
#include "gimple-pretty-print.h"
#include "tree-ssa-alias.h"
#include "internal-fn.h"
#include "gimple-expr.h"
#include "gimple-iterator.h"
#include "gimplify-me.h"
#include "gimple-ssa.h"
#include "tree-phinodes.h"
#include "ssa-iterators.h"
#include "stringpool.h"
#include "tree-ssanames.h"
#include "tree-ssa-loop-ivopts.h"
#include "tree-ssa-loop-manip.h"
#include "tree-ssa-loop.h"
#include "tree-chrec.h"
#include "tree-scalar-evolution.h"
#include "tree-vectorizer.h"
#include "diagnostic-core.h"

/* Need to include rtl.h, expr.h, etc. for optabs.  */
/* Return true if load- or store-lanes optab OPTAB is implemented for
   COUNT vectors of type VECTYPE.  NAME is the name of OPTAB.  */

vect_lanes_optab_supported_p (const char *name, convert_optab optab,
                              tree vectype, unsigned HOST_WIDE_INT count)
  enum machine_mode mode, array_mode;

  mode = TYPE_MODE (vectype);
  limit_p = !targetm.array_mode_supported_p (mode, count);
  array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode),

  if (array_mode == BLKmode)
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
                         GET_MODE_NAME (mode), count);

  if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "cannot use %s<%s><%s>\n", name,
                         GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
                     GET_MODE_NAME (mode));
/* Return the smallest scalar part of STMT.
   This is used to determine the vectype of the stmt.  We generally set the
   vectype according to the type of the result (lhs).  For stmts whose
   result-type is different than the type of the arguments (e.g., demotion,
   promotion), vectype will be reset appropriately (later).  Note that we have
   to visit the smallest datatype in this function, because that determines the
   VF.  If the smallest datatype in the loop is present only as the rhs of a
   promotion operation - we'd miss it.
   Such a case, where a variable of this datatype does not appear in the lhs
   anywhere in the loop, can only occur if it's an invariant: e.g.:
   'int_x = (int) short_inv', which we'd expect to have been optimized away by
   invariant motion.  However, we cannot rely on invariant motion to always
   take invariants out of the loop, and so in the case of promotion we also
   have to check the rhs.
   LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
   types.  */
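/* Illustrative example (not part of the original sources): in a loop like

     short *a;  int *b;
     for (i = 0; i < N; i++)
       b[i] = (int) a[i];     <-- 'short' appears only on the rhs

   the smallest scalar type is 'short', and it is that type, not the 'int'
   result, that determines the vectorization factor.  */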
vect_get_smallest_scalar_type (gimple stmt, HOST_WIDE_INT *lhs_size_unit,
                               HOST_WIDE_INT *rhs_size_unit)
  tree scalar_type = gimple_expr_type (stmt);
  HOST_WIDE_INT lhs, rhs;

  lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));

  if (is_gimple_assign (stmt)
      && (gimple_assign_cast_p (stmt)
          || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
          || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
          || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
      tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));

      rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
        scalar_type = rhs_type;

  *lhs_size_unit = lhs;
  *rhs_size_unit = rhs;
/* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
   tested at run-time.  Return TRUE if DDR was successfully inserted.
   Return false if versioning is not supported.  */
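/* Illustrative sketch (not part of the original sources): for a loop such as

     void f (int *p, int *q, int n)
     {
       for (int i = 0; i < n; i++)
         p[i] = q[i] + 1;
     }

   compile-time analysis cannot tell whether P and Q overlap, so the
   vectorizer may version the loop under a run-time check along the lines of

     if (p + n <= q || q + n <= p)
       ... vectorized loop ...
     else
       ... scalar loop ...

   Each DDR recorded by this function contributes one such check.  */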
vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

  if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)

  if (dump_enabled_p ())
      dump_printf_loc (MSG_NOTE, vect_location,
                       "mark for run-time aliasing test between ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
      dump_printf (MSG_NOTE, " and ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
      dump_printf (MSG_NOTE, "\n");

  if (optimize_loop_nest_for_size_p (loop))
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "versioning not supported when optimizing"

  /* FORNOW: We don't support versioning with outer-loop vectorization.  */
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "versioning not yet supported for outer-loops.\n");

  /* FORNOW: We don't support creating runtime alias tests for non-constant
     step.  */
  if (TREE_CODE (DR_STEP (DDR_A (ddr))) != INTEGER_CST
      || TREE_CODE (DR_STEP (DDR_B (ddr))) != INTEGER_CST)
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "versioning not yet supported for non-constant "

  LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
/* Function vect_analyze_data_ref_dependence.

   Return TRUE if there (might) exist a dependence between a memory-reference
   DRA and a memory-reference DRB.  When versioning for alias may check a
   dependence at run-time, return FALSE.  Adjust *MAX_VF according to
   the data dependence.  */
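/* Worked example (illustrative, not part of the original sources): in

     for (i = 0; i < n; i++)
       a[i + 3] = a[i] + 1;

   the dependence distance between the write a[i+3] and the read a[i] is 3,
   so at most 3 consecutive iterations can run in parallel and *MAX_VF is
   reduced to 3.  A distance of 0 (same location in the same iteration) or a
   distance >= the vectorization factor does not constrain vectorization.  */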
vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
                                  loop_vec_info loop_vinfo, int *max_vf)
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  struct data_reference *dra = DDR_A (ddr);
  struct data_reference *drb = DDR_B (ddr);
  stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
  stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
  lambda_vector dist_v;
  unsigned int loop_depth;

  /* In loop analysis all data references should be vectorizable.  */
  if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
      || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))

  /* Independent data accesses.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
      || (DR_IS_READ (dra) && DR_IS_READ (drb)))

  /* Unknown data dependence.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
      /* If user asserted safelen consecutive iterations can be
         executed concurrently, assume independence.  */
      if (loop->safelen >= 2)
          if (loop->safelen < *max_vf)
            *max_vf = loop->safelen;
          LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;

      if (STMT_VINFO_GATHER_P (stmtinfo_a)
          || STMT_VINFO_GATHER_P (stmtinfo_b))
          if (dump_enabled_p ())
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "versioning for alias not supported for: "
                               "can't determine dependence between ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
              dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");

      if (dump_enabled_p ())
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "versioning for alias required: "
                           "can't determine dependence between ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
          dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");

      /* Add to list of ddrs that need to be tested at run-time.  */
      return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);

  /* Known data dependence.  */
  if (DDR_NUM_DIST_VECTS (ddr) == 0)
      /* If user asserted safelen consecutive iterations can be
         executed concurrently, assume independence.  */
      if (loop->safelen >= 2)
          if (loop->safelen < *max_vf)
            *max_vf = loop->safelen;
          LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;

      if (STMT_VINFO_GATHER_P (stmtinfo_a)
          || STMT_VINFO_GATHER_P (stmtinfo_b))
          if (dump_enabled_p ())
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "versioning for alias not supported for: "
                               "bad dist vector for ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
              dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");

      if (dump_enabled_p ())
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "versioning for alias required: "
                           "bad dist vector for ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
          dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");

      /* Add to list of ddrs that need to be tested at run-time.  */
      return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);

  loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
  FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
      int dist = dist_v[loop_depth];

      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "dependence distance = %d.\n", dist);

          if (dump_enabled_p ())
              dump_printf_loc (MSG_NOTE, vect_location,
                               "dependence distance == 0 between ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
              dump_printf (MSG_NOTE, " and ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");

          /* When we perform grouped accesses and perform implicit CSE
             by detecting equal accesses and doing disambiguation with
             runtime alias tests like for
             where we will end up loading { a[i], a[i+1] } once, make
             sure that inserting group loads before the first load and
             stores after the last store will do the right thing.  */
          if ((STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
               && GROUP_SAME_DR_STMT (stmtinfo_a))
              || (STMT_VINFO_GROUPED_ACCESS (stmtinfo_b)
                  && GROUP_SAME_DR_STMT (stmtinfo_b)))
              earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
                   (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "READ_WRITE dependence in interleaving."

      if (dist > 0 && DDR_REVERSED_P (ddr))
          /* If DDR_REVERSED_P the order of the data-refs in DDR was
             reversed (to make distance vector positive), and the actual
             distance is negative.  */
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "dependence distance negative.\n");

          && abs (dist) < *max_vf)
          /* The dependence distance requires reduction of the maximal
             vectorization factor.  */
          *max_vf = abs (dist);
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "adjusting maximal vectorization factor to %i\n",

      if (abs (dist) >= *max_vf)
          /* Dependence distance does not create dependence, as far as
             vectorization is concerned, in this case.  */
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "dependence distance >= VF.\n");

      if (dump_enabled_p ())
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "not vectorized, possible dependence "
                           "between data-refs ");
          dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
          dump_printf (MSG_NOTE, " and ");
          dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
          dump_printf (MSG_NOTE, "\n");
/* Function vect_analyze_data_ref_dependences.

   Examine all the data references in the loop, and make sure there do not
   exist any data dependences between them.  Set *MAX_VF according to
   the maximum vectorization factor the data dependences allow.  */

vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
  struct data_dependence_relation *ddr;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "=== vect_analyze_data_ref_dependences ===\n");

  LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
  if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
                                &LOOP_VINFO_DDRS (loop_vinfo),
                                LOOP_VINFO_LOOP_NEST (loop_vinfo), true))

  FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
    if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
/* Function vect_slp_analyze_data_ref_dependence.

   Return TRUE if there (might) exist a dependence between a memory-reference
   DRA and a memory-reference DRB.  When versioning for alias may check a
   dependence at run-time, return FALSE.  Adjust *MAX_VF according to
   the data dependence.  */
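/* Illustrative example (not part of the original sources): in a basic block

     x = a[0];      <-- scalar load comes first
     a[0] = y;      <-- scalar store comes second

   the vectorized load is emitted before the first scalar load and the
   vectorized store after the last scalar store, so a read-write dependence
   in which the load precedes the store keeps its original order and does
   not block SLP vectorization; a write-write dependence, or a store that
   precedes the load, does.  */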
vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
  struct data_reference *dra = DDR_A (ddr);
  struct data_reference *drb = DDR_B (ddr);

  /* We need to check dependences of statements marked as unvectorizable
     as well, they still can prohibit vectorization.  */

  /* Independent data accesses.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_known)

  /* Read-read is OK.  */
  if (DR_IS_READ (dra) && DR_IS_READ (drb))

  /* If dra and drb are part of the same interleaving chain consider
     them independent.  */
  if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
      && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
          == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))

  /* Unknown data dependence.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
      if (dump_enabled_p ())
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "can't determine dependence between ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
          dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");

  else if (dump_enabled_p ())
      dump_printf_loc (MSG_NOTE, vect_location,
                       "determined dependence between ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
      dump_printf (MSG_NOTE, " and ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
      dump_printf (MSG_NOTE, "\n");

  /* We do not vectorize basic blocks with write-write dependencies.  */
  if (DR_IS_WRITE (dra) && DR_IS_WRITE (drb))

  /* If we have a read-write dependence check that the load is before the
     store.  When we vectorize basic blocks, vector load can be only before
     corresponding scalar load, and vector store can be only after its
     corresponding scalar store.  So the order of the accesses is preserved
     in case the load is before the store.  */
  gimple earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
  if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
      /* That only holds for load-store pairs taking part in vectorization.  */
      if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dra)))
          && STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (drb))))
/* Function vect_slp_analyze_data_ref_dependences.

   Examine all the data references in the basic-block, and make sure there
   do not exist any data dependences between them.  Set *MAX_VF according to
   the maximum vectorization factor the data dependences allow.  */

vect_slp_analyze_data_ref_dependences (bb_vec_info bb_vinfo)
  struct data_dependence_relation *ddr;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "=== vect_slp_analyze_data_ref_dependences ===\n");

  if (!compute_all_dependences (BB_VINFO_DATAREFS (bb_vinfo),
                                &BB_VINFO_DDRS (bb_vinfo),

  FOR_EACH_VEC_ELT (BB_VINFO_DDRS (bb_vinfo), i, ddr)
    if (vect_slp_analyze_data_ref_dependence (ddr))
/* Function vect_compute_data_ref_alignment

   Compute the misalignment of the data reference DR.

   1. If during the misalignment computation it is found that the data
      reference cannot be vectorized then false is returned.
   2. DR_MISALIGNMENT (DR) is defined.

   FOR NOW: No analysis is actually performed.  Misalignment is calculated
   only for trivial cases.  TODO.  */
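/* Worked example (illustrative, not part of the original sources): for a
   16-byte vector type and a reference whose address is BASE + 20 with BASE
   known to be 16-byte aligned, the misalignment is 20 mod 16 = 4 bytes, so
   DR_MISALIGNMENT (DR) is set to 4.  If nothing is known about the alignment
   of BASE, DR_MISALIGNMENT (DR) stays -1 (unknown).  */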
vect_compute_data_ref_alignment (struct data_reference *dr)
  gimple stmt = DR_STMT (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = NULL;
  tree ref = DR_REF (dr);
  tree base, base_addr;
  tree aligned_to, alignment;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "vect_compute_data_ref_alignment:\n");

    loop = LOOP_VINFO_LOOP (loop_vinfo);

  /* Initialize misalignment to unknown.  */
  SET_DR_MISALIGNMENT (dr, -1);

  /* Strided loads perform only component accesses, misalignment information
     is irrelevant for them.  */
  if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))

  misalign = DR_INIT (dr);
  aligned_to = DR_ALIGNED_TO (dr);
  base_addr = DR_BASE_ADDRESS (dr);
  vectype = STMT_VINFO_VECTYPE (stmt_info);

  /* In case the dataref is in an inner-loop of the loop that is being
     vectorized (LOOP), we use the base and misalignment information
     relative to the outer-loop (LOOP).  This is ok only if the misalignment
     stays the same throughout the execution of the inner-loop, which is why
     we have to check that the stride of the dataref in the inner-loop evenly
     divides by the vector size.  */
  if (loop && nested_in_vect_loop_p (loop, stmt))
      tree step = DR_STEP (dr);
      HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);

      if (dr_step % GET_MODE_SIZE (TYPE_MODE (vectype)) == 0)
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "inner step divides the vector-size.\n");
          misalign = STMT_VINFO_DR_INIT (stmt_info);
          aligned_to = STMT_VINFO_DR_ALIGNED_TO (stmt_info);
          base_addr = STMT_VINFO_DR_BASE_ADDRESS (stmt_info);

          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "inner step doesn't divide the vector-size.\n");
          misalign = NULL_TREE;

  /* Similarly, if we're doing basic-block vectorization, we can only use
     base and misalignment information relative to an innermost loop if the
     misalignment stays the same throughout the execution of the loop.
     As above, this is the case if the stride of the dataref evenly divides
     by the vector size.  */
      tree step = DR_STEP (dr);
      HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);

      if (dr_step % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0)
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "SLP: step doesn't divide the vector-size.\n");
          misalign = NULL_TREE;

  base = build_fold_indirect_ref (base_addr);
  alignment = ssize_int (TYPE_ALIGN (vectype)/BITS_PER_UNIT);

  if ((aligned_to && tree_int_cst_compare (aligned_to, alignment) < 0)
      if (dump_enabled_p ())
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "Unknown alignment for access: ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, base);
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");

      && tree_int_cst_compare (ssize_int (DECL_ALIGN_UNIT (base)),
      || (TREE_CODE (base_addr) == SSA_NAME
          && tree_int_cst_compare (ssize_int (TYPE_ALIGN_UNIT (TREE_TYPE (
                                                  TREE_TYPE (base_addr)))),
      || (get_pointer_alignment (base_addr) >= TYPE_ALIGN (vectype)))
    base_aligned = false;

      /* Do not change the alignment of global variables here if
         flag_section_anchors is enabled as we already generated
         RTL for other functions.  Most global variables should
         have been aligned during the IPA increase_alignment pass.  */
      if (!vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype))
          || (TREE_STATIC (base) && flag_section_anchors))
          if (dump_enabled_p ())
              dump_printf_loc (MSG_NOTE, vect_location,
                               "can't force alignment of ref: ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
              dump_printf (MSG_NOTE, "\n");

      /* Force the alignment of the decl.
         NOTE: This is the only change to the code we make during
         the analysis phase, before deciding to vectorize the loop.  */
      if (dump_enabled_p ())
          dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
          dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
          dump_printf (MSG_NOTE, "\n");

      ((dataref_aux *)dr->aux)->base_decl = base;
      ((dataref_aux *)dr->aux)->base_misaligned = true;

  /* If this is a backward running DR then first access in the larger
     vectype actually is N-1 elements before the address in the DR.
     Adjust misalign accordingly.  */
  if (tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0)
      tree offset = ssize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
      /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
         otherwise we wouldn't be here.  */
      offset = fold_build2 (MULT_EXPR, ssizetype, offset, DR_STEP (dr));
      /* PLUS because DR_STEP was negative.  */
      misalign = size_binop (PLUS_EXPR, misalign, offset);

  /* Modulo alignment.  */
  misalign = size_binop (FLOOR_MOD_EXPR, misalign, alignment);

  if (!tree_fits_uhwi_p (misalign))
      /* Negative or overflowed misalignment value.  */
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "unexpected misalign value\n");

  SET_DR_MISALIGNMENT (dr, tree_to_uhwi (misalign));

  if (dump_enabled_p ())
      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                       "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
/* Function vect_compute_data_refs_alignment

   Compute the misalignment of data references in the loop.
   Return FALSE if a data reference is found that cannot be vectorized.  */

vect_compute_data_refs_alignment (loop_vec_info loop_vinfo,
                                  bb_vec_info bb_vinfo)
  vec<data_reference_p> datarefs;
  struct data_reference *dr;

    datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
    datarefs = BB_VINFO_DATAREFS (bb_vinfo);

  FOR_EACH_VEC_ELT (datarefs, i, dr)
    if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
        && !vect_compute_data_ref_alignment (dr))
          /* Mark unsupported statement as unvectorizable.  */
          STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
/* Function vect_update_misalignment_for_peel

   DR - the data reference whose misalignment is to be adjusted.
   DR_PEEL - the data reference whose misalignment is being made
             zero in the vector loop by the peel.
   NPEEL - the number of iterations in the peel loop if the misalignment
           of DR_PEEL is known at compile time.  */
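/* Worked example (illustrative, not part of the original sources): with
   4-byte elements, a 16-byte vector type and NPEEL = 3, a DR that was
   misaligned by 4 bytes becomes (4 + 3*4) & 15 = 0, i.e. aligned, while a DR
   that was misaligned by 8 bytes becomes (8 + 3*4) & 15 = 4.  If either
   misalignment is unknown at compile time, the result is simply reset to -1
   (unknown).  */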
vect_update_misalignment_for_peel (struct data_reference *dr,
                                   struct data_reference *dr_peel, int npeel)
  vec<dr_p> same_align_drs;
  struct data_reference *current_dr;
  int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
  int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
  stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
  stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));

  /* For interleaved data accesses the step in the loop must be multiplied by
     the size of the interleaving group.  */
  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
  if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
    dr_peel_size *= GROUP_SIZE (peel_stmt_info);

  /* It can be assumed that the data refs with the same alignment as dr_peel
     are aligned in the vector loop.  */
    = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
  FOR_EACH_VEC_ELT (same_align_drs, i, current_dr)
      if (current_dr != dr)

      gcc_assert (DR_MISALIGNMENT (dr) / dr_size ==
                  DR_MISALIGNMENT (dr_peel) / dr_peel_size);
      SET_DR_MISALIGNMENT (dr, 0);

  if (known_alignment_for_access_p (dr)
      && known_alignment_for_access_p (dr_peel))
      bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
      int misal = DR_MISALIGNMENT (dr);
      tree vectype = STMT_VINFO_VECTYPE (stmt_info);
      misal += negative ? -npeel * dr_size : npeel * dr_size;
      misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
      SET_DR_MISALIGNMENT (dr, misal);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment to -1.\n");
  SET_DR_MISALIGNMENT (dr, -1);
/* Function vect_verify_datarefs_alignment

   Return TRUE if all data references in the loop can be
   handled with respect to alignment.  */

vect_verify_datarefs_alignment (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
  vec<data_reference_p> datarefs;
  struct data_reference *dr;
  enum dr_alignment_support supportable_dr_alignment;

    datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
    datarefs = BB_VINFO_DATAREFS (bb_vinfo);

  FOR_EACH_VEC_ELT (datarefs, i, dr)
      gimple stmt = DR_STMT (dr);
      stmt_vec_info stmt_info = vinfo_for_stmt (stmt);

      if (!STMT_VINFO_RELEVANT_P (stmt_info))

      /* For interleaving, only the alignment of the first access matters.
         Skip statements marked as not vectorizable.  */
      if ((STMT_VINFO_GROUPED_ACCESS (stmt_info)
           && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
          || !STMT_VINFO_VECTORIZABLE (stmt_info))

      /* Strided loads perform only component accesses, alignment is
         irrelevant for them.  */
      if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))

      supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
      if (!supportable_dr_alignment)
          if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "not vectorized: unsupported unaligned load."
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "not vectorized: unsupported unaligned "
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");

      if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "Vectorizing an unaligned access.\n");
/* Given a memory reference EXP return whether its alignment is less
   than its size.  */

not_size_aligned (tree exp)
  if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))

  return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
          > get_object_alignment (exp));
/* Function vector_alignment_reachable_p

   Return true if vector alignment for DR is reachable by peeling
   a few loop iterations.  Return false otherwise.  */
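/* Worked example (illustrative, not part of the original sources): for a
   V4SI access (4 elements of 4 bytes) that advances one element per
   iteration and is misaligned by 8 bytes, peeling (16 - 8) / 4 = 2 scalar
   iterations makes the access 16-byte aligned, so vector alignment is
   reachable.  If the misalignment were 6 bytes, no number of whole-element
   peels could reach alignment, because 6 is not a multiple of the element
   size.  */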
vector_alignment_reachable_p (struct data_reference *dr)
  gimple stmt = DR_STMT (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);

  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
      /* For interleaved access we peel only if number of iterations in
         the prolog loop ({VF - misalignment}), is a multiple of the
         number of the interleaved accesses.  */
      int elem_size, mis_in_elements;
      int nelements = TYPE_VECTOR_SUBPARTS (vectype);

      /* FORNOW: handle only known alignment.  */
      if (!known_alignment_for_access_p (dr))

      elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
      mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;

      if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))

  /* If misalignment is known at the compile time then allow peeling
     only if natural alignment is reachable through peeling.  */
  if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
      HOST_WIDE_INT elmsize =
                int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
      if (dump_enabled_p ())
          dump_printf_loc (MSG_NOTE, vect_location,
                           "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
          dump_printf (MSG_NOTE,
                       ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
      if (DR_MISALIGNMENT (dr) % elmsize)
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "data size does not divide the misalignment.\n");

  if (!known_alignment_for_access_p (dr))
      tree type = TREE_TYPE (DR_REF (dr));
      bool is_packed = not_size_aligned (DR_REF (dr));
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "Unknown misalignment, is_packed = %d\n", is_packed);
      if ((TYPE_USER_ALIGN (type) && !is_packed)
          || targetm.vectorize.vector_alignment_reachable (type, is_packed))
/* Calculate the cost of the memory access represented by DR.  */

vect_get_data_access_cost (struct data_reference *dr,
                           unsigned int *inside_cost,
                           unsigned int *outside_cost,
                           stmt_vector_for_cost *body_cost_vec)
  gimple stmt = DR_STMT (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  int ncopies = vf / nunits;

  if (DR_IS_READ (dr))
    vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
                        NULL, body_cost_vec, false);
    vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "vect_get_data_access_cost: inside_cost = %d, "
                     "outside_cost = %d.\n", *inside_cost, *outside_cost);
/* Insert DR into peeling hash table with NPEEL as key.  */

vect_peeling_hash_insert (loop_vec_info loop_vinfo, struct data_reference *dr,
  struct _vect_peel_info elem, *slot;
  _vect_peel_info **new_slot;
  bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);

  slot = LOOP_VINFO_PEELING_HTAB (loop_vinfo).find (&elem);
      slot = XNEW (struct _vect_peel_info);
      slot->npeel = npeel;
      new_slot = LOOP_VINFO_PEELING_HTAB (loop_vinfo).find_slot (slot, INSERT);

  if (!supportable_dr_alignment
      && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
    slot->count += VECT_MAX_COST;
/* Traverse peeling hash table to find peeling option that aligns maximum
   number of data accesses.  */

vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
                                     _vect_peel_extended_info *max)
  vect_peel_info elem = *slot;

  if (elem->count > max->peel_info.count
      || (elem->count == max->peel_info.count
          && max->peel_info.npeel > elem->npeel))
      max->peel_info.npeel = elem->npeel;
      max->peel_info.count = elem->count;
      max->peel_info.dr = elem->dr;
/* Traverse peeling hash table and calculate cost for each peeling option.
   Find the one with the lowest cost.  */

vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
                                   _vect_peel_extended_info *min)
  vect_peel_info elem = *slot;
  int save_misalignment, dummy;
  unsigned int inside_cost = 0, outside_cost = 0, i;
  gimple stmt = DR_STMT (elem->dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
  struct data_reference *dr;
  stmt_vector_for_cost prologue_cost_vec, body_cost_vec, epilogue_cost_vec;
  int single_iter_cost;

  prologue_cost_vec.create (2);
  body_cost_vec.create (2);
  epilogue_cost_vec.create (2);

  FOR_EACH_VEC_ELT (datarefs, i, dr)
      stmt = DR_STMT (dr);
      stmt_info = vinfo_for_stmt (stmt);
      /* For interleaving, only the alignment of the first access
         matters.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
          && GROUP_FIRST_ELEMENT (stmt_info) != stmt)

      save_misalignment = DR_MISALIGNMENT (dr);
      vect_update_misalignment_for_peel (dr, elem->dr, elem->npeel);
      vect_get_data_access_cost (dr, &inside_cost, &outside_cost,
      SET_DR_MISALIGNMENT (dr, save_misalignment);

  single_iter_cost = vect_get_single_scalar_iteration_cost (loop_vinfo);
  outside_cost += vect_get_known_peeling_cost (loop_vinfo, elem->npeel,
                                               &dummy, single_iter_cost,
                                               &epilogue_cost_vec);

  /* Prologue and epilogue costs are added to the target model later.
     These costs depend only on the scalar iteration cost, the
     number of peeling iterations finally chosen, and the number of
     misaligned statements.  So discard the information found here.  */
  prologue_cost_vec.release ();
  epilogue_cost_vec.release ();

  if (inside_cost < min->inside_cost
      || (inside_cost == min->inside_cost && outside_cost < min->outside_cost))
      min->inside_cost = inside_cost;
      min->outside_cost = outside_cost;
      min->body_cost_vec.release ();
      min->body_cost_vec = body_cost_vec;
      min->peel_info.dr = elem->dr;
      min->peel_info.npeel = elem->npeel;

    body_cost_vec.release ();
/* Choose best peeling option by traversing peeling hash table and either
   choosing an option with the lowest cost (if cost model is enabled) or the
   option that aligns as many accesses as possible.  */

static struct data_reference *
vect_peeling_hash_choose_best_peeling (loop_vec_info loop_vinfo,
                                       unsigned int *npeel,
                                       stmt_vector_for_cost *body_cost_vec)
  struct _vect_peel_extended_info res;

  res.peel_info.dr = NULL;
  res.body_cost_vec = stmt_vector_for_cost ();

  if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
      res.inside_cost = INT_MAX;
      res.outside_cost = INT_MAX;
      LOOP_VINFO_PEELING_HTAB (loop_vinfo)
          .traverse <_vect_peel_extended_info *,
                     vect_peeling_hash_get_lowest_cost> (&res);

      res.peel_info.count = 0;
      LOOP_VINFO_PEELING_HTAB (loop_vinfo)
          .traverse <_vect_peel_extended_info *,
                     vect_peeling_hash_get_most_frequent> (&res);

  *npeel = res.peel_info.npeel;
  *body_cost_vec = res.body_cost_vec;
  return res.peel_info.dr;
/* Function vect_enhance_data_refs_alignment

   This pass will use loop versioning and loop peeling in order to enhance
   the alignment of data references in the loop.

   FOR NOW: we assume that whatever versioning/peeling takes place, only the
   original loop is to be vectorized.  Any other loops that are created by
   the transformations performed in this pass - are not supposed to be
   vectorized.  This restriction will be relaxed.

   This pass will require a cost model to guide it whether to apply peeling
   or versioning or a combination of the two.  For example, the scheme that
   intel uses when given a loop with several memory accesses, is as follows:
   choose one memory access ('p') whose alignment you want to force by doing
   peeling.  Then, either (1) generate a loop in which 'p' is aligned and all
   other accesses are not necessarily aligned, or (2) use loop versioning to
   generate one loop in which all accesses are aligned, and another loop in
   which only 'p' is necessarily aligned.

   ("Automatic Intra-Register Vectorization for the Intel Architecture",
   Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
   Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)

   Devising a cost model is the most critical aspect of this work.  It will
   guide us on which access to peel for, whether to use loop versioning, how
   many versions to create, etc.  The cost model will probably consist of
   generic considerations as well as target specific considerations (on
   powerpc for example, misaligned stores are more painful than misaligned

   Here are the general steps involved in alignment enhancements:

     -- original loop, before alignment analysis:
	for (i=0; i<N; i++){
	  x = q[i];			# DR_MISALIGNMENT(q) = unknown
	  p[i] = y;			# DR_MISALIGNMENT(p) = unknown

     -- After vect_compute_data_refs_alignment:
	for (i=0; i<N; i++){
	  x = q[i];			# DR_MISALIGNMENT(q) = 3
	  p[i] = y;			# DR_MISALIGNMENT(p) = unknown

     -- Possibility 1: we do loop versioning:
	for (i=0; i<N; i++){	# loop 1A
	  x = q[i];			# DR_MISALIGNMENT(q) = 3
	  p[i] = y;			# DR_MISALIGNMENT(p) = 0

	for (i=0; i<N; i++){	# loop 1B
	  x = q[i];			# DR_MISALIGNMENT(q) = 3
	  p[i] = y;			# DR_MISALIGNMENT(p) = unaligned

     -- Possibility 2: we do loop peeling:
	for (i = 0; i < 3; i++){	# (scalar loop, not to be vectorized).
	for (i = 3; i < N; i++){	# loop 2A
	  x = q[i];			# DR_MISALIGNMENT(q) = 0
	  p[i] = y;			# DR_MISALIGNMENT(p) = unknown

     -- Possibility 3: combination of loop peeling and versioning:
	for (i = 0; i < 3; i++){	# (scalar loop, not to be vectorized).
	for (i = 3; i<N; i++){	# loop 3A
	  x = q[i];			# DR_MISALIGNMENT(q) = 0
	  p[i] = y;			# DR_MISALIGNMENT(p) = 0

	for (i = 3; i<N; i++){	# loop 3B
	  x = q[i];			# DR_MISALIGNMENT(q) = 0
	  p[i] = y;			# DR_MISALIGNMENT(p) = unaligned

   These loops are later passed to loop_transform to be vectorized.  The
   vectorizer will use the alignment information to guide the transformation
   (whether to generate regular loads/stores, or with special handling for
   misalignment).  */
vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
  vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  enum dr_alignment_support supportable_dr_alignment;
  struct data_reference *dr0 = NULL, *first_store = NULL;
  struct data_reference *dr;
  bool do_peeling = false;
  bool do_versioning = false;
  stmt_vec_info stmt_info;
  unsigned int npeel = 0;
  bool all_misalignments_unknown = true;
  unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  unsigned possible_npeel_number = 1;
  unsigned int nelements, mis, same_align_drs_max = 0;
  stmt_vector_for_cost body_cost_vec = stmt_vector_for_cost ();

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "=== vect_enhance_data_refs_alignment ===\n");

  /* While cost model enhancements are expected in the future, the high level
     view of the code at this time is as follows:

     A) If there is a misaligned access then see if peeling to align
        this access can make all data references satisfy
        vect_supportable_dr_alignment.  If so, update data structures
        as needed and return true.

     B) If peeling wasn't possible and there is a data reference with an
        unknown misalignment that does not satisfy vect_supportable_dr_alignment
        then see if loop versioning checks can be used to make all data
        references satisfy vect_supportable_dr_alignment.  If so, update
        data structures as needed and return true.

     C) If neither peeling nor versioning were successful then return false if
        any data reference does not satisfy vect_supportable_dr_alignment.

     D) Return true (all data references satisfy vect_supportable_dr_alignment).

     Note, Possibility 3 above (which is peeling and versioning together) is not
     being done at this time.  */

  /* (1) Peeling to force alignment.  */

  /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
     + How many accesses will become aligned due to the peeling
     - How many accesses will become unaligned due to the peeling,
       and the cost of misaligned accesses.
     - The cost of peeling (the extra runtime checks, the increase
       in code size).  */
  FOR_EACH_VEC_ELT (datarefs, i, dr)
      stmt = DR_STMT (dr);
      stmt_info = vinfo_for_stmt (stmt);

      if (!STMT_VINFO_RELEVANT_P (stmt_info))

      /* For interleaving, only the alignment of the first access
         matters.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
          && GROUP_FIRST_ELEMENT (stmt_info) != stmt)

      /* For invariant accesses there is nothing to enhance.  */
      if (integer_zerop (DR_STEP (dr)))

      /* Strided loads perform only component accesses, alignment is
         irrelevant for them.  */
      if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))

      supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
      do_peeling = vector_alignment_reachable_p (dr);

          if (known_alignment_for_access_p (dr))
              unsigned int npeel_tmp;
              bool negative = tree_int_cst_compare (DR_STEP (dr),
                                                    size_zero_node) < 0;

              /* Save info about DR in the hash table.  */
              if (!LOOP_VINFO_PEELING_HTAB (loop_vinfo).is_created ())
                LOOP_VINFO_PEELING_HTAB (loop_vinfo).create (1);

              vectype = STMT_VINFO_VECTYPE (stmt_info);
              nelements = TYPE_VECTOR_SUBPARTS (vectype);
              mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
                                                TREE_TYPE (DR_REF (dr))));
              npeel_tmp = (negative
                           ? (mis - nelements) : (nelements - mis))

              /* For multiple types, it is possible that the bigger type access
                 will have more than one peeling option.  E.g., a loop with two
                 types: one of size (vector size / 4), and the other one of
                 size (vector size / 8).  Vectorization factor will be 8.  If
                 both accesses are misaligned by 3, the first one needs one
                 scalar iteration to be aligned, and the second one needs 5.
                 But the first one will be aligned also by peeling 5 scalar
                 iterations, and in that case both accesses will be aligned.
                 Hence, except for the immediate peeling amount, we also want
                 to try to add full vector size, while we don't exceed
                 vectorization factor.
                 We do this automatically for cost model, since we calculate
                 cost for every peeling option.  */
              if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
                possible_npeel_number = vf / nelements;

              /* Handle the aligned case.  We may decide to align some other
                 access, making DR unaligned.  */
              if (DR_MISALIGNMENT (dr) == 0)
                  if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
                    possible_npeel_number++;

              for (j = 0; j < possible_npeel_number; j++)
                  gcc_assert (npeel_tmp <= vf);
                  vect_peeling_hash_insert (loop_vinfo, dr, npeel_tmp);
                  npeel_tmp += nelements;

              all_misalignments_unknown = false;
              /* Data-ref that was chosen for the case that all the
                 misalignments are unknown is not relevant anymore, since we
                 have a data-ref with known alignment.  */

              /* If we don't know any misalignment values, we prefer
                 peeling for data-ref that has the maximum number of data-refs
                 with the same alignment, unless the target prefers to align
                 stores over loads.  */
              if (all_misalignments_unknown)
                  unsigned same_align_drs
                    = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
                      || same_align_drs_max < same_align_drs)
                      same_align_drs_max = same_align_drs;
                  /* For data-refs with the same number of related
                     accesses prefer the one where the misalign
                     computation will be invariant in the outermost loop.  */
                  else if (same_align_drs_max == same_align_drs)
                      struct loop *ivloop0, *ivloop;
                      ivloop0 = outermost_invariant_loop_for_expr
                          (loop, DR_BASE_ADDRESS (dr0));
                      ivloop = outermost_invariant_loop_for_expr
                          (loop, DR_BASE_ADDRESS (dr));
                      if ((ivloop && !ivloop0)
                          || (ivloop && ivloop0
                              && flow_loop_nested_p (ivloop, ivloop0)))

                  if (!first_store && DR_IS_WRITE (dr))

              /* If there are both known and unknown misaligned accesses in the
                 loop, we choose peeling amount according to the known
                 misaligned accesses.  */
              if (!supportable_dr_alignment)
                  if (!first_store && DR_IS_WRITE (dr))

          if (!aligned_access_p (dr))
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "vector alignment may not be reachable\n");

  /* Check if we can possibly peel the loop.  */
  if (!vect_can_advance_ivs_p (loop_vinfo)
      || !slpeel_can_duplicate_loop_p (loop, single_exit (loop)))

  if (do_peeling && all_misalignments_unknown
      && vect_supportable_dr_alignment (dr0, false))
      /* Check if the target requires to prefer stores over loads, i.e., if
         misaligned stores are more expensive than misaligned loads (taking
         drs with same alignment into account).  */
      if (first_store && DR_IS_READ (dr0))
          unsigned int load_inside_cost = 0, load_outside_cost = 0;
          unsigned int store_inside_cost = 0, store_outside_cost = 0;
          unsigned int load_inside_penalty = 0, load_outside_penalty = 0;
          unsigned int store_inside_penalty = 0, store_outside_penalty = 0;
          stmt_vector_for_cost dummy;

          vect_get_data_access_cost (dr0, &load_inside_cost, &load_outside_cost,
          vect_get_data_access_cost (first_store, &store_inside_cost,
                                     &store_outside_cost, &dummy);

          /* Calculate the penalty for leaving FIRST_STORE unaligned (by
             aligning the load DR0).  */
          load_inside_penalty = store_inside_cost;
          load_outside_penalty = store_outside_cost;
               STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
                          DR_STMT (first_store))).iterate (i, &dr);
            if (DR_IS_READ (dr))
                load_inside_penalty += load_inside_cost;
                load_outside_penalty += load_outside_cost;
                load_inside_penalty += store_inside_cost;
                load_outside_penalty += store_outside_cost;

          /* Calculate the penalty for leaving DR0 unaligned (by
             aligning the FIRST_STORE).  */
          store_inside_penalty = load_inside_cost;
          store_outside_penalty = load_outside_cost;
               STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
                          DR_STMT (dr0))).iterate (i, &dr);
            if (DR_IS_READ (dr))
                store_inside_penalty += load_inside_cost;
                store_outside_penalty += load_outside_cost;
                store_inside_penalty += store_inside_cost;
                store_outside_penalty += store_outside_cost;

          if (load_inside_penalty > store_inside_penalty
              || (load_inside_penalty == store_inside_penalty
                  && load_outside_penalty > store_outside_penalty))

      /* In case there are only loads with different unknown misalignments, use
         peeling only if it may help to align other accesses in the loop.  */
          && !STMT_VINFO_SAME_ALIGN_REFS (
                  vinfo_for_stmt (DR_STMT (dr0))).length ()
          && vect_supportable_dr_alignment (dr0, false)
              != dr_unaligned_supported)
  if (do_peeling && !dr0)
      /* Peeling is possible, but there is no data access that is not supported
         unless aligned.  So we try to choose the best possible peeling.  */

      /* We should get here only if there are drs with known misalignment.  */
      gcc_assert (!all_misalignments_unknown);

      /* Choose the best peeling from the hash table.  */
      dr0 = vect_peeling_hash_choose_best_peeling (loop_vinfo, &npeel,

      stmt = DR_STMT (dr0);
      stmt_info = vinfo_for_stmt (stmt);
      vectype = STMT_VINFO_VECTYPE (stmt_info);
      nelements = TYPE_VECTOR_SUBPARTS (vectype);

      if (known_alignment_for_access_p (dr0))
          bool negative = tree_int_cst_compare (DR_STEP (dr0),
                                                size_zero_node) < 0;
              /* Since it's known at compile time, compute the number of
                 iterations in the peeled loop (the peeling factor) for use in
                 updating DR_MISALIGNMENT values.  The peeling factor is the
                 vectorization factor minus the misalignment as an element
                 count.  */
              mis = DR_MISALIGNMENT (dr0);
              mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
              npeel = ((negative ? mis - nelements : nelements - mis)

          /* For interleaved data access every iteration accesses all the
             members of the group, therefore we divide the number of iterations
             by the group size.  */
          stmt_info = vinfo_for_stmt (DR_STMT (dr0));
          if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
            npeel /= GROUP_SIZE (stmt_info);

          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "Try peeling by %d\n", npeel);

      /* Ensure that all data refs can be vectorized after the peel.  */
      FOR_EACH_VEC_ELT (datarefs, i, dr)
          int save_misalignment;

          stmt = DR_STMT (dr);
          stmt_info = vinfo_for_stmt (stmt);
          /* For interleaving, only the alignment of the first access
             matters.  */
          if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
              && GROUP_FIRST_ELEMENT (stmt_info) != stmt)

          /* Strided loads perform only component accesses, alignment is
             irrelevant for them.  */
          if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))

          save_misalignment = DR_MISALIGNMENT (dr);
          vect_update_misalignment_for_peel (dr, dr0, npeel);
          supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
          SET_DR_MISALIGNMENT (dr, save_misalignment);

          if (!supportable_dr_alignment)

      if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
          stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
            body_cost_vec.release ();

          unsigned max_allowed_peel
            = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
          if (max_allowed_peel != (unsigned)-1)
              unsigned max_peel = npeel;
                  gimple dr_stmt = DR_STMT (dr0);
                  stmt_vec_info vinfo = vinfo_for_stmt (dr_stmt);
                  tree vtype = STMT_VINFO_VECTYPE (vinfo);
                  max_peel = TYPE_VECTOR_SUBPARTS (vtype) - 1;
              if (max_peel > max_allowed_peel)
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_NOTE, vect_location,
                        "Disable peeling, max peels reached: %d\n", max_peel);

          stmt_info_for_cost *si;
          void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);

          /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
             If the misalignment of DR_i is identical to that of dr0 then set
             DR_MISALIGNMENT (DR_i) to zero.  If the misalignment of DR_i and
             dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
             by the peeling factor times the element size of DR_i (MOD the
             vectorization factor times the size).  Otherwise, the
             misalignment of DR_i must be set to unknown.  */
          FOR_EACH_VEC_ELT (datarefs, i, dr)
              vect_update_misalignment_for_peel (dr, dr0, npeel);

          LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
            LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
            LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
              = DR_MISALIGNMENT (dr0);
          SET_DR_MISALIGNMENT (dr0, 0);
          if (dump_enabled_p ())
              dump_printf_loc (MSG_NOTE, vect_location,
                               "Alignment of access forced using peeling.\n");
              dump_printf_loc (MSG_NOTE, vect_location,
                               "Peeling for alignment will be applied.\n");

          /* We've delayed passing the inside-loop peeling costs to the
             target cost model until we were sure peeling would happen.  */
          if (body_cost_vec.exists ())
              FOR_EACH_VEC_ELT (body_cost_vec, i, si)
                  struct _stmt_vec_info *stmt_info
                    = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
                  (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
                                        si->misalign, vect_body);
              body_cost_vec.release ();

          stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);

  body_cost_vec.release ();
  /* (2) Versioning to force alignment.  */

  /* Try versioning if:
     1) optimize loop for speed
     2) there is at least one unsupported misaligned data ref with an unknown
        misalignment, and
     3) all misaligned data refs with a known misalignment are supported, and
     4) the number of runtime alignment checks is within reason.  */

      optimize_loop_nest_for_speed_p (loop)
      && (!loop->inner); /* FORNOW */

      FOR_EACH_VEC_ELT (datarefs, i, dr)
          stmt = DR_STMT (dr);
          stmt_info = vinfo_for_stmt (stmt);

          /* For interleaving, only the alignment of the first access
             matters.  */
          if (aligned_access_p (dr)
              || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
                  && GROUP_FIRST_ELEMENT (stmt_info) != stmt))

          /* Strided loads perform only component accesses, alignment is
             irrelevant for them.  */
          if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))

          supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);

          if (!supportable_dr_alignment)
              if (known_alignment_for_access_p (dr)
                  || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
                     >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
                  do_versioning = false;

              stmt = DR_STMT (dr);
              vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
              gcc_assert (vectype);

              /* The rightmost bits of an aligned address must be zeros.
                 Construct the mask needed for this test.  For example,
                 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
                 mask must be 15 = 0xf.  */
              mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;

              /* FORNOW: use the same mask to test all potentially unaligned
                 references in the loop.  The vectorizer currently supports
                 a single vector size, see the reference to
                 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
                 vectorization factor is computed.  */
              gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
                          || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
              LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
              LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (

      /* Versioning requires at least one misaligned data reference.  */
      if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
        do_versioning = false;
      else if (!do_versioning)
        LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);

      vec<gimple> may_misalign_stmts
        = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);

      /* It can now be assumed that the data references in the statements
         in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
         of the loop being vectorized.  */
      FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
          stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
          dr = STMT_VINFO_DATA_REF (stmt_info);
          SET_DR_MISALIGNMENT (dr, 0);
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "Alignment of access forced using versioning.\n");

      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "Versioning for alignment will be applied.\n");

      /* Peeling and versioning can't be done together at this time.  */
      gcc_assert (! (do_peeling && do_versioning));

      stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);

  /* This point is reached if neither peeling nor versioning is being done.  */
  gcc_assert (! (do_peeling || do_versioning));

  stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
/* Function vect_find_same_alignment_drs.

   Update group and alignment relations according to the chosen
   vectorization factor.  */
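/* Illustrative example (not part of the original sources): with a
   vectorization factor of 4, 4-byte elements and a unit step, the references
   a[i] and a[i+4] have a dependence distance of 4 iterations, which is a
   multiple of the vectorization factor, so the two accesses always share the
   same misalignment and each is recorded in the other's
   STMT_VINFO_SAME_ALIGN_REFS list.  */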
1865 vect_find_same_alignment_drs (struct data_dependence_relation
*ddr
,
1866 loop_vec_info loop_vinfo
)
1869 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1870 int vectorization_factor
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
1871 struct data_reference
*dra
= DDR_A (ddr
);
1872 struct data_reference
*drb
= DDR_B (ddr
);
1873 stmt_vec_info stmtinfo_a
= vinfo_for_stmt (DR_STMT (dra
));
1874 stmt_vec_info stmtinfo_b
= vinfo_for_stmt (DR_STMT (drb
));
1875 int dra_size
= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra
))));
1876 int drb_size
= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb
))));
1877 lambda_vector dist_v
;
1878 unsigned int loop_depth
;
1880 if (DDR_ARE_DEPENDENT (ddr
) == chrec_known
)
1886 if (DDR_ARE_DEPENDENT (ddr
) == chrec_dont_know
)
1889 /* Loop-based vectorization and known data dependence. */
1890 if (DDR_NUM_DIST_VECTS (ddr
) == 0)
1893 /* Data-dependence analysis reports a distance vector of zero
1894 for data-references that overlap only in the first iteration
1895 but have different sign step (see PR45764).
1896 So as a sanity check require equal DR_STEP. */
1897 if (!operand_equal_p (DR_STEP (dra
), DR_STEP (drb
), 0))
1900 loop_depth
= index_in_loop_nest (loop
->num
, DDR_LOOP_NEST (ddr
));
1901 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr
), i
, dist_v
)
1903 int dist
= dist_v
[loop_depth
];
1905 if (dump_enabled_p ())
1906 dump_printf_loc (MSG_NOTE
, vect_location
,
1907 "dependence distance = %d.\n", dist
);
1909 /* Same loop iteration. */
1911 || (dist
% vectorization_factor
== 0 && dra_size
== drb_size
))
1913 /* Two references with distance zero have the same alignment. */
1914 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a
).safe_push (drb
);
1915 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b
).safe_push (dra
);
1916 if (dump_enabled_p ())
1918 dump_printf_loc (MSG_NOTE
, vect_location
,
1919 "accesses have the same alignment.\n");
1920 dump_printf (MSG_NOTE
,
1921 "dependence distance modulo vf == 0 between ");
1922 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dra
));
1923 dump_printf (MSG_NOTE
, " and ");
1924 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (drb
));
1925 dump_printf (MSG_NOTE
, "\n");
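
/* Illustrative example (array names are made up): with a vectorization
   factor of 4 and 4-byte elements, the two accesses to A in

       for (i = 0; i < n; i++)
         {
           a[i] = x[i];         <-- dra
           y[i] = a[i + 8];     <-- drb
         }

   have dependence distance 8.  Since 8 % 4 == 0 and the element sizes
   match, each statement is pushed onto the other's
   STMT_VINFO_SAME_ALIGN_REFS list: a[i] and a[i + 8] are 32 bytes apart
   and therefore share the same misalignment with respect to the vector
   size.  */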
/* Function vect_analyze_data_refs_alignment

   Analyze the alignment of the data-references in the loop.
   Return FALSE if a data reference is found that cannot be vectorized.  */

bool
vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo,
                                  bb_vec_info bb_vinfo)
{
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "=== vect_analyze_data_refs_alignment ===\n");

  /* Mark groups of data references with same alignment using
     data dependence information.  */
  if (loop_vinfo)
    {
      vec<ddr_p> ddrs = LOOP_VINFO_DDRS (loop_vinfo);
      struct data_dependence_relation *ddr;
      unsigned int i;

      FOR_EACH_VEC_ELT (ddrs, i, ddr)
        vect_find_same_alignment_drs (ddr, loop_vinfo);
    }

  if (!vect_compute_data_refs_alignment (loop_vinfo, bb_vinfo))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: can't calculate alignment "
                         "for data ref.\n");
      return false;
    }

  return true;
}
/* Analyze groups of accesses: check that DR belongs to a group of
   accesses of legal size, step, etc.  Detect gaps, single element
   interleaving, and other special cases.  Set grouped access info.
   Collect groups of strided stores for further use in SLP analysis.  */

static bool
vect_analyze_group_access (struct data_reference *dr)
{
  tree step = DR_STEP (dr);
  tree scalar_type = TREE_TYPE (DR_REF (dr));
  HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
  gimple stmt = DR_STMT (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
  HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
  HOST_WIDE_INT groupsize, last_accessed_element = 1;
  bool slp_impossible = false;
  struct loop *loop = NULL;

  if (loop_vinfo)
    loop = LOOP_VINFO_LOOP (loop_vinfo);

  /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
     size of the interleaving group (including gaps).  */
  groupsize = absu_hwi (dr_step) / type_size;

  /* Not consecutive access is possible only if it is a part of interleaving.  */
  if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    {
      /* Check if this DR is a part of interleaving, and is a single
         element of the group that is accessed in the loop.  */

      /* Gaps are supported only for loads.  STEP must be a multiple of the
         type size.  The size of the group must be a power of 2.  */
      if (DR_IS_READ (dr)
          && (dr_step % type_size) == 0
          && exact_log2 (groupsize) != -1)
        {
          GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
          GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_NOTE, vect_location,
                               "Detected single element interleaving ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
              dump_printf (MSG_NOTE, " step ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
              dump_printf (MSG_NOTE, "\n");
            }

          if (loop_vinfo)
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_NOTE, vect_location,
                                 "Data access with gaps requires scalar "
                                 "epilogue loop\n");
              if (loop->inner)
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "Peeling for outer loop is not"
                                     " supported\n");
                  return false;
                }

              LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
            }

          return true;
        }

      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "not consecutive access ");
          dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }

      if (bb_vinfo)
        {
          /* Mark the statement as unvectorizable.  */
          STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
          return true;
        }

      return false;
    }

  if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
    {
      /* First stmt in the interleaving chain. Check the chain.  */
      gimple next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
      struct data_reference *data_ref = dr;
      unsigned int count = 1;
      tree prev_init = DR_INIT (data_ref);
      gimple prev = stmt;
      HOST_WIDE_INT diff, gaps = 0;
      unsigned HOST_WIDE_INT count_in_bytes;

      while (next)
        {
          /* Skip same data-refs.  In case that two or more stmts share
             data-ref (supported only for loads), we vectorize only the first
             stmt, and the rest get their vectorized loads from the first
             one.  */
          if (!tree_int_cst_compare (DR_INIT (data_ref),
                                     DR_INIT (STMT_VINFO_DATA_REF (
                                                   vinfo_for_stmt (next)))))
            {
              if (DR_IS_WRITE (data_ref))
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "Two store stmts share the same dr.\n");
                  return false;
                }

              /* For load use the same data-ref load.  */
              GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;

              prev = next;
              next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
              continue;
            }

          prev = next;
          data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));

          /* All group members have the same STEP by construction.  */
          gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));

          /* Check that the distance between two accesses is equal to the type
             size.  Otherwise, we have gaps.  */
          diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
                  - TREE_INT_CST_LOW (prev_init)) / type_size;
          if (diff != 1)
            {
              /* FORNOW: SLP of accesses with gaps is not supported.  */
              slp_impossible = true;
              if (DR_IS_WRITE (data_ref))
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "interleaved store with gaps\n");
                  return false;
                }

              gaps += diff - 1;
            }

          last_accessed_element += diff;

          /* Store the gap from the previous member of the group.  If there is
             no gap in the access, GROUP_GAP is always 1.  */
          GROUP_GAP (vinfo_for_stmt (next)) = diff;

          prev_init = DR_INIT (data_ref);
          next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
          /* Count the number of data-refs in the chain.  */
          count++;
        }

      /* COUNT is the number of accesses found, we multiply it by the size of
         the type to get COUNT_IN_BYTES.  */
      count_in_bytes = type_size * count;

      /* Check that the size of the interleaving (including gaps) is not
         greater than STEP.  */
      if (dr_step != 0
          && absu_hwi (dr_step) < count_in_bytes + gaps * type_size)
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "interleaving size is greater than step for ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                 DR_REF (dr));
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }
          return false;
        }

      /* Check that the size of the interleaving is equal to STEP for stores,
         i.e., that there are no gaps.  */
      if (dr_step != 0
          && absu_hwi (dr_step) != count_in_bytes)
        {
          if (DR_IS_READ (dr))
            {
              slp_impossible = true;
              /* There is a gap after the last load in the group.  This gap is
                 a difference between the groupsize and the number of elements.
                 When there is no gap, this difference should be 0.  */
              GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - count;
            }
          else
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "interleaved store with gaps\n");
              return false;
            }
        }

      /* Check that STEP is a multiple of type size.  */
      if (dr_step != 0
          && (dr_step % type_size) != 0)
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "step is not a multiple of type size: step ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, step);
              dump_printf (MSG_MISSED_OPTIMIZATION, " size ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                 TYPE_SIZE_UNIT (scalar_type));
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }
          return false;
        }

      GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "Detected interleaving of size %d\n", (int)groupsize);

      /* SLP: create an SLP data structure for every interleaving group of
         stores for further analysis in vect_analyze_slp.  */
      if (DR_IS_WRITE (dr) && !slp_impossible)
        {
          if (loop_vinfo)
            LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
          if (bb_vinfo)
            BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
        }

      /* There is a gap in the end of the group.  */
      if (groupsize - last_accessed_element > 0 && loop_vinfo)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "Data access with gaps requires scalar "
                             "epilogue loop\n");
          if (loop->inner)
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Peeling for outer loop is not supported\n");
              return false;
            }

          LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
        }
    }

  return true;
}
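
/* Illustrative example (pointer names are made up): the two stores in

       for (i = 0; i < n; i++)
         {
           p[2 * i]     = x;    <-- GROUP_FIRST_ELEMENT of the chain
           p[2 * i + 1] = y;
         }

   form one interleaving group: DR_STEP is 2 * sizeof (*p), so GROUPSIZE
   is 2, the chain walk finds DIFF == 1 between the members (no gaps),
   and the group is recorded in LOOP_VINFO_GROUPED_STORES (or
   BB_VINFO_GROUPED_STORES) for later SLP analysis.  A lone load such as
   ... = p[4 * i] is instead accepted as single element interleaving
   (GROUPSIZE 4, a power of two), which requires a scalar epilogue via
   LOOP_VINFO_PEELING_FOR_GAPS.  */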
/* Analyze the access pattern of the data-reference DR.
   In case of non-consecutive accesses call vect_analyze_group_access() to
   analyze groups of accesses.  */

static bool
vect_analyze_data_ref_access (struct data_reference *dr)
{
  tree step = DR_STEP (dr);
  tree scalar_type = TREE_TYPE (DR_REF (dr));
  gimple stmt = DR_STMT (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = NULL;

  if (loop_vinfo)
    loop = LOOP_VINFO_LOOP (loop_vinfo);

  if (loop_vinfo && !step)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "bad data-ref access in loop\n");
      return false;
    }

  /* Allow invariant loads in not nested loops.  */
  if (loop_vinfo && integer_zerop (step))
    {
      GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
      if (nested_in_vect_loop_p (loop, stmt))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "zero step in inner loop of nest\n");
          return false;
        }
      return DR_IS_READ (dr);
    }

  if (loop && nested_in_vect_loop_p (loop, stmt))
    {
      /* Interleaved accesses are not yet supported within outer-loop
         vectorization for references in the inner-loop.  */
      GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;

      /* For the rest of the analysis we use the outer-loop step.  */
      step = STMT_VINFO_DR_STEP (stmt_info);
      if (integer_zerop (step))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "zero step in outer loop.\n");
          if (DR_IS_READ (dr))
            return true;
          else
            return false;
        }
    }

  if (TREE_CODE (step) == INTEGER_CST)
    {
      HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
      if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
          || (dr_step < 0
              && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
        {
          /* Mark that it is not interleaving.  */
          GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
          return true;
        }
    }

  if (loop && nested_in_vect_loop_p (loop, stmt))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "grouped access in outer loop.\n");
      return false;
    }

  /* Assume this is a DR handled by non-constant strided load case.  */
  if (TREE_CODE (step) != INTEGER_CST)
    return STMT_VINFO_STRIDE_LOAD_P (stmt_info);

  /* Not consecutive access - check if it's a part of interleaving group.  */
  return vect_analyze_group_access (dr);
}
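
/* Illustrative example (names are made up) of the classification done by
   vect_analyze_data_ref_access for int arrays A and B:

       a[i] = ...;          DR_STEP == sizeof (int): consecutive, accepted.
       ... = b[4 * i];      DR_STEP == 4 * sizeof (int): handed to
                            vect_analyze_group_access.
       ... = b[i * n];      DR_STEP is not an INTEGER_CST: accepted only
                            when STMT_VINFO_STRIDE_LOAD_P is set.

   Loads with a zero step (loop invariants) are also accepted as long as
   the statement is not in an inner loop of the loop nest.  */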
/* A helper function used in the comparator function to sort data
   references.  T1 and T2 are two data references to be compared.
   The function returns -1, 0, or 1.  */

static int
compare_tree (tree t1, tree t2)
{
  int i, cmp;
  enum tree_code code;
  char tclass;

  if (t1 == t2)
    return 0;
  if (t1 == NULL)
    return -1;
  if (t2 == NULL)
    return 1;

  if (TREE_CODE (t1) != TREE_CODE (t2))
    return TREE_CODE (t1) < TREE_CODE (t2) ? -1 : 1;

  code = TREE_CODE (t1);
  switch (code)
    {
    /* For const values, we can just use hash values for comparisons.  */
    case INTEGER_CST:
    case FIXED_CST:
    case STRING_CST:
    case REAL_CST:
    case COMPLEX_CST:
    case VECTOR_CST:
      {
        hashval_t h1 = iterative_hash_expr (t1, 0);
        hashval_t h2 = iterative_hash_expr (t2, 0);
        if (h1 != h2)
          return h1 < h2 ? -1 : 1;
        break;
      }

    case SSA_NAME:
      cmp = compare_tree (SSA_NAME_VAR (t1), SSA_NAME_VAR (t2));
      if (cmp != 0)
        return cmp;

      if (SSA_NAME_VERSION (t1) != SSA_NAME_VERSION (t2))
        return SSA_NAME_VERSION (t1) < SSA_NAME_VERSION (t2) ? -1 : 1;
      break;

    default:
      tclass = TREE_CODE_CLASS (code);

      /* For var-decl, we could compare their UIDs.  */
      if (tclass == tcc_declaration)
        {
          if (DECL_UID (t1) != DECL_UID (t2))
            return DECL_UID (t1) < DECL_UID (t2) ? -1 : 1;
          break;
        }

      /* For expressions with operands, compare their operands recursively.  */
      for (i = TREE_OPERAND_LENGTH (t1) - 1; i >= 0; --i)
        {
          cmp = compare_tree (TREE_OPERAND (t1, i), TREE_OPERAND (t2, i));
          if (cmp != 0)
            return cmp;
        }
    }

  return 0;
}
/* Compare two data-references DRA and DRB to group them into chunks
   suitable for grouping.  */

static int
dr_group_sort_cmp (const void *dra_, const void *drb_)
{
  data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
  data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
  int cmp;

  /* Stabilize sort.  */
  if (dra == drb)
    return 0;

  /* Ordering of DRs according to base.  */
  if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0))
    {
      cmp = compare_tree (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb));
      if (cmp != 0)
        return cmp;
    }

  /* And according to DR_OFFSET.  */
  if (!dr_equal_offsets_p (dra, drb))
    {
      cmp = compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
      if (cmp != 0)
        return cmp;
    }

  /* Put reads before writes.  */
  if (DR_IS_READ (dra) != DR_IS_READ (drb))
    return DR_IS_READ (dra) ? -1 : 1;

  /* Then sort after access size.  */
  if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
                        TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0))
    {
      cmp = compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
                          TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
      if (cmp != 0)
        return cmp;
    }

  /* And after step.  */
  if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
    {
      cmp = compare_tree (DR_STEP (dra), DR_STEP (drb));
      if (cmp != 0)
        return cmp;
    }

  /* Then sort after DR_INIT.  In case of identical DRs sort after stmt UID.  */
  cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb));
  if (cmp == 0)
    return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
  return cmp;
}
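
/* Illustrative example (array names are made up): for two reads of a[i]
   and a[i + 1], base address, offset, access size and step all compare
   equal, so the comparator above falls through to DR_INIT and orders
   a[i] before a[i + 1].  A read and a write of the very same location
   are instead ordered read-first, before DR_INIT is even considered.  */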
/* Function vect_analyze_data_ref_accesses.

   Analyze the access pattern of all the data references in the loop.

   FORNOW: the only access pattern that is considered vectorizable is a
           simple step 1 (consecutive) access.

   FORNOW: handle only arrays and pointer accesses.  */

bool
vect_analyze_data_ref_accesses (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
{
  unsigned int i;
  vec<data_reference_p> datarefs;
  struct data_reference *dr;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "=== vect_analyze_data_ref_accesses ===\n");

  if (loop_vinfo)
    datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
  else
    datarefs = BB_VINFO_DATAREFS (bb_vinfo);

  if (datarefs.is_empty ())
    return true;

  /* Sort the array of datarefs to make building the interleaving chains
     linear.  */
  qsort (datarefs.address (), datarefs.length (),
         sizeof (data_reference_p), dr_group_sort_cmp);

  /* Build the interleaving chains.  */
  for (i = 0; i < datarefs.length () - 1;)
    {
      data_reference_p dra = datarefs[i];
      stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
      stmt_vec_info lastinfo = NULL;
      for (i = i + 1; i < datarefs.length (); ++i)
        {
          data_reference_p drb = datarefs[i];
          stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));

          /* ???  Imperfect sorting (non-compatible types, non-modulo
             accesses, same accesses) can lead to a group to be artificially
             split here as we don't just skip over those.  If it really
             matters we can push those to a worklist and re-iterate
             over them.  Then we can just skip ahead to the next DR here.  */

          /* Check that the data-refs have same first location (except init)
             and they are both either store or load (not load and store).  */
          if (DR_IS_READ (dra) != DR_IS_READ (drb)
              || !operand_equal_p (DR_BASE_ADDRESS (dra),
                                   DR_BASE_ADDRESS (drb), 0)
              || !dr_equal_offsets_p (dra, drb))
            break;

          /* Check that the data-refs have the same constant size and step.  */
          tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
          tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
          if (!tree_fits_uhwi_p (sza)
              || !tree_fits_uhwi_p (szb)
              || !tree_int_cst_equal (sza, szb)
              || !tree_fits_shwi_p (DR_STEP (dra))
              || !tree_fits_shwi_p (DR_STEP (drb))
              || !tree_int_cst_equal (DR_STEP (dra), DR_STEP (drb)))
            break;

          /* Do not place the same access in the interleaving chain twice.  */
          if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0)
            break;

          /* Check the types are compatible.
             ???  We don't distinguish this during sorting.  */
          if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
                                   TREE_TYPE (DR_REF (drb))))
            break;

          /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb).  */
          HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
          HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
          gcc_assert (init_a < init_b);

          /* If init_b == init_a + the size of the type * k, we have an
             interleaving, and DRA is accessed before DRB.  */
          HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
          if ((init_b - init_a) % type_size_a != 0)
            break;

          /* The step (if not zero) is greater than the difference between
             data-refs' inits.  This splits groups into suitable sizes.  */
          HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
          if (step != 0 && step <= (init_b - init_a))
            break;

          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_NOTE, vect_location,
                               "Detected interleaving ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
              dump_printf (MSG_NOTE, " and ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
              dump_printf (MSG_NOTE, "\n");
            }

          /* Link the found element into the group list.  */
          if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
            {
              GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
              lastinfo = stmtinfo_a;
            }
          GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
          GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
          lastinfo = stmtinfo_b;
        }
    }

  FOR_EACH_VEC_ELT (datarefs, i, dr)
    if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
        && !vect_analyze_data_ref_access (dr))
      {
        if (dump_enabled_p ())
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "not vectorized: complicated access pattern.\n");

        if (bb_vinfo)
          {
            /* Mark the statement as not vectorizable.  */
            STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
            continue;
          }
        else
          return false;
      }

  return true;
}
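
/* Illustrative example (array names are made up) of the group-splitting
   check above, for an int array accessed as

       ... = a[3 * i];
       ... = a[3 * i + 1];
       ... = a[3 * i + 12];

   The first two references are chained (the difference of their DR_INITs,
   4 bytes, is smaller than the 12-byte step), but the third is not: the
   step is less than or equal to the 48-byte difference of the inits, so
   the inner loop breaks and the third reference starts a chain of its
   own.  */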
/* Operator == between two dr_with_seg_len objects.

   This equality operator is used to make sure two data refs
   are the same one so that we will consider combining the
   aliasing checks of those two pairs of data dependent data
   refs.  */

static bool
operator == (const dr_with_seg_len& d1,
             const dr_with_seg_len& d2)
{
  return operand_equal_p (DR_BASE_ADDRESS (d1.dr),
                          DR_BASE_ADDRESS (d2.dr), 0)
           && compare_tree (d1.offset, d2.offset) == 0
           && compare_tree (d1.seg_len, d2.seg_len) == 0;
}

/* Function comp_dr_with_seg_len_pair.

   Comparison function for sorting objects of dr_with_seg_len_pair_t
   so that we can combine aliasing checks in one scan.  */

static int
comp_dr_with_seg_len_pair (const void *p1_, const void *p2_)
{
  const dr_with_seg_len_pair_t* p1 = (const dr_with_seg_len_pair_t *) p1_;
  const dr_with_seg_len_pair_t* p2 = (const dr_with_seg_len_pair_t *) p2_;

  const dr_with_seg_len &p11 = p1->first,
                        &p12 = p1->second,
                        &p21 = p2->first,
                        &p22 = p2->second;

  /* For DR pairs (a, b) and (c, d), we only consider merging the alias checks
     if a and c have the same basic address and step, and b and d have the same
     address and step.  Therefore, if any a&c or b&d don't have the same address
     and step, we don't care about the order of those two pairs after sorting.  */
  int comp_res;

  if ((comp_res = compare_tree (DR_BASE_ADDRESS (p11.dr),
                                DR_BASE_ADDRESS (p21.dr))) != 0)
    return comp_res;
  if ((comp_res = compare_tree (DR_BASE_ADDRESS (p12.dr),
                                DR_BASE_ADDRESS (p22.dr))) != 0)
    return comp_res;
  if ((comp_res = compare_tree (DR_STEP (p11.dr), DR_STEP (p21.dr))) != 0)
    return comp_res;
  if ((comp_res = compare_tree (DR_STEP (p12.dr), DR_STEP (p22.dr))) != 0)
    return comp_res;
  if ((comp_res = compare_tree (p11.offset, p21.offset)) != 0)
    return comp_res;
  if ((comp_res = compare_tree (p12.offset, p22.offset)) != 0)
    return comp_res;

  return 0;
}

/* A generic swap helper, used when combining alias check pairs below.  */

template <class T> static void
swap (T& a, T& b)
{
  T c = a;
  a = b;
  b = c;
}
/* Function vect_vfa_segment_size.

   Create an expression that computes the size of the segment
   that will be accessed for a data reference.  The function takes into
   account that realignment loads may access one more vector.

   Input:
     DR: The data reference.
     LENGTH_FACTOR: segment length to consider.

   Return an expression whose value is the size of segment which will be
   accessed by DR.  */

static tree
vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
{
  tree segment_length;

  if (integer_zerop (DR_STEP (dr)))
    segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
  else
    segment_length = size_binop (MULT_EXPR,
                                 fold_convert (sizetype, DR_STEP (dr)),
                                 fold_convert (sizetype, length_factor));

  if (vect_supportable_dr_alignment (dr, false)
        == dr_explicit_realign_optimized)
    {
      tree vector_size = TYPE_SIZE_UNIT
                          (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));

      segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
    }
  return segment_length;
}
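
/* Illustrative example: for a data reference with a DR_STEP of 4 bytes
   and LENGTH_FACTOR equal to a vectorization factor of 8, the segment
   tested at runtime covers 4 * 8 == 32 bytes, plus one extra vector if
   the access is handled by dr_explicit_realign_optimized.  */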
/* Function vect_prune_runtime_alias_test_list.

   Prune a list of ddrs to be tested at run-time by versioning for alias.
   Merge several alias checks into one if possible.
   Return FALSE if resulting list of ddrs is longer than allowed by
   PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE.  */

bool
vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
{
  vec<ddr_p> may_alias_ddrs =
    LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
  vec<dr_with_seg_len_pair_t>& comp_alias_ddrs =
    LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
  int vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "=== vect_prune_runtime_alias_test_list ===\n");

  if (may_alias_ddrs.is_empty ())
    return true;

  /* Basically, for each pair of dependent data refs store_ptr_0
     and load_ptr_0, we create an expression:

     ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
     || (load_ptr_0 + load_segment_length_0) <= store_ptr_0))

     for aliasing checks.  However, in some cases we can decrease
     the number of checks by combining two checks into one.  For
     example, suppose we have another pair of data refs store_ptr_0
     and load_ptr_1, and if the following condition is satisfied:

     load_ptr_0 < load_ptr_1  &&
     load_ptr_1 - load_ptr_0 - load_segment_length_0 < store_segment_length_0

     (this condition means, in each iteration of the vectorized loop,
     the accessed memory of store_ptr_0 cannot be between the memory
     of load_ptr_0 and load_ptr_1.)

     we then can use only the following expression to finish the
     aliasing checks between store_ptr_0 & load_ptr_0 and
     store_ptr_0 & load_ptr_1:

     ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
     || (load_ptr_1 + load_segment_length_1 <= store_ptr_0))

     Note that we only consider that load_ptr_0 and load_ptr_1 have the
     same basic address.  */

  comp_alias_ddrs.create (may_alias_ddrs.length ());
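
/* Worked example for the comment above (byte offsets are made up): if
   load_ptr_1 == load_ptr_0 + 16 and load_segment_length_0 == 32, then
   load_ptr_1 - load_ptr_0 - load_segment_length_0 == -16, which is below
   any store_segment_length_0, so the two pairs that share store_ptr_0
   are merged below and only the combined check

       ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
       || (load_ptr_1 + load_segment_length_1 <= store_ptr_0)

   is emitted.  */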
2757 /* First, we collect all data ref pairs for aliasing checks. */
2758 FOR_EACH_VEC_ELT (may_alias_ddrs
, i
, ddr
)
2760 struct data_reference
*dr_a
, *dr_b
;
2761 gimple dr_group_first_a
, dr_group_first_b
;
2762 tree segment_length_a
, segment_length_b
;
2763 gimple stmt_a
, stmt_b
;
2766 stmt_a
= DR_STMT (DDR_A (ddr
));
2767 dr_group_first_a
= GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a
));
2768 if (dr_group_first_a
)
2770 stmt_a
= dr_group_first_a
;
2771 dr_a
= STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a
));
2775 stmt_b
= DR_STMT (DDR_B (ddr
));
2776 dr_group_first_b
= GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b
));
2777 if (dr_group_first_b
)
2779 stmt_b
= dr_group_first_b
;
2780 dr_b
= STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b
));
2783 if (!operand_equal_p (DR_STEP (dr_a
), DR_STEP (dr_b
), 0))
2784 length_factor
= scalar_loop_iters
;
2786 length_factor
= size_int (vect_factor
);
2787 segment_length_a
= vect_vfa_segment_size (dr_a
, length_factor
);
2788 segment_length_b
= vect_vfa_segment_size (dr_b
, length_factor
);
2790 dr_with_seg_len_pair_t dr_with_seg_len_pair
2791 (dr_with_seg_len (dr_a
, segment_length_a
),
2792 dr_with_seg_len (dr_b
, segment_length_b
));
2794 if (compare_tree (DR_BASE_ADDRESS (dr_a
), DR_BASE_ADDRESS (dr_b
)) > 0)
2795 swap (dr_with_seg_len_pair
.first
, dr_with_seg_len_pair
.second
);
2797 comp_alias_ddrs
.safe_push (dr_with_seg_len_pair
);
2800 /* Second, we sort the collected data ref pairs so that we can scan
2801 them once to combine all possible aliasing checks. */
2802 comp_alias_ddrs
.qsort (comp_dr_with_seg_len_pair
);
2804 /* Third, we scan the sorted dr pairs and check if we can combine
2805 alias checks of two neighbouring dr pairs. */
2806 for (size_t i
= 1; i
< comp_alias_ddrs
.length (); ++i
)
2808 /* Deal with two ddrs (dr_a1, dr_b1) and (dr_a2, dr_b2). */
2809 dr_with_seg_len
*dr_a1
= &comp_alias_ddrs
[i
-1].first
,
2810 *dr_b1
= &comp_alias_ddrs
[i
-1].second
,
2811 *dr_a2
= &comp_alias_ddrs
[i
].first
,
2812 *dr_b2
= &comp_alias_ddrs
[i
].second
;
2814 /* Remove duplicate data ref pairs. */
2815 if (*dr_a1
== *dr_a2
&& *dr_b1
== *dr_b2
)
2817 if (dump_enabled_p ())
2819 dump_printf_loc (MSG_NOTE
, vect_location
,
2820 "found equal ranges ");
2821 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
2822 DR_REF (dr_a1
->dr
));
2823 dump_printf (MSG_NOTE
, ", ");
2824 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
2825 DR_REF (dr_b1
->dr
));
2826 dump_printf (MSG_NOTE
, " and ");
2827 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
2828 DR_REF (dr_a2
->dr
));
2829 dump_printf (MSG_NOTE
, ", ");
2830 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
2831 DR_REF (dr_b2
->dr
));
2832 dump_printf (MSG_NOTE
, "\n");
2835 comp_alias_ddrs
.ordered_remove (i
--);
2839 if (*dr_a1
== *dr_a2
|| *dr_b1
== *dr_b2
)
2841 /* We consider the case that DR_B1 and DR_B2 are same memrefs,
2842 and DR_A1 and DR_A2 are two consecutive memrefs. */
2843 if (*dr_a1
== *dr_a2
)
2845 swap (dr_a1
, dr_b1
);
2846 swap (dr_a2
, dr_b2
);
2849 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a1
->dr
),
2850 DR_BASE_ADDRESS (dr_a2
->dr
),
2852 || !tree_fits_shwi_p (dr_a1
->offset
)
2853 || !tree_fits_shwi_p (dr_a2
->offset
))
2856 HOST_WIDE_INT diff
= (tree_to_shwi (dr_a2
->offset
)
2857 - tree_to_shwi (dr_a1
->offset
));
2860 /* Now we check if the following condition is satisfied:
2862 DIFF - SEGMENT_LENGTH_A < SEGMENT_LENGTH_B
2864 where DIFF = DR_A2->OFFSET - DR_A1->OFFSET. However,
2865 SEGMENT_LENGTH_A or SEGMENT_LENGTH_B may not be constant so we
2866 have to make a best estimation. We can get the minimum value
2867 of SEGMENT_LENGTH_B as a constant, represented by MIN_SEG_LEN_B,
2868 then either of the following two conditions can guarantee the
2871 1: DIFF <= MIN_SEG_LEN_B
2872 2: DIFF - SEGMENT_LENGTH_A < MIN_SEG_LEN_B
2877 min_seg_len_b
= (TREE_CODE (dr_b1
->seg_len
) == INTEGER_CST
) ?
2878 TREE_INT_CST_LOW (dr_b1
->seg_len
) :
2881 if (diff
<= min_seg_len_b
2882 || (TREE_CODE (dr_a1
->seg_len
) == INTEGER_CST
2883 && diff
- (HOST_WIDE_INT
) TREE_INT_CST_LOW (dr_a1
->seg_len
) <
2886 dr_a1
->seg_len
= size_binop (PLUS_EXPR
,
2887 dr_a2
->seg_len
, size_int (diff
));
2888 comp_alias_ddrs
.ordered_remove (i
--);
2893 if ((int) comp_alias_ddrs
.length () >
2894 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS
))
2896 if (dump_enabled_p ())
2898 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2899 "disable versioning for alias - max number of "
2900 "generated checks exceeded.\n");
2909 /* Check whether a non-affine read in stmt is suitable for gather load
2910 and if so, return a builtin decl for that operation. */
2913 vect_check_gather (gimple stmt
, loop_vec_info loop_vinfo
, tree
*basep
,
2914 tree
*offp
, int *scalep
)
2916 HOST_WIDE_INT scale
= 1, pbitpos
, pbitsize
;
2917 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
2918 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
2919 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
2920 tree offtype
= NULL_TREE
;
2921 tree decl
, base
, off
;
2922 enum machine_mode pmode
;
2923 int punsignedp
, pvolatilep
;
2926 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
2927 see if we can use the def stmt of the address. */
2928 if (is_gimple_call (stmt
)
2929 && gimple_call_internal_p (stmt
)
2930 && (gimple_call_internal_fn (stmt
) == IFN_MASK_LOAD
2931 || gimple_call_internal_fn (stmt
) == IFN_MASK_STORE
)
2932 && TREE_CODE (base
) == MEM_REF
2933 && TREE_CODE (TREE_OPERAND (base
, 0)) == SSA_NAME
2934 && integer_zerop (TREE_OPERAND (base
, 1))
2935 && !expr_invariant_in_loop_p (loop
, TREE_OPERAND (base
, 0)))
2937 gimple def_stmt
= SSA_NAME_DEF_STMT (TREE_OPERAND (base
, 0));
2938 if (is_gimple_assign (def_stmt
)
2939 && gimple_assign_rhs_code (def_stmt
) == ADDR_EXPR
)
2940 base
= TREE_OPERAND (gimple_assign_rhs1 (def_stmt
), 0);
  /* The gather builtins need address of the form
       loop_invariant + vector * {1, 2, 4, 8}
     or
       loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
     Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
     of loop invariants/SSA_NAMEs defined in the loop, with casts,
     multiplications and additions in it.  To get a vector, we need
     a single SSA_NAME that will be defined in the loop and will
     contain everything that is not loop invariant and that can be
     vectorized.  The following code attempts to find such a preexisting
     SSA_NAME OFF and put the loop invariants into a tree BASE
     that can be gimplified before the loop.  */
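
  /* Illustrative example (array names are made up): for a gather-style load

         for (i = 0; i < n; i++)
           ... = a[idx[i]];

     the reference is decomposed into BASE == &a (loop invariant), OFF ==
     the SSA_NAME holding idx[i] (defined inside the loop) and SCALE ==
     sizeof (a[0]), matching the loop_invariant + vector * scale form that
     the target's builtin_gather hook expects.  */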
2955 base
= get_inner_reference (base
, &pbitsize
, &pbitpos
, &off
,
2956 &pmode
, &punsignedp
, &pvolatilep
, false);
2957 gcc_assert (base
!= NULL_TREE
&& (pbitpos
% BITS_PER_UNIT
) == 0);
2959 if (TREE_CODE (base
) == MEM_REF
)
2961 if (!integer_zerop (TREE_OPERAND (base
, 1)))
2963 if (off
== NULL_TREE
)
2965 double_int moff
= mem_ref_offset (base
);
2966 off
= double_int_to_tree (sizetype
, moff
);
2969 off
= size_binop (PLUS_EXPR
, off
,
2970 fold_convert (sizetype
, TREE_OPERAND (base
, 1)));
2972 base
= TREE_OPERAND (base
, 0);
2975 base
= build_fold_addr_expr (base
);
2977 if (off
== NULL_TREE
)
2978 off
= size_zero_node
;
2980 /* If base is not loop invariant, either off is 0, then we start with just
2981 the constant offset in the loop invariant BASE and continue with base
2982 as OFF, otherwise give up.
2983 We could handle that case by gimplifying the addition of base + off
2984 into some SSA_NAME and use that as off, but for now punt. */
2985 if (!expr_invariant_in_loop_p (loop
, base
))
2987 if (!integer_zerop (off
))
2990 base
= size_int (pbitpos
/ BITS_PER_UNIT
);
2992 /* Otherwise put base + constant offset into the loop invariant BASE
2993 and continue with OFF. */
2996 base
= fold_convert (sizetype
, base
);
2997 base
= size_binop (PLUS_EXPR
, base
, size_int (pbitpos
/ BITS_PER_UNIT
));
3000 /* OFF at this point may be either a SSA_NAME or some tree expression
3001 from get_inner_reference. Try to peel off loop invariants from it
3002 into BASE as long as possible. */
3004 while (offtype
== NULL_TREE
)
3006 enum tree_code code
;
3007 tree op0
, op1
, add
= NULL_TREE
;
3009 if (TREE_CODE (off
) == SSA_NAME
)
3011 gimple def_stmt
= SSA_NAME_DEF_STMT (off
);
3013 if (expr_invariant_in_loop_p (loop
, off
))
3016 if (gimple_code (def_stmt
) != GIMPLE_ASSIGN
)
3019 op0
= gimple_assign_rhs1 (def_stmt
);
3020 code
= gimple_assign_rhs_code (def_stmt
);
3021 op1
= gimple_assign_rhs2 (def_stmt
);
3025 if (get_gimple_rhs_class (TREE_CODE (off
)) == GIMPLE_TERNARY_RHS
)
3027 code
= TREE_CODE (off
);
3028 extract_ops_from_tree (off
, &code
, &op0
, &op1
);
3032 case POINTER_PLUS_EXPR
:
3034 if (expr_invariant_in_loop_p (loop
, op0
))
3039 add
= fold_convert (sizetype
, add
);
3041 add
= size_binop (MULT_EXPR
, add
, size_int (scale
));
3042 base
= size_binop (PLUS_EXPR
, base
, add
);
3045 if (expr_invariant_in_loop_p (loop
, op1
))
3053 if (expr_invariant_in_loop_p (loop
, op1
))
3055 add
= fold_convert (sizetype
, op1
);
3056 add
= size_binop (MINUS_EXPR
, size_zero_node
, add
);
3062 if (scale
== 1 && tree_fits_shwi_p (op1
))
3064 scale
= tree_to_shwi (op1
);
3073 if (!POINTER_TYPE_P (TREE_TYPE (op0
))
3074 && !INTEGRAL_TYPE_P (TREE_TYPE (op0
)))
3076 if (TYPE_PRECISION (TREE_TYPE (op0
))
3077 == TYPE_PRECISION (TREE_TYPE (off
)))
3082 if (TYPE_PRECISION (TREE_TYPE (op0
))
3083 < TYPE_PRECISION (TREE_TYPE (off
)))
3086 offtype
= TREE_TYPE (off
);
3097 /* If at the end OFF still isn't a SSA_NAME or isn't
3098 defined in the loop, punt. */
3099 if (TREE_CODE (off
) != SSA_NAME
3100 || expr_invariant_in_loop_p (loop
, off
))
3103 if (offtype
== NULL_TREE
)
3104 offtype
= TREE_TYPE (off
);
3106 decl
= targetm
.vectorize
.builtin_gather (STMT_VINFO_VECTYPE (stmt_info
),
3108 if (decl
== NULL_TREE
)
/* Function vect_analyze_data_refs.

   Find all the data references in the loop or basic block.

   The general structure of the analysis of data refs in the vectorizer is as
   follows:
   1- vect_analyze_data_refs(loop/bb): call
      compute_data_dependences_for_loop/bb to find and analyze all data-refs
      in the loop/bb and their dependences.
   2- vect_analyze_dependences(): apply dependence testing using ddrs.
   3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
   4- vect_analyze_drs_access(): check that ref_stmt.step is ok.

*/
3136 vect_analyze_data_refs (loop_vec_info loop_vinfo
,
3137 bb_vec_info bb_vinfo
,
3140 struct loop
*loop
= NULL
;
3141 basic_block bb
= NULL
;
3143 vec
<data_reference_p
> datarefs
;
3144 struct data_reference
*dr
;
3147 if (dump_enabled_p ())
3148 dump_printf_loc (MSG_NOTE
, vect_location
,
3149 "=== vect_analyze_data_refs ===\n");
3153 basic_block
*bbs
= LOOP_VINFO_BBS (loop_vinfo
);
3155 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
3156 datarefs
= LOOP_VINFO_DATAREFS (loop_vinfo
);
3157 if (!find_loop_nest (loop
, &LOOP_VINFO_LOOP_NEST (loop_vinfo
)))
3159 if (dump_enabled_p ())
3160 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3161 "not vectorized: loop contains function calls"
3162 " or data references that cannot be analyzed\n");
3166 for (i
= 0; i
< loop
->num_nodes
; i
++)
3168 gimple_stmt_iterator gsi
;
3170 for (gsi
= gsi_start_bb (bbs
[i
]); !gsi_end_p (gsi
); gsi_next (&gsi
))
3172 gimple stmt
= gsi_stmt (gsi
);
3173 if (!find_data_references_in_stmt (loop
, stmt
, &datarefs
))
3175 if (is_gimple_call (stmt
) && loop
->safelen
)
3177 tree fndecl
= gimple_call_fndecl (stmt
), op
;
3178 if (fndecl
!= NULL_TREE
)
3180 struct cgraph_node
*node
= cgraph_get_node (fndecl
);
3181 if (node
!= NULL
&& node
->simd_clones
!= NULL
)
3183 unsigned int j
, n
= gimple_call_num_args (stmt
);
3184 for (j
= 0; j
< n
; j
++)
3186 op
= gimple_call_arg (stmt
, j
);
3188 || (REFERENCE_CLASS_P (op
)
3189 && get_base_address (op
)))
3192 op
= gimple_call_lhs (stmt
);
3193 /* Ignore #pragma omp declare simd functions
3194 if they don't have data references in the
3195 call stmt itself. */
3199 || (REFERENCE_CLASS_P (op
)
3200 && get_base_address (op
)))))
3205 LOOP_VINFO_DATAREFS (loop_vinfo
) = datarefs
;
3206 if (dump_enabled_p ())
3207 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3208 "not vectorized: loop contains function "
3209 "calls or data references that cannot "
3216 LOOP_VINFO_DATAREFS (loop_vinfo
) = datarefs
;
3220 gimple_stmt_iterator gsi
;
3222 bb
= BB_VINFO_BB (bb_vinfo
);
3223 for (gsi
= gsi_start_bb (bb
); !gsi_end_p (gsi
); gsi_next (&gsi
))
3225 gimple stmt
= gsi_stmt (gsi
);
3226 if (!find_data_references_in_stmt (NULL
, stmt
,
3227 &BB_VINFO_DATAREFS (bb_vinfo
)))
3229 /* Mark the rest of the basic-block as unvectorizable. */
3230 for (; !gsi_end_p (gsi
); gsi_next (&gsi
))
3232 stmt
= gsi_stmt (gsi
);
3233 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt
)) = false;
3239 datarefs
= BB_VINFO_DATAREFS (bb_vinfo
);
3242 /* Go through the data-refs, check that the analysis succeeded. Update
3243 pointer from stmt_vec_info struct to DR and vectype. */
3245 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
3248 stmt_vec_info stmt_info
;
3249 tree base
, offset
, init
;
3250 bool gather
= false;
3251 bool simd_lane_access
= false;
3255 if (!dr
|| !DR_REF (dr
))
3257 if (dump_enabled_p ())
3258 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3259 "not vectorized: unhandled data-ref\n");
3263 stmt
= DR_STMT (dr
);
3264 stmt_info
= vinfo_for_stmt (stmt
);
3266 /* Discard clobbers from the dataref vector. We will remove
3267 clobber stmts during vectorization. */
3268 if (gimple_clobber_p (stmt
))
3270 if (i
== datarefs
.length () - 1)
3275 datarefs
[i
] = datarefs
.pop ();
3279 /* Check that analysis of the data-ref succeeded. */
3280 if (!DR_BASE_ADDRESS (dr
) || !DR_OFFSET (dr
) || !DR_INIT (dr
)
3285 && !TREE_THIS_VOLATILE (DR_REF (dr
))
3286 && targetm
.vectorize
.builtin_gather
!= NULL
;
3287 bool maybe_simd_lane_access
3288 = loop_vinfo
&& loop
->simduid
;
3290 /* If target supports vector gather loads, or if this might be
3291 a SIMD lane access, see if they can't be used. */
3293 && (maybe_gather
|| maybe_simd_lane_access
)
3294 && !nested_in_vect_loop_p (loop
, stmt
))
3296 struct data_reference
*newdr
3297 = create_data_ref (NULL
, loop_containing_stmt (stmt
),
3298 DR_REF (dr
), stmt
, true);
3299 gcc_assert (newdr
!= NULL
&& DR_REF (newdr
));
3300 if (DR_BASE_ADDRESS (newdr
)
3301 && DR_OFFSET (newdr
)
3304 && integer_zerop (DR_STEP (newdr
)))
3306 if (maybe_simd_lane_access
)
3308 tree off
= DR_OFFSET (newdr
);
3310 if (TREE_CODE (DR_INIT (newdr
)) == INTEGER_CST
3311 && TREE_CODE (off
) == MULT_EXPR
3312 && tree_fits_uhwi_p (TREE_OPERAND (off
, 1)))
3314 tree step
= TREE_OPERAND (off
, 1);
3315 off
= TREE_OPERAND (off
, 0);
3317 if (CONVERT_EXPR_P (off
)
3318 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off
,
3320 < TYPE_PRECISION (TREE_TYPE (off
)))
3321 off
= TREE_OPERAND (off
, 0);
3322 if (TREE_CODE (off
) == SSA_NAME
)
3324 gimple def
= SSA_NAME_DEF_STMT (off
);
3325 tree reft
= TREE_TYPE (DR_REF (newdr
));
3326 if (is_gimple_call (def
)
3327 && gimple_call_internal_p (def
)
3328 && (gimple_call_internal_fn (def
)
3329 == IFN_GOMP_SIMD_LANE
))
3331 tree arg
= gimple_call_arg (def
, 0);
3332 gcc_assert (TREE_CODE (arg
) == SSA_NAME
);
3333 arg
= SSA_NAME_VAR (arg
);
3334 if (arg
== loop
->simduid
3336 && tree_int_cst_equal
3337 (TYPE_SIZE_UNIT (reft
),
3340 DR_OFFSET (newdr
) = ssize_int (0);
3341 DR_STEP (newdr
) = step
;
3342 DR_ALIGNED_TO (newdr
)
3343 = size_int (BIGGEST_ALIGNMENT
);
3345 simd_lane_access
= true;
3351 if (!simd_lane_access
&& maybe_gather
)
3357 if (!gather
&& !simd_lane_access
)
3358 free_data_ref (newdr
);
3361 if (!gather
&& !simd_lane_access
)
3363 if (dump_enabled_p ())
3365 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3366 "not vectorized: data ref analysis "
3368 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3369 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
3379 if (TREE_CODE (DR_BASE_ADDRESS (dr
)) == INTEGER_CST
)
3381 if (dump_enabled_p ())
3382 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3383 "not vectorized: base addr of dr is a "
3389 if (gather
|| simd_lane_access
)
3394 if (TREE_THIS_VOLATILE (DR_REF (dr
)))
3396 if (dump_enabled_p ())
3398 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3399 "not vectorized: volatile type ");
3400 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3401 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
3410 if (stmt_can_throw_internal (stmt
))
3412 if (dump_enabled_p ())
3414 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3415 "not vectorized: statement can throw an "
3417 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3418 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
3424 if (gather
|| simd_lane_access
)
3429 if (TREE_CODE (DR_REF (dr
)) == COMPONENT_REF
3430 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr
), 1)))
3432 if (dump_enabled_p ())
3434 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3435 "not vectorized: statement is bitfield "
3437 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3438 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
3444 if (gather
|| simd_lane_access
)
3449 base
= unshare_expr (DR_BASE_ADDRESS (dr
));
3450 offset
= unshare_expr (DR_OFFSET (dr
));
3451 init
= unshare_expr (DR_INIT (dr
));
3453 if (is_gimple_call (stmt
)
3454 && (!gimple_call_internal_p (stmt
)
3455 || (gimple_call_internal_fn (stmt
) != IFN_MASK_LOAD
3456 && gimple_call_internal_fn (stmt
) != IFN_MASK_STORE
)))
3458 if (dump_enabled_p ())
3460 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3461 "not vectorized: dr in a call ");
3462 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3463 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
3469 if (gather
|| simd_lane_access
)
3474 /* Update DR field in stmt_vec_info struct. */
3476 /* If the dataref is in an inner-loop of the loop that is considered for
3477 for vectorization, we also want to analyze the access relative to
3478 the outer-loop (DR contains information only relative to the
3479 inner-most enclosing loop). We do that by building a reference to the
3480 first location accessed by the inner-loop, and analyze it relative to
3482 if (loop
&& nested_in_vect_loop_p (loop
, stmt
))
3484 tree outer_step
, outer_base
, outer_init
;
3485 HOST_WIDE_INT pbitsize
, pbitpos
;
3487 enum machine_mode pmode
;
3488 int punsignedp
, pvolatilep
;
3489 affine_iv base_iv
, offset_iv
;
3492 /* Build a reference to the first location accessed by the
3493 inner-loop: *(BASE+INIT). (The first location is actually
3494 BASE+INIT+OFFSET, but we add OFFSET separately later). */
3495 tree inner_base
= build_fold_indirect_ref
3496 (fold_build_pointer_plus (base
, init
));
3498 if (dump_enabled_p ())
3500 dump_printf_loc (MSG_NOTE
, vect_location
,
3501 "analyze in outer-loop: ");
3502 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, inner_base
);
3503 dump_printf (MSG_NOTE
, "\n");
3506 outer_base
= get_inner_reference (inner_base
, &pbitsize
, &pbitpos
,
3507 &poffset
, &pmode
, &punsignedp
, &pvolatilep
, false);
3508 gcc_assert (outer_base
!= NULL_TREE
);
3510 if (pbitpos
% BITS_PER_UNIT
!= 0)
3512 if (dump_enabled_p ())
3513 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3514 "failed: bit offset alignment.\n");
3518 outer_base
= build_fold_addr_expr (outer_base
);
3519 if (!simple_iv (loop
, loop_containing_stmt (stmt
), outer_base
,
3522 if (dump_enabled_p ())
3523 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3524 "failed: evolution of base is not affine.\n");
3531 poffset
= fold_build2 (PLUS_EXPR
, TREE_TYPE (offset
), offset
,
3539 offset_iv
.base
= ssize_int (0);
3540 offset_iv
.step
= ssize_int (0);
3542 else if (!simple_iv (loop
, loop_containing_stmt (stmt
), poffset
,
3545 if (dump_enabled_p ())
3546 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3547 "evolution of offset is not affine.\n");
3551 outer_init
= ssize_int (pbitpos
/ BITS_PER_UNIT
);
3552 split_constant_offset (base_iv
.base
, &base_iv
.base
, &dinit
);
3553 outer_init
= size_binop (PLUS_EXPR
, outer_init
, dinit
);
3554 split_constant_offset (offset_iv
.base
, &offset_iv
.base
, &dinit
);
3555 outer_init
= size_binop (PLUS_EXPR
, outer_init
, dinit
);
3557 outer_step
= size_binop (PLUS_EXPR
,
3558 fold_convert (ssizetype
, base_iv
.step
),
3559 fold_convert (ssizetype
, offset_iv
.step
));
3561 STMT_VINFO_DR_STEP (stmt_info
) = outer_step
;
3562 /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
3563 STMT_VINFO_DR_BASE_ADDRESS (stmt_info
) = base_iv
.base
;
3564 STMT_VINFO_DR_INIT (stmt_info
) = outer_init
;
3565 STMT_VINFO_DR_OFFSET (stmt_info
) =
3566 fold_convert (ssizetype
, offset_iv
.base
);
3567 STMT_VINFO_DR_ALIGNED_TO (stmt_info
) =
3568 size_int (highest_pow2_factor (offset_iv
.base
));
3570 if (dump_enabled_p ())
3572 dump_printf_loc (MSG_NOTE
, vect_location
,
3573 "\touter base_address: ");
3574 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3575 STMT_VINFO_DR_BASE_ADDRESS (stmt_info
));
3576 dump_printf (MSG_NOTE
, "\n\touter offset from base address: ");
3577 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3578 STMT_VINFO_DR_OFFSET (stmt_info
));
3579 dump_printf (MSG_NOTE
,
3580 "\n\touter constant offset from base address: ");
3581 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3582 STMT_VINFO_DR_INIT (stmt_info
));
3583 dump_printf (MSG_NOTE
, "\n\touter step: ");
3584 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3585 STMT_VINFO_DR_STEP (stmt_info
));
3586 dump_printf (MSG_NOTE
, "\n\touter aligned to: ");
3587 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3588 STMT_VINFO_DR_ALIGNED_TO (stmt_info
));
3589 dump_printf (MSG_NOTE
, "\n");
3593 if (STMT_VINFO_DATA_REF (stmt_info
))
3595 if (dump_enabled_p ())
3597 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3598 "not vectorized: more than one data ref "
3600 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3601 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
3607 if (gather
|| simd_lane_access
)
3612 STMT_VINFO_DATA_REF (stmt_info
) = dr
;
3613 if (simd_lane_access
)
3615 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info
) = true;
3619 /* Set vectype for STMT. */
3620 scalar_type
= TREE_TYPE (DR_REF (dr
));
3621 STMT_VINFO_VECTYPE (stmt_info
) =
3622 get_vectype_for_scalar_type (scalar_type
);
3623 if (!STMT_VINFO_VECTYPE (stmt_info
))
3625 if (dump_enabled_p ())
3627 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3628 "not vectorized: no vectype for stmt: ");
3629 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3630 dump_printf (MSG_MISSED_OPTIMIZATION
, " scalar_type: ");
3631 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_DETAILS
,
3633 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
3639 if (gather
|| simd_lane_access
)
3641 STMT_VINFO_DATA_REF (stmt_info
) = NULL
;
3648 if (dump_enabled_p ())
3650 dump_printf_loc (MSG_NOTE
, vect_location
,
3651 "got vectype for stmt: ");
3652 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, stmt
, 0);
3653 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3654 STMT_VINFO_VECTYPE (stmt_info
));
3655 dump_printf (MSG_NOTE
, "\n");
3659 /* Adjust the minimal vectorization factor according to the
3661 vf
= TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info
));
3669 gather
= 0 != vect_check_gather (stmt
, loop_vinfo
, NULL
, &off
, NULL
);
3671 && get_vectype_for_scalar_type (TREE_TYPE (off
)) == NULL_TREE
)
3675 STMT_VINFO_DATA_REF (stmt_info
) = NULL
;
3677 if (dump_enabled_p ())
3679 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3680 "not vectorized: not suitable for gather "
3682 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3683 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
3689 STMT_VINFO_GATHER_P (stmt_info
) = true;
3692 && TREE_CODE (DR_STEP (dr
)) != INTEGER_CST
)
3694 if (nested_in_vect_loop_p (loop
, stmt
)
3695 || !DR_IS_READ (dr
))
3697 if (dump_enabled_p ())
3699 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3700 "not vectorized: not suitable for strided "
3702 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3703 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
3707 STMT_VINFO_STRIDE_LOAD_P (stmt_info
) = true;
  /* If we stopped analysis at the first dataref we could not analyze
     when trying to vectorize a basic-block, mark the rest of the datarefs
     as not vectorizable and truncate the vector of datarefs.  That
     avoids spending useless time in analyzing their dependence.  */
  if (i != datarefs.length ())
    {
      gcc_assert (bb_vinfo != NULL);
      for (unsigned j = i; j < datarefs.length (); ++j)
        {
          data_reference_p dr = datarefs[j];
          STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
        }
      datarefs.truncate (i);
    }

  return true;
}
/* Function vect_get_new_vect_var.

   Returns a name for a new variable.  The current naming scheme appends the
   prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
   the name of vectorizer generated variables, and appends that to NAME if
   provided.  */

tree
vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
  case vect_simple_var:
  case vect_scalar_var:
  case vect_pointer_var:
  if (name)
    {
      char* tmp = concat (prefix, "_", name, NULL);
      new_vect_var = create_tmp_reg (type, tmp);
    }
  else
    new_vect_var = create_tmp_reg (type, prefix);

  return new_vect_var;
}
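
/* Illustrative example: with VAR_KIND == vect_pointer_var and NAME == "in",
   the pointer prefix documented in the comment above is joined to "_in" by
   the concat call, and create_tmp_reg then appends its usual uniquifying
   suffix, so the returned temporary's identifier is the prefix, an
   underscore, the base name and a version number.  */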
3772 /* Function vect_create_addr_base_for_vector_ref.
3774 Create an expression that computes the address of the first memory location
3775 that will be accessed for a data reference.
3778 STMT: The statement containing the data reference.
3779 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
3780 OFFSET: Optional. If supplied, it is be added to the initial address.
3781 LOOP: Specify relative to which loop-nest should the address be computed.
3782 For example, when the dataref is in an inner-loop nested in an
3783 outer-loop that is now being vectorized, LOOP can be either the
3784 outer-loop, or the inner-loop. The first memory location accessed
3785 by the following dataref ('in' points to short):
3792 if LOOP=i_loop: &in (relative to i_loop)
3793 if LOOP=j_loop: &in+i*2B (relative to j_loop)
3796 1. Return an SSA_NAME whose value is the address of the memory location of
3797 the first vector of the data reference.
3798 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
3799 these statement(s) which define the returned SSA_NAME.
3801 FORNOW: We are only handling array accesses with step 1. */
3804 vect_create_addr_base_for_vector_ref (gimple stmt
,
3805 gimple_seq
*new_stmt_list
,
3809 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
3810 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
3812 const char *base_name
;
3815 gimple_seq seq
= NULL
;
3819 tree step
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr
)));
3820 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
3822 if (loop_vinfo
&& loop
&& loop
!= (gimple_bb (stmt
))->loop_father
)
3824 struct loop
*outer_loop
= LOOP_VINFO_LOOP (loop_vinfo
);
3826 gcc_assert (nested_in_vect_loop_p (outer_loop
, stmt
));
3828 data_ref_base
= unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info
));
3829 base_offset
= unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info
));
3830 init
= unshare_expr (STMT_VINFO_DR_INIT (stmt_info
));
3834 data_ref_base
= unshare_expr (DR_BASE_ADDRESS (dr
));
3835 base_offset
= unshare_expr (DR_OFFSET (dr
));
3836 init
= unshare_expr (DR_INIT (dr
));
3840 base_name
= get_name (data_ref_base
);
3843 base_offset
= ssize_int (0);
3844 init
= ssize_int (0);
3845 base_name
= get_name (DR_REF (dr
));
3848 /* Create base_offset */
3849 base_offset
= size_binop (PLUS_EXPR
,
3850 fold_convert (sizetype
, base_offset
),
3851 fold_convert (sizetype
, init
));
3855 offset
= fold_build2 (MULT_EXPR
, sizetype
,
3856 fold_convert (sizetype
, offset
), step
);
3857 base_offset
= fold_build2 (PLUS_EXPR
, sizetype
,
3858 base_offset
, offset
);
3861 /* base + base_offset */
3863 addr_base
= fold_build_pointer_plus (data_ref_base
, base_offset
);
3866 addr_base
= build1 (ADDR_EXPR
,
3867 build_pointer_type (TREE_TYPE (DR_REF (dr
))),
3868 unshare_expr (DR_REF (dr
)));
3871 vect_ptr_type
= build_pointer_type (STMT_VINFO_VECTYPE (stmt_info
));
3872 addr_base
= fold_convert (vect_ptr_type
, addr_base
);
3873 dest
= vect_get_new_vect_var (vect_ptr_type
, vect_pointer_var
, base_name
);
3874 addr_base
= force_gimple_operand (addr_base
, &seq
, false, dest
);
3875 gimple_seq_add_seq (new_stmt_list
, seq
);
3877 if (DR_PTR_INFO (dr
)
3878 && TREE_CODE (addr_base
) == SSA_NAME
)
3880 duplicate_ssa_name_ptr_info (addr_base
, DR_PTR_INFO (dr
));
3882 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base
));
3885 if (dump_enabled_p ())
3887 dump_printf_loc (MSG_NOTE
, vect_location
, "created ");
3888 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, addr_base
);
3889 dump_printf (MSG_NOTE
, "\n");
3896 /* Function vect_create_data_ref_ptr.
3898 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
3899 location accessed in the loop by STMT, along with the def-use update
3900 chain to appropriately advance the pointer through the loop iterations.
3901 Also set aliasing information for the pointer. This pointer is used by
3902 the callers to this function to create a memory reference expression for
3903 vector load/store access.
3906 1. STMT: a stmt that references memory. Expected to be of the form
3907 GIMPLE_ASSIGN <name, data-ref> or
3908 GIMPLE_ASSIGN <data-ref, name>.
3909 2. AGGR_TYPE: the type of the reference, which should be either a vector
3911 3. AT_LOOP: the loop where the vector memref is to be created.
3912 4. OFFSET (optional): an offset to be added to the initial address accessed
3913 by the data-ref in STMT.
3914 5. BSI: location where the new stmts are to be placed if there is no loop
3915 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
3916 pointing to the initial address.
3919 1. Declare a new ptr to vector_type, and have it point to the base of the
3920 data reference (initial addressed accessed by the data reference).
3921 For example, for vector of type V8HI, the following code is generated:
3924 ap = (v8hi *)initial_address;
3926 if OFFSET is not supplied:
3927 initial_address = &a[init];
3928 if OFFSET is supplied:
3929 initial_address = &a[init + OFFSET];
3931 Return the initial_address in INITIAL_ADDRESS.
3933 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
3934 update the pointer in each iteration of the loop.
3936 Return the increment stmt that updates the pointer in PTR_INCR.
3938 3. Set INV_P to true if the access pattern of the data reference in the
3939 vectorized loop is invariant. Set it to false otherwise.
3941 4. Return the pointer. */
3944 vect_create_data_ref_ptr (gimple stmt
, tree aggr_type
, struct loop
*at_loop
,
3945 tree offset
, tree
*initial_address
,
3946 gimple_stmt_iterator
*gsi
, gimple
*ptr_incr
,
3947 bool only_init
, bool *inv_p
)
3949 const char *base_name
;
3950 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
3951 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
3952 struct loop
*loop
= NULL
;
3953 bool nested_in_vect_loop
= false;
3954 struct loop
*containing_loop
= NULL
;
3959 gimple_seq new_stmt_list
= NULL
;
3963 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
3965 gimple_stmt_iterator incr_gsi
;
3967 tree indx_before_incr
, indx_after_incr
;
3970 bb_vec_info bb_vinfo
= STMT_VINFO_BB_VINFO (stmt_info
);
3972 gcc_assert (TREE_CODE (aggr_type
) == ARRAY_TYPE
3973 || TREE_CODE (aggr_type
) == VECTOR_TYPE
);
3977 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
3978 nested_in_vect_loop
= nested_in_vect_loop_p (loop
, stmt
);
3979 containing_loop
= (gimple_bb (stmt
))->loop_father
;
3980 pe
= loop_preheader_edge (loop
);
3984 gcc_assert (bb_vinfo
);
3989 /* Check the step (evolution) of the load in LOOP, and record
3990 whether it's invariant. */
3991 if (nested_in_vect_loop
)
3992 step
= STMT_VINFO_DR_STEP (stmt_info
);
3994 step
= DR_STEP (STMT_VINFO_DATA_REF (stmt_info
));
3996 if (integer_zerop (step
))
4001 /* Create an expression for the first address accessed by this load
4003 base_name
= get_name (DR_BASE_ADDRESS (dr
));
4005 if (dump_enabled_p ())
4007 tree dr_base_type
= TREE_TYPE (DR_BASE_OBJECT (dr
));
4008 dump_printf_loc (MSG_NOTE
, vect_location
,
4009 "create %s-pointer variable to type: ",
4010 get_tree_code_name (TREE_CODE (aggr_type
)));
4011 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, aggr_type
);
4012 if (TREE_CODE (dr_base_type
) == ARRAY_TYPE
)
4013 dump_printf (MSG_NOTE
, " vectorizing an array ref: ");
4014 else if (TREE_CODE (dr_base_type
) == VECTOR_TYPE
)
4015 dump_printf (MSG_NOTE
, " vectorizing a vector ref: ");
4016 else if (TREE_CODE (dr_base_type
) == RECORD_TYPE
)
4017 dump_printf (MSG_NOTE
, " vectorizing a record based array ref: ");
4019 dump_printf (MSG_NOTE
, " vectorizing a pointer ref: ");
4020 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_BASE_OBJECT (dr
));
4021 dump_printf (MSG_NOTE
, "\n");
  /* (1) Create the new aggregate-pointer variable.
     Vector and array types inherit the alias set of their component
     type by default so we need to use a ref-all pointer if the data
     reference does not conflict with the created aggregated data
     reference because it is not addressable.  */
  bool need_ref_all = false;
  if (!alias_sets_conflict_p (get_alias_set (aggr_type),
                              get_alias_set (DR_REF (dr))))
    need_ref_all = true;
  /* Likewise for any of the data references in the stmt group.  */
  else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
      gimple orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
          stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
          struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
          if (!alias_sets_conflict_p (get_alias_set (aggr_type),
                                      get_alias_set (DR_REF (sdr))))
              need_ref_all = true;
          orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);

  aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
                                               need_ref_all);
  aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var,
                                    base_name);
  /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
     vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
     def-use update cycles for the pointer: one relative to the outer-loop
     (LOOP), which is what steps (3) and (4) below do.  The other is relative
     to the inner-loop (which is the inner-most loop containing the dataref),
     and this is done by step (5) below.

     When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
     inner-most loop, and so steps (3),(4) work the same, and step (5) is
     redundant.  Steps (3),(4) create the following:

        LOOP:   vp1 = phi(vp0,vp2)

     If there is an inner-loop nested in loop, then step (5) will also be
     applied, and an additional update in the inner-loop will be created:

        LOOP:   vp1 = phi(vp0,vp2)

        inner:     vp3 = phi(vp1,vp4)
                   vp4 = vp3 + inner_step  */
  /* (2) Calculate the initial address of the aggregate-pointer, and set
     the aggregate-pointer to point to it before the loop.  */

  /* Create: &(base[init_val+offset]) in the loop preheader.  */

  new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
                                                   offset, loop);

  new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
  gcc_assert (!new_bb);

  gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);

  *initial_address = new_temp;

  /* Create: p = (aggr_type *) initial_base  */
  if (TREE_CODE (new_temp) != SSA_NAME
      || !useless_type_conversion_p (aggr_ptr_type, TREE_TYPE (new_temp)))
      vec_stmt = gimple_build_assign (aggr_ptr,
                                      fold_convert (aggr_ptr_type, new_temp));
      aggr_ptr_init = make_ssa_name (aggr_ptr, vec_stmt);
      /* Copy the points-to information if it exists.  */
      if (DR_PTR_INFO (dr))
        duplicate_ssa_name_ptr_info (aggr_ptr_init, DR_PTR_INFO (dr));
      gimple_assign_set_lhs (vec_stmt, aggr_ptr_init);

      new_bb = gsi_insert_on_edge_immediate (pe, vec_stmt);
      gcc_assert (!new_bb);

      gsi_insert_before (gsi, vec_stmt, GSI_SAME_STMT);
  else
    aggr_ptr_init = new_temp;
  /* (3) Handle the updating of the aggregate-pointer inside the loop.
     This is needed when ONLY_INIT is false, and also when AT_LOOP is the
     inner-loop nested in LOOP (during outer-loop vectorization).  */

  /* No update in loop is required.  */
  if (only_init && (!loop_vinfo || at_loop == loop))
    aptr = aggr_ptr_init;

      /* The step of the aggregate pointer is the type size.  */
      tree iv_step = TYPE_SIZE_UNIT (aggr_type);
      /* One exception to the above is when the scalar step of the load in
         LOOP is zero.  In this case the step here is also zero.  */
      if (*inv_p)
        iv_step = size_zero_node;
      else if (tree_int_cst_sgn (step) == -1)
        iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);

      standard_iv_increment_position (loop, &incr_gsi, &insert_after);

      create_iv (aggr_ptr_init,
                 fold_convert (aggr_ptr_type, iv_step),
                 aggr_ptr, loop, &incr_gsi, insert_after,
                 &indx_before_incr, &indx_after_incr);
      incr = gsi_stmt (incr_gsi);
      set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));

      /* Copy the points-to information if it exists.  */
      if (DR_PTR_INFO (dr))
          duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
          duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));

      aptr = indx_before_incr;

  if (!nested_in_vect_loop || only_init)
  /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
     nested in LOOP, if it exists.  */

  gcc_assert (nested_in_vect_loop);

  standard_iv_increment_position (containing_loop, &incr_gsi,
                                  &insert_after);
  create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
             containing_loop, &incr_gsi, insert_after, &indx_before_incr,
             &indx_after_incr);
  incr = gsi_stmt (incr_gsi);
  set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));

  /* Copy the points-to information if it exists.  */
  if (DR_PTR_INFO (dr))
      duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
      duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));

  return indx_before_incr;
/* Function bump_vector_ptr

   Increment a pointer (to a vector type) by vector-size.  If requested,
   i.e. if PTR_INCR is given, then also connect the new increment stmt
   to the existing def-use update-chain of the pointer, by modifying
   the PTR_INCR as illustrated below:

   The pointer def-use update-chain before this function:
                        DATAREF_PTR = phi (p_0, p_2)
   PTR_INCR:            p_2 = DATAREF_PTR + step

   The pointer def-use update-chain after this function:
                        DATAREF_PTR = phi (p_0, p_2)
                        NEW_DATAREF_PTR = DATAREF_PTR + BUMP
   PTR_INCR:            p_2 = NEW_DATAREF_PTR + step

   DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
                 in the loop.
   PTR_INCR - optional.  The stmt that updates the pointer in each iteration of
              the loop.  The increment amount across iterations is expected
              to be vector_size.
   BSI - location where the new update stmt is to be placed.
   STMT - the original scalar memory-access stmt that is being vectorized.
   BUMP - optional.  The offset by which to bump the pointer.  If not given,
          the offset is assumed to be vector_size.

   Output: Return NEW_DATAREF_PTR as illustrated above.  */
bump_vector_ptr (tree dataref_ptr, gimple ptr_incr, gimple_stmt_iterator *gsi,
                 gimple stmt, tree bump)
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  tree update = TYPE_SIZE_UNIT (vectype);
  use_operand_p use_p;
  tree new_dataref_ptr;

  new_dataref_ptr = copy_ssa_name (dataref_ptr, NULL);
  incr_stmt = gimple_build_assign_with_ops (POINTER_PLUS_EXPR, new_dataref_ptr,
                                            dataref_ptr, update);
  vect_finish_stmt_generation (stmt, incr_stmt, gsi);

  /* Copy the points-to information if it exists.  */
  if (DR_PTR_INFO (dr))
      duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
      mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));

  if (!ptr_incr)
    return new_dataref_ptr;

  /* Update the vector-pointer's cross-iteration increment.  */
  FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
      tree use = USE_FROM_PTR (use_p);

      if (use == dataref_ptr)
        SET_USE (use_p, new_dataref_ptr);
      else
        gcc_assert (tree_int_cst_compare (use, update) == 0);

  return new_dataref_ptr;
/* Function vect_create_destination_var.

   Create a new temporary of type VECTYPE.  */

vect_create_destination_var (tree scalar_dest, tree vectype)
  enum vect_var_kind kind;

  kind = vectype ? vect_simple_var : vect_scalar_var;
  type = vectype ? vectype : TREE_TYPE (scalar_dest);

  gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);

  name = get_name (scalar_dest);
  if (name)
    asprintf (&new_name, "%s_%u", name, SSA_NAME_VERSION (scalar_dest));
  else
    asprintf (&new_name, "_%u", SSA_NAME_VERSION (scalar_dest));
  vec_dest = vect_get_new_vect_var (type, kind, new_name);
/* Function vect_grouped_store_supported.

   Returns TRUE if interleave high and interleave low permutations
   are supported, and FALSE otherwise.  */

vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
  enum machine_mode mode = TYPE_MODE (vectype);

  /* vect_permute_store_chain requires the group size to be a power of two.  */
  if (exact_log2 (count) == -1)
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "the size of the group of accesses"
                         " is not a power of 2\n");

  /* Check that the permutation is supported.  */
  if (VECTOR_MODE_P (mode))
      unsigned int i, nelt = GET_MODE_NUNITS (mode);
      unsigned char *sel = XALLOCAVEC (unsigned char, nelt);

      for (i = 0; i < nelt / 2; i++)
          sel[i * 2 + 1] = i + nelt;
      if (can_vec_perm_p (mode, false, sel))
          for (i = 0; i < nelt; i++)
          if (can_vec_perm_p (mode, false, sel))

  if (dump_enabled_p ())
    dump_printf (MSG_MISSED_OPTIMIZATION,
                 "interleave op not supported by target.\n");
/* Return TRUE if vec_store_lanes is available for COUNT vectors of
   type VECTYPE.  */

vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
  return vect_lanes_optab_supported_p ("vec_store_lanes",
                                       vec_store_lanes_optab,
                                       vectype, count);
/* Function vect_permute_store_chain.

   Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
   a power of 2, generate interleave_high/low stmts to reorder the data
   correctly for the stores.  Return the final references for stores in
   RESULT_CHAIN.

   E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
   The input is 4 vectors each containing 8 elements.  We assign a number to
   each element, the input sequence is:

   1st vec:   0  1  2  3  4  5  6  7
   2nd vec:   8  9 10 11 12 13 14 15
   3rd vec:  16 17 18 19 20 21 22 23
   4th vec:  24 25 26 27 28 29 30 31

   The output sequence should be:

   1st vec:   0  8 16 24  1  9 17 25
   2nd vec:   2 10 18 26  3 11 19 27
   3rd vec:   4 12 20 28  5 13 21 29
   4th vec:   6 14 22 30  7 15 23 31

   i.e., we interleave the contents of the four vectors in their order.

   We use interleave_high/low instructions to create such output.  The input of
   each interleave_high/low operation is two vectors:
       1st vec    2nd vec
       0 1 2 3    4 5 6 7

   the even elements of the result vector are obtained left-to-right from the
   high/low elements of the first vector.  The odd elements of the result are
   obtained left-to-right from the high/low elements of the second vector.
   The output of interleave_high will be:   0 4 1 5
   and of interleave_low:                   2 6 3 7

   The permutation is done in log LENGTH stages.  In each stage interleave_high
   and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
   where the first argument is taken from the first half of DR_CHAIN and the
   second argument from its second half.

   I1: interleave_high (1st vec, 3rd vec)
   I2: interleave_low  (1st vec, 3rd vec)
   I3: interleave_high (2nd vec, 4th vec)
   I4: interleave_low  (2nd vec, 4th vec)

   The output for the first stage is:

   I1:  0 16  1 17  2 18  3 19
   I2:  4 20  5 21  6 22  7 23
   I3:  8 24  9 25 10 26 11 27
   I4: 12 28 13 29 14 30 15 31

   The output of the second stage, i.e. the final result is:

   I1:  0  8 16 24  1  9 17 25
   I2:  2 10 18 26  3 11 19 27
   I3:  4 12 20 28  5 13 21 29
   I4:  6 14 22 30  7 15 23 31.  */
vect_permute_store_chain (vec<tree> dr_chain,
                          unsigned int length,
                          gimple stmt,
                          gimple_stmt_iterator *gsi,
                          vec<tree> *result_chain)
  tree vect1, vect2, high, low;
  tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
  tree perm_mask_low, perm_mask_high;
  unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
  unsigned char *sel = XALLOCAVEC (unsigned char, nelt);

  result_chain->quick_grow (length);
  memcpy (result_chain->address (), dr_chain.address (),
          length * sizeof (tree));

  for (i = 0, n = nelt / 2; i < n; i++)
      sel[i * 2 + 1] = i + nelt;
  perm_mask_high = vect_gen_perm_mask (vectype, sel);
  gcc_assert (perm_mask_high != NULL);

  for (i = 0; i < nelt; i++)
  perm_mask_low = vect_gen_perm_mask (vectype, sel);
  gcc_assert (perm_mask_low != NULL);

  for (i = 0, n = exact_log2 (length); i < n; i++)
      for (j = 0; j < length/2; j++)
          vect1 = dr_chain[j];
          vect2 = dr_chain[j+length/2];

          /* Create interleaving stmt:
             high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1, ...}>  */
          high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
          perm_stmt
            = gimple_build_assign_with_ops (VEC_PERM_EXPR, high,
                                            vect1, vect2, perm_mask_high);
          vect_finish_stmt_generation (stmt, perm_stmt, gsi);
          (*result_chain)[2*j] = high;

          /* Create interleaving stmt:
             low = VEC_PERM_EXPR <vect1, vect2, {nelt/2, nelt*3/2, nelt/2+1,
                                                 nelt*3/2+1, ...}>  */
          low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
          perm_stmt
            = gimple_build_assign_with_ops (VEC_PERM_EXPR, low,
                                            vect1, vect2, perm_mask_low);
          vect_finish_stmt_generation (stmt, perm_stmt, gsi);
          (*result_chain)[2*j+1] = low;

      memcpy (dr_chain.address (), result_chain->address (),
              length * sizeof (tree));
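/* Illustrative sketch (not part of GCC itself): a self-contained emulation of
   the interleave-high/low scheme documented above vect_permute_store_chain,
   for LENGTH = 4 and NELT = 8 as in the example.  All names here
   (sketch_vec_perm, permute_store_chain_sketch, ...) are invented;
   sketch_vec_perm models the two-input VEC_PERM_EXPR selection.  Kept inside
   #if 0 so it is illustration only.  */
#if 0
#include <stdio.h>
#include <string.h>

#define NELT 8          /* elements per vector */
#define LENGTH 4        /* vectors in the chain; must be a power of 2 */

/* out[i] = sel[i] < NELT ? v0[sel[i]] : v1[sel[i] - NELT], i.e. a two-input
   permutation in the style of VEC_PERM_EXPR.  */
static void
sketch_vec_perm (const int *v0, const int *v1, const unsigned char *sel,
                 int *out)
{
  unsigned int i;
  for (i = 0; i < NELT; i++)
    out[i] = sel[i] < NELT ? v0[sel[i]] : v1[sel[i] - NELT];
}

static void
permute_store_chain_sketch (int chain[LENGTH][NELT])
{
  unsigned char sel_high[NELT], sel_low[NELT];
  int result[LENGTH][NELT];
  unsigned int i, j, stage;

  /* Interleave-high selector {0, nelt, 1, nelt+1, ...}; the low selector is
     the same one shifted by nelt/2.  */
  for (i = 0; i < NELT / 2; i++)
    {
      sel_high[i * 2] = i;
      sel_high[i * 2 + 1] = i + NELT;
    }
  for (i = 0; i < NELT; i++)
    sel_low[i] = sel_high[i] + NELT / 2;

  /* log2 (LENGTH) stages, pairing vector j with vector j + LENGTH/2.  */
  for (stage = 0; (1u << stage) < LENGTH; stage++)
    {
      for (j = 0; j < LENGTH / 2; j++)
        {
          sketch_vec_perm (chain[j], chain[j + LENGTH / 2], sel_high,
                           result[2 * j]);
          sketch_vec_perm (chain[j], chain[j + LENGTH / 2], sel_low,
                           result[2 * j + 1]);
        }
      memcpy (chain, result, sizeof (result));
    }
}

int
main (void)
{
  int chain[LENGTH][NELT];
  int i, j;

  for (j = 0; j < LENGTH; j++)
    for (i = 0; i < NELT; i++)
      chain[j][i] = j * NELT + i;       /* the element numbering used above */

  permute_store_chain_sketch (chain);

  /* Prints 0 8 16 24 1 9 17 25, then 2 10 18 26 3 11 19 27, etc.  */
  for (j = 0; j < LENGTH; j++)
    {
      for (i = 0; i < NELT; i++)
        printf (" %2d", chain[j][i]);
      printf ("\n");
    }
  return 0;
}
#endif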
/* Function vect_setup_realignment

   This function is called when vectorizing an unaligned load using
   the dr_explicit_realign[_optimized] scheme.
   This function generates the following code at the loop prolog:

      x msq_init = *(floor(p));            # prolog load
        realignment_token = call target_builtin;
      x msq = phi (msq_init, ---)

   The stmts marked with x are generated only for the case of
   dr_explicit_realign_optimized.

   The code above sets up a new (vector) pointer, pointing to the first
   location accessed by STMT, and a "floor-aligned" load using that pointer.
   It also generates code to compute the "realignment-token" (if the relevant
   target hook was defined), and creates a phi-node at the loop-header bb
   whose arguments are the result of the prolog-load (created by this
   function) and the result of a load that takes place in the loop (to be
   created by the caller to this function).

   For the case of dr_explicit_realign_optimized:
   The caller to this function uses the phi-result (msq) to create the
   realignment code inside the loop, and sets up the missing phi argument,
   as follows:

      msq = phi (msq_init, lsq)
      lsq = *(floor(p'));                  # load in loop
      result = realign_load (msq, lsq, realignment_token);

   For the case of dr_explicit_realign:

      msq = *(floor(p));                   # load in loop
      lsq = *(floor(p'));                  # load in loop
      result = realign_load (msq, lsq, realignment_token);

   STMT - (scalar) load stmt to be vectorized.  This load accesses
          a memory location that may be unaligned.
   BSI - place where new code is to be inserted.
   ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
                              is used.
   REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
                       target hook, if defined.
   Return value - the result of the loop-header phi node.  */
vect_setup_realignment (gimple stmt, gimple_stmt_iterator *gsi,
                        tree *realignment_token,
                        enum dr_alignment_support alignment_support_scheme,
                        tree init_addr,
                        struct loop **at_loop)
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
  struct loop *loop = NULL;
  tree scalar_dest = gimple_assign_lhs (stmt);
  tree msq_init = NULL_TREE;
  tree msq = NULL_TREE;
  gimple_seq stmts = NULL;
  bool compute_in_loop = false;
  bool nested_in_vect_loop = false;
  struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
  struct loop *loop_for_initial_load = NULL;

  loop = LOOP_VINFO_LOOP (loop_vinfo);
  nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);

  gcc_assert (alignment_support_scheme == dr_explicit_realign
              || alignment_support_scheme == dr_explicit_realign_optimized);

  /* We need to generate three things:
     1. the misalignment computation
     2. the extra vector load (for the optimized realignment scheme).
     3. the phi node for the two vectors from which the realignment is
        done (for the optimized realignment scheme).  */

  /* 1. Determine where to generate the misalignment computation.

     If INIT_ADDR is NULL_TREE, this indicates that the misalignment
     calculation will be generated by this function, outside the loop (in the
     preheader).  Otherwise, INIT_ADDR has already been computed for us by the
     caller, inside the loop.

     Background: If the misalignment remains fixed throughout the iterations of
     the loop, then both realignment schemes are applicable, and also the
     misalignment computation can be done outside LOOP.  This is because we are
     vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
     are a multiple of VS (the Vector Size), and therefore the misalignment in
     different vectorized LOOP iterations is always the same.
     The problem arises only if the memory access is in an inner-loop nested
     inside LOOP, which is now being vectorized using outer-loop vectorization.
     This is the only case when the misalignment of the memory access may not
     remain fixed throughout the iterations of the inner-loop (as explained in
     detail in vect_supportable_dr_alignment).  In this case, not only is the
     optimized realignment scheme not applicable, but also the misalignment
     computation (and generation of the realignment token that is passed to
     REALIGN_LOAD) have to be done inside the loop.

     In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
     or not, which in turn determines if the misalignment is computed inside
     the inner-loop, or outside LOOP.  */

  if (init_addr != NULL_TREE || !loop_vinfo)
      compute_in_loop = true;
      gcc_assert (alignment_support_scheme == dr_explicit_realign);
  /* 2. Determine where to generate the extra vector load.

     For the optimized realignment scheme, instead of generating two vector
     loads in each iteration, we generate a single extra vector load in the
     preheader of the loop, and in each iteration reuse the result of the
     vector load from the previous iteration.  In case the memory access is in
     an inner-loop nested inside LOOP, which is now being vectorized using
     outer-loop vectorization, we need to determine whether this initial vector
     load should be generated at the preheader of the inner-loop, or can be
     generated at the preheader of LOOP.  If the memory access has no evolution
     in LOOP, it can be generated in the preheader of LOOP.  Otherwise, it has
     to be generated inside LOOP (in the preheader of the inner-loop).  */

  if (nested_in_vect_loop)
      tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
      bool invariant_in_outerloop =
        (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
      loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
  else
    loop_for_initial_load = loop;

  *at_loop = loop_for_initial_load;

  if (loop_for_initial_load)
    pe = loop_preheader_edge (loop_for_initial_load);

  /* 3. For the case of the optimized realignment, create the first vector
        load at the loop preheader.  */

  if (alignment_support_scheme == dr_explicit_realign_optimized)
      /* Create msq_init = *(floor(p1)) in the loop preheader.  */

      gcc_assert (!compute_in_loop);
      vec_dest = vect_create_destination_var (scalar_dest, vectype);
      ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
                                      NULL_TREE, &init_addr, NULL, &inc,
      new_temp = copy_ssa_name (ptr, NULL);
      new_stmt = gimple_build_assign_with_ops
                   (BIT_AND_EXPR, new_temp, ptr,
                    build_int_cst (TREE_TYPE (ptr),
                                   -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
      new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
      gcc_assert (!new_bb);

      data_ref
        = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
                  build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
      new_stmt = gimple_build_assign (vec_dest, data_ref);
      new_temp = make_ssa_name (vec_dest, new_stmt);
      gimple_assign_set_lhs (new_stmt, new_temp);

      new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
      gcc_assert (!new_bb);

      gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);

      msq_init = gimple_assign_lhs (new_stmt);
  /* 4. Create realignment token using a target builtin, if available.
        It is done either inside the containing loop, or before LOOP (as
        determined above).  */

  if (targetm.vectorize.builtin_mask_for_load)
      /* Compute INIT_ADDR - the initial address accessed by this memref.  */

          /* Generate the INIT_ADDR computation outside LOOP.  */
          init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,

          pe = loop_preheader_edge (loop);
          new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
          gcc_assert (!new_bb);

          gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);

      builtin_decl = targetm.vectorize.builtin_mask_for_load ();
      new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
      vec_dest =
        vect_create_destination_var (scalar_dest,
                                     gimple_call_return_type (new_stmt));
      new_temp = make_ssa_name (vec_dest, new_stmt);
      gimple_call_set_lhs (new_stmt, new_temp);

      if (compute_in_loop)
        gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
      else
          /* Generate the misalignment computation outside LOOP.  */
          pe = loop_preheader_edge (loop);
          new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
          gcc_assert (!new_bb);

      *realignment_token = gimple_call_lhs (new_stmt);

      /* The result of the CALL_EXPR to this builtin is determined from
         the value of the parameter and no global variables are touched
         which makes the builtin a "const" function.  Requiring the
         builtin to have the "const" attribute makes it unnecessary
         to call mark_call_clobbered.  */
      gcc_assert (TREE_READONLY (builtin_decl));
  if (alignment_support_scheme == dr_explicit_realign)

  gcc_assert (!compute_in_loop);
  gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);

  /* 5. Create msq = phi <msq_init, lsq> in loop  */

  pe = loop_preheader_edge (containing_loop);
  vec_dest = vect_create_destination_var (scalar_dest, vectype);
  msq = make_ssa_name (vec_dest, NULL);
  phi_stmt = create_phi_node (msq, containing_loop->header);
  add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
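/* Illustrative sketch (not part of GCC itself): a plain-C emulation of the
   realignment idea whose prolog vect_setup_realignment creates.  The names
   (realign_load_sketch, VS, msq, lsq, off) are invented; the byte offset OFF
   plays the role of the realignment token.  Note the simplification: the
   aligned load at floor(p) + VS is done unconditionally, which a real
   implementation must prove safe.  Kept inside #if 0 so it is illustration
   only.  */
#if 0
#include <stdint.h>
#include <string.h>

#define VS 16   /* assumed vector size in bytes */

static void
realign_load_sketch (const uint8_t *p, uint8_t result[VS])
{
  uintptr_t addr = (uintptr_t) p;
  const uint8_t *floor_p = (const uint8_t *) (addr & ~(uintptr_t) (VS - 1));
  size_t off = addr - (uintptr_t) floor_p;      /* the "realignment token" */
  uint8_t msq[VS], lsq[VS];
  size_t i;

  memcpy (msq, floor_p, VS);            /* msq = *(floor (p))       aligned */
  memcpy (lsq, floor_p + VS, VS);       /* lsq = *(floor (p) + VS)  aligned */

  /* result = realign_load (msq, lsq, token): the VS bytes that start at the
     possibly unaligned address P.  */
  for (i = 0; i < VS; i++)
    result[i] = i + off < VS ? msq[i + off] : lsq[i + off - VS];
}
#endif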
/* Function vect_grouped_load_supported.

   Returns TRUE if even and odd permutations are supported,
   and FALSE otherwise.  */

vect_grouped_load_supported (tree vectype, unsigned HOST_WIDE_INT count)
  enum machine_mode mode = TYPE_MODE (vectype);

  /* vect_permute_load_chain requires the group size to be a power of two.  */
  if (exact_log2 (count) == -1)
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "the size of the group of accesses"
                         " is not a power of 2\n");

  /* Check that the permutation is supported.  */
  if (VECTOR_MODE_P (mode))
      unsigned int i, nelt = GET_MODE_NUNITS (mode);
      unsigned char *sel = XALLOCAVEC (unsigned char, nelt);

      for (i = 0; i < nelt; i++)
      if (can_vec_perm_p (mode, false, sel))
          for (i = 0; i < nelt; i++)
          if (can_vec_perm_p (mode, false, sel))

  if (dump_enabled_p ())
    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                     "extract even/odd not supported by target\n");
/* Return TRUE if vec_load_lanes is available for COUNT vectors of
   type VECTYPE.  */

vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
  return vect_lanes_optab_supported_p ("vec_load_lanes",
                                       vec_load_lanes_optab,
                                       vectype, count);
/* Function vect_permute_load_chain.

   Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
   a power of 2, generate extract_even/odd stmts to reorder the input data
   correctly.  Return the final references for loads in RESULT_CHAIN.

   E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
   The input is 4 vectors each containing 8 elements.  We assign a number to
   each element, the input sequence is:

   1st vec:   0  1  2  3  4  5  6  7
   2nd vec:   8  9 10 11 12 13 14 15
   3rd vec:  16 17 18 19 20 21 22 23
   4th vec:  24 25 26 27 28 29 30 31

   The output sequence should be:

   1st vec:   0  4  8 12 16 20 24 28
   2nd vec:   1  5  9 13 17 21 25 29
   3rd vec:   2  6 10 14 18 22 26 30
   4th vec:   3  7 11 15 19 23 27 31

   i.e., the first output vector should contain the first elements of each
   interleaving group, etc.

   We use extract_even/odd instructions to create such output.  The input of
   each extract_even/odd operation is two vectors
   and the output is the vector of extracted even/odd elements.  The output of
   extract_even will be:   0 2 4 6
   and of extract_odd:     1 3 5 7

   The permutation is done in log LENGTH stages.  In each stage extract_even
   and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
   their order.  In our example,

   E1: extract_even (1st vec, 2nd vec)
   E2: extract_odd  (1st vec, 2nd vec)
   E3: extract_even (3rd vec, 4th vec)
   E4: extract_odd  (3rd vec, 4th vec)

   The output for the first stage will be:

   E1:  0  2  4  6  8 10 12 14
   E2:  1  3  5  7  9 11 13 15
   E3: 16 18 20 22 24 26 28 30
   E4: 17 19 21 23 25 27 29 31

   In order to proceed and create the correct sequence for the next stage (or
   for the correct output, if the second stage is the last one, as in our
   example), we first put the output of extract_even operation and then the
   output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
   The input for the second stage is:

   1st vec (E1):  0  2  4  6  8 10 12 14
   2nd vec (E3): 16 18 20 22 24 26 28 30
   3rd vec (E2):  1  3  5  7  9 11 13 15
   4th vec (E4): 17 19 21 23 25 27 29 31

   The output of the second stage:

   E1:  0  4  8 12 16 20 24 28
   E2:  2  6 10 14 18 22 26 30
   E3:  1  5  9 13 17 21 25 29
   E4:  3  7 11 15 19 23 27 31

   And RESULT_CHAIN after reordering:

   1st vec (E1):  0  4  8 12 16 20 24 28
   2nd vec (E3):  1  5  9 13 17 21 25 29
   3rd vec (E2):  2  6 10 14 18 22 26 30
   4th vec (E4):  3  7 11 15 19 23 27 31.  */
vect_permute_load_chain (vec<tree> dr_chain,
                         unsigned int length,
                         gimple stmt,
                         gimple_stmt_iterator *gsi,
                         vec<tree> *result_chain)
  tree data_ref, first_vect, second_vect;
  tree perm_mask_even, perm_mask_odd;
  tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
  unsigned int i, j, log_length = exact_log2 (length);
  unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
  unsigned char *sel = XALLOCAVEC (unsigned char, nelt);

  result_chain->quick_grow (length);
  memcpy (result_chain->address (), dr_chain.address (),
          length * sizeof (tree));

  for (i = 0; i < nelt; ++i)
  perm_mask_even = vect_gen_perm_mask (vectype, sel);
  gcc_assert (perm_mask_even != NULL);

  for (i = 0; i < nelt; ++i)
  perm_mask_odd = vect_gen_perm_mask (vectype, sel);
  gcc_assert (perm_mask_odd != NULL);

  for (i = 0; i < log_length; i++)
      for (j = 0; j < length; j += 2)
          first_vect = dr_chain[j];
          second_vect = dr_chain[j+1];

          /* data_ref = permute_even (first_data_ref, second_data_ref);  */
          data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
          perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
                                                    first_vect, second_vect,
                                                    perm_mask_even);
          vect_finish_stmt_generation (stmt, perm_stmt, gsi);
          (*result_chain)[j/2] = data_ref;

          /* data_ref = permute_odd (first_data_ref, second_data_ref);  */
          data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
          perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
                                                    first_vect, second_vect,
                                                    perm_mask_odd);
          vect_finish_stmt_generation (stmt, perm_stmt, gsi);
          (*result_chain)[j/2+length/2] = data_ref;

      memcpy (dr_chain.address (), result_chain->address (),
              length * sizeof (tree));
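/* Illustrative sketch (not part of GCC itself): a self-contained emulation of
   the extract-even/odd scheme documented above vect_permute_load_chain, for
   LENGTH = 4 and NELT = 8 as in the example.  The names are invented;
   sketch_vec_perm repeats the VEC_PERM_EXPR-style helper so this block stays
   self-contained.  Even results land in the first half of the chain and odd
   results in the second half, which after log2 (LENGTH) stages yields
   0 4 8 ... 28, then 1 5 9 ... 29, etc.  Kept inside #if 0 so it is
   illustration only.  */
#if 0
#include <string.h>

#define NELT 8
#define LENGTH 4

static void
sketch_vec_perm (const int *v0, const int *v1, const unsigned char *sel,
                 int *out)
{
  unsigned int i;
  for (i = 0; i < NELT; i++)
    out[i] = sel[i] < NELT ? v0[sel[i]] : v1[sel[i] - NELT];
}

static void
permute_load_chain_sketch (int chain[LENGTH][NELT])
{
  unsigned char sel_even[NELT], sel_odd[NELT];
  int result[LENGTH][NELT];
  unsigned int i, j, stage;

  for (i = 0; i < NELT; i++)
    {
      sel_even[i] = i * 2;              /* {0, 2, 4, ...}: extract_even */
      sel_odd[i] = i * 2 + 1;           /* {1, 3, 5, ...}: extract_odd  */
    }

  for (stage = 0; (1u << stage) < LENGTH; stage++)
    {
      for (j = 0; j < LENGTH; j += 2)
        {
          sketch_vec_perm (chain[j], chain[j + 1], sel_even, result[j / 2]);
          sketch_vec_perm (chain[j], chain[j + 1], sel_odd,
                           result[j / 2 + LENGTH / 2]);
        }
      memcpy (chain, result, sizeof (result));
    }
}
#endif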
/* Function vect_transform_grouped_load.

   Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
   to perform their permutation and ascribe the result vectorized statements to
   the scalar statements.  */

vect_transform_grouped_load (gimple stmt, vec<tree> dr_chain, int size,
                             gimple_stmt_iterator *gsi)
  vec<tree> result_chain = vNULL;

  /* DR_CHAIN contains input data-refs that are a part of the interleaving.
     RESULT_CHAIN is the output of vect_permute_load_chain; it contains
     permuted vectors that are ready for vector computation.  */
  result_chain.create (size);
  vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
  vect_record_grouped_load_vectors (stmt, result_chain);
  result_chain.release ();
/* RESULT_CHAIN contains the output of a group of grouped loads that were
   generated as part of the vectorization of STMT.  Assign the statement
   for each vector to the associated scalar statement.  */

vect_record_grouped_load_vectors (gimple stmt, vec<tree> result_chain)
  gimple first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
  gimple next_stmt, new_stmt;
  unsigned int i, gap_count;

  /* Put a permuted data-ref in the VECTORIZED_STMT field.
     Since we scan the chain starting from its first node, their order
     corresponds to the order of data-refs in RESULT_CHAIN.  */
  next_stmt = first_stmt;
  FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
      /* Skip the gaps.  Loads created for the gaps will be removed by dead
         code elimination pass later.  No need to check for the first stmt in
         the group, since it always exists.
         GROUP_GAP is the number of steps in elements from the previous
         access (if there is no gap GROUP_GAP is 1).  We skip loads that
         correspond to the gaps.  */
      if (next_stmt != first_stmt
          && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))

      new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
      /* We assume that if VEC_STMT is not NULL, this is a case of multiple
         copies, and we put the new vector statement in the first available
         RELATED_STMT field.  */
      if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
        STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;

          if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
              gimple prev_stmt =
                STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
              gimple rel_stmt =
                STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));

                  prev_stmt = rel_stmt;
                  rel_stmt =
                    STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));

              STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
                new_stmt;

      next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
      /* If NEXT_STMT accesses the same DR as the previous statement,
         put the same TMP_DATA_REF as its vectorized statement; otherwise
         get the next data-ref from RESULT_CHAIN.  */
      if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
/* Function vect_can_force_dr_alignment_p.

   Returns whether the alignment of DECL can be forced to an ALIGNMENT-bit
   boundary.  */

vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
  if (TREE_CODE (decl) != VAR_DECL)
    return false;

  /* We cannot change alignment of common or external symbols as another
     translation unit may contain a definition with lower alignment.
     The rules of common symbol linking mean that the definition
     will override the common symbol.  The same is true for constant
     pool entries which may be shared and are not properly merged.  */
  if (DECL_EXTERNAL (decl)
      || DECL_COMMON (decl)
      || DECL_IN_CONSTANT_POOL (decl))
    return false;

  if (TREE_ASM_WRITTEN (decl))
    return false;

  /* Do not override the alignment as specified by the ABI when the used
     attribute is set.  */
  if (DECL_PRESERVE_P (decl))
    return false;

  /* Do not override explicit alignment set by the user when an explicit
     section name is also used.  This is a common idiom used by many
     software projects.  */
  if (DECL_SECTION_NAME (decl) != NULL_TREE
      && !DECL_HAS_IMPLICIT_SECTION_NAME_P (decl))
    return false;

  if (TREE_STATIC (decl))
    return (alignment <= MAX_OFILE_ALIGNMENT);
  else
    return (alignment <= MAX_STACK_ALIGNMENT);
/* Return whether the data reference DR is supported with respect to its
   alignment.
   If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
   if it is aligned, i.e., check if it is possible to vectorize it with
   different alignment.  */

enum dr_alignment_support
vect_supportable_dr_alignment (struct data_reference *dr,
                               bool check_aligned_accesses)
  gimple stmt = DR_STMT (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  enum machine_mode mode = TYPE_MODE (vectype);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *vect_loop = NULL;
  bool nested_in_vect_loop = false;

  if (aligned_access_p (dr) && !check_aligned_accesses)

  /* For now assume all conditional loads/stores support unaligned
     access without any special code.  */
  if (is_gimple_call (stmt)
      && gimple_call_internal_p (stmt)
      && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
          || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
    return dr_unaligned_supported;

  vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
  nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);

  /* Possibly unaligned access.  */
  /* We can choose between using the implicit realignment scheme (generating
     a misaligned_move stmt) and the explicit realignment scheme (generating
     aligned loads with a REALIGN_LOAD).  There are two variants to the
     explicit realignment scheme: optimized, and unoptimized.
     We can optimize the realignment only if the step between consecutive
     vector loads is equal to the vector size.  Since the vector memory
     accesses advance in steps of VS (Vector Size) in the vectorized loop, it
     is guaranteed that the misalignment amount remains the same throughout the
     execution of the vectorized loop.  Therefore, we can create the
     "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
     at the loop preheader.

     However, in the case of outer-loop vectorization, when vectorizing a
     memory access in the inner-loop nested within the LOOP that is now being
     vectorized, while it is guaranteed that the misalignment of the
     vectorized memory access will remain the same in different outer-loop
     iterations, it is *not* guaranteed that it will remain the same throughout
     the execution of the inner-loop.  This is because the inner-loop advances
     with the original scalar step (and not in steps of VS).  If the inner-loop
     step happens to be a multiple of VS, then the misalignment remains fixed
     and we can use the optimized realignment scheme.  For example:

     When vectorizing the i-loop in the above example, the step between
     consecutive vector loads is 1, and so the misalignment does not remain
     fixed across the execution of the inner-loop, and the realignment cannot
     be optimized (as illustrated in the following pseudo vectorized loop):

       for (i=0; i<N; i+=4)
         for (j=0; j<M; j++){
           vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
                          // when j is {0,1,2,3,4,5,6,7,...} respectively.
                          // (assuming that we start from an aligned address).

     We therefore have to use the unoptimized realignment scheme:

       for (i=0; i<N; i+=4)
         for (j=k; j<M; j+=4)
           vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
                          // that the misalignment of the initial address is

     The loop can then be vectorized as follows:

       for (k=0; k<4; k++){
         rt = get_realignment_token (&vp[k]);
         for (i=0; i<N; i+=4){
           for (j=k; j<M; j+=4){
             va = REALIGN_LOAD <v1,v2,rt>;  */
  if (DR_IS_READ (dr))
      bool is_packed = false;
      tree type = (TREE_TYPE (DR_REF (dr)));

      if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
          && (!targetm.vectorize.builtin_mask_for_load
              || targetm.vectorize.builtin_mask_for_load ()))
          tree vectype = STMT_VINFO_VECTYPE (stmt_info);
          if ((nested_in_vect_loop
               && (TREE_INT_CST_LOW (DR_STEP (dr))
                   != GET_MODE_SIZE (TYPE_MODE (vectype))))
            return dr_explicit_realign;
          else
            return dr_explicit_realign_optimized;

      if (!known_alignment_for_access_p (dr))
        is_packed = not_size_aligned (DR_REF (dr));

      if ((TYPE_USER_ALIGN (type) && !is_packed)
          || targetm.vectorize.support_vector_misalignment (mode, type,
                                                            DR_MISALIGNMENT (dr),
                                                            is_packed))
        /* Can't software pipeline the loads, but can at least do them.  */
        return dr_unaligned_supported;

      bool is_packed = false;
      tree type = (TREE_TYPE (DR_REF (dr)));

      if (!known_alignment_for_access_p (dr))
        is_packed = not_size_aligned (DR_REF (dr));

      if ((TYPE_USER_ALIGN (type) && !is_packed)
          || targetm.vectorize.support_vector_misalignment (mode, type,
                                                            DR_MISALIGNMENT (dr),
                                                            is_packed))
        return dr_unaligned_supported;

  return dr_unaligned_unsupported;