1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2014 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
24 #include "coretypes.h"
28 #include "stor-layout.h"
31 #include "basic-block.h"
32 #include "gimple-pretty-print.h"
33 #include "tree-ssa-alias.h"
34 #include "internal-fn.h"
36 #include "gimple-expr.h"
40 #include "gimple-iterator.h"
41 #include "gimplify-me.h"
42 #include "gimple-ssa.h"
43 #include "tree-phinodes.h"
44 #include "ssa-iterators.h"
45 #include "stringpool.h"
46 #include "tree-ssanames.h"
47 #include "tree-ssa-loop-ivopts.h"
48 #include "tree-ssa-loop-manip.h"
49 #include "tree-ssa-loop.h"
52 #include "tree-chrec.h"
53 #include "tree-scalar-evolution.h"
54 #include "tree-vectorizer.h"
55 #include "diagnostic-core.h"
57 /* Need to include rtl.h, expr.h, etc. for optabs. */
61 /* Return true if load- or store-lanes optab OPTAB is implemented for
62 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
65 vect_lanes_optab_supported_p (const char *name
, convert_optab optab
,
66 tree vectype
, unsigned HOST_WIDE_INT count
)
68 enum machine_mode mode
, array_mode
;
71 mode
= TYPE_MODE (vectype
);
72 limit_p
= !targetm
.array_mode_supported_p (mode
, count
);
73 array_mode
= mode_for_size (count
* GET_MODE_BITSIZE (mode
),
76 if (array_mode
== BLKmode
)
78 if (dump_enabled_p ())
79 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
80 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC
"]\n",
81 GET_MODE_NAME (mode
), count
);
85 if (convert_optab_handler (optab
, array_mode
, mode
) == CODE_FOR_nothing
)
87 if (dump_enabled_p ())
88 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
89 "cannot use %s<%s><%s>\n", name
,
90 GET_MODE_NAME (array_mode
), GET_MODE_NAME (mode
));
94 if (dump_enabled_p ())
95 dump_printf_loc (MSG_NOTE
, vect_location
,
96 "can use %s<%s><%s>\n", name
, GET_MODE_NAME (array_mode
),
97 GET_MODE_NAME (mode
));
103 /* Return the smallest scalar part of STMT.
104 This is used to determine the vectype of the stmt. We generally set the
105 vectype according to the type of the result (lhs). For stmts whose
106 result-type is different than the type of the arguments (e.g., demotion,
107 promotion), vectype will be reset appropriately (later). Note that we have
108 to visit the smallest datatype in this function, because that determines the
109 VF. If the smallest datatype in the loop is present only as the rhs of a
110 promotion operation - we'd miss it.
111 Such a case, where a variable of this datatype does not appear in the lhs
112 anywhere in the loop, can only occur if it's an invariant: e.g.:
113 'int_x = (int) short_inv', which we'd expect to have been optimized away by
114 invariant motion. However, we cannot rely on invariant motion to always
115 take invariants out of the loop, and so in the case of promotion we also
116 have to check the rhs.
117 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
121 vect_get_smallest_scalar_type (gimple stmt
, HOST_WIDE_INT
*lhs_size_unit
,
122 HOST_WIDE_INT
*rhs_size_unit
)
124 tree scalar_type
= gimple_expr_type (stmt
);
125 HOST_WIDE_INT lhs
, rhs
;
127 lhs
= rhs
= TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type
));
129 if (is_gimple_assign (stmt
)
130 && (gimple_assign_cast_p (stmt
)
131 || gimple_assign_rhs_code (stmt
) == WIDEN_MULT_EXPR
132 || gimple_assign_rhs_code (stmt
) == WIDEN_LSHIFT_EXPR
133 || gimple_assign_rhs_code (stmt
) == FLOAT_EXPR
))
135 tree rhs_type
= TREE_TYPE (gimple_assign_rhs1 (stmt
));
137 rhs
= TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type
));
139 scalar_type
= rhs_type
;
142 *lhs_size_unit
= lhs
;
143 *rhs_size_unit
= rhs
;
148 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
149 tested at run-time. Return TRUE if DDR was successfully inserted.
150 Return false if versioning is not supported. */
153 vect_mark_for_runtime_alias_test (ddr_p ddr
, loop_vec_info loop_vinfo
)
155 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
157 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS
) == 0)
160 if (dump_enabled_p ())
162 dump_printf_loc (MSG_NOTE
, vect_location
,
163 "mark for run-time aliasing test between ");
164 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (DDR_A (ddr
)));
165 dump_printf (MSG_NOTE
, " and ");
166 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (DDR_B (ddr
)));
167 dump_printf (MSG_NOTE
, "\n");
170 if (optimize_loop_nest_for_size_p (loop
))
172 if (dump_enabled_p ())
173 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
174 "versioning not supported when optimizing"
179 /* FORNOW: We don't support versioning with outer-loop vectorization. */
182 if (dump_enabled_p ())
183 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
184 "versioning not yet supported for outer-loops.\n");
188 /* FORNOW: We don't support creating runtime alias tests for non-constant
190 if (TREE_CODE (DR_STEP (DDR_A (ddr
))) != INTEGER_CST
191 || TREE_CODE (DR_STEP (DDR_B (ddr
))) != INTEGER_CST
)
193 if (dump_enabled_p ())
194 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
195 "versioning not yet supported for non-constant "
200 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo
).safe_push (ddr
);
205 /* Function vect_analyze_data_ref_dependence.
207 Return TRUE if there (might) exist a dependence between a memory-reference
208 DRA and a memory-reference DRB. When versioning for alias may check a
209 dependence at run-time, return FALSE. Adjust *MAX_VF according to
210 the data dependence. */
213 vect_analyze_data_ref_dependence (struct data_dependence_relation
*ddr
,
214 loop_vec_info loop_vinfo
, int *max_vf
)
217 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
218 struct data_reference
*dra
= DDR_A (ddr
);
219 struct data_reference
*drb
= DDR_B (ddr
);
220 stmt_vec_info stmtinfo_a
= vinfo_for_stmt (DR_STMT (dra
));
221 stmt_vec_info stmtinfo_b
= vinfo_for_stmt (DR_STMT (drb
));
222 lambda_vector dist_v
;
223 unsigned int loop_depth
;
225 /* In loop analysis all data references should be vectorizable. */
226 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a
)
227 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b
))
230 /* Independent data accesses. */
231 if (DDR_ARE_DEPENDENT (ddr
) == chrec_known
)
235 || (DR_IS_READ (dra
) && DR_IS_READ (drb
)))
238 /* Unknown data dependence. */
239 if (DDR_ARE_DEPENDENT (ddr
) == chrec_dont_know
)
241 /* If user asserted safelen consecutive iterations can be
242 executed concurrently, assume independence. */
243 if (loop
->safelen
>= 2)
245 if (loop
->safelen
< *max_vf
)
246 *max_vf
= loop
->safelen
;
247 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo
) = false;
251 if (STMT_VINFO_GATHER_P (stmtinfo_a
)
252 || STMT_VINFO_GATHER_P (stmtinfo_b
))
254 if (dump_enabled_p ())
256 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
257 "versioning for alias not supported for: "
258 "can't determine dependence between ");
259 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
261 dump_printf (MSG_MISSED_OPTIMIZATION
, " and ");
262 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
264 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
269 if (dump_enabled_p ())
271 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
272 "versioning for alias required: "
273 "can't determine dependence between ");
274 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
276 dump_printf (MSG_MISSED_OPTIMIZATION
, " and ");
277 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
279 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
282 /* Add to list of ddrs that need to be tested at run-time. */
283 return !vect_mark_for_runtime_alias_test (ddr
, loop_vinfo
);
286 /* Known data dependence. */
287 if (DDR_NUM_DIST_VECTS (ddr
) == 0)
289 /* If user asserted safelen consecutive iterations can be
290 executed concurrently, assume independence. */
291 if (loop
->safelen
>= 2)
293 if (loop
->safelen
< *max_vf
)
294 *max_vf
= loop
->safelen
;
295 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo
) = false;
299 if (STMT_VINFO_GATHER_P (stmtinfo_a
)
300 || STMT_VINFO_GATHER_P (stmtinfo_b
))
302 if (dump_enabled_p ())
304 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
305 "versioning for alias not supported for: "
306 "bad dist vector for ");
307 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
309 dump_printf (MSG_MISSED_OPTIMIZATION
, " and ");
310 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
312 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
317 if (dump_enabled_p ())
319 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
320 "versioning for alias required: "
321 "bad dist vector for ");
322 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, DR_REF (dra
));
323 dump_printf (MSG_MISSED_OPTIMIZATION
, " and ");
324 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, DR_REF (drb
));
325 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
327 /* Add to list of ddrs that need to be tested at run-time. */
328 return !vect_mark_for_runtime_alias_test (ddr
, loop_vinfo
);
331 loop_depth
= index_in_loop_nest (loop
->num
, DDR_LOOP_NEST (ddr
));
332 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr
), i
, dist_v
)
334 int dist
= dist_v
[loop_depth
];
336 if (dump_enabled_p ())
337 dump_printf_loc (MSG_NOTE
, vect_location
,
338 "dependence distance = %d.\n", dist
);
342 if (dump_enabled_p ())
344 dump_printf_loc (MSG_NOTE
, vect_location
,
345 "dependence distance == 0 between ");
346 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dra
));
347 dump_printf (MSG_NOTE
, " and ");
348 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (drb
));
349 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
352 /* When we perform grouped accesses and perform implicit CSE
353 by detecting equal accesses and doing disambiguation with
354 runtime alias tests like for
362 where we will end up loading { a[i], a[i+1] } once, make
363 sure that inserting group loads before the first load and
364 stores after the last store will do the right thing. */
365 if ((STMT_VINFO_GROUPED_ACCESS (stmtinfo_a
)
366 && GROUP_SAME_DR_STMT (stmtinfo_a
))
367 || (STMT_VINFO_GROUPED_ACCESS (stmtinfo_b
)
368 && GROUP_SAME_DR_STMT (stmtinfo_b
)))
371 earlier_stmt
= get_earlier_stmt (DR_STMT (dra
), DR_STMT (drb
));
373 (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt
))))
375 if (dump_enabled_p ())
376 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
377 "READ_WRITE dependence in interleaving."
386 if (dist
> 0 && DDR_REVERSED_P (ddr
))
388 /* If DDR_REVERSED_P the order of the data-refs in DDR was
389 reversed (to make distance vector positive), and the actual
390 distance is negative. */
391 if (dump_enabled_p ())
392 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
393 "dependence distance negative.\n");
398 && abs (dist
) < *max_vf
)
400 /* The dependence distance requires reduction of the maximal
401 vectorization factor. */
402 *max_vf
= abs (dist
);
403 if (dump_enabled_p ())
404 dump_printf_loc (MSG_NOTE
, vect_location
,
405 "adjusting maximal vectorization factor to %i\n",
409 if (abs (dist
) >= *max_vf
)
411 /* Dependence distance does not create dependence, as far as
412 vectorization is concerned, in this case. */
413 if (dump_enabled_p ())
414 dump_printf_loc (MSG_NOTE
, vect_location
,
415 "dependence distance >= VF.\n");
419 if (dump_enabled_p ())
421 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
422 "not vectorized, possible dependence "
423 "between data-refs ");
424 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dra
));
425 dump_printf (MSG_NOTE
, " and ");
426 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (drb
));
427 dump_printf (MSG_NOTE
, "\n");
436 /* Function vect_analyze_data_ref_dependences.
438 Examine all the data references in the loop, and make sure there do not
439 exist any data dependences between them. Set *MAX_VF according to
440 the maximum vectorization factor the data dependences allow. */
443 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo
, int *max_vf
)
446 struct data_dependence_relation
*ddr
;
448 if (dump_enabled_p ())
449 dump_printf_loc (MSG_NOTE
, vect_location
,
450 "=== vect_analyze_data_ref_dependences ===\n");
452 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo
) = true;
453 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo
),
454 &LOOP_VINFO_DDRS (loop_vinfo
),
455 LOOP_VINFO_LOOP_NEST (loop_vinfo
), true))
458 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo
), i
, ddr
)
459 if (vect_analyze_data_ref_dependence (ddr
, loop_vinfo
, max_vf
))
466 /* Function vect_slp_analyze_data_ref_dependence.
468 Return TRUE if there (might) exist a dependence between a memory-reference
469 DRA and a memory-reference DRB. When versioning for alias may check a
470 dependence at run-time, return FALSE. Adjust *MAX_VF according to
471 the data dependence. */
474 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation
*ddr
)
476 struct data_reference
*dra
= DDR_A (ddr
);
477 struct data_reference
*drb
= DDR_B (ddr
);
479 /* We need to check dependences of statements marked as unvectorizable
480 as well, they still can prohibit vectorization. */
482 /* Independent data accesses. */
483 if (DDR_ARE_DEPENDENT (ddr
) == chrec_known
)
489 /* Read-read is OK. */
490 if (DR_IS_READ (dra
) && DR_IS_READ (drb
))
493 /* If dra and drb are part of the same interleaving chain consider
495 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra
)))
496 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra
)))
497 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb
)))))
500 /* Unknown data dependence. */
501 if (DDR_ARE_DEPENDENT (ddr
) == chrec_dont_know
)
503 if (dump_enabled_p ())
505 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
506 "can't determine dependence between ");
507 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, DR_REF (dra
));
508 dump_printf (MSG_MISSED_OPTIMIZATION
, " and ");
509 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, DR_REF (drb
));
510 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
513 else if (dump_enabled_p ())
515 dump_printf_loc (MSG_NOTE
, vect_location
,
516 "determined dependence between ");
517 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dra
));
518 dump_printf (MSG_NOTE
, " and ");
519 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (drb
));
520 dump_printf (MSG_NOTE
, "\n");
523 /* We do not vectorize basic blocks with write-write dependencies. */
524 if (DR_IS_WRITE (dra
) && DR_IS_WRITE (drb
))
527 /* If we have a read-write dependence check that the load is before the store.
528 When we vectorize basic blocks, vector load can be only before
529 corresponding scalar load, and vector store can be only after its
530 corresponding scalar store. So the order of the acceses is preserved in
531 case the load is before the store. */
532 gimple earlier_stmt
= get_earlier_stmt (DR_STMT (dra
), DR_STMT (drb
));
533 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt
))))
535 /* That only holds for load-store pairs taking part in vectorization. */
536 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dra
)))
537 && STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (drb
))))
545 /* Function vect_analyze_data_ref_dependences.
547 Examine all the data references in the basic-block, and make sure there
548 do not exist any data dependences between them. Set *MAX_VF according to
549 the maximum vectorization factor the data dependences allow. */
552 vect_slp_analyze_data_ref_dependences (bb_vec_info bb_vinfo
)
554 struct data_dependence_relation
*ddr
;
557 if (dump_enabled_p ())
558 dump_printf_loc (MSG_NOTE
, vect_location
,
559 "=== vect_slp_analyze_data_ref_dependences ===\n");
561 if (!compute_all_dependences (BB_VINFO_DATAREFS (bb_vinfo
),
562 &BB_VINFO_DDRS (bb_vinfo
),
566 FOR_EACH_VEC_ELT (BB_VINFO_DDRS (bb_vinfo
), i
, ddr
)
567 if (vect_slp_analyze_data_ref_dependence (ddr
))
574 /* Function vect_compute_data_ref_alignment
576 Compute the misalignment of the data reference DR.
579 1. If during the misalignment computation it is found that the data reference
580 cannot be vectorized then false is returned.
581 2. DR_MISALIGNMENT (DR) is defined.
583 FOR NOW: No analysis is actually performed. Misalignment is calculated
584 only for trivial cases. TODO. */
587 vect_compute_data_ref_alignment (struct data_reference
*dr
)
589 gimple stmt
= DR_STMT (dr
);
590 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
591 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
592 struct loop
*loop
= NULL
;
593 tree ref
= DR_REF (dr
);
595 tree base
, base_addr
;
598 tree aligned_to
, alignment
;
600 if (dump_enabled_p ())
601 dump_printf_loc (MSG_NOTE
, vect_location
,
602 "vect_compute_data_ref_alignment:\n");
605 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
607 /* Initialize misalignment to unknown. */
608 SET_DR_MISALIGNMENT (dr
, -1);
610 /* Strided loads perform only component accesses, misalignment information
611 is irrelevant for them. */
612 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info
))
615 misalign
= DR_INIT (dr
);
616 aligned_to
= DR_ALIGNED_TO (dr
);
617 base_addr
= DR_BASE_ADDRESS (dr
);
618 vectype
= STMT_VINFO_VECTYPE (stmt_info
);
620 /* In case the dataref is in an inner-loop of the loop that is being
621 vectorized (LOOP), we use the base and misalignment information
622 relative to the outer-loop (LOOP). This is ok only if the misalignment
623 stays the same throughout the execution of the inner-loop, which is why
624 we have to check that the stride of the dataref in the inner-loop evenly
625 divides by the vector size. */
626 if (loop
&& nested_in_vect_loop_p (loop
, stmt
))
628 tree step
= DR_STEP (dr
);
629 HOST_WIDE_INT dr_step
= TREE_INT_CST_LOW (step
);
631 if (dr_step
% GET_MODE_SIZE (TYPE_MODE (vectype
)) == 0)
633 if (dump_enabled_p ())
634 dump_printf_loc (MSG_NOTE
, vect_location
,
635 "inner step divides the vector-size.\n");
636 misalign
= STMT_VINFO_DR_INIT (stmt_info
);
637 aligned_to
= STMT_VINFO_DR_ALIGNED_TO (stmt_info
);
638 base_addr
= STMT_VINFO_DR_BASE_ADDRESS (stmt_info
);
642 if (dump_enabled_p ())
643 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
644 "inner step doesn't divide the vector-size.\n");
645 misalign
= NULL_TREE
;
649 /* Similarly, if we're doing basic-block vectorization, we can only use
650 base and misalignment information relative to an innermost loop if the
651 misalignment stays the same throughout the execution of the loop.
652 As above, this is the case if the stride of the dataref evenly divides
653 by the vector size. */
656 tree step
= DR_STEP (dr
);
657 HOST_WIDE_INT dr_step
= TREE_INT_CST_LOW (step
);
659 if (dr_step
% GET_MODE_SIZE (TYPE_MODE (vectype
)) != 0)
661 if (dump_enabled_p ())
662 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
663 "SLP: step doesn't divide the vector-size.\n");
664 misalign
= NULL_TREE
;
668 base
= build_fold_indirect_ref (base_addr
);
669 alignment
= ssize_int (TYPE_ALIGN (vectype
)/BITS_PER_UNIT
);
671 if ((aligned_to
&& tree_int_cst_compare (aligned_to
, alignment
) < 0)
674 if (dump_enabled_p ())
676 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
677 "Unknown alignment for access: ");
678 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, base
);
679 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
685 && tree_int_cst_compare (ssize_int (DECL_ALIGN_UNIT (base
)),
687 || (TREE_CODE (base_addr
) == SSA_NAME
688 && tree_int_cst_compare (ssize_int (TYPE_ALIGN_UNIT (TREE_TYPE (
689 TREE_TYPE (base_addr
)))),
691 || (get_pointer_alignment (base_addr
) >= TYPE_ALIGN (vectype
)))
694 base_aligned
= false;
698 /* Do not change the alignment of global variables here if
699 flag_section_anchors is enabled as we already generated
700 RTL for other functions. Most global variables should
701 have been aligned during the IPA increase_alignment pass. */
702 if (!vect_can_force_dr_alignment_p (base
, TYPE_ALIGN (vectype
))
703 || (TREE_STATIC (base
) && flag_section_anchors
))
705 if (dump_enabled_p ())
707 dump_printf_loc (MSG_NOTE
, vect_location
,
708 "can't force alignment of ref: ");
709 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, ref
);
710 dump_printf (MSG_NOTE
, "\n");
715 /* Force the alignment of the decl.
716 NOTE: This is the only change to the code we make during
717 the analysis phase, before deciding to vectorize the loop. */
718 if (dump_enabled_p ())
720 dump_printf_loc (MSG_NOTE
, vect_location
, "force alignment of ");
721 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, ref
);
722 dump_printf (MSG_NOTE
, "\n");
725 ((dataref_aux
*)dr
->aux
)->base_decl
= base
;
726 ((dataref_aux
*)dr
->aux
)->base_misaligned
= true;
729 /* If this is a backward running DR then first access in the larger
730 vectype actually is N-1 elements before the address in the DR.
731 Adjust misalign accordingly. */
732 if (tree_int_cst_compare (DR_STEP (dr
), size_zero_node
) < 0)
734 tree offset
= ssize_int (TYPE_VECTOR_SUBPARTS (vectype
) - 1);
735 /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
736 otherwise we wouldn't be here. */
737 offset
= fold_build2 (MULT_EXPR
, ssizetype
, offset
, DR_STEP (dr
));
738 /* PLUS because DR_STEP was negative. */
739 misalign
= size_binop (PLUS_EXPR
, misalign
, offset
);
742 /* Modulo alignment. */
743 misalign
= size_binop (FLOOR_MOD_EXPR
, misalign
, alignment
);
745 if (!tree_fits_uhwi_p (misalign
))
747 /* Negative or overflowed misalignment value. */
748 if (dump_enabled_p ())
749 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
750 "unexpected misalign value\n");
754 SET_DR_MISALIGNMENT (dr
, tree_to_uhwi (misalign
));
756 if (dump_enabled_p ())
758 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
759 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr
));
760 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, ref
);
761 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
768 /* Function vect_compute_data_refs_alignment
770 Compute the misalignment of data references in the loop.
771 Return FALSE if a data reference is found that cannot be vectorized. */
774 vect_compute_data_refs_alignment (loop_vec_info loop_vinfo
,
775 bb_vec_info bb_vinfo
)
777 vec
<data_reference_p
> datarefs
;
778 struct data_reference
*dr
;
782 datarefs
= LOOP_VINFO_DATAREFS (loop_vinfo
);
784 datarefs
= BB_VINFO_DATAREFS (bb_vinfo
);
786 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
787 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr
)))
788 && !vect_compute_data_ref_alignment (dr
))
792 /* Mark unsupported statement as unvectorizable. */
793 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr
))) = false;
804 /* Function vect_update_misalignment_for_peel
806 DR - the data reference whose misalignment is to be adjusted.
807 DR_PEEL - the data reference whose misalignment is being made
808 zero in the vector loop by the peel.
809 NPEEL - the number of iterations in the peel loop if the misalignment
810 of DR_PEEL is known at compile time. */
813 vect_update_misalignment_for_peel (struct data_reference
*dr
,
814 struct data_reference
*dr_peel
, int npeel
)
817 vec
<dr_p
> same_align_drs
;
818 struct data_reference
*current_dr
;
819 int dr_size
= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr
))));
820 int dr_peel_size
= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel
))));
821 stmt_vec_info stmt_info
= vinfo_for_stmt (DR_STMT (dr
));
822 stmt_vec_info peel_stmt_info
= vinfo_for_stmt (DR_STMT (dr_peel
));
824 /* For interleaved data accesses the step in the loop must be multiplied by
825 the size of the interleaving group. */
826 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
827 dr_size
*= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info
)));
828 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info
))
829 dr_peel_size
*= GROUP_SIZE (peel_stmt_info
);
831 /* It can be assumed that the data refs with the same alignment as dr_peel
832 are aligned in the vector loop. */
834 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel
)));
835 FOR_EACH_VEC_ELT (same_align_drs
, i
, current_dr
)
837 if (current_dr
!= dr
)
839 gcc_assert (DR_MISALIGNMENT (dr
) / dr_size
==
840 DR_MISALIGNMENT (dr_peel
) / dr_peel_size
);
841 SET_DR_MISALIGNMENT (dr
, 0);
845 if (known_alignment_for_access_p (dr
)
846 && known_alignment_for_access_p (dr_peel
))
848 bool negative
= tree_int_cst_compare (DR_STEP (dr
), size_zero_node
) < 0;
849 int misal
= DR_MISALIGNMENT (dr
);
850 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
851 misal
+= negative
? -npeel
* dr_size
: npeel
* dr_size
;
852 misal
&= (TYPE_ALIGN (vectype
) / BITS_PER_UNIT
) - 1;
853 SET_DR_MISALIGNMENT (dr
, misal
);
857 if (dump_enabled_p ())
858 dump_printf_loc (MSG_NOTE
, vect_location
, "Setting misalignment to -1.\n");
859 SET_DR_MISALIGNMENT (dr
, -1);
863 /* Function vect_verify_datarefs_alignment
865 Return TRUE if all data references in the loop can be
866 handled with respect to alignment. */
869 vect_verify_datarefs_alignment (loop_vec_info loop_vinfo
, bb_vec_info bb_vinfo
)
871 vec
<data_reference_p
> datarefs
;
872 struct data_reference
*dr
;
873 enum dr_alignment_support supportable_dr_alignment
;
877 datarefs
= LOOP_VINFO_DATAREFS (loop_vinfo
);
879 datarefs
= BB_VINFO_DATAREFS (bb_vinfo
);
881 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
883 gimple stmt
= DR_STMT (dr
);
884 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
886 if (!STMT_VINFO_RELEVANT_P (stmt_info
))
889 /* For interleaving, only the alignment of the first access matters.
890 Skip statements marked as not vectorizable. */
891 if ((STMT_VINFO_GROUPED_ACCESS (stmt_info
)
892 && GROUP_FIRST_ELEMENT (stmt_info
) != stmt
)
893 || !STMT_VINFO_VECTORIZABLE (stmt_info
))
896 /* Strided loads perform only component accesses, alignment is
897 irrelevant for them. */
898 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info
))
901 supportable_dr_alignment
= vect_supportable_dr_alignment (dr
, false);
902 if (!supportable_dr_alignment
)
904 if (dump_enabled_p ())
907 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
908 "not vectorized: unsupported unaligned load.");
910 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
911 "not vectorized: unsupported unaligned "
914 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
916 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
920 if (supportable_dr_alignment
!= dr_aligned
&& dump_enabled_p ())
921 dump_printf_loc (MSG_NOTE
, vect_location
,
922 "Vectorizing an unaligned access.\n");
927 /* Given an memory reference EXP return whether its alignment is less
931 not_size_aligned (tree exp
)
933 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp
))))
936 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp
)))
937 > get_object_alignment (exp
));
940 /* Function vector_alignment_reachable_p
942 Return true if vector alignment for DR is reachable by peeling
943 a few loop iterations. Return false otherwise. */
946 vector_alignment_reachable_p (struct data_reference
*dr
)
948 gimple stmt
= DR_STMT (dr
);
949 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
950 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
952 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
954 /* For interleaved access we peel only if number of iterations in
955 the prolog loop ({VF - misalignment}), is a multiple of the
956 number of the interleaved accesses. */
957 int elem_size
, mis_in_elements
;
958 int nelements
= TYPE_VECTOR_SUBPARTS (vectype
);
960 /* FORNOW: handle only known alignment. */
961 if (!known_alignment_for_access_p (dr
))
964 elem_size
= GET_MODE_SIZE (TYPE_MODE (vectype
)) / nelements
;
965 mis_in_elements
= DR_MISALIGNMENT (dr
) / elem_size
;
967 if ((nelements
- mis_in_elements
) % GROUP_SIZE (stmt_info
))
971 /* If misalignment is known at the compile time then allow peeling
972 only if natural alignment is reachable through peeling. */
973 if (known_alignment_for_access_p (dr
) && !aligned_access_p (dr
))
975 HOST_WIDE_INT elmsize
=
976 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype
)));
977 if (dump_enabled_p ())
979 dump_printf_loc (MSG_NOTE
, vect_location
,
980 "data size =" HOST_WIDE_INT_PRINT_DEC
, elmsize
);
981 dump_printf (MSG_NOTE
,
982 ". misalignment = %d.\n", DR_MISALIGNMENT (dr
));
984 if (DR_MISALIGNMENT (dr
) % elmsize
)
986 if (dump_enabled_p ())
987 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
988 "data size does not divide the misalignment.\n");
993 if (!known_alignment_for_access_p (dr
))
995 tree type
= TREE_TYPE (DR_REF (dr
));
996 bool is_packed
= not_size_aligned (DR_REF (dr
));
997 if (dump_enabled_p ())
998 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
999 "Unknown misalignment, is_packed = %d\n",is_packed
);
1000 if ((TYPE_USER_ALIGN (type
) && !is_packed
)
1001 || targetm
.vectorize
.vector_alignment_reachable (type
, is_packed
))
1011 /* Calculate the cost of the memory access represented by DR. */
1014 vect_get_data_access_cost (struct data_reference
*dr
,
1015 unsigned int *inside_cost
,
1016 unsigned int *outside_cost
,
1017 stmt_vector_for_cost
*body_cost_vec
)
1019 gimple stmt
= DR_STMT (dr
);
1020 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
1021 int nunits
= TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info
));
1022 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
1023 int vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
1024 int ncopies
= vf
/ nunits
;
1026 if (DR_IS_READ (dr
))
1027 vect_get_load_cost (dr
, ncopies
, true, inside_cost
, outside_cost
,
1028 NULL
, body_cost_vec
, false);
1030 vect_get_store_cost (dr
, ncopies
, inside_cost
, body_cost_vec
);
1032 if (dump_enabled_p ())
1033 dump_printf_loc (MSG_NOTE
, vect_location
,
1034 "vect_get_data_access_cost: inside_cost = %d, "
1035 "outside_cost = %d.\n", *inside_cost
, *outside_cost
);
1039 /* Insert DR into peeling hash table with NPEEL as key. */
1042 vect_peeling_hash_insert (loop_vec_info loop_vinfo
, struct data_reference
*dr
,
1045 struct _vect_peel_info elem
, *slot
;
1046 _vect_peel_info
**new_slot
;
1047 bool supportable_dr_alignment
= vect_supportable_dr_alignment (dr
, true);
1050 slot
= LOOP_VINFO_PEELING_HTAB (loop_vinfo
).find (&elem
);
1055 slot
= XNEW (struct _vect_peel_info
);
1056 slot
->npeel
= npeel
;
1059 new_slot
= LOOP_VINFO_PEELING_HTAB (loop_vinfo
).find_slot (slot
, INSERT
);
1063 if (!supportable_dr_alignment
1064 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo
)))
1065 slot
->count
+= VECT_MAX_COST
;
1069 /* Traverse peeling hash table to find peeling option that aligns maximum
1070 number of data accesses. */
1073 vect_peeling_hash_get_most_frequent (_vect_peel_info
**slot
,
1074 _vect_peel_extended_info
*max
)
1076 vect_peel_info elem
= *slot
;
1078 if (elem
->count
> max
->peel_info
.count
1079 || (elem
->count
== max
->peel_info
.count
1080 && max
->peel_info
.npeel
> elem
->npeel
))
1082 max
->peel_info
.npeel
= elem
->npeel
;
1083 max
->peel_info
.count
= elem
->count
;
1084 max
->peel_info
.dr
= elem
->dr
;
1091 /* Traverse peeling hash table and calculate cost for each peeling option.
1092 Find the one with the lowest cost. */
1095 vect_peeling_hash_get_lowest_cost (_vect_peel_info
**slot
,
1096 _vect_peel_extended_info
*min
)
1098 vect_peel_info elem
= *slot
;
1099 int save_misalignment
, dummy
;
1100 unsigned int inside_cost
= 0, outside_cost
= 0, i
;
1101 gimple stmt
= DR_STMT (elem
->dr
);
1102 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
1103 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
1104 vec
<data_reference_p
> datarefs
= LOOP_VINFO_DATAREFS (loop_vinfo
);
1105 struct data_reference
*dr
;
1106 stmt_vector_for_cost prologue_cost_vec
, body_cost_vec
, epilogue_cost_vec
;
1107 int single_iter_cost
;
1109 prologue_cost_vec
.create (2);
1110 body_cost_vec
.create (2);
1111 epilogue_cost_vec
.create (2);
1113 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
1115 stmt
= DR_STMT (dr
);
1116 stmt_info
= vinfo_for_stmt (stmt
);
1117 /* For interleaving, only the alignment of the first access
1119 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1120 && GROUP_FIRST_ELEMENT (stmt_info
) != stmt
)
1123 save_misalignment
= DR_MISALIGNMENT (dr
);
1124 vect_update_misalignment_for_peel (dr
, elem
->dr
, elem
->npeel
);
1125 vect_get_data_access_cost (dr
, &inside_cost
, &outside_cost
,
1127 SET_DR_MISALIGNMENT (dr
, save_misalignment
);
1130 single_iter_cost
= vect_get_single_scalar_iteration_cost (loop_vinfo
);
1131 outside_cost
+= vect_get_known_peeling_cost (loop_vinfo
, elem
->npeel
,
1132 &dummy
, single_iter_cost
,
1134 &epilogue_cost_vec
);
1136 /* Prologue and epilogue costs are added to the target model later.
1137 These costs depend only on the scalar iteration cost, the
1138 number of peeling iterations finally chosen, and the number of
1139 misaligned statements. So discard the information found here. */
1140 prologue_cost_vec
.release ();
1141 epilogue_cost_vec
.release ();
1143 if (inside_cost
< min
->inside_cost
1144 || (inside_cost
== min
->inside_cost
&& outside_cost
< min
->outside_cost
))
1146 min
->inside_cost
= inside_cost
;
1147 min
->outside_cost
= outside_cost
;
1148 min
->body_cost_vec
.release ();
1149 min
->body_cost_vec
= body_cost_vec
;
1150 min
->peel_info
.dr
= elem
->dr
;
1151 min
->peel_info
.npeel
= elem
->npeel
;
1154 body_cost_vec
.release ();
1160 /* Choose best peeling option by traversing peeling hash table and either
1161 choosing an option with the lowest cost (if cost model is enabled) or the
1162 option that aligns as many accesses as possible. */
1164 static struct data_reference
*
1165 vect_peeling_hash_choose_best_peeling (loop_vec_info loop_vinfo
,
1166 unsigned int *npeel
,
1167 stmt_vector_for_cost
*body_cost_vec
)
1169 struct _vect_peel_extended_info res
;
1171 res
.peel_info
.dr
= NULL
;
1172 res
.body_cost_vec
= stmt_vector_for_cost ();
1174 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo
)))
1176 res
.inside_cost
= INT_MAX
;
1177 res
.outside_cost
= INT_MAX
;
1178 LOOP_VINFO_PEELING_HTAB (loop_vinfo
)
1179 .traverse
<_vect_peel_extended_info
*,
1180 vect_peeling_hash_get_lowest_cost
> (&res
);
1184 res
.peel_info
.count
= 0;
1185 LOOP_VINFO_PEELING_HTAB (loop_vinfo
)
1186 .traverse
<_vect_peel_extended_info
*,
1187 vect_peeling_hash_get_most_frequent
> (&res
);
1190 *npeel
= res
.peel_info
.npeel
;
1191 *body_cost_vec
= res
.body_cost_vec
;
1192 return res
.peel_info
.dr
;
1196 /* Function vect_enhance_data_refs_alignment
1198 This pass will use loop versioning and loop peeling in order to enhance
1199 the alignment of data references in the loop.
1201 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1202 original loop is to be vectorized. Any other loops that are created by
1203 the transformations performed in this pass - are not supposed to be
1204 vectorized. This restriction will be relaxed.
1206 This pass will require a cost model to guide it whether to apply peeling
1207 or versioning or a combination of the two. For example, the scheme that
1208 intel uses when given a loop with several memory accesses, is as follows:
1209 choose one memory access ('p') which alignment you want to force by doing
1210 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1211 other accesses are not necessarily aligned, or (2) use loop versioning to
1212 generate one loop in which all accesses are aligned, and another loop in
1213 which only 'p' is necessarily aligned.
1215 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1216 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
1217 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1219 Devising a cost model is the most critical aspect of this work. It will
1220 guide us on which access to peel for, whether to use loop versioning, how
1221 many versions to create, etc. The cost model will probably consist of
1222 generic considerations as well as target specific considerations (on
1223 powerpc for example, misaligned stores are more painful than misaligned
1226 Here are the general steps involved in alignment enhancements:
1228 -- original loop, before alignment analysis:
1229 for (i=0; i<N; i++){
1230 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1231 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1234 -- After vect_compute_data_refs_alignment:
1235 for (i=0; i<N; i++){
1236 x = q[i]; # DR_MISALIGNMENT(q) = 3
1237 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1240 -- Possibility 1: we do loop versioning:
1242 for (i=0; i<N; i++){ # loop 1A
1243 x = q[i]; # DR_MISALIGNMENT(q) = 3
1244 p[i] = y; # DR_MISALIGNMENT(p) = 0
1248 for (i=0; i<N; i++){ # loop 1B
1249 x = q[i]; # DR_MISALIGNMENT(q) = 3
1250 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1254 -- Possibility 2: we do loop peeling:
1255 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1259 for (i = 3; i < N; i++){ # loop 2A
1260 x = q[i]; # DR_MISALIGNMENT(q) = 0
1261 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1264 -- Possibility 3: combination of loop peeling and versioning:
1265 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1270 for (i = 3; i<N; i++){ # loop 3A
1271 x = q[i]; # DR_MISALIGNMENT(q) = 0
1272 p[i] = y; # DR_MISALIGNMENT(p) = 0
1276 for (i = 3; i<N; i++){ # loop 3B
1277 x = q[i]; # DR_MISALIGNMENT(q) = 0
1278 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1282 These loops are later passed to loop_transform to be vectorized. The
1283 vectorizer will use the alignment information to guide the transformation
1284 (whether to generate regular loads/stores, or with special handling for
1288 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo
)
1290 vec
<data_reference_p
> datarefs
= LOOP_VINFO_DATAREFS (loop_vinfo
);
1291 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1292 enum dr_alignment_support supportable_dr_alignment
;
1293 struct data_reference
*dr0
= NULL
, *first_store
= NULL
;
1294 struct data_reference
*dr
;
1296 bool do_peeling
= false;
1297 bool do_versioning
= false;
1300 stmt_vec_info stmt_info
;
1301 unsigned int npeel
= 0;
1302 bool all_misalignments_unknown
= true;
1303 unsigned int vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
1304 unsigned possible_npeel_number
= 1;
1306 unsigned int nelements
, mis
, same_align_drs_max
= 0;
1307 stmt_vector_for_cost body_cost_vec
= stmt_vector_for_cost ();
1309 if (dump_enabled_p ())
1310 dump_printf_loc (MSG_NOTE
, vect_location
,
1311 "=== vect_enhance_data_refs_alignment ===\n");
1313 /* While cost model enhancements are expected in the future, the high level
1314 view of the code at this time is as follows:
1316 A) If there is a misaligned access then see if peeling to align
1317 this access can make all data references satisfy
1318 vect_supportable_dr_alignment. If so, update data structures
1319 as needed and return true.
1321 B) If peeling wasn't possible and there is a data reference with an
1322 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1323 then see if loop versioning checks can be used to make all data
1324 references satisfy vect_supportable_dr_alignment. If so, update
1325 data structures as needed and return true.
1327 C) If neither peeling nor versioning were successful then return false if
1328 any data reference does not satisfy vect_supportable_dr_alignment.
1330 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1332 Note, Possibility 3 above (which is peeling and versioning together) is not
1333 being done at this time. */
1335 /* (1) Peeling to force alignment. */
1337 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1339 + How many accesses will become aligned due to the peeling
1340 - How many accesses will become unaligned due to the peeling,
1341 and the cost of misaligned accesses.
1342 - The cost of peeling (the extra runtime checks, the increase
1345 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
1347 stmt
= DR_STMT (dr
);
1348 stmt_info
= vinfo_for_stmt (stmt
);
1350 if (!STMT_VINFO_RELEVANT_P (stmt_info
))
1353 /* For interleaving, only the alignment of the first access
1355 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1356 && GROUP_FIRST_ELEMENT (stmt_info
) != stmt
)
1359 /* For invariant accesses there is nothing to enhance. */
1360 if (integer_zerop (DR_STEP (dr
)))
1363 /* Strided loads perform only component accesses, alignment is
1364 irrelevant for them. */
1365 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info
))
1368 supportable_dr_alignment
= vect_supportable_dr_alignment (dr
, true);
1369 do_peeling
= vector_alignment_reachable_p (dr
);
1372 if (known_alignment_for_access_p (dr
))
1374 unsigned int npeel_tmp
;
1375 bool negative
= tree_int_cst_compare (DR_STEP (dr
),
1376 size_zero_node
) < 0;
1378 /* Save info about DR in the hash table. */
1379 if (!LOOP_VINFO_PEELING_HTAB (loop_vinfo
).is_created ())
1380 LOOP_VINFO_PEELING_HTAB (loop_vinfo
).create (1);
1382 vectype
= STMT_VINFO_VECTYPE (stmt_info
);
1383 nelements
= TYPE_VECTOR_SUBPARTS (vectype
);
1384 mis
= DR_MISALIGNMENT (dr
) / GET_MODE_SIZE (TYPE_MODE (
1385 TREE_TYPE (DR_REF (dr
))));
1386 npeel_tmp
= (negative
1387 ? (mis
- nelements
) : (nelements
- mis
))
1390 /* For multiple types, it is possible that the bigger type access
1391 will have more than one peeling option. E.g., a loop with two
1392 types: one of size (vector size / 4), and the other one of
1393 size (vector size / 8). Vectorization factor will 8. If both
1394 access are misaligned by 3, the first one needs one scalar
1395 iteration to be aligned, and the second one needs 5. But the
1396 the first one will be aligned also by peeling 5 scalar
1397 iterations, and in that case both accesses will be aligned.
1398 Hence, except for the immediate peeling amount, we also want
1399 to try to add full vector size, while we don't exceed
1400 vectorization factor.
1401 We do this automtically for cost model, since we calculate cost
1402 for every peeling option. */
1403 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo
)))
1404 possible_npeel_number
= vf
/nelements
;
1406 /* Handle the aligned case. We may decide to align some other
1407 access, making DR unaligned. */
1408 if (DR_MISALIGNMENT (dr
) == 0)
1411 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo
)))
1412 possible_npeel_number
++;
1415 for (j
= 0; j
< possible_npeel_number
; j
++)
1417 gcc_assert (npeel_tmp
<= vf
);
1418 vect_peeling_hash_insert (loop_vinfo
, dr
, npeel_tmp
);
1419 npeel_tmp
+= nelements
;
1422 all_misalignments_unknown
= false;
1423 /* Data-ref that was chosen for the case that all the
1424 misalignments are unknown is not relevant anymore, since we
1425 have a data-ref with known alignment. */
1430 /* If we don't know any misalignment values, we prefer
1431 peeling for data-ref that has the maximum number of data-refs
1432 with the same alignment, unless the target prefers to align
1433 stores over load. */
1434 if (all_misalignments_unknown
)
1436 unsigned same_align_drs
1437 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info
).length ();
1439 || same_align_drs_max
< same_align_drs
)
1441 same_align_drs_max
= same_align_drs
;
1444 /* For data-refs with the same number of related
1445 accesses prefer the one where the misalign
1446 computation will be invariant in the outermost loop. */
1447 else if (same_align_drs_max
== same_align_drs
)
1449 struct loop
*ivloop0
, *ivloop
;
1450 ivloop0
= outermost_invariant_loop_for_expr
1451 (loop
, DR_BASE_ADDRESS (dr0
));
1452 ivloop
= outermost_invariant_loop_for_expr
1453 (loop
, DR_BASE_ADDRESS (dr
));
1454 if ((ivloop
&& !ivloop0
)
1455 || (ivloop
&& ivloop0
1456 && flow_loop_nested_p (ivloop
, ivloop0
)))
1460 if (!first_store
&& DR_IS_WRITE (dr
))
1464 /* If there are both known and unknown misaligned accesses in the
1465 loop, we choose peeling amount according to the known
1467 if (!supportable_dr_alignment
)
1470 if (!first_store
&& DR_IS_WRITE (dr
))
1477 if (!aligned_access_p (dr
))
1479 if (dump_enabled_p ())
1480 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1481 "vector alignment may not be reachable\n");
1487 /* Check if we can possibly peel the loop. */
1488 if (!vect_can_advance_ivs_p (loop_vinfo
)
1489 || !slpeel_can_duplicate_loop_p (loop
, single_exit (loop
)))
1492 if (do_peeling
&& all_misalignments_unknown
1493 && vect_supportable_dr_alignment (dr0
, false))
1496 /* Check if the target requires to prefer stores over loads, i.e., if
1497 misaligned stores are more expensive than misaligned loads (taking
1498 drs with same alignment into account). */
1499 if (first_store
&& DR_IS_READ (dr0
))
1501 unsigned int load_inside_cost
= 0, load_outside_cost
= 0;
1502 unsigned int store_inside_cost
= 0, store_outside_cost
= 0;
1503 unsigned int load_inside_penalty
= 0, load_outside_penalty
= 0;
1504 unsigned int store_inside_penalty
= 0, store_outside_penalty
= 0;
1505 stmt_vector_for_cost dummy
;
1508 vect_get_data_access_cost (dr0
, &load_inside_cost
, &load_outside_cost
,
1510 vect_get_data_access_cost (first_store
, &store_inside_cost
,
1511 &store_outside_cost
, &dummy
);
1515 /* Calculate the penalty for leaving FIRST_STORE unaligned (by
1516 aligning the load DR0). */
1517 load_inside_penalty
= store_inside_cost
;
1518 load_outside_penalty
= store_outside_cost
;
1520 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1521 DR_STMT (first_store
))).iterate (i
, &dr
);
1523 if (DR_IS_READ (dr
))
1525 load_inside_penalty
+= load_inside_cost
;
1526 load_outside_penalty
+= load_outside_cost
;
1530 load_inside_penalty
+= store_inside_cost
;
1531 load_outside_penalty
+= store_outside_cost
;
1534 /* Calculate the penalty for leaving DR0 unaligned (by
1535 aligning the FIRST_STORE). */
1536 store_inside_penalty
= load_inside_cost
;
1537 store_outside_penalty
= load_outside_cost
;
1539 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1540 DR_STMT (dr0
))).iterate (i
, &dr
);
1542 if (DR_IS_READ (dr
))
1544 store_inside_penalty
+= load_inside_cost
;
1545 store_outside_penalty
+= load_outside_cost
;
1549 store_inside_penalty
+= store_inside_cost
;
1550 store_outside_penalty
+= store_outside_cost
;
1553 if (load_inside_penalty
> store_inside_penalty
1554 || (load_inside_penalty
== store_inside_penalty
1555 && load_outside_penalty
> store_outside_penalty
))
1559 /* In case there are only loads with different unknown misalignments, use
1560 peeling only if it may help to align other accesses in the loop. */
1562 && !STMT_VINFO_SAME_ALIGN_REFS (
1563 vinfo_for_stmt (DR_STMT (dr0
))).length ()
1564 && vect_supportable_dr_alignment (dr0
, false)
1565 != dr_unaligned_supported
)
1569 if (do_peeling
&& !dr0
)
1571 /* Peeling is possible, but there is no data access that is not supported
1572 unless aligned. So we try to choose the best possible peeling. */
1574 /* We should get here only if there are drs with known misalignment. */
1575 gcc_assert (!all_misalignments_unknown
);
1577 /* Choose the best peeling from the hash table. */
1578 dr0
= vect_peeling_hash_choose_best_peeling (loop_vinfo
, &npeel
,
1586 stmt
= DR_STMT (dr0
);
1587 stmt_info
= vinfo_for_stmt (stmt
);
1588 vectype
= STMT_VINFO_VECTYPE (stmt_info
);
1589 nelements
= TYPE_VECTOR_SUBPARTS (vectype
);
1591 if (known_alignment_for_access_p (dr0
))
1593 bool negative
= tree_int_cst_compare (DR_STEP (dr0
),
1594 size_zero_node
) < 0;
1597 /* Since it's known at compile time, compute the number of
1598 iterations in the peeled loop (the peeling factor) for use in
1599 updating DR_MISALIGNMENT values. The peeling factor is the
1600 vectorization factor minus the misalignment as an element
1602 mis
= DR_MISALIGNMENT (dr0
);
1603 mis
/= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0
))));
1604 npeel
= ((negative
? mis
- nelements
: nelements
- mis
)
1608 /* For interleaved data access every iteration accesses all the
1609 members of the group, therefore we divide the number of iterations
1610 by the group size. */
1611 stmt_info
= vinfo_for_stmt (DR_STMT (dr0
));
1612 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1613 npeel
/= GROUP_SIZE (stmt_info
);
1615 if (dump_enabled_p ())
1616 dump_printf_loc (MSG_NOTE
, vect_location
,
1617 "Try peeling by %d\n", npeel
);
1620 /* Ensure that all data refs can be vectorized after the peel. */
1621 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
1623 int save_misalignment
;
1628 stmt
= DR_STMT (dr
);
1629 stmt_info
= vinfo_for_stmt (stmt
);
1630 /* For interleaving, only the alignment of the first access
1632 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1633 && GROUP_FIRST_ELEMENT (stmt_info
) != stmt
)
1636 /* Strided loads perform only component accesses, alignment is
1637 irrelevant for them. */
1638 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info
))
1641 save_misalignment
= DR_MISALIGNMENT (dr
);
1642 vect_update_misalignment_for_peel (dr
, dr0
, npeel
);
1643 supportable_dr_alignment
= vect_supportable_dr_alignment (dr
, false);
1644 SET_DR_MISALIGNMENT (dr
, save_misalignment
);
1646 if (!supportable_dr_alignment
)
1653 if (do_peeling
&& known_alignment_for_access_p (dr0
) && npeel
== 0)
1655 stat
= vect_verify_datarefs_alignment (loop_vinfo
, NULL
);
1660 body_cost_vec
.release ();
1667 unsigned max_allowed_peel
1668 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT
);
1669 if (max_allowed_peel
!= (unsigned)-1)
1671 unsigned max_peel
= npeel
;
1674 gimple dr_stmt
= DR_STMT (dr0
);
1675 stmt_vec_info vinfo
= vinfo_for_stmt (dr_stmt
);
1676 tree vtype
= STMT_VINFO_VECTYPE (vinfo
);
1677 max_peel
= TYPE_VECTOR_SUBPARTS (vtype
) - 1;
1679 if (max_peel
> max_allowed_peel
)
1682 if (dump_enabled_p ())
1683 dump_printf_loc (MSG_NOTE
, vect_location
,
1684 "Disable peeling, max peels reached: %d\n", max_peel
);
1691 stmt_info_for_cost
*si
;
1692 void *data
= LOOP_VINFO_TARGET_COST_DATA (loop_vinfo
);
1694 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1695 If the misalignment of DR_i is identical to that of dr0 then set
1696 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
1697 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1698 by the peeling factor times the element size of DR_i (MOD the
1699 vectorization factor times the size). Otherwise, the
1700 misalignment of DR_i must be set to unknown. */
1701 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
1703 vect_update_misalignment_for_peel (dr
, dr0
, npeel
);
1705 LOOP_VINFO_UNALIGNED_DR (loop_vinfo
) = dr0
;
1707 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
) = npeel
;
1709 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
)
1710 = DR_MISALIGNMENT (dr0
);
1711 SET_DR_MISALIGNMENT (dr0
, 0);
1712 if (dump_enabled_p ())
1714 dump_printf_loc (MSG_NOTE
, vect_location
,
1715 "Alignment of access forced using peeling.\n");
1716 dump_printf_loc (MSG_NOTE
, vect_location
,
1717 "Peeling for alignment will be applied.\n");
1719 /* We've delayed passing the inside-loop peeling costs to the
1720 target cost model until we were sure peeling would happen.
1722 if (body_cost_vec
.exists ())
1724 FOR_EACH_VEC_ELT (body_cost_vec
, i
, si
)
1726 struct _stmt_vec_info
*stmt_info
1727 = si
->stmt
? vinfo_for_stmt (si
->stmt
) : NULL
;
1728 (void) add_stmt_cost (data
, si
->count
, si
->kind
, stmt_info
,
1729 si
->misalign
, vect_body
);
1731 body_cost_vec
.release ();
1734 stat
= vect_verify_datarefs_alignment (loop_vinfo
, NULL
);
1740 body_cost_vec
.release ();
1742 /* (2) Versioning to force alignment. */
1744 /* Try versioning if:
1745 1) optimize loop for speed
1746 2) there is at least one unsupported misaligned data ref with an unknown
1748 3) all misaligned data refs with a known misalignment are supported, and
1749 4) the number of runtime alignment checks is within reason. */
1752 optimize_loop_nest_for_speed_p (loop
)
1753 && (!loop
->inner
); /* FORNOW */
1757 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
1759 stmt
= DR_STMT (dr
);
1760 stmt_info
= vinfo_for_stmt (stmt
);
1762 /* For interleaving, only the alignment of the first access
1764 if (aligned_access_p (dr
)
1765 || (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1766 && GROUP_FIRST_ELEMENT (stmt_info
) != stmt
))
1769 /* Strided loads perform only component accesses, alignment is
1770 irrelevant for them. */
1771 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info
))
1774 supportable_dr_alignment
= vect_supportable_dr_alignment (dr
, false);
1776 if (!supportable_dr_alignment
)
1782 if (known_alignment_for_access_p (dr
)
1783 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).length ()
1784 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS
))
1786 do_versioning
= false;
1790 stmt
= DR_STMT (dr
);
1791 vectype
= STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt
));
1792 gcc_assert (vectype
);
1794 /* The rightmost bits of an aligned address must be zeros.
1795 Construct the mask needed for this test. For example,
1796 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
1797 mask must be 15 = 0xf. */
1798 mask
= GET_MODE_SIZE (TYPE_MODE (vectype
)) - 1;
1800 /* FORNOW: use the same mask to test all potentially unaligned
1801 references in the loop. The vectorizer currently supports
1802 a single vector size, see the reference to
1803 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
1804 vectorization factor is computed. */
1805 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo
)
1806 || LOOP_VINFO_PTR_MASK (loop_vinfo
) == mask
);
1807 LOOP_VINFO_PTR_MASK (loop_vinfo
) = mask
;
1808 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).safe_push (
1813 /* Versioning requires at least one misaligned data reference. */
1814 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo
))
1815 do_versioning
= false;
1816 else if (!do_versioning
)
1817 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).truncate (0);
1822 vec
<gimple
> may_misalign_stmts
1823 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
);
1826 /* It can now be assumed that the data references in the statements
1827 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
1828 of the loop being vectorized. */
1829 FOR_EACH_VEC_ELT (may_misalign_stmts
, i
, stmt
)
1831 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
1832 dr
= STMT_VINFO_DATA_REF (stmt_info
);
1833 SET_DR_MISALIGNMENT (dr
, 0);
1834 if (dump_enabled_p ())
1835 dump_printf_loc (MSG_NOTE
, vect_location
,
1836 "Alignment of access forced using versioning.\n");
1839 if (dump_enabled_p ())
1840 dump_printf_loc (MSG_NOTE
, vect_location
,
1841 "Versioning for alignment will be applied.\n");
1843 /* Peeling and versioning can't be done together at this time. */
1844 gcc_assert (! (do_peeling
&& do_versioning
));
1846 stat
= vect_verify_datarefs_alignment (loop_vinfo
, NULL
);
1851 /* This point is reached if neither peeling nor versioning is being done. */
1852 gcc_assert (! (do_peeling
|| do_versioning
));
1854 stat
= vect_verify_datarefs_alignment (loop_vinfo
, NULL
);
1859 /* Function vect_find_same_alignment_drs.
1861 Update group and alignment relations according to the chosen
1862 vectorization factor. */
1865 vect_find_same_alignment_drs (struct data_dependence_relation
*ddr
,
1866 loop_vec_info loop_vinfo
)
1869 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1870 int vectorization_factor
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
1871 struct data_reference
*dra
= DDR_A (ddr
);
1872 struct data_reference
*drb
= DDR_B (ddr
);
1873 stmt_vec_info stmtinfo_a
= vinfo_for_stmt (DR_STMT (dra
));
1874 stmt_vec_info stmtinfo_b
= vinfo_for_stmt (DR_STMT (drb
));
1875 int dra_size
= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra
))));
1876 int drb_size
= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb
))));
1877 lambda_vector dist_v
;
1878 unsigned int loop_depth
;
1880 if (DDR_ARE_DEPENDENT (ddr
) == chrec_known
)
1886 if (DDR_ARE_DEPENDENT (ddr
) == chrec_dont_know
)
1889 /* Loop-based vectorization and known data dependence. */
1890 if (DDR_NUM_DIST_VECTS (ddr
) == 0)
1893 /* Data-dependence analysis reports a distance vector of zero
1894 for data-references that overlap only in the first iteration
1895 but have different sign step (see PR45764).
1896 So as a sanity check require equal DR_STEP. */
1897 if (!operand_equal_p (DR_STEP (dra
), DR_STEP (drb
), 0))
1900 loop_depth
= index_in_loop_nest (loop
->num
, DDR_LOOP_NEST (ddr
));
1901 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr
), i
, dist_v
)
1903 int dist
= dist_v
[loop_depth
];
1905 if (dump_enabled_p ())
1906 dump_printf_loc (MSG_NOTE
, vect_location
,
1907 "dependence distance = %d.\n", dist
);
1909 /* Same loop iteration. */
1911 || (dist
% vectorization_factor
== 0 && dra_size
== drb_size
))
1913 /* Two references with distance zero have the same alignment. */
1914 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a
).safe_push (drb
);
1915 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b
).safe_push (dra
);
1916 if (dump_enabled_p ())
1918 dump_printf_loc (MSG_NOTE
, vect_location
,
1919 "accesses have the same alignment.\n");
1920 dump_printf (MSG_NOTE
,
1921 "dependence distance modulo vf == 0 between ");
1922 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dra
));
1923 dump_printf (MSG_NOTE
, " and ");
1924 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (drb
));
1925 dump_printf (MSG_NOTE
, "\n");
1932 /* Function vect_analyze_data_refs_alignment
1934 Analyze the alignment of the data-references in the loop.
1935 Return FALSE if a data reference is found that cannot be vectorized. */
1938 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo
,
1939 bb_vec_info bb_vinfo
)
1941 if (dump_enabled_p ())
1942 dump_printf_loc (MSG_NOTE
, vect_location
,
1943 "=== vect_analyze_data_refs_alignment ===\n");
1945 /* Mark groups of data references with same alignment using
1946 data dependence information. */
1949 vec
<ddr_p
> ddrs
= LOOP_VINFO_DDRS (loop_vinfo
);
1950 struct data_dependence_relation
*ddr
;
1953 FOR_EACH_VEC_ELT (ddrs
, i
, ddr
)
1954 vect_find_same_alignment_drs (ddr
, loop_vinfo
);
1957 if (!vect_compute_data_refs_alignment (loop_vinfo
, bb_vinfo
))
1959 if (dump_enabled_p ())
1960 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1961 "not vectorized: can't calculate alignment "
1970 /* Analyze groups of accesses: check that DR belongs to a group of
1971 accesses of legal size, step, etc. Detect gaps, single element
1972 interleaving, and other special cases. Set grouped access info.
1973 Collect groups of strided stores for further use in SLP analysis. */
1976 vect_analyze_group_access (struct data_reference
*dr
)
1978 tree step
= DR_STEP (dr
);
1979 tree scalar_type
= TREE_TYPE (DR_REF (dr
));
1980 HOST_WIDE_INT type_size
= TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type
));
1981 gimple stmt
= DR_STMT (dr
);
1982 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
1983 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
1984 bb_vec_info bb_vinfo
= STMT_VINFO_BB_VINFO (stmt_info
);
1985 HOST_WIDE_INT dr_step
= TREE_INT_CST_LOW (step
);
1986 HOST_WIDE_INT groupsize
, last_accessed_element
= 1;
1987 bool slp_impossible
= false;
1988 struct loop
*loop
= NULL
;
1991 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1993 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
1994 size of the interleaving group (including gaps). */
1995 groupsize
= absu_hwi (dr_step
) / type_size
;
1997 /* Not consecutive access is possible only if it is a part of interleaving. */
1998 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)))
2000 /* Check if it this DR is a part of interleaving, and is a single
2001 element of the group that is accessed in the loop. */
2003 /* Gaps are supported only for loads. STEP must be a multiple of the type
2004 size. The size of the group must be a power of 2. */
2006 && (dr_step
% type_size
) == 0
2008 && exact_log2 (groupsize
) != -1)
2010 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) = stmt
;
2011 GROUP_SIZE (vinfo_for_stmt (stmt
)) = groupsize
;
2012 if (dump_enabled_p ())
2014 dump_printf_loc (MSG_NOTE
, vect_location
,
2015 "Detected single element interleaving ");
2016 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dr
));
2017 dump_printf (MSG_NOTE
, " step ");
2018 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, step
);
2019 dump_printf (MSG_NOTE
, "\n");
2024 if (dump_enabled_p ())
2025 dump_printf_loc (MSG_NOTE
, vect_location
,
2026 "Data access with gaps requires scalar "
2030 if (dump_enabled_p ())
2031 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2032 "Peeling for outer loop is not"
2037 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
) = true;
2043 if (dump_enabled_p ())
2045 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2046 "not consecutive access ");
2047 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
2048 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
2053 /* Mark the statement as unvectorizable. */
2054 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr
))) = false;
2061 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) == stmt
)
2063 /* First stmt in the interleaving chain. Check the chain. */
2064 gimple next
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt
));
2065 struct data_reference
*data_ref
= dr
;
2066 unsigned int count
= 1;
2067 tree prev_init
= DR_INIT (data_ref
);
2069 HOST_WIDE_INT diff
, gaps
= 0;
2070 unsigned HOST_WIDE_INT count_in_bytes
;
2074 /* Skip same data-refs. In case that two or more stmts share
2075 data-ref (supported only for loads), we vectorize only the first
2076 stmt, and the rest get their vectorized loads from the first
2078 if (!tree_int_cst_compare (DR_INIT (data_ref
),
2079 DR_INIT (STMT_VINFO_DATA_REF (
2080 vinfo_for_stmt (next
)))))
2082 if (DR_IS_WRITE (data_ref
))
2084 if (dump_enabled_p ())
2085 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2086 "Two store stmts share the same dr.\n");
2090 /* For load use the same data-ref load. */
2091 GROUP_SAME_DR_STMT (vinfo_for_stmt (next
)) = prev
;
2094 next
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (next
));
2099 data_ref
= STMT_VINFO_DATA_REF (vinfo_for_stmt (next
));
2101 /* All group members have the same STEP by construction. */
2102 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref
), step
, 0));
2104 /* Check that the distance between two accesses is equal to the type
2105 size. Otherwise, we have gaps. */
2106 diff
= (TREE_INT_CST_LOW (DR_INIT (data_ref
))
2107 - TREE_INT_CST_LOW (prev_init
)) / type_size
;
2110 /* FORNOW: SLP of accesses with gaps is not supported. */
2111 slp_impossible
= true;
2112 if (DR_IS_WRITE (data_ref
))
2114 if (dump_enabled_p ())
2115 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2116 "interleaved store with gaps\n");
2123 last_accessed_element
+= diff
;
2125 /* Store the gap from the previous member of the group. If there is no
2126 gap in the access, GROUP_GAP is always 1. */
2127 GROUP_GAP (vinfo_for_stmt (next
)) = diff
;
2129 prev_init
= DR_INIT (data_ref
);
2130 next
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (next
));
2131 /* Count the number of data-refs in the chain. */
2135 /* COUNT is the number of accesses found, we multiply it by the size of
2136 the type to get COUNT_IN_BYTES. */
2137 count_in_bytes
= type_size
* count
;
2139 /* Check that the size of the interleaving (including gaps) is not
2140 greater than STEP. */
2142 && absu_hwi (dr_step
) < count_in_bytes
+ gaps
* type_size
)
2144 if (dump_enabled_p ())
2146 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2147 "interleaving size is greater than step for ");
2148 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
2150 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
2155 /* Check that the size of the interleaving is equal to STEP for stores,
2156 i.e., that there are no gaps. */
2158 && absu_hwi (dr_step
) != count_in_bytes
)
2160 if (DR_IS_READ (dr
))
2162 slp_impossible
= true;
2163 /* There is a gap after the last load in the group. This gap is a
2164 difference between the groupsize and the number of elements.
2165 When there is no gap, this difference should be 0. */
2166 GROUP_GAP (vinfo_for_stmt (stmt
)) = groupsize
- count
;
2170 if (dump_enabled_p ())
2171 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2172 "interleaved store with gaps\n");
2177 /* Check that STEP is a multiple of type size. */
2179 && (dr_step
% type_size
) != 0)
2181 if (dump_enabled_p ())
2183 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2184 "step is not a multiple of type size: step ");
2185 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, step
);
2186 dump_printf (MSG_MISSED_OPTIMIZATION
, " size ");
2187 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
2188 TYPE_SIZE_UNIT (scalar_type
));
2189 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
2197 GROUP_SIZE (vinfo_for_stmt (stmt
)) = groupsize
;
2198 if (dump_enabled_p ())
2199 dump_printf_loc (MSG_NOTE
, vect_location
,
2200 "Detected interleaving of size %d\n", (int)groupsize
);
2202 /* SLP: create an SLP data structure for every interleaving group of
2203 stores for further analysis in vect_analyse_slp. */
2204 if (DR_IS_WRITE (dr
) && !slp_impossible
)
2207 LOOP_VINFO_GROUPED_STORES (loop_vinfo
).safe_push (stmt
);
2209 BB_VINFO_GROUPED_STORES (bb_vinfo
).safe_push (stmt
);
2212 /* There is a gap in the end of the group. */
2213 if (groupsize
- last_accessed_element
> 0 && loop_vinfo
)
2215 if (dump_enabled_p ())
2216 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2217 "Data access with gaps requires scalar "
2221 if (dump_enabled_p ())
2222 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2223 "Peeling for outer loop is not supported\n");
2227 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
) = true;
/* NOTE(review): this text is a lossy, line-numbered extraction of GCC's
   tree-vect-data-refs.c.  The embedded numbering jumps (e.g. 2247 -> 2250,
   2287 -> 2295) show that source lines -- braces, conditions and returns --
   are missing, so this block does not compile as-is.  Restore it from the
   upstream file before making code changes; comments below only annotate
   what is visible.  */
2235 /* Analyze the access pattern of the data-reference DR.
2236 In case of non-consecutive accesses call vect_analyze_group_access() to
2237 analyze groups of accesses. */
2240 vect_analyze_data_ref_access (struct data_reference
*dr
)
2242 tree step
= DR_STEP (dr
);
2243 tree scalar_type
= TREE_TYPE (DR_REF (dr
));
2244 gimple stmt
= DR_STMT (dr
);
2245 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
2246 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
2247 struct loop
*loop
= NULL
;
2250 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
/* A loop data-ref without a known step cannot be analyzed.  The failure
   return after this dump is among the dropped lines -- verify upstream.  */
2252 if (loop_vinfo
&& !step
)
2254 if (dump_enabled_p ())
2255 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2256 "bad data-ref access in loop\n");
2260 /* Allow invariant loads in not nested loops. */
2261 if (loop_vinfo
&& integer_zerop (step
))
2263 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) = NULL
;
2264 if (nested_in_vect_loop_p (loop
, stmt
))
2266 if (dump_enabled_p ())
2267 dump_printf_loc (MSG_NOTE
, vect_location
,
2268 "zero step in inner loop of nest\n");
/* Invariant loads are vectorizable; invariant stores are not.  */
2271 return DR_IS_READ (dr
);
2274 if (loop
&& nested_in_vect_loop_p (loop
, stmt
))
2276 /* Interleaved accesses are not yet supported within outer-loop
2277 vectorization for references in the inner-loop. */
2278 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) = NULL
;
2280 /* For the rest of the analysis we use the outer-loop step. */
2281 step
= STMT_VINFO_DR_STEP (stmt_info
);
2282 if (integer_zerop (step
))
2284 if (dump_enabled_p ())
2285 dump_printf_loc (MSG_NOTE
, vect_location
,
2286 "zero step in outer loop.\n");
2287 if (DR_IS_READ (dr
))
/* Consecutive-access check: a constant step equal to the scalar type
   size (or its negation) means no interleaving.  Lines 2288-2294 and the
   condition joining 2298/2300 were dropped by the extraction.  */
2295 if (TREE_CODE (step
) == INTEGER_CST
)
2297 HOST_WIDE_INT dr_step
= TREE_INT_CST_LOW (step
);
2298 if (!tree_int_cst_compare (step
, TYPE_SIZE_UNIT (scalar_type
))
2300 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type
), -dr_step
)))
2302 /* Mark that it is not interleaving. */
2303 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) = NULL
;
2308 if (loop
&& nested_in_vect_loop_p (loop
, stmt
))
2310 if (dump_enabled_p ())
2311 dump_printf_loc (MSG_NOTE
, vect_location
,
2312 "grouped access in outer loop.\n");
2316 /* Assume this is a DR handled by non-constant strided load case. */
2317 if (TREE_CODE (step
) != INTEGER_CST
)
2318 return STMT_VINFO_STRIDE_LOAD_P (stmt_info
);
2320 /* Not consecutive access - check if it's a part of interleaving group. */
2321 return vect_analyze_group_access (dr
);
/* NOTE(review): lossy extraction -- the embedded line numbers jump
   (2334 -> 2345, 2348 -> 2351, ...), so the early NULL/equality checks,
   the switch statement and its case labels, and several returns are
   missing from this text.  Compare against upstream tree-vect-data-refs.c
   before editing.  */
2326 /* A helper function used in the comparator function to sort data
2327 references. T1 and T2 are two data references to be compared.
2328 The function returns -1, 0, or 1. */
2331 compare_tree (tree t1
, tree t2
)
2334 enum tree_code code
;
2345 if (TREE_CODE (t1
) != TREE_CODE (t2
))
2346 return TREE_CODE (t1
) < TREE_CODE (t2
) ? -1 : 1;
2348 code
= TREE_CODE (t1
);
2351 /* For const values, we can just use hash values for comparisons. */
2359 hashval_t h1
= iterative_hash_expr (t1
, 0);
2360 hashval_t h2
= iterative_hash_expr (t2
, 0);
2362 return h1
< h2
? -1 : 1;
/* Presumably inside a `case SSA_NAME:` arm (label dropped): order first
   by the underlying variable, then by SSA version -- TODO confirm.  */
2367 cmp
= compare_tree (SSA_NAME_VAR (t1
), SSA_NAME_VAR (t2
));
2371 if (SSA_NAME_VERSION (t1
) != SSA_NAME_VERSION (t2
))
2372 return SSA_NAME_VERSION (t1
) < SSA_NAME_VERSION (t2
) ? -1 : 1;
2376 tclass
= TREE_CODE_CLASS (code
);
2378 /* For var-decl, we could compare their UIDs. */
2379 if (tclass
== tcc_declaration
)
2381 if (DECL_UID (t1
) != DECL_UID (t2
))
2382 return DECL_UID (t1
) < DECL_UID (t2
) ? -1 : 1;
2386 /* For expressions with operands, compare their operands recursively. */
2387 for (i
= TREE_OPERAND_LENGTH (t1
) - 1; i
>= 0; --i
)
2389 cmp
= compare_tree (TREE_OPERAND (t1
, i
), TREE_OPERAND (t2
, i
));
/* NOTE(review): lossy extraction of a qsort() comparator.  The
   "cmp != 0 ? cmp : ..." / early-return lines that normally follow each
   compare_tree call were dropped (numbering jumps 2416 -> 2421,
   2424 -> 2429, etc.); verify against upstream before editing.
   Comparison key order as visible: base address, offset, read-before-
   write, access size, step, DR_INIT, then stmt UID as tie-breaker.  */
2399 /* Compare two data-references DRA and DRB to group them into chunks
2400 suitable for grouping. */
2403 dr_group_sort_cmp (const void *dra_
, const void *drb_
)
2405 data_reference_p dra
= *(data_reference_p
*)const_cast<void *>(dra_
);
2406 data_reference_p drb
= *(data_reference_p
*)const_cast<void *>(drb_
);
2409 /* Stabilize sort. */
2413 /* Ordering of DRs according to base. */
2414 if (!operand_equal_p (DR_BASE_ADDRESS (dra
), DR_BASE_ADDRESS (drb
), 0))
2416 cmp
= compare_tree (DR_BASE_ADDRESS (dra
), DR_BASE_ADDRESS (drb
));
2421 /* And according to DR_OFFSET. */
2422 if (!dr_equal_offsets_p (dra
, drb
))
2424 cmp
= compare_tree (DR_OFFSET (dra
), DR_OFFSET (drb
));
2429 /* Put reads before writes. */
2430 if (DR_IS_READ (dra
) != DR_IS_READ (drb
))
2431 return DR_IS_READ (dra
) ? -1 : 1;
2433 /* Then sort after access size. */
2434 if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra
))),
2435 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb
))), 0))
2437 cmp
= compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra
))),
2438 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb
))));
2443 /* And after step. */
2444 if (!operand_equal_p (DR_STEP (dra
), DR_STEP (drb
), 0))
2446 cmp
= compare_tree (DR_STEP (dra
), DR_STEP (drb
));
2451 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2452 cmp
= tree_int_cst_compare (DR_INIT (dra
), DR_INIT (drb
));
2454 return gimple_uid (DR_STMT (dra
)) < gimple_uid (DR_STMT (drb
)) ? -1 : 1;
/* NOTE(review): lossy extraction.  Numbering gaps (2476 -> 2479,
   2515 -> 2518, 2526 -> 2529, 2589 -> 2594, ...) show that the
   loop_vinfo/bb_vinfo branch structure, the `break`s ending each
   group-compatibility check, and the success/failure returns are missing.
   Restore from upstream tree-vect-data-refs.c before editing.  */
2458 /* Function vect_analyze_data_ref_accesses.
2460 Analyze the access pattern of all the data references in the loop.
2462 FORNOW: the only access pattern that is considered vectorizable is a
2463 simple step 1 (consecutive) access.
2465 FORNOW: handle only arrays and pointer accesses. */
2468 vect_analyze_data_ref_accesses (loop_vec_info loop_vinfo
, bb_vec_info bb_vinfo
)
2471 vec
<data_reference_p
> datarefs
;
2472 struct data_reference
*dr
;
2474 if (dump_enabled_p ())
2475 dump_printf_loc (MSG_NOTE
, vect_location
,
2476 "=== vect_analyze_data_ref_accesses ===\n");
/* Pick the data-ref vector from whichever vec-info (loop or bb) is in
   use; the selecting if/else lines were dropped -- verify upstream.  */
2479 datarefs
= LOOP_VINFO_DATAREFS (loop_vinfo
);
2481 datarefs
= BB_VINFO_DATAREFS (bb_vinfo
);
2483 if (datarefs
.is_empty ())
2486 /* Sort the array of datarefs to make building the interleaving chains
2487 linear. Don't modify the original vector's order, it is needed for
2488 determining what dependencies are reversed. */
2489 vec
<data_reference_p
> datarefs_copy
= datarefs
.copy ();
2490 qsort (datarefs_copy
.address (), datarefs_copy
.length (),
2491 sizeof (data_reference_p
), dr_group_sort_cmp
);
2493 /* Build the interleaving chains. */
2494 for (i
= 0; i
< datarefs_copy
.length () - 1;)
2496 data_reference_p dra
= datarefs_copy
[i
];
2497 stmt_vec_info stmtinfo_a
= vinfo_for_stmt (DR_STMT (dra
));
2498 stmt_vec_info lastinfo
= NULL
;
2499 for (i
= i
+ 1; i
< datarefs_copy
.length (); ++i
)
2501 data_reference_p drb
= datarefs_copy
[i
];
2502 stmt_vec_info stmtinfo_b
= vinfo_for_stmt (DR_STMT (drb
));
2504 /* ??? Imperfect sorting (non-compatible types, non-modulo
2505 accesses, same accesses) can lead to a group to be artificially
2506 split here as we don't just skip over those. If it really
2507 matters we can push those to a worklist and re-iterate
2508 over them. The we can just skip ahead to the next DR here. */
2510 /* Check that the data-refs have same first location (except init)
2511 and they are both either store or load (not load and store). */
2512 if (DR_IS_READ (dra
) != DR_IS_READ (drb
)
2513 || !operand_equal_p (DR_BASE_ADDRESS (dra
),
2514 DR_BASE_ADDRESS (drb
), 0)
2515 || !dr_equal_offsets_p (dra
, drb
))
2518 /* Check that the data-refs have the same constant size and step. */
2519 tree sza
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra
)));
2520 tree szb
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb
)));
2521 if (!tree_fits_uhwi_p (sza
)
2522 || !tree_fits_uhwi_p (szb
)
2523 || !tree_int_cst_equal (sza
, szb
)
2524 || !tree_fits_shwi_p (DR_STEP (dra
))
2525 || !tree_fits_shwi_p (DR_STEP (drb
))
2526 || !tree_int_cst_equal (DR_STEP (dra
), DR_STEP (drb
)))
2529 /* Do not place the same access in the interleaving chain twice. */
2530 if (tree_int_cst_compare (DR_INIT (dra
), DR_INIT (drb
)) == 0)
2533 /* Check the types are compatible.
2534 ??? We don't distinguish this during sorting. */
2535 if (!types_compatible_p (TREE_TYPE (DR_REF (dra
)),
2536 TREE_TYPE (DR_REF (drb
))))
2539 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2540 HOST_WIDE_INT init_a
= TREE_INT_CST_LOW (DR_INIT (dra
));
2541 HOST_WIDE_INT init_b
= TREE_INT_CST_LOW (DR_INIT (drb
));
2542 gcc_assert (init_a
< init_b
);
2544 /* If init_b == init_a + the size of the type * k, we have an
2545 interleaving, and DRA is accessed before DRB. */
2546 HOST_WIDE_INT type_size_a
= tree_to_uhwi (sza
);
2547 if ((init_b
- init_a
) % type_size_a
!= 0)
2550 /* The step (if not zero) is greater than the difference between
2551 data-refs' inits. This splits groups into suitable sizes. */
2552 HOST_WIDE_INT step
= tree_to_shwi (DR_STEP (dra
));
2553 if (step
!= 0 && step
<= (init_b
- init_a
))
2556 if (dump_enabled_p ())
2558 dump_printf_loc (MSG_NOTE
, vect_location
,
2559 "Detected interleaving ");
2560 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dra
));
2561 dump_printf (MSG_NOTE
, " and ");
2562 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (drb
));
2563 dump_printf (MSG_NOTE
, "\n");
2566 /* Link the found element into the group list. */
2567 if (!GROUP_FIRST_ELEMENT (stmtinfo_a
))
2569 GROUP_FIRST_ELEMENT (stmtinfo_a
) = DR_STMT (dra
);
2570 lastinfo
= stmtinfo_a
;
2572 GROUP_FIRST_ELEMENT (stmtinfo_b
) = DR_STMT (dra
);
2573 GROUP_NEXT_ELEMENT (lastinfo
) = DR_STMT (drb
);
2574 lastinfo
= stmtinfo_b
;
/* Second pass: run the per-DR access analysis on every still-vectorizable
   data-ref; on failure either fail outright (loop) or mark the stmt
   non-vectorizable (bb) -- the branch lines are among those dropped.  */
2578 FOR_EACH_VEC_ELT (datarefs_copy
, i
, dr
)
2579 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr
)))
2580 && !vect_analyze_data_ref_access (dr
))
2582 if (dump_enabled_p ())
2583 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2584 "not vectorized: complicated access pattern.\n");
2588 /* Mark the statement as not vectorizable. */
2589 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr
))) = false;
2594 datarefs_copy
.release ();
2599 datarefs_copy
.release ();
/* NOTE(review): lossy extraction -- the return type / `static bool` line
   and the tail of the leading comment (original lines 2609-2611) are
   missing.  Two dr_with_seg_len objects compare equal when their DRs share
   the same base address, offset and segment length.  */
2604 /* Operator == between two dr_with_seg_len objects.
2606 This equality operator is used to make sure two data refs
2607 are the same one so that we will consider to combine the
2608 aliasing checks of those two pairs of data dependent data
2612 operator == (const dr_with_seg_len
& d1
,
2613 const dr_with_seg_len
& d2
)
2615 return operand_equal_p (DR_BASE_ADDRESS (d1
.dr
),
2616 DR_BASE_ADDRESS (d2
.dr
), 0)
2617 && compare_tree (d1
.offset
, d2
.offset
) == 0
2618 && compare_tree (d1
.seg_len
, d2
.seg_len
) == 0;
/* NOTE(review): lossy extraction of a qsort() comparator.  The
   declarations of p12/p21/p22 (original lines 2633-2636), the
   `return comp_res;` after each compare, and the final return are
   missing; the trailing `template <class T> static void` line below is
   the orphaned signature of a swap() helper whose body (original lines
   2662-2668) was dropped entirely.  Verify against upstream.  */
2621 /* Function comp_dr_with_seg_len_pair.
2623 Comparison function for sorting objects of dr_with_seg_len_pair_t
2624 so that we can combine aliasing checks in one scan. */
2627 comp_dr_with_seg_len_pair (const void *p1_
, const void *p2_
)
2629 const dr_with_seg_len_pair_t
* p1
= (const dr_with_seg_len_pair_t
*) p1_
;
2630 const dr_with_seg_len_pair_t
* p2
= (const dr_with_seg_len_pair_t
*) p2_
;
2632 const dr_with_seg_len
&p11
= p1
->first
,
2637 /* For DR pairs (a, b) and (c, d), we only consider to merge the alias checks
2638 if a and c have the same basic address snd step, and b and d have the same
2639 address and step. Therefore, if any a&c or b&d don't have the same address
2640 and step, we don't care the order of those two pairs after sorting. */
2643 if ((comp_res
= compare_tree (DR_BASE_ADDRESS (p11
.dr
),
2644 DR_BASE_ADDRESS (p21
.dr
))) != 0)
2646 if ((comp_res
= compare_tree (DR_BASE_ADDRESS (p12
.dr
),
2647 DR_BASE_ADDRESS (p22
.dr
))) != 0)
2649 if ((comp_res
= compare_tree (DR_STEP (p11
.dr
), DR_STEP (p21
.dr
))) != 0)
2651 if ((comp_res
= compare_tree (DR_STEP (p12
.dr
), DR_STEP (p22
.dr
))) != 0)
2653 if ((comp_res
= compare_tree (p11
.offset
, p21
.offset
)) != 0)
2655 if ((comp_res
= compare_tree (p12
.offset
, p22
.offset
)) != 0)
2661 template <class T
> static void
> static void
2669 /* Function vect_vfa_segment_size.
2671 Create an expression that computes the size of segment
2672 that will be accessed for a data reference. The functions takes into
2673 account that realignment loads may access one more vector.
2676 DR: The data reference.
2677 LENGTH_FACTOR: segment length to consider.
2679 Return an expression whose value is the size of segment which will be
2683 vect_vfa_segment_size (struct data_reference
*dr
, tree length_factor
)
2685 tree segment_length
;
2687 if (integer_zerop (DR_STEP (dr
)))
2688 segment_length
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr
)));
2690 segment_length
= size_binop (MULT_EXPR
,
2691 fold_convert (sizetype
, DR_STEP (dr
)),
2692 fold_convert (sizetype
, length_factor
));
2694 if (vect_supportable_dr_alignment (dr
, false)
2695 == dr_explicit_realign_optimized
)
2697 tree vector_size
= TYPE_SIZE_UNIT
2698 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr
))));
2700 segment_length
= size_binop (PLUS_EXPR
, segment_length
, vector_size
);
2702 return segment_length
;
/* NOTE(review): lossy extraction.  Local declarations (i, ddr,
   length_factor -- original lines 2721-2725), early `return true;`,
   `else` branches, `continue`s and the final returns are missing
   (numbering jumps 2720 -> 2726, 2790 -> 2792, 2894 -> 2899, ...).
   Restore from upstream tree-vect-data-refs.c before editing.  */
2705 /* Function vect_prune_runtime_alias_test_list.
2707 Prune a list of ddrs to be tested at run-time by versioning for alias.
2708 Merge several alias checks into one if possible.
2709 Return FALSE if resulting list of ddrs is longer then allowed by
2710 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
2713 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo
)
2715 vec
<ddr_p
> may_alias_ddrs
=
2716 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo
);
2717 vec
<dr_with_seg_len_pair_t
>& comp_alias_ddrs
=
2718 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo
);
2719 int vect_factor
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
2720 tree scalar_loop_iters
= LOOP_VINFO_NITERS (loop_vinfo
);
2726 if (dump_enabled_p ())
2727 dump_printf_loc (MSG_NOTE
, vect_location
,
2728 "=== vect_prune_runtime_alias_test_list ===\n");
2730 if (may_alias_ddrs
.is_empty ())
2733 /* Basically, for each pair of dependent data refs store_ptr_0
2734 and load_ptr_0, we create an expression:
2736 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2737 || (load_ptr_0 + load_segment_length_0) <= store_ptr_0))
2739 for aliasing checks. However, in some cases we can decrease
2740 the number of checks by combining two checks into one. For
2741 example, suppose we have another pair of data refs store_ptr_0
2742 and load_ptr_1, and if the following condition is satisfied:
2744 load_ptr_0 < load_ptr_1 &&
2745 load_ptr_1 - load_ptr_0 - load_segment_length_0 < store_segment_length_0
2747 (this condition means, in each iteration of vectorized loop,
2748 the accessed memory of store_ptr_0 cannot be between the memory
2749 of load_ptr_0 and load_ptr_1.)
2751 we then can use only the following expression to finish the
2752 alising checks between store_ptr_0 & load_ptr_0 and
2753 store_ptr_0 & load_ptr_1:
2755 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2756 || (load_ptr_1 + load_segment_length_1 <= store_ptr_0))
2758 Note that we only consider that load_ptr_0 and load_ptr_1 have the
2759 same basic address. */
2761 comp_alias_ddrs
.create (may_alias_ddrs
.length ());
2763 /* First, we collect all data ref pairs for aliasing checks. */
2764 FOR_EACH_VEC_ELT (may_alias_ddrs
, i
, ddr
)
2766 struct data_reference
*dr_a
, *dr_b
;
2767 gimple dr_group_first_a
, dr_group_first_b
;
2768 tree segment_length_a
, segment_length_b
;
2769 gimple stmt_a
, stmt_b
;
/* For a grouped access, use the first statement of the group as the
   representative for the alias check.  */
2772 stmt_a
= DR_STMT (DDR_A (ddr
));
2773 dr_group_first_a
= GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a
));
2774 if (dr_group_first_a
)
2776 stmt_a
= dr_group_first_a
;
2777 dr_a
= STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a
));
2781 stmt_b
= DR_STMT (DDR_B (ddr
));
2782 dr_group_first_b
= GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b
));
2783 if (dr_group_first_b
)
2785 stmt_b
= dr_group_first_b
;
2786 dr_b
= STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b
));
/* Unequal steps: be conservative and use the whole scalar iteration
   count as the length factor; otherwise the vectorization factor.  */
2789 if (!operand_equal_p (DR_STEP (dr_a
), DR_STEP (dr_b
), 0))
2790 length_factor
= scalar_loop_iters
;
2792 length_factor
= size_int (vect_factor
);
2793 segment_length_a
= vect_vfa_segment_size (dr_a
, length_factor
);
2794 segment_length_b
= vect_vfa_segment_size (dr_b
, length_factor
);
2796 dr_with_seg_len_pair_t dr_with_seg_len_pair
2797 (dr_with_seg_len (dr_a
, segment_length_a
),
2798 dr_with_seg_len (dr_b
, segment_length_b
));
/* Canonicalize pair order by base address so equal pairs sort together.  */
2800 if (compare_tree (DR_BASE_ADDRESS (dr_a
), DR_BASE_ADDRESS (dr_b
)) > 0)
2801 swap (dr_with_seg_len_pair
.first
, dr_with_seg_len_pair
.second
);
2803 comp_alias_ddrs
.safe_push (dr_with_seg_len_pair
);
2806 /* Second, we sort the collected data ref pairs so that we can scan
2807 them once to combine all possible aliasing checks. */
2808 comp_alias_ddrs
.qsort (comp_dr_with_seg_len_pair
);
2810 /* Third, we scan the sorted dr pairs and check if we can combine
2811 alias checks of two neighbouring dr pairs. */
2812 for (size_t i
= 1; i
< comp_alias_ddrs
.length (); ++i
)
2814 /* Deal with two ddrs (dr_a1, dr_b1) and (dr_a2, dr_b2). */
2815 dr_with_seg_len
*dr_a1
= &comp_alias_ddrs
[i
-1].first
,
2816 *dr_b1
= &comp_alias_ddrs
[i
-1].second
,
2817 *dr_a2
= &comp_alias_ddrs
[i
].first
,
2818 *dr_b2
= &comp_alias_ddrs
[i
].second
;
2820 /* Remove duplicate data ref pairs. */
2821 if (*dr_a1
== *dr_a2
&& *dr_b1
== *dr_b2
)
2823 if (dump_enabled_p ())
2825 dump_printf_loc (MSG_NOTE
, vect_location
,
2826 "found equal ranges ");
2827 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
2828 DR_REF (dr_a1
->dr
));
2829 dump_printf (MSG_NOTE
, ", ");
2830 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
2831 DR_REF (dr_b1
->dr
));
2832 dump_printf (MSG_NOTE
, " and ");
2833 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
2834 DR_REF (dr_a2
->dr
));
2835 dump_printf (MSG_NOTE
, ", ");
2836 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
2837 DR_REF (dr_b2
->dr
));
2838 dump_printf (MSG_NOTE
, "\n");
2841 comp_alias_ddrs
.ordered_remove (i
--);
2845 if (*dr_a1
== *dr_a2
|| *dr_b1
== *dr_b2
)
2847 /* We consider the case that DR_B1 and DR_B2 are same memrefs,
2848 and DR_A1 and DR_A2 are two consecutive memrefs. */
2849 if (*dr_a1
== *dr_a2
)
2851 swap (dr_a1
, dr_b1
);
2852 swap (dr_a2
, dr_b2
);
2855 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a1
->dr
),
2856 DR_BASE_ADDRESS (dr_a2
->dr
),
2858 || !tree_fits_shwi_p (dr_a1
->offset
)
2859 || !tree_fits_shwi_p (dr_a2
->offset
))
2862 HOST_WIDE_INT diff
= (tree_to_shwi (dr_a2
->offset
)
2863 - tree_to_shwi (dr_a1
->offset
));
2866 /* Now we check if the following condition is satisfied:
2868 DIFF - SEGMENT_LENGTH_A < SEGMENT_LENGTH_B
2870 where DIFF = DR_A2->OFFSET - DR_A1->OFFSET. However,
2871 SEGMENT_LENGTH_A or SEGMENT_LENGTH_B may not be constant so we
2872 have to make a best estimation. We can get the minimum value
2873 of SEGMENT_LENGTH_B as a constant, represented by MIN_SEG_LEN_B,
2874 then either of the following two conditions can guarantee the
2877 1: DIFF <= MIN_SEG_LEN_B
2878 2: DIFF - SEGMENT_LENGTH_A < MIN_SEG_LEN_B
2883 min_seg_len_b
= (TREE_CODE (dr_b1
->seg_len
) == INTEGER_CST
) ?
2884 TREE_INT_CST_LOW (dr_b1
->seg_len
) :
2887 if (diff
<= min_seg_len_b
2888 || (TREE_CODE (dr_a1
->seg_len
) == INTEGER_CST
2889 && diff
- (HOST_WIDE_INT
) TREE_INT_CST_LOW (dr_a1
->seg_len
) <
/* Merge: extend dr_a1's segment to cover dr_a2 and drop the pair.  */
2892 dr_a1
->seg_len
= size_binop (PLUS_EXPR
,
2893 dr_a2
->seg_len
, size_int (diff
));
2894 comp_alias_ddrs
.ordered_remove (i
--);
2899 if ((int) comp_alias_ddrs
.length () >
2900 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS
))
2902 if (dump_enabled_p ())
2904 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2905 "disable versioning for alias - max number of "
2906 "generated checks exceeded.\n");
/* NOTE(review): lossy extraction.  The initial `base = DR_REF (dr);`
   (original lines 2930-2931), the switch dispatch on `code` with most
   case labels, `break`s, `return NULL_TREE` punts, and the final
   *basep/*offp/*scalep stores plus `return decl;` are missing
   (numbering jumps 3048 -> 3051, 3092 -> 3103, 3114 -> end).  Restore
   from upstream tree-vect-data-refs.c before editing.  */
2915 /* Check whether a non-affine read in stmt is suitable for gather load
2916 and if so, return a builtin decl for that operation. */
2919 vect_check_gather (gimple stmt
, loop_vec_info loop_vinfo
, tree
*basep
,
2920 tree
*offp
, int *scalep
)
2922 HOST_WIDE_INT scale
= 1, pbitpos
, pbitsize
;
2923 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
2924 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
2925 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
2926 tree offtype
= NULL_TREE
;
2927 tree decl
, base
, off
;
2928 enum machine_mode pmode
;
2929 int punsignedp
, pvolatilep
;
2932 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
2933 see if we can use the def stmt of the address. */
2934 if (is_gimple_call (stmt
)
2935 && gimple_call_internal_p (stmt
)
2936 && (gimple_call_internal_fn (stmt
) == IFN_MASK_LOAD
2937 || gimple_call_internal_fn (stmt
) == IFN_MASK_STORE
)
2938 && TREE_CODE (base
) == MEM_REF
2939 && TREE_CODE (TREE_OPERAND (base
, 0)) == SSA_NAME
2940 && integer_zerop (TREE_OPERAND (base
, 1))
2941 && !expr_invariant_in_loop_p (loop
, TREE_OPERAND (base
, 0)))
2943 gimple def_stmt
= SSA_NAME_DEF_STMT (TREE_OPERAND (base
, 0));
2944 if (is_gimple_assign (def_stmt
)
2945 && gimple_assign_rhs_code (def_stmt
) == ADDR_EXPR
)
2946 base
= TREE_OPERAND (gimple_assign_rhs1 (def_stmt
), 0);
2949 /* The gather builtins need address of the form
2950 loop_invariant + vector * {1, 2, 4, 8}
2952 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
2953 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
2954 of loop invariants/SSA_NAMEs defined in the loop, with casts,
2955 multiplications and additions in it. To get a vector, we need
2956 a single SSA_NAME that will be defined in the loop and will
2957 contain everything that is not loop invariant and that can be
2958 vectorized. The following code attempts to find such a preexistng
2959 SSA_NAME OFF and put the loop invariants into a tree BASE
2960 that can be gimplified before the loop. */
2961 base
= get_inner_reference (base
, &pbitsize
, &pbitpos
, &off
,
2962 &pmode
, &punsignedp
, &pvolatilep
, false);
2963 gcc_assert (base
!= NULL_TREE
&& (pbitpos
% BITS_PER_UNIT
) == 0);
2965 if (TREE_CODE (base
) == MEM_REF
)
2967 if (!integer_zerop (TREE_OPERAND (base
, 1)))
2969 if (off
== NULL_TREE
)
2971 double_int moff
= mem_ref_offset (base
);
2972 off
= double_int_to_tree (sizetype
, moff
);
/* Fold the MEM_REF's constant offset into OFF; the `else` joining the
   two assignments is among the dropped lines.  */
2975 off
= size_binop (PLUS_EXPR
, off
,
2976 fold_convert (sizetype
, TREE_OPERAND (base
, 1)));
2978 base
= TREE_OPERAND (base
, 0);
2981 base
= build_fold_addr_expr (base
);
2983 if (off
== NULL_TREE
)
2984 off
= size_zero_node
;
2986 /* If base is not loop invariant, either off is 0, then we start with just
2987 the constant offset in the loop invariant BASE and continue with base
2988 as OFF, otherwise give up.
2989 We could handle that case by gimplifying the addition of base + off
2990 into some SSA_NAME and use that as off, but for now punt. */
2991 if (!expr_invariant_in_loop_p (loop
, base
))
2993 if (!integer_zerop (off
))
2996 base
= size_int (pbitpos
/ BITS_PER_UNIT
);
2998 /* Otherwise put base + constant offset into the loop invariant BASE
2999 and continue with OFF. */
3002 base
= fold_convert (sizetype
, base
);
3003 base
= size_binop (PLUS_EXPR
, base
, size_int (pbitpos
/ BITS_PER_UNIT
));
3006 /* OFF at this point may be either a SSA_NAME or some tree expression
3007 from get_inner_reference. Try to peel off loop invariants from it
3008 into BASE as long as possible. */
3010 while (offtype
== NULL_TREE
)
3012 enum tree_code code
;
3013 tree op0
, op1
, add
= NULL_TREE
;
3015 if (TREE_CODE (off
) == SSA_NAME
)
3017 gimple def_stmt
= SSA_NAME_DEF_STMT (off
);
3019 if (expr_invariant_in_loop_p (loop
, off
))
3022 if (gimple_code (def_stmt
) != GIMPLE_ASSIGN
)
3025 op0
= gimple_assign_rhs1 (def_stmt
);
3026 code
= gimple_assign_rhs_code (def_stmt
);
3027 op1
= gimple_assign_rhs2 (def_stmt
);
3031 if (get_gimple_rhs_class (TREE_CODE (off
)) == GIMPLE_TERNARY_RHS
)
3033 code
= TREE_CODE (off
);
3034 extract_ops_from_tree (off
, &code
, &op0
, &op1
);
/* Start of a switch on CODE; the `switch` line and most case labels
   (PLUS_EXPR, MINUS_EXPR, MULT_EXPR, conversions) were dropped.  */
3038 case POINTER_PLUS_EXPR
:
3040 if (expr_invariant_in_loop_p (loop
, op0
))
3045 add
= fold_convert (sizetype
, add
);
3047 add
= size_binop (MULT_EXPR
, add
, size_int (scale
));
3048 base
= size_binop (PLUS_EXPR
, base
, add
);
3051 if (expr_invariant_in_loop_p (loop
, op1
))
3059 if (expr_invariant_in_loop_p (loop
, op1
))
3061 add
= fold_convert (sizetype
, op1
);
3062 add
= size_binop (MINUS_EXPR
, size_zero_node
, add
);
3068 if (scale
== 1 && tree_fits_shwi_p (op1
))
3070 scale
= tree_to_shwi (op1
);
3079 if (!POINTER_TYPE_P (TREE_TYPE (op0
))
3080 && !INTEGRAL_TYPE_P (TREE_TYPE (op0
)))
3082 if (TYPE_PRECISION (TREE_TYPE (op0
))
3083 == TYPE_PRECISION (TREE_TYPE (off
)))
3088 if (TYPE_PRECISION (TREE_TYPE (op0
))
3089 < TYPE_PRECISION (TREE_TYPE (off
)))
3092 offtype
= TREE_TYPE (off
);
3103 /* If at the end OFF still isn't a SSA_NAME or isn't
3104 defined in the loop, punt. */
3105 if (TREE_CODE (off
) != SSA_NAME
3106 || expr_invariant_in_loop_p (loop
, off
))
3109 if (offtype
== NULL_TREE
)
3110 offtype
= TREE_TYPE (off
);
3112 decl
= targetm
.vectorize
.builtin_gather (STMT_VINFO_VECTYPE (stmt_info
),
3114 if (decl
== NULL_TREE
)
3126 /* Function vect_analyze_data_refs.
3128 Find all the data references in the loop or basic block.
3130 The general structure of the analysis of data refs in the vectorizer is as
3132 1- vect_analyze_data_refs(loop/bb): call
3133 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3134 in the loop/bb and their dependences.
3135 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3136 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3137 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3142 vect_analyze_data_refs (loop_vec_info loop_vinfo
,
3143 bb_vec_info bb_vinfo
,
3146 struct loop
*loop
= NULL
;
3147 basic_block bb
= NULL
;
3149 vec
<data_reference_p
> datarefs
;
3150 struct data_reference
*dr
;
3153 if (dump_enabled_p ())
3154 dump_printf_loc (MSG_NOTE
, vect_location
,
3155 "=== vect_analyze_data_refs ===\n");
3159 basic_block
*bbs
= LOOP_VINFO_BBS (loop_vinfo
);
3161 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
3162 datarefs
= LOOP_VINFO_DATAREFS (loop_vinfo
);
3163 if (!find_loop_nest (loop
, &LOOP_VINFO_LOOP_NEST (loop_vinfo
)))
3165 if (dump_enabled_p ())
3166 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3167 "not vectorized: loop contains function calls"
3168 " or data references that cannot be analyzed\n");
3172 for (i
= 0; i
< loop
->num_nodes
; i
++)
3174 gimple_stmt_iterator gsi
;
3176 for (gsi
= gsi_start_bb (bbs
[i
]); !gsi_end_p (gsi
); gsi_next (&gsi
))
3178 gimple stmt
= gsi_stmt (gsi
);
3179 if (!find_data_references_in_stmt (loop
, stmt
, &datarefs
))
3181 if (is_gimple_call (stmt
) && loop
->safelen
)
3183 tree fndecl
= gimple_call_fndecl (stmt
), op
;
3184 if (fndecl
!= NULL_TREE
)
3186 struct cgraph_node
*node
= cgraph_get_node (fndecl
);
3187 if (node
!= NULL
&& node
->simd_clones
!= NULL
)
3189 unsigned int j
, n
= gimple_call_num_args (stmt
);
3190 for (j
= 0; j
< n
; j
++)
3192 op
= gimple_call_arg (stmt
, j
);
3194 || (REFERENCE_CLASS_P (op
)
3195 && get_base_address (op
)))
3198 op
= gimple_call_lhs (stmt
);
3199 /* Ignore #pragma omp declare simd functions
3200 if they don't have data references in the
3201 call stmt itself. */
3205 || (REFERENCE_CLASS_P (op
)
3206 && get_base_address (op
)))))
3211 LOOP_VINFO_DATAREFS (loop_vinfo
) = datarefs
;
3212 if (dump_enabled_p ())
3213 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3214 "not vectorized: loop contains function "
3215 "calls or data references that cannot "
3222 LOOP_VINFO_DATAREFS (loop_vinfo
) = datarefs
;
3226 gimple_stmt_iterator gsi
;
3228 bb
= BB_VINFO_BB (bb_vinfo
);
3229 for (gsi
= gsi_start_bb (bb
); !gsi_end_p (gsi
); gsi_next (&gsi
))
3231 gimple stmt
= gsi_stmt (gsi
);
3232 if (!find_data_references_in_stmt (NULL
, stmt
,
3233 &BB_VINFO_DATAREFS (bb_vinfo
)))
3235 /* Mark the rest of the basic-block as unvectorizable. */
3236 for (; !gsi_end_p (gsi
); gsi_next (&gsi
))
3238 stmt
= gsi_stmt (gsi
);
3239 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt
)) = false;
3245 datarefs
= BB_VINFO_DATAREFS (bb_vinfo
);
3248 /* Go through the data-refs, check that the analysis succeeded. Update
3249 pointer from stmt_vec_info struct to DR and vectype. */
3251 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
3254 stmt_vec_info stmt_info
;
3255 tree base
, offset
, init
;
3256 bool gather
= false;
3257 bool simd_lane_access
= false;
3261 if (!dr
|| !DR_REF (dr
))
3263 if (dump_enabled_p ())
3264 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3265 "not vectorized: unhandled data-ref\n");
3269 stmt
= DR_STMT (dr
);
3270 stmt_info
= vinfo_for_stmt (stmt
);
3272 /* Discard clobbers from the dataref vector. We will remove
3273 clobber stmts during vectorization. */
3274 if (gimple_clobber_p (stmt
))
3276 if (i
== datarefs
.length () - 1)
3281 datarefs
[i
] = datarefs
.pop ();
3285 /* Check that analysis of the data-ref succeeded. */
3286 if (!DR_BASE_ADDRESS (dr
) || !DR_OFFSET (dr
) || !DR_INIT (dr
)
3291 && !TREE_THIS_VOLATILE (DR_REF (dr
))
3292 && targetm
.vectorize
.builtin_gather
!= NULL
;
3293 bool maybe_simd_lane_access
3294 = loop_vinfo
&& loop
->simduid
;
3296 /* If target supports vector gather loads, or if this might be
3297 a SIMD lane access, see if they can't be used. */
3299 && (maybe_gather
|| maybe_simd_lane_access
)
3300 && !nested_in_vect_loop_p (loop
, stmt
))
3302 struct data_reference
*newdr
3303 = create_data_ref (NULL
, loop_containing_stmt (stmt
),
3304 DR_REF (dr
), stmt
, true);
3305 gcc_assert (newdr
!= NULL
&& DR_REF (newdr
));
3306 if (DR_BASE_ADDRESS (newdr
)
3307 && DR_OFFSET (newdr
)
3310 && integer_zerop (DR_STEP (newdr
)))
3312 if (maybe_simd_lane_access
)
3314 tree off
= DR_OFFSET (newdr
);
3316 if (TREE_CODE (DR_INIT (newdr
)) == INTEGER_CST
3317 && TREE_CODE (off
) == MULT_EXPR
3318 && tree_fits_uhwi_p (TREE_OPERAND (off
, 1)))
3320 tree step
= TREE_OPERAND (off
, 1);
3321 off
= TREE_OPERAND (off
, 0);
3323 if (CONVERT_EXPR_P (off
)
3324 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off
,
3326 < TYPE_PRECISION (TREE_TYPE (off
)))
3327 off
= TREE_OPERAND (off
, 0);
3328 if (TREE_CODE (off
) == SSA_NAME
)
3330 gimple def
= SSA_NAME_DEF_STMT (off
);
3331 tree reft
= TREE_TYPE (DR_REF (newdr
));
3332 if (is_gimple_call (def
)
3333 && gimple_call_internal_p (def
)
3334 && (gimple_call_internal_fn (def
)
3335 == IFN_GOMP_SIMD_LANE
))
3337 tree arg
= gimple_call_arg (def
, 0);
3338 gcc_assert (TREE_CODE (arg
) == SSA_NAME
);
3339 arg
= SSA_NAME_VAR (arg
);
3340 if (arg
== loop
->simduid
3342 && tree_int_cst_equal
3343 (TYPE_SIZE_UNIT (reft
),
3346 DR_OFFSET (newdr
) = ssize_int (0);
3347 DR_STEP (newdr
) = step
;
3348 DR_ALIGNED_TO (newdr
)
3349 = size_int (BIGGEST_ALIGNMENT
);
3351 simd_lane_access
= true;
3357 if (!simd_lane_access
&& maybe_gather
)
3363 if (!gather
&& !simd_lane_access
)
3364 free_data_ref (newdr
);
3367 if (!gather
&& !simd_lane_access
)
3369 if (dump_enabled_p ())
3371 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3372 "not vectorized: data ref analysis "
3374 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3375 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
3385 if (TREE_CODE (DR_BASE_ADDRESS (dr
)) == INTEGER_CST
)
3387 if (dump_enabled_p ())
3388 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3389 "not vectorized: base addr of dr is a "
3395 if (gather
|| simd_lane_access
)
3400 if (TREE_THIS_VOLATILE (DR_REF (dr
)))
3402 if (dump_enabled_p ())
3404 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3405 "not vectorized: volatile type ");
3406 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3407 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
3416 if (stmt_can_throw_internal (stmt
))
3418 if (dump_enabled_p ())
3420 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3421 "not vectorized: statement can throw an "
3423 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3424 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
3430 if (gather
|| simd_lane_access
)
3435 if (TREE_CODE (DR_REF (dr
)) == COMPONENT_REF
3436 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr
), 1)))
3438 if (dump_enabled_p ())
3440 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3441 "not vectorized: statement is bitfield "
3443 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3444 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
3450 if (gather
|| simd_lane_access
)
3455 base
= unshare_expr (DR_BASE_ADDRESS (dr
));
3456 offset
= unshare_expr (DR_OFFSET (dr
));
3457 init
= unshare_expr (DR_INIT (dr
));
3459 if (is_gimple_call (stmt
)
3460 && (!gimple_call_internal_p (stmt
)
3461 || (gimple_call_internal_fn (stmt
) != IFN_MASK_LOAD
3462 && gimple_call_internal_fn (stmt
) != IFN_MASK_STORE
)))
3464 if (dump_enabled_p ())
3466 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3467 "not vectorized: dr in a call ");
3468 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3469 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
3475 if (gather
|| simd_lane_access
)
3480 /* Update DR field in stmt_vec_info struct. */
3482 /* If the dataref is in an inner-loop of the loop that is considered for
3483 for vectorization, we also want to analyze the access relative to
3484 the outer-loop (DR contains information only relative to the
3485 inner-most enclosing loop). We do that by building a reference to the
3486 first location accessed by the inner-loop, and analyze it relative to
3488 if (loop
&& nested_in_vect_loop_p (loop
, stmt
))
3490 tree outer_step
, outer_base
, outer_init
;
3491 HOST_WIDE_INT pbitsize
, pbitpos
;
3493 enum machine_mode pmode
;
3494 int punsignedp
, pvolatilep
;
3495 affine_iv base_iv
, offset_iv
;
3498 /* Build a reference to the first location accessed by the
3499 inner-loop: *(BASE+INIT). (The first location is actually
3500 BASE+INIT+OFFSET, but we add OFFSET separately later). */
3501 tree inner_base
= build_fold_indirect_ref
3502 (fold_build_pointer_plus (base
, init
));
3504 if (dump_enabled_p ())
3506 dump_printf_loc (MSG_NOTE
, vect_location
,
3507 "analyze in outer-loop: ");
3508 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, inner_base
);
3509 dump_printf (MSG_NOTE
, "\n");
3512 outer_base
= get_inner_reference (inner_base
, &pbitsize
, &pbitpos
,
3513 &poffset
, &pmode
, &punsignedp
, &pvolatilep
, false);
3514 gcc_assert (outer_base
!= NULL_TREE
);
3516 if (pbitpos
% BITS_PER_UNIT
!= 0)
3518 if (dump_enabled_p ())
3519 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3520 "failed: bit offset alignment.\n");
3524 outer_base
= build_fold_addr_expr (outer_base
);
3525 if (!simple_iv (loop
, loop_containing_stmt (stmt
), outer_base
,
3528 if (dump_enabled_p ())
3529 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3530 "failed: evolution of base is not affine.\n");
3537 poffset
= fold_build2 (PLUS_EXPR
, TREE_TYPE (offset
), offset
,
3545 offset_iv
.base
= ssize_int (0);
3546 offset_iv
.step
= ssize_int (0);
3548 else if (!simple_iv (loop
, loop_containing_stmt (stmt
), poffset
,
3551 if (dump_enabled_p ())
3552 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3553 "evolution of offset is not affine.\n");
3557 outer_init
= ssize_int (pbitpos
/ BITS_PER_UNIT
);
3558 split_constant_offset (base_iv
.base
, &base_iv
.base
, &dinit
);
3559 outer_init
= size_binop (PLUS_EXPR
, outer_init
, dinit
);
3560 split_constant_offset (offset_iv
.base
, &offset_iv
.base
, &dinit
);
3561 outer_init
= size_binop (PLUS_EXPR
, outer_init
, dinit
);
3563 outer_step
= size_binop (PLUS_EXPR
,
3564 fold_convert (ssizetype
, base_iv
.step
),
3565 fold_convert (ssizetype
, offset_iv
.step
));
3567 STMT_VINFO_DR_STEP (stmt_info
) = outer_step
;
3568 /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
3569 STMT_VINFO_DR_BASE_ADDRESS (stmt_info
) = base_iv
.base
;
3570 STMT_VINFO_DR_INIT (stmt_info
) = outer_init
;
3571 STMT_VINFO_DR_OFFSET (stmt_info
) =
3572 fold_convert (ssizetype
, offset_iv
.base
);
3573 STMT_VINFO_DR_ALIGNED_TO (stmt_info
) =
3574 size_int (highest_pow2_factor (offset_iv
.base
));
3576 if (dump_enabled_p ())
3578 dump_printf_loc (MSG_NOTE
, vect_location
,
3579 "\touter base_address: ");
3580 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3581 STMT_VINFO_DR_BASE_ADDRESS (stmt_info
));
3582 dump_printf (MSG_NOTE
, "\n\touter offset from base address: ");
3583 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3584 STMT_VINFO_DR_OFFSET (stmt_info
));
3585 dump_printf (MSG_NOTE
,
3586 "\n\touter constant offset from base address: ");
3587 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3588 STMT_VINFO_DR_INIT (stmt_info
));
3589 dump_printf (MSG_NOTE
, "\n\touter step: ");
3590 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3591 STMT_VINFO_DR_STEP (stmt_info
));
3592 dump_printf (MSG_NOTE
, "\n\touter aligned to: ");
3593 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3594 STMT_VINFO_DR_ALIGNED_TO (stmt_info
));
3595 dump_printf (MSG_NOTE
, "\n");
3599 if (STMT_VINFO_DATA_REF (stmt_info
))
3601 if (dump_enabled_p ())
3603 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3604 "not vectorized: more than one data ref "
3606 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3607 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
3613 if (gather
|| simd_lane_access
)
3618 STMT_VINFO_DATA_REF (stmt_info
) = dr
;
3619 if (simd_lane_access
)
3621 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info
) = true;
3625 /* Set vectype for STMT. */
3626 scalar_type
= TREE_TYPE (DR_REF (dr
));
3627 STMT_VINFO_VECTYPE (stmt_info
) =
3628 get_vectype_for_scalar_type (scalar_type
);
3629 if (!STMT_VINFO_VECTYPE (stmt_info
))
3631 if (dump_enabled_p ())
3633 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3634 "not vectorized: no vectype for stmt: ");
3635 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3636 dump_printf (MSG_MISSED_OPTIMIZATION
, " scalar_type: ");
3637 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_DETAILS
,
3639 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
3645 if (gather
|| simd_lane_access
)
3647 STMT_VINFO_DATA_REF (stmt_info
) = NULL
;
3654 if (dump_enabled_p ())
3656 dump_printf_loc (MSG_NOTE
, vect_location
,
3657 "got vectype for stmt: ");
3658 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, stmt
, 0);
3659 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3660 STMT_VINFO_VECTYPE (stmt_info
));
3661 dump_printf (MSG_NOTE
, "\n");
3665 /* Adjust the minimal vectorization factor according to the
3667 vf
= TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info
));
3675 gather
= 0 != vect_check_gather (stmt
, loop_vinfo
, NULL
, &off
, NULL
);
3677 && get_vectype_for_scalar_type (TREE_TYPE (off
)) == NULL_TREE
)
3681 STMT_VINFO_DATA_REF (stmt_info
) = NULL
;
3683 if (dump_enabled_p ())
3685 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3686 "not vectorized: not suitable for gather "
3688 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3689 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
3695 STMT_VINFO_GATHER_P (stmt_info
) = true;
3698 && TREE_CODE (DR_STEP (dr
)) != INTEGER_CST
)
3700 if (nested_in_vect_loop_p (loop
, stmt
)
3701 || !DR_IS_READ (dr
))
3703 if (dump_enabled_p ())
3705 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3706 "not vectorized: not suitable for strided "
3708 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3709 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
3713 STMT_VINFO_STRIDE_LOAD_P (stmt_info
) = true;
3717 /* If we stopped analysis at the first dataref we could not analyze
3718 when trying to vectorize a basic-block mark the rest of the datarefs
3719 as not vectorizable and truncate the vector of datarefs. That
3720 avoids spending useless time in analyzing their dependence. */
3721 if (i
!= datarefs
.length ())
3723 gcc_assert (bb_vinfo
!= NULL
);
3724 for (unsigned j
= i
; j
< datarefs
.length (); ++j
)
3726 data_reference_p dr
= datarefs
[j
];
3727 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr
))) = false;
3730 datarefs
.truncate (i
);
3737 /* Function vect_get_new_vect_var.
3739 Returns a name for a new variable. The current naming scheme appends the
3740 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
3741 the name of vectorizer generated variables, and appends that to NAME if
   NAME is provided.
   NOTE(review): this extraction is lossy -- the embedded original line
   numbers jump (3741 -> 3745, 3758 -> 3767), so the switch bodies that
   assign PREFIX, the enclosing braces, and any free of TMP are not
   visible here.  Do not treat this fragment as compilable.  */
3745 vect_get_new_vect_var (tree type
, enum vect_var_kind var_kind
, const char *name
)
/* Select a name prefix according to the kind of vectorizer temporary
   (the assignments themselves are missing from this extraction).  */
3752 case vect_simple_var
:
3755 case vect_scalar_var
:
3758 case vect_pointer_var
:
/* When a scalar NAME is available, the temporary is named
   "<prefix>_<name>"; concat heap-allocates TMP -- presumably freed
   afterwards, but the free is not visible here (TODO confirm).  */
3767 char* tmp
= concat (prefix
, "_", name
, NULL
);
3768 new_vect_var
= create_tmp_reg (type
, tmp
);
/* Without a NAME, the prefix alone names the temporary.  */
3772 new_vect_var
= create_tmp_reg (type
, prefix
);
3774 return new_vect_var
;
3778 /* Function vect_create_addr_base_for_vector_ref.
3780 Create an expression that computes the address of the first memory location
3781 that will be accessed for a data reference.
3784 STMT: The statement containing the data reference.
3785 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
3786 OFFSET: Optional. If supplied, it is added to the initial address.
3787 LOOP: Specify relative to which loop-nest should the address be computed.
3788 For example, when the dataref is in an inner-loop nested in an
3789 outer-loop that is now being vectorized, LOOP can be either the
3790 outer-loop, or the inner-loop. The first memory location accessed
3791 by the following dataref ('in' points to short):
3798 if LOOP=i_loop: &in (relative to i_loop)
3799 if LOOP=j_loop: &in+i*2B (relative to j_loop)
3802 1. Return an SSA_NAME whose value is the address of the memory location of
3803 the first vector of the data reference.
3804 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
3805 these statement(s) which define the returned SSA_NAME.
3807 FORNOW: We are only handling array accesses with step 1. */
/* NOTE(review): lossy extraction -- several original lines (guards,
   braces, part of the declarations) are missing between the numbered
   fragments below.  */
3810 vect_create_addr_base_for_vector_ref (gimple stmt
,
3811 gimple_seq
*new_stmt_list
,
3815 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
3816 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
3818 const char *base_name
;
3821 gimple_seq seq
= NULL
;
/* STEP is the byte size of one scalar element of the reference.  */
3825 tree step
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr
)));
3826 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
/* If STMT lives in a loop nested inside the loop being vectorized, use
   the outer-loop-relative DR fields cached in STMT_INFO; otherwise fall
   through to the DR's own base/offset/init below.  */
3828 if (loop_vinfo
&& loop
&& loop
!= (gimple_bb (stmt
))->loop_father
)
3830 struct loop
*outer_loop
= LOOP_VINFO_LOOP (loop_vinfo
);
3832 gcc_assert (nested_in_vect_loop_p (outer_loop
, stmt
));
3834 data_ref_base
= unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info
));
3835 base_offset
= unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info
));
3836 init
= unshare_expr (STMT_VINFO_DR_INIT (stmt_info
));
3840 data_ref_base
= unshare_expr (DR_BASE_ADDRESS (dr
));
3841 base_offset
= unshare_expr (DR_OFFSET (dr
));
3842 init
= unshare_expr (DR_INIT (dr
));
3846 base_name
= get_name (data_ref_base
);
/* NOTE(review): presumably the branch for a constant base address --
   the guarding condition is missing from this extraction; confirm
   against the original source.  */
3849 base_offset
= ssize_int (0);
3850 init
= ssize_int (0);
3851 base_name
= get_name (DR_REF (dr
));
3854 /* Create base_offset */
3855 base_offset
= size_binop (PLUS_EXPR
,
3856 fold_convert (sizetype
, base_offset
),
3857 fold_convert (sizetype
, init
));
/* OFFSET is counted in elements; scale it by STEP (element byte size)
   before folding it into BASE_OFFSET.  */
3861 offset
= fold_build2 (MULT_EXPR
, sizetype
,
3862 fold_convert (sizetype
, offset
), step
);
3863 base_offset
= fold_build2 (PLUS_EXPR
, sizetype
,
3864 base_offset
, offset
);
3867 /* base + base_offset */
3869 addr_base
= fold_build_pointer_plus (data_ref_base
, base_offset
);
3872 addr_base
= build1 (ADDR_EXPR
,
3873 build_pointer_type (TREE_TYPE (DR_REF (dr
))),
3874 unshare_expr (DR_REF (dr
)));
/* Convert to a pointer-to-vectype and gimplify into DEST, collecting
   any new statements in SEQ for the caller's NEW_STMT_LIST.  */
3877 vect_ptr_type
= build_pointer_type (STMT_VINFO_VECTYPE (stmt_info
));
3878 addr_base
= fold_convert (vect_ptr_type
, addr_base
);
3879 dest
= vect_get_new_vect_var (vect_ptr_type
, vect_pointer_var
, base_name
);
3880 addr_base
= force_gimple_operand (addr_base
, &seq
, false, dest
);
3881 gimple_seq_add_seq (new_stmt_list
, seq
);
/* Carry over points-to info; alignment is no longer known after the
   offset arithmetic above, so it is explicitly invalidated.  */
3883 if (DR_PTR_INFO (dr
)
3884 && TREE_CODE (addr_base
) == SSA_NAME
)
3886 duplicate_ssa_name_ptr_info (addr_base
, DR_PTR_INFO (dr
));
3888 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base
));
3891 if (dump_enabled_p ())
3893 dump_printf_loc (MSG_NOTE
, vect_location
, "created ");
3894 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, addr_base
);
3895 dump_printf (MSG_NOTE
, "\n");
3902 /* Function vect_create_data_ref_ptr.
3904 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
3905 location accessed in the loop by STMT, along with the def-use update
3906 chain to appropriately advance the pointer through the loop iterations.
3907 Also set aliasing information for the pointer. This pointer is used by
3908 the callers to this function to create a memory reference expression for
3909 vector load/store access.
3912 1. STMT: a stmt that references memory. Expected to be of the form
3913 GIMPLE_ASSIGN <name, data-ref> or
3914 GIMPLE_ASSIGN <data-ref, name>.
3915 2. AGGR_TYPE: the type of the reference, which should be either a vector
3917 3. AT_LOOP: the loop where the vector memref is to be created.
3918 4. OFFSET (optional): an offset to be added to the initial address accessed
3919 by the data-ref in STMT.
3920 5. BSI: location where the new stmts are to be placed if there is no loop
3921 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
3922 pointing to the initial address.
3925 1. Declare a new ptr to vector_type, and have it point to the base of the
3926 data reference (initial address accessed by the data reference).
3927 For example, for vector of type V8HI, the following code is generated:
3930 ap = (v8hi *)initial_address;
3932 if OFFSET is not supplied:
3933 initial_address = &a[init];
3934 if OFFSET is supplied:
3935 initial_address = &a[init + OFFSET];
3937 Return the initial_address in INITIAL_ADDRESS.
3939 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
3940 update the pointer in each iteration of the loop.
3942 Return the increment stmt that updates the pointer in PTR_INCR.
3944 3. Set INV_P to true if the access pattern of the data reference in the
3945 vectorized loop is invariant. Set it to false otherwise.
3947 4. Return the pointer. */
/* NOTE(review): lossy extraction -- guards, braces and some statements
   are missing between the numbered fragments below; comments mark the
   visible structure only.  */
3950 vect_create_data_ref_ptr (gimple stmt
, tree aggr_type
, struct loop
*at_loop
,
3951 tree offset
, tree
*initial_address
,
3952 gimple_stmt_iterator
*gsi
, gimple
*ptr_incr
,
3953 bool only_init
, bool *inv_p
)
3955 const char *base_name
;
3956 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
3957 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
3958 struct loop
*loop
= NULL
;
3959 bool nested_in_vect_loop
= false;
3960 struct loop
*containing_loop
= NULL
;
3965 gimple_seq new_stmt_list
= NULL
;
3969 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
3971 gimple_stmt_iterator incr_gsi
;
3973 tree indx_before_incr
, indx_after_incr
;
3976 bb_vec_info bb_vinfo
= STMT_VINFO_BB_VINFO (stmt_info
);
3978 gcc_assert (TREE_CODE (aggr_type
) == ARRAY_TYPE
3979 || TREE_CODE (aggr_type
) == VECTOR_TYPE
);
/* Loop vectorization: cache the loop, nesting flag, containing loop
   and preheader edge.  The bb_vinfo assert below is the basic-block
   (SLP) alternative arm; its guard is missing from this extraction.  */
3983 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
3984 nested_in_vect_loop
= nested_in_vect_loop_p (loop
, stmt
);
3985 containing_loop
= (gimple_bb (stmt
))->loop_father
;
3986 pe
= loop_preheader_edge (loop
);
3990 gcc_assert (bb_vinfo
);
3995 /* Check the step (evolution) of the load in LOOP, and record
3996 whether it's invariant. */
3997 if (nested_in_vect_loop
)
3998 step
= STMT_VINFO_DR_STEP (stmt_info
);
4000 step
= DR_STEP (STMT_VINFO_DATA_REF (stmt_info
));
4002 if (integer_zerop (step
))
4007 /* Create an expression for the first address accessed by this load
   in LOOP.  */
4009 base_name
= get_name (DR_BASE_ADDRESS (dr
));
4011 if (dump_enabled_p ())
4013 tree dr_base_type
= TREE_TYPE (DR_BASE_OBJECT (dr
));
4014 dump_printf_loc (MSG_NOTE
, vect_location
,
4015 "create %s-pointer variable to type: ",
4016 get_tree_code_name (TREE_CODE (aggr_type
)));
4017 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, aggr_type
);
4018 if (TREE_CODE (dr_base_type
) == ARRAY_TYPE
)
4019 dump_printf (MSG_NOTE
, " vectorizing an array ref: ");
4020 else if (TREE_CODE (dr_base_type
) == VECTOR_TYPE
)
4021 dump_printf (MSG_NOTE
, " vectorizing a vector ref: ");
4022 else if (TREE_CODE (dr_base_type
) == RECORD_TYPE
)
4023 dump_printf (MSG_NOTE
, " vectorizing a record based array ref: ");
4025 dump_printf (MSG_NOTE
, " vectorizing a pointer ref: ");
4026 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_BASE_OBJECT (dr
));
4027 dump_printf (MSG_NOTE
, "\n");
4030 /* (1) Create the new aggregate-pointer variable.
4031 Vector and array types inherit the alias set of their component
4032 type by default so we need to use a ref-all pointer if the data
4033 reference does not conflict with the created aggregated data
4034 reference because it is not addressable. */
4035 bool need_ref_all
= false;
4036 if (!alias_sets_conflict_p (get_alias_set (aggr_type
),
4037 get_alias_set (DR_REF (dr
))))
4038 need_ref_all
= true;
4039 /* Likewise for any of the data references in the stmt group. */
4040 else if (STMT_VINFO_GROUP_SIZE (stmt_info
) > 1)
4042 gimple orig_stmt
= STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info
);
4045 stmt_vec_info sinfo
= vinfo_for_stmt (orig_stmt
);
4046 struct data_reference
*sdr
= STMT_VINFO_DATA_REF (sinfo
);
4047 if (!alias_sets_conflict_p (get_alias_set (aggr_type
),
4048 get_alias_set (DR_REF (sdr
))))
4050 need_ref_all
= true;
4053 orig_stmt
= STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo
);
4057 aggr_ptr_type
= build_pointer_type_for_mode (aggr_type
, ptr_mode
,
4059 aggr_ptr
= vect_get_new_vect_var (aggr_ptr_type
, vect_pointer_var
, base_name
);
4062 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4063 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4064 def-use update cycles for the pointer: one relative to the outer-loop
4065 (LOOP), which is what steps (3) and (4) below do. The other is relative
4066 to the inner-loop (which is the inner-most loop containing the dataref),
4067 and this is done by step (5) below.
4069 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4070 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4071 redundant. Steps (3),(4) create the following:
4074 LOOP: vp1 = phi(vp0,vp2)
4080 If there is an inner-loop nested in loop, then step (5) will also be
4081 applied, and an additional update in the inner-loop will be created:
4084 LOOP: vp1 = phi(vp0,vp2)
4086 inner: vp3 = phi(vp1,vp4)
4087 vp4 = vp3 + inner_step
   (remainder of this comment lost in extraction).  */
4093 /* (2) Calculate the initial address of the aggregate-pointer, and set
4094 the aggregate-pointer to point to it before the loop. */
4096 /* Create: (&(base[init_val+offset]) in the loop preheader. */
4098 new_temp
= vect_create_addr_base_for_vector_ref (stmt
, &new_stmt_list
,
4104 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, new_stmt_list
);
4105 gcc_assert (!new_bb
);
4108 gsi_insert_seq_before (gsi
, new_stmt_list
, GSI_SAME_STMT
);
4111 *initial_address
= new_temp
;
4113 /* Create: p = (aggr_type *) initial_base */
4114 if (TREE_CODE (new_temp
) != SSA_NAME
4115 || !useless_type_conversion_p (aggr_ptr_type
, TREE_TYPE (new_temp
)))
4117 vec_stmt
= gimple_build_assign (aggr_ptr
,
4118 fold_convert (aggr_ptr_type
, new_temp
));
4119 aggr_ptr_init
= make_ssa_name (aggr_ptr
, vec_stmt
);
4120 /* Copy the points-to information if it exists. */
4121 if (DR_PTR_INFO (dr
))
4122 duplicate_ssa_name_ptr_info (aggr_ptr_init
, DR_PTR_INFO (dr
));
4123 gimple_assign_set_lhs (vec_stmt
, aggr_ptr_init
);
4126 new_bb
= gsi_insert_on_edge_immediate (pe
, vec_stmt
);
4127 gcc_assert (!new_bb
);
4130 gsi_insert_before (gsi
, vec_stmt
, GSI_SAME_STMT
);
4133 aggr_ptr_init
= new_temp
;
4135 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4136 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4137 inner-loop nested in LOOP (during outer-loop vectorization). */
4139 /* No update in loop is required. */
4140 if (only_init
&& (!loop_vinfo
|| at_loop
== loop
))
4141 aptr
= aggr_ptr_init
;
4144 /* The step of the aggregate pointer is the type size. */
4145 tree iv_step
= TYPE_SIZE_UNIT (aggr_type
);
4146 /* One exception to the above is when the scalar step of the load in
4147 LOOP is zero. In this case the step here is also zero. */
4149 iv_step
= size_zero_node
;
4150 else if (tree_int_cst_sgn (step
) == -1)
4151 iv_step
= fold_build1 (NEGATE_EXPR
, TREE_TYPE (iv_step
), iv_step
);
4153 standard_iv_increment_position (loop
, &incr_gsi
, &insert_after
);
4155 create_iv (aggr_ptr_init
,
4156 fold_convert (aggr_ptr_type
, iv_step
),
4157 aggr_ptr
, loop
, &incr_gsi
, insert_after
,
4158 &indx_before_incr
, &indx_after_incr
);
4159 incr
= gsi_stmt (incr_gsi
);
4160 set_vinfo_for_stmt (incr
, new_stmt_vec_info (incr
, loop_vinfo
, NULL
));
4162 /* Copy the points-to information if it exists. */
4163 if (DR_PTR_INFO (dr
))
4165 duplicate_ssa_name_ptr_info (indx_before_incr
, DR_PTR_INFO (dr
));
4166 duplicate_ssa_name_ptr_info (indx_after_incr
, DR_PTR_INFO (dr
));
4171 aptr
= indx_before_incr
;
4174 if (!nested_in_vect_loop
|| only_init
)
4178 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4179 nested in LOOP, if exists. */
4181 gcc_assert (nested_in_vect_loop
);
4184 standard_iv_increment_position (containing_loop
, &incr_gsi
,
4186 create_iv (aptr
, fold_convert (aggr_ptr_type
, DR_STEP (dr
)), aggr_ptr
,
4187 containing_loop
, &incr_gsi
, insert_after
, &indx_before_incr
,
4189 incr
= gsi_stmt (incr_gsi
);
4190 set_vinfo_for_stmt (incr
, new_stmt_vec_info (incr
, loop_vinfo
, NULL
));
4192 /* Copy the points-to information if it exists. */
4193 if (DR_PTR_INFO (dr
))
4195 duplicate_ssa_name_ptr_info (indx_before_incr
, DR_PTR_INFO (dr
));
4196 duplicate_ssa_name_ptr_info (indx_after_incr
, DR_PTR_INFO (dr
));
4201 return indx_before_incr
;
4208 /* Function bump_vector_ptr
4210 Increment a pointer (to a vector type) by vector-size. If requested,
4211 i.e. if PTR-INCR is given, then also connect the new increment stmt
4212 to the existing def-use update-chain of the pointer, by modifying
4213 the PTR_INCR as illustrated below:
4215 The pointer def-use update-chain before this function:
4216 DATAREF_PTR = phi (p_0, p_2)
4218 PTR_INCR: p_2 = DATAREF_PTR + step
4220 The pointer def-use update-chain after this function:
4221 DATAREF_PTR = phi (p_0, p_2)
4223 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4225 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4228 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4230 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4231 the loop. The increment amount across iterations is expected
4233 BSI - location where the new update stmt is to be placed.
4234 STMT - the original scalar memory-access stmt that is being vectorized.
4235 BUMP - optional. The offset by which to bump the pointer. If not given,
4236 the offset is assumed to be vector_size.
4238 Output: Return NEW_DATAREF_PTR as illustrated above.
   NOTE(review): lossy extraction -- guards and braces between the
   numbered fragments below are missing.  */
4243 bump_vector_ptr (tree dataref_ptr
, gimple ptr_incr
, gimple_stmt_iterator
*gsi
,
4244 gimple stmt
, tree bump
)
4246 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
4247 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
4248 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
/* Default bump: one vector's worth of bytes.  */
4249 tree update
= TYPE_SIZE_UNIT (vectype
);
4252 use_operand_p use_p
;
4253 tree new_dataref_ptr
;
/* Emit NEW_DATAREF_PTR = DATAREF_PTR + UPDATE at GSI.  */
4258 new_dataref_ptr
= copy_ssa_name (dataref_ptr
, NULL
);
4259 incr_stmt
= gimple_build_assign_with_ops (POINTER_PLUS_EXPR
, new_dataref_ptr
,
4260 dataref_ptr
, update
);
4261 vect_finish_stmt_generation (stmt
, incr_stmt
, gsi
);
4263 /* Copy the points-to information if it exists. */
4264 if (DR_PTR_INFO (dr
))
4266 duplicate_ssa_name_ptr_info (new_dataref_ptr
, DR_PTR_INFO (dr
));
4267 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr
));
/* Early return -- presumably the !PTR_INCR case; the guard is missing
   from this extraction.  */
4271 return new_dataref_ptr
;
4273 /* Update the vector-pointer's cross-iteration increment. */
4274 FOR_EACH_SSA_USE_OPERAND (use_p
, ptr_incr
, iter
, SSA_OP_USE
)
4276 tree use
= USE_FROM_PTR (use_p
);
/* Redirect PTR_INCR's use of the old pointer to the bumped one; any
   other operand must equal UPDATE (asserted below).  */
4278 if (use
== dataref_ptr
)
4279 SET_USE (use_p
, new_dataref_ptr
);
4281 gcc_assert (tree_int_cst_compare (use
, update
) == 0);
4284 return new_dataref_ptr
;
4288 /* Function vect_create_destination_var.
4290 Create a new temporary of type VECTYPE. */
/* NOTE(review): lossy extraction -- declarations of TYPE/NAME/NEW_NAME,
   the NAME guard, the free of NEW_NAME and the final return are not
   visible between the numbered fragments below.  */
4293 vect_create_destination_var (tree scalar_dest
, tree vectype
)
4299 enum vect_var_kind kind
;
/* A NULL VECTYPE means a scalar destination of SCALAR_DEST's type.  */
4301 kind
= vectype
? vect_simple_var
: vect_scalar_var
;
4302 type
= vectype
? vectype
: TREE_TYPE (scalar_dest
);
4304 gcc_assert (TREE_CODE (scalar_dest
) == SSA_NAME
);
4306 name
= get_name (scalar_dest
);
/* Name the temporary "<name>_<version>" when SCALAR_DEST has a name,
   else just "_<version>"; asprintf heap-allocates NEW_NAME.  */
4308 asprintf (&new_name
, "%s_%u", name
, SSA_NAME_VERSION (scalar_dest
));
4310 asprintf (&new_name
, "_%u", SSA_NAME_VERSION (scalar_dest
));
4311 vec_dest
= vect_get_new_vect_var (type
, kind
, new_name
);
4317 /* Function vect_grouped_store_supported.
4319 Returns TRUE if interleave high and interleave low permutations
4320 are supported, and FALSE otherwise. */
/* NOTE(review): lossy extraction -- returns, braces and the even-index
   sel[] assignments (original lines 4344/4350) are missing below.  */
4323 vect_grouped_store_supported (tree vectype
, unsigned HOST_WIDE_INT count
)
4325 enum machine_mode mode
= TYPE_MODE (vectype
);
4327 /* vect_permute_store_chain requires the group size to be a power of two. */
4328 if (exact_log2 (count
) == -1)
4330 if (dump_enabled_p ())
4331 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4332 "the size of the group of accesses"
4333 " is not a power of 2\n");
4337 /* Check that the permutation is supported. */
4338 if (VECTOR_MODE_P (mode
))
4340 unsigned int i
, nelt
= GET_MODE_NUNITS (mode
);
4341 unsigned char *sel
= XALLOCAVEC (unsigned char, nelt
);
/* Build the interleave-high selector {0, nelt, 1, nelt+1, ...}; the
   even-index assignment is missing from this extraction.  */
4342 for (i
= 0; i
< nelt
/ 2; i
++)
4345 sel
[i
* 2 + 1] = i
+ nelt
;
/* Test interleave-high support; the second loop presumably rebuilds
   sel[] as the interleave-low selector before the second query.  */
4347 if (can_vec_perm_p (mode
, false, sel
))
4349 for (i
= 0; i
< nelt
; i
++)
4351 if (can_vec_perm_p (mode
, false, sel
))
4356 if (dump_enabled_p ())
4357 dump_printf (MSG_MISSED_OPTIMIZATION
,
4358 "interleave op not supported by target.\n");
4363 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
   type VECTYPE.  (Closing of this comment and part of the call's
   argument list were lost in extraction.)  */
4367 vect_store_lanes_supported (tree vectype
, unsigned HOST_WIDE_INT count
)
/* Thin wrapper: defer to the generic lanes-optab query.  */
4369 return vect_lanes_optab_supported_p ("vec_store_lanes",
4370 vec_store_lanes_optab
,
4375 /* Function vect_permute_store_chain.
4377 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4378 a power of 2, generate interleave_high/low stmts to reorder the data
4379 correctly for the stores. Return the final references for stores in
4382 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4383 The input is 4 vectors each containing 8 elements. We assign a number to
4384 each element, the input sequence is:
4386 1st vec: 0 1 2 3 4 5 6 7
4387 2nd vec: 8 9 10 11 12 13 14 15
4388 3rd vec: 16 17 18 19 20 21 22 23
4389 4th vec: 24 25 26 27 28 29 30 31
4391 The output sequence should be:
4393 1st vec: 0 8 16 24 1 9 17 25
4394 2nd vec: 2 10 18 26 3 11 19 27
4395 3rd vec: 4 12 20 28 5 13 21 30
4396 4th vec: 6 14 22 30 7 15 23 31
4398 i.e., we interleave the contents of the four vectors in their order.
4400 We use interleave_high/low instructions to create such output. The input of
4401 each interleave_high/low operation is two vectors:
4404 the even elements of the result vector are obtained left-to-right from the
4405 high/low elements of the first vector. The odd elements of the result are
4406 obtained left-to-right from the high/low elements of the second vector.
4407 The output of interleave_high will be: 0 4 1 5
4408 and of interleave_low: 2 6 3 7
4411 The permutation is done in log LENGTH stages. In each stage interleave_high
4412 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4413 where the first argument is taken from the first half of DR_CHAIN and the
4414 second argument from it's second half.
4417 I1: interleave_high (1st vec, 3rd vec)
4418 I2: interleave_low (1st vec, 3rd vec)
4419 I3: interleave_high (2nd vec, 4th vec)
4420 I4: interleave_low (2nd vec, 4th vec)
4422 The output for the first stage is:
4424 I1: 0 16 1 17 2 18 3 19
4425 I2: 4 20 5 21 6 22 7 23
4426 I3: 8 24 9 25 10 26 11 27
4427 I4: 12 28 13 29 14 30 15 31
4429 The output of the second stage, i.e. the final result is:
4431 I1: 0 8 16 24 1 9 17 25
4432 I2: 2 10 18 26 3 11 19 27
4433 I3: 4 12 20 28 5 13 21 30
4434 I4: 6 14 22 30 7 15 23 31. */
/* NOTE(review): lossy extraction -- the STMT parameter, the even-index
   sel[] assignments, the perm_stmt declaration, the end-of-stage copy
   guard and several braces are missing between the fragments below.  */
4437 vect_permute_store_chain (vec
<tree
> dr_chain
,
4438 unsigned int length
,
4440 gimple_stmt_iterator
*gsi
,
4441 vec
<tree
> *result_chain
)
4443 tree vect1
, vect2
, high
, low
;
4445 tree vectype
= STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt
));
4446 tree perm_mask_low
, perm_mask_high
;
4448 unsigned int j
, nelt
= TYPE_VECTOR_SUBPARTS (vectype
);
4449 unsigned char *sel
= XALLOCAVEC (unsigned char, nelt
);
/* RESULT_CHAIN starts as a copy of DR_CHAIN and is rewritten in place
   per stage.  */
4451 result_chain
->quick_grow (length
);
4452 memcpy (result_chain
->address (), dr_chain
.address (),
4453 length
* sizeof (tree
));
/* Build the interleave-high selector; its even-index assignment is
   missing from this extraction.  */
4455 for (i
= 0, n
= nelt
/ 2; i
< n
; i
++)
4458 sel
[i
* 2 + 1] = i
+ nelt
;
4460 perm_mask_high
= vect_gen_perm_mask (vectype
, sel
);
4461 gcc_assert (perm_mask_high
!= NULL
);
/* Rebuild sel[] as the interleave-low selector (body lost in
   extraction), then materialize its mask.  */
4463 for (i
= 0; i
< nelt
; i
++)
4465 perm_mask_low
= vect_gen_perm_mask (vectype
, sel
);
4466 gcc_assert (perm_mask_low
!= NULL
);
/* log2(LENGTH) stages; each stage pairs vector j with vector
   j + LENGTH/2 and emits one high and one low interleave.  */
4468 for (i
= 0, n
= exact_log2 (length
); i
< n
; i
++)
4470 for (j
= 0; j
< length
/2; j
++)
4472 vect1
= dr_chain
[j
];
4473 vect2
= dr_chain
[j
+length
/2];
4475 /* Create interleaving stmt:
4476 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1, ...}> */
4477 high
= make_temp_ssa_name (vectype
, NULL
, "vect_inter_high");
4479 = gimple_build_assign_with_ops (VEC_PERM_EXPR
, high
,
4480 vect1
, vect2
, perm_mask_high
);
4481 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
4482 (*result_chain
)[2*j
] = high
;
4484 /* Create interleaving stmt:
4485 low = VEC_PERM_EXPR <vect1, vect2, {nelt/2, nelt*3/2, nelt/2+1,
4486 nelt*3/2+1, ...}> */
4487 low
= make_temp_ssa_name (vectype
, NULL
, "vect_inter_low");
4489 = gimple_build_assign_with_ops (VEC_PERM_EXPR
, low
,
4490 vect1
, vect2
, perm_mask_low
);
4491 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
4492 (*result_chain
)[2*j
+1] = low
;
/* Feed this stage's output back in as the next stage's input.  */
4494 memcpy (dr_chain
.address (), result_chain
->address (),
4495 length
* sizeof (tree
));
4499 /* Function vect_setup_realignment
4501 This function is called when vectorizing an unaligned load using
4502 the dr_explicit_realign[_optimized] scheme.
4503 This function generates the following code at the loop prolog:
4506 x msq_init = *(floor(p)); # prolog load
4507 realignment_token = call target_builtin;
4509 x msq = phi (msq_init, ---)
4511 The stmts marked with x are generated only for the case of
4512 dr_explicit_realign_optimized.
4514 The code above sets up a new (vector) pointer, pointing to the first
4515 location accessed by STMT, and a "floor-aligned" load using that pointer.
4516 It also generates code to compute the "realignment-token" (if the relevant
4517 target hook was defined), and creates a phi-node at the loop-header bb
4518 whose arguments are the result of the prolog-load (created by this
4519 function) and the result of a load that takes place in the loop (to be
4520 created by the caller to this function).
4522 For the case of dr_explicit_realign_optimized:
4523 The caller to this function uses the phi-result (msq) to create the
4524 realignment code inside the loop, and sets up the missing phi argument,
4527 msq = phi (msq_init, lsq)
4528 lsq = *(floor(p')); # load in loop
4529 result = realign_load (msq, lsq, realignment_token);
4531 For the case of dr_explicit_realign:
4533 msq = *(floor(p)); # load in loop
4535 lsq = *(floor(p')); # load in loop
4536 result = realign_load (msq, lsq, realignment_token);
4539 STMT - (scalar) load stmt to be vectorized. This load accesses
4540 a memory location that may be unaligned.
4541 BSI - place where new code is to be inserted.
4542 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4546 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4547 target hook, if defined.
4548 Return value - the result of the loop-header phi node. */
4551 vect_setup_realignment (gimple stmt
, gimple_stmt_iterator
*gsi
,
4552 tree
*realignment_token
,
4553 enum dr_alignment_support alignment_support_scheme
,
4555 struct loop
**at_loop
)
4557 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
4558 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
4559 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
4560 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
4561 struct loop
*loop
= NULL
;
4563 tree scalar_dest
= gimple_assign_lhs (stmt
);
4570 tree msq_init
= NULL_TREE
;
4573 tree msq
= NULL_TREE
;
4574 gimple_seq stmts
= NULL
;
4576 bool compute_in_loop
= false;
4577 bool nested_in_vect_loop
= false;
4578 struct loop
*containing_loop
= (gimple_bb (stmt
))->loop_father
;
4579 struct loop
*loop_for_initial_load
= NULL
;
4583 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
4584 nested_in_vect_loop
= nested_in_vect_loop_p (loop
, stmt
);
4587 gcc_assert (alignment_support_scheme
== dr_explicit_realign
4588 || alignment_support_scheme
== dr_explicit_realign_optimized
);
4590 /* We need to generate three things:
4591 1. the misalignment computation
4592 2. the extra vector load (for the optimized realignment scheme).
4593 3. the phi node for the two vectors from which the realignment is
4594 done (for the optimized realignment scheme). */
4596 /* 1. Determine where to generate the misalignment computation.
4598 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4599 calculation will be generated by this function, outside the loop (in the
4600 preheader). Otherwise, INIT_ADDR had already been computed for us by the
4601 caller, inside the loop.
4603 Background: If the misalignment remains fixed throughout the iterations of
4604 the loop, then both realignment schemes are applicable, and also the
4605 misalignment computation can be done outside LOOP. This is because we are
4606 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4607 are a multiple of VS (the Vector Size), and therefore the misalignment in
4608 different vectorized LOOP iterations is always the same.
4609 The problem arises only if the memory access is in an inner-loop nested
4610 inside LOOP, which is now being vectorized using outer-loop vectorization.
4611 This is the only case when the misalignment of the memory access may not
4612 remain fixed throughout the iterations of the inner-loop (as explained in
4613 detail in vect_supportable_dr_alignment). In this case, not only is the
4614 optimized realignment scheme not applicable, but also the misalignment
4615 computation (and generation of the realignment token that is passed to
4616 REALIGN_LOAD) have to be done inside the loop.
4618 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4619 or not, which in turn determines if the misalignment is computed inside
4620 the inner-loop, or outside LOOP. */
4622 if (init_addr
!= NULL_TREE
|| !loop_vinfo
)
4624 compute_in_loop
= true;
4625 gcc_assert (alignment_support_scheme
== dr_explicit_realign
);
4629 /* 2. Determine where to generate the extra vector load.
4631 For the optimized realignment scheme, instead of generating two vector
4632 loads in each iteration, we generate a single extra vector load in the
4633 preheader of the loop, and in each iteration reuse the result of the
4634 vector load from the previous iteration. In case the memory access is in
4635 an inner-loop nested inside LOOP, which is now being vectorized using
4636 outer-loop vectorization, we need to determine whether this initial vector
4637 load should be generated at the preheader of the inner-loop, or can be
4638 generated at the preheader of LOOP. If the memory access has no evolution
4639 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
4640 to be generated inside LOOP (in the preheader of the inner-loop). */
4642 if (nested_in_vect_loop
)
4644 tree outerloop_step
= STMT_VINFO_DR_STEP (stmt_info
);
4645 bool invariant_in_outerloop
=
4646 (tree_int_cst_compare (outerloop_step
, size_zero_node
) == 0);
4647 loop_for_initial_load
= (invariant_in_outerloop
? loop
: loop
->inner
);
4650 loop_for_initial_load
= loop
;
4652 *at_loop
= loop_for_initial_load
;
4654 if (loop_for_initial_load
)
4655 pe
= loop_preheader_edge (loop_for_initial_load
);
4657 /* 3. For the case of the optimized realignment, create the first vector
4658 load at the loop preheader. */
4660 if (alignment_support_scheme
== dr_explicit_realign_optimized
)
4662 /* Create msq_init = *(floor(p1)) in the loop preheader */
4664 gcc_assert (!compute_in_loop
);
4665 vec_dest
= vect_create_destination_var (scalar_dest
, vectype
);
4666 ptr
= vect_create_data_ref_ptr (stmt
, vectype
, loop_for_initial_load
,
4667 NULL_TREE
, &init_addr
, NULL
, &inc
,
4669 new_temp
= copy_ssa_name (ptr
, NULL
);
4670 new_stmt
= gimple_build_assign_with_ops
4671 (BIT_AND_EXPR
, new_temp
, ptr
,
4672 build_int_cst (TREE_TYPE (ptr
),
4673 -(HOST_WIDE_INT
)TYPE_ALIGN_UNIT (vectype
)));
4674 new_bb
= gsi_insert_on_edge_immediate (pe
, new_stmt
);
4675 gcc_assert (!new_bb
);
4677 = build2 (MEM_REF
, TREE_TYPE (vec_dest
), new_temp
,
4678 build_int_cst (reference_alias_ptr_type (DR_REF (dr
)), 0));
4679 new_stmt
= gimple_build_assign (vec_dest
, data_ref
);
4680 new_temp
= make_ssa_name (vec_dest
, new_stmt
);
4681 gimple_assign_set_lhs (new_stmt
, new_temp
);
4684 new_bb
= gsi_insert_on_edge_immediate (pe
, new_stmt
);
4685 gcc_assert (!new_bb
);
4688 gsi_insert_before (gsi
, new_stmt
, GSI_SAME_STMT
);
4690 msq_init
= gimple_assign_lhs (new_stmt
);
4693 /* 4. Create realignment token using a target builtin, if available.
4694 It is done either inside the containing loop, or before LOOP (as
4695 determined above). */
4697 if (targetm
.vectorize
.builtin_mask_for_load
)
4701 /* Compute INIT_ADDR - the initial addressed accessed by this memref. */
4704 /* Generate the INIT_ADDR computation outside LOOP. */
4705 init_addr
= vect_create_addr_base_for_vector_ref (stmt
, &stmts
,
4709 pe
= loop_preheader_edge (loop
);
4710 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, stmts
);
4711 gcc_assert (!new_bb
);
4714 gsi_insert_seq_before (gsi
, stmts
, GSI_SAME_STMT
);
4717 builtin_decl
= targetm
.vectorize
.builtin_mask_for_load ();
4718 new_stmt
= gimple_build_call (builtin_decl
, 1, init_addr
);
4720 vect_create_destination_var (scalar_dest
,
4721 gimple_call_return_type (new_stmt
));
4722 new_temp
= make_ssa_name (vec_dest
, new_stmt
);
4723 gimple_call_set_lhs (new_stmt
, new_temp
);
4725 if (compute_in_loop
)
4726 gsi_insert_before (gsi
, new_stmt
, GSI_SAME_STMT
);
4729 /* Generate the misalignment computation outside LOOP. */
4730 pe
= loop_preheader_edge (loop
);
4731 new_bb
= gsi_insert_on_edge_immediate (pe
, new_stmt
);
4732 gcc_assert (!new_bb
);
4735 *realignment_token
= gimple_call_lhs (new_stmt
);
4737 /* The result of the CALL_EXPR to this builtin is determined from
4738 the value of the parameter and no global variables are touched
4739 which makes the builtin a "const" function. Requiring the
4740 builtin to have the "const" attribute makes it unnecessary
4741 to call mark_call_clobbered. */
4742 gcc_assert (TREE_READONLY (builtin_decl
));
4745 if (alignment_support_scheme
== dr_explicit_realign
)
4748 gcc_assert (!compute_in_loop
);
4749 gcc_assert (alignment_support_scheme
== dr_explicit_realign_optimized
);
4752 /* 5. Create msq = phi <msq_init, lsq> in loop */
4754 pe
= loop_preheader_edge (containing_loop
);
4755 vec_dest
= vect_create_destination_var (scalar_dest
, vectype
);
4756 msq
= make_ssa_name (vec_dest
, NULL
);
4757 phi_stmt
= create_phi_node (msq
, containing_loop
->header
);
4758 add_phi_arg (phi_stmt
, msq_init
, pe
, UNKNOWN_LOCATION
);
/* Function vect_grouped_load_supported.

   Returns TRUE if even and odd permutations are supported,
   and FALSE otherwise.  */
4770 vect_grouped_load_supported (tree vectype
, unsigned HOST_WIDE_INT count
)
4772 enum machine_mode mode
= TYPE_MODE (vectype
);
4774 /* vect_permute_load_chain requires the group size to be a power of two. */
4775 if (exact_log2 (count
) == -1)
4777 if (dump_enabled_p ())
4778 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4779 "the size of the group of accesses"
4780 " is not a power of 2\n");
4784 /* Check that the permutation is supported. */
4785 if (VECTOR_MODE_P (mode
))
4787 unsigned int i
, nelt
= GET_MODE_NUNITS (mode
);
4788 unsigned char *sel
= XALLOCAVEC (unsigned char, nelt
);
4790 for (i
= 0; i
< nelt
; i
++)
4792 if (can_vec_perm_p (mode
, false, sel
))
4794 for (i
= 0; i
< nelt
; i
++)
4796 if (can_vec_perm_p (mode
, false, sel
))
4801 if (dump_enabled_p ())
4802 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4803 "extract even/odd not supported by target\n");
/* Return TRUE if vec_load_lanes is available for COUNT vectors of
   type VECTYPE.  */
4811 vect_load_lanes_supported (tree vectype
, unsigned HOST_WIDE_INT count
)
4813 return vect_lanes_optab_supported_p ("vec_load_lanes",
4814 vec_load_lanes_optab
,
4818 /* Function vect_permute_load_chain.
4820 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
4821 a power of 2, generate extract_even/odd stmts to reorder the input data
4822 correctly. Return the final references for loads in RESULT_CHAIN.
4824 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4825 The input is 4 vectors each containing 8 elements. We assign a number to each
4826 element, the input sequence is:
4828 1st vec: 0 1 2 3 4 5 6 7
4829 2nd vec: 8 9 10 11 12 13 14 15
4830 3rd vec: 16 17 18 19 20 21 22 23
4831 4th vec: 24 25 26 27 28 29 30 31
4833 The output sequence should be:
4835 1st vec: 0 4 8 12 16 20 24 28
4836 2nd vec: 1 5 9 13 17 21 25 29
4837 3rd vec: 2 6 10 14 18 22 26 30
4838 4th vec: 3 7 11 15 19 23 27 31
4840 i.e., the first output vector should contain the first elements of each
4841 interleaving group, etc.
4843 We use extract_even/odd instructions to create such output. The input of
4844 each extract_even/odd operation is two vectors
4848 and the output is the vector of extracted even/odd elements. The output of
4849 extract_even will be: 0 2 4 6
4850 and of extract_odd: 1 3 5 7
4853 The permutation is done in log LENGTH stages. In each stage extract_even
4854 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
4855 their order. In our example,
4857 E1: extract_even (1st vec, 2nd vec)
4858 E2: extract_odd (1st vec, 2nd vec)
4859 E3: extract_even (3rd vec, 4th vec)
4860 E4: extract_odd (3rd vec, 4th vec)
4862 The output for the first stage will be:
4864 E1: 0 2 4 6 8 10 12 14
4865 E2: 1 3 5 7 9 11 13 15
4866 E3: 16 18 20 22 24 26 28 30
4867 E4: 17 19 21 23 25 27 29 31
4869 In order to proceed and create the correct sequence for the next stage (or
4870 for the correct output, if the second stage is the last one, as in our
4871 example), we first put the output of extract_even operation and then the
4872 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
4873 The input for the second stage is:
4875 1st vec (E1): 0 2 4 6 8 10 12 14
4876 2nd vec (E3): 16 18 20 22 24 26 28 30
4877 3rd vec (E2): 1 3 5 7 9 11 13 15
4878 4th vec (E4): 17 19 21 23 25 27 29 31
4880 The output of the second stage:
4882 E1: 0 4 8 12 16 20 24 28
4883 E2: 2 6 10 14 18 22 26 30
4884 E3: 1 5 9 13 17 21 25 29
4885 E4: 3 7 11 15 19 23 27 31
4887 And RESULT_CHAIN after reordering:
4889 1st vec (E1): 0 4 8 12 16 20 24 28
4890 2nd vec (E3): 1 5 9 13 17 21 25 29
4891 3rd vec (E2): 2 6 10 14 18 22 26 30
4892 4th vec (E4): 3 7 11 15 19 23 27 31. */
4895 vect_permute_load_chain (vec
<tree
> dr_chain
,
4896 unsigned int length
,
4898 gimple_stmt_iterator
*gsi
,
4899 vec
<tree
> *result_chain
)
4901 tree data_ref
, first_vect
, second_vect
;
4902 tree perm_mask_even
, perm_mask_odd
;
4904 tree vectype
= STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt
));
4905 unsigned int i
, j
, log_length
= exact_log2 (length
);
4906 unsigned nelt
= TYPE_VECTOR_SUBPARTS (vectype
);
4907 unsigned char *sel
= XALLOCAVEC (unsigned char, nelt
);
4909 result_chain
->quick_grow (length
);
4910 memcpy (result_chain
->address (), dr_chain
.address (),
4911 length
* sizeof (tree
));
4913 for (i
= 0; i
< nelt
; ++i
)
4915 perm_mask_even
= vect_gen_perm_mask (vectype
, sel
);
4916 gcc_assert (perm_mask_even
!= NULL
);
4918 for (i
= 0; i
< nelt
; ++i
)
4920 perm_mask_odd
= vect_gen_perm_mask (vectype
, sel
);
4921 gcc_assert (perm_mask_odd
!= NULL
);
4923 for (i
= 0; i
< log_length
; i
++)
4925 for (j
= 0; j
< length
; j
+= 2)
4927 first_vect
= dr_chain
[j
];
4928 second_vect
= dr_chain
[j
+1];
4930 /* data_ref = permute_even (first_data_ref, second_data_ref); */
4931 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_perm_even");
4932 perm_stmt
= gimple_build_assign_with_ops (VEC_PERM_EXPR
, data_ref
,
4933 first_vect
, second_vect
,
4935 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
4936 (*result_chain
)[j
/2] = data_ref
;
4938 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
4939 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_perm_odd");
4940 perm_stmt
= gimple_build_assign_with_ops (VEC_PERM_EXPR
, data_ref
,
4941 first_vect
, second_vect
,
4943 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
4944 (*result_chain
)[j
/2+length
/2] = data_ref
;
4946 memcpy (dr_chain
.address (), result_chain
->address (),
4947 length
* sizeof (tree
));
/* Function vect_transform_grouped_load.

   Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
   to perform their permutation and ascribe the result vectorized statements to
   the scalar statements.
*/
4960 vect_transform_grouped_load (gimple stmt
, vec
<tree
> dr_chain
, int size
,
4961 gimple_stmt_iterator
*gsi
)
4963 vec
<tree
> result_chain
= vNULL
;
4965 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
4966 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
4967 vectors, that are ready for vector computation. */
4968 result_chain
.create (size
);
4969 vect_permute_load_chain (dr_chain
, size
, stmt
, gsi
, &result_chain
);
4970 vect_record_grouped_load_vectors (stmt
, result_chain
);
4971 result_chain
.release ();
/* RESULT_CHAIN contains the output of a group of grouped loads that were
   generated as part of the vectorization of STMT.  Assign the statement
   for each vector to the associated scalar statement.  */
4979 vect_record_grouped_load_vectors (gimple stmt
, vec
<tree
> result_chain
)
4981 gimple first_stmt
= GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
));
4982 gimple next_stmt
, new_stmt
;
4983 unsigned int i
, gap_count
;
4986 /* Put a permuted data-ref in the VECTORIZED_STMT field.
4987 Since we scan the chain starting from it's first node, their order
4988 corresponds the order of data-refs in RESULT_CHAIN. */
4989 next_stmt
= first_stmt
;
4991 FOR_EACH_VEC_ELT (result_chain
, i
, tmp_data_ref
)
4996 /* Skip the gaps. Loads created for the gaps will be removed by dead
4997 code elimination pass later. No need to check for the first stmt in
4998 the group, since it always exists.
4999 GROUP_GAP is the number of steps in elements from the previous
5000 access (if there is no gap GROUP_GAP is 1). We skip loads that
5001 correspond to the gaps. */
5002 if (next_stmt
!= first_stmt
5003 && gap_count
< GROUP_GAP (vinfo_for_stmt (next_stmt
)))
5011 new_stmt
= SSA_NAME_DEF_STMT (tmp_data_ref
);
5012 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5013 copies, and we put the new vector statement in the first available
5015 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt
)))
5016 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt
)) = new_stmt
;
5019 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt
)))
5022 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt
));
5024 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt
));
5027 prev_stmt
= rel_stmt
;
5029 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt
));
5032 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt
)) =
5037 next_stmt
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt
));
5039 /* If NEXT_STMT accesses the same DR as the previous statement,
5040 put the same TMP_DATA_REF as its vectorized statement; otherwise
5041 get the next data-ref from RESULT_CHAIN. */
5042 if (!next_stmt
|| !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt
)))
/* Function vect_force_dr_alignment_p.

   Returns whether the alignment of a DECL can be forced to be aligned
   on ALIGNMENT bit boundary.  */
5054 vect_can_force_dr_alignment_p (const_tree decl
, unsigned int alignment
)
5056 if (TREE_CODE (decl
) != VAR_DECL
)
5059 /* We cannot change alignment of common or external symbols as another
5060 translation unit may contain a definition with lower alignment.
5061 The rules of common symbol linking mean that the definition
5062 will override the common symbol. The same is true for constant
5063 pool entries which may be shared and are not properly merged
5065 if (DECL_EXTERNAL (decl
)
5066 || DECL_COMMON (decl
)
5067 || DECL_IN_CONSTANT_POOL (decl
))
5070 if (TREE_ASM_WRITTEN (decl
))
5073 /* Do not override the alignment as specified by the ABI when the used
5074 attribute is set. */
5075 if (DECL_PRESERVE_P (decl
))
5078 /* Do not override explicit alignment set by the user when an explicit
5079 section name is also used. This is a common idiom used by many
5080 software projects. */
5081 if (DECL_SECTION_NAME (decl
) != NULL_TREE
5082 && !DECL_HAS_IMPLICIT_SECTION_NAME_P (decl
))
5085 if (TREE_STATIC (decl
))
5086 return (alignment
<= MAX_OFILE_ALIGNMENT
);
5088 return (alignment
<= MAX_STACK_ALIGNMENT
);
/* Return whether the data reference DR is supported with respect to its
   alignment.
   If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
   it is aligned, i.e., check if it is possible to vectorize it with different
   alignment.  */
5098 enum dr_alignment_support
5099 vect_supportable_dr_alignment (struct data_reference
*dr
,
5100 bool check_aligned_accesses
)
5102 gimple stmt
= DR_STMT (dr
);
5103 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
5104 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
5105 enum machine_mode mode
= TYPE_MODE (vectype
);
5106 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
5107 struct loop
*vect_loop
= NULL
;
5108 bool nested_in_vect_loop
= false;
5110 if (aligned_access_p (dr
) && !check_aligned_accesses
)
5113 /* For now assume all conditional loads/stores support unaligned
5114 access without any special code. */
5115 if (is_gimple_call (stmt
)
5116 && gimple_call_internal_p (stmt
)
5117 && (gimple_call_internal_fn (stmt
) == IFN_MASK_LOAD
5118 || gimple_call_internal_fn (stmt
) == IFN_MASK_STORE
))
5119 return dr_unaligned_supported
;
5123 vect_loop
= LOOP_VINFO_LOOP (loop_vinfo
);
5124 nested_in_vect_loop
= nested_in_vect_loop_p (vect_loop
, stmt
);
5127 /* Possibly unaligned access. */
5129 /* We can choose between using the implicit realignment scheme (generating
5130 a misaligned_move stmt) and the explicit realignment scheme (generating
5131 aligned loads with a REALIGN_LOAD). There are two variants to the
5132 explicit realignment scheme: optimized, and unoptimized.
5133 We can optimize the realignment only if the step between consecutive
5134 vector loads is equal to the vector size. Since the vector memory
5135 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
5136 is guaranteed that the misalignment amount remains the same throughout the
5137 execution of the vectorized loop. Therefore, we can create the
5138 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
5139 at the loop preheader.
5141 However, in the case of outer-loop vectorization, when vectorizing a
5142 memory access in the inner-loop nested within the LOOP that is now being
5143 vectorized, while it is guaranteed that the misalignment of the
5144 vectorized memory access will remain the same in different outer-loop
5145 iterations, it is *not* guaranteed that is will remain the same throughout
5146 the execution of the inner-loop. This is because the inner-loop advances
5147 with the original scalar step (and not in steps of VS). If the inner-loop
5148 step happens to be a multiple of VS, then the misalignment remains fixed
5149 and we can use the optimized realignment scheme. For example:
5155 When vectorizing the i-loop in the above example, the step between
5156 consecutive vector loads is 1, and so the misalignment does not remain
5157 fixed across the execution of the inner-loop, and the realignment cannot
5158 be optimized (as illustrated in the following pseudo vectorized loop):
5160 for (i=0; i<N; i+=4)
5161 for (j=0; j<M; j++){
5162 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
5163 // when j is {0,1,2,3,4,5,6,7,...} respectively.
5164 // (assuming that we start from an aligned address).
5167 We therefore have to use the unoptimized realignment scheme:
5169 for (i=0; i<N; i+=4)
5170 for (j=k; j<M; j+=4)
5171 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
5172 // that the misalignment of the initial address is
5175 The loop can then be vectorized as follows:
5177 for (k=0; k<4; k++){
5178 rt = get_realignment_token (&vp[k]);
5179 for (i=0; i<N; i+=4){
5181 for (j=k; j<M; j+=4){
5183 va = REALIGN_LOAD <v1,v2,rt>;
5190 if (DR_IS_READ (dr
))
5192 bool is_packed
= false;
5193 tree type
= (TREE_TYPE (DR_REF (dr
)));
5195 if (optab_handler (vec_realign_load_optab
, mode
) != CODE_FOR_nothing
5196 && (!targetm
.vectorize
.builtin_mask_for_load
5197 || targetm
.vectorize
.builtin_mask_for_load ()))
5199 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
5200 if ((nested_in_vect_loop
5201 && (TREE_INT_CST_LOW (DR_STEP (dr
))
5202 != GET_MODE_SIZE (TYPE_MODE (vectype
))))
5204 return dr_explicit_realign
;
5206 return dr_explicit_realign_optimized
;
5208 if (!known_alignment_for_access_p (dr
))
5209 is_packed
= not_size_aligned (DR_REF (dr
));
5211 if ((TYPE_USER_ALIGN (type
) && !is_packed
)
5212 || targetm
.vectorize
.
5213 support_vector_misalignment (mode
, type
,
5214 DR_MISALIGNMENT (dr
), is_packed
))
5215 /* Can't software pipeline the loads, but can at least do them. */
5216 return dr_unaligned_supported
;
5220 bool is_packed
= false;
5221 tree type
= (TREE_TYPE (DR_REF (dr
)));
5223 if (!known_alignment_for_access_p (dr
))
5224 is_packed
= not_size_aligned (DR_REF (dr
));
5226 if ((TYPE_USER_ALIGN (type
) && !is_packed
)
5227 || targetm
.vectorize
.
5228 support_vector_misalignment (mode
, type
,
5229 DR_MISALIGNMENT (dr
), is_packed
))
5230 return dr_unaligned_supported
;
5234 return dr_unaligned_unsupported
;