/* Data References Analysis and Manipulation Utilities for Vectorization.
   Copyright (C) 2003-2013 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
24 #include "coretypes.h"
31 #include "basic-block.h"
32 #include "gimple-pretty-print.h"
33 #include "tree-flow.h"
36 #include "tree-chrec.h"
37 #include "tree-scalar-evolution.h"
38 #include "tree-vectorizer.h"
39 #include "diagnostic-core.h"
40 /* Need to include rtl.h, expr.h, etc. for optabs. */
44 /* Return true if load- or store-lanes optab OPTAB is implemented for
45 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
48 vect_lanes_optab_supported_p (const char *name
, convert_optab optab
,
49 tree vectype
, unsigned HOST_WIDE_INT count
)
51 enum machine_mode mode
, array_mode
;
54 mode
= TYPE_MODE (vectype
);
55 limit_p
= !targetm
.array_mode_supported_p (mode
, count
);
56 array_mode
= mode_for_size (count
* GET_MODE_BITSIZE (mode
),
59 if (array_mode
== BLKmode
)
61 if (dump_enabled_p ())
62 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
63 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC
"]",
64 GET_MODE_NAME (mode
), count
);
68 if (convert_optab_handler (optab
, array_mode
, mode
) == CODE_FOR_nothing
)
70 if (dump_enabled_p ())
71 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
72 "cannot use %s<%s><%s>", name
,
73 GET_MODE_NAME (array_mode
), GET_MODE_NAME (mode
));
77 if (dump_enabled_p ())
78 dump_printf_loc (MSG_NOTE
, vect_location
,
79 "can use %s<%s><%s>", name
, GET_MODE_NAME (array_mode
),
80 GET_MODE_NAME (mode
));
86 /* Return the smallest scalar part of STMT.
87 This is used to determine the vectype of the stmt. We generally set the
88 vectype according to the type of the result (lhs). For stmts whose
89 result-type is different than the type of the arguments (e.g., demotion,
90 promotion), vectype will be reset appropriately (later). Note that we have
91 to visit the smallest datatype in this function, because that determines the
92 VF. If the smallest datatype in the loop is present only as the rhs of a
93 promotion operation - we'd miss it.
94 Such a case, where a variable of this datatype does not appear in the lhs
95 anywhere in the loop, can only occur if it's an invariant: e.g.:
96 'int_x = (int) short_inv', which we'd expect to have been optimized away by
97 invariant motion. However, we cannot rely on invariant motion to always
98 take invariants out of the loop, and so in the case of promotion we also
99 have to check the rhs.
100 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
104 vect_get_smallest_scalar_type (gimple stmt
, HOST_WIDE_INT
*lhs_size_unit
,
105 HOST_WIDE_INT
*rhs_size_unit
)
107 tree scalar_type
= gimple_expr_type (stmt
);
108 HOST_WIDE_INT lhs
, rhs
;
110 lhs
= rhs
= TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type
));
112 if (is_gimple_assign (stmt
)
113 && (gimple_assign_cast_p (stmt
)
114 || gimple_assign_rhs_code (stmt
) == WIDEN_MULT_EXPR
115 || gimple_assign_rhs_code (stmt
) == WIDEN_LSHIFT_EXPR
116 || gimple_assign_rhs_code (stmt
) == FLOAT_EXPR
))
118 tree rhs_type
= TREE_TYPE (gimple_assign_rhs1 (stmt
));
120 rhs
= TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type
));
122 scalar_type
= rhs_type
;
125 *lhs_size_unit
= lhs
;
126 *rhs_size_unit
= rhs
;
131 /* Check if data references pointed by DR_I and DR_J are same or
132 belong to same interleaving group. Return FALSE if drs are
133 different, otherwise return TRUE. */
136 vect_same_range_drs (data_reference_p dr_i
, data_reference_p dr_j
)
138 gimple stmt_i
= DR_STMT (dr_i
);
139 gimple stmt_j
= DR_STMT (dr_j
);
141 if (operand_equal_p (DR_REF (dr_i
), DR_REF (dr_j
), 0)
142 || (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_i
))
143 && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_j
))
144 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_i
))
145 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_j
)))))
151 /* If address ranges represented by DDR_I and DDR_J are equal,
152 return TRUE, otherwise return FALSE. */
155 vect_vfa_range_equal (ddr_p ddr_i
, ddr_p ddr_j
)
157 if ((vect_same_range_drs (DDR_A (ddr_i
), DDR_A (ddr_j
))
158 && vect_same_range_drs (DDR_B (ddr_i
), DDR_B (ddr_j
)))
159 || (vect_same_range_drs (DDR_A (ddr_i
), DDR_B (ddr_j
))
160 && vect_same_range_drs (DDR_B (ddr_i
), DDR_A (ddr_j
))))
166 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
167 tested at run-time. Return TRUE if DDR was successfully inserted.
168 Return false if versioning is not supported. */
171 vect_mark_for_runtime_alias_test (ddr_p ddr
, loop_vec_info loop_vinfo
)
173 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
175 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS
) == 0)
178 if (dump_enabled_p ())
180 dump_printf_loc (MSG_NOTE
, vect_location
,
181 "mark for run-time aliasing test between ");
182 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (DDR_A (ddr
)));
183 dump_printf (MSG_NOTE
, " and ");
184 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (DDR_B (ddr
)));
187 if (optimize_loop_nest_for_size_p (loop
))
189 if (dump_enabled_p ())
190 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
191 "versioning not supported when optimizing for size.");
195 /* FORNOW: We don't support versioning with outer-loop vectorization. */
198 if (dump_enabled_p ())
199 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
200 "versioning not yet supported for outer-loops.");
204 /* FORNOW: We don't support creating runtime alias tests for non-constant
206 if (TREE_CODE (DR_STEP (DDR_A (ddr
))) != INTEGER_CST
207 || TREE_CODE (DR_STEP (DDR_B (ddr
))) != INTEGER_CST
)
209 if (dump_enabled_p ())
210 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
211 "versioning not yet supported for non-constant "
216 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo
).safe_push (ddr
);
221 /* Function vect_analyze_data_ref_dependence.
223 Return TRUE if there (might) exist a dependence between a memory-reference
224 DRA and a memory-reference DRB. When versioning for alias may check a
225 dependence at run-time, return FALSE. Adjust *MAX_VF according to
226 the data dependence. */
229 vect_analyze_data_ref_dependence (struct data_dependence_relation
*ddr
,
230 loop_vec_info loop_vinfo
, int *max_vf
)
233 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
234 struct data_reference
*dra
= DDR_A (ddr
);
235 struct data_reference
*drb
= DDR_B (ddr
);
236 stmt_vec_info stmtinfo_a
= vinfo_for_stmt (DR_STMT (dra
));
237 stmt_vec_info stmtinfo_b
= vinfo_for_stmt (DR_STMT (drb
));
238 lambda_vector dist_v
;
239 unsigned int loop_depth
;
241 /* In loop analysis all data references should be vectorizable. */
242 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a
)
243 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b
))
246 /* Independent data accesses. */
247 if (DDR_ARE_DEPENDENT (ddr
) == chrec_known
)
251 || (DR_IS_READ (dra
) && DR_IS_READ (drb
)))
254 /* Unknown data dependence. */
255 if (DDR_ARE_DEPENDENT (ddr
) == chrec_dont_know
)
257 /* If user asserted safelen consecutive iterations can be
258 executed concurrently, assume independence. */
259 if (loop
->safelen
>= 2)
261 if (loop
->safelen
< *max_vf
)
262 *max_vf
= loop
->safelen
;
266 if (STMT_VINFO_GATHER_P (stmtinfo_a
)
267 || STMT_VINFO_GATHER_P (stmtinfo_b
))
269 if (dump_enabled_p ())
271 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
272 "versioning for alias not supported for: "
273 "can't determine dependence between ");
274 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
276 dump_printf (MSG_MISSED_OPTIMIZATION
, " and ");
277 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
283 if (dump_enabled_p ())
285 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
286 "versioning for alias required: "
287 "can't determine dependence between ");
288 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
290 dump_printf (MSG_MISSED_OPTIMIZATION
, " and ");
291 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
295 /* Add to list of ddrs that need to be tested at run-time. */
296 return !vect_mark_for_runtime_alias_test (ddr
, loop_vinfo
);
299 /* Known data dependence. */
300 if (DDR_NUM_DIST_VECTS (ddr
) == 0)
302 /* If user asserted safelen consecutive iterations can be
303 executed concurrently, assume independence. */
304 if (loop
->safelen
>= 2)
306 if (loop
->safelen
< *max_vf
)
307 *max_vf
= loop
->safelen
;
311 if (STMT_VINFO_GATHER_P (stmtinfo_a
)
312 || STMT_VINFO_GATHER_P (stmtinfo_b
))
314 if (dump_enabled_p ())
316 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
317 "versioning for alias not supported for: "
318 "bad dist vector for ");
319 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
321 dump_printf (MSG_MISSED_OPTIMIZATION
, " and ");
322 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
328 if (dump_enabled_p ())
330 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
331 "versioning for alias required: "
332 "bad dist vector for ");
333 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, DR_REF (dra
));
334 dump_printf (MSG_MISSED_OPTIMIZATION
, " and ");
335 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, DR_REF (drb
));
337 /* Add to list of ddrs that need to be tested at run-time. */
338 return !vect_mark_for_runtime_alias_test (ddr
, loop_vinfo
);
341 loop_depth
= index_in_loop_nest (loop
->num
, DDR_LOOP_NEST (ddr
));
342 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr
), i
, dist_v
)
344 int dist
= dist_v
[loop_depth
];
346 if (dump_enabled_p ())
347 dump_printf_loc (MSG_NOTE
, vect_location
,
348 "dependence distance = %d.", dist
);
352 if (dump_enabled_p ())
354 dump_printf_loc (MSG_NOTE
, vect_location
,
355 "dependence distance == 0 between ");
356 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dra
));
357 dump_printf (MSG_NOTE
, " and ");
358 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (drb
));
361 /* When we perform grouped accesses and perform implicit CSE
362 by detecting equal accesses and doing disambiguation with
363 runtime alias tests like for
371 where we will end up loading { a[i], a[i+1] } once, make
372 sure that inserting group loads before the first load and
373 stores after the last store will do the right thing. */
374 if ((STMT_VINFO_GROUPED_ACCESS (stmtinfo_a
)
375 && GROUP_SAME_DR_STMT (stmtinfo_a
))
376 || (STMT_VINFO_GROUPED_ACCESS (stmtinfo_b
)
377 && GROUP_SAME_DR_STMT (stmtinfo_b
)))
380 earlier_stmt
= get_earlier_stmt (DR_STMT (dra
), DR_STMT (drb
));
382 (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt
))))
384 if (dump_enabled_p ())
385 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
386 "READ_WRITE dependence in interleaving.");
394 if (dist
> 0 && DDR_REVERSED_P (ddr
))
396 /* If DDR_REVERSED_P the order of the data-refs in DDR was
397 reversed (to make distance vector positive), and the actual
398 distance is negative. */
399 if (dump_enabled_p ())
400 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
401 "dependence distance negative.");
406 && abs (dist
) < *max_vf
)
408 /* The dependence distance requires reduction of the maximal
409 vectorization factor. */
410 *max_vf
= abs (dist
);
411 if (dump_enabled_p ())
412 dump_printf_loc (MSG_NOTE
, vect_location
,
413 "adjusting maximal vectorization factor to %i",
417 if (abs (dist
) >= *max_vf
)
419 /* Dependence distance does not create dependence, as far as
420 vectorization is concerned, in this case. */
421 if (dump_enabled_p ())
422 dump_printf_loc (MSG_NOTE
, vect_location
,
423 "dependence distance >= VF.");
427 if (dump_enabled_p ())
429 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
430 "not vectorized, possible dependence "
431 "between data-refs ");
432 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dra
));
433 dump_printf (MSG_NOTE
, " and ");
434 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (drb
));
443 /* Function vect_analyze_data_ref_dependences.
445 Examine all the data references in the loop, and make sure there do not
446 exist any data dependences between them. Set *MAX_VF according to
447 the maximum vectorization factor the data dependences allow. */
450 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo
, int *max_vf
)
453 struct data_dependence_relation
*ddr
;
455 if (dump_enabled_p ())
456 dump_printf_loc (MSG_NOTE
, vect_location
,
457 "=== vect_analyze_data_ref_dependences ===");
459 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo
),
460 &LOOP_VINFO_DDRS (loop_vinfo
),
461 LOOP_VINFO_LOOP_NEST (loop_vinfo
), true))
464 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo
), i
, ddr
)
465 if (vect_analyze_data_ref_dependence (ddr
, loop_vinfo
, max_vf
))
472 /* Function vect_slp_analyze_data_ref_dependence.
474 Return TRUE if there (might) exist a dependence between a memory-reference
475 DRA and a memory-reference DRB. When versioning for alias may check a
476 dependence at run-time, return FALSE. Adjust *MAX_VF according to
477 the data dependence. */
480 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation
*ddr
)
482 struct data_reference
*dra
= DDR_A (ddr
);
483 struct data_reference
*drb
= DDR_B (ddr
);
485 /* We need to check dependences of statements marked as unvectorizable
486 as well, they still can prohibit vectorization. */
488 /* Independent data accesses. */
489 if (DDR_ARE_DEPENDENT (ddr
) == chrec_known
)
495 /* Read-read is OK. */
496 if (DR_IS_READ (dra
) && DR_IS_READ (drb
))
499 /* If dra and drb are part of the same interleaving chain consider
501 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra
)))
502 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra
)))
503 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb
)))))
506 /* Unknown data dependence. */
507 if (DDR_ARE_DEPENDENT (ddr
) == chrec_dont_know
)
511 if (dump_enabled_p ())
513 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
514 "can't determine dependence between ");
515 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, DR_REF (dra
));
516 dump_printf (MSG_MISSED_OPTIMIZATION
, " and ");
517 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, DR_REF (drb
));
520 /* We do not vectorize basic blocks with write-write dependencies. */
521 if (DR_IS_WRITE (dra
) && DR_IS_WRITE (drb
))
524 /* Check that it's not a load-after-store dependence. */
525 earlier_stmt
= get_earlier_stmt (DR_STMT (dra
), DR_STMT (drb
));
526 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt
))))
532 if (dump_enabled_p ())
534 dump_printf_loc (MSG_NOTE
, vect_location
,
535 "determined dependence between ");
536 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dra
));
537 dump_printf (MSG_NOTE
, " and ");
538 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (drb
));
541 /* Do not vectorize basic blocks with write-write dependences. */
542 if (DR_IS_WRITE (dra
) && DR_IS_WRITE (drb
))
545 /* Check dependence between DRA and DRB for basic block vectorization.
546 If the accesses share same bases and offsets, we can compare their initial
547 constant offsets to decide whether they differ or not. In case of a read-
548 write dependence we check that the load is before the store to ensure that
549 vectorization will not change the order of the accesses. */
551 HOST_WIDE_INT type_size_a
, type_size_b
, init_a
, init_b
;
554 /* Check that the data-refs have same bases and offsets. If not, we can't
555 determine if they are dependent. */
556 if (!operand_equal_p (DR_BASE_ADDRESS (dra
), DR_BASE_ADDRESS (drb
), 0)
557 || !dr_equal_offsets_p (dra
, drb
))
560 /* Check the types. */
561 type_size_a
= TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra
))));
562 type_size_b
= TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb
))));
564 if (type_size_a
!= type_size_b
565 || !types_compatible_p (TREE_TYPE (DR_REF (dra
)),
566 TREE_TYPE (DR_REF (drb
))))
569 init_a
= TREE_INT_CST_LOW (DR_INIT (dra
));
570 init_b
= TREE_INT_CST_LOW (DR_INIT (drb
));
572 /* Two different locations - no dependence. */
573 if (init_a
!= init_b
)
576 /* We have a read-write dependence. Check that the load is before the store.
577 When we vectorize basic blocks, vector load can be only before
578 corresponding scalar load, and vector store can be only after its
579 corresponding scalar store. So the order of the acceses is preserved in
580 case the load is before the store. */
581 earlier_stmt
= get_earlier_stmt (DR_STMT (dra
), DR_STMT (drb
));
582 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt
))))
589 /* Function vect_analyze_data_ref_dependences.
591 Examine all the data references in the basic-block, and make sure there
592 do not exist any data dependences between them. Set *MAX_VF according to
593 the maximum vectorization factor the data dependences allow. */
596 vect_slp_analyze_data_ref_dependences (bb_vec_info bb_vinfo
)
598 struct data_dependence_relation
*ddr
;
601 if (dump_enabled_p ())
602 dump_printf_loc (MSG_NOTE
, vect_location
,
603 "=== vect_slp_analyze_data_ref_dependences ===");
605 if (!compute_all_dependences (BB_VINFO_DATAREFS (bb_vinfo
),
606 &BB_VINFO_DDRS (bb_vinfo
),
610 FOR_EACH_VEC_ELT (BB_VINFO_DDRS (bb_vinfo
), i
, ddr
)
611 if (vect_slp_analyze_data_ref_dependence (ddr
))
618 /* Function vect_compute_data_ref_alignment
620 Compute the misalignment of the data reference DR.
623 1. If during the misalignment computation it is found that the data reference
624 cannot be vectorized then false is returned.
625 2. DR_MISALIGNMENT (DR) is defined.
627 FOR NOW: No analysis is actually performed. Misalignment is calculated
628 only for trivial cases. TODO. */
631 vect_compute_data_ref_alignment (struct data_reference
*dr
)
633 gimple stmt
= DR_STMT (dr
);
634 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
635 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
636 struct loop
*loop
= NULL
;
637 tree ref
= DR_REF (dr
);
639 tree base
, base_addr
;
642 tree aligned_to
, alignment
;
644 if (dump_enabled_p ())
645 dump_printf_loc (MSG_NOTE
, vect_location
,
646 "vect_compute_data_ref_alignment:");
649 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
651 /* Initialize misalignment to unknown. */
652 SET_DR_MISALIGNMENT (dr
, -1);
654 /* Strided loads perform only component accesses, misalignment information
655 is irrelevant for them. */
656 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info
))
659 misalign
= DR_INIT (dr
);
660 aligned_to
= DR_ALIGNED_TO (dr
);
661 base_addr
= DR_BASE_ADDRESS (dr
);
662 vectype
= STMT_VINFO_VECTYPE (stmt_info
);
664 /* In case the dataref is in an inner-loop of the loop that is being
665 vectorized (LOOP), we use the base and misalignment information
666 relative to the outer-loop (LOOP). This is ok only if the misalignment
667 stays the same throughout the execution of the inner-loop, which is why
668 we have to check that the stride of the dataref in the inner-loop evenly
669 divides by the vector size. */
670 if (loop
&& nested_in_vect_loop_p (loop
, stmt
))
672 tree step
= DR_STEP (dr
);
673 HOST_WIDE_INT dr_step
= TREE_INT_CST_LOW (step
);
675 if (dr_step
% GET_MODE_SIZE (TYPE_MODE (vectype
)) == 0)
677 if (dump_enabled_p ())
678 dump_printf_loc (MSG_NOTE
, vect_location
,
679 "inner step divides the vector-size.");
680 misalign
= STMT_VINFO_DR_INIT (stmt_info
);
681 aligned_to
= STMT_VINFO_DR_ALIGNED_TO (stmt_info
);
682 base_addr
= STMT_VINFO_DR_BASE_ADDRESS (stmt_info
);
686 if (dump_enabled_p ())
687 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
688 "inner step doesn't divide the vector-size.");
689 misalign
= NULL_TREE
;
693 /* Similarly, if we're doing basic-block vectorization, we can only use
694 base and misalignment information relative to an innermost loop if the
695 misalignment stays the same throughout the execution of the loop.
696 As above, this is the case if the stride of the dataref evenly divides
697 by the vector size. */
700 tree step
= DR_STEP (dr
);
701 HOST_WIDE_INT dr_step
= TREE_INT_CST_LOW (step
);
703 if (dr_step
% GET_MODE_SIZE (TYPE_MODE (vectype
)) != 0)
705 if (dump_enabled_p ())
706 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
707 "SLP: step doesn't divide the vector-size.");
708 misalign
= NULL_TREE
;
712 base
= build_fold_indirect_ref (base_addr
);
713 alignment
= ssize_int (TYPE_ALIGN (vectype
)/BITS_PER_UNIT
);
715 if ((aligned_to
&& tree_int_cst_compare (aligned_to
, alignment
) < 0)
718 if (dump_enabled_p ())
720 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
721 "Unknown alignment for access: ");
722 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, base
);
728 && tree_int_cst_compare (ssize_int (DECL_ALIGN_UNIT (base
)),
730 || (TREE_CODE (base_addr
) == SSA_NAME
731 && tree_int_cst_compare (ssize_int (TYPE_ALIGN_UNIT (TREE_TYPE (
732 TREE_TYPE (base_addr
)))),
734 || (get_pointer_alignment (base_addr
) >= TYPE_ALIGN (vectype
)))
737 base_aligned
= false;
741 /* Do not change the alignment of global variables here if
742 flag_section_anchors is enabled as we already generated
743 RTL for other functions. Most global variables should
744 have been aligned during the IPA increase_alignment pass. */
745 if (!vect_can_force_dr_alignment_p (base
, TYPE_ALIGN (vectype
))
746 || (TREE_STATIC (base
) && flag_section_anchors
))
748 if (dump_enabled_p ())
750 dump_printf_loc (MSG_NOTE
, vect_location
,
751 "can't force alignment of ref: ");
752 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, ref
);
757 /* Force the alignment of the decl.
758 NOTE: This is the only change to the code we make during
759 the analysis phase, before deciding to vectorize the loop. */
760 if (dump_enabled_p ())
762 dump_printf_loc (MSG_NOTE
, vect_location
, "force alignment of ");
763 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, ref
);
766 ((dataref_aux
*)dr
->aux
)->base_decl
= base
;
767 ((dataref_aux
*)dr
->aux
)->base_misaligned
= true;
770 /* If this is a backward running DR then first access in the larger
771 vectype actually is N-1 elements before the address in the DR.
772 Adjust misalign accordingly. */
773 if (tree_int_cst_compare (DR_STEP (dr
), size_zero_node
) < 0)
775 tree offset
= ssize_int (TYPE_VECTOR_SUBPARTS (vectype
) - 1);
776 /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
777 otherwise we wouldn't be here. */
778 offset
= fold_build2 (MULT_EXPR
, ssizetype
, offset
, DR_STEP (dr
));
779 /* PLUS because DR_STEP was negative. */
780 misalign
= size_binop (PLUS_EXPR
, misalign
, offset
);
783 /* Modulo alignment. */
784 misalign
= size_binop (FLOOR_MOD_EXPR
, misalign
, alignment
);
786 if (!host_integerp (misalign
, 1))
788 /* Negative or overflowed misalignment value. */
789 if (dump_enabled_p ())
790 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
791 "unexpected misalign value");
795 SET_DR_MISALIGNMENT (dr
, TREE_INT_CST_LOW (misalign
));
797 if (dump_enabled_p ())
799 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
800 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr
));
801 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, ref
);
808 /* Function vect_compute_data_refs_alignment
810 Compute the misalignment of data references in the loop.
811 Return FALSE if a data reference is found that cannot be vectorized. */
814 vect_compute_data_refs_alignment (loop_vec_info loop_vinfo
,
815 bb_vec_info bb_vinfo
)
817 vec
<data_reference_p
> datarefs
;
818 struct data_reference
*dr
;
822 datarefs
= LOOP_VINFO_DATAREFS (loop_vinfo
);
824 datarefs
= BB_VINFO_DATAREFS (bb_vinfo
);
826 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
827 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr
)))
828 && !vect_compute_data_ref_alignment (dr
))
832 /* Mark unsupported statement as unvectorizable. */
833 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr
))) = false;
844 /* Function vect_update_misalignment_for_peel
846 DR - the data reference whose misalignment is to be adjusted.
847 DR_PEEL - the data reference whose misalignment is being made
848 zero in the vector loop by the peel.
849 NPEEL - the number of iterations in the peel loop if the misalignment
850 of DR_PEEL is known at compile time. */
853 vect_update_misalignment_for_peel (struct data_reference
*dr
,
854 struct data_reference
*dr_peel
, int npeel
)
857 vec
<dr_p
> same_align_drs
;
858 struct data_reference
*current_dr
;
859 int dr_size
= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr
))));
860 int dr_peel_size
= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel
))));
861 stmt_vec_info stmt_info
= vinfo_for_stmt (DR_STMT (dr
));
862 stmt_vec_info peel_stmt_info
= vinfo_for_stmt (DR_STMT (dr_peel
));
864 /* For interleaved data accesses the step in the loop must be multiplied by
865 the size of the interleaving group. */
866 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
867 dr_size
*= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info
)));
868 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info
))
869 dr_peel_size
*= GROUP_SIZE (peel_stmt_info
);
871 /* It can be assumed that the data refs with the same alignment as dr_peel
872 are aligned in the vector loop. */
874 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel
)));
875 FOR_EACH_VEC_ELT (same_align_drs
, i
, current_dr
)
877 if (current_dr
!= dr
)
879 gcc_assert (DR_MISALIGNMENT (dr
) / dr_size
==
880 DR_MISALIGNMENT (dr_peel
) / dr_peel_size
);
881 SET_DR_MISALIGNMENT (dr
, 0);
885 if (known_alignment_for_access_p (dr
)
886 && known_alignment_for_access_p (dr_peel
))
888 bool negative
= tree_int_cst_compare (DR_STEP (dr
), size_zero_node
) < 0;
889 int misal
= DR_MISALIGNMENT (dr
);
890 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
891 misal
+= negative
? -npeel
* dr_size
: npeel
* dr_size
;
892 misal
&= (TYPE_ALIGN (vectype
) / BITS_PER_UNIT
) - 1;
893 SET_DR_MISALIGNMENT (dr
, misal
);
897 if (dump_enabled_p ())
898 dump_printf_loc (MSG_NOTE
, vect_location
, "Setting misalignment to -1.");
899 SET_DR_MISALIGNMENT (dr
, -1);
903 /* Function vect_verify_datarefs_alignment
905 Return TRUE if all data references in the loop can be
906 handled with respect to alignment. */
909 vect_verify_datarefs_alignment (loop_vec_info loop_vinfo
, bb_vec_info bb_vinfo
)
911 vec
<data_reference_p
> datarefs
;
912 struct data_reference
*dr
;
913 enum dr_alignment_support supportable_dr_alignment
;
917 datarefs
= LOOP_VINFO_DATAREFS (loop_vinfo
);
919 datarefs
= BB_VINFO_DATAREFS (bb_vinfo
);
921 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
923 gimple stmt
= DR_STMT (dr
);
924 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
926 if (!STMT_VINFO_RELEVANT_P (stmt_info
))
929 /* For interleaving, only the alignment of the first access matters.
930 Skip statements marked as not vectorizable. */
931 if ((STMT_VINFO_GROUPED_ACCESS (stmt_info
)
932 && GROUP_FIRST_ELEMENT (stmt_info
) != stmt
)
933 || !STMT_VINFO_VECTORIZABLE (stmt_info
))
936 /* Strided loads perform only component accesses, alignment is
937 irrelevant for them. */
938 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info
))
941 supportable_dr_alignment
= vect_supportable_dr_alignment (dr
, false);
942 if (!supportable_dr_alignment
)
944 if (dump_enabled_p ())
947 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
948 "not vectorized: unsupported unaligned load.");
950 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
951 "not vectorized: unsupported unaligned "
954 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
959 if (supportable_dr_alignment
!= dr_aligned
&& dump_enabled_p ())
960 dump_printf_loc (MSG_NOTE
, vect_location
,
961 "Vectorizing an unaligned access.");
966 /* Given an memory reference EXP return whether its alignment is less
970 not_size_aligned (tree exp
)
972 if (!host_integerp (TYPE_SIZE (TREE_TYPE (exp
)), 1))
975 return (TREE_INT_CST_LOW (TYPE_SIZE (TREE_TYPE (exp
)))
976 > get_object_alignment (exp
));
979 /* Function vector_alignment_reachable_p
981 Return true if vector alignment for DR is reachable by peeling
982 a few loop iterations. Return false otherwise. */
985 vector_alignment_reachable_p (struct data_reference
*dr
)
987 gimple stmt
= DR_STMT (dr
);
988 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
989 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
991 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
993 /* For interleaved access we peel only if number of iterations in
994 the prolog loop ({VF - misalignment}), is a multiple of the
995 number of the interleaved accesses. */
996 int elem_size
, mis_in_elements
;
997 int nelements
= TYPE_VECTOR_SUBPARTS (vectype
);
999 /* FORNOW: handle only known alignment. */
1000 if (!known_alignment_for_access_p (dr
))
1003 elem_size
= GET_MODE_SIZE (TYPE_MODE (vectype
)) / nelements
;
1004 mis_in_elements
= DR_MISALIGNMENT (dr
) / elem_size
;
1006 if ((nelements
- mis_in_elements
) % GROUP_SIZE (stmt_info
))
1010 /* If misalignment is known at the compile time then allow peeling
1011 only if natural alignment is reachable through peeling. */
1012 if (known_alignment_for_access_p (dr
) && !aligned_access_p (dr
))
1014 HOST_WIDE_INT elmsize
=
1015 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype
)));
1016 if (dump_enabled_p ())
1018 dump_printf_loc (MSG_NOTE
, vect_location
,
1019 "data size =" HOST_WIDE_INT_PRINT_DEC
, elmsize
);
1020 dump_printf (MSG_NOTE
,
1021 ". misalignment = %d. ", DR_MISALIGNMENT (dr
));
1023 if (DR_MISALIGNMENT (dr
) % elmsize
)
1025 if (dump_enabled_p ())
1026 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1027 "data size does not divide the misalignment.\n");
1032 if (!known_alignment_for_access_p (dr
))
1034 tree type
= TREE_TYPE (DR_REF (dr
));
1035 bool is_packed
= not_size_aligned (DR_REF (dr
));
1036 if (dump_enabled_p ())
1037 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1038 "Unknown misalignment, is_packed = %d",is_packed
);
1039 if ((TYPE_USER_ALIGN (type
) && !is_packed
)
1040 || targetm
.vectorize
.vector_alignment_reachable (type
, is_packed
))
1050 /* Calculate the cost of the memory access represented by DR. */
1053 vect_get_data_access_cost (struct data_reference
*dr
,
1054 unsigned int *inside_cost
,
1055 unsigned int *outside_cost
,
1056 stmt_vector_for_cost
*body_cost_vec
)
1058 gimple stmt
= DR_STMT (dr
);
1059 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
1060 int nunits
= TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info
));
1061 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
1062 int vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
1063 int ncopies
= vf
/ nunits
;
1065 if (DR_IS_READ (dr
))
1066 vect_get_load_cost (dr
, ncopies
, true, inside_cost
, outside_cost
,
1067 NULL
, body_cost_vec
, false);
1069 vect_get_store_cost (dr
, ncopies
, inside_cost
, body_cost_vec
);
1071 if (dump_enabled_p ())
1072 dump_printf_loc (MSG_NOTE
, vect_location
,
1073 "vect_get_data_access_cost: inside_cost = %d, "
1074 "outside_cost = %d.", *inside_cost
, *outside_cost
);
1078 /* Insert DR into peeling hash table with NPEEL as key. */
1081 vect_peeling_hash_insert (loop_vec_info loop_vinfo
, struct data_reference
*dr
,
1084 struct _vect_peel_info elem
, *slot
;
1085 _vect_peel_info
**new_slot
;
1086 bool supportable_dr_alignment
= vect_supportable_dr_alignment (dr
, true);
1089 slot
= LOOP_VINFO_PEELING_HTAB (loop_vinfo
).find (&elem
);
1094 slot
= XNEW (struct _vect_peel_info
);
1095 slot
->npeel
= npeel
;
1098 new_slot
= LOOP_VINFO_PEELING_HTAB (loop_vinfo
).find_slot (slot
, INSERT
);
1102 if (!supportable_dr_alignment
&& !flag_vect_cost_model
)
1103 slot
->count
+= VECT_MAX_COST
;
1107 /* Traverse peeling hash table to find peeling option that aligns maximum
1108 number of data accesses. */
1111 vect_peeling_hash_get_most_frequent (_vect_peel_info
**slot
,
1112 _vect_peel_extended_info
*max
)
1114 vect_peel_info elem
= *slot
;
1116 if (elem
->count
> max
->peel_info
.count
1117 || (elem
->count
== max
->peel_info
.count
1118 && max
->peel_info
.npeel
> elem
->npeel
))
1120 max
->peel_info
.npeel
= elem
->npeel
;
1121 max
->peel_info
.count
= elem
->count
;
1122 max
->peel_info
.dr
= elem
->dr
;
1129 /* Traverse peeling hash table and calculate cost for each peeling option.
1130 Find the one with the lowest cost. */
1133 vect_peeling_hash_get_lowest_cost (_vect_peel_info
**slot
,
1134 _vect_peel_extended_info
*min
)
1136 vect_peel_info elem
= *slot
;
1137 int save_misalignment
, dummy
;
1138 unsigned int inside_cost
= 0, outside_cost
= 0, i
;
1139 gimple stmt
= DR_STMT (elem
->dr
);
1140 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
1141 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
1142 vec
<data_reference_p
> datarefs
= LOOP_VINFO_DATAREFS (loop_vinfo
);
1143 struct data_reference
*dr
;
1144 stmt_vector_for_cost prologue_cost_vec
, body_cost_vec
, epilogue_cost_vec
;
1145 int single_iter_cost
;
1147 prologue_cost_vec
.create (2);
1148 body_cost_vec
.create (2);
1149 epilogue_cost_vec
.create (2);
1151 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
1153 stmt
= DR_STMT (dr
);
1154 stmt_info
= vinfo_for_stmt (stmt
);
1155 /* For interleaving, only the alignment of the first access
1157 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1158 && GROUP_FIRST_ELEMENT (stmt_info
) != stmt
)
1161 save_misalignment
= DR_MISALIGNMENT (dr
);
1162 vect_update_misalignment_for_peel (dr
, elem
->dr
, elem
->npeel
);
1163 vect_get_data_access_cost (dr
, &inside_cost
, &outside_cost
,
1165 SET_DR_MISALIGNMENT (dr
, save_misalignment
);
1168 single_iter_cost
= vect_get_single_scalar_iteration_cost (loop_vinfo
);
1169 outside_cost
+= vect_get_known_peeling_cost (loop_vinfo
, elem
->npeel
,
1170 &dummy
, single_iter_cost
,
1172 &epilogue_cost_vec
);
1174 /* Prologue and epilogue costs are added to the target model later.
1175 These costs depend only on the scalar iteration cost, the
1176 number of peeling iterations finally chosen, and the number of
1177 misaligned statements. So discard the information found here. */
1178 prologue_cost_vec
.release ();
1179 epilogue_cost_vec
.release ();
1181 if (inside_cost
< min
->inside_cost
1182 || (inside_cost
== min
->inside_cost
&& outside_cost
< min
->outside_cost
))
1184 min
->inside_cost
= inside_cost
;
1185 min
->outside_cost
= outside_cost
;
1186 min
->body_cost_vec
.release ();
1187 min
->body_cost_vec
= body_cost_vec
;
1188 min
->peel_info
.dr
= elem
->dr
;
1189 min
->peel_info
.npeel
= elem
->npeel
;
1192 body_cost_vec
.release ();
1198 /* Choose best peeling option by traversing peeling hash table and either
1199 choosing an option with the lowest cost (if cost model is enabled) or the
1200 option that aligns as many accesses as possible. */
1202 static struct data_reference
*
1203 vect_peeling_hash_choose_best_peeling (loop_vec_info loop_vinfo
,
1204 unsigned int *npeel
,
1205 stmt_vector_for_cost
*body_cost_vec
)
1207 struct _vect_peel_extended_info res
;
1209 res
.peel_info
.dr
= NULL
;
1210 res
.body_cost_vec
= stmt_vector_for_cost();
1212 if (flag_vect_cost_model
)
1214 res
.inside_cost
= INT_MAX
;
1215 res
.outside_cost
= INT_MAX
;
1216 LOOP_VINFO_PEELING_HTAB (loop_vinfo
)
1217 .traverse
<_vect_peel_extended_info
*,
1218 vect_peeling_hash_get_lowest_cost
> (&res
);
1222 res
.peel_info
.count
= 0;
1223 LOOP_VINFO_PEELING_HTAB (loop_vinfo
)
1224 .traverse
<_vect_peel_extended_info
*,
1225 vect_peeling_hash_get_most_frequent
> (&res
);
1228 *npeel
= res
.peel_info
.npeel
;
1229 *body_cost_vec
= res
.body_cost_vec
;
1230 return res
.peel_info
.dr
;
1234 /* Function vect_enhance_data_refs_alignment
1236 This pass will use loop versioning and loop peeling in order to enhance
1237 the alignment of data references in the loop.
1239 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1240 original loop is to be vectorized. Any other loops that are created by
1241 the transformations performed in this pass - are not supposed to be
1242 vectorized. This restriction will be relaxed.
1244 This pass will require a cost model to guide it whether to apply peeling
1245 or versioning or a combination of the two. For example, the scheme that
1246 intel uses when given a loop with several memory accesses, is as follows:
1247 choose one memory access ('p') which alignment you want to force by doing
1248 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1249 other accesses are not necessarily aligned, or (2) use loop versioning to
1250 generate one loop in which all accesses are aligned, and another loop in
1251 which only 'p' is necessarily aligned.
1253 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1254 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
1255 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1257 Devising a cost model is the most critical aspect of this work. It will
1258 guide us on which access to peel for, whether to use loop versioning, how
1259 many versions to create, etc. The cost model will probably consist of
1260 generic considerations as well as target specific considerations (on
1261 powerpc for example, misaligned stores are more painful than misaligned
1264 Here are the general steps involved in alignment enhancements:
1266 -- original loop, before alignment analysis:
1267 for (i=0; i<N; i++){
1268 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1269 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1272 -- After vect_compute_data_refs_alignment:
1273 for (i=0; i<N; i++){
1274 x = q[i]; # DR_MISALIGNMENT(q) = 3
1275 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1278 -- Possibility 1: we do loop versioning:
1280 for (i=0; i<N; i++){ # loop 1A
1281 x = q[i]; # DR_MISALIGNMENT(q) = 3
1282 p[i] = y; # DR_MISALIGNMENT(p) = 0
1286 for (i=0; i<N; i++){ # loop 1B
1287 x = q[i]; # DR_MISALIGNMENT(q) = 3
1288 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1292 -- Possibility 2: we do loop peeling:
1293 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1297 for (i = 3; i < N; i++){ # loop 2A
1298 x = q[i]; # DR_MISALIGNMENT(q) = 0
1299 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1302 -- Possibility 3: combination of loop peeling and versioning:
1303 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1308 for (i = 3; i<N; i++){ # loop 3A
1309 x = q[i]; # DR_MISALIGNMENT(q) = 0
1310 p[i] = y; # DR_MISALIGNMENT(p) = 0
1314 for (i = 3; i<N; i++){ # loop 3B
1315 x = q[i]; # DR_MISALIGNMENT(q) = 0
1316 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1320 These loops are later passed to loop_transform to be vectorized. The
1321 vectorizer will use the alignment information to guide the transformation
1322 (whether to generate regular loads/stores, or with special handling for
1326 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo
)
1328 vec
<data_reference_p
> datarefs
= LOOP_VINFO_DATAREFS (loop_vinfo
);
1329 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1330 enum dr_alignment_support supportable_dr_alignment
;
1331 struct data_reference
*dr0
= NULL
, *first_store
= NULL
;
1332 struct data_reference
*dr
;
1334 bool do_peeling
= false;
1335 bool do_versioning
= false;
1338 stmt_vec_info stmt_info
;
1339 unsigned int npeel
= 0;
1340 bool all_misalignments_unknown
= true;
1341 unsigned int vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
1342 unsigned possible_npeel_number
= 1;
1344 unsigned int nelements
, mis
, same_align_drs_max
= 0;
1345 stmt_vector_for_cost body_cost_vec
= stmt_vector_for_cost();
1347 if (dump_enabled_p ())
1348 dump_printf_loc (MSG_NOTE
, vect_location
,
1349 "=== vect_enhance_data_refs_alignment ===");
1351 /* While cost model enhancements are expected in the future, the high level
1352 view of the code at this time is as follows:
1354 A) If there is a misaligned access then see if peeling to align
1355 this access can make all data references satisfy
1356 vect_supportable_dr_alignment. If so, update data structures
1357 as needed and return true.
1359 B) If peeling wasn't possible and there is a data reference with an
1360 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1361 then see if loop versioning checks can be used to make all data
1362 references satisfy vect_supportable_dr_alignment. If so, update
1363 data structures as needed and return true.
1365 C) If neither peeling nor versioning were successful then return false if
1366 any data reference does not satisfy vect_supportable_dr_alignment.
1368 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1370 Note, Possibility 3 above (which is peeling and versioning together) is not
1371 being done at this time. */
1373 /* (1) Peeling to force alignment. */
1375 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1377 + How many accesses will become aligned due to the peeling
1378 - How many accesses will become unaligned due to the peeling,
1379 and the cost of misaligned accesses.
1380 - The cost of peeling (the extra runtime checks, the increase
1383 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
1385 stmt
= DR_STMT (dr
);
1386 stmt_info
= vinfo_for_stmt (stmt
);
1388 if (!STMT_VINFO_RELEVANT_P (stmt_info
))
1391 /* For interleaving, only the alignment of the first access
1393 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1394 && GROUP_FIRST_ELEMENT (stmt_info
) != stmt
)
1397 /* For invariant accesses there is nothing to enhance. */
1398 if (integer_zerop (DR_STEP (dr
)))
1401 /* Strided loads perform only component accesses, alignment is
1402 irrelevant for them. */
1403 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info
))
1406 supportable_dr_alignment
= vect_supportable_dr_alignment (dr
, true);
1407 do_peeling
= vector_alignment_reachable_p (dr
);
1410 if (known_alignment_for_access_p (dr
))
1412 unsigned int npeel_tmp
;
1413 bool negative
= tree_int_cst_compare (DR_STEP (dr
),
1414 size_zero_node
) < 0;
1416 /* Save info about DR in the hash table. */
1417 if (!LOOP_VINFO_PEELING_HTAB (loop_vinfo
).is_created ())
1418 LOOP_VINFO_PEELING_HTAB (loop_vinfo
).create (1);
1420 vectype
= STMT_VINFO_VECTYPE (stmt_info
);
1421 nelements
= TYPE_VECTOR_SUBPARTS (vectype
);
1422 mis
= DR_MISALIGNMENT (dr
) / GET_MODE_SIZE (TYPE_MODE (
1423 TREE_TYPE (DR_REF (dr
))));
1424 npeel_tmp
= (negative
1425 ? (mis
- nelements
) : (nelements
- mis
))
1428 /* For multiple types, it is possible that the bigger type access
1429 will have more than one peeling option. E.g., a loop with two
1430 types: one of size (vector size / 4), and the other one of
1431 size (vector size / 8). Vectorization factor will 8. If both
1432 access are misaligned by 3, the first one needs one scalar
1433 iteration to be aligned, and the second one needs 5. But the
1434 the first one will be aligned also by peeling 5 scalar
1435 iterations, and in that case both accesses will be aligned.
1436 Hence, except for the immediate peeling amount, we also want
1437 to try to add full vector size, while we don't exceed
1438 vectorization factor.
1439 We do this automtically for cost model, since we calculate cost
1440 for every peeling option. */
1441 if (!flag_vect_cost_model
)
1442 possible_npeel_number
= vf
/nelements
;
1444 /* Handle the aligned case. We may decide to align some other
1445 access, making DR unaligned. */
1446 if (DR_MISALIGNMENT (dr
) == 0)
1449 if (!flag_vect_cost_model
)
1450 possible_npeel_number
++;
1453 for (j
= 0; j
< possible_npeel_number
; j
++)
1455 gcc_assert (npeel_tmp
<= vf
);
1456 vect_peeling_hash_insert (loop_vinfo
, dr
, npeel_tmp
);
1457 npeel_tmp
+= nelements
;
1460 all_misalignments_unknown
= false;
1461 /* Data-ref that was chosen for the case that all the
1462 misalignments are unknown is not relevant anymore, since we
1463 have a data-ref with known alignment. */
1468 /* If we don't know any misalignment values, we prefer
1469 peeling for data-ref that has the maximum number of data-refs
1470 with the same alignment, unless the target prefers to align
1471 stores over load. */
1472 if (all_misalignments_unknown
)
1474 unsigned same_align_drs
1475 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info
).length ();
1477 || same_align_drs_max
< same_align_drs
)
1479 same_align_drs_max
= same_align_drs
;
1482 /* For data-refs with the same number of related
1483 accesses prefer the one where the misalign
1484 computation will be invariant in the outermost loop. */
1485 else if (same_align_drs_max
== same_align_drs
)
1487 struct loop
*ivloop0
, *ivloop
;
1488 ivloop0
= outermost_invariant_loop_for_expr
1489 (loop
, DR_BASE_ADDRESS (dr0
));
1490 ivloop
= outermost_invariant_loop_for_expr
1491 (loop
, DR_BASE_ADDRESS (dr
));
1492 if ((ivloop
&& !ivloop0
)
1493 || (ivloop
&& ivloop0
1494 && flow_loop_nested_p (ivloop
, ivloop0
)))
1498 if (!first_store
&& DR_IS_WRITE (dr
))
1502 /* If there are both known and unknown misaligned accesses in the
1503 loop, we choose peeling amount according to the known
1505 if (!supportable_dr_alignment
)
1508 if (!first_store
&& DR_IS_WRITE (dr
))
1515 if (!aligned_access_p (dr
))
1517 if (dump_enabled_p ())
1518 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1519 "vector alignment may not be reachable");
1525 /* Check if we can possibly peel the loop. */
1526 if (!vect_can_advance_ivs_p (loop_vinfo
)
1527 || !slpeel_can_duplicate_loop_p (loop
, single_exit (loop
)))
1530 if (do_peeling
&& all_misalignments_unknown
1531 && vect_supportable_dr_alignment (dr0
, false))
1534 /* Check if the target requires to prefer stores over loads, i.e., if
1535 misaligned stores are more expensive than misaligned loads (taking
1536 drs with same alignment into account). */
1537 if (first_store
&& DR_IS_READ (dr0
))
1539 unsigned int load_inside_cost
= 0, load_outside_cost
= 0;
1540 unsigned int store_inside_cost
= 0, store_outside_cost
= 0;
1541 unsigned int load_inside_penalty
= 0, load_outside_penalty
= 0;
1542 unsigned int store_inside_penalty
= 0, store_outside_penalty
= 0;
1543 stmt_vector_for_cost dummy
;
1546 vect_get_data_access_cost (dr0
, &load_inside_cost
, &load_outside_cost
,
1548 vect_get_data_access_cost (first_store
, &store_inside_cost
,
1549 &store_outside_cost
, &dummy
);
1553 /* Calculate the penalty for leaving FIRST_STORE unaligned (by
1554 aligning the load DR0). */
1555 load_inside_penalty
= store_inside_cost
;
1556 load_outside_penalty
= store_outside_cost
;
1558 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1559 DR_STMT (first_store
))).iterate (i
, &dr
);
1561 if (DR_IS_READ (dr
))
1563 load_inside_penalty
+= load_inside_cost
;
1564 load_outside_penalty
+= load_outside_cost
;
1568 load_inside_penalty
+= store_inside_cost
;
1569 load_outside_penalty
+= store_outside_cost
;
1572 /* Calculate the penalty for leaving DR0 unaligned (by
1573 aligning the FIRST_STORE). */
1574 store_inside_penalty
= load_inside_cost
;
1575 store_outside_penalty
= load_outside_cost
;
1577 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1578 DR_STMT (dr0
))).iterate (i
, &dr
);
1580 if (DR_IS_READ (dr
))
1582 store_inside_penalty
+= load_inside_cost
;
1583 store_outside_penalty
+= load_outside_cost
;
1587 store_inside_penalty
+= store_inside_cost
;
1588 store_outside_penalty
+= store_outside_cost
;
1591 if (load_inside_penalty
> store_inside_penalty
1592 || (load_inside_penalty
== store_inside_penalty
1593 && load_outside_penalty
> store_outside_penalty
))
1597 /* In case there are only loads with different unknown misalignments, use
1598 peeling only if it may help to align other accesses in the loop. */
1600 && !STMT_VINFO_SAME_ALIGN_REFS (
1601 vinfo_for_stmt (DR_STMT (dr0
))).length ()
1602 && vect_supportable_dr_alignment (dr0
, false)
1603 != dr_unaligned_supported
)
1607 if (do_peeling
&& !dr0
)
1609 /* Peeling is possible, but there is no data access that is not supported
1610 unless aligned. So we try to choose the best possible peeling. */
1612 /* We should get here only if there are drs with known misalignment. */
1613 gcc_assert (!all_misalignments_unknown
);
1615 /* Choose the best peeling from the hash table. */
1616 dr0
= vect_peeling_hash_choose_best_peeling (loop_vinfo
, &npeel
,
1624 stmt
= DR_STMT (dr0
);
1625 stmt_info
= vinfo_for_stmt (stmt
);
1626 vectype
= STMT_VINFO_VECTYPE (stmt_info
);
1627 nelements
= TYPE_VECTOR_SUBPARTS (vectype
);
1629 if (known_alignment_for_access_p (dr0
))
1631 bool negative
= tree_int_cst_compare (DR_STEP (dr0
),
1632 size_zero_node
) < 0;
1635 /* Since it's known at compile time, compute the number of
1636 iterations in the peeled loop (the peeling factor) for use in
1637 updating DR_MISALIGNMENT values. The peeling factor is the
1638 vectorization factor minus the misalignment as an element
1640 mis
= DR_MISALIGNMENT (dr0
);
1641 mis
/= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0
))));
1642 npeel
= ((negative
? mis
- nelements
: nelements
- mis
)
1646 /* For interleaved data access every iteration accesses all the
1647 members of the group, therefore we divide the number of iterations
1648 by the group size. */
1649 stmt_info
= vinfo_for_stmt (DR_STMT (dr0
));
1650 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1651 npeel
/= GROUP_SIZE (stmt_info
);
1653 if (dump_enabled_p ())
1654 dump_printf_loc (MSG_NOTE
, vect_location
,
1655 "Try peeling by %d", npeel
);
1658 /* Ensure that all data refs can be vectorized after the peel. */
1659 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
1661 int save_misalignment
;
1666 stmt
= DR_STMT (dr
);
1667 stmt_info
= vinfo_for_stmt (stmt
);
1668 /* For interleaving, only the alignment of the first access
1670 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1671 && GROUP_FIRST_ELEMENT (stmt_info
) != stmt
)
1674 /* Strided loads perform only component accesses, alignment is
1675 irrelevant for them. */
1676 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info
))
1679 save_misalignment
= DR_MISALIGNMENT (dr
);
1680 vect_update_misalignment_for_peel (dr
, dr0
, npeel
);
1681 supportable_dr_alignment
= vect_supportable_dr_alignment (dr
, false);
1682 SET_DR_MISALIGNMENT (dr
, save_misalignment
);
1684 if (!supportable_dr_alignment
)
1691 if (do_peeling
&& known_alignment_for_access_p (dr0
) && npeel
== 0)
1693 stat
= vect_verify_datarefs_alignment (loop_vinfo
, NULL
);
1698 body_cost_vec
.release ();
1705 stmt_info_for_cost
*si
;
1706 void *data
= LOOP_VINFO_TARGET_COST_DATA (loop_vinfo
);
1708 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1709 If the misalignment of DR_i is identical to that of dr0 then set
1710 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
1711 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1712 by the peeling factor times the element size of DR_i (MOD the
1713 vectorization factor times the size). Otherwise, the
1714 misalignment of DR_i must be set to unknown. */
1715 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
1717 vect_update_misalignment_for_peel (dr
, dr0
, npeel
);
1719 LOOP_VINFO_UNALIGNED_DR (loop_vinfo
) = dr0
;
1721 LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo
) = npeel
;
1723 LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo
) = DR_MISALIGNMENT (dr0
);
1724 SET_DR_MISALIGNMENT (dr0
, 0);
1725 if (dump_enabled_p ())
1727 dump_printf_loc (MSG_NOTE
, vect_location
,
1728 "Alignment of access forced using peeling.");
1729 dump_printf_loc (MSG_NOTE
, vect_location
,
1730 "Peeling for alignment will be applied.");
1732 /* We've delayed passing the inside-loop peeling costs to the
1733 target cost model until we were sure peeling would happen.
1735 if (body_cost_vec
.exists ())
1737 FOR_EACH_VEC_ELT (body_cost_vec
, i
, si
)
1739 struct _stmt_vec_info
*stmt_info
1740 = si
->stmt
? vinfo_for_stmt (si
->stmt
) : NULL
;
1741 (void) add_stmt_cost (data
, si
->count
, si
->kind
, stmt_info
,
1742 si
->misalign
, vect_body
);
1744 body_cost_vec
.release ();
1747 stat
= vect_verify_datarefs_alignment (loop_vinfo
, NULL
);
1753 body_cost_vec
.release ();
1755 /* (2) Versioning to force alignment. */
1757 /* Try versioning if:
1758 1) flag_tree_vect_loop_version is TRUE
1759 2) optimize loop for speed
1760 3) there is at least one unsupported misaligned data ref with an unknown
1762 4) all misaligned data refs with a known misalignment are supported, and
1763 5) the number of runtime alignment checks is within reason. */
1766 flag_tree_vect_loop_version
1767 && optimize_loop_nest_for_speed_p (loop
)
1768 && (!loop
->inner
); /* FORNOW */
1772 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
1774 stmt
= DR_STMT (dr
);
1775 stmt_info
= vinfo_for_stmt (stmt
);
1777 /* For interleaving, only the alignment of the first access
1779 if (aligned_access_p (dr
)
1780 || (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1781 && GROUP_FIRST_ELEMENT (stmt_info
) != stmt
))
1784 /* Strided loads perform only component accesses, alignment is
1785 irrelevant for them. */
1786 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info
))
1789 supportable_dr_alignment
= vect_supportable_dr_alignment (dr
, false);
1791 if (!supportable_dr_alignment
)
1797 if (known_alignment_for_access_p (dr
)
1798 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).length ()
1799 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS
))
1801 do_versioning
= false;
1805 stmt
= DR_STMT (dr
);
1806 vectype
= STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt
));
1807 gcc_assert (vectype
);
1809 /* The rightmost bits of an aligned address must be zeros.
1810 Construct the mask needed for this test. For example,
1811 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
1812 mask must be 15 = 0xf. */
1813 mask
= GET_MODE_SIZE (TYPE_MODE (vectype
)) - 1;
1815 /* FORNOW: use the same mask to test all potentially unaligned
1816 references in the loop. The vectorizer currently supports
1817 a single vector size, see the reference to
1818 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
1819 vectorization factor is computed. */
1820 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo
)
1821 || LOOP_VINFO_PTR_MASK (loop_vinfo
) == mask
);
1822 LOOP_VINFO_PTR_MASK (loop_vinfo
) = mask
;
1823 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).safe_push (
1828 /* Versioning requires at least one misaligned data reference. */
1829 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo
))
1830 do_versioning
= false;
1831 else if (!do_versioning
)
1832 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).truncate (0);
1837 vec
<gimple
> may_misalign_stmts
1838 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
);
1841 /* It can now be assumed that the data references in the statements
1842 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
1843 of the loop being vectorized. */
1844 FOR_EACH_VEC_ELT (may_misalign_stmts
, i
, stmt
)
1846 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
1847 dr
= STMT_VINFO_DATA_REF (stmt_info
);
1848 SET_DR_MISALIGNMENT (dr
, 0);
1849 if (dump_enabled_p ())
1850 dump_printf_loc (MSG_NOTE
, vect_location
,
1851 "Alignment of access forced using versioning.");
1854 if (dump_enabled_p ())
1855 dump_printf_loc (MSG_NOTE
, vect_location
,
1856 "Versioning for alignment will be applied.");
1858 /* Peeling and versioning can't be done together at this time. */
1859 gcc_assert (! (do_peeling
&& do_versioning
));
1861 stat
= vect_verify_datarefs_alignment (loop_vinfo
, NULL
);
1866 /* This point is reached if neither peeling nor versioning is being done. */
1867 gcc_assert (! (do_peeling
|| do_versioning
));
1869 stat
= vect_verify_datarefs_alignment (loop_vinfo
, NULL
);
1874 /* Function vect_find_same_alignment_drs.
1876 Update group and alignment relations according to the chosen
1877 vectorization factor. */
1880 vect_find_same_alignment_drs (struct data_dependence_relation
*ddr
,
1881 loop_vec_info loop_vinfo
)
1884 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1885 int vectorization_factor
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
1886 struct data_reference
*dra
= DDR_A (ddr
);
1887 struct data_reference
*drb
= DDR_B (ddr
);
1888 stmt_vec_info stmtinfo_a
= vinfo_for_stmt (DR_STMT (dra
));
1889 stmt_vec_info stmtinfo_b
= vinfo_for_stmt (DR_STMT (drb
));
1890 int dra_size
= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra
))));
1891 int drb_size
= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb
))));
1892 lambda_vector dist_v
;
1893 unsigned int loop_depth
;
1895 if (DDR_ARE_DEPENDENT (ddr
) == chrec_known
)
1901 if (DDR_ARE_DEPENDENT (ddr
) == chrec_dont_know
)
1904 /* Loop-based vectorization and known data dependence. */
1905 if (DDR_NUM_DIST_VECTS (ddr
) == 0)
1908 /* Data-dependence analysis reports a distance vector of zero
1909 for data-references that overlap only in the first iteration
1910 but have different sign step (see PR45764).
1911 So as a sanity check require equal DR_STEP. */
1912 if (!operand_equal_p (DR_STEP (dra
), DR_STEP (drb
), 0))
1915 loop_depth
= index_in_loop_nest (loop
->num
, DDR_LOOP_NEST (ddr
));
1916 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr
), i
, dist_v
)
1918 int dist
= dist_v
[loop_depth
];
1920 if (dump_enabled_p ())
1921 dump_printf_loc (MSG_NOTE
, vect_location
,
1922 "dependence distance = %d.", dist
);
1924 /* Same loop iteration. */
1926 || (dist
% vectorization_factor
== 0 && dra_size
== drb_size
))
1928 /* Two references with distance zero have the same alignment. */
1929 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a
).safe_push (drb
);
1930 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b
).safe_push (dra
);
1931 if (dump_enabled_p ())
1933 dump_printf_loc (MSG_NOTE
, vect_location
,
1934 "accesses have the same alignment.");
1935 dump_printf (MSG_NOTE
,
1936 "dependence distance modulo vf == 0 between ");
1937 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dra
));
1938 dump_printf (MSG_NOTE
, " and ");
1939 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (drb
));
1946 /* Function vect_analyze_data_refs_alignment
1948 Analyze the alignment of the data-references in the loop.
1949 Return FALSE if a data reference is found that cannot be vectorized. */
1952 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo
,
1953 bb_vec_info bb_vinfo
)
1955 if (dump_enabled_p ())
1956 dump_printf_loc (MSG_NOTE
, vect_location
,
1957 "=== vect_analyze_data_refs_alignment ===");
1959 /* Mark groups of data references with same alignment using
1960 data dependence information. */
1963 vec
<ddr_p
> ddrs
= LOOP_VINFO_DDRS (loop_vinfo
);
1964 struct data_dependence_relation
*ddr
;
1967 FOR_EACH_VEC_ELT (ddrs
, i
, ddr
)
1968 vect_find_same_alignment_drs (ddr
, loop_vinfo
);
1971 if (!vect_compute_data_refs_alignment (loop_vinfo
, bb_vinfo
))
1973 if (dump_enabled_p ())
1974 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1975 "not vectorized: can't calculate alignment "
1984 /* Analyze groups of accesses: check that DR belongs to a group of
1985 accesses of legal size, step, etc. Detect gaps, single element
1986 interleaving, and other special cases. Set grouped access info.
1987 Collect groups of strided stores for further use in SLP analysis. */
1990 vect_analyze_group_access (struct data_reference
*dr
)
1992 tree step
= DR_STEP (dr
);
1993 tree scalar_type
= TREE_TYPE (DR_REF (dr
));
1994 HOST_WIDE_INT type_size
= TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type
));
1995 gimple stmt
= DR_STMT (dr
);
1996 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
1997 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
1998 bb_vec_info bb_vinfo
= STMT_VINFO_BB_VINFO (stmt_info
);
1999 HOST_WIDE_INT dr_step
= TREE_INT_CST_LOW (step
);
2000 HOST_WIDE_INT groupsize
, last_accessed_element
= 1;
2001 bool slp_impossible
= false;
2002 struct loop
*loop
= NULL
;
2005 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
2007 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2008 size of the interleaving group (including gaps). */
2009 groupsize
= absu_hwi (dr_step
) / type_size
;
2011 /* Not consecutive access is possible only if it is a part of interleaving. */
2012 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)))
2014 /* Check if it this DR is a part of interleaving, and is a single
2015 element of the group that is accessed in the loop. */
2017 /* Gaps are supported only for loads. STEP must be a multiple of the type
2018 size. The size of the group must be a power of 2. */
2020 && (dr_step
% type_size
) == 0
2022 && exact_log2 (groupsize
) != -1)
2024 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) = stmt
;
2025 GROUP_SIZE (vinfo_for_stmt (stmt
)) = groupsize
;
2026 if (dump_enabled_p ())
2028 dump_printf_loc (MSG_NOTE
, vect_location
,
2029 "Detected single element interleaving ");
2030 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dr
));
2031 dump_printf (MSG_NOTE
, " step ");
2032 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, step
);
2037 if (dump_enabled_p ())
2038 dump_printf_loc (MSG_NOTE
, vect_location
,
2039 "Data access with gaps requires scalar "
2043 if (dump_enabled_p ())
2044 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2045 "Peeling for outer loop is not"
2050 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
) = true;
2056 if (dump_enabled_p ())
2058 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2059 "not consecutive access ");
2060 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
2065 /* Mark the statement as unvectorizable. */
2066 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr
))) = false;
2073 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) == stmt
)
2075 /* First stmt in the interleaving chain. Check the chain. */
2076 gimple next
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt
));
2077 struct data_reference
*data_ref
= dr
;
2078 unsigned int count
= 1;
2079 tree prev_init
= DR_INIT (data_ref
);
2081 HOST_WIDE_INT diff
, gaps
= 0;
2082 unsigned HOST_WIDE_INT count_in_bytes
;
2086 /* Skip same data-refs. In case that two or more stmts share
2087 data-ref (supported only for loads), we vectorize only the first
2088 stmt, and the rest get their vectorized loads from the first
2090 if (!tree_int_cst_compare (DR_INIT (data_ref
),
2091 DR_INIT (STMT_VINFO_DATA_REF (
2092 vinfo_for_stmt (next
)))))
2094 if (DR_IS_WRITE (data_ref
))
2096 if (dump_enabled_p ())
2097 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2098 "Two store stmts share the same dr.");
2102 /* For load use the same data-ref load. */
2103 GROUP_SAME_DR_STMT (vinfo_for_stmt (next
)) = prev
;
2106 next
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (next
));
2111 data_ref
= STMT_VINFO_DATA_REF (vinfo_for_stmt (next
));
2113 /* All group members have the same STEP by construction. */
2114 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref
), step
, 0));
2116 /* Check that the distance between two accesses is equal to the type
2117 size. Otherwise, we have gaps. */
2118 diff
= (TREE_INT_CST_LOW (DR_INIT (data_ref
))
2119 - TREE_INT_CST_LOW (prev_init
)) / type_size
;
2122 /* FORNOW: SLP of accesses with gaps is not supported. */
2123 slp_impossible
= true;
2124 if (DR_IS_WRITE (data_ref
))
2126 if (dump_enabled_p ())
2127 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2128 "interleaved store with gaps");
2135 last_accessed_element
+= diff
;
2137 /* Store the gap from the previous member of the group. If there is no
2138 gap in the access, GROUP_GAP is always 1. */
2139 GROUP_GAP (vinfo_for_stmt (next
)) = diff
;
2141 prev_init
= DR_INIT (data_ref
);
2142 next
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (next
));
2143 /* Count the number of data-refs in the chain. */
2147 /* COUNT is the number of accesses found, we multiply it by the size of
2148 the type to get COUNT_IN_BYTES. */
2149 count_in_bytes
= type_size
* count
;
2151 /* Check that the size of the interleaving (including gaps) is not
2152 greater than STEP. */
2154 && absu_hwi (dr_step
) < count_in_bytes
+ gaps
* type_size
)
2156 if (dump_enabled_p ())
2158 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2159 "interleaving size is greater than step for ");
2160 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, DR_REF (dr
));
2165 /* Check that the size of the interleaving is equal to STEP for stores,
2166 i.e., that there are no gaps. */
2168 && absu_hwi (dr_step
) != count_in_bytes
)
2170 if (DR_IS_READ (dr
))
2172 slp_impossible
= true;
2173 /* There is a gap after the last load in the group. This gap is a
2174 difference between the groupsize and the number of elements.
2175 When there is no gap, this difference should be 0. */
2176 GROUP_GAP (vinfo_for_stmt (stmt
)) = groupsize
- count
;
2180 if (dump_enabled_p ())
2181 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2182 "interleaved store with gaps");
2187 /* Check that STEP is a multiple of type size. */
2189 && (dr_step
% type_size
) != 0)
2191 if (dump_enabled_p ())
2193 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2194 "step is not a multiple of type size: step ");
2195 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, step
);
2196 dump_printf (MSG_MISSED_OPTIMIZATION
, " size ");
2197 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
2198 TYPE_SIZE_UNIT (scalar_type
));
2206 GROUP_SIZE (vinfo_for_stmt (stmt
)) = groupsize
;
2207 if (dump_enabled_p ())
2208 dump_printf_loc (MSG_NOTE
, vect_location
,
2209 "Detected interleaving of size %d", (int)groupsize
);
2211 /* SLP: create an SLP data structure for every interleaving group of
2212 stores for further analysis in vect_analyse_slp. */
2213 if (DR_IS_WRITE (dr
) && !slp_impossible
)
2216 LOOP_VINFO_GROUPED_STORES (loop_vinfo
).safe_push (stmt
);
2218 BB_VINFO_GROUPED_STORES (bb_vinfo
).safe_push (stmt
);
2221 /* There is a gap in the end of the group. */
2222 if (groupsize
- last_accessed_element
> 0 && loop_vinfo
)
2224 if (dump_enabled_p ())
2225 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2226 "Data access with gaps requires scalar "
2230 if (dump_enabled_p ())
2231 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2232 "Peeling for outer loop is not supported");
2236 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
) = true;
2244 /* Analyze the access pattern of the data-reference DR.
2245 In case of non-consecutive accesses call vect_analyze_group_access() to
2246 analyze groups of accesses. */
2249 vect_analyze_data_ref_access (struct data_reference
*dr
)
2251 tree step
= DR_STEP (dr
);
2252 tree scalar_type
= TREE_TYPE (DR_REF (dr
));
2253 gimple stmt
= DR_STMT (dr
);
2254 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
2255 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
2256 struct loop
*loop
= NULL
;
2259 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
2261 if (loop_vinfo
&& !step
)
2263 if (dump_enabled_p ())
2264 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2265 "bad data-ref access in loop");
2269 /* Allow invariant loads in not nested loops. */
2270 if (loop_vinfo
&& integer_zerop (step
))
2272 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) = NULL
;
2273 if (nested_in_vect_loop_p (loop
, stmt
))
2275 if (dump_enabled_p ())
2276 dump_printf_loc (MSG_NOTE
, vect_location
,
2277 "zero step in inner loop of nest");
2280 return DR_IS_READ (dr
);
2283 if (loop
&& nested_in_vect_loop_p (loop
, stmt
))
2285 /* Interleaved accesses are not yet supported within outer-loop
2286 vectorization for references in the inner-loop. */
2287 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) = NULL
;
2289 /* For the rest of the analysis we use the outer-loop step. */
2290 step
= STMT_VINFO_DR_STEP (stmt_info
);
2291 if (integer_zerop (step
))
2293 if (dump_enabled_p ())
2294 dump_printf_loc (MSG_NOTE
, vect_location
,
2295 "zero step in outer loop.");
2296 if (DR_IS_READ (dr
))
2304 if (TREE_CODE (step
) == INTEGER_CST
)
2306 HOST_WIDE_INT dr_step
= TREE_INT_CST_LOW (step
);
2307 if (!tree_int_cst_compare (step
, TYPE_SIZE_UNIT (scalar_type
))
2309 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type
), -dr_step
)))
2311 /* Mark that it is not interleaving. */
2312 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) = NULL
;
2317 if (loop
&& nested_in_vect_loop_p (loop
, stmt
))
2319 if (dump_enabled_p ())
2320 dump_printf_loc (MSG_NOTE
, vect_location
,
2321 "grouped access in outer loop.");
2325 /* Assume this is a DR handled by non-constant strided load case. */
2326 if (TREE_CODE (step
) != INTEGER_CST
)
2327 return STMT_VINFO_STRIDE_LOAD_P (stmt_info
);
2329 /* Not consecutive access - check if it's a part of interleaving group. */
2330 return vect_analyze_group_access (dr
);
2335 /* A helper function used in the comparator function to sort data
2336 references. T1 and T2 are two data references to be compared.
2337 The function returns -1, 0, or 1. */
2340 compare_tree (tree t1
, tree t2
)
2343 enum tree_code code
;
2354 if (TREE_CODE (t1
) != TREE_CODE (t2
))
2355 return TREE_CODE (t1
) < TREE_CODE (t2
) ? -1 : 1;
2357 code
= TREE_CODE (t1
);
2360 /* For const values, we can just use hash values for comparisons. */
2368 hashval_t h1
= iterative_hash_expr (t1
, 0);
2369 hashval_t h2
= iterative_hash_expr (t2
, 0);
2371 return h1
< h2
? -1 : 1;
2376 cmp
= compare_tree (SSA_NAME_VAR (t1
), SSA_NAME_VAR (t2
));
2380 if (SSA_NAME_VERSION (t1
) != SSA_NAME_VERSION (t2
))
2381 return SSA_NAME_VERSION (t1
) < SSA_NAME_VERSION (t2
) ? -1 : 1;
2385 tclass
= TREE_CODE_CLASS (code
);
2387 /* For var-decl, we could compare their UIDs. */
2388 if (tclass
== tcc_declaration
)
2390 if (DECL_UID (t1
) != DECL_UID (t2
))
2391 return DECL_UID (t1
) < DECL_UID (t2
) ? -1 : 1;
2395 /* For expressions with operands, compare their operands recursively. */
2396 for (i
= TREE_OPERAND_LENGTH (t1
) - 1; i
>= 0; --i
)
2398 cmp
= compare_tree (TREE_OPERAND (t1
, i
), TREE_OPERAND (t2
, i
));
2408 /* Compare two data-references DRA and DRB to group them into chunks
2409 suitable for grouping. */
2412 dr_group_sort_cmp (const void *dra_
, const void *drb_
)
2414 data_reference_p dra
= *(data_reference_p
*)const_cast<void *>(dra_
);
2415 data_reference_p drb
= *(data_reference_p
*)const_cast<void *>(drb_
);
2418 /* Stabilize sort. */
2422 /* Ordering of DRs according to base. */
2423 if (!operand_equal_p (DR_BASE_ADDRESS (dra
), DR_BASE_ADDRESS (drb
), 0))
2425 cmp
= compare_tree (DR_BASE_ADDRESS (dra
), DR_BASE_ADDRESS (drb
));
2430 /* And according to DR_OFFSET. */
2431 if (!dr_equal_offsets_p (dra
, drb
))
2433 cmp
= compare_tree (DR_OFFSET (dra
), DR_OFFSET (drb
));
2438 /* Put reads before writes. */
2439 if (DR_IS_READ (dra
) != DR_IS_READ (drb
))
2440 return DR_IS_READ (dra
) ? -1 : 1;
2442 /* Then sort after access size. */
2443 if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra
))),
2444 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb
))), 0))
2446 cmp
= compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra
))),
2447 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb
))));
2452 /* And after step. */
2453 if (!operand_equal_p (DR_STEP (dra
), DR_STEP (drb
), 0))
2455 cmp
= compare_tree (DR_STEP (dra
), DR_STEP (drb
));
2460 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2461 cmp
= tree_int_cst_compare (DR_INIT (dra
), DR_INIT (drb
));
2463 return gimple_uid (DR_STMT (dra
)) < gimple_uid (DR_STMT (drb
)) ? -1 : 1;
2467 /* Function vect_analyze_data_ref_accesses.
2469 Analyze the access pattern of all the data references in the loop.
2471 FORNOW: the only access pattern that is considered vectorizable is a
2472 simple step 1 (consecutive) access.
2474 FORNOW: handle only arrays and pointer accesses. */
2477 vect_analyze_data_ref_accesses (loop_vec_info loop_vinfo
, bb_vec_info bb_vinfo
)
2480 vec
<data_reference_p
> datarefs
;
2481 struct data_reference
*dr
;
2483 if (dump_enabled_p ())
2484 dump_printf_loc (MSG_NOTE
, vect_location
,
2485 "=== vect_analyze_data_ref_accesses ===");
2488 datarefs
= LOOP_VINFO_DATAREFS (loop_vinfo
);
2490 datarefs
= BB_VINFO_DATAREFS (bb_vinfo
);
2492 if (datarefs
.is_empty ())
2495 /* Sort the array of datarefs to make building the interleaving chains
2497 qsort (datarefs
.address(), datarefs
.length (),
2498 sizeof (data_reference_p
), dr_group_sort_cmp
);
2500 /* Build the interleaving chains. */
2501 for (i
= 0; i
< datarefs
.length () - 1;)
2503 data_reference_p dra
= datarefs
[i
];
2504 stmt_vec_info stmtinfo_a
= vinfo_for_stmt (DR_STMT (dra
));
2505 stmt_vec_info lastinfo
= NULL
;
2506 for (i
= i
+ 1; i
< datarefs
.length (); ++i
)
2508 data_reference_p drb
= datarefs
[i
];
2509 stmt_vec_info stmtinfo_b
= vinfo_for_stmt (DR_STMT (drb
));
2511 /* ??? Imperfect sorting (non-compatible types, non-modulo
2512 accesses, same accesses) can lead to a group to be artificially
2513 split here as we don't just skip over those. If it really
2514 matters we can push those to a worklist and re-iterate
2515 over them. The we can just skip ahead to the next DR here. */
2517 /* Check that the data-refs have same first location (except init)
2518 and they are both either store or load (not load and store). */
2519 if (DR_IS_READ (dra
) != DR_IS_READ (drb
)
2520 || !operand_equal_p (DR_BASE_ADDRESS (dra
),
2521 DR_BASE_ADDRESS (drb
), 0)
2522 || !dr_equal_offsets_p (dra
, drb
))
2525 /* Check that the data-refs have the same constant size and step. */
2526 tree sza
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra
)));
2527 tree szb
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb
)));
2528 if (!host_integerp (sza
, 1)
2529 || !host_integerp (szb
, 1)
2530 || !tree_int_cst_equal (sza
, szb
)
2531 || !host_integerp (DR_STEP (dra
), 0)
2532 || !host_integerp (DR_STEP (drb
), 0)
2533 || !tree_int_cst_equal (DR_STEP (dra
), DR_STEP (drb
)))
2536 /* Do not place the same access in the interleaving chain twice. */
2537 if (tree_int_cst_compare (DR_INIT (dra
), DR_INIT (drb
)) == 0)
2540 /* Check the types are compatible.
2541 ??? We don't distinguish this during sorting. */
2542 if (!types_compatible_p (TREE_TYPE (DR_REF (dra
)),
2543 TREE_TYPE (DR_REF (drb
))))
2546 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2547 HOST_WIDE_INT init_a
= TREE_INT_CST_LOW (DR_INIT (dra
));
2548 HOST_WIDE_INT init_b
= TREE_INT_CST_LOW (DR_INIT (drb
));
2549 gcc_assert (init_a
< init_b
);
2551 /* If init_b == init_a + the size of the type * k, we have an
2552 interleaving, and DRA is accessed before DRB. */
2553 HOST_WIDE_INT type_size_a
= TREE_INT_CST_LOW (sza
);
2554 if ((init_b
- init_a
) % type_size_a
!= 0)
2557 /* The step (if not zero) is greater than the difference between
2558 data-refs' inits. This splits groups into suitable sizes. */
2559 HOST_WIDE_INT step
= TREE_INT_CST_LOW (DR_STEP (dra
));
2560 if (step
!= 0 && step
<= (init_b
- init_a
))
2563 if (dump_enabled_p ())
2565 dump_printf_loc (MSG_NOTE
, vect_location
,
2566 "Detected interleaving ");
2567 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dra
));
2568 dump_printf (MSG_NOTE
, " and ");
2569 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (drb
));
2572 /* Link the found element into the group list. */
2573 if (!GROUP_FIRST_ELEMENT (stmtinfo_a
))
2575 GROUP_FIRST_ELEMENT (stmtinfo_a
) = DR_STMT (dra
);
2576 lastinfo
= stmtinfo_a
;
2578 GROUP_FIRST_ELEMENT (stmtinfo_b
) = DR_STMT (dra
);
2579 GROUP_NEXT_ELEMENT (lastinfo
) = DR_STMT (drb
);
2580 lastinfo
= stmtinfo_b
;
2584 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
2585 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr
)))
2586 && !vect_analyze_data_ref_access (dr
))
2588 if (dump_enabled_p ())
2589 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2590 "not vectorized: complicated access pattern.");
2594 /* Mark the statement as not vectorizable. */
2595 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr
))) = false;
2605 /* Function vect_prune_runtime_alias_test_list.
2607 Prune a list of ddrs to be tested at run-time by versioning for alias.
2608 Return FALSE if resulting list of ddrs is longer then allowed by
2609 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
2612 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo
)
2615 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo
);
2618 if (dump_enabled_p ())
2619 dump_printf_loc (MSG_NOTE
, vect_location
,
2620 "=== vect_prune_runtime_alias_test_list ===");
2622 for (i
= 0; i
< ddrs
.length (); )
2630 for (j
= 0; j
< i
; j
++)
2632 ddr_p ddr_j
= ddrs
[j
];
2634 if (vect_vfa_range_equal (ddr_i
, ddr_j
))
2636 if (dump_enabled_p ())
2638 dump_printf_loc (MSG_NOTE
, vect_location
,
2639 "found equal ranges ");
2640 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (DDR_A (ddr_i
)));
2641 dump_printf (MSG_NOTE
, ", ");
2642 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (DDR_B (ddr_i
)));
2643 dump_printf (MSG_NOTE
, " and ");
2644 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (DDR_A (ddr_j
)));
2645 dump_printf (MSG_NOTE
, ", ");
2646 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (DDR_B (ddr_j
)));
2655 ddrs
.ordered_remove (i
);
2661 if (ddrs
.length () >
2662 (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS
))
2664 if (dump_enabled_p ())
2666 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2667 "disable versioning for alias - max number of "
2668 "generated checks exceeded.");
2671 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo
).truncate (0);
2679 /* Check whether a non-affine read in stmt is suitable for gather load
2680 and if so, return a builtin decl for that operation. */
2683 vect_check_gather (gimple stmt
, loop_vec_info loop_vinfo
, tree
*basep
,
2684 tree
*offp
, int *scalep
)
2686 HOST_WIDE_INT scale
= 1, pbitpos
, pbitsize
;
2687 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
2688 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
2689 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
2690 tree offtype
= NULL_TREE
;
2691 tree decl
, base
, off
;
2692 enum machine_mode pmode
;
2693 int punsignedp
, pvolatilep
;
2695 /* The gather builtins need address of the form
2696 loop_invariant + vector * {1, 2, 4, 8}
2698 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
2699 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
2700 of loop invariants/SSA_NAMEs defined in the loop, with casts,
2701 multiplications and additions in it. To get a vector, we need
2702 a single SSA_NAME that will be defined in the loop and will
2703 contain everything that is not loop invariant and that can be
2704 vectorized. The following code attempts to find such a preexistng
2705 SSA_NAME OFF and put the loop invariants into a tree BASE
2706 that can be gimplified before the loop. */
2707 base
= get_inner_reference (DR_REF (dr
), &pbitsize
, &pbitpos
, &off
,
2708 &pmode
, &punsignedp
, &pvolatilep
, false);
2709 gcc_assert (base
!= NULL_TREE
&& (pbitpos
% BITS_PER_UNIT
) == 0);
2711 if (TREE_CODE (base
) == MEM_REF
)
2713 if (!integer_zerop (TREE_OPERAND (base
, 1)))
2715 if (off
== NULL_TREE
)
2717 double_int moff
= mem_ref_offset (base
);
2718 off
= double_int_to_tree (sizetype
, moff
);
2721 off
= size_binop (PLUS_EXPR
, off
,
2722 fold_convert (sizetype
, TREE_OPERAND (base
, 1)));
2724 base
= TREE_OPERAND (base
, 0);
2727 base
= build_fold_addr_expr (base
);
2729 if (off
== NULL_TREE
)
2730 off
= size_zero_node
;
2732 /* If base is not loop invariant, either off is 0, then we start with just
2733 the constant offset in the loop invariant BASE and continue with base
2734 as OFF, otherwise give up.
2735 We could handle that case by gimplifying the addition of base + off
2736 into some SSA_NAME and use that as off, but for now punt. */
2737 if (!expr_invariant_in_loop_p (loop
, base
))
2739 if (!integer_zerop (off
))
2742 base
= size_int (pbitpos
/ BITS_PER_UNIT
);
2744 /* Otherwise put base + constant offset into the loop invariant BASE
2745 and continue with OFF. */
2748 base
= fold_convert (sizetype
, base
);
2749 base
= size_binop (PLUS_EXPR
, base
, size_int (pbitpos
/ BITS_PER_UNIT
));
2752 /* OFF at this point may be either a SSA_NAME or some tree expression
2753 from get_inner_reference. Try to peel off loop invariants from it
2754 into BASE as long as possible. */
2756 while (offtype
== NULL_TREE
)
2758 enum tree_code code
;
2759 tree op0
, op1
, add
= NULL_TREE
;
2761 if (TREE_CODE (off
) == SSA_NAME
)
2763 gimple def_stmt
= SSA_NAME_DEF_STMT (off
);
2765 if (expr_invariant_in_loop_p (loop
, off
))
2768 if (gimple_code (def_stmt
) != GIMPLE_ASSIGN
)
2771 op0
= gimple_assign_rhs1 (def_stmt
);
2772 code
= gimple_assign_rhs_code (def_stmt
);
2773 op1
= gimple_assign_rhs2 (def_stmt
);
2777 if (get_gimple_rhs_class (TREE_CODE (off
)) == GIMPLE_TERNARY_RHS
)
2779 code
= TREE_CODE (off
);
2780 extract_ops_from_tree (off
, &code
, &op0
, &op1
);
2784 case POINTER_PLUS_EXPR
:
2786 if (expr_invariant_in_loop_p (loop
, op0
))
2791 add
= fold_convert (sizetype
, add
);
2793 add
= size_binop (MULT_EXPR
, add
, size_int (scale
));
2794 base
= size_binop (PLUS_EXPR
, base
, add
);
2797 if (expr_invariant_in_loop_p (loop
, op1
))
2805 if (expr_invariant_in_loop_p (loop
, op1
))
2807 add
= fold_convert (sizetype
, op1
);
2808 add
= size_binop (MINUS_EXPR
, size_zero_node
, add
);
2814 if (scale
== 1 && host_integerp (op1
, 0))
2816 scale
= tree_low_cst (op1
, 0);
2825 if (!POINTER_TYPE_P (TREE_TYPE (op0
))
2826 && !INTEGRAL_TYPE_P (TREE_TYPE (op0
)))
2828 if (TYPE_PRECISION (TREE_TYPE (op0
))
2829 == TYPE_PRECISION (TREE_TYPE (off
)))
2834 if (TYPE_PRECISION (TREE_TYPE (op0
))
2835 < TYPE_PRECISION (TREE_TYPE (off
)))
2838 offtype
= TREE_TYPE (off
);
2849 /* If at the end OFF still isn't a SSA_NAME or isn't
2850 defined in the loop, punt. */
2851 if (TREE_CODE (off
) != SSA_NAME
2852 || expr_invariant_in_loop_p (loop
, off
))
2855 if (offtype
== NULL_TREE
)
2856 offtype
= TREE_TYPE (off
);
2858 decl
= targetm
.vectorize
.builtin_gather (STMT_VINFO_VECTYPE (stmt_info
),
2860 if (decl
== NULL_TREE
)
2872 /* Function vect_analyze_data_refs.
2874 Find all the data references in the loop or basic block.
2876 The general structure of the analysis of data refs in the vectorizer is as
2878 1- vect_analyze_data_refs(loop/bb): call
2879 compute_data_dependences_for_loop/bb to find and analyze all data-refs
2880 in the loop/bb and their dependences.
2881 2- vect_analyze_dependences(): apply dependence testing using ddrs.
2882 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
2883 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
2888 vect_analyze_data_refs (loop_vec_info loop_vinfo
,
2889 bb_vec_info bb_vinfo
,
2892 struct loop
*loop
= NULL
;
2893 basic_block bb
= NULL
;
2895 vec
<data_reference_p
> datarefs
;
2896 struct data_reference
*dr
;
2899 if (dump_enabled_p ())
2900 dump_printf_loc (MSG_NOTE
, vect_location
,
2901 "=== vect_analyze_data_refs ===\n");
2905 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
2906 if (!find_loop_nest (loop
, &LOOP_VINFO_LOOP_NEST (loop_vinfo
))
2907 || find_data_references_in_loop
2908 (loop
, &LOOP_VINFO_DATAREFS (loop_vinfo
)))
2910 if (dump_enabled_p ())
2911 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2912 "not vectorized: loop contains function calls"
2913 " or data references that cannot be analyzed");
2917 datarefs
= LOOP_VINFO_DATAREFS (loop_vinfo
);
2921 gimple_stmt_iterator gsi
;
2923 bb
= BB_VINFO_BB (bb_vinfo
);
2924 for (gsi
= gsi_start_bb (bb
); !gsi_end_p (gsi
); gsi_next (&gsi
))
2926 gimple stmt
= gsi_stmt (gsi
);
2927 if (!find_data_references_in_stmt (NULL
, stmt
,
2928 &BB_VINFO_DATAREFS (bb_vinfo
)))
2930 /* Mark the rest of the basic-block as unvectorizable. */
2931 for (; !gsi_end_p (gsi
); gsi_next (&gsi
))
2933 stmt
= gsi_stmt (gsi
);
2934 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt
)) = false;
2940 datarefs
= BB_VINFO_DATAREFS (bb_vinfo
);
2943 /* Go through the data-refs, check that the analysis succeeded. Update
2944 pointer from stmt_vec_info struct to DR and vectype. */
2946 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
2949 stmt_vec_info stmt_info
;
2950 tree base
, offset
, init
;
2951 bool gather
= false;
2952 bool simd_lane_access
= false;
2956 if (!dr
|| !DR_REF (dr
))
2958 if (dump_enabled_p ())
2959 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2960 "not vectorized: unhandled data-ref ");
2964 stmt
= DR_STMT (dr
);
2965 stmt_info
= vinfo_for_stmt (stmt
);
2967 /* Discard clobbers from the dataref vector. We will remove
2968 clobber stmts during vectorization. */
2969 if (gimple_clobber_p (stmt
))
2971 if (i
== datarefs
.length () - 1)
2976 datarefs
[i
] = datarefs
.pop ();
2980 /* Check that analysis of the data-ref succeeded. */
2981 if (!DR_BASE_ADDRESS (dr
) || !DR_OFFSET (dr
) || !DR_INIT (dr
)
2986 && !TREE_THIS_VOLATILE (DR_REF (dr
))
2987 && targetm
.vectorize
.builtin_gather
!= NULL
;
2988 bool maybe_simd_lane_access
2989 = loop_vinfo
&& loop
->simduid
;
2991 /* If target supports vector gather loads, or if this might be
2992 a SIMD lane access, see if they can't be used. */
2994 && (maybe_gather
|| maybe_simd_lane_access
)
2995 && !nested_in_vect_loop_p (loop
, stmt
))
2997 struct data_reference
*newdr
2998 = create_data_ref (NULL
, loop_containing_stmt (stmt
),
2999 DR_REF (dr
), stmt
, true);
3000 gcc_assert (newdr
!= NULL
&& DR_REF (newdr
));
3001 if (DR_BASE_ADDRESS (newdr
)
3002 && DR_OFFSET (newdr
)
3005 && integer_zerop (DR_STEP (newdr
)))
3007 if (maybe_simd_lane_access
)
3009 tree off
= DR_OFFSET (newdr
);
3011 if (TREE_CODE (DR_INIT (newdr
)) == INTEGER_CST
3012 && TREE_CODE (off
) == MULT_EXPR
3013 && host_integerp (TREE_OPERAND (off
, 1), 1))
3015 tree step
= TREE_OPERAND (off
, 1);
3016 off
= TREE_OPERAND (off
, 0);
3018 if (CONVERT_EXPR_P (off
)
3019 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off
,
3021 < TYPE_PRECISION (TREE_TYPE (off
)))
3022 off
= TREE_OPERAND (off
, 0);
3023 if (TREE_CODE (off
) == SSA_NAME
)
3025 gimple def
= SSA_NAME_DEF_STMT (off
);
3026 tree reft
= TREE_TYPE (DR_REF (newdr
));
3027 if (gimple_call_internal_p (def
)
3028 && gimple_call_internal_fn (def
)
3029 == IFN_GOMP_SIMD_LANE
)
3031 tree arg
= gimple_call_arg (def
, 0);
3032 gcc_assert (TREE_CODE (arg
) == SSA_NAME
);
3033 arg
= SSA_NAME_VAR (arg
);
3034 if (arg
== loop
->simduid
3036 && tree_int_cst_equal
3037 (TYPE_SIZE_UNIT (reft
),
3040 DR_OFFSET (newdr
) = ssize_int (0);
3041 DR_STEP (newdr
) = step
;
3043 simd_lane_access
= true;
3049 if (!simd_lane_access
&& maybe_gather
)
3055 if (!gather
&& !simd_lane_access
)
3056 free_data_ref (newdr
);
3059 if (!gather
&& !simd_lane_access
)
3061 if (dump_enabled_p ())
3063 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3064 "not vectorized: data ref analysis "
3066 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3076 if (TREE_CODE (DR_BASE_ADDRESS (dr
)) == INTEGER_CST
)
3078 if (dump_enabled_p ())
3079 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3080 "not vectorized: base addr of dr is a "
3086 if (gather
|| simd_lane_access
)
3091 if (TREE_THIS_VOLATILE (DR_REF (dr
)))
3093 if (dump_enabled_p ())
3095 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3096 "not vectorized: volatile type ");
3097 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3106 if (stmt_can_throw_internal (stmt
))
3108 if (dump_enabled_p ())
3110 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3111 "not vectorized: statement can throw an "
3113 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3119 if (gather
|| simd_lane_access
)
3124 if (TREE_CODE (DR_REF (dr
)) == COMPONENT_REF
3125 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr
), 1)))
3127 if (dump_enabled_p ())
3129 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3130 "not vectorized: statement is bitfield "
3132 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3138 if (gather
|| simd_lane_access
)
3143 base
= unshare_expr (DR_BASE_ADDRESS (dr
));
3144 offset
= unshare_expr (DR_OFFSET (dr
));
3145 init
= unshare_expr (DR_INIT (dr
));
3147 if (is_gimple_call (stmt
))
3149 if (dump_enabled_p ())
3151 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3152 "not vectorized: dr in a call ");
3153 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3159 if (gather
|| simd_lane_access
)
3164 /* Update DR field in stmt_vec_info struct. */
3166 /* If the dataref is in an inner-loop of the loop that is considered for
3167 for vectorization, we also want to analyze the access relative to
3168 the outer-loop (DR contains information only relative to the
3169 inner-most enclosing loop). We do that by building a reference to the
3170 first location accessed by the inner-loop, and analyze it relative to
3172 if (loop
&& nested_in_vect_loop_p (loop
, stmt
))
3174 tree outer_step
, outer_base
, outer_init
;
3175 HOST_WIDE_INT pbitsize
, pbitpos
;
3177 enum machine_mode pmode
;
3178 int punsignedp
, pvolatilep
;
3179 affine_iv base_iv
, offset_iv
;
3182 /* Build a reference to the first location accessed by the
3183 inner-loop: *(BASE+INIT). (The first location is actually
3184 BASE+INIT+OFFSET, but we add OFFSET separately later). */
3185 tree inner_base
= build_fold_indirect_ref
3186 (fold_build_pointer_plus (base
, init
));
3188 if (dump_enabled_p ())
3190 dump_printf_loc (MSG_NOTE
, vect_location
,
3191 "analyze in outer-loop: ");
3192 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, inner_base
);
3195 outer_base
= get_inner_reference (inner_base
, &pbitsize
, &pbitpos
,
3196 &poffset
, &pmode
, &punsignedp
, &pvolatilep
, false);
3197 gcc_assert (outer_base
!= NULL_TREE
);
3199 if (pbitpos
% BITS_PER_UNIT
!= 0)
3201 if (dump_enabled_p ())
3202 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3203 "failed: bit offset alignment.\n");
3207 outer_base
= build_fold_addr_expr (outer_base
);
3208 if (!simple_iv (loop
, loop_containing_stmt (stmt
), outer_base
,
3211 if (dump_enabled_p ())
3212 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3213 "failed: evolution of base is not affine.\n");
3220 poffset
= fold_build2 (PLUS_EXPR
, TREE_TYPE (offset
), offset
,
3228 offset_iv
.base
= ssize_int (0);
3229 offset_iv
.step
= ssize_int (0);
3231 else if (!simple_iv (loop
, loop_containing_stmt (stmt
), poffset
,
3234 if (dump_enabled_p ())
3235 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3236 "evolution of offset is not affine.\n");
3240 outer_init
= ssize_int (pbitpos
/ BITS_PER_UNIT
);
3241 split_constant_offset (base_iv
.base
, &base_iv
.base
, &dinit
);
3242 outer_init
= size_binop (PLUS_EXPR
, outer_init
, dinit
);
3243 split_constant_offset (offset_iv
.base
, &offset_iv
.base
, &dinit
);
3244 outer_init
= size_binop (PLUS_EXPR
, outer_init
, dinit
);
3246 outer_step
= size_binop (PLUS_EXPR
,
3247 fold_convert (ssizetype
, base_iv
.step
),
3248 fold_convert (ssizetype
, offset_iv
.step
));
3250 STMT_VINFO_DR_STEP (stmt_info
) = outer_step
;
3251 /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
3252 STMT_VINFO_DR_BASE_ADDRESS (stmt_info
) = base_iv
.base
;
3253 STMT_VINFO_DR_INIT (stmt_info
) = outer_init
;
3254 STMT_VINFO_DR_OFFSET (stmt_info
) =
3255 fold_convert (ssizetype
, offset_iv
.base
);
3256 STMT_VINFO_DR_ALIGNED_TO (stmt_info
) =
3257 size_int (highest_pow2_factor (offset_iv
.base
));
3259 if (dump_enabled_p ())
3261 dump_printf_loc (MSG_NOTE
, vect_location
,
3262 "\touter base_address: ");
3263 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3264 STMT_VINFO_DR_BASE_ADDRESS (stmt_info
));
3265 dump_printf (MSG_NOTE
, "\n\touter offset from base address: ");
3266 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3267 STMT_VINFO_DR_OFFSET (stmt_info
));
3268 dump_printf (MSG_NOTE
,
3269 "\n\touter constant offset from base address: ");
3270 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3271 STMT_VINFO_DR_INIT (stmt_info
));
3272 dump_printf (MSG_NOTE
, "\n\touter step: ");
3273 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3274 STMT_VINFO_DR_STEP (stmt_info
));
3275 dump_printf (MSG_NOTE
, "\n\touter aligned to: ");
3276 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3277 STMT_VINFO_DR_ALIGNED_TO (stmt_info
));
3281 if (STMT_VINFO_DATA_REF (stmt_info
))
3283 if (dump_enabled_p ())
3285 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3286 "not vectorized: more than one data ref "
3288 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3294 if (gather
|| simd_lane_access
)
3299 STMT_VINFO_DATA_REF (stmt_info
) = dr
;
3300 if (simd_lane_access
)
3302 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info
) = true;
3306 /* Set vectype for STMT. */
3307 scalar_type
= TREE_TYPE (DR_REF (dr
));
3308 STMT_VINFO_VECTYPE (stmt_info
) =
3309 get_vectype_for_scalar_type (scalar_type
);
3310 if (!STMT_VINFO_VECTYPE (stmt_info
))
3312 if (dump_enabled_p ())
3314 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3315 "not vectorized: no vectype for stmt: ");
3316 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3317 dump_printf (MSG_MISSED_OPTIMIZATION
, " scalar_type: ");
3318 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_DETAILS
,
3325 if (gather
|| simd_lane_access
)
3327 STMT_VINFO_DATA_REF (stmt_info
) = NULL
;
3334 if (dump_enabled_p ())
3336 dump_printf_loc (MSG_NOTE
, vect_location
,
3337 "got vectype for stmt: ");
3338 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, stmt
, 0);
3339 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3340 STMT_VINFO_VECTYPE (stmt_info
));
3344 /* Adjust the minimal vectorization factor according to the
3346 vf
= TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info
));
3354 gather
= 0 != vect_check_gather (stmt
, loop_vinfo
, NULL
, &off
, NULL
);
3356 && get_vectype_for_scalar_type (TREE_TYPE (off
)) == NULL_TREE
)
3360 STMT_VINFO_DATA_REF (stmt_info
) = NULL
;
3362 if (dump_enabled_p ())
3364 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3365 "not vectorized: not suitable for gather "
3367 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3373 STMT_VINFO_GATHER_P (stmt_info
) = true;
3376 && TREE_CODE (DR_STEP (dr
)) != INTEGER_CST
)
3378 if (nested_in_vect_loop_p (loop
, stmt
)
3379 || !DR_IS_READ (dr
))
3381 if (dump_enabled_p ())
3383 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3384 "not vectorized: not suitable for strided "
3386 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3390 STMT_VINFO_STRIDE_LOAD_P (stmt_info
) = true;
3394 /* If we stopped analysis at the first dataref we could not analyze
3395 when trying to vectorize a basic-block mark the rest of the datarefs
3396 as not vectorizable and truncate the vector of datarefs. That
3397 avoids spending useless time in analyzing their dependence. */
3398 if (i
!= datarefs
.length ())
3400 gcc_assert (bb_vinfo
!= NULL
);
3401 for (unsigned j
= i
; j
< datarefs
.length (); ++j
)
3403 data_reference_p dr
= datarefs
[j
];
3404 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr
))) = false;
3407 datarefs
.truncate (i
);
3414 /* Function vect_get_new_vect_var.
3416 Returns a name for a new variable. The current naming scheme appends the
3417 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
3418 the name of vectorizer generated variables, and appends that to NAME if
3422 vect_get_new_vect_var (tree type
, enum vect_var_kind var_kind
, const char *name
)
3429 case vect_simple_var
:
3432 case vect_scalar_var
:
3435 case vect_pointer_var
:
3444 char* tmp
= concat (prefix
, "_", name
, NULL
);
3445 new_vect_var
= create_tmp_reg (type
, tmp
);
3449 new_vect_var
= create_tmp_reg (type
, prefix
);
3451 return new_vect_var
;
3455 /* Function vect_create_addr_base_for_vector_ref.
3457 Create an expression that computes the address of the first memory location
3458 that will be accessed for a data reference.
3461 STMT: The statement containing the data reference.
3462 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
3463 OFFSET: Optional. If supplied, it is be added to the initial address.
3464 LOOP: Specify relative to which loop-nest should the address be computed.
3465 For example, when the dataref is in an inner-loop nested in an
3466 outer-loop that is now being vectorized, LOOP can be either the
3467 outer-loop, or the inner-loop. The first memory location accessed
3468 by the following dataref ('in' points to short):
3475 if LOOP=i_loop: &in (relative to i_loop)
3476 if LOOP=j_loop: &in+i*2B (relative to j_loop)
3479 1. Return an SSA_NAME whose value is the address of the memory location of
3480 the first vector of the data reference.
3481 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
3482 these statement(s) which define the returned SSA_NAME.
3484 FORNOW: We are only handling array accesses with step 1. */
/* vect_create_addr_base_for_vector_ref: build an expression for the first
   address accessed by the data reference of STMT, appending any statements
   needed to compute it to *NEW_STMT_LIST, and return it converted to a
   pointer-to-vectype SSA name.
   NOTE(review): lossy extract -- the OFFSET/LOOP parameter lines, several
   declarations and all braces are missing from this view; comments
   describe only the visible statements.  */
3487 vect_create_addr_base_for_vector_ref (gimple stmt
,
3488 gimple_seq
*new_stmt_list
,
3492 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
3493 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
3495 const char *base_name
;
3498 gimple_seq seq
= NULL
;
3502 tree step
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr
)));
3503 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
/* If the address is requested relative to a loop other than the one
   directly containing STMT (outer-loop vectorization), use the
   outer-loop-relative STMT_VINFO_DR_* copies of base/offset/init.  */
3505 if (loop_vinfo
&& loop
&& loop
!= (gimple_bb (stmt
))->loop_father
)
3507 struct loop
*outer_loop
= LOOP_VINFO_LOOP (loop_vinfo
);
3509 gcc_assert (nested_in_vect_loop_p (outer_loop
, stmt
));
3511 data_ref_base
= unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info
));
3512 base_offset
= unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info
));
3513 init
= unshare_expr (STMT_VINFO_DR_INIT (stmt_info
));
/* Otherwise use the dataref's own base/offset/init.  */
3517 data_ref_base
= unshare_expr (DR_BASE_ADDRESS (dr
));
3518 base_offset
= unshare_expr (DR_OFFSET (dr
));
3519 init
= unshare_expr (DR_INIT (dr
));
3523 base_name
= get_name (data_ref_base
);
3526 base_offset
= ssize_int (0);
3527 init
= ssize_int (0);
3528 base_name
= get_name (DR_REF (dr
));
3531 /* Create base_offset */
3532 base_offset
= size_binop (PLUS_EXPR
,
3533 fold_convert (sizetype
, base_offset
),
3534 fold_convert (sizetype
, init
));
/* Scale the (optional) caller OFFSET by the element size and fold it
   into base_offset.  */
3538 offset
= fold_build2 (MULT_EXPR
, sizetype
,
3539 fold_convert (sizetype
, offset
), step
);
3540 base_offset
= fold_build2 (PLUS_EXPR
, sizetype
,
3541 base_offset
, offset
);
3544 /* base + base_offset */
3546 addr_base
= fold_build_pointer_plus (data_ref_base
, base_offset
);
3549 addr_base
= build1 (ADDR_EXPR
,
3550 build_pointer_type (TREE_TYPE (DR_REF (dr
))),
3551 unshare_expr (DR_REF (dr
)));
/* Convert to a pointer-to-vectype and gimplify into SEQ, which is then
   appended to *NEW_STMT_LIST for the caller to insert.  */
3554 vect_ptr_type
= build_pointer_type (STMT_VINFO_VECTYPE (stmt_info
));
3555 addr_base
= fold_convert (vect_ptr_type
, addr_base
);
3556 dest
= vect_get_new_vect_var (vect_ptr_type
, vect_pointer_var
, base_name
);
3557 addr_base
= force_gimple_operand (addr_base
, &seq
, false, dest
);
3558 gimple_seq_add_seq (new_stmt_list
, seq
);
/* Transfer points-to info to the new SSA name, but its alignment is
   not known to carry over.  */
3560 if (DR_PTR_INFO (dr
)
3561 && TREE_CODE (addr_base
) == SSA_NAME
)
3563 duplicate_ssa_name_ptr_info (addr_base
, DR_PTR_INFO (dr
));
3565 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base
));
/* Debug dump of the created address expression.  */
3568 if (dump_enabled_p ())
3570 dump_printf_loc (MSG_NOTE
, vect_location
, "created ");
3571 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, addr_base
);
3578 /* Function vect_create_data_ref_ptr.
3580 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
3581 location accessed in the loop by STMT, along with the def-use update
3582 chain to appropriately advance the pointer through the loop iterations.
3583 Also set aliasing information for the pointer. This pointer is used by
3584 the callers to this function to create a memory reference expression for
3585 vector load/store access.
3588 1. STMT: a stmt that references memory. Expected to be of the form
3589 GIMPLE_ASSIGN <name, data-ref> or
3590 GIMPLE_ASSIGN <data-ref, name>.
3591 2. AGGR_TYPE: the type of the reference, which should be either a vector
3593 3. AT_LOOP: the loop where the vector memref is to be created.
3594 4. OFFSET (optional): an offset to be added to the initial address accessed
3595 by the data-ref in STMT.
3596 5. BSI: location where the new stmts are to be placed if there is no loop
3597 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
3598 pointing to the initial address.
3601 1. Declare a new ptr to vector_type, and have it point to the base of the
3602 data reference (initial address accessed by the data reference).
3603 For example, for vector of type V8HI, the following code is generated:
3606 ap = (v8hi *)initial_address;
3608 if OFFSET is not supplied:
3609 initial_address = &a[init];
3610 if OFFSET is supplied:
3611 initial_address = &a[init + OFFSET];
3613 Return the initial_address in INITIAL_ADDRESS.
3615 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
3616 update the pointer in each iteration of the loop.
3618 Return the increment stmt that updates the pointer in PTR_INCR.
3620 3. Set INV_P to true if the access pattern of the data reference in the
3621 vectorized loop is invariant. Set it to false otherwise.
3623 4. Return the pointer. */
/* vect_create_data_ref_ptr: create a pointer-to-AGGR_TYPE variable that
   points at the first location accessed by STMT, plus (unless ONLY_INIT)
   the IV update chain that advances it each iteration.
   NOTE(review): lossy extract -- many declarations, else-branches, braces
   and some argument lines are missing from this view; comments describe
   only the visible statements.  */
3626 vect_create_data_ref_ptr (gimple stmt
, tree aggr_type
, struct loop
*at_loop
,
3627 tree offset
, tree
*initial_address
,
3628 gimple_stmt_iterator
*gsi
, gimple
*ptr_incr
,
3629 bool only_init
, bool *inv_p
)
3631 const char *base_name
;
3632 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
3633 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
3634 struct loop
*loop
= NULL
;
3635 bool nested_in_vect_loop
= false;
3636 struct loop
*containing_loop
= NULL
;
3641 gimple_seq new_stmt_list
= NULL
;
3645 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
3647 gimple_stmt_iterator incr_gsi
;
3649 tree indx_before_incr
, indx_after_incr
;
3652 bb_vec_info bb_vinfo
= STMT_VINFO_BB_VINFO (stmt_info
);
/* Only array or vector aggregate types are expected here.  */
3654 gcc_assert (TREE_CODE (aggr_type
) == ARRAY_TYPE
3655 || TREE_CODE (aggr_type
) == VECTOR_TYPE
);
/* Loop-vectorization path: cache the loop, nesting info and preheader
   edge.  (The basic-block path asserts BB_VINFO instead.)  */
3659 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
3660 nested_in_vect_loop
= nested_in_vect_loop_p (loop
, stmt
);
3661 containing_loop
= (gimple_bb (stmt
))->loop_father
;
3662 pe
= loop_preheader_edge (loop
);
3666 gcc_assert (bb_vinfo
);
3671 /* Check the step (evolution) of the load in LOOP, and record
3672 whether it's invariant. */
3673 if (nested_in_vect_loop
)
3674 step
= STMT_VINFO_DR_STEP (stmt_info
);
3676 step
= DR_STEP (STMT_VINFO_DATA_REF (stmt_info
));
3678 if (integer_zerop (step
))
3683 /* Create an expression for the first address accessed by this load
3685 base_name
= get_name (DR_BASE_ADDRESS (dr
));
/* Debug dump describing the kind of reference being vectorized.  */
3687 if (dump_enabled_p ())
3689 tree dr_base_type
= TREE_TYPE (DR_BASE_OBJECT (dr
));
3690 dump_printf_loc (MSG_NOTE
, vect_location
,
3691 "create %s-pointer variable to type: ",
3692 tree_code_name
[(int) TREE_CODE (aggr_type
)]);
3693 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, aggr_type
);
3694 if (TREE_CODE (dr_base_type
) == ARRAY_TYPE
)
3695 dump_printf (MSG_NOTE
, " vectorizing an array ref: ");
3696 else if (TREE_CODE (dr_base_type
) == VECTOR_TYPE
)
3697 dump_printf (MSG_NOTE
, " vectorizing a vector ref: ");
3698 else if (TREE_CODE (dr_base_type
) == RECORD_TYPE
)
3699 dump_printf (MSG_NOTE
, " vectorizing a record based array ref: ");
3701 dump_printf (MSG_NOTE
, " vectorizing a pointer ref: ");
3702 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_BASE_OBJECT (dr
));
3705 /* (1) Create the new aggregate-pointer variable.
3706 Vector and array types inherit the alias set of their component
3707 type by default so we need to use a ref-all pointer if the data
3708 reference does not conflict with the created aggregated data
3709 reference because it is not addressable. */
3710 bool need_ref_all
= false;
3711 if (!alias_sets_conflict_p (get_alias_set (aggr_type
),
3712 get_alias_set (DR_REF (dr
))))
3713 need_ref_all
= true;
3714 /* Likewise for any of the data references in the stmt group. */
3715 else if (STMT_VINFO_GROUP_SIZE (stmt_info
) > 1)
/* Walk the interleaving group; if any member's alias set does not
   conflict with AGGR_TYPE's, a ref-all pointer is needed too.  */
3717 gimple orig_stmt
= STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info
);
3720 stmt_vec_info sinfo
= vinfo_for_stmt (orig_stmt
);
3721 struct data_reference
*sdr
= STMT_VINFO_DATA_REF (sinfo
);
3722 if (!alias_sets_conflict_p (get_alias_set (aggr_type
),
3723 get_alias_set (DR_REF (sdr
))))
3725 need_ref_all
= true;
3728 orig_stmt
= STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo
);
3732 aggr_ptr_type
= build_pointer_type_for_mode (aggr_type
, ptr_mode
,
3734 aggr_ptr
= vect_get_new_vect_var (aggr_ptr_type
, vect_pointer_var
, base_name
);
3737 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
3738 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
3739 def-use update cycles for the pointer: one relative to the outer-loop
3740 (LOOP), which is what steps (3) and (4) below do. The other is relative
3741 to the inner-loop (which is the inner-most loop containing the dataref),
3742 and this is done by step (5) below.
3744 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
3745 inner-most loop, and so steps (3),(4) work the same, and step (5) is
3746 redundant. Steps (3),(4) create the following:
3749 LOOP: vp1 = phi(vp0,vp2)
3755 If there is an inner-loop nested in loop, then step (5) will also be
3756 applied, and an additional update in the inner-loop will be created:
3759 LOOP: vp1 = phi(vp0,vp2)
3761 inner: vp3 = phi(vp1,vp4)
3762 vp4 = vp3 + inner_step
3768 /* (2) Calculate the initial address of the aggregate-pointer, and set
3769 the aggregate-pointer to point to it before the loop. */
3771 /* Create: (&(base[init_val+offset]) in the loop preheader. */
3773 new_temp
= vect_create_addr_base_for_vector_ref (stmt
, &new_stmt_list
,
/* Insert the address computation on the preheader edge (loop case) or
   before GSI (basic-block case).  */
3779 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, new_stmt_list
);
3780 gcc_assert (!new_bb
);
3783 gsi_insert_seq_before (gsi
, new_stmt_list
, GSI_SAME_STMT
);
3786 *initial_address
= new_temp
;
3788 /* Create: p = (aggr_type *) initial_base */
3789 if (TREE_CODE (new_temp
) != SSA_NAME
3790 || !useless_type_conversion_p (aggr_ptr_type
, TREE_TYPE (new_temp
)))
3792 vec_stmt
= gimple_build_assign (aggr_ptr
,
3793 fold_convert (aggr_ptr_type
, new_temp
));
3794 aggr_ptr_init
= make_ssa_name (aggr_ptr
, vec_stmt
);
3795 /* Copy the points-to information if it exists. */
3796 if (DR_PTR_INFO (dr
))
3797 duplicate_ssa_name_ptr_info (aggr_ptr_init
, DR_PTR_INFO (dr
));
3798 gimple_assign_set_lhs (vec_stmt
, aggr_ptr_init
);
3801 new_bb
= gsi_insert_on_edge_immediate (pe
, vec_stmt
);
3802 gcc_assert (!new_bb
);
3805 gsi_insert_before (gsi
, vec_stmt
, GSI_SAME_STMT
);
/* NEW_TEMP already has the right type; reuse it directly.  */
3808 aggr_ptr_init
= new_temp
;
3810 /* (3) Handle the updating of the aggregate-pointer inside the loop.
3811 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
3812 inner-loop nested in LOOP (during outer-loop vectorization). */
3814 /* No update in loop is required. */
3815 if (only_init
&& (!loop_vinfo
|| at_loop
== loop
))
3816 aptr
= aggr_ptr_init
;
3819 /* The step of the aggregate pointer is the type size. */
3820 tree iv_step
= TYPE_SIZE_UNIT (aggr_type
);
3821 /* One exception to the above is when the scalar step of the load in
3822 LOOP is zero. In this case the step here is also zero. */
3824 iv_step
= size_zero_node
;
3825 else if (tree_int_cst_sgn (step
) == -1)
3826 iv_step
= fold_build1 (NEGATE_EXPR
, TREE_TYPE (iv_step
), iv_step
);
/* Create the cross-iteration IV for the pointer in LOOP and record
   vec_info for the increment statement.  */
3828 standard_iv_increment_position (loop
, &incr_gsi
, &insert_after
);
3830 create_iv (aggr_ptr_init
,
3831 fold_convert (aggr_ptr_type
, iv_step
),
3832 aggr_ptr
, loop
, &incr_gsi
, insert_after
,
3833 &indx_before_incr
, &indx_after_incr
);
3834 incr
= gsi_stmt (incr_gsi
);
3835 set_vinfo_for_stmt (incr
, new_stmt_vec_info (incr
, loop_vinfo
, NULL
));
3837 /* Copy the points-to information if it exists. */
3838 if (DR_PTR_INFO (dr
))
3840 duplicate_ssa_name_ptr_info (indx_before_incr
, DR_PTR_INFO (dr
));
3841 duplicate_ssa_name_ptr_info (indx_after_incr
, DR_PTR_INFO (dr
));
3846 aptr
= indx_before_incr
;
3849 if (!nested_in_vect_loop
|| only_init
)
3853 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
3854 nested in LOOP, if exists. */
3856 gcc_assert (nested_in_vect_loop
);
/* Second IV, relative to the inner loop containing the dataref,
   stepping by DR_STEP.  */
3859 standard_iv_increment_position (containing_loop
, &incr_gsi
,
3861 create_iv (aptr
, fold_convert (aggr_ptr_type
, DR_STEP (dr
)), aggr_ptr
,
3862 containing_loop
, &incr_gsi
, insert_after
, &indx_before_incr
,
3864 incr
= gsi_stmt (incr_gsi
);
3865 set_vinfo_for_stmt (incr
, new_stmt_vec_info (incr
, loop_vinfo
, NULL
));
3867 /* Copy the points-to information if it exists. */
3868 if (DR_PTR_INFO (dr
))
3870 duplicate_ssa_name_ptr_info (indx_before_incr
, DR_PTR_INFO (dr
));
3871 duplicate_ssa_name_ptr_info (indx_after_incr
, DR_PTR_INFO (dr
));
3876 return indx_before_incr
;
3883 /* Function bump_vector_ptr
3885 Increment a pointer (to a vector type) by vector-size. If requested,
3886 i.e. if PTR-INCR is given, then also connect the new increment stmt
3887 to the existing def-use update-chain of the pointer, by modifying
3888 the PTR_INCR as illustrated below:
3890 The pointer def-use update-chain before this function:
3891 DATAREF_PTR = phi (p_0, p_2)
3893 PTR_INCR: p_2 = DATAREF_PTR + step
3895 The pointer def-use update-chain after this function:
3896 DATAREF_PTR = phi (p_0, p_2)
3898 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
3900 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
3903 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
3905 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
3906 the loop. The increment amount across iterations is expected
3908 BSI - location where the new update stmt is to be placed.
3909 STMT - the original scalar memory-access stmt that is being vectorized.
3910 BUMP - optional. The offset by which to bump the pointer. If not given,
3911 the offset is assumed to be vector_size.
3913 Output: Return NEW_DATAREF_PTR as illustrated above.
/* bump_vector_ptr: emit NEW_DATAREF_PTR = DATAREF_PTR + bump and, if
   PTR_INCR is given, rewire PTR_INCR's use of DATAREF_PTR to the new name.
   NOTE(review): lossy extract -- the branch that substitutes a caller
   BUMP for the default vector-size UPDATE is not visible here, nor are
   the braces; comments describe only the visible statements.  */
3918 bump_vector_ptr (tree dataref_ptr
, gimple ptr_incr
, gimple_stmt_iterator
*gsi
,
3919 gimple stmt
, tree bump
)
3921 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
3922 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
3923 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
/* Default bump amount: one vector's worth of bytes.  */
3924 tree update
= TYPE_SIZE_UNIT (vectype
);
3927 use_operand_p use_p
;
3928 tree new_dataref_ptr
;
/* Emit: new_dataref_ptr = dataref_ptr p+ update.  */
3933 new_dataref_ptr
= copy_ssa_name (dataref_ptr
, NULL
);
3934 incr_stmt
= gimple_build_assign_with_ops (POINTER_PLUS_EXPR
, new_dataref_ptr
,
3935 dataref_ptr
, update
);
3936 vect_finish_stmt_generation (stmt
, incr_stmt
, gsi
);
3938 /* Copy the points-to information if it exists. */
3939 if (DR_PTR_INFO (dr
))
3941 duplicate_ssa_name_ptr_info (new_dataref_ptr
, DR_PTR_INFO (dr
));
/* The bumped pointer's alignment is no longer known.  */
3942 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr
));
/* Early return path (presumably when PTR_INCR is NULL -- the guard is
   not visible in this extract).  */
3946 return new_dataref_ptr
;
3948 /* Update the vector-pointer's cross-iteration increment. */
3949 FOR_EACH_SSA_USE_OPERAND (use_p
, ptr_incr
, iter
, SSA_OP_USE
)
3951 tree use
= USE_FROM_PTR (use_p
);
3953 if (use
== dataref_ptr
)
3954 SET_USE (use_p
, new_dataref_ptr
);
/* Any other operand of PTR_INCR must be the step constant UPDATE.  */
3956 gcc_assert (tree_int_cst_compare (use
, update
) == 0);
3959 return new_dataref_ptr
;
3963 /* Function vect_create_destination_var.
3965 Create a new temporary of type VECTYPE. */
/* vect_create_destination_var: create a new temporary for SCALAR_DEST,
   of type VECTYPE when given (vector destination), else of
   SCALAR_DEST's own type (scalar destination).
   NOTE(review): lossy extract -- declarations of type/name/new_name/
   vec_dest, the NULL-name branch condition, free of new_name and the
   return are not visible here.  */
3968 vect_create_destination_var (tree scalar_dest
, tree vectype
)
3974 enum vect_var_kind kind
;
/* Choose variable kind and type from whether a vector type was given.  */
3976 kind
= vectype
? vect_simple_var
: vect_scalar_var
;
3977 type
= vectype
? vectype
: TREE_TYPE (scalar_dest
);
3979 gcc_assert (TREE_CODE (scalar_dest
) == SSA_NAME
);
/* Derive a debug-friendly name from SCALAR_DEST and its SSA version.  */
3981 name
= get_name (scalar_dest
);
3983 asprintf (&new_name
, "%s_%u", name
, SSA_NAME_VERSION (scalar_dest
));
3985 asprintf (&new_name
, "_%u", SSA_NAME_VERSION (scalar_dest
));
3986 vec_dest
= vect_get_new_vect_var (type
, kind
, new_name
);
3992 /* Function vect_grouped_store_supported.
3994 Returns TRUE if interleave high and interleave low permutations
3995 are supported, and FALSE otherwise. */
/* vect_grouped_store_supported: return whether the target can perform the
   interleave-high/low permutations needed to store a group of COUNT
   accesses of VECTYPE.
   NOTE(review): lossy extract -- the returns, the second permutation
   mask setup and the braces are not visible here.  */
3998 vect_grouped_store_supported (tree vectype
, unsigned HOST_WIDE_INT count
)
4000 enum machine_mode mode
= TYPE_MODE (vectype
);
4002 /* vect_permute_store_chain requires the group size to be a power of two. */
4003 if (exact_log2 (count
) == -1)
4005 if (dump_enabled_p ())
4006 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4007 "the size of the group of accesses"
4008 " is not a power of 2");
4012 /* Check that the permutation is supported. */
4013 if (VECTOR_MODE_P (mode
))
4015 unsigned int i
, nelt
= GET_MODE_NUNITS (mode
);
4016 unsigned char *sel
= XALLOCAVEC (unsigned char, nelt
);
/* Build the interleave selector {0, nelt, 1, nelt+1, ...} and query the
   target for both interleave masks via can_vec_perm_p.  */
4017 for (i
= 0; i
< nelt
/ 2; i
++)
4020 sel
[i
* 2 + 1] = i
+ nelt
;
4022 if (can_vec_perm_p (mode
, false, sel
))
4024 for (i
= 0; i
< nelt
; i
++)
4026 if (can_vec_perm_p (mode
, false, sel
))
4031 if (dump_enabled_p ())
4032 dump_printf (MSG_MISSED_OPTIMIZATION
,
4033 "interleave op not supported by target.");
4038 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
/* vect_store_lanes_supported: return whether the target implements
   vec_store_lanes for COUNT vectors of VECTYPE (thin wrapper around
   vect_lanes_optab_supported_p).  */
4042 vect_store_lanes_supported (tree vectype
, unsigned HOST_WIDE_INT count
)
4044 return vect_lanes_optab_supported_p ("vec_store_lanes",
4045 vec_store_lanes_optab
,
4050 /* Function vect_permute_store_chain.
4052 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4053 a power of 2, generate interleave_high/low stmts to reorder the data
4054 correctly for the stores. Return the final references for stores in
4057 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4058 The input is 4 vectors each containing 8 elements. We assign a number to
4059 each element, the input sequence is:
4061 1st vec: 0 1 2 3 4 5 6 7
4062 2nd vec: 8 9 10 11 12 13 14 15
4063 3rd vec: 16 17 18 19 20 21 22 23
4064 4th vec: 24 25 26 27 28 29 30 31
4066 The output sequence should be:
4068 1st vec: 0 8 16 24 1 9 17 25
4069 2nd vec: 2 10 18 26 3 11 19 27
4070 3rd vec: 4 12 20 28 5 13 21 29
4071 4th vec: 6 14 22 30 7 15 23 31
4073 i.e., we interleave the contents of the four vectors in their order.
4075 We use interleave_high/low instructions to create such output. The input of
4076 each interleave_high/low operation is two vectors:
4079 the even elements of the result vector are obtained left-to-right from the
4080 high/low elements of the first vector. The odd elements of the result are
4081 obtained left-to-right from the high/low elements of the second vector.
4082 The output of interleave_high will be: 0 4 1 5
4083 and of interleave_low: 2 6 3 7
4086 The permutation is done in log LENGTH stages. In each stage interleave_high
4087 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4088 where the first argument is taken from the first half of DR_CHAIN and the
4089 second argument from its second half.
4092 I1: interleave_high (1st vec, 3rd vec)
4093 I2: interleave_low (1st vec, 3rd vec)
4094 I3: interleave_high (2nd vec, 4th vec)
4095 I4: interleave_low (2nd vec, 4th vec)
4097 The output for the first stage is:
4099 I1: 0 16 1 17 2 18 3 19
4100 I2: 4 20 5 21 6 22 7 23
4101 I3: 8 24 9 25 10 26 11 27
4102 I4: 12 28 13 29 14 30 15 31
4104 The output of the second stage, i.e. the final result is:
4106 I1: 0 8 16 24 1 9 17 25
4107 I2: 2 10 18 26 3 11 19 27
4108 I3: 4 12 20 28 5 13 21 29
4109 I4: 6 14 22 30 7 15 23 31. */
/* vect_permute_store_chain: interleave the LENGTH vectors of DR_CHAIN
   (LENGTH a power of two) using VEC_PERM_EXPRs in log2(LENGTH) stages,
   leaving the reordered references in *RESULT_CHAIN.
   NOTE(review): lossy extract -- the STMT parameter line, some
   declarations (i, n, perm_stmt), the selector assignments inside the
   mask-building loops, and the braces are not visible here.  */
4112 vect_permute_store_chain (vec
<tree
> dr_chain
,
4113 unsigned int length
,
4115 gimple_stmt_iterator
*gsi
,
4116 vec
<tree
> *result_chain
)
4118 tree vect1
, vect2
, high
, low
;
4120 tree vectype
= STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt
));
4121 tree perm_mask_low
, perm_mask_high
;
4123 unsigned int j
, nelt
= TYPE_VECTOR_SUBPARTS (vectype
);
4124 unsigned char *sel
= XALLOCAVEC (unsigned char, nelt
);
/* Start RESULT_CHAIN as a copy of DR_CHAIN.  */
4126 result_chain
->quick_grow (length
);
4127 memcpy (result_chain
->address (), dr_chain
.address (),
4128 length
* sizeof (tree
));
/* Build the high- and low-interleave permutation masks.  */
4130 for (i
= 0, n
= nelt
/ 2; i
< n
; i
++)
4133 sel
[i
* 2 + 1] = i
+ nelt
;
4135 perm_mask_high
= vect_gen_perm_mask (vectype
, sel
);
4136 gcc_assert (perm_mask_high
!= NULL
);
4138 for (i
= 0; i
< nelt
; i
++)
4140 perm_mask_low
= vect_gen_perm_mask (vectype
, sel
);
4141 gcc_assert (perm_mask_low
!= NULL
);
/* log2(LENGTH) stages; each stage interleaves the first half of the
   chain with the second half, pairwise.  */
4143 for (i
= 0, n
= exact_log2 (length
); i
< n
; i
++)
4145 for (j
= 0; j
< length
/2; j
++)
4147 vect1
= dr_chain
[j
];
4148 vect2
= dr_chain
[j
+length
/2];
4150 /* Create interleaving stmt:
4151 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1, ...}> */
4152 high
= make_temp_ssa_name (vectype
, NULL
, "vect_inter_high");
4154 = gimple_build_assign_with_ops (VEC_PERM_EXPR
, high
,
4155 vect1
, vect2
, perm_mask_high
);
4156 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
4157 (*result_chain
)[2*j
] = high
;
4159 /* Create interleaving stmt:
4160 low = VEC_PERM_EXPR <vect1, vect2, {nelt/2, nelt*3/2, nelt/2+1,
4161 nelt*3/2+1, ...}> */
4162 low
= make_temp_ssa_name (vectype
, NULL
, "vect_inter_low");
4164 = gimple_build_assign_with_ops (VEC_PERM_EXPR
, low
,
4165 vect1
, vect2
, perm_mask_low
);
4166 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
4167 (*result_chain
)[2*j
+1] = low
;
/* Feed this stage's output back in as the next stage's input.  */
4169 memcpy (dr_chain
.address (), result_chain
->address (),
4170 length
* sizeof (tree
));
4174 /* Function vect_setup_realignment
4176 This function is called when vectorizing an unaligned load using
4177 the dr_explicit_realign[_optimized] scheme.
4178 This function generates the following code at the loop prolog:
4181 x msq_init = *(floor(p)); # prolog load
4182 realignment_token = call target_builtin;
4184 x msq = phi (msq_init, ---)
4186 The stmts marked with x are generated only for the case of
4187 dr_explicit_realign_optimized.
4189 The code above sets up a new (vector) pointer, pointing to the first
4190 location accessed by STMT, and a "floor-aligned" load using that pointer.
4191 It also generates code to compute the "realignment-token" (if the relevant
4192 target hook was defined), and creates a phi-node at the loop-header bb
4193 whose arguments are the result of the prolog-load (created by this
4194 function) and the result of a load that takes place in the loop (to be
4195 created by the caller to this function).
4197 For the case of dr_explicit_realign_optimized:
4198 The caller to this function uses the phi-result (msq) to create the
4199 realignment code inside the loop, and sets up the missing phi argument,
4202 msq = phi (msq_init, lsq)
4203 lsq = *(floor(p')); # load in loop
4204 result = realign_load (msq, lsq, realignment_token);
4206 For the case of dr_explicit_realign:
4208 msq = *(floor(p)); # load in loop
4210 lsq = *(floor(p')); # load in loop
4211 result = realign_load (msq, lsq, realignment_token);
4214 STMT - (scalar) load stmt to be vectorized. This load accesses
4215 a memory location that may be unaligned.
4216 BSI - place where new code is to be inserted.
4217 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4221 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4222 target hook, if defined.
4223 Return value - the result of the loop-header phi node. */
/* vect_setup_realignment: emit the prolog code for the
   dr_explicit_realign[_optimized] misaligned-load schemes -- the initial
   floor-aligned load, the realignment token (via the target's
   builtin_mask_for_load hook, if any) and the loop-header phi.
   NOTE(review): lossy extract -- several declarations (pe, new_bb, ptr,
   inc, data_ref, vec_dest, new_temp, new_stmt, builtin_decl, phi_stmt,
   init_addr), some branch headers, the final return and all braces are
   missing from this view.  */
4226 vect_setup_realignment (gimple stmt
, gimple_stmt_iterator
*gsi
,
4227 tree
*realignment_token
,
4228 enum dr_alignment_support alignment_support_scheme
,
4230 struct loop
**at_loop
)
4232 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
4233 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
4234 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
4235 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
4236 struct loop
*loop
= NULL
;
4238 tree scalar_dest
= gimple_assign_lhs (stmt
);
4245 tree msq_init
= NULL_TREE
;
4248 tree msq
= NULL_TREE
;
4249 gimple_seq stmts
= NULL
;
4251 bool compute_in_loop
= false;
4252 bool nested_in_vect_loop
= false;
4253 struct loop
*containing_loop
= (gimple_bb (stmt
))->loop_father
;
4254 struct loop
*loop_for_initial_load
= NULL
;
4258 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
4259 nested_in_vect_loop
= nested_in_vect_loop_p (loop
, stmt
);
4262 gcc_assert (alignment_support_scheme
== dr_explicit_realign
4263 || alignment_support_scheme
== dr_explicit_realign_optimized
);
4265 /* We need to generate three things:
4266 1. the misalignment computation
4267 2. the extra vector load (for the optimized realignment scheme).
4268 3. the phi node for the two vectors from which the realignment is
4269 done (for the optimized realignment scheme). */
4271 /* 1. Determine where to generate the misalignment computation.
4273 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4274 calculation will be generated by this function, outside the loop (in the
4275 preheader). Otherwise, INIT_ADDR had already been computed for us by the
4276 caller, inside the loop.
4278 Background: If the misalignment remains fixed throughout the iterations of
4279 the loop, then both realignment schemes are applicable, and also the
4280 misalignment computation can be done outside LOOP. This is because we are
4281 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4282 are a multiple of VS (the Vector Size), and therefore the misalignment in
4283 different vectorized LOOP iterations is always the same.
4284 The problem arises only if the memory access is in an inner-loop nested
4285 inside LOOP, which is now being vectorized using outer-loop vectorization.
4286 This is the only case when the misalignment of the memory access may not
4287 remain fixed throughout the iterations of the inner-loop (as explained in
4288 detail in vect_supportable_dr_alignment). In this case, not only is the
4289 optimized realignment scheme not applicable, but also the misalignment
4290 computation (and generation of the realignment token that is passed to
4291 REALIGN_LOAD) have to be done inside the loop.
4293 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4294 or not, which in turn determines if the misalignment is computed inside
4295 the inner-loop, or outside LOOP. */
4297 if (init_addr
!= NULL_TREE
|| !loop_vinfo
)
4299 compute_in_loop
= true;
4300 gcc_assert (alignment_support_scheme
== dr_explicit_realign
);
4304 /* 2. Determine where to generate the extra vector load.
4306 For the optimized realignment scheme, instead of generating two vector
4307 loads in each iteration, we generate a single extra vector load in the
4308 preheader of the loop, and in each iteration reuse the result of the
4309 vector load from the previous iteration. In case the memory access is in
4310 an inner-loop nested inside LOOP, which is now being vectorized using
4311 outer-loop vectorization, we need to determine whether this initial vector
4312 load should be generated at the preheader of the inner-loop, or can be
4313 generated at the preheader of LOOP. If the memory access has no evolution
4314 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
4315 to be generated inside LOOP (in the preheader of the inner-loop). */
4317 if (nested_in_vect_loop
)
4319 tree outerloop_step
= STMT_VINFO_DR_STEP (stmt_info
);
4320 bool invariant_in_outerloop
=
4321 (tree_int_cst_compare (outerloop_step
, size_zero_node
) == 0);
4322 loop_for_initial_load
= (invariant_in_outerloop
? loop
: loop
->inner
);
4325 loop_for_initial_load
= loop
;
4327 *at_loop
= loop_for_initial_load
;
4329 if (loop_for_initial_load
)
4330 pe
= loop_preheader_edge (loop_for_initial_load
);
4332 /* 3. For the case of the optimized realignment, create the first vector
4333 load at the loop preheader. */
4335 if (alignment_support_scheme
== dr_explicit_realign_optimized
)
4337 /* Create msq_init = *(floor(p1)) in the loop preheader */
4339 gcc_assert (!compute_in_loop
);
4340 vec_dest
= vect_create_destination_var (scalar_dest
, vectype
);
4341 ptr
= vect_create_data_ref_ptr (stmt
, vectype
, loop_for_initial_load
,
4342 NULL_TREE
, &init_addr
, NULL
, &inc
,
/* Floor-align the pointer: new_temp = ptr & -TYPE_ALIGN_UNIT.  */
4344 new_temp
= copy_ssa_name (ptr
, NULL
);
4345 new_stmt
= gimple_build_assign_with_ops
4346 (BIT_AND_EXPR
, new_temp
, ptr
,
4347 build_int_cst (TREE_TYPE (ptr
),
4348 -(HOST_WIDE_INT
)TYPE_ALIGN_UNIT (vectype
)));
4349 new_bb
= gsi_insert_on_edge_immediate (pe
, new_stmt
);
4350 gcc_assert (!new_bb
);
/* Emit the prolog load from the aligned address.  */
4352 = build2 (MEM_REF
, TREE_TYPE (vec_dest
), new_temp
,
4353 build_int_cst (reference_alias_ptr_type (DR_REF (dr
)), 0));
4354 new_stmt
= gimple_build_assign (vec_dest
, data_ref
);
4355 new_temp
= make_ssa_name (vec_dest
, new_stmt
);
4356 gimple_assign_set_lhs (new_stmt
, new_temp
);
4359 new_bb
= gsi_insert_on_edge_immediate (pe
, new_stmt
);
4360 gcc_assert (!new_bb
);
4363 gsi_insert_before (gsi
, new_stmt
, GSI_SAME_STMT
);
4365 msq_init
= gimple_assign_lhs (new_stmt
);
4368 /* 4. Create realignment token using a target builtin, if available.
4369 It is done either inside the containing loop, or before LOOP (as
4370 determined above). */
4372 if (targetm
.vectorize
.builtin_mask_for_load
)
4376 /* Compute INIT_ADDR - the initial address accessed by this memref. */
4379 /* Generate the INIT_ADDR computation outside LOOP. */
4380 init_addr
= vect_create_addr_base_for_vector_ref (stmt
, &stmts
,
4384 pe
= loop_preheader_edge (loop
);
4385 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, stmts
);
4386 gcc_assert (!new_bb
);
4389 gsi_insert_seq_before (gsi
, stmts
, GSI_SAME_STMT
);
/* Call the target hook to build the mask/token for INIT_ADDR.  */
4392 builtin_decl
= targetm
.vectorize
.builtin_mask_for_load ();
4393 new_stmt
= gimple_build_call (builtin_decl
, 1, init_addr
);
4395 vect_create_destination_var (scalar_dest
,
4396 gimple_call_return_type (new_stmt
));
4397 new_temp
= make_ssa_name (vec_dest
, new_stmt
);
4398 gimple_call_set_lhs (new_stmt
, new_temp
);
4400 if (compute_in_loop
)
4401 gsi_insert_before (gsi
, new_stmt
, GSI_SAME_STMT
);
4404 /* Generate the misalignment computation outside LOOP. */
4405 pe
= loop_preheader_edge (loop
);
4406 new_bb
= gsi_insert_on_edge_immediate (pe
, new_stmt
);
4407 gcc_assert (!new_bb
);
4410 *realignment_token
= gimple_call_lhs (new_stmt
);
4412 /* The result of the CALL_EXPR to this builtin is determined from
4413 the value of the parameter and no global variables are touched
4414 which makes the builtin a "const" function. Requiring the
4415 builtin to have the "const" attribute makes it unnecessary
4416 to call mark_call_clobbered. */
4417 gcc_assert (TREE_READONLY (builtin_decl
));
4420 if (alignment_support_scheme
== dr_explicit_realign
)
4423 gcc_assert (!compute_in_loop
);
4424 gcc_assert (alignment_support_scheme
== dr_explicit_realign_optimized
);
4427 /* 5. Create msq = phi <msq_init, lsq> in loop */
4429 pe
= loop_preheader_edge (containing_loop
);
4430 vec_dest
= vect_create_destination_var (scalar_dest
, vectype
);
4431 msq
= make_ssa_name (vec_dest
, NULL
);
4432 phi_stmt
= create_phi_node (msq
, containing_loop
->header
);
/* The lsq argument is added later by the caller (see function comment).  */
4433 add_phi_arg (phi_stmt
, msq_init
, pe
, UNKNOWN_LOCATION
);
4439 /* Function vect_grouped_load_supported.
4441 Returns TRUE if even and odd permutations are supported,
4442 and FALSE otherwise. */
/* vect_grouped_load_supported: return whether the target can perform the
   extract-even/odd permutations needed to load a group of COUNT accesses
   of VECTYPE.
   NOTE(review): lossy extract -- the returns, the selector assignments
   inside the mask-building loops and the braces are not visible here.  */
4445 vect_grouped_load_supported (tree vectype
, unsigned HOST_WIDE_INT count
)
4447 enum machine_mode mode
= TYPE_MODE (vectype
);
4449 /* vect_permute_load_chain requires the group size to be a power of two. */
4450 if (exact_log2 (count
) == -1)
4452 if (dump_enabled_p ())
4453 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4454 "the size of the group of accesses"
4455 " is not a power of 2");
4459 /* Check that the permutation is supported. */
4460 if (VECTOR_MODE_P (mode
))
4462 unsigned int i
, nelt
= GET_MODE_NUNITS (mode
);
4463 unsigned char *sel
= XALLOCAVEC (unsigned char, nelt
);
/* Query the target for both the even and odd extraction masks.  */
4465 for (i
= 0; i
< nelt
; i
++)
4467 if (can_vec_perm_p (mode
, false, sel
))
4469 for (i
= 0; i
< nelt
; i
++)
4471 if (can_vec_perm_p (mode
, false, sel
))
4476 if (dump_enabled_p ())
4477 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4478 "extract even/odd not supported by target");
4482 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
/* vect_load_lanes_supported: return whether the target implements
   vec_load_lanes for COUNT vectors of VECTYPE (thin wrapper around
   vect_lanes_optab_supported_p).  */
4486 vect_load_lanes_supported (tree vectype
, unsigned HOST_WIDE_INT count
)
4488 return vect_lanes_optab_supported_p ("vec_load_lanes",
4489 vec_load_lanes_optab
,
4493 /* Function vect_permute_load_chain.
4495 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
4496 a power of 2, generate extract_even/odd stmts to reorder the input data
4497 correctly. Return the final references for loads in RESULT_CHAIN.
4499 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4500 The input is 4 vectors each containing 8 elements. We assign a number to each
4501 element, the input sequence is:
4503 1st vec: 0 1 2 3 4 5 6 7
4504 2nd vec: 8 9 10 11 12 13 14 15
4505 3rd vec: 16 17 18 19 20 21 22 23
4506 4th vec: 24 25 26 27 28 29 30 31
4508 The output sequence should be:
4510 1st vec: 0 4 8 12 16 20 24 28
4511 2nd vec: 1 5 9 13 17 21 25 29
4512 3rd vec: 2 6 10 14 18 22 26 30
4513 4th vec: 3 7 11 15 19 23 27 31
4515 i.e., the first output vector should contain the first elements of each
4516 interleaving group, etc.
4518 We use extract_even/odd instructions to create such output. The input of
4519 each extract_even/odd operation is two vectors
4523 and the output is the vector of extracted even/odd elements. The output of
4524 extract_even will be: 0 2 4 6
4525 and of extract_odd: 1 3 5 7
4528 The permutation is done in log LENGTH stages. In each stage extract_even
4529 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
4530 their order. In our example,
4532 E1: extract_even (1st vec, 2nd vec)
4533 E2: extract_odd (1st vec, 2nd vec)
4534 E3: extract_even (3rd vec, 4th vec)
4535 E4: extract_odd (3rd vec, 4th vec)
4537 The output for the first stage will be:
4539 E1: 0 2 4 6 8 10 12 14
4540 E2: 1 3 5 7 9 11 13 15
4541 E3: 16 18 20 22 24 26 28 30
4542 E4: 17 19 21 23 25 27 29 31
4544 In order to proceed and create the correct sequence for the next stage (or
4545 for the correct output, if the second stage is the last one, as in our
4546 example), we first put the output of extract_even operation and then the
4547 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
4548 The input for the second stage is:
4550 1st vec (E1): 0 2 4 6 8 10 12 14
4551 2nd vec (E3): 16 18 20 22 24 26 28 30
4552 3rd vec (E2): 1 3 5 7 9 11 13 15
4553 4th vec (E4): 17 19 21 23 25 27 29 31
4555 The output of the second stage:
4557 E1: 0 4 8 12 16 20 24 28
4558 E2: 2 6 10 14 18 22 26 30
4559 E3: 1 5 9 13 17 21 25 29
4560 E4: 3 7 11 15 19 23 27 31
4562 And RESULT_CHAIN after reordering:
4564 1st vec (E1): 0 4 8 12 16 20 24 28
4565 2nd vec (E3): 1 5 9 13 17 21 25 29
4566 3rd vec (E2): 2 6 10 14 18 22 26 30
4567 4th vec (E4): 3 7 11 15 19 23 27 31. */
4570 vect_permute_load_chain (vec
<tree
> dr_chain
,
4571 unsigned int length
,
4573 gimple_stmt_iterator
*gsi
,
4574 vec
<tree
> *result_chain
)
4576 tree data_ref
, first_vect
, second_vect
;
4577 tree perm_mask_even
, perm_mask_odd
;
4579 tree vectype
= STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt
));
4580 unsigned int i
, j
, log_length
= exact_log2 (length
);
4581 unsigned nelt
= TYPE_VECTOR_SUBPARTS (vectype
);
4582 unsigned char *sel
= XALLOCAVEC (unsigned char, nelt
);
4584 result_chain
->quick_grow (length
);
4585 memcpy (result_chain
->address (), dr_chain
.address (),
4586 length
* sizeof (tree
));
4588 for (i
= 0; i
< nelt
; ++i
)
4590 perm_mask_even
= vect_gen_perm_mask (vectype
, sel
);
4591 gcc_assert (perm_mask_even
!= NULL
);
4593 for (i
= 0; i
< nelt
; ++i
)
4595 perm_mask_odd
= vect_gen_perm_mask (vectype
, sel
);
4596 gcc_assert (perm_mask_odd
!= NULL
);
4598 for (i
= 0; i
< log_length
; i
++)
4600 for (j
= 0; j
< length
; j
+= 2)
4602 first_vect
= dr_chain
[j
];
4603 second_vect
= dr_chain
[j
+1];
4605 /* data_ref = permute_even (first_data_ref, second_data_ref); */
4606 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_perm_even");
4607 perm_stmt
= gimple_build_assign_with_ops (VEC_PERM_EXPR
, data_ref
,
4608 first_vect
, second_vect
,
4610 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
4611 (*result_chain
)[j
/2] = data_ref
;
4613 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
4614 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_perm_odd");
4615 perm_stmt
= gimple_build_assign_with_ops (VEC_PERM_EXPR
, data_ref
,
4616 first_vect
, second_vect
,
4618 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
4619 (*result_chain
)[j
/2+length
/2] = data_ref
;
4621 memcpy (dr_chain
.address (), result_chain
->address (),
4622 length
* sizeof (tree
));
4627 /* Function vect_transform_grouped_load.
4629 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
4630 to perform their permutation and ascribe the result vectorized statements to
4631 the scalar statements.
4635 vect_transform_grouped_load (gimple stmt
, vec
<tree
> dr_chain
, int size
,
4636 gimple_stmt_iterator
*gsi
)
4638 vec
<tree
> result_chain
= vNULL
;
4640 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
4641 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
4642 vectors, that are ready for vector computation. */
4643 result_chain
.create (size
);
4644 vect_permute_load_chain (dr_chain
, size
, stmt
, gsi
, &result_chain
);
4645 vect_record_grouped_load_vectors (stmt
, result_chain
);
4646 result_chain
.release ();
4649 /* RESULT_CHAIN contains the output of a group of grouped loads that were
4650 generated as part of the vectorization of STMT. Assign the statement
4651 for each vector to the associated scalar statement. */
4654 vect_record_grouped_load_vectors (gimple stmt
, vec
<tree
> result_chain
)
4656 gimple first_stmt
= GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
));
4657 gimple next_stmt
, new_stmt
;
4658 unsigned int i
, gap_count
;
4661 /* Put a permuted data-ref in the VECTORIZED_STMT field.
4662 Since we scan the chain starting from it's first node, their order
4663 corresponds the order of data-refs in RESULT_CHAIN. */
4664 next_stmt
= first_stmt
;
4666 FOR_EACH_VEC_ELT (result_chain
, i
, tmp_data_ref
)
4671 /* Skip the gaps. Loads created for the gaps will be removed by dead
4672 code elimination pass later. No need to check for the first stmt in
4673 the group, since it always exists.
4674 GROUP_GAP is the number of steps in elements from the previous
4675 access (if there is no gap GROUP_GAP is 1). We skip loads that
4676 correspond to the gaps. */
4677 if (next_stmt
!= first_stmt
4678 && gap_count
< GROUP_GAP (vinfo_for_stmt (next_stmt
)))
4686 new_stmt
= SSA_NAME_DEF_STMT (tmp_data_ref
);
4687 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
4688 copies, and we put the new vector statement in the first available
4690 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt
)))
4691 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt
)) = new_stmt
;
4694 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt
)))
4697 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt
));
4699 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt
));
4702 prev_stmt
= rel_stmt
;
4704 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt
));
4707 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt
)) =
4712 next_stmt
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt
));
4714 /* If NEXT_STMT accesses the same DR as the previous statement,
4715 put the same TMP_DATA_REF as its vectorized statement; otherwise
4716 get the next data-ref from RESULT_CHAIN. */
4717 if (!next_stmt
|| !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt
)))
4723 /* Function vect_force_dr_alignment_p.
4725 Returns whether the alignment of a DECL can be forced to be aligned
4726 on ALIGNMENT bit boundary. */
4729 vect_can_force_dr_alignment_p (const_tree decl
, unsigned int alignment
)
4731 if (TREE_CODE (decl
) != VAR_DECL
)
4734 /* We cannot change alignment of common or external symbols as another
4735 translation unit may contain a definition with lower alignment.
4736 The rules of common symbol linking mean that the definition
4737 will override the common symbol. The same is true for constant
4738 pool entries which may be shared and are not properly merged
4740 if (DECL_EXTERNAL (decl
)
4741 || DECL_COMMON (decl
)
4742 || DECL_IN_CONSTANT_POOL (decl
))
4745 if (TREE_ASM_WRITTEN (decl
))
4748 /* Do not override the alignment as specified by the ABI when the used
4749 attribute is set. */
4750 if (DECL_PRESERVE_P (decl
))
4753 /* Do not override explicit alignment set by the user when an explicit
4754 section name is also used. This is a common idiom used by many
4755 software projects. */
4756 if (DECL_SECTION_NAME (decl
) != NULL_TREE
4757 && !DECL_HAS_IMPLICIT_SECTION_NAME_P (decl
))
4760 if (TREE_STATIC (decl
))
4761 return (alignment
<= MAX_OFILE_ALIGNMENT
);
4763 return (alignment
<= MAX_STACK_ALIGNMENT
);
4767 /* Return whether the data reference DR is supported with respect to its
4769 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
4770 it is aligned, i.e., check if it is possible to vectorize it with different
4773 enum dr_alignment_support
4774 vect_supportable_dr_alignment (struct data_reference
*dr
,
4775 bool check_aligned_accesses
)
4777 gimple stmt
= DR_STMT (dr
);
4778 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
4779 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
4780 enum machine_mode mode
= TYPE_MODE (vectype
);
4781 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
4782 struct loop
*vect_loop
= NULL
;
4783 bool nested_in_vect_loop
= false;
4785 if (aligned_access_p (dr
) && !check_aligned_accesses
)
4790 vect_loop
= LOOP_VINFO_LOOP (loop_vinfo
);
4791 nested_in_vect_loop
= nested_in_vect_loop_p (vect_loop
, stmt
);
4794 /* Possibly unaligned access. */
4796 /* We can choose between using the implicit realignment scheme (generating
4797 a misaligned_move stmt) and the explicit realignment scheme (generating
4798 aligned loads with a REALIGN_LOAD). There are two variants to the
4799 explicit realignment scheme: optimized, and unoptimized.
4800 We can optimize the realignment only if the step between consecutive
4801 vector loads is equal to the vector size. Since the vector memory
4802 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
4803 is guaranteed that the misalignment amount remains the same throughout the
4804 execution of the vectorized loop. Therefore, we can create the
4805 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
4806 at the loop preheader.
4808 However, in the case of outer-loop vectorization, when vectorizing a
4809 memory access in the inner-loop nested within the LOOP that is now being
4810 vectorized, while it is guaranteed that the misalignment of the
4811 vectorized memory access will remain the same in different outer-loop
4812 iterations, it is *not* guaranteed that is will remain the same throughout
4813 the execution of the inner-loop. This is because the inner-loop advances
4814 with the original scalar step (and not in steps of VS). If the inner-loop
4815 step happens to be a multiple of VS, then the misalignment remains fixed
4816 and we can use the optimized realignment scheme. For example:
4822 When vectorizing the i-loop in the above example, the step between
4823 consecutive vector loads is 1, and so the misalignment does not remain
4824 fixed across the execution of the inner-loop, and the realignment cannot
4825 be optimized (as illustrated in the following pseudo vectorized loop):
4827 for (i=0; i<N; i+=4)
4828 for (j=0; j<M; j++){
4829 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
4830 // when j is {0,1,2,3,4,5,6,7,...} respectively.
4831 // (assuming that we start from an aligned address).
4834 We therefore have to use the unoptimized realignment scheme:
4836 for (i=0; i<N; i+=4)
4837 for (j=k; j<M; j+=4)
4838 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
4839 // that the misalignment of the initial address is
4842 The loop can then be vectorized as follows:
4844 for (k=0; k<4; k++){
4845 rt = get_realignment_token (&vp[k]);
4846 for (i=0; i<N; i+=4){
4848 for (j=k; j<M; j+=4){
4850 va = REALIGN_LOAD <v1,v2,rt>;
4857 if (DR_IS_READ (dr
))
4859 bool is_packed
= false;
4860 tree type
= (TREE_TYPE (DR_REF (dr
)));
4862 if (optab_handler (vec_realign_load_optab
, mode
) != CODE_FOR_nothing
4863 && (!targetm
.vectorize
.builtin_mask_for_load
4864 || targetm
.vectorize
.builtin_mask_for_load ()))
4866 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
4867 if ((nested_in_vect_loop
4868 && (TREE_INT_CST_LOW (DR_STEP (dr
))
4869 != GET_MODE_SIZE (TYPE_MODE (vectype
))))
4871 return dr_explicit_realign
;
4873 return dr_explicit_realign_optimized
;
4875 if (!known_alignment_for_access_p (dr
))
4876 is_packed
= not_size_aligned (DR_REF (dr
));
4878 if ((TYPE_USER_ALIGN (type
) && !is_packed
)
4879 || targetm
.vectorize
.
4880 support_vector_misalignment (mode
, type
,
4881 DR_MISALIGNMENT (dr
), is_packed
))
4882 /* Can't software pipeline the loads, but can at least do them. */
4883 return dr_unaligned_supported
;
4887 bool is_packed
= false;
4888 tree type
= (TREE_TYPE (DR_REF (dr
)));
4890 if (!known_alignment_for_access_p (dr
))
4891 is_packed
= not_size_aligned (DR_REF (dr
));
4893 if ((TYPE_USER_ALIGN (type
) && !is_packed
)
4894 || targetm
.vectorize
.
4895 support_vector_misalignment (mode
, type
,
4896 DR_MISALIGNMENT (dr
), is_packed
))
4897 return dr_unaligned_supported
;
4901 return dr_unaligned_unsupported
;