1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2013 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
24 #include "coretypes.h"
31 #include "basic-block.h"
32 #include "gimple-pretty-print.h"
33 #include "tree-flow.h"
36 #include "tree-chrec.h"
37 #include "tree-scalar-evolution.h"
38 #include "tree-vectorizer.h"
39 #include "diagnostic-core.h"
40 /* Need to include rtl.h, expr.h, etc. for optabs. */
44 /* Return true if load- or store-lanes optab OPTAB is implemented for
45 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
/* NOTE(review): damaged extraction -- statements are split across lines,
   the upstream file's own line numbers are embedded in the text, and some
   lines are missing (return type, braces, the `bool limit_p;` declaration
   that the visible `limit_p = ...` assignment requires, the second/third
   arguments of mode_for_size, and the return statements).  Restore from
   the canonical source before making any code change.  */
48 vect_lanes_optab_supported_p (const char *name
, convert_optab optab
,
49 tree vectype
, unsigned HOST_WIDE_INT count
)
51 enum machine_mode mode
, array_mode
;
54 mode
= TYPE_MODE (vectype
);
55 limit_p
= !targetm
.array_mode_supported_p (mode
, count
);
56 array_mode
= mode_for_size (count
* GET_MODE_BITSIZE (mode
),
59 if (array_mode
== BLKmode
)
61 if (dump_enabled_p ())
62 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
63 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC
"]",
64 GET_MODE_NAME (mode
), count
);
68 if (convert_optab_handler (optab
, array_mode
, mode
) == CODE_FOR_nothing
)
70 if (dump_enabled_p ())
71 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
72 "cannot use %s<%s><%s>", name
,
73 GET_MODE_NAME (array_mode
), GET_MODE_NAME (mode
));
77 if (dump_enabled_p ())
78 dump_printf_loc (MSG_NOTE
, vect_location
,
79 "can use %s<%s><%s>", name
, GET_MODE_NAME (array_mode
),
80 GET_MODE_NAME (mode
));
86 /* Return the smallest scalar part of STMT.
87 This is used to determine the vectype of the stmt. We generally set the
88 vectype according to the type of the result (lhs). For stmts whose
89 result-type is different than the type of the arguments (e.g., demotion,
90 promotion), vectype will be reset appropriately (later). Note that we have
91 to visit the smallest datatype in this function, because that determines the
92 VF. If the smallest datatype in the loop is present only as the rhs of a
93 promotion operation - we'd miss it.
94 Such a case, where a variable of this datatype does not appear in the lhs
95 anywhere in the loop, can only occur if it's an invariant: e.g.:
96 'int_x = (int) short_inv', which we'd expect to have been optimized away by
97 invariant motion. However, we cannot rely on invariant motion to always
98 take invariants out of the loop, and so in the case of promotion we also
99 have to check the rhs.
100 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
/* NOTE(review): damaged extraction -- statements are split across lines and
   some original lines are missing (the return type, braces, the tail of the
   comment above, and the final `return scalar_type;`).  Visible fragments
   compute lhs/rhs byte sizes and, for widening/cast assigns, take the
   smaller rhs type.  Restore from the canonical source before editing.  */
104 vect_get_smallest_scalar_type (gimple stmt
, HOST_WIDE_INT
*lhs_size_unit
,
105 HOST_WIDE_INT
*rhs_size_unit
)
107 tree scalar_type
= gimple_expr_type (stmt
);
108 HOST_WIDE_INT lhs
, rhs
;
110 lhs
= rhs
= TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type
));
112 if (is_gimple_assign (stmt
)
113 && (gimple_assign_cast_p (stmt
)
114 || gimple_assign_rhs_code (stmt
) == WIDEN_MULT_EXPR
115 || gimple_assign_rhs_code (stmt
) == WIDEN_LSHIFT_EXPR
116 || gimple_assign_rhs_code (stmt
) == FLOAT_EXPR
))
118 tree rhs_type
= TREE_TYPE (gimple_assign_rhs1 (stmt
));
120 rhs
= TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type
));
122 scalar_type
= rhs_type
;
125 *lhs_size_unit
= lhs
;
126 *rhs_size_unit
= rhs
;
131 /* Check if data references pointed by DR_I and DR_J are same or
132 belong to same interleaving group. Return FALSE if drs are
133 different, otherwise return TRUE. */
/* NOTE(review): damaged extraction -- statements are split across lines and
   some original lines are missing (return type, braces, and the
   `return true;` / `return false;` bodies of the visible condition).
   Restore from the canonical source before editing.  */
136 vect_same_range_drs (data_reference_p dr_i
, data_reference_p dr_j
)
138 gimple stmt_i
= DR_STMT (dr_i
);
139 gimple stmt_j
= DR_STMT (dr_j
);
141 if (operand_equal_p (DR_REF (dr_i
), DR_REF (dr_j
), 0)
142 || (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_i
))
143 && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_j
))
144 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_i
))
145 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_j
)))))
151 /* If address ranges represented by DDR_I and DDR_J are equal,
152 return TRUE, otherwise return FALSE. */
/* NOTE(review): damaged extraction -- return type, braces, and the
   `return true;` / `return false;` arms of the visible condition are
   missing.  The condition compares the A/B data-refs of the two DDRs in
   both orders via vect_same_range_drs.  Restore from the canonical source
   before editing.  */
155 vect_vfa_range_equal (ddr_p ddr_i
, ddr_p ddr_j
)
157 if ((vect_same_range_drs (DDR_A (ddr_i
), DDR_A (ddr_j
))
158 && vect_same_range_drs (DDR_B (ddr_i
), DDR_B (ddr_j
)))
159 || (vect_same_range_drs (DDR_A (ddr_i
), DDR_B (ddr_j
))
160 && vect_same_range_drs (DDR_B (ddr_i
), DDR_A (ddr_j
))))
166 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
167 tested at run-time. Return TRUE if DDR was successfully inserted.
168 Return false if versioning is not supported. */
/* NOTE(review): damaged extraction -- several original lines are missing,
   including the return type, braces, the early `return false;` statements
   after each unsupported case, the outer-loop check whose dump message is
   still visible, the tail of the "non-constant" comment/message, and the
   final `return true;`.  Restore from the canonical source before
   editing.  */
171 vect_mark_for_runtime_alias_test (ddr_p ddr
, loop_vec_info loop_vinfo
)
173 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
175 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS
) == 0)
178 if (dump_enabled_p ())
180 dump_printf_loc (MSG_NOTE
, vect_location
,
181 "mark for run-time aliasing test between ");
182 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (DDR_A (ddr
)));
183 dump_printf (MSG_NOTE
, " and ");
184 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (DDR_B (ddr
)));
187 if (optimize_loop_nest_for_size_p (loop
))
189 if (dump_enabled_p ())
190 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
191 "versioning not supported when optimizing for size.");
195 /* FORNOW: We don't support versioning with outer-loop vectorization. */
198 if (dump_enabled_p ())
199 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
200 "versioning not yet supported for outer-loops.");
204 /* FORNOW: We don't support creating runtime alias tests for non-constant
206 if (TREE_CODE (DR_STEP (DDR_A (ddr
))) != INTEGER_CST
207 || TREE_CODE (DR_STEP (DDR_B (ddr
))) != INTEGER_CST
)
209 if (dump_enabled_p ())
210 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
211 "versioning not yet supported for non-constant "
216 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo
).safe_push (ddr
);
221 /* Function vect_analyze_data_ref_dependence.
223 Return TRUE if there (might) exist a dependence between a memory-reference
224 DRA and a memory-reference DRB. When versioning for alias may check a
225 dependence at run-time, return FALSE. Adjust *MAX_VF according to
226 the data dependence. */
/* NOTE(review): damaged extraction -- many original lines are missing
   throughout this function: the return type, braces, `return` statements
   after each decision point, the declarations of `i` and `earlier_stmt`
   that the visible fragments use, the DR_REF arguments of several
   dump_generic_expr calls, the condition guarding the `dist > 0` /
   negative-distance handling, and parts of the example in the grouped-CSE
   comment.  The visible structure handles: unvectorizable stmts, known
   independence, read-read pairs, unknown dependences (safelen / gather
   checks, then run-time alias versioning), distance-vector-less known
   dependences (same fallback), and finally a walk over the distance
   vectors comparing each distance against *max_vf.  Restore from the
   canonical source before making any code change.  */
229 vect_analyze_data_ref_dependence (struct data_dependence_relation
*ddr
,
230 loop_vec_info loop_vinfo
, int *max_vf
)
233 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
234 struct data_reference
*dra
= DDR_A (ddr
);
235 struct data_reference
*drb
= DDR_B (ddr
);
236 stmt_vec_info stmtinfo_a
= vinfo_for_stmt (DR_STMT (dra
));
237 stmt_vec_info stmtinfo_b
= vinfo_for_stmt (DR_STMT (drb
));
238 lambda_vector dist_v
;
239 unsigned int loop_depth
;
241 /* In loop analysis all data references should be vectorizable. */
242 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a
)
243 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b
))
246 /* Independent data accesses. */
247 if (DDR_ARE_DEPENDENT (ddr
) == chrec_known
)
251 || (DR_IS_READ (dra
) && DR_IS_READ (drb
)))
254 /* Unknown data dependence. */
255 if (DDR_ARE_DEPENDENT (ddr
) == chrec_dont_know
)
257 /* If user asserted safelen consecutive iterations can be
258 executed concurrently, assume independence. */
259 if (loop
->safelen
>= 2)
261 if (loop
->safelen
< *max_vf
)
262 *max_vf
= loop
->safelen
;
266 if (STMT_VINFO_GATHER_P (stmtinfo_a
)
267 || STMT_VINFO_GATHER_P (stmtinfo_b
))
269 if (dump_enabled_p ())
271 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
272 "versioning for alias not supported for: "
273 "can't determine dependence between ");
274 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
276 dump_printf (MSG_MISSED_OPTIMIZATION
, " and ");
277 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
283 if (dump_enabled_p ())
285 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
286 "versioning for alias required: "
287 "can't determine dependence between ");
288 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
290 dump_printf (MSG_MISSED_OPTIMIZATION
, " and ");
291 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
295 /* Add to list of ddrs that need to be tested at run-time. */
296 return !vect_mark_for_runtime_alias_test (ddr
, loop_vinfo
);
299 /* Known data dependence. */
300 if (DDR_NUM_DIST_VECTS (ddr
) == 0)
302 /* If user asserted safelen consecutive iterations can be
303 executed concurrently, assume independence. */
304 if (loop
->safelen
>= 2)
306 if (loop
->safelen
< *max_vf
)
307 *max_vf
= loop
->safelen
;
311 if (STMT_VINFO_GATHER_P (stmtinfo_a
)
312 || STMT_VINFO_GATHER_P (stmtinfo_b
))
314 if (dump_enabled_p ())
316 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
317 "versioning for alias not supported for: "
318 "bad dist vector for ");
319 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
321 dump_printf (MSG_MISSED_OPTIMIZATION
, " and ");
322 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
328 if (dump_enabled_p ())
330 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
331 "versioning for alias required: "
332 "bad dist vector for ");
333 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, DR_REF (dra
));
334 dump_printf (MSG_MISSED_OPTIMIZATION
, " and ");
335 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, DR_REF (drb
));
337 /* Add to list of ddrs that need to be tested at run-time. */
338 return !vect_mark_for_runtime_alias_test (ddr
, loop_vinfo
);
341 loop_depth
= index_in_loop_nest (loop
->num
, DDR_LOOP_NEST (ddr
));
342 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr
), i
, dist_v
)
344 int dist
= dist_v
[loop_depth
];
346 if (dump_enabled_p ())
347 dump_printf_loc (MSG_NOTE
, vect_location
,
348 "dependence distance = %d.", dist
);
352 if (dump_enabled_p ())
354 dump_printf_loc (MSG_NOTE
, vect_location
,
355 "dependence distance == 0 between ");
356 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dra
));
357 dump_printf (MSG_NOTE
, " and ");
358 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (drb
));
361 /* When we perform grouped accesses and perform implicit CSE
362 by detecting equal accesses and doing disambiguation with
363 runtime alias tests like for
371 where we will end up loading { a[i], a[i+1] } once, make
372 sure that inserting group loads before the first load and
373 stores after the last store will do the right thing. */
374 if ((STMT_VINFO_GROUPED_ACCESS (stmtinfo_a
)
375 && GROUP_SAME_DR_STMT (stmtinfo_a
))
376 || (STMT_VINFO_GROUPED_ACCESS (stmtinfo_b
)
377 && GROUP_SAME_DR_STMT (stmtinfo_b
)))
380 earlier_stmt
= get_earlier_stmt (DR_STMT (dra
), DR_STMT (drb
));
382 (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt
))))
384 if (dump_enabled_p ())
385 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
386 "READ_WRITE dependence in interleaving.");
394 if (dist
> 0 && DDR_REVERSED_P (ddr
))
396 /* If DDR_REVERSED_P the order of the data-refs in DDR was
397 reversed (to make distance vector positive), and the actual
398 distance is negative. */
399 if (dump_enabled_p ())
400 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
401 "dependence distance negative.");
406 && abs (dist
) < *max_vf
)
408 /* The dependence distance requires reduction of the maximal
409 vectorization factor. */
410 *max_vf
= abs (dist
);
411 if (dump_enabled_p ())
412 dump_printf_loc (MSG_NOTE
, vect_location
,
413 "adjusting maximal vectorization factor to %i",
417 if (abs (dist
) >= *max_vf
)
419 /* Dependence distance does not create dependence, as far as
420 vectorization is concerned, in this case. */
421 if (dump_enabled_p ())
422 dump_printf_loc (MSG_NOTE
, vect_location
,
423 "dependence distance >= VF.");
427 if (dump_enabled_p ())
429 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
430 "not vectorized, possible dependence "
431 "between data-refs ");
432 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dra
));
433 dump_printf (MSG_NOTE
, " and ");
434 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (drb
));
443 /* Function vect_analyze_data_ref_dependences.
445 Examine all the data references in the loop, and make sure there do not
446 exist any data dependences between them. Set *MAX_VF according to
447 the maximum vectorization factor the data dependences allow. */
/* NOTE(review): damaged extraction -- missing lines include the return
   type, braces, the declaration of `i`, the `return false;` after the
   failing compute_all_dependences / per-ddr checks, and the final
   `return true;`.  Restore from the canonical source before editing.  */
450 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo
, int *max_vf
)
453 struct data_dependence_relation
*ddr
;
455 if (dump_enabled_p ())
456 dump_printf_loc (MSG_NOTE
, vect_location
,
457 "=== vect_analyze_data_ref_dependences ===");
459 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo
),
460 &LOOP_VINFO_DDRS (loop_vinfo
),
461 LOOP_VINFO_LOOP_NEST (loop_vinfo
), true))
464 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo
), i
, ddr
)
465 if (vect_analyze_data_ref_dependence (ddr
, loop_vinfo
, max_vf
))
472 /* Function vect_slp_analyze_data_ref_dependence.
474 Return TRUE if there (might) exist a dependence between a memory-reference
475 DRA and a memory-reference DRB. When versioning for alias may check a
476 dependence at run-time, return FALSE. Adjust *MAX_VF according to
477 the data dependence. */
/* NOTE(review): damaged extraction -- missing lines include the return
   type, braces, `return` statements after each visible check, the
   declaration of `earlier_stmt` and `gimple earlier_stmt`, and parts of
   the interleaving-chain comment.  The visible structure checks (for
   basic-block SLP): known independence, read-read pairs, same
   interleaving chain, unknown dependences, write-write pairs,
   load-after-store ordering, and finally same-base/same-offset accesses
   compared by constant init offsets.  Restore from the canonical source
   before editing.  */
480 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation
*ddr
)
482 struct data_reference
*dra
= DDR_A (ddr
);
483 struct data_reference
*drb
= DDR_B (ddr
);
485 /* We need to check dependences of statements marked as unvectorizable
486 as well, they still can prohibit vectorization. */
488 /* Independent data accesses. */
489 if (DDR_ARE_DEPENDENT (ddr
) == chrec_known
)
495 /* Read-read is OK. */
496 if (DR_IS_READ (dra
) && DR_IS_READ (drb
))
499 /* If dra and drb are part of the same interleaving chain consider
501 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra
)))
502 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra
)))
503 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb
)))))
506 /* Unknown data dependence. */
507 if (DDR_ARE_DEPENDENT (ddr
) == chrec_dont_know
)
511 if (dump_enabled_p ())
513 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
514 "can't determine dependence between ");
515 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, DR_REF (dra
));
516 dump_printf (MSG_MISSED_OPTIMIZATION
, " and ");
517 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, DR_REF (drb
));
520 /* We do not vectorize basic blocks with write-write dependencies. */
521 if (DR_IS_WRITE (dra
) && DR_IS_WRITE (drb
))
524 /* Check that it's not a load-after-store dependence. */
525 earlier_stmt
= get_earlier_stmt (DR_STMT (dra
), DR_STMT (drb
));
526 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt
))))
532 if (dump_enabled_p ())
534 dump_printf_loc (MSG_NOTE
, vect_location
,
535 "determined dependence between ");
536 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dra
));
537 dump_printf (MSG_NOTE
, " and ");
538 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (drb
));
541 /* Do not vectorize basic blocks with write-write dependences. */
542 if (DR_IS_WRITE (dra
) && DR_IS_WRITE (drb
))
545 /* Check dependence between DRA and DRB for basic block vectorization.
546 If the accesses share same bases and offsets, we can compare their initial
547 constant offsets to decide whether they differ or not. In case of a read-
548 write dependence we check that the load is before the store to ensure that
549 vectorization will not change the order of the accesses. */
551 HOST_WIDE_INT type_size_a
, type_size_b
, init_a
, init_b
;
554 /* Check that the data-refs have same bases and offsets. If not, we can't
555 determine if they are dependent. */
556 if (!operand_equal_p (DR_BASE_ADDRESS (dra
), DR_BASE_ADDRESS (drb
), 0)
557 || !dr_equal_offsets_p (dra
, drb
))
560 /* Check the types. */
561 type_size_a
= TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra
))));
562 type_size_b
= TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb
))));
564 if (type_size_a
!= type_size_b
565 || !types_compatible_p (TREE_TYPE (DR_REF (dra
)),
566 TREE_TYPE (DR_REF (drb
))))
569 init_a
= TREE_INT_CST_LOW (DR_INIT (dra
));
570 init_b
= TREE_INT_CST_LOW (DR_INIT (drb
));
572 /* Two different locations - no dependence. */
573 if (init_a
!= init_b
)
576 /* We have a read-write dependence. Check that the load is before the store.
577 When we vectorize basic blocks, vector load can be only before
578 corresponding scalar load, and vector store can be only after its
579 corresponding scalar store. So the order of the acceses is preserved in
580 case the load is before the store. */
581 earlier_stmt
= get_earlier_stmt (DR_STMT (dra
), DR_STMT (drb
));
582 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt
))))
589 /* Function vect_analyze_data_ref_dependences.
591 Examine all the data references in the basic-block, and make sure there
592 do not exist any data dependences between them. Set *MAX_VF according to
593 the maximum vectorization factor the data dependences allow. */
/* NOTE(review): damaged extraction -- missing lines include the return
   type, braces, the declaration of `i`, the trailing arguments of
   compute_all_dependences, the `return false;` paths, and the final
   `return true;`.  (The comment above also names the loop variant of this
   function; the definition below is the SLP/basic-block one.)  Restore
   from the canonical source before editing.  */
596 vect_slp_analyze_data_ref_dependences (bb_vec_info bb_vinfo
)
598 struct data_dependence_relation
*ddr
;
601 if (dump_enabled_p ())
602 dump_printf_loc (MSG_NOTE
, vect_location
,
603 "=== vect_slp_analyze_data_ref_dependences ===");
605 if (!compute_all_dependences (BB_VINFO_DATAREFS (bb_vinfo
),
606 &BB_VINFO_DDRS (bb_vinfo
),
610 FOR_EACH_VEC_ELT (BB_VINFO_DDRS (bb_vinfo
), i
, ddr
)
611 if (vect_slp_analyze_data_ref_dependence (ddr
))
618 /* Function vect_compute_data_ref_alignment
620 Compute the misalignment of the data reference DR.
623 1. If during the misalignment computation it is found that the data reference
624 cannot be vectorized then false is returned.
625 2. DR_MISALIGNMENT (DR) is defined.
627 FOR NOW: No analysis is actually performed. Misalignment is calculated
628 only for trivial cases. TODO. */
/* NOTE(review): damaged extraction -- many original lines are missing:
   the return type, braces, declarations used by the visible fragments
   (`tree misalign;`, `tree vectype;`, `bool base_aligned;`), several
   `return true/false;` statements, the `else` arms pairing the visible
   if-branches, the bb_vinfo condition guarding the SLP step check, and
   parts of the base-alignment condition around DECL_ALIGN_UNIT /
   TYPE_ALIGN_UNIT.  The visible flow: initialize misalignment to -1,
   bail out for strided loads, pick base/init/aligned_to (outer-loop
   relative when nested), force DECL alignment when possible, adjust for
   backward (negative-step) DRs, reduce modulo the vector alignment, and
   record the result via SET_DR_MISALIGNMENT.  Restore from the canonical
   source before making any code change.  */
631 vect_compute_data_ref_alignment (struct data_reference
*dr
)
633 gimple stmt
= DR_STMT (dr
);
634 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
635 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
636 struct loop
*loop
= NULL
;
637 tree ref
= DR_REF (dr
);
639 tree base
, base_addr
;
642 tree aligned_to
, alignment
;
644 if (dump_enabled_p ())
645 dump_printf_loc (MSG_NOTE
, vect_location
,
646 "vect_compute_data_ref_alignment:");
649 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
651 /* Initialize misalignment to unknown. */
652 SET_DR_MISALIGNMENT (dr
, -1);
654 /* Strided loads perform only component accesses, misalignment information
655 is irrelevant for them. */
656 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info
))
659 misalign
= DR_INIT (dr
);
660 aligned_to
= DR_ALIGNED_TO (dr
);
661 base_addr
= DR_BASE_ADDRESS (dr
);
662 vectype
= STMT_VINFO_VECTYPE (stmt_info
);
664 /* In case the dataref is in an inner-loop of the loop that is being
665 vectorized (LOOP), we use the base and misalignment information
666 relative to the outer-loop (LOOP). This is ok only if the misalignment
667 stays the same throughout the execution of the inner-loop, which is why
668 we have to check that the stride of the dataref in the inner-loop evenly
669 divides by the vector size. */
670 if (loop
&& nested_in_vect_loop_p (loop
, stmt
))
672 tree step
= DR_STEP (dr
);
673 HOST_WIDE_INT dr_step
= TREE_INT_CST_LOW (step
);
675 if (dr_step
% GET_MODE_SIZE (TYPE_MODE (vectype
)) == 0)
677 if (dump_enabled_p ())
678 dump_printf_loc (MSG_NOTE
, vect_location
,
679 "inner step divides the vector-size.");
680 misalign
= STMT_VINFO_DR_INIT (stmt_info
);
681 aligned_to
= STMT_VINFO_DR_ALIGNED_TO (stmt_info
);
682 base_addr
= STMT_VINFO_DR_BASE_ADDRESS (stmt_info
);
686 if (dump_enabled_p ())
687 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
688 "inner step doesn't divide the vector-size.");
689 misalign
= NULL_TREE
;
693 /* Similarly, if we're doing basic-block vectorization, we can only use
694 base and misalignment information relative to an innermost loop if the
695 misalignment stays the same throughout the execution of the loop.
696 As above, this is the case if the stride of the dataref evenly divides
697 by the vector size. */
700 tree step
= DR_STEP (dr
);
701 HOST_WIDE_INT dr_step
= TREE_INT_CST_LOW (step
);
703 if (dr_step
% GET_MODE_SIZE (TYPE_MODE (vectype
)) != 0)
705 if (dump_enabled_p ())
706 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
707 "SLP: step doesn't divide the vector-size.");
708 misalign
= NULL_TREE
;
712 base
= build_fold_indirect_ref (base_addr
);
713 alignment
= ssize_int (TYPE_ALIGN (vectype
)/BITS_PER_UNIT
);
715 if ((aligned_to
&& tree_int_cst_compare (aligned_to
, alignment
) < 0)
718 if (dump_enabled_p ())
720 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
721 "Unknown alignment for access: ");
722 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, base
);
728 && tree_int_cst_compare (ssize_int (DECL_ALIGN_UNIT (base
)),
730 || (TREE_CODE (base_addr
) == SSA_NAME
731 && tree_int_cst_compare (ssize_int (TYPE_ALIGN_UNIT (TREE_TYPE (
732 TREE_TYPE (base_addr
)))),
734 || (get_pointer_alignment (base_addr
) >= TYPE_ALIGN (vectype
)))
737 base_aligned
= false;
741 /* Do not change the alignment of global variables here if
742 flag_section_anchors is enabled as we already generated
743 RTL for other functions. Most global variables should
744 have been aligned during the IPA increase_alignment pass. */
745 if (!vect_can_force_dr_alignment_p (base
, TYPE_ALIGN (vectype
))
746 || (TREE_STATIC (base
) && flag_section_anchors
))
748 if (dump_enabled_p ())
750 dump_printf_loc (MSG_NOTE
, vect_location
,
751 "can't force alignment of ref: ");
752 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, ref
);
757 /* Force the alignment of the decl.
758 NOTE: This is the only change to the code we make during
759 the analysis phase, before deciding to vectorize the loop. */
760 if (dump_enabled_p ())
762 dump_printf_loc (MSG_NOTE
, vect_location
, "force alignment of ");
763 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, ref
);
766 DECL_ALIGN (base
) = TYPE_ALIGN (vectype
);
767 DECL_USER_ALIGN (base
) = 1;
770 /* At this point we assume that the base is aligned. */
771 gcc_assert (base_aligned
772 || (TREE_CODE (base
) == VAR_DECL
773 && DECL_ALIGN (base
) >= TYPE_ALIGN (vectype
)));
775 /* If this is a backward running DR then first access in the larger
776 vectype actually is N-1 elements before the address in the DR.
777 Adjust misalign accordingly. */
778 if (tree_int_cst_compare (DR_STEP (dr
), size_zero_node
) < 0)
780 tree offset
= ssize_int (TYPE_VECTOR_SUBPARTS (vectype
) - 1);
781 /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
782 otherwise we wouldn't be here. */
783 offset
= fold_build2 (MULT_EXPR
, ssizetype
, offset
, DR_STEP (dr
));
784 /* PLUS because DR_STEP was negative. */
785 misalign
= size_binop (PLUS_EXPR
, misalign
, offset
);
788 /* Modulo alignment. */
789 misalign
= size_binop (FLOOR_MOD_EXPR
, misalign
, alignment
);
791 if (!host_integerp (misalign
, 1))
793 /* Negative or overflowed misalignment value. */
794 if (dump_enabled_p ())
795 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
796 "unexpected misalign value");
800 SET_DR_MISALIGNMENT (dr
, TREE_INT_CST_LOW (misalign
));
802 if (dump_enabled_p ())
804 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
805 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr
));
806 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, ref
);
813 /* Function vect_compute_data_refs_alignment
815 Compute the misalignment of data references in the loop.
816 Return FALSE if a data reference is found that cannot be vectorized. */
/* NOTE(review): damaged extraction -- missing lines include the return
   type, braces, the declaration of `i`, the if/else around the two
   `datarefs =` assignments (loop_vinfo vs. bb_vinfo source), the bb-only
   `return false;` path, and the final `return true;`.  Restore from the
   canonical source before editing.  */
819 vect_compute_data_refs_alignment (loop_vec_info loop_vinfo
,
820 bb_vec_info bb_vinfo
)
822 vec
<data_reference_p
> datarefs
;
823 struct data_reference
*dr
;
827 datarefs
= LOOP_VINFO_DATAREFS (loop_vinfo
);
829 datarefs
= BB_VINFO_DATAREFS (bb_vinfo
);
831 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
832 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr
)))
833 && !vect_compute_data_ref_alignment (dr
))
837 /* Mark unsupported statement as unvectorizable. */
838 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr
))) = false;
849 /* Function vect_update_misalignment_for_peel
851 DR - the data reference whose misalignment is to be adjusted.
852 DR_PEEL - the data reference whose misalignment is being made
853 zero in the vector loop by the peel.
854 NPEEL - the number of iterations in the peel loop if the misalignment
855 of DR_PEEL is known at compile time. */
/* NOTE(review): damaged extraction -- missing lines include the return
   type, braces, the declaration of `i`, the `same_align_drs` left-hand
   side of the visible `= STMT_VINFO_SAME_ALIGN_REFS (...)` assignment,
   `continue`/`return` statements, and the known-alignment early return.
   Visible flow: scale sizes by the interleaving group size, zero the
   misalignment of DRs known to share DR_PEEL's alignment, recompute a
   known misalignment after peeling NPEEL iterations (sign depends on a
   negative DR_STEP), otherwise reset it to unknown (-1).  Restore from
   the canonical source before editing.  */
858 vect_update_misalignment_for_peel (struct data_reference
*dr
,
859 struct data_reference
*dr_peel
, int npeel
)
862 vec
<dr_p
> same_align_drs
;
863 struct data_reference
*current_dr
;
864 int dr_size
= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr
))));
865 int dr_peel_size
= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel
))));
866 stmt_vec_info stmt_info
= vinfo_for_stmt (DR_STMT (dr
));
867 stmt_vec_info peel_stmt_info
= vinfo_for_stmt (DR_STMT (dr_peel
));
869 /* For interleaved data accesses the step in the loop must be multiplied by
870 the size of the interleaving group. */
871 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
872 dr_size
*= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info
)));
873 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info
))
874 dr_peel_size
*= GROUP_SIZE (peel_stmt_info
);
876 /* It can be assumed that the data refs with the same alignment as dr_peel
877 are aligned in the vector loop. */
879 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel
)));
880 FOR_EACH_VEC_ELT (same_align_drs
, i
, current_dr
)
882 if (current_dr
!= dr
)
884 gcc_assert (DR_MISALIGNMENT (dr
) / dr_size
==
885 DR_MISALIGNMENT (dr_peel
) / dr_peel_size
);
886 SET_DR_MISALIGNMENT (dr
, 0);
890 if (known_alignment_for_access_p (dr
)
891 && known_alignment_for_access_p (dr_peel
))
893 bool negative
= tree_int_cst_compare (DR_STEP (dr
), size_zero_node
) < 0;
894 int misal
= DR_MISALIGNMENT (dr
);
895 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
896 misal
+= negative
? -npeel
* dr_size
: npeel
* dr_size
;
897 misal
&= (TYPE_ALIGN (vectype
) / BITS_PER_UNIT
) - 1;
898 SET_DR_MISALIGNMENT (dr
, misal
);
902 if (dump_enabled_p ())
903 dump_printf_loc (MSG_NOTE
, vect_location
, "Setting misalignment to -1.");
904 SET_DR_MISALIGNMENT (dr
, -1);
908 /* Function vect_verify_datarefs_alignment
910 Return TRUE if all data references in the loop can be
911 handled with respect to alignment. */
/* NOTE(review): damaged extraction -- missing lines include the return
   type, braces, the declaration of `i`, the if/else selecting the
   datarefs vector, `continue` statements after the skip conditions, the
   load/store branch around the two "unsupported unaligned" messages, the
   DR_REF argument of the last dump_generic_expr, the `return false;`
   path, and the final `return true;`.  Restore from the canonical source
   before editing.  */
914 vect_verify_datarefs_alignment (loop_vec_info loop_vinfo
, bb_vec_info bb_vinfo
)
916 vec
<data_reference_p
> datarefs
;
917 struct data_reference
*dr
;
918 enum dr_alignment_support supportable_dr_alignment
;
922 datarefs
= LOOP_VINFO_DATAREFS (loop_vinfo
);
924 datarefs
= BB_VINFO_DATAREFS (bb_vinfo
);
926 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
928 gimple stmt
= DR_STMT (dr
);
929 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
931 if (!STMT_VINFO_RELEVANT_P (stmt_info
))
934 /* For interleaving, only the alignment of the first access matters.
935 Skip statements marked as not vectorizable. */
936 if ((STMT_VINFO_GROUPED_ACCESS (stmt_info
)
937 && GROUP_FIRST_ELEMENT (stmt_info
) != stmt
)
938 || !STMT_VINFO_VECTORIZABLE (stmt_info
))
941 /* Strided loads perform only component accesses, alignment is
942 irrelevant for them. */
943 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info
))
946 supportable_dr_alignment
= vect_supportable_dr_alignment (dr
, false);
947 if (!supportable_dr_alignment
)
949 if (dump_enabled_p ())
952 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
953 "not vectorized: unsupported unaligned load.");
955 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
956 "not vectorized: unsupported unaligned "
959 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
964 if (supportable_dr_alignment
!= dr_aligned
&& dump_enabled_p ())
965 dump_printf_loc (MSG_NOTE
, vect_location
,
966 "Vectorizing an unaligned access.");
971 /* Given an memory reference EXP return whether its alignment is less
/* NOTE(review): damaged extraction -- the tail of the comment above, the
   return type, braces, and the `return true;` after the host_integerp
   check are missing.  Visible fragments compare the bit-size of EXP's
   type against get_object_alignment.  Restore from the canonical source
   before editing.  */
975 not_size_aligned (tree exp
)
977 if (!host_integerp (TYPE_SIZE (TREE_TYPE (exp
)), 1))
980 return (TREE_INT_CST_LOW (TYPE_SIZE (TREE_TYPE (exp
)))
981 > get_object_alignment (exp
));
984 /* Function vector_alignment_reachable_p
986 Return true if vector alignment for DR is reachable by peeling
987 a few loop iterations. Return false otherwise. */
/* NOTE(review): damaged extraction -- missing lines include the return
   type, braces, and the `return true/false;` statements after each
   visible check (grouped-access divisibility, misalignment divisible by
   element size, unknown-alignment target hook query, and the final
   default return).  Restore from the canonical source before editing.  */
990 vector_alignment_reachable_p (struct data_reference
*dr
)
992 gimple stmt
= DR_STMT (dr
);
993 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
994 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
996 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
998 /* For interleaved access we peel only if number of iterations in
999 the prolog loop ({VF - misalignment}), is a multiple of the
1000 number of the interleaved accesses. */
1001 int elem_size
, mis_in_elements
;
1002 int nelements
= TYPE_VECTOR_SUBPARTS (vectype
);
1004 /* FORNOW: handle only known alignment. */
1005 if (!known_alignment_for_access_p (dr
))
1008 elem_size
= GET_MODE_SIZE (TYPE_MODE (vectype
)) / nelements
;
1009 mis_in_elements
= DR_MISALIGNMENT (dr
) / elem_size
;
1011 if ((nelements
- mis_in_elements
) % GROUP_SIZE (stmt_info
))
1015 /* If misalignment is known at the compile time then allow peeling
1016 only if natural alignment is reachable through peeling. */
1017 if (known_alignment_for_access_p (dr
) && !aligned_access_p (dr
))
1019 HOST_WIDE_INT elmsize
=
1020 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype
)));
1021 if (dump_enabled_p ())
1023 dump_printf_loc (MSG_NOTE
, vect_location
,
1024 "data size =" HOST_WIDE_INT_PRINT_DEC
, elmsize
);
1025 dump_printf (MSG_NOTE
,
1026 ". misalignment = %d. ", DR_MISALIGNMENT (dr
));
1028 if (DR_MISALIGNMENT (dr
) % elmsize
)
1030 if (dump_enabled_p ())
1031 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1032 "data size does not divide the misalignment.\n");
1037 if (!known_alignment_for_access_p (dr
))
1039 tree type
= TREE_TYPE (DR_REF (dr
));
1040 bool is_packed
= not_size_aligned (DR_REF (dr
));
1041 if (dump_enabled_p ())
1042 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1043 "Unknown misalignment, is_packed = %d",is_packed
);
1044 if ((TYPE_USER_ALIGN (type
) && !is_packed
)
1045 || targetm
.vectorize
.vector_alignment_reachable (type
, is_packed
))
1055 /* Calculate the cost of the memory access represented by DR. */
/* NOTE(review): damaged extraction -- the return type, braces, and the
   else keyword pairing the DR_IS_READ branch with the store-cost call
   are missing.  Visible flow: derive ncopies from the vectorization
   factor and the vectype's unit count, then delegate to
   vect_get_load_cost or vect_get_store_cost and dump the totals.
   Restore from the canonical source before editing.  */
1058 vect_get_data_access_cost (struct data_reference
*dr
,
1059 unsigned int *inside_cost
,
1060 unsigned int *outside_cost
,
1061 stmt_vector_for_cost
*body_cost_vec
)
1063 gimple stmt
= DR_STMT (dr
);
1064 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
1065 int nunits
= TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info
));
1066 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
1067 int vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
1068 int ncopies
= vf
/ nunits
;
1070 if (DR_IS_READ (dr
))
1071 vect_get_load_cost (dr
, ncopies
, true, inside_cost
, outside_cost
,
1072 NULL
, body_cost_vec
, false);
1074 vect_get_store_cost (dr
, ncopies
, inside_cost
, body_cost_vec
);
1076 if (dump_enabled_p ())
1077 dump_printf_loc (MSG_NOTE
, vect_location
,
1078 "vect_get_data_access_cost: inside_cost = %d, "
1079 "outside_cost = %d.", *inside_cost
, *outside_cost
);
1083 /* Insert DR into peeling hash table with NPEEL as key. */
/* NOTE(review): damaged extraction -- missing lines include the return
   type, braces, the `int npeel` parameter named in the comment and used
   below, the key setup of `elem` before the find call, the found/
   not-found branching around the visible allocation, the fields stored
   into the new slot besides npeel, and the count increment.  Restore
   from the canonical source before editing.  */
1086 vect_peeling_hash_insert (loop_vec_info loop_vinfo
, struct data_reference
*dr
,
1089 struct _vect_peel_info elem
, *slot
;
1090 _vect_peel_info
**new_slot
;
1091 bool supportable_dr_alignment
= vect_supportable_dr_alignment (dr
, true);
1094 slot
= LOOP_VINFO_PEELING_HTAB (loop_vinfo
).find (&elem
);
1099 slot
= XNEW (struct _vect_peel_info
);
1100 slot
->npeel
= npeel
;
1103 new_slot
= LOOP_VINFO_PEELING_HTAB (loop_vinfo
).find_slot (slot
, INSERT
);
1107 if (!supportable_dr_alignment
&& !flag_vect_cost_model
)
1108 slot
->count
+= VECT_MAX_COST
;
1112 /* Traverse peeling hash table to find peeling option that aligns maximum
1113 number of data accesses. */
/* NOTE(review): mangled extraction — leading integers are upstream line
   numbers; brace-only and return lines are missing.  Restore from upstream
   before compiling; only comments were added here.  */
/* hash_table traverse callback: records in *MAX the entry with the highest
   count, preferring the smaller npeel on ties.  */
1116 vect_peeling_hash_get_most_frequent (_vect_peel_info
**slot
,
1117 _vect_peel_extended_info
*max
)
1119 vect_peel_info elem
= *slot
;
/* Strictly more accesses aligned, or same count with fewer peel iters.  */
1121 if (elem
->count
> max
->peel_info
.count
1122 || (elem
->count
== max
->peel_info
.count
1123 && max
->peel_info
.npeel
> elem
->npeel
))
1125 max
->peel_info
.npeel
= elem
->npeel
;
1126 max
->peel_info
.count
= elem
->count
;
1127 max
->peel_info
.dr
= elem
->dr
;
1134 /* Traverse peeling hash table and calculate cost for each peeling option.
1135 Find the one with the lowest cost. */
/* NOTE(review): mangled extraction — leading integers are upstream line
   numbers; brace/continue/else lines and some argument lines (e.g. the
   trailing args of the calls at upstream 1168 and 1174-1177) are missing.
   Restore from upstream before compiling; only comments were added here.  */
/* hash_table traverse callback: simulates peeling by ELEM->npeel for every
   data-ref, sums the resulting access costs plus the known peeling cost, and
   records the cheapest option in *MIN.  */
1138 vect_peeling_hash_get_lowest_cost (_vect_peel_info
**slot
,
1139 _vect_peel_extended_info
*min
)
1141 vect_peel_info elem
= *slot
;
1142 int save_misalignment
, dummy
;
1143 unsigned int inside_cost
= 0, outside_cost
= 0, i
;
1144 gimple stmt
= DR_STMT (elem
->dr
);
1145 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
1146 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
1147 vec
<data_reference_p
> datarefs
= LOOP_VINFO_DATAREFS (loop_vinfo
);
1148 struct data_reference
*dr
;
1149 stmt_vector_for_cost prologue_cost_vec
, body_cost_vec
, epilogue_cost_vec
;
1150 int single_iter_cost
;
1152 prologue_cost_vec
.create (2);
1153 body_cost_vec
.create (2);
1154 epilogue_cost_vec
.create (2);
/* Cost every data-ref as if we peeled by ELEM->npeel, then restore the
   original misalignment so the simulation has no lasting effect.  */
1156 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
1158 stmt
= DR_STMT (dr
);
1159 stmt_info
= vinfo_for_stmt (stmt
);
1160 /* For interleaving, only the alignment of the first access
1162 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1163 && GROUP_FIRST_ELEMENT (stmt_info
) != stmt
)
1166 save_misalignment
= DR_MISALIGNMENT (dr
);
1167 vect_update_misalignment_for_peel (dr
, elem
->dr
, elem
->npeel
);
1168 vect_get_data_access_cost (dr
, &inside_cost
, &outside_cost
,
1170 SET_DR_MISALIGNMENT (dr
, save_misalignment
);
1173 single_iter_cost
= vect_get_single_scalar_iteration_cost (loop_vinfo
);
1174 outside_cost
+= vect_get_known_peeling_cost (loop_vinfo
, elem
->npeel
,
1175 &dummy
, single_iter_cost
,
1177 &epilogue_cost_vec
);
1179 /* Prologue and epilogue costs are added to the target model later.
1180 These costs depend only on the scalar iteration cost, the
1181 number of peeling iterations finally chosen, and the number of
1182 misaligned statements. So discard the information found here. */
1183 prologue_cost_vec
.release ();
1184 epilogue_cost_vec
.release ();
/* Keep this option if strictly cheaper inside the loop, or equal inside
   and cheaper outside.  MIN takes ownership of body_cost_vec then.  */
1186 if (inside_cost
< min
->inside_cost
1187 || (inside_cost
== min
->inside_cost
&& outside_cost
< min
->outside_cost
))
1189 min
->inside_cost
= inside_cost
;
1190 min
->outside_cost
= outside_cost
;
1191 min
->body_cost_vec
.release ();
1192 min
->body_cost_vec
= body_cost_vec
;
1193 min
->peel_info
.dr
= elem
->dr
;
1194 min
->peel_info
.npeel
= elem
->npeel
;
/* Presumably on an "else" path (not the new minimum) — the else line is
   missing from this extraction; confirm upstream.  */
1197 body_cost_vec
.release ();
1203 /* Choose best peeling option by traversing peeling hash table and either
1204 choosing an option with the lowest cost (if cost model is enabled) or the
1205 option that aligns as many accesses as possible. */
/* NOTE(review): mangled extraction — leading integers are upstream line
   numbers; brace and "else" lines are missing (the two traversals below are
   the if/else arms of the flag_vect_cost_model test).  Restore from upstream
   before compiling; only comments were added here.  */
/* Returns the chosen data-ref; stores the peel count in *NPEEL and hands the
   winning body cost vector to the caller via *BODY_COST_VEC.  */
1207 static struct data_reference
*
1208 vect_peeling_hash_choose_best_peeling (loop_vec_info loop_vinfo
,
1209 unsigned int *npeel
,
1210 stmt_vector_for_cost
*body_cost_vec
)
1212 struct _vect_peel_extended_info res
;
1214 res
.peel_info
.dr
= NULL
;
1215 res
.body_cost_vec
= stmt_vector_for_cost();
/* Cost model on: pick the cheapest option.  */
1217 if (flag_vect_cost_model
)
1219 res
.inside_cost
= INT_MAX
;
1220 res
.outside_cost
= INT_MAX
;
1221 LOOP_VINFO_PEELING_HTAB (loop_vinfo
)
1222 .traverse
<_vect_peel_extended_info
*,
1223 vect_peeling_hash_get_lowest_cost
> (&res
);
/* Cost model off: pick the option aligning the most accesses.  */
1227 res
.peel_info
.count
= 0;
1228 LOOP_VINFO_PEELING_HTAB (loop_vinfo
)
1229 .traverse
<_vect_peel_extended_info
*,
1230 vect_peeling_hash_get_most_frequent
> (&res
);
1233 *npeel
= res
.peel_info
.npeel
;
1234 *body_cost_vec
= res
.body_cost_vec
;
1235 return res
.peel_info
.dr
;
1239 /* Function vect_enhance_data_refs_alignment
1241 This pass will use loop versioning and loop peeling in order to enhance
1242 the alignment of data references in the loop.
1244 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1245 original loop is to be vectorized. Any other loops that are created by
1246 the transformations performed in this pass - are not supposed to be
1247 vectorized. This restriction will be relaxed.
1249 This pass will require a cost model to guide it whether to apply peeling
1250 or versioning or a combination of the two. For example, the scheme that
1251 intel uses when given a loop with several memory accesses, is as follows:
1252 choose one memory access ('p') which alignment you want to force by doing
1253 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1254 other accesses are not necessarily aligned, or (2) use loop versioning to
1255 generate one loop in which all accesses are aligned, and another loop in
1256 which only 'p' is necessarily aligned.
1258 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1259 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
1260 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1262 Devising a cost model is the most critical aspect of this work. It will
1263 guide us on which access to peel for, whether to use loop versioning, how
1264 many versions to create, etc. The cost model will probably consist of
1265 generic considerations as well as target specific considerations (on
1266 powerpc for example, misaligned stores are more painful than misaligned
1269 Here are the general steps involved in alignment enhancements:
1271 -- original loop, before alignment analysis:
1272 for (i=0; i<N; i++){
1273 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1274 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1277 -- After vect_compute_data_refs_alignment:
1278 for (i=0; i<N; i++){
1279 x = q[i]; # DR_MISALIGNMENT(q) = 3
1280 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1283 -- Possibility 1: we do loop versioning:
1285 for (i=0; i<N; i++){ # loop 1A
1286 x = q[i]; # DR_MISALIGNMENT(q) = 3
1287 p[i] = y; # DR_MISALIGNMENT(p) = 0
1291 for (i=0; i<N; i++){ # loop 1B
1292 x = q[i]; # DR_MISALIGNMENT(q) = 3
1293 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1297 -- Possibility 2: we do loop peeling:
1298 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1302 for (i = 3; i < N; i++){ # loop 2A
1303 x = q[i]; # DR_MISALIGNMENT(q) = 0
1304 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1307 -- Possibility 3: combination of loop peeling and versioning:
1308 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1313 for (i = 3; i<N; i++){ # loop 3A
1314 x = q[i]; # DR_MISALIGNMENT(q) = 0
1315 p[i] = y; # DR_MISALIGNMENT(p) = 0
1319 for (i = 3; i<N; i++){ # loop 3B
1320 x = q[i]; # DR_MISALIGNMENT(q) = 0
1321 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1325 These loops are later passed to loop_transform to be vectorized. The
1326 vectorizer will use the alignment information to guide the transformation
1327 (whether to generate regular loads/stores, or with special handling for
/* NOTE(review): mangled extraction — the leading integers are the upstream
   file's own line numbers, identifiers are split across physical lines, and
   many structural lines (braces, else/continue/break/return statements, and
   some condition or argument lines — note the gaps in the embedded numbering,
   e.g. 1535->1539, 1608->1612) are missing.  Restore from the upstream file
   before compiling; only comments were added in this review.  */
1331 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo
)
1333 vec
<data_reference_p
> datarefs
= LOOP_VINFO_DATAREFS (loop_vinfo
);
1334 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1335 enum dr_alignment_support supportable_dr_alignment
;
1336 struct data_reference
*dr0
= NULL
, *first_store
= NULL
;
1337 struct data_reference
*dr
;
1339 bool do_peeling
= false;
1340 bool do_versioning
= false;
1343 stmt_vec_info stmt_info
;
1344 unsigned int npeel
= 0;
1345 bool all_misalignments_unknown
= true;
1346 unsigned int vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
1347 unsigned possible_npeel_number
= 1;
1349 unsigned int nelements
, mis
, same_align_drs_max
= 0;
1350 stmt_vector_for_cost body_cost_vec
= stmt_vector_for_cost();
1352 if (dump_enabled_p ())
1353 dump_printf_loc (MSG_NOTE
, vect_location
,
1354 "=== vect_enhance_data_refs_alignment ===");
1356 /* While cost model enhancements are expected in the future, the high level
1357 view of the code at this time is as follows:
1359 A) If there is a misaligned access then see if peeling to align
1360 this access can make all data references satisfy
1361 vect_supportable_dr_alignment. If so, update data structures
1362 as needed and return true.
1364 B) If peeling wasn't possible and there is a data reference with an
1365 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1366 then see if loop versioning checks can be used to make all data
1367 references satisfy vect_supportable_dr_alignment. If so, update
1368 data structures as needed and return true.
1370 C) If neither peeling nor versioning were successful then return false if
1371 any data reference does not satisfy vect_supportable_dr_alignment.
1373 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1375 Note, Possibility 3 above (which is peeling and versioning together) is not
1376 being done at this time. */
1378 /* (1) Peeling to force alignment. */
1380 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1382 + How many accesses will become aligned due to the peeling
1383 - How many accesses will become unaligned due to the peeling,
1384 and the cost of misaligned accesses.
1385 - The cost of peeling (the extra runtime checks, the increase
1388 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
1390 stmt
= DR_STMT (dr
);
1391 stmt_info
= vinfo_for_stmt (stmt
);
1393 if (!STMT_VINFO_RELEVANT_P (stmt_info
))
1396 /* For interleaving, only the alignment of the first access
1398 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1399 && GROUP_FIRST_ELEMENT (stmt_info
) != stmt
)
1402 /* For invariant accesses there is nothing to enhance. */
1403 if (integer_zerop (DR_STEP (dr
)))
1406 /* Strided loads perform only component accesses, alignment is
1407 irrelevant for them. */
1408 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info
))
1411 supportable_dr_alignment
= vect_supportable_dr_alignment (dr
, true);
1412 do_peeling
= vector_alignment_reachable_p (dr
);
/* Known misalignment: enumerate candidate peel amounts for this DR and
   record each in the peeling hash table.  */
1415 if (known_alignment_for_access_p (dr
))
1417 unsigned int npeel_tmp
;
1418 bool negative
= tree_int_cst_compare (DR_STEP (dr
),
1419 size_zero_node
) < 0;
1421 /* Save info about DR in the hash table. */
1422 if (!LOOP_VINFO_PEELING_HTAB (loop_vinfo
).is_created ())
1423 LOOP_VINFO_PEELING_HTAB (loop_vinfo
).create (1);
1425 vectype
= STMT_VINFO_VECTYPE (stmt_info
);
1426 nelements
= TYPE_VECTOR_SUBPARTS (vectype
);
1427 mis
= DR_MISALIGNMENT (dr
) / GET_MODE_SIZE (TYPE_MODE (
1428 TREE_TYPE (DR_REF (dr
))));
1429 npeel_tmp
= (negative
1430 ? (mis
- nelements
) : (nelements
- mis
))
1433 /* For multiple types, it is possible that the bigger type access
1434 will have more than one peeling option. E.g., a loop with two
1435 types: one of size (vector size / 4), and the other one of
1436 size (vector size / 8). Vectorization factor will 8. If both
1437 access are misaligned by 3, the first one needs one scalar
1438 iteration to be aligned, and the second one needs 5. But the
1439 the first one will be aligned also by peeling 5 scalar
1440 iterations, and in that case both accesses will be aligned.
1441 Hence, except for the immediate peeling amount, we also want
1442 to try to add full vector size, while we don't exceed
1443 vectorization factor.
1444 We do this automtically for cost model, since we calculate cost
1445 for every peeling option. */
1446 if (!flag_vect_cost_model
)
1447 possible_npeel_number
= vf
/nelements
;
1449 /* Handle the aligned case. We may decide to align some other
1450 access, making DR unaligned. */
1451 if (DR_MISALIGNMENT (dr
) == 0)
1454 if (!flag_vect_cost_model
)
1455 possible_npeel_number
++;
1458 for (j
= 0; j
< possible_npeel_number
; j
++)
1460 gcc_assert (npeel_tmp
<= vf
);
1461 vect_peeling_hash_insert (loop_vinfo
, dr
, npeel_tmp
);
1462 npeel_tmp
+= nelements
;
1465 all_misalignments_unknown
= false;
1466 /* Data-ref that was chosen for the case that all the
1467 misalignments are unknown is not relevant anymore, since we
1468 have a data-ref with known alignment. */
1473 /* If we don't know any misalignment values, we prefer
1474 peeling for data-ref that has the maximum number of data-refs
1475 with the same alignment, unless the target prefers to align
1476 stores over load. */
1477 if (all_misalignments_unknown
)
1479 unsigned same_align_drs
1480 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info
).length ();
1482 || same_align_drs_max
< same_align_drs
)
1484 same_align_drs_max
= same_align_drs
;
1487 /* For data-refs with the same number of related
1488 accesses prefer the one where the misalign
1489 computation will be invariant in the outermost loop. */
1490 else if (same_align_drs_max
== same_align_drs
)
1492 struct loop
*ivloop0
, *ivloop
;
1493 ivloop0
= outermost_invariant_loop_for_expr
1494 (loop
, DR_BASE_ADDRESS (dr0
));
1495 ivloop
= outermost_invariant_loop_for_expr
1496 (loop
, DR_BASE_ADDRESS (dr
));
1497 if ((ivloop
&& !ivloop0
)
1498 || (ivloop
&& ivloop0
1499 && flow_loop_nested_p (ivloop
, ivloop0
)))
1503 if (!first_store
&& DR_IS_WRITE (dr
))
1507 /* If there are both known and unknown misaligned accesses in the
1508 loop, we choose peeling amount according to the known
1510 if (!supportable_dr_alignment
)
1513 if (!first_store
&& DR_IS_WRITE (dr
))
1520 if (!aligned_access_p (dr
))
1522 if (dump_enabled_p ())
1523 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1524 "vector alignment may not be reachable");
1530 /* Check if we can possibly peel the loop. */
1531 if (!vect_can_advance_ivs_p (loop_vinfo
)
1532 || !slpeel_can_duplicate_loop_p (loop
, single_exit (loop
)))
1535 if (do_peeling
&& all_misalignments_unknown
1536 && vect_supportable_dr_alignment (dr0
, false))
1539 /* Check if the target requires to prefer stores over loads, i.e., if
1540 misaligned stores are more expensive than misaligned loads (taking
1541 drs with same alignment into account). */
1542 if (first_store
&& DR_IS_READ (dr0
))
1544 unsigned int load_inside_cost
= 0, load_outside_cost
= 0;
1545 unsigned int store_inside_cost
= 0, store_outside_cost
= 0;
1546 unsigned int load_inside_penalty
= 0, load_outside_penalty
= 0;
1547 unsigned int store_inside_penalty
= 0, store_outside_penalty
= 0;
1548 stmt_vector_for_cost dummy
;
1551 vect_get_data_access_cost (dr0
, &load_inside_cost
, &load_outside_cost
,
1553 vect_get_data_access_cost (first_store
, &store_inside_cost
,
1554 &store_outside_cost
, &dummy
);
1558 /* Calculate the penalty for leaving FIRST_STORE unaligned (by
1559 aligning the load DR0). */
1560 load_inside_penalty
= store_inside_cost
;
1561 load_outside_penalty
= store_outside_cost
;
1563 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1564 DR_STMT (first_store
))).iterate (i
, &dr
);
1566 if (DR_IS_READ (dr
))
1568 load_inside_penalty
+= load_inside_cost
;
1569 load_outside_penalty
+= load_outside_cost
;
1573 load_inside_penalty
+= store_inside_cost
;
1574 load_outside_penalty
+= store_outside_cost
;
1577 /* Calculate the penalty for leaving DR0 unaligned (by
1578 aligning the FIRST_STORE). */
1579 store_inside_penalty
= load_inside_cost
;
1580 store_outside_penalty
= load_outside_cost
;
1582 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1583 DR_STMT (dr0
))).iterate (i
, &dr
);
1585 if (DR_IS_READ (dr
))
1587 store_inside_penalty
+= load_inside_cost
;
1588 store_outside_penalty
+= load_outside_cost
;
1592 store_inside_penalty
+= store_inside_cost
;
1593 store_outside_penalty
+= store_outside_cost
;
/* Prefer aligning the store when it is the more expensive side.  */
1596 if (load_inside_penalty
> store_inside_penalty
1597 || (load_inside_penalty
== store_inside_penalty
1598 && load_outside_penalty
> store_outside_penalty
))
1602 /* In case there are only loads with different unknown misalignments, use
1603 peeling only if it may help to align other accesses in the loop. */
1605 && !STMT_VINFO_SAME_ALIGN_REFS (
1606 vinfo_for_stmt (DR_STMT (dr0
))).length ()
1607 && vect_supportable_dr_alignment (dr0
, false)
1608 != dr_unaligned_supported
)
1612 if (do_peeling
&& !dr0
)
1614 /* Peeling is possible, but there is no data access that is not supported
1615 unless aligned. So we try to choose the best possible peeling. */
1617 /* We should get here only if there are drs with known misalignment. */
1618 gcc_assert (!all_misalignments_unknown
);
1620 /* Choose the best peeling from the hash table. */
1621 dr0
= vect_peeling_hash_choose_best_peeling (loop_vinfo
, &npeel
,
1629 stmt
= DR_STMT (dr0
);
1630 stmt_info
= vinfo_for_stmt (stmt
);
1631 vectype
= STMT_VINFO_VECTYPE (stmt_info
);
1632 nelements
= TYPE_VECTOR_SUBPARTS (vectype
);
1634 if (known_alignment_for_access_p (dr0
))
1636 bool negative
= tree_int_cst_compare (DR_STEP (dr0
),
1637 size_zero_node
) < 0;
1640 /* Since it's known at compile time, compute the number of
1641 iterations in the peeled loop (the peeling factor) for use in
1642 updating DR_MISALIGNMENT values. The peeling factor is the
1643 vectorization factor minus the misalignment as an element
1645 mis
= DR_MISALIGNMENT (dr0
);
1646 mis
/= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0
))));
1647 npeel
= ((negative
? mis
- nelements
: nelements
- mis
)
1651 /* For interleaved data access every iteration accesses all the
1652 members of the group, therefore we divide the number of iterations
1653 by the group size. */
1654 stmt_info
= vinfo_for_stmt (DR_STMT (dr0
));
1655 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1656 npeel
/= GROUP_SIZE (stmt_info
);
1658 if (dump_enabled_p ())
1659 dump_printf_loc (MSG_NOTE
, vect_location
,
1660 "Try peeling by %d", npeel
);
1663 /* Ensure that all data refs can be vectorized after the peel. */
1664 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
1666 int save_misalignment
;
1671 stmt
= DR_STMT (dr
);
1672 stmt_info
= vinfo_for_stmt (stmt
);
1673 /* For interleaving, only the alignment of the first access
1675 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1676 && GROUP_FIRST_ELEMENT (stmt_info
) != stmt
)
1679 /* Strided loads perform only component accesses, alignment is
1680 irrelevant for them. */
1681 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info
))
1684 save_misalignment
= DR_MISALIGNMENT (dr
);
1685 vect_update_misalignment_for_peel (dr
, dr0
, npeel
);
1686 supportable_dr_alignment
= vect_supportable_dr_alignment (dr
, false);
1687 SET_DR_MISALIGNMENT (dr
, save_misalignment
);
1689 if (!supportable_dr_alignment
)
1696 if (do_peeling
&& known_alignment_for_access_p (dr0
) && npeel
== 0)
1698 stat
= vect_verify_datarefs_alignment (loop_vinfo
, NULL
);
1703 body_cost_vec
.release ();
1710 stmt_info_for_cost
*si
;
1711 void *data
= LOOP_VINFO_TARGET_COST_DATA (loop_vinfo
);
1713 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1714 If the misalignment of DR_i is identical to that of dr0 then set
1715 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
1716 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1717 by the peeling factor times the element size of DR_i (MOD the
1718 vectorization factor times the size). Otherwise, the
1719 misalignment of DR_i must be set to unknown. */
1720 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
1722 vect_update_misalignment_for_peel (dr
, dr0
, npeel
);
1724 LOOP_VINFO_UNALIGNED_DR (loop_vinfo
) = dr0
;
1726 LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo
) = npeel
;
/* Presumably the else arm (npeel unknown at compile time) — the guard
   line is missing from this extraction; confirm upstream.  */
1728 LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo
) = DR_MISALIGNMENT (dr0
);
1729 SET_DR_MISALIGNMENT (dr0
, 0);
1730 if (dump_enabled_p ())
1732 dump_printf_loc (MSG_NOTE
, vect_location
,
1733 "Alignment of access forced using peeling.");
1734 dump_printf_loc (MSG_NOTE
, vect_location
,
1735 "Peeling for alignment will be applied.");
1737 /* We've delayed passing the inside-loop peeling costs to the
1738 target cost model until we were sure peeling would happen.
1740 if (body_cost_vec
.exists ())
1742 FOR_EACH_VEC_ELT (body_cost_vec
, i
, si
)
1744 struct _stmt_vec_info
*stmt_info
1745 = si
->stmt
? vinfo_for_stmt (si
->stmt
) : NULL
;
1746 (void) add_stmt_cost (data
, si
->count
, si
->kind
, stmt_info
,
1747 si
->misalign
, vect_body
);
1749 body_cost_vec
.release ();
1752 stat
= vect_verify_datarefs_alignment (loop_vinfo
, NULL
);
1758 body_cost_vec
.release ();
1760 /* (2) Versioning to force alignment. */
1762 /* Try versioning if:
1763 1) flag_tree_vect_loop_version is TRUE
1764 2) optimize loop for speed
1765 3) there is at least one unsupported misaligned data ref with an unknown
1767 4) all misaligned data refs with a known misalignment are supported, and
1768 5) the number of runtime alignment checks is within reason. */
1771 flag_tree_vect_loop_version
1772 && optimize_loop_nest_for_speed_p (loop
)
1773 && (!loop
->inner
); /* FORNOW */
1777 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
1779 stmt
= DR_STMT (dr
);
1780 stmt_info
= vinfo_for_stmt (stmt
);
1782 /* For interleaving, only the alignment of the first access
1784 if (aligned_access_p (dr
)
1785 || (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1786 && GROUP_FIRST_ELEMENT (stmt_info
) != stmt
))
1789 /* Strided loads perform only component accesses, alignment is
1790 irrelevant for them. */
1791 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info
))
1794 supportable_dr_alignment
= vect_supportable_dr_alignment (dr
, false);
1796 if (!supportable_dr_alignment
)
/* Versioning handles only unknown misalignment, and only up to the
   PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS limit of runtime checks.  */
1802 if (known_alignment_for_access_p (dr
)
1803 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).length ()
1804 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS
))
1806 do_versioning
= false;
1810 stmt
= DR_STMT (dr
);
1811 vectype
= STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt
));
1812 gcc_assert (vectype
);
1814 /* The rightmost bits of an aligned address must be zeros.
1815 Construct the mask needed for this test. For example,
1816 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
1817 mask must be 15 = 0xf. */
1818 mask
= GET_MODE_SIZE (TYPE_MODE (vectype
)) - 1;
1820 /* FORNOW: use the same mask to test all potentially unaligned
1821 references in the loop. The vectorizer currently supports
1822 a single vector size, see the reference to
1823 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
1824 vectorization factor is computed. */
1825 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo
)
1826 || LOOP_VINFO_PTR_MASK (loop_vinfo
) == mask
);
1827 LOOP_VINFO_PTR_MASK (loop_vinfo
) = mask
;
1828 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).safe_push (
1833 /* Versioning requires at least one misaligned data reference. */
1834 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo
))
1835 do_versioning
= false;
1836 else if (!do_versioning
)
1837 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).truncate (0);
1842 vec
<gimple
> may_misalign_stmts
1843 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
);
1846 /* It can now be assumed that the data references in the statements
1847 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
1848 of the loop being vectorized. */
1849 FOR_EACH_VEC_ELT (may_misalign_stmts
, i
, stmt
)
1851 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
1852 dr
= STMT_VINFO_DATA_REF (stmt_info
);
1853 SET_DR_MISALIGNMENT (dr
, 0);
1854 if (dump_enabled_p ())
1855 dump_printf_loc (MSG_NOTE
, vect_location
,
1856 "Alignment of access forced using versioning.");
1859 if (dump_enabled_p ())
1860 dump_printf_loc (MSG_NOTE
, vect_location
,
1861 "Versioning for alignment will be applied.");
1863 /* Peeling and versioning can't be done together at this time. */
1864 gcc_assert (! (do_peeling
&& do_versioning
));
1866 stat
= vect_verify_datarefs_alignment (loop_vinfo
, NULL
);
1871 /* This point is reached if neither peeling nor versioning is being done. */
1872 gcc_assert (! (do_peeling
|| do_versioning
));
1874 stat
= vect_verify_datarefs_alignment (loop_vinfo
, NULL
);
1879 /* Function vect_find_same_alignment_drs.
1881 Update group and alignment relations according to the chosen
1882 vectorization factor. */
/* NOTE(review): mangled extraction — leading integers are upstream line
   numbers; brace/return lines and part of the distance test (the "dist == 0"
   side of the condition at upstream 1929-1931) are missing.  Restore from
   upstream before compiling; only comments were added here.  */
/* For dependence relation DDR, records each side in the other's
   SAME_ALIGN_REFS list when the dependence distance is 0 modulo the
   vectorization factor and element sizes match.  */
1885 vect_find_same_alignment_drs (struct data_dependence_relation
*ddr
,
1886 loop_vec_info loop_vinfo
)
1889 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1890 int vectorization_factor
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
1891 struct data_reference
*dra
= DDR_A (ddr
);
1892 struct data_reference
*drb
= DDR_B (ddr
);
1893 stmt_vec_info stmtinfo_a
= vinfo_for_stmt (DR_STMT (dra
));
1894 stmt_vec_info stmtinfo_b
= vinfo_for_stmt (DR_STMT (drb
));
1895 int dra_size
= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra
))));
1896 int drb_size
= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb
))));
1897 lambda_vector dist_v
;
1898 unsigned int loop_depth
;
/* No dependence, or an unanalyzable one: nothing to relate.  */
1900 if (DDR_ARE_DEPENDENT (ddr
) == chrec_known
)
1906 if (DDR_ARE_DEPENDENT (ddr
) == chrec_dont_know
)
1909 /* Loop-based vectorization and known data dependence. */
1910 if (DDR_NUM_DIST_VECTS (ddr
) == 0)
1913 /* Data-dependence analysis reports a distance vector of zero
1914 for data-references that overlap only in the first iteration
1915 but have different sign step (see PR45764).
1916 So as a sanity check require equal DR_STEP. */
1917 if (!operand_equal_p (DR_STEP (dra
), DR_STEP (drb
), 0))
1920 loop_depth
= index_in_loop_nest (loop
->num
, DDR_LOOP_NEST (ddr
));
1921 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr
), i
, dist_v
)
1923 int dist
= dist_v
[loop_depth
];
1925 if (dump_enabled_p ())
1926 dump_printf_loc (MSG_NOTE
, vect_location
,
1927 "dependence distance = %d.", dist
);
1929 /* Same loop iteration. */
1931 || (dist
% vectorization_factor
== 0 && dra_size
== drb_size
))
1933 /* Two references with distance zero have the same alignment. */
1934 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a
).safe_push (drb
);
1935 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b
).safe_push (dra
);
1936 if (dump_enabled_p ())
1938 dump_printf_loc (MSG_NOTE
, vect_location
,
1939 "accesses have the same alignment.");
1940 dump_printf (MSG_NOTE
,
1941 "dependence distance modulo vf == 0 between ");
1942 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dra
));
1943 dump_printf (MSG_NOTE
, " and ");
1944 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (drb
));
1951 /* Function vect_analyze_data_refs_alignment
1953 Analyze the alignment of the data-references in the loop.
1954 Return FALSE if a data reference is found that cannot be vectorized. */
/* NOTE(review): mangled extraction — leading integers are upstream line
   numbers; the function's trailing lines (the rest of the failure message
   and the return statements after upstream 1980) are missing here.  Restore
   from upstream before compiling; only comments were added in this review.  */
1957 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo
,
1958 bb_vec_info bb_vinfo
)
1960 if (dump_enabled_p ())
1961 dump_printf_loc (MSG_NOTE
, vect_location
,
1962 "=== vect_analyze_data_refs_alignment ===");
1964 /* Mark groups of data references with same alignment using
1965 data dependence information. */
/* Presumably guarded by loop_vinfo being non-null (LOOP_VINFO_DDRS is
   loop-only) — the guard line is missing from this extraction; confirm
   upstream.  */
1968 vec
<ddr_p
> ddrs
= LOOP_VINFO_DDRS (loop_vinfo
);
1969 struct data_dependence_relation
*ddr
;
1972 FOR_EACH_VEC_ELT (ddrs
, i
, ddr
)
1973 vect_find_same_alignment_drs (ddr
, loop_vinfo
);
1976 if (!vect_compute_data_refs_alignment (loop_vinfo
, bb_vinfo
))
1978 if (dump_enabled_p ())
1979 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1980 "not vectorized: can't calculate alignment "
1989 /* Analyze groups of accesses: check that DR belongs to a group of
1990 accesses of legal size, step, etc. Detect gaps, single element
1991 interleaving, and other special cases. Set grouped access info.
1992 Collect groups of strided stores for further use in SLP analysis. */
1995 vect_analyze_group_access (struct data_reference
*dr
)
1997 tree step
= DR_STEP (dr
);
1998 tree scalar_type
= TREE_TYPE (DR_REF (dr
));
1999 HOST_WIDE_INT type_size
= TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type
));
2000 gimple stmt
= DR_STMT (dr
);
2001 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
2002 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
2003 bb_vec_info bb_vinfo
= STMT_VINFO_BB_VINFO (stmt_info
);
2004 HOST_WIDE_INT dr_step
= TREE_INT_CST_LOW (step
);
2005 HOST_WIDE_INT groupsize
, last_accessed_element
= 1;
2006 bool slp_impossible
= false;
2007 struct loop
*loop
= NULL
;
2010 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
2012 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2013 size of the interleaving group (including gaps). */
2014 groupsize
= absu_hwi (dr_step
) / type_size
;
2016 /* Not consecutive access is possible only if it is a part of interleaving. */
2017 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)))
2019 /* Check if it this DR is a part of interleaving, and is a single
2020 element of the group that is accessed in the loop. */
2022 /* Gaps are supported only for loads. STEP must be a multiple of the type
2023 size. The size of the group must be a power of 2. */
2025 && (dr_step
% type_size
) == 0
2027 && exact_log2 (groupsize
) != -1)
2029 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) = stmt
;
2030 GROUP_SIZE (vinfo_for_stmt (stmt
)) = groupsize
;
2031 if (dump_enabled_p ())
2033 dump_printf_loc (MSG_NOTE
, vect_location
,
2034 "Detected single element interleaving ");
2035 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dr
));
2036 dump_printf (MSG_NOTE
, " step ");
2037 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, step
);
2042 if (dump_enabled_p ())
2043 dump_printf_loc (MSG_NOTE
, vect_location
,
2044 "Data access with gaps requires scalar "
2048 if (dump_enabled_p ())
2049 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2050 "Peeling for outer loop is not"
2055 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
) = true;
2061 if (dump_enabled_p ())
2063 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2064 "not consecutive access ");
2065 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
2070 /* Mark the statement as unvectorizable. */
2071 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr
))) = false;
2078 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) == stmt
)
2080 /* First stmt in the interleaving chain. Check the chain. */
2081 gimple next
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt
));
2082 struct data_reference
*data_ref
= dr
;
2083 unsigned int count
= 1;
2084 tree prev_init
= DR_INIT (data_ref
);
2086 HOST_WIDE_INT diff
, gaps
= 0;
2087 unsigned HOST_WIDE_INT count_in_bytes
;
2091 /* Skip same data-refs. In case that two or more stmts share
2092 data-ref (supported only for loads), we vectorize only the first
2093 stmt, and the rest get their vectorized loads from the first
2095 if (!tree_int_cst_compare (DR_INIT (data_ref
),
2096 DR_INIT (STMT_VINFO_DATA_REF (
2097 vinfo_for_stmt (next
)))))
2099 if (DR_IS_WRITE (data_ref
))
2101 if (dump_enabled_p ())
2102 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2103 "Two store stmts share the same dr.");
2107 /* For load use the same data-ref load. */
2108 GROUP_SAME_DR_STMT (vinfo_for_stmt (next
)) = prev
;
2111 next
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (next
));
2116 data_ref
= STMT_VINFO_DATA_REF (vinfo_for_stmt (next
));
2118 /* All group members have the same STEP by construction. */
2119 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref
), step
, 0));
2121 /* Check that the distance between two accesses is equal to the type
2122 size. Otherwise, we have gaps. */
2123 diff
= (TREE_INT_CST_LOW (DR_INIT (data_ref
))
2124 - TREE_INT_CST_LOW (prev_init
)) / type_size
;
2127 /* FORNOW: SLP of accesses with gaps is not supported. */
2128 slp_impossible
= true;
2129 if (DR_IS_WRITE (data_ref
))
2131 if (dump_enabled_p ())
2132 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2133 "interleaved store with gaps");
2140 last_accessed_element
+= diff
;
2142 /* Store the gap from the previous member of the group. If there is no
2143 gap in the access, GROUP_GAP is always 1. */
2144 GROUP_GAP (vinfo_for_stmt (next
)) = diff
;
2146 prev_init
= DR_INIT (data_ref
);
2147 next
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (next
));
2148 /* Count the number of data-refs in the chain. */
2152 /* COUNT is the number of accesses found, we multiply it by the size of
2153 the type to get COUNT_IN_BYTES. */
2154 count_in_bytes
= type_size
* count
;
2156 /* Check that the size of the interleaving (including gaps) is not
2157 greater than STEP. */
2159 && absu_hwi (dr_step
) < count_in_bytes
+ gaps
* type_size
)
2161 if (dump_enabled_p ())
2163 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2164 "interleaving size is greater than step for ");
2165 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, DR_REF (dr
));
2170 /* Check that the size of the interleaving is equal to STEP for stores,
2171 i.e., that there are no gaps. */
2173 && absu_hwi (dr_step
) != count_in_bytes
)
2175 if (DR_IS_READ (dr
))
2177 slp_impossible
= true;
2178 /* There is a gap after the last load in the group. This gap is a
2179 difference between the groupsize and the number of elements.
2180 When there is no gap, this difference should be 0. */
2181 GROUP_GAP (vinfo_for_stmt (stmt
)) = groupsize
- count
;
2185 if (dump_enabled_p ())
2186 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2187 "interleaved store with gaps");
2192 /* Check that STEP is a multiple of type size. */
2194 && (dr_step
% type_size
) != 0)
2196 if (dump_enabled_p ())
2198 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2199 "step is not a multiple of type size: step ");
2200 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, step
);
2201 dump_printf (MSG_MISSED_OPTIMIZATION
, " size ");
2202 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
2203 TYPE_SIZE_UNIT (scalar_type
));
2211 GROUP_SIZE (vinfo_for_stmt (stmt
)) = groupsize
;
2212 if (dump_enabled_p ())
2213 dump_printf_loc (MSG_NOTE
, vect_location
,
2214 "Detected interleaving of size %d", (int)groupsize
);
2216 /* SLP: create an SLP data structure for every interleaving group of
2217 stores for further analysis in vect_analyse_slp. */
2218 if (DR_IS_WRITE (dr
) && !slp_impossible
)
2221 LOOP_VINFO_GROUPED_STORES (loop_vinfo
).safe_push (stmt
);
2223 BB_VINFO_GROUPED_STORES (bb_vinfo
).safe_push (stmt
);
2226 /* There is a gap in the end of the group. */
2227 if (groupsize
- last_accessed_element
> 0 && loop_vinfo
)
2229 if (dump_enabled_p ())
2230 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2231 "Data access with gaps requires scalar "
2235 if (dump_enabled_p ())
2236 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2237 "Peeling for outer loop is not supported");
2241 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
) = true;
2249 /* Analyze the access pattern of the data-reference DR.
2250 In case of non-consecutive accesses call vect_analyze_group_access() to
2251 analyze groups of accesses. */
2254 vect_analyze_data_ref_access (struct data_reference
*dr
)
2256 tree step
= DR_STEP (dr
);
2257 tree scalar_type
= TREE_TYPE (DR_REF (dr
));
2258 gimple stmt
= DR_STMT (dr
);
2259 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
2260 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
2261 struct loop
*loop
= NULL
;
2264 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
2266 if (loop_vinfo
&& !step
)
2268 if (dump_enabled_p ())
2269 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2270 "bad data-ref access in loop");
2274 /* Allow invariant loads in loops. */
2275 if (loop_vinfo
&& integer_zerop (step
))
2277 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) = NULL
;
2278 return DR_IS_READ (dr
);
2281 if (loop
&& nested_in_vect_loop_p (loop
, stmt
))
2283 /* Interleaved accesses are not yet supported within outer-loop
2284 vectorization for references in the inner-loop. */
2285 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) = NULL
;
2287 /* For the rest of the analysis we use the outer-loop step. */
2288 step
= STMT_VINFO_DR_STEP (stmt_info
);
2289 if (integer_zerop (step
))
2291 if (dump_enabled_p ())
2292 dump_printf_loc (MSG_NOTE
, vect_location
,
2293 "zero step in outer loop.");
2294 if (DR_IS_READ (dr
))
2302 if (TREE_CODE (step
) == INTEGER_CST
)
2304 HOST_WIDE_INT dr_step
= TREE_INT_CST_LOW (step
);
2305 if (!tree_int_cst_compare (step
, TYPE_SIZE_UNIT (scalar_type
))
2307 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type
), -dr_step
)))
2309 /* Mark that it is not interleaving. */
2310 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) = NULL
;
2315 if (loop
&& nested_in_vect_loop_p (loop
, stmt
))
2317 if (dump_enabled_p ())
2318 dump_printf_loc (MSG_NOTE
, vect_location
,
2319 "grouped access in outer loop.");
2323 /* Assume this is a DR handled by non-constant strided load case. */
2324 if (TREE_CODE (step
) != INTEGER_CST
)
2325 return STMT_VINFO_STRIDE_LOAD_P (stmt_info
);
2327 /* Not consecutive access - check if it's a part of interleaving group. */
2328 return vect_analyze_group_access (dr
);
2333 /* A helper function used in the comparator function to sort data
2334 references. T1 and T2 are two data references to be compared.
2335 The function returns -1, 0, or 1. */
/* compare_tree: total order on two trees used by the data-ref sorting
   comparator below; returns -1, 0, or 1 (see the header comment above).
   NOTE(review): garbled extraction -- several original lines (early
   returns, the switch scaffolding, NULL checks) are missing; only
   comments were added here.  */
2338 compare_tree (tree t1
, tree t2
)
2341 enum tree_code code
;
/* Trees with different codes order by the code number itself.  */
2352 if (TREE_CODE (t1
) != TREE_CODE (t2
))
2353 return TREE_CODE (t1
) < TREE_CODE (t2
) ? -1 : 1;
2355 code
= TREE_CODE (t1
);
2358 /* For const values, we can just use hash values for comparisons. */
2366 hashval_t h1
= iterative_hash_expr (t1
, 0);
2367 hashval_t h2
= iterative_hash_expr (t2
, 0);
2369 return h1
< h2
? -1 : 1;
/* SSA_NAMEs: order by the underlying variable, then by version number.  */
2374 cmp
= compare_tree (SSA_NAME_VAR (t1
), SSA_NAME_VAR (t2
));
2378 if (SSA_NAME_VERSION (t1
) != SSA_NAME_VERSION (t2
))
2379 return SSA_NAME_VERSION (t1
) < SSA_NAME_VERSION (t2
) ? -1 : 1;
2383 tclass
= TREE_CODE_CLASS (code
);
2385 /* For var-decl, we could compare their UIDs. */
2386 if (tclass
== tcc_declaration
)
2388 if (DECL_UID (t1
) != DECL_UID (t2
))
2389 return DECL_UID (t1
) < DECL_UID (t2
) ? -1 : 1;
2393 /* For expressions with operands, compare their operands recursively. */
2394 for (i
= TREE_OPERAND_LENGTH (t1
) - 1; i
>= 0; --i
)
2396 cmp
= compare_tree (TREE_OPERAND (t1
, i
), TREE_OPERAND (t2
, i
));
2406 /* Compare two data-references DRA and DRB to group them into chunks
2407 suitable for grouping. */
/* dr_group_sort_cmp: qsort comparator that groups related data references
   next to each other (used by vect_analyze_data_ref_accesses below).
   Orders by base address, then DR_OFFSET, reads before writes, access
   size, step, and finally DR_INIT with stmt UID as the stable tie-break.
   NOTE(review): garbled extraction -- the intermediate
   "return cmp when nonzero" style lines after each compare_tree call are
   missing here; only comments were added.  */
2410 dr_group_sort_cmp (const void *dra_
, const void *drb_
)
2412 data_reference_p dra
= *(data_reference_p
*)const_cast<void *>(dra_
);
2413 data_reference_p drb
= *(data_reference_p
*)const_cast<void *>(drb_
);
2416 /* Stabilize sort. */
2420 /* Ordering of DRs according to base. */
2421 if (!operand_equal_p (DR_BASE_ADDRESS (dra
), DR_BASE_ADDRESS (drb
), 0))
2423 cmp
= compare_tree (DR_BASE_ADDRESS (dra
), DR_BASE_ADDRESS (drb
));
2428 /* And according to DR_OFFSET. */
2429 if (!dr_equal_offsets_p (dra
, drb
))
2431 cmp
= compare_tree (DR_OFFSET (dra
), DR_OFFSET (drb
));
2436 /* Put reads before writes. */
2437 if (DR_IS_READ (dra
) != DR_IS_READ (drb
))
2438 return DR_IS_READ (dra
) ? -1 : 1;
2440 /* Then sort after access size. */
2441 if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra
))),
2442 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb
))), 0))
2444 cmp
= compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra
))),
2445 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb
))));
2450 /* And after step. */
2451 if (!operand_equal_p (DR_STEP (dra
), DR_STEP (drb
), 0))
2453 cmp
= compare_tree (DR_STEP (dra
), DR_STEP (drb
));
2458 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2459 cmp
= tree_int_cst_compare (DR_INIT (dra
), DR_INIT (drb
));
2461 return gimple_uid (DR_STMT (dra
)) < gimple_uid (DR_STMT (drb
)) ? -1 : 1;
2465 /* Function vect_analyze_data_ref_accesses.
2467 Analyze the access pattern of all the data references in the loop.
2469 FORNOW: the only access pattern that is considered vectorizable is a
2470 simple step 1 (consecutive) access.
2472 FORNOW: handle only arrays and pointer accesses. */
/* vect_analyze_data_ref_accesses: analyze the access pattern of all data
   references in the loop or basic block (see the function header comment
   above).  Sorts the datarefs with dr_group_sort_cmp, links interleaving
   group members via GROUP_FIRST/NEXT_ELEMENT, then validates each access
   with vect_analyze_data_ref_access.
   NOTE(review): garbled extraction with interior lines missing; only
   comments were added, the extracted code text is untouched.  */
2475 vect_analyze_data_ref_accesses (loop_vec_info loop_vinfo
, bb_vec_info bb_vinfo
)
2478 vec
<data_reference_p
> datarefs
;
2479 struct data_reference
*dr
;
2481 if (dump_enabled_p ())
2482 dump_printf_loc (MSG_NOTE
, vect_location
,
2483 "=== vect_analyze_data_ref_accesses ===");
/* Pick the dataref set from the loop or the basic-block info.  */
2486 datarefs
= LOOP_VINFO_DATAREFS (loop_vinfo
);
2488 datarefs
= BB_VINFO_DATAREFS (bb_vinfo
);
2490 if (datarefs
.is_empty ())
2493 /* Sort the array of datarefs to make building the interleaving chains
2495 qsort (datarefs
.address(), datarefs
.length (),
2496 sizeof (data_reference_p
), dr_group_sort_cmp
);
2498 /* Build the interleaving chains. */
2499 for (i
= 0; i
< datarefs
.length () - 1;)
2501 data_reference_p dra
= datarefs
[i
];
2502 stmt_vec_info stmtinfo_a
= vinfo_for_stmt (DR_STMT (dra
));
2503 stmt_vec_info lastinfo
= NULL
;
2504 for (i
= i
+ 1; i
< datarefs
.length (); ++i
)
2506 data_reference_p drb
= datarefs
[i
];
2507 stmt_vec_info stmtinfo_b
= vinfo_for_stmt (DR_STMT (drb
));
2509 /* ??? Imperfect sorting (non-compatible types, non-modulo
2510 accesses, same accesses) can lead to a group to be artificially
2511 split here as we don't just skip over those. If it really
2512 matters we can push those to a worklist and re-iterate
2513 over them. Then we can just skip ahead to the next DR here. */
2515 /* Check that the data-refs have same first location (except init)
2516 and they are both either store or load (not load and store). */
2517 if (DR_IS_READ (dra
) != DR_IS_READ (drb
)
2518 || !operand_equal_p (DR_BASE_ADDRESS (dra
),
2519 DR_BASE_ADDRESS (drb
), 0)
2520 || !dr_equal_offsets_p (dra
, drb
))
2523 /* Check that the data-refs have the same constant size and step. */
2524 tree sza
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra
)));
2525 tree szb
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb
)));
2526 if (!host_integerp (sza
, 1)
2527 || !host_integerp (szb
, 1)
2528 || !tree_int_cst_equal (sza
, szb
)
2529 || !host_integerp (DR_STEP (dra
), 0)
2530 || !host_integerp (DR_STEP (drb
), 0)
2531 || !tree_int_cst_equal (DR_STEP (dra
), DR_STEP (drb
)))
2534 /* Do not place the same access in the interleaving chain twice. */
2535 if (tree_int_cst_compare (DR_INIT (dra
), DR_INIT (drb
)) == 0)
2538 /* Check the types are compatible.
2539 ??? We don't distinguish this during sorting. */
2540 if (!types_compatible_p (TREE_TYPE (DR_REF (dra
)),
2541 TREE_TYPE (DR_REF (drb
))))
2544 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2545 HOST_WIDE_INT init_a
= TREE_INT_CST_LOW (DR_INIT (dra
));
2546 HOST_WIDE_INT init_b
= TREE_INT_CST_LOW (DR_INIT (drb
));
2547 gcc_assert (init_a
< init_b
);
2549 /* If init_b == init_a + the size of the type * k, we have an
2550 interleaving, and DRA is accessed before DRB. */
2551 HOST_WIDE_INT type_size_a
= TREE_INT_CST_LOW (sza
);
2552 if ((init_b
- init_a
) % type_size_a
!= 0)
2555 /* The step (if not zero) is greater than the difference between
2556 data-refs' inits. This splits groups into suitable sizes. */
2557 HOST_WIDE_INT step
= TREE_INT_CST_LOW (DR_STEP (dra
));
2558 if (step
!= 0 && step
<= (init_b
- init_a
))
2561 if (dump_enabled_p ())
2563 dump_printf_loc (MSG_NOTE
, vect_location
,
2564 "Detected interleaving ");
2565 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dra
));
2566 dump_printf (MSG_NOTE
, " and ");
2567 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (drb
));
2570 /* Link the found element into the group list. */
2571 if (!GROUP_FIRST_ELEMENT (stmtinfo_a
))
2573 GROUP_FIRST_ELEMENT (stmtinfo_a
) = DR_STMT (dra
);
2574 lastinfo
= stmtinfo_a
;
2576 GROUP_FIRST_ELEMENT (stmtinfo_b
) = DR_STMT (dra
);
2577 GROUP_NEXT_ELEMENT (lastinfo
) = DR_STMT (drb
);
2578 lastinfo
= stmtinfo_b
;
/* Validate each access pattern; a failure marks the stmt not vectorizable.  */
2582 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
2583 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr
)))
2584 && !vect_analyze_data_ref_access (dr
))
2586 if (dump_enabled_p ())
2587 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2588 "not vectorized: complicated access pattern.");
2592 /* Mark the statement as not vectorizable. */
2593 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr
))) = false;
2603 /* Function vect_prune_runtime_alias_test_list.
2605 Prune a list of ddrs to be tested at run-time by versioning for alias.
2606 Return FALSE if resulting list of ddrs is longer than allowed by
2607 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
/* vect_prune_runtime_alias_test_list: drop ddrs whose runtime alias-check
   ranges duplicate an earlier entry, then fail when more checks remain
   than PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS allows (see the header
   comment above).
   NOTE(review): garbled extraction with interior lines missing; only
   comments were added.  */
2610 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo
)
2613 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo
);
2616 if (dump_enabled_p ())
2617 dump_printf_loc (MSG_NOTE
, vect_location
,
2618 "=== vect_prune_runtime_alias_test_list ===");
/* Compare each ddr against all earlier ones; equal ranges are redundant.  */
2620 for (i
= 0; i
< ddrs
.length (); )
2628 for (j
= 0; j
< i
; j
++)
2630 ddr_p ddr_j
= ddrs
[j
];
2632 if (vect_vfa_range_equal (ddr_i
, ddr_j
))
2634 if (dump_enabled_p ())
2636 dump_printf_loc (MSG_NOTE
, vect_location
,
2637 "found equal ranges ");
2638 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (DDR_A (ddr_i
)));
2639 dump_printf (MSG_NOTE
, ", ");
2640 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (DDR_B (ddr_i
)));
2641 dump_printf (MSG_NOTE
, " and ");
2642 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (DDR_A (ddr_j
)));
2643 dump_printf (MSG_NOTE
, ", ");
2644 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (DDR_B (ddr_j
)));
2653 ddrs
.ordered_remove (i
);
/* Too many residual checks: give up on versioning for alias entirely.  */
2659 if (ddrs
.length () >
2660 (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS
))
2662 if (dump_enabled_p ())
2664 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2665 "disable versioning for alias - max number of "
2666 "generated checks exceeded.");
2669 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo
).truncate (0);
2677 /* Check whether a non-affine read in stmt is suitable for gather load
2678 and if so, return a builtin decl for that operation. */
/* vect_check_gather: check whether STMT's non-affine read fits the target
   gather-load addressing form and, presumably, return the gather builtin
   decl while filling *BASEP/*OFFP/*SCALEP -- TODO confirm, the tail of
   the function is missing from this extraction.
   NOTE(review): garbled extraction -- statements are split across lines
   and interior lines are missing; only comments were added.  */
2681 vect_check_gather (gimple stmt
, loop_vec_info loop_vinfo
, tree
*basep
,
2682 tree
*offp
, int *scalep
)
2684 HOST_WIDE_INT scale
= 1, pbitpos
, pbitsize
;
2685 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
2686 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
2687 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
2688 tree offtype
= NULL_TREE
;
2689 tree decl
, base
, off
;
2690 enum machine_mode pmode
;
2691 int punsignedp
, pvolatilep
;
2693 /* The gather builtins need address of the form
2694 loop_invariant + vector * {1, 2, 4, 8}
2696 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
2697 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
2698 of loop invariants/SSA_NAMEs defined in the loop, with casts,
2699 multiplications and additions in it. To get a vector, we need
2700 a single SSA_NAME that will be defined in the loop and will
2701 contain everything that is not loop invariant and that can be
2702 vectorized. The following code attempts to find such a preexisting
2703 SSA_NAME OFF and put the loop invariants into a tree BASE
2704 that can be gimplified before the loop. */
2705 base
= get_inner_reference (DR_REF (dr
), &pbitsize
, &pbitpos
, &off
,
2706 &pmode
, &punsignedp
, &pvolatilep
, false);
2707 gcc_assert (base
!= NULL_TREE
&& (pbitpos
% BITS_PER_UNIT
) == 0);
/* Strip a MEM_REF, folding its constant offset into OFF.  */
2709 if (TREE_CODE (base
) == MEM_REF
)
2711 if (!integer_zerop (TREE_OPERAND (base
, 1)))
2713 if (off
== NULL_TREE
)
2715 double_int moff
= mem_ref_offset (base
);
2716 off
= double_int_to_tree (sizetype
, moff
);
2719 off
= size_binop (PLUS_EXPR
, off
,
2720 fold_convert (sizetype
, TREE_OPERAND (base
, 1)));
2722 base
= TREE_OPERAND (base
, 0);
2725 base
= build_fold_addr_expr (base
);
2727 if (off
== NULL_TREE
)
2728 off
= size_zero_node
;
2730 /* If base is not loop invariant, either off is 0, then we start with just
2731 the constant offset in the loop invariant BASE and continue with base
2732 as OFF, otherwise give up.
2733 We could handle that case by gimplifying the addition of base + off
2734 into some SSA_NAME and use that as off, but for now punt. */
2735 if (!expr_invariant_in_loop_p (loop
, base
))
2737 if (!integer_zerop (off
))
2740 base
= size_int (pbitpos
/ BITS_PER_UNIT
);
2742 /* Otherwise put base + constant offset into the loop invariant BASE
2743 and continue with OFF. */
2746 base
= fold_convert (sizetype
, base
);
2747 base
= size_binop (PLUS_EXPR
, base
, size_int (pbitpos
/ BITS_PER_UNIT
));
2750 /* OFF at this point may be either a SSA_NAME or some tree expression
2751 from get_inner_reference. Try to peel off loop invariants from it
2752 into BASE as long as possible. */
2754 while (offtype
== NULL_TREE
)
2756 enum tree_code code
;
2757 tree op0
, op1
, add
= NULL_TREE
;
2759 if (TREE_CODE (off
) == SSA_NAME
)
2761 gimple def_stmt
= SSA_NAME_DEF_STMT (off
);
2763 if (expr_invariant_in_loop_p (loop
, off
))
2766 if (gimple_code (def_stmt
) != GIMPLE_ASSIGN
)
2769 op0
= gimple_assign_rhs1 (def_stmt
);
2770 code
= gimple_assign_rhs_code (def_stmt
);
2771 op1
= gimple_assign_rhs2 (def_stmt
);
2775 if (get_gimple_rhs_class (TREE_CODE (off
)) == GIMPLE_TERNARY_RHS
)
2777 code
= TREE_CODE (off
);
2778 extract_ops_from_tree (off
, &code
, &op0
, &op1
);
/* Dispatch on the defining operation; invariant operands are folded into
   BASE (scaled as needed), the loop-varying part stays in OFF.  */
2782 case POINTER_PLUS_EXPR
:
2784 if (expr_invariant_in_loop_p (loop
, op0
))
2789 add
= fold_convert (sizetype
, add
);
2791 add
= size_binop (MULT_EXPR
, add
, size_int (scale
));
2792 base
= size_binop (PLUS_EXPR
, base
, add
);
2795 if (expr_invariant_in_loop_p (loop
, op1
))
2803 if (expr_invariant_in_loop_p (loop
, op1
))
2805 add
= fold_convert (sizetype
, op1
);
2806 add
= size_binop (MINUS_EXPR
, size_zero_node
, add
);
2812 if (scale
== 1 && host_integerp (op1
, 0))
2814 scale
= tree_low_cst (op1
, 0);
2823 if (!POINTER_TYPE_P (TREE_TYPE (op0
))
2824 && !INTEGRAL_TYPE_P (TREE_TYPE (op0
)))
2826 if (TYPE_PRECISION (TREE_TYPE (op0
))
2827 == TYPE_PRECISION (TREE_TYPE (off
)))
2832 if (TYPE_PRECISION (TREE_TYPE (op0
))
2833 < TYPE_PRECISION (TREE_TYPE (off
)))
2836 offtype
= TREE_TYPE (off
);
2847 /* If at the end OFF still isn't a SSA_NAME or isn't
2848 defined in the loop, punt. */
2849 if (TREE_CODE (off
) != SSA_NAME
2850 || expr_invariant_in_loop_p (loop
, off
))
2853 if (offtype
== NULL_TREE
)
2854 offtype
= TREE_TYPE (off
);
/* Ask the target for a gather builtin for this vectype/offset/scale.  */
2856 decl
= targetm
.vectorize
.builtin_gather (STMT_VINFO_VECTYPE (stmt_info
),
2858 if (decl
== NULL_TREE
)
2870 /* Function vect_analyze_data_refs.
2872 Find all the data references in the loop or basic block.
2874 The general structure of the analysis of data refs in the vectorizer is as
2876 1- vect_analyze_data_refs(loop/bb): call
2877 compute_data_dependences_for_loop/bb to find and analyze all data-refs
2878 in the loop/bb and their dependences.
2879 2- vect_analyze_dependences(): apply dependence testing using ddrs.
2880 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
2881 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
/* vect_analyze_data_refs: find all data references in the loop or basic
   block and validate each one, recording DR and vectype in the stmt's
   stmt_vec_info (see the function header comment above).  Rejects
   unsupported cases (unanalyzable refs, volatile, throwing stmts,
   bit-fields, drs in calls, multiple drs per stmt) and detects gather
   loads, SIMD lane accesses and strided loads.
   NOTE(review): garbled extraction with many interior lines missing
   (visible as jumps in the embedded numbering); only comments were
   added, the extracted code text is untouched.  */
2886 vect_analyze_data_refs (loop_vec_info loop_vinfo
,
2887 bb_vec_info bb_vinfo
,
2890 struct loop
*loop
= NULL
;
2891 basic_block bb
= NULL
;
2893 vec
<data_reference_p
> datarefs
;
2894 struct data_reference
*dr
;
2897 if (dump_enabled_p ())
2898 dump_printf_loc (MSG_NOTE
, vect_location
,
2899 "=== vect_analyze_data_refs ===\n");
/* Loop case: gather the loop nest and all datarefs in the loop.  */
2903 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
2904 if (!find_loop_nest (loop
, &LOOP_VINFO_LOOP_NEST (loop_vinfo
))
2905 || find_data_references_in_loop
2906 (loop
, &LOOP_VINFO_DATAREFS (loop_vinfo
)))
2908 if (dump_enabled_p ())
2909 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2910 "not vectorized: loop contains function calls"
2911 " or data references that cannot be analyzed");
2915 datarefs
= LOOP_VINFO_DATAREFS (loop_vinfo
);
/* Basic-block case: walk the block's stmts collecting datarefs.  */
2919 gimple_stmt_iterator gsi
;
2921 bb
= BB_VINFO_BB (bb_vinfo
);
2922 for (gsi
= gsi_start_bb (bb
); !gsi_end_p (gsi
); gsi_next (&gsi
))
2924 gimple stmt
= gsi_stmt (gsi
);
2925 if (!find_data_references_in_stmt (NULL
, stmt
,
2926 &BB_VINFO_DATAREFS (bb_vinfo
)))
2928 /* Mark the rest of the basic-block as unvectorizable. */
2929 for (; !gsi_end_p (gsi
); gsi_next (&gsi
))
2931 stmt
= gsi_stmt (gsi
);
2932 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt
)) = false;
2938 datarefs
= BB_VINFO_DATAREFS (bb_vinfo
);
2941 /* Go through the data-refs, check that the analysis succeeded. Update
2942 pointer from stmt_vec_info struct to DR and vectype. */
2944 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
2947 stmt_vec_info stmt_info
;
2948 tree base
, offset
, init
;
2949 bool gather
= false;
2950 bool simd_lane_access
= false;
2954 if (!dr
|| !DR_REF (dr
))
2956 if (dump_enabled_p ())
2957 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2958 "not vectorized: unhandled data-ref ");
2962 stmt
= DR_STMT (dr
);
2963 stmt_info
= vinfo_for_stmt (stmt
);
2965 /* Discard clobbers from the dataref vector. We will remove
2966 clobber stmts during vectorization. */
2967 if (gimple_clobber_p (stmt
))
2969 if (i
== datarefs
.length () - 1)
2974 datarefs
[i
] = datarefs
.pop ();
2978 /* Check that analysis of the data-ref succeeded. */
2979 if (!DR_BASE_ADDRESS (dr
) || !DR_OFFSET (dr
) || !DR_INIT (dr
)
2984 && !TREE_THIS_VOLATILE (DR_REF (dr
))
2985 && targetm
.vectorize
.builtin_gather
!= NULL
;
2986 bool maybe_simd_lane_access
2987 = loop_vinfo
&& loop
->simduid
;
2989 /* If target supports vector gather loads, or if this might be
2990 a SIMD lane access, see if they can't be used. */
2992 && (maybe_gather
|| maybe_simd_lane_access
)
2993 && !nested_in_vect_loop_p (loop
, stmt
))
2995 struct data_reference
*newdr
2996 = create_data_ref (NULL
, loop_containing_stmt (stmt
),
2997 DR_REF (dr
), stmt
, true);
2998 gcc_assert (newdr
!= NULL
&& DR_REF (newdr
));
2999 if (DR_BASE_ADDRESS (newdr
)
3000 && DR_OFFSET (newdr
)
3003 && integer_zerop (DR_STEP (newdr
)))
3005 if (maybe_simd_lane_access
)
3007 tree off
= DR_OFFSET (newdr
);
/* Recognize the offset shape produced for GOMP SIMD lane accesses:
   a multiply whose index SSA_NAME is defined by IFN_GOMP_SIMD_LANE
   on this loop's simduid.  */
3009 if (TREE_CODE (DR_INIT (newdr
)) == INTEGER_CST
3010 && TREE_CODE (off
) == MULT_EXPR
3011 && host_integerp (TREE_OPERAND (off
, 1), 1))
3013 tree step
= TREE_OPERAND (off
, 1);
3014 off
= TREE_OPERAND (off
, 0);
3016 if (CONVERT_EXPR_P (off
)
3017 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off
,
3019 < TYPE_PRECISION (TREE_TYPE (off
)))
3020 off
= TREE_OPERAND (off
, 0);
3021 if (TREE_CODE (off
) == SSA_NAME
)
3023 gimple def
= SSA_NAME_DEF_STMT (off
);
3024 tree reft
= TREE_TYPE (DR_REF (newdr
));
3025 if (gimple_call_internal_p (def
)
3026 && gimple_call_internal_fn (def
)
3027 == IFN_GOMP_SIMD_LANE
)
3029 tree arg
= gimple_call_arg (def
, 0);
3030 gcc_assert (TREE_CODE (arg
) == SSA_NAME
);
3031 arg
= SSA_NAME_VAR (arg
);
3032 if (arg
== loop
->simduid
3034 && tree_int_cst_equal
3035 (TYPE_SIZE_UNIT (reft
),
3038 DR_OFFSET (newdr
) = ssize_int (0);
3039 DR_STEP (newdr
) = step
;
3041 simd_lane_access
= true;
3047 if (!simd_lane_access
&& maybe_gather
)
3053 if (!gather
&& !simd_lane_access
)
3054 free_data_ref (newdr
);
3057 if (!gather
&& !simd_lane_access
)
3059 if (dump_enabled_p ())
3061 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3062 "not vectorized: data ref analysis "
3064 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3074 if (TREE_CODE (DR_BASE_ADDRESS (dr
)) == INTEGER_CST
)
3076 if (dump_enabled_p ())
3077 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3078 "not vectorized: base addr of dr is a "
3084 if (gather
|| simd_lane_access
)
3089 if (TREE_THIS_VOLATILE (DR_REF (dr
)))
3091 if (dump_enabled_p ())
3093 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3094 "not vectorized: volatile type ");
3095 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3104 if (stmt_can_throw_internal (stmt
))
3106 if (dump_enabled_p ())
3108 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3109 "not vectorized: statement can throw an "
3111 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3117 if (gather
|| simd_lane_access
)
3122 if (TREE_CODE (DR_REF (dr
)) == COMPONENT_REF
3123 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr
), 1)))
3125 if (dump_enabled_p ())
3127 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3128 "not vectorized: statement is bitfield "
3130 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3136 if (gather
|| simd_lane_access
)
3141 base
= unshare_expr (DR_BASE_ADDRESS (dr
));
3142 offset
= unshare_expr (DR_OFFSET (dr
));
3143 init
= unshare_expr (DR_INIT (dr
));
3145 if (is_gimple_call (stmt
))
3147 if (dump_enabled_p ())
3149 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3150 "not vectorized: dr in a call ");
3151 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3157 if (gather
|| simd_lane_access
)
3162 /* Update DR field in stmt_vec_info struct. */
3164 /* If the dataref is in an inner-loop of the loop that is considered
3165 for vectorization, we also want to analyze the access relative to
3166 the outer-loop (DR contains information only relative to the
3167 inner-most enclosing loop). We do that by building a reference to the
3168 first location accessed by the inner-loop, and analyze it relative to
3170 if (loop
&& nested_in_vect_loop_p (loop
, stmt
))
3172 tree outer_step
, outer_base
, outer_init
;
3173 HOST_WIDE_INT pbitsize
, pbitpos
;
3175 enum machine_mode pmode
;
3176 int punsignedp
, pvolatilep
;
3177 affine_iv base_iv
, offset_iv
;
3180 /* Build a reference to the first location accessed by the
3181 inner-loop: *(BASE+INIT). (The first location is actually
3182 BASE+INIT+OFFSET, but we add OFFSET separately later). */
3183 tree inner_base
= build_fold_indirect_ref
3184 (fold_build_pointer_plus (base
, init
));
3186 if (dump_enabled_p ())
3188 dump_printf_loc (MSG_NOTE
, vect_location
,
3189 "analyze in outer-loop: ");
3190 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, inner_base
);
3193 outer_base
= get_inner_reference (inner_base
, &pbitsize
, &pbitpos
,
3194 &poffset
, &pmode
, &punsignedp
, &pvolatilep
, false);
3195 gcc_assert (outer_base
!= NULL_TREE
);
3197 if (pbitpos
% BITS_PER_UNIT
!= 0)
3199 if (dump_enabled_p ())
3200 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3201 "failed: bit offset alignment.\n");
3205 outer_base
= build_fold_addr_expr (outer_base
);
3206 if (!simple_iv (loop
, loop_containing_stmt (stmt
), outer_base
,
3209 if (dump_enabled_p ())
3210 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3211 "failed: evolution of base is not affine.\n");
3218 poffset
= fold_build2 (PLUS_EXPR
, TREE_TYPE (offset
), offset
,
3226 offset_iv
.base
= ssize_int (0);
3227 offset_iv
.step
= ssize_int (0);
3229 else if (!simple_iv (loop
, loop_containing_stmt (stmt
), poffset
,
3232 if (dump_enabled_p ())
3233 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3234 "evolution of offset is not affine.\n");
/* Combine the bit position, the base IV's constant part and the offset
   IV's constant part into the outer-loop init; sum the IV steps.  */
3238 outer_init
= ssize_int (pbitpos
/ BITS_PER_UNIT
);
3239 split_constant_offset (base_iv
.base
, &base_iv
.base
, &dinit
);
3240 outer_init
= size_binop (PLUS_EXPR
, outer_init
, dinit
);
3241 split_constant_offset (offset_iv
.base
, &offset_iv
.base
, &dinit
);
3242 outer_init
= size_binop (PLUS_EXPR
, outer_init
, dinit
);
3244 outer_step
= size_binop (PLUS_EXPR
,
3245 fold_convert (ssizetype
, base_iv
.step
),
3246 fold_convert (ssizetype
, offset_iv
.step
));
3248 STMT_VINFO_DR_STEP (stmt_info
) = outer_step
;
3249 /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
3250 STMT_VINFO_DR_BASE_ADDRESS (stmt_info
) = base_iv
.base
;
3251 STMT_VINFO_DR_INIT (stmt_info
) = outer_init
;
3252 STMT_VINFO_DR_OFFSET (stmt_info
) =
3253 fold_convert (ssizetype
, offset_iv
.base
);
3254 STMT_VINFO_DR_ALIGNED_TO (stmt_info
) =
3255 size_int (highest_pow2_factor (offset_iv
.base
));
3257 if (dump_enabled_p ())
3259 dump_printf_loc (MSG_NOTE
, vect_location
,
3260 "\touter base_address: ");
3261 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3262 STMT_VINFO_DR_BASE_ADDRESS (stmt_info
));
3263 dump_printf (MSG_NOTE
, "\n\touter offset from base address: ");
3264 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3265 STMT_VINFO_DR_OFFSET (stmt_info
));
3266 dump_printf (MSG_NOTE
,
3267 "\n\touter constant offset from base address: ");
3268 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3269 STMT_VINFO_DR_INIT (stmt_info
));
3270 dump_printf (MSG_NOTE
, "\n\touter step: ");
3271 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3272 STMT_VINFO_DR_STEP (stmt_info
));
3273 dump_printf (MSG_NOTE
, "\n\touter aligned to: ");
3274 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3275 STMT_VINFO_DR_ALIGNED_TO (stmt_info
));
3279 if (STMT_VINFO_DATA_REF (stmt_info
))
3281 if (dump_enabled_p ())
3283 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3284 "not vectorized: more than one data ref "
3286 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3292 if (gather
|| simd_lane_access
)
3297 STMT_VINFO_DATA_REF (stmt_info
) = dr
;
3298 if (simd_lane_access
)
3300 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info
) = true;
3304 /* Set vectype for STMT. */
3305 scalar_type
= TREE_TYPE (DR_REF (dr
));
3306 STMT_VINFO_VECTYPE (stmt_info
) =
3307 get_vectype_for_scalar_type (scalar_type
);
3308 if (!STMT_VINFO_VECTYPE (stmt_info
))
3310 if (dump_enabled_p ())
3312 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3313 "not vectorized: no vectype for stmt: ");
3314 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3315 dump_printf (MSG_MISSED_OPTIMIZATION
, " scalar_type: ");
3316 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_DETAILS
,
3323 if (gather
|| simd_lane_access
)
3325 STMT_VINFO_DATA_REF (stmt_info
) = NULL
;
3332 if (dump_enabled_p ())
3334 dump_printf_loc (MSG_NOTE
, vect_location
,
3335 "got vectype for stmt: ");
3336 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, stmt
, 0);
3337 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3338 STMT_VINFO_VECTYPE (stmt_info
));
3342 /* Adjust the minimal vectorization factor according to the
3344 vf
= TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info
));
/* Gather candidate: confirm with vect_check_gather and that the offset
   type can itself be vectorized.  */
3352 gather
= 0 != vect_check_gather (stmt
, loop_vinfo
, NULL
, &off
, NULL
);
3354 && get_vectype_for_scalar_type (TREE_TYPE (off
)) == NULL_TREE
)
3358 STMT_VINFO_DATA_REF (stmt_info
) = NULL
;
3360 if (dump_enabled_p ())
3362 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3363 "not vectorized: not suitable for gather "
3365 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3371 STMT_VINFO_GATHER_P (stmt_info
) = true;
3374 && TREE_CODE (DR_STEP (dr
)) != INTEGER_CST
)
3376 if (nested_in_vect_loop_p (loop
, stmt
)
3377 || !DR_IS_READ (dr
))
3379 if (dump_enabled_p ())
3381 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3382 "not vectorized: not suitable for strided "
3384 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3388 STMT_VINFO_STRIDE_LOAD_P (stmt_info
) = true;
3392 /* If we stopped analysis at the first dataref we could not analyze
3393 when trying to vectorize a basic-block mark the rest of the datarefs
3394 as not vectorizable and truncate the vector of datarefs. That
3395 avoids spending useless time in analyzing their dependence. */
3396 if (i
!= datarefs
.length ())
3398 gcc_assert (bb_vinfo
!= NULL
);
3399 for (unsigned j
= i
; j
< datarefs
.length (); ++j
)
3401 data_reference_p dr
= datarefs
[j
];
3402 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr
))) = false;
3405 datarefs
.truncate (i
);
3412 /* Function vect_get_new_vect_var.
3414 Returns a name for a new variable. The current naming scheme appends the
3415 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
3416 the name of vectorizer generated variables, and appends that to NAME if
/* vect_get_new_vect_var: create a fresh temporary of TYPE whose name is
   a VAR_KIND-dependent prefix, with "_NAME" appended when NAME is given
   (see the function header comment above).
   NOTE(review): garbled extraction -- the prefix assignments and the
   switch scaffolding around the case labels are missing here; only
   comments were added.  */
3420 vect_get_new_vect_var (tree type
, enum vect_var_kind var_kind
, const char *name
)
3427 case vect_simple_var
:
3430 case vect_scalar_var
:
3433 case vect_pointer_var
:
/* With a NAME, build "prefix_name" and use it for the new temporary.  */
3442 char* tmp
= concat (prefix
, "_", name
, NULL
);
3443 new_vect_var
= create_tmp_reg (type
, tmp
);
3447 new_vect_var
= create_tmp_reg (type
, prefix
);
3449 return new_vect_var
;
3453 /* Function vect_create_addr_base_for_vector_ref.
3455 Create an expression that computes the address of the first memory location
3456 that will be accessed for a data reference.
3459 STMT: The statement containing the data reference.
3460 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
3461 OFFSET: Optional. If supplied, it is be added to the initial address.
3462 LOOP: Specify relative to which loop-nest should the address be computed.
3463 For example, when the dataref is in an inner-loop nested in an
3464 outer-loop that is now being vectorized, LOOP can be either the
3465 outer-loop, or the inner-loop. The first memory location accessed
3466 by the following dataref ('in' points to short):
3473 if LOOP=i_loop: &in (relative to i_loop)
3474 if LOOP=j_loop: &in+i*2B (relative to j_loop)
3477 1. Return an SSA_NAME whose value is the address of the memory location of
3478 the first vector of the data reference.
3479 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
3480 these statement(s) which define the returned SSA_NAME.
3482 FORNOW: We are only handling array accesses with step 1. */
3485 vect_create_addr_base_for_vector_ref (gimple stmt
,
3486 gimple_seq
*new_stmt_list
,
3490 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
3491 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
3493 const char *base_name
;
3496 gimple_seq seq
= NULL
;
3500 tree step
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr
)));
3501 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
3503 if (loop_vinfo
&& loop
&& loop
!= (gimple_bb (stmt
))->loop_father
)
3505 struct loop
*outer_loop
= LOOP_VINFO_LOOP (loop_vinfo
);
3507 gcc_assert (nested_in_vect_loop_p (outer_loop
, stmt
));
3509 data_ref_base
= unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info
));
3510 base_offset
= unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info
));
3511 init
= unshare_expr (STMT_VINFO_DR_INIT (stmt_info
));
3515 data_ref_base
= unshare_expr (DR_BASE_ADDRESS (dr
));
3516 base_offset
= unshare_expr (DR_OFFSET (dr
));
3517 init
= unshare_expr (DR_INIT (dr
));
3521 base_name
= get_name (data_ref_base
);
3524 base_offset
= ssize_int (0);
3525 init
= ssize_int (0);
3526 base_name
= get_name (DR_REF (dr
));
3529 /* Create base_offset */
3530 base_offset
= size_binop (PLUS_EXPR
,
3531 fold_convert (sizetype
, base_offset
),
3532 fold_convert (sizetype
, init
));
3536 offset
= fold_build2 (MULT_EXPR
, sizetype
,
3537 fold_convert (sizetype
, offset
), step
);
3538 base_offset
= fold_build2 (PLUS_EXPR
, sizetype
,
3539 base_offset
, offset
);
3542 /* base + base_offset */
3544 addr_base
= fold_build_pointer_plus (data_ref_base
, base_offset
);
3547 addr_base
= build1 (ADDR_EXPR
,
3548 build_pointer_type (TREE_TYPE (DR_REF (dr
))),
3549 unshare_expr (DR_REF (dr
)));
3552 vect_ptr_type
= build_pointer_type (STMT_VINFO_VECTYPE (stmt_info
));
3553 addr_base
= fold_convert (vect_ptr_type
, addr_base
);
3554 dest
= vect_get_new_vect_var (vect_ptr_type
, vect_pointer_var
, base_name
);
3555 addr_base
= force_gimple_operand (addr_base
, &seq
, false, dest
);
3556 gimple_seq_add_seq (new_stmt_list
, seq
);
3558 if (DR_PTR_INFO (dr
)
3559 && TREE_CODE (addr_base
) == SSA_NAME
)
3561 duplicate_ssa_name_ptr_info (addr_base
, DR_PTR_INFO (dr
));
3563 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base
));
3566 if (dump_enabled_p ())
3568 dump_printf_loc (MSG_NOTE
, vect_location
, "created ");
3569 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, addr_base
);
3576 /* Function vect_create_data_ref_ptr.
3578 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
3579 location accessed in the loop by STMT, along with the def-use update
3580 chain to appropriately advance the pointer through the loop iterations.
3581 Also set aliasing information for the pointer. This pointer is used by
3582 the callers to this function to create a memory reference expression for
3583 vector load/store access.
3586 1. STMT: a stmt that references memory. Expected to be of the form
3587 GIMPLE_ASSIGN <name, data-ref> or
3588 GIMPLE_ASSIGN <data-ref, name>.
3589 2. AGGR_TYPE: the type of the reference, which should be either a vector
3591 3. AT_LOOP: the loop where the vector memref is to be created.
3592 4. OFFSET (optional): an offset to be added to the initial address accessed
3593 by the data-ref in STMT.
3594 5. BSI: location where the new stmts are to be placed if there is no loop
3595 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
3596 pointing to the initial address.
3599 1. Declare a new ptr to vector_type, and have it point to the base of the
3600 data reference (initial addressed accessed by the data reference).
3601 For example, for vector of type V8HI, the following code is generated:
3604 ap = (v8hi *)initial_address;
3606 if OFFSET is not supplied:
3607 initial_address = &a[init];
3608 if OFFSET is supplied:
3609 initial_address = &a[init + OFFSET];
3611 Return the initial_address in INITIAL_ADDRESS.
3613 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
3614 update the pointer in each iteration of the loop.
3616 Return the increment stmt that updates the pointer in PTR_INCR.
3618 3. Set INV_P to true if the access pattern of the data reference in the
3619 vectorized loop is invariant. Set it to false otherwise.
3621 4. Return the pointer. */
3624 vect_create_data_ref_ptr (gimple stmt
, tree aggr_type
, struct loop
*at_loop
,
3625 tree offset
, tree
*initial_address
,
3626 gimple_stmt_iterator
*gsi
, gimple
*ptr_incr
,
3627 bool only_init
, bool *inv_p
)
3629 const char *base_name
;
3630 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
3631 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
3632 struct loop
*loop
= NULL
;
3633 bool nested_in_vect_loop
= false;
3634 struct loop
*containing_loop
= NULL
;
3639 gimple_seq new_stmt_list
= NULL
;
3643 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
3645 gimple_stmt_iterator incr_gsi
;
3647 tree indx_before_incr
, indx_after_incr
;
3650 bb_vec_info bb_vinfo
= STMT_VINFO_BB_VINFO (stmt_info
);
3652 gcc_assert (TREE_CODE (aggr_type
) == ARRAY_TYPE
3653 || TREE_CODE (aggr_type
) == VECTOR_TYPE
);
3657 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
3658 nested_in_vect_loop
= nested_in_vect_loop_p (loop
, stmt
);
3659 containing_loop
= (gimple_bb (stmt
))->loop_father
;
3660 pe
= loop_preheader_edge (loop
);
3664 gcc_assert (bb_vinfo
);
3669 /* Check the step (evolution) of the load in LOOP, and record
3670 whether it's invariant. */
3671 if (nested_in_vect_loop
)
3672 step
= STMT_VINFO_DR_STEP (stmt_info
);
3674 step
= DR_STEP (STMT_VINFO_DATA_REF (stmt_info
));
3676 if (integer_zerop (step
))
3681 /* Create an expression for the first address accessed by this load
3683 base_name
= get_name (DR_BASE_ADDRESS (dr
));
3685 if (dump_enabled_p ())
3687 tree dr_base_type
= TREE_TYPE (DR_BASE_OBJECT (dr
));
3688 dump_printf_loc (MSG_NOTE
, vect_location
,
3689 "create %s-pointer variable to type: ",
3690 tree_code_name
[(int) TREE_CODE (aggr_type
)]);
3691 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, aggr_type
);
3692 if (TREE_CODE (dr_base_type
) == ARRAY_TYPE
)
3693 dump_printf (MSG_NOTE
, " vectorizing an array ref: ");
3694 else if (TREE_CODE (dr_base_type
) == VECTOR_TYPE
)
3695 dump_printf (MSG_NOTE
, " vectorizing a vector ref: ");
3696 else if (TREE_CODE (dr_base_type
) == RECORD_TYPE
)
3697 dump_printf (MSG_NOTE
, " vectorizing a record based array ref: ");
3699 dump_printf (MSG_NOTE
, " vectorizing a pointer ref: ");
3700 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_BASE_OBJECT (dr
));
3703 /* (1) Create the new aggregate-pointer variable.
3704 Vector and array types inherit the alias set of their component
3705 type by default so we need to use a ref-all pointer if the data
3706 reference does not conflict with the created aggregated data
3707 reference because it is not addressable. */
3708 bool need_ref_all
= false;
3709 if (!alias_sets_conflict_p (get_alias_set (aggr_type
),
3710 get_alias_set (DR_REF (dr
))))
3711 need_ref_all
= true;
3712 /* Likewise for any of the data references in the stmt group. */
3713 else if (STMT_VINFO_GROUP_SIZE (stmt_info
) > 1)
3715 gimple orig_stmt
= STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info
);
3718 stmt_vec_info sinfo
= vinfo_for_stmt (orig_stmt
);
3719 struct data_reference
*sdr
= STMT_VINFO_DATA_REF (sinfo
);
3720 if (!alias_sets_conflict_p (get_alias_set (aggr_type
),
3721 get_alias_set (DR_REF (sdr
))))
3723 need_ref_all
= true;
3726 orig_stmt
= STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo
);
3730 aggr_ptr_type
= build_pointer_type_for_mode (aggr_type
, ptr_mode
,
3732 aggr_ptr
= vect_get_new_vect_var (aggr_ptr_type
, vect_pointer_var
, base_name
);
3735 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
3736 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
3737 def-use update cycles for the pointer: one relative to the outer-loop
3738 (LOOP), which is what steps (3) and (4) below do. The other is relative
3739 to the inner-loop (which is the inner-most loop containing the dataref),
3740 and this is done be step (5) below.
3742 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
3743 inner-most loop, and so steps (3),(4) work the same, and step (5) is
3744 redundant. Steps (3),(4) create the following:
3747 LOOP: vp1 = phi(vp0,vp2)
3753 If there is an inner-loop nested in loop, then step (5) will also be
3754 applied, and an additional update in the inner-loop will be created:
3757 LOOP: vp1 = phi(vp0,vp2)
3759 inner: vp3 = phi(vp1,vp4)
3760 vp4 = vp3 + inner_step
3766 /* (2) Calculate the initial address of the aggregate-pointer, and set
3767 the aggregate-pointer to point to it before the loop. */
3769 /* Create: (&(base[init_val+offset]) in the loop preheader. */
3771 new_temp
= vect_create_addr_base_for_vector_ref (stmt
, &new_stmt_list
,
3777 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, new_stmt_list
);
3778 gcc_assert (!new_bb
);
3781 gsi_insert_seq_before (gsi
, new_stmt_list
, GSI_SAME_STMT
);
3784 *initial_address
= new_temp
;
3786 /* Create: p = (aggr_type *) initial_base */
3787 if (TREE_CODE (new_temp
) != SSA_NAME
3788 || !useless_type_conversion_p (aggr_ptr_type
, TREE_TYPE (new_temp
)))
3790 vec_stmt
= gimple_build_assign (aggr_ptr
,
3791 fold_convert (aggr_ptr_type
, new_temp
));
3792 aggr_ptr_init
= make_ssa_name (aggr_ptr
, vec_stmt
);
3793 /* Copy the points-to information if it exists. */
3794 if (DR_PTR_INFO (dr
))
3795 duplicate_ssa_name_ptr_info (aggr_ptr_init
, DR_PTR_INFO (dr
));
3796 gimple_assign_set_lhs (vec_stmt
, aggr_ptr_init
);
3799 new_bb
= gsi_insert_on_edge_immediate (pe
, vec_stmt
);
3800 gcc_assert (!new_bb
);
3803 gsi_insert_before (gsi
, vec_stmt
, GSI_SAME_STMT
);
3806 aggr_ptr_init
= new_temp
;
3808 /* (3) Handle the updating of the aggregate-pointer inside the loop.
3809 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
3810 inner-loop nested in LOOP (during outer-loop vectorization). */
3812 /* No update in loop is required. */
3813 if (only_init
&& (!loop_vinfo
|| at_loop
== loop
))
3814 aptr
= aggr_ptr_init
;
3817 /* The step of the aggregate pointer is the type size. */
3818 tree iv_step
= TYPE_SIZE_UNIT (aggr_type
);
3819 /* One exception to the above is when the scalar step of the load in
3820 LOOP is zero. In this case the step here is also zero. */
3822 iv_step
= size_zero_node
;
3823 else if (tree_int_cst_sgn (step
) == -1)
3824 iv_step
= fold_build1 (NEGATE_EXPR
, TREE_TYPE (iv_step
), iv_step
);
3826 standard_iv_increment_position (loop
, &incr_gsi
, &insert_after
);
3828 create_iv (aggr_ptr_init
,
3829 fold_convert (aggr_ptr_type
, iv_step
),
3830 aggr_ptr
, loop
, &incr_gsi
, insert_after
,
3831 &indx_before_incr
, &indx_after_incr
);
3832 incr
= gsi_stmt (incr_gsi
);
3833 set_vinfo_for_stmt (incr
, new_stmt_vec_info (incr
, loop_vinfo
, NULL
));
3835 /* Copy the points-to information if it exists. */
3836 if (DR_PTR_INFO (dr
))
3838 duplicate_ssa_name_ptr_info (indx_before_incr
, DR_PTR_INFO (dr
));
3839 duplicate_ssa_name_ptr_info (indx_after_incr
, DR_PTR_INFO (dr
));
3844 aptr
= indx_before_incr
;
3847 if (!nested_in_vect_loop
|| only_init
)
3851 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
3852 nested in LOOP, if exists. */
3854 gcc_assert (nested_in_vect_loop
);
3857 standard_iv_increment_position (containing_loop
, &incr_gsi
,
3859 create_iv (aptr
, fold_convert (aggr_ptr_type
, DR_STEP (dr
)), aggr_ptr
,
3860 containing_loop
, &incr_gsi
, insert_after
, &indx_before_incr
,
3862 incr
= gsi_stmt (incr_gsi
);
3863 set_vinfo_for_stmt (incr
, new_stmt_vec_info (incr
, loop_vinfo
, NULL
));
3865 /* Copy the points-to information if it exists. */
3866 if (DR_PTR_INFO (dr
))
3868 duplicate_ssa_name_ptr_info (indx_before_incr
, DR_PTR_INFO (dr
));
3869 duplicate_ssa_name_ptr_info (indx_after_incr
, DR_PTR_INFO (dr
));
3874 return indx_before_incr
;
3881 /* Function bump_vector_ptr
3883 Increment a pointer (to a vector type) by vector-size. If requested,
3884 i.e. if PTR-INCR is given, then also connect the new increment stmt
3885 to the existing def-use update-chain of the pointer, by modifying
3886 the PTR_INCR as illustrated below:
3888 The pointer def-use update-chain before this function:
3889 DATAREF_PTR = phi (p_0, p_2)
3891 PTR_INCR: p_2 = DATAREF_PTR + step
3893 The pointer def-use update-chain after this function:
3894 DATAREF_PTR = phi (p_0, p_2)
3896 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
3898 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
3901 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
3903 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
3904 the loop. The increment amount across iterations is expected
3906 BSI - location where the new update stmt is to be placed.
3907 STMT - the original scalar memory-access stmt that is being vectorized.
3908 BUMP - optional. The offset by which to bump the pointer. If not given,
3909 the offset is assumed to be vector_size.
3911 Output: Return NEW_DATAREF_PTR as illustrated above.
3916 bump_vector_ptr (tree dataref_ptr
, gimple ptr_incr
, gimple_stmt_iterator
*gsi
,
3917 gimple stmt
, tree bump
)
3919 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
3920 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
3921 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
3922 tree update
= TYPE_SIZE_UNIT (vectype
);
3925 use_operand_p use_p
;
3926 tree new_dataref_ptr
;
3931 new_dataref_ptr
= copy_ssa_name (dataref_ptr
, NULL
);
3932 incr_stmt
= gimple_build_assign_with_ops (POINTER_PLUS_EXPR
, new_dataref_ptr
,
3933 dataref_ptr
, update
);
3934 vect_finish_stmt_generation (stmt
, incr_stmt
, gsi
);
3936 /* Copy the points-to information if it exists. */
3937 if (DR_PTR_INFO (dr
))
3939 duplicate_ssa_name_ptr_info (new_dataref_ptr
, DR_PTR_INFO (dr
));
3940 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr
));
3944 return new_dataref_ptr
;
3946 /* Update the vector-pointer's cross-iteration increment. */
3947 FOR_EACH_SSA_USE_OPERAND (use_p
, ptr_incr
, iter
, SSA_OP_USE
)
3949 tree use
= USE_FROM_PTR (use_p
);
3951 if (use
== dataref_ptr
)
3952 SET_USE (use_p
, new_dataref_ptr
);
3954 gcc_assert (tree_int_cst_compare (use
, update
) == 0);
3957 return new_dataref_ptr
;
3961 /* Function vect_create_destination_var.
3963 Create a new temporary of type VECTYPE. */
3966 vect_create_destination_var (tree scalar_dest
, tree vectype
)
3972 enum vect_var_kind kind
;
3974 kind
= vectype
? vect_simple_var
: vect_scalar_var
;
3975 type
= vectype
? vectype
: TREE_TYPE (scalar_dest
);
3977 gcc_assert (TREE_CODE (scalar_dest
) == SSA_NAME
);
3979 name
= get_name (scalar_dest
);
3981 asprintf (&new_name
, "%s_%u", name
, SSA_NAME_VERSION (scalar_dest
));
3983 asprintf (&new_name
, "_%u", SSA_NAME_VERSION (scalar_dest
));
3984 vec_dest
= vect_get_new_vect_var (type
, kind
, new_name
);
3990 /* Function vect_grouped_store_supported.
3992 Returns TRUE if interleave high and interleave low permutations
3993 are supported, and FALSE otherwise. */
3996 vect_grouped_store_supported (tree vectype
, unsigned HOST_WIDE_INT count
)
3998 enum machine_mode mode
= TYPE_MODE (vectype
);
4000 /* vect_permute_store_chain requires the group size to be a power of two. */
4001 if (exact_log2 (count
) == -1)
4003 if (dump_enabled_p ())
4004 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4005 "the size of the group of accesses"
4006 " is not a power of 2");
4010 /* Check that the permutation is supported. */
4011 if (VECTOR_MODE_P (mode
))
4013 unsigned int i
, nelt
= GET_MODE_NUNITS (mode
);
4014 unsigned char *sel
= XALLOCAVEC (unsigned char, nelt
);
4015 for (i
= 0; i
< nelt
/ 2; i
++)
4018 sel
[i
* 2 + 1] = i
+ nelt
;
4020 if (can_vec_perm_p (mode
, false, sel
))
4022 for (i
= 0; i
< nelt
; i
++)
4024 if (can_vec_perm_p (mode
, false, sel
))
4029 if (dump_enabled_p ())
4030 dump_printf (MSG_MISSED_OPTIMIZATION
,
4031 "interleave op not supported by target.");
4036 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4040 vect_store_lanes_supported (tree vectype
, unsigned HOST_WIDE_INT count
)
4042 return vect_lanes_optab_supported_p ("vec_store_lanes",
4043 vec_store_lanes_optab
,
4048 /* Function vect_permute_store_chain.
4050 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4051 a power of 2, generate interleave_high/low stmts to reorder the data
4052 correctly for the stores. Return the final references for stores in
4055 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4056 The input is 4 vectors each containing 8 elements. We assign a number to
4057 each element, the input sequence is:
4059 1st vec: 0 1 2 3 4 5 6 7
4060 2nd vec: 8 9 10 11 12 13 14 15
4061 3rd vec: 16 17 18 19 20 21 22 23
4062 4th vec: 24 25 26 27 28 29 30 31
4064 The output sequence should be:
4066 1st vec: 0 8 16 24 1 9 17 25
4067 2nd vec: 2 10 18 26 3 11 19 27
4068 3rd vec: 4 12 20 28 5 13 21 30
4069 4th vec: 6 14 22 30 7 15 23 31
4071 i.e., we interleave the contents of the four vectors in their order.
4073 We use interleave_high/low instructions to create such output. The input of
4074 each interleave_high/low operation is two vectors:
4077 the even elements of the result vector are obtained left-to-right from the
4078 high/low elements of the first vector. The odd elements of the result are
4079 obtained left-to-right from the high/low elements of the second vector.
4080 The output of interleave_high will be: 0 4 1 5
4081 and of interleave_low: 2 6 3 7
4084 The permutation is done in log LENGTH stages. In each stage interleave_high
4085 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4086 where the first argument is taken from the first half of DR_CHAIN and the
4087 second argument from it's second half.
4090 I1: interleave_high (1st vec, 3rd vec)
4091 I2: interleave_low (1st vec, 3rd vec)
4092 I3: interleave_high (2nd vec, 4th vec)
4093 I4: interleave_low (2nd vec, 4th vec)
4095 The output for the first stage is:
4097 I1: 0 16 1 17 2 18 3 19
4098 I2: 4 20 5 21 6 22 7 23
4099 I3: 8 24 9 25 10 26 11 27
4100 I4: 12 28 13 29 14 30 15 31
4102 The output of the second stage, i.e. the final result is:
4104 I1: 0 8 16 24 1 9 17 25
4105 I2: 2 10 18 26 3 11 19 27
4106 I3: 4 12 20 28 5 13 21 30
4107 I4: 6 14 22 30 7 15 23 31. */
4110 vect_permute_store_chain (vec
<tree
> dr_chain
,
4111 unsigned int length
,
4113 gimple_stmt_iterator
*gsi
,
4114 vec
<tree
> *result_chain
)
4116 tree vect1
, vect2
, high
, low
;
4118 tree vectype
= STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt
));
4119 tree perm_mask_low
, perm_mask_high
;
4121 unsigned int j
, nelt
= TYPE_VECTOR_SUBPARTS (vectype
);
4122 unsigned char *sel
= XALLOCAVEC (unsigned char, nelt
);
4124 result_chain
->quick_grow (length
);
4125 memcpy (result_chain
->address (), dr_chain
.address (),
4126 length
* sizeof (tree
));
4128 for (i
= 0, n
= nelt
/ 2; i
< n
; i
++)
4131 sel
[i
* 2 + 1] = i
+ nelt
;
4133 perm_mask_high
= vect_gen_perm_mask (vectype
, sel
);
4134 gcc_assert (perm_mask_high
!= NULL
);
4136 for (i
= 0; i
< nelt
; i
++)
4138 perm_mask_low
= vect_gen_perm_mask (vectype
, sel
);
4139 gcc_assert (perm_mask_low
!= NULL
);
4141 for (i
= 0, n
= exact_log2 (length
); i
< n
; i
++)
4143 for (j
= 0; j
< length
/2; j
++)
4145 vect1
= dr_chain
[j
];
4146 vect2
= dr_chain
[j
+length
/2];
4148 /* Create interleaving stmt:
4149 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1, ...}> */
4150 high
= make_temp_ssa_name (vectype
, NULL
, "vect_inter_high");
4152 = gimple_build_assign_with_ops (VEC_PERM_EXPR
, high
,
4153 vect1
, vect2
, perm_mask_high
);
4154 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
4155 (*result_chain
)[2*j
] = high
;
4157 /* Create interleaving stmt:
4158 low = VEC_PERM_EXPR <vect1, vect2, {nelt/2, nelt*3/2, nelt/2+1,
4159 nelt*3/2+1, ...}> */
4160 low
= make_temp_ssa_name (vectype
, NULL
, "vect_inter_low");
4162 = gimple_build_assign_with_ops (VEC_PERM_EXPR
, low
,
4163 vect1
, vect2
, perm_mask_low
);
4164 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
4165 (*result_chain
)[2*j
+1] = low
;
4167 memcpy (dr_chain
.address (), result_chain
->address (),
4168 length
* sizeof (tree
));
4172 /* Function vect_setup_realignment
4174 This function is called when vectorizing an unaligned load using
4175 the dr_explicit_realign[_optimized] scheme.
4176 This function generates the following code at the loop prolog:
4179 x msq_init = *(floor(p)); # prolog load
4180 realignment_token = call target_builtin;
4182 x msq = phi (msq_init, ---)
4184 The stmts marked with x are generated only for the case of
4185 dr_explicit_realign_optimized.
4187 The code above sets up a new (vector) pointer, pointing to the first
4188 location accessed by STMT, and a "floor-aligned" load using that pointer.
4189 It also generates code to compute the "realignment-token" (if the relevant
4190 target hook was defined), and creates a phi-node at the loop-header bb
4191 whose arguments are the result of the prolog-load (created by this
4192 function) and the result of a load that takes place in the loop (to be
4193 created by the caller to this function).
4195 For the case of dr_explicit_realign_optimized:
4196 The caller to this function uses the phi-result (msq) to create the
4197 realignment code inside the loop, and sets up the missing phi argument,
4200 msq = phi (msq_init, lsq)
4201 lsq = *(floor(p')); # load in loop
4202 result = realign_load (msq, lsq, realignment_token);
4204 For the case of dr_explicit_realign:
4206 msq = *(floor(p)); # load in loop
4208 lsq = *(floor(p')); # load in loop
4209 result = realign_load (msq, lsq, realignment_token);
4212 STMT - (scalar) load stmt to be vectorized. This load accesses
4213 a memory location that may be unaligned.
4214 BSI - place where new code is to be inserted.
4215 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4219 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4220 target hook, if defined.
4221 Return value - the result of the loop-header phi node. */
4224 vect_setup_realignment (gimple stmt
, gimple_stmt_iterator
*gsi
,
4225 tree
*realignment_token
,
4226 enum dr_alignment_support alignment_support_scheme
,
4228 struct loop
**at_loop
)
4230 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
4231 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
4232 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
4233 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
4234 struct loop
*loop
= NULL
;
4236 tree scalar_dest
= gimple_assign_lhs (stmt
);
4243 tree msq_init
= NULL_TREE
;
4246 tree msq
= NULL_TREE
;
4247 gimple_seq stmts
= NULL
;
4249 bool compute_in_loop
= false;
4250 bool nested_in_vect_loop
= false;
4251 struct loop
*containing_loop
= (gimple_bb (stmt
))->loop_father
;
4252 struct loop
*loop_for_initial_load
= NULL
;
4256 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
4257 nested_in_vect_loop
= nested_in_vect_loop_p (loop
, stmt
);
4260 gcc_assert (alignment_support_scheme
== dr_explicit_realign
4261 || alignment_support_scheme
== dr_explicit_realign_optimized
);
4263 /* We need to generate three things:
4264 1. the misalignment computation
4265 2. the extra vector load (for the optimized realignment scheme).
4266 3. the phi node for the two vectors from which the realignment is
4267 done (for the optimized realignment scheme). */
4269 /* 1. Determine where to generate the misalignment computation.
4271 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4272 calculation will be generated by this function, outside the loop (in the
4273 preheader). Otherwise, INIT_ADDR had already been computed for us by the
4274 caller, inside the loop.
4276 Background: If the misalignment remains fixed throughout the iterations of
4277 the loop, then both realignment schemes are applicable, and also the
4278 misalignment computation can be done outside LOOP. This is because we are
4279 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4280 are a multiple of VS (the Vector Size), and therefore the misalignment in
4281 different vectorized LOOP iterations is always the same.
4282 The problem arises only if the memory access is in an inner-loop nested
4283 inside LOOP, which is now being vectorized using outer-loop vectorization.
4284 This is the only case when the misalignment of the memory access may not
4285 remain fixed throughout the iterations of the inner-loop (as explained in
4286 detail in vect_supportable_dr_alignment). In this case, not only is the
4287 optimized realignment scheme not applicable, but also the misalignment
4288 computation (and generation of the realignment token that is passed to
4289 REALIGN_LOAD) have to be done inside the loop.
4291 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4292 or not, which in turn determines if the misalignment is computed inside
4293 the inner-loop, or outside LOOP. */
4295 if (init_addr
!= NULL_TREE
|| !loop_vinfo
)
4297 compute_in_loop
= true;
4298 gcc_assert (alignment_support_scheme
== dr_explicit_realign
);
4302 /* 2. Determine where to generate the extra vector load.
4304 For the optimized realignment scheme, instead of generating two vector
4305 loads in each iteration, we generate a single extra vector load in the
4306 preheader of the loop, and in each iteration reuse the result of the
4307 vector load from the previous iteration. In case the memory access is in
4308 an inner-loop nested inside LOOP, which is now being vectorized using
4309 outer-loop vectorization, we need to determine whether this initial vector
4310 load should be generated at the preheader of the inner-loop, or can be
4311 generated at the preheader of LOOP. If the memory access has no evolution
4312 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
4313 to be generated inside LOOP (in the preheader of the inner-loop). */
4315 if (nested_in_vect_loop
)
4317 tree outerloop_step
= STMT_VINFO_DR_STEP (stmt_info
);
4318 bool invariant_in_outerloop
=
4319 (tree_int_cst_compare (outerloop_step
, size_zero_node
) == 0);
4320 loop_for_initial_load
= (invariant_in_outerloop
? loop
: loop
->inner
);
4323 loop_for_initial_load
= loop
;
4325 *at_loop
= loop_for_initial_load
;
4327 if (loop_for_initial_load
)
4328 pe
= loop_preheader_edge (loop_for_initial_load
);
4330 /* 3. For the case of the optimized realignment, create the first vector
4331 load at the loop preheader. */
4333 if (alignment_support_scheme
== dr_explicit_realign_optimized
)
4335 /* Create msq_init = *(floor(p1)) in the loop preheader */
4337 gcc_assert (!compute_in_loop
);
4338 vec_dest
= vect_create_destination_var (scalar_dest
, vectype
);
4339 ptr
= vect_create_data_ref_ptr (stmt
, vectype
, loop_for_initial_load
,
4340 NULL_TREE
, &init_addr
, NULL
, &inc
,
4342 new_temp
= copy_ssa_name (ptr
, NULL
);
4343 new_stmt
= gimple_build_assign_with_ops
4344 (BIT_AND_EXPR
, new_temp
, ptr
,
4345 build_int_cst (TREE_TYPE (ptr
),
4346 -(HOST_WIDE_INT
)TYPE_ALIGN_UNIT (vectype
)));
4347 new_bb
= gsi_insert_on_edge_immediate (pe
, new_stmt
);
4348 gcc_assert (!new_bb
);
4350 = build2 (MEM_REF
, TREE_TYPE (vec_dest
), new_temp
,
4351 build_int_cst (reference_alias_ptr_type (DR_REF (dr
)), 0));
4352 new_stmt
= gimple_build_assign (vec_dest
, data_ref
);
4353 new_temp
= make_ssa_name (vec_dest
, new_stmt
);
4354 gimple_assign_set_lhs (new_stmt
, new_temp
);
4357 new_bb
= gsi_insert_on_edge_immediate (pe
, new_stmt
);
4358 gcc_assert (!new_bb
);
4361 gsi_insert_before (gsi
, new_stmt
, GSI_SAME_STMT
);
4363 msq_init
= gimple_assign_lhs (new_stmt
);
4366 /* 4. Create realignment token using a target builtin, if available.
4367 It is done either inside the containing loop, or before LOOP (as
4368 determined above). */
4370 if (targetm
.vectorize
.builtin_mask_for_load
)
4374 /* Compute INIT_ADDR - the initial addressed accessed by this memref. */
4377 /* Generate the INIT_ADDR computation outside LOOP. */
4378 init_addr
= vect_create_addr_base_for_vector_ref (stmt
, &stmts
,
4382 pe
= loop_preheader_edge (loop
);
4383 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, stmts
);
4384 gcc_assert (!new_bb
);
4387 gsi_insert_seq_before (gsi
, stmts
, GSI_SAME_STMT
);
4390 builtin_decl
= targetm
.vectorize
.builtin_mask_for_load ();
4391 new_stmt
= gimple_build_call (builtin_decl
, 1, init_addr
);
4393 vect_create_destination_var (scalar_dest
,
4394 gimple_call_return_type (new_stmt
));
4395 new_temp
= make_ssa_name (vec_dest
, new_stmt
);
4396 gimple_call_set_lhs (new_stmt
, new_temp
);
4398 if (compute_in_loop
)
4399 gsi_insert_before (gsi
, new_stmt
, GSI_SAME_STMT
);
4402 /* Generate the misalignment computation outside LOOP. */
4403 pe
= loop_preheader_edge (loop
);
4404 new_bb
= gsi_insert_on_edge_immediate (pe
, new_stmt
);
4405 gcc_assert (!new_bb
);
4408 *realignment_token
= gimple_call_lhs (new_stmt
);
4410 /* The result of the CALL_EXPR to this builtin is determined from
4411 the value of the parameter and no global variables are touched
4412 which makes the builtin a "const" function. Requiring the
4413 builtin to have the "const" attribute makes it unnecessary
4414 to call mark_call_clobbered. */
4415 gcc_assert (TREE_READONLY (builtin_decl
));
4418 if (alignment_support_scheme
== dr_explicit_realign
)
4421 gcc_assert (!compute_in_loop
);
4422 gcc_assert (alignment_support_scheme
== dr_explicit_realign_optimized
);
4425 /* 5. Create msq = phi <msq_init, lsq> in loop */
4427 pe
= loop_preheader_edge (containing_loop
);
4428 vec_dest
= vect_create_destination_var (scalar_dest
, vectype
);
4429 msq
= make_ssa_name (vec_dest
, NULL
);
4430 phi_stmt
= create_phi_node (msq
, containing_loop
->header
);
4431 add_phi_arg (phi_stmt
, msq_init
, pe
, UNKNOWN_LOCATION
);
4437 /* Function vect_grouped_load_supported.
4439 Returns TRUE if even and odd permutations are supported,
4440 and FALSE otherwise. */
4443 vect_grouped_load_supported (tree vectype
, unsigned HOST_WIDE_INT count
)
4445 enum machine_mode mode
= TYPE_MODE (vectype
);
4447 /* vect_permute_load_chain requires the group size to be a power of two. */
4448 if (exact_log2 (count
) == -1)
4450 if (dump_enabled_p ())
4451 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4452 "the size of the group of accesses"
4453 " is not a power of 2");
4457 /* Check that the permutation is supported. */
4458 if (VECTOR_MODE_P (mode
))
4460 unsigned int i
, nelt
= GET_MODE_NUNITS (mode
);
4461 unsigned char *sel
= XALLOCAVEC (unsigned char, nelt
);
4463 for (i
= 0; i
< nelt
; i
++)
4465 if (can_vec_perm_p (mode
, false, sel
))
4467 for (i
= 0; i
< nelt
; i
++)
4469 if (can_vec_perm_p (mode
, false, sel
))
4474 if (dump_enabled_p ())
4475 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4476 "extract even/odd not supported by target");
4480 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
4484 vect_load_lanes_supported (tree vectype
, unsigned HOST_WIDE_INT count
)
4486 return vect_lanes_optab_supported_p ("vec_load_lanes",
4487 vec_load_lanes_optab
,
4491 /* Function vect_permute_load_chain.
4493 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
4494 a power of 2, generate extract_even/odd stmts to reorder the input data
4495 correctly. Return the final references for loads in RESULT_CHAIN.
4497 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4498 The input is 4 vectors each containing 8 elements. We assign a number to each
4499 element, the input sequence is:
4501 1st vec: 0 1 2 3 4 5 6 7
4502 2nd vec: 8 9 10 11 12 13 14 15
4503 3rd vec: 16 17 18 19 20 21 22 23
4504 4th vec: 24 25 26 27 28 29 30 31
4506 The output sequence should be:
4508 1st vec: 0 4 8 12 16 20 24 28
4509 2nd vec: 1 5 9 13 17 21 25 29
4510 3rd vec: 2 6 10 14 18 22 26 30
4511 4th vec: 3 7 11 15 19 23 27 31
4513 i.e., the first output vector should contain the first elements of each
4514 interleaving group, etc.
4516 We use extract_even/odd instructions to create such output. The input of
4517 each extract_even/odd operation is two vectors
4521 and the output is the vector of extracted even/odd elements. The output of
4522 extract_even will be: 0 2 4 6
4523 and of extract_odd: 1 3 5 7
4526 The permutation is done in log LENGTH stages. In each stage extract_even
4527 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
4528 their order. In our example,
4530 E1: extract_even (1st vec, 2nd vec)
4531 E2: extract_odd (1st vec, 2nd vec)
4532 E3: extract_even (3rd vec, 4th vec)
4533 E4: extract_odd (3rd vec, 4th vec)
4535 The output for the first stage will be:
4537 E1: 0 2 4 6 8 10 12 14
4538 E2: 1 3 5 7 9 11 13 15
4539 E3: 16 18 20 22 24 26 28 30
4540 E4: 17 19 21 23 25 27 29 31
4542 In order to proceed and create the correct sequence for the next stage (or
4543 for the correct output, if the second stage is the last one, as in our
4544 example), we first put the output of extract_even operation and then the
4545 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
4546 The input for the second stage is:
4548 1st vec (E1): 0 2 4 6 8 10 12 14
4549 2nd vec (E3): 16 18 20 22 24 26 28 30
4550 3rd vec (E2): 1 3 5 7 9 11 13 15
4551 4th vec (E4): 17 19 21 23 25 27 29 31
4553 The output of the second stage:
4555 E1: 0 4 8 12 16 20 24 28
4556 E2: 2 6 10 14 18 22 26 30
4557 E3: 1 5 9 13 17 21 25 29
4558 E4: 3 7 11 15 19 23 27 31
4560 And RESULT_CHAIN after reordering:
4562 1st vec (E1): 0 4 8 12 16 20 24 28
4563 2nd vec (E3): 1 5 9 13 17 21 25 29
4564 3rd vec (E2): 2 6 10 14 18 22 26 30
4565 4th vec (E4): 3 7 11 15 19 23 27 31. */
4568 vect_permute_load_chain (vec
<tree
> dr_chain
,
4569 unsigned int length
,
4571 gimple_stmt_iterator
*gsi
,
4572 vec
<tree
> *result_chain
)
4574 tree data_ref
, first_vect
, second_vect
;
4575 tree perm_mask_even
, perm_mask_odd
;
4577 tree vectype
= STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt
));
4578 unsigned int i
, j
, log_length
= exact_log2 (length
);
4579 unsigned nelt
= TYPE_VECTOR_SUBPARTS (vectype
);
4580 unsigned char *sel
= XALLOCAVEC (unsigned char, nelt
);
4582 result_chain
->quick_grow (length
);
4583 memcpy (result_chain
->address (), dr_chain
.address (),
4584 length
* sizeof (tree
));
4586 for (i
= 0; i
< nelt
; ++i
)
4588 perm_mask_even
= vect_gen_perm_mask (vectype
, sel
);
4589 gcc_assert (perm_mask_even
!= NULL
);
4591 for (i
= 0; i
< nelt
; ++i
)
4593 perm_mask_odd
= vect_gen_perm_mask (vectype
, sel
);
4594 gcc_assert (perm_mask_odd
!= NULL
);
4596 for (i
= 0; i
< log_length
; i
++)
4598 for (j
= 0; j
< length
; j
+= 2)
4600 first_vect
= dr_chain
[j
];
4601 second_vect
= dr_chain
[j
+1];
4603 /* data_ref = permute_even (first_data_ref, second_data_ref); */
4604 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_perm_even");
4605 perm_stmt
= gimple_build_assign_with_ops (VEC_PERM_EXPR
, data_ref
,
4606 first_vect
, second_vect
,
4608 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
4609 (*result_chain
)[j
/2] = data_ref
;
4611 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
4612 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_perm_odd");
4613 perm_stmt
= gimple_build_assign_with_ops (VEC_PERM_EXPR
, data_ref
,
4614 first_vect
, second_vect
,
4616 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
4617 (*result_chain
)[j
/2+length
/2] = data_ref
;
4619 memcpy (dr_chain
.address (), result_chain
->address (),
4620 length
* sizeof (tree
));
4625 /* Function vect_transform_grouped_load.
4627 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
4628 to perform their permutation and ascribe the result vectorized statements to
4629 the scalar statements.
4633 vect_transform_grouped_load (gimple stmt
, vec
<tree
> dr_chain
, int size
,
4634 gimple_stmt_iterator
*gsi
)
4636 vec
<tree
> result_chain
= vNULL
;
4638 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
4639 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
4640 vectors, that are ready for vector computation. */
4641 result_chain
.create (size
);
4642 vect_permute_load_chain (dr_chain
, size
, stmt
, gsi
, &result_chain
);
4643 vect_record_grouped_load_vectors (stmt
, result_chain
);
4644 result_chain
.release ();
4647 /* RESULT_CHAIN contains the output of a group of grouped loads that were
4648 generated as part of the vectorization of STMT. Assign the statement
4649 for each vector to the associated scalar statement. */
4652 vect_record_grouped_load_vectors (gimple stmt
, vec
<tree
> result_chain
)
4654 gimple first_stmt
= GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
));
4655 gimple next_stmt
, new_stmt
;
4656 unsigned int i
, gap_count
;
4659 /* Put a permuted data-ref in the VECTORIZED_STMT field.
4660 Since we scan the chain starting from it's first node, their order
4661 corresponds the order of data-refs in RESULT_CHAIN. */
4662 next_stmt
= first_stmt
;
4664 FOR_EACH_VEC_ELT (result_chain
, i
, tmp_data_ref
)
4669 /* Skip the gaps. Loads created for the gaps will be removed by dead
4670 code elimination pass later. No need to check for the first stmt in
4671 the group, since it always exists.
4672 GROUP_GAP is the number of steps in elements from the previous
4673 access (if there is no gap GROUP_GAP is 1). We skip loads that
4674 correspond to the gaps. */
4675 if (next_stmt
!= first_stmt
4676 && gap_count
< GROUP_GAP (vinfo_for_stmt (next_stmt
)))
4684 new_stmt
= SSA_NAME_DEF_STMT (tmp_data_ref
);
4685 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
4686 copies, and we put the new vector statement in the first available
4688 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt
)))
4689 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt
)) = new_stmt
;
4692 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt
)))
4695 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt
));
4697 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt
));
4700 prev_stmt
= rel_stmt
;
4702 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt
));
4705 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt
)) =
4710 next_stmt
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt
));
4712 /* If NEXT_STMT accesses the same DR as the previous statement,
4713 put the same TMP_DATA_REF as its vectorized statement; otherwise
4714 get the next data-ref from RESULT_CHAIN. */
4715 if (!next_stmt
|| !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt
)))
4721 /* Function vect_force_dr_alignment_p.
4723 Returns whether the alignment of a DECL can be forced to be aligned
4724 on ALIGNMENT bit boundary. */
4727 vect_can_force_dr_alignment_p (const_tree decl
, unsigned int alignment
)
4729 if (TREE_CODE (decl
) != VAR_DECL
)
4732 /* We cannot change alignment of common or external symbols as another
4733 translation unit may contain a definition with lower alignment.
4734 The rules of common symbol linking mean that the definition
4735 will override the common symbol. The same is true for constant
4736 pool entries which may be shared and are not properly merged
4738 if (DECL_EXTERNAL (decl
)
4739 || DECL_COMMON (decl
)
4740 || DECL_IN_CONSTANT_POOL (decl
))
4743 if (TREE_ASM_WRITTEN (decl
))
4746 /* Do not override the alignment as specified by the ABI when the used
4747 attribute is set. */
4748 if (DECL_PRESERVE_P (decl
))
4751 /* Do not override explicit alignment set by the user when an explicit
4752 section name is also used. This is a common idiom used by many
4753 software projects. */
4754 if (DECL_SECTION_NAME (decl
) != NULL_TREE
4755 && !DECL_HAS_IMPLICIT_SECTION_NAME_P (decl
))
4758 if (TREE_STATIC (decl
))
4759 return (alignment
<= MAX_OFILE_ALIGNMENT
);
4761 return (alignment
<= MAX_STACK_ALIGNMENT
);
4765 /* Return whether the data reference DR is supported with respect to its
4767 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
4768 it is aligned, i.e., check if it is possible to vectorize it with different
4771 enum dr_alignment_support
4772 vect_supportable_dr_alignment (struct data_reference
*dr
,
4773 bool check_aligned_accesses
)
4775 gimple stmt
= DR_STMT (dr
);
4776 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
4777 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
4778 enum machine_mode mode
= TYPE_MODE (vectype
);
4779 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
4780 struct loop
*vect_loop
= NULL
;
4781 bool nested_in_vect_loop
= false;
4783 if (aligned_access_p (dr
) && !check_aligned_accesses
)
4788 vect_loop
= LOOP_VINFO_LOOP (loop_vinfo
);
4789 nested_in_vect_loop
= nested_in_vect_loop_p (vect_loop
, stmt
);
4792 /* Possibly unaligned access. */
4794 /* We can choose between using the implicit realignment scheme (generating
4795 a misaligned_move stmt) and the explicit realignment scheme (generating
4796 aligned loads with a REALIGN_LOAD). There are two variants to the
4797 explicit realignment scheme: optimized, and unoptimized.
4798 We can optimize the realignment only if the step between consecutive
4799 vector loads is equal to the vector size. Since the vector memory
4800 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
4801 is guaranteed that the misalignment amount remains the same throughout the
4802 execution of the vectorized loop. Therefore, we can create the
4803 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
4804 at the loop preheader.
4806 However, in the case of outer-loop vectorization, when vectorizing a
4807 memory access in the inner-loop nested within the LOOP that is now being
4808 vectorized, while it is guaranteed that the misalignment of the
4809 vectorized memory access will remain the same in different outer-loop
4810 iterations, it is *not* guaranteed that is will remain the same throughout
4811 the execution of the inner-loop. This is because the inner-loop advances
4812 with the original scalar step (and not in steps of VS). If the inner-loop
4813 step happens to be a multiple of VS, then the misalignment remains fixed
4814 and we can use the optimized realignment scheme. For example:
4820 When vectorizing the i-loop in the above example, the step between
4821 consecutive vector loads is 1, and so the misalignment does not remain
4822 fixed across the execution of the inner-loop, and the realignment cannot
4823 be optimized (as illustrated in the following pseudo vectorized loop):
4825 for (i=0; i<N; i+=4)
4826 for (j=0; j<M; j++){
4827 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
4828 // when j is {0,1,2,3,4,5,6,7,...} respectively.
4829 // (assuming that we start from an aligned address).
4832 We therefore have to use the unoptimized realignment scheme:
4834 for (i=0; i<N; i+=4)
4835 for (j=k; j<M; j+=4)
4836 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
4837 // that the misalignment of the initial address is
4840 The loop can then be vectorized as follows:
4842 for (k=0; k<4; k++){
4843 rt = get_realignment_token (&vp[k]);
4844 for (i=0; i<N; i+=4){
4846 for (j=k; j<M; j+=4){
4848 va = REALIGN_LOAD <v1,v2,rt>;
4855 if (DR_IS_READ (dr
))
4857 bool is_packed
= false;
4858 tree type
= (TREE_TYPE (DR_REF (dr
)));
4860 if (optab_handler (vec_realign_load_optab
, mode
) != CODE_FOR_nothing
4861 && (!targetm
.vectorize
.builtin_mask_for_load
4862 || targetm
.vectorize
.builtin_mask_for_load ()))
4864 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
4865 if ((nested_in_vect_loop
4866 && (TREE_INT_CST_LOW (DR_STEP (dr
))
4867 != GET_MODE_SIZE (TYPE_MODE (vectype
))))
4869 return dr_explicit_realign
;
4871 return dr_explicit_realign_optimized
;
4873 if (!known_alignment_for_access_p (dr
))
4874 is_packed
= not_size_aligned (DR_REF (dr
));
4876 if ((TYPE_USER_ALIGN (type
) && !is_packed
)
4877 || targetm
.vectorize
.
4878 support_vector_misalignment (mode
, type
,
4879 DR_MISALIGNMENT (dr
), is_packed
))
4880 /* Can't software pipeline the loads, but can at least do them. */
4881 return dr_unaligned_supported
;
4885 bool is_packed
= false;
4886 tree type
= (TREE_TYPE (DR_REF (dr
)));
4888 if (!known_alignment_for_access_p (dr
))
4889 is_packed
= not_size_aligned (DR_REF (dr
));
4891 if ((TYPE_USER_ALIGN (type
) && !is_packed
)
4892 || targetm
.vectorize
.
4893 support_vector_misalignment (mode
, type
,
4894 DR_MISALIGNMENT (dr
), is_packed
))
4895 return dr_unaligned_supported
;
4899 return dr_unaligned_unsupported
;