/* Data References Analysis and Manipulation Utilities for Vectorization.
   Copyright (C) 2003-2013 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
24 #include "coretypes.h"
28 #include "stor-layout.h"
31 #include "basic-block.h"
32 #include "gimple-pretty-print.h"
33 #include "tree-ssa-alias.h"
34 #include "internal-fn.h"
36 #include "gimple-expr.h"
40 #include "gimple-iterator.h"
41 #include "gimplify-me.h"
42 #include "gimple-ssa.h"
43 #include "tree-phinodes.h"
44 #include "ssa-iterators.h"
45 #include "stringpool.h"
46 #include "tree-ssanames.h"
47 #include "tree-ssa-loop-ivopts.h"
48 #include "tree-ssa-loop-manip.h"
49 #include "tree-ssa-loop.h"
52 #include "tree-chrec.h"
53 #include "tree-scalar-evolution.h"
54 #include "tree-vectorizer.h"
55 #include "diagnostic-core.h"
57 /* Need to include rtl.h, expr.h, etc. for optabs. */
61 /* Return true if load- or store-lanes optab OPTAB is implemented for
62 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
65 vect_lanes_optab_supported_p (const char *name
, convert_optab optab
,
66 tree vectype
, unsigned HOST_WIDE_INT count
)
68 enum machine_mode mode
, array_mode
;
71 mode
= TYPE_MODE (vectype
);
72 limit_p
= !targetm
.array_mode_supported_p (mode
, count
);
73 array_mode
= mode_for_size (count
* GET_MODE_BITSIZE (mode
),
76 if (array_mode
== BLKmode
)
78 if (dump_enabled_p ())
79 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
80 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC
"]\n",
81 GET_MODE_NAME (mode
), count
);
85 if (convert_optab_handler (optab
, array_mode
, mode
) == CODE_FOR_nothing
)
87 if (dump_enabled_p ())
88 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
89 "cannot use %s<%s><%s>\n", name
,
90 GET_MODE_NAME (array_mode
), GET_MODE_NAME (mode
));
94 if (dump_enabled_p ())
95 dump_printf_loc (MSG_NOTE
, vect_location
,
96 "can use %s<%s><%s>\n", name
, GET_MODE_NAME (array_mode
),
97 GET_MODE_NAME (mode
));
103 /* Return the smallest scalar part of STMT.
104 This is used to determine the vectype of the stmt. We generally set the
105 vectype according to the type of the result (lhs). For stmts whose
106 result-type is different than the type of the arguments (e.g., demotion,
107 promotion), vectype will be reset appropriately (later). Note that we have
108 to visit the smallest datatype in this function, because that determines the
109 VF. If the smallest datatype in the loop is present only as the rhs of a
110 promotion operation - we'd miss it.
111 Such a case, where a variable of this datatype does not appear in the lhs
112 anywhere in the loop, can only occur if it's an invariant: e.g.:
113 'int_x = (int) short_inv', which we'd expect to have been optimized away by
114 invariant motion. However, we cannot rely on invariant motion to always
115 take invariants out of the loop, and so in the case of promotion we also
116 have to check the rhs.
117 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
121 vect_get_smallest_scalar_type (gimple stmt
, HOST_WIDE_INT
*lhs_size_unit
,
122 HOST_WIDE_INT
*rhs_size_unit
)
124 tree scalar_type
= gimple_expr_type (stmt
);
125 HOST_WIDE_INT lhs
, rhs
;
127 lhs
= rhs
= TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type
));
129 if (is_gimple_assign (stmt
)
130 && (gimple_assign_cast_p (stmt
)
131 || gimple_assign_rhs_code (stmt
) == WIDEN_MULT_EXPR
132 || gimple_assign_rhs_code (stmt
) == WIDEN_LSHIFT_EXPR
133 || gimple_assign_rhs_code (stmt
) == FLOAT_EXPR
))
135 tree rhs_type
= TREE_TYPE (gimple_assign_rhs1 (stmt
));
137 rhs
= TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type
));
139 scalar_type
= rhs_type
;
142 *lhs_size_unit
= lhs
;
143 *rhs_size_unit
= rhs
;
148 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
149 tested at run-time. Return TRUE if DDR was successfully inserted.
150 Return false if versioning is not supported. */
153 vect_mark_for_runtime_alias_test (ddr_p ddr
, loop_vec_info loop_vinfo
)
155 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
157 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS
) == 0)
160 if (dump_enabled_p ())
162 dump_printf_loc (MSG_NOTE
, vect_location
,
163 "mark for run-time aliasing test between ");
164 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (DDR_A (ddr
)));
165 dump_printf (MSG_NOTE
, " and ");
166 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (DDR_B (ddr
)));
167 dump_printf (MSG_NOTE
, "\n");
170 if (optimize_loop_nest_for_size_p (loop
))
172 if (dump_enabled_p ())
173 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
174 "versioning not supported when optimizing"
179 /* FORNOW: We don't support versioning with outer-loop vectorization. */
182 if (dump_enabled_p ())
183 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
184 "versioning not yet supported for outer-loops.\n");
188 /* FORNOW: We don't support creating runtime alias tests for non-constant
190 if (TREE_CODE (DR_STEP (DDR_A (ddr
))) != INTEGER_CST
191 || TREE_CODE (DR_STEP (DDR_B (ddr
))) != INTEGER_CST
)
193 if (dump_enabled_p ())
194 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
195 "versioning not yet supported for non-constant "
200 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo
).safe_push (ddr
);
205 /* Function vect_analyze_data_ref_dependence.
207 Return TRUE if there (might) exist a dependence between a memory-reference
208 DRA and a memory-reference DRB. When versioning for alias may check a
209 dependence at run-time, return FALSE. Adjust *MAX_VF according to
210 the data dependence. */
213 vect_analyze_data_ref_dependence (struct data_dependence_relation
*ddr
,
214 loop_vec_info loop_vinfo
, int *max_vf
)
217 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
218 struct data_reference
*dra
= DDR_A (ddr
);
219 struct data_reference
*drb
= DDR_B (ddr
);
220 stmt_vec_info stmtinfo_a
= vinfo_for_stmt (DR_STMT (dra
));
221 stmt_vec_info stmtinfo_b
= vinfo_for_stmt (DR_STMT (drb
));
222 lambda_vector dist_v
;
223 unsigned int loop_depth
;
225 /* In loop analysis all data references should be vectorizable. */
226 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a
)
227 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b
))
230 /* Independent data accesses. */
231 if (DDR_ARE_DEPENDENT (ddr
) == chrec_known
)
235 || (DR_IS_READ (dra
) && DR_IS_READ (drb
)))
238 /* Unknown data dependence. */
239 if (DDR_ARE_DEPENDENT (ddr
) == chrec_dont_know
)
241 /* If user asserted safelen consecutive iterations can be
242 executed concurrently, assume independence. */
243 if (loop
->safelen
>= 2)
245 if (loop
->safelen
< *max_vf
)
246 *max_vf
= loop
->safelen
;
250 if (STMT_VINFO_GATHER_P (stmtinfo_a
)
251 || STMT_VINFO_GATHER_P (stmtinfo_b
))
253 if (dump_enabled_p ())
255 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
256 "versioning for alias not supported for: "
257 "can't determine dependence between ");
258 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
260 dump_printf (MSG_MISSED_OPTIMIZATION
, " and ");
261 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
263 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
268 if (dump_enabled_p ())
270 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
271 "versioning for alias required: "
272 "can't determine dependence between ");
273 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
275 dump_printf (MSG_MISSED_OPTIMIZATION
, " and ");
276 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
278 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
281 /* Add to list of ddrs that need to be tested at run-time. */
282 return !vect_mark_for_runtime_alias_test (ddr
, loop_vinfo
);
285 /* Known data dependence. */
286 if (DDR_NUM_DIST_VECTS (ddr
) == 0)
288 /* If user asserted safelen consecutive iterations can be
289 executed concurrently, assume independence. */
290 if (loop
->safelen
>= 2)
292 if (loop
->safelen
< *max_vf
)
293 *max_vf
= loop
->safelen
;
297 if (STMT_VINFO_GATHER_P (stmtinfo_a
)
298 || STMT_VINFO_GATHER_P (stmtinfo_b
))
300 if (dump_enabled_p ())
302 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
303 "versioning for alias not supported for: "
304 "bad dist vector for ");
305 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
307 dump_printf (MSG_MISSED_OPTIMIZATION
, " and ");
308 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
310 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
315 if (dump_enabled_p ())
317 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
318 "versioning for alias required: "
319 "bad dist vector for ");
320 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, DR_REF (dra
));
321 dump_printf (MSG_MISSED_OPTIMIZATION
, " and ");
322 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, DR_REF (drb
));
323 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
325 /* Add to list of ddrs that need to be tested at run-time. */
326 return !vect_mark_for_runtime_alias_test (ddr
, loop_vinfo
);
329 loop_depth
= index_in_loop_nest (loop
->num
, DDR_LOOP_NEST (ddr
));
330 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr
), i
, dist_v
)
332 int dist
= dist_v
[loop_depth
];
334 if (dump_enabled_p ())
335 dump_printf_loc (MSG_NOTE
, vect_location
,
336 "dependence distance = %d.\n", dist
);
340 if (dump_enabled_p ())
342 dump_printf_loc (MSG_NOTE
, vect_location
,
343 "dependence distance == 0 between ");
344 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dra
));
345 dump_printf (MSG_NOTE
, " and ");
346 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (drb
));
347 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
350 /* When we perform grouped accesses and perform implicit CSE
351 by detecting equal accesses and doing disambiguation with
352 runtime alias tests like for
360 where we will end up loading { a[i], a[i+1] } once, make
361 sure that inserting group loads before the first load and
362 stores after the last store will do the right thing. */
363 if ((STMT_VINFO_GROUPED_ACCESS (stmtinfo_a
)
364 && GROUP_SAME_DR_STMT (stmtinfo_a
))
365 || (STMT_VINFO_GROUPED_ACCESS (stmtinfo_b
)
366 && GROUP_SAME_DR_STMT (stmtinfo_b
)))
369 earlier_stmt
= get_earlier_stmt (DR_STMT (dra
), DR_STMT (drb
));
371 (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt
))))
373 if (dump_enabled_p ())
374 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
375 "READ_WRITE dependence in interleaving."
384 if (dist
> 0 && DDR_REVERSED_P (ddr
))
386 /* If DDR_REVERSED_P the order of the data-refs in DDR was
387 reversed (to make distance vector positive), and the actual
388 distance is negative. */
389 if (dump_enabled_p ())
390 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
391 "dependence distance negative.\n");
396 && abs (dist
) < *max_vf
)
398 /* The dependence distance requires reduction of the maximal
399 vectorization factor. */
400 *max_vf
= abs (dist
);
401 if (dump_enabled_p ())
402 dump_printf_loc (MSG_NOTE
, vect_location
,
403 "adjusting maximal vectorization factor to %i\n",
407 if (abs (dist
) >= *max_vf
)
409 /* Dependence distance does not create dependence, as far as
410 vectorization is concerned, in this case. */
411 if (dump_enabled_p ())
412 dump_printf_loc (MSG_NOTE
, vect_location
,
413 "dependence distance >= VF.\n");
417 if (dump_enabled_p ())
419 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
420 "not vectorized, possible dependence "
421 "between data-refs ");
422 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dra
));
423 dump_printf (MSG_NOTE
, " and ");
424 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (drb
));
425 dump_printf (MSG_NOTE
, "\n");
434 /* Function vect_analyze_data_ref_dependences.
436 Examine all the data references in the loop, and make sure there do not
437 exist any data dependences between them. Set *MAX_VF according to
438 the maximum vectorization factor the data dependences allow. */
441 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo
, int *max_vf
)
444 struct data_dependence_relation
*ddr
;
446 if (dump_enabled_p ())
447 dump_printf_loc (MSG_NOTE
, vect_location
,
448 "=== vect_analyze_data_ref_dependences ===\n");
450 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo
),
451 &LOOP_VINFO_DDRS (loop_vinfo
),
452 LOOP_VINFO_LOOP_NEST (loop_vinfo
), true))
455 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo
), i
, ddr
)
456 if (vect_analyze_data_ref_dependence (ddr
, loop_vinfo
, max_vf
))
463 /* Function vect_slp_analyze_data_ref_dependence.
465 Return TRUE if there (might) exist a dependence between a memory-reference
466 DRA and a memory-reference DRB. When versioning for alias may check a
467 dependence at run-time, return FALSE. Adjust *MAX_VF according to
468 the data dependence. */
471 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation
*ddr
)
473 struct data_reference
*dra
= DDR_A (ddr
);
474 struct data_reference
*drb
= DDR_B (ddr
);
476 /* We need to check dependences of statements marked as unvectorizable
477 as well, they still can prohibit vectorization. */
479 /* Independent data accesses. */
480 if (DDR_ARE_DEPENDENT (ddr
) == chrec_known
)
486 /* Read-read is OK. */
487 if (DR_IS_READ (dra
) && DR_IS_READ (drb
))
490 /* If dra and drb are part of the same interleaving chain consider
492 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra
)))
493 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra
)))
494 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb
)))))
497 /* Unknown data dependence. */
498 if (DDR_ARE_DEPENDENT (ddr
) == chrec_dont_know
)
502 if (dump_enabled_p ())
504 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
505 "can't determine dependence between ");
506 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, DR_REF (dra
));
507 dump_printf (MSG_MISSED_OPTIMIZATION
, " and ");
508 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, DR_REF (drb
));
509 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
512 /* We do not vectorize basic blocks with write-write dependencies. */
513 if (DR_IS_WRITE (dra
) && DR_IS_WRITE (drb
))
516 /* Check that it's not a load-after-store dependence. */
517 earlier_stmt
= get_earlier_stmt (DR_STMT (dra
), DR_STMT (drb
));
518 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt
))))
524 if (dump_enabled_p ())
526 dump_printf_loc (MSG_NOTE
, vect_location
,
527 "determined dependence between ");
528 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dra
));
529 dump_printf (MSG_NOTE
, " and ");
530 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (drb
));
531 dump_printf (MSG_NOTE
, "\n");
534 /* Do not vectorize basic blocks with write-write dependences. */
535 if (DR_IS_WRITE (dra
) && DR_IS_WRITE (drb
))
538 /* Check dependence between DRA and DRB for basic block vectorization.
539 If the accesses share same bases and offsets, we can compare their initial
540 constant offsets to decide whether they differ or not. In case of a read-
541 write dependence we check that the load is before the store to ensure that
542 vectorization will not change the order of the accesses. */
544 HOST_WIDE_INT type_size_a
, type_size_b
, init_a
, init_b
;
547 /* Check that the data-refs have same bases and offsets. If not, we can't
548 determine if they are dependent. */
549 if (!operand_equal_p (DR_BASE_ADDRESS (dra
), DR_BASE_ADDRESS (drb
), 0)
550 || !dr_equal_offsets_p (dra
, drb
))
553 /* Check the types. */
554 type_size_a
= TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra
))));
555 type_size_b
= TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb
))));
557 if (type_size_a
!= type_size_b
558 || !types_compatible_p (TREE_TYPE (DR_REF (dra
)),
559 TREE_TYPE (DR_REF (drb
))))
562 init_a
= TREE_INT_CST_LOW (DR_INIT (dra
));
563 init_b
= TREE_INT_CST_LOW (DR_INIT (drb
));
565 /* Two different locations - no dependence. */
566 if (init_a
!= init_b
)
569 /* We have a read-write dependence. Check that the load is before the store.
570 When we vectorize basic blocks, vector load can be only before
571 corresponding scalar load, and vector store can be only after its
572 corresponding scalar store. So the order of the acceses is preserved in
573 case the load is before the store. */
574 earlier_stmt
= get_earlier_stmt (DR_STMT (dra
), DR_STMT (drb
));
575 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt
))))
582 /* Function vect_analyze_data_ref_dependences.
584 Examine all the data references in the basic-block, and make sure there
585 do not exist any data dependences between them. Set *MAX_VF according to
586 the maximum vectorization factor the data dependences allow. */
589 vect_slp_analyze_data_ref_dependences (bb_vec_info bb_vinfo
)
591 struct data_dependence_relation
*ddr
;
594 if (dump_enabled_p ())
595 dump_printf_loc (MSG_NOTE
, vect_location
,
596 "=== vect_slp_analyze_data_ref_dependences ===\n");
598 if (!compute_all_dependences (BB_VINFO_DATAREFS (bb_vinfo
),
599 &BB_VINFO_DDRS (bb_vinfo
),
603 FOR_EACH_VEC_ELT (BB_VINFO_DDRS (bb_vinfo
), i
, ddr
)
604 if (vect_slp_analyze_data_ref_dependence (ddr
))
611 /* Function vect_compute_data_ref_alignment
613 Compute the misalignment of the data reference DR.
616 1. If during the misalignment computation it is found that the data reference
617 cannot be vectorized then false is returned.
618 2. DR_MISALIGNMENT (DR) is defined.
620 FOR NOW: No analysis is actually performed. Misalignment is calculated
621 only for trivial cases. TODO. */
624 vect_compute_data_ref_alignment (struct data_reference
*dr
)
626 gimple stmt
= DR_STMT (dr
);
627 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
628 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
629 struct loop
*loop
= NULL
;
630 tree ref
= DR_REF (dr
);
632 tree base
, base_addr
;
635 tree aligned_to
, alignment
;
637 if (dump_enabled_p ())
638 dump_printf_loc (MSG_NOTE
, vect_location
,
639 "vect_compute_data_ref_alignment:\n");
642 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
644 /* Initialize misalignment to unknown. */
645 SET_DR_MISALIGNMENT (dr
, -1);
647 /* Strided loads perform only component accesses, misalignment information
648 is irrelevant for them. */
649 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info
))
652 misalign
= DR_INIT (dr
);
653 aligned_to
= DR_ALIGNED_TO (dr
);
654 base_addr
= DR_BASE_ADDRESS (dr
);
655 vectype
= STMT_VINFO_VECTYPE (stmt_info
);
657 /* In case the dataref is in an inner-loop of the loop that is being
658 vectorized (LOOP), we use the base and misalignment information
659 relative to the outer-loop (LOOP). This is ok only if the misalignment
660 stays the same throughout the execution of the inner-loop, which is why
661 we have to check that the stride of the dataref in the inner-loop evenly
662 divides by the vector size. */
663 if (loop
&& nested_in_vect_loop_p (loop
, stmt
))
665 tree step
= DR_STEP (dr
);
666 HOST_WIDE_INT dr_step
= TREE_INT_CST_LOW (step
);
668 if (dr_step
% GET_MODE_SIZE (TYPE_MODE (vectype
)) == 0)
670 if (dump_enabled_p ())
671 dump_printf_loc (MSG_NOTE
, vect_location
,
672 "inner step divides the vector-size.\n");
673 misalign
= STMT_VINFO_DR_INIT (stmt_info
);
674 aligned_to
= STMT_VINFO_DR_ALIGNED_TO (stmt_info
);
675 base_addr
= STMT_VINFO_DR_BASE_ADDRESS (stmt_info
);
679 if (dump_enabled_p ())
680 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
681 "inner step doesn't divide the vector-size.\n");
682 misalign
= NULL_TREE
;
686 /* Similarly, if we're doing basic-block vectorization, we can only use
687 base and misalignment information relative to an innermost loop if the
688 misalignment stays the same throughout the execution of the loop.
689 As above, this is the case if the stride of the dataref evenly divides
690 by the vector size. */
693 tree step
= DR_STEP (dr
);
694 HOST_WIDE_INT dr_step
= TREE_INT_CST_LOW (step
);
696 if (dr_step
% GET_MODE_SIZE (TYPE_MODE (vectype
)) != 0)
698 if (dump_enabled_p ())
699 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
700 "SLP: step doesn't divide the vector-size.\n");
701 misalign
= NULL_TREE
;
705 base
= build_fold_indirect_ref (base_addr
);
706 alignment
= ssize_int (TYPE_ALIGN (vectype
)/BITS_PER_UNIT
);
708 if ((aligned_to
&& tree_int_cst_compare (aligned_to
, alignment
) < 0)
711 if (dump_enabled_p ())
713 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
714 "Unknown alignment for access: ");
715 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, base
);
716 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
722 && tree_int_cst_compare (ssize_int (DECL_ALIGN_UNIT (base
)),
724 || (TREE_CODE (base_addr
) == SSA_NAME
725 && tree_int_cst_compare (ssize_int (TYPE_ALIGN_UNIT (TREE_TYPE (
726 TREE_TYPE (base_addr
)))),
728 || (get_pointer_alignment (base_addr
) >= TYPE_ALIGN (vectype
)))
731 base_aligned
= false;
735 /* Do not change the alignment of global variables here if
736 flag_section_anchors is enabled as we already generated
737 RTL for other functions. Most global variables should
738 have been aligned during the IPA increase_alignment pass. */
739 if (!vect_can_force_dr_alignment_p (base
, TYPE_ALIGN (vectype
))
740 || (TREE_STATIC (base
) && flag_section_anchors
))
742 if (dump_enabled_p ())
744 dump_printf_loc (MSG_NOTE
, vect_location
,
745 "can't force alignment of ref: ");
746 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, ref
);
747 dump_printf (MSG_NOTE
, "\n");
752 /* Force the alignment of the decl.
753 NOTE: This is the only change to the code we make during
754 the analysis phase, before deciding to vectorize the loop. */
755 if (dump_enabled_p ())
757 dump_printf_loc (MSG_NOTE
, vect_location
, "force alignment of ");
758 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, ref
);
759 dump_printf (MSG_NOTE
, "\n");
762 ((dataref_aux
*)dr
->aux
)->base_decl
= base
;
763 ((dataref_aux
*)dr
->aux
)->base_misaligned
= true;
766 /* If this is a backward running DR then first access in the larger
767 vectype actually is N-1 elements before the address in the DR.
768 Adjust misalign accordingly. */
769 if (tree_int_cst_compare (DR_STEP (dr
), size_zero_node
) < 0)
771 tree offset
= ssize_int (TYPE_VECTOR_SUBPARTS (vectype
) - 1);
772 /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
773 otherwise we wouldn't be here. */
774 offset
= fold_build2 (MULT_EXPR
, ssizetype
, offset
, DR_STEP (dr
));
775 /* PLUS because DR_STEP was negative. */
776 misalign
= size_binop (PLUS_EXPR
, misalign
, offset
);
779 /* Modulo alignment. */
780 misalign
= size_binop (FLOOR_MOD_EXPR
, misalign
, alignment
);
782 if (!tree_fits_uhwi_p (misalign
))
784 /* Negative or overflowed misalignment value. */
785 if (dump_enabled_p ())
786 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
787 "unexpected misalign value\n");
791 SET_DR_MISALIGNMENT (dr
, tree_to_uhwi (misalign
));
793 if (dump_enabled_p ())
795 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
796 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr
));
797 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, ref
);
798 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
805 /* Function vect_compute_data_refs_alignment
807 Compute the misalignment of data references in the loop.
808 Return FALSE if a data reference is found that cannot be vectorized. */
811 vect_compute_data_refs_alignment (loop_vec_info loop_vinfo
,
812 bb_vec_info bb_vinfo
)
814 vec
<data_reference_p
> datarefs
;
815 struct data_reference
*dr
;
819 datarefs
= LOOP_VINFO_DATAREFS (loop_vinfo
);
821 datarefs
= BB_VINFO_DATAREFS (bb_vinfo
);
823 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
824 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr
)))
825 && !vect_compute_data_ref_alignment (dr
))
829 /* Mark unsupported statement as unvectorizable. */
830 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr
))) = false;
841 /* Function vect_update_misalignment_for_peel
843 DR - the data reference whose misalignment is to be adjusted.
844 DR_PEEL - the data reference whose misalignment is being made
845 zero in the vector loop by the peel.
846 NPEEL - the number of iterations in the peel loop if the misalignment
847 of DR_PEEL is known at compile time. */
850 vect_update_misalignment_for_peel (struct data_reference
*dr
,
851 struct data_reference
*dr_peel
, int npeel
)
854 vec
<dr_p
> same_align_drs
;
855 struct data_reference
*current_dr
;
856 int dr_size
= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr
))));
857 int dr_peel_size
= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel
))));
858 stmt_vec_info stmt_info
= vinfo_for_stmt (DR_STMT (dr
));
859 stmt_vec_info peel_stmt_info
= vinfo_for_stmt (DR_STMT (dr_peel
));
861 /* For interleaved data accesses the step in the loop must be multiplied by
862 the size of the interleaving group. */
863 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
864 dr_size
*= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info
)));
865 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info
))
866 dr_peel_size
*= GROUP_SIZE (peel_stmt_info
);
868 /* It can be assumed that the data refs with the same alignment as dr_peel
869 are aligned in the vector loop. */
871 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel
)));
872 FOR_EACH_VEC_ELT (same_align_drs
, i
, current_dr
)
874 if (current_dr
!= dr
)
876 gcc_assert (DR_MISALIGNMENT (dr
) / dr_size
==
877 DR_MISALIGNMENT (dr_peel
) / dr_peel_size
);
878 SET_DR_MISALIGNMENT (dr
, 0);
882 if (known_alignment_for_access_p (dr
)
883 && known_alignment_for_access_p (dr_peel
))
885 bool negative
= tree_int_cst_compare (DR_STEP (dr
), size_zero_node
) < 0;
886 int misal
= DR_MISALIGNMENT (dr
);
887 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
888 misal
+= negative
? -npeel
* dr_size
: npeel
* dr_size
;
889 misal
&= (TYPE_ALIGN (vectype
) / BITS_PER_UNIT
) - 1;
890 SET_DR_MISALIGNMENT (dr
, misal
);
894 if (dump_enabled_p ())
895 dump_printf_loc (MSG_NOTE
, vect_location
, "Setting misalignment to -1.\n");
896 SET_DR_MISALIGNMENT (dr
, -1);
900 /* Function vect_verify_datarefs_alignment
902 Return TRUE if all data references in the loop can be
903 handled with respect to alignment. */
906 vect_verify_datarefs_alignment (loop_vec_info loop_vinfo
, bb_vec_info bb_vinfo
)
908 vec
<data_reference_p
> datarefs
;
909 struct data_reference
*dr
;
910 enum dr_alignment_support supportable_dr_alignment
;
914 datarefs
= LOOP_VINFO_DATAREFS (loop_vinfo
);
916 datarefs
= BB_VINFO_DATAREFS (bb_vinfo
);
918 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
920 gimple stmt
= DR_STMT (dr
);
921 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
923 if (!STMT_VINFO_RELEVANT_P (stmt_info
))
926 /* For interleaving, only the alignment of the first access matters.
927 Skip statements marked as not vectorizable. */
928 if ((STMT_VINFO_GROUPED_ACCESS (stmt_info
)
929 && GROUP_FIRST_ELEMENT (stmt_info
) != stmt
)
930 || !STMT_VINFO_VECTORIZABLE (stmt_info
))
933 /* Strided loads perform only component accesses, alignment is
934 irrelevant for them. */
935 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info
))
938 supportable_dr_alignment
= vect_supportable_dr_alignment (dr
, false);
939 if (!supportable_dr_alignment
)
941 if (dump_enabled_p ())
944 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
945 "not vectorized: unsupported unaligned load.");
947 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
948 "not vectorized: unsupported unaligned "
951 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
953 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
957 if (supportable_dr_alignment
!= dr_aligned
&& dump_enabled_p ())
958 dump_printf_loc (MSG_NOTE
, vect_location
,
959 "Vectorizing an unaligned access.\n");
964 /* Given an memory reference EXP return whether its alignment is less
968 not_size_aligned (tree exp
)
970 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp
))))
973 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp
)))
974 > get_object_alignment (exp
));
977 /* Function vector_alignment_reachable_p
979 Return true if vector alignment for DR is reachable by peeling
980 a few loop iterations. Return false otherwise. */
983 vector_alignment_reachable_p (struct data_reference
*dr
)
985 gimple stmt
= DR_STMT (dr
);
986 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
987 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
989 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
991 /* For interleaved access we peel only if number of iterations in
992 the prolog loop ({VF - misalignment}), is a multiple of the
993 number of the interleaved accesses. */
994 int elem_size
, mis_in_elements
;
995 int nelements
= TYPE_VECTOR_SUBPARTS (vectype
);
997 /* FORNOW: handle only known alignment. */
998 if (!known_alignment_for_access_p (dr
))
1001 elem_size
= GET_MODE_SIZE (TYPE_MODE (vectype
)) / nelements
;
1002 mis_in_elements
= DR_MISALIGNMENT (dr
) / elem_size
;
1004 if ((nelements
- mis_in_elements
) % GROUP_SIZE (stmt_info
))
1008 /* If misalignment is known at the compile time then allow peeling
1009 only if natural alignment is reachable through peeling. */
1010 if (known_alignment_for_access_p (dr
) && !aligned_access_p (dr
))
1012 HOST_WIDE_INT elmsize
=
1013 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype
)));
1014 if (dump_enabled_p ())
1016 dump_printf_loc (MSG_NOTE
, vect_location
,
1017 "data size =" HOST_WIDE_INT_PRINT_DEC
, elmsize
);
1018 dump_printf (MSG_NOTE
,
1019 ". misalignment = %d.\n", DR_MISALIGNMENT (dr
));
1021 if (DR_MISALIGNMENT (dr
) % elmsize
)
1023 if (dump_enabled_p ())
1024 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1025 "data size does not divide the misalignment.\n");
1030 if (!known_alignment_for_access_p (dr
))
1032 tree type
= TREE_TYPE (DR_REF (dr
));
1033 bool is_packed
= not_size_aligned (DR_REF (dr
));
1034 if (dump_enabled_p ())
1035 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1036 "Unknown misalignment, is_packed = %d\n",is_packed
);
1037 if ((TYPE_USER_ALIGN (type
) && !is_packed
)
1038 || targetm
.vectorize
.vector_alignment_reachable (type
, is_packed
))
1048 /* Calculate the cost of the memory access represented by DR. */
1051 vect_get_data_access_cost (struct data_reference
*dr
,
1052 unsigned int *inside_cost
,
1053 unsigned int *outside_cost
,
1054 stmt_vector_for_cost
*body_cost_vec
)
1056 gimple stmt
= DR_STMT (dr
);
1057 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
1058 int nunits
= TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info
));
1059 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
1060 int vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
1061 int ncopies
= vf
/ nunits
;
1063 if (DR_IS_READ (dr
))
1064 vect_get_load_cost (dr
, ncopies
, true, inside_cost
, outside_cost
,
1065 NULL
, body_cost_vec
, false);
1067 vect_get_store_cost (dr
, ncopies
, inside_cost
, body_cost_vec
);
1069 if (dump_enabled_p ())
1070 dump_printf_loc (MSG_NOTE
, vect_location
,
1071 "vect_get_data_access_cost: inside_cost = %d, "
1072 "outside_cost = %d.\n", *inside_cost
, *outside_cost
);
1076 /* Insert DR into peeling hash table with NPEEL as key. */
1079 vect_peeling_hash_insert (loop_vec_info loop_vinfo
, struct data_reference
*dr
,
1082 struct _vect_peel_info elem
, *slot
;
1083 _vect_peel_info
**new_slot
;
1084 bool supportable_dr_alignment
= vect_supportable_dr_alignment (dr
, true);
1087 slot
= LOOP_VINFO_PEELING_HTAB (loop_vinfo
).find (&elem
);
1092 slot
= XNEW (struct _vect_peel_info
);
1093 slot
->npeel
= npeel
;
1096 new_slot
= LOOP_VINFO_PEELING_HTAB (loop_vinfo
).find_slot (slot
, INSERT
);
1100 if (!supportable_dr_alignment
1101 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo
)))
1102 slot
->count
+= VECT_MAX_COST
;
1106 /* Traverse peeling hash table to find peeling option that aligns maximum
1107 number of data accesses. */
1110 vect_peeling_hash_get_most_frequent (_vect_peel_info
**slot
,
1111 _vect_peel_extended_info
*max
)
1113 vect_peel_info elem
= *slot
;
1115 if (elem
->count
> max
->peel_info
.count
1116 || (elem
->count
== max
->peel_info
.count
1117 && max
->peel_info
.npeel
> elem
->npeel
))
1119 max
->peel_info
.npeel
= elem
->npeel
;
1120 max
->peel_info
.count
= elem
->count
;
1121 max
->peel_info
.dr
= elem
->dr
;
1128 /* Traverse peeling hash table and calculate cost for each peeling option.
1129 Find the one with the lowest cost. */
1132 vect_peeling_hash_get_lowest_cost (_vect_peel_info
**slot
,
1133 _vect_peel_extended_info
*min
)
1135 vect_peel_info elem
= *slot
;
1136 int save_misalignment
, dummy
;
1137 unsigned int inside_cost
= 0, outside_cost
= 0, i
;
1138 gimple stmt
= DR_STMT (elem
->dr
);
1139 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
1140 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
1141 vec
<data_reference_p
> datarefs
= LOOP_VINFO_DATAREFS (loop_vinfo
);
1142 struct data_reference
*dr
;
1143 stmt_vector_for_cost prologue_cost_vec
, body_cost_vec
, epilogue_cost_vec
;
1144 int single_iter_cost
;
1146 prologue_cost_vec
.create (2);
1147 body_cost_vec
.create (2);
1148 epilogue_cost_vec
.create (2);
1150 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
1152 stmt
= DR_STMT (dr
);
1153 stmt_info
= vinfo_for_stmt (stmt
);
1154 /* For interleaving, only the alignment of the first access
1156 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1157 && GROUP_FIRST_ELEMENT (stmt_info
) != stmt
)
1160 save_misalignment
= DR_MISALIGNMENT (dr
);
1161 vect_update_misalignment_for_peel (dr
, elem
->dr
, elem
->npeel
);
1162 vect_get_data_access_cost (dr
, &inside_cost
, &outside_cost
,
1164 SET_DR_MISALIGNMENT (dr
, save_misalignment
);
1167 single_iter_cost
= vect_get_single_scalar_iteration_cost (loop_vinfo
);
1168 outside_cost
+= vect_get_known_peeling_cost (loop_vinfo
, elem
->npeel
,
1169 &dummy
, single_iter_cost
,
1171 &epilogue_cost_vec
);
1173 /* Prologue and epilogue costs are added to the target model later.
1174 These costs depend only on the scalar iteration cost, the
1175 number of peeling iterations finally chosen, and the number of
1176 misaligned statements. So discard the information found here. */
1177 prologue_cost_vec
.release ();
1178 epilogue_cost_vec
.release ();
1180 if (inside_cost
< min
->inside_cost
1181 || (inside_cost
== min
->inside_cost
&& outside_cost
< min
->outside_cost
))
1183 min
->inside_cost
= inside_cost
;
1184 min
->outside_cost
= outside_cost
;
1185 min
->body_cost_vec
.release ();
1186 min
->body_cost_vec
= body_cost_vec
;
1187 min
->peel_info
.dr
= elem
->dr
;
1188 min
->peel_info
.npeel
= elem
->npeel
;
1191 body_cost_vec
.release ();
1197 /* Choose best peeling option by traversing peeling hash table and either
1198 choosing an option with the lowest cost (if cost model is enabled) or the
1199 option that aligns as many accesses as possible. */
1201 static struct data_reference
*
1202 vect_peeling_hash_choose_best_peeling (loop_vec_info loop_vinfo
,
1203 unsigned int *npeel
,
1204 stmt_vector_for_cost
*body_cost_vec
)
1206 struct _vect_peel_extended_info res
;
1208 res
.peel_info
.dr
= NULL
;
1209 res
.body_cost_vec
= stmt_vector_for_cost ();
1211 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo
)))
1213 res
.inside_cost
= INT_MAX
;
1214 res
.outside_cost
= INT_MAX
;
1215 LOOP_VINFO_PEELING_HTAB (loop_vinfo
)
1216 .traverse
<_vect_peel_extended_info
*,
1217 vect_peeling_hash_get_lowest_cost
> (&res
);
1221 res
.peel_info
.count
= 0;
1222 LOOP_VINFO_PEELING_HTAB (loop_vinfo
)
1223 .traverse
<_vect_peel_extended_info
*,
1224 vect_peeling_hash_get_most_frequent
> (&res
);
1227 *npeel
= res
.peel_info
.npeel
;
1228 *body_cost_vec
= res
.body_cost_vec
;
1229 return res
.peel_info
.dr
;
1233 /* Function vect_enhance_data_refs_alignment
1235 This pass will use loop versioning and loop peeling in order to enhance
1236 the alignment of data references in the loop.
1238 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1239 original loop is to be vectorized. Any other loops that are created by
1240 the transformations performed in this pass - are not supposed to be
1241 vectorized. This restriction will be relaxed.
1243 This pass will require a cost model to guide it whether to apply peeling
1244 or versioning or a combination of the two. For example, the scheme that
1245 intel uses when given a loop with several memory accesses, is as follows:
1246 choose one memory access ('p') which alignment you want to force by doing
1247 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1248 other accesses are not necessarily aligned, or (2) use loop versioning to
1249 generate one loop in which all accesses are aligned, and another loop in
1250 which only 'p' is necessarily aligned.
1252 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1253 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
1254 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1256 Devising a cost model is the most critical aspect of this work. It will
1257 guide us on which access to peel for, whether to use loop versioning, how
1258 many versions to create, etc. The cost model will probably consist of
1259 generic considerations as well as target specific considerations (on
1260 powerpc for example, misaligned stores are more painful than misaligned
1263 Here are the general steps involved in alignment enhancements:
1265 -- original loop, before alignment analysis:
1266 for (i=0; i<N; i++){
1267 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1268 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1271 -- After vect_compute_data_refs_alignment:
1272 for (i=0; i<N; i++){
1273 x = q[i]; # DR_MISALIGNMENT(q) = 3
1274 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1277 -- Possibility 1: we do loop versioning:
1279 for (i=0; i<N; i++){ # loop 1A
1280 x = q[i]; # DR_MISALIGNMENT(q) = 3
1281 p[i] = y; # DR_MISALIGNMENT(p) = 0
1285 for (i=0; i<N; i++){ # loop 1B
1286 x = q[i]; # DR_MISALIGNMENT(q) = 3
1287 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1291 -- Possibility 2: we do loop peeling:
1292 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1296 for (i = 3; i < N; i++){ # loop 2A
1297 x = q[i]; # DR_MISALIGNMENT(q) = 0
1298 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1301 -- Possibility 3: combination of loop peeling and versioning:
1302 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1307 for (i = 3; i<N; i++){ # loop 3A
1308 x = q[i]; # DR_MISALIGNMENT(q) = 0
1309 p[i] = y; # DR_MISALIGNMENT(p) = 0
1313 for (i = 3; i<N; i++){ # loop 3B
1314 x = q[i]; # DR_MISALIGNMENT(q) = 0
1315 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1319 These loops are later passed to loop_transform to be vectorized. The
1320 vectorizer will use the alignment information to guide the transformation
1321 (whether to generate regular loads/stores, or with special handling for
1325 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo
)
1327 vec
<data_reference_p
> datarefs
= LOOP_VINFO_DATAREFS (loop_vinfo
);
1328 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1329 enum dr_alignment_support supportable_dr_alignment
;
1330 struct data_reference
*dr0
= NULL
, *first_store
= NULL
;
1331 struct data_reference
*dr
;
1333 bool do_peeling
= false;
1334 bool do_versioning
= false;
1337 stmt_vec_info stmt_info
;
1338 unsigned int npeel
= 0;
1339 bool all_misalignments_unknown
= true;
1340 unsigned int vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
1341 unsigned possible_npeel_number
= 1;
1343 unsigned int nelements
, mis
, same_align_drs_max
= 0;
1344 stmt_vector_for_cost body_cost_vec
= stmt_vector_for_cost ();
1346 if (dump_enabled_p ())
1347 dump_printf_loc (MSG_NOTE
, vect_location
,
1348 "=== vect_enhance_data_refs_alignment ===\n");
1350 /* While cost model enhancements are expected in the future, the high level
1351 view of the code at this time is as follows:
1353 A) If there is a misaligned access then see if peeling to align
1354 this access can make all data references satisfy
1355 vect_supportable_dr_alignment. If so, update data structures
1356 as needed and return true.
1358 B) If peeling wasn't possible and there is a data reference with an
1359 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1360 then see if loop versioning checks can be used to make all data
1361 references satisfy vect_supportable_dr_alignment. If so, update
1362 data structures as needed and return true.
1364 C) If neither peeling nor versioning were successful then return false if
1365 any data reference does not satisfy vect_supportable_dr_alignment.
1367 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1369 Note, Possibility 3 above (which is peeling and versioning together) is not
1370 being done at this time. */
1372 /* (1) Peeling to force alignment. */
1374 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1376 + How many accesses will become aligned due to the peeling
1377 - How many accesses will become unaligned due to the peeling,
1378 and the cost of misaligned accesses.
1379 - The cost of peeling (the extra runtime checks, the increase
1382 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
1384 stmt
= DR_STMT (dr
);
1385 stmt_info
= vinfo_for_stmt (stmt
);
1387 if (!STMT_VINFO_RELEVANT_P (stmt_info
))
1390 /* For interleaving, only the alignment of the first access
1392 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1393 && GROUP_FIRST_ELEMENT (stmt_info
) != stmt
)
1396 /* For invariant accesses there is nothing to enhance. */
1397 if (integer_zerop (DR_STEP (dr
)))
1400 /* Strided loads perform only component accesses, alignment is
1401 irrelevant for them. */
1402 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info
))
1405 supportable_dr_alignment
= vect_supportable_dr_alignment (dr
, true);
1406 do_peeling
= vector_alignment_reachable_p (dr
);
1409 if (known_alignment_for_access_p (dr
))
1411 unsigned int npeel_tmp
;
1412 bool negative
= tree_int_cst_compare (DR_STEP (dr
),
1413 size_zero_node
) < 0;
1415 /* Save info about DR in the hash table. */
1416 if (!LOOP_VINFO_PEELING_HTAB (loop_vinfo
).is_created ())
1417 LOOP_VINFO_PEELING_HTAB (loop_vinfo
).create (1);
1419 vectype
= STMT_VINFO_VECTYPE (stmt_info
);
1420 nelements
= TYPE_VECTOR_SUBPARTS (vectype
);
1421 mis
= DR_MISALIGNMENT (dr
) / GET_MODE_SIZE (TYPE_MODE (
1422 TREE_TYPE (DR_REF (dr
))));
1423 npeel_tmp
= (negative
1424 ? (mis
- nelements
) : (nelements
- mis
))
1427 /* For multiple types, it is possible that the bigger type access
1428 will have more than one peeling option. E.g., a loop with two
1429 types: one of size (vector size / 4), and the other one of
1430 size (vector size / 8). Vectorization factor will 8. If both
1431 access are misaligned by 3, the first one needs one scalar
1432 iteration to be aligned, and the second one needs 5. But the
1433 the first one will be aligned also by peeling 5 scalar
1434 iterations, and in that case both accesses will be aligned.
1435 Hence, except for the immediate peeling amount, we also want
1436 to try to add full vector size, while we don't exceed
1437 vectorization factor.
1438 We do this automtically for cost model, since we calculate cost
1439 for every peeling option. */
1440 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo
)))
1441 possible_npeel_number
= vf
/nelements
;
1443 /* Handle the aligned case. We may decide to align some other
1444 access, making DR unaligned. */
1445 if (DR_MISALIGNMENT (dr
) == 0)
1448 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo
)))
1449 possible_npeel_number
++;
1452 for (j
= 0; j
< possible_npeel_number
; j
++)
1454 gcc_assert (npeel_tmp
<= vf
);
1455 vect_peeling_hash_insert (loop_vinfo
, dr
, npeel_tmp
);
1456 npeel_tmp
+= nelements
;
1459 all_misalignments_unknown
= false;
1460 /* Data-ref that was chosen for the case that all the
1461 misalignments are unknown is not relevant anymore, since we
1462 have a data-ref with known alignment. */
1467 /* If we don't know any misalignment values, we prefer
1468 peeling for data-ref that has the maximum number of data-refs
1469 with the same alignment, unless the target prefers to align
1470 stores over load. */
1471 if (all_misalignments_unknown
)
1473 unsigned same_align_drs
1474 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info
).length ();
1476 || same_align_drs_max
< same_align_drs
)
1478 same_align_drs_max
= same_align_drs
;
1481 /* For data-refs with the same number of related
1482 accesses prefer the one where the misalign
1483 computation will be invariant in the outermost loop. */
1484 else if (same_align_drs_max
== same_align_drs
)
1486 struct loop
*ivloop0
, *ivloop
;
1487 ivloop0
= outermost_invariant_loop_for_expr
1488 (loop
, DR_BASE_ADDRESS (dr0
));
1489 ivloop
= outermost_invariant_loop_for_expr
1490 (loop
, DR_BASE_ADDRESS (dr
));
1491 if ((ivloop
&& !ivloop0
)
1492 || (ivloop
&& ivloop0
1493 && flow_loop_nested_p (ivloop
, ivloop0
)))
1497 if (!first_store
&& DR_IS_WRITE (dr
))
1501 /* If there are both known and unknown misaligned accesses in the
1502 loop, we choose peeling amount according to the known
1504 if (!supportable_dr_alignment
)
1507 if (!first_store
&& DR_IS_WRITE (dr
))
1514 if (!aligned_access_p (dr
))
1516 if (dump_enabled_p ())
1517 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1518 "vector alignment may not be reachable\n");
1524 /* Check if we can possibly peel the loop. */
1525 if (!vect_can_advance_ivs_p (loop_vinfo
)
1526 || !slpeel_can_duplicate_loop_p (loop
, single_exit (loop
)))
1529 if (do_peeling
&& all_misalignments_unknown
1530 && vect_supportable_dr_alignment (dr0
, false))
1533 /* Check if the target requires to prefer stores over loads, i.e., if
1534 misaligned stores are more expensive than misaligned loads (taking
1535 drs with same alignment into account). */
1536 if (first_store
&& DR_IS_READ (dr0
))
1538 unsigned int load_inside_cost
= 0, load_outside_cost
= 0;
1539 unsigned int store_inside_cost
= 0, store_outside_cost
= 0;
1540 unsigned int load_inside_penalty
= 0, load_outside_penalty
= 0;
1541 unsigned int store_inside_penalty
= 0, store_outside_penalty
= 0;
1542 stmt_vector_for_cost dummy
;
1545 vect_get_data_access_cost (dr0
, &load_inside_cost
, &load_outside_cost
,
1547 vect_get_data_access_cost (first_store
, &store_inside_cost
,
1548 &store_outside_cost
, &dummy
);
1552 /* Calculate the penalty for leaving FIRST_STORE unaligned (by
1553 aligning the load DR0). */
1554 load_inside_penalty
= store_inside_cost
;
1555 load_outside_penalty
= store_outside_cost
;
1557 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1558 DR_STMT (first_store
))).iterate (i
, &dr
);
1560 if (DR_IS_READ (dr
))
1562 load_inside_penalty
+= load_inside_cost
;
1563 load_outside_penalty
+= load_outside_cost
;
1567 load_inside_penalty
+= store_inside_cost
;
1568 load_outside_penalty
+= store_outside_cost
;
1571 /* Calculate the penalty for leaving DR0 unaligned (by
1572 aligning the FIRST_STORE). */
1573 store_inside_penalty
= load_inside_cost
;
1574 store_outside_penalty
= load_outside_cost
;
1576 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1577 DR_STMT (dr0
))).iterate (i
, &dr
);
1579 if (DR_IS_READ (dr
))
1581 store_inside_penalty
+= load_inside_cost
;
1582 store_outside_penalty
+= load_outside_cost
;
1586 store_inside_penalty
+= store_inside_cost
;
1587 store_outside_penalty
+= store_outside_cost
;
1590 if (load_inside_penalty
> store_inside_penalty
1591 || (load_inside_penalty
== store_inside_penalty
1592 && load_outside_penalty
> store_outside_penalty
))
1596 /* In case there are only loads with different unknown misalignments, use
1597 peeling only if it may help to align other accesses in the loop. */
1599 && !STMT_VINFO_SAME_ALIGN_REFS (
1600 vinfo_for_stmt (DR_STMT (dr0
))).length ()
1601 && vect_supportable_dr_alignment (dr0
, false)
1602 != dr_unaligned_supported
)
1606 if (do_peeling
&& !dr0
)
1608 /* Peeling is possible, but there is no data access that is not supported
1609 unless aligned. So we try to choose the best possible peeling. */
1611 /* We should get here only if there are drs with known misalignment. */
1612 gcc_assert (!all_misalignments_unknown
);
1614 /* Choose the best peeling from the hash table. */
1615 dr0
= vect_peeling_hash_choose_best_peeling (loop_vinfo
, &npeel
,
1623 stmt
= DR_STMT (dr0
);
1624 stmt_info
= vinfo_for_stmt (stmt
);
1625 vectype
= STMT_VINFO_VECTYPE (stmt_info
);
1626 nelements
= TYPE_VECTOR_SUBPARTS (vectype
);
1628 if (known_alignment_for_access_p (dr0
))
1630 bool negative
= tree_int_cst_compare (DR_STEP (dr0
),
1631 size_zero_node
) < 0;
1634 /* Since it's known at compile time, compute the number of
1635 iterations in the peeled loop (the peeling factor) for use in
1636 updating DR_MISALIGNMENT values. The peeling factor is the
1637 vectorization factor minus the misalignment as an element
1639 mis
= DR_MISALIGNMENT (dr0
);
1640 mis
/= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0
))));
1641 npeel
= ((negative
? mis
- nelements
: nelements
- mis
)
1645 /* For interleaved data access every iteration accesses all the
1646 members of the group, therefore we divide the number of iterations
1647 by the group size. */
1648 stmt_info
= vinfo_for_stmt (DR_STMT (dr0
));
1649 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1650 npeel
/= GROUP_SIZE (stmt_info
);
1652 if (dump_enabled_p ())
1653 dump_printf_loc (MSG_NOTE
, vect_location
,
1654 "Try peeling by %d\n", npeel
);
1657 /* Ensure that all data refs can be vectorized after the peel. */
1658 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
1660 int save_misalignment
;
1665 stmt
= DR_STMT (dr
);
1666 stmt_info
= vinfo_for_stmt (stmt
);
1667 /* For interleaving, only the alignment of the first access
1669 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1670 && GROUP_FIRST_ELEMENT (stmt_info
) != stmt
)
1673 /* Strided loads perform only component accesses, alignment is
1674 irrelevant for them. */
1675 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info
))
1678 save_misalignment
= DR_MISALIGNMENT (dr
);
1679 vect_update_misalignment_for_peel (dr
, dr0
, npeel
);
1680 supportable_dr_alignment
= vect_supportable_dr_alignment (dr
, false);
1681 SET_DR_MISALIGNMENT (dr
, save_misalignment
);
1683 if (!supportable_dr_alignment
)
1690 if (do_peeling
&& known_alignment_for_access_p (dr0
) && npeel
== 0)
1692 stat
= vect_verify_datarefs_alignment (loop_vinfo
, NULL
);
1697 body_cost_vec
.release ();
1704 unsigned max_allowed_peel
1705 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT
);
1706 if (max_allowed_peel
!= (unsigned)-1)
1708 unsigned max_peel
= npeel
;
1711 gimple dr_stmt
= DR_STMT (dr0
);
1712 stmt_vec_info vinfo
= vinfo_for_stmt (dr_stmt
);
1713 tree vtype
= STMT_VINFO_VECTYPE (vinfo
);
1714 max_peel
= TYPE_VECTOR_SUBPARTS (vtype
) - 1;
1716 if (max_peel
> max_allowed_peel
)
1719 if (dump_enabled_p ())
1720 dump_printf_loc (MSG_NOTE
, vect_location
,
1721 "Disable peeling, max peels reached: %d\n", max_peel
);
1728 stmt_info_for_cost
*si
;
1729 void *data
= LOOP_VINFO_TARGET_COST_DATA (loop_vinfo
);
1731 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1732 If the misalignment of DR_i is identical to that of dr0 then set
1733 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
1734 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1735 by the peeling factor times the element size of DR_i (MOD the
1736 vectorization factor times the size). Otherwise, the
1737 misalignment of DR_i must be set to unknown. */
1738 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
1740 vect_update_misalignment_for_peel (dr
, dr0
, npeel
);
1742 LOOP_VINFO_UNALIGNED_DR (loop_vinfo
) = dr0
;
1744 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
) = npeel
;
1746 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
)
1747 = DR_MISALIGNMENT (dr0
);
1748 SET_DR_MISALIGNMENT (dr0
, 0);
1749 if (dump_enabled_p ())
1751 dump_printf_loc (MSG_NOTE
, vect_location
,
1752 "Alignment of access forced using peeling.\n");
1753 dump_printf_loc (MSG_NOTE
, vect_location
,
1754 "Peeling for alignment will be applied.\n");
1756 /* We've delayed passing the inside-loop peeling costs to the
1757 target cost model until we were sure peeling would happen.
1759 if (body_cost_vec
.exists ())
1761 FOR_EACH_VEC_ELT (body_cost_vec
, i
, si
)
1763 struct _stmt_vec_info
*stmt_info
1764 = si
->stmt
? vinfo_for_stmt (si
->stmt
) : NULL
;
1765 (void) add_stmt_cost (data
, si
->count
, si
->kind
, stmt_info
,
1766 si
->misalign
, vect_body
);
1768 body_cost_vec
.release ();
1771 stat
= vect_verify_datarefs_alignment (loop_vinfo
, NULL
);
1777 body_cost_vec
.release ();
1779 /* (2) Versioning to force alignment. */
1781 /* Try versioning if:
1782 1) optimize loop for speed
1783 2) there is at least one unsupported misaligned data ref with an unknown
1785 3) all misaligned data refs with a known misalignment are supported, and
1786 4) the number of runtime alignment checks is within reason. */
1789 optimize_loop_nest_for_speed_p (loop
)
1790 && (!loop
->inner
); /* FORNOW */
1794 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
1796 stmt
= DR_STMT (dr
);
1797 stmt_info
= vinfo_for_stmt (stmt
);
1799 /* For interleaving, only the alignment of the first access
1801 if (aligned_access_p (dr
)
1802 || (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1803 && GROUP_FIRST_ELEMENT (stmt_info
) != stmt
))
1806 /* Strided loads perform only component accesses, alignment is
1807 irrelevant for them. */
1808 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info
))
1811 supportable_dr_alignment
= vect_supportable_dr_alignment (dr
, false);
1813 if (!supportable_dr_alignment
)
1819 if (known_alignment_for_access_p (dr
)
1820 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).length ()
1821 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS
))
1823 do_versioning
= false;
1827 stmt
= DR_STMT (dr
);
1828 vectype
= STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt
));
1829 gcc_assert (vectype
);
1831 /* The rightmost bits of an aligned address must be zeros.
1832 Construct the mask needed for this test. For example,
1833 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
1834 mask must be 15 = 0xf. */
1835 mask
= GET_MODE_SIZE (TYPE_MODE (vectype
)) - 1;
1837 /* FORNOW: use the same mask to test all potentially unaligned
1838 references in the loop. The vectorizer currently supports
1839 a single vector size, see the reference to
1840 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
1841 vectorization factor is computed. */
1842 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo
)
1843 || LOOP_VINFO_PTR_MASK (loop_vinfo
) == mask
);
1844 LOOP_VINFO_PTR_MASK (loop_vinfo
) = mask
;
1845 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).safe_push (
1850 /* Versioning requires at least one misaligned data reference. */
1851 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo
))
1852 do_versioning
= false;
1853 else if (!do_versioning
)
1854 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).truncate (0);
1859 vec
<gimple
> may_misalign_stmts
1860 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
);
1863 /* It can now be assumed that the data references in the statements
1864 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
1865 of the loop being vectorized. */
1866 FOR_EACH_VEC_ELT (may_misalign_stmts
, i
, stmt
)
1868 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
1869 dr
= STMT_VINFO_DATA_REF (stmt_info
);
1870 SET_DR_MISALIGNMENT (dr
, 0);
1871 if (dump_enabled_p ())
1872 dump_printf_loc (MSG_NOTE
, vect_location
,
1873 "Alignment of access forced using versioning.\n");
1876 if (dump_enabled_p ())
1877 dump_printf_loc (MSG_NOTE
, vect_location
,
1878 "Versioning for alignment will be applied.\n");
1880 /* Peeling and versioning can't be done together at this time. */
1881 gcc_assert (! (do_peeling
&& do_versioning
));
1883 stat
= vect_verify_datarefs_alignment (loop_vinfo
, NULL
);
1888 /* This point is reached if neither peeling nor versioning is being done. */
1889 gcc_assert (! (do_peeling
|| do_versioning
));
1891 stat
= vect_verify_datarefs_alignment (loop_vinfo
, NULL
);
1896 /* Function vect_find_same_alignment_drs.
1898 Update group and alignment relations according to the chosen
1899 vectorization factor. */
1902 vect_find_same_alignment_drs (struct data_dependence_relation
*ddr
,
1903 loop_vec_info loop_vinfo
)
1906 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1907 int vectorization_factor
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
1908 struct data_reference
*dra
= DDR_A (ddr
);
1909 struct data_reference
*drb
= DDR_B (ddr
);
1910 stmt_vec_info stmtinfo_a
= vinfo_for_stmt (DR_STMT (dra
));
1911 stmt_vec_info stmtinfo_b
= vinfo_for_stmt (DR_STMT (drb
));
1912 int dra_size
= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra
))));
1913 int drb_size
= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb
))));
1914 lambda_vector dist_v
;
1915 unsigned int loop_depth
;
1917 if (DDR_ARE_DEPENDENT (ddr
) == chrec_known
)
1923 if (DDR_ARE_DEPENDENT (ddr
) == chrec_dont_know
)
1926 /* Loop-based vectorization and known data dependence. */
1927 if (DDR_NUM_DIST_VECTS (ddr
) == 0)
1930 /* Data-dependence analysis reports a distance vector of zero
1931 for data-references that overlap only in the first iteration
1932 but have different sign step (see PR45764).
1933 So as a sanity check require equal DR_STEP. */
1934 if (!operand_equal_p (DR_STEP (dra
), DR_STEP (drb
), 0))
1937 loop_depth
= index_in_loop_nest (loop
->num
, DDR_LOOP_NEST (ddr
));
1938 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr
), i
, dist_v
)
1940 int dist
= dist_v
[loop_depth
];
1942 if (dump_enabled_p ())
1943 dump_printf_loc (MSG_NOTE
, vect_location
,
1944 "dependence distance = %d.\n", dist
);
1946 /* Same loop iteration. */
1948 || (dist
% vectorization_factor
== 0 && dra_size
== drb_size
))
1950 /* Two references with distance zero have the same alignment. */
1951 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a
).safe_push (drb
);
1952 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b
).safe_push (dra
);
1953 if (dump_enabled_p ())
1955 dump_printf_loc (MSG_NOTE
, vect_location
,
1956 "accesses have the same alignment.\n");
1957 dump_printf (MSG_NOTE
,
1958 "dependence distance modulo vf == 0 between ");
1959 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dra
));
1960 dump_printf (MSG_NOTE
, " and ");
1961 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (drb
));
1962 dump_printf (MSG_NOTE
, "\n");
1969 /* Function vect_analyze_data_refs_alignment
1971 Analyze the alignment of the data-references in the loop.
1972 Return FALSE if a data reference is found that cannot be vectorized. */
1975 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo
,
1976 bb_vec_info bb_vinfo
)
1978 if (dump_enabled_p ())
1979 dump_printf_loc (MSG_NOTE
, vect_location
,
1980 "=== vect_analyze_data_refs_alignment ===\n");
1982 /* Mark groups of data references with same alignment using
1983 data dependence information. */
1986 vec
<ddr_p
> ddrs
= LOOP_VINFO_DDRS (loop_vinfo
);
1987 struct data_dependence_relation
*ddr
;
1990 FOR_EACH_VEC_ELT (ddrs
, i
, ddr
)
1991 vect_find_same_alignment_drs (ddr
, loop_vinfo
);
1994 if (!vect_compute_data_refs_alignment (loop_vinfo
, bb_vinfo
))
1996 if (dump_enabled_p ())
1997 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1998 "not vectorized: can't calculate alignment "
2007 /* Analyze groups of accesses: check that DR belongs to a group of
2008 accesses of legal size, step, etc. Detect gaps, single element
2009 interleaving, and other special cases. Set grouped access info.
2010 Collect groups of strided stores for further use in SLP analysis. */
2013 vect_analyze_group_access (struct data_reference
*dr
)
2015 tree step
= DR_STEP (dr
);
2016 tree scalar_type
= TREE_TYPE (DR_REF (dr
));
2017 HOST_WIDE_INT type_size
= TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type
));
2018 gimple stmt
= DR_STMT (dr
);
2019 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
2020 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
2021 bb_vec_info bb_vinfo
= STMT_VINFO_BB_VINFO (stmt_info
);
2022 HOST_WIDE_INT dr_step
= TREE_INT_CST_LOW (step
);
2023 HOST_WIDE_INT groupsize
, last_accessed_element
= 1;
2024 bool slp_impossible
= false;
2025 struct loop
*loop
= NULL
;
2028 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
2030 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2031 size of the interleaving group (including gaps). */
2032 groupsize
= absu_hwi (dr_step
) / type_size
;
2034 /* Not consecutive access is possible only if it is a part of interleaving. */
2035 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)))
2037 /* Check if it this DR is a part of interleaving, and is a single
2038 element of the group that is accessed in the loop. */
2040 /* Gaps are supported only for loads. STEP must be a multiple of the type
2041 size. The size of the group must be a power of 2. */
2043 && (dr_step
% type_size
) == 0
2045 && exact_log2 (groupsize
) != -1)
2047 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) = stmt
;
2048 GROUP_SIZE (vinfo_for_stmt (stmt
)) = groupsize
;
2049 if (dump_enabled_p ())
2051 dump_printf_loc (MSG_NOTE
, vect_location
,
2052 "Detected single element interleaving ");
2053 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dr
));
2054 dump_printf (MSG_NOTE
, " step ");
2055 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, step
);
2056 dump_printf (MSG_NOTE
, "\n");
2061 if (dump_enabled_p ())
2062 dump_printf_loc (MSG_NOTE
, vect_location
,
2063 "Data access with gaps requires scalar "
2067 if (dump_enabled_p ())
2068 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2069 "Peeling for outer loop is not"
2074 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
) = true;
2080 if (dump_enabled_p ())
2082 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2083 "not consecutive access ");
2084 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
2085 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
2090 /* Mark the statement as unvectorizable. */
2091 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr
))) = false;
2098 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) == stmt
)
2100 /* First stmt in the interleaving chain. Check the chain. */
2101 gimple next
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt
));
2102 struct data_reference
*data_ref
= dr
;
2103 unsigned int count
= 1;
2104 tree prev_init
= DR_INIT (data_ref
);
2106 HOST_WIDE_INT diff
, gaps
= 0;
2107 unsigned HOST_WIDE_INT count_in_bytes
;
2111 /* Skip same data-refs. In case that two or more stmts share
2112 data-ref (supported only for loads), we vectorize only the first
2113 stmt, and the rest get their vectorized loads from the first
2115 if (!tree_int_cst_compare (DR_INIT (data_ref
),
2116 DR_INIT (STMT_VINFO_DATA_REF (
2117 vinfo_for_stmt (next
)))))
2119 if (DR_IS_WRITE (data_ref
))
2121 if (dump_enabled_p ())
2122 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2123 "Two store stmts share the same dr.\n");
2127 /* For load use the same data-ref load. */
2128 GROUP_SAME_DR_STMT (vinfo_for_stmt (next
)) = prev
;
2131 next
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (next
));
2136 data_ref
= STMT_VINFO_DATA_REF (vinfo_for_stmt (next
));
2138 /* All group members have the same STEP by construction. */
2139 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref
), step
, 0));
2141 /* Check that the distance between two accesses is equal to the type
2142 size. Otherwise, we have gaps. */
2143 diff
= (TREE_INT_CST_LOW (DR_INIT (data_ref
))
2144 - TREE_INT_CST_LOW (prev_init
)) / type_size
;
2147 /* FORNOW: SLP of accesses with gaps is not supported. */
2148 slp_impossible
= true;
2149 if (DR_IS_WRITE (data_ref
))
2151 if (dump_enabled_p ())
2152 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2153 "interleaved store with gaps\n");
2160 last_accessed_element
+= diff
;
2162 /* Store the gap from the previous member of the group. If there is no
2163 gap in the access, GROUP_GAP is always 1. */
2164 GROUP_GAP (vinfo_for_stmt (next
)) = diff
;
2166 prev_init
= DR_INIT (data_ref
);
2167 next
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (next
));
2168 /* Count the number of data-refs in the chain. */
2172 /* COUNT is the number of accesses found, we multiply it by the size of
2173 the type to get COUNT_IN_BYTES. */
2174 count_in_bytes
= type_size
* count
;
2176 /* Check that the size of the interleaving (including gaps) is not
2177 greater than STEP. */
2179 && absu_hwi (dr_step
) < count_in_bytes
+ gaps
* type_size
)
2181 if (dump_enabled_p ())
2183 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2184 "interleaving size is greater than step for ");
2185 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
2187 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
2192 /* Check that the size of the interleaving is equal to STEP for stores,
2193 i.e., that there are no gaps. */
2195 && absu_hwi (dr_step
) != count_in_bytes
)
2197 if (DR_IS_READ (dr
))
2199 slp_impossible
= true;
2200 /* There is a gap after the last load in the group. This gap is a
2201 difference between the groupsize and the number of elements.
2202 When there is no gap, this difference should be 0. */
2203 GROUP_GAP (vinfo_for_stmt (stmt
)) = groupsize
- count
;
2207 if (dump_enabled_p ())
2208 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2209 "interleaved store with gaps\n");
2214 /* Check that STEP is a multiple of type size. */
2216 && (dr_step
% type_size
) != 0)
2218 if (dump_enabled_p ())
2220 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2221 "step is not a multiple of type size: step ");
2222 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, step
);
2223 dump_printf (MSG_MISSED_OPTIMIZATION
, " size ");
2224 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
2225 TYPE_SIZE_UNIT (scalar_type
));
2226 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
2234 GROUP_SIZE (vinfo_for_stmt (stmt
)) = groupsize
;
2235 if (dump_enabled_p ())
2236 dump_printf_loc (MSG_NOTE
, vect_location
,
2237 "Detected interleaving of size %d\n", (int)groupsize
);
2239 /* SLP: create an SLP data structure for every interleaving group of
2240 stores for further analysis in vect_analyse_slp. */
2241 if (DR_IS_WRITE (dr
) && !slp_impossible
)
2244 LOOP_VINFO_GROUPED_STORES (loop_vinfo
).safe_push (stmt
);
2246 BB_VINFO_GROUPED_STORES (bb_vinfo
).safe_push (stmt
);
2249 /* There is a gap in the end of the group. */
2250 if (groupsize
- last_accessed_element
> 0 && loop_vinfo
)
2252 if (dump_enabled_p ())
2253 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2254 "Data access with gaps requires scalar "
2258 if (dump_enabled_p ())
2259 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2260 "Peeling for outer loop is not supported\n");
2264 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
) = true;
2272 /* Analyze the access pattern of the data-reference DR.
2273 In case of non-consecutive accesses call vect_analyze_group_access() to
2274 analyze groups of accesses. */
2277 vect_analyze_data_ref_access (struct data_reference
*dr
)
2279 tree step
= DR_STEP (dr
);
2280 tree scalar_type
= TREE_TYPE (DR_REF (dr
));
2281 gimple stmt
= DR_STMT (dr
);
2282 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
2283 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
2284 struct loop
*loop
= NULL
;
2287 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
2289 if (loop_vinfo
&& !step
)
2291 if (dump_enabled_p ())
2292 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2293 "bad data-ref access in loop\n");
2297 /* Allow invariant loads in not nested loops. */
2298 if (loop_vinfo
&& integer_zerop (step
))
2300 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) = NULL
;
2301 if (nested_in_vect_loop_p (loop
, stmt
))
2303 if (dump_enabled_p ())
2304 dump_printf_loc (MSG_NOTE
, vect_location
,
2305 "zero step in inner loop of nest\n");
2308 return DR_IS_READ (dr
);
2311 if (loop
&& nested_in_vect_loop_p (loop
, stmt
))
2313 /* Interleaved accesses are not yet supported within outer-loop
2314 vectorization for references in the inner-loop. */
2315 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) = NULL
;
2317 /* For the rest of the analysis we use the outer-loop step. */
2318 step
= STMT_VINFO_DR_STEP (stmt_info
);
2319 if (integer_zerop (step
))
2321 if (dump_enabled_p ())
2322 dump_printf_loc (MSG_NOTE
, vect_location
,
2323 "zero step in outer loop.\n");
2324 if (DR_IS_READ (dr
))
2332 if (TREE_CODE (step
) == INTEGER_CST
)
2334 HOST_WIDE_INT dr_step
= TREE_INT_CST_LOW (step
);
2335 if (!tree_int_cst_compare (step
, TYPE_SIZE_UNIT (scalar_type
))
2337 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type
), -dr_step
)))
2339 /* Mark that it is not interleaving. */
2340 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) = NULL
;
2345 if (loop
&& nested_in_vect_loop_p (loop
, stmt
))
2347 if (dump_enabled_p ())
2348 dump_printf_loc (MSG_NOTE
, vect_location
,
2349 "grouped access in outer loop.\n");
2353 /* Assume this is a DR handled by non-constant strided load case. */
2354 if (TREE_CODE (step
) != INTEGER_CST
)
2355 return STMT_VINFO_STRIDE_LOAD_P (stmt_info
);
2357 /* Not consecutive access - check if it's a part of interleaving group. */
2358 return vect_analyze_group_access (dr
);
2363 /* A helper function used in the comparator function to sort data
2364 references. T1 and T2 are two data references to be compared.
2365 The function returns -1, 0, or 1. */
2368 compare_tree (tree t1
, tree t2
)
2371 enum tree_code code
;
2382 if (TREE_CODE (t1
) != TREE_CODE (t2
))
2383 return TREE_CODE (t1
) < TREE_CODE (t2
) ? -1 : 1;
2385 code
= TREE_CODE (t1
);
2388 /* For const values, we can just use hash values for comparisons. */
2396 hashval_t h1
= iterative_hash_expr (t1
, 0);
2397 hashval_t h2
= iterative_hash_expr (t2
, 0);
2399 return h1
< h2
? -1 : 1;
2404 cmp
= compare_tree (SSA_NAME_VAR (t1
), SSA_NAME_VAR (t2
));
2408 if (SSA_NAME_VERSION (t1
) != SSA_NAME_VERSION (t2
))
2409 return SSA_NAME_VERSION (t1
) < SSA_NAME_VERSION (t2
) ? -1 : 1;
2413 tclass
= TREE_CODE_CLASS (code
);
2415 /* For var-decl, we could compare their UIDs. */
2416 if (tclass
== tcc_declaration
)
2418 if (DECL_UID (t1
) != DECL_UID (t2
))
2419 return DECL_UID (t1
) < DECL_UID (t2
) ? -1 : 1;
2423 /* For expressions with operands, compare their operands recursively. */
2424 for (i
= TREE_OPERAND_LENGTH (t1
) - 1; i
>= 0; --i
)
2426 cmp
= compare_tree (TREE_OPERAND (t1
, i
), TREE_OPERAND (t2
, i
));
2436 /* Compare two data-references DRA and DRB to group them into chunks
2437 suitable for grouping. */
2440 dr_group_sort_cmp (const void *dra_
, const void *drb_
)
2442 data_reference_p dra
= *(data_reference_p
*)const_cast<void *>(dra_
);
2443 data_reference_p drb
= *(data_reference_p
*)const_cast<void *>(drb_
);
2446 /* Stabilize sort. */
2450 /* Ordering of DRs according to base. */
2451 if (!operand_equal_p (DR_BASE_ADDRESS (dra
), DR_BASE_ADDRESS (drb
), 0))
2453 cmp
= compare_tree (DR_BASE_ADDRESS (dra
), DR_BASE_ADDRESS (drb
));
2458 /* And according to DR_OFFSET. */
2459 if (!dr_equal_offsets_p (dra
, drb
))
2461 cmp
= compare_tree (DR_OFFSET (dra
), DR_OFFSET (drb
));
2466 /* Put reads before writes. */
2467 if (DR_IS_READ (dra
) != DR_IS_READ (drb
))
2468 return DR_IS_READ (dra
) ? -1 : 1;
2470 /* Then sort after access size. */
2471 if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra
))),
2472 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb
))), 0))
2474 cmp
= compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra
))),
2475 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb
))));
2480 /* And after step. */
2481 if (!operand_equal_p (DR_STEP (dra
), DR_STEP (drb
), 0))
2483 cmp
= compare_tree (DR_STEP (dra
), DR_STEP (drb
));
2488 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2489 cmp
= tree_int_cst_compare (DR_INIT (dra
), DR_INIT (drb
));
2491 return gimple_uid (DR_STMT (dra
)) < gimple_uid (DR_STMT (drb
)) ? -1 : 1;
2495 /* Function vect_analyze_data_ref_accesses.
2497 Analyze the access pattern of all the data references in the loop.
2499 FORNOW: the only access pattern that is considered vectorizable is a
2500 simple step 1 (consecutive) access.
2502 FORNOW: handle only arrays and pointer accesses. */
2505 vect_analyze_data_ref_accesses (loop_vec_info loop_vinfo
, bb_vec_info bb_vinfo
)
2508 vec
<data_reference_p
> datarefs
;
2509 struct data_reference
*dr
;
2511 if (dump_enabled_p ())
2512 dump_printf_loc (MSG_NOTE
, vect_location
,
2513 "=== vect_analyze_data_ref_accesses ===\n");
2516 datarefs
= LOOP_VINFO_DATAREFS (loop_vinfo
);
2518 datarefs
= BB_VINFO_DATAREFS (bb_vinfo
);
2520 if (datarefs
.is_empty ())
2523 /* Sort the array of datarefs to make building the interleaving chains
2525 qsort (datarefs
.address (), datarefs
.length (),
2526 sizeof (data_reference_p
), dr_group_sort_cmp
);
2528 /* Build the interleaving chains. */
2529 for (i
= 0; i
< datarefs
.length () - 1;)
2531 data_reference_p dra
= datarefs
[i
];
2532 stmt_vec_info stmtinfo_a
= vinfo_for_stmt (DR_STMT (dra
));
2533 stmt_vec_info lastinfo
= NULL
;
2534 for (i
= i
+ 1; i
< datarefs
.length (); ++i
)
2536 data_reference_p drb
= datarefs
[i
];
2537 stmt_vec_info stmtinfo_b
= vinfo_for_stmt (DR_STMT (drb
));
2539 /* ??? Imperfect sorting (non-compatible types, non-modulo
2540 accesses, same accesses) can lead to a group to be artificially
2541 split here as we don't just skip over those. If it really
2542 matters we can push those to a worklist and re-iterate
2543 over them. The we can just skip ahead to the next DR here. */
2545 /* Check that the data-refs have same first location (except init)
2546 and they are both either store or load (not load and store). */
2547 if (DR_IS_READ (dra
) != DR_IS_READ (drb
)
2548 || !operand_equal_p (DR_BASE_ADDRESS (dra
),
2549 DR_BASE_ADDRESS (drb
), 0)
2550 || !dr_equal_offsets_p (dra
, drb
))
2553 /* Check that the data-refs have the same constant size and step. */
2554 tree sza
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra
)));
2555 tree szb
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb
)));
2556 if (!tree_fits_uhwi_p (sza
)
2557 || !tree_fits_uhwi_p (szb
)
2558 || !tree_int_cst_equal (sza
, szb
)
2559 || !tree_fits_shwi_p (DR_STEP (dra
))
2560 || !tree_fits_shwi_p (DR_STEP (drb
))
2561 || !tree_int_cst_equal (DR_STEP (dra
), DR_STEP (drb
)))
2564 /* Do not place the same access in the interleaving chain twice. */
2565 if (tree_int_cst_compare (DR_INIT (dra
), DR_INIT (drb
)) == 0)
2568 /* Check the types are compatible.
2569 ??? We don't distinguish this during sorting. */
2570 if (!types_compatible_p (TREE_TYPE (DR_REF (dra
)),
2571 TREE_TYPE (DR_REF (drb
))))
2574 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2575 HOST_WIDE_INT init_a
= TREE_INT_CST_LOW (DR_INIT (dra
));
2576 HOST_WIDE_INT init_b
= TREE_INT_CST_LOW (DR_INIT (drb
));
2577 gcc_assert (init_a
< init_b
);
2579 /* If init_b == init_a + the size of the type * k, we have an
2580 interleaving, and DRA is accessed before DRB. */
2581 HOST_WIDE_INT type_size_a
= tree_to_uhwi (sza
);
2582 if ((init_b
- init_a
) % type_size_a
!= 0)
2585 /* The step (if not zero) is greater than the difference between
2586 data-refs' inits. This splits groups into suitable sizes. */
2587 HOST_WIDE_INT step
= tree_to_shwi (DR_STEP (dra
));
2588 if (step
!= 0 && step
<= (init_b
- init_a
))
2591 if (dump_enabled_p ())
2593 dump_printf_loc (MSG_NOTE
, vect_location
,
2594 "Detected interleaving ");
2595 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dra
));
2596 dump_printf (MSG_NOTE
, " and ");
2597 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (drb
));
2598 dump_printf (MSG_NOTE
, "\n");
2601 /* Link the found element into the group list. */
2602 if (!GROUP_FIRST_ELEMENT (stmtinfo_a
))
2604 GROUP_FIRST_ELEMENT (stmtinfo_a
) = DR_STMT (dra
);
2605 lastinfo
= stmtinfo_a
;
2607 GROUP_FIRST_ELEMENT (stmtinfo_b
) = DR_STMT (dra
);
2608 GROUP_NEXT_ELEMENT (lastinfo
) = DR_STMT (drb
);
2609 lastinfo
= stmtinfo_b
;
2613 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
2614 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr
)))
2615 && !vect_analyze_data_ref_access (dr
))
2617 if (dump_enabled_p ())
2618 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2619 "not vectorized: complicated access pattern.\n");
2623 /* Mark the statement as not vectorizable. */
2624 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr
))) = false;
2635 /* Operator == between two dr_with_seg_len objects.
2637 This equality operator is used to make sure two data refs
2638 are the same one so that we will consider to combine the
2639 aliasing checks of those two pairs of data dependent data
2643 operator == (const dr_with_seg_len
& d1
,
2644 const dr_with_seg_len
& d2
)
2646 return operand_equal_p (DR_BASE_ADDRESS (d1
.dr
),
2647 DR_BASE_ADDRESS (d2
.dr
), 0)
2648 && compare_tree (d1
.offset
, d2
.offset
) == 0
2649 && compare_tree (d1
.seg_len
, d2
.seg_len
) == 0;
2652 /* Function comp_dr_with_seg_len_pair.
2654 Comparison function for sorting objects of dr_with_seg_len_pair_t
2655 so that we can combine aliasing checks in one scan. */
2658 comp_dr_with_seg_len_pair (const void *p1_
, const void *p2_
)
2660 const dr_with_seg_len_pair_t
* p1
= (const dr_with_seg_len_pair_t
*) p1_
;
2661 const dr_with_seg_len_pair_t
* p2
= (const dr_with_seg_len_pair_t
*) p2_
;
2663 const dr_with_seg_len
&p11
= p1
->first
,
2668 /* For DR pairs (a, b) and (c, d), we only consider to merge the alias checks
2669 if a and c have the same basic address snd step, and b and d have the same
2670 address and step. Therefore, if any a&c or b&d don't have the same address
2671 and step, we don't care the order of those two pairs after sorting. */
2674 if ((comp_res
= compare_tree (DR_BASE_ADDRESS (p11
.dr
),
2675 DR_BASE_ADDRESS (p21
.dr
))) != 0)
2677 if ((comp_res
= compare_tree (DR_BASE_ADDRESS (p12
.dr
),
2678 DR_BASE_ADDRESS (p22
.dr
))) != 0)
2680 if ((comp_res
= compare_tree (DR_STEP (p11
.dr
), DR_STEP (p21
.dr
))) != 0)
2682 if ((comp_res
= compare_tree (DR_STEP (p12
.dr
), DR_STEP (p22
.dr
))) != 0)
2684 if ((comp_res
= compare_tree (p11
.offset
, p21
.offset
)) != 0)
2686 if ((comp_res
= compare_tree (p12
.offset
, p22
.offset
)) != 0)
2692 template <class T
> static void
2700 /* Function vect_vfa_segment_size.
2702 Create an expression that computes the size of segment
2703 that will be accessed for a data reference. The functions takes into
2704 account that realignment loads may access one more vector.
2707 DR: The data reference.
2708 LENGTH_FACTOR: segment length to consider.
2710 Return an expression whose value is the size of segment which will be
2714 vect_vfa_segment_size (struct data_reference
*dr
, tree length_factor
)
2716 tree segment_length
;
2718 if (integer_zerop (DR_STEP (dr
)))
2719 segment_length
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr
)));
2721 segment_length
= size_binop (MULT_EXPR
,
2722 fold_convert (sizetype
, DR_STEP (dr
)),
2723 fold_convert (sizetype
, length_factor
));
2725 if (vect_supportable_dr_alignment (dr
, false)
2726 == dr_explicit_realign_optimized
)
2728 tree vector_size
= TYPE_SIZE_UNIT
2729 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr
))));
2731 segment_length
= size_binop (PLUS_EXPR
, segment_length
, vector_size
);
2733 return segment_length
;
2736 /* Function vect_prune_runtime_alias_test_list.
2738 Prune a list of ddrs to be tested at run-time by versioning for alias.
2739 Merge several alias checks into one if possible.
2740 Return FALSE if resulting list of ddrs is longer then allowed by
2741 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
2744 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo
)
2746 vec
<ddr_p
> may_alias_ddrs
=
2747 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo
);
2748 vec
<dr_with_seg_len_pair_t
>& comp_alias_ddrs
=
2749 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo
);
2750 int vect_factor
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
2751 tree scalar_loop_iters
= LOOP_VINFO_NITERS (loop_vinfo
);
2757 if (dump_enabled_p ())
2758 dump_printf_loc (MSG_NOTE
, vect_location
,
2759 "=== vect_prune_runtime_alias_test_list ===\n");
2761 if (may_alias_ddrs
.is_empty ())
2764 /* Basically, for each pair of dependent data refs store_ptr_0
2765 and load_ptr_0, we create an expression:
2767 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2768 || (load_ptr_0 + load_segment_length_0) <= store_ptr_0))
2770 for aliasing checks. However, in some cases we can decrease
2771 the number of checks by combining two checks into one. For
2772 example, suppose we have another pair of data refs store_ptr_0
2773 and load_ptr_1, and if the following condition is satisfied:
2775 load_ptr_0 < load_ptr_1 &&
2776 load_ptr_1 - load_ptr_0 - load_segment_length_0 < store_segment_length_0
2778 (this condition means, in each iteration of vectorized loop,
2779 the accessed memory of store_ptr_0 cannot be between the memory
2780 of load_ptr_0 and load_ptr_1.)
2782 we then can use only the following expression to finish the
2783 alising checks between store_ptr_0 & load_ptr_0 and
2784 store_ptr_0 & load_ptr_1:
2786 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2787 || (load_ptr_1 + load_segment_length_1 <= store_ptr_0))
2789 Note that we only consider that load_ptr_0 and load_ptr_1 have the
2790 same basic address. */
2792 comp_alias_ddrs
.create (may_alias_ddrs
.length ());
2794 /* First, we collect all data ref pairs for aliasing checks. */
2795 FOR_EACH_VEC_ELT (may_alias_ddrs
, i
, ddr
)
2797 struct data_reference
*dr_a
, *dr_b
;
2798 gimple dr_group_first_a
, dr_group_first_b
;
2799 tree segment_length_a
, segment_length_b
;
2800 gimple stmt_a
, stmt_b
;
2803 stmt_a
= DR_STMT (DDR_A (ddr
));
2804 dr_group_first_a
= GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a
));
2805 if (dr_group_first_a
)
2807 stmt_a
= dr_group_first_a
;
2808 dr_a
= STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a
));
2812 stmt_b
= DR_STMT (DDR_B (ddr
));
2813 dr_group_first_b
= GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b
));
2814 if (dr_group_first_b
)
2816 stmt_b
= dr_group_first_b
;
2817 dr_b
= STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b
));
2820 if (!operand_equal_p (DR_STEP (dr_a
), DR_STEP (dr_b
), 0))
2821 length_factor
= scalar_loop_iters
;
2823 length_factor
= size_int (vect_factor
);
2824 segment_length_a
= vect_vfa_segment_size (dr_a
, length_factor
);
2825 segment_length_b
= vect_vfa_segment_size (dr_b
, length_factor
);
2827 dr_with_seg_len_pair_t dr_with_seg_len_pair
2828 (dr_with_seg_len (dr_a
, segment_length_a
),
2829 dr_with_seg_len (dr_b
, segment_length_b
));
2831 if (compare_tree (DR_BASE_ADDRESS (dr_a
), DR_BASE_ADDRESS (dr_b
)) > 0)
2832 swap (dr_with_seg_len_pair
.first
, dr_with_seg_len_pair
.second
);
2834 comp_alias_ddrs
.safe_push (dr_with_seg_len_pair
);
2837 /* Second, we sort the collected data ref pairs so that we can scan
2838 them once to combine all possible aliasing checks. */
2839 comp_alias_ddrs
.qsort (comp_dr_with_seg_len_pair
);
2841 /* Third, we scan the sorted dr pairs and check if we can combine
2842 alias checks of two neighbouring dr pairs. */
2843 for (size_t i
= 1; i
< comp_alias_ddrs
.length (); ++i
)
2845 /* Deal with two ddrs (dr_a1, dr_b1) and (dr_a2, dr_b2). */
2846 dr_with_seg_len
*dr_a1
= &comp_alias_ddrs
[i
-1].first
,
2847 *dr_b1
= &comp_alias_ddrs
[i
-1].second
,
2848 *dr_a2
= &comp_alias_ddrs
[i
].first
,
2849 *dr_b2
= &comp_alias_ddrs
[i
].second
;
2851 /* Remove duplicate data ref pairs. */
2852 if (*dr_a1
== *dr_a2
&& *dr_b1
== *dr_b2
)
2854 if (dump_enabled_p ())
2856 dump_printf_loc (MSG_NOTE
, vect_location
,
2857 "found equal ranges ");
2858 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
2859 DR_REF (dr_a1
->dr
));
2860 dump_printf (MSG_NOTE
, ", ");
2861 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
2862 DR_REF (dr_b1
->dr
));
2863 dump_printf (MSG_NOTE
, " and ");
2864 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
2865 DR_REF (dr_a2
->dr
));
2866 dump_printf (MSG_NOTE
, ", ");
2867 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
2868 DR_REF (dr_b2
->dr
));
2869 dump_printf (MSG_NOTE
, "\n");
2872 comp_alias_ddrs
.ordered_remove (i
--);
2876 if (*dr_a1
== *dr_a2
|| *dr_b1
== *dr_b2
)
2878 /* We consider the case that DR_B1 and DR_B2 are same memrefs,
2879 and DR_A1 and DR_A2 are two consecutive memrefs. */
2880 if (*dr_a1
== *dr_a2
)
2882 swap (dr_a1
, dr_b1
);
2883 swap (dr_a2
, dr_b2
);
2886 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a1
->dr
),
2887 DR_BASE_ADDRESS (dr_a2
->dr
),
2889 || !tree_fits_shwi_p (dr_a1
->offset
)
2890 || !tree_fits_shwi_p (dr_a2
->offset
))
2893 HOST_WIDE_INT diff
= (tree_to_shwi (dr_a2
->offset
)
2894 - tree_to_shwi (dr_a1
->offset
));
2897 /* Now we check if the following condition is satisfied:
2899 DIFF - SEGMENT_LENGTH_A < SEGMENT_LENGTH_B
2901 where DIFF = DR_A2->OFFSET - DR_A1->OFFSET. However,
2902 SEGMENT_LENGTH_A or SEGMENT_LENGTH_B may not be constant so we
2903 have to make a best estimation. We can get the minimum value
2904 of SEGMENT_LENGTH_B as a constant, represented by MIN_SEG_LEN_B,
2905 then either of the following two conditions can guarantee the
2908 1: DIFF <= MIN_SEG_LEN_B
2909 2: DIFF - SEGMENT_LENGTH_A < MIN_SEG_LEN_B
2914 min_seg_len_b
= (TREE_CODE (dr_b1
->seg_len
) == INTEGER_CST
) ?
2915 TREE_INT_CST_LOW (dr_b1
->seg_len
) :
2918 if (diff
<= min_seg_len_b
2919 || (TREE_CODE (dr_a1
->seg_len
) == INTEGER_CST
2920 && diff
- (HOST_WIDE_INT
) TREE_INT_CST_LOW (dr_a1
->seg_len
) <
2923 dr_a1
->seg_len
= size_binop (PLUS_EXPR
,
2924 dr_a2
->seg_len
, size_int (diff
));
2925 comp_alias_ddrs
.ordered_remove (i
--);
2930 if ((int) comp_alias_ddrs
.length () >
2931 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS
))
2933 if (dump_enabled_p ())
2935 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2936 "disable versioning for alias - max number of "
2937 "generated checks exceeded.\n");
2946 /* Check whether a non-affine read in stmt is suitable for gather load
2947 and if so, return a builtin decl for that operation. */
2950 vect_check_gather (gimple stmt
, loop_vec_info loop_vinfo
, tree
*basep
,
2951 tree
*offp
, int *scalep
)
2953 HOST_WIDE_INT scale
= 1, pbitpos
, pbitsize
;
2954 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
2955 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
2956 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
2957 tree offtype
= NULL_TREE
;
2958 tree decl
, base
, off
;
2959 enum machine_mode pmode
;
2960 int punsignedp
, pvolatilep
;
2962 /* The gather builtins need address of the form
2963 loop_invariant + vector * {1, 2, 4, 8}
2965 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
2966 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
2967 of loop invariants/SSA_NAMEs defined in the loop, with casts,
2968 multiplications and additions in it. To get a vector, we need
2969 a single SSA_NAME that will be defined in the loop and will
2970 contain everything that is not loop invariant and that can be
2971 vectorized. The following code attempts to find such a preexistng
2972 SSA_NAME OFF and put the loop invariants into a tree BASE
2973 that can be gimplified before the loop. */
2974 base
= get_inner_reference (DR_REF (dr
), &pbitsize
, &pbitpos
, &off
,
2975 &pmode
, &punsignedp
, &pvolatilep
, false);
2976 gcc_assert (base
!= NULL_TREE
&& (pbitpos
% BITS_PER_UNIT
) == 0);
2978 if (TREE_CODE (base
) == MEM_REF
)
2980 if (!integer_zerop (TREE_OPERAND (base
, 1)))
2982 if (off
== NULL_TREE
)
2984 double_int moff
= mem_ref_offset (base
);
2985 off
= double_int_to_tree (sizetype
, moff
);
2988 off
= size_binop (PLUS_EXPR
, off
,
2989 fold_convert (sizetype
, TREE_OPERAND (base
, 1)));
2991 base
= TREE_OPERAND (base
, 0);
2994 base
= build_fold_addr_expr (base
);
2996 if (off
== NULL_TREE
)
2997 off
= size_zero_node
;
2999 /* If base is not loop invariant, either off is 0, then we start with just
3000 the constant offset in the loop invariant BASE and continue with base
3001 as OFF, otherwise give up.
3002 We could handle that case by gimplifying the addition of base + off
3003 into some SSA_NAME and use that as off, but for now punt. */
3004 if (!expr_invariant_in_loop_p (loop
, base
))
3006 if (!integer_zerop (off
))
3009 base
= size_int (pbitpos
/ BITS_PER_UNIT
);
3011 /* Otherwise put base + constant offset into the loop invariant BASE
3012 and continue with OFF. */
3015 base
= fold_convert (sizetype
, base
);
3016 base
= size_binop (PLUS_EXPR
, base
, size_int (pbitpos
/ BITS_PER_UNIT
));
3019 /* OFF at this point may be either a SSA_NAME or some tree expression
3020 from get_inner_reference. Try to peel off loop invariants from it
3021 into BASE as long as possible. */
3023 while (offtype
== NULL_TREE
)
3025 enum tree_code code
;
3026 tree op0
, op1
, add
= NULL_TREE
;
3028 if (TREE_CODE (off
) == SSA_NAME
)
3030 gimple def_stmt
= SSA_NAME_DEF_STMT (off
);
3032 if (expr_invariant_in_loop_p (loop
, off
))
3035 if (gimple_code (def_stmt
) != GIMPLE_ASSIGN
)
3038 op0
= gimple_assign_rhs1 (def_stmt
);
3039 code
= gimple_assign_rhs_code (def_stmt
);
3040 op1
= gimple_assign_rhs2 (def_stmt
);
3044 if (get_gimple_rhs_class (TREE_CODE (off
)) == GIMPLE_TERNARY_RHS
)
3046 code
= TREE_CODE (off
);
3047 extract_ops_from_tree (off
, &code
, &op0
, &op1
);
3051 case POINTER_PLUS_EXPR
:
3053 if (expr_invariant_in_loop_p (loop
, op0
))
3058 add
= fold_convert (sizetype
, add
);
3060 add
= size_binop (MULT_EXPR
, add
, size_int (scale
));
3061 base
= size_binop (PLUS_EXPR
, base
, add
);
3064 if (expr_invariant_in_loop_p (loop
, op1
))
3072 if (expr_invariant_in_loop_p (loop
, op1
))
3074 add
= fold_convert (sizetype
, op1
);
3075 add
= size_binop (MINUS_EXPR
, size_zero_node
, add
);
3081 if (scale
== 1 && tree_fits_shwi_p (op1
))
3083 scale
= tree_to_shwi (op1
);
3092 if (!POINTER_TYPE_P (TREE_TYPE (op0
))
3093 && !INTEGRAL_TYPE_P (TREE_TYPE (op0
)))
3095 if (TYPE_PRECISION (TREE_TYPE (op0
))
3096 == TYPE_PRECISION (TREE_TYPE (off
)))
3101 if (TYPE_PRECISION (TREE_TYPE (op0
))
3102 < TYPE_PRECISION (TREE_TYPE (off
)))
3105 offtype
= TREE_TYPE (off
);
3116 /* If at the end OFF still isn't a SSA_NAME or isn't
3117 defined in the loop, punt. */
3118 if (TREE_CODE (off
) != SSA_NAME
3119 || expr_invariant_in_loop_p (loop
, off
))
3122 if (offtype
== NULL_TREE
)
3123 offtype
= TREE_TYPE (off
);
3125 decl
= targetm
.vectorize
.builtin_gather (STMT_VINFO_VECTYPE (stmt_info
),
3127 if (decl
== NULL_TREE
)
3139 /* Function vect_analyze_data_refs.
3141 Find all the data references in the loop or basic block.
3143 The general structure of the analysis of data refs in the vectorizer is as
3145 1- vect_analyze_data_refs(loop/bb): call
3146 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3147 in the loop/bb and their dependences.
3148 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3149 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3150 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3155 vect_analyze_data_refs (loop_vec_info loop_vinfo
,
3156 bb_vec_info bb_vinfo
,
3159 struct loop
*loop
= NULL
;
3160 basic_block bb
= NULL
;
3162 vec
<data_reference_p
> datarefs
;
3163 struct data_reference
*dr
;
3166 if (dump_enabled_p ())
3167 dump_printf_loc (MSG_NOTE
, vect_location
,
3168 "=== vect_analyze_data_refs ===\n");
3172 basic_block
*bbs
= LOOP_VINFO_BBS (loop_vinfo
);
3174 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
3175 datarefs
= LOOP_VINFO_DATAREFS (loop_vinfo
);
3176 if (!find_loop_nest (loop
, &LOOP_VINFO_LOOP_NEST (loop_vinfo
)))
3178 if (dump_enabled_p ())
3179 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3180 "not vectorized: loop contains function calls"
3181 " or data references that cannot be analyzed\n");
3185 for (i
= 0; i
< loop
->num_nodes
; i
++)
3187 gimple_stmt_iterator gsi
;
3189 for (gsi
= gsi_start_bb (bbs
[i
]); !gsi_end_p (gsi
); gsi_next (&gsi
))
3191 gimple stmt
= gsi_stmt (gsi
);
3192 if (!find_data_references_in_stmt (loop
, stmt
, &datarefs
))
3194 if (is_gimple_call (stmt
) && loop
->safelen
)
3196 tree fndecl
= gimple_call_fndecl (stmt
), op
;
3197 if (fndecl
!= NULL_TREE
)
3199 struct cgraph_node
*node
= cgraph_get_node (fndecl
);
3200 if (node
!= NULL
&& node
->simd_clones
!= NULL
)
3202 unsigned int j
, n
= gimple_call_num_args (stmt
);
3203 for (j
= 0; j
< n
; j
++)
3205 op
= gimple_call_arg (stmt
, j
);
3207 || (REFERENCE_CLASS_P (op
)
3208 && get_base_address (op
)))
3211 op
= gimple_call_lhs (stmt
);
3212 /* Ignore #pragma omp declare simd functions
3213 if they don't have data references in the
3214 call stmt itself. */
3218 || (REFERENCE_CLASS_P (op
)
3219 && get_base_address (op
)))))
3224 LOOP_VINFO_DATAREFS (loop_vinfo
) = datarefs
;
3225 if (dump_enabled_p ())
3226 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3227 "not vectorized: loop contains function "
3228 "calls or data references that cannot "
3235 LOOP_VINFO_DATAREFS (loop_vinfo
) = datarefs
;
3239 gimple_stmt_iterator gsi
;
3241 bb
= BB_VINFO_BB (bb_vinfo
);
3242 for (gsi
= gsi_start_bb (bb
); !gsi_end_p (gsi
); gsi_next (&gsi
))
3244 gimple stmt
= gsi_stmt (gsi
);
3245 if (!find_data_references_in_stmt (NULL
, stmt
,
3246 &BB_VINFO_DATAREFS (bb_vinfo
)))
3248 /* Mark the rest of the basic-block as unvectorizable. */
3249 for (; !gsi_end_p (gsi
); gsi_next (&gsi
))
3251 stmt
= gsi_stmt (gsi
);
3252 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt
)) = false;
3258 datarefs
= BB_VINFO_DATAREFS (bb_vinfo
);
3261 /* Go through the data-refs, check that the analysis succeeded. Update
3262 pointer from stmt_vec_info struct to DR and vectype. */
3264 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
3267 stmt_vec_info stmt_info
;
3268 tree base
, offset
, init
;
3269 bool gather
= false;
3270 bool simd_lane_access
= false;
3274 if (!dr
|| !DR_REF (dr
))
3276 if (dump_enabled_p ())
3277 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3278 "not vectorized: unhandled data-ref\n");
3282 stmt
= DR_STMT (dr
);
3283 stmt_info
= vinfo_for_stmt (stmt
);
3285 /* Discard clobbers from the dataref vector. We will remove
3286 clobber stmts during vectorization. */
3287 if (gimple_clobber_p (stmt
))
3289 if (i
== datarefs
.length () - 1)
3294 datarefs
[i
] = datarefs
.pop ();
3298 /* Check that analysis of the data-ref succeeded. */
3299 if (!DR_BASE_ADDRESS (dr
) || !DR_OFFSET (dr
) || !DR_INIT (dr
)
3304 && !TREE_THIS_VOLATILE (DR_REF (dr
))
3305 && targetm
.vectorize
.builtin_gather
!= NULL
;
3306 bool maybe_simd_lane_access
3307 = loop_vinfo
&& loop
->simduid
;
3309 /* If target supports vector gather loads, or if this might be
3310 a SIMD lane access, see if they can't be used. */
3312 && (maybe_gather
|| maybe_simd_lane_access
)
3313 && !nested_in_vect_loop_p (loop
, stmt
))
3315 struct data_reference
*newdr
3316 = create_data_ref (NULL
, loop_containing_stmt (stmt
),
3317 DR_REF (dr
), stmt
, true);
3318 gcc_assert (newdr
!= NULL
&& DR_REF (newdr
));
3319 if (DR_BASE_ADDRESS (newdr
)
3320 && DR_OFFSET (newdr
)
3323 && integer_zerop (DR_STEP (newdr
)))
3325 if (maybe_simd_lane_access
)
3327 tree off
= DR_OFFSET (newdr
);
3329 if (TREE_CODE (DR_INIT (newdr
)) == INTEGER_CST
3330 && TREE_CODE (off
) == MULT_EXPR
3331 && tree_fits_uhwi_p (TREE_OPERAND (off
, 1)))
3333 tree step
= TREE_OPERAND (off
, 1);
3334 off
= TREE_OPERAND (off
, 0);
3336 if (CONVERT_EXPR_P (off
)
3337 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off
,
3339 < TYPE_PRECISION (TREE_TYPE (off
)))
3340 off
= TREE_OPERAND (off
, 0);
3341 if (TREE_CODE (off
) == SSA_NAME
)
3343 gimple def
= SSA_NAME_DEF_STMT (off
);
3344 tree reft
= TREE_TYPE (DR_REF (newdr
));
3345 if (gimple_call_internal_p (def
)
3346 && gimple_call_internal_fn (def
)
3347 == IFN_GOMP_SIMD_LANE
)
3349 tree arg
= gimple_call_arg (def
, 0);
3350 gcc_assert (TREE_CODE (arg
) == SSA_NAME
);
3351 arg
= SSA_NAME_VAR (arg
);
3352 if (arg
== loop
->simduid
3354 && tree_int_cst_equal
3355 (TYPE_SIZE_UNIT (reft
),
3358 DR_OFFSET (newdr
) = ssize_int (0);
3359 DR_STEP (newdr
) = step
;
3360 DR_ALIGNED_TO (newdr
)
3361 = size_int (BIGGEST_ALIGNMENT
);
3363 simd_lane_access
= true;
3369 if (!simd_lane_access
&& maybe_gather
)
3375 if (!gather
&& !simd_lane_access
)
3376 free_data_ref (newdr
);
3379 if (!gather
&& !simd_lane_access
)
3381 if (dump_enabled_p ())
3383 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3384 "not vectorized: data ref analysis "
3386 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3387 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
3397 if (TREE_CODE (DR_BASE_ADDRESS (dr
)) == INTEGER_CST
)
3399 if (dump_enabled_p ())
3400 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3401 "not vectorized: base addr of dr is a "
3407 if (gather
|| simd_lane_access
)
3412 if (TREE_THIS_VOLATILE (DR_REF (dr
)))
3414 if (dump_enabled_p ())
3416 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3417 "not vectorized: volatile type ");
3418 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3419 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
3428 if (stmt_can_throw_internal (stmt
))
3430 if (dump_enabled_p ())
3432 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3433 "not vectorized: statement can throw an "
3435 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3436 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
3442 if (gather
|| simd_lane_access
)
3447 if (TREE_CODE (DR_REF (dr
)) == COMPONENT_REF
3448 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr
), 1)))
3450 if (dump_enabled_p ())
3452 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3453 "not vectorized: statement is bitfield "
3455 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3456 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
3462 if (gather
|| simd_lane_access
)
3467 base
= unshare_expr (DR_BASE_ADDRESS (dr
));
3468 offset
= unshare_expr (DR_OFFSET (dr
));
3469 init
= unshare_expr (DR_INIT (dr
));
3471 if (is_gimple_call (stmt
))
3473 if (dump_enabled_p ())
3475 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3476 "not vectorized: dr in a call ");
3477 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3478 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
3484 if (gather
|| simd_lane_access
)
3489 /* Update DR field in stmt_vec_info struct. */
3491 /* If the dataref is in an inner-loop of the loop that is considered for
3492 for vectorization, we also want to analyze the access relative to
3493 the outer-loop (DR contains information only relative to the
3494 inner-most enclosing loop). We do that by building a reference to the
3495 first location accessed by the inner-loop, and analyze it relative to
3497 if (loop
&& nested_in_vect_loop_p (loop
, stmt
))
3499 tree outer_step
, outer_base
, outer_init
;
3500 HOST_WIDE_INT pbitsize
, pbitpos
;
3502 enum machine_mode pmode
;
3503 int punsignedp
, pvolatilep
;
3504 affine_iv base_iv
, offset_iv
;
3507 /* Build a reference to the first location accessed by the
3508 inner-loop: *(BASE+INIT). (The first location is actually
3509 BASE+INIT+OFFSET, but we add OFFSET separately later). */
3510 tree inner_base
= build_fold_indirect_ref
3511 (fold_build_pointer_plus (base
, init
));
3513 if (dump_enabled_p ())
3515 dump_printf_loc (MSG_NOTE
, vect_location
,
3516 "analyze in outer-loop: ");
3517 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, inner_base
);
3518 dump_printf (MSG_NOTE
, "\n");
3521 outer_base
= get_inner_reference (inner_base
, &pbitsize
, &pbitpos
,
3522 &poffset
, &pmode
, &punsignedp
, &pvolatilep
, false);
3523 gcc_assert (outer_base
!= NULL_TREE
);
3525 if (pbitpos
% BITS_PER_UNIT
!= 0)
3527 if (dump_enabled_p ())
3528 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3529 "failed: bit offset alignment.\n");
3533 outer_base
= build_fold_addr_expr (outer_base
);
3534 if (!simple_iv (loop
, loop_containing_stmt (stmt
), outer_base
,
3537 if (dump_enabled_p ())
3538 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3539 "failed: evolution of base is not affine.\n");
3546 poffset
= fold_build2 (PLUS_EXPR
, TREE_TYPE (offset
), offset
,
3554 offset_iv
.base
= ssize_int (0);
3555 offset_iv
.step
= ssize_int (0);
3557 else if (!simple_iv (loop
, loop_containing_stmt (stmt
), poffset
,
3560 if (dump_enabled_p ())
3561 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3562 "evolution of offset is not affine.\n");
3566 outer_init
= ssize_int (pbitpos
/ BITS_PER_UNIT
);
3567 split_constant_offset (base_iv
.base
, &base_iv
.base
, &dinit
);
3568 outer_init
= size_binop (PLUS_EXPR
, outer_init
, dinit
);
3569 split_constant_offset (offset_iv
.base
, &offset_iv
.base
, &dinit
);
3570 outer_init
= size_binop (PLUS_EXPR
, outer_init
, dinit
);
3572 outer_step
= size_binop (PLUS_EXPR
,
3573 fold_convert (ssizetype
, base_iv
.step
),
3574 fold_convert (ssizetype
, offset_iv
.step
));
3576 STMT_VINFO_DR_STEP (stmt_info
) = outer_step
;
3577 /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
3578 STMT_VINFO_DR_BASE_ADDRESS (stmt_info
) = base_iv
.base
;
3579 STMT_VINFO_DR_INIT (stmt_info
) = outer_init
;
3580 STMT_VINFO_DR_OFFSET (stmt_info
) =
3581 fold_convert (ssizetype
, offset_iv
.base
);
3582 STMT_VINFO_DR_ALIGNED_TO (stmt_info
) =
3583 size_int (highest_pow2_factor (offset_iv
.base
));
3585 if (dump_enabled_p ())
3587 dump_printf_loc (MSG_NOTE
, vect_location
,
3588 "\touter base_address: ");
3589 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3590 STMT_VINFO_DR_BASE_ADDRESS (stmt_info
));
3591 dump_printf (MSG_NOTE
, "\n\touter offset from base address: ");
3592 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3593 STMT_VINFO_DR_OFFSET (stmt_info
));
3594 dump_printf (MSG_NOTE
,
3595 "\n\touter constant offset from base address: ");
3596 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3597 STMT_VINFO_DR_INIT (stmt_info
));
3598 dump_printf (MSG_NOTE
, "\n\touter step: ");
3599 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3600 STMT_VINFO_DR_STEP (stmt_info
));
3601 dump_printf (MSG_NOTE
, "\n\touter aligned to: ");
3602 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3603 STMT_VINFO_DR_ALIGNED_TO (stmt_info
));
3604 dump_printf (MSG_NOTE
, "\n");
3608 if (STMT_VINFO_DATA_REF (stmt_info
))
3610 if (dump_enabled_p ())
3612 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3613 "not vectorized: more than one data ref "
3615 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3616 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
3622 if (gather
|| simd_lane_access
)
3627 STMT_VINFO_DATA_REF (stmt_info
) = dr
;
3628 if (simd_lane_access
)
3630 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info
) = true;
3634 /* Set vectype for STMT. */
3635 scalar_type
= TREE_TYPE (DR_REF (dr
));
3636 STMT_VINFO_VECTYPE (stmt_info
) =
3637 get_vectype_for_scalar_type (scalar_type
);
3638 if (!STMT_VINFO_VECTYPE (stmt_info
))
3640 if (dump_enabled_p ())
3642 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3643 "not vectorized: no vectype for stmt: ");
3644 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3645 dump_printf (MSG_MISSED_OPTIMIZATION
, " scalar_type: ");
3646 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_DETAILS
,
3648 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
3654 if (gather
|| simd_lane_access
)
3656 STMT_VINFO_DATA_REF (stmt_info
) = NULL
;
3663 if (dump_enabled_p ())
3665 dump_printf_loc (MSG_NOTE
, vect_location
,
3666 "got vectype for stmt: ");
3667 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, stmt
, 0);
3668 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3669 STMT_VINFO_VECTYPE (stmt_info
));
3670 dump_printf (MSG_NOTE
, "\n");
3674 /* Adjust the minimal vectorization factor according to the
3676 vf
= TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info
));
3684 gather
= 0 != vect_check_gather (stmt
, loop_vinfo
, NULL
, &off
, NULL
);
3686 && get_vectype_for_scalar_type (TREE_TYPE (off
)) == NULL_TREE
)
3690 STMT_VINFO_DATA_REF (stmt_info
) = NULL
;
3692 if (dump_enabled_p ())
3694 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3695 "not vectorized: not suitable for gather "
3697 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3698 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
3704 STMT_VINFO_GATHER_P (stmt_info
) = true;
3707 && TREE_CODE (DR_STEP (dr
)) != INTEGER_CST
)
3709 if (nested_in_vect_loop_p (loop
, stmt
)
3710 || !DR_IS_READ (dr
))
3712 if (dump_enabled_p ())
3714 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3715 "not vectorized: not suitable for strided "
3717 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3718 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
3722 STMT_VINFO_STRIDE_LOAD_P (stmt_info
) = true;
3726 /* If we stopped analysis at the first dataref we could not analyze
3727 when trying to vectorize a basic-block mark the rest of the datarefs
3728 as not vectorizable and truncate the vector of datarefs. That
3729 avoids spending useless time in analyzing their dependence. */
3730 if (i
!= datarefs
.length ())
3732 gcc_assert (bb_vinfo
!= NULL
);
3733 for (unsigned j
= i
; j
< datarefs
.length (); ++j
)
3735 data_reference_p dr
= datarefs
[j
];
3736 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr
))) = false;
3739 datarefs
.truncate (i
);
3746 /* Function vect_get_new_vect_var.
3748 Returns a name for a new variable. The current naming scheme appends the
3749 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
3750 the name of vectorizer generated variables, and appends that to NAME if
3754 vect_get_new_vect_var (tree type
, enum vect_var_kind var_kind
, const char *name
)
3761 case vect_simple_var
:
3764 case vect_scalar_var
:
3767 case vect_pointer_var
:
3776 char* tmp
= concat (prefix
, "_", name
, NULL
);
3777 new_vect_var
= create_tmp_reg (type
, tmp
);
3781 new_vect_var
= create_tmp_reg (type
, prefix
);
3783 return new_vect_var
;
3787 /* Function vect_create_addr_base_for_vector_ref.
3789 Create an expression that computes the address of the first memory location
3790 that will be accessed for a data reference.
3793 STMT: The statement containing the data reference.
3794 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
3795 OFFSET: Optional. If supplied, it is be added to the initial address.
3796 LOOP: Specify relative to which loop-nest should the address be computed.
3797 For example, when the dataref is in an inner-loop nested in an
3798 outer-loop that is now being vectorized, LOOP can be either the
3799 outer-loop, or the inner-loop. The first memory location accessed
3800 by the following dataref ('in' points to short):
3807 if LOOP=i_loop: &in (relative to i_loop)
3808 if LOOP=j_loop: &in+i*2B (relative to j_loop)
3811 1. Return an SSA_NAME whose value is the address of the memory location of
3812 the first vector of the data reference.
3813 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
3814 these statement(s) which define the returned SSA_NAME.
3816 FORNOW: We are only handling array accesses with step 1. */
3819 vect_create_addr_base_for_vector_ref (gimple stmt
,
3820 gimple_seq
*new_stmt_list
,
3824 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
3825 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
3827 const char *base_name
;
3830 gimple_seq seq
= NULL
;
3834 tree step
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr
)));
3835 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
3837 if (loop_vinfo
&& loop
&& loop
!= (gimple_bb (stmt
))->loop_father
)
3839 struct loop
*outer_loop
= LOOP_VINFO_LOOP (loop_vinfo
);
3841 gcc_assert (nested_in_vect_loop_p (outer_loop
, stmt
));
3843 data_ref_base
= unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info
));
3844 base_offset
= unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info
));
3845 init
= unshare_expr (STMT_VINFO_DR_INIT (stmt_info
));
3849 data_ref_base
= unshare_expr (DR_BASE_ADDRESS (dr
));
3850 base_offset
= unshare_expr (DR_OFFSET (dr
));
3851 init
= unshare_expr (DR_INIT (dr
));
3855 base_name
= get_name (data_ref_base
);
3858 base_offset
= ssize_int (0);
3859 init
= ssize_int (0);
3860 base_name
= get_name (DR_REF (dr
));
3863 /* Create base_offset */
3864 base_offset
= size_binop (PLUS_EXPR
,
3865 fold_convert (sizetype
, base_offset
),
3866 fold_convert (sizetype
, init
));
3870 offset
= fold_build2 (MULT_EXPR
, sizetype
,
3871 fold_convert (sizetype
, offset
), step
);
3872 base_offset
= fold_build2 (PLUS_EXPR
, sizetype
,
3873 base_offset
, offset
);
3876 /* base + base_offset */
3878 addr_base
= fold_build_pointer_plus (data_ref_base
, base_offset
);
3881 addr_base
= build1 (ADDR_EXPR
,
3882 build_pointer_type (TREE_TYPE (DR_REF (dr
))),
3883 unshare_expr (DR_REF (dr
)));
3886 vect_ptr_type
= build_pointer_type (STMT_VINFO_VECTYPE (stmt_info
));
3887 addr_base
= fold_convert (vect_ptr_type
, addr_base
);
3888 dest
= vect_get_new_vect_var (vect_ptr_type
, vect_pointer_var
, base_name
);
3889 addr_base
= force_gimple_operand (addr_base
, &seq
, false, dest
);
3890 gimple_seq_add_seq (new_stmt_list
, seq
);
3892 if (DR_PTR_INFO (dr
)
3893 && TREE_CODE (addr_base
) == SSA_NAME
)
3895 duplicate_ssa_name_ptr_info (addr_base
, DR_PTR_INFO (dr
));
3897 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base
));
3900 if (dump_enabled_p ())
3902 dump_printf_loc (MSG_NOTE
, vect_location
, "created ");
3903 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, addr_base
);
3904 dump_printf (MSG_NOTE
, "\n");
3911 /* Function vect_create_data_ref_ptr.
3913 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
3914 location accessed in the loop by STMT, along with the def-use update
3915 chain to appropriately advance the pointer through the loop iterations.
3916 Also set aliasing information for the pointer. This pointer is used by
3917 the callers to this function to create a memory reference expression for
3918 vector load/store access.
3921 1. STMT: a stmt that references memory. Expected to be of the form
3922 GIMPLE_ASSIGN <name, data-ref> or
3923 GIMPLE_ASSIGN <data-ref, name>.
3924 2. AGGR_TYPE: the type of the reference, which should be either a vector
3926 3. AT_LOOP: the loop where the vector memref is to be created.
3927 4. OFFSET (optional): an offset to be added to the initial address accessed
3928 by the data-ref in STMT.
3929 5. BSI: location where the new stmts are to be placed if there is no loop
3930 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
3931 pointing to the initial address.
3934 1. Declare a new ptr to vector_type, and have it point to the base of the
3935 data reference (initial addressed accessed by the data reference).
3936 For example, for vector of type V8HI, the following code is generated:
3939 ap = (v8hi *)initial_address;
3941 if OFFSET is not supplied:
3942 initial_address = &a[init];
3943 if OFFSET is supplied:
3944 initial_address = &a[init + OFFSET];
3946 Return the initial_address in INITIAL_ADDRESS.
3948 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
3949 update the pointer in each iteration of the loop.
3951 Return the increment stmt that updates the pointer in PTR_INCR.
3953 3. Set INV_P to true if the access pattern of the data reference in the
3954 vectorized loop is invariant. Set it to false otherwise.
3956 4. Return the pointer. */
3959 vect_create_data_ref_ptr (gimple stmt
, tree aggr_type
, struct loop
*at_loop
,
3960 tree offset
, tree
*initial_address
,
3961 gimple_stmt_iterator
*gsi
, gimple
*ptr_incr
,
3962 bool only_init
, bool *inv_p
)
3964 const char *base_name
;
3965 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
3966 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
3967 struct loop
*loop
= NULL
;
3968 bool nested_in_vect_loop
= false;
3969 struct loop
*containing_loop
= NULL
;
3974 gimple_seq new_stmt_list
= NULL
;
3978 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
3980 gimple_stmt_iterator incr_gsi
;
3982 tree indx_before_incr
, indx_after_incr
;
3985 bb_vec_info bb_vinfo
= STMT_VINFO_BB_VINFO (stmt_info
);
3987 gcc_assert (TREE_CODE (aggr_type
) == ARRAY_TYPE
3988 || TREE_CODE (aggr_type
) == VECTOR_TYPE
);
3992 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
3993 nested_in_vect_loop
= nested_in_vect_loop_p (loop
, stmt
);
3994 containing_loop
= (gimple_bb (stmt
))->loop_father
;
3995 pe
= loop_preheader_edge (loop
);
3999 gcc_assert (bb_vinfo
);
4004 /* Check the step (evolution) of the load in LOOP, and record
4005 whether it's invariant. */
4006 if (nested_in_vect_loop
)
4007 step
= STMT_VINFO_DR_STEP (stmt_info
);
4009 step
= DR_STEP (STMT_VINFO_DATA_REF (stmt_info
));
4011 if (integer_zerop (step
))
4016 /* Create an expression for the first address accessed by this load
4018 base_name
= get_name (DR_BASE_ADDRESS (dr
));
4020 if (dump_enabled_p ())
4022 tree dr_base_type
= TREE_TYPE (DR_BASE_OBJECT (dr
));
4023 dump_printf_loc (MSG_NOTE
, vect_location
,
4024 "create %s-pointer variable to type: ",
4025 get_tree_code_name (TREE_CODE (aggr_type
)));
4026 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, aggr_type
);
4027 if (TREE_CODE (dr_base_type
) == ARRAY_TYPE
)
4028 dump_printf (MSG_NOTE
, " vectorizing an array ref: ");
4029 else if (TREE_CODE (dr_base_type
) == VECTOR_TYPE
)
4030 dump_printf (MSG_NOTE
, " vectorizing a vector ref: ");
4031 else if (TREE_CODE (dr_base_type
) == RECORD_TYPE
)
4032 dump_printf (MSG_NOTE
, " vectorizing a record based array ref: ");
4034 dump_printf (MSG_NOTE
, " vectorizing a pointer ref: ");
4035 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_BASE_OBJECT (dr
));
4036 dump_printf (MSG_NOTE
, "\n");
4039 /* (1) Create the new aggregate-pointer variable.
4040 Vector and array types inherit the alias set of their component
4041 type by default so we need to use a ref-all pointer if the data
4042 reference does not conflict with the created aggregated data
4043 reference because it is not addressable. */
4044 bool need_ref_all
= false;
4045 if (!alias_sets_conflict_p (get_alias_set (aggr_type
),
4046 get_alias_set (DR_REF (dr
))))
4047 need_ref_all
= true;
4048 /* Likewise for any of the data references in the stmt group. */
4049 else if (STMT_VINFO_GROUP_SIZE (stmt_info
) > 1)
4051 gimple orig_stmt
= STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info
);
4054 stmt_vec_info sinfo
= vinfo_for_stmt (orig_stmt
);
4055 struct data_reference
*sdr
= STMT_VINFO_DATA_REF (sinfo
);
4056 if (!alias_sets_conflict_p (get_alias_set (aggr_type
),
4057 get_alias_set (DR_REF (sdr
))))
4059 need_ref_all
= true;
4062 orig_stmt
= STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo
);
4066 aggr_ptr_type
= build_pointer_type_for_mode (aggr_type
, ptr_mode
,
4068 aggr_ptr
= vect_get_new_vect_var (aggr_ptr_type
, vect_pointer_var
, base_name
);
4071 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4072 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4073 def-use update cycles for the pointer: one relative to the outer-loop
4074 (LOOP), which is what steps (3) and (4) below do. The other is relative
4075 to the inner-loop (which is the inner-most loop containing the dataref),
4076 and this is done be step (5) below.
4078 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4079 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4080 redundant. Steps (3),(4) create the following:
4083 LOOP: vp1 = phi(vp0,vp2)
4089 If there is an inner-loop nested in loop, then step (5) will also be
4090 applied, and an additional update in the inner-loop will be created:
4093 LOOP: vp1 = phi(vp0,vp2)
4095 inner: vp3 = phi(vp1,vp4)
4096 vp4 = vp3 + inner_step
4102 /* (2) Calculate the initial address of the aggregate-pointer, and set
4103 the aggregate-pointer to point to it before the loop. */
4105 /* Create: (&(base[init_val+offset]) in the loop preheader. */
4107 new_temp
= vect_create_addr_base_for_vector_ref (stmt
, &new_stmt_list
,
4113 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, new_stmt_list
);
4114 gcc_assert (!new_bb
);
4117 gsi_insert_seq_before (gsi
, new_stmt_list
, GSI_SAME_STMT
);
4120 *initial_address
= new_temp
;
4122 /* Create: p = (aggr_type *) initial_base */
4123 if (TREE_CODE (new_temp
) != SSA_NAME
4124 || !useless_type_conversion_p (aggr_ptr_type
, TREE_TYPE (new_temp
)))
4126 vec_stmt
= gimple_build_assign (aggr_ptr
,
4127 fold_convert (aggr_ptr_type
, new_temp
));
4128 aggr_ptr_init
= make_ssa_name (aggr_ptr
, vec_stmt
);
4129 /* Copy the points-to information if it exists. */
4130 if (DR_PTR_INFO (dr
))
4131 duplicate_ssa_name_ptr_info (aggr_ptr_init
, DR_PTR_INFO (dr
));
4132 gimple_assign_set_lhs (vec_stmt
, aggr_ptr_init
);
4135 new_bb
= gsi_insert_on_edge_immediate (pe
, vec_stmt
);
4136 gcc_assert (!new_bb
);
4139 gsi_insert_before (gsi
, vec_stmt
, GSI_SAME_STMT
);
4142 aggr_ptr_init
= new_temp
;
4144 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4145 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4146 inner-loop nested in LOOP (during outer-loop vectorization). */
4148 /* No update in loop is required. */
4149 if (only_init
&& (!loop_vinfo
|| at_loop
== loop
))
4150 aptr
= aggr_ptr_init
;
4153 /* The step of the aggregate pointer is the type size. */
4154 tree iv_step
= TYPE_SIZE_UNIT (aggr_type
);
4155 /* One exception to the above is when the scalar step of the load in
4156 LOOP is zero. In this case the step here is also zero. */
4158 iv_step
= size_zero_node
;
4159 else if (tree_int_cst_sgn (step
) == -1)
4160 iv_step
= fold_build1 (NEGATE_EXPR
, TREE_TYPE (iv_step
), iv_step
);
4162 standard_iv_increment_position (loop
, &incr_gsi
, &insert_after
);
4164 create_iv (aggr_ptr_init
,
4165 fold_convert (aggr_ptr_type
, iv_step
),
4166 aggr_ptr
, loop
, &incr_gsi
, insert_after
,
4167 &indx_before_incr
, &indx_after_incr
);
4168 incr
= gsi_stmt (incr_gsi
);
4169 set_vinfo_for_stmt (incr
, new_stmt_vec_info (incr
, loop_vinfo
, NULL
));
4171 /* Copy the points-to information if it exists. */
4172 if (DR_PTR_INFO (dr
))
4174 duplicate_ssa_name_ptr_info (indx_before_incr
, DR_PTR_INFO (dr
));
4175 duplicate_ssa_name_ptr_info (indx_after_incr
, DR_PTR_INFO (dr
));
4180 aptr
= indx_before_incr
;
4183 if (!nested_in_vect_loop
|| only_init
)
4187 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4188 nested in LOOP, if exists. */
4190 gcc_assert (nested_in_vect_loop
);
4193 standard_iv_increment_position (containing_loop
, &incr_gsi
,
4195 create_iv (aptr
, fold_convert (aggr_ptr_type
, DR_STEP (dr
)), aggr_ptr
,
4196 containing_loop
, &incr_gsi
, insert_after
, &indx_before_incr
,
4198 incr
= gsi_stmt (incr_gsi
);
4199 set_vinfo_for_stmt (incr
, new_stmt_vec_info (incr
, loop_vinfo
, NULL
));
4201 /* Copy the points-to information if it exists. */
4202 if (DR_PTR_INFO (dr
))
4204 duplicate_ssa_name_ptr_info (indx_before_incr
, DR_PTR_INFO (dr
));
4205 duplicate_ssa_name_ptr_info (indx_after_incr
, DR_PTR_INFO (dr
));
4210 return indx_before_incr
;
4217 /* Function bump_vector_ptr
4219 Increment a pointer (to a vector type) by vector-size. If requested,
4220 i.e. if PTR-INCR is given, then also connect the new increment stmt
4221 to the existing def-use update-chain of the pointer, by modifying
4222 the PTR_INCR as illustrated below:
4224 The pointer def-use update-chain before this function:
4225 DATAREF_PTR = phi (p_0, p_2)
4227 PTR_INCR: p_2 = DATAREF_PTR + step
4229 The pointer def-use update-chain after this function:
4230 DATAREF_PTR = phi (p_0, p_2)
4232 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4234 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4237 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4239 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4240 the loop. The increment amount across iterations is expected
4242 BSI - location where the new update stmt is to be placed.
4243 STMT - the original scalar memory-access stmt that is being vectorized.
4244 BUMP - optional. The offset by which to bump the pointer. If not given,
4245 the offset is assumed to be vector_size.
4247 Output: Return NEW_DATAREF_PTR as illustrated above.
4252 bump_vector_ptr (tree dataref_ptr
, gimple ptr_incr
, gimple_stmt_iterator
*gsi
,
4253 gimple stmt
, tree bump
)
4255 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
4256 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
4257 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
4258 tree update
= TYPE_SIZE_UNIT (vectype
);
4261 use_operand_p use_p
;
4262 tree new_dataref_ptr
;
4267 new_dataref_ptr
= copy_ssa_name (dataref_ptr
, NULL
);
4268 incr_stmt
= gimple_build_assign_with_ops (POINTER_PLUS_EXPR
, new_dataref_ptr
,
4269 dataref_ptr
, update
);
4270 vect_finish_stmt_generation (stmt
, incr_stmt
, gsi
);
4272 /* Copy the points-to information if it exists. */
4273 if (DR_PTR_INFO (dr
))
4275 duplicate_ssa_name_ptr_info (new_dataref_ptr
, DR_PTR_INFO (dr
));
4276 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr
));
4280 return new_dataref_ptr
;
4282 /* Update the vector-pointer's cross-iteration increment. */
4283 FOR_EACH_SSA_USE_OPERAND (use_p
, ptr_incr
, iter
, SSA_OP_USE
)
4285 tree use
= USE_FROM_PTR (use_p
);
4287 if (use
== dataref_ptr
)
4288 SET_USE (use_p
, new_dataref_ptr
);
4290 gcc_assert (tree_int_cst_compare (use
, update
) == 0);
4293 return new_dataref_ptr
;
4297 /* Function vect_create_destination_var.
4299 Create a new temporary of type VECTYPE. */
4302 vect_create_destination_var (tree scalar_dest
, tree vectype
)
4308 enum vect_var_kind kind
;
4310 kind
= vectype
? vect_simple_var
: vect_scalar_var
;
4311 type
= vectype
? vectype
: TREE_TYPE (scalar_dest
);
4313 gcc_assert (TREE_CODE (scalar_dest
) == SSA_NAME
);
4315 name
= get_name (scalar_dest
);
4317 asprintf (&new_name
, "%s_%u", name
, SSA_NAME_VERSION (scalar_dest
));
4319 asprintf (&new_name
, "_%u", SSA_NAME_VERSION (scalar_dest
));
4320 vec_dest
= vect_get_new_vect_var (type
, kind
, new_name
);
4326 /* Function vect_grouped_store_supported.
4328 Returns TRUE if interleave high and interleave low permutations
4329 are supported, and FALSE otherwise. */
4332 vect_grouped_store_supported (tree vectype
, unsigned HOST_WIDE_INT count
)
4334 enum machine_mode mode
= TYPE_MODE (vectype
);
4336 /* vect_permute_store_chain requires the group size to be a power of two. */
4337 if (exact_log2 (count
) == -1)
4339 if (dump_enabled_p ())
4340 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4341 "the size of the group of accesses"
4342 " is not a power of 2\n");
4346 /* Check that the permutation is supported. */
4347 if (VECTOR_MODE_P (mode
))
4349 unsigned int i
, nelt
= GET_MODE_NUNITS (mode
);
4350 unsigned char *sel
= XALLOCAVEC (unsigned char, nelt
);
4351 for (i
= 0; i
< nelt
/ 2; i
++)
4354 sel
[i
* 2 + 1] = i
+ nelt
;
4356 if (can_vec_perm_p (mode
, false, sel
))
4358 for (i
= 0; i
< nelt
; i
++)
4360 if (can_vec_perm_p (mode
, false, sel
))
4365 if (dump_enabled_p ())
4366 dump_printf (MSG_MISSED_OPTIMIZATION
,
4367 "interleave op not supported by target.\n");
4372 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4376 vect_store_lanes_supported (tree vectype
, unsigned HOST_WIDE_INT count
)
4378 return vect_lanes_optab_supported_p ("vec_store_lanes",
4379 vec_store_lanes_optab
,
4384 /* Function vect_permute_store_chain.
4386 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4387 a power of 2, generate interleave_high/low stmts to reorder the data
4388 correctly for the stores. Return the final references for stores in
4391 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4392 The input is 4 vectors each containing 8 elements. We assign a number to
4393 each element, the input sequence is:
4395 1st vec: 0 1 2 3 4 5 6 7
4396 2nd vec: 8 9 10 11 12 13 14 15
4397 3rd vec: 16 17 18 19 20 21 22 23
4398 4th vec: 24 25 26 27 28 29 30 31
4400 The output sequence should be:
4402 1st vec: 0 8 16 24 1 9 17 25
4403 2nd vec: 2 10 18 26 3 11 19 27
4404 3rd vec: 4 12 20 28 5 13 21 30
4405 4th vec: 6 14 22 30 7 15 23 31
4407 i.e., we interleave the contents of the four vectors in their order.
4409 We use interleave_high/low instructions to create such output. The input of
4410 each interleave_high/low operation is two vectors:
4413 the even elements of the result vector are obtained left-to-right from the
4414 high/low elements of the first vector. The odd elements of the result are
4415 obtained left-to-right from the high/low elements of the second vector.
4416 The output of interleave_high will be: 0 4 1 5
4417 and of interleave_low: 2 6 3 7
4420 The permutation is done in log LENGTH stages. In each stage interleave_high
4421 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4422 where the first argument is taken from the first half of DR_CHAIN and the
4423 second argument from it's second half.
4426 I1: interleave_high (1st vec, 3rd vec)
4427 I2: interleave_low (1st vec, 3rd vec)
4428 I3: interleave_high (2nd vec, 4th vec)
4429 I4: interleave_low (2nd vec, 4th vec)
4431 The output for the first stage is:
4433 I1: 0 16 1 17 2 18 3 19
4434 I2: 4 20 5 21 6 22 7 23
4435 I3: 8 24 9 25 10 26 11 27
4436 I4: 12 28 13 29 14 30 15 31
4438 The output of the second stage, i.e. the final result is:
4440 I1: 0 8 16 24 1 9 17 25
4441 I2: 2 10 18 26 3 11 19 27
4442 I3: 4 12 20 28 5 13 21 30
4443 I4: 6 14 22 30 7 15 23 31. */
4446 vect_permute_store_chain (vec
<tree
> dr_chain
,
4447 unsigned int length
,
4449 gimple_stmt_iterator
*gsi
,
4450 vec
<tree
> *result_chain
)
4452 tree vect1
, vect2
, high
, low
;
4454 tree vectype
= STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt
));
4455 tree perm_mask_low
, perm_mask_high
;
4457 unsigned int j
, nelt
= TYPE_VECTOR_SUBPARTS (vectype
);
4458 unsigned char *sel
= XALLOCAVEC (unsigned char, nelt
);
4460 result_chain
->quick_grow (length
);
4461 memcpy (result_chain
->address (), dr_chain
.address (),
4462 length
* sizeof (tree
));
4464 for (i
= 0, n
= nelt
/ 2; i
< n
; i
++)
4467 sel
[i
* 2 + 1] = i
+ nelt
;
4469 perm_mask_high
= vect_gen_perm_mask (vectype
, sel
);
4470 gcc_assert (perm_mask_high
!= NULL
);
4472 for (i
= 0; i
< nelt
; i
++)
4474 perm_mask_low
= vect_gen_perm_mask (vectype
, sel
);
4475 gcc_assert (perm_mask_low
!= NULL
);
4477 for (i
= 0, n
= exact_log2 (length
); i
< n
; i
++)
4479 for (j
= 0; j
< length
/2; j
++)
4481 vect1
= dr_chain
[j
];
4482 vect2
= dr_chain
[j
+length
/2];
4484 /* Create interleaving stmt:
4485 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1, ...}> */
4486 high
= make_temp_ssa_name (vectype
, NULL
, "vect_inter_high");
4488 = gimple_build_assign_with_ops (VEC_PERM_EXPR
, high
,
4489 vect1
, vect2
, perm_mask_high
);
4490 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
4491 (*result_chain
)[2*j
] = high
;
4493 /* Create interleaving stmt:
4494 low = VEC_PERM_EXPR <vect1, vect2, {nelt/2, nelt*3/2, nelt/2+1,
4495 nelt*3/2+1, ...}> */
4496 low
= make_temp_ssa_name (vectype
, NULL
, "vect_inter_low");
4498 = gimple_build_assign_with_ops (VEC_PERM_EXPR
, low
,
4499 vect1
, vect2
, perm_mask_low
);
4500 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
4501 (*result_chain
)[2*j
+1] = low
;
4503 memcpy (dr_chain
.address (), result_chain
->address (),
4504 length
* sizeof (tree
));
4508 /* Function vect_setup_realignment
4510 This function is called when vectorizing an unaligned load using
4511 the dr_explicit_realign[_optimized] scheme.
4512 This function generates the following code at the loop prolog:
4515 x msq_init = *(floor(p)); # prolog load
4516 realignment_token = call target_builtin;
4518 x msq = phi (msq_init, ---)
4520 The stmts marked with x are generated only for the case of
4521 dr_explicit_realign_optimized.
4523 The code above sets up a new (vector) pointer, pointing to the first
4524 location accessed by STMT, and a "floor-aligned" load using that pointer.
4525 It also generates code to compute the "realignment-token" (if the relevant
4526 target hook was defined), and creates a phi-node at the loop-header bb
4527 whose arguments are the result of the prolog-load (created by this
4528 function) and the result of a load that takes place in the loop (to be
4529 created by the caller to this function).
4531 For the case of dr_explicit_realign_optimized:
4532 The caller to this function uses the phi-result (msq) to create the
4533 realignment code inside the loop, and sets up the missing phi argument,
4536 msq = phi (msq_init, lsq)
4537 lsq = *(floor(p')); # load in loop
4538 result = realign_load (msq, lsq, realignment_token);
4540 For the case of dr_explicit_realign:
4542 msq = *(floor(p)); # load in loop
4544 lsq = *(floor(p')); # load in loop
4545 result = realign_load (msq, lsq, realignment_token);
4548 STMT - (scalar) load stmt to be vectorized. This load accesses
4549 a memory location that may be unaligned.
4550 BSI - place where new code is to be inserted.
4551 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4555 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4556 target hook, if defined.
4557 Return value - the result of the loop-header phi node. */
4560 vect_setup_realignment (gimple stmt
, gimple_stmt_iterator
*gsi
,
4561 tree
*realignment_token
,
4562 enum dr_alignment_support alignment_support_scheme
,
4564 struct loop
**at_loop
)
4566 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
4567 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
4568 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
4569 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
4570 struct loop
*loop
= NULL
;
4572 tree scalar_dest
= gimple_assign_lhs (stmt
);
4579 tree msq_init
= NULL_TREE
;
4582 tree msq
= NULL_TREE
;
4583 gimple_seq stmts
= NULL
;
4585 bool compute_in_loop
= false;
4586 bool nested_in_vect_loop
= false;
4587 struct loop
*containing_loop
= (gimple_bb (stmt
))->loop_father
;
4588 struct loop
*loop_for_initial_load
= NULL
;
4592 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
4593 nested_in_vect_loop
= nested_in_vect_loop_p (loop
, stmt
);
4596 gcc_assert (alignment_support_scheme
== dr_explicit_realign
4597 || alignment_support_scheme
== dr_explicit_realign_optimized
);
4599 /* We need to generate three things:
4600 1. the misalignment computation
4601 2. the extra vector load (for the optimized realignment scheme).
4602 3. the phi node for the two vectors from which the realignment is
4603 done (for the optimized realignment scheme). */
4605 /* 1. Determine where to generate the misalignment computation.
4607 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4608 calculation will be generated by this function, outside the loop (in the
4609 preheader). Otherwise, INIT_ADDR had already been computed for us by the
4610 caller, inside the loop.
4612 Background: If the misalignment remains fixed throughout the iterations of
4613 the loop, then both realignment schemes are applicable, and also the
4614 misalignment computation can be done outside LOOP. This is because we are
4615 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4616 are a multiple of VS (the Vector Size), and therefore the misalignment in
4617 different vectorized LOOP iterations is always the same.
4618 The problem arises only if the memory access is in an inner-loop nested
4619 inside LOOP, which is now being vectorized using outer-loop vectorization.
4620 This is the only case when the misalignment of the memory access may not
4621 remain fixed throughout the iterations of the inner-loop (as explained in
4622 detail in vect_supportable_dr_alignment). In this case, not only is the
4623 optimized realignment scheme not applicable, but also the misalignment
4624 computation (and generation of the realignment token that is passed to
4625 REALIGN_LOAD) have to be done inside the loop.
4627 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4628 or not, which in turn determines if the misalignment is computed inside
4629 the inner-loop, or outside LOOP. */
4631 if (init_addr
!= NULL_TREE
|| !loop_vinfo
)
4633 compute_in_loop
= true;
4634 gcc_assert (alignment_support_scheme
== dr_explicit_realign
);
4638 /* 2. Determine where to generate the extra vector load.
4640 For the optimized realignment scheme, instead of generating two vector
4641 loads in each iteration, we generate a single extra vector load in the
4642 preheader of the loop, and in each iteration reuse the result of the
4643 vector load from the previous iteration. In case the memory access is in
4644 an inner-loop nested inside LOOP, which is now being vectorized using
4645 outer-loop vectorization, we need to determine whether this initial vector
4646 load should be generated at the preheader of the inner-loop, or can be
4647 generated at the preheader of LOOP. If the memory access has no evolution
4648 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
4649 to be generated inside LOOP (in the preheader of the inner-loop). */
4651 if (nested_in_vect_loop
)
4653 tree outerloop_step
= STMT_VINFO_DR_STEP (stmt_info
);
4654 bool invariant_in_outerloop
=
4655 (tree_int_cst_compare (outerloop_step
, size_zero_node
) == 0);
4656 loop_for_initial_load
= (invariant_in_outerloop
? loop
: loop
->inner
);
4659 loop_for_initial_load
= loop
;
4661 *at_loop
= loop_for_initial_load
;
4663 if (loop_for_initial_load
)
4664 pe
= loop_preheader_edge (loop_for_initial_load
);
4666 /* 3. For the case of the optimized realignment, create the first vector
4667 load at the loop preheader. */
4669 if (alignment_support_scheme
== dr_explicit_realign_optimized
)
4671 /* Create msq_init = *(floor(p1)) in the loop preheader */
4673 gcc_assert (!compute_in_loop
);
4674 vec_dest
= vect_create_destination_var (scalar_dest
, vectype
);
4675 ptr
= vect_create_data_ref_ptr (stmt
, vectype
, loop_for_initial_load
,
4676 NULL_TREE
, &init_addr
, NULL
, &inc
,
4678 new_temp
= copy_ssa_name (ptr
, NULL
);
4679 new_stmt
= gimple_build_assign_with_ops
4680 (BIT_AND_EXPR
, new_temp
, ptr
,
4681 build_int_cst (TREE_TYPE (ptr
),
4682 -(HOST_WIDE_INT
)TYPE_ALIGN_UNIT (vectype
)));
4683 new_bb
= gsi_insert_on_edge_immediate (pe
, new_stmt
);
4684 gcc_assert (!new_bb
);
4686 = build2 (MEM_REF
, TREE_TYPE (vec_dest
), new_temp
,
4687 build_int_cst (reference_alias_ptr_type (DR_REF (dr
)), 0));
4688 new_stmt
= gimple_build_assign (vec_dest
, data_ref
);
4689 new_temp
= make_ssa_name (vec_dest
, new_stmt
);
4690 gimple_assign_set_lhs (new_stmt
, new_temp
);
4693 new_bb
= gsi_insert_on_edge_immediate (pe
, new_stmt
);
4694 gcc_assert (!new_bb
);
4697 gsi_insert_before (gsi
, new_stmt
, GSI_SAME_STMT
);
4699 msq_init
= gimple_assign_lhs (new_stmt
);
4702 /* 4. Create realignment token using a target builtin, if available.
4703 It is done either inside the containing loop, or before LOOP (as
4704 determined above). */
4706 if (targetm
.vectorize
.builtin_mask_for_load
)
4710 /* Compute INIT_ADDR - the initial addressed accessed by this memref. */
4713 /* Generate the INIT_ADDR computation outside LOOP. */
4714 init_addr
= vect_create_addr_base_for_vector_ref (stmt
, &stmts
,
4718 pe
= loop_preheader_edge (loop
);
4719 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, stmts
);
4720 gcc_assert (!new_bb
);
4723 gsi_insert_seq_before (gsi
, stmts
, GSI_SAME_STMT
);
4726 builtin_decl
= targetm
.vectorize
.builtin_mask_for_load ();
4727 new_stmt
= gimple_build_call (builtin_decl
, 1, init_addr
);
4729 vect_create_destination_var (scalar_dest
,
4730 gimple_call_return_type (new_stmt
));
4731 new_temp
= make_ssa_name (vec_dest
, new_stmt
);
4732 gimple_call_set_lhs (new_stmt
, new_temp
);
4734 if (compute_in_loop
)
4735 gsi_insert_before (gsi
, new_stmt
, GSI_SAME_STMT
);
4738 /* Generate the misalignment computation outside LOOP. */
4739 pe
= loop_preheader_edge (loop
);
4740 new_bb
= gsi_insert_on_edge_immediate (pe
, new_stmt
);
4741 gcc_assert (!new_bb
);
4744 *realignment_token
= gimple_call_lhs (new_stmt
);
4746 /* The result of the CALL_EXPR to this builtin is determined from
4747 the value of the parameter and no global variables are touched
4748 which makes the builtin a "const" function. Requiring the
4749 builtin to have the "const" attribute makes it unnecessary
4750 to call mark_call_clobbered. */
4751 gcc_assert (TREE_READONLY (builtin_decl
));
4754 if (alignment_support_scheme
== dr_explicit_realign
)
4757 gcc_assert (!compute_in_loop
);
4758 gcc_assert (alignment_support_scheme
== dr_explicit_realign_optimized
);
4761 /* 5. Create msq = phi <msq_init, lsq> in loop */
4763 pe
= loop_preheader_edge (containing_loop
);
4764 vec_dest
= vect_create_destination_var (scalar_dest
, vectype
);
4765 msq
= make_ssa_name (vec_dest
, NULL
);
4766 phi_stmt
= create_phi_node (msq
, containing_loop
->header
);
4767 add_phi_arg (phi_stmt
, msq_init
, pe
, UNKNOWN_LOCATION
);
4773 /* Function vect_grouped_load_supported.
4775 Returns TRUE if even and odd permutations are supported,
4776 and FALSE otherwise. */
4779 vect_grouped_load_supported (tree vectype
, unsigned HOST_WIDE_INT count
)
4781 enum machine_mode mode
= TYPE_MODE (vectype
);
4783 /* vect_permute_load_chain requires the group size to be a power of two. */
4784 if (exact_log2 (count
) == -1)
4786 if (dump_enabled_p ())
4787 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4788 "the size of the group of accesses"
4789 " is not a power of 2\n");
4793 /* Check that the permutation is supported. */
4794 if (VECTOR_MODE_P (mode
))
4796 unsigned int i
, nelt
= GET_MODE_NUNITS (mode
);
4797 unsigned char *sel
= XALLOCAVEC (unsigned char, nelt
);
4799 for (i
= 0; i
< nelt
; i
++)
4801 if (can_vec_perm_p (mode
, false, sel
))
4803 for (i
= 0; i
< nelt
; i
++)
4805 if (can_vec_perm_p (mode
, false, sel
))
4810 if (dump_enabled_p ())
4811 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4812 "extract even/odd not supported by target\n");
4816 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
4820 vect_load_lanes_supported (tree vectype
, unsigned HOST_WIDE_INT count
)
4822 return vect_lanes_optab_supported_p ("vec_load_lanes",
4823 vec_load_lanes_optab
,
4827 /* Function vect_permute_load_chain.
4829 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
4830 a power of 2, generate extract_even/odd stmts to reorder the input data
4831 correctly. Return the final references for loads in RESULT_CHAIN.
4833 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4834 The input is 4 vectors each containing 8 elements. We assign a number to each
4835 element, the input sequence is:
4837 1st vec: 0 1 2 3 4 5 6 7
4838 2nd vec: 8 9 10 11 12 13 14 15
4839 3rd vec: 16 17 18 19 20 21 22 23
4840 4th vec: 24 25 26 27 28 29 30 31
4842 The output sequence should be:
4844 1st vec: 0 4 8 12 16 20 24 28
4845 2nd vec: 1 5 9 13 17 21 25 29
4846 3rd vec: 2 6 10 14 18 22 26 30
4847 4th vec: 3 7 11 15 19 23 27 31
4849 i.e., the first output vector should contain the first elements of each
4850 interleaving group, etc.
4852 We use extract_even/odd instructions to create such output. The input of
4853 each extract_even/odd operation is two vectors
4857 and the output is the vector of extracted even/odd elements. The output of
4858 extract_even will be: 0 2 4 6
4859 and of extract_odd: 1 3 5 7
4862 The permutation is done in log LENGTH stages. In each stage extract_even
4863 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
4864 their order. In our example,
4866 E1: extract_even (1st vec, 2nd vec)
4867 E2: extract_odd (1st vec, 2nd vec)
4868 E3: extract_even (3rd vec, 4th vec)
4869 E4: extract_odd (3rd vec, 4th vec)
4871 The output for the first stage will be:
4873 E1: 0 2 4 6 8 10 12 14
4874 E2: 1 3 5 7 9 11 13 15
4875 E3: 16 18 20 22 24 26 28 30
4876 E4: 17 19 21 23 25 27 29 31
4878 In order to proceed and create the correct sequence for the next stage (or
4879 for the correct output, if the second stage is the last one, as in our
4880 example), we first put the output of extract_even operation and then the
4881 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
4882 The input for the second stage is:
4884 1st vec (E1): 0 2 4 6 8 10 12 14
4885 2nd vec (E3): 16 18 20 22 24 26 28 30
4886 3rd vec (E2): 1 3 5 7 9 11 13 15
4887 4th vec (E4): 17 19 21 23 25 27 29 31
4889 The output of the second stage:
4891 E1: 0 4 8 12 16 20 24 28
4892 E2: 2 6 10 14 18 22 26 30
4893 E3: 1 5 9 13 17 21 25 29
4894 E4: 3 7 11 15 19 23 27 31
4896 And RESULT_CHAIN after reordering:
4898 1st vec (E1): 0 4 8 12 16 20 24 28
4899 2nd vec (E3): 1 5 9 13 17 21 25 29
4900 3rd vec (E2): 2 6 10 14 18 22 26 30
4901 4th vec (E4): 3 7 11 15 19 23 27 31. */
4904 vect_permute_load_chain (vec
<tree
> dr_chain
,
4905 unsigned int length
,
4907 gimple_stmt_iterator
*gsi
,
4908 vec
<tree
> *result_chain
)
4910 tree data_ref
, first_vect
, second_vect
;
4911 tree perm_mask_even
, perm_mask_odd
;
4913 tree vectype
= STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt
));
4914 unsigned int i
, j
, log_length
= exact_log2 (length
);
4915 unsigned nelt
= TYPE_VECTOR_SUBPARTS (vectype
);
4916 unsigned char *sel
= XALLOCAVEC (unsigned char, nelt
);
4918 result_chain
->quick_grow (length
);
4919 memcpy (result_chain
->address (), dr_chain
.address (),
4920 length
* sizeof (tree
));
4922 for (i
= 0; i
< nelt
; ++i
)
4924 perm_mask_even
= vect_gen_perm_mask (vectype
, sel
);
4925 gcc_assert (perm_mask_even
!= NULL
);
4927 for (i
= 0; i
< nelt
; ++i
)
4929 perm_mask_odd
= vect_gen_perm_mask (vectype
, sel
);
4930 gcc_assert (perm_mask_odd
!= NULL
);
4932 for (i
= 0; i
< log_length
; i
++)
4934 for (j
= 0; j
< length
; j
+= 2)
4936 first_vect
= dr_chain
[j
];
4937 second_vect
= dr_chain
[j
+1];
4939 /* data_ref = permute_even (first_data_ref, second_data_ref); */
4940 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_perm_even");
4941 perm_stmt
= gimple_build_assign_with_ops (VEC_PERM_EXPR
, data_ref
,
4942 first_vect
, second_vect
,
4944 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
4945 (*result_chain
)[j
/2] = data_ref
;
4947 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
4948 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_perm_odd");
4949 perm_stmt
= gimple_build_assign_with_ops (VEC_PERM_EXPR
, data_ref
,
4950 first_vect
, second_vect
,
4952 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
4953 (*result_chain
)[j
/2+length
/2] = data_ref
;
4955 memcpy (dr_chain
.address (), result_chain
->address (),
4956 length
* sizeof (tree
));
4961 /* Function vect_transform_grouped_load.
4963 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
4964 to perform their permutation and ascribe the result vectorized statements to
4965 the scalar statements.
4969 vect_transform_grouped_load (gimple stmt
, vec
<tree
> dr_chain
, int size
,
4970 gimple_stmt_iterator
*gsi
)
4972 vec
<tree
> result_chain
= vNULL
;
4974 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
4975 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
4976 vectors, that are ready for vector computation. */
4977 result_chain
.create (size
);
4978 vect_permute_load_chain (dr_chain
, size
, stmt
, gsi
, &result_chain
);
4979 vect_record_grouped_load_vectors (stmt
, result_chain
);
4980 result_chain
.release ();
4983 /* RESULT_CHAIN contains the output of a group of grouped loads that were
4984 generated as part of the vectorization of STMT. Assign the statement
4985 for each vector to the associated scalar statement. */
4988 vect_record_grouped_load_vectors (gimple stmt
, vec
<tree
> result_chain
)
4990 gimple first_stmt
= GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
));
4991 gimple next_stmt
, new_stmt
;
4992 unsigned int i
, gap_count
;
4995 /* Put a permuted data-ref in the VECTORIZED_STMT field.
4996 Since we scan the chain starting from it's first node, their order
4997 corresponds the order of data-refs in RESULT_CHAIN. */
4998 next_stmt
= first_stmt
;
5000 FOR_EACH_VEC_ELT (result_chain
, i
, tmp_data_ref
)
5005 /* Skip the gaps. Loads created for the gaps will be removed by dead
5006 code elimination pass later. No need to check for the first stmt in
5007 the group, since it always exists.
5008 GROUP_GAP is the number of steps in elements from the previous
5009 access (if there is no gap GROUP_GAP is 1). We skip loads that
5010 correspond to the gaps. */
5011 if (next_stmt
!= first_stmt
5012 && gap_count
< GROUP_GAP (vinfo_for_stmt (next_stmt
)))
5020 new_stmt
= SSA_NAME_DEF_STMT (tmp_data_ref
);
5021 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5022 copies, and we put the new vector statement in the first available
5024 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt
)))
5025 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt
)) = new_stmt
;
5028 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt
)))
5031 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt
));
5033 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt
));
5036 prev_stmt
= rel_stmt
;
5038 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt
));
5041 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt
)) =
5046 next_stmt
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt
));
5048 /* If NEXT_STMT accesses the same DR as the previous statement,
5049 put the same TMP_DATA_REF as its vectorized statement; otherwise
5050 get the next data-ref from RESULT_CHAIN. */
5051 if (!next_stmt
|| !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt
)))
5057 /* Function vect_force_dr_alignment_p.
5059 Returns whether the alignment of a DECL can be forced to be aligned
5060 on ALIGNMENT bit boundary. */
5063 vect_can_force_dr_alignment_p (const_tree decl
, unsigned int alignment
)
5065 if (TREE_CODE (decl
) != VAR_DECL
)
5068 /* We cannot change alignment of common or external symbols as another
5069 translation unit may contain a definition with lower alignment.
5070 The rules of common symbol linking mean that the definition
5071 will override the common symbol. The same is true for constant
5072 pool entries which may be shared and are not properly merged
5074 if (DECL_EXTERNAL (decl
)
5075 || DECL_COMMON (decl
)
5076 || DECL_IN_CONSTANT_POOL (decl
))
5079 if (TREE_ASM_WRITTEN (decl
))
5082 /* Do not override the alignment as specified by the ABI when the used
5083 attribute is set. */
5084 if (DECL_PRESERVE_P (decl
))
5087 /* Do not override explicit alignment set by the user when an explicit
5088 section name is also used. This is a common idiom used by many
5089 software projects. */
5090 if (DECL_SECTION_NAME (decl
) != NULL_TREE
5091 && !DECL_HAS_IMPLICIT_SECTION_NAME_P (decl
))
5094 if (TREE_STATIC (decl
))
5095 return (alignment
<= MAX_OFILE_ALIGNMENT
);
5097 return (alignment
<= MAX_STACK_ALIGNMENT
);
5101 /* Return whether the data reference DR is supported with respect to its
5103 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
5104 it is aligned, i.e., check if it is possible to vectorize it with different
5107 enum dr_alignment_support
5108 vect_supportable_dr_alignment (struct data_reference
*dr
,
5109 bool check_aligned_accesses
)
5111 gimple stmt
= DR_STMT (dr
);
5112 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
5113 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
5114 enum machine_mode mode
= TYPE_MODE (vectype
);
5115 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
5116 struct loop
*vect_loop
= NULL
;
5117 bool nested_in_vect_loop
= false;
5119 if (aligned_access_p (dr
) && !check_aligned_accesses
)
5124 vect_loop
= LOOP_VINFO_LOOP (loop_vinfo
);
5125 nested_in_vect_loop
= nested_in_vect_loop_p (vect_loop
, stmt
);
5128 /* Possibly unaligned access. */
5130 /* We can choose between using the implicit realignment scheme (generating
5131 a misaligned_move stmt) and the explicit realignment scheme (generating
5132 aligned loads with a REALIGN_LOAD). There are two variants to the
5133 explicit realignment scheme: optimized, and unoptimized.
5134 We can optimize the realignment only if the step between consecutive
5135 vector loads is equal to the vector size. Since the vector memory
5136 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
5137 is guaranteed that the misalignment amount remains the same throughout the
5138 execution of the vectorized loop. Therefore, we can create the
5139 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
5140 at the loop preheader.
5142 However, in the case of outer-loop vectorization, when vectorizing a
5143 memory access in the inner-loop nested within the LOOP that is now being
5144 vectorized, while it is guaranteed that the misalignment of the
5145 vectorized memory access will remain the same in different outer-loop
5146 iterations, it is *not* guaranteed that is will remain the same throughout
5147 the execution of the inner-loop. This is because the inner-loop advances
5148 with the original scalar step (and not in steps of VS). If the inner-loop
5149 step happens to be a multiple of VS, then the misalignment remains fixed
5150 and we can use the optimized realignment scheme. For example:
5156 When vectorizing the i-loop in the above example, the step between
5157 consecutive vector loads is 1, and so the misalignment does not remain
5158 fixed across the execution of the inner-loop, and the realignment cannot
5159 be optimized (as illustrated in the following pseudo vectorized loop):
5161 for (i=0; i<N; i+=4)
5162 for (j=0; j<M; j++){
5163 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
5164 // when j is {0,1,2,3,4,5,6,7,...} respectively.
5165 // (assuming that we start from an aligned address).
5168 We therefore have to use the unoptimized realignment scheme:
5170 for (i=0; i<N; i+=4)
5171 for (j=k; j<M; j+=4)
5172 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
5173 // that the misalignment of the initial address is
5176 The loop can then be vectorized as follows:
5178 for (k=0; k<4; k++){
5179 rt = get_realignment_token (&vp[k]);
5180 for (i=0; i<N; i+=4){
5182 for (j=k; j<M; j+=4){
5184 va = REALIGN_LOAD <v1,v2,rt>;
5191 if (DR_IS_READ (dr
))
5193 bool is_packed
= false;
5194 tree type
= (TREE_TYPE (DR_REF (dr
)));
5196 if (optab_handler (vec_realign_load_optab
, mode
) != CODE_FOR_nothing
5197 && (!targetm
.vectorize
.builtin_mask_for_load
5198 || targetm
.vectorize
.builtin_mask_for_load ()))
5200 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
5201 if ((nested_in_vect_loop
5202 && (TREE_INT_CST_LOW (DR_STEP (dr
))
5203 != GET_MODE_SIZE (TYPE_MODE (vectype
))))
5205 return dr_explicit_realign
;
5207 return dr_explicit_realign_optimized
;
5209 if (!known_alignment_for_access_p (dr
))
5210 is_packed
= not_size_aligned (DR_REF (dr
));
5212 if ((TYPE_USER_ALIGN (type
) && !is_packed
)
5213 || targetm
.vectorize
.
5214 support_vector_misalignment (mode
, type
,
5215 DR_MISALIGNMENT (dr
), is_packed
))
5216 /* Can't software pipeline the loads, but can at least do them. */
5217 return dr_unaligned_supported
;
5221 bool is_packed
= false;
5222 tree type
= (TREE_TYPE (DR_REF (dr
)));
5224 if (!known_alignment_for_access_p (dr
))
5225 is_packed
= not_size_aligned (DR_REF (dr
));
5227 if ((TYPE_USER_ALIGN (type
) && !is_packed
)
5228 || targetm
.vectorize
.
5229 support_vector_misalignment (mode
, type
,
5230 DR_MISALIGNMENT (dr
), is_packed
))
5231 return dr_unaligned_supported
;
5235 return dr_unaligned_unsupported
;