/* Data References Analysis and Manipulation Utilities for Vectorization.
   Copyright (C) 2003-2015 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "coretypes.h"
#include "double-int.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "hard-reg-set.h"
#include "dominance.h"
#include "basic-block.h"
#include "gimple-pretty-print.h"
#include "tree-ssa-alias.h"
#include "internal-fn.h"
#include "gimple-expr.h"
#include "gimple-iterator.h"
#include "gimplify-me.h"
#include "gimple-ssa.h"
#include "tree-phinodes.h"
#include "ssa-iterators.h"
#include "stringpool.h"
#include "tree-ssanames.h"
#include "tree-ssa-loop-ivopts.h"
#include "tree-ssa-loop-manip.h"
#include "tree-ssa-loop.h"
#include "tree-chrec.h"
#include "tree-scalar-evolution.h"
#include "tree-vectorizer.h"
#include "diagnostic-core.h"
#include "plugin-api.h"

/* Need to include rtl.h, expr.h, etc. for optabs.  */
#include "statistics.h"
#include "fixed-value.h"
#include "insn-config.h"
#include "insn-codes.h"

/* Return true if load- or store-lanes optab OPTAB is implemented for
   COUNT vectors of type VECTYPE.  NAME is the name of OPTAB.  */

static bool
vect_lanes_optab_supported_p (const char *name, convert_optab optab,
                              tree vectype, unsigned HOST_WIDE_INT count)
{
  machine_mode mode, array_mode;
  bool limit_p;

  mode = TYPE_MODE (vectype);
  limit_p = !targetm.array_mode_supported_p (mode, count);
  array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode),
                              MODE_INT, limit_p);

  if (array_mode == BLKmode)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
                         GET_MODE_NAME (mode), count);
      return false;
    }

  if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "cannot use %s<%s><%s>\n", name,
                         GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
      return false;
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
                     GET_MODE_NAME (mode));

  return true;
}

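/* For example, for V4SImode vectors and COUNT == 2 this asks whether the
   target implements vec_load_lanes (or vec_store_lanes) for a 2 x V4SI
   array mode; when it does not, grouped accesses are instead handled with
   separate vector loads/stores and permutes.  */
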
/* Return the smallest scalar part of STMT.
   This is used to determine the vectype of the stmt.  We generally set the
   vectype according to the type of the result (lhs).  For stmts whose
   result-type is different than the type of the arguments (e.g., demotion,
   promotion), vectype will be reset appropriately (later).  Note that we have
   to visit the smallest datatype in this function, because that determines the
   VF.  If the smallest datatype in the loop is present only as the rhs of a
   promotion operation - we'd miss it.
   Such a case, where a variable of this datatype does not appear in the lhs
   anywhere in the loop, can only occur if it's an invariant: e.g.:
   'int_x = (int) short_inv', which we'd expect to have been optimized away by
   invariant motion.  However, we cannot rely on invariant motion to always
   take invariants out of the loop, and so in the case of promotion we also
   have to check the rhs.
   LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
   types.  */

tree
vect_get_smallest_scalar_type (gimple stmt, HOST_WIDE_INT *lhs_size_unit,
                               HOST_WIDE_INT *rhs_size_unit)
{
  tree scalar_type = gimple_expr_type (stmt);
  HOST_WIDE_INT lhs, rhs;

  lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));

  if (is_gimple_assign (stmt)
      && (gimple_assign_cast_p (stmt)
          || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
          || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
          || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
    {
      tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));

      rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
      if (rhs < lhs)
        scalar_type = rhs_type;
    }

  *lhs_size_unit = lhs;
  *rhs_size_unit = rhs;
  return scalar_type;
}

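/* For example, for 'int_x = (int) short_y' inside the loop this returns the
   short type, with *LHS_SIZE_UNIT == 4 and *RHS_SIZE_UNIT == 2 on typical
   targets, so the vectorization factor is derived from the 2-byte element.  */
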
/* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
   tested at run-time.  Return TRUE if DDR was successfully inserted.
   Return false if versioning is not supported.  */

static bool
vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

  if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
    return false;

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location,
                       "mark for run-time aliasing test between ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
      dump_printf (MSG_NOTE, " and ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
      dump_printf (MSG_NOTE, "\n");
    }

  if (optimize_loop_nest_for_size_p (loop))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "versioning not supported when optimizing"
                         " for size.\n");
      return false;
    }

  /* FORNOW: We don't support versioning with outer-loop vectorization.  */
  if (loop->inner)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "versioning not yet supported for outer-loops.\n");
      return false;
    }

  /* FORNOW: We don't support creating runtime alias tests for non-constant
     step.  */
  if (TREE_CODE (DR_STEP (DDR_A (ddr))) != INTEGER_CST
      || TREE_CODE (DR_STEP (DDR_B (ddr))) != INTEGER_CST)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "versioning not yet supported for non-constant "
                         "step\n");
      return false;
    }

  LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
  return true;
}

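/* For example, for 'p[i] = q[i] + 1' where p and q may alias, the DDR is
   queued here and the loop is later versioned under a runtime check that the
   two accessed address ranges do not overlap; only the checked version is
   vectorized.  */
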
/* Function vect_analyze_data_ref_dependence.

   Return TRUE if there (might) exist a dependence between a memory-reference
   DRA and a memory-reference DRB.  When versioning for alias may check a
   dependence at run-time, return FALSE.  Adjust *MAX_VF according to
   the data dependence.  */

static bool
vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
                                  loop_vec_info loop_vinfo, int *max_vf)
{
  unsigned int i;
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  struct data_reference *dra = DDR_A (ddr);
  struct data_reference *drb = DDR_B (ddr);
  stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
  stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
  lambda_vector dist_v;
  unsigned int loop_depth;

  /* In loop analysis all data references should be vectorizable.  */
  if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
      || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
    gcc_unreachable ();

  /* Independent data accesses.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
    return false;

  if (dra == drb
      || (DR_IS_READ (dra) && DR_IS_READ (drb)))
    return false;

  /* Even if we have an anti-dependence then, as the vectorized loop covers at
     least two scalar iterations, there is always also a true dependence.
     As the vectorizer does not re-order loads and stores we can ignore
     the anti-dependence if TBAA can disambiguate both DRs similar to the
     case with known negative distance anti-dependences (positive
     distance anti-dependences would violate TBAA constraints).  */
  if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
       || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
      && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
                                 get_alias_set (DR_REF (drb))))
    return false;

  /* Unknown data dependence.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
    {
      /* If user asserted safelen consecutive iterations can be
         executed concurrently, assume independence.  */
      if (loop->safelen >= 2)
        {
          if (loop->safelen < *max_vf)
            *max_vf = loop->safelen;
          LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
          return false;
        }

      if (STMT_VINFO_GATHER_P (stmtinfo_a)
          || STMT_VINFO_GATHER_P (stmtinfo_b))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "versioning for alias not supported for: "
                               "can't determine dependence between ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                 DR_REF (dra));
              dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                 DR_REF (drb));
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }
          return true;
        }

      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "versioning for alias required: "
                           "can't determine dependence between ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                             DR_REF (dra));
          dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                             DR_REF (drb));
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }

      /* Add to list of ddrs that need to be tested at run-time.  */
      return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
    }

  /* Known data dependence.  */
  if (DDR_NUM_DIST_VECTS (ddr) == 0)
    {
      /* If user asserted safelen consecutive iterations can be
         executed concurrently, assume independence.  */
      if (loop->safelen >= 2)
        {
          if (loop->safelen < *max_vf)
            *max_vf = loop->safelen;
          LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
          return false;
        }

      if (STMT_VINFO_GATHER_P (stmtinfo_a)
          || STMT_VINFO_GATHER_P (stmtinfo_b))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "versioning for alias not supported for: "
                               "bad dist vector for ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                 DR_REF (dra));
              dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                 DR_REF (drb));
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }
          return true;
        }

      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "versioning for alias required: "
                           "bad dist vector for ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
          dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }

      /* Add to list of ddrs that need to be tested at run-time.  */
      return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
    }

  loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
  FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
    {
      int dist = dist_v[loop_depth];

      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "dependence distance = %d.\n", dist);

      if (dist == 0)
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_NOTE, vect_location,
                               "dependence distance == 0 between ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
              dump_printf (MSG_NOTE, " and ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }

          /* When we perform grouped accesses and perform implicit CSE
             by detecting equal accesses and doing disambiguation with
             runtime alias tests like for
                .. = a[i];
                .. = a[i+1];
                a[i] = ..;
                a[i+1] = ..;
                *p = ..;
                .. = a[i];
                .. = a[i+1];
             where we will end up loading { a[i], a[i+1] } once, make
             sure that inserting group loads before the first load and
             stores after the last store will do the right thing.
             Similar for groups like
                a[i] = ...;
                ... = a[i];
                a[i+1] = ...;
             where loads from the group interleave with the store.  */
          if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
              || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
            {
              gimple earlier_stmt;
              earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
              if (DR_IS_WRITE
                    (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "READ_WRITE dependence in interleaving."
                                     "\n");
                  return true;
                }
            }

          continue;
        }

      if (dist > 0 && DDR_REVERSED_P (ddr))
        {
          /* If DDR_REVERSED_P the order of the data-refs in DDR was
             reversed (to make distance vector positive), and the actual
             distance is negative.  */
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "dependence distance negative.\n");
          /* Record a negative dependence distance to later limit the
             amount of stmt copying / unrolling we can perform.
             Only need to handle read-after-write dependence.  */
          if (DR_IS_READ (drb)
              && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
                  || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
            STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
          continue;
        }

      if (abs (dist) >= 2
          && abs (dist) < *max_vf)
        {
          /* The dependence distance requires reduction of the maximal
             vectorization factor.  */
          *max_vf = abs (dist);
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "adjusting maximal vectorization factor to %i\n",
                             *max_vf);
        }

      if (abs (dist) >= *max_vf)
        {
          /* Dependence distance does not create dependence, as far as
             vectorization is concerned, in this case.  */
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "dependence distance >= VF.\n");
          continue;
        }

      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "not vectorized, possible dependence "
                           "between data-refs ");
          dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
          dump_printf (MSG_NOTE, " and ");
          dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
          dump_printf (MSG_NOTE, "\n");
        }

      return true;
    }

  return false;
}

/* Function vect_analyze_data_ref_dependences.

   Examine all the data references in the loop, and make sure there do not
   exist any data dependences between them.  Set *MAX_VF according to
   the maximum vectorization factor the data dependences allow.  */

bool
vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
{
  unsigned int i;
  struct data_dependence_relation *ddr;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "=== vect_analyze_data_ref_dependences ===\n");

  LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
  if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
                                &LOOP_VINFO_DDRS (loop_vinfo),
                                LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
    return false;

  FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
    if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
      return false;

  return true;
}

/* Function vect_slp_analyze_data_ref_dependence.

   Return TRUE if there (might) exist a dependence between a memory-reference
   DRA and a memory-reference DRB that prohibits basic-block vectorization.  */

static bool
vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
{
  struct data_reference *dra = DDR_A (ddr);
  struct data_reference *drb = DDR_B (ddr);

  /* We need to check dependences of statements marked as unvectorizable
     as well, they still can prohibit vectorization.  */

  /* Independent data accesses.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
    return false;

  if (dra == drb)
    return false;

  /* Read-read is OK.  */
  if (DR_IS_READ (dra) && DR_IS_READ (drb))
    return false;

  /* If dra and drb are part of the same interleaving chain consider
     them independent.  */
  if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
      && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
          == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
    return false;

  /* Unknown data dependence.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
    {
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "can't determine dependence between ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
          dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }
    }
  else if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location,
                       "determined dependence between ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
      dump_printf (MSG_NOTE, " and ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
      dump_printf (MSG_NOTE, "\n");
    }

  /* We do not vectorize basic blocks with write-write dependencies.  */
  if (DR_IS_WRITE (dra) && DR_IS_WRITE (drb))
    return true;

  /* If we have a read-write dependence check that the load is before the
     store.  When we vectorize basic blocks, vector load can be only before
     corresponding scalar load, and vector store can be only after its
     corresponding scalar store.  So the order of the accesses is preserved in
     case the load is before the store.  */
  gimple earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
  if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
    {
      /* That only holds for load-store pairs taking part in vectorization.  */
      if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dra)))
          && STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (drb))))
        return false;
    }

  return true;
}

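/* For example, in a basic block containing 'x = a[0]; a[0] = y;' the load
   comes first, so the pair does not prevent SLP; with the opposite order
   'a[0] = y; x = a[0];' the store is earlier and the dependence makes the
   block unvectorizable.  */
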
/* Function vect_slp_analyze_data_ref_dependences.

   Examine all the data references in the basic-block, and make sure there
   do not exist any data dependences between them.  */

bool
vect_slp_analyze_data_ref_dependences (bb_vec_info bb_vinfo)
{
  struct data_dependence_relation *ddr;
  unsigned int i;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "=== vect_slp_analyze_data_ref_dependences ===\n");

  if (!compute_all_dependences (BB_VINFO_DATAREFS (bb_vinfo),
                                &BB_VINFO_DDRS (bb_vinfo),
                                vNULL, true))
    return false;

  FOR_EACH_VEC_ELT (BB_VINFO_DDRS (bb_vinfo), i, ddr)
    if (vect_slp_analyze_data_ref_dependence (ddr))
      return false;

  return true;
}

/* Function vect_compute_data_ref_alignment

   Compute the misalignment of the data reference DR.

   Output:
   1. If during the misalignment computation it is found that the data
      reference cannot be vectorized then false is returned.
   2. DR_MISALIGNMENT (DR) is defined.

   FOR NOW: No analysis is actually performed.  Misalignment is calculated
   only for trivial cases.  TODO.  */

static bool
vect_compute_data_ref_alignment (struct data_reference *dr)
{
  gimple stmt = DR_STMT (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = NULL;
  tree ref = DR_REF (dr);
  tree vectype;
  tree base, base_addr;
  bool base_aligned;
  tree misalign;
  tree aligned_to;
  unsigned HOST_WIDE_INT alignment;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "vect_compute_data_ref_alignment:\n");

  if (loop_vinfo)
    loop = LOOP_VINFO_LOOP (loop_vinfo);

  /* Initialize misalignment to unknown.  */
  SET_DR_MISALIGNMENT (dr, -1);

  /* Strided loads perform only component accesses, misalignment information
     is irrelevant for them.  */
  if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
    return true;

  misalign = DR_INIT (dr);
  aligned_to = DR_ALIGNED_TO (dr);
  base_addr = DR_BASE_ADDRESS (dr);
  vectype = STMT_VINFO_VECTYPE (stmt_info);

  /* In case the dataref is in an inner-loop of the loop that is being
     vectorized (LOOP), we use the base and misalignment information
     relative to the outer-loop (LOOP).  This is ok only if the misalignment
     stays the same throughout the execution of the inner-loop, which is why
     we have to check that the stride of the dataref in the inner-loop evenly
     divides by the vector size.  */
  if (loop && nested_in_vect_loop_p (loop, stmt))
    {
      tree step = DR_STEP (dr);
      HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);

      if (dr_step % GET_MODE_SIZE (TYPE_MODE (vectype)) == 0)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "inner step divides the vector-size.\n");
          misalign = STMT_VINFO_DR_INIT (stmt_info);
          aligned_to = STMT_VINFO_DR_ALIGNED_TO (stmt_info);
          base_addr = STMT_VINFO_DR_BASE_ADDRESS (stmt_info);
        }
      else
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "inner step doesn't divide the vector-size.\n");
          misalign = NULL_TREE;
        }
    }

  /* Similarly, if we're doing basic-block vectorization, we can only use
     base and misalignment information relative to an innermost loop if the
     misalignment stays the same throughout the execution of the loop.
     As above, this is the case if the stride of the dataref evenly divides
     by the vector size.  */
  else
    {
      tree step = DR_STEP (dr);
      HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);

      if (dr_step % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "SLP: step doesn't divide the vector-size.\n");
          misalign = NULL_TREE;
        }
    }

  alignment = TYPE_ALIGN_UNIT (vectype);

  if ((compare_tree_int (aligned_to, alignment) < 0)
      || !misalign)
    {
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "Unknown alignment for access: ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }
      return true;
    }

  /* To look at alignment of the base we have to preserve an inner MEM_REF
     as that carries alignment information of the actual access.  */
  base = ref;
  while (handled_component_p (base))
    base = TREE_OPERAND (base, 0);
  if (TREE_CODE (base) == MEM_REF)
    base = build2 (MEM_REF, TREE_TYPE (base), base_addr,
                   build_int_cst (TREE_TYPE (TREE_OPERAND (base, 1)), 0));

  if (get_object_alignment (base) >= TYPE_ALIGN (vectype))
    base_aligned = true;
  else
    base_aligned = false;

  if (!base_aligned)
    {
      /* Strip an inner MEM_REF to a bare decl if possible.  */
      if (TREE_CODE (base) == MEM_REF
          && integer_zerop (TREE_OPERAND (base, 1))
          && TREE_CODE (TREE_OPERAND (base, 0)) == ADDR_EXPR)
        base = TREE_OPERAND (TREE_OPERAND (base, 0), 0);

      if (!vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype)))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_NOTE, vect_location,
                               "can't force alignment of ref: ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
              dump_printf (MSG_NOTE, "\n");
            }
          return true;
        }

      /* Force the alignment of the decl.
         NOTE: This is the only change to the code we make during
         the analysis phase, before deciding to vectorize the loop.  */
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
          dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
          dump_printf (MSG_NOTE, "\n");
        }

      ((dataref_aux *)dr->aux)->base_decl = base;
      ((dataref_aux *)dr->aux)->base_misaligned = true;
    }

  /* If this is a backward running DR then first access in the larger
     vectype actually is N-1 elements before the address in the DR.
     Adjust misalign accordingly.  */
  if (tree_int_cst_sgn (DR_STEP (dr)) < 0)
    {
      tree offset = ssize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
      /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
         otherwise we wouldn't be here.  */
      offset = fold_build2 (MULT_EXPR, ssizetype, offset, DR_STEP (dr));
      /* PLUS because DR_STEP was negative.  */
      misalign = size_binop (PLUS_EXPR, misalign, offset);
    }

  SET_DR_MISALIGNMENT (dr,
                       wi::mod_floor (misalign, alignment, SIGNED).to_uhwi ());

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                       "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
    }

  return true;
}

/* Function vect_compute_data_refs_alignment

   Compute the misalignment of data references in the loop.
   Return FALSE if a data reference is found that cannot be vectorized.  */

static bool
vect_compute_data_refs_alignment (loop_vec_info loop_vinfo,
                                  bb_vec_info bb_vinfo)
{
  vec<data_reference_p> datarefs;
  struct data_reference *dr;
  unsigned int i;

  if (loop_vinfo)
    datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
  else
    datarefs = BB_VINFO_DATAREFS (bb_vinfo);

  FOR_EACH_VEC_ELT (datarefs, i, dr)
    if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
        && !vect_compute_data_ref_alignment (dr))
      {
        if (bb_vinfo)
          {
            /* Mark unsupported statement as unvectorizable.  */
            STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
            continue;
          }
        else
          return false;
      }

  return true;
}

/* Function vect_update_misalignment_for_peel

   DR - the data reference whose misalignment is to be adjusted.
   DR_PEEL - the data reference whose misalignment is being made
             zero in the vector loop by the peel.
   NPEEL - the number of iterations in the peel loop if the misalignment
           of DR_PEEL is known at compile time.  */

static void
vect_update_misalignment_for_peel (struct data_reference *dr,
                                   struct data_reference *dr_peel, int npeel)
{
  unsigned int i;
  vec<dr_p> same_align_drs;
  struct data_reference *current_dr;
  int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
  int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
  stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
  stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));

  /* For interleaved data accesses the step in the loop must be multiplied by
     the size of the interleaving group.  */
  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
  if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
    dr_peel_size *= GROUP_SIZE (peel_stmt_info);

  /* It can be assumed that the data refs with the same alignment as dr_peel
     are aligned in the vector loop.  */
  same_align_drs
    = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
  FOR_EACH_VEC_ELT (same_align_drs, i, current_dr)
    {
      if (current_dr != dr)
        continue;
      gcc_assert (DR_MISALIGNMENT (dr) / dr_size ==
                  DR_MISALIGNMENT (dr_peel) / dr_peel_size);
      SET_DR_MISALIGNMENT (dr, 0);
      return;
    }

  if (known_alignment_for_access_p (dr)
      && known_alignment_for_access_p (dr_peel))
    {
      bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
      int misal = DR_MISALIGNMENT (dr);
      tree vectype = STMT_VINFO_VECTYPE (stmt_info);
      misal += negative ? -npeel * dr_size : npeel * dr_size;
      misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
      SET_DR_MISALIGNMENT (dr, misal);
      return;
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment to -1.\n");
  SET_DR_MISALIGNMENT (dr, -1);
}

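/* For example, with 4-byte elements, a 16-byte vectype and NPEEL == 2, a DR
   whose misalignment was 4 becomes (4 + 2*4) & 15 == 12, while DR_PEEL
   itself is made aligned (misalignment 0) by the peel.  */
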
/* Function vect_verify_datarefs_alignment

   Return TRUE if all data references in the loop can be
   handled with respect to alignment.  */

bool
vect_verify_datarefs_alignment (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
{
  vec<data_reference_p> datarefs;
  struct data_reference *dr;
  enum dr_alignment_support supportable_dr_alignment;
  unsigned int i;

  if (loop_vinfo)
    datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
  else
    datarefs = BB_VINFO_DATAREFS (bb_vinfo);

  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      gimple stmt = DR_STMT (dr);
      stmt_vec_info stmt_info = vinfo_for_stmt (stmt);

      if (!STMT_VINFO_RELEVANT_P (stmt_info))
        continue;

      /* For interleaving, only the alignment of the first access matters.
         Skip statements marked as not vectorizable.  */
      if ((STMT_VINFO_GROUPED_ACCESS (stmt_info)
           && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
          || !STMT_VINFO_VECTORIZABLE (stmt_info))
        continue;

      /* Strided loads perform only component accesses, alignment is
         irrelevant for them.  */
      if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
        continue;

      supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
      if (!supportable_dr_alignment)
        {
          if (dump_enabled_p ())
            {
              if (DR_IS_READ (dr))
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "not vectorized: unsupported unaligned load.");
              else
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "not vectorized: unsupported unaligned "
                                 "store.");

              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                 DR_REF (dr));
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }
          return false;
        }
      if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "Vectorizing an unaligned access.\n");
    }

  return true;
}

/* Given a memory reference EXP return whether its alignment is less
   than its size.  */

static bool
not_size_aligned (tree exp)
{
  if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
    return true;

  return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
          > get_object_alignment (exp));
}

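/* For example, a 32-bit field of a packed structure has TYPE_SIZE 32 but may
   have an object alignment of only 8 bits, in which case this returns
   true.  */
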
/* Function vector_alignment_reachable_p

   Return true if vector alignment for DR is reachable by peeling
   a few loop iterations.  Return false otherwise.  */

static bool
vector_alignment_reachable_p (struct data_reference *dr)
{
  gimple stmt = DR_STMT (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);

  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    {
      /* For interleaved access we peel only if number of iterations in
         the prolog loop ({VF - misalignment}), is a multiple of the
         number of the interleaved accesses.  */
      int elem_size, mis_in_elements;
      int nelements = TYPE_VECTOR_SUBPARTS (vectype);

      /* FORNOW: handle only known alignment.  */
      if (!known_alignment_for_access_p (dr))
        return false;

      elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
      mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;

      if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
        return false;
    }

  /* If misalignment is known at the compile time then allow peeling
     only if natural alignment is reachable through peeling.  */
  if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
    {
      HOST_WIDE_INT elmsize =
                int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location,
                           "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
          dump_printf (MSG_NOTE,
                       ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
        }
      if (DR_MISALIGNMENT (dr) % elmsize)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "data size does not divide the misalignment.\n");
          return false;
        }
    }

  if (!known_alignment_for_access_p (dr))
    {
      tree type = TREE_TYPE (DR_REF (dr));
      bool is_packed = not_size_aligned (DR_REF (dr));
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "Unknown misalignment, is_packed = %d\n", is_packed);
      if ((TYPE_USER_ALIGN (type) && !is_packed)
          || targetm.vectorize.vector_alignment_reachable (type, is_packed))
        return true;
      else
        return false;
    }

  return true;
}

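/* For example, with 4-byte elements and 16-byte vectors, a known misalignment
   of 8 bytes is fixed by peeling 2 scalar iterations, so alignment is
   reachable; a misalignment of 3 bytes is not a multiple of the element size
   and can never be peeled away.  */
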
/* Calculate the cost of the memory access represented by DR.  */

static void
vect_get_data_access_cost (struct data_reference *dr,
                           unsigned int *inside_cost,
                           unsigned int *outside_cost,
                           stmt_vector_for_cost *body_cost_vec)
{
  gimple stmt = DR_STMT (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  int ncopies = vf / nunits;

  if (DR_IS_READ (dr))
    vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
                        NULL, body_cost_vec, false);
  else
    vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "vect_get_data_access_cost: inside_cost = %d, "
                     "outside_cost = %d.\n", *inside_cost, *outside_cost);
}

/* Insert DR into peeling hash table with NPEEL as key.  */

static void
vect_peeling_hash_insert (loop_vec_info loop_vinfo, struct data_reference *dr,
                          int npeel)
{
  struct _vect_peel_info elem, *slot;
  _vect_peel_info **new_slot;
  bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);

  elem.npeel = npeel;
  slot = LOOP_VINFO_PEELING_HTAB (loop_vinfo)->find (&elem);
  if (slot)
    slot->count++;
  else
    {
      slot = XNEW (struct _vect_peel_info);
      slot->npeel = npeel;
      slot->dr = dr;
      slot->count = 1;
      new_slot
        = LOOP_VINFO_PEELING_HTAB (loop_vinfo)->find_slot (slot, INSERT);
      *new_slot = slot;
    }

  if (!supportable_dr_alignment
      && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
    slot->count += VECT_MAX_COST;
}

/* Traverse peeling hash table to find peeling option that aligns maximum
   number of data accesses.  */

int
vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
                                     _vect_peel_extended_info *max)
{
  vect_peel_info elem = *slot;

  if (elem->count > max->peel_info.count
      || (elem->count == max->peel_info.count
          && max->peel_info.npeel > elem->npeel))
    {
      max->peel_info.npeel = elem->npeel;
      max->peel_info.count = elem->count;
      max->peel_info.dr = elem->dr;
    }

  return 1;
}

/* Traverse peeling hash table and calculate cost for each peeling option.
   Find the one with the lowest cost.  */

int
vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
                                   _vect_peel_extended_info *min)
{
  vect_peel_info elem = *slot;
  int save_misalignment, dummy;
  unsigned int inside_cost = 0, outside_cost = 0, i;
  gimple stmt = DR_STMT (elem->dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
  struct data_reference *dr;
  stmt_vector_for_cost prologue_cost_vec, body_cost_vec, epilogue_cost_vec;
  int single_iter_cost;

  prologue_cost_vec.create (2);
  body_cost_vec.create (2);
  epilogue_cost_vec.create (2);

  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      stmt = DR_STMT (dr);
      stmt_info = vinfo_for_stmt (stmt);
      /* For interleaving, only the alignment of the first access
         matters.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
          && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
        continue;

      save_misalignment = DR_MISALIGNMENT (dr);
      vect_update_misalignment_for_peel (dr, elem->dr, elem->npeel);
      vect_get_data_access_cost (dr, &inside_cost, &outside_cost,
                                 &body_cost_vec);
      SET_DR_MISALIGNMENT (dr, save_misalignment);
    }

  single_iter_cost = vect_get_single_scalar_iteration_cost (loop_vinfo);
  outside_cost += vect_get_known_peeling_cost
    (loop_vinfo, elem->npeel, &dummy,
     /* ???  We use this cost as number of stmts with scalar_stmt cost,
        thus divide by that.  This introduces rounding errors, thus better
        introduce a new cost kind (raw_cost?  scalar_iter_cost?).  */
     single_iter_cost / vect_get_stmt_cost (scalar_stmt),
     &prologue_cost_vec, &epilogue_cost_vec);

  /* Prologue and epilogue costs are added to the target model later.
     These costs depend only on the scalar iteration cost, the
     number of peeling iterations finally chosen, and the number of
     misaligned statements.  So discard the information found here.  */
  prologue_cost_vec.release ();
  epilogue_cost_vec.release ();

  if (inside_cost < min->inside_cost
      || (inside_cost == min->inside_cost && outside_cost < min->outside_cost))
    {
      min->inside_cost = inside_cost;
      min->outside_cost = outside_cost;
      min->body_cost_vec.release ();
      min->body_cost_vec = body_cost_vec;
      min->peel_info.dr = elem->dr;
      min->peel_info.npeel = elem->npeel;
    }
  else
    body_cost_vec.release ();

  return 1;
}

/* Choose best peeling option by traversing peeling hash table and either
   choosing an option with the lowest cost (if cost model is enabled) or the
   option that aligns as many accesses as possible.  */

static struct data_reference *
vect_peeling_hash_choose_best_peeling (loop_vec_info loop_vinfo,
                                       unsigned int *npeel,
                                       stmt_vector_for_cost *body_cost_vec)
{
  struct _vect_peel_extended_info res;

  res.peel_info.dr = NULL;
  res.body_cost_vec = stmt_vector_for_cost ();

  if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
    {
      res.inside_cost = INT_MAX;
      res.outside_cost = INT_MAX;
      LOOP_VINFO_PEELING_HTAB (loop_vinfo)
          ->traverse <_vect_peel_extended_info *,
                      vect_peeling_hash_get_lowest_cost> (&res);
    }
  else
    {
      res.peel_info.count = 0;
      LOOP_VINFO_PEELING_HTAB (loop_vinfo)
          ->traverse <_vect_peel_extended_info *,
                      vect_peeling_hash_get_most_frequent> (&res);
    }

  *npeel = res.peel_info.npeel;
  *body_cost_vec = res.body_cost_vec;
  return res.peel_info.dr;
}

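/* For example, under the unlimited cost model, if peeling one iteration
   would align three data-refs but peeling five would align four, the
   npeel == 5 entry has the larger count and is chosen; with the cost model
   enabled the entry with the smallest inside/outside cost wins instead.  */
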
/* Function vect_enhance_data_refs_alignment

   This pass will use loop versioning and loop peeling in order to enhance
   the alignment of data references in the loop.

   FOR NOW: we assume that whatever versioning/peeling takes place, only the
   original loop is to be vectorized.  Any other loops that are created by
   the transformations performed in this pass - are not supposed to be
   vectorized.  This restriction will be relaxed.

   This pass will require a cost model to guide it whether to apply peeling
   or versioning or a combination of the two.  For example, the scheme that
   Intel uses when given a loop with several memory accesses, is as follows:
   choose one memory access ('p') whose alignment you want to force by doing
   peeling.  Then, either (1) generate a loop in which 'p' is aligned and all
   other accesses are not necessarily aligned, or (2) use loop versioning to
   generate one loop in which all accesses are aligned, and another loop in
   which only 'p' is necessarily aligned.

   ("Automatic Intra-Register Vectorization for the Intel Architecture",
   Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
   Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)

   Devising a cost model is the most critical aspect of this work.  It will
   guide us on which access to peel for, whether to use loop versioning, how
   many versions to create, etc.  The cost model will probably consist of
   generic considerations as well as target specific considerations (on
   powerpc for example, misaligned stores are more painful than misaligned
   loads).

   Here are the general steps involved in alignment enhancements:

     -- original loop, before alignment analysis:
        for (i=0; i<N; i++){
          x = q[i];                     # DR_MISALIGNMENT(q) = unknown
          p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
        }

     -- After vect_compute_data_refs_alignment:
        for (i=0; i<N; i++){
          x = q[i];                     # DR_MISALIGNMENT(q) = 3
          p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
        }

     -- Possibility 1: we do loop versioning:
     if (p is aligned) {
        for (i=0; i<N; i++){            # loop 1A
          x = q[i];                     # DR_MISALIGNMENT(q) = 3
          p[i] = y;                     # DR_MISALIGNMENT(p) = 0
        }
     }
     else {
        for (i=0; i<N; i++){            # loop 1B
          x = q[i];                     # DR_MISALIGNMENT(q) = 3
          p[i] = y;                     # DR_MISALIGNMENT(p) = unaligned
        }
     }

     -- Possibility 2: we do loop peeling:
     for (i = 0; i < 3; i++){           # (scalar loop, not to be vectorized).
        x = q[i];
        p[i] = y;
     }
     for (i = 3; i < N; i++){           # loop 2A
        x = q[i];                       # DR_MISALIGNMENT(q) = 0
        p[i] = y;                       # DR_MISALIGNMENT(p) = unknown
     }

     -- Possibility 3: combination of loop peeling and versioning:
     for (i = 0; i < 3; i++){           # (scalar loop, not to be vectorized).
        x = q[i];
        p[i] = y;
     }
     if (p is aligned) {
        for (i = 3; i<N; i++){          # loop 3A
          x = q[i];                     # DR_MISALIGNMENT(q) = 0
          p[i] = y;                     # DR_MISALIGNMENT(p) = 0
        }
     }
     else {
        for (i = 3; i<N; i++){          # loop 3B
          x = q[i];                     # DR_MISALIGNMENT(q) = 0
          p[i] = y;                     # DR_MISALIGNMENT(p) = unaligned
        }
     }

  These loops are later passed to loop_transform to be vectorized.  The
  vectorizer will use the alignment information to guide the transformation
  (whether to generate regular loads/stores, or with special handling for
  misalignment).  */
bool
vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
{
  vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  enum dr_alignment_support supportable_dr_alignment;
  struct data_reference *dr0 = NULL, *first_store = NULL;
  struct data_reference *dr;
  unsigned int i, j;
  bool do_peeling = false;
  bool do_versioning = false;
  bool stat;
  gimple stmt;
  stmt_vec_info stmt_info;
  unsigned int npeel = 0;
  bool all_misalignments_unknown = true;
  unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  unsigned possible_npeel_number = 1;
  tree vectype;
  unsigned int nelements, mis, same_align_drs_max = 0;
  stmt_vector_for_cost body_cost_vec = stmt_vector_for_cost ();

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "=== vect_enhance_data_refs_alignment ===\n");

  /* While cost model enhancements are expected in the future, the high level
     view of the code at this time is as follows:

     A) If there is a misaligned access then see if peeling to align
        this access can make all data references satisfy
        vect_supportable_dr_alignment.  If so, update data structures
        as needed and return true.

     B) If peeling wasn't possible and there is a data reference with an
        unknown misalignment that does not satisfy vect_supportable_dr_alignment
        then see if loop versioning checks can be used to make all data
        references satisfy vect_supportable_dr_alignment.  If so, update
        data structures as needed and return true.

     C) If neither peeling nor versioning were successful then return false if
        any data reference does not satisfy vect_supportable_dr_alignment.

     D) Return true (all data references satisfy vect_supportable_dr_alignment).

     Note, Possibility 3 above (which is peeling and versioning together) is not
     being done at this time.  */

  /* (1) Peeling to force alignment.  */

  /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
     Considerations:
     + How many accesses will become aligned due to the peeling
     - How many accesses will become unaligned due to the peeling,
       and the cost of misaligned accesses.
     - The cost of peeling (the extra runtime checks, the increase
       in code size).  */

  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      stmt = DR_STMT (dr);
      stmt_info = vinfo_for_stmt (stmt);

      if (!STMT_VINFO_RELEVANT_P (stmt_info))
        continue;

      /* For interleaving, only the alignment of the first access
         matters.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
          && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
        continue;

      /* For invariant accesses there is nothing to enhance.  */
      if (integer_zerop (DR_STEP (dr)))
        continue;

      /* Strided loads perform only component accesses, alignment is
         irrelevant for them.  */
      if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
        continue;

      supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
      do_peeling = vector_alignment_reachable_p (dr);
      if (do_peeling)
        {
          if (known_alignment_for_access_p (dr))
            {
              unsigned int npeel_tmp;
              bool negative = tree_int_cst_compare (DR_STEP (dr),
                                                    size_zero_node) < 0;

              /* Save info about DR in the hash table.  */
              if (!LOOP_VINFO_PEELING_HTAB (loop_vinfo))
                LOOP_VINFO_PEELING_HTAB (loop_vinfo)
                  = new hash_table<peel_info_hasher> (1);

              vectype = STMT_VINFO_VECTYPE (stmt_info);
              nelements = TYPE_VECTOR_SUBPARTS (vectype);
              mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
                                                TREE_TYPE (DR_REF (dr))));
              npeel_tmp = (negative
                           ? (mis - nelements) : (nelements - mis))
                          & (nelements - 1);

              /* For multiple types, it is possible that the bigger type access
                 will have more than one peeling option.  E.g., a loop with two
                 types: one of size (vector size / 4), and the other one of
                 size (vector size / 8).  The vectorization factor will be 8.
                 If both accesses are misaligned by 3, the first one needs one
                 scalar iteration to be aligned, and the second one needs 5.
                 But the first one will be aligned also by peeling 5 scalar
                 iterations, and in that case both accesses will be aligned.
                 Hence, except for the immediate peeling amount, we also want
                 to try to add full vector size, while we don't exceed
                 vectorization factor.
                 We do this automatically for the cost model, since we
                 calculate the cost for every peeling option.  */
              if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
                possible_npeel_number = vf / nelements;

              /* Handle the aligned case.  We may decide to align some other
                 access, making DR unaligned.  */
              if (DR_MISALIGNMENT (dr) == 0)
                {
                  npeel_tmp = 0;
                  if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
                    possible_npeel_number++;
                }

              for (j = 0; j < possible_npeel_number; j++)
                {
                  gcc_assert (npeel_tmp <= vf);
                  vect_peeling_hash_insert (loop_vinfo, dr, npeel_tmp);
                  npeel_tmp += nelements;
                }

              all_misalignments_unknown = false;
              /* Data-ref that was chosen for the case that all the
                 misalignments are unknown is not relevant anymore, since we
                 have a data-ref with known alignment.  */
              dr0 = NULL;
            }
          else
            {
              /* If we don't know any misalignment values, we prefer
                 peeling for the data-ref that has the maximum number of
                 data-refs with the same alignment, unless the target prefers
                 to align stores over loads.  */
              if (all_misalignments_unknown)
                {
                  unsigned same_align_drs
                    = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
                  if (!dr0
                      || same_align_drs_max < same_align_drs)
                    {
                      same_align_drs_max = same_align_drs;
                      dr0 = dr;
                    }
                  /* For data-refs with the same number of related
                     accesses prefer the one where the misalign
                     computation will be invariant in the outermost loop.  */
                  else if (same_align_drs_max == same_align_drs)
                    {
                      struct loop *ivloop0, *ivloop;
                      ivloop0 = outermost_invariant_loop_for_expr
                                  (loop, DR_BASE_ADDRESS (dr0));
                      ivloop = outermost_invariant_loop_for_expr
                                  (loop, DR_BASE_ADDRESS (dr));
                      if ((ivloop && !ivloop0)
                          || (ivloop && ivloop0
                              && flow_loop_nested_p (ivloop, ivloop0)))
                        dr0 = dr;
                    }

                  if (!first_store && DR_IS_WRITE (dr))
                    first_store = dr;
                }

              /* If there are both known and unknown misaligned accesses in the
                 loop, we choose the peeling amount according to the known
                 accesses.  */
              if (!supportable_dr_alignment)
                {
                  dr0 = dr;
                  if (!first_store && DR_IS_WRITE (dr))
                    first_store = dr;
                }
            }
        }
      else
        {
          if (!aligned_access_p (dr))
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "vector alignment may not be reachable\n");
              break;
            }
        }
    }

  /* Check if we can possibly peel the loop.  */
  if (!vect_can_advance_ivs_p (loop_vinfo)
      || !slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
    do_peeling = false;

  /* If we don't know how many times the peeling loop will run
     assume it will run VF-1 times and disable peeling if the remaining
     iters are less than the vectorization factor.  */
  if (do_peeling
      && all_misalignments_unknown
      && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      && (LOOP_VINFO_INT_NITERS (loop_vinfo)
          < 2 * (unsigned) LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1))
    do_peeling = false;

  if (do_peeling
      && all_misalignments_unknown
      && vect_supportable_dr_alignment (dr0, false))
    {
      /* Check if the target requires to prefer stores over loads, i.e., if
         misaligned stores are more expensive than misaligned loads (taking
         drs with same alignment into account).  */
      if (first_store && DR_IS_READ (dr0))
        {
          unsigned int load_inside_cost = 0, load_outside_cost = 0;
          unsigned int store_inside_cost = 0, store_outside_cost = 0;
          unsigned int load_inside_penalty = 0, load_outside_penalty = 0;
          unsigned int store_inside_penalty = 0, store_outside_penalty = 0;
          stmt_vector_for_cost dummy;
          dummy.create (2);

          vect_get_data_access_cost (dr0, &load_inside_cost,
                                     &load_outside_cost, &dummy);
          vect_get_data_access_cost (first_store, &store_inside_cost,
                                     &store_outside_cost, &dummy);

          dummy.release ();

          /* Calculate the penalty for leaving FIRST_STORE unaligned (by
             aligning the load DR0).  */
          load_inside_penalty = store_inside_cost;
          load_outside_penalty = store_outside_cost;
          for (i = 0;
               STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
                          DR_STMT (first_store))).iterate (i, &dr);
               i++)
            if (DR_IS_READ (dr))
              {
                load_inside_penalty += load_inside_cost;
                load_outside_penalty += load_outside_cost;
              }
            else
              {
                load_inside_penalty += store_inside_cost;
                load_outside_penalty += store_outside_cost;
              }

          /* Calculate the penalty for leaving DR0 unaligned (by
             aligning the FIRST_STORE).  */
          store_inside_penalty = load_inside_cost;
          store_outside_penalty = load_outside_cost;
          for (i = 0;
               STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
                          DR_STMT (dr0))).iterate (i, &dr);
               i++)
            if (DR_IS_READ (dr))
              {
                store_inside_penalty += load_inside_cost;
                store_outside_penalty += load_outside_cost;
              }
            else
              {
                store_inside_penalty += store_inside_cost;
                store_outside_penalty += store_outside_cost;
              }

          if (load_inside_penalty > store_inside_penalty
              || (load_inside_penalty == store_inside_penalty
                  && load_outside_penalty > store_outside_penalty))
            dr0 = first_store;
        }

      /* In case there are only loads with different unknown misalignments, use
         peeling only if it may help to align other accesses in the loop.  */
      if (!first_store
          && !STMT_VINFO_SAME_ALIGN_REFS (
                  vinfo_for_stmt (DR_STMT (dr0))).length ()
          && vect_supportable_dr_alignment (dr0, false)
             != dr_unaligned_supported)
        do_peeling = false;
    }

  if (do_peeling && !dr0)
    {
      /* Peeling is possible, but there is no data access that is not supported
         unless aligned.  So we try to choose the best possible peeling.  */

      /* We should get here only if there are drs with known misalignment.  */
      gcc_assert (!all_misalignments_unknown);

      /* Choose the best peeling from the hash table.  */
      dr0 = vect_peeling_hash_choose_best_peeling (loop_vinfo, &npeel,
                                                   &body_cost_vec);
      if (!dr0 || !npeel)
        do_peeling = false;

      /* If peeling by npeel will result in a remaining loop not iterating
         enough to be vectorized then do not peel.  */
      if (do_peeling
          && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
          && (LOOP_VINFO_INT_NITERS (loop_vinfo)
              < LOOP_VINFO_VECT_FACTOR (loop_vinfo) + npeel))
        do_peeling = false;
    }

  if (do_peeling)
    {
      stmt = DR_STMT (dr0);
      stmt_info = vinfo_for_stmt (stmt);
      vectype = STMT_VINFO_VECTYPE (stmt_info);
      nelements = TYPE_VECTOR_SUBPARTS (vectype);

      if (known_alignment_for_access_p (dr0))
        {
          bool negative = tree_int_cst_compare (DR_STEP (dr0),
                                                size_zero_node) < 0;
          if (!npeel)
            {
              /* Since it's known at compile time, compute the number of
                 iterations in the peeled loop (the peeling factor) for use in
                 updating DR_MISALIGNMENT values.  The peeling factor is the
                 vectorization factor minus the misalignment as an element
                 count.  */
              mis = DR_MISALIGNMENT (dr0);
              mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
              npeel = ((negative ? mis - nelements : nelements - mis)
                       & (nelements - 1));
            }

          /* For interleaved data access every iteration accesses all the
             members of the group, therefore we divide the number of iterations
             by the group size.  */
          stmt_info = vinfo_for_stmt (DR_STMT (dr0));
          if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
            npeel /= GROUP_SIZE (stmt_info);

          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "Try peeling by %d\n", npeel);
        }

      /* Ensure that all data refs can be vectorized after the peel.  */
      FOR_EACH_VEC_ELT (datarefs, i, dr)
        {
          int save_misalignment;

          if (dr == dr0)
            continue;

          stmt = DR_STMT (dr);
          stmt_info = vinfo_for_stmt (stmt);
          /* For interleaving, only the alignment of the first access
             matters.  */
          if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
              && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
            continue;

          /* Strided loads perform only component accesses, alignment is
             irrelevant for them.  */
          if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
            continue;

          save_misalignment = DR_MISALIGNMENT (dr);
          vect_update_misalignment_for_peel (dr, dr0, npeel);
          supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
          SET_DR_MISALIGNMENT (dr, save_misalignment);

          if (!supportable_dr_alignment)
            {
              do_peeling = false;
              break;
            }
        }

      if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
        {
          stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
          if (!stat)
            do_peeling = false;
          else
            {
              body_cost_vec.release ();
              return stat;
            }
        }

      if (do_peeling)
        {
          unsigned max_allowed_peel
            = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
          if (max_allowed_peel != (unsigned)-1)
            {
              unsigned max_peel = npeel;
              if (max_peel == 0)
                {
                  gimple dr_stmt = DR_STMT (dr0);
                  stmt_vec_info vinfo = vinfo_for_stmt (dr_stmt);
                  tree vtype = STMT_VINFO_VECTYPE (vinfo);
                  max_peel = TYPE_VECTOR_SUBPARTS (vtype) - 1;
                }
              if (max_peel > max_allowed_peel)
                {
                  do_peeling = false;
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_NOTE, vect_location,
                        "Disable peeling, max peels reached: %d\n", max_peel);
                }
            }
        }

      if (do_peeling)
        {
          /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
             If the misalignment of DR_i is identical to that of dr0 then set
             DR_MISALIGNMENT (DR_i) to zero.  If the misalignment of DR_i and
             dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
             by the peeling factor times the element size of DR_i (MOD the
             vectorization factor times the size).  Otherwise, the
             misalignment of DR_i must be set to unknown.  */
          FOR_EACH_VEC_ELT (datarefs, i, dr)
            if (dr != dr0)
              vect_update_misalignment_for_peel (dr, dr0, npeel);

          LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
          if (npeel)
            LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
          else
            LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
              = DR_MISALIGNMENT (dr0);
          SET_DR_MISALIGNMENT (dr0, 0);
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_NOTE, vect_location,
                               "Alignment of access forced using peeling.\n");
              dump_printf_loc (MSG_NOTE, vect_location,
                               "Peeling for alignment will be applied.\n");
            }
          /* The inside-loop cost will be accounted for in vectorizable_load
             and vectorizable_store correctly with adjusted alignments.
             Drop the body_cst_vec on the floor here.  */
          body_cost_vec.release ();

          stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
          gcc_assert (stat);
          return stat;
        }
    }

  body_cost_vec.release ();

  /* (2) Versioning to force alignment.  */

  /* Try versioning if:
     1) optimize loop for speed
     2) there is at least one unsupported misaligned data ref with an unknown
        misalignment, and
     3) all misaligned data refs with a known misalignment are supported, and
     4) the number of runtime alignment checks is within reason.  */

  do_versioning =
        optimize_loop_nest_for_speed_p (loop)
        && (!loop->inner); /* FORNOW */

  if (do_versioning)
    {
      FOR_EACH_VEC_ELT (datarefs, i, dr)
        {
          stmt = DR_STMT (dr);
          stmt_info = vinfo_for_stmt (stmt);

          /* For interleaving, only the alignment of the first access
             matters.  */
          if (aligned_access_p (dr)
              || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
                  && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
            continue;

          /* Strided loads perform only component accesses, alignment is
             irrelevant for them.  */
          if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
            continue;

          supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);

          if (!supportable_dr_alignment)
            {
              int mask;

              if (known_alignment_for_access_p (dr)
                  || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
                     >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
                {
                  do_versioning = false;
                  break;
                }

              stmt = DR_STMT (dr);
              vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
              gcc_assert (vectype);

              /* The rightmost bits of an aligned address must be zeros.
                 Construct the mask needed for this test.  For example,
                 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
                 mask must be 15 = 0xf.  */
              mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;

              /* FORNOW: use the same mask to test all potentially unaligned
                 references in the loop.  The vectorizer currently supports
                 a single vector size, see the reference to
                 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
                 vectorization factor is computed.  */
              gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
                          || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
              LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
              LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
                      DR_STMT (dr));
            }
        }

      /* Versioning requires at least one misaligned data reference.  */
      if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
        do_versioning = false;
      else if (!do_versioning)
        LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
    }

  if (do_versioning)
    {
      vec<gimple> may_misalign_stmts
        = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
      gimple stmt;

      /* It can now be assumed that the data references in the statements
         in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
         of the loop being vectorized.  */
      FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
        {
          stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
          dr = STMT_VINFO_DATA_REF (stmt_info);
          SET_DR_MISALIGNMENT (dr, 0);
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "Alignment of access forced using versioning.\n");
        }

      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "Versioning for alignment will be applied.\n");

      /* Peeling and versioning can't be done together at this time.  */
      gcc_assert (! (do_peeling && do_versioning));

      stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
      gcc_assert (stat);
      return stat;
    }

  /* This point is reached if neither peeling nor versioning is being done.  */
  gcc_assert (! (do_peeling || do_versioning));

  stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
  return stat;
}

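/* For example, with 16-byte vectors LOOP_VINFO_PTR_MASK is 0xf, and the
   versioned loop is later guarded by a runtime test (built from
   LOOP_VINFO_MAY_MISALIGN_STMTS) that the addresses of all potentially
   misaligned refs have their low four bits zero.  */
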
/* Function vect_find_same_alignment_drs.

   Update group and alignment relations according to the chosen
   vectorization factor.  */

static void
vect_find_same_alignment_drs (struct data_dependence_relation *ddr,
                              loop_vec_info loop_vinfo)
{
  unsigned int i;
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  struct data_reference *dra = DDR_A (ddr);
  struct data_reference *drb = DDR_B (ddr);
  stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
  stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
  int dra_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra))));
  int drb_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb))));
  lambda_vector dist_v;
  unsigned int loop_depth;

  if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
    return;

  if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
    return;

  /* Loop-based vectorization and known data dependence.  */
  if (DDR_NUM_DIST_VECTS (ddr) == 0)
    return;

  /* Data-dependence analysis reports a distance vector of zero
     for data-references that overlap only in the first iteration
     but have different sign step (see PR45764).
     So as a sanity check require equal DR_STEP.  */
  if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
    return;

  loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
  FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
    {
      int dist = dist_v[loop_depth];

      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "dependence distance = %d.\n", dist);

      /* Same loop iteration.  */
      if (dist == 0
          || (dist % vectorization_factor == 0 && dra_size == drb_size))
        {
          /* Two references with distance zero have the same alignment.  */
          STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
          STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_NOTE, vect_location,
                               "accesses have the same alignment.\n");
              dump_printf (MSG_NOTE,
                           "dependence distance modulo vf == 0 between ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
              dump_printf (MSG_NOTE, " and ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
              dump_printf (MSG_NOTE, "\n");
            }
        }
    }
}
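
/* A small example of the "distance modulo vf" rule above (hand-written
   illustration):  with a vectorization factor of 4 and 4-byte ints,

     for (i = 0; i < n; i++)
       a[i + 4] = a[i] + 1;

   has dependence distance 4 between the two references, so 4 % 4 == 0 and
   the element sizes match; a[i] and a[i + 4] are 16 bytes apart and
   therefore always share the same misalignment with respect to a 16-byte
   vector boundary.  */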
/* Function vect_analyze_data_refs_alignment

   Analyze the alignment of the data-references in the loop.
   Return FALSE if a data reference is found that cannot be vectorized.  */

bool
vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo,
                                  bb_vec_info bb_vinfo)
{
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "=== vect_analyze_data_refs_alignment ===\n");

  /* Mark groups of data references with same alignment using
     data dependence information.  */
  if (loop_vinfo)
    {
      vec<ddr_p> ddrs = LOOP_VINFO_DDRS (loop_vinfo);
      struct data_dependence_relation *ddr;
      unsigned int i;

      FOR_EACH_VEC_ELT (ddrs, i, ddr)
        vect_find_same_alignment_drs (ddr, loop_vinfo);
    }

  if (!vect_compute_data_refs_alignment (loop_vinfo, bb_vinfo))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: can't calculate alignment "
                         "for data ref.\n");
      return false;
    }

  return true;
}
2027 /* Analyze groups of accesses: check that DR belongs to a group of
2028 accesses of legal size, step, etc. Detect gaps, single element
2029 interleaving, and other special cases. Set grouped access info.
2030 Collect groups of strided stores for further use in SLP analysis. */
2033 vect_analyze_group_access (struct data_reference
*dr
)
2035 tree step
= DR_STEP (dr
);
2036 tree scalar_type
= TREE_TYPE (DR_REF (dr
));
2037 HOST_WIDE_INT type_size
= TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type
));
2038 gimple stmt
= DR_STMT (dr
);
2039 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
2040 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
2041 bb_vec_info bb_vinfo
= STMT_VINFO_BB_VINFO (stmt_info
);
2042 HOST_WIDE_INT dr_step
= TREE_INT_CST_LOW (step
);
2043 HOST_WIDE_INT groupsize
, last_accessed_element
= 1;
2044 bool slp_impossible
= false;
2045 struct loop
*loop
= NULL
;
2048 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
  /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
     size of the interleaving group (including gaps).  */
  groupsize = absu_hwi (dr_step) / type_size;

  /* A non-consecutive access is possible only if it is part of an
     interleaving group.  */
  if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    {
      /* Check if this DR is a part of interleaving, and is a single
         element of the group that is accessed in the loop.  */
2060 /* Gaps are supported only for loads. STEP must be a multiple of the type
2061 size. The size of the group must be a power of 2. */
2063 && (dr_step
% type_size
) == 0
2065 && exact_log2 (groupsize
) != -1)
2067 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) = stmt
;
2068 GROUP_SIZE (vinfo_for_stmt (stmt
)) = groupsize
;
2069 if (dump_enabled_p ())
2071 dump_printf_loc (MSG_NOTE
, vect_location
,
2072 "Detected single element interleaving ");
2073 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dr
));
2074 dump_printf (MSG_NOTE
, " step ");
2075 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, step
);
2076 dump_printf (MSG_NOTE
, "\n");
2081 if (dump_enabled_p ())
2082 dump_printf_loc (MSG_NOTE
, vect_location
,
2083 "Data access with gaps requires scalar "
2087 if (dump_enabled_p ())
2088 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2089 "Peeling for outer loop is not"
2094 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
) = true;
2100 if (dump_enabled_p ())
2102 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2103 "not consecutive access ");
2104 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
2105 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
2110 /* Mark the statement as unvectorizable. */
2111 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr
))) = false;
2118 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) == stmt
)
2120 /* First stmt in the interleaving chain. Check the chain. */
2121 gimple next
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt
));
2122 struct data_reference
*data_ref
= dr
;
2123 unsigned int count
= 1;
2124 tree prev_init
= DR_INIT (data_ref
);
2126 HOST_WIDE_INT diff
, gaps
= 0;
2127 unsigned HOST_WIDE_INT count_in_bytes
;
2131 /* Skip same data-refs. In case that two or more stmts share
2132 data-ref (supported only for loads), we vectorize only the first
2133 stmt, and the rest get their vectorized loads from the first
2135 if (!tree_int_cst_compare (DR_INIT (data_ref
),
2136 DR_INIT (STMT_VINFO_DATA_REF (
2137 vinfo_for_stmt (next
)))))
2139 if (DR_IS_WRITE (data_ref
))
2141 if (dump_enabled_p ())
2142 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2143 "Two store stmts share the same dr.\n");
2147 /* For load use the same data-ref load. */
2148 GROUP_SAME_DR_STMT (vinfo_for_stmt (next
)) = prev
;
2151 next
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (next
));
2156 data_ref
= STMT_VINFO_DATA_REF (vinfo_for_stmt (next
));
          /* All group members have the same STEP by construction.  */
          gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));

          /* Check that the distance between two accesses is equal to the type
             size.  Otherwise, we have gaps.  */
          diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
                  - TREE_INT_CST_LOW (prev_init)) / type_size;
2167 /* FORNOW: SLP of accesses with gaps is not supported. */
2168 slp_impossible
= true;
2169 if (DR_IS_WRITE (data_ref
))
2171 if (dump_enabled_p ())
2172 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2173 "interleaved store with gaps\n");
2180 last_accessed_element
+= diff
;
2182 /* Store the gap from the previous member of the group. If there is no
2183 gap in the access, GROUP_GAP is always 1. */
2184 GROUP_GAP (vinfo_for_stmt (next
)) = diff
;
2186 prev_init
= DR_INIT (data_ref
);
2187 next
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (next
));
2188 /* Count the number of data-refs in the chain. */
      /* COUNT is the number of accesses found, we multiply it by the size of
         the type to get COUNT_IN_BYTES.  */
      count_in_bytes = type_size * count;

      /* Check that the size of the interleaving (including gaps) is not
         greater than STEP.  */
      if (dr_step != 0
          && absu_hwi (dr_step) < count_in_bytes + gaps * type_size)
        {
2201 if (dump_enabled_p ())
2203 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2204 "interleaving size is greater than step for ");
2205 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
2207 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
2212 /* Check that the size of the interleaving is equal to STEP for stores,
2213 i.e., that there are no gaps. */
2215 && absu_hwi (dr_step
) != count_in_bytes
)
2217 if (DR_IS_READ (dr
))
2219 slp_impossible
= true;
2220 /* There is a gap after the last load in the group. This gap is a
2221 difference between the groupsize and the number of elements.
2222 When there is no gap, this difference should be 0. */
2223 GROUP_GAP (vinfo_for_stmt (stmt
)) = groupsize
- count
;
2227 if (dump_enabled_p ())
2228 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2229 "interleaved store with gaps\n");
2234 /* Check that STEP is a multiple of type size. */
2236 && (dr_step
% type_size
) != 0)
2238 if (dump_enabled_p ())
2240 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2241 "step is not a multiple of type size: step ");
2242 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, step
);
2243 dump_printf (MSG_MISSED_OPTIMIZATION
, " size ");
2244 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
2245 TYPE_SIZE_UNIT (scalar_type
));
2246 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
2254 GROUP_SIZE (vinfo_for_stmt (stmt
)) = groupsize
;
2255 if (dump_enabled_p ())
2256 dump_printf_loc (MSG_NOTE
, vect_location
,
2257 "Detected interleaving of size %d\n", (int)groupsize
);
2259 /* SLP: create an SLP data structure for every interleaving group of
2260 stores for further analysis in vect_analyse_slp. */
2261 if (DR_IS_WRITE (dr
) && !slp_impossible
)
2264 LOOP_VINFO_GROUPED_STORES (loop_vinfo
).safe_push (stmt
);
2266 BB_VINFO_GROUPED_STORES (bb_vinfo
).safe_push (stmt
);
2269 /* There is a gap in the end of the group. */
2270 if (groupsize
- last_accessed_element
> 0 && loop_vinfo
)
2272 if (dump_enabled_p ())
2273 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2274 "Data access with gaps requires scalar "
2278 if (dump_enabled_p ())
2279 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2280 "Peeling for outer loop is not supported\n");
2284 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
) = true;
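
/* Worked example for the group-size computation above (an illustrative
   sketch, not a testcase from the sources):  with 4-byte floats,

     for (i = 0; i < n; i++)
       {
         sum += rgb[3 * i];        // R
         sum += rgb[3 * i + 1];    // G
         sum += rgb[3 * i + 2];    // B
       }

   each load has DR_STEP == 12 bytes, so groupsize = 12 / 4 = 3, count == 3
   and there are no gaps; the three loads form one interleaving group whose
   first element is the load of rgb[3 * i].  */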
/* Analyze the access pattern of the data-reference DR.
   In case of non-consecutive accesses call vect_analyze_group_access() to
   analyze groups of accesses.  */

static bool
vect_analyze_data_ref_access (struct data_reference *dr)
{
2299 tree step
= DR_STEP (dr
);
2300 tree scalar_type
= TREE_TYPE (DR_REF (dr
));
2301 gimple stmt
= DR_STMT (dr
);
2302 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
2303 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
2304 struct loop
*loop
= NULL
;
2307 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
2309 if (loop_vinfo
&& !step
)
2311 if (dump_enabled_p ())
2312 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2313 "bad data-ref access in loop\n");
2317 /* Allow invariant loads in not nested loops. */
2318 if (loop_vinfo
&& integer_zerop (step
))
2320 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) = NULL
;
2321 if (nested_in_vect_loop_p (loop
, stmt
))
2323 if (dump_enabled_p ())
2324 dump_printf_loc (MSG_NOTE
, vect_location
,
2325 "zero step in inner loop of nest\n");
2328 return DR_IS_READ (dr
);
2331 if (loop
&& nested_in_vect_loop_p (loop
, stmt
))
2333 /* Interleaved accesses are not yet supported within outer-loop
2334 vectorization for references in the inner-loop. */
2335 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) = NULL
;
2337 /* For the rest of the analysis we use the outer-loop step. */
2338 step
= STMT_VINFO_DR_STEP (stmt_info
);
2339 if (integer_zerop (step
))
2341 if (dump_enabled_p ())
2342 dump_printf_loc (MSG_NOTE
, vect_location
,
2343 "zero step in outer loop.\n");
2344 if (DR_IS_READ (dr
))
2352 if (TREE_CODE (step
) == INTEGER_CST
)
2354 HOST_WIDE_INT dr_step
= TREE_INT_CST_LOW (step
);
2355 if (!tree_int_cst_compare (step
, TYPE_SIZE_UNIT (scalar_type
))
2357 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type
), -dr_step
)))
2359 /* Mark that it is not interleaving. */
2360 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) = NULL
;
2365 if (loop
&& nested_in_vect_loop_p (loop
, stmt
))
2367 if (dump_enabled_p ())
2368 dump_printf_loc (MSG_NOTE
, vect_location
,
2369 "grouped access in outer loop.\n");
2373 /* Assume this is a DR handled by non-constant strided load case. */
2374 if (TREE_CODE (step
) != INTEGER_CST
)
2375 return STMT_VINFO_STRIDE_LOAD_P (stmt_info
);
2377 /* Not consecutive access - check if it's a part of interleaving group. */
2378 return vect_analyze_group_access (dr
);
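
/* Quick illustration of the access kinds classified above (hand-written
   examples), assuming 4-byte ints and loop counter i:

     a[i]      DR_STEP == 4 == type size  -> consecutive, not interleaved
     a[0]      DR_STEP == 0               -> invariant load
     a[2 * i]  DR_STEP == 8               -> handed to vect_analyze_group_access
     a[i * m]  DR_STEP not an INTEGER_CST -> strided load path, if supported  */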
2383 /* A helper function used in the comparator function to sort data
2384 references. T1 and T2 are two data references to be compared.
2385 The function returns -1, 0, or 1. */
2388 compare_tree (tree t1
, tree t2
)
2391 enum tree_code code
;
2402 if (TREE_CODE (t1
) != TREE_CODE (t2
))
2403 return TREE_CODE (t1
) < TREE_CODE (t2
) ? -1 : 1;
2405 code
= TREE_CODE (t1
);
2408 /* For const values, we can just use hash values for comparisons. */
2416 hashval_t h1
= iterative_hash_expr (t1
, 0);
2417 hashval_t h2
= iterative_hash_expr (t2
, 0);
2419 return h1
< h2
? -1 : 1;
2424 cmp
= compare_tree (SSA_NAME_VAR (t1
), SSA_NAME_VAR (t2
));
2428 if (SSA_NAME_VERSION (t1
) != SSA_NAME_VERSION (t2
))
2429 return SSA_NAME_VERSION (t1
) < SSA_NAME_VERSION (t2
) ? -1 : 1;
2433 tclass
= TREE_CODE_CLASS (code
);
2435 /* For var-decl, we could compare their UIDs. */
2436 if (tclass
== tcc_declaration
)
2438 if (DECL_UID (t1
) != DECL_UID (t2
))
2439 return DECL_UID (t1
) < DECL_UID (t2
) ? -1 : 1;
2443 /* For expressions with operands, compare their operands recursively. */
2444 for (i
= TREE_OPERAND_LENGTH (t1
) - 1; i
>= 0; --i
)
2446 cmp
= compare_tree (TREE_OPERAND (t1
, i
), TREE_OPERAND (t2
, i
));
2456 /* Compare two data-references DRA and DRB to group them into chunks
2457 suitable for grouping. */
2460 dr_group_sort_cmp (const void *dra_
, const void *drb_
)
2462 data_reference_p dra
= *(data_reference_p
*)const_cast<void *>(dra_
);
2463 data_reference_p drb
= *(data_reference_p
*)const_cast<void *>(drb_
);
2466 /* Stabilize sort. */
2470 /* Ordering of DRs according to base. */
2471 if (!operand_equal_p (DR_BASE_ADDRESS (dra
), DR_BASE_ADDRESS (drb
), 0))
2473 cmp
= compare_tree (DR_BASE_ADDRESS (dra
), DR_BASE_ADDRESS (drb
));
2478 /* And according to DR_OFFSET. */
2479 if (!dr_equal_offsets_p (dra
, drb
))
2481 cmp
= compare_tree (DR_OFFSET (dra
), DR_OFFSET (drb
));
2486 /* Put reads before writes. */
2487 if (DR_IS_READ (dra
) != DR_IS_READ (drb
))
2488 return DR_IS_READ (dra
) ? -1 : 1;
2490 /* Then sort after access size. */
2491 if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra
))),
2492 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb
))), 0))
2494 cmp
= compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra
))),
2495 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb
))));
2500 /* And after step. */
2501 if (!operand_equal_p (DR_STEP (dra
), DR_STEP (drb
), 0))
2503 cmp
= compare_tree (DR_STEP (dra
), DR_STEP (drb
));
2508 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2509 cmp
= tree_int_cst_compare (DR_INIT (dra
), DR_INIT (drb
));
2511 return gimple_uid (DR_STMT (dra
)) < gimple_uid (DR_STMT (drb
)) ? -1 : 1;
2515 /* Function vect_analyze_data_ref_accesses.
2517 Analyze the access pattern of all the data references in the loop.
2519 FORNOW: the only access pattern that is considered vectorizable is a
2520 simple step 1 (consecutive) access.
2522 FORNOW: handle only arrays and pointer accesses. */
2525 vect_analyze_data_ref_accesses (loop_vec_info loop_vinfo
, bb_vec_info bb_vinfo
)
2528 vec
<data_reference_p
> datarefs
;
2529 struct data_reference
*dr
;
2531 if (dump_enabled_p ())
2532 dump_printf_loc (MSG_NOTE
, vect_location
,
2533 "=== vect_analyze_data_ref_accesses ===\n");
2536 datarefs
= LOOP_VINFO_DATAREFS (loop_vinfo
);
2538 datarefs
= BB_VINFO_DATAREFS (bb_vinfo
);
2540 if (datarefs
.is_empty ())
2543 /* Sort the array of datarefs to make building the interleaving chains
2544 linear. Don't modify the original vector's order, it is needed for
2545 determining what dependencies are reversed. */
2546 vec
<data_reference_p
> datarefs_copy
= datarefs
.copy ();
2547 datarefs_copy
.qsort (dr_group_sort_cmp
);
2549 /* Build the interleaving chains. */
2550 for (i
= 0; i
< datarefs_copy
.length () - 1;)
2552 data_reference_p dra
= datarefs_copy
[i
];
2553 stmt_vec_info stmtinfo_a
= vinfo_for_stmt (DR_STMT (dra
));
2554 stmt_vec_info lastinfo
= NULL
;
2555 for (i
= i
+ 1; i
< datarefs_copy
.length (); ++i
)
2557 data_reference_p drb
= datarefs_copy
[i
];
2558 stmt_vec_info stmtinfo_b
= vinfo_for_stmt (DR_STMT (drb
));
          /* ??? Imperfect sorting (non-compatible types, non-modulo
             accesses, same accesses) can lead to a group being artificially
             split here as we don't just skip over those.  If it really
             matters we can push those to a worklist and re-iterate
             over them.  Then we can just skip ahead to the next DR here.  */
2566 /* Check that the data-refs have same first location (except init)
2567 and they are both either store or load (not load and store,
2568 not masked loads or stores). */
2569 if (DR_IS_READ (dra
) != DR_IS_READ (drb
)
2570 || !operand_equal_p (DR_BASE_ADDRESS (dra
),
2571 DR_BASE_ADDRESS (drb
), 0)
2572 || !dr_equal_offsets_p (dra
, drb
)
2573 || !gimple_assign_single_p (DR_STMT (dra
))
2574 || !gimple_assign_single_p (DR_STMT (drb
)))
2577 /* Check that the data-refs have the same constant size and step. */
2578 tree sza
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra
)));
2579 tree szb
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb
)));
2580 if (!tree_fits_uhwi_p (sza
)
2581 || !tree_fits_uhwi_p (szb
)
2582 || !tree_int_cst_equal (sza
, szb
)
2583 || !tree_fits_shwi_p (DR_STEP (dra
))
2584 || !tree_fits_shwi_p (DR_STEP (drb
))
2585 || !tree_int_cst_equal (DR_STEP (dra
), DR_STEP (drb
)))
2588 /* Do not place the same access in the interleaving chain twice. */
2589 if (tree_int_cst_compare (DR_INIT (dra
), DR_INIT (drb
)) == 0)
2592 /* Check the types are compatible.
2593 ??? We don't distinguish this during sorting. */
2594 if (!types_compatible_p (TREE_TYPE (DR_REF (dra
)),
2595 TREE_TYPE (DR_REF (drb
))))
2598 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2599 HOST_WIDE_INT init_a
= TREE_INT_CST_LOW (DR_INIT (dra
));
2600 HOST_WIDE_INT init_b
= TREE_INT_CST_LOW (DR_INIT (drb
));
2601 gcc_assert (init_a
< init_b
);
2603 /* If init_b == init_a + the size of the type * k, we have an
2604 interleaving, and DRA is accessed before DRB. */
2605 HOST_WIDE_INT type_size_a
= tree_to_uhwi (sza
);
2606 if ((init_b
- init_a
) % type_size_a
!= 0)
2609 /* The step (if not zero) is greater than the difference between
2610 data-refs' inits. This splits groups into suitable sizes. */
2611 HOST_WIDE_INT step
= tree_to_shwi (DR_STEP (dra
));
2612 if (step
!= 0 && step
<= (init_b
- init_a
))
2615 if (dump_enabled_p ())
2617 dump_printf_loc (MSG_NOTE
, vect_location
,
2618 "Detected interleaving ");
2619 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dra
));
2620 dump_printf (MSG_NOTE
, " and ");
2621 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (drb
));
2622 dump_printf (MSG_NOTE
, "\n");
2625 /* Link the found element into the group list. */
2626 if (!GROUP_FIRST_ELEMENT (stmtinfo_a
))
2628 GROUP_FIRST_ELEMENT (stmtinfo_a
) = DR_STMT (dra
);
2629 lastinfo
= stmtinfo_a
;
2631 GROUP_FIRST_ELEMENT (stmtinfo_b
) = DR_STMT (dra
);
2632 GROUP_NEXT_ELEMENT (lastinfo
) = DR_STMT (drb
);
2633 lastinfo
= stmtinfo_b
;
2637 FOR_EACH_VEC_ELT (datarefs_copy
, i
, dr
)
2638 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr
)))
2639 && !vect_analyze_data_ref_access (dr
))
2641 if (dump_enabled_p ())
2642 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2643 "not vectorized: complicated access pattern.\n");
2647 /* Mark the statement as not vectorizable. */
2648 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr
))) = false;
2653 datarefs_copy
.release ();
2658 datarefs_copy
.release ();
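
/* Example for the chain-building conditions above (hand-written, 4-byte
   ints):  for the stores a[2 * i] = x; a[2 * i + 1] = y; the two data-refs
   have DR_INIT 0 and 4, so (init_b - init_a) % type_size == 4 % 4 == 0, and
   DR_STEP == 8 is greater than init_b - init_a == 4, so the pair is not
   split and both stores are linked into one interleaving chain headed by
   the store to a[2 * i].  */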
/* Operator == between two dr_with_seg_len objects.

   This equality operator is used to make sure two data refs
   are the same one so that we will consider combining the
   aliasing checks of those two pairs of data dependent data
   refs.  */

static bool
operator == (const dr_with_seg_len &d1,
             const dr_with_seg_len &d2)
{
  return operand_equal_p (DR_BASE_ADDRESS (d1.dr),
                          DR_BASE_ADDRESS (d2.dr), 0)
           && compare_tree (d1.offset, d2.offset) == 0
           && compare_tree (d1.seg_len, d2.seg_len) == 0;
}
2680 /* Function comp_dr_with_seg_len_pair.
2682 Comparison function for sorting objects of dr_with_seg_len_pair_t
2683 so that we can combine aliasing checks in one scan. */
2686 comp_dr_with_seg_len_pair (const void *p1_
, const void *p2_
)
2688 const dr_with_seg_len_pair_t
* p1
= (const dr_with_seg_len_pair_t
*) p1_
;
2689 const dr_with_seg_len_pair_t
* p2
= (const dr_with_seg_len_pair_t
*) p2_
;
2691 const dr_with_seg_len
&p11
= p1
->first
,
  /* For DR pairs (a, b) and (c, d), we only consider merging the alias checks
     if a and c have the same base address and step, and b and d have the same
     address and step.  Therefore, if either a&c or b&d don't have the same
     address and step, we don't care about the order of those two pairs after
     sorting.  */
2702 if ((comp_res
= compare_tree (DR_BASE_ADDRESS (p11
.dr
),
2703 DR_BASE_ADDRESS (p21
.dr
))) != 0)
2705 if ((comp_res
= compare_tree (DR_BASE_ADDRESS (p12
.dr
),
2706 DR_BASE_ADDRESS (p22
.dr
))) != 0)
2708 if ((comp_res
= compare_tree (DR_STEP (p11
.dr
), DR_STEP (p21
.dr
))) != 0)
2710 if ((comp_res
= compare_tree (DR_STEP (p12
.dr
), DR_STEP (p22
.dr
))) != 0)
2712 if ((comp_res
= compare_tree (p11
.offset
, p21
.offset
)) != 0)
2714 if ((comp_res
= compare_tree (p12
.offset
, p22
.offset
)) != 0)
/* Function vect_vfa_segment_size.

   Create an expression that computes the size of segment
   that will be accessed for a data reference.  The function takes into
   account that realignment loads may access one more vector.

   Input:
     DR: The data reference.
     LENGTH_FACTOR: segment length to consider.

   Return an expression whose value is the size of segment which will be
   accessed by DR.  */

static tree
vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
{
  tree segment_length;

  if (integer_zerop (DR_STEP (dr)))
    segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
  else
    segment_length = size_binop (MULT_EXPR,
                                 fold_convert (sizetype, DR_STEP (dr)),
                                 fold_convert (sizetype, length_factor));

  if (vect_supportable_dr_alignment (dr, false)
        == dr_explicit_realign_optimized)
    {
      tree vector_size = TYPE_SIZE_UNIT
                          (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));

      segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
    }
  return segment_length;
}
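
/* Worked example for the segment size above (illustrative numbers):  a
   data-ref with DR_STEP == 4 (consecutive 4-byte ints) and LENGTH_FACTOR
   equal to a vectorization factor of 4 gets a segment of 4 * 4 == 16 bytes;
   an invariant ref (step 0) contributes only its own type size, and a ref
   using the optimized explicit-realignment scheme is widened by one extra
   vector.  */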
/* Function vect_prune_runtime_alias_test_list.

   Prune a list of ddrs to be tested at run-time by versioning for alias.
   Merge several alias checks into one if possible.
   Return FALSE if the resulting list of ddrs is longer than allowed by
   PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE.  */
2764 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo
)
2766 vec
<ddr_p
> may_alias_ddrs
=
2767 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo
);
2768 vec
<dr_with_seg_len_pair_t
>& comp_alias_ddrs
=
2769 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo
);
2770 int vect_factor
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
2771 tree scalar_loop_iters
= LOOP_VINFO_NITERS (loop_vinfo
);
2777 if (dump_enabled_p ())
2778 dump_printf_loc (MSG_NOTE
, vect_location
,
2779 "=== vect_prune_runtime_alias_test_list ===\n");
2781 if (may_alias_ddrs
.is_empty ())
  /* Basically, for each pair of dependent data refs store_ptr_0
     and load_ptr_0, we create an expression:

     ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
     || (load_ptr_0 + load_segment_length_0) <= store_ptr_0))

     for aliasing checks.  However, in some cases we can decrease
     the number of checks by combining two checks into one.  For
     example, suppose we have another pair of data refs store_ptr_0
     and load_ptr_1, and if the following condition is satisfied:

     load_ptr_0 < load_ptr_1  &&
     load_ptr_1 - load_ptr_0 - load_segment_length_0 < store_segment_length_0

     (this condition means, in each iteration of the vectorized loop,
     the accessed memory of store_ptr_0 cannot be between the memory
     of load_ptr_0 and load_ptr_1.)

     we then can use only the following expression to finish the
     aliasing checks between store_ptr_0 & load_ptr_0 and
     store_ptr_0 & load_ptr_1:

     ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
     || (load_ptr_1 + load_segment_length_1 <= store_ptr_0))

     Note that we only consider that load_ptr_0 and load_ptr_1 have the
     same base address.  */
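
  /* A concrete instance of the merge above (illustrative, made-up segment
     lengths):  if load_ptr_1 == load_ptr_0 + 8 and all three segment
     lengths are 16 bytes, then 8 - 16 < 16 holds and the two checks
     against store_ptr_0 collapse into

       (store_ptr_0 + 16 <= load_ptr_0) || (load_ptr_1 + 16 <= store_ptr_0)

     i.e. one range test instead of two.  */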
2812 comp_alias_ddrs
.create (may_alias_ddrs
.length ());
2814 /* First, we collect all data ref pairs for aliasing checks. */
2815 FOR_EACH_VEC_ELT (may_alias_ddrs
, i
, ddr
)
2817 struct data_reference
*dr_a
, *dr_b
;
2818 gimple dr_group_first_a
, dr_group_first_b
;
2819 tree segment_length_a
, segment_length_b
;
2820 gimple stmt_a
, stmt_b
;
2823 stmt_a
= DR_STMT (DDR_A (ddr
));
2824 dr_group_first_a
= GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a
));
2825 if (dr_group_first_a
)
2827 stmt_a
= dr_group_first_a
;
2828 dr_a
= STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a
));
2832 stmt_b
= DR_STMT (DDR_B (ddr
));
2833 dr_group_first_b
= GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b
));
2834 if (dr_group_first_b
)
2836 stmt_b
= dr_group_first_b
;
2837 dr_b
= STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b
));
2840 if (!operand_equal_p (DR_STEP (dr_a
), DR_STEP (dr_b
), 0))
2841 length_factor
= scalar_loop_iters
;
2843 length_factor
= size_int (vect_factor
);
2844 segment_length_a
= vect_vfa_segment_size (dr_a
, length_factor
);
2845 segment_length_b
= vect_vfa_segment_size (dr_b
, length_factor
);
2847 dr_with_seg_len_pair_t dr_with_seg_len_pair
2848 (dr_with_seg_len (dr_a
, segment_length_a
),
2849 dr_with_seg_len (dr_b
, segment_length_b
));
2851 if (compare_tree (DR_BASE_ADDRESS (dr_a
), DR_BASE_ADDRESS (dr_b
)) > 0)
2852 std::swap (dr_with_seg_len_pair
.first
, dr_with_seg_len_pair
.second
);
2854 comp_alias_ddrs
.safe_push (dr_with_seg_len_pair
);
2857 /* Second, we sort the collected data ref pairs so that we can scan
2858 them once to combine all possible aliasing checks. */
2859 comp_alias_ddrs
.qsort (comp_dr_with_seg_len_pair
);
2861 /* Third, we scan the sorted dr pairs and check if we can combine
2862 alias checks of two neighbouring dr pairs. */
2863 for (size_t i
= 1; i
< comp_alias_ddrs
.length (); ++i
)
2865 /* Deal with two ddrs (dr_a1, dr_b1) and (dr_a2, dr_b2). */
2866 dr_with_seg_len
*dr_a1
= &comp_alias_ddrs
[i
-1].first
,
2867 *dr_b1
= &comp_alias_ddrs
[i
-1].second
,
2868 *dr_a2
= &comp_alias_ddrs
[i
].first
,
2869 *dr_b2
= &comp_alias_ddrs
[i
].second
;
2871 /* Remove duplicate data ref pairs. */
2872 if (*dr_a1
== *dr_a2
&& *dr_b1
== *dr_b2
)
2874 if (dump_enabled_p ())
2876 dump_printf_loc (MSG_NOTE
, vect_location
,
2877 "found equal ranges ");
2878 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
2879 DR_REF (dr_a1
->dr
));
2880 dump_printf (MSG_NOTE
, ", ");
2881 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
2882 DR_REF (dr_b1
->dr
));
2883 dump_printf (MSG_NOTE
, " and ");
2884 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
2885 DR_REF (dr_a2
->dr
));
2886 dump_printf (MSG_NOTE
, ", ");
2887 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
2888 DR_REF (dr_b2
->dr
));
2889 dump_printf (MSG_NOTE
, "\n");
2892 comp_alias_ddrs
.ordered_remove (i
--);
2896 if (*dr_a1
== *dr_a2
|| *dr_b1
== *dr_b2
)
2898 /* We consider the case that DR_B1 and DR_B2 are same memrefs,
2899 and DR_A1 and DR_A2 are two consecutive memrefs. */
2900 if (*dr_a1
== *dr_a2
)
2902 std::swap (dr_a1
, dr_b1
);
2903 std::swap (dr_a2
, dr_b2
);
2906 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a1
->dr
),
2907 DR_BASE_ADDRESS (dr_a2
->dr
),
2909 || !tree_fits_shwi_p (dr_a1
->offset
)
2910 || !tree_fits_shwi_p (dr_a2
->offset
))
2913 HOST_WIDE_INT diff
= (tree_to_shwi (dr_a2
->offset
)
2914 - tree_to_shwi (dr_a1
->offset
));
2917 /* Now we check if the following condition is satisfied:
2919 DIFF - SEGMENT_LENGTH_A < SEGMENT_LENGTH_B
2921 where DIFF = DR_A2->OFFSET - DR_A1->OFFSET. However,
2922 SEGMENT_LENGTH_A or SEGMENT_LENGTH_B may not be constant so we
2923 have to make a best estimation. We can get the minimum value
2924 of SEGMENT_LENGTH_B as a constant, represented by MIN_SEG_LEN_B,
2925 then either of the following two conditions can guarantee the
2928 1: DIFF <= MIN_SEG_LEN_B
2929 2: DIFF - SEGMENT_LENGTH_A < MIN_SEG_LEN_B
2933 HOST_WIDE_INT min_seg_len_b
= (tree_fits_shwi_p (dr_b1
->seg_len
)
2934 ? tree_to_shwi (dr_b1
->seg_len
)
2937 if (diff
<= min_seg_len_b
2938 || (tree_fits_shwi_p (dr_a1
->seg_len
)
2939 && diff
- tree_to_shwi (dr_a1
->seg_len
) < min_seg_len_b
))
2941 if (dump_enabled_p ())
2943 dump_printf_loc (MSG_NOTE
, vect_location
,
2944 "merging ranges for ");
2945 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
2946 DR_REF (dr_a1
->dr
));
2947 dump_printf (MSG_NOTE
, ", ");
2948 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
2949 DR_REF (dr_b1
->dr
));
2950 dump_printf (MSG_NOTE
, " and ");
2951 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
2952 DR_REF (dr_a2
->dr
));
2953 dump_printf (MSG_NOTE
, ", ");
2954 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
2955 DR_REF (dr_b2
->dr
));
2956 dump_printf (MSG_NOTE
, "\n");
2959 dr_a1
->seg_len
= size_binop (PLUS_EXPR
,
2960 dr_a2
->seg_len
, size_int (diff
));
2961 comp_alias_ddrs
.ordered_remove (i
--);
2966 dump_printf_loc (MSG_NOTE
, vect_location
,
2967 "improved number of alias checks from %d to %d\n",
2968 may_alias_ddrs
.length (), comp_alias_ddrs
.length ());
2969 if ((int) comp_alias_ddrs
.length () >
2970 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS
))
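
/* Illustrative sketch of what the surviving checks become once the loop is
   versioned for alias (hand-written; the actual guard is emitted later by
   the loop-versioning code, see vect_create_cond_for_alias_checks):

     if ((a + seg_a <= b || b + seg_b <= a)
         && (c + seg_c <= d || d + seg_d <= c))
       { ... vectorized loop ... }
     else
       { ... scalar loop ... }

   with one (pointer, segment length) pair per entry left in
   comp_alias_ddrs after the pruning and merging done above.  */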
2976 /* Check whether a non-affine read in stmt is suitable for gather load
2977 and if so, return a builtin decl for that operation. */
2980 vect_check_gather (gimple stmt
, loop_vec_info loop_vinfo
, tree
*basep
,
2981 tree
*offp
, int *scalep
)
2983 HOST_WIDE_INT scale
= 1, pbitpos
, pbitsize
;
2984 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
2985 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
2986 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
2987 tree offtype
= NULL_TREE
;
2988 tree decl
, base
, off
;
2990 int punsignedp
, pvolatilep
;
2993 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
2994 see if we can use the def stmt of the address. */
2995 if (is_gimple_call (stmt
)
2996 && gimple_call_internal_p (stmt
)
2997 && (gimple_call_internal_fn (stmt
) == IFN_MASK_LOAD
2998 || gimple_call_internal_fn (stmt
) == IFN_MASK_STORE
)
2999 && TREE_CODE (base
) == MEM_REF
3000 && TREE_CODE (TREE_OPERAND (base
, 0)) == SSA_NAME
3001 && integer_zerop (TREE_OPERAND (base
, 1))
3002 && !expr_invariant_in_loop_p (loop
, TREE_OPERAND (base
, 0)))
3004 gimple def_stmt
= SSA_NAME_DEF_STMT (TREE_OPERAND (base
, 0));
3005 if (is_gimple_assign (def_stmt
)
3006 && gimple_assign_rhs_code (def_stmt
) == ADDR_EXPR
)
3007 base
= TREE_OPERAND (gimple_assign_rhs1 (def_stmt
), 0);
  /* The gather builtins need address of the form
       loop_invariant + vector * {1, 2, 4, 8}
     or
       loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
     Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
     of loop invariants/SSA_NAMEs defined in the loop, with casts,
     multiplications and additions in it.  To get a vector, we need
     a single SSA_NAME that will be defined in the loop and will
     contain everything that is not loop invariant and that can be
     vectorized.  The following code attempts to find such a preexisting
     SSA_NAME OFF and put the loop invariants into a tree BASE
     that can be gimplified before the loop.  */
3022 base
= get_inner_reference (base
, &pbitsize
, &pbitpos
, &off
,
3023 &pmode
, &punsignedp
, &pvolatilep
, false);
3024 gcc_assert (base
!= NULL_TREE
&& (pbitpos
% BITS_PER_UNIT
) == 0);
3026 if (TREE_CODE (base
) == MEM_REF
)
3028 if (!integer_zerop (TREE_OPERAND (base
, 1)))
3030 if (off
== NULL_TREE
)
3032 offset_int moff
= mem_ref_offset (base
);
3033 off
= wide_int_to_tree (sizetype
, moff
);
3036 off
= size_binop (PLUS_EXPR
, off
,
3037 fold_convert (sizetype
, TREE_OPERAND (base
, 1)));
3039 base
= TREE_OPERAND (base
, 0);
3042 base
= build_fold_addr_expr (base
);
3044 if (off
== NULL_TREE
)
3045 off
= size_zero_node
;
3047 /* If base is not loop invariant, either off is 0, then we start with just
3048 the constant offset in the loop invariant BASE and continue with base
3049 as OFF, otherwise give up.
3050 We could handle that case by gimplifying the addition of base + off
3051 into some SSA_NAME and use that as off, but for now punt. */
3052 if (!expr_invariant_in_loop_p (loop
, base
))
3054 if (!integer_zerop (off
))
3057 base
= size_int (pbitpos
/ BITS_PER_UNIT
);
3059 /* Otherwise put base + constant offset into the loop invariant BASE
3060 and continue with OFF. */
3063 base
= fold_convert (sizetype
, base
);
3064 base
= size_binop (PLUS_EXPR
, base
, size_int (pbitpos
/ BITS_PER_UNIT
));
3067 /* OFF at this point may be either a SSA_NAME or some tree expression
3068 from get_inner_reference. Try to peel off loop invariants from it
3069 into BASE as long as possible. */
3071 while (offtype
== NULL_TREE
)
3073 enum tree_code code
;
3074 tree op0
, op1
, add
= NULL_TREE
;
3076 if (TREE_CODE (off
) == SSA_NAME
)
3078 gimple def_stmt
= SSA_NAME_DEF_STMT (off
);
3080 if (expr_invariant_in_loop_p (loop
, off
))
3083 if (gimple_code (def_stmt
) != GIMPLE_ASSIGN
)
3086 op0
= gimple_assign_rhs1 (def_stmt
);
3087 code
= gimple_assign_rhs_code (def_stmt
);
3088 op1
= gimple_assign_rhs2 (def_stmt
);
3092 if (get_gimple_rhs_class (TREE_CODE (off
)) == GIMPLE_TERNARY_RHS
)
3094 code
= TREE_CODE (off
);
3095 extract_ops_from_tree (off
, &code
, &op0
, &op1
);
3099 case POINTER_PLUS_EXPR
:
3101 if (expr_invariant_in_loop_p (loop
, op0
))
3106 add
= fold_convert (sizetype
, add
);
3108 add
= size_binop (MULT_EXPR
, add
, size_int (scale
));
3109 base
= size_binop (PLUS_EXPR
, base
, add
);
3112 if (expr_invariant_in_loop_p (loop
, op1
))
3120 if (expr_invariant_in_loop_p (loop
, op1
))
3122 add
= fold_convert (sizetype
, op1
);
3123 add
= size_binop (MINUS_EXPR
, size_zero_node
, add
);
3129 if (scale
== 1 && tree_fits_shwi_p (op1
))
3131 scale
= tree_to_shwi (op1
);
3140 if (!POINTER_TYPE_P (TREE_TYPE (op0
))
3141 && !INTEGRAL_TYPE_P (TREE_TYPE (op0
)))
3143 if (TYPE_PRECISION (TREE_TYPE (op0
))
3144 == TYPE_PRECISION (TREE_TYPE (off
)))
3149 if (TYPE_PRECISION (TREE_TYPE (op0
))
3150 < TYPE_PRECISION (TREE_TYPE (off
)))
3153 offtype
= TREE_TYPE (off
);
3164 /* If at the end OFF still isn't a SSA_NAME or isn't
3165 defined in the loop, punt. */
3166 if (TREE_CODE (off
) != SSA_NAME
3167 || expr_invariant_in_loop_p (loop
, off
))
3170 if (offtype
== NULL_TREE
)
3171 offtype
= TREE_TYPE (off
);
3173 decl
= targetm
.vectorize
.builtin_gather (STMT_VINFO_VECTYPE (stmt_info
),
3175 if (decl
== NULL_TREE
)
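
/* Example of the decomposition performed above (hand-written, not a dump):
   for a gather-style load such as

     for (i = 0; i < n; i++)
       sum += data[idx[i]];

   the address &data[idx[i]] is split into BASE = &data (loop invariant),
   OFF = the SSA_NAME holding idx[i] (defined in the loop) and SCALE = 4,
   which matches the loop_invariant + vector * {1, 2, 4, 8} form required
   by the target's gather builtin.  */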
3187 /* Function vect_analyze_data_refs.
3189 Find all the data references in the loop or basic block.
3191 The general structure of the analysis of data refs in the vectorizer is as
3193 1- vect_analyze_data_refs(loop/bb): call
3194 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3195 in the loop/bb and their dependences.
3196 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3197 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3198 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3203 vect_analyze_data_refs (loop_vec_info loop_vinfo
,
3204 bb_vec_info bb_vinfo
,
3205 int *min_vf
, unsigned *n_stmts
)
3207 struct loop
*loop
= NULL
;
3208 basic_block bb
= NULL
;
3210 vec
<data_reference_p
> datarefs
;
3211 struct data_reference
*dr
;
3214 if (dump_enabled_p ())
3215 dump_printf_loc (MSG_NOTE
, vect_location
,
3216 "=== vect_analyze_data_refs ===\n");
3220 basic_block
*bbs
= LOOP_VINFO_BBS (loop_vinfo
);
3222 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
3223 datarefs
= LOOP_VINFO_DATAREFS (loop_vinfo
);
3224 if (!find_loop_nest (loop
, &LOOP_VINFO_LOOP_NEST (loop_vinfo
)))
3226 if (dump_enabled_p ())
3227 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3228 "not vectorized: loop contains function calls"
3229 " or data references that cannot be analyzed\n");
3233 for (i
= 0; i
< loop
->num_nodes
; i
++)
3235 gimple_stmt_iterator gsi
;
3237 for (gsi
= gsi_start_bb (bbs
[i
]); !gsi_end_p (gsi
); gsi_next (&gsi
))
3239 gimple stmt
= gsi_stmt (gsi
);
3240 if (is_gimple_debug (stmt
))
3243 if (!find_data_references_in_stmt (loop
, stmt
, &datarefs
))
3245 if (is_gimple_call (stmt
) && loop
->safelen
)
3247 tree fndecl
= gimple_call_fndecl (stmt
), op
;
3248 if (fndecl
!= NULL_TREE
)
3250 struct cgraph_node
*node
= cgraph_node::get (fndecl
);
3251 if (node
!= NULL
&& node
->simd_clones
!= NULL
)
3253 unsigned int j
, n
= gimple_call_num_args (stmt
);
3254 for (j
= 0; j
< n
; j
++)
3256 op
= gimple_call_arg (stmt
, j
);
3258 || (REFERENCE_CLASS_P (op
)
3259 && get_base_address (op
)))
3262 op
= gimple_call_lhs (stmt
);
3263 /* Ignore #pragma omp declare simd functions
3264 if they don't have data references in the
3265 call stmt itself. */
3269 || (REFERENCE_CLASS_P (op
)
3270 && get_base_address (op
)))))
3275 LOOP_VINFO_DATAREFS (loop_vinfo
) = datarefs
;
3276 if (dump_enabled_p ())
3277 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3278 "not vectorized: loop contains function "
3279 "calls or data references that cannot "
3286 LOOP_VINFO_DATAREFS (loop_vinfo
) = datarefs
;
3290 gimple_stmt_iterator gsi
;
3292 bb
= BB_VINFO_BB (bb_vinfo
);
3293 for (gsi
= gsi_start_bb (bb
); !gsi_end_p (gsi
); gsi_next (&gsi
))
3295 gimple stmt
= gsi_stmt (gsi
);
3296 if (is_gimple_debug (stmt
))
3299 if (!find_data_references_in_stmt (NULL
, stmt
,
3300 &BB_VINFO_DATAREFS (bb_vinfo
)))
3302 /* Mark the rest of the basic-block as unvectorizable. */
3303 for (; !gsi_end_p (gsi
); gsi_next (&gsi
))
3305 stmt
= gsi_stmt (gsi
);
3306 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt
)) = false;
3312 datarefs
= BB_VINFO_DATAREFS (bb_vinfo
);
3315 /* Go through the data-refs, check that the analysis succeeded. Update
3316 pointer from stmt_vec_info struct to DR and vectype. */
3318 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
3321 stmt_vec_info stmt_info
;
3322 tree base
, offset
, init
;
3323 bool gather
= false;
3324 bool simd_lane_access
= false;
3328 if (!dr
|| !DR_REF (dr
))
3330 if (dump_enabled_p ())
3331 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3332 "not vectorized: unhandled data-ref\n");
3336 stmt
= DR_STMT (dr
);
3337 stmt_info
= vinfo_for_stmt (stmt
);
3339 /* Discard clobbers from the dataref vector. We will remove
3340 clobber stmts during vectorization. */
3341 if (gimple_clobber_p (stmt
))
3344 if (i
== datarefs
.length () - 1)
3349 datarefs
.ordered_remove (i
);
3354 /* Check that analysis of the data-ref succeeded. */
3355 if (!DR_BASE_ADDRESS (dr
) || !DR_OFFSET (dr
) || !DR_INIT (dr
)
3360 && !TREE_THIS_VOLATILE (DR_REF (dr
))
3361 && targetm
.vectorize
.builtin_gather
!= NULL
;
3362 bool maybe_simd_lane_access
3363 = loop_vinfo
&& loop
->simduid
;
3365 /* If target supports vector gather loads, or if this might be
3366 a SIMD lane access, see if they can't be used. */
3368 && (maybe_gather
|| maybe_simd_lane_access
)
3369 && !nested_in_vect_loop_p (loop
, stmt
))
3371 struct data_reference
*newdr
3372 = create_data_ref (NULL
, loop_containing_stmt (stmt
),
3373 DR_REF (dr
), stmt
, true);
3374 gcc_assert (newdr
!= NULL
&& DR_REF (newdr
));
3375 if (DR_BASE_ADDRESS (newdr
)
3376 && DR_OFFSET (newdr
)
3379 && integer_zerop (DR_STEP (newdr
)))
3381 if (maybe_simd_lane_access
)
3383 tree off
= DR_OFFSET (newdr
);
3385 if (TREE_CODE (DR_INIT (newdr
)) == INTEGER_CST
3386 && TREE_CODE (off
) == MULT_EXPR
3387 && tree_fits_uhwi_p (TREE_OPERAND (off
, 1)))
3389 tree step
= TREE_OPERAND (off
, 1);
3390 off
= TREE_OPERAND (off
, 0);
3392 if (CONVERT_EXPR_P (off
)
3393 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off
,
3395 < TYPE_PRECISION (TREE_TYPE (off
)))
3396 off
= TREE_OPERAND (off
, 0);
3397 if (TREE_CODE (off
) == SSA_NAME
)
3399 gimple def
= SSA_NAME_DEF_STMT (off
);
3400 tree reft
= TREE_TYPE (DR_REF (newdr
));
3401 if (is_gimple_call (def
)
3402 && gimple_call_internal_p (def
)
3403 && (gimple_call_internal_fn (def
)
3404 == IFN_GOMP_SIMD_LANE
))
3406 tree arg
= gimple_call_arg (def
, 0);
3407 gcc_assert (TREE_CODE (arg
) == SSA_NAME
);
3408 arg
= SSA_NAME_VAR (arg
);
3409 if (arg
== loop
->simduid
3411 && tree_int_cst_equal
3412 (TYPE_SIZE_UNIT (reft
),
3415 DR_OFFSET (newdr
) = ssize_int (0);
3416 DR_STEP (newdr
) = step
;
3417 DR_ALIGNED_TO (newdr
)
3418 = size_int (BIGGEST_ALIGNMENT
);
3420 simd_lane_access
= true;
3426 if (!simd_lane_access
&& maybe_gather
)
3432 if (!gather
&& !simd_lane_access
)
3433 free_data_ref (newdr
);
3436 if (!gather
&& !simd_lane_access
)
3438 if (dump_enabled_p ())
3440 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3441 "not vectorized: data ref analysis "
3443 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3444 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
3454 if (TREE_CODE (DR_BASE_ADDRESS (dr
)) == INTEGER_CST
)
3456 if (dump_enabled_p ())
3457 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3458 "not vectorized: base addr of dr is a "
3464 if (gather
|| simd_lane_access
)
3469 if (TREE_THIS_VOLATILE (DR_REF (dr
)))
3471 if (dump_enabled_p ())
3473 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3474 "not vectorized: volatile type ");
3475 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3476 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
3485 if (stmt_can_throw_internal (stmt
))
3487 if (dump_enabled_p ())
3489 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3490 "not vectorized: statement can throw an "
3492 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3493 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
3499 if (gather
|| simd_lane_access
)
3504 if (TREE_CODE (DR_REF (dr
)) == COMPONENT_REF
3505 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr
), 1)))
3507 if (dump_enabled_p ())
3509 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3510 "not vectorized: statement is bitfield "
3512 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3513 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
3519 if (gather
|| simd_lane_access
)
3524 base
= unshare_expr (DR_BASE_ADDRESS (dr
));
3525 offset
= unshare_expr (DR_OFFSET (dr
));
3526 init
= unshare_expr (DR_INIT (dr
));
3528 if (is_gimple_call (stmt
)
3529 && (!gimple_call_internal_p (stmt
)
3530 || (gimple_call_internal_fn (stmt
) != IFN_MASK_LOAD
3531 && gimple_call_internal_fn (stmt
) != IFN_MASK_STORE
)))
3533 if (dump_enabled_p ())
3535 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3536 "not vectorized: dr in a call ");
3537 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3538 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
3544 if (gather
|| simd_lane_access
)
3549 /* Update DR field in stmt_vec_info struct. */
      /* If the dataref is in an inner-loop of the loop that is considered for
         vectorization, we also want to analyze the access relative to
         the outer-loop (DR contains information only relative to the
         inner-most enclosing loop).  We do that by building a reference to the
         first location accessed by the inner-loop, and analyze it relative to
         the outer-loop.  */
3557 if (loop
&& nested_in_vect_loop_p (loop
, stmt
))
3559 tree outer_step
, outer_base
, outer_init
;
3560 HOST_WIDE_INT pbitsize
, pbitpos
;
3563 int punsignedp
, pvolatilep
;
3564 affine_iv base_iv
, offset_iv
;
3567 /* Build a reference to the first location accessed by the
3568 inner-loop: *(BASE+INIT). (The first location is actually
3569 BASE+INIT+OFFSET, but we add OFFSET separately later). */
3570 tree inner_base
= build_fold_indirect_ref
3571 (fold_build_pointer_plus (base
, init
));
3573 if (dump_enabled_p ())
3575 dump_printf_loc (MSG_NOTE
, vect_location
,
3576 "analyze in outer-loop: ");
3577 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, inner_base
);
3578 dump_printf (MSG_NOTE
, "\n");
3581 outer_base
= get_inner_reference (inner_base
, &pbitsize
, &pbitpos
,
3582 &poffset
, &pmode
, &punsignedp
, &pvolatilep
, false);
3583 gcc_assert (outer_base
!= NULL_TREE
);
3585 if (pbitpos
% BITS_PER_UNIT
!= 0)
3587 if (dump_enabled_p ())
3588 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3589 "failed: bit offset alignment.\n");
3593 outer_base
= build_fold_addr_expr (outer_base
);
3594 if (!simple_iv (loop
, loop_containing_stmt (stmt
), outer_base
,
3597 if (dump_enabled_p ())
3598 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3599 "failed: evolution of base is not affine.\n");
3606 poffset
= fold_build2 (PLUS_EXPR
, TREE_TYPE (offset
), offset
,
3614 offset_iv
.base
= ssize_int (0);
3615 offset_iv
.step
= ssize_int (0);
3617 else if (!simple_iv (loop
, loop_containing_stmt (stmt
), poffset
,
3620 if (dump_enabled_p ())
3621 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3622 "evolution of offset is not affine.\n");
3626 outer_init
= ssize_int (pbitpos
/ BITS_PER_UNIT
);
3627 split_constant_offset (base_iv
.base
, &base_iv
.base
, &dinit
);
3628 outer_init
= size_binop (PLUS_EXPR
, outer_init
, dinit
);
3629 split_constant_offset (offset_iv
.base
, &offset_iv
.base
, &dinit
);
3630 outer_init
= size_binop (PLUS_EXPR
, outer_init
, dinit
);
3632 outer_step
= size_binop (PLUS_EXPR
,
3633 fold_convert (ssizetype
, base_iv
.step
),
3634 fold_convert (ssizetype
, offset_iv
.step
));
3636 STMT_VINFO_DR_STEP (stmt_info
) = outer_step
;
3637 /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
3638 STMT_VINFO_DR_BASE_ADDRESS (stmt_info
) = base_iv
.base
;
3639 STMT_VINFO_DR_INIT (stmt_info
) = outer_init
;
3640 STMT_VINFO_DR_OFFSET (stmt_info
) =
3641 fold_convert (ssizetype
, offset_iv
.base
);
3642 STMT_VINFO_DR_ALIGNED_TO (stmt_info
) =
3643 size_int (highest_pow2_factor (offset_iv
.base
));
3645 if (dump_enabled_p ())
3647 dump_printf_loc (MSG_NOTE
, vect_location
,
3648 "\touter base_address: ");
3649 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3650 STMT_VINFO_DR_BASE_ADDRESS (stmt_info
));
3651 dump_printf (MSG_NOTE
, "\n\touter offset from base address: ");
3652 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3653 STMT_VINFO_DR_OFFSET (stmt_info
));
3654 dump_printf (MSG_NOTE
,
3655 "\n\touter constant offset from base address: ");
3656 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3657 STMT_VINFO_DR_INIT (stmt_info
));
3658 dump_printf (MSG_NOTE
, "\n\touter step: ");
3659 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3660 STMT_VINFO_DR_STEP (stmt_info
));
3661 dump_printf (MSG_NOTE
, "\n\touter aligned to: ");
3662 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3663 STMT_VINFO_DR_ALIGNED_TO (stmt_info
));
3664 dump_printf (MSG_NOTE
, "\n");
3668 if (STMT_VINFO_DATA_REF (stmt_info
))
3670 if (dump_enabled_p ())
3672 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3673 "not vectorized: more than one data ref "
3675 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3676 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
3682 if (gather
|| simd_lane_access
)
3687 STMT_VINFO_DATA_REF (stmt_info
) = dr
;
3688 if (simd_lane_access
)
3690 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info
) = true;
3691 free_data_ref (datarefs
[i
]);
3695 /* Set vectype for STMT. */
3696 scalar_type
= TREE_TYPE (DR_REF (dr
));
3697 STMT_VINFO_VECTYPE (stmt_info
)
3698 = get_vectype_for_scalar_type (scalar_type
);
3699 if (!STMT_VINFO_VECTYPE (stmt_info
))
3701 if (dump_enabled_p ())
3703 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3704 "not vectorized: no vectype for stmt: ");
3705 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3706 dump_printf (MSG_MISSED_OPTIMIZATION
, " scalar_type: ");
3707 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_DETAILS
,
3709 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
3715 if (gather
|| simd_lane_access
)
3717 STMT_VINFO_DATA_REF (stmt_info
) = NULL
;
3725 if (dump_enabled_p ())
3727 dump_printf_loc (MSG_NOTE
, vect_location
,
3728 "got vectype for stmt: ");
3729 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, stmt
, 0);
3730 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3731 STMT_VINFO_VECTYPE (stmt_info
));
3732 dump_printf (MSG_NOTE
, "\n");
3736 /* Adjust the minimal vectorization factor according to the
3738 vf
= TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info
));
3746 gather
= 0 != vect_check_gather (stmt
, loop_vinfo
, NULL
, &off
, NULL
);
3748 && get_vectype_for_scalar_type (TREE_TYPE (off
)) == NULL_TREE
)
3752 STMT_VINFO_DATA_REF (stmt_info
) = NULL
;
3754 if (dump_enabled_p ())
3756 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3757 "not vectorized: not suitable for gather "
3759 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3760 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
3766 STMT_VINFO_GATHER_P (stmt_info
) = true;
3769 && TREE_CODE (DR_STEP (dr
)) != INTEGER_CST
)
3771 if (nested_in_vect_loop_p (loop
, stmt
)
3772 || !DR_IS_READ (dr
))
3774 if (dump_enabled_p ())
3776 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3777 "not vectorized: not suitable for strided "
3779 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3780 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
3784 STMT_VINFO_STRIDE_LOAD_P (stmt_info
) = true;
3788 /* If we stopped analysis at the first dataref we could not analyze
3789 when trying to vectorize a basic-block mark the rest of the datarefs
3790 as not vectorizable and truncate the vector of datarefs. That
3791 avoids spending useless time in analyzing their dependence. */
3792 if (i
!= datarefs
.length ())
3794 gcc_assert (bb_vinfo
!= NULL
);
3795 for (unsigned j
= i
; j
< datarefs
.length (); ++j
)
3797 data_reference_p dr
= datarefs
[j
];
3798 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr
))) = false;
3801 datarefs
.truncate (i
);
/* Function vect_get_new_vect_var.

   Returns a name for a new variable.  The current naming scheme appends the
   prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
   the name of vectorizer generated variables, and appends that to NAME if
   provided.  */
3816 vect_get_new_vect_var (tree type
, enum vect_var_kind var_kind
, const char *name
)
3823 case vect_simple_var
:
3826 case vect_scalar_var
:
3829 case vect_pointer_var
:
3838 char* tmp
= concat (prefix
, "_", name
, NULL
);
3839 new_vect_var
= create_tmp_reg (type
, tmp
);
3843 new_vect_var
= create_tmp_reg (type
, prefix
);
3845 return new_vect_var
;
/* Duplicate ptr info and set alignment/misalignment on NAME from DR.  */

static void
vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr,
                                  stmt_vec_info stmt_info)
{
  duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
  unsigned int align = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmt_info));
  int misalign = DR_MISALIGNMENT (dr);
  if (misalign == -1)
    mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
  else
    set_ptr_info_alignment (SSA_NAME_PTR_INFO (name), align, misalign);
}
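
/* Small worked example (illustrative):  for a V4SI vector type the unit
   alignment is 16, so a data-ref with DR_MISALIGNMENT (dr) == 8 records
   (align = 16, misalign = 8) on NAME's points-to info, i.e. the pointer is
   known to be 8 bytes past a 16-byte boundary, while DR_MISALIGNMENT == -1
   (unknown) drops the alignment info instead.  */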
/* Function vect_create_addr_base_for_vector_ref.

   Create an expression that computes the address of the first memory location
   that will be accessed for a data reference.

   Input:
   STMT: The statement containing the data reference.
   NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
   OFFSET: Optional.  If supplied, it is added to the initial address.
   LOOP:    Specify relative to which loop-nest should the address be computed.
            For example, when the dataref is in an inner-loop nested in an
            outer-loop that is now being vectorized, LOOP can be either the
            outer-loop, or the inner-loop.  The first memory location accessed
            by a dataref such as in[i+j] in the inner j-loop ('in' points
            to short) is:

            if LOOP=i_loop: &in       (relative to i_loop)
            if LOOP=j_loop: &in+i*2B  (relative to j_loop)

   BYTE_OFFSET: Optional, defaulted to NULL.  If supplied, it is added to the
            initial address.  Unlike OFFSET, which is number of elements to
            be added, BYTE_OFFSET is measured in bytes.

   Output:
   1. Return an SSA_NAME whose value is the address of the memory location of
      the first vector of the data reference.
   2. If new_stmt_list is not NULL_TREE after return then the caller must insert
      these statement(s) which define the returned SSA_NAME.

   FORNOW: We are only handling array accesses with step 1.  */
3898 vect_create_addr_base_for_vector_ref (gimple stmt
,
3899 gimple_seq
*new_stmt_list
,
3904 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
3905 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
3907 const char *base_name
;
3910 gimple_seq seq
= NULL
;
3914 tree step
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr
)));
3915 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
3917 if (loop_vinfo
&& loop
&& loop
!= (gimple_bb (stmt
))->loop_father
)
3919 struct loop
*outer_loop
= LOOP_VINFO_LOOP (loop_vinfo
);
3921 gcc_assert (nested_in_vect_loop_p (outer_loop
, stmt
));
3923 data_ref_base
= unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info
));
3924 base_offset
= unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info
));
3925 init
= unshare_expr (STMT_VINFO_DR_INIT (stmt_info
));
3929 data_ref_base
= unshare_expr (DR_BASE_ADDRESS (dr
));
3930 base_offset
= unshare_expr (DR_OFFSET (dr
));
3931 init
= unshare_expr (DR_INIT (dr
));
3935 base_name
= get_name (data_ref_base
);
3938 base_offset
= ssize_int (0);
3939 init
= ssize_int (0);
3940 base_name
= get_name (DR_REF (dr
));
3943 /* Create base_offset */
3944 base_offset
= size_binop (PLUS_EXPR
,
3945 fold_convert (sizetype
, base_offset
),
3946 fold_convert (sizetype
, init
));
3950 offset
= fold_build2 (MULT_EXPR
, sizetype
,
3951 fold_convert (sizetype
, offset
), step
);
3952 base_offset
= fold_build2 (PLUS_EXPR
, sizetype
,
3953 base_offset
, offset
);
3957 byte_offset
= fold_convert (sizetype
, byte_offset
);
3958 base_offset
= fold_build2 (PLUS_EXPR
, sizetype
,
3959 base_offset
, byte_offset
);
3962 /* base + base_offset */
3964 addr_base
= fold_build_pointer_plus (data_ref_base
, base_offset
);
3967 addr_base
= build1 (ADDR_EXPR
,
3968 build_pointer_type (TREE_TYPE (DR_REF (dr
))),
3969 unshare_expr (DR_REF (dr
)));
3972 vect_ptr_type
= build_pointer_type (STMT_VINFO_VECTYPE (stmt_info
));
3973 addr_base
= fold_convert (vect_ptr_type
, addr_base
);
3974 dest
= vect_get_new_vect_var (vect_ptr_type
, vect_pointer_var
, base_name
);
3975 addr_base
= force_gimple_operand (addr_base
, &seq
, false, dest
);
3976 gimple_seq_add_seq (new_stmt_list
, seq
);
3978 if (DR_PTR_INFO (dr
)
3979 && TREE_CODE (addr_base
) == SSA_NAME
)
3981 vect_duplicate_ssa_name_ptr_info (addr_base
, dr
, stmt_info
);
3982 if (offset
|| byte_offset
)
3983 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base
));
3986 if (dump_enabled_p ())
3988 dump_printf_loc (MSG_NOTE
, vect_location
, "created ");
3989 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, addr_base
);
3990 dump_printf (MSG_NOTE
, "\n");
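
/* Worked example for the address computation above (hand-written; the
   generated pointer name is only an approximation):  for a dataref a[i + 7]
   with 4-byte elements and no extra OFFSET/BYTE_OFFSET, DR_BASE_ADDRESS is
   &a and DR_INIT is 28, so base_offset = 0 + 28 and the function emits
   something equivalent to

     vectp_a = (vector(4) int *) ((char *) &a + 28);

   before the loop, returning the SSA_NAME vectp_a.  */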
/* Function vect_create_data_ref_ptr.

   Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
   location accessed in the loop by STMT, along with the def-use update
   chain to appropriately advance the pointer through the loop iterations.
   Also set aliasing information for the pointer.  This pointer is used by
   the callers to this function to create a memory reference expression for
   vector load/store access.

   Input:
   1. STMT: a stmt that references memory.  Expected to be of the form
         GIMPLE_ASSIGN <name, data-ref> or
         GIMPLE_ASSIGN <data-ref, name>.
   2. AGGR_TYPE: the type of the reference, which should be either a vector
        or an array.
   3. AT_LOOP: the loop where the vector memref is to be created.
   4. OFFSET (optional): an offset to be added to the initial address accessed
        by the data-ref in STMT.
   5. BSI: location where the new stmts are to be placed if there is no loop
   6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
        pointing to the initial address.
   7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
        to the initial address accessed by the data-ref in STMT.  This is
        similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
        in bytes.

   Output:
   1. Declare a new ptr to vector_type, and have it point to the base of the
      data reference (initial address accessed by the data reference).
      For example, for vector of type V8HI, the following code is generated:

      v8hi *ap;
      ap = (v8hi *)initial_address;

      if OFFSET is not supplied:
         initial_address = &a[init];
      if OFFSET is supplied:
         initial_address = &a[init + OFFSET];
      if BYTE_OFFSET is supplied:
         initial_address = &a[init] + BYTE_OFFSET;

      Return the initial_address in INITIAL_ADDRESS.

   2. If ONLY_INIT is true, just return the initial pointer.  Otherwise, also
      update the pointer in each iteration of the loop.

      Return the increment stmt that updates the pointer in PTR_INCR.

   3. Set INV_P to true if the access pattern of the data reference in the
      vectorized loop is invariant.  Set it to false otherwise.

   4. Return the pointer.  */
tree
vect_create_data_ref_ptr (gimple stmt, tree aggr_type, struct loop *at_loop,
                          tree offset, tree *initial_address,
                          gimple_stmt_iterator *gsi, gimple *ptr_incr,
                          bool only_init, bool *inv_p, tree byte_offset)
{
  const char *base_name;
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = NULL;
  bool nested_in_vect_loop = false;
  struct loop *containing_loop = NULL;
  tree aggr_ptr_type, aggr_ptr, aggr_ptr_init, aptr;
  tree new_temp, step;
  gimple vec_stmt, incr;
  gimple_seq new_stmt_list = NULL;
  edge pe = NULL;
  basic_block new_bb;
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
  gimple_stmt_iterator incr_gsi;
  bool insert_after;
  tree indx_before_incr, indx_after_incr;
  bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);

  gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
              || TREE_CODE (aggr_type) == VECTOR_TYPE);

  if (loop_vinfo)
    {
      loop = LOOP_VINFO_LOOP (loop_vinfo);
      nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
      containing_loop = (gimple_bb (stmt))->loop_father;
      pe = loop_preheader_edge (loop);
    }
  else
    {
      gcc_assert (bb_vinfo);
      only_init = true;
      *ptr_incr = NULL;
    }

  /* Check the step (evolution) of the load in LOOP, and record
     whether it's invariant.  */
  if (nested_in_vect_loop)
    step = STMT_VINFO_DR_STEP (stmt_info);
  else
    step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));

  if (integer_zerop (step))
    *inv_p = true;
  else
    *inv_p = false;

  /* Create an expression for the first address accessed by this load
     in LOOP.  */
  base_name = get_name (DR_BASE_ADDRESS (dr));

  if (dump_enabled_p ())
    {
      tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
      dump_printf_loc (MSG_NOTE, vect_location,
                       "create %s-pointer variable to type: ",
                       get_tree_code_name (TREE_CODE (aggr_type)));
      dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
      if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
        dump_printf (MSG_NOTE, "  vectorizing an array ref: ");
      else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
        dump_printf (MSG_NOTE, "  vectorizing a vector ref: ");
      else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
        dump_printf (MSG_NOTE, "  vectorizing a record based array ref: ");
      else
        dump_printf (MSG_NOTE, "  vectorizing a pointer ref: ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
      dump_printf (MSG_NOTE, "\n");
    }

  /* (1) Create the new aggregate-pointer variable.
     Vector and array types inherit the alias set of their component
     type by default so we need to use a ref-all pointer if the data
     reference does not conflict with the created aggregated data
     reference because it is not addressable.  */
  bool need_ref_all = false;
  if (!alias_sets_conflict_p (get_alias_set (aggr_type),
                              get_alias_set (DR_REF (dr))))
    need_ref_all = true;
  /* Likewise for any of the data references in the stmt group.  */
  else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
    {
      gimple orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
      do
        {
          stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
          struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
          if (!alias_sets_conflict_p (get_alias_set (aggr_type),
                                      get_alias_set (DR_REF (sdr))))
            {
              need_ref_all = true;
              break;
            }
          orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
        }
      while (orig_stmt);
    }
  aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
                                               need_ref_all);
  aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);

  /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
     vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
     def-use update cycles for the pointer: one relative to the outer-loop
     (LOOP), which is what steps (3) and (4) below do.  The other is relative
     to the inner-loop (which is the inner-most loop containing the dataref),
     and this is done by step (5) below.

     When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
     inner-most loop, and so steps (3),(4) work the same, and step (5) is
     redundant.  Steps (3),(4) create the following:

        vp0 = &base_addr;
        LOOP:   vp1 = phi(vp0,vp2)
                ...
                vp2 = vp1 + step
                goto LOOP

     If there is an inner-loop nested in loop, then step (5) will also be
     applied, and an additional update in the inner-loop will be created:

        vp0 = &base_addr;
        LOOP:   vp1 = phi(vp0,vp2)
                ...
        inner:  vp3 = phi(vp1,vp4)
                vp4 = vp3 + inner_step
                ...
                vp2 = vp1 + step   */

  /* (2) Calculate the initial address of the aggregate-pointer, and set
     the aggregate-pointer to point to it before the loop.  */

  /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader.  */

  new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
                                                   offset, loop, byte_offset);
  if (new_stmt_list)
    {
      if (pe)
        {
          new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
          gcc_assert (!new_bb);
        }
      else
        gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
    }

  *initial_address = new_temp;

  /* Create: p = (aggr_type *) initial_base  */
  if (TREE_CODE (new_temp) != SSA_NAME
      || !useless_type_conversion_p (aggr_ptr_type, TREE_TYPE (new_temp)))
    {
      vec_stmt = gimple_build_assign (aggr_ptr,
                                      fold_convert (aggr_ptr_type, new_temp));
      aggr_ptr_init = make_ssa_name (aggr_ptr, vec_stmt);
      /* Copy the points-to information if it exists.  */
      if (DR_PTR_INFO (dr))
        vect_duplicate_ssa_name_ptr_info (aggr_ptr_init, dr, stmt_info);
      gimple_assign_set_lhs (vec_stmt, aggr_ptr_init);
      if (pe)
        {
          new_bb = gsi_insert_on_edge_immediate (pe, vec_stmt);
          gcc_assert (!new_bb);
        }
      else
        gsi_insert_before (gsi, vec_stmt, GSI_SAME_STMT);
    }
  else
    aggr_ptr_init = new_temp;

  /* (3) Handle the updating of the aggregate-pointer inside the loop.
     This is needed when ONLY_INIT is false, and also when AT_LOOP is the
     inner-loop nested in LOOP (during outer-loop vectorization).  */

  /* No update in loop is required.  */
  if (only_init && (!loop_vinfo || at_loop == loop))
    aptr = aggr_ptr_init;
  else
    {
      /* The step of the aggregate pointer is the type size.  */
      tree iv_step = TYPE_SIZE_UNIT (aggr_type);
      /* One exception to the above is when the scalar step of the load in
         LOOP is zero.  In this case the step here is also zero.  */
      if (*inv_p)
        iv_step = size_zero_node;
      else if (tree_int_cst_sgn (step) == -1)
        iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);

      standard_iv_increment_position (loop, &incr_gsi, &insert_after);

      create_iv (aggr_ptr_init,
                 fold_convert (aggr_ptr_type, iv_step),
                 aggr_ptr, loop, &incr_gsi, insert_after,
                 &indx_before_incr, &indx_after_incr);
      incr = gsi_stmt (incr_gsi);
      set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));

      /* Copy the points-to information if it exists.  */
      if (DR_PTR_INFO (dr))
        {
          vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
          vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
        }
      if (ptr_incr)
        *ptr_incr = incr;

      aptr = indx_before_incr;
    }

  if (!nested_in_vect_loop || only_init)
    return aptr;

  /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
     nested in LOOP, if exists.  */

  gcc_assert (nested_in_vect_loop);
  standard_iv_increment_position (containing_loop, &incr_gsi,
                                  &insert_after);
  create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
             containing_loop, &incr_gsi, insert_after, &indx_before_incr,
             &indx_after_incr);
  incr = gsi_stmt (incr_gsi);
  set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));

  /* Copy the points-to information if it exists.  */
  if (DR_PTR_INFO (dr))
    {
      vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
      vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
    }
  if (ptr_incr)
    *ptr_incr = incr;

  return indx_before_incr;
}
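
/* Illustrative sketch (not from the GCC sources): the def-use cycle that
   steps (2) and (3) above create, written as the scalar C loop it mimics.
   The identifiers, the element type and the vector width of 4 are
   assumptions made only for this example; the real code emits GIMPLE.

     static void
     example_ptr_update (float *initial_address, float *out, int n)
     {
       float *ap = initial_address;      // step (2): aggr_ptr_init
       for (int i = 0; i < n; i += 4)    // the vectorized LOOP
         {
           out[i / 4] = ap[0];           // use of the pointer (vp1) in the body
           ap += 4;                      // step (3): vp2 = vp1 + type size
         }
     }
*/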
/* Function bump_vector_ptr

   Increment a pointer (to a vector type) by vector-size.  If requested,
   i.e. if PTR-INCR is given, then also connect the new increment stmt
   to the existing def-use update-chain of the pointer, by modifying
   the PTR_INCR as illustrated below:

   The pointer def-use update-chain before this function:
                        DATAREF_PTR = phi (p_0, p_2)
                        ....
        PTR_INCR:       p_2 = DATAREF_PTR + step

   The pointer def-use update-chain after this function:
                        DATAREF_PTR = phi (p_0, p_2)
                        ....
        NEW_DATAREF_PTR = DATAREF_PTR + BUMP
                        ....
        PTR_INCR:       p_2 = NEW_DATAREF_PTR + step

   Input:
   DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
                 in the loop.
   PTR_INCR - optional.  The stmt that updates the pointer in each iteration of
              the loop.  The increment amount across iterations is expected
              to be vector_size.
   BSI - location where the new update stmt is to be placed.
   STMT - the original scalar memory-access stmt that is being vectorized.
   BUMP - optional.  The offset by which to bump the pointer.  If not given,
          the offset is assumed to be vector_size.

   Output: Return NEW_DATAREF_PTR as illustrated above.  */
tree
bump_vector_ptr (tree dataref_ptr, gimple ptr_incr, gimple_stmt_iterator *gsi,
                 gimple stmt, tree bump)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  tree update = TYPE_SIZE_UNIT (vectype);
  gimple incr_stmt;
  ssa_op_iter iter;
  use_operand_p use_p;
  tree new_dataref_ptr;

  if (bump)
    update = bump;

  new_dataref_ptr = copy_ssa_name (dataref_ptr);
  incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
                                   dataref_ptr, update);
  vect_finish_stmt_generation (stmt, incr_stmt, gsi);

  /* Copy the points-to information if it exists.  */
  if (DR_PTR_INFO (dr))
    {
      duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
      mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
    }

  if (!ptr_incr)
    return new_dataref_ptr;

  /* Update the vector-pointer's cross-iteration increment.  */
  FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
    {
      tree use = USE_FROM_PTR (use_p);

      if (use == dataref_ptr)
        SET_USE (use_p, new_dataref_ptr);
      else
        gcc_assert (tree_int_cst_compare (use, update) == 0);
    }

  return new_dataref_ptr;
}
/* Function vect_create_destination_var.

   Create a new temporary of type VECTYPE.  */

tree
vect_create_destination_var (tree scalar_dest, tree vectype)
{
  tree vec_dest;
  const char *name;
  char *new_name;
  tree type;
  enum vect_var_kind kind;

  kind = vectype ? vect_simple_var : vect_scalar_var;
  type = vectype ? vectype : TREE_TYPE (scalar_dest);

  gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);

  name = get_name (scalar_dest);
  if (name)
    new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
  else
    new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
  vec_dest = vect_get_new_vect_var (type, kind, new_name);
  free (new_name);

  return vec_dest;
}
/* Function vect_grouped_store_supported.

   Returns TRUE if interleave high and interleave low permutations
   are supported, and FALSE otherwise.  */

bool
vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
{
  machine_mode mode = TYPE_MODE (vectype);

  /* vect_permute_store_chain requires the group size to be equal to 3 or
     be a power of two.  */
  if (count != 3 && exact_log2 (count) == -1)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "the size of the group of accesses"
                         " is not a power of 2 or not equal to 3\n");
      return false;
    }

  /* Check that the permutation is supported.  */
  if (VECTOR_MODE_P (mode))
    {
      unsigned int i, nelt = GET_MODE_NUNITS (mode);
      unsigned char *sel = XALLOCAVEC (unsigned char, nelt);

      if (count == 3)
        {
          unsigned int j0 = 0, j1 = 0, j2 = 0;
          unsigned int j;

          for (j = 0; j < 3; j++)
            {
              int nelt0 = ((3 - j) * nelt) % 3;
              int nelt1 = ((3 - j) * nelt + 1) % 3;
              int nelt2 = ((3 - j) * nelt + 2) % 3;
              for (i = 0; i < nelt; i++)
                {
                  if (3 * i + nelt0 < nelt)
                    sel[3 * i + nelt0] = j0++;
                  if (3 * i + nelt1 < nelt)
                    sel[3 * i + nelt1] = nelt + j1++;
                  if (3 * i + nelt2 < nelt)
                    sel[3 * i + nelt2] = 0;
                }
              if (!can_vec_perm_p (mode, false, sel))
                {
                  if (dump_enabled_p ())
                    dump_printf (MSG_MISSED_OPTIMIZATION,
                                 "permutation op not supported by target.\n");
                  return false;
                }

              for (i = 0; i < nelt; i++)
                {
                  if (3 * i + nelt0 < nelt)
                    sel[3 * i + nelt0] = 3 * i + nelt0;
                  if (3 * i + nelt1 < nelt)
                    sel[3 * i + nelt1] = 3 * i + nelt1;
                  if (3 * i + nelt2 < nelt)
                    sel[3 * i + nelt2] = nelt + j2++;
                }
              if (!can_vec_perm_p (mode, false, sel))
                {
                  if (dump_enabled_p ())
                    dump_printf (MSG_MISSED_OPTIMIZATION,
                                 "permutation op not supported by target.\n");
                  return false;
                }
            }
          return true;
        }
      else
        {
          /* If length is not equal to 3 then only power of 2 is supported.  */
          gcc_assert (exact_log2 (count) != -1);

          for (i = 0; i < nelt / 2; i++)
            {
              sel[i * 2] = i;
              sel[i * 2 + 1] = i + nelt;
            }
          if (can_vec_perm_p (mode, false, sel))
            {
              for (i = 0; i < nelt; i++)
                sel[i] += nelt / 2;
              if (can_vec_perm_p (mode, false, sel))
                return true;
            }
        }
    }

  if (dump_enabled_p ())
    dump_printf (MSG_MISSED_OPTIMIZATION,
                 "permutation op not supported by target.\n");
  return false;
}
/* Return TRUE if vec_store_lanes is available for COUNT vectors of
   type VECTYPE.  */

bool
vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
{
  return vect_lanes_optab_supported_p ("vec_store_lanes",
                                       vec_store_lanes_optab,
                                       vectype, count);
}
/* Function vect_permute_store_chain.

   Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
   a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
   the data correctly for the stores.  Return the final references for stores
   in RESULT_CHAIN.

   E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
   The input is 4 vectors each containing 8 elements.  We assign a number to
   each element, the input sequence is:

   1st vec:   0  1  2  3  4  5  6  7
   2nd vec:   8  9 10 11 12 13 14 15
   3rd vec:  16 17 18 19 20 21 22 23
   4th vec:  24 25 26 27 28 29 30 31

   The output sequence should be:

   1st vec:   0  8 16 24  1  9 17 25
   2nd vec:   2 10 18 26  3 11 19 27
   3rd vec:   4 12 20 28  5 13 21 29
   4th vec:   6 14 22 30  7 15 23 31

   i.e., we interleave the contents of the four vectors in their order.

   We use interleave_high/low instructions to create such output.  The input
   of each interleave_high/low operation is two vectors:
   1st vec    2nd vec
   0 1 2 3    4 5 6 7

   the even elements of the result vector are obtained left-to-right from the
   high/low elements of the first vector.  The odd elements of the result are
   obtained left-to-right from the high/low elements of the second vector.
   The output of interleave_high will be:   0 4 1 5
   and of interleave_low:                   2 6 3 7

   The permutation is done in log LENGTH stages.  In each stage interleave_high
   and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
   where the first argument is taken from the first half of DR_CHAIN and the
   second argument from its second half.  In our example,

   I1: interleave_high (1st vec, 3rd vec)
   I2: interleave_low (1st vec, 3rd vec)
   I3: interleave_high (2nd vec, 4th vec)
   I4: interleave_low (2nd vec, 4th vec)

   The output for the first stage is:

   I1:  0 16  1 17  2 18  3 19
   I2:  4 20  5 21  6 22  7 23
   I3:  8 24  9 25 10 26 11 27
   I4: 12 28 13 29 14 30 15 31

   The output of the second stage, i.e. the final result is:

   I1:  0  8 16 24  1  9 17 25
   I2:  2 10 18 26  3 11 19 27
   I3:  4 12 20 28  5 13 21 29
   I4:  6 14 22 30  7 15 23 31.  */
void
vect_permute_store_chain (vec<tree> dr_chain,
                          unsigned int length,
                          gimple stmt,
                          gimple_stmt_iterator *gsi,
                          vec<tree> *result_chain)
{
  tree vect1, vect2, high, low;
  gimple perm_stmt;
  tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
  tree perm_mask_low, perm_mask_high;
  tree data_ref;
  tree perm3_mask_low, perm3_mask_high;
  unsigned int i, n, log_length = exact_log2 (length);
  unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
  unsigned char *sel = XALLOCAVEC (unsigned char, nelt);

  result_chain->quick_grow (length);
  memcpy (result_chain->address (), dr_chain.address (),
          length * sizeof (tree));

  if (length == 3)
    {
      unsigned int j0 = 0, j1 = 0, j2 = 0;

      for (j = 0; j < 3; j++)
        {
          int nelt0 = ((3 - j) * nelt) % 3;
          int nelt1 = ((3 - j) * nelt + 1) % 3;
          int nelt2 = ((3 - j) * nelt + 2) % 3;

          for (i = 0; i < nelt; i++)
            {
              if (3 * i + nelt0 < nelt)
                sel[3 * i + nelt0] = j0++;
              if (3 * i + nelt1 < nelt)
                sel[3 * i + nelt1] = nelt + j1++;
              if (3 * i + nelt2 < nelt)
                sel[3 * i + nelt2] = 0;
            }
          perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);

          for (i = 0; i < nelt; i++)
            {
              if (3 * i + nelt0 < nelt)
                sel[3 * i + nelt0] = 3 * i + nelt0;
              if (3 * i + nelt1 < nelt)
                sel[3 * i + nelt1] = 3 * i + nelt1;
              if (3 * i + nelt2 < nelt)
                sel[3 * i + nelt2] = nelt + j2++;
            }
          perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);

          vect1 = dr_chain[0];
          vect2 = dr_chain[1];

          /* Create interleaving stmt:
             low = VEC_PERM_EXPR <vect1, vect2,
                                  {j, nelt, *, j + 1, nelt + j + 1, *,
                                   j + 2, nelt + j + 2, *, ...}>  */
          data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
          perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
                                           vect2, perm3_mask_low);
          vect_finish_stmt_generation (stmt, perm_stmt, gsi);

          vect1 = data_ref;
          vect2 = dr_chain[2];
          /* Create interleaving stmt:
             low = VEC_PERM_EXPR <vect1, vect2,
                                  {0, 1, nelt + j, 3, 4, nelt + j + 1,
                                   6, 7, nelt + j + 2, ...}>  */
          data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
          perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
                                           vect2, perm3_mask_high);
          vect_finish_stmt_generation (stmt, perm_stmt, gsi);
          (*result_chain)[j] = data_ref;
        }
    }
  else
    {
      /* If length is not equal to 3 then only power of 2 is supported.  */
      gcc_assert (exact_log2 (length) != -1);

      for (i = 0, n = nelt / 2; i < n; i++)
        {
          sel[i * 2] = i;
          sel[i * 2 + 1] = i + nelt;
        }
      perm_mask_high = vect_gen_perm_mask_checked (vectype, sel);

      for (i = 0; i < nelt; i++)
        sel[i] += nelt / 2;
      perm_mask_low = vect_gen_perm_mask_checked (vectype, sel);

      for (i = 0, n = log_length; i < n; i++)
        {
          for (j = 0; j < length / 2; j++)
            {
              vect1 = dr_chain[j];
              vect2 = dr_chain[j + length / 2];

              /* Create interleaving stmt:
                 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
                                                      ...}>  */
              high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
              perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
                                               vect2, perm_mask_high);
              vect_finish_stmt_generation (stmt, perm_stmt, gsi);
              (*result_chain)[2 * j] = high;

              /* Create interleaving stmt:
                 low = VEC_PERM_EXPR <vect1, vect2,
                                      {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
                                       ...}>  */
              low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
              perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
                                               vect2, perm_mask_low);
              vect_finish_stmt_generation (stmt, perm_stmt, gsi);
              (*result_chain)[2 * j + 1] = low;
            }
          memcpy (dr_chain.address (), result_chain->address (),
                  length * sizeof (tree));
        }
    }
}
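
/* Illustrative sketch (not from the GCC sources): one interleave_high/low
   stage as used by vect_permute_store_chain above, on plain C arrays.
   NELT, the element type and the function name are assumptions made only
   for this example.

     #define NELT 8

     // high takes elements 0..NELT/2-1 of each input, low takes elements
     // NELT/2..NELT-1, alternating v1,v2 -- the same pattern as the sel[]
     // masks built above (sel[2*i] = i, sel[2*i+1] = i + NELT, plus NELT/2
     // for the low mask).
     static void
     example_interleave (const int *v1, const int *v2, int *high, int *low)
     {
       for (int i = 0; i < NELT / 2; i++)
         {
           high[2 * i]     = v1[i];
           high[2 * i + 1] = v2[i];
           low[2 * i]      = v1[NELT / 2 + i];
           low[2 * i + 1]  = v2[NELT / 2 + i];
         }
     }

   Applied twice as in the example in the comment above (stage-1 results
   I1..I4 fed into stage 2), this reproduces the interleaved output sequence
   0 8 16 24 1 9 17 25, ...  */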
/* Function vect_setup_realignment

   This function is called when vectorizing an unaligned load using
   the dr_explicit_realign[_optimized] scheme.
   This function generates the following code at the loop prolog:

      p = initial_addr;
   x  msq_init = *(floor(p));   # prolog load
      realignment_token = call target_builtin;
    loop:
   x  msq = phi (msq_init, ---)

   The stmts marked with x are generated only for the case of
   dr_explicit_realign_optimized.

   The code above sets up a new (vector) pointer, pointing to the first
   location accessed by STMT, and a "floor-aligned" load using that pointer.
   It also generates code to compute the "realignment-token" (if the relevant
   target hook was defined), and creates a phi-node at the loop-header bb
   whose arguments are the result of the prolog-load (created by this
   function) and the result of a load that takes place in the loop (to be
   created by the caller to this function).

   For the case of dr_explicit_realign_optimized:
   The caller to this function uses the phi-result (msq) to create the
   realignment code inside the loop, and sets up the missing phi argument,
   as follows:
    loop:
      msq = phi (msq_init, lsq)
      lsq = *(floor(p'));        # load in loop
      result = realign_load (msq, lsq, realignment_token);

   For the case of dr_explicit_realign:
    loop:
      msq = *(floor(p));         # load in loop
      p' = p + (VS-1);
      lsq = *(floor(p'));        # load in loop
      result = realign_load (msq, lsq, realignment_token);

   Input:
   STMT - (scalar) load stmt to be vectorized.  This load accesses
          a memory location that may be unaligned.
   BSI - place where new code is to be inserted.
   ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
                              is used.

   Output:
   REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
                       target hook, if defined.
   Return value - the result of the loop-header phi node.  */
tree
vect_setup_realignment (gimple stmt, gimple_stmt_iterator *gsi,
                        tree *realignment_token,
                        enum dr_alignment_support alignment_support_scheme,
                        tree init_addr,
                        struct loop **at_loop)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
  struct loop *loop = NULL;
  edge pe = NULL;
  tree scalar_dest = gimple_assign_lhs (stmt);
  tree vec_dest;
  gimple inc;
  tree ptr;
  tree data_ref;
  basic_block new_bb;
  tree msq_init = NULL_TREE;
  tree new_temp;
  gphi *phi_stmt;
  tree msq = NULL_TREE;
  gimple_seq stmts = NULL;
  bool inv_p;
  bool compute_in_loop = false;
  bool nested_in_vect_loop = false;
  struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
  struct loop *loop_for_initial_load = NULL;

  if (loop_vinfo)
    {
      loop = LOOP_VINFO_LOOP (loop_vinfo);
      nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
    }

  gcc_assert (alignment_support_scheme == dr_explicit_realign
              || alignment_support_scheme == dr_explicit_realign_optimized);

  /* We need to generate three things:
     1. the misalignment computation
     2. the extra vector load (for the optimized realignment scheme).
     3. the phi node for the two vectors from which the realignment is
      done (for the optimized realignment scheme).  */

  /* 1. Determine where to generate the misalignment computation.

     If INIT_ADDR is NULL_TREE, this indicates that the misalignment
     calculation will be generated by this function, outside the loop (in the
     preheader).  Otherwise, INIT_ADDR had already been computed for us by the
     caller, inside the loop.

     Background: If the misalignment remains fixed throughout the iterations of
     the loop, then both realignment schemes are applicable, and also the
     misalignment computation can be done outside LOOP.  This is because we are
     vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
     are a multiple of VS (the Vector Size), and therefore the misalignment in
     different vectorized LOOP iterations is always the same.
     The problem arises only if the memory access is in an inner-loop nested
     inside LOOP, which is now being vectorized using outer-loop vectorization.
     This is the only case when the misalignment of the memory access may not
     remain fixed throughout the iterations of the inner-loop (as explained in
     detail in vect_supportable_dr_alignment).  In this case, not only is the
     optimized realignment scheme not applicable, but also the misalignment
     computation (and generation of the realignment token that is passed to
     REALIGN_LOAD) have to be done inside the loop.

     In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
     or not, which in turn determines if the misalignment is computed inside
     the inner-loop, or outside LOOP.  */

  if (init_addr != NULL_TREE || !loop_vinfo)
    {
      compute_in_loop = true;
      gcc_assert (alignment_support_scheme == dr_explicit_realign);
    }

  /* 2. Determine where to generate the extra vector load.

     For the optimized realignment scheme, instead of generating two vector
     loads in each iteration, we generate a single extra vector load in the
     preheader of the loop, and in each iteration reuse the result of the
     vector load from the previous iteration.  In case the memory access is in
     an inner-loop nested inside LOOP, which is now being vectorized using
     outer-loop vectorization, we need to determine whether this initial vector
     load should be generated at the preheader of the inner-loop, or can be
     generated at the preheader of LOOP.  If the memory access has no evolution
     in LOOP, it can be generated in the preheader of LOOP.  Otherwise, it has
     to be generated inside LOOP (in the preheader of the inner-loop).  */

  if (nested_in_vect_loop)
    {
      tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
      bool invariant_in_outerloop =
        (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
      loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
    }
  else
    loop_for_initial_load = loop;
  if (at_loop)
    *at_loop = loop_for_initial_load;

  if (loop_for_initial_load)
    pe = loop_preheader_edge (loop_for_initial_load);

  /* 3. For the case of the optimized realignment, create the first vector
      load at the loop preheader.  */

  if (alignment_support_scheme == dr_explicit_realign_optimized)
    {
      /* Create msq_init = *(floor(p1)) in the loop preheader.  */
      gassign *new_stmt;

      gcc_assert (!compute_in_loop);
      vec_dest = vect_create_destination_var (scalar_dest, vectype);
      ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
                                      NULL_TREE, &init_addr, NULL, &inc,
                                      true, &inv_p);
      new_temp = copy_ssa_name (ptr);
      new_stmt = gimple_build_assign
                   (new_temp, BIT_AND_EXPR, ptr,
                    build_int_cst (TREE_TYPE (ptr),
                                   -(HOST_WIDE_INT) TYPE_ALIGN_UNIT (vectype)));
      new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
      gcc_assert (!new_bb);
      data_ref
        = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
                  build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
      new_stmt = gimple_build_assign (vec_dest, data_ref);
      new_temp = make_ssa_name (vec_dest, new_stmt);
      gimple_assign_set_lhs (new_stmt, new_temp);
      if (pe)
        {
          new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
          gcc_assert (!new_bb);
        }
      else
        gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);

      msq_init = gimple_assign_lhs (new_stmt);
    }

  /* 4. Create realignment token using a target builtin, if available.
      It is done either inside the containing loop, or before LOOP (as
      determined above).  */

  if (targetm.vectorize.builtin_mask_for_load)
    {
      gcall *new_stmt;
      tree builtin_decl;

      /* Compute INIT_ADDR - the initial address accessed by this memref.  */
      if (!init_addr)
        {
          /* Generate the INIT_ADDR computation outside LOOP.  */
          init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
                                                            NULL_TREE, loop);
          if (loop)
            {
              pe = loop_preheader_edge (loop);
              new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
              gcc_assert (!new_bb);
            }
          else
            gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
        }

      builtin_decl = targetm.vectorize.builtin_mask_for_load ();
      new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
      vec_dest =
        vect_create_destination_var (scalar_dest,
                                     gimple_call_return_type (new_stmt));
      new_temp = make_ssa_name (vec_dest, new_stmt);
      gimple_call_set_lhs (new_stmt, new_temp);

      if (compute_in_loop)
        gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
      else
        {
          /* Generate the misalignment computation outside LOOP.  */
          pe = loop_preheader_edge (loop);
          new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
          gcc_assert (!new_bb);
        }

      *realignment_token = gimple_call_lhs (new_stmt);

      /* The result of the CALL_EXPR to this builtin is determined from
         the value of the parameter and no global variables are touched
         which makes the builtin a "const" function.  Requiring the
         builtin to have the "const" attribute makes it unnecessary
         to call mark_call_clobbered.  */
      gcc_assert (TREE_READONLY (builtin_decl));
    }

  if (alignment_support_scheme == dr_explicit_realign)
    return msq;

  gcc_assert (!compute_in_loop);
  gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);

  /* 5. Create msq = phi <msq_init, lsq> in loop.  */

  pe = loop_preheader_edge (containing_loop);
  vec_dest = vect_create_destination_var (scalar_dest, vectype);
  msq = make_ssa_name (vec_dest);
  phi_stmt = create_phi_node (msq, containing_loop->header);
  add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);

  return msq;
}
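
/* Illustrative sketch (not from the GCC sources): what the REALIGN_LOAD
   operation in the scheme above computes, written as plain C.  VS, the
   element type, and representing the realignment token by the misalignment
   M (in elements) are assumptions made only for this example.

     #define VS 4   // elements per vector

     // msq = *(floor(p)), lsq = *(floor(p) + VS); the result is the
     // unaligned vector that starts M elements into MSQ.
     static void
     example_realign_load (const int *msq, const int *lsq, int m, int *out)
     {
       for (int i = 0; i < VS; i++)
         out[i] = (i + m < VS) ? msq[i + m] : lsq[i + m - VS];
     }
*/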
/* Function vect_grouped_load_supported.

   Returns TRUE if even and odd permutations are supported,
   and FALSE otherwise.  */

bool
vect_grouped_load_supported (tree vectype, unsigned HOST_WIDE_INT count)
{
  machine_mode mode = TYPE_MODE (vectype);

  /* vect_permute_load_chain requires the group size to be equal to 3 or
     be a power of two.  */
  if (count != 3 && exact_log2 (count) == -1)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "the size of the group of accesses"
                         " is not a power of 2 or not equal to 3\n");
      return false;
    }

  /* Check that the permutation is supported.  */
  if (VECTOR_MODE_P (mode))
    {
      unsigned int i, j, nelt = GET_MODE_NUNITS (mode);
      unsigned char *sel = XALLOCAVEC (unsigned char, nelt);

      if (count == 3)
        {
          unsigned int k;
          for (k = 0; k < 3; k++)
            {
              for (i = 0; i < nelt; i++)
                if (3 * i + k < 2 * nelt)
                  sel[i] = 3 * i + k;
                else
                  sel[i] = 0;
              if (!can_vec_perm_p (mode, false, sel))
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "shuffle of 3 loads is not supported by"
                                     " target\n");
                  return false;
                }
              for (i = 0, j = 0; i < nelt; i++)
                if (3 * i + k < 2 * nelt)
                  sel[i] = i;
                else
                  sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
              if (!can_vec_perm_p (mode, false, sel))
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "shuffle of 3 loads is not supported by"
                                     " target\n");
                  return false;
                }
            }
          return true;
        }
      else
        {
          /* If length is not equal to 3 then only power of 2 is supported.  */
          gcc_assert (exact_log2 (count) != -1);
          for (i = 0; i < nelt; i++)
            sel[i] = i * 2;
          if (can_vec_perm_p (mode, false, sel))
            {
              for (i = 0; i < nelt; i++)
                sel[i] = i * 2 + 1;
              if (can_vec_perm_p (mode, false, sel))
                return true;
            }
        }
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                     "extract even/odd not supported by target\n");
  return false;
}
/* Return TRUE if vec_load_lanes is available for COUNT vectors of
   type VECTYPE.  */

bool
vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
{
  return vect_lanes_optab_supported_p ("vec_load_lanes",
                                       vec_load_lanes_optab,
                                       vectype, count);
}
/* Function vect_permute_load_chain.

   Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
   a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
   the input data correctly.  Return the final references for loads in
   RESULT_CHAIN.

   E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
   The input is 4 vectors each containing 8 elements.  We assign a number to
   each element, the input sequence is:

   1st vec:   0  1  2  3  4  5  6  7
   2nd vec:   8  9 10 11 12 13 14 15
   3rd vec:  16 17 18 19 20 21 22 23
   4th vec:  24 25 26 27 28 29 30 31

   The output sequence should be:

   1st vec:  0 4  8 12 16 20 24 28
   2nd vec:  1 5  9 13 17 21 25 29
   3rd vec:  2 6 10 14 18 22 26 30
   4th vec:  3 7 11 15 19 23 27 31

   i.e., the first output vector should contain the first elements of each
   interleaving group, etc.

   We use extract_even/odd instructions to create such output.  The input of
   each extract_even/odd operation is two vectors
   1st vec    2nd vec
   0 1 2 3    4 5 6 7

   and the output is the vector of extracted even/odd elements.  The output of
   extract_even will be:   0 2 4 6
   and of extract_odd:     1 3 5 7

   The permutation is done in log LENGTH stages.  In each stage extract_even
   and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
   their order.  In our example,

   E1: extract_even (1st vec, 2nd vec)
   E2: extract_odd (1st vec, 2nd vec)
   E3: extract_even (3rd vec, 4th vec)
   E4: extract_odd (3rd vec, 4th vec)

   The output for the first stage will be:

   E1:  0  2  4  6  8 10 12 14
   E2:  1  3  5  7  9 11 13 15
   E3: 16 18 20 22 24 26 28 30
   E4: 17 19 21 23 25 27 29 31

   In order to proceed and create the correct sequence for the next stage (or
   for the correct output, if the second stage is the last one, as in our
   example), we first put the output of extract_even operation and then the
   output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
   The input for the second stage is:

   1st vec (E1):  0  2  4  6  8 10 12 14
   2nd vec (E3): 16 18 20 22 24 26 28 30
   3rd vec (E2):  1  3  5  7  9 11 13 15
   4th vec (E4): 17 19 21 23 25 27 29 31

   The output of the second stage:

   E1: 0 4  8 12 16 20 24 28
   E2: 2 6 10 14 18 22 26 30
   E3: 1 5  9 13 17 21 25 29
   E4: 3 7 11 15 19 23 27 31

   And RESULT_CHAIN after reordering:

   1st vec (E1):  0 4  8 12 16 20 24 28
   2nd vec (E3):  1 5  9 13 17 21 25 29
   3rd vec (E2):  2 6 10 14 18 22 26 30
   4th vec (E4):  3 7 11 15 19 23 27 31.  */
static void
vect_permute_load_chain (vec<tree> dr_chain,
                         unsigned int length,
                         gimple stmt,
                         gimple_stmt_iterator *gsi,
                         vec<tree> *result_chain)
{
  tree data_ref, first_vect, second_vect;
  tree perm_mask_even, perm_mask_odd;
  tree perm3_mask_low, perm3_mask_high;
  gimple perm_stmt;
  tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
  unsigned int i, j, log_length = exact_log2 (length);
  unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
  unsigned char *sel = XALLOCAVEC (unsigned char, nelt);

  result_chain->quick_grow (length);
  memcpy (result_chain->address (), dr_chain.address (),
          length * sizeof (tree));

  if (length == 3)
    {
      unsigned int k;

      for (k = 0; k < 3; k++)
        {
          for (i = 0; i < nelt; i++)
            if (3 * i + k < 2 * nelt)
              sel[i] = 3 * i + k;
            else
              sel[i] = 0;
          perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);

          for (i = 0, j = 0; i < nelt; i++)
            if (3 * i + k < 2 * nelt)
              sel[i] = i;
            else
              sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
          perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);

          first_vect = dr_chain[0];
          second_vect = dr_chain[1];

          /* Create interleaving stmt (low part of):
             low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
                                                             ...}>  */
          data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
          perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
                                           second_vect, perm3_mask_low);
          vect_finish_stmt_generation (stmt, perm_stmt, gsi);

          /* Create interleaving stmt (high part of):
             high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
                                                              ...}>  */
          first_vect = data_ref;
          second_vect = dr_chain[2];
          data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
          perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
                                           second_vect, perm3_mask_high);
          vect_finish_stmt_generation (stmt, perm_stmt, gsi);
          (*result_chain)[k] = data_ref;
        }
    }
  else
    {
      /* If length is not equal to 3 then only power of 2 is supported.  */
      gcc_assert (exact_log2 (length) != -1);

      for (i = 0; i < nelt; ++i)
        sel[i] = i * 2;
      perm_mask_even = vect_gen_perm_mask_checked (vectype, sel);

      for (i = 0; i < nelt; ++i)
        sel[i] = i * 2 + 1;
      perm_mask_odd = vect_gen_perm_mask_checked (vectype, sel);

      for (i = 0; i < log_length; i++)
        {
          for (j = 0; j < length; j += 2)
            {
              first_vect = dr_chain[j];
              second_vect = dr_chain[j + 1];

              /* data_ref = permute_even (first_data_ref, second_data_ref);  */
              data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
              perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
                                               first_vect, second_vect,
                                               perm_mask_even);
              vect_finish_stmt_generation (stmt, perm_stmt, gsi);
              (*result_chain)[j / 2] = data_ref;

              /* data_ref = permute_odd (first_data_ref, second_data_ref);  */
              data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
              perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
                                               first_vect, second_vect,
                                               perm_mask_odd);
              vect_finish_stmt_generation (stmt, perm_stmt, gsi);
              (*result_chain)[j / 2 + length / 2] = data_ref;
            }
          memcpy (dr_chain.address (), result_chain->address (),
                  length * sizeof (tree));
        }
    }
}
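
/* Illustrative sketch (not from the GCC sources): one extract_even/odd
   stage as used by vect_permute_load_chain above, on plain C arrays.
   NELT, the element type and the function name are assumptions made only
   for this example.

     #define NELT 8

     // Take the even-indexed and odd-indexed elements of the 2*NELT-element
     // concatenation v1|v2, as the perm_mask_even/perm_mask_odd selectors
     // above do (sel[i] = 2*i and sel[i] = 2*i + 1 respectively).
     static void
     example_extract_even_odd (const int *v1, const int *v2,
                               int *even, int *odd)
     {
       for (int i = 0; i < NELT; i++)
         {
           int e = 2 * i;
           int o = 2 * i + 1;
           even[i] = (e < NELT) ? v1[e] : v2[e - NELT];
           odd[i]  = (o < NELT) ? v1[o] : v2[o - NELT];
         }
     }

   With v1 = {0..7} and v2 = {8..15} this yields E1 = 0 2 4 6 8 10 12 14 and
   E2 = 1 3 5 7 9 11 13 15, matching the first stage in the comment above.  */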
/* Function vect_shift_permute_load_chain.

   Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
   sequence of stmts to reorder the input data accordingly.
   Return the final references for loads in RESULT_CHAIN.
   Return true if successful, false otherwise.

   E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
   The input is 3 vectors each containing 8 elements.  We assign a
   number to each element, the input sequence is:

   1st vec:   0  1  2  3  4  5  6  7
   2nd vec:   8  9 10 11 12 13 14 15
   3rd vec:  16 17 18 19 20 21 22 23

   The output sequence should be:

   1st vec:  0 3 6  9 12 15 18 21
   2nd vec:  1 4 7 10 13 16 19 22
   3rd vec:  2 5 8 11 14 17 20 23

   We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.

   First we shuffle all 3 vectors to get correct elements order:

   1st vec:  ( 0  3  6) ( 1  4  7) ( 2  5)
   2nd vec:  ( 8 11 14) ( 9 12 15) (10 13)
   3rd vec:  (16 19 22) (17 20 23) (18 21)

   Next we unite and shift vector 3 times:

   1st step:
     shift right by 6 the concatenation of:
     "1st vec" and  "2nd vec"
       ( 0  3  6) ( 1  4  7) |( 2  5) _ ( 8 11 14) ( 9 12 15)| (10 13)
     "2nd vec" and  "3rd vec"
       ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
     "3rd vec" and  "1st vec"
       (16 19 22) (17 20 23) |(18 21) _ ( 0  3  6) ( 1  4  7)| ( 2  5)

   So that now new vectors are:

   1st vec:  ( 2  5) ( 8 11 14) ( 9 12 15)
   2nd vec:  (10 13) (16 19 22) (17 20 23)
   3rd vec:  (18 21) ( 0  3  6) ( 1  4  7)

   2nd step:
     shift right by 5 the concatenation of:
     "1st vec" and  "3rd vec"
       ( 2  5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0  3  6)| ( 1  4  7)
     "2nd vec" and  "1st vec"
       (10 13) (16 19 22) |(17 20 23) _ ( 2  5) ( 8 11 14)| ( 9 12 15)
     "3rd vec" and  "2nd vec"
       (18 21) ( 0  3  6) |( 1  4  7) _ (10 13) (16 19 22)| (17 20 23)

   So that now new vectors are:

   1st vec:  ( 9 12 15) (18 21) ( 0  3  6)
   2nd vec:  (17 20 23) ( 2  5) ( 8 11 14)
   3rd vec:  ( 1  4  7) (10 13) (16 19 22) READY

   3rd step:
     shift right by 5 the concatenation of:
     "1st vec" and  "1st vec"
       ( 9 12 15) (18 21) |( 0  3  6) _ ( 9 12 15) (18 21)| ( 0  3  6)
     shift right by 3 the concatenation of:
     "2nd vec" and  "2nd vec"
               (17 20 23) |( 2  5) ( 8 11 14) _ (17 20 23)| ( 2  5) ( 8 11 14)

   So that now all vectors are READY:
   1st vec:  ( 0  3  6) ( 9 12 15) (18 21)
   2nd vec:  ( 2  5) ( 8 11 14) (17 20 23)
   3rd vec:  ( 1  4  7) (10 13) (16 19 22)

   This algorithm is faster than the one in vect_permute_load_chain if:
   1. "shift of a concatenation" is faster than general permutation.
   2. The TARGET machine can't execute vector instructions in parallel.
      This is because each step of the algorithm depends on previous.
      The algorithm in vect_permute_load_chain is much more parallel.

   The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.  */
static bool
vect_shift_permute_load_chain (vec<tree> dr_chain,
                               unsigned int length,
                               gimple stmt,
                               gimple_stmt_iterator *gsi,
                               vec<tree> *result_chain)
{
  tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
  tree perm2_mask1, perm2_mask2, perm3_mask;
  tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
  gimple perm_stmt;

  tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
  unsigned int i;
  unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
  unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);

  result_chain->quick_grow (length);
  memcpy (result_chain->address (), dr_chain.address (),
          length * sizeof (tree));

  if (exact_log2 (length) != -1 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4)
    {
      unsigned int j, log_length = exact_log2 (length);

      for (i = 0; i < nelt / 2; ++i)
        sel[i] = i * 2;
      for (i = 0; i < nelt / 2; ++i)
        sel[nelt / 2 + i] = i * 2 + 1;
      if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "shuffle of 2 fields structure is not "
                             "supported by target\n");
          return false;
        }
      perm2_mask1 = vect_gen_perm_mask_checked (vectype, sel);

      for (i = 0; i < nelt / 2; ++i)
        sel[i] = i * 2 + 1;
      for (i = 0; i < nelt / 2; ++i)
        sel[nelt / 2 + i] = i * 2;
      if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "shuffle of 2 fields structure is not "
                             "supported by target\n");
          return false;
        }
      perm2_mask2 = vect_gen_perm_mask_checked (vectype, sel);

      /* Generating permutation constant to shift all elements.
         For vector length 8 it is {4 5 6 7 8 9 10 11}.  */
      for (i = 0; i < nelt; i++)
        sel[i] = nelt / 2 + i;
      if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "shift permutation is not supported by target\n");
          return false;
        }
      shift1_mask = vect_gen_perm_mask_checked (vectype, sel);

      /* Generating permutation constant to select vector from 2.
         For vector length 8 it is {0 1 2 3 12 13 14 15}.  */
      for (i = 0; i < nelt / 2; i++)
        sel[i] = i;
      for (i = nelt / 2; i < nelt; i++)
        sel[i] = nelt + i;
      if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "select is not supported by target\n");
          return false;
        }
      select_mask = vect_gen_perm_mask_checked (vectype, sel);

      for (i = 0; i < log_length; i++)
        {
          for (j = 0; j < length; j += 2)
            {
              first_vect = dr_chain[j];
              second_vect = dr_chain[j + 1];

              data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
              perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
                                               first_vect, first_vect,
                                               perm2_mask1);
              vect_finish_stmt_generation (stmt, perm_stmt, gsi);
              vect[0] = data_ref;

              data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
              perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
                                               second_vect, second_vect,
                                               perm2_mask2);
              vect_finish_stmt_generation (stmt, perm_stmt, gsi);
              vect[1] = data_ref;

              data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
              perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
                                               vect[0], vect[1], shift1_mask);
              vect_finish_stmt_generation (stmt, perm_stmt, gsi);
              (*result_chain)[j / 2 + length / 2] = data_ref;

              data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
              perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
                                               vect[0], vect[1], select_mask);
              vect_finish_stmt_generation (stmt, perm_stmt, gsi);
              (*result_chain)[j / 2] = data_ref;
            }
          memcpy (dr_chain.address (), result_chain->address (),
                  length * sizeof (tree));
        }
      return true;
    }
  if (length == 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 2)
    {
      unsigned int k = 0, l = 0;

      /* Generating permutation constant to get all elements in right order.
         For vector length 8 it is {0 3 6 1 4 7 2 5}.  */
      for (i = 0; i < nelt; i++)
        {
          if (3 * k + (l % 3) >= nelt)
            {
              k = 0;
              l += (3 - (nelt % 3));
            }
          sel[i] = 3 * k + (l % 3);
          k++;
        }
      if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "shuffle of 3 fields structure is not "
                             "supported by target\n");
          return false;
        }
      perm3_mask = vect_gen_perm_mask_checked (vectype, sel);

      /* Generating permutation constant to shift all elements.
         For vector length 8 it is {6 7 8 9 10 11 12 13}.  */
      for (i = 0; i < nelt; i++)
        sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
      if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "shift permutation is not supported by target\n");
          return false;
        }
      shift1_mask = vect_gen_perm_mask_checked (vectype, sel);

      /* Generating permutation constant to shift all elements.
         For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
      for (i = 0; i < nelt; i++)
        sel[i] = 2 * (nelt / 3) + 1 + i;
      if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "shift permutation is not supported by target\n");
          return false;
        }
      shift2_mask = vect_gen_perm_mask_checked (vectype, sel);

      /* Generating permutation constant to shift all elements.
         For vector length 8 it is {3 4 5 6 7 8 9 10}.  */
      for (i = 0; i < nelt; i++)
        sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
      if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "shift permutation is not supported by target\n");
          return false;
        }
      shift3_mask = vect_gen_perm_mask_checked (vectype, sel);

      /* Generating permutation constant to shift all elements.
         For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
      for (i = 0; i < nelt; i++)
        sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
      if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "shift permutation is not supported by target\n");
          return false;
        }
      shift4_mask = vect_gen_perm_mask_checked (vectype, sel);

      for (k = 0; k < 3; k++)
        {
          data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
          perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
                                           dr_chain[k], dr_chain[k],
                                           perm3_mask);
          vect_finish_stmt_generation (stmt, perm_stmt, gsi);
          vect[k] = data_ref;
        }

      for (k = 0; k < 3; k++)
        {
          data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
          perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
                                           vect[k % 3], vect[(k + 1) % 3],
                                           shift1_mask);
          vect_finish_stmt_generation (stmt, perm_stmt, gsi);
          vect_shift[k] = data_ref;
        }

      for (k = 0; k < 3; k++)
        {
          data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
          perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
                                           vect_shift[(4 - k) % 3],
                                           vect_shift[(3 - k) % 3],
                                           shift2_mask);
          vect_finish_stmt_generation (stmt, perm_stmt, gsi);
          vect[k] = data_ref;
        }

      (*result_chain)[3 - (nelt % 3)] = vect[2];

      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
                                       vect[0], shift3_mask);
      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
      (*result_chain)[nelt % 3] = data_ref;

      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
                                       vect[1], shift4_mask);
      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
      (*result_chain)[0] = data_ref;

      return true;
    }
  return false;
}
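
/* Illustrative sketch (not from the GCC sources): how two of the selectors
   used above are derived for NELT == 8 and a group of 3 loads.  The printf
   driver and its name are assumptions made only for this example; the
   printed values match the comments in the code.

     #include <stdio.h>
     #define NELT 8

     int
     main (void)
     {
       unsigned char sel[NELT];
       unsigned int k = 0, l = 0;

       // perm3_mask: gather elements 0,3,6 then 1,4,7 then 2,5 of one vector.
       for (unsigned int i = 0; i < NELT; i++)
         {
           if (3 * k + (l % 3) >= NELT)
             {
               k = 0;
               l += 3 - (NELT % 3);
             }
           sel[i] = 3 * k + (l % 3);
           k++;
         }
       for (unsigned int i = 0; i < NELT; i++)
         printf ("%d ", sel[i]);          // prints: 0 3 6 1 4 7 2 5
       printf ("\n");

       // shift1_mask: select elements 6..13 of the 16-element concatenation,
       // i.e. the "shift right by 6" from the comment above.
       for (unsigned int i = 0; i < NELT; i++)
         sel[i] = 2 * (NELT / 3) + (NELT % 3) + i;
       for (unsigned int i = 0; i < NELT; i++)
         printf ("%d ", sel[i]);          // prints: 6 7 8 9 10 11 12 13
       printf ("\n");
       return 0;
     }
*/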
/* Function vect_transform_grouped_load.

   Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
   to perform their permutation and ascribe the result vectorized statements to
   the scalar statements.  */
void
vect_transform_grouped_load (gimple stmt, vec<tree> dr_chain, int size,
                             gimple_stmt_iterator *gsi)
{
  machine_mode mode;
  vec<tree> result_chain = vNULL;

  /* DR_CHAIN contains input data-refs that are a part of the interleaving.
     RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
     vectors, that are ready for vector computation.  */
  result_chain.create (size);

  /* If reassociation width for vector type is 2 or greater target machine can
     execute 2 or more vector instructions in parallel.  Otherwise try to
     get chain for loads group using vect_shift_permute_load_chain.  */
  mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
  if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
      || exact_log2 (size) != -1
      || !vect_shift_permute_load_chain (dr_chain, size, stmt,
                                         gsi, &result_chain))
    vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
  vect_record_grouped_load_vectors (stmt, result_chain);
  result_chain.release ();
}
/* RESULT_CHAIN contains the output of a group of grouped loads that were
   generated as part of the vectorization of STMT.  Assign the statement
   for each vector to the associated scalar statement.  */

void
vect_record_grouped_load_vectors (gimple stmt, vec<tree> result_chain)
{
  gimple first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
  gimple next_stmt, new_stmt;
  unsigned int i, gap_count;
  tree tmp_data_ref;

  /* Put a permuted data-ref in the VECTORIZED_STMT field.
     Since we scan the chain starting from its first node, their order
     corresponds to the order of data-refs in RESULT_CHAIN.  */
  next_stmt = first_stmt;
  gap_count = 1;
  FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
    {
      if (!next_stmt)
        break;

      /* Skip the gaps.  Loads created for the gaps will be removed by dead
         code elimination pass later.  No need to check for the first stmt in
         the group, since it always exists.
         GROUP_GAP is the number of steps in elements from the previous
         access (if there is no gap GROUP_GAP is 1).  We skip loads that
         correspond to the gaps.  */
      if (next_stmt != first_stmt
          && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
        {
          gap_count++;
          continue;
        }

      while (next_stmt)
        {
          new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
          /* We assume that if VEC_STMT is not NULL, this is a case of multiple
             copies, and we put the new vector statement in the first available
             RELATED_STMT field.  */
          if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
            STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
          else
            {
              if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
                {
                  gimple prev_stmt =
                    STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
                  gimple rel_stmt =
                    STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
                  while (rel_stmt)
                    {
                      prev_stmt = rel_stmt;
                      rel_stmt =
                        STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
                    }

                  STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
                    new_stmt;
                }
            }

          next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
          gap_count = 1;
          /* If NEXT_STMT accesses the same DR as the previous statement,
             put the same TMP_DATA_REF as its vectorized statement; otherwise
             get the next data-ref from RESULT_CHAIN.  */
          if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
            break;
        }
    }
}
/* Function vect_can_force_dr_alignment_p.

   Returns whether the alignment of a DECL can be forced to be aligned
   on ALIGNMENT bit boundary.  */

bool
vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
{
  if (TREE_CODE (decl) != VAR_DECL)
    return false;

  if (decl_in_symtab_p (decl)
      && !symtab_node::get (decl)->can_increase_alignment_p ())
    return false;

  if (TREE_STATIC (decl))
    return (alignment <= MAX_OFILE_ALIGNMENT);
  else
    return (alignment <= MAX_STACK_ALIGNMENT);
}
/* Return whether the data reference DR is supported with respect to its
   alignment.
   If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
   if it is aligned, i.e., check if it is possible to vectorize it with
   different alignment.  */

enum dr_alignment_support
vect_supportable_dr_alignment (struct data_reference *dr,
                               bool check_aligned_accesses)
{
  gimple stmt = DR_STMT (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  machine_mode mode = TYPE_MODE (vectype);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *vect_loop = NULL;
  bool nested_in_vect_loop = false;

  if (aligned_access_p (dr) && !check_aligned_accesses)
    return dr_aligned;

  /* For now assume all conditional loads/stores support unaligned
     access without any special code.  */
  if (is_gimple_call (stmt)
      && gimple_call_internal_p (stmt)
      && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
          || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
    return dr_unaligned_supported;

  if (loop_vinfo)
    {
      vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
      nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
    }

  /* Possibly unaligned access.  */

  /* We can choose between using the implicit realignment scheme (generating
     a misaligned_move stmt) and the explicit realignment scheme (generating
     aligned loads with a REALIGN_LOAD).  There are two variants to the
     explicit realignment scheme: optimized, and unoptimized.
     We can optimize the realignment only if the step between consecutive
     vector loads is equal to the vector size.  Since the vector memory
     accesses advance in steps of VS (Vector Size) in the vectorized loop, it
     is guaranteed that the misalignment amount remains the same throughout the
     execution of the vectorized loop.  Therefore, we can create the
     "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
     at the loop preheader.

     However, in the case of outer-loop vectorization, when vectorizing a
     memory access in the inner-loop nested within the LOOP that is now being
     vectorized, while it is guaranteed that the misalignment of the
     vectorized memory access will remain the same in different outer-loop
     iterations, it is *not* guaranteed that it will remain the same throughout
     the execution of the inner-loop.  This is because the inner-loop advances
     with the original scalar step (and not in steps of VS).  If the inner-loop
     step happens to be a multiple of VS, then the misalignment remains fixed
     and we can use the optimized realignment scheme.  For example:

      for (i=0; i<N; i++)
        for (j=0; j<M; j++)
          s += a[i+j];

     When vectorizing the i-loop in the above example, the step between
     consecutive vector loads is 1, and so the misalignment does not remain
     fixed across the execution of the inner-loop, and the realignment cannot
     be optimized (as illustrated in the following pseudo vectorized loop):

      for (i=0; i<N; i+=4)
        for (j=0; j<M; j++){
          vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
                         // when j is {0,1,2,3,4,5,6,7,...} respectively.
                         // (assuming that we start from an aligned address).
          }

     We therefore have to use the unoptimized realignment scheme:

      for (i=0; i<N; i+=4)
        for (j=k; j<M; j+=4)
          vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
                         // that the misalignment of the initial address is
                         // 0).

     The loop can then be vectorized as follows:

      for (k=0; k<4; k++){
        rt = get_realignment_token (&vp[k]);
        for (i=0; i<N; i+=4){
          v1 = vp[i+k];
          for (j=k; j<M; j+=4){
            v2 = vp[i+j+VS-1];
            va = REALIGN_LOAD <v1,v2,rt>;
            vs += va;
            v1 = v2;
          }
        }
      } */

  if (DR_IS_READ (dr))
    {
      bool is_packed = false;
      tree type = (TREE_TYPE (DR_REF (dr)));

      if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
          && (!targetm.vectorize.builtin_mask_for_load
              || targetm.vectorize.builtin_mask_for_load ()))
        {
          tree vectype = STMT_VINFO_VECTYPE (stmt_info);
          if ((nested_in_vect_loop
               && (TREE_INT_CST_LOW (DR_STEP (dr))
                   != GET_MODE_SIZE (TYPE_MODE (vectype))))
              || !loop_vinfo)
            return dr_explicit_realign;
          else
            return dr_explicit_realign_optimized;
        }
      if (!known_alignment_for_access_p (dr))
        is_packed = not_size_aligned (DR_REF (dr));

      if ((TYPE_USER_ALIGN (type) && !is_packed)
          || targetm.vectorize.
               support_vector_misalignment (mode, type,
                                            DR_MISALIGNMENT (dr), is_packed))
        /* Can't software pipeline the loads, but can at least do them.  */
        return dr_unaligned_supported;
    }
  else
    {
      bool is_packed = false;
      tree type = (TREE_TYPE (DR_REF (dr)));

      if (!known_alignment_for_access_p (dr))
        is_packed = not_size_aligned (DR_REF (dr));

      if ((TYPE_USER_ALIGN (type) && !is_packed)
          || targetm.vectorize.
               support_vector_misalignment (mode, type,
                                            DR_MISALIGNMENT (dr), is_packed))
        return dr_unaligned_supported;
    }

  /* Unsupported.  */
  return dr_unaligned_unsupported;
}