/* Data References Analysis and Manipulation Utilities for Vectorization.
   Copyright (C) 2003-2017 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
#include "coretypes.h"
#include "optabs-tree.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "gimple-iterator.h"
#include "gimplify-me.h"
#include "tree-ssa-loop-ivopts.h"
#include "tree-ssa-loop-manip.h"
#include "tree-ssa-loop.h"
#include "tree-scalar-evolution.h"
#include "tree-vectorizer.h"
#include "tree-hash-traits.h"
#include "vec-perm-indices.h"
/* Return true if load- or store-lanes optab OPTAB is implemented for
   COUNT vectors of type VECTYPE.  NAME is the name of OPTAB.  */

static bool
vect_lanes_optab_supported_p (const char *name, convert_optab optab,
			      tree vectype, unsigned HOST_WIDE_INT count)
{
  machine_mode mode;
  scalar_int_mode array_mode;
  bool limit_p;

  mode = TYPE_MODE (vectype);
  limit_p = !targetm.array_mode_supported_p (mode, count);
  if (!int_mode_for_size (count * GET_MODE_BITSIZE (mode),
			  limit_p).exists (&array_mode))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
			 GET_MODE_NAME (mode), count);
      return false;
    }

  if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "cannot use %s<%s><%s>\n", name,
			 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
      return false;
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
		     GET_MODE_NAME (mode));

  return true;
}
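
/* As an illustration of the check above: for COUNT == 3 vectors of a
   128-bit VECTYPE the required array mode is a 384-bit integer mode, and
   the target must provide a matching load/store-lanes pattern (e.g.
   AArch64's LD3/ST3) for the optab lookup to succeed.  */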
/* Return the smallest scalar part of STMT.
   This is used to determine the vectype of the stmt.  We generally set the
   vectype according to the type of the result (lhs).  For stmts whose
   result-type is different than the type of the arguments (e.g., demotion,
   promotion), vectype will be reset appropriately (later).  Note that we have
   to visit the smallest datatype in this function, because that determines the
   VF.  If the smallest datatype in the loop is present only as the rhs of a
   promotion operation - we'd miss it.
   Such a case, where a variable of this datatype does not appear in the lhs
   anywhere in the loop, can only occur if it's an invariant: e.g.:
   'int_x = (int) short_inv', which we'd expect to have been optimized away by
   invariant motion.  However, we cannot rely on invariant motion to always
   take invariants out of the loop, and so in the case of promotion we also
   have to check the rhs.
   LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
   types.  */

tree
vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
			       HOST_WIDE_INT *rhs_size_unit)
{
  tree scalar_type = gimple_expr_type (stmt);
  HOST_WIDE_INT lhs, rhs;

  /* During the analysis phase, this function is called on arbitrary
     statements that might not have scalar results.  */
  if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
    return scalar_type;

  lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));

  if (is_gimple_assign (stmt)
      && (gimple_assign_cast_p (stmt)
	  || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
	  || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
	  || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
    {
      tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));

      rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
      if (rhs < lhs)
	scalar_type = rhs_type;
    }

  *lhs_size_unit = lhs;
  *rhs_size_unit = rhs;
  return scalar_type;
}
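
/* As an illustration: for 'int_x = (int) short_y' the result type is int
   but the smallest scalar type involved is short, so *LHS_SIZE_UNIT is 4,
   *RHS_SIZE_UNIT is 2, and the returned type (and hence the VF) is based
   on the 2-byte type.  */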
/* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
   tested at run-time.  Return TRUE if DDR was successfully inserted.
   Return false if versioning is not supported.  */

static bool
vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

  if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
    return false;

  if (!runtime_alias_check_p (ddr, loop,
			      optimize_loop_nest_for_speed_p (loop)))
    return false;

  LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
  return true;
}
/* A subroutine of vect_analyze_data_ref_dependence.  Handle
   DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
   distances.  These distances are conservatively correct but they don't
   reflect a guaranteed dependence.

   Return true if this function does all the work necessary to avoid
   an alias or false if the caller should use the dependence distances
   to limit the vectorization factor in the usual way.  LOOP_DEPTH is
   the depth of the loop described by LOOP_VINFO and the other arguments
   are as for vect_analyze_data_ref_dependence.  */

static bool
vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
				       loop_vec_info loop_vinfo,
				       int loop_depth, int *max_vf)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  lambda_vector dist_v;
  unsigned int i;
  FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
    {
      int dist = dist_v[loop_depth];
      if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
	{
	  /* If the user asserted safelen >= DIST consecutive iterations
	     can be executed concurrently, assume independence.

	     ??? An alternative would be to add the alias check even
	     in this case, and vectorize the fallback loop with the
	     maximum VF set to safelen.  However, if the user has
	     explicitly given a length, it's less likely that that
	     would be a problem.  */
	  if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
	    {
	      if (loop->safelen < *max_vf)
		*max_vf = loop->safelen;
	      LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
	      continue;
	    }

	  /* For dependence distances of 2 or more, we have the option
	     of limiting VF or checking for an alias at runtime.
	     Prefer to check at runtime if we can, to avoid limiting
	     the VF unnecessarily when the bases are in fact independent.

	     Note that the alias checks will be removed if the VF ends up
	     being small enough.  */
	  return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
	}
    }
  return true;
}
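
/* As an illustration: with a conservative distance of 4 and a user
   assertion of safelen >= 8 the accesses are treated as independent and
   *MAX_VF is merely capped at the safelen value; without safelen a
   runtime alias check is attempted instead of limiting the VF.  */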
/* Function vect_analyze_data_ref_dependence.

   Return TRUE if there (might) exist a dependence between a memory-reference
   DRA and a memory-reference DRB.  When versioning for alias may check a
   dependence at run-time, return FALSE.  Adjust *MAX_VF according to
   the data dependence.  */

static bool
vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
				  loop_vec_info loop_vinfo, int *max_vf)
{
  unsigned int i;
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  struct data_reference *dra = DDR_A (ddr);
  struct data_reference *drb = DDR_B (ddr);
  stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
  stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
  lambda_vector dist_v;
  unsigned int loop_depth;

  /* In loop analysis all data references should be vectorizable.  */
  if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
      || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
    return false;

  /* Independent data accesses.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
    return false;

  if (dra == drb
      || (DR_IS_READ (dra) && DR_IS_READ (drb)))
    return false;

  /* We do not have to consider dependences between accesses that belong
     to the same group.  */
  if (GROUP_FIRST_ELEMENT (stmtinfo_a)
      && GROUP_FIRST_ELEMENT (stmtinfo_a) == GROUP_FIRST_ELEMENT (stmtinfo_b))
    return false;

  /* Even if we have an anti-dependence then, as the vectorized loop covers at
     least two scalar iterations, there is always also a true dependence.
     As the vectorizer does not re-order loads and stores we can ignore
     the anti-dependence if TBAA can disambiguate both DRs similar to the
     case with known negative distance anti-dependences (positive
     distance anti-dependences would violate TBAA constraints).  */
  if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
       || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
      && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
				 get_alias_set (DR_REF (drb))))
    return false;

  /* Unknown data dependence.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
    {
      /* If user asserted safelen consecutive iterations can be
	 executed concurrently, assume independence.  */
      if (loop->safelen >= 2)
	{
	  if (loop->safelen < *max_vf)
	    *max_vf = loop->safelen;
	  LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
	  return false;
	}

      if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
	  || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
	{
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			       "versioning for alias not supported for: "
			       "can't determine dependence between ");
	      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
				 DR_REF (dra));
	      dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
	      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
				 DR_REF (drb));
	      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
	    }
	  return true;
	}

      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			   "versioning for alias required: "
			   "can't determine dependence between ");
	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
			     DR_REF (dra));
	  dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
			     DR_REF (drb));
	  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
	}

      /* Add to list of ddrs that need to be tested at run-time.  */
      return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
    }

  /* Known data dependence.  */
  if (DDR_NUM_DIST_VECTS (ddr) == 0)
    {
      /* If user asserted safelen consecutive iterations can be
	 executed concurrently, assume independence.  */
      if (loop->safelen >= 2)
	{
	  if (loop->safelen < *max_vf)
	    *max_vf = loop->safelen;
	  LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
	  return false;
	}

      if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
	  || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
	{
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			       "versioning for alias not supported for: "
			       "bad dist vector for ");
	      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
				 DR_REF (dra));
	      dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
	      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
				 DR_REF (drb));
	      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
	    }
	  return true;
	}

      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			   "versioning for alias required: "
			   "bad dist vector for ");
	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
	  dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
	  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
	}

      /* Add to list of ddrs that need to be tested at run-time.  */
      return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
    }

  loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));

  if (DDR_COULD_BE_INDEPENDENT_P (ddr)
      && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
						loop_depth, max_vf))
    return false;

  FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
    {
      int dist = dist_v[loop_depth];

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "dependence distance  = %d.\n", dist);

      if (dist == 0)
	{
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_NOTE, vect_location,
			       "dependence distance == 0 between ");
	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
	      dump_printf (MSG_NOTE, " and ");
	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
	      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
	    }

	  /* When we perform grouped accesses and perform implicit CSE
	     by detecting equal accesses and doing disambiguation with
	     runtime alias tests like for
		.. = a[i];
		.. = a[i+1];
		a[i] = ..;
		a[i+1] = ..;
	     where we will end up loading { a[i], a[i+1] } once, make
	     sure that inserting group loads before the first load and
	     stores after the last store will do the right thing.
	     Similar for groups like
		a[i] = ...;
		... = a[i];
		a[i+1] = ...;
	     where loads from the group interleave with the store.  */
	  if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
	      || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
	    {
	      gimple *earlier_stmt;
	      earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
	      if (DR_IS_WRITE
		    (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "READ_WRITE dependence in interleaving."
				     "\n");
		  return true;
		}
	    }

	  continue;
	}

      if (dist > 0 && DDR_REVERSED_P (ddr))
	{
	  /* If DDR_REVERSED_P the order of the data-refs in DDR was
	     reversed (to make distance vector positive), and the actual
	     distance is negative.  */
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "dependence distance negative.\n");
	  /* Record a negative dependence distance to later limit the
	     amount of stmt copying / unrolling we can perform.
	     Only need to handle read-after-write dependence.  */
	  if (DR_IS_READ (drb)
	      && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
		  || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
	    STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
	  continue;
	}

      if (abs (dist) >= 2
	  && abs (dist) < *max_vf)
	{
	  /* The dependence distance requires reduction of the maximal
	     vectorization factor.  */
	  *max_vf = abs (dist);
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "adjusting maximal vectorization factor to %i\n",
			     *max_vf);
	}

      if (abs (dist) >= *max_vf)
	{
	  /* Dependence distance does not create dependence, as far as
	     vectorization is concerned, in this case.  */
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "dependence distance >= VF.\n");
	  continue;
	}

      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			   "not vectorized, possible dependence "
			   "between data-refs ");
	  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
	  dump_printf (MSG_NOTE, " and ");
	  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
	  dump_printf (MSG_NOTE, "\n");
	}

      return true;
    }

  return false;
}
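
/* As an illustration: a loop containing 'a[i+3] = a[i] + 1' has a known
   dependence distance of 3, so the code above reduces *MAX_VF to 3 when
   it was larger, whereas an unknown dependence instead results in a
   runtime alias check via vect_mark_for_runtime_alias_test.  */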
/* Function vect_analyze_data_ref_dependences.

   Examine all the data references in the loop, and make sure there do not
   exist any data dependences between them.  Set *MAX_VF according to
   the maximum vectorization factor the data dependences allow.  */

bool
vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
{
  unsigned int i;
  struct data_dependence_relation *ddr;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "=== vect_analyze_data_ref_dependences ===\n");

  LOOP_VINFO_DDRS (loop_vinfo)
    .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
	     * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
  LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
  /* We need read-read dependences to compute STMT_VINFO_SAME_ALIGN_REFS.  */
  if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
				&LOOP_VINFO_DDRS (loop_vinfo),
				LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
    return false;

  /* For epilogues we either have no aliases or alias versioning
     was applied to original loop.  Therefore we may just get max_vf
     using VF of original loop.  */
  if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
    *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
  else
    FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
      if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
	return false;

  return true;
}
/* Function vect_slp_analyze_data_ref_dependence.

   Return TRUE if there (might) exist a dependence between a memory-reference
   DRA and a memory-reference DRB.  When versioning for alias may check a
   dependence at run-time, return FALSE.  */

static bool
vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
{
  struct data_reference *dra = DDR_A (ddr);
  struct data_reference *drb = DDR_B (ddr);

  /* We need to check dependences of statements marked as unvectorizable
     as well, they still can prohibit vectorization.  */

  /* Independent data accesses.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
    return false;

  if (dra == drb)
    return false;

  /* Read-read is OK.  */
  if (DR_IS_READ (dra) && DR_IS_READ (drb))
    return false;

  /* If dra and drb are part of the same interleaving chain consider
     them independent.  */
  if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
      && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
	  == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
    return false;

  /* Unknown data dependence.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
    {
      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			   "can't determine dependence between ");
	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
	  dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
	  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
	}
    }
  else if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location,
		       "determined dependence between ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
      dump_printf (MSG_NOTE, " and ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
      dump_printf (MSG_NOTE, "\n");
    }

  return true;
}
/* Analyze dependences involved in the transform of SLP NODE.  STORES
   contain the vector of scalar stores of this instance if we are
   disambiguating the loads.  */

static bool
vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
				   vec<gimple *> stores, gimple *last_store)
{
  /* This walks over all stmts involved in the SLP load/store done
     in NODE verifying we can sink them up to the last stmt in the
     group.  */
  gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
  for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
    {
      gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
      if (access == last_access)
	continue;
      data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
      for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
	   gsi_stmt (gsi) != last_access; gsi_next (&gsi))
	{
	  gimple *stmt = gsi_stmt (gsi);
	  if (! gimple_vuse (stmt)
	      || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
	    continue;

	  /* If we couldn't record a (single) data reference for this
	     stmt we have to give up.  */
	  /* ??? Here and below if dependence analysis fails we can resort
	     to the alias oracle which can handle more kinds of stmts.  */
	  data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
	  if (!dr_b)
	    return false;

	  bool dependent = false;
	  /* If we run into a store of this same instance (we've just
	     marked those) then delay dependence checking until we run
	     into the last store because this is where it will have
	     been sunk to (and we verify if we can do that as well).  */
	  if (gimple_visited_p (stmt))
	    {
	      if (stmt != last_store)
		continue;
	      unsigned i;
	      gimple *store;
	      FOR_EACH_VEC_ELT (stores, i, store)
		{
		  data_reference *store_dr
		    = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
		  ddr_p ddr = initialize_data_dependence_relation
				(dr_a, store_dr, vNULL);
		  dependent = vect_slp_analyze_data_ref_dependence (ddr);
		  free_dependence_relation (ddr);
		  if (dependent)
		    break;
		}
	    }
	  else
	    {
	      ddr_p ddr = initialize_data_dependence_relation (dr_a,
							       dr_b, vNULL);
	      dependent = vect_slp_analyze_data_ref_dependence (ddr);
	      free_dependence_relation (ddr);
	    }
	  if (dependent)
	    return false;
	}
    }
  return true;
}
/* Function vect_slp_analyze_instance_dependence.

   Examine all the data references in the basic-block, and make sure there
   do not exist any data dependences between them.  */

bool
vect_slp_analyze_instance_dependence (slp_instance instance)
{
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "=== vect_slp_analyze_instance_dependence ===\n");

  /* The stores of this instance are at the root of the SLP tree.  */
  slp_tree store = SLP_INSTANCE_TREE (instance);
  if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
    store = NULL;

  /* Verify we can sink stores to the vectorized stmt insert location.  */
  gimple *last_store = NULL;
  if (store)
    {
      if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
	return false;

      /* Mark stores in this instance and remember the last one.  */
      last_store = vect_find_last_scalar_stmt_in_slp (store);
      for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
	gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
    }

  bool res = true;

  /* Verify we can sink loads to the vectorized stmt insert location,
     special-casing stores of this instance.  */
  slp_tree load;
  unsigned int i;
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
    if (! vect_slp_analyze_node_dependences (instance, load,
					     store
					     ? SLP_TREE_SCALAR_STMTS (store)
					     : vNULL, last_store))
      {
	res = false;
	break;
      }

  /* Unset the visited flag.  */
  if (store)
    for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
      gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);

  return res;
}
/* Record in VINFO the base alignment guarantee given by DRB.  STMT is
   the statement that contains DRB, which is useful for recording in the
   dump file.  */

static void
vect_record_base_alignment (vec_info *vinfo, gimple *stmt,
			    innermost_loop_behavior *drb)
{
  bool existed;
  innermost_loop_behavior *&entry
    = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
  if (!existed || entry->base_alignment < drb->base_alignment)
    {
      entry = drb;
      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_NOTE, vect_location,
			   "recording new base alignment for ");
	  dump_generic_expr (MSG_NOTE, TDF_SLIM, drb->base_address);
	  dump_printf (MSG_NOTE, "\n");
	  dump_printf_loc (MSG_NOTE, vect_location,
			   "  alignment:    %d\n", drb->base_alignment);
	  dump_printf_loc (MSG_NOTE, vect_location,
			   "  misalignment: %d\n", drb->base_misalignment);
	  dump_printf_loc (MSG_NOTE, vect_location,
			   "  based on:     ");
	  dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
	}
    }
}
/* If the region we're going to vectorize is reached, all unconditional
   data references occur at least once.  We can therefore pool the base
   alignment guarantees from each unconditional reference.  Do this by
   going through all the data references in VINFO and checking whether
   the containing statement makes the reference unconditionally.  If so,
   record the alignment of the base address in VINFO so that it can be
   used for all other references with the same base.  */

void
vect_record_base_alignments (vec_info *vinfo)
{
  loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
  struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
  data_reference *dr;
  unsigned int i;
  FOR_EACH_VEC_ELT (vinfo->datarefs, i, dr)
    if (!DR_IS_CONDITIONAL_IN_STMT (dr))
      {
	gimple *stmt = DR_STMT (dr);
	vect_record_base_alignment (vinfo, stmt, &DR_INNERMOST (dr));

	/* If DR is nested in the loop that is being vectorized, we can also
	   record the alignment of the base wrt the outer loop.  */
	if (loop && nested_in_vect_loop_p (loop, stmt))
	  {
	    stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
	    vect_record_base_alignment
	      (vinfo, stmt, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
	  }
      }
}
/* Return the target alignment for the vectorized form of DR.  */

static unsigned int
vect_calculate_target_alignment (struct data_reference *dr)
{
  gimple *stmt = DR_STMT (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  return targetm.vectorize.preferred_vector_alignment (vectype);
}
/* Function vect_compute_data_ref_alignment

   Compute the misalignment of the data reference DR.

   Output:
   1. If during the misalignment computation it is found that the data reference
      cannot be vectorized then false is returned.
   2. DR_MISALIGNMENT (DR) is defined.

   FOR NOW: No analysis is actually performed.  Misalignment is calculated
   only for trivial cases.  TODO.  */

bool
vect_compute_data_ref_alignment (struct data_reference *dr)
{
  gimple *stmt = DR_STMT (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = NULL;
  tree ref = DR_REF (dr);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "vect_compute_data_ref_alignment:\n");

  if (loop_vinfo)
    loop = LOOP_VINFO_LOOP (loop_vinfo);

  /* Initialize misalignment to unknown.  */
  SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);

  innermost_loop_behavior *drb = vect_dr_behavior (dr);
  bool step_preserves_misalignment_p;

  unsigned HOST_WIDE_INT vector_alignment
    = vect_calculate_target_alignment (dr) / BITS_PER_UNIT;
  DR_TARGET_ALIGNMENT (dr) = vector_alignment;

  /* No step for BB vectorization.  */
  if (!loop)
    {
      gcc_assert (integer_zerop (drb->step));
      step_preserves_misalignment_p = true;
    }

  /* In case the dataref is in an inner-loop of the loop that is being
     vectorized (LOOP), we use the base and misalignment information
     relative to the outer-loop (LOOP).  This is ok only if the misalignment
     stays the same throughout the execution of the inner-loop, which is why
     we have to check that the stride of the dataref in the inner-loop evenly
     divides by the vector alignment.  */
  else if (nested_in_vect_loop_p (loop, stmt))
    {
      step_preserves_misalignment_p
	= (DR_STEP_ALIGNMENT (dr) % vector_alignment) == 0;

      if (dump_enabled_p ())
	{
	  if (step_preserves_misalignment_p)
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "inner step divides the vector alignment.\n");
	  else
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "inner step doesn't divide the vector"
			     " alignment.\n");
	}
    }

  /* Similarly we can only use base and misalignment information relative to
     an innermost loop if the misalignment stays the same throughout the
     execution of the loop.  As above, this is the case if the stride of
     the dataref evenly divides by the alignment.  */
  else
    {
      unsigned vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
      step_preserves_misalignment_p
	= ((DR_STEP_ALIGNMENT (dr) * vf) % vector_alignment) == 0;

      if (!step_preserves_misalignment_p && dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "step doesn't divide the vector alignment.\n");
    }

  unsigned int base_alignment = drb->base_alignment;
  unsigned int base_misalignment = drb->base_misalignment;

  /* Calculate the maximum of the pooled base address alignment and the
     alignment that we can compute for DR itself.  */
  innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
  if (entry && base_alignment < (*entry)->base_alignment)
    {
      base_alignment = (*entry)->base_alignment;
      base_misalignment = (*entry)->base_misalignment;
    }

  if (drb->offset_alignment < vector_alignment
      || !step_preserves_misalignment_p
      /* We need to know whether the step wrt the vectorized loop is
	 negative when computing the starting misalignment below.  */
      || TREE_CODE (drb->step) != INTEGER_CST)
    {
      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			   "Unknown alignment for access: ");
	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
	  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
	}
      return true;
    }

  if (base_alignment < vector_alignment)
    {
      tree base = drb->base_address;
      if (TREE_CODE (base) == ADDR_EXPR)
	base = TREE_OPERAND (base, 0);
      if (!vect_can_force_dr_alignment_p (base,
					  vector_alignment * BITS_PER_UNIT))
	{
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_NOTE, vect_location,
			       "can't force alignment of ref: ");
	      dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
	      dump_printf (MSG_NOTE, "\n");
	    }
	  return true;
	}

      if (DECL_USER_ALIGN (base))
	{
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_NOTE, vect_location,
			       "not forcing alignment of user-aligned "
			       "variable:  ");
	      dump_generic_expr (MSG_NOTE, TDF_SLIM, base);
	      dump_printf (MSG_NOTE, "\n");
	    }
	  return true;
	}

      /* Force the alignment of the decl.
	 NOTE: This is the only change to the code we make during
	 the analysis phase, before deciding to vectorize the loop.  */
      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
	  dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
	  dump_printf (MSG_NOTE, "\n");
	}

      DR_VECT_AUX (dr)->base_decl = base;
      DR_VECT_AUX (dr)->base_misaligned = true;
      base_misalignment = 0;
    }
  poly_int64 misalignment
    = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();

  /* If this is a backward running DR then first access in the larger
     vectype actually is N-1 elements before the address in the DR.
     Adjust misalign accordingly.  */
  if (tree_int_cst_sgn (drb->step) < 0)
    /* PLUS because STEP is negative.  */
    misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
		     * TREE_INT_CST_LOW (drb->step));

  unsigned int const_misalignment;
  if (!known_misalignment (misalignment, vector_alignment,
			   &const_misalignment))
    {
      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			   "Non-constant misalignment for access: ");
	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
	  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
	}
      return true;
    }

  SET_DR_MISALIGNMENT (dr, const_misalignment);

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
		       "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
    }

  return true;
}
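
/* As an illustration of the computation above: with a 16-byte vector
   alignment, a base whose pooled base_alignment is 16 with
   base_misalignment 0 and a constant init offset of 20 yields
   misalignment 20, which known_misalignment reduces modulo 16 to a
   recorded DR_MISALIGNMENT of 4 bytes.  */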
/* Function vect_update_misalignment_for_peel.
   Sets DR's misalignment
   - to 0 if it has the same alignment as DR_PEEL,
   - to the misalignment computed using NPEEL if DR's misalignment is known,
   - to -1 (unknown) otherwise.

   DR - the data reference whose misalignment is to be adjusted.
   DR_PEEL - the data reference whose misalignment is being made
	     zero in the vector loop by the peel.
   NPEEL - the number of iterations in the peel loop if the misalignment
	   of DR_PEEL is known at compile time.  */

static void
vect_update_misalignment_for_peel (struct data_reference *dr,
				   struct data_reference *dr_peel, int npeel)
{
  unsigned int i;
  vec<dr_p> same_aligned_drs;
  struct data_reference *current_dr;
  int dr_size = vect_get_scalar_dr_size (dr);
  int dr_peel_size = vect_get_scalar_dr_size (dr_peel);
  stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
  stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));

  /* For interleaved data accesses the step in the loop must be multiplied by
     the size of the interleaving group.  */
  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
  if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
    dr_peel_size *= GROUP_SIZE (peel_stmt_info);

  /* It can be assumed that the data refs with the same alignment as dr_peel
     are aligned in the vector loop.  */
  same_aligned_drs
    = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
  FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
    {
      if (current_dr != dr)
	continue;
      gcc_assert (!known_alignment_for_access_p (dr)
		  || !known_alignment_for_access_p (dr_peel)
		  || (DR_MISALIGNMENT (dr) / dr_size
		      == DR_MISALIGNMENT (dr_peel) / dr_peel_size));
      SET_DR_MISALIGNMENT (dr, 0);
      return;
    }

  if (known_alignment_for_access_p (dr)
      && known_alignment_for_access_p (dr_peel))
    {
      bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
      int misal = DR_MISALIGNMENT (dr);
      misal += negative ? -npeel * dr_size : npeel * dr_size;
      misal &= DR_TARGET_ALIGNMENT (dr) - 1;
      SET_DR_MISALIGNMENT (dr, misal);
      return;
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
		     "to unknown (-1).\n");
  SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
}
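
/* As an illustration: peeling NPEEL == 3 iterations of a forward access
   with 4-byte elements and DR_MISALIGNMENT 4 against a 16-byte target
   alignment gives (4 + 3 * 4) & 15 == 0, i.e. the access becomes aligned
   in the vector loop.  */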
/* Function verify_data_ref_alignment

   Return TRUE if DR can be handled with respect to alignment.  */

static bool
verify_data_ref_alignment (data_reference_p dr)
{
  enum dr_alignment_support supportable_dr_alignment
    = vect_supportable_dr_alignment (dr, false);
  if (!supportable_dr_alignment)
    {
      if (dump_enabled_p ())
	{
	  if (DR_IS_READ (dr))
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "not vectorized: unsupported unaligned load.");
	  else
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "not vectorized: unsupported unaligned "
			     "store.");

	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
			     DR_REF (dr));
	  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
	}
      return false;
    }

  if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "Vectorizing an unaligned access.\n");

  return true;
}
/* Function vect_verify_datarefs_alignment

   Return TRUE if all data references in the loop can be
   handled with respect to alignment.  */

bool
vect_verify_datarefs_alignment (loop_vec_info vinfo)
{
  vec<data_reference_p> datarefs = vinfo->datarefs;
  struct data_reference *dr;
  unsigned int i;

  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      gimple *stmt = DR_STMT (dr);
      stmt_vec_info stmt_info = vinfo_for_stmt (stmt);

      if (!STMT_VINFO_RELEVANT_P (stmt_info))
	continue;

      /* For interleaving, only the alignment of the first access matters.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
	  && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
	continue;

      /* Strided accesses perform only component accesses, alignment is
	 irrelevant for them.  */
      if (STMT_VINFO_STRIDED_P (stmt_info)
	  && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
	continue;

      if (! verify_data_ref_alignment (dr))
	return false;
    }

  return true;
}
/* Given a memory reference EXP return whether its alignment is less
   than its size.  */

static bool
not_size_aligned (tree exp)
{
  if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
    return true;

  return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
	  > get_object_alignment (exp));
}
/* Function vector_alignment_reachable_p

   Return true if vector alignment for DR is reachable by peeling
   a few loop iterations.  Return false otherwise.  */

static bool
vector_alignment_reachable_p (struct data_reference *dr)
{
  gimple *stmt = DR_STMT (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);

  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    {
      /* For interleaved access we peel only if number of iterations in
	 the prolog loop ({VF - misalignment}), is a multiple of the
	 number of the interleaved accesses.  */
      int elem_size, mis_in_elements;
      int nelements = TYPE_VECTOR_SUBPARTS (vectype);

      /* FORNOW: handle only known alignment.  */
      if (!known_alignment_for_access_p (dr))
	return false;

      elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
      mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;

      if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
	return false;
    }

  /* If misalignment is known at the compile time then allow peeling
     only if natural alignment is reachable through peeling.  */
  if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
    {
      HOST_WIDE_INT elmsize =
		int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_NOTE, vect_location,
			   "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
	  dump_printf (MSG_NOTE,
		       ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
	}
      if (DR_MISALIGNMENT (dr) % elmsize)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "data size does not divide the misalignment.\n");
	  return false;
	}
    }

  if (!known_alignment_for_access_p (dr))
    {
      tree type = TREE_TYPE (DR_REF (dr));
      bool is_packed = not_size_aligned (DR_REF (dr));
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "Unknown misalignment, %snaturally aligned\n",
			 is_packed ? "not " : "");
      return targetm.vectorize.vector_alignment_reachable (type, is_packed);
    }

  return true;
}
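
/* As an illustration: a 4-byte element with a known misalignment of
   2 bytes can never reach natural alignment by peeling whole scalar
   iterations (2 % 4 != 0), so the check above rejects peeling for it.  */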
/* Calculate the cost of the memory access represented by DR.  */

static void
vect_get_data_access_cost (struct data_reference *dr,
			   unsigned int *inside_cost,
			   unsigned int *outside_cost,
			   stmt_vector_for_cost *body_cost_vec)
{
  gimple *stmt = DR_STMT (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  int ncopies;

  if (PURE_SLP_STMT (stmt_info))
    ncopies = 1;
  else
    ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));

  if (DR_IS_READ (dr))
    vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
			NULL, body_cost_vec, false);
  else
    vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "vect_get_data_access_cost: inside_cost = %d, "
		     "outside_cost = %d.\n", *inside_cost, *outside_cost);
}
typedef struct _vect_peel_info
{
  struct data_reference *dr;
  int npeel;
  unsigned int count;
} *vect_peel_info;

typedef struct _vect_peel_extended_info
{
  struct _vect_peel_info peel_info;
  unsigned int inside_cost;
  unsigned int outside_cost;
} *vect_peel_extended_info;


/* Peeling hashtable helpers.  */

struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
{
  static inline hashval_t hash (const _vect_peel_info *);
  static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
};

inline hashval_t
peel_info_hasher::hash (const _vect_peel_info *peel_info)
{
  return (hashval_t) peel_info->npeel;
}

inline bool
peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
{
  return (a->npeel == b->npeel);
}
/* Insert DR into peeling hash table with NPEEL as key.  */

static void
vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
			  loop_vec_info loop_vinfo, struct data_reference *dr,
			  int npeel)
{
  struct _vect_peel_info elem, *slot;
  _vect_peel_info **new_slot;
  bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);

  elem.npeel = npeel;
  slot = peeling_htab->find (&elem);
  if (slot)
    slot->count++;
  else
    {
      slot = XNEW (struct _vect_peel_info);
      slot->npeel = npeel;
      slot->dr = dr;
      slot->count = 1;
      new_slot = peeling_htab->find_slot (slot, INSERT);
      *new_slot = slot;
    }

  if (!supportable_dr_alignment
      && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
    slot->count += VECT_MAX_COST;
}
/* Traverse peeling hash table to find peeling option that aligns maximum
   number of data accesses.  */

int
vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
				     _vect_peel_extended_info *max)
{
  vect_peel_info elem = *slot;

  if (elem->count > max->peel_info.count
      || (elem->count == max->peel_info.count
	  && max->peel_info.npeel > elem->npeel))
    {
      max->peel_info.npeel = elem->npeel;
      max->peel_info.count = elem->count;
      max->peel_info.dr = elem->dr;
    }

  return 1;
}
/* Get the costs of peeling NPEEL iterations checking data access costs
   for all data refs.  If UNKNOWN_MISALIGNMENT is true, we assume DR0's
   misalignment will be zero after peeling.  */

static void
vect_get_peeling_costs_all_drs (vec<data_reference_p> datarefs,
				struct data_reference *dr0,
				unsigned int *inside_cost,
				unsigned int *outside_cost,
				stmt_vector_for_cost *body_cost_vec,
				unsigned int npeel,
				bool unknown_misalignment)
{
  unsigned i;
  data_reference *dr;

  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      gimple *stmt = DR_STMT (dr);
      stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
      if (!STMT_VINFO_RELEVANT_P (stmt_info))
	continue;

      /* For interleaving, only the alignment of the first access
	 matters.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
	  && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
	continue;

      /* Strided accesses perform only component accesses, alignment is
	 irrelevant for them.  */
      if (STMT_VINFO_STRIDED_P (stmt_info)
	  && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
	continue;

      int save_misalignment;
      save_misalignment = DR_MISALIGNMENT (dr);
      if (npeel == 0)
	;
      else if (unknown_misalignment && dr == dr0)
	SET_DR_MISALIGNMENT (dr, 0);
      else
	vect_update_misalignment_for_peel (dr, dr0, npeel);
      vect_get_data_access_cost (dr, inside_cost, outside_cost,
				 body_cost_vec);
      SET_DR_MISALIGNMENT (dr, save_misalignment);
    }
}
/* Traverse peeling hash table and calculate cost for each peeling option.
   Find the one with the lowest cost.  */

int
vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
				   _vect_peel_extended_info *min)
{
  vect_peel_info elem = *slot;
  int dummy;
  unsigned int inside_cost = 0, outside_cost = 0;
  gimple *stmt = DR_STMT (elem->dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
		       epilogue_cost_vec;

  prologue_cost_vec.create (2);
  body_cost_vec.create (2);
  epilogue_cost_vec.create (2);

  vect_get_peeling_costs_all_drs (LOOP_VINFO_DATAREFS (loop_vinfo),
				  elem->dr, &inside_cost, &outside_cost,
				  &body_cost_vec, elem->npeel, false);

  body_cost_vec.release ();

  outside_cost += vect_get_known_peeling_cost
    (loop_vinfo, elem->npeel, &dummy,
     &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
     &prologue_cost_vec, &epilogue_cost_vec);

  /* Prologue and epilogue costs are added to the target model later.
     These costs depend only on the scalar iteration cost, the
     number of peeling iterations finally chosen, and the number of
     misaligned statements.  So discard the information found here.  */
  prologue_cost_vec.release ();
  epilogue_cost_vec.release ();

  if (inside_cost < min->inside_cost
      || (inside_cost == min->inside_cost
	  && outside_cost < min->outside_cost))
    {
      min->inside_cost = inside_cost;
      min->outside_cost = outside_cost;
      min->peel_info.dr = elem->dr;
      min->peel_info.npeel = elem->npeel;
      min->peel_info.count = elem->count;
    }

  return 1;
}
/* Choose best peeling option by traversing peeling hash table and either
   choosing an option with the lowest cost (if cost model is enabled) or the
   option that aligns as many accesses as possible.  */

static struct _vect_peel_extended_info
vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
				       loop_vec_info loop_vinfo)
{
  struct _vect_peel_extended_info res;

  res.peel_info.dr = NULL;

  if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
    {
      res.inside_cost = INT_MAX;
      res.outside_cost = INT_MAX;
      peeling_htab->traverse <_vect_peel_extended_info *,
			      vect_peeling_hash_get_lowest_cost> (&res);
    }
  else
    {
      res.peel_info.count = 0;
      peeling_htab->traverse <_vect_peel_extended_info *,
			      vect_peeling_hash_get_most_frequent> (&res);
      res.inside_cost = 0;
      res.outside_cost = 0;
    }

  return res;
}
/* Return true if the new peeling NPEEL is supported.  */

static bool
vect_peeling_supportable (loop_vec_info loop_vinfo, struct data_reference *dr0,
			  unsigned npeel)
{
  unsigned i;
  struct data_reference *dr = NULL;
  vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
  gimple *stmt;
  stmt_vec_info stmt_info;
  enum dr_alignment_support supportable_dr_alignment;

  /* Ensure that all data refs can be vectorized after the peel.  */
  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      int save_misalignment;

      if (dr == dr0)
	continue;

      stmt = DR_STMT (dr);
      stmt_info = vinfo_for_stmt (stmt);
      /* For interleaving, only the alignment of the first access
	 matters.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
	  && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
	continue;

      /* Strided accesses perform only component accesses, alignment is
	 irrelevant for them.  */
      if (STMT_VINFO_STRIDED_P (stmt_info)
	  && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
	continue;

      save_misalignment = DR_MISALIGNMENT (dr);
      vect_update_misalignment_for_peel (dr, dr0, npeel);
      supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
      SET_DR_MISALIGNMENT (dr, save_misalignment);

      if (!supportable_dr_alignment)
	return false;
    }

  return true;
}
/* Function vect_enhance_data_refs_alignment

   This pass will use loop versioning and loop peeling in order to enhance
   the alignment of data references in the loop.

   FOR NOW: we assume that whatever versioning/peeling takes place, only the
   original loop is to be vectorized.  Any other loops that are created by
   the transformations performed in this pass - are not supposed to be
   vectorized.  This restriction will be relaxed.

   This pass will require a cost model to guide it whether to apply peeling
   or versioning or a combination of the two.  For example, the scheme that
   intel uses when given a loop with several memory accesses, is as follows:
   choose one memory access ('p') which alignment you want to force by doing
   peeling.  Then, either (1) generate a loop in which 'p' is aligned and all
   other accesses are not necessarily aligned, or (2) use loop versioning to
   generate one loop in which all accesses are aligned, and another loop in
   which only 'p' is necessarily aligned.

   ("Automatic Intra-Register Vectorization for the Intel Architecture",
   Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
   Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)

   Devising a cost model is the most critical aspect of this work.  It will
   guide us on which access to peel for, whether to use loop versioning, how
   many versions to create, etc.  The cost model will probably consist of
   generic considerations as well as target specific considerations (on
   powerpc for example, misaligned stores are more painful than misaligned
   loads).

   Here are the general steps involved in alignment enhancements:

     -- original loop, before alignment analysis:
	for (i=0; i<N; i++){
	  x = q[i];			# DR_MISALIGNMENT(q) = unknown
	  p[i] = y;			# DR_MISALIGNMENT(p) = unknown
	}

     -- After vect_compute_data_refs_alignment:
	for (i=0; i<N; i++){
	  x = q[i];			# DR_MISALIGNMENT(q) = 3
	  p[i] = y;			# DR_MISALIGNMENT(p) = unknown
	}

     -- Possibility 1: we do loop versioning:
     if (p is aligned) {
	for (i=0; i<N; i++){	# loop 1A
	  x = q[i];			# DR_MISALIGNMENT(q) = 3
	  p[i] = y;			# DR_MISALIGNMENT(p) = 0
	}
     }
     else {
	for (i=0; i<N; i++){	# loop 1B
	  x = q[i];			# DR_MISALIGNMENT(q) = 3
	  p[i] = y;			# DR_MISALIGNMENT(p) = unaligned
	}
     }

     -- Possibility 2: we do loop peeling:
     for (i = 0; i < 3; i++){	# (scalar loop, not to be vectorized).
	x = q[i];
	p[i] = y;
     }
     for (i = 3; i < N; i++){	# loop 2A
	x = q[i];			# DR_MISALIGNMENT(q) = 0
	p[i] = y;			# DR_MISALIGNMENT(p) = unknown
     }

     -- Possibility 3: combination of loop peeling and versioning:
     for (i = 0; i < 3; i++){	# (scalar loop, not to be vectorized).
	x = q[i];
	p[i] = y;
     }
     if (p is aligned) {
	for (i = 3; i<N; i++){	# loop 3A
	  x = q[i];			# DR_MISALIGNMENT(q) = 0
	  p[i] = y;			# DR_MISALIGNMENT(p) = 0
	}
     }
     else {
	for (i = 3; i<N; i++){	# loop 3B
	  x = q[i];			# DR_MISALIGNMENT(q) = 0
	  p[i] = y;			# DR_MISALIGNMENT(p) = unaligned
	}
     }

  These loops are later passed to loop_transform to be vectorized.  The
  vectorizer will use the alignment information to guide the transformation
  (whether to generate regular loads/stores, or with special handling for
  misalignment).  */
1596 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo
)
1598 vec
<data_reference_p
> datarefs
= LOOP_VINFO_DATAREFS (loop_vinfo
);
1599 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1600 enum dr_alignment_support supportable_dr_alignment
;
1601 struct data_reference
*dr0
= NULL
, *first_store
= NULL
;
1602 struct data_reference
*dr
;
1604 bool do_peeling
= false;
1605 bool do_versioning
= false;
1608 stmt_vec_info stmt_info
;
1609 unsigned int npeel
= 0;
1610 bool one_misalignment_known
= false;
1611 bool one_misalignment_unknown
= false;
1612 bool one_dr_unsupportable
= false;
1613 struct data_reference
*unsupportable_dr
= NULL
;
1614 unsigned int vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
1615 unsigned possible_npeel_number
= 1;
1617 unsigned int nelements
, mis
, same_align_drs_max
= 0;
1618 hash_table
<peel_info_hasher
> peeling_htab (1);
1620 if (dump_enabled_p ())
1621 dump_printf_loc (MSG_NOTE
, vect_location
,
1622 "=== vect_enhance_data_refs_alignment ===\n");
1624 /* Reset data so we can safely be called multiple times. */
1625 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).truncate (0);
1626 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
) = 0;
1628 /* While cost model enhancements are expected in the future, the high level
1629 view of the code at this time is as follows:
1631 A) If there is a misaligned access then see if peeling to align
1632 this access can make all data references satisfy
1633 vect_supportable_dr_alignment. If so, update data structures
1634 as needed and return true.
1636 B) If peeling wasn't possible and there is a data reference with an
1637 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1638 then see if loop versioning checks can be used to make all data
1639 references satisfy vect_supportable_dr_alignment. If so, update
1640 data structures as needed and return true.
1642 C) If neither peeling nor versioning were successful then return false if
1643 any data reference does not satisfy vect_supportable_dr_alignment.
1645 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1647 Note, Possibility 3 above (which is peeling and versioning together) is not
1648 being done at this time. */
1650 /* (1) Peeling to force alignment. */
1652 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1654 + How many accesses will become aligned due to the peeling
1655 - How many accesses will become unaligned due to the peeling,
1656 and the cost of misaligned accesses.
1657 - The cost of peeling (the extra runtime checks, the increase
1660 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
1662 stmt
= DR_STMT (dr
);
1663 stmt_info
= vinfo_for_stmt (stmt
);
1665 if (!STMT_VINFO_RELEVANT_P (stmt_info
))
1668 /* For interleaving, only the alignment of the first access
1670 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1671 && GROUP_FIRST_ELEMENT (stmt_info
) != stmt
)
1674 /* For invariant accesses there is nothing to enhance. */
1675 if (integer_zerop (DR_STEP (dr
)))
1678 /* Strided accesses perform only component accesses, alignment is
1679 irrelevant for them. */
1680 if (STMT_VINFO_STRIDED_P (stmt_info
)
1681 && !STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1684 supportable_dr_alignment
= vect_supportable_dr_alignment (dr
, true);
1685 do_peeling
= vector_alignment_reachable_p (dr
);
1688 if (known_alignment_for_access_p (dr
))
1690 unsigned int npeel_tmp
= 0;
1691 bool negative
= tree_int_cst_compare (DR_STEP (dr
),
1692 size_zero_node
) < 0;
1694 vectype
= STMT_VINFO_VECTYPE (stmt_info
);
1695 nelements
= TYPE_VECTOR_SUBPARTS (vectype
);
1696 unsigned int target_align
= DR_TARGET_ALIGNMENT (dr
);
1697 unsigned int dr_size
= vect_get_scalar_dr_size (dr
);
1698 mis
= (negative
? DR_MISALIGNMENT (dr
) : -DR_MISALIGNMENT (dr
));
1699 if (DR_MISALIGNMENT (dr
) != 0)
1700 npeel_tmp
= (mis
& (target_align
- 1)) / dr_size
;
1702 /* For multiple types, it is possible that the bigger type access
1703 will have more than one peeling option. E.g., a loop with two
1704 types: one of size (vector size / 4), and the other one of
1705 size (vector size / 8). Vectorization factor will 8. If both
1706 accesses are misaligned by 3, the first one needs one scalar
1707 iteration to be aligned, and the second one needs 5. But the
1708 first one will be aligned also by peeling 5 scalar
1709 iterations, and in that case both accesses will be aligned.
1710 Hence, except for the immediate peeling amount, we also want
1711 to try to add full vector size, while we don't exceed
1712 vectorization factor.
1713 We do this automatically for cost model, since we calculate
1714 cost for every peeling option. */
1715 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo
)))
1717 if (STMT_SLP_TYPE (stmt_info
))
1718 possible_npeel_number
1719 = (vf
* GROUP_SIZE (stmt_info
)) / nelements
;
1721 possible_npeel_number
= vf
/ nelements
;
1723 /* NPEEL_TMP is 0 when there is no misalignment, but also
1724 allow peeling NELEMENTS. */
1725 if (DR_MISALIGNMENT (dr
) == 0)
1726 possible_npeel_number
++;
1729 /* Save info about DR in the hash table. Also include peeling
1730 amounts according to the explanation above. */
1731 for (j
= 0; j
< possible_npeel_number
; j
++)
1733 vect_peeling_hash_insert (&peeling_htab
, loop_vinfo
,
1735 npeel_tmp
+= target_align
/ dr_size
;
1738 one_misalignment_known
= true;
1742 /* If we don't know any misalignment values, we prefer
1743 peeling for data-ref that has the maximum number of data-refs
1744 with the same alignment, unless the target prefers to align
1745 stores over load. */
1746 unsigned same_align_drs
1747 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info
).length ();
1749 || same_align_drs_max
< same_align_drs
)
1751 same_align_drs_max
= same_align_drs
;
1754 /* For data-refs with the same number of related
1755 accesses prefer the one where the misalign
1756 computation will be invariant in the outermost loop. */
1757 else if (same_align_drs_max
== same_align_drs
)
1759 struct loop
*ivloop0
, *ivloop
;
1760 ivloop0
= outermost_invariant_loop_for_expr
1761 (loop
, DR_BASE_ADDRESS (dr0
));
1762 ivloop
= outermost_invariant_loop_for_expr
1763 (loop
, DR_BASE_ADDRESS (dr
));
1764 if ((ivloop
&& !ivloop0
)
1765 || (ivloop
&& ivloop0
1766 && flow_loop_nested_p (ivloop
, ivloop0
)))
1770 one_misalignment_unknown
= true;
1772 /* Check for data refs with unsupportable alignment that
1774 if (!supportable_dr_alignment
)
1776 one_dr_unsupportable
= true;
1777 unsupportable_dr
= dr
;
1780 if (!first_store
&& DR_IS_WRITE (dr
))
1786 if (!aligned_access_p (dr
))
1788 if (dump_enabled_p ())
1789 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1790 "vector alignment may not be reachable\n");
1796 /* Check if we can possibly peel the loop. */
1797 if (!vect_can_advance_ivs_p (loop_vinfo
)
1798 || !slpeel_can_duplicate_loop_p (loop
, single_exit (loop
))
1802 struct _vect_peel_extended_info peel_for_known_alignment
;
1803 struct _vect_peel_extended_info peel_for_unknown_alignment
;
1804 struct _vect_peel_extended_info best_peel
;
1806 peel_for_unknown_alignment
.inside_cost
= INT_MAX
;
1807 peel_for_unknown_alignment
.outside_cost
= INT_MAX
;
1808 peel_for_unknown_alignment
.peel_info
.count
= 0;
1811 && one_misalignment_unknown
)
1813 /* Check if the target requires to prefer stores over loads, i.e., if
1814 misaligned stores are more expensive than misaligned loads (taking
1815 drs with same alignment into account). */
1816 unsigned int load_inside_cost
= 0;
1817 unsigned int load_outside_cost
= 0;
1818 unsigned int store_inside_cost
= 0;
1819 unsigned int store_outside_cost
= 0;
1821 stmt_vector_for_cost dummy
;
1823 vect_get_peeling_costs_all_drs (datarefs
, dr0
,
1826 &dummy
, vf
/ 2, true);
1832 vect_get_peeling_costs_all_drs (datarefs
, first_store
,
1834 &store_outside_cost
,
1835 &dummy
, vf
/ 2, true);
1840 store_inside_cost
= INT_MAX
;
1841 store_outside_cost
= INT_MAX
;
1844 if (load_inside_cost
> store_inside_cost
1845 || (load_inside_cost
== store_inside_cost
1846 && load_outside_cost
> store_outside_cost
))
1849 peel_for_unknown_alignment
.inside_cost
= store_inside_cost
;
1850 peel_for_unknown_alignment
.outside_cost
= store_outside_cost
;
1854 peel_for_unknown_alignment
.inside_cost
= load_inside_cost
;
1855 peel_for_unknown_alignment
.outside_cost
= load_outside_cost
;
1858 stmt_vector_for_cost prologue_cost_vec
, epilogue_cost_vec
;
1859 prologue_cost_vec
.create (2);
1860 epilogue_cost_vec
.create (2);
1863 peel_for_unknown_alignment
.outside_cost
+= vect_get_known_peeling_cost
1864 (loop_vinfo
, vf
/ 2, &dummy2
,
1865 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo
),
1866 &prologue_cost_vec
, &epilogue_cost_vec
);
1868 prologue_cost_vec
.release ();
1869 epilogue_cost_vec
.release ();
1871 peel_for_unknown_alignment
.peel_info
.count
= 1
1872 + STMT_VINFO_SAME_ALIGN_REFS
1873 (vinfo_for_stmt (DR_STMT (dr0
))).length ();
1876 peel_for_unknown_alignment
.peel_info
.npeel
= 0;
1877 peel_for_unknown_alignment
.peel_info
.dr
= dr0
;
1879 best_peel
= peel_for_unknown_alignment
;
  peel_for_known_alignment.inside_cost = INT_MAX;
  peel_for_known_alignment.outside_cost = INT_MAX;
  peel_for_known_alignment.peel_info.count = 0;
  peel_for_known_alignment.peel_info.dr = NULL;
  if (do_peeling
      && one_misalignment_known)
    {
      /* Peeling is possible, but there is no data access that is not supported
	 unless aligned.  So we try to choose the best possible peeling from
	 the hash table.  */
      peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
	(&peeling_htab, loop_vinfo);
    }
  /* Compare costs of peeling for known and unknown alignment.  */
  if (peel_for_known_alignment.peel_info.dr != NULL
      && peel_for_unknown_alignment.inside_cost
	 >= peel_for_known_alignment.inside_cost)
    {
      best_peel = peel_for_known_alignment;

      /* If the best peeling for known alignment has NPEEL == 0, perform no
	 peeling at all except if there is an unsupportable dr that we can
	 align.  */
      if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
	do_peeling = false;
    }
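
  /* Illustrative example with hypothetical costs: if peeling for unknown
     alignment was estimated at inside_cost 12 while the best peeling for
     known alignment costs 10, the comparison above picks the
     known-alignment peeling.  */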
  /* If there is an unsupportable data ref, prefer this over all choices so far
     since we'd have to discard a chosen peeling except when it accidentally
     aligned the unsupportable data ref.  */
  if (one_dr_unsupportable)
    dr0 = unsupportable_dr;
  else if (do_peeling)
    {
      /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
	 TODO: Use nopeel_outside_cost or get rid of it?  */
      unsigned nopeel_inside_cost = 0;
      unsigned nopeel_outside_cost = 0;

      stmt_vector_for_cost dummy;
      dummy.create (2);
      vect_get_peeling_costs_all_drs (datarefs, NULL, &nopeel_inside_cost,
				      &nopeel_outside_cost, &dummy, 0, false);
      dummy.release ();

      /* Add epilogue costs.  As we do not peel for alignment here, no prologue
	 costs will be recorded.  */
      stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
      prologue_cost_vec.create (2);
      epilogue_cost_vec.create (2);

      int dummy2;
      nopeel_outside_cost += vect_get_known_peeling_cost
	(loop_vinfo, 0, &dummy2,
	 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
	 &prologue_cost_vec, &epilogue_cost_vec);

      prologue_cost_vec.release ();
      epilogue_cost_vec.release ();

      npeel = best_peel.peel_info.npeel;
      dr0 = best_peel.peel_info.dr;

      /* If the cost without peeling is no higher than that of the best
	 peeling found so far, don't peel at all.  */
      if (nopeel_inside_cost <= best_peel.inside_cost)
	do_peeling = false;
    }

  if (do_peeling)
    {
      stmt = DR_STMT (dr0);
      stmt_info = vinfo_for_stmt (stmt);
      vectype = STMT_VINFO_VECTYPE (stmt_info);

      if (known_alignment_for_access_p (dr0))
	{
	  bool negative = tree_int_cst_compare (DR_STEP (dr0),
						size_zero_node) < 0;
	  if (!npeel)
	    {
	      /* Since it's known at compile time, compute the number of
		 iterations in the peeled loop (the peeling factor) for use in
		 updating DR_MISALIGNMENT values.  The peeling factor is the
		 vectorization factor minus the misalignment as an element
		 count.  */
	      mis = negative ? DR_MISALIGNMENT (dr0) : -DR_MISALIGNMENT (dr0);
	      unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
	      npeel = ((mis & (target_align - 1))
		       / vect_get_scalar_dr_size (dr0));
	    }
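
	  /* Illustrative example (not from the original source): with a
	     16-byte target alignment, 4-byte elements and DR_MISALIGNMENT 8
	     on a positive-step access, mis is -8 and
	     npeel = (-8 & 15) / 4 = 2, i.e. two scalar iterations are
	     peeled to reach an aligned address.  */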
	  /* For interleaved data access every iteration accesses all the
	     members of the group, therefore we divide the number of iterations
	     by the group size.  */
	  stmt_info = vinfo_for_stmt (DR_STMT (dr0));
	  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
	    npeel /= GROUP_SIZE (stmt_info);

	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Try peeling by %d\n", npeel);
	}
      /* Ensure that all datarefs can be vectorized after the peel.  */
      if (!vect_peeling_supportable (loop_vinfo, dr0, npeel))
	do_peeling = false;

      /* Check if all datarefs are supportable and log.  */
      if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
	{
	  stat = vect_verify_datarefs_alignment (loop_vinfo);
	  if (!stat)
	    do_peeling = false;
	  else
	    return stat;
	}
      /* Cost model #1 - honor --param vect-max-peeling-for-alignment.  */
      if (do_peeling)
	{
	  unsigned max_allowed_peel
	    = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
	  if (max_allowed_peel != (unsigned)-1)
	    {
	      unsigned max_peel = npeel;
	      if (max_peel == 0)
		{
		  unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
		  max_peel = target_align / vect_get_scalar_dr_size (dr0) - 1;
		}
	      if (max_peel > max_allowed_peel)
		{
		  do_peeling = false;
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_NOTE, vect_location,
				     "Disable peeling, max peels reached: %d\n",
				     max_peel);
		}
	    }
	}

      /* Cost model #2 - if peeling may result in a remaining loop not
	 iterating enough to be vectorized then do not peel.  */
      if (do_peeling
	  && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
	{
	  unsigned max_peel
	    = npeel == 0 ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1 : npeel;
	  if (LOOP_VINFO_INT_NITERS (loop_vinfo)
	      < LOOP_VINFO_VECT_FACTOR (loop_vinfo) + max_peel)
	    do_peeling = false;
	}
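
      /* Illustrative example: with VF 4, npeel 0 and 6 known iterations,
	 max_peel is 3 and 6 < 4 + 3 holds, so peeling is disabled above.  */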

      if (do_peeling)
	{
	  /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
	     If the misalignment of DR_i is identical to that of dr0 then set
	     DR_MISALIGNMENT (DR_i) to zero.  If the misalignment of DR_i and
	     dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
	     by the peeling factor times the element size of DR_i (MOD the
	     vectorization factor times the size).  Otherwise, the
	     misalignment of DR_i must be set to unknown.  */
	  FOR_EACH_VEC_ELT (datarefs, i, dr)
	    if (dr != dr0)
	      {
		/* Strided accesses perform only component accesses, alignment
		   is irrelevant for them.  */
		stmt_info = vinfo_for_stmt (DR_STMT (dr));
		if (STMT_VINFO_STRIDED_P (stmt_info)
		    && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
		  continue;

		vect_update_misalignment_for_peel (dr, dr0, npeel);
	      }

	  LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
	  if (npeel)
	    LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
	  else
	    LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
	      = DR_MISALIGNMENT (dr0);
	  SET_DR_MISALIGNMENT (dr0, 0);
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_NOTE, vect_location,
			       "Alignment of access forced using peeling.\n");
	      dump_printf_loc (MSG_NOTE, vect_location,
			       "Peeling for alignment will be applied.\n");
	    }

	  /* The inside-loop cost will be accounted for in vectorizable_load
	     and vectorizable_store correctly with adjusted alignments.
	     Drop the body_cst_vec on the floor here.  */
	  stat = vect_verify_datarefs_alignment (loop_vinfo);
	  gcc_assert (stat);
	  return stat;
	}
    }

  /* (2) Versioning to force alignment.  */

  /* Try versioning if:
     1) optimize loop for speed
     2) there is at least one unsupported misaligned data ref with an unknown
        misalignment, and
     3) all misaligned data refs with a known misalignment are supported, and
     4) the number of runtime alignment checks is within reason.  */

  do_versioning =
	optimize_loop_nest_for_speed_p (loop)
	&& (!loop->inner); /* FORNOW */

  if (do_versioning)
    {
      FOR_EACH_VEC_ELT (datarefs, i, dr)
	{
	  stmt = DR_STMT (dr);
	  stmt_info = vinfo_for_stmt (stmt);

	  /* For interleaving, only the alignment of the first access
	     matters.  */
	  if (aligned_access_p (dr)
	      || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
		  && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
	    continue;

	  if (STMT_VINFO_STRIDED_P (stmt_info))
	    {
	      /* Strided loads perform only component accesses, alignment is
		 irrelevant for them.  */
	      if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
		continue;
	      do_versioning = false;
	      break;
	    }

	  supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);

	  if (!supportable_dr_alignment)
	    {
	      if (known_alignment_for_access_p (dr)
		  || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
		     >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
		{
		  do_versioning = false;
		  break;
		}

	      stmt = DR_STMT (dr);
	      vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
	      gcc_assert (vectype);

	      /* The rightmost bits of an aligned address must be zeros.
		 Construct the mask needed for this test.  For example,
		 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
		 mask must be 15 = 0xf.  */
	      mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;

	      /* FORNOW: use the same mask to test all potentially unaligned
		 references in the loop.  The vectorizer currently supports
		 a single vector size, see the reference to
		 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
		 vectorization factor is computed.  */
	      gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
			  || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
	      LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
	      LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
		      DR_STMT (dr));
	    }
	}

      /* Versioning requires at least one misaligned data reference.  */
      if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
	do_versioning = false;
      else if (!do_versioning)
	LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
    }

  if (do_versioning)
    {
      vec<gimple *> may_misalign_stmts
	= LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);

      /* It can now be assumed that the data references in the statements
	 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
	 of the loop being vectorized.  */
      FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
	{
	  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
	  dr = STMT_VINFO_DATA_REF (stmt_info);
	  SET_DR_MISALIGNMENT (dr, 0);
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Alignment of access forced using versioning.\n");
	}

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Versioning for alignment will be applied.\n");

      /* Peeling and versioning can't be done together at this time.  */
      gcc_assert (! (do_peeling && do_versioning));

      stat = vect_verify_datarefs_alignment (loop_vinfo);
      gcc_assert (stat);
      return stat;
    }

  /* This point is reached if neither peeling nor versioning is being done.  */
  gcc_assert (! (do_peeling || do_versioning));

  stat = vect_verify_datarefs_alignment (loop_vinfo);
  return stat;
}

/* Function vect_find_same_alignment_drs.

   Update group and alignment relations according to the chosen
   vectorization factor.  */

static void
vect_find_same_alignment_drs (struct data_dependence_relation *ddr)
{
  struct data_reference *dra = DDR_A (ddr);
  struct data_reference *drb = DDR_B (ddr);
  stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
  stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));

  if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
    return;

  if (dra == drb)
    return;

  if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
      || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
      || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
    return;

  /* Two references with distance zero have the same alignment.  */
  offset_int diff = (wi::to_offset (DR_INIT (dra))
		     - wi::to_offset (DR_INIT (drb)));
  if (diff != 0)
    {
      /* Get the wider of the two alignments.  */
      unsigned int align_a = (vect_calculate_target_alignment (dra)
			      / BITS_PER_UNIT);
      unsigned int align_b = (vect_calculate_target_alignment (drb)
			      / BITS_PER_UNIT);
      unsigned int max_align = MAX (align_a, align_b);

      /* Require the gap to be a multiple of the larger vector alignment.  */
      if (!wi::multiple_of_p (diff, max_align, SIGNED))
	return;
    }
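
  /* Illustrative example for the check above: DR_INITs 32 bytes apart are
     still classified as same-alignment when the larger target alignment is
     16 bytes, since 32 is a multiple of 16; a 24-byte gap would not be.  */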
  STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
  STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location,
		       "accesses have the same alignment: ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
      dump_printf (MSG_NOTE, " and ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
      dump_printf (MSG_NOTE, "\n");
    }
}

/* Function vect_analyze_data_refs_alignment

   Analyze the alignment of the data-references in the loop.
   Return FALSE if a data reference is found that cannot be vectorized.  */

bool
vect_analyze_data_refs_alignment (loop_vec_info vinfo)
{
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "=== vect_analyze_data_refs_alignment ===\n");

  /* Mark groups of data references with same alignment using
     data dependence information.  */
  vec<ddr_p> ddrs = vinfo->ddrs;
  struct data_dependence_relation *ddr;
  unsigned int i;

  FOR_EACH_VEC_ELT (ddrs, i, ddr)
    vect_find_same_alignment_drs (ddr);

  vec<data_reference_p> datarefs = vinfo->datarefs;
  struct data_reference *dr;

  vect_record_base_alignments (vinfo);
  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
      if (STMT_VINFO_VECTORIZABLE (stmt_info)
	  && !vect_compute_data_ref_alignment (dr))
	{
	  /* Strided accesses perform only component accesses, misalignment
	     information is irrelevant for them.  */
	  if (STMT_VINFO_STRIDED_P (stmt_info)
	      && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
	    continue;

	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "not vectorized: can't calculate alignment "
			     "for data ref.\n");

	  return false;
	}
    }

  return true;
}

/* Analyze alignment of DRs of stmts in NODE.  */

static bool
vect_slp_analyze_and_verify_node_alignment (slp_tree node)
{
  /* We vectorize from the first scalar stmt in the node unless
     the node is permuted in which case we start from the first
     element in the group.  */
  gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
  data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
  if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
    first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));

  data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
  if (! vect_compute_data_ref_alignment (dr)
      /* For creating the data-ref pointer we need alignment of the
	 first element anyway.  */
      || (dr != first_dr
	  && ! vect_compute_data_ref_alignment (first_dr))
      || ! verify_data_ref_alignment (dr))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "not vectorized: bad data alignment in basic "
			 "block.\n");
      return false;
    }

  return true;
}

/* Function vect_slp_analyze_instance_alignment

   Analyze the alignment of the data-references in the SLP instance.
   Return FALSE if a data reference is found that cannot be vectorized.  */

bool
vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
{
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "=== vect_slp_analyze_and_verify_instance_alignment ===\n");

  slp_tree node;
  unsigned i;
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
    if (! vect_slp_analyze_and_verify_node_alignment (node))
      return false;

  node = SLP_INSTANCE_TREE (instance);
  if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]))
      && ! vect_slp_analyze_and_verify_node_alignment
	     (SLP_INSTANCE_TREE (instance)))
    return false;

  return true;
}

2367 /* Analyze groups of accesses: check that DR belongs to a group of
2368 accesses of legal size, step, etc. Detect gaps, single element
2369 interleaving, and other special cases. Set grouped access info.
2370 Collect groups of strided stores for further use in SLP analysis.
2371 Worker for vect_analyze_group_access. */
2374 vect_analyze_group_access_1 (struct data_reference
*dr
)
2376 tree step
= DR_STEP (dr
);
2377 tree scalar_type
= TREE_TYPE (DR_REF (dr
));
2378 HOST_WIDE_INT type_size
= TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type
));
2379 gimple
*stmt
= DR_STMT (dr
);
2380 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
2381 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
2382 bb_vec_info bb_vinfo
= STMT_VINFO_BB_VINFO (stmt_info
);
2383 HOST_WIDE_INT dr_step
= -1;
2384 HOST_WIDE_INT groupsize
, last_accessed_element
= 1;
2385 bool slp_impossible
= false;
2387 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2388 size of the interleaving group (including gaps). */
2389 if (tree_fits_shwi_p (step
))
2391 dr_step
= tree_to_shwi (step
);
2392 /* Check that STEP is a multiple of type size. Otherwise there is
2393 a non-element-sized gap at the end of the group which we
2394 cannot represent in GROUP_GAP or GROUP_SIZE.
2395 ??? As we can handle non-constant step fine here we should
2396 simply remove uses of GROUP_GAP between the last and first
2397 element and instead rely on DR_STEP. GROUP_SIZE then would
2398 simply not include that gap. */
2399 if ((dr_step
% type_size
) != 0)
2401 if (dump_enabled_p ())
2403 dump_printf_loc (MSG_NOTE
, vect_location
,
2405 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, step
);
2406 dump_printf (MSG_NOTE
,
2407 " is not a multiple of the element size for ");
2408 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dr
));
2409 dump_printf (MSG_NOTE
, "\n");
      groupsize = absu_hwi (dr_step) / type_size;
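      /* Illustrative example: a 32-byte step over 4-byte elements gives
	 groupsize 8, i.e. each iteration spans a slot of eight elements
	 (including any gaps).  */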
  /* A non-consecutive access is possible only as part of interleaving.  */
  if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    {
      /* Check whether this DR is part of an interleaving group of which it
	 is the single element accessed in the loop.  */
2424 /* Gaps are supported only for loads. STEP must be a multiple of the type
2425 size. The size of the group must be a power of 2. */
2427 && (dr_step
% type_size
) == 0
2429 && pow2p_hwi (groupsize
))
2431 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) = stmt
;
2432 GROUP_SIZE (vinfo_for_stmt (stmt
)) = groupsize
;
2433 GROUP_GAP (stmt_info
) = groupsize
- 1;
2434 if (dump_enabled_p ())
2436 dump_printf_loc (MSG_NOTE
, vect_location
,
2437 "Detected single element interleaving ");
2438 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dr
));
2439 dump_printf (MSG_NOTE
, " step ");
2440 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, step
);
2441 dump_printf (MSG_NOTE
, "\n");
2447 if (dump_enabled_p ())
2449 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2450 "not consecutive access ");
2451 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
2456 /* Mark the statement as unvectorizable. */
2457 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr
))) = false;
2461 dump_printf_loc (MSG_NOTE
, vect_location
, "using strided accesses\n");
2462 STMT_VINFO_STRIDED_P (stmt_info
) = true;
2466 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) == stmt
)
2468 /* First stmt in the interleaving chain. Check the chain. */
2469 gimple
*next
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt
));
2470 struct data_reference
*data_ref
= dr
;
2471 unsigned int count
= 1;
2472 tree prev_init
= DR_INIT (data_ref
);
2473 gimple
*prev
= stmt
;
2474 HOST_WIDE_INT diff
, gaps
= 0;
	  /* Skip same data-refs.  If two or more stmts share a data-ref
	     (supported only for loads), we vectorize only the first stmt,
	     and the rest get their vectorized loads from the first
	     one.  */
2482 if (!tree_int_cst_compare (DR_INIT (data_ref
),
2483 DR_INIT (STMT_VINFO_DATA_REF (
2484 vinfo_for_stmt (next
)))))
2486 if (DR_IS_WRITE (data_ref
))
2488 if (dump_enabled_p ())
2489 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2490 "Two store stmts share the same dr.\n");
2494 if (dump_enabled_p ())
2495 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2496 "Two or more load stmts share the same dr.\n");
2498 /* For load use the same data-ref load. */
2499 GROUP_SAME_DR_STMT (vinfo_for_stmt (next
)) = prev
;
2502 next
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (next
));
2507 data_ref
= STMT_VINFO_DATA_REF (vinfo_for_stmt (next
));
2509 /* All group members have the same STEP by construction. */
2510 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref
), step
, 0));
2512 /* Check that the distance between two accesses is equal to the type
2513 size. Otherwise, we have gaps. */
2514 diff
= (TREE_INT_CST_LOW (DR_INIT (data_ref
))
2515 - TREE_INT_CST_LOW (prev_init
)) / type_size
;
2518 /* FORNOW: SLP of accesses with gaps is not supported. */
2519 slp_impossible
= true;
2520 if (DR_IS_WRITE (data_ref
))
2522 if (dump_enabled_p ())
2523 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2524 "interleaved store with gaps\n");
2531 last_accessed_element
+= diff
;
2533 /* Store the gap from the previous member of the group. If there is no
2534 gap in the access, GROUP_GAP is always 1. */
2535 GROUP_GAP (vinfo_for_stmt (next
)) = diff
;
2537 prev_init
= DR_INIT (data_ref
);
2538 next
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (next
));
2539 /* Count the number of data-refs in the chain. */
2544 groupsize
= count
+ gaps
;
2546 /* This could be UINT_MAX but as we are generating code in a very
2547 inefficient way we have to cap earlier. See PR78699 for example. */
2548 if (groupsize
> 4096)
2550 if (dump_enabled_p ())
2551 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2552 "group is too large\n");
2556 /* Check that the size of the interleaving is equal to count for stores,
2557 i.e., that there are no gaps. */
2558 if (groupsize
!= count
2559 && !DR_IS_READ (dr
))
2561 if (dump_enabled_p ())
2562 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2563 "interleaved store with gaps\n");
      /* If there is a gap after the last load in the group it is the
	 difference between the groupsize and the last accessed
	 element.
	 When there is no gap, this difference should be 0.  */
      GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;

      GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
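
      /* Illustrative example: with groupsize 4 and loads of only elements 0
	 and 2, last_accessed_element ends up as 3 and GROUP_GAP records a
	 trailing gap of one element.  */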
2574 if (dump_enabled_p ())
2576 dump_printf_loc (MSG_NOTE
, vect_location
,
2577 "Detected interleaving ");
2578 if (DR_IS_READ (dr
))
2579 dump_printf (MSG_NOTE
, "load ");
2581 dump_printf (MSG_NOTE
, "store ");
2582 dump_printf (MSG_NOTE
, "of size %u starting with ",
2583 (unsigned)groupsize
);
2584 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, stmt
, 0);
2585 if (GROUP_GAP (vinfo_for_stmt (stmt
)) != 0)
2586 dump_printf_loc (MSG_NOTE
, vect_location
,
2587 "There is a gap of %u elements after the group\n",
2588 GROUP_GAP (vinfo_for_stmt (stmt
)));
  /* SLP: create an SLP data structure for every interleaving group of
     stores for further analysis in vect_analyze_slp.  */
2593 if (DR_IS_WRITE (dr
) && !slp_impossible
)
2596 LOOP_VINFO_GROUPED_STORES (loop_vinfo
).safe_push (stmt
);
2598 BB_VINFO_GROUPED_STORES (bb_vinfo
).safe_push (stmt
);
2605 /* Analyze groups of accesses: check that DR belongs to a group of
2606 accesses of legal size, step, etc. Detect gaps, single element
2607 interleaving, and other special cases. Set grouped access info.
2608 Collect groups of strided stores for further use in SLP analysis. */
2611 vect_analyze_group_access (struct data_reference
*dr
)
2613 if (!vect_analyze_group_access_1 (dr
))
2615 /* Dissolve the group if present. */
2617 gimple
*stmt
= GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr
)));
2620 stmt_vec_info vinfo
= vinfo_for_stmt (stmt
);
2621 next
= GROUP_NEXT_ELEMENT (vinfo
);
2622 GROUP_FIRST_ELEMENT (vinfo
) = NULL
;
2623 GROUP_NEXT_ELEMENT (vinfo
) = NULL
;
2631 /* Analyze the access pattern of the data-reference DR.
2632 In case of non-consecutive accesses call vect_analyze_group_access() to
2633 analyze groups of accesses. */
2636 vect_analyze_data_ref_access (struct data_reference
*dr
)
2638 tree step
= DR_STEP (dr
);
2639 tree scalar_type
= TREE_TYPE (DR_REF (dr
));
2640 gimple
*stmt
= DR_STMT (dr
);
2641 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
2642 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
2643 struct loop
*loop
= NULL
;
2646 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
2648 if (loop_vinfo
&& !step
)
2650 if (dump_enabled_p ())
2651 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2652 "bad data-ref access in loop\n");
2656 /* Allow loads with zero step in inner-loop vectorization. */
2657 if (loop_vinfo
&& integer_zerop (step
))
2659 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) = NULL
;
2660 if (!nested_in_vect_loop_p (loop
, stmt
))
2661 return DR_IS_READ (dr
);
2662 /* Allow references with zero step for outer loops marked
2663 with pragma omp simd only - it guarantees absence of
2664 loop-carried dependencies between inner loop iterations. */
2665 if (!loop
->force_vectorize
)
2667 if (dump_enabled_p ())
2668 dump_printf_loc (MSG_NOTE
, vect_location
,
2669 "zero step in inner loop of nest\n");
2674 if (loop
&& nested_in_vect_loop_p (loop
, stmt
))
2676 /* Interleaved accesses are not yet supported within outer-loop
2677 vectorization for references in the inner-loop. */
2678 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) = NULL
;
2680 /* For the rest of the analysis we use the outer-loop step. */
2681 step
= STMT_VINFO_DR_STEP (stmt_info
);
2682 if (integer_zerop (step
))
2684 if (dump_enabled_p ())
2685 dump_printf_loc (MSG_NOTE
, vect_location
,
2686 "zero step in outer loop.\n");
2687 return DR_IS_READ (dr
);
2692 if (TREE_CODE (step
) == INTEGER_CST
)
2694 HOST_WIDE_INT dr_step
= TREE_INT_CST_LOW (step
);
2695 if (!tree_int_cst_compare (step
, TYPE_SIZE_UNIT (scalar_type
))
2697 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type
), -dr_step
)))
2699 /* Mark that it is not interleaving. */
2700 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) = NULL
;
2705 if (loop
&& nested_in_vect_loop_p (loop
, stmt
))
2707 if (dump_enabled_p ())
2708 dump_printf_loc (MSG_NOTE
, vect_location
,
2709 "grouped access in outer loop.\n");
2714 /* Assume this is a DR handled by non-constant strided load case. */
2715 if (TREE_CODE (step
) != INTEGER_CST
)
2716 return (STMT_VINFO_STRIDED_P (stmt_info
)
2717 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info
)
2718 || vect_analyze_group_access (dr
)));
2720 /* Not consecutive access - check if it's a part of interleaving group. */
2721 return vect_analyze_group_access (dr
);
2724 /* Compare two data-references DRA and DRB to group them into chunks
2725 suitable for grouping. */
2728 dr_group_sort_cmp (const void *dra_
, const void *drb_
)
2730 data_reference_p dra
= *(data_reference_p
*)const_cast<void *>(dra_
);
2731 data_reference_p drb
= *(data_reference_p
*)const_cast<void *>(drb_
);
2734 /* Stabilize sort. */
2738 /* DRs in different loops never belong to the same group. */
2739 loop_p loopa
= gimple_bb (DR_STMT (dra
))->loop_father
;
2740 loop_p loopb
= gimple_bb (DR_STMT (drb
))->loop_father
;
2742 return loopa
->num
< loopb
->num
? -1 : 1;
2744 /* Ordering of DRs according to base. */
2745 cmp
= data_ref_compare_tree (DR_BASE_ADDRESS (dra
),
2746 DR_BASE_ADDRESS (drb
));
2750 /* And according to DR_OFFSET. */
2751 cmp
= data_ref_compare_tree (DR_OFFSET (dra
), DR_OFFSET (drb
));
2755 /* Put reads before writes. */
2756 if (DR_IS_READ (dra
) != DR_IS_READ (drb
))
2757 return DR_IS_READ (dra
) ? -1 : 1;
2759 /* Then sort after access size. */
2760 cmp
= data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra
))),
2761 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb
))));
2765 /* And after step. */
2766 cmp
= data_ref_compare_tree (DR_STEP (dra
), DR_STEP (drb
));
2770 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2771 cmp
= data_ref_compare_tree (DR_INIT (dra
), DR_INIT (drb
));
2773 return gimple_uid (DR_STMT (dra
)) < gimple_uid (DR_STMT (drb
)) ? -1 : 1;
2777 /* Function vect_analyze_data_ref_accesses.
2779 Analyze the access pattern of all the data references in the loop.
2781 FORNOW: the only access pattern that is considered vectorizable is a
2782 simple step 1 (consecutive) access.
2784 FORNOW: handle only arrays and pointer accesses. */
2787 vect_analyze_data_ref_accesses (vec_info
*vinfo
)
2790 vec
<data_reference_p
> datarefs
= vinfo
->datarefs
;
2791 struct data_reference
*dr
;
2793 if (dump_enabled_p ())
2794 dump_printf_loc (MSG_NOTE
, vect_location
,
2795 "=== vect_analyze_data_ref_accesses ===\n");
2797 if (datarefs
.is_empty ())
2800 /* Sort the array of datarefs to make building the interleaving chains
2801 linear. Don't modify the original vector's order, it is needed for
2802 determining what dependencies are reversed. */
2803 vec
<data_reference_p
> datarefs_copy
= datarefs
.copy ();
2804 datarefs_copy
.qsort (dr_group_sort_cmp
);
2806 /* Build the interleaving chains. */
2807 for (i
= 0; i
< datarefs_copy
.length () - 1;)
2809 data_reference_p dra
= datarefs_copy
[i
];
2810 stmt_vec_info stmtinfo_a
= vinfo_for_stmt (DR_STMT (dra
));
2811 stmt_vec_info lastinfo
= NULL
;
2812 if (! STMT_VINFO_VECTORIZABLE (stmtinfo_a
))
2817 for (i
= i
+ 1; i
< datarefs_copy
.length (); ++i
)
2819 data_reference_p drb
= datarefs_copy
[i
];
2820 stmt_vec_info stmtinfo_b
= vinfo_for_stmt (DR_STMT (drb
));
2821 if (! STMT_VINFO_VECTORIZABLE (stmtinfo_b
))
	  /* ??? Imperfect sorting (non-compatible types, non-modulo
	     accesses, same accesses) can lead to a group being artificially
	     split here as we don't just skip over those.  If it really
	     matters we can push those to a worklist and re-iterate
	     over them.  Then we can just skip ahead to the next DR here.  */
2830 /* DRs in a different loop should not be put into the same
2831 interleaving group. */
2832 if (gimple_bb (DR_STMT (dra
))->loop_father
2833 != gimple_bb (DR_STMT (drb
))->loop_father
)
2836 /* Check that the data-refs have same first location (except init)
2837 and they are both either store or load (not load and store,
2838 not masked loads or stores). */
2839 if (DR_IS_READ (dra
) != DR_IS_READ (drb
)
2840 || data_ref_compare_tree (DR_BASE_ADDRESS (dra
),
2841 DR_BASE_ADDRESS (drb
)) != 0
2842 || data_ref_compare_tree (DR_OFFSET (dra
), DR_OFFSET (drb
)) != 0
2843 || !gimple_assign_single_p (DR_STMT (dra
))
2844 || !gimple_assign_single_p (DR_STMT (drb
)))
2847 /* Check that the data-refs have the same constant size. */
2848 tree sza
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra
)));
2849 tree szb
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb
)));
2850 if (!tree_fits_uhwi_p (sza
)
2851 || !tree_fits_uhwi_p (szb
)
2852 || !tree_int_cst_equal (sza
, szb
))
2855 /* Check that the data-refs have the same step. */
2856 if (data_ref_compare_tree (DR_STEP (dra
), DR_STEP (drb
)) != 0)
2859 /* Check the types are compatible.
2860 ??? We don't distinguish this during sorting. */
2861 if (!types_compatible_p (TREE_TYPE (DR_REF (dra
)),
2862 TREE_TYPE (DR_REF (drb
))))
2865 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2866 HOST_WIDE_INT init_a
= TREE_INT_CST_LOW (DR_INIT (dra
));
2867 HOST_WIDE_INT init_b
= TREE_INT_CST_LOW (DR_INIT (drb
));
2868 HOST_WIDE_INT init_prev
2869 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy
[i
-1]));
2870 gcc_assert (init_a
<= init_b
2871 && init_a
<= init_prev
2872 && init_prev
<= init_b
);
2874 /* Do not place the same access in the interleaving chain twice. */
2875 if (init_b
== init_prev
)
2877 gcc_assert (gimple_uid (DR_STMT (datarefs_copy
[i
-1]))
2878 < gimple_uid (DR_STMT (drb
)));
2879 /* ??? For now we simply "drop" the later reference which is
2880 otherwise the same rather than finishing off this group.
2881 In the end we'd want to re-process duplicates forming
2882 multiple groups from the refs, likely by just collecting
2883 all candidates (including duplicates and split points
2884 below) in a vector and then process them together. */
2888 /* If init_b == init_a + the size of the type * k, we have an
2889 interleaving, and DRA is accessed before DRB. */
2890 HOST_WIDE_INT type_size_a
= tree_to_uhwi (sza
);
2891 if (type_size_a
== 0
2892 || (init_b
- init_a
) % type_size_a
!= 0)
2895 /* If we have a store, the accesses are adjacent. This splits
2896 groups into chunks we support (we don't support vectorization
2897 of stores with gaps). */
2898 if (!DR_IS_READ (dra
) && init_b
- init_prev
!= type_size_a
)
2901 /* If the step (if not zero or non-constant) is greater than the
2902 difference between data-refs' inits this splits groups into
2904 if (tree_fits_shwi_p (DR_STEP (dra
)))
2906 HOST_WIDE_INT step
= tree_to_shwi (DR_STEP (dra
));
2907 if (step
!= 0 && step
<= (init_b
- init_a
))
2911 if (dump_enabled_p ())
2913 dump_printf_loc (MSG_NOTE
, vect_location
,
2914 "Detected interleaving ");
2915 if (DR_IS_READ (dra
))
2916 dump_printf (MSG_NOTE
, "load ");
2918 dump_printf (MSG_NOTE
, "store ");
2919 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dra
));
2920 dump_printf (MSG_NOTE
, " and ");
2921 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (drb
));
2922 dump_printf (MSG_NOTE
, "\n");
2925 /* Link the found element into the group list. */
2926 if (!GROUP_FIRST_ELEMENT (stmtinfo_a
))
2928 GROUP_FIRST_ELEMENT (stmtinfo_a
) = DR_STMT (dra
);
2929 lastinfo
= stmtinfo_a
;
2931 GROUP_FIRST_ELEMENT (stmtinfo_b
) = DR_STMT (dra
);
2932 GROUP_NEXT_ELEMENT (lastinfo
) = DR_STMT (drb
);
2933 lastinfo
= stmtinfo_b
;
2937 FOR_EACH_VEC_ELT (datarefs_copy
, i
, dr
)
2938 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr
)))
2939 && !vect_analyze_data_ref_access (dr
))
2941 if (dump_enabled_p ())
2942 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2943 "not vectorized: complicated access pattern.\n");
2945 if (is_a
<bb_vec_info
> (vinfo
))
2947 /* Mark the statement as not vectorizable. */
2948 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr
))) = false;
2953 datarefs_copy
.release ();
2958 datarefs_copy
.release ();
/* Function vect_vfa_segment_size.

   Create an expression that computes the size of the segment
   that will be accessed for a data reference.  The function takes into
   account that realignment loads may access one more vector.

   Input:
     DR: The data reference.
     LENGTH_FACTOR: segment length to consider.

   Return an expression whose value is the size of segment which will be
   accessed by DR.  */

static tree
vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
{
  tree segment_length;

  if (integer_zerop (DR_STEP (dr)))
    segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
  else
    segment_length = size_binop (MULT_EXPR,
				 fold_convert (sizetype, DR_STEP (dr)),
				 fold_convert (sizetype, length_factor));
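
  /* Illustrative example: for a unit-stride int access, DR_STEP is 4 and
     with LENGTH_FACTOR equal to a vectorization factor of 8 the segment
     computed above is 32 bytes.  */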
  if (vect_supportable_dr_alignment (dr, false)
	== dr_explicit_realign_optimized)
    {
      tree vector_size = TYPE_SIZE_UNIT
	(STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));

      segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
    }
  return segment_length;
}

/* Function vect_no_alias_p.

   Given data references A and B with equal base and offset, whose alias
   relation can be decided at compilation time, return TRUE if they do
   not alias each other; return FALSE otherwise.  SEGMENT_LENGTH_A
   and SEGMENT_LENGTH_B are the memory lengths accessed by A and B
   respectively.  */

static bool
vect_no_alias_p (struct data_reference *a, struct data_reference *b,
		 tree segment_length_a, tree segment_length_b)
{
  gcc_assert (TREE_CODE (DR_INIT (a)) == INTEGER_CST
	      && TREE_CODE (DR_INIT (b)) == INTEGER_CST);
  if (tree_int_cst_equal (DR_INIT (a), DR_INIT (b)))
    return false;

  tree seg_a_min = DR_INIT (a);
  tree seg_a_max = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_a_min),
				seg_a_min, segment_length_a);
  /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
     bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
     [a, a+12).  */
  if (tree_int_cst_compare (DR_STEP (a), size_zero_node) < 0)
    {
      tree unit_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (a)));
      seg_a_min = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_a_max),
			       seg_a_max, unit_size);
      seg_a_max = fold_build2 (PLUS_EXPR, TREE_TYPE (DR_INIT (a)),
			       DR_INIT (a), unit_size);
    }
  tree seg_b_min = DR_INIT (b);
  tree seg_b_max = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_b_min),
				seg_b_min, segment_length_b);
  if (tree_int_cst_compare (DR_STEP (b), size_zero_node) < 0)
    {
      tree unit_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (b)));
      seg_b_min = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_b_max),
			       seg_b_max, unit_size);
      seg_b_max = fold_build2 (PLUS_EXPR, TREE_TYPE (DR_INIT (b)),
			       DR_INIT (b), unit_size);
    }
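
  /* Illustrative example: with DR_INITs 0 and 48 and 32-byte segments on
     positive steps, the ranges are [0, 32) and [48, 80); seg_a_max <=
     seg_b_min holds, so the test below proves independence and the runtime
     alias check for this pair can be dropped.  */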
  if (tree_int_cst_le (seg_a_max, seg_b_min)
      || tree_int_cst_le (seg_b_max, seg_a_min))
    return true;

  return false;
}

3048 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3052 dependence_distance_ge_vf (data_dependence_relation
*ddr
,
3053 unsigned int loop_depth
, unsigned HOST_WIDE_INT vf
)
3055 if (DDR_ARE_DEPENDENT (ddr
) != NULL_TREE
3056 || DDR_NUM_DIST_VECTS (ddr
) == 0)
3059 /* If the dependence is exact, we should have limited the VF instead. */
3060 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr
));
3063 lambda_vector dist_v
;
3064 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr
), i
, dist_v
)
3066 HOST_WIDE_INT dist
= dist_v
[loop_depth
];
3068 && !(dist
> 0 && DDR_REVERSED_P (ddr
))
3069 && (unsigned HOST_WIDE_INT
) abs_hwi (dist
) < vf
)
3073 if (dump_enabled_p ())
3075 dump_printf_loc (MSG_NOTE
, vect_location
,
3076 "dependence distance between ");
3077 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (DDR_A (ddr
)));
3078 dump_printf (MSG_NOTE
, " and ");
3079 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (DDR_B (ddr
)));
3080 dump_printf (MSG_NOTE
, " is >= VF\n");
3086 /* Function vect_prune_runtime_alias_test_list.
3088 Prune a list of ddrs to be tested at run-time by versioning for alias.
3089 Merge several alias checks into one if possible.
   Return FALSE if the resulting list of ddrs is longer than allowed by
   PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE.  */
3094 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo
)
3096 typedef pair_hash
<tree_operand_hash
, tree_operand_hash
> tree_pair_hash
;
3097 hash_set
<tree_pair_hash
> compared_objects
;
3099 vec
<ddr_p
> may_alias_ddrs
= LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo
);
3100 vec
<dr_with_seg_len_pair_t
> &comp_alias_ddrs
3101 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo
);
3102 vec
<vec_object_pair
> &check_unequal_addrs
3103 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo
);
3104 int vect_factor
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
3105 tree scalar_loop_iters
= LOOP_VINFO_NITERS (loop_vinfo
);
3111 if (dump_enabled_p ())
3112 dump_printf_loc (MSG_NOTE
, vect_location
,
3113 "=== vect_prune_runtime_alias_test_list ===\n");
3115 if (may_alias_ddrs
.is_empty ())
3118 comp_alias_ddrs
.create (may_alias_ddrs
.length ());
3120 unsigned int loop_depth
3121 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo
)->num
,
3122 LOOP_VINFO_LOOP_NEST (loop_vinfo
));
3124 /* First, we collect all data ref pairs for aliasing checks. */
3125 FOR_EACH_VEC_ELT (may_alias_ddrs
, i
, ddr
)
3128 struct data_reference
*dr_a
, *dr_b
;
3129 gimple
*dr_group_first_a
, *dr_group_first_b
;
3130 tree segment_length_a
, segment_length_b
;
3131 gimple
*stmt_a
, *stmt_b
;
3133 /* Ignore the alias if the VF we chose ended up being no greater
3134 than the dependence distance. */
3135 if (dependence_distance_ge_vf (ddr
, loop_depth
, vect_factor
))
3138 if (DDR_OBJECT_A (ddr
))
3140 vec_object_pair
new_pair (DDR_OBJECT_A (ddr
), DDR_OBJECT_B (ddr
));
3141 if (!compared_objects
.add (new_pair
))
3143 if (dump_enabled_p ())
3145 dump_printf_loc (MSG_NOTE
, vect_location
, "checking that ");
3146 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, new_pair
.first
);
3147 dump_printf (MSG_NOTE
, " and ");
3148 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, new_pair
.second
);
3149 dump_printf (MSG_NOTE
, " have different addresses\n");
3151 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo
).safe_push (new_pair
);
3157 stmt_a
= DR_STMT (DDR_A (ddr
));
3158 dr_group_first_a
= GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a
));
3159 if (dr_group_first_a
)
3161 stmt_a
= dr_group_first_a
;
3162 dr_a
= STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a
));
3166 stmt_b
= DR_STMT (DDR_B (ddr
));
3167 dr_group_first_b
= GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b
));
3168 if (dr_group_first_b
)
3170 stmt_b
= dr_group_first_b
;
3171 dr_b
= STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b
));
3174 if (!operand_equal_p (DR_STEP (dr_a
), DR_STEP (dr_b
), 0))
3175 length_factor
= scalar_loop_iters
;
3177 length_factor
= size_int (vect_factor
);
3178 segment_length_a
= vect_vfa_segment_size (dr_a
, length_factor
);
3179 segment_length_b
= vect_vfa_segment_size (dr_b
, length_factor
);
3181 comp_res
= data_ref_compare_tree (DR_BASE_ADDRESS (dr_a
),
3182 DR_BASE_ADDRESS (dr_b
));
3184 comp_res
= data_ref_compare_tree (DR_OFFSET (dr_a
),
3187 /* Alias is known at compilation time. */
3189 && TREE_CODE (DR_STEP (dr_a
)) == INTEGER_CST
3190 && TREE_CODE (DR_STEP (dr_b
)) == INTEGER_CST
3191 && TREE_CODE (segment_length_a
) == INTEGER_CST
3192 && TREE_CODE (segment_length_b
) == INTEGER_CST
)
3194 if (vect_no_alias_p (dr_a
, dr_b
, segment_length_a
, segment_length_b
))
3197 if (dump_enabled_p ())
3198 dump_printf_loc (MSG_NOTE
, vect_location
,
3199 "not vectorized: compilation time alias.\n");
3204 dr_with_seg_len_pair_t dr_with_seg_len_pair
3205 (dr_with_seg_len (dr_a
, segment_length_a
),
3206 dr_with_seg_len (dr_b
, segment_length_b
));
3208 /* Canonicalize pairs by sorting the two DR members. */
3210 std::swap (dr_with_seg_len_pair
.first
, dr_with_seg_len_pair
.second
);
3212 comp_alias_ddrs
.safe_push (dr_with_seg_len_pair
);
3215 prune_runtime_alias_test_list (&comp_alias_ddrs
,
3216 (unsigned HOST_WIDE_INT
) vect_factor
);
3218 unsigned int count
= (comp_alias_ddrs
.length ()
3219 + check_unequal_addrs
.length ());
3220 dump_printf_loc (MSG_NOTE
, vect_location
,
3221 "improved number of alias checks from %d to %d\n",
3222 may_alias_ddrs
.length (), count
);
3223 if ((int) count
> PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS
))
3225 if (dump_enabled_p ())
3226 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3227 "number of versioning for alias "
3228 "run-time tests exceeds %d "
3229 "(--param vect-max-version-for-alias-checks)\n",
3230 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS
));
3237 /* Return true if a non-affine read or write in STMT is suitable for a
3238 gather load or scatter store. Describe the operation in *INFO if so. */
3241 vect_check_gather_scatter (gimple
*stmt
, loop_vec_info loop_vinfo
,
3242 gather_scatter_info
*info
)
3244 HOST_WIDE_INT scale
= 1;
3245 poly_int64 pbitpos
, pbitsize
;
3246 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
3247 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
3248 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
3249 tree offtype
= NULL_TREE
;
3250 tree decl
, base
, off
;
3252 int punsignedp
, reversep
, pvolatilep
= 0;
3255 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3256 see if we can use the def stmt of the address. */
3257 if (is_gimple_call (stmt
)
3258 && gimple_call_internal_p (stmt
)
3259 && (gimple_call_internal_fn (stmt
) == IFN_MASK_LOAD
3260 || gimple_call_internal_fn (stmt
) == IFN_MASK_STORE
)
3261 && TREE_CODE (base
) == MEM_REF
3262 && TREE_CODE (TREE_OPERAND (base
, 0)) == SSA_NAME
3263 && integer_zerop (TREE_OPERAND (base
, 1))
3264 && !expr_invariant_in_loop_p (loop
, TREE_OPERAND (base
, 0)))
3266 gimple
*def_stmt
= SSA_NAME_DEF_STMT (TREE_OPERAND (base
, 0));
3267 if (is_gimple_assign (def_stmt
)
3268 && gimple_assign_rhs_code (def_stmt
) == ADDR_EXPR
)
3269 base
= TREE_OPERAND (gimple_assign_rhs1 (def_stmt
), 0);
3272 /* The gather and scatter builtins need address of the form
3273 loop_invariant + vector * {1, 2, 4, 8}
3275 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3276 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3277 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3278 multiplications and additions in it. To get a vector, we need
3279 a single SSA_NAME that will be defined in the loop and will
3280 contain everything that is not loop invariant and that can be
     vectorized.  The following code attempts to find such a preexisting
3282 SSA_NAME OFF and put the loop invariants into a tree BASE
3283 that can be gimplified before the loop. */
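
  /* Illustrative (hypothetical) example: for a load from p->a[i] with
     loop-invariant p, the decomposition below typically leaves BASE holding
     the loop-invariant address of p->a plus any constant offset, OFF as the
     SSA_NAME carrying i, and the element size as the separately recorded
     SCALE factor.  */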
3284 base
= get_inner_reference (base
, &pbitsize
, &pbitpos
, &off
, &pmode
,
3285 &punsignedp
, &reversep
, &pvolatilep
);
3286 gcc_assert (base
&& !reversep
);
3287 poly_int64 pbytepos
= exact_div (pbitpos
, BITS_PER_UNIT
);
3289 if (TREE_CODE (base
) == MEM_REF
)
3291 if (!integer_zerop (TREE_OPERAND (base
, 1)))
3293 if (off
== NULL_TREE
)
3294 off
= wide_int_to_tree (sizetype
, mem_ref_offset (base
));
3296 off
= size_binop (PLUS_EXPR
, off
,
3297 fold_convert (sizetype
, TREE_OPERAND (base
, 1)));
3299 base
= TREE_OPERAND (base
, 0);
3302 base
= build_fold_addr_expr (base
);
3304 if (off
== NULL_TREE
)
3305 off
= size_zero_node
;
  /* If BASE is not loop invariant, then either OFF is 0, in which case we
     start with just the constant offset in the loop-invariant BASE and
     continue with base as OFF, or we give up.
     We could handle that case by gimplifying the addition of base + off
     into some SSA_NAME and using that as off, but for now punt.  */
3312 if (!expr_invariant_in_loop_p (loop
, base
))
3314 if (!integer_zerop (off
))
3317 base
= size_int (pbytepos
);
3319 /* Otherwise put base + constant offset into the loop invariant BASE
3320 and continue with OFF. */
3323 base
= fold_convert (sizetype
, base
);
3324 base
= size_binop (PLUS_EXPR
, base
, size_int (pbytepos
));
3327 /* OFF at this point may be either a SSA_NAME or some tree expression
3328 from get_inner_reference. Try to peel off loop invariants from it
3329 into BASE as long as possible. */
3331 while (offtype
== NULL_TREE
)
3333 enum tree_code code
;
3334 tree op0
, op1
, add
= NULL_TREE
;
3336 if (TREE_CODE (off
) == SSA_NAME
)
3338 gimple
*def_stmt
= SSA_NAME_DEF_STMT (off
);
3340 if (expr_invariant_in_loop_p (loop
, off
))
3343 if (gimple_code (def_stmt
) != GIMPLE_ASSIGN
)
3346 op0
= gimple_assign_rhs1 (def_stmt
);
3347 code
= gimple_assign_rhs_code (def_stmt
);
3348 op1
= gimple_assign_rhs2 (def_stmt
);
3352 if (get_gimple_rhs_class (TREE_CODE (off
)) == GIMPLE_TERNARY_RHS
)
3354 code
= TREE_CODE (off
);
3355 extract_ops_from_tree (off
, &code
, &op0
, &op1
);
3359 case POINTER_PLUS_EXPR
:
3361 if (expr_invariant_in_loop_p (loop
, op0
))
3366 add
= fold_convert (sizetype
, add
);
3368 add
= size_binop (MULT_EXPR
, add
, size_int (scale
));
3369 base
= size_binop (PLUS_EXPR
, base
, add
);
3372 if (expr_invariant_in_loop_p (loop
, op1
))
3380 if (expr_invariant_in_loop_p (loop
, op1
))
3382 add
= fold_convert (sizetype
, op1
);
3383 add
= size_binop (MINUS_EXPR
, size_zero_node
, add
);
3389 if (scale
== 1 && tree_fits_shwi_p (op1
))
3391 scale
= tree_to_shwi (op1
);
3400 if (!POINTER_TYPE_P (TREE_TYPE (op0
))
3401 && !INTEGRAL_TYPE_P (TREE_TYPE (op0
)))
3403 if (TYPE_PRECISION (TREE_TYPE (op0
))
3404 == TYPE_PRECISION (TREE_TYPE (off
)))
3409 if (TYPE_PRECISION (TREE_TYPE (op0
))
3410 < TYPE_PRECISION (TREE_TYPE (off
)))
3413 offtype
= TREE_TYPE (off
);
3424 /* If at the end OFF still isn't a SSA_NAME or isn't
3425 defined in the loop, punt. */
3426 if (TREE_CODE (off
) != SSA_NAME
3427 || expr_invariant_in_loop_p (loop
, off
))
3430 if (offtype
== NULL_TREE
)
3431 offtype
= TREE_TYPE (off
);
3433 if (DR_IS_READ (dr
))
3434 decl
= targetm
.vectorize
.builtin_gather (STMT_VINFO_VECTYPE (stmt_info
),
3437 decl
= targetm
.vectorize
.builtin_scatter (STMT_VINFO_VECTYPE (stmt_info
),
3440 if (decl
== NULL_TREE
)
3446 info
->offset_dt
= vect_unknown_def_type
;
3447 info
->offset_vectype
= NULL_TREE
;
3448 info
->scale
= scale
;
3452 /* Function vect_analyze_data_refs.
3454 Find all the data references in the loop or basic block.
3456 The general structure of the analysis of data refs in the vectorizer is as
3458 1- vect_analyze_data_refs(loop/bb): call
3459 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3460 in the loop/bb and their dependences.
3461 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3462 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3463 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3468 vect_analyze_data_refs (vec_info
*vinfo
, int *min_vf
)
3470 struct loop
*loop
= NULL
;
3472 struct data_reference
*dr
;
3475 if (dump_enabled_p ())
3476 dump_printf_loc (MSG_NOTE
, vect_location
,
3477 "=== vect_analyze_data_refs ===\n");
3479 if (loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
))
3480 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
3482 /* Go through the data-refs, check that the analysis succeeded. Update
3483 pointer from stmt_vec_info struct to DR and vectype. */
3485 vec
<data_reference_p
> datarefs
= vinfo
->datarefs
;
3486 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
3489 stmt_vec_info stmt_info
;
3490 tree base
, offset
, init
;
3491 enum { SG_NONE
, GATHER
, SCATTER
} gatherscatter
= SG_NONE
;
3492 bool simd_lane_access
= false;
3496 if (!dr
|| !DR_REF (dr
))
3498 if (dump_enabled_p ())
3499 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3500 "not vectorized: unhandled data-ref\n");
3504 stmt
= DR_STMT (dr
);
3505 stmt_info
= vinfo_for_stmt (stmt
);
3507 /* Discard clobbers from the dataref vector. We will remove
3508 clobber stmts during vectorization. */
3509 if (gimple_clobber_p (stmt
))
3512 if (i
== datarefs
.length () - 1)
3517 datarefs
.ordered_remove (i
);
3522 /* Check that analysis of the data-ref succeeded. */
3523 if (!DR_BASE_ADDRESS (dr
) || !DR_OFFSET (dr
) || !DR_INIT (dr
)
3528 && !TREE_THIS_VOLATILE (DR_REF (dr
))
3529 && targetm
.vectorize
.builtin_gather
!= NULL
;
3532 && !TREE_THIS_VOLATILE (DR_REF (dr
))
3533 && targetm
.vectorize
.builtin_scatter
!= NULL
;
3534 bool maybe_simd_lane_access
3535 = is_a
<loop_vec_info
> (vinfo
) && loop
->simduid
;
3537 /* If target supports vector gather loads or scatter stores, or if
3538 this might be a SIMD lane access, see if they can't be used. */
3539 if (is_a
<loop_vec_info
> (vinfo
)
3540 && (maybe_gather
|| maybe_scatter
|| maybe_simd_lane_access
)
3541 && !nested_in_vect_loop_p (loop
, stmt
))
3543 struct data_reference
*newdr
3544 = create_data_ref (NULL
, loop_containing_stmt (stmt
),
3545 DR_REF (dr
), stmt
, !maybe_scatter
,
3546 DR_IS_CONDITIONAL_IN_STMT (dr
));
3547 gcc_assert (newdr
!= NULL
&& DR_REF (newdr
));
3548 if (DR_BASE_ADDRESS (newdr
)
3549 && DR_OFFSET (newdr
)
3552 && integer_zerop (DR_STEP (newdr
)))
3554 if (maybe_simd_lane_access
)
3556 tree off
= DR_OFFSET (newdr
);
3558 if (TREE_CODE (DR_INIT (newdr
)) == INTEGER_CST
3559 && TREE_CODE (off
) == MULT_EXPR
3560 && tree_fits_uhwi_p (TREE_OPERAND (off
, 1)))
3562 tree step
= TREE_OPERAND (off
, 1);
3563 off
= TREE_OPERAND (off
, 0);
3565 if (CONVERT_EXPR_P (off
)
3566 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off
,
3568 < TYPE_PRECISION (TREE_TYPE (off
)))
3569 off
= TREE_OPERAND (off
, 0);
3570 if (TREE_CODE (off
) == SSA_NAME
)
3572 gimple
*def
= SSA_NAME_DEF_STMT (off
);
3573 tree reft
= TREE_TYPE (DR_REF (newdr
));
3574 if (is_gimple_call (def
)
3575 && gimple_call_internal_p (def
)
3576 && (gimple_call_internal_fn (def
)
3577 == IFN_GOMP_SIMD_LANE
))
3579 tree arg
= gimple_call_arg (def
, 0);
3580 gcc_assert (TREE_CODE (arg
) == SSA_NAME
);
3581 arg
= SSA_NAME_VAR (arg
);
3582 if (arg
== loop
->simduid
3584 && tree_int_cst_equal
3585 (TYPE_SIZE_UNIT (reft
),
3588 DR_OFFSET (newdr
) = ssize_int (0);
3589 DR_STEP (newdr
) = step
;
3590 DR_OFFSET_ALIGNMENT (newdr
)
3591 = BIGGEST_ALIGNMENT
;
3592 DR_STEP_ALIGNMENT (newdr
)
3593 = highest_pow2_factor (step
);
3595 simd_lane_access
= true;
3601 if (!simd_lane_access
&& (maybe_gather
|| maybe_scatter
))
3605 gatherscatter
= GATHER
;
3607 gatherscatter
= SCATTER
;
3610 if (gatherscatter
== SG_NONE
&& !simd_lane_access
)
3611 free_data_ref (newdr
);
3614 if (gatherscatter
== SG_NONE
&& !simd_lane_access
)
3616 if (dump_enabled_p ())
3618 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3619 "not vectorized: data ref analysis "
3621 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3624 if (is_a
<bb_vec_info
> (vinfo
))
3631 if (TREE_CODE (DR_BASE_ADDRESS (dr
)) == INTEGER_CST
)
3633 if (dump_enabled_p ())
3634 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3635 "not vectorized: base addr of dr is a "
3638 if (is_a
<bb_vec_info
> (vinfo
))
3641 if (gatherscatter
!= SG_NONE
|| simd_lane_access
)
3646 if (TREE_THIS_VOLATILE (DR_REF (dr
)))
3648 if (dump_enabled_p ())
3650 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3651 "not vectorized: volatile type ");
3652 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3655 if (is_a
<bb_vec_info
> (vinfo
))
3661 if (stmt_can_throw_internal (stmt
))
3663 if (dump_enabled_p ())
3665 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3666 "not vectorized: statement can throw an "
3668 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3671 if (is_a
<bb_vec_info
> (vinfo
))
3674 if (gatherscatter
!= SG_NONE
|| simd_lane_access
)
3679 if (TREE_CODE (DR_REF (dr
)) == COMPONENT_REF
3680 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr
), 1)))
3682 if (dump_enabled_p ())
3684 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3685 "not vectorized: statement is bitfield "
3687 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3690 if (is_a
<bb_vec_info
> (vinfo
))
3693 if (gatherscatter
!= SG_NONE
|| simd_lane_access
)
3698 base
= unshare_expr (DR_BASE_ADDRESS (dr
));
3699 offset
= unshare_expr (DR_OFFSET (dr
));
3700 init
= unshare_expr (DR_INIT (dr
));
3702 if (is_gimple_call (stmt
)
3703 && (!gimple_call_internal_p (stmt
)
3704 || (gimple_call_internal_fn (stmt
) != IFN_MASK_LOAD
3705 && gimple_call_internal_fn (stmt
) != IFN_MASK_STORE
)))
3707 if (dump_enabled_p ())
3709 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3710 "not vectorized: dr in a call ");
3711 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3714 if (is_a
<bb_vec_info
> (vinfo
))
3717 if (gatherscatter
!= SG_NONE
|| simd_lane_access
)
3722 /* Update DR field in stmt_vec_info struct. */
3724 /* If the dataref is in an inner-loop of the loop that is considered for
     vectorization, we also want to analyze the access relative to
3726 the outer-loop (DR contains information only relative to the
3727 inner-most enclosing loop). We do that by building a reference to the
3728 first location accessed by the inner-loop, and analyze it relative to
3730 if (loop
&& nested_in_vect_loop_p (loop
, stmt
))
3732 /* Build a reference to the first location accessed by the
3733 inner loop: *(BASE + INIT + OFFSET). By construction,
3734 this address must be invariant in the inner loop, so we
3735 can consider it as being used in the outer loop. */
3736 tree init_offset
= fold_build2 (PLUS_EXPR
, TREE_TYPE (offset
),
3738 tree init_addr
= fold_build_pointer_plus (base
, init_offset
);
3739 tree init_ref
= build_fold_indirect_ref (init_addr
);
3741 if (dump_enabled_p ())
3743 dump_printf_loc (MSG_NOTE
, vect_location
,
3744 "analyze in outer loop: ");
3745 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, init_ref
);
3746 dump_printf (MSG_NOTE
, "\n");
3749 if (!dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info
),
3751 /* dr_analyze_innermost already explained the failure. */
3754 if (dump_enabled_p ())
3756 dump_printf_loc (MSG_NOTE
, vect_location
,
3757 "\touter base_address: ");
3758 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3759 STMT_VINFO_DR_BASE_ADDRESS (stmt_info
));
3760 dump_printf (MSG_NOTE
, "\n\touter offset from base address: ");
3761 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3762 STMT_VINFO_DR_OFFSET (stmt_info
));
3763 dump_printf (MSG_NOTE
,
3764 "\n\touter constant offset from base address: ");
3765 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3766 STMT_VINFO_DR_INIT (stmt_info
));
3767 dump_printf (MSG_NOTE
, "\n\touter step: ");
3768 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3769 STMT_VINFO_DR_STEP (stmt_info
));
3770 dump_printf (MSG_NOTE
, "\n\touter base alignment: %d\n",
3771 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info
));
3772 dump_printf (MSG_NOTE
, "\n\touter base misalignment: %d\n",
3773 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info
));
3774 dump_printf (MSG_NOTE
, "\n\touter offset alignment: %d\n",
3775 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info
));
3776 dump_printf (MSG_NOTE
, "\n\touter step alignment: %d\n",
3777 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info
));
3781 if (STMT_VINFO_DATA_REF (stmt_info
))
3783 if (dump_enabled_p ())
3785 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3786 "not vectorized: more than one data ref "
3788 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3791 if (is_a
<bb_vec_info
> (vinfo
))
3794 if (gatherscatter
!= SG_NONE
|| simd_lane_access
)
3799 STMT_VINFO_DATA_REF (stmt_info
) = dr
;
3800 if (simd_lane_access
)
3802 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info
) = true;
3803 free_data_ref (datarefs
[i
]);
3807 if (TREE_CODE (DR_BASE_ADDRESS (dr
)) == ADDR_EXPR
3808 && VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (dr
), 0))
3809 && DECL_NONALIASED (TREE_OPERAND (DR_BASE_ADDRESS (dr
), 0)))
3811 if (dump_enabled_p ())
3813 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3814 "not vectorized: base object not addressable "
3816 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3818 if (is_a
<bb_vec_info
> (vinfo
))
3820 /* In BB vectorization the ref can still participate
3821 in dependence analysis, we just can't vectorize it. */
3822 STMT_VINFO_VECTORIZABLE (stmt_info
) = false;
3828 /* Set vectype for STMT. */
3829 scalar_type
= TREE_TYPE (DR_REF (dr
));
3830 STMT_VINFO_VECTYPE (stmt_info
)
3831 = get_vectype_for_scalar_type (scalar_type
);
3832 if (!STMT_VINFO_VECTYPE (stmt_info
))
3834 if (dump_enabled_p ())
3836 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3837 "not vectorized: no vectype for stmt: ");
3838 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3839 dump_printf (MSG_MISSED_OPTIMIZATION
, " scalar_type: ");
3840 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_DETAILS
,
3842 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
3845 if (is_a
<bb_vec_info
> (vinfo
))
3847 /* No vector type is fine, the ref can still participate
3848 in dependence analysis, we just can't vectorize it. */
3849 STMT_VINFO_VECTORIZABLE (stmt_info
) = false;
3853 if (gatherscatter
!= SG_NONE
|| simd_lane_access
)
3855 STMT_VINFO_DATA_REF (stmt_info
) = NULL
;
3856 if (gatherscatter
!= SG_NONE
)
3863 if (dump_enabled_p ())
3865 dump_printf_loc (MSG_NOTE
, vect_location
,
3866 "got vectype for stmt: ");
3867 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, stmt
, 0);
3868 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3869 STMT_VINFO_VECTYPE (stmt_info
));
3870 dump_printf (MSG_NOTE
, "\n");
3874 /* Adjust the minimal vectorization factor according to the
3876 vf
= TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info
));
3880 if (gatherscatter
!= SG_NONE
)
3882 gather_scatter_info gs_info
;
3883 if (!vect_check_gather_scatter (stmt
, as_a
<loop_vec_info
> (vinfo
),
3885 || !get_vectype_for_scalar_type (TREE_TYPE (gs_info
.offset
)))
3887 STMT_VINFO_DATA_REF (stmt_info
) = NULL
;
3889 if (dump_enabled_p ())
3891 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3892 (gatherscatter
== GATHER
) ?
3893 "not vectorized: not suitable for gather "
3895 "not vectorized: not suitable for scatter "
3897 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3902 free_data_ref (datarefs
[i
]);
3904 STMT_VINFO_GATHER_SCATTER_P (stmt_info
) = gatherscatter
;
3907 else if (is_a
<loop_vec_info
> (vinfo
)
3908 && TREE_CODE (DR_STEP (dr
)) != INTEGER_CST
)
3910 if (nested_in_vect_loop_p (loop
, stmt
))
3912 if (dump_enabled_p ())
3914 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3915 "not vectorized: not suitable for strided "
3917 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3921 STMT_VINFO_STRIDED_P (stmt_info
) = true;
3925 /* If we stopped analysis at the first dataref we could not analyze
3926 when trying to vectorize a basic-block mark the rest of the datarefs
3927 as not vectorizable and truncate the vector of datarefs. That
3928 avoids spending useless time in analyzing their dependence. */
3929 if (i
!= datarefs
.length ())
3931 gcc_assert (is_a
<bb_vec_info
> (vinfo
));
3932 for (unsigned j
= i
; j
< datarefs
.length (); ++j
)
3934 data_reference_p dr
= datarefs
[j
];
3935 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr
))) = false;
3938 datarefs
.truncate (i
);
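
/* Illustration (editor's sketch, not part of the vectorizer itself): the
   VF adjustment above simply raises *MIN_VF to the widest number of vector
   subparts seen so far.  For example, on a target with 128-bit vectors a
   loop such as

     unsigned char *a;  int *b;
     for (i = 0; i < n; i++)
       b[i] = a[i];

   records a v16qi vectype for the load of a[i] (16 subparts) and a v4si
   vectype for the store to b[i] (4 subparts), so after analyzing both
   datarefs *MIN_VF is at least 16.  The exact mode names depend on the
   target; v16qi/v4si are just the usual 128-bit examples.  */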
/* Function vect_get_new_vect_var.

   Returns a name for a new variable.  The current naming scheme appends the
   prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
   the name of vectorizer generated variables, and appends that to NAME if
   provided.  */

tree
vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
{
  const char *prefix;
  tree new_vect_var;

  switch (var_kind)
    {
    case vect_simple_var:
      prefix = "vect";
      break;
    case vect_scalar_var:
      prefix = "stmp";
      break;
    case vect_mask_var:
      prefix = "mask";
      break;
    case vect_pointer_var:
      prefix = "vectp";
      break;
    default:
      gcc_unreachable ();
    }

  if (name)
    {
      char* tmp = concat (prefix, "_", name, NULL);
      new_vect_var = create_tmp_reg (type, tmp);
      free (tmp);
    }
  else
    new_vect_var = create_tmp_reg (type, prefix);

  return new_vect_var;
}
/* Like vect_get_new_vect_var but return an SSA name.  */

tree
vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
{
  const char *prefix;
  tree new_vect_var;

  switch (var_kind)
    {
    case vect_simple_var:
      prefix = "vect";
      break;
    case vect_scalar_var:
      prefix = "stmp";
      break;
    case vect_pointer_var:
      prefix = "vectp";
      break;
    default:
      gcc_unreachable ();
    }

  if (name)
    {
      char* tmp = concat (prefix, "_", name, NULL);
      new_vect_var = make_temp_ssa_name (type, NULL, tmp);
      free (tmp);
    }
  else
    new_vect_var = make_temp_ssa_name (type, NULL, prefix);

  return new_vect_var;
}
/* Duplicate ptr info and set alignment/misalignment on NAME from DR.  */

static void
vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr)
{
  duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
  int misalign = DR_MISALIGNMENT (dr);
  if (misalign == DR_MISALIGNMENT_UNKNOWN)
    mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
  else
    set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
			    DR_TARGET_ALIGNMENT (dr), misalign);
}
/* Function vect_create_addr_base_for_vector_ref.

   Create an expression that computes the address of the first memory location
   that will be accessed for a data reference.

   Input:
   STMT: The statement containing the data reference.
   NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
   OFFSET: Optional.  If supplied, it is added to the initial address.
   LOOP:    Specify relative to which loop-nest should the address be computed.
            For example, when the dataref is in an inner-loop nested in an
	    outer-loop that is now being vectorized, LOOP can be either the
	    outer-loop, or the inner-loop.  The first memory location accessed
	    by the following dataref ('in' points to short):

		for (i = 0; i < N; i++)
		  for (j = 0; j < M; j++)
		    s += in[i+j]

	    is as follows:
	    if LOOP=i_loop:	&in		(relative to i_loop)
	    if LOOP=j_loop:	&in+i*2B	(relative to j_loop)
   BYTE_OFFSET: Optional, defaulted to NULL.  If supplied, it is added to the
	    initial address.  Unlike OFFSET, which is number of elements to
	    be added, BYTE_OFFSET is measured in bytes.

   Output:
   1. Return an SSA_NAME whose value is the address of the memory location of
      the first vector of the data reference.
   2. If new_stmt_list is not NULL_TREE after return then the caller must insert
      these statement(s) which define the returned SSA_NAME.

   FORNOW: We are only handling array accesses with step 1.  */

tree
vect_create_addr_base_for_vector_ref (gimple *stmt,
				      gimple_seq *new_stmt_list,
				      tree offset,
				      tree byte_offset)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
  const char *base_name;
  tree addr_base;
  tree dest;
  gimple_seq seq = NULL;
  tree vect_ptr_type;
  tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  innermost_loop_behavior *drb = vect_dr_behavior (dr);

  tree data_ref_base = unshare_expr (drb->base_address);
  tree base_offset = unshare_expr (drb->offset);
  tree init = unshare_expr (drb->init);

  if (loop_vinfo)
    base_name = get_name (data_ref_base);
  else
    {
      base_offset = ssize_int (0);
      init = ssize_int (0);
      base_name = get_name (DR_REF (dr));
    }

  /* Create base_offset */
  base_offset = size_binop (PLUS_EXPR,
			    fold_convert (sizetype, base_offset),
			    fold_convert (sizetype, init));

  if (offset)
    {
      offset = fold_build2 (MULT_EXPR, sizetype,
			    fold_convert (sizetype, offset), step);
      base_offset = fold_build2 (PLUS_EXPR, sizetype,
				 base_offset, offset);
    }
  if (byte_offset)
    {
      byte_offset = fold_convert (sizetype, byte_offset);
      base_offset = fold_build2 (PLUS_EXPR, sizetype,
				 base_offset, byte_offset);
    }

  /* base + base_offset */
  if (loop_vinfo)
    addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
  else
    addr_base = build1 (ADDR_EXPR,
			build_pointer_type (TREE_TYPE (DR_REF (dr))),
			unshare_expr (DR_REF (dr)));

  vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
  dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
  addr_base = force_gimple_operand (addr_base, &seq, true, dest);
  gimple_seq_add_seq (new_stmt_list, seq);

  if (DR_PTR_INFO (dr)
      && TREE_CODE (addr_base) == SSA_NAME
      && !SSA_NAME_PTR_INFO (addr_base))
    {
      vect_duplicate_ssa_name_ptr_info (addr_base, dr);
      if (offset || byte_offset)
	mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
    }

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "created ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
      dump_printf (MSG_NOTE, "\n");
    }

  return addr_base;
}
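
/* Editor's sketch of the address arithmetic performed above, assuming a
   simple dataref 'a[i]' with 4-byte elements, drb->init == 0 and no other
   loop offset:

     base_offset = init = 0
     OFFSET = 4       =>  addr_base = &a + 4 * 4	(= &a[4]; OFFSET counts
							 elements, scaled by step)
     BYTE_OFFSET = 4  =>  addr_base = &a + 4		(= &a[1]; added verbatim)

   The result is forced into an SSA name, and any statements needed to
   compute it are appended to NEW_STMT_LIST for the caller to insert,
   typically on the loop preheader edge.  */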
/* Function vect_create_data_ref_ptr.

   Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
   location accessed in the loop by STMT, along with the def-use update
   chain to appropriately advance the pointer through the loop iterations.
   Also set aliasing information for the pointer.  This pointer is used by
   the callers to this function to create a memory reference expression for
   vector load/store access.

   Input:
   1. STMT: a stmt that references memory. Expected to be of the form
         GIMPLE_ASSIGN <name, data-ref> or
	 GIMPLE_ASSIGN <data-ref, name>.
   2. AGGR_TYPE: the type of the reference, which should be either a vector
        or an array.
   3. AT_LOOP: the loop where the vector memref is to be created.
   4. OFFSET (optional): an offset to be added to the initial address accessed
        by the data-ref in STMT.
   5. BSI: location where the new stmts are to be placed if there is no loop
   6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
        pointing to the initial address.
   7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
	to the initial address accessed by the data-ref in STMT.  This is
	similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
	in bytes.

   Output:
   1. Declare a new ptr to vector_type, and have it point to the base of the
      data reference (initial address accessed by the data reference).
      For example, for vector of type V8HI, the following code is generated:

      v8hi *ap;
      ap = (v8hi *)initial_address;

      if OFFSET is not supplied:
         initial_address = &a[init];
      if OFFSET is supplied:
         initial_address = &a[init + OFFSET];
      if BYTE_OFFSET is supplied:
	 initial_address = &a[init] + BYTE_OFFSET;

      Return the initial_address in INITIAL_ADDRESS.

   2. If ONLY_INIT is true, just return the initial pointer.  Otherwise, also
      update the pointer in each iteration of the loop.

      Return the increment stmt that updates the pointer in PTR_INCR.

   3. Set INV_P to true if the access pattern of the data reference in the
      vectorized loop is invariant.  Set it to false otherwise.

   4. Return the pointer.  */

tree
vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
			  tree offset, tree *initial_address,
			  gimple_stmt_iterator *gsi, gimple **ptr_incr,
			  bool only_init, bool *inv_p, tree byte_offset)
{
  const char *base_name;
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = NULL;
  bool nested_in_vect_loop = false;
  struct loop *containing_loop = NULL;
  tree aggr_ptr_type;
  tree aggr_ptr;
  tree new_temp;
  gimple_seq new_stmt_list = NULL;
  edge pe = NULL;
  basic_block new_bb;
  tree aggr_ptr_init;
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
  tree aptr;
  gimple_stmt_iterator incr_gsi;
  bool insert_after;
  tree indx_before_incr, indx_after_incr;
  gimple *incr;
  tree step;
  bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);

  gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
	      || TREE_CODE (aggr_type) == VECTOR_TYPE);

  if (loop_vinfo)
    {
      loop = LOOP_VINFO_LOOP (loop_vinfo);
      nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
      containing_loop = (gimple_bb (stmt))->loop_father;
      pe = loop_preheader_edge (loop);
    }
  else
    {
      gcc_assert (bb_vinfo);
      only_init = true;
      *ptr_incr = NULL;
    }

  /* Check the step (evolution) of the load in LOOP, and record
     whether it's invariant.  */
  step = vect_dr_behavior (dr)->step;
  if (integer_zerop (step))
    *inv_p = true;
  else
    *inv_p = false;

  /* Create an expression for the first address accessed by this load
     in LOOP.  */
  base_name = get_name (DR_BASE_ADDRESS (dr));

  if (dump_enabled_p ())
    {
      tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
      dump_printf_loc (MSG_NOTE, vect_location,
		       "create %s-pointer variable to type: ",
		       get_tree_code_name (TREE_CODE (aggr_type)));
      dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
      if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
	dump_printf (MSG_NOTE, "  vectorizing an array ref: ");
      else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
	dump_printf (MSG_NOTE, "  vectorizing a vector ref: ");
      else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
	dump_printf (MSG_NOTE, "  vectorizing a record based array ref: ");
      else
	dump_printf (MSG_NOTE, "  vectorizing a pointer ref: ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
      dump_printf (MSG_NOTE, "\n");
    }

  /* (1) Create the new aggregate-pointer variable.
     Vector and array types inherit the alias set of their component
     type by default so we need to use a ref-all pointer if the data
     reference does not conflict with the created aggregated data
     reference because it is not addressable.  */
  bool need_ref_all = false;
  if (!alias_sets_conflict_p (get_alias_set (aggr_type),
			      get_alias_set (DR_REF (dr))))
    need_ref_all = true;
  /* Likewise for any of the data references in the stmt group.  */
  else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
    {
      gimple *orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
      do
	{
	  stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
	  struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
	  if (!alias_sets_conflict_p (get_alias_set (aggr_type),
				      get_alias_set (DR_REF (sdr))))
	    {
	      need_ref_all = true;
	      break;
	    }
	  orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
	}
      while (orig_stmt);
    }
  aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
					       need_ref_all);
  aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);


  /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
     vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
     def-use update cycles for the pointer: one relative to the outer-loop
     (LOOP), which is what steps (3) and (4) below do.  The other is relative
     to the inner-loop (which is the inner-most loop containing the dataref),
     and this is done by step (5) below.

     When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
     inner-most loop, and so steps (3),(4) work the same, and step (5) is
     redundant.  Steps (3),(4) create the following:

	vp0 = &base_addr;
	LOOP:	vp1 = phi(vp0,vp2)
		...
		vp2 = vp1 + step
		goto LOOP

     If there is an inner-loop nested in loop, then step (5) will also be
     applied, and an additional update in the inner-loop will be created:

	vp0 = &base_addr;
	LOOP:	vp1 = phi(vp0,vp2)
		...
	inner:	   vp3 = phi(vp1,vp4)
		   vp4 = vp3 + inner_step
		   if () goto inner
		...
		vp2 = vp1 + step
		if () goto LOOP   */

  /* (2) Calculate the initial address of the aggregate-pointer, and set
     the aggregate-pointer to point to it before the loop.  */

  /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader.  */

  new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
						   offset, byte_offset);
  if (new_stmt_list)
    {
      if (pe)
	{
	  new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
	  gcc_assert (!new_bb);
	}
      else
	gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
    }

  *initial_address = new_temp;
  aggr_ptr_init = new_temp;

  /* (3) Handle the updating of the aggregate-pointer inside the loop.
     This is needed when ONLY_INIT is false, and also when AT_LOOP is the
     inner-loop nested in LOOP (during outer-loop vectorization).  */

  /* No update in loop is required.  */
  if (only_init && (!loop_vinfo || at_loop == loop))
    aptr = aggr_ptr_init;
  else
    {
      /* The step of the aggregate pointer is the type size.  */
      tree iv_step = TYPE_SIZE_UNIT (aggr_type);
      /* One exception to the above is when the scalar step of the load in
	 LOOP is zero. In this case the step here is also zero.  */
      if (*inv_p)
	iv_step = size_zero_node;
      else if (tree_int_cst_sgn (step) == -1)
	iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);

      standard_iv_increment_position (loop, &incr_gsi, &insert_after);

      create_iv (aggr_ptr_init,
		 fold_convert (aggr_ptr_type, iv_step),
		 aggr_ptr, loop, &incr_gsi, insert_after,
		 &indx_before_incr, &indx_after_incr);
      incr = gsi_stmt (incr_gsi);
      set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));

      /* Copy the points-to information if it exists. */
      if (DR_PTR_INFO (dr))
	{
	  vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
	  vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
	}
      if (ptr_incr)
	*ptr_incr = incr;

      aptr = indx_before_incr;
    }

  if (!nested_in_vect_loop || only_init)
    return aptr;


  /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
     nested in LOOP, if exists.  */

  gcc_assert (nested_in_vect_loop);
  if (!only_init)
    {
      standard_iv_increment_position (containing_loop, &incr_gsi,
				      &insert_after);
      create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
		 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
		 &indx_after_incr);
      incr = gsi_stmt (incr_gsi);
      set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));

      /* Copy the points-to information if it exists. */
      if (DR_PTR_INFO (dr))
	{
	  vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
	  vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
	}
      if (ptr_incr)
	*ptr_incr = incr;

      return indx_before_incr;
    }
  else
    gcc_unreachable ();
}
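
/* Editor's sketch of how a caller typically drives the function above
   (simplified from the load/store vectorization paths; names such as
   DATAREF_PTR and PTR_INCR follow the comments in this file, the rest are
   made up for the example):

     tree dataref_ptr
       = vect_create_data_ref_ptr (stmt, vectype, loop, NULL_TREE,
				   &dummy_init_addr, gsi, &ptr_incr,
				   false, &inv_p);
     for (j = 0; j < ncopies; j++)
       {
	 if (j != 0)
	   dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, gsi, stmt,
					  NULL_TREE);
	 data_ref = build2 (MEM_REF, vectype, dataref_ptr,
			    build_int_cst (ref_type, 0));
	 ... emit the vector load/store using DATA_REF ...
       }

   This only shows the intended pairing of the pointer-creation and
   pointer-bump helpers, not the exact code in tree-vect-stmts.c.  */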
/* Function bump_vector_ptr

   Increment a pointer (to a vector type) by vector-size. If requested,
   i.e. if PTR-INCR is given, then also connect the new increment stmt
   to the existing def-use update-chain of the pointer, by modifying
   the PTR_INCR as illustrated below:

   The pointer def-use update-chain before this function:
                        DATAREF_PTR = phi (p_0, p_2)
                        ....
        PTR_INCR:       p_2 = DATAREF_PTR + step

   The pointer def-use update-chain after this function:
                        DATAREF_PTR = phi (p_0, p_2)
                        ....
                        NEW_DATAREF_PTR = DATAREF_PTR + BUMP
                        ....
        PTR_INCR:       p_2 = NEW_DATAREF_PTR + step

   Input:
   DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
                 in the loop.
   PTR_INCR - optional. The stmt that updates the pointer in each iteration of
	      the loop.  The increment amount across iterations is expected
	      to be vector_size.
   BSI - location where the new update stmt is to be placed.
   STMT - the original scalar memory-access stmt that is being vectorized.
   BUMP - optional. The offset by which to bump the pointer. If not given,
	  the offset is assumed to be vector_size.

   Output: Return NEW_DATAREF_PTR as illustrated above.

*/

tree
bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
		 gimple *stmt, tree bump)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  tree update = TYPE_SIZE_UNIT (vectype);
  gassign *incr_stmt;
  ssa_op_iter iter;
  use_operand_p use_p;
  tree new_dataref_ptr;

  if (bump)
    update = bump;

  if (TREE_CODE (dataref_ptr) == SSA_NAME)
    new_dataref_ptr = copy_ssa_name (dataref_ptr);
  else
    new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
  incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
				   dataref_ptr, update);
  vect_finish_stmt_generation (stmt, incr_stmt, gsi);

  /* Copy the points-to information if it exists. */
  if (DR_PTR_INFO (dr))
    {
      duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
      mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
    }

  if (!ptr_incr)
    return new_dataref_ptr;

  /* Update the vector-pointer's cross-iteration increment.  */
  FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
    {
      tree use = USE_FROM_PTR (use_p);

      if (use == dataref_ptr)
	SET_USE (use_p, new_dataref_ptr);
      else
	gcc_assert (tree_int_cst_compare (use, update) == 0);
    }

  return new_dataref_ptr;
}
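
/* Editor's illustration of the rewiring done above, with concrete (made-up)
   SSA names and a 16-byte vector size:

     before:	vectp.5_2 = PHI <vectp.5_1, vectp.5_3>
		...
		vectp.5_3 = vectp.5_2 + 16;	<-- PTR_INCR

     after:	vectp.5_2 = PHI <vectp.5_1, vectp.5_3>
		...
		vectp.5_7 = vectp.5_2 + 16;	<-- returned NEW_DATAREF_PTR
		...
		vectp.5_3 = vectp.5_7 + 16;	<-- PTR_INCR, use rewritten

   i.e. PTR_INCR now computes its result from the bumped pointer, so each
   mid-iteration bump also shifts where the next iteration starts.  */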
/* Function vect_create_destination_var.

   Create a new temporary of type VECTYPE.  */

tree
vect_create_destination_var (tree scalar_dest, tree vectype)
{
  tree vec_dest;
  const char *name;
  char *new_name;
  tree type;
  enum vect_var_kind kind;

  kind = vectype
    ? VECTOR_BOOLEAN_TYPE_P (vectype)
    ? vect_mask_var
    : vect_simple_var
    : vect_scalar_var;
  type = vectype ? vectype : TREE_TYPE (scalar_dest);

  gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);

  name = get_name (scalar_dest);
  if (name)
    new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
  else
    new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
  vec_dest = vect_get_new_vect_var (type, kind, new_name);
  free (new_name);

  return vec_dest;
}
/* Function vect_grouped_store_supported.

   Returns TRUE if interleave high and interleave low permutations
   are supported, and FALSE otherwise.  */

bool
vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
{
  machine_mode mode = TYPE_MODE (vectype);

  /* vect_permute_store_chain requires the group size to be equal to 3 or
     be a power of two.  */
  if (count != 3 && exact_log2 (count) == -1)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "the size of the group of accesses"
			 " is not a power of 2 or not equal to 3\n");
      return false;
    }

  /* Check that the permutation is supported.  */
  if (VECTOR_MODE_P (mode))
    {
      unsigned int i, nelt = GET_MODE_NUNITS (mode);

      if (count == 3)
	{
	  unsigned int j0 = 0, j1 = 0, j2 = 0;
	  unsigned int i, j;

	  vec_perm_builder sel (nelt, nelt, 1);
	  sel.quick_grow (nelt);
	  vec_perm_indices indices;
	  for (j = 0; j < 3; j++)
	    {
	      int nelt0 = ((3 - j) * nelt) % 3;
	      int nelt1 = ((3 - j) * nelt + 1) % 3;
	      int nelt2 = ((3 - j) * nelt + 2) % 3;
	      for (i = 0; i < nelt; i++)
		{
		  if (3 * i + nelt0 < nelt)
		    sel[3 * i + nelt0] = j0++;
		  if (3 * i + nelt1 < nelt)
		    sel[3 * i + nelt1] = nelt + j1++;
		  if (3 * i + nelt2 < nelt)
		    sel[3 * i + nelt2] = 0;
		}
	      indices.new_vector (sel, 2, nelt);
	      if (!can_vec_perm_const_p (mode, indices))
		{
		  if (dump_enabled_p ())
		    dump_printf (MSG_MISSED_OPTIMIZATION,
				 "permutation op not supported by target.\n");
		  return false;
		}

	      for (i = 0; i < nelt; i++)
		{
		  if (3 * i + nelt0 < nelt)
		    sel[3 * i + nelt0] = 3 * i + nelt0;
		  if (3 * i + nelt1 < nelt)
		    sel[3 * i + nelt1] = 3 * i + nelt1;
		  if (3 * i + nelt2 < nelt)
		    sel[3 * i + nelt2] = nelt + j2++;
		}
	      indices.new_vector (sel, 2, nelt);
	      if (!can_vec_perm_const_p (mode, indices))
		{
		  if (dump_enabled_p ())
		    dump_printf (MSG_MISSED_OPTIMIZATION,
				 "permutation op not supported by target.\n");
		  return false;
		}
	    }
	  return true;
	}
      else
	{
	  /* If length is not equal to 3 then only power of 2 is supported.  */
	  gcc_assert (pow2p_hwi (count));

	  /* The encoding has 2 interleaved stepped patterns.  */
	  vec_perm_builder sel (nelt, 2, 3);
	  sel.quick_grow (6);
	  for (i = 0; i < 3; i++)
	    {
	      sel[i * 2] = i;
	      sel[i * 2 + 1] = i + nelt;
	    }
	  vec_perm_indices indices (sel, 2, nelt);
	  if (can_vec_perm_const_p (mode, indices))
	    {
	      for (i = 0; i < 6; i++)
		sel[i] += nelt / 2;
	      indices.new_vector (sel, 2, nelt);
	      if (can_vec_perm_const_p (mode, indices))
		return true;
	    }
	}
    }

  if (dump_enabled_p ())
    dump_printf (MSG_MISSED_OPTIMIZATION,
		 "permutation op not supported by target.\n");
  return false;
}
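
/* Editor's example: for a V4SI vector (nelt == 4) the two masks checked
   above in the power-of-two case are

     interleave high:  { 0, 4, 1, 5 }
     interleave low:   { 2, 6, 3, 7 }

   so VEC_PERM_EXPR <{a0,a1,a2,a3}, {b0,b1,b2,b3}, mask> yields
   {a0,b0,a1,b1} resp. {a2,b2,a3,b3}, which is exactly the element order an
   interleaved store of two arrays needs.  */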
/* Return TRUE if vec_store_lanes is available for COUNT vectors of
   type VECTYPE.  */

bool
vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
{
  return vect_lanes_optab_supported_p ("vec_store_lanes",
				       vec_store_lanes_optab,
				       vectype, count);
}
/* Function vect_permute_store_chain.

   Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
   a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
   the data correctly for the stores.  Return the final references for stores
   in RESULT_CHAIN.

   E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
   The input is 4 vectors each containing 8 elements.  We assign a number to
   each element, the input sequence is:

   1st vec:   0  1  2  3  4  5  6  7
   2nd vec:   8  9 10 11 12 13 14 15
   3rd vec:  16 17 18 19 20 21 22 23
   4th vec:  24 25 26 27 28 29 30 31

   The output sequence should be:

   1st vec:  0  8 16 24  1  9 17 25
   2nd vec:  2 10 18 26  3 11 19 27
   3rd vec:  4 12 20 28  5 13 21 29
   4th vec:  6 14 22 30  7 15 23 31

   i.e., we interleave the contents of the four vectors in their order.

   We use interleave_high/low instructions to create such output.  The input of
   each interleave_high/low operation is two vectors:
   1st vec    2nd vec
   0 1 2 3    4 5 6 7
   the even elements of the result vector are obtained left-to-right from the
   high/low elements of the first vector.  The odd elements of the result are
   obtained left-to-right from the high/low elements of the second vector.
   The output of interleave_high will be:   0 4 1 5
   and of interleave_low:                   2 6 3 7

   The permutation is done in log LENGTH stages.  In each stage interleave_high
   and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
   where the first argument is taken from the first half of DR_CHAIN and the
   second argument from its second half.  In our example,

   I1: interleave_high (1st vec, 3rd vec)
   I2: interleave_low (1st vec, 3rd vec)
   I3: interleave_high (2nd vec, 4th vec)
   I4: interleave_low (2nd vec, 4th vec)

   The output for the first stage is:

   I1:  0 16  1 17  2 18  3 19
   I2:  4 20  5 21  6 22  7 23
   I3:  8 24  9 25 10 26 11 27
   I4: 12 28 13 29 14 30 15 31

   The output of the second stage, i.e. the final result is:

   I1:  0  8 16 24  1  9 17 25
   I2:  2 10 18 26  3 11 19 27
   I3:  4 12 20 28  5 13 21 29
   I4:  6 14 22 30  7 15 23 31.  */

void
vect_permute_store_chain (vec<tree> dr_chain,
			  unsigned int length,
			  gimple *stmt,
			  gimple_stmt_iterator *gsi,
			  vec<tree> *result_chain)
{
  tree vect1, vect2, high, low;
  gimple *perm_stmt;
  tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
  tree perm_mask_low, perm_mask_high;
  tree data_ref;
  tree perm3_mask_low, perm3_mask_high;
  unsigned int i, n, log_length = exact_log2 (length);
  unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);

  result_chain->quick_grow (length);
  memcpy (result_chain->address (), dr_chain.address (),
	  length * sizeof (tree));

  if (length == 3)
    {
      unsigned int j0 = 0, j1 = 0, j2 = 0;

      vec_perm_builder sel (nelt, nelt, 1);
      sel.quick_grow (nelt);
      vec_perm_indices indices;
      for (j = 0; j < 3; j++)
	{
	  int nelt0 = ((3 - j) * nelt) % 3;
	  int nelt1 = ((3 - j) * nelt + 1) % 3;
	  int nelt2 = ((3 - j) * nelt + 2) % 3;

	  for (i = 0; i < nelt; i++)
	    {
	      if (3 * i + nelt0 < nelt)
		sel[3 * i + nelt0] = j0++;
	      if (3 * i + nelt1 < nelt)
		sel[3 * i + nelt1] = nelt + j1++;
	      if (3 * i + nelt2 < nelt)
		sel[3 * i + nelt2] = 0;
	    }
	  indices.new_vector (sel, 2, nelt);
	  perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);

	  for (i = 0; i < nelt; i++)
	    {
	      if (3 * i + nelt0 < nelt)
		sel[3 * i + nelt0] = 3 * i + nelt0;
	      if (3 * i + nelt1 < nelt)
		sel[3 * i + nelt1] = 3 * i + nelt1;
	      if (3 * i + nelt2 < nelt)
		sel[3 * i + nelt2] = nelt + j2++;
	    }
	  indices.new_vector (sel, 2, nelt);
	  perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);

	  vect1 = dr_chain[0];
	  vect2 = dr_chain[1];

	  /* Create interleaving stmt:
	     low = VEC_PERM_EXPR <vect1, vect2,
				  {j, nelt, *, j + 1, nelt + j + 1, *,
				   j + 2, nelt + j + 2, *, ...}>  */
	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
					   vect2, perm3_mask_low);
	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);

	  vect1 = data_ref;
	  vect2 = dr_chain[2];
	  /* Create interleaving stmt:
	     low = VEC_PERM_EXPR <vect1, vect2,
				  {0, 1, nelt + j, 3, 4, nelt + j + 1,
				   6, 7, nelt + j + 2, ...}>  */
	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
					   vect2, perm3_mask_high);
	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);
	  (*result_chain)[j] = data_ref;
	}
    }
  else
    {
      /* If length is not equal to 3 then only power of 2 is supported.  */
      gcc_assert (pow2p_hwi (length));

      /* The encoding has 2 interleaved stepped patterns.  */
      vec_perm_builder sel (nelt, 2, 3);
      sel.quick_grow (6);
      for (i = 0; i < 3; i++)
	{
	  sel[i * 2] = i;
	  sel[i * 2 + 1] = i + nelt;
	}
      vec_perm_indices indices (sel, 2, nelt);
      perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);

      for (i = 0; i < 6; i++)
	sel[i] += nelt / 2;
      indices.new_vector (sel, 2, nelt);
      perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);

      for (i = 0, n = log_length; i < n; i++)
	{
	  for (j = 0; j < length/2; j++)
	    {
	      vect1 = dr_chain[j];
	      vect2 = dr_chain[j+length/2];

	      /* Create interleaving stmt:
		 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
						      ...}>  */
	      high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
	      perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
					       vect2, perm_mask_high);
	      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
	      (*result_chain)[2*j] = high;

	      /* Create interleaving stmt:
		 low = VEC_PERM_EXPR <vect1, vect2,
				      {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
				       ...}>  */
	      low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
	      perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
					       vect2, perm_mask_low);
	      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
	      (*result_chain)[2*j+1] = low;
	    }
	  memcpy (dr_chain.address (), result_chain->address (),
		  length * sizeof (tree));
	}
    }
}
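
/* Editor's sketch: a small self-contained C program (not part of GCC;
   compile it separately) that prints the interleave-high/low masks built
   in the power-of-two branch above, so the element routing of each stage
   can be checked by hand for a given element count NELT.

     #include <stdio.h>
     #include <stdlib.h>

     int
     main (int argc, char **argv)
     {
       // Number of elements per vector; defaults to 8 as in the comment.
       unsigned nelt = argc > 1 ? (unsigned) atoi (argv[1]) : 8;
       unsigned i;

       printf ("interleave high:");
       for (i = 0; i < nelt / 2; i++)
	 printf (" %u %u", i, nelt + i);
       printf ("\ninterleave low: ");
       for (i = 0; i < nelt / 2; i++)
	 printf (" %u %u", nelt / 2 + i, nelt + nelt / 2 + i);
       printf ("\n");
       return 0;
     }

   For NELT == 8 it prints {0 8 1 9 2 10 3 11} and {4 12 5 13 6 14 7 15};
   applying the first mask to (1st vec, 3rd vec) of the example above gives
   exactly I1 = 0 16 1 17 2 18 3 19.  */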
/* Function vect_setup_realignment

   This function is called when vectorizing an unaligned load using
   the dr_explicit_realign[_optimized] scheme.
   This function generates the following code at the loop prolog:

      p = initial_addr;
   x  msq_init = *(floor(p));   # prolog load
      realignment_token = call target_builtin;
    loop:
   x  msq = phi (msq_init, ---)

   The stmts marked with x are generated only for the case of
   dr_explicit_realign_optimized.

   The code above sets up a new (vector) pointer, pointing to the first
   location accessed by STMT, and a "floor-aligned" load using that pointer.
   It also generates code to compute the "realignment-token" (if the relevant
   target hook was defined), and creates a phi-node at the loop-header bb
   whose arguments are the result of the prolog-load (created by this
   function) and the result of a load that takes place in the loop (to be
   created by the caller to this function).

   For the case of dr_explicit_realign_optimized:
   The caller to this function uses the phi-result (msq) to create the
   realignment code inside the loop, and sets up the missing phi argument,
   as follows:
    loop:
      msq = phi (msq_init, lsq)
      lsq = *(floor(p'));        # load in loop
      result = realign_load (msq, lsq, realignment_token);

   For the case of dr_explicit_realign:
    loop:
      msq = *(floor(p));	# load in loop
      p' = p + (VS-1);
      lsq = *(floor(p'));	# load in loop
      result = realign_load (msq, lsq, realignment_token);

   Input:
   STMT - (scalar) load stmt to be vectorized. This load accesses
          a memory location that may be unaligned.
   BSI - place where new code is to be inserted.
   ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
			      is used.

   Output:
   REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
                       target hook, if defined.
   Return value - the result of the loop-header phi node.  */

tree
vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
			tree *realignment_token,
			enum dr_alignment_support alignment_support_scheme,
			tree init_addr,
			struct loop **at_loop)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
  struct loop *loop = NULL;
  edge pe = NULL;
  tree scalar_dest = gimple_assign_lhs (stmt);
  tree vec_dest;
  gimple *inc;
  tree ptr;
  tree data_ref;
  basic_block new_bb;
  tree msq_init = NULL_TREE;
  tree new_temp;
  gphi *phi_stmt;
  tree msq = NULL_TREE;
  gimple_seq stmts = NULL;
  bool inv_p;
  bool compute_in_loop = false;
  bool nested_in_vect_loop = false;
  struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
  struct loop *loop_for_initial_load = NULL;

  if (loop_vinfo)
    {
      loop = LOOP_VINFO_LOOP (loop_vinfo);
      nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
    }

  gcc_assert (alignment_support_scheme == dr_explicit_realign
	      || alignment_support_scheme == dr_explicit_realign_optimized);

  /* We need to generate three things:
     1. the misalignment computation
     2. the extra vector load (for the optimized realignment scheme).
     3. the phi node for the two vectors from which the realignment is
      done (for the optimized realignment scheme).  */

  /* 1. Determine where to generate the misalignment computation.

     If INIT_ADDR is NULL_TREE, this indicates that the misalignment
     calculation will be generated by this function, outside the loop (in the
     preheader).  Otherwise, INIT_ADDR had already been computed for us by the
     caller, inside the loop.

     Background: If the misalignment remains fixed throughout the iterations of
     the loop, then both realignment schemes are applicable, and also the
     misalignment computation can be done outside LOOP.  This is because we are
     vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
     are a multiple of VS (the Vector Size), and therefore the misalignment in
     different vectorized LOOP iterations is always the same.
     The problem arises only if the memory access is in an inner-loop nested
     inside LOOP, which is now being vectorized using outer-loop vectorization.
     This is the only case when the misalignment of the memory access may not
     remain fixed throughout the iterations of the inner-loop (as explained in
     detail in vect_supportable_dr_alignment).  In this case, not only is the
     optimized realignment scheme not applicable, but also the misalignment
     computation (and generation of the realignment token that is passed to
     REALIGN_LOAD) have to be done inside the loop.

     In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
     or not, which in turn determines if the misalignment is computed inside
     the inner-loop, or outside LOOP.  */

  if (init_addr != NULL_TREE || !loop_vinfo)
    {
      compute_in_loop = true;
      gcc_assert (alignment_support_scheme == dr_explicit_realign);
    }


  /* 2. Determine where to generate the extra vector load.

     For the optimized realignment scheme, instead of generating two vector
     loads in each iteration, we generate a single extra vector load in the
     preheader of the loop, and in each iteration reuse the result of the
     vector load from the previous iteration.  In case the memory access is in
     an inner-loop nested inside LOOP, which is now being vectorized using
     outer-loop vectorization, we need to determine whether this initial vector
     load should be generated at the preheader of the inner-loop, or can be
     generated at the preheader of LOOP.  If the memory access has no evolution
     in LOOP, it can be generated in the preheader of LOOP.  Otherwise, it has
     to be generated inside LOOP (in the preheader of the inner-loop).  */

  if (nested_in_vect_loop)
    {
      tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
      bool invariant_in_outerloop =
	(tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
      loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
    }
  else
    loop_for_initial_load = loop;
  if (at_loop)
    *at_loop = loop_for_initial_load;

  if (loop_for_initial_load)
    pe = loop_preheader_edge (loop_for_initial_load);

  /* 3. For the case of the optimized realignment, create the first vector
      load at the loop preheader.  */

  if (alignment_support_scheme == dr_explicit_realign_optimized)
    {
      /* Create msq_init = *(floor(p1)) in the loop preheader  */
      gassign *new_stmt;

      gcc_assert (!compute_in_loop);
      vec_dest = vect_create_destination_var (scalar_dest, vectype);
      ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
				      NULL_TREE, &init_addr, NULL, &inc,
				      true, &inv_p);
      if (TREE_CODE (ptr) == SSA_NAME)
	new_temp = copy_ssa_name (ptr);
      else
	new_temp = make_ssa_name (TREE_TYPE (ptr));
      unsigned int align = DR_TARGET_ALIGNMENT (dr);
      new_stmt = gimple_build_assign
		   (new_temp, BIT_AND_EXPR, ptr,
		    build_int_cst (TREE_TYPE (ptr), -(HOST_WIDE_INT) align));
      new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
      gcc_assert (!new_bb);
      data_ref
	= build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
		  build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
      new_stmt = gimple_build_assign (vec_dest, data_ref);
      new_temp = make_ssa_name (vec_dest, new_stmt);
      gimple_assign_set_lhs (new_stmt, new_temp);
      if (pe)
	{
	  new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
	  gcc_assert (!new_bb);
	}
      else
	gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);

      msq_init = gimple_assign_lhs (new_stmt);
    }

  /* 4. Create realignment token using a target builtin, if available.
      It is done either inside the containing loop, or before LOOP (as
      determined above).  */

  if (targetm.vectorize.builtin_mask_for_load)
    {
      gcall *new_stmt;
      tree builtin_decl;

      /* Compute INIT_ADDR - the initial address accessed by this memref.  */
      if (!init_addr)
	{
	  /* Generate the INIT_ADDR computation outside LOOP.  */
	  init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
							    NULL_TREE);
	  if (loop)
	    {
	      pe = loop_preheader_edge (loop);
	      new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
	      gcc_assert (!new_bb);
	    }
	  else
	    gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
	}

      builtin_decl = targetm.vectorize.builtin_mask_for_load ();
      new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
      vec_dest =
	vect_create_destination_var (scalar_dest,
				     gimple_call_return_type (new_stmt));
      new_temp = make_ssa_name (vec_dest, new_stmt);
      gimple_call_set_lhs (new_stmt, new_temp);

      if (compute_in_loop)
	gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
      else
	{
	  /* Generate the misalignment computation outside LOOP.  */
	  pe = loop_preheader_edge (loop);
	  new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
	  gcc_assert (!new_bb);
	}

      *realignment_token = gimple_call_lhs (new_stmt);

      /* The result of the CALL_EXPR to this builtin is determined from
         the value of the parameter and no global variables are touched
         which makes the builtin a "const" function.  Requiring the
         builtin to have the "const" attribute makes it unnecessary
	 to call mark_call_clobbered.  */
      gcc_assert (TREE_READONLY (builtin_decl));
    }

  if (alignment_support_scheme == dr_explicit_realign)
    return msq;

  gcc_assert (!compute_in_loop);
  gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);


  /* 5. Create msq = phi <msq_init, lsq> in loop  */

  pe = loop_preheader_edge (containing_loop);
  vec_dest = vect_create_destination_var (scalar_dest, vectype);
  msq = make_ssa_name (vec_dest);
  phi_stmt = create_phi_node (msq, containing_loop->header);
  add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);

  return msq;
}
/* Function vect_grouped_load_supported.

   COUNT is the size of the load group (the number of statements plus the
   number of gaps).  SINGLE_ELEMENT_P is true if there is actually
   only one statement, with a gap of COUNT - 1.

   Returns true if a suitable permute exists.  */

bool
vect_grouped_load_supported (tree vectype, bool single_element_p,
			     unsigned HOST_WIDE_INT count)
{
  machine_mode mode = TYPE_MODE (vectype);

  /* If this is single-element interleaving with an element distance
     that leaves unused vector loads around punt - we at least create
     very sub-optimal code in that case (and blow up memory).  */
  if (single_element_p && count > TYPE_VECTOR_SUBPARTS (vectype))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "single-element interleaving not supported "
			 "for not adjacent vector loads\n");
      return false;
    }

  /* vect_permute_load_chain requires the group size to be equal to 3 or
     be a power of two.  */
  if (count != 3 && exact_log2 (count) == -1)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "the size of the group of accesses"
			 " is not a power of 2 or not equal to 3\n");
      return false;
    }

  /* Check that the permutation is supported.  */
  if (VECTOR_MODE_P (mode))
    {
      unsigned int i, j, nelt = GET_MODE_NUNITS (mode);

      if (count == 3)
	{
	  unsigned int k;
	  vec_perm_builder sel (nelt, nelt, 1);
	  sel.quick_grow (nelt);
	  vec_perm_indices indices;
	  for (k = 0; k < 3; k++)
	    {
	      for (i = 0; i < nelt; i++)
		if (3 * i + k < 2 * nelt)
		  sel[i] = 3 * i + k;
		else
		  sel[i] = 0;
	      indices.new_vector (sel, 2, nelt);
	      if (!can_vec_perm_const_p (mode, indices))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "shuffle of 3 loads is not supported by"
				     " target\n");
		  return false;
		}
	      for (i = 0, j = 0; i < nelt; i++)
		if (3 * i + k < 2 * nelt)
		  sel[i] = i;
		else
		  sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
	      indices.new_vector (sel, 2, nelt);
	      if (!can_vec_perm_const_p (mode, indices))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "shuffle of 3 loads is not supported by"
				     " target\n");
		  return false;
		}
	    }
	  return true;
	}
      else
	{
	  /* If length is not equal to 3 then only power of 2 is supported.  */
	  gcc_assert (pow2p_hwi (count));

	  /* The encoding has a single stepped pattern.  */
	  vec_perm_builder sel (nelt, 1, 3);
	  sel.quick_grow (3);
	  for (i = 0; i < 3; i++)
	    sel[i] = i * 2;
	  vec_perm_indices indices (sel, 2, nelt);
	  if (can_vec_perm_const_p (mode, indices))
	    {
	      for (i = 0; i < 3; i++)
		sel[i] = i * 2 + 1;
	      indices.new_vector (sel, 2, nelt);
	      if (can_vec_perm_const_p (mode, indices))
		return true;
	    }
	}
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
		     "extract even/odd not supported by target\n");
  return false;
}
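
/* Editor's example: for a V4SI vector (nelt == 4) the power-of-two case
   above checks the two masks

     extract even:  { 0, 2, 4, 6 }
     extract odd:   { 1, 3, 5, 7 }

   i.e. VEC_PERM_EXPR <{a0,a1,a2,a3}, {b0,b1,b2,b3}, mask> produces
   {a0,a2,b0,b2} resp. {a1,a3,b1,b3}, the building block used by
   vect_permute_load_chain below.  */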
/* Return TRUE if vec_load_lanes is available for COUNT vectors of
   type VECTYPE.  */

bool
vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
{
  return vect_lanes_optab_supported_p ("vec_load_lanes",
				       vec_load_lanes_optab,
				       vectype, count);
}
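
/* Editor's note: vec_load_lanes (like vec_store_lanes above) models
   instructions that load COUNT consecutive vectors and de-interleave them
   in a single operation, e.g. AArch64/ARM NEON LD2/LD3/LD4 and their
   ST2/ST3/ST4 counterparts.  When such an optab is available the
   vectorizer can use it for a grouped access instead of emitting the
   explicit VEC_PERM_EXPR chains built by the vect_permute_{load,store}_chain
   routines in this file.  */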
/* Function vect_permute_load_chain.

   Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
   a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
   the input data correctly.  Return the final references for loads in
   RESULT_CHAIN.

   E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
   The input is 4 vectors each containing 8 elements. We assign a number to each
   element, the input sequence is:

   1st vec:   0  1  2  3  4  5  6  7
   2nd vec:   8  9 10 11 12 13 14 15
   3rd vec:  16 17 18 19 20 21 22 23
   4th vec:  24 25 26 27 28 29 30 31

   The output sequence should be:

   1st vec:  0 4  8 12 16 20 24 28
   2nd vec:  1 5  9 13 17 21 25 29
   3rd vec:  2 6 10 14 18 22 26 30
   4th vec:  3 7 11 15 19 23 27 31

   i.e., the first output vector should contain the first elements of each
   interleaving group, etc.

   We use extract_even/odd instructions to create such output.  The input of
   each extract_even/odd operation is two vectors
   1st vec    2nd vec
   0 1 2 3    4 5 6 7

   and the output is the vector of extracted even/odd elements.  The output of
   extract_even will be:   0 2 4 6
   and of extract_odd:     1 3 5 7


   The permutation is done in log LENGTH stages.  In each stage extract_even
   and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
   their order.  In our example,

   E1: extract_even (1st vec, 2nd vec)
   E2: extract_odd (1st vec, 2nd vec)
   E3: extract_even (3rd vec, 4th vec)
   E4: extract_odd (3rd vec, 4th vec)

   The output for the first stage will be:

   E1:  0  2  4  6  8 10 12 14
   E2:  1  3  5  7  9 11 13 15
   E3: 16 18 20 22 24 26 28 30
   E4: 17 19 21 23 25 27 29 31

   In order to proceed and create the correct sequence for the next stage (or
   for the correct output, if the second stage is the last one, as in our
   example), we first put the output of extract_even operation and then the
   output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
   The input for the second stage is:

   1st vec (E1):  0  2  4  6  8 10 12 14
   2nd vec (E3): 16 18 20 22 24 26 28 30
   3rd vec (E2):  1  3  5  7  9 11 13 15
   4th vec (E4): 17 19 21 23 25 27 29 31

   The output of the second stage:

   E1: 0 4  8 12 16 20 24 28
   E2: 2 6 10 14 18 22 26 30
   E3: 1 5  9 13 17 21 25 29
   E4: 3 7 11 15 19 23 27 31

   And RESULT_CHAIN after reordering:

   1st vec (E1):  0 4  8 12 16 20 24 28
   2nd vec (E3):  1 5  9 13 17 21 25 29
   3rd vec (E2):  2 6 10 14 18 22 26 30
   4th vec (E4):  3 7 11 15 19 23 27 31.  */

static void
vect_permute_load_chain (vec<tree> dr_chain,
			 unsigned int length,
			 gimple *stmt,
			 gimple_stmt_iterator *gsi,
			 vec<tree> *result_chain)
{
  tree data_ref, first_vect, second_vect;
  tree perm_mask_even, perm_mask_odd;
  tree perm3_mask_low, perm3_mask_high;
  gimple *perm_stmt;
  tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
  unsigned int i, j, log_length = exact_log2 (length);
  unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);

  result_chain->quick_grow (length);
  memcpy (result_chain->address (), dr_chain.address (),
	  length * sizeof (tree));

  if (length == 3)
    {
      unsigned int k;

      vec_perm_builder sel (nelt, nelt, 1);
      sel.quick_grow (nelt);
      vec_perm_indices indices;
      for (k = 0; k < 3; k++)
	{
	  for (i = 0; i < nelt; i++)
	    if (3 * i + k < 2 * nelt)
	      sel[i] = 3 * i + k;
	    else
	      sel[i] = 0;
	  indices.new_vector (sel, 2, nelt);
	  perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);

	  for (i = 0, j = 0; i < nelt; i++)
	    if (3 * i + k < 2 * nelt)
	      sel[i] = i;
	    else
	      sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
	  indices.new_vector (sel, 2, nelt);
	  perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);

	  first_vect = dr_chain[0];
	  second_vect = dr_chain[1];

	  /* Create interleaving stmt (low part of):
	     low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
							     ...}>  */
	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
					   second_vect, perm3_mask_low);
	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);

	  /* Create interleaving stmt (high part of):
	     high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
							      ...}>  */
	  first_vect = data_ref;
	  second_vect = dr_chain[2];
	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
					   second_vect, perm3_mask_high);
	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);
	  (*result_chain)[k] = data_ref;
	}
    }
  else
    {
      /* If length is not equal to 3 then only power of 2 is supported.  */
      gcc_assert (pow2p_hwi (length));

      /* The encoding has a single stepped pattern.  */
      vec_perm_builder sel (nelt, 1, 3);
      sel.quick_grow (3);
      for (i = 0; i < 3; ++i)
	sel[i] = i * 2;
      vec_perm_indices indices (sel, 2, nelt);
      perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);

      for (i = 0; i < 3; ++i)
	sel[i] = i * 2 + 1;
      indices.new_vector (sel, 2, nelt);
      perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);

      for (i = 0; i < log_length; i++)
	{
	  for (j = 0; j < length; j += 2)
	    {
	      first_vect = dr_chain[j];
	      second_vect = dr_chain[j+1];

	      /* data_ref = permute_even (first_data_ref, second_data_ref);  */
	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
					       first_vect, second_vect,
					       perm_mask_even);
	      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
	      (*result_chain)[j/2] = data_ref;

	      /* data_ref = permute_odd (first_data_ref, second_data_ref);  */
	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
					       first_vect, second_vect,
					       perm_mask_odd);
	      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
	      (*result_chain)[j/2+length/2] = data_ref;
	    }
	  memcpy (dr_chain.address (), result_chain->address (),
		  length * sizeof (tree));
	}
    }
}
5454 /* Function vect_shift_permute_load_chain.
5456 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5457 sequence of stmts to reorder the input data accordingly.
5458 Return the final references for loads in RESULT_CHAIN.
5459 Return true if successed, false otherwise.
5461 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5462 The input is 3 vectors each containing 8 elements. We assign a
5463 number to each element, the input sequence is:
5465 1st vec: 0 1 2 3 4 5 6 7
5466 2nd vec: 8 9 10 11 12 13 14 15
5467 3rd vec: 16 17 18 19 20 21 22 23
5469 The output sequence should be:
5471 1st vec: 0 3 6 9 12 15 18 21
5472 2nd vec: 1 4 7 10 13 16 19 22
5473 3rd vec: 2 5 8 11 14 17 20 23
5475 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5477 First we shuffle all 3 vectors to get correct elements order:
5479 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
5480 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
5481 3rd vec: (16 19 22) (17 20 23) (18 21)
5483 Next we unite and shift vector 3 times:
5486 shift right by 6 the concatenation of:
5487 "1st vec" and "2nd vec"
5488 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5489 "2nd vec" and "3rd vec"
5490 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5491 "3rd vec" and "1st vec"
5492 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
5495 So that now new vectors are:
5497 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
5498 2nd vec: (10 13) (16 19 22) (17 20 23)
5499 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
5502 shift right by 5 the concatenation of:
5503 "1st vec" and "3rd vec"
5504 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
5505 "2nd vec" and "1st vec"
5506 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
5507 "3rd vec" and "2nd vec"
5508 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
5511 So that now new vectors are:
5513 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
5514 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
5515 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
5518 shift right by 5 the concatenation of:
5519 "1st vec" and "1st vec"
5520 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
5521 shift right by 3 the concatenation of:
5522 "2nd vec" and "2nd vec"
5523 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
5526 So that now all vectors are READY:
5527 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
5528 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
5529 3rd vec: ( 1 4 7) (10 13) (16 19 22)
5531 This algorithm is faster than one in vect_permute_load_chain if:
5532 1. "shift of a concatination" is faster than general permutation.
5534 2. The TARGET machine can't execute vector instructions in parallel.
5535 This is because each step of the algorithm depends on previous.
5536 The algorithm in vect_permute_load_chain is much more parallel.
5538 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
5542 vect_shift_permute_load_chain (vec
<tree
> dr_chain
,
5543 unsigned int length
,
5545 gimple_stmt_iterator
*gsi
,
5546 vec
<tree
> *result_chain
)
5548 tree vect
[3], vect_shift
[3], data_ref
, first_vect
, second_vect
;
5549 tree perm2_mask1
, perm2_mask2
, perm3_mask
;
5550 tree select_mask
, shift1_mask
, shift2_mask
, shift3_mask
, shift4_mask
;
5553 tree vectype
= STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt
));
5555 unsigned nelt
= TYPE_VECTOR_SUBPARTS (vectype
);
5556 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
5557 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
5559 vec_perm_builder
sel (nelt
, nelt
, 1);
5560 sel
.quick_grow (nelt
);
5562 result_chain
->quick_grow (length
);
5563 memcpy (result_chain
->address (), dr_chain
.address (),
5564 length
* sizeof (tree
));
5566 if (pow2p_hwi (length
) && LOOP_VINFO_VECT_FACTOR (loop_vinfo
) > 4)
5568 unsigned int j
, log_length
= exact_log2 (length
);
5569 for (i
= 0; i
< nelt
/ 2; ++i
)
5571 for (i
= 0; i
< nelt
/ 2; ++i
)
5572 sel
[nelt
/ 2 + i
] = i
* 2 + 1;
5573 vec_perm_indices
indices (sel
, 2, nelt
);
5574 if (!can_vec_perm_const_p (TYPE_MODE (vectype
), indices
))
5576 if (dump_enabled_p ())
5577 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5578 "shuffle of 2 fields structure is not \
5579 supported by target\n");
5582 perm2_mask1
= vect_gen_perm_mask_checked (vectype
, indices
);
5584 for (i
= 0; i
< nelt
/ 2; ++i
)
5586 for (i
= 0; i
< nelt
/ 2; ++i
)
5587 sel
[nelt
/ 2 + i
] = i
* 2;
5588 indices
.new_vector (sel
, 2, nelt
);
5589 if (!can_vec_perm_const_p (TYPE_MODE (vectype
), indices
))
5591 if (dump_enabled_p ())
5592 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5593 "shuffle of 2 fields structure is not \
5594 supported by target\n");
5597 perm2_mask2
= vect_gen_perm_mask_checked (vectype
, indices
);
5599 /* Generating permutation constant to shift all elements.
5600 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
5601 for (i
= 0; i
< nelt
; i
++)
5602 sel
[i
] = nelt
/ 2 + i
;
5603 indices
.new_vector (sel
, 2, nelt
);
5604 if (!can_vec_perm_const_p (TYPE_MODE (vectype
), indices
))
5606 if (dump_enabled_p ())
5607 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5608 "shift permutation is not supported by target\n");
5611 shift1_mask
= vect_gen_perm_mask_checked (vectype
, indices
);
5613 /* Generating permutation constant to select vector from 2.
5614 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
5615 for (i
= 0; i
< nelt
/ 2; i
++)
5617 for (i
= nelt
/ 2; i
< nelt
; i
++)
5619 indices
.new_vector (sel
, 2, nelt
);
5620 if (!can_vec_perm_const_p (TYPE_MODE (vectype
), indices
))
5622 if (dump_enabled_p ())
5623 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5624 "select is not supported by target\n");
5627 select_mask
= vect_gen_perm_mask_checked (vectype
, indices
);
5629 for (i
= 0; i
< log_length
; i
++)
5631 for (j
= 0; j
< length
; j
+= 2)
5633 first_vect
= dr_chain
[j
];
5634 second_vect
= dr_chain
[j
+ 1];
5636 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle2");
5637 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5638 first_vect
, first_vect
,
5640 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5643 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle2");
5644 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5645 second_vect
, second_vect
,
5647 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5650 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shift");
5651 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5652 vect
[0], vect
[1], shift1_mask
);
5653 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5654 (*result_chain
)[j
/2 + length
/2] = data_ref
;
5656 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_select");
5657 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5658 vect
[0], vect
[1], select_mask
);
5659 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5660 (*result_chain
)[j
/2] = data_ref
;
5662 memcpy (dr_chain
.address (), result_chain
->address (),
5663 length
* sizeof (tree
));
5667 if (length
== 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo
) > 2)
5669 unsigned int k
= 0, l
= 0;
5671 /* Generating permutation constant to get all elements in rigth order.
5672 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
5673 for (i
= 0; i
< nelt
; i
++)
5675 if (3 * k
+ (l
% 3) >= nelt
)
5678 l
+= (3 - (nelt
% 3));
5680 sel
[i
] = 3 * k
+ (l
% 3);
5683 vec_perm_indices
indices (sel
, 2, nelt
);
5684 if (!can_vec_perm_const_p (TYPE_MODE (vectype
), indices
))
5686 if (dump_enabled_p ())
5687 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5688 "shuffle of 3 fields structure is not \
5689 supported by target\n");
5692 perm3_mask
= vect_gen_perm_mask_checked (vectype
, indices
);
5694 /* Generating permutation constant to shift all elements.
5695 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
5696 for (i
= 0; i
< nelt
; i
++)
5697 sel
[i
] = 2 * (nelt
/ 3) + (nelt
% 3) + i
;
5698 indices
.new_vector (sel
, 2, nelt
);
5699 if (!can_vec_perm_const_p (TYPE_MODE (vectype
), indices
))
5701 if (dump_enabled_p ())
5702 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5703 "shift permutation is not supported by target\n");
5706 shift1_mask
= vect_gen_perm_mask_checked (vectype
, indices
);
5708 /* Generating permutation constant to shift all elements.
5709 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5710 for (i
= 0; i
< nelt
; i
++)
5711 sel
[i
] = 2 * (nelt
/ 3) + 1 + i
;
5712 indices
.new_vector (sel
, 2, nelt
);
5713 if (!can_vec_perm_const_p (TYPE_MODE (vectype
), indices
))
5715 if (dump_enabled_p ())
5716 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5717 "shift permutation is not supported by target\n");
5720 shift2_mask
= vect_gen_perm_mask_checked (vectype
, indices
);
      /* Generating permutation constant to shift all elements.
         For vector length 8 it is {3 4 5 6 7 8 9 10}.  */
      for (i = 0; i < nelt; i++)
        sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
      indices.new_vector (sel, 2, nelt);
      if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "shift permutation is not supported by target\n");
          return false;
        }
      shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
      /* Generating permutation constant to shift all elements.
         For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
      for (i = 0; i < nelt; i++)
        sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
      indices.new_vector (sel, 2, nelt);
      if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "shift permutation is not supported by target\n");
          return false;
        }
      shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
      for (k = 0; k < 3; k++)
        {
          data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
          perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
                                           dr_chain[k], dr_chain[k],
                                           perm3_mask);
          vect_finish_stmt_generation (stmt, perm_stmt, gsi);
          vect[k] = data_ref;
        }
      for (k = 0; k < 3; k++)
        {
          data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
          perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
                                           vect[k % 3], vect[(k + 1) % 3],
                                           shift1_mask);
          vect_finish_stmt_generation (stmt, perm_stmt, gsi);
          vect_shift[k] = data_ref;
        }
      for (k = 0; k < 3; k++)
        {
          data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
          perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
                                           vect_shift[(4 - k) % 3],
                                           vect_shift[(3 - k) % 3],
                                           shift2_mask);
          vect_finish_stmt_generation (stmt, perm_stmt, gsi);
          vect[k] = data_ref;
        }
      (*result_chain)[3 - (nelt % 3)] = vect[2];

      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
                                       vect[0], shift3_mask);
      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
      (*result_chain)[nelt % 3] = data_ref;

      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
                                       vect[1], shift4_mask);
      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
      (*result_chain)[0] = data_ref;
      return true;
    }
  return false;
}
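/* Worked example of the LENGTH == 3 branch above (illustrative, assuming
   NELT == 8, i.e. 24 scalar elements a0..a23 spread over the three input
   vectors of DR_CHAIN, with the mask values quoted in the comments above):

     vect[k]      = VEC_PERM <dr_chain[k], dr_chain[k], perm3_mask>
                    // {a0 a3 a6 a1 a4 a7 a2 a5}, {a8 a11 a14 a9 ...}, ...
     vect_shift[] = cross-vector shifts of neighbouring vect[] vectors
                    // using shift1_mask
     vect[]       = second round of shifts pairing the vect_shift[] vectors
                    // using shift2_mask

   After the two final single-vector shifts (shift3_mask, shift4_mask) the
   outputs are

     (*result_chain)[0] = {a0 a3 a6 a9 a12 a15 a18 a21}   // field 0
     (*result_chain)[1] = {a1 a4 a7 a10 a13 a16 a19 a22}  // field 1
     (*result_chain)[2] = {a2 a5 a8 a11 a14 a17 a20 a23}  // field 2

   i.e. each output vector gathers one field of the 3-element structure.  */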
/* Function vect_transform_grouped_load.

   Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
   to perform their permutation and ascribe the result vectorized statements to
   the scalar statements.  */

void
vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
                             gimple_stmt_iterator *gsi)
{
  machine_mode mode;
  vec<tree> result_chain = vNULL;
  /* DR_CHAIN contains input data-refs that are a part of the interleaving.
     RESULT_CHAIN is the output of vect_permute_load_chain; it contains
     permuted vectors that are ready for vector computation.  */
  result_chain.create (size);

  /* If the reassociation width for the vector type is 2 or greater, the
     target machine can execute 2 or more vector instructions in parallel.
     Otherwise try to get the chain for the load group using
     vect_shift_permute_load_chain.  */
  mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
  if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
      || pow2p_hwi (size)
      || !vect_shift_permute_load_chain (dr_chain, size, stmt,
                                         gsi, &result_chain))
    vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
  vect_record_grouped_load_vectors (stmt, result_chain);
  result_chain.release ();
}
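/* Illustrative sketch of the decision above (the numbers are assumptions,
   not taken from any particular target): on a target whose
   targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) is 1, and for a
   group of SIZE == 3 loads, the shift-based scheme is attempted first:

     vect_shift_permute_load_chain (dr_chain, 3, stmt, gsi, &result_chain)

   and only if it cannot be generated (e.g. the target lacks the required
   constant permutes) do we fall back to

     vect_permute_load_chain (dr_chain, 3, stmt, gsi, &result_chain);

   With a reassociation width of 2 or more (or a power-of-two SIZE, per the
   condition above) the generic extraction of vect_permute_load_chain is used
   directly, since the target can overlap its permutes.  */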
/* RESULT_CHAIN contains the output of a group of grouped loads that were
   generated as part of the vectorization of STMT.  Assign the statement
   for each vector to the associated scalar statement.  */

void
vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
{
  gimple *first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
  gimple *next_stmt, *new_stmt;
  unsigned int i, gap_count;
  tree tmp_data_ref;

  /* Put a permuted data-ref in the VECTORIZED_STMT field.
     Since we scan the chain starting from its first node, their order
     corresponds to the order of data-refs in RESULT_CHAIN.  */
  next_stmt = first_stmt;
  gap_count = 1;
  FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
    {
      if (!next_stmt)
        break;
      /* Skip the gaps.  Loads created for the gaps will be removed by dead
         code elimination pass later.  No need to check for the first stmt in
         the group, since it always exists.
         GROUP_GAP is the number of steps in elements from the previous
         access (if there is no gap GROUP_GAP is 1).  We skip loads that
         correspond to the gaps.  */
      if (next_stmt != first_stmt
          && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
        {
          gap_count++;
          continue;
        }

      while (next_stmt)
        {
          new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
          /* We assume that if VEC_STMT is not NULL, this is a case of multiple
             copies, and we put the new vector statement in the first available
             RELATED_STMT field.  */
          if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
            STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
          else
            {
              if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
                {
                  gimple *prev_stmt =
                    STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
                  gimple *rel_stmt =
                    STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
                  while (rel_stmt)
                    {
                      prev_stmt = rel_stmt;
                      rel_stmt =
                        STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
                    }

                  STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
                    new_stmt;
                }
            }

          next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
          gap_count = 1;
          /* If NEXT_STMT accesses the same DR as the previous statement,
             put the same TMP_DATA_REF as its vectorized statement; otherwise
             get the next data-ref from RESULT_CHAIN.  */
          if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
            break;
        }
    }
}
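/* Illustrative example of the gap handling above (a hypothetical group, not
   taken from a testcase): suppose the scalar code loads only a[4*i] and
   a[4*i+3] of a 4-element interleaving, so the group has two statements and
   GROUP_GAP of the second one is 3.  RESULT_CHAIN then holds four permuted
   vectors {v0, v1, v2, v3}, one per interleaving slot.  The walk above
   assigns v0 to the first scalar load, skips v1 and v2 (gap_count values 1
   and 2 are below GROUP_GAP == 3), and assigns v3 to the second load; the
   unused v1 and v2 definitions are left for DCE to delete.  */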
/* Function vect_can_force_dr_alignment_p.

   Return whether the alignment of DECL can be forced to be aligned
   on an ALIGNMENT-bit boundary.  */

bool
vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
{
  if (!VAR_P (decl))
    return false;

  if (decl_in_symtab_p (decl)
      && !symtab_node::get (decl)->can_increase_alignment_p ())
    return false;

  if (TREE_STATIC (decl))
    return (alignment <= MAX_OFILE_ALIGNMENT);
  else
    return (alignment <= MAX_STACK_ALIGNMENT);
}
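/* Illustrative uses (the DECLs described are hypothetical):

     - a function-local array accessed in a vectorized loop: TREE_STATIC is
       false, so the request succeeds as long as ALIGNMENT does not exceed
       MAX_STACK_ALIGNMENT;
     - a file-scope static array: the limit is MAX_OFILE_ALIGNMENT instead;
     - a symbol whose symtab node reports !can_increase_alignment_p ()
       (e.g. one defined in another translation unit): the function returns
       false and the vectorizer must peel or version for alignment instead.  */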
/* Return whether the data reference DR is supported with respect to its
   alignment.
   If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
   if it is aligned, i.e., check if it is possible to vectorize it with a
   different alignment.  */

enum dr_alignment_support
vect_supportable_dr_alignment (struct data_reference *dr,
                               bool check_aligned_accesses)
{
  gimple *stmt = DR_STMT (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  machine_mode mode = TYPE_MODE (vectype);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *vect_loop = NULL;
  bool nested_in_vect_loop = false;

  if (aligned_access_p (dr) && !check_aligned_accesses)
    return dr_aligned;

  /* For now assume all conditional loads/stores support unaligned
     access without any special code.  */
  if (is_gimple_call (stmt)
      && gimple_call_internal_p (stmt)
      && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
          || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
    return dr_unaligned_supported;

  if (loop_vinfo)
    {
      vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
      nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
    }
  /* Possibly unaligned access.  */

  /* We can choose between using the implicit realignment scheme (generating
     a misaligned_move stmt) and the explicit realignment scheme (generating
     aligned loads with a REALIGN_LOAD).  There are two variants to the
     explicit realignment scheme: optimized, and unoptimized.
     We can optimize the realignment only if the step between consecutive
     vector loads is equal to the vector size.  Since the vector memory
     accesses advance in steps of VS (Vector Size) in the vectorized loop, it
     is guaranteed that the misalignment amount remains the same throughout the
     execution of the vectorized loop.  Therefore, we can create the
     "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
     at the loop preheader.

     However, in the case of outer-loop vectorization, when vectorizing a
     memory access in the inner-loop nested within the LOOP that is now being
     vectorized, while it is guaranteed that the misalignment of the
     vectorized memory access will remain the same in different outer-loop
     iterations, it is *not* guaranteed that it will remain the same throughout
     the execution of the inner-loop.  This is because the inner-loop advances
     with the original scalar step (and not in steps of VS).  If the inner-loop
     step happens to be a multiple of VS, then the misalignment remains fixed
     and we can use the optimized realignment scheme.  For example:

      for (i=0; i<N; i++)
        for (j=0; j<M; j++)
          s += a[i+j];

     When vectorizing the i-loop in the above example, the step between
     consecutive vector loads is 1, and so the misalignment does not remain
     fixed across the execution of the inner-loop, and the realignment cannot
     be optimized (as illustrated in the following pseudo vectorized loop):

      for (i=0; i<N; i+=4)
        for (j=0; j<M; j++){
          vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
                         // when j is {0,1,2,3,4,5,6,7,...} respectively.
                         // (assuming that we start from an aligned address).
          }

     We therefore have to use the unoptimized realignment scheme:

      for (i=0; i<N; i+=4)
        for (j=k; j<M; j+=4)
          vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
                         // that the misalignment of the initial address is
                         // 0).

     The loop can then be vectorized as follows:

      for (k=0; k<4; k++){
        rt = get_realignment_token (&vp[k]);
        for (i=0; i<N; i+=4){
          v1 = vp[i+k];
          for (j=k; j<M; j+=4){
            v2 = vp[i+j+VS-1];
            va = REALIGN_LOAD <v1,v2,rt>;
            vs += va;
            v1 = v2;
          }
        }
    } */
  if (DR_IS_READ (dr))
    {
      bool is_packed = false;
      tree type = (TREE_TYPE (DR_REF (dr)));

      if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
          && (!targetm.vectorize.builtin_mask_for_load
              || targetm.vectorize.builtin_mask_for_load ()))
        {
          tree vectype = STMT_VINFO_VECTYPE (stmt_info);

          /* If we are doing SLP then the accesses need not have the
             same alignment, instead it depends on the SLP group size.  */
          if (loop_vinfo
              && STMT_SLP_TYPE (stmt_info)
              && (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
                  * GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)))
                  % TYPE_VECTOR_SUBPARTS (vectype) != 0))
            ;
          else if (!loop_vinfo
                   || (nested_in_vect_loop
                       && (TREE_INT_CST_LOW (DR_STEP (dr))
                           != GET_MODE_SIZE (TYPE_MODE (vectype)))))
            return dr_explicit_realign;
          else
            return dr_explicit_realign_optimized;
        }
      if (!known_alignment_for_access_p (dr))
        is_packed = not_size_aligned (DR_REF (dr));

      if (targetm.vectorize.support_vector_misalignment
            (mode, type, DR_MISALIGNMENT (dr), is_packed))
        /* Can't software pipeline the loads, but can at least do them.  */
        return dr_unaligned_supported;
    }
  else
    {
      bool is_packed = false;
      tree type = (TREE_TYPE (DR_REF (dr)));

      if (!known_alignment_for_access_p (dr))
        is_packed = not_size_aligned (DR_REF (dr));

      if (targetm.vectorize.support_vector_misalignment
            (mode, type, DR_MISALIGNMENT (dr), is_packed))
        return dr_unaligned_supported;
    }

  /* Unsupported.  */
  return dr_unaligned_unsupported;
}
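/* Illustrative sketch of how a caller typically branches on the result
   (the surrounding context and variable names are assumptions, not code
   from this file):

     switch (vect_supportable_dr_alignment (dr, false))
       {
       case dr_aligned:
       case dr_unaligned_supported:
         // Emit a plain (possibly misaligned) vector load/store.
         break;
       case dr_explicit_realign:
       case dr_explicit_realign_optimized:
         // Emit aligned loads plus REALIGN_LOAD; the optimized variant
         // computes the realignment token once in the loop preheader.
         break;
       case dr_unaligned_unsupported:
         // Peel for alignment, version the loop, or give up.
         break;
       }  */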