1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2017 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
24 #include "coretypes.h"
34 #include "optabs-tree.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
54 #include "tree-hash-traits.h"
55 #include "vec-perm-indices.h"
57 /* Return true if load- or store-lanes optab OPTAB is implemented for
58 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
61 vect_lanes_optab_supported_p (const char *name
, convert_optab optab
,
62 tree vectype
, unsigned HOST_WIDE_INT count
)
65 scalar_int_mode array_mode
;
68 mode
= TYPE_MODE (vectype
);
69 limit_p
= !targetm
.array_mode_supported_p (mode
, count
);
70 if (!int_mode_for_size (count
* GET_MODE_BITSIZE (mode
),
71 limit_p
).exists (&array_mode
))
73 if (dump_enabled_p ())
74 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
75 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC
"]\n",
76 GET_MODE_NAME (mode
), count
);
80 if (convert_optab_handler (optab
, array_mode
, mode
) == CODE_FOR_nothing
)
82 if (dump_enabled_p ())
83 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
84 "cannot use %s<%s><%s>\n", name
,
85 GET_MODE_NAME (array_mode
), GET_MODE_NAME (mode
));
89 if (dump_enabled_p ())
90 dump_printf_loc (MSG_NOTE
, vect_location
,
91 "can use %s<%s><%s>\n", name
, GET_MODE_NAME (array_mode
),
92 GET_MODE_NAME (mode
));
98 /* Return the smallest scalar part of STMT.
99 This is used to determine the vectype of the stmt. We generally set the
100 vectype according to the type of the result (lhs). For stmts whose
101 result-type is different than the type of the arguments (e.g., demotion,
102 promotion), vectype will be reset appropriately (later). Note that we have
103 to visit the smallest datatype in this function, because that determines the
104 VF. If the smallest datatype in the loop is present only as the rhs of a
105 promotion operation - we'd miss it.
106 Such a case, where a variable of this datatype does not appear in the lhs
107 anywhere in the loop, can only occur if it's an invariant: e.g.:
108 'int_x = (int) short_inv', which we'd expect to have been optimized away by
109 invariant motion. However, we cannot rely on invariant motion to always
110 take invariants out of the loop, and so in the case of promotion we also
111 have to check the rhs.
112 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
116 vect_get_smallest_scalar_type (gimple
*stmt
, HOST_WIDE_INT
*lhs_size_unit
,
117 HOST_WIDE_INT
*rhs_size_unit
)
119 tree scalar_type
= gimple_expr_type (stmt
);
120 HOST_WIDE_INT lhs
, rhs
;
122 /* During the analysis phase, this function is called on arbitrary
123 statements that might not have scalar results. */
124 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type
)))
127 lhs
= rhs
= TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type
));
129 if (is_gimple_assign (stmt
)
130 && (gimple_assign_cast_p (stmt
)
131 || gimple_assign_rhs_code (stmt
) == WIDEN_MULT_EXPR
132 || gimple_assign_rhs_code (stmt
) == WIDEN_LSHIFT_EXPR
133 || gimple_assign_rhs_code (stmt
) == FLOAT_EXPR
))
135 tree rhs_type
= TREE_TYPE (gimple_assign_rhs1 (stmt
));
137 rhs
= TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type
));
139 scalar_type
= rhs_type
;
142 *lhs_size_unit
= lhs
;
143 *rhs_size_unit
= rhs
;
148 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
149 tested at run-time. Return TRUE if DDR was successfully inserted.
150 Return false if versioning is not supported. */
153 vect_mark_for_runtime_alias_test (ddr_p ddr
, loop_vec_info loop_vinfo
)
155 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
157 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS
) == 0)
160 if (!runtime_alias_check_p (ddr
, loop
,
161 optimize_loop_nest_for_speed_p (loop
)))
164 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo
).safe_push (ddr
);
169 /* A subroutine of vect_analyze_data_ref_dependence. Handle
170 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
171 distances. These distances are conservatively correct but they don't
172 reflect a guaranteed dependence.
174 Return true if this function does all the work necessary to avoid
175 an alias or false if the caller should use the dependence distances
176 to limit the vectorization factor in the usual way. LOOP_DEPTH is
177 the depth of the loop described by LOOP_VINFO and the other arguments
178 are as for vect_analyze_data_ref_dependence. */
181 vect_analyze_possibly_independent_ddr (data_dependence_relation
*ddr
,
182 loop_vec_info loop_vinfo
,
183 int loop_depth
, unsigned int *max_vf
)
185 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
186 lambda_vector dist_v
;
188 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr
), i
, dist_v
)
190 int dist
= dist_v
[loop_depth
];
191 if (dist
!= 0 && !(dist
> 0 && DDR_REVERSED_P (ddr
)))
193 /* If the user asserted safelen >= DIST consecutive iterations
194 can be executed concurrently, assume independence.
196 ??? An alternative would be to add the alias check even
197 in this case, and vectorize the fallback loop with the
198 maximum VF set to safelen. However, if the user has
199 explicitly given a length, it's less likely that that
201 if (loop
->safelen
>= 2 && abs_hwi (dist
) <= loop
->safelen
)
203 if ((unsigned int) loop
->safelen
< *max_vf
)
204 *max_vf
= loop
->safelen
;
205 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo
) = false;
209 /* For dependence distances of 2 or more, we have the option
210 of limiting VF or checking for an alias at runtime.
211 Prefer to check at runtime if we can, to avoid limiting
212 the VF unnecessarily when the bases are in fact independent.
214 Note that the alias checks will be removed if the VF ends up
215 being small enough. */
216 return vect_mark_for_runtime_alias_test (ddr
, loop_vinfo
);
223 /* Function vect_analyze_data_ref_dependence.
225 Return TRUE if there (might) exist a dependence between a memory-reference
226 DRA and a memory-reference DRB. When versioning for alias may check a
227 dependence at run-time, return FALSE. Adjust *MAX_VF according to
228 the data dependence. */
231 vect_analyze_data_ref_dependence (struct data_dependence_relation
*ddr
,
232 loop_vec_info loop_vinfo
,
233 unsigned int *max_vf
)
236 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
237 struct data_reference
*dra
= DDR_A (ddr
);
238 struct data_reference
*drb
= DDR_B (ddr
);
239 stmt_vec_info stmtinfo_a
= vinfo_for_stmt (DR_STMT (dra
));
240 stmt_vec_info stmtinfo_b
= vinfo_for_stmt (DR_STMT (drb
));
241 lambda_vector dist_v
;
242 unsigned int loop_depth
;
244 /* In loop analysis all data references should be vectorizable. */
245 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a
)
246 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b
))
249 /* Independent data accesses. */
250 if (DDR_ARE_DEPENDENT (ddr
) == chrec_known
)
254 || (DR_IS_READ (dra
) && DR_IS_READ (drb
)))
257 /* We do not have to consider dependences between accesses that belong
258 to the same group. */
259 if (GROUP_FIRST_ELEMENT (stmtinfo_a
)
260 && GROUP_FIRST_ELEMENT (stmtinfo_a
) == GROUP_FIRST_ELEMENT (stmtinfo_b
))
263 /* Even if we have an anti-dependence then, as the vectorized loop covers at
264 least two scalar iterations, there is always also a true dependence.
265 As the vectorizer does not re-order loads and stores we can ignore
266 the anti-dependence if TBAA can disambiguate both DRs similar to the
267 case with known negative distance anti-dependences (positive
268 distance anti-dependences would violate TBAA constraints). */
269 if (((DR_IS_READ (dra
) && DR_IS_WRITE (drb
))
270 || (DR_IS_WRITE (dra
) && DR_IS_READ (drb
)))
271 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra
)),
272 get_alias_set (DR_REF (drb
))))
275 /* Unknown data dependence. */
276 if (DDR_ARE_DEPENDENT (ddr
) == chrec_dont_know
)
278 /* If user asserted safelen consecutive iterations can be
279 executed concurrently, assume independence. */
280 if (loop
->safelen
>= 2)
282 if ((unsigned int) loop
->safelen
< *max_vf
)
283 *max_vf
= loop
->safelen
;
284 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo
) = false;
288 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a
)
289 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b
))
291 if (dump_enabled_p ())
293 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
294 "versioning for alias not supported for: "
295 "can't determine dependence between ");
296 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
298 dump_printf (MSG_MISSED_OPTIMIZATION
, " and ");
299 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
301 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
306 if (dump_enabled_p ())
308 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
309 "versioning for alias required: "
310 "can't determine dependence between ");
311 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
313 dump_printf (MSG_MISSED_OPTIMIZATION
, " and ");
314 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
316 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
319 /* Add to list of ddrs that need to be tested at run-time. */
320 return !vect_mark_for_runtime_alias_test (ddr
, loop_vinfo
);
323 /* Known data dependence. */
324 if (DDR_NUM_DIST_VECTS (ddr
) == 0)
326 /* If user asserted safelen consecutive iterations can be
327 executed concurrently, assume independence. */
328 if (loop
->safelen
>= 2)
330 if ((unsigned int) loop
->safelen
< *max_vf
)
331 *max_vf
= loop
->safelen
;
332 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo
) = false;
336 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a
)
337 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b
))
339 if (dump_enabled_p ())
341 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
342 "versioning for alias not supported for: "
343 "bad dist vector for ");
344 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
346 dump_printf (MSG_MISSED_OPTIMIZATION
, " and ");
347 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
349 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
354 if (dump_enabled_p ())
356 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
357 "versioning for alias required: "
358 "bad dist vector for ");
359 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, DR_REF (dra
));
360 dump_printf (MSG_MISSED_OPTIMIZATION
, " and ");
361 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, DR_REF (drb
));
362 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
364 /* Add to list of ddrs that need to be tested at run-time. */
365 return !vect_mark_for_runtime_alias_test (ddr
, loop_vinfo
);
368 loop_depth
= index_in_loop_nest (loop
->num
, DDR_LOOP_NEST (ddr
));
370 if (DDR_COULD_BE_INDEPENDENT_P (ddr
)
371 && vect_analyze_possibly_independent_ddr (ddr
, loop_vinfo
,
375 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr
), i
, dist_v
)
377 int dist
= dist_v
[loop_depth
];
379 if (dump_enabled_p ())
380 dump_printf_loc (MSG_NOTE
, vect_location
,
381 "dependence distance = %d.\n", dist
);
385 if (dump_enabled_p ())
387 dump_printf_loc (MSG_NOTE
, vect_location
,
388 "dependence distance == 0 between ");
389 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dra
));
390 dump_printf (MSG_NOTE
, " and ");
391 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (drb
));
392 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
395 /* When we perform grouped accesses and perform implicit CSE
396 by detecting equal accesses and doing disambiguation with
397 runtime alias tests like for
405 where we will end up loading { a[i], a[i+1] } once, make
406 sure that inserting group loads before the first load and
407 stores after the last store will do the right thing.
408 Similar for groups like
412 where loads from the group interleave with the store. */
413 if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a
)
414 || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b
))
416 gimple
*earlier_stmt
;
417 earlier_stmt
= get_earlier_stmt (DR_STMT (dra
), DR_STMT (drb
));
419 (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt
))))
421 if (dump_enabled_p ())
422 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
423 "READ_WRITE dependence in interleaving."
432 if (dist
> 0 && DDR_REVERSED_P (ddr
))
434 /* If DDR_REVERSED_P the order of the data-refs in DDR was
435 reversed (to make distance vector positive), and the actual
436 distance is negative. */
437 if (dump_enabled_p ())
438 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
439 "dependence distance negative.\n");
440 /* Record a negative dependence distance to later limit the
441 amount of stmt copying / unrolling we can perform.
442 Only need to handle read-after-write dependence. */
444 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b
) == 0
445 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b
) > (unsigned)dist
))
446 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b
) = dist
;
450 unsigned int abs_dist
= abs (dist
);
451 if (abs_dist
>= 2 && abs_dist
< *max_vf
)
453 /* The dependence distance requires reduction of the maximal
454 vectorization factor. */
455 *max_vf
= abs (dist
);
456 if (dump_enabled_p ())
457 dump_printf_loc (MSG_NOTE
, vect_location
,
458 "adjusting maximal vectorization factor to %i\n",
462 if (abs_dist
>= *max_vf
)
464 /* Dependence distance does not create dependence, as far as
465 vectorization is concerned, in this case. */
466 if (dump_enabled_p ())
467 dump_printf_loc (MSG_NOTE
, vect_location
,
468 "dependence distance >= VF.\n");
472 if (dump_enabled_p ())
474 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
475 "not vectorized, possible dependence "
476 "between data-refs ");
477 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dra
));
478 dump_printf (MSG_NOTE
, " and ");
479 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (drb
));
480 dump_printf (MSG_NOTE
, "\n");
489 /* Function vect_analyze_data_ref_dependences.
491 Examine all the data references in the loop, and make sure there do not
492 exist any data dependences between them. Set *MAX_VF according to
493 the maximum vectorization factor the data dependences allow. */
496 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo
,
497 unsigned int *max_vf
)
500 struct data_dependence_relation
*ddr
;
502 if (dump_enabled_p ())
503 dump_printf_loc (MSG_NOTE
, vect_location
,
504 "=== vect_analyze_data_ref_dependences ===\n");
506 LOOP_VINFO_DDRS (loop_vinfo
)
507 .create (LOOP_VINFO_DATAREFS (loop_vinfo
).length ()
508 * LOOP_VINFO_DATAREFS (loop_vinfo
).length ());
509 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo
) = true;
510 /* We need read-read dependences to compute STMT_VINFO_SAME_ALIGN_REFS. */
511 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo
),
512 &LOOP_VINFO_DDRS (loop_vinfo
),
513 LOOP_VINFO_LOOP_NEST (loop_vinfo
), true))
516 /* For epilogues we either have no aliases or alias versioning
517 was applied to original loop. Therefore we may just get max_vf
518 using VF of original loop. */
519 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo
))
520 *max_vf
= LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo
);
522 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo
), i
, ddr
)
523 if (vect_analyze_data_ref_dependence (ddr
, loop_vinfo
, max_vf
))
530 /* Function vect_slp_analyze_data_ref_dependence.
532 Return TRUE if there (might) exist a dependence between a memory-reference
533 DRA and a memory-reference DRB. When versioning for alias may check a
534 dependence at run-time, return FALSE. Adjust *MAX_VF according to
535 the data dependence. */
538 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation
*ddr
)
540 struct data_reference
*dra
= DDR_A (ddr
);
541 struct data_reference
*drb
= DDR_B (ddr
);
543 /* We need to check dependences of statements marked as unvectorizable
544 as well, they still can prohibit vectorization. */
546 /* Independent data accesses. */
547 if (DDR_ARE_DEPENDENT (ddr
) == chrec_known
)
553 /* Read-read is OK. */
554 if (DR_IS_READ (dra
) && DR_IS_READ (drb
))
557 /* If dra and drb are part of the same interleaving chain consider
559 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra
)))
560 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra
)))
561 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb
)))))
564 /* Unknown data dependence. */
565 if (DDR_ARE_DEPENDENT (ddr
) == chrec_dont_know
)
567 if (dump_enabled_p ())
569 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
570 "can't determine dependence between ");
571 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, DR_REF (dra
));
572 dump_printf (MSG_MISSED_OPTIMIZATION
, " and ");
573 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, DR_REF (drb
));
574 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
577 else if (dump_enabled_p ())
579 dump_printf_loc (MSG_NOTE
, vect_location
,
580 "determined dependence between ");
581 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dra
));
582 dump_printf (MSG_NOTE
, " and ");
583 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (drb
));
584 dump_printf (MSG_NOTE
, "\n");
591 /* Analyze dependences involved in the transform of SLP NODE. STORES
592 contain the vector of scalar stores of this instance if we are
593 disambiguating the loads. */
596 vect_slp_analyze_node_dependences (slp_instance instance
, slp_tree node
,
597 vec
<gimple
*> stores
, gimple
*last_store
)
599 /* This walks over all stmts involved in the SLP load/store done
600 in NODE verifying we can sink them up to the last stmt in the
602 gimple
*last_access
= vect_find_last_scalar_stmt_in_slp (node
);
603 for (unsigned k
= 0; k
< SLP_INSTANCE_GROUP_SIZE (instance
); ++k
)
605 gimple
*access
= SLP_TREE_SCALAR_STMTS (node
)[k
];
606 if (access
== last_access
)
608 data_reference
*dr_a
= STMT_VINFO_DATA_REF (vinfo_for_stmt (access
));
609 for (gimple_stmt_iterator gsi
= gsi_for_stmt (access
);
610 gsi_stmt (gsi
) != last_access
; gsi_next (&gsi
))
612 gimple
*stmt
= gsi_stmt (gsi
);
613 if (! gimple_vuse (stmt
)
614 || (DR_IS_READ (dr_a
) && ! gimple_vdef (stmt
)))
617 /* If we couldn't record a (single) data reference for this
618 stmt we have to give up. */
619 /* ??? Here and below if dependence analysis fails we can resort
620 to the alias oracle which can handle more kinds of stmts. */
621 data_reference
*dr_b
= STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt
));
625 bool dependent
= false;
626 /* If we run into a store of this same instance (we've just
627 marked those) then delay dependence checking until we run
628 into the last store because this is where it will have
629 been sunk to (and we verify if we can do that as well). */
630 if (gimple_visited_p (stmt
))
632 if (stmt
!= last_store
)
636 FOR_EACH_VEC_ELT (stores
, i
, store
)
638 data_reference
*store_dr
639 = STMT_VINFO_DATA_REF (vinfo_for_stmt (store
));
640 ddr_p ddr
= initialize_data_dependence_relation
641 (dr_a
, store_dr
, vNULL
);
642 dependent
= vect_slp_analyze_data_ref_dependence (ddr
);
643 free_dependence_relation (ddr
);
650 ddr_p ddr
= initialize_data_dependence_relation (dr_a
,
652 dependent
= vect_slp_analyze_data_ref_dependence (ddr
);
653 free_dependence_relation (ddr
);
663 /* Function vect_analyze_data_ref_dependences.
665 Examine all the data references in the basic-block, and make sure there
666 do not exist any data dependences between them. Set *MAX_VF according to
667 the maximum vectorization factor the data dependences allow. */
670 vect_slp_analyze_instance_dependence (slp_instance instance
)
672 if (dump_enabled_p ())
673 dump_printf_loc (MSG_NOTE
, vect_location
,
674 "=== vect_slp_analyze_instance_dependence ===\n");
676 /* The stores of this instance are at the root of the SLP tree. */
677 slp_tree store
= SLP_INSTANCE_TREE (instance
);
678 if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store
)[0])))
681 /* Verify we can sink stores to the vectorized stmt insert location. */
682 gimple
*last_store
= NULL
;
685 if (! vect_slp_analyze_node_dependences (instance
, store
, vNULL
, NULL
))
688 /* Mark stores in this instance and remember the last one. */
689 last_store
= vect_find_last_scalar_stmt_in_slp (store
);
690 for (unsigned k
= 0; k
< SLP_INSTANCE_GROUP_SIZE (instance
); ++k
)
691 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store
)[k
], true);
696 /* Verify we can sink loads to the vectorized stmt insert location,
697 special-casing stores of this instance. */
700 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance
), i
, load
)
701 if (! vect_slp_analyze_node_dependences (instance
, load
,
703 ? SLP_TREE_SCALAR_STMTS (store
)
704 : vNULL
, last_store
))
710 /* Unset the visited flag. */
712 for (unsigned k
= 0; k
< SLP_INSTANCE_GROUP_SIZE (instance
); ++k
)
713 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store
)[k
], false);
718 /* Record in VINFO the base alignment guarantee given by DRB. STMT is
719 the statement that contains DRB, which is useful for recording in the
723 vect_record_base_alignment (vec_info
*vinfo
, gimple
*stmt
,
724 innermost_loop_behavior
*drb
)
727 innermost_loop_behavior
*&entry
728 = vinfo
->base_alignments
.get_or_insert (drb
->base_address
, &existed
);
729 if (!existed
|| entry
->base_alignment
< drb
->base_alignment
)
732 if (dump_enabled_p ())
734 dump_printf_loc (MSG_NOTE
, vect_location
,
735 "recording new base alignment for ");
736 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, drb
->base_address
);
737 dump_printf (MSG_NOTE
, "\n");
738 dump_printf_loc (MSG_NOTE
, vect_location
,
739 " alignment: %d\n", drb
->base_alignment
);
740 dump_printf_loc (MSG_NOTE
, vect_location
,
741 " misalignment: %d\n", drb
->base_misalignment
);
742 dump_printf_loc (MSG_NOTE
, vect_location
,
744 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, stmt
, 0);
749 /* If the region we're going to vectorize is reached, all unconditional
750 data references occur at least once. We can therefore pool the base
751 alignment guarantees from each unconditional reference. Do this by
752 going through all the data references in VINFO and checking whether
753 the containing statement makes the reference unconditionally. If so,
754 record the alignment of the base address in VINFO so that it can be
755 used for all other references with the same base. */
758 vect_record_base_alignments (vec_info
*vinfo
)
760 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
);
761 struct loop
*loop
= loop_vinfo
? LOOP_VINFO_LOOP (loop_vinfo
) : NULL
;
764 FOR_EACH_VEC_ELT (vinfo
->datarefs
, i
, dr
)
765 if (!DR_IS_CONDITIONAL_IN_STMT (dr
))
767 gimple
*stmt
= DR_STMT (dr
);
768 vect_record_base_alignment (vinfo
, stmt
, &DR_INNERMOST (dr
));
770 /* If DR is nested in the loop that is being vectorized, we can also
771 record the alignment of the base wrt the outer loop. */
772 if (loop
&& nested_in_vect_loop_p (loop
, stmt
))
774 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
775 vect_record_base_alignment
776 (vinfo
, stmt
, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info
));
781 /* Return the target alignment for the vectorized form of DR. */
784 vect_calculate_target_alignment (struct data_reference
*dr
)
786 gimple
*stmt
= DR_STMT (dr
);
787 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
788 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
789 return targetm
.vectorize
.preferred_vector_alignment (vectype
);
792 /* Function vect_compute_data_ref_alignment
794 Compute the misalignment of the data reference DR.
797 1. If during the misalignment computation it is found that the data reference
798 cannot be vectorized then false is returned.
799 2. DR_MISALIGNMENT (DR) is defined.
801 FOR NOW: No analysis is actually performed. Misalignment is calculated
802 only for trivial cases. TODO. */
805 vect_compute_data_ref_alignment (struct data_reference
*dr
)
807 gimple
*stmt
= DR_STMT (dr
);
808 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
809 vec_base_alignments
*base_alignments
= &stmt_info
->vinfo
->base_alignments
;
810 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
811 struct loop
*loop
= NULL
;
812 tree ref
= DR_REF (dr
);
813 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
815 if (dump_enabled_p ())
816 dump_printf_loc (MSG_NOTE
, vect_location
,
817 "vect_compute_data_ref_alignment:\n");
820 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
822 /* Initialize misalignment to unknown. */
823 SET_DR_MISALIGNMENT (dr
, DR_MISALIGNMENT_UNKNOWN
);
825 innermost_loop_behavior
*drb
= vect_dr_behavior (dr
);
826 bool step_preserves_misalignment_p
;
828 unsigned HOST_WIDE_INT vector_alignment
829 = vect_calculate_target_alignment (dr
) / BITS_PER_UNIT
;
830 DR_TARGET_ALIGNMENT (dr
) = vector_alignment
;
832 /* No step for BB vectorization. */
835 gcc_assert (integer_zerop (drb
->step
));
836 step_preserves_misalignment_p
= true;
839 /* In case the dataref is in an inner-loop of the loop that is being
840 vectorized (LOOP), we use the base and misalignment information
841 relative to the outer-loop (LOOP). This is ok only if the misalignment
842 stays the same throughout the execution of the inner-loop, which is why
843 we have to check that the stride of the dataref in the inner-loop evenly
844 divides by the vector alignment. */
845 else if (nested_in_vect_loop_p (loop
, stmt
))
847 step_preserves_misalignment_p
848 = (DR_STEP_ALIGNMENT (dr
) % vector_alignment
) == 0;
850 if (dump_enabled_p ())
852 if (step_preserves_misalignment_p
)
853 dump_printf_loc (MSG_NOTE
, vect_location
,
854 "inner step divides the vector alignment.\n");
856 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
857 "inner step doesn't divide the vector"
862 /* Similarly we can only use base and misalignment information relative to
863 an innermost loop if the misalignment stays the same throughout the
864 execution of the loop. As above, this is the case if the stride of
865 the dataref evenly divides by the alignment. */
868 poly_uint64 vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
869 step_preserves_misalignment_p
870 = multiple_p (DR_STEP_ALIGNMENT (dr
) * vf
, vector_alignment
);
872 if (!step_preserves_misalignment_p
&& dump_enabled_p ())
873 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
874 "step doesn't divide the vector alignment.\n");
877 unsigned int base_alignment
= drb
->base_alignment
;
878 unsigned int base_misalignment
= drb
->base_misalignment
;
880 /* Calculate the maximum of the pooled base address alignment and the
881 alignment that we can compute for DR itself. */
882 innermost_loop_behavior
**entry
= base_alignments
->get (drb
->base_address
);
883 if (entry
&& base_alignment
< (*entry
)->base_alignment
)
885 base_alignment
= (*entry
)->base_alignment
;
886 base_misalignment
= (*entry
)->base_misalignment
;
889 if (drb
->offset_alignment
< vector_alignment
890 || !step_preserves_misalignment_p
891 /* We need to know whether the step wrt the vectorized loop is
892 negative when computing the starting misalignment below. */
893 || TREE_CODE (drb
->step
) != INTEGER_CST
)
895 if (dump_enabled_p ())
897 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
898 "Unknown alignment for access: ");
899 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, ref
);
900 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
905 if (base_alignment
< vector_alignment
)
907 tree base
= drb
->base_address
;
908 if (TREE_CODE (base
) == ADDR_EXPR
)
909 base
= TREE_OPERAND (base
, 0);
910 if (!vect_can_force_dr_alignment_p (base
,
911 vector_alignment
* BITS_PER_UNIT
))
913 if (dump_enabled_p ())
915 dump_printf_loc (MSG_NOTE
, vect_location
,
916 "can't force alignment of ref: ");
917 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, ref
);
918 dump_printf (MSG_NOTE
, "\n");
923 if (DECL_USER_ALIGN (base
))
925 if (dump_enabled_p ())
927 dump_printf_loc (MSG_NOTE
, vect_location
,
928 "not forcing alignment of user-aligned "
930 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, base
);
931 dump_printf (MSG_NOTE
, "\n");
936 /* Force the alignment of the decl.
937 NOTE: This is the only change to the code we make during
938 the analysis phase, before deciding to vectorize the loop. */
939 if (dump_enabled_p ())
941 dump_printf_loc (MSG_NOTE
, vect_location
, "force alignment of ");
942 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, ref
);
943 dump_printf (MSG_NOTE
, "\n");
946 DR_VECT_AUX (dr
)->base_decl
= base
;
947 DR_VECT_AUX (dr
)->base_misaligned
= true;
948 base_misalignment
= 0;
950 poly_int64 misalignment
951 = base_misalignment
+ wi::to_poly_offset (drb
->init
).force_shwi ();
953 /* If this is a backward running DR then first access in the larger
954 vectype actually is N-1 elements before the address in the DR.
955 Adjust misalign accordingly. */
956 if (tree_int_cst_sgn (drb
->step
) < 0)
957 /* PLUS because STEP is negative. */
958 misalignment
+= ((TYPE_VECTOR_SUBPARTS (vectype
) - 1)
959 * TREE_INT_CST_LOW (drb
->step
));
961 unsigned int const_misalignment
;
962 if (!known_misalignment (misalignment
, vector_alignment
,
963 &const_misalignment
))
965 if (dump_enabled_p ())
967 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
968 "Non-constant misalignment for access: ");
969 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, ref
);
970 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
975 SET_DR_MISALIGNMENT (dr
, const_misalignment
);
977 if (dump_enabled_p ())
979 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
980 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr
));
981 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, ref
);
982 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
988 /* Function vect_update_misalignment_for_peel.
989 Sets DR's misalignment
990 - to 0 if it has the same alignment as DR_PEEL,
991 - to the misalignment computed using NPEEL if DR's salignment is known,
992 - to -1 (unknown) otherwise.
994 DR - the data reference whose misalignment is to be adjusted.
995 DR_PEEL - the data reference whose misalignment is being made
996 zero in the vector loop by the peel.
997 NPEEL - the number of iterations in the peel loop if the misalignment
998 of DR_PEEL is known at compile time. */
1001 vect_update_misalignment_for_peel (struct data_reference
*dr
,
1002 struct data_reference
*dr_peel
, int npeel
)
1005 vec
<dr_p
> same_aligned_drs
;
1006 struct data_reference
*current_dr
;
1007 int dr_size
= vect_get_scalar_dr_size (dr
);
1008 int dr_peel_size
= vect_get_scalar_dr_size (dr_peel
);
1009 stmt_vec_info stmt_info
= vinfo_for_stmt (DR_STMT (dr
));
1010 stmt_vec_info peel_stmt_info
= vinfo_for_stmt (DR_STMT (dr_peel
));
1012 /* For interleaved data accesses the step in the loop must be multiplied by
1013 the size of the interleaving group. */
1014 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1015 dr_size
*= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info
)));
1016 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info
))
1017 dr_peel_size
*= GROUP_SIZE (peel_stmt_info
);
1019 /* It can be assumed that the data refs with the same alignment as dr_peel
1020 are aligned in the vector loop. */
1022 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel
)));
1023 FOR_EACH_VEC_ELT (same_aligned_drs
, i
, current_dr
)
1025 if (current_dr
!= dr
)
1027 gcc_assert (!known_alignment_for_access_p (dr
)
1028 || !known_alignment_for_access_p (dr_peel
)
1029 || (DR_MISALIGNMENT (dr
) / dr_size
1030 == DR_MISALIGNMENT (dr_peel
) / dr_peel_size
));
1031 SET_DR_MISALIGNMENT (dr
, 0);
1035 if (known_alignment_for_access_p (dr
)
1036 && known_alignment_for_access_p (dr_peel
))
1038 bool negative
= tree_int_cst_compare (DR_STEP (dr
), size_zero_node
) < 0;
1039 int misal
= DR_MISALIGNMENT (dr
);
1040 misal
+= negative
? -npeel
* dr_size
: npeel
* dr_size
;
1041 misal
&= DR_TARGET_ALIGNMENT (dr
) - 1;
1042 SET_DR_MISALIGNMENT (dr
, misal
);
1046 if (dump_enabled_p ())
1047 dump_printf_loc (MSG_NOTE
, vect_location
, "Setting misalignment " \
1048 "to unknown (-1).\n");
1049 SET_DR_MISALIGNMENT (dr
, DR_MISALIGNMENT_UNKNOWN
);
1053 /* Function verify_data_ref_alignment
1055 Return TRUE if DR can be handled with respect to alignment. */
1058 verify_data_ref_alignment (data_reference_p dr
)
1060 enum dr_alignment_support supportable_dr_alignment
1061 = vect_supportable_dr_alignment (dr
, false);
1062 if (!supportable_dr_alignment
)
1064 if (dump_enabled_p ())
1066 if (DR_IS_READ (dr
))
1067 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1068 "not vectorized: unsupported unaligned load.");
1070 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1071 "not vectorized: unsupported unaligned "
1074 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
1076 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
1081 if (supportable_dr_alignment
!= dr_aligned
&& dump_enabled_p ())
1082 dump_printf_loc (MSG_NOTE
, vect_location
,
1083 "Vectorizing an unaligned access.\n");
1088 /* Function vect_verify_datarefs_alignment
1090 Return TRUE if all data references in the loop can be
1091 handled with respect to alignment. */
1094 vect_verify_datarefs_alignment (loop_vec_info vinfo
)
1096 vec
<data_reference_p
> datarefs
= vinfo
->datarefs
;
1097 struct data_reference
*dr
;
1100 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
1102 gimple
*stmt
= DR_STMT (dr
);
1103 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
1105 if (!STMT_VINFO_RELEVANT_P (stmt_info
))
1108 /* For interleaving, only the alignment of the first access matters. */
1109 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1110 && GROUP_FIRST_ELEMENT (stmt_info
) != stmt
)
1113 /* Strided accesses perform only component accesses, alignment is
1114 irrelevant for them. */
1115 if (STMT_VINFO_STRIDED_P (stmt_info
)
1116 && !STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1119 if (! verify_data_ref_alignment (dr
))
1126 /* Given a memory reference EXP return whether its alignment is less
   than its size.  */
1130 not_size_aligned (tree exp
)
1132 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp
))))
1135 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp
)))
1136 > get_object_alignment (exp
));
1139 /* Function vector_alignment_reachable_p
1141 Return true if vector alignment for DR is reachable by peeling
1142 a few loop iterations. Return false otherwise. */
1145 vector_alignment_reachable_p (struct data_reference
*dr
)
1147 gimple
*stmt
= DR_STMT (dr
);
1148 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
1149 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
1151 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1153 /* For interleaved access we peel only if number of iterations in
1154 the prolog loop ({VF - misalignment}), is a multiple of the
1155 number of the interleaved accesses. */
1156 int elem_size
, mis_in_elements
;
1157 int nelements
= TYPE_VECTOR_SUBPARTS (vectype
);
1159 /* FORNOW: handle only known alignment. */
1160 if (!known_alignment_for_access_p (dr
))
1163 elem_size
= GET_MODE_SIZE (TYPE_MODE (vectype
)) / nelements
;
1164 mis_in_elements
= DR_MISALIGNMENT (dr
) / elem_size
;
1166 if ((nelements
- mis_in_elements
) % GROUP_SIZE (stmt_info
))
1170 /* If misalignment is known at the compile time then allow peeling
1171 only if natural alignment is reachable through peeling. */
1172 if (known_alignment_for_access_p (dr
) && !aligned_access_p (dr
))
1174 HOST_WIDE_INT elmsize
=
1175 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype
)));
1176 if (dump_enabled_p ())
1178 dump_printf_loc (MSG_NOTE
, vect_location
,
1179 "data size =" HOST_WIDE_INT_PRINT_DEC
, elmsize
);
1180 dump_printf (MSG_NOTE
,
1181 ". misalignment = %d.\n", DR_MISALIGNMENT (dr
));
1183 if (DR_MISALIGNMENT (dr
) % elmsize
)
1185 if (dump_enabled_p ())
1186 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1187 "data size does not divide the misalignment.\n");
1192 if (!known_alignment_for_access_p (dr
))
1194 tree type
= TREE_TYPE (DR_REF (dr
));
1195 bool is_packed
= not_size_aligned (DR_REF (dr
));
1196 if (dump_enabled_p ())
1197 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1198 "Unknown misalignment, %snaturally aligned\n",
1199 is_packed
? "not " : "");
1200 return targetm
.vectorize
.vector_alignment_reachable (type
, is_packed
);
1207 /* Calculate the cost of the memory access represented by DR. */
1210 vect_get_data_access_cost (struct data_reference
*dr
,
1211 unsigned int *inside_cost
,
1212 unsigned int *outside_cost
,
1213 stmt_vector_for_cost
*body_cost_vec
)
1215 gimple
*stmt
= DR_STMT (dr
);
1216 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
1217 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
1220 if (PURE_SLP_STMT (stmt_info
))
1223 ncopies
= vect_get_num_copies (loop_vinfo
, STMT_VINFO_VECTYPE (stmt_info
));
1225 if (DR_IS_READ (dr
))
1226 vect_get_load_cost (dr
, ncopies
, true, inside_cost
, outside_cost
,
1227 NULL
, body_cost_vec
, false);
1229 vect_get_store_cost (dr
, ncopies
, inside_cost
, body_cost_vec
);
1231 if (dump_enabled_p ())
1232 dump_printf_loc (MSG_NOTE
, vect_location
,
1233 "vect_get_data_access_cost: inside_cost = %d, "
1234 "outside_cost = %d.\n", *inside_cost
, *outside_cost
);
/* One candidate peeling amount: peel NPEEL iterations so that DR becomes
   aligned; COUNT accumulates how many data refs favor this amount.  */
typedef struct _vect_peel_info
{
  struct data_reference *dr;
  int npeel;
  unsigned int count;
} *vect_peel_info;
1245 typedef struct _vect_peel_extended_info
1247 struct _vect_peel_info peel_info
;
1248 unsigned int inside_cost
;
1249 unsigned int outside_cost
;
1250 } *vect_peel_extended_info
;
1253 /* Peeling hashtable helpers. */
1255 struct peel_info_hasher
: free_ptr_hash
<_vect_peel_info
>
1257 static inline hashval_t
hash (const _vect_peel_info
*);
1258 static inline bool equal (const _vect_peel_info
*, const _vect_peel_info
*);
1262 peel_info_hasher::hash (const _vect_peel_info
*peel_info
)
1264 return (hashval_t
) peel_info
->npeel
;
1268 peel_info_hasher::equal (const _vect_peel_info
*a
, const _vect_peel_info
*b
)
1270 return (a
->npeel
== b
->npeel
);
1274 /* Insert DR into peeling hash table with NPEEL as key. */
1277 vect_peeling_hash_insert (hash_table
<peel_info_hasher
> *peeling_htab
,
1278 loop_vec_info loop_vinfo
, struct data_reference
*dr
,
1281 struct _vect_peel_info elem
, *slot
;
1282 _vect_peel_info
**new_slot
;
1283 bool supportable_dr_alignment
= vect_supportable_dr_alignment (dr
, true);
1286 slot
= peeling_htab
->find (&elem
);
1291 slot
= XNEW (struct _vect_peel_info
);
1292 slot
->npeel
= npeel
;
1295 new_slot
= peeling_htab
->find_slot (slot
, INSERT
);
1299 if (!supportable_dr_alignment
1300 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo
)))
1301 slot
->count
+= VECT_MAX_COST
;
1305 /* Traverse peeling hash table to find peeling option that aligns maximum
1306 number of data accesses. */
1309 vect_peeling_hash_get_most_frequent (_vect_peel_info
**slot
,
1310 _vect_peel_extended_info
*max
)
1312 vect_peel_info elem
= *slot
;
1314 if (elem
->count
> max
->peel_info
.count
1315 || (elem
->count
== max
->peel_info
.count
1316 && max
->peel_info
.npeel
> elem
->npeel
))
1318 max
->peel_info
.npeel
= elem
->npeel
;
1319 max
->peel_info
.count
= elem
->count
;
1320 max
->peel_info
.dr
= elem
->dr
;
1326 /* Get the costs of peeling NPEEL iterations checking data access costs
1327 for all data refs. If UNKNOWN_MISALIGNMENT is true, we assume DR0's
1328 misalignment will be zero after peeling. */
1331 vect_get_peeling_costs_all_drs (vec
<data_reference_p
> datarefs
,
1332 struct data_reference
*dr0
,
1333 unsigned int *inside_cost
,
1334 unsigned int *outside_cost
,
1335 stmt_vector_for_cost
*body_cost_vec
,
1337 bool unknown_misalignment
)
1342 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
1344 gimple
*stmt
= DR_STMT (dr
);
1345 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
1346 if (!STMT_VINFO_RELEVANT_P (stmt_info
))
1349 /* For interleaving, only the alignment of the first access
1351 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1352 && GROUP_FIRST_ELEMENT (stmt_info
) != stmt
)
1355 /* Strided accesses perform only component accesses, alignment is
1356 irrelevant for them. */
1357 if (STMT_VINFO_STRIDED_P (stmt_info
)
1358 && !STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1361 int save_misalignment
;
1362 save_misalignment
= DR_MISALIGNMENT (dr
);
1365 else if (unknown_misalignment
&& dr
== dr0
)
1366 SET_DR_MISALIGNMENT (dr
, 0);
1368 vect_update_misalignment_for_peel (dr
, dr0
, npeel
);
1369 vect_get_data_access_cost (dr
, inside_cost
, outside_cost
,
1371 SET_DR_MISALIGNMENT (dr
, save_misalignment
);
1375 /* Traverse peeling hash table and calculate cost for each peeling option.
1376 Find the one with the lowest cost. */
1379 vect_peeling_hash_get_lowest_cost (_vect_peel_info
**slot
,
1380 _vect_peel_extended_info
*min
)
1382 vect_peel_info elem
= *slot
;
1384 unsigned int inside_cost
= 0, outside_cost
= 0;
1385 gimple
*stmt
= DR_STMT (elem
->dr
);
1386 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
1387 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
1388 stmt_vector_for_cost prologue_cost_vec
, body_cost_vec
,
1391 prologue_cost_vec
.create (2);
1392 body_cost_vec
.create (2);
1393 epilogue_cost_vec
.create (2);
1395 vect_get_peeling_costs_all_drs (LOOP_VINFO_DATAREFS (loop_vinfo
),
1396 elem
->dr
, &inside_cost
, &outside_cost
,
1397 &body_cost_vec
, elem
->npeel
, false);
1399 body_cost_vec
.release ();
1401 outside_cost
+= vect_get_known_peeling_cost
1402 (loop_vinfo
, elem
->npeel
, &dummy
,
1403 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo
),
1404 &prologue_cost_vec
, &epilogue_cost_vec
);
1406 /* Prologue and epilogue costs are added to the target model later.
1407 These costs depend only on the scalar iteration cost, the
1408 number of peeling iterations finally chosen, and the number of
1409 misaligned statements. So discard the information found here. */
1410 prologue_cost_vec
.release ();
1411 epilogue_cost_vec
.release ();
1413 if (inside_cost
< min
->inside_cost
1414 || (inside_cost
== min
->inside_cost
1415 && outside_cost
< min
->outside_cost
))
1417 min
->inside_cost
= inside_cost
;
1418 min
->outside_cost
= outside_cost
;
1419 min
->peel_info
.dr
= elem
->dr
;
1420 min
->peel_info
.npeel
= elem
->npeel
;
1421 min
->peel_info
.count
= elem
->count
;
1428 /* Choose best peeling option by traversing peeling hash table and either
1429 choosing an option with the lowest cost (if cost model is enabled) or the
1430 option that aligns as many accesses as possible. */
1432 static struct _vect_peel_extended_info
1433 vect_peeling_hash_choose_best_peeling (hash_table
<peel_info_hasher
> *peeling_htab
,
1434 loop_vec_info loop_vinfo
)
1436 struct _vect_peel_extended_info res
;
1438 res
.peel_info
.dr
= NULL
;
1440 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo
)))
1442 res
.inside_cost
= INT_MAX
;
1443 res
.outside_cost
= INT_MAX
;
1444 peeling_htab
->traverse
<_vect_peel_extended_info
*,
1445 vect_peeling_hash_get_lowest_cost
> (&res
);
1449 res
.peel_info
.count
= 0;
1450 peeling_htab
->traverse
<_vect_peel_extended_info
*,
1451 vect_peeling_hash_get_most_frequent
> (&res
);
1452 res
.inside_cost
= 0;
1453 res
.outside_cost
= 0;
1459 /* Return true if the new peeling NPEEL is supported. */
1462 vect_peeling_supportable (loop_vec_info loop_vinfo
, struct data_reference
*dr0
,
1466 struct data_reference
*dr
= NULL
;
1467 vec
<data_reference_p
> datarefs
= LOOP_VINFO_DATAREFS (loop_vinfo
);
1469 stmt_vec_info stmt_info
;
1470 enum dr_alignment_support supportable_dr_alignment
;
1472 /* Ensure that all data refs can be vectorized after the peel. */
1473 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
1475 int save_misalignment
;
1480 stmt
= DR_STMT (dr
);
1481 stmt_info
= vinfo_for_stmt (stmt
);
1482 /* For interleaving, only the alignment of the first access
1484 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1485 && GROUP_FIRST_ELEMENT (stmt_info
) != stmt
)
1488 /* Strided accesses perform only component accesses, alignment is
1489 irrelevant for them. */
1490 if (STMT_VINFO_STRIDED_P (stmt_info
)
1491 && !STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1494 save_misalignment
= DR_MISALIGNMENT (dr
);
1495 vect_update_misalignment_for_peel (dr
, dr0
, npeel
);
1496 supportable_dr_alignment
= vect_supportable_dr_alignment (dr
, false);
1497 SET_DR_MISALIGNMENT (dr
, save_misalignment
);
1499 if (!supportable_dr_alignment
)
1506 /* Function vect_enhance_data_refs_alignment
1508 This pass will use loop versioning and loop peeling in order to enhance
1509 the alignment of data references in the loop.
1511 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1512 original loop is to be vectorized. Any other loops that are created by
1513 the transformations performed in this pass - are not supposed to be
1514 vectorized. This restriction will be relaxed.
1516 This pass will require a cost model to guide it whether to apply peeling
1517 or versioning or a combination of the two. For example, the scheme that
1518 intel uses when given a loop with several memory accesses, is as follows:
1519 choose one memory access ('p') which alignment you want to force by doing
1520 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1521 other accesses are not necessarily aligned, or (2) use loop versioning to
1522 generate one loop in which all accesses are aligned, and another loop in
1523 which only 'p' is necessarily aligned.
1525 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1526 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1527 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1529 Devising a cost model is the most critical aspect of this work. It will
1530 guide us on which access to peel for, whether to use loop versioning, how
1531 many versions to create, etc. The cost model will probably consist of
1532 generic considerations as well as target specific considerations (on
1533 powerpc for example, misaligned stores are more painful than misaligned
1536 Here are the general steps involved in alignment enhancements:
1538 -- original loop, before alignment analysis:
1539 for (i=0; i<N; i++){
1540 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1541 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1544 -- After vect_compute_data_refs_alignment:
1545 for (i=0; i<N; i++){
1546 x = q[i]; # DR_MISALIGNMENT(q) = 3
1547 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1550 -- Possibility 1: we do loop versioning:
1552 for (i=0; i<N; i++){ # loop 1A
1553 x = q[i]; # DR_MISALIGNMENT(q) = 3
1554 p[i] = y; # DR_MISALIGNMENT(p) = 0
1558 for (i=0; i<N; i++){ # loop 1B
1559 x = q[i]; # DR_MISALIGNMENT(q) = 3
1560 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1564 -- Possibility 2: we do loop peeling:
1565 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1569 for (i = 3; i < N; i++){ # loop 2A
1570 x = q[i]; # DR_MISALIGNMENT(q) = 0
1571 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1574 -- Possibility 3: combination of loop peeling and versioning:
1575 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1580 for (i = 3; i<N; i++){ # loop 3A
1581 x = q[i]; # DR_MISALIGNMENT(q) = 0
1582 p[i] = y; # DR_MISALIGNMENT(p) = 0
1586 for (i = 3; i<N; i++){ # loop 3B
1587 x = q[i]; # DR_MISALIGNMENT(q) = 0
1588 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1592 These loops are later passed to loop_transform to be vectorized. The
1593 vectorizer will use the alignment information to guide the transformation
1594 (whether to generate regular loads/stores, or with special handling for
1598 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo
)
1600 vec
<data_reference_p
> datarefs
= LOOP_VINFO_DATAREFS (loop_vinfo
);
1601 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1602 enum dr_alignment_support supportable_dr_alignment
;
1603 struct data_reference
*dr0
= NULL
, *first_store
= NULL
;
1604 struct data_reference
*dr
;
1606 bool do_peeling
= false;
1607 bool do_versioning
= false;
1610 stmt_vec_info stmt_info
;
1611 unsigned int npeel
= 0;
1612 bool one_misalignment_known
= false;
1613 bool one_misalignment_unknown
= false;
1614 bool one_dr_unsupportable
= false;
1615 struct data_reference
*unsupportable_dr
= NULL
;
1616 poly_uint64 vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
1617 unsigned possible_npeel_number
= 1;
1619 unsigned int mis
, same_align_drs_max
= 0;
1620 hash_table
<peel_info_hasher
> peeling_htab (1);
1622 if (dump_enabled_p ())
1623 dump_printf_loc (MSG_NOTE
, vect_location
,
1624 "=== vect_enhance_data_refs_alignment ===\n");
1626 /* Reset data so we can safely be called multiple times. */
1627 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).truncate (0);
1628 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
) = 0;
1630 /* While cost model enhancements are expected in the future, the high level
1631 view of the code at this time is as follows:
1633 A) If there is a misaligned access then see if peeling to align
1634 this access can make all data references satisfy
1635 vect_supportable_dr_alignment. If so, update data structures
1636 as needed and return true.
1638 B) If peeling wasn't possible and there is a data reference with an
1639 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1640 then see if loop versioning checks can be used to make all data
1641 references satisfy vect_supportable_dr_alignment. If so, update
1642 data structures as needed and return true.
1644 C) If neither peeling nor versioning were successful then return false if
1645 any data reference does not satisfy vect_supportable_dr_alignment.
1647 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1649 Note, Possibility 3 above (which is peeling and versioning together) is not
1650 being done at this time. */
1652 /* (1) Peeling to force alignment. */
1654 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1656 + How many accesses will become aligned due to the peeling
1657 - How many accesses will become unaligned due to the peeling,
1658 and the cost of misaligned accesses.
1659 - The cost of peeling (the extra runtime checks, the increase
1662 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
1664 stmt
= DR_STMT (dr
);
1665 stmt_info
= vinfo_for_stmt (stmt
);
1667 if (!STMT_VINFO_RELEVANT_P (stmt_info
))
1670 /* For interleaving, only the alignment of the first access
1672 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1673 && GROUP_FIRST_ELEMENT (stmt_info
) != stmt
)
1676 /* For invariant accesses there is nothing to enhance. */
1677 if (integer_zerop (DR_STEP (dr
)))
1680 /* Strided accesses perform only component accesses, alignment is
1681 irrelevant for them. */
1682 if (STMT_VINFO_STRIDED_P (stmt_info
)
1683 && !STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1686 supportable_dr_alignment
= vect_supportable_dr_alignment (dr
, true);
1687 do_peeling
= vector_alignment_reachable_p (dr
);
1690 if (known_alignment_for_access_p (dr
))
1692 unsigned int npeel_tmp
= 0;
1693 bool negative
= tree_int_cst_compare (DR_STEP (dr
),
1694 size_zero_node
) < 0;
1696 vectype
= STMT_VINFO_VECTYPE (stmt_info
);
1697 unsigned int target_align
= DR_TARGET_ALIGNMENT (dr
);
1698 unsigned int dr_size
= vect_get_scalar_dr_size (dr
);
1699 mis
= (negative
? DR_MISALIGNMENT (dr
) : -DR_MISALIGNMENT (dr
));
1700 if (DR_MISALIGNMENT (dr
) != 0)
1701 npeel_tmp
= (mis
& (target_align
- 1)) / dr_size
;
1703 /* For multiple types, it is possible that the bigger type access
1704 will have more than one peeling option. E.g., a loop with two
1705 types: one of size (vector size / 4), and the other one of
1706 size (vector size / 8). Vectorization factor will 8. If both
1707 accesses are misaligned by 3, the first one needs one scalar
1708 iteration to be aligned, and the second one needs 5. But the
1709 first one will be aligned also by peeling 5 scalar
1710 iterations, and in that case both accesses will be aligned.
1711 Hence, except for the immediate peeling amount, we also want
1712 to try to add full vector size, while we don't exceed
1713 vectorization factor.
1714 We do this automatically for cost model, since we calculate
1715 cost for every peeling option. */
1716 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo
)))
1718 poly_uint64 nscalars
= (STMT_SLP_TYPE (stmt_info
)
1719 ? vf
* GROUP_SIZE (stmt_info
) : vf
);
1720 possible_npeel_number
1721 = vect_get_num_vectors (nscalars
, vectype
);
1723 /* NPEEL_TMP is 0 when there is no misalignment, but also
1724 allow peeling NELEMENTS. */
1725 if (DR_MISALIGNMENT (dr
) == 0)
1726 possible_npeel_number
++;
1729 /* Save info about DR in the hash table. Also include peeling
1730 amounts according to the explanation above. */
1731 for (j
= 0; j
< possible_npeel_number
; j
++)
1733 vect_peeling_hash_insert (&peeling_htab
, loop_vinfo
,
1735 npeel_tmp
+= target_align
/ dr_size
;
1738 one_misalignment_known
= true;
1742 /* If we don't know any misalignment values, we prefer
1743 peeling for data-ref that has the maximum number of data-refs
1744 with the same alignment, unless the target prefers to align
1745 stores over load. */
1746 unsigned same_align_drs
1747 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info
).length ();
1749 || same_align_drs_max
< same_align_drs
)
1751 same_align_drs_max
= same_align_drs
;
1754 /* For data-refs with the same number of related
1755 accesses prefer the one where the misalign
1756 computation will be invariant in the outermost loop. */
1757 else if (same_align_drs_max
== same_align_drs
)
1759 struct loop
*ivloop0
, *ivloop
;
1760 ivloop0
= outermost_invariant_loop_for_expr
1761 (loop
, DR_BASE_ADDRESS (dr0
));
1762 ivloop
= outermost_invariant_loop_for_expr
1763 (loop
, DR_BASE_ADDRESS (dr
));
1764 if ((ivloop
&& !ivloop0
)
1765 || (ivloop
&& ivloop0
1766 && flow_loop_nested_p (ivloop
, ivloop0
)))
1770 one_misalignment_unknown
= true;
1772 /* Check for data refs with unsupportable alignment that
1774 if (!supportable_dr_alignment
)
1776 one_dr_unsupportable
= true;
1777 unsupportable_dr
= dr
;
1780 if (!first_store
&& DR_IS_WRITE (dr
))
1786 if (!aligned_access_p (dr
))
1788 if (dump_enabled_p ())
1789 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1790 "vector alignment may not be reachable\n");
1796 /* Check if we can possibly peel the loop. */
1797 if (!vect_can_advance_ivs_p (loop_vinfo
)
1798 || !slpeel_can_duplicate_loop_p (loop
, single_exit (loop
))
1802 struct _vect_peel_extended_info peel_for_known_alignment
;
1803 struct _vect_peel_extended_info peel_for_unknown_alignment
;
1804 struct _vect_peel_extended_info best_peel
;
1806 peel_for_unknown_alignment
.inside_cost
= INT_MAX
;
1807 peel_for_unknown_alignment
.outside_cost
= INT_MAX
;
1808 peel_for_unknown_alignment
.peel_info
.count
= 0;
1811 && one_misalignment_unknown
)
1813 /* Check if the target requires to prefer stores over loads, i.e., if
1814 misaligned stores are more expensive than misaligned loads (taking
1815 drs with same alignment into account). */
1816 unsigned int load_inside_cost
= 0;
1817 unsigned int load_outside_cost
= 0;
1818 unsigned int store_inside_cost
= 0;
1819 unsigned int store_outside_cost
= 0;
1820 unsigned int estimated_npeels
= vect_vf_for_cost (loop_vinfo
) / 2;
1822 stmt_vector_for_cost dummy
;
1824 vect_get_peeling_costs_all_drs (datarefs
, dr0
,
1827 &dummy
, estimated_npeels
, true);
1833 vect_get_peeling_costs_all_drs (datarefs
, first_store
,
1835 &store_outside_cost
,
1836 &dummy
, estimated_npeels
, true);
1841 store_inside_cost
= INT_MAX
;
1842 store_outside_cost
= INT_MAX
;
1845 if (load_inside_cost
> store_inside_cost
1846 || (load_inside_cost
== store_inside_cost
1847 && load_outside_cost
> store_outside_cost
))
1850 peel_for_unknown_alignment
.inside_cost
= store_inside_cost
;
1851 peel_for_unknown_alignment
.outside_cost
= store_outside_cost
;
1855 peel_for_unknown_alignment
.inside_cost
= load_inside_cost
;
1856 peel_for_unknown_alignment
.outside_cost
= load_outside_cost
;
1859 stmt_vector_for_cost prologue_cost_vec
, epilogue_cost_vec
;
1860 prologue_cost_vec
.create (2);
1861 epilogue_cost_vec
.create (2);
1864 peel_for_unknown_alignment
.outside_cost
+= vect_get_known_peeling_cost
1865 (loop_vinfo
, estimated_npeels
, &dummy2
,
1866 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo
),
1867 &prologue_cost_vec
, &epilogue_cost_vec
);
1869 prologue_cost_vec
.release ();
1870 epilogue_cost_vec
.release ();
1872 peel_for_unknown_alignment
.peel_info
.count
= 1
1873 + STMT_VINFO_SAME_ALIGN_REFS
1874 (vinfo_for_stmt (DR_STMT (dr0
))).length ();
1877 peel_for_unknown_alignment
.peel_info
.npeel
= 0;
1878 peel_for_unknown_alignment
.peel_info
.dr
= dr0
;
1880 best_peel
= peel_for_unknown_alignment
;
1882 peel_for_known_alignment
.inside_cost
= INT_MAX
;
1883 peel_for_known_alignment
.outside_cost
= INT_MAX
;
1884 peel_for_known_alignment
.peel_info
.count
= 0;
1885 peel_for_known_alignment
.peel_info
.dr
= NULL
;
1887 if (do_peeling
&& one_misalignment_known
)
1889 /* Peeling is possible, but there is no data access that is not supported
1890 unless aligned. So we try to choose the best possible peeling from
1892 peel_for_known_alignment
= vect_peeling_hash_choose_best_peeling
1893 (&peeling_htab
, loop_vinfo
);
1896 /* Compare costs of peeling for known and unknown alignment. */
1897 if (peel_for_known_alignment
.peel_info
.dr
!= NULL
1898 && peel_for_unknown_alignment
.inside_cost
1899 >= peel_for_known_alignment
.inside_cost
)
1901 best_peel
= peel_for_known_alignment
;
1903 /* If the best peeling for known alignment has NPEEL == 0, perform no
1904 peeling at all except if there is an unsupportable dr that we can
1906 if (best_peel
.peel_info
.npeel
== 0 && !one_dr_unsupportable
)
1910 /* If there is an unsupportable data ref, prefer this over all choices so far
1911 since we'd have to discard a chosen peeling except when it accidentally
1912 aligned the unsupportable data ref. */
1913 if (one_dr_unsupportable
)
1914 dr0
= unsupportable_dr
;
1915 else if (do_peeling
)
1917 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
1918 TODO: Use nopeel_outside_cost or get rid of it? */
1919 unsigned nopeel_inside_cost
= 0;
1920 unsigned nopeel_outside_cost
= 0;
1922 stmt_vector_for_cost dummy
;
1924 vect_get_peeling_costs_all_drs (datarefs
, NULL
, &nopeel_inside_cost
,
1925 &nopeel_outside_cost
, &dummy
, 0, false);
1928 /* Add epilogue costs. As we do not peel for alignment here, no prologue
1929 costs will be recorded. */
1930 stmt_vector_for_cost prologue_cost_vec
, epilogue_cost_vec
;
1931 prologue_cost_vec
.create (2);
1932 epilogue_cost_vec
.create (2);
1935 nopeel_outside_cost
+= vect_get_known_peeling_cost
1936 (loop_vinfo
, 0, &dummy2
,
1937 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo
),
1938 &prologue_cost_vec
, &epilogue_cost_vec
);
1940 prologue_cost_vec
.release ();
1941 epilogue_cost_vec
.release ();
1943 npeel
= best_peel
.peel_info
.npeel
;
1944 dr0
= best_peel
.peel_info
.dr
;
1946 /* If no peeling is not more expensive than the best peeling we
1947 have so far, don't perform any peeling. */
1948 if (nopeel_inside_cost
<= best_peel
.inside_cost
)
1954 stmt
= DR_STMT (dr0
);
1955 stmt_info
= vinfo_for_stmt (stmt
);
1956 vectype
= STMT_VINFO_VECTYPE (stmt_info
);
1958 if (known_alignment_for_access_p (dr0
))
1960 bool negative
= tree_int_cst_compare (DR_STEP (dr0
),
1961 size_zero_node
) < 0;
1964 /* Since it's known at compile time, compute the number of
1965 iterations in the peeled loop (the peeling factor) for use in
1966 updating DR_MISALIGNMENT values. The peeling factor is the
1967 vectorization factor minus the misalignment as an element
1969 mis
= negative
? DR_MISALIGNMENT (dr0
) : -DR_MISALIGNMENT (dr0
);
1970 unsigned int target_align
= DR_TARGET_ALIGNMENT (dr0
);
1971 npeel
= ((mis
& (target_align
- 1))
1972 / vect_get_scalar_dr_size (dr0
));
1975 /* For interleaved data access every iteration accesses all the
1976 members of the group, therefore we divide the number of iterations
1977 by the group size. */
1978 stmt_info
= vinfo_for_stmt (DR_STMT (dr0
));
1979 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1980 npeel
/= GROUP_SIZE (stmt_info
);
1982 if (dump_enabled_p ())
1983 dump_printf_loc (MSG_NOTE
, vect_location
,
1984 "Try peeling by %d\n", npeel
);
1987 /* Ensure that all datarefs can be vectorized after the peel. */
1988 if (!vect_peeling_supportable (loop_vinfo
, dr0
, npeel
))
1991 /* Check if all datarefs are supportable and log. */
1992 if (do_peeling
&& known_alignment_for_access_p (dr0
) && npeel
== 0)
1994 stat
= vect_verify_datarefs_alignment (loop_vinfo
);
2001 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
2004 unsigned max_allowed_peel
2005 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT
);
2006 if (max_allowed_peel
!= (unsigned)-1)
2008 unsigned max_peel
= npeel
;
2011 unsigned int target_align
= DR_TARGET_ALIGNMENT (dr0
);
2012 max_peel
= target_align
/ vect_get_scalar_dr_size (dr0
) - 1;
2014 if (max_peel
> max_allowed_peel
)
2017 if (dump_enabled_p ())
2018 dump_printf_loc (MSG_NOTE
, vect_location
,
2019 "Disable peeling, max peels reached: %d\n", max_peel
);
2024 /* Cost model #2 - if peeling may result in a remaining loop not
2025 iterating enough to be vectorized then do not peel. Since this
2026 is a cost heuristic rather than a correctness decision, use the
2027 most likely runtime value for variable vectorization factors. */
2029 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
))
2031 unsigned int assumed_vf
= vect_vf_for_cost (loop_vinfo
);
2032 unsigned int max_peel
= npeel
== 0 ? assumed_vf
- 1 : npeel
;
2033 if ((unsigned HOST_WIDE_INT
) LOOP_VINFO_INT_NITERS (loop_vinfo
)
2034 < assumed_vf
+ max_peel
)
2040 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2041 If the misalignment of DR_i is identical to that of dr0 then set
2042 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2043 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2044 by the peeling factor times the element size of DR_i (MOD the
2045 vectorization factor times the size). Otherwise, the
2046 misalignment of DR_i must be set to unknown. */
2047 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
2050 /* Strided accesses perform only component accesses, alignment
2051 is irrelevant for them. */
2052 stmt_info
= vinfo_for_stmt (DR_STMT (dr
));
2053 if (STMT_VINFO_STRIDED_P (stmt_info
)
2054 && !STMT_VINFO_GROUPED_ACCESS (stmt_info
))
2057 vect_update_misalignment_for_peel (dr
, dr0
, npeel
);
2060 LOOP_VINFO_UNALIGNED_DR (loop_vinfo
) = dr0
;
2062 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
) = npeel
;
2064 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
)
2065 = DR_MISALIGNMENT (dr0
);
2066 SET_DR_MISALIGNMENT (dr0
, 0);
2067 if (dump_enabled_p ())
2069 dump_printf_loc (MSG_NOTE
, vect_location
,
2070 "Alignment of access forced using peeling.\n");
2071 dump_printf_loc (MSG_NOTE
, vect_location
,
2072 "Peeling for alignment will be applied.\n");
2075 /* The inside-loop cost will be accounted for in vectorizable_load
2076 and vectorizable_store correctly with adjusted alignments.
2077 Drop the body_cst_vec on the floor here. */
2078 stat
= vect_verify_datarefs_alignment (loop_vinfo
);
2084 /* (2) Versioning to force alignment. */
2086 /* Try versioning if:
2087 1) optimize loop for speed
2088 2) there is at least one unsupported misaligned data ref with an unknown
2090 3) all misaligned data refs with a known misalignment are supported, and
2091 4) the number of runtime alignment checks is within reason. */
2094 optimize_loop_nest_for_speed_p (loop
)
2095 && (!loop
->inner
); /* FORNOW */
2099 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
2101 stmt
= DR_STMT (dr
);
2102 stmt_info
= vinfo_for_stmt (stmt
);
2104 /* For interleaving, only the alignment of the first access
2106 if (aligned_access_p (dr
)
2107 || (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
2108 && GROUP_FIRST_ELEMENT (stmt_info
) != stmt
))
2111 if (STMT_VINFO_STRIDED_P (stmt_info
))
2113 /* Strided loads perform only component accesses, alignment is
2114 irrelevant for them. */
2115 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info
))
2117 do_versioning
= false;
2121 supportable_dr_alignment
= vect_supportable_dr_alignment (dr
, false);
2123 if (!supportable_dr_alignment
)
2129 if (known_alignment_for_access_p (dr
)
2130 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).length ()
2131 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS
))
2133 do_versioning
= false;
2137 stmt
= DR_STMT (dr
);
2138 vectype
= STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt
));
2139 gcc_assert (vectype
);
2141 /* The rightmost bits of an aligned address must be zeros.
2142 Construct the mask needed for this test. For example,
2143 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2144 mask must be 15 = 0xf. */
2145 mask
= GET_MODE_SIZE (TYPE_MODE (vectype
)) - 1;
2147 /* FORNOW: use the same mask to test all potentially unaligned
2148 references in the loop. The vectorizer currently supports
2149 a single vector size, see the reference to
2150 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2151 vectorization factor is computed. */
2152 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo
)
2153 || LOOP_VINFO_PTR_MASK (loop_vinfo
) == mask
);
2154 LOOP_VINFO_PTR_MASK (loop_vinfo
) = mask
;
2155 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).safe_push (
2160 /* Versioning requires at least one misaligned data reference. */
2161 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo
))
2162 do_versioning
= false;
2163 else if (!do_versioning
)
2164 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).truncate (0);
2169 vec
<gimple
*> may_misalign_stmts
2170 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
);
2173 /* It can now be assumed that the data references in the statements
2174 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2175 of the loop being vectorized. */
2176 FOR_EACH_VEC_ELT (may_misalign_stmts
, i
, stmt
)
2178 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
2179 dr
= STMT_VINFO_DATA_REF (stmt_info
);
2180 SET_DR_MISALIGNMENT (dr
, 0);
2181 if (dump_enabled_p ())
2182 dump_printf_loc (MSG_NOTE
, vect_location
,
2183 "Alignment of access forced using versioning.\n");
2186 if (dump_enabled_p ())
2187 dump_printf_loc (MSG_NOTE
, vect_location
,
2188 "Versioning for alignment will be applied.\n");
2190 /* Peeling and versioning can't be done together at this time. */
2191 gcc_assert (! (do_peeling
&& do_versioning
));
2193 stat
= vect_verify_datarefs_alignment (loop_vinfo
);
2198 /* This point is reached if neither peeling nor versioning is being done. */
2199 gcc_assert (! (do_peeling
|| do_versioning
));
2201 stat
= vect_verify_datarefs_alignment (loop_vinfo
);
2206 /* Function vect_find_same_alignment_drs.
2208 Update group and alignment relations according to the chosen
2209 vectorization factor. */
2212 vect_find_same_alignment_drs (struct data_dependence_relation
*ddr
)
2214 struct data_reference
*dra
= DDR_A (ddr
);
2215 struct data_reference
*drb
= DDR_B (ddr
);
2216 stmt_vec_info stmtinfo_a
= vinfo_for_stmt (DR_STMT (dra
));
2217 stmt_vec_info stmtinfo_b
= vinfo_for_stmt (DR_STMT (drb
));
2219 if (DDR_ARE_DEPENDENT (ddr
) == chrec_known
)
2225 if (!operand_equal_p (DR_BASE_ADDRESS (dra
), DR_BASE_ADDRESS (drb
), 0)
2226 || !operand_equal_p (DR_OFFSET (dra
), DR_OFFSET (drb
), 0)
2227 || !operand_equal_p (DR_STEP (dra
), DR_STEP (drb
), 0))
2230 /* Two references with distance zero have the same alignment. */
2231 offset_int diff
= (wi::to_offset (DR_INIT (dra
))
2232 - wi::to_offset (DR_INIT (drb
)));
2235 /* Get the wider of the two alignments. */
2236 unsigned int align_a
= (vect_calculate_target_alignment (dra
)
2238 unsigned int align_b
= (vect_calculate_target_alignment (drb
)
2240 unsigned int max_align
= MAX (align_a
, align_b
);
2242 /* Require the gap to be a multiple of the larger vector alignment. */
2243 if (!wi::multiple_of_p (diff
, max_align
, SIGNED
))
2247 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a
).safe_push (drb
);
2248 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b
).safe_push (dra
);
2249 if (dump_enabled_p ())
2251 dump_printf_loc (MSG_NOTE
, vect_location
,
2252 "accesses have the same alignment: ");
2253 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dra
));
2254 dump_printf (MSG_NOTE
, " and ");
2255 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (drb
));
2256 dump_printf (MSG_NOTE
, "\n");
2261 /* Function vect_analyze_data_refs_alignment
2263 Analyze the alignment of the data-references in the loop.
2264 Return FALSE if a data reference is found that cannot be vectorized. */
2267 vect_analyze_data_refs_alignment (loop_vec_info vinfo
)
2269 if (dump_enabled_p ())
2270 dump_printf_loc (MSG_NOTE
, vect_location
,
2271 "=== vect_analyze_data_refs_alignment ===\n");
2273 /* Mark groups of data references with same alignment using
2274 data dependence information. */
2275 vec
<ddr_p
> ddrs
= vinfo
->ddrs
;
2276 struct data_dependence_relation
*ddr
;
2279 FOR_EACH_VEC_ELT (ddrs
, i
, ddr
)
2280 vect_find_same_alignment_drs (ddr
);
2282 vec
<data_reference_p
> datarefs
= vinfo
->datarefs
;
2283 struct data_reference
*dr
;
2285 vect_record_base_alignments (vinfo
);
2286 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
2288 stmt_vec_info stmt_info
= vinfo_for_stmt (DR_STMT (dr
));
2289 if (STMT_VINFO_VECTORIZABLE (stmt_info
)
2290 && !vect_compute_data_ref_alignment (dr
))
2292 /* Strided accesses perform only component accesses, misalignment
2293 information is irrelevant for them. */
2294 if (STMT_VINFO_STRIDED_P (stmt_info
)
2295 && !STMT_VINFO_GROUPED_ACCESS (stmt_info
))
2298 if (dump_enabled_p ())
2299 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2300 "not vectorized: can't calculate alignment "
2311 /* Analyze alignment of DRs of stmts in NODE. */
2314 vect_slp_analyze_and_verify_node_alignment (slp_tree node
)
2316 /* We vectorize from the first scalar stmt in the node unless
2317 the node is permuted in which case we start from the first
2318 element in the group. */
2319 gimple
*first_stmt
= SLP_TREE_SCALAR_STMTS (node
)[0];
2320 data_reference_p first_dr
= STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt
));
2321 if (SLP_TREE_LOAD_PERMUTATION (node
).exists ())
2322 first_stmt
= GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt
));
2324 data_reference_p dr
= STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt
));
2325 if (! vect_compute_data_ref_alignment (dr
)
2326 /* For creating the data-ref pointer we need alignment of the
2327 first element anyway. */
2329 && ! vect_compute_data_ref_alignment (first_dr
))
2330 || ! verify_data_ref_alignment (dr
))
2332 if (dump_enabled_p ())
2333 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2334 "not vectorized: bad data alignment in basic "
2342 /* Function vect_slp_analyze_instance_alignment
2344 Analyze the alignment of the data-references in the SLP instance.
2345 Return FALSE if a data reference is found that cannot be vectorized. */
2348 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance
)
2350 if (dump_enabled_p ())
2351 dump_printf_loc (MSG_NOTE
, vect_location
,
2352 "=== vect_slp_analyze_and_verify_instance_alignment ===\n");
2356 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance
), i
, node
)
2357 if (! vect_slp_analyze_and_verify_node_alignment (node
))
2360 node
= SLP_INSTANCE_TREE (instance
);
2361 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node
)[0]))
2362 && ! vect_slp_analyze_and_verify_node_alignment
2363 (SLP_INSTANCE_TREE (instance
)))
2370 /* Analyze groups of accesses: check that DR belongs to a group of
2371 accesses of legal size, step, etc. Detect gaps, single element
2372 interleaving, and other special cases. Set grouped access info.
2373 Collect groups of strided stores for further use in SLP analysis.
2374 Worker for vect_analyze_group_access. */
2377 vect_analyze_group_access_1 (struct data_reference
*dr
)
2379 tree step
= DR_STEP (dr
);
2380 tree scalar_type
= TREE_TYPE (DR_REF (dr
));
2381 HOST_WIDE_INT type_size
= TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type
));
2382 gimple
*stmt
= DR_STMT (dr
);
2383 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
2384 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
2385 bb_vec_info bb_vinfo
= STMT_VINFO_BB_VINFO (stmt_info
);
2386 HOST_WIDE_INT dr_step
= -1;
2387 HOST_WIDE_INT groupsize
, last_accessed_element
= 1;
2388 bool slp_impossible
= false;
2390 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2391 size of the interleaving group (including gaps). */
2392 if (tree_fits_shwi_p (step
))
2394 dr_step
= tree_to_shwi (step
);
2395 /* Check that STEP is a multiple of type size. Otherwise there is
2396 a non-element-sized gap at the end of the group which we
2397 cannot represent in GROUP_GAP or GROUP_SIZE.
2398 ??? As we can handle non-constant step fine here we should
2399 simply remove uses of GROUP_GAP between the last and first
2400 element and instead rely on DR_STEP. GROUP_SIZE then would
2401 simply not include that gap. */
2402 if ((dr_step
% type_size
) != 0)
2404 if (dump_enabled_p ())
2406 dump_printf_loc (MSG_NOTE
, vect_location
,
2408 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, step
);
2409 dump_printf (MSG_NOTE
,
2410 " is not a multiple of the element size for ");
2411 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dr
));
2412 dump_printf (MSG_NOTE
, "\n");
2416 groupsize
= absu_hwi (dr_step
) / type_size
;
2421 /* Not consecutive access is possible only if it is a part of interleaving. */
2422 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)))
2424 /* Check if it this DR is a part of interleaving, and is a single
2425 element of the group that is accessed in the loop. */
2427 /* Gaps are supported only for loads. STEP must be a multiple of the type
2428 size. The size of the group must be a power of 2. */
2430 && (dr_step
% type_size
) == 0
2432 && pow2p_hwi (groupsize
))
2434 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) = stmt
;
2435 GROUP_SIZE (vinfo_for_stmt (stmt
)) = groupsize
;
2436 GROUP_GAP (stmt_info
) = groupsize
- 1;
2437 if (dump_enabled_p ())
2439 dump_printf_loc (MSG_NOTE
, vect_location
,
2440 "Detected single element interleaving ");
2441 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dr
));
2442 dump_printf (MSG_NOTE
, " step ");
2443 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, step
);
2444 dump_printf (MSG_NOTE
, "\n");
2450 if (dump_enabled_p ())
2452 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2453 "not consecutive access ");
2454 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
2459 /* Mark the statement as unvectorizable. */
2460 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr
))) = false;
2464 dump_printf_loc (MSG_NOTE
, vect_location
, "using strided accesses\n");
2465 STMT_VINFO_STRIDED_P (stmt_info
) = true;
2469 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) == stmt
)
2471 /* First stmt in the interleaving chain. Check the chain. */
2472 gimple
*next
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt
));
2473 struct data_reference
*data_ref
= dr
;
2474 unsigned int count
= 1;
2475 tree prev_init
= DR_INIT (data_ref
);
2476 gimple
*prev
= stmt
;
2477 HOST_WIDE_INT diff
, gaps
= 0;
2481 /* Skip same data-refs. In case that two or more stmts share
2482 data-ref (supported only for loads), we vectorize only the first
2483 stmt, and the rest get their vectorized loads from the first
2485 if (!tree_int_cst_compare (DR_INIT (data_ref
),
2486 DR_INIT (STMT_VINFO_DATA_REF (
2487 vinfo_for_stmt (next
)))))
2489 if (DR_IS_WRITE (data_ref
))
2491 if (dump_enabled_p ())
2492 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2493 "Two store stmts share the same dr.\n");
2497 if (dump_enabled_p ())
2498 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2499 "Two or more load stmts share the same dr.\n");
2501 /* For load use the same data-ref load. */
2502 GROUP_SAME_DR_STMT (vinfo_for_stmt (next
)) = prev
;
2505 next
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (next
));
2510 data_ref
= STMT_VINFO_DATA_REF (vinfo_for_stmt (next
));
2512 /* All group members have the same STEP by construction. */
2513 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref
), step
, 0));
2515 /* Check that the distance between two accesses is equal to the type
2516 size. Otherwise, we have gaps. */
2517 diff
= (TREE_INT_CST_LOW (DR_INIT (data_ref
))
2518 - TREE_INT_CST_LOW (prev_init
)) / type_size
;
2521 /* FORNOW: SLP of accesses with gaps is not supported. */
2522 slp_impossible
= true;
2523 if (DR_IS_WRITE (data_ref
))
2525 if (dump_enabled_p ())
2526 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2527 "interleaved store with gaps\n");
2534 last_accessed_element
+= diff
;
2536 /* Store the gap from the previous member of the group. If there is no
2537 gap in the access, GROUP_GAP is always 1. */
2538 GROUP_GAP (vinfo_for_stmt (next
)) = diff
;
2540 prev_init
= DR_INIT (data_ref
);
2541 next
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (next
));
2542 /* Count the number of data-refs in the chain. */
2547 groupsize
= count
+ gaps
;
2549 /* This could be UINT_MAX but as we are generating code in a very
2550 inefficient way we have to cap earlier. See PR78699 for example. */
2551 if (groupsize
> 4096)
2553 if (dump_enabled_p ())
2554 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2555 "group is too large\n");
2559 /* Check that the size of the interleaving is equal to count for stores,
2560 i.e., that there are no gaps. */
2561 if (groupsize
!= count
2562 && !DR_IS_READ (dr
))
2564 if (dump_enabled_p ())
2565 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2566 "interleaved store with gaps\n");
2570 /* If there is a gap after the last load in the group it is the
2571 difference between the groupsize and the last accessed
2573 When there is no gap, this difference should be 0. */
2574 GROUP_GAP (vinfo_for_stmt (stmt
)) = groupsize
- last_accessed_element
;
2576 GROUP_SIZE (vinfo_for_stmt (stmt
)) = groupsize
;
2577 if (dump_enabled_p ())
2579 dump_printf_loc (MSG_NOTE
, vect_location
,
2580 "Detected interleaving ");
2581 if (DR_IS_READ (dr
))
2582 dump_printf (MSG_NOTE
, "load ");
2584 dump_printf (MSG_NOTE
, "store ");
2585 dump_printf (MSG_NOTE
, "of size %u starting with ",
2586 (unsigned)groupsize
);
2587 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, stmt
, 0);
2588 if (GROUP_GAP (vinfo_for_stmt (stmt
)) != 0)
2589 dump_printf_loc (MSG_NOTE
, vect_location
,
2590 "There is a gap of %u elements after the group\n",
2591 GROUP_GAP (vinfo_for_stmt (stmt
)));
2594 /* SLP: create an SLP data structure for every interleaving group of
2595 stores for further analysis in vect_analyse_slp. */
2596 if (DR_IS_WRITE (dr
) && !slp_impossible
)
2599 LOOP_VINFO_GROUPED_STORES (loop_vinfo
).safe_push (stmt
);
2601 BB_VINFO_GROUPED_STORES (bb_vinfo
).safe_push (stmt
);
2608 /* Analyze groups of accesses: check that DR belongs to a group of
2609 accesses of legal size, step, etc. Detect gaps, single element
2610 interleaving, and other special cases. Set grouped access info.
2611 Collect groups of strided stores for further use in SLP analysis. */
2614 vect_analyze_group_access (struct data_reference
*dr
)
2616 if (!vect_analyze_group_access_1 (dr
))
2618 /* Dissolve the group if present. */
2620 gimple
*stmt
= GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr
)));
2623 stmt_vec_info vinfo
= vinfo_for_stmt (stmt
);
2624 next
= GROUP_NEXT_ELEMENT (vinfo
);
2625 GROUP_FIRST_ELEMENT (vinfo
) = NULL
;
2626 GROUP_NEXT_ELEMENT (vinfo
) = NULL
;
2634 /* Analyze the access pattern of the data-reference DR.
2635 In case of non-consecutive accesses call vect_analyze_group_access() to
2636 analyze groups of accesses. */
2639 vect_analyze_data_ref_access (struct data_reference
*dr
)
2641 tree step
= DR_STEP (dr
);
2642 tree scalar_type
= TREE_TYPE (DR_REF (dr
));
2643 gimple
*stmt
= DR_STMT (dr
);
2644 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
2645 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
2646 struct loop
*loop
= NULL
;
2649 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
2651 if (loop_vinfo
&& !step
)
2653 if (dump_enabled_p ())
2654 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2655 "bad data-ref access in loop\n");
2659 /* Allow loads with zero step in inner-loop vectorization. */
2660 if (loop_vinfo
&& integer_zerop (step
))
2662 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) = NULL
;
2663 if (!nested_in_vect_loop_p (loop
, stmt
))
2664 return DR_IS_READ (dr
);
2665 /* Allow references with zero step for outer loops marked
2666 with pragma omp simd only - it guarantees absence of
2667 loop-carried dependencies between inner loop iterations. */
2668 if (!loop
->force_vectorize
)
2670 if (dump_enabled_p ())
2671 dump_printf_loc (MSG_NOTE
, vect_location
,
2672 "zero step in inner loop of nest\n");
2677 if (loop
&& nested_in_vect_loop_p (loop
, stmt
))
2679 /* Interleaved accesses are not yet supported within outer-loop
2680 vectorization for references in the inner-loop. */
2681 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) = NULL
;
2683 /* For the rest of the analysis we use the outer-loop step. */
2684 step
= STMT_VINFO_DR_STEP (stmt_info
);
2685 if (integer_zerop (step
))
2687 if (dump_enabled_p ())
2688 dump_printf_loc (MSG_NOTE
, vect_location
,
2689 "zero step in outer loop.\n");
2690 return DR_IS_READ (dr
);
2695 if (TREE_CODE (step
) == INTEGER_CST
)
2697 HOST_WIDE_INT dr_step
= TREE_INT_CST_LOW (step
);
2698 if (!tree_int_cst_compare (step
, TYPE_SIZE_UNIT (scalar_type
))
2700 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type
), -dr_step
)))
2702 /* Mark that it is not interleaving. */
2703 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) = NULL
;
2708 if (loop
&& nested_in_vect_loop_p (loop
, stmt
))
2710 if (dump_enabled_p ())
2711 dump_printf_loc (MSG_NOTE
, vect_location
,
2712 "grouped access in outer loop.\n");
2717 /* Assume this is a DR handled by non-constant strided load case. */
2718 if (TREE_CODE (step
) != INTEGER_CST
)
2719 return (STMT_VINFO_STRIDED_P (stmt_info
)
2720 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info
)
2721 || vect_analyze_group_access (dr
)));
2723 /* Not consecutive access - check if it's a part of interleaving group. */
2724 return vect_analyze_group_access (dr
);
2727 /* Compare two data-references DRA and DRB to group them into chunks
2728 suitable for grouping. */
2731 dr_group_sort_cmp (const void *dra_
, const void *drb_
)
2733 data_reference_p dra
= *(data_reference_p
*)const_cast<void *>(dra_
);
2734 data_reference_p drb
= *(data_reference_p
*)const_cast<void *>(drb_
);
2737 /* Stabilize sort. */
2741 /* DRs in different loops never belong to the same group. */
2742 loop_p loopa
= gimple_bb (DR_STMT (dra
))->loop_father
;
2743 loop_p loopb
= gimple_bb (DR_STMT (drb
))->loop_father
;
2745 return loopa
->num
< loopb
->num
? -1 : 1;
2747 /* Ordering of DRs according to base. */
2748 cmp
= data_ref_compare_tree (DR_BASE_ADDRESS (dra
),
2749 DR_BASE_ADDRESS (drb
));
2753 /* And according to DR_OFFSET. */
2754 cmp
= data_ref_compare_tree (DR_OFFSET (dra
), DR_OFFSET (drb
));
2758 /* Put reads before writes. */
2759 if (DR_IS_READ (dra
) != DR_IS_READ (drb
))
2760 return DR_IS_READ (dra
) ? -1 : 1;
2762 /* Then sort after access size. */
2763 cmp
= data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra
))),
2764 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb
))));
2768 /* And after step. */
2769 cmp
= data_ref_compare_tree (DR_STEP (dra
), DR_STEP (drb
));
2773 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2774 cmp
= data_ref_compare_tree (DR_INIT (dra
), DR_INIT (drb
));
2776 return gimple_uid (DR_STMT (dra
)) < gimple_uid (DR_STMT (drb
)) ? -1 : 1;
2780 /* Function vect_analyze_data_ref_accesses.
2782 Analyze the access pattern of all the data references in the loop.
2784 FORNOW: the only access pattern that is considered vectorizable is a
2785 simple step 1 (consecutive) access.
2787 FORNOW: handle only arrays and pointer accesses. */
2790 vect_analyze_data_ref_accesses (vec_info
*vinfo
)
2793 vec
<data_reference_p
> datarefs
= vinfo
->datarefs
;
2794 struct data_reference
*dr
;
2796 if (dump_enabled_p ())
2797 dump_printf_loc (MSG_NOTE
, vect_location
,
2798 "=== vect_analyze_data_ref_accesses ===\n");
2800 if (datarefs
.is_empty ())
2803 /* Sort the array of datarefs to make building the interleaving chains
2804 linear. Don't modify the original vector's order, it is needed for
2805 determining what dependencies are reversed. */
2806 vec
<data_reference_p
> datarefs_copy
= datarefs
.copy ();
2807 datarefs_copy
.qsort (dr_group_sort_cmp
);
2809 /* Build the interleaving chains. */
2810 for (i
= 0; i
< datarefs_copy
.length () - 1;)
2812 data_reference_p dra
= datarefs_copy
[i
];
2813 stmt_vec_info stmtinfo_a
= vinfo_for_stmt (DR_STMT (dra
));
2814 stmt_vec_info lastinfo
= NULL
;
2815 if (! STMT_VINFO_VECTORIZABLE (stmtinfo_a
))
2820 for (i
= i
+ 1; i
< datarefs_copy
.length (); ++i
)
2822 data_reference_p drb
= datarefs_copy
[i
];
2823 stmt_vec_info stmtinfo_b
= vinfo_for_stmt (DR_STMT (drb
));
2824 if (! STMT_VINFO_VECTORIZABLE (stmtinfo_b
))
2827 /* ??? Imperfect sorting (non-compatible types, non-modulo
2828 accesses, same accesses) can lead to a group to be artificially
2829 split here as we don't just skip over those. If it really
2830 matters we can push those to a worklist and re-iterate
2831 over them. The we can just skip ahead to the next DR here. */
2833 /* DRs in a different loop should not be put into the same
2834 interleaving group. */
2835 if (gimple_bb (DR_STMT (dra
))->loop_father
2836 != gimple_bb (DR_STMT (drb
))->loop_father
)
2839 /* Check that the data-refs have same first location (except init)
2840 and they are both either store or load (not load and store,
2841 not masked loads or stores). */
2842 if (DR_IS_READ (dra
) != DR_IS_READ (drb
)
2843 || data_ref_compare_tree (DR_BASE_ADDRESS (dra
),
2844 DR_BASE_ADDRESS (drb
)) != 0
2845 || data_ref_compare_tree (DR_OFFSET (dra
), DR_OFFSET (drb
)) != 0
2846 || !gimple_assign_single_p (DR_STMT (dra
))
2847 || !gimple_assign_single_p (DR_STMT (drb
)))
2850 /* Check that the data-refs have the same constant size. */
2851 tree sza
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra
)));
2852 tree szb
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb
)));
2853 if (!tree_fits_uhwi_p (sza
)
2854 || !tree_fits_uhwi_p (szb
)
2855 || !tree_int_cst_equal (sza
, szb
))
2858 /* Check that the data-refs have the same step. */
2859 if (data_ref_compare_tree (DR_STEP (dra
), DR_STEP (drb
)) != 0)
2862 /* Check the types are compatible.
2863 ??? We don't distinguish this during sorting. */
2864 if (!types_compatible_p (TREE_TYPE (DR_REF (dra
)),
2865 TREE_TYPE (DR_REF (drb
))))
2868 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2869 HOST_WIDE_INT init_a
= TREE_INT_CST_LOW (DR_INIT (dra
));
2870 HOST_WIDE_INT init_b
= TREE_INT_CST_LOW (DR_INIT (drb
));
2871 HOST_WIDE_INT init_prev
2872 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy
[i
-1]));
2873 gcc_assert (init_a
<= init_b
2874 && init_a
<= init_prev
2875 && init_prev
<= init_b
);
2877 /* Do not place the same access in the interleaving chain twice. */
2878 if (init_b
== init_prev
)
2880 gcc_assert (gimple_uid (DR_STMT (datarefs_copy
[i
-1]))
2881 < gimple_uid (DR_STMT (drb
)));
2882 /* ??? For now we simply "drop" the later reference which is
2883 otherwise the same rather than finishing off this group.
2884 In the end we'd want to re-process duplicates forming
2885 multiple groups from the refs, likely by just collecting
2886 all candidates (including duplicates and split points
2887 below) in a vector and then process them together. */
2891 /* If init_b == init_a + the size of the type * k, we have an
2892 interleaving, and DRA is accessed before DRB. */
2893 HOST_WIDE_INT type_size_a
= tree_to_uhwi (sza
);
2894 if (type_size_a
== 0
2895 || (init_b
- init_a
) % type_size_a
!= 0)
2898 /* If we have a store, the accesses are adjacent. This splits
2899 groups into chunks we support (we don't support vectorization
2900 of stores with gaps). */
2901 if (!DR_IS_READ (dra
) && init_b
- init_prev
!= type_size_a
)
2904 /* If the step (if not zero or non-constant) is greater than the
2905 difference between data-refs' inits this splits groups into
2907 if (tree_fits_shwi_p (DR_STEP (dra
)))
2909 HOST_WIDE_INT step
= tree_to_shwi (DR_STEP (dra
));
2910 if (step
!= 0 && step
<= (init_b
- init_a
))
2914 if (dump_enabled_p ())
2916 dump_printf_loc (MSG_NOTE
, vect_location
,
2917 "Detected interleaving ");
2918 if (DR_IS_READ (dra
))
2919 dump_printf (MSG_NOTE
, "load ");
2921 dump_printf (MSG_NOTE
, "store ");
2922 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dra
));
2923 dump_printf (MSG_NOTE
, " and ");
2924 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (drb
));
2925 dump_printf (MSG_NOTE
, "\n");
2928 /* Link the found element into the group list. */
2929 if (!GROUP_FIRST_ELEMENT (stmtinfo_a
))
2931 GROUP_FIRST_ELEMENT (stmtinfo_a
) = DR_STMT (dra
);
2932 lastinfo
= stmtinfo_a
;
2934 GROUP_FIRST_ELEMENT (stmtinfo_b
) = DR_STMT (dra
);
2935 GROUP_NEXT_ELEMENT (lastinfo
) = DR_STMT (drb
);
2936 lastinfo
= stmtinfo_b
;
2940 FOR_EACH_VEC_ELT (datarefs_copy
, i
, dr
)
2941 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr
)))
2942 && !vect_analyze_data_ref_access (dr
))
2944 if (dump_enabled_p ())
2945 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2946 "not vectorized: complicated access pattern.\n");
2948 if (is_a
<bb_vec_info
> (vinfo
))
2950 /* Mark the statement as not vectorizable. */
2951 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr
))) = false;
2956 datarefs_copy
.release ();
2961 datarefs_copy
.release ();
2965 /* Function vect_vfa_segment_size.
2967 Create an expression that computes the size of segment
2968 that will be accessed for a data reference. The functions takes into
2969 account that realignment loads may access one more vector.
2972 DR: The data reference.
2973 LENGTH_FACTOR: segment length to consider.
2975 Return an expression whose value is the size of segment which will be
2979 vect_vfa_segment_size (struct data_reference
*dr
, tree length_factor
)
2981 tree segment_length
;
2983 if (integer_zerop (DR_STEP (dr
)))
2984 segment_length
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr
)));
2986 segment_length
= size_binop (MULT_EXPR
,
2987 fold_convert (sizetype
, DR_STEP (dr
)),
2988 fold_convert (sizetype
, length_factor
));
2990 if (vect_supportable_dr_alignment (dr
, false)
2991 == dr_explicit_realign_optimized
)
2993 tree vector_size
= TYPE_SIZE_UNIT
2994 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr
))));
2996 segment_length
= size_binop (PLUS_EXPR
, segment_length
, vector_size
);
2998 return segment_length
;
3001 /* Function vect_no_alias_p.
3003 Given data references A and B with equal base and offset, the alias
3004 relation can be decided at compilation time, return TRUE if they do
3005 not alias to each other; return FALSE otherwise. SEGMENT_LENGTH_A
3006 and SEGMENT_LENGTH_B are the memory lengths accessed by A and B
3010 vect_no_alias_p (struct data_reference
*a
, struct data_reference
*b
,
3011 tree segment_length_a
, tree segment_length_b
)
3013 gcc_assert (TREE_CODE (DR_INIT (a
)) == INTEGER_CST
3014 && TREE_CODE (DR_INIT (b
)) == INTEGER_CST
);
3015 if (tree_int_cst_equal (DR_INIT (a
), DR_INIT (b
)))
3018 tree seg_a_min
= DR_INIT (a
);
3019 tree seg_a_max
= fold_build2 (PLUS_EXPR
, TREE_TYPE (seg_a_min
),
3020 seg_a_min
, segment_length_a
);
3021 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3022 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3024 if (tree_int_cst_compare (DR_STEP (a
), size_zero_node
) < 0)
3026 tree unit_size
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (a
)));
3027 seg_a_min
= fold_build2 (PLUS_EXPR
, TREE_TYPE (seg_a_max
),
3028 seg_a_max
, unit_size
);
3029 seg_a_max
= fold_build2 (PLUS_EXPR
, TREE_TYPE (DR_INIT (a
)),
3030 DR_INIT (a
), unit_size
);
3032 tree seg_b_min
= DR_INIT (b
);
3033 tree seg_b_max
= fold_build2 (PLUS_EXPR
, TREE_TYPE (seg_b_min
),
3034 seg_b_min
, segment_length_b
);
3035 if (tree_int_cst_compare (DR_STEP (b
), size_zero_node
) < 0)
3037 tree unit_size
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (b
)));
3038 seg_b_min
= fold_build2 (PLUS_EXPR
, TREE_TYPE (seg_b_max
),
3039 seg_b_max
, unit_size
);
3040 seg_b_max
= fold_build2 (PLUS_EXPR
, TREE_TYPE (DR_INIT (b
)),
3041 DR_INIT (b
), unit_size
);
3044 if (tree_int_cst_le (seg_a_max
, seg_b_min
)
3045 || tree_int_cst_le (seg_b_max
, seg_a_min
))
3051 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3055 dependence_distance_ge_vf (data_dependence_relation
*ddr
,
3056 unsigned int loop_depth
, poly_uint64 vf
)
3058 if (DDR_ARE_DEPENDENT (ddr
) != NULL_TREE
3059 || DDR_NUM_DIST_VECTS (ddr
) == 0)
3062 /* If the dependence is exact, we should have limited the VF instead. */
3063 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr
));
3066 lambda_vector dist_v
;
3067 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr
), i
, dist_v
)
3069 HOST_WIDE_INT dist
= dist_v
[loop_depth
];
3071 && !(dist
> 0 && DDR_REVERSED_P (ddr
))
3072 && maybe_lt ((unsigned HOST_WIDE_INT
) abs_hwi (dist
), vf
))
3076 if (dump_enabled_p ())
3078 dump_printf_loc (MSG_NOTE
, vect_location
,
3079 "dependence distance between ");
3080 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (DDR_A (ddr
)));
3081 dump_printf (MSG_NOTE
, " and ");
3082 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (DDR_B (ddr
)));
3083 dump_printf (MSG_NOTE
, " is >= VF\n");
3089 /* Function vect_prune_runtime_alias_test_list.
3091 Prune a list of ddrs to be tested at run-time by versioning for alias.
3092 Merge several alias checks into one if possible.
3093 Return FALSE if resulting list of ddrs is longer then allowed by
3094 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3097 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo
)
3099 typedef pair_hash
<tree_operand_hash
, tree_operand_hash
> tree_pair_hash
;
3100 hash_set
<tree_pair_hash
> compared_objects
;
3102 vec
<ddr_p
> may_alias_ddrs
= LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo
);
3103 vec
<dr_with_seg_len_pair_t
> &comp_alias_ddrs
3104 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo
);
3105 vec
<vec_object_pair
> &check_unequal_addrs
3106 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo
);
3107 poly_uint64 vect_factor
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
3108 tree scalar_loop_iters
= LOOP_VINFO_NITERS (loop_vinfo
);
3114 if (dump_enabled_p ())
3115 dump_printf_loc (MSG_NOTE
, vect_location
,
3116 "=== vect_prune_runtime_alias_test_list ===\n");
3118 if (may_alias_ddrs
.is_empty ())
3121 comp_alias_ddrs
.create (may_alias_ddrs
.length ());
3123 unsigned int loop_depth
3124 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo
)->num
,
3125 LOOP_VINFO_LOOP_NEST (loop_vinfo
));
3127 /* First, we collect all data ref pairs for aliasing checks. */
3128 FOR_EACH_VEC_ELT (may_alias_ddrs
, i
, ddr
)
3131 struct data_reference
*dr_a
, *dr_b
;
3132 gimple
*dr_group_first_a
, *dr_group_first_b
;
3133 tree segment_length_a
, segment_length_b
;
3134 gimple
*stmt_a
, *stmt_b
;
3136 /* Ignore the alias if the VF we chose ended up being no greater
3137 than the dependence distance. */
3138 if (dependence_distance_ge_vf (ddr
, loop_depth
, vect_factor
))
3141 if (DDR_OBJECT_A (ddr
))
3143 vec_object_pair
new_pair (DDR_OBJECT_A (ddr
), DDR_OBJECT_B (ddr
));
3144 if (!compared_objects
.add (new_pair
))
3146 if (dump_enabled_p ())
3148 dump_printf_loc (MSG_NOTE
, vect_location
, "checking that ");
3149 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, new_pair
.first
);
3150 dump_printf (MSG_NOTE
, " and ");
3151 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, new_pair
.second
);
3152 dump_printf (MSG_NOTE
, " have different addresses\n");
3154 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo
).safe_push (new_pair
);
3160 stmt_a
= DR_STMT (DDR_A (ddr
));
3161 dr_group_first_a
= GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a
));
3162 if (dr_group_first_a
)
3164 stmt_a
= dr_group_first_a
;
3165 dr_a
= STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a
));
3169 stmt_b
= DR_STMT (DDR_B (ddr
));
3170 dr_group_first_b
= GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b
));
3171 if (dr_group_first_b
)
3173 stmt_b
= dr_group_first_b
;
3174 dr_b
= STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b
));
3177 if (!operand_equal_p (DR_STEP (dr_a
), DR_STEP (dr_b
), 0))
3178 length_factor
= scalar_loop_iters
;
3180 length_factor
= size_int (vect_factor
);
3181 segment_length_a
= vect_vfa_segment_size (dr_a
, length_factor
);
3182 segment_length_b
= vect_vfa_segment_size (dr_b
, length_factor
);
3184 comp_res
= data_ref_compare_tree (DR_BASE_ADDRESS (dr_a
),
3185 DR_BASE_ADDRESS (dr_b
));
3187 comp_res
= data_ref_compare_tree (DR_OFFSET (dr_a
),
3190 /* Alias is known at compilation time. */
3192 && TREE_CODE (DR_STEP (dr_a
)) == INTEGER_CST
3193 && TREE_CODE (DR_STEP (dr_b
)) == INTEGER_CST
3194 && TREE_CODE (segment_length_a
) == INTEGER_CST
3195 && TREE_CODE (segment_length_b
) == INTEGER_CST
)
3197 if (vect_no_alias_p (dr_a
, dr_b
, segment_length_a
, segment_length_b
))
3200 if (dump_enabled_p ())
3201 dump_printf_loc (MSG_NOTE
, vect_location
,
3202 "not vectorized: compilation time alias.\n");
3207 dr_with_seg_len_pair_t dr_with_seg_len_pair
3208 (dr_with_seg_len (dr_a
, segment_length_a
),
3209 dr_with_seg_len (dr_b
, segment_length_b
));
3211 /* Canonicalize pairs by sorting the two DR members. */
3213 std::swap (dr_with_seg_len_pair
.first
, dr_with_seg_len_pair
.second
);
3215 comp_alias_ddrs
.safe_push (dr_with_seg_len_pair
);
3218 prune_runtime_alias_test_list (&comp_alias_ddrs
, vect_factor
);
3220 unsigned int count
= (comp_alias_ddrs
.length ()
3221 + check_unequal_addrs
.length ());
3222 dump_printf_loc (MSG_NOTE
, vect_location
,
3223 "improved number of alias checks from %d to %d\n",
3224 may_alias_ddrs
.length (), count
);
3225 if ((int) count
> PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS
))
3227 if (dump_enabled_p ())
3228 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3229 "number of versioning for alias "
3230 "run-time tests exceeds %d "
3231 "(--param vect-max-version-for-alias-checks)\n",
3232 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS
));
3239 /* Return true if a non-affine read or write in STMT is suitable for a
3240 gather load or scatter store. Describe the operation in *INFO if so. */
3243 vect_check_gather_scatter (gimple
*stmt
, loop_vec_info loop_vinfo
,
3244 gather_scatter_info
*info
)
3246 HOST_WIDE_INT scale
= 1;
3247 poly_int64 pbitpos
, pbitsize
;
3248 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
3249 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
3250 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
3251 tree offtype
= NULL_TREE
;
3252 tree decl
, base
, off
;
3254 int punsignedp
, reversep
, pvolatilep
= 0;
3257 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3258 see if we can use the def stmt of the address. */
3259 if (is_gimple_call (stmt
)
3260 && gimple_call_internal_p (stmt
)
3261 && (gimple_call_internal_fn (stmt
) == IFN_MASK_LOAD
3262 || gimple_call_internal_fn (stmt
) == IFN_MASK_STORE
)
3263 && TREE_CODE (base
) == MEM_REF
3264 && TREE_CODE (TREE_OPERAND (base
, 0)) == SSA_NAME
3265 && integer_zerop (TREE_OPERAND (base
, 1))
3266 && !expr_invariant_in_loop_p (loop
, TREE_OPERAND (base
, 0)))
3268 gimple
*def_stmt
= SSA_NAME_DEF_STMT (TREE_OPERAND (base
, 0));
3269 if (is_gimple_assign (def_stmt
)
3270 && gimple_assign_rhs_code (def_stmt
) == ADDR_EXPR
)
3271 base
= TREE_OPERAND (gimple_assign_rhs1 (def_stmt
), 0);
3274 /* The gather and scatter builtins need address of the form
3275 loop_invariant + vector * {1, 2, 4, 8}
3277 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3278 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3279 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3280 multiplications and additions in it. To get a vector, we need
3281 a single SSA_NAME that will be defined in the loop and will
3282 contain everything that is not loop invariant and that can be
3283 vectorized. The following code attempts to find such a preexistng
3284 SSA_NAME OFF and put the loop invariants into a tree BASE
3285 that can be gimplified before the loop. */
3286 base
= get_inner_reference (base
, &pbitsize
, &pbitpos
, &off
, &pmode
,
3287 &punsignedp
, &reversep
, &pvolatilep
);
3288 gcc_assert (base
&& !reversep
);
3289 poly_int64 pbytepos
= exact_div (pbitpos
, BITS_PER_UNIT
);
3291 if (TREE_CODE (base
) == MEM_REF
)
3293 if (!integer_zerop (TREE_OPERAND (base
, 1)))
3295 if (off
== NULL_TREE
)
3296 off
= wide_int_to_tree (sizetype
, mem_ref_offset (base
));
3298 off
= size_binop (PLUS_EXPR
, off
,
3299 fold_convert (sizetype
, TREE_OPERAND (base
, 1)));
3301 base
= TREE_OPERAND (base
, 0);
3304 base
= build_fold_addr_expr (base
);
3306 if (off
== NULL_TREE
)
3307 off
= size_zero_node
;
3309 /* If base is not loop invariant, either off is 0, then we start with just
3310 the constant offset in the loop invariant BASE and continue with base
3311 as OFF, otherwise give up.
3312 We could handle that case by gimplifying the addition of base + off
3313 into some SSA_NAME and use that as off, but for now punt. */
3314 if (!expr_invariant_in_loop_p (loop
, base
))
3316 if (!integer_zerop (off
))
3319 base
= size_int (pbytepos
);
3321 /* Otherwise put base + constant offset into the loop invariant BASE
3322 and continue with OFF. */
3325 base
= fold_convert (sizetype
, base
);
3326 base
= size_binop (PLUS_EXPR
, base
, size_int (pbytepos
));
3329 /* OFF at this point may be either a SSA_NAME or some tree expression
3330 from get_inner_reference. Try to peel off loop invariants from it
3331 into BASE as long as possible. */
3333 while (offtype
== NULL_TREE
)
3335 enum tree_code code
;
3336 tree op0
, op1
, add
= NULL_TREE
;
3338 if (TREE_CODE (off
) == SSA_NAME
)
3340 gimple
*def_stmt
= SSA_NAME_DEF_STMT (off
);
3342 if (expr_invariant_in_loop_p (loop
, off
))
3345 if (gimple_code (def_stmt
) != GIMPLE_ASSIGN
)
3348 op0
= gimple_assign_rhs1 (def_stmt
);
3349 code
= gimple_assign_rhs_code (def_stmt
);
3350 op1
= gimple_assign_rhs2 (def_stmt
);
3354 if (get_gimple_rhs_class (TREE_CODE (off
)) == GIMPLE_TERNARY_RHS
)
3356 code
= TREE_CODE (off
);
3357 extract_ops_from_tree (off
, &code
, &op0
, &op1
);
3361 case POINTER_PLUS_EXPR
:
3363 if (expr_invariant_in_loop_p (loop
, op0
))
3368 add
= fold_convert (sizetype
, add
);
3370 add
= size_binop (MULT_EXPR
, add
, size_int (scale
));
3371 base
= size_binop (PLUS_EXPR
, base
, add
);
3374 if (expr_invariant_in_loop_p (loop
, op1
))
3382 if (expr_invariant_in_loop_p (loop
, op1
))
3384 add
= fold_convert (sizetype
, op1
);
3385 add
= size_binop (MINUS_EXPR
, size_zero_node
, add
);
3391 if (scale
== 1 && tree_fits_shwi_p (op1
))
3393 scale
= tree_to_shwi (op1
);
3402 if (!POINTER_TYPE_P (TREE_TYPE (op0
))
3403 && !INTEGRAL_TYPE_P (TREE_TYPE (op0
)))
3405 if (TYPE_PRECISION (TREE_TYPE (op0
))
3406 == TYPE_PRECISION (TREE_TYPE (off
)))
3411 if (TYPE_PRECISION (TREE_TYPE (op0
))
3412 < TYPE_PRECISION (TREE_TYPE (off
)))
3415 offtype
= TREE_TYPE (off
);
3426 /* If at the end OFF still isn't a SSA_NAME or isn't
3427 defined in the loop, punt. */
3428 if (TREE_CODE (off
) != SSA_NAME
3429 || expr_invariant_in_loop_p (loop
, off
))
3432 if (offtype
== NULL_TREE
)
3433 offtype
= TREE_TYPE (off
);
3435 if (DR_IS_READ (dr
))
3436 decl
= targetm
.vectorize
.builtin_gather (STMT_VINFO_VECTYPE (stmt_info
),
3439 decl
= targetm
.vectorize
.builtin_scatter (STMT_VINFO_VECTYPE (stmt_info
),
3442 if (decl
== NULL_TREE
)
3448 info
->offset_dt
= vect_unknown_def_type
;
3449 info
->offset_vectype
= NULL_TREE
;
3450 info
->scale
= scale
;
3454 /* Function vect_analyze_data_refs.
3456 Find all the data references in the loop or basic block.
3458 The general structure of the analysis of data refs in the vectorizer is as
3460 1- vect_analyze_data_refs(loop/bb): call
3461 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3462 in the loop/bb and their dependences.
3463 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3464 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3465 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3470 vect_analyze_data_refs (vec_info
*vinfo
, poly_uint64
*min_vf
)
3472 struct loop
*loop
= NULL
;
3474 struct data_reference
*dr
;
3477 if (dump_enabled_p ())
3478 dump_printf_loc (MSG_NOTE
, vect_location
,
3479 "=== vect_analyze_data_refs ===\n");
3481 if (loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
))
3482 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
3484 /* Go through the data-refs, check that the analysis succeeded. Update
3485 pointer from stmt_vec_info struct to DR and vectype. */
3487 vec
<data_reference_p
> datarefs
= vinfo
->datarefs
;
3488 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
3491 stmt_vec_info stmt_info
;
3492 tree base
, offset
, init
;
3493 enum { SG_NONE
, GATHER
, SCATTER
} gatherscatter
= SG_NONE
;
3494 bool simd_lane_access
= false;
3498 if (!dr
|| !DR_REF (dr
))
3500 if (dump_enabled_p ())
3501 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3502 "not vectorized: unhandled data-ref\n");
3506 stmt
= DR_STMT (dr
);
3507 stmt_info
= vinfo_for_stmt (stmt
);
3509 /* Discard clobbers from the dataref vector. We will remove
3510 clobber stmts during vectorization. */
3511 if (gimple_clobber_p (stmt
))
3514 if (i
== datarefs
.length () - 1)
3519 datarefs
.ordered_remove (i
);
3524 /* Check that analysis of the data-ref succeeded. */
3525 if (!DR_BASE_ADDRESS (dr
) || !DR_OFFSET (dr
) || !DR_INIT (dr
)
3530 && !TREE_THIS_VOLATILE (DR_REF (dr
))
3531 && targetm
.vectorize
.builtin_gather
!= NULL
;
3534 && !TREE_THIS_VOLATILE (DR_REF (dr
))
3535 && targetm
.vectorize
.builtin_scatter
!= NULL
;
3536 bool maybe_simd_lane_access
3537 = is_a
<loop_vec_info
> (vinfo
) && loop
->simduid
;
3539 /* If target supports vector gather loads or scatter stores, or if
3540 this might be a SIMD lane access, see if they can't be used. */
3541 if (is_a
<loop_vec_info
> (vinfo
)
3542 && (maybe_gather
|| maybe_scatter
|| maybe_simd_lane_access
)
3543 && !nested_in_vect_loop_p (loop
, stmt
))
3545 struct data_reference
*newdr
3546 = create_data_ref (NULL
, loop_containing_stmt (stmt
),
3547 DR_REF (dr
), stmt
, !maybe_scatter
,
3548 DR_IS_CONDITIONAL_IN_STMT (dr
));
3549 gcc_assert (newdr
!= NULL
&& DR_REF (newdr
));
3550 if (DR_BASE_ADDRESS (newdr
)
3551 && DR_OFFSET (newdr
)
3554 && integer_zerop (DR_STEP (newdr
)))
3556 if (maybe_simd_lane_access
)
3558 tree off
= DR_OFFSET (newdr
);
3560 if (TREE_CODE (DR_INIT (newdr
)) == INTEGER_CST
3561 && TREE_CODE (off
) == MULT_EXPR
3562 && tree_fits_uhwi_p (TREE_OPERAND (off
, 1)))
3564 tree step
= TREE_OPERAND (off
, 1);
3565 off
= TREE_OPERAND (off
, 0);
3567 if (CONVERT_EXPR_P (off
)
3568 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off
,
3570 < TYPE_PRECISION (TREE_TYPE (off
)))
3571 off
= TREE_OPERAND (off
, 0);
3572 if (TREE_CODE (off
) == SSA_NAME
)
3574 gimple
*def
= SSA_NAME_DEF_STMT (off
);
3575 tree reft
= TREE_TYPE (DR_REF (newdr
));
3576 if (is_gimple_call (def
)
3577 && gimple_call_internal_p (def
)
3578 && (gimple_call_internal_fn (def
)
3579 == IFN_GOMP_SIMD_LANE
))
3581 tree arg
= gimple_call_arg (def
, 0);
3582 gcc_assert (TREE_CODE (arg
) == SSA_NAME
);
3583 arg
= SSA_NAME_VAR (arg
);
3584 if (arg
== loop
->simduid
3586 && tree_int_cst_equal
3587 (TYPE_SIZE_UNIT (reft
),
3590 DR_OFFSET (newdr
) = ssize_int (0);
3591 DR_STEP (newdr
) = step
;
3592 DR_OFFSET_ALIGNMENT (newdr
)
3593 = BIGGEST_ALIGNMENT
;
3594 DR_STEP_ALIGNMENT (newdr
)
3595 = highest_pow2_factor (step
);
3597 simd_lane_access
= true;
3603 if (!simd_lane_access
&& (maybe_gather
|| maybe_scatter
))
3607 gatherscatter
= GATHER
;
3609 gatherscatter
= SCATTER
;
3612 if (gatherscatter
== SG_NONE
&& !simd_lane_access
)
3613 free_data_ref (newdr
);
3616 if (gatherscatter
== SG_NONE
&& !simd_lane_access
)
3618 if (dump_enabled_p ())
3620 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3621 "not vectorized: data ref analysis "
3623 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3626 if (is_a
<bb_vec_info
> (vinfo
))
3633 if (TREE_CODE (DR_BASE_ADDRESS (dr
)) == INTEGER_CST
)
3635 if (dump_enabled_p ())
3636 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3637 "not vectorized: base addr of dr is a "
3640 if (is_a
<bb_vec_info
> (vinfo
))
3643 if (gatherscatter
!= SG_NONE
|| simd_lane_access
)
3648 if (TREE_THIS_VOLATILE (DR_REF (dr
)))
3650 if (dump_enabled_p ())
3652 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3653 "not vectorized: volatile type ");
3654 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3657 if (is_a
<bb_vec_info
> (vinfo
))
3663 if (stmt_can_throw_internal (stmt
))
3665 if (dump_enabled_p ())
3667 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3668 "not vectorized: statement can throw an "
3670 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3673 if (is_a
<bb_vec_info
> (vinfo
))
3676 if (gatherscatter
!= SG_NONE
|| simd_lane_access
)
3681 if (TREE_CODE (DR_REF (dr
)) == COMPONENT_REF
3682 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr
), 1)))
3684 if (dump_enabled_p ())
3686 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3687 "not vectorized: statement is bitfield "
3689 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3692 if (is_a
<bb_vec_info
> (vinfo
))
3695 if (gatherscatter
!= SG_NONE
|| simd_lane_access
)
3700 base
= unshare_expr (DR_BASE_ADDRESS (dr
));
3701 offset
= unshare_expr (DR_OFFSET (dr
));
3702 init
= unshare_expr (DR_INIT (dr
));
3704 if (is_gimple_call (stmt
)
3705 && (!gimple_call_internal_p (stmt
)
3706 || (gimple_call_internal_fn (stmt
) != IFN_MASK_LOAD
3707 && gimple_call_internal_fn (stmt
) != IFN_MASK_STORE
)))
3709 if (dump_enabled_p ())
3711 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3712 "not vectorized: dr in a call ");
3713 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3716 if (is_a
<bb_vec_info
> (vinfo
))
3719 if (gatherscatter
!= SG_NONE
|| simd_lane_access
)
3724 /* Update DR field in stmt_vec_info struct. */
3726 /* If the dataref is in an inner-loop of the loop that is considered for
3727 for vectorization, we also want to analyze the access relative to
3728 the outer-loop (DR contains information only relative to the
3729 inner-most enclosing loop). We do that by building a reference to the
3730 first location accessed by the inner-loop, and analyze it relative to
3732 if (loop
&& nested_in_vect_loop_p (loop
, stmt
))
3734 /* Build a reference to the first location accessed by the
3735 inner loop: *(BASE + INIT + OFFSET). By construction,
3736 this address must be invariant in the inner loop, so we
3737 can consider it as being used in the outer loop. */
3738 tree init_offset
= fold_build2 (PLUS_EXPR
, TREE_TYPE (offset
),
3740 tree init_addr
= fold_build_pointer_plus (base
, init_offset
);
3741 tree init_ref
= build_fold_indirect_ref (init_addr
);
3743 if (dump_enabled_p ())
3745 dump_printf_loc (MSG_NOTE
, vect_location
,
3746 "analyze in outer loop: ");
3747 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, init_ref
);
3748 dump_printf (MSG_NOTE
, "\n");
3751 if (!dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info
),
3753 /* dr_analyze_innermost already explained the failure. */
3756 if (dump_enabled_p ())
3758 dump_printf_loc (MSG_NOTE
, vect_location
,
3759 "\touter base_address: ");
3760 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3761 STMT_VINFO_DR_BASE_ADDRESS (stmt_info
));
3762 dump_printf (MSG_NOTE
, "\n\touter offset from base address: ");
3763 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3764 STMT_VINFO_DR_OFFSET (stmt_info
));
3765 dump_printf (MSG_NOTE
,
3766 "\n\touter constant offset from base address: ");
3767 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3768 STMT_VINFO_DR_INIT (stmt_info
));
3769 dump_printf (MSG_NOTE
, "\n\touter step: ");
3770 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3771 STMT_VINFO_DR_STEP (stmt_info
));
3772 dump_printf (MSG_NOTE
, "\n\touter base alignment: %d\n",
3773 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info
));
3774 dump_printf (MSG_NOTE
, "\n\touter base misalignment: %d\n",
3775 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info
));
3776 dump_printf (MSG_NOTE
, "\n\touter offset alignment: %d\n",
3777 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info
));
3778 dump_printf (MSG_NOTE
, "\n\touter step alignment: %d\n",
3779 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info
));
3783 if (STMT_VINFO_DATA_REF (stmt_info
))
3785 if (dump_enabled_p ())
3787 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3788 "not vectorized: more than one data ref "
3790 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3793 if (is_a
<bb_vec_info
> (vinfo
))
3796 if (gatherscatter
!= SG_NONE
|| simd_lane_access
)
3801 STMT_VINFO_DATA_REF (stmt_info
) = dr
;
3802 if (simd_lane_access
)
3804 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info
) = true;
3805 free_data_ref (datarefs
[i
]);
3809 if (TREE_CODE (DR_BASE_ADDRESS (dr
)) == ADDR_EXPR
3810 && VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (dr
), 0))
3811 && DECL_NONALIASED (TREE_OPERAND (DR_BASE_ADDRESS (dr
), 0)))
3813 if (dump_enabled_p ())
3815 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3816 "not vectorized: base object not addressable "
3818 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3820 if (is_a
<bb_vec_info
> (vinfo
))
3822 /* In BB vectorization the ref can still participate
3823 in dependence analysis, we just can't vectorize it. */
3824 STMT_VINFO_VECTORIZABLE (stmt_info
) = false;
3830 /* Set vectype for STMT. */
3831 scalar_type
= TREE_TYPE (DR_REF (dr
));
3832 STMT_VINFO_VECTYPE (stmt_info
)
3833 = get_vectype_for_scalar_type (scalar_type
);
3834 if (!STMT_VINFO_VECTYPE (stmt_info
))
3836 if (dump_enabled_p ())
3838 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3839 "not vectorized: no vectype for stmt: ");
3840 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3841 dump_printf (MSG_MISSED_OPTIMIZATION
, " scalar_type: ");
3842 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_DETAILS
,
3844 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
3847 if (is_a
<bb_vec_info
> (vinfo
))
3849 /* No vector type is fine, the ref can still participate
3850 in dependence analysis, we just can't vectorize it. */
3851 STMT_VINFO_VECTORIZABLE (stmt_info
) = false;
3855 if (gatherscatter
!= SG_NONE
|| simd_lane_access
)
3857 STMT_VINFO_DATA_REF (stmt_info
) = NULL
;
3858 if (gatherscatter
!= SG_NONE
)
3865 if (dump_enabled_p ())
3867 dump_printf_loc (MSG_NOTE
, vect_location
,
3868 "got vectype for stmt: ");
3869 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, stmt
, 0);
3870 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3871 STMT_VINFO_VECTYPE (stmt_info
));
3872 dump_printf (MSG_NOTE
, "\n");
3876 /* Adjust the minimal vectorization factor according to the
3878 vf
= TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info
));
3879 *min_vf
= upper_bound (*min_vf
, vf
);
3881 if (gatherscatter
!= SG_NONE
)
3883 gather_scatter_info gs_info
;
3884 if (!vect_check_gather_scatter (stmt
, as_a
<loop_vec_info
> (vinfo
),
3886 || !get_vectype_for_scalar_type (TREE_TYPE (gs_info
.offset
)))
3888 STMT_VINFO_DATA_REF (stmt_info
) = NULL
;
3890 if (dump_enabled_p ())
3892 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3893 (gatherscatter
== GATHER
) ?
3894 "not vectorized: not suitable for gather "
3896 "not vectorized: not suitable for scatter "
3898 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3903 free_data_ref (datarefs
[i
]);
3905 STMT_VINFO_GATHER_SCATTER_P (stmt_info
) = gatherscatter
;
3908 else if (is_a
<loop_vec_info
> (vinfo
)
3909 && TREE_CODE (DR_STEP (dr
)) != INTEGER_CST
)
3911 if (nested_in_vect_loop_p (loop
, stmt
))
3913 if (dump_enabled_p ())
3915 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3916 "not vectorized: not suitable for strided "
3918 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3922 STMT_VINFO_STRIDED_P (stmt_info
) = true;
3926 /* If we stopped analysis at the first dataref we could not analyze
3927 when trying to vectorize a basic-block mark the rest of the datarefs
3928 as not vectorizable and truncate the vector of datarefs. That
3929 avoids spending useless time in analyzing their dependence. */
3930 if (i
!= datarefs
.length ())
3932 gcc_assert (is_a
<bb_vec_info
> (vinfo
));
3933 for (unsigned j
= i
; j
< datarefs
.length (); ++j
)
3935 data_reference_p dr
= datarefs
[j
];
3936 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr
))) = false;
3939 datarefs
.truncate (i
);
3946 /* Function vect_get_new_vect_var.
3948 Returns a name for a new variable. The current naming scheme appends the
3949 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
3950 the name of vectorizer generated variables, and appends that to NAME if
3954 vect_get_new_vect_var (tree type
, enum vect_var_kind var_kind
, const char *name
)
3961 case vect_simple_var
:
3964 case vect_scalar_var
:
3970 case vect_pointer_var
:
3979 char* tmp
= concat (prefix
, "_", name
, NULL
);
3980 new_vect_var
= create_tmp_reg (type
, tmp
);
3984 new_vect_var
= create_tmp_reg (type
, prefix
);
3986 return new_vect_var
;
3989 /* Like vect_get_new_vect_var but return an SSA name. */
3992 vect_get_new_ssa_name (tree type
, enum vect_var_kind var_kind
, const char *name
)
3999 case vect_simple_var
:
4002 case vect_scalar_var
:
4005 case vect_pointer_var
:
4014 char* tmp
= concat (prefix
, "_", name
, NULL
);
4015 new_vect_var
= make_temp_ssa_name (type
, NULL
, tmp
);
4019 new_vect_var
= make_temp_ssa_name (type
, NULL
, prefix
);
4021 return new_vect_var
;
4024 /* Duplicate ptr info and set alignment/misaligment on NAME from DR. */
4027 vect_duplicate_ssa_name_ptr_info (tree name
, data_reference
*dr
)
4029 duplicate_ssa_name_ptr_info (name
, DR_PTR_INFO (dr
));
4030 int misalign
= DR_MISALIGNMENT (dr
);
4031 if (misalign
== DR_MISALIGNMENT_UNKNOWN
)
4032 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name
));
4034 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name
),
4035 DR_TARGET_ALIGNMENT (dr
), misalign
);
4038 /* Function vect_create_addr_base_for_vector_ref.
4040 Create an expression that computes the address of the first memory location
4041 that will be accessed for a data reference.
4044 STMT: The statement containing the data reference.
4045 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4046 OFFSET: Optional. If supplied, it is be added to the initial address.
4047 LOOP: Specify relative to which loop-nest should the address be computed.
4048 For example, when the dataref is in an inner-loop nested in an
4049 outer-loop that is now being vectorized, LOOP can be either the
4050 outer-loop, or the inner-loop. The first memory location accessed
4051 by the following dataref ('in' points to short):
4058 if LOOP=i_loop: &in (relative to i_loop)
4059 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4060 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4061 initial address. Unlike OFFSET, which is number of elements to
4062 be added, BYTE_OFFSET is measured in bytes.
4065 1. Return an SSA_NAME whose value is the address of the memory location of
4066 the first vector of the data reference.
4067 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4068 these statement(s) which define the returned SSA_NAME.
4070 FORNOW: We are only handling array accesses with step 1. */
4073 vect_create_addr_base_for_vector_ref (gimple
*stmt
,
4074 gimple_seq
*new_stmt_list
,
4078 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
4079 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
4080 const char *base_name
;
4083 gimple_seq seq
= NULL
;
4085 tree step
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr
)));
4086 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
4087 innermost_loop_behavior
*drb
= vect_dr_behavior (dr
);
4089 tree data_ref_base
= unshare_expr (drb
->base_address
);
4090 tree base_offset
= unshare_expr (drb
->offset
);
4091 tree init
= unshare_expr (drb
->init
);
4094 base_name
= get_name (data_ref_base
);
4097 base_offset
= ssize_int (0);
4098 init
= ssize_int (0);
4099 base_name
= get_name (DR_REF (dr
));
4102 /* Create base_offset */
4103 base_offset
= size_binop (PLUS_EXPR
,
4104 fold_convert (sizetype
, base_offset
),
4105 fold_convert (sizetype
, init
));
4109 offset
= fold_build2 (MULT_EXPR
, sizetype
,
4110 fold_convert (sizetype
, offset
), step
);
4111 base_offset
= fold_build2 (PLUS_EXPR
, sizetype
,
4112 base_offset
, offset
);
4116 byte_offset
= fold_convert (sizetype
, byte_offset
);
4117 base_offset
= fold_build2 (PLUS_EXPR
, sizetype
,
4118 base_offset
, byte_offset
);
4121 /* base + base_offset */
4123 addr_base
= fold_build_pointer_plus (data_ref_base
, base_offset
);
4126 addr_base
= build1 (ADDR_EXPR
,
4127 build_pointer_type (TREE_TYPE (DR_REF (dr
))),
4128 unshare_expr (DR_REF (dr
)));
4131 vect_ptr_type
= build_pointer_type (STMT_VINFO_VECTYPE (stmt_info
));
4132 dest
= vect_get_new_vect_var (vect_ptr_type
, vect_pointer_var
, base_name
);
4133 addr_base
= force_gimple_operand (addr_base
, &seq
, true, dest
);
4134 gimple_seq_add_seq (new_stmt_list
, seq
);
4136 if (DR_PTR_INFO (dr
)
4137 && TREE_CODE (addr_base
) == SSA_NAME
4138 && !SSA_NAME_PTR_INFO (addr_base
))
4140 vect_duplicate_ssa_name_ptr_info (addr_base
, dr
);
4141 if (offset
|| byte_offset
)
4142 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base
));
4145 if (dump_enabled_p ())
4147 dump_printf_loc (MSG_NOTE
, vect_location
, "created ");
4148 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, addr_base
);
4149 dump_printf (MSG_NOTE
, "\n");
4156 /* Function vect_create_data_ref_ptr.
4158 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4159 location accessed in the loop by STMT, along with the def-use update
4160 chain to appropriately advance the pointer through the loop iterations.
4161 Also set aliasing information for the pointer. This pointer is used by
4162 the callers to this function to create a memory reference expression for
4163 vector load/store access.
4166 1. STMT: a stmt that references memory. Expected to be of the form
4167 GIMPLE_ASSIGN <name, data-ref> or
4168 GIMPLE_ASSIGN <data-ref, name>.
4169 2. AGGR_TYPE: the type of the reference, which should be either a vector
4171 3. AT_LOOP: the loop where the vector memref is to be created.
4172 4. OFFSET (optional): an offset to be added to the initial address accessed
4173 by the data-ref in STMT.
4174 5. BSI: location where the new stmts are to be placed if there is no loop
4175 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4176 pointing to the initial address.
4177 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4178 to the initial address accessed by the data-ref in STMT. This is
4179 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4183 1. Declare a new ptr to vector_type, and have it point to the base of the
4184 data reference (initial addressed accessed by the data reference).
4185 For example, for vector of type V8HI, the following code is generated:
4188 ap = (v8hi *)initial_address;
4190 if OFFSET is not supplied:
4191 initial_address = &a[init];
4192 if OFFSET is supplied:
4193 initial_address = &a[init + OFFSET];
4194 if BYTE_OFFSET is supplied:
4195 initial_address = &a[init] + BYTE_OFFSET;
4197 Return the initial_address in INITIAL_ADDRESS.
4199 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4200 update the pointer in each iteration of the loop.
4202 Return the increment stmt that updates the pointer in PTR_INCR.
4204 3. Set INV_P to true if the access pattern of the data reference in the
4205 vectorized loop is invariant. Set it to false otherwise.
4207 4. Return the pointer. */
4210 vect_create_data_ref_ptr (gimple
*stmt
, tree aggr_type
, struct loop
*at_loop
,
4211 tree offset
, tree
*initial_address
,
4212 gimple_stmt_iterator
*gsi
, gimple
**ptr_incr
,
4213 bool only_init
, bool *inv_p
, tree byte_offset
)
4215 const char *base_name
;
4216 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
4217 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
4218 struct loop
*loop
= NULL
;
4219 bool nested_in_vect_loop
= false;
4220 struct loop
*containing_loop
= NULL
;
4224 gimple_seq new_stmt_list
= NULL
;
4228 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
4230 gimple_stmt_iterator incr_gsi
;
4232 tree indx_before_incr
, indx_after_incr
;
4235 bb_vec_info bb_vinfo
= STMT_VINFO_BB_VINFO (stmt_info
);
4237 gcc_assert (TREE_CODE (aggr_type
) == ARRAY_TYPE
4238 || TREE_CODE (aggr_type
) == VECTOR_TYPE
);
4242 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
4243 nested_in_vect_loop
= nested_in_vect_loop_p (loop
, stmt
);
4244 containing_loop
= (gimple_bb (stmt
))->loop_father
;
4245 pe
= loop_preheader_edge (loop
);
4249 gcc_assert (bb_vinfo
);
4254 /* Check the step (evolution) of the load in LOOP, and record
4255 whether it's invariant. */
4256 step
= vect_dr_behavior (dr
)->step
;
4257 if (integer_zerop (step
))
4262 /* Create an expression for the first address accessed by this load
4264 base_name
= get_name (DR_BASE_ADDRESS (dr
));
4266 if (dump_enabled_p ())
4268 tree dr_base_type
= TREE_TYPE (DR_BASE_OBJECT (dr
));
4269 dump_printf_loc (MSG_NOTE
, vect_location
,
4270 "create %s-pointer variable to type: ",
4271 get_tree_code_name (TREE_CODE (aggr_type
)));
4272 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, aggr_type
);
4273 if (TREE_CODE (dr_base_type
) == ARRAY_TYPE
)
4274 dump_printf (MSG_NOTE
, " vectorizing an array ref: ");
4275 else if (TREE_CODE (dr_base_type
) == VECTOR_TYPE
)
4276 dump_printf (MSG_NOTE
, " vectorizing a vector ref: ");
4277 else if (TREE_CODE (dr_base_type
) == RECORD_TYPE
)
4278 dump_printf (MSG_NOTE
, " vectorizing a record based array ref: ");
4280 dump_printf (MSG_NOTE
, " vectorizing a pointer ref: ");
4281 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_BASE_OBJECT (dr
));
4282 dump_printf (MSG_NOTE
, "\n");
4285 /* (1) Create the new aggregate-pointer variable.
4286 Vector and array types inherit the alias set of their component
4287 type by default so we need to use a ref-all pointer if the data
4288 reference does not conflict with the created aggregated data
4289 reference because it is not addressable. */
4290 bool need_ref_all
= false;
4291 if (!alias_sets_conflict_p (get_alias_set (aggr_type
),
4292 get_alias_set (DR_REF (dr
))))
4293 need_ref_all
= true;
4294 /* Likewise for any of the data references in the stmt group. */
4295 else if (STMT_VINFO_GROUP_SIZE (stmt_info
) > 1)
4297 gimple
*orig_stmt
= STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info
);
4300 stmt_vec_info sinfo
= vinfo_for_stmt (orig_stmt
);
4301 struct data_reference
*sdr
= STMT_VINFO_DATA_REF (sinfo
);
4302 if (!alias_sets_conflict_p (get_alias_set (aggr_type
),
4303 get_alias_set (DR_REF (sdr
))))
4305 need_ref_all
= true;
4308 orig_stmt
= STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo
);
4312 aggr_ptr_type
= build_pointer_type_for_mode (aggr_type
, ptr_mode
,
4314 aggr_ptr
= vect_get_new_vect_var (aggr_ptr_type
, vect_pointer_var
, base_name
);
4317 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4318 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4319 def-use update cycles for the pointer: one relative to the outer-loop
4320 (LOOP), which is what steps (3) and (4) below do. The other is relative
4321 to the inner-loop (which is the inner-most loop containing the dataref),
4322 and this is done be step (5) below.
4324 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4325 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4326 redundant. Steps (3),(4) create the following:
4329 LOOP: vp1 = phi(vp0,vp2)
4335 If there is an inner-loop nested in loop, then step (5) will also be
4336 applied, and an additional update in the inner-loop will be created:
4339 LOOP: vp1 = phi(vp0,vp2)
4341 inner: vp3 = phi(vp1,vp4)
4342 vp4 = vp3 + inner_step
4348 /* (2) Calculate the initial address of the aggregate-pointer, and set
4349 the aggregate-pointer to point to it before the loop. */
4351 /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */
4353 new_temp
= vect_create_addr_base_for_vector_ref (stmt
, &new_stmt_list
,
4354 offset
, byte_offset
);
4359 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, new_stmt_list
);
4360 gcc_assert (!new_bb
);
4363 gsi_insert_seq_before (gsi
, new_stmt_list
, GSI_SAME_STMT
);
4366 *initial_address
= new_temp
;
4367 aggr_ptr_init
= new_temp
;
4369 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4370 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4371 inner-loop nested in LOOP (during outer-loop vectorization). */
4373 /* No update in loop is required. */
4374 if (only_init
&& (!loop_vinfo
|| at_loop
== loop
))
4375 aptr
= aggr_ptr_init
;
4378 /* The step of the aggregate pointer is the type size. */
4379 tree iv_step
= TYPE_SIZE_UNIT (aggr_type
);
4380 /* One exception to the above is when the scalar step of the load in
4381 LOOP is zero. In this case the step here is also zero. */
4383 iv_step
= size_zero_node
;
4384 else if (tree_int_cst_sgn (step
) == -1)
4385 iv_step
= fold_build1 (NEGATE_EXPR
, TREE_TYPE (iv_step
), iv_step
);
4387 standard_iv_increment_position (loop
, &incr_gsi
, &insert_after
);
4389 create_iv (aggr_ptr_init
,
4390 fold_convert (aggr_ptr_type
, iv_step
),
4391 aggr_ptr
, loop
, &incr_gsi
, insert_after
,
4392 &indx_before_incr
, &indx_after_incr
);
4393 incr
= gsi_stmt (incr_gsi
);
4394 set_vinfo_for_stmt (incr
, new_stmt_vec_info (incr
, loop_vinfo
));
4396 /* Copy the points-to information if it exists. */
4397 if (DR_PTR_INFO (dr
))
4399 vect_duplicate_ssa_name_ptr_info (indx_before_incr
, dr
);
4400 vect_duplicate_ssa_name_ptr_info (indx_after_incr
, dr
);
4405 aptr
= indx_before_incr
;
4408 if (!nested_in_vect_loop
|| only_init
)
4412 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4413 nested in LOOP, if exists. */
4415 gcc_assert (nested_in_vect_loop
);
4418 standard_iv_increment_position (containing_loop
, &incr_gsi
,
4420 create_iv (aptr
, fold_convert (aggr_ptr_type
, DR_STEP (dr
)), aggr_ptr
,
4421 containing_loop
, &incr_gsi
, insert_after
, &indx_before_incr
,
4423 incr
= gsi_stmt (incr_gsi
);
4424 set_vinfo_for_stmt (incr
, new_stmt_vec_info (incr
, loop_vinfo
));
4426 /* Copy the points-to information if it exists. */
4427 if (DR_PTR_INFO (dr
))
4429 vect_duplicate_ssa_name_ptr_info (indx_before_incr
, dr
);
4430 vect_duplicate_ssa_name_ptr_info (indx_after_incr
, dr
);
4435 return indx_before_incr
;
4442 /* Function bump_vector_ptr
4444 Increment a pointer (to a vector type) by vector-size. If requested,
4445 i.e. if PTR-INCR is given, then also connect the new increment stmt
4446 to the existing def-use update-chain of the pointer, by modifying
4447 the PTR_INCR as illustrated below:
4449 The pointer def-use update-chain before this function:
4450 DATAREF_PTR = phi (p_0, p_2)
4452 PTR_INCR: p_2 = DATAREF_PTR + step
4454 The pointer def-use update-chain after this function:
4455 DATAREF_PTR = phi (p_0, p_2)
4457 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4459 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4462 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4464 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4465 the loop. The increment amount across iterations is expected
4467 BSI - location where the new update stmt is to be placed.
4468 STMT - the original scalar memory-access stmt that is being vectorized.
4469 BUMP - optional. The offset by which to bump the pointer. If not given,
4470 the offset is assumed to be vector_size.
4472 Output: Return NEW_DATAREF_PTR as illustrated above.
4477 bump_vector_ptr (tree dataref_ptr
, gimple
*ptr_incr
, gimple_stmt_iterator
*gsi
,
4478 gimple
*stmt
, tree bump
)
4480 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
4481 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
4482 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
4483 tree update
= TYPE_SIZE_UNIT (vectype
);
4486 use_operand_p use_p
;
4487 tree new_dataref_ptr
;
4492 if (TREE_CODE (dataref_ptr
) == SSA_NAME
)
4493 new_dataref_ptr
= copy_ssa_name (dataref_ptr
);
4495 new_dataref_ptr
= make_ssa_name (TREE_TYPE (dataref_ptr
));
4496 incr_stmt
= gimple_build_assign (new_dataref_ptr
, POINTER_PLUS_EXPR
,
4497 dataref_ptr
, update
);
4498 vect_finish_stmt_generation (stmt
, incr_stmt
, gsi
);
4500 /* Copy the points-to information if it exists. */
4501 if (DR_PTR_INFO (dr
))
4503 duplicate_ssa_name_ptr_info (new_dataref_ptr
, DR_PTR_INFO (dr
));
4504 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr
));
4508 return new_dataref_ptr
;
4510 /* Update the vector-pointer's cross-iteration increment. */
4511 FOR_EACH_SSA_USE_OPERAND (use_p
, ptr_incr
, iter
, SSA_OP_USE
)
4513 tree use
= USE_FROM_PTR (use_p
);
4515 if (use
== dataref_ptr
)
4516 SET_USE (use_p
, new_dataref_ptr
);
4518 gcc_assert (tree_int_cst_compare (use
, update
) == 0);
4521 return new_dataref_ptr
;
4525 /* Function vect_create_destination_var.
4527 Create a new temporary of type VECTYPE. */
4530 vect_create_destination_var (tree scalar_dest
, tree vectype
)
4536 enum vect_var_kind kind
;
4539 ? VECTOR_BOOLEAN_TYPE_P (vectype
)
4543 type
= vectype
? vectype
: TREE_TYPE (scalar_dest
);
4545 gcc_assert (TREE_CODE (scalar_dest
) == SSA_NAME
);
4547 name
= get_name (scalar_dest
);
4549 new_name
= xasprintf ("%s_%u", name
, SSA_NAME_VERSION (scalar_dest
));
4551 new_name
= xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest
));
4552 vec_dest
= vect_get_new_vect_var (type
, kind
, new_name
);
4558 /* Function vect_grouped_store_supported.
4560 Returns TRUE if interleave high and interleave low permutations
4561 are supported, and FALSE otherwise. */
4564 vect_grouped_store_supported (tree vectype
, unsigned HOST_WIDE_INT count
)
4566 machine_mode mode
= TYPE_MODE (vectype
);
4568 /* vect_permute_store_chain requires the group size to be equal to 3 or
4569 be a power of two. */
4570 if (count
!= 3 && exact_log2 (count
) == -1)
4572 if (dump_enabled_p ())
4573 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4574 "the size of the group of accesses"
4575 " is not a power of 2 or not eqaul to 3\n");
4579 /* Check that the permutation is supported. */
4580 if (VECTOR_MODE_P (mode
))
4582 unsigned int i
, nelt
= GET_MODE_NUNITS (mode
);
4585 unsigned int j0
= 0, j1
= 0, j2
= 0;
4588 vec_perm_builder
sel (nelt
, nelt
, 1);
4589 sel
.quick_grow (nelt
);
4590 vec_perm_indices indices
;
4591 for (j
= 0; j
< 3; j
++)
4593 int nelt0
= ((3 - j
) * nelt
) % 3;
4594 int nelt1
= ((3 - j
) * nelt
+ 1) % 3;
4595 int nelt2
= ((3 - j
) * nelt
+ 2) % 3;
4596 for (i
= 0; i
< nelt
; i
++)
4598 if (3 * i
+ nelt0
< nelt
)
4599 sel
[3 * i
+ nelt0
] = j0
++;
4600 if (3 * i
+ nelt1
< nelt
)
4601 sel
[3 * i
+ nelt1
] = nelt
+ j1
++;
4602 if (3 * i
+ nelt2
< nelt
)
4603 sel
[3 * i
+ nelt2
] = 0;
4605 indices
.new_vector (sel
, 2, nelt
);
4606 if (!can_vec_perm_const_p (mode
, indices
))
4608 if (dump_enabled_p ())
4609 dump_printf (MSG_MISSED_OPTIMIZATION
,
4610 "permutation op not supported by target.\n");
4614 for (i
= 0; i
< nelt
; i
++)
4616 if (3 * i
+ nelt0
< nelt
)
4617 sel
[3 * i
+ nelt0
] = 3 * i
+ nelt0
;
4618 if (3 * i
+ nelt1
< nelt
)
4619 sel
[3 * i
+ nelt1
] = 3 * i
+ nelt1
;
4620 if (3 * i
+ nelt2
< nelt
)
4621 sel
[3 * i
+ nelt2
] = nelt
+ j2
++;
4623 indices
.new_vector (sel
, 2, nelt
);
4624 if (!can_vec_perm_const_p (mode
, indices
))
4626 if (dump_enabled_p ())
4627 dump_printf (MSG_MISSED_OPTIMIZATION
,
4628 "permutation op not supported by target.\n");
4636 /* If length is not equal to 3 then only power of 2 is supported. */
4637 gcc_assert (pow2p_hwi (count
));
4639 /* The encoding has 2 interleaved stepped patterns. */
4640 vec_perm_builder
sel (nelt
, 2, 3);
4642 for (i
= 0; i
< 3; i
++)
4645 sel
[i
* 2 + 1] = i
+ nelt
;
4647 vec_perm_indices
indices (sel
, 2, nelt
);
4648 if (can_vec_perm_const_p (mode
, indices
))
4650 for (i
= 0; i
< 6; i
++)
4652 indices
.new_vector (sel
, 2, nelt
);
4653 if (can_vec_perm_const_p (mode
, indices
))
4659 if (dump_enabled_p ())
4660 dump_printf (MSG_MISSED_OPTIMIZATION
,
4661 "permutaion op not supported by target.\n");
4666 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4670 vect_store_lanes_supported (tree vectype
, unsigned HOST_WIDE_INT count
)
4672 return vect_lanes_optab_supported_p ("vec_store_lanes",
4673 vec_store_lanes_optab
,
4678 /* Function vect_permute_store_chain.
4680 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4681 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4682 the data correctly for the stores. Return the final references for stores
4685 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4686 The input is 4 vectors each containing 8 elements. We assign a number to
4687 each element, the input sequence is:
4689 1st vec: 0 1 2 3 4 5 6 7
4690 2nd vec: 8 9 10 11 12 13 14 15
4691 3rd vec: 16 17 18 19 20 21 22 23
4692 4th vec: 24 25 26 27 28 29 30 31
4694 The output sequence should be:
4696 1st vec: 0 8 16 24 1 9 17 25
4697 2nd vec: 2 10 18 26 3 11 19 27
4698 3rd vec: 4 12 20 28 5 13 21 30
4699 4th vec: 6 14 22 30 7 15 23 31
4701 i.e., we interleave the contents of the four vectors in their order.
4703 We use interleave_high/low instructions to create such output. The input of
4704 each interleave_high/low operation is two vectors:
4707 the even elements of the result vector are obtained left-to-right from the
4708 high/low elements of the first vector. The odd elements of the result are
4709 obtained left-to-right from the high/low elements of the second vector.
4710 The output of interleave_high will be: 0 4 1 5
4711 and of interleave_low: 2 6 3 7
4714 The permutation is done in log LENGTH stages. In each stage interleave_high
4715 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4716 where the first argument is taken from the first half of DR_CHAIN and the
4717 second argument from it's second half.
4720 I1: interleave_high (1st vec, 3rd vec)
4721 I2: interleave_low (1st vec, 3rd vec)
4722 I3: interleave_high (2nd vec, 4th vec)
4723 I4: interleave_low (2nd vec, 4th vec)
4725 The output for the first stage is:
4727 I1: 0 16 1 17 2 18 3 19
4728 I2: 4 20 5 21 6 22 7 23
4729 I3: 8 24 9 25 10 26 11 27
4730 I4: 12 28 13 29 14 30 15 31
4732 The output of the second stage, i.e. the final result is:
4734 I1: 0 8 16 24 1 9 17 25
4735 I2: 2 10 18 26 3 11 19 27
4736 I3: 4 12 20 28 5 13 21 30
4737 I4: 6 14 22 30 7 15 23 31. */
4740 vect_permute_store_chain (vec
<tree
> dr_chain
,
4741 unsigned int length
,
4743 gimple_stmt_iterator
*gsi
,
4744 vec
<tree
> *result_chain
)
4746 tree vect1
, vect2
, high
, low
;
4748 tree vectype
= STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt
));
4749 tree perm_mask_low
, perm_mask_high
;
4751 tree perm3_mask_low
, perm3_mask_high
;
4752 unsigned int i
, n
, log_length
= exact_log2 (length
);
4753 unsigned int j
, nelt
= TYPE_VECTOR_SUBPARTS (vectype
);
4755 result_chain
->quick_grow (length
);
4756 memcpy (result_chain
->address (), dr_chain
.address (),
4757 length
* sizeof (tree
));
4761 unsigned int j0
= 0, j1
= 0, j2
= 0;
4763 vec_perm_builder
sel (nelt
, nelt
, 1);
4764 sel
.quick_grow (nelt
);
4765 vec_perm_indices indices
;
4766 for (j
= 0; j
< 3; j
++)
4768 int nelt0
= ((3 - j
) * nelt
) % 3;
4769 int nelt1
= ((3 - j
) * nelt
+ 1) % 3;
4770 int nelt2
= ((3 - j
) * nelt
+ 2) % 3;
4772 for (i
= 0; i
< nelt
; i
++)
4774 if (3 * i
+ nelt0
< nelt
)
4775 sel
[3 * i
+ nelt0
] = j0
++;
4776 if (3 * i
+ nelt1
< nelt
)
4777 sel
[3 * i
+ nelt1
] = nelt
+ j1
++;
4778 if (3 * i
+ nelt2
< nelt
)
4779 sel
[3 * i
+ nelt2
] = 0;
4781 indices
.new_vector (sel
, 2, nelt
);
4782 perm3_mask_low
= vect_gen_perm_mask_checked (vectype
, indices
);
4784 for (i
= 0; i
< nelt
; i
++)
4786 if (3 * i
+ nelt0
< nelt
)
4787 sel
[3 * i
+ nelt0
] = 3 * i
+ nelt0
;
4788 if (3 * i
+ nelt1
< nelt
)
4789 sel
[3 * i
+ nelt1
] = 3 * i
+ nelt1
;
4790 if (3 * i
+ nelt2
< nelt
)
4791 sel
[3 * i
+ nelt2
] = nelt
+ j2
++;
4793 indices
.new_vector (sel
, 2, nelt
);
4794 perm3_mask_high
= vect_gen_perm_mask_checked (vectype
, indices
);
4796 vect1
= dr_chain
[0];
4797 vect2
= dr_chain
[1];
4799 /* Create interleaving stmt:
4800 low = VEC_PERM_EXPR <vect1, vect2,
4801 {j, nelt, *, j + 1, nelt + j + 1, *,
4802 j + 2, nelt + j + 2, *, ...}> */
4803 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3_low");
4804 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, vect1
,
4805 vect2
, perm3_mask_low
);
4806 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
4809 vect2
= dr_chain
[2];
4810 /* Create interleaving stmt:
4811 low = VEC_PERM_EXPR <vect1, vect2,
4812 {0, 1, nelt + j, 3, 4, nelt + j + 1,
4813 6, 7, nelt + j + 2, ...}> */
4814 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3_high");
4815 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, vect1
,
4816 vect2
, perm3_mask_high
);
4817 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
4818 (*result_chain
)[j
] = data_ref
;
4823 /* If length is not equal to 3 then only power of 2 is supported. */
4824 gcc_assert (pow2p_hwi (length
));
4826 /* The encoding has 2 interleaved stepped patterns. */
4827 vec_perm_builder
sel (nelt
, 2, 3);
4829 for (i
= 0; i
< 3; i
++)
4832 sel
[i
* 2 + 1] = i
+ nelt
;
4834 vec_perm_indices
indices (sel
, 2, nelt
);
4835 perm_mask_high
= vect_gen_perm_mask_checked (vectype
, indices
);
4837 for (i
= 0; i
< 6; i
++)
4839 indices
.new_vector (sel
, 2, nelt
);
4840 perm_mask_low
= vect_gen_perm_mask_checked (vectype
, indices
);
4842 for (i
= 0, n
= log_length
; i
< n
; i
++)
4844 for (j
= 0; j
< length
/2; j
++)
4846 vect1
= dr_chain
[j
];
4847 vect2
= dr_chain
[j
+length
/2];
4849 /* Create interleaving stmt:
4850 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
4852 high
= make_temp_ssa_name (vectype
, NULL
, "vect_inter_high");
4853 perm_stmt
= gimple_build_assign (high
, VEC_PERM_EXPR
, vect1
,
4854 vect2
, perm_mask_high
);
4855 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
4856 (*result_chain
)[2*j
] = high
;
4858 /* Create interleaving stmt:
4859 low = VEC_PERM_EXPR <vect1, vect2,
4860 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
4862 low
= make_temp_ssa_name (vectype
, NULL
, "vect_inter_low");
4863 perm_stmt
= gimple_build_assign (low
, VEC_PERM_EXPR
, vect1
,
4864 vect2
, perm_mask_low
);
4865 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
4866 (*result_chain
)[2*j
+1] = low
;
4868 memcpy (dr_chain
.address (), result_chain
->address (),
4869 length
* sizeof (tree
));
4874 /* Function vect_setup_realignment
4876 This function is called when vectorizing an unaligned load using
4877 the dr_explicit_realign[_optimized] scheme.
4878 This function generates the following code at the loop prolog:
4881 x msq_init = *(floor(p)); # prolog load
4882 realignment_token = call target_builtin;
4884 x msq = phi (msq_init, ---)
4886 The stmts marked with x are generated only for the case of
4887 dr_explicit_realign_optimized.
4889 The code above sets up a new (vector) pointer, pointing to the first
4890 location accessed by STMT, and a "floor-aligned" load using that pointer.
4891 It also generates code to compute the "realignment-token" (if the relevant
4892 target hook was defined), and creates a phi-node at the loop-header bb
4893 whose arguments are the result of the prolog-load (created by this
4894 function) and the result of a load that takes place in the loop (to be
4895 created by the caller to this function).
4897 For the case of dr_explicit_realign_optimized:
4898 The caller to this function uses the phi-result (msq) to create the
4899 realignment code inside the loop, and sets up the missing phi argument,
4902 msq = phi (msq_init, lsq)
4903 lsq = *(floor(p')); # load in loop
4904 result = realign_load (msq, lsq, realignment_token);
4906 For the case of dr_explicit_realign:
4908 msq = *(floor(p)); # load in loop
4910 lsq = *(floor(p')); # load in loop
4911 result = realign_load (msq, lsq, realignment_token);
4914 STMT - (scalar) load stmt to be vectorized. This load accesses
4915 a memory location that may be unaligned.
4916 BSI - place where new code is to be inserted.
4917 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4921 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4922 target hook, if defined.
4923 Return value - the result of the loop-header phi node. */
4926 vect_setup_realignment (gimple
*stmt
, gimple_stmt_iterator
*gsi
,
4927 tree
*realignment_token
,
4928 enum dr_alignment_support alignment_support_scheme
,
4930 struct loop
**at_loop
)
4932 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
4933 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
4934 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
4935 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
4936 struct loop
*loop
= NULL
;
4938 tree scalar_dest
= gimple_assign_lhs (stmt
);
4944 tree msq_init
= NULL_TREE
;
4947 tree msq
= NULL_TREE
;
4948 gimple_seq stmts
= NULL
;
4950 bool compute_in_loop
= false;
4951 bool nested_in_vect_loop
= false;
4952 struct loop
*containing_loop
= (gimple_bb (stmt
))->loop_father
;
4953 struct loop
*loop_for_initial_load
= NULL
;
4957 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
4958 nested_in_vect_loop
= nested_in_vect_loop_p (loop
, stmt
);
4961 gcc_assert (alignment_support_scheme
== dr_explicit_realign
4962 || alignment_support_scheme
== dr_explicit_realign_optimized
);
4964 /* We need to generate three things:
4965 1. the misalignment computation
4966 2. the extra vector load (for the optimized realignment scheme).
4967 3. the phi node for the two vectors from which the realignment is
4968 done (for the optimized realignment scheme). */
4970 /* 1. Determine where to generate the misalignment computation.
4972 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4973 calculation will be generated by this function, outside the loop (in the
4974 preheader). Otherwise, INIT_ADDR had already been computed for us by the
4975 caller, inside the loop.
4977 Background: If the misalignment remains fixed throughout the iterations of
4978 the loop, then both realignment schemes are applicable, and also the
4979 misalignment computation can be done outside LOOP. This is because we are
4980 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4981 are a multiple of VS (the Vector Size), and therefore the misalignment in
4982 different vectorized LOOP iterations is always the same.
4983 The problem arises only if the memory access is in an inner-loop nested
4984 inside LOOP, which is now being vectorized using outer-loop vectorization.
4985 This is the only case when the misalignment of the memory access may not
4986 remain fixed throughout the iterations of the inner-loop (as explained in
4987 detail in vect_supportable_dr_alignment). In this case, not only is the
4988 optimized realignment scheme not applicable, but also the misalignment
4989 computation (and generation of the realignment token that is passed to
4990 REALIGN_LOAD) have to be done inside the loop.
4992 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4993 or not, which in turn determines if the misalignment is computed inside
4994 the inner-loop, or outside LOOP. */
4996 if (init_addr
!= NULL_TREE
|| !loop_vinfo
)
4998 compute_in_loop
= true;
4999 gcc_assert (alignment_support_scheme
== dr_explicit_realign
);
5003 /* 2. Determine where to generate the extra vector load.
5005 For the optimized realignment scheme, instead of generating two vector
5006 loads in each iteration, we generate a single extra vector load in the
5007 preheader of the loop, and in each iteration reuse the result of the
5008 vector load from the previous iteration. In case the memory access is in
5009 an inner-loop nested inside LOOP, which is now being vectorized using
5010 outer-loop vectorization, we need to determine whether this initial vector
5011 load should be generated at the preheader of the inner-loop, or can be
5012 generated at the preheader of LOOP. If the memory access has no evolution
5013 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5014 to be generated inside LOOP (in the preheader of the inner-loop). */
5016 if (nested_in_vect_loop
)
5018 tree outerloop_step
= STMT_VINFO_DR_STEP (stmt_info
);
5019 bool invariant_in_outerloop
=
5020 (tree_int_cst_compare (outerloop_step
, size_zero_node
) == 0);
5021 loop_for_initial_load
= (invariant_in_outerloop
? loop
: loop
->inner
);
5024 loop_for_initial_load
= loop
;
5026 *at_loop
= loop_for_initial_load
;
5028 if (loop_for_initial_load
)
5029 pe
= loop_preheader_edge (loop_for_initial_load
);
5031 /* 3. For the case of the optimized realignment, create the first vector
5032 load at the loop preheader. */
5034 if (alignment_support_scheme
== dr_explicit_realign_optimized
)
5036 /* Create msq_init = *(floor(p1)) in the loop preheader */
5039 gcc_assert (!compute_in_loop
);
5040 vec_dest
= vect_create_destination_var (scalar_dest
, vectype
);
5041 ptr
= vect_create_data_ref_ptr (stmt
, vectype
, loop_for_initial_load
,
5042 NULL_TREE
, &init_addr
, NULL
, &inc
,
5044 if (TREE_CODE (ptr
) == SSA_NAME
)
5045 new_temp
= copy_ssa_name (ptr
);
5047 new_temp
= make_ssa_name (TREE_TYPE (ptr
));
5048 unsigned int align
= DR_TARGET_ALIGNMENT (dr
);
5049 new_stmt
= gimple_build_assign
5050 (new_temp
, BIT_AND_EXPR
, ptr
,
5051 build_int_cst (TREE_TYPE (ptr
), -(HOST_WIDE_INT
) align
));
5052 new_bb
= gsi_insert_on_edge_immediate (pe
, new_stmt
);
5053 gcc_assert (!new_bb
);
5055 = build2 (MEM_REF
, TREE_TYPE (vec_dest
), new_temp
,
5056 build_int_cst (reference_alias_ptr_type (DR_REF (dr
)), 0));
5057 new_stmt
= gimple_build_assign (vec_dest
, data_ref
);
5058 new_temp
= make_ssa_name (vec_dest
, new_stmt
);
5059 gimple_assign_set_lhs (new_stmt
, new_temp
);
5062 new_bb
= gsi_insert_on_edge_immediate (pe
, new_stmt
);
5063 gcc_assert (!new_bb
);
5066 gsi_insert_before (gsi
, new_stmt
, GSI_SAME_STMT
);
5068 msq_init
= gimple_assign_lhs (new_stmt
);
5071 /* 4. Create realignment token using a target builtin, if available.
5072 It is done either inside the containing loop, or before LOOP (as
5073 determined above). */
5075 if (targetm
.vectorize
.builtin_mask_for_load
)
5080 /* Compute INIT_ADDR - the initial addressed accessed by this memref. */
5083 /* Generate the INIT_ADDR computation outside LOOP. */
5084 init_addr
= vect_create_addr_base_for_vector_ref (stmt
, &stmts
,
5088 pe
= loop_preheader_edge (loop
);
5089 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, stmts
);
5090 gcc_assert (!new_bb
);
5093 gsi_insert_seq_before (gsi
, stmts
, GSI_SAME_STMT
);
5096 builtin_decl
= targetm
.vectorize
.builtin_mask_for_load ();
5097 new_stmt
= gimple_build_call (builtin_decl
, 1, init_addr
);
5099 vect_create_destination_var (scalar_dest
,
5100 gimple_call_return_type (new_stmt
));
5101 new_temp
= make_ssa_name (vec_dest
, new_stmt
);
5102 gimple_call_set_lhs (new_stmt
, new_temp
);
5104 if (compute_in_loop
)
5105 gsi_insert_before (gsi
, new_stmt
, GSI_SAME_STMT
);
5108 /* Generate the misalignment computation outside LOOP. */
5109 pe
= loop_preheader_edge (loop
);
5110 new_bb
= gsi_insert_on_edge_immediate (pe
, new_stmt
);
5111 gcc_assert (!new_bb
);
5114 *realignment_token
= gimple_call_lhs (new_stmt
);
5116 /* The result of the CALL_EXPR to this builtin is determined from
5117 the value of the parameter and no global variables are touched
5118 which makes the builtin a "const" function. Requiring the
5119 builtin to have the "const" attribute makes it unnecessary
5120 to call mark_call_clobbered. */
5121 gcc_assert (TREE_READONLY (builtin_decl
));
5124 if (alignment_support_scheme
== dr_explicit_realign
)
5127 gcc_assert (!compute_in_loop
);
5128 gcc_assert (alignment_support_scheme
== dr_explicit_realign_optimized
);
5131 /* 5. Create msq = phi <msq_init, lsq> in loop */
5133 pe
= loop_preheader_edge (containing_loop
);
5134 vec_dest
= vect_create_destination_var (scalar_dest
, vectype
);
5135 msq
= make_ssa_name (vec_dest
);
5136 phi_stmt
= create_phi_node (msq
, containing_loop
->header
);
5137 add_phi_arg (phi_stmt
, msq_init
, pe
, UNKNOWN_LOCATION
);
5143 /* Function vect_grouped_load_supported.
5145 COUNT is the size of the load group (the number of statements plus the
5146 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5147 only one statement, with a gap of COUNT - 1.
5149 Returns true if a suitable permute exists. */
5152 vect_grouped_load_supported (tree vectype
, bool single_element_p
,
5153 unsigned HOST_WIDE_INT count
)
5155 machine_mode mode
= TYPE_MODE (vectype
);
5157 /* If this is single-element interleaving with an element distance
5158 that leaves unused vector loads around punt - we at least create
5159 very sub-optimal code in that case (and blow up memory,
5161 if (single_element_p
&& count
> TYPE_VECTOR_SUBPARTS (vectype
))
5163 if (dump_enabled_p ())
5164 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5165 "single-element interleaving not supported "
5166 "for not adjacent vector loads\n");
5170 /* vect_permute_load_chain requires the group size to be equal to 3 or
5171 be a power of two. */
5172 if (count
!= 3 && exact_log2 (count
) == -1)
5174 if (dump_enabled_p ())
5175 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5176 "the size of the group of accesses"
5177 " is not a power of 2 or not equal to 3\n");
5181 /* Check that the permutation is supported. */
5182 if (VECTOR_MODE_P (mode
))
5184 unsigned int i
, j
, nelt
= GET_MODE_NUNITS (mode
);
5188 vec_perm_builder
sel (nelt
, nelt
, 1);
5189 sel
.quick_grow (nelt
);
5190 vec_perm_indices indices
;
5192 for (k
= 0; k
< 3; k
++)
5194 for (i
= 0; i
< nelt
; i
++)
5195 if (3 * i
+ k
< 2 * nelt
)
5199 indices
.new_vector (sel
, 2, nelt
);
5200 if (!can_vec_perm_const_p (mode
, indices
))
5202 if (dump_enabled_p ())
5203 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5204 "shuffle of 3 loads is not supported by"
5208 for (i
= 0, j
= 0; i
< nelt
; i
++)
5209 if (3 * i
+ k
< 2 * nelt
)
5212 sel
[i
] = nelt
+ ((nelt
+ k
) % 3) + 3 * (j
++);
5213 indices
.new_vector (sel
, 2, nelt
);
5214 if (!can_vec_perm_const_p (mode
, indices
))
5216 if (dump_enabled_p ())
5217 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5218 "shuffle of 3 loads is not supported by"
5227 /* If length is not equal to 3 then only power of 2 is supported. */
5228 gcc_assert (pow2p_hwi (count
));
5230 /* The encoding has a single stepped pattern. */
5231 vec_perm_builder
sel (nelt
, 1, 3);
5233 for (i
= 0; i
< 3; i
++)
5235 vec_perm_indices
indices (sel
, 2, nelt
);
5236 if (can_vec_perm_const_p (mode
, indices
))
5238 for (i
= 0; i
< 3; i
++)
5240 indices
.new_vector (sel
, 2, nelt
);
5241 if (can_vec_perm_const_p (mode
, indices
))
5247 if (dump_enabled_p ())
5248 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5249 "extract even/odd not supported by target\n");
5253 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
5257 vect_load_lanes_supported (tree vectype
, unsigned HOST_WIDE_INT count
)
5259 return vect_lanes_optab_supported_p ("vec_load_lanes",
5260 vec_load_lanes_optab
,
5264 /* Function vect_permute_load_chain.
5266 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5267 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5268 the input data correctly. Return the final references for loads in
5271 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5272 The input is 4 vectors each containing 8 elements. We assign a number to each
5273 element, the input sequence is:
5275 1st vec: 0 1 2 3 4 5 6 7
5276 2nd vec: 8 9 10 11 12 13 14 15
5277 3rd vec: 16 17 18 19 20 21 22 23
5278 4th vec: 24 25 26 27 28 29 30 31
5280 The output sequence should be:
5282 1st vec: 0 4 8 12 16 20 24 28
5283 2nd vec: 1 5 9 13 17 21 25 29
5284 3rd vec: 2 6 10 14 18 22 26 30
5285 4th vec: 3 7 11 15 19 23 27 31
5287 i.e., the first output vector should contain the first elements of each
5288 interleaving group, etc.
5290 We use extract_even/odd instructions to create such output. The input of
5291 each extract_even/odd operation is two vectors
5295 and the output is the vector of extracted even/odd elements. The output of
5296 extract_even will be: 0 2 4 6
5297 and of extract_odd: 1 3 5 7
5300 The permutation is done in log LENGTH stages. In each stage extract_even
5301 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5302 their order. In our example,
5304 E1: extract_even (1st vec, 2nd vec)
5305 E2: extract_odd (1st vec, 2nd vec)
5306 E3: extract_even (3rd vec, 4th vec)
5307 E4: extract_odd (3rd vec, 4th vec)
5309 The output for the first stage will be:
5311 E1: 0 2 4 6 8 10 12 14
5312 E2: 1 3 5 7 9 11 13 15
5313 E3: 16 18 20 22 24 26 28 30
5314 E4: 17 19 21 23 25 27 29 31
5316 In order to proceed and create the correct sequence for the next stage (or
5317 for the correct output, if the second stage is the last one, as in our
5318 example), we first put the output of extract_even operation and then the
5319 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5320 The input for the second stage is:
5322 1st vec (E1): 0 2 4 6 8 10 12 14
5323 2nd vec (E3): 16 18 20 22 24 26 28 30
5324 3rd vec (E2): 1 3 5 7 9 11 13 15
5325 4th vec (E4): 17 19 21 23 25 27 29 31
5327 The output of the second stage:
5329 E1: 0 4 8 12 16 20 24 28
5330 E2: 2 6 10 14 18 22 26 30
5331 E3: 1 5 9 13 17 21 25 29
5332 E4: 3 7 11 15 19 23 27 31
5334 And RESULT_CHAIN after reordering:
5336 1st vec (E1): 0 4 8 12 16 20 24 28
5337 2nd vec (E3): 1 5 9 13 17 21 25 29
5338 3rd vec (E2): 2 6 10 14 18 22 26 30
5339 4th vec (E4): 3 7 11 15 19 23 27 31. */
5342 vect_permute_load_chain (vec
<tree
> dr_chain
,
5343 unsigned int length
,
5345 gimple_stmt_iterator
*gsi
,
5346 vec
<tree
> *result_chain
)
5348 tree data_ref
, first_vect
, second_vect
;
5349 tree perm_mask_even
, perm_mask_odd
;
5350 tree perm3_mask_low
, perm3_mask_high
;
5352 tree vectype
= STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt
));
5353 unsigned int i
, j
, log_length
= exact_log2 (length
);
5354 unsigned nelt
= TYPE_VECTOR_SUBPARTS (vectype
);
5356 result_chain
->quick_grow (length
);
5357 memcpy (result_chain
->address (), dr_chain
.address (),
5358 length
* sizeof (tree
));
5364 vec_perm_builder
sel (nelt
, nelt
, 1);
5365 sel
.quick_grow (nelt
);
5366 vec_perm_indices indices
;
5367 for (k
= 0; k
< 3; k
++)
5369 for (i
= 0; i
< nelt
; i
++)
5370 if (3 * i
+ k
< 2 * nelt
)
5374 indices
.new_vector (sel
, 2, nelt
);
5375 perm3_mask_low
= vect_gen_perm_mask_checked (vectype
, indices
);
5377 for (i
= 0, j
= 0; i
< nelt
; i
++)
5378 if (3 * i
+ k
< 2 * nelt
)
5381 sel
[i
] = nelt
+ ((nelt
+ k
) % 3) + 3 * (j
++);
5382 indices
.new_vector (sel
, 2, nelt
);
5383 perm3_mask_high
= vect_gen_perm_mask_checked (vectype
, indices
);
5385 first_vect
= dr_chain
[0];
5386 second_vect
= dr_chain
[1];
5388 /* Create interleaving stmt (low part of):
5389 low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5391 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3_low");
5392 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, first_vect
,
5393 second_vect
, perm3_mask_low
);
5394 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5396 /* Create interleaving stmt (high part of):
5397 high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5399 first_vect
= data_ref
;
5400 second_vect
= dr_chain
[2];
5401 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3_high");
5402 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, first_vect
,
5403 second_vect
, perm3_mask_high
);
5404 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5405 (*result_chain
)[k
] = data_ref
;
5410 /* If length is not equal to 3 then only power of 2 is supported. */
5411 gcc_assert (pow2p_hwi (length
));
5413 /* The encoding has a single stepped pattern. */
5414 vec_perm_builder
sel (nelt
, 1, 3);
5416 for (i
= 0; i
< 3; ++i
)
5418 vec_perm_indices
indices (sel
, 2, nelt
);
5419 perm_mask_even
= vect_gen_perm_mask_checked (vectype
, indices
);
5421 for (i
= 0; i
< 3; ++i
)
5423 indices
.new_vector (sel
, 2, nelt
);
5424 perm_mask_odd
= vect_gen_perm_mask_checked (vectype
, indices
);
5426 for (i
= 0; i
< log_length
; i
++)
5428 for (j
= 0; j
< length
; j
+= 2)
5430 first_vect
= dr_chain
[j
];
5431 second_vect
= dr_chain
[j
+1];
5433 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5434 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_perm_even");
5435 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5436 first_vect
, second_vect
,
5438 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5439 (*result_chain
)[j
/2] = data_ref
;
5441 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5442 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_perm_odd");
5443 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5444 first_vect
, second_vect
,
5446 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5447 (*result_chain
)[j
/2+length
/2] = data_ref
;
5449 memcpy (dr_chain
.address (), result_chain
->address (),
5450 length
* sizeof (tree
));
5455 /* Function vect_shift_permute_load_chain.
5457 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5458 sequence of stmts to reorder the input data accordingly.
5459 Return the final references for loads in RESULT_CHAIN.
5460 Return true if successed, false otherwise.
5462 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5463 The input is 3 vectors each containing 8 elements. We assign a
5464 number to each element, the input sequence is:
5466 1st vec: 0 1 2 3 4 5 6 7
5467 2nd vec: 8 9 10 11 12 13 14 15
5468 3rd vec: 16 17 18 19 20 21 22 23
5470 The output sequence should be:
5472 1st vec: 0 3 6 9 12 15 18 21
5473 2nd vec: 1 4 7 10 13 16 19 22
5474 3rd vec: 2 5 8 11 14 17 20 23
5476 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5478 First we shuffle all 3 vectors to get correct elements order:
5480 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
5481 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
5482 3rd vec: (16 19 22) (17 20 23) (18 21)
5484 Next we unite and shift vector 3 times:
5487 shift right by 6 the concatenation of:
5488 "1st vec" and "2nd vec"
5489 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5490 "2nd vec" and "3rd vec"
5491 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5492 "3rd vec" and "1st vec"
5493 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
5496 So that now new vectors are:
5498 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
5499 2nd vec: (10 13) (16 19 22) (17 20 23)
5500 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
5503 shift right by 5 the concatenation of:
5504 "1st vec" and "3rd vec"
5505 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
5506 "2nd vec" and "1st vec"
5507 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
5508 "3rd vec" and "2nd vec"
5509 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
5512 So that now new vectors are:
5514 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
5515 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
5516 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
5519 shift right by 5 the concatenation of:
5520 "1st vec" and "1st vec"
5521 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
5522 shift right by 3 the concatenation of:
5523 "2nd vec" and "2nd vec"
5524 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
5527 So that now all vectors are READY:
5528 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
5529 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
5530 3rd vec: ( 1 4 7) (10 13) (16 19 22)
5532 This algorithm is faster than one in vect_permute_load_chain if:
5533 1. "shift of a concatination" is faster than general permutation.
5535 2. The TARGET machine can't execute vector instructions in parallel.
5536 This is because each step of the algorithm depends on previous.
5537 The algorithm in vect_permute_load_chain is much more parallel.
5539 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
5543 vect_shift_permute_load_chain (vec
<tree
> dr_chain
,
5544 unsigned int length
,
5546 gimple_stmt_iterator
*gsi
,
5547 vec
<tree
> *result_chain
)
5549 tree vect
[3], vect_shift
[3], data_ref
, first_vect
, second_vect
;
5550 tree perm2_mask1
, perm2_mask2
, perm3_mask
;
5551 tree select_mask
, shift1_mask
, shift2_mask
, shift3_mask
, shift4_mask
;
5554 tree vectype
= STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt
));
5556 unsigned nelt
= TYPE_VECTOR_SUBPARTS (vectype
);
5557 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
5558 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
5560 unsigned HOST_WIDE_INT vf
;
5561 if (!LOOP_VINFO_VECT_FACTOR (loop_vinfo
).is_constant (&vf
))
5562 /* Not supported for variable-length vectors. */
5565 vec_perm_builder
sel (nelt
, nelt
, 1);
5566 sel
.quick_grow (nelt
);
5568 result_chain
->quick_grow (length
);
5569 memcpy (result_chain
->address (), dr_chain
.address (),
5570 length
* sizeof (tree
));
5572 if (pow2p_hwi (length
) && vf
> 4)
5574 unsigned int j
, log_length
= exact_log2 (length
);
5575 for (i
= 0; i
< nelt
/ 2; ++i
)
5577 for (i
= 0; i
< nelt
/ 2; ++i
)
5578 sel
[nelt
/ 2 + i
] = i
* 2 + 1;
5579 vec_perm_indices
indices (sel
, 2, nelt
);
5580 if (!can_vec_perm_const_p (TYPE_MODE (vectype
), indices
))
5582 if (dump_enabled_p ())
5583 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5584 "shuffle of 2 fields structure is not \
5585 supported by target\n");
5588 perm2_mask1
= vect_gen_perm_mask_checked (vectype
, indices
);
5590 for (i
= 0; i
< nelt
/ 2; ++i
)
5592 for (i
= 0; i
< nelt
/ 2; ++i
)
5593 sel
[nelt
/ 2 + i
] = i
* 2;
5594 indices
.new_vector (sel
, 2, nelt
);
5595 if (!can_vec_perm_const_p (TYPE_MODE (vectype
), indices
))
5597 if (dump_enabled_p ())
5598 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5599 "shuffle of 2 fields structure is not \
5600 supported by target\n");
5603 perm2_mask2
= vect_gen_perm_mask_checked (vectype
, indices
);
5605 /* Generating permutation constant to shift all elements.
5606 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
5607 for (i
= 0; i
< nelt
; i
++)
5608 sel
[i
] = nelt
/ 2 + i
;
5609 indices
.new_vector (sel
, 2, nelt
);
5610 if (!can_vec_perm_const_p (TYPE_MODE (vectype
), indices
))
5612 if (dump_enabled_p ())
5613 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5614 "shift permutation is not supported by target\n");
5617 shift1_mask
= vect_gen_perm_mask_checked (vectype
, indices
);
5619 /* Generating permutation constant to select vector from 2.
5620 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
5621 for (i
= 0; i
< nelt
/ 2; i
++)
5623 for (i
= nelt
/ 2; i
< nelt
; i
++)
5625 indices
.new_vector (sel
, 2, nelt
);
5626 if (!can_vec_perm_const_p (TYPE_MODE (vectype
), indices
))
5628 if (dump_enabled_p ())
5629 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5630 "select is not supported by target\n");
5633 select_mask
= vect_gen_perm_mask_checked (vectype
, indices
);
5635 for (i
= 0; i
< log_length
; i
++)
5637 for (j
= 0; j
< length
; j
+= 2)
5639 first_vect
= dr_chain
[j
];
5640 second_vect
= dr_chain
[j
+ 1];
5642 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle2");
5643 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5644 first_vect
, first_vect
,
5646 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5649 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle2");
5650 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5651 second_vect
, second_vect
,
5653 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5656 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shift");
5657 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5658 vect
[0], vect
[1], shift1_mask
);
5659 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5660 (*result_chain
)[j
/2 + length
/2] = data_ref
;
5662 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_select");
5663 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5664 vect
[0], vect
[1], select_mask
);
5665 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5666 (*result_chain
)[j
/2] = data_ref
;
5668 memcpy (dr_chain
.address (), result_chain
->address (),
5669 length
* sizeof (tree
));
5673 if (length
== 3 && vf
> 2)
5675 unsigned int k
= 0, l
= 0;
5677 /* Generating permutation constant to get all elements in rigth order.
5678 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
5679 for (i
= 0; i
< nelt
; i
++)
5681 if (3 * k
+ (l
% 3) >= nelt
)
5684 l
+= (3 - (nelt
% 3));
5686 sel
[i
] = 3 * k
+ (l
% 3);
5689 vec_perm_indices
indices (sel
, 2, nelt
);
5690 if (!can_vec_perm_const_p (TYPE_MODE (vectype
), indices
))
5692 if (dump_enabled_p ())
5693 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5694 "shuffle of 3 fields structure is not \
5695 supported by target\n");
5698 perm3_mask
= vect_gen_perm_mask_checked (vectype
, indices
);
5700 /* Generating permutation constant to shift all elements.
5701 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
5702 for (i
= 0; i
< nelt
; i
++)
5703 sel
[i
] = 2 * (nelt
/ 3) + (nelt
% 3) + i
;
5704 indices
.new_vector (sel
, 2, nelt
);
5705 if (!can_vec_perm_const_p (TYPE_MODE (vectype
), indices
))
5707 if (dump_enabled_p ())
5708 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5709 "shift permutation is not supported by target\n");
5712 shift1_mask
= vect_gen_perm_mask_checked (vectype
, indices
);
5714 /* Generating permutation constant to shift all elements.
5715 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5716 for (i
= 0; i
< nelt
; i
++)
5717 sel
[i
] = 2 * (nelt
/ 3) + 1 + i
;
5718 indices
.new_vector (sel
, 2, nelt
);
5719 if (!can_vec_perm_const_p (TYPE_MODE (vectype
), indices
))
5721 if (dump_enabled_p ())
5722 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5723 "shift permutation is not supported by target\n");
5726 shift2_mask
= vect_gen_perm_mask_checked (vectype
, indices
);
5728 /* Generating permutation constant to shift all elements.
5729 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
5730 for (i
= 0; i
< nelt
; i
++)
5731 sel
[i
] = (nelt
/ 3) + (nelt
% 3) / 2 + i
;
5732 indices
.new_vector (sel
, 2, nelt
);
5733 if (!can_vec_perm_const_p (TYPE_MODE (vectype
), indices
))
5735 if (dump_enabled_p ())
5736 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5737 "shift permutation is not supported by target\n");
5740 shift3_mask
= vect_gen_perm_mask_checked (vectype
, indices
);
5742 /* Generating permutation constant to shift all elements.
5743 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5744 for (i
= 0; i
< nelt
; i
++)
5745 sel
[i
] = 2 * (nelt
/ 3) + (nelt
% 3) / 2 + i
;
5746 indices
.new_vector (sel
, 2, nelt
);
5747 if (!can_vec_perm_const_p (TYPE_MODE (vectype
), indices
))
5749 if (dump_enabled_p ())
5750 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5751 "shift permutation is not supported by target\n");
5754 shift4_mask
= vect_gen_perm_mask_checked (vectype
, indices
);
5756 for (k
= 0; k
< 3; k
++)
5758 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3");
5759 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5760 dr_chain
[k
], dr_chain
[k
],
5762 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5766 for (k
= 0; k
< 3; k
++)
5768 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shift1");
5769 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5770 vect
[k
% 3], vect
[(k
+ 1) % 3],
5772 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5773 vect_shift
[k
] = data_ref
;
5776 for (k
= 0; k
< 3; k
++)
5778 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shift2");
5779 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5780 vect_shift
[(4 - k
) % 3],
5781 vect_shift
[(3 - k
) % 3],
5783 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5787 (*result_chain
)[3 - (nelt
% 3)] = vect
[2];
5789 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shift3");
5790 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, vect
[0],
5791 vect
[0], shift3_mask
);
5792 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5793 (*result_chain
)[nelt
% 3] = data_ref
;
5795 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shift4");
5796 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, vect
[1],
5797 vect
[1], shift4_mask
);
5798 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5799 (*result_chain
)[0] = data_ref
;
5805 /* Function vect_transform_grouped_load.
5807 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5808 to perform their permutation and ascribe the result vectorized statements to
5809 the scalar statements.
5813 vect_transform_grouped_load (gimple
*stmt
, vec
<tree
> dr_chain
, int size
,
5814 gimple_stmt_iterator
*gsi
)
5817 vec
<tree
> result_chain
= vNULL
;
5819 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5820 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
5821 vectors, that are ready for vector computation. */
5822 result_chain
.create (size
);
5824 /* If reassociation width for vector type is 2 or greater target machine can
5825 execute 2 or more vector instructions in parallel. Otherwise try to
5826 get chain for loads group using vect_shift_permute_load_chain. */
5827 mode
= TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt
)));
5828 if (targetm
.sched
.reassociation_width (VEC_PERM_EXPR
, mode
) > 1
5830 || !vect_shift_permute_load_chain (dr_chain
, size
, stmt
,
5831 gsi
, &result_chain
))
5832 vect_permute_load_chain (dr_chain
, size
, stmt
, gsi
, &result_chain
);
5833 vect_record_grouped_load_vectors (stmt
, result_chain
);
5834 result_chain
.release ();
5837 /* RESULT_CHAIN contains the output of a group of grouped loads that were
5838 generated as part of the vectorization of STMT. Assign the statement
5839 for each vector to the associated scalar statement. */
5842 vect_record_grouped_load_vectors (gimple
*stmt
, vec
<tree
> result_chain
)
5844 gimple
*first_stmt
= GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
));
5845 gimple
*next_stmt
, *new_stmt
;
5846 unsigned int i
, gap_count
;
5849 /* Put a permuted data-ref in the VECTORIZED_STMT field.
5850 Since we scan the chain starting from it's first node, their order
5851 corresponds the order of data-refs in RESULT_CHAIN. */
5852 next_stmt
= first_stmt
;
5854 FOR_EACH_VEC_ELT (result_chain
, i
, tmp_data_ref
)
5859 /* Skip the gaps. Loads created for the gaps will be removed by dead
5860 code elimination pass later. No need to check for the first stmt in
5861 the group, since it always exists.
5862 GROUP_GAP is the number of steps in elements from the previous
5863 access (if there is no gap GROUP_GAP is 1). We skip loads that
5864 correspond to the gaps. */
5865 if (next_stmt
!= first_stmt
5866 && gap_count
< GROUP_GAP (vinfo_for_stmt (next_stmt
)))
5874 new_stmt
= SSA_NAME_DEF_STMT (tmp_data_ref
);
5875 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5876 copies, and we put the new vector statement in the first available
5878 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt
)))
5879 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt
)) = new_stmt
;
5882 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt
)))
5885 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt
));
5887 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt
));
5890 prev_stmt
= rel_stmt
;
5892 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt
));
5895 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt
)) =
5900 next_stmt
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt
));
5902 /* If NEXT_STMT accesses the same DR as the previous statement,
5903 put the same TMP_DATA_REF as its vectorized statement; otherwise
5904 get the next data-ref from RESULT_CHAIN. */
5905 if (!next_stmt
|| !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt
)))
5911 /* Function vect_force_dr_alignment_p.
5913 Returns whether the alignment of a DECL can be forced to be aligned
5914 on ALIGNMENT bit boundary. */
5917 vect_can_force_dr_alignment_p (const_tree decl
, unsigned int alignment
)
5922 if (decl_in_symtab_p (decl
)
5923 && !symtab_node::get (decl
)->can_increase_alignment_p ())
5926 if (TREE_STATIC (decl
))
5927 return (alignment
<= MAX_OFILE_ALIGNMENT
);
5929 return (alignment
<= MAX_STACK_ALIGNMENT
);
5933 /* Return whether the data reference DR is supported with respect to its
5935 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
5936 it is aligned, i.e., check if it is possible to vectorize it with different
5939 enum dr_alignment_support
5940 vect_supportable_dr_alignment (struct data_reference
*dr
,
5941 bool check_aligned_accesses
)
5943 gimple
*stmt
= DR_STMT (dr
);
5944 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
5945 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
5946 machine_mode mode
= TYPE_MODE (vectype
);
5947 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
5948 struct loop
*vect_loop
= NULL
;
5949 bool nested_in_vect_loop
= false;
5951 if (aligned_access_p (dr
) && !check_aligned_accesses
)
5954 /* For now assume all conditional loads/stores support unaligned
5955 access without any special code. */
5956 if (is_gimple_call (stmt
)
5957 && gimple_call_internal_p (stmt
)
5958 && (gimple_call_internal_fn (stmt
) == IFN_MASK_LOAD
5959 || gimple_call_internal_fn (stmt
) == IFN_MASK_STORE
))
5960 return dr_unaligned_supported
;
5964 vect_loop
= LOOP_VINFO_LOOP (loop_vinfo
);
5965 nested_in_vect_loop
= nested_in_vect_loop_p (vect_loop
, stmt
);
5968 /* Possibly unaligned access. */
5970 /* We can choose between using the implicit realignment scheme (generating
5971 a misaligned_move stmt) and the explicit realignment scheme (generating
5972 aligned loads with a REALIGN_LOAD). There are two variants to the
5973 explicit realignment scheme: optimized, and unoptimized.
5974 We can optimize the realignment only if the step between consecutive
5975 vector loads is equal to the vector size. Since the vector memory
5976 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
5977 is guaranteed that the misalignment amount remains the same throughout the
5978 execution of the vectorized loop. Therefore, we can create the
5979 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
5980 at the loop preheader.
5982 However, in the case of outer-loop vectorization, when vectorizing a
5983 memory access in the inner-loop nested within the LOOP that is now being
5984 vectorized, while it is guaranteed that the misalignment of the
5985 vectorized memory access will remain the same in different outer-loop
5986 iterations, it is *not* guaranteed that is will remain the same throughout
5987 the execution of the inner-loop. This is because the inner-loop advances
5988 with the original scalar step (and not in steps of VS). If the inner-loop
5989 step happens to be a multiple of VS, then the misalignment remains fixed
5990 and we can use the optimized realignment scheme. For example:
5996 When vectorizing the i-loop in the above example, the step between
5997 consecutive vector loads is 1, and so the misalignment does not remain
5998 fixed across the execution of the inner-loop, and the realignment cannot
5999 be optimized (as illustrated in the following pseudo vectorized loop):
6001 for (i=0; i<N; i+=4)
6002 for (j=0; j<M; j++){
6003 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6004 // when j is {0,1,2,3,4,5,6,7,...} respectively.
6005 // (assuming that we start from an aligned address).
6008 We therefore have to use the unoptimized realignment scheme:
6010 for (i=0; i<N; i+=4)
6011 for (j=k; j<M; j+=4)
6012 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6013 // that the misalignment of the initial address is
6016 The loop can then be vectorized as follows:
6018 for (k=0; k<4; k++){
6019 rt = get_realignment_token (&vp[k]);
6020 for (i=0; i<N; i+=4){
6022 for (j=k; j<M; j+=4){
6024 va = REALIGN_LOAD <v1,v2,rt>;
6031 if (DR_IS_READ (dr
))
6033 bool is_packed
= false;
6034 tree type
= (TREE_TYPE (DR_REF (dr
)));
6036 if (optab_handler (vec_realign_load_optab
, mode
) != CODE_FOR_nothing
6037 && (!targetm
.vectorize
.builtin_mask_for_load
6038 || targetm
.vectorize
.builtin_mask_for_load ()))
6040 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
6042 /* If we are doing SLP then the accesses need not have the
6043 same alignment, instead it depends on the SLP group size. */
6045 && STMT_SLP_TYPE (stmt_info
)
6046 && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo
)
6047 * GROUP_SIZE (vinfo_for_stmt
6048 (GROUP_FIRST_ELEMENT (stmt_info
))),
6049 TYPE_VECTOR_SUBPARTS (vectype
)))
6051 else if (!loop_vinfo
6052 || (nested_in_vect_loop
6053 && (TREE_INT_CST_LOW (DR_STEP (dr
))
6054 != GET_MODE_SIZE (TYPE_MODE (vectype
)))))
6055 return dr_explicit_realign
;
6057 return dr_explicit_realign_optimized
;
6059 if (!known_alignment_for_access_p (dr
))
6060 is_packed
= not_size_aligned (DR_REF (dr
));
6062 if (targetm
.vectorize
.support_vector_misalignment
6063 (mode
, type
, DR_MISALIGNMENT (dr
), is_packed
))
6064 /* Can't software pipeline the loads, but can at least do them. */
6065 return dr_unaligned_supported
;
6069 bool is_packed
= false;
6070 tree type
= (TREE_TYPE (DR_REF (dr
)));
6072 if (!known_alignment_for_access_p (dr
))
6073 is_packed
= not_size_aligned (DR_REF (dr
));
6075 if (targetm
.vectorize
.support_vector_misalignment
6076 (mode
, type
, DR_MISALIGNMENT (dr
), is_packed
))
6077 return dr_unaligned_supported
;
6081 return dr_unaligned_unsupported
;