/* Data References Analysis and Manipulation Utilities for Vectorization.
   Copyright (C) 2003-2018 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
#include "coretypes.h"
#include "optabs-tree.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "gimple-iterator.h"
#include "gimplify-me.h"
#include "tree-ssa-loop-ivopts.h"
#include "tree-ssa-loop-manip.h"
#include "tree-ssa-loop.h"
#include "tree-scalar-evolution.h"
#include "tree-vectorizer.h"
#include "tree-hash-traits.h"
#include "vec-perm-indices.h"
#include "internal-fn.h"
/* Return true if load- or store-lanes optab OPTAB is implemented for
   COUNT vectors of type VECTYPE.  NAME is the name of OPTAB.  */

static bool
vect_lanes_optab_supported_p (const char *name, convert_optab optab,
                              tree vectype, unsigned HOST_WIDE_INT count)
{
  machine_mode mode, array_mode;
  bool limit_p;

  mode = TYPE_MODE (vectype);
  if (!targetm.array_mode (mode, count).exists (&array_mode))
    {
      poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
      limit_p = !targetm.array_mode_supported_p (mode, count);
      if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "no array mode for %s["
                             HOST_WIDE_INT_PRINT_DEC "]\n",
                             GET_MODE_NAME (mode), count);
          return false;
        }
    }

  if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "cannot use %s<%s><%s>\n", name,
                         GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
      return false;
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
                     GET_MODE_NAME (mode));

  return true;
}
/* Return the smallest scalar part of STMT.
   This is used to determine the vectype of the stmt.  We generally set the
   vectype according to the type of the result (lhs).  For stmts whose
   result-type is different than the type of the arguments (e.g., demotion,
   promotion), vectype will be reset appropriately (later).  Note that we have
   to visit the smallest datatype in this function, because that determines the
   VF.  If the smallest datatype in the loop is present only as the rhs of a
   promotion operation - we'd miss it.
   Such a case, where a variable of this datatype does not appear in the lhs
   anywhere in the loop, can only occur if it's an invariant: e.g.:
   'int_x = (int) short_inv', which we'd expect to have been optimized away by
   invariant motion.  However, we cannot rely on invariant motion to always
   take invariants out of the loop, and so in the case of promotion we also
   have to check the rhs.

   LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
   types.  */
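/* For instance (an illustrative sketch):

     for (i = 0; i < n; i++)
       int_out[i] = (int) short_inv;

   Here the short type never appears on a lhs inside the loop, yet it is the
   type that has to determine the vectorization factor, which is why the rhs
   of promotions is inspected as well.  */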
tree
vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
                               HOST_WIDE_INT *rhs_size_unit)
{
  tree scalar_type = gimple_expr_type (stmt);
  HOST_WIDE_INT lhs, rhs;

  /* During the analysis phase, this function is called on arbitrary
     statements that might not have scalar results.  */
  if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
    return scalar_type;

  lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));

  if (is_gimple_assign (stmt)
      && (gimple_assign_cast_p (stmt)
          || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
          || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
          || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
    {
      tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));

      rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
      if (rhs < lhs)
        scalar_type = rhs_type;
    }

  *lhs_size_unit = lhs;
  *rhs_size_unit = rhs;
  return scalar_type;
}
/* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
   tested at run-time.  Return TRUE if DDR was successfully inserted.
   Return false if versioning is not supported.  */

static bool
vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

  if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
    return false;

  if (!runtime_alias_check_p (ddr, loop,
                              optimize_loop_nest_for_speed_p (loop)))
    return false;

  LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
  return true;
}
/* Record that loop LOOP_VINFO needs to check that VALUE is nonzero.  */

static void
vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
{
  vec<tree> checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
  for (unsigned int i = 0; i < checks.length (); ++i)
    if (checks[i] == value)
      return;

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "need run-time check that ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, value);
      dump_printf (MSG_NOTE, " is nonzero\n");
    }
  LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
}
/* Return true if we know that the order of vectorized STMT_A and
   vectorized STMT_B will be the same as the order of STMT_A and STMT_B.
   At least one of the statements is a write.  */

static bool
vect_preserves_scalar_order_p (gimple *stmt_a, gimple *stmt_b)
{
  stmt_vec_info stmtinfo_a = vinfo_for_stmt (stmt_a);
  stmt_vec_info stmtinfo_b = vinfo_for_stmt (stmt_b);

  /* Single statements are always kept in their original order.  */
  if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
      && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
    return true;

  /* STMT_A and STMT_B belong to overlapping groups.  All loads in a
     group are emitted at the position of the first scalar load and all
     stores in a group are emitted at the position of the last scalar store.
     Thus writes will happen no earlier than their current position
     (but could happen later) while reads will happen no later than their
     current position (but could happen earlier).  Reordering is therefore
     only possible if the first access is a write.  */
  gimple *earlier_stmt = get_earlier_stmt (stmt_a, stmt_b);
  return !DR_IS_WRITE (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt)));
}
/* A subroutine of vect_analyze_data_ref_dependence.  Handle
   DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
   distances.  These distances are conservatively correct but they don't
   reflect a guaranteed dependence.

   Return true if this function does all the work necessary to avoid
   an alias or false if the caller should use the dependence distances
   to limit the vectorization factor in the usual way.  LOOP_DEPTH is
   the depth of the loop described by LOOP_VINFO and the other arguments
   are as for vect_analyze_data_ref_dependence.  */
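/* For instance (an illustrative sketch): for accesses p[i] and q[i + 1] the
   analysis may record a dependence distance of 1, which is only meaningful
   if p and q actually refer to the same object.  When the bases could just
   as well be independent, a runtime alias check is preferable to limiting
   the vectorization factor because of that conservative distance.  */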
static bool
vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
                                       loop_vec_info loop_vinfo,
                                       int loop_depth, unsigned int *max_vf)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  lambda_vector dist_v;
  unsigned int i;

  FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
    {
      int dist = dist_v[loop_depth];
      if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
        {
          /* If the user asserted safelen >= DIST consecutive iterations
             can be executed concurrently, assume independence.

             ??? An alternative would be to add the alias check even
             in this case, and vectorize the fallback loop with the
             maximum VF set to safelen.  However, if the user has
             explicitly given a length, it's less likely that that
             would be a problem.  */
          if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
            {
              if ((unsigned int) loop->safelen < *max_vf)
                *max_vf = loop->safelen;
              LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
              continue;
            }

          /* For dependence distances of 2 or more, we have the option
             of limiting VF or checking for an alias at runtime.
             Prefer to check at runtime if we can, to avoid limiting
             the VF unnecessarily when the bases are in fact independent.

             Note that the alias checks will be removed if the VF ends up
             being small enough.  */
          return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
        }
    }
  return true;
}
/* Function vect_analyze_data_ref_dependence.

   Return TRUE if there (might) exist a dependence between a memory-reference
   DRA and a memory-reference DRB.  When versioning for alias may check a
   dependence at run-time, return FALSE.  Adjust *MAX_VF according to
   the data dependence.  */
static bool
vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
                                  loop_vec_info loop_vinfo,
                                  unsigned int *max_vf)
{
  unsigned int i;
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  struct data_reference *dra = DDR_A (ddr);
  struct data_reference *drb = DDR_B (ddr);
  stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
  stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
  lambda_vector dist_v;
  unsigned int loop_depth;

  /* In loop analysis all data references should be vectorizable.  */
  if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
      || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
    gcc_unreachable ();

  /* Independent data accesses.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
    return false;

  if (dra == drb
      || (DR_IS_READ (dra) && DR_IS_READ (drb)))
    return false;

  /* We do not have to consider dependences between accesses that belong
     to the same group.  */
  if (GROUP_FIRST_ELEMENT (stmtinfo_a)
      && GROUP_FIRST_ELEMENT (stmtinfo_a) == GROUP_FIRST_ELEMENT (stmtinfo_b))
    return false;

  /* Even if we have an anti-dependence then, as the vectorized loop covers at
     least two scalar iterations, there is always also a true dependence.
     As the vectorizer does not re-order loads and stores we can ignore
     the anti-dependence if TBAA can disambiguate both DRs similar to the
     case with known negative distance anti-dependences (positive
     distance anti-dependences would violate TBAA constraints).  */
  if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
       || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
      && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
                                 get_alias_set (DR_REF (drb))))
    return false;

  /* Unknown data dependence.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
    {
      /* If user asserted safelen consecutive iterations can be
         executed concurrently, assume independence.  */
      if (loop->safelen >= 2)
        {
          if ((unsigned int) loop->safelen < *max_vf)
            *max_vf = loop->safelen;
          LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
          return false;
        }

      if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
          || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "versioning for alias not supported for: "
                               "can't determine dependence between ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                 DR_REF (dra));
              dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                 DR_REF (drb));
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }
          return true;
        }

      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "versioning for alias required: "
                           "can't determine dependence between ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                             DR_REF (dra));
          dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                             DR_REF (drb));
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }

      /* Add to list of ddrs that need to be tested at run-time.  */
      return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
    }

  /* Known data dependence.  */
  if (DDR_NUM_DIST_VECTS (ddr) == 0)
    {
      /* If user asserted safelen consecutive iterations can be
         executed concurrently, assume independence.  */
      if (loop->safelen >= 2)
        {
          if ((unsigned int) loop->safelen < *max_vf)
            *max_vf = loop->safelen;
          LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
          return false;
        }

      if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
          || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "versioning for alias not supported for: "
                               "bad dist vector for ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                 DR_REF (dra));
              dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                 DR_REF (drb));
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }
          return true;
        }

      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "versioning for alias required: "
                           "bad dist vector for ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
          dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }
      /* Add to list of ddrs that need to be tested at run-time.  */
      return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
    }

  loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));

  if (DDR_COULD_BE_INDEPENDENT_P (ddr)
      && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
                                                loop_depth, max_vf))
    return false;

  FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
    {
      int dist = dist_v[loop_depth];

      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "dependence distance  = %d.\n", dist);

      if (dist == 0)
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_NOTE, vect_location,
                               "dependence distance == 0 between ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
              dump_printf (MSG_NOTE, " and ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }

          /* When we perform grouped accesses and perform implicit CSE
             by detecting equal accesses and doing disambiguation with
             runtime alias tests like for
                .. = a[i];
                .. = a[i+1];
                a[i] = ..;
                a[i+1] = ..;
             where we will end up loading { a[i], a[i+1] } once, make
             sure that inserting group loads before the first load and
             stores after the last store will do the right thing.
             Similar for groups like
                a[i] = ...;
                ... = a[i];
                a[i+1] = ...;
             where loads from the group interleave with the store.  */
          if (!vect_preserves_scalar_order_p (DR_STMT (dra), DR_STMT (drb)))
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "READ_WRITE dependence in interleaving.\n");
              return true;
            }

          if (!loop->force_vectorize)
            {
              tree indicator = dr_zero_step_indicator (dra);
              if (TREE_CODE (indicator) != INTEGER_CST)
                vect_check_nonzero_value (loop_vinfo, indicator);
              else if (integer_zerop (indicator))
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "access also has a zero step\n");
                  return true;
                }
            }
          continue;
        }

      if (dist > 0 && DDR_REVERSED_P (ddr))
        {
          /* If DDR_REVERSED_P the order of the data-refs in DDR was
             reversed (to make distance vector positive), and the actual
             distance is negative.  */
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "dependence distance negative.\n");
          /* Record a negative dependence distance to later limit the
             amount of stmt copying / unrolling we can perform.
             Only need to handle read-after-write dependence.  */
          if (DR_IS_READ (drb)
              && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
                  || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
            STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
          continue;
        }

      unsigned int abs_dist = abs (dist);
      if (abs_dist >= 2 && abs_dist < *max_vf)
        {
          /* The dependence distance requires reduction of the maximal
             vectorization factor.  */
          *max_vf = abs (dist);
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "adjusting maximal vectorization factor to %i\n",
                             *max_vf);
        }

      if (abs_dist >= *max_vf)
        {
          /* Dependence distance does not create dependence, as far as
             vectorization is concerned, in this case.  */
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "dependence distance >= VF.\n");
          continue;
        }

      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "not vectorized, possible dependence "
                           "between data-refs ");
          dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
          dump_printf (MSG_NOTE, " and ");
          dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
          dump_printf (MSG_NOTE, "\n");
        }

      return true;
    }

  return false;
}
/* Function vect_analyze_data_ref_dependences.

   Examine all the data references in the loop, and make sure there do not
   exist any data dependences between them.  Set *MAX_VF according to
   the maximum vectorization factor the data dependences allow.  */
bool
vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
                                   unsigned int *max_vf)
{
  unsigned int i;
  struct data_dependence_relation *ddr;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "=== vect_analyze_data_ref_dependences ===\n");

  LOOP_VINFO_DDRS (loop_vinfo)
    .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
             * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
  LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
  /* We need read-read dependences to compute STMT_VINFO_SAME_ALIGN_REFS.  */
  if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
                                &LOOP_VINFO_DDRS (loop_vinfo),
                                LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
    return false;

  /* For epilogues we either have no aliases or alias versioning
     was applied to original loop.  Therefore we may just get max_vf
     using VF of original loop.  */
  if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
    *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
  else
    FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
      if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
        return false;

  return true;
}
/* Function vect_slp_analyze_data_ref_dependence.

   Return TRUE if there (might) exist a dependence between a memory-reference
   DRA and a memory-reference DRB.  When versioning for alias may check a
   dependence at run-time, return FALSE.  Adjust *MAX_VF according to
   the data dependence.  */
static bool
vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
{
  struct data_reference *dra = DDR_A (ddr);
  struct data_reference *drb = DDR_B (ddr);

  /* We need to check dependences of statements marked as unvectorizable
     as well, they still can prohibit vectorization.  */

  /* Independent data accesses.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
    return false;

  if (dra == drb)
    return false;

  /* Read-read is OK.  */
  if (DR_IS_READ (dra) && DR_IS_READ (drb))
    return false;

  /* If dra and drb are part of the same interleaving chain consider
     them independent.  */
  if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
      && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
          == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
    return false;

  /* Unknown data dependence.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
    {
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "can't determine dependence between ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
          dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }
    }
  else if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location,
                       "determined dependence between ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
      dump_printf (MSG_NOTE, " and ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
      dump_printf (MSG_NOTE, "\n");
    }

  return true;
}
/* Analyze dependences involved in the transform of SLP NODE.  STORES
   contain the vector of scalar stores of this instance if we are
   disambiguating the loads.  */
static bool
vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
                                   vec<gimple *> stores, gimple *last_store)
{
  /* This walks over all stmts involved in the SLP load/store done
     in NODE verifying we can sink them up to the last stmt in the
     group.  */
  gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
  for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
    {
      gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
      if (access == last_access)
        continue;
      data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
      for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
           gsi_stmt (gsi) != last_access; gsi_next (&gsi))
        {
          gimple *stmt = gsi_stmt (gsi);
          if (! gimple_vuse (stmt)
              || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
            continue;

          /* If we couldn't record a (single) data reference for this
             stmt we have to give up.  */
          /* ??? Here and below if dependence analysis fails we can resort
             to the alias oracle which can handle more kinds of stmts.  */
          data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
          if (!dr_b)
            return false;

          bool dependent = false;
          /* If we run into a store of this same instance (we've just
             marked those) then delay dependence checking until we run
             into the last store because this is where it will have
             been sunk to (and we verify if we can do that as well).  */
          if (gimple_visited_p (stmt))
            {
              if (stmt != last_store)
                continue;
              unsigned i;
              gimple *store;
              FOR_EACH_VEC_ELT (stores, i, store)
                {
                  data_reference *store_dr
                    = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
                  ddr_p ddr = initialize_data_dependence_relation
                                (dr_a, store_dr, vNULL);
                  dependent = vect_slp_analyze_data_ref_dependence (ddr);
                  free_dependence_relation (ddr);
                  if (dependent)
                    break;
                }
            }
          else
            {
              ddr_p ddr = initialize_data_dependence_relation (dr_a,
                                                               dr_b, vNULL);
              dependent = vect_slp_analyze_data_ref_dependence (ddr);
              free_dependence_relation (ddr);
            }
          if (dependent)
            return false;
        }
    }
  return true;
}
/* Function vect_analyze_data_ref_dependences.

   Examine all the data references in the basic-block, and make sure there
   do not exist any data dependences between them.  Set *MAX_VF according to
   the maximum vectorization factor the data dependences allow.  */
bool
vect_slp_analyze_instance_dependence (slp_instance instance)
{
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "=== vect_slp_analyze_instance_dependence ===\n");

  /* The stores of this instance are at the root of the SLP tree.  */
  slp_tree store = SLP_INSTANCE_TREE (instance);
  if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
    store = NULL;

  /* Verify we can sink stores to the vectorized stmt insert location.  */
  gimple *last_store = NULL;
  if (store)
    {
      if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
        return false;

      /* Mark stores in this instance and remember the last one.  */
      last_store = vect_find_last_scalar_stmt_in_slp (store);
      for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
        gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
    }

  bool res = true;

  /* Verify we can sink loads to the vectorized stmt insert location,
     special-casing stores of this instance.  */
  slp_tree load;
  unsigned int i;
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
    if (! vect_slp_analyze_node_dependences (instance, load,
                                             store
                                             ? SLP_TREE_SCALAR_STMTS (store)
                                             : vNULL, last_store))
      {
        res = false;
        break;
      }

  /* Unset the visited flag.  */
  if (store)
    for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
      gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);

  return res;
}
/* Record in VINFO the base alignment guarantee given by DRB.  STMT is
   the statement that contains DRB, which is useful for recording in the
   dump file.  */

static void
vect_record_base_alignment (vec_info *vinfo, gimple *stmt,
                            innermost_loop_behavior *drb)
{
  bool existed;
  innermost_loop_behavior *&entry
    = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
  if (!existed || entry->base_alignment < drb->base_alignment)
    {
      entry = drb;
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location,
                           "recording new base alignment for ");
          dump_generic_expr (MSG_NOTE, TDF_SLIM, drb->base_address);
          dump_printf (MSG_NOTE, "\n");
          dump_printf_loc (MSG_NOTE, vect_location,
                           "  alignment:    %d\n", drb->base_alignment);
          dump_printf_loc (MSG_NOTE, vect_location,
                           "  misalignment: %d\n", drb->base_misalignment);
          dump_printf_loc (MSG_NOTE, vect_location,
                           "  based on:     ");
          dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
        }
    }
}
/* If the region we're going to vectorize is reached, all unconditional
   data references occur at least once.  We can therefore pool the base
   alignment guarantees from each unconditional reference.  Do this by
   going through all the data references in VINFO and checking whether
   the containing statement makes the reference unconditionally.  If so,
   record the alignment of the base address in VINFO so that it can be
   used for all other references with the same base.  */
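/* For instance (an illustrative sketch): if the region contains an
   unconditional reference p->a[i] whose base address is known to be
   32-byte aligned, another reference p->b[i] with the same base address
   can reuse that 32-byte guarantee even if its own statement only
   establishes a smaller alignment.  */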
void
vect_record_base_alignments (vec_info *vinfo)
{
  loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
  struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
  data_reference *dr;
  unsigned int i;
  FOR_EACH_VEC_ELT (vinfo->datarefs, i, dr)
    if (!DR_IS_CONDITIONAL_IN_STMT (dr))
      {
        gimple *stmt = DR_STMT (dr);
        vect_record_base_alignment (vinfo, stmt, &DR_INNERMOST (dr));

        /* If DR is nested in the loop that is being vectorized, we can also
           record the alignment of the base wrt the outer loop.  */
        if (loop && nested_in_vect_loop_p (loop, stmt))
          {
            stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
            vect_record_base_alignment
              (vinfo, stmt, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
          }
      }
}
/* Return the target alignment for the vectorized form of DR.  */

static unsigned int
vect_calculate_target_alignment (struct data_reference *dr)
{
  gimple *stmt = DR_STMT (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  return targetm.vectorize.preferred_vector_alignment (vectype);
}
/* Function vect_compute_data_ref_alignment

   Compute the misalignment of the data reference DR.

   Output:
   1. If during the misalignment computation it is found that the data
      reference cannot be vectorized then false is returned.
   2. DR_MISALIGNMENT (DR) is defined.

   FOR NOW: No analysis is actually performed.  Misalignment is calculated
   only for trivial cases.  TODO.  */
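/* For instance (an illustrative sketch): for a reference &a[3] with 4-byte
   elements, a 16-byte aligned base and a 16-byte preferred vector
   alignment, the computed misalignment is 3 * 4 = 12 bytes.  */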
static bool
vect_compute_data_ref_alignment (struct data_reference *dr)
{
  gimple *stmt = DR_STMT (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = NULL;
  tree ref = DR_REF (dr);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "vect_compute_data_ref_alignment:\n");

  if (loop_vinfo)
    loop = LOOP_VINFO_LOOP (loop_vinfo);

  /* Initialize misalignment to unknown.  */
  SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);

  innermost_loop_behavior *drb = vect_dr_behavior (dr);
  bool step_preserves_misalignment_p;

  unsigned HOST_WIDE_INT vector_alignment
    = vect_calculate_target_alignment (dr) / BITS_PER_UNIT;
  DR_TARGET_ALIGNMENT (dr) = vector_alignment;

  /* No step for BB vectorization.  */
  if (!loop_vinfo)
    {
      gcc_assert (integer_zerop (drb->step));
      step_preserves_misalignment_p = true;
    }

  /* In case the dataref is in an inner-loop of the loop that is being
     vectorized (LOOP), we use the base and misalignment information
     relative to the outer-loop (LOOP).  This is ok only if the misalignment
     stays the same throughout the execution of the inner-loop, which is why
     we have to check that the stride of the dataref in the inner-loop evenly
     divides by the vector alignment.  */
  else if (nested_in_vect_loop_p (loop, stmt))
    {
      step_preserves_misalignment_p
        = (DR_STEP_ALIGNMENT (dr) % vector_alignment) == 0;

      if (dump_enabled_p ())
        {
          if (step_preserves_misalignment_p)
            dump_printf_loc (MSG_NOTE, vect_location,
                             "inner step divides the vector alignment.\n");
          else
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "inner step doesn't divide the vector"
                             " alignment.\n");
        }
    }

  /* Similarly we can only use base and misalignment information relative to
     an innermost loop if the misalignment stays the same throughout the
     execution of the loop.  As above, this is the case if the stride of
     the dataref evenly divides by the alignment.  */
  else
    {
      poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
      step_preserves_misalignment_p
        = multiple_p (DR_STEP_ALIGNMENT (dr) * vf, vector_alignment);

      if (!step_preserves_misalignment_p && dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "step doesn't divide the vector alignment.\n");
    }

  unsigned int base_alignment = drb->base_alignment;
  unsigned int base_misalignment = drb->base_misalignment;

  /* Calculate the maximum of the pooled base address alignment and the
     alignment that we can compute for DR itself.  */
  innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
  if (entry && base_alignment < (*entry)->base_alignment)
    {
      base_alignment = (*entry)->base_alignment;
      base_misalignment = (*entry)->base_misalignment;
    }

  if (drb->offset_alignment < vector_alignment
      || !step_preserves_misalignment_p
      /* We need to know whether the step wrt the vectorized loop is
         negative when computing the starting misalignment below.  */
      || TREE_CODE (drb->step) != INTEGER_CST)
    {
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "Unknown alignment for access: ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }
      return true;
    }

  if (base_alignment < vector_alignment)
    {
      tree base = drb->base_address;
      if (TREE_CODE (base) == ADDR_EXPR)
        base = TREE_OPERAND (base, 0);
      if (!vect_can_force_dr_alignment_p (base,
                                          vector_alignment * BITS_PER_UNIT))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_NOTE, vect_location,
                               "can't force alignment of ref: ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
              dump_printf (MSG_NOTE, "\n");
            }
          return true;
        }

      /* Force the alignment of the decl.
         NOTE: This is the only change to the code we make during
         the analysis phase, before deciding to vectorize the loop.  */
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
          dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
          dump_printf (MSG_NOTE, "\n");
        }

      DR_VECT_AUX (dr)->base_decl = base;
      DR_VECT_AUX (dr)->base_misaligned = true;
      base_misalignment = 0;
    }
  poly_int64 misalignment
    = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();

  /* If this is a backward running DR then first access in the larger
     vectype actually is N-1 elements before the address in the DR.
     Adjust misalign accordingly.  */
  if (tree_int_cst_sgn (drb->step) < 0)
    /* PLUS because STEP is negative.  */
    misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
                     * TREE_INT_CST_LOW (drb->step));

  unsigned int const_misalignment;
  if (!known_misalignment (misalignment, vector_alignment,
                           &const_misalignment))
    {
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "Non-constant misalignment for access: ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }
      return true;
    }

  SET_DR_MISALIGNMENT (dr, const_misalignment);

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                       "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
    }

  return true;
}
/* Function vect_update_misalignment_for_peel.
   Sets DR's misalignment
   - to 0 if it has the same alignment as DR_PEEL,
   - to the misalignment computed using NPEEL if DR's misalignment is known,
   - to -1 (unknown) otherwise.

   DR - the data reference whose misalignment is to be adjusted.
   DR_PEEL - the data reference whose misalignment is being made
             zero in the vector loop by the peel.
   NPEEL - the number of iterations in the peel loop if the misalignment
           of DR_PEEL is known at compile time.  */
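/* For instance (an illustrative sketch): with a target alignment of 16
   bytes, a scalar size DR_SIZE of 4 bytes, a forward step, NPEEL == 3 and
   DR_MISALIGNMENT (DR) == 4, the updated misalignment is
   (4 + 3 * 4) & (16 - 1) == 0.  */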
static void
vect_update_misalignment_for_peel (struct data_reference *dr,
                                   struct data_reference *dr_peel, int npeel)
{
  unsigned int i;
  vec<dr_p> same_aligned_drs;
  struct data_reference *current_dr;
  int dr_size = vect_get_scalar_dr_size (dr);
  int dr_peel_size = vect_get_scalar_dr_size (dr_peel);
  stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
  stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));

  /* For interleaved data accesses the step in the loop must be multiplied by
     the size of the interleaving group.  */
  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
  if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
    dr_peel_size *= GROUP_SIZE (peel_stmt_info);

  /* It can be assumed that the data refs with the same alignment as dr_peel
     are aligned in the vector loop.  */
  same_aligned_drs
    = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
  FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
    {
      if (current_dr != dr)
        continue;
      gcc_assert (!known_alignment_for_access_p (dr)
                  || !known_alignment_for_access_p (dr_peel)
                  || (DR_MISALIGNMENT (dr) / dr_size
                      == DR_MISALIGNMENT (dr_peel) / dr_peel_size));
      SET_DR_MISALIGNMENT (dr, 0);
      return;
    }

  if (known_alignment_for_access_p (dr)
      && known_alignment_for_access_p (dr_peel))
    {
      bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
      int misal = DR_MISALIGNMENT (dr);
      misal += negative ? -npeel * dr_size : npeel * dr_size;
      misal &= DR_TARGET_ALIGNMENT (dr) - 1;
      SET_DR_MISALIGNMENT (dr, misal);
      return;
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment "
                     "to unknown (-1).\n");
  SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
}
/* Function verify_data_ref_alignment

   Return TRUE if DR can be handled with respect to alignment.  */

static bool
verify_data_ref_alignment (data_reference_p dr)
{
  enum dr_alignment_support supportable_dr_alignment
    = vect_supportable_dr_alignment (dr, false);
  if (!supportable_dr_alignment)
    {
      if (dump_enabled_p ())
        {
          if (DR_IS_READ (dr))
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "not vectorized: unsupported unaligned load.");
          else
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "not vectorized: unsupported unaligned "
                             "store.");

          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                             DR_REF (dr));
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }
      return false;
    }

  if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "Vectorizing an unaligned access.\n");

  return true;
}
/* Function vect_verify_datarefs_alignment

   Return TRUE if all data references in the loop can be
   handled with respect to alignment.  */

bool
vect_verify_datarefs_alignment (loop_vec_info vinfo)
{
  vec<data_reference_p> datarefs = vinfo->datarefs;
  struct data_reference *dr;
  unsigned int i;

  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      gimple *stmt = DR_STMT (dr);
      stmt_vec_info stmt_info = vinfo_for_stmt (stmt);

      if (!STMT_VINFO_RELEVANT_P (stmt_info))
        continue;

      /* For interleaving, only the alignment of the first access matters.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
          && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
        continue;

      /* Strided accesses perform only component accesses, alignment is
         irrelevant for them.  */
      if (STMT_VINFO_STRIDED_P (stmt_info)
          && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
        continue;

      if (! verify_data_ref_alignment (dr))
        return false;
    }

  return true;
}
/* Given a memory reference EXP return whether its alignment is less
   than its size.  */

static bool
not_size_aligned (tree exp)
{
  if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
    return true;

  return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
          > get_object_alignment (exp));
}
/* Function vector_alignment_reachable_p

   Return true if vector alignment for DR is reachable by peeling
   a few loop iterations.  Return false otherwise.  */

static bool
vector_alignment_reachable_p (struct data_reference *dr)
{
  gimple *stmt = DR_STMT (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);

  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    {
      /* For interleaved access we peel only if number of iterations in
         the prolog loop ({VF - misalignment}), is a multiple of the
         number of the interleaved accesses.  */
      int elem_size, mis_in_elements;

      /* FORNOW: handle only known alignment.  */
      if (!known_alignment_for_access_p (dr))
        return false;

      poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
      poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
      elem_size = vector_element_size (vector_size, nelements);
      mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;

      if (!multiple_p (nelements - mis_in_elements, GROUP_SIZE (stmt_info)))
        return false;
    }

  /* If misalignment is known at the compile time then allow peeling
     only if natural alignment is reachable through peeling.  */
  if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
    {
      HOST_WIDE_INT elmsize =
                int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location,
                           "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
          dump_printf (MSG_NOTE,
                       ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
        }
      if (DR_MISALIGNMENT (dr) % elmsize)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "data size does not divide the misalignment.\n");
          return false;
        }
    }

  if (!known_alignment_for_access_p (dr))
    {
      tree type = TREE_TYPE (DR_REF (dr));
      bool is_packed = not_size_aligned (DR_REF (dr));
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "Unknown misalignment, %snaturally aligned\n",
                         is_packed ? "not " : "");
      return targetm.vectorize.vector_alignment_reachable (type, is_packed);
    }

  return true;
}
/* Calculate the cost of the memory access represented by DR.  */

static void
vect_get_data_access_cost (struct data_reference *dr,
                           unsigned int *inside_cost,
                           unsigned int *outside_cost,
                           stmt_vector_for_cost *body_cost_vec)
{
  gimple *stmt = DR_STMT (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  int ncopies;

  if (PURE_SLP_STMT (stmt_info))
    ncopies = 1;
  else
    ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));

  if (DR_IS_READ (dr))
    vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
                        NULL, body_cost_vec, false);
  else
    vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "vect_get_data_access_cost: inside_cost = %d, "
                     "outside_cost = %d.\n", *inside_cost, *outside_cost);
}
typedef struct _vect_peel_info
{
  struct data_reference *dr;
  int npeel;
  unsigned int count;
} *vect_peel_info;

typedef struct _vect_peel_extended_info
{
  struct _vect_peel_info peel_info;
  unsigned int inside_cost;
  unsigned int outside_cost;
} *vect_peel_extended_info;
/* Peeling hashtable helpers.  */

struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
{
  static inline hashval_t hash (const _vect_peel_info *);
  static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
};

inline hashval_t
peel_info_hasher::hash (const _vect_peel_info *peel_info)
{
  return (hashval_t) peel_info->npeel;
}

inline bool
peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
{
  return (a->npeel == b->npeel);
}
/* Insert DR into peeling hash table with NPEEL as key.  */

static void
vect_peeling_hash_insert (hash_table <peel_info_hasher> *peeling_htab,
                          loop_vec_info loop_vinfo, struct data_reference *dr,
                          int npeel)
{
  struct _vect_peel_info elem, *slot;
  _vect_peel_info **new_slot;
  bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);

  elem.npeel = npeel;
  slot = peeling_htab->find (&elem);
  if (slot)
    slot->count++;
  else
    {
      slot = XNEW (struct _vect_peel_info);
      slot->npeel = npeel;
      slot->dr = dr;
      slot->count = 1;
      new_slot = peeling_htab->find_slot (slot, INSERT);
      *new_slot = slot;
    }

  if (!supportable_dr_alignment
      && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
    slot->count += VECT_MAX_COST;
}
/* Traverse peeling hash table to find peeling option that aligns maximum
   number of data accesses.  */

int
vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
                                     _vect_peel_extended_info *max)
{
  vect_peel_info elem = *slot;

  if (elem->count > max->peel_info.count
      || (elem->count == max->peel_info.count
          && max->peel_info.npeel > elem->npeel))
    {
      max->peel_info.npeel = elem->npeel;
      max->peel_info.count = elem->count;
      max->peel_info.dr = elem->dr;
    }

  return 1;
}
/* Get the costs of peeling NPEEL iterations checking data access costs
   for all data refs.  If UNKNOWN_MISALIGNMENT is true, we assume DR0's
   misalignment will be zero after peeling.  */

static void
vect_get_peeling_costs_all_drs (vec<data_reference_p> datarefs,
                                struct data_reference *dr0,
                                unsigned int *inside_cost,
                                unsigned int *outside_cost,
                                stmt_vector_for_cost *body_cost_vec,
                                unsigned int npeel,
                                bool unknown_misalignment)
{
  unsigned i;
  data_reference *dr;

  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      gimple *stmt = DR_STMT (dr);
      stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
      if (!STMT_VINFO_RELEVANT_P (stmt_info))
        continue;

      /* For interleaving, only the alignment of the first access
         matters.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
          && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
        continue;

      /* Strided accesses perform only component accesses, alignment is
         irrelevant for them.  */
      if (STMT_VINFO_STRIDED_P (stmt_info)
          && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
        continue;

      int save_misalignment;
      save_misalignment = DR_MISALIGNMENT (dr);
      if (npeel == 0)
        ;
      else if (unknown_misalignment && dr == dr0)
        SET_DR_MISALIGNMENT (dr, 0);
      else
        vect_update_misalignment_for_peel (dr, dr0, npeel);
      vect_get_data_access_cost (dr, inside_cost, outside_cost,
                                 body_cost_vec);
      SET_DR_MISALIGNMENT (dr, save_misalignment);
    }
}
/* Traverse peeling hash table and calculate cost for each peeling option.
   Find the one with the lowest cost.  */

int
vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
                                   _vect_peel_extended_info *min)
{
  vect_peel_info elem = *slot;
  int dummy;
  unsigned int inside_cost = 0, outside_cost = 0;
  gimple *stmt = DR_STMT (elem->dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
                       epilogue_cost_vec;

  prologue_cost_vec.create (2);
  body_cost_vec.create (2);
  epilogue_cost_vec.create (2);

  vect_get_peeling_costs_all_drs (LOOP_VINFO_DATAREFS (loop_vinfo),
                                  elem->dr, &inside_cost, &outside_cost,
                                  &body_cost_vec, elem->npeel, false);

  body_cost_vec.release ();

  outside_cost += vect_get_known_peeling_cost
    (loop_vinfo, elem->npeel, &dummy,
     &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
     &prologue_cost_vec, &epilogue_cost_vec);

  /* Prologue and epilogue costs are added to the target model later.
     These costs depend only on the scalar iteration cost, the
     number of peeling iterations finally chosen, and the number of
     misaligned statements.  So discard the information found here.  */
  prologue_cost_vec.release ();
  epilogue_cost_vec.release ();

  if (inside_cost < min->inside_cost
      || (inside_cost == min->inside_cost
          && outside_cost < min->outside_cost))
    {
      min->inside_cost = inside_cost;
      min->outside_cost = outside_cost;
      min->peel_info.dr = elem->dr;
      min->peel_info.npeel = elem->npeel;
      min->peel_info.count = elem->count;
    }

  return 1;
}
/* Choose best peeling option by traversing peeling hash table and either
   choosing an option with the lowest cost (if cost model is enabled) or the
   option that aligns as many accesses as possible.  */

static struct _vect_peel_extended_info
vect_peeling_hash_choose_best_peeling (hash_table <peel_info_hasher> *peeling_htab,
                                       loop_vec_info loop_vinfo)
{
  struct _vect_peel_extended_info res;

  res.peel_info.dr = NULL;

  if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
    {
      res.inside_cost = INT_MAX;
      res.outside_cost = INT_MAX;
      peeling_htab->traverse <_vect_peel_extended_info *,
                              vect_peeling_hash_get_lowest_cost> (&res);
    }
  else
    {
      res.peel_info.count = 0;
      peeling_htab->traverse <_vect_peel_extended_info *,
                              vect_peeling_hash_get_most_frequent> (&res);
      res.inside_cost = 0;
      res.outside_cost = 0;
    }

  return res;
}
/* Return true if the new peeling NPEEL is supported.  */

static bool
vect_peeling_supportable (loop_vec_info loop_vinfo, struct data_reference *dr0,
                          unsigned npeel)
{
  unsigned i;
  struct data_reference *dr = NULL;
  vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
  gimple *stmt;
  stmt_vec_info stmt_info;
  enum dr_alignment_support supportable_dr_alignment;

  /* Ensure that all data refs can be vectorized after the peel.  */
  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      int save_misalignment;

      if (dr == dr0)
        continue;

      stmt = DR_STMT (dr);
      stmt_info = vinfo_for_stmt (stmt);
      /* For interleaving, only the alignment of the first access
         matters.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
          && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
        continue;

      /* Strided accesses perform only component accesses, alignment is
         irrelevant for them.  */
      if (STMT_VINFO_STRIDED_P (stmt_info)
          && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
        continue;

      save_misalignment = DR_MISALIGNMENT (dr);
      vect_update_misalignment_for_peel (dr, dr0, npeel);
      supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
      SET_DR_MISALIGNMENT (dr, save_misalignment);

      if (!supportable_dr_alignment)
        return false;
    }

  return true;
}
/* Function vect_enhance_data_refs_alignment

   This pass will use loop versioning and loop peeling in order to enhance
   the alignment of data references in the loop.

   FOR NOW: we assume that whatever versioning/peeling takes place, only the
   original loop is to be vectorized.  Any other loops that are created by
   the transformations performed in this pass - are not supposed to be
   vectorized.  This restriction will be relaxed.

   This pass will require a cost model to guide it whether to apply peeling
   or versioning or a combination of the two.  For example, the scheme that
   intel uses when given a loop with several memory accesses, is as follows:
   choose one memory access ('p') which alignment you want to force by doing
   peeling.  Then, either (1) generate a loop in which 'p' is aligned and all
   other accesses are not necessarily aligned, or (2) use loop versioning to
   generate one loop in which all accesses are aligned, and another loop in
   which only 'p' is necessarily aligned.

   ("Automatic Intra-Register Vectorization for the Intel Architecture",
   Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
   Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)

   Devising a cost model is the most critical aspect of this work.  It will
   guide us on which access to peel for, whether to use loop versioning, how
   many versions to create, etc.  The cost model will probably consist of
   generic considerations as well as target specific considerations (on
   powerpc for example, misaligned stores are more painful than misaligned
   loads).

   Here are the general steps involved in alignment enhancements:

     -- original loop, before alignment analysis:
	for (i=0; i<N; i++){
	  x = q[i];			# DR_MISALIGNMENT(q) = unknown
	  p[i] = y;			# DR_MISALIGNMENT(p) = unknown
	}

     -- After vect_compute_data_refs_alignment:
	for (i=0; i<N; i++){
	  x = q[i];			# DR_MISALIGNMENT(q) = 3
	  p[i] = y;			# DR_MISALIGNMENT(p) = unknown
	}

     -- Possibility 1: we do loop versioning:
     if (p is aligned) {
	for (i=0; i<N; i++){	# loop 1A
	  x = q[i];			# DR_MISALIGNMENT(q) = 3
	  p[i] = y;			# DR_MISALIGNMENT(p) = 0
	}
     }
     else {
	for (i=0; i<N; i++){	# loop 1B
	  x = q[i];			# DR_MISALIGNMENT(q) = 3
	  p[i] = y;			# DR_MISALIGNMENT(p) = unaligned
	}
     }

     -- Possibility 2: we do loop peeling:
     for (i = 0; i < 3; i++){	# (scalar loop, not to be vectorized).
	x = q[i];
	p[i] = y;
     }
     for (i = 3; i < N; i++){	# loop 2A
	x = q[i];			# DR_MISALIGNMENT(q) = 0
	p[i] = y;			# DR_MISALIGNMENT(p) = unknown
     }

     -- Possibility 3: combination of loop peeling and versioning:
     for (i = 0; i < 3; i++){	# (scalar loop, not to be vectorized).
	x = q[i];
	p[i] = y;
     }
     if (p is aligned) {
	for (i = 3; i<N; i++){	# loop 3A
	  x = q[i];			# DR_MISALIGNMENT(q) = 0
	  p[i] = y;			# DR_MISALIGNMENT(p) = 0
	}
     }
     else {
	for (i = 3; i<N; i++){	# loop 3B
	  x = q[i];			# DR_MISALIGNMENT(q) = 0
	  p[i] = y;			# DR_MISALIGNMENT(p) = unaligned
	}
     }

   These loops are later passed to loop_transform to be vectorized.  The
   vectorizer will use the alignment information to guide the transformation
   (whether to generate regular loads/stores, or with special handling for
   misalignment).  */
1639 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo
)
1641 vec
<data_reference_p
> datarefs
= LOOP_VINFO_DATAREFS (loop_vinfo
);
1642 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1643 enum dr_alignment_support supportable_dr_alignment
;
1644 struct data_reference
*dr0
= NULL
, *first_store
= NULL
;
1645 struct data_reference
*dr
;
1647 bool do_peeling
= false;
1648 bool do_versioning
= false;
1651 stmt_vec_info stmt_info
;
1652 unsigned int npeel
= 0;
1653 bool one_misalignment_known
= false;
1654 bool one_misalignment_unknown
= false;
1655 bool one_dr_unsupportable
= false;
1656 struct data_reference
*unsupportable_dr
= NULL
;
1657 poly_uint64 vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
1658 unsigned possible_npeel_number
= 1;
1660 unsigned int mis
, same_align_drs_max
= 0;
1661 hash_table
<peel_info_hasher
> peeling_htab (1);
1663 if (dump_enabled_p ())
1664 dump_printf_loc (MSG_NOTE
, vect_location
,
1665 "=== vect_enhance_data_refs_alignment ===\n");
1667 /* Reset data so we can safely be called multiple times. */
1668 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).truncate (0);
1669 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
) = 0;
1671 /* While cost model enhancements are expected in the future, the high level
1672 view of the code at this time is as follows:
1674 A) If there is a misaligned access then see if peeling to align
1675 this access can make all data references satisfy
1676 vect_supportable_dr_alignment. If so, update data structures
1677 as needed and return true.
1679 B) If peeling wasn't possible and there is a data reference with an
1680 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1681 then see if loop versioning checks can be used to make all data
1682 references satisfy vect_supportable_dr_alignment. If so, update
1683 data structures as needed and return true.
1685 C) If neither peeling nor versioning were successful then return false if
1686 any data reference does not satisfy vect_supportable_dr_alignment.
1688 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1690 Note, Possibility 3 above (which is peeling and versioning together) is not
1691 being done at this time. */
1693 /* (1) Peeling to force alignment. */
1695 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1697 + How many accesses will become aligned due to the peeling
1698 - How many accesses will become unaligned due to the peeling,
1699 and the cost of misaligned accesses.
1700 - The cost of peeling (the extra runtime checks, the increase
1703 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
1705 stmt
= DR_STMT (dr
);
1706 stmt_info
= vinfo_for_stmt (stmt
);
1708 if (!STMT_VINFO_RELEVANT_P (stmt_info
))
1711 /* For interleaving, only the alignment of the first access
1713 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1714 && GROUP_FIRST_ELEMENT (stmt_info
) != stmt
)
1717 /* For invariant accesses there is nothing to enhance. */
1718 if (integer_zerop (DR_STEP (dr
)))
1721 /* Strided accesses perform only component accesses, alignment is
1722 irrelevant for them. */
1723 if (STMT_VINFO_STRIDED_P (stmt_info
)
1724 && !STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1727 supportable_dr_alignment
= vect_supportable_dr_alignment (dr
, true);
1728 do_peeling
= vector_alignment_reachable_p (dr
);
1731 if (known_alignment_for_access_p (dr
))
1733 unsigned int npeel_tmp
= 0;
1734 bool negative
= tree_int_cst_compare (DR_STEP (dr
),
1735 size_zero_node
) < 0;
1737 vectype
= STMT_VINFO_VECTYPE (stmt_info
);
1738 unsigned int target_align
= DR_TARGET_ALIGNMENT (dr
);
1739 unsigned int dr_size
= vect_get_scalar_dr_size (dr
);
1740 mis
= (negative
? DR_MISALIGNMENT (dr
) : -DR_MISALIGNMENT (dr
));
1741 if (DR_MISALIGNMENT (dr
) != 0)
1742 npeel_tmp
= (mis
& (target_align
- 1)) / dr_size
;
1744 /* For multiple types, it is possible that the bigger type access
1745 will have more than one peeling option. E.g., a loop with two
1746 types: one of size (vector size / 4), and the other one of
1747 size (vector size / 8). Vectorization factor will 8. If both
1748 accesses are misaligned by 3, the first one needs one scalar
1749 iteration to be aligned, and the second one needs 5. But the
1750 first one will be aligned also by peeling 5 scalar
1751 iterations, and in that case both accesses will be aligned.
1752 Hence, except for the immediate peeling amount, we also want
1753 to try to add full vector size, while we don't exceed
1754 vectorization factor.
1755 We do this automatically for cost model, since we calculate
1756 cost for every peeling option. */
1757 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo
)))
1759 poly_uint64 nscalars
= (STMT_SLP_TYPE (stmt_info
)
1760 ? vf
* GROUP_SIZE (stmt_info
) : vf
);
1761 possible_npeel_number
1762 = vect_get_num_vectors (nscalars
, vectype
);
1764 /* NPEEL_TMP is 0 when there is no misalignment, but also
1765 allow peeling NELEMENTS. */
1766 if (DR_MISALIGNMENT (dr
) == 0)
1767 possible_npeel_number
++;
1770 /* Save info about DR in the hash table. Also include peeling
1771 amounts according to the explanation above. */
1772 for (j
= 0; j
< possible_npeel_number
; j
++)
1774 vect_peeling_hash_insert (&peeling_htab
, loop_vinfo
,
1776 npeel_tmp
+= target_align
/ dr_size
;
1779 one_misalignment_known
= true;
1783 /* If we don't know any misalignment values, we prefer
1784 peeling for data-ref that has the maximum number of data-refs
1785 with the same alignment, unless the target prefers to align
1786 stores over load. */
1787 unsigned same_align_drs
1788 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info
).length ();
1790 || same_align_drs_max
< same_align_drs
)
1792 same_align_drs_max
= same_align_drs
;
1795 /* For data-refs with the same number of related
1796 accesses prefer the one where the misalign
1797 computation will be invariant in the outermost loop. */
1798 else if (same_align_drs_max
== same_align_drs
)
1800 struct loop
*ivloop0
, *ivloop
;
1801 ivloop0
= outermost_invariant_loop_for_expr
1802 (loop
, DR_BASE_ADDRESS (dr0
));
1803 ivloop
= outermost_invariant_loop_for_expr
1804 (loop
, DR_BASE_ADDRESS (dr
));
1805 if ((ivloop
&& !ivloop0
)
1806 || (ivloop
&& ivloop0
1807 && flow_loop_nested_p (ivloop
, ivloop0
)))
1811 one_misalignment_unknown
= true;
1813 /* Check for data refs with unsupportable alignment that
1815 if (!supportable_dr_alignment
)
1817 one_dr_unsupportable
= true;
1818 unsupportable_dr
= dr
;
1821 if (!first_store
&& DR_IS_WRITE (dr
))
1827 if (!aligned_access_p (dr
))
1829 if (dump_enabled_p ())
1830 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1831 "vector alignment may not be reachable\n");
  /* Check if we can possibly peel the loop.  */
  if (!vect_can_advance_ivs_p (loop_vinfo)
      || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
      || loop->inner)
    do_peeling = false;

  struct _vect_peel_extended_info peel_for_known_alignment;
  struct _vect_peel_extended_info peel_for_unknown_alignment;
  struct _vect_peel_extended_info best_peel;

  peel_for_unknown_alignment.inside_cost = INT_MAX;
  peel_for_unknown_alignment.outside_cost = INT_MAX;
  peel_for_unknown_alignment.peel_info.count = 0;
  if (do_peeling
      && one_misalignment_unknown)
    {
      /* Check whether the target prefers to align stores over loads, i.e.
         whether misaligned stores are more expensive than misaligned loads
         (taking drs with the same alignment into account).  */
      unsigned int load_inside_cost = 0;
      unsigned int load_outside_cost = 0;
      unsigned int store_inside_cost = 0;
      unsigned int store_outside_cost = 0;
      unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;

      stmt_vector_for_cost dummy;
      dummy.create (2);
      vect_get_peeling_costs_all_drs (datarefs, dr0,
                                      &load_inside_cost,
                                      &load_outside_cost,
                                      &dummy, estimated_npeels, true);
      dummy.release ();

      if (first_store)
        {
          dummy.create (2);
          vect_get_peeling_costs_all_drs (datarefs, first_store,
                                          &store_inside_cost,
                                          &store_outside_cost,
                                          &dummy, estimated_npeels, true);
          dummy.release ();
        }
      else
        {
          store_inside_cost = INT_MAX;
          store_outside_cost = INT_MAX;
        }

      if (load_inside_cost > store_inside_cost
          || (load_inside_cost == store_inside_cost
              && load_outside_cost > store_outside_cost))
        {
          dr0 = first_store;
          peel_for_unknown_alignment.inside_cost = store_inside_cost;
          peel_for_unknown_alignment.outside_cost = store_outside_cost;
        }
      else
        {
          peel_for_unknown_alignment.inside_cost = load_inside_cost;
          peel_for_unknown_alignment.outside_cost = load_outside_cost;
        }
      stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
      prologue_cost_vec.create (2);
      epilogue_cost_vec.create (2);

      int dummy2;
      peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
        (loop_vinfo, estimated_npeels, &dummy2,
         &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
         &prologue_cost_vec, &epilogue_cost_vec);

      prologue_cost_vec.release ();
      epilogue_cost_vec.release ();

      peel_for_unknown_alignment.peel_info.count = 1
        + STMT_VINFO_SAME_ALIGN_REFS
            (vinfo_for_stmt (DR_STMT (dr0))).length ();
    }

  peel_for_unknown_alignment.peel_info.npeel = 0;
  peel_for_unknown_alignment.peel_info.dr = dr0;

  best_peel = peel_for_unknown_alignment;
  peel_for_known_alignment.inside_cost = INT_MAX;
  peel_for_known_alignment.outside_cost = INT_MAX;
  peel_for_known_alignment.peel_info.count = 0;
  peel_for_known_alignment.peel_info.dr = NULL;
  if (do_peeling && one_misalignment_known)
    {
      /* Peeling is possible, but there is no data access that is not supported
         unless aligned.  So we try to choose the best possible peeling from
         the hash table.  */
      peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
        (&peeling_htab, loop_vinfo);
    }

  /* Compare costs of peeling for known and unknown alignment.  */
  if (peel_for_known_alignment.peel_info.dr != NULL
      && peel_for_unknown_alignment.inside_cost
         >= peel_for_known_alignment.inside_cost)
    {
      best_peel = peel_for_known_alignment;

      /* If the best peeling for known alignment has NPEEL == 0, perform no
         peeling at all except if there is an unsupportable dr that we can
         align.  */
      if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
        do_peeling = false;
    }

  /* If there is an unsupportable data ref, prefer this over all choices so far
     since we'd have to discard a chosen peeling except when it accidentally
     aligned the unsupportable data ref.  */
  if (one_dr_unsupportable)
    dr0 = unsupportable_dr;
  else if (do_peeling)
    {
      /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
         TODO: Use nopeel_outside_cost or get rid of it?  */
      unsigned nopeel_inside_cost = 0;
      unsigned nopeel_outside_cost = 0;

      stmt_vector_for_cost dummy;
      dummy.create (2);
      vect_get_peeling_costs_all_drs (datarefs, NULL, &nopeel_inside_cost,
                                      &nopeel_outside_cost, &dummy, 0, false);
      dummy.release ();

      /* Add epilogue costs.  As we do not peel for alignment here, no prologue
         costs will be recorded.  */
      stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
      prologue_cost_vec.create (2);
      epilogue_cost_vec.create (2);

      int dummy2;
      nopeel_outside_cost += vect_get_known_peeling_cost
        (loop_vinfo, 0, &dummy2,
         &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
         &prologue_cost_vec, &epilogue_cost_vec);

      prologue_cost_vec.release ();
      epilogue_cost_vec.release ();

      npeel = best_peel.peel_info.npeel;
      dr0 = best_peel.peel_info.dr;

      /* If not peeling is no more expensive than the best peeling found
         so far, don't perform any peeling.  */
      if (nopeel_inside_cost <= best_peel.inside_cost)
        do_peeling = false;
    }
  if (do_peeling)
    {
      stmt = DR_STMT (dr0);
      stmt_info = vinfo_for_stmt (stmt);
      vectype = STMT_VINFO_VECTYPE (stmt_info);

      if (known_alignment_for_access_p (dr0))
        {
          bool negative = tree_int_cst_compare (DR_STEP (dr0),
                                                size_zero_node) < 0;
          if (!npeel)
            {
              /* Since it's known at compile time, compute the number of
                 iterations in the peeled loop (the peeling factor) for use in
                 updating DR_MISALIGNMENT values.  The peeling factor is the
                 vectorization factor minus the misalignment as an element
                 count.  */
              mis = negative ? DR_MISALIGNMENT (dr0) : -DR_MISALIGNMENT (dr0);
              unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
              npeel = ((mis & (target_align - 1))
                       / vect_get_scalar_dr_size (dr0));
            }
          /* For interleaved data access every iteration accesses all the
             members of the group, therefore we divide the number of iterations
             by the group size.  */
          stmt_info = vinfo_for_stmt (DR_STMT (dr0));
          if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
            npeel /= GROUP_SIZE (stmt_info);

          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "Try peeling by %d\n", npeel);
        }

      /* Ensure that all datarefs can be vectorized after the peel.  */
      if (!vect_peeling_supportable (loop_vinfo, dr0, npeel))
        do_peeling = false;

      /* Check if all datarefs are supportable and log.  */
      if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
        {
          stat = vect_verify_datarefs_alignment (loop_vinfo);
          if (!stat)
            do_peeling = false;
          else
            return stat;
        }
      /* Cost model #1 - honor --param vect-max-peeling-for-alignment.  */
      if (do_peeling)
        {
          unsigned max_allowed_peel
            = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
          if (max_allowed_peel != (unsigned)-1)
            {
              unsigned max_peel = npeel;
              if (max_peel == 0)
                {
                  unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
                  max_peel = target_align / vect_get_scalar_dr_size (dr0) - 1;
                }
              if (max_peel > max_allowed_peel)
                {
                  do_peeling = false;
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_NOTE, vect_location,
                                     "Disable peeling, max peels reached: %d\n",
                                     max_peel);
                }
            }
        }

      /* Cost model #2 - if peeling may result in a remaining loop not
         iterating enough to be vectorized then do not peel.  Since this
         is a cost heuristic rather than a correctness decision, use the
         most likely runtime value for variable vectorization factors.  */
      if (do_peeling
          && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
        {
          unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
          unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
          if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
              < assumed_vf + max_peel)
            do_peeling = false;
        }
      /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
         If the misalignment of DR_i is identical to that of dr0 then set
         DR_MISALIGNMENT (DR_i) to zero.  If the misalignment of DR_i and
         dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
         by the peeling factor times the element size of DR_i (MOD the
         vectorization factor times the size).  Otherwise, the
         misalignment of DR_i must be set to unknown.  */
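      /* For instance, assuming a 16-byte target alignment and 4-byte
         elements: if dr0 is misaligned by 8 bytes, the peeling factor is 2,
         and a DR_i with a known misalignment of 12 bytes ends up with
         (12 + 2 * 4) % 16 == 4 bytes of misalignment after the peel.  */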
      FOR_EACH_VEC_ELT (datarefs, i, dr)
        if (dr != dr0)
          {
            /* Strided accesses perform only component accesses, alignment
               is irrelevant for them.  */
            stmt_info = vinfo_for_stmt (DR_STMT (dr));
            if (STMT_VINFO_STRIDED_P (stmt_info)
                && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
              continue;

            vect_update_misalignment_for_peel (dr, dr0, npeel);
          }

      LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
      if (npeel)
        LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
      else
        LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
          = DR_MISALIGNMENT (dr0);
      SET_DR_MISALIGNMENT (dr0, 0);
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location,
                           "Alignment of access forced using peeling.\n");
          dump_printf_loc (MSG_NOTE, vect_location,
                           "Peeling for alignment will be applied.\n");
        }

      /* The inside-loop cost will be accounted for in vectorizable_load
         and vectorizable_store correctly with adjusted alignments.
         Drop the body_cst_vec on the floor here.  */
      stat = vect_verify_datarefs_alignment (loop_vinfo);
      gcc_assert (stat);
      return stat;
    }
  /* (2) Versioning to force alignment.  */

  /* Try versioning if:
     1) optimize loop for speed
     2) there is at least one unsupported misaligned data ref with an unknown
        misalignment, and
     3) all misaligned data refs with a known misalignment are supported, and
     4) the number of runtime alignment checks is within reason.  */

  do_versioning =
        optimize_loop_nest_for_speed_p (loop)
        && (!loop->inner); /* FORNOW */
  if (do_versioning)
    {
      FOR_EACH_VEC_ELT (datarefs, i, dr)
        {
          stmt = DR_STMT (dr);
          stmt_info = vinfo_for_stmt (stmt);

          /* For interleaving, only the alignment of the first access
             matters.  */
          if (aligned_access_p (dr)
              || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
                  && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
            continue;

          if (STMT_VINFO_STRIDED_P (stmt_info))
            {
              /* Strided loads perform only component accesses, alignment is
                 irrelevant for them.  */
              if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
                continue;
              do_versioning = false;
              break;
            }

          supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);

          if (!supportable_dr_alignment)
            {
              if (known_alignment_for_access_p (dr)
                  || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
                     >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
                {
                  do_versioning = false;
                  break;
                }

              stmt = DR_STMT (dr);
              vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
              gcc_assert (vectype);

              /* At present we don't support versioning for alignment
                 with variable VF, since there's no guarantee that the
                 VF is a power of two.  We could relax this if we added
                 a way of enforcing a power-of-two size.  */
              unsigned HOST_WIDE_INT size;
              if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
                {
                  do_versioning = false;
                  break;
                }

              /* The rightmost bits of an aligned address must be zeros.
                 Construct the mask needed for this test.  For example,
                 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
                 mask must be 15 = 0xf.  */
              mask = size - 1;

              /* FORNOW: use the same mask to test all potentially unaligned
                 references in the loop.  The vectorizer currently supports
                 a single vector size, see the reference to
                 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
                 vectorization factor is computed.  */
              gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
                          || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
              LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
              LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (DR_STMT (dr));
            }
        }

      /* Versioning requires at least one misaligned data reference.  */
      if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
        do_versioning = false;
      else if (!do_versioning)
        LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
    }
  if (do_versioning)
    {
      vec<gimple *> may_misalign_stmts
        = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);

      /* It can now be assumed that the data references in the statements
         in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
         of the loop being vectorized.  */
      FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
        {
          stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
          dr = STMT_VINFO_DATA_REF (stmt_info);
          SET_DR_MISALIGNMENT (dr, 0);
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "Alignment of access forced using versioning.\n");
        }

      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "Versioning for alignment will be applied.\n");

      /* Peeling and versioning can't be done together at this time.  */
      gcc_assert (! (do_peeling && do_versioning));

      stat = vect_verify_datarefs_alignment (loop_vinfo);
      gcc_assert (stat);
      return stat;
    }

  /* This point is reached if neither peeling nor versioning is being done.  */
  gcc_assert (! (do_peeling || do_versioning));

  stat = vect_verify_datarefs_alignment (loop_vinfo);
  return stat;
}
/* Function vect_find_same_alignment_drs.

   Update group and alignment relations according to the chosen
   vectorization factor.  */

static void
vect_find_same_alignment_drs (struct data_dependence_relation *ddr)
{
  struct data_reference *dra = DDR_A (ddr);
  struct data_reference *drb = DDR_B (ddr);
  stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
  stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));

  if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
    return;

  if (dra == drb)
    return;

  if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
      || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
      || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
    return;

  /* Two references with distance zero have the same alignment.  */
  poly_offset_int diff = (wi::to_poly_offset (DR_INIT (dra))
                          - wi::to_poly_offset (DR_INIT (drb)));
  if (maybe_ne (diff, 0))
    {
      /* Get the wider of the two alignments.  */
      unsigned int align_a = (vect_calculate_target_alignment (dra)
                              / BITS_PER_UNIT);
      unsigned int align_b = (vect_calculate_target_alignment (drb)
                              / BITS_PER_UNIT);
      unsigned int max_align = MAX (align_a, align_b);

      /* Require the gap to be a multiple of the larger vector alignment.  */
      if (!multiple_p (diff, max_align))
        return;
    }

  STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
  STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location,
                       "accesses have the same alignment: ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
      dump_printf (MSG_NOTE, " and ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
      dump_printf (MSG_NOTE, "\n");
    }
}
/* Function vect_analyze_data_refs_alignment

   Analyze the alignment of the data-references in the loop.
   Return FALSE if a data reference is found that cannot be vectorized.  */

bool
vect_analyze_data_refs_alignment (loop_vec_info vinfo)
{
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "=== vect_analyze_data_refs_alignment ===\n");

  /* Mark groups of data references with same alignment using
     data dependence information.  */
  vec<ddr_p> ddrs = vinfo->ddrs;
  struct data_dependence_relation *ddr;
  unsigned int i;

  FOR_EACH_VEC_ELT (ddrs, i, ddr)
    vect_find_same_alignment_drs (ddr);

  vec<data_reference_p> datarefs = vinfo->datarefs;
  struct data_reference *dr;

  vect_record_base_alignments (vinfo);
  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
      if (STMT_VINFO_VECTORIZABLE (stmt_info)
          && !vect_compute_data_ref_alignment (dr))
        {
          /* Strided accesses perform only component accesses, misalignment
             information is irrelevant for them.  */
          if (STMT_VINFO_STRIDED_P (stmt_info)
              && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
            continue;

          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "not vectorized: can't calculate alignment "
                             "for data ref.\n");

          return false;
        }
    }

  return true;
}
/* Analyze alignment of DRs of stmts in NODE.  */

static bool
vect_slp_analyze_and_verify_node_alignment (slp_tree node)
{
  /* We vectorize from the first scalar stmt in the node unless
     the node is permuted in which case we start from the first
     element in the group.  */
  gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
  data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
  if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
    first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));

  data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
  if (! vect_compute_data_ref_alignment (dr)
      /* For creating the data-ref pointer we need alignment of the
         first element anyway.  */
      || (dr != first_dr
          && ! vect_compute_data_ref_alignment (first_dr))
      || ! verify_data_ref_alignment (dr))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: bad data alignment in basic "
                         "block.\n");
      return false;
    }

  return true;
}
/* Function vect_slp_analyze_instance_alignment

   Analyze the alignment of the data-references in the SLP instance.
   Return FALSE if a data reference is found that cannot be vectorized.  */

bool
vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
{
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "=== vect_slp_analyze_and_verify_instance_alignment ===\n");

  slp_tree node;
  unsigned i;
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
    if (! vect_slp_analyze_and_verify_node_alignment (node))
      return false;

  node = SLP_INSTANCE_TREE (instance);
  if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]))
      && ! vect_slp_analyze_and_verify_node_alignment
             (SLP_INSTANCE_TREE (instance)))
    return false;

  return true;
}
/* Analyze groups of accesses: check that DR belongs to a group of
   accesses of legal size, step, etc.  Detect gaps, single element
   interleaving, and other special cases.  Set grouped access info.
   Collect groups of strided stores for further use in SLP analysis.
   Worker for vect_analyze_group_access.  */

static bool
vect_analyze_group_access_1 (struct data_reference *dr)
{
  tree step = DR_STEP (dr);
  tree scalar_type = TREE_TYPE (DR_REF (dr));
  HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
  gimple *stmt = DR_STMT (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
  HOST_WIDE_INT dr_step = -1;
  HOST_WIDE_INT groupsize, last_accessed_element = 1;
  bool slp_impossible = false;
  /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
     size of the interleaving group (including gaps).  */
  if (tree_fits_shwi_p (step))
    {
      dr_step = tree_to_shwi (step);
      /* Check that STEP is a multiple of type size.  Otherwise there is
         a non-element-sized gap at the end of the group which we
         cannot represent in GROUP_GAP or GROUP_SIZE.
         ??? As we can handle non-constant step fine here we should
         simply remove uses of GROUP_GAP between the last and first
         element and instead rely on DR_STEP.  GROUP_SIZE then would
         simply not include that gap.  */
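      /* For instance, assuming 4-byte elements and a constant step of 24
         bytes: the step is a multiple of the type size and the group size
         (including gaps) is 24 / 4 == 6 elements.  A step of 26 bytes would
         be rejected below because the trailing 2 bytes cannot be expressed
         as a whole-element gap.  */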
      if ((dr_step % type_size) != 0)
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_NOTE, vect_location,
                               "Step ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
              dump_printf (MSG_NOTE,
                           " is not a multiple of the element size for ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
              dump_printf (MSG_NOTE, "\n");
            }
          return false;
        }
      groupsize = absu_hwi (dr_step) / type_size;
    }
  else
    groupsize = 0;
  /* A non-consecutive access is possible only if it is part of an
     interleaving group.  */
  if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    {
      /* Check if this DR is a part of interleaving, and is a single
         element of the group that is accessed in the loop.  */

      /* Gaps are supported only for loads.  STEP must be a multiple of the
         type size.  */
      if (DR_IS_READ (dr)
          && (dr_step % type_size) == 0
          && groupsize > 0)
        {
          GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
          GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
          GROUP_GAP (stmt_info) = groupsize - 1;
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_NOTE, vect_location,
                               "Detected single element interleaving ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
              dump_printf (MSG_NOTE, " step ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
              dump_printf (MSG_NOTE, "\n");
            }

          return true;
        }

      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "not consecutive access ");
          dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
        }

      if (bb_vinfo)
        {
          /* Mark the statement as unvectorizable.  */
          STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
          return true;
        }

      dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
      STMT_VINFO_STRIDED_P (stmt_info) = true;
      return true;
    }
  if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
    {
      /* First stmt in the interleaving chain.  Check the chain.  */
      gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
      struct data_reference *data_ref = dr;
      unsigned int count = 1;
      tree prev_init = DR_INIT (data_ref);
      gimple *prev = stmt;
      HOST_WIDE_INT diff, gaps = 0;
      /* By construction, all group members have INTEGER_CST DR_INITs.  */
      while (next)
        {
          /* Skip same data-refs.  In case that two or more stmts share
             data-ref (supported only for loads), we vectorize only the first
             stmt, and the rest get their vectorized loads from the first
             stmt.  */
          if (!tree_int_cst_compare (DR_INIT (data_ref),
                                     DR_INIT (STMT_VINFO_DATA_REF (
                                                vinfo_for_stmt (next)))))
            {
              if (DR_IS_WRITE (data_ref))
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "Two store stmts share the same dr.\n");
                  return false;
                }

              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Two or more load stmts share the same dr.\n");

              /* For load use the same data-ref load.  */
              GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;

              prev = next;
              next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
              continue;
            }
          prev = next;
          data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));

          /* All group members have the same STEP by construction.  */
          gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));

          /* Check that the distance between two accesses is equal to the type
             size.  Otherwise, we have gaps.  */
          diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
                  - TREE_INT_CST_LOW (prev_init)) / type_size;
          if (diff != 1)
            {
              /* FORNOW: SLP of accesses with gaps is not supported.  */
              slp_impossible = true;
              if (DR_IS_WRITE (data_ref))
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "interleaved store with gaps\n");
                  return false;
                }

              gaps += diff - 1;
            }

          last_accessed_element += diff;

          /* Store the gap from the previous member of the group.  If there
             is no gap in the access, GROUP_GAP is always 1.  */
          GROUP_GAP (vinfo_for_stmt (next)) = diff;

          prev_init = DR_INIT (data_ref);
          next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
          /* Count the number of data-refs in the chain.  */
          count++;
        }

      if (groupsize == 0)
        groupsize = count + gaps;
      /* This could be UINT_MAX but as we are generating code in a very
         inefficient way we have to cap earlier.  See PR78699 for example.  */
      if (groupsize > 4096)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "group is too large\n");
          return false;
        }

      /* Check that the size of the interleaving is equal to count for stores,
         i.e., that there are no gaps.  */
      if (groupsize != count
          && !DR_IS_READ (dr))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "interleaved store with gaps\n");
          return false;
        }

      /* If there is a gap after the last load in the group it is the
         difference between the groupsize and the last accessed
         element.
         When there is no gap, this difference should be 0.  */
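      /* For instance, assuming a group size of 4 of which only the elements
         at offsets 0, 1 and 2 are loaded: the last accessed element is 3,
         so a gap of 4 - 3 == 1 element is recorded after the group.  */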
      GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;

      GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location,
                           "Detected interleaving ");
          if (DR_IS_READ (dr))
            dump_printf (MSG_NOTE, "load ");
          else
            dump_printf (MSG_NOTE, "store ");
          dump_printf (MSG_NOTE, "of size %u starting with ",
                       (unsigned)groupsize);
          dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
          if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
            dump_printf_loc (MSG_NOTE, vect_location,
                             "There is a gap of %u elements after the group\n",
                             GROUP_GAP (vinfo_for_stmt (stmt)));
        }

      /* SLP: create an SLP data structure for every interleaving group of
         stores for further analysis in vect_analyse_slp.  */
      if (DR_IS_WRITE (dr) && !slp_impossible)
        {
          if (loop_vinfo)
            LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
          if (bb_vinfo)
            BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
        }
    }

  return true;
}
/* Analyze groups of accesses: check that DR belongs to a group of
   accesses of legal size, step, etc.  Detect gaps, single element
   interleaving, and other special cases.  Set grouped access info.
   Collect groups of strided stores for further use in SLP analysis.  */

static bool
vect_analyze_group_access (struct data_reference *dr)
{
  if (!vect_analyze_group_access_1 (dr))
    {
      /* Dissolve the group if present.  */
      gimple *next;
      gimple *stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr)));
      while (stmt)
        {
          stmt_vec_info vinfo = vinfo_for_stmt (stmt);
          next = GROUP_NEXT_ELEMENT (vinfo);
          GROUP_FIRST_ELEMENT (vinfo) = NULL;
          GROUP_NEXT_ELEMENT (vinfo) = NULL;
          stmt = next;
        }
      return false;
    }
  return true;
}
/* Analyze the access pattern of the data-reference DR.
   In case of non-consecutive accesses call vect_analyze_group_access() to
   analyze groups of accesses.  */

static bool
vect_analyze_data_ref_access (struct data_reference *dr)
{
  tree step = DR_STEP (dr);
  tree scalar_type = TREE_TYPE (DR_REF (dr));
  gimple *stmt = DR_STMT (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = NULL;

  if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
    return true;

  if (loop_vinfo)
    loop = LOOP_VINFO_LOOP (loop_vinfo);

  if (loop_vinfo && !step)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "bad data-ref access in loop\n");
      return false;
    }

  /* Allow loads with zero step in inner-loop vectorization.  */
  if (loop_vinfo && integer_zerop (step))
    {
      GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
      if (!nested_in_vect_loop_p (loop, stmt))
        return DR_IS_READ (dr);
      /* Allow references with zero step for outer loops marked
         with pragma omp simd only - it guarantees absence of
         loop-carried dependencies between inner loop iterations.  */
      if (!loop->force_vectorize)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "zero step in inner loop of nest\n");
          return false;
        }
    }
  if (loop && nested_in_vect_loop_p (loop, stmt))
    {
      /* Interleaved accesses are not yet supported within outer-loop
         vectorization for references in the inner-loop.  */
      GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;

      /* For the rest of the analysis we use the outer-loop step.  */
      step = STMT_VINFO_DR_STEP (stmt_info);
      if (integer_zerop (step))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "zero step in outer loop.\n");
          return DR_IS_READ (dr);
        }
    }

  if (TREE_CODE (step) == INTEGER_CST)
    {
      HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
      if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
          || (dr_step < 0
              && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
        {
          /* Mark that it is not interleaving.  */
          GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
          return true;
        }
    }

  if (loop && nested_in_vect_loop_p (loop, stmt))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "grouped access in outer loop.\n");
      return false;
    }

  /* Assume this is a DR handled by non-constant strided load case.  */
  if (TREE_CODE (step) != INTEGER_CST)
    return (STMT_VINFO_STRIDED_P (stmt_info)
            && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
                || vect_analyze_group_access (dr)));

  /* Not consecutive access - check if it's a part of interleaving group.  */
  return vect_analyze_group_access (dr);
}
/* Compare two data-references DRA and DRB to group them into chunks
   suitable for grouping.  */

static int
dr_group_sort_cmp (const void *dra_, const void *drb_)
{
  data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
  data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
  int cmp;

  /* Stabilize sort.  */
  if (dra == drb)
    return 0;

  /* DRs in different loops never belong to the same group.  */
  loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
  loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
  if (loopa != loopb)
    return loopa->num < loopb->num ? -1 : 1;

  /* Ordering of DRs according to base.  */
  cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
                               DR_BASE_ADDRESS (drb));
  if (cmp != 0)
    return cmp;

  /* And according to DR_OFFSET.  */
  cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
  if (cmp != 0)
    return cmp;

  /* Put reads before writes.  */
  if (DR_IS_READ (dra) != DR_IS_READ (drb))
    return DR_IS_READ (dra) ? -1 : 1;

  /* Then sort after access size.  */
  cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
                               TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
  if (cmp != 0)
    return cmp;

  /* And after step.  */
  cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
  if (cmp != 0)
    return cmp;

  /* Then sort after DR_INIT.  In case of identical DRs sort after stmt UID.  */
  cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
  if (cmp == 0)
    return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
  return cmp;
}
/* If OP is the result of a conversion, return the unconverted value,
   otherwise return null.  */

static tree
strip_conversion (tree op)
{
  if (TREE_CODE (op) != SSA_NAME)
    return NULL_TREE;
  gimple *stmt = SSA_NAME_DEF_STMT (op);
  if (!is_gimple_assign (stmt)
      || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
    return NULL_TREE;
  return gimple_assign_rhs1 (stmt);
}
/* Return true if vectorizable_* routines can handle statements STMT1
   and STMT2 being in a single group.  */

static bool
can_group_stmts_p (gimple *stmt1, gimple *stmt2)
{
  if (gimple_assign_single_p (stmt1))
    return gimple_assign_single_p (stmt2);

  if (is_gimple_call (stmt1) && gimple_call_internal_p (stmt1))
    {
      /* Check for two masked loads or two masked stores.  */
      if (!is_gimple_call (stmt2) || !gimple_call_internal_p (stmt2))
        return false;
      internal_fn ifn = gimple_call_internal_fn (stmt1);
      if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
        return false;
      if (ifn != gimple_call_internal_fn (stmt2))
        return false;

      /* Check that the masks are the same.  Cope with casts of masks,
         like those created by build_mask_conversion.  */
      tree mask1 = gimple_call_arg (stmt1, 2);
      tree mask2 = gimple_call_arg (stmt2, 2);
      if (!operand_equal_p (mask1, mask2, 0))
        {
          mask1 = strip_conversion (mask1);
          if (!mask1)
            return false;
          mask2 = strip_conversion (mask2);
          if (!mask2)
            return false;
          if (!operand_equal_p (mask1, mask2, 0))
            return false;
        }
      return true;
    }

  return false;
}
/* Function vect_analyze_data_ref_accesses.

   Analyze the access pattern of all the data references in the loop.

   FORNOW: the only access pattern that is considered vectorizable is a
           simple step 1 (consecutive) access.

   FORNOW: handle only arrays and pointer accesses.  */

bool
vect_analyze_data_ref_accesses (vec_info *vinfo)
{
  unsigned int i;
  vec<data_reference_p> datarefs = vinfo->datarefs;
  struct data_reference *dr;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "=== vect_analyze_data_ref_accesses ===\n");

  if (datarefs.is_empty ())
    return true;

  /* Sort the array of datarefs to make building the interleaving chains
     linear.  Don't modify the original vector's order, it is needed for
     determining what dependencies are reversed.  */
  vec<data_reference_p> datarefs_copy = datarefs.copy ();
  datarefs_copy.qsort (dr_group_sort_cmp);

  /* Build the interleaving chains.  */
  for (i = 0; i < datarefs_copy.length () - 1;)
    {
      data_reference_p dra = datarefs_copy[i];
      stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
      stmt_vec_info lastinfo = NULL;
      if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
          || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
        {
          ++i;
          continue;
        }
      for (i = i + 1; i < datarefs_copy.length (); ++i)
        {
          data_reference_p drb = datarefs_copy[i];
          stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
          if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
              || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
            break;

          /* ??? Imperfect sorting (non-compatible types, non-modulo
             accesses, same accesses) can lead to a group being artificially
             split here as we don't just skip over those.  If it really
             matters we can push those to a worklist and re-iterate
             over them.  Then we can just skip ahead to the next DR here.  */

          /* DRs in a different loop should not be put into the same
             interleaving group.  */
          if (gimple_bb (DR_STMT (dra))->loop_father
              != gimple_bb (DR_STMT (drb))->loop_father)
            break;

          /* Check that the data-refs have same first location (except init)
             and they are both either store or load (not load and store,
             not masked loads or stores).  */
          if (DR_IS_READ (dra) != DR_IS_READ (drb)
              || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
                                        DR_BASE_ADDRESS (drb)) != 0
              || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
              || !can_group_stmts_p (DR_STMT (dra), DR_STMT (drb)))
            break;

          /* Check that the data-refs have the same constant size.  */
          tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
          tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
          if (!tree_fits_uhwi_p (sza)
              || !tree_fits_uhwi_p (szb)
              || !tree_int_cst_equal (sza, szb))
            break;

          /* Check that the data-refs have the same step.  */
          if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
            break;

          /* Check the types are compatible.
             ??? We don't distinguish this during sorting.  */
          if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
                                   TREE_TYPE (DR_REF (drb))))
            break;
          /* Check that the DR_INITs are compile-time constants.  */
          if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST
              || TREE_CODE (DR_INIT (drb)) != INTEGER_CST)
            break;

          /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb).  */
          HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
          HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
          HOST_WIDE_INT init_prev
            = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]));
          gcc_assert (init_a <= init_b
                      && init_a <= init_prev
                      && init_prev <= init_b);

          /* Do not place the same access in the interleaving chain twice.  */
          if (init_b == init_prev)
            {
              gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]))
                          < gimple_uid (DR_STMT (drb)));
              /* ??? For now we simply "drop" the later reference which is
                 otherwise the same rather than finishing off this group.
                 In the end we'd want to re-process duplicates forming
                 multiple groups from the refs, likely by just collecting
                 all candidates (including duplicates and split points
                 below) in a vector and then process them together.  */
              continue;
            }
          /* If init_b == init_a + the size of the type * k, we have an
             interleaving, and DRA is accessed before DRB.  */
          HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
          if (type_size_a == 0
              || (init_b - init_a) % type_size_a != 0)
            break;

          /* If we have a store, the accesses are adjacent.  This splits
             groups into chunks we support (we don't support vectorization
             of stores with gaps).  */
          if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
            break;

          /* If the step (if not zero or non-constant) is greater than the
             difference between data-refs' inits this splits groups into
             suitable sizes.  */
          if (tree_fits_shwi_p (DR_STEP (dra)))
            {
              HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
              if (step != 0 && step <= (init_b - init_a))
                break;
            }

          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_NOTE, vect_location,
                               "Detected interleaving ");
              if (DR_IS_READ (dra))
                dump_printf (MSG_NOTE, "load ");
              else
                dump_printf (MSG_NOTE, "store ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
              dump_printf (MSG_NOTE, " and ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
              dump_printf (MSG_NOTE, "\n");
            }

          /* Link the found element into the group list.  */
          if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
            {
              GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
              lastinfo = stmtinfo_a;
            }
          GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
          GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
          lastinfo = stmtinfo_b;
        }
    }
  FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
    if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
        && !vect_analyze_data_ref_access (dr))
      {
        if (dump_enabled_p ())
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "not vectorized: complicated access pattern.\n");

        if (is_a <bb_vec_info> (vinfo))
          {
            /* Mark the statement as not vectorizable.  */
            STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
            continue;
          }
        else
          {
            datarefs_copy.release ();
            return false;
          }
      }

  datarefs_copy.release ();
  return true;
}
/* Function vect_vfa_segment_size.

   Input:
     DR: The data reference.
     LENGTH_FACTOR: segment length to consider.

   Return a value suitable for the dr_with_seg_len::seg_len field.
   This is the "distance travelled" by the pointer from the first
   iteration in the segment to the last.  Note that it does not include
   the size of the access; in effect it only describes the first byte.  */
static tree
vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
{
  length_factor = size_binop (MINUS_EXPR,
                              fold_convert (sizetype, length_factor),
                              size_one_node);
  return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr)),
                     length_factor);
}
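/* For instance, assuming DR_STEP is 4 bytes and LENGTH_FACTOR is 16
   iterations: the value computed above is 4 * (16 - 1) == 60 bytes, the
   distance from the first byte accessed in the first iteration of the
   segment to the first byte accessed in the last one.  */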
/* Return a value that, when added to abs (vect_vfa_segment_size (dr)),
   gives the worst-case number of bytes covered by the segment.  */

static unsigned HOST_WIDE_INT
vect_vfa_access_size (data_reference *dr)
{
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (DR_STMT (dr));
  tree ref_type = TREE_TYPE (DR_REF (dr));
  unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
  unsigned HOST_WIDE_INT access_size = ref_size;
  if (GROUP_FIRST_ELEMENT (stmt_vinfo))
    {
      gcc_assert (GROUP_FIRST_ELEMENT (stmt_vinfo) == DR_STMT (dr));
      access_size *= GROUP_SIZE (stmt_vinfo) - GROUP_GAP (stmt_vinfo);
    }
  if (STMT_VINFO_VEC_STMT (stmt_vinfo)
      && (vect_supportable_dr_alignment (dr, false)
          == dr_explicit_realign_optimized))
    {
      /* We might access a full vector's worth.  */
      tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
      access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
    }
  return access_size;
}
/* Get the minimum alignment for all the scalar accesses that DR describes.  */

static unsigned int
vect_vfa_align (const data_reference *dr)
{
  return TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr)));
}
/* Function vect_compile_time_alias.

   Given data references A and B with equal base and offset, see whether
   the alias relation can be decided at compilation time.  Return 1 if
   it can and the references alias, 0 if it can and the references do
   not alias, and -1 if we cannot decide at compile time.  SEGMENT_LENGTH_A,
   SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
   of dr_with_seg_len::{seg_len,access_size} for A and B.  */
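/* For instance, assuming positive steps, 4-byte accesses, 12-byte segment
   lengths and DR_INITs of 0 and 16: the covered ranges work out to
   [0, 0 + 12 + 4) and [16, 16 + 12 + 4), which cannot overlap, so the
   function below would return 0 (no alias).  */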
static int
vect_compile_time_alias (struct data_reference *a, struct data_reference *b,
                         tree segment_length_a, tree segment_length_b,
                         unsigned HOST_WIDE_INT access_size_a,
                         unsigned HOST_WIDE_INT access_size_b)
{
  poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a));
  poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b));
  poly_uint64 const_length_a;
  poly_uint64 const_length_b;

  /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
     bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
     [a, a+12).  */
  if (tree_int_cst_compare (DR_STEP (a), size_zero_node) < 0)
    {
      const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
      offset_a = (offset_a + access_size_a) - const_length_a;
    }
  else
    const_length_a = tree_to_poly_uint64 (segment_length_a);
  if (tree_int_cst_compare (DR_STEP (b), size_zero_node) < 0)
    {
      const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
      offset_b = (offset_b + access_size_b) - const_length_b;
    }
  else
    const_length_b = tree_to_poly_uint64 (segment_length_b);

  const_length_a += access_size_a;
  const_length_b += access_size_b;

  if (ranges_known_overlap_p (offset_a, const_length_a,
                              offset_b, const_length_b))
    return 1;

  if (!ranges_maybe_overlap_p (offset_a, const_length_a,
                               offset_b, const_length_b))
    return 0;

  return -1;
}
/* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
   in DDR is >= VF.  */

static bool
dependence_distance_ge_vf (data_dependence_relation *ddr,
                           unsigned int loop_depth, poly_uint64 vf)
{
  if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
      || DDR_NUM_DIST_VECTS (ddr) == 0)
    return false;

  /* If the dependence is exact, we should have limited the VF instead.  */
  gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));

  unsigned int i;
  lambda_vector dist_v;
  FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
    {
      HOST_WIDE_INT dist = dist_v[loop_depth];
      if (dist != 0
          && !(dist > 0 && DDR_REVERSED_P (ddr))
          && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
        return false;
    }

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location,
                       "dependence distance between ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
      dump_printf (MSG_NOTE, " and ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
      dump_printf (MSG_NOTE, " is >= VF\n");
    }

  return true;
}
3230 dump_lower_bound (int dump_kind
, const vec_lower_bound
&lower_bound
)
3232 dump_printf (dump_kind
, "%s (", lower_bound
.unsigned_p
? "unsigned" : "abs");
3233 dump_generic_expr (dump_kind
, TDF_SLIM
, lower_bound
.expr
);
3234 dump_printf (dump_kind
, ") >= ");
3235 dump_dec (dump_kind
, lower_bound
.min_value
);
/* Record that the vectorized loop requires the vec_lower_bound described
   by EXPR, UNSIGNED_P and MIN_VALUE.  */

static void
vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
                        poly_uint64 min_value)
{
  vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
  for (unsigned int i = 0; i < lower_bounds.length (); ++i)
    if (operand_equal_p (lower_bounds[i].expr, expr, 0))
      {
        unsigned_p &= lower_bounds[i].unsigned_p;
        min_value = upper_bound (lower_bounds[i].min_value, min_value);
        if (lower_bounds[i].unsigned_p != unsigned_p
            || maybe_lt (lower_bounds[i].min_value, min_value))
          {
            lower_bounds[i].unsigned_p = unsigned_p;
            lower_bounds[i].min_value = min_value;
            if (dump_enabled_p ())
              {
                dump_printf_loc (MSG_NOTE, vect_location,
                                 "updating run-time check to ");
                dump_lower_bound (MSG_NOTE, lower_bounds[i]);
                dump_printf (MSG_NOTE, "\n");
              }
          }
        return;
      }

  vec_lower_bound lower_bound (expr, unsigned_p, min_value);
  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
      dump_lower_bound (MSG_NOTE, lower_bound);
      dump_printf (MSG_NOTE, "\n");
    }
  LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
}
/* Return true if it's unlikely that the step of the vectorized form of DR
   will span fewer than GAP bytes.  */

static bool
vect_small_gap_p (loop_vec_info loop_vinfo, data_reference *dr, poly_int64 gap)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
  HOST_WIDE_INT count
    = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
  if (GROUP_FIRST_ELEMENT (stmt_info))
    count *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
  return estimated_poly_value (gap) <= count * vect_get_scalar_dr_size (dr);
}
/* Return true if we know that there is no alias between DR_A and DR_B
   when abs (DR_STEP (DR_A)) >= N for some N.  When returning true, set
   *LOWER_BOUND_OUT to this N.  */
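/* For instance, assuming DR_A and DR_B differ only in DR_INIT (0 and 8
   bytes respectively) and each accesses 4 bytes per iteration: the two
   accesses together span 8 + 4 - 0 == 12 bytes, so any common step whose
   absolute value is at least 12 keeps the two segments from overlapping.  */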
static bool
vectorizable_with_step_bound_p (data_reference *dr_a, data_reference *dr_b,
                                poly_uint64 *lower_bound_out)
{
  /* Check that there is a constant gap of known sign between DR_A
     and DR_B.  */
  poly_int64 init_a, init_b;
  if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
      || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
      || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
      || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
      || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
      || !ordered_p (init_a, init_b))
    return false;

  /* Sort DR_A and DR_B by the address they access.  */
  if (maybe_lt (init_b, init_a))
    {
      std::swap (init_a, init_b);
      std::swap (dr_a, dr_b);
    }

  /* If the two accesses could be dependent within a scalar iteration,
     make sure that we'd retain their order.  */
  if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_a), init_b)
      && !vect_preserves_scalar_order_p (DR_STMT (dr_a), DR_STMT (dr_b)))
    return false;

  /* There is no alias if abs (DR_STEP) is greater than or equal to
     the bytes spanned by the combination of the two accesses.  */
  *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_b) - init_a;
  return true;
}
/* Function vect_prune_runtime_alias_test_list.

   Prune a list of ddrs to be tested at run-time by versioning for alias.
   Merge several alias checks into one if possible.
   Return FALSE if the resulting list of ddrs is longer than allowed by
   PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE.  */

bool
vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
{
  typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
  hash_set <tree_pair_hash> compared_objects;

  vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
  vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
    = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
  vec<vec_object_pair> &check_unequal_addrs
    = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
  poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);

  ddr_p ddr;
  unsigned int i;
  tree length_factor;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "=== vect_prune_runtime_alias_test_list ===\n");

  /* Step values are irrelevant for aliasing if the number of vector
     iterations is equal to the number of scalar iterations (which can
     happen for fully-SLP loops).  */
  bool ignore_step_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);

  if (!ignore_step_p)
    {
      /* Convert the checks for nonzero steps into bound tests.  */
      tree value;
      FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
        vect_check_lower_bound (loop_vinfo, value, true, 1);
    }

  if (may_alias_ddrs.is_empty ())
    return true;

  comp_alias_ddrs.create (may_alias_ddrs.length ());

  unsigned int loop_depth
    = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
                          LOOP_VINFO_LOOP_NEST (loop_vinfo));
3381 FOR_EACH_VEC_ELT (may_alias_ddrs
, i
, ddr
)
3384 poly_uint64 lower_bound
;
3385 struct data_reference
*dr_a
, *dr_b
;
3386 gimple
*dr_group_first_a
, *dr_group_first_b
;
3387 tree segment_length_a
, segment_length_b
;
3388 unsigned HOST_WIDE_INT access_size_a
, access_size_b
;
3389 unsigned int align_a
, align_b
;
3390 gimple
*stmt_a
, *stmt_b
;
3392 /* Ignore the alias if the VF we chose ended up being no greater
3393 than the dependence distance. */
3394 if (dependence_distance_ge_vf (ddr
, loop_depth
, vect_factor
))
3397 if (DDR_OBJECT_A (ddr
))
3399 vec_object_pair
new_pair (DDR_OBJECT_A (ddr
), DDR_OBJECT_B (ddr
));
3400 if (!compared_objects
.add (new_pair
))
3402 if (dump_enabled_p ())
3404 dump_printf_loc (MSG_NOTE
, vect_location
, "checking that ");
3405 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, new_pair
.first
);
3406 dump_printf (MSG_NOTE
, " and ");
3407 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, new_pair
.second
);
3408 dump_printf (MSG_NOTE
, " have different addresses\n");
3410 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo
).safe_push (new_pair
);
      stmt_a = DR_STMT (DDR_A (ddr));
      dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
      stmt_b = DR_STMT (DDR_B (ddr));
      dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));

      /* Skip the pair if inter-iteration dependencies are irrelevant
         and intra-iteration dependencies are guaranteed to be honored.  */
      if (ignore_step_p
          && (vect_preserves_scalar_order_p (stmt_a, stmt_b)
              || vectorizable_with_step_bound_p (dr_a, dr_b, &lower_bound)))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_NOTE, vect_location,
                               "no need for alias check between ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
              dump_printf (MSG_NOTE, " and ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
              dump_printf (MSG_NOTE, " when VF is 1\n");
            }
          continue;
        }
      /* See whether we can handle the alias using a bounds check on
         the step, and whether that's likely to be the best approach.
         (It might not be, for example, if the minimum step is much larger
         than the number of bytes handled by one vector iteration.)  */
      if (!ignore_step_p
          && TREE_CODE (DR_STEP (dr_a)) != INTEGER_CST
          && vectorizable_with_step_bound_p (dr_a, dr_b, &lower_bound)
          && (vect_small_gap_p (loop_vinfo, dr_a, lower_bound)
              || vect_small_gap_p (loop_vinfo, dr_b, lower_bound)))
        {
          bool unsigned_p = dr_known_forward_stride_p (dr_a);
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_NOTE, vect_location, "no alias between ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
              dump_printf (MSG_NOTE, " and ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
              dump_printf (MSG_NOTE, " when the step ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_STEP (dr_a));
              dump_printf (MSG_NOTE, " is outside ");
              if (unsigned_p)
                dump_printf (MSG_NOTE, "[0");
              else
                {
                  dump_printf (MSG_NOTE, "(");
                  dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
                }
              dump_printf (MSG_NOTE, ", ");
              dump_dec (MSG_NOTE, lower_bound);
              dump_printf (MSG_NOTE, ")\n");
            }
          vect_check_lower_bound (loop_vinfo, DR_STEP (dr_a), unsigned_p,
                                  lower_bound);
          continue;
        }
= GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a
));
3476 if (dr_group_first_a
)
3478 stmt_a
= dr_group_first_a
;
3479 dr_a
= STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a
));
3482 dr_group_first_b
= GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b
));
3483 if (dr_group_first_b
)
3485 stmt_b
= dr_group_first_b
;
3486 dr_b
= STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b
));
3491 segment_length_a
= size_zero_node
;
3492 segment_length_b
= size_zero_node
;
3496 if (!operand_equal_p (DR_STEP (dr_a
), DR_STEP (dr_b
), 0))
3497 length_factor
= scalar_loop_iters
;
3499 length_factor
= size_int (vect_factor
);
3500 segment_length_a
= vect_vfa_segment_size (dr_a
, length_factor
);
3501 segment_length_b
= vect_vfa_segment_size (dr_b
, length_factor
);
3503 access_size_a
= vect_vfa_access_size (dr_a
);
3504 access_size_b
= vect_vfa_access_size (dr_b
);
3505 align_a
= vect_vfa_align (dr_a
);
3506 align_b
= vect_vfa_align (dr_b
);
      comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_a),
                                        DR_BASE_ADDRESS (dr_b));
      if (comp_res == 0)
        comp_res = data_ref_compare_tree (DR_OFFSET (dr_a),
                                          DR_OFFSET (dr_b));

      /* See whether the alias is known at compilation time.  */
      if (comp_res == 0
          && TREE_CODE (DR_STEP (dr_a)) == INTEGER_CST
          && TREE_CODE (DR_STEP (dr_b)) == INTEGER_CST
          && poly_int_tree_p (segment_length_a)
          && poly_int_tree_p (segment_length_b))
        {
          int res = vect_compile_time_alias (dr_a, dr_b,
                                             segment_length_a,
                                             segment_length_b,
                                             access_size_a,
                                             access_size_b);
          if (res >= 0 && dump_enabled_p ())
            {
              dump_printf_loc (MSG_NOTE, vect_location,
                               "can tell at compile time that ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
              dump_printf (MSG_NOTE, " and ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
              if (res == 0)
                dump_printf (MSG_NOTE, " do not alias\n");
              else
                dump_printf (MSG_NOTE, " alias\n");
            }

          if (res == 0)
            continue;

          if (res == 1)
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_NOTE, vect_location,
                                 "not vectorized: compilation time alias.\n");
              return false;
            }
        }
      dr_with_seg_len_pair_t dr_with_seg_len_pair
        (dr_with_seg_len (dr_a, segment_length_a, access_size_a, align_a),
         dr_with_seg_len (dr_b, segment_length_b, access_size_b, align_b));

      /* Canonicalize pairs by sorting the two DR members.  */
      if (comp_res > 0)
        std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);

      comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
    }

  prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);

  unsigned int count = (comp_alias_ddrs.length ()
                        + check_unequal_addrs.length ());

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "improved number of alias checks from %d to %d\n",
                     may_alias_ddrs.length (), count);
  if ((int) count > PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "number of versioning for alias "
                         "run-time tests exceeds %d "
                         "(--param vect-max-version-for-alias-checks)\n",
                         PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
      return false;
    }

  return true;
}
/* Check whether we can use an internal function for a gather load
   or scatter store.  READ_P is true for loads and false for stores.
   MASKED_P is true if the load or store is conditional.  MEMORY_TYPE is
   the type of the memory elements being loaded or stored.  OFFSET_BITS
   is the number of bits in each scalar offset and OFFSET_SIGN is the
   sign of the offset.  SCALE is the amount by which the offset should
   be multiplied *after* it has been converted to address width.

   Return true if the function is supported, storing the function
   id in *IFN_OUT and the type of a vector element in *ELEMENT_TYPE_OUT.  */
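/* For instance, assuming a gather load of 32-bit elements with 16-bit
   signed offsets: OFFSET_BITS (16) does not exceed the element width, the
   memory and vector element widths match, and the decision below reduces
   to whether the target implements IFN_GATHER_LOAD (or
   IFN_MASK_GATHER_LOAD when MASKED_P) for that combination.  */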
static bool
vect_gather_scatter_fn_p (bool read_p, bool masked_p, tree vectype,
                          tree memory_type, unsigned int offset_bits,
                          signop offset_sign, int scale,
                          internal_fn *ifn_out, tree *element_type_out)
{
  unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
  unsigned int element_bits = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype)));
  if (offset_bits > element_bits)
    /* Internal functions require the offset to be the same width as
       the vector elements.  We can extend narrower offsets, but it isn't
       safe to truncate wider offsets.  */
    return false;

  if (element_bits != memory_bits)
    /* For now the vector elements must be the same width as the
       memory elements.  */
    return false;

  /* Work out which function we need.  */
  internal_fn ifn;
  if (read_p)
    ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
  else
    ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;

  /* Test whether the target supports this combination.  */
  if (!internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
                                               offset_sign, scale))
    return false;

  *ifn_out = ifn;
  *element_type_out = TREE_TYPE (vectype);
  return true;
}
/* CALL is a call to an internal gather load or scatter store function.
   Describe the operation in INFO.  */

static void
vect_describe_gather_scatter_call (gcall *call, gather_scatter_info *info)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (call);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);

  info->ifn = gimple_call_internal_fn (call);
  info->decl = NULL_TREE;
  info->base = gimple_call_arg (call, 0);
  info->offset = gimple_call_arg (call, 1);
  info->offset_dt = vect_unknown_def_type;
  info->offset_vectype = NULL_TREE;
  info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
  info->element_type = TREE_TYPE (vectype);
  info->memory_type = TREE_TYPE (DR_REF (dr));
}
3652 /* Return true if a non-affine read or write in STMT is suitable for a
3653 gather load or scatter store. Describe the operation in *INFO if so. */
3656 vect_check_gather_scatter (gimple
*stmt
, loop_vec_info loop_vinfo
,
3657 gather_scatter_info
*info
)
3659 HOST_WIDE_INT scale
= 1;
3660 poly_int64 pbitpos
, pbitsize
;
3661 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
3662 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
3663 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
3664 tree offtype
= NULL_TREE
;
3665 tree decl
= NULL_TREE
, base
, off
;
3666 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
3667 tree memory_type
= TREE_TYPE (DR_REF (dr
));
3669 int punsignedp
, reversep
, pvolatilep
= 0;
3672 bool masked_p
= false;
3674 /* See whether this is already a call to a gather/scatter internal function.
3675 If not, see whether it's a masked load or store. */
3676 gcall *call = dyn_cast <gcall *> (stmt);
3677 if (call && gimple_call_internal_p (call))
3679 ifn = gimple_call_internal_fn (stmt);
3680 if (internal_gather_scatter_fn_p (ifn))
3682 vect_describe_gather_scatter_call (call, info);
3685 masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
3688 /* True if we should aim to use internal functions rather than
3689 built-in functions. */
3690 bool use_ifn_p = (DR_IS_READ (dr)
3691 ? supports_vec_gather_load_p ()
3692 : supports_vec_scatter_store_p ());
3695 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3696 see if we can use the def stmt of the address. */
3698 && TREE_CODE (base) == MEM_REF
3699 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3700 && integer_zerop (TREE_OPERAND (base, 1))
3701 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3703 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3704 if (is_gimple_assign (def_stmt)
3705 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3706 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3709 /* The gather and scatter builtins need address of the form
3710 loop_invariant + vector * {1, 2, 4, 8}
3712 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3713 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3714 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3715 multiplications and additions in it. To get a vector, we need
3716 a single SSA_NAME that will be defined in the loop and will
3717 contain everything that is not loop invariant and that can be
3718 vectorized. The following code attempts to find such a preexisting
3719 SSA_NAME OFF and put the loop invariants into a tree BASE
3720 that can be gimplified before the loop. */
3721 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3722 &punsignedp, &reversep, &pvolatilep);
3723 gcc_assert (base && !reversep);
3724 poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
3726 if (TREE_CODE (base) == MEM_REF)
3728 if (!integer_zerop (TREE_OPERAND (base, 1)))
3730 if (off == NULL_TREE)
3731 off = wide_int_to_tree (sizetype, mem_ref_offset (base));
3733 off = size_binop (PLUS_EXPR, off,
3734 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3736 base = TREE_OPERAND (base, 0);
3739 base = build_fold_addr_expr (base);
3741 if (off == NULL_TREE)
3742 off = size_zero_node;
3744 /* If BASE is not loop invariant, then either OFF is 0, in which case we
3745 start with just the constant offset in the loop invariant BASE and
3746 continue with base as OFF, or we give up.
3747 We could handle the latter case by gimplifying the addition of base + off
3748 into some SSA_NAME and using that as OFF, but for now punt. */
3749 if (!expr_invariant_in_loop_p (loop, base))
3751 if (!integer_zerop (off))
3754 base = size_int (pbytepos);
3756 /* Otherwise put base + constant offset into the loop invariant BASE
3757 and continue with OFF. */
3760 base = fold_convert (sizetype, base);
3761 base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
3764 /* OFF at this point may be either a SSA_NAME or some tree expression
3765 from get_inner_reference. Try to peel off loop invariants from it
3766 into BASE as long as possible. */
3768 while (offtype == NULL_TREE)
3770 enum tree_code code;
3771 tree op0, op1, add = NULL_TREE;
3773 if (TREE_CODE (off) == SSA_NAME)
3775 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3777 if (expr_invariant_in_loop_p (loop, off))
3780 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3783 op0 = gimple_assign_rhs1 (def_stmt);
3784 code = gimple_assign_rhs_code (def_stmt);
3785 op1 = gimple_assign_rhs2 (def_stmt);
3789 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3791 code = TREE_CODE (off);
3792 extract_ops_from_tree (off, &code, &op0, &op1);
3796 case POINTER_PLUS_EXPR:
3798 if (expr_invariant_in_loop_p (loop, op0))
3803 add = fold_convert (sizetype, add);
3805 add = size_binop (MULT_EXPR, add, size_int (scale));
3806 base = size_binop (PLUS_EXPR, base, add);
3809 if (expr_invariant_in_loop_p (loop, op1))
3817 if (expr_invariant_in_loop_p (loop, op1))
3819 add = fold_convert (sizetype, op1);
3820 add = size_binop (MINUS_EXPR, size_zero_node, add);
3826 if (scale == 1 && tree_fits_shwi_p (op1))
3828 int new_scale = tree_to_shwi (op1);
3829 /* Only treat this as a scaling operation if the target
3832 && !vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p,
3833 vectype, memory_type, 1,
3834 TYPE_SIGN (TREE_TYPE (op0)),
3847 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3848 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3850 if (TYPE_PRECISION (TREE_TYPE (op0))
3851 == TYPE_PRECISION (TREE_TYPE (off)))
3857 /* The internal functions need the offset to be the same width
3858 as the elements of VECTYPE. Don't include operations that
3859 cast the offset from that width to a different width. */
3861 && (int_size_in_bytes (TREE_TYPE (vectype))
3862 == int_size_in_bytes (TREE_TYPE (off))))
3865 if (TYPE_PRECISION (TREE_TYPE (op0))
3866 < TYPE_PRECISION (TREE_TYPE (off)))
3869 offtype = TREE_TYPE (off);
3880 /* If at the end OFF still isn't a SSA_NAME or isn't
3881 defined in the loop, punt. */
3882 if (TREE_CODE (off) != SSA_NAME
3883 || expr_invariant_in_loop_p (loop, off))
3886 if (offtype == NULL_TREE)
3887 offtype = TREE_TYPE (off);
3891 if (!vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p, vectype,
3892 memory_type, TYPE_PRECISION (offtype),
3893 TYPE_SIGN (offtype), scale, &ifn,
3899 if (DR_IS_READ (dr))
3901 if (targetm.vectorize.builtin_gather)
3902 decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
3906 if (targetm.vectorize.builtin_scatter)
3907 decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
3914 element_type = TREE_TYPE (vectype);
3921 info->offset_dt = vect_unknown_def_type;
3922 info->offset_vectype = NULL_TREE;
3923 info->scale = scale;
3924 info->element_type = element_type;
3925 info->memory_type = memory_type;
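/* A rough illustration of the decomposition above (hypothetical code,
   not from a testcase): for a read

     ... = p[idx[i]];

   in the loop, the loop-invariant pointer 'p' ends up in INFO->base,
   the loop-varying SSA name holding 'idx[i]' (possibly after a widening
   cast) becomes INFO->offset, and the element size of *p becomes
   INFO->scale, giving the required base + vector_offset * scale form.  */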
3929 /* Function vect_analyze_data_refs.
3931 Find all the data references in the loop or basic block.
3933 The general structure of the analysis of data refs in the vectorizer is as
3935 1- vect_analyze_data_refs(loop/bb): call
3936 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3937 in the loop/bb and their dependences.
3938 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3939 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3940 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3945 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf)
3947 struct loop *loop = NULL;
3949 struct data_reference *dr;
3952 if (dump_enabled_p ())
3953 dump_printf_loc (MSG_NOTE, vect_location,
3954 "=== vect_analyze_data_refs ===\n");
3956 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3957 loop = LOOP_VINFO_LOOP (loop_vinfo);
3959 /* Go through the data-refs, check that the analysis succeeded. Update
3960 pointer from stmt_vec_info struct to DR and vectype. */
3962 vec<data_reference_p> datarefs = vinfo->datarefs;
3963 FOR_EACH_VEC_ELT (datarefs, i, dr)
3966 stmt_vec_info stmt_info
;
3967 tree base
, offset
, init
;
3968 enum { SG_NONE
, GATHER
, SCATTER
} gatherscatter
= SG_NONE
;
3969 bool simd_lane_access
= false;
3973 if (!dr
|| !DR_REF (dr
))
3975 if (dump_enabled_p ())
3976 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3977 "not vectorized: unhandled data-ref\n");
3981 stmt
= DR_STMT (dr
);
3982 stmt_info
= vinfo_for_stmt (stmt
);
3984 /* Discard clobbers from the dataref vector. We will remove
3985 clobber stmts during vectorization. */
3986 if (gimple_clobber_p (stmt
))
3989 if (i
== datarefs
.length () - 1)
3994 datarefs
.ordered_remove (i
);
3999 /* Check that analysis of the data-ref succeeded. */
4000 if (!DR_BASE_ADDRESS (dr
) || !DR_OFFSET (dr
) || !DR_INIT (dr
)
4005 && !TREE_THIS_VOLATILE (DR_REF (dr
))
4006 && (targetm
.vectorize
.builtin_gather
!= NULL
4007 || supports_vec_gather_load_p ());
4010 && !TREE_THIS_VOLATILE (DR_REF (dr
))
4011 && (targetm
.vectorize
.builtin_scatter
!= NULL
4012 || supports_vec_scatter_store_p ());
4013 bool maybe_simd_lane_access
4014 = is_a
<loop_vec_info
> (vinfo
) && loop
->simduid
;
4016 /* If the target supports vector gather loads or scatter stores, or if
4017 this might be a SIMD lane access, check whether they can be used. */
4018 if (is_a
<loop_vec_info
> (vinfo
)
4019 && (maybe_gather
|| maybe_scatter
|| maybe_simd_lane_access
)
4020 && !nested_in_vect_loop_p (loop
, stmt
))
4022 struct data_reference
*newdr
4023 = create_data_ref (NULL
, loop_containing_stmt (stmt
),
4024 DR_REF (dr
), stmt
, !maybe_scatter
,
4025 DR_IS_CONDITIONAL_IN_STMT (dr
));
4026 gcc_assert (newdr
!= NULL
&& DR_REF (newdr
));
4027 if (DR_BASE_ADDRESS (newdr
)
4028 && DR_OFFSET (newdr
)
4031 && integer_zerop (DR_STEP (newdr
)))
4033 if (maybe_simd_lane_access
)
4035 tree off
= DR_OFFSET (newdr
);
4037 if (TREE_CODE (DR_INIT (newdr
)) == INTEGER_CST
4038 && TREE_CODE (off
) == MULT_EXPR
4039 && tree_fits_uhwi_p (TREE_OPERAND (off
, 1)))
4041 tree step
= TREE_OPERAND (off
, 1);
4042 off
= TREE_OPERAND (off
, 0);
4044 if (CONVERT_EXPR_P (off
)
4045 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off
,
4047 < TYPE_PRECISION (TREE_TYPE (off
)))
4048 off
= TREE_OPERAND (off
, 0);
4049 if (TREE_CODE (off
) == SSA_NAME
)
4051 gimple
*def
= SSA_NAME_DEF_STMT (off
);
4052 tree reft
= TREE_TYPE (DR_REF (newdr
));
4053 if (is_gimple_call (def
)
4054 && gimple_call_internal_p (def
)
4055 && (gimple_call_internal_fn (def
)
4056 == IFN_GOMP_SIMD_LANE
))
4058 tree arg
= gimple_call_arg (def
, 0);
4059 gcc_assert (TREE_CODE (arg
) == SSA_NAME
);
4060 arg
= SSA_NAME_VAR (arg
);
4061 if (arg
== loop
->simduid
4063 && tree_int_cst_equal
4064 (TYPE_SIZE_UNIT (reft
),
4067 DR_OFFSET (newdr
) = ssize_int (0);
4068 DR_STEP (newdr
) = step
;
4069 DR_OFFSET_ALIGNMENT (newdr
)
4070 = BIGGEST_ALIGNMENT
;
4071 DR_STEP_ALIGNMENT (newdr
)
4072 = highest_pow2_factor (step
);
4074 simd_lane_access
= true;
4080 if (!simd_lane_access
&& (maybe_gather
|| maybe_scatter
))
4084 gatherscatter
= GATHER
;
4086 gatherscatter
= SCATTER
;
4089 if (gatherscatter
== SG_NONE
&& !simd_lane_access
)
4090 free_data_ref (newdr
);
4093 if (gatherscatter
== SG_NONE
&& !simd_lane_access
)
4095 if (dump_enabled_p ())
4097 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4098 "not vectorized: data ref analysis "
4100 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
4103 if (is_a
<bb_vec_info
> (vinfo
))
4110 if (TREE_CODE (DR_BASE_ADDRESS (dr
)) == INTEGER_CST
)
4112 if (dump_enabled_p ())
4113 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4114 "not vectorized: base addr of dr is a "
4117 if (is_a
<bb_vec_info
> (vinfo
))
4120 if (gatherscatter
!= SG_NONE
|| simd_lane_access
)
4125 if (TREE_THIS_VOLATILE (DR_REF (dr
)))
4127 if (dump_enabled_p ())
4129 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4130 "not vectorized: volatile type ");
4131 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
4134 if (is_a
<bb_vec_info
> (vinfo
))
4140 if (stmt_can_throw_internal (stmt
))
4142 if (dump_enabled_p ())
4144 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4145 "not vectorized: statement can throw an "
4147 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
4150 if (is_a
<bb_vec_info
> (vinfo
))
4153 if (gatherscatter
!= SG_NONE
|| simd_lane_access
)
4158 if (TREE_CODE (DR_REF (dr
)) == COMPONENT_REF
4159 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr
), 1)))
4161 if (dump_enabled_p ())
4163 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4164 "not vectorized: statement is bitfield "
4166 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
4169 if (is_a
<bb_vec_info
> (vinfo
))
4172 if (gatherscatter
!= SG_NONE
|| simd_lane_access
)
4177 base
= unshare_expr (DR_BASE_ADDRESS (dr
));
4178 offset
= unshare_expr (DR_OFFSET (dr
));
4179 init
= unshare_expr (DR_INIT (dr
));
4181 if (is_gimple_call (stmt
)
4182 && (!gimple_call_internal_p (stmt
)
4183 || (gimple_call_internal_fn (stmt
) != IFN_MASK_LOAD
4184 && gimple_call_internal_fn (stmt
) != IFN_MASK_STORE
)))
4186 if (dump_enabled_p ())
4188 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4189 "not vectorized: dr in a call ");
4190 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
4193 if (is_a
<bb_vec_info
> (vinfo
))
4196 if (gatherscatter
!= SG_NONE
|| simd_lane_access
)
4201 /* Update DR field in stmt_vec_info struct. */
4203 /* If the dataref is in an inner-loop of the loop that is considered
4204 for vectorization, we also want to analyze the access relative to
4205 the outer-loop (DR contains information only relative to the
4206 inner-most enclosing loop). We do that by building a reference to the
4207 first location accessed by the inner-loop, and analyze it relative to
4209 if (loop
&& nested_in_vect_loop_p (loop
, stmt
))
4211 /* Build a reference to the first location accessed by the
4212 inner loop: *(BASE + INIT + OFFSET). By construction,
4213 this address must be invariant in the inner loop, so we
4214 can consider it as being used in the outer loop. */
4215 tree init_offset
= fold_build2 (PLUS_EXPR
, TREE_TYPE (offset
),
4217 tree init_addr
= fold_build_pointer_plus (base
, init_offset
);
4218 tree init_ref
= build_fold_indirect_ref (init_addr
);
4220 if (dump_enabled_p ())
4222 dump_printf_loc (MSG_NOTE
, vect_location
,
4223 "analyze in outer loop: ");
4224 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, init_ref
);
4225 dump_printf (MSG_NOTE
, "\n");
4228 if (!dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info
),
4230 /* dr_analyze_innermost already explained the failure. */
4233 if (dump_enabled_p ())
4235 dump_printf_loc (MSG_NOTE
, vect_location
,
4236 "\touter base_address: ");
4237 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
4238 STMT_VINFO_DR_BASE_ADDRESS (stmt_info
));
4239 dump_printf (MSG_NOTE
, "\n\touter offset from base address: ");
4240 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
4241 STMT_VINFO_DR_OFFSET (stmt_info
));
4242 dump_printf (MSG_NOTE
,
4243 "\n\touter constant offset from base address: ");
4244 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
4245 STMT_VINFO_DR_INIT (stmt_info
));
4246 dump_printf (MSG_NOTE
, "\n\touter step: ");
4247 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
4248 STMT_VINFO_DR_STEP (stmt_info
));
4249 dump_printf (MSG_NOTE
, "\n\touter base alignment: %d\n",
4250 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info
));
4251 dump_printf (MSG_NOTE
, "\n\touter base misalignment: %d\n",
4252 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info
));
4253 dump_printf (MSG_NOTE
, "\n\touter offset alignment: %d\n",
4254 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info
));
4255 dump_printf (MSG_NOTE
, "\n\touter step alignment: %d\n",
4256 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info
));
4260 if (STMT_VINFO_DATA_REF (stmt_info
))
4262 if (dump_enabled_p ())
4264 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4265 "not vectorized: more than one data ref "
4267 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
4270 if (is_a
<bb_vec_info
> (vinfo
))
4273 if (gatherscatter
!= SG_NONE
|| simd_lane_access
)
4278 STMT_VINFO_DATA_REF (stmt_info
) = dr
;
4279 if (simd_lane_access
)
4281 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info
) = true;
4282 free_data_ref (datarefs
[i
]);
4286 if (TREE_CODE (DR_BASE_ADDRESS (dr
)) == ADDR_EXPR
4287 && VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (dr
), 0))
4288 && DECL_NONALIASED (TREE_OPERAND (DR_BASE_ADDRESS (dr
), 0)))
4290 if (dump_enabled_p ())
4292 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4293 "not vectorized: base object not addressable "
4295 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
4297 if (is_a
<bb_vec_info
> (vinfo
))
4299 /* In BB vectorization the ref can still participate
4300 in dependence analysis, we just can't vectorize it. */
4301 STMT_VINFO_VECTORIZABLE (stmt_info
) = false;
4307 /* Set vectype for STMT. */
4308 scalar_type
= TREE_TYPE (DR_REF (dr
));
4309 STMT_VINFO_VECTYPE (stmt_info
)
4310 = get_vectype_for_scalar_type (scalar_type
);
4311 if (!STMT_VINFO_VECTYPE (stmt_info
))
4313 if (dump_enabled_p ())
4315 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4316 "not vectorized: no vectype for stmt: ");
4317 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
4318 dump_printf (MSG_MISSED_OPTIMIZATION
, " scalar_type: ");
4319 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_DETAILS
,
4321 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
4324 if (is_a
<bb_vec_info
> (vinfo
))
4326 /* No vector type is fine, the ref can still participate
4327 in dependence analysis, we just can't vectorize it. */
4328 STMT_VINFO_VECTORIZABLE (stmt_info
) = false;
4332 if (gatherscatter
!= SG_NONE
|| simd_lane_access
)
4334 STMT_VINFO_DATA_REF (stmt_info
) = NULL
;
4335 if (gatherscatter
!= SG_NONE
)
4342 if (dump_enabled_p ())
4344 dump_printf_loc (MSG_NOTE
, vect_location
,
4345 "got vectype for stmt: ");
4346 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, stmt
, 0);
4347 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
4348 STMT_VINFO_VECTYPE (stmt_info
));
4349 dump_printf (MSG_NOTE
, "\n");
4353 /* Adjust the minimal vectorization factor according to the
4355 vf
= TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info
));
4356 *min_vf
= upper_bound (*min_vf
, vf
);
4358 if (gatherscatter
!= SG_NONE
)
4360 gather_scatter_info gs_info
;
4361 if (!vect_check_gather_scatter (stmt
, as_a
<loop_vec_info
> (vinfo
),
4363 || !get_vectype_for_scalar_type (TREE_TYPE (gs_info
.offset
)))
4365 STMT_VINFO_DATA_REF (stmt_info
) = NULL
;
4367 if (dump_enabled_p ())
4369 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4370 (gatherscatter
== GATHER
) ?
4371 "not vectorized: not suitable for gather "
4373 "not vectorized: not suitable for scatter "
4375 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
4380 free_data_ref (datarefs
[i
]);
4382 STMT_VINFO_GATHER_SCATTER_P (stmt_info
) = gatherscatter
;
4385 else if (is_a
<loop_vec_info
> (vinfo
)
4386 && TREE_CODE (DR_STEP (dr
)) != INTEGER_CST
)
4388 if (nested_in_vect_loop_p (loop
, stmt
))
4390 if (dump_enabled_p ())
4392 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4393 "not vectorized: not suitable for strided "
4395 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
4399 STMT_VINFO_STRIDED_P (stmt_info
) = true;
4403 /* If we stopped analysis at the first dataref we could not analyze
4404 when trying to vectorize a basic-block, mark the rest of the datarefs
4405 as not vectorizable and truncate the vector of datarefs. That
4406 avoids spending useless time in analyzing their dependence. */
4407 if (i != datarefs.length ())
4409 gcc_assert (is_a <bb_vec_info> (vinfo));
4410 for (unsigned j = i; j < datarefs.length (); ++j)
4412 data_reference_p dr = datarefs[j];
4413 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
4416 datarefs.truncate (i);
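/* Illustrative note: for an access such as

     for (i = 0; i < n; i++)
       sum += a[i * stride];

   with a loop-invariant but non-constant 'stride', DR_STEP is not an
   INTEGER_CST, so (outside of an inner loop) the loop above marks the
   access STMT_VINFO_STRIDED_P instead of rejecting it.  */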
4423 /* Function vect_get_new_vect_var.
4425 Returns a name for a new variable. The current naming scheme appends the
4426 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
4427 the name of vectorizer generated variables, and appends that to NAME if
4431 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4438 case vect_simple_var:
4441 case vect_scalar_var:
4447 case vect_pointer_var:
4456 char* tmp = concat (prefix, "_", name, NULL);
4457 new_vect_var = create_tmp_reg (type, tmp);
4461 new_vect_var = create_tmp_reg (type, prefix);
4463 return new_vect_var;
4466 /* Like vect_get_new_vect_var but return an SSA name. */
4469 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4476 case vect_simple_var:
4479 case vect_scalar_var:
4482 case vect_pointer_var:
4491 char* tmp = concat (prefix, "_", name, NULL);
4492 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4496 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4498 return new_vect_var;
4501 /* Duplicate ptr info and set alignment/misalignment on NAME from DR. */
4504 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr)
4506 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
4507 int misalign = DR_MISALIGNMENT (dr);
4508 if (misalign == DR_MISALIGNMENT_UNKNOWN)
4509 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4511 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
4512 DR_TARGET_ALIGNMENT (dr), misalign);
4515 /* Function vect_create_addr_base_for_vector_ref.
4517 Create an expression that computes the address of the first memory location
4518 that will be accessed for a data reference.
4521 STMT: The statement containing the data reference.
4522 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4523 OFFSET: Optional. If supplied, it is added to the initial address.
4524 LOOP: Specify relative to which loop-nest should the address be computed.
4525 For example, when the dataref is in an inner-loop nested in an
4526 outer-loop that is now being vectorized, LOOP can be either the
4527 outer-loop, or the inner-loop. The first memory location accessed
4528 by the following dataref ('in' points to short):
4535 if LOOP=i_loop: &in (relative to i_loop)
4536 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4537 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4538 initial address. Unlike OFFSET, which is a number of elements to
4539 be added, BYTE_OFFSET is measured in bytes.
4542 1. Return an SSA_NAME whose value is the address of the memory location of
4543 the first vector of the data reference.
4544 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4545 these statement(s) which define the returned SSA_NAME.
4547 FORNOW: We are only handling array accesses with step 1. */
4550 vect_create_addr_base_for_vector_ref (gimple *stmt,
4551 gimple_seq *new_stmt_list,
4555 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4556 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4557 const char *base_name;
4560 gimple_seq seq = NULL;
4562 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4563 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4564 innermost_loop_behavior *drb = vect_dr_behavior (dr);
4566 tree data_ref_base = unshare_expr (drb->base_address);
4567 tree base_offset = unshare_expr (drb->offset);
4568 tree init = unshare_expr (drb->init);
4571 base_name = get_name (data_ref_base);
4574 base_offset = ssize_int (0);
4575 init = ssize_int (0);
4576 base_name = get_name (DR_REF (dr));
4579 /* Create base_offset */
4580 base_offset = size_binop (PLUS_EXPR,
4581 fold_convert (sizetype, base_offset),
4582 fold_convert (sizetype, init));
4586 offset = fold_build2 (MULT_EXPR, sizetype,
4587 fold_convert (sizetype, offset), step);
4588 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4589 base_offset, offset);
4593 byte_offset = fold_convert (sizetype, byte_offset);
4594 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4595 base_offset, byte_offset);
4598 /* base + base_offset */
4600 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4603 addr_base = build1 (ADDR_EXPR,
4604 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4605 unshare_expr (DR_REF (dr)));
4608 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4609 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4610 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4611 gimple_seq_add_seq (new_stmt_list, seq);
4613 if (DR_PTR_INFO (dr)
4614 && TREE_CODE (addr_base) == SSA_NAME
4615 && !SSA_NAME_PTR_INFO (addr_base))
4617 vect_duplicate_ssa_name_ptr_info (addr_base, dr);
4618 if (offset || byte_offset)
4619 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4622 if (dump_enabled_p ())
4624 dump_printf_loc (MSG_NOTE, vect_location, "created ");
4625 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4626 dump_printf (MSG_NOTE, "\n");
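/* Small worked sketch (hypothetical dataref, OFFSET and BYTE_OFFSET
   omitted): for an access whose innermost behavior has
   base_address == &a, offset == 0 and init == 4, the code above emits
   the equivalent of

     base_offset_5 = (sizetype) 0 + (sizetype) 4;
     addr_base_6 = &a + base_offset_5;

   into NEW_STMT_LIST and returns addr_base_6 (the SSA names are made up
   for the illustration).  */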
4633 /* Function vect_create_data_ref_ptr.
4635 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4636 location accessed in the loop by STMT, along with the def-use update
4637 chain to appropriately advance the pointer through the loop iterations.
4638 Also set aliasing information for the pointer. This pointer is used by
4639 the callers to this function to create a memory reference expression for
4640 vector load/store access.
4643 1. STMT: a stmt that references memory. Expected to be of the form
4644 GIMPLE_ASSIGN <name, data-ref> or
4645 GIMPLE_ASSIGN <data-ref, name>.
4646 2. AGGR_TYPE: the type of the reference, which should be either a vector
4648 3. AT_LOOP: the loop where the vector memref is to be created.
4649 4. OFFSET (optional): an offset to be added to the initial address accessed
4650 by the data-ref in STMT.
4651 5. BSI: location where the new stmts are to be placed if there is no loop
4652 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4653 pointing to the initial address.
4654 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4655 to the initial address accessed by the data-ref in STMT. This is
4656 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4658 8. IV_STEP (optional, defaults to NULL): the amount that should be added
4659 to the IV during each iteration of the loop. NULL says to move
4660 by one copy of AGGR_TYPE up or down, depending on the step of the
4664 1. Declare a new ptr to vector_type, and have it point to the base of the
4665 data reference (initial address accessed by the data reference).
4666 For example, for vector of type V8HI, the following code is generated:
4669 ap = (v8hi *)initial_address;
4671 if OFFSET is not supplied:
4672 initial_address = &a[init];
4673 if OFFSET is supplied:
4674 initial_address = &a[init + OFFSET];
4675 if BYTE_OFFSET is supplied:
4676 initial_address = &a[init] + BYTE_OFFSET;
4678 Return the initial_address in INITIAL_ADDRESS.
4680 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4681 update the pointer in each iteration of the loop.
4683 Return the increment stmt that updates the pointer in PTR_INCR.
4685 3. Set INV_P to true if the access pattern of the data reference in the
4686 vectorized loop is invariant. Set it to false otherwise.
4688 4. Return the pointer. */
4691 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
4692 tree offset, tree *initial_address,
4693 gimple_stmt_iterator *gsi, gimple **ptr_incr,
4694 bool only_init, bool *inv_p, tree byte_offset,
4697 const char *base_name;
4698 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4699 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4700 struct loop *loop = NULL;
4701 bool nested_in_vect_loop = false;
4702 struct loop *containing_loop = NULL;
4706 gimple_seq new_stmt_list = NULL;
4710 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4712 gimple_stmt_iterator incr_gsi;
4714 tree indx_before_incr, indx_after_incr;
4717 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4719 gcc_assert (iv_step != NULL_TREE
4720 || TREE_CODE (aggr_type) == ARRAY_TYPE
4721 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4725 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
4726 nested_in_vect_loop
= nested_in_vect_loop_p (loop
, stmt
);
4727 containing_loop
= (gimple_bb (stmt
))->loop_father
;
4728 pe
= loop_preheader_edge (loop
);
4732 gcc_assert (bb_vinfo
);
4737 /* Check the step (evolution) of the load in LOOP, and record
4738 whether it's invariant. */
4739 step
= vect_dr_behavior (dr
)->step
;
4740 if (integer_zerop (step
))
4745 /* Create an expression for the first address accessed by this load
4747 base_name
= get_name (DR_BASE_ADDRESS (dr
));
4749 if (dump_enabled_p ())
4751 tree dr_base_type
= TREE_TYPE (DR_BASE_OBJECT (dr
));
4752 dump_printf_loc (MSG_NOTE
, vect_location
,
4753 "create %s-pointer variable to type: ",
4754 get_tree_code_name (TREE_CODE (aggr_type
)));
4755 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, aggr_type
);
4756 if (TREE_CODE (dr_base_type
) == ARRAY_TYPE
)
4757 dump_printf (MSG_NOTE
, " vectorizing an array ref: ");
4758 else if (TREE_CODE (dr_base_type
) == VECTOR_TYPE
)
4759 dump_printf (MSG_NOTE
, " vectorizing a vector ref: ");
4760 else if (TREE_CODE (dr_base_type
) == RECORD_TYPE
)
4761 dump_printf (MSG_NOTE
, " vectorizing a record based array ref: ");
4763 dump_printf (MSG_NOTE
, " vectorizing a pointer ref: ");
4764 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_BASE_OBJECT (dr
));
4765 dump_printf (MSG_NOTE
, "\n");
4768 /* (1) Create the new aggregate-pointer variable.
4769 Vector and array types inherit the alias set of their component
4770 type by default so we need to use a ref-all pointer if the data
4771 reference does not conflict with the created aggregated data
4772 reference because it is not addressable. */
4773 bool need_ref_all
= false;
4774 if (!alias_sets_conflict_p (get_alias_set (aggr_type
),
4775 get_alias_set (DR_REF (dr
))))
4776 need_ref_all
= true;
4777 /* Likewise for any of the data references in the stmt group. */
4778 else if (STMT_VINFO_GROUP_SIZE (stmt_info
) > 1)
4780 gimple
*orig_stmt
= STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info
);
4783 stmt_vec_info sinfo
= vinfo_for_stmt (orig_stmt
);
4784 struct data_reference
*sdr
= STMT_VINFO_DATA_REF (sinfo
);
4785 if (!alias_sets_conflict_p (get_alias_set (aggr_type
),
4786 get_alias_set (DR_REF (sdr
))))
4788 need_ref_all
= true;
4791 orig_stmt
= STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo
);
4795 aggr_ptr_type
= build_pointer_type_for_mode (aggr_type
, ptr_mode
,
4797 aggr_ptr
= vect_get_new_vect_var (aggr_ptr_type
, vect_pointer_var
, base_name
);
4800 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4801 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4802 def-use update cycles for the pointer: one relative to the outer-loop
4803 (LOOP), which is what steps (3) and (4) below do. The other is relative
4804 to the inner-loop (which is the inner-most loop containing the dataref),
4805 and this is done by step (5) below.
4807 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4808 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4809 redundant. Steps (3),(4) create the following:
4812 LOOP: vp1 = phi(vp0,vp2)
4818 If there is an inner-loop nested in loop, then step (5) will also be
4819 applied, and an additional update in the inner-loop will be created:
4822 LOOP: vp1 = phi(vp0,vp2)
4824 inner: vp3 = phi(vp1,vp4)
4825 vp4 = vp3 + inner_step
4831 /* (2) Calculate the initial address of the aggregate-pointer, and set
4832 the aggregate-pointer to point to it before the loop. */
4834 /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */
4836 new_temp
= vect_create_addr_base_for_vector_ref (stmt
, &new_stmt_list
,
4837 offset
, byte_offset
);
4842 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, new_stmt_list
);
4843 gcc_assert (!new_bb
);
4846 gsi_insert_seq_before (gsi
, new_stmt_list
, GSI_SAME_STMT
);
4849 *initial_address
= new_temp
;
4850 aggr_ptr_init
= new_temp
;
4852 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4853 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4854 inner-loop nested in LOOP (during outer-loop vectorization). */
4856 /* No update in loop is required. */
4857 if (only_init
&& (!loop_vinfo
|| at_loop
== loop
))
4858 aptr
= aggr_ptr_init
;
4861 if (iv_step
== NULL_TREE
)
4863 /* The step of the aggregate pointer is the type size. */
4864 iv_step
= TYPE_SIZE_UNIT (aggr_type
);
4865 /* One exception to the above is when the scalar step of the load in
4866 LOOP is zero. In this case the step here is also zero. */
4868 iv_step
= size_zero_node
;
4869 else if (tree_int_cst_sgn (step
) == -1)
4870 iv_step
= fold_build1 (NEGATE_EXPR
, TREE_TYPE (iv_step
), iv_step
);
4873 standard_iv_increment_position (loop
, &incr_gsi
, &insert_after
);
4875 create_iv (aggr_ptr_init
,
4876 fold_convert (aggr_ptr_type
, iv_step
),
4877 aggr_ptr
, loop
, &incr_gsi
, insert_after
,
4878 &indx_before_incr
, &indx_after_incr
);
4879 incr
= gsi_stmt (incr_gsi
);
4880 set_vinfo_for_stmt (incr
, new_stmt_vec_info (incr
, loop_vinfo
));
4882 /* Copy the points-to information if it exists. */
4883 if (DR_PTR_INFO (dr
))
4885 vect_duplicate_ssa_name_ptr_info (indx_before_incr
, dr
);
4886 vect_duplicate_ssa_name_ptr_info (indx_after_incr
, dr
);
4891 aptr
= indx_before_incr
;
4894 if (!nested_in_vect_loop
|| only_init
)
4898 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4899 nested in LOOP, if exists. */
4901 gcc_assert (nested_in_vect_loop
);
4904 standard_iv_increment_position (containing_loop
, &incr_gsi
,
4906 create_iv (aptr
, fold_convert (aggr_ptr_type
, DR_STEP (dr
)), aggr_ptr
,
4907 containing_loop
, &incr_gsi
, insert_after
, &indx_before_incr
,
4909 incr
= gsi_stmt (incr_gsi
);
4910 set_vinfo_for_stmt (incr
, new_stmt_vec_info (incr
, loop_vinfo
));
4912 /* Copy the points-to information if it exists. */
4913 if (DR_PTR_INFO (dr
))
4915 vect_duplicate_ssa_name_ptr_info (indx_before_incr
, dr
);
4916 vect_duplicate_ssa_name_ptr_info (indx_after_incr
, dr
);
4921 return indx_before_incr;
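/* Illustrative note (not a testcase): for a V8HI access with a unit
   scalar step and IV_STEP left as NULL_TREE, the update created by
   steps (3)/(4) above is simply

     vp2 = vp1 + 16;          # TYPE_SIZE_UNIT (V8HI), in bytes

   while a negative DR_STEP makes the code negate IV_STEP first, so the
   pointer walks backwards through memory.  */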
4928 /* Function bump_vector_ptr
4930 Increment a pointer (to a vector type) by vector-size. If requested,
4931 i.e. if PTR-INCR is given, then also connect the new increment stmt
4932 to the existing def-use update-chain of the pointer, by modifying
4933 the PTR_INCR as illustrated below:
4935 The pointer def-use update-chain before this function:
4936 DATAREF_PTR = phi (p_0, p_2)
4938 PTR_INCR: p_2 = DATAREF_PTR + step
4940 The pointer def-use update-chain after this function:
4941 DATAREF_PTR = phi (p_0, p_2)
4943 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4945 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4948 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4950 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4951 the loop. The increment amount across iterations is expected
4953 BSI - location where the new update stmt is to be placed.
4954 STMT - the original scalar memory-access stmt that is being vectorized.
4955 BUMP - optional. The offset by which to bump the pointer. If not given,
4956 the offset is assumed to be vector_size.
4958 Output: Return NEW_DATAREF_PTR as illustrated above.
4963 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4964 gimple *stmt, tree bump)
4966 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4967 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4968 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4969 tree update = TYPE_SIZE_UNIT (vectype);
4972 use_operand_p use_p;
4973 tree new_dataref_ptr;
4978 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4979 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4981 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4982 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4983 dataref_ptr, update);
4984 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4986 /* Copy the points-to information if it exists. */
4987 if (DR_PTR_INFO (dr))
4989 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4990 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4994 return new_dataref_ptr;
4996 /* Update the vector-pointer's cross-iteration increment. */
4997 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4999 tree use = USE_FROM_PTR (use_p);
5001 if (use == dataref_ptr)
5002 SET_USE (use_p, new_dataref_ptr);
5004 gcc_assert (operand_equal_p (use, update, 0));
5007 return new_dataref_ptr;
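/* Usage sketch (names invented for the example): after

     new_ptr = bump_vector_ptr (vp1, ptr_incr, gsi, stmt, NULL_TREE);

   the IL contains NEW_DATAREF_PTR = vp1 + TYPE_SIZE_UNIT (vectype), and
   if PTR_INCR was given its use of vp1 has been redirected to
   NEW_DATAREF_PTR, exactly as in the def-use chains pictured above.  */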
5011 /* Function vect_create_destination_var.
5013 Create a new temporary of type VECTYPE. */
5016 vect_create_destination_var (tree scalar_dest, tree vectype)
5022 enum vect_var_kind kind;
5025 ? VECTOR_BOOLEAN_TYPE_P (vectype)
5029 type = vectype ? vectype : TREE_TYPE (scalar_dest);
5031 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5033 name = get_name (scalar_dest);
5035 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5037 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5038 vec_dest = vect_get_new_vect_var (type, kind, new_name);
5044 /* Function vect_grouped_store_supported.
5046 Returns TRUE if interleave high and interleave low permutations
5047 are supported, and FALSE otherwise. */
5050 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5052 machine_mode mode = TYPE_MODE (vectype);
5054 /* vect_permute_store_chain requires the group size to be equal to 3 or
5055 be a power of two. */
5056 if (count != 3 && exact_log2 (count) == -1)
5058 if (dump_enabled_p ())
5059 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5060 "the size of the group of accesses"
5061 " is not a power of 2 or not equal to 3\n");
5065 /* Check that the permutation is supported. */
5066 if (VECTOR_MODE_P (mode
))
5071 unsigned int j0
= 0, j1
= 0, j2
= 0;
5075 if (!GET_MODE_NUNITS (mode
).is_constant (&nelt
))
5077 if (dump_enabled_p ())
5078 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5079 "cannot handle groups of 3 stores for"
5080 " variable-length vectors\n");
5084 vec_perm_builder
sel (nelt
, nelt
, 1);
5085 sel
.quick_grow (nelt
);
5086 vec_perm_indices indices
;
5087 for (j
= 0; j
< 3; j
++)
5089 int nelt0
= ((3 - j
) * nelt
) % 3;
5090 int nelt1
= ((3 - j
) * nelt
+ 1) % 3;
5091 int nelt2
= ((3 - j
) * nelt
+ 2) % 3;
5092 for (i
= 0; i
< nelt
; i
++)
5094 if (3 * i
+ nelt0
< nelt
)
5095 sel
[3 * i
+ nelt0
] = j0
++;
5096 if (3 * i
+ nelt1
< nelt
)
5097 sel
[3 * i
+ nelt1
] = nelt
+ j1
++;
5098 if (3 * i
+ nelt2
< nelt
)
5099 sel
[3 * i
+ nelt2
] = 0;
5101 indices
.new_vector (sel
, 2, nelt
);
5102 if (!can_vec_perm_const_p (mode
, indices
))
5104 if (dump_enabled_p ())
5105 dump_printf (MSG_MISSED_OPTIMIZATION
,
5106 "permutation op not supported by target.\n");
5110 for (i
= 0; i
< nelt
; i
++)
5112 if (3 * i
+ nelt0
< nelt
)
5113 sel
[3 * i
+ nelt0
] = 3 * i
+ nelt0
;
5114 if (3 * i
+ nelt1
< nelt
)
5115 sel
[3 * i
+ nelt1
] = 3 * i
+ nelt1
;
5116 if (3 * i
+ nelt2
< nelt
)
5117 sel
[3 * i
+ nelt2
] = nelt
+ j2
++;
5119 indices
.new_vector (sel
, 2, nelt
);
5120 if (!can_vec_perm_const_p (mode
, indices
))
5122 if (dump_enabled_p ())
5123 dump_printf (MSG_MISSED_OPTIMIZATION
,
5124 "permutation op not supported by target.\n");
5132 /* If length is not equal to 3 then only power of 2 is supported. */
5133 gcc_assert (pow2p_hwi (count
));
5134 poly_uint64 nelt
= GET_MODE_NUNITS (mode
);
5136 /* The encoding has 2 interleaved stepped patterns. */
5137 vec_perm_builder
sel (nelt
, 2, 3);
5139 for (i
= 0; i
< 3; i
++)
5142 sel
[i
* 2 + 1] = i
+ nelt
;
5144 vec_perm_indices
indices (sel
, 2, nelt
);
5145 if (can_vec_perm_const_p (mode
, indices
))
5147 for (i
= 0; i
< 6; i
++)
5148 sel
[i
] += exact_div (nelt
, 2);
5149 indices
.new_vector (sel
, 2, nelt
);
5150 if (can_vec_perm_const_p (mode
, indices
))
5156 if (dump_enabled_p ())
5157 dump_printf (MSG_MISSED_OPTIMIZATION,
5158 "permutation op not supported by target.\n");
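/* Concrete illustration (V4SI, i.e. nelt == 4, power-of-two group): the
   two interleave masks checked above are

     high: { 0, 4, 1, 5 }	low: { 2, 6, 3, 7 }

   which matches the element numbering used in the
   vect_permute_store_chain comment below.  */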
5163 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5164 type VECTYPE. MASKED_P says whether the masked form is needed. */
5167 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5171 return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5172 vec_mask_store_lanes_optab,
5175 return vect_lanes_optab_supported_p ("vec_store_lanes",
5176 vec_store_lanes_optab,
5181 /* Function vect_permute_store_chain.
5183 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5184 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5185 the data correctly for the stores. Return the final references for stores
5188 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5189 The input is 4 vectors each containing 8 elements. We assign a number to
5190 each element, the input sequence is:
5192 1st vec: 0 1 2 3 4 5 6 7
5193 2nd vec: 8 9 10 11 12 13 14 15
5194 3rd vec: 16 17 18 19 20 21 22 23
5195 4th vec: 24 25 26 27 28 29 30 31
5197 The output sequence should be:
5199 1st vec: 0 8 16 24 1 9 17 25
5200 2nd vec: 2 10 18 26 3 11 19 27
5201 3rd vec: 4 12 20 28 5 13 21 30
5202 4th vec: 6 14 22 30 7 15 23 31
5204 i.e., we interleave the contents of the four vectors in their order.
5206 We use interleave_high/low instructions to create such output. The input of
5207 each interleave_high/low operation is two vectors:
5210 the even elements of the result vector are obtained left-to-right from the
5211 high/low elements of the first vector. The odd elements of the result are
5212 obtained left-to-right from the high/low elements of the second vector.
5213 The output of interleave_high will be: 0 4 1 5
5214 and of interleave_low: 2 6 3 7
5217 The permutation is done in log LENGTH stages. In each stage interleave_high
5218 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5219 where the first argument is taken from the first half of DR_CHAIN and the
5220 second argument from its second half.
5223 I1: interleave_high (1st vec, 3rd vec)
5224 I2: interleave_low (1st vec, 3rd vec)
5225 I3: interleave_high (2nd vec, 4th vec)
5226 I4: interleave_low (2nd vec, 4th vec)
5228 The output for the first stage is:
5230 I1: 0 16 1 17 2 18 3 19
5231 I2: 4 20 5 21 6 22 7 23
5232 I3: 8 24 9 25 10 26 11 27
5233 I4: 12 28 13 29 14 30 15 31
5235 The output of the second stage, i.e. the final result is:
5237 I1: 0 8 16 24 1 9 17 25
5238 I2: 2 10 18 26 3 11 19 27
5239 I3: 4 12 20 28 5 13 21 30
5240 I4: 6 14 22 30 7 15 23 31. */
5243 vect_permute_store_chain (vec<tree> dr_chain,
5244 unsigned int length,
5246 gimple_stmt_iterator *gsi,
5247 vec<tree> *result_chain)
5249 tree vect1, vect2, high, low;
5251 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5252 tree perm_mask_low, perm_mask_high;
5254 tree perm3_mask_low, perm3_mask_high;
5255 unsigned int i, j, n, log_length = exact_log2 (length);
5257 result_chain->quick_grow (length);
5258 memcpy (result_chain->address (), dr_chain.address (),
5259 length * sizeof (tree));
5263 /* vect_grouped_store_supported ensures that this is constant. */
5264 unsigned int nelt
= TYPE_VECTOR_SUBPARTS (vectype
).to_constant ();
5265 unsigned int j0
= 0, j1
= 0, j2
= 0;
5267 vec_perm_builder
sel (nelt
, nelt
, 1);
5268 sel
.quick_grow (nelt
);
5269 vec_perm_indices indices
;
5270 for (j
= 0; j
< 3; j
++)
5272 int nelt0
= ((3 - j
) * nelt
) % 3;
5273 int nelt1
= ((3 - j
) * nelt
+ 1) % 3;
5274 int nelt2
= ((3 - j
) * nelt
+ 2) % 3;
5276 for (i
= 0; i
< nelt
; i
++)
5278 if (3 * i
+ nelt0
< nelt
)
5279 sel
[3 * i
+ nelt0
] = j0
++;
5280 if (3 * i
+ nelt1
< nelt
)
5281 sel
[3 * i
+ nelt1
] = nelt
+ j1
++;
5282 if (3 * i
+ nelt2
< nelt
)
5283 sel
[3 * i
+ nelt2
] = 0;
5285 indices
.new_vector (sel
, 2, nelt
);
5286 perm3_mask_low
= vect_gen_perm_mask_checked (vectype
, indices
);
5288 for (i
= 0; i
< nelt
; i
++)
5290 if (3 * i
+ nelt0
< nelt
)
5291 sel
[3 * i
+ nelt0
] = 3 * i
+ nelt0
;
5292 if (3 * i
+ nelt1
< nelt
)
5293 sel
[3 * i
+ nelt1
] = 3 * i
+ nelt1
;
5294 if (3 * i
+ nelt2
< nelt
)
5295 sel
[3 * i
+ nelt2
] = nelt
+ j2
++;
5297 indices
.new_vector (sel
, 2, nelt
);
5298 perm3_mask_high
= vect_gen_perm_mask_checked (vectype
, indices
);
5300 vect1
= dr_chain
[0];
5301 vect2
= dr_chain
[1];
5303 /* Create interleaving stmt:
5304 low = VEC_PERM_EXPR <vect1, vect2,
5305 {j, nelt, *, j + 1, nelt + j + 1, *,
5306 j + 2, nelt + j + 2, *, ...}> */
5307 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3_low");
5308 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, vect1
,
5309 vect2
, perm3_mask_low
);
5310 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5313 vect2
= dr_chain
[2];
5314 /* Create interleaving stmt:
5315 low = VEC_PERM_EXPR <vect1, vect2,
5316 {0, 1, nelt + j, 3, 4, nelt + j + 1,
5317 6, 7, nelt + j + 2, ...}> */
5318 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3_high");
5319 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, vect1
,
5320 vect2
, perm3_mask_high
);
5321 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5322 (*result_chain
)[j
] = data_ref
;
5327 /* If length is not equal to 3 then only power of 2 is supported. */
5328 gcc_assert (pow2p_hwi (length
));
5330 /* The encoding has 2 interleaved stepped patterns. */
5331 poly_uint64 nelt
= TYPE_VECTOR_SUBPARTS (vectype
);
5332 vec_perm_builder
sel (nelt
, 2, 3);
5334 for (i
= 0; i
< 3; i
++)
5337 sel
[i
* 2 + 1] = i
+ nelt
;
5339 vec_perm_indices
indices (sel
, 2, nelt
);
5340 perm_mask_high
= vect_gen_perm_mask_checked (vectype
, indices
);
5342 for (i
= 0; i
< 6; i
++)
5343 sel
[i
] += exact_div (nelt
, 2);
5344 indices
.new_vector (sel
, 2, nelt
);
5345 perm_mask_low
= vect_gen_perm_mask_checked (vectype
, indices
);
5347 for (i
= 0, n
= log_length
; i
< n
; i
++)
5349 for (j
= 0; j
< length
/2; j
++)
5351 vect1
= dr_chain
[j
];
5352 vect2
= dr_chain
[j
+length
/2];
5354 /* Create interleaving stmt:
5355 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5357 high
= make_temp_ssa_name (vectype
, NULL
, "vect_inter_high");
5358 perm_stmt
= gimple_build_assign (high
, VEC_PERM_EXPR
, vect1
,
5359 vect2
, perm_mask_high
);
5360 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5361 (*result_chain
)[2*j
] = high
;
5363 /* Create interleaving stmt:
5364 low = VEC_PERM_EXPR <vect1, vect2,
5365 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5367 low
= make_temp_ssa_name (vectype
, NULL
, "vect_inter_low");
5368 perm_stmt
= gimple_build_assign (low
, VEC_PERM_EXPR
, vect1
,
5369 vect2
, perm_mask_low
);
5370 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5371 (*result_chain
)[2*j
+1] = low
;
5373 memcpy (dr_chain.address (), result_chain->address (),
5374 length * sizeof (tree));
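/* Sketch of the generated IL for LENGTH == 2 with V4SI inputs v0 and v1
   (SSA names invented):

     vect_inter_high_7 = VEC_PERM_EXPR <v0, v1, { 0, 4, 1, 5 }>;
     vect_inter_low_8 = VEC_PERM_EXPR <v0, v1, { 2, 6, 3, 7 }>;

   RESULT_CHAIN then holds { vect_inter_high_7, vect_inter_low_8 }, i.e.
   the two stores written back to back cover v0 and v1 fully
   interleaved.  */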
5379 /* Function vect_setup_realignment
5381 This function is called when vectorizing an unaligned load using
5382 the dr_explicit_realign[_optimized] scheme.
5383 This function generates the following code at the loop prolog:
5386 x msq_init = *(floor(p)); # prolog load
5387 realignment_token = call target_builtin;
5389 x msq = phi (msq_init, ---)
5391 The stmts marked with x are generated only for the case of
5392 dr_explicit_realign_optimized.
5394 The code above sets up a new (vector) pointer, pointing to the first
5395 location accessed by STMT, and a "floor-aligned" load using that pointer.
5396 It also generates code to compute the "realignment-token" (if the relevant
5397 target hook was defined), and creates a phi-node at the loop-header bb
5398 whose arguments are the result of the prolog-load (created by this
5399 function) and the result of a load that takes place in the loop (to be
5400 created by the caller to this function).
5402 For the case of dr_explicit_realign_optimized:
5403 The caller to this function uses the phi-result (msq) to create the
5404 realignment code inside the loop, and sets up the missing phi argument,
5407 msq = phi (msq_init, lsq)
5408 lsq = *(floor(p')); # load in loop
5409 result = realign_load (msq, lsq, realignment_token);
5411 For the case of dr_explicit_realign:
5413 msq = *(floor(p)); # load in loop
5415 lsq = *(floor(p')); # load in loop
5416 result = realign_load (msq, lsq, realignment_token);
5419 STMT - (scalar) load stmt to be vectorized. This load accesses
5420 a memory location that may be unaligned.
5421 BSI - place where new code is to be inserted.
5422 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5426 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5427 target hook, if defined.
5428 Return value - the result of the loop-header phi node. */
5431 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
5432 tree *realignment_token,
5433 enum dr_alignment_support alignment_support_scheme,
5435 struct loop **at_loop)
5437 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
5438 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
5439 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
5440 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
5441 struct loop
*loop
= NULL
;
5443 tree scalar_dest
= gimple_assign_lhs (stmt
);
5449 tree msq_init
= NULL_TREE
;
5452 tree msq
= NULL_TREE
;
5453 gimple_seq stmts
= NULL
;
5455 bool compute_in_loop
= false;
5456 bool nested_in_vect_loop
= false;
5457 struct loop
*containing_loop
= (gimple_bb (stmt
))->loop_father
;
5458 struct loop
*loop_for_initial_load
= NULL
;
5462 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
5463 nested_in_vect_loop
= nested_in_vect_loop_p (loop
, stmt
);
5466 gcc_assert (alignment_support_scheme
== dr_explicit_realign
5467 || alignment_support_scheme
== dr_explicit_realign_optimized
);
5469 /* We need to generate three things:
5470 1. the misalignment computation
5471 2. the extra vector load (for the optimized realignment scheme).
5472 3. the phi node for the two vectors from which the realignment is
5473 done (for the optimized realignment scheme). */
5475 /* 1. Determine where to generate the misalignment computation.
5477 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5478 calculation will be generated by this function, outside the loop (in the
5479 preheader). Otherwise, INIT_ADDR had already been computed for us by the
5480 caller, inside the loop.
5482 Background: If the misalignment remains fixed throughout the iterations of
5483 the loop, then both realignment schemes are applicable, and also the
5484 misalignment computation can be done outside LOOP. This is because we are
5485 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5486 are a multiple of VS (the Vector Size), and therefore the misalignment in
5487 different vectorized LOOP iterations is always the same.
5488 The problem arises only if the memory access is in an inner-loop nested
5489 inside LOOP, which is now being vectorized using outer-loop vectorization.
5490 This is the only case when the misalignment of the memory access may not
5491 remain fixed throughout the iterations of the inner-loop (as explained in
5492 detail in vect_supportable_dr_alignment). In this case, not only is the
5493 optimized realignment scheme not applicable, but also the misalignment
5494 computation (and generation of the realignment token that is passed to
5495 REALIGN_LOAD) have to be done inside the loop.
5497 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5498 or not, which in turn determines if the misalignment is computed inside
5499 the inner-loop, or outside LOOP. */
5501 if (init_addr
!= NULL_TREE
|| !loop_vinfo
)
5503 compute_in_loop
= true;
5504 gcc_assert (alignment_support_scheme
== dr_explicit_realign
);
5508 /* 2. Determine where to generate the extra vector load.
5510 For the optimized realignment scheme, instead of generating two vector
5511 loads in each iteration, we generate a single extra vector load in the
5512 preheader of the loop, and in each iteration reuse the result of the
5513 vector load from the previous iteration. In case the memory access is in
5514 an inner-loop nested inside LOOP, which is now being vectorized using
5515 outer-loop vectorization, we need to determine whether this initial vector
5516 load should be generated at the preheader of the inner-loop, or can be
5517 generated at the preheader of LOOP. If the memory access has no evolution
5518 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5519 to be generated inside LOOP (in the preheader of the inner-loop). */
5521 if (nested_in_vect_loop
)
5523 tree outerloop_step
= STMT_VINFO_DR_STEP (stmt_info
);
5524 bool invariant_in_outerloop
=
5525 (tree_int_cst_compare (outerloop_step
, size_zero_node
) == 0);
5526 loop_for_initial_load
= (invariant_in_outerloop
? loop
: loop
->inner
);
5529 loop_for_initial_load
= loop
;
5531 *at_loop
= loop_for_initial_load
;
5533 if (loop_for_initial_load
)
5534 pe
= loop_preheader_edge (loop_for_initial_load
);
5536 /* 3. For the case of the optimized realignment, create the first vector
5537 load at the loop preheader. */
5539 if (alignment_support_scheme
== dr_explicit_realign_optimized
)
5541 /* Create msq_init = *(floor(p1)) in the loop preheader */
5544 gcc_assert (!compute_in_loop
);
5545 vec_dest
= vect_create_destination_var (scalar_dest
, vectype
);
5546 ptr
= vect_create_data_ref_ptr (stmt
, vectype
, loop_for_initial_load
,
5547 NULL_TREE
, &init_addr
, NULL
, &inc
,
5549 if (TREE_CODE (ptr
) == SSA_NAME
)
5550 new_temp
= copy_ssa_name (ptr
);
5552 new_temp
= make_ssa_name (TREE_TYPE (ptr
));
5553 unsigned int align
= DR_TARGET_ALIGNMENT (dr
);
5554 new_stmt
= gimple_build_assign
5555 (new_temp
, BIT_AND_EXPR
, ptr
,
5556 build_int_cst (TREE_TYPE (ptr
), -(HOST_WIDE_INT
) align
));
5557 new_bb
= gsi_insert_on_edge_immediate (pe
, new_stmt
);
5558 gcc_assert (!new_bb
);
5560 = build2 (MEM_REF
, TREE_TYPE (vec_dest
), new_temp
,
5561 build_int_cst (reference_alias_ptr_type (DR_REF (dr
)), 0));
5562 new_stmt
= gimple_build_assign (vec_dest
, data_ref
);
5563 new_temp
= make_ssa_name (vec_dest
, new_stmt
);
5564 gimple_assign_set_lhs (new_stmt
, new_temp
);
5567 new_bb
= gsi_insert_on_edge_immediate (pe
, new_stmt
);
5568 gcc_assert (!new_bb
);
5571 gsi_insert_before (gsi
, new_stmt
, GSI_SAME_STMT
);
5573 msq_init
= gimple_assign_lhs (new_stmt
);
5576 /* 4. Create realignment token using a target builtin, if available.
5577 It is done either inside the containing loop, or before LOOP (as
5578 determined above). */
5580 if (targetm
.vectorize
.builtin_mask_for_load
)
5585 /* Compute INIT_ADDR - the initial address accessed by this memref. */
5588 /* Generate the INIT_ADDR computation outside LOOP. */
5589 init_addr
= vect_create_addr_base_for_vector_ref (stmt
, &stmts
,
5593 pe
= loop_preheader_edge (loop
);
5594 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, stmts
);
5595 gcc_assert (!new_bb
);
5598 gsi_insert_seq_before (gsi
, stmts
, GSI_SAME_STMT
);
5601 builtin_decl
= targetm
.vectorize
.builtin_mask_for_load ();
5602 new_stmt
= gimple_build_call (builtin_decl
, 1, init_addr
);
5604 vect_create_destination_var (scalar_dest
,
5605 gimple_call_return_type (new_stmt
));
5606 new_temp
= make_ssa_name (vec_dest
, new_stmt
);
5607 gimple_call_set_lhs (new_stmt
, new_temp
);
5609 if (compute_in_loop
)
5610 gsi_insert_before (gsi
, new_stmt
, GSI_SAME_STMT
);
5613 /* Generate the misalignment computation outside LOOP. */
5614 pe
= loop_preheader_edge (loop
);
5615 new_bb
= gsi_insert_on_edge_immediate (pe
, new_stmt
);
5616 gcc_assert (!new_bb
);
5619 *realignment_token
= gimple_call_lhs (new_stmt
);
5621 /* The result of the CALL_EXPR to this builtin is determined from
5622 the value of the parameter and no global variables are touched
5623 which makes the builtin a "const" function. Requiring the
5624 builtin to have the "const" attribute makes it unnecessary
5625 to call mark_call_clobbered. */
5626 gcc_assert (TREE_READONLY (builtin_decl
));
5629 if (alignment_support_scheme
== dr_explicit_realign
)
5632 gcc_assert (!compute_in_loop
);
5633 gcc_assert (alignment_support_scheme
== dr_explicit_realign_optimized
);
5636 /* 5. Create msq = phi <msq_init, lsq> in loop */
5638 pe = loop_preheader_edge (containing_loop);
5639 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5640 msq = make_ssa_name (vec_dest);
5641 phi_stmt = create_phi_node (msq, containing_loop->header);
5642 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
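/* IL sketch for the dr_explicit_realign_optimized case (SSA names and
   constants made up), matching the scheme pictured above:

     # in the preheader:
     floor_4 = vectp_3 & -DR_TARGET_ALIGNMENT;
     msq_init_5 = *floor_4;
     realign_tok_6 = __builtin_mask_for_load (init_addr_2);

     # in the loop header:
     msq_7 = PHI <msq_init_5 (preheader), lsq_8 (latch)>

   The caller later emits the in-loop lsq_8 load and the REALIGN_LOAD
   that combines msq_7 and lsq_8 using realign_tok_6.  */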
5648 /* Function vect_grouped_load_supported.
5650 COUNT is the size of the load group (the number of statements plus the
5651 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5652 only one statement, with a gap of COUNT - 1.
5654 Returns true if a suitable permute exists. */
5657 vect_grouped_load_supported (tree vectype, bool single_element_p,
5658 unsigned HOST_WIDE_INT count)
5660 machine_mode mode = TYPE_MODE (vectype);
5662 /* If this is single-element interleaving with an element distance
5663 that leaves unused vector loads around punt - we at least create
5664 very sub-optimal code in that case (and blow up memory,
5666 if (single_element_p
&& maybe_gt (count
, TYPE_VECTOR_SUBPARTS (vectype
)))
5668 if (dump_enabled_p ())
5669 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5670 "single-element interleaving not supported "
5671 "for not adjacent vector loads\n");
5675 /* vect_permute_load_chain requires the group size to be equal to 3 or
5676 be a power of two. */
5677 if (count
!= 3 && exact_log2 (count
) == -1)
5679 if (dump_enabled_p ())
5680 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5681 "the size of the group of accesses"
5682 " is not a power of 2 or not equal to 3\n");
5686 /* Check that the permutation is supported. */
5687 if (VECTOR_MODE_P (mode
))
5693 if (!GET_MODE_NUNITS (mode
).is_constant (&nelt
))
5695 if (dump_enabled_p ())
5696 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5697 "cannot handle groups of 3 loads for"
5698 " variable-length vectors\n");
5702 vec_perm_builder
sel (nelt
, nelt
, 1);
5703 sel
.quick_grow (nelt
);
5704 vec_perm_indices indices
;
5706 for (k
= 0; k
< 3; k
++)
5708 for (i
= 0; i
< nelt
; i
++)
5709 if (3 * i
+ k
< 2 * nelt
)
5713 indices
.new_vector (sel
, 2, nelt
);
5714 if (!can_vec_perm_const_p (mode
, indices
))
5716 if (dump_enabled_p ())
5717 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5718 "shuffle of 3 loads is not supported by"
5722 for (i
= 0, j
= 0; i
< nelt
; i
++)
5723 if (3 * i
+ k
< 2 * nelt
)
5726 sel
[i
] = nelt
+ ((nelt
+ k
) % 3) + 3 * (j
++);
5727 indices
.new_vector (sel
, 2, nelt
);
5728 if (!can_vec_perm_const_p (mode
, indices
))
5730 if (dump_enabled_p ())
5731 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5732 "shuffle of 3 loads is not supported by"
5741 /* If length is not equal to 3 then only power of 2 is supported. */
5742 gcc_assert (pow2p_hwi (count
));
5743 poly_uint64 nelt
= GET_MODE_NUNITS (mode
);
5745 /* The encoding has a single stepped pattern. */
5746 vec_perm_builder
sel (nelt
, 1, 3);
5748 for (i
= 0; i
< 3; i
++)
5750 vec_perm_indices
indices (sel
, 2, nelt
);
5751 if (can_vec_perm_const_p (mode
, indices
))
5753 for (i
= 0; i
< 3; i
++)
5755 indices
.new_vector (sel
, 2, nelt
);
5756 if (can_vec_perm_const_p (mode
, indices
))
5762 if (dump_enabled_p ())
5763 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5764 "extract even/odd not supported by target\n");
/* Return TRUE if vec_{masked_}load_lanes is available for COUNT vectors of
   type VECTYPE.  MASKED_P says whether the masked form is needed.  */

bool
vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
                           bool masked_p)
{
  if (masked_p)
    return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
                                         vec_mask_load_lanes_optab,
                                         vectype, count);
  else
    return vect_lanes_optab_supported_p ("vec_load_lanes",
                                         vec_load_lanes_optab,
                                         vectype, count);
}
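/* A sketch of how a caller might combine the two queries above (illustration
   only - the actual decision logic lives elsewhere in the vectorizer):

     if (vect_load_lanes_supported (vectype, group_size, masked_p))
       ... use the load-lanes path ...
     else if (vect_grouped_load_supported (vectype, single_element_p,
                                           group_size))
       ... fall back to contiguous loads followed by permutes ...
     else
       ... the group cannot be vectorized this way ...  */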
/* Function vect_permute_load_chain.

   Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
   a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
   the input data correctly.  Return the final references for loads in
   RESULT_CHAIN.

   E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
   The input is 4 vectors each containing 8 elements.  We assign a number to
   each element, the input sequence is:

   1st vec:   0  1  2  3  4  5  6  7
   2nd vec:   8  9 10 11 12 13 14 15
   3rd vec:  16 17 18 19 20 21 22 23
   4th vec:  24 25 26 27 28 29 30 31

   The output sequence should be:

   1st vec:  0 4  8 12 16 20 24 28
   2nd vec:  1 5  9 13 17 21 25 29
   3rd vec:  2 6 10 14 18 22 26 30
   4th vec:  3 7 11 15 19 23 27 31

   i.e., the first output vector should contain the first elements of each
   interleaving group, etc.

   We use extract_even/odd instructions to create such output.  The input of
   each extract_even/odd operation is two vectors
   1st vec    2nd vec
   0 1 2 3    4 5 6 7

   and the output is the vector of extracted even/odd elements.  The output of
   extract_even will be:   0 2 4 6
   and of extract_odd:     1 3 5 7

   The permutation is done in log LENGTH stages.  In each stage extract_even
   and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
   their order.  In our example,

   E1: extract_even (1st vec, 2nd vec)
   E2: extract_odd (1st vec, 2nd vec)
   E3: extract_even (3rd vec, 4th vec)
   E4: extract_odd (3rd vec, 4th vec)

   The output for the first stage will be:

   E1:  0  2  4  6  8 10 12 14
   E2:  1  3  5  7  9 11 13 15
   E3: 16 18 20 22 24 26 28 30
   E4: 17 19 21 23 25 27 29 31

   In order to proceed and create the correct sequence for the next stage (or
   for the correct output, if the second stage is the last one, as in our
   example), we first put the output of extract_even operation and then the
   output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
   The input for the second stage is:

   1st vec (E1):  0  2  4  6  8 10 12 14
   2nd vec (E3): 16 18 20 22 24 26 28 30
   3rd vec (E2):  1  3  5  7  9 11 13 15
   4th vec (E4): 17 19 21 23 25 27 29 31

   The output of the second stage:

   E1: 0 4  8 12 16 20 24 28
   E2: 2 6 10 14 18 22 26 30
   E3: 1 5  9 13 17 21 25 29
   E4: 3 7 11 15 19 23 27 31

   And RESULT_CHAIN after reordering:

   1st vec (E1):  0 4  8 12 16 20 24 28
   2nd vec (E3):  1 5  9 13 17 21 25 29
   3rd vec (E2):  2 6 10 14 18 22 26 30
   4th vec (E4):  3 7 11 15 19 23 27 31.  */
static void
vect_permute_load_chain (vec<tree> dr_chain,
                         unsigned int length,
                         gimple *stmt,
                         gimple_stmt_iterator *gsi,
                         vec<tree> *result_chain)
{
  tree data_ref, first_vect, second_vect;
  tree perm_mask_even, perm_mask_odd;
  tree perm3_mask_low, perm3_mask_high;
  gimple *perm_stmt;
  tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
  unsigned int i, j, log_length = exact_log2 (length);

  result_chain->quick_grow (length);
  memcpy (result_chain->address (), dr_chain.address (),
          length * sizeof (tree));

  if (length == 3)
    {
      /* vect_grouped_load_supported ensures that this is constant.  */
      unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
      unsigned int k;

      vec_perm_builder sel (nelt, nelt, 1);
      sel.quick_grow (nelt);
      vec_perm_indices indices;
      for (k = 0; k < 3; k++)
        {
          for (i = 0; i < nelt; i++)
            if (3 * i + k < 2 * nelt)
              sel[i] = 3 * i + k;
            else
              sel[i] = 0;
          indices.new_vector (sel, 2, nelt);
          perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);

          for (i = 0, j = 0; i < nelt; i++)
            if (3 * i + k < 2 * nelt)
              sel[i] = i;
            else
              sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
          indices.new_vector (sel, 2, nelt);
          perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);

          first_vect = dr_chain[0];
          second_vect = dr_chain[1];

          /* Create interleaving stmt (low part of):
             low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
                                                            ...}>  */
          data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
          perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
                                           second_vect, perm3_mask_low);
          vect_finish_stmt_generation (stmt, perm_stmt, gsi);

          /* Create interleaving stmt (high part of):
             high = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
                                                             ...}>  */
          first_vect = data_ref;
          second_vect = dr_chain[2];
          data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
          perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
                                           second_vect, perm3_mask_high);
          vect_finish_stmt_generation (stmt, perm_stmt, gsi);
          (*result_chain)[k] = data_ref;
        }
    }
  else
    {
      /* If length is not equal to 3 then only power of 2 is supported.  */
      gcc_assert (pow2p_hwi (length));

      /* The encoding has a single stepped pattern.  */
      poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
      vec_perm_builder sel (nelt, 1, 3);
      sel.quick_grow (3);
      for (i = 0; i < 3; ++i)
        sel[i] = i * 2;
      vec_perm_indices indices (sel, 2, nelt);
      perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);

      for (i = 0; i < 3; ++i)
        sel[i] = i * 2 + 1;
      indices.new_vector (sel, 2, nelt);
      perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);

      for (i = 0; i < log_length; i++)
        {
          for (j = 0; j < length; j += 2)
            {
              first_vect = dr_chain[j];
              second_vect = dr_chain[j+1];

              /* data_ref = permute_even (first_data_ref, second_data_ref);  */
              data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
              perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
                                               first_vect, second_vect,
                                               perm_mask_even);
              vect_finish_stmt_generation (stmt, perm_stmt, gsi);
              (*result_chain)[j/2] = data_ref;

              /* data_ref = permute_odd (first_data_ref, second_data_ref);  */
              data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
              perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
                                               first_vect, second_vect,
                                               perm_mask_odd);
              vect_finish_stmt_generation (stmt, perm_stmt, gsi);
              (*result_chain)[j/2+length/2] = data_ref;
            }
          memcpy (dr_chain.address (), result_chain->address (),
                  length * sizeof (tree));
        }
    }
}
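/* The following stand-alone sketch models the power-of-two path of
   vect_permute_load_chain above on plain arrays (exposition only: the helper
   name, the flat-array representation and the use of a VLA are assumptions
   made for the example, not part of the vectorizer; the block is kept inside
   "#if 0" so it never takes part in the build).  CHAIN holds LENGTH vectors
   of NELT elements back to back; after the call, vector V of CHAIN holds
   what the function above would leave in RESULT_CHAIN[V].  */
#if 0
#include <string.h>

static void
model_permute_load_chain (unsigned int *chain, unsigned int length,
                          unsigned int nelt)
{
  unsigned int tmp[length * nelt];      /* Scratch RESULT_CHAIN.  */

  /* One iteration per extract_even/odd stage, i.e. log2 (LENGTH) stages.  */
  for (unsigned int stage = length; stage > 1; stage /= 2)
    {
      for (unsigned int j = 0; j < length; j += 2)
        {
          unsigned int *first = chain + j * nelt;
          unsigned int *second = chain + (j + 1) * nelt;
          /* extract_even goes to slot j/2 and extract_odd to
             j/2 + length/2, mirroring the RESULT_CHAIN layout above.  */
          unsigned int *even = tmp + (j / 2) * nelt;
          unsigned int *odd = tmp + (j / 2 + length / 2) * nelt;
          for (unsigned int i = 0; i < nelt / 2; i++)
            {
              even[i] = first[2 * i];
              even[nelt / 2 + i] = second[2 * i];
              odd[i] = first[2 * i + 1];
              odd[nelt / 2 + i] = second[2 * i + 1];
            }
        }
      memcpy (chain, tmp, length * nelt * sizeof (unsigned int));
    }
}
#endif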
/* Function vect_shift_permute_load_chain.

   Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate a
   sequence of stmts to reorder the input data accordingly.
   Return the final references for loads in RESULT_CHAIN.
   Return true if successful, false otherwise.

   E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
   The input is 3 vectors each containing 8 elements.  We assign a
   number to each element, the input sequence is:

   1st vec:   0  1  2  3  4  5  6  7
   2nd vec:   8  9 10 11 12 13 14 15
   3rd vec:  16 17 18 19 20 21 22 23

   The output sequence should be:

   1st vec:  0 3 6  9 12 15 18 21
   2nd vec:  1 4 7 10 13 16 19 22
   3rd vec:  2 5 8 11 14 17 20 23

   We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.

   First we shuffle all 3 vectors to get correct elements order:

   1st vec:  ( 0  3  6) ( 1  4  7) ( 2  5)
   2nd vec:  ( 8 11 14) ( 9 12 15) (10 13)
   3rd vec:  (16 19 22) (17 20 23) (18 21)

   Next we unite and shift vector 3 times:

   shift right by 6 the concatenation of:
     "1st vec" and "2nd vec"
       ( 0  3  6) ( 1  4  7) |( 2  5) _ ( 8 11 14) ( 9 12 15)| (10 13)
     "2nd vec" and "3rd vec"
       ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
     "3rd vec" and "1st vec"
       (16 19 22) (17 20 23) |(18 21) _ ( 0  3  6) ( 1  4  7)| ( 2  5)

   So that now new vectors are:

   1st vec:  ( 2  5) ( 8 11 14) ( 9 12 15)
   2nd vec:  (10 13) (16 19 22) (17 20 23)
   3rd vec:  (18 21) ( 0  3  6) ( 1  4  7)

   shift right by 5 the concatenation of:
     "1st vec" and "3rd vec"
       ( 2  5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0  3  6)| ( 1  4  7)
     "2nd vec" and "1st vec"
       (10 13) (16 19 22) |(17 20 23) _ ( 2  5) ( 8 11 14)| ( 9 12 15)
     "3rd vec" and "2nd vec"
       (18 21) ( 0  3  6) |( 1  4  7) _ (10 13) (16 19 22)| (17 20 23)

   So that now new vectors are:

   1st vec:  ( 9 12 15) (18 21) ( 0  3  6)
   2nd vec:  (17 20 23) ( 2  5) ( 8 11 14)
   3rd vec:  ( 1  4  7) (10 13) (16 19 22)  READY

   shift right by 5 the concatenation of:
     "1st vec" and "1st vec"
       ( 9 12 15) (18 21) |( 0  3  6) _ ( 9 12 15) (18 21)| ( 0  3  6)
   shift right by 3 the concatenation of:
     "2nd vec" and "2nd vec"
       (17 20 23) |( 2  5) ( 8 11 14) _ (17 20 23)| ( 2  5) ( 8 11 14)

   So that now all vectors are READY:

   1st vec:  ( 0  3  6) ( 9 12 15) (18 21)
   2nd vec:  ( 2  5) ( 8 11 14) (17 20 23)
   3rd vec:  ( 1  4  7) (10 13) (16 19 22)

   This algorithm is faster than the one in vect_permute_load_chain if:
   1. "shift of a concatenation" is faster than general permutation.
   2. The TARGET machine can't execute vector instructions in parallel.
      This is because each step of the algorithm depends on the previous one.
      The algorithm in vect_permute_load_chain is much more parallel.

   The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.  */
static bool
vect_shift_permute_load_chain (vec<tree> dr_chain,
                               unsigned int length,
                               gimple *stmt,
                               gimple_stmt_iterator *gsi,
                               vec<tree> *result_chain)
{
  tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
  tree perm2_mask1, perm2_mask2, perm3_mask;
  tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
  gimple *perm_stmt;

  tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
  unsigned int i;
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);

  unsigned HOST_WIDE_INT nelt, vf;
  if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
      || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
    /* Not supported for variable-length vectors.  */
    return false;

  vec_perm_builder sel (nelt, nelt, 1);
  sel.quick_grow (nelt);

  result_chain->quick_grow (length);
  memcpy (result_chain->address (), dr_chain.address (),
          length * sizeof (tree));

  if (pow2p_hwi (length) && vf > 4)
    {
      unsigned int j, log_length = exact_log2 (length);
      for (i = 0; i < nelt / 2; ++i)
        sel[i] = i * 2;
      for (i = 0; i < nelt / 2; ++i)
        sel[nelt / 2 + i] = i * 2 + 1;
      vec_perm_indices indices (sel, 2, nelt);
      if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "shuffle of 2 fields structure is not "
                             "supported by target\n");
          return false;
        }
      perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);

      for (i = 0; i < nelt / 2; ++i)
        sel[i] = i * 2 + 1;
      for (i = 0; i < nelt / 2; ++i)
        sel[nelt / 2 + i] = i * 2;
      indices.new_vector (sel, 2, nelt);
      if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "shuffle of 2 fields structure is not "
                             "supported by target\n");
          return false;
        }
      perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);

      /* Generating permutation constant to shift all elements.
         For vector length 8 it is {4 5 6 7 8 9 10 11}.  */
      for (i = 0; i < nelt; i++)
        sel[i] = nelt / 2 + i;
      indices.new_vector (sel, 2, nelt);
      if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "shift permutation is not supported by target\n");
          return false;
        }
      shift1_mask = vect_gen_perm_mask_checked (vectype, indices);

      /* Generating permutation constant to select vector from 2.
         For vector length 8 it is {0 1 2 3 12 13 14 15}.  */
      for (i = 0; i < nelt / 2; i++)
        sel[i] = i;
      for (i = nelt / 2; i < nelt; i++)
        sel[i] = nelt + i;
      indices.new_vector (sel, 2, nelt);
      if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "select is not supported by target\n");
          return false;
        }
      select_mask = vect_gen_perm_mask_checked (vectype, indices);

      for (i = 0; i < log_length; i++)
        {
          for (j = 0; j < length; j += 2)
            {
              first_vect = dr_chain[j];
              second_vect = dr_chain[j + 1];

              data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
              perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
                                               first_vect, first_vect,
                                               perm2_mask1);
              vect_finish_stmt_generation (stmt, perm_stmt, gsi);
              vect[0] = data_ref;

              data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
              perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
                                               second_vect, second_vect,
                                               perm2_mask2);
              vect_finish_stmt_generation (stmt, perm_stmt, gsi);
              vect[1] = data_ref;

              data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
              perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
                                               vect[0], vect[1], shift1_mask);
              vect_finish_stmt_generation (stmt, perm_stmt, gsi);
              (*result_chain)[j/2 + length/2] = data_ref;

              data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
              perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
                                               vect[0], vect[1], select_mask);
              vect_finish_stmt_generation (stmt, perm_stmt, gsi);
              (*result_chain)[j/2] = data_ref;
            }
          memcpy (dr_chain.address (), result_chain->address (),
                  length * sizeof (tree));
        }
      return true;
    }
  if (length == 3 && vf > 2)
    {
      unsigned int k = 0, l = 0;

      /* Generating permutation constant to get all elements in right order.
         For vector length 8 it is {0 3 6 1 4 7 2 5}.  */
      for (i = 0; i < nelt; i++)
        {
          if (3 * k + (l % 3) >= nelt)
            {
              k = 0;
              l += (3 - (nelt % 3));
            }
          sel[i] = 3 * k + (l % 3);
          k++;
        }
      vec_perm_indices indices (sel, 2, nelt);
      if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "shuffle of 3 fields structure is not "
                             "supported by target\n");
          return false;
        }
      perm3_mask = vect_gen_perm_mask_checked (vectype, indices);

      /* Generating permutation constant to shift all elements.
         For vector length 8 it is {6 7 8 9 10 11 12 13}.  */
      for (i = 0; i < nelt; i++)
        sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
      indices.new_vector (sel, 2, nelt);
      if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "shift permutation is not supported by target\n");
          return false;
        }
      shift1_mask = vect_gen_perm_mask_checked (vectype, indices);

      /* Generating permutation constant to shift all elements.
         For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
      for (i = 0; i < nelt; i++)
        sel[i] = 2 * (nelt / 3) + 1 + i;
      indices.new_vector (sel, 2, nelt);
      if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "shift permutation is not supported by target\n");
          return false;
        }
      shift2_mask = vect_gen_perm_mask_checked (vectype, indices);

      /* Generating permutation constant to shift all elements.
         For vector length 8 it is {3 4 5 6 7 8 9 10}.  */
      for (i = 0; i < nelt; i++)
        sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
      indices.new_vector (sel, 2, nelt);
      if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "shift permutation is not supported by target\n");
          return false;
        }
      shift3_mask = vect_gen_perm_mask_checked (vectype, indices);

      /* Generating permutation constant to shift all elements.
         For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
      for (i = 0; i < nelt; i++)
        sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
      indices.new_vector (sel, 2, nelt);
      if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "shift permutation is not supported by target\n");
          return false;
        }
      shift4_mask = vect_gen_perm_mask_checked (vectype, indices);

      for (k = 0; k < 3; k++)
        {
          data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
          perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
                                           dr_chain[k], dr_chain[k],
                                           perm3_mask);
          vect_finish_stmt_generation (stmt, perm_stmt, gsi);
          vect[k] = data_ref;
        }

      for (k = 0; k < 3; k++)
        {
          data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
          perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
                                           vect[k % 3], vect[(k + 1) % 3],
                                           shift1_mask);
          vect_finish_stmt_generation (stmt, perm_stmt, gsi);
          vect_shift[k] = data_ref;
        }

      for (k = 0; k < 3; k++)
        {
          data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
          perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
                                           vect_shift[(4 - k) % 3],
                                           vect_shift[(3 - k) % 3],
                                           shift2_mask);
          vect_finish_stmt_generation (stmt, perm_stmt, gsi);
          vect[k] = data_ref;
        }

      (*result_chain)[3 - (nelt % 3)] = vect[2];

      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
                                       vect[0], shift3_mask);
      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
      (*result_chain)[nelt % 3] = data_ref;

      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
                                       vect[1], shift4_mask);
      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
      (*result_chain)[0] = data_ref;
      return true;
    }
  return false;
}
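/* A stand-alone model of the "shift of a concatenation" primitive that the
   masks built above implement (exposition only; the helper name and the
   plain-array representation are assumptions made for the example, and the
   block is kept inside "#if 0" so it stays out of the build).  SHIFT = nelt/2
   corresponds to the {4 5 6 7 8 9 10 11} mask used for vector length 8,
   SHIFT = 6 to the {6 7 8 9 10 11 12 13} one, and so on.  */
#if 0
static void
model_shift_concat (const unsigned int *first, const unsigned int *second,
                    unsigned int nelt, unsigned int shift, unsigned int *out)
{
  for (unsigned int i = 0; i < nelt; i++)
    {
      /* Take NELT consecutive elements of FIRST ++ SECOND starting at
         position SHIFT.  */
      unsigned int pos = shift + i;
      out[i] = pos < nelt ? first[pos] : second[pos - nelt];
    }
}
#endif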
/* Function vect_transform_grouped_load.

   Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
   to perform their permutation and ascribe the result vectorized statements to
   the scalar statements.  */

void
vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
                             gimple_stmt_iterator *gsi)
{
  machine_mode mode;
  vec<tree> result_chain = vNULL;

  /* DR_CHAIN contains input data-refs that are a part of the interleaving.
     RESULT_CHAIN is the output of vect_permute_load_chain: it contains the
     permuted vectors, which are ready for vector computation.  */
  result_chain.create (size);

  /* If the reassociation width for the vector type is 2 or greater, the
     target machine can execute 2 or more vector instructions in parallel;
     otherwise try to get the chain for the load group using
     vect_shift_permute_load_chain.  */
  mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
  if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
      || pow2p_hwi (size)
      || !vect_shift_permute_load_chain (dr_chain, size, stmt,
                                         gsi, &result_chain))
    vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
  vect_record_grouped_load_vectors (stmt, result_chain);
  result_chain.release ();
}
/* RESULT_CHAIN contains the output of a group of grouped loads that were
   generated as part of the vectorization of STMT.  Assign the statement
   for each vector to the associated scalar statement.  */

void
vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
{
  gimple *first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
  gimple *next_stmt, *new_stmt;
  unsigned int i, gap_count;
  tree tmp_data_ref;

  /* Put a permuted data-ref in the VECTORIZED_STMT field.
     Since we scan the chain starting from its first node, their order
     corresponds to the order of data-refs in RESULT_CHAIN.  */
  next_stmt = first_stmt;
  gap_count = 1;
  FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
    {
      if (!next_stmt)
        break;

      /* Skip the gaps.  Loads created for the gaps will be removed by dead
         code elimination pass later.  No need to check for the first stmt in
         the group, since it always exists.
         GROUP_GAP is the number of steps in elements from the previous
         access (if there is no gap GROUP_GAP is 1).  We skip loads that
         correspond to the gaps.  */
      if (next_stmt != first_stmt
          && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
        {
          gap_count++;
          continue;
        }

      while (next_stmt)
        {
          new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
          /* We assume that if VEC_STMT is not NULL, this is a case of multiple
             copies, and we put the new vector statement in the first available
             RELATED_STMT field.  */
          if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
            STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
          else
            {
              if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
                {
                  gimple *prev_stmt
                    = STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
                  gimple *rel_stmt
                    = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
                  while (rel_stmt)
                    {
                      prev_stmt = rel_stmt;
                      rel_stmt
                        = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
                    }

                  STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt))
                    = new_stmt;
                }
            }

          next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
          gap_count = 1;
          /* If NEXT_STMT accesses the same DR as the previous statement,
             put the same TMP_DATA_REF as its vectorized statement; otherwise
             get the next data-ref from RESULT_CHAIN.  */
          if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
            break;
        }
    }
}
/* Function vect_can_force_dr_alignment_p.

   Return whether the alignment of DECL can be forced to be aligned
   on an ALIGNMENT-bit boundary.  */

bool
vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
{
  if (!VAR_P (decl))
    return false;

  if (decl_in_symtab_p (decl)
      && !symtab_node::get (decl)->can_increase_alignment_p ())
    return false;

  if (TREE_STATIC (decl))
    return (alignment <= MAX_OFILE_ALIGNMENT);
  else
    return (alignment <= MAX_STACK_ALIGNMENT);
}
/* Return whether the data reference DR is supported with respect to its
   alignment.
   If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
   if it is aligned, i.e., check if it is possible to vectorize it with a
   different alignment.  */

enum dr_alignment_support
vect_supportable_dr_alignment (struct data_reference *dr,
                               bool check_aligned_accesses)
{
  gimple *stmt = DR_STMT (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  machine_mode mode = TYPE_MODE (vectype);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *vect_loop = NULL;
  bool nested_in_vect_loop = false;

  if (aligned_access_p (dr) && !check_aligned_accesses)
    return dr_aligned;

  /* For now assume all conditional loads/stores support unaligned
     access without any special code.  */
  if (is_gimple_call (stmt)
      && gimple_call_internal_p (stmt)
      && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
          || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
    return dr_unaligned_supported;

  if (loop_vinfo)
    {
      vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
      nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
    }

  /* Possibly unaligned access.  */

  /* We can choose between using the implicit realignment scheme (generating
     a misaligned_move stmt) and the explicit realignment scheme (generating
     aligned loads with a REALIGN_LOAD).  There are two variants to the
     explicit realignment scheme: optimized, and unoptimized.
     We can optimize the realignment only if the step between consecutive
     vector loads is equal to the vector size.  Since the vector memory
     accesses advance in steps of VS (Vector Size) in the vectorized loop, it
     is guaranteed that the misalignment amount remains the same throughout the
     execution of the vectorized loop.  Therefore, we can create the
     "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
     at the loop preheader.

     However, in the case of outer-loop vectorization, when vectorizing a
     memory access in the inner-loop nested within the LOOP that is now being
     vectorized, while it is guaranteed that the misalignment of the
     vectorized memory access will remain the same in different outer-loop
     iterations, it is *not* guaranteed that it will remain the same throughout
     the execution of the inner-loop.  This is because the inner-loop advances
     with the original scalar step (and not in steps of VS).  If the inner-loop
     step happens to be a multiple of VS, then the misalignment remains fixed
     and we can use the optimized realignment scheme.  For example:

      for (i=0; i<N; i++)
        for (j=0; j<M; j++)
          s += vp[i+j];

     When vectorizing the i-loop in the above example, the step between
     consecutive vector loads is 1, and so the misalignment does not remain
     fixed across the execution of the inner-loop, and the realignment cannot
     be optimized (as illustrated in the following pseudo vectorized loop):

      for (i=0; i<N; i+=4)
        for (j=0; j<M; j++){
          vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
                         // when j is {0,1,2,3,4,5,6,7,...} respectively.
                         // (assuming that we start from an aligned address).
          }

     We therefore have to use the unoptimized realignment scheme:

      for (i=0; i<N; i+=4)
        for (j=k; j<M; j+=4)
          vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
                         // that the misalignment of the initial address is
                         // 0).

     The loop can then be vectorized as follows:

      for (k=0; k<4; k++){
        rt = get_realignment_token (&vp[k]);
        for (i=0; i<N; i+=4){
          v1 = vp[i+k];
          for (j=k; j<M; j+=4){
            v2 = vp[i+j+VS-1];
            va = REALIGN_LOAD <v1,v2,rt>;
            vs += va;
            v1 = v2;
          }
        }
      } */

  if (DR_IS_READ (dr))
    {
      bool is_packed = false;
      tree type = (TREE_TYPE (DR_REF (dr)));

      if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
          && (!targetm.vectorize.builtin_mask_for_load
              || targetm.vectorize.builtin_mask_for_load ()))
        {
          tree vectype = STMT_VINFO_VECTYPE (stmt_info);

          /* If we are doing SLP then the accesses need not have the
             same alignment, instead it depends on the SLP group size.  */
          if (loop_vinfo
              && STMT_SLP_TYPE (stmt_info)
              && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
                              * GROUP_SIZE (vinfo_for_stmt
                                            (GROUP_FIRST_ELEMENT (stmt_info))),
                              TYPE_VECTOR_SUBPARTS (vectype)))
            ;
          else if (!loop_vinfo
                   || (nested_in_vect_loop
                       && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
                                    GET_MODE_SIZE (TYPE_MODE (vectype)))))
            return dr_explicit_realign;
          else
            return dr_explicit_realign_optimized;
        }
      if (!known_alignment_for_access_p (dr))
        is_packed = not_size_aligned (DR_REF (dr));

      if (targetm.vectorize.support_vector_misalignment
            (mode, type, DR_MISALIGNMENT (dr), is_packed))
        /* Can't software pipeline the loads, but can at least do them.  */
        return dr_unaligned_supported;
    }
  else
    {
      bool is_packed = false;
      tree type = (TREE_TYPE (DR_REF (dr)));

      if (!known_alignment_for_access_p (dr))
        is_packed = not_size_aligned (DR_REF (dr));

      if (targetm.vectorize.support_vector_misalignment
            (mode, type, DR_MISALIGNMENT (dr), is_packed))
        return dr_unaligned_supported;
    }

  /* Unsupported.  */
  return dr_unaligned_unsupported;
}
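/* Stand-alone model of the REALIGN_LOAD step used in the explicit
   realignment schemes discussed above (exposition only; the helper name, the
   int element type and the representation of the realignment token as a
   plain element offset are assumptions made for the example - real targets
   encode the token as a permutation mask derived from the address; the block
   is kept inside "#if 0" so it stays out of the build).  V1 and V2 are the
   two aligned vectors of VS elements, RT the misalignment in elements; OUT
   receives the VS elements that the unaligned access would have loaded.  */
#if 0
static void
model_realign_load (const int *v1, const int *v2, unsigned int rt,
                    unsigned int vs, int *out)
{
  for (unsigned int i = 0; i < vs; i++)
    /* Select VS consecutive elements of V1 ++ V2 starting at RT.  */
    out[i] = (rt + i) < vs ? v1[rt + i] : v2[rt + i - vs];
}
#endif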