1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2018 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
24 #include "coretypes.h"
34 #include "optabs-tree.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
54 #include "tree-hash-traits.h"
55 #include "vec-perm-indices.h"
56 #include "internal-fn.h"
58 /* Return true if load- or store-lanes optab OPTAB is implemented for
59 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
62 vect_lanes_optab_supported_p (const char *name
, convert_optab optab
,
63 tree vectype
, unsigned HOST_WIDE_INT count
)
65 machine_mode mode
, array_mode
;
68 mode
= TYPE_MODE (vectype
);
69 if (!targetm
.array_mode (mode
, count
).exists (&array_mode
))
71 poly_uint64 bits
= count
* GET_MODE_BITSIZE (mode
);
72 limit_p
= !targetm
.array_mode_supported_p (mode
, count
);
73 if (!int_mode_for_size (bits
, limit_p
).exists (&array_mode
))
75 if (dump_enabled_p ())
76 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
77 "no array mode for %s["
78 HOST_WIDE_INT_PRINT_DEC
"]\n",
79 GET_MODE_NAME (mode
), count
);
84 if (convert_optab_handler (optab
, array_mode
, mode
) == CODE_FOR_nothing
)
86 if (dump_enabled_p ())
87 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
88 "cannot use %s<%s><%s>\n", name
,
89 GET_MODE_NAME (array_mode
), GET_MODE_NAME (mode
));
93 if (dump_enabled_p ())
94 dump_printf_loc (MSG_NOTE
, vect_location
,
95 "can use %s<%s><%s>\n", name
, GET_MODE_NAME (array_mode
),
96 GET_MODE_NAME (mode
));
102 /* Return the smallest scalar part of STMT.
103 This is used to determine the vectype of the stmt. We generally set the
104 vectype according to the type of the result (lhs). For stmts whose
105 result-type is different than the type of the arguments (e.g., demotion,
106 promotion), vectype will be reset appropriately (later). Note that we have
107 to visit the smallest datatype in this function, because that determines the
108 VF. If the smallest datatype in the loop is present only as the rhs of a
109 promotion operation - we'd miss it.
110 Such a case, where a variable of this datatype does not appear in the lhs
111 anywhere in the loop, can only occur if it's an invariant: e.g.:
112 'int_x = (int) short_inv', which we'd expect to have been optimized away by
113 invariant motion. However, we cannot rely on invariant motion to always
114 take invariants out of the loop, and so in the case of promotion we also
115 have to check the rhs.
116 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
120 vect_get_smallest_scalar_type (gimple
*stmt
, HOST_WIDE_INT
*lhs_size_unit
,
121 HOST_WIDE_INT
*rhs_size_unit
)
123 tree scalar_type
= gimple_expr_type (stmt
);
124 HOST_WIDE_INT lhs
, rhs
;
126 /* During the analysis phase, this function is called on arbitrary
127 statements that might not have scalar results. */
128 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type
)))
131 lhs
= rhs
= TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type
));
133 if (is_gimple_assign (stmt
)
134 && (gimple_assign_cast_p (stmt
)
135 || gimple_assign_rhs_code (stmt
) == DOT_PROD_EXPR
136 || gimple_assign_rhs_code (stmt
) == WIDEN_SUM_EXPR
137 || gimple_assign_rhs_code (stmt
) == WIDEN_MULT_EXPR
138 || gimple_assign_rhs_code (stmt
) == WIDEN_LSHIFT_EXPR
139 || gimple_assign_rhs_code (stmt
) == FLOAT_EXPR
))
141 tree rhs_type
= TREE_TYPE (gimple_assign_rhs1 (stmt
));
143 rhs
= TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type
));
145 scalar_type
= rhs_type
;
148 *lhs_size_unit
= lhs
;
149 *rhs_size_unit
= rhs
;
154 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
155 tested at run-time. Return TRUE if DDR was successfully inserted.
156 Return false if versioning is not supported. */
159 vect_mark_for_runtime_alias_test (ddr_p ddr
, loop_vec_info loop_vinfo
)
161 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
163 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS
) == 0)
166 if (!runtime_alias_check_p (ddr
, loop
,
167 optimize_loop_nest_for_speed_p (loop
)))
170 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo
).safe_push (ddr
);
174 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. */
177 vect_check_nonzero_value (loop_vec_info loop_vinfo
, tree value
)
179 vec
<tree
> checks
= LOOP_VINFO_CHECK_NONZERO (loop_vinfo
);
180 for (unsigned int i
= 0; i
< checks
.length(); ++i
)
181 if (checks
[i
] == value
)
184 if (dump_enabled_p ())
186 dump_printf_loc (MSG_NOTE
, vect_location
, "need run-time check that ");
187 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, value
);
188 dump_printf (MSG_NOTE
, " is nonzero\n");
190 LOOP_VINFO_CHECK_NONZERO (loop_vinfo
).safe_push (value
);
193 /* Return true if we know that the order of vectorized STMT_A and
194 vectorized STMT_B will be the same as the order of STMT_A and STMT_B.
195 At least one of the statements is a write. */
198 vect_preserves_scalar_order_p (gimple
*stmt_a
, gimple
*stmt_b
)
200 stmt_vec_info stmtinfo_a
= vinfo_for_stmt (stmt_a
);
201 stmt_vec_info stmtinfo_b
= vinfo_for_stmt (stmt_b
);
203 /* Single statements are always kept in their original order. */
204 if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a
)
205 && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b
))
208 /* STMT_A and STMT_B belong to overlapping groups. All loads in a
209 group are emitted at the position of the first scalar load and all
210 stores in a group are emitted at the position of the last scalar store.
211 Thus writes will happen no earlier than their current position
212 (but could happen later) while reads will happen no later than their
213 current position (but could happen earlier). Reordering is therefore
214 only possible if the first access is a write. */
215 gimple
*earlier_stmt
= get_earlier_stmt (stmt_a
, stmt_b
);
216 return !DR_IS_WRITE (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt
)));
219 /* A subroutine of vect_analyze_data_ref_dependence. Handle
220 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
221 distances. These distances are conservatively correct but they don't
222 reflect a guaranteed dependence.
224 Return true if this function does all the work necessary to avoid
225 an alias or false if the caller should use the dependence distances
226 to limit the vectorization factor in the usual way. LOOP_DEPTH is
227 the depth of the loop described by LOOP_VINFO and the other arguments
228 are as for vect_analyze_data_ref_dependence. */
231 vect_analyze_possibly_independent_ddr (data_dependence_relation
*ddr
,
232 loop_vec_info loop_vinfo
,
233 int loop_depth
, unsigned int *max_vf
)
235 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
236 lambda_vector dist_v
;
238 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr
), i
, dist_v
)
240 int dist
= dist_v
[loop_depth
];
241 if (dist
!= 0 && !(dist
> 0 && DDR_REVERSED_P (ddr
)))
243 /* If the user asserted safelen >= DIST consecutive iterations
244 can be executed concurrently, assume independence.
246 ??? An alternative would be to add the alias check even
247 in this case, and vectorize the fallback loop with the
248 maximum VF set to safelen. However, if the user has
249 explicitly given a length, it's less likely that that
251 if (loop
->safelen
>= 2 && abs_hwi (dist
) <= loop
->safelen
)
253 if ((unsigned int) loop
->safelen
< *max_vf
)
254 *max_vf
= loop
->safelen
;
255 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo
) = false;
259 /* For dependence distances of 2 or more, we have the option
260 of limiting VF or checking for an alias at runtime.
261 Prefer to check at runtime if we can, to avoid limiting
262 the VF unnecessarily when the bases are in fact independent.
264 Note that the alias checks will be removed if the VF ends up
265 being small enough. */
266 return vect_mark_for_runtime_alias_test (ddr
, loop_vinfo
);
273 /* Function vect_analyze_data_ref_dependence.
275 Return TRUE if there (might) exist a dependence between a memory-reference
276 DRA and a memory-reference DRB. When versioning for alias may check a
277 dependence at run-time, return FALSE. Adjust *MAX_VF according to
278 the data dependence. */
281 vect_analyze_data_ref_dependence (struct data_dependence_relation
*ddr
,
282 loop_vec_info loop_vinfo
,
283 unsigned int *max_vf
)
286 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
287 struct data_reference
*dra
= DDR_A (ddr
);
288 struct data_reference
*drb
= DDR_B (ddr
);
289 stmt_vec_info stmtinfo_a
= vinfo_for_stmt (DR_STMT (dra
));
290 stmt_vec_info stmtinfo_b
= vinfo_for_stmt (DR_STMT (drb
));
291 lambda_vector dist_v
;
292 unsigned int loop_depth
;
294 /* In loop analysis all data references should be vectorizable. */
295 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a
)
296 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b
))
299 /* Independent data accesses. */
300 if (DDR_ARE_DEPENDENT (ddr
) == chrec_known
)
304 || (DR_IS_READ (dra
) && DR_IS_READ (drb
)))
307 /* We do not have to consider dependences between accesses that belong
308 to the same group, unless the stride could be smaller than the
310 if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a
)
311 && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a
)
312 == DR_GROUP_FIRST_ELEMENT (stmtinfo_b
))
313 && !STMT_VINFO_STRIDED_P (stmtinfo_a
))
316 /* Even if we have an anti-dependence then, as the vectorized loop covers at
317 least two scalar iterations, there is always also a true dependence.
318 As the vectorizer does not re-order loads and stores we can ignore
319 the anti-dependence if TBAA can disambiguate both DRs similar to the
320 case with known negative distance anti-dependences (positive
321 distance anti-dependences would violate TBAA constraints). */
322 if (((DR_IS_READ (dra
) && DR_IS_WRITE (drb
))
323 || (DR_IS_WRITE (dra
) && DR_IS_READ (drb
)))
324 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra
)),
325 get_alias_set (DR_REF (drb
))))
328 /* Unknown data dependence. */
329 if (DDR_ARE_DEPENDENT (ddr
) == chrec_dont_know
)
331 /* If user asserted safelen consecutive iterations can be
332 executed concurrently, assume independence. */
333 if (loop
->safelen
>= 2)
335 if ((unsigned int) loop
->safelen
< *max_vf
)
336 *max_vf
= loop
->safelen
;
337 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo
) = false;
341 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a
)
342 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b
))
344 if (dump_enabled_p ())
346 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
347 "versioning for alias not supported for: "
348 "can't determine dependence between ");
349 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
351 dump_printf (MSG_MISSED_OPTIMIZATION
, " and ");
352 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
354 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
359 if (dump_enabled_p ())
361 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
362 "versioning for alias required: "
363 "can't determine dependence between ");
364 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
366 dump_printf (MSG_MISSED_OPTIMIZATION
, " and ");
367 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
369 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
372 /* Add to list of ddrs that need to be tested at run-time. */
373 return !vect_mark_for_runtime_alias_test (ddr
, loop_vinfo
);
376 /* Known data dependence. */
377 if (DDR_NUM_DIST_VECTS (ddr
) == 0)
379 /* If user asserted safelen consecutive iterations can be
380 executed concurrently, assume independence. */
381 if (loop
->safelen
>= 2)
383 if ((unsigned int) loop
->safelen
< *max_vf
)
384 *max_vf
= loop
->safelen
;
385 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo
) = false;
389 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a
)
390 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b
))
392 if (dump_enabled_p ())
394 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
395 "versioning for alias not supported for: "
396 "bad dist vector for ");
397 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
399 dump_printf (MSG_MISSED_OPTIMIZATION
, " and ");
400 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
402 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
407 if (dump_enabled_p ())
409 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
410 "versioning for alias required: "
411 "bad dist vector for ");
412 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, DR_REF (dra
));
413 dump_printf (MSG_MISSED_OPTIMIZATION
, " and ");
414 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, DR_REF (drb
));
415 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
417 /* Add to list of ddrs that need to be tested at run-time. */
418 return !vect_mark_for_runtime_alias_test (ddr
, loop_vinfo
);
421 loop_depth
= index_in_loop_nest (loop
->num
, DDR_LOOP_NEST (ddr
));
423 if (DDR_COULD_BE_INDEPENDENT_P (ddr
)
424 && vect_analyze_possibly_independent_ddr (ddr
, loop_vinfo
,
428 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr
), i
, dist_v
)
430 int dist
= dist_v
[loop_depth
];
432 if (dump_enabled_p ())
433 dump_printf_loc (MSG_NOTE
, vect_location
,
434 "dependence distance = %d.\n", dist
);
438 if (dump_enabled_p ())
440 dump_printf_loc (MSG_NOTE
, vect_location
,
441 "dependence distance == 0 between ");
442 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dra
));
443 dump_printf (MSG_NOTE
, " and ");
444 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (drb
));
445 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
448 /* When we perform grouped accesses and perform implicit CSE
449 by detecting equal accesses and doing disambiguation with
450 runtime alias tests like for
458 where we will end up loading { a[i], a[i+1] } once, make
459 sure that inserting group loads before the first load and
460 stores after the last store will do the right thing.
461 Similar for groups like
465 where loads from the group interleave with the store. */
466 if (!vect_preserves_scalar_order_p (DR_STMT (dra
), DR_STMT (drb
)))
468 if (dump_enabled_p ())
469 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
470 "READ_WRITE dependence in interleaving.\n");
474 if (loop
->safelen
< 2)
476 tree indicator
= dr_zero_step_indicator (dra
);
477 if (TREE_CODE (indicator
) != INTEGER_CST
)
478 vect_check_nonzero_value (loop_vinfo
, indicator
);
479 else if (integer_zerop (indicator
))
481 if (dump_enabled_p ())
482 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
483 "access also has a zero step\n");
490 if (dist
> 0 && DDR_REVERSED_P (ddr
))
492 /* If DDR_REVERSED_P the order of the data-refs in DDR was
493 reversed (to make distance vector positive), and the actual
494 distance is negative. */
495 if (dump_enabled_p ())
496 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
497 "dependence distance negative.\n");
498 /* Record a negative dependence distance to later limit the
499 amount of stmt copying / unrolling we can perform.
500 Only need to handle read-after-write dependence. */
502 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b
) == 0
503 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b
) > (unsigned)dist
))
504 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b
) = dist
;
508 unsigned int abs_dist
= abs (dist
);
509 if (abs_dist
>= 2 && abs_dist
< *max_vf
)
511 /* The dependence distance requires reduction of the maximal
512 vectorization factor. */
513 *max_vf
= abs (dist
);
514 if (dump_enabled_p ())
515 dump_printf_loc (MSG_NOTE
, vect_location
,
516 "adjusting maximal vectorization factor to %i\n",
520 if (abs_dist
>= *max_vf
)
522 /* Dependence distance does not create dependence, as far as
523 vectorization is concerned, in this case. */
524 if (dump_enabled_p ())
525 dump_printf_loc (MSG_NOTE
, vect_location
,
526 "dependence distance >= VF.\n");
530 if (dump_enabled_p ())
532 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
533 "not vectorized, possible dependence "
534 "between data-refs ");
535 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dra
));
536 dump_printf (MSG_NOTE
, " and ");
537 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (drb
));
538 dump_printf (MSG_NOTE
, "\n");
547 /* Function vect_analyze_data_ref_dependences.
549 Examine all the data references in the loop, and make sure there do not
550 exist any data dependences between them. Set *MAX_VF according to
551 the maximum vectorization factor the data dependences allow. */
554 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo
,
555 unsigned int *max_vf
)
558 struct data_dependence_relation
*ddr
;
560 if (dump_enabled_p ())
561 dump_printf_loc (MSG_NOTE
, vect_location
,
562 "=== vect_analyze_data_ref_dependences ===\n");
564 LOOP_VINFO_DDRS (loop_vinfo
)
565 .create (LOOP_VINFO_DATAREFS (loop_vinfo
).length ()
566 * LOOP_VINFO_DATAREFS (loop_vinfo
).length ());
567 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo
) = true;
568 /* We need read-read dependences to compute STMT_VINFO_SAME_ALIGN_REFS. */
569 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo
),
570 &LOOP_VINFO_DDRS (loop_vinfo
),
571 LOOP_VINFO_LOOP_NEST (loop_vinfo
), true))
574 /* For epilogues we either have no aliases or alias versioning
575 was applied to original loop. Therefore we may just get max_vf
576 using VF of original loop. */
577 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo
))
578 *max_vf
= LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo
);
580 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo
), i
, ddr
)
581 if (vect_analyze_data_ref_dependence (ddr
, loop_vinfo
, max_vf
))
588 /* Function vect_slp_analyze_data_ref_dependence.
590 Return TRUE if there (might) exist a dependence between a memory-reference
591 DRA and a memory-reference DRB. When versioning for alias may check a
592 dependence at run-time, return FALSE. Adjust *MAX_VF according to
593 the data dependence. */
596 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation
*ddr
)
598 struct data_reference
*dra
= DDR_A (ddr
);
599 struct data_reference
*drb
= DDR_B (ddr
);
601 /* We need to check dependences of statements marked as unvectorizable
602 as well, they still can prohibit vectorization. */
604 /* Independent data accesses. */
605 if (DDR_ARE_DEPENDENT (ddr
) == chrec_known
)
611 /* Read-read is OK. */
612 if (DR_IS_READ (dra
) && DR_IS_READ (drb
))
615 /* If dra and drb are part of the same interleaving chain consider
617 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra
)))
618 && (DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra
)))
619 == DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb
)))))
622 /* Unknown data dependence. */
623 if (DDR_ARE_DEPENDENT (ddr
) == chrec_dont_know
)
625 if (dump_enabled_p ())
627 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
628 "can't determine dependence between ");
629 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, DR_REF (dra
));
630 dump_printf (MSG_MISSED_OPTIMIZATION
, " and ");
631 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, DR_REF (drb
));
632 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
635 else if (dump_enabled_p ())
637 dump_printf_loc (MSG_NOTE
, vect_location
,
638 "determined dependence between ");
639 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dra
));
640 dump_printf (MSG_NOTE
, " and ");
641 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (drb
));
642 dump_printf (MSG_NOTE
, "\n");
649 /* Analyze dependences involved in the transform of SLP NODE. STORES
650 contain the vector of scalar stores of this instance if we are
651 disambiguating the loads. */
654 vect_slp_analyze_node_dependences (slp_instance instance
, slp_tree node
,
655 vec
<gimple
*> stores
, gimple
*last_store
)
657 /* This walks over all stmts involved in the SLP load/store done
658 in NODE verifying we can sink them up to the last stmt in the
660 gimple
*last_access
= vect_find_last_scalar_stmt_in_slp (node
);
661 for (unsigned k
= 0; k
< SLP_INSTANCE_GROUP_SIZE (instance
); ++k
)
663 gimple
*access
= SLP_TREE_SCALAR_STMTS (node
)[k
];
664 if (access
== last_access
)
666 data_reference
*dr_a
= STMT_VINFO_DATA_REF (vinfo_for_stmt (access
));
667 for (gimple_stmt_iterator gsi
= gsi_for_stmt (access
);
668 gsi_stmt (gsi
) != last_access
; gsi_next (&gsi
))
670 gimple
*stmt
= gsi_stmt (gsi
);
671 if (! gimple_vuse (stmt
)
672 || (DR_IS_READ (dr_a
) && ! gimple_vdef (stmt
)))
675 /* If we couldn't record a (single) data reference for this
676 stmt we have to give up. */
677 /* ??? Here and below if dependence analysis fails we can resort
678 to the alias oracle which can handle more kinds of stmts. */
679 data_reference
*dr_b
= STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt
));
683 bool dependent
= false;
684 /* If we run into a store of this same instance (we've just
685 marked those) then delay dependence checking until we run
686 into the last store because this is where it will have
687 been sunk to (and we verify if we can do that as well). */
688 if (gimple_visited_p (stmt
))
690 if (stmt
!= last_store
)
694 FOR_EACH_VEC_ELT (stores
, i
, store
)
696 data_reference
*store_dr
697 = STMT_VINFO_DATA_REF (vinfo_for_stmt (store
));
698 ddr_p ddr
= initialize_data_dependence_relation
699 (dr_a
, store_dr
, vNULL
);
700 dependent
= vect_slp_analyze_data_ref_dependence (ddr
);
701 free_dependence_relation (ddr
);
708 ddr_p ddr
= initialize_data_dependence_relation (dr_a
,
710 dependent
= vect_slp_analyze_data_ref_dependence (ddr
);
711 free_dependence_relation (ddr
);
721 /* Function vect_analyze_data_ref_dependences.
723 Examine all the data references in the basic-block, and make sure there
724 do not exist any data dependences between them. Set *MAX_VF according to
725 the maximum vectorization factor the data dependences allow. */
728 vect_slp_analyze_instance_dependence (slp_instance instance
)
730 if (dump_enabled_p ())
731 dump_printf_loc (MSG_NOTE
, vect_location
,
732 "=== vect_slp_analyze_instance_dependence ===\n");
734 /* The stores of this instance are at the root of the SLP tree. */
735 slp_tree store
= SLP_INSTANCE_TREE (instance
);
736 if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store
)[0])))
739 /* Verify we can sink stores to the vectorized stmt insert location. */
740 gimple
*last_store
= NULL
;
743 if (! vect_slp_analyze_node_dependences (instance
, store
, vNULL
, NULL
))
746 /* Mark stores in this instance and remember the last one. */
747 last_store
= vect_find_last_scalar_stmt_in_slp (store
);
748 for (unsigned k
= 0; k
< SLP_INSTANCE_GROUP_SIZE (instance
); ++k
)
749 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store
)[k
], true);
754 /* Verify we can sink loads to the vectorized stmt insert location,
755 special-casing stores of this instance. */
758 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance
), i
, load
)
759 if (! vect_slp_analyze_node_dependences (instance
, load
,
761 ? SLP_TREE_SCALAR_STMTS (store
)
762 : vNULL
, last_store
))
768 /* Unset the visited flag. */
770 for (unsigned k
= 0; k
< SLP_INSTANCE_GROUP_SIZE (instance
); ++k
)
771 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store
)[k
], false);
776 /* Record in VINFO the base alignment guarantee given by DRB. STMT is
777 the statement that contains DRB, which is useful for recording in the
781 vect_record_base_alignment (vec_info
*vinfo
, gimple
*stmt
,
782 innermost_loop_behavior
*drb
)
785 innermost_loop_behavior
*&entry
786 = vinfo
->base_alignments
.get_or_insert (drb
->base_address
, &existed
);
787 if (!existed
|| entry
->base_alignment
< drb
->base_alignment
)
790 if (dump_enabled_p ())
792 dump_printf_loc (MSG_NOTE
, vect_location
,
793 "recording new base alignment for ");
794 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, drb
->base_address
);
795 dump_printf (MSG_NOTE
, "\n");
796 dump_printf_loc (MSG_NOTE
, vect_location
,
797 " alignment: %d\n", drb
->base_alignment
);
798 dump_printf_loc (MSG_NOTE
, vect_location
,
799 " misalignment: %d\n", drb
->base_misalignment
);
800 dump_printf_loc (MSG_NOTE
, vect_location
,
802 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, stmt
, 0);
807 /* If the region we're going to vectorize is reached, all unconditional
808 data references occur at least once. We can therefore pool the base
809 alignment guarantees from each unconditional reference. Do this by
810 going through all the data references in VINFO and checking whether
811 the containing statement makes the reference unconditionally. If so,
812 record the alignment of the base address in VINFO so that it can be
813 used for all other references with the same base. */
816 vect_record_base_alignments (vec_info
*vinfo
)
818 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
);
819 struct loop
*loop
= loop_vinfo
? LOOP_VINFO_LOOP (loop_vinfo
) : NULL
;
822 FOR_EACH_VEC_ELT (vinfo
->datarefs
, i
, dr
)
823 if (!DR_IS_CONDITIONAL_IN_STMT (dr
))
825 gimple
*stmt
= DR_STMT (dr
);
826 vect_record_base_alignment (vinfo
, stmt
, &DR_INNERMOST (dr
));
828 /* If DR is nested in the loop that is being vectorized, we can also
829 record the alignment of the base wrt the outer loop. */
830 if (loop
&& nested_in_vect_loop_p (loop
, stmt
))
832 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
833 vect_record_base_alignment
834 (vinfo
, stmt
, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info
));
839 /* Return the target alignment for the vectorized form of DR. */
842 vect_calculate_target_alignment (struct data_reference
*dr
)
844 gimple
*stmt
= DR_STMT (dr
);
845 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
846 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
847 return targetm
.vectorize
.preferred_vector_alignment (vectype
);
850 /* Function vect_compute_data_ref_alignment
852 Compute the misalignment of the data reference DR.
855 1. If during the misalignment computation it is found that the data reference
856 cannot be vectorized then false is returned.
857 2. DR_MISALIGNMENT (DR) is defined.
859 FOR NOW: No analysis is actually performed. Misalignment is calculated
860 only for trivial cases. TODO. */
863 vect_compute_data_ref_alignment (struct data_reference
*dr
)
865 gimple
*stmt
= DR_STMT (dr
);
866 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
867 vec_base_alignments
*base_alignments
= &stmt_info
->vinfo
->base_alignments
;
868 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
869 struct loop
*loop
= NULL
;
870 tree ref
= DR_REF (dr
);
871 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
873 if (dump_enabled_p ())
874 dump_printf_loc (MSG_NOTE
, vect_location
,
875 "vect_compute_data_ref_alignment:\n");
878 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
880 /* Initialize misalignment to unknown. */
881 SET_DR_MISALIGNMENT (dr
, DR_MISALIGNMENT_UNKNOWN
);
883 innermost_loop_behavior
*drb
= vect_dr_behavior (dr
);
884 bool step_preserves_misalignment_p
;
886 unsigned HOST_WIDE_INT vector_alignment
887 = vect_calculate_target_alignment (dr
) / BITS_PER_UNIT
;
888 DR_TARGET_ALIGNMENT (dr
) = vector_alignment
;
890 /* No step for BB vectorization. */
893 gcc_assert (integer_zerop (drb
->step
));
894 step_preserves_misalignment_p
= true;
897 /* In case the dataref is in an inner-loop of the loop that is being
898 vectorized (LOOP), we use the base and misalignment information
899 relative to the outer-loop (LOOP). This is ok only if the misalignment
900 stays the same throughout the execution of the inner-loop, which is why
901 we have to check that the stride of the dataref in the inner-loop evenly
902 divides by the vector alignment. */
903 else if (nested_in_vect_loop_p (loop
, stmt
))
905 step_preserves_misalignment_p
906 = (DR_STEP_ALIGNMENT (dr
) % vector_alignment
) == 0;
908 if (dump_enabled_p ())
910 if (step_preserves_misalignment_p
)
911 dump_printf_loc (MSG_NOTE
, vect_location
,
912 "inner step divides the vector alignment.\n");
914 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
915 "inner step doesn't divide the vector"
920 /* Similarly we can only use base and misalignment information relative to
921 an innermost loop if the misalignment stays the same throughout the
922 execution of the loop. As above, this is the case if the stride of
923 the dataref evenly divides by the alignment. */
926 poly_uint64 vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
927 step_preserves_misalignment_p
928 = multiple_p (DR_STEP_ALIGNMENT (dr
) * vf
, vector_alignment
);
930 if (!step_preserves_misalignment_p
&& dump_enabled_p ())
931 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
932 "step doesn't divide the vector alignment.\n");
935 unsigned int base_alignment
= drb
->base_alignment
;
936 unsigned int base_misalignment
= drb
->base_misalignment
;
938 /* Calculate the maximum of the pooled base address alignment and the
939 alignment that we can compute for DR itself. */
940 innermost_loop_behavior
**entry
= base_alignments
->get (drb
->base_address
);
941 if (entry
&& base_alignment
< (*entry
)->base_alignment
)
943 base_alignment
= (*entry
)->base_alignment
;
944 base_misalignment
= (*entry
)->base_misalignment
;
947 if (drb
->offset_alignment
< vector_alignment
948 || !step_preserves_misalignment_p
949 /* We need to know whether the step wrt the vectorized loop is
950 negative when computing the starting misalignment below. */
951 || TREE_CODE (drb
->step
) != INTEGER_CST
)
953 if (dump_enabled_p ())
955 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
956 "Unknown alignment for access: ");
957 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, ref
);
958 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
963 if (base_alignment
< vector_alignment
)
965 unsigned int max_alignment
;
966 tree base
= get_base_for_alignment (drb
->base_address
, &max_alignment
);
967 if (max_alignment
< vector_alignment
968 || !vect_can_force_dr_alignment_p (base
,
969 vector_alignment
* BITS_PER_UNIT
))
971 if (dump_enabled_p ())
973 dump_printf_loc (MSG_NOTE
, vect_location
,
974 "can't force alignment of ref: ");
975 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, ref
);
976 dump_printf (MSG_NOTE
, "\n");
981 /* Force the alignment of the decl.
982 NOTE: This is the only change to the code we make during
983 the analysis phase, before deciding to vectorize the loop. */
984 if (dump_enabled_p ())
986 dump_printf_loc (MSG_NOTE
, vect_location
, "force alignment of ");
987 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, ref
);
988 dump_printf (MSG_NOTE
, "\n");
991 DR_VECT_AUX (dr
)->base_decl
= base
;
992 DR_VECT_AUX (dr
)->base_misaligned
= true;
993 base_misalignment
= 0;
995 poly_int64 misalignment
996 = base_misalignment
+ wi::to_poly_offset (drb
->init
).force_shwi ();
998 /* If this is a backward running DR then first access in the larger
999 vectype actually is N-1 elements before the address in the DR.
1000 Adjust misalign accordingly. */
1001 if (tree_int_cst_sgn (drb
->step
) < 0)
1002 /* PLUS because STEP is negative. */
1003 misalignment
+= ((TYPE_VECTOR_SUBPARTS (vectype
) - 1)
1004 * TREE_INT_CST_LOW (drb
->step
));
1006 unsigned int const_misalignment
;
1007 if (!known_misalignment (misalignment
, vector_alignment
,
1008 &const_misalignment
))
1010 if (dump_enabled_p ())
1012 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1013 "Non-constant misalignment for access: ");
1014 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, ref
);
1015 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
1020 SET_DR_MISALIGNMENT (dr
, const_misalignment
);
1022 if (dump_enabled_p ())
1024 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1025 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr
));
1026 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, ref
);
1027 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
1033 /* Function vect_update_misalignment_for_peel.
1034 Sets DR's misalignment
1035 - to 0 if it has the same alignment as DR_PEEL,
1036 - to the misalignment computed using NPEEL if DR's salignment is known,
1037 - to -1 (unknown) otherwise.
1039 DR - the data reference whose misalignment is to be adjusted.
1040 DR_PEEL - the data reference whose misalignment is being made
1041 zero in the vector loop by the peel.
1042 NPEEL - the number of iterations in the peel loop if the misalignment
1043 of DR_PEEL is known at compile time. */
1046 vect_update_misalignment_for_peel (struct data_reference
*dr
,
1047 struct data_reference
*dr_peel
, int npeel
)
1050 vec
<dr_p
> same_aligned_drs
;
1051 struct data_reference
*current_dr
;
1052 int dr_size
= vect_get_scalar_dr_size (dr
);
1053 int dr_peel_size
= vect_get_scalar_dr_size (dr_peel
);
1054 stmt_vec_info stmt_info
= vinfo_for_stmt (DR_STMT (dr
));
1055 stmt_vec_info peel_stmt_info
= vinfo_for_stmt (DR_STMT (dr_peel
));
1057 /* For interleaved data accesses the step in the loop must be multiplied by
1058 the size of the interleaving group. */
1059 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1060 dr_size
*= DR_GROUP_SIZE (vinfo_for_stmt (DR_GROUP_FIRST_ELEMENT (stmt_info
)));
1061 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info
))
1062 dr_peel_size
*= DR_GROUP_SIZE (peel_stmt_info
);
1064 /* It can be assumed that the data refs with the same alignment as dr_peel
1065 are aligned in the vector loop. */
1067 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel
)));
1068 FOR_EACH_VEC_ELT (same_aligned_drs
, i
, current_dr
)
1070 if (current_dr
!= dr
)
1072 gcc_assert (!known_alignment_for_access_p (dr
)
1073 || !known_alignment_for_access_p (dr_peel
)
1074 || (DR_MISALIGNMENT (dr
) / dr_size
1075 == DR_MISALIGNMENT (dr_peel
) / dr_peel_size
));
1076 SET_DR_MISALIGNMENT (dr
, 0);
1080 if (known_alignment_for_access_p (dr
)
1081 && known_alignment_for_access_p (dr_peel
))
1083 bool negative
= tree_int_cst_compare (DR_STEP (dr
), size_zero_node
) < 0;
1084 int misal
= DR_MISALIGNMENT (dr
);
1085 misal
+= negative
? -npeel
* dr_size
: npeel
* dr_size
;
1086 misal
&= DR_TARGET_ALIGNMENT (dr
) - 1;
1087 SET_DR_MISALIGNMENT (dr
, misal
);
1091 if (dump_enabled_p ())
1092 dump_printf_loc (MSG_NOTE
, vect_location
, "Setting misalignment " \
1093 "to unknown (-1).\n");
1094 SET_DR_MISALIGNMENT (dr
, DR_MISALIGNMENT_UNKNOWN
);
1098 /* Function verify_data_ref_alignment
1100 Return TRUE if DR can be handled with respect to alignment. */
1103 verify_data_ref_alignment (data_reference_p dr
)
1105 enum dr_alignment_support supportable_dr_alignment
1106 = vect_supportable_dr_alignment (dr
, false);
1107 if (!supportable_dr_alignment
)
1109 if (dump_enabled_p ())
1111 if (DR_IS_READ (dr
))
1112 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1113 "not vectorized: unsupported unaligned load.");
1115 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1116 "not vectorized: unsupported unaligned "
1119 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
1121 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
1126 if (supportable_dr_alignment
!= dr_aligned
&& dump_enabled_p ())
1127 dump_printf_loc (MSG_NOTE
, vect_location
,
1128 "Vectorizing an unaligned access.\n");
1133 /* Function vect_verify_datarefs_alignment
1135 Return TRUE if all data references in the loop can be
1136 handled with respect to alignment. */
1139 vect_verify_datarefs_alignment (loop_vec_info vinfo
)
1141 vec
<data_reference_p
> datarefs
= vinfo
->datarefs
;
1142 struct data_reference
*dr
;
1145 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
1147 gimple
*stmt
= DR_STMT (dr
);
1148 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
1150 if (!STMT_VINFO_RELEVANT_P (stmt_info
))
1153 /* For interleaving, only the alignment of the first access matters. */
1154 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1155 && DR_GROUP_FIRST_ELEMENT (stmt_info
) != stmt
)
1158 /* Strided accesses perform only component accesses, alignment is
1159 irrelevant for them. */
1160 if (STMT_VINFO_STRIDED_P (stmt_info
)
1161 && !STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1164 if (! verify_data_ref_alignment (dr
))
1171 /* Given an memory reference EXP return whether its alignment is less
1175 not_size_aligned (tree exp
)
1177 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp
))))
1180 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp
)))
1181 > get_object_alignment (exp
));
1184 /* Function vector_alignment_reachable_p
1186 Return true if vector alignment for DR is reachable by peeling
1187 a few loop iterations. Return false otherwise. */
1190 vector_alignment_reachable_p (struct data_reference
*dr
)
1192 gimple
*stmt
= DR_STMT (dr
);
1193 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
1194 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
1196 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1198 /* For interleaved access we peel only if number of iterations in
1199 the prolog loop ({VF - misalignment}), is a multiple of the
1200 number of the interleaved accesses. */
1201 int elem_size
, mis_in_elements
;
1203 /* FORNOW: handle only known alignment. */
1204 if (!known_alignment_for_access_p (dr
))
1207 poly_uint64 nelements
= TYPE_VECTOR_SUBPARTS (vectype
);
1208 poly_uint64 vector_size
= GET_MODE_SIZE (TYPE_MODE (vectype
));
1209 elem_size
= vector_element_size (vector_size
, nelements
);
1210 mis_in_elements
= DR_MISALIGNMENT (dr
) / elem_size
;
1212 if (!multiple_p (nelements
- mis_in_elements
, DR_GROUP_SIZE (stmt_info
)))
1216 /* If misalignment is known at the compile time then allow peeling
1217 only if natural alignment is reachable through peeling. */
1218 if (known_alignment_for_access_p (dr
) && !aligned_access_p (dr
))
1220 HOST_WIDE_INT elmsize
=
1221 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype
)));
1222 if (dump_enabled_p ())
1224 dump_printf_loc (MSG_NOTE
, vect_location
,
1225 "data size =" HOST_WIDE_INT_PRINT_DEC
, elmsize
);
1226 dump_printf (MSG_NOTE
,
1227 ". misalignment = %d.\n", DR_MISALIGNMENT (dr
));
1229 if (DR_MISALIGNMENT (dr
) % elmsize
)
1231 if (dump_enabled_p ())
1232 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1233 "data size does not divide the misalignment.\n");
1238 if (!known_alignment_for_access_p (dr
))
1240 tree type
= TREE_TYPE (DR_REF (dr
));
1241 bool is_packed
= not_size_aligned (DR_REF (dr
));
1242 if (dump_enabled_p ())
1243 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1244 "Unknown misalignment, %snaturally aligned\n",
1245 is_packed
? "not " : "");
1246 return targetm
.vectorize
.vector_alignment_reachable (type
, is_packed
);
1253 /* Calculate the cost of the memory access represented by DR. */
1256 vect_get_data_access_cost (struct data_reference
*dr
,
1257 unsigned int *inside_cost
,
1258 unsigned int *outside_cost
,
1259 stmt_vector_for_cost
*body_cost_vec
,
1260 stmt_vector_for_cost
*prologue_cost_vec
)
1262 gimple
*stmt
= DR_STMT (dr
);
1263 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
1264 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
1267 if (PURE_SLP_STMT (stmt_info
))
1270 ncopies
= vect_get_num_copies (loop_vinfo
, STMT_VINFO_VECTYPE (stmt_info
));
1272 if (DR_IS_READ (dr
))
1273 vect_get_load_cost (dr
, ncopies
, true, inside_cost
, outside_cost
,
1274 prologue_cost_vec
, body_cost_vec
, false);
1276 vect_get_store_cost (dr
, ncopies
, inside_cost
, body_cost_vec
);
1278 if (dump_enabled_p ())
1279 dump_printf_loc (MSG_NOTE
, vect_location
,
1280 "vect_get_data_access_cost: inside_cost = %d, "
1281 "outside_cost = %d.\n", *inside_cost
, *outside_cost
);
/* One candidate peeling amount: the data reference DR it would align,
   the number of scalar iterations NPEEL to peel, and how many data
   references COUNT would become aligned by that peel.  */
typedef struct _vect_peel_info
{
  struct data_reference *dr;
  int npeel;
  unsigned int count;
} *vect_peel_info;
1292 typedef struct _vect_peel_extended_info
1294 struct _vect_peel_info peel_info
;
1295 unsigned int inside_cost
;
1296 unsigned int outside_cost
;
1297 } *vect_peel_extended_info
;
1300 /* Peeling hashtable helpers. */
1302 struct peel_info_hasher
: free_ptr_hash
<_vect_peel_info
>
1304 static inline hashval_t
hash (const _vect_peel_info
*);
1305 static inline bool equal (const _vect_peel_info
*, const _vect_peel_info
*);
1309 peel_info_hasher::hash (const _vect_peel_info
*peel_info
)
1311 return (hashval_t
) peel_info
->npeel
;
1315 peel_info_hasher::equal (const _vect_peel_info
*a
, const _vect_peel_info
*b
)
1317 return (a
->npeel
== b
->npeel
);
1321 /* Insert DR into peeling hash table with NPEEL as key. */
1324 vect_peeling_hash_insert (hash_table
<peel_info_hasher
> *peeling_htab
,
1325 loop_vec_info loop_vinfo
, struct data_reference
*dr
,
1328 struct _vect_peel_info elem
, *slot
;
1329 _vect_peel_info
**new_slot
;
1330 bool supportable_dr_alignment
= vect_supportable_dr_alignment (dr
, true);
1333 slot
= peeling_htab
->find (&elem
);
1338 slot
= XNEW (struct _vect_peel_info
);
1339 slot
->npeel
= npeel
;
1342 new_slot
= peeling_htab
->find_slot (slot
, INSERT
);
1346 if (!supportable_dr_alignment
1347 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo
)))
1348 slot
->count
+= VECT_MAX_COST
;
1352 /* Traverse peeling hash table to find peeling option that aligns maximum
1353 number of data accesses. */
1356 vect_peeling_hash_get_most_frequent (_vect_peel_info
**slot
,
1357 _vect_peel_extended_info
*max
)
1359 vect_peel_info elem
= *slot
;
1361 if (elem
->count
> max
->peel_info
.count
1362 || (elem
->count
== max
->peel_info
.count
1363 && max
->peel_info
.npeel
> elem
->npeel
))
1365 max
->peel_info
.npeel
= elem
->npeel
;
1366 max
->peel_info
.count
= elem
->count
;
1367 max
->peel_info
.dr
= elem
->dr
;
1373 /* Get the costs of peeling NPEEL iterations checking data access costs
1374 for all data refs. If UNKNOWN_MISALIGNMENT is true, we assume DR0's
1375 misalignment will be zero after peeling. */
1378 vect_get_peeling_costs_all_drs (vec
<data_reference_p
> datarefs
,
1379 struct data_reference
*dr0
,
1380 unsigned int *inside_cost
,
1381 unsigned int *outside_cost
,
1382 stmt_vector_for_cost
*body_cost_vec
,
1383 stmt_vector_for_cost
*prologue_cost_vec
,
1385 bool unknown_misalignment
)
1390 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
1392 gimple
*stmt
= DR_STMT (dr
);
1393 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
1394 if (!STMT_VINFO_RELEVANT_P (stmt_info
))
1397 /* For interleaving, only the alignment of the first access
1399 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1400 && DR_GROUP_FIRST_ELEMENT (stmt_info
) != stmt
)
1403 /* Strided accesses perform only component accesses, alignment is
1404 irrelevant for them. */
1405 if (STMT_VINFO_STRIDED_P (stmt_info
)
1406 && !STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1409 int save_misalignment
;
1410 save_misalignment
= DR_MISALIGNMENT (dr
);
1413 else if (unknown_misalignment
&& dr
== dr0
)
1414 SET_DR_MISALIGNMENT (dr
, 0);
1416 vect_update_misalignment_for_peel (dr
, dr0
, npeel
);
1417 vect_get_data_access_cost (dr
, inside_cost
, outside_cost
,
1418 body_cost_vec
, prologue_cost_vec
);
1419 SET_DR_MISALIGNMENT (dr
, save_misalignment
);
1423 /* Traverse peeling hash table and calculate cost for each peeling option.
1424 Find the one with the lowest cost. */
1427 vect_peeling_hash_get_lowest_cost (_vect_peel_info
**slot
,
1428 _vect_peel_extended_info
*min
)
1430 vect_peel_info elem
= *slot
;
1432 unsigned int inside_cost
= 0, outside_cost
= 0;
1433 gimple
*stmt
= DR_STMT (elem
->dr
);
1434 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
1435 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
1436 stmt_vector_for_cost prologue_cost_vec
, body_cost_vec
,
1439 prologue_cost_vec
.create (2);
1440 body_cost_vec
.create (2);
1441 epilogue_cost_vec
.create (2);
1443 vect_get_peeling_costs_all_drs (LOOP_VINFO_DATAREFS (loop_vinfo
),
1444 elem
->dr
, &inside_cost
, &outside_cost
,
1445 &body_cost_vec
, &prologue_cost_vec
,
1446 elem
->npeel
, false);
1448 body_cost_vec
.release ();
1450 outside_cost
+= vect_get_known_peeling_cost
1451 (loop_vinfo
, elem
->npeel
, &dummy
,
1452 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo
),
1453 &prologue_cost_vec
, &epilogue_cost_vec
);
1455 /* Prologue and epilogue costs are added to the target model later.
1456 These costs depend only on the scalar iteration cost, the
1457 number of peeling iterations finally chosen, and the number of
1458 misaligned statements. So discard the information found here. */
1459 prologue_cost_vec
.release ();
1460 epilogue_cost_vec
.release ();
1462 if (inside_cost
< min
->inside_cost
1463 || (inside_cost
== min
->inside_cost
1464 && outside_cost
< min
->outside_cost
))
1466 min
->inside_cost
= inside_cost
;
1467 min
->outside_cost
= outside_cost
;
1468 min
->peel_info
.dr
= elem
->dr
;
1469 min
->peel_info
.npeel
= elem
->npeel
;
1470 min
->peel_info
.count
= elem
->count
;
1477 /* Choose best peeling option by traversing peeling hash table and either
1478 choosing an option with the lowest cost (if cost model is enabled) or the
1479 option that aligns as many accesses as possible. */
1481 static struct _vect_peel_extended_info
1482 vect_peeling_hash_choose_best_peeling (hash_table
<peel_info_hasher
> *peeling_htab
,
1483 loop_vec_info loop_vinfo
)
1485 struct _vect_peel_extended_info res
;
1487 res
.peel_info
.dr
= NULL
;
1489 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo
)))
1491 res
.inside_cost
= INT_MAX
;
1492 res
.outside_cost
= INT_MAX
;
1493 peeling_htab
->traverse
<_vect_peel_extended_info
*,
1494 vect_peeling_hash_get_lowest_cost
> (&res
);
1498 res
.peel_info
.count
= 0;
1499 peeling_htab
->traverse
<_vect_peel_extended_info
*,
1500 vect_peeling_hash_get_most_frequent
> (&res
);
1501 res
.inside_cost
= 0;
1502 res
.outside_cost
= 0;
1508 /* Return true if the new peeling NPEEL is supported. */
1511 vect_peeling_supportable (loop_vec_info loop_vinfo
, struct data_reference
*dr0
,
1515 struct data_reference
*dr
= NULL
;
1516 vec
<data_reference_p
> datarefs
= LOOP_VINFO_DATAREFS (loop_vinfo
);
1518 stmt_vec_info stmt_info
;
1519 enum dr_alignment_support supportable_dr_alignment
;
1521 /* Ensure that all data refs can be vectorized after the peel. */
1522 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
1524 int save_misalignment
;
1529 stmt
= DR_STMT (dr
);
1530 stmt_info
= vinfo_for_stmt (stmt
);
1531 /* For interleaving, only the alignment of the first access
1533 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1534 && DR_GROUP_FIRST_ELEMENT (stmt_info
) != stmt
)
1537 /* Strided accesses perform only component accesses, alignment is
1538 irrelevant for them. */
1539 if (STMT_VINFO_STRIDED_P (stmt_info
)
1540 && !STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1543 save_misalignment
= DR_MISALIGNMENT (dr
);
1544 vect_update_misalignment_for_peel (dr
, dr0
, npeel
);
1545 supportable_dr_alignment
= vect_supportable_dr_alignment (dr
, false);
1546 SET_DR_MISALIGNMENT (dr
, save_misalignment
);
1548 if (!supportable_dr_alignment
)
1555 /* Function vect_enhance_data_refs_alignment
1557 This pass will use loop versioning and loop peeling in order to enhance
1558 the alignment of data references in the loop.
1560 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1561 original loop is to be vectorized. Any other loops that are created by
1562 the transformations performed in this pass - are not supposed to be
1563 vectorized. This restriction will be relaxed.
1565 This pass will require a cost model to guide it whether to apply peeling
1566 or versioning or a combination of the two. For example, the scheme that
1567 intel uses when given a loop with several memory accesses, is as follows:
1568 choose one memory access ('p') which alignment you want to force by doing
1569 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1570 other accesses are not necessarily aligned, or (2) use loop versioning to
1571 generate one loop in which all accesses are aligned, and another loop in
1572 which only 'p' is necessarily aligned.
1574 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1575 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
1576 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1578 Devising a cost model is the most critical aspect of this work. It will
1579 guide us on which access to peel for, whether to use loop versioning, how
1580 many versions to create, etc. The cost model will probably consist of
1581 generic considerations as well as target specific considerations (on
1582 powerpc for example, misaligned stores are more painful than misaligned
1585 Here are the general steps involved in alignment enhancements:
1587 -- original loop, before alignment analysis:
1588 for (i=0; i<N; i++){
1589 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1590 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1593 -- After vect_compute_data_refs_alignment:
1594 for (i=0; i<N; i++){
1595 x = q[i]; # DR_MISALIGNMENT(q) = 3
1596 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1599 -- Possibility 1: we do loop versioning:
1601 for (i=0; i<N; i++){ # loop 1A
1602 x = q[i]; # DR_MISALIGNMENT(q) = 3
1603 p[i] = y; # DR_MISALIGNMENT(p) = 0
1607 for (i=0; i<N; i++){ # loop 1B
1608 x = q[i]; # DR_MISALIGNMENT(q) = 3
1609 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1613 -- Possibility 2: we do loop peeling:
1614 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1618 for (i = 3; i < N; i++){ # loop 2A
1619 x = q[i]; # DR_MISALIGNMENT(q) = 0
1620 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1623 -- Possibility 3: combination of loop peeling and versioning:
1624 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1629 for (i = 3; i<N; i++){ # loop 3A
1630 x = q[i]; # DR_MISALIGNMENT(q) = 0
1631 p[i] = y; # DR_MISALIGNMENT(p) = 0
1635 for (i = 3; i<N; i++){ # loop 3B
1636 x = q[i]; # DR_MISALIGNMENT(q) = 0
1637 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1641 These loops are later passed to loop_transform to be vectorized. The
1642 vectorizer will use the alignment information to guide the transformation
1643 (whether to generate regular loads/stores, or with special handling for
1647 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo
)
1649 vec
<data_reference_p
> datarefs
= LOOP_VINFO_DATAREFS (loop_vinfo
);
1650 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1651 enum dr_alignment_support supportable_dr_alignment
;
1652 struct data_reference
*dr0
= NULL
, *first_store
= NULL
;
1653 struct data_reference
*dr
;
1655 bool do_peeling
= false;
1656 bool do_versioning
= false;
1659 stmt_vec_info stmt_info
;
1660 unsigned int npeel
= 0;
1661 bool one_misalignment_known
= false;
1662 bool one_misalignment_unknown
= false;
1663 bool one_dr_unsupportable
= false;
1664 struct data_reference
*unsupportable_dr
= NULL
;
1665 poly_uint64 vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
1666 unsigned possible_npeel_number
= 1;
1668 unsigned int mis
, same_align_drs_max
= 0;
1669 hash_table
<peel_info_hasher
> peeling_htab (1);
1671 if (dump_enabled_p ())
1672 dump_printf_loc (MSG_NOTE
, vect_location
,
1673 "=== vect_enhance_data_refs_alignment ===\n");
1675 /* Reset data so we can safely be called multiple times. */
1676 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).truncate (0);
1677 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
) = 0;
1679 /* While cost model enhancements are expected in the future, the high level
1680 view of the code at this time is as follows:
1682 A) If there is a misaligned access then see if peeling to align
1683 this access can make all data references satisfy
1684 vect_supportable_dr_alignment. If so, update data structures
1685 as needed and return true.
1687 B) If peeling wasn't possible and there is a data reference with an
1688 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1689 then see if loop versioning checks can be used to make all data
1690 references satisfy vect_supportable_dr_alignment. If so, update
1691 data structures as needed and return true.
1693 C) If neither peeling nor versioning were successful then return false if
1694 any data reference does not satisfy vect_supportable_dr_alignment.
1696 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1698 Note, Possibility 3 above (which is peeling and versioning together) is not
1699 being done at this time. */
1701 /* (1) Peeling to force alignment. */
1703 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1705 + How many accesses will become aligned due to the peeling
1706 - How many accesses will become unaligned due to the peeling,
1707 and the cost of misaligned accesses.
1708 - The cost of peeling (the extra runtime checks, the increase
1711 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
1713 stmt
= DR_STMT (dr
);
1714 stmt_info
= vinfo_for_stmt (stmt
);
1716 if (!STMT_VINFO_RELEVANT_P (stmt_info
))
1719 /* For interleaving, only the alignment of the first access
1721 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1722 && DR_GROUP_FIRST_ELEMENT (stmt_info
) != stmt
)
1725 /* For invariant accesses there is nothing to enhance. */
1726 if (integer_zerop (DR_STEP (dr
)))
1729 /* Strided accesses perform only component accesses, alignment is
1730 irrelevant for them. */
1731 if (STMT_VINFO_STRIDED_P (stmt_info
)
1732 && !STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1735 supportable_dr_alignment
= vect_supportable_dr_alignment (dr
, true);
1736 do_peeling
= vector_alignment_reachable_p (dr
);
1739 if (known_alignment_for_access_p (dr
))
1741 unsigned int npeel_tmp
= 0;
1742 bool negative
= tree_int_cst_compare (DR_STEP (dr
),
1743 size_zero_node
) < 0;
1745 vectype
= STMT_VINFO_VECTYPE (stmt_info
);
1746 unsigned int target_align
= DR_TARGET_ALIGNMENT (dr
);
1747 unsigned int dr_size
= vect_get_scalar_dr_size (dr
);
1748 mis
= (negative
? DR_MISALIGNMENT (dr
) : -DR_MISALIGNMENT (dr
));
1749 if (DR_MISALIGNMENT (dr
) != 0)
1750 npeel_tmp
= (mis
& (target_align
- 1)) / dr_size
;
1752 /* For multiple types, it is possible that the bigger type access
1753 will have more than one peeling option. E.g., a loop with two
1754 types: one of size (vector size / 4), and the other one of
1755 size (vector size / 8). Vectorization factor will 8. If both
1756 accesses are misaligned by 3, the first one needs one scalar
1757 iteration to be aligned, and the second one needs 5. But the
1758 first one will be aligned also by peeling 5 scalar
1759 iterations, and in that case both accesses will be aligned.
1760 Hence, except for the immediate peeling amount, we also want
1761 to try to add full vector size, while we don't exceed
1762 vectorization factor.
1763 We do this automatically for cost model, since we calculate
1764 cost for every peeling option. */
1765 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo
)))
1767 poly_uint64 nscalars
= (STMT_SLP_TYPE (stmt_info
)
1768 ? vf
* DR_GROUP_SIZE (stmt_info
) : vf
);
1769 possible_npeel_number
1770 = vect_get_num_vectors (nscalars
, vectype
);
1772 /* NPEEL_TMP is 0 when there is no misalignment, but also
1773 allow peeling NELEMENTS. */
1774 if (DR_MISALIGNMENT (dr
) == 0)
1775 possible_npeel_number
++;
1778 /* Save info about DR in the hash table. Also include peeling
1779 amounts according to the explanation above. */
1780 for (j
= 0; j
< possible_npeel_number
; j
++)
1782 vect_peeling_hash_insert (&peeling_htab
, loop_vinfo
,
1784 npeel_tmp
+= target_align
/ dr_size
;
1787 one_misalignment_known
= true;
1791 /* If we don't know any misalignment values, we prefer
1792 peeling for data-ref that has the maximum number of data-refs
1793 with the same alignment, unless the target prefers to align
1794 stores over load. */
1795 unsigned same_align_drs
1796 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info
).length ();
1798 || same_align_drs_max
< same_align_drs
)
1800 same_align_drs_max
= same_align_drs
;
1803 /* For data-refs with the same number of related
1804 accesses prefer the one where the misalign
1805 computation will be invariant in the outermost loop. */
1806 else if (same_align_drs_max
== same_align_drs
)
1808 struct loop
*ivloop0
, *ivloop
;
1809 ivloop0
= outermost_invariant_loop_for_expr
1810 (loop
, DR_BASE_ADDRESS (dr0
));
1811 ivloop
= outermost_invariant_loop_for_expr
1812 (loop
, DR_BASE_ADDRESS (dr
));
1813 if ((ivloop
&& !ivloop0
)
1814 || (ivloop
&& ivloop0
1815 && flow_loop_nested_p (ivloop
, ivloop0
)))
1819 one_misalignment_unknown
= true;
1821 /* Check for data refs with unsupportable alignment that
1823 if (!supportable_dr_alignment
)
1825 one_dr_unsupportable
= true;
1826 unsupportable_dr
= dr
;
1829 if (!first_store
&& DR_IS_WRITE (dr
))
1835 if (!aligned_access_p (dr
))
1837 if (dump_enabled_p ())
1838 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1839 "vector alignment may not be reachable\n");
1845 /* Check if we can possibly peel the loop. */
1846 if (!vect_can_advance_ivs_p (loop_vinfo
)
1847 || !slpeel_can_duplicate_loop_p (loop
, single_exit (loop
))
1851 struct _vect_peel_extended_info peel_for_known_alignment
;
1852 struct _vect_peel_extended_info peel_for_unknown_alignment
;
1853 struct _vect_peel_extended_info best_peel
;
1855 peel_for_unknown_alignment
.inside_cost
= INT_MAX
;
1856 peel_for_unknown_alignment
.outside_cost
= INT_MAX
;
1857 peel_for_unknown_alignment
.peel_info
.count
= 0;
1860 && one_misalignment_unknown
)
1862 /* Check if the target requires to prefer stores over loads, i.e., if
1863 misaligned stores are more expensive than misaligned loads (taking
1864 drs with same alignment into account). */
1865 unsigned int load_inside_cost
= 0;
1866 unsigned int load_outside_cost
= 0;
1867 unsigned int store_inside_cost
= 0;
1868 unsigned int store_outside_cost
= 0;
1869 unsigned int estimated_npeels
= vect_vf_for_cost (loop_vinfo
) / 2;
1871 stmt_vector_for_cost dummy
;
1873 vect_get_peeling_costs_all_drs (datarefs
, dr0
,
1876 &dummy
, &dummy
, estimated_npeels
, true);
1882 vect_get_peeling_costs_all_drs (datarefs
, first_store
,
1884 &store_outside_cost
,
1886 estimated_npeels
, true);
1891 store_inside_cost
= INT_MAX
;
1892 store_outside_cost
= INT_MAX
;
1895 if (load_inside_cost
> store_inside_cost
1896 || (load_inside_cost
== store_inside_cost
1897 && load_outside_cost
> store_outside_cost
))
1900 peel_for_unknown_alignment
.inside_cost
= store_inside_cost
;
1901 peel_for_unknown_alignment
.outside_cost
= store_outside_cost
;
1905 peel_for_unknown_alignment
.inside_cost
= load_inside_cost
;
1906 peel_for_unknown_alignment
.outside_cost
= load_outside_cost
;
1909 stmt_vector_for_cost prologue_cost_vec
, epilogue_cost_vec
;
1910 prologue_cost_vec
.create (2);
1911 epilogue_cost_vec
.create (2);
1914 peel_for_unknown_alignment
.outside_cost
+= vect_get_known_peeling_cost
1915 (loop_vinfo
, estimated_npeels
, &dummy2
,
1916 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo
),
1917 &prologue_cost_vec
, &epilogue_cost_vec
);
1919 prologue_cost_vec
.release ();
1920 epilogue_cost_vec
.release ();
1922 peel_for_unknown_alignment
.peel_info
.count
= 1
1923 + STMT_VINFO_SAME_ALIGN_REFS
1924 (vinfo_for_stmt (DR_STMT (dr0
))).length ();
1927 peel_for_unknown_alignment
.peel_info
.npeel
= 0;
1928 peel_for_unknown_alignment
.peel_info
.dr
= dr0
;
1930 best_peel
= peel_for_unknown_alignment
;
1932 peel_for_known_alignment
.inside_cost
= INT_MAX
;
1933 peel_for_known_alignment
.outside_cost
= INT_MAX
;
1934 peel_for_known_alignment
.peel_info
.count
= 0;
1935 peel_for_known_alignment
.peel_info
.dr
= NULL
;
1937 if (do_peeling
&& one_misalignment_known
)
1939 /* Peeling is possible, but there is no data access that is not supported
1940 unless aligned. So we try to choose the best possible peeling from
1942 peel_for_known_alignment
= vect_peeling_hash_choose_best_peeling
1943 (&peeling_htab
, loop_vinfo
);
1946 /* Compare costs of peeling for known and unknown alignment. */
1947 if (peel_for_known_alignment
.peel_info
.dr
!= NULL
1948 && peel_for_unknown_alignment
.inside_cost
1949 >= peel_for_known_alignment
.inside_cost
)
1951 best_peel
= peel_for_known_alignment
;
1953 /* If the best peeling for known alignment has NPEEL == 0, perform no
1954 peeling at all except if there is an unsupportable dr that we can
1956 if (best_peel
.peel_info
.npeel
== 0 && !one_dr_unsupportable
)
1960 /* If there is an unsupportable data ref, prefer this over all choices so far
1961 since we'd have to discard a chosen peeling except when it accidentally
1962 aligned the unsupportable data ref. */
1963 if (one_dr_unsupportable
)
1964 dr0
= unsupportable_dr
;
1965 else if (do_peeling
)
1967 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
1968 TODO: Use nopeel_outside_cost or get rid of it? */
1969 unsigned nopeel_inside_cost
= 0;
1970 unsigned nopeel_outside_cost
= 0;
1972 stmt_vector_for_cost dummy
;
1974 vect_get_peeling_costs_all_drs (datarefs
, NULL
, &nopeel_inside_cost
,
1975 &nopeel_outside_cost
, &dummy
, &dummy
,
1979 /* Add epilogue costs. As we do not peel for alignment here, no prologue
1980 costs will be recorded. */
1981 stmt_vector_for_cost prologue_cost_vec
, epilogue_cost_vec
;
1982 prologue_cost_vec
.create (2);
1983 epilogue_cost_vec
.create (2);
1986 nopeel_outside_cost
+= vect_get_known_peeling_cost
1987 (loop_vinfo
, 0, &dummy2
,
1988 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo
),
1989 &prologue_cost_vec
, &epilogue_cost_vec
);
1991 prologue_cost_vec
.release ();
1992 epilogue_cost_vec
.release ();
1994 npeel
= best_peel
.peel_info
.npeel
;
1995 dr0
= best_peel
.peel_info
.dr
;
1997 /* If no peeling is not more expensive than the best peeling we
1998 have so far, don't perform any peeling. */
1999 if (nopeel_inside_cost
<= best_peel
.inside_cost
)
2005 stmt
= DR_STMT (dr0
);
2006 stmt_info
= vinfo_for_stmt (stmt
);
2007 vectype
= STMT_VINFO_VECTYPE (stmt_info
);
2009 if (known_alignment_for_access_p (dr0
))
2011 bool negative
= tree_int_cst_compare (DR_STEP (dr0
),
2012 size_zero_node
) < 0;
2015 /* Since it's known at compile time, compute the number of
2016 iterations in the peeled loop (the peeling factor) for use in
2017 updating DR_MISALIGNMENT values. The peeling factor is the
2018 vectorization factor minus the misalignment as an element
2020 mis
= negative
? DR_MISALIGNMENT (dr0
) : -DR_MISALIGNMENT (dr0
);
2021 unsigned int target_align
= DR_TARGET_ALIGNMENT (dr0
);
2022 npeel
= ((mis
& (target_align
- 1))
2023 / vect_get_scalar_dr_size (dr0
));
2026 /* For interleaved data access every iteration accesses all the
2027 members of the group, therefore we divide the number of iterations
2028 by the group size. */
2029 stmt_info
= vinfo_for_stmt (DR_STMT (dr0
));
2030 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
2031 npeel
/= DR_GROUP_SIZE (stmt_info
);
2033 if (dump_enabled_p ())
2034 dump_printf_loc (MSG_NOTE
, vect_location
,
2035 "Try peeling by %d\n", npeel
);
2038 /* Ensure that all datarefs can be vectorized after the peel. */
2039 if (!vect_peeling_supportable (loop_vinfo
, dr0
, npeel
))
2042 /* Check if all datarefs are supportable and log. */
2043 if (do_peeling
&& known_alignment_for_access_p (dr0
) && npeel
== 0)
2045 stat
= vect_verify_datarefs_alignment (loop_vinfo
);
2052 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
2055 unsigned max_allowed_peel
2056 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT
);
2057 if (max_allowed_peel
!= (unsigned)-1)
2059 unsigned max_peel
= npeel
;
2062 unsigned int target_align
= DR_TARGET_ALIGNMENT (dr0
);
2063 max_peel
= target_align
/ vect_get_scalar_dr_size (dr0
) - 1;
2065 if (max_peel
> max_allowed_peel
)
2068 if (dump_enabled_p ())
2069 dump_printf_loc (MSG_NOTE
, vect_location
,
2070 "Disable peeling, max peels reached: %d\n", max_peel
);
2075 /* Cost model #2 - if peeling may result in a remaining loop not
2076 iterating enough to be vectorized then do not peel. Since this
2077 is a cost heuristic rather than a correctness decision, use the
2078 most likely runtime value for variable vectorization factors. */
2080 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
))
2082 unsigned int assumed_vf
= vect_vf_for_cost (loop_vinfo
);
2083 unsigned int max_peel
= npeel
== 0 ? assumed_vf
- 1 : npeel
;
2084 if ((unsigned HOST_WIDE_INT
) LOOP_VINFO_INT_NITERS (loop_vinfo
)
2085 < assumed_vf
+ max_peel
)
2091 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2092 If the misalignment of DR_i is identical to that of dr0 then set
2093 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2094 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2095 by the peeling factor times the element size of DR_i (MOD the
2096 vectorization factor times the size). Otherwise, the
2097 misalignment of DR_i must be set to unknown. */
2098 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
2101 /* Strided accesses perform only component accesses, alignment
2102 is irrelevant for them. */
2103 stmt_info
= vinfo_for_stmt (DR_STMT (dr
));
2104 if (STMT_VINFO_STRIDED_P (stmt_info
)
2105 && !STMT_VINFO_GROUPED_ACCESS (stmt_info
))
2108 vect_update_misalignment_for_peel (dr
, dr0
, npeel
);
2111 LOOP_VINFO_UNALIGNED_DR (loop_vinfo
) = dr0
;
2113 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
) = npeel
;
2115 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
)
2116 = DR_MISALIGNMENT (dr0
);
2117 SET_DR_MISALIGNMENT (dr0
, 0);
2118 if (dump_enabled_p ())
2120 dump_printf_loc (MSG_NOTE
, vect_location
,
2121 "Alignment of access forced using peeling.\n");
2122 dump_printf_loc (MSG_NOTE
, vect_location
,
2123 "Peeling for alignment will be applied.\n");
2126 /* The inside-loop cost will be accounted for in vectorizable_load
2127 and vectorizable_store correctly with adjusted alignments.
2128 Drop the body_cst_vec on the floor here. */
2129 stat
= vect_verify_datarefs_alignment (loop_vinfo
);
2135 /* (2) Versioning to force alignment. */
2137 /* Try versioning if:
2138 1) optimize loop for speed
2139 2) there is at least one unsupported misaligned data ref with an unknown
2141 3) all misaligned data refs with a known misalignment are supported, and
2142 4) the number of runtime alignment checks is within reason. */
2145 optimize_loop_nest_for_speed_p (loop
)
2146 && (!loop
->inner
); /* FORNOW */
2150 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
2152 stmt
= DR_STMT (dr
);
2153 stmt_info
= vinfo_for_stmt (stmt
);
2155 /* For interleaving, only the alignment of the first access
2157 if (aligned_access_p (dr
)
2158 || (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
2159 && DR_GROUP_FIRST_ELEMENT (stmt_info
) != stmt
))
2162 if (STMT_VINFO_STRIDED_P (stmt_info
))
2164 /* Strided loads perform only component accesses, alignment is
2165 irrelevant for them. */
2166 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info
))
2168 do_versioning
= false;
2172 supportable_dr_alignment
= vect_supportable_dr_alignment (dr
, false);
2174 if (!supportable_dr_alignment
)
2180 if (known_alignment_for_access_p (dr
)
2181 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).length ()
2182 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS
))
2184 do_versioning
= false;
2188 stmt
= DR_STMT (dr
);
2189 vectype
= STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt
));
2190 gcc_assert (vectype
);
2192 /* At present we don't support versioning for alignment
2193 with variable VF, since there's no guarantee that the
2194 VF is a power of two. We could relax this if we added
2195 a way of enforcing a power-of-two size. */
2196 unsigned HOST_WIDE_INT size
;
2197 if (!GET_MODE_SIZE (TYPE_MODE (vectype
)).is_constant (&size
))
2199 do_versioning
= false;
2203 /* The rightmost bits of an aligned address must be zeros.
2204 Construct the mask needed for this test. For example,
2205 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2206 mask must be 15 = 0xf. */
2209 /* FORNOW: use the same mask to test all potentially unaligned
2210 references in the loop. The vectorizer currently supports
2211 a single vector size, see the reference to
2212 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2213 vectorization factor is computed. */
2214 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo
)
2215 || LOOP_VINFO_PTR_MASK (loop_vinfo
) == mask
);
2216 LOOP_VINFO_PTR_MASK (loop_vinfo
) = mask
;
2217 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).safe_push (
2222 /* Versioning requires at least one misaligned data reference. */
2223 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo
))
2224 do_versioning
= false;
2225 else if (!do_versioning
)
2226 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).truncate (0);
2231 vec
<gimple
*> may_misalign_stmts
2232 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
);
2235 /* It can now be assumed that the data references in the statements
2236 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2237 of the loop being vectorized. */
2238 FOR_EACH_VEC_ELT (may_misalign_stmts
, i
, stmt
)
2240 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
2241 dr
= STMT_VINFO_DATA_REF (stmt_info
);
2242 SET_DR_MISALIGNMENT (dr
, 0);
2243 if (dump_enabled_p ())
2244 dump_printf_loc (MSG_NOTE
, vect_location
,
2245 "Alignment of access forced using versioning.\n");
2248 if (dump_enabled_p ())
2249 dump_printf_loc (MSG_NOTE
, vect_location
,
2250 "Versioning for alignment will be applied.\n");
2252 /* Peeling and versioning can't be done together at this time. */
2253 gcc_assert (! (do_peeling
&& do_versioning
));
2255 stat
= vect_verify_datarefs_alignment (loop_vinfo
);
2260 /* This point is reached if neither peeling nor versioning is being done. */
2261 gcc_assert (! (do_peeling
|| do_versioning
));
2263 stat
= vect_verify_datarefs_alignment (loop_vinfo
);
2268 /* Function vect_find_same_alignment_drs.
2270 Update group and alignment relations according to the chosen
2271 vectorization factor. */
2274 vect_find_same_alignment_drs (struct data_dependence_relation
*ddr
)
2276 struct data_reference
*dra
= DDR_A (ddr
);
2277 struct data_reference
*drb
= DDR_B (ddr
);
2278 stmt_vec_info stmtinfo_a
= vinfo_for_stmt (DR_STMT (dra
));
2279 stmt_vec_info stmtinfo_b
= vinfo_for_stmt (DR_STMT (drb
));
2281 if (DDR_ARE_DEPENDENT (ddr
) == chrec_known
)
2287 if (!operand_equal_p (DR_BASE_ADDRESS (dra
), DR_BASE_ADDRESS (drb
), 0)
2288 || !operand_equal_p (DR_OFFSET (dra
), DR_OFFSET (drb
), 0)
2289 || !operand_equal_p (DR_STEP (dra
), DR_STEP (drb
), 0))
2292 /* Two references with distance zero have the same alignment. */
2293 poly_offset_int diff
= (wi::to_poly_offset (DR_INIT (dra
))
2294 - wi::to_poly_offset (DR_INIT (drb
)));
2295 if (maybe_ne (diff
, 0))
2297 /* Get the wider of the two alignments. */
2298 unsigned int align_a
= (vect_calculate_target_alignment (dra
)
2300 unsigned int align_b
= (vect_calculate_target_alignment (drb
)
2302 unsigned int max_align
= MAX (align_a
, align_b
);
2304 /* Require the gap to be a multiple of the larger vector alignment. */
2305 if (!multiple_p (diff
, max_align
))
2309 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a
).safe_push (drb
);
2310 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b
).safe_push (dra
);
2311 if (dump_enabled_p ())
2313 dump_printf_loc (MSG_NOTE
, vect_location
,
2314 "accesses have the same alignment: ");
2315 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dra
));
2316 dump_printf (MSG_NOTE
, " and ");
2317 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (drb
));
2318 dump_printf (MSG_NOTE
, "\n");
2323 /* Function vect_analyze_data_refs_alignment
2325 Analyze the alignment of the data-references in the loop.
2326 Return FALSE if a data reference is found that cannot be vectorized. */
2329 vect_analyze_data_refs_alignment (loop_vec_info vinfo
)
2331 if (dump_enabled_p ())
2332 dump_printf_loc (MSG_NOTE
, vect_location
,
2333 "=== vect_analyze_data_refs_alignment ===\n");
2335 /* Mark groups of data references with same alignment using
2336 data dependence information. */
2337 vec
<ddr_p
> ddrs
= vinfo
->ddrs
;
2338 struct data_dependence_relation
*ddr
;
2341 FOR_EACH_VEC_ELT (ddrs
, i
, ddr
)
2342 vect_find_same_alignment_drs (ddr
);
2344 vec
<data_reference_p
> datarefs
= vinfo
->datarefs
;
2345 struct data_reference
*dr
;
2347 vect_record_base_alignments (vinfo
);
2348 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
2350 stmt_vec_info stmt_info
= vinfo_for_stmt (DR_STMT (dr
));
2351 if (STMT_VINFO_VECTORIZABLE (stmt_info
)
2352 && !vect_compute_data_ref_alignment (dr
))
2354 /* Strided accesses perform only component accesses, misalignment
2355 information is irrelevant for them. */
2356 if (STMT_VINFO_STRIDED_P (stmt_info
)
2357 && !STMT_VINFO_GROUPED_ACCESS (stmt_info
))
2360 if (dump_enabled_p ())
2361 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2362 "not vectorized: can't calculate alignment "
2373 /* Analyze alignment of DRs of stmts in NODE. */
2376 vect_slp_analyze_and_verify_node_alignment (slp_tree node
)
2378 /* We vectorize from the first scalar stmt in the node unless
2379 the node is permuted in which case we start from the first
2380 element in the group. */
2381 gimple
*first_stmt
= SLP_TREE_SCALAR_STMTS (node
)[0];
2382 data_reference_p first_dr
= STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt
));
2383 if (SLP_TREE_LOAD_PERMUTATION (node
).exists ())
2384 first_stmt
= DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt
));
2386 data_reference_p dr
= STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt
));
2387 if (! vect_compute_data_ref_alignment (dr
)
2388 /* For creating the data-ref pointer we need alignment of the
2389 first element anyway. */
2391 && ! vect_compute_data_ref_alignment (first_dr
))
2392 || ! verify_data_ref_alignment (dr
))
2394 if (dump_enabled_p ())
2395 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2396 "not vectorized: bad data alignment in basic "
2404 /* Function vect_slp_analyze_instance_alignment
2406 Analyze the alignment of the data-references in the SLP instance.
2407 Return FALSE if a data reference is found that cannot be vectorized. */
2410 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance
)
2412 if (dump_enabled_p ())
2413 dump_printf_loc (MSG_NOTE
, vect_location
,
2414 "=== vect_slp_analyze_and_verify_instance_alignment ===\n");
2418 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance
), i
, node
)
2419 if (! vect_slp_analyze_and_verify_node_alignment (node
))
2422 node
= SLP_INSTANCE_TREE (instance
);
2423 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node
)[0]))
2424 && ! vect_slp_analyze_and_verify_node_alignment
2425 (SLP_INSTANCE_TREE (instance
)))
2432 /* Analyze groups of accesses: check that DR belongs to a group of
2433 accesses of legal size, step, etc. Detect gaps, single element
2434 interleaving, and other special cases. Set grouped access info.
2435 Collect groups of strided stores for further use in SLP analysis.
2436 Worker for vect_analyze_group_access. */
2439 vect_analyze_group_access_1 (struct data_reference
*dr
)
2441 tree step
= DR_STEP (dr
);
2442 tree scalar_type
= TREE_TYPE (DR_REF (dr
));
2443 HOST_WIDE_INT type_size
= TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type
));
2444 gimple
*stmt
= DR_STMT (dr
);
2445 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
2446 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
2447 bb_vec_info bb_vinfo
= STMT_VINFO_BB_VINFO (stmt_info
);
2448 HOST_WIDE_INT dr_step
= -1;
2449 HOST_WIDE_INT groupsize
, last_accessed_element
= 1;
2450 bool slp_impossible
= false;
2452 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2453 size of the interleaving group (including gaps). */
2454 if (tree_fits_shwi_p (step
))
2456 dr_step
= tree_to_shwi (step
);
2457 /* Check that STEP is a multiple of type size. Otherwise there is
2458 a non-element-sized gap at the end of the group which we
2459 cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2460 ??? As we can handle non-constant step fine here we should
2461 simply remove uses of DR_GROUP_GAP between the last and first
2462 element and instead rely on DR_STEP. DR_GROUP_SIZE then would
2463 simply not include that gap. */
2464 if ((dr_step
% type_size
) != 0)
2466 if (dump_enabled_p ())
2468 dump_printf_loc (MSG_NOTE
, vect_location
,
2470 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, step
);
2471 dump_printf (MSG_NOTE
,
2472 " is not a multiple of the element size for ");
2473 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dr
));
2474 dump_printf (MSG_NOTE
, "\n");
2478 groupsize
= absu_hwi (dr_step
) / type_size
;
2483 /* Not consecutive access is possible only if it is a part of interleaving. */
2484 if (!DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)))
2486 /* Check if it this DR is a part of interleaving, and is a single
2487 element of the group that is accessed in the loop. */
2489 /* Gaps are supported only for loads. STEP must be a multiple of the type
2492 && (dr_step
% type_size
) == 0
2495 DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) = stmt
;
2496 DR_GROUP_SIZE (vinfo_for_stmt (stmt
)) = groupsize
;
2497 DR_GROUP_GAP (stmt_info
) = groupsize
- 1;
2498 if (dump_enabled_p ())
2500 dump_printf_loc (MSG_NOTE
, vect_location
,
2501 "Detected single element interleaving ");
2502 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dr
));
2503 dump_printf (MSG_NOTE
, " step ");
2504 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, step
);
2505 dump_printf (MSG_NOTE
, "\n");
2511 if (dump_enabled_p ())
2513 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2514 "not consecutive access ");
2515 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
2520 /* Mark the statement as unvectorizable. */
2521 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr
))) = false;
2525 dump_printf_loc (MSG_NOTE
, vect_location
, "using strided accesses\n");
2526 STMT_VINFO_STRIDED_P (stmt_info
) = true;
2530 if (DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) == stmt
)
2532 /* First stmt in the interleaving chain. Check the chain. */
2533 gimple
*next
= DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt
));
2534 struct data_reference
*data_ref
= dr
;
2535 unsigned int count
= 1;
2536 tree prev_init
= DR_INIT (data_ref
);
2537 gimple
*prev
= stmt
;
2538 HOST_WIDE_INT diff
, gaps
= 0;
2540 /* By construction, all group members have INTEGER_CST DR_INITs. */
2543 /* Skip same data-refs. In case that two or more stmts share
2544 data-ref (supported only for loads), we vectorize only the first
2545 stmt, and the rest get their vectorized loads from the first
2547 if (!tree_int_cst_compare (DR_INIT (data_ref
),
2548 DR_INIT (STMT_VINFO_DATA_REF (
2549 vinfo_for_stmt (next
)))))
2551 if (DR_IS_WRITE (data_ref
))
2553 if (dump_enabled_p ())
2554 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2555 "Two store stmts share the same dr.\n");
2559 if (dump_enabled_p ())
2560 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2561 "Two or more load stmts share the same dr.\n");
2563 /* For load use the same data-ref load. */
2564 DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next
)) = prev
;
2567 next
= DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next
));
2572 data_ref
= STMT_VINFO_DATA_REF (vinfo_for_stmt (next
));
2574 /* All group members have the same STEP by construction. */
2575 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref
), step
, 0));
2577 /* Check that the distance between two accesses is equal to the type
2578 size. Otherwise, we have gaps. */
2579 diff
= (TREE_INT_CST_LOW (DR_INIT (data_ref
))
2580 - TREE_INT_CST_LOW (prev_init
)) / type_size
;
2583 /* FORNOW: SLP of accesses with gaps is not supported. */
2584 slp_impossible
= true;
2585 if (DR_IS_WRITE (data_ref
))
2587 if (dump_enabled_p ())
2588 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2589 "interleaved store with gaps\n");
2596 last_accessed_element
+= diff
;
2598 /* Store the gap from the previous member of the group. If there is no
2599 gap in the access, DR_GROUP_GAP is always 1. */
2600 DR_GROUP_GAP (vinfo_for_stmt (next
)) = diff
;
2602 prev_init
= DR_INIT (data_ref
);
2603 next
= DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next
));
2604 /* Count the number of data-refs in the chain. */
2609 groupsize
= count
+ gaps
;
2611 /* This could be UINT_MAX but as we are generating code in a very
2612 inefficient way we have to cap earlier. See PR78699 for example. */
2613 if (groupsize
> 4096)
2615 if (dump_enabled_p ())
2616 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2617 "group is too large\n");
2621 /* Check that the size of the interleaving is equal to count for stores,
2622 i.e., that there are no gaps. */
2623 if (groupsize
!= count
2624 && !DR_IS_READ (dr
))
2626 if (dump_enabled_p ())
2627 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2628 "interleaved store with gaps\n");
2632 /* If there is a gap after the last load in the group it is the
2633 difference between the groupsize and the last accessed
2635 When there is no gap, this difference should be 0. */
2636 DR_GROUP_GAP (vinfo_for_stmt (stmt
)) = groupsize
- last_accessed_element
;
2638 DR_GROUP_SIZE (vinfo_for_stmt (stmt
)) = groupsize
;
2639 if (dump_enabled_p ())
2641 dump_printf_loc (MSG_NOTE
, vect_location
,
2642 "Detected interleaving ");
2643 if (DR_IS_READ (dr
))
2644 dump_printf (MSG_NOTE
, "load ");
2646 dump_printf (MSG_NOTE
, "store ");
2647 dump_printf (MSG_NOTE
, "of size %u starting with ",
2648 (unsigned)groupsize
);
2649 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, stmt
, 0);
2650 if (DR_GROUP_GAP (vinfo_for_stmt (stmt
)) != 0)
2651 dump_printf_loc (MSG_NOTE
, vect_location
,
2652 "There is a gap of %u elements after the group\n",
2653 DR_GROUP_GAP (vinfo_for_stmt (stmt
)));
2656 /* SLP: create an SLP data structure for every interleaving group of
2657 stores for further analysis in vect_analyse_slp. */
2658 if (DR_IS_WRITE (dr
) && !slp_impossible
)
2661 LOOP_VINFO_GROUPED_STORES (loop_vinfo
).safe_push (stmt
);
2663 BB_VINFO_GROUPED_STORES (bb_vinfo
).safe_push (stmt
);
2670 /* Analyze groups of accesses: check that DR belongs to a group of
2671 accesses of legal size, step, etc. Detect gaps, single element
2672 interleaving, and other special cases. Set grouped access info.
2673 Collect groups of strided stores for further use in SLP analysis. */
2676 vect_analyze_group_access (struct data_reference
*dr
)
2678 if (!vect_analyze_group_access_1 (dr
))
2680 /* Dissolve the group if present. */
2682 gimple
*stmt
= DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr
)));
2685 stmt_vec_info vinfo
= vinfo_for_stmt (stmt
);
2686 next
= DR_GROUP_NEXT_ELEMENT (vinfo
);
2687 DR_GROUP_FIRST_ELEMENT (vinfo
) = NULL
;
2688 DR_GROUP_NEXT_ELEMENT (vinfo
) = NULL
;
2696 /* Analyze the access pattern of the data-reference DR.
2697 In case of non-consecutive accesses call vect_analyze_group_access() to
2698 analyze groups of accesses. */
2701 vect_analyze_data_ref_access (struct data_reference
*dr
)
2703 tree step
= DR_STEP (dr
);
2704 tree scalar_type
= TREE_TYPE (DR_REF (dr
));
2705 gimple
*stmt
= DR_STMT (dr
);
2706 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
2707 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
2708 struct loop
*loop
= NULL
;
2710 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info
))
2714 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
2716 if (loop_vinfo
&& !step
)
2718 if (dump_enabled_p ())
2719 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2720 "bad data-ref access in loop\n");
2724 /* Allow loads with zero step in inner-loop vectorization. */
2725 if (loop_vinfo
&& integer_zerop (step
))
2727 DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) = NULL
;
2728 if (!nested_in_vect_loop_p (loop
, stmt
))
2729 return DR_IS_READ (dr
);
2730 /* Allow references with zero step for outer loops marked
2731 with pragma omp simd only - it guarantees absence of
2732 loop-carried dependencies between inner loop iterations. */
2733 if (loop
->safelen
< 2)
2735 if (dump_enabled_p ())
2736 dump_printf_loc (MSG_NOTE
, vect_location
,
2737 "zero step in inner loop of nest\n");
2742 if (loop
&& nested_in_vect_loop_p (loop
, stmt
))
2744 /* Interleaved accesses are not yet supported within outer-loop
2745 vectorization for references in the inner-loop. */
2746 DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) = NULL
;
2748 /* For the rest of the analysis we use the outer-loop step. */
2749 step
= STMT_VINFO_DR_STEP (stmt_info
);
2750 if (integer_zerop (step
))
2752 if (dump_enabled_p ())
2753 dump_printf_loc (MSG_NOTE
, vect_location
,
2754 "zero step in outer loop.\n");
2755 return DR_IS_READ (dr
);
2760 if (TREE_CODE (step
) == INTEGER_CST
)
2762 HOST_WIDE_INT dr_step
= TREE_INT_CST_LOW (step
);
2763 if (!tree_int_cst_compare (step
, TYPE_SIZE_UNIT (scalar_type
))
2765 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type
), -dr_step
)))
2767 /* Mark that it is not interleaving. */
2768 DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) = NULL
;
2773 if (loop
&& nested_in_vect_loop_p (loop
, stmt
))
2775 if (dump_enabled_p ())
2776 dump_printf_loc (MSG_NOTE
, vect_location
,
2777 "grouped access in outer loop.\n");
2782 /* Assume this is a DR handled by non-constant strided load case. */
2783 if (TREE_CODE (step
) != INTEGER_CST
)
2784 return (STMT_VINFO_STRIDED_P (stmt_info
)
2785 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info
)
2786 || vect_analyze_group_access (dr
)));
2788 /* Not consecutive access - check if it's a part of interleaving group. */
2789 return vect_analyze_group_access (dr
);
2792 /* Compare two data-references DRA and DRB to group them into chunks
2793 suitable for grouping. */
2796 dr_group_sort_cmp (const void *dra_
, const void *drb_
)
2798 data_reference_p dra
= *(data_reference_p
*)const_cast<void *>(dra_
);
2799 data_reference_p drb
= *(data_reference_p
*)const_cast<void *>(drb_
);
2802 /* Stabilize sort. */
2806 /* DRs in different loops never belong to the same group. */
2807 loop_p loopa
= gimple_bb (DR_STMT (dra
))->loop_father
;
2808 loop_p loopb
= gimple_bb (DR_STMT (drb
))->loop_father
;
2810 return loopa
->num
< loopb
->num
? -1 : 1;
2812 /* Ordering of DRs according to base. */
2813 cmp
= data_ref_compare_tree (DR_BASE_ADDRESS (dra
),
2814 DR_BASE_ADDRESS (drb
));
2818 /* And according to DR_OFFSET. */
2819 cmp
= data_ref_compare_tree (DR_OFFSET (dra
), DR_OFFSET (drb
));
2823 /* Put reads before writes. */
2824 if (DR_IS_READ (dra
) != DR_IS_READ (drb
))
2825 return DR_IS_READ (dra
) ? -1 : 1;
2827 /* Then sort after access size. */
2828 cmp
= data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra
))),
2829 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb
))));
2833 /* And after step. */
2834 cmp
= data_ref_compare_tree (DR_STEP (dra
), DR_STEP (drb
));
2838 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2839 cmp
= data_ref_compare_tree (DR_INIT (dra
), DR_INIT (drb
));
2841 return gimple_uid (DR_STMT (dra
)) < gimple_uid (DR_STMT (drb
)) ? -1 : 1;
2845 /* If OP is the result of a conversion, return the unconverted value,
2846 otherwise return null. */
2849 strip_conversion (tree op
)
2851 if (TREE_CODE (op
) != SSA_NAME
)
2853 gimple
*stmt
= SSA_NAME_DEF_STMT (op
);
2854 if (!is_gimple_assign (stmt
)
2855 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt
)))
2857 return gimple_assign_rhs1 (stmt
);
2860 /* Return true if vectorizable_* routines can handle statements STMT1
2861 and STMT2 being in a single group. */
2864 can_group_stmts_p (gimple
*stmt1
, gimple
*stmt2
)
2866 if (gimple_assign_single_p (stmt1
))
2867 return gimple_assign_single_p (stmt2
);
2869 if (is_gimple_call (stmt1
) && gimple_call_internal_p (stmt1
))
2871 /* Check for two masked loads or two masked stores. */
2872 if (!is_gimple_call (stmt2
) || !gimple_call_internal_p (stmt2
))
2874 internal_fn ifn
= gimple_call_internal_fn (stmt1
);
2875 if (ifn
!= IFN_MASK_LOAD
&& ifn
!= IFN_MASK_STORE
)
2877 if (ifn
!= gimple_call_internal_fn (stmt2
))
2880 /* Check that the masks are the same. Cope with casts of masks,
2881 like those created by build_mask_conversion. */
2882 tree mask1
= gimple_call_arg (stmt1
, 2);
2883 tree mask2
= gimple_call_arg (stmt2
, 2);
2884 if (!operand_equal_p (mask1
, mask2
, 0))
2886 mask1
= strip_conversion (mask1
);
2889 mask2
= strip_conversion (mask2
);
2892 if (!operand_equal_p (mask1
, mask2
, 0))
2901 /* Function vect_analyze_data_ref_accesses.
2903 Analyze the access pattern of all the data references in the loop.
2905 FORNOW: the only access pattern that is considered vectorizable is a
2906 simple step 1 (consecutive) access.
2908 FORNOW: handle only arrays and pointer accesses. */
2911 vect_analyze_data_ref_accesses (vec_info
*vinfo
)
2914 vec
<data_reference_p
> datarefs
= vinfo
->datarefs
;
2915 struct data_reference
*dr
;
2917 if (dump_enabled_p ())
2918 dump_printf_loc (MSG_NOTE
, vect_location
,
2919 "=== vect_analyze_data_ref_accesses ===\n");
2921 if (datarefs
.is_empty ())
2924 /* Sort the array of datarefs to make building the interleaving chains
2925 linear. Don't modify the original vector's order, it is needed for
2926 determining what dependencies are reversed. */
2927 vec
<data_reference_p
> datarefs_copy
= datarefs
.copy ();
2928 datarefs_copy
.qsort (dr_group_sort_cmp
);
2930 /* Build the interleaving chains. */
2931 for (i
= 0; i
< datarefs_copy
.length () - 1;)
2933 data_reference_p dra
= datarefs_copy
[i
];
2934 stmt_vec_info stmtinfo_a
= vinfo_for_stmt (DR_STMT (dra
));
2935 stmt_vec_info lastinfo
= NULL
;
2936 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a
)
2937 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a
))
2942 for (i
= i
+ 1; i
< datarefs_copy
.length (); ++i
)
2944 data_reference_p drb
= datarefs_copy
[i
];
2945 stmt_vec_info stmtinfo_b
= vinfo_for_stmt (DR_STMT (drb
));
2946 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b
)
2947 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b
))
2950 /* ??? Imperfect sorting (non-compatible types, non-modulo
2951 accesses, same accesses) can lead to a group to be artificially
2952 split here as we don't just skip over those. If it really
2953 matters we can push those to a worklist and re-iterate
2954 over them. The we can just skip ahead to the next DR here. */
2956 /* DRs in a different loop should not be put into the same
2957 interleaving group. */
2958 if (gimple_bb (DR_STMT (dra
))->loop_father
2959 != gimple_bb (DR_STMT (drb
))->loop_father
)
2962 /* Check that the data-refs have same first location (except init)
2963 and they are both either store or load (not load and store,
2964 not masked loads or stores). */
2965 if (DR_IS_READ (dra
) != DR_IS_READ (drb
)
2966 || data_ref_compare_tree (DR_BASE_ADDRESS (dra
),
2967 DR_BASE_ADDRESS (drb
)) != 0
2968 || data_ref_compare_tree (DR_OFFSET (dra
), DR_OFFSET (drb
)) != 0
2969 || !can_group_stmts_p (DR_STMT (dra
), DR_STMT (drb
)))
2972 /* Check that the data-refs have the same constant size. */
2973 tree sza
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra
)));
2974 tree szb
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb
)));
2975 if (!tree_fits_uhwi_p (sza
)
2976 || !tree_fits_uhwi_p (szb
)
2977 || !tree_int_cst_equal (sza
, szb
))
2980 /* Check that the data-refs have the same step. */
2981 if (data_ref_compare_tree (DR_STEP (dra
), DR_STEP (drb
)) != 0)
2984 /* Check the types are compatible.
2985 ??? We don't distinguish this during sorting. */
2986 if (!types_compatible_p (TREE_TYPE (DR_REF (dra
)),
2987 TREE_TYPE (DR_REF (drb
))))
2990 /* Check that the DR_INITs are compile-time constants. */
2991 if (TREE_CODE (DR_INIT (dra
)) != INTEGER_CST
2992 || TREE_CODE (DR_INIT (drb
)) != INTEGER_CST
)
2995 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2996 HOST_WIDE_INT init_a
= TREE_INT_CST_LOW (DR_INIT (dra
));
2997 HOST_WIDE_INT init_b
= TREE_INT_CST_LOW (DR_INIT (drb
));
2998 HOST_WIDE_INT init_prev
2999 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy
[i
-1]));
3000 gcc_assert (init_a
<= init_b
3001 && init_a
<= init_prev
3002 && init_prev
<= init_b
);
3004 /* Do not place the same access in the interleaving chain twice. */
3005 if (init_b
== init_prev
)
3007 gcc_assert (gimple_uid (DR_STMT (datarefs_copy
[i
-1]))
3008 < gimple_uid (DR_STMT (drb
)));
3009 /* ??? For now we simply "drop" the later reference which is
3010 otherwise the same rather than finishing off this group.
3011 In the end we'd want to re-process duplicates forming
3012 multiple groups from the refs, likely by just collecting
3013 all candidates (including duplicates and split points
3014 below) in a vector and then process them together. */
3018 /* If init_b == init_a + the size of the type * k, we have an
3019 interleaving, and DRA is accessed before DRB. */
3020 HOST_WIDE_INT type_size_a
= tree_to_uhwi (sza
);
3021 if (type_size_a
== 0
3022 || (init_b
- init_a
) % type_size_a
!= 0)
3025 /* If we have a store, the accesses are adjacent. This splits
3026 groups into chunks we support (we don't support vectorization
3027 of stores with gaps). */
3028 if (!DR_IS_READ (dra
) && init_b
- init_prev
!= type_size_a
)
3031 /* If the step (if not zero or non-constant) is greater than the
3032 difference between data-refs' inits this splits groups into
3034 if (tree_fits_shwi_p (DR_STEP (dra
)))
3036 HOST_WIDE_INT step
= tree_to_shwi (DR_STEP (dra
));
3037 if (step
!= 0 && step
<= (init_b
- init_a
))
3041 if (dump_enabled_p ())
3043 dump_printf_loc (MSG_NOTE
, vect_location
,
3044 "Detected interleaving ");
3045 if (DR_IS_READ (dra
))
3046 dump_printf (MSG_NOTE
, "load ");
3048 dump_printf (MSG_NOTE
, "store ");
3049 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dra
));
3050 dump_printf (MSG_NOTE
, " and ");
3051 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (drb
));
3052 dump_printf (MSG_NOTE
, "\n");
3055 /* Link the found element into the group list. */
3056 if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a
))
3058 DR_GROUP_FIRST_ELEMENT (stmtinfo_a
) = DR_STMT (dra
);
3059 lastinfo
= stmtinfo_a
;
3061 DR_GROUP_FIRST_ELEMENT (stmtinfo_b
) = DR_STMT (dra
);
3062 DR_GROUP_NEXT_ELEMENT (lastinfo
) = DR_STMT (drb
);
3063 lastinfo
= stmtinfo_b
;
3067 FOR_EACH_VEC_ELT (datarefs_copy
, i
, dr
)
3068 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr
)))
3069 && !vect_analyze_data_ref_access (dr
))
3071 if (dump_enabled_p ())
3072 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3073 "not vectorized: complicated access pattern.\n");
3075 if (is_a
<bb_vec_info
> (vinfo
))
3077 /* Mark the statement as not vectorizable. */
3078 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr
))) = false;
3083 datarefs_copy
.release ();
3088 datarefs_copy
.release ();
3092 /* Function vect_vfa_segment_size.
3095 DR: The data reference.
3096 LENGTH_FACTOR: segment length to consider.
3098 Return a value suitable for the dr_with_seg_len::seg_len field.
3099 This is the "distance travelled" by the pointer from the first
3100 iteration in the segment to the last. Note that it does not include
3101 the size of the access; in effect it only describes the first byte. */
3104 vect_vfa_segment_size (struct data_reference
*dr
, tree length_factor
)
3106 length_factor
= size_binop (MINUS_EXPR
,
3107 fold_convert (sizetype
, length_factor
),
3109 return size_binop (MULT_EXPR
, fold_convert (sizetype
, DR_STEP (dr
)),
3113 /* Return a value that, when added to abs (vect_vfa_segment_size (dr)),
3114 gives the worst-case number of bytes covered by the segment. */
3116 static unsigned HOST_WIDE_INT
3117 vect_vfa_access_size (data_reference
*dr
)
3119 stmt_vec_info stmt_vinfo
= vinfo_for_stmt (DR_STMT (dr
));
3120 tree ref_type
= TREE_TYPE (DR_REF (dr
));
3121 unsigned HOST_WIDE_INT ref_size
= tree_to_uhwi (TYPE_SIZE_UNIT (ref_type
));
3122 unsigned HOST_WIDE_INT access_size
= ref_size
;
3123 if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo
))
3125 gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo
) == DR_STMT (dr
));
3126 access_size
*= DR_GROUP_SIZE (stmt_vinfo
) - DR_GROUP_GAP (stmt_vinfo
);
3128 if (STMT_VINFO_VEC_STMT (stmt_vinfo
)
3129 && (vect_supportable_dr_alignment (dr
, false)
3130 == dr_explicit_realign_optimized
))
3132 /* We might access a full vector's worth. */
3133 tree vectype
= STMT_VINFO_VECTYPE (stmt_vinfo
);
3134 access_size
+= tree_to_uhwi (TYPE_SIZE_UNIT (vectype
)) - ref_size
;
3139 /* Get the minimum alignment for all the scalar accesses that DR describes. */
3142 vect_vfa_align (const data_reference
*dr
)
3144 return TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr
)));
3147 /* Function vect_no_alias_p.
3149 Given data references A and B with equal base and offset, see whether
3150 the alias relation can be decided at compilation time. Return 1 if
3151 it can and the references alias, 0 if it can and the references do
3152 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A,
3153 SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3154 of dr_with_seg_len::{seg_len,access_size} for A and B. */
3157 vect_compile_time_alias (struct data_reference
*a
, struct data_reference
*b
,
3158 tree segment_length_a
, tree segment_length_b
,
3159 unsigned HOST_WIDE_INT access_size_a
,
3160 unsigned HOST_WIDE_INT access_size_b
)
3162 poly_offset_int offset_a
= wi::to_poly_offset (DR_INIT (a
));
3163 poly_offset_int offset_b
= wi::to_poly_offset (DR_INIT (b
));
3164 poly_uint64 const_length_a
;
3165 poly_uint64 const_length_b
;
3167 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3168 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3170 if (tree_int_cst_compare (DR_STEP (a
), size_zero_node
) < 0)
3172 const_length_a
= (-wi::to_poly_wide (segment_length_a
)).force_uhwi ();
3173 offset_a
= (offset_a
+ access_size_a
) - const_length_a
;
3176 const_length_a
= tree_to_poly_uint64 (segment_length_a
);
3177 if (tree_int_cst_compare (DR_STEP (b
), size_zero_node
) < 0)
3179 const_length_b
= (-wi::to_poly_wide (segment_length_b
)).force_uhwi ();
3180 offset_b
= (offset_b
+ access_size_b
) - const_length_b
;
3183 const_length_b
= tree_to_poly_uint64 (segment_length_b
);
3185 const_length_a
+= access_size_a
;
3186 const_length_b
+= access_size_b
;
3188 if (ranges_known_overlap_p (offset_a
, const_length_a
,
3189 offset_b
, const_length_b
))
3192 if (!ranges_maybe_overlap_p (offset_a
, const_length_a
,
3193 offset_b
, const_length_b
))
3199 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3203 dependence_distance_ge_vf (data_dependence_relation
*ddr
,
3204 unsigned int loop_depth
, poly_uint64 vf
)
3206 if (DDR_ARE_DEPENDENT (ddr
) != NULL_TREE
3207 || DDR_NUM_DIST_VECTS (ddr
) == 0)
3210 /* If the dependence is exact, we should have limited the VF instead. */
3211 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr
));
3214 lambda_vector dist_v
;
3215 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr
), i
, dist_v
)
3217 HOST_WIDE_INT dist
= dist_v
[loop_depth
];
3219 && !(dist
> 0 && DDR_REVERSED_P (ddr
))
3220 && maybe_lt ((unsigned HOST_WIDE_INT
) abs_hwi (dist
), vf
))
3224 if (dump_enabled_p ())
3226 dump_printf_loc (MSG_NOTE
, vect_location
,
3227 "dependence distance between ");
3228 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (DDR_A (ddr
)));
3229 dump_printf (MSG_NOTE
, " and ");
3230 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (DDR_B (ddr
)));
3231 dump_printf (MSG_NOTE
, " is >= VF\n");
3237 /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */
3240 dump_lower_bound (int dump_kind
, const vec_lower_bound
&lower_bound
)
3242 dump_printf (dump_kind
, "%s (", lower_bound
.unsigned_p
? "unsigned" : "abs");
3243 dump_generic_expr (dump_kind
, TDF_SLIM
, lower_bound
.expr
);
3244 dump_printf (dump_kind
, ") >= ");
3245 dump_dec (dump_kind
, lower_bound
.min_value
);
3248 /* Record that the vectorized loop requires the vec_lower_bound described
3249 by EXPR, UNSIGNED_P and MIN_VALUE. */
3252 vect_check_lower_bound (loop_vec_info loop_vinfo
, tree expr
, bool unsigned_p
,
3253 poly_uint64 min_value
)
3255 vec
<vec_lower_bound
> lower_bounds
= LOOP_VINFO_LOWER_BOUNDS (loop_vinfo
);
3256 for (unsigned int i
= 0; i
< lower_bounds
.length (); ++i
)
3257 if (operand_equal_p (lower_bounds
[i
].expr
, expr
, 0))
3259 unsigned_p
&= lower_bounds
[i
].unsigned_p
;
3260 min_value
= upper_bound (lower_bounds
[i
].min_value
, min_value
);
3261 if (lower_bounds
[i
].unsigned_p
!= unsigned_p
3262 || maybe_lt (lower_bounds
[i
].min_value
, min_value
))
3264 lower_bounds
[i
].unsigned_p
= unsigned_p
;
3265 lower_bounds
[i
].min_value
= min_value
;
3266 if (dump_enabled_p ())
3268 dump_printf_loc (MSG_NOTE
, vect_location
,
3269 "updating run-time check to ");
3270 dump_lower_bound (MSG_NOTE
, lower_bounds
[i
]);
3271 dump_printf (MSG_NOTE
, "\n");
3277 vec_lower_bound
lower_bound (expr
, unsigned_p
, min_value
);
3278 if (dump_enabled_p ())
3280 dump_printf_loc (MSG_NOTE
, vect_location
, "need a run-time check that ");
3281 dump_lower_bound (MSG_NOTE
, lower_bound
);
3282 dump_printf (MSG_NOTE
, "\n");
3284 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo
).safe_push (lower_bound
);
3287 /* Return true if it's unlikely that the step of the vectorized form of DR
3288 will span fewer than GAP bytes. */
3291 vect_small_gap_p (loop_vec_info loop_vinfo
, data_reference
*dr
, poly_int64 gap
)
3293 stmt_vec_info stmt_info
= vinfo_for_stmt (DR_STMT (dr
));
3295 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo
));
3296 if (DR_GROUP_FIRST_ELEMENT (stmt_info
))
3297 count
*= DR_GROUP_SIZE (vinfo_for_stmt (DR_GROUP_FIRST_ELEMENT (stmt_info
)));
3298 return estimated_poly_value (gap
) <= count
* vect_get_scalar_dr_size (dr
);
3301 /* Return true if we know that there is no alias between DR_A and DR_B
3302 when abs (DR_STEP (DR_A)) >= N for some N. When returning true, set
3303 *LOWER_BOUND_OUT to this N. */
3306 vectorizable_with_step_bound_p (data_reference
*dr_a
, data_reference
*dr_b
,
3307 poly_uint64
*lower_bound_out
)
3309 /* Check that there is a constant gap of known sign between DR_A
3311 poly_int64 init_a
, init_b
;
3312 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a
), DR_BASE_ADDRESS (dr_b
), 0)
3313 || !operand_equal_p (DR_OFFSET (dr_a
), DR_OFFSET (dr_b
), 0)
3314 || !operand_equal_p (DR_STEP (dr_a
), DR_STEP (dr_b
), 0)
3315 || !poly_int_tree_p (DR_INIT (dr_a
), &init_a
)
3316 || !poly_int_tree_p (DR_INIT (dr_b
), &init_b
)
3317 || !ordered_p (init_a
, init_b
))
3320 /* Sort DR_A and DR_B by the address they access. */
3321 if (maybe_lt (init_b
, init_a
))
3323 std::swap (init_a
, init_b
);
3324 std::swap (dr_a
, dr_b
);
3327 /* If the two accesses could be dependent within a scalar iteration,
3328 make sure that we'd retain their order. */
3329 if (maybe_gt (init_a
+ vect_get_scalar_dr_size (dr_a
), init_b
)
3330 && !vect_preserves_scalar_order_p (DR_STMT (dr_a
), DR_STMT (dr_b
)))
3333 /* There is no alias if abs (DR_STEP) is greater than or equal to
3334 the bytes spanned by the combination of the two accesses. */
3335 *lower_bound_out
= init_b
+ vect_get_scalar_dr_size (dr_b
) - init_a
;
3339 /* Function vect_prune_runtime_alias_test_list.
3341 Prune a list of ddrs to be tested at run-time by versioning for alias.
3342 Merge several alias checks into one if possible.
3343 Return FALSE if resulting list of ddrs is longer then allowed by
3344 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3347 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo
)
3349 typedef pair_hash
<tree_operand_hash
, tree_operand_hash
> tree_pair_hash
;
3350 hash_set
<tree_pair_hash
> compared_objects
;
3352 vec
<ddr_p
> may_alias_ddrs
= LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo
);
3353 vec
<dr_with_seg_len_pair_t
> &comp_alias_ddrs
3354 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo
);
3355 vec
<vec_object_pair
> &check_unequal_addrs
3356 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo
);
3357 poly_uint64 vect_factor
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
3358 tree scalar_loop_iters
= LOOP_VINFO_NITERS (loop_vinfo
);
3364 if (dump_enabled_p ())
3365 dump_printf_loc (MSG_NOTE
, vect_location
,
3366 "=== vect_prune_runtime_alias_test_list ===\n");
3368 /* Step values are irrelevant for aliasing if the number of vector
3369 iterations is equal to the number of scalar iterations (which can
3370 happen for fully-SLP loops). */
3371 bool ignore_step_p
= known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo
), 1U);
3375 /* Convert the checks for nonzero steps into bound tests. */
3377 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo
), i
, value
)
3378 vect_check_lower_bound (loop_vinfo
, value
, true, 1);
3381 if (may_alias_ddrs
.is_empty ())
3384 comp_alias_ddrs
.create (may_alias_ddrs
.length ());
3386 unsigned int loop_depth
3387 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo
)->num
,
3388 LOOP_VINFO_LOOP_NEST (loop_vinfo
));
3390 /* First, we collect all data ref pairs for aliasing checks. */
3391 FOR_EACH_VEC_ELT (may_alias_ddrs
, i
, ddr
)
3394 poly_uint64 lower_bound
;
3395 struct data_reference
*dr_a
, *dr_b
;
3396 gimple
*dr_group_first_a
, *dr_group_first_b
;
3397 tree segment_length_a
, segment_length_b
;
3398 unsigned HOST_WIDE_INT access_size_a
, access_size_b
;
3399 unsigned int align_a
, align_b
;
3400 gimple
*stmt_a
, *stmt_b
;
3402 /* Ignore the alias if the VF we chose ended up being no greater
3403 than the dependence distance. */
3404 if (dependence_distance_ge_vf (ddr
, loop_depth
, vect_factor
))
3407 if (DDR_OBJECT_A (ddr
))
3409 vec_object_pair
new_pair (DDR_OBJECT_A (ddr
), DDR_OBJECT_B (ddr
));
3410 if (!compared_objects
.add (new_pair
))
3412 if (dump_enabled_p ())
3414 dump_printf_loc (MSG_NOTE
, vect_location
, "checking that ");
3415 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, new_pair
.first
);
3416 dump_printf (MSG_NOTE
, " and ");
3417 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, new_pair
.second
);
3418 dump_printf (MSG_NOTE
, " have different addresses\n");
3420 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo
).safe_push (new_pair
);
3426 stmt_a
= DR_STMT (DDR_A (ddr
));
3429 stmt_b
= DR_STMT (DDR_B (ddr
));
3431 /* Skip the pair if inter-iteration dependencies are irrelevant
3432 and intra-iteration dependencies are guaranteed to be honored. */
3434 && (vect_preserves_scalar_order_p (stmt_a
, stmt_b
)
3435 || vectorizable_with_step_bound_p (dr_a
, dr_b
, &lower_bound
)))
3437 if (dump_enabled_p ())
3439 dump_printf_loc (MSG_NOTE
, vect_location
,
3440 "no need for alias check between ");
3441 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dr_a
));
3442 dump_printf (MSG_NOTE
, " and ");
3443 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dr_b
));
3444 dump_printf (MSG_NOTE
, " when VF is 1\n");
3449 /* See whether we can handle the alias using a bounds check on
3450 the step, and whether that's likely to be the best approach.
3451 (It might not be, for example, if the minimum step is much larger
3452 than the number of bytes handled by one vector iteration.) */
3454 && TREE_CODE (DR_STEP (dr_a
)) != INTEGER_CST
3455 && vectorizable_with_step_bound_p (dr_a
, dr_b
, &lower_bound
)
3456 && (vect_small_gap_p (loop_vinfo
, dr_a
, lower_bound
)
3457 || vect_small_gap_p (loop_vinfo
, dr_b
, lower_bound
)))
3459 bool unsigned_p
= dr_known_forward_stride_p (dr_a
);
3460 if (dump_enabled_p ())
3462 dump_printf_loc (MSG_NOTE
, vect_location
, "no alias between ");
3463 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dr_a
));
3464 dump_printf (MSG_NOTE
, " and ");
3465 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dr_b
));
3466 dump_printf (MSG_NOTE
, " when the step ");
3467 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_STEP (dr_a
));
3468 dump_printf (MSG_NOTE
, " is outside ");
3470 dump_printf (MSG_NOTE
, "[0");
3473 dump_printf (MSG_NOTE
, "(");
3474 dump_dec (MSG_NOTE
, poly_int64 (-lower_bound
));
3476 dump_printf (MSG_NOTE
, ", ");
3477 dump_dec (MSG_NOTE
, lower_bound
);
3478 dump_printf (MSG_NOTE
, ")\n");
3480 vect_check_lower_bound (loop_vinfo
, DR_STEP (dr_a
), unsigned_p
,
3485 dr_group_first_a
= DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a
));
3486 if (dr_group_first_a
)
3488 stmt_a
= dr_group_first_a
;
3489 dr_a
= STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a
));
3492 dr_group_first_b
= DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b
));
3493 if (dr_group_first_b
)
3495 stmt_b
= dr_group_first_b
;
3496 dr_b
= STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b
));
3501 segment_length_a
= size_zero_node
;
3502 segment_length_b
= size_zero_node
;
3506 if (!operand_equal_p (DR_STEP (dr_a
), DR_STEP (dr_b
), 0))
3507 length_factor
= scalar_loop_iters
;
3509 length_factor
= size_int (vect_factor
);
3510 segment_length_a
= vect_vfa_segment_size (dr_a
, length_factor
);
3511 segment_length_b
= vect_vfa_segment_size (dr_b
, length_factor
);
3513 access_size_a
= vect_vfa_access_size (dr_a
);
3514 access_size_b
= vect_vfa_access_size (dr_b
);
3515 align_a
= vect_vfa_align (dr_a
);
3516 align_b
= vect_vfa_align (dr_b
);
3518 comp_res
= data_ref_compare_tree (DR_BASE_ADDRESS (dr_a
),
3519 DR_BASE_ADDRESS (dr_b
));
3521 comp_res
= data_ref_compare_tree (DR_OFFSET (dr_a
),
3524 /* See whether the alias is known at compilation time. */
3526 && TREE_CODE (DR_STEP (dr_a
)) == INTEGER_CST
3527 && TREE_CODE (DR_STEP (dr_b
)) == INTEGER_CST
3528 && poly_int_tree_p (segment_length_a
)
3529 && poly_int_tree_p (segment_length_b
))
3531 int res
= vect_compile_time_alias (dr_a
, dr_b
,
3536 if (res
>= 0 && dump_enabled_p ())
3538 dump_printf_loc (MSG_NOTE
, vect_location
,
3539 "can tell at compile time that ");
3540 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dr_a
));
3541 dump_printf (MSG_NOTE
, " and ");
3542 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dr_b
));
3544 dump_printf (MSG_NOTE
, " do not alias\n");
3546 dump_printf (MSG_NOTE
, " alias\n");
3554 if (dump_enabled_p ())
3555 dump_printf_loc (MSG_NOTE
, vect_location
,
3556 "not vectorized: compilation time alias.\n");
3561 dr_with_seg_len_pair_t dr_with_seg_len_pair
3562 (dr_with_seg_len (dr_a
, segment_length_a
, access_size_a
, align_a
),
3563 dr_with_seg_len (dr_b
, segment_length_b
, access_size_b
, align_b
));
3565 /* Canonicalize pairs by sorting the two DR members. */
3567 std::swap (dr_with_seg_len_pair
.first
, dr_with_seg_len_pair
.second
);
3569 comp_alias_ddrs
.safe_push (dr_with_seg_len_pair
);
3572 prune_runtime_alias_test_list (&comp_alias_ddrs
, vect_factor
);
3574 unsigned int count
= (comp_alias_ddrs
.length ()
3575 + check_unequal_addrs
.length ());
3577 dump_printf_loc (MSG_NOTE
, vect_location
,
3578 "improved number of alias checks from %d to %d\n",
3579 may_alias_ddrs
.length (), count
);
3580 if ((int) count
> PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS
))
3582 if (dump_enabled_p ())
3583 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3584 "number of versioning for alias "
3585 "run-time tests exceeds %d "
3586 "(--param vect-max-version-for-alias-checks)\n",
3587 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS
));
3594 /* Check whether we can use an internal function for a gather load
3595 or scatter store. READ_P is true for loads and false for stores.
3596 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
3597 the type of the memory elements being loaded or stored. OFFSET_BITS
3598 is the number of bits in each scalar offset and OFFSET_SIGN is the
3599 sign of the offset. SCALE is the amount by which the offset should
3600 be multiplied *after* it has been converted to address width.
3602 Return true if the function is supported, storing the function
3603 id in *IFN_OUT and the type of a vector element in *ELEMENT_TYPE_OUT. */
3606 vect_gather_scatter_fn_p (bool read_p
, bool masked_p
, tree vectype
,
3607 tree memory_type
, unsigned int offset_bits
,
3608 signop offset_sign
, int scale
,
3609 internal_fn
*ifn_out
, tree
*element_type_out
)
3611 unsigned int memory_bits
= tree_to_uhwi (TYPE_SIZE (memory_type
));
3612 unsigned int element_bits
= tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype
)));
3613 if (offset_bits
> element_bits
)
3614 /* Internal functions require the offset to be the same width as
3615 the vector elements. We can extend narrower offsets, but it isn't
3616 safe to truncate wider offsets. */
3619 if (element_bits
!= memory_bits
)
3620 /* For now the vector elements must be the same width as the
3624 /* Work out which function we need. */
3627 ifn
= masked_p
? IFN_MASK_GATHER_LOAD
: IFN_GATHER_LOAD
;
3629 ifn
= masked_p
? IFN_MASK_SCATTER_STORE
: IFN_SCATTER_STORE
;
3631 /* Test whether the target supports this combination. */
3632 if (!internal_gather_scatter_fn_supported_p (ifn
, vectype
, memory_type
,
3633 offset_sign
, scale
))
3637 *element_type_out
= TREE_TYPE (vectype
);
3641 /* CALL is a call to an internal gather load or scatter store function.
3642 Describe the operation in INFO. */
3645 vect_describe_gather_scatter_call (gcall
*call
, gather_scatter_info
*info
)
3647 stmt_vec_info stmt_info
= vinfo_for_stmt (call
);
3648 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
3649 data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
3651 info
->ifn
= gimple_call_internal_fn (call
);
3652 info
->decl
= NULL_TREE
;
3653 info
->base
= gimple_call_arg (call
, 0);
3654 info
->offset
= gimple_call_arg (call
, 1);
3655 info
->offset_dt
= vect_unknown_def_type
;
3656 info
->offset_vectype
= NULL_TREE
;
3657 info
->scale
= TREE_INT_CST_LOW (gimple_call_arg (call
, 2));
3658 info
->element_type
= TREE_TYPE (vectype
);
3659 info
->memory_type
= TREE_TYPE (DR_REF (dr
));
3662 /* Return true if a non-affine read or write in STMT is suitable for a
3663 gather load or scatter store. Describe the operation in *INFO if so. */
3666 vect_check_gather_scatter (gimple
*stmt
, loop_vec_info loop_vinfo
,
3667 gather_scatter_info
*info
)
3669 HOST_WIDE_INT scale
= 1;
3670 poly_int64 pbitpos
, pbitsize
;
3671 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
3672 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
3673 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
3674 tree offtype
= NULL_TREE
;
3675 tree decl
= NULL_TREE
, base
, off
;
3676 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
3677 tree memory_type
= TREE_TYPE (DR_REF (dr
));
3679 int punsignedp
, reversep
, pvolatilep
= 0;
3682 bool masked_p
= false;
3684 /* See whether this is already a call to a gather/scatter internal function.
3685 If not, see whether it's a masked load or store. */
3686 gcall
*call
= dyn_cast
<gcall
*> (stmt
);
3687 if (call
&& gimple_call_internal_p (call
))
3689 ifn
= gimple_call_internal_fn (stmt
);
3690 if (internal_gather_scatter_fn_p (ifn
))
3692 vect_describe_gather_scatter_call (call
, info
);
3695 masked_p
= (ifn
== IFN_MASK_LOAD
|| ifn
== IFN_MASK_STORE
);
3698 /* True if we should aim to use internal functions rather than
3699 built-in functions. */
3700 bool use_ifn_p
= (DR_IS_READ (dr
)
3701 ? supports_vec_gather_load_p ()
3702 : supports_vec_scatter_store_p ());
3705 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3706 see if we can use the def stmt of the address. */
3708 && TREE_CODE (base
) == MEM_REF
3709 && TREE_CODE (TREE_OPERAND (base
, 0)) == SSA_NAME
3710 && integer_zerop (TREE_OPERAND (base
, 1))
3711 && !expr_invariant_in_loop_p (loop
, TREE_OPERAND (base
, 0)))
3713 gimple
*def_stmt
= SSA_NAME_DEF_STMT (TREE_OPERAND (base
, 0));
3714 if (is_gimple_assign (def_stmt
)
3715 && gimple_assign_rhs_code (def_stmt
) == ADDR_EXPR
)
3716 base
= TREE_OPERAND (gimple_assign_rhs1 (def_stmt
), 0);
3719 /* The gather and scatter builtins need address of the form
3720 loop_invariant + vector * {1, 2, 4, 8}
3722 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3723 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3724 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3725 multiplications and additions in it. To get a vector, we need
3726 a single SSA_NAME that will be defined in the loop and will
3727 contain everything that is not loop invariant and that can be
3728 vectorized. The following code attempts to find such a preexistng
3729 SSA_NAME OFF and put the loop invariants into a tree BASE
3730 that can be gimplified before the loop. */
3731 base
= get_inner_reference (base
, &pbitsize
, &pbitpos
, &off
, &pmode
,
3732 &punsignedp
, &reversep
, &pvolatilep
);
3733 gcc_assert (base
&& !reversep
);
3734 poly_int64 pbytepos
= exact_div (pbitpos
, BITS_PER_UNIT
);
3736 if (TREE_CODE (base
) == MEM_REF
)
3738 if (!integer_zerop (TREE_OPERAND (base
, 1)))
3740 if (off
== NULL_TREE
)
3741 off
= wide_int_to_tree (sizetype
, mem_ref_offset (base
));
3743 off
= size_binop (PLUS_EXPR
, off
,
3744 fold_convert (sizetype
, TREE_OPERAND (base
, 1)));
3746 base
= TREE_OPERAND (base
, 0);
3749 base
= build_fold_addr_expr (base
);
3751 if (off
== NULL_TREE
)
3752 off
= size_zero_node
;
3754 /* If base is not loop invariant, either off is 0, then we start with just
3755 the constant offset in the loop invariant BASE and continue with base
3756 as OFF, otherwise give up.
3757 We could handle that case by gimplifying the addition of base + off
3758 into some SSA_NAME and use that as off, but for now punt. */
3759 if (!expr_invariant_in_loop_p (loop
, base
))
3761 if (!integer_zerop (off
))
3764 base
= size_int (pbytepos
);
3766 /* Otherwise put base + constant offset into the loop invariant BASE
3767 and continue with OFF. */
3770 base
= fold_convert (sizetype
, base
);
3771 base
= size_binop (PLUS_EXPR
, base
, size_int (pbytepos
));
3774 /* OFF at this point may be either a SSA_NAME or some tree expression
3775 from get_inner_reference. Try to peel off loop invariants from it
3776 into BASE as long as possible. */
3778 while (offtype
== NULL_TREE
)
3780 enum tree_code code
;
3781 tree op0
, op1
, add
= NULL_TREE
;
3783 if (TREE_CODE (off
) == SSA_NAME
)
3785 gimple
*def_stmt
= SSA_NAME_DEF_STMT (off
);
3787 if (expr_invariant_in_loop_p (loop
, off
))
3790 if (gimple_code (def_stmt
) != GIMPLE_ASSIGN
)
3793 op0
= gimple_assign_rhs1 (def_stmt
);
3794 code
= gimple_assign_rhs_code (def_stmt
);
3795 op1
= gimple_assign_rhs2 (def_stmt
);
3799 if (get_gimple_rhs_class (TREE_CODE (off
)) == GIMPLE_TERNARY_RHS
)
3801 code
= TREE_CODE (off
);
3802 extract_ops_from_tree (off
, &code
, &op0
, &op1
);
3806 case POINTER_PLUS_EXPR
:
3808 if (expr_invariant_in_loop_p (loop
, op0
))
3813 add
= fold_convert (sizetype
, add
);
3815 add
= size_binop (MULT_EXPR
, add
, size_int (scale
));
3816 base
= size_binop (PLUS_EXPR
, base
, add
);
3819 if (expr_invariant_in_loop_p (loop
, op1
))
3827 if (expr_invariant_in_loop_p (loop
, op1
))
3829 add
= fold_convert (sizetype
, op1
);
3830 add
= size_binop (MINUS_EXPR
, size_zero_node
, add
);
3836 if (scale
== 1 && tree_fits_shwi_p (op1
))
3838 int new_scale
= tree_to_shwi (op1
);
3839 /* Only treat this as a scaling operation if the target
3842 && !vect_gather_scatter_fn_p (DR_IS_READ (dr
), masked_p
,
3843 vectype
, memory_type
, 1,
3844 TYPE_SIGN (TREE_TYPE (op0
)),
3857 if (!POINTER_TYPE_P (TREE_TYPE (op0
))
3858 && !INTEGRAL_TYPE_P (TREE_TYPE (op0
)))
3860 if (TYPE_PRECISION (TREE_TYPE (op0
))
3861 == TYPE_PRECISION (TREE_TYPE (off
)))
3867 /* The internal functions need the offset to be the same width
3868 as the elements of VECTYPE. Don't include operations that
3869 cast the offset from that width to a different width. */
3871 && (int_size_in_bytes (TREE_TYPE (vectype
))
3872 == int_size_in_bytes (TREE_TYPE (off
))))
3875 if (TYPE_PRECISION (TREE_TYPE (op0
))
3876 < TYPE_PRECISION (TREE_TYPE (off
)))
3879 offtype
= TREE_TYPE (off
);
3890 /* If at the end OFF still isn't a SSA_NAME or isn't
3891 defined in the loop, punt. */
3892 if (TREE_CODE (off
) != SSA_NAME
3893 || expr_invariant_in_loop_p (loop
, off
))
3896 if (offtype
== NULL_TREE
)
3897 offtype
= TREE_TYPE (off
);
3901 if (!vect_gather_scatter_fn_p (DR_IS_READ (dr
), masked_p
, vectype
,
3902 memory_type
, TYPE_PRECISION (offtype
),
3903 TYPE_SIGN (offtype
), scale
, &ifn
,
3909 if (DR_IS_READ (dr
))
3911 if (targetm
.vectorize
.builtin_gather
)
3912 decl
= targetm
.vectorize
.builtin_gather (vectype
, offtype
, scale
);
3916 if (targetm
.vectorize
.builtin_scatter
)
3917 decl
= targetm
.vectorize
.builtin_scatter (vectype
, offtype
, scale
);
3924 element_type
= TREE_TYPE (vectype
);
3931 info
->offset_dt
= vect_unknown_def_type
;
3932 info
->offset_vectype
= NULL_TREE
;
3933 info
->scale
= scale
;
3934 info
->element_type
= element_type
;
3935 info
->memory_type
= memory_type
;
3939 /* Function vect_analyze_data_refs.
3941 Find all the data references in the loop or basic block.
3943 The general structure of the analysis of data refs in the vectorizer is as
3945 1- vect_analyze_data_refs(loop/bb): call
3946 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3947 in the loop/bb and their dependences.
3948 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3949 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3950 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3955 vect_analyze_data_refs (vec_info
*vinfo
, poly_uint64
*min_vf
)
3957 struct loop
*loop
= NULL
;
3959 struct data_reference
*dr
;
3962 if (dump_enabled_p ())
3963 dump_printf_loc (MSG_NOTE
, vect_location
,
3964 "=== vect_analyze_data_refs ===\n");
3966 if (loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
))
3967 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
3969 /* Go through the data-refs, check that the analysis succeeded. Update
3970 pointer from stmt_vec_info struct to DR and vectype. */
3972 vec
<data_reference_p
> datarefs
= vinfo
->datarefs
;
3973 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
3976 stmt_vec_info stmt_info
;
3977 tree base
, offset
, init
;
3978 enum { SG_NONE
, GATHER
, SCATTER
} gatherscatter
= SG_NONE
;
3979 bool simd_lane_access
= false;
3983 if (!dr
|| !DR_REF (dr
))
3985 if (dump_enabled_p ())
3986 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3987 "not vectorized: unhandled data-ref\n");
3991 stmt
= DR_STMT (dr
);
3992 stmt_info
= vinfo_for_stmt (stmt
);
3994 /* Discard clobbers from the dataref vector. We will remove
3995 clobber stmts during vectorization. */
3996 if (gimple_clobber_p (stmt
))
3999 if (i
== datarefs
.length () - 1)
4004 datarefs
.ordered_remove (i
);
4009 /* Check that analysis of the data-ref succeeded. */
4010 if (!DR_BASE_ADDRESS (dr
) || !DR_OFFSET (dr
) || !DR_INIT (dr
)
4015 && !TREE_THIS_VOLATILE (DR_REF (dr
))
4016 && (targetm
.vectorize
.builtin_gather
!= NULL
4017 || supports_vec_gather_load_p ());
4020 && !TREE_THIS_VOLATILE (DR_REF (dr
))
4021 && (targetm
.vectorize
.builtin_scatter
!= NULL
4022 || supports_vec_scatter_store_p ());
4023 bool maybe_simd_lane_access
4024 = is_a
<loop_vec_info
> (vinfo
) && loop
->simduid
;
4026 /* If target supports vector gather loads or scatter stores, or if
4027 this might be a SIMD lane access, see if they can't be used. */
4028 if (is_a
<loop_vec_info
> (vinfo
)
4029 && (maybe_gather
|| maybe_scatter
|| maybe_simd_lane_access
)
4030 && !nested_in_vect_loop_p (loop
, stmt
))
4032 struct data_reference
*newdr
4033 = create_data_ref (NULL
, loop_containing_stmt (stmt
),
4034 DR_REF (dr
), stmt
, !maybe_scatter
,
4035 DR_IS_CONDITIONAL_IN_STMT (dr
));
4036 gcc_assert (newdr
!= NULL
&& DR_REF (newdr
));
4037 if (DR_BASE_ADDRESS (newdr
)
4038 && DR_OFFSET (newdr
)
4041 && integer_zerop (DR_STEP (newdr
)))
4043 if (maybe_simd_lane_access
)
4045 tree off
= DR_OFFSET (newdr
);
4047 if (TREE_CODE (DR_INIT (newdr
)) == INTEGER_CST
4048 && TREE_CODE (off
) == MULT_EXPR
4049 && tree_fits_uhwi_p (TREE_OPERAND (off
, 1)))
4051 tree step
= TREE_OPERAND (off
, 1);
4052 off
= TREE_OPERAND (off
, 0);
4054 if (CONVERT_EXPR_P (off
)
4055 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off
,
4057 < TYPE_PRECISION (TREE_TYPE (off
)))
4058 off
= TREE_OPERAND (off
, 0);
4059 if (TREE_CODE (off
) == SSA_NAME
)
4061 gimple
*def
= SSA_NAME_DEF_STMT (off
);
4062 tree reft
= TREE_TYPE (DR_REF (newdr
));
4063 if (is_gimple_call (def
)
4064 && gimple_call_internal_p (def
)
4065 && (gimple_call_internal_fn (def
)
4066 == IFN_GOMP_SIMD_LANE
))
4068 tree arg
= gimple_call_arg (def
, 0);
4069 gcc_assert (TREE_CODE (arg
) == SSA_NAME
);
4070 arg
= SSA_NAME_VAR (arg
);
4071 if (arg
== loop
->simduid
4073 && tree_int_cst_equal
4074 (TYPE_SIZE_UNIT (reft
),
4077 DR_OFFSET (newdr
) = ssize_int (0);
4078 DR_STEP (newdr
) = step
;
4079 DR_OFFSET_ALIGNMENT (newdr
)
4080 = BIGGEST_ALIGNMENT
;
4081 DR_STEP_ALIGNMENT (newdr
)
4082 = highest_pow2_factor (step
);
4084 simd_lane_access
= true;
4090 if (!simd_lane_access
&& (maybe_gather
|| maybe_scatter
))
4094 gatherscatter
= GATHER
;
4096 gatherscatter
= SCATTER
;
4099 if (gatherscatter
== SG_NONE
&& !simd_lane_access
)
4100 free_data_ref (newdr
);
4103 if (gatherscatter
== SG_NONE
&& !simd_lane_access
)
4105 if (dump_enabled_p ())
4107 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4108 "not vectorized: data ref analysis "
4110 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
4113 if (is_a
<bb_vec_info
> (vinfo
))
4120 if (TREE_CODE (DR_BASE_ADDRESS (dr
)) == INTEGER_CST
)
4122 if (dump_enabled_p ())
4123 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4124 "not vectorized: base addr of dr is a "
4127 if (is_a
<bb_vec_info
> (vinfo
))
4130 if (gatherscatter
!= SG_NONE
|| simd_lane_access
)
4135 if (TREE_THIS_VOLATILE (DR_REF (dr
)))
4137 if (dump_enabled_p ())
4139 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4140 "not vectorized: volatile type ");
4141 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
4144 if (is_a
<bb_vec_info
> (vinfo
))
4150 if (stmt_can_throw_internal (stmt
))
4152 if (dump_enabled_p ())
4154 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4155 "not vectorized: statement can throw an "
4157 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
4160 if (is_a
<bb_vec_info
> (vinfo
))
4163 if (gatherscatter
!= SG_NONE
|| simd_lane_access
)
4168 if (TREE_CODE (DR_REF (dr
)) == COMPONENT_REF
4169 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr
), 1)))
4171 if (dump_enabled_p ())
4173 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4174 "not vectorized: statement is bitfield "
4176 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
4179 if (is_a
<bb_vec_info
> (vinfo
))
4182 if (gatherscatter
!= SG_NONE
|| simd_lane_access
)
4187 base
= unshare_expr (DR_BASE_ADDRESS (dr
));
4188 offset
= unshare_expr (DR_OFFSET (dr
));
4189 init
= unshare_expr (DR_INIT (dr
));
4191 if (is_gimple_call (stmt
)
4192 && (!gimple_call_internal_p (stmt
)
4193 || (gimple_call_internal_fn (stmt
) != IFN_MASK_LOAD
4194 && gimple_call_internal_fn (stmt
) != IFN_MASK_STORE
)))
4196 if (dump_enabled_p ())
4198 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4199 "not vectorized: dr in a call ");
4200 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
4203 if (is_a
<bb_vec_info
> (vinfo
))
4206 if (gatherscatter
!= SG_NONE
|| simd_lane_access
)
4211 /* Update DR field in stmt_vec_info struct. */
4213 /* If the dataref is in an inner-loop of the loop that is considered for
4214 for vectorization, we also want to analyze the access relative to
4215 the outer-loop (DR contains information only relative to the
4216 inner-most enclosing loop). We do that by building a reference to the
4217 first location accessed by the inner-loop, and analyze it relative to
4219 if (loop
&& nested_in_vect_loop_p (loop
, stmt
))
4221 /* Build a reference to the first location accessed by the
4222 inner loop: *(BASE + INIT + OFFSET). By construction,
4223 this address must be invariant in the inner loop, so we
4224 can consider it as being used in the outer loop. */
4225 tree init_offset
= fold_build2 (PLUS_EXPR
, TREE_TYPE (offset
),
4227 tree init_addr
= fold_build_pointer_plus (base
, init_offset
);
4228 tree init_ref
= build_fold_indirect_ref (init_addr
);
4230 if (dump_enabled_p ())
4232 dump_printf_loc (MSG_NOTE
, vect_location
,
4233 "analyze in outer loop: ");
4234 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, init_ref
);
4235 dump_printf (MSG_NOTE
, "\n");
4238 if (!dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info
),
4240 /* dr_analyze_innermost already explained the failure. */
4243 if (dump_enabled_p ())
4245 dump_printf_loc (MSG_NOTE
, vect_location
,
4246 "\touter base_address: ");
4247 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
4248 STMT_VINFO_DR_BASE_ADDRESS (stmt_info
));
4249 dump_printf (MSG_NOTE
, "\n\touter offset from base address: ");
4250 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
4251 STMT_VINFO_DR_OFFSET (stmt_info
));
4252 dump_printf (MSG_NOTE
,
4253 "\n\touter constant offset from base address: ");
4254 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
4255 STMT_VINFO_DR_INIT (stmt_info
));
4256 dump_printf (MSG_NOTE
, "\n\touter step: ");
4257 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
4258 STMT_VINFO_DR_STEP (stmt_info
));
4259 dump_printf (MSG_NOTE
, "\n\touter base alignment: %d\n",
4260 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info
));
4261 dump_printf (MSG_NOTE
, "\n\touter base misalignment: %d\n",
4262 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info
));
4263 dump_printf (MSG_NOTE
, "\n\touter offset alignment: %d\n",
4264 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info
));
4265 dump_printf (MSG_NOTE
, "\n\touter step alignment: %d\n",
4266 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info
));
4270 if (STMT_VINFO_DATA_REF (stmt_info
))
4272 if (dump_enabled_p ())
4274 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4275 "not vectorized: more than one data ref "
4277 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
4280 if (is_a
<bb_vec_info
> (vinfo
))
4283 if (gatherscatter
!= SG_NONE
|| simd_lane_access
)
4288 STMT_VINFO_DATA_REF (stmt_info
) = dr
;
4289 if (simd_lane_access
)
4291 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info
) = true;
4292 free_data_ref (datarefs
[i
]);
4296 if (TREE_CODE (DR_BASE_ADDRESS (dr
)) == ADDR_EXPR
4297 && VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (dr
), 0))
4298 && DECL_NONALIASED (TREE_OPERAND (DR_BASE_ADDRESS (dr
), 0)))
4300 if (dump_enabled_p ())
4302 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4303 "not vectorized: base object not addressable "
4305 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
4307 if (is_a
<bb_vec_info
> (vinfo
))
4309 /* In BB vectorization the ref can still participate
4310 in dependence analysis, we just can't vectorize it. */
4311 STMT_VINFO_VECTORIZABLE (stmt_info
) = false;
4317 /* Set vectype for STMT. */
4318 scalar_type
= TREE_TYPE (DR_REF (dr
));
4319 STMT_VINFO_VECTYPE (stmt_info
)
4320 = get_vectype_for_scalar_type (scalar_type
);
4321 if (!STMT_VINFO_VECTYPE (stmt_info
))
4323 if (dump_enabled_p ())
4325 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4326 "not vectorized: no vectype for stmt: ");
4327 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
4328 dump_printf (MSG_MISSED_OPTIMIZATION
, " scalar_type: ");
4329 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_DETAILS
,
4331 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
4334 if (is_a
<bb_vec_info
> (vinfo
))
4336 /* No vector type is fine, the ref can still participate
4337 in dependence analysis, we just can't vectorize it. */
4338 STMT_VINFO_VECTORIZABLE (stmt_info
) = false;
4342 if (gatherscatter
!= SG_NONE
|| simd_lane_access
)
4344 STMT_VINFO_DATA_REF (stmt_info
) = NULL
;
4345 if (gatherscatter
!= SG_NONE
)
4352 if (dump_enabled_p ())
4354 dump_printf_loc (MSG_NOTE
, vect_location
,
4355 "got vectype for stmt: ");
4356 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, stmt
, 0);
4357 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
4358 STMT_VINFO_VECTYPE (stmt_info
));
4359 dump_printf (MSG_NOTE
, "\n");
4363 /* Adjust the minimal vectorization factor according to the
4365 vf
= TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info
));
4366 *min_vf
= upper_bound (*min_vf
, vf
);
4368 if (gatherscatter
!= SG_NONE
)
4370 gather_scatter_info gs_info
;
4371 if (!vect_check_gather_scatter (stmt
, as_a
<loop_vec_info
> (vinfo
),
4373 || !get_vectype_for_scalar_type (TREE_TYPE (gs_info
.offset
)))
4375 STMT_VINFO_DATA_REF (stmt_info
) = NULL
;
4377 if (dump_enabled_p ())
4379 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4380 (gatherscatter
== GATHER
) ?
4381 "not vectorized: not suitable for gather "
4383 "not vectorized: not suitable for scatter "
4385 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
4390 free_data_ref (datarefs
[i
]);
4392 STMT_VINFO_GATHER_SCATTER_P (stmt_info
) = gatherscatter
;
4395 else if (is_a
<loop_vec_info
> (vinfo
)
4396 && TREE_CODE (DR_STEP (dr
)) != INTEGER_CST
)
4398 if (nested_in_vect_loop_p (loop
, stmt
))
4400 if (dump_enabled_p ())
4402 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4403 "not vectorized: not suitable for strided "
4405 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
4409 STMT_VINFO_STRIDED_P (stmt_info
) = true;
4413 /* If we stopped analysis at the first dataref we could not analyze
4414 when trying to vectorize a basic-block mark the rest of the datarefs
4415 as not vectorizable and truncate the vector of datarefs. That
4416 avoids spending useless time in analyzing their dependence. */
4417 if (i
!= datarefs
.length ())
4419 gcc_assert (is_a
<bb_vec_info
> (vinfo
));
4420 for (unsigned j
= i
; j
< datarefs
.length (); ++j
)
4422 data_reference_p dr
= datarefs
[j
];
4423 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr
))) = false;
4426 datarefs
.truncate (i
);
4433 /* Function vect_get_new_vect_var.
4435 Returns a name for a new variable. The current naming scheme appends the
4436 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
4437 the name of vectorizer generated variables, and appends that to NAME if
4441 vect_get_new_vect_var (tree type
, enum vect_var_kind var_kind
, const char *name
)
4448 case vect_simple_var
:
4451 case vect_scalar_var
:
4457 case vect_pointer_var
:
4466 char* tmp
= concat (prefix
, "_", name
, NULL
);
4467 new_vect_var
= create_tmp_reg (type
, tmp
);
4471 new_vect_var
= create_tmp_reg (type
, prefix
);
4473 return new_vect_var
;
4476 /* Like vect_get_new_vect_var but return an SSA name. */
4479 vect_get_new_ssa_name (tree type
, enum vect_var_kind var_kind
, const char *name
)
4486 case vect_simple_var
:
4489 case vect_scalar_var
:
4492 case vect_pointer_var
:
4501 char* tmp
= concat (prefix
, "_", name
, NULL
);
4502 new_vect_var
= make_temp_ssa_name (type
, NULL
, tmp
);
4506 new_vect_var
= make_temp_ssa_name (type
, NULL
, prefix
);
4508 return new_vect_var
;
4511 /* Duplicate ptr info and set alignment/misaligment on NAME from DR. */
4514 vect_duplicate_ssa_name_ptr_info (tree name
, data_reference
*dr
)
4516 duplicate_ssa_name_ptr_info (name
, DR_PTR_INFO (dr
));
4517 int misalign
= DR_MISALIGNMENT (dr
);
4518 if (misalign
== DR_MISALIGNMENT_UNKNOWN
)
4519 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name
));
4521 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name
),
4522 DR_TARGET_ALIGNMENT (dr
), misalign
);
4525 /* Function vect_create_addr_base_for_vector_ref.
4527 Create an expression that computes the address of the first memory location
4528 that will be accessed for a data reference.
4531 STMT: The statement containing the data reference.
4532 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4533 OFFSET: Optional. If supplied, it is be added to the initial address.
4534 LOOP: Specify relative to which loop-nest should the address be computed.
4535 For example, when the dataref is in an inner-loop nested in an
4536 outer-loop that is now being vectorized, LOOP can be either the
4537 outer-loop, or the inner-loop. The first memory location accessed
4538 by the following dataref ('in' points to short):
4545 if LOOP=i_loop: &in (relative to i_loop)
4546 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4547 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4548 initial address. Unlike OFFSET, which is number of elements to
4549 be added, BYTE_OFFSET is measured in bytes.
4552 1. Return an SSA_NAME whose value is the address of the memory location of
4553 the first vector of the data reference.
4554 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4555 these statement(s) which define the returned SSA_NAME.
4557 FORNOW: We are only handling array accesses with step 1. */
4560 vect_create_addr_base_for_vector_ref (gimple
*stmt
,
4561 gimple_seq
*new_stmt_list
,
4565 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
4566 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
4567 const char *base_name
;
4570 gimple_seq seq
= NULL
;
4572 tree step
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr
)));
4573 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
4574 innermost_loop_behavior
*drb
= vect_dr_behavior (dr
);
4576 tree data_ref_base
= unshare_expr (drb
->base_address
);
4577 tree base_offset
= unshare_expr (drb
->offset
);
4578 tree init
= unshare_expr (drb
->init
);
4581 base_name
= get_name (data_ref_base
);
4584 base_offset
= ssize_int (0);
4585 init
= ssize_int (0);
4586 base_name
= get_name (DR_REF (dr
));
4589 /* Create base_offset */
4590 base_offset
= size_binop (PLUS_EXPR
,
4591 fold_convert (sizetype
, base_offset
),
4592 fold_convert (sizetype
, init
));
4596 offset
= fold_build2 (MULT_EXPR
, sizetype
,
4597 fold_convert (sizetype
, offset
), step
);
4598 base_offset
= fold_build2 (PLUS_EXPR
, sizetype
,
4599 base_offset
, offset
);
4603 byte_offset
= fold_convert (sizetype
, byte_offset
);
4604 base_offset
= fold_build2 (PLUS_EXPR
, sizetype
,
4605 base_offset
, byte_offset
);
4608 /* base + base_offset */
4610 addr_base
= fold_build_pointer_plus (data_ref_base
, base_offset
);
4613 addr_base
= build1 (ADDR_EXPR
,
4614 build_pointer_type (TREE_TYPE (DR_REF (dr
))),
4615 unshare_expr (DR_REF (dr
)));
4618 vect_ptr_type
= build_pointer_type (STMT_VINFO_VECTYPE (stmt_info
));
4619 dest
= vect_get_new_vect_var (vect_ptr_type
, vect_pointer_var
, base_name
);
4620 addr_base
= force_gimple_operand (addr_base
, &seq
, true, dest
);
4621 gimple_seq_add_seq (new_stmt_list
, seq
);
4623 if (DR_PTR_INFO (dr
)
4624 && TREE_CODE (addr_base
) == SSA_NAME
4625 && !SSA_NAME_PTR_INFO (addr_base
))
4627 vect_duplicate_ssa_name_ptr_info (addr_base
, dr
);
4628 if (offset
|| byte_offset
)
4629 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base
));
4632 if (dump_enabled_p ())
4634 dump_printf_loc (MSG_NOTE
, vect_location
, "created ");
4635 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, addr_base
);
4636 dump_printf (MSG_NOTE
, "\n");
4643 /* Function vect_create_data_ref_ptr.
4645 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4646 location accessed in the loop by STMT, along with the def-use update
4647 chain to appropriately advance the pointer through the loop iterations.
4648 Also set aliasing information for the pointer. This pointer is used by
4649 the callers to this function to create a memory reference expression for
4650 vector load/store access.
4653 1. STMT: a stmt that references memory. Expected to be of the form
4654 GIMPLE_ASSIGN <name, data-ref> or
4655 GIMPLE_ASSIGN <data-ref, name>.
4656 2. AGGR_TYPE: the type of the reference, which should be either a vector
4658 3. AT_LOOP: the loop where the vector memref is to be created.
4659 4. OFFSET (optional): an offset to be added to the initial address accessed
4660 by the data-ref in STMT.
4661 5. BSI: location where the new stmts are to be placed if there is no loop
4662 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4663 pointing to the initial address.
4664 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4665 to the initial address accessed by the data-ref in STMT. This is
4666 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4668 8. IV_STEP (optional, defaults to NULL): the amount that should be added
4669 to the IV during each iteration of the loop. NULL says to move
4670 by one copy of AGGR_TYPE up or down, depending on the step of the
4674 1. Declare a new ptr to vector_type, and have it point to the base of the
4675 data reference (initial addressed accessed by the data reference).
4676 For example, for vector of type V8HI, the following code is generated:
4679 ap = (v8hi *)initial_address;
4681 if OFFSET is not supplied:
4682 initial_address = &a[init];
4683 if OFFSET is supplied:
4684 initial_address = &a[init + OFFSET];
4685 if BYTE_OFFSET is supplied:
4686 initial_address = &a[init] + BYTE_OFFSET;
4688 Return the initial_address in INITIAL_ADDRESS.
4690 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4691 update the pointer in each iteration of the loop.
4693 Return the increment stmt that updates the pointer in PTR_INCR.
4695 3. Set INV_P to true if the access pattern of the data reference in the
4696 vectorized loop is invariant. Set it to false otherwise.
4698 4. Return the pointer. */
4701 vect_create_data_ref_ptr (gimple
*stmt
, tree aggr_type
, struct loop
*at_loop
,
4702 tree offset
, tree
*initial_address
,
4703 gimple_stmt_iterator
*gsi
, gimple
**ptr_incr
,
4704 bool only_init
, bool *inv_p
, tree byte_offset
,
4707 const char *base_name
;
4708 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
4709 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
4710 struct loop
*loop
= NULL
;
4711 bool nested_in_vect_loop
= false;
4712 struct loop
*containing_loop
= NULL
;
4716 gimple_seq new_stmt_list
= NULL
;
4720 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
4722 gimple_stmt_iterator incr_gsi
;
4724 tree indx_before_incr
, indx_after_incr
;
4727 bb_vec_info bb_vinfo
= STMT_VINFO_BB_VINFO (stmt_info
);
4729 gcc_assert (iv_step
!= NULL_TREE
4730 || TREE_CODE (aggr_type
) == ARRAY_TYPE
4731 || TREE_CODE (aggr_type
) == VECTOR_TYPE
);
4735 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
4736 nested_in_vect_loop
= nested_in_vect_loop_p (loop
, stmt
);
4737 containing_loop
= (gimple_bb (stmt
))->loop_father
;
4738 pe
= loop_preheader_edge (loop
);
4742 gcc_assert (bb_vinfo
);
4747 /* Check the step (evolution) of the load in LOOP, and record
4748 whether it's invariant. */
4749 step
= vect_dr_behavior (dr
)->step
;
4750 if (integer_zerop (step
))
4755 /* Create an expression for the first address accessed by this load
4757 base_name
= get_name (DR_BASE_ADDRESS (dr
));
4759 if (dump_enabled_p ())
4761 tree dr_base_type
= TREE_TYPE (DR_BASE_OBJECT (dr
));
4762 dump_printf_loc (MSG_NOTE
, vect_location
,
4763 "create %s-pointer variable to type: ",
4764 get_tree_code_name (TREE_CODE (aggr_type
)));
4765 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, aggr_type
);
4766 if (TREE_CODE (dr_base_type
) == ARRAY_TYPE
)
4767 dump_printf (MSG_NOTE
, " vectorizing an array ref: ");
4768 else if (TREE_CODE (dr_base_type
) == VECTOR_TYPE
)
4769 dump_printf (MSG_NOTE
, " vectorizing a vector ref: ");
4770 else if (TREE_CODE (dr_base_type
) == RECORD_TYPE
)
4771 dump_printf (MSG_NOTE
, " vectorizing a record based array ref: ");
4773 dump_printf (MSG_NOTE
, " vectorizing a pointer ref: ");
4774 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_BASE_OBJECT (dr
));
4775 dump_printf (MSG_NOTE
, "\n");
4778 /* (1) Create the new aggregate-pointer variable.
4779 Vector and array types inherit the alias set of their component
4780 type by default so we need to use a ref-all pointer if the data
4781 reference does not conflict with the created aggregated data
4782 reference because it is not addressable. */
4783 bool need_ref_all
= false;
4784 if (!alias_sets_conflict_p (get_alias_set (aggr_type
),
4785 get_alias_set (DR_REF (dr
))))
4786 need_ref_all
= true;
4787 /* Likewise for any of the data references in the stmt group. */
4788 else if (DR_GROUP_SIZE (stmt_info
) > 1)
4790 gimple
*orig_stmt
= DR_GROUP_FIRST_ELEMENT (stmt_info
);
4793 stmt_vec_info sinfo
= vinfo_for_stmt (orig_stmt
);
4794 struct data_reference
*sdr
= STMT_VINFO_DATA_REF (sinfo
);
4795 if (!alias_sets_conflict_p (get_alias_set (aggr_type
),
4796 get_alias_set (DR_REF (sdr
))))
4798 need_ref_all
= true;
4801 orig_stmt
= DR_GROUP_NEXT_ELEMENT (sinfo
);
4805 aggr_ptr_type
= build_pointer_type_for_mode (aggr_type
, ptr_mode
,
4807 aggr_ptr
= vect_get_new_vect_var (aggr_ptr_type
, vect_pointer_var
, base_name
);
4810 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4811 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4812 def-use update cycles for the pointer: one relative to the outer-loop
4813 (LOOP), which is what steps (3) and (4) below do. The other is relative
4814 to the inner-loop (which is the inner-most loop containing the dataref),
4815 and this is done be step (5) below.
4817 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4818 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4819 redundant. Steps (3),(4) create the following:
4822 LOOP: vp1 = phi(vp0,vp2)
4828 If there is an inner-loop nested in loop, then step (5) will also be
4829 applied, and an additional update in the inner-loop will be created:
4832 LOOP: vp1 = phi(vp0,vp2)
4834 inner: vp3 = phi(vp1,vp4)
4835 vp4 = vp3 + inner_step
4841 /* (2) Calculate the initial address of the aggregate-pointer, and set
4842 the aggregate-pointer to point to it before the loop. */
4844 /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */
4846 new_temp
= vect_create_addr_base_for_vector_ref (stmt
, &new_stmt_list
,
4847 offset
, byte_offset
);
4852 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, new_stmt_list
);
4853 gcc_assert (!new_bb
);
4856 gsi_insert_seq_before (gsi
, new_stmt_list
, GSI_SAME_STMT
);
4859 *initial_address
= new_temp
;
4860 aggr_ptr_init
= new_temp
;
4862 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4863 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4864 inner-loop nested in LOOP (during outer-loop vectorization). */
4866 /* No update in loop is required. */
4867 if (only_init
&& (!loop_vinfo
|| at_loop
== loop
))
4868 aptr
= aggr_ptr_init
;
4871 if (iv_step
== NULL_TREE
)
4873 /* The step of the aggregate pointer is the type size. */
4874 iv_step
= TYPE_SIZE_UNIT (aggr_type
);
4875 /* One exception to the above is when the scalar step of the load in
4876 LOOP is zero. In this case the step here is also zero. */
4878 iv_step
= size_zero_node
;
4879 else if (tree_int_cst_sgn (step
) == -1)
4880 iv_step
= fold_build1 (NEGATE_EXPR
, TREE_TYPE (iv_step
), iv_step
);
4883 standard_iv_increment_position (loop
, &incr_gsi
, &insert_after
);
4885 create_iv (aggr_ptr_init
,
4886 fold_convert (aggr_ptr_type
, iv_step
),
4887 aggr_ptr
, loop
, &incr_gsi
, insert_after
,
4888 &indx_before_incr
, &indx_after_incr
);
4889 incr
= gsi_stmt (incr_gsi
);
4890 set_vinfo_for_stmt (incr
, new_stmt_vec_info (incr
, loop_vinfo
));
4892 /* Copy the points-to information if it exists. */
4893 if (DR_PTR_INFO (dr
))
4895 vect_duplicate_ssa_name_ptr_info (indx_before_incr
, dr
);
4896 vect_duplicate_ssa_name_ptr_info (indx_after_incr
, dr
);
4901 aptr
= indx_before_incr
;
4904 if (!nested_in_vect_loop
|| only_init
)
4908 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4909 nested in LOOP, if exists. */
4911 gcc_assert (nested_in_vect_loop
);
4914 standard_iv_increment_position (containing_loop
, &incr_gsi
,
4916 create_iv (aptr
, fold_convert (aggr_ptr_type
, DR_STEP (dr
)), aggr_ptr
,
4917 containing_loop
, &incr_gsi
, insert_after
, &indx_before_incr
,
4919 incr
= gsi_stmt (incr_gsi
);
4920 set_vinfo_for_stmt (incr
, new_stmt_vec_info (incr
, loop_vinfo
));
4922 /* Copy the points-to information if it exists. */
4923 if (DR_PTR_INFO (dr
))
4925 vect_duplicate_ssa_name_ptr_info (indx_before_incr
, dr
);
4926 vect_duplicate_ssa_name_ptr_info (indx_after_incr
, dr
);
4931 return indx_before_incr
;
4938 /* Function bump_vector_ptr
4940 Increment a pointer (to a vector type) by vector-size. If requested,
4941 i.e. if PTR-INCR is given, then also connect the new increment stmt
4942 to the existing def-use update-chain of the pointer, by modifying
4943 the PTR_INCR as illustrated below:
4945 The pointer def-use update-chain before this function:
4946 DATAREF_PTR = phi (p_0, p_2)
4948 PTR_INCR: p_2 = DATAREF_PTR + step
4950 The pointer def-use update-chain after this function:
4951 DATAREF_PTR = phi (p_0, p_2)
4953 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4955 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4958 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4960 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4961 the loop. The increment amount across iterations is expected
4963 BSI - location where the new update stmt is to be placed.
4964 STMT - the original scalar memory-access stmt that is being vectorized.
4965 BUMP - optional. The offset by which to bump the pointer. If not given,
4966 the offset is assumed to be vector_size.
4968 Output: Return NEW_DATAREF_PTR as illustrated above.
4973 bump_vector_ptr (tree dataref_ptr
, gimple
*ptr_incr
, gimple_stmt_iterator
*gsi
,
4974 gimple
*stmt
, tree bump
)
4976 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
4977 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
4978 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
4979 tree update
= TYPE_SIZE_UNIT (vectype
);
4982 use_operand_p use_p
;
4983 tree new_dataref_ptr
;
4988 if (TREE_CODE (dataref_ptr
) == SSA_NAME
)
4989 new_dataref_ptr
= copy_ssa_name (dataref_ptr
);
4991 new_dataref_ptr
= make_ssa_name (TREE_TYPE (dataref_ptr
));
4992 incr_stmt
= gimple_build_assign (new_dataref_ptr
, POINTER_PLUS_EXPR
,
4993 dataref_ptr
, update
);
4994 vect_finish_stmt_generation (stmt
, incr_stmt
, gsi
);
4996 /* Copy the points-to information if it exists. */
4997 if (DR_PTR_INFO (dr
))
4999 duplicate_ssa_name_ptr_info (new_dataref_ptr
, DR_PTR_INFO (dr
));
5000 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr
));
5004 return new_dataref_ptr
;
5006 /* Update the vector-pointer's cross-iteration increment. */
5007 FOR_EACH_SSA_USE_OPERAND (use_p
, ptr_incr
, iter
, SSA_OP_USE
)
5009 tree use
= USE_FROM_PTR (use_p
);
5011 if (use
== dataref_ptr
)
5012 SET_USE (use_p
, new_dataref_ptr
);
5014 gcc_assert (operand_equal_p (use
, update
, 0));
5017 return new_dataref_ptr
;
5021 /* Copy memory reference info such as base/clique from the SRC reference
5022 to the DEST MEM_REF. */
5025 vect_copy_ref_info (tree dest
, tree src
)
5027 if (TREE_CODE (dest
) != MEM_REF
)
5030 tree src_base
= src
;
5031 while (handled_component_p (src_base
))
5032 src_base
= TREE_OPERAND (src_base
, 0);
5033 if (TREE_CODE (src_base
) != MEM_REF
5034 && TREE_CODE (src_base
) != TARGET_MEM_REF
)
5037 MR_DEPENDENCE_CLIQUE (dest
) = MR_DEPENDENCE_CLIQUE (src_base
);
5038 MR_DEPENDENCE_BASE (dest
) = MR_DEPENDENCE_BASE (src_base
);
5042 /* Function vect_create_destination_var.
5044 Create a new temporary of type VECTYPE. */
5047 vect_create_destination_var (tree scalar_dest
, tree vectype
)
5053 enum vect_var_kind kind
;
5056 ? VECTOR_BOOLEAN_TYPE_P (vectype
)
5060 type
= vectype
? vectype
: TREE_TYPE (scalar_dest
);
5062 gcc_assert (TREE_CODE (scalar_dest
) == SSA_NAME
);
5064 name
= get_name (scalar_dest
);
5066 new_name
= xasprintf ("%s_%u", name
, SSA_NAME_VERSION (scalar_dest
));
5068 new_name
= xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest
));
5069 vec_dest
= vect_get_new_vect_var (type
, kind
, new_name
);
5075 /* Function vect_grouped_store_supported.
5077 Returns TRUE if interleave high and interleave low permutations
5078 are supported, and FALSE otherwise. */
5081 vect_grouped_store_supported (tree vectype
, unsigned HOST_WIDE_INT count
)
5083 machine_mode mode
= TYPE_MODE (vectype
);
5085 /* vect_permute_store_chain requires the group size to be equal to 3 or
5086 be a power of two. */
5087 if (count
!= 3 && exact_log2 (count
) == -1)
5089 if (dump_enabled_p ())
5090 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5091 "the size of the group of accesses"
5092 " is not a power of 2 or not eqaul to 3\n");
5096 /* Check that the permutation is supported. */
5097 if (VECTOR_MODE_P (mode
))
5102 unsigned int j0
= 0, j1
= 0, j2
= 0;
5106 if (!GET_MODE_NUNITS (mode
).is_constant (&nelt
))
5108 if (dump_enabled_p ())
5109 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5110 "cannot handle groups of 3 stores for"
5111 " variable-length vectors\n");
5115 vec_perm_builder
sel (nelt
, nelt
, 1);
5116 sel
.quick_grow (nelt
);
5117 vec_perm_indices indices
;
5118 for (j
= 0; j
< 3; j
++)
5120 int nelt0
= ((3 - j
) * nelt
) % 3;
5121 int nelt1
= ((3 - j
) * nelt
+ 1) % 3;
5122 int nelt2
= ((3 - j
) * nelt
+ 2) % 3;
5123 for (i
= 0; i
< nelt
; i
++)
5125 if (3 * i
+ nelt0
< nelt
)
5126 sel
[3 * i
+ nelt0
] = j0
++;
5127 if (3 * i
+ nelt1
< nelt
)
5128 sel
[3 * i
+ nelt1
] = nelt
+ j1
++;
5129 if (3 * i
+ nelt2
< nelt
)
5130 sel
[3 * i
+ nelt2
] = 0;
5132 indices
.new_vector (sel
, 2, nelt
);
5133 if (!can_vec_perm_const_p (mode
, indices
))
5135 if (dump_enabled_p ())
5136 dump_printf (MSG_MISSED_OPTIMIZATION
,
5137 "permutation op not supported by target.\n");
5141 for (i
= 0; i
< nelt
; i
++)
5143 if (3 * i
+ nelt0
< nelt
)
5144 sel
[3 * i
+ nelt0
] = 3 * i
+ nelt0
;
5145 if (3 * i
+ nelt1
< nelt
)
5146 sel
[3 * i
+ nelt1
] = 3 * i
+ nelt1
;
5147 if (3 * i
+ nelt2
< nelt
)
5148 sel
[3 * i
+ nelt2
] = nelt
+ j2
++;
5150 indices
.new_vector (sel
, 2, nelt
);
5151 if (!can_vec_perm_const_p (mode
, indices
))
5153 if (dump_enabled_p ())
5154 dump_printf (MSG_MISSED_OPTIMIZATION
,
5155 "permutation op not supported by target.\n");
5163 /* If length is not equal to 3 then only power of 2 is supported. */
5164 gcc_assert (pow2p_hwi (count
));
5165 poly_uint64 nelt
= GET_MODE_NUNITS (mode
);
5167 /* The encoding has 2 interleaved stepped patterns. */
5168 vec_perm_builder
sel (nelt
, 2, 3);
5170 for (i
= 0; i
< 3; i
++)
5173 sel
[i
* 2 + 1] = i
+ nelt
;
5175 vec_perm_indices
indices (sel
, 2, nelt
);
5176 if (can_vec_perm_const_p (mode
, indices
))
5178 for (i
= 0; i
< 6; i
++)
5179 sel
[i
] += exact_div (nelt
, 2);
5180 indices
.new_vector (sel
, 2, nelt
);
5181 if (can_vec_perm_const_p (mode
, indices
))
5187 if (dump_enabled_p ())
5188 dump_printf (MSG_MISSED_OPTIMIZATION
,
5189 "permutaion op not supported by target.\n");
5194 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5195 type VECTYPE. MASKED_P says whether the masked form is needed. */
5198 vect_store_lanes_supported (tree vectype
, unsigned HOST_WIDE_INT count
,
5202 return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5203 vec_mask_store_lanes_optab
,
5206 return vect_lanes_optab_supported_p ("vec_store_lanes",
5207 vec_store_lanes_optab
,
5212 /* Function vect_permute_store_chain.
5214 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5215 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5216 the data correctly for the stores. Return the final references for stores
5219 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5220 The input is 4 vectors each containing 8 elements. We assign a number to
5221 each element, the input sequence is:
5223 1st vec: 0 1 2 3 4 5 6 7
5224 2nd vec: 8 9 10 11 12 13 14 15
5225 3rd vec: 16 17 18 19 20 21 22 23
5226 4th vec: 24 25 26 27 28 29 30 31
5228 The output sequence should be:
5230 1st vec: 0 8 16 24 1 9 17 25
5231 2nd vec: 2 10 18 26 3 11 19 27
5232 3rd vec: 4 12 20 28 5 13 21 30
5233 4th vec: 6 14 22 30 7 15 23 31
5235 i.e., we interleave the contents of the four vectors in their order.
5237 We use interleave_high/low instructions to create such output. The input of
5238 each interleave_high/low operation is two vectors:
5241 the even elements of the result vector are obtained left-to-right from the
5242 high/low elements of the first vector. The odd elements of the result are
5243 obtained left-to-right from the high/low elements of the second vector.
5244 The output of interleave_high will be: 0 4 1 5
5245 and of interleave_low: 2 6 3 7
5248 The permutation is done in log LENGTH stages. In each stage interleave_high
5249 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5250 where the first argument is taken from the first half of DR_CHAIN and the
5251 second argument from it's second half.
5254 I1: interleave_high (1st vec, 3rd vec)
5255 I2: interleave_low (1st vec, 3rd vec)
5256 I3: interleave_high (2nd vec, 4th vec)
5257 I4: interleave_low (2nd vec, 4th vec)
5259 The output for the first stage is:
5261 I1: 0 16 1 17 2 18 3 19
5262 I2: 4 20 5 21 6 22 7 23
5263 I3: 8 24 9 25 10 26 11 27
5264 I4: 12 28 13 29 14 30 15 31
5266 The output of the second stage, i.e. the final result is:
5268 I1: 0 8 16 24 1 9 17 25
5269 I2: 2 10 18 26 3 11 19 27
5270 I3: 4 12 20 28 5 13 21 30
5271 I4: 6 14 22 30 7 15 23 31. */
5274 vect_permute_store_chain (vec
<tree
> dr_chain
,
5275 unsigned int length
,
5277 gimple_stmt_iterator
*gsi
,
5278 vec
<tree
> *result_chain
)
5280 tree vect1
, vect2
, high
, low
;
5282 tree vectype
= STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt
));
5283 tree perm_mask_low
, perm_mask_high
;
5285 tree perm3_mask_low
, perm3_mask_high
;
5286 unsigned int i
, j
, n
, log_length
= exact_log2 (length
);
5288 result_chain
->quick_grow (length
);
5289 memcpy (result_chain
->address (), dr_chain
.address (),
5290 length
* sizeof (tree
));
5294 /* vect_grouped_store_supported ensures that this is constant. */
5295 unsigned int nelt
= TYPE_VECTOR_SUBPARTS (vectype
).to_constant ();
5296 unsigned int j0
= 0, j1
= 0, j2
= 0;
5298 vec_perm_builder
sel (nelt
, nelt
, 1);
5299 sel
.quick_grow (nelt
);
5300 vec_perm_indices indices
;
5301 for (j
= 0; j
< 3; j
++)
5303 int nelt0
= ((3 - j
) * nelt
) % 3;
5304 int nelt1
= ((3 - j
) * nelt
+ 1) % 3;
5305 int nelt2
= ((3 - j
) * nelt
+ 2) % 3;
5307 for (i
= 0; i
< nelt
; i
++)
5309 if (3 * i
+ nelt0
< nelt
)
5310 sel
[3 * i
+ nelt0
] = j0
++;
5311 if (3 * i
+ nelt1
< nelt
)
5312 sel
[3 * i
+ nelt1
] = nelt
+ j1
++;
5313 if (3 * i
+ nelt2
< nelt
)
5314 sel
[3 * i
+ nelt2
] = 0;
5316 indices
.new_vector (sel
, 2, nelt
);
5317 perm3_mask_low
= vect_gen_perm_mask_checked (vectype
, indices
);
5319 for (i
= 0; i
< nelt
; i
++)
5321 if (3 * i
+ nelt0
< nelt
)
5322 sel
[3 * i
+ nelt0
] = 3 * i
+ nelt0
;
5323 if (3 * i
+ nelt1
< nelt
)
5324 sel
[3 * i
+ nelt1
] = 3 * i
+ nelt1
;
5325 if (3 * i
+ nelt2
< nelt
)
5326 sel
[3 * i
+ nelt2
] = nelt
+ j2
++;
5328 indices
.new_vector (sel
, 2, nelt
);
5329 perm3_mask_high
= vect_gen_perm_mask_checked (vectype
, indices
);
5331 vect1
= dr_chain
[0];
5332 vect2
= dr_chain
[1];
5334 /* Create interleaving stmt:
5335 low = VEC_PERM_EXPR <vect1, vect2,
5336 {j, nelt, *, j + 1, nelt + j + 1, *,
5337 j + 2, nelt + j + 2, *, ...}> */
5338 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3_low");
5339 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, vect1
,
5340 vect2
, perm3_mask_low
);
5341 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5344 vect2
= dr_chain
[2];
5345 /* Create interleaving stmt:
5346 low = VEC_PERM_EXPR <vect1, vect2,
5347 {0, 1, nelt + j, 3, 4, nelt + j + 1,
5348 6, 7, nelt + j + 2, ...}> */
5349 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3_high");
5350 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, vect1
,
5351 vect2
, perm3_mask_high
);
5352 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5353 (*result_chain
)[j
] = data_ref
;
5358 /* If length is not equal to 3 then only power of 2 is supported. */
5359 gcc_assert (pow2p_hwi (length
));
5361 /* The encoding has 2 interleaved stepped patterns. */
5362 poly_uint64 nelt
= TYPE_VECTOR_SUBPARTS (vectype
);
5363 vec_perm_builder
sel (nelt
, 2, 3);
5365 for (i
= 0; i
< 3; i
++)
5368 sel
[i
* 2 + 1] = i
+ nelt
;
5370 vec_perm_indices
indices (sel
, 2, nelt
);
5371 perm_mask_high
= vect_gen_perm_mask_checked (vectype
, indices
);
5373 for (i
= 0; i
< 6; i
++)
5374 sel
[i
] += exact_div (nelt
, 2);
5375 indices
.new_vector (sel
, 2, nelt
);
5376 perm_mask_low
= vect_gen_perm_mask_checked (vectype
, indices
);
5378 for (i
= 0, n
= log_length
; i
< n
; i
++)
5380 for (j
= 0; j
< length
/2; j
++)
5382 vect1
= dr_chain
[j
];
5383 vect2
= dr_chain
[j
+length
/2];
5385 /* Create interleaving stmt:
5386 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5388 high
= make_temp_ssa_name (vectype
, NULL
, "vect_inter_high");
5389 perm_stmt
= gimple_build_assign (high
, VEC_PERM_EXPR
, vect1
,
5390 vect2
, perm_mask_high
);
5391 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5392 (*result_chain
)[2*j
] = high
;
5394 /* Create interleaving stmt:
5395 low = VEC_PERM_EXPR <vect1, vect2,
5396 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5398 low
= make_temp_ssa_name (vectype
, NULL
, "vect_inter_low");
5399 perm_stmt
= gimple_build_assign (low
, VEC_PERM_EXPR
, vect1
,
5400 vect2
, perm_mask_low
);
5401 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5402 (*result_chain
)[2*j
+1] = low
;
5404 memcpy (dr_chain
.address (), result_chain
->address (),
5405 length
* sizeof (tree
));
5410 /* Function vect_setup_realignment
5412 This function is called when vectorizing an unaligned load using
5413 the dr_explicit_realign[_optimized] scheme.
5414 This function generates the following code at the loop prolog:
5417 x msq_init = *(floor(p)); # prolog load
5418 realignment_token = call target_builtin;
5420 x msq = phi (msq_init, ---)
5422 The stmts marked with x are generated only for the case of
5423 dr_explicit_realign_optimized.
5425 The code above sets up a new (vector) pointer, pointing to the first
5426 location accessed by STMT, and a "floor-aligned" load using that pointer.
5427 It also generates code to compute the "realignment-token" (if the relevant
5428 target hook was defined), and creates a phi-node at the loop-header bb
5429 whose arguments are the result of the prolog-load (created by this
5430 function) and the result of a load that takes place in the loop (to be
5431 created by the caller to this function).
5433 For the case of dr_explicit_realign_optimized:
5434 The caller to this function uses the phi-result (msq) to create the
5435 realignment code inside the loop, and sets up the missing phi argument,
5438 msq = phi (msq_init, lsq)
5439 lsq = *(floor(p')); # load in loop
5440 result = realign_load (msq, lsq, realignment_token);
5442 For the case of dr_explicit_realign:
5444 msq = *(floor(p)); # load in loop
5446 lsq = *(floor(p')); # load in loop
5447 result = realign_load (msq, lsq, realignment_token);
5450 STMT - (scalar) load stmt to be vectorized. This load accesses
5451 a memory location that may be unaligned.
5452 BSI - place where new code is to be inserted.
5453 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5457 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5458 target hook, if defined.
5459 Return value - the result of the loop-header phi node. */
5462 vect_setup_realignment (gimple
*stmt
, gimple_stmt_iterator
*gsi
,
5463 tree
*realignment_token
,
5464 enum dr_alignment_support alignment_support_scheme
,
5466 struct loop
**at_loop
)
5468 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
5469 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
5470 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
5471 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
5472 struct loop
*loop
= NULL
;
5474 tree scalar_dest
= gimple_assign_lhs (stmt
);
5480 tree msq_init
= NULL_TREE
;
5483 tree msq
= NULL_TREE
;
5484 gimple_seq stmts
= NULL
;
5486 bool compute_in_loop
= false;
5487 bool nested_in_vect_loop
= false;
5488 struct loop
*containing_loop
= (gimple_bb (stmt
))->loop_father
;
5489 struct loop
*loop_for_initial_load
= NULL
;
5493 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
5494 nested_in_vect_loop
= nested_in_vect_loop_p (loop
, stmt
);
5497 gcc_assert (alignment_support_scheme
== dr_explicit_realign
5498 || alignment_support_scheme
== dr_explicit_realign_optimized
);
5500 /* We need to generate three things:
5501 1. the misalignment computation
5502 2. the extra vector load (for the optimized realignment scheme).
5503 3. the phi node for the two vectors from which the realignment is
5504 done (for the optimized realignment scheme). */
5506 /* 1. Determine where to generate the misalignment computation.
5508 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5509 calculation will be generated by this function, outside the loop (in the
5510 preheader). Otherwise, INIT_ADDR had already been computed for us by the
5511 caller, inside the loop.
5513 Background: If the misalignment remains fixed throughout the iterations of
5514 the loop, then both realignment schemes are applicable, and also the
5515 misalignment computation can be done outside LOOP. This is because we are
5516 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5517 are a multiple of VS (the Vector Size), and therefore the misalignment in
5518 different vectorized LOOP iterations is always the same.
5519 The problem arises only if the memory access is in an inner-loop nested
5520 inside LOOP, which is now being vectorized using outer-loop vectorization.
5521 This is the only case when the misalignment of the memory access may not
5522 remain fixed throughout the iterations of the inner-loop (as explained in
5523 detail in vect_supportable_dr_alignment). In this case, not only is the
5524 optimized realignment scheme not applicable, but also the misalignment
5525 computation (and generation of the realignment token that is passed to
5526 REALIGN_LOAD) have to be done inside the loop.
5528 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5529 or not, which in turn determines if the misalignment is computed inside
5530 the inner-loop, or outside LOOP. */
5532 if (init_addr
!= NULL_TREE
|| !loop_vinfo
)
5534 compute_in_loop
= true;
5535 gcc_assert (alignment_support_scheme
== dr_explicit_realign
);
5539 /* 2. Determine where to generate the extra vector load.
5541 For the optimized realignment scheme, instead of generating two vector
5542 loads in each iteration, we generate a single extra vector load in the
5543 preheader of the loop, and in each iteration reuse the result of the
5544 vector load from the previous iteration. In case the memory access is in
5545 an inner-loop nested inside LOOP, which is now being vectorized using
5546 outer-loop vectorization, we need to determine whether this initial vector
5547 load should be generated at the preheader of the inner-loop, or can be
5548 generated at the preheader of LOOP. If the memory access has no evolution
5549 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5550 to be generated inside LOOP (in the preheader of the inner-loop). */
5552 if (nested_in_vect_loop
)
5554 tree outerloop_step
= STMT_VINFO_DR_STEP (stmt_info
);
5555 bool invariant_in_outerloop
=
5556 (tree_int_cst_compare (outerloop_step
, size_zero_node
) == 0);
5557 loop_for_initial_load
= (invariant_in_outerloop
? loop
: loop
->inner
);
5560 loop_for_initial_load
= loop
;
5562 *at_loop
= loop_for_initial_load
;
5564 if (loop_for_initial_load
)
5565 pe
= loop_preheader_edge (loop_for_initial_load
);
5567 /* 3. For the case of the optimized realignment, create the first vector
5568 load at the loop preheader. */
5570 if (alignment_support_scheme
== dr_explicit_realign_optimized
)
5572 /* Create msq_init = *(floor(p1)) in the loop preheader */
5575 gcc_assert (!compute_in_loop
);
5576 vec_dest
= vect_create_destination_var (scalar_dest
, vectype
);
5577 ptr
= vect_create_data_ref_ptr (stmt
, vectype
, loop_for_initial_load
,
5578 NULL_TREE
, &init_addr
, NULL
, &inc
,
5580 if (TREE_CODE (ptr
) == SSA_NAME
)
5581 new_temp
= copy_ssa_name (ptr
);
5583 new_temp
= make_ssa_name (TREE_TYPE (ptr
));
5584 unsigned int align
= DR_TARGET_ALIGNMENT (dr
);
5585 new_stmt
= gimple_build_assign
5586 (new_temp
, BIT_AND_EXPR
, ptr
,
5587 build_int_cst (TREE_TYPE (ptr
), -(HOST_WIDE_INT
) align
));
5588 new_bb
= gsi_insert_on_edge_immediate (pe
, new_stmt
);
5589 gcc_assert (!new_bb
);
5591 = build2 (MEM_REF
, TREE_TYPE (vec_dest
), new_temp
,
5592 build_int_cst (reference_alias_ptr_type (DR_REF (dr
)), 0));
5593 vect_copy_ref_info (data_ref
, DR_REF (dr
));
5594 new_stmt
= gimple_build_assign (vec_dest
, data_ref
);
5595 new_temp
= make_ssa_name (vec_dest
, new_stmt
);
5596 gimple_assign_set_lhs (new_stmt
, new_temp
);
5599 new_bb
= gsi_insert_on_edge_immediate (pe
, new_stmt
);
5600 gcc_assert (!new_bb
);
5603 gsi_insert_before (gsi
, new_stmt
, GSI_SAME_STMT
);
5605 msq_init
= gimple_assign_lhs (new_stmt
);
5608 /* 4. Create realignment token using a target builtin, if available.
5609 It is done either inside the containing loop, or before LOOP (as
5610 determined above). */
5612 if (targetm
.vectorize
.builtin_mask_for_load
)
5617 /* Compute INIT_ADDR - the initial addressed accessed by this memref. */
5620 /* Generate the INIT_ADDR computation outside LOOP. */
5621 init_addr
= vect_create_addr_base_for_vector_ref (stmt
, &stmts
,
5625 pe
= loop_preheader_edge (loop
);
5626 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, stmts
);
5627 gcc_assert (!new_bb
);
5630 gsi_insert_seq_before (gsi
, stmts
, GSI_SAME_STMT
);
5633 builtin_decl
= targetm
.vectorize
.builtin_mask_for_load ();
5634 new_stmt
= gimple_build_call (builtin_decl
, 1, init_addr
);
5636 vect_create_destination_var (scalar_dest
,
5637 gimple_call_return_type (new_stmt
));
5638 new_temp
= make_ssa_name (vec_dest
, new_stmt
);
5639 gimple_call_set_lhs (new_stmt
, new_temp
);
5641 if (compute_in_loop
)
5642 gsi_insert_before (gsi
, new_stmt
, GSI_SAME_STMT
);
5645 /* Generate the misalignment computation outside LOOP. */
5646 pe
= loop_preheader_edge (loop
);
5647 new_bb
= gsi_insert_on_edge_immediate (pe
, new_stmt
);
5648 gcc_assert (!new_bb
);
5651 *realignment_token
= gimple_call_lhs (new_stmt
);
5653 /* The result of the CALL_EXPR to this builtin is determined from
5654 the value of the parameter and no global variables are touched
5655 which makes the builtin a "const" function. Requiring the
5656 builtin to have the "const" attribute makes it unnecessary
5657 to call mark_call_clobbered. */
5658 gcc_assert (TREE_READONLY (builtin_decl
));
5661 if (alignment_support_scheme
== dr_explicit_realign
)
5664 gcc_assert (!compute_in_loop
);
5665 gcc_assert (alignment_support_scheme
== dr_explicit_realign_optimized
);
5668 /* 5. Create msq = phi <msq_init, lsq> in loop */
5670 pe
= loop_preheader_edge (containing_loop
);
5671 vec_dest
= vect_create_destination_var (scalar_dest
, vectype
);
5672 msq
= make_ssa_name (vec_dest
);
5673 phi_stmt
= create_phi_node (msq
, containing_loop
->header
);
5674 add_phi_arg (phi_stmt
, msq_init
, pe
, UNKNOWN_LOCATION
);
5680 /* Function vect_grouped_load_supported.
5682 COUNT is the size of the load group (the number of statements plus the
5683 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5684 only one statement, with a gap of COUNT - 1.
5686 Returns true if a suitable permute exists. */
5689 vect_grouped_load_supported (tree vectype
, bool single_element_p
,
5690 unsigned HOST_WIDE_INT count
)
5692 machine_mode mode
= TYPE_MODE (vectype
);
5694 /* If this is single-element interleaving with an element distance
5695 that leaves unused vector loads around punt - we at least create
5696 very sub-optimal code in that case (and blow up memory,
5698 if (single_element_p
&& maybe_gt (count
, TYPE_VECTOR_SUBPARTS (vectype
)))
5700 if (dump_enabled_p ())
5701 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5702 "single-element interleaving not supported "
5703 "for not adjacent vector loads\n");
5707 /* vect_permute_load_chain requires the group size to be equal to 3 or
5708 be a power of two. */
5709 if (count
!= 3 && exact_log2 (count
) == -1)
5711 if (dump_enabled_p ())
5712 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5713 "the size of the group of accesses"
5714 " is not a power of 2 or not equal to 3\n");
5718 /* Check that the permutation is supported. */
5719 if (VECTOR_MODE_P (mode
))
5725 if (!GET_MODE_NUNITS (mode
).is_constant (&nelt
))
5727 if (dump_enabled_p ())
5728 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5729 "cannot handle groups of 3 loads for"
5730 " variable-length vectors\n");
5734 vec_perm_builder
sel (nelt
, nelt
, 1);
5735 sel
.quick_grow (nelt
);
5736 vec_perm_indices indices
;
5738 for (k
= 0; k
< 3; k
++)
5740 for (i
= 0; i
< nelt
; i
++)
5741 if (3 * i
+ k
< 2 * nelt
)
5745 indices
.new_vector (sel
, 2, nelt
);
5746 if (!can_vec_perm_const_p (mode
, indices
))
5748 if (dump_enabled_p ())
5749 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5750 "shuffle of 3 loads is not supported by"
5754 for (i
= 0, j
= 0; i
< nelt
; i
++)
5755 if (3 * i
+ k
< 2 * nelt
)
5758 sel
[i
] = nelt
+ ((nelt
+ k
) % 3) + 3 * (j
++);
5759 indices
.new_vector (sel
, 2, nelt
);
5760 if (!can_vec_perm_const_p (mode
, indices
))
5762 if (dump_enabled_p ())
5763 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5764 "shuffle of 3 loads is not supported by"
5773 /* If length is not equal to 3 then only power of 2 is supported. */
5774 gcc_assert (pow2p_hwi (count
));
5775 poly_uint64 nelt
= GET_MODE_NUNITS (mode
);
5777 /* The encoding has a single stepped pattern. */
5778 vec_perm_builder
sel (nelt
, 1, 3);
5780 for (i
= 0; i
< 3; i
++)
5782 vec_perm_indices
indices (sel
, 2, nelt
);
5783 if (can_vec_perm_const_p (mode
, indices
))
5785 for (i
= 0; i
< 3; i
++)
5787 indices
.new_vector (sel
, 2, nelt
);
5788 if (can_vec_perm_const_p (mode
, indices
))
5794 if (dump_enabled_p ())
5795 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5796 "extract even/odd not supported by target\n");
5800 /* Return TRUE if vec_{masked_}load_lanes is available for COUNT vectors of
5801 type VECTYPE. MASKED_P says whether the masked form is needed. */
5804 vect_load_lanes_supported (tree vectype
, unsigned HOST_WIDE_INT count
,
5808 return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
5809 vec_mask_load_lanes_optab
,
5812 return vect_lanes_optab_supported_p ("vec_load_lanes",
5813 vec_load_lanes_optab
,
5817 /* Function vect_permute_load_chain.
5819 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5820 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5821 the input data correctly. Return the final references for loads in
5824 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5825 The input is 4 vectors each containing 8 elements. We assign a number to each
5826 element, the input sequence is:
5828 1st vec: 0 1 2 3 4 5 6 7
5829 2nd vec: 8 9 10 11 12 13 14 15
5830 3rd vec: 16 17 18 19 20 21 22 23
5831 4th vec: 24 25 26 27 28 29 30 31
5833 The output sequence should be:
5835 1st vec: 0 4 8 12 16 20 24 28
5836 2nd vec: 1 5 9 13 17 21 25 29
5837 3rd vec: 2 6 10 14 18 22 26 30
5838 4th vec: 3 7 11 15 19 23 27 31
5840 i.e., the first output vector should contain the first elements of each
5841 interleaving group, etc.
5843 We use extract_even/odd instructions to create such output. The input of
5844 each extract_even/odd operation is two vectors
5848 and the output is the vector of extracted even/odd elements. The output of
5849 extract_even will be: 0 2 4 6
5850 and of extract_odd: 1 3 5 7
5853 The permutation is done in log LENGTH stages. In each stage extract_even
5854 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5855 their order. In our example,
5857 E1: extract_even (1st vec, 2nd vec)
5858 E2: extract_odd (1st vec, 2nd vec)
5859 E3: extract_even (3rd vec, 4th vec)
5860 E4: extract_odd (3rd vec, 4th vec)
5862 The output for the first stage will be:
5864 E1: 0 2 4 6 8 10 12 14
5865 E2: 1 3 5 7 9 11 13 15
5866 E3: 16 18 20 22 24 26 28 30
5867 E4: 17 19 21 23 25 27 29 31
5869 In order to proceed and create the correct sequence for the next stage (or
5870 for the correct output, if the second stage is the last one, as in our
5871 example), we first put the output of extract_even operation and then the
5872 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5873 The input for the second stage is:
5875 1st vec (E1): 0 2 4 6 8 10 12 14
5876 2nd vec (E3): 16 18 20 22 24 26 28 30
5877 3rd vec (E2): 1 3 5 7 9 11 13 15
5878 4th vec (E4): 17 19 21 23 25 27 29 31
5880 The output of the second stage:
5882 E1: 0 4 8 12 16 20 24 28
5883 E2: 2 6 10 14 18 22 26 30
5884 E3: 1 5 9 13 17 21 25 29
5885 E4: 3 7 11 15 19 23 27 31
5887 And RESULT_CHAIN after reordering:
5889 1st vec (E1): 0 4 8 12 16 20 24 28
5890 2nd vec (E3): 1 5 9 13 17 21 25 29
5891 3rd vec (E2): 2 6 10 14 18 22 26 30
5892 4th vec (E4): 3 7 11 15 19 23 27 31. */
5895 vect_permute_load_chain (vec
<tree
> dr_chain
,
5896 unsigned int length
,
5898 gimple_stmt_iterator
*gsi
,
5899 vec
<tree
> *result_chain
)
5901 tree data_ref
, first_vect
, second_vect
;
5902 tree perm_mask_even
, perm_mask_odd
;
5903 tree perm3_mask_low
, perm3_mask_high
;
5905 tree vectype
= STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt
));
5906 unsigned int i
, j
, log_length
= exact_log2 (length
);
5908 result_chain
->quick_grow (length
);
5909 memcpy (result_chain
->address (), dr_chain
.address (),
5910 length
* sizeof (tree
));
5914 /* vect_grouped_load_supported ensures that this is constant. */
5915 unsigned nelt
= TYPE_VECTOR_SUBPARTS (vectype
).to_constant ();
5918 vec_perm_builder
sel (nelt
, nelt
, 1);
5919 sel
.quick_grow (nelt
);
5920 vec_perm_indices indices
;
5921 for (k
= 0; k
< 3; k
++)
5923 for (i
= 0; i
< nelt
; i
++)
5924 if (3 * i
+ k
< 2 * nelt
)
5928 indices
.new_vector (sel
, 2, nelt
);
5929 perm3_mask_low
= vect_gen_perm_mask_checked (vectype
, indices
);
5931 for (i
= 0, j
= 0; i
< nelt
; i
++)
5932 if (3 * i
+ k
< 2 * nelt
)
5935 sel
[i
] = nelt
+ ((nelt
+ k
) % 3) + 3 * (j
++);
5936 indices
.new_vector (sel
, 2, nelt
);
5937 perm3_mask_high
= vect_gen_perm_mask_checked (vectype
, indices
);
5939 first_vect
= dr_chain
[0];
5940 second_vect
= dr_chain
[1];
5942 /* Create interleaving stmt (low part of):
5943 low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5945 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3_low");
5946 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, first_vect
,
5947 second_vect
, perm3_mask_low
);
5948 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5950 /* Create interleaving stmt (high part of):
5951 high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5953 first_vect
= data_ref
;
5954 second_vect
= dr_chain
[2];
5955 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3_high");
5956 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, first_vect
,
5957 second_vect
, perm3_mask_high
);
5958 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5959 (*result_chain
)[k
] = data_ref
;
5964 /* If length is not equal to 3 then only power of 2 is supported. */
5965 gcc_assert (pow2p_hwi (length
));
5967 /* The encoding has a single stepped pattern. */
5968 poly_uint64 nelt
= TYPE_VECTOR_SUBPARTS (vectype
);
5969 vec_perm_builder
sel (nelt
, 1, 3);
5971 for (i
= 0; i
< 3; ++i
)
5973 vec_perm_indices
indices (sel
, 2, nelt
);
5974 perm_mask_even
= vect_gen_perm_mask_checked (vectype
, indices
);
5976 for (i
= 0; i
< 3; ++i
)
5978 indices
.new_vector (sel
, 2, nelt
);
5979 perm_mask_odd
= vect_gen_perm_mask_checked (vectype
, indices
);
5981 for (i
= 0; i
< log_length
; i
++)
5983 for (j
= 0; j
< length
; j
+= 2)
5985 first_vect
= dr_chain
[j
];
5986 second_vect
= dr_chain
[j
+1];
5988 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5989 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_perm_even");
5990 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5991 first_vect
, second_vect
,
5993 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5994 (*result_chain
)[j
/2] = data_ref
;
5996 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5997 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_perm_odd");
5998 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5999 first_vect
, second_vect
,
6001 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
6002 (*result_chain
)[j
/2+length
/2] = data_ref
;
6004 memcpy (dr_chain
.address (), result_chain
->address (),
6005 length
* sizeof (tree
));
6010 /* Function vect_shift_permute_load_chain.
6012 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
6013 sequence of stmts to reorder the input data accordingly.
6014 Return the final references for loads in RESULT_CHAIN.
6015 Return true if successed, false otherwise.
6017 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
6018 The input is 3 vectors each containing 8 elements. We assign a
6019 number to each element, the input sequence is:
6021 1st vec: 0 1 2 3 4 5 6 7
6022 2nd vec: 8 9 10 11 12 13 14 15
6023 3rd vec: 16 17 18 19 20 21 22 23
6025 The output sequence should be:
6027 1st vec: 0 3 6 9 12 15 18 21
6028 2nd vec: 1 4 7 10 13 16 19 22
6029 3rd vec: 2 5 8 11 14 17 20 23
6031 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
6033 First we shuffle all 3 vectors to get correct elements order:
6035 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
6036 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
6037 3rd vec: (16 19 22) (17 20 23) (18 21)
6039 Next we unite and shift vector 3 times:
6042 shift right by 6 the concatenation of:
6043 "1st vec" and "2nd vec"
6044 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6045 "2nd vec" and "3rd vec"
6046 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6047 "3rd vec" and "1st vec"
6048 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
6051 So that now new vectors are:
6053 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
6054 2nd vec: (10 13) (16 19 22) (17 20 23)
6055 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
6058 shift right by 5 the concatenation of:
6059 "1st vec" and "3rd vec"
6060 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
6061 "2nd vec" and "1st vec"
6062 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
6063 "3rd vec" and "2nd vec"
6064 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
6067 So that now new vectors are:
6069 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
6070 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
6071 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
6074 shift right by 5 the concatenation of:
6075 "1st vec" and "1st vec"
6076 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
6077 shift right by 3 the concatenation of:
6078 "2nd vec" and "2nd vec"
6079 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
6082 So that now all vectors are READY:
6083 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
6084 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
6085 3rd vec: ( 1 4 7) (10 13) (16 19 22)
6087 This algorithm is faster than one in vect_permute_load_chain if:
6088 1. "shift of a concatination" is faster than general permutation.
6090 2. The TARGET machine can't execute vector instructions in parallel.
6091 This is because each step of the algorithm depends on previous.
6092 The algorithm in vect_permute_load_chain is much more parallel.
6094 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
6098 vect_shift_permute_load_chain (vec
<tree
> dr_chain
,
6099 unsigned int length
,
6101 gimple_stmt_iterator
*gsi
,
6102 vec
<tree
> *result_chain
)
6104 tree vect
[3], vect_shift
[3], data_ref
, first_vect
, second_vect
;
6105 tree perm2_mask1
, perm2_mask2
, perm3_mask
;
6106 tree select_mask
, shift1_mask
, shift2_mask
, shift3_mask
, shift4_mask
;
6109 tree vectype
= STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt
));
6111 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
6112 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
6114 unsigned HOST_WIDE_INT nelt
, vf
;
6115 if (!TYPE_VECTOR_SUBPARTS (vectype
).is_constant (&nelt
)
6116 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo
).is_constant (&vf
))
6117 /* Not supported for variable-length vectors. */
6120 vec_perm_builder
sel (nelt
, nelt
, 1);
6121 sel
.quick_grow (nelt
);
6123 result_chain
->quick_grow (length
);
6124 memcpy (result_chain
->address (), dr_chain
.address (),
6125 length
* sizeof (tree
));
6127 if (pow2p_hwi (length
) && vf
> 4)
6129 unsigned int j
, log_length
= exact_log2 (length
);
6130 for (i
= 0; i
< nelt
/ 2; ++i
)
6132 for (i
= 0; i
< nelt
/ 2; ++i
)
6133 sel
[nelt
/ 2 + i
] = i
* 2 + 1;
6134 vec_perm_indices
indices (sel
, 2, nelt
);
6135 if (!can_vec_perm_const_p (TYPE_MODE (vectype
), indices
))
6137 if (dump_enabled_p ())
6138 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6139 "shuffle of 2 fields structure is not \
6140 supported by target\n");
6143 perm2_mask1
= vect_gen_perm_mask_checked (vectype
, indices
);
6145 for (i
= 0; i
< nelt
/ 2; ++i
)
6147 for (i
= 0; i
< nelt
/ 2; ++i
)
6148 sel
[nelt
/ 2 + i
] = i
* 2;
6149 indices
.new_vector (sel
, 2, nelt
);
6150 if (!can_vec_perm_const_p (TYPE_MODE (vectype
), indices
))
6152 if (dump_enabled_p ())
6153 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6154 "shuffle of 2 fields structure is not \
6155 supported by target\n");
6158 perm2_mask2
= vect_gen_perm_mask_checked (vectype
, indices
);
6160 /* Generating permutation constant to shift all elements.
6161 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
6162 for (i
= 0; i
< nelt
; i
++)
6163 sel
[i
] = nelt
/ 2 + i
;
6164 indices
.new_vector (sel
, 2, nelt
);
6165 if (!can_vec_perm_const_p (TYPE_MODE (vectype
), indices
))
6167 if (dump_enabled_p ())
6168 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6169 "shift permutation is not supported by target\n");
6172 shift1_mask
= vect_gen_perm_mask_checked (vectype
, indices
);
6174 /* Generating permutation constant to select vector from 2.
6175 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
6176 for (i
= 0; i
< nelt
/ 2; i
++)
6178 for (i
= nelt
/ 2; i
< nelt
; i
++)
6180 indices
.new_vector (sel
, 2, nelt
);
6181 if (!can_vec_perm_const_p (TYPE_MODE (vectype
), indices
))
6183 if (dump_enabled_p ())
6184 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6185 "select is not supported by target\n");
6188 select_mask
= vect_gen_perm_mask_checked (vectype
, indices
);
6190 for (i
= 0; i
< log_length
; i
++)
6192 for (j
= 0; j
< length
; j
+= 2)
6194 first_vect
= dr_chain
[j
];
6195 second_vect
= dr_chain
[j
+ 1];
6197 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle2");
6198 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
6199 first_vect
, first_vect
,
6201 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
6204 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle2");
6205 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
6206 second_vect
, second_vect
,
6208 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
6211 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shift");
6212 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
6213 vect
[0], vect
[1], shift1_mask
);
6214 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
6215 (*result_chain
)[j
/2 + length
/2] = data_ref
;
6217 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_select");
6218 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
6219 vect
[0], vect
[1], select_mask
);
6220 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
6221 (*result_chain
)[j
/2] = data_ref
;
6223 memcpy (dr_chain
.address (), result_chain
->address (),
6224 length
* sizeof (tree
));
6228 if (length
== 3 && vf
> 2)
6230 unsigned int k
= 0, l
= 0;
6232 /* Generating permutation constant to get all elements in rigth order.
6233 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
6234 for (i
= 0; i
< nelt
; i
++)
6236 if (3 * k
+ (l
% 3) >= nelt
)
6239 l
+= (3 - (nelt
% 3));
6241 sel
[i
] = 3 * k
+ (l
% 3);
6244 vec_perm_indices
indices (sel
, 2, nelt
);
6245 if (!can_vec_perm_const_p (TYPE_MODE (vectype
), indices
))
6247 if (dump_enabled_p ())
6248 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6249 "shuffle of 3 fields structure is not \
6250 supported by target\n");
6253 perm3_mask
= vect_gen_perm_mask_checked (vectype
, indices
);
6255 /* Generating permutation constant to shift all elements.
6256 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
6257 for (i
= 0; i
< nelt
; i
++)
6258 sel
[i
] = 2 * (nelt
/ 3) + (nelt
% 3) + i
;
6259 indices
.new_vector (sel
, 2, nelt
);
6260 if (!can_vec_perm_const_p (TYPE_MODE (vectype
), indices
))
6262 if (dump_enabled_p ())
6263 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6264 "shift permutation is not supported by target\n");
6267 shift1_mask
= vect_gen_perm_mask_checked (vectype
, indices
);
6269 /* Generating permutation constant to shift all elements.
6270 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6271 for (i
= 0; i
< nelt
; i
++)
6272 sel
[i
] = 2 * (nelt
/ 3) + 1 + i
;
6273 indices
.new_vector (sel
, 2, nelt
);
6274 if (!can_vec_perm_const_p (TYPE_MODE (vectype
), indices
))
6276 if (dump_enabled_p ())
6277 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6278 "shift permutation is not supported by target\n");
6281 shift2_mask
= vect_gen_perm_mask_checked (vectype
, indices
);
6283 /* Generating permutation constant to shift all elements.
6284 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
6285 for (i
= 0; i
< nelt
; i
++)
6286 sel
[i
] = (nelt
/ 3) + (nelt
% 3) / 2 + i
;
6287 indices
.new_vector (sel
, 2, nelt
);
6288 if (!can_vec_perm_const_p (TYPE_MODE (vectype
), indices
))
6290 if (dump_enabled_p ())
6291 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6292 "shift permutation is not supported by target\n");
6295 shift3_mask
= vect_gen_perm_mask_checked (vectype
, indices
);
6297 /* Generating permutation constant to shift all elements.
6298 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6299 for (i
= 0; i
< nelt
; i
++)
6300 sel
[i
] = 2 * (nelt
/ 3) + (nelt
% 3) / 2 + i
;
6301 indices
.new_vector (sel
, 2, nelt
);
6302 if (!can_vec_perm_const_p (TYPE_MODE (vectype
), indices
))
6304 if (dump_enabled_p ())
6305 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6306 "shift permutation is not supported by target\n");
6309 shift4_mask
= vect_gen_perm_mask_checked (vectype
, indices
);
6311 for (k
= 0; k
< 3; k
++)
6313 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3");
6314 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
6315 dr_chain
[k
], dr_chain
[k
],
6317 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
6321 for (k
= 0; k
< 3; k
++)
6323 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shift1");
6324 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
6325 vect
[k
% 3], vect
[(k
+ 1) % 3],
6327 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
6328 vect_shift
[k
] = data_ref
;
6331 for (k
= 0; k
< 3; k
++)
6333 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shift2");
6334 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
6335 vect_shift
[(4 - k
) % 3],
6336 vect_shift
[(3 - k
) % 3],
6338 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
6342 (*result_chain
)[3 - (nelt
% 3)] = vect
[2];
6344 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shift3");
6345 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, vect
[0],
6346 vect
[0], shift3_mask
);
6347 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
6348 (*result_chain
)[nelt
% 3] = data_ref
;
6350 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shift4");
6351 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, vect
[1],
6352 vect
[1], shift4_mask
);
6353 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
6354 (*result_chain
)[0] = data_ref
;
6360 /* Function vect_transform_grouped_load.
6362 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6363 to perform their permutation and ascribe the result vectorized statements to
6364 the scalar statements.
6368 vect_transform_grouped_load (gimple
*stmt
, vec
<tree
> dr_chain
, int size
,
6369 gimple_stmt_iterator
*gsi
)
6372 vec
<tree
> result_chain
= vNULL
;
6374 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6375 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
6376 vectors, that are ready for vector computation. */
6377 result_chain
.create (size
);
6379 /* If reassociation width for vector type is 2 or greater target machine can
6380 execute 2 or more vector instructions in parallel. Otherwise try to
6381 get chain for loads group using vect_shift_permute_load_chain. */
6382 mode
= TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt
)));
6383 if (targetm
.sched
.reassociation_width (VEC_PERM_EXPR
, mode
) > 1
6385 || !vect_shift_permute_load_chain (dr_chain
, size
, stmt
,
6386 gsi
, &result_chain
))
6387 vect_permute_load_chain (dr_chain
, size
, stmt
, gsi
, &result_chain
);
6388 vect_record_grouped_load_vectors (stmt
, result_chain
);
6389 result_chain
.release ();
6392 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6393 generated as part of the vectorization of STMT. Assign the statement
6394 for each vector to the associated scalar statement. */
6397 vect_record_grouped_load_vectors (gimple
*stmt
, vec
<tree
> result_chain
)
6399 gimple
*first_stmt
= DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
));
6400 gimple
*next_stmt
, *new_stmt
;
6401 unsigned int i
, gap_count
;
6404 /* Put a permuted data-ref in the VECTORIZED_STMT field.
6405 Since we scan the chain starting from it's first node, their order
6406 corresponds the order of data-refs in RESULT_CHAIN. */
6407 next_stmt
= first_stmt
;
6409 FOR_EACH_VEC_ELT (result_chain
, i
, tmp_data_ref
)
6414 /* Skip the gaps. Loads created for the gaps will be removed by dead
6415 code elimination pass later. No need to check for the first stmt in
6416 the group, since it always exists.
6417 DR_GROUP_GAP is the number of steps in elements from the previous
6418 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
6419 correspond to the gaps. */
6420 if (next_stmt
!= first_stmt
6421 && gap_count
< DR_GROUP_GAP (vinfo_for_stmt (next_stmt
)))
6429 new_stmt
= SSA_NAME_DEF_STMT (tmp_data_ref
);
6430 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6431 copies, and we put the new vector statement in the first available
6433 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt
)))
6434 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt
)) = new_stmt
;
6437 if (!DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt
)))
6440 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt
));
6442 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt
));
6445 prev_stmt
= rel_stmt
;
6447 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt
));
6450 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt
)) =
6455 next_stmt
= DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt
));
6457 /* If NEXT_STMT accesses the same DR as the previous statement,
6458 put the same TMP_DATA_REF as its vectorized statement; otherwise
6459 get the next data-ref from RESULT_CHAIN. */
6460 if (!next_stmt
|| !DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt
)))
6466 /* Function vect_force_dr_alignment_p.
6468 Returns whether the alignment of a DECL can be forced to be aligned
6469 on ALIGNMENT bit boundary. */
6472 vect_can_force_dr_alignment_p (const_tree decl
, unsigned int alignment
)
6477 if (decl_in_symtab_p (decl
)
6478 && !symtab_node::get (decl
)->can_increase_alignment_p ())
6481 if (TREE_STATIC (decl
))
6482 return (alignment
<= MAX_OFILE_ALIGNMENT
);
6484 return (alignment
<= MAX_STACK_ALIGNMENT
);
6488 /* Return whether the data reference DR is supported with respect to its
6490 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6491 it is aligned, i.e., check if it is possible to vectorize it with different
6494 enum dr_alignment_support
6495 vect_supportable_dr_alignment (struct data_reference
*dr
,
6496 bool check_aligned_accesses
)
6498 gimple
*stmt
= DR_STMT (dr
);
6499 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
6500 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
6501 machine_mode mode
= TYPE_MODE (vectype
);
6502 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
6503 struct loop
*vect_loop
= NULL
;
6504 bool nested_in_vect_loop
= false;
6506 if (aligned_access_p (dr
) && !check_aligned_accesses
)
6509 /* For now assume all conditional loads/stores support unaligned
6510 access without any special code. */
6511 if (is_gimple_call (stmt
)
6512 && gimple_call_internal_p (stmt
)
6513 && (gimple_call_internal_fn (stmt
) == IFN_MASK_LOAD
6514 || gimple_call_internal_fn (stmt
) == IFN_MASK_STORE
))
6515 return dr_unaligned_supported
;
6519 vect_loop
= LOOP_VINFO_LOOP (loop_vinfo
);
6520 nested_in_vect_loop
= nested_in_vect_loop_p (vect_loop
, stmt
);
6523 /* Possibly unaligned access. */
6525 /* We can choose between using the implicit realignment scheme (generating
6526 a misaligned_move stmt) and the explicit realignment scheme (generating
6527 aligned loads with a REALIGN_LOAD). There are two variants to the
6528 explicit realignment scheme: optimized, and unoptimized.
6529 We can optimize the realignment only if the step between consecutive
6530 vector loads is equal to the vector size. Since the vector memory
6531 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6532 is guaranteed that the misalignment amount remains the same throughout the
6533 execution of the vectorized loop. Therefore, we can create the
6534 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6535 at the loop preheader.
6537 However, in the case of outer-loop vectorization, when vectorizing a
6538 memory access in the inner-loop nested within the LOOP that is now being
6539 vectorized, while it is guaranteed that the misalignment of the
6540 vectorized memory access will remain the same in different outer-loop
6541 iterations, it is *not* guaranteed that is will remain the same throughout
6542 the execution of the inner-loop. This is because the inner-loop advances
6543 with the original scalar step (and not in steps of VS). If the inner-loop
6544 step happens to be a multiple of VS, then the misalignment remains fixed
6545 and we can use the optimized realignment scheme. For example:
6551 When vectorizing the i-loop in the above example, the step between
6552 consecutive vector loads is 1, and so the misalignment does not remain
6553 fixed across the execution of the inner-loop, and the realignment cannot
6554 be optimized (as illustrated in the following pseudo vectorized loop):
6556 for (i=0; i<N; i+=4)
6557 for (j=0; j<M; j++){
6558 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6559 // when j is {0,1,2,3,4,5,6,7,...} respectively.
6560 // (assuming that we start from an aligned address).
6563 We therefore have to use the unoptimized realignment scheme:
6565 for (i=0; i<N; i+=4)
6566 for (j=k; j<M; j+=4)
6567 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6568 // that the misalignment of the initial address is
6571 The loop can then be vectorized as follows:
6573 for (k=0; k<4; k++){
6574 rt = get_realignment_token (&vp[k]);
6575 for (i=0; i<N; i+=4){
6577 for (j=k; j<M; j+=4){
6579 va = REALIGN_LOAD <v1,v2,rt>;
6586 if (DR_IS_READ (dr
))
6588 bool is_packed
= false;
6589 tree type
= (TREE_TYPE (DR_REF (dr
)));
6591 if (optab_handler (vec_realign_load_optab
, mode
) != CODE_FOR_nothing
6592 && (!targetm
.vectorize
.builtin_mask_for_load
6593 || targetm
.vectorize
.builtin_mask_for_load ()))
6595 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
6597 /* If we are doing SLP then the accesses need not have the
6598 same alignment, instead it depends on the SLP group size. */
6600 && STMT_SLP_TYPE (stmt_info
)
6601 && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo
)
6602 * DR_GROUP_SIZE (vinfo_for_stmt
6603 (DR_GROUP_FIRST_ELEMENT (stmt_info
))),
6604 TYPE_VECTOR_SUBPARTS (vectype
)))
6606 else if (!loop_vinfo
6607 || (nested_in_vect_loop
6608 && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr
)),
6609 GET_MODE_SIZE (TYPE_MODE (vectype
)))))
6610 return dr_explicit_realign
;
6612 return dr_explicit_realign_optimized
;
6614 if (!known_alignment_for_access_p (dr
))
6615 is_packed
= not_size_aligned (DR_REF (dr
));
6617 if (targetm
.vectorize
.support_vector_misalignment
6618 (mode
, type
, DR_MISALIGNMENT (dr
), is_packed
))
6619 /* Can't software pipeline the loads, but can at least do them. */
6620 return dr_unaligned_supported
;
6624 bool is_packed
= false;
6625 tree type
= (TREE_TYPE (DR_REF (dr
)));
6627 if (!known_alignment_for_access_p (dr
))
6628 is_packed
= not_size_aligned (DR_REF (dr
));
6630 if (targetm
.vectorize
.support_vector_misalignment
6631 (mode
, type
, DR_MISALIGNMENT (dr
), is_packed
))
6632 return dr_unaligned_supported
;
6636 return dr_unaligned_unsupported
;