/* Data References Analysis and Manipulation Utilities for Vectorization.
   Copyright (C) 2003-2018 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
#include "coretypes.h"
#include "optabs-tree.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "gimple-iterator.h"
#include "gimplify-me.h"
#include "tree-ssa-loop-ivopts.h"
#include "tree-ssa-loop-manip.h"
#include "tree-ssa-loop.h"
#include "tree-scalar-evolution.h"
#include "tree-vectorizer.h"
#include "tree-hash-traits.h"
#include "vec-perm-indices.h"
#include "internal-fn.h"
/* Return true if load- or store-lanes optab OPTAB is implemented for
   COUNT vectors of type VECTYPE.  NAME is the name of OPTAB.  */

static bool
vect_lanes_optab_supported_p (const char *name, convert_optab optab,
			      tree vectype, unsigned HOST_WIDE_INT count)
{
  machine_mode mode, array_mode;
  bool limit_p;

  mode = TYPE_MODE (vectype);
  if (!targetm.array_mode (mode, count).exists (&array_mode))
    {
      poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
      limit_p = !targetm.array_mode_supported_p (mode, count);
      if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "no array mode for %s["
			     HOST_WIDE_INT_PRINT_DEC "]\n",
			     GET_MODE_NAME (mode), count);
	  return false;
	}
    }

  if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "cannot use %s<%s><%s>\n", name,
			 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
      return false;
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
		     GET_MODE_NAME (mode));

  return true;
}

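/* For illustration (target and numbers are hypothetical): on a target with
   128-bit vectors, asking vect_lanes_optab_supported_p ("vec_load_lanes",
   vec_load_lanes_optab, <the V4SI vector type>, 2) checks whether two
   interleaved V4SI vectors can be accessed through a single array mode
   (e.g. an ld2-style instruction); if no array mode or optab handler
   exists, the caller falls back to separate loads plus permutes.  */
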
/* Return the smallest scalar part of STMT.
   This is used to determine the vectype of the stmt.  We generally set the
   vectype according to the type of the result (lhs).  For stmts whose
   result-type is different than the type of the arguments (e.g., demotion,
   promotion), vectype will be reset appropriately (later).  Note that we have
   to visit the smallest datatype in this function, because that determines the
   VF.  If the smallest datatype in the loop is present only as the rhs of a
   promotion operation - we'd miss it.
   Such a case, where a variable of this datatype does not appear in the lhs
   anywhere in the loop, can only occur if it's an invariant: e.g.:
   'int_x = (int) short_inv', which we'd expect to have been optimized away by
   invariant motion.  However, we cannot rely on invariant motion to always
   take invariants out of the loop, and so in the case of promotion we also
   have to check the rhs.
   LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
   types.  */

tree
vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
			       HOST_WIDE_INT *rhs_size_unit)
{
  tree scalar_type = gimple_expr_type (stmt);
  HOST_WIDE_INT lhs, rhs;

  /* During the analysis phase, this function is called on arbitrary
     statements that might not have scalar results.  */
  if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
    return scalar_type;

  lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));

  if (is_gimple_assign (stmt)
      && (gimple_assign_cast_p (stmt)
	  || gimple_assign_rhs_code (stmt) == DOT_PROD_EXPR
	  || gimple_assign_rhs_code (stmt) == WIDEN_SUM_EXPR
	  || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
	  || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
	  || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
    {
      tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));

      rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
      if (rhs < lhs)
	scalar_type = rhs_type;
    }

  *lhs_size_unit = lhs;
  *rhs_size_unit = rhs;
  return scalar_type;
}

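/* For illustration (array names are hypothetical): in a loop containing only
     int_a[i] = (int) short_b[i];
   the lhs type is a 4-byte int but the smallest scalar type is the 2-byte
   short, so *rhs_size_unit is 2 and the vectorization factor is based on
   the narrower type.  */
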
/* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
   tested at run-time.  Return TRUE if DDR was successfully inserted.
   Return false if versioning is not supported.  */

static bool
vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

  if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
    return false;

  if (!runtime_alias_check_p (ddr, loop,
			      optimize_loop_nest_for_speed_p (loop)))
    return false;

  LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
  return true;
}

/* Record that loop LOOP_VINFO needs to check that VALUE is nonzero.  */

static void
vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
{
  vec<tree> checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
  for (unsigned int i = 0; i < checks.length (); ++i)
    if (checks[i] == value)
      return;

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "need run-time check that ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, value);
      dump_printf (MSG_NOTE, " is nonzero\n");
    }
  LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
}

/* Return true if we know that the order of vectorized STMT_A and
   vectorized STMT_B will be the same as the order of STMT_A and STMT_B.
   At least one of the statements is a write.  */

static bool
vect_preserves_scalar_order_p (gimple *stmt_a, gimple *stmt_b)
{
  stmt_vec_info stmtinfo_a = vinfo_for_stmt (stmt_a);
  stmt_vec_info stmtinfo_b = vinfo_for_stmt (stmt_b);

  /* Single statements are always kept in their original order.  */
  if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
      && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
    return true;

  /* STMT_A and STMT_B belong to overlapping groups.  All loads in a
     group are emitted at the position of the first scalar load and all
     stores in a group are emitted at the position of the last scalar store.
     Thus writes will happen no earlier than their current position
     (but could happen later) while reads will happen no later than their
     current position (but could happen earlier).  Reordering is therefore
     only possible if the first access is a write.  */
  if (is_pattern_stmt_p (stmtinfo_a))
    stmt_a = STMT_VINFO_RELATED_STMT (stmtinfo_a);
  if (is_pattern_stmt_p (stmtinfo_b))
    stmt_b = STMT_VINFO_RELATED_STMT (stmtinfo_b);
  gimple *earlier_stmt = get_earlier_stmt (stmt_a, stmt_b);
  return !DR_IS_WRITE (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt)));
}

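/* For illustration (hypothetical grouped accesses): given
     a[i]   = x;
     ...    = a[i+1];
   the store group is sunk to the last scalar store while the load group is
   hoisted to the first scalar load, so the load may end up executing before
   the store; because the earlier statement is a write, the function above
   conservatively returns false.  */
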
/* A subroutine of vect_analyze_data_ref_dependence.  Handle
   DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
   distances.  These distances are conservatively correct but they don't
   reflect a guaranteed dependence.

   Return true if this function does all the work necessary to avoid
   an alias or false if the caller should use the dependence distances
   to limit the vectorization factor in the usual way.  LOOP_DEPTH is
   the depth of the loop described by LOOP_VINFO and the other arguments
   are as for vect_analyze_data_ref_dependence.  */

static bool
vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
				       loop_vec_info loop_vinfo,
				       int loop_depth, unsigned int *max_vf)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  lambda_vector dist_v;
  unsigned int i;
  FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
    {
      int dist = dist_v[loop_depth];
      if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
	{
	  /* If the user asserted safelen >= DIST consecutive iterations
	     can be executed concurrently, assume independence.

	     ??? An alternative would be to add the alias check even
	     in this case, and vectorize the fallback loop with the
	     maximum VF set to safelen.  However, if the user has
	     explicitly given a length, it's less likely that that
	     would be worthwhile.  */
	  if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
	    {
	      if ((unsigned int) loop->safelen < *max_vf)
		*max_vf = loop->safelen;
	      LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
	      continue;
	    }

	  /* For dependence distances of 2 or more, we have the option
	     of limiting VF or checking for an alias at runtime.
	     Prefer to check at runtime if we can, to avoid limiting
	     the VF unnecessarily when the bases are in fact independent.

	     Note that the alias checks will be removed if the VF ends up
	     being small enough.  */
	  return (!STMT_VINFO_GATHER_SCATTER_P
		    (vinfo_for_stmt (DR_STMT (DDR_A (ddr))))
		  && !STMT_VINFO_GATHER_SCATTER_P
		       (vinfo_for_stmt (DR_STMT (DDR_B (ddr))))
		  && vect_mark_for_runtime_alias_test (ddr, loop_vinfo));
	}
    }
  return true;
}

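/* For illustration (names are hypothetical): for two pointer parameters p
   and q that cannot be proven distinct, the distances computed for p[i] and
   q[i] are only conservatively correct, so the code above prefers to emit a
   runtime alias check between the two bases instead of limiting the VF to
   such a distance.  */
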
/* Function vect_analyze_data_ref_dependence.

   Return TRUE if there (might) exist a dependence between a memory-reference
   DRA and a memory-reference DRB.  When versioning for alias may check a
   dependence at run-time, return FALSE.  Adjust *MAX_VF according to
   the data dependence.  */

static bool
vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
				  loop_vec_info loop_vinfo,
				  unsigned int *max_vf)
{
  unsigned int i;
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  struct data_reference *dra = DDR_A (ddr);
  struct data_reference *drb = DDR_B (ddr);
  stmt_vec_info stmtinfo_a = vinfo_for_stmt (vect_dr_stmt (dra));
  stmt_vec_info stmtinfo_b = vinfo_for_stmt (vect_dr_stmt (drb));
  lambda_vector dist_v;
  unsigned int loop_depth;

  /* In loop analysis all data references should be vectorizable.  */
  if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
      || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
    gcc_unreachable ();

  /* Independent data accesses.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
    return false;

  if (dra == drb
      || (DR_IS_READ (dra) && DR_IS_READ (drb)))
    return false;

  /* We do not have to consider dependences between accesses that belong
     to the same group, unless the stride could be smaller than the
     group size.  */
  if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
      && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
	  == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
      && !STMT_VINFO_STRIDED_P (stmtinfo_a))
    return false;

  /* Even if we have an anti-dependence then, as the vectorized loop covers at
     least two scalar iterations, there is always also a true dependence.
     As the vectorizer does not re-order loads and stores we can ignore
     the anti-dependence if TBAA can disambiguate both DRs similar to the
     case with known negative distance anti-dependences (positive
     distance anti-dependences would violate TBAA constraints).  */
  if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
       || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
      && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
				 get_alias_set (DR_REF (drb))))
    return false;

  /* Unknown data dependence.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
    {
      /* If user asserted safelen consecutive iterations can be
	 executed concurrently, assume independence.  */
      if (loop->safelen >= 2)
	{
	  if ((unsigned int) loop->safelen < *max_vf)
	    *max_vf = loop->safelen;
	  LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
	  return false;
	}

      if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
	  || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
	{
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			       "versioning for alias not supported for: "
			       "can't determine dependence between ");
	      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
				 DR_REF (dra));
	      dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
	      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
				 DR_REF (drb));
	      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
	    }
	  return true;
	}

      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			   "versioning for alias required: "
			   "can't determine dependence between ");
	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
			     DR_REF (dra));
	  dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
			     DR_REF (drb));
	  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
	}

      /* Add to list of ddrs that need to be tested at run-time.  */
      return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
    }

  /* Known data dependence.  */
  if (DDR_NUM_DIST_VECTS (ddr) == 0)
    {
      /* If user asserted safelen consecutive iterations can be
	 executed concurrently, assume independence.  */
      if (loop->safelen >= 2)
	{
	  if ((unsigned int) loop->safelen < *max_vf)
	    *max_vf = loop->safelen;
	  LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
	  return false;
	}

      if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
	  || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
	{
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			       "versioning for alias not supported for: "
			       "bad dist vector for ");
	      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
				 DR_REF (dra));
	      dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
	      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
				 DR_REF (drb));
	      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
	    }
	  return true;
	}

      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			   "versioning for alias required: "
			   "bad dist vector for ");
	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
	  dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
	  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
	}
      /* Add to list of ddrs that need to be tested at run-time.  */
      return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
    }

  loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));

  if (DDR_COULD_BE_INDEPENDENT_P (ddr)
      && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
						loop_depth, max_vf))
    return false;

  FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
    {
      int dist = dist_v[loop_depth];

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "dependence distance = %d.\n", dist);

      if (dist == 0)
	{
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_NOTE, vect_location,
			       "dependence distance == 0 between ");
	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
	      dump_printf (MSG_NOTE, " and ");
	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
	      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
	    }

	  /* When we perform grouped accesses and perform implicit CSE
	     by detecting equal accesses and doing disambiguation with
	     runtime alias tests like for
		.. = a[i];
		.. = a[i+1];
		a[i] = ..;
		a[i+1] = ..;
		*p = ..;
		.. = a[i];
		.. = a[i+1];
	     where we will end up loading { a[i], a[i+1] } once, make
	     sure that inserting group loads before the first load and
	     stores after the last store will do the right thing.
	     Similar for groups like
		a[i] = ...;
		... = a[i];
		a[i+1] = ...;
	     where loads from the group interleave with the store.  */
	  if (!vect_preserves_scalar_order_p (vect_dr_stmt (dra),
					      vect_dr_stmt (drb)))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "READ_WRITE dependence in interleaving.\n");
	      return true;
	    }

	  if (loop->safelen < 2)
	    {
	      tree indicator = dr_zero_step_indicator (dra);
	      if (!indicator || integer_zerop (indicator))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "access also has a zero step\n");
		  return true;
		}
	      else if (TREE_CODE (indicator) != INTEGER_CST)
		vect_check_nonzero_value (loop_vinfo, indicator);
	    }
	  continue;
	}

      if (dist > 0 && DDR_REVERSED_P (ddr))
	{
	  /* If DDR_REVERSED_P the order of the data-refs in DDR was
	     reversed (to make distance vector positive), and the actual
	     distance is negative.  */
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "dependence distance negative.\n");
	  /* Record a negative dependence distance to later limit the
	     amount of stmt copying / unrolling we can perform.
	     Only need to handle read-after-write dependence.  */
	  if (DR_IS_READ (drb)
	      && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
		  || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
	    STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
	  continue;
	}

      unsigned int abs_dist = abs (dist);
      if (abs_dist >= 2 && abs_dist < *max_vf)
	{
	  /* The dependence distance requires reduction of the maximal
	     vectorization factor.  */
	  *max_vf = abs (dist);
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "adjusting maximal vectorization factor to %i\n",
			     *max_vf);
	}

      if (abs_dist >= *max_vf)
	{
	  /* Dependence distance does not create dependence, as far as
	     vectorization is concerned, in this case.  */
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "dependence distance >= VF.\n");
	  continue;
	}

      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			   "not vectorized, possible dependence "
			   "between data-refs ");
	  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
	  dump_printf (MSG_NOTE, " and ");
	  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
	  dump_printf (MSG_NOTE, "\n");
	}

      return true;
    }

  return false;
}

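/* For illustration (hypothetical loop): for
     a[i + 3] = a[i] + 1;
   the dependence distance is 3, so *max_vf is reduced to 3 and at most
   three elements are processed per vector iteration without violating the
   read-after-write dependence.  */
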
/* Function vect_analyze_data_ref_dependences.

   Examine all the data references in the loop, and make sure there do not
   exist any data dependences between them.  Set *MAX_VF according to
   the maximum vectorization factor the data dependences allow.  */

bool
vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
				   unsigned int *max_vf)
{
  unsigned int i;
  struct data_dependence_relation *ddr;

  DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");

  if (!LOOP_VINFO_DDRS (loop_vinfo).exists ())
    {
      LOOP_VINFO_DDRS (loop_vinfo)
	.create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
		 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
      /* We need read-read dependences to compute
	 STMT_VINFO_SAME_ALIGN_REFS.  */
      bool res = compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
					  &LOOP_VINFO_DDRS (loop_vinfo),
					  LOOP_VINFO_LOOP_NEST (loop_vinfo),
					  true);
      gcc_assert (res);
    }

  LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;

  /* For epilogues we either have no aliases or alias versioning
     was applied to original loop.  Therefore we may just get max_vf
     using VF of original loop.  */
  if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
    *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
  else
    FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
      if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
	return false;

  return true;
}

/* Function vect_slp_analyze_data_ref_dependence.

   Return TRUE if there (might) exist a dependence between a memory-reference
   DRA and a memory-reference DRB.  When versioning for alias may check a
   dependence at run-time, return FALSE.  */

static bool
vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
{
  struct data_reference *dra = DDR_A (ddr);
  struct data_reference *drb = DDR_B (ddr);

  /* We need to check dependences of statements marked as unvectorizable
     as well, they still can prohibit vectorization.  */

  /* Independent data accesses.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
    return false;

  if (dra == drb)
    return false;

  /* Read-read is OK.  */
  if (DR_IS_READ (dra) && DR_IS_READ (drb))
    return false;

  /* If dra and drb are part of the same interleaving chain consider
     them independent.  */
  if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (vect_dr_stmt (dra)))
      && (DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (vect_dr_stmt (dra)))
	  == DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (vect_dr_stmt (drb)))))
    return false;

  /* Unknown data dependence.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
    {
      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			   "can't determine dependence between ");
	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
	  dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
	  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
	}
    }
  else if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location,
		       "determined dependence between ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
      dump_printf (MSG_NOTE, " and ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
      dump_printf (MSG_NOTE, "\n");
    }

  return true;
}

/* Analyze dependences involved in the transform of SLP NODE.  STORES
   contain the vector of scalar stores of this instance if we are
   disambiguating the loads.  */

static bool
vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
				   vec<gimple *> stores, gimple *last_store)
{
  /* This walks over all stmts involved in the SLP load/store done
     in NODE verifying we can sink them up to the last stmt in the
     group.  */
  gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
  for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
    {
      gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
      if (access == last_access)
	continue;
      data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
      ao_ref ref;
      bool ref_initialized_p = false;
      for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
	   gsi_stmt (gsi) != last_access; gsi_next (&gsi))
	{
	  gimple *stmt = gsi_stmt (gsi);
	  if (! gimple_vuse (stmt)
	      || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
	    continue;

	  /* If we couldn't record a (single) data reference for this
	     stmt we have to resort to the alias oracle.  */
	  data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
	  if (!dr_b)
	    {
	      /* We are moving a store or sinking a load - this means
		 we cannot use TBAA for disambiguation.  */
	      if (!ref_initialized_p)
		ao_ref_init (&ref, DR_REF (dr_a));
	      if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
		  || ref_maybe_used_by_stmt_p (stmt, &ref, false))
		return false;
	      continue;
	    }

	  bool dependent = false;
	  /* If we run into a store of this same instance (we've just
	     marked those) then delay dependence checking until we run
	     into the last store because this is where it will have
	     been sunk to (and we verify if we can do that as well).  */
	  if (gimple_visited_p (stmt))
	    {
	      if (stmt != last_store)
		continue;
	      unsigned i;
	      gimple *store;
	      FOR_EACH_VEC_ELT (stores, i, store)
		{
		  data_reference *store_dr
		    = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
		  ddr_p ddr = initialize_data_dependence_relation
				(dr_a, store_dr, vNULL);
		  dependent = vect_slp_analyze_data_ref_dependence (ddr);
		  free_dependence_relation (ddr);
		  if (dependent)
		    break;
		}
	    }
	  else
	    {
	      ddr_p ddr = initialize_data_dependence_relation (dr_a,
							       dr_b, vNULL);
	      dependent = vect_slp_analyze_data_ref_dependence (ddr);
	      free_dependence_relation (ddr);
	    }
	  if (dependent)
	    return false;
	}
    }
  return true;
}

/* Function vect_slp_analyze_instance_dependence.

   Examine all the data references in the basic-block, and make sure there
   do not exist any data dependences between them.  */

bool
vect_slp_analyze_instance_dependence (slp_instance instance)
{
  DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");

  /* The stores of this instance are at the root of the SLP tree.  */
  slp_tree store = SLP_INSTANCE_TREE (instance);
  if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
    store = NULL;

  /* Verify we can sink stores to the vectorized stmt insert location.  */
  gimple *last_store = NULL;
  if (store)
    {
      if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
	return false;

      /* Mark stores in this instance and remember the last one.  */
      last_store = vect_find_last_scalar_stmt_in_slp (store);
      for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
	gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
    }

  bool res = true;

  /* Verify we can sink loads to the vectorized stmt insert location,
     special-casing stores of this instance.  */
  slp_tree load;
  unsigned int i;
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
    if (! vect_slp_analyze_node_dependences (instance, load,
					     store
					     ? SLP_TREE_SCALAR_STMTS (store)
					     : vNULL, last_store))
      {
	res = false;
	break;
      }

  /* Unset the visited flag.  */
  if (store)
    for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
      gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);

  return res;
}

/* Record in VINFO the base alignment guarantee given by DRB.  STMT is
   the statement that contains DRB, which is useful for recording in the
   dump file.  */

static void
vect_record_base_alignment (vec_info *vinfo, gimple *stmt,
			    innermost_loop_behavior *drb)
{
  bool existed;
  innermost_loop_behavior *&entry
    = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
  if (!existed || entry->base_alignment < drb->base_alignment)
    {
      entry = drb;
      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_NOTE, vect_location,
			   "recording new base alignment for ");
	  dump_generic_expr (MSG_NOTE, TDF_SLIM, drb->base_address);
	  dump_printf (MSG_NOTE, "\n");
	  dump_printf_loc (MSG_NOTE, vect_location,
			   "  alignment:    %d\n", drb->base_alignment);
	  dump_printf_loc (MSG_NOTE, vect_location,
			   "  misalignment: %d\n", drb->base_misalignment);
	  dump_printf_loc (MSG_NOTE, vect_location,
			   "  based on:     ");
	  dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
	}
    }
}

/* If the region we're going to vectorize is reached, all unconditional
   data references occur at least once.  We can therefore pool the base
   alignment guarantees from each unconditional reference.  Do this by
   going through all the data references in VINFO and checking whether
   the containing statement makes the reference unconditionally.  If so,
   record the alignment of the base address in VINFO so that it can be
   used for all other references with the same base.  */

void
vect_record_base_alignments (vec_info *vinfo)
{
  loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
  struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
  data_reference *dr;
  unsigned int i;
  FOR_EACH_VEC_ELT (vinfo->shared->datarefs, i, dr)
    {
      gimple *stmt = vect_dr_stmt (dr);
      stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
      if (!DR_IS_CONDITIONAL_IN_STMT (dr)
	  && STMT_VINFO_VECTORIZABLE (stmt_info)
	  && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
	{
	  vect_record_base_alignment (vinfo, stmt, &DR_INNERMOST (dr));

	  /* If DR is nested in the loop that is being vectorized, we can also
	     record the alignment of the base wrt the outer loop.  */
	  if (loop && nested_in_vect_loop_p (loop, stmt))
	    vect_record_base_alignment
	      (vinfo, stmt, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
	}
    }
}

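/* For illustration (hypothetical references): if a loop reads p[0]
   unconditionally and stores to p[1] only under a condition, the alignment
   guarantee derived from the unconditional read is pooled on the common
   base address and can be reused when computing the alignment of the
   conditional store.  */
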
/* Return the target alignment for the vectorized form of DR.  */

static unsigned int
vect_calculate_target_alignment (struct data_reference *dr)
{
  gimple *stmt = vect_dr_stmt (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  return targetm.vectorize.preferred_vector_alignment (vectype);
}

/* Function vect_compute_data_ref_alignment

   Compute the misalignment of the data reference DR.

   Output:
   1. DR_MISALIGNMENT (DR) is defined.

   FOR NOW: No analysis is actually performed.  Misalignment is calculated
   only for trivial cases.  TODO.  */

static void
vect_compute_data_ref_alignment (struct data_reference *dr)
{
  gimple *stmt = vect_dr_stmt (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = NULL;
  tree ref = DR_REF (dr);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "vect_compute_data_ref_alignment:\n");

  if (loop_vinfo)
    loop = LOOP_VINFO_LOOP (loop_vinfo);

  /* Initialize misalignment to unknown.  */
  SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);

  if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
    return;

  innermost_loop_behavior *drb = vect_dr_behavior (dr);
  bool step_preserves_misalignment_p;

  unsigned HOST_WIDE_INT vector_alignment
    = vect_calculate_target_alignment (dr) / BITS_PER_UNIT;
  DR_TARGET_ALIGNMENT (dr) = vector_alignment;

  /* No step for BB vectorization.  */
  if (!loop)
    {
      gcc_assert (integer_zerop (drb->step));
      step_preserves_misalignment_p = true;
    }

  /* In case the dataref is in an inner-loop of the loop that is being
     vectorized (LOOP), we use the base and misalignment information
     relative to the outer-loop (LOOP).  This is ok only if the misalignment
     stays the same throughout the execution of the inner-loop, which is why
     we have to check that the stride of the dataref in the inner-loop evenly
     divides by the vector alignment.  */
  else if (nested_in_vect_loop_p (loop, stmt))
    {
      step_preserves_misalignment_p
	= (DR_STEP_ALIGNMENT (dr) % vector_alignment) == 0;

      if (dump_enabled_p ())
	{
	  if (step_preserves_misalignment_p)
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "inner step divides the vector alignment.\n");
	  else
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "inner step doesn't divide the vector"
			     " alignment.\n");
	}
    }

  /* Similarly we can only use base and misalignment information relative to
     an innermost loop if the misalignment stays the same throughout the
     execution of the loop.  As above, this is the case if the stride of
     the dataref evenly divides by the alignment.  */
  else
    {
      poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
      step_preserves_misalignment_p
	= multiple_p (DR_STEP_ALIGNMENT (dr) * vf, vector_alignment);

      if (!step_preserves_misalignment_p && dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "step doesn't divide the vector alignment.\n");
    }

  unsigned int base_alignment = drb->base_alignment;
  unsigned int base_misalignment = drb->base_misalignment;

  /* Calculate the maximum of the pooled base address alignment and the
     alignment that we can compute for DR itself.  */
  innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
  if (entry && base_alignment < (*entry)->base_alignment)
    {
      base_alignment = (*entry)->base_alignment;
      base_misalignment = (*entry)->base_misalignment;
    }

  if (drb->offset_alignment < vector_alignment
      || !step_preserves_misalignment_p
      /* We need to know whether the step wrt the vectorized loop is
	 negative when computing the starting misalignment below.  */
      || TREE_CODE (drb->step) != INTEGER_CST)
    {
      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			   "Unknown alignment for access: ");
	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
	  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
	}
      return;
    }

  if (base_alignment < vector_alignment)
    {
      unsigned int max_alignment;
      tree base = get_base_for_alignment (drb->base_address, &max_alignment);
      if (max_alignment < vector_alignment
	  || !vect_can_force_dr_alignment_p (base,
					     vector_alignment * BITS_PER_UNIT))
	{
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_NOTE, vect_location,
			       "can't force alignment of ref: ");
	      dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
	      dump_printf (MSG_NOTE, "\n");
	    }
	  return;
	}

      /* Force the alignment of the decl.
	 NOTE: This is the only change to the code we make during
	 the analysis phase, before deciding to vectorize the loop.  */
      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
	  dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
	  dump_printf (MSG_NOTE, "\n");
	}

      DR_VECT_AUX (dr)->base_decl = base;
      DR_VECT_AUX (dr)->base_misaligned = true;
      base_misalignment = 0;
    }
  poly_int64 misalignment
    = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();

  /* If this is a backward running DR then first access in the larger
     vectype actually is N-1 elements before the address in the DR.
     Adjust misalign accordingly.  */
  if (tree_int_cst_sgn (drb->step) < 0)
    /* PLUS because STEP is negative.  */
    misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
		     * TREE_INT_CST_LOW (drb->step));

  unsigned int const_misalignment;
  if (!known_misalignment (misalignment, vector_alignment,
			   &const_misalignment))
    {
      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			   "Non-constant misalignment for access: ");
	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
	  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
	}
      return;
    }

  SET_DR_MISALIGNMENT (dr, const_misalignment);

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
		       "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
    }
}

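/* For illustration (numbers are hypothetical): for an access with a 16-byte
   target alignment, a 16-byte-aligned base, and DR_INIT of 8 bytes (say the
   access starts at &a[1] for 8-byte elements), the computed misalignment is
   8 bytes, and peeling one scalar iteration would make the access
   aligned.  */
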
/* Function vect_update_misalignment_for_peel.
   Sets DR's misalignment
   - to 0 if it has the same alignment as DR_PEEL,
   - to the misalignment computed using NPEEL if DR's misalignment is known,
   - to -1 (unknown) otherwise.

   DR - the data reference whose misalignment is to be adjusted.
   DR_PEEL - the data reference whose misalignment is being made
	     zero in the vector loop by the peel.
   NPEEL - the number of iterations in the peel loop if the misalignment
	   of DR_PEEL is known at compile time.  */

static void
vect_update_misalignment_for_peel (struct data_reference *dr,
				   struct data_reference *dr_peel, int npeel)
{
  unsigned int i;
  vec<dr_p> same_aligned_drs;
  struct data_reference *current_dr;
  int dr_size = vect_get_scalar_dr_size (dr);
  int dr_peel_size = vect_get_scalar_dr_size (dr_peel);
  stmt_vec_info stmt_info = vinfo_for_stmt (vect_dr_stmt (dr));
  stmt_vec_info peel_stmt_info = vinfo_for_stmt (vect_dr_stmt (dr_peel));

  /* For interleaved data accesses the step in the loop must be multiplied by
     the size of the interleaving group.  */
  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    dr_size *= DR_GROUP_SIZE (vinfo_for_stmt (DR_GROUP_FIRST_ELEMENT (stmt_info)));
  if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
    dr_peel_size *= DR_GROUP_SIZE (peel_stmt_info);

  /* It can be assumed that the data refs with the same alignment as dr_peel
     are aligned in the vector loop.  */
  same_aligned_drs
    = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (vect_dr_stmt (dr_peel)));
  FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
    {
      if (current_dr != dr)
	continue;
      gcc_assert (!known_alignment_for_access_p (dr)
		  || !known_alignment_for_access_p (dr_peel)
		  || (DR_MISALIGNMENT (dr) / dr_size
		      == DR_MISALIGNMENT (dr_peel) / dr_peel_size));
      SET_DR_MISALIGNMENT (dr, 0);
      return;
    }

  if (known_alignment_for_access_p (dr)
      && known_alignment_for_access_p (dr_peel))
    {
      bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
      int misal = DR_MISALIGNMENT (dr);
      misal += negative ? -npeel * dr_size : npeel * dr_size;
      misal &= DR_TARGET_ALIGNMENT (dr) - 1;
      SET_DR_MISALIGNMENT (dr, misal);
      return;
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment "
		     "to unknown (-1).\n");
  SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
}

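/* For illustration (numbers are hypothetical): with 4-byte elements and a
   16-byte target alignment, if DR_PEEL needs NPEEL = 2 iterations to become
   aligned, a DR with known misalignment 4 is updated to
   (4 + 2 * 4) & 15 = 12, while a DR with unknown misalignment is reset
   to -1.  */
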
/* Function verify_data_ref_alignment

   Return TRUE if DR can be handled with respect to alignment.  */

static bool
verify_data_ref_alignment (data_reference_p dr)
{
  enum dr_alignment_support supportable_dr_alignment
    = vect_supportable_dr_alignment (dr, false);
  if (!supportable_dr_alignment)
    {
      if (dump_enabled_p ())
	{
	  if (DR_IS_READ (dr))
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "not vectorized: unsupported unaligned load.");
	  else
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "not vectorized: unsupported unaligned "
			     "store.");

	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
			     DR_REF (dr));
	  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
	}
      return false;
    }

  if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "Vectorizing an unaligned access.\n");

  return true;
}

/* Function vect_verify_datarefs_alignment

   Return TRUE if all data references in the loop can be
   handled with respect to alignment.  */

bool
vect_verify_datarefs_alignment (loop_vec_info vinfo)
{
  vec<data_reference_p> datarefs = vinfo->shared->datarefs;
  struct data_reference *dr;
  unsigned int i;

  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      gimple *stmt = vect_dr_stmt (dr);
      stmt_vec_info stmt_info = vinfo_for_stmt (stmt);

      if (!STMT_VINFO_RELEVANT_P (stmt_info))
	continue;

      /* For interleaving, only the alignment of the first access matters.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
	  && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt)
	continue;

      /* Strided accesses perform only component accesses, alignment is
	 irrelevant for them.  */
      if (STMT_VINFO_STRIDED_P (stmt_info)
	  && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
	continue;

      if (! verify_data_ref_alignment (dr))
	return false;
    }

  return true;
}

/* Given a memory reference EXP return whether its alignment is less
   than its size.  */

static bool
not_size_aligned (tree exp)
{
  if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
    return true;

  return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
	  > get_object_alignment (exp));
}

/* Function vector_alignment_reachable_p

   Return true if vector alignment for DR is reachable by peeling
   a few loop iterations.  Return false otherwise.  */

static bool
vector_alignment_reachable_p (struct data_reference *dr)
{
  gimple *stmt = vect_dr_stmt (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);

  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    {
      /* For interleaved access we peel only if number of iterations in
	 the prolog loop ({VF - misalignment}), is a multiple of the
	 number of the interleaved accesses.  */
      int elem_size, mis_in_elements;

      /* FORNOW: handle only known alignment.  */
      if (!known_alignment_for_access_p (dr))
	return false;

      poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
      poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
      elem_size = vector_element_size (vector_size, nelements);
      mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;

      if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
	return false;
    }

  /* If misalignment is known at the compile time then allow peeling
     only if natural alignment is reachable through peeling.  */
  if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
    {
      HOST_WIDE_INT elmsize =
		int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_NOTE, vect_location,
			   "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
	  dump_printf (MSG_NOTE,
		       ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
	}
      if (DR_MISALIGNMENT (dr) % elmsize)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "data size does not divide the misalignment.\n");
	  return false;
	}
    }

  if (!known_alignment_for_access_p (dr))
    {
      tree type = TREE_TYPE (DR_REF (dr));
      bool is_packed = not_size_aligned (DR_REF (dr));
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "Unknown misalignment, %snaturally aligned\n",
			 is_packed ? "not " : "");
      return targetm.vectorize.vector_alignment_reachable (type, is_packed);
    }

  return true;
}

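/* For illustration (numbers are hypothetical): with 8 elements per vector
   and a group of 4 interleaved accesses, a misalignment of 3 elements would
   require peeling 8 - 3 = 5 scalar iterations, which is not a multiple of
   the group size, so vector alignment is reported as unreachable.  */
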
/* Calculate the cost of the memory access represented by DR.  */

static void
vect_get_data_access_cost (struct data_reference *dr,
			   unsigned int *inside_cost,
			   unsigned int *outside_cost,
			   stmt_vector_for_cost *body_cost_vec,
			   stmt_vector_for_cost *prologue_cost_vec)
{
  gimple *stmt = vect_dr_stmt (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  int ncopies;

  if (PURE_SLP_STMT (stmt_info))
    ncopies = 1;
  else
    ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));

  if (DR_IS_READ (dr))
    vect_get_load_cost (stmt_info, ncopies, true, inside_cost, outside_cost,
			prologue_cost_vec, body_cost_vec, false);
  else
    vect_get_store_cost (stmt_info, ncopies, inside_cost, body_cost_vec);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "vect_get_data_access_cost: inside_cost = %d, "
		     "outside_cost = %d.\n", *inside_cost, *outside_cost);
}

typedef struct _vect_peel_info
{
  struct data_reference *dr;
  int npeel;
  unsigned int count;
} *vect_peel_info;

typedef struct _vect_peel_extended_info
{
  struct _vect_peel_info peel_info;
  unsigned int inside_cost;
  unsigned int outside_cost;
} *vect_peel_extended_info;

/* Peeling hashtable helpers.  */

struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
{
  static inline hashval_t hash (const _vect_peel_info *);
  static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
};

inline hashval_t
peel_info_hasher::hash (const _vect_peel_info *peel_info)
{
  return (hashval_t) peel_info->npeel;
}

inline bool
peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
{
  return (a->npeel == b->npeel);
}

/* Insert DR into peeling hash table with NPEEL as key.  */

static void
vect_peeling_hash_insert (hash_table <peel_info_hasher> *peeling_htab,
			  loop_vec_info loop_vinfo, struct data_reference *dr,
			  int npeel)
{
  struct _vect_peel_info elem, *slot;
  _vect_peel_info **new_slot;
  bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);

  elem.npeel = npeel;
  slot = peeling_htab->find (&elem);
  if (slot)
    slot->count++;
  else
    {
      slot = XNEW (struct _vect_peel_info);
      slot->npeel = npeel;
      slot->dr = dr;
      slot->count = 1;
      new_slot = peeling_htab->find_slot (slot, INSERT);
      *new_slot = slot;
    }

  if (!supportable_dr_alignment
      && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
    slot->count += VECT_MAX_COST;
}

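/* For illustration (numbers are hypothetical): with a 16-byte target
   alignment, 4-byte elements and a misalignment of 8 bytes, callers such as
   vect_enhance_data_refs_alignment insert this DR with npeel values
   2, 6, 10, ... (stepping by target_align / dr_size), so the cost model can
   compare every peeling amount that would realign it.  */
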
/* Traverse peeling hash table to find peeling option that aligns maximum
   number of data accesses.  */

int
vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
				     _vect_peel_extended_info *max)
{
  vect_peel_info elem = *slot;

  if (elem->count > max->peel_info.count
      || (elem->count == max->peel_info.count
	  && max->peel_info.npeel > elem->npeel))
    {
      max->peel_info.npeel = elem->npeel;
      max->peel_info.count = elem->count;
      max->peel_info.dr = elem->dr;
    }

  return 1;
}

/* Get the costs of peeling NPEEL iterations checking data access costs
   for all data refs.  If UNKNOWN_MISALIGNMENT is true, we assume DR0's
   misalignment will be zero after peeling.  */

static void
vect_get_peeling_costs_all_drs (vec<data_reference_p> datarefs,
				struct data_reference *dr0,
				unsigned int *inside_cost,
				unsigned int *outside_cost,
				stmt_vector_for_cost *body_cost_vec,
				stmt_vector_for_cost *prologue_cost_vec,
				unsigned int npeel,
				bool unknown_misalignment)
{
  unsigned i;
  data_reference *dr;

  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      gimple *stmt = vect_dr_stmt (dr);
      stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
      if (!STMT_VINFO_RELEVANT_P (stmt_info))
	continue;

      /* For interleaving, only the alignment of the first access
	 matters.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
	  && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt)
	continue;

      /* Strided accesses perform only component accesses, alignment is
	 irrelevant for them.  */
      if (STMT_VINFO_STRIDED_P (stmt_info)
	  && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
	continue;

      int save_misalignment;
      save_misalignment = DR_MISALIGNMENT (dr);
      if (npeel == 0)
	;
      else if (unknown_misalignment && dr == dr0)
	SET_DR_MISALIGNMENT (dr, 0);
      else
	vect_update_misalignment_for_peel (dr, dr0, npeel);
      vect_get_data_access_cost (dr, inside_cost, outside_cost,
				 body_cost_vec, prologue_cost_vec);
      SET_DR_MISALIGNMENT (dr, save_misalignment);
    }
}

/* Traverse peeling hash table and calculate cost for each peeling option.
   Find the one with the lowest cost.  */

int
vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
				   _vect_peel_extended_info *min)
{
  vect_peel_info elem = *slot;
  int dummy;
  unsigned int inside_cost = 0, outside_cost = 0;
  gimple *stmt = vect_dr_stmt (elem->dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
		       epilogue_cost_vec;

  prologue_cost_vec.create (2);
  body_cost_vec.create (2);
  epilogue_cost_vec.create (2);

  vect_get_peeling_costs_all_drs (LOOP_VINFO_DATAREFS (loop_vinfo),
				  elem->dr, &inside_cost, &outside_cost,
				  &body_cost_vec, &prologue_cost_vec,
				  elem->npeel, false);

  body_cost_vec.release ();

  outside_cost += vect_get_known_peeling_cost
    (loop_vinfo, elem->npeel, &dummy,
     &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
     &prologue_cost_vec, &epilogue_cost_vec);

  /* Prologue and epilogue costs are added to the target model later.
     These costs depend only on the scalar iteration cost, the
     number of peeling iterations finally chosen, and the number of
     misaligned statements.  So discard the information found here.  */
  prologue_cost_vec.release ();
  epilogue_cost_vec.release ();

  if (inside_cost < min->inside_cost
      || (inside_cost == min->inside_cost
	  && outside_cost < min->outside_cost))
    {
      min->inside_cost = inside_cost;
      min->outside_cost = outside_cost;
      min->peel_info.dr = elem->dr;
      min->peel_info.npeel = elem->npeel;
      min->peel_info.count = elem->count;
    }

  return 1;
}

/* Choose best peeling option by traversing peeling hash table and either
   choosing an option with the lowest cost (if cost model is enabled) or the
   option that aligns as many accesses as possible.  */

static struct _vect_peel_extended_info
vect_peeling_hash_choose_best_peeling (hash_table <peel_info_hasher> *peeling_htab,
				       loop_vec_info loop_vinfo)
{
  struct _vect_peel_extended_info res;

  res.peel_info.dr = NULL;

  if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
    {
      res.inside_cost = INT_MAX;
      res.outside_cost = INT_MAX;
      peeling_htab->traverse <_vect_peel_extended_info *,
			      vect_peeling_hash_get_lowest_cost> (&res);
    }
  else
    {
      res.peel_info.count = 0;
      peeling_htab->traverse <_vect_peel_extended_info *,
			      vect_peeling_hash_get_most_frequent> (&res);
      res.inside_cost = 0;
      res.outside_cost = 0;
    }

  return res;
}

/* Return true if the new peeling NPEEL is supported.  */

static bool
vect_peeling_supportable (loop_vec_info loop_vinfo, struct data_reference *dr0,
			  unsigned npeel)
{
  unsigned i;
  struct data_reference *dr = NULL;
  vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
  gimple *stmt;
  stmt_vec_info stmt_info;
  enum dr_alignment_support supportable_dr_alignment;

  /* Ensure that all data refs can be vectorized after the peel.  */
  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      int save_misalignment;

      if (dr == dr0)
	continue;

      stmt = vect_dr_stmt (dr);
      stmt_info = vinfo_for_stmt (stmt);
      /* For interleaving, only the alignment of the first access
	 matters.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
	  && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt)
	continue;

      /* Strided accesses perform only component accesses, alignment is
	 irrelevant for them.  */
      if (STMT_VINFO_STRIDED_P (stmt_info)
	  && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
	continue;

      save_misalignment = DR_MISALIGNMENT (dr);
      vect_update_misalignment_for_peel (dr, dr0, npeel);
      supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
      SET_DR_MISALIGNMENT (dr, save_misalignment);

      if (!supportable_dr_alignment)
	return false;
    }

  return true;
}

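/* For illustration (hypothetical scenario): if peeling NPEEL iterations
   would align DR0 but leave another data reference misaligned on a target
   that cannot perform misaligned vector accesses of that mode, the check
   above rejects this NPEEL and the caller must try a different peeling
   amount or fall back to versioning.  */
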
1578 /* Function vect_enhance_data_refs_alignment
1580 This pass will use loop versioning and loop peeling in order to enhance
1581 the alignment of data references in the loop.
1583 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1584 original loop is to be vectorized. Any other loops that are created by
1585 the transformations performed in this pass - are not supposed to be
1586 vectorized. This restriction will be relaxed.
1588 This pass will require a cost model to guide it whether to apply peeling
1589 or versioning or a combination of the two. For example, the scheme that
1590 intel uses when given a loop with several memory accesses, is as follows:
1591 choose one memory access ('p') which alignment you want to force by doing
1592 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1593 other accesses are not necessarily aligned, or (2) use loop versioning to
1594 generate one loop in which all accesses are aligned, and another loop in
1595 which only 'p' is necessarily aligned.
1597 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1598 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
1599 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1601 Devising a cost model is the most critical aspect of this work. It will
1602 guide us on which access to peel for, whether to use loop versioning, how
1603 many versions to create, etc. The cost model will probably consist of
1604 generic considerations as well as target specific considerations (on
1605 powerpc for example, misaligned stores are more painful than misaligned
1608 Here are the general steps involved in alignment enhancements:
1610 -- original loop, before alignment analysis:
1611 for (i=0; i<N; i++){
1612 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1613 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1616 -- After vect_compute_data_refs_alignment:
1617 for (i=0; i<N; i++){
1618 x = q[i]; # DR_MISALIGNMENT(q) = 3
1619 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1622 -- Possibility 1: we do loop versioning:
1624 for (i=0; i<N; i++){ # loop 1A
1625 x = q[i]; # DR_MISALIGNMENT(q) = 3
1626 p[i] = y; # DR_MISALIGNMENT(p) = 0
1630 for (i=0; i<N; i++){ # loop 1B
1631 x = q[i]; # DR_MISALIGNMENT(q) = 3
1632 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1636 -- Possibility 2: we do loop peeling:
1637 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1641 for (i = 3; i < N; i++){ # loop 2A
1642 x = q[i]; # DR_MISALIGNMENT(q) = 0
1643 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1646 -- Possibility 3: combination of loop peeling and versioning:
1647 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1652 for (i = 3; i<N; i++){ # loop 3A
1653 x = q[i]; # DR_MISALIGNMENT(q) = 0
1654 p[i] = y; # DR_MISALIGNMENT(p) = 0
1658 for (i = 3; i<N; i++){ # loop 3B
1659 x = q[i]; # DR_MISALIGNMENT(q) = 0
1660 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1664 These loops are later passed to loop_transform to be vectorized. The
1665 vectorizer will use the alignment information to guide the transformation
1666 (whether to generate regular loads/stores, or with special handling for
1670 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo
)
1672 vec
<data_reference_p
> datarefs
= LOOP_VINFO_DATAREFS (loop_vinfo
);
1673 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1674 enum dr_alignment_support supportable_dr_alignment
;
1675 struct data_reference
*dr0
= NULL
, *first_store
= NULL
;
1676 struct data_reference
*dr
;
1678 bool do_peeling
= false;
1679 bool do_versioning
= false;
1682 stmt_vec_info stmt_info
;
1683 unsigned int npeel
= 0;
1684 bool one_misalignment_known
= false;
1685 bool one_misalignment_unknown
= false;
1686 bool one_dr_unsupportable
= false;
1687 struct data_reference
*unsupportable_dr
= NULL
;
1688 poly_uint64 vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
1689 unsigned possible_npeel_number
= 1;
1691 unsigned int mis
, same_align_drs_max
= 0;
1692 hash_table
<peel_info_hasher
> peeling_htab (1);
1694 DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
1696 /* Reset data so we can safely be called multiple times. */
1697 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).truncate (0);
1698 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
) = 0;
1700 /* While cost model enhancements are expected in the future, the high level
1701 view of the code at this time is as follows:
1703 A) If there is a misaligned access then see if peeling to align
1704 this access can make all data references satisfy
1705 vect_supportable_dr_alignment. If so, update data structures
1706 as needed and return true.
1708 B) If peeling wasn't possible and there is a data reference with an
1709 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1710 then see if loop versioning checks can be used to make all data
1711 references satisfy vect_supportable_dr_alignment. If so, update
1712 data structures as needed and return true.
1714 C) If neither peeling nor versioning were successful then return false if
1715 any data reference does not satisfy vect_supportable_dr_alignment.
1717 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1719 Note, Possibility 3 above (which is peeling and versioning together) is not
1720 being done at this time. */
1722 /* (1) Peeling to force alignment. */
1724 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1726 + How many accesses will become aligned due to the peeling
1727 - How many accesses will become unaligned due to the peeling,
1728 and the cost of misaligned accesses.
1729 - The cost of peeling (the extra runtime checks, the increase
1732 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
1734 stmt
= vect_dr_stmt (dr
);
1735 stmt_info
= vinfo_for_stmt (stmt
);
1737 if (!STMT_VINFO_RELEVANT_P (stmt_info
))
1740 /* For interleaving, only the alignment of the first access
1742 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1743 && DR_GROUP_FIRST_ELEMENT (stmt_info
) != stmt
)
1746 /* For scatter-gather or invariant accesses there is nothing
1748 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info
)
1749 || integer_zerop (DR_STEP (dr
)))
1752 /* Strided accesses perform only component accesses, alignment is
1753 irrelevant for them. */
1754 if (STMT_VINFO_STRIDED_P (stmt_info
)
1755 && !STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1758 supportable_dr_alignment
= vect_supportable_dr_alignment (dr
, true);
1759 do_peeling
= vector_alignment_reachable_p (dr
);
1762 if (known_alignment_for_access_p (dr
))
1764 unsigned int npeel_tmp
= 0;
1765 bool negative
= tree_int_cst_compare (DR_STEP (dr
),
1766 size_zero_node
) < 0;
1768 vectype
= STMT_VINFO_VECTYPE (stmt_info
);
1769 unsigned int target_align
= DR_TARGET_ALIGNMENT (dr
);
1770 unsigned int dr_size
= vect_get_scalar_dr_size (dr
);
1771 mis
= (negative
? DR_MISALIGNMENT (dr
) : -DR_MISALIGNMENT (dr
));
1772 if (DR_MISALIGNMENT (dr
) != 0)
1773 npeel_tmp
= (mis
& (target_align
- 1)) / dr_size
;
1775 /* For multiple types, it is possible that the bigger type access
1776 will have more than one peeling option. E.g., a loop with two
1777 types: one of size (vector size / 4), and the other one of
1778 size (vector size / 8). Vectorization factor will 8. If both
1779 accesses are misaligned by 3, the first one needs one scalar
1780 iteration to be aligned, and the second one needs 5. But the
1781 first one will be aligned also by peeling 5 scalar
1782 iterations, and in that case both accesses will be aligned.
1783 Hence, except for the immediate peeling amount, we also want
1784 to try to add full vector size, while we don't exceed
1785 vectorization factor.
1786 We do this automatically for cost model, since we calculate
1787 cost for every peeling option. */
1788 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo
)))
1790 poly_uint64 nscalars
= (STMT_SLP_TYPE (stmt_info
)
1791 ? vf
* DR_GROUP_SIZE (stmt_info
) : vf
);
1792 possible_npeel_number
1793 = vect_get_num_vectors (nscalars
, vectype
);
1795 /* NPEEL_TMP is 0 when there is no misalignment, but also
1796 allow peeling NELEMENTS. */
1797 if (DR_MISALIGNMENT (dr
) == 0)
1798 possible_npeel_number
++;
1801 /* Save info about DR in the hash table. Also include peeling
1802 amounts according to the explanation above. */
1803 for (j
= 0; j
< possible_npeel_number
; j
++)
1805 vect_peeling_hash_insert (&peeling_htab
, loop_vinfo
,
1807 npeel_tmp
+= target_align
/ dr_size
;
          one_misalignment_known = true;

          /* If we don't know any misalignment values, we prefer
             peeling for the data-ref that has the maximum number of data-refs
             with the same alignment, unless the target prefers to align
             stores over loads.  */
          unsigned same_align_drs
            = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
              || same_align_drs_max < same_align_drs)
              same_align_drs_max = same_align_drs;
          /* For data-refs with the same number of related
             accesses prefer the one where the misalign
             computation will be invariant in the outermost loop.  */
          else if (same_align_drs_max == same_align_drs)
              struct loop *ivloop0, *ivloop;
              ivloop0 = outermost_invariant_loop_for_expr
                (loop, DR_BASE_ADDRESS (dr0));
              ivloop = outermost_invariant_loop_for_expr
                (loop, DR_BASE_ADDRESS (dr));
              if ((ivloop && !ivloop0)
                  || (ivloop && ivloop0
                      && flow_loop_nested_p (ivloop, ivloop0)))

          one_misalignment_unknown = true;

          /* Check for data refs with unsupportable alignment that
             can be peeled.  */
          if (!supportable_dr_alignment)
              one_dr_unsupportable = true;
              unsupportable_dr = dr;

          if (!first_store && DR_IS_WRITE (dr))

      if (!aligned_access_p (dr))
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "vector alignment may not be reachable\n");

  /* Check if we can possibly peel the loop.  */
  if (!vect_can_advance_ivs_p (loop_vinfo)
      || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
  struct _vect_peel_extended_info peel_for_known_alignment;
  struct _vect_peel_extended_info peel_for_unknown_alignment;
  struct _vect_peel_extended_info best_peel;

  peel_for_unknown_alignment.inside_cost = INT_MAX;
  peel_for_unknown_alignment.outside_cost = INT_MAX;
  peel_for_unknown_alignment.peel_info.count = 0;

      && one_misalignment_unknown)
      /* Check if the target requires us to prefer stores over loads, i.e., if
         misaligned stores are more expensive than misaligned loads (taking
         drs with the same alignment into account).  */
      unsigned int load_inside_cost = 0;
      unsigned int load_outside_cost = 0;
      unsigned int store_inside_cost = 0;
      unsigned int store_outside_cost = 0;
      unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;

      stmt_vector_for_cost dummy;
      vect_get_peeling_costs_all_drs (datarefs, dr0,
                                      &dummy, &dummy, estimated_npeels, true);

          vect_get_peeling_costs_all_drs (datarefs, first_store,
                                          &store_outside_cost,
                                          estimated_npeels, true);

          store_inside_cost = INT_MAX;
          store_outside_cost = INT_MAX;

      if (load_inside_cost > store_inside_cost
          || (load_inside_cost == store_inside_cost
              && load_outside_cost > store_outside_cost))
          peel_for_unknown_alignment.inside_cost = store_inside_cost;
          peel_for_unknown_alignment.outside_cost = store_outside_cost;

          peel_for_unknown_alignment.inside_cost = load_inside_cost;
          peel_for_unknown_alignment.outside_cost = load_outside_cost;

      stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
      prologue_cost_vec.create (2);
      epilogue_cost_vec.create (2);

      peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
        (loop_vinfo, estimated_npeels, &dummy2,
         &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
         &prologue_cost_vec, &epilogue_cost_vec);

      prologue_cost_vec.release ();
      epilogue_cost_vec.release ();

      peel_for_unknown_alignment.peel_info.count = 1
        + STMT_VINFO_SAME_ALIGN_REFS
            (vinfo_for_stmt (vect_dr_stmt (dr0))).length ();

  peel_for_unknown_alignment.peel_info.npeel = 0;
  peel_for_unknown_alignment.peel_info.dr = dr0;

  best_peel = peel_for_unknown_alignment;

  peel_for_known_alignment.inside_cost = INT_MAX;
  peel_for_known_alignment.outside_cost = INT_MAX;
  peel_for_known_alignment.peel_info.count = 0;
  peel_for_known_alignment.peel_info.dr = NULL;

  if (do_peeling && one_misalignment_known)
      /* Peeling is possible, but there is no data access that is not supported
         unless aligned.  So we try to choose the best possible peeling from
         the hash table.  */
      peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
        (&peeling_htab, loop_vinfo);

  /* Compare costs of peeling for known and unknown alignment.  */
  if (peel_for_known_alignment.peel_info.dr != NULL
      && peel_for_unknown_alignment.inside_cost
         >= peel_for_known_alignment.inside_cost)
      best_peel = peel_for_known_alignment;

  /* If the best peeling for known alignment has NPEEL == 0, perform no
     peeling at all except if there is an unsupportable dr that we can
     align.  */
  if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)

  /* If there is an unsupportable data ref, prefer this over all choices so far
     since we'd have to discard a chosen peeling except when it accidentally
     aligned the unsupportable data ref.  */
  if (one_dr_unsupportable)
    dr0 = unsupportable_dr;
  else if (do_peeling)
      /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
         TODO: Use nopeel_outside_cost or get rid of it?  */
      unsigned nopeel_inside_cost = 0;
      unsigned nopeel_outside_cost = 0;

      stmt_vector_for_cost dummy;
      vect_get_peeling_costs_all_drs (datarefs, NULL, &nopeel_inside_cost,
                                      &nopeel_outside_cost, &dummy, &dummy,

      /* Add epilogue costs.  As we do not peel for alignment here, no prologue
         costs will be recorded.  */
      stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
      prologue_cost_vec.create (2);
      epilogue_cost_vec.create (2);

      nopeel_outside_cost += vect_get_known_peeling_cost
        (loop_vinfo, 0, &dummy2,
         &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
         &prologue_cost_vec, &epilogue_cost_vec);

      prologue_cost_vec.release ();
      epilogue_cost_vec.release ();

      npeel = best_peel.peel_info.npeel;
      dr0 = best_peel.peel_info.dr;

      /* If not peeling is no more expensive than the best peeling found
         so far, don't perform any peeling.  */
      if (nopeel_inside_cost <= best_peel.inside_cost)
      stmt = vect_dr_stmt (dr0);
      stmt_info = vinfo_for_stmt (stmt);
      vectype = STMT_VINFO_VECTYPE (stmt_info);

      if (known_alignment_for_access_p (dr0))
          bool negative = tree_int_cst_compare (DR_STEP (dr0),
                                                size_zero_node) < 0;
              /* Since it's known at compile time, compute the number of
                 iterations in the peeled loop (the peeling factor) for use in
                 updating DR_MISALIGNMENT values.  The peeling factor is the
                 vectorization factor minus the misalignment as an element
                 count.  */
              mis = negative ? DR_MISALIGNMENT (dr0) : -DR_MISALIGNMENT (dr0);
              unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
              npeel = ((mis & (target_align - 1))
                       / vect_get_scalar_dr_size (dr0));

          /* For interleaved data access every iteration accesses all the
             members of the group, therefore we divide the number of iterations
             by the group size.  */
          stmt_info = vinfo_for_stmt (vect_dr_stmt (dr0));
          if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
            npeel /= DR_GROUP_SIZE (stmt_info);

          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "Try peeling by %d\n", npeel);

      /* Ensure that all datarefs can be vectorized after the peel.  */
      if (!vect_peeling_supportable (loop_vinfo, dr0, npeel))

      /* Check if all datarefs are supportable and log.  */
      if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
          stat = vect_verify_datarefs_alignment (loop_vinfo);
      /* Cost model #1 - honor --param vect-max-peeling-for-alignment.  */
          unsigned max_allowed_peel
            = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
          if (max_allowed_peel != (unsigned)-1)
              unsigned max_peel = npeel;
                  unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
                  max_peel = target_align / vect_get_scalar_dr_size (dr0) - 1;
              if (max_peel > max_allowed_peel)
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_NOTE, vect_location,
                                     "Disable peeling, max peels reached: %d\n",
                                     max_peel);

      /* Cost model #2 - if peeling may result in a remaining loop not
         iterating enough to be vectorized then do not peel.  Since this
         is a cost heuristic rather than a correctness decision, use the
         most likely runtime value for variable vectorization factors.  */
          && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
          unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
          unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
          if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
              < assumed_vf + max_peel)
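          /* Illustrative example: with assumed_vf = 8 and npeel = 3 this
             heuristic requires at least 8 + 3 = 11 known scalar iterations;
             with fewer, the loop remaining after peeling might not reach
             even one full vector iteration, so peeling is not worthwhile.  */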
      /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
         If the misalignment of DR_i is identical to that of dr0 then set
         DR_MISALIGNMENT (DR_i) to zero.  If the misalignment of DR_i and
         dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
         by the peeling factor times the element size of DR_i (MOD the
         vectorization factor times the size).  Otherwise, the
         misalignment of DR_i must be set to unknown.  */
      FOR_EACH_VEC_ELT (datarefs, i, dr)
          /* Strided accesses perform only component accesses, alignment
             is irrelevant for them.  */
          stmt_info = vinfo_for_stmt (vect_dr_stmt (dr));
          if (STMT_VINFO_STRIDED_P (stmt_info)
              && !STMT_VINFO_GROUPED_ACCESS (stmt_info))

          vect_update_misalignment_for_peel (dr, dr0, npeel);

      LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
        LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
        LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
          = DR_MISALIGNMENT (dr0);
      SET_DR_MISALIGNMENT (dr0, 0);
      if (dump_enabled_p ())
          dump_printf_loc (MSG_NOTE, vect_location,
                           "Alignment of access forced using peeling.\n");
          dump_printf_loc (MSG_NOTE, vect_location,
                           "Peeling for alignment will be applied.\n");

      /* The inside-loop cost will be accounted for in vectorizable_load
         and vectorizable_store correctly with adjusted alignments.
         Drop the body_cst_vec on the floor here.  */
      stat = vect_verify_datarefs_alignment (loop_vinfo);
  /* (2) Versioning to force alignment.  */

  /* Try versioning if:
     1) optimize loop for speed
     2) there is at least one unsupported misaligned data ref with an unknown
        misalignment, and
     3) all misaligned data refs with a known misalignment are supported, and
     4) the number of runtime alignment checks is within reason.  */

    optimize_loop_nest_for_speed_p (loop)
    && (!loop->inner); /* FORNOW */

      FOR_EACH_VEC_ELT (datarefs, i, dr)
          stmt = vect_dr_stmt (dr);
          stmt_info = vinfo_for_stmt (stmt);

          /* For interleaving, only the alignment of the first access
             matters.  */
          if (aligned_access_p (dr)
              || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
                  && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt))

          if (STMT_VINFO_STRIDED_P (stmt_info))
              /* Strided loads perform only component accesses, alignment is
                 irrelevant for them.  */
              if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
              do_versioning = false;

          supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);

          if (!supportable_dr_alignment)
              if (known_alignment_for_access_p (dr)
                  || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
                     >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
                  do_versioning = false;

              stmt = vect_dr_stmt (dr);
              vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
              gcc_assert (vectype);

              /* At present we don't support versioning for alignment
                 with variable VF, since there's no guarantee that the
                 VF is a power of two.  We could relax this if we added
                 a way of enforcing a power-of-two size.  */
              unsigned HOST_WIDE_INT size;
              if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
                  do_versioning = false;

              /* The rightmost bits of an aligned address must be zeros.
                 Construct the mask needed for this test.  For example,
                 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
                 mask must be 15 = 0xf.  */

              /* FORNOW: use the same mask to test all potentially unaligned
                 references in the loop.  The vectorizer currently supports
                 a single vector size, see the reference to
                 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
                 vectorization factor is computed.  */
              gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
                          || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
              LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
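              /* The versioning condition built later from this mask
                 effectively checks that the low bits of every collected
                 address are clear, e.g. for V4SI something like
                 ((addr_1 | addr_2 | ...) & 0xf) == 0 selects the aligned
                 (vectorized) version of the loop.  */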
              LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (

      /* Versioning requires at least one misaligned data reference.  */
      if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
        do_versioning = false;
      else if (!do_versioning)
        LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);

      vec<gimple *> may_misalign_stmts
        = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);

      /* It can now be assumed that the data references in the statements
         in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
         of the loop being vectorized.  */
      FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
          stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
          dr = STMT_VINFO_DATA_REF (stmt_info);
          SET_DR_MISALIGNMENT (dr, 0);
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "Alignment of access forced using versioning.\n");

      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "Versioning for alignment will be applied.\n");

      /* Peeling and versioning can't be done together at this time.  */
      gcc_assert (! (do_peeling && do_versioning));

      stat = vect_verify_datarefs_alignment (loop_vinfo);

  /* This point is reached if neither peeling nor versioning is being done.  */
  gcc_assert (! (do_peeling || do_versioning));

  stat = vect_verify_datarefs_alignment (loop_vinfo);
/* Function vect_find_same_alignment_drs.

   Update group and alignment relations according to the chosen
   vectorization factor.  */

vect_find_same_alignment_drs (struct data_dependence_relation *ddr)
  struct data_reference *dra = DDR_A (ddr);
  struct data_reference *drb = DDR_B (ddr);
  stmt_vec_info stmtinfo_a = vinfo_for_stmt (vect_dr_stmt (dra));
  stmt_vec_info stmtinfo_b = vinfo_for_stmt (vect_dr_stmt (drb));

  if (DDR_ARE_DEPENDENT (ddr) == chrec_known)

  if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
      || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))

  if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
      || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
      || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))

  /* Two references with distance zero have the same alignment.  */
  poly_offset_int diff = (wi::to_poly_offset (DR_INIT (dra))
                          - wi::to_poly_offset (DR_INIT (drb)));
  if (maybe_ne (diff, 0))
      /* Get the wider of the two alignments.  */
      unsigned int align_a = (vect_calculate_target_alignment (dra)
      unsigned int align_b = (vect_calculate_target_alignment (drb)
      unsigned int max_align = MAX (align_a, align_b);

      /* Require the gap to be a multiple of the larger vector alignment.  */
      if (!multiple_p (diff, max_align))

  STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
  STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
  if (dump_enabled_p ())
      dump_printf_loc (MSG_NOTE, vect_location,
                       "accesses have the same alignment: ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
      dump_printf (MSG_NOTE, " and ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
      dump_printf (MSG_NOTE, "\n");
/* Function vect_analyze_data_refs_alignment

   Analyze the alignment of the data-references in the loop.
   Return FALSE if a data reference is found that cannot be vectorized.  */

vect_analyze_data_refs_alignment (loop_vec_info vinfo)
  DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");

  /* Mark groups of data references with same alignment using
     data dependence information.  */
  vec<ddr_p> ddrs = vinfo->shared->ddrs;
  struct data_dependence_relation *ddr;

  FOR_EACH_VEC_ELT (ddrs, i, ddr)
    vect_find_same_alignment_drs (ddr);

  vec<data_reference_p> datarefs = vinfo->shared->datarefs;
  struct data_reference *dr;

  vect_record_base_alignments (vinfo);
  FOR_EACH_VEC_ELT (datarefs, i, dr)
      stmt_vec_info stmt_info = vinfo_for_stmt (vect_dr_stmt (dr));
      if (STMT_VINFO_VECTORIZABLE (stmt_info))
        vect_compute_data_ref_alignment (dr);
/* Analyze alignment of DRs of stmts in NODE.  */

vect_slp_analyze_and_verify_node_alignment (slp_tree node)
  /* We vectorize from the first scalar stmt in the node unless
     the node is permuted in which case we start from the first
     element in the group.  */
  gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
  data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
  if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
    first_stmt = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));

  data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
  vect_compute_data_ref_alignment (dr);
  /* For creating the data-ref pointer we need alignment of the
     first element anyway.  */
    vect_compute_data_ref_alignment (first_dr);
  if (! verify_data_ref_alignment (dr))
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: bad data alignment in basic "

/* Function vect_slp_analyze_instance_alignment

   Analyze the alignment of the data-references in the SLP instance.
   Return FALSE if a data reference is found that cannot be vectorized.  */

vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
  DUMP_VECT_SCOPE ("vect_slp_analyze_and_verify_instance_alignment");

  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
    if (! vect_slp_analyze_and_verify_node_alignment (node))

  node = SLP_INSTANCE_TREE (instance);
  if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]))
      && ! vect_slp_analyze_and_verify_node_alignment
           (SLP_INSTANCE_TREE (instance)))

/* Analyze groups of accesses: check that DR belongs to a group of
   accesses of legal size, step, etc.  Detect gaps, single element
   interleaving, and other special cases.  Set grouped access info.
   Collect groups of strided stores for further use in SLP analysis.
   Worker for vect_analyze_group_access.  */
vect_analyze_group_access_1 (struct data_reference *dr)
  tree step = DR_STEP (dr);
  tree scalar_type = TREE_TYPE (DR_REF (dr));
  HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
  gimple *stmt = vect_dr_stmt (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
  HOST_WIDE_INT dr_step = -1;
  HOST_WIDE_INT groupsize, last_accessed_element = 1;
  bool slp_impossible = false;

  /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
     size of the interleaving group (including gaps).  */
  if (tree_fits_shwi_p (step))
      dr_step = tree_to_shwi (step);
      /* Check that STEP is a multiple of type size.  Otherwise there is
         a non-element-sized gap at the end of the group which we
         cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
         ??? As we can handle non-constant step fine here we should
         simply remove uses of DR_GROUP_GAP between the last and first
         element and instead rely on DR_STEP.  DR_GROUP_SIZE then would
         simply not include that gap.  */
      if ((dr_step % type_size) != 0)
          if (dump_enabled_p ())
              dump_printf_loc (MSG_NOTE, vect_location,
              dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
              dump_printf (MSG_NOTE,
                           " is not a multiple of the element size for ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
              dump_printf (MSG_NOTE, "\n");
      groupsize = absu_hwi (dr_step) / type_size;
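      /* Illustrative example: 4-byte int elements accessed with a constant
         step of 16 bytes give groupsize = 16 / 4 = 4, i.e. each scalar
         iteration strides over a group of four element slots, whether or
         not all of them are actually accessed.  */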
  /* A non-consecutive access is possible only if it is part of an
     interleaving group.  */
  if (!DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
      /* Check whether this DR is part of an interleaving group in which it
         is the single element accessed in the loop.  */

      /* Gaps are supported only for loads.  STEP must be a multiple of the
         type size.  */
          && (dr_step % type_size) == 0
          DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
          DR_GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
          DR_GROUP_GAP (stmt_info) = groupsize - 1;
          if (dump_enabled_p ())
              dump_printf_loc (MSG_NOTE, vect_location,
                               "Detected single element interleaving ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
              dump_printf (MSG_NOTE, " step ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
              dump_printf (MSG_NOTE, "\n");

      if (dump_enabled_p ())
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "not consecutive access ");
          dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);

          /* Mark the statement as unvectorizable.  */
          STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (vect_dr_stmt (dr))) = false;

      dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
      STMT_VINFO_STRIDED_P (stmt_info) = true;

  if (DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
      /* First stmt in the interleaving chain.  Check the chain.  */
      gimple *next = DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
      struct data_reference *data_ref = dr;
      unsigned int count = 1;
      tree prev_init = DR_INIT (data_ref);
      gimple *prev = stmt;
      HOST_WIDE_INT diff, gaps = 0;

      /* By construction, all group members have INTEGER_CST DR_INITs.  */

          /* Skip same data-refs.  In case two or more stmts share a data-ref
             (supported only for loads), we vectorize only the first stmt,
             and the rest get their vectorized loads from the first one.  */
          if (!tree_int_cst_compare (DR_INIT (data_ref),
                                     DR_INIT (STMT_VINFO_DATA_REF (
                                                vinfo_for_stmt (next)))))
              if (DR_IS_WRITE (data_ref))
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "Two store stmts share the same dr.\n");

              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Two or more load stmts share the same dr.\n");

              /* For load use the same data-ref load.  */
              DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;

              next = DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));

          data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));

          /* All group members have the same STEP by construction.  */
          gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));

          /* Check that the distance between two accesses is equal to the type
             size.  Otherwise, we have gaps.  */
          diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
                  - TREE_INT_CST_LOW (prev_init)) / type_size;

              /* FORNOW: SLP of accesses with gaps is not supported.  */
              slp_impossible = true;
              if (DR_IS_WRITE (data_ref))
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "interleaved store with gaps\n");

          last_accessed_element += diff;

          /* Store the gap from the previous member of the group.  If there
             is no gap in the access, DR_GROUP_GAP is always 1.  */
          DR_GROUP_GAP (vinfo_for_stmt (next)) = diff;

          prev_init = DR_INIT (data_ref);
          next = DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
          /* Count the number of data-refs in the chain.  */

      groupsize = count + gaps;

      /* This could be UINT_MAX but as we are generating code in a very
         inefficient way we have to cap earlier.  See PR78699 for example.  */
      if (groupsize > 4096)
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "group is too large\n");

      /* Check that the size of the interleaving is equal to count for stores,
         i.e., that there are no gaps.  */
      if (groupsize != count
          && !DR_IS_READ (dr))
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "interleaved store with gaps\n");

      /* If there is a gap after the last load in the group it is the
         difference between the groupsize and the last accessed element.
         When there is no gap, this difference should be 0.  */
      DR_GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
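      /* Illustrative example: a load group with groupsize = 4 whose members
         access only elements 0, 1 and 2 of each group ends the walk above
         with last_accessed_element = 3, so DR_GROUP_GAP of the first
         statement becomes 4 - 3 = 1, recording one unused element at the
         end of every group.  */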
      DR_GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
      if (dump_enabled_p ())
          dump_printf_loc (MSG_NOTE, vect_location,
                           "Detected interleaving ");
          if (DR_IS_READ (dr))
            dump_printf (MSG_NOTE, "load ");
            dump_printf (MSG_NOTE, "store ");
          dump_printf (MSG_NOTE, "of size %u starting with ",
                       (unsigned)groupsize);
          dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
          if (DR_GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
            dump_printf_loc (MSG_NOTE, vect_location,
                             "There is a gap of %u elements after the group\n",
                             DR_GROUP_GAP (vinfo_for_stmt (stmt)));

      /* SLP: create an SLP data structure for every interleaving group of
         stores for further analysis in vect_analyse_slp.  */
      if (DR_IS_WRITE (dr) && !slp_impossible)
            LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
            BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
/* Analyze groups of accesses: check that DR belongs to a group of
   accesses of legal size, step, etc.  Detect gaps, single element
   interleaving, and other special cases.  Set grouped access info.
   Collect groups of strided stores for further use in SLP analysis.  */

vect_analyze_group_access (struct data_reference *dr)
  if (!vect_analyze_group_access_1 (dr))
      /* Dissolve the group if present.  */
      gimple *stmt = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (vect_dr_stmt (dr)));
          stmt_vec_info vinfo = vinfo_for_stmt (stmt);
          next = DR_GROUP_NEXT_ELEMENT (vinfo);
          DR_GROUP_FIRST_ELEMENT (vinfo) = NULL;
          DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
/* Analyze the access pattern of the data-reference DR.
   In case of non-consecutive accesses call vect_analyze_group_access() to
   analyze groups of accesses.  */

vect_analyze_data_ref_access (struct data_reference *dr)
  tree step = DR_STEP (dr);
  tree scalar_type = TREE_TYPE (DR_REF (dr));
  gimple *stmt = vect_dr_stmt (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = NULL;

  if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))

    loop = LOOP_VINFO_LOOP (loop_vinfo);

  if (loop_vinfo && !step)
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "bad data-ref access in loop\n");

  /* Allow loads with zero step in inner-loop vectorization.  */
  if (loop_vinfo && integer_zerop (step))
      DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
      if (!nested_in_vect_loop_p (loop, stmt))
        return DR_IS_READ (dr);
      /* Allow references with zero step for outer loops marked
         with pragma omp simd only - it guarantees absence of
         loop-carried dependencies between inner loop iterations.  */
      if (loop->safelen < 2)
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "zero step in inner loop of nest\n");

  if (loop && nested_in_vect_loop_p (loop, stmt))
      /* Interleaved accesses are not yet supported within outer-loop
         vectorization for references in the inner-loop.  */
      DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;

      /* For the rest of the analysis we use the outer-loop step.  */
      step = STMT_VINFO_DR_STEP (stmt_info);
      if (integer_zerop (step))
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "zero step in outer loop.\n");
          return DR_IS_READ (dr);

  if (TREE_CODE (step) == INTEGER_CST)
      HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
      if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
          && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
          /* Mark that it is not interleaving.  */
          DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;

  if (loop && nested_in_vect_loop_p (loop, stmt))
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "grouped access in outer loop.\n");

  /* Assume this is a DR handled by the non-constant strided load case.  */
  if (TREE_CODE (step) != INTEGER_CST)
    return (STMT_VINFO_STRIDED_P (stmt_info)
            && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
                || vect_analyze_group_access (dr)));

  /* Not a consecutive access - check whether it's part of an interleaving
     group.  */
  return vect_analyze_group_access (dr);
/* Compare two data-references DRA and DRB to group them into chunks
   suitable for grouping.  */

dr_group_sort_cmp (const void *dra_, const void *drb_)
  data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
  data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);

  /* Stabilize sort.  */

  /* DRs in different loops never belong to the same group.  */
  loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
  loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
    return loopa->num < loopb->num ? -1 : 1;

  /* Ordering of DRs according to base.  */
  cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
                               DR_BASE_ADDRESS (drb));

  /* And according to DR_OFFSET.  */
  cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));

  /* Put reads before writes.  */
  if (DR_IS_READ (dra) != DR_IS_READ (drb))
    return DR_IS_READ (dra) ? -1 : 1;

  /* Then sort after access size.  */
  cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
                               TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));

  /* And after step.  */
  cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));

  /* Then sort after DR_INIT.  In case of identical DRs sort after stmt UID.  */
  cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
    return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;

/* If OP is the result of a conversion, return the unconverted value,
   otherwise return null.  */
strip_conversion (tree op)
  if (TREE_CODE (op) != SSA_NAME)
  gimple *stmt = SSA_NAME_DEF_STMT (op);
  if (!is_gimple_assign (stmt)
      || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
  return gimple_assign_rhs1 (stmt);

/* Return true if vectorizable_* routines can handle statements STMT1
   and STMT2 being in a single group.  */

can_group_stmts_p (gimple *stmt1, gimple *stmt2)
  if (gimple_assign_single_p (stmt1))
    return gimple_assign_single_p (stmt2);

  if (is_gimple_call (stmt1) && gimple_call_internal_p (stmt1))
      /* Check for two masked loads or two masked stores.  */
      if (!is_gimple_call (stmt2) || !gimple_call_internal_p (stmt2))
      internal_fn ifn = gimple_call_internal_fn (stmt1);
      if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
      if (ifn != gimple_call_internal_fn (stmt2))

      /* Check that the masks are the same.  Cope with casts of masks,
         like those created by build_mask_conversion.  */
      tree mask1 = gimple_call_arg (stmt1, 2);
      tree mask2 = gimple_call_arg (stmt2, 2);
      if (!operand_equal_p (mask1, mask2, 0))
          mask1 = strip_conversion (mask1);
          mask2 = strip_conversion (mask2);
          if (!operand_equal_p (mask1, mask2, 0))
/* Function vect_analyze_data_ref_accesses.

   Analyze the access pattern of all the data references in the loop.

   FORNOW: the only access pattern that is considered vectorizable is a
   simple step 1 (consecutive) access.

   FORNOW: handle only arrays and pointer accesses.  */

vect_analyze_data_ref_accesses (vec_info *vinfo)
  vec<data_reference_p> datarefs = vinfo->shared->datarefs;
  struct data_reference *dr;

  DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");

  if (datarefs.is_empty ())

  /* Sort the array of datarefs to make building the interleaving chains
     linear.  Don't modify the original vector's order, it is needed for
     determining what dependencies are reversed.  */
  vec<data_reference_p> datarefs_copy = datarefs.copy ();
  datarefs_copy.qsort (dr_group_sort_cmp);

  /* Build the interleaving chains.  */
  for (i = 0; i < datarefs_copy.length () - 1;)
      data_reference_p dra = datarefs_copy[i];
      stmt_vec_info stmtinfo_a = vinfo_for_stmt (vect_dr_stmt (dra));
      stmt_vec_info lastinfo = NULL;
      if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
          || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
      for (i = i + 1; i < datarefs_copy.length (); ++i)
          data_reference_p drb = datarefs_copy[i];
          stmt_vec_info stmtinfo_b = vinfo_for_stmt (vect_dr_stmt (drb));
          if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
              || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))

          /* ??? Imperfect sorting (non-compatible types, non-modulo
             accesses, same accesses) can lead to a group being artificially
             split here as we don't just skip over those.  If it really
             matters we can push those to a worklist and re-iterate
             over them.  Then we can just skip ahead to the next DR here.  */

          /* DRs in a different loop should not be put into the same
             interleaving group.  */
          if (gimple_bb (DR_STMT (dra))->loop_father
              != gimple_bb (DR_STMT (drb))->loop_father)
          /* Check that the data-refs have same first location (except init)
             and they are both either store or load (not load and store,
             not masked loads or stores).  */
          if (DR_IS_READ (dra) != DR_IS_READ (drb)
              || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
                                        DR_BASE_ADDRESS (drb)) != 0
              || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
              || !can_group_stmts_p (vect_dr_stmt (dra), vect_dr_stmt (drb)))

          /* Check that the data-refs have the same constant size.  */
          tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
          tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
          if (!tree_fits_uhwi_p (sza)
              || !tree_fits_uhwi_p (szb)
              || !tree_int_cst_equal (sza, szb))

          /* Check that the data-refs have the same step.  */
          if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)

          /* Check the types are compatible.
             ??? We don't distinguish this during sorting.  */
          if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
                                   TREE_TYPE (DR_REF (drb))))

          /* Check that the DR_INITs are compile-time constants.  */
          if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST
              || TREE_CODE (DR_INIT (drb)) != INTEGER_CST)

          /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb).  */
          HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
          HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
          HOST_WIDE_INT init_prev
            = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]));
          gcc_assert (init_a <= init_b
                      && init_a <= init_prev
                      && init_prev <= init_b);

          /* Do not place the same access in the interleaving chain twice.  */
          if (init_b == init_prev)
              gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]))
                          < gimple_uid (DR_STMT (drb)));
              /* ??? For now we simply "drop" the later reference which is
                 otherwise the same rather than finishing off this group.
                 In the end we'd want to re-process duplicates forming
                 multiple groups from the refs, likely by just collecting
                 all candidates (including duplicates and split points
                 below) in a vector and then process them together.  */

          /* If init_b == init_a + the size of the type * k, we have an
             interleaving, and DRA is accessed before DRB.  */
          HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
          if (type_size_a == 0
              || (init_b - init_a) % type_size_a != 0)
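          /* Illustrative example: with 4-byte elements, DRA at init 0 and
             DRB at init 8 satisfy (8 - 0) % 4 == 0, so DRB can be placed two
             elements after DRA in the same group, whereas an init of, say,
             6 would fail the divisibility check and split the group.  */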
          /* If we have a store, the accesses are adjacent.  This splits
             groups into chunks we support (we don't support vectorization
             of stores with gaps).  */
          if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)

          /* If the step (if not zero or non-constant) is greater than the
             difference between data-refs' inits this splits groups into
             suitable sizes.  */
          if (tree_fits_shwi_p (DR_STEP (dra)))
              HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
              if (step != 0 && step <= (init_b - init_a))

          if (dump_enabled_p ())
              dump_printf_loc (MSG_NOTE, vect_location,
                               "Detected interleaving ");
              if (DR_IS_READ (dra))
                dump_printf (MSG_NOTE, "load ");
                dump_printf (MSG_NOTE, "store ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
              dump_printf (MSG_NOTE, " and ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
              dump_printf (MSG_NOTE, "\n");

          /* Link the found element into the group list.  */
          if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
              DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = vect_dr_stmt (dra);
              lastinfo = stmtinfo_a;
          DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = vect_dr_stmt (dra);
          DR_GROUP_NEXT_ELEMENT (lastinfo) = vect_dr_stmt (drb);
          lastinfo = stmtinfo_b;

  FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
    if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (vect_dr_stmt (dr)))
        && !vect_analyze_data_ref_access (dr))
        if (dump_enabled_p ())
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "not vectorized: complicated access pattern.\n");

        if (is_a <bb_vec_info> (vinfo))
            /* Mark the statement as not vectorizable.  */
            STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (vect_dr_stmt (dr))) = false;

            datarefs_copy.release ();

  datarefs_copy.release ();
/* Function vect_vfa_segment_size.

     DR: The data reference.
     LENGTH_FACTOR: segment length to consider.

   Return a value suitable for the dr_with_seg_len::seg_len field.
   This is the "distance travelled" by the pointer from the first
   iteration in the segment to the last.  Note that it does not include
   the size of the access; in effect it only describes the first byte.  */

vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
  length_factor = size_binop (MINUS_EXPR,
                              fold_convert (sizetype, length_factor),
  return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr)),
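/* The value computed above works out to DR_STEP * (LENGTH_FACTOR - 1).
   Illustrative example: a data-ref with a 4-byte step and
   LENGTH_FACTOR = 8 gets a segment length of 4 * (8 - 1) = 28 bytes --
   the distance the pointer travels from the first to the last iteration
   of the segment -- while the bytes accessed at the final position are
   accounted for separately by vect_vfa_access_size below.  */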
/* Return a value that, when added to abs (vect_vfa_segment_size (dr)),
   gives the worst-case number of bytes covered by the segment.  */

static unsigned HOST_WIDE_INT
vect_vfa_access_size (data_reference *dr)
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (vect_dr_stmt (dr));
  tree ref_type = TREE_TYPE (DR_REF (dr));
  unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
  unsigned HOST_WIDE_INT access_size = ref_size;
  if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
      gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == vect_dr_stmt (dr));
      access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
  if (STMT_VINFO_VEC_STMT (stmt_vinfo)
      && (vect_supportable_dr_alignment (dr, false)
          == dr_explicit_realign_optimized))
      /* We might access a full vector's worth.  */
      tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
      access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
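/* Illustrative example: for a grouped access of four 4-byte elements with
   a trailing gap of one element, access_size = 4 * (4 - 1) = 12 bytes;
   if the explicit-realign-optimized scheme is in use with 16-byte vectors,
   a further 16 - 4 = 12 bytes are added to cover the possible over-read.  */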
/* Get the minimum alignment for all the scalar accesses that DR describes.  */

vect_vfa_align (const data_reference *dr)
  return TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr)));

/* Function vect_no_alias_p.

   Given data references A and B with equal base and offset, see whether
   the alias relation can be decided at compilation time.  Return 1 if
   it can and the references alias, 0 if it can and the references do
   not alias, and -1 if we cannot decide at compile time.  SEGMENT_LENGTH_A,
   SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
   of dr_with_seg_len::{seg_len,access_size} for A and B.  */

vect_compile_time_alias (struct data_reference *a, struct data_reference *b,
                         tree segment_length_a, tree segment_length_b,
                         unsigned HOST_WIDE_INT access_size_a,
                         unsigned HOST_WIDE_INT access_size_b)
  poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a));
  poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b));
  poly_uint64 const_length_a;
  poly_uint64 const_length_b;

  /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
     bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
     [a, a+12).  */
  if (tree_int_cst_compare (DR_STEP (a), size_zero_node) < 0)
      const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
      offset_a = (offset_a + access_size_a) - const_length_a;
    const_length_a = tree_to_poly_uint64 (segment_length_a);
  if (tree_int_cst_compare (DR_STEP (b), size_zero_node) < 0)
      const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
      offset_b = (offset_b + access_size_b) - const_length_b;
    const_length_b = tree_to_poly_uint64 (segment_length_b);

  const_length_a += access_size_a;
  const_length_b += access_size_b;

  if (ranges_known_overlap_p (offset_a, const_length_a,
                              offset_b, const_length_b))

  if (!ranges_maybe_overlap_p (offset_a, const_length_a,
                               offset_b, const_length_b))
/* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH

dependence_distance_ge_vf (data_dependence_relation *ddr,
                           unsigned int loop_depth, poly_uint64 vf)
  if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
      || DDR_NUM_DIST_VECTS (ddr) == 0)

  /* If the dependence is exact, we should have limited the VF instead.  */
  gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));

  lambda_vector dist_v;
  FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
      HOST_WIDE_INT dist = dist_v[loop_depth];
          && !(dist > 0 && DDR_REVERSED_P (ddr))
          && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))

  if (dump_enabled_p ())
      dump_printf_loc (MSG_NOTE, vect_location,
                       "dependence distance between ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
      dump_printf (MSG_NOTE, " and ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
      dump_printf (MSG_NOTE, " is >= VF\n");

/* Dump LOWER_BOUND using flags DUMP_KIND.  Dumps are known to be enabled.  */

dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound)
  dump_printf (dump_kind, "%s (", lower_bound.unsigned_p ? "unsigned" : "abs");
  dump_generic_expr (dump_kind, TDF_SLIM, lower_bound.expr);
  dump_printf (dump_kind, ") >= ");
  dump_dec (dump_kind, lower_bound.min_value);
/* Record that the vectorized loop requires the vec_lower_bound described
   by EXPR, UNSIGNED_P and MIN_VALUE.  */

vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
                        poly_uint64 min_value)
  vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
  for (unsigned int i = 0; i < lower_bounds.length (); ++i)
    if (operand_equal_p (lower_bounds[i].expr, expr, 0))
        unsigned_p &= lower_bounds[i].unsigned_p;
        min_value = upper_bound (lower_bounds[i].min_value, min_value);
        if (lower_bounds[i].unsigned_p != unsigned_p
            || maybe_lt (lower_bounds[i].min_value, min_value))
            lower_bounds[i].unsigned_p = unsigned_p;
            lower_bounds[i].min_value = min_value;
            if (dump_enabled_p ())
                dump_printf_loc (MSG_NOTE, vect_location,
                                 "updating run-time check to ");
                dump_lower_bound (MSG_NOTE, lower_bounds[i]);
                dump_printf (MSG_NOTE, "\n");

  vec_lower_bound lower_bound (expr, unsigned_p, min_value);
  if (dump_enabled_p ())
      dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
      dump_lower_bound (MSG_NOTE, lower_bound);
      dump_printf (MSG_NOTE, "\n");
  LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);

/* Return true if it's unlikely that the step of the vectorized form of DR
   will span fewer than GAP bytes.  */

vect_small_gap_p (loop_vec_info loop_vinfo, data_reference *dr, poly_int64 gap)
  stmt_vec_info stmt_info = vinfo_for_stmt (vect_dr_stmt (dr));
    = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
  if (DR_GROUP_FIRST_ELEMENT (stmt_info))
    count *= DR_GROUP_SIZE (vinfo_for_stmt (DR_GROUP_FIRST_ELEMENT (stmt_info)));
  return estimated_poly_value (gap) <= count * vect_get_scalar_dr_size (dr);
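/* Illustrative example: for a non-grouped 4-byte access in a loop whose
   estimated vectorization factor is 8, count = 8, so any gap of at most
   8 * 4 = 32 bytes is considered small, i.e. one vector iteration's worth
   of accesses is expected to span at least that many bytes.  */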
/* Return true if we know that there is no alias between DR_A and DR_B
   when abs (DR_STEP (DR_A)) >= N for some N.  When returning true, set
   *LOWER_BOUND_OUT to this N.  */

vectorizable_with_step_bound_p (data_reference *dr_a, data_reference *dr_b,
                                poly_uint64 *lower_bound_out)
  /* Check that there is a constant gap of known sign between DR_A
     and DR_B.  */
  poly_int64 init_a, init_b;
  if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
      || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
      || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
      || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
      || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
      || !ordered_p (init_a, init_b))

  /* Sort DR_A and DR_B by the address they access.  */
  if (maybe_lt (init_b, init_a))
      std::swap (init_a, init_b);
      std::swap (dr_a, dr_b);

  /* If the two accesses could be dependent within a scalar iteration,
     make sure that we'd retain their order.  */
  if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_a), init_b)
      && !vect_preserves_scalar_order_p (vect_dr_stmt (dr_a),
                                         vect_dr_stmt (dr_b)))

  /* There is no alias if abs (DR_STEP) is greater than or equal to
     the bytes spanned by the combination of the two accesses.  */
  *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_b) - init_a;
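/* Illustrative example: with init_a = 0, init_b = 16 and a 4-byte DR_B
   element, *lower_bound_out = 16 + 4 - 0 = 20, so the two references
   cannot alias whenever the (common) step is at least 20 bytes in
   absolute value.  */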
/* Function vect_prune_runtime_alias_test_list.

   Prune a list of ddrs to be tested at run-time by versioning for alias.
   Merge several alias checks into one if possible.
   Return FALSE if the resulting list of ddrs is longer than allowed by
   PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE.  */

vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
  typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
  hash_set <tree_pair_hash> compared_objects;

  vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
  vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
    = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
  vec<vec_object_pair> &check_unequal_addrs
    = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
  poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);

  DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");

  /* Step values are irrelevant for aliasing if the number of vector
     iterations is equal to the number of scalar iterations (which can
     happen for fully-SLP loops).  */
  bool ignore_step_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);

      /* Convert the checks for nonzero steps into bound tests.  */
      FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
        vect_check_lower_bound (loop_vinfo, value, true, 1);

  if (may_alias_ddrs.is_empty ())

  comp_alias_ddrs.create (may_alias_ddrs.length ());

  unsigned int loop_depth
    = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
                          LOOP_VINFO_LOOP_NEST (loop_vinfo));

  /* First, we collect all data ref pairs for aliasing checks.  */
  FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
      poly_uint64 lower_bound;
      struct data_reference *dr_a, *dr_b;
      gimple *dr_group_first_a, *dr_group_first_b;
      tree segment_length_a, segment_length_b;
      unsigned HOST_WIDE_INT access_size_a, access_size_b;
      unsigned int align_a, align_b;
      gimple *stmt_a, *stmt_b;

      /* Ignore the alias if the VF we chose ended up being no greater
         than the dependence distance.  */
      if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))

      if (DDR_OBJECT_A (ddr))
          vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
          if (!compared_objects.add (new_pair))
              if (dump_enabled_p ())
                  dump_printf_loc (MSG_NOTE, vect_location, "checking that ");
                  dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.first);
                  dump_printf (MSG_NOTE, " and ");
                  dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.second);
                  dump_printf (MSG_NOTE, " have different addresses\n");
              LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);

      stmt_a = vect_dr_stmt (DDR_A (ddr));
      stmt_b = vect_dr_stmt (DDR_B (ddr));

      /* Skip the pair if inter-iteration dependencies are irrelevant
         and intra-iteration dependencies are guaranteed to be honored.  */
          && (vect_preserves_scalar_order_p (stmt_a, stmt_b)
              || vectorizable_with_step_bound_p (dr_a, dr_b, &lower_bound)))
          if (dump_enabled_p ())
              dump_printf_loc (MSG_NOTE, vect_location,
                               "no need for alias check between ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
              dump_printf (MSG_NOTE, " and ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
              dump_printf (MSG_NOTE, " when VF is 1\n");

      /* See whether we can handle the alias using a bounds check on
         the step, and whether that's likely to be the best approach.
         (It might not be, for example, if the minimum step is much larger
         than the number of bytes handled by one vector iteration.)  */
          && TREE_CODE (DR_STEP (dr_a)) != INTEGER_CST
          && vectorizable_with_step_bound_p (dr_a, dr_b, &lower_bound)
          && (vect_small_gap_p (loop_vinfo, dr_a, lower_bound)
              || vect_small_gap_p (loop_vinfo, dr_b, lower_bound)))
          bool unsigned_p = dr_known_forward_stride_p (dr_a);
          if (dump_enabled_p ())
              dump_printf_loc (MSG_NOTE, vect_location, "no alias between ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
              dump_printf (MSG_NOTE, " and ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
              dump_printf (MSG_NOTE, " when the step ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_STEP (dr_a));
              dump_printf (MSG_NOTE, " is outside ");
                dump_printf (MSG_NOTE, "[0");
                  dump_printf (MSG_NOTE, "(");
                  dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
              dump_printf (MSG_NOTE, ", ");
              dump_dec (MSG_NOTE, lower_bound);
              dump_printf (MSG_NOTE, ")\n");
          vect_check_lower_bound (loop_vinfo, DR_STEP (dr_a), unsigned_p,

      dr_group_first_a = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
      if (dr_group_first_a)
          stmt_a = dr_group_first_a;
          dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));

      dr_group_first_b = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
      if (dr_group_first_b)
          stmt_b = dr_group_first_b;
          dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));

          segment_length_a = size_zero_node;
          segment_length_b = size_zero_node;

          if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
            length_factor = scalar_loop_iters;
            length_factor = size_int (vect_factor);
          segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
          segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
          access_size_a = vect_vfa_access_size (dr_a);
          access_size_b = vect_vfa_access_size (dr_b);
          align_a = vect_vfa_align (dr_a);
          align_b = vect_vfa_align (dr_b);

      comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_a),
                                        DR_BASE_ADDRESS (dr_b));
        comp_res = data_ref_compare_tree (DR_OFFSET (dr_a),

      /* See whether the alias is known at compilation time.  */
          && TREE_CODE (DR_STEP (dr_a)) == INTEGER_CST
          && TREE_CODE (DR_STEP (dr_b)) == INTEGER_CST
          && poly_int_tree_p (segment_length_a)
          && poly_int_tree_p (segment_length_b))
          int res = vect_compile_time_alias (dr_a, dr_b,
          if (res >= 0 && dump_enabled_p ())
              dump_printf_loc (MSG_NOTE, vect_location,
                               "can tell at compile time that ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
              dump_printf (MSG_NOTE, " and ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
                dump_printf (MSG_NOTE, " do not alias\n");
                dump_printf (MSG_NOTE, " alias\n");

              if (dump_enabled_p ())
                dump_printf_loc (MSG_NOTE, vect_location,
                                 "not vectorized: compilation time alias.\n");

      dr_with_seg_len_pair_t dr_with_seg_len_pair
        (dr_with_seg_len (dr_a, segment_length_a, access_size_a, align_a),
         dr_with_seg_len (dr_b, segment_length_b, access_size_b, align_b));

      /* Canonicalize pairs by sorting the two DR members.  */
        std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);

      comp_alias_ddrs.safe_push (dr_with_seg_len_pair);

  prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);

  unsigned int count = (comp_alias_ddrs.length ()
                        + check_unequal_addrs.length ());

  dump_printf_loc (MSG_NOTE, vect_location,
                   "improved number of alias checks from %d to %d\n",
                   may_alias_ddrs.length (), count);
  if ((int) count > PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "number of versioning for alias "
                         "run-time tests exceeds %d "
                         "(--param vect-max-version-for-alias-checks)\n",
                         PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
/* Check whether we can use an internal function for a gather load
   or scatter store.  READ_P is true for loads and false for stores.
   MASKED_P is true if the load or store is conditional.  MEMORY_TYPE is
   the type of the memory elements being loaded or stored.  OFFSET_BITS
   is the number of bits in each scalar offset and OFFSET_SIGN is the
   sign of the offset.  SCALE is the amount by which the offset should
   be multiplied *after* it has been converted to address width.

   Return true if the function is supported, storing the function
   id in *IFN_OUT and the type of a vector element in *ELEMENT_TYPE_OUT.  */

vect_gather_scatter_fn_p (bool read_p, bool masked_p, tree vectype,
                          tree memory_type, unsigned int offset_bits,
                          signop offset_sign, int scale,
                          internal_fn *ifn_out, tree *element_type_out)
  unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
  unsigned int element_bits = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype)));
  if (offset_bits > element_bits)
    /* Internal functions require the offset to be the same width as
       the vector elements.  We can extend narrower offsets, but it isn't
       safe to truncate wider offsets.  */

  if (element_bits != memory_bits)
    /* For now the vector elements must be the same width as the
       memory elements.  */

  /* Work out which function we need.  */
    ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
    ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;

  /* Test whether the target supports this combination.  */
  if (!internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
                                               offset_sign, scale))

  *element_type_out = TREE_TYPE (vectype);

/* CALL is a call to an internal gather load or scatter store function.
   Describe the operation in INFO.  */

vect_describe_gather_scatter_call (gcall *call, gather_scatter_info *info)
  stmt_vec_info stmt_info = vinfo_for_stmt (call);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);

  info->ifn = gimple_call_internal_fn (call);
  info->decl = NULL_TREE;
  info->base = gimple_call_arg (call, 0);
  info->offset = gimple_call_arg (call, 1);
  info->offset_dt = vect_unknown_def_type;
  info->offset_vectype = NULL_TREE;
  info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
  info->element_type = TREE_TYPE (vectype);
  info->memory_type = TREE_TYPE (DR_REF (dr));
3668 /* Return true if a non-affine read or write in STMT is suitable for a
3669 gather load or scatter store. Describe the operation in *INFO if so. */
vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo,
			   gather_scatter_info *info)
  HOST_WIDE_INT scale = 1;
  poly_int64 pbitpos, pbitsize;
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
  tree offtype = NULL_TREE;
  tree decl = NULL_TREE, base, off;
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  tree memory_type = TREE_TYPE (DR_REF (dr));
  int punsignedp, reversep, pvolatilep = 0;
  bool masked_p = false;
  /* See whether this is already a call to a gather/scatter internal function.
     If not, see whether it's a masked load or store.  */
  gcall *call = dyn_cast <gcall *> (stmt);
  if (call && gimple_call_internal_p (call))
      ifn = gimple_call_internal_fn (stmt);
      if (internal_gather_scatter_fn_p (ifn))
	  vect_describe_gather_scatter_call (call, info);
      masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
  /* True if we should aim to use internal functions rather than
     built-in functions.  */
  bool use_ifn_p = (DR_IS_READ (dr)
		    ? supports_vec_gather_load_p ()
		    : supports_vec_scatter_store_p ());

  /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
     see if we can use the def stmt of the address.  */
      && TREE_CODE (base) == MEM_REF
      && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
      && integer_zerop (TREE_OPERAND (base, 1))
      && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
      gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
      if (is_gimple_assign (def_stmt)
	  && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
	base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3725 /* The gather and scatter builtins need address of the form
3726 loop_invariant + vector * {1, 2, 4, 8}
3728 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3729 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3730 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3731 multiplications and additions in it. To get a vector, we need
3732 a single SSA_NAME that will be defined in the loop and will
3733 contain everything that is not loop invariant and that can be
3734 vectorized.  The following code attempts to find such a preexisting
3735 SSA_NAME OFF and put the loop invariants into a tree BASE
3736 that can be gimplified before the loop. */
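/* Worked example (illustrative, not from the original source): for a
   reference like a[invar + i] accessed inside the loop over i, the code
   below ends up with OFF being the SSA_NAME that carries i (possibly
   after peeling casts and loop-invariant additions/multiplications off
   into BASE) and BASE collecting &a plus invar scaled by the element
   size, which can be gimplified in the loop preheader.  */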
  base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
			      &punsignedp, &reversep, &pvolatilep);

  poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);

  if (TREE_CODE (base) == MEM_REF)
      if (!integer_zerop (TREE_OPERAND (base, 1)))
	  if (off == NULL_TREE)
	    off = wide_int_to_tree (sizetype, mem_ref_offset (base));
	    off = size_binop (PLUS_EXPR, off,
			      fold_convert (sizetype, TREE_OPERAND (base, 1)));
      base = TREE_OPERAND (base, 0);
    base = build_fold_addr_expr (base);

  if (off == NULL_TREE)
    off = size_zero_node;
  /* If BASE is not loop invariant, then if OFF is 0 we start with just
     the constant offset in the loop-invariant BASE and continue with BASE
     as OFF; otherwise we give up.
     We could handle that case by gimplifying the addition of BASE + OFF
     into some SSA_NAME and using that as OFF, but for now we punt.  */
3767 if (!expr_invariant_in_loop_p (loop
, base
))
3769 if (!integer_zerop (off
))
3772 base
= size_int (pbytepos
);
3774 /* Otherwise put base + constant offset into the loop invariant BASE
3775 and continue with OFF. */
3778 base
= fold_convert (sizetype
, base
);
3779 base
= size_binop (PLUS_EXPR
, base
, size_int (pbytepos
));
3782 /* OFF at this point may be either a SSA_NAME or some tree expression
3783 from get_inner_reference. Try to peel off loop invariants from it
3784 into BASE as long as possible. */
3786 while (offtype
== NULL_TREE
)
3788 enum tree_code code
;
3789 tree op0
, op1
, add
= NULL_TREE
;
3791 if (TREE_CODE (off
) == SSA_NAME
)
3793 gimple
*def_stmt
= SSA_NAME_DEF_STMT (off
);
3795 if (expr_invariant_in_loop_p (loop
, off
))
3798 if (gimple_code (def_stmt
) != GIMPLE_ASSIGN
)
3801 op0
= gimple_assign_rhs1 (def_stmt
);
3802 code
= gimple_assign_rhs_code (def_stmt
);
3803 op1
= gimple_assign_rhs2 (def_stmt
);
3807 if (get_gimple_rhs_class (TREE_CODE (off
)) == GIMPLE_TERNARY_RHS
)
3809 code
= TREE_CODE (off
);
3810 extract_ops_from_tree (off
, &code
, &op0
, &op1
);
3814 case POINTER_PLUS_EXPR
:
3816 if (expr_invariant_in_loop_p (loop
, op0
))
3821 add
= fold_convert (sizetype
, add
);
3823 add
= size_binop (MULT_EXPR
, add
, size_int (scale
));
3824 base
= size_binop (PLUS_EXPR
, base
, add
);
3827 if (expr_invariant_in_loop_p (loop
, op1
))
3835 if (expr_invariant_in_loop_p (loop
, op1
))
3837 add
= fold_convert (sizetype
, op1
);
3838 add
= size_binop (MINUS_EXPR
, size_zero_node
, add
);
3844 if (scale
== 1 && tree_fits_shwi_p (op1
))
3846 int new_scale
= tree_to_shwi (op1
);
3847 /* Only treat this as a scaling operation if the target
3850 && !vect_gather_scatter_fn_p (DR_IS_READ (dr
), masked_p
,
3851 vectype
, memory_type
, 1,
3852 TYPE_SIGN (TREE_TYPE (op0
)),
3865 if (!POINTER_TYPE_P (TREE_TYPE (op0
))
3866 && !INTEGRAL_TYPE_P (TREE_TYPE (op0
)))
3868 if (TYPE_PRECISION (TREE_TYPE (op0
))
3869 == TYPE_PRECISION (TREE_TYPE (off
)))
3875 /* The internal functions need the offset to be the same width
3876 as the elements of VECTYPE. Don't include operations that
3877 cast the offset from that width to a different width. */
3879 && (int_size_in_bytes (TREE_TYPE (vectype
))
3880 == int_size_in_bytes (TREE_TYPE (off
))))
3883 if (TYPE_PRECISION (TREE_TYPE (op0
))
3884 < TYPE_PRECISION (TREE_TYPE (off
)))
3887 offtype
= TREE_TYPE (off
);
3898 /* If at the end OFF still isn't a SSA_NAME or isn't
3899 defined in the loop, punt. */
3900 if (TREE_CODE (off
) != SSA_NAME
3901 || expr_invariant_in_loop_p (loop
, off
))
3904 if (offtype
== NULL_TREE
)
3905 offtype
= TREE_TYPE (off
);
3909 if (!vect_gather_scatter_fn_p (DR_IS_READ (dr
), masked_p
, vectype
,
3910 memory_type
, TYPE_PRECISION (offtype
),
3911 TYPE_SIGN (offtype
), scale
, &ifn
,
3917 if (DR_IS_READ (dr
))
3919 if (targetm
.vectorize
.builtin_gather
)
3920 decl
= targetm
.vectorize
.builtin_gather (vectype
, offtype
, scale
);
3924 if (targetm
.vectorize
.builtin_scatter
)
3925 decl
= targetm
.vectorize
.builtin_scatter (vectype
, offtype
, scale
);
3932 element_type
= TREE_TYPE (vectype
);
3939 info
->offset_dt
= vect_unknown_def_type
;
3940 info
->offset_vectype
= NULL_TREE
;
3941 info
->scale
= scale
;
3942 info
->element_type
= element_type
;
3943 info
->memory_type
= memory_type
;
3947 /* Find the data references in STMT, analyze them with respect to LOOP and
3948 append them to DATAREFS.  Return false if datarefs in this stmt cannot be handled.
vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
			       vec<data_reference_p> *datarefs)
3955 /* We can ignore clobbers for dataref analysis - they are removed during
3956 loop vectorization and BB vectorization checks dependences with a stmt walk.  */
3958 if (gimple_clobber_p (stmt
))
3961 if (gimple_has_volatile_ops (stmt
))
3963 if (dump_enabled_p ())
3965 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3966 "not vectorized: volatile type ");
3967 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3972 if (stmt_can_throw_internal (stmt
))
3974 if (dump_enabled_p ())
3976 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3977 "not vectorized: statement can throw an "
3979 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3984 auto_vec
<data_reference_p
, 2> refs
;
3985 if (!find_data_references_in_stmt (loop
, stmt
, &refs
))
3988 if (refs
.is_empty ())
3991 if (refs
.length () > 1)
3993 if (dump_enabled_p ())
3995 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3996 "not vectorized: more than one data ref "
3998 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
4003 if (gcall
*call
= dyn_cast
<gcall
*> (stmt
))
4004 if (!gimple_call_internal_p (call
)
4005 || (gimple_call_internal_fn (call
) != IFN_MASK_LOAD
4006 && gimple_call_internal_fn (call
) != IFN_MASK_STORE
))
4008 if (dump_enabled_p ())
4010 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4011 "not vectorized: dr in a call ");
4012 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
4017 data_reference_p dr
= refs
.pop ();
4018 if (TREE_CODE (DR_REF (dr
)) == COMPONENT_REF
4019 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr
), 1)))
4021 if (dump_enabled_p ())
4023 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4024 "not vectorized: statement is bitfield "
4026 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
4031 if (DR_BASE_ADDRESS (dr
)
4032 && TREE_CODE (DR_BASE_ADDRESS (dr
)) == INTEGER_CST
)
4034 if (dump_enabled_p ())
4035 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4036 "not vectorized: base addr of dr is a "
4041 /* Check whether this may be a SIMD lane access and adjust the
4042 DR to make it easier for us to handle it. */
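/* Explanatory note (not from the original source): such accesses
   typically come from OpenMP "omp simd" lowering, where a privatized
   scalar lives in an "omp simd array" indexed by an IFN_GOMP_SIMD_LANE
   call for the loop's simduid.  Rewriting the DR below as offset 0 with
   step equal to the element size lets the rest of the vectorizer treat
   it like an ordinary contiguous access.  */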
4045 && (!DR_BASE_ADDRESS (dr
)
4050 struct data_reference
*newdr
4051 = create_data_ref (NULL
, loop_containing_stmt (stmt
), DR_REF (dr
), stmt
,
4052 DR_IS_READ (dr
), DR_IS_CONDITIONAL_IN_STMT (dr
));
4053 if (DR_BASE_ADDRESS (newdr
)
4054 && DR_OFFSET (newdr
)
4057 && integer_zerop (DR_STEP (newdr
)))
4059 tree off
= DR_OFFSET (newdr
);
4061 if (TREE_CODE (DR_INIT (newdr
)) == INTEGER_CST
4062 && TREE_CODE (off
) == MULT_EXPR
4063 && tree_fits_uhwi_p (TREE_OPERAND (off
, 1)))
4065 tree step
= TREE_OPERAND (off
, 1);
4066 off
= TREE_OPERAND (off
, 0);
4068 if (CONVERT_EXPR_P (off
)
4069 && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off
, 0)))
4070 < TYPE_PRECISION (TREE_TYPE (off
))))
4071 off
= TREE_OPERAND (off
, 0);
4072 if (TREE_CODE (off
) == SSA_NAME
)
4074 gimple
*def
= SSA_NAME_DEF_STMT (off
);
4075 tree reft
= TREE_TYPE (DR_REF (newdr
));
4076 if (is_gimple_call (def
)
4077 && gimple_call_internal_p (def
)
4078 && (gimple_call_internal_fn (def
) == IFN_GOMP_SIMD_LANE
))
4080 tree arg
= gimple_call_arg (def
, 0);
4081 gcc_assert (TREE_CODE (arg
) == SSA_NAME
);
4082 arg
= SSA_NAME_VAR (arg
);
4083 if (arg
== loop
->simduid
4085 && tree_int_cst_equal (TYPE_SIZE_UNIT (reft
), step
))
4087 DR_OFFSET (newdr
) = ssize_int (0);
4088 DR_STEP (newdr
) = step
;
4089 DR_OFFSET_ALIGNMENT (newdr
) = BIGGEST_ALIGNMENT
;
4090 DR_STEP_ALIGNMENT (newdr
)
4091 = highest_pow2_factor (step
);
4092 /* Mark as simd-lane access. */
4093 newdr
->aux
= (void *)-1;
4095 datarefs
->safe_push (newdr
);
4102 free_data_ref (newdr
);
4105 datarefs
->safe_push (dr
);
4109 /* Function vect_analyze_data_refs.
4111 Find all the data references in the loop or basic block.
4113 The general structure of the analysis of data refs in the vectorizer is as
4115 1- vect_analyze_data_refs(loop/bb): call
4116 compute_data_dependences_for_loop/bb to find and analyze all data-refs
4117 in the loop/bb and their dependences.
4118 2- vect_analyze_dependences(): apply dependence testing using ddrs.
4119 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
4120 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
4125 vect_analyze_data_refs (vec_info
*vinfo
, poly_uint64
*min_vf
)
4127 struct loop
*loop
= NULL
;
4129 struct data_reference
*dr
;
4132 DUMP_VECT_SCOPE ("vect_analyze_data_refs");
4134 if (loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
))
4135 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
4137 /* Go through the data-refs, check that the analysis succeeded. Update
4138 pointer from stmt_vec_info struct to DR and vectype. */
4140 vec
<data_reference_p
> datarefs
= vinfo
->shared
->datarefs
;
4141 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
4144 stmt_vec_info stmt_info
;
4145 enum { SG_NONE
, GATHER
, SCATTER
} gatherscatter
= SG_NONE
;
4148 gcc_assert (DR_REF (dr
));
4149 stmt
= vect_dr_stmt (dr
);
4150 stmt_info
= vinfo_for_stmt (stmt
);
4152 /* Check that analysis of the data-ref succeeded. */
4153 if (!DR_BASE_ADDRESS (dr
) || !DR_OFFSET (dr
) || !DR_INIT (dr
)
4158 && !TREE_THIS_VOLATILE (DR_REF (dr
))
4159 && (targetm
.vectorize
.builtin_gather
!= NULL
4160 || supports_vec_gather_load_p ());
4163 && !TREE_THIS_VOLATILE (DR_REF (dr
))
4164 && (targetm
.vectorize
.builtin_scatter
!= NULL
4165 || supports_vec_scatter_store_p ());
4167 /* If target supports vector gather loads or scatter stores,
4168 see if they can be used.  */
4169 if (is_a
<loop_vec_info
> (vinfo
)
4170 && !nested_in_vect_loop_p (loop
, stmt
))
4172 if (maybe_gather
|| maybe_scatter
)
4175 gatherscatter
= GATHER
;
4177 gatherscatter
= SCATTER
;
4181 if (gatherscatter
== SG_NONE
)
4183 if (dump_enabled_p ())
4185 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4186 "not vectorized: data ref analysis "
4188 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
4190 if (is_a
<bb_vec_info
> (vinfo
))
4192 /* In BB vectorization the ref can still participate
4193 in dependence analysis, we just can't vectorize it. */
4194 STMT_VINFO_VECTORIZABLE (stmt_info
) = false;
4201 /* See if this was detected as SIMD lane access. */
4202 if (dr
->aux
== (void *)-1)
4204 if (nested_in_vect_loop_p (loop
, stmt
))
4206 if (dump_enabled_p ())
4208 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4209 "not vectorized: data ref analysis "
4211 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
4215 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info
) = true;
4218 tree base
= get_base_address (DR_REF (dr
));
4219 if (base
&& VAR_P (base
) && DECL_NONALIASED (base
))
4221 if (dump_enabled_p ())
4223 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4224 "not vectorized: base object not addressable "
4226 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
4228 if (is_a
<bb_vec_info
> (vinfo
))
4230 /* In BB vectorization the ref can still participate
4231 in dependence analysis, we just can't vectorize it. */
4232 STMT_VINFO_VECTORIZABLE (stmt_info
) = false;
4238 if (is_a
<loop_vec_info
> (vinfo
)
4240 && TREE_CODE (DR_STEP (dr
)) != INTEGER_CST
)
4242 if (nested_in_vect_loop_p (loop
, stmt
))
4244 if (dump_enabled_p ())
4246 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4247 "not vectorized: not suitable for strided "
4249 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
4253 STMT_VINFO_STRIDED_P (stmt_info
) = true;
4256 /* Update DR field in stmt_vec_info struct. */
4258 /* If the dataref is in an inner-loop of the loop that is considered for
4259 vectorization, we also want to analyze the access relative to
4260 the outer-loop (DR contains information only relative to the
4261 inner-most enclosing loop). We do that by building a reference to the
4262 first location accessed by the inner-loop, and analyze it relative to
4264 if (loop
&& nested_in_vect_loop_p (loop
, stmt
))
4266 /* Build a reference to the first location accessed by the
4267 inner loop: *(BASE + INIT + OFFSET). By construction,
4268 this address must be invariant in the inner loop, so we
4269 can consider it as being used in the outer loop. */
4270 tree base
= unshare_expr (DR_BASE_ADDRESS (dr
));
4271 tree offset
= unshare_expr (DR_OFFSET (dr
));
4272 tree init
= unshare_expr (DR_INIT (dr
));
4273 tree init_offset
= fold_build2 (PLUS_EXPR
, TREE_TYPE (offset
),
4275 tree init_addr
= fold_build_pointer_plus (base
, init_offset
);
4276 tree init_ref
= build_fold_indirect_ref (init_addr
);
4278 if (dump_enabled_p ())
4280 dump_printf_loc (MSG_NOTE
, vect_location
,
4281 "analyze in outer loop: ");
4282 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, init_ref
);
4283 dump_printf (MSG_NOTE
, "\n");
4286 if (!dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info
),
4288 /* dr_analyze_innermost already explained the failure. */
4291 if (dump_enabled_p ())
4293 dump_printf_loc (MSG_NOTE
, vect_location
,
4294 "\touter base_address: ");
4295 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
4296 STMT_VINFO_DR_BASE_ADDRESS (stmt_info
));
4297 dump_printf (MSG_NOTE
, "\n\touter offset from base address: ");
4298 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
4299 STMT_VINFO_DR_OFFSET (stmt_info
));
4300 dump_printf (MSG_NOTE
,
4301 "\n\touter constant offset from base address: ");
4302 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
4303 STMT_VINFO_DR_INIT (stmt_info
));
4304 dump_printf (MSG_NOTE
, "\n\touter step: ");
4305 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
4306 STMT_VINFO_DR_STEP (stmt_info
));
4307 dump_printf (MSG_NOTE
, "\n\touter base alignment: %d\n",
4308 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info
));
4309 dump_printf (MSG_NOTE
, "\n\touter base misalignment: %d\n",
4310 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info
));
4311 dump_printf (MSG_NOTE
, "\n\touter offset alignment: %d\n",
4312 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info
));
4313 dump_printf (MSG_NOTE
, "\n\touter step alignment: %d\n",
4314 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info
));
4318 gcc_assert (!STMT_VINFO_DATA_REF (stmt_info
));
4319 STMT_VINFO_DATA_REF (stmt_info
) = dr
;
4321 /* Set vectype for STMT. */
4322 scalar_type
= TREE_TYPE (DR_REF (dr
));
4323 STMT_VINFO_VECTYPE (stmt_info
)
4324 = get_vectype_for_scalar_type (scalar_type
);
4325 if (!STMT_VINFO_VECTYPE (stmt_info
))
4327 if (dump_enabled_p ())
4329 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4330 "not vectorized: no vectype for stmt: ");
4331 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
4332 dump_printf (MSG_MISSED_OPTIMIZATION
, " scalar_type: ");
4333 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_DETAILS
,
4335 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
4338 if (is_a
<bb_vec_info
> (vinfo
))
4340 /* No vector type is fine, the ref can still participate
4341 in dependence analysis, we just can't vectorize it. */
4342 STMT_VINFO_VECTORIZABLE (stmt_info
) = false;
4349 if (dump_enabled_p ())
4351 dump_printf_loc (MSG_NOTE
, vect_location
,
4352 "got vectype for stmt: ");
4353 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, stmt
, 0);
4354 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
4355 STMT_VINFO_VECTYPE (stmt_info
));
4356 dump_printf (MSG_NOTE
, "\n");
4360 /* Adjust the minimal vectorization factor according to the
4362 vf
= TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info
));
4363 *min_vf
= upper_bound (*min_vf
, vf
);
4365 if (gatherscatter
!= SG_NONE
)
4367 gather_scatter_info gs_info
;
4368 if (!vect_check_gather_scatter (stmt
, as_a
<loop_vec_info
> (vinfo
),
4370 || !get_vectype_for_scalar_type (TREE_TYPE (gs_info
.offset
)))
4372 if (dump_enabled_p ())
4374 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4375 (gatherscatter
== GATHER
) ?
4376 "not vectorized: not suitable for gather "
4378 "not vectorized: not suitable for scatter "
4380 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
4384 STMT_VINFO_GATHER_SCATTER_P (stmt_info
) = gatherscatter
;
4388 /* We used to stop processing and prune the list here. Verify we no
4390 gcc_assert (i
== datarefs
.length ());
4396 /* Function vect_get_new_vect_var.
4398 Returns a name for a new variable. The current naming scheme appends the
4399 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
4400 the name of vectorizer generated variables, and appends that to NAME if
4404 vect_get_new_vect_var (tree type
, enum vect_var_kind var_kind
, const char *name
)
4411 case vect_simple_var
:
4414 case vect_scalar_var
:
4420 case vect_pointer_var
:
4429 char* tmp
= concat (prefix
, "_", name
, NULL
);
4430 new_vect_var
= create_tmp_reg (type
, tmp
);
4434 new_vect_var
= create_tmp_reg (type
, prefix
);
4436 return new_vect_var
;
4439 /* Like vect_get_new_vect_var but return an SSA name. */
4442 vect_get_new_ssa_name (tree type
, enum vect_var_kind var_kind
, const char *name
)
4449 case vect_simple_var
:
4452 case vect_scalar_var
:
4455 case vect_pointer_var
:
4464 char* tmp
= concat (prefix
, "_", name
, NULL
);
4465 new_vect_var
= make_temp_ssa_name (type
, NULL
, tmp
);
4469 new_vect_var
= make_temp_ssa_name (type
, NULL
, prefix
);
4471 return new_vect_var
;
/* Duplicate ptr info and set alignment/misalignment on NAME from DR.  */

vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr)
  duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
  int misalign = DR_MISALIGNMENT (dr);
  if (misalign == DR_MISALIGNMENT_UNKNOWN)
    mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
  else
    set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
			    DR_TARGET_ALIGNMENT (dr), misalign);
4488 /* Function vect_create_addr_base_for_vector_ref.
4490 Create an expression that computes the address of the first memory location
4491 that will be accessed for a data reference.
4494 STMT: The statement containing the data reference.
4495 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4496 OFFSET: Optional.  If supplied, it is added to the initial address.
4497 LOOP: Specify relative to which loop-nest should the address be computed.
4498 For example, when the dataref is in an inner-loop nested in an
4499 outer-loop that is now being vectorized, LOOP can be either the
4500 outer-loop, or the inner-loop. The first memory location accessed
4501 by the following dataref ('in' points to short):
4508 if LOOP=i_loop: &in (relative to i_loop)
4509 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4510 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4511 initial address. Unlike OFFSET, which is number of elements to
4512 be added, BYTE_OFFSET is measured in bytes.
4515 1. Return an SSA_NAME whose value is the address of the memory location of
4516 the first vector of the data reference.
4517 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4518 these statement(s) which define the returned SSA_NAME.
4520 FORNOW: We are only handling array accesses with step 1. */
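/* Illustrative example (not from the original source): with 2-byte
   elements, OFFSET == k and BYTE_OFFSET == b, the address built below is
   essentially

     base_address + init + k * 2 + b

   gimplified into NEW_STMT_LIST and returned as an SSA_NAME.  */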
4523 vect_create_addr_base_for_vector_ref (gimple
*stmt
,
4524 gimple_seq
*new_stmt_list
,
4528 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
4529 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
4530 const char *base_name
;
4533 gimple_seq seq
= NULL
;
4535 tree step
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr
)));
4536 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
4537 innermost_loop_behavior
*drb
= vect_dr_behavior (dr
);
4539 tree data_ref_base
= unshare_expr (drb
->base_address
);
4540 tree base_offset
= unshare_expr (drb
->offset
);
4541 tree init
= unshare_expr (drb
->init
);
4544 base_name
= get_name (data_ref_base
);
4547 base_offset
= ssize_int (0);
4548 init
= ssize_int (0);
4549 base_name
= get_name (DR_REF (dr
));
4552 /* Create base_offset */
4553 base_offset
= size_binop (PLUS_EXPR
,
4554 fold_convert (sizetype
, base_offset
),
4555 fold_convert (sizetype
, init
));
4559 offset
= fold_build2 (MULT_EXPR
, sizetype
,
4560 fold_convert (sizetype
, offset
), step
);
4561 base_offset
= fold_build2 (PLUS_EXPR
, sizetype
,
4562 base_offset
, offset
);
4566 byte_offset
= fold_convert (sizetype
, byte_offset
);
4567 base_offset
= fold_build2 (PLUS_EXPR
, sizetype
,
4568 base_offset
, byte_offset
);
4571 /* base + base_offset */
4573 addr_base
= fold_build_pointer_plus (data_ref_base
, base_offset
);
4576 addr_base
= build1 (ADDR_EXPR
,
4577 build_pointer_type (TREE_TYPE (DR_REF (dr
))),
4578 unshare_expr (DR_REF (dr
)));
4581 vect_ptr_type
= build_pointer_type (STMT_VINFO_VECTYPE (stmt_info
));
4582 dest
= vect_get_new_vect_var (vect_ptr_type
, vect_pointer_var
, base_name
);
4583 addr_base
= force_gimple_operand (addr_base
, &seq
, true, dest
);
4584 gimple_seq_add_seq (new_stmt_list
, seq
);
4586 if (DR_PTR_INFO (dr
)
4587 && TREE_CODE (addr_base
) == SSA_NAME
4588 && !SSA_NAME_PTR_INFO (addr_base
))
4590 vect_duplicate_ssa_name_ptr_info (addr_base
, dr
);
4591 if (offset
|| byte_offset
)
4592 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base
));
4595 if (dump_enabled_p ())
4597 dump_printf_loc (MSG_NOTE
, vect_location
, "created ");
4598 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, addr_base
);
4599 dump_printf (MSG_NOTE
, "\n");
4606 /* Function vect_create_data_ref_ptr.
4608 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4609 location accessed in the loop by STMT, along with the def-use update
4610 chain to appropriately advance the pointer through the loop iterations.
4611 Also set aliasing information for the pointer. This pointer is used by
4612 the callers to this function to create a memory reference expression for
4613 vector load/store access.
4616 1. STMT: a stmt that references memory. Expected to be of the form
4617 GIMPLE_ASSIGN <name, data-ref> or
4618 GIMPLE_ASSIGN <data-ref, name>.
4619 2. AGGR_TYPE: the type of the reference, which should be either a vector
4621 3. AT_LOOP: the loop where the vector memref is to be created.
4622 4. OFFSET (optional): an offset to be added to the initial address accessed
4623 by the data-ref in STMT.
4624 5. BSI: location where the new stmts are to be placed if there is no loop
4625 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4626 pointing to the initial address.
4627 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4628 to the initial address accessed by the data-ref in STMT. This is
4629 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4631 8. IV_STEP (optional, defaults to NULL): the amount that should be added
4632 to the IV during each iteration of the loop. NULL says to move
4633 by one copy of AGGR_TYPE up or down, depending on the step of the
4637 1. Declare a new ptr to vector_type, and have it point to the base of the
4638 data reference (initial address accessed by the data reference).
4639 For example, for vector of type V8HI, the following code is generated:
4642 ap = (v8hi *)initial_address;
4644 if OFFSET is not supplied:
4645 initial_address = &a[init];
4646 if OFFSET is supplied:
4647 initial_address = &a[init + OFFSET];
4648 if BYTE_OFFSET is supplied:
4649 initial_address = &a[init] + BYTE_OFFSET;
4651 Return the initial_address in INITIAL_ADDRESS.
4653 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4654 update the pointer in each iteration of the loop.
4656 Return the increment stmt that updates the pointer in PTR_INCR.
4658 3. Set INV_P to true if the access pattern of the data reference in the
4659 vectorized loop is invariant. Set it to false otherwise.
4661 4. Return the pointer. */
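/* Typical use (illustrative sketch, not from the original source; the
   variable names are made up):

     tree dataref_ptr
       = vect_create_data_ref_ptr (stmt, vectype, loop, NULL_TREE,
				   &dummy_addr, gsi, &ptr_incr,
				   false, &inv_p, NULL_TREE, NULL_TREE);

   after which the caller builds a MEM_REF from DATAREF_PTR and, for
   further vector copies, advances it with bump_vector_ptr.  */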
4664 vect_create_data_ref_ptr (gimple
*stmt
, tree aggr_type
, struct loop
*at_loop
,
4665 tree offset
, tree
*initial_address
,
4666 gimple_stmt_iterator
*gsi
, gimple
**ptr_incr
,
4667 bool only_init
, bool *inv_p
, tree byte_offset
,
4670 const char *base_name
;
4671 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
4672 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
4673 struct loop
*loop
= NULL
;
4674 bool nested_in_vect_loop
= false;
4675 struct loop
*containing_loop
= NULL
;
4679 gimple_seq new_stmt_list
= NULL
;
4683 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
4685 gimple_stmt_iterator incr_gsi
;
4687 tree indx_before_incr
, indx_after_incr
;
4690 bb_vec_info bb_vinfo
= STMT_VINFO_BB_VINFO (stmt_info
);
4692 gcc_assert (iv_step
!= NULL_TREE
4693 || TREE_CODE (aggr_type
) == ARRAY_TYPE
4694 || TREE_CODE (aggr_type
) == VECTOR_TYPE
);
4698 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
4699 nested_in_vect_loop
= nested_in_vect_loop_p (loop
, stmt
);
4700 containing_loop
= (gimple_bb (stmt
))->loop_father
;
4701 pe
= loop_preheader_edge (loop
);
4705 gcc_assert (bb_vinfo
);
4710 /* Check the step (evolution) of the load in LOOP, and record
4711 whether it's invariant. */
4712 step
= vect_dr_behavior (dr
)->step
;
4713 if (integer_zerop (step
))
4718 /* Create an expression for the first address accessed by this load
4720 base_name
= get_name (DR_BASE_ADDRESS (dr
));
4722 if (dump_enabled_p ())
4724 tree dr_base_type
= TREE_TYPE (DR_BASE_OBJECT (dr
));
4725 dump_printf_loc (MSG_NOTE
, vect_location
,
4726 "create %s-pointer variable to type: ",
4727 get_tree_code_name (TREE_CODE (aggr_type
)));
4728 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, aggr_type
);
4729 if (TREE_CODE (dr_base_type
) == ARRAY_TYPE
)
4730 dump_printf (MSG_NOTE
, " vectorizing an array ref: ");
4731 else if (TREE_CODE (dr_base_type
) == VECTOR_TYPE
)
4732 dump_printf (MSG_NOTE
, " vectorizing a vector ref: ");
4733 else if (TREE_CODE (dr_base_type
) == RECORD_TYPE
)
4734 dump_printf (MSG_NOTE
, " vectorizing a record based array ref: ");
4736 dump_printf (MSG_NOTE
, " vectorizing a pointer ref: ");
4737 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_BASE_OBJECT (dr
));
4738 dump_printf (MSG_NOTE
, "\n");
4741 /* (1) Create the new aggregate-pointer variable.
4742 Vector and array types inherit the alias set of their component
4743 type by default so we need to use a ref-all pointer if the data
4744 reference does not conflict with the created aggregated data
4745 reference because it is not addressable. */
4746 bool need_ref_all
= false;
4747 if (!alias_sets_conflict_p (get_alias_set (aggr_type
),
4748 get_alias_set (DR_REF (dr
))))
4749 need_ref_all
= true;
4750 /* Likewise for any of the data references in the stmt group. */
4751 else if (DR_GROUP_SIZE (stmt_info
) > 1)
4753 gimple
*orig_stmt
= DR_GROUP_FIRST_ELEMENT (stmt_info
);
4756 stmt_vec_info sinfo
= vinfo_for_stmt (orig_stmt
);
4757 struct data_reference
*sdr
= STMT_VINFO_DATA_REF (sinfo
);
4758 if (!alias_sets_conflict_p (get_alias_set (aggr_type
),
4759 get_alias_set (DR_REF (sdr
))))
4761 need_ref_all
= true;
4764 orig_stmt
= DR_GROUP_NEXT_ELEMENT (sinfo
);
4768 aggr_ptr_type
= build_pointer_type_for_mode (aggr_type
, ptr_mode
,
4770 aggr_ptr
= vect_get_new_vect_var (aggr_ptr_type
, vect_pointer_var
, base_name
);
4773 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4774 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4775 def-use update cycles for the pointer: one relative to the outer-loop
4776 (LOOP), which is what steps (3) and (4) below do. The other is relative
4777 to the inner-loop (which is the inner-most loop containing the dataref),
4778 and this is done by step (5) below.
4780 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4781 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4782 redundant. Steps (3),(4) create the following:
4785 LOOP: vp1 = phi(vp0,vp2)
4791 If there is an inner-loop nested in loop, then step (5) will also be
4792 applied, and an additional update in the inner-loop will be created:
4795 LOOP: vp1 = phi(vp0,vp2)
4797 inner: vp3 = phi(vp1,vp4)
4798 vp4 = vp3 + inner_step
4804 /* (2) Calculate the initial address of the aggregate-pointer, and set
4805 the aggregate-pointer to point to it before the loop. */
4807 /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */
4809 new_temp
= vect_create_addr_base_for_vector_ref (stmt
, &new_stmt_list
,
4810 offset
, byte_offset
);
4815 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, new_stmt_list
);
4816 gcc_assert (!new_bb
);
4819 gsi_insert_seq_before (gsi
, new_stmt_list
, GSI_SAME_STMT
);
4822 *initial_address
= new_temp
;
4823 aggr_ptr_init
= new_temp
;
4825 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4826 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4827 inner-loop nested in LOOP (during outer-loop vectorization). */
4829 /* No update in loop is required. */
4830 if (only_init
&& (!loop_vinfo
|| at_loop
== loop
))
4831 aptr
= aggr_ptr_init
;
4834 if (iv_step
== NULL_TREE
)
4836 /* The step of the aggregate pointer is the type size. */
4837 iv_step
= TYPE_SIZE_UNIT (aggr_type
);
4838 /* One exception to the above is when the scalar step of the load in
4839 LOOP is zero. In this case the step here is also zero. */
4841 iv_step
= size_zero_node
;
4842 else if (tree_int_cst_sgn (step
) == -1)
4843 iv_step
= fold_build1 (NEGATE_EXPR
, TREE_TYPE (iv_step
), iv_step
);
4846 standard_iv_increment_position (loop
, &incr_gsi
, &insert_after
);
4848 create_iv (aggr_ptr_init
,
4849 fold_convert (aggr_ptr_type
, iv_step
),
4850 aggr_ptr
, loop
, &incr_gsi
, insert_after
,
4851 &indx_before_incr
, &indx_after_incr
);
4852 incr
= gsi_stmt (incr_gsi
);
4853 set_vinfo_for_stmt (incr
, new_stmt_vec_info (incr
, loop_vinfo
));
4855 /* Copy the points-to information if it exists. */
4856 if (DR_PTR_INFO (dr
))
4858 vect_duplicate_ssa_name_ptr_info (indx_before_incr
, dr
);
4859 vect_duplicate_ssa_name_ptr_info (indx_after_incr
, dr
);
4864 aptr
= indx_before_incr
;
4867 if (!nested_in_vect_loop
|| only_init
)
4871 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4872 nested in LOOP, if exists. */
4874 gcc_assert (nested_in_vect_loop
);
4877 standard_iv_increment_position (containing_loop
, &incr_gsi
,
4879 create_iv (aptr
, fold_convert (aggr_ptr_type
, DR_STEP (dr
)), aggr_ptr
,
4880 containing_loop
, &incr_gsi
, insert_after
, &indx_before_incr
,
4882 incr
= gsi_stmt (incr_gsi
);
4883 set_vinfo_for_stmt (incr
, new_stmt_vec_info (incr
, loop_vinfo
));
4885 /* Copy the points-to information if it exists. */
4886 if (DR_PTR_INFO (dr
))
4888 vect_duplicate_ssa_name_ptr_info (indx_before_incr
, dr
);
4889 vect_duplicate_ssa_name_ptr_info (indx_after_incr
, dr
);
4894 return indx_before_incr
;
4901 /* Function bump_vector_ptr
4903 Increment a pointer (to a vector type) by vector-size. If requested,
4904 i.e. if PTR-INCR is given, then also connect the new increment stmt
4905 to the existing def-use update-chain of the pointer, by modifying
4906 the PTR_INCR as illustrated below:
4908 The pointer def-use update-chain before this function:
4909 DATAREF_PTR = phi (p_0, p_2)
4911 PTR_INCR: p_2 = DATAREF_PTR + step
4913 The pointer def-use update-chain after this function:
4914 DATAREF_PTR = phi (p_0, p_2)
4916 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4918 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4921 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4923 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4924 the loop. The increment amount across iterations is expected
4926 BSI - location where the new update stmt is to be placed.
4927 STMT - the original scalar memory-access stmt that is being vectorized.
4928 BUMP - optional. The offset by which to bump the pointer. If not given,
4929 the offset is assumed to be vector_size.
4931 Output: Return NEW_DATAREF_PTR as illustrated above.
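/* Typical use (illustrative sketch, not from the original source): when
   a vectorized access needs several vector copies, the caller does
   roughly

     for (j = 0; j < ncopies; j++)
       {
	 ... build a MEM_REF from DATAREF_PTR for copy j ...
	 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, gsi,
					stmt, bump);
       }

   so that each copy accesses the next BUMP-sized chunk.  */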
bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
		 gimple *stmt, tree bump)
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  tree update = TYPE_SIZE_UNIT (vectype);
4945 use_operand_p use_p
;
4946 tree new_dataref_ptr
;
4951 if (TREE_CODE (dataref_ptr
) == SSA_NAME
)
4952 new_dataref_ptr
= copy_ssa_name (dataref_ptr
);
4954 new_dataref_ptr
= make_ssa_name (TREE_TYPE (dataref_ptr
));
4955 incr_stmt
= gimple_build_assign (new_dataref_ptr
, POINTER_PLUS_EXPR
,
4956 dataref_ptr
, update
);
4957 vect_finish_stmt_generation (stmt
, incr_stmt
, gsi
);
4959 /* Copy the points-to information if it exists. */
4960 if (DR_PTR_INFO (dr
))
4962 duplicate_ssa_name_ptr_info (new_dataref_ptr
, DR_PTR_INFO (dr
));
4963 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr
));
4967 return new_dataref_ptr
;
4969 /* Update the vector-pointer's cross-iteration increment. */
4970 FOR_EACH_SSA_USE_OPERAND (use_p
, ptr_incr
, iter
, SSA_OP_USE
)
4972 tree use
= USE_FROM_PTR (use_p
);
4974 if (use
== dataref_ptr
)
4975 SET_USE (use_p
, new_dataref_ptr
);
4977 gcc_assert (operand_equal_p (use
, update
, 0));
4980 return new_dataref_ptr
;
4984 /* Copy memory reference info such as base/clique from the SRC reference
4985 to the DEST MEM_REF. */
vect_copy_ref_info (tree dest, tree src)
  if (TREE_CODE (dest) != MEM_REF)
    return;

  tree src_base = src;
  while (handled_component_p (src_base))
    src_base = TREE_OPERAND (src_base, 0);
  if (TREE_CODE (src_base) != MEM_REF
      && TREE_CODE (src_base) != TARGET_MEM_REF)
    return;

  MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
  MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
5005 /* Function vect_create_destination_var.
5007 Create a new temporary of type VECTYPE. */
5010 vect_create_destination_var (tree scalar_dest
, tree vectype
)
5016 enum vect_var_kind kind
;
5019 ? VECTOR_BOOLEAN_TYPE_P (vectype
)
5023 type
= vectype
? vectype
: TREE_TYPE (scalar_dest
);
5025 gcc_assert (TREE_CODE (scalar_dest
) == SSA_NAME
);
5027 name
= get_name (scalar_dest
);
5029 new_name
= xasprintf ("%s_%u", name
, SSA_NAME_VERSION (scalar_dest
));
5031 new_name
= xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest
));
5032 vec_dest
= vect_get_new_vect_var (type
, kind
, new_name
);
5038 /* Function vect_grouped_store_supported.
5040 Returns TRUE if interleave high and interleave low permutations
5041 are supported, and FALSE otherwise. */
5044 vect_grouped_store_supported (tree vectype
, unsigned HOST_WIDE_INT count
)
5046 machine_mode mode
= TYPE_MODE (vectype
);
5048 /* vect_permute_store_chain requires the group size to be equal to 3 or
5049 be a power of two. */
5050 if (count
!= 3 && exact_log2 (count
) == -1)
5052 if (dump_enabled_p ())
5053 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5054 "the size of the group of accesses"
5055 " is not a power of 2 or not eqaul to 3\n");
5059 /* Check that the permutation is supported. */
5060 if (VECTOR_MODE_P (mode
))
5065 unsigned int j0
= 0, j1
= 0, j2
= 0;
5069 if (!GET_MODE_NUNITS (mode
).is_constant (&nelt
))
5071 if (dump_enabled_p ())
5072 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5073 "cannot handle groups of 3 stores for"
5074 " variable-length vectors\n");
5078 vec_perm_builder
sel (nelt
, nelt
, 1);
5079 sel
.quick_grow (nelt
);
5080 vec_perm_indices indices
;
5081 for (j
= 0; j
< 3; j
++)
5083 int nelt0
= ((3 - j
) * nelt
) % 3;
5084 int nelt1
= ((3 - j
) * nelt
+ 1) % 3;
5085 int nelt2
= ((3 - j
) * nelt
+ 2) % 3;
5086 for (i
= 0; i
< nelt
; i
++)
5088 if (3 * i
+ nelt0
< nelt
)
5089 sel
[3 * i
+ nelt0
] = j0
++;
5090 if (3 * i
+ nelt1
< nelt
)
5091 sel
[3 * i
+ nelt1
] = nelt
+ j1
++;
5092 if (3 * i
+ nelt2
< nelt
)
5093 sel
[3 * i
+ nelt2
] = 0;
5095 indices
.new_vector (sel
, 2, nelt
);
5096 if (!can_vec_perm_const_p (mode
, indices
))
5098 if (dump_enabled_p ())
5099 dump_printf (MSG_MISSED_OPTIMIZATION
,
5100 "permutation op not supported by target.\n");
5104 for (i
= 0; i
< nelt
; i
++)
5106 if (3 * i
+ nelt0
< nelt
)
5107 sel
[3 * i
+ nelt0
] = 3 * i
+ nelt0
;
5108 if (3 * i
+ nelt1
< nelt
)
5109 sel
[3 * i
+ nelt1
] = 3 * i
+ nelt1
;
5110 if (3 * i
+ nelt2
< nelt
)
5111 sel
[3 * i
+ nelt2
] = nelt
+ j2
++;
5113 indices
.new_vector (sel
, 2, nelt
);
5114 if (!can_vec_perm_const_p (mode
, indices
))
5116 if (dump_enabled_p ())
5117 dump_printf (MSG_MISSED_OPTIMIZATION
,
5118 "permutation op not supported by target.\n");
5126 /* If length is not equal to 3 then only power of 2 is supported. */
5127 gcc_assert (pow2p_hwi (count
));
5128 poly_uint64 nelt
= GET_MODE_NUNITS (mode
);
5130 /* The encoding has 2 interleaved stepped patterns. */
5131 vec_perm_builder
sel (nelt
, 2, 3);
5133 for (i
= 0; i
< 3; i
++)
5136 sel
[i
* 2 + 1] = i
+ nelt
;
5138 vec_perm_indices
indices (sel
, 2, nelt
);
5139 if (can_vec_perm_const_p (mode
, indices
))
5141 for (i
= 0; i
< 6; i
++)
5142 sel
[i
] += exact_div (nelt
, 2);
5143 indices
.new_vector (sel
, 2, nelt
);
5144 if (can_vec_perm_const_p (mode
, indices
))
5150 if (dump_enabled_p ())
5151 dump_printf (MSG_MISSED_OPTIMIZATION
,
5152 "permutaion op not supported by target.\n");
5157 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5158 type VECTYPE. MASKED_P says whether the masked form is needed. */
5161 vect_store_lanes_supported (tree vectype
, unsigned HOST_WIDE_INT count
,
5165 return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5166 vec_mask_store_lanes_optab
,
5169 return vect_lanes_optab_supported_p ("vec_store_lanes",
5170 vec_store_lanes_optab
,
5175 /* Function vect_permute_store_chain.
5177 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5178 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5179 the data correctly for the stores. Return the final references for stores
5182 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5183 The input is 4 vectors each containing 8 elements. We assign a number to
5184 each element, the input sequence is:
5186 1st vec: 0 1 2 3 4 5 6 7
5187 2nd vec: 8 9 10 11 12 13 14 15
5188 3rd vec: 16 17 18 19 20 21 22 23
5189 4th vec: 24 25 26 27 28 29 30 31
5191 The output sequence should be:
5193 1st vec: 0 8 16 24 1 9 17 25
5194 2nd vec: 2 10 18 26 3 11 19 27
5195 3rd vec: 4 12 20 28 5 13 21 29
5196 4th vec: 6 14 22 30 7 15 23 31
5198 i.e., we interleave the contents of the four vectors in their order.
5200 We use interleave_high/low instructions to create such output. The input of
5201 each interleave_high/low operation is two vectors:
5204 the even elements of the result vector are obtained left-to-right from the
5205 high/low elements of the first vector. The odd elements of the result are
5206 obtained left-to-right from the high/low elements of the second vector.
5207 The output of interleave_high will be: 0 4 1 5
5208 and of interleave_low: 2 6 3 7
5211 The permutation is done in log LENGTH stages. In each stage interleave_high
5212 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5213 where the first argument is taken from the first half of DR_CHAIN and the
5214 second argument from its second half.
5217 I1: interleave_high (1st vec, 3rd vec)
5218 I2: interleave_low (1st vec, 3rd vec)
5219 I3: interleave_high (2nd vec, 4th vec)
5220 I4: interleave_low (2nd vec, 4th vec)
5222 The output for the first stage is:
5224 I1: 0 16 1 17 2 18 3 19
5225 I2: 4 20 5 21 6 22 7 23
5226 I3: 8 24 9 25 10 26 11 27
5227 I4: 12 28 13 29 14 30 15 31
5229 The output of the second stage, i.e. the final result is:
5231 I1: 0 8 16 24 1 9 17 25
5232 I2: 2 10 18 26 3 11 19 27
5233 I3: 4 12 20 28 5 13 21 29
5234 I4: 6 14 22 30 7 15 23 31. */
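/* Concrete selector example (illustrative, derived from the code below,
   not from the original source): for V8HI vectors the two VEC_PERM_EXPR
   masks used in each stage of the power-of-two case are

     interleave high: { 0, 8, 1, 9, 2, 10, 3, 11 }
     interleave low:  { 4, 12, 5, 13, 6, 14, 7, 15 }

   i.e. the elements of the two input vectors are paired up element by
   element.  */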
5237 vect_permute_store_chain (vec
<tree
> dr_chain
,
5238 unsigned int length
,
5240 gimple_stmt_iterator
*gsi
,
5241 vec
<tree
> *result_chain
)
5243 tree vect1
, vect2
, high
, low
;
5245 tree vectype
= STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt
));
5246 tree perm_mask_low
, perm_mask_high
;
5248 tree perm3_mask_low
, perm3_mask_high
;
5249 unsigned int i
, j
, n
, log_length
= exact_log2 (length
);
5251 result_chain
->quick_grow (length
);
5252 memcpy (result_chain
->address (), dr_chain
.address (),
5253 length
* sizeof (tree
));
5257 /* vect_grouped_store_supported ensures that this is constant. */
5258 unsigned int nelt
= TYPE_VECTOR_SUBPARTS (vectype
).to_constant ();
5259 unsigned int j0
= 0, j1
= 0, j2
= 0;
5261 vec_perm_builder
sel (nelt
, nelt
, 1);
5262 sel
.quick_grow (nelt
);
5263 vec_perm_indices indices
;
5264 for (j
= 0; j
< 3; j
++)
5266 int nelt0
= ((3 - j
) * nelt
) % 3;
5267 int nelt1
= ((3 - j
) * nelt
+ 1) % 3;
5268 int nelt2
= ((3 - j
) * nelt
+ 2) % 3;
5270 for (i
= 0; i
< nelt
; i
++)
5272 if (3 * i
+ nelt0
< nelt
)
5273 sel
[3 * i
+ nelt0
] = j0
++;
5274 if (3 * i
+ nelt1
< nelt
)
5275 sel
[3 * i
+ nelt1
] = nelt
+ j1
++;
5276 if (3 * i
+ nelt2
< nelt
)
5277 sel
[3 * i
+ nelt2
] = 0;
5279 indices
.new_vector (sel
, 2, nelt
);
5280 perm3_mask_low
= vect_gen_perm_mask_checked (vectype
, indices
);
5282 for (i
= 0; i
< nelt
; i
++)
5284 if (3 * i
+ nelt0
< nelt
)
5285 sel
[3 * i
+ nelt0
] = 3 * i
+ nelt0
;
5286 if (3 * i
+ nelt1
< nelt
)
5287 sel
[3 * i
+ nelt1
] = 3 * i
+ nelt1
;
5288 if (3 * i
+ nelt2
< nelt
)
5289 sel
[3 * i
+ nelt2
] = nelt
+ j2
++;
5291 indices
.new_vector (sel
, 2, nelt
);
5292 perm3_mask_high
= vect_gen_perm_mask_checked (vectype
, indices
);
5294 vect1
= dr_chain
[0];
5295 vect2
= dr_chain
[1];
5297 /* Create interleaving stmt:
5298 low = VEC_PERM_EXPR <vect1, vect2,
5299 {j, nelt, *, j + 1, nelt + j + 1, *,
5300 j + 2, nelt + j + 2, *, ...}> */
5301 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3_low");
5302 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, vect1
,
5303 vect2
, perm3_mask_low
);
5304 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5307 vect2
= dr_chain
[2];
5308 /* Create interleaving stmt:
5309 low = VEC_PERM_EXPR <vect1, vect2,
5310 {0, 1, nelt + j, 3, 4, nelt + j + 1,
5311 6, 7, nelt + j + 2, ...}> */
5312 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3_high");
5313 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, vect1
,
5314 vect2
, perm3_mask_high
);
5315 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5316 (*result_chain
)[j
] = data_ref
;
      /* If length is not equal to 3 then only power of 2 is supported.  */
      gcc_assert (pow2p_hwi (length));

      /* The encoding has 2 interleaved stepped patterns.  */
      poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
      vec_perm_builder sel (nelt, 2, 3);
      for (i = 0; i < 3; i++)
	{
	  /* The even selector slots (assumed: this line reconstructs a
	     dropped companion of the one below).  */
	  sel[i * 2] = i;
	  sel[i * 2 + 1] = i + nelt;
	}
      vec_perm_indices indices (sel, 2, nelt);
      perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);

      for (i = 0; i < 6; i++)
	sel[i] += exact_div (nelt, 2);
      indices.new_vector (sel, 2, nelt);
      perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5341 for (i
= 0, n
= log_length
; i
< n
; i
++)
5343 for (j
= 0; j
< length
/2; j
++)
5345 vect1
= dr_chain
[j
];
5346 vect2
= dr_chain
[j
+length
/2];
5348 /* Create interleaving stmt:
5349 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5351 high
= make_temp_ssa_name (vectype
, NULL
, "vect_inter_high");
5352 perm_stmt
= gimple_build_assign (high
, VEC_PERM_EXPR
, vect1
,
5353 vect2
, perm_mask_high
);
5354 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5355 (*result_chain
)[2*j
] = high
;
5357 /* Create interleaving stmt:
5358 low = VEC_PERM_EXPR <vect1, vect2,
5359 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5361 low
= make_temp_ssa_name (vectype
, NULL
, "vect_inter_low");
5362 perm_stmt
= gimple_build_assign (low
, VEC_PERM_EXPR
, vect1
,
5363 vect2
, perm_mask_low
);
5364 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5365 (*result_chain
)[2*j
+1] = low
;
5367 memcpy (dr_chain
.address (), result_chain
->address (),
5368 length
* sizeof (tree
));
5373 /* Function vect_setup_realignment
5375 This function is called when vectorizing an unaligned load using
5376 the dr_explicit_realign[_optimized] scheme.
5377 This function generates the following code at the loop prolog:
5380 x msq_init = *(floor(p)); # prolog load
5381 realignment_token = call target_builtin;
5383 x msq = phi (msq_init, ---)
5385 The stmts marked with x are generated only for the case of
5386 dr_explicit_realign_optimized.
5388 The code above sets up a new (vector) pointer, pointing to the first
5389 location accessed by STMT, and a "floor-aligned" load using that pointer.
5390 It also generates code to compute the "realignment-token" (if the relevant
5391 target hook was defined), and creates a phi-node at the loop-header bb
5392 whose arguments are the result of the prolog-load (created by this
5393 function) and the result of a load that takes place in the loop (to be
5394 created by the caller to this function).
5396 For the case of dr_explicit_realign_optimized:
5397 The caller to this function uses the phi-result (msq) to create the
5398 realignment code inside the loop, and sets up the missing phi argument,
5401 msq = phi (msq_init, lsq)
5402 lsq = *(floor(p')); # load in loop
5403 result = realign_load (msq, lsq, realignment_token);
5405 For the case of dr_explicit_realign:
5407 msq = *(floor(p)); # load in loop
5409 lsq = *(floor(p')); # load in loop
5410 result = realign_load (msq, lsq, realignment_token);
5413 STMT - (scalar) load stmt to be vectorized. This load accesses
5414 a memory location that may be unaligned.
5415 BSI - place where new code is to be inserted.
5416 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5420 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5421 target hook, if defined.
5422 Return value - the result of the loop-header phi node. */
5425 vect_setup_realignment (gimple
*stmt
, gimple_stmt_iterator
*gsi
,
5426 tree
*realignment_token
,
5427 enum dr_alignment_support alignment_support_scheme
,
5429 struct loop
**at_loop
)
5431 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
5432 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
5433 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
5434 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
5435 struct loop
*loop
= NULL
;
5437 tree scalar_dest
= gimple_assign_lhs (stmt
);
5443 tree msq_init
= NULL_TREE
;
5446 tree msq
= NULL_TREE
;
5447 gimple_seq stmts
= NULL
;
5449 bool compute_in_loop
= false;
5450 bool nested_in_vect_loop
= false;
5451 struct loop
*containing_loop
= (gimple_bb (stmt
))->loop_father
;
5452 struct loop
*loop_for_initial_load
= NULL
;
5456 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
5457 nested_in_vect_loop
= nested_in_vect_loop_p (loop
, stmt
);
5460 gcc_assert (alignment_support_scheme
== dr_explicit_realign
5461 || alignment_support_scheme
== dr_explicit_realign_optimized
);
5463 /* We need to generate three things:
5464 1. the misalignment computation
5465 2. the extra vector load (for the optimized realignment scheme).
5466 3. the phi node for the two vectors from which the realignment is
5467 done (for the optimized realignment scheme). */
5469 /* 1. Determine where to generate the misalignment computation.
5471 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5472 calculation will be generated by this function, outside the loop (in the
5473 preheader). Otherwise, INIT_ADDR had already been computed for us by the
5474 caller, inside the loop.
5476 Background: If the misalignment remains fixed throughout the iterations of
5477 the loop, then both realignment schemes are applicable, and also the
5478 misalignment computation can be done outside LOOP. This is because we are
5479 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5480 are a multiple of VS (the Vector Size), and therefore the misalignment in
5481 different vectorized LOOP iterations is always the same.
5482 The problem arises only if the memory access is in an inner-loop nested
5483 inside LOOP, which is now being vectorized using outer-loop vectorization.
5484 This is the only case when the misalignment of the memory access may not
5485 remain fixed throughout the iterations of the inner-loop (as explained in
5486 detail in vect_supportable_dr_alignment). In this case, not only is the
5487 optimized realignment scheme not applicable, but also the misalignment
5488 computation (and generation of the realignment token that is passed to
5489 REALIGN_LOAD) have to be done inside the loop.
5491 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5492 or not, which in turn determines if the misalignment is computed inside
5493 the inner-loop, or outside LOOP. */
5495 if (init_addr
!= NULL_TREE
|| !loop_vinfo
)
5497 compute_in_loop
= true;
5498 gcc_assert (alignment_support_scheme
== dr_explicit_realign
);
5502 /* 2. Determine where to generate the extra vector load.
5504 For the optimized realignment scheme, instead of generating two vector
5505 loads in each iteration, we generate a single extra vector load in the
5506 preheader of the loop, and in each iteration reuse the result of the
5507 vector load from the previous iteration. In case the memory access is in
5508 an inner-loop nested inside LOOP, which is now being vectorized using
5509 outer-loop vectorization, we need to determine whether this initial vector
5510 load should be generated at the preheader of the inner-loop, or can be
5511 generated at the preheader of LOOP. If the memory access has no evolution
5512 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5513 to be generated inside LOOP (in the preheader of the inner-loop). */
5515 if (nested_in_vect_loop
)
5517 tree outerloop_step
= STMT_VINFO_DR_STEP (stmt_info
);
5518 bool invariant_in_outerloop
=
5519 (tree_int_cst_compare (outerloop_step
, size_zero_node
) == 0);
5520 loop_for_initial_load
= (invariant_in_outerloop
? loop
: loop
->inner
);
5523 loop_for_initial_load
= loop
;
5525 *at_loop
= loop_for_initial_load
;
5527 if (loop_for_initial_load
)
5528 pe
= loop_preheader_edge (loop_for_initial_load
);
5530 /* 3. For the case of the optimized realignment, create the first vector
5531 load at the loop preheader. */
5533 if (alignment_support_scheme
== dr_explicit_realign_optimized
)
5535 /* Create msq_init = *(floor(p1)) in the loop preheader */
5538 gcc_assert (!compute_in_loop
);
5539 vec_dest
= vect_create_destination_var (scalar_dest
, vectype
);
5540 ptr
= vect_create_data_ref_ptr (stmt
, vectype
, loop_for_initial_load
,
5541 NULL_TREE
, &init_addr
, NULL
, &inc
,
5543 if (TREE_CODE (ptr
) == SSA_NAME
)
5544 new_temp
= copy_ssa_name (ptr
);
5546 new_temp
= make_ssa_name (TREE_TYPE (ptr
));
5547 unsigned int align
= DR_TARGET_ALIGNMENT (dr
);
5548 new_stmt
= gimple_build_assign
5549 (new_temp
, BIT_AND_EXPR
, ptr
,
5550 build_int_cst (TREE_TYPE (ptr
), -(HOST_WIDE_INT
) align
));
5551 new_bb
= gsi_insert_on_edge_immediate (pe
, new_stmt
);
5552 gcc_assert (!new_bb
);
5554 = build2 (MEM_REF
, TREE_TYPE (vec_dest
), new_temp
,
5555 build_int_cst (reference_alias_ptr_type (DR_REF (dr
)), 0));
5556 vect_copy_ref_info (data_ref
, DR_REF (dr
));
5557 new_stmt
= gimple_build_assign (vec_dest
, data_ref
);
5558 new_temp
= make_ssa_name (vec_dest
, new_stmt
);
5559 gimple_assign_set_lhs (new_stmt
, new_temp
);
5562 new_bb
= gsi_insert_on_edge_immediate (pe
, new_stmt
);
5563 gcc_assert (!new_bb
);
5566 gsi_insert_before (gsi
, new_stmt
, GSI_SAME_STMT
);
5568 msq_init
= gimple_assign_lhs (new_stmt
);
5571 /* 4. Create realignment token using a target builtin, if available.
5572 It is done either inside the containing loop, or before LOOP (as
5573 determined above). */
5575 if (targetm
.vectorize
.builtin_mask_for_load
)
5580 /* Compute INIT_ADDR - the initial address accessed by this memref.  */
5583 /* Generate the INIT_ADDR computation outside LOOP. */
5584 init_addr
= vect_create_addr_base_for_vector_ref (stmt
, &stmts
,
5588 pe
= loop_preheader_edge (loop
);
5589 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, stmts
);
5590 gcc_assert (!new_bb
);
5593 gsi_insert_seq_before (gsi
, stmts
, GSI_SAME_STMT
);
5596 builtin_decl
= targetm
.vectorize
.builtin_mask_for_load ();
5597 new_stmt
= gimple_build_call (builtin_decl
, 1, init_addr
);
5599 vect_create_destination_var (scalar_dest
,
5600 gimple_call_return_type (new_stmt
));
5601 new_temp
= make_ssa_name (vec_dest
, new_stmt
);
5602 gimple_call_set_lhs (new_stmt
, new_temp
);
5604 if (compute_in_loop
)
5605 gsi_insert_before (gsi
, new_stmt
, GSI_SAME_STMT
);
5608 /* Generate the misalignment computation outside LOOP. */
5609 pe
= loop_preheader_edge (loop
);
5610 new_bb
= gsi_insert_on_edge_immediate (pe
, new_stmt
);
5611 gcc_assert (!new_bb
);
5614 *realignment_token
= gimple_call_lhs (new_stmt
);
5616 /* The result of the CALL_EXPR to this builtin is determined from
5617 the value of the parameter and no global variables are touched
5618 which makes the builtin a "const" function. Requiring the
5619 builtin to have the "const" attribute makes it unnecessary
5620 to call mark_call_clobbered. */
5621 gcc_assert (TREE_READONLY (builtin_decl
));
5624 if (alignment_support_scheme
== dr_explicit_realign
)
5627 gcc_assert (!compute_in_loop
);
5628 gcc_assert (alignment_support_scheme
== dr_explicit_realign_optimized
);
5631 /* 5. Create msq = phi <msq_init, lsq> in loop */
5633 pe
= loop_preheader_edge (containing_loop
);
5634 vec_dest
= vect_create_destination_var (scalar_dest
, vectype
);
5635 msq
= make_ssa_name (vec_dest
);
5636 phi_stmt
= create_phi_node (msq
, containing_loop
->header
);
5637 add_phi_arg (phi_stmt
, msq_init
, pe
, UNKNOWN_LOCATION
);
5643 /* Function vect_grouped_load_supported.
5645 COUNT is the size of the load group (the number of statements plus the
5646 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5647 only one statement, with a gap of COUNT - 1.
5649 Returns true if a suitable permute exists. */
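/* Concrete selector example (illustrative, derived from the code below,
   not from the original source): for a group size of 2 and V8HI vectors,
   the permutes checked are the "extract even" selector
   { 0, 2, 4, 6, 8, 10, 12, 14 } and the "extract odd" selector
   { 1, 3, 5, 7, 9, 11, 13, 15 }, each applied to a pair of input
   vectors.  */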
vect_grouped_load_supported (tree vectype, bool single_element_p,
			     unsigned HOST_WIDE_INT count)
  machine_mode mode = TYPE_MODE (vectype);

  /* If this is single-element interleaving with an element distance
     that leaves unused vector loads around punt - we at least create
     very sub-optimal code in that case (and blow up memory,
  if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "single-element interleaving not supported "
			 "for non-adjacent vector loads\n");

  /* vect_permute_load_chain requires the group size to be equal to 3 or
     be a power of two.  */
  if (count != 3 && exact_log2 (count) == -1)
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "the size of the group of accesses"
			 " is not a power of 2 or not equal to 3\n");
5681 /* Check that the permutation is supported. */
5682 if (VECTOR_MODE_P (mode
))
5688 if (!GET_MODE_NUNITS (mode
).is_constant (&nelt
))
5690 if (dump_enabled_p ())
5691 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5692 "cannot handle groups of 3 loads for"
5693 " variable-length vectors\n");
5697 vec_perm_builder
sel (nelt
, nelt
, 1);
5698 sel
.quick_grow (nelt
);
5699 vec_perm_indices indices
;
5701 for (k
= 0; k
< 3; k
++)
5703 for (i
= 0; i
< nelt
; i
++)
5704 if (3 * i
+ k
< 2 * nelt
)
5708 indices
.new_vector (sel
, 2, nelt
);
5709 if (!can_vec_perm_const_p (mode
, indices
))
5711 if (dump_enabled_p ())
5712 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5713 "shuffle of 3 loads is not supported by"
5717 for (i
= 0, j
= 0; i
< nelt
; i
++)
5718 if (3 * i
+ k
< 2 * nelt
)
5721 sel
[i
] = nelt
+ ((nelt
+ k
) % 3) + 3 * (j
++);
5722 indices
.new_vector (sel
, 2, nelt
);
5723 if (!can_vec_perm_const_p (mode
, indices
))
5725 if (dump_enabled_p ())
5726 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5727 "shuffle of 3 loads is not supported by"
5736 /* If length is not equal to 3 then only power of 2 is supported. */
5737 gcc_assert (pow2p_hwi (count
));
5738 poly_uint64 nelt
= GET_MODE_NUNITS (mode
);
5740 /* The encoding has a single stepped pattern. */
5741 vec_perm_builder
sel (nelt
, 1, 3);
5743 for (i
= 0; i
< 3; i
++)
5745 vec_perm_indices
indices (sel
, 2, nelt
);
5746 if (can_vec_perm_const_p (mode
, indices
))
5748 for (i
= 0; i
< 3; i
++)
5750 indices
.new_vector (sel
, 2, nelt
);
5751 if (can_vec_perm_const_p (mode
, indices
))
5757 if (dump_enabled_p ())
5758 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5759 "extract even/odd not supported by target\n");
/* Return TRUE if vec_{masked_}load_lanes is available for COUNT vectors of
   type VECTYPE.  MASKED_P says whether the masked form is needed.  */

bool
vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
			   bool masked_p)
{
  if (masked_p)
    return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
					 vec_mask_load_lanes_optab,
					 vectype, count);
  else
    return vect_lanes_optab_supported_p ("vec_load_lanes",
					 vec_load_lanes_optab,
					 vectype, count);
}

/* Function vect_permute_load_chain.

   Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
   a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
   the input data correctly.  Return the final references for loads in
   RESULT_CHAIN.

   E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
   The input is 4 vectors each containing 8 elements.  We assign a number to
   each element, the input sequence is:

   1st vec:   0  1  2  3  4  5  6  7
   2nd vec:   8  9 10 11 12 13 14 15
   3rd vec:  16 17 18 19 20 21 22 23
   4th vec:  24 25 26 27 28 29 30 31

   The output sequence should be:

   1st vec:  0 4  8 12 16 20 24 28
   2nd vec:  1 5  9 13 17 21 25 29
   3rd vec:  2 6 10 14 18 22 26 30
   4th vec:  3 7 11 15 19 23 27 31

   i.e., the first output vector should contain the first elements of each
   interleaving group, etc.

   We use extract_even/odd instructions to create such output.  The input of
   each extract_even/odd operation is two vectors
   1st vec    2nd vec
   0 1 2 3    4 5 6 7

   and the output is the vector of extracted even/odd elements.  The output of
   extract_even will be:   0 2 4 6
   and of extract_odd:     1 3 5 7

   The permutation is done in log LENGTH stages.  In each stage extract_even
   and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
   their order.  In our example,

   E1: extract_even (1st vec, 2nd vec)
   E2: extract_odd (1st vec, 2nd vec)
   E3: extract_even (3rd vec, 4th vec)
   E4: extract_odd (3rd vec, 4th vec)

   The output for the first stage will be:

   E1:  0  2  4  6  8 10 12 14
   E2:  1  3  5  7  9 11 13 15
   E3: 16 18 20 22 24 26 28 30
   E4: 17 19 21 23 25 27 29 31

   In order to proceed and create the correct sequence for the next stage (or
   for the correct output, if the second stage is the last one, as in our
   example), we first put the output of extract_even operation and then the
   output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
   The input for the second stage is:

   1st vec (E1):  0  2  4  6  8 10 12 14
   2nd vec (E3): 16 18 20 22 24 26 28 30
   3rd vec (E2):  1  3  5  7  9 11 13 15
   4th vec (E4): 17 19 21 23 25 27 29 31

   The output of the second stage:

   E1: 0 4  8 12 16 20 24 28
   E2: 2 6 10 14 18 22 26 30
   E3: 1 5  9 13 17 21 25 29
   E4: 3 7 11 15 19 23 27 31

   And RESULT_CHAIN after reordering:

   1st vec (E1):  0 4  8 12 16 20 24 28
   2nd vec (E3):  1 5  9 13 17 21 25 29
   3rd vec (E2):  2 6 10 14 18 22 26 30
   4th vec (E4):  3 7 11 15 19 23 27 31.  */
static void
vect_permute_load_chain (vec<tree> dr_chain,
			 unsigned int length,
			 gimple *stmt,
			 gimple_stmt_iterator *gsi,
			 vec<tree> *result_chain)
{
  tree data_ref, first_vect, second_vect;
  tree perm_mask_even, perm_mask_odd;
  tree perm3_mask_low, perm3_mask_high;
  gimple *perm_stmt;
  tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
  unsigned int i, j, log_length = exact_log2 (length);

  result_chain->quick_grow (length);
  memcpy (result_chain->address (), dr_chain.address (),
	  length * sizeof (tree));

  if (length == 3)
    {
      /* vect_grouped_load_supported ensures that this is constant.  */
      unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
      unsigned int k;

      vec_perm_builder sel (nelt, nelt, 1);
      sel.quick_grow (nelt);
      vec_perm_indices indices;
      for (k = 0; k < 3; k++)
	{
	  for (i = 0; i < nelt; i++)
	    if (3 * i + k < 2 * nelt)
	      sel[i] = 3 * i + k;
	    else
	      sel[i] = 0;
	  indices.new_vector (sel, 2, nelt);
	  perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);

	  for (i = 0, j = 0; i < nelt; i++)
	    if (3 * i + k < 2 * nelt)
	      sel[i] = i;
	    else
	      sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
	  indices.new_vector (sel, 2, nelt);
	  perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);

	  first_vect = dr_chain[0];
	  second_vect = dr_chain[1];

	  /* Create interleaving stmt (low part of):
	     low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
							    ...}>  */
	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
					   second_vect, perm3_mask_low);
	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);

	  /* Create interleaving stmt (high part of):
	     high = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
							     ...}>  */
	  first_vect = data_ref;
	  second_vect = dr_chain[2];
	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
					   second_vect, perm3_mask_high);
	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);
	  (*result_chain)[k] = data_ref;
	}
    }
  else
    {
      /* If length is not equal to 3 then only power of 2 is supported.  */
      gcc_assert (pow2p_hwi (length));

      /* The encoding has a single stepped pattern.  */
      poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
      vec_perm_builder sel (nelt, 1, 3);
      sel.quick_grow (3);
      for (i = 0; i < 3; ++i)
	sel[i] = i * 2;
      vec_perm_indices indices (sel, 2, nelt);
      perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);

      for (i = 0; i < 3; ++i)
	sel[i] = i * 2 + 1;
      indices.new_vector (sel, 2, nelt);
      perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);

      for (i = 0; i < log_length; i++)
	{
	  for (j = 0; j < length; j += 2)
	    {
	      first_vect = dr_chain[j];
	      second_vect = dr_chain[j+1];

	      /* data_ref = permute_even (first_data_ref, second_data_ref);  */
	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
					       first_vect, second_vect,
					       perm_mask_even);
	      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
	      (*result_chain)[j/2] = data_ref;

	      /* data_ref = permute_odd (first_data_ref, second_data_ref);  */
	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
					       first_vect, second_vect,
					       perm_mask_odd);
	      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
	      (*result_chain)[j/2+length/2] = data_ref;
	    }
	  memcpy (dr_chain.address (), result_chain->address (),
		  length * sizeof (tree));
	}
    }
}

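/* Illustrative sketch (not part of the vectorizer): a scalar model of the
   two-stage extract_even/extract_odd deinterleave documented above
   vect_permute_load_chain, for the LENGTH == 4, eight-element example.
   The helper name and the plain int arrays are hypothetical; the real code
   works on SSA names and VEC_PERM_EXPRs.  IN[0..3] holds elements 0..31 as
   in the comment; OUT receives the strided groups 0 4 8 ..., 1 5 9 ..., etc.  */
static void
example_even_odd_deinterleave (const int in[4][8], int out[4][8])
{
  int tmp[4][8];
  for (int i = 0; i < 8; i++)
    {
      /* Stage 1: E1/E2 from (vec0, vec1) and E3/E4 from (vec2, vec3),
	 stored evens-before-odds exactly as RESULT_CHAIN is filled.  */
      tmp[0][i] = i < 4 ? in[0][2 * i] : in[1][2 * (i - 4)];	     /* E1 */
      tmp[2][i] = i < 4 ? in[0][2 * i + 1] : in[1][2 * (i - 4) + 1]; /* E2 */
      tmp[1][i] = i < 4 ? in[2][2 * i] : in[3][2 * (i - 4)];	     /* E3 */
      tmp[3][i] = i < 4 ? in[2][2 * i + 1] : in[3][2 * (i - 4) + 1]; /* E4 */
    }
  for (int i = 0; i < 8; i++)
    {
      /* Stage 2: the same extracts on the reordered chain (E1,E3,E2,E4)
	 produce the final output vectors.  */
      out[0][i] = i < 4 ? tmp[0][2 * i] : tmp[1][2 * (i - 4)];
      out[2][i] = i < 4 ? tmp[0][2 * i + 1] : tmp[1][2 * (i - 4) + 1];
      out[1][i] = i < 4 ? tmp[2][2 * i] : tmp[3][2 * (i - 4)];
      out[3][i] = i < 4 ? tmp[2][2 * i + 1] : tmp[3][2 * (i - 4) + 1];
    }
}
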
/* Function vect_shift_permute_load_chain.

   Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate a
   sequence of stmts to reorder the input data accordingly.
   Return the final references for loads in RESULT_CHAIN.
   Return true if succeeded, false otherwise.

   E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
   The input is 3 vectors each containing 8 elements.  We assign a
   number to each element, the input sequence is:

   1st vec:   0  1  2  3  4  5  6  7
   2nd vec:   8  9 10 11 12 13 14 15
   3rd vec:  16 17 18 19 20 21 22 23

   The output sequence should be:

   1st vec:  0 3 6  9 12 15 18 21
   2nd vec:  1 4 7 10 13 16 19 22
   3rd vec:  2 5 8 11 14 17 20 23

   We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.

   First we shuffle all 3 vectors to get correct elements order:

   1st vec:  ( 0  3  6) ( 1  4  7) ( 2  5)
   2nd vec:  ( 8 11 14) ( 9 12 15) (10 13)
   3rd vec:  (16 19 22) (17 20 23) (18 21)

   Next we unite and shift vector 3 times:

   1st step:
     shift right by 6 the concatenation of:
     "1st vec" and "2nd vec"
       ( 0  3  6) ( 1  4  7) |( 2  5) _ ( 8 11 14) ( 9 12 15)| (10 13)
     "2nd vec" and "3rd vec"
       ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
     "3rd vec" and "1st vec"
       (16 19 22) (17 20 23) |(18 21) _ ( 0  3  6) ( 1  4  7)| ( 2  5)

     So that now new vectors are:

     1st vec:  ( 2  5) ( 8 11 14) ( 9 12 15)
     2nd vec:  (10 13) (16 19 22) (17 20 23)
     3rd vec:  (18 21) ( 0  3  6) ( 1  4  7)

   2nd step:
     shift right by 5 the concatenation of:
     "1st vec" and "3rd vec"
       ( 2  5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0  3  6)| ( 1  4  7)
     "2nd vec" and "1st vec"
       (10 13) (16 19 22) |(17 20 23) _ ( 2  5) ( 8 11 14)| ( 9 12 15)
     "3rd vec" and "2nd vec"
       (18 21) ( 0  3  6) |( 1  4  7) _ (10 13) (16 19 22)| (17 20 23)

     So that now new vectors are:

     1st vec:  ( 9 12 15) (18 21) ( 0  3  6)
     2nd vec:  (17 20 23) ( 2  5) ( 8 11 14)
     3rd vec:  ( 1  4  7) (10 13) (16 19 22) READY

   3rd step:
     shift right by 5 the concatenation of:
     "1st vec" and "1st vec"
       ( 9 12 15) (18 21) |( 0  3  6) _ ( 9 12 15) (18 21)| ( 0  3  6)
     shift right by 3 the concatenation of:
     "2nd vec" and "2nd vec"
       (17 20 23) |( 2  5) ( 8 11 14) _ (17 20 23)| ( 2  5) ( 8 11 14)

     So that now all vectors are READY:

     1st vec:  ( 0  3  6) ( 9 12 15) (18 21)
     2nd vec:  ( 2  5) ( 8 11 14) (17 20 23)
     3rd vec:  ( 1  4  7) (10 13) (16 19 22)

   This algorithm is faster than the one in vect_permute_load_chain if:
     1.  "shift of a concatenation" is faster than general permutation.
     2.  The TARGET machine can't execute vector instructions in parallel.
	 This is because each step of the algorithm depends on the previous
	 one.  The algorithm in vect_permute_load_chain is much more parallel.

   The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.  */
static bool
vect_shift_permute_load_chain (vec<tree> dr_chain,
			       unsigned int length,
			       gimple *stmt,
			       gimple_stmt_iterator *gsi,
			       vec<tree> *result_chain)
{
  tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
  tree perm2_mask1, perm2_mask2, perm3_mask;
  tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
  gimple *perm_stmt;

  tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
  unsigned int i;
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);

  unsigned HOST_WIDE_INT nelt, vf;
  if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
      || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
    /* Not supported for variable-length vectors.  */
    return false;

  vec_perm_builder sel (nelt, nelt, 1);
  sel.quick_grow (nelt);

  result_chain->quick_grow (length);
  memcpy (result_chain->address (), dr_chain.address (),
	  length * sizeof (tree));

  if (pow2p_hwi (length) && vf > 4)
    {
      unsigned int j, log_length = exact_log2 (length);
      for (i = 0; i < nelt / 2; ++i)
	sel[i] = i * 2;
      for (i = 0; i < nelt / 2; ++i)
	sel[nelt / 2 + i] = i * 2 + 1;
      vec_perm_indices indices (sel, 2, nelt);
      if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "shuffle of 2 fields structure is not \
supported by target\n");
	  return false;
	}
      perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);

      for (i = 0; i < nelt / 2; ++i)
	sel[i] = i * 2 + 1;
      for (i = 0; i < nelt / 2; ++i)
	sel[nelt / 2 + i] = i * 2;
      indices.new_vector (sel, 2, nelt);
      if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "shuffle of 2 fields structure is not \
supported by target\n");
	  return false;
	}
      perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);

      /* Generating permutation constant to shift all elements.
	 For vector length 8 it is {4 5 6 7 8 9 10 11}.  */
      for (i = 0; i < nelt; i++)
	sel[i] = nelt / 2 + i;
      indices.new_vector (sel, 2, nelt);
      if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "shift permutation is not supported by target\n");
	  return false;
	}
      shift1_mask = vect_gen_perm_mask_checked (vectype, indices);

      /* Generating permutation constant to select vector from 2.
	 For vector length 8 it is {0 1 2 3 12 13 14 15}.  */
      for (i = 0; i < nelt / 2; i++)
	sel[i] = i;
      for (i = nelt / 2; i < nelt; i++)
	sel[i] = nelt + i;
      indices.new_vector (sel, 2, nelt);
      if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "select is not supported by target\n");
	  return false;
	}
      select_mask = vect_gen_perm_mask_checked (vectype, indices);

      for (i = 0; i < log_length; i++)
	{
	  for (j = 0; j < length; j += 2)
	    {
	      first_vect = dr_chain[j];
	      second_vect = dr_chain[j + 1];

	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
					       first_vect, first_vect,
					       perm2_mask1);
	      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
	      vect[0] = data_ref;

	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
					       second_vect, second_vect,
					       perm2_mask2);
	      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
	      vect[1] = data_ref;

	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
					       vect[0], vect[1], shift1_mask);
	      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
	      (*result_chain)[j/2 + length/2] = data_ref;

	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
					       vect[0], vect[1], select_mask);
	      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
	      (*result_chain)[j/2] = data_ref;
	    }
	  memcpy (dr_chain.address (), result_chain->address (),
		  length * sizeof (tree));
	}
      return true;
    }
  if (length == 3 && vf > 2)
    {
      unsigned int k = 0, l = 0;

      /* Generating permutation constant to get all elements in right order.
	 For vector length 8 it is {0 3 6 1 4 7 2 5}.  */
      for (i = 0; i < nelt; i++)
	{
	  if (3 * k + (l % 3) >= nelt)
	    {
	      k = 0;
	      l += (3 - (nelt % 3));
	    }
	  sel[i] = 3 * k + (l % 3);
	  k++;
	}
      vec_perm_indices indices (sel, 2, nelt);
      if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "shuffle of 3 fields structure is not \
supported by target\n");
	  return false;
	}
      perm3_mask = vect_gen_perm_mask_checked (vectype, indices);

      /* Generating permutation constant to shift all elements.
	 For vector length 8 it is {6 7 8 9 10 11 12 13}.  */
      for (i = 0; i < nelt; i++)
	sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
      indices.new_vector (sel, 2, nelt);
      if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "shift permutation is not supported by target\n");
	  return false;
	}
      shift1_mask = vect_gen_perm_mask_checked (vectype, indices);

      /* Generating permutation constant to shift all elements.
	 For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
      for (i = 0; i < nelt; i++)
	sel[i] = 2 * (nelt / 3) + 1 + i;
      indices.new_vector (sel, 2, nelt);
      if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "shift permutation is not supported by target\n");
	  return false;
	}
      shift2_mask = vect_gen_perm_mask_checked (vectype, indices);

      /* Generating permutation constant to shift all elements.
	 For vector length 8 it is {3 4 5 6 7 8 9 10}.  */
      for (i = 0; i < nelt; i++)
	sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
      indices.new_vector (sel, 2, nelt);
      if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "shift permutation is not supported by target\n");
	  return false;
	}
      shift3_mask = vect_gen_perm_mask_checked (vectype, indices);

      /* Generating permutation constant to shift all elements.
	 For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
      for (i = 0; i < nelt; i++)
	sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
      indices.new_vector (sel, 2, nelt);
      if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "shift permutation is not supported by target\n");
	  return false;
	}
      shift4_mask = vect_gen_perm_mask_checked (vectype, indices);

      for (k = 0; k < 3; k++)
	{
	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
					   dr_chain[k], dr_chain[k],
					   perm3_mask);
	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);
	  vect[k] = data_ref;
	}

      for (k = 0; k < 3; k++)
	{
	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
					   vect[k % 3], vect[(k + 1) % 3],
					   shift1_mask);
	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);
	  vect_shift[k] = data_ref;
	}

      for (k = 0; k < 3; k++)
	{
	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
					   vect_shift[(4 - k) % 3],
					   vect_shift[(3 - k) % 3],
					   shift2_mask);
	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);
	  vect[k] = data_ref;
	}

      (*result_chain)[3 - (nelt % 3)] = vect[2];

      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
				       vect[0], shift3_mask);
      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
      (*result_chain)[nelt % 3] = data_ref;

      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
				       vect[1], shift4_mask);
      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
      (*result_chain)[0] = data_ref;
      return true;
    }
  return false;
}

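/* Illustrative sketch (not part of the vectorizer): scalar model of the
   "shift right by AMOUNT the concatenation of A and B" primitive that the
   shiftN_mask VEC_PERM_EXPRs above implement, i.e. the window of NELT
   elements starting AMOUNT elements into A ++ B.  The helper name and the
   plain-array interface are hypothetical; AMOUNT is assumed to be at most
   NELT.  */
static void
example_shift_concat_right (const int *a, const int *b, unsigned int nelt,
			    unsigned int amount, int *out)
{
  for (unsigned int i = 0; i < nelt; i++)
    {
      unsigned int src = amount + i;
      out[i] = src < nelt ? a[src] : b[src - nelt];
    }
}
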
/* Function vect_transform_grouped_load.

   Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
   to perform their permutation and ascribe the result vectorized statements to
   the scalar statements.  */

void
vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
			     gimple_stmt_iterator *gsi)
{
  machine_mode mode;
  vec<tree> result_chain = vNULL;

  /* DR_CHAIN contains input data-refs that are a part of the interleaving.
     RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
     vectors, that are ready for vector computation.  */
  result_chain.create (size);

  /* If the reassociation width for the vector type is 2 or greater, the
     target machine can execute 2 or more vector instructions in parallel.
     Otherwise try to get the chain for the load group using
     vect_shift_permute_load_chain.  */
  mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
  if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
      || pow2p_hwi (size)
      || !vect_shift_permute_load_chain (dr_chain, size, stmt,
					 gsi, &result_chain))
    vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
  vect_record_grouped_load_vectors (stmt, result_chain);
  result_chain.release ();
}

/* RESULT_CHAIN contains the output of a group of grouped loads that were
   generated as part of the vectorization of STMT.  Assign the statement
   for each vector to the associated scalar statement.  */

void
vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
{
  gimple *first_stmt = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
  gimple *next_stmt, *new_stmt;
  unsigned int i, gap_count;
  tree tmp_data_ref;

  /* Put a permuted data-ref in the VECTORIZED_STMT field.
     Since we scan the chain starting from its first node, their order
     corresponds to the order of data-refs in RESULT_CHAIN.  */
  next_stmt = first_stmt;
  gap_count = 1;
  FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
    {
      if (!next_stmt)
	break;

      /* Skip the gaps.  Loads created for the gaps will be removed by dead
	 code elimination pass later.  No need to check for the first stmt in
	 the group, since it always exists.
	 DR_GROUP_GAP is the number of steps in elements from the previous
	 access (if there is no gap DR_GROUP_GAP is 1).  We skip loads that
	 correspond to the gaps.  */
      if (next_stmt != first_stmt
	  && gap_count < DR_GROUP_GAP (vinfo_for_stmt (next_stmt)))
	{
	  gap_count++;
	  continue;
	}

      while (next_stmt)
	{
	  new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
	  /* We assume that if VEC_STMT is not NULL, this is a case of multiple
	     copies, and we put the new vector statement in the first available
	     RELATED_STMT field.  */
	  if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
	    STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
	  else
	    {
	      if (!DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
		{
		  gimple *prev_stmt =
		    STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
		  gimple *rel_stmt =
		    STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
		  while (rel_stmt)
		    {
		      prev_stmt = rel_stmt;
		      rel_stmt =
			STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
		    }

		  STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
		    new_stmt;
		}
	    }

	  next_stmt = DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
	  gap_count = 1;
	  /* If NEXT_STMT accesses the same DR as the previous statement,
	     put the same TMP_DATA_REF as its vectorized statement; otherwise
	     get the next data-ref from RESULT_CHAIN.  */
	  if (!next_stmt || !DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
	    break;
	}
    }
}

/* Function vect_can_force_dr_alignment_p.

   Returns whether the alignment of a DECL can be forced to be aligned
   on ALIGNMENT bit boundary.  */

bool
vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
{
  if (!VAR_P (decl))
    return false;

  if (decl_in_symtab_p (decl)
      && !symtab_node::get (decl)->can_increase_alignment_p ())
    return false;

  if (TREE_STATIC (decl))
    return (alignment <= MAX_OFILE_ALIGNMENT);
  else
    return (alignment <= MAX_STACK_ALIGNMENT);
}

/* Return whether the data reference DR is supported with respect to its
   alignment.
   If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
   if it is aligned, i.e., check if it is possible to vectorize it with
   different alignment.  */

enum dr_alignment_support
vect_supportable_dr_alignment (struct data_reference *dr,
			       bool check_aligned_accesses)
{
  gimple *stmt = vect_dr_stmt (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  machine_mode mode = TYPE_MODE (vectype);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *vect_loop = NULL;
  bool nested_in_vect_loop = false;

  if (aligned_access_p (dr) && !check_aligned_accesses)
    return dr_aligned;

  /* For now assume all conditional loads/stores support unaligned
     access without any special code.  */
  if (is_gimple_call (stmt)
      && gimple_call_internal_p (stmt)
      && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
	  || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
    return dr_unaligned_supported;

  if (loop_vinfo)
    {
      vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
      nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
    }

  /* Possibly unaligned access.  */

  /* We can choose between using the implicit realignment scheme (generating
     a misaligned_move stmt) and the explicit realignment scheme (generating
     aligned loads with a REALIGN_LOAD).  There are two variants to the
     explicit realignment scheme: optimized, and unoptimized.
     We can optimize the realignment only if the step between consecutive
     vector loads is equal to the vector size.  Since the vector memory
     accesses advance in steps of VS (Vector Size) in the vectorized loop, it
     is guaranteed that the misalignment amount remains the same throughout the
     execution of the vectorized loop.  Therefore, we can create the
     "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
     at the loop preheader.

     However, in the case of outer-loop vectorization, when vectorizing a
     memory access in the inner-loop nested within the LOOP that is now being
     vectorized, while it is guaranteed that the misalignment of the
     vectorized memory access will remain the same in different outer-loop
     iterations, it is *not* guaranteed that it will remain the same throughout
     the execution of the inner-loop.  This is because the inner-loop advances
     with the original scalar step (and not in steps of VS).  If the inner-loop
     step happens to be a multiple of VS, then the misalignment remains fixed
     and we can use the optimized realignment scheme.  For example:

      for (i=0; i<N; i++)
        for (j=0; j<M; j++)
          s += a[i+j];

     When vectorizing the i-loop in the above example, the step between
     consecutive vector loads is 1, and so the misalignment does not remain
     fixed across the execution of the inner-loop, and the realignment cannot
     be optimized (as illustrated in the following pseudo vectorized loop):

      for (i=0; i<N; i+=4)
        for (j=0; j<M; j++){
          vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
                         // when j is {0,1,2,3,4,5,6,7,...} respectively.
                         // (assuming that we start from an aligned address).
          }

     We therefore have to use the unoptimized realignment scheme:

      for (i=0; i<N; i+=4)
        for (j=k; j<M; j+=4)
          vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
                         // that the misalignment of the initial address is
                         // 0).

     The loop can then be vectorized as follows:

      for (k=0; k<4; k++){
        rt = get_realignment_token (&vp[k]);
        for (i=0; i<N; i+=4){
          v1 = vp[i+k];
          for (j=k; j<M; j+=4){
            v2 = vp[i+j+VS-1];
            va = REALIGN_LOAD <v1,v2,rt>;
            vs += va;
            v1 = v2;
          }
        }
      }  */

  if (DR_IS_READ (dr))
    {
      bool is_packed = false;
      tree type = (TREE_TYPE (DR_REF (dr)));

      if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
	  && (!targetm.vectorize.builtin_mask_for_load
	      || targetm.vectorize.builtin_mask_for_load ()))
	{
	  tree vectype = STMT_VINFO_VECTYPE (stmt_info);

	  /* If we are doing SLP then the accesses need not have the
	     same alignment, instead it depends on the SLP group size.  */
	  if (loop_vinfo
	      && STMT_SLP_TYPE (stmt_info)
	      && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
			      * DR_GROUP_SIZE (vinfo_for_stmt
					       (DR_GROUP_FIRST_ELEMENT (stmt_info))),
			      TYPE_VECTOR_SUBPARTS (vectype)))
	    ;
	  else if (!loop_vinfo
		   || (nested_in_vect_loop
		       && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
				    GET_MODE_SIZE (TYPE_MODE (vectype)))))
	    return dr_explicit_realign;
	  else
	    return dr_explicit_realign_optimized;
	}
      if (!known_alignment_for_access_p (dr))
	is_packed = not_size_aligned (DR_REF (dr));

      if (targetm.vectorize.support_vector_misalignment
	    (mode, type, DR_MISALIGNMENT (dr), is_packed))
	/* Can't software pipeline the loads, but can at least do them.  */
	return dr_unaligned_supported;
    }
  else
    {
      bool is_packed = false;
      tree type = (TREE_TYPE (DR_REF (dr)));

      if (!known_alignment_for_access_p (dr))
	is_packed = not_size_aligned (DR_REF (dr));

      if (targetm.vectorize.support_vector_misalignment
	    (mode, type, DR_MISALIGNMENT (dr), is_packed))
	return dr_unaligned_supported;
    }

  /* Unsupported.  */
  return dr_unaligned_unsupported;
}

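/* Illustrative sketch (not part of the vectorizer): scalar model of the
   dr_explicit_realign_optimized scheme discussed above.  One aligned chunk
   (MSQ) is loaded before the loop; each iteration performs a single new
   aligned load (LSQ), takes the NELT-element window at misalignment OFS from
   the pair - the REALIGN_LOAD - and then reuses LSQ as the next MSQ, which is
   what the msq = phi <msq_init, lsq> built in vect_setup_realignment
   arranges.  All names and the plain-array interface are hypothetical.  */
static void
example_realign_optimized_loads (const int *aligned_base, unsigned int nelt,
				 unsigned int ofs, unsigned int niters,
				 int *out)
{
  const int *msq = aligned_base;	/* msq_init, loaded in the preheader.  */
  for (unsigned int it = 0; it < niters; it++)
    {
      const int *lsq = aligned_base + (it + 1) * nelt;	/* One aligned load.  */
      for (unsigned int i = 0; i < nelt; i++)	/* REALIGN_LOAD <msq, lsq, rt>.  */
	out[it * nelt + i] = ofs + i < nelt ? msq[ofs + i] : lsq[ofs + i - nelt];
      msq = lsq;			/* Feeds the loop PHI.  */
    }
}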