/* Data References Analysis and Manipulation Utilities for Vectorization.
   Copyright (C) 2003-2017 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "gimple.h"
#include "predict.h"
#include "memmodel.h"
#include "tm_p.h"
#include "ssa.h"
#include "optabs-tree.h"
#include "cgraph.h"
#include "dumpfile.h"
#include "alias.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "tree-eh.h"
#include "gimplify.h"
#include "gimple-iterator.h"
#include "gimplify-me.h"
#include "tree-ssa-loop-ivopts.h"
#include "tree-ssa-loop-manip.h"
#include "tree-ssa-loop.h"
#include "cfgloop.h"
#include "tree-scalar-evolution.h"
#include "tree-vectorizer.h"
#include "expr.h"
#include "builtins.h"
#include "params.h"
#include "tree-cfg.h"
#include "tree-hash-traits.h"
/* Return true if load- or store-lanes optab OPTAB is implemented for
   COUNT vectors of type VECTYPE.  NAME is the name of OPTAB.  */

static bool
vect_lanes_optab_supported_p (const char *name, convert_optab optab,
                              tree vectype, unsigned HOST_WIDE_INT count)
{
  machine_mode mode;
  scalar_int_mode array_mode;
  bool limit_p;

  mode = TYPE_MODE (vectype);
  limit_p = !targetm.array_mode_supported_p (mode, count);
  if (!int_mode_for_size (count * GET_MODE_BITSIZE (mode),
                          limit_p).exists (&array_mode))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
                         GET_MODE_NAME (mode), count);
      return false;
    }

  if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "cannot use %s<%s><%s>\n", name,
                         GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
      return false;
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
                     GET_MODE_NAME (mode));

  return true;
}
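
/* For illustration: with COUNT == 4 vectors of a 16-byte vector mode, the
   check above first asks the target for a 64-byte integer array mode and
   then requires the load/store-lanes convert_optab to have a handler for
   that (array mode, vector mode) pair; if either query fails, the caller
   cannot use lane loads/stores for the group.  (The concrete sizes here
   are only an example, not taken from the surrounding code.)  */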
/* Return the smallest scalar part of STMT.
   This is used to determine the vectype of the stmt.  We generally set the
   vectype according to the type of the result (lhs).  For stmts whose
   result-type is different than the type of the arguments (e.g., demotion,
   promotion), vectype will be reset appropriately (later).  Note that we have
   to visit the smallest datatype in this function, because that determines the
   VF.  If the smallest datatype in the loop is present only as the rhs of a
   promotion operation - we'd miss it.
   Such a case, where a variable of this datatype does not appear in the lhs
   anywhere in the loop, can only occur if it's an invariant: e.g.:
   'int_x = (int) short_inv', which we'd expect to have been optimized away by
   invariant motion.  However, we cannot rely on invariant motion to always
   take invariants out of the loop, and so in the case of promotion we also
   have to check the rhs.
   LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
   types.  */

tree
vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
                               HOST_WIDE_INT *rhs_size_unit)
{
  tree scalar_type = gimple_expr_type (stmt);
  HOST_WIDE_INT lhs, rhs;

  lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));

  if (is_gimple_assign (stmt)
      && (gimple_assign_cast_p (stmt)
          || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
          || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
          || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
    {
      tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));

      rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
      if (rhs < lhs)
        scalar_type = rhs_type;
    }

  *lhs_size_unit = lhs;
  *rhs_size_unit = rhs;
  return scalar_type;
}
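
/* For example (names are illustrative), given an invariant 'short_inv'
   that only occurs as

     int_x = (int) short_inv;
     a[i] = int_x + b[i];

   the smallest scalar type in the loop is 'short', visible only on the
   rhs of the promotion; returning it here is what bases the VF on the
   16-bit element size rather than the 32-bit one.  */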
/* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
   tested at run-time.  Return TRUE if DDR was successfully inserted.
   Return false if versioning is not supported.  */

static bool
vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

  if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
    return false;

  if (!runtime_alias_check_p (ddr, loop,
                              optimize_loop_nest_for_speed_p (loop)))
    return false;

  LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
  return true;
}
/* A subroutine of vect_analyze_data_ref_dependence.  Handle
   DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
   distances.  These distances are conservatively correct but they don't
   reflect a guaranteed dependence.

   Return true if this function does all the work necessary to avoid
   an alias or false if the caller should use the dependence distances
   to limit the vectorization factor in the usual way.  LOOP_DEPTH is
   the depth of the loop described by LOOP_VINFO and the other arguments
   are as for vect_analyze_data_ref_dependence.  */

static bool
vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
                                       loop_vec_info loop_vinfo,
                                       int loop_depth, int *max_vf)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  lambda_vector dist_v;
  unsigned int i;
  FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
    {
      int dist = dist_v[loop_depth];
      if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
        {
          /* If the user asserted safelen >= DIST consecutive iterations
             can be executed concurrently, assume independence.

             ??? An alternative would be to add the alias check even
             in this case, and vectorize the fallback loop with the
             maximum VF set to safelen.  However, if the user has
             explicitly given a length, it's less likely that that
             would be a problem.  */
          if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
            {
              if (loop->safelen < *max_vf)
                *max_vf = loop->safelen;
              LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
              continue;
            }

          /* For dependence distances of 2 or more, we have the option
             of limiting VF or checking for an alias at runtime.
             Prefer to check at runtime if we can, to avoid limiting
             the VF unnecessarily when the bases are in fact independent.

             Note that the alias checks will be removed if the VF ends up
             being small enough.  */
          return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
        }
    }
  return true;
}
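
/* Illustrative safelen case for the code above: in

     #pragma omp simd safelen(8)
     for (i = 0; i < n; i++)
       a[i] = a[i + k] + 1;

   a conservative distance of, say, 4 is ignored because it does not
   exceed the asserted safelen; *MAX_VF is merely capped at 8.  Distances
   above safelen instead lead to a runtime alias check via
   vect_mark_for_runtime_alias_test.  */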
/* Function vect_analyze_data_ref_dependence.

   Return TRUE if there (might) exist a dependence between a memory-reference
   DRA and a memory-reference DRB.  When versioning for alias may check a
   dependence at run-time, return FALSE.  Adjust *MAX_VF according to
   the data dependence.  */

static bool
vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
                                  loop_vec_info loop_vinfo, int *max_vf)
{
  unsigned int i;
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  struct data_reference *dra = DDR_A (ddr);
  struct data_reference *drb = DDR_B (ddr);
  stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
  stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
  lambda_vector dist_v;
  unsigned int loop_depth;

  /* In loop analysis all data references should be vectorizable.  */
  if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
      || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
    gcc_unreachable ();

  /* Independent data accesses.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
    return false;

  if (dra == drb
      || (DR_IS_READ (dra) && DR_IS_READ (drb)))
    return false;

  /* We do not have to consider dependences between accesses that belong
     to the same group.  */
  if (GROUP_FIRST_ELEMENT (stmtinfo_a)
      && GROUP_FIRST_ELEMENT (stmtinfo_a) == GROUP_FIRST_ELEMENT (stmtinfo_b))
    return false;

  /* Even if we have an anti-dependence then, as the vectorized loop covers at
     least two scalar iterations, there is always also a true dependence.
     As the vectorizer does not re-order loads and stores we can ignore
     the anti-dependence if TBAA can disambiguate both DRs similar to the
     case with known negative distance anti-dependences (positive
     distance anti-dependences would violate TBAA constraints).  */
  if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
       || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
      && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
                                 get_alias_set (DR_REF (drb))))
    return false;

  /* Unknown data dependence.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
    {
      /* If user asserted safelen consecutive iterations can be
         executed concurrently, assume independence.  */
      if (loop->safelen >= 2)
        {
          if (loop->safelen < *max_vf)
            *max_vf = loop->safelen;
          LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
          return false;
        }

      if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
          || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "versioning for alias not supported for: "
                               "can't determine dependence between ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                 DR_REF (dra));
              dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                 DR_REF (drb));
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }
          return true;
        }

      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "versioning for alias required: "
                           "can't determine dependence between ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                             DR_REF (dra));
          dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                             DR_REF (drb));
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }

      /* Add to list of ddrs that need to be tested at run-time.  */
      return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
    }

  /* Known data dependence.  */
  if (DDR_NUM_DIST_VECTS (ddr) == 0)
    {
      /* If user asserted safelen consecutive iterations can be
         executed concurrently, assume independence.  */
      if (loop->safelen >= 2)
        {
          if (loop->safelen < *max_vf)
            *max_vf = loop->safelen;
          LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
          return false;
        }

      if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
          || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "versioning for alias not supported for: "
                               "bad dist vector for ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                 DR_REF (dra));
              dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                 DR_REF (drb));
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }
          return true;
        }

      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "versioning for alias required: "
                           "bad dist vector for ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
          dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }
      /* Add to list of ddrs that need to be tested at run-time.  */
      return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
    }

  loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));

  if (DDR_COULD_BE_INDEPENDENT_P (ddr)
      && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
                                                loop_depth, max_vf))
    return false;

  FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
    {
      int dist = dist_v[loop_depth];

      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "dependence distance = %d.\n", dist);

      if (dist == 0)
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_NOTE, vect_location,
                               "dependence distance == 0 between ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
              dump_printf (MSG_NOTE, " and ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }

          /* When we perform grouped accesses and perform implicit CSE
             by detecting equal accesses and doing disambiguation with
             runtime alias tests like for
                .. = a[i];
                .. = a[i+1];
                a[i] = ..;
                a[i+1] = ..;
                *p = ..;
                .. = a[i];
                .. = a[i+1];
             where we will end up loading { a[i], a[i+1] } once, make
             sure that inserting group loads before the first load and
             stores after the last store will do the right thing.
             Similar for groups like
                a[i] = ...;
                ... = a[i];
                a[i+1] = ...;
             where loads from the group interleave with the store.  */
          if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
              || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
            {
              gimple *earlier_stmt;
              earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
              if (DR_IS_WRITE
                    (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "READ_WRITE dependence in interleaving."
                                     "\n");
                  return true;
                }
            }

          continue;
        }

      if (dist > 0 && DDR_REVERSED_P (ddr))
        {
          /* If DDR_REVERSED_P the order of the data-refs in DDR was
             reversed (to make distance vector positive), and the actual
             distance is negative.  */
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "dependence distance negative.\n");
          /* Record a negative dependence distance to later limit the
             amount of stmt copying / unrolling we can perform.
             Only need to handle read-after-write dependence.  */
          if (DR_IS_READ (drb)
              && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
                  || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
            STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
          continue;
        }

      if (abs (dist) >= 2
          && abs (dist) < *max_vf)
        {
          /* The dependence distance requires reduction of the maximal
             vectorization factor.  */
          *max_vf = abs (dist);
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "adjusting maximal vectorization factor to %i\n",
                             *max_vf);
        }

      if (abs (dist) >= *max_vf)
        {
          /* Dependence distance does not create dependence, as far as
             vectorization is concerned, in this case.  */
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "dependence distance >= VF.\n");
          continue;
        }

      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "not vectorized, possible dependence "
                           "between data-refs ");
          dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
          dump_printf (MSG_NOTE, " and ");
          dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
          dump_printf (MSG_NOTE, "\n");
        }

      return true;
    }

  return false;
}
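
/* Worked example for the distance handling above (illustrative): in

     for (i = 0; i < n; i++)
       a[i + 2] = a[i] + 1;

   the dependence distance is 2, so *MAX_VF is lowered to 2; a distance of
   0 poses no problem for vectorization, and an unknown dependence is
   deferred to a runtime alias check when versioning is possible.  */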
/* Function vect_analyze_data_ref_dependences.

   Examine all the data references in the loop, and make sure there do not
   exist any data dependences between them.  Set *MAX_VF according to
   the maximum vectorization factor the data dependences allow.  */

bool
vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
{
  unsigned int i;
  struct data_dependence_relation *ddr;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "=== vect_analyze_data_ref_dependences ===\n");

  LOOP_VINFO_DDRS (loop_vinfo)
    .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
             * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
  LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
  /* We need read-read dependences to compute STMT_VINFO_SAME_ALIGN_REFS.  */
  if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
                                &LOOP_VINFO_DDRS (loop_vinfo),
                                LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
    return false;

  /* For epilogues we either have no aliases or alias versioning
     was applied to original loop.  Therefore we may just get max_vf
     using VF of original loop.  */
  if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
    *max_vf = LOOP_VINFO_ORIG_VECT_FACTOR (loop_vinfo);
  else
    FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
      if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
        return false;

  return true;
}
/* Function vect_slp_analyze_data_ref_dependence.

   Return TRUE if there (might) exist a dependence between a memory-reference
   DRA and a memory-reference DRB.  When versioning for alias may check a
   dependence at run-time, return FALSE.  Adjust *MAX_VF according to
   the data dependence.  */

static bool
vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
{
  struct data_reference *dra = DDR_A (ddr);
  struct data_reference *drb = DDR_B (ddr);

  /* We need to check dependences of statements marked as unvectorizable
     as well, they still can prohibit vectorization.  */

  /* Independent data accesses.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
    return false;

  if (dra == drb)
    return false;

  /* Read-read is OK.  */
  if (DR_IS_READ (dra) && DR_IS_READ (drb))
    return false;

  /* If dra and drb are part of the same interleaving chain consider
     them independent.  */
  if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
      && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
          == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
    return false;

  /* Unknown data dependence.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
    {
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "can't determine dependence between ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
          dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }
    }
  else if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location,
                       "determined dependence between ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
      dump_printf (MSG_NOTE, " and ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
      dump_printf (MSG_NOTE, "\n");
    }

  return true;
}
/* Analyze dependences involved in the transform of SLP NODE.  STORES
   contain the vector of scalar stores of this instance if we are
   disambiguating the loads.  */

static bool
vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
                                   vec<gimple *> stores, gimple *last_store)
{
  /* This walks over all stmts involved in the SLP load/store done
     in NODE verifying we can sink them up to the last stmt in the
     group.  */
  gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
  for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
    {
      gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
      if (access == last_access)
        continue;
      data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
      for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
           gsi_stmt (gsi) != last_access; gsi_next (&gsi))
        {
          gimple *stmt = gsi_stmt (gsi);
          if (! gimple_vuse (stmt)
              || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
            continue;

          /* If we couldn't record a (single) data reference for this
             stmt we have to give up.  */
          /* ??? Here and below if dependence analysis fails we can resort
             to the alias oracle which can handle more kinds of stmts.  */
          data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
          if (!dr_b)
            return false;

          bool dependent = false;
          /* If we run into a store of this same instance (we've just
             marked those) then delay dependence checking until we run
             into the last store because this is where it will have
             been sunk to (and we verify if we can do that as well).  */
          if (gimple_visited_p (stmt))
            {
              if (stmt != last_store)
                continue;
              unsigned i;
              gimple *store;
              FOR_EACH_VEC_ELT (stores, i, store)
                {
                  data_reference *store_dr
                    = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
                  ddr_p ddr = initialize_data_dependence_relation
                                (dr_a, store_dr, vNULL);
                  dependent = vect_slp_analyze_data_ref_dependence (ddr);
                  free_dependence_relation (ddr);
                  if (dependent)
                    break;
                }
            }
          else
            {
              ddr_p ddr = initialize_data_dependence_relation (dr_a,
                                                               dr_b, vNULL);
              dependent = vect_slp_analyze_data_ref_dependence (ddr);
              free_dependence_relation (ddr);
            }
          if (dependent)
            return false;
        }
    }
  return true;
}
/* Function vect_analyze_data_ref_dependences.

   Examine all the data references in the basic-block, and make sure there
   do not exist any data dependences between them.  Set *MAX_VF according to
   the maximum vectorization factor the data dependences allow.  */

bool
vect_slp_analyze_instance_dependence (slp_instance instance)
{
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "=== vect_slp_analyze_instance_dependence ===\n");

  /* The stores of this instance are at the root of the SLP tree.  */
  slp_tree store = SLP_INSTANCE_TREE (instance);
  if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
    store = NULL;

  /* Verify we can sink stores to the vectorized stmt insert location.  */
  gimple *last_store = NULL;
  if (store)
    {
      if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
        return false;

      /* Mark stores in this instance and remember the last one.  */
      last_store = vect_find_last_scalar_stmt_in_slp (store);
      for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
        gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
    }

  bool res = true;

  /* Verify we can sink loads to the vectorized stmt insert location,
     special-casing stores of this instance.  */
  slp_tree load;
  unsigned int i;
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
    if (! vect_slp_analyze_node_dependences (instance, load,
                                             store
                                             ? SLP_TREE_SCALAR_STMTS (store)
                                             : vNULL, last_store))
      {
        res = false;
        break;
      }

  /* Unset the visited flag.  */
  if (store)
    for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
      gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);

  return res;
}
/* Record in VINFO the base alignment guarantee given by DRB.  STMT is
   the statement that contains DRB, which is useful for recording in the
   dump file.  */

static void
vect_record_base_alignment (vec_info *vinfo, gimple *stmt,
                            innermost_loop_behavior *drb)
{
  bool existed;
  innermost_loop_behavior *&entry
    = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
  if (!existed || entry->base_alignment < drb->base_alignment)
    {
      entry = drb;
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location,
                           "recording new base alignment for ");
          dump_generic_expr (MSG_NOTE, TDF_SLIM, drb->base_address);
          dump_printf (MSG_NOTE, "\n");
          dump_printf_loc (MSG_NOTE, vect_location,
                           "  alignment:    %d\n", drb->base_alignment);
          dump_printf_loc (MSG_NOTE, vect_location,
                           "  misalignment: %d\n", drb->base_misalignment);
          dump_printf_loc (MSG_NOTE, vect_location,
                           "  based on:     ");
          dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
        }
    }
}
/* If the region we're going to vectorize is reached, all unconditional
   data references occur at least once.  We can therefore pool the base
   alignment guarantees from each unconditional reference.  Do this by
   going through all the data references in VINFO and checking whether
   the containing statement makes the reference unconditionally.  If so,
   record the alignment of the base address in VINFO so that it can be
   used for all other references with the same base.  */

void
vect_record_base_alignments (vec_info *vinfo)
{
  loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
  struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
  data_reference *dr;
  unsigned int i;
  FOR_EACH_VEC_ELT (vinfo->datarefs, i, dr)
    if (!DR_IS_CONDITIONAL_IN_STMT (dr))
      {
        gimple *stmt = DR_STMT (dr);
        vect_record_base_alignment (vinfo, stmt, &DR_INNERMOST (dr));

        /* If DR is nested in the loop that is being vectorized, we can also
           record the alignment of the base wrt the outer loop.  */
        if (loop && nested_in_vect_loop_p (loop, stmt))
          {
            stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
            vect_record_base_alignment
              (vinfo, stmt, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
          }
      }
}
/* Function vect_compute_data_ref_alignment

   Compute the misalignment of the data reference DR.

   Output:
   1. If during the misalignment computation it is found that the data reference
      cannot be vectorized then false is returned.
   2. DR_MISALIGNMENT (DR) is defined.

   FOR NOW: No analysis is actually performed. Misalignment is calculated
   only for trivial cases. TODO.  */

bool
vect_compute_data_ref_alignment (struct data_reference *dr)
{
  gimple *stmt = DR_STMT (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = NULL;
  tree ref = DR_REF (dr);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "vect_compute_data_ref_alignment:\n");

  if (loop_vinfo)
    loop = LOOP_VINFO_LOOP (loop_vinfo);

  /* Initialize misalignment to unknown.  */
  SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);

  innermost_loop_behavior *drb = vect_dr_behavior (dr);
  bool step_preserves_misalignment_p;

  /* No step for BB vectorization.  */
  if (!loop_vinfo)
    {
      gcc_assert (integer_zerop (drb->step));
      step_preserves_misalignment_p = true;
    }

  /* In case the dataref is in an inner-loop of the loop that is being
     vectorized (LOOP), we use the base and misalignment information
     relative to the outer-loop (LOOP).  This is ok only if the misalignment
     stays the same throughout the execution of the inner-loop, which is why
     we have to check that the stride of the dataref in the inner-loop evenly
     divides by the vector size.  */
  else if (nested_in_vect_loop_p (loop, stmt))
    {
      step_preserves_misalignment_p
        = (DR_STEP_ALIGNMENT (dr)
           % GET_MODE_SIZE (TYPE_MODE (vectype))) == 0;

      if (dump_enabled_p ())
        {
          if (step_preserves_misalignment_p)
            dump_printf_loc (MSG_NOTE, vect_location,
                             "inner step divides the vector-size.\n");
          else
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "inner step doesn't divide the vector-size.\n");
        }
    }

  /* Similarly we can only use base and misalignment information relative to
     an innermost loop if the misalignment stays the same throughout the
     execution of the loop.  As above, this is the case if the stride of
     the dataref evenly divides by the vector size.  */
  else
    {
      unsigned vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
      step_preserves_misalignment_p
        = ((DR_STEP_ALIGNMENT (dr) * vf)
           % GET_MODE_SIZE (TYPE_MODE (vectype))) == 0;

      if (!step_preserves_misalignment_p && dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "step doesn't divide the vector-size.\n");
    }

  unsigned int base_alignment = drb->base_alignment;
  unsigned int base_misalignment = drb->base_misalignment;
  unsigned HOST_WIDE_INT vector_alignment = TYPE_ALIGN_UNIT (vectype);

  /* Calculate the maximum of the pooled base address alignment and the
     alignment that we can compute for DR itself.  */
  innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
  if (entry && base_alignment < (*entry)->base_alignment)
    {
      base_alignment = (*entry)->base_alignment;
      base_misalignment = (*entry)->base_misalignment;
    }

  if (drb->offset_alignment < vector_alignment
      || !step_preserves_misalignment_p
      /* We need to know whether the step wrt the vectorized loop is
         negative when computing the starting misalignment below.  */
      || TREE_CODE (drb->step) != INTEGER_CST)
    {
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "Unknown alignment for access: ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }
      return true;
    }

  if (base_alignment < vector_alignment)
    {
      tree base = drb->base_address;
      if (TREE_CODE (base) == ADDR_EXPR)
        base = TREE_OPERAND (base, 0);
      if (!vect_can_force_dr_alignment_p (base,
                                          vector_alignment * BITS_PER_UNIT))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_NOTE, vect_location,
                               "can't force alignment of ref: ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
              dump_printf (MSG_NOTE, "\n");
            }
          return true;
        }

      if (DECL_USER_ALIGN (base))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_NOTE, vect_location,
                               "not forcing alignment of user-aligned "
                               "variable: ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, base);
              dump_printf (MSG_NOTE, "\n");
            }
          return true;
        }

      /* Force the alignment of the decl.
         NOTE: This is the only change to the code we make during
         the analysis phase, before deciding to vectorize the loop.  */
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
          dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
          dump_printf (MSG_NOTE, "\n");
        }

      DR_VECT_AUX (dr)->base_decl = base;
      DR_VECT_AUX (dr)->base_misaligned = true;
      base_misalignment = 0;
    }
  unsigned int misalignment = (base_misalignment
                               + TREE_INT_CST_LOW (drb->init));

  /* If this is a backward running DR then first access in the larger
     vectype actually is N-1 elements before the address in the DR.
     Adjust misalign accordingly.  */
  if (tree_int_cst_sgn (drb->step) < 0)
    /* PLUS because STEP is negative.  */
    misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
                     * TREE_INT_CST_LOW (drb->step));

  SET_DR_MISALIGNMENT (dr, misalignment & (vector_alignment - 1));

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                       "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
    }

  return true;
}
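
/* Worked example of the final computation above (illustrative numbers):
   with base_misalignment = 4, DRB->init = 12 and a 16-byte vector
   alignment, the recorded misalignment is (4 + 12) & 15 = 0, i.e. the
   first access is vector aligned.  For a backward running DR the
   (TYPE_VECTOR_SUBPARTS - 1) * step term first shifts the address to the
   lowest element the vector access touches before the mask is applied.  */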
/* Function vect_update_misalignment_for_peel.
   Sets DR's misalignment
   - to 0 if it has the same alignment as DR_PEEL,
   - to the misalignment computed using NPEEL if DR's salignment is known,
   - to -1 (unknown) otherwise.

   DR - the data reference whose misalignment is to be adjusted.
   DR_PEEL - the data reference whose misalignment is being made
             zero in the vector loop by the peel.
   NPEEL - the number of iterations in the peel loop if the misalignment
           of DR_PEEL is known at compile time.  */

static void
vect_update_misalignment_for_peel (struct data_reference *dr,
                                   struct data_reference *dr_peel, int npeel)
{
  unsigned int i;
  vec<dr_p> same_aligned_drs;
  struct data_reference *current_dr;
  int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
  int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
  stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
  stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));

  /* For interleaved data accesses the step in the loop must be multiplied by
     the size of the interleaving group.  */
  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
  if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
    dr_peel_size *= GROUP_SIZE (peel_stmt_info);

  /* It can be assumed that the data refs with the same alignment as dr_peel
     are aligned in the vector loop.  */
  same_aligned_drs
    = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
  FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
    {
      if (current_dr != dr)
        continue;
      gcc_assert (!known_alignment_for_access_p (dr)
                  || !known_alignment_for_access_p (dr_peel)
                  || (DR_MISALIGNMENT (dr) / dr_size
                      == DR_MISALIGNMENT (dr_peel) / dr_peel_size));
      SET_DR_MISALIGNMENT (dr, 0);
      return;
    }

  if (known_alignment_for_access_p (dr)
      && known_alignment_for_access_p (dr_peel))
    {
      bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
      int misal = DR_MISALIGNMENT (dr);
      tree vectype = STMT_VINFO_VECTYPE (stmt_info);
      misal += negative ? -npeel * dr_size : npeel * dr_size;
      misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
      SET_DR_MISALIGNMENT (dr, misal);
      return;
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
                     "to unknown (-1).\n");
  SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
}
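
/* Worked example (illustrative numbers): for a DR with known misalignment
   of 4 bytes, a 4-byte element, NPEEL = 3 and a 16-byte vector type, the
   update above computes 4 + 3 * 4 = 16 and masks it with 15, giving a new
   misalignment of 0; with a negative step the peeled bytes are subtracted
   instead.  */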
/* Function verify_data_ref_alignment

   Return TRUE if DR can be handled with respect to alignment.  */

static bool
verify_data_ref_alignment (data_reference_p dr)
{
  enum dr_alignment_support supportable_dr_alignment
    = vect_supportable_dr_alignment (dr, false);
  if (!supportable_dr_alignment)
    {
      if (dump_enabled_p ())
        {
          if (DR_IS_READ (dr))
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "not vectorized: unsupported unaligned load.");
          else
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "not vectorized: unsupported unaligned "
                             "store.");

          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                             DR_REF (dr));
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }
      return false;
    }

  if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "Vectorizing an unaligned access.\n");

  return true;
}
/* Function vect_verify_datarefs_alignment

   Return TRUE if all data references in the loop can be
   handled with respect to alignment.  */

bool
vect_verify_datarefs_alignment (loop_vec_info vinfo)
{
  vec<data_reference_p> datarefs = vinfo->datarefs;
  struct data_reference *dr;
  unsigned int i;

  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      gimple *stmt = DR_STMT (dr);
      stmt_vec_info stmt_info = vinfo_for_stmt (stmt);

      if (!STMT_VINFO_RELEVANT_P (stmt_info))
        continue;

      /* For interleaving, only the alignment of the first access matters.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
          && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
        continue;

      /* Strided accesses perform only component accesses, alignment is
         irrelevant for them.  */
      if (STMT_VINFO_STRIDED_P (stmt_info)
          && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
        continue;

      if (! verify_data_ref_alignment (dr))
        return false;
    }

  return true;
}
/* Given a memory reference EXP return whether its alignment is less
   than its size.  */

static bool
not_size_aligned (tree exp)
{
  if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
    return true;

  return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
          > get_object_alignment (exp));
}
/* Function vector_alignment_reachable_p

   Return true if vector alignment for DR is reachable by peeling
   a few loop iterations.  Return false otherwise.  */

static bool
vector_alignment_reachable_p (struct data_reference *dr)
{
  gimple *stmt = DR_STMT (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);

  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    {
      /* For interleaved access we peel only if number of iterations in
         the prolog loop ({VF - misalignment}), is a multiple of the
         number of the interleaved accesses.  */
      int elem_size, mis_in_elements;
      int nelements = TYPE_VECTOR_SUBPARTS (vectype);

      /* FORNOW: handle only known alignment.  */
      if (!known_alignment_for_access_p (dr))
        return false;

      elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
      mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;

      if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
        return false;
    }

  /* If misalignment is known at the compile time then allow peeling
     only if natural alignment is reachable through peeling.  */
  if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
    {
      HOST_WIDE_INT elmsize =
                int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location,
                           "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
          dump_printf (MSG_NOTE,
                       ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
        }
      if (DR_MISALIGNMENT (dr) % elmsize)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "data size does not divide the misalignment.\n");
          return false;
        }
    }

  if (!known_alignment_for_access_p (dr))
    {
      tree type = TREE_TYPE (DR_REF (dr));
      bool is_packed = not_size_aligned (DR_REF (dr));
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "Unknown misalignment, %snaturally aligned\n",
                         is_packed ? "not " : "");
      return targetm.vectorize.vector_alignment_reachable (type, is_packed);
    }

  return true;
}
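
/* Example of the grouped-access restriction above (illustrative numbers):
   with 4 elements per vector, a misalignment of 1 element and a group
   size of 3, the prolog loop would peel 4 - 1 = 3 iterations, a multiple
   of the group size, so peeling remains a candidate; with a group size
   of 2 the same 3-iteration prolog would split a group and the function
   returns false.  */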
/* Calculate the cost of the memory access represented by DR.  */

static void
vect_get_data_access_cost (struct data_reference *dr,
                           unsigned int *inside_cost,
                           unsigned int *outside_cost,
                           stmt_vector_for_cost *body_cost_vec)
{
  gimple *stmt = DR_STMT (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  int ncopies = MAX (1, vf / nunits); /* TODO: Handle SLP properly  */

  if (DR_IS_READ (dr))
    vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
                        NULL, body_cost_vec, false);
  else
    vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "vect_get_data_access_cost: inside_cost = %d, "
                     "outside_cost = %d.\n", *inside_cost, *outside_cost);
}
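
/* Illustrative numbers for the NCOPIES computation above: with a
   vectorization factor of 8 and 4 vector elements, each scalar access
   expands to MAX (1, 8 / 4) = 2 vector accesses, and the load or store
   cost is accumulated for both copies.  */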
typedef struct _vect_peel_info
{
  struct data_reference *dr;
  int npeel;
  unsigned int count;
} *vect_peel_info;

typedef struct _vect_peel_extended_info
{
  struct _vect_peel_info peel_info;
  unsigned int inside_cost;
  unsigned int outside_cost;
} *vect_peel_extended_info;


/* Peeling hashtable helpers.  */

struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
{
  static inline hashval_t hash (const _vect_peel_info *);
  static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
};

inline hashval_t
peel_info_hasher::hash (const _vect_peel_info *peel_info)
{
  return (hashval_t) peel_info->npeel;
}

inline bool
peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
{
  return (a->npeel == b->npeel);
}
/* Insert DR into peeling hash table with NPEEL as key.  */

static void
vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
                          loop_vec_info loop_vinfo, struct data_reference *dr,
                          int npeel)
{
  struct _vect_peel_info elem, *slot;
  _vect_peel_info **new_slot;
  bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);

  elem.npeel = npeel;
  slot = peeling_htab->find (&elem);
  if (slot)
    slot->count++;
  else
    {
      slot = XNEW (struct _vect_peel_info);
      slot->npeel = npeel;
      slot->dr = dr;
      slot->count = 1;
      new_slot = peeling_htab->find_slot (slot, INSERT);
      *new_slot = slot;
    }

  if (!supportable_dr_alignment
      && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
    slot->count += VECT_MAX_COST;
}
/* Traverse peeling hash table to find peeling option that aligns maximum
   number of data accesses.  */

int
vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
                                     _vect_peel_extended_info *max)
{
  vect_peel_info elem = *slot;

  if (elem->count > max->peel_info.count
      || (elem->count == max->peel_info.count
          && max->peel_info.npeel > elem->npeel))
    {
      max->peel_info.npeel = elem->npeel;
      max->peel_info.count = elem->count;
      max->peel_info.dr = elem->dr;
    }

  return 1;
}
/* Get the costs of peeling NPEEL iterations checking data access costs
   for all data refs.  If UNKNOWN_MISALIGNMENT is true, we assume DR0's
   misalignment will be zero after peeling.  */

static void
vect_get_peeling_costs_all_drs (vec<data_reference_p> datarefs,
                                struct data_reference *dr0,
                                unsigned int *inside_cost,
                                unsigned int *outside_cost,
                                stmt_vector_for_cost *body_cost_vec,
                                unsigned int npeel,
                                bool unknown_misalignment)
{
  unsigned i;
  data_reference *dr;

  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      gimple *stmt = DR_STMT (dr);
      stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
      /* For interleaving, only the alignment of the first access
         matters.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
          && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
        continue;

      /* Strided accesses perform only component accesses, alignment is
         irrelevant for them.  */
      if (STMT_VINFO_STRIDED_P (stmt_info)
          && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
        continue;

      int save_misalignment;
      save_misalignment = DR_MISALIGNMENT (dr);
      if (npeel == 0)
        ;
      else if (unknown_misalignment && dr == dr0)
        SET_DR_MISALIGNMENT (dr, 0);
      else
        vect_update_misalignment_for_peel (dr, dr0, npeel);
      vect_get_data_access_cost (dr, inside_cost, outside_cost,
                                 body_cost_vec);
      SET_DR_MISALIGNMENT (dr, save_misalignment);
    }
}
/* Traverse peeling hash table and calculate cost for each peeling option.
   Find the one with the lowest cost.  */

int
vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
                                   _vect_peel_extended_info *min)
{
  vect_peel_info elem = *slot;
  int dummy;
  unsigned int inside_cost = 0, outside_cost = 0;
  gimple *stmt = DR_STMT (elem->dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
                       epilogue_cost_vec;

  prologue_cost_vec.create (2);
  body_cost_vec.create (2);
  epilogue_cost_vec.create (2);

  vect_get_peeling_costs_all_drs (LOOP_VINFO_DATAREFS (loop_vinfo),
                                  elem->dr, &inside_cost, &outside_cost,
                                  &body_cost_vec, elem->npeel, false);

  body_cost_vec.release ();

  outside_cost += vect_get_known_peeling_cost
    (loop_vinfo, elem->npeel, &dummy,
     &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
     &prologue_cost_vec, &epilogue_cost_vec);

  /* Prologue and epilogue costs are added to the target model later.
     These costs depend only on the scalar iteration cost, the
     number of peeling iterations finally chosen, and the number of
     misaligned statements.  So discard the information found here.  */
  prologue_cost_vec.release ();
  epilogue_cost_vec.release ();

  if (inside_cost < min->inside_cost
      || (inside_cost == min->inside_cost
          && outside_cost < min->outside_cost))
    {
      min->inside_cost = inside_cost;
      min->outside_cost = outside_cost;
      min->peel_info.dr = elem->dr;
      min->peel_info.npeel = elem->npeel;
      min->peel_info.count = elem->count;
    }

  return 1;
}
/* Choose best peeling option by traversing peeling hash table and either
   choosing an option with the lowest cost (if cost model is enabled) or the
   option that aligns as many accesses as possible.  */

static struct _vect_peel_extended_info
vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
                                       loop_vec_info loop_vinfo)
{
  struct _vect_peel_extended_info res;

  res.peel_info.dr = NULL;

  if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
    {
      res.inside_cost = INT_MAX;
      res.outside_cost = INT_MAX;
      peeling_htab->traverse <_vect_peel_extended_info *,
                              vect_peeling_hash_get_lowest_cost> (&res);
    }
  else
    {
      res.peel_info.count = 0;
      peeling_htab->traverse <_vect_peel_extended_info *,
                              vect_peeling_hash_get_most_frequent> (&res);
      res.inside_cost = 0;
      res.outside_cost = 0;
    }

  return res;
}
/* Return true if the new peeling NPEEL is supported.  */

static bool
vect_peeling_supportable (loop_vec_info loop_vinfo, struct data_reference *dr0,
                          unsigned npeel)
{
  unsigned i;
  struct data_reference *dr = NULL;
  vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
  gimple *stmt;
  stmt_vec_info stmt_info;
  enum dr_alignment_support supportable_dr_alignment;

  /* Ensure that all data refs can be vectorized after the peel.  */
  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      int save_misalignment;

      if (dr == dr0)
        continue;

      stmt = DR_STMT (dr);
      stmt_info = vinfo_for_stmt (stmt);
      /* For interleaving, only the alignment of the first access
         matters.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
          && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
        continue;

      /* Strided accesses perform only component accesses, alignment is
         irrelevant for them.  */
      if (STMT_VINFO_STRIDED_P (stmt_info)
          && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
        continue;

      save_misalignment = DR_MISALIGNMENT (dr);
      vect_update_misalignment_for_peel (dr, dr0, npeel);
      supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
      SET_DR_MISALIGNMENT (dr, save_misalignment);

      if (!supportable_dr_alignment)
        return false;
    }

  return true;
}
1467 /* Function vect_enhance_data_refs_alignment
1469 This pass will use loop versioning and loop peeling in order to enhance
1470 the alignment of data references in the loop.
1472 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1473 original loop is to be vectorized. Any other loops that are created by
1474 the transformations performed in this pass - are not supposed to be
1475 vectorized. This restriction will be relaxed.
1477 This pass will require a cost model to guide it whether to apply peeling
1478 or versioning or a combination of the two. For example, the scheme that
1479 intel uses when given a loop with several memory accesses, is as follows:
1480 choose one memory access ('p') which alignment you want to force by doing
1481 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1482 other accesses are not necessarily aligned, or (2) use loop versioning to
1483 generate one loop in which all accesses are aligned, and another loop in
1484 which only 'p' is necessarily aligned.
1486 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1487 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
1488 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1490 Devising a cost model is the most critical aspect of this work. It will
1491 guide us on which access to peel for, whether to use loop versioning, how
1492 many versions to create, etc. The cost model will probably consist of
1493 generic considerations as well as target specific considerations (on
1494 powerpc for example, misaligned stores are more painful than misaligned
1497 Here are the general steps involved in alignment enhancements:
1499 -- original loop, before alignment analysis:
1500 for (i=0; i<N; i++){
1501 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1502 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1505 -- After vect_compute_data_refs_alignment:
1506 for (i=0; i<N; i++){
1507 x = q[i]; # DR_MISALIGNMENT(q) = 3
1508 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1511 -- Possibility 1: we do loop versioning:
1513 for (i=0; i<N; i++){ # loop 1A
1514 x = q[i]; # DR_MISALIGNMENT(q) = 3
1515 p[i] = y; # DR_MISALIGNMENT(p) = 0
1519 for (i=0; i<N; i++){ # loop 1B
1520 x = q[i]; # DR_MISALIGNMENT(q) = 3
1521 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1525 -- Possibility 2: we do loop peeling:
1526 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1530 for (i = 3; i < N; i++){ # loop 2A
1531 x = q[i]; # DR_MISALIGNMENT(q) = 0
1532 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1535 -- Possibility 3: combination of loop peeling and versioning:
1536 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1541 for (i = 3; i<N; i++){ # loop 3A
1542 x = q[i]; # DR_MISALIGNMENT(q) = 0
1543 p[i] = y; # DR_MISALIGNMENT(p) = 0
1547 for (i = 3; i<N; i++){ # loop 3B
1548 x = q[i]; # DR_MISALIGNMENT(q) = 0
1549 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1553 These loops are later passed to loop_transform to be vectorized. The
1554 vectorizer will use the alignment information to guide the transformation
1555 (whether to generate regular loads/stores, or with special handling for
1559 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo
)
1561 vec
<data_reference_p
> datarefs
= LOOP_VINFO_DATAREFS (loop_vinfo
);
1562 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1563 enum dr_alignment_support supportable_dr_alignment
;
1564 struct data_reference
*dr0
= NULL
, *first_store
= NULL
;
1565 struct data_reference
*dr
;
1567 bool do_peeling
= false;
1568 bool do_versioning
= false;
1571 stmt_vec_info stmt_info
;
1572 unsigned int npeel
= 0;
1573 bool one_misalignment_known
= false;
1574 bool one_misalignment_unknown
= false;
1575 bool one_dr_unsupportable
= false;
1576 struct data_reference
*unsupportable_dr
= NULL
;
1577 unsigned int vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
1578 unsigned possible_npeel_number
= 1;
1580 unsigned int nelements
, mis
, same_align_drs_max
= 0;
1581 hash_table
<peel_info_hasher
> peeling_htab (1);
1583 if (dump_enabled_p ())
1584 dump_printf_loc (MSG_NOTE
, vect_location
,
1585 "=== vect_enhance_data_refs_alignment ===\n");
1587 /* Reset data so we can safely be called multiple times. */
1588 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).truncate (0);
1589 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
) = 0;
1591 /* While cost model enhancements are expected in the future, the high level
1592 view of the code at this time is as follows:
1594 A) If there is a misaligned access then see if peeling to align
1595 this access can make all data references satisfy
1596 vect_supportable_dr_alignment. If so, update data structures
1597 as needed and return true.
1599 B) If peeling wasn't possible and there is a data reference with an
1600 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1601 then see if loop versioning checks can be used to make all data
1602 references satisfy vect_supportable_dr_alignment. If so, update
1603 data structures as needed and return true.
1605 C) If neither peeling nor versioning were successful then return false if
1606 any data reference does not satisfy vect_supportable_dr_alignment.
1608 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1610 Note, Possibility 3 above (which is peeling and versioning together) is not
1611 being done at this time. */
1613 /* (1) Peeling to force alignment. */
1615 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1617 + How many accesses will become aligned due to the peeling
1618 - How many accesses will become unaligned due to the peeling,
1619 and the cost of misaligned accesses.
1620 - The cost of peeling (the extra runtime checks, the increase
1623 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
1625 stmt
= DR_STMT (dr
);
1626 stmt_info
= vinfo_for_stmt (stmt
);
1628 if (!STMT_VINFO_RELEVANT_P (stmt_info
))
1631 /* For interleaving, only the alignment of the first access
1633 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1634 && GROUP_FIRST_ELEMENT (stmt_info
) != stmt
)
1637 /* For invariant accesses there is nothing to enhance. */
1638 if (integer_zerop (DR_STEP (dr
)))
1641 /* Strided accesses perform only component accesses, alignment is
1642 irrelevant for them. */
1643 if (STMT_VINFO_STRIDED_P (stmt_info
)
1644 && !STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1647 supportable_dr_alignment
= vect_supportable_dr_alignment (dr
, true);
1648 do_peeling
= vector_alignment_reachable_p (dr
);
1651 if (known_alignment_for_access_p (dr
))
1653 unsigned int npeel_tmp
= 0;
1654 bool negative
= tree_int_cst_compare (DR_STEP (dr
),
1655 size_zero_node
) < 0;
1657 vectype
= STMT_VINFO_VECTYPE (stmt_info
);
1658 nelements
= TYPE_VECTOR_SUBPARTS (vectype
);
1659 mis
= DR_MISALIGNMENT (dr
) / GET_MODE_SIZE (TYPE_MODE (
1660 TREE_TYPE (DR_REF (dr
))));
1661 if (DR_MISALIGNMENT (dr
) != 0)
1662 npeel_tmp
= (negative
? (mis
- nelements
)
1663 : (nelements
- mis
)) & (nelements
- 1);
1665 /* For multiple types, it is possible that the bigger type access
1666 will have more than one peeling option. E.g., a loop with two
1667 types: one of size (vector size / 4), and the other one of
1668 size (vector size / 8). Vectorization factor will 8. If both
1669 accesses are misaligned by 3, the first one needs one scalar
1670 iteration to be aligned, and the second one needs 5. But the
1671 first one will be aligned also by peeling 5 scalar
1672 iterations, and in that case both accesses will be aligned.
1673 Hence, except for the immediate peeling amount, we also want
1674 to try to add full vector size, while we don't exceed
1675 vectorization factor.
1676 We do this automatically for cost model, since we calculate
1677 cost for every peeling option. */
1678 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo
)))
1680 if (STMT_SLP_TYPE (stmt_info
))
1681 possible_npeel_number
1682 = (vf
* GROUP_SIZE (stmt_info
)) / nelements
;
1684 possible_npeel_number
= vf
/ nelements
;
1686 /* NPEEL_TMP is 0 when there is no misalignment, but also
1687 allow peeling NELEMENTS. */
1688 if (DR_MISALIGNMENT (dr
) == 0)
1689 possible_npeel_number
++;
1692 /* Save info about DR in the hash table. Also include peeling
1693 amounts according to the explanation above. */
1694 for (j
= 0; j
< possible_npeel_number
; j
++)
1696 vect_peeling_hash_insert (&peeling_htab
, loop_vinfo
,
1698 npeel_tmp
+= nelements
;
1701 one_misalignment_known
= true;
1705 /* If we don't know any misalignment values, we prefer
1706 peeling for data-ref that has the maximum number of data-refs
1707 with the same alignment, unless the target prefers to align
1708 stores over load. */
1709 unsigned same_align_drs
1710 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info
).length ();
1712 || same_align_drs_max
< same_align_drs
)
1714 same_align_drs_max
= same_align_drs
;
1717 /* For data-refs with the same number of related
1718 accesses prefer the one where the misalign
1719 computation will be invariant in the outermost loop. */
1720 else if (same_align_drs_max
== same_align_drs
)
1722 struct loop
*ivloop0
, *ivloop
;
1723 ivloop0
= outermost_invariant_loop_for_expr
1724 (loop
, DR_BASE_ADDRESS (dr0
));
1725 ivloop
= outermost_invariant_loop_for_expr
1726 (loop
, DR_BASE_ADDRESS (dr
));
1727 if ((ivloop
&& !ivloop0
)
1728 || (ivloop
&& ivloop0
1729 && flow_loop_nested_p (ivloop
, ivloop0
)))
1733 one_misalignment_unknown
= true;
1735 /* Check for data refs with unsupportable alignment that
1737 if (!supportable_dr_alignment
)
1739 one_dr_unsupportable
= true;
1740 unsupportable_dr
= dr
;
1743 if (!first_store
&& DR_IS_WRITE (dr
))
1749 if (!aligned_access_p (dr
))
1751 if (dump_enabled_p ())
1752 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1753 "vector alignment may not be reachable\n");
1759 /* Check if we can possibly peel the loop. */
1760 if (!vect_can_advance_ivs_p (loop_vinfo
)
1761 || !slpeel_can_duplicate_loop_p (loop
, single_exit (loop
))
1765 struct _vect_peel_extended_info peel_for_known_alignment
;
1766 struct _vect_peel_extended_info peel_for_unknown_alignment
;
1767 struct _vect_peel_extended_info best_peel
;
1769 peel_for_unknown_alignment
.inside_cost
= INT_MAX
;
1770 peel_for_unknown_alignment
.outside_cost
= INT_MAX
;
1771 peel_for_unknown_alignment
.peel_info
.count
= 0;
1774 && one_misalignment_unknown
)
1776 /* Check if the target requires to prefer stores over loads, i.e., if
1777 misaligned stores are more expensive than misaligned loads (taking
1778 drs with same alignment into account). */
1779 unsigned int load_inside_cost
= 0;
1780 unsigned int load_outside_cost
= 0;
1781 unsigned int store_inside_cost
= 0;
1782 unsigned int store_outside_cost
= 0;
1784 stmt_vector_for_cost dummy
;
1786 vect_get_peeling_costs_all_drs (datarefs
, dr0
,
1789 &dummy
, vf
/ 2, true);
1795 vect_get_peeling_costs_all_drs (datarefs
, first_store
,
1797 &store_outside_cost
,
1798 &dummy
, vf
/ 2, true);
1803 store_inside_cost
= INT_MAX
;
1804 store_outside_cost
= INT_MAX
;
1807 if (load_inside_cost
> store_inside_cost
1808 || (load_inside_cost
== store_inside_cost
1809 && load_outside_cost
> store_outside_cost
))
1812 peel_for_unknown_alignment
.inside_cost
= store_inside_cost
;
1813 peel_for_unknown_alignment
.outside_cost
= store_outside_cost
;
1817 peel_for_unknown_alignment
.inside_cost
= load_inside_cost
;
1818 peel_for_unknown_alignment
.outside_cost
= load_outside_cost
;
1821 stmt_vector_for_cost prologue_cost_vec
, epilogue_cost_vec
;
1822 prologue_cost_vec
.create (2);
1823 epilogue_cost_vec
.create (2);
1826 peel_for_unknown_alignment
.outside_cost
+= vect_get_known_peeling_cost
1827 (loop_vinfo
, vf
/ 2, &dummy2
,
1828 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo
),
1829 &prologue_cost_vec
, &epilogue_cost_vec
);
1831 prologue_cost_vec
.release ();
1832 epilogue_cost_vec
.release ();
1834 peel_for_unknown_alignment
.peel_info
.count
= 1
1835 + STMT_VINFO_SAME_ALIGN_REFS
1836 (vinfo_for_stmt (DR_STMT (dr0
))).length ();
1839 peel_for_unknown_alignment
.peel_info
.npeel
= 0;
1840 peel_for_unknown_alignment
.peel_info
.dr
= dr0
;
1842 best_peel
= peel_for_unknown_alignment
;
1844 peel_for_known_alignment
.inside_cost
= INT_MAX
;
1845 peel_for_known_alignment
.outside_cost
= INT_MAX
;
1846 peel_for_known_alignment
.peel_info
.count
= 0;
1847 peel_for_known_alignment
.peel_info
.dr
= NULL
;
1849 if (do_peeling
&& one_misalignment_known
)
1851 /* Peeling is possible, but there is no data access that is not supported
1852 unless aligned. So we try to choose the best possible peeling from
1854 peel_for_known_alignment
= vect_peeling_hash_choose_best_peeling
1855 (&peeling_htab
, loop_vinfo
);
1858 /* Compare costs of peeling for known and unknown alignment. */
1859 if (peel_for_known_alignment.peel_info.dr != NULL
1860 && peel_for_unknown_alignment.inside_cost
1861 >= peel_for_known_alignment.inside_cost)
1863 best_peel = peel_for_known_alignment;
1865 /* If the best peeling for known alignment has NPEEL == 0, perform no
1866 peeling at all except if there is an unsupportable dr that we can
1868 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1872 /* If there is an unsupportable data ref, prefer this over all choices so far
1873 since we'd have to discard a chosen peeling except when it accidentally
1874 aligned the unsupportable data ref. */
1875 if (one_dr_unsupportable)
1876 dr0 = unsupportable_dr;
1877 else if (do_peeling)
1879 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
1880 TODO: Use nopeel_outside_cost or get rid of it? */
1881 unsigned nopeel_inside_cost = 0;
1882 unsigned nopeel_outside_cost = 0;
1884 stmt_vector_for_cost dummy;
1886 vect_get_peeling_costs_all_drs (datarefs, NULL, &nopeel_inside_cost,
1887 &nopeel_outside_cost, &dummy, 0, false);
1890 /* Add epilogue costs. As we do not peel for alignment here, no prologue
1891 costs will be recorded. */
1892 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1893 prologue_cost_vec.create (2);
1894 epilogue_cost_vec.create (2);
1897 nopeel_outside_cost += vect_get_known_peeling_cost
1898 (loop_vinfo, 0, &dummy2,
1899 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1900 &prologue_cost_vec, &epilogue_cost_vec);
1902 prologue_cost_vec.release ();
1903 epilogue_cost_vec.release ();
1905 npeel = best_peel.peel_info.npeel;
1906 dr0 = best_peel.peel_info.dr;
1908 /* If not peeling is no more expensive than the best peeling we
1909 have found so far, don't perform any peeling. */
1910 if (nopeel_inside_cost <= best_peel.inside_cost)
1916 stmt = DR_STMT (dr0);
1917 stmt_info = vinfo_for_stmt (stmt);
1918 vectype = STMT_VINFO_VECTYPE (stmt_info);
1919 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1921 if (known_alignment_for_access_p (dr0))
1923 bool negative = tree_int_cst_compare (DR_STEP (dr0),
1924 size_zero_node) < 0;
1927 /* Since it's known at compile time, compute the number of
1928 iterations in the peeled loop (the peeling factor) for use in
1929 updating DR_MISALIGNMENT values. The peeling factor is the
1930 vectorization factor minus the misalignment as an element
1932 mis = DR_MISALIGNMENT (dr0);
1933 mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
1934 npeel = ((negative ? mis - nelements : nelements - mis)
1938 /* For interleaved data access every iteration accesses all the
1939 members of the group, therefore we divide the number of iterations
1940 by the group size. */
1941 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1942 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1943 npeel /= GROUP_SIZE (stmt_info);
1945 if (dump_enabled_p ())
1946 dump_printf_loc (MSG_NOTE, vect_location,
1947 "Try peeling by %d\n", npeel);
1950 /* Ensure that all datarefs can be vectorized after the peel. */
1951 if (!vect_peeling_supportable (loop_vinfo, dr0, npeel))
1954 /* Check if all datarefs are supportable and log. */
1955 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1957 stat = vect_verify_datarefs_alignment (loop_vinfo);
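/* Editorial sketch, not part of this file: with a known misalignment the
   peeling factor is computed in elements as (nelements - mis) for a
   positive step and (mis - nelements) for a negative one, reduced modulo
   the number of vector elements, and then divided by the group size for
   interleaved accesses.  The listing above truncates the full expression;
   the modulo reduction below is an assumption consistent with the
   surrounding comment.  All names here are hypothetical.  */

static unsigned int
example_peeling_factor (int misalign_bytes, int element_size,
			int nelements, int negative_step, int group_size)
{
  int mis = misalign_bytes / element_size;        /* misalignment in elements */
  int npeel = negative_step ? mis - nelements : nelements - mis;
  /* Reduce into [0, nelements); equivalent to a power-of-two mask when
     nelements is a power of two.  */
  npeel = ((npeel % nelements) + nelements) % nelements;
  if (group_size > 1)
    npeel /= group_size;     /* one iteration accesses the whole group */
  return (unsigned int) npeel;
}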
1964 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
1967 unsigned max_allowed_peel
1968 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
1969 if (max_allowed_peel != (unsigned)-1)
1971 unsigned max_peel = npeel;
1974 gimple *dr_stmt = DR_STMT (dr0);
1975 stmt_vec_info vinfo = vinfo_for_stmt (dr_stmt);
1976 tree vtype = STMT_VINFO_VECTYPE (vinfo);
1977 max_peel = TYPE_VECTOR_SUBPARTS (vtype) - 1;
1979 if (max_peel > max_allowed_peel)
1982 if (dump_enabled_p ())
1983 dump_printf_loc (MSG_NOTE, vect_location,
1984 "Disable peeling, max peels reached: %d\n", max_peel);
1989 /* Cost model #2 - if peeling may result in a remaining loop not
1990 iterating enough to be vectorized then do not peel. */
1992 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1995 = npeel == 0 ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1 : npeel;
1996 if (LOOP_VINFO_INT_NITERS (loop_vinfo)
1997 < LOOP_VINFO_VECT_FACTOR (loop_vinfo) + max_peel)
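/* Editorial sketch, not part of this file: cost model #2 above rejects
   peeling when the scalar iteration count is known and so small that fewer
   than one full vector iteration would remain after peeling.  A standalone
   model with hypothetical integer inputs:  */

static int
example_peeling_leaves_enough_iters (long long niters, long long vf,
				     long long npeel)
{
  /* When the peeling amount is only known at run time, assume the worst
     case of vf - 1 peeled iterations.  */
  long long max_peel = npeel == 0 ? vf - 1 : npeel;
  return niters >= vf + max_peel;
}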
2003 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2004 If the misalignment of DR_i is identical to that of dr0 then set
2005 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2006 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2007 by the peeling factor times the element size of DR_i (MOD the
2008 vectorization factor times the size). Otherwise, the
2009 misalignment of DR_i must be set to unknown. */
2010 FOR_EACH_VEC_ELT (datarefs, i, dr)
2013 /* Strided accesses perform only component accesses, alignment
2014 is irrelevant for them. */
2015 stmt_info = vinfo_for_stmt (DR_STMT (dr));
2016 if (STMT_VINFO_STRIDED_P (stmt_info)
2017 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2020 vect_update_misalignment_for_peel (dr, dr0, npeel);
2023 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
2025 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2027 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2028 = DR_MISALIGNMENT (dr0);
2029 SET_DR_MISALIGNMENT (dr0, 0);
2030 if (dump_enabled_p ())
2032 dump_printf_loc (MSG_NOTE, vect_location,
2033 "Alignment of access forced using peeling.\n");
2034 dump_printf_loc (MSG_NOTE, vect_location,
2035 "Peeling for alignment will be applied.\n");
2038 /* The inside-loop cost will be accounted for in vectorizable_load
2039 and vectorizable_store correctly with adjusted alignments.
2040 Drop the body_cst_vec on the floor here. */
2041 stat = vect_verify_datarefs_alignment (loop_vinfo);
2047 /* (2) Versioning to force alignment. */
2049 /* Try versioning if:
2050 1) optimize loop for speed
2051 2) there is at least one unsupported misaligned data ref with an unknown
2053 3) all misaligned data refs with a known misalignment are supported, and
2054 4) the number of runtime alignment checks is within reason. */
2057 optimize_loop_nest_for_speed_p (loop)
2058 && (!loop->inner); /* FORNOW */
2062 FOR_EACH_VEC_ELT (datarefs, i, dr)
2064 stmt = DR_STMT (dr);
2065 stmt_info = vinfo_for_stmt (stmt);
2067 /* For interleaving, only the alignment of the first access
2069 if (aligned_access_p (dr)
2070 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2071 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
2074 if (STMT_VINFO_STRIDED_P (stmt_info))
2076 /* Strided loads perform only component accesses, alignment is
2077 irrelevant for them. */
2078 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2080 do_versioning = false;
2084 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
2086 if (!supportable_dr_alignment)
2092 if (known_alignment_for_access_p (dr)
2093 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2094 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
2096 do_versioning = false;
2100 stmt = DR_STMT (dr);
2101 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
2102 gcc_assert (vectype);
2104 /* The rightmost bits of an aligned address must be zeros.
2105 Construct the mask needed for this test. For example,
2106 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2107 mask must be 15 = 0xf. */
2108 mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
2110 /* FORNOW: use the same mask to test all potentially unaligned
2111 references in the loop. The vectorizer currently supports
2112 a single vector size, see the reference to
2113 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2114 vectorization factor is computed. */
2115 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
2116 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
2117 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2118 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
2123 /* Versioning requires at least one misaligned data reference. */
2124 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2125 do_versioning = false;
2126 else if (!do_versioning)
2127 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
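/* Editorial sketch, not part of this file: the versioning test generated
   from LOOP_VINFO_PTR_MASK checks that the low bits of each potentially
   misaligned address are zero.  For a 16-byte vector the mask is 0xf, so
   an address is suitably aligned iff (addr & 0xf) == 0.  A standalone
   model of that check (names hypothetical):  */

#include <stdint.h>

static int
example_is_aligned (const void *addr, uintptr_t vector_size)
{
  uintptr_t mask = vector_size - 1;   /* vector_size assumed a power of two */
  return ((uintptr_t) addr & mask) == 0;
}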
2132 vec<gimple *> may_misalign_stmts
2133 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2136 /* It can now be assumed that the data references in the statements
2137 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2138 of the loop being vectorized. */
2139 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
2141 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2142 dr = STMT_VINFO_DATA_REF (stmt_info);
2143 SET_DR_MISALIGNMENT (dr, 0);
2144 if (dump_enabled_p ())
2145 dump_printf_loc (MSG_NOTE, vect_location,
2146 "Alignment of access forced using versioning.\n");
2149 if (dump_enabled_p ())
2150 dump_printf_loc (MSG_NOTE, vect_location,
2151 "Versioning for alignment will be applied.\n");
2153 /* Peeling and versioning can't be done together at this time. */
2154 gcc_assert (! (do_peeling && do_versioning));
2156 stat = vect_verify_datarefs_alignment (loop_vinfo);
2161 /* This point is reached if neither peeling nor versioning is being done. */
2162 gcc_assert (! (do_peeling || do_versioning));
2164 stat = vect_verify_datarefs_alignment (loop_vinfo);
2169 /* Function vect_find_same_alignment_drs.
2171 Update group and alignment relations according to the chosen
2172 vectorization factor. */
2175 vect_find_same_alignment_drs (struct data_dependence_relation *ddr)
2177 struct data_reference *dra = DDR_A (ddr);
2178 struct data_reference *drb = DDR_B (ddr);
2179 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2180 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2182 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2188 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
2189 || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2190 || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2193 /* Two references with distance zero have the same alignment. */
2194 offset_int diff = (wi::to_offset (DR_INIT (dra))
2195 - wi::to_offset (DR_INIT (drb)));
2198 /* Get the wider of the two alignments. */
2199 unsigned int align_a = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmtinfo_a));
2200 unsigned int align_b = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmtinfo_b));
2201 unsigned int max_align = MAX (align_a, align_b);
2203 /* Require the gap to be a multiple of the larger vector alignment. */
2204 if (!wi::multiple_of_p (diff, max_align, SIGNED))
2208 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2209 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2210 if (dump_enabled_p ())
2212 dump_printf_loc (MSG_NOTE, vect_location,
2213 "accesses have the same alignment: ");
2214 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2215 dump_printf (MSG_NOTE, " and ");
2216 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2217 dump_printf (MSG_NOTE, "\n");
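/* Editorial sketch, not part of this file: two references with the same
   base, offset and step have the same vector alignment whenever the
   difference of their constant start offsets is a multiple of the larger
   of the two vector alignments, which is what the wi::multiple_of_p test
   above establishes.  A standalone integer model (names hypothetical):  */

static int
example_same_vector_alignment (long long init_a, long long init_b,
			       unsigned int align_a, unsigned int align_b)
{
  unsigned int max_align = align_a > align_b ? align_a : align_b;
  long long diff = init_a - init_b;
  return diff % (long long) max_align == 0;
}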
2222 /* Function vect_analyze_data_refs_alignment
2224 Analyze the alignment of the data-references in the loop.
2225 Return FALSE if a data reference is found that cannot be vectorized. */
2228 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2230 if (dump_enabled_p ())
2231 dump_printf_loc (MSG_NOTE, vect_location,
2232 "=== vect_analyze_data_refs_alignment ===\n");
2234 /* Mark groups of data references with same alignment using
2235 data dependence information. */
2236 vec<ddr_p> ddrs = vinfo->ddrs;
2237 struct data_dependence_relation *ddr;
2240 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2241 vect_find_same_alignment_drs (ddr);
2243 vec<data_reference_p> datarefs = vinfo->datarefs;
2244 struct data_reference *dr;
2246 vect_record_base_alignments (vinfo);
2247 FOR_EACH_VEC_ELT (datarefs, i, dr)
2249 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
2250 if (STMT_VINFO_VECTORIZABLE (stmt_info)
2251 && !vect_compute_data_ref_alignment (dr))
2253 /* Strided accesses perform only component accesses, misalignment
2254 information is irrelevant for them. */
2255 if (STMT_VINFO_STRIDED_P (stmt_info)
2256 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2259 if (dump_enabled_p ())
2260 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2261 "not vectorized: can't calculate alignment "
2272 /* Analyze alignment of DRs of stmts in NODE. */
2275 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2277 /* We vectorize from the first scalar stmt in the node unless
2278 the node is permuted in which case we start from the first
2279 element in the group. */
2280 gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2281 data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2282 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2283 first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
2285 data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2286 if (! vect_compute_data_ref_alignment (dr)
2287 /* For creating the data-ref pointer we need alignment of the
2288 first element anyway. */
2290 && ! vect_compute_data_ref_alignment (first_dr))
2291 || ! verify_data_ref_alignment (dr))
2293 if (dump_enabled_p ())
2294 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2295 "not vectorized: bad data alignment in basic "
2303 /* Function vect_slp_analyze_instance_alignment
2305 Analyze the alignment of the data-references in the SLP instance.
2306 Return FALSE if a data reference is found that cannot be vectorized. */
2309 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2311 if (dump_enabled_p ())
2312 dump_printf_loc (MSG_NOTE, vect_location,
2313 "=== vect_slp_analyze_and_verify_instance_alignment ===\n");
2317 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2318 if (! vect_slp_analyze_and_verify_node_alignment (node))
2321 node = SLP_INSTANCE_TREE (instance);
2322 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]))
2323 && ! vect_slp_analyze_and_verify_node_alignment
2324 (SLP_INSTANCE_TREE (instance)))
2331 /* Analyze groups of accesses: check that DR belongs to a group of
2332 accesses of legal size, step, etc. Detect gaps, single element
2333 interleaving, and other special cases. Set grouped access info.
2334 Collect groups of strided stores for further use in SLP analysis.
2335 Worker for vect_analyze_group_access. */
2338 vect_analyze_group_access_1 (struct data_reference *dr)
2340 tree step = DR_STEP (dr);
2341 tree scalar_type = TREE_TYPE (DR_REF (dr));
2342 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2343 gimple *stmt = DR_STMT (dr);
2344 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2345 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2346 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2347 HOST_WIDE_INT dr_step = -1;
2348 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2349 bool slp_impossible = false;
2351 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2352 size of the interleaving group (including gaps). */
2353 if (tree_fits_shwi_p (step))
2355 dr_step = tree_to_shwi (step);
2356 /* Check that STEP is a multiple of type size. Otherwise there is
2357 a non-element-sized gap at the end of the group which we
2358 cannot represent in GROUP_GAP or GROUP_SIZE.
2359 ??? As we can handle non-constant step fine here we should
2360 simply remove uses of GROUP_GAP between the last and first
2361 element and instead rely on DR_STEP. GROUP_SIZE then would
2362 simply not include that gap. */
2363 if ((dr_step % type_size) != 0)
2365 if (dump_enabled_p ())
2367 dump_printf_loc (MSG_NOTE, vect_location,
2369 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2370 dump_printf (MSG_NOTE,
2371 " is not a multiple of the element size for ");
2372 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2373 dump_printf (MSG_NOTE, "\n");
2377 groupsize = absu_hwi (dr_step) / type_size;
2382 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2383 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2385 /* Check if it this DR is a part of interleaving, and is a single
2386 element of the group that is accessed in the loop. */
2388 /* Gaps are supported only for loads. STEP must be a multiple of the type
2389 size. The size of the group must be a power of 2. */
2391 && (dr_step
% type_size
) == 0
2393 && pow2p_hwi (groupsize
))
2395 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) = stmt
;
2396 GROUP_SIZE (vinfo_for_stmt (stmt
)) = groupsize
;
2397 GROUP_GAP (stmt_info
) = groupsize
- 1;
2398 if (dump_enabled_p ())
2400 dump_printf_loc (MSG_NOTE
, vect_location
,
2401 "Detected single element interleaving ");
2402 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dr
));
2403 dump_printf (MSG_NOTE
, " step ");
2404 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, step
);
2405 dump_printf (MSG_NOTE
, "\n");
2411 if (dump_enabled_p ())
2413 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2414 "not consecutive access ");
2415 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
2420 /* Mark the statement as unvectorizable. */
2421 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr
))) = false;
2425 dump_printf_loc (MSG_NOTE
, vect_location
, "using strided accesses\n");
2426 STMT_VINFO_STRIDED_P (stmt_info
) = true;
2430 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) == stmt
)
2432 /* First stmt in the interleaving chain. Check the chain. */
2433 gimple
*next
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt
));
2434 struct data_reference
*data_ref
= dr
;
2435 unsigned int count
= 1;
2436 tree prev_init
= DR_INIT (data_ref
);
2437 gimple
*prev
= stmt
;
2438 HOST_WIDE_INT diff
, gaps
= 0;
2442 /* Skip same data-refs. In case that two or more stmts share
2443 data-ref (supported only for loads), we vectorize only the first
2444 stmt, and the rest get their vectorized loads from the first
2446 if (!tree_int_cst_compare (DR_INIT (data_ref
),
2447 DR_INIT (STMT_VINFO_DATA_REF (
2448 vinfo_for_stmt (next
)))))
2450 if (DR_IS_WRITE (data_ref
))
2452 if (dump_enabled_p ())
2453 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2454 "Two store stmts share the same dr.\n");
2458 if (dump_enabled_p ())
2459 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2460 "Two or more load stmts share the same dr.\n");
2462 /* For load use the same data-ref load. */
2463 GROUP_SAME_DR_STMT (vinfo_for_stmt (next
)) = prev
;
2466 next
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (next
));
2471 data_ref
= STMT_VINFO_DATA_REF (vinfo_for_stmt (next
));
2473 /* All group members have the same STEP by construction. */
2474 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref
), step
, 0));
2476 /* Check that the distance between two accesses is equal to the type
2477 size. Otherwise, we have gaps. */
2478 diff
= (TREE_INT_CST_LOW (DR_INIT (data_ref
))
2479 - TREE_INT_CST_LOW (prev_init
)) / type_size
;
2482 /* FORNOW: SLP of accesses with gaps is not supported. */
2483 slp_impossible
= true;
2484 if (DR_IS_WRITE (data_ref
))
2486 if (dump_enabled_p ())
2487 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2488 "interleaved store with gaps\n");
2495 last_accessed_element
+= diff
;
2497 /* Store the gap from the previous member of the group. If there is no
2498 gap in the access, GROUP_GAP is always 1. */
2499 GROUP_GAP (vinfo_for_stmt (next
)) = diff
;
2501 prev_init
= DR_INIT (data_ref
);
2502 next
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (next
));
2503 /* Count the number of data-refs in the chain. */
2508 groupsize
= count
+ gaps
;
2510 /* This could be UINT_MAX but as we are generating code in a very
2511 inefficient way we have to cap earlier. See PR78699 for example. */
2512 if (groupsize
> 4096)
2514 if (dump_enabled_p ())
2515 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2516 "group is too large\n");
2520 /* Check that the size of the interleaving is equal to count for stores,
2521 i.e., that there are no gaps. */
2522 if (groupsize
!= count
2523 && !DR_IS_READ (dr
))
2525 if (dump_enabled_p ())
2526 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2527 "interleaved store with gaps\n");
2531 /* If there is a gap after the last load in the group it is the
2532 difference between the groupsize and the last accessed
2534 When there is no gap, this difference should be 0. */
2535 GROUP_GAP (vinfo_for_stmt (stmt
)) = groupsize
- last_accessed_element
;
2537 GROUP_SIZE (vinfo_for_stmt (stmt
)) = groupsize
;
2538 if (dump_enabled_p ())
2540 dump_printf_loc (MSG_NOTE
, vect_location
,
2541 "Detected interleaving ");
2542 if (DR_IS_READ (dr
))
2543 dump_printf (MSG_NOTE
, "load ");
2545 dump_printf (MSG_NOTE
, "store ");
2546 dump_printf (MSG_NOTE
, "of size %u starting with ",
2547 (unsigned)groupsize
);
2548 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, stmt
, 0);
2549 if (GROUP_GAP (vinfo_for_stmt (stmt
)) != 0)
2550 dump_printf_loc (MSG_NOTE
, vect_location
,
2551 "There is a gap of %u elements after the group\n",
2552 GROUP_GAP (vinfo_for_stmt (stmt
)));
2555 /* SLP: create an SLP data structure for every interleaving group of
2556 stores for further analysis in vect_analyse_slp. */
2557 if (DR_IS_WRITE (dr
) && !slp_impossible
)
2560 LOOP_VINFO_GROUPED_STORES (loop_vinfo
).safe_push (stmt
);
2562 BB_VINFO_GROUPED_STORES (bb_vinfo
).safe_push (stmt
);
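/* Editorial sketch, not part of this file: for an interleaved access the
   group size is the constant step counted in elements, the gap stored for
   each member is its distance in elements from the previous member, and
   the trailing gap recorded on the first statement is groupsize minus the
   last accessed element.  A standalone model with hypothetical inputs:  */

static int
example_group_layout (long long step_bytes, long long type_size,
		      long long last_accessed_element,
		      long long *groupsize, long long *trailing_gap)
{
  if (type_size == 0 || step_bytes % type_size != 0)
    return 0;                 /* gap is not a whole number of elements */
  long long abs_step = step_bytes < 0 ? -step_bytes : step_bytes;
  *groupsize = abs_step / type_size;
  *trailing_gap = *groupsize - last_accessed_element;
  return 1;
}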
2569 /* Analyze groups of accesses: check that DR belongs to a group of
2570 accesses of legal size, step, etc. Detect gaps, single element
2571 interleaving, and other special cases. Set grouped access info.
2572 Collect groups of strided stores for further use in SLP analysis. */
2575 vect_analyze_group_access (struct data_reference
*dr
)
2577 if (!vect_analyze_group_access_1 (dr
))
2579 /* Dissolve the group if present. */
2581 gimple
*stmt
= GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr
)));
2584 stmt_vec_info vinfo
= vinfo_for_stmt (stmt
);
2585 next
= GROUP_NEXT_ELEMENT (vinfo
);
2586 GROUP_FIRST_ELEMENT (vinfo
) = NULL
;
2587 GROUP_NEXT_ELEMENT (vinfo
) = NULL
;
2595 /* Analyze the access pattern of the data-reference DR.
2596 In case of non-consecutive accesses call vect_analyze_group_access() to
2597 analyze groups of accesses. */
2600 vect_analyze_data_ref_access (struct data_reference
*dr
)
2602 tree step
= DR_STEP (dr
);
2603 tree scalar_type
= TREE_TYPE (DR_REF (dr
));
2604 gimple
*stmt
= DR_STMT (dr
);
2605 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
2606 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
2607 struct loop
*loop
= NULL
;
2610 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
2612 if (loop_vinfo
&& !step
)
2614 if (dump_enabled_p ())
2615 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2616 "bad data-ref access in loop\n");
2620 /* Allow loads with zero step in inner-loop vectorization. */
2621 if (loop_vinfo
&& integer_zerop (step
))
2623 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) = NULL
;
2624 if (!nested_in_vect_loop_p (loop
, stmt
))
2625 return DR_IS_READ (dr
);
2626 /* Allow references with zero step for outer loops marked
2627 with pragma omp simd only - it guarantees absence of
2628 loop-carried dependencies between inner loop iterations. */
2629 if (!loop
->force_vectorize
)
2631 if (dump_enabled_p ())
2632 dump_printf_loc (MSG_NOTE
, vect_location
,
2633 "zero step in inner loop of nest\n");
2638 if (loop
&& nested_in_vect_loop_p (loop
, stmt
))
2640 /* Interleaved accesses are not yet supported within outer-loop
2641 vectorization for references in the inner-loop. */
2642 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) = NULL
;
2644 /* For the rest of the analysis we use the outer-loop step. */
2645 step
= STMT_VINFO_DR_STEP (stmt_info
);
2646 if (integer_zerop (step
))
2648 if (dump_enabled_p ())
2649 dump_printf_loc (MSG_NOTE
, vect_location
,
2650 "zero step in outer loop.\n");
2651 return DR_IS_READ (dr
);
2656 if (TREE_CODE (step
) == INTEGER_CST
)
2658 HOST_WIDE_INT dr_step
= TREE_INT_CST_LOW (step
);
2659 if (!tree_int_cst_compare (step
, TYPE_SIZE_UNIT (scalar_type
))
2661 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type
), -dr_step
)))
2663 /* Mark that it is not interleaving. */
2664 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) = NULL
;
2669 if (loop
&& nested_in_vect_loop_p (loop
, stmt
))
2671 if (dump_enabled_p ())
2672 dump_printf_loc (MSG_NOTE
, vect_location
,
2673 "grouped access in outer loop.\n");
2678 /* Assume this is a DR handled by non-constant strided load case. */
2679 if (TREE_CODE (step
) != INTEGER_CST
)
2680 return (STMT_VINFO_STRIDED_P (stmt_info
)
2681 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info
)
2682 || vect_analyze_group_access (dr
)));
2684 /* Not consecutive access - check if it's a part of interleaving group. */
2685 return vect_analyze_group_access (dr
);
2688 /* Compare two data-references DRA and DRB to group them into chunks
2689 suitable for grouping. */
2692 dr_group_sort_cmp (const void *dra_, const void *drb_)
2694 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2695 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2698 /* Stabilize sort. */
2702 /* DRs in different loops never belong to the same group. */
2703 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2704 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2706 return loopa->num < loopb->num ? -1 : 1;
2708 /* Ordering of DRs according to base. */
2709 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0))
2711 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2712 DR_BASE_ADDRESS (drb));
2717 /* And according to DR_OFFSET. */
2718 if (!dr_equal_offsets_p (dra, drb))
2720 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2725 /* Put reads before writes. */
2726 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2727 return DR_IS_READ (dra) ? -1 : 1;
2729 /* Then sort after access size. */
2730 if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2731 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0))
2733 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2734 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2739 /* And after step. */
2740 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2742 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2747 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2748 cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb));
2750 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
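/* Editorial sketch, not part of this file: dr_group_sort_cmp is a qsort
   comparator ordering by several keys in turn (loop, base address, offset,
   read/write, access size, step, init) and finally by statement UID so
   that the sort is stable.  The same multi-key pattern on a plain C struct
   with hypothetical fields looks like this:  */

struct example_dr { long base, offset, init; int is_read; unsigned uid; };

static int
example_dr_cmp (const void *pa, const void *pb)
{
  const struct example_dr *a = (const struct example_dr *) pa;
  const struct example_dr *b = (const struct example_dr *) pb;

  if (a->base != b->base)
    return a->base < b->base ? -1 : 1;
  if (a->offset != b->offset)
    return a->offset < b->offset ? -1 : 1;
  if (a->is_read != b->is_read)
    return a->is_read ? -1 : 1;                  /* reads before writes */
  if (a->init != b->init)
    return a->init < b->init ? -1 : 1;
  /* Stabilize on a unique statement UID so equal keys keep their order.  */
  return a->uid < b->uid ? -1 : a->uid == b->uid ? 0 : 1;
}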
2754 /* Function vect_analyze_data_ref_accesses.
2756 Analyze the access pattern of all the data references in the loop.
2758 FORNOW: the only access pattern that is considered vectorizable is a
2759 simple step 1 (consecutive) access.
2761 FORNOW: handle only arrays and pointer accesses. */
2764 vect_analyze_data_ref_accesses (vec_info
*vinfo
)
2767 vec
<data_reference_p
> datarefs
= vinfo
->datarefs
;
2768 struct data_reference
*dr
;
2770 if (dump_enabled_p ())
2771 dump_printf_loc (MSG_NOTE
, vect_location
,
2772 "=== vect_analyze_data_ref_accesses ===\n");
2774 if (datarefs
.is_empty ())
2777 /* Sort the array of datarefs to make building the interleaving chains
2778 linear. Don't modify the original vector's order, it is needed for
2779 determining what dependencies are reversed. */
2780 vec
<data_reference_p
> datarefs_copy
= datarefs
.copy ();
2781 datarefs_copy
.qsort (dr_group_sort_cmp
);
2783 /* Build the interleaving chains. */
2784 for (i
= 0; i
< datarefs_copy
.length () - 1;)
2786 data_reference_p dra
= datarefs_copy
[i
];
2787 stmt_vec_info stmtinfo_a
= vinfo_for_stmt (DR_STMT (dra
));
2788 stmt_vec_info lastinfo
= NULL
;
2789 if (! STMT_VINFO_VECTORIZABLE (stmtinfo_a
))
2794 for (i
= i
+ 1; i
< datarefs_copy
.length (); ++i
)
2796 data_reference_p drb
= datarefs_copy
[i
];
2797 stmt_vec_info stmtinfo_b
= vinfo_for_stmt (DR_STMT (drb
));
2798 if (! STMT_VINFO_VECTORIZABLE (stmtinfo_b
))
2801 /* ??? Imperfect sorting (non-compatible types, non-modulo
2802 accesses, same accesses) can lead to a group being artificially
2803 split here as we don't just skip over those. If it really
2804 matters we can push those to a worklist and re-iterate
2805 over them. Then we can just skip ahead to the next DR here. */
2807 /* DRs in a different loop should not be put into the same
2808 interleaving group. */
2809 if (gimple_bb (DR_STMT (dra
))->loop_father
2810 != gimple_bb (DR_STMT (drb
))->loop_father
)
2813 /* Check that the data-refs have same first location (except init)
2814 and they are both either store or load (not load and store,
2815 not masked loads or stores). */
2816 if (DR_IS_READ (dra
) != DR_IS_READ (drb
)
2817 || !operand_equal_p (DR_BASE_ADDRESS (dra
),
2818 DR_BASE_ADDRESS (drb
), 0)
2819 || !dr_equal_offsets_p (dra
, drb
)
2820 || !gimple_assign_single_p (DR_STMT (dra
))
2821 || !gimple_assign_single_p (DR_STMT (drb
)))
2824 /* Check that the data-refs have the same constant size. */
2825 tree sza
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra
)));
2826 tree szb
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb
)));
2827 if (!tree_fits_uhwi_p (sza
)
2828 || !tree_fits_uhwi_p (szb
)
2829 || !tree_int_cst_equal (sza
, szb
))
2832 /* Check that the data-refs have the same step. */
2833 if (!operand_equal_p (DR_STEP (dra
), DR_STEP (drb
), 0))
2836 /* Do not place the same access in the interleaving chain twice. */
2837 if (tree_int_cst_compare (DR_INIT (dra
), DR_INIT (drb
)) == 0)
2840 /* Check the types are compatible.
2841 ??? We don't distinguish this during sorting. */
2842 if (!types_compatible_p (TREE_TYPE (DR_REF (dra
)),
2843 TREE_TYPE (DR_REF (drb
))))
2846 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2847 HOST_WIDE_INT init_a
= TREE_INT_CST_LOW (DR_INIT (dra
));
2848 HOST_WIDE_INT init_b
= TREE_INT_CST_LOW (DR_INIT (drb
));
2849 gcc_assert (init_a
<= init_b
);
2851 /* If init_b == init_a + the size of the type * k, we have an
2852 interleaving, and DRA is accessed before DRB. */
2853 HOST_WIDE_INT type_size_a
= tree_to_uhwi (sza
);
2854 if (type_size_a
== 0
2855 || (init_b
- init_a
) % type_size_a
!= 0)
2858 /* If we have a store, the accesses are adjacent. This splits
2859 groups into chunks we support (we don't support vectorization
2860 of stores with gaps). */
2861 if (!DR_IS_READ (dra
)
2862 && (init_b
- (HOST_WIDE_INT
) TREE_INT_CST_LOW
2863 (DR_INIT (datarefs_copy
[i
-1]))
2867 /* If the step (if not zero or non-constant) is greater than the
2868 difference between data-refs' inits this splits groups into
2870 if (tree_fits_shwi_p (DR_STEP (dra
)))
2872 HOST_WIDE_INT step
= tree_to_shwi (DR_STEP (dra
));
2873 if (step
!= 0 && step
<= (init_b
- init_a
))
2877 if (dump_enabled_p ())
2879 dump_printf_loc (MSG_NOTE
, vect_location
,
2880 "Detected interleaving ");
2881 if (DR_IS_READ (dra
))
2882 dump_printf (MSG_NOTE
, "load ");
2884 dump_printf (MSG_NOTE
, "store ");
2885 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dra
));
2886 dump_printf (MSG_NOTE
, " and ");
2887 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (drb
));
2888 dump_printf (MSG_NOTE
, "\n");
2891 /* Link the found element into the group list. */
2892 if (!GROUP_FIRST_ELEMENT (stmtinfo_a
))
2894 GROUP_FIRST_ELEMENT (stmtinfo_a
) = DR_STMT (dra
);
2895 lastinfo
= stmtinfo_a
;
2897 GROUP_FIRST_ELEMENT (stmtinfo_b
) = DR_STMT (dra
);
2898 GROUP_NEXT_ELEMENT (lastinfo
) = DR_STMT (drb
);
2899 lastinfo
= stmtinfo_b
;
2903 FOR_EACH_VEC_ELT (datarefs_copy
, i
, dr
)
2904 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr
)))
2905 && !vect_analyze_data_ref_access (dr
))
2907 if (dump_enabled_p ())
2908 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2909 "not vectorized: complicated access pattern.\n");
2911 if (is_a
<bb_vec_info
> (vinfo
))
2913 /* Mark the statement as not vectorizable. */
2914 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr
))) = false;
2919 datarefs_copy
.release ();
2924 datarefs_copy
.release ();
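/* Editorial sketch, not part of this file: after sorting, two data-refs
   with the same base, offset, step and size belong to the same
   interleaving chain when the difference of their constant inits is a
   whole number of elements that is still smaller than the (nonzero) step.
   A standalone integer model of that test, with hypothetical names:  */

static int
example_same_interleaving_chain (long long init_a, long long init_b,
				 long long type_size, long long step)
{
  long long diff = init_b - init_a;   /* sorting guarantees init_a <= init_b */
  if (diff == 0 || type_size == 0)
    return 0;                         /* identical access or unknown size */
  if (diff % type_size != 0)
    return 0;                         /* not a whole number of elements apart */
  if (step != 0 && step <= diff)
    return 0;                         /* the next group starts before DRB */
  return 1;
}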
2928 /* Function vect_vfa_segment_size.
2930 Create an expression that computes the size of segment
2931 that will be accessed for a data reference. The function takes into
2932 account that realignment loads may access one more vector.
2935 DR: The data reference.
2936 LENGTH_FACTOR: segment length to consider.
2938 Return an expression whose value is the size of segment which will be
2942 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
2944 tree segment_length;
2946 if (integer_zerop (DR_STEP (dr)))
2947 segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
2949 segment_length = size_binop (MULT_EXPR,
2950 fold_convert (sizetype, DR_STEP (dr)),
2951 fold_convert (sizetype, length_factor));
2953 if (vect_supportable_dr_alignment (dr, false)
2954 == dr_explicit_realign_optimized)
2956 tree vector_size = TYPE_SIZE_UNIT
2957 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
2959 segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
2961 return segment_length;
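/* Editorial sketch, not part of this file: the segment length used for
   run-time alias checks is one element for an invariant access, otherwise
   the step multiplied by the chosen length factor (the scalar iteration
   count when the two steps differ, else the vectorization factor), with
   one extra vector added for the optimized realignment scheme.  A
   standalone model with hypothetical integer inputs:  */

static long long
example_segment_size (long long step, long long element_size,
		      long long length_factor, int needs_realign,
		      long long vector_size)
{
  long long len = step == 0 ? element_size : step * length_factor;
  if (needs_realign)
    len += vector_size;       /* realignment may read one more vector */
  return len;
}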
2964 /* Function vect_no_alias_p.
2966 Given data references A and B with equal base and offset, the alias
2967 relation can be decided at compilation time, return TRUE if they do
2968 not alias to each other; return FALSE otherwise. SEGMENT_LENGTH_A
2969 and SEGMENT_LENGTH_B are the memory lengths accessed by A and B
2973 vect_no_alias_p (struct data_reference *a, struct data_reference *b,
2974 tree segment_length_a, tree segment_length_b)
2976 gcc_assert (TREE_CODE (DR_INIT (a)) == INTEGER_CST
2977 && TREE_CODE (DR_INIT (b)) == INTEGER_CST);
2978 if (tree_int_cst_equal (DR_INIT (a), DR_INIT (b)))
2981 tree seg_a_min = DR_INIT (a);
2982 tree seg_a_max = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_a_min),
2983 seg_a_min, segment_length_a);
2984 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
2985 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
2987 if (tree_int_cst_compare (DR_STEP (a), size_zero_node) < 0)
2989 tree unit_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (a)));
2990 seg_a_min = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_a_max),
2991 seg_a_max, unit_size);
2992 seg_a_max = fold_build2 (PLUS_EXPR, TREE_TYPE (DR_INIT (a)),
2993 DR_INIT (a), unit_size);
2995 tree seg_b_min = DR_INIT (b);
2996 tree seg_b_max = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_b_min),
2997 seg_b_min, segment_length_b);
2998 if (tree_int_cst_compare (DR_STEP (b), size_zero_node) < 0)
3000 tree unit_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (b)));
3001 seg_b_min = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_b_max),
3002 seg_b_max, unit_size);
3003 seg_b_max = fold_build2 (PLUS_EXPR, TREE_TYPE (DR_INIT (b)),
3004 DR_INIT (b), unit_size);
3007 if (tree_int_cst_le (seg_a_max, seg_b_min)
3008 || tree_int_cst_le (seg_b_max, seg_a_min))
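/* Editorial sketch, not part of this file: once both references are
   reduced to the byte intervals [min, max) they touch, they provably do
   not alias when one interval ends no later than the other begins, which
   is exactly the pair of tree_int_cst_le tests above.  A standalone
   integer model:  */

static int
example_segments_do_not_overlap (long long a_min, long long a_max,
				 long long b_min, long long b_max)
{
  return a_max <= b_min || b_max <= a_min;
}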
3014 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3018 dependence_distance_ge_vf (data_dependence_relation *ddr,
3019 unsigned int loop_depth, unsigned HOST_WIDE_INT vf)
3021 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3022 || DDR_NUM_DIST_VECTS (ddr) == 0)
3025 /* If the dependence is exact, we should have limited the VF instead. */
3026 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3029 lambda_vector dist_v;
3030 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3032 HOST_WIDE_INT dist = dist_v[loop_depth];
3034 && !(dist > 0 && DDR_REVERSED_P (ddr))
3035 && (unsigned HOST_WIDE_INT) abs_hwi (dist) < vf)
3039 if (dump_enabled_p ())
3041 dump_printf_loc (MSG_NOTE, vect_location,
3042 "dependence distance between ");
3043 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
3044 dump_printf (MSG_NOTE, " and ");
3045 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
3046 dump_printf (MSG_NOTE, " is >= VF\n");
3052 /* Function vect_prune_runtime_alias_test_list.
3054 Prune a list of ddrs to be tested at run-time by versioning for alias.
3055 Merge several alias checks into one if possible.
3056 Return FALSE if the resulting list of ddrs is longer than allowed by
3057 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3060 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo
)
3062 typedef pair_hash
<tree_operand_hash
, tree_operand_hash
> tree_pair_hash
;
3063 hash_set
<tree_pair_hash
> compared_objects
;
3065 vec
<ddr_p
> may_alias_ddrs
= LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo
);
3066 vec
<dr_with_seg_len_pair_t
> &comp_alias_ddrs
3067 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo
);
3068 vec
<vec_object_pair
> &check_unequal_addrs
3069 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo
);
3070 int vect_factor
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
3071 tree scalar_loop_iters
= LOOP_VINFO_NITERS (loop_vinfo
);
3077 if (dump_enabled_p ())
3078 dump_printf_loc (MSG_NOTE
, vect_location
,
3079 "=== vect_prune_runtime_alias_test_list ===\n");
3081 if (may_alias_ddrs
.is_empty ())
3084 comp_alias_ddrs
.create (may_alias_ddrs
.length ());
3086 unsigned int loop_depth
3087 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo
)->num
,
3088 LOOP_VINFO_LOOP_NEST (loop_vinfo
));
3090 /* First, we collect all data ref pairs for aliasing checks. */
3091 FOR_EACH_VEC_ELT (may_alias_ddrs
, i
, ddr
)
3094 struct data_reference
*dr_a
, *dr_b
;
3095 gimple
*dr_group_first_a
, *dr_group_first_b
;
3096 tree segment_length_a
, segment_length_b
;
3097 gimple
*stmt_a
, *stmt_b
;
3099 /* Ignore the alias if the VF we chose ended up being no greater
3100 than the dependence distance. */
3101 if (dependence_distance_ge_vf (ddr
, loop_depth
, vect_factor
))
3104 if (DDR_OBJECT_A (ddr
))
3106 vec_object_pair
new_pair (DDR_OBJECT_A (ddr
), DDR_OBJECT_B (ddr
));
3107 if (!compared_objects
.add (new_pair
))
3109 if (dump_enabled_p ())
3111 dump_printf_loc (MSG_NOTE
, vect_location
, "checking that ");
3112 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, new_pair
.first
);
3113 dump_printf (MSG_NOTE
, " and ");
3114 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, new_pair
.second
);
3115 dump_printf (MSG_NOTE
, " have different addresses\n");
3117 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo
).safe_push (new_pair
);
3123 stmt_a
= DR_STMT (DDR_A (ddr
));
3124 dr_group_first_a
= GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a
));
3125 if (dr_group_first_a
)
3127 stmt_a
= dr_group_first_a
;
3128 dr_a
= STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a
));
3132 stmt_b
= DR_STMT (DDR_B (ddr
));
3133 dr_group_first_b
= GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b
));
3134 if (dr_group_first_b
)
3136 stmt_b
= dr_group_first_b
;
3137 dr_b
= STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b
));
3140 if (!operand_equal_p (DR_STEP (dr_a
), DR_STEP (dr_b
), 0))
3141 length_factor
= scalar_loop_iters
;
3143 length_factor
= size_int (vect_factor
);
3144 segment_length_a
= vect_vfa_segment_size (dr_a
, length_factor
);
3145 segment_length_b
= vect_vfa_segment_size (dr_b
, length_factor
);
3147 comp_res
= data_ref_compare_tree (DR_BASE_ADDRESS (dr_a
),
3148 DR_BASE_ADDRESS (dr_b
));
3150 comp_res
= data_ref_compare_tree (DR_OFFSET (dr_a
),
3153 /* Alias is known at compilation time. */
3155 && TREE_CODE (DR_STEP (dr_a
)) == INTEGER_CST
3156 && TREE_CODE (DR_STEP (dr_b
)) == INTEGER_CST
3157 && TREE_CODE (segment_length_a
) == INTEGER_CST
3158 && TREE_CODE (segment_length_b
) == INTEGER_CST
)
3160 if (vect_no_alias_p (dr_a
, dr_b
, segment_length_a
, segment_length_b
))
3163 if (dump_enabled_p ())
3164 dump_printf_loc (MSG_NOTE
, vect_location
,
3165 "not vectorized: compilation time alias.\n");
3170 dr_with_seg_len_pair_t dr_with_seg_len_pair
3171 (dr_with_seg_len (dr_a
, segment_length_a
),
3172 dr_with_seg_len (dr_b
, segment_length_b
));
3174 /* Canonicalize pairs by sorting the two DR members. */
3176 std::swap (dr_with_seg_len_pair
.first
, dr_with_seg_len_pair
.second
);
3178 comp_alias_ddrs
.safe_push (dr_with_seg_len_pair
);
3181 prune_runtime_alias_test_list (&comp_alias_ddrs
,
3182 (unsigned HOST_WIDE_INT
) vect_factor
);
3184 unsigned int count
= (comp_alias_ddrs
.length ()
3185 + check_unequal_addrs
.length ());
3186 dump_printf_loc (MSG_NOTE
, vect_location
,
3187 "improved number of alias checks from %d to %d\n",
3188 may_alias_ddrs
.length (), count
);
3189 if ((int) count
> PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS
))
3191 if (dump_enabled_p ())
3192 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3193 "number of versioning for alias "
3194 "run-time tests exceeds %d "
3195 "(--param vect-max-version-for-alias-checks)\n",
3196 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS
));
3203 /* Return true if a non-affine read or write in STMT is suitable for a
3204 gather load or scatter store. Describe the operation in *INFO if so. */
3207 vect_check_gather_scatter (gimple
*stmt
, loop_vec_info loop_vinfo
,
3208 gather_scatter_info
*info
)
3210 HOST_WIDE_INT scale
= 1, pbitpos
, pbitsize
;
3211 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
3212 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
3213 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
3214 tree offtype
= NULL_TREE
;
3215 tree decl
, base
, off
;
3217 int punsignedp
, reversep
, pvolatilep
= 0;
3220 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3221 see if we can use the def stmt of the address. */
3222 if (is_gimple_call (stmt
)
3223 && gimple_call_internal_p (stmt
)
3224 && (gimple_call_internal_fn (stmt
) == IFN_MASK_LOAD
3225 || gimple_call_internal_fn (stmt
) == IFN_MASK_STORE
)
3226 && TREE_CODE (base
) == MEM_REF
3227 && TREE_CODE (TREE_OPERAND (base
, 0)) == SSA_NAME
3228 && integer_zerop (TREE_OPERAND (base
, 1))
3229 && !expr_invariant_in_loop_p (loop
, TREE_OPERAND (base
, 0)))
3231 gimple
*def_stmt
= SSA_NAME_DEF_STMT (TREE_OPERAND (base
, 0));
3232 if (is_gimple_assign (def_stmt
)
3233 && gimple_assign_rhs_code (def_stmt
) == ADDR_EXPR
)
3234 base
= TREE_OPERAND (gimple_assign_rhs1 (def_stmt
), 0);
3237 /* The gather and scatter builtins need address of the form
3238 loop_invariant + vector * {1, 2, 4, 8}
3240 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3241 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3242 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3243 multiplications and additions in it. To get a vector, we need
3244 a single SSA_NAME that will be defined in the loop and will
3245 contain everything that is not loop invariant and that can be
3246 vectorized. The following code attempts to find such a preexisting
3247 SSA_NAME OFF and put the loop invariants into a tree BASE
3248 that can be gimplified before the loop. */
3249 base
= get_inner_reference (base
, &pbitsize
, &pbitpos
, &off
, &pmode
,
3250 &punsignedp
, &reversep
, &pvolatilep
);
3251 gcc_assert (base
&& (pbitpos
% BITS_PER_UNIT
) == 0 && !reversep
);
3253 if (TREE_CODE (base
) == MEM_REF
)
3255 if (!integer_zerop (TREE_OPERAND (base
, 1)))
3257 if (off
== NULL_TREE
)
3259 offset_int moff
= mem_ref_offset (base
);
3260 off
= wide_int_to_tree (sizetype
, moff
);
3263 off
= size_binop (PLUS_EXPR
, off
,
3264 fold_convert (sizetype
, TREE_OPERAND (base
, 1)));
3266 base
= TREE_OPERAND (base
, 0);
3269 base
= build_fold_addr_expr (base
);
3271 if (off
== NULL_TREE
)
3272 off
= size_zero_node
;
3274 /* If base is not loop invariant, either off is 0, then we start with just
3275 the constant offset in the loop invariant BASE and continue with base
3276 as OFF, otherwise give up.
3277 We could handle that case by gimplifying the addition of base + off
3278 into some SSA_NAME and use that as off, but for now punt. */
3279 if (!expr_invariant_in_loop_p (loop
, base
))
3281 if (!integer_zerop (off
))
3284 base
= size_int (pbitpos
/ BITS_PER_UNIT
);
3286 /* Otherwise put base + constant offset into the loop invariant BASE
3287 and continue with OFF. */
3290 base
= fold_convert (sizetype
, base
);
3291 base
= size_binop (PLUS_EXPR
, base
, size_int (pbitpos
/ BITS_PER_UNIT
));
3294 /* OFF at this point may be either a SSA_NAME or some tree expression
3295 from get_inner_reference. Try to peel off loop invariants from it
3296 into BASE as long as possible. */
3298 while (offtype
== NULL_TREE
)
3300 enum tree_code code
;
3301 tree op0
, op1
, add
= NULL_TREE
;
3303 if (TREE_CODE (off
) == SSA_NAME
)
3305 gimple
*def_stmt
= SSA_NAME_DEF_STMT (off
);
3307 if (expr_invariant_in_loop_p (loop
, off
))
3310 if (gimple_code (def_stmt
) != GIMPLE_ASSIGN
)
3313 op0
= gimple_assign_rhs1 (def_stmt
);
3314 code
= gimple_assign_rhs_code (def_stmt
);
3315 op1
= gimple_assign_rhs2 (def_stmt
);
3319 if (get_gimple_rhs_class (TREE_CODE (off
)) == GIMPLE_TERNARY_RHS
)
3321 code
= TREE_CODE (off
);
3322 extract_ops_from_tree (off
, &code
, &op0
, &op1
);
3326 case POINTER_PLUS_EXPR
:
3328 if (expr_invariant_in_loop_p (loop
, op0
))
3333 add
= fold_convert (sizetype
, add
);
3335 add
= size_binop (MULT_EXPR
, add
, size_int (scale
));
3336 base
= size_binop (PLUS_EXPR
, base
, add
);
3339 if (expr_invariant_in_loop_p (loop
, op1
))
3347 if (expr_invariant_in_loop_p (loop
, op1
))
3349 add
= fold_convert (sizetype
, op1
);
3350 add
= size_binop (MINUS_EXPR
, size_zero_node
, add
);
3356 if (scale
== 1 && tree_fits_shwi_p (op1
))
3358 scale
= tree_to_shwi (op1
);
3367 if (!POINTER_TYPE_P (TREE_TYPE (op0
))
3368 && !INTEGRAL_TYPE_P (TREE_TYPE (op0
)))
3370 if (TYPE_PRECISION (TREE_TYPE (op0
))
3371 == TYPE_PRECISION (TREE_TYPE (off
)))
3376 if (TYPE_PRECISION (TREE_TYPE (op0
))
3377 < TYPE_PRECISION (TREE_TYPE (off
)))
3380 offtype
= TREE_TYPE (off
);
3391 /* If at the end OFF still isn't a SSA_NAME or isn't
3392 defined in the loop, punt. */
3393 if (TREE_CODE (off
) != SSA_NAME
3394 || expr_invariant_in_loop_p (loop
, off
))
3397 if (offtype
== NULL_TREE
)
3398 offtype
= TREE_TYPE (off
);
3400 if (DR_IS_READ (dr
))
3401 decl
= targetm
.vectorize
.builtin_gather (STMT_VINFO_VECTYPE (stmt_info
),
3404 decl
= targetm
.vectorize
.builtin_scatter (STMT_VINFO_VECTYPE (stmt_info
),
3407 if (decl
== NULL_TREE
)
3413 info
->offset_dt
= vect_unknown_def_type
;
3414 info
->offset_vectype
= NULL_TREE
;
3415 info
->scale
= scale
;
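/* Editorial sketch, not part of this file: the address form the
   gather/scatter builtins expect is a loop-invariant base plus a
   loop-varying offset scaled by 1, 2, 4 or 8, i.e. effectively
   base + off * scale for every lane.  A standalone model of the
   per-element address computation, with hypothetical arguments:  */

#include <stdint.h>

static uintptr_t
example_gather_address (uintptr_t invariant_base, int64_t offset_elt,
			int scale)
{
  /* SCALE is assumed to be one of 1, 2, 4 or 8, as required above.  */
  return invariant_base + (uintptr_t) (offset_elt * scale);
}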
3419 /* Function vect_analyze_data_refs.
3421 Find all the data references in the loop or basic block.
3423 The general structure of the analysis of data refs in the vectorizer is as
3425 1- vect_analyze_data_refs(loop/bb): call
3426 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3427 in the loop/bb and their dependences.
3428 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3429 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3430 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3435 vect_analyze_data_refs (vec_info
*vinfo
, int *min_vf
)
3437 struct loop
*loop
= NULL
;
3439 struct data_reference
*dr
;
3442 if (dump_enabled_p ())
3443 dump_printf_loc (MSG_NOTE
, vect_location
,
3444 "=== vect_analyze_data_refs ===\n");
3446 if (loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
))
3447 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
3449 /* Go through the data-refs, check that the analysis succeeded. Update
3450 pointer from stmt_vec_info struct to DR and vectype. */
3452 vec
<data_reference_p
> datarefs
= vinfo
->datarefs
;
3453 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
3456 stmt_vec_info stmt_info
;
3457 tree base
, offset
, init
;
3458 enum { SG_NONE
, GATHER
, SCATTER
} gatherscatter
= SG_NONE
;
3459 bool simd_lane_access
= false;
3463 if (!dr
|| !DR_REF (dr
))
3465 if (dump_enabled_p ())
3466 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3467 "not vectorized: unhandled data-ref\n");
3471 stmt
= DR_STMT (dr
);
3472 stmt_info
= vinfo_for_stmt (stmt
);
3474 /* Discard clobbers from the dataref vector. We will remove
3475 clobber stmts during vectorization. */
3476 if (gimple_clobber_p (stmt
))
3479 if (i
== datarefs
.length () - 1)
3484 datarefs
.ordered_remove (i
);
3489 /* Check that analysis of the data-ref succeeded. */
3490 if (!DR_BASE_ADDRESS (dr
) || !DR_OFFSET (dr
) || !DR_INIT (dr
)
3495 && !TREE_THIS_VOLATILE (DR_REF (dr
))
3496 && targetm
.vectorize
.builtin_gather
!= NULL
;
3499 && !TREE_THIS_VOLATILE (DR_REF (dr
))
3500 && targetm
.vectorize
.builtin_scatter
!= NULL
;
3501 bool maybe_simd_lane_access
3502 = is_a
<loop_vec_info
> (vinfo
) && loop
->simduid
;
3504 /* If target supports vector gather loads or scatter stores, or if
3505 this might be a SIMD lane access, see if they can't be used. */
3506 if (is_a
<loop_vec_info
> (vinfo
)
3507 && (maybe_gather
|| maybe_scatter
|| maybe_simd_lane_access
)
3508 && !nested_in_vect_loop_p (loop
, stmt
))
3510 struct data_reference
*newdr
3511 = create_data_ref (NULL
, loop_containing_stmt (stmt
),
3512 DR_REF (dr
), stmt
, !maybe_scatter
,
3513 DR_IS_CONDITIONAL_IN_STMT (dr
));
3514 gcc_assert (newdr
!= NULL
&& DR_REF (newdr
));
3515 if (DR_BASE_ADDRESS (newdr
)
3516 && DR_OFFSET (newdr
)
3519 && integer_zerop (DR_STEP (newdr
)))
3521 if (maybe_simd_lane_access
)
3523 tree off
= DR_OFFSET (newdr
);
3525 if (TREE_CODE (DR_INIT (newdr
)) == INTEGER_CST
3526 && TREE_CODE (off
) == MULT_EXPR
3527 && tree_fits_uhwi_p (TREE_OPERAND (off
, 1)))
3529 tree step
= TREE_OPERAND (off
, 1);
3530 off
= TREE_OPERAND (off
, 0);
3532 if (CONVERT_EXPR_P (off
)
3533 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off
,
3535 < TYPE_PRECISION (TREE_TYPE (off
)))
3536 off
= TREE_OPERAND (off
, 0);
3537 if (TREE_CODE (off
) == SSA_NAME
)
3539 gimple
*def
= SSA_NAME_DEF_STMT (off
);
3540 tree reft
= TREE_TYPE (DR_REF (newdr
));
3541 if (is_gimple_call (def
)
3542 && gimple_call_internal_p (def
)
3543 && (gimple_call_internal_fn (def
)
3544 == IFN_GOMP_SIMD_LANE
))
3546 tree arg
= gimple_call_arg (def
, 0);
3547 gcc_assert (TREE_CODE (arg
) == SSA_NAME
);
3548 arg
= SSA_NAME_VAR (arg
);
3549 if (arg
== loop
->simduid
3551 && tree_int_cst_equal
3552 (TYPE_SIZE_UNIT (reft
),
3555 DR_OFFSET (newdr
) = ssize_int (0);
3556 DR_STEP (newdr
) = step
;
3557 DR_OFFSET_ALIGNMENT (newdr
)
3558 = BIGGEST_ALIGNMENT
;
3559 DR_STEP_ALIGNMENT (newdr
)
3560 = highest_pow2_factor (step
);
3562 simd_lane_access
= true;
3568 if (!simd_lane_access
&& (maybe_gather
|| maybe_scatter
))
3572 gatherscatter
= GATHER
;
3574 gatherscatter
= SCATTER
;
3577 if (gatherscatter
== SG_NONE
&& !simd_lane_access
)
3578 free_data_ref (newdr
);
3581 if (gatherscatter
== SG_NONE
&& !simd_lane_access
)
3583 if (dump_enabled_p ())
3585 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3586 "not vectorized: data ref analysis "
3588 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3591 if (is_a
<bb_vec_info
> (vinfo
))
3598 if (TREE_CODE (DR_BASE_ADDRESS (dr
)) == INTEGER_CST
)
3600 if (dump_enabled_p ())
3601 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3602 "not vectorized: base addr of dr is a "
3605 if (is_a
<bb_vec_info
> (vinfo
))
3608 if (gatherscatter
!= SG_NONE
|| simd_lane_access
)
3613 if (TREE_THIS_VOLATILE (DR_REF (dr
)))
3615 if (dump_enabled_p ())
3617 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3618 "not vectorized: volatile type ");
3619 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3622 if (is_a
<bb_vec_info
> (vinfo
))
3628 if (stmt_can_throw_internal (stmt
))
3630 if (dump_enabled_p ())
3632 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3633 "not vectorized: statement can throw an "
3635 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3638 if (is_a
<bb_vec_info
> (vinfo
))
3641 if (gatherscatter
!= SG_NONE
|| simd_lane_access
)
3646 if (TREE_CODE (DR_REF (dr
)) == COMPONENT_REF
3647 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr
), 1)))
3649 if (dump_enabled_p ())
3651 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3652 "not vectorized: statement is bitfield "
3654 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3657 if (is_a
<bb_vec_info
> (vinfo
))
3660 if (gatherscatter
!= SG_NONE
|| simd_lane_access
)
3665 base
= unshare_expr (DR_BASE_ADDRESS (dr
));
3666 offset
= unshare_expr (DR_OFFSET (dr
));
3667 init
= unshare_expr (DR_INIT (dr
));
3669 if (is_gimple_call (stmt
)
3670 && (!gimple_call_internal_p (stmt
)
3671 || (gimple_call_internal_fn (stmt
) != IFN_MASK_LOAD
3672 && gimple_call_internal_fn (stmt
) != IFN_MASK_STORE
)))
3674 if (dump_enabled_p ())
3676 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3677 "not vectorized: dr in a call ");
3678 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3681 if (is_a
<bb_vec_info
> (vinfo
))
3684 if (gatherscatter
!= SG_NONE
|| simd_lane_access
)
3689 /* Update DR field in stmt_vec_info struct. */
3691 /* If the dataref is in an inner-loop of the loop that is considered
3692 for vectorization, we also want to analyze the access relative to
3693 the outer-loop (DR contains information only relative to the
3694 inner-most enclosing loop). We do that by building a reference to the
3695 first location accessed by the inner-loop, and analyze it relative to
3697 if (loop
&& nested_in_vect_loop_p (loop
, stmt
))
3699 /* Build a reference to the first location accessed by the
3700 inner loop: *(BASE + INIT + OFFSET). By construction,
3701 this address must be invariant in the inner loop, so we
3702 can consider it as being used in the outer loop. */
3703 tree init_offset
= fold_build2 (PLUS_EXPR
, TREE_TYPE (offset
),
3705 tree init_addr
= fold_build_pointer_plus (base
, init_offset
);
3706 tree init_ref
= build_fold_indirect_ref (init_addr
);
3708 if (dump_enabled_p ())
3710 dump_printf_loc (MSG_NOTE
, vect_location
,
3711 "analyze in outer loop: ");
3712 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, init_ref
);
3713 dump_printf (MSG_NOTE
, "\n");
3716 if (!dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info
),
3718 /* dr_analyze_innermost already explained the failure. */
3721 if (dump_enabled_p ())
3723 dump_printf_loc (MSG_NOTE
, vect_location
,
3724 "\touter base_address: ");
3725 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3726 STMT_VINFO_DR_BASE_ADDRESS (stmt_info
));
3727 dump_printf (MSG_NOTE
, "\n\touter offset from base address: ");
3728 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3729 STMT_VINFO_DR_OFFSET (stmt_info
));
3730 dump_printf (MSG_NOTE
,
3731 "\n\touter constant offset from base address: ");
3732 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3733 STMT_VINFO_DR_INIT (stmt_info
));
3734 dump_printf (MSG_NOTE
, "\n\touter step: ");
3735 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3736 STMT_VINFO_DR_STEP (stmt_info
));
3737 dump_printf (MSG_NOTE
, "\n\touter base alignment: %d\n",
3738 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info
));
3739 dump_printf (MSG_NOTE
, "\n\touter base misalignment: %d\n",
3740 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info
));
3741 dump_printf (MSG_NOTE
, "\n\touter offset alignment: %d\n",
3742 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info
));
3743 dump_printf (MSG_NOTE
, "\n\touter step alignment: %d\n",
3744 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info
));
3748 if (STMT_VINFO_DATA_REF (stmt_info
))
3750 if (dump_enabled_p ())
3752 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3753 "not vectorized: more than one data ref "
3755 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3758 if (is_a
<bb_vec_info
> (vinfo
))
3761 if (gatherscatter
!= SG_NONE
|| simd_lane_access
)
3766 STMT_VINFO_DATA_REF (stmt_info
) = dr
;
3767 if (simd_lane_access
)
3769 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info
) = true;
3770 free_data_ref (datarefs
[i
]);
3774 if (TREE_CODE (DR_BASE_ADDRESS (dr
)) == ADDR_EXPR
3775 && VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (dr
), 0))
3776 && DECL_NONALIASED (TREE_OPERAND (DR_BASE_ADDRESS (dr
), 0)))
3778 if (dump_enabled_p ())
3780 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3781 "not vectorized: base object not addressable "
3783 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3785 if (is_a
<bb_vec_info
> (vinfo
))
3787 /* In BB vectorization the ref can still participate
3788 in dependence analysis, we just can't vectorize it. */
3789 STMT_VINFO_VECTORIZABLE (stmt_info
) = false;
3795 /* Set vectype for STMT. */
3796 scalar_type
= TREE_TYPE (DR_REF (dr
));
3797 STMT_VINFO_VECTYPE (stmt_info
)
3798 = get_vectype_for_scalar_type (scalar_type
);
3799 if (!STMT_VINFO_VECTYPE (stmt_info
))
3801 if (dump_enabled_p ())
3803 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3804 "not vectorized: no vectype for stmt: ");
3805 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3806 dump_printf (MSG_MISSED_OPTIMIZATION
, " scalar_type: ");
3807 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_DETAILS
,
3809 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
3812 if (is_a
<bb_vec_info
> (vinfo
))
3814 /* No vector type is fine, the ref can still participate
3815 in dependence analysis, we just can't vectorize it. */
3816 STMT_VINFO_VECTORIZABLE (stmt_info
) = false;
3820 if (gatherscatter
!= SG_NONE
|| simd_lane_access
)
3822 STMT_VINFO_DATA_REF (stmt_info
) = NULL
;
3823 if (gatherscatter
!= SG_NONE
)
3830 if (dump_enabled_p ())
3832 dump_printf_loc (MSG_NOTE
, vect_location
,
3833 "got vectype for stmt: ");
3834 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, stmt
, 0);
3835 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3836 STMT_VINFO_VECTYPE (stmt_info
));
3837 dump_printf (MSG_NOTE
, "\n");
3841 /* Adjust the minimal vectorization factor according to the
3843 vf
= TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info
));
3847 if (gatherscatter
!= SG_NONE
)
3849 gather_scatter_info gs_info
;
3850 if (!vect_check_gather_scatter (stmt
, as_a
<loop_vec_info
> (vinfo
),
3852 || !get_vectype_for_scalar_type (TREE_TYPE (gs_info
.offset
)))
3854 STMT_VINFO_DATA_REF (stmt_info
) = NULL
;
3856 if (dump_enabled_p ())
3858 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3859 (gatherscatter
== GATHER
) ?
3860 "not vectorized: not suitable for gather "
3862 "not vectorized: not suitable for scatter "
3864 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3869 free_data_ref (datarefs
[i
]);
3871 STMT_VINFO_GATHER_SCATTER_P (stmt_info
) = gatherscatter
;
3874 else if (is_a
<loop_vec_info
> (vinfo
)
3875 && TREE_CODE (DR_STEP (dr
)) != INTEGER_CST
)
3877 if (nested_in_vect_loop_p (loop
, stmt
))
3879 if (dump_enabled_p ())
3881 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3882 "not vectorized: not suitable for strided "
3884 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3888 STMT_VINFO_STRIDED_P (stmt_info
) = true;
3892 /* If we stopped analysis at the first dataref we could not analyze
3893 when trying to vectorize a basic-block mark the rest of the datarefs
3894 as not vectorizable and truncate the vector of datarefs. That
3895 avoids spending useless time in analyzing their dependence. */
3896 if (i
!= datarefs
.length ())
3898 gcc_assert (is_a
<bb_vec_info
> (vinfo
));
3899 for (unsigned j
= i
; j
< datarefs
.length (); ++j
)
3901 data_reference_p dr
= datarefs
[j
];
3902 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr
))) = false;
3905 datarefs
.truncate (i
);
/* Function vect_get_new_vect_var.

   Returns a name for a new variable.  The current naming scheme appends the
   prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
   the name of vectorizer generated variables, and appends that to NAME if
   provided.  */
3920 vect_get_new_vect_var (tree type
, enum vect_var_kind var_kind
, const char *name
)
3927 case vect_simple_var
:
3930 case vect_scalar_var
:
3936 case vect_pointer_var
:
3945 char* tmp
= concat (prefix
, "_", name
, NULL
);
3946 new_vect_var
= create_tmp_reg (type
, tmp
);
3950 new_vect_var
= create_tmp_reg (type
, prefix
);
3952 return new_vect_var
;
3955 /* Like vect_get_new_vect_var but return an SSA name. */
3958 vect_get_new_ssa_name (tree type
, enum vect_var_kind var_kind
, const char *name
)
3965 case vect_simple_var
:
3968 case vect_scalar_var
:
3971 case vect_pointer_var
:
3980 char* tmp
= concat (prefix
, "_", name
, NULL
);
3981 new_vect_var
= make_temp_ssa_name (type
, NULL
, tmp
);
3985 new_vect_var
= make_temp_ssa_name (type
, NULL
, prefix
);
3987 return new_vect_var
;
/* Duplicate ptr info and set alignment/misalignment on NAME from DR.  */

static void
vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr,
				  stmt_vec_info stmt_info)
{
  duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
  unsigned int align = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmt_info));
  int misalign = DR_MISALIGNMENT (dr);
  if (misalign == DR_MISALIGNMENT_UNKNOWN)
    mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
  else
    set_ptr_info_alignment (SSA_NAME_PTR_INFO (name), align, misalign);
}
4005 /* Function vect_create_addr_base_for_vector_ref.
4007 Create an expression that computes the address of the first memory location
4008 that will be accessed for a data reference.
4011 STMT: The statement containing the data reference.
4012 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
   OFFSET: Optional.  If supplied, it is added to the initial address.
4014 LOOP: Specify relative to which loop-nest should the address be computed.
4015 For example, when the dataref is in an inner-loop nested in an
4016 outer-loop that is now being vectorized, LOOP can be either the
4017 outer-loop, or the inner-loop. The first memory location accessed
4018 by the following dataref ('in' points to short):
4025 if LOOP=i_loop: &in (relative to i_loop)
4026 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4027 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4028 initial address. Unlike OFFSET, which is number of elements to
4029 be added, BYTE_OFFSET is measured in bytes.
4032 1. Return an SSA_NAME whose value is the address of the memory location of
4033 the first vector of the data reference.
4034 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4035 these statement(s) which define the returned SSA_NAME.
4037 FORNOW: We are only handling array accesses with step 1. */
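
/* Illustrative sketch (not part of the vectorizer): the address arithmetic
   described above, written out on plain integers.  The parameter names are
   hypothetical; ELT_SIZE plays the role of TYPE_SIZE_UNIT of the accessed
   element type, and OFFSET_IN_ELEMENTS and BYTE_OFFSET correspond to the
   two optional arguments, the former counted in elements and the latter in
   bytes.  */

static inline unsigned long
example_first_access_address (unsigned long base_address,
			      unsigned long dr_offset,
			      unsigned long dr_init,
			      unsigned long elt_size,
			      long offset_in_elements,
			      long byte_offset)
{
  /* base_offset = DR offset + constant init ...  */
  unsigned long base_offset = dr_offset + dr_init;
  /* ... plus the element-counted offset scaled by the element size ...  */
  base_offset += offset_in_elements * elt_size;
  /* ... plus the byte-counted offset.  */
  base_offset += byte_offset;
  /* addr_base = base + base_offset.  */
  return base_address + base_offset;
}
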
4040 vect_create_addr_base_for_vector_ref (gimple
*stmt
,
4041 gimple_seq
*new_stmt_list
,
4045 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
4046 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
4047 const char *base_name
;
4050 gimple_seq seq
= NULL
;
4052 tree step
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr
)));
4053 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
4054 innermost_loop_behavior
*drb
= vect_dr_behavior (dr
);
4056 tree data_ref_base
= unshare_expr (drb
->base_address
);
4057 tree base_offset
= unshare_expr (drb
->offset
);
4058 tree init
= unshare_expr (drb
->init
);
4061 base_name
= get_name (data_ref_base
);
4064 base_offset
= ssize_int (0);
4065 init
= ssize_int (0);
4066 base_name
= get_name (DR_REF (dr
));
4069 /* Create base_offset */
4070 base_offset
= size_binop (PLUS_EXPR
,
4071 fold_convert (sizetype
, base_offset
),
4072 fold_convert (sizetype
, init
));
4076 offset
= fold_build2 (MULT_EXPR
, sizetype
,
4077 fold_convert (sizetype
, offset
), step
);
4078 base_offset
= fold_build2 (PLUS_EXPR
, sizetype
,
4079 base_offset
, offset
);
4083 byte_offset
= fold_convert (sizetype
, byte_offset
);
4084 base_offset
= fold_build2 (PLUS_EXPR
, sizetype
,
4085 base_offset
, byte_offset
);
4088 /* base + base_offset */
4090 addr_base
= fold_build_pointer_plus (data_ref_base
, base_offset
);
4093 addr_base
= build1 (ADDR_EXPR
,
4094 build_pointer_type (TREE_TYPE (DR_REF (dr
))),
4095 unshare_expr (DR_REF (dr
)));
4098 vect_ptr_type
= build_pointer_type (STMT_VINFO_VECTYPE (stmt_info
));
4099 dest
= vect_get_new_vect_var (vect_ptr_type
, vect_pointer_var
, base_name
);
4100 addr_base
= force_gimple_operand (addr_base
, &seq
, true, dest
);
4101 gimple_seq_add_seq (new_stmt_list
, seq
);
4103 if (DR_PTR_INFO (dr
)
4104 && TREE_CODE (addr_base
) == SSA_NAME
4105 && !SSA_NAME_PTR_INFO (addr_base
))
4107 vect_duplicate_ssa_name_ptr_info (addr_base
, dr
, stmt_info
);
4108 if (offset
|| byte_offset
)
4109 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base
));
4112 if (dump_enabled_p ())
4114 dump_printf_loc (MSG_NOTE
, vect_location
, "created ");
4115 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, addr_base
);
4116 dump_printf (MSG_NOTE
, "\n");
4123 /* Function vect_create_data_ref_ptr.
4125 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4126 location accessed in the loop by STMT, along with the def-use update
4127 chain to appropriately advance the pointer through the loop iterations.
4128 Also set aliasing information for the pointer. This pointer is used by
4129 the callers to this function to create a memory reference expression for
4130 vector load/store access.
4133 1. STMT: a stmt that references memory. Expected to be of the form
4134 GIMPLE_ASSIGN <name, data-ref> or
4135 GIMPLE_ASSIGN <data-ref, name>.
4136 2. AGGR_TYPE: the type of the reference, which should be either a vector
4138 3. AT_LOOP: the loop where the vector memref is to be created.
4139 4. OFFSET (optional): an offset to be added to the initial address accessed
4140 by the data-ref in STMT.
4141 5. BSI: location where the new stmts are to be placed if there is no loop
4142 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4143 pointing to the initial address.
4144 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4145 to the initial address accessed by the data-ref in STMT. This is
4146 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4150 1. Declare a new ptr to vector_type, and have it point to the base of the
   data reference (initial address accessed by the data reference).
4152 For example, for vector of type V8HI, the following code is generated:
4155 ap = (v8hi *)initial_address;
4157 if OFFSET is not supplied:
4158 initial_address = &a[init];
4159 if OFFSET is supplied:
4160 initial_address = &a[init + OFFSET];
4161 if BYTE_OFFSET is supplied:
4162 initial_address = &a[init] + BYTE_OFFSET;
4164 Return the initial_address in INITIAL_ADDRESS.
4166 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4167 update the pointer in each iteration of the loop.
4169 Return the increment stmt that updates the pointer in PTR_INCR.
4171 3. Set INV_P to true if the access pattern of the data reference in the
4172 vectorized loop is invariant. Set it to false otherwise.
4174 4. Return the pointer. */
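
/* Illustrative sketch (not part of the vectorizer): the shape of the code
   the pointer created by the function below ends up producing for a simple
   contiguous vector load, written as plain GNU C.  The vector type is a
   hypothetical 16-byte v8hi, so the cross-iteration step of the pointer
   equals the aggregate type size, as in steps (3)/(4) above; A is assumed
   to be suitably aligned.  */

typedef short example_v8hi __attribute__ ((vector_size (16)));

static void
example_data_ref_ptr_shape (short *a, example_v8hi *out, int n)
{
  /* (2) the initial address, computed once in the preheader:
	 vp0 = (v8hi *) &a[init];  */
  example_v8hi *vp = (example_v8hi *) a;

  for (int i = 0; i < n / 8; i++)
    {
      out[i] = *vp;	/* the vectorized access through the pointer.  */
      vp = vp + 1;	/* (3)/(4) def-use update: vp2 = vp1 + step.  */
    }
}
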
4177 vect_create_data_ref_ptr (gimple
*stmt
, tree aggr_type
, struct loop
*at_loop
,
4178 tree offset
, tree
*initial_address
,
4179 gimple_stmt_iterator
*gsi
, gimple
**ptr_incr
,
4180 bool only_init
, bool *inv_p
, tree byte_offset
)
4182 const char *base_name
;
4183 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
4184 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
4185 struct loop
*loop
= NULL
;
4186 bool nested_in_vect_loop
= false;
4187 struct loop
*containing_loop
= NULL
;
4191 gimple_seq new_stmt_list
= NULL
;
4195 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
4197 gimple_stmt_iterator incr_gsi
;
4199 tree indx_before_incr
, indx_after_incr
;
4202 bb_vec_info bb_vinfo
= STMT_VINFO_BB_VINFO (stmt_info
);
4204 gcc_assert (TREE_CODE (aggr_type
) == ARRAY_TYPE
4205 || TREE_CODE (aggr_type
) == VECTOR_TYPE
);
4209 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
4210 nested_in_vect_loop
= nested_in_vect_loop_p (loop
, stmt
);
4211 containing_loop
= (gimple_bb (stmt
))->loop_father
;
4212 pe
= loop_preheader_edge (loop
);
4216 gcc_assert (bb_vinfo
);
4221 /* Check the step (evolution) of the load in LOOP, and record
4222 whether it's invariant. */
4223 step
= vect_dr_behavior (dr
)->step
;
4224 if (integer_zerop (step
))
4229 /* Create an expression for the first address accessed by this load
4231 base_name
= get_name (DR_BASE_ADDRESS (dr
));
4233 if (dump_enabled_p ())
4235 tree dr_base_type
= TREE_TYPE (DR_BASE_OBJECT (dr
));
4236 dump_printf_loc (MSG_NOTE
, vect_location
,
4237 "create %s-pointer variable to type: ",
4238 get_tree_code_name (TREE_CODE (aggr_type
)));
4239 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, aggr_type
);
4240 if (TREE_CODE (dr_base_type
) == ARRAY_TYPE
)
4241 dump_printf (MSG_NOTE
, " vectorizing an array ref: ");
4242 else if (TREE_CODE (dr_base_type
) == VECTOR_TYPE
)
4243 dump_printf (MSG_NOTE
, " vectorizing a vector ref: ");
4244 else if (TREE_CODE (dr_base_type
) == RECORD_TYPE
)
4245 dump_printf (MSG_NOTE
, " vectorizing a record based array ref: ");
4247 dump_printf (MSG_NOTE
, " vectorizing a pointer ref: ");
4248 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_BASE_OBJECT (dr
));
4249 dump_printf (MSG_NOTE
, "\n");
4252 /* (1) Create the new aggregate-pointer variable.
4253 Vector and array types inherit the alias set of their component
4254 type by default so we need to use a ref-all pointer if the data
4255 reference does not conflict with the created aggregated data
4256 reference because it is not addressable. */
4257 bool need_ref_all
= false;
4258 if (!alias_sets_conflict_p (get_alias_set (aggr_type
),
4259 get_alias_set (DR_REF (dr
))))
4260 need_ref_all
= true;
4261 /* Likewise for any of the data references in the stmt group. */
4262 else if (STMT_VINFO_GROUP_SIZE (stmt_info
) > 1)
4264 gimple
*orig_stmt
= STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info
);
4267 stmt_vec_info sinfo
= vinfo_for_stmt (orig_stmt
);
4268 struct data_reference
*sdr
= STMT_VINFO_DATA_REF (sinfo
);
4269 if (!alias_sets_conflict_p (get_alias_set (aggr_type
),
4270 get_alias_set (DR_REF (sdr
))))
4272 need_ref_all
= true;
4275 orig_stmt
= STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo
);
4279 aggr_ptr_type
= build_pointer_type_for_mode (aggr_type
, ptr_mode
,
4281 aggr_ptr
= vect_get_new_vect_var (aggr_ptr_type
, vect_pointer_var
, base_name
);
4284 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4285 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4286 def-use update cycles for the pointer: one relative to the outer-loop
4287 (LOOP), which is what steps (3) and (4) below do. The other is relative
4288 to the inner-loop (which is the inner-most loop containing the dataref),
   and this is done by step (5) below.
4291 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4292 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4293 redundant. Steps (3),(4) create the following:
4296 LOOP: vp1 = phi(vp0,vp2)
4302 If there is an inner-loop nested in loop, then step (5) will also be
4303 applied, and an additional update in the inner-loop will be created:
4306 LOOP: vp1 = phi(vp0,vp2)
4308 inner: vp3 = phi(vp1,vp4)
4309 vp4 = vp3 + inner_step
4315 /* (2) Calculate the initial address of the aggregate-pointer, and set
4316 the aggregate-pointer to point to it before the loop. */
4318 /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */
4320 new_temp
= vect_create_addr_base_for_vector_ref (stmt
, &new_stmt_list
,
4321 offset
, byte_offset
);
4326 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, new_stmt_list
);
4327 gcc_assert (!new_bb
);
4330 gsi_insert_seq_before (gsi
, new_stmt_list
, GSI_SAME_STMT
);
4333 *initial_address
= new_temp
;
4334 aggr_ptr_init
= new_temp
;
4336 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4337 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4338 inner-loop nested in LOOP (during outer-loop vectorization). */
4340 /* No update in loop is required. */
4341 if (only_init
&& (!loop_vinfo
|| at_loop
== loop
))
4342 aptr
= aggr_ptr_init
;
4345 /* The step of the aggregate pointer is the type size. */
4346 tree iv_step
= TYPE_SIZE_UNIT (aggr_type
);
4347 /* One exception to the above is when the scalar step of the load in
4348 LOOP is zero. In this case the step here is also zero. */
4350 iv_step
= size_zero_node
;
4351 else if (tree_int_cst_sgn (step
) == -1)
4352 iv_step
= fold_build1 (NEGATE_EXPR
, TREE_TYPE (iv_step
), iv_step
);
4354 standard_iv_increment_position (loop
, &incr_gsi
, &insert_after
);
4356 create_iv (aggr_ptr_init
,
4357 fold_convert (aggr_ptr_type
, iv_step
),
4358 aggr_ptr
, loop
, &incr_gsi
, insert_after
,
4359 &indx_before_incr
, &indx_after_incr
);
4360 incr
= gsi_stmt (incr_gsi
);
4361 set_vinfo_for_stmt (incr
, new_stmt_vec_info (incr
, loop_vinfo
));
4363 /* Copy the points-to information if it exists. */
4364 if (DR_PTR_INFO (dr
))
4366 vect_duplicate_ssa_name_ptr_info (indx_before_incr
, dr
, stmt_info
);
4367 vect_duplicate_ssa_name_ptr_info (indx_after_incr
, dr
, stmt_info
);
4372 aptr
= indx_before_incr
;
4375 if (!nested_in_vect_loop
|| only_init
)
4379 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
   nested in LOOP, if it exists.  */
4382 gcc_assert (nested_in_vect_loop
);
4385 standard_iv_increment_position (containing_loop
, &incr_gsi
,
4387 create_iv (aptr
, fold_convert (aggr_ptr_type
, DR_STEP (dr
)), aggr_ptr
,
4388 containing_loop
, &incr_gsi
, insert_after
, &indx_before_incr
,
4390 incr
= gsi_stmt (incr_gsi
);
4391 set_vinfo_for_stmt (incr
, new_stmt_vec_info (incr
, loop_vinfo
));
4393 /* Copy the points-to information if it exists. */
4394 if (DR_PTR_INFO (dr
))
4396 vect_duplicate_ssa_name_ptr_info (indx_before_incr
, dr
, stmt_info
);
4397 vect_duplicate_ssa_name_ptr_info (indx_after_incr
, dr
, stmt_info
);
4402 return indx_before_incr
;
4409 /* Function bump_vector_ptr
4411 Increment a pointer (to a vector type) by vector-size. If requested,
4412 i.e. if PTR-INCR is given, then also connect the new increment stmt
4413 to the existing def-use update-chain of the pointer, by modifying
4414 the PTR_INCR as illustrated below:
4416 The pointer def-use update-chain before this function:
4417 DATAREF_PTR = phi (p_0, p_2)
4419 PTR_INCR: p_2 = DATAREF_PTR + step
4421 The pointer def-use update-chain after this function:
4422 DATAREF_PTR = phi (p_0, p_2)
4424 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4426 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4429 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4431 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4432 the loop. The increment amount across iterations is expected
4434 BSI - location where the new update stmt is to be placed.
4435 STMT - the original scalar memory-access stmt that is being vectorized.
4436 BUMP - optional. The offset by which to bump the pointer. If not given,
4437 the offset is assumed to be vector_size.
4439 Output: Return NEW_DATAREF_PTR as illustrated above.
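
/* Illustrative sketch (not part of the vectorizer): the effect of the bump
   on a pair of adjacent vector loads, written as plain GNU C.  The second
   load goes through NEW_DATAREF_PTR = DATAREF_PTR + sizeof (vector), and
   the cross-iteration increment is then applied to the bumped pointer,
   matching the def-use chains shown above.  N is assumed to be even.  */

typedef int example_v4si __attribute__ ((vector_size (16)));

static void
example_bumped_loads (example_v4si *p, example_v4si *out, int n)
{
  for (int i = 0; i < n; i += 2)
    {
      out[i] = *p;			/* load via DATAREF_PTR.  */
      example_v4si *p_bumped = p + 1;	/* NEW_DATAREF_PTR = DATAREF_PTR + BUMP.  */
      out[i + 1] = *p_bumped;		/* load via the bumped pointer.  */
      p = p_bumped + 1;			/* PTR_INCR now advances the bumped pointer.  */
    }
}
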
4444 bump_vector_ptr (tree dataref_ptr
, gimple
*ptr_incr
, gimple_stmt_iterator
*gsi
,
4445 gimple
*stmt
, tree bump
)
4447 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
4448 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
4449 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
4450 tree update
= TYPE_SIZE_UNIT (vectype
);
4453 use_operand_p use_p
;
4454 tree new_dataref_ptr
;
4459 if (TREE_CODE (dataref_ptr
) == SSA_NAME
)
4460 new_dataref_ptr
= copy_ssa_name (dataref_ptr
);
4462 new_dataref_ptr
= make_ssa_name (TREE_TYPE (dataref_ptr
));
4463 incr_stmt
= gimple_build_assign (new_dataref_ptr
, POINTER_PLUS_EXPR
,
4464 dataref_ptr
, update
);
4465 vect_finish_stmt_generation (stmt
, incr_stmt
, gsi
);
4467 /* Copy the points-to information if it exists. */
4468 if (DR_PTR_INFO (dr
))
4470 duplicate_ssa_name_ptr_info (new_dataref_ptr
, DR_PTR_INFO (dr
));
4471 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr
));
4475 return new_dataref_ptr
;
4477 /* Update the vector-pointer's cross-iteration increment. */
4478 FOR_EACH_SSA_USE_OPERAND (use_p
, ptr_incr
, iter
, SSA_OP_USE
)
4480 tree use
= USE_FROM_PTR (use_p
);
4482 if (use
== dataref_ptr
)
4483 SET_USE (use_p
, new_dataref_ptr
);
4485 gcc_assert (tree_int_cst_compare (use
, update
) == 0);
4488 return new_dataref_ptr
;
4492 /* Function vect_create_destination_var.
4494 Create a new temporary of type VECTYPE. */
4497 vect_create_destination_var (tree scalar_dest
, tree vectype
)
4503 enum vect_var_kind kind
;
4506 ? VECTOR_BOOLEAN_TYPE_P (vectype
)
4510 type
= vectype
? vectype
: TREE_TYPE (scalar_dest
);
4512 gcc_assert (TREE_CODE (scalar_dest
) == SSA_NAME
);
4514 name
= get_name (scalar_dest
);
4516 new_name
= xasprintf ("%s_%u", name
, SSA_NAME_VERSION (scalar_dest
));
4518 new_name
= xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest
));
4519 vec_dest
= vect_get_new_vect_var (type
, kind
, new_name
);
4525 /* Function vect_grouped_store_supported.
4527 Returns TRUE if interleave high and interleave low permutations
4528 are supported, and FALSE otherwise. */
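
/* Illustrative sketch (not part of the vectorizer): the group-size
   constraint enforced first in the function below — the store interleaving
   scheme only handles groups whose size is 3 or a power of two.  */

static inline bool
example_store_group_size_ok (unsigned HOST_WIDE_INT count)
{
  /* count == 3, or count is a non-zero power of two.  */
  return count == 3 || (count != 0 && (count & (count - 1)) == 0);
}
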
4531 vect_grouped_store_supported (tree vectype
, unsigned HOST_WIDE_INT count
)
4533 machine_mode mode
= TYPE_MODE (vectype
);
4535 /* vect_permute_store_chain requires the group size to be equal to 3 or
4536 be a power of two. */
4537 if (count
!= 3 && exact_log2 (count
) == -1)
4539 if (dump_enabled_p ())
4540 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4541 "the size of the group of accesses"
4542 " is not a power of 2 or not eqaul to 3\n");
4546 /* Check that the permutation is supported. */
4547 if (VECTOR_MODE_P (mode
))
4549 unsigned int i
, nelt
= GET_MODE_NUNITS (mode
);
4550 unsigned char *sel
= XALLOCAVEC (unsigned char, nelt
);
4554 unsigned int j0
= 0, j1
= 0, j2
= 0;
4557 for (j
= 0; j
< 3; j
++)
4559 int nelt0
= ((3 - j
) * nelt
) % 3;
4560 int nelt1
= ((3 - j
) * nelt
+ 1) % 3;
4561 int nelt2
= ((3 - j
) * nelt
+ 2) % 3;
4562 for (i
= 0; i
< nelt
; i
++)
4564 if (3 * i
+ nelt0
< nelt
)
4565 sel
[3 * i
+ nelt0
] = j0
++;
4566 if (3 * i
+ nelt1
< nelt
)
4567 sel
[3 * i
+ nelt1
] = nelt
+ j1
++;
4568 if (3 * i
+ nelt2
< nelt
)
4569 sel
[3 * i
+ nelt2
] = 0;
4571 if (!can_vec_perm_p (mode
, false, sel
))
4573 if (dump_enabled_p ())
4574 dump_printf (MSG_MISSED_OPTIMIZATION
,
4575 "permutaion op not supported by target.\n");
4579 for (i
= 0; i
< nelt
; i
++)
4581 if (3 * i
+ nelt0
< nelt
)
4582 sel
[3 * i
+ nelt0
] = 3 * i
+ nelt0
;
4583 if (3 * i
+ nelt1
< nelt
)
4584 sel
[3 * i
+ nelt1
] = 3 * i
+ nelt1
;
4585 if (3 * i
+ nelt2
< nelt
)
4586 sel
[3 * i
+ nelt2
] = nelt
+ j2
++;
4588 if (!can_vec_perm_p (mode
, false, sel
))
4590 if (dump_enabled_p ())
4591 dump_printf (MSG_MISSED_OPTIMIZATION
,
4592 "permutaion op not supported by target.\n");
4600 /* If length is not equal to 3 then only power of 2 is supported. */
4601 gcc_assert (pow2p_hwi (count
));
4603 for (i
= 0; i
< nelt
/ 2; i
++)
4606 sel
[i
* 2 + 1] = i
+ nelt
;
4608 if (can_vec_perm_p (mode
, false, sel
))
4610 for (i
= 0; i
< nelt
; i
++)
4612 if (can_vec_perm_p (mode
, false, sel
))
4618 if (dump_enabled_p ())
4619 dump_printf (MSG_MISSED_OPTIMIZATION
,
4620 "permutaion op not supported by target.\n");
/* Return TRUE if vec_store_lanes is available for COUNT vectors of
   type VECTYPE.  */

bool
vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
{
  return vect_lanes_optab_supported_p ("vec_store_lanes",
				       vec_store_lanes_optab,
				       vectype, count);
}
4637 /* Function vect_permute_store_chain.
4639 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4640 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4641 the data correctly for the stores. Return the final references for stores
4644 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4645 The input is 4 vectors each containing 8 elements. We assign a number to
4646 each element, the input sequence is:
4648 1st vec: 0 1 2 3 4 5 6 7
4649 2nd vec: 8 9 10 11 12 13 14 15
4650 3rd vec: 16 17 18 19 20 21 22 23
4651 4th vec: 24 25 26 27 28 29 30 31
4653 The output sequence should be:
4655 1st vec: 0 8 16 24 1 9 17 25
4656 2nd vec: 2 10 18 26 3 11 19 27
   3rd vec: 4 12 20 28 5 13 21 29
4658 4th vec: 6 14 22 30 7 15 23 31
4660 i.e., we interleave the contents of the four vectors in their order.
4662 We use interleave_high/low instructions to create such output. The input of
4663 each interleave_high/low operation is two vectors:
4666 the even elements of the result vector are obtained left-to-right from the
4667 high/low elements of the first vector. The odd elements of the result are
4668 obtained left-to-right from the high/low elements of the second vector.
4669 The output of interleave_high will be: 0 4 1 5
4670 and of interleave_low: 2 6 3 7
4673 The permutation is done in log LENGTH stages. In each stage interleave_high
4674 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4675 where the first argument is taken from the first half of DR_CHAIN and the
   second argument from its second half.
4679 I1: interleave_high (1st vec, 3rd vec)
4680 I2: interleave_low (1st vec, 3rd vec)
4681 I3: interleave_high (2nd vec, 4th vec)
4682 I4: interleave_low (2nd vec, 4th vec)
4684 The output for the first stage is:
4686 I1: 0 16 1 17 2 18 3 19
4687 I2: 4 20 5 21 6 22 7 23
4688 I3: 8 24 9 25 10 26 11 27
4689 I4: 12 28 13 29 14 30 15 31
4691 The output of the second stage, i.e. the final result is:
4693 I1: 0 8 16 24 1 9 17 25
4694 I2: 2 10 18 26 3 11 19 27
   I3: 4 12 20 28 5 13 21 29
4696 I4: 6 14 22 30 7 15 23 31. */
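
/* Illustrative sketch (not part of the vectorizer): how the two
   power-of-two selectors built below act on plain arrays.  For eight
   elements per vector the "interleave high" selector is
   {0, 8, 1, 9, 2, 10, 3, 11} and the "interleave low" selector is the same
   one with nelt/2 added to every index, {4, 12, 5, 13, 6, 14, 7, 15}.
   Applied to the 1st and 3rd input vectors of the example above this
   produces exactly I1 and I2 of the first stage.  */

static void
example_interleave_step (const int *vect1, const int *vect2,
			 int *high, int *low)
{
  const unsigned nelt = 8;
  unsigned char sel[8];
  unsigned i;

  /* sel = {0, nelt, 1, nelt + 1, ...}: the interleave-high selector.  */
  for (i = 0; i < nelt / 2; i++)
    {
      sel[i * 2] = i;
      sel[i * 2 + 1] = i + nelt;
    }
  for (i = 0; i < nelt; i++)
    high[i] = sel[i] < nelt ? vect1[sel[i]] : vect2[sel[i] - nelt];

  /* Adding nelt/2 to every index turns it into the interleave-low
     selector.  */
  for (i = 0; i < nelt; i++)
    sel[i] += nelt / 2;
  for (i = 0; i < nelt; i++)
    low[i] = sel[i] < nelt ? vect1[sel[i]] : vect2[sel[i] - nelt];
}
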
4699 vect_permute_store_chain (vec
<tree
> dr_chain
,
4700 unsigned int length
,
4702 gimple_stmt_iterator
*gsi
,
4703 vec
<tree
> *result_chain
)
4705 tree vect1
, vect2
, high
, low
;
4707 tree vectype
= STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt
));
4708 tree perm_mask_low
, perm_mask_high
;
4710 tree perm3_mask_low
, perm3_mask_high
;
4711 unsigned int i
, n
, log_length
= exact_log2 (length
);
4712 unsigned int j
, nelt
= TYPE_VECTOR_SUBPARTS (vectype
);
4713 unsigned char *sel
= XALLOCAVEC (unsigned char, nelt
);
4715 result_chain
->quick_grow (length
);
4716 memcpy (result_chain
->address (), dr_chain
.address (),
4717 length
* sizeof (tree
));
4721 unsigned int j0
= 0, j1
= 0, j2
= 0;
4723 for (j
= 0; j
< 3; j
++)
4725 int nelt0
= ((3 - j
) * nelt
) % 3;
4726 int nelt1
= ((3 - j
) * nelt
+ 1) % 3;
4727 int nelt2
= ((3 - j
) * nelt
+ 2) % 3;
4729 for (i
= 0; i
< nelt
; i
++)
4731 if (3 * i
+ nelt0
< nelt
)
4732 sel
[3 * i
+ nelt0
] = j0
++;
4733 if (3 * i
+ nelt1
< nelt
)
4734 sel
[3 * i
+ nelt1
] = nelt
+ j1
++;
4735 if (3 * i
+ nelt2
< nelt
)
4736 sel
[3 * i
+ nelt2
] = 0;
4738 perm3_mask_low
= vect_gen_perm_mask_checked (vectype
, sel
);
4740 for (i
= 0; i
< nelt
; i
++)
4742 if (3 * i
+ nelt0
< nelt
)
4743 sel
[3 * i
+ nelt0
] = 3 * i
+ nelt0
;
4744 if (3 * i
+ nelt1
< nelt
)
4745 sel
[3 * i
+ nelt1
] = 3 * i
+ nelt1
;
4746 if (3 * i
+ nelt2
< nelt
)
4747 sel
[3 * i
+ nelt2
] = nelt
+ j2
++;
4749 perm3_mask_high
= vect_gen_perm_mask_checked (vectype
, sel
);
4751 vect1
= dr_chain
[0];
4752 vect2
= dr_chain
[1];
4754 /* Create interleaving stmt:
4755 low = VEC_PERM_EXPR <vect1, vect2,
4756 {j, nelt, *, j + 1, nelt + j + 1, *,
4757 j + 2, nelt + j + 2, *, ...}> */
4758 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3_low");
4759 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, vect1
,
4760 vect2
, perm3_mask_low
);
4761 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
4764 vect2
= dr_chain
[2];
4765 /* Create interleaving stmt:
4766 low = VEC_PERM_EXPR <vect1, vect2,
4767 {0, 1, nelt + j, 3, 4, nelt + j + 1,
4768 6, 7, nelt + j + 2, ...}> */
4769 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3_high");
4770 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, vect1
,
4771 vect2
, perm3_mask_high
);
4772 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
4773 (*result_chain
)[j
] = data_ref
;
4778 /* If length is not equal to 3 then only power of 2 is supported. */
4779 gcc_assert (pow2p_hwi (length
));
4781 for (i
= 0, n
= nelt
/ 2; i
< n
; i
++)
4784 sel
[i
* 2 + 1] = i
+ nelt
;
4786 perm_mask_high
= vect_gen_perm_mask_checked (vectype
, sel
);
4788 for (i
= 0; i
< nelt
; i
++)
4790 perm_mask_low
= vect_gen_perm_mask_checked (vectype
, sel
);
4792 for (i
= 0, n
= log_length
; i
< n
; i
++)
4794 for (j
= 0; j
< length
/2; j
++)
4796 vect1
= dr_chain
[j
];
4797 vect2
= dr_chain
[j
+length
/2];
4799 /* Create interleaving stmt:
4800 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
4802 high
= make_temp_ssa_name (vectype
, NULL
, "vect_inter_high");
4803 perm_stmt
= gimple_build_assign (high
, VEC_PERM_EXPR
, vect1
,
4804 vect2
, perm_mask_high
);
4805 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
4806 (*result_chain
)[2*j
] = high
;
4808 /* Create interleaving stmt:
4809 low = VEC_PERM_EXPR <vect1, vect2,
4810 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
4812 low
= make_temp_ssa_name (vectype
, NULL
, "vect_inter_low");
4813 perm_stmt
= gimple_build_assign (low
, VEC_PERM_EXPR
, vect1
,
4814 vect2
, perm_mask_low
);
4815 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
4816 (*result_chain
)[2*j
+1] = low
;
4818 memcpy (dr_chain
.address (), result_chain
->address (),
4819 length
* sizeof (tree
));
4824 /* Function vect_setup_realignment
4826 This function is called when vectorizing an unaligned load using
4827 the dr_explicit_realign[_optimized] scheme.
4828 This function generates the following code at the loop prolog:
4831 x msq_init = *(floor(p)); # prolog load
4832 realignment_token = call target_builtin;
4834 x msq = phi (msq_init, ---)
4836 The stmts marked with x are generated only for the case of
4837 dr_explicit_realign_optimized.
4839 The code above sets up a new (vector) pointer, pointing to the first
4840 location accessed by STMT, and a "floor-aligned" load using that pointer.
4841 It also generates code to compute the "realignment-token" (if the relevant
4842 target hook was defined), and creates a phi-node at the loop-header bb
4843 whose arguments are the result of the prolog-load (created by this
4844 function) and the result of a load that takes place in the loop (to be
4845 created by the caller to this function).
4847 For the case of dr_explicit_realign_optimized:
4848 The caller to this function uses the phi-result (msq) to create the
4849 realignment code inside the loop, and sets up the missing phi argument,
4852 msq = phi (msq_init, lsq)
4853 lsq = *(floor(p')); # load in loop
4854 result = realign_load (msq, lsq, realignment_token);
4856 For the case of dr_explicit_realign:
4858 msq = *(floor(p)); # load in loop
4860 lsq = *(floor(p')); # load in loop
4861 result = realign_load (msq, lsq, realignment_token);
4864 STMT - (scalar) load stmt to be vectorized. This load accesses
4865 a memory location that may be unaligned.
4866 BSI - place where new code is to be inserted.
4867 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4871 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4872 target hook, if defined.
4873 Return value - the result of the loop-header phi node. */
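
/* Illustrative sketch (not part of the vectorizer): the two building blocks
   of the schemes described above, written as plain C.  The first helper
   mirrors the BIT_AND_EXPR generated below to "floor" an address to the
   vector boundary (VECSIZE is assumed to be a power of two and pointers are
   assumed to fit in unsigned long).  The second helper shows the intended
   effect of REALIGN_LOAD for one common byte ordering: it selects VECSIZE
   contiguous bytes, starting at the misalignment offset, out of the
   concatenation of the two aligned loads.  */

static inline const void *
example_floor_align (const void *p, unsigned long vecsize)
{
  return (const void *) ((unsigned long) p & -vecsize);
}

static void
example_realign_load (const unsigned char *msq, const unsigned char *lsq,
		      unsigned misalign, unsigned vecsize,
		      unsigned char *result)
{
  for (unsigned i = 0; i < vecsize; i++)
    result[i] = misalign + i < vecsize ? msq[misalign + i]
				       : lsq[misalign + i - vecsize];
}
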
4876 vect_setup_realignment (gimple
*stmt
, gimple_stmt_iterator
*gsi
,
4877 tree
*realignment_token
,
4878 enum dr_alignment_support alignment_support_scheme
,
4880 struct loop
**at_loop
)
4882 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
4883 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
4884 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
4885 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
4886 struct loop
*loop
= NULL
;
4888 tree scalar_dest
= gimple_assign_lhs (stmt
);
4894 tree msq_init
= NULL_TREE
;
4897 tree msq
= NULL_TREE
;
4898 gimple_seq stmts
= NULL
;
4900 bool compute_in_loop
= false;
4901 bool nested_in_vect_loop
= false;
4902 struct loop
*containing_loop
= (gimple_bb (stmt
))->loop_father
;
4903 struct loop
*loop_for_initial_load
= NULL
;
4907 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
4908 nested_in_vect_loop
= nested_in_vect_loop_p (loop
, stmt
);
4911 gcc_assert (alignment_support_scheme
== dr_explicit_realign
4912 || alignment_support_scheme
== dr_explicit_realign_optimized
);
4914 /* We need to generate three things:
4915 1. the misalignment computation
4916 2. the extra vector load (for the optimized realignment scheme).
4917 3. the phi node for the two vectors from which the realignment is
4918 done (for the optimized realignment scheme). */
4920 /* 1. Determine where to generate the misalignment computation.
4922 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4923 calculation will be generated by this function, outside the loop (in the
4924 preheader). Otherwise, INIT_ADDR had already been computed for us by the
4925 caller, inside the loop.
4927 Background: If the misalignment remains fixed throughout the iterations of
4928 the loop, then both realignment schemes are applicable, and also the
4929 misalignment computation can be done outside LOOP. This is because we are
4930 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4931 are a multiple of VS (the Vector Size), and therefore the misalignment in
4932 different vectorized LOOP iterations is always the same.
4933 The problem arises only if the memory access is in an inner-loop nested
4934 inside LOOP, which is now being vectorized using outer-loop vectorization.
4935 This is the only case when the misalignment of the memory access may not
4936 remain fixed throughout the iterations of the inner-loop (as explained in
4937 detail in vect_supportable_dr_alignment). In this case, not only is the
4938 optimized realignment scheme not applicable, but also the misalignment
4939 computation (and generation of the realignment token that is passed to
4940 REALIGN_LOAD) have to be done inside the loop.
4942 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4943 or not, which in turn determines if the misalignment is computed inside
4944 the inner-loop, or outside LOOP. */
4946 if (init_addr
!= NULL_TREE
|| !loop_vinfo
)
4948 compute_in_loop
= true;
4949 gcc_assert (alignment_support_scheme
== dr_explicit_realign
);
4953 /* 2. Determine where to generate the extra vector load.
4955 For the optimized realignment scheme, instead of generating two vector
4956 loads in each iteration, we generate a single extra vector load in the
4957 preheader of the loop, and in each iteration reuse the result of the
4958 vector load from the previous iteration. In case the memory access is in
4959 an inner-loop nested inside LOOP, which is now being vectorized using
4960 outer-loop vectorization, we need to determine whether this initial vector
4961 load should be generated at the preheader of the inner-loop, or can be
4962 generated at the preheader of LOOP. If the memory access has no evolution
4963 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
4964 to be generated inside LOOP (in the preheader of the inner-loop). */
4966 if (nested_in_vect_loop
)
4968 tree outerloop_step
= STMT_VINFO_DR_STEP (stmt_info
);
4969 bool invariant_in_outerloop
=
4970 (tree_int_cst_compare (outerloop_step
, size_zero_node
) == 0);
4971 loop_for_initial_load
= (invariant_in_outerloop
? loop
: loop
->inner
);
4974 loop_for_initial_load
= loop
;
4976 *at_loop
= loop_for_initial_load
;
4978 if (loop_for_initial_load
)
4979 pe
= loop_preheader_edge (loop_for_initial_load
);
4981 /* 3. For the case of the optimized realignment, create the first vector
4982 load at the loop preheader. */
4984 if (alignment_support_scheme
== dr_explicit_realign_optimized
)
4986 /* Create msq_init = *(floor(p1)) in the loop preheader */
4989 gcc_assert (!compute_in_loop
);
4990 vec_dest
= vect_create_destination_var (scalar_dest
, vectype
);
4991 ptr
= vect_create_data_ref_ptr (stmt
, vectype
, loop_for_initial_load
,
4992 NULL_TREE
, &init_addr
, NULL
, &inc
,
4994 if (TREE_CODE (ptr
) == SSA_NAME
)
4995 new_temp
= copy_ssa_name (ptr
);
4997 new_temp
= make_ssa_name (TREE_TYPE (ptr
));
4998 new_stmt
= gimple_build_assign
4999 (new_temp
, BIT_AND_EXPR
, ptr
,
5000 build_int_cst (TREE_TYPE (ptr
),
5001 -(HOST_WIDE_INT
)TYPE_ALIGN_UNIT (vectype
)));
5002 new_bb
= gsi_insert_on_edge_immediate (pe
, new_stmt
);
5003 gcc_assert (!new_bb
);
5005 = build2 (MEM_REF
, TREE_TYPE (vec_dest
), new_temp
,
5006 build_int_cst (reference_alias_ptr_type (DR_REF (dr
)), 0));
5007 new_stmt
= gimple_build_assign (vec_dest
, data_ref
);
5008 new_temp
= make_ssa_name (vec_dest
, new_stmt
);
5009 gimple_assign_set_lhs (new_stmt
, new_temp
);
5012 new_bb
= gsi_insert_on_edge_immediate (pe
, new_stmt
);
5013 gcc_assert (!new_bb
);
5016 gsi_insert_before (gsi
, new_stmt
, GSI_SAME_STMT
);
5018 msq_init
= gimple_assign_lhs (new_stmt
);
5021 /* 4. Create realignment token using a target builtin, if available.
5022 It is done either inside the containing loop, or before LOOP (as
5023 determined above). */
5025 if (targetm
.vectorize
.builtin_mask_for_load
)
      /* Compute INIT_ADDR - the initial address accessed by this memref.  */
5033 /* Generate the INIT_ADDR computation outside LOOP. */
5034 init_addr
= vect_create_addr_base_for_vector_ref (stmt
, &stmts
,
5038 pe
= loop_preheader_edge (loop
);
5039 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, stmts
);
5040 gcc_assert (!new_bb
);
5043 gsi_insert_seq_before (gsi
, stmts
, GSI_SAME_STMT
);
5046 builtin_decl
= targetm
.vectorize
.builtin_mask_for_load ();
5047 new_stmt
= gimple_build_call (builtin_decl
, 1, init_addr
);
5049 vect_create_destination_var (scalar_dest
,
5050 gimple_call_return_type (new_stmt
));
5051 new_temp
= make_ssa_name (vec_dest
, new_stmt
);
5052 gimple_call_set_lhs (new_stmt
, new_temp
);
5054 if (compute_in_loop
)
5055 gsi_insert_before (gsi
, new_stmt
, GSI_SAME_STMT
);
5058 /* Generate the misalignment computation outside LOOP. */
5059 pe
= loop_preheader_edge (loop
);
5060 new_bb
= gsi_insert_on_edge_immediate (pe
, new_stmt
);
5061 gcc_assert (!new_bb
);
5064 *realignment_token
= gimple_call_lhs (new_stmt
);
5066 /* The result of the CALL_EXPR to this builtin is determined from
5067 the value of the parameter and no global variables are touched
5068 which makes the builtin a "const" function. Requiring the
5069 builtin to have the "const" attribute makes it unnecessary
5070 to call mark_call_clobbered. */
5071 gcc_assert (TREE_READONLY (builtin_decl
));
5074 if (alignment_support_scheme
== dr_explicit_realign
)
5077 gcc_assert (!compute_in_loop
);
5078 gcc_assert (alignment_support_scheme
== dr_explicit_realign_optimized
);
5081 /* 5. Create msq = phi <msq_init, lsq> in loop */
5083 pe
= loop_preheader_edge (containing_loop
);
5084 vec_dest
= vect_create_destination_var (scalar_dest
, vectype
);
5085 msq
= make_ssa_name (vec_dest
);
5086 phi_stmt
= create_phi_node (msq
, containing_loop
->header
);
5087 add_phi_arg (phi_stmt
, msq_init
, pe
, UNKNOWN_LOCATION
);
5093 /* Function vect_grouped_load_supported.
5095 COUNT is the size of the load group (the number of statements plus the
5096 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5097 only one statement, with a gap of COUNT - 1.
5099 Returns true if a suitable permute exists. */
5102 vect_grouped_load_supported (tree vectype
, bool single_element_p
,
5103 unsigned HOST_WIDE_INT count
)
5105 machine_mode mode
= TYPE_MODE (vectype
);
5107 /* If this is single-element interleaving with an element distance
5108 that leaves unused vector loads around punt - we at least create
5109 very sub-optimal code in that case (and blow up memory,
5111 if (single_element_p
&& count
> TYPE_VECTOR_SUBPARTS (vectype
))
5113 if (dump_enabled_p ())
5114 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5115 "single-element interleaving not supported "
5116 "for not adjacent vector loads\n");
5120 /* vect_permute_load_chain requires the group size to be equal to 3 or
5121 be a power of two. */
5122 if (count
!= 3 && exact_log2 (count
) == -1)
5124 if (dump_enabled_p ())
5125 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5126 "the size of the group of accesses"
5127 " is not a power of 2 or not equal to 3\n");
5131 /* Check that the permutation is supported. */
5132 if (VECTOR_MODE_P (mode
))
5134 unsigned int i
, j
, nelt
= GET_MODE_NUNITS (mode
);
5135 unsigned char *sel
= XALLOCAVEC (unsigned char, nelt
);
5140 for (k
= 0; k
< 3; k
++)
5142 for (i
= 0; i
< nelt
; i
++)
5143 if (3 * i
+ k
< 2 * nelt
)
5147 if (!can_vec_perm_p (mode
, false, sel
))
5149 if (dump_enabled_p ())
5150 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5151 "shuffle of 3 loads is not supported by"
5155 for (i
= 0, j
= 0; i
< nelt
; i
++)
5156 if (3 * i
+ k
< 2 * nelt
)
5159 sel
[i
] = nelt
+ ((nelt
+ k
) % 3) + 3 * (j
++);
5160 if (!can_vec_perm_p (mode
, false, sel
))
5162 if (dump_enabled_p ())
5163 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5164 "shuffle of 3 loads is not supported by"
5173 /* If length is not equal to 3 then only power of 2 is supported. */
5174 gcc_assert (pow2p_hwi (count
));
5175 for (i
= 0; i
< nelt
; i
++)
5177 if (can_vec_perm_p (mode
, false, sel
))
5179 for (i
= 0; i
< nelt
; i
++)
5181 if (can_vec_perm_p (mode
, false, sel
))
5187 if (dump_enabled_p ())
5188 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5189 "extract even/odd not supported by target\n");
/* Return TRUE if vec_load_lanes is available for COUNT vectors of
   type VECTYPE.  */

bool
vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
{
  return vect_lanes_optab_supported_p ("vec_load_lanes",
				       vec_load_lanes_optab,
				       vectype, count);
}
5204 /* Function vect_permute_load_chain.
5206 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5207 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5208 the input data correctly. Return the final references for loads in
5211 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5212 The input is 4 vectors each containing 8 elements. We assign a number to each
5213 element, the input sequence is:
5215 1st vec: 0 1 2 3 4 5 6 7
5216 2nd vec: 8 9 10 11 12 13 14 15
5217 3rd vec: 16 17 18 19 20 21 22 23
5218 4th vec: 24 25 26 27 28 29 30 31
5220 The output sequence should be:
5222 1st vec: 0 4 8 12 16 20 24 28
5223 2nd vec: 1 5 9 13 17 21 25 29
5224 3rd vec: 2 6 10 14 18 22 26 30
5225 4th vec: 3 7 11 15 19 23 27 31
5227 i.e., the first output vector should contain the first elements of each
5228 interleaving group, etc.
5230 We use extract_even/odd instructions to create such output. The input of
5231 each extract_even/odd operation is two vectors
5235 and the output is the vector of extracted even/odd elements. The output of
5236 extract_even will be: 0 2 4 6
5237 and of extract_odd: 1 3 5 7
5240 The permutation is done in log LENGTH stages. In each stage extract_even
5241 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5242 their order. In our example,
5244 E1: extract_even (1st vec, 2nd vec)
5245 E2: extract_odd (1st vec, 2nd vec)
5246 E3: extract_even (3rd vec, 4th vec)
5247 E4: extract_odd (3rd vec, 4th vec)
5249 The output for the first stage will be:
5251 E1: 0 2 4 6 8 10 12 14
5252 E2: 1 3 5 7 9 11 13 15
5253 E3: 16 18 20 22 24 26 28 30
5254 E4: 17 19 21 23 25 27 29 31
5256 In order to proceed and create the correct sequence for the next stage (or
5257 for the correct output, if the second stage is the last one, as in our
5258 example), we first put the output of extract_even operation and then the
5259 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5260 The input for the second stage is:
5262 1st vec (E1): 0 2 4 6 8 10 12 14
5263 2nd vec (E3): 16 18 20 22 24 26 28 30
5264 3rd vec (E2): 1 3 5 7 9 11 13 15
5265 4th vec (E4): 17 19 21 23 25 27 29 31
5267 The output of the second stage:
5269 E1: 0 4 8 12 16 20 24 28
5270 E2: 2 6 10 14 18 22 26 30
5271 E3: 1 5 9 13 17 21 25 29
5272 E4: 3 7 11 15 19 23 27 31
5274 And RESULT_CHAIN after reordering:
5276 1st vec (E1): 0 4 8 12 16 20 24 28
5277 2nd vec (E3): 1 5 9 13 17 21 25 29
5278 3rd vec (E2): 2 6 10 14 18 22 26 30
5279 4th vec (E4): 3 7 11 15 19 23 27 31. */
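
/* Illustrative sketch (not part of the vectorizer): the extract-even and
   extract-odd selectors used below for the power-of-two case, applied to
   plain arrays.  For eight elements per vector the selectors are
   {0, 2, 4, ..., 14} and {1, 3, 5, ..., 15}, both indexing into the
   concatenation of the two input vectors; applied to the 1st and 2nd input
   vectors of the example above they produce E1 and E2.  */

static void
example_extract_even_odd (const int *first, const int *second,
			  int *even, int *odd)
{
  const unsigned nelt = 8;

  for (unsigned i = 0; i < nelt; i++)
    {
      unsigned e = i * 2;	/* even selector: sel[i] = i * 2.  */
      unsigned o = i * 2 + 1;	/* odd selector: sel[i] = i * 2 + 1.  */
      even[i] = e < nelt ? first[e] : second[e - nelt];
      odd[i]  = o < nelt ? first[o] : second[o - nelt];
    }
}
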
5282 vect_permute_load_chain (vec
<tree
> dr_chain
,
5283 unsigned int length
,
5285 gimple_stmt_iterator
*gsi
,
5286 vec
<tree
> *result_chain
)
5288 tree data_ref
, first_vect
, second_vect
;
5289 tree perm_mask_even
, perm_mask_odd
;
5290 tree perm3_mask_low
, perm3_mask_high
;
5292 tree vectype
= STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt
));
5293 unsigned int i
, j
, log_length
= exact_log2 (length
);
5294 unsigned nelt
= TYPE_VECTOR_SUBPARTS (vectype
);
5295 unsigned char *sel
= XALLOCAVEC (unsigned char, nelt
);
5297 result_chain
->quick_grow (length
);
5298 memcpy (result_chain
->address (), dr_chain
.address (),
5299 length
* sizeof (tree
));
5305 for (k
= 0; k
< 3; k
++)
5307 for (i
= 0; i
< nelt
; i
++)
5308 if (3 * i
+ k
< 2 * nelt
)
5312 perm3_mask_low
= vect_gen_perm_mask_checked (vectype
, sel
);
5314 for (i
= 0, j
= 0; i
< nelt
; i
++)
5315 if (3 * i
+ k
< 2 * nelt
)
5318 sel
[i
] = nelt
+ ((nelt
+ k
) % 3) + 3 * (j
++);
5320 perm3_mask_high
= vect_gen_perm_mask_checked (vectype
, sel
);
5322 first_vect
= dr_chain
[0];
5323 second_vect
= dr_chain
[1];
5325 /* Create interleaving stmt (low part of):
5326 low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5328 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3_low");
5329 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, first_vect
,
5330 second_vect
, perm3_mask_low
);
5331 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5333 /* Create interleaving stmt (high part of):
5334 high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5336 first_vect
= data_ref
;
5337 second_vect
= dr_chain
[2];
5338 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3_high");
5339 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, first_vect
,
5340 second_vect
, perm3_mask_high
);
5341 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5342 (*result_chain
)[k
] = data_ref
;
5347 /* If length is not equal to 3 then only power of 2 is supported. */
5348 gcc_assert (pow2p_hwi (length
));
5350 for (i
= 0; i
< nelt
; ++i
)
5352 perm_mask_even
= vect_gen_perm_mask_checked (vectype
, sel
);
5354 for (i
= 0; i
< nelt
; ++i
)
5356 perm_mask_odd
= vect_gen_perm_mask_checked (vectype
, sel
);
5358 for (i
= 0; i
< log_length
; i
++)
5360 for (j
= 0; j
< length
; j
+= 2)
5362 first_vect
= dr_chain
[j
];
5363 second_vect
= dr_chain
[j
+1];
5365 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5366 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_perm_even");
5367 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5368 first_vect
, second_vect
,
5370 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5371 (*result_chain
)[j
/2] = data_ref
;
5373 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5374 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_perm_odd");
5375 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5376 first_vect
, second_vect
,
5378 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5379 (*result_chain
)[j
/2+length
/2] = data_ref
;
5381 memcpy (dr_chain
.address (), result_chain
->address (),
5382 length
* sizeof (tree
));
5387 /* Function vect_shift_permute_load_chain.
5389 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5390 sequence of stmts to reorder the input data accordingly.
5391 Return the final references for loads in RESULT_CHAIN.
   Return true if successful, false otherwise.
5394 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5395 The input is 3 vectors each containing 8 elements. We assign a
5396 number to each element, the input sequence is:
5398 1st vec: 0 1 2 3 4 5 6 7
5399 2nd vec: 8 9 10 11 12 13 14 15
5400 3rd vec: 16 17 18 19 20 21 22 23
5402 The output sequence should be:
5404 1st vec: 0 3 6 9 12 15 18 21
5405 2nd vec: 1 4 7 10 13 16 19 22
5406 3rd vec: 2 5 8 11 14 17 20 23
5408 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5410 First we shuffle all 3 vectors to get correct elements order:
5412 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
5413 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
5414 3rd vec: (16 19 22) (17 20 23) (18 21)
5416 Next we unite and shift vector 3 times:
5419 shift right by 6 the concatenation of:
5420 "1st vec" and "2nd vec"
5421 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5422 "2nd vec" and "3rd vec"
5423 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5424 "3rd vec" and "1st vec"
5425 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
5428 So that now new vectors are:
5430 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
5431 2nd vec: (10 13) (16 19 22) (17 20 23)
5432 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
5435 shift right by 5 the concatenation of:
5436 "1st vec" and "3rd vec"
5437 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
5438 "2nd vec" and "1st vec"
5439 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
5440 "3rd vec" and "2nd vec"
5441 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
5444 So that now new vectors are:
5446 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
5447 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
5448 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
5451 shift right by 5 the concatenation of:
5452 "1st vec" and "1st vec"
5453 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
5454 shift right by 3 the concatenation of:
5455 "2nd vec" and "2nd vec"
5456 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
5459 So that now all vectors are READY:
5460 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
5461 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
5462 3rd vec: ( 1 4 7) (10 13) (16 19 22)
   This algorithm is faster than the one in vect_permute_load_chain if:
   1. "shift of a concatenation" is faster than general permutation.
5467 2. The TARGET machine can't execute vector instructions in parallel.
5468 This is because each step of the algorithm depends on previous.
5469 The algorithm in vect_permute_load_chain is much more parallel.
5471 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
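
/* Illustrative sketch (not part of the vectorizer): the "shift of a
   concatenation" primitive this scheme is built from.  A selector of the
   form {s, s + 1, ..., s + nelt - 1} applied to two vectors picks nelt
   consecutive elements starting at position s of their concatenation; for
   eight elements and s == nelt / 2 this is the {4 5 6 7 8 9 10 11}
   constant generated below for the power-of-two case.  S is assumed to be
   at most nelt.  */

static void
example_shift_concat (const int *first, const int *second,
		      unsigned s, int *result)
{
  const unsigned nelt = 8;

  for (unsigned i = 0; i < nelt; i++)
    {
      unsigned idx = s + i;	/* index into the 2 * nelt concatenation.  */
      result[i] = idx < nelt ? first[idx] : second[idx - nelt];
    }
}
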
5475 vect_shift_permute_load_chain (vec
<tree
> dr_chain
,
5476 unsigned int length
,
5478 gimple_stmt_iterator
*gsi
,
5479 vec
<tree
> *result_chain
)
5481 tree vect
[3], vect_shift
[3], data_ref
, first_vect
, second_vect
;
5482 tree perm2_mask1
, perm2_mask2
, perm3_mask
;
5483 tree select_mask
, shift1_mask
, shift2_mask
, shift3_mask
, shift4_mask
;
5486 tree vectype
= STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt
));
5488 unsigned nelt
= TYPE_VECTOR_SUBPARTS (vectype
);
5489 unsigned char *sel
= XALLOCAVEC (unsigned char, nelt
);
5490 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
5491 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
5493 result_chain
->quick_grow (length
);
5494 memcpy (result_chain
->address (), dr_chain
.address (),
5495 length
* sizeof (tree
));
5497 if (pow2p_hwi (length
) && LOOP_VINFO_VECT_FACTOR (loop_vinfo
) > 4)
5499 unsigned int j
, log_length
= exact_log2 (length
);
5500 for (i
= 0; i
< nelt
/ 2; ++i
)
5502 for (i
= 0; i
< nelt
/ 2; ++i
)
5503 sel
[nelt
/ 2 + i
] = i
* 2 + 1;
5504 if (!can_vec_perm_p (TYPE_MODE (vectype
), false, sel
))
5506 if (dump_enabled_p ())
5507 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5508 "shuffle of 2 fields structure is not \
5509 supported by target\n");
5512 perm2_mask1
= vect_gen_perm_mask_checked (vectype
, sel
);
5514 for (i
= 0; i
< nelt
/ 2; ++i
)
5516 for (i
= 0; i
< nelt
/ 2; ++i
)
5517 sel
[nelt
/ 2 + i
] = i
* 2;
5518 if (!can_vec_perm_p (TYPE_MODE (vectype
), false, sel
))
5520 if (dump_enabled_p ())
5521 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5522 "shuffle of 2 fields structure is not \
5523 supported by target\n");
5526 perm2_mask2
= vect_gen_perm_mask_checked (vectype
, sel
);
5528 /* Generating permutation constant to shift all elements.
5529 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
5530 for (i
= 0; i
< nelt
; i
++)
5531 sel
[i
] = nelt
/ 2 + i
;
5532 if (!can_vec_perm_p (TYPE_MODE (vectype
), false, sel
))
5534 if (dump_enabled_p ())
5535 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5536 "shift permutation is not supported by target\n");
5539 shift1_mask
= vect_gen_perm_mask_checked (vectype
, sel
);
5541 /* Generating permutation constant to select vector from 2.
5542 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
5543 for (i
= 0; i
< nelt
/ 2; i
++)
5545 for (i
= nelt
/ 2; i
< nelt
; i
++)
5547 if (!can_vec_perm_p (TYPE_MODE (vectype
), false, sel
))
5549 if (dump_enabled_p ())
5550 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5551 "select is not supported by target\n");
5554 select_mask
= vect_gen_perm_mask_checked (vectype
, sel
);
5556 for (i
= 0; i
< log_length
; i
++)
5558 for (j
= 0; j
< length
; j
+= 2)
5560 first_vect
= dr_chain
[j
];
5561 second_vect
= dr_chain
[j
+ 1];
5563 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle2");
5564 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5565 first_vect
, first_vect
,
5567 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5570 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle2");
5571 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5572 second_vect
, second_vect
,
5574 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5577 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shift");
5578 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5579 vect
[0], vect
[1], shift1_mask
);
5580 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5581 (*result_chain
)[j
/2 + length
/2] = data_ref
;
5583 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_select");
5584 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5585 vect
[0], vect
[1], select_mask
);
5586 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5587 (*result_chain
)[j
/2] = data_ref
;
5589 memcpy (dr_chain
.address (), result_chain
->address (),
5590 length
* sizeof (tree
));
5594 if (length
== 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo
) > 2)
5596 unsigned int k
= 0, l
= 0;
      /* Generating permutation constant to get all elements in right order.
5599 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
5600 for (i
= 0; i
< nelt
; i
++)
5602 if (3 * k
+ (l
% 3) >= nelt
)
5605 l
+= (3 - (nelt
% 3));
5607 sel
[i
] = 3 * k
+ (l
% 3);
5610 if (!can_vec_perm_p (TYPE_MODE (vectype
), false, sel
))
5612 if (dump_enabled_p ())
5613 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5614 "shuffle of 3 fields structure is not \
5615 supported by target\n");
5618 perm3_mask
= vect_gen_perm_mask_checked (vectype
, sel
);
5620 /* Generating permutation constant to shift all elements.
5621 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
5622 for (i
= 0; i
< nelt
; i
++)
5623 sel
[i
] = 2 * (nelt
/ 3) + (nelt
% 3) + i
;
5624 if (!can_vec_perm_p (TYPE_MODE (vectype
), false, sel
))
5626 if (dump_enabled_p ())
5627 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5628 "shift permutation is not supported by target\n");
5631 shift1_mask
= vect_gen_perm_mask_checked (vectype
, sel
);
5633 /* Generating permutation constant to shift all elements.
5634 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5635 for (i
= 0; i
< nelt
; i
++)
5636 sel
[i
] = 2 * (nelt
/ 3) + 1 + i
;
5637 if (!can_vec_perm_p (TYPE_MODE (vectype
), false, sel
))
5639 if (dump_enabled_p ())
5640 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5641 "shift permutation is not supported by target\n");
5644 shift2_mask
= vect_gen_perm_mask_checked (vectype
, sel
);
5646 /* Generating permutation constant to shift all elements.
5647 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
5648 for (i
= 0; i
< nelt
; i
++)
5649 sel
[i
] = (nelt
/ 3) + (nelt
% 3) / 2 + i
;
5650 if (!can_vec_perm_p (TYPE_MODE (vectype
), false, sel
))
5652 if (dump_enabled_p ())
5653 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5654 "shift permutation is not supported by target\n");
5657 shift3_mask
= vect_gen_perm_mask_checked (vectype
, sel
);
5659 /* Generating permutation constant to shift all elements.
5660 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5661 for (i
= 0; i
< nelt
; i
++)
5662 sel
[i
] = 2 * (nelt
/ 3) + (nelt
% 3) / 2 + i
;
5663 if (!can_vec_perm_p (TYPE_MODE (vectype
), false, sel
))
5665 if (dump_enabled_p ())
5666 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5667 "shift permutation is not supported by target\n");
5670 shift4_mask
= vect_gen_perm_mask_checked (vectype
, sel
);
5672 for (k
= 0; k
< 3; k
++)
5674 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3");
5675 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5676 dr_chain
[k
], dr_chain
[k
],
5678 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5682 for (k
= 0; k
< 3; k
++)
5684 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shift1");
5685 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5686 vect
[k
% 3], vect
[(k
+ 1) % 3],
5688 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5689 vect_shift
[k
] = data_ref
;
5692 for (k
= 0; k
< 3; k
++)
5694 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shift2");
5695 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5696 vect_shift
[(4 - k
) % 3],
5697 vect_shift
[(3 - k
) % 3],
5699 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5703 (*result_chain
)[3 - (nelt
% 3)] = vect
[2];
5705 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shift3");
5706 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, vect
[0],
5707 vect
[0], shift3_mask
);
5708 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5709 (*result_chain
)[nelt
% 3] = data_ref
;
5711 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shift4");
5712 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, vect
[1],
5713 vect
[1], shift4_mask
);
5714 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5715 (*result_chain
)[0] = data_ref
;
5721 /* Function vect_transform_grouped_load.
5723 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5724 to perform their permutation and ascribe the result vectorized statements to
5725 the scalar statements.
5729 vect_transform_grouped_load (gimple
*stmt
, vec
<tree
> dr_chain
, int size
,
5730 gimple_stmt_iterator
*gsi
)
5733 vec
<tree
> result_chain
= vNULL
;
  /* DR_CHAIN contains input data-refs that are a part of the interleaving.
     RESULT_CHAIN is the output of vect_permute_load_chain; it contains the
     permuted vectors, ready for vector computation.  */
  result_chain.create (size);

  /* If the reassociation width for the vector type is 2 or greater, the
     target machine can execute 2 or more vector instructions in parallel.
     Otherwise try to get the chain for the load group using
     vect_shift_permute_load_chain.  */
  mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
  if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
      || !vect_shift_permute_load_chain (dr_chain, size, stmt,
                                         gsi, &result_chain))
    vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
  vect_record_grouped_load_vectors (stmt, result_chain);
  result_chain.release ();
}
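
/* Illustrative usage sketch only: a caller such as vectorizable_load, having
   collected one vector load per member of a group of interleaved accesses,
   would finish roughly with

     vec<tree> dr_chain = ...;   // SSA names of the vector loads, in order
     vect_transform_grouped_load (stmt, dr_chain, group_size, gsi);

   after which each scalar statement of the group finds its permuted vector
   via STMT_VINFO_VEC_STMT (and the RELATED_STMT chain for multiple copies).  */
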
/* RESULT_CHAIN contains the output of a group of grouped loads that were
   generated as part of the vectorization of STMT.  Assign the statement
   for each vector to the associated scalar statement.  */

void
vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
{
  gimple *first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
  gimple *next_stmt, *new_stmt;
  unsigned int i, gap_count;
  tree tmp_data_ref;
  /* Put a permuted data-ref in the VECTORIZED_STMT field.
     Since we scan the chain starting from its first node, their order
     corresponds to the order of data-refs in RESULT_CHAIN.  */
  next_stmt = first_stmt;
  gap_count = 1;
  FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
    {
      if (!next_stmt)
        break;

      /* Skip the gaps.  Loads created for the gaps will be removed by the
         dead code elimination pass later.  No need to check for the first
         stmt in the group, since it always exists.
         GROUP_GAP is the number of steps in elements from the previous
         access (if there is no gap GROUP_GAP is 1).  We skip loads that
         correspond to the gaps.  */
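      /* Illustration only: if the next scalar access in the group has
         GROUP_GAP == 3, the first two vectors taken from RESULT_CHAIN for it
         are gap loads; GAP_COUNT counts 1, 2, and only the third vector is
         recorded for the statement before the counter is reset below.  */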
      if (next_stmt != first_stmt
          && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
        {
          gap_count++;
          continue;
        }

      while (next_stmt)
        {
          new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
          /* We assume that if VEC_STMT is not NULL, this is a case of multiple
             copies, and we put the new vector statement in the first available
             RELATED_STMT field.  */
          if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
            STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
          else
            {
              if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
                {
                  /* Walk to the tail of the RELATED_STMT chain and append
                     NEW_STMT there.  */
                  gimple *prev_stmt
                    = STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
                  gimple *rel_stmt
                    = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
                  while (rel_stmt)
                    {
                      prev_stmt = rel_stmt;
                      rel_stmt
                        = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
                    }
                  STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt))
                    = new_stmt;
                }
            }

          next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
          gap_count = 1;
          /* If NEXT_STMT accesses the same DR as the previous statement,
             put the same TMP_DATA_REF as its vectorized statement; otherwise
             get the next data-ref from RESULT_CHAIN.  */
          if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
            break;
        }
    }
}
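
/* Illustrative sketch only (not part of the vectorizer): with a
   vectorization factor that requires two vector copies per scalar load,
   the bookkeeping above ends up as

     STMT_VINFO_VEC_STMT (scalar load)     --> vector load, copy 1
     STMT_VINFO_RELATED_STMT (copy 1)      --> vector load, copy 2
     STMT_VINFO_RELATED_STMT (copy 2)      --> NULL

   i.e. later calls append their NEW_STMT at the tail of the RELATED_STMT
   chain instead of overwriting VEC_STMT.  */
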
/* Function vect_can_force_dr_alignment_p.

   Return whether the alignment of DECL can be forced so that it is
   aligned on an ALIGNMENT-bit boundary.  */

bool
vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
{
  if (!VAR_P (decl))
    return false;

  if (decl_in_symtab_p (decl)
      && !symtab_node::get (decl)->can_increase_alignment_p ())
    return false;

  if (TREE_STATIC (decl))
    return (alignment <= MAX_OFILE_ALIGNMENT);
  else
    return (alignment <= MAX_STACK_ALIGNMENT);
}
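
/* Illustrative sketch only: alignment analysis typically uses this predicate
   to guard attempts to raise DECL_ALIGN on the base object of a data
   reference, along the lines of

     if (vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype)))
       ...   // safe to increase the declared alignment of BASE
     else
       ...   // treat the access as possibly misaligned

   where BASE and VECTYPE come from the data reference being analyzed.  */
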
/* Return whether the data reference DR is supported with respect to its
   alignment.
   If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
   if it is aligned, i.e., check if it is possible to vectorize it with
   different alignment.  */

enum dr_alignment_support
vect_supportable_dr_alignment (struct data_reference *dr,
                               bool check_aligned_accesses)
{
  gimple *stmt = DR_STMT (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  machine_mode mode = TYPE_MODE (vectype);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *vect_loop = NULL;
  bool nested_in_vect_loop = false;

  if (aligned_access_p (dr) && !check_aligned_accesses)
    return dr_aligned;

  /* For now assume all conditional loads/stores support unaligned
     access without any special code.  */
  if (is_gimple_call (stmt)
      && gimple_call_internal_p (stmt)
      && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
          || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
    return dr_unaligned_supported;

  if (loop_vinfo)
    {
      vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
      nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
    }

  /* Possibly unaligned access.  */

  /* We can choose between using the implicit realignment scheme (generating
     a misaligned_move stmt) and the explicit realignment scheme (generating
     aligned loads with a REALIGN_LOAD).  There are two variants to the
     explicit realignment scheme: optimized, and unoptimized.
     We can optimize the realignment only if the step between consecutive
     vector loads is equal to the vector size.  Since the vector memory
     accesses advance in steps of VS (Vector Size) in the vectorized loop, it
     is guaranteed that the misalignment amount remains the same throughout the
     execution of the vectorized loop.  Therefore, we can create the
     "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
     at the loop preheader.

     However, in the case of outer-loop vectorization, when vectorizing a
     memory access in the inner-loop nested within the LOOP that is now being
     vectorized, while it is guaranteed that the misalignment of the
     vectorized memory access will remain the same in different outer-loop
     iterations, it is *not* guaranteed that it will remain the same throughout
     the execution of the inner-loop.  This is because the inner-loop advances
     with the original scalar step (and not in steps of VS).  If the inner-loop
     step happens to be a multiple of VS, then the misalignment remains fixed
     and we can use the optimized realignment scheme.  For example:

      for (i=0; i<N; i++)
        for (j=0; j<M; j++)
          s += a[i+j];

     When vectorizing the i-loop in the above example, the step between
     consecutive vector loads is 1, and so the misalignment does not remain
     fixed across the execution of the inner-loop, and the realignment cannot
     be optimized (as illustrated in the following pseudo vectorized loop):

      for (i=0; i<N; i+=4)
        for (j=0; j<M; j++)
          {
            vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
                           // when j is {0,1,2,3,4,5,6,7,...} respectively.
                           // (assuming that we start from an aligned address).
          }

     We therefore have to use the unoptimized realignment scheme:

      for (i=0; i<N; i+=4)
        for (j=k; j<M; j+=4)
          vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
                         // that the misalignment of the initial address is
                         // 0).

     The loop can then be vectorized as follows:

      for (k=0; k<4; k++)
        {
          rt = get_realignment_token (&vp[k]);
          for (i=0; i<N; i+=4)
            {
              v1 = vp[i+k];
              for (j=k; j<M; j+=4)
                {
                  v2 = vp[i+j+VS-1];
                  va = REALIGN_LOAD <v1,v2,rt>;
                  vs += va;
                  v1 = v2;
                }
            }
        }  */

  if (DR_IS_READ (dr))
    {
      bool is_packed = false;
      tree type = (TREE_TYPE (DR_REF (dr)));

      if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
          && (!targetm.vectorize.builtin_mask_for_load
              || targetm.vectorize.builtin_mask_for_load ()))
        {
          tree vectype = STMT_VINFO_VECTYPE (stmt_info);

          /* If we are doing SLP then the accesses need not have the
             same alignment, instead it depends on the SLP group size.  */
          if (loop_vinfo
              && STMT_SLP_TYPE (stmt_info)
              && (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
                  * GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)))
                  % TYPE_VECTOR_SUBPARTS (vectype) != 0))
            ;  /* Fall through to the misalignment check below.  */
          else if (!loop_vinfo
                   || (nested_in_vect_loop
                       && (TREE_INT_CST_LOW (DR_STEP (dr))
                           != GET_MODE_SIZE (TYPE_MODE (vectype)))))
            return dr_explicit_realign;
          else
            return dr_explicit_realign_optimized;
        }
      if (!known_alignment_for_access_p (dr))
        is_packed = not_size_aligned (DR_REF (dr));

      if (targetm.vectorize.support_vector_misalignment
            (mode, type, DR_MISALIGNMENT (dr), is_packed))
        /* Can't software pipeline the loads, but can at least do them.  */
        return dr_unaligned_supported;
    }
  else
    {
      bool is_packed = false;
      tree type = (TREE_TYPE (DR_REF (dr)));

      if (!known_alignment_for_access_p (dr))
        is_packed = not_size_aligned (DR_REF (dr));

      if (targetm.vectorize.support_vector_misalignment
            (mode, type, DR_MISALIGNMENT (dr), is_packed))
        return dr_unaligned_supported;
    }

  /* Unsupported.  */
  return dr_unaligned_unsupported;
}
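
/* Illustrative sketch only: callers dispatch on the returned
   dr_alignment_support value roughly as follows (hypothetical placeholders
   in the comments):

     switch (vect_supportable_dr_alignment (dr, false))
       {
       case dr_aligned:
       case dr_unaligned_supported:
         break;                 // emit a plain (mis)aligned vector access
       case dr_explicit_realign:
       case dr_explicit_realign_optimized:
         break;                 // emit REALIGN_LOAD based code
       case dr_unaligned_unsupported:
         break;                 // this access cannot be vectorized as-is
       }
*/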