1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2017 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
24 #include "coretypes.h"
34 #include "optabs-tree.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
55 /* Return true if load- or store-lanes optab OPTAB is implemented for
56 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
59 vect_lanes_optab_supported_p (const char *name
, convert_optab optab
,
60 tree vectype
, unsigned HOST_WIDE_INT count
)
62 machine_mode mode
, array_mode
;
65 mode
= TYPE_MODE (vectype
);
66 limit_p
= !targetm
.array_mode_supported_p (mode
, count
);
67 array_mode
= mode_for_size (count
* GET_MODE_BITSIZE (mode
),
70 if (array_mode
== BLKmode
)
72 if (dump_enabled_p ())
73 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
74 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC
"]\n",
75 GET_MODE_NAME (mode
), count
);
79 if (convert_optab_handler (optab
, array_mode
, mode
) == CODE_FOR_nothing
)
81 if (dump_enabled_p ())
82 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
83 "cannot use %s<%s><%s>\n", name
,
84 GET_MODE_NAME (array_mode
), GET_MODE_NAME (mode
));
88 if (dump_enabled_p ())
89 dump_printf_loc (MSG_NOTE
, vect_location
,
90 "can use %s<%s><%s>\n", name
, GET_MODE_NAME (array_mode
),
91 GET_MODE_NAME (mode
));
97 /* Return the smallest scalar part of STMT.
98 This is used to determine the vectype of the stmt. We generally set the
99 vectype according to the type of the result (lhs). For stmts whose
100 result-type is different than the type of the arguments (e.g., demotion,
101 promotion), vectype will be reset appropriately (later). Note that we have
102 to visit the smallest datatype in this function, because that determines the
103 VF. If the smallest datatype in the loop is present only as the rhs of a
104 promotion operation - we'd miss it.
105 Such a case, where a variable of this datatype does not appear in the lhs
106 anywhere in the loop, can only occur if it's an invariant: e.g.:
107 'int_x = (int) short_inv', which we'd expect to have been optimized away by
108 invariant motion. However, we cannot rely on invariant motion to always
109 take invariants out of the loop, and so in the case of promotion we also
110 have to check the rhs.
111 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
115 vect_get_smallest_scalar_type (gimple
*stmt
, HOST_WIDE_INT
*lhs_size_unit
,
116 HOST_WIDE_INT
*rhs_size_unit
)
118 tree scalar_type
= gimple_expr_type (stmt
);
119 HOST_WIDE_INT lhs
, rhs
;
121 lhs
= rhs
= TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type
));
123 if (is_gimple_assign (stmt
)
124 && (gimple_assign_cast_p (stmt
)
125 || gimple_assign_rhs_code (stmt
) == WIDEN_MULT_EXPR
126 || gimple_assign_rhs_code (stmt
) == WIDEN_LSHIFT_EXPR
127 || gimple_assign_rhs_code (stmt
) == FLOAT_EXPR
))
129 tree rhs_type
= TREE_TYPE (gimple_assign_rhs1 (stmt
));
131 rhs
= TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type
));
133 scalar_type
= rhs_type
;
136 *lhs_size_unit
= lhs
;
137 *rhs_size_unit
= rhs
;
142 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
143 tested at run-time. Return TRUE if DDR was successfully inserted.
144 Return false if versioning is not supported. */
147 vect_mark_for_runtime_alias_test (ddr_p ddr
, loop_vec_info loop_vinfo
)
149 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
151 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS
) == 0)
154 if (!runtime_alias_check_p (ddr
, loop
,
155 optimize_loop_nest_for_speed_p (loop
)))
158 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo
).safe_push (ddr
);
163 /* Function vect_analyze_data_ref_dependence.
165 Return TRUE if there (might) exist a dependence between a memory-reference
166 DRA and a memory-reference DRB. When versioning for alias may check a
167 dependence at run-time, return FALSE. Adjust *MAX_VF according to
168 the data dependence. */
171 vect_analyze_data_ref_dependence (struct data_dependence_relation
*ddr
,
172 loop_vec_info loop_vinfo
, int *max_vf
)
175 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
176 struct data_reference
*dra
= DDR_A (ddr
);
177 struct data_reference
*drb
= DDR_B (ddr
);
178 stmt_vec_info stmtinfo_a
= vinfo_for_stmt (DR_STMT (dra
));
179 stmt_vec_info stmtinfo_b
= vinfo_for_stmt (DR_STMT (drb
));
180 lambda_vector dist_v
;
181 unsigned int loop_depth
;
183 /* In loop analysis all data references should be vectorizable. */
184 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a
)
185 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b
))
188 /* Independent data accesses. */
189 if (DDR_ARE_DEPENDENT (ddr
) == chrec_known
)
193 || (DR_IS_READ (dra
) && DR_IS_READ (drb
)))
196 /* We do not have to consider dependences between accesses that belong
197 to the same group. */
198 if (GROUP_FIRST_ELEMENT (stmtinfo_a
)
199 && GROUP_FIRST_ELEMENT (stmtinfo_a
) == GROUP_FIRST_ELEMENT (stmtinfo_b
))
202 /* Even if we have an anti-dependence then, as the vectorized loop covers at
203 least two scalar iterations, there is always also a true dependence.
204 As the vectorizer does not re-order loads and stores we can ignore
205 the anti-dependence if TBAA can disambiguate both DRs similar to the
206 case with known negative distance anti-dependences (positive
207 distance anti-dependences would violate TBAA constraints). */
208 if (((DR_IS_READ (dra
) && DR_IS_WRITE (drb
))
209 || (DR_IS_WRITE (dra
) && DR_IS_READ (drb
)))
210 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra
)),
211 get_alias_set (DR_REF (drb
))))
214 /* Unknown data dependence. */
215 if (DDR_ARE_DEPENDENT (ddr
) == chrec_dont_know
)
217 /* If user asserted safelen consecutive iterations can be
218 executed concurrently, assume independence. */
219 if (loop
->safelen
>= 2)
221 if (loop
->safelen
< *max_vf
)
222 *max_vf
= loop
->safelen
;
223 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo
) = false;
227 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a
)
228 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b
))
230 if (dump_enabled_p ())
232 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
233 "versioning for alias not supported for: "
234 "can't determine dependence between ");
235 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
237 dump_printf (MSG_MISSED_OPTIMIZATION
, " and ");
238 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
240 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
245 if (dump_enabled_p ())
247 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
248 "versioning for alias required: "
249 "can't determine dependence between ");
250 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
252 dump_printf (MSG_MISSED_OPTIMIZATION
, " and ");
253 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
255 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
258 /* Add to list of ddrs that need to be tested at run-time. */
259 return !vect_mark_for_runtime_alias_test (ddr
, loop_vinfo
);
262 /* Known data dependence. */
263 if (DDR_NUM_DIST_VECTS (ddr
) == 0)
265 /* If user asserted safelen consecutive iterations can be
266 executed concurrently, assume independence. */
267 if (loop
->safelen
>= 2)
269 if (loop
->safelen
< *max_vf
)
270 *max_vf
= loop
->safelen
;
271 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo
) = false;
275 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a
)
276 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b
))
278 if (dump_enabled_p ())
280 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
281 "versioning for alias not supported for: "
282 "bad dist vector for ");
283 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
285 dump_printf (MSG_MISSED_OPTIMIZATION
, " and ");
286 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
288 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
293 if (dump_enabled_p ())
295 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
296 "versioning for alias required: "
297 "bad dist vector for ");
298 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, DR_REF (dra
));
299 dump_printf (MSG_MISSED_OPTIMIZATION
, " and ");
300 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, DR_REF (drb
));
301 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
303 /* Add to list of ddrs that need to be tested at run-time. */
304 return !vect_mark_for_runtime_alias_test (ddr
, loop_vinfo
);
307 loop_depth
= index_in_loop_nest (loop
->num
, DDR_LOOP_NEST (ddr
));
308 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr
), i
, dist_v
)
310 int dist
= dist_v
[loop_depth
];
312 if (dump_enabled_p ())
313 dump_printf_loc (MSG_NOTE
, vect_location
,
314 "dependence distance = %d.\n", dist
);
318 if (dump_enabled_p ())
320 dump_printf_loc (MSG_NOTE
, vect_location
,
321 "dependence distance == 0 between ");
322 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dra
));
323 dump_printf (MSG_NOTE
, " and ");
324 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (drb
));
325 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
328 /* When we perform grouped accesses and perform implicit CSE
329 by detecting equal accesses and doing disambiguation with
330 runtime alias tests like for
338 where we will end up loading { a[i], a[i+1] } once, make
339 sure that inserting group loads before the first load and
340 stores after the last store will do the right thing.
341 Similar for groups like
345 where loads from the group interleave with the store. */
346 if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a
)
347 || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b
))
349 gimple
*earlier_stmt
;
350 earlier_stmt
= get_earlier_stmt (DR_STMT (dra
), DR_STMT (drb
));
352 (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt
))))
354 if (dump_enabled_p ())
355 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
356 "READ_WRITE dependence in interleaving."
365 if (dist
> 0 && DDR_REVERSED_P (ddr
))
367 /* If DDR_REVERSED_P the order of the data-refs in DDR was
368 reversed (to make distance vector positive), and the actual
369 distance is negative. */
370 if (dump_enabled_p ())
371 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
372 "dependence distance negative.\n");
373 /* Record a negative dependence distance to later limit the
374 amount of stmt copying / unrolling we can perform.
375 Only need to handle read-after-write dependence. */
377 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b
) == 0
378 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b
) > (unsigned)dist
))
379 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b
) = dist
;
384 && abs (dist
) < *max_vf
)
386 /* The dependence distance requires reduction of the maximal
387 vectorization factor. */
388 *max_vf
= abs (dist
);
389 if (dump_enabled_p ())
390 dump_printf_loc (MSG_NOTE
, vect_location
,
391 "adjusting maximal vectorization factor to %i\n",
395 if (abs (dist
) >= *max_vf
)
397 /* Dependence distance does not create dependence, as far as
398 vectorization is concerned, in this case. */
399 if (dump_enabled_p ())
400 dump_printf_loc (MSG_NOTE
, vect_location
,
401 "dependence distance >= VF.\n");
405 if (dump_enabled_p ())
407 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
408 "not vectorized, possible dependence "
409 "between data-refs ");
410 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dra
));
411 dump_printf (MSG_NOTE
, " and ");
412 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (drb
));
413 dump_printf (MSG_NOTE
, "\n");
422 /* Function vect_analyze_data_ref_dependences.
424 Examine all the data references in the loop, and make sure there do not
425 exist any data dependences between them. Set *MAX_VF according to
426 the maximum vectorization factor the data dependences allow. */
429 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo
, int *max_vf
)
432 struct data_dependence_relation
*ddr
;
434 if (dump_enabled_p ())
435 dump_printf_loc (MSG_NOTE
, vect_location
,
436 "=== vect_analyze_data_ref_dependences ===\n");
438 LOOP_VINFO_DDRS (loop_vinfo
)
439 .create (LOOP_VINFO_DATAREFS (loop_vinfo
).length ()
440 * LOOP_VINFO_DATAREFS (loop_vinfo
).length ());
441 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo
) = true;
442 /* We need read-read dependences to compute STMT_VINFO_SAME_ALIGN_REFS. */
443 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo
),
444 &LOOP_VINFO_DDRS (loop_vinfo
),
445 LOOP_VINFO_LOOP_NEST (loop_vinfo
), true))
448 /* For epilogues we either have no aliases or alias versioning
449 was applied to original loop. Therefore we may just get max_vf
450 using VF of original loop. */
451 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo
))
452 *max_vf
= LOOP_VINFO_ORIG_VECT_FACTOR (loop_vinfo
);
454 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo
), i
, ddr
)
455 if (vect_analyze_data_ref_dependence (ddr
, loop_vinfo
, max_vf
))
462 /* Function vect_slp_analyze_data_ref_dependence.
464 Return TRUE if there (might) exist a dependence between a memory-reference
465 DRA and a memory-reference DRB. When versioning for alias may check a
466 dependence at run-time, return FALSE. Adjust *MAX_VF according to
467 the data dependence. */
470 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation
*ddr
)
472 struct data_reference
*dra
= DDR_A (ddr
);
473 struct data_reference
*drb
= DDR_B (ddr
);
475 /* We need to check dependences of statements marked as unvectorizable
476 as well, they still can prohibit vectorization. */
478 /* Independent data accesses. */
479 if (DDR_ARE_DEPENDENT (ddr
) == chrec_known
)
485 /* Read-read is OK. */
486 if (DR_IS_READ (dra
) && DR_IS_READ (drb
))
489 /* If dra and drb are part of the same interleaving chain consider
491 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra
)))
492 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra
)))
493 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb
)))))
496 /* Unknown data dependence. */
497 if (DDR_ARE_DEPENDENT (ddr
) == chrec_dont_know
)
499 if (dump_enabled_p ())
501 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
502 "can't determine dependence between ");
503 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, DR_REF (dra
));
504 dump_printf (MSG_MISSED_OPTIMIZATION
, " and ");
505 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, DR_REF (drb
));
506 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
509 else if (dump_enabled_p ())
511 dump_printf_loc (MSG_NOTE
, vect_location
,
512 "determined dependence between ");
513 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dra
));
514 dump_printf (MSG_NOTE
, " and ");
515 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (drb
));
516 dump_printf (MSG_NOTE
, "\n");
523 /* Analyze dependences involved in the transform of SLP NODE. STORES
524 contain the vector of scalar stores of this instance if we are
525 disambiguating the loads. */
528 vect_slp_analyze_node_dependences (slp_instance instance
, slp_tree node
,
529 vec
<gimple
*> stores
, gimple
*last_store
)
531 /* This walks over all stmts involved in the SLP load/store done
532 in NODE verifying we can sink them up to the last stmt in the
534 gimple
*last_access
= vect_find_last_scalar_stmt_in_slp (node
);
535 for (unsigned k
= 0; k
< SLP_INSTANCE_GROUP_SIZE (instance
); ++k
)
537 gimple
*access
= SLP_TREE_SCALAR_STMTS (node
)[k
];
538 if (access
== last_access
)
540 data_reference
*dr_a
= STMT_VINFO_DATA_REF (vinfo_for_stmt (access
));
541 for (gimple_stmt_iterator gsi
= gsi_for_stmt (access
);
542 gsi_stmt (gsi
) != last_access
; gsi_next (&gsi
))
544 gimple
*stmt
= gsi_stmt (gsi
);
545 if (! gimple_vuse (stmt
)
546 || (DR_IS_READ (dr_a
) && ! gimple_vdef (stmt
)))
549 /* If we couldn't record a (single) data reference for this
550 stmt we have to give up. */
551 /* ??? Here and below if dependence analysis fails we can resort
552 to the alias oracle which can handle more kinds of stmts. */
553 data_reference
*dr_b
= STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt
));
557 bool dependent
= false;
558 /* If we run into a store of this same instance (we've just
559 marked those) then delay dependence checking until we run
560 into the last store because this is where it will have
561 been sunk to (and we verify if we can do that as well). */
562 if (gimple_visited_p (stmt
))
564 if (stmt
!= last_store
)
568 FOR_EACH_VEC_ELT (stores
, i
, store
)
570 data_reference
*store_dr
571 = STMT_VINFO_DATA_REF (vinfo_for_stmt (store
));
572 ddr_p ddr
= initialize_data_dependence_relation
573 (dr_a
, store_dr
, vNULL
);
574 dependent
= vect_slp_analyze_data_ref_dependence (ddr
);
575 free_dependence_relation (ddr
);
582 ddr_p ddr
= initialize_data_dependence_relation (dr_a
,
584 dependent
= vect_slp_analyze_data_ref_dependence (ddr
);
585 free_dependence_relation (ddr
);
595 /* Function vect_analyze_data_ref_dependences.
597 Examine all the data references in the basic-block, and make sure there
598 do not exist any data dependences between them. Set *MAX_VF according to
599 the maximum vectorization factor the data dependences allow. */
602 vect_slp_analyze_instance_dependence (slp_instance instance
)
604 if (dump_enabled_p ())
605 dump_printf_loc (MSG_NOTE
, vect_location
,
606 "=== vect_slp_analyze_instance_dependence ===\n");
608 /* The stores of this instance are at the root of the SLP tree. */
609 slp_tree store
= SLP_INSTANCE_TREE (instance
);
610 if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store
)[0])))
613 /* Verify we can sink stores to the vectorized stmt insert location. */
614 gimple
*last_store
= NULL
;
617 if (! vect_slp_analyze_node_dependences (instance
, store
, vNULL
, NULL
))
620 /* Mark stores in this instance and remember the last one. */
621 last_store
= vect_find_last_scalar_stmt_in_slp (store
);
622 for (unsigned k
= 0; k
< SLP_INSTANCE_GROUP_SIZE (instance
); ++k
)
623 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store
)[k
], true);
628 /* Verify we can sink loads to the vectorized stmt insert location,
629 special-casing stores of this instance. */
632 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance
), i
, load
)
633 if (! vect_slp_analyze_node_dependences (instance
, load
,
635 ? SLP_TREE_SCALAR_STMTS (store
)
636 : vNULL
, last_store
))
642 /* Unset the visited flag. */
644 for (unsigned k
= 0; k
< SLP_INSTANCE_GROUP_SIZE (instance
); ++k
)
645 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store
)[k
], false);
650 /* Function vect_compute_data_ref_alignment
652 Compute the misalignment of the data reference DR.
655 1. If during the misalignment computation it is found that the data reference
656 cannot be vectorized then false is returned.
657 2. DR_MISALIGNMENT (DR) is defined.
659 FOR NOW: No analysis is actually performed. Misalignment is calculated
660 only for trivial cases. TODO. */
663 vect_compute_data_ref_alignment (struct data_reference
*dr
)
665 gimple
*stmt
= DR_STMT (dr
);
666 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
667 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
668 struct loop
*loop
= NULL
;
669 tree ref
= DR_REF (dr
);
670 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
672 if (dump_enabled_p ())
673 dump_printf_loc (MSG_NOTE
, vect_location
,
674 "vect_compute_data_ref_alignment:\n");
677 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
679 /* Initialize misalignment to unknown. */
680 SET_DR_MISALIGNMENT (dr
, DR_MISALIGNMENT_UNKNOWN
);
682 innermost_loop_behavior
*drb
= vect_dr_behavior (dr
);
683 bool step_preserves_misalignment_p
;
685 /* No step for BB vectorization. */
688 gcc_assert (integer_zerop (drb
->step
));
689 step_preserves_misalignment_p
= true;
692 /* In case the dataref is in an inner-loop of the loop that is being
693 vectorized (LOOP), we use the base and misalignment information
694 relative to the outer-loop (LOOP). This is ok only if the misalignment
695 stays the same throughout the execution of the inner-loop, which is why
696 we have to check that the stride of the dataref in the inner-loop evenly
697 divides by the vector size. */
698 else if (nested_in_vect_loop_p (loop
, stmt
))
700 step_preserves_misalignment_p
701 = (DR_STEP_ALIGNMENT (dr
)
702 % GET_MODE_SIZE (TYPE_MODE (vectype
))) == 0;
704 if (dump_enabled_p ())
706 if (step_preserves_misalignment_p
)
707 dump_printf_loc (MSG_NOTE
, vect_location
,
708 "inner step divides the vector-size.\n");
710 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
711 "inner step doesn't divide the vector-size.\n");
715 /* Similarly we can only use base and misalignment information relative to
716 an innermost loop if the misalignment stays the same throughout the
717 execution of the loop. As above, this is the case if the stride of
718 the dataref evenly divides by the vector size. */
721 unsigned vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
722 step_preserves_misalignment_p
723 = ((DR_STEP_ALIGNMENT (dr
) * vf
)
724 % GET_MODE_SIZE (TYPE_MODE (vectype
))) == 0;
726 if (!step_preserves_misalignment_p
&& dump_enabled_p ())
727 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
728 "step doesn't divide the vector-size.\n");
731 unsigned int base_alignment
= drb
->base_alignment
;
732 unsigned int base_misalignment
= drb
->base_misalignment
;
733 unsigned HOST_WIDE_INT vector_alignment
= TYPE_ALIGN_UNIT (vectype
);
735 if (drb
->offset_alignment
< vector_alignment
736 || !step_preserves_misalignment_p
737 /* We need to know whether the step wrt the vectorized loop is
738 negative when computing the starting misalignment below. */
739 || TREE_CODE (drb
->step
) != INTEGER_CST
)
741 if (dump_enabled_p ())
743 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
744 "Unknown alignment for access: ");
745 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, ref
);
746 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
751 if (base_alignment
< vector_alignment
)
753 tree base
= drb
->base_address
;
754 if (TREE_CODE (base
) == ADDR_EXPR
)
755 base
= TREE_OPERAND (base
, 0);
756 if (!vect_can_force_dr_alignment_p (base
,
757 vector_alignment
* BITS_PER_UNIT
))
759 if (dump_enabled_p ())
761 dump_printf_loc (MSG_NOTE
, vect_location
,
762 "can't force alignment of ref: ");
763 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, ref
);
764 dump_printf (MSG_NOTE
, "\n");
769 if (DECL_USER_ALIGN (base
))
771 if (dump_enabled_p ())
773 dump_printf_loc (MSG_NOTE
, vect_location
,
774 "not forcing alignment of user-aligned "
776 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, base
);
777 dump_printf (MSG_NOTE
, "\n");
782 /* Force the alignment of the decl.
783 NOTE: This is the only change to the code we make during
784 the analysis phase, before deciding to vectorize the loop. */
785 if (dump_enabled_p ())
787 dump_printf_loc (MSG_NOTE
, vect_location
, "force alignment of ");
788 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, ref
);
789 dump_printf (MSG_NOTE
, "\n");
792 DR_VECT_AUX (dr
)->base_decl
= base
;
793 DR_VECT_AUX (dr
)->base_misaligned
= true;
794 base_misalignment
= 0;
796 unsigned int misalignment
= (base_misalignment
797 + TREE_INT_CST_LOW (drb
->init
));
799 /* If this is a backward running DR then first access in the larger
800 vectype actually is N-1 elements before the address in the DR.
801 Adjust misalign accordingly. */
802 if (tree_int_cst_sgn (drb
->step
) < 0)
803 /* PLUS because STEP is negative. */
804 misalignment
+= ((TYPE_VECTOR_SUBPARTS (vectype
) - 1)
805 * TREE_INT_CST_LOW (drb
->step
));
807 SET_DR_MISALIGNMENT (dr
, misalignment
& (vector_alignment
- 1));
809 if (dump_enabled_p ())
811 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
812 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr
));
813 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, ref
);
814 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
821 /* Function vect_update_misalignment_for_peel.
822 Sets DR's misalignment
823 - to 0 if it has the same alignment as DR_PEEL,
824 - to the misalignment computed using NPEEL if DR's salignment is known,
825 - to -1 (unknown) otherwise.
827 DR - the data reference whose misalignment is to be adjusted.
828 DR_PEEL - the data reference whose misalignment is being made
829 zero in the vector loop by the peel.
830 NPEEL - the number of iterations in the peel loop if the misalignment
831 of DR_PEEL is known at compile time. */
834 vect_update_misalignment_for_peel (struct data_reference
*dr
,
835 struct data_reference
*dr_peel
, int npeel
)
838 vec
<dr_p
> same_aligned_drs
;
839 struct data_reference
*current_dr
;
840 int dr_size
= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr
))));
841 int dr_peel_size
= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel
))));
842 stmt_vec_info stmt_info
= vinfo_for_stmt (DR_STMT (dr
));
843 stmt_vec_info peel_stmt_info
= vinfo_for_stmt (DR_STMT (dr_peel
));
845 /* For interleaved data accesses the step in the loop must be multiplied by
846 the size of the interleaving group. */
847 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
848 dr_size
*= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info
)));
849 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info
))
850 dr_peel_size
*= GROUP_SIZE (peel_stmt_info
);
852 /* It can be assumed that the data refs with the same alignment as dr_peel
853 are aligned in the vector loop. */
855 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel
)));
856 FOR_EACH_VEC_ELT (same_aligned_drs
, i
, current_dr
)
858 if (current_dr
!= dr
)
860 gcc_assert (!known_alignment_for_access_p (dr
)
861 || !known_alignment_for_access_p (dr_peel
)
862 || (DR_MISALIGNMENT (dr
) / dr_size
863 == DR_MISALIGNMENT (dr_peel
) / dr_peel_size
));
864 SET_DR_MISALIGNMENT (dr
, 0);
868 if (known_alignment_for_access_p (dr
)
869 && known_alignment_for_access_p (dr_peel
))
871 bool negative
= tree_int_cst_compare (DR_STEP (dr
), size_zero_node
) < 0;
872 int misal
= DR_MISALIGNMENT (dr
);
873 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
874 misal
+= negative
? -npeel
* dr_size
: npeel
* dr_size
;
875 misal
&= (TYPE_ALIGN (vectype
) / BITS_PER_UNIT
) - 1;
876 SET_DR_MISALIGNMENT (dr
, misal
);
880 if (dump_enabled_p ())
881 dump_printf_loc (MSG_NOTE
, vect_location
, "Setting misalignment " \
882 "to unknown (-1).\n");
883 SET_DR_MISALIGNMENT (dr
, DR_MISALIGNMENT_UNKNOWN
);
887 /* Function verify_data_ref_alignment
889 Return TRUE if DR can be handled with respect to alignment. */
892 verify_data_ref_alignment (data_reference_p dr
)
894 enum dr_alignment_support supportable_dr_alignment
895 = vect_supportable_dr_alignment (dr
, false);
896 if (!supportable_dr_alignment
)
898 if (dump_enabled_p ())
901 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
902 "not vectorized: unsupported unaligned load.");
904 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
905 "not vectorized: unsupported unaligned "
908 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
910 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
915 if (supportable_dr_alignment
!= dr_aligned
&& dump_enabled_p ())
916 dump_printf_loc (MSG_NOTE
, vect_location
,
917 "Vectorizing an unaligned access.\n");
922 /* Function vect_verify_datarefs_alignment
924 Return TRUE if all data references in the loop can be
925 handled with respect to alignment. */
928 vect_verify_datarefs_alignment (loop_vec_info vinfo
)
930 vec
<data_reference_p
> datarefs
= vinfo
->datarefs
;
931 struct data_reference
*dr
;
934 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
936 gimple
*stmt
= DR_STMT (dr
);
937 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
939 if (!STMT_VINFO_RELEVANT_P (stmt_info
))
942 /* For interleaving, only the alignment of the first access matters. */
943 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
944 && GROUP_FIRST_ELEMENT (stmt_info
) != stmt
)
947 /* Strided accesses perform only component accesses, alignment is
948 irrelevant for them. */
949 if (STMT_VINFO_STRIDED_P (stmt_info
)
950 && !STMT_VINFO_GROUPED_ACCESS (stmt_info
))
953 if (! verify_data_ref_alignment (dr
))
960 /* Given an memory reference EXP return whether its alignment is less
964 not_size_aligned (tree exp
)
966 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp
))))
969 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp
)))
970 > get_object_alignment (exp
));
973 /* Function vector_alignment_reachable_p
975 Return true if vector alignment for DR is reachable by peeling
976 a few loop iterations. Return false otherwise. */
979 vector_alignment_reachable_p (struct data_reference
*dr
)
981 gimple
*stmt
= DR_STMT (dr
);
982 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
983 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
985 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
987 /* For interleaved access we peel only if number of iterations in
988 the prolog loop ({VF - misalignment}), is a multiple of the
989 number of the interleaved accesses. */
990 int elem_size
, mis_in_elements
;
991 int nelements
= TYPE_VECTOR_SUBPARTS (vectype
);
993 /* FORNOW: handle only known alignment. */
994 if (!known_alignment_for_access_p (dr
))
997 elem_size
= GET_MODE_SIZE (TYPE_MODE (vectype
)) / nelements
;
998 mis_in_elements
= DR_MISALIGNMENT (dr
) / elem_size
;
1000 if ((nelements
- mis_in_elements
) % GROUP_SIZE (stmt_info
))
1004 /* If misalignment is known at the compile time then allow peeling
1005 only if natural alignment is reachable through peeling. */
1006 if (known_alignment_for_access_p (dr
) && !aligned_access_p (dr
))
1008 HOST_WIDE_INT elmsize
=
1009 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype
)));
1010 if (dump_enabled_p ())
1012 dump_printf_loc (MSG_NOTE
, vect_location
,
1013 "data size =" HOST_WIDE_INT_PRINT_DEC
, elmsize
);
1014 dump_printf (MSG_NOTE
,
1015 ". misalignment = %d.\n", DR_MISALIGNMENT (dr
));
1017 if (DR_MISALIGNMENT (dr
) % elmsize
)
1019 if (dump_enabled_p ())
1020 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1021 "data size does not divide the misalignment.\n");
1026 if (!known_alignment_for_access_p (dr
))
1028 tree type
= TREE_TYPE (DR_REF (dr
));
1029 bool is_packed
= not_size_aligned (DR_REF (dr
));
1030 if (dump_enabled_p ())
1031 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1032 "Unknown misalignment, %snaturally aligned\n",
1033 is_packed
? "not " : "");
1034 return targetm
.vectorize
.vector_alignment_reachable (type
, is_packed
);
1041 /* Calculate the cost of the memory access represented by DR. */
1044 vect_get_data_access_cost (struct data_reference
*dr
,
1045 unsigned int *inside_cost
,
1046 unsigned int *outside_cost
,
1047 stmt_vector_for_cost
*body_cost_vec
)
1049 gimple
*stmt
= DR_STMT (dr
);
1050 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
1051 int nunits
= TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info
));
1052 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
1053 int vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
1054 int ncopies
= MAX (1, vf
/ nunits
); /* TODO: Handle SLP properly */
1056 if (DR_IS_READ (dr
))
1057 vect_get_load_cost (dr
, ncopies
, true, inside_cost
, outside_cost
,
1058 NULL
, body_cost_vec
, false);
1060 vect_get_store_cost (dr
, ncopies
, inside_cost
, body_cost_vec
);
1062 if (dump_enabled_p ())
1063 dump_printf_loc (MSG_NOTE
, vect_location
,
1064 "vect_get_data_access_cost: inside_cost = %d, "
1065 "outside_cost = %d.\n", *inside_cost
, *outside_cost
);
/* Information about a candidate peeling amount: the data reference that
   becomes aligned, the number of peel iterations, and how many data refs
   it would align (used as a vote count).  */
typedef struct _vect_peel_info
{
  struct data_reference *dr;
  int npeel;
  unsigned int count;
} *vect_peel_info;

/* A peeling candidate extended with its estimated costs.  */
typedef struct _vect_peel_extended_info
{
  struct _vect_peel_info peel_info;
  unsigned int inside_cost;
  unsigned int outside_cost;
} *vect_peel_extended_info;
/* Peeling hashtable helpers.  Entries are keyed purely on the peeling
   amount (npeel); free_ptr_hash frees the heap-allocated entries when
   the table is destroyed.  */

struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
{
  static inline hashval_t hash (const _vect_peel_info *);
  static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
};

/* Hash a peeling entry by its peeling amount.  */

inline hashval_t
peel_info_hasher::hash (const _vect_peel_info *peel_info)
{
  return (hashval_t) peel_info->npeel;
}

/* Two peeling entries are equal iff they peel the same number of
   scalar iterations.  */

inline bool
peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
{
  return (a->npeel == b->npeel);
}
/* Insert DR into peeling hash table PEELING_HTAB with NPEEL as key.
   If an entry for NPEEL already exists its count is bumped; otherwise a
   new entry is allocated.  */

static void
vect_peeling_hash_insert (hash_table <peel_info_hasher> *peeling_htab,
			  loop_vec_info loop_vinfo, struct data_reference *dr,
			  int npeel)
{
  struct _vect_peel_info elem, *slot;
  _vect_peel_info **new_slot;
  bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);

  elem.npeel = npeel;
  slot = peeling_htab->find (&elem);
  if (slot)
    slot->count++;
  else
    {
      slot = XNEW (struct _vect_peel_info);
      slot->npeel = npeel;
      slot->dr = dr;
      slot->count = 1;
      new_slot = peeling_htab->find_slot (slot, INSERT);
      *new_slot = slot;
    }

  /* When the cost model is unlimited, give unsupportable unaligned
     accesses a huge weight so this peeling is strongly preferred.  */
  if (!supportable_dr_alignment
      && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
    slot->count += VECT_MAX_COST;
}
/* Traverse peeling hash table to find peeling option that aligns maximum
   number of data accesses.  Callback for hash_table::traverse; the best
   candidate so far is accumulated in *MAX.  */

static int
vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
				     _vect_peel_extended_info *max)
{
  vect_peel_info elem = *slot;

  /* Prefer the higher count; break ties in favor of the smaller
     peeling amount.  */
  if (elem->count > max->peel_info.count
      || (elem->count == max->peel_info.count
	  && max->peel_info.npeel > elem->npeel))
    {
      max->peel_info.npeel = elem->npeel;
      max->peel_info.count = elem->count;
      max->peel_info.dr = elem->dr;
    }

  /* Returning nonzero keeps the traversal going.  */
  return 1;
}
/* Get the costs of peeling NPEEL iterations checking data access costs
   for all data refs.  If UNKNOWN_MISALIGNMENT is true, we assume DR0's
   misalignment will be zero after peeling.  Costs are accumulated into
   *INSIDE_COST / *OUTSIDE_COST and detailed in BODY_COST_VEC.  */

static void
vect_get_peeling_costs_all_drs (vec<data_reference_p> datarefs,
				struct data_reference *dr0,
				unsigned int *inside_cost,
				unsigned int *outside_cost,
				stmt_vector_for_cost *body_cost_vec,
				unsigned int npeel,
				bool unknown_misalignment)
{
  unsigned i;
  data_reference *dr;

  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      gimple *stmt = DR_STMT (dr);
      stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
      /* For interleaving, only the alignment of the first access
	 matters.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
	  && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
	continue;

      /* Strided accesses perform only component accesses, alignment is
	 irrelevant for them.  */
      if (STMT_VINFO_STRIDED_P (stmt_info)
	  && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
	continue;

      /* Temporarily pretend we peeled, cost the access with the
	 adjusted misalignment, then restore the recorded value.  */
      int save_misalignment;
      save_misalignment = DR_MISALIGNMENT (dr);
      if (npeel == 0)
	;
      else if (unknown_misalignment && dr == dr0)
	SET_DR_MISALIGNMENT (dr, 0);
      else
	vect_update_misalignment_for_peel (dr, dr0, npeel);
      vect_get_data_access_cost (dr, inside_cost, outside_cost,
				 body_cost_vec);
      SET_DR_MISALIGNMENT (dr, save_misalignment);
    }
}
/* Traverse peeling hash table and calculate cost for each peeling option.
   Find the one with the lowest cost.  Callback for hash_table::traverse;
   the running minimum is kept in *MIN.  */

static int
vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
				   _vect_peel_extended_info *min)
{
  vect_peel_info elem = *slot;
  int dummy;
  unsigned int inside_cost = 0, outside_cost = 0;
  gimple *stmt = DR_STMT (elem->dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
		       epilogue_cost_vec;

  prologue_cost_vec.create (2);
  body_cost_vec.create (2);
  epilogue_cost_vec.create (2);

  vect_get_peeling_costs_all_drs (LOOP_VINFO_DATAREFS (loop_vinfo),
				  elem->dr, &inside_cost, &outside_cost,
				  &body_cost_vec, elem->npeel, false);

  body_cost_vec.release ();

  outside_cost += vect_get_known_peeling_cost
    (loop_vinfo, elem->npeel, &dummy,
     &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
     &prologue_cost_vec, &epilogue_cost_vec);

  /* Prologue and epilogue costs are added to the target model later.
     These costs depend only on the scalar iteration cost, the
     number of peeling iterations finally chosen, and the number of
     misaligned statements.  So discard the information found here.  */
  prologue_cost_vec.release ();
  epilogue_cost_vec.release ();

  /* Keep the candidate with the lowest inside cost; tie-break on
     outside cost.  */
  if (inside_cost < min->inside_cost
      || (inside_cost == min->inside_cost
	  && outside_cost < min->outside_cost))
    {
      min->inside_cost = inside_cost;
      min->outside_cost = outside_cost;
      min->peel_info.dr = elem->dr;
      min->peel_info.npeel = elem->npeel;
      min->peel_info.count = elem->count;
    }

  /* Returning nonzero keeps the traversal going.  */
  return 1;
}
/* Choose best peeling option by traversing peeling hash table and either
   choosing an option with the lowest cost (if cost model is enabled) or the
   option that aligns as many accesses as possible.  Returned by value;
   res.peel_info.dr is NULL if the table yielded no candidate.  */

static struct _vect_peel_extended_info
vect_peeling_hash_choose_best_peeling (hash_table <peel_info_hasher> *peeling_htab,
				       loop_vec_info loop_vinfo)
{
   struct _vect_peel_extended_info res;

   res.peel_info.dr = NULL;

   if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
     {
       /* Cost model enabled: pick the cheapest peeling.  */
       res.inside_cost = INT_MAX;
       res.outside_cost = INT_MAX;
       peeling_htab->traverse <_vect_peel_extended_info *,
			       vect_peeling_hash_get_lowest_cost> (&res);
     }
   else
     {
       /* Unlimited cost model: pick the peeling that aligns the most
	  accesses.  */
       res.peel_info.count = 0;
       peeling_htab->traverse <_vect_peel_extended_info *,
			       vect_peeling_hash_get_most_frequent> (&res);
       res.inside_cost = 0;
       res.outside_cost = 0;
     }

   return res;
}
/* Return true if the new peeling NPEEL is supported, i.e. every data
   reference other than DR0 still has a supportable alignment after
   peeling NPEEL iterations for DR0.  */

static bool
vect_peeling_supportable (loop_vec_info loop_vinfo, struct data_reference *dr0,
			  unsigned npeel)
{
  unsigned i;
  struct data_reference *dr = NULL;
  vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
  gimple *stmt;
  stmt_vec_info stmt_info;
  enum dr_alignment_support supportable_dr_alignment;

  /* Ensure that all data refs can be vectorized after the peel.  */
  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      int save_misalignment;

      if (dr == dr0)
	continue;

      stmt = DR_STMT (dr);
      stmt_info = vinfo_for_stmt (stmt);
      /* For interleaving, only the alignment of the first access
	 matters.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
	  && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
	continue;

      /* Strided accesses perform only component accesses, alignment is
	 irrelevant for them.  */
      if (STMT_VINFO_STRIDED_P (stmt_info)
	  && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
	continue;

      /* Probe the post-peel alignment, then restore the recorded
	 misalignment.  */
      save_misalignment = DR_MISALIGNMENT (dr);
      vect_update_misalignment_for_peel (dr, dr0, npeel);
      supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
      SET_DR_MISALIGNMENT (dr, save_misalignment);

      if (!supportable_dr_alignment)
	return false;
    }

  return true;
}
1334 /* Function vect_enhance_data_refs_alignment
1336 This pass will use loop versioning and loop peeling in order to enhance
1337 the alignment of data references in the loop.
1339 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1340 original loop is to be vectorized. Any other loops that are created by
1341 the transformations performed in this pass - are not supposed to be
1342 vectorized. This restriction will be relaxed.
1344 This pass will require a cost model to guide it whether to apply peeling
1345 or versioning or a combination of the two. For example, the scheme that
1346 intel uses when given a loop with several memory accesses, is as follows:
1347 choose one memory access ('p') which alignment you want to force by doing
1348 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1349 other accesses are not necessarily aligned, or (2) use loop versioning to
1350 generate one loop in which all accesses are aligned, and another loop in
1351 which only 'p' is necessarily aligned.
1353 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1354 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
1355 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1357 Devising a cost model is the most critical aspect of this work. It will
1358 guide us on which access to peel for, whether to use loop versioning, how
1359 many versions to create, etc. The cost model will probably consist of
1360 generic considerations as well as target specific considerations (on
1361 powerpc for example, misaligned stores are more painful than misaligned
1364 Here are the general steps involved in alignment enhancements:
1366 -- original loop, before alignment analysis:
1367 for (i=0; i<N; i++){
1368 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1369 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1372 -- After vect_compute_data_refs_alignment:
1373 for (i=0; i<N; i++){
1374 x = q[i]; # DR_MISALIGNMENT(q) = 3
1375 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1378 -- Possibility 1: we do loop versioning:
1380 for (i=0; i<N; i++){ # loop 1A
1381 x = q[i]; # DR_MISALIGNMENT(q) = 3
1382 p[i] = y; # DR_MISALIGNMENT(p) = 0
1386 for (i=0; i<N; i++){ # loop 1B
1387 x = q[i]; # DR_MISALIGNMENT(q) = 3
1388 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1392 -- Possibility 2: we do loop peeling:
1393 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1397 for (i = 3; i < N; i++){ # loop 2A
1398 x = q[i]; # DR_MISALIGNMENT(q) = 0
1399 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1402 -- Possibility 3: combination of loop peeling and versioning:
1403 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1408 for (i = 3; i<N; i++){ # loop 3A
1409 x = q[i]; # DR_MISALIGNMENT(q) = 0
1410 p[i] = y; # DR_MISALIGNMENT(p) = 0
1414 for (i = 3; i<N; i++){ # loop 3B
1415 x = q[i]; # DR_MISALIGNMENT(q) = 0
1416 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1420 These loops are later passed to loop_transform to be vectorized. The
1421 vectorizer will use the alignment information to guide the transformation
1422 (whether to generate regular loads/stores, or with special handling for
bool
vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
{
  vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  enum dr_alignment_support supportable_dr_alignment;
  struct data_reference *dr0 = NULL, *first_store = NULL;
  struct data_reference *dr;
  unsigned int i, j;
  bool do_peeling = false;
  bool do_versioning = false;
  bool stat;
  gimple *stmt;
  stmt_vec_info stmt_info;
  unsigned int npeel = 0;
  bool one_misalignment_known = false;
  bool one_misalignment_unknown = false;
  bool one_dr_unsupportable = false;
  struct data_reference *unsupportable_dr = NULL;
  unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  unsigned possible_npeel_number = 1;
  tree vectype;
  unsigned int nelements, mis, same_align_drs_max = 0;
  hash_table<peel_info_hasher> peeling_htab (1);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "=== vect_enhance_data_refs_alignment ===\n");

  /* Reset data so we can safely be called multiple times.  */
  LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
  LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;

  /* While cost model enhancements are expected in the future, the high level
     view of the code at this time is as follows:

     A) If there is a misaligned access then see if peeling to align
        this access can make all data references satisfy
        vect_supportable_dr_alignment.  If so, update data structures
        as needed and return true.

     B) If peeling wasn't possible and there is a data reference with an
        unknown misalignment that does not satisfy vect_supportable_dr_alignment
        then see if loop versioning checks can be used to make all data
        references satisfy vect_supportable_dr_alignment.  If so, update
        data structures as needed and return true.

     C) If neither peeling nor versioning were successful then return false if
        any data reference does not satisfy vect_supportable_dr_alignment.

     D) Return true (all data references satisfy vect_supportable_dr_alignment).

     Note, Possibility 3 above (which is peeling and versioning together) is not
     being done at this time.  */

  /* (1) Peeling to force alignment.  */

  /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
     Considerations:
     + How many accesses will become aligned due to the peeling
     - How many accesses will become unaligned due to the peeling,
       and the cost of misaligned accesses.
     - The cost of peeling (the extra runtime checks, the increase
       in code size).  */

  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      stmt = DR_STMT (dr);
      stmt_info = vinfo_for_stmt (stmt);

      if (!STMT_VINFO_RELEVANT_P (stmt_info))
	continue;

      /* For interleaving, only the alignment of the first access
         matters.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
	  && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
	continue;

      /* For invariant accesses there is nothing to enhance.  */
      if (integer_zerop (DR_STEP (dr)))
	continue;

      /* Strided accesses perform only component accesses, alignment is
	 irrelevant for them.  */
      if (STMT_VINFO_STRIDED_P (stmt_info)
	  && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
	continue;

      supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
      do_peeling = vector_alignment_reachable_p (dr);
      if (do_peeling)
	{
	  if (known_alignment_for_access_p (dr))
	    {
	      unsigned int npeel_tmp = 0;
	      bool negative = tree_int_cst_compare (DR_STEP (dr),
						    size_zero_node) < 0;

	      /* Misalignment expressed as a whole number of elements;
		 the peel amount that aligns DR follows from it.  */
	      vectype = STMT_VINFO_VECTYPE (stmt_info);
	      nelements = TYPE_VECTOR_SUBPARTS (vectype);
	      mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
						TREE_TYPE (DR_REF (dr))));
	      if (DR_MISALIGNMENT (dr) != 0)
		npeel_tmp = (negative ? (mis - nelements)
			     : (nelements - mis)) & (nelements - 1);

	      /* For multiple types, it is possible that the bigger type access
		 will have more than one peeling option.  E.g., a loop with two
		 types: one of size (vector size / 4), and the other one of
		 size (vector size / 8).  Vectorization factor will 8.  If both
		 accesses are misaligned by 3, the first one needs one scalar
		 iteration to be aligned, and the second one needs 5.  But the
		 first one will be aligned also by peeling 5 scalar
		 iterations, and in that case both accesses will be aligned.
		 Hence, except for the immediate peeling amount, we also want
		 to try to add full vector size, while we don't exceed
		 vectorization factor.
		 We do this automatically for cost model, since we calculate
		 cost for every peeling option.  */
	      if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
		{
		  if (STMT_SLP_TYPE (stmt_info))
		    possible_npeel_number
		      = (vf * GROUP_SIZE (stmt_info)) / nelements;
		  else
		    possible_npeel_number = vf / nelements;

		  /* NPEEL_TMP is 0 when there is no misalignment, but also
		     allow peeling NELEMENTS.  */
		  if (DR_MISALIGNMENT (dr) == 0)
		    possible_npeel_number++;
		}

	      /* Save info about DR in the hash table.  Also include peeling
		 amounts according to the explanation above.  */
	      for (j = 0; j < possible_npeel_number; j++)
		{
		  vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
					    dr, npeel_tmp);
		  npeel_tmp += nelements;
		}

	      one_misalignment_known = true;
	    }
	  else
	    {
	      /* If we don't know any misalignment values, we prefer
		 peeling for data-ref that has the maximum number of data-refs
		 with the same alignment, unless the target prefers to align
		 stores over load.  */
	      unsigned same_align_drs
		= STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
	      if (!dr0
		  || same_align_drs_max < same_align_drs)
		{
		  same_align_drs_max = same_align_drs;
		  dr0 = dr;
		}
	      /* For data-refs with the same number of related
		 accesses prefer the one where the misalign
		 computation will be invariant in the outermost loop.  */
	      else if (same_align_drs_max == same_align_drs)
		{
		  struct loop *ivloop0, *ivloop;
		  ivloop0 = outermost_invariant_loop_for_expr
		    (loop, DR_BASE_ADDRESS (dr0));
		  ivloop = outermost_invariant_loop_for_expr
		    (loop, DR_BASE_ADDRESS (dr));
		  if ((ivloop && !ivloop0)
		      || (ivloop && ivloop0
			  && flow_loop_nested_p (ivloop, ivloop0)))
		    dr0 = dr;
		}

	      one_misalignment_unknown = true;

	      /* Check for data refs with unsupportable alignment that
	         can be peeled.  */
	      if (!supportable_dr_alignment)
		{
		  one_dr_unsupportable = true;
		  unsupportable_dr = dr;
		}

	      if (!first_store && DR_IS_WRITE (dr))
		first_store = dr;
	    }
	}
      else
	{
	  if (!aligned_access_p (dr))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "vector alignment may not be reachable\n");
	      break;
	    }
	}
    }

  /* Check if we can possibly peel the loop.  */
  if (!vect_can_advance_ivs_p (loop_vinfo)
      || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
      || loop->inner)
    do_peeling = false;

  struct _vect_peel_extended_info peel_for_known_alignment;
  struct _vect_peel_extended_info peel_for_unknown_alignment;
  struct _vect_peel_extended_info best_peel;

  peel_for_unknown_alignment.inside_cost = INT_MAX;
  peel_for_unknown_alignment.outside_cost = INT_MAX;
  peel_for_unknown_alignment.peel_info.count = 0;

  if (do_peeling
      && one_misalignment_unknown)
    {
      /* Check if the target requires to prefer stores over loads, i.e., if
         misaligned stores are more expensive than misaligned loads (taking
         drs with same alignment into account).  */
      unsigned int load_inside_cost = 0;
      unsigned int load_outside_cost = 0;
      unsigned int store_inside_cost = 0;
      unsigned int store_outside_cost = 0;

      stmt_vector_for_cost dummy;
      dummy.create (2);
      vect_get_peeling_costs_all_drs (datarefs, dr0,
				      &load_inside_cost,
				      &load_outside_cost,
				      &dummy, vf / 2, true);
      dummy.release ();

      if (first_store)
	{
	  dummy.create (2);
	  vect_get_peeling_costs_all_drs (datarefs, first_store,
					  &store_inside_cost,
					  &store_outside_cost,
					  &dummy, vf / 2, true);
	  dummy.release ();
	}
      else
	{
	  store_inside_cost = INT_MAX;
	  store_outside_cost = INT_MAX;
	}

      if (load_inside_cost > store_inside_cost
	  || (load_inside_cost == store_inside_cost
	      && load_outside_cost > store_outside_cost))
	{
	  dr0 = first_store;
	  peel_for_unknown_alignment.inside_cost = store_inside_cost;
	  peel_for_unknown_alignment.outside_cost = store_outside_cost;
	}
      else
	{
	  peel_for_unknown_alignment.inside_cost = load_inside_cost;
	  peel_for_unknown_alignment.outside_cost = load_outside_cost;
	}

      stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
      prologue_cost_vec.create (2);
      epilogue_cost_vec.create (2);

      int dummy2;
      peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
	(loop_vinfo, vf / 2, &dummy2,
	 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
	 &prologue_cost_vec, &epilogue_cost_vec);

      prologue_cost_vec.release ();
      epilogue_cost_vec.release ();

      peel_for_unknown_alignment.peel_info.count = 1
	+ STMT_VINFO_SAME_ALIGN_REFS
	    (vinfo_for_stmt (DR_STMT (dr0))).length ();
    }

  peel_for_unknown_alignment.peel_info.npeel = 0;
  peel_for_unknown_alignment.peel_info.dr = dr0;

  best_peel = peel_for_unknown_alignment;

  peel_for_known_alignment.inside_cost = INT_MAX;
  peel_for_known_alignment.outside_cost = INT_MAX;
  peel_for_known_alignment.peel_info.count = 0;
  peel_for_known_alignment.peel_info.dr = NULL;

  if (do_peeling && one_misalignment_known)
    {
      /* Peeling is possible, but there is no data access that is not supported
         unless aligned.  So we try to choose the best possible peeling from
	 the hash table.  */
      peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
	(&peeling_htab, loop_vinfo);
    }

  /* Compare costs of peeling for known and unknown alignment. */
  if (peel_for_known_alignment.peel_info.dr != NULL
      && peel_for_unknown_alignment.inside_cost
      >= peel_for_known_alignment.inside_cost)
    {
      best_peel = peel_for_known_alignment;

      /* If the best peeling for known alignment has NPEEL == 0, perform no
         peeling at all except if there is an unsupportable dr that we can
	 align.  */
      if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
	do_peeling = false;
    }

  /* If there is an unsupportable data ref, prefer this over all choices so far
     since we'd have to discard a chosen peeling except when it accidentally
     aligned the unsupportable data ref.  */
  if (one_dr_unsupportable)
    dr0 = unsupportable_dr;
  else if (do_peeling)
    {
      /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
	 TODO: Use nopeel_outside_cost or get rid of it?  */
      unsigned nopeel_inside_cost = 0;
      unsigned nopeel_outside_cost = 0;

      stmt_vector_for_cost dummy;
      dummy.create (2);
      vect_get_peeling_costs_all_drs (datarefs, NULL, &nopeel_inside_cost,
				      &nopeel_outside_cost, &dummy, 0, false);
      dummy.release ();

      /* Add epilogue costs.  As we do not peel for alignment here, no prologue
	 costs will be recorded.  */
      stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
      prologue_cost_vec.create (2);
      epilogue_cost_vec.create (2);

      int dummy2;
      nopeel_outside_cost += vect_get_known_peeling_cost
	(loop_vinfo, 0, &dummy2,
	 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
	 &prologue_cost_vec, &epilogue_cost_vec);

      prologue_cost_vec.release ();
      epilogue_cost_vec.release ();

      npeel = best_peel.peel_info.npeel;
      dr0 = best_peel.peel_info.dr;

      /* If no peeling is not more expensive than the best peeling we
	 have so far, don't perform any peeling.  */
      if (nopeel_inside_cost <= best_peel.inside_cost)
	do_peeling = false;
    }

  if (do_peeling)
    {
      stmt = DR_STMT (dr0);
      stmt_info = vinfo_for_stmt (stmt);
      vectype = STMT_VINFO_VECTYPE (stmt_info);
      nelements = TYPE_VECTOR_SUBPARTS (vectype);

      if (known_alignment_for_access_p (dr0))
	{
	  bool negative = tree_int_cst_compare (DR_STEP (dr0),
						size_zero_node) < 0;
	  if (!npeel)
	    {
	      /* Since it's known at compile time, compute the number of
		 iterations in the peeled loop (the peeling factor) for use in
		 updating DR_MISALIGNMENT values.  The peeling factor is the
		 vectorization factor minus the misalignment as an element
		 count.  */
	      mis = DR_MISALIGNMENT (dr0);
	      mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
	      npeel = ((negative ? mis - nelements : nelements - mis)
		       & (nelements - 1));
	    }

	  /* For interleaved data access every iteration accesses all the
	     members of the group, therefore we divide the number of iterations
	     by the group size.  */
	  stmt_info = vinfo_for_stmt (DR_STMT (dr0));
	  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
	    npeel /= GROUP_SIZE (stmt_info);

	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Try peeling by %d\n", npeel);
	}

      /* Ensure that all datarefs can be vectorized after the peel.  */
      if (!vect_peeling_supportable (loop_vinfo, dr0, npeel))
	do_peeling = false;

      /* Check if all datarefs are supportable and log.  */
      if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
	{
	  stat = vect_verify_datarefs_alignment (loop_vinfo);
	  if (!stat)
	    do_peeling = false;
	  else
	    return stat;
	}

      /* Cost model #1 - honor --param vect-max-peeling-for-alignment.  */
      if (do_peeling)
	{
	  unsigned max_allowed_peel
	    = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
	  if (max_allowed_peel != (unsigned)-1)
	    {
	      unsigned max_peel = npeel;
	      if (max_peel == 0)
		{
		  gimple *dr_stmt = DR_STMT (dr0);
		  stmt_vec_info vinfo = vinfo_for_stmt (dr_stmt);
		  tree vtype = STMT_VINFO_VECTYPE (vinfo);
		  max_peel = TYPE_VECTOR_SUBPARTS (vtype) - 1;
		}
	      if (max_peel > max_allowed_peel)
		{
		  do_peeling = false;
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_NOTE, vect_location,
			"Disable peeling, max peels reached: %d\n", max_peel);
		}
	    }
	}

      /* Cost model #2 - if peeling may result in a remaining loop not
	 iterating enough to be vectorized then do not peel.  */
      if (do_peeling
	  && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
	{
	  unsigned max_peel
	    = npeel == 0 ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1 : npeel;
	  if (LOOP_VINFO_INT_NITERS (loop_vinfo)
	      < LOOP_VINFO_VECT_FACTOR (loop_vinfo) + max_peel)
	    do_peeling = false;
	}

      if (do_peeling)
	{
	  /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
	     If the misalignment of DR_i is identical to that of dr0 then set
	     DR_MISALIGNMENT (DR_i) to zero.  If the misalignment of DR_i and
	     dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
	     by the peeling factor times the element size of DR_i (MOD the
	     vectorization factor times the size).  Otherwise, the
	     misalignment of DR_i must be set to unknown.  */
	  FOR_EACH_VEC_ELT (datarefs, i, dr)
	    if (dr != dr0)
	      {
		/* Strided accesses perform only component accesses, alignment
		   is irrelevant for them.  */
		stmt_info = vinfo_for_stmt (DR_STMT (dr));
		if (STMT_VINFO_STRIDED_P (stmt_info)
		    && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
		  continue;

		vect_update_misalignment_for_peel (dr, dr0, npeel);
	      }

	  LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
	  if (npeel)
	    LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
	  else
	    LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
	      = DR_MISALIGNMENT (dr0);
	  SET_DR_MISALIGNMENT (dr0, 0);
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_NOTE, vect_location,
			       "Alignment of access forced using peeling.\n");
	      dump_printf_loc (MSG_NOTE, vect_location,
			       "Peeling for alignment will be applied.\n");
	    }

	  /* The inside-loop cost will be accounted for in vectorizable_load
	     and vectorizable_store correctly with adjusted alignments.
	     Drop the body_cst_vec on the floor here.  */
	  stat = vect_verify_datarefs_alignment (loop_vinfo);
	  gcc_assert (stat);
	  return stat;
	}
    }

  /* (2) Versioning to force alignment.  */

  /* Try versioning if:
     1) optimize loop for speed
     2) there is at least one unsupported misaligned data ref with an unknown
        misalignment, and
     3) all misaligned data refs with a known misalignment are supported, and
     4) the number of runtime alignment checks is within reason.  */

  do_versioning =
	optimize_loop_nest_for_speed_p (loop)
	&& (!loop->inner); /* FORNOW */

  if (do_versioning)
    {
      FOR_EACH_VEC_ELT (datarefs, i, dr)
	{
	  stmt = DR_STMT (dr);
	  stmt_info = vinfo_for_stmt (stmt);

	  /* For interleaving, only the alignment of the first access
	     matters.  */
	  if (aligned_access_p (dr)
	      || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
		  && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
	    continue;

	  if (STMT_VINFO_STRIDED_P (stmt_info))
	    {
	      /* Strided loads perform only component accesses, alignment is
		 irrelevant for them.  */
	      if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
		continue;
	      do_versioning = false;
	      break;
	    }

	  supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);

	  if (!supportable_dr_alignment)
	    {
	      gimple *stmt;
	      int mask;
	      tree vectype;

	      if (known_alignment_for_access_p (dr)
		  || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
		     >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
		{
		  do_versioning = false;
		  break;
		}

	      stmt = DR_STMT (dr);
	      vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
	      gcc_assert (vectype);

	      /* The rightmost bits of an aligned address must be zeros.
		 Construct the mask needed for this test.  For example,
		 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
		 mask must be 15 = 0xf. */
	      mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;

	      /* FORNOW: use the same mask to test all potentially unaligned
		 references in the loop.  The vectorizer currently supports
		 a single vector size, see the reference to
		 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
		 vectorization factor is computed.  */
	      gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
			  || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
	      LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
	      LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
							    DR_STMT (dr));
	    }
	}

      /* Versioning requires at least one misaligned data reference.  */
      if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
	do_versioning = false;
      else if (!do_versioning)
	LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
    }

  if (do_versioning)
    {
      vec<gimple *> may_misalign_stmts
	= LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
      gimple *stmt;

      /* It can now be assumed that the data references in the statements
         in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
         of the loop being vectorized.  */
      FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
	{
	  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
	  dr = STMT_VINFO_DATA_REF (stmt_info);
	  SET_DR_MISALIGNMENT (dr, 0);
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Alignment of access forced using versioning.\n");
	}

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Versioning for alignment will be applied.\n");

      /* Peeling and versioning can't be done together at this time.  */
      gcc_assert (! (do_peeling && do_versioning));

      stat = vect_verify_datarefs_alignment (loop_vinfo);
      gcc_assert (stat);
      return stat;
    }

  /* This point is reached if neither peeling nor versioning is being done.  */
  gcc_assert (! (do_peeling || do_versioning));

  stat = vect_verify_datarefs_alignment (loop_vinfo);
  return stat;
}
/* Function vect_find_same_alignment_drs.

   Update group and alignment relations according to the chosen
   vectorization factor.  Records DRA and DRB in each other's
   STMT_VINFO_SAME_ALIGN_REFS when they provably share alignment.  */

static void
vect_find_same_alignment_drs (struct data_dependence_relation *ddr)
{
  struct data_reference *dra = DDR_A (ddr);
  struct data_reference *drb = DDR_B (ddr);
  stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
  stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));

  if (dra == drb)
    return;

  if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
    return;

  /* The refs must share base object, offset and step; otherwise nothing
     can be said about their relative alignment.  */
  if (!operand_equal_p (DR_BASE_OBJECT (dra), DR_BASE_OBJECT (drb),
			OEP_ADDRESS_OF)
      || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
      || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
    return;

  /* Two references with distance zero have the same alignment.  */
  offset_int diff = (wi::to_offset (DR_INIT (dra))
		     - wi::to_offset (DR_INIT (drb)));
  if (diff != 0)
    {
      /* Get the wider of the two alignments.  */
      unsigned int align_a = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmtinfo_a));
      unsigned int align_b = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmtinfo_b));
      unsigned int max_align = MAX (align_a, align_b);

      /* Require the gap to be a multiple of the larger vector alignment.  */
      if (!wi::multiple_of_p (diff, max_align, SIGNED))
	return;
    }

  STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
  STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location,
		       "accesses have the same alignment: ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
      dump_printf (MSG_NOTE,  " and ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
      dump_printf (MSG_NOTE, "\n");
    }
}
/* Function vect_analyze_data_refs_alignment

   Analyze the alignment of the data-references in the loop.
   Return FALSE if a data reference is found that cannot be vectorized.  */

bool
vect_analyze_data_refs_alignment (loop_vec_info vinfo)
{
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "=== vect_analyze_data_refs_alignment ===\n");

  /* Mark groups of data references with same alignment using
     data dependence information.  */
  vec<ddr_p> ddrs = vinfo->ddrs;
  struct data_dependence_relation *ddr;
  unsigned int i;

  FOR_EACH_VEC_ELT (ddrs, i, ddr)
    vect_find_same_alignment_drs (ddr);

  vec<data_reference_p> datarefs = vinfo->datarefs;
  struct data_reference *dr;

  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
      if (STMT_VINFO_VECTORIZABLE (stmt_info)
	  && !vect_compute_data_ref_alignment (dr))
	{
	  /* Strided accesses perform only component accesses, misalignment
	     information is irrelevant for them.  */
	  if (STMT_VINFO_STRIDED_P (stmt_info)
	      && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
	    continue;

	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "not vectorized: can't calculate alignment "
			     "for data ref.\n");

	  return false;
	}
    }

  return true;
}
/* Analyze alignment of DRs of stmts in NODE.  Return false (with a dump
   note) if the alignment is bad for basic-block vectorization.  */

static bool
vect_slp_analyze_and_verify_node_alignment (slp_tree node)
{
  /* We vectorize from the first scalar stmt in the node unless
     the node is permuted in which case we start from the first
     element in the group.  */
  gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
  data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
  if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
    first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));

  data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
  if (! vect_compute_data_ref_alignment (dr)
      /* For creating the data-ref pointer we need alignment of the
	 first element anyway.  */
      || (dr != first_dr
	  && ! vect_compute_data_ref_alignment (first_dr))
      || ! verify_data_ref_alignment (dr))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "not vectorized: bad data alignment in basic "
			 "block.\n");
      return false;
    }

  return true;
}
2170 /* Function vect_slp_analyze_instance_alignment
2172 Analyze the alignment of the data-references in the SLP instance.
2173 Return FALSE if a data reference is found that cannot be vectorized. */
/* NOTE(review): checks alignment for every load node of the instance and,
   when the instance root has a data reference (i.e. a store chain), for the
   root node as well.  Extraction is lossy -- returns/braces missing.  */
2176 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance
)
2178 if (dump_enabled_p ())
2179 dump_printf_loc (MSG_NOTE
, vect_location
,
2180 "=== vect_slp_analyze_and_verify_instance_alignment ===\n");
2184 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance
), i
, node
)
2185 if (! vect_slp_analyze_and_verify_node_alignment (node
))
2188 node
= SLP_INSTANCE_TREE (instance
);
2189 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node
)[0]))
2190 && ! vect_slp_analyze_and_verify_node_alignment
2191 (SLP_INSTANCE_TREE (instance
)))
2198 /* Analyze groups of accesses: check that DR belongs to a group of
2199 accesses of legal size, step, etc. Detect gaps, single element
2200 interleaving, and other special cases. Set grouped access info.
2201 Collect groups of strided stores for further use in SLP analysis.
2202 Worker for vect_analyze_group_access. */
/* NOTE(review): lossy line-shredded extraction -- braces, returns and some
   statements are missing; treat the text below as a reading aid only and
   verify against upstream tree-vect-data-refs.c.  */
2205 vect_analyze_group_access_1 (struct data_reference
*dr
)
2207 tree step
= DR_STEP (dr
);
2208 tree scalar_type
= TREE_TYPE (DR_REF (dr
));
2209 HOST_WIDE_INT type_size
= TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type
));
2210 gimple
*stmt
= DR_STMT (dr
);
2211 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
2212 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
2213 bb_vec_info bb_vinfo
= STMT_VINFO_BB_VINFO (stmt_info
);
2214 HOST_WIDE_INT dr_step
= -1;
2215 HOST_WIDE_INT groupsize
, last_accessed_element
= 1;
2216 bool slp_impossible
= false;
2218 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2219 size of the interleaving group (including gaps). */
2220 if (tree_fits_shwi_p (step
))
2222 dr_step
= tree_to_shwi (step
);
2223 /* Check that STEP is a multiple of type size. Otherwise there is
2224 a non-element-sized gap at the end of the group which we
2225 cannot represent in GROUP_GAP or GROUP_SIZE.
2226 ??? As we can handle non-constant step fine here we should
2227 simply remove uses of GROUP_GAP between the last and first
2228 element and instead rely on DR_STEP. GROUP_SIZE then would
2229 simply not include that gap. */
2230 if ((dr_step
% type_size
) != 0)
2232 if (dump_enabled_p ())
2234 dump_printf_loc (MSG_NOTE
, vect_location
,
2236 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, step
);
2237 dump_printf (MSG_NOTE
,
2238 " is not a multiple of the element size for ");
2239 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dr
));
2240 dump_printf (MSG_NOTE
, "\n");
/* Group size (in elements, gaps included) is the absolute step divided
   by the scalar element size.  */
2244 groupsize
= absu_hwi (dr_step
) / type_size
;
2249 /* Not consecutive access is possible only if it is a part of interleaving. */
2250 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)))
2252 /* Check if it this DR is a part of interleaving, and is a single
2253 element of the group that is accessed in the loop. */
2255 /* Gaps are supported only for loads. STEP must be a multiple of the type
2256 size. The size of the group must be a power of 2. */
2258 && (dr_step
% type_size
) == 0
2260 && pow2p_hwi (groupsize
))
2262 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) = stmt
;
2263 GROUP_SIZE (vinfo_for_stmt (stmt
)) = groupsize
;
2264 GROUP_GAP (stmt_info
) = groupsize
- 1;
2265 if (dump_enabled_p ())
2267 dump_printf_loc (MSG_NOTE
, vect_location
,
2268 "Detected single element interleaving ");
2269 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dr
));
2270 dump_printf (MSG_NOTE
, " step ");
2271 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, step
);
2272 dump_printf (MSG_NOTE
, "\n");
2278 if (dump_enabled_p ())
2280 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2281 "not consecutive access ");
2282 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
2287 /* Mark the statement as unvectorizable. */
2288 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr
))) = false;
2292 dump_printf_loc (MSG_NOTE
, vect_location
, "using strided accesses\n");
2293 STMT_VINFO_STRIDED_P (stmt_info
) = true;
2297 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) == stmt
)
2299 /* First stmt in the interleaving chain. Check the chain. */
2300 gimple
*next
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt
));
2301 struct data_reference
*data_ref
= dr
;
2302 unsigned int count
= 1;
2303 tree prev_init
= DR_INIT (data_ref
);
2304 gimple
*prev
= stmt
;
2305 HOST_WIDE_INT diff
, gaps
= 0;
2309 /* Skip same data-refs. In case that two or more stmts share
2310 data-ref (supported only for loads), we vectorize only the first
2311 stmt, and the rest get their vectorized loads from the first
2313 if (!tree_int_cst_compare (DR_INIT (data_ref
),
2314 DR_INIT (STMT_VINFO_DATA_REF (
2315 vinfo_for_stmt (next
)))))
2317 if (DR_IS_WRITE (data_ref
))
2319 if (dump_enabled_p ())
2320 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2321 "Two store stmts share the same dr.\n")
2325 if (dump_enabled_p ())
2326 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2327 "Two or more load stmts share the same dr.\n");
2329 /* For load use the same data-ref load. */
2330 GROUP_SAME_DR_STMT (vinfo_for_stmt (next
)) = prev
;
2333 next
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (next
));
2338 data_ref
= STMT_VINFO_DATA_REF (vinfo_for_stmt (next
));
2340 /* All group members have the same STEP by construction. */
2341 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref
), step
, 0));
2343 /* Check that the distance between two accesses is equal to the type
2344 size. Otherwise, we have gaps. */
2345 diff
= (TREE_INT_CST_LOW (DR_INIT (data_ref
))
2346 - TREE_INT_CST_LOW (prev_init
)) / type_size
;
2349 /* FORNOW: SLP of accesses with gaps is not supported. */
2350 slp_impossible
= true;
2351 if (DR_IS_WRITE (data_ref
))
2353 if (dump_enabled_p ())
2354 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2355 "interleaved store with gaps\n");
2362 last_accessed_element
+= diff
;
2364 /* Store the gap from the previous member of the group. If there is no
2365 gap in the access, GROUP_GAP is always 1. */
2366 GROUP_GAP (vinfo_for_stmt (next
)) = diff
;
2368 prev_init
= DR_INIT (data_ref
);
2369 next
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (next
));
2370 /* Count the number of data-refs in the chain. */
2375 groupsize
= count
+ gaps
;
2377 /* This could be UINT_MAX but as we are generating code in a very
2378 inefficient way we have to cap earlier. See PR78699 for example. */
2379 if (groupsize
> 4096)
2381 if (dump_enabled_p ())
2382 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2383 "group is too large\n");
2387 /* Check that the size of the interleaving is equal to count for stores,
2388 i.e., that there are no gaps. */
2389 if (groupsize
!= count
2390 && !DR_IS_READ (dr
))
2392 if (dump_enabled_p ())
2393 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2394 "interleaved store with gaps\n");
2398 /* If there is a gap after the last load in the group it is the
2399 difference between the groupsize and the last accessed
2401 When there is no gap, this difference should be 0. */
2402 GROUP_GAP (vinfo_for_stmt (stmt
)) = groupsize
- last_accessed_element
;
2404 GROUP_SIZE (vinfo_for_stmt (stmt
)) = groupsize
;
2405 if (dump_enabled_p ())
2407 dump_printf_loc (MSG_NOTE
, vect_location
,
2408 "Detected interleaving ");
2409 if (DR_IS_READ (dr
))
2410 dump_printf (MSG_NOTE
, "load ");
2412 dump_printf (MSG_NOTE
, "store ");
2413 dump_printf (MSG_NOTE
, "of size %u starting with ",
2414 (unsigned)groupsize
);
2415 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, stmt
, 0);
2416 if (GROUP_GAP (vinfo_for_stmt (stmt
)) != 0)
2417 dump_printf_loc (MSG_NOTE
, vect_location
,
2418 "There is a gap of %u elements after the group\n",
2419 GROUP_GAP (vinfo_for_stmt (stmt
)));
2422 /* SLP: create an SLP data structure for every interleaving group of
2423 stores for further analysis in vect_analyse_slp. */
2424 if (DR_IS_WRITE (dr
) && !slp_impossible
)
2427 LOOP_VINFO_GROUPED_STORES (loop_vinfo
).safe_push (stmt
);
2429 BB_VINFO_GROUPED_STORES (bb_vinfo
).safe_push (stmt
);
2436 /* Analyze groups of accesses: check that DR belongs to a group of
2437 accesses of legal size, step, etc. Detect gaps, single element
2438 interleaving, and other special cases. Set grouped access info.
2439 Collect groups of strided stores for further use in SLP analysis. */
/* NOTE(review): wrapper around vect_analyze_group_access_1; on failure it
   dissolves the group by clearing GROUP_FIRST/NEXT_ELEMENT along the chain.
   Extraction is lossy (loop body and returns are missing).  */
2442 vect_analyze_group_access (struct data_reference
*dr
)
2444 if (!vect_analyze_group_access_1 (dr
))
2446 /* Dissolve the group if present. */
2448 gimple
*stmt
= GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr
)));
2451 stmt_vec_info vinfo
= vinfo_for_stmt (stmt
);
2452 next
= GROUP_NEXT_ELEMENT (vinfo
);
2453 GROUP_FIRST_ELEMENT (vinfo
) = NULL
;
2454 GROUP_NEXT_ELEMENT (vinfo
) = NULL
;
2462 /* Analyze the access pattern of the data-reference DR.
2463 In case of non-consecutive accesses call vect_analyze_group_access() to
2464 analyze groups of accesses. */
/* NOTE(review): lossy extraction -- braces/returns missing; verify control
   flow against upstream tree-vect-data-refs.c.  */
2467 vect_analyze_data_ref_access (struct data_reference
*dr
)
2469 tree step
= DR_STEP (dr
);
2470 tree scalar_type
= TREE_TYPE (DR_REF (dr
));
2471 gimple
*stmt
= DR_STMT (dr
);
2472 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
2473 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
2474 struct loop
*loop
= NULL
;
2477 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
2479 if (loop_vinfo
&& !step
)
2481 if (dump_enabled_p ())
2482 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2483 "bad data-ref access in loop\n");
2487 /* Allow loads with zero step in inner-loop vectorization. */
2488 if (loop_vinfo
&& integer_zerop (step
))
2490 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) = NULL
;
2491 if (!nested_in_vect_loop_p (loop
, stmt
))
2492 return DR_IS_READ (dr
);
2493 /* Allow references with zero step for outer loops marked
2494 with pragma omp simd only - it guarantees absence of
2495 loop-carried dependencies between inner loop iterations. */
2496 if (!loop
->force_vectorize
)
2498 if (dump_enabled_p ())
2499 dump_printf_loc (MSG_NOTE
, vect_location
,
2500 "zero step in inner loop of nest\n");
2505 if (loop
&& nested_in_vect_loop_p (loop
, stmt
))
2507 /* Interleaved accesses are not yet supported within outer-loop
2508 vectorization for references in the inner-loop. */
2509 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) = NULL
;
2511 /* For the rest of the analysis we use the outer-loop step. */
2512 step
= STMT_VINFO_DR_STEP (stmt_info
);
2513 if (integer_zerop (step
))
2515 if (dump_enabled_p ())
2516 dump_printf_loc (MSG_NOTE
, vect_location
,
2517 "zero step in outer loop.\n");
2518 return DR_IS_READ (dr
);
2523 if (TREE_CODE (step
) == INTEGER_CST
)
2525 HOST_WIDE_INT dr_step
= TREE_INT_CST_LOW (step
);
2526 if (!tree_int_cst_compare (step
, TYPE_SIZE_UNIT (scalar_type
))
2528 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type
), -dr_step
)))
2530 /* Mark that it is not interleaving. */
2531 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) = NULL
;
2536 if (loop
&& nested_in_vect_loop_p (loop
, stmt
))
2538 if (dump_enabled_p ())
2539 dump_printf_loc (MSG_NOTE
, vect_location
,
2540 "grouped access in outer loop.\n");
2545 /* Assume this is a DR handled by non-constant strided load case. */
2546 if (TREE_CODE (step
) != INTEGER_CST
)
2547 return (STMT_VINFO_STRIDED_P (stmt_info
)
2548 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info
)
2549 || vect_analyze_group_access (dr
)));
2551 /* Not consecutive access - check if it's a part of interleaving group. */
2552 return vect_analyze_group_access (dr
);
2555 /* Compare two data-references DRA and DRB to group them into chunks
2556 suitable for grouping. */
/* NOTE(review): qsort comparator.  Ordering keys visible below, in order:
   containing loop number, DR_BASE_ADDRESS, DR_OFFSET, read-before-write,
   access size, DR_STEP, DR_INIT, and finally stmt UID as a tiebreak.
   Extraction is lossy (some early-out returns are missing).  */
2559 dr_group_sort_cmp (const void *dra_
, const void *drb_
)
2561 data_reference_p dra
= *(data_reference_p
*)const_cast<void *>(dra_
);
2562 data_reference_p drb
= *(data_reference_p
*)const_cast<void *>(drb_
);
2565 /* Stabilize sort. */
2569 /* DRs in different loops never belong to the same group. */
2570 loop_p loopa
= gimple_bb (DR_STMT (dra
))->loop_father
;
2571 loop_p loopb
= gimple_bb (DR_STMT (drb
))->loop_father
;
2573 return loopa
->num
< loopb
->num
? -1 : 1;
2575 /* Ordering of DRs according to base. */
2576 if (!operand_equal_p (DR_BASE_ADDRESS (dra
), DR_BASE_ADDRESS (drb
), 0))
2578 cmp
= data_ref_compare_tree (DR_BASE_ADDRESS (dra
),
2579 DR_BASE_ADDRESS (drb
));
2584 /* And according to DR_OFFSET. */
2585 if (!dr_equal_offsets_p (dra
, drb
))
2587 cmp
= data_ref_compare_tree (DR_OFFSET (dra
), DR_OFFSET (drb
));
2592 /* Put reads before writes. */
2593 if (DR_IS_READ (dra
) != DR_IS_READ (drb
))
2594 return DR_IS_READ (dra
) ? -1 : 1;
2596 /* Then sort after access size. */
2597 if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra
))),
2598 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb
))), 0))
2600 cmp
= data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra
))),
2601 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb
))));
2606 /* And after step. */
2607 if (!operand_equal_p (DR_STEP (dra
), DR_STEP (drb
), 0))
2609 cmp
= data_ref_compare_tree (DR_STEP (dra
), DR_STEP (drb
));
2614 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2615 cmp
= tree_int_cst_compare (DR_INIT (dra
), DR_INIT (drb
));
2617 return gimple_uid (DR_STMT (dra
)) < gimple_uid (DR_STMT (drb
)) ? -1 : 1;
2621 /* Function vect_analyze_data_ref_accesses.
2623 Analyze the access pattern of all the data references in the loop.
2625 FORNOW: the only access pattern that is considered vectorizable is a
2626 simple step 1 (consecutive) access.
2628 FORNOW: handle only arrays and pointer accesses. */
/* NOTE(review): lossy line-shredded extraction -- braces, returns and
   several statements are missing; verify against upstream before editing.
   Visible structure: sort a copy of the datarefs, build interleaving
   chains over the sorted copy, then analyze each DR's access pattern.  */
2631 vect_analyze_data_ref_accesses (vec_info
*vinfo
)
2634 vec
<data_reference_p
> datarefs
= vinfo
->datarefs
;
2635 struct data_reference
*dr
;
2637 if (dump_enabled_p ())
2638 dump_printf_loc (MSG_NOTE
, vect_location
,
2639 "=== vect_analyze_data_ref_accesses ===\n");
2641 if (datarefs
.is_empty ())
2644 /* Sort the array of datarefs to make building the interleaving chains
2645 linear. Don't modify the original vector's order, it is needed for
2646 determining what dependencies are reversed. */
2647 vec
<data_reference_p
> datarefs_copy
= datarefs
.copy ();
2648 datarefs_copy
.qsort (dr_group_sort_cmp
);
2650 /* Build the interleaving chains. */
2651 for (i
= 0; i
< datarefs_copy
.length () - 1;)
2653 data_reference_p dra
= datarefs_copy
[i
];
2654 stmt_vec_info stmtinfo_a
= vinfo_for_stmt (DR_STMT (dra
));
2655 stmt_vec_info lastinfo
= NULL
;
2656 if (! STMT_VINFO_VECTORIZABLE (stmtinfo_a
))
2661 for (i
= i
+ 1; i
< datarefs_copy
.length (); ++i
)
2663 data_reference_p drb
= datarefs_copy
[i
];
2664 stmt_vec_info stmtinfo_b
= vinfo_for_stmt (DR_STMT (drb
));
2665 if (! STMT_VINFO_VECTORIZABLE (stmtinfo_b
))
2668 /* ??? Imperfect sorting (non-compatible types, non-modulo
2669 accesses, same accesses) can lead to a group to be artificially
2670 split here as we don't just skip over those. If it really
2671 matters we can push those to a worklist and re-iterate
2672 over them. The we can just skip ahead to the next DR here. */
2674 /* DRs in a different loop should not be put into the same
2675 interleaving group. */
2676 if (gimple_bb (DR_STMT (dra
))->loop_father
2677 != gimple_bb (DR_STMT (drb
))->loop_father
)
2680 /* Check that the data-refs have same first location (except init)
2681 and they are both either store or load (not load and store,
2682 not masked loads or stores). */
2683 if (DR_IS_READ (dra
) != DR_IS_READ (drb
)
2684 || !operand_equal_p (DR_BASE_ADDRESS (dra
),
2685 DR_BASE_ADDRESS (drb
), 0)
2686 || !dr_equal_offsets_p (dra
, drb
)
2687 || !gimple_assign_single_p (DR_STMT (dra
))
2688 || !gimple_assign_single_p (DR_STMT (drb
)))
2691 /* Check that the data-refs have the same constant size. */
2692 tree sza
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra
)));
2693 tree szb
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb
)));
2694 if (!tree_fits_uhwi_p (sza
)
2695 || !tree_fits_uhwi_p (szb
)
2696 || !tree_int_cst_equal (sza
, szb
))
2699 /* Check that the data-refs have the same step. */
2700 if (!operand_equal_p (DR_STEP (dra
), DR_STEP (drb
), 0))
2703 /* Do not place the same access in the interleaving chain twice. */
2704 if (tree_int_cst_compare (DR_INIT (dra
), DR_INIT (drb
)) == 0)
2707 /* Check the types are compatible.
2708 ??? We don't distinguish this during sorting. */
2709 if (!types_compatible_p (TREE_TYPE (DR_REF (dra
)),
2710 TREE_TYPE (DR_REF (drb
))))
2713 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2714 HOST_WIDE_INT init_a
= TREE_INT_CST_LOW (DR_INIT (dra
));
2715 HOST_WIDE_INT init_b
= TREE_INT_CST_LOW (DR_INIT (drb
));
2716 gcc_assert (init_a
<= init_b
);
2718 /* If init_b == init_a + the size of the type * k, we have an
2719 interleaving, and DRA is accessed before DRB. */
2720 HOST_WIDE_INT type_size_a
= tree_to_uhwi (sza
);
2721 if (type_size_a
== 0
2722 || (init_b
- init_a
) % type_size_a
!= 0)
2725 /* If we have a store, the accesses are adjacent. This splits
2726 groups into chunks we support (we don't support vectorization
2727 of stores with gaps). */
2728 if (!DR_IS_READ (dra
)
2729 && (init_b
- (HOST_WIDE_INT
) TREE_INT_CST_LOW
2730 (DR_INIT (datarefs_copy
[i
-1]))
2734 /* If the step (if not zero or non-constant) is greater than the
2735 difference between data-refs' inits this splits groups into
2737 if (tree_fits_shwi_p (DR_STEP (dra
)))
2739 HOST_WIDE_INT step
= tree_to_shwi (DR_STEP (dra
));
2740 if (step
!= 0 && step
<= (init_b
- init_a
))
2744 if (dump_enabled_p ())
2746 dump_printf_loc (MSG_NOTE
, vect_location
,
2747 "Detected interleaving ");
2748 if (DR_IS_READ (dra
))
2749 dump_printf (MSG_NOTE
, "load ");
2751 dump_printf (MSG_NOTE
, "store ");
2752 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dra
));
2753 dump_printf (MSG_NOTE
, " and ");
2754 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (drb
));
2755 dump_printf (MSG_NOTE
, "\n");
2758 /* Link the found element into the group list. */
2759 if (!GROUP_FIRST_ELEMENT (stmtinfo_a
))
2761 GROUP_FIRST_ELEMENT (stmtinfo_a
) = DR_STMT (dra
);
2762 lastinfo
= stmtinfo_a
;
2764 GROUP_FIRST_ELEMENT (stmtinfo_b
) = DR_STMT (dra
);
2765 GROUP_NEXT_ELEMENT (lastinfo
) = DR_STMT (drb
);
2766 lastinfo
= stmtinfo_b
;
2770 FOR_EACH_VEC_ELT (datarefs_copy
, i
, dr
)
2771 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr
)))
2772 && !vect_analyze_data_ref_access (dr
))
2774 if (dump_enabled_p ())
2775 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2776 "not vectorized: complicated access pattern.\n");
2778 if (is_a
<bb_vec_info
> (vinfo
))
2780 /* Mark the statement as not vectorizable. */
2781 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr
))) = false;
2786 datarefs_copy
.release ();
2791 datarefs_copy
.release ();
2795 /* Function vect_vfa_segment_size.
2797 Create an expression that computes the size of segment
2798 that will be accessed for a data reference. The functions takes into
2799 account that realignment loads may access one more vector.
2802 DR: The data reference.
2803 LENGTH_FACTOR: segment length to consider.
2805 Return an expression whose value is the size of segment which will be
/* NOTE(review): for a zero-step DR the segment is one scalar element;
   otherwise it is step * length_factor, plus one vector size when the
   access uses dr_explicit_realign_optimized.  Extraction is lossy.  */
2809 vect_vfa_segment_size (struct data_reference
*dr
, tree length_factor
)
2811 tree segment_length
;
2813 if (integer_zerop (DR_STEP (dr
)))
2814 segment_length
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr
)));
2816 segment_length
= size_binop (MULT_EXPR
,
2817 fold_convert (sizetype
, DR_STEP (dr
)),
2818 fold_convert (sizetype
, length_factor
));
2820 if (vect_supportable_dr_alignment (dr
, false)
2821 == dr_explicit_realign_optimized
)
2823 tree vector_size
= TYPE_SIZE_UNIT
2824 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr
))));
2826 segment_length
= size_binop (PLUS_EXPR
, segment_length
, vector_size
);
2828 return segment_length
;
2831 /* Function vect_no_alias_p.
2833 Given data references A and B with equal base and offset, the alias
2834 relation can be decided at compilation time, return TRUE if they do
2835 not alias to each other; return FALSE otherwise. SEGMENT_LENGTH_A
2836 and SEGMENT_LENGTH_B are the memory lengths accessed by A and B
/* NOTE(review): builds [min, max) byte ranges for both segments (shifted by
   one element for negative-step DRs) and reports no-alias when the ranges
   are disjoint.  Extraction is lossy (returns/braces missing).  */
2840 vect_no_alias_p (struct data_reference
*a
, struct data_reference
*b
,
2841 tree segment_length_a
, tree segment_length_b
)
2843 gcc_assert (TREE_CODE (DR_INIT (a
)) == INTEGER_CST
2844 && TREE_CODE (DR_INIT (b
)) == INTEGER_CST
);
2845 if (tree_int_cst_equal (DR_INIT (a
), DR_INIT (b
)))
2848 tree seg_a_min
= DR_INIT (a
);
2849 tree seg_a_max
= fold_build2 (PLUS_EXPR
, TREE_TYPE (seg_a_min
),
2850 seg_a_min
, segment_length_a
);
2851 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
2852 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
2854 if (tree_int_cst_compare (DR_STEP (a
), size_zero_node
) < 0)
2856 tree unit_size
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (a
)));
2857 seg_a_min
= fold_build2 (PLUS_EXPR
, TREE_TYPE (seg_a_max
),
2858 seg_a_max
, unit_size
);
2859 seg_a_max
= fold_build2 (PLUS_EXPR
, TREE_TYPE (DR_INIT (a
)),
2860 DR_INIT (a
), unit_size
);
2862 tree seg_b_min
= DR_INIT (b
);
2863 tree seg_b_max
= fold_build2 (PLUS_EXPR
, TREE_TYPE (seg_b_min
),
2864 seg_b_min
, segment_length_b
);
2865 if (tree_int_cst_compare (DR_STEP (b
), size_zero_node
) < 0)
2867 tree unit_size
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (b
)));
2868 seg_b_min
= fold_build2 (PLUS_EXPR
, TREE_TYPE (seg_b_max
),
2869 seg_b_max
, unit_size
);
2870 seg_b_max
= fold_build2 (PLUS_EXPR
, TREE_TYPE (DR_INIT (b
)),
2871 DR_INIT (b
), unit_size
);
2874 if (tree_int_cst_le (seg_a_max
, seg_b_min
)
2875 || tree_int_cst_le (seg_b_max
, seg_a_min
))
2881 /* Function vect_prune_runtime_alias_test_list.
2883 Prune a list of ddrs to be tested at run-time by versioning for alias.
2884 Merge several alias checks into one if possible.
2885 Return FALSE if resulting list of ddrs is longer then allowed by
2886 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
/* NOTE(review): lossy line-shredded extraction -- braces, returns and some
   statements are missing; verify against upstream tree-vect-data-refs.c.  */
2889 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo
)
2891 vec
<ddr_p
> may_alias_ddrs
=
2892 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo
);
2893 vec
<dr_with_seg_len_pair_t
>& comp_alias_ddrs
=
2894 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo
);
2895 int vect_factor
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
2896 tree scalar_loop_iters
= LOOP_VINFO_NITERS (loop_vinfo
);
2902 if (dump_enabled_p ())
2903 dump_printf_loc (MSG_NOTE
, vect_location
,
2904 "=== vect_prune_runtime_alias_test_list ===\n");
2906 if (may_alias_ddrs
.is_empty ())
2909 comp_alias_ddrs
.create (may_alias_ddrs
.length ());
2911 /* First, we collect all data ref pairs for aliasing checks. */
2912 FOR_EACH_VEC_ELT (may_alias_ddrs
, i
, ddr
)
2915 struct data_reference
*dr_a
, *dr_b
;
2916 gimple
*dr_group_first_a
, *dr_group_first_b
;
2917 tree segment_length_a
, segment_length_b
;
2918 gimple
*stmt_a
, *stmt_b
;
/* For grouped accesses the check is done for the group leader.  */
2921 stmt_a
= DR_STMT (DDR_A (ddr
));
2922 dr_group_first_a
= GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a
));
2923 if (dr_group_first_a
)
2925 stmt_a
= dr_group_first_a
;
2926 dr_a
= STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a
));
2930 stmt_b
= DR_STMT (DDR_B (ddr
));
2931 dr_group_first_b
= GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b
));
2932 if (dr_group_first_b
)
2934 stmt_b
= dr_group_first_b
;
2935 dr_b
= STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b
));
2938 if (!operand_equal_p (DR_STEP (dr_a
), DR_STEP (dr_b
), 0))
2939 length_factor
= scalar_loop_iters
;
2941 length_factor
= size_int (vect_factor
);
2942 segment_length_a
= vect_vfa_segment_size (dr_a
, length_factor
);
2943 segment_length_b
= vect_vfa_segment_size (dr_b
, length_factor
);
2945 comp_res
= data_ref_compare_tree (DR_BASE_ADDRESS (dr_a
),
2946 DR_BASE_ADDRESS (dr_b
));
2948 comp_res
= data_ref_compare_tree (DR_OFFSET (dr_a
),
2951 /* Alias is known at compilation time. */
2953 && TREE_CODE (DR_STEP (dr_a
)) == INTEGER_CST
2954 && TREE_CODE (DR_STEP (dr_b
)) == INTEGER_CST
2955 && TREE_CODE (segment_length_a
) == INTEGER_CST
2956 && TREE_CODE (segment_length_b
) == INTEGER_CST
)
2958 if (vect_no_alias_p (dr_a
, dr_b
, segment_length_a
, segment_length_b
))
2961 if (dump_enabled_p ())
2962 dump_printf_loc (MSG_NOTE
, vect_location
,
2963 "not vectorized: compilation time alias.\n");
2968 dr_with_seg_len_pair_t dr_with_seg_len_pair
2969 (dr_with_seg_len (dr_a
, segment_length_a
),
2970 dr_with_seg_len (dr_b
, segment_length_b
));
2972 /* Canonicalize pairs by sorting the two DR members. */
2974 std::swap (dr_with_seg_len_pair
.first
, dr_with_seg_len_pair
.second
);
2976 comp_alias_ddrs
.safe_push (dr_with_seg_len_pair
);
2979 prune_runtime_alias_test_list (&comp_alias_ddrs
,
2980 (unsigned HOST_WIDE_INT
) vect_factor
);
2981 dump_printf_loc (MSG_NOTE
, vect_location
,
2982 "improved number of alias checks from %d to %d\n",
2983 may_alias_ddrs
.length (), comp_alias_ddrs
.length ());
2984 if ((int) comp_alias_ddrs
.length () >
2985 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS
))
2987 if (dump_enabled_p ())
2988 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2989 "number of versioning for alias "
2990 "run-time tests exceeds %d "
2991 "(--param vect-max-version-for-alias-checks)\n",
2992 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS
));
2996 /* All alias checks have been resolved at compilation time. */
2997 if (!comp_alias_ddrs
.length ())
2998 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo
).truncate (0);
3003 /* Return true if a non-affine read or write in STMT is suitable for a
3004 gather load or scatter store. Describe the operation in *INFO if so. */
/* NOTE(review): lossy line-shredded extraction -- braces, returns, the
   initial assignment of BASE (presumably from DR_REF) and several other
   statements are missing; verify against upstream tree-vect-data-refs.c
   before editing.  */
3007 vect_check_gather_scatter (gimple
*stmt
, loop_vec_info loop_vinfo
,
3008 gather_scatter_info
*info
)
3010 HOST_WIDE_INT scale
= 1, pbitpos
, pbitsize
;
3011 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
3012 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
3013 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
3014 tree offtype
= NULL_TREE
;
3015 tree decl
, base
, off
;
3017 int punsignedp
, reversep
, pvolatilep
= 0;
3020 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3021 see if we can use the def stmt of the address. */
3022 if (is_gimple_call (stmt
)
3023 && gimple_call_internal_p (stmt
)
3024 && (gimple_call_internal_fn (stmt
) == IFN_MASK_LOAD
3025 || gimple_call_internal_fn (stmt
) == IFN_MASK_STORE
)
3026 && TREE_CODE (base
) == MEM_REF
3027 && TREE_CODE (TREE_OPERAND (base
, 0)) == SSA_NAME
3028 && integer_zerop (TREE_OPERAND (base
, 1))
3029 && !expr_invariant_in_loop_p (loop
, TREE_OPERAND (base
, 0)))
3031 gimple
*def_stmt
= SSA_NAME_DEF_STMT (TREE_OPERAND (base
, 0));
3032 if (is_gimple_assign (def_stmt
)
3033 && gimple_assign_rhs_code (def_stmt
) == ADDR_EXPR
)
3034 base
= TREE_OPERAND (gimple_assign_rhs1 (def_stmt
), 0);
3037 /* The gather and scatter builtins need address of the form
3038 loop_invariant + vector * {1, 2, 4, 8}
3040 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3041 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3042 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3043 multiplications and additions in it. To get a vector, we need
3044 a single SSA_NAME that will be defined in the loop and will
3045 contain everything that is not loop invariant and that can be
3046 vectorized. The following code attempts to find such a preexistng
3047 SSA_NAME OFF and put the loop invariants into a tree BASE
3048 that can be gimplified before the loop. */
3049 base
= get_inner_reference (base
, &pbitsize
, &pbitpos
, &off
, &pmode
,
3050 &punsignedp
, &reversep
, &pvolatilep
);
3051 gcc_assert (base
&& (pbitpos
% BITS_PER_UNIT
) == 0 && !reversep
);
3053 if (TREE_CODE (base
) == MEM_REF
)
3055 if (!integer_zerop (TREE_OPERAND (base
, 1)))
3057 if (off
== NULL_TREE
)
3059 offset_int moff
= mem_ref_offset (base
);
3060 off
= wide_int_to_tree (sizetype
, moff
);
3063 off
= size_binop (PLUS_EXPR
, off
,
3064 fold_convert (sizetype
, TREE_OPERAND (base
, 1)));
3066 base
= TREE_OPERAND (base
, 0);
3069 base
= build_fold_addr_expr (base
);
3071 if (off
== NULL_TREE
)
3072 off
= size_zero_node
;
3074 /* If base is not loop invariant, either off is 0, then we start with just
3075 the constant offset in the loop invariant BASE and continue with base
3076 as OFF, otherwise give up.
3077 We could handle that case by gimplifying the addition of base + off
3078 into some SSA_NAME and use that as off, but for now punt. */
3079 if (!expr_invariant_in_loop_p (loop
, base
))
3081 if (!integer_zerop (off
))
3084 base
= size_int (pbitpos
/ BITS_PER_UNIT
);
3086 /* Otherwise put base + constant offset into the loop invariant BASE
3087 and continue with OFF. */
3090 base
= fold_convert (sizetype
, base
);
3091 base
= size_binop (PLUS_EXPR
, base
, size_int (pbitpos
/ BITS_PER_UNIT
));
3094 /* OFF at this point may be either a SSA_NAME or some tree expression
3095 from get_inner_reference. Try to peel off loop invariants from it
3096 into BASE as long as possible. */
3098 while (offtype
== NULL_TREE
)
3100 enum tree_code code
;
3101 tree op0
, op1
, add
= NULL_TREE
;
3103 if (TREE_CODE (off
) == SSA_NAME
)
3105 gimple
*def_stmt
= SSA_NAME_DEF_STMT (off
);
3107 if (expr_invariant_in_loop_p (loop
, off
))
3110 if (gimple_code (def_stmt
) != GIMPLE_ASSIGN
)
3113 op0
= gimple_assign_rhs1 (def_stmt
);
3114 code
= gimple_assign_rhs_code (def_stmt
);
3115 op1
= gimple_assign_rhs2 (def_stmt
);
3119 if (get_gimple_rhs_class (TREE_CODE (off
)) == GIMPLE_TERNARY_RHS
)
3121 code
= TREE_CODE (off
);
3122 extract_ops_from_tree (off
, &code
, &op0
, &op1
);
3126 case POINTER_PLUS_EXPR
:
3128 if (expr_invariant_in_loop_p (loop
, op0
))
3133 add
= fold_convert (sizetype
, add
);
3135 add
= size_binop (MULT_EXPR
, add
, size_int (scale
));
3136 base
= size_binop (PLUS_EXPR
, base
, add
);
3139 if (expr_invariant_in_loop_p (loop
, op1
))
3147 if (expr_invariant_in_loop_p (loop
, op1
))
3149 add
= fold_convert (sizetype
, op1
);
3150 add
= size_binop (MINUS_EXPR
, size_zero_node
, add
);
3156 if (scale
== 1 && tree_fits_shwi_p (op1
))
3158 scale
= tree_to_shwi (op1
);
3167 if (!POINTER_TYPE_P (TREE_TYPE (op0
))
3168 && !INTEGRAL_TYPE_P (TREE_TYPE (op0
)))
3170 if (TYPE_PRECISION (TREE_TYPE (op0
))
3171 == TYPE_PRECISION (TREE_TYPE (off
)))
3176 if (TYPE_PRECISION (TREE_TYPE (op0
))
3177 < TYPE_PRECISION (TREE_TYPE (off
)))
3180 offtype
= TREE_TYPE (off
);
3191 /* If at the end OFF still isn't a SSA_NAME or isn't
3192 defined in the loop, punt. */
3193 if (TREE_CODE (off
) != SSA_NAME
3194 || expr_invariant_in_loop_p (loop
, off
))
3197 if (offtype
== NULL_TREE
)
3198 offtype
= TREE_TYPE (off
);
3200 if (DR_IS_READ (dr
))
3201 decl
= targetm
.vectorize
.builtin_gather (STMT_VINFO_VECTYPE (stmt_info
),
3204 decl
= targetm
.vectorize
.builtin_scatter (STMT_VINFO_VECTYPE (stmt_info
),
3207 if (decl
== NULL_TREE
)
3213 info
->offset_dt
= vect_unknown_def_type
;
3214 info
->offset_vectype
= NULL_TREE
;
3215 info
->scale
= scale
;
3219 /* Function vect_analyze_data_refs.
3221 Find all the data references in the loop or basic block.
3223 The general structure of the analysis of data refs in the vectorizer is as
3225 1- vect_analyze_data_refs(loop/bb): call
3226 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3227 in the loop/bb and their dependences.
3228 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3229 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3230 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3235 vect_analyze_data_refs (vec_info
*vinfo
, int *min_vf
)
3237 struct loop
*loop
= NULL
;
3239 struct data_reference
*dr
;
3242 if (dump_enabled_p ())
3243 dump_printf_loc (MSG_NOTE
, vect_location
,
3244 "=== vect_analyze_data_refs ===\n");
3246 if (loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
))
3247 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
3249 /* Go through the data-refs, check that the analysis succeeded. Update
3250 pointer from stmt_vec_info struct to DR and vectype. */
3252 vec
<data_reference_p
> datarefs
= vinfo
->datarefs
;
3253 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
3256 stmt_vec_info stmt_info
;
3257 tree base
, offset
, init
;
3258 enum { SG_NONE
, GATHER
, SCATTER
} gatherscatter
= SG_NONE
;
3259 bool simd_lane_access
= false;
3263 if (!dr
|| !DR_REF (dr
))
3265 if (dump_enabled_p ())
3266 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3267 "not vectorized: unhandled data-ref\n");
3271 stmt
= DR_STMT (dr
);
3272 stmt_info
= vinfo_for_stmt (stmt
);
3274 /* Discard clobbers from the dataref vector. We will remove
3275 clobber stmts during vectorization. */
3276 if (gimple_clobber_p (stmt
))
3279 if (i
== datarefs
.length () - 1)
3284 datarefs
.ordered_remove (i
);
3289 /* Check that analysis of the data-ref succeeded. */
3290 if (!DR_BASE_ADDRESS (dr
) || !DR_OFFSET (dr
) || !DR_INIT (dr
)
3295 && !TREE_THIS_VOLATILE (DR_REF (dr
))
3296 && targetm
.vectorize
.builtin_gather
!= NULL
;
3299 && !TREE_THIS_VOLATILE (DR_REF (dr
))
3300 && targetm
.vectorize
.builtin_scatter
!= NULL
;
3301 bool maybe_simd_lane_access
3302 = is_a
<loop_vec_info
> (vinfo
) && loop
->simduid
;
3304 /* If target supports vector gather loads or scatter stores, or if
3305 this might be a SIMD lane access, see if they can't be used. */
3306 if (is_a
<loop_vec_info
> (vinfo
)
3307 && (maybe_gather
|| maybe_scatter
|| maybe_simd_lane_access
)
3308 && !nested_in_vect_loop_p (loop
, stmt
))
3310 struct data_reference
*newdr
3311 = create_data_ref (NULL
, loop_containing_stmt (stmt
),
3312 DR_REF (dr
), stmt
, maybe_scatter
? false : true);
3313 gcc_assert (newdr
!= NULL
&& DR_REF (newdr
));
3314 if (DR_BASE_ADDRESS (newdr
)
3315 && DR_OFFSET (newdr
)
3318 && integer_zerop (DR_STEP (newdr
)))
3320 if (maybe_simd_lane_access
)
3322 tree off
= DR_OFFSET (newdr
);
3324 if (TREE_CODE (DR_INIT (newdr
)) == INTEGER_CST
3325 && TREE_CODE (off
) == MULT_EXPR
3326 && tree_fits_uhwi_p (TREE_OPERAND (off
, 1)))
3328 tree step
= TREE_OPERAND (off
, 1);
3329 off
= TREE_OPERAND (off
, 0);
3331 if (CONVERT_EXPR_P (off
)
3332 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off
,
3334 < TYPE_PRECISION (TREE_TYPE (off
)))
3335 off
= TREE_OPERAND (off
, 0);
3336 if (TREE_CODE (off
) == SSA_NAME
)
3338 gimple
*def
= SSA_NAME_DEF_STMT (off
);
3339 tree reft
= TREE_TYPE (DR_REF (newdr
));
3340 if (is_gimple_call (def
)
3341 && gimple_call_internal_p (def
)
3342 && (gimple_call_internal_fn (def
)
3343 == IFN_GOMP_SIMD_LANE
))
3345 tree arg
= gimple_call_arg (def
, 0);
3346 gcc_assert (TREE_CODE (arg
) == SSA_NAME
);
3347 arg
= SSA_NAME_VAR (arg
);
3348 if (arg
== loop
->simduid
3350 && tree_int_cst_equal
3351 (TYPE_SIZE_UNIT (reft
),
3354 DR_OFFSET (newdr
) = ssize_int (0);
3355 DR_STEP (newdr
) = step
;
3356 DR_OFFSET_ALIGNMENT (newdr
)
3357 = BIGGEST_ALIGNMENT
;
3358 DR_STEP_ALIGNMENT (newdr
)
3359 = highest_pow2_factor (step
);
3361 simd_lane_access
= true;
3367 if (!simd_lane_access
&& (maybe_gather
|| maybe_scatter
))
3371 gatherscatter
= GATHER
;
3373 gatherscatter
= SCATTER
;
3376 if (gatherscatter
== SG_NONE
&& !simd_lane_access
)
3377 free_data_ref (newdr
);
3380 if (gatherscatter
== SG_NONE
&& !simd_lane_access
)
3382 if (dump_enabled_p ())
3384 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3385 "not vectorized: data ref analysis "
3387 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3390 if (is_a
<bb_vec_info
> (vinfo
))
3397 if (TREE_CODE (DR_BASE_ADDRESS (dr
)) == INTEGER_CST
)
3399 if (dump_enabled_p ())
3400 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3401 "not vectorized: base addr of dr is a "
3404 if (is_a
<bb_vec_info
> (vinfo
))
3407 if (gatherscatter
!= SG_NONE
|| simd_lane_access
)
3412 if (TREE_THIS_VOLATILE (DR_REF (dr
)))
3414 if (dump_enabled_p ())
3416 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3417 "not vectorized: volatile type ");
3418 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3421 if (is_a
<bb_vec_info
> (vinfo
))
3427 if (stmt_can_throw_internal (stmt
))
3429 if (dump_enabled_p ())
3431 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3432 "not vectorized: statement can throw an "
3434 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3437 if (is_a
<bb_vec_info
> (vinfo
))
3440 if (gatherscatter
!= SG_NONE
|| simd_lane_access
)
3445 if (TREE_CODE (DR_REF (dr
)) == COMPONENT_REF
3446 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr
), 1)))
3448 if (dump_enabled_p ())
3450 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3451 "not vectorized: statement is bitfield "
3453 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3456 if (is_a
<bb_vec_info
> (vinfo
))
3459 if (gatherscatter
!= SG_NONE
|| simd_lane_access
)
3464 base
= unshare_expr (DR_BASE_ADDRESS (dr
));
3465 offset
= unshare_expr (DR_OFFSET (dr
));
3466 init
= unshare_expr (DR_INIT (dr
));
3468 if (is_gimple_call (stmt
)
3469 && (!gimple_call_internal_p (stmt
)
3470 || (gimple_call_internal_fn (stmt
) != IFN_MASK_LOAD
3471 && gimple_call_internal_fn (stmt
) != IFN_MASK_STORE
)))
3473 if (dump_enabled_p ())
3475 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3476 "not vectorized: dr in a call ");
3477 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3480 if (is_a
<bb_vec_info
> (vinfo
))
3483 if (gatherscatter
!= SG_NONE
|| simd_lane_access
)
3488 /* Update DR field in stmt_vec_info struct. */
3490 /* If the dataref is in an inner-loop of the loop that is considered for
3491 for vectorization, we also want to analyze the access relative to
3492 the outer-loop (DR contains information only relative to the
3493 inner-most enclosing loop). We do that by building a reference to the
3494 first location accessed by the inner-loop, and analyze it relative to
3496 if (loop
&& nested_in_vect_loop_p (loop
, stmt
))
3498 /* Build a reference to the first location accessed by the
3499 inner loop: *(BASE + INIT + OFFSET). By construction,
3500 this address must be invariant in the inner loop, so we
3501 can consider it as being used in the outer loop. */
3502 tree init_offset
= fold_build2 (PLUS_EXPR
, TREE_TYPE (offset
),
3504 tree init_addr
= fold_build_pointer_plus (base
, init_offset
);
3505 tree init_ref
= build_fold_indirect_ref (init_addr
);
3507 if (dump_enabled_p ())
3509 dump_printf_loc (MSG_NOTE
, vect_location
,
3510 "analyze in outer loop: ");
3511 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, init_ref
);
3512 dump_printf (MSG_NOTE
, "\n");
3515 if (!dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info
),
3517 /* dr_analyze_innermost already explained the failure. */
3520 if (dump_enabled_p ())
3522 dump_printf_loc (MSG_NOTE
, vect_location
,
3523 "\touter base_address: ");
3524 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3525 STMT_VINFO_DR_BASE_ADDRESS (stmt_info
));
3526 dump_printf (MSG_NOTE
, "\n\touter offset from base address: ");
3527 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3528 STMT_VINFO_DR_OFFSET (stmt_info
));
3529 dump_printf (MSG_NOTE
,
3530 "\n\touter constant offset from base address: ");
3531 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3532 STMT_VINFO_DR_INIT (stmt_info
));
3533 dump_printf (MSG_NOTE
, "\n\touter step: ");
3534 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3535 STMT_VINFO_DR_STEP (stmt_info
));
3536 dump_printf (MSG_NOTE
, "\n\touter base alignment: %d\n",
3537 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info
));
3538 dump_printf (MSG_NOTE
, "\n\touter base misalignment: %d\n",
3539 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info
));
3540 dump_printf (MSG_NOTE
, "\n\touter offset alignment: %d\n",
3541 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info
));
3542 dump_printf (MSG_NOTE
, "\n\touter step alignment: %d\n",
3543 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info
));
3547 if (STMT_VINFO_DATA_REF (stmt_info
))
3549 if (dump_enabled_p ())
3551 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3552 "not vectorized: more than one data ref "
3554 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3557 if (is_a
<bb_vec_info
> (vinfo
))
3560 if (gatherscatter
!= SG_NONE
|| simd_lane_access
)
3565 STMT_VINFO_DATA_REF (stmt_info
) = dr
;
3566 if (simd_lane_access
)
3568 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info
) = true;
3569 free_data_ref (datarefs
[i
]);
3573 if (TREE_CODE (DR_BASE_ADDRESS (dr
)) == ADDR_EXPR
3574 && VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (dr
), 0))
3575 && DECL_NONALIASED (TREE_OPERAND (DR_BASE_ADDRESS (dr
), 0)))
3577 if (dump_enabled_p ())
3579 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3580 "not vectorized: base object not addressable "
3582 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3584 if (is_a
<bb_vec_info
> (vinfo
))
3586 /* In BB vectorization the ref can still participate
3587 in dependence analysis, we just can't vectorize it. */
3588 STMT_VINFO_VECTORIZABLE (stmt_info
) = false;
3594 /* Set vectype for STMT. */
3595 scalar_type
= TREE_TYPE (DR_REF (dr
));
3596 STMT_VINFO_VECTYPE (stmt_info
)
3597 = get_vectype_for_scalar_type (scalar_type
);
3598 if (!STMT_VINFO_VECTYPE (stmt_info
))
3600 if (dump_enabled_p ())
3602 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3603 "not vectorized: no vectype for stmt: ");
3604 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3605 dump_printf (MSG_MISSED_OPTIMIZATION
, " scalar_type: ");
3606 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_DETAILS
,
3608 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
3611 if (is_a
<bb_vec_info
> (vinfo
))
3613 /* No vector type is fine, the ref can still participate
3614 in dependence analysis, we just can't vectorize it. */
3615 STMT_VINFO_VECTORIZABLE (stmt_info
) = false;
3619 if (gatherscatter
!= SG_NONE
|| simd_lane_access
)
3621 STMT_VINFO_DATA_REF (stmt_info
) = NULL
;
3622 if (gatherscatter
!= SG_NONE
)
3629 if (dump_enabled_p ())
3631 dump_printf_loc (MSG_NOTE
, vect_location
,
3632 "got vectype for stmt: ");
3633 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, stmt
, 0);
3634 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3635 STMT_VINFO_VECTYPE (stmt_info
));
3636 dump_printf (MSG_NOTE
, "\n");
3640 /* Adjust the minimal vectorization factor according to the
3642 vf
= TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info
));
3646 if (gatherscatter
!= SG_NONE
)
3648 gather_scatter_info gs_info
;
3649 if (!vect_check_gather_scatter (stmt
, as_a
<loop_vec_info
> (vinfo
),
3651 || !get_vectype_for_scalar_type (TREE_TYPE (gs_info
.offset
)))
3653 STMT_VINFO_DATA_REF (stmt_info
) = NULL
;
3655 if (dump_enabled_p ())
3657 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3658 (gatherscatter
== GATHER
) ?
3659 "not vectorized: not suitable for gather "
3661 "not vectorized: not suitable for scatter "
3663 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3668 free_data_ref (datarefs
[i
]);
3670 STMT_VINFO_GATHER_SCATTER_P (stmt_info
) = gatherscatter
;
3673 else if (is_a
<loop_vec_info
> (vinfo
)
3674 && TREE_CODE (DR_STEP (dr
)) != INTEGER_CST
)
3676 if (nested_in_vect_loop_p (loop
, stmt
))
3678 if (dump_enabled_p ())
3680 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3681 "not vectorized: not suitable for strided "
3683 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3687 STMT_VINFO_STRIDED_P (stmt_info
) = true;
3691 /* If we stopped analysis at the first dataref we could not analyze
3692 when trying to vectorize a basic-block mark the rest of the datarefs
3693 as not vectorizable and truncate the vector of datarefs. That
3694 avoids spending useless time in analyzing their dependence. */
3695 if (i
!= datarefs
.length ())
3697 gcc_assert (is_a
<bb_vec_info
> (vinfo
));
3698 for (unsigned j
= i
; j
< datarefs
.length (); ++j
)
3700 data_reference_p dr
= datarefs
[j
];
3701 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr
))) = false;
3704 datarefs
.truncate (i
);
3711 /* Function vect_get_new_vect_var.
3713 Returns a name for a new variable. The current naming scheme appends the
3714 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
3715 the name of vectorizer generated variables, and appends that to NAME if
3719 vect_get_new_vect_var (tree type
, enum vect_var_kind var_kind
, const char *name
)
3726 case vect_simple_var
:
3729 case vect_scalar_var
:
3735 case vect_pointer_var
:
3744 char* tmp
= concat (prefix
, "_", name
, NULL
);
3745 new_vect_var
= create_tmp_reg (type
, tmp
);
3749 new_vect_var
= create_tmp_reg (type
, prefix
);
3751 return new_vect_var
;
3754 /* Like vect_get_new_vect_var but return an SSA name. */
3757 vect_get_new_ssa_name (tree type
, enum vect_var_kind var_kind
, const char *name
)
3764 case vect_simple_var
:
3767 case vect_scalar_var
:
3770 case vect_pointer_var
:
3779 char* tmp
= concat (prefix
, "_", name
, NULL
);
3780 new_vect_var
= make_temp_ssa_name (type
, NULL
, tmp
);
3784 new_vect_var
= make_temp_ssa_name (type
, NULL
, prefix
);
3786 return new_vect_var
;
3789 /* Duplicate ptr info and set alignment/misaligment on NAME from DR. */
3792 vect_duplicate_ssa_name_ptr_info (tree name
, data_reference
*dr
,
3793 stmt_vec_info stmt_info
)
3795 duplicate_ssa_name_ptr_info (name
, DR_PTR_INFO (dr
));
3796 unsigned int align
= TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmt_info
));
3797 int misalign
= DR_MISALIGNMENT (dr
);
3798 if (misalign
== DR_MISALIGNMENT_UNKNOWN
)
3799 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name
));
3801 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name
), align
, misalign
);
3804 /* Function vect_create_addr_base_for_vector_ref.
3806 Create an expression that computes the address of the first memory location
3807 that will be accessed for a data reference.
3810 STMT: The statement containing the data reference.
3811 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
3812 OFFSET: Optional. If supplied, it is be added to the initial address.
3813 LOOP: Specify relative to which loop-nest should the address be computed.
3814 For example, when the dataref is in an inner-loop nested in an
3815 outer-loop that is now being vectorized, LOOP can be either the
3816 outer-loop, or the inner-loop. The first memory location accessed
3817 by the following dataref ('in' points to short):
3824 if LOOP=i_loop: &in (relative to i_loop)
3825 if LOOP=j_loop: &in+i*2B (relative to j_loop)
3826 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
3827 initial address. Unlike OFFSET, which is number of elements to
3828 be added, BYTE_OFFSET is measured in bytes.
3831 1. Return an SSA_NAME whose value is the address of the memory location of
3832 the first vector of the data reference.
3833 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
3834 these statement(s) which define the returned SSA_NAME.
3836 FORNOW: We are only handling array accesses with step 1. */
3839 vect_create_addr_base_for_vector_ref (gimple
*stmt
,
3840 gimple_seq
*new_stmt_list
,
3844 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
3845 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
3846 const char *base_name
;
3849 gimple_seq seq
= NULL
;
3851 tree step
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr
)));
3852 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
3853 innermost_loop_behavior
*drb
= vect_dr_behavior (dr
);
3855 tree data_ref_base
= unshare_expr (drb
->base_address
);
3856 tree base_offset
= unshare_expr (drb
->offset
);
3857 tree init
= unshare_expr (drb
->init
);
3860 base_name
= get_name (data_ref_base
);
3863 base_offset
= ssize_int (0);
3864 init
= ssize_int (0);
3865 base_name
= get_name (DR_REF (dr
));
3868 /* Create base_offset */
3869 base_offset
= size_binop (PLUS_EXPR
,
3870 fold_convert (sizetype
, base_offset
),
3871 fold_convert (sizetype
, init
));
3875 offset
= fold_build2 (MULT_EXPR
, sizetype
,
3876 fold_convert (sizetype
, offset
), step
);
3877 base_offset
= fold_build2 (PLUS_EXPR
, sizetype
,
3878 base_offset
, offset
);
3882 byte_offset
= fold_convert (sizetype
, byte_offset
);
3883 base_offset
= fold_build2 (PLUS_EXPR
, sizetype
,
3884 base_offset
, byte_offset
);
3887 /* base + base_offset */
3889 addr_base
= fold_build_pointer_plus (data_ref_base
, base_offset
);
3892 addr_base
= build1 (ADDR_EXPR
,
3893 build_pointer_type (TREE_TYPE (DR_REF (dr
))),
3894 unshare_expr (DR_REF (dr
)));
3897 vect_ptr_type
= build_pointer_type (STMT_VINFO_VECTYPE (stmt_info
));
3898 dest
= vect_get_new_vect_var (vect_ptr_type
, vect_pointer_var
, base_name
);
3899 addr_base
= force_gimple_operand (addr_base
, &seq
, true, dest
);
3900 gimple_seq_add_seq (new_stmt_list
, seq
);
3902 if (DR_PTR_INFO (dr
)
3903 && TREE_CODE (addr_base
) == SSA_NAME
3904 && !SSA_NAME_PTR_INFO (addr_base
))
3906 vect_duplicate_ssa_name_ptr_info (addr_base
, dr
, stmt_info
);
3907 if (offset
|| byte_offset
)
3908 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base
));
3911 if (dump_enabled_p ())
3913 dump_printf_loc (MSG_NOTE
, vect_location
, "created ");
3914 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, addr_base
);
3915 dump_printf (MSG_NOTE
, "\n");
3922 /* Function vect_create_data_ref_ptr.
3924 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
3925 location accessed in the loop by STMT, along with the def-use update
3926 chain to appropriately advance the pointer through the loop iterations.
3927 Also set aliasing information for the pointer. This pointer is used by
3928 the callers to this function to create a memory reference expression for
3929 vector load/store access.
3932 1. STMT: a stmt that references memory. Expected to be of the form
3933 GIMPLE_ASSIGN <name, data-ref> or
3934 GIMPLE_ASSIGN <data-ref, name>.
3935 2. AGGR_TYPE: the type of the reference, which should be either a vector
3937 3. AT_LOOP: the loop where the vector memref is to be created.
3938 4. OFFSET (optional): an offset to be added to the initial address accessed
3939 by the data-ref in STMT.
3940 5. BSI: location where the new stmts are to be placed if there is no loop
3941 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
3942 pointing to the initial address.
3943 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
3944 to the initial address accessed by the data-ref in STMT. This is
3945 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
3949 1. Declare a new ptr to vector_type, and have it point to the base of the
3950 data reference (initial addressed accessed by the data reference).
3951 For example, for vector of type V8HI, the following code is generated:
3954 ap = (v8hi *)initial_address;
3956 if OFFSET is not supplied:
3957 initial_address = &a[init];
3958 if OFFSET is supplied:
3959 initial_address = &a[init + OFFSET];
3960 if BYTE_OFFSET is supplied:
3961 initial_address = &a[init] + BYTE_OFFSET;
3963 Return the initial_address in INITIAL_ADDRESS.
3965 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
3966 update the pointer in each iteration of the loop.
3968 Return the increment stmt that updates the pointer in PTR_INCR.
3970 3. Set INV_P to true if the access pattern of the data reference in the
3971 vectorized loop is invariant. Set it to false otherwise.
3973 4. Return the pointer. */
3976 vect_create_data_ref_ptr (gimple
*stmt
, tree aggr_type
, struct loop
*at_loop
,
3977 tree offset
, tree
*initial_address
,
3978 gimple_stmt_iterator
*gsi
, gimple
**ptr_incr
,
3979 bool only_init
, bool *inv_p
, tree byte_offset
)
3981 const char *base_name
;
3982 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
3983 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
3984 struct loop
*loop
= NULL
;
3985 bool nested_in_vect_loop
= false;
3986 struct loop
*containing_loop
= NULL
;
3990 gimple_seq new_stmt_list
= NULL
;
3994 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
3996 gimple_stmt_iterator incr_gsi
;
3998 tree indx_before_incr
, indx_after_incr
;
4001 bb_vec_info bb_vinfo
= STMT_VINFO_BB_VINFO (stmt_info
);
4003 gcc_assert (TREE_CODE (aggr_type
) == ARRAY_TYPE
4004 || TREE_CODE (aggr_type
) == VECTOR_TYPE
);
4008 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
4009 nested_in_vect_loop
= nested_in_vect_loop_p (loop
, stmt
);
4010 containing_loop
= (gimple_bb (stmt
))->loop_father
;
4011 pe
= loop_preheader_edge (loop
);
4015 gcc_assert (bb_vinfo
);
4020 /* Check the step (evolution) of the load in LOOP, and record
4021 whether it's invariant. */
4022 step
= vect_dr_behavior (dr
)->step
;
4023 if (integer_zerop (step
))
4028 /* Create an expression for the first address accessed by this load
4030 base_name
= get_name (DR_BASE_ADDRESS (dr
));
4032 if (dump_enabled_p ())
4034 tree dr_base_type
= TREE_TYPE (DR_BASE_OBJECT (dr
));
4035 dump_printf_loc (MSG_NOTE
, vect_location
,
4036 "create %s-pointer variable to type: ",
4037 get_tree_code_name (TREE_CODE (aggr_type
)));
4038 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, aggr_type
);
4039 if (TREE_CODE (dr_base_type
) == ARRAY_TYPE
)
4040 dump_printf (MSG_NOTE
, " vectorizing an array ref: ");
4041 else if (TREE_CODE (dr_base_type
) == VECTOR_TYPE
)
4042 dump_printf (MSG_NOTE
, " vectorizing a vector ref: ");
4043 else if (TREE_CODE (dr_base_type
) == RECORD_TYPE
)
4044 dump_printf (MSG_NOTE
, " vectorizing a record based array ref: ");
4046 dump_printf (MSG_NOTE
, " vectorizing a pointer ref: ");
4047 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_BASE_OBJECT (dr
));
4048 dump_printf (MSG_NOTE
, "\n");
4051 /* (1) Create the new aggregate-pointer variable.
4052 Vector and array types inherit the alias set of their component
4053 type by default so we need to use a ref-all pointer if the data
4054 reference does not conflict with the created aggregated data
4055 reference because it is not addressable. */
4056 bool need_ref_all
= false;
4057 if (!alias_sets_conflict_p (get_alias_set (aggr_type
),
4058 get_alias_set (DR_REF (dr
))))
4059 need_ref_all
= true;
4060 /* Likewise for any of the data references in the stmt group. */
4061 else if (STMT_VINFO_GROUP_SIZE (stmt_info
) > 1)
4063 gimple
*orig_stmt
= STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info
);
4066 stmt_vec_info sinfo
= vinfo_for_stmt (orig_stmt
);
4067 struct data_reference
*sdr
= STMT_VINFO_DATA_REF (sinfo
);
4068 if (!alias_sets_conflict_p (get_alias_set (aggr_type
),
4069 get_alias_set (DR_REF (sdr
))))
4071 need_ref_all
= true;
4074 orig_stmt
= STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo
);
4078 aggr_ptr_type
= build_pointer_type_for_mode (aggr_type
, ptr_mode
,
4080 aggr_ptr
= vect_get_new_vect_var (aggr_ptr_type
, vect_pointer_var
, base_name
);
4083 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4084 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4085 def-use update cycles for the pointer: one relative to the outer-loop
4086 (LOOP), which is what steps (3) and (4) below do. The other is relative
4087 to the inner-loop (which is the inner-most loop containing the dataref),
4088 and this is done be step (5) below.
4090 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4091 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4092 redundant. Steps (3),(4) create the following:
4095 LOOP: vp1 = phi(vp0,vp2)
4101 If there is an inner-loop nested in loop, then step (5) will also be
4102 applied, and an additional update in the inner-loop will be created:
4105 LOOP: vp1 = phi(vp0,vp2)
4107 inner: vp3 = phi(vp1,vp4)
4108 vp4 = vp3 + inner_step
4114 /* (2) Calculate the initial address of the aggregate-pointer, and set
4115 the aggregate-pointer to point to it before the loop. */
4117 /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */
4119 new_temp
= vect_create_addr_base_for_vector_ref (stmt
, &new_stmt_list
,
4120 offset
, byte_offset
);
4125 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, new_stmt_list
);
4126 gcc_assert (!new_bb
);
4129 gsi_insert_seq_before (gsi
, new_stmt_list
, GSI_SAME_STMT
);
4132 *initial_address
= new_temp
;
4133 aggr_ptr_init
= new_temp
;
4135 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4136 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4137 inner-loop nested in LOOP (during outer-loop vectorization). */
4139 /* No update in loop is required. */
4140 if (only_init
&& (!loop_vinfo
|| at_loop
== loop
))
4141 aptr
= aggr_ptr_init
;
4144 /* The step of the aggregate pointer is the type size. */
4145 tree iv_step
= TYPE_SIZE_UNIT (aggr_type
);
4146 /* One exception to the above is when the scalar step of the load in
4147 LOOP is zero. In this case the step here is also zero. */
4149 iv_step
= size_zero_node
;
4150 else if (tree_int_cst_sgn (step
) == -1)
4151 iv_step
= fold_build1 (NEGATE_EXPR
, TREE_TYPE (iv_step
), iv_step
);
4153 standard_iv_increment_position (loop
, &incr_gsi
, &insert_after
);
4155 create_iv (aggr_ptr_init
,
4156 fold_convert (aggr_ptr_type
, iv_step
),
4157 aggr_ptr
, loop
, &incr_gsi
, insert_after
,
4158 &indx_before_incr
, &indx_after_incr
);
4159 incr
= gsi_stmt (incr_gsi
);
4160 set_vinfo_for_stmt (incr
, new_stmt_vec_info (incr
, loop_vinfo
));
4162 /* Copy the points-to information if it exists. */
4163 if (DR_PTR_INFO (dr
))
4165 vect_duplicate_ssa_name_ptr_info (indx_before_incr
, dr
, stmt_info
);
4166 vect_duplicate_ssa_name_ptr_info (indx_after_incr
, dr
, stmt_info
);
4171 aptr
= indx_before_incr
;
4174 if (!nested_in_vect_loop
|| only_init
)
4178 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4179 nested in LOOP, if exists. */
4181 gcc_assert (nested_in_vect_loop
);
4184 standard_iv_increment_position (containing_loop
, &incr_gsi
,
4186 create_iv (aptr
, fold_convert (aggr_ptr_type
, DR_STEP (dr
)), aggr_ptr
,
4187 containing_loop
, &incr_gsi
, insert_after
, &indx_before_incr
,
4189 incr
= gsi_stmt (incr_gsi
);
4190 set_vinfo_for_stmt (incr
, new_stmt_vec_info (incr
, loop_vinfo
));
4192 /* Copy the points-to information if it exists. */
4193 if (DR_PTR_INFO (dr
))
4195 vect_duplicate_ssa_name_ptr_info (indx_before_incr
, dr
, stmt_info
);
4196 vect_duplicate_ssa_name_ptr_info (indx_after_incr
, dr
, stmt_info
);
4201 return indx_before_incr
;
4208 /* Function bump_vector_ptr
4210 Increment a pointer (to a vector type) by vector-size. If requested,
4211 i.e. if PTR-INCR is given, then also connect the new increment stmt
4212 to the existing def-use update-chain of the pointer, by modifying
4213 the PTR_INCR as illustrated below:
4215 The pointer def-use update-chain before this function:
4216 DATAREF_PTR = phi (p_0, p_2)
4218 PTR_INCR: p_2 = DATAREF_PTR + step
4220 The pointer def-use update-chain after this function:
4221 DATAREF_PTR = phi (p_0, p_2)
4223 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4225 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4228 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4230 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4231 the loop. The increment amount across iterations is expected
4233 BSI - location where the new update stmt is to be placed.
4234 STMT - the original scalar memory-access stmt that is being vectorized.
4235 BUMP - optional. The offset by which to bump the pointer. If not given,
4236 the offset is assumed to be vector_size.
4238 Output: Return NEW_DATAREF_PTR as illustrated above.
4243 bump_vector_ptr (tree dataref_ptr
, gimple
*ptr_incr
, gimple_stmt_iterator
*gsi
,
4244 gimple
*stmt
, tree bump
)
4246 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
4247 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
4248 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
4249 tree update
= TYPE_SIZE_UNIT (vectype
);
4252 use_operand_p use_p
;
4253 tree new_dataref_ptr
;
4258 if (TREE_CODE (dataref_ptr
) == SSA_NAME
)
4259 new_dataref_ptr
= copy_ssa_name (dataref_ptr
);
4261 new_dataref_ptr
= make_ssa_name (TREE_TYPE (dataref_ptr
));
4262 incr_stmt
= gimple_build_assign (new_dataref_ptr
, POINTER_PLUS_EXPR
,
4263 dataref_ptr
, update
);
4264 vect_finish_stmt_generation (stmt
, incr_stmt
, gsi
);
4266 /* Copy the points-to information if it exists. */
4267 if (DR_PTR_INFO (dr
))
4269 duplicate_ssa_name_ptr_info (new_dataref_ptr
, DR_PTR_INFO (dr
));
4270 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr
));
4274 return new_dataref_ptr
;
4276 /* Update the vector-pointer's cross-iteration increment. */
4277 FOR_EACH_SSA_USE_OPERAND (use_p
, ptr_incr
, iter
, SSA_OP_USE
)
4279 tree use
= USE_FROM_PTR (use_p
);
4281 if (use
== dataref_ptr
)
4282 SET_USE (use_p
, new_dataref_ptr
);
4284 gcc_assert (tree_int_cst_compare (use
, update
) == 0);
4287 return new_dataref_ptr
;
4291 /* Function vect_create_destination_var.
4293 Create a new temporary of type VECTYPE. */
4296 vect_create_destination_var (tree scalar_dest
, tree vectype
)
4302 enum vect_var_kind kind
;
4305 ? VECTOR_BOOLEAN_TYPE_P (vectype
)
4309 type
= vectype
? vectype
: TREE_TYPE (scalar_dest
);
4311 gcc_assert (TREE_CODE (scalar_dest
) == SSA_NAME
);
4313 name
= get_name (scalar_dest
);
4315 new_name
= xasprintf ("%s_%u", name
, SSA_NAME_VERSION (scalar_dest
));
4317 new_name
= xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest
));
4318 vec_dest
= vect_get_new_vect_var (type
, kind
, new_name
);
4324 /* Function vect_grouped_store_supported.
4326 Returns TRUE if interleave high and interleave low permutations
4327 are supported, and FALSE otherwise. */
4330 vect_grouped_store_supported (tree vectype
, unsigned HOST_WIDE_INT count
)
4332 machine_mode mode
= TYPE_MODE (vectype
);
4334 /* vect_permute_store_chain requires the group size to be equal to 3 or
4335 be a power of two. */
4336 if (count
!= 3 && exact_log2 (count
) == -1)
4338 if (dump_enabled_p ())
4339 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4340 "the size of the group of accesses"
4341 " is not a power of 2 or not eqaul to 3\n");
4345 /* Check that the permutation is supported. */
4346 if (VECTOR_MODE_P (mode
))
4348 unsigned int i
, nelt
= GET_MODE_NUNITS (mode
);
4349 unsigned char *sel
= XALLOCAVEC (unsigned char, nelt
);
4353 unsigned int j0
= 0, j1
= 0, j2
= 0;
4356 for (j
= 0; j
< 3; j
++)
4358 int nelt0
= ((3 - j
) * nelt
) % 3;
4359 int nelt1
= ((3 - j
) * nelt
+ 1) % 3;
4360 int nelt2
= ((3 - j
) * nelt
+ 2) % 3;
4361 for (i
= 0; i
< nelt
; i
++)
4363 if (3 * i
+ nelt0
< nelt
)
4364 sel
[3 * i
+ nelt0
] = j0
++;
4365 if (3 * i
+ nelt1
< nelt
)
4366 sel
[3 * i
+ nelt1
] = nelt
+ j1
++;
4367 if (3 * i
+ nelt2
< nelt
)
4368 sel
[3 * i
+ nelt2
] = 0;
4370 if (!can_vec_perm_p (mode
, false, sel
))
4372 if (dump_enabled_p ())
4373 dump_printf (MSG_MISSED_OPTIMIZATION
,
4374 "permutaion op not supported by target.\n");
4378 for (i
= 0; i
< nelt
; i
++)
4380 if (3 * i
+ nelt0
< nelt
)
4381 sel
[3 * i
+ nelt0
] = 3 * i
+ nelt0
;
4382 if (3 * i
+ nelt1
< nelt
)
4383 sel
[3 * i
+ nelt1
] = 3 * i
+ nelt1
;
4384 if (3 * i
+ nelt2
< nelt
)
4385 sel
[3 * i
+ nelt2
] = nelt
+ j2
++;
4387 if (!can_vec_perm_p (mode
, false, sel
))
4389 if (dump_enabled_p ())
4390 dump_printf (MSG_MISSED_OPTIMIZATION
,
4391 "permutaion op not supported by target.\n");
4399 /* If length is not equal to 3 then only power of 2 is supported. */
4400 gcc_assert (pow2p_hwi (count
));
4402 for (i
= 0; i
< nelt
/ 2; i
++)
4405 sel
[i
* 2 + 1] = i
+ nelt
;
4407 if (can_vec_perm_p (mode
, false, sel
))
4409 for (i
= 0; i
< nelt
; i
++)
4411 if (can_vec_perm_p (mode
, false, sel
))
4417 if (dump_enabled_p ())
4418 dump_printf (MSG_MISSED_OPTIMIZATION
,
4419 "permutaion op not supported by target.\n");
4424 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4428 vect_store_lanes_supported (tree vectype
, unsigned HOST_WIDE_INT count
)
4430 return vect_lanes_optab_supported_p ("vec_store_lanes",
4431 vec_store_lanes_optab
,
4436 /* Function vect_permute_store_chain.
4438 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4439 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4440 the data correctly for the stores. Return the final references for stores
4443 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4444 The input is 4 vectors each containing 8 elements. We assign a number to
4445 each element, the input sequence is:
4447 1st vec: 0 1 2 3 4 5 6 7
4448 2nd vec: 8 9 10 11 12 13 14 15
4449 3rd vec: 16 17 18 19 20 21 22 23
4450 4th vec: 24 25 26 27 28 29 30 31
4452 The output sequence should be:
4454 1st vec: 0 8 16 24 1 9 17 25
4455 2nd vec: 2 10 18 26 3 11 19 27
4456 3rd vec: 4 12 20 28 5 13 21 30
4457 4th vec: 6 14 22 30 7 15 23 31
4459 i.e., we interleave the contents of the four vectors in their order.
4461 We use interleave_high/low instructions to create such output. The input of
4462 each interleave_high/low operation is two vectors:
4465 the even elements of the result vector are obtained left-to-right from the
4466 high/low elements of the first vector. The odd elements of the result are
4467 obtained left-to-right from the high/low elements of the second vector.
4468 The output of interleave_high will be: 0 4 1 5
4469 and of interleave_low: 2 6 3 7
4472 The permutation is done in log LENGTH stages. In each stage interleave_high
4473 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4474 where the first argument is taken from the first half of DR_CHAIN and the
4475 second argument from it's second half.
4478 I1: interleave_high (1st vec, 3rd vec)
4479 I2: interleave_low (1st vec, 3rd vec)
4480 I3: interleave_high (2nd vec, 4th vec)
4481 I4: interleave_low (2nd vec, 4th vec)
4483 The output for the first stage is:
4485 I1: 0 16 1 17 2 18 3 19
4486 I2: 4 20 5 21 6 22 7 23
4487 I3: 8 24 9 25 10 26 11 27
4488 I4: 12 28 13 29 14 30 15 31
4490 The output of the second stage, i.e. the final result is:
4492 I1: 0 8 16 24 1 9 17 25
4493 I2: 2 10 18 26 3 11 19 27
4494 I3: 4 12 20 28 5 13 21 30
4495 I4: 6 14 22 30 7 15 23 31. */
4498 vect_permute_store_chain (vec
<tree
> dr_chain
,
4499 unsigned int length
,
4501 gimple_stmt_iterator
*gsi
,
4502 vec
<tree
> *result_chain
)
4504 tree vect1
, vect2
, high
, low
;
4506 tree vectype
= STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt
));
4507 tree perm_mask_low
, perm_mask_high
;
4509 tree perm3_mask_low
, perm3_mask_high
;
4510 unsigned int i
, n
, log_length
= exact_log2 (length
);
4511 unsigned int j
, nelt
= TYPE_VECTOR_SUBPARTS (vectype
);
4512 unsigned char *sel
= XALLOCAVEC (unsigned char, nelt
);
4514 result_chain
->quick_grow (length
);
4515 memcpy (result_chain
->address (), dr_chain
.address (),
4516 length
* sizeof (tree
));
4520 unsigned int j0
= 0, j1
= 0, j2
= 0;
4522 for (j
= 0; j
< 3; j
++)
4524 int nelt0
= ((3 - j
) * nelt
) % 3;
4525 int nelt1
= ((3 - j
) * nelt
+ 1) % 3;
4526 int nelt2
= ((3 - j
) * nelt
+ 2) % 3;
4528 for (i
= 0; i
< nelt
; i
++)
4530 if (3 * i
+ nelt0
< nelt
)
4531 sel
[3 * i
+ nelt0
] = j0
++;
4532 if (3 * i
+ nelt1
< nelt
)
4533 sel
[3 * i
+ nelt1
] = nelt
+ j1
++;
4534 if (3 * i
+ nelt2
< nelt
)
4535 sel
[3 * i
+ nelt2
] = 0;
4537 perm3_mask_low
= vect_gen_perm_mask_checked (vectype
, sel
);
4539 for (i
= 0; i
< nelt
; i
++)
4541 if (3 * i
+ nelt0
< nelt
)
4542 sel
[3 * i
+ nelt0
] = 3 * i
+ nelt0
;
4543 if (3 * i
+ nelt1
< nelt
)
4544 sel
[3 * i
+ nelt1
] = 3 * i
+ nelt1
;
4545 if (3 * i
+ nelt2
< nelt
)
4546 sel
[3 * i
+ nelt2
] = nelt
+ j2
++;
4548 perm3_mask_high
= vect_gen_perm_mask_checked (vectype
, sel
);
4550 vect1
= dr_chain
[0];
4551 vect2
= dr_chain
[1];
4553 /* Create interleaving stmt:
4554 low = VEC_PERM_EXPR <vect1, vect2,
4555 {j, nelt, *, j + 1, nelt + j + 1, *,
4556 j + 2, nelt + j + 2, *, ...}> */
4557 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3_low");
4558 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, vect1
,
4559 vect2
, perm3_mask_low
);
4560 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
4563 vect2
= dr_chain
[2];
4564 /* Create interleaving stmt:
4565 low = VEC_PERM_EXPR <vect1, vect2,
4566 {0, 1, nelt + j, 3, 4, nelt + j + 1,
4567 6, 7, nelt + j + 2, ...}> */
4568 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3_high");
4569 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, vect1
,
4570 vect2
, perm3_mask_high
);
4571 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
4572 (*result_chain
)[j
] = data_ref
;
4577 /* If length is not equal to 3 then only power of 2 is supported. */
4578 gcc_assert (pow2p_hwi (length
));
4580 for (i
= 0, n
= nelt
/ 2; i
< n
; i
++)
4583 sel
[i
* 2 + 1] = i
+ nelt
;
4585 perm_mask_high
= vect_gen_perm_mask_checked (vectype
, sel
);
4587 for (i
= 0; i
< nelt
; i
++)
4589 perm_mask_low
= vect_gen_perm_mask_checked (vectype
, sel
);
4591 for (i
= 0, n
= log_length
; i
< n
; i
++)
4593 for (j
= 0; j
< length
/2; j
++)
4595 vect1
= dr_chain
[j
];
4596 vect2
= dr_chain
[j
+length
/2];
4598 /* Create interleaving stmt:
4599 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
4601 high
= make_temp_ssa_name (vectype
, NULL
, "vect_inter_high");
4602 perm_stmt
= gimple_build_assign (high
, VEC_PERM_EXPR
, vect1
,
4603 vect2
, perm_mask_high
);
4604 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
4605 (*result_chain
)[2*j
] = high
;
4607 /* Create interleaving stmt:
4608 low = VEC_PERM_EXPR <vect1, vect2,
4609 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
4611 low
= make_temp_ssa_name (vectype
, NULL
, "vect_inter_low");
4612 perm_stmt
= gimple_build_assign (low
, VEC_PERM_EXPR
, vect1
,
4613 vect2
, perm_mask_low
);
4614 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
4615 (*result_chain
)[2*j
+1] = low
;
4617 memcpy (dr_chain
.address (), result_chain
->address (),
4618 length
* sizeof (tree
));
4623 /* Function vect_setup_realignment
4625 This function is called when vectorizing an unaligned load using
4626 the dr_explicit_realign[_optimized] scheme.
4627 This function generates the following code at the loop prolog:
4630 x msq_init = *(floor(p)); # prolog load
4631 realignment_token = call target_builtin;
4633 x msq = phi (msq_init, ---)
4635 The stmts marked with x are generated only for the case of
4636 dr_explicit_realign_optimized.
4638 The code above sets up a new (vector) pointer, pointing to the first
4639 location accessed by STMT, and a "floor-aligned" load using that pointer.
4640 It also generates code to compute the "realignment-token" (if the relevant
4641 target hook was defined), and creates a phi-node at the loop-header bb
4642 whose arguments are the result of the prolog-load (created by this
4643 function) and the result of a load that takes place in the loop (to be
4644 created by the caller to this function).
4646 For the case of dr_explicit_realign_optimized:
4647 The caller to this function uses the phi-result (msq) to create the
4648 realignment code inside the loop, and sets up the missing phi argument,
4651 msq = phi (msq_init, lsq)
4652 lsq = *(floor(p')); # load in loop
4653 result = realign_load (msq, lsq, realignment_token);
4655 For the case of dr_explicit_realign:
4657 msq = *(floor(p)); # load in loop
4659 lsq = *(floor(p')); # load in loop
4660 result = realign_load (msq, lsq, realignment_token);
4663 STMT - (scalar) load stmt to be vectorized. This load accesses
4664 a memory location that may be unaligned.
4665 BSI - place where new code is to be inserted.
4666 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4670 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4671 target hook, if defined.
4672 Return value - the result of the loop-header phi node. */
4675 vect_setup_realignment (gimple
*stmt
, gimple_stmt_iterator
*gsi
,
4676 tree
*realignment_token
,
4677 enum dr_alignment_support alignment_support_scheme
,
4679 struct loop
**at_loop
)
4681 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
4682 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
4683 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
4684 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
4685 struct loop
*loop
= NULL
;
4687 tree scalar_dest
= gimple_assign_lhs (stmt
);
4693 tree msq_init
= NULL_TREE
;
4696 tree msq
= NULL_TREE
;
4697 gimple_seq stmts
= NULL
;
4699 bool compute_in_loop
= false;
4700 bool nested_in_vect_loop
= false;
4701 struct loop
*containing_loop
= (gimple_bb (stmt
))->loop_father
;
4702 struct loop
*loop_for_initial_load
= NULL
;
4706 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
4707 nested_in_vect_loop
= nested_in_vect_loop_p (loop
, stmt
);
4710 gcc_assert (alignment_support_scheme
== dr_explicit_realign
4711 || alignment_support_scheme
== dr_explicit_realign_optimized
);
4713 /* We need to generate three things:
4714 1. the misalignment computation
4715 2. the extra vector load (for the optimized realignment scheme).
4716 3. the phi node for the two vectors from which the realignment is
4717 done (for the optimized realignment scheme). */
4719 /* 1. Determine where to generate the misalignment computation.
4721 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4722 calculation will be generated by this function, outside the loop (in the
4723 preheader). Otherwise, INIT_ADDR had already been computed for us by the
4724 caller, inside the loop.
4726 Background: If the misalignment remains fixed throughout the iterations of
4727 the loop, then both realignment schemes are applicable, and also the
4728 misalignment computation can be done outside LOOP. This is because we are
4729 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4730 are a multiple of VS (the Vector Size), and therefore the misalignment in
4731 different vectorized LOOP iterations is always the same.
4732 The problem arises only if the memory access is in an inner-loop nested
4733 inside LOOP, which is now being vectorized using outer-loop vectorization.
4734 This is the only case when the misalignment of the memory access may not
4735 remain fixed throughout the iterations of the inner-loop (as explained in
4736 detail in vect_supportable_dr_alignment). In this case, not only is the
4737 optimized realignment scheme not applicable, but also the misalignment
4738 computation (and generation of the realignment token that is passed to
4739 REALIGN_LOAD) have to be done inside the loop.
4741 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4742 or not, which in turn determines if the misalignment is computed inside
4743 the inner-loop, or outside LOOP. */
4745 if (init_addr
!= NULL_TREE
|| !loop_vinfo
)
4747 compute_in_loop
= true;
4748 gcc_assert (alignment_support_scheme
== dr_explicit_realign
);
4752 /* 2. Determine where to generate the extra vector load.
4754 For the optimized realignment scheme, instead of generating two vector
4755 loads in each iteration, we generate a single extra vector load in the
4756 preheader of the loop, and in each iteration reuse the result of the
4757 vector load from the previous iteration. In case the memory access is in
4758 an inner-loop nested inside LOOP, which is now being vectorized using
4759 outer-loop vectorization, we need to determine whether this initial vector
4760 load should be generated at the preheader of the inner-loop, or can be
4761 generated at the preheader of LOOP. If the memory access has no evolution
4762 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
4763 to be generated inside LOOP (in the preheader of the inner-loop). */
4765 if (nested_in_vect_loop
)
4767 tree outerloop_step
= STMT_VINFO_DR_STEP (stmt_info
);
4768 bool invariant_in_outerloop
=
4769 (tree_int_cst_compare (outerloop_step
, size_zero_node
) == 0);
4770 loop_for_initial_load
= (invariant_in_outerloop
? loop
: loop
->inner
);
4773 loop_for_initial_load
= loop
;
4775 *at_loop
= loop_for_initial_load
;
4777 if (loop_for_initial_load
)
4778 pe
= loop_preheader_edge (loop_for_initial_load
);
4780 /* 3. For the case of the optimized realignment, create the first vector
4781 load at the loop preheader. */
4783 if (alignment_support_scheme
== dr_explicit_realign_optimized
)
4785 /* Create msq_init = *(floor(p1)) in the loop preheader */
4788 gcc_assert (!compute_in_loop
);
4789 vec_dest
= vect_create_destination_var (scalar_dest
, vectype
);
4790 ptr
= vect_create_data_ref_ptr (stmt
, vectype
, loop_for_initial_load
,
4791 NULL_TREE
, &init_addr
, NULL
, &inc
,
4793 if (TREE_CODE (ptr
) == SSA_NAME
)
4794 new_temp
= copy_ssa_name (ptr
);
4796 new_temp
= make_ssa_name (TREE_TYPE (ptr
));
4797 new_stmt
= gimple_build_assign
4798 (new_temp
, BIT_AND_EXPR
, ptr
,
4799 build_int_cst (TREE_TYPE (ptr
),
4800 -(HOST_WIDE_INT
)TYPE_ALIGN_UNIT (vectype
)));
4801 new_bb
= gsi_insert_on_edge_immediate (pe
, new_stmt
);
4802 gcc_assert (!new_bb
);
4804 = build2 (MEM_REF
, TREE_TYPE (vec_dest
), new_temp
,
4805 build_int_cst (reference_alias_ptr_type (DR_REF (dr
)), 0));
4806 new_stmt
= gimple_build_assign (vec_dest
, data_ref
);
4807 new_temp
= make_ssa_name (vec_dest
, new_stmt
);
4808 gimple_assign_set_lhs (new_stmt
, new_temp
);
4811 new_bb
= gsi_insert_on_edge_immediate (pe
, new_stmt
);
4812 gcc_assert (!new_bb
);
4815 gsi_insert_before (gsi
, new_stmt
, GSI_SAME_STMT
);
4817 msq_init
= gimple_assign_lhs (new_stmt
);
4820 /* 4. Create realignment token using a target builtin, if available.
4821 It is done either inside the containing loop, or before LOOP (as
4822 determined above). */
4824 if (targetm
.vectorize
.builtin_mask_for_load
)
4829 /* Compute INIT_ADDR - the initial addressed accessed by this memref. */
4832 /* Generate the INIT_ADDR computation outside LOOP. */
4833 init_addr
= vect_create_addr_base_for_vector_ref (stmt
, &stmts
,
4837 pe
= loop_preheader_edge (loop
);
4838 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, stmts
);
4839 gcc_assert (!new_bb
);
4842 gsi_insert_seq_before (gsi
, stmts
, GSI_SAME_STMT
);
4845 builtin_decl
= targetm
.vectorize
.builtin_mask_for_load ();
4846 new_stmt
= gimple_build_call (builtin_decl
, 1, init_addr
);
4848 vect_create_destination_var (scalar_dest
,
4849 gimple_call_return_type (new_stmt
));
4850 new_temp
= make_ssa_name (vec_dest
, new_stmt
);
4851 gimple_call_set_lhs (new_stmt
, new_temp
);
4853 if (compute_in_loop
)
4854 gsi_insert_before (gsi
, new_stmt
, GSI_SAME_STMT
);
4857 /* Generate the misalignment computation outside LOOP. */
4858 pe
= loop_preheader_edge (loop
);
4859 new_bb
= gsi_insert_on_edge_immediate (pe
, new_stmt
);
4860 gcc_assert (!new_bb
);
4863 *realignment_token
= gimple_call_lhs (new_stmt
);
4865 /* The result of the CALL_EXPR to this builtin is determined from
4866 the value of the parameter and no global variables are touched
4867 which makes the builtin a "const" function. Requiring the
4868 builtin to have the "const" attribute makes it unnecessary
4869 to call mark_call_clobbered. */
4870 gcc_assert (TREE_READONLY (builtin_decl
));
4873 if (alignment_support_scheme
== dr_explicit_realign
)
4876 gcc_assert (!compute_in_loop
);
4877 gcc_assert (alignment_support_scheme
== dr_explicit_realign_optimized
);
4880 /* 5. Create msq = phi <msq_init, lsq> in loop */
4882 pe
= loop_preheader_edge (containing_loop
);
4883 vec_dest
= vect_create_destination_var (scalar_dest
, vectype
);
4884 msq
= make_ssa_name (vec_dest
);
4885 phi_stmt
= create_phi_node (msq
, containing_loop
->header
);
4886 add_phi_arg (phi_stmt
, msq_init
, pe
, UNKNOWN_LOCATION
);
4892 /* Function vect_grouped_load_supported.
4894 COUNT is the size of the load group (the number of statements plus the
4895 number of gaps). SINGLE_ELEMENT_P is true if there is actually
4896 only one statement, with a gap of COUNT - 1.
4898 Returns true if a suitable permute exists. */
4901 vect_grouped_load_supported (tree vectype
, bool single_element_p
,
4902 unsigned HOST_WIDE_INT count
)
4904 machine_mode mode
= TYPE_MODE (vectype
);
4906 /* If this is single-element interleaving with an element distance
4907 that leaves unused vector loads around punt - we at least create
4908 very sub-optimal code in that case (and blow up memory,
4910 if (single_element_p
&& count
> TYPE_VECTOR_SUBPARTS (vectype
))
4912 if (dump_enabled_p ())
4913 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4914 "single-element interleaving not supported "
4915 "for not adjacent vector loads\n");
4919 /* vect_permute_load_chain requires the group size to be equal to 3 or
4920 be a power of two. */
4921 if (count
!= 3 && exact_log2 (count
) == -1)
4923 if (dump_enabled_p ())
4924 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4925 "the size of the group of accesses"
4926 " is not a power of 2 or not equal to 3\n");
4930 /* Check that the permutation is supported. */
4931 if (VECTOR_MODE_P (mode
))
4933 unsigned int i
, j
, nelt
= GET_MODE_NUNITS (mode
);
4934 unsigned char *sel
= XALLOCAVEC (unsigned char, nelt
);
4939 for (k
= 0; k
< 3; k
++)
4941 for (i
= 0; i
< nelt
; i
++)
4942 if (3 * i
+ k
< 2 * nelt
)
4946 if (!can_vec_perm_p (mode
, false, sel
))
4948 if (dump_enabled_p ())
4949 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4950 "shuffle of 3 loads is not supported by"
4954 for (i
= 0, j
= 0; i
< nelt
; i
++)
4955 if (3 * i
+ k
< 2 * nelt
)
4958 sel
[i
] = nelt
+ ((nelt
+ k
) % 3) + 3 * (j
++);
4959 if (!can_vec_perm_p (mode
, false, sel
))
4961 if (dump_enabled_p ())
4962 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4963 "shuffle of 3 loads is not supported by"
4972 /* If length is not equal to 3 then only power of 2 is supported. */
4973 gcc_assert (pow2p_hwi (count
));
4974 for (i
= 0; i
< nelt
; i
++)
4976 if (can_vec_perm_p (mode
, false, sel
))
4978 for (i
= 0; i
< nelt
; i
++)
4980 if (can_vec_perm_p (mode
, false, sel
))
4986 if (dump_enabled_p ())
4987 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4988 "extract even/odd not supported by target\n");
4992 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
4996 vect_load_lanes_supported (tree vectype
, unsigned HOST_WIDE_INT count
)
4998 return vect_lanes_optab_supported_p ("vec_load_lanes",
4999 vec_load_lanes_optab
,
5003 /* Function vect_permute_load_chain.
5005 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5006 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5007 the input data correctly. Return the final references for loads in
5010 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5011 The input is 4 vectors each containing 8 elements. We assign a number to each
5012 element, the input sequence is:
5014 1st vec: 0 1 2 3 4 5 6 7
5015 2nd vec: 8 9 10 11 12 13 14 15
5016 3rd vec: 16 17 18 19 20 21 22 23
5017 4th vec: 24 25 26 27 28 29 30 31
5019 The output sequence should be:
5021 1st vec: 0 4 8 12 16 20 24 28
5022 2nd vec: 1 5 9 13 17 21 25 29
5023 3rd vec: 2 6 10 14 18 22 26 30
5024 4th vec: 3 7 11 15 19 23 27 31
5026 i.e., the first output vector should contain the first elements of each
5027 interleaving group, etc.
5029 We use extract_even/odd instructions to create such output. The input of
5030 each extract_even/odd operation is two vectors
5034 and the output is the vector of extracted even/odd elements. The output of
5035 extract_even will be: 0 2 4 6
5036 and of extract_odd: 1 3 5 7
5039 The permutation is done in log LENGTH stages. In each stage extract_even
5040 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5041 their order. In our example,
5043 E1: extract_even (1st vec, 2nd vec)
5044 E2: extract_odd (1st vec, 2nd vec)
5045 E3: extract_even (3rd vec, 4th vec)
5046 E4: extract_odd (3rd vec, 4th vec)
5048 The output for the first stage will be:
5050 E1: 0 2 4 6 8 10 12 14
5051 E2: 1 3 5 7 9 11 13 15
5052 E3: 16 18 20 22 24 26 28 30
5053 E4: 17 19 21 23 25 27 29 31
5055 In order to proceed and create the correct sequence for the next stage (or
5056 for the correct output, if the second stage is the last one, as in our
5057 example), we first put the output of extract_even operation and then the
5058 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5059 The input for the second stage is:
5061 1st vec (E1): 0 2 4 6 8 10 12 14
5062 2nd vec (E3): 16 18 20 22 24 26 28 30
5063 3rd vec (E2): 1 3 5 7 9 11 13 15
5064 4th vec (E4): 17 19 21 23 25 27 29 31
5066 The output of the second stage:
5068 E1: 0 4 8 12 16 20 24 28
5069 E2: 2 6 10 14 18 22 26 30
5070 E3: 1 5 9 13 17 21 25 29
5071 E4: 3 7 11 15 19 23 27 31
5073 And RESULT_CHAIN after reordering:
5075 1st vec (E1): 0 4 8 12 16 20 24 28
5076 2nd vec (E3): 1 5 9 13 17 21 25 29
5077 3rd vec (E2): 2 6 10 14 18 22 26 30
5078 4th vec (E4): 3 7 11 15 19 23 27 31. */
5081 vect_permute_load_chain (vec
<tree
> dr_chain
,
5082 unsigned int length
,
5084 gimple_stmt_iterator
*gsi
,
5085 vec
<tree
> *result_chain
)
5087 tree data_ref
, first_vect
, second_vect
;
5088 tree perm_mask_even
, perm_mask_odd
;
5089 tree perm3_mask_low
, perm3_mask_high
;
5091 tree vectype
= STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt
));
5092 unsigned int i
, j
, log_length
= exact_log2 (length
);
5093 unsigned nelt
= TYPE_VECTOR_SUBPARTS (vectype
);
5094 unsigned char *sel
= XALLOCAVEC (unsigned char, nelt
);
5096 result_chain
->quick_grow (length
);
5097 memcpy (result_chain
->address (), dr_chain
.address (),
5098 length
* sizeof (tree
));
5104 for (k
= 0; k
< 3; k
++)
5106 for (i
= 0; i
< nelt
; i
++)
5107 if (3 * i
+ k
< 2 * nelt
)
5111 perm3_mask_low
= vect_gen_perm_mask_checked (vectype
, sel
);
5113 for (i
= 0, j
= 0; i
< nelt
; i
++)
5114 if (3 * i
+ k
< 2 * nelt
)
5117 sel
[i
] = nelt
+ ((nelt
+ k
) % 3) + 3 * (j
++);
5119 perm3_mask_high
= vect_gen_perm_mask_checked (vectype
, sel
);
5121 first_vect
= dr_chain
[0];
5122 second_vect
= dr_chain
[1];
5124 /* Create interleaving stmt (low part of):
5125 low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5127 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3_low");
5128 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, first_vect
,
5129 second_vect
, perm3_mask_low
);
5130 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5132 /* Create interleaving stmt (high part of):
5133 high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5135 first_vect
= data_ref
;
5136 second_vect
= dr_chain
[2];
5137 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3_high");
5138 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, first_vect
,
5139 second_vect
, perm3_mask_high
);
5140 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5141 (*result_chain
)[k
] = data_ref
;
5146 /* If length is not equal to 3 then only power of 2 is supported. */
5147 gcc_assert (pow2p_hwi (length
));
5149 for (i
= 0; i
< nelt
; ++i
)
5151 perm_mask_even
= vect_gen_perm_mask_checked (vectype
, sel
);
5153 for (i
= 0; i
< nelt
; ++i
)
5155 perm_mask_odd
= vect_gen_perm_mask_checked (vectype
, sel
);
5157 for (i
= 0; i
< log_length
; i
++)
5159 for (j
= 0; j
< length
; j
+= 2)
5161 first_vect
= dr_chain
[j
];
5162 second_vect
= dr_chain
[j
+1];
5164 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5165 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_perm_even");
5166 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5167 first_vect
, second_vect
,
5169 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5170 (*result_chain
)[j
/2] = data_ref
;
5172 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5173 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_perm_odd");
5174 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5175 first_vect
, second_vect
,
5177 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5178 (*result_chain
)[j
/2+length
/2] = data_ref
;
5180 memcpy (dr_chain
.address (), result_chain
->address (),
5181 length
* sizeof (tree
));
5186 /* Function vect_shift_permute_load_chain.
5188 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5189 sequence of stmts to reorder the input data accordingly.
5190 Return the final references for loads in RESULT_CHAIN.
5191 Return true if successed, false otherwise.
5193 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5194 The input is 3 vectors each containing 8 elements. We assign a
5195 number to each element, the input sequence is:
5197 1st vec: 0 1 2 3 4 5 6 7
5198 2nd vec: 8 9 10 11 12 13 14 15
5199 3rd vec: 16 17 18 19 20 21 22 23
5201 The output sequence should be:
5203 1st vec: 0 3 6 9 12 15 18 21
5204 2nd vec: 1 4 7 10 13 16 19 22
5205 3rd vec: 2 5 8 11 14 17 20 23
5207 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5209 First we shuffle all 3 vectors to get correct elements order:
5211 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
5212 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
5213 3rd vec: (16 19 22) (17 20 23) (18 21)
5215 Next we unite and shift vector 3 times:
5218 shift right by 6 the concatenation of:
5219 "1st vec" and "2nd vec"
5220 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5221 "2nd vec" and "3rd vec"
5222 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5223 "3rd vec" and "1st vec"
5224 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
5227 So that now new vectors are:
5229 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
5230 2nd vec: (10 13) (16 19 22) (17 20 23)
5231 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
5234 shift right by 5 the concatenation of:
5235 "1st vec" and "3rd vec"
5236 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
5237 "2nd vec" and "1st vec"
5238 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
5239 "3rd vec" and "2nd vec"
5240 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
5243 So that now new vectors are:
5245 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
5246 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
5247 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
5250 shift right by 5 the concatenation of:
5251 "1st vec" and "1st vec"
5252 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
5253 shift right by 3 the concatenation of:
5254 "2nd vec" and "2nd vec"
5255 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
5258 So that now all vectors are READY:
5259 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
5260 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
5261 3rd vec: ( 1 4 7) (10 13) (16 19 22)
5263 This algorithm is faster than one in vect_permute_load_chain if:
5264 1. "shift of a concatination" is faster than general permutation.
5266 2. The TARGET machine can't execute vector instructions in parallel.
5267 This is because each step of the algorithm depends on previous.
5268 The algorithm in vect_permute_load_chain is much more parallel.
5270 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
5274 vect_shift_permute_load_chain (vec
<tree
> dr_chain
,
5275 unsigned int length
,
5277 gimple_stmt_iterator
*gsi
,
5278 vec
<tree
> *result_chain
)
5280 tree vect
[3], vect_shift
[3], data_ref
, first_vect
, second_vect
;
5281 tree perm2_mask1
, perm2_mask2
, perm3_mask
;
5282 tree select_mask
, shift1_mask
, shift2_mask
, shift3_mask
, shift4_mask
;
5285 tree vectype
= STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt
));
5287 unsigned nelt
= TYPE_VECTOR_SUBPARTS (vectype
);
5288 unsigned char *sel
= XALLOCAVEC (unsigned char, nelt
);
5289 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
5290 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
5292 result_chain
->quick_grow (length
);
5293 memcpy (result_chain
->address (), dr_chain
.address (),
5294 length
* sizeof (tree
));
5296 if (pow2p_hwi (length
) && LOOP_VINFO_VECT_FACTOR (loop_vinfo
) > 4)
5298 unsigned int j
, log_length
= exact_log2 (length
);
5299 for (i
= 0; i
< nelt
/ 2; ++i
)
5301 for (i
= 0; i
< nelt
/ 2; ++i
)
5302 sel
[nelt
/ 2 + i
] = i
* 2 + 1;
5303 if (!can_vec_perm_p (TYPE_MODE (vectype
), false, sel
))
5305 if (dump_enabled_p ())
5306 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5307 "shuffle of 2 fields structure is not \
5308 supported by target\n");
5311 perm2_mask1
= vect_gen_perm_mask_checked (vectype
, sel
);
5313 for (i
= 0; i
< nelt
/ 2; ++i
)
5315 for (i
= 0; i
< nelt
/ 2; ++i
)
5316 sel
[nelt
/ 2 + i
] = i
* 2;
5317 if (!can_vec_perm_p (TYPE_MODE (vectype
), false, sel
))
5319 if (dump_enabled_p ())
5320 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5321 "shuffle of 2 fields structure is not \
5322 supported by target\n");
5325 perm2_mask2
= vect_gen_perm_mask_checked (vectype
, sel
);
5327 /* Generating permutation constant to shift all elements.
5328 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
5329 for (i
= 0; i
< nelt
; i
++)
5330 sel
[i
] = nelt
/ 2 + i
;
5331 if (!can_vec_perm_p (TYPE_MODE (vectype
), false, sel
))
5333 if (dump_enabled_p ())
5334 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5335 "shift permutation is not supported by target\n");
5338 shift1_mask
= vect_gen_perm_mask_checked (vectype
, sel
);
5340 /* Generating permutation constant to select vector from 2.
5341 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
5342 for (i
= 0; i
< nelt
/ 2; i
++)
5344 for (i
= nelt
/ 2; i
< nelt
; i
++)
5346 if (!can_vec_perm_p (TYPE_MODE (vectype
), false, sel
))
5348 if (dump_enabled_p ())
5349 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5350 "select is not supported by target\n");
5353 select_mask
= vect_gen_perm_mask_checked (vectype
, sel
);
5355 for (i
= 0; i
< log_length
; i
++)
5357 for (j
= 0; j
< length
; j
+= 2)
5359 first_vect
= dr_chain
[j
];
5360 second_vect
= dr_chain
[j
+ 1];
5362 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle2");
5363 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5364 first_vect
, first_vect
,
5366 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5369 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle2");
5370 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5371 second_vect
, second_vect
,
5373 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5376 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shift");
5377 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5378 vect
[0], vect
[1], shift1_mask
);
5379 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5380 (*result_chain
)[j
/2 + length
/2] = data_ref
;
5382 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_select");
5383 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5384 vect
[0], vect
[1], select_mask
);
5385 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5386 (*result_chain
)[j
/2] = data_ref
;
5388 memcpy (dr_chain
.address (), result_chain
->address (),
5389 length
* sizeof (tree
));
5393 if (length
== 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo
) > 2)
5395 unsigned int k
= 0, l
= 0;
5397 /* Generating permutation constant to get all elements in rigth order.
5398 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
5399 for (i
= 0; i
< nelt
; i
++)
5401 if (3 * k
+ (l
% 3) >= nelt
)
5404 l
+= (3 - (nelt
% 3));
5406 sel
[i
] = 3 * k
+ (l
% 3);
5409 if (!can_vec_perm_p (TYPE_MODE (vectype
), false, sel
))
5411 if (dump_enabled_p ())
5412 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5413 "shuffle of 3 fields structure is not \
5414 supported by target\n");
5417 perm3_mask
= vect_gen_perm_mask_checked (vectype
, sel
);
5419 /* Generating permutation constant to shift all elements.
5420 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
5421 for (i
= 0; i
< nelt
; i
++)
5422 sel
[i
] = 2 * (nelt
/ 3) + (nelt
% 3) + i
;
5423 if (!can_vec_perm_p (TYPE_MODE (vectype
), false, sel
))
5425 if (dump_enabled_p ())
5426 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5427 "shift permutation is not supported by target\n");
5430 shift1_mask
= vect_gen_perm_mask_checked (vectype
, sel
);
5432 /* Generating permutation constant to shift all elements.
5433 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5434 for (i
= 0; i
< nelt
; i
++)
5435 sel
[i
] = 2 * (nelt
/ 3) + 1 + i
;
5436 if (!can_vec_perm_p (TYPE_MODE (vectype
), false, sel
))
5438 if (dump_enabled_p ())
5439 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5440 "shift permutation is not supported by target\n");
5443 shift2_mask
= vect_gen_perm_mask_checked (vectype
, sel
);
5445 /* Generating permutation constant to shift all elements.
5446 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
5447 for (i
= 0; i
< nelt
; i
++)
5448 sel
[i
] = (nelt
/ 3) + (nelt
% 3) / 2 + i
;
5449 if (!can_vec_perm_p (TYPE_MODE (vectype
), false, sel
))
5451 if (dump_enabled_p ())
5452 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5453 "shift permutation is not supported by target\n");
5456 shift3_mask
= vect_gen_perm_mask_checked (vectype
, sel
);
5458 /* Generating permutation constant to shift all elements.
5459 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5460 for (i
= 0; i
< nelt
; i
++)
5461 sel
[i
] = 2 * (nelt
/ 3) + (nelt
% 3) / 2 + i
;
5462 if (!can_vec_perm_p (TYPE_MODE (vectype
), false, sel
))
5464 if (dump_enabled_p ())
5465 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5466 "shift permutation is not supported by target\n");
5469 shift4_mask
= vect_gen_perm_mask_checked (vectype
, sel
);
5471 for (k
= 0; k
< 3; k
++)
5473 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3");
5474 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5475 dr_chain
[k
], dr_chain
[k
],
5477 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5481 for (k
= 0; k
< 3; k
++)
5483 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shift1");
5484 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5485 vect
[k
% 3], vect
[(k
+ 1) % 3],
5487 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5488 vect_shift
[k
] = data_ref
;
5491 for (k
= 0; k
< 3; k
++)
5493 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shift2");
5494 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5495 vect_shift
[(4 - k
) % 3],
5496 vect_shift
[(3 - k
) % 3],
5498 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5502 (*result_chain
)[3 - (nelt
% 3)] = vect
[2];
5504 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shift3");
5505 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, vect
[0],
5506 vect
[0], shift3_mask
);
5507 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5508 (*result_chain
)[nelt
% 3] = data_ref
;
5510 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shift4");
5511 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, vect
[1],
5512 vect
[1], shift4_mask
);
5513 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5514 (*result_chain
)[0] = data_ref
;
5520 /* Function vect_transform_grouped_load.
5522 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5523 to perform their permutation and ascribe the result vectorized statements to
5524 the scalar statements.
5528 vect_transform_grouped_load (gimple
*stmt
, vec
<tree
> dr_chain
, int size
,
5529 gimple_stmt_iterator
*gsi
)
5532 vec
<tree
> result_chain
= vNULL
;
5534 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5535 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
5536 vectors, that are ready for vector computation. */
5537 result_chain
.create (size
);
5539 /* If reassociation width for vector type is 2 or greater target machine can
5540 execute 2 or more vector instructions in parallel. Otherwise try to
5541 get chain for loads group using vect_shift_permute_load_chain. */
5542 mode
= TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt
)));
5543 if (targetm
.sched
.reassociation_width (VEC_PERM_EXPR
, mode
) > 1
5545 || !vect_shift_permute_load_chain (dr_chain
, size
, stmt
,
5546 gsi
, &result_chain
))
5547 vect_permute_load_chain (dr_chain
, size
, stmt
, gsi
, &result_chain
);
5548 vect_record_grouped_load_vectors (stmt
, result_chain
);
5549 result_chain
.release ();
5552 /* RESULT_CHAIN contains the output of a group of grouped loads that were
5553 generated as part of the vectorization of STMT. Assign the statement
5554 for each vector to the associated scalar statement. */
5557 vect_record_grouped_load_vectors (gimple
*stmt
, vec
<tree
> result_chain
)
5559 gimple
*first_stmt
= GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
));
5560 gimple
*next_stmt
, *new_stmt
;
5561 unsigned int i
, gap_count
;
5564 /* Put a permuted data-ref in the VECTORIZED_STMT field.
5565 Since we scan the chain starting from it's first node, their order
5566 corresponds the order of data-refs in RESULT_CHAIN. */
5567 next_stmt
= first_stmt
;
5569 FOR_EACH_VEC_ELT (result_chain
, i
, tmp_data_ref
)
5574 /* Skip the gaps. Loads created for the gaps will be removed by dead
5575 code elimination pass later. No need to check for the first stmt in
5576 the group, since it always exists.
5577 GROUP_GAP is the number of steps in elements from the previous
5578 access (if there is no gap GROUP_GAP is 1). We skip loads that
5579 correspond to the gaps. */
5580 if (next_stmt
!= first_stmt
5581 && gap_count
< GROUP_GAP (vinfo_for_stmt (next_stmt
)))
5589 new_stmt
= SSA_NAME_DEF_STMT (tmp_data_ref
);
5590 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5591 copies, and we put the new vector statement in the first available
5593 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt
)))
5594 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt
)) = new_stmt
;
5597 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt
)))
5600 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt
));
5602 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt
));
5605 prev_stmt
= rel_stmt
;
5607 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt
));
5610 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt
)) =
5615 next_stmt
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt
));
5617 /* If NEXT_STMT accesses the same DR as the previous statement,
5618 put the same TMP_DATA_REF as its vectorized statement; otherwise
5619 get the next data-ref from RESULT_CHAIN. */
5620 if (!next_stmt
|| !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt
)))
5626 /* Function vect_force_dr_alignment_p.
5628 Returns whether the alignment of a DECL can be forced to be aligned
5629 on ALIGNMENT bit boundary. */
5632 vect_can_force_dr_alignment_p (const_tree decl
, unsigned int alignment
)
5637 if (decl_in_symtab_p (decl
)
5638 && !symtab_node::get (decl
)->can_increase_alignment_p ())
5641 if (TREE_STATIC (decl
))
5642 return (alignment
<= MAX_OFILE_ALIGNMENT
);
5644 return (alignment
<= MAX_STACK_ALIGNMENT
);
5648 /* Return whether the data reference DR is supported with respect to its
5650 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
5651 it is aligned, i.e., check if it is possible to vectorize it with different
5654 enum dr_alignment_support
5655 vect_supportable_dr_alignment (struct data_reference
*dr
,
5656 bool check_aligned_accesses
)
5658 gimple
*stmt
= DR_STMT (dr
);
5659 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
5660 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
5661 machine_mode mode
= TYPE_MODE (vectype
);
5662 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
5663 struct loop
*vect_loop
= NULL
;
5664 bool nested_in_vect_loop
= false;
5666 if (aligned_access_p (dr
) && !check_aligned_accesses
)
5669 /* For now assume all conditional loads/stores support unaligned
5670 access without any special code. */
5671 if (is_gimple_call (stmt
)
5672 && gimple_call_internal_p (stmt
)
5673 && (gimple_call_internal_fn (stmt
) == IFN_MASK_LOAD
5674 || gimple_call_internal_fn (stmt
) == IFN_MASK_STORE
))
5675 return dr_unaligned_supported
;
5679 vect_loop
= LOOP_VINFO_LOOP (loop_vinfo
);
5680 nested_in_vect_loop
= nested_in_vect_loop_p (vect_loop
, stmt
);
5683 /* Possibly unaligned access. */
5685 /* We can choose between using the implicit realignment scheme (generating
5686 a misaligned_move stmt) and the explicit realignment scheme (generating
5687 aligned loads with a REALIGN_LOAD). There are two variants to the
5688 explicit realignment scheme: optimized, and unoptimized.
5689 We can optimize the realignment only if the step between consecutive
5690 vector loads is equal to the vector size. Since the vector memory
5691 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
5692 is guaranteed that the misalignment amount remains the same throughout the
5693 execution of the vectorized loop. Therefore, we can create the
5694 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
5695 at the loop preheader.
5697 However, in the case of outer-loop vectorization, when vectorizing a
5698 memory access in the inner-loop nested within the LOOP that is now being
5699 vectorized, while it is guaranteed that the misalignment of the
5700 vectorized memory access will remain the same in different outer-loop
5701 iterations, it is *not* guaranteed that is will remain the same throughout
5702 the execution of the inner-loop. This is because the inner-loop advances
5703 with the original scalar step (and not in steps of VS). If the inner-loop
5704 step happens to be a multiple of VS, then the misalignment remains fixed
5705 and we can use the optimized realignment scheme. For example:
5711 When vectorizing the i-loop in the above example, the step between
5712 consecutive vector loads is 1, and so the misalignment does not remain
5713 fixed across the execution of the inner-loop, and the realignment cannot
5714 be optimized (as illustrated in the following pseudo vectorized loop):
5716 for (i=0; i<N; i+=4)
5717 for (j=0; j<M; j++){
5718 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
5719 // when j is {0,1,2,3,4,5,6,7,...} respectively.
5720 // (assuming that we start from an aligned address).
5723 We therefore have to use the unoptimized realignment scheme:
5725 for (i=0; i<N; i+=4)
5726 for (j=k; j<M; j+=4)
5727 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
5728 // that the misalignment of the initial address is
5731 The loop can then be vectorized as follows:
5733 for (k=0; k<4; k++){
5734 rt = get_realignment_token (&vp[k]);
5735 for (i=0; i<N; i+=4){
5737 for (j=k; j<M; j+=4){
5739 va = REALIGN_LOAD <v1,v2,rt>;
5746 if (DR_IS_READ (dr
))
5748 bool is_packed
= false;
5749 tree type
= (TREE_TYPE (DR_REF (dr
)));
5751 if (optab_handler (vec_realign_load_optab
, mode
) != CODE_FOR_nothing
5752 && (!targetm
.vectorize
.builtin_mask_for_load
5753 || targetm
.vectorize
.builtin_mask_for_load ()))
5755 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
5757 /* If we are doing SLP then the accesses need not have the
5758 same alignment, instead it depends on the SLP group size. */
5760 && STMT_SLP_TYPE (stmt_info
)
5761 && (LOOP_VINFO_VECT_FACTOR (loop_vinfo
)
5762 * GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info
)))
5763 % TYPE_VECTOR_SUBPARTS (vectype
) != 0))
5765 else if (!loop_vinfo
5766 || (nested_in_vect_loop
5767 && (TREE_INT_CST_LOW (DR_STEP (dr
))
5768 != GET_MODE_SIZE (TYPE_MODE (vectype
)))))
5769 return dr_explicit_realign
;
5771 return dr_explicit_realign_optimized
;
5773 if (!known_alignment_for_access_p (dr
))
5774 is_packed
= not_size_aligned (DR_REF (dr
));
5776 if (targetm
.vectorize
.support_vector_misalignment
5777 (mode
, type
, DR_MISALIGNMENT (dr
), is_packed
))
5778 /* Can't software pipeline the loads, but can at least do them. */
5779 return dr_unaligned_supported
;
5783 bool is_packed
= false;
5784 tree type
= (TREE_TYPE (DR_REF (dr
)));
5786 if (!known_alignment_for_access_p (dr
))
5787 is_packed
= not_size_aligned (DR_REF (dr
));
5789 if (targetm
.vectorize
.support_vector_misalignment
5790 (mode
, type
, DR_MISALIGNMENT (dr
), is_packed
))
5791 return dr_unaligned_supported
;
5795 return dr_unaligned_unsupported
;