/* Data References Analysis and Manipulation Utilities for Vectorization.
   Copyright (C) 2003-2016 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
#include "coretypes.h"
#include "optabs-tree.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "gimple-iterator.h"
#include "gimplify-me.h"
#include "tree-ssa-loop-ivopts.h"
#include "tree-ssa-loop-manip.h"
#include "tree-ssa-loop.h"
#include "tree-scalar-evolution.h"
#include "tree-vectorizer.h"
/* Return true if load- or store-lanes optab OPTAB is implemented for
   COUNT vectors of type VECTYPE.  NAME is the name of OPTAB.  */

static bool
vect_lanes_optab_supported_p (const char *name, convert_optab optab,
			      tree vectype, unsigned HOST_WIDE_INT count)
{
  machine_mode mode, array_mode;
  bool limit_p;

  mode = TYPE_MODE (vectype);
  limit_p = !targetm.array_mode_supported_p (mode, count);
  array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode),
			      MODE_INT, limit_p);

  if (array_mode == BLKmode)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
			 GET_MODE_NAME (mode), count);
      return false;
    }

  if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "cannot use %s<%s><%s>\n", name,
			 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
      return false;
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
		     GET_MODE_NAME (mode));

  return true;
}
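
/* Illustrative usage sketch (editorial; the wrapper functions named here are
   assumed to exist later in this file, as in the upstream sources): a query
   such as

     vect_lanes_optab_supported_p ("vec_load_lanes", vec_load_lanes_optab,
				   vectype, count)

   asks whether the target provides a load-lanes instruction covering COUNT
   vectors of VECTYPE, and is the expected form of the
   vect_load_lanes_supported / vect_store_lanes_supported helpers.  */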
/* Return the smallest scalar part of STMT.
   This is used to determine the vectype of the stmt.  We generally set the
   vectype according to the type of the result (lhs).  For stmts whose
   result-type is different than the type of the arguments (e.g., demotion,
   promotion), vectype will be reset appropriately (later).  Note that we have
   to visit the smallest datatype in this function, because that determines the
   VF.  If the smallest datatype in the loop is present only as the rhs of a
   promotion operation - we'd miss it.
   Such a case, where a variable of this datatype does not appear in the lhs
   anywhere in the loop, can only occur if it's an invariant: e.g.:
   'int_x = (int) short_inv', which we'd expect to have been optimized away by
   invariant motion.  However, we cannot rely on invariant motion to always
   take invariants out of the loop, and so in the case of promotion we also
   have to check the rhs.
   LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
   types.  */

tree
vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
			       HOST_WIDE_INT *rhs_size_unit)
{
  tree scalar_type = gimple_expr_type (stmt);
  HOST_WIDE_INT lhs, rhs;

  lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));

  if (is_gimple_assign (stmt)
      && (gimple_assign_cast_p (stmt)
	  || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
	  || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
	  || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
    {
      tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));

      rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
      if (rhs < lhs)
	scalar_type = rhs_type;
    }

  *lhs_size_unit = lhs;
  *rhs_size_unit = rhs;
  return scalar_type;
}
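
/* Illustrative example (not part of the original sources): for a widening
   loop such as

     short *s; int *d;
     for (i = 0; i < n; i++)
       d[i] = (int) s[i];

   the statement type (lhs) is 'int' but the smallest scalar type is the
   2-byte 'short' on the rhs, so *rhs_size_unit is 2 and the vectorization
   factor is derived from the smaller element.  */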
/* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
   tested at run-time.  Return TRUE if DDR was successfully inserted.
   Return false if versioning is not supported.  */

static bool
vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

  if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
    return false;

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location,
		       "mark for run-time aliasing test between ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
      dump_printf (MSG_NOTE, " and ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
      dump_printf (MSG_NOTE, "\n");
    }

  if (optimize_loop_nest_for_size_p (loop))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "versioning not supported when optimizing"
			 " for size.\n");
      return false;
    }

  /* FORNOW: We don't support versioning with outer-loop vectorization.  */
  if (loop->inner)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "versioning not yet supported for outer-loops.\n");
      return false;
    }

  /* FORNOW: We don't support creating runtime alias tests for non-constant
     step.  */
  if (TREE_CODE (DR_STEP (DDR_A (ddr))) != INTEGER_CST
      || TREE_CODE (DR_STEP (DDR_B (ddr))) != INTEGER_CST)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "versioning not yet supported for non-constant "
			 "step\n");
      return false;
    }

  LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
  return true;
}
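
/* Sketch of the idea behind the run-time test (illustrative only): for two
   data-refs A and B recorded here, the versioned loop is guarded by a check
   roughly of the form

     if (&A[0] + n * step_A <= &B[0] || &B[0] + n * step_B <= &A[0])
       ... vectorized loop ...
     else
       ... scalar loop ...

   i.e. the accessed ranges must not overlap; the actual condition is built
   later from LOOP_VINFO_MAY_ALIAS_DDRS.  */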
/* Function vect_analyze_data_ref_dependence.

   Return TRUE if there (might) exist a dependence between a memory-reference
   DRA and a memory-reference DRB.  When versioning for alias may check a
   dependence at run-time, return FALSE.  Adjust *MAX_VF according to
   the data dependence.  */

static bool
vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
				  loop_vec_info loop_vinfo, int *max_vf)
{
  unsigned int i;
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  struct data_reference *dra = DDR_A (ddr);
  struct data_reference *drb = DDR_B (ddr);
  stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
  stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
  lambda_vector dist_v;
  unsigned int loop_depth;

  /* In loop analysis all data references should be vectorizable.  */
  if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
      || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
    return false;

  /* Independent data accesses.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
    return false;

  if (dra == drb
      || (DR_IS_READ (dra) && DR_IS_READ (drb)))
    return false;

  /* We do not have to consider dependences between accesses that belong
     to the same group.  */
  if (GROUP_FIRST_ELEMENT (stmtinfo_a)
      && GROUP_FIRST_ELEMENT (stmtinfo_a) == GROUP_FIRST_ELEMENT (stmtinfo_b))
    return false;

  /* Even if we have an anti-dependence then, as the vectorized loop covers at
     least two scalar iterations, there is always also a true dependence.
     As the vectorizer does not re-order loads and stores we can ignore
     the anti-dependence if TBAA can disambiguate both DRs similar to the
     case with known negative distance anti-dependences (positive
     distance anti-dependences would violate TBAA constraints).  */
  if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
       || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
      && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
				 get_alias_set (DR_REF (drb))))
    return false;

  /* Unknown data dependence.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
    {
      /* If user asserted safelen consecutive iterations can be
	 executed concurrently, assume independence.  */
      if (loop->safelen >= 2)
	{
	  if (loop->safelen < *max_vf)
	    *max_vf = loop->safelen;
	  LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
	  return false;
	}

      if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
	  || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
	{
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			       "versioning for alias not supported for: "
			       "can't determine dependence between ");
	      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
				 DR_REF (dra));
	      dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
	      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
				 DR_REF (drb));
	      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
	    }
	  return true;
	}

      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			   "versioning for alias required: "
			   "can't determine dependence between ");
	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
			     DR_REF (dra));
	  dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
			     DR_REF (drb));
	  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
	}

      /* Add to list of ddrs that need to be tested at run-time.  */
      return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
    }

  /* Known data dependence.  */
  if (DDR_NUM_DIST_VECTS (ddr) == 0)
    {
      /* If user asserted safelen consecutive iterations can be
	 executed concurrently, assume independence.  */
      if (loop->safelen >= 2)
	{
	  if (loop->safelen < *max_vf)
	    *max_vf = loop->safelen;
	  LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
	  return false;
	}

      if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
	  || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
	{
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			       "versioning for alias not supported for: "
			       "bad dist vector for ");
	      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
				 DR_REF (dra));
	      dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
	      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
				 DR_REF (drb));
	      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
	    }
	  return true;
	}

      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			   "versioning for alias required: "
			   "bad dist vector for ");
	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
	  dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
	  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
	}

      /* Add to list of ddrs that need to be tested at run-time.  */
      return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
    }

  loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
  FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
    {
      int dist = dist_v[loop_depth];

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "dependence distance  = %d.\n", dist);

      if (dist == 0)
	{
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_NOTE, vect_location,
			       "dependence distance == 0 between ");
	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
	      dump_printf (MSG_NOTE, " and ");
	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
	      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
	    }

	  /* When we perform grouped accesses and perform implicit CSE
	     by detecting equal accesses and doing disambiguation with
	     runtime alias tests like for
	        .. = a[i];
		.. = a[i+1];
		a[i] = ..;
		a[i+1] = ..;
		*p = ..;
		.. = a[i];
		.. = a[i+1];
	     where we will end up loading { a[i], a[i+1] } once, make
	     sure that inserting group loads before the first load and
	     stores after the last store will do the right thing.
	     Similar for groups like
	        a[i] = ...;
		... = a[i];
		a[i+1] = ...;
	     where loads from the group interleave with the store.  */
	  if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
	      || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
	    {
	      gimple *earlier_stmt;
	      earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
	      if (DR_IS_WRITE
		    (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "READ_WRITE dependence in interleaving."
				     "\n");
		  return true;
		}
	    }

	  continue;
	}

      if (dist > 0 && DDR_REVERSED_P (ddr))
	{
	  /* If DDR_REVERSED_P the order of the data-refs in DDR was
	     reversed (to make distance vector positive), and the actual
	     distance is negative.  */
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "dependence distance negative.\n");
	  /* Record a negative dependence distance to later limit the
	     amount of stmt copying / unrolling we can perform.
	     Only need to handle read-after-write dependence.  */
	  if (DR_IS_READ (drb)
	      && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
		  || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
	    STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
	  continue;
	}

      if (abs (dist) >= 2
	  && abs (dist) < *max_vf)
	{
	  /* The dependence distance requires reduction of the maximal
	     vectorization factor.  */
	  *max_vf = abs (dist);
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "adjusting maximal vectorization factor to %i\n",
			     *max_vf);
	}

      if (abs (dist) >= *max_vf)
	{
	  /* Dependence distance does not create dependence, as far as
	     vectorization is concerned, in this case.  */
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "dependence distance >= VF.\n");
	  continue;
	}

      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			   "not vectorized, possible dependence "
			   "between data-refs ");
	  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
	  dump_printf (MSG_NOTE, " and ");
	  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
	  dump_printf (MSG_NOTE, "\n");
	}

      return true;
    }

  return false;
}
/* Function vect_analyze_data_ref_dependences.

   Examine all the data references in the loop, and make sure there do not
   exist any data dependences between them.  Set *MAX_VF according to
   the maximum vectorization factor the data dependences allow.  */

bool
vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
{
  unsigned int i;
  struct data_dependence_relation *ddr;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "=== vect_analyze_data_ref_dependences ===\n");

  LOOP_VINFO_DDRS (loop_vinfo)
    .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
	     * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
  LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
  /* We need read-read dependences to compute STMT_VINFO_SAME_ALIGN_REFS.  */
  if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
				&LOOP_VINFO_DDRS (loop_vinfo),
				LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
    return false;

  FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
    if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
      return false;

  return true;
}
/* Function vect_slp_analyze_data_ref_dependence.

   Return TRUE if there (might) exist a dependence between a memory-reference
   DRA and a memory-reference DRB.  When versioning for alias may check a
   dependence at run-time, return FALSE.  Adjust *MAX_VF according to
   the data dependence.  */

static bool
vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
{
  struct data_reference *dra = DDR_A (ddr);
  struct data_reference *drb = DDR_B (ddr);

  /* We need to check dependences of statements marked as unvectorizable
     as well, they still can prohibit vectorization.  */

  /* Independent data accesses.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
    return false;

  if (dra == drb)
    return false;

  /* Read-read is OK.  */
  if (DR_IS_READ (dra) && DR_IS_READ (drb))
    return false;

  /* If dra and drb are part of the same interleaving chain consider
     them independent.  */
  if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
      && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
	  == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
    return false;

  /* Unknown data dependence.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
    {
      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			   "can't determine dependence between ");
	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
	  dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
	  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
	}
    }
  else if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location,
		       "determined dependence between ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
      dump_printf (MSG_NOTE, " and ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
      dump_printf (MSG_NOTE, "\n");
    }

  return true;
}
/* Analyze dependences involved in the transform of SLP NODE.  STORES
   contain the vector of scalar stores of this instance if we are
   disambiguating the loads.  */

static bool
vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
				   vec<gimple *> stores, gimple *last_store)
{
  /* This walks over all stmts involved in the SLP load/store done
     in NODE verifying we can sink them up to the last stmt in the
     group.  */
  gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
  for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
    {
      gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
      if (access == last_access)
	continue;
      data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
      for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
	   gsi_stmt (gsi) != last_access; gsi_next (&gsi))
	{
	  gimple *stmt = gsi_stmt (gsi);
	  if (! gimple_vuse (stmt)
	      || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
	    continue;

	  /* If we couldn't record a (single) data reference for this
	     stmt we have to give up.  */
	  /* ???  Here and below if dependence analysis fails we can resort
	     to the alias oracle which can handle more kinds of stmts.  */
	  data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
	  if (!dr_b)
	    return false;

	  bool dependent = false;
	  /* If we run into a store of this same instance (we've just
	     marked those) then delay dependence checking until we run
	     into the last store because this is where it will have
	     been sunk to (and we verify if we can do that as well).  */
	  if (gimple_visited_p (stmt))
	    {
	      if (stmt != last_store)
		continue;
	      unsigned i;
	      gimple *store;
	      FOR_EACH_VEC_ELT (stores, i, store)
		{
		  data_reference *store_dr
		    = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
		  ddr_p ddr = initialize_data_dependence_relation
				(dr_a, store_dr, vNULL);
		  dependent = vect_slp_analyze_data_ref_dependence (ddr);
		  free_dependence_relation (ddr);
		  if (dependent)
		    break;
		}
	    }
	  else
	    {
	      ddr_p ddr = initialize_data_dependence_relation (dr_a,
							       dr_b, vNULL);
	      dependent = vect_slp_analyze_data_ref_dependence (ddr);
	      free_dependence_relation (ddr);
	    }
	  if (dependent)
	    return false;
	}
    }
  return true;
}
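
/* Illustrative scenario (editorial sketch, not from the original sources):
   for an SLP store group { a[0] = x; a[1] = y; } with an intervening
   statement '*p = z;' between the first and the last scalar store, the walk
   above builds a dependence relation between each group member and '*p' to
   verify that the group members can be sunk to the position of the last
   store, where the single vector store will be emitted.  */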
/* Function vect_slp_analyze_instance_dependence.

   Examine all the data references in the basic-block, and make sure there
   do not exist any data dependences between them.  */

bool
vect_slp_analyze_instance_dependence (slp_instance instance)
{
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "=== vect_slp_analyze_instance_dependence ===\n");

  /* The stores of this instance are at the root of the SLP tree.  */
  slp_tree store = SLP_INSTANCE_TREE (instance);
  if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
    store = NULL;

  /* Verify we can sink stores to the vectorized stmt insert location.  */
  gimple *last_store = NULL;
  if (store)
    {
      if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
	return false;

      /* Mark stores in this instance and remember the last one.  */
      last_store = vect_find_last_scalar_stmt_in_slp (store);
      for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
	gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
    }

  bool res = true;

  /* Verify we can sink loads to the vectorized stmt insert location,
     special-casing stores of this instance.  */
  slp_tree load;
  unsigned int i;
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
    if (! vect_slp_analyze_node_dependences (instance, load,
					     store
					     ? SLP_TREE_SCALAR_STMTS (store)
					     : vNULL, last_store))
      {
	res = false;
	break;
      }

  /* Unset the visited flag.  */
  if (store)
    for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
      gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);

  return res;
}
/* Function vect_compute_data_ref_alignment

   Compute the misalignment of the data reference DR.

   Output:
   1. If during the misalignment computation it is found that the data
      reference cannot be vectorized then false is returned.
   2. DR_MISALIGNMENT (DR) is defined.

   FOR NOW: No analysis is actually performed.  Misalignment is calculated
   only for trivial cases.  TODO.  */

static bool
vect_compute_data_ref_alignment (struct data_reference *dr)
{
  gimple *stmt = DR_STMT (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = NULL;
  tree ref = DR_REF (dr);
  tree vectype;
  tree base, base_addr;
  tree misalign = NULL_TREE;
  tree aligned_to, step;
  unsigned HOST_WIDE_INT alignment;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "vect_compute_data_ref_alignment:\n");

  if (loop_vinfo)
    loop = LOOP_VINFO_LOOP (loop_vinfo);

  /* Initialize misalignment to unknown.  */
  SET_DR_MISALIGNMENT (dr, -1);

  if (tree_fits_shwi_p (DR_STEP (dr)))
    misalign = DR_INIT (dr);
  aligned_to = DR_ALIGNED_TO (dr);
  base_addr = DR_BASE_ADDRESS (dr);
  vectype = STMT_VINFO_VECTYPE (stmt_info);

  /* In case the dataref is in an inner-loop of the loop that is being
     vectorized (LOOP), we use the base and misalignment information
     relative to the outer-loop (LOOP).  This is ok only if the misalignment
     stays the same throughout the execution of the inner-loop, which is why
     we have to check that the stride of the dataref in the inner-loop evenly
     divides by the vector size.  */
  if (loop && nested_in_vect_loop_p (loop, stmt))
    {
      tree step = DR_STEP (dr);

      if (tree_fits_shwi_p (step)
	  && tree_to_shwi (step) % GET_MODE_SIZE (TYPE_MODE (vectype)) == 0)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "inner step divides the vector-size.\n");
	  misalign = STMT_VINFO_DR_INIT (stmt_info);
	  aligned_to = STMT_VINFO_DR_ALIGNED_TO (stmt_info);
	  base_addr = STMT_VINFO_DR_BASE_ADDRESS (stmt_info);
	}
      else
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "inner step doesn't divide the vector-size.\n");
	  misalign = NULL_TREE;
	}
    }

  /* Similarly we can only use base and misalignment information relative to
     an innermost loop if the misalignment stays the same throughout the
     execution of the loop.  As above, this is the case if the stride of
     the dataref evenly divides by the vector size.  */
  else
    {
      tree step = DR_STEP (dr);
      unsigned vf = loop ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) : 1;

      if (tree_fits_shwi_p (step)
	  && ((tree_to_shwi (step) * vf)
	      % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "step doesn't divide the vector-size.\n");
	  misalign = NULL_TREE;
	}
    }

  /* To look at alignment of the base we have to preserve an inner MEM_REF
     as that carries alignment information of the actual access.  */
  base = ref;
  while (handled_component_p (base))
    base = TREE_OPERAND (base, 0);
  if (TREE_CODE (base) == MEM_REF)
    base = build2 (MEM_REF, TREE_TYPE (base), base_addr,
		   build_int_cst (TREE_TYPE (TREE_OPERAND (base, 1)), 0));
  unsigned int base_alignment = get_object_alignment (base);

  if (base_alignment >= TYPE_ALIGN (TREE_TYPE (vectype)))
    DR_VECT_AUX (dr)->base_element_aligned = true;

  alignment = TYPE_ALIGN_UNIT (vectype);

  if ((compare_tree_int (aligned_to, alignment) < 0)
      || !misalign)
    {
      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			   "Unknown alignment for access: ");
	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
	  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
	}
      return true;
    }

  if (base_alignment < TYPE_ALIGN (vectype))
    {
      /* Strip an inner MEM_REF to a bare decl if possible.  */
      if (TREE_CODE (base) == MEM_REF
	  && integer_zerop (TREE_OPERAND (base, 1))
	  && TREE_CODE (TREE_OPERAND (base, 0)) == ADDR_EXPR)
	base = TREE_OPERAND (TREE_OPERAND (base, 0), 0);

      if (!vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype)))
	{
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_NOTE, vect_location,
			       "can't force alignment of ref: ");
	      dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
	      dump_printf (MSG_NOTE, "\n");
	    }
	  return true;
	}

      /* Force the alignment of the decl.
	 NOTE: This is the only change to the code we make during
	 the analysis phase, before deciding to vectorize the loop.  */
      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
	  dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
	  dump_printf (MSG_NOTE, "\n");
	}

      DR_VECT_AUX (dr)->base_decl = base;
      DR_VECT_AUX (dr)->base_misaligned = true;
      DR_VECT_AUX (dr)->base_element_aligned = true;
    }

  if (loop && nested_in_vect_loop_p (loop, stmt))
    step = STMT_VINFO_DR_STEP (stmt_info);
  else
    step = DR_STEP (dr);
  /* If this is a backward running DR then first access in the larger
     vectype actually is N-1 elements before the address in the DR.
     Adjust misalign accordingly.  */
  if (tree_int_cst_sgn (step) < 0)
    {
      tree offset = ssize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
      /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
	 otherwise we wouldn't be here.  */
      offset = fold_build2 (MULT_EXPR, ssizetype, offset, step);
      /* PLUS because STEP was negative.  */
      misalign = size_binop (PLUS_EXPR, misalign, offset);
    }

  SET_DR_MISALIGNMENT (dr,
		       wi::mod_floor (misalign, alignment, SIGNED).to_uhwi ());

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
		       "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
    }

  return true;
}
/* Function vect_update_misalignment_for_peel

   DR - the data reference whose misalignment is to be adjusted.
   DR_PEEL - the data reference whose misalignment is being made
             zero in the vector loop by the peel.
   NPEEL - the number of iterations in the peel loop if the misalignment
           of DR_PEEL is known at compile time.  */

static void
vect_update_misalignment_for_peel (struct data_reference *dr,
				   struct data_reference *dr_peel, int npeel)
{
  unsigned int i;
  vec<dr_p> same_align_drs;
  struct data_reference *current_dr;
  int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
  int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
  stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
  stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));

  /* For interleaved data accesses the step in the loop must be multiplied by
     the size of the interleaving group.  */
  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
  if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
    dr_peel_size *= GROUP_SIZE (peel_stmt_info);

  /* It can be assumed that the data refs with the same alignment as dr_peel
     are aligned in the vector loop.  */
  same_align_drs
    = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
  FOR_EACH_VEC_ELT (same_align_drs, i, current_dr)
    {
      if (current_dr != dr)
	continue;
      gcc_assert (DR_MISALIGNMENT (dr) / dr_size ==
		  DR_MISALIGNMENT (dr_peel) / dr_peel_size);
      SET_DR_MISALIGNMENT (dr, 0);
      return;
    }

  if (known_alignment_for_access_p (dr)
      && known_alignment_for_access_p (dr_peel))
    {
      bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
      int misal = DR_MISALIGNMENT (dr);
      tree vectype = STMT_VINFO_VECTYPE (stmt_info);
      misal += negative ? -npeel * dr_size : npeel * dr_size;
      misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
      SET_DR_MISALIGNMENT (dr, misal);
      return;
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment to -1.\n");
  SET_DR_MISALIGNMENT (dr, -1);
}
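
/* Illustrative arithmetic (not from the original sources, assuming the
   vector type is 16-byte aligned): with DR_MISALIGNMENT (dr) == 4, element
   size 4 and npeel == 3, the known-alignment path above computes
   4 + 3 * 4 = 16 and masks it with 15, giving a new misalignment of 0.  */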
/* Function verify_data_ref_alignment

   Return TRUE if DR can be handled with respect to alignment.  */

static bool
verify_data_ref_alignment (data_reference_p dr)
{
  enum dr_alignment_support supportable_dr_alignment
    = vect_supportable_dr_alignment (dr, false);
  if (!supportable_dr_alignment)
    {
      if (dump_enabled_p ())
	{
	  if (DR_IS_READ (dr))
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "not vectorized: unsupported unaligned load.");
	  else
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "not vectorized: unsupported unaligned "
			     "store.");

	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
			     DR_REF (dr));
	  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
	}
      return false;
    }

  if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "Vectorizing an unaligned access.\n");

  return true;
}
/* Function vect_verify_datarefs_alignment

   Return TRUE if all data references in the loop can be
   handled with respect to alignment.  */

bool
vect_verify_datarefs_alignment (loop_vec_info vinfo)
{
  vec<data_reference_p> datarefs = vinfo->datarefs;
  struct data_reference *dr;
  unsigned int i;

  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      gimple *stmt = DR_STMT (dr);
      stmt_vec_info stmt_info = vinfo_for_stmt (stmt);

      if (!STMT_VINFO_RELEVANT_P (stmt_info))
	continue;

      /* For interleaving, only the alignment of the first access matters.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
	  && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
	continue;

      /* Strided accesses perform only component accesses, alignment is
	 irrelevant for them.  */
      if (STMT_VINFO_STRIDED_P (stmt_info)
	  && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
	continue;

      if (! verify_data_ref_alignment (dr))
	return false;
    }

  return true;
}
/* Given a memory reference EXP return whether its alignment is less
   than its size.  */

static bool
not_size_aligned (tree exp)
{
  if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
    return true;

  return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
	  > get_object_alignment (exp));
}
/* Function vector_alignment_reachable_p

   Return true if vector alignment for DR is reachable by peeling
   a few loop iterations.  Return false otherwise.  */

static bool
vector_alignment_reachable_p (struct data_reference *dr)
{
  gimple *stmt = DR_STMT (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);

  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    {
      /* For interleaved access we peel only if number of iterations in
	 the prolog loop ({VF - misalignment}), is a multiple of the
	 number of the interleaved accesses.  */
      int elem_size, mis_in_elements;
      int nelements = TYPE_VECTOR_SUBPARTS (vectype);

      /* FORNOW: handle only known alignment.  */
      if (!known_alignment_for_access_p (dr))
	return false;

      elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
      mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;

      if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
	return false;
    }

  /* If misalignment is known at the compile time then allow peeling
     only if natural alignment is reachable through peeling.  */
  if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
    {
      HOST_WIDE_INT elmsize =
		int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_NOTE, vect_location,
			   "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
	  dump_printf (MSG_NOTE,
		       ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
	}
      if (DR_MISALIGNMENT (dr) % elmsize)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "data size does not divide the misalignment.\n");
	  return false;
	}
    }

  if (!known_alignment_for_access_p (dr))
    {
      tree type = TREE_TYPE (DR_REF (dr));
      bool is_packed = not_size_aligned (DR_REF (dr));
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "Unknown misalignment, is_packed = %d\n", is_packed);
      if ((TYPE_USER_ALIGN (type) && !is_packed)
	  || targetm.vectorize.vector_alignment_reachable (type, is_packed))
	return true;
      else
	return false;
    }

  return true;
}
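
/* Illustrative example (assumptions: V4SI vectors, 4-byte ints): a data-ref
   with DR_MISALIGNMENT == 8 can reach alignment by peeling (16 - 8) / 4 = 2
   scalar iterations, whereas a 2-byte misalignment is rejected above because
   the element size does not divide it.  */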
/* Calculate the cost of the memory access represented by DR.  */

static void
vect_get_data_access_cost (struct data_reference *dr,
			   unsigned int *inside_cost,
			   unsigned int *outside_cost,
			   stmt_vector_for_cost *body_cost_vec)
{
  gimple *stmt = DR_STMT (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  int ncopies = vf / nunits;

  if (DR_IS_READ (dr))
    vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
			NULL, body_cost_vec, false);
  else
    vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "vect_get_data_access_cost: inside_cost = %d, "
		     "outside_cost = %d.\n", *inside_cost, *outside_cost);
}
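
/* Illustrative example (not from the original sources): with a vectorization
   factor of 8 and a V4SI access (nunits == 4), ncopies is 8 / 4 = 2, i.e. the
   cost of two vector loads or stores is accumulated per scalar access in the
   loop body.  */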
typedef struct _vect_peel_info
{
  int npeel;
  struct data_reference *dr;
  unsigned int count;
} *vect_peel_info;

typedef struct _vect_peel_extended_info
{
  struct _vect_peel_info peel_info;
  unsigned int inside_cost;
  unsigned int outside_cost;
  stmt_vector_for_cost body_cost_vec;
} *vect_peel_extended_info;
/* Peeling hashtable helpers.  */

struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
{
  static inline hashval_t hash (const _vect_peel_info *);
  static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
};

inline hashval_t
peel_info_hasher::hash (const _vect_peel_info *peel_info)
{
  return (hashval_t) peel_info->npeel;
}

inline bool
peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
{
  return (a->npeel == b->npeel);
}
/* Insert DR into peeling hash table with NPEEL as key.  */

static void
vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
			  loop_vec_info loop_vinfo, struct data_reference *dr,
			  int npeel)
{
  struct _vect_peel_info elem, *slot;
  _vect_peel_info **new_slot;
  bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);

  elem.npeel = npeel;
  slot = peeling_htab->find (&elem);
  if (slot)
    slot->count++;
  else
    {
      slot = XNEW (struct _vect_peel_info);
      slot->npeel = npeel;
      slot->dr = dr;
      slot->count = 1;
      new_slot = peeling_htab->find_slot (slot, INSERT);
      *new_slot = slot;
    }

  if (!supportable_dr_alignment
      && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
    slot->count += VECT_MAX_COST;
}
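
/* A note on the data structure (illustrative reading of the code above):
   NPEEL is the hash key, so all data-refs that would become aligned by the
   same number of peeled iterations share one slot, and the slot's COUNT
   records how many accesses that peeling amount would align.  Boosting the
   count by VECT_MAX_COST makes peeling amounts that fix unsupportable
   unaligned refs dominate the "most frequent" selection below when the
   unlimited cost model is in effect.  */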
/* Traverse peeling hash table to find peeling option that aligns maximum
   number of data accesses.  */

int
vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
				     _vect_peel_extended_info *max)
{
  vect_peel_info elem = *slot;

  if (elem->count > max->peel_info.count
      || (elem->count == max->peel_info.count
	  && max->peel_info.npeel > elem->npeel))
    {
      max->peel_info.npeel = elem->npeel;
      max->peel_info.count = elem->count;
      max->peel_info.dr = elem->dr;
    }

  return 1;
}
/* Traverse peeling hash table and calculate cost for each peeling option.
   Find the one with the lowest cost.  */

int
vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
				   _vect_peel_extended_info *min)
{
  vect_peel_info elem = *slot;
  int save_misalignment, dummy;
  unsigned int inside_cost = 0, outside_cost = 0, i;
  gimple *stmt = DR_STMT (elem->dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
  struct data_reference *dr;
  stmt_vector_for_cost prologue_cost_vec, body_cost_vec, epilogue_cost_vec;

  prologue_cost_vec.create (2);
  body_cost_vec.create (2);
  epilogue_cost_vec.create (2);

  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      stmt = DR_STMT (dr);
      stmt_info = vinfo_for_stmt (stmt);
      /* For interleaving, only the alignment of the first access
	 matters.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
	  && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
	continue;

      /* Strided accesses perform only component accesses, alignment is
	 irrelevant for them.  */
      if (STMT_VINFO_STRIDED_P (stmt_info)
	  && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
	continue;

      save_misalignment = DR_MISALIGNMENT (dr);
      vect_update_misalignment_for_peel (dr, elem->dr, elem->npeel);
      vect_get_data_access_cost (dr, &inside_cost, &outside_cost,
				 &body_cost_vec);
      SET_DR_MISALIGNMENT (dr, save_misalignment);
    }

  outside_cost += vect_get_known_peeling_cost
    (loop_vinfo, elem->npeel, &dummy,
     &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
     &prologue_cost_vec, &epilogue_cost_vec);

  /* Prologue and epilogue costs are added to the target model later.
     These costs depend only on the scalar iteration cost, the
     number of peeling iterations finally chosen, and the number of
     misaligned statements.  So discard the information found here.  */
  prologue_cost_vec.release ();
  epilogue_cost_vec.release ();

  if (inside_cost < min->inside_cost
      || (inside_cost == min->inside_cost && outside_cost < min->outside_cost))
    {
      min->inside_cost = inside_cost;
      min->outside_cost = outside_cost;
      min->body_cost_vec.release ();
      min->body_cost_vec = body_cost_vec;
      min->peel_info.dr = elem->dr;
      min->peel_info.npeel = elem->npeel;
    }
  else
    body_cost_vec.release ();

  return 1;
}
/* Choose best peeling option by traversing peeling hash table and either
   choosing an option with the lowest cost (if cost model is enabled) or the
   option that aligns as many accesses as possible.  */

static struct data_reference *
vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
				       loop_vec_info loop_vinfo,
				       unsigned int *npeel,
				       stmt_vector_for_cost *body_cost_vec)
{
   struct _vect_peel_extended_info res;

   res.peel_info.dr = NULL;
   res.body_cost_vec = stmt_vector_for_cost ();

   if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
     {
       res.inside_cost = INT_MAX;
       res.outside_cost = INT_MAX;
       peeling_htab->traverse <_vect_peel_extended_info *,
			       vect_peeling_hash_get_lowest_cost> (&res);
     }
   else
     {
       res.peel_info.count = 0;
       peeling_htab->traverse <_vect_peel_extended_info *,
			       vect_peeling_hash_get_most_frequent> (&res);
     }

   *npeel = res.peel_info.npeel;
   *body_cost_vec = res.body_cost_vec;
   return res.peel_info.dr;
}
/* Function vect_enhance_data_refs_alignment

   This pass will use loop versioning and loop peeling in order to enhance
   the alignment of data references in the loop.

   FOR NOW: we assume that whatever versioning/peeling takes place, only the
   original loop is to be vectorized.  Any other loops that are created by
   the transformations performed in this pass - are not supposed to be
   vectorized.  This restriction will be relaxed.

   This pass will require a cost model to guide it whether to apply peeling
   or versioning or a combination of the two.  For example, the scheme that
   intel uses when given a loop with several memory accesses, is as follows:
   choose one memory access ('p') which alignment you want to force by doing
   peeling.  Then, either (1) generate a loop in which 'p' is aligned and all
   other accesses are not necessarily aligned, or (2) use loop versioning to
   generate one loop in which all accesses are aligned, and another loop in
   which only 'p' is necessarily aligned.

   ("Automatic Intra-Register Vectorization for the Intel Architecture",
   Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
   Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)

   Devising a cost model is the most critical aspect of this work.  It will
   guide us on which access to peel for, whether to use loop versioning, how
   many versions to create, etc.  The cost model will probably consist of
   generic considerations as well as target specific considerations (on
   powerpc for example, misaligned stores are more painful than misaligned
   loads).

   Here are the general steps involved in alignment enhancements:

     -- original loop, before alignment analysis:
	for (i=0; i<N; i++){
	  x = q[i];			# DR_MISALIGNMENT(q) = unknown
	  p[i] = y;			# DR_MISALIGNMENT(p) = unknown
	}

     -- After vect_compute_data_refs_alignment:
	for (i=0; i<N; i++){
	  x = q[i];			# DR_MISALIGNMENT(q) = 3
	  p[i] = y;			# DR_MISALIGNMENT(p) = unknown
	}

     -- Possibility 1: we do loop versioning:
     if (p is aligned) {
	for (i=0; i<N; i++){	# loop 1A
	  x = q[i];			# DR_MISALIGNMENT(q) = 3
	  p[i] = y;			# DR_MISALIGNMENT(p) = 0
	}
     }
     else {
	for (i=0; i<N; i++){	# loop 1B
	  x = q[i];			# DR_MISALIGNMENT(q) = 3
	  p[i] = y;			# DR_MISALIGNMENT(p) = unaligned
	}
     }

     -- Possibility 2: we do loop peeling:
     for (i = 0; i < 3; i++){	# (scalar loop, not to be vectorized).
	x = q[i];
	p[i] = y;
     }
     for (i = 3; i < N; i++){	# loop 2A
	x = q[i];			# DR_MISALIGNMENT(q) = 0
	p[i] = y;			# DR_MISALIGNMENT(p) = unknown
     }

     -- Possibility 3: combination of loop peeling and versioning:
     for (i = 0; i < 3; i++){	# (scalar loop, not to be vectorized).
	x = q[i];
	p[i] = y;
     }
     if (p is aligned) {
	for (i = 3; i<N; i++){	# loop 3A
	  x = q[i];			# DR_MISALIGNMENT(q) = 0
	  p[i] = y;			# DR_MISALIGNMENT(p) = 0
	}
     }
     else {
	for (i = 3; i<N; i++){	# loop 3B
	  x = q[i];			# DR_MISALIGNMENT(q) = 0
	  p[i] = y;			# DR_MISALIGNMENT(p) = unaligned
	}
     }

     These loops are later passed to loop_transform to be vectorized.  The
     vectorizer will use the alignment information to guide the transformation
     (whether to generate regular loads/stores, or with special handling for
     misalignment).  */
bool
vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
{
  vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  enum dr_alignment_support supportable_dr_alignment;
  struct data_reference *dr0 = NULL, *first_store = NULL;
  struct data_reference *dr;
  unsigned int i, j;
  bool do_peeling = false;
  bool do_versioning = false;
  bool stat;
  gimple *stmt;
  stmt_vec_info stmt_info;
  unsigned int npeel = 0;
  bool all_misalignments_unknown = true;
  unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  unsigned possible_npeel_number = 1;
  tree vectype;
  unsigned int nelements, mis, same_align_drs_max = 0;
  stmt_vector_for_cost body_cost_vec = stmt_vector_for_cost ();
  hash_table<peel_info_hasher> peeling_htab (1);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "=== vect_enhance_data_refs_alignment ===\n");

  /* Reset data so we can safely be called multiple times.  */
  LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
  LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;

  /* While cost model enhancements are expected in the future, the high level
     view of the code at this time is as follows:

     A) If there is a misaligned access then see if peeling to align
        this access can make all data references satisfy
        vect_supportable_dr_alignment.  If so, update data structures
        as needed and return true.

     B) If peeling wasn't possible and there is a data reference with an
        unknown misalignment that does not satisfy vect_supportable_dr_alignment
        then see if loop versioning checks can be used to make all data
        references satisfy vect_supportable_dr_alignment.  If so, update
        data structures as needed and return true.

     C) If neither peeling nor versioning were successful then return false if
        any data reference does not satisfy vect_supportable_dr_alignment.

     D) Return true (all data references satisfy vect_supportable_dr_alignment).

     Note, Possibility 3 above (which is peeling and versioning together) is not
     being done at this time.  */

  /* (1) Peeling to force alignment.  */

  /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
     Considerations:
     + How many accesses will become aligned due to the peeling
     - How many accesses will become unaligned due to the peeling,
       and the cost of misaligned accesses.
     - The cost of peeling (the extra runtime checks, the increase
       in code size).  */

  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      stmt = DR_STMT (dr);
      stmt_info = vinfo_for_stmt (stmt);

      if (!STMT_VINFO_RELEVANT_P (stmt_info))
	continue;

      /* For interleaving, only the alignment of the first access
         matters.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
	  && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
	continue;

      /* For invariant accesses there is nothing to enhance.  */
      if (integer_zerop (DR_STEP (dr)))
	continue;

      /* Strided accesses perform only component accesses, alignment is
	 irrelevant for them.  */
      if (STMT_VINFO_STRIDED_P (stmt_info)
	  && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
	continue;

      supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
      do_peeling = vector_alignment_reachable_p (dr);
      if (do_peeling)
	{
	  if (known_alignment_for_access_p (dr))
	    {
	      unsigned int npeel_tmp;
	      bool negative = tree_int_cst_compare (DR_STEP (dr),
						    size_zero_node) < 0;

	      /* Save info about DR in the hash table.  */
	      vectype = STMT_VINFO_VECTYPE (stmt_info);
	      nelements = TYPE_VECTOR_SUBPARTS (vectype);
	      mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
						TREE_TYPE (DR_REF (dr))));
	      npeel_tmp = (negative
			   ? (mis - nelements) : (nelements - mis))
		  & (nelements - 1);

	      /* For multiple types, it is possible that the bigger type access
		 will have more than one peeling option.  E.g., a loop with two
		 types: one of size (vector size / 4), and the other one of
		 size (vector size / 8).  Vectorization factor will 8.  If both
		 access are misaligned by 3, the first one needs one scalar
		 iteration to be aligned, and the second one needs 5.  But the
		 first one will be aligned also by peeling 5 scalar
		 iterations, and in that case both accesses will be aligned.
		 Hence, except for the immediate peeling amount, we also want
		 to try to add full vector size, while we don't exceed
		 vectorization factor.
		 We do this automtically for cost model, since we calculate cost
		 for every peeling option.  */
	      if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
		{
		  if (STMT_SLP_TYPE (stmt_info))
		    possible_npeel_number
		      = (vf * GROUP_SIZE (stmt_info)) / nelements;
		  else
		    possible_npeel_number = vf / nelements;
		}

	      /* Handle the aligned case. We may decide to align some other
		 access, making DR unaligned.  */
	      if (DR_MISALIGNMENT (dr) == 0)
		{
		  npeel_tmp = 0;
		  if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
		    possible_npeel_number++;
		}

	      for (j = 0; j < possible_npeel_number; j++)
		{
		  vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
					    dr, npeel_tmp);
		  npeel_tmp += nelements;
		}

	      all_misalignments_unknown = false;
	      /* Data-ref that was chosen for the case that all the
		 misalignments are unknown is not relevant anymore, since we
		 have a data-ref with known alignment.  */
	      dr0 = NULL;
	    }
	  else
	    {
	      /* If we don't know any misalignment values, we prefer
		 peeling for data-ref that has the maximum number of data-refs
		 with the same alignment, unless the target prefers to align
		 stores over load.  */
	      if (all_misalignments_unknown)
		{
		  unsigned same_align_drs
		    = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
		  if (!dr0
		      || same_align_drs_max < same_align_drs)
		    {
		      same_align_drs_max = same_align_drs;
		      dr0 = dr;
		    }
		  /* For data-refs with the same number of related
		     accesses prefer the one where the misalign
		     computation will be invariant in the outermost loop.  */
		  else if (same_align_drs_max == same_align_drs)
		    {
		      struct loop *ivloop0, *ivloop;
		      ivloop0 = outermost_invariant_loop_for_expr
			  (loop, DR_BASE_ADDRESS (dr0));
		      ivloop = outermost_invariant_loop_for_expr
			  (loop, DR_BASE_ADDRESS (dr));
		      if ((ivloop && !ivloop0)
			  || (ivloop && ivloop0
			      && flow_loop_nested_p (ivloop, ivloop0)))
			dr0 = dr;
		    }

		  if (!first_store && DR_IS_WRITE (dr))
		    first_store = dr;
		}

	      /* If there are both known and unknown misaligned accesses in the
		 loop, we choose peeling amount according to the known
		 accesses.  */
	      if (!supportable_dr_alignment)
		{
		  dr0 = dr;
		  if (!first_store && DR_IS_WRITE (dr))
		    first_store = dr;
		}
	    }
	}
      else
	{
	  if (!aligned_access_p (dr))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "vector alignment may not be reachable\n");
	      break;
	    }
	}
    }

  /* Check if we can possibly peel the loop.  */
  if (!vect_can_advance_ivs_p (loop_vinfo)
      || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
      || loop->inner)
    do_peeling = false;

  if (do_peeling
      && all_misalignments_unknown
      && vect_supportable_dr_alignment (dr0, false))
    {
      /* Check if the target requires to prefer stores over loads, i.e., if
	 misaligned stores are more expensive than misaligned loads (taking
	 drs with same alignment into account).  */
      if (first_store && DR_IS_READ (dr0))
	{
	  unsigned int load_inside_cost = 0, load_outside_cost = 0;
	  unsigned int store_inside_cost = 0, store_outside_cost = 0;
	  unsigned int load_inside_penalty = 0, load_outside_penalty = 0;
	  unsigned int store_inside_penalty = 0, store_outside_penalty = 0;
	  stmt_vector_for_cost dummy;
	  dummy.create (2);

	  vect_get_data_access_cost (dr0, &load_inside_cost, &load_outside_cost,
				     &dummy);
	  vect_get_data_access_cost (first_store, &store_inside_cost,
				     &store_outside_cost, &dummy);

	  dummy.release ();

	  /* Calculate the penalty for leaving FIRST_STORE unaligned (by
	     aligning the load DR0).  */
	  load_inside_penalty = store_inside_cost;
	  load_outside_penalty = store_outside_cost;
	  for (i = 0;
	       STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
			  DR_STMT (first_store))).iterate (i, &dr);
	       i++)
	    if (DR_IS_READ (dr))
	      {
		load_inside_penalty += load_inside_cost;
		load_outside_penalty += load_outside_cost;
	      }
	    else
	      {
		load_inside_penalty += store_inside_cost;
		load_outside_penalty += store_outside_cost;
	      }

	  /* Calculate the penalty for leaving DR0 unaligned (by
	     aligning the FIRST_STORE).  */
	  store_inside_penalty = load_inside_cost;
	  store_outside_penalty = load_outside_cost;
	  for (i = 0;
	       STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
		      DR_STMT (dr0))).iterate (i, &dr);
	       i++)
	    if (DR_IS_READ (dr))
	      {
		store_inside_penalty += load_inside_cost;
		store_outside_penalty += load_outside_cost;
	      }
	    else
	      {
		store_inside_penalty += store_inside_cost;
		store_outside_penalty += store_outside_cost;
	      }

	  if (load_inside_penalty > store_inside_penalty
	      || (load_inside_penalty == store_inside_penalty
		  && load_outside_penalty > store_outside_penalty))
	    dr0 = first_store;
	}

      /* In case there are only loads with different unknown misalignments, use
	 peeling only if it may help to align other accesses in the loop or
	 if it may help improving load bandwith when we'd end up using
	 unaligned loads.  */
      tree dr0_vt = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr0)));
      if (!first_store
	  && !STMT_VINFO_SAME_ALIGN_REFS (
		  vinfo_for_stmt (DR_STMT (dr0))).length ()
	  && (vect_supportable_dr_alignment (dr0, false)
	      != dr_unaligned_supported
	      || (builtin_vectorization_cost (vector_load, dr0_vt, 0)
		  == builtin_vectorization_cost (unaligned_load, dr0_vt, -1))))
	do_peeling = false;
    }

  if (do_peeling && !dr0)
    {
      /* Peeling is possible, but there is no data access that is not supported
	 unless aligned. So we try to choose the best possible peeling.  */

      /* We should get here only if there are drs with known misalignment.  */
      gcc_assert (!all_misalignments_unknown);

      /* Choose the best peeling from the hash table.  */
      dr0 = vect_peeling_hash_choose_best_peeling (&peeling_htab,
						   loop_vinfo, &npeel,
						   &body_cost_vec);
      if (!dr0 || !npeel)
	do_peeling = false;
    }

  if (do_peeling)
    {
      stmt = DR_STMT (dr0);
      stmt_info = vinfo_for_stmt (stmt);
      vectype = STMT_VINFO_VECTYPE (stmt_info);
      nelements = TYPE_VECTOR_SUBPARTS (vectype);

      if (known_alignment_for_access_p (dr0))
	{
	  bool negative = tree_int_cst_compare (DR_STEP (dr0),
						size_zero_node) < 0;
	  if (!npeel)
	    {
	      /* Since it's known at compile time, compute the number of
		 iterations in the peeled loop (the peeling factor) for use in
		 updating DR_MISALIGNMENT values.  The peeling factor is the
		 vectorization factor minus the misalignment as an element
		 count.  */
	      mis = DR_MISALIGNMENT (dr0);
	      mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
	      npeel = ((negative ? mis - nelements : nelements - mis)
		       & (nelements - 1));
	    }

	  /* For interleaved data access every iteration accesses all the
	     members of the group, therefore we divide the number of iterations
	     by the group size.  */
	  stmt_info = vinfo_for_stmt (DR_STMT (dr0));
	  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
	    npeel /= GROUP_SIZE (stmt_info);

	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Try peeling by %d\n", npeel);
	}

      /* Ensure that all data refs can be vectorized after the peel.  */
      FOR_EACH_VEC_ELT (datarefs, i, dr)
	{
	  int save_misalignment;

	  if (dr == dr0)
	    continue;

	  stmt = DR_STMT (dr);
	  stmt_info = vinfo_for_stmt (stmt);
	  /* For interleaving, only the alignment of the first access
	     matters.  */
	  if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
	      && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
	    continue;

	  /* Strided accesses perform only component accesses, alignment is
	     irrelevant for them.  */
	  if (STMT_VINFO_STRIDED_P (stmt_info)
	      && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
	    continue;

	  save_misalignment = DR_MISALIGNMENT (dr);
	  vect_update_misalignment_for_peel (dr, dr0, npeel);
	  supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
	  SET_DR_MISALIGNMENT (dr, save_misalignment);

	  if (!supportable_dr_alignment)
	    {
	      do_peeling = false;
	      break;
	    }
	}

      if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
	{
	  stat = vect_verify_datarefs_alignment (loop_vinfo);
	  if (!stat)
	    do_peeling = false;
	  else
	    {
	      body_cost_vec.release ();
	      return stat;
	    }
	}

      /* Cost model #1 - honor --param vect-max-peeling-for-alignment.  */
      if (do_peeling)
	{
	  unsigned max_allowed_peel
	    = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
	  if (max_allowed_peel != (unsigned)-1)
	    {
	      unsigned max_peel = npeel;
	      if (max_peel == 0)
		{
		  gimple *dr_stmt = DR_STMT (dr0);
		  stmt_vec_info vinfo = vinfo_for_stmt (dr_stmt);
		  tree vtype = STMT_VINFO_VECTYPE (vinfo);
		  max_peel = TYPE_VECTOR_SUBPARTS (vtype) - 1;
		}
	      if (max_peel > max_allowed_peel)
		{
		  do_peeling = false;
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_NOTE, vect_location,
			"Disable peeling, max peels reached: %d\n", max_peel);
		}
	    }
	}

      /* Cost model #2 - if peeling may result in a remaining loop not
	 iterating enough to be vectorized then do not peel.  */
      if (do_peeling
	  && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
	{
	  unsigned max_peel
	    = npeel == 0 ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1 : npeel;
	  if (LOOP_VINFO_INT_NITERS (loop_vinfo)
	      < LOOP_VINFO_VECT_FACTOR (loop_vinfo) + max_peel)
	    do_peeling = false;
	}

      if (do_peeling)
	{
	  /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
	     If the misalignment of DR_i is identical to that of dr0 then set
	     DR_MISALIGNMENT (DR_i) to zero.  If the misalignment of DR_i and
	     dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
	     by the peeling factor times the element size of DR_i (MOD the
	     vectorization factor times the size).  Otherwise, the
	     misalignment of DR_i must be set to unknown.  */
	  FOR_EACH_VEC_ELT (datarefs, i, dr)
	    if (dr != dr0)
	      {
		/* Strided accesses perform only component accesses, alignment
		   is irrelevant for them.  */
		stmt_info = vinfo_for_stmt (DR_STMT (dr));
		if (STMT_VINFO_STRIDED_P (stmt_info)
		    && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
		  continue;

		vect_update_misalignment_for_peel (dr, dr0, npeel);
	      }

	  LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
	  if (npeel)
	    LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
	  else
	    LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
	      = DR_MISALIGNMENT (dr0);
	  SET_DR_MISALIGNMENT (dr0, 0);
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_NOTE, vect_location,
			       "Alignment of access forced using peeling.\n");
	      dump_printf_loc (MSG_NOTE, vect_location,
			       "Peeling for alignment will be applied.\n");
	    }
	  /* The inside-loop cost will be accounted for in vectorizable_load
	     and vectorizable_store correctly with adjusted alignments.
	     Drop the body_cst_vec on the floor here.  */
	  body_cost_vec.release ();

	  stat = vect_verify_datarefs_alignment (loop_vinfo);
	  gcc_assert (stat);
	  return stat;
	}
    }

  body_cost_vec.release ();

  /* (2) Versioning to force alignment.  */

  /* Try versioning if:
     1) optimize loop for speed
     2) there is at least one unsupported misaligned data ref with an unknown
        misalignment, and
     3) all misaligned data refs with a known misalignment are supported, and
     4) the number of runtime alignment checks is within reason.  */

  do_versioning =
	optimize_loop_nest_for_speed_p (loop)
	&& (!loop->inner); /* FORNOW */

  if (do_versioning)
    {
      FOR_EACH_VEC_ELT (datarefs, i, dr)
	{
	  stmt = DR_STMT (dr);
	  stmt_info = vinfo_for_stmt (stmt);

	  /* For interleaving, only the alignment of the first access
	     matters.  */
	  if (aligned_access_p (dr)
	      || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
		  && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
	    continue;

	  if (STMT_VINFO_STRIDED_P (stmt_info))
	    {
	      /* Strided loads perform only component accesses, alignment is
		 irrelevant for them.  */
	      if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
		continue;
	      do_versioning = false;
	      break;
	    }

	  supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);

	  if (!supportable_dr_alignment)
	    {
	      gimple *stmt;
	      int mask;
	      tree vectype;

	      if (known_alignment_for_access_p (dr)
		  || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
		     >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
		{
		  do_versioning = false;
		  break;
		}

	      stmt = DR_STMT (dr);
	      vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
	      gcc_assert (vectype);

	      /* The rightmost bits of an aligned address must be zeros.
		 Construct the mask needed for this test.  For example,
		 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
		 mask must be 15 = 0xf. */
	      mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;

	      /* FORNOW: use the same mask to test all potentially unaligned
		 references in the loop.  The vectorizer currently supports
		 a single vector size, see the reference to
		 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
		 vectorization factor is computed.  */
	      gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
			  || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
	      LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
	      LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
			      DR_STMT (dr));
	    }
	}

      /* Versioning requires at least one misaligned data reference.  */
      if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
	do_versioning = false;
      else if (!do_versioning)
	LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
    }

  if (do_versioning)
    {
      vec<gimple *> may_misalign_stmts
	= LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
      gimple *stmt;

      /* It can now be assumed that the data references in the statements
	 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
	 of the loop being vectorized.  */
      FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
	{
	  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
	  dr = STMT_VINFO_DATA_REF (stmt_info);
	  SET_DR_MISALIGNMENT (dr, 0);
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Alignment of access forced using versioning.\n");
	}

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Versioning for alignment will be applied.\n");

      /* Peeling and versioning can't be done together at this time.  */
      gcc_assert (! (do_peeling && do_versioning));

      stat = vect_verify_datarefs_alignment (loop_vinfo);
      gcc_assert (stat);
      return stat;
    }

  /* This point is reached if neither peeling nor versioning is being done.  */
  gcc_assert (! (do_peeling || do_versioning));

  stat = vect_verify_datarefs_alignment (loop_vinfo);
  return stat;
}
2002 /* Function vect_find_same_alignment_drs.
2004 Update group and alignment relations according to the chosen
2005 vectorization factor. */
2008 vect_find_same_alignment_drs (struct data_dependence_relation
*ddr
,
2009 loop_vec_info loop_vinfo
)
2012 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
2013 int vectorization_factor
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
2014 struct data_reference
*dra
= DDR_A (ddr
);
2015 struct data_reference
*drb
= DDR_B (ddr
);
2016 stmt_vec_info stmtinfo_a
= vinfo_for_stmt (DR_STMT (dra
));
2017 stmt_vec_info stmtinfo_b
= vinfo_for_stmt (DR_STMT (drb
));
2018 int dra_size
= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra
))));
2019 int drb_size
= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb
))));
2020 lambda_vector dist_v
;
2021 unsigned int loop_depth
;
2023 if (DDR_ARE_DEPENDENT (ddr
) == chrec_known
)
2029 if (DDR_ARE_DEPENDENT (ddr
) == chrec_dont_know
)
2032 /* Loop-based vectorization and known data dependence. */
2033 if (DDR_NUM_DIST_VECTS (ddr
) == 0)
2036 /* Data-dependence analysis reports a distance vector of zero
2037 for data-references that overlap only in the first iteration
2038 but have different sign step (see PR45764).
2039 So as a sanity check require equal DR_STEP. */
2040 if (!operand_equal_p (DR_STEP (dra
), DR_STEP (drb
), 0))
2043 loop_depth
= index_in_loop_nest (loop
->num
, DDR_LOOP_NEST (ddr
));
2044 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr
), i
, dist_v
)
2046 int dist
= dist_v
[loop_depth
];
2048 if (dump_enabled_p ())
2049 dump_printf_loc (MSG_NOTE
, vect_location
,
2050 "dependence distance = %d.\n", dist
);
2052 /* Same loop iteration. */
2054 || (dist
% vectorization_factor
== 0 && dra_size
== drb_size
))
2056 /* Two references with distance zero have the same alignment. */
2057 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a
).safe_push (drb
);
2058 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b
).safe_push (dra
);
2059 if (dump_enabled_p ())
2061 dump_printf_loc (MSG_NOTE
, vect_location
,
2062 "accesses have the same alignment.\n");
2063 dump_printf (MSG_NOTE
,
2064 "dependence distance modulo vf == 0 between ");
2065 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dra
));
2066 dump_printf (MSG_NOTE
, " and ");
2067 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (drb
));
2068 dump_printf (MSG_NOTE
, "\n");
2075 /* Function vect_analyze_data_refs_alignment
2077 Analyze the alignment of the data-references in the loop.
2078 Return FALSE if a data reference is found that cannot be vectorized. */
2081 vect_analyze_data_refs_alignment (loop_vec_info vinfo
)
2083 if (dump_enabled_p ())
2084 dump_printf_loc (MSG_NOTE
, vect_location
,
2085 "=== vect_analyze_data_refs_alignment ===\n");
2087 /* Mark groups of data references with same alignment using
2088 data dependence information. */
2089 vec
<ddr_p
> ddrs
= vinfo
->ddrs
;
2090 struct data_dependence_relation
*ddr
;
2093 FOR_EACH_VEC_ELT (ddrs
, i
, ddr
)
2094 vect_find_same_alignment_drs (ddr
, vinfo
);
2096 vec
<data_reference_p
> datarefs
= vinfo
->datarefs
;
2097 struct data_reference
*dr
;
2099 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
2101 stmt_vec_info stmt_info
= vinfo_for_stmt (DR_STMT (dr
));
2102 if (STMT_VINFO_VECTORIZABLE (stmt_info
)
2103 && !vect_compute_data_ref_alignment (dr
))
2105 /* Strided accesses perform only component accesses, misalignment
2106 information is irrelevant for them. */
2107 if (STMT_VINFO_STRIDED_P (stmt_info
)
2108 && !STMT_VINFO_GROUPED_ACCESS (stmt_info
))
2111 if (dump_enabled_p ())
2112 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2113 "not vectorized: can't calculate alignment "
2124 /* Analyze alignment of DRs of stmts in NODE. */
2127 vect_slp_analyze_and_verify_node_alignment (slp_tree node
)
2129 /* We vectorize from the first scalar stmt in the node unless
2130 the node is permuted in which case we start from the first
2131 element in the group. */
2132 gimple
*first_stmt
= SLP_TREE_SCALAR_STMTS (node
)[0];
2133 data_reference_p first_dr
= STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt
));
2134 if (SLP_TREE_LOAD_PERMUTATION (node
).exists ())
2135 first_stmt
= GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt
));
2137 data_reference_p dr
= STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt
));
2138 if (! vect_compute_data_ref_alignment (dr
)
2139 /* For creating the data-ref pointer we need alignment of the
2140 first element anyway. */
2142 && ! vect_compute_data_ref_alignment (first_dr
))
2143 || ! verify_data_ref_alignment (dr
))
2145 if (dump_enabled_p ())
2146 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2147 "not vectorized: bad data alignment in basic "
2155 /* Function vect_slp_analyze_instance_alignment
2157 Analyze the alignment of the data-references in the SLP instance.
2158 Return FALSE if a data reference is found that cannot be vectorized. */
2161 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance
)
2163 if (dump_enabled_p ())
2164 dump_printf_loc (MSG_NOTE
, vect_location
,
2165 "=== vect_slp_analyze_and_verify_instance_alignment ===\n");
2169 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance
), i
, node
)
2170 if (! vect_slp_analyze_and_verify_node_alignment (node
))
2173 node
= SLP_INSTANCE_TREE (instance
);
2174 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node
)[0]))
2175 && ! vect_slp_analyze_and_verify_node_alignment
2176 (SLP_INSTANCE_TREE (instance
)))
2183 /* Analyze groups of accesses: check that DR belongs to a group of
2184 accesses of legal size, step, etc. Detect gaps, single element
2185 interleaving, and other special cases. Set grouped access info.
2186 Collect groups of strided stores for further use in SLP analysis.
2187 Worker for vect_analyze_group_access. */
2190 vect_analyze_group_access_1 (struct data_reference
*dr
)
2192 tree step
= DR_STEP (dr
);
2193 tree scalar_type
= TREE_TYPE (DR_REF (dr
));
2194 HOST_WIDE_INT type_size
= TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type
));
2195 gimple
*stmt
= DR_STMT (dr
);
2196 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
2197 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
2198 bb_vec_info bb_vinfo
= STMT_VINFO_BB_VINFO (stmt_info
);
2199 HOST_WIDE_INT dr_step
= -1;
2200 HOST_WIDE_INT groupsize
, last_accessed_element
= 1;
2201 bool slp_impossible
= false;
2203 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2204 size of the interleaving group (including gaps). */
2205 if (tree_fits_shwi_p (step
))
2207 dr_step
= tree_to_shwi (step
);
2208 /* Check that STEP is a multiple of type size. Otherwise there is
2209 a non-element-sized gap at the end of the group which we
2210 cannot represent in GROUP_GAP or GROUP_SIZE.
2211 ??? As we can handle non-constant step fine here we should
2212 simply remove uses of GROUP_GAP between the last and first
2213 element and instead rely on DR_STEP. GROUP_SIZE then would
2214 simply not include that gap. */
2215 if ((dr_step
% type_size
) != 0)
2217 if (dump_enabled_p ())
2219 dump_printf_loc (MSG_NOTE
, vect_location
,
2221 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, step
);
2222 dump_printf (MSG_NOTE
,
2223 " is not a multiple of the element size for ");
2224 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dr
));
2225 dump_printf (MSG_NOTE
, "\n");
2229 groupsize
= absu_hwi (dr_step
) / type_size
;
2234 /* Not consecutive access is possible only if it is a part of interleaving. */
2235 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)))
2237 /* Check if it this DR is a part of interleaving, and is a single
2238 element of the group that is accessed in the loop. */
2240 /* Gaps are supported only for loads. STEP must be a multiple of the type
2241 size. The size of the group must be a power of 2. */
2243 && (dr_step
% type_size
) == 0
2245 && pow2p_hwi (groupsize
))
2247 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) = stmt
;
2248 GROUP_SIZE (vinfo_for_stmt (stmt
)) = groupsize
;
2249 GROUP_GAP (stmt_info
) = groupsize
- 1;
2250 if (dump_enabled_p ())
2252 dump_printf_loc (MSG_NOTE
, vect_location
,
2253 "Detected single element interleaving ");
2254 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dr
));
2255 dump_printf (MSG_NOTE
, " step ");
2256 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, step
);
2257 dump_printf (MSG_NOTE
, "\n");
2263 if (dump_enabled_p ())
2265 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2266 "not consecutive access ");
2267 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
2272 /* Mark the statement as unvectorizable. */
2273 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr
))) = false;
2277 dump_printf_loc (MSG_NOTE
, vect_location
, "using strided accesses\n");
2278 STMT_VINFO_STRIDED_P (stmt_info
) = true;
2282 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) == stmt
)
2284 /* First stmt in the interleaving chain. Check the chain. */
2285 gimple
*next
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt
));
2286 struct data_reference
*data_ref
= dr
;
2287 unsigned int count
= 1;
2288 tree prev_init
= DR_INIT (data_ref
);
2289 gimple
*prev
= stmt
;
2290 HOST_WIDE_INT diff
, gaps
= 0;
2294 /* Skip same data-refs. In case that two or more stmts share
2295 data-ref (supported only for loads), we vectorize only the first
2296 stmt, and the rest get their vectorized loads from the first
2298 if (!tree_int_cst_compare (DR_INIT (data_ref
),
2299 DR_INIT (STMT_VINFO_DATA_REF (
2300 vinfo_for_stmt (next
)))))
2302 if (DR_IS_WRITE (data_ref
))
2304 if (dump_enabled_p ())
2305 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2306 "Two store stmts share the same dr.\n");
2310 if (dump_enabled_p ())
2311 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2312 "Two or more load stmts share the same dr.\n");
2314 /* For load use the same data-ref load. */
2315 GROUP_SAME_DR_STMT (vinfo_for_stmt (next
)) = prev
;
2318 next
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (next
));
2323 data_ref
= STMT_VINFO_DATA_REF (vinfo_for_stmt (next
));
2325 /* All group members have the same STEP by construction. */
2326 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref
), step
, 0));
2328 /* Check that the distance between two accesses is equal to the type
2329 size. Otherwise, we have gaps. */
2330 diff
= (TREE_INT_CST_LOW (DR_INIT (data_ref
))
2331 - TREE_INT_CST_LOW (prev_init
)) / type_size
;
2334 /* FORNOW: SLP of accesses with gaps is not supported. */
2335 slp_impossible
= true;
2336 if (DR_IS_WRITE (data_ref
))
2338 if (dump_enabled_p ())
2339 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2340 "interleaved store with gaps\n");
2347 last_accessed_element
+= diff
;
2349 /* Store the gap from the previous member of the group. If there is no
2350 gap in the access, GROUP_GAP is always 1. */
2351 GROUP_GAP (vinfo_for_stmt (next
)) = diff
;
2353 prev_init
= DR_INIT (data_ref
);
2354 next
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (next
));
2355 /* Count the number of data-refs in the chain. */
2360 groupsize
= count
+ gaps
;
2362 if (groupsize
> UINT_MAX
)
2364 if (dump_enabled_p ())
2365 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2366 "group is too large\n");
2370 /* Check that the size of the interleaving is equal to count for stores,
2371 i.e., that there are no gaps. */
2372 if (groupsize
!= count
2373 && !DR_IS_READ (dr
))
2375 if (dump_enabled_p ())
2376 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2377 "interleaved store with gaps\n");
2381 /* If there is a gap after the last load in the group it is the
2382 difference between the groupsize and the last accessed
2384 When there is no gap, this difference should be 0. */
2385 GROUP_GAP (vinfo_for_stmt (stmt
)) = groupsize
- last_accessed_element
;
2387 GROUP_SIZE (vinfo_for_stmt (stmt
)) = groupsize
;
2388 if (dump_enabled_p ())
2390 dump_printf_loc (MSG_NOTE
, vect_location
,
2391 "Detected interleaving ");
2392 if (DR_IS_READ (dr
))
2393 dump_printf (MSG_NOTE
, "load ");
2395 dump_printf (MSG_NOTE
, "store ");
2396 dump_printf (MSG_NOTE
, "of size %u starting with ",
2397 (unsigned)groupsize
);
2398 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, stmt
, 0);
2399 if (GROUP_GAP (vinfo_for_stmt (stmt
)) != 0)
2400 dump_printf_loc (MSG_NOTE
, vect_location
,
2401 "There is a gap of %u elements after the group\n",
2402 GROUP_GAP (vinfo_for_stmt (stmt
)));
2405 /* SLP: create an SLP data structure for every interleaving group of
2406 stores for further analysis in vect_analyse_slp. */
2407 if (DR_IS_WRITE (dr
) && !slp_impossible
)
2410 LOOP_VINFO_GROUPED_STORES (loop_vinfo
).safe_push (stmt
);
2412 BB_VINFO_GROUPED_STORES (bb_vinfo
).safe_push (stmt
);
2419 /* Analyze groups of accesses: check that DR belongs to a group of
2420 accesses of legal size, step, etc. Detect gaps, single element
2421 interleaving, and other special cases. Set grouped access info.
2422 Collect groups of strided stores for further use in SLP analysis. */
2425 vect_analyze_group_access (struct data_reference
*dr
)
2427 if (!vect_analyze_group_access_1 (dr
))
2429 /* Dissolve the group if present. */
2431 gimple
*stmt
= GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr
)));
2434 stmt_vec_info vinfo
= vinfo_for_stmt (stmt
);
2435 next
= GROUP_NEXT_ELEMENT (vinfo
);
2436 GROUP_FIRST_ELEMENT (vinfo
) = NULL
;
2437 GROUP_NEXT_ELEMENT (vinfo
) = NULL
;
2445 /* Analyze the access pattern of the data-reference DR.
2446 In case of non-consecutive accesses call vect_analyze_group_access() to
2447 analyze groups of accesses. */
2450 vect_analyze_data_ref_access (struct data_reference
*dr
)
2452 tree step
= DR_STEP (dr
);
2453 tree scalar_type
= TREE_TYPE (DR_REF (dr
));
2454 gimple
*stmt
= DR_STMT (dr
);
2455 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
2456 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
2457 struct loop
*loop
= NULL
;
2460 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
2462 if (loop_vinfo
&& !step
)
2464 if (dump_enabled_p ())
2465 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2466 "bad data-ref access in loop\n");
2470 /* Allow loads with zero step in inner-loop vectorization. */
2471 if (loop_vinfo
&& integer_zerop (step
))
2473 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) = NULL
;
2474 if (!nested_in_vect_loop_p (loop
, stmt
))
2475 return DR_IS_READ (dr
);
2476 /* Allow references with zero step for outer loops marked
2477 with pragma omp simd only - it guarantees absence of
2478 loop-carried dependencies between inner loop iterations. */
2479 if (!loop
->force_vectorize
)
2481 if (dump_enabled_p ())
2482 dump_printf_loc (MSG_NOTE
, vect_location
,
2483 "zero step in inner loop of nest\n");
2488 if (loop
&& nested_in_vect_loop_p (loop
, stmt
))
2490 /* Interleaved accesses are not yet supported within outer-loop
2491 vectorization for references in the inner-loop. */
2492 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) = NULL
;
2494 /* For the rest of the analysis we use the outer-loop step. */
2495 step
= STMT_VINFO_DR_STEP (stmt_info
);
2496 if (integer_zerop (step
))
2498 if (dump_enabled_p ())
2499 dump_printf_loc (MSG_NOTE
, vect_location
,
2500 "zero step in outer loop.\n");
2501 return DR_IS_READ (dr
);
2506 if (TREE_CODE (step
) == INTEGER_CST
)
2508 HOST_WIDE_INT dr_step
= TREE_INT_CST_LOW (step
);
2509 if (!tree_int_cst_compare (step
, TYPE_SIZE_UNIT (scalar_type
))
2511 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type
), -dr_step
)))
2513 /* Mark that it is not interleaving. */
2514 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) = NULL
;
2519 if (loop
&& nested_in_vect_loop_p (loop
, stmt
))
2521 if (dump_enabled_p ())
2522 dump_printf_loc (MSG_NOTE
, vect_location
,
2523 "grouped access in outer loop.\n");
2528 /* Assume this is a DR handled by non-constant strided load case. */
2529 if (TREE_CODE (step
) != INTEGER_CST
)
2530 return (STMT_VINFO_STRIDED_P (stmt_info
)
2531 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info
)
2532 || vect_analyze_group_access (dr
)));
2534 /* Not consecutive access - check if it's a part of interleaving group. */
2535 return vect_analyze_group_access (dr
);
2540 /* A helper function used in the comparator function to sort data
2541 references. T1 and T2 are two data references to be compared.
2542 The function returns -1, 0, or 1. */
2545 compare_tree (tree t1
, tree t2
)
2548 enum tree_code code
;
2561 if (TREE_CODE (t1
) != TREE_CODE (t2
))
2562 return TREE_CODE (t1
) < TREE_CODE (t2
) ? -1 : 1;
2564 code
= TREE_CODE (t1
);
2567 /* For const values, we can just use hash values for comparisons. */
2575 hashval_t h1
= iterative_hash_expr (t1
, 0);
2576 hashval_t h2
= iterative_hash_expr (t2
, 0);
2578 return h1
< h2
? -1 : 1;
2583 cmp
= compare_tree (SSA_NAME_VAR (t1
), SSA_NAME_VAR (t2
));
2587 if (SSA_NAME_VERSION (t1
) != SSA_NAME_VERSION (t2
))
2588 return SSA_NAME_VERSION (t1
) < SSA_NAME_VERSION (t2
) ? -1 : 1;
2592 tclass
= TREE_CODE_CLASS (code
);
2594 /* For var-decl, we could compare their UIDs. */
2595 if (tclass
== tcc_declaration
)
2597 if (DECL_UID (t1
) != DECL_UID (t2
))
2598 return DECL_UID (t1
) < DECL_UID (t2
) ? -1 : 1;
2602 /* For expressions with operands, compare their operands recursively. */
2603 for (i
= TREE_OPERAND_LENGTH (t1
) - 1; i
>= 0; --i
)
2605 cmp
= compare_tree (TREE_OPERAND (t1
, i
), TREE_OPERAND (t2
, i
));
2615 /* Compare two data-references DRA and DRB to group them into chunks
2616 suitable for grouping. */
2619 dr_group_sort_cmp (const void *dra_
, const void *drb_
)
2621 data_reference_p dra
= *(data_reference_p
*)const_cast<void *>(dra_
);
2622 data_reference_p drb
= *(data_reference_p
*)const_cast<void *>(drb_
);
2625 /* Stabilize sort. */
2629 /* DRs in different loops never belong to the same group. */
2630 loop_p loopa
= gimple_bb (DR_STMT (dra
))->loop_father
;
2631 loop_p loopb
= gimple_bb (DR_STMT (drb
))->loop_father
;
2633 return loopa
->num
< loopb
->num
? -1 : 1;
2635 /* Ordering of DRs according to base. */
2636 if (!operand_equal_p (DR_BASE_ADDRESS (dra
), DR_BASE_ADDRESS (drb
), 0))
2638 cmp
= compare_tree (DR_BASE_ADDRESS (dra
), DR_BASE_ADDRESS (drb
));
2643 /* And according to DR_OFFSET. */
2644 if (!dr_equal_offsets_p (dra
, drb
))
2646 cmp
= compare_tree (DR_OFFSET (dra
), DR_OFFSET (drb
));
2651 /* Put reads before writes. */
2652 if (DR_IS_READ (dra
) != DR_IS_READ (drb
))
2653 return DR_IS_READ (dra
) ? -1 : 1;
2655 /* Then sort after access size. */
2656 if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra
))),
2657 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb
))), 0))
2659 cmp
= compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra
))),
2660 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb
))));
2665 /* And after step. */
2666 if (!operand_equal_p (DR_STEP (dra
), DR_STEP (drb
), 0))
2668 cmp
= compare_tree (DR_STEP (dra
), DR_STEP (drb
));
2673 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2674 cmp
= tree_int_cst_compare (DR_INIT (dra
), DR_INIT (drb
));
2676 return gimple_uid (DR_STMT (dra
)) < gimple_uid (DR_STMT (drb
)) ? -1 : 1;
2680 /* Function vect_analyze_data_ref_accesses.
2682 Analyze the access pattern of all the data references in the loop.
2684 FORNOW: the only access pattern that is considered vectorizable is a
2685 simple step 1 (consecutive) access.
2687 FORNOW: handle only arrays and pointer accesses. */
2690 vect_analyze_data_ref_accesses (vec_info
*vinfo
)
2693 vec
<data_reference_p
> datarefs
= vinfo
->datarefs
;
2694 struct data_reference
*dr
;
2696 if (dump_enabled_p ())
2697 dump_printf_loc (MSG_NOTE
, vect_location
,
2698 "=== vect_analyze_data_ref_accesses ===\n");
2700 if (datarefs
.is_empty ())
2703 /* Sort the array of datarefs to make building the interleaving chains
2704 linear. Don't modify the original vector's order, it is needed for
2705 determining what dependencies are reversed. */
2706 vec
<data_reference_p
> datarefs_copy
= datarefs
.copy ();
2707 datarefs_copy
.qsort (dr_group_sort_cmp
);
2709 /* Build the interleaving chains. */
2710 for (i
= 0; i
< datarefs_copy
.length () - 1;)
2712 data_reference_p dra
= datarefs_copy
[i
];
2713 stmt_vec_info stmtinfo_a
= vinfo_for_stmt (DR_STMT (dra
));
2714 stmt_vec_info lastinfo
= NULL
;
2715 if (! STMT_VINFO_VECTORIZABLE (stmtinfo_a
))
2720 for (i
= i
+ 1; i
< datarefs_copy
.length (); ++i
)
2722 data_reference_p drb
= datarefs_copy
[i
];
2723 stmt_vec_info stmtinfo_b
= vinfo_for_stmt (DR_STMT (drb
));
2724 if (! STMT_VINFO_VECTORIZABLE (stmtinfo_b
))
2727 /* ??? Imperfect sorting (non-compatible types, non-modulo
2728 accesses, same accesses) can lead to a group to be artificially
2729 split here as we don't just skip over those. If it really
2730 matters we can push those to a worklist and re-iterate
2731 over them. The we can just skip ahead to the next DR here. */
2733 /* DRs in a different loop should not be put into the same
2734 interleaving group. */
2735 if (gimple_bb (DR_STMT (dra
))->loop_father
2736 != gimple_bb (DR_STMT (drb
))->loop_father
)
2739 /* Check that the data-refs have same first location (except init)
2740 and they are both either store or load (not load and store,
2741 not masked loads or stores). */
2742 if (DR_IS_READ (dra
) != DR_IS_READ (drb
)
2743 || !operand_equal_p (DR_BASE_ADDRESS (dra
),
2744 DR_BASE_ADDRESS (drb
), 0)
2745 || !dr_equal_offsets_p (dra
, drb
)
2746 || !gimple_assign_single_p (DR_STMT (dra
))
2747 || !gimple_assign_single_p (DR_STMT (drb
)))
2750 /* Check that the data-refs have the same constant size. */
2751 tree sza
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra
)));
2752 tree szb
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb
)));
2753 if (!tree_fits_uhwi_p (sza
)
2754 || !tree_fits_uhwi_p (szb
)
2755 || !tree_int_cst_equal (sza
, szb
))
2758 /* Check that the data-refs have the same step. */
2759 if (!operand_equal_p (DR_STEP (dra
), DR_STEP (drb
), 0))
2762 /* Do not place the same access in the interleaving chain twice. */
2763 if (tree_int_cst_compare (DR_INIT (dra
), DR_INIT (drb
)) == 0)
2766 /* Check the types are compatible.
2767 ??? We don't distinguish this during sorting. */
2768 if (!types_compatible_p (TREE_TYPE (DR_REF (dra
)),
2769 TREE_TYPE (DR_REF (drb
))))
2772 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2773 HOST_WIDE_INT init_a
= TREE_INT_CST_LOW (DR_INIT (dra
));
2774 HOST_WIDE_INT init_b
= TREE_INT_CST_LOW (DR_INIT (drb
));
2775 gcc_assert (init_a
<= init_b
);
2777 /* If init_b == init_a + the size of the type * k, we have an
2778 interleaving, and DRA is accessed before DRB. */
2779 HOST_WIDE_INT type_size_a
= tree_to_uhwi (sza
);
2780 if (type_size_a
== 0
2781 || (init_b
- init_a
) % type_size_a
!= 0)
2784 /* If we have a store, the accesses are adjacent. This splits
2785 groups into chunks we support (we don't support vectorization
2786 of stores with gaps). */
2787 if (!DR_IS_READ (dra
)
2788 && (init_b
- (HOST_WIDE_INT
) TREE_INT_CST_LOW
2789 (DR_INIT (datarefs_copy
[i
-1]))
2793 /* If the step (if not zero or non-constant) is greater than the
2794 difference between data-refs' inits this splits groups into
2796 if (tree_fits_shwi_p (DR_STEP (dra
)))
2798 HOST_WIDE_INT step
= tree_to_shwi (DR_STEP (dra
));
2799 if (step
!= 0 && step
<= (init_b
- init_a
))
2803 if (dump_enabled_p ())
2805 dump_printf_loc (MSG_NOTE
, vect_location
,
2806 "Detected interleaving ");
2807 if (DR_IS_READ (dra
))
2808 dump_printf (MSG_NOTE
, "load ");
2810 dump_printf (MSG_NOTE
, "store ");
2811 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dra
));
2812 dump_printf (MSG_NOTE
, " and ");
2813 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (drb
));
2814 dump_printf (MSG_NOTE
, "\n");
2817 /* Link the found element into the group list. */
2818 if (!GROUP_FIRST_ELEMENT (stmtinfo_a
))
2820 GROUP_FIRST_ELEMENT (stmtinfo_a
) = DR_STMT (dra
);
2821 lastinfo
= stmtinfo_a
;
2823 GROUP_FIRST_ELEMENT (stmtinfo_b
) = DR_STMT (dra
);
2824 GROUP_NEXT_ELEMENT (lastinfo
) = DR_STMT (drb
);
2825 lastinfo
= stmtinfo_b
;
2829 FOR_EACH_VEC_ELT (datarefs_copy
, i
, dr
)
2830 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr
)))
2831 && !vect_analyze_data_ref_access (dr
))
2833 if (dump_enabled_p ())
2834 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2835 "not vectorized: complicated access pattern.\n");
2837 if (is_a
<bb_vec_info
> (vinfo
))
2839 /* Mark the statement as not vectorizable. */
2840 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr
))) = false;
2845 datarefs_copy
.release ();
2850 datarefs_copy
.release ();
2855 /* Operator == between two dr_with_seg_len objects.
2857 This equality operator is used to make sure two data refs
2858 are the same one so that we will consider to combine the
2859 aliasing checks of those two pairs of data dependent data
2863 operator == (const dr_with_seg_len
& d1
,
2864 const dr_with_seg_len
& d2
)
2866 return operand_equal_p (DR_BASE_ADDRESS (d1
.dr
),
2867 DR_BASE_ADDRESS (d2
.dr
), 0)
2868 && compare_tree (DR_OFFSET (d1
.dr
), DR_OFFSET (d2
.dr
)) == 0
2869 && compare_tree (DR_INIT (d1
.dr
), DR_INIT (d2
.dr
)) == 0
2870 && compare_tree (d1
.seg_len
, d2
.seg_len
) == 0;
2873 /* Function comp_dr_with_seg_len_pair.
2875 Comparison function for sorting objects of dr_with_seg_len_pair_t
2876 so that we can combine aliasing checks in one scan. */
2879 comp_dr_with_seg_len_pair (const void *pa_
, const void *pb_
)
2881 const dr_with_seg_len_pair_t
* pa
= (const dr_with_seg_len_pair_t
*) pa_
;
2882 const dr_with_seg_len_pair_t
* pb
= (const dr_with_seg_len_pair_t
*) pb_
;
2883 const dr_with_seg_len
&a1
= pa
->first
, &a2
= pa
->second
;
2884 const dr_with_seg_len
&b1
= pb
->first
, &b2
= pb
->second
;
2886 /* For DR pairs (a, b) and (c, d), we only consider to merge the alias checks
2887 if a and c have the same basic address snd step, and b and d have the same
2888 address and step. Therefore, if any a&c or b&d don't have the same address
2889 and step, we don't care the order of those two pairs after sorting. */
2892 if ((comp_res
= compare_tree (DR_BASE_ADDRESS (a1
.dr
),
2893 DR_BASE_ADDRESS (b1
.dr
))) != 0)
2895 if ((comp_res
= compare_tree (DR_BASE_ADDRESS (a2
.dr
),
2896 DR_BASE_ADDRESS (b2
.dr
))) != 0)
2898 if ((comp_res
= compare_tree (DR_STEP (a1
.dr
), DR_STEP (b1
.dr
))) != 0)
2900 if ((comp_res
= compare_tree (DR_STEP (a2
.dr
), DR_STEP (b2
.dr
))) != 0)
2902 if ((comp_res
= compare_tree (DR_OFFSET (a1
.dr
), DR_OFFSET (b1
.dr
))) != 0)
2904 if ((comp_res
= compare_tree (DR_INIT (a1
.dr
), DR_INIT (b1
.dr
))) != 0)
2906 if ((comp_res
= compare_tree (DR_OFFSET (a2
.dr
), DR_OFFSET (b2
.dr
))) != 0)
2908 if ((comp_res
= compare_tree (DR_INIT (a2
.dr
), DR_INIT (b2
.dr
))) != 0)
2914 /* Function vect_vfa_segment_size.
2916 Create an expression that computes the size of segment
2917 that will be accessed for a data reference. The functions takes into
2918 account that realignment loads may access one more vector.
2921 DR: The data reference.
2922 LENGTH_FACTOR: segment length to consider.
2924 Return an expression whose value is the size of segment which will be
2928 vect_vfa_segment_size (struct data_reference
*dr
, tree length_factor
)
2930 tree segment_length
;
2932 if (integer_zerop (DR_STEP (dr
)))
2933 segment_length
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr
)));
2935 segment_length
= size_binop (MULT_EXPR
,
2936 fold_convert (sizetype
, DR_STEP (dr
)),
2937 fold_convert (sizetype
, length_factor
));
2939 if (vect_supportable_dr_alignment (dr
, false)
2940 == dr_explicit_realign_optimized
)
2942 tree vector_size
= TYPE_SIZE_UNIT
2943 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr
))));
2945 segment_length
= size_binop (PLUS_EXPR
, segment_length
, vector_size
);
2947 return segment_length
;
2950 /* Function vect_no_alias_p.
2952 Given data references A and B with equal base and offset, the alias
2953 relation can be decided at compilation time, return TRUE if they do
2954 not alias to each other; return FALSE otherwise. SEGMENT_LENGTH_A
2955 and SEGMENT_LENGTH_B are the memory lengths accessed by A and B
2959 vect_no_alias_p (struct data_reference
*a
, struct data_reference
*b
,
2960 tree segment_length_a
, tree segment_length_b
)
2962 gcc_assert (TREE_CODE (DR_INIT (a
)) == INTEGER_CST
2963 && TREE_CODE (DR_INIT (b
)) == INTEGER_CST
);
2964 if (tree_int_cst_equal (DR_INIT (a
), DR_INIT (b
)))
2967 tree seg_a_min
= DR_INIT (a
);
2968 tree seg_a_max
= fold_build2 (PLUS_EXPR
, TREE_TYPE (seg_a_min
),
2969 seg_a_min
, segment_length_a
);
2970 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
2971 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
2973 if (tree_int_cst_compare (DR_STEP (a
), size_zero_node
) < 0)
2975 tree unit_size
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (a
)));
2976 seg_a_min
= fold_build2 (PLUS_EXPR
, TREE_TYPE (seg_a_max
),
2977 seg_a_max
, unit_size
);
2978 seg_a_max
= fold_build2 (PLUS_EXPR
, TREE_TYPE (DR_INIT (a
)),
2979 DR_INIT (a
), unit_size
);
2981 tree seg_b_min
= DR_INIT (b
);
2982 tree seg_b_max
= fold_build2 (PLUS_EXPR
, TREE_TYPE (seg_b_min
),
2983 seg_b_min
, segment_length_b
);
2984 if (tree_int_cst_compare (DR_STEP (b
), size_zero_node
) < 0)
2986 tree unit_size
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (b
)));
2987 seg_b_min
= fold_build2 (PLUS_EXPR
, TREE_TYPE (seg_b_max
),
2988 seg_b_max
, unit_size
);
2989 seg_b_max
= fold_build2 (PLUS_EXPR
, TREE_TYPE (DR_INIT (b
)),
2990 DR_INIT (b
), unit_size
);
2993 if (tree_int_cst_le (seg_a_max
, seg_b_min
)
2994 || tree_int_cst_le (seg_b_max
, seg_a_min
))
3000 /* Function vect_prune_runtime_alias_test_list.
3002 Prune a list of ddrs to be tested at run-time by versioning for alias.
3003 Merge several alias checks into one if possible.
3004 Return FALSE if resulting list of ddrs is longer then allowed by
3005 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3008 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo
)
3010 vec
<ddr_p
> may_alias_ddrs
=
3011 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo
);
3012 vec
<dr_with_seg_len_pair_t
>& comp_alias_ddrs
=
3013 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo
);
3014 int vect_factor
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
3015 tree scalar_loop_iters
= LOOP_VINFO_NITERS (loop_vinfo
);
3021 if (dump_enabled_p ())
3022 dump_printf_loc (MSG_NOTE
, vect_location
,
3023 "=== vect_prune_runtime_alias_test_list ===\n");
3025 if (may_alias_ddrs
.is_empty ())
3028 /* Basically, for each pair of dependent data refs store_ptr_0
3029 and load_ptr_0, we create an expression:
3031 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
3032 || (load_ptr_0 + load_segment_length_0) <= store_ptr_0))
3034 for aliasing checks. However, in some cases we can decrease
3035 the number of checks by combining two checks into one. For
3036 example, suppose we have another pair of data refs store_ptr_0
3037 and load_ptr_1, and if the following condition is satisfied:
3039 load_ptr_0 < load_ptr_1 &&
3040 load_ptr_1 - load_ptr_0 - load_segment_length_0 < store_segment_length_0
3042 (this condition means, in each iteration of vectorized loop,
3043 the accessed memory of store_ptr_0 cannot be between the memory
3044 of load_ptr_0 and load_ptr_1.)
3046 we then can use only the following expression to finish the
3047 alising checks between store_ptr_0 & load_ptr_0 and
3048 store_ptr_0 & load_ptr_1:
3050 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
3051 || (load_ptr_1 + load_segment_length_1 <= store_ptr_0))
3053 Note that we only consider that load_ptr_0 and load_ptr_1 have the
3054 same basic address. */
3056 comp_alias_ddrs
.create (may_alias_ddrs
.length ());
3058 /* First, we collect all data ref pairs for aliasing checks. */
3059 FOR_EACH_VEC_ELT (may_alias_ddrs
, i
, ddr
)
3062 struct data_reference
*dr_a
, *dr_b
;
3063 gimple
*dr_group_first_a
, *dr_group_first_b
;
3064 tree segment_length_a
, segment_length_b
;
3065 gimple
*stmt_a
, *stmt_b
;
3068 stmt_a
= DR_STMT (DDR_A (ddr
));
3069 dr_group_first_a
= GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a
));
3070 if (dr_group_first_a
)
3072 stmt_a
= dr_group_first_a
;
3073 dr_a
= STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a
));
3077 stmt_b
= DR_STMT (DDR_B (ddr
));
3078 dr_group_first_b
= GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b
));
3079 if (dr_group_first_b
)
3081 stmt_b
= dr_group_first_b
;
3082 dr_b
= STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b
));
3085 if (!operand_equal_p (DR_STEP (dr_a
), DR_STEP (dr_b
), 0))
3086 length_factor
= scalar_loop_iters
;
3088 length_factor
= size_int (vect_factor
);
3089 segment_length_a
= vect_vfa_segment_size (dr_a
, length_factor
);
3090 segment_length_b
= vect_vfa_segment_size (dr_b
, length_factor
);
3092 comp_res
= compare_tree (DR_BASE_ADDRESS (dr_a
), DR_BASE_ADDRESS (dr_b
));
3094 comp_res
= compare_tree (DR_OFFSET (dr_a
), DR_OFFSET (dr_b
));
3096 /* Alias is known at compilation time. */
3098 && TREE_CODE (DR_STEP (dr_a
)) == INTEGER_CST
3099 && TREE_CODE (DR_STEP (dr_b
)) == INTEGER_CST
3100 && TREE_CODE (segment_length_a
) == INTEGER_CST
3101 && TREE_CODE (segment_length_b
) == INTEGER_CST
)
3103 if (vect_no_alias_p (dr_a
, dr_b
, segment_length_a
, segment_length_b
))
3106 if (dump_enabled_p ())
3107 dump_printf_loc (MSG_NOTE
, vect_location
,
3108 "not vectorized: compilation time alias.\n");
3113 dr_with_seg_len_pair_t dr_with_seg_len_pair
3114 (dr_with_seg_len (dr_a
, segment_length_a
),
3115 dr_with_seg_len (dr_b
, segment_length_b
));
3117 /* Canonicalize pairs by sorting the two DR members. */
3119 std::swap (dr_with_seg_len_pair
.first
, dr_with_seg_len_pair
.second
);
3121 comp_alias_ddrs
.safe_push (dr_with_seg_len_pair
);
3124 /* Second, we sort the collected data ref pairs so that we can scan
3125 them once to combine all possible aliasing checks. */
3126 comp_alias_ddrs
.qsort (comp_dr_with_seg_len_pair
);
3128 /* Third, we scan the sorted dr pairs and check if we can combine
3129 alias checks of two neighboring dr pairs. */
3130 for (size_t i
= 1; i
< comp_alias_ddrs
.length (); ++i
)
3132 /* Deal with two ddrs (dr_a1, dr_b1) and (dr_a2, dr_b2). */
3133 dr_with_seg_len
*dr_a1
= &comp_alias_ddrs
[i
-1].first
,
3134 *dr_b1
= &comp_alias_ddrs
[i
-1].second
,
3135 *dr_a2
= &comp_alias_ddrs
[i
].first
,
3136 *dr_b2
= &comp_alias_ddrs
[i
].second
;
3138 /* Remove duplicate data ref pairs. */
3139 if (*dr_a1
== *dr_a2
&& *dr_b1
== *dr_b2
)
3141 if (dump_enabled_p ())
3143 dump_printf_loc (MSG_NOTE
, vect_location
,
3144 "found equal ranges ");
3145 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3146 DR_REF (dr_a1
->dr
));
3147 dump_printf (MSG_NOTE
, ", ");
3148 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3149 DR_REF (dr_b1
->dr
));
3150 dump_printf (MSG_NOTE
, " and ");
3151 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3152 DR_REF (dr_a2
->dr
));
3153 dump_printf (MSG_NOTE
, ", ");
3154 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3155 DR_REF (dr_b2
->dr
));
3156 dump_printf (MSG_NOTE
, "\n");
3159 comp_alias_ddrs
.ordered_remove (i
--);
3163 if (*dr_a1
== *dr_a2
|| *dr_b1
== *dr_b2
)
3165 /* We consider the case that DR_B1 and DR_B2 are same memrefs,
3166 and DR_A1 and DR_A2 are two consecutive memrefs. */
3167 if (*dr_a1
== *dr_a2
)
3169 std::swap (dr_a1
, dr_b1
);
3170 std::swap (dr_a2
, dr_b2
);
3173 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a1
->dr
),
3174 DR_BASE_ADDRESS (dr_a2
->dr
), 0)
3175 || !operand_equal_p (DR_OFFSET (dr_a1
->dr
),
3176 DR_OFFSET (dr_a2
->dr
), 0)
3177 || !tree_fits_shwi_p (DR_INIT (dr_a1
->dr
))
3178 || !tree_fits_shwi_p (DR_INIT (dr_a2
->dr
)))
3181 /* Make sure dr_a1 starts left of dr_a2. */
3182 if (tree_int_cst_lt (DR_INIT (dr_a2
->dr
), DR_INIT (dr_a1
->dr
)))
3183 std::swap (*dr_a1
, *dr_a2
);
3185 bool do_remove
= false;
3186 unsigned HOST_WIDE_INT diff
3187 = (tree_to_shwi (DR_INIT (dr_a2
->dr
))
3188 - tree_to_shwi (DR_INIT (dr_a1
->dr
)));
3190 /* If the left segment does not extend beyond the start of the
3191 right segment the new segment length is that of the right
3192 plus the segment distance. */
3193 if (tree_fits_uhwi_p (dr_a1
->seg_len
)
3194 && compare_tree_int (dr_a1
->seg_len
, diff
) <= 0)
3196 dr_a1
->seg_len
= size_binop (PLUS_EXPR
, dr_a2
->seg_len
,
3200 /* Generally the new segment length is the maximum of the
3201 left segment size and the right segment size plus the distance.
3202 ??? We can also build tree MAX_EXPR here but it's not clear this
3204 else if (tree_fits_uhwi_p (dr_a1
->seg_len
)
3205 && tree_fits_uhwi_p (dr_a2
->seg_len
))
3207 unsigned HOST_WIDE_INT seg_len_a1
= tree_to_uhwi (dr_a1
->seg_len
);
3208 unsigned HOST_WIDE_INT seg_len_a2
= tree_to_uhwi (dr_a2
->seg_len
);
3209 dr_a1
->seg_len
= size_int (MAX (seg_len_a1
, diff
+ seg_len_a2
));
3212 /* Now we check if the following condition is satisfied:
3214 DIFF - SEGMENT_LENGTH_A < SEGMENT_LENGTH_B
3216 where DIFF = DR_A2_INIT - DR_A1_INIT. However,
3217 SEGMENT_LENGTH_A or SEGMENT_LENGTH_B may not be constant so we
3218 have to make a best estimation. We can get the minimum value
3219 of SEGMENT_LENGTH_B as a constant, represented by MIN_SEG_LEN_B,
3220 then either of the following two conditions can guarantee the
3223 1: DIFF <= MIN_SEG_LEN_B
3224 2: DIFF - SEGMENT_LENGTH_A < MIN_SEG_LEN_B */
3227 unsigned HOST_WIDE_INT min_seg_len_b
3228 = (tree_fits_uhwi_p (dr_b1
->seg_len
)
3229 ? tree_to_uhwi (dr_b1
->seg_len
)
3232 if (diff
<= min_seg_len_b
3233 || (tree_fits_uhwi_p (dr_a1
->seg_len
)
3234 && diff
- tree_to_uhwi (dr_a1
->seg_len
) < min_seg_len_b
))
3236 dr_a1
->seg_len
= size_binop (PLUS_EXPR
,
3237 dr_a2
->seg_len
, size_int (diff
));
3244 if (dump_enabled_p ())
3246 dump_printf_loc (MSG_NOTE
, vect_location
,
3247 "merging ranges for ");
3248 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dr_a1
->dr
));
3249 dump_printf (MSG_NOTE
, ", ");
3250 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dr_b1
->dr
));
3251 dump_printf (MSG_NOTE
, " and ");
3252 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dr_a2
->dr
));
3253 dump_printf (MSG_NOTE
, ", ");
3254 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dr_b2
->dr
));
3255 dump_printf (MSG_NOTE
, "\n");
3257 comp_alias_ddrs
.ordered_remove (i
--);
3262 dump_printf_loc (MSG_NOTE
, vect_location
,
3263 "improved number of alias checks from %d to %d\n",
3264 may_alias_ddrs
.length (), comp_alias_ddrs
.length ());
3265 if ((int) comp_alias_ddrs
.length () >
3266 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS
))
3268 if (dump_enabled_p ())
3269 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3270 "number of versioning for alias "
3271 "run-time tests exceeds %d "
3272 "(--param vect-max-version-for-alias-checks)\n",
3273 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS
));
3277 /* All alias checks have been resolved at compilation time. */
3278 if (!comp_alias_ddrs
.length ())
3279 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo
).truncate (0);
3284 /* Return true if a non-affine read or write in STMT is suitable for a
3285 gather load or scatter store. Describe the operation in *INFO if so. */
3288 vect_check_gather_scatter (gimple
*stmt
, loop_vec_info loop_vinfo
,
3289 gather_scatter_info
*info
)
3291 HOST_WIDE_INT scale
= 1, pbitpos
, pbitsize
;
3292 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
3293 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
3294 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
3295 tree offtype
= NULL_TREE
;
3296 tree decl
, base
, off
;
3298 int punsignedp
, reversep
, pvolatilep
= 0;
3301 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3302 see if we can use the def stmt of the address. */
3303 if (is_gimple_call (stmt
)
3304 && gimple_call_internal_p (stmt
)
3305 && (gimple_call_internal_fn (stmt
) == IFN_MASK_LOAD
3306 || gimple_call_internal_fn (stmt
) == IFN_MASK_STORE
)
3307 && TREE_CODE (base
) == MEM_REF
3308 && TREE_CODE (TREE_OPERAND (base
, 0)) == SSA_NAME
3309 && integer_zerop (TREE_OPERAND (base
, 1))
3310 && !expr_invariant_in_loop_p (loop
, TREE_OPERAND (base
, 0)))
3312 gimple
*def_stmt
= SSA_NAME_DEF_STMT (TREE_OPERAND (base
, 0));
3313 if (is_gimple_assign (def_stmt
)
3314 && gimple_assign_rhs_code (def_stmt
) == ADDR_EXPR
)
3315 base
= TREE_OPERAND (gimple_assign_rhs1 (def_stmt
), 0);
3318 /* The gather and scatter builtins need address of the form
3319 loop_invariant + vector * {1, 2, 4, 8}
3321 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3322 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3323 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3324 multiplications and additions in it. To get a vector, we need
3325 a single SSA_NAME that will be defined in the loop and will
3326 contain everything that is not loop invariant and that can be
3327 vectorized. The following code attempts to find such a preexistng
3328 SSA_NAME OFF and put the loop invariants into a tree BASE
3329 that can be gimplified before the loop. */
3330 base
= get_inner_reference (base
, &pbitsize
, &pbitpos
, &off
, &pmode
,
3331 &punsignedp
, &reversep
, &pvolatilep
);
3332 gcc_assert (base
&& (pbitpos
% BITS_PER_UNIT
) == 0 && !reversep
);
3334 if (TREE_CODE (base
) == MEM_REF
)
3336 if (!integer_zerop (TREE_OPERAND (base
, 1)))
3338 if (off
== NULL_TREE
)
3340 offset_int moff
= mem_ref_offset (base
);
3341 off
= wide_int_to_tree (sizetype
, moff
);
3344 off
= size_binop (PLUS_EXPR
, off
,
3345 fold_convert (sizetype
, TREE_OPERAND (base
, 1)));
3347 base
= TREE_OPERAND (base
, 0);
3350 base
= build_fold_addr_expr (base
);
3352 if (off
== NULL_TREE
)
3353 off
= size_zero_node
;
3355 /* If base is not loop invariant, either off is 0, then we start with just
3356 the constant offset in the loop invariant BASE and continue with base
3357 as OFF, otherwise give up.
3358 We could handle that case by gimplifying the addition of base + off
3359 into some SSA_NAME and use that as off, but for now punt. */
3360 if (!expr_invariant_in_loop_p (loop
, base
))
3362 if (!integer_zerop (off
))
3365 base
= size_int (pbitpos
/ BITS_PER_UNIT
);
3367 /* Otherwise put base + constant offset into the loop invariant BASE
3368 and continue with OFF. */
3371 base
= fold_convert (sizetype
, base
);
3372 base
= size_binop (PLUS_EXPR
, base
, size_int (pbitpos
/ BITS_PER_UNIT
));
3375 /* OFF at this point may be either a SSA_NAME or some tree expression
3376 from get_inner_reference. Try to peel off loop invariants from it
3377 into BASE as long as possible. */
3379 while (offtype
== NULL_TREE
)
3381 enum tree_code code
;
3382 tree op0
, op1
, add
= NULL_TREE
;
3384 if (TREE_CODE (off
) == SSA_NAME
)
3386 gimple
*def_stmt
= SSA_NAME_DEF_STMT (off
);
3388 if (expr_invariant_in_loop_p (loop
, off
))
3391 if (gimple_code (def_stmt
) != GIMPLE_ASSIGN
)
3394 op0
= gimple_assign_rhs1 (def_stmt
);
3395 code
= gimple_assign_rhs_code (def_stmt
);
3396 op1
= gimple_assign_rhs2 (def_stmt
);
3400 if (get_gimple_rhs_class (TREE_CODE (off
)) == GIMPLE_TERNARY_RHS
)
3402 code
= TREE_CODE (off
);
3403 extract_ops_from_tree (off
, &code
, &op0
, &op1
);
3407 case POINTER_PLUS_EXPR
:
3409 if (expr_invariant_in_loop_p (loop
, op0
))
3414 add
= fold_convert (sizetype
, add
);
3416 add
= size_binop (MULT_EXPR
, add
, size_int (scale
));
3417 base
= size_binop (PLUS_EXPR
, base
, add
);
3420 if (expr_invariant_in_loop_p (loop
, op1
))
3428 if (expr_invariant_in_loop_p (loop
, op1
))
3430 add
= fold_convert (sizetype
, op1
);
3431 add
= size_binop (MINUS_EXPR
, size_zero_node
, add
);
3437 if (scale
== 1 && tree_fits_shwi_p (op1
))
3439 scale
= tree_to_shwi (op1
);
3448 if (!POINTER_TYPE_P (TREE_TYPE (op0
))
3449 && !INTEGRAL_TYPE_P (TREE_TYPE (op0
)))
3451 if (TYPE_PRECISION (TREE_TYPE (op0
))
3452 == TYPE_PRECISION (TREE_TYPE (off
)))
3457 if (TYPE_PRECISION (TREE_TYPE (op0
))
3458 < TYPE_PRECISION (TREE_TYPE (off
)))
3461 offtype
= TREE_TYPE (off
);
3472 /* If at the end OFF still isn't a SSA_NAME or isn't
3473 defined in the loop, punt. */
3474 if (TREE_CODE (off
) != SSA_NAME
3475 || expr_invariant_in_loop_p (loop
, off
))
3478 if (offtype
== NULL_TREE
)
3479 offtype
= TREE_TYPE (off
);
3481 if (DR_IS_READ (dr
))
3482 decl
= targetm
.vectorize
.builtin_gather (STMT_VINFO_VECTYPE (stmt_info
),
3485 decl
= targetm
.vectorize
.builtin_scatter (STMT_VINFO_VECTYPE (stmt_info
),
3488 if (decl
== NULL_TREE
)
3494 info
->offset_dt
= vect_unknown_def_type
;
3495 info
->offset_vectype
= NULL_TREE
;
3496 info
->scale
= scale
;
3500 /* Function vect_analyze_data_refs.
3502 Find all the data references in the loop or basic block.
3504 The general structure of the analysis of data refs in the vectorizer is as
3506 1- vect_analyze_data_refs(loop/bb): call
3507 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3508 in the loop/bb and their dependences.
3509 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3510 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3511 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3516 vect_analyze_data_refs (vec_info
*vinfo
, int *min_vf
)
3518 struct loop
*loop
= NULL
;
3520 struct data_reference
*dr
;
3523 if (dump_enabled_p ())
3524 dump_printf_loc (MSG_NOTE
, vect_location
,
3525 "=== vect_analyze_data_refs ===\n");
3527 if (loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
))
3528 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
3530 /* Go through the data-refs, check that the analysis succeeded. Update
3531 pointer from stmt_vec_info struct to DR and vectype. */
3533 vec
<data_reference_p
> datarefs
= vinfo
->datarefs
;
3534 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
3537 stmt_vec_info stmt_info
;
3538 tree base
, offset
, init
;
3539 enum { SG_NONE
, GATHER
, SCATTER
} gatherscatter
= SG_NONE
;
3540 bool simd_lane_access
= false;
3544 if (!dr
|| !DR_REF (dr
))
3546 if (dump_enabled_p ())
3547 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3548 "not vectorized: unhandled data-ref\n");
3552 stmt
= DR_STMT (dr
);
3553 stmt_info
= vinfo_for_stmt (stmt
);
3555 /* Discard clobbers from the dataref vector. We will remove
3556 clobber stmts during vectorization. */
3557 if (gimple_clobber_p (stmt
))
3560 if (i
== datarefs
.length () - 1)
3565 datarefs
.ordered_remove (i
);
3570 /* Check that analysis of the data-ref succeeded. */
3571 if (!DR_BASE_ADDRESS (dr
) || !DR_OFFSET (dr
) || !DR_INIT (dr
)
3576 && !TREE_THIS_VOLATILE (DR_REF (dr
))
3577 && targetm
.vectorize
.builtin_gather
!= NULL
;
3580 && !TREE_THIS_VOLATILE (DR_REF (dr
))
3581 && targetm
.vectorize
.builtin_scatter
!= NULL
;
3582 bool maybe_simd_lane_access
3583 = is_a
<loop_vec_info
> (vinfo
) && loop
->simduid
;
3585 /* If target supports vector gather loads or scatter stores, or if
3586 this might be a SIMD lane access, see if they can't be used. */
3587 if (is_a
<loop_vec_info
> (vinfo
)
3588 && (maybe_gather
|| maybe_scatter
|| maybe_simd_lane_access
)
3589 && !nested_in_vect_loop_p (loop
, stmt
))
3591 struct data_reference
*newdr
3592 = create_data_ref (NULL
, loop_containing_stmt (stmt
),
3593 DR_REF (dr
), stmt
, maybe_scatter
? false : true);
3594 gcc_assert (newdr
!= NULL
&& DR_REF (newdr
));
3595 if (DR_BASE_ADDRESS (newdr
)
3596 && DR_OFFSET (newdr
)
3599 && integer_zerop (DR_STEP (newdr
)))
3601 if (maybe_simd_lane_access
)
3603 tree off
= DR_OFFSET (newdr
);
3605 if (TREE_CODE (DR_INIT (newdr
)) == INTEGER_CST
3606 && TREE_CODE (off
) == MULT_EXPR
3607 && tree_fits_uhwi_p (TREE_OPERAND (off
, 1)))
3609 tree step
= TREE_OPERAND (off
, 1);
3610 off
= TREE_OPERAND (off
, 0);
3612 if (CONVERT_EXPR_P (off
)
3613 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off
,
3615 < TYPE_PRECISION (TREE_TYPE (off
)))
3616 off
= TREE_OPERAND (off
, 0);
3617 if (TREE_CODE (off
) == SSA_NAME
)
3619 gimple
*def
= SSA_NAME_DEF_STMT (off
);
3620 tree reft
= TREE_TYPE (DR_REF (newdr
));
3621 if (is_gimple_call (def
)
3622 && gimple_call_internal_p (def
)
3623 && (gimple_call_internal_fn (def
)
3624 == IFN_GOMP_SIMD_LANE
))
3626 tree arg
= gimple_call_arg (def
, 0);
3627 gcc_assert (TREE_CODE (arg
) == SSA_NAME
);
3628 arg
= SSA_NAME_VAR (arg
);
3629 if (arg
== loop
->simduid
3631 && tree_int_cst_equal
3632 (TYPE_SIZE_UNIT (reft
),
3635 DR_OFFSET (newdr
) = ssize_int (0);
3636 DR_STEP (newdr
) = step
;
3637 DR_ALIGNED_TO (newdr
)
3638 = size_int (BIGGEST_ALIGNMENT
);
3640 simd_lane_access
= true;
3646 if (!simd_lane_access
&& (maybe_gather
|| maybe_scatter
))
3650 gatherscatter
= GATHER
;
3652 gatherscatter
= SCATTER
;
3655 if (gatherscatter
== SG_NONE
&& !simd_lane_access
)
3656 free_data_ref (newdr
);
3659 if (gatherscatter
== SG_NONE
&& !simd_lane_access
)
3661 if (dump_enabled_p ())
3663 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3664 "not vectorized: data ref analysis "
3666 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3669 if (is_a
<bb_vec_info
> (vinfo
))
3676 if (TREE_CODE (DR_BASE_ADDRESS (dr
)) == INTEGER_CST
)
3678 if (dump_enabled_p ())
3679 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3680 "not vectorized: base addr of dr is a "
3683 if (is_a
<bb_vec_info
> (vinfo
))
3686 if (gatherscatter
!= SG_NONE
|| simd_lane_access
)
3691 if (TREE_THIS_VOLATILE (DR_REF (dr
)))
3693 if (dump_enabled_p ())
3695 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3696 "not vectorized: volatile type ");
3697 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3700 if (is_a
<bb_vec_info
> (vinfo
))
3706 if (stmt_can_throw_internal (stmt
))
3708 if (dump_enabled_p ())
3710 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3711 "not vectorized: statement can throw an "
3713 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3716 if (is_a
<bb_vec_info
> (vinfo
))
3719 if (gatherscatter
!= SG_NONE
|| simd_lane_access
)
3724 if (TREE_CODE (DR_REF (dr
)) == COMPONENT_REF
3725 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr
), 1)))
3727 if (dump_enabled_p ())
3729 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3730 "not vectorized: statement is bitfield "
3732 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3735 if (is_a
<bb_vec_info
> (vinfo
))
3738 if (gatherscatter
!= SG_NONE
|| simd_lane_access
)
3743 base
= unshare_expr (DR_BASE_ADDRESS (dr
));
3744 offset
= unshare_expr (DR_OFFSET (dr
));
3745 init
= unshare_expr (DR_INIT (dr
));
3747 if (is_gimple_call (stmt
)
3748 && (!gimple_call_internal_p (stmt
)
3749 || (gimple_call_internal_fn (stmt
) != IFN_MASK_LOAD
3750 && gimple_call_internal_fn (stmt
) != IFN_MASK_STORE
)))
3752 if (dump_enabled_p ())
3754 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3755 "not vectorized: dr in a call ");
3756 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3759 if (is_a
<bb_vec_info
> (vinfo
))
3762 if (gatherscatter
!= SG_NONE
|| simd_lane_access
)
3767 /* Update DR field in stmt_vec_info struct. */
3769 /* If the dataref is in an inner-loop of the loop that is considered for
3770 for vectorization, we also want to analyze the access relative to
3771 the outer-loop (DR contains information only relative to the
3772 inner-most enclosing loop). We do that by building a reference to the
3773 first location accessed by the inner-loop, and analyze it relative to
3775 if (loop
&& nested_in_vect_loop_p (loop
, stmt
))
3777 tree outer_step
, outer_base
, outer_init
;
3778 HOST_WIDE_INT pbitsize
, pbitpos
;
3781 int punsignedp
, preversep
, pvolatilep
;
3782 affine_iv base_iv
, offset_iv
;
3785 /* Build a reference to the first location accessed by the
3786 inner-loop: *(BASE+INIT). (The first location is actually
3787 BASE+INIT+OFFSET, but we add OFFSET separately later). */
3788 tree inner_base
= build_fold_indirect_ref
3789 (fold_build_pointer_plus (base
, init
));
3791 if (dump_enabled_p ())
3793 dump_printf_loc (MSG_NOTE
, vect_location
,
3794 "analyze in outer-loop: ");
3795 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, inner_base
);
3796 dump_printf (MSG_NOTE
, "\n");
3799 outer_base
= get_inner_reference (inner_base
, &pbitsize
, &pbitpos
,
3800 &poffset
, &pmode
, &punsignedp
,
3801 &preversep
, &pvolatilep
);
3802 gcc_assert (outer_base
!= NULL_TREE
);
3804 if (pbitpos
% BITS_PER_UNIT
!= 0)
3806 if (dump_enabled_p ())
3807 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3808 "failed: bit offset alignment.\n");
3814 if (dump_enabled_p ())
3815 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3816 "failed: reverse storage order.\n");
3820 outer_base
= build_fold_addr_expr (outer_base
);
3821 if (!simple_iv (loop
, loop_containing_stmt (stmt
), outer_base
,
3824 if (dump_enabled_p ())
3825 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3826 "failed: evolution of base is not affine.\n");
3833 poffset
= fold_build2 (PLUS_EXPR
, TREE_TYPE (offset
), offset
,
3841 offset_iv
.base
= ssize_int (0);
3842 offset_iv
.step
= ssize_int (0);
3844 else if (!simple_iv (loop
, loop_containing_stmt (stmt
), poffset
,
3847 if (dump_enabled_p ())
3848 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3849 "evolution of offset is not affine.\n");
3853 outer_init
= ssize_int (pbitpos
/ BITS_PER_UNIT
);
3854 split_constant_offset (base_iv
.base
, &base_iv
.base
, &dinit
);
3855 outer_init
= size_binop (PLUS_EXPR
, outer_init
, dinit
);
3856 split_constant_offset (offset_iv
.base
, &offset_iv
.base
, &dinit
);
3857 outer_init
= size_binop (PLUS_EXPR
, outer_init
, dinit
);
3859 outer_step
= size_binop (PLUS_EXPR
,
3860 fold_convert (ssizetype
, base_iv
.step
),
3861 fold_convert (ssizetype
, offset_iv
.step
));
3863 STMT_VINFO_DR_STEP (stmt_info
) = outer_step
;
3864 /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
3865 STMT_VINFO_DR_BASE_ADDRESS (stmt_info
) = base_iv
.base
;
3866 STMT_VINFO_DR_INIT (stmt_info
) = outer_init
;
3867 STMT_VINFO_DR_OFFSET (stmt_info
) =
3868 fold_convert (ssizetype
, offset_iv
.base
);
3869 STMT_VINFO_DR_ALIGNED_TO (stmt_info
) =
3870 size_int (highest_pow2_factor (offset_iv
.base
));
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_NOTE, vect_location,
			       "\touter base_address: ");
	      dump_generic_expr (MSG_NOTE, TDF_SLIM,
				 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
	      dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
	      dump_generic_expr (MSG_NOTE, TDF_SLIM,
				 STMT_VINFO_DR_OFFSET (stmt_info));
	      dump_printf (MSG_NOTE,
			   "\n\touter constant offset from base address: ");
	      dump_generic_expr (MSG_NOTE, TDF_SLIM,
				 STMT_VINFO_DR_INIT (stmt_info));
	      dump_printf (MSG_NOTE, "\n\touter step: ");
	      dump_generic_expr (MSG_NOTE, TDF_SLIM,
				 STMT_VINFO_DR_STEP (stmt_info));
	      dump_printf (MSG_NOTE, "\n\touter aligned to: ");
	      dump_generic_expr (MSG_NOTE, TDF_SLIM,
				 STMT_VINFO_DR_ALIGNED_TO (stmt_info));
	      dump_printf (MSG_NOTE, "\n");
	    }
	}
3895 if (STMT_VINFO_DATA_REF (stmt_info
))
3897 if (dump_enabled_p ())
3899 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3900 "not vectorized: more than one data ref "
3902 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3905 if (is_a
<bb_vec_info
> (vinfo
))
3908 if (gatherscatter
!= SG_NONE
|| simd_lane_access
)
3913 STMT_VINFO_DATA_REF (stmt_info
) = dr
;
3914 if (simd_lane_access
)
3916 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info
) = true;
3917 free_data_ref (datarefs
[i
]);
3921 /* Set vectype for STMT. */
3922 scalar_type
= TREE_TYPE (DR_REF (dr
));
3923 STMT_VINFO_VECTYPE (stmt_info
)
3924 = get_vectype_for_scalar_type (scalar_type
);
3925 if (!STMT_VINFO_VECTYPE (stmt_info
))
3927 if (dump_enabled_p ())
3929 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3930 "not vectorized: no vectype for stmt: ");
3931 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3932 dump_printf (MSG_MISSED_OPTIMIZATION
, " scalar_type: ");
3933 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_DETAILS
,
3935 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
3938 if (is_a
<bb_vec_info
> (vinfo
))
3940 /* No vector type is fine, the ref can still participate
3941 in dependence analysis, we just can't vectorize it. */
3942 STMT_VINFO_VECTORIZABLE (stmt_info
) = false;
3946 if (gatherscatter
!= SG_NONE
|| simd_lane_access
)
3948 STMT_VINFO_DATA_REF (stmt_info
) = NULL
;
3949 if (gatherscatter
!= SG_NONE
)
3956 if (dump_enabled_p ())
3958 dump_printf_loc (MSG_NOTE
, vect_location
,
3959 "got vectype for stmt: ");
3960 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, stmt
, 0);
3961 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3962 STMT_VINFO_VECTYPE (stmt_info
));
3963 dump_printf (MSG_NOTE
, "\n");
3967 /* Adjust the minimal vectorization factor according to the
3969 vf
= TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info
));
3973 if (gatherscatter
!= SG_NONE
)
3975 gather_scatter_info gs_info
;
3976 if (!vect_check_gather_scatter (stmt
, as_a
<loop_vec_info
> (vinfo
),
3978 || !get_vectype_for_scalar_type (TREE_TYPE (gs_info
.offset
)))
3980 STMT_VINFO_DATA_REF (stmt_info
) = NULL
;
3982 if (dump_enabled_p ())
3984 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3985 (gatherscatter
== GATHER
) ?
3986 "not vectorized: not suitable for gather "
3988 "not vectorized: not suitable for scatter "
3990 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3995 free_data_ref (datarefs
[i
]);
3997 STMT_VINFO_GATHER_SCATTER_P (stmt_info
) = gatherscatter
;
4000 else if (is_a
<loop_vec_info
> (vinfo
)
4001 && TREE_CODE (DR_STEP (dr
)) != INTEGER_CST
)
4003 if (nested_in_vect_loop_p (loop
, stmt
))
4005 if (dump_enabled_p ())
4007 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4008 "not vectorized: not suitable for strided "
4010 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
4014 STMT_VINFO_STRIDED_P (stmt_info
) = true;
  /* If we stopped analysis at the first dataref we could not analyze
     when trying to vectorize a basic-block mark the rest of the datarefs
     as not vectorizable and truncate the vector of datarefs.  That
     avoids spending useless time in analyzing their dependence.  */
  if (i != datarefs.length ())
    {
      gcc_assert (is_a <bb_vec_info> (vinfo));
      for (unsigned j = i; j < datarefs.length (); ++j)
	{
	  data_reference_p dr = datarefs[j];
	  STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
	}
      datarefs.truncate (i);
    }

  return true;
}
/* Function vect_get_new_vect_var.

   Returns a name for a new variable.  The current naming scheme appends the
   prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
   the name of vectorizer generated variables, and appends that to NAME if
   provided.  */

tree
vect_get_new_vect_var (tree type, enum vect_var_kind var_kind,
		       const char *name)
{
  const char *prefix;
  tree new_vect_var;

  switch (var_kind)
    {
    case vect_simple_var:
      prefix = "vect";
      break;
    case vect_scalar_var:
      prefix = "stmp";
      break;
    case vect_mask_var:
      prefix = "vect_mask";
      break;
    case vect_pointer_var:
      prefix = "vectp";
      break;
    default:
      gcc_unreachable ();
    }

  if (name)
    {
      char* tmp = concat (prefix, "_", name, NULL);
      new_vect_var = create_tmp_reg (type, tmp);
      free (tmp);
    }
  else
    new_vect_var = create_tmp_reg (type, prefix);

  return new_vect_var;
}
/* Like vect_get_new_vect_var but return an SSA name.  */

tree
vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind,
		       const char *name)
{
  const char *prefix;
  tree new_vect_var;

  switch (var_kind)
    {
    case vect_simple_var:
      prefix = "vect";
      break;
    case vect_scalar_var:
      prefix = "stmp";
      break;
    case vect_pointer_var:
      prefix = "vectp";
      break;
    default:
      gcc_unreachable ();
    }

  if (name)
    {
      char* tmp = concat (prefix, "_", name, NULL);
      new_vect_var = make_temp_ssa_name (type, NULL, tmp);
      free (tmp);
    }
  else
    new_vect_var = make_temp_ssa_name (type, NULL, prefix);

  return new_vect_var;
}
/* Duplicate ptr info and set alignment/misalignment on NAME from DR.  */

static void
vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr,
				  stmt_vec_info stmt_info)
{
  duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
  unsigned int align = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmt_info));
  int misalign = DR_MISALIGNMENT (dr);
  if (misalign == -1)
    mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
  else
    set_ptr_info_alignment (SSA_NAME_PTR_INFO (name), align, misalign);
}
/* Function vect_create_addr_base_for_vector_ref.

   Create an expression that computes the address of the first memory location
   that will be accessed for a data reference.

   Input:
   STMT: The statement containing the data reference.
   NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
   OFFSET: Optional.  If supplied, it is added to the initial address.
   LOOP:    Specify relative to which loop-nest should the address be computed.
	    For example, when the dataref is in an inner-loop nested in an
	    outer-loop that is now being vectorized, LOOP can be either the
	    outer-loop, or the inner-loop.  The first memory location accessed
	    by the following dataref ('in' points to short):

		for (i=0; i<N; i++)
		   for (j=0; j<M; j++)
		     s += in[i+j]

	    is as follows:
	    if LOOP=i_loop:	&in		(relative to i_loop)
	    if LOOP=j_loop:	&in+i*2B	(relative to j_loop)
   BYTE_OFFSET: Optional, defaulted to NULL.  If supplied, it is added to the
	    initial address.  Unlike OFFSET, which is number of elements to
	    be added, BYTE_OFFSET is measured in bytes.

   Output:
   1. Return an SSA_NAME whose value is the address of the memory location of
      the first vector of the data reference.
   2. If new_stmt_list is not NULL_TREE after return then the caller must insert
      these statement(s) which define the returned SSA_NAME.

   FORNOW: We are only handling array accesses with step 1.  */
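
/* As a concrete illustration of the computation done here (values chosen
   only for the example): for a data-ref with DR_BASE_ADDRESS == &a,
   DR_OFFSET == 0, DR_INIT == 4, an element size of 2 bytes and OFFSET == 3
   elements, the address built below is

	addr_base = &a + (0 + 4) + 3*2 = &a + 10

   BYTE_OFFSET, when supplied, is added to this sum as-is, without the
   element-size scaling that OFFSET receives.  */
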
4166 vect_create_addr_base_for_vector_ref (gimple
*stmt
,
4167 gimple_seq
*new_stmt_list
,
4172 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
4173 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
4175 const char *base_name
;
4178 gimple_seq seq
= NULL
;
4182 tree step
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr
)));
4183 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
4185 if (loop_vinfo
&& loop
&& loop
!= (gimple_bb (stmt
))->loop_father
)
4187 struct loop
*outer_loop
= LOOP_VINFO_LOOP (loop_vinfo
);
4189 gcc_assert (nested_in_vect_loop_p (outer_loop
, stmt
));
4191 data_ref_base
= unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info
));
4192 base_offset
= unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info
));
4193 init
= unshare_expr (STMT_VINFO_DR_INIT (stmt_info
));
4197 data_ref_base
= unshare_expr (DR_BASE_ADDRESS (dr
));
4198 base_offset
= unshare_expr (DR_OFFSET (dr
));
4199 init
= unshare_expr (DR_INIT (dr
));
4203 base_name
= get_name (data_ref_base
);
4206 base_offset
= ssize_int (0);
4207 init
= ssize_int (0);
4208 base_name
= get_name (DR_REF (dr
));
4211 /* Create base_offset */
4212 base_offset
= size_binop (PLUS_EXPR
,
4213 fold_convert (sizetype
, base_offset
),
4214 fold_convert (sizetype
, init
));
4218 offset
= fold_build2 (MULT_EXPR
, sizetype
,
4219 fold_convert (sizetype
, offset
), step
);
4220 base_offset
= fold_build2 (PLUS_EXPR
, sizetype
,
4221 base_offset
, offset
);
4225 byte_offset
= fold_convert (sizetype
, byte_offset
);
4226 base_offset
= fold_build2 (PLUS_EXPR
, sizetype
,
4227 base_offset
, byte_offset
);
4230 /* base + base_offset */
4232 addr_base
= fold_build_pointer_plus (data_ref_base
, base_offset
);
4235 addr_base
= build1 (ADDR_EXPR
,
4236 build_pointer_type (TREE_TYPE (DR_REF (dr
))),
4237 unshare_expr (DR_REF (dr
)));
4240 vect_ptr_type
= build_pointer_type (STMT_VINFO_VECTYPE (stmt_info
));
4241 dest
= vect_get_new_vect_var (vect_ptr_type
, vect_pointer_var
, base_name
);
4242 addr_base
= force_gimple_operand (addr_base
, &seq
, true, dest
);
4243 gimple_seq_add_seq (new_stmt_list
, seq
);
4245 if (DR_PTR_INFO (dr
)
4246 && TREE_CODE (addr_base
) == SSA_NAME
4247 && !SSA_NAME_PTR_INFO (addr_base
))
4249 vect_duplicate_ssa_name_ptr_info (addr_base
, dr
, stmt_info
);
4250 if (offset
|| byte_offset
)
4251 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base
));
4254 if (dump_enabled_p ())
4256 dump_printf_loc (MSG_NOTE
, vect_location
, "created ");
4257 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, addr_base
);
4258 dump_printf (MSG_NOTE
, "\n");
4265 /* Function vect_create_data_ref_ptr.
4267 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4268 location accessed in the loop by STMT, along with the def-use update
4269 chain to appropriately advance the pointer through the loop iterations.
4270 Also set aliasing information for the pointer. This pointer is used by
4271 the callers to this function to create a memory reference expression for
4272 vector load/store access.
4275 1. STMT: a stmt that references memory. Expected to be of the form
4276 GIMPLE_ASSIGN <name, data-ref> or
4277 GIMPLE_ASSIGN <data-ref, name>.
4278 2. AGGR_TYPE: the type of the reference, which should be either a vector
4280 3. AT_LOOP: the loop where the vector memref is to be created.
4281 4. OFFSET (optional): an offset to be added to the initial address accessed
4282 by the data-ref in STMT.
4283 5. BSI: location where the new stmts are to be placed if there is no loop
4284 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4285 pointing to the initial address.
4286 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4287 to the initial address accessed by the data-ref in STMT. This is
4288 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
   1. Declare a new ptr to vector_type, and have it point to the base of the
      data reference (initial address accessed by the data reference).
4294 For example, for vector of type V8HI, the following code is generated:
4297 ap = (v8hi *)initial_address;
4299 if OFFSET is not supplied:
4300 initial_address = &a[init];
4301 if OFFSET is supplied:
4302 initial_address = &a[init + OFFSET];
4303 if BYTE_OFFSET is supplied:
4304 initial_address = &a[init] + BYTE_OFFSET;
4306 Return the initial_address in INITIAL_ADDRESS.
4308 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4309 update the pointer in each iteration of the loop.
4311 Return the increment stmt that updates the pointer in PTR_INCR.
4313 3. Set INV_P to true if the access pattern of the data reference in the
4314 vectorized loop is invariant. Set it to false otherwise.
4316 4. Return the pointer. */
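
/* As a concrete illustration (example type only): for AGGR_TYPE == V8HI,
   TYPE_SIZE_UNIT (AGGR_TYPE) is 16, so when the pointer must advance the
   def-use cycle created below is

	ap_1 = phi (ap_0, ap_2)
	...
	ap_2 = ap_1 + 16;

   When the scalar step of the access in LOOP is zero, the increment is zero
   as well, i.e. the pointer stays invariant in the vectorized loop.  */
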
4319 vect_create_data_ref_ptr (gimple
*stmt
, tree aggr_type
, struct loop
*at_loop
,
4320 tree offset
, tree
*initial_address
,
4321 gimple_stmt_iterator
*gsi
, gimple
**ptr_incr
,
4322 bool only_init
, bool *inv_p
, tree byte_offset
)
4324 const char *base_name
;
4325 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
4326 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
4327 struct loop
*loop
= NULL
;
4328 bool nested_in_vect_loop
= false;
4329 struct loop
*containing_loop
= NULL
;
4333 gimple_seq new_stmt_list
= NULL
;
4337 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
4339 gimple_stmt_iterator incr_gsi
;
4341 tree indx_before_incr
, indx_after_incr
;
4344 bb_vec_info bb_vinfo
= STMT_VINFO_BB_VINFO (stmt_info
);
4346 gcc_assert (TREE_CODE (aggr_type
) == ARRAY_TYPE
4347 || TREE_CODE (aggr_type
) == VECTOR_TYPE
);
4351 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
4352 nested_in_vect_loop
= nested_in_vect_loop_p (loop
, stmt
);
4353 containing_loop
= (gimple_bb (stmt
))->loop_father
;
4354 pe
= loop_preheader_edge (loop
);
4358 gcc_assert (bb_vinfo
);
4363 /* Check the step (evolution) of the load in LOOP, and record
4364 whether it's invariant. */
4365 if (nested_in_vect_loop
)
4366 step
= STMT_VINFO_DR_STEP (stmt_info
);
4368 step
= DR_STEP (STMT_VINFO_DATA_REF (stmt_info
));
4370 if (integer_zerop (step
))
4375 /* Create an expression for the first address accessed by this load
4377 base_name
= get_name (DR_BASE_ADDRESS (dr
));
4379 if (dump_enabled_p ())
4381 tree dr_base_type
= TREE_TYPE (DR_BASE_OBJECT (dr
));
4382 dump_printf_loc (MSG_NOTE
, vect_location
,
4383 "create %s-pointer variable to type: ",
4384 get_tree_code_name (TREE_CODE (aggr_type
)));
4385 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, aggr_type
);
4386 if (TREE_CODE (dr_base_type
) == ARRAY_TYPE
)
4387 dump_printf (MSG_NOTE
, " vectorizing an array ref: ");
4388 else if (TREE_CODE (dr_base_type
) == VECTOR_TYPE
)
4389 dump_printf (MSG_NOTE
, " vectorizing a vector ref: ");
4390 else if (TREE_CODE (dr_base_type
) == RECORD_TYPE
)
4391 dump_printf (MSG_NOTE
, " vectorizing a record based array ref: ");
4393 dump_printf (MSG_NOTE
, " vectorizing a pointer ref: ");
4394 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_BASE_OBJECT (dr
));
4395 dump_printf (MSG_NOTE
, "\n");
4398 /* (1) Create the new aggregate-pointer variable.
4399 Vector and array types inherit the alias set of their component
4400 type by default so we need to use a ref-all pointer if the data
4401 reference does not conflict with the created aggregated data
4402 reference because it is not addressable. */
4403 bool need_ref_all
= false;
4404 if (!alias_sets_conflict_p (get_alias_set (aggr_type
),
4405 get_alias_set (DR_REF (dr
))))
4406 need_ref_all
= true;
4407 /* Likewise for any of the data references in the stmt group. */
4408 else if (STMT_VINFO_GROUP_SIZE (stmt_info
) > 1)
4410 gimple
*orig_stmt
= STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info
);
4413 stmt_vec_info sinfo
= vinfo_for_stmt (orig_stmt
);
4414 struct data_reference
*sdr
= STMT_VINFO_DATA_REF (sinfo
);
4415 if (!alias_sets_conflict_p (get_alias_set (aggr_type
),
4416 get_alias_set (DR_REF (sdr
))))
4418 need_ref_all
= true;
4421 orig_stmt
= STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo
);
4425 aggr_ptr_type
= build_pointer_type_for_mode (aggr_type
, ptr_mode
,
4427 aggr_ptr
= vect_get_new_vect_var (aggr_ptr_type
, vect_pointer_var
, base_name
);
  /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
     vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
     def-use update cycles for the pointer: one relative to the outer-loop
     (LOOP), which is what steps (3) and (4) below do.  The other is relative
     to the inner-loop (which is the inner-most loop containing the dataref),
     and this is done by step (5) below.

     When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
     inner-most loop, and so steps (3),(4) work the same, and step (5) is
     redundant.  Steps (3),(4) create the following:
4442 LOOP: vp1 = phi(vp0,vp2)
4448 If there is an inner-loop nested in loop, then step (5) will also be
4449 applied, and an additional update in the inner-loop will be created:
4452 LOOP: vp1 = phi(vp0,vp2)
4454 inner: vp3 = phi(vp1,vp4)
4455 vp4 = vp3 + inner_step
4461 /* (2) Calculate the initial address of the aggregate-pointer, and set
4462 the aggregate-pointer to point to it before the loop. */
4464 /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */
4466 new_temp
= vect_create_addr_base_for_vector_ref (stmt
, &new_stmt_list
,
4467 offset
, loop
, byte_offset
);
4472 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, new_stmt_list
);
4473 gcc_assert (!new_bb
);
4476 gsi_insert_seq_before (gsi
, new_stmt_list
, GSI_SAME_STMT
);
4479 *initial_address
= new_temp
;
4480 aggr_ptr_init
= new_temp
;
4482 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4483 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4484 inner-loop nested in LOOP (during outer-loop vectorization). */
4486 /* No update in loop is required. */
4487 if (only_init
&& (!loop_vinfo
|| at_loop
== loop
))
4488 aptr
= aggr_ptr_init
;
4491 /* The step of the aggregate pointer is the type size. */
4492 tree iv_step
= TYPE_SIZE_UNIT (aggr_type
);
4493 /* One exception to the above is when the scalar step of the load in
4494 LOOP is zero. In this case the step here is also zero. */
4496 iv_step
= size_zero_node
;
4497 else if (tree_int_cst_sgn (step
) == -1)
4498 iv_step
= fold_build1 (NEGATE_EXPR
, TREE_TYPE (iv_step
), iv_step
);
4500 standard_iv_increment_position (loop
, &incr_gsi
, &insert_after
);
4502 create_iv (aggr_ptr_init
,
4503 fold_convert (aggr_ptr_type
, iv_step
),
4504 aggr_ptr
, loop
, &incr_gsi
, insert_after
,
4505 &indx_before_incr
, &indx_after_incr
);
4506 incr
= gsi_stmt (incr_gsi
);
4507 set_vinfo_for_stmt (incr
, new_stmt_vec_info (incr
, loop_vinfo
));
4509 /* Copy the points-to information if it exists. */
4510 if (DR_PTR_INFO (dr
))
4512 vect_duplicate_ssa_name_ptr_info (indx_before_incr
, dr
, stmt_info
);
4513 vect_duplicate_ssa_name_ptr_info (indx_after_incr
, dr
, stmt_info
);
4518 aptr
= indx_before_incr
;
4521 if (!nested_in_vect_loop
|| only_init
)
  /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
	 nested in LOOP, if it exists.  */
4528 gcc_assert (nested_in_vect_loop
);
4531 standard_iv_increment_position (containing_loop
, &incr_gsi
,
4533 create_iv (aptr
, fold_convert (aggr_ptr_type
, DR_STEP (dr
)), aggr_ptr
,
4534 containing_loop
, &incr_gsi
, insert_after
, &indx_before_incr
,
4536 incr
= gsi_stmt (incr_gsi
);
4537 set_vinfo_for_stmt (incr
, new_stmt_vec_info (incr
, loop_vinfo
));
4539 /* Copy the points-to information if it exists. */
4540 if (DR_PTR_INFO (dr
))
4542 vect_duplicate_ssa_name_ptr_info (indx_before_incr
, dr
, stmt_info
);
4543 vect_duplicate_ssa_name_ptr_info (indx_after_incr
, dr
, stmt_info
);
4548 return indx_before_incr
;
/* Function bump_vector_ptr

   Increment a pointer (to a vector type) by vector-size.  If requested,
   i.e. if PTR-INCR is given, then also connect the new increment stmt
   to the existing def-use update-chain of the pointer, by modifying
   the PTR_INCR as illustrated below:

   The pointer def-use update-chain before this function:
			DATAREF_PTR = phi (p_0, p_2)
			....
	PTR_INCR:	p_2 = DATAREF_PTR + step

   The pointer def-use update-chain after this function:
			DATAREF_PTR = phi (p_0, p_2)
			....
			NEW_DATAREF_PTR = DATAREF_PTR + BUMP
			....
	PTR_INCR:	p_2 = NEW_DATAREF_PTR + step

   Input:
   DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
		 in the loop.
   PTR_INCR - optional.  The stmt that updates the pointer in each iteration of
	      the loop.  The increment amount across iterations is expected
	      to be vector_size.
   BSI - location where the new update stmt is to be placed.
   STMT - the original scalar memory-access stmt that is being vectorized.
   BUMP - optional.  The offset by which to bump the pointer.  If not given,
	  the offset is assumed to be vector_size.

   Output: Return NEW_DATAREF_PTR as illustrated above.  */
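
/* As a concrete illustration (example type only): for a V8HI vector type
   TYPE_SIZE_UNIT is 16, so with no explicit BUMP the statement created is

	NEW_DATAREF_PTR = DATAREF_PTR + 16

   and, when PTR_INCR is given, its use of DATAREF_PTR is redirected to
   NEW_DATAREF_PTR as illustrated above.  */
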
4590 bump_vector_ptr (tree dataref_ptr
, gimple
*ptr_incr
, gimple_stmt_iterator
*gsi
,
4591 gimple
*stmt
, tree bump
)
4593 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
4594 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
4595 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
4596 tree update
= TYPE_SIZE_UNIT (vectype
);
4599 use_operand_p use_p
;
4600 tree new_dataref_ptr
;
4605 if (TREE_CODE (dataref_ptr
) == SSA_NAME
)
4606 new_dataref_ptr
= copy_ssa_name (dataref_ptr
);
4608 new_dataref_ptr
= make_ssa_name (TREE_TYPE (dataref_ptr
));
4609 incr_stmt
= gimple_build_assign (new_dataref_ptr
, POINTER_PLUS_EXPR
,
4610 dataref_ptr
, update
);
4611 vect_finish_stmt_generation (stmt
, incr_stmt
, gsi
);
4613 /* Copy the points-to information if it exists. */
4614 if (DR_PTR_INFO (dr
))
4616 duplicate_ssa_name_ptr_info (new_dataref_ptr
, DR_PTR_INFO (dr
));
4617 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr
));
4621 return new_dataref_ptr
;
4623 /* Update the vector-pointer's cross-iteration increment. */
4624 FOR_EACH_SSA_USE_OPERAND (use_p
, ptr_incr
, iter
, SSA_OP_USE
)
4626 tree use
= USE_FROM_PTR (use_p
);
4628 if (use
== dataref_ptr
)
4629 SET_USE (use_p
, new_dataref_ptr
);
4631 gcc_assert (tree_int_cst_compare (use
, update
) == 0);
4634 return new_dataref_ptr
;
/* Function vect_create_destination_var.

   Create a new temporary of type VECTYPE.  */

tree
vect_create_destination_var (tree scalar_dest, tree vectype)
{
  tree vec_dest;
  const char *name;
  char *new_name;
  tree type;
  enum vect_var_kind kind;

  kind = vectype
    ? VECTOR_BOOLEAN_TYPE_P (vectype)
    ? vect_mask_var
    : vect_simple_var
    : vect_scalar_var;
  type = vectype ? vectype : TREE_TYPE (scalar_dest);

  gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);

  name = get_name (scalar_dest);
  if (name)
    new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
  else
    new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
  vec_dest = vect_get_new_vect_var (type, kind, new_name);
  free (new_name);

  return vec_dest;
}
/* Function vect_grouped_store_supported.

   Returns TRUE if interleave high and interleave low permutations
   are supported, and FALSE otherwise.  */
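
/* As a concrete illustration (example size only): for a vector of 8 elements
   the two permutations checked below for the power-of-two case are

	interleave_high: { 0, 8, 1, 9, 2, 10, 3, 11 }
	interleave_low:  { 4, 12, 5, 13, 6, 14, 7, 15 }

   where indices 0..7 select from the first input vector and 8..15 from the
   second.  */
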
bool
vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
{
  machine_mode mode = TYPE_MODE (vectype);

  /* vect_permute_store_chain requires the group size to be equal to 3 or
     be a power of two.  */
  if (count != 3 && exact_log2 (count) == -1)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "the size of the group of accesses"
			 " is not a power of 2 or not equal to 3\n");
      return false;
    }
4692 /* Check that the permutation is supported. */
4693 if (VECTOR_MODE_P (mode
))
4695 unsigned int i
, nelt
= GET_MODE_NUNITS (mode
);
4696 unsigned char *sel
= XALLOCAVEC (unsigned char, nelt
);
4700 unsigned int j0
= 0, j1
= 0, j2
= 0;
4703 for (j
= 0; j
< 3; j
++)
4705 int nelt0
= ((3 - j
) * nelt
) % 3;
4706 int nelt1
= ((3 - j
) * nelt
+ 1) % 3;
4707 int nelt2
= ((3 - j
) * nelt
+ 2) % 3;
4708 for (i
= 0; i
< nelt
; i
++)
4710 if (3 * i
+ nelt0
< nelt
)
4711 sel
[3 * i
+ nelt0
] = j0
++;
4712 if (3 * i
+ nelt1
< nelt
)
4713 sel
[3 * i
+ nelt1
] = nelt
+ j1
++;
4714 if (3 * i
+ nelt2
< nelt
)
4715 sel
[3 * i
+ nelt2
] = 0;
4717 if (!can_vec_perm_p (mode
, false, sel
))
4719 if (dump_enabled_p ())
		dump_printf (MSG_MISSED_OPTIMIZATION,
			     "permutation op not supported by target.\n");
4725 for (i
= 0; i
< nelt
; i
++)
4727 if (3 * i
+ nelt0
< nelt
)
4728 sel
[3 * i
+ nelt0
] = 3 * i
+ nelt0
;
4729 if (3 * i
+ nelt1
< nelt
)
4730 sel
[3 * i
+ nelt1
] = 3 * i
+ nelt1
;
4731 if (3 * i
+ nelt2
< nelt
)
4732 sel
[3 * i
+ nelt2
] = nelt
+ j2
++;
4734 if (!can_vec_perm_p (mode
, false, sel
))
4736 if (dump_enabled_p ())
		dump_printf (MSG_MISSED_OPTIMIZATION,
			     "permutation op not supported by target.\n");
4746 /* If length is not equal to 3 then only power of 2 is supported. */
4747 gcc_assert (pow2p_hwi (count
));
4749 for (i
= 0; i
< nelt
/ 2; i
++)
4752 sel
[i
* 2 + 1] = i
+ nelt
;
4754 if (can_vec_perm_p (mode
, false, sel
))
4756 for (i
= 0; i
< nelt
; i
++)
4758 if (can_vec_perm_p (mode
, false, sel
))
4764 if (dump_enabled_p ())
	dump_printf (MSG_MISSED_OPTIMIZATION,
		     "permutation op not supported by target.\n");
/* Return TRUE if vec_store_lanes is available for COUNT vectors of
   type VECTYPE.  */

bool
vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
{
  return vect_lanes_optab_supported_p ("vec_store_lanes",
				       vec_store_lanes_optab,
				       vectype, count);
}

/* Function vect_permute_store_chain.

   Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
   a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
   the data correctly for the stores.  Return the final references for stores
   in RESULT_CHAIN.

   E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
   The input is 4 vectors each containing 8 elements.  We assign a number to
   each element, the input sequence is:

   1st vec:   0  1  2  3  4  5  6  7
   2nd vec:   8  9 10 11 12 13 14 15
   3rd vec:  16 17 18 19 20 21 22 23
   4th vec:  24 25 26 27 28 29 30 31

   The output sequence should be:

   1st vec:   0  8 16 24  1  9 17 25
   2nd vec:   2 10 18 26  3 11 19 27
   3rd vec:   4 12 20 28  5 13 21 29
   4th vec:   6 14 22 30  7 15 23 31

   i.e., we interleave the contents of the four vectors in their order.

   We use interleave_high/low instructions to create such output.  The input of
   each interleave_high/low operation is two vectors:
   1st vec    2nd vec
   0 1 2 3    4 5 6 7
   the even elements of the result vector are obtained left-to-right from the
   high/low elements of the first vector.  The odd elements of the result are
   obtained left-to-right from the high/low elements of the second vector.
   The output of interleave_high will be:   0 4 1 5
   and of interleave_low:                   2 6 3 7

   The permutation is done in log LENGTH stages.  In each stage interleave_high
   and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
   where the first argument is taken from the first half of DR_CHAIN and the
   second argument from its second half.
   In our example,

   I1: interleave_high (1st vec, 3rd vec)
   I2: interleave_low (1st vec, 3rd vec)
   I3: interleave_high (2nd vec, 4th vec)
   I4: interleave_low (2nd vec, 4th vec)

   The output for the first stage is:

   I1:  0 16  1 17  2 18  3 19
   I2:  4 20  5 21  6 22  7 23
   I3:  8 24  9 25 10 26 11 27
   I4: 12 28 13 29 14 30 15 31

   The output of the second stage, i.e. the final result is:

   I1:  0  8 16 24  1  9 17 25
   I2:  2 10 18 26  3 11 19 27
   I3:  4 12 20 28  5 13 21 29
   I4:  6 14 22 30  7 15 23 31.  */
4845 vect_permute_store_chain (vec
<tree
> dr_chain
,
4846 unsigned int length
,
4848 gimple_stmt_iterator
*gsi
,
4849 vec
<tree
> *result_chain
)
4851 tree vect1
, vect2
, high
, low
;
4853 tree vectype
= STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt
));
4854 tree perm_mask_low
, perm_mask_high
;
4856 tree perm3_mask_low
, perm3_mask_high
;
4857 unsigned int i
, n
, log_length
= exact_log2 (length
);
4858 unsigned int j
, nelt
= TYPE_VECTOR_SUBPARTS (vectype
);
4859 unsigned char *sel
= XALLOCAVEC (unsigned char, nelt
);
4861 result_chain
->quick_grow (length
);
4862 memcpy (result_chain
->address (), dr_chain
.address (),
4863 length
* sizeof (tree
));
4867 unsigned int j0
= 0, j1
= 0, j2
= 0;
4869 for (j
= 0; j
< 3; j
++)
4871 int nelt0
= ((3 - j
) * nelt
) % 3;
4872 int nelt1
= ((3 - j
) * nelt
+ 1) % 3;
4873 int nelt2
= ((3 - j
) * nelt
+ 2) % 3;
4875 for (i
= 0; i
< nelt
; i
++)
4877 if (3 * i
+ nelt0
< nelt
)
4878 sel
[3 * i
+ nelt0
] = j0
++;
4879 if (3 * i
+ nelt1
< nelt
)
4880 sel
[3 * i
+ nelt1
] = nelt
+ j1
++;
4881 if (3 * i
+ nelt2
< nelt
)
4882 sel
[3 * i
+ nelt2
] = 0;
4884 perm3_mask_low
= vect_gen_perm_mask_checked (vectype
, sel
);
4886 for (i
= 0; i
< nelt
; i
++)
4888 if (3 * i
+ nelt0
< nelt
)
4889 sel
[3 * i
+ nelt0
] = 3 * i
+ nelt0
;
4890 if (3 * i
+ nelt1
< nelt
)
4891 sel
[3 * i
+ nelt1
] = 3 * i
+ nelt1
;
4892 if (3 * i
+ nelt2
< nelt
)
4893 sel
[3 * i
+ nelt2
] = nelt
+ j2
++;
4895 perm3_mask_high
= vect_gen_perm_mask_checked (vectype
, sel
);
4897 vect1
= dr_chain
[0];
4898 vect2
= dr_chain
[1];
4900 /* Create interleaving stmt:
4901 low = VEC_PERM_EXPR <vect1, vect2,
4902 {j, nelt, *, j + 1, nelt + j + 1, *,
4903 j + 2, nelt + j + 2, *, ...}> */
4904 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3_low");
4905 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, vect1
,
4906 vect2
, perm3_mask_low
);
4907 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
4910 vect2
= dr_chain
[2];
4911 /* Create interleaving stmt:
4912 low = VEC_PERM_EXPR <vect1, vect2,
4913 {0, 1, nelt + j, 3, 4, nelt + j + 1,
4914 6, 7, nelt + j + 2, ...}> */
4915 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3_high");
4916 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, vect1
,
4917 vect2
, perm3_mask_high
);
4918 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
4919 (*result_chain
)[j
] = data_ref
;
4924 /* If length is not equal to 3 then only power of 2 is supported. */
4925 gcc_assert (pow2p_hwi (length
));
4927 for (i
= 0, n
= nelt
/ 2; i
< n
; i
++)
4930 sel
[i
* 2 + 1] = i
+ nelt
;
4932 perm_mask_high
= vect_gen_perm_mask_checked (vectype
, sel
);
4934 for (i
= 0; i
< nelt
; i
++)
4936 perm_mask_low
= vect_gen_perm_mask_checked (vectype
, sel
);
4938 for (i
= 0, n
= log_length
; i
< n
; i
++)
4940 for (j
= 0; j
< length
/2; j
++)
4942 vect1
= dr_chain
[j
];
4943 vect2
= dr_chain
[j
+length
/2];
4945 /* Create interleaving stmt:
4946 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
4948 high
= make_temp_ssa_name (vectype
, NULL
, "vect_inter_high");
4949 perm_stmt
= gimple_build_assign (high
, VEC_PERM_EXPR
, vect1
,
4950 vect2
, perm_mask_high
);
4951 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
4952 (*result_chain
)[2*j
] = high
;
4954 /* Create interleaving stmt:
4955 low = VEC_PERM_EXPR <vect1, vect2,
4956 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
4958 low
= make_temp_ssa_name (vectype
, NULL
, "vect_inter_low");
4959 perm_stmt
= gimple_build_assign (low
, VEC_PERM_EXPR
, vect1
,
4960 vect2
, perm_mask_low
);
4961 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
4962 (*result_chain
)[2*j
+1] = low
;
4964 memcpy (dr_chain
.address (), result_chain
->address (),
4965 length
* sizeof (tree
));
4970 /* Function vect_setup_realignment
4972 This function is called when vectorizing an unaligned load using
4973 the dr_explicit_realign[_optimized] scheme.
4974 This function generates the following code at the loop prolog:
4977 x msq_init = *(floor(p)); # prolog load
4978 realignment_token = call target_builtin;
4980 x msq = phi (msq_init, ---)
4982 The stmts marked with x are generated only for the case of
4983 dr_explicit_realign_optimized.
4985 The code above sets up a new (vector) pointer, pointing to the first
4986 location accessed by STMT, and a "floor-aligned" load using that pointer.
4987 It also generates code to compute the "realignment-token" (if the relevant
4988 target hook was defined), and creates a phi-node at the loop-header bb
4989 whose arguments are the result of the prolog-load (created by this
4990 function) and the result of a load that takes place in the loop (to be
4991 created by the caller to this function).
4993 For the case of dr_explicit_realign_optimized:
4994 The caller to this function uses the phi-result (msq) to create the
4995 realignment code inside the loop, and sets up the missing phi argument,
4998 msq = phi (msq_init, lsq)
4999 lsq = *(floor(p')); # load in loop
5000 result = realign_load (msq, lsq, realignment_token);
5002 For the case of dr_explicit_realign:
5004 msq = *(floor(p)); # load in loop
5006 lsq = *(floor(p')); # load in loop
5007 result = realign_load (msq, lsq, realignment_token);
5010 STMT - (scalar) load stmt to be vectorized. This load accesses
5011 a memory location that may be unaligned.
5012 BSI - place where new code is to be inserted.
5013 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5017 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5018 target hook, if defined.
5019 Return value - the result of the loop-header phi node. */
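
/* As a concrete illustration (example vector size only): for a 16-byte
   vector type the "floor (p)" used above is computed by masking off the low
   address bits, i.e. p & -16 (p & ~(VS - 1)); this is the BIT_AND_EXPR with
   -TYPE_ALIGN_UNIT (vectype) emitted on the loop-preheader edge for the
   dr_explicit_realign_optimized case below.  */
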
5022 vect_setup_realignment (gimple
*stmt
, gimple_stmt_iterator
*gsi
,
5023 tree
*realignment_token
,
5024 enum dr_alignment_support alignment_support_scheme
,
5026 struct loop
**at_loop
)
5028 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
5029 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
5030 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
5031 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
5032 struct loop
*loop
= NULL
;
5034 tree scalar_dest
= gimple_assign_lhs (stmt
);
5040 tree msq_init
= NULL_TREE
;
5043 tree msq
= NULL_TREE
;
5044 gimple_seq stmts
= NULL
;
5046 bool compute_in_loop
= false;
5047 bool nested_in_vect_loop
= false;
5048 struct loop
*containing_loop
= (gimple_bb (stmt
))->loop_father
;
5049 struct loop
*loop_for_initial_load
= NULL
;
5053 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
5054 nested_in_vect_loop
= nested_in_vect_loop_p (loop
, stmt
);
5057 gcc_assert (alignment_support_scheme
== dr_explicit_realign
5058 || alignment_support_scheme
== dr_explicit_realign_optimized
);
5060 /* We need to generate three things:
5061 1. the misalignment computation
5062 2. the extra vector load (for the optimized realignment scheme).
5063 3. the phi node for the two vectors from which the realignment is
5064 done (for the optimized realignment scheme). */
5066 /* 1. Determine where to generate the misalignment computation.
5068 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5069 calculation will be generated by this function, outside the loop (in the
5070 preheader). Otherwise, INIT_ADDR had already been computed for us by the
5071 caller, inside the loop.
5073 Background: If the misalignment remains fixed throughout the iterations of
5074 the loop, then both realignment schemes are applicable, and also the
5075 misalignment computation can be done outside LOOP. This is because we are
5076 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5077 are a multiple of VS (the Vector Size), and therefore the misalignment in
5078 different vectorized LOOP iterations is always the same.
5079 The problem arises only if the memory access is in an inner-loop nested
5080 inside LOOP, which is now being vectorized using outer-loop vectorization.
5081 This is the only case when the misalignment of the memory access may not
5082 remain fixed throughout the iterations of the inner-loop (as explained in
5083 detail in vect_supportable_dr_alignment). In this case, not only is the
5084 optimized realignment scheme not applicable, but also the misalignment
5085 computation (and generation of the realignment token that is passed to
5086 REALIGN_LOAD) have to be done inside the loop.
5088 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5089 or not, which in turn determines if the misalignment is computed inside
5090 the inner-loop, or outside LOOP. */
5092 if (init_addr
!= NULL_TREE
|| !loop_vinfo
)
5094 compute_in_loop
= true;
5095 gcc_assert (alignment_support_scheme
== dr_explicit_realign
);
5099 /* 2. Determine where to generate the extra vector load.
5101 For the optimized realignment scheme, instead of generating two vector
5102 loads in each iteration, we generate a single extra vector load in the
5103 preheader of the loop, and in each iteration reuse the result of the
5104 vector load from the previous iteration. In case the memory access is in
5105 an inner-loop nested inside LOOP, which is now being vectorized using
5106 outer-loop vectorization, we need to determine whether this initial vector
5107 load should be generated at the preheader of the inner-loop, or can be
5108 generated at the preheader of LOOP. If the memory access has no evolution
5109 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5110 to be generated inside LOOP (in the preheader of the inner-loop). */
5112 if (nested_in_vect_loop
)
5114 tree outerloop_step
= STMT_VINFO_DR_STEP (stmt_info
);
5115 bool invariant_in_outerloop
=
5116 (tree_int_cst_compare (outerloop_step
, size_zero_node
) == 0);
5117 loop_for_initial_load
= (invariant_in_outerloop
? loop
: loop
->inner
);
5120 loop_for_initial_load
= loop
;
5122 *at_loop
= loop_for_initial_load
;
5124 if (loop_for_initial_load
)
5125 pe
= loop_preheader_edge (loop_for_initial_load
);
5127 /* 3. For the case of the optimized realignment, create the first vector
5128 load at the loop preheader. */
5130 if (alignment_support_scheme
== dr_explicit_realign_optimized
)
5132 /* Create msq_init = *(floor(p1)) in the loop preheader */
5135 gcc_assert (!compute_in_loop
);
5136 vec_dest
= vect_create_destination_var (scalar_dest
, vectype
);
5137 ptr
= vect_create_data_ref_ptr (stmt
, vectype
, loop_for_initial_load
,
5138 NULL_TREE
, &init_addr
, NULL
, &inc
,
5140 if (TREE_CODE (ptr
) == SSA_NAME
)
5141 new_temp
= copy_ssa_name (ptr
);
5143 new_temp
= make_ssa_name (TREE_TYPE (ptr
));
5144 new_stmt
= gimple_build_assign
5145 (new_temp
, BIT_AND_EXPR
, ptr
,
5146 build_int_cst (TREE_TYPE (ptr
),
5147 -(HOST_WIDE_INT
)TYPE_ALIGN_UNIT (vectype
)));
5148 new_bb
= gsi_insert_on_edge_immediate (pe
, new_stmt
);
5149 gcc_assert (!new_bb
);
5151 = build2 (MEM_REF
, TREE_TYPE (vec_dest
), new_temp
,
5152 build_int_cst (reference_alias_ptr_type (DR_REF (dr
)), 0));
5153 new_stmt
= gimple_build_assign (vec_dest
, data_ref
);
5154 new_temp
= make_ssa_name (vec_dest
, new_stmt
);
5155 gimple_assign_set_lhs (new_stmt
, new_temp
);
5158 new_bb
= gsi_insert_on_edge_immediate (pe
, new_stmt
);
5159 gcc_assert (!new_bb
);
5162 gsi_insert_before (gsi
, new_stmt
, GSI_SAME_STMT
);
5164 msq_init
= gimple_assign_lhs (new_stmt
);
5167 /* 4. Create realignment token using a target builtin, if available.
5168 It is done either inside the containing loop, or before LOOP (as
5169 determined above). */
5171 if (targetm
.vectorize
.builtin_mask_for_load
)
5176 /* Compute INIT_ADDR - the initial addressed accessed by this memref. */
5179 /* Generate the INIT_ADDR computation outside LOOP. */
5180 init_addr
= vect_create_addr_base_for_vector_ref (stmt
, &stmts
,
5184 pe
= loop_preheader_edge (loop
);
5185 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, stmts
);
5186 gcc_assert (!new_bb
);
5189 gsi_insert_seq_before (gsi
, stmts
, GSI_SAME_STMT
);
5192 builtin_decl
= targetm
.vectorize
.builtin_mask_for_load ();
5193 new_stmt
= gimple_build_call (builtin_decl
, 1, init_addr
);
5195 vect_create_destination_var (scalar_dest
,
5196 gimple_call_return_type (new_stmt
));
5197 new_temp
= make_ssa_name (vec_dest
, new_stmt
);
5198 gimple_call_set_lhs (new_stmt
, new_temp
);
5200 if (compute_in_loop
)
5201 gsi_insert_before (gsi
, new_stmt
, GSI_SAME_STMT
);
5204 /* Generate the misalignment computation outside LOOP. */
5205 pe
= loop_preheader_edge (loop
);
5206 new_bb
= gsi_insert_on_edge_immediate (pe
, new_stmt
);
5207 gcc_assert (!new_bb
);
5210 *realignment_token
= gimple_call_lhs (new_stmt
);
5212 /* The result of the CALL_EXPR to this builtin is determined from
5213 the value of the parameter and no global variables are touched
5214 which makes the builtin a "const" function. Requiring the
5215 builtin to have the "const" attribute makes it unnecessary
5216 to call mark_call_clobbered. */
5217 gcc_assert (TREE_READONLY (builtin_decl
));
5220 if (alignment_support_scheme
== dr_explicit_realign
)
5223 gcc_assert (!compute_in_loop
);
5224 gcc_assert (alignment_support_scheme
== dr_explicit_realign_optimized
);
5227 /* 5. Create msq = phi <msq_init, lsq> in loop */
5229 pe
= loop_preheader_edge (containing_loop
);
5230 vec_dest
= vect_create_destination_var (scalar_dest
, vectype
);
5231 msq
= make_ssa_name (vec_dest
);
5232 phi_stmt
= create_phi_node (msq
, containing_loop
->header
);
5233 add_phi_arg (phi_stmt
, msq_init
, pe
, UNKNOWN_LOCATION
);
/* Function vect_grouped_load_supported.

   COUNT is the size of the load group (the number of statements plus the
   number of gaps).  SINGLE_ELEMENT_P is true if there is actually
   only one statement, with a gap of COUNT - 1.

   Returns true if a suitable permute exists.  */
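
/* As a concrete illustration (example values only): a group consisting of a
   single statement with a gap of 15 (COUNT == 16), read through a vector
   type with 8 elements, is rejected below, since with COUNT larger than the
   number of vector elements every second vector load would go completely
   unused.  */
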
5248 vect_grouped_load_supported (tree vectype
, bool single_element_p
,
5249 unsigned HOST_WIDE_INT count
)
5251 machine_mode mode
= TYPE_MODE (vectype
);
5253 /* If this is single-element interleaving with an element distance
5254 that leaves unused vector loads around punt - we at least create
5255 very sub-optimal code in that case (and blow up memory,
5257 if (single_element_p
&& count
> TYPE_VECTOR_SUBPARTS (vectype
))
5259 if (dump_enabled_p ())
5260 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5261 "single-element interleaving not supported "
5262 "for not adjacent vector loads\n");
5266 /* vect_permute_load_chain requires the group size to be equal to 3 or
5267 be a power of two. */
5268 if (count
!= 3 && exact_log2 (count
) == -1)
5270 if (dump_enabled_p ())
5271 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5272 "the size of the group of accesses"
5273 " is not a power of 2 or not equal to 3\n");
5277 /* Check that the permutation is supported. */
5278 if (VECTOR_MODE_P (mode
))
5280 unsigned int i
, j
, nelt
= GET_MODE_NUNITS (mode
);
5281 unsigned char *sel
= XALLOCAVEC (unsigned char, nelt
);
5286 for (k
= 0; k
< 3; k
++)
5288 for (i
= 0; i
< nelt
; i
++)
5289 if (3 * i
+ k
< 2 * nelt
)
5293 if (!can_vec_perm_p (mode
, false, sel
))
5295 if (dump_enabled_p ())
5296 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5297 "shuffle of 3 loads is not supported by"
5301 for (i
= 0, j
= 0; i
< nelt
; i
++)
5302 if (3 * i
+ k
< 2 * nelt
)
5305 sel
[i
] = nelt
+ ((nelt
+ k
) % 3) + 3 * (j
++);
5306 if (!can_vec_perm_p (mode
, false, sel
))
5308 if (dump_enabled_p ())
5309 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5310 "shuffle of 3 loads is not supported by"
5319 /* If length is not equal to 3 then only power of 2 is supported. */
5320 gcc_assert (pow2p_hwi (count
));
5321 for (i
= 0; i
< nelt
; i
++)
5323 if (can_vec_perm_p (mode
, false, sel
))
5325 for (i
= 0; i
< nelt
; i
++)
5327 if (can_vec_perm_p (mode
, false, sel
))
5333 if (dump_enabled_p ())
5334 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5335 "extract even/odd not supported by target\n");
/* Return TRUE if vec_load_lanes is available for COUNT vectors of
   type VECTYPE.  */

bool
vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
{
  return vect_lanes_optab_supported_p ("vec_load_lanes",
				       vec_load_lanes_optab,
				       vectype, count);
}

5350 /* Function vect_permute_load_chain.
5352 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5353 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5354 the input data correctly. Return the final references for loads in
5357 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5358 The input is 4 vectors each containing 8 elements. We assign a number to each
5359 element, the input sequence is:
5361 1st vec: 0 1 2 3 4 5 6 7
5362 2nd vec: 8 9 10 11 12 13 14 15
5363 3rd vec: 16 17 18 19 20 21 22 23
5364 4th vec: 24 25 26 27 28 29 30 31
5366 The output sequence should be:
5368 1st vec: 0 4 8 12 16 20 24 28
5369 2nd vec: 1 5 9 13 17 21 25 29
5370 3rd vec: 2 6 10 14 18 22 26 30
5371 4th vec: 3 7 11 15 19 23 27 31
5373 i.e., the first output vector should contain the first elements of each
5374 interleaving group, etc.
5376 We use extract_even/odd instructions to create such output. The input of
5377 each extract_even/odd operation is two vectors
5381 and the output is the vector of extracted even/odd elements. The output of
5382 extract_even will be: 0 2 4 6
5383 and of extract_odd: 1 3 5 7
5386 The permutation is done in log LENGTH stages. In each stage extract_even
5387 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5388 their order. In our example,
5390 E1: extract_even (1st vec, 2nd vec)
5391 E2: extract_odd (1st vec, 2nd vec)
5392 E3: extract_even (3rd vec, 4th vec)
5393 E4: extract_odd (3rd vec, 4th vec)
5395 The output for the first stage will be:
5397 E1: 0 2 4 6 8 10 12 14
5398 E2: 1 3 5 7 9 11 13 15
5399 E3: 16 18 20 22 24 26 28 30
5400 E4: 17 19 21 23 25 27 29 31
5402 In order to proceed and create the correct sequence for the next stage (or
5403 for the correct output, if the second stage is the last one, as in our
5404 example), we first put the output of extract_even operation and then the
5405 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5406 The input for the second stage is:
5408 1st vec (E1): 0 2 4 6 8 10 12 14
5409 2nd vec (E3): 16 18 20 22 24 26 28 30
5410 3rd vec (E2): 1 3 5 7 9 11 13 15
5411 4th vec (E4): 17 19 21 23 25 27 29 31
5413 The output of the second stage:
5415 E1: 0 4 8 12 16 20 24 28
5416 E2: 2 6 10 14 18 22 26 30
5417 E3: 1 5 9 13 17 21 25 29
5418 E4: 3 7 11 15 19 23 27 31
5420 And RESULT_CHAIN after reordering:
5422 1st vec (E1): 0 4 8 12 16 20 24 28
5423 2nd vec (E3): 1 5 9 13 17 21 25 29
5424 3rd vec (E2): 2 6 10 14 18 22 26 30
5425 4th vec (E4): 3 7 11 15 19 23 27 31. */
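
/* As a concrete illustration (example only): for two input vectors a and b
   of 8 elements each, one stage below generates

	vect_perm_even = VEC_PERM_EXPR <a, b, { 0, 2, 4, 6, 8, 10, 12, 14 }>;
	vect_perm_odd  = VEC_PERM_EXPR <a, b, { 1, 3, 5, 7, 9, 11, 13, 15 }>;

   storing the even result in the first half of RESULT_CHAIN and the odd
   result in the second half, which produces the reordering described
   above.  */
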
5428 vect_permute_load_chain (vec
<tree
> dr_chain
,
5429 unsigned int length
,
5431 gimple_stmt_iterator
*gsi
,
5432 vec
<tree
> *result_chain
)
5434 tree data_ref
, first_vect
, second_vect
;
5435 tree perm_mask_even
, perm_mask_odd
;
5436 tree perm3_mask_low
, perm3_mask_high
;
5438 tree vectype
= STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt
));
5439 unsigned int i
, j
, log_length
= exact_log2 (length
);
5440 unsigned nelt
= TYPE_VECTOR_SUBPARTS (vectype
);
5441 unsigned char *sel
= XALLOCAVEC (unsigned char, nelt
);
5443 result_chain
->quick_grow (length
);
5444 memcpy (result_chain
->address (), dr_chain
.address (),
5445 length
* sizeof (tree
));
5451 for (k
= 0; k
< 3; k
++)
5453 for (i
= 0; i
< nelt
; i
++)
5454 if (3 * i
+ k
< 2 * nelt
)
5458 perm3_mask_low
= vect_gen_perm_mask_checked (vectype
, sel
);
5460 for (i
= 0, j
= 0; i
< nelt
; i
++)
5461 if (3 * i
+ k
< 2 * nelt
)
5464 sel
[i
] = nelt
+ ((nelt
+ k
) % 3) + 3 * (j
++);
5466 perm3_mask_high
= vect_gen_perm_mask_checked (vectype
, sel
);
5468 first_vect
= dr_chain
[0];
5469 second_vect
= dr_chain
[1];
5471 /* Create interleaving stmt (low part of):
5472 low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5474 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3_low");
5475 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, first_vect
,
5476 second_vect
, perm3_mask_low
);
5477 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5479 /* Create interleaving stmt (high part of):
5480 high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5482 first_vect
= data_ref
;
5483 second_vect
= dr_chain
[2];
5484 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3_high");
5485 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, first_vect
,
5486 second_vect
, perm3_mask_high
);
5487 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5488 (*result_chain
)[k
] = data_ref
;
5493 /* If length is not equal to 3 then only power of 2 is supported. */
5494 gcc_assert (pow2p_hwi (length
));
5496 for (i
= 0; i
< nelt
; ++i
)
5498 perm_mask_even
= vect_gen_perm_mask_checked (vectype
, sel
);
5500 for (i
= 0; i
< nelt
; ++i
)
5502 perm_mask_odd
= vect_gen_perm_mask_checked (vectype
, sel
);
5504 for (i
= 0; i
< log_length
; i
++)
5506 for (j
= 0; j
< length
; j
+= 2)
5508 first_vect
= dr_chain
[j
];
5509 second_vect
= dr_chain
[j
+1];
5511 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5512 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_perm_even");
5513 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5514 first_vect
, second_vect
,
5516 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5517 (*result_chain
)[j
/2] = data_ref
;
5519 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5520 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_perm_odd");
5521 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5522 first_vect
, second_vect
,
5524 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5525 (*result_chain
)[j
/2+length
/2] = data_ref
;
5527 memcpy (dr_chain
.address (), result_chain
->address (),
5528 length
* sizeof (tree
));
5533 /* Function vect_shift_permute_load_chain.
5535 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5536 sequence of stmts to reorder the input data accordingly.
5537 Return the final references for loads in RESULT_CHAIN.
   Return true if it succeeded, false otherwise.
5540 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5541 The input is 3 vectors each containing 8 elements. We assign a
5542 number to each element, the input sequence is:
5544 1st vec: 0 1 2 3 4 5 6 7
5545 2nd vec: 8 9 10 11 12 13 14 15
5546 3rd vec: 16 17 18 19 20 21 22 23
5548 The output sequence should be:
5550 1st vec: 0 3 6 9 12 15 18 21
5551 2nd vec: 1 4 7 10 13 16 19 22
5552 3rd vec: 2 5 8 11 14 17 20 23
5554 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5556 First we shuffle all 3 vectors to get correct elements order:
5558 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
5559 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
5560 3rd vec: (16 19 22) (17 20 23) (18 21)
5562 Next we unite and shift vector 3 times:
5565 shift right by 6 the concatenation of:
5566 "1st vec" and "2nd vec"
5567 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5568 "2nd vec" and "3rd vec"
5569 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5570 "3rd vec" and "1st vec"
5571 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
5574 So that now new vectors are:
5576 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
5577 2nd vec: (10 13) (16 19 22) (17 20 23)
5578 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
5581 shift right by 5 the concatenation of:
5582 "1st vec" and "3rd vec"
5583 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
5584 "2nd vec" and "1st vec"
5585 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
5586 "3rd vec" and "2nd vec"
5587 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
5590 So that now new vectors are:
5592 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
5593 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
5594 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
5597 shift right by 5 the concatenation of:
5598 "1st vec" and "1st vec"
5599 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
5600 shift right by 3 the concatenation of:
5601 "2nd vec" and "2nd vec"
5602 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
5605 So that now all vectors are READY:
5606 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
5607 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
5608 3rd vec: ( 1 4 7) (10 13) (16 19 22)
   This algorithm is faster than one in vect_permute_load_chain if:
     1.  "shift of a concatenation" is faster than general permutation.
5613 2. The TARGET machine can't execute vector instructions in parallel.
5614 This is because each step of the algorithm depends on previous.
5615 The algorithm in vect_permute_load_chain is much more parallel.
5617 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
5621 vect_shift_permute_load_chain (vec
<tree
> dr_chain
,
5622 unsigned int length
,
5624 gimple_stmt_iterator
*gsi
,
5625 vec
<tree
> *result_chain
)
5627 tree vect
[3], vect_shift
[3], data_ref
, first_vect
, second_vect
;
5628 tree perm2_mask1
, perm2_mask2
, perm3_mask
;
5629 tree select_mask
, shift1_mask
, shift2_mask
, shift3_mask
, shift4_mask
;
5632 tree vectype
= STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt
));
5634 unsigned nelt
= TYPE_VECTOR_SUBPARTS (vectype
);
5635 unsigned char *sel
= XALLOCAVEC (unsigned char, nelt
);
5636 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
5637 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
5639 result_chain
->quick_grow (length
);
5640 memcpy (result_chain
->address (), dr_chain
.address (),
5641 length
* sizeof (tree
));
5643 if (pow2p_hwi (length
) && LOOP_VINFO_VECT_FACTOR (loop_vinfo
) > 4)
5645 unsigned int j
, log_length
= exact_log2 (length
);
5646 for (i
= 0; i
< nelt
/ 2; ++i
)
5648 for (i
= 0; i
< nelt
/ 2; ++i
)
5649 sel
[nelt
/ 2 + i
] = i
* 2 + 1;
5650 if (!can_vec_perm_p (TYPE_MODE (vectype
), false, sel
))
5652 if (dump_enabled_p ())
5653 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5654 "shuffle of 2 fields structure is not \
5655 supported by target\n");
5658 perm2_mask1
= vect_gen_perm_mask_checked (vectype
, sel
);
5660 for (i
= 0; i
< nelt
/ 2; ++i
)
5662 for (i
= 0; i
< nelt
/ 2; ++i
)
5663 sel
[nelt
/ 2 + i
] = i
* 2;
5664 if (!can_vec_perm_p (TYPE_MODE (vectype
), false, sel
))
5666 if (dump_enabled_p ())
5667 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5668 "shuffle of 2 fields structure is not \
5669 supported by target\n");
5672 perm2_mask2
= vect_gen_perm_mask_checked (vectype
, sel
);
5674 /* Generating permutation constant to shift all elements.
5675 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
5676 for (i
= 0; i
< nelt
; i
++)
5677 sel
[i
] = nelt
/ 2 + i
;
5678 if (!can_vec_perm_p (TYPE_MODE (vectype
), false, sel
))
5680 if (dump_enabled_p ())
5681 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5682 "shift permutation is not supported by target\n");
5685 shift1_mask
= vect_gen_perm_mask_checked (vectype
, sel
);
5687 /* Generating permutation constant to select vector from 2.
5688 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
5689 for (i
= 0; i
< nelt
/ 2; i
++)
5691 for (i
= nelt
/ 2; i
< nelt
; i
++)
5693 if (!can_vec_perm_p (TYPE_MODE (vectype
), false, sel
))
5695 if (dump_enabled_p ())
5696 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5697 "select is not supported by target\n");
5700 select_mask
= vect_gen_perm_mask_checked (vectype
, sel
);
5702 for (i
= 0; i
< log_length
; i
++)
5704 for (j
= 0; j
< length
; j
+= 2)
5706 first_vect
= dr_chain
[j
];
5707 second_vect
= dr_chain
[j
+ 1];
5709 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle2");
5710 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5711 first_vect
, first_vect
,
5713 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5716 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle2");
5717 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5718 second_vect
, second_vect
,
5720 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5723 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shift");
5724 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5725 vect
[0], vect
[1], shift1_mask
);
5726 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5727 (*result_chain
)[j
/2 + length
/2] = data_ref
;
5729 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_select");
5730 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5731 vect
[0], vect
[1], select_mask
);
5732 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5733 (*result_chain
)[j
/2] = data_ref
;
5735 memcpy (dr_chain
.address (), result_chain
->address (),
5736 length
* sizeof (tree
));
5740 if (length
== 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo
) > 2)
5742 unsigned int k
= 0, l
= 0;
      /* Generating permutation constant to get all elements in right order.
	 For vector length 8 it is {0 3 6 1 4 7 2 5}.  */
5746 for (i
= 0; i
< nelt
; i
++)
5748 if (3 * k
+ (l
% 3) >= nelt
)
5751 l
+= (3 - (nelt
% 3));
5753 sel
[i
] = 3 * k
+ (l
% 3);
5756 if (!can_vec_perm_p (TYPE_MODE (vectype
), false, sel
))
5758 if (dump_enabled_p ())
5759 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5760 "shuffle of 3 fields structure is not \
5761 supported by target\n");
5764 perm3_mask
= vect_gen_perm_mask_checked (vectype
, sel
);
      /* Generating permutation constant to shift all elements.
         For vector length 8 it is {6 7 8 9 10 11 12 13}.  */
      for (i = 0; i < nelt; i++)
        sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
      if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "shift permutation is not supported by target\n");
          return false;
        }
      shift1_mask = vect_gen_perm_mask_checked (vectype, sel);

      /* Generating permutation constant to shift all elements.
         For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
      for (i = 0; i < nelt; i++)
        sel[i] = 2 * (nelt / 3) + 1 + i;
      if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "shift permutation is not supported by target\n");
          return false;
        }
      shift2_mask = vect_gen_perm_mask_checked (vectype, sel);

      /* Generating permutation constant to shift all elements.
         For vector length 8 it is {3 4 5 6 7 8 9 10}.  */
      for (i = 0; i < nelt; i++)
        sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
      if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "shift permutation is not supported by target\n");
          return false;
        }
      shift3_mask = vect_gen_perm_mask_checked (vectype, sel);

      /* Generating permutation constant to shift all elements.
         For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
      for (i = 0; i < nelt; i++)
        sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
      if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "shift permutation is not supported by target\n");
          return false;
        }
      shift4_mask = vect_gen_perm_mask_checked (vectype, sel);
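      /* As a quick check of the four formulas above for nelt == 8
         (nelt / 3 == 2, nelt % 3 == 2):

           shift1_mask: 2*2 + 2 + i   = 6 + i  -> {6 7 8 9 10 11 12 13}
           shift2_mask: 2*2 + 1 + i   = 5 + i  -> {5 6 7 8 9 10 11 12}
           shift3_mask: 2 + 2/2 + i   = 3 + i  -> {3 4 5 6 7 8 9 10}
           shift4_mask: 2*2 + 2/2 + i = 5 + i  -> {5 6 7 8 9 10 11 12}

         which matches the per-mask comments.  */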
      for (k = 0; k < 3; k++)
        {
          data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
          perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
                                           dr_chain[k], dr_chain[k],
                                           perm3_mask);
          vect_finish_stmt_generation (stmt, perm_stmt, gsi);
          vect[k] = data_ref;
        }

      for (k = 0; k < 3; k++)
        {
          data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
          perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
                                           vect[k % 3], vect[(k + 1) % 3],
                                           shift1_mask);
          vect_finish_stmt_generation (stmt, perm_stmt, gsi);
          vect_shift[k] = data_ref;
        }

      for (k = 0; k < 3; k++)
        {
          data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
          perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
                                           vect_shift[(4 - k) % 3],
                                           vect_shift[(3 - k) % 3],
                                           shift2_mask);
          vect_finish_stmt_generation (stmt, perm_stmt, gsi);
          vect[k] = data_ref;
        }
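      /* Distribute the rearranged vectors into RESULT_CHAIN.  For nelt == 8
         (nelt % 3 == 2) the slots below work out to:

           (*result_chain)[3 - 2] = (*result_chain)[1]  <- vect[2]
           (*result_chain)[2]                           <- shift3 permutation
           (*result_chain)[0]                           <- shift4 permutation

         so the three output positions 0, 1 and 2 are all filled.  */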
      (*result_chain)[3 - (nelt % 3)] = vect[2];

      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
                                       vect[0], shift3_mask);
      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
      (*result_chain)[nelt % 3] = data_ref;

      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
                                       vect[1], shift4_mask);
      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
      (*result_chain)[0] = data_ref;
      return true;
    }
  return false;
}
/* Function vect_transform_grouped_load.

   Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
   to perform their permutation and ascribe the result vectorized statements to
   the scalar statements.  */

void
vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
                             gimple_stmt_iterator *gsi)
{
  machine_mode mode;
  vec<tree> result_chain = vNULL;

  /* DR_CHAIN contains input data-refs that are a part of the interleaving.
     RESULT_CHAIN is the output of vect_permute_load_chain; it contains
     permuted vectors that are ready for vector computation.  */
  result_chain.create (size);

  /* If the reassociation width for the vector type is 2 or greater, the
     target machine can execute 2 or more vector instructions in parallel.
     Otherwise try to get the chain for the load group using
     vect_shift_permute_load_chain.  */
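  /* For instance, on a target whose reassociation width for VEC_PERM_EXPR
     is 1, a load group of size 3 is first handed to
     vect_shift_permute_load_chain; the generic vect_permute_load_chain is
     used only when that fails (e.g. because a required permutation mask is
     not supported).  */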
  mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
  if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
      || !vect_shift_permute_load_chain (dr_chain, size, stmt,
                                         gsi, &result_chain))
    vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
  vect_record_grouped_load_vectors (stmt, result_chain);
  result_chain.release ();
}
/* RESULT_CHAIN contains the output of a group of grouped loads that were
   generated as part of the vectorization of STMT.  Assign the statement
   for each vector to the associated scalar statement.  */

void
vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
{
  gimple *first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
  gimple *next_stmt, *new_stmt;
  unsigned int i, gap_count;
  tree tmp_data_ref;

  /* Put a permuted data-ref in the VECTORIZED_STMT field.
     Since we scan the chain starting from its first node, their order
     corresponds to the order of data-refs in RESULT_CHAIN.  */
  next_stmt = first_stmt;
  gap_count = 1;
  FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
    {
      if (!next_stmt)
        break;

      /* Skip the gaps.  Loads created for the gaps will be removed by the
         dead code elimination pass later.  No need to check for the first
         stmt in the group, since it always exists.
         GROUP_GAP is the number of steps in elements from the previous
         access (if there is no gap GROUP_GAP is 1).  We skip loads that
         correspond to the gaps.  */
      if (next_stmt != first_stmt
          && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
        {
          gap_count++;
          continue;
        }
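      /* For example, if the group reads only fields 0 and 2 of a 4-field
         record, the second access has GROUP_GAP == 2, so one entry of
         RESULT_CHAIN (the vector loaded for the unused field) is passed
         over here; having no scalar uses, that load is later cleaned up
         by DCE.  */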
      while (next_stmt)
        {
          new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
          /* We assume that if VEC_STMT is not NULL, this is a case of
             multiple copies, and we put the new vector statement in the
             first available RELATED_STMT field.  */
          if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
            STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
          else if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
            {
              gimple *prev_stmt =
                STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
              gimple *rel_stmt =
                STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
              while (rel_stmt)
                {
                  prev_stmt = rel_stmt;
                  rel_stmt =
                    STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
                }
              STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) = new_stmt;
            }

          next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
          gap_count = 1;
          /* If NEXT_STMT accesses the same DR as the previous statement,
             put the same TMP_DATA_REF as its vectorized statement; otherwise
             get the next data-ref from RESULT_CHAIN.  */
          if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
            break;
        }
    }
}
/* Function vect_can_force_dr_alignment_p.

   Returns whether the alignment of a DECL can be forced to be aligned
   on an ALIGNMENT-bit boundary.  */
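/* For illustration (not an exhaustive rule): a file-scope

     static double a[256];

   is TREE_STATIC, so its alignment may be raised as far as
   MAX_OFILE_ALIGNMENT allows, whereas an automatic array inside a function
   is limited by MAX_STACK_ALIGNMENT, and a decl already in the symbol table
   whose alignment cannot be increased is rejected outright.  */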
bool
vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
{
  if (TREE_CODE (decl) != VAR_DECL)
    return false;

  if (decl_in_symtab_p (decl)
      && !symtab_node::get (decl)->can_increase_alignment_p ())
    return false;

  if (TREE_STATIC (decl))
    return (alignment <= MAX_OFILE_ALIGNMENT);
  else
    return (alignment <= MAX_STACK_ALIGNMENT);
}
/* Return whether the data reference DR is supported with respect to its
   alignment.
   If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
   if it is aligned, i.e., check if it is possible to vectorize it with a
   different alignment.  */

enum dr_alignment_support
vect_supportable_dr_alignment (struct data_reference *dr,
                               bool check_aligned_accesses)
{
  gimple *stmt = DR_STMT (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  machine_mode mode = TYPE_MODE (vectype);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *vect_loop = NULL;
  bool nested_in_vect_loop = false;

  if (aligned_access_p (dr) && !check_aligned_accesses)
    return dr_aligned;

  /* For now assume all conditional loads/stores support unaligned
     access without any special code.  */
  if (is_gimple_call (stmt)
      && gimple_call_internal_p (stmt)
      && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
          || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
    return dr_unaligned_supported;

  if (loop_vinfo)
    {
      vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
      nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
    }
  /* Possibly unaligned access.  */

  /* We can choose between using the implicit realignment scheme (generating
     a misaligned_move stmt) and the explicit realignment scheme (generating
     aligned loads with a REALIGN_LOAD).  There are two variants to the
     explicit realignment scheme: optimized, and unoptimized.
     We can optimize the realignment only if the step between consecutive
     vector loads is equal to the vector size.  Since the vector memory
     accesses advance in steps of VS (Vector Size) in the vectorized loop, it
     is guaranteed that the misalignment amount remains the same throughout
     the execution of the vectorized loop.  Therefore, we can create the
     "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
     at the loop preheader.

     However, in the case of outer-loop vectorization, when vectorizing a
     memory access in the inner-loop nested within the LOOP that is now being
     vectorized, while it is guaranteed that the misalignment of the
     vectorized memory access will remain the same in different outer-loop
     iterations, it is *not* guaranteed that it will remain the same
     throughout the execution of the inner-loop.  This is because the
     inner-loop advances with the original scalar step (and not in steps of
     VS).  If the inner-loop step happens to be a multiple of VS, then the
     misalignment remains fixed and we can use the optimized realignment
     scheme.  For example:

      for (i=0; i<N; i++)
        for (j=0; j<M; j++)
          s += a[i+j];

     When vectorizing the i-loop in the above example, the step between
     consecutive vector loads is 1, and so the misalignment does not remain
     fixed across the execution of the inner-loop, and the realignment cannot
     be optimized (as illustrated in the following pseudo vectorized loop):

      for (i=0; i<N; i+=4)
        for (j=0; j<M; j++){
          vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
                         // when j is {0,1,2,3,4,5,6,7,...} respectively.
                         // (assuming that we start from an aligned address).
          }

     We therefore have to use the unoptimized realignment scheme:

      for (i=0; i<N; i+=4)
        for (j=k; j<M; j+=4)
          vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
                         // that the misalignment of the initial address is 0).

     The loop can then be vectorized as follows:

      for (k=0; k<4; k++){
        rt = get_realignment_token (&vp[k]);
        for (i=0; i<N; i+=4){
          v1 = vp[i+k];
          for (j=k; j<M; j+=4){
            v2 = vp[i+j+VS-1];
            va = REALIGN_LOAD <v1,v2,rt>;
            vs += va;
            v1 = v2;
          }
        }
      } */
  if (DR_IS_READ (dr))
    {
      bool is_packed = false;
      tree type = (TREE_TYPE (DR_REF (dr)));

      if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
          && (!targetm.vectorize.builtin_mask_for_load
              || targetm.vectorize.builtin_mask_for_load ()))
        {
          tree vectype = STMT_VINFO_VECTYPE (stmt_info);

          /* If we are doing SLP then the accesses need not have the
             same alignment, instead it depends on the SLP group size.  */
          if (loop_vinfo
              && STMT_SLP_TYPE (stmt_info)
              && (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
                  * GROUP_SIZE (vinfo_for_stmt
                                  (GROUP_FIRST_ELEMENT (stmt_info)))
                  % TYPE_VECTOR_SUBPARTS (vectype) != 0))
            ;
          else if (!loop_vinfo
                   || (nested_in_vect_loop
                       && (TREE_INT_CST_LOW (DR_STEP (dr))
                           != GET_MODE_SIZE (TYPE_MODE (vectype)))))
            return dr_explicit_realign;
          else
            return dr_explicit_realign_optimized;
        }
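      /* A worked instance of the SLP check above: with a vectorization
         factor of 2, a group of size 3 and 4 elements per vector,
         2 * 3 % 4 == 2, so the group's misalignment is not the same in
         every vector iteration and neither explicit realignment scheme is
         returned; with a factor of 4 instead, 4 * 3 % 4 == 0 and they
         remain available.  */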
      if (!known_alignment_for_access_p (dr))
        is_packed = not_size_aligned (DR_REF (dr));

      if ((TYPE_USER_ALIGN (type) && !is_packed)
          || targetm.vectorize.
               support_vector_misalignment (mode, type,
                                            DR_MISALIGNMENT (dr), is_packed))
        /* Can't software pipeline the loads, but can at least do them.  */
        return dr_unaligned_supported;
    }
  else
    {
      bool is_packed = false;
      tree type = (TREE_TYPE (DR_REF (dr)));

      if (!known_alignment_for_access_p (dr))
        is_packed = not_size_aligned (DR_REF (dr));

      if ((TYPE_USER_ALIGN (type) && !is_packed)
          || targetm.vectorize.
               support_vector_misalignment (mode, type,
                                            DR_MISALIGNMENT (dr), is_packed))
        return dr_unaligned_supported;
    }

  /* Unsupported.  */
  return dr_unaligned_unsupported;
}