1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2016 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
24 #include "coretypes.h"
33 #include "optabs-tree.h"
37 #include "fold-const.h"
38 #include "stor-layout.h"
41 #include "gimple-iterator.h"
42 #include "gimplify-me.h"
43 #include "tree-ssa-loop-ivopts.h"
44 #include "tree-ssa-loop-manip.h"
45 #include "tree-ssa-loop.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
53 /* Return true if load- or store-lanes optab OPTAB is implemented for
54 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
57 vect_lanes_optab_supported_p (const char *name
, convert_optab optab
,
58 tree vectype
, unsigned HOST_WIDE_INT count
)
60 machine_mode mode
, array_mode
;
63 mode
= TYPE_MODE (vectype
);
64 limit_p
= !targetm
.array_mode_supported_p (mode
, count
);
65 array_mode
= mode_for_size (count
* GET_MODE_BITSIZE (mode
),
68 if (array_mode
== BLKmode
)
70 if (dump_enabled_p ())
71 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
72 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC
"]\n",
73 GET_MODE_NAME (mode
), count
);
77 if (convert_optab_handler (optab
, array_mode
, mode
) == CODE_FOR_nothing
)
79 if (dump_enabled_p ())
80 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
81 "cannot use %s<%s><%s>\n", name
,
82 GET_MODE_NAME (array_mode
), GET_MODE_NAME (mode
));
86 if (dump_enabled_p ())
87 dump_printf_loc (MSG_NOTE
, vect_location
,
88 "can use %s<%s><%s>\n", name
, GET_MODE_NAME (array_mode
),
89 GET_MODE_NAME (mode
));
95 /* Return the smallest scalar part of STMT.
96 This is used to determine the vectype of the stmt. We generally set the
97 vectype according to the type of the result (lhs). For stmts whose
98 result-type is different than the type of the arguments (e.g., demotion,
99 promotion), vectype will be reset appropriately (later). Note that we have
100 to visit the smallest datatype in this function, because that determines the
101 VF. If the smallest datatype in the loop is present only as the rhs of a
102 promotion operation - we'd miss it.
103 Such a case, where a variable of this datatype does not appear in the lhs
104 anywhere in the loop, can only occur if it's an invariant: e.g.:
105 'int_x = (int) short_inv', which we'd expect to have been optimized away by
106 invariant motion. However, we cannot rely on invariant motion to always
107 take invariants out of the loop, and so in the case of promotion we also
108 have to check the rhs.
109 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
113 vect_get_smallest_scalar_type (gimple
*stmt
, HOST_WIDE_INT
*lhs_size_unit
,
114 HOST_WIDE_INT
*rhs_size_unit
)
116 tree scalar_type
= gimple_expr_type (stmt
);
117 HOST_WIDE_INT lhs
, rhs
;
119 lhs
= rhs
= TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type
));
121 if (is_gimple_assign (stmt
)
122 && (gimple_assign_cast_p (stmt
)
123 || gimple_assign_rhs_code (stmt
) == WIDEN_MULT_EXPR
124 || gimple_assign_rhs_code (stmt
) == WIDEN_LSHIFT_EXPR
125 || gimple_assign_rhs_code (stmt
) == FLOAT_EXPR
))
127 tree rhs_type
= TREE_TYPE (gimple_assign_rhs1 (stmt
));
129 rhs
= TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type
));
131 scalar_type
= rhs_type
;
134 *lhs_size_unit
= lhs
;
135 *rhs_size_unit
= rhs
;
140 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
141 tested at run-time. Return TRUE if DDR was successfully inserted.
142 Return false if versioning is not supported. */
145 vect_mark_for_runtime_alias_test (ddr_p ddr
, loop_vec_info loop_vinfo
)
147 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
149 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS
) == 0)
152 if (dump_enabled_p ())
154 dump_printf_loc (MSG_NOTE
, vect_location
,
155 "mark for run-time aliasing test between ");
156 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (DDR_A (ddr
)));
157 dump_printf (MSG_NOTE
, " and ");
158 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (DDR_B (ddr
)));
159 dump_printf (MSG_NOTE
, "\n");
162 if (optimize_loop_nest_for_size_p (loop
))
164 if (dump_enabled_p ())
165 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
166 "versioning not supported when optimizing"
171 /* FORNOW: We don't support versioning with outer-loop vectorization. */
174 if (dump_enabled_p ())
175 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
176 "versioning not yet supported for outer-loops.\n");
180 /* FORNOW: We don't support creating runtime alias tests for non-constant
182 if (TREE_CODE (DR_STEP (DDR_A (ddr
))) != INTEGER_CST
183 || TREE_CODE (DR_STEP (DDR_B (ddr
))) != INTEGER_CST
)
185 if (dump_enabled_p ())
186 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
187 "versioning not yet supported for non-constant "
192 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo
).safe_push (ddr
);
197 /* Function vect_analyze_data_ref_dependence.
199 Return TRUE if there (might) exist a dependence between a memory-reference
200 DRA and a memory-reference DRB. When versioning for alias may check a
201 dependence at run-time, return FALSE. Adjust *MAX_VF according to
202 the data dependence. */
205 vect_analyze_data_ref_dependence (struct data_dependence_relation
*ddr
,
206 loop_vec_info loop_vinfo
, int *max_vf
)
209 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
210 struct data_reference
*dra
= DDR_A (ddr
);
211 struct data_reference
*drb
= DDR_B (ddr
);
212 stmt_vec_info stmtinfo_a
= vinfo_for_stmt (DR_STMT (dra
));
213 stmt_vec_info stmtinfo_b
= vinfo_for_stmt (DR_STMT (drb
));
214 lambda_vector dist_v
;
215 unsigned int loop_depth
;
217 /* In loop analysis all data references should be vectorizable. */
218 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a
)
219 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b
))
222 /* Independent data accesses. */
223 if (DDR_ARE_DEPENDENT (ddr
) == chrec_known
)
227 || (DR_IS_READ (dra
) && DR_IS_READ (drb
)))
230 /* Even if we have an anti-dependence then, as the vectorized loop covers at
231 least two scalar iterations, there is always also a true dependence.
232 As the vectorizer does not re-order loads and stores we can ignore
233 the anti-dependence if TBAA can disambiguate both DRs similar to the
234 case with known negative distance anti-dependences (positive
235 distance anti-dependences would violate TBAA constraints). */
236 if (((DR_IS_READ (dra
) && DR_IS_WRITE (drb
))
237 || (DR_IS_WRITE (dra
) && DR_IS_READ (drb
)))
238 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra
)),
239 get_alias_set (DR_REF (drb
))))
242 /* Unknown data dependence. */
243 if (DDR_ARE_DEPENDENT (ddr
) == chrec_dont_know
)
245 /* If user asserted safelen consecutive iterations can be
246 executed concurrently, assume independence. */
247 if (loop
->safelen
>= 2)
249 if (loop
->safelen
< *max_vf
)
250 *max_vf
= loop
->safelen
;
251 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo
) = false;
255 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a
)
256 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b
))
258 if (dump_enabled_p ())
260 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
261 "versioning for alias not supported for: "
262 "can't determine dependence between ");
263 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
265 dump_printf (MSG_MISSED_OPTIMIZATION
, " and ");
266 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
268 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
273 if (dump_enabled_p ())
275 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
276 "versioning for alias required: "
277 "can't determine dependence between ");
278 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
280 dump_printf (MSG_MISSED_OPTIMIZATION
, " and ");
281 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
283 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
286 /* Add to list of ddrs that need to be tested at run-time. */
287 return !vect_mark_for_runtime_alias_test (ddr
, loop_vinfo
);
290 /* Known data dependence. */
291 if (DDR_NUM_DIST_VECTS (ddr
) == 0)
293 /* If user asserted safelen consecutive iterations can be
294 executed concurrently, assume independence. */
295 if (loop
->safelen
>= 2)
297 if (loop
->safelen
< *max_vf
)
298 *max_vf
= loop
->safelen
;
299 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo
) = false;
303 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a
)
304 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b
))
306 if (dump_enabled_p ())
308 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
309 "versioning for alias not supported for: "
310 "bad dist vector for ");
311 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
313 dump_printf (MSG_MISSED_OPTIMIZATION
, " and ");
314 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
316 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
321 if (dump_enabled_p ())
323 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
324 "versioning for alias required: "
325 "bad dist vector for ");
326 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, DR_REF (dra
));
327 dump_printf (MSG_MISSED_OPTIMIZATION
, " and ");
328 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, DR_REF (drb
));
329 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
331 /* Add to list of ddrs that need to be tested at run-time. */
332 return !vect_mark_for_runtime_alias_test (ddr
, loop_vinfo
);
335 loop_depth
= index_in_loop_nest (loop
->num
, DDR_LOOP_NEST (ddr
));
336 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr
), i
, dist_v
)
338 int dist
= dist_v
[loop_depth
];
340 if (dump_enabled_p ())
341 dump_printf_loc (MSG_NOTE
, vect_location
,
342 "dependence distance = %d.\n", dist
);
346 if (dump_enabled_p ())
348 dump_printf_loc (MSG_NOTE
, vect_location
,
349 "dependence distance == 0 between ");
350 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dra
));
351 dump_printf (MSG_NOTE
, " and ");
352 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (drb
));
353 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
356 /* When we perform grouped accesses and perform implicit CSE
357 by detecting equal accesses and doing disambiguation with
358 runtime alias tests like for
366 where we will end up loading { a[i], a[i+1] } once, make
367 sure that inserting group loads before the first load and
368 stores after the last store will do the right thing.
369 Similar for groups like
373 where loads from the group interleave with the store. */
374 if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a
)
375 || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b
))
377 gimple
*earlier_stmt
;
378 earlier_stmt
= get_earlier_stmt (DR_STMT (dra
), DR_STMT (drb
));
380 (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt
))))
382 if (dump_enabled_p ())
383 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
384 "READ_WRITE dependence in interleaving."
393 if (dist
> 0 && DDR_REVERSED_P (ddr
))
395 /* If DDR_REVERSED_P the order of the data-refs in DDR was
396 reversed (to make distance vector positive), and the actual
397 distance is negative. */
398 if (dump_enabled_p ())
399 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
400 "dependence distance negative.\n");
401 /* Record a negative dependence distance to later limit the
402 amount of stmt copying / unrolling we can perform.
403 Only need to handle read-after-write dependence. */
405 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b
) == 0
406 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b
) > (unsigned)dist
))
407 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b
) = dist
;
412 && abs (dist
) < *max_vf
)
414 /* The dependence distance requires reduction of the maximal
415 vectorization factor. */
416 *max_vf
= abs (dist
);
417 if (dump_enabled_p ())
418 dump_printf_loc (MSG_NOTE
, vect_location
,
419 "adjusting maximal vectorization factor to %i\n",
423 if (abs (dist
) >= *max_vf
)
425 /* Dependence distance does not create dependence, as far as
426 vectorization is concerned, in this case. */
427 if (dump_enabled_p ())
428 dump_printf_loc (MSG_NOTE
, vect_location
,
429 "dependence distance >= VF.\n");
433 if (dump_enabled_p ())
435 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
436 "not vectorized, possible dependence "
437 "between data-refs ");
438 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dra
));
439 dump_printf (MSG_NOTE
, " and ");
440 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (drb
));
441 dump_printf (MSG_NOTE
, "\n");
450 /* Function vect_analyze_data_ref_dependences.
452 Examine all the data references in the loop, and make sure there do not
453 exist any data dependences between them. Set *MAX_VF according to
454 the maximum vectorization factor the data dependences allow. */
457 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo
, int *max_vf
)
460 struct data_dependence_relation
*ddr
;
462 if (dump_enabled_p ())
463 dump_printf_loc (MSG_NOTE
, vect_location
,
464 "=== vect_analyze_data_ref_dependences ===\n");
466 LOOP_VINFO_DDRS (loop_vinfo
)
467 .create (LOOP_VINFO_DATAREFS (loop_vinfo
).length ()
468 * LOOP_VINFO_DATAREFS (loop_vinfo
).length ());
469 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo
) = true;
470 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo
),
471 &LOOP_VINFO_DDRS (loop_vinfo
),
472 LOOP_VINFO_LOOP_NEST (loop_vinfo
), true))
475 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo
), i
, ddr
)
476 if (vect_analyze_data_ref_dependence (ddr
, loop_vinfo
, max_vf
))
483 /* Function vect_slp_analyze_data_ref_dependence.
485 Return TRUE if there (might) exist a dependence between a memory-reference
486 DRA and a memory-reference DRB. When versioning for alias may check a
487 dependence at run-time, return FALSE. Adjust *MAX_VF according to
488 the data dependence. */
491 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation
*ddr
)
493 struct data_reference
*dra
= DDR_A (ddr
);
494 struct data_reference
*drb
= DDR_B (ddr
);
496 /* We need to check dependences of statements marked as unvectorizable
497 as well, they still can prohibit vectorization. */
499 /* Independent data accesses. */
500 if (DDR_ARE_DEPENDENT (ddr
) == chrec_known
)
506 /* Read-read is OK. */
507 if (DR_IS_READ (dra
) && DR_IS_READ (drb
))
510 /* If dra and drb are part of the same interleaving chain consider
512 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra
)))
513 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra
)))
514 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb
)))))
517 /* Unknown data dependence. */
518 if (DDR_ARE_DEPENDENT (ddr
) == chrec_dont_know
)
520 if (dump_enabled_p ())
522 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
523 "can't determine dependence between ");
524 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, DR_REF (dra
));
525 dump_printf (MSG_MISSED_OPTIMIZATION
, " and ");
526 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, DR_REF (drb
));
527 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
530 else if (dump_enabled_p ())
532 dump_printf_loc (MSG_NOTE
, vect_location
,
533 "determined dependence between ");
534 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dra
));
535 dump_printf (MSG_NOTE
, " and ");
536 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (drb
));
537 dump_printf (MSG_NOTE
, "\n");
544 /* Analyze dependences involved in the transform of SLP NODE. STORES
545 contain the vector of scalar stores of this instance if we are
546 disambiguating the loads. */
549 vect_slp_analyze_node_dependences (slp_instance instance
, slp_tree node
,
550 vec
<gimple
*> stores
, gimple
*last_store
)
552 /* This walks over all stmts involved in the SLP load/store done
553 in NODE verifying we can sink them up to the last stmt in the
555 gimple
*last_access
= vect_find_last_scalar_stmt_in_slp (node
);
556 for (unsigned k
= 0; k
< SLP_INSTANCE_GROUP_SIZE (instance
); ++k
)
558 gimple
*access
= SLP_TREE_SCALAR_STMTS (node
)[k
];
559 if (access
== last_access
)
561 data_reference
*dr_a
= STMT_VINFO_DATA_REF (vinfo_for_stmt (access
));
562 for (gimple_stmt_iterator gsi
= gsi_for_stmt (access
);
563 gsi_stmt (gsi
) != last_access
; gsi_next (&gsi
))
565 gimple
*stmt
= gsi_stmt (gsi
);
566 if (! gimple_vuse (stmt
)
567 || (DR_IS_READ (dr_a
) && ! gimple_vdef (stmt
)))
570 /* If we couldn't record a (single) data reference for this
571 stmt we have to give up. */
572 /* ??? Here and below if dependence analysis fails we can resort
573 to the alias oracle which can handle more kinds of stmts. */
574 data_reference
*dr_b
= STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt
));
578 /* If we run into a store of this same instance (we've just
579 marked those) then delay dependence checking until we run
580 into the last store because this is where it will have
581 been sunk to (and we verify if we can do that as well). */
582 if (gimple_visited_p (stmt
))
584 if (stmt
!= last_store
)
588 FOR_EACH_VEC_ELT (stores
, i
, store
)
590 data_reference
*store_dr
591 = STMT_VINFO_DATA_REF (vinfo_for_stmt (store
));
592 ddr_p ddr
= initialize_data_dependence_relation
593 (dr_a
, store_dr
, vNULL
);
594 if (vect_slp_analyze_data_ref_dependence (ddr
))
596 free_dependence_relation (ddr
);
599 free_dependence_relation (ddr
);
603 ddr_p ddr
= initialize_data_dependence_relation (dr_a
, dr_b
, vNULL
);
604 if (vect_slp_analyze_data_ref_dependence (ddr
))
606 free_dependence_relation (ddr
);
609 free_dependence_relation (ddr
);
616 /* Function vect_analyze_data_ref_dependences.
618 Examine all the data references in the basic-block, and make sure there
619 do not exist any data dependences between them. Set *MAX_VF according to
620 the maximum vectorization factor the data dependences allow. */
623 vect_slp_analyze_instance_dependence (slp_instance instance
)
625 if (dump_enabled_p ())
626 dump_printf_loc (MSG_NOTE
, vect_location
,
627 "=== vect_slp_analyze_instance_dependence ===\n");
629 /* The stores of this instance are at the root of the SLP tree. */
630 slp_tree store
= SLP_INSTANCE_TREE (instance
);
631 if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store
)[0])))
634 /* Verify we can sink stores to the vectorized stmt insert location. */
635 gimple
*last_store
= NULL
;
638 if (! vect_slp_analyze_node_dependences (instance
, store
, vNULL
, NULL
))
641 /* Mark stores in this instance and remember the last one. */
642 last_store
= vect_find_last_scalar_stmt_in_slp (store
);
643 for (unsigned k
= 0; k
< SLP_INSTANCE_GROUP_SIZE (instance
); ++k
)
644 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store
)[k
], true);
649 /* Verify we can sink loads to the vectorized stmt insert location,
650 special-casing stores of this instance. */
653 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance
), i
, load
)
654 if (! vect_slp_analyze_node_dependences (instance
, load
,
656 ? SLP_TREE_SCALAR_STMTS (store
)
657 : vNULL
, last_store
))
663 /* Unset the visited flag. */
665 for (unsigned k
= 0; k
< SLP_INSTANCE_GROUP_SIZE (instance
); ++k
)
666 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store
)[k
], false);
671 /* Function vect_compute_data_ref_alignment
673 Compute the misalignment of the data reference DR.
676 1. If during the misalignment computation it is found that the data reference
677 cannot be vectorized then false is returned.
678 2. DR_MISALIGNMENT (DR) is defined.
680 FOR NOW: No analysis is actually performed. Misalignment is calculated
681 only for trivial cases. TODO. */
684 vect_compute_data_ref_alignment (struct data_reference
*dr
)
686 gimple
*stmt
= DR_STMT (dr
);
687 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
688 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
689 struct loop
*loop
= NULL
;
690 tree ref
= DR_REF (dr
);
692 tree base
, base_addr
;
693 tree misalign
= NULL_TREE
;
695 unsigned HOST_WIDE_INT alignment
;
697 if (dump_enabled_p ())
698 dump_printf_loc (MSG_NOTE
, vect_location
,
699 "vect_compute_data_ref_alignment:\n");
702 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
704 /* Initialize misalignment to unknown. */
705 SET_DR_MISALIGNMENT (dr
, -1);
707 if (tree_fits_shwi_p (DR_STEP (dr
)))
708 misalign
= DR_INIT (dr
);
709 aligned_to
= DR_ALIGNED_TO (dr
);
710 base_addr
= DR_BASE_ADDRESS (dr
);
711 vectype
= STMT_VINFO_VECTYPE (stmt_info
);
713 /* In case the dataref is in an inner-loop of the loop that is being
714 vectorized (LOOP), we use the base and misalignment information
715 relative to the outer-loop (LOOP). This is ok only if the misalignment
716 stays the same throughout the execution of the inner-loop, which is why
717 we have to check that the stride of the dataref in the inner-loop evenly
718 divides by the vector size. */
719 if (loop
&& nested_in_vect_loop_p (loop
, stmt
))
721 tree step
= DR_STEP (dr
);
723 if (tree_fits_shwi_p (step
)
724 && tree_to_shwi (step
) % GET_MODE_SIZE (TYPE_MODE (vectype
)) == 0)
726 if (dump_enabled_p ())
727 dump_printf_loc (MSG_NOTE
, vect_location
,
728 "inner step divides the vector-size.\n");
729 misalign
= STMT_VINFO_DR_INIT (stmt_info
);
730 aligned_to
= STMT_VINFO_DR_ALIGNED_TO (stmt_info
);
731 base_addr
= STMT_VINFO_DR_BASE_ADDRESS (stmt_info
);
735 if (dump_enabled_p ())
736 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
737 "inner step doesn't divide the vector-size.\n");
738 misalign
= NULL_TREE
;
742 /* Similarly we can only use base and misalignment information relative to
743 an innermost loop if the misalignment stays the same throughout the
744 execution of the loop. As above, this is the case if the stride of
745 the dataref evenly divides by the vector size. */
748 tree step
= DR_STEP (dr
);
749 unsigned vf
= loop
? LOOP_VINFO_VECT_FACTOR (loop_vinfo
) : 1;
751 if (tree_fits_shwi_p (step
)
752 && ((tree_to_shwi (step
) * vf
)
753 % GET_MODE_SIZE (TYPE_MODE (vectype
)) != 0))
755 if (dump_enabled_p ())
756 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
757 "step doesn't divide the vector-size.\n");
758 misalign
= NULL_TREE
;
762 /* To look at alignment of the base we have to preserve an inner MEM_REF
763 as that carries alignment information of the actual access. */
765 while (handled_component_p (base
))
766 base
= TREE_OPERAND (base
, 0);
767 if (TREE_CODE (base
) == MEM_REF
)
768 base
= build2 (MEM_REF
, TREE_TYPE (base
), base_addr
,
769 build_int_cst (TREE_TYPE (TREE_OPERAND (base
, 1)), 0));
770 unsigned int base_alignment
= get_object_alignment (base
);
772 if (base_alignment
>= TYPE_ALIGN (TREE_TYPE (vectype
)))
773 DR_VECT_AUX (dr
)->base_element_aligned
= true;
775 alignment
= TYPE_ALIGN_UNIT (vectype
);
777 if ((compare_tree_int (aligned_to
, alignment
) < 0)
780 if (dump_enabled_p ())
782 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
783 "Unknown alignment for access: ");
784 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, ref
);
785 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
790 if (base_alignment
< TYPE_ALIGN (vectype
))
792 /* Strip an inner MEM_REF to a bare decl if possible. */
793 if (TREE_CODE (base
) == MEM_REF
794 && integer_zerop (TREE_OPERAND (base
, 1))
795 && TREE_CODE (TREE_OPERAND (base
, 0)) == ADDR_EXPR
)
796 base
= TREE_OPERAND (TREE_OPERAND (base
, 0), 0);
798 if (!vect_can_force_dr_alignment_p (base
, TYPE_ALIGN (vectype
)))
800 if (dump_enabled_p ())
802 dump_printf_loc (MSG_NOTE
, vect_location
,
803 "can't force alignment of ref: ");
804 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, ref
);
805 dump_printf (MSG_NOTE
, "\n");
810 /* Force the alignment of the decl.
811 NOTE: This is the only change to the code we make during
812 the analysis phase, before deciding to vectorize the loop. */
813 if (dump_enabled_p ())
815 dump_printf_loc (MSG_NOTE
, vect_location
, "force alignment of ");
816 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, ref
);
817 dump_printf (MSG_NOTE
, "\n");
820 DR_VECT_AUX (dr
)->base_decl
= base
;
821 DR_VECT_AUX (dr
)->base_misaligned
= true;
822 DR_VECT_AUX (dr
)->base_element_aligned
= true;
825 /* If this is a backward running DR then first access in the larger
826 vectype actually is N-1 elements before the address in the DR.
827 Adjust misalign accordingly. */
828 if (tree_int_cst_sgn (DR_STEP (dr
)) < 0)
830 tree offset
= ssize_int (TYPE_VECTOR_SUBPARTS (vectype
) - 1);
831 /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
832 otherwise we wouldn't be here. */
833 offset
= fold_build2 (MULT_EXPR
, ssizetype
, offset
, DR_STEP (dr
));
834 /* PLUS because DR_STEP was negative. */
835 misalign
= size_binop (PLUS_EXPR
, misalign
, offset
);
838 SET_DR_MISALIGNMENT (dr
,
839 wi::mod_floor (misalign
, alignment
, SIGNED
).to_uhwi ());
841 if (dump_enabled_p ())
843 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
844 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr
));
845 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, ref
);
846 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
853 /* Function vect_update_misalignment_for_peel
855 DR - the data reference whose misalignment is to be adjusted.
856 DR_PEEL - the data reference whose misalignment is being made
857 zero in the vector loop by the peel.
858 NPEEL - the number of iterations in the peel loop if the misalignment
859 of DR_PEEL is known at compile time. */
862 vect_update_misalignment_for_peel (struct data_reference
*dr
,
863 struct data_reference
*dr_peel
, int npeel
)
866 vec
<dr_p
> same_align_drs
;
867 struct data_reference
*current_dr
;
868 int dr_size
= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr
))));
869 int dr_peel_size
= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel
))));
870 stmt_vec_info stmt_info
= vinfo_for_stmt (DR_STMT (dr
));
871 stmt_vec_info peel_stmt_info
= vinfo_for_stmt (DR_STMT (dr_peel
));
873 /* For interleaved data accesses the step in the loop must be multiplied by
874 the size of the interleaving group. */
875 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
876 dr_size
*= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info
)));
877 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info
))
878 dr_peel_size
*= GROUP_SIZE (peel_stmt_info
);
880 /* It can be assumed that the data refs with the same alignment as dr_peel
881 are aligned in the vector loop. */
883 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel
)));
884 FOR_EACH_VEC_ELT (same_align_drs
, i
, current_dr
)
886 if (current_dr
!= dr
)
888 gcc_assert (DR_MISALIGNMENT (dr
) / dr_size
==
889 DR_MISALIGNMENT (dr_peel
) / dr_peel_size
);
890 SET_DR_MISALIGNMENT (dr
, 0);
894 if (known_alignment_for_access_p (dr
)
895 && known_alignment_for_access_p (dr_peel
))
897 bool negative
= tree_int_cst_compare (DR_STEP (dr
), size_zero_node
) < 0;
898 int misal
= DR_MISALIGNMENT (dr
);
899 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
900 misal
+= negative
? -npeel
* dr_size
: npeel
* dr_size
;
901 misal
&= (TYPE_ALIGN (vectype
) / BITS_PER_UNIT
) - 1;
902 SET_DR_MISALIGNMENT (dr
, misal
);
906 if (dump_enabled_p ())
907 dump_printf_loc (MSG_NOTE
, vect_location
, "Setting misalignment to -1.\n");
908 SET_DR_MISALIGNMENT (dr
, -1);
912 /* Function verify_data_ref_alignment
914 Return TRUE if DR can be handled with respect to alignment. */
917 verify_data_ref_alignment (data_reference_p dr
)
919 enum dr_alignment_support supportable_dr_alignment
920 = vect_supportable_dr_alignment (dr
, false);
921 if (!supportable_dr_alignment
)
923 if (dump_enabled_p ())
926 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
927 "not vectorized: unsupported unaligned load.");
929 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
930 "not vectorized: unsupported unaligned "
933 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
935 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
940 if (supportable_dr_alignment
!= dr_aligned
&& dump_enabled_p ())
941 dump_printf_loc (MSG_NOTE
, vect_location
,
942 "Vectorizing an unaligned access.\n");
947 /* Function vect_verify_datarefs_alignment
949 Return TRUE if all data references in the loop can be
950 handled with respect to alignment. */
953 vect_verify_datarefs_alignment (loop_vec_info vinfo
)
955 vec
<data_reference_p
> datarefs
= vinfo
->datarefs
;
956 struct data_reference
*dr
;
959 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
961 gimple
*stmt
= DR_STMT (dr
);
962 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
964 if (!STMT_VINFO_RELEVANT_P (stmt_info
))
967 /* For interleaving, only the alignment of the first access matters. */
968 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
969 && GROUP_FIRST_ELEMENT (stmt_info
) != stmt
)
972 /* Strided accesses perform only component accesses, alignment is
973 irrelevant for them. */
974 if (STMT_VINFO_STRIDED_P (stmt_info
)
975 && !STMT_VINFO_GROUPED_ACCESS (stmt_info
))
978 if (! verify_data_ref_alignment (dr
))
985 /* Given an memory reference EXP return whether its alignment is less
989 not_size_aligned (tree exp
)
991 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp
))))
994 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp
)))
995 > get_object_alignment (exp
));
998 /* Function vector_alignment_reachable_p
1000 Return true if vector alignment for DR is reachable by peeling
1001 a few loop iterations. Return false otherwise. */
1004 vector_alignment_reachable_p (struct data_reference
*dr
)
1006 gimple
*stmt
= DR_STMT (dr
);
1007 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
1008 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
1010 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1012 /* For interleaved access we peel only if number of iterations in
1013 the prolog loop ({VF - misalignment}), is a multiple of the
1014 number of the interleaved accesses. */
1015 int elem_size
, mis_in_elements
;
1016 int nelements
= TYPE_VECTOR_SUBPARTS (vectype
);
1018 /* FORNOW: handle only known alignment. */
1019 if (!known_alignment_for_access_p (dr
))
1022 elem_size
= GET_MODE_SIZE (TYPE_MODE (vectype
)) / nelements
;
1023 mis_in_elements
= DR_MISALIGNMENT (dr
) / elem_size
;
1025 if ((nelements
- mis_in_elements
) % GROUP_SIZE (stmt_info
))
1029 /* If misalignment is known at the compile time then allow peeling
1030 only if natural alignment is reachable through peeling. */
1031 if (known_alignment_for_access_p (dr
) && !aligned_access_p (dr
))
1033 HOST_WIDE_INT elmsize
=
1034 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype
)));
1035 if (dump_enabled_p ())
1037 dump_printf_loc (MSG_NOTE
, vect_location
,
1038 "data size =" HOST_WIDE_INT_PRINT_DEC
, elmsize
);
1039 dump_printf (MSG_NOTE
,
1040 ". misalignment = %d.\n", DR_MISALIGNMENT (dr
));
1042 if (DR_MISALIGNMENT (dr
) % elmsize
)
1044 if (dump_enabled_p ())
1045 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1046 "data size does not divide the misalignment.\n");
1051 if (!known_alignment_for_access_p (dr
))
1053 tree type
= TREE_TYPE (DR_REF (dr
));
1054 bool is_packed
= not_size_aligned (DR_REF (dr
));
1055 if (dump_enabled_p ())
1056 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1057 "Unknown misalignment, is_packed = %d\n",is_packed
);
1058 if ((TYPE_USER_ALIGN (type
) && !is_packed
)
1059 || targetm
.vectorize
.vector_alignment_reachable (type
, is_packed
))
1069 /* Calculate the cost of the memory access represented by DR. */
1072 vect_get_data_access_cost (struct data_reference
*dr
,
1073 unsigned int *inside_cost
,
1074 unsigned int *outside_cost
,
1075 stmt_vector_for_cost
*body_cost_vec
)
1077 gimple
*stmt
= DR_STMT (dr
);
1078 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
1079 int nunits
= TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info
));
1080 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
1081 int vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
1082 int ncopies
= vf
/ nunits
;
1084 if (DR_IS_READ (dr
))
1085 vect_get_load_cost (dr
, ncopies
, true, inside_cost
, outside_cost
,
1086 NULL
, body_cost_vec
, false);
1088 vect_get_store_cost (dr
, ncopies
, inside_cost
, body_cost_vec
);
1090 if (dump_enabled_p ())
1091 dump_printf_loc (MSG_NOTE
, vect_location
,
1092 "vect_get_data_access_cost: inside_cost = %d, "
1093 "outside_cost = %d.\n", *inside_cost
, *outside_cost
);
1097 typedef struct _vect_peel_info
1100 struct data_reference
*dr
;
1104 typedef struct _vect_peel_extended_info
1106 struct _vect_peel_info peel_info
;
1107 unsigned int inside_cost
;
1108 unsigned int outside_cost
;
1109 stmt_vector_for_cost body_cost_vec
;
1110 } *vect_peel_extended_info
;
1113 /* Peeling hashtable helpers. */
1115 struct peel_info_hasher
: free_ptr_hash
<_vect_peel_info
>
1117 static inline hashval_t
hash (const _vect_peel_info
*);
1118 static inline bool equal (const _vect_peel_info
*, const _vect_peel_info
*);
1122 peel_info_hasher::hash (const _vect_peel_info
*peel_info
)
1124 return (hashval_t
) peel_info
->npeel
;
1128 peel_info_hasher::equal (const _vect_peel_info
*a
, const _vect_peel_info
*b
)
1130 return (a
->npeel
== b
->npeel
);
1134 /* Insert DR into peeling hash table with NPEEL as key. */
1137 vect_peeling_hash_insert (hash_table
<peel_info_hasher
> *peeling_htab
,
1138 loop_vec_info loop_vinfo
, struct data_reference
*dr
,
1141 struct _vect_peel_info elem
, *slot
;
1142 _vect_peel_info
**new_slot
;
1143 bool supportable_dr_alignment
= vect_supportable_dr_alignment (dr
, true);
1146 slot
= peeling_htab
->find (&elem
);
1151 slot
= XNEW (struct _vect_peel_info
);
1152 slot
->npeel
= npeel
;
1155 new_slot
= peeling_htab
->find_slot (slot
, INSERT
);
1159 if (!supportable_dr_alignment
1160 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo
)))
1161 slot
->count
+= VECT_MAX_COST
;
1165 /* Traverse peeling hash table to find peeling option that aligns maximum
1166 number of data accesses. */
1169 vect_peeling_hash_get_most_frequent (_vect_peel_info
**slot
,
1170 _vect_peel_extended_info
*max
)
1172 vect_peel_info elem
= *slot
;
1174 if (elem
->count
> max
->peel_info
.count
1175 || (elem
->count
== max
->peel_info
.count
1176 && max
->peel_info
.npeel
> elem
->npeel
))
1178 max
->peel_info
.npeel
= elem
->npeel
;
1179 max
->peel_info
.count
= elem
->count
;
1180 max
->peel_info
.dr
= elem
->dr
;
1187 /* Traverse peeling hash table and calculate cost for each peeling option.
1188 Find the one with the lowest cost. */
1191 vect_peeling_hash_get_lowest_cost (_vect_peel_info
**slot
,
1192 _vect_peel_extended_info
*min
)
1194 vect_peel_info elem
= *slot
;
1195 int save_misalignment
, dummy
;
1196 unsigned int inside_cost
= 0, outside_cost
= 0, i
;
1197 gimple
*stmt
= DR_STMT (elem
->dr
);
1198 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
1199 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
1200 vec
<data_reference_p
> datarefs
= LOOP_VINFO_DATAREFS (loop_vinfo
);
1201 struct data_reference
*dr
;
1202 stmt_vector_for_cost prologue_cost_vec
, body_cost_vec
, epilogue_cost_vec
;
1204 prologue_cost_vec
.create (2);
1205 body_cost_vec
.create (2);
1206 epilogue_cost_vec
.create (2);
1208 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
1210 stmt
= DR_STMT (dr
);
1211 stmt_info
= vinfo_for_stmt (stmt
);
1212 /* For interleaving, only the alignment of the first access
1214 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1215 && GROUP_FIRST_ELEMENT (stmt_info
) != stmt
)
1218 /* Strided accesses perform only component accesses, alignment is
1219 irrelevant for them. */
1220 if (STMT_VINFO_STRIDED_P (stmt_info
)
1221 && !STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1224 save_misalignment
= DR_MISALIGNMENT (dr
);
1225 vect_update_misalignment_for_peel (dr
, elem
->dr
, elem
->npeel
);
1226 vect_get_data_access_cost (dr
, &inside_cost
, &outside_cost
,
1228 SET_DR_MISALIGNMENT (dr
, save_misalignment
);
1231 outside_cost
+= vect_get_known_peeling_cost
1232 (loop_vinfo
, elem
->npeel
, &dummy
,
1233 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo
),
1234 &prologue_cost_vec
, &epilogue_cost_vec
);
1236 /* Prologue and epilogue costs are added to the target model later.
1237 These costs depend only on the scalar iteration cost, the
1238 number of peeling iterations finally chosen, and the number of
1239 misaligned statements. So discard the information found here. */
1240 prologue_cost_vec
.release ();
1241 epilogue_cost_vec
.release ();
1243 if (inside_cost
< min
->inside_cost
1244 || (inside_cost
== min
->inside_cost
&& outside_cost
< min
->outside_cost
))
1246 min
->inside_cost
= inside_cost
;
1247 min
->outside_cost
= outside_cost
;
1248 min
->body_cost_vec
.release ();
1249 min
->body_cost_vec
= body_cost_vec
;
1250 min
->peel_info
.dr
= elem
->dr
;
1251 min
->peel_info
.npeel
= elem
->npeel
;
1254 body_cost_vec
.release ();
1260 /* Choose best peeling option by traversing peeling hash table and either
1261 choosing an option with the lowest cost (if cost model is enabled) or the
1262 option that aligns as many accesses as possible. */
1264 static struct data_reference
*
1265 vect_peeling_hash_choose_best_peeling (hash_table
<peel_info_hasher
> *peeling_htab
,
1266 loop_vec_info loop_vinfo
,
1267 unsigned int *npeel
,
1268 stmt_vector_for_cost
*body_cost_vec
)
1270 struct _vect_peel_extended_info res
;
1272 res
.peel_info
.dr
= NULL
;
1273 res
.body_cost_vec
= stmt_vector_for_cost ();
1275 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo
)))
1277 res
.inside_cost
= INT_MAX
;
1278 res
.outside_cost
= INT_MAX
;
1279 peeling_htab
->traverse
<_vect_peel_extended_info
*,
1280 vect_peeling_hash_get_lowest_cost
> (&res
);
1284 res
.peel_info
.count
= 0;
1285 peeling_htab
->traverse
<_vect_peel_extended_info
*,
1286 vect_peeling_hash_get_most_frequent
> (&res
);
1289 *npeel
= res
.peel_info
.npeel
;
1290 *body_cost_vec
= res
.body_cost_vec
;
1291 return res
.peel_info
.dr
;
1295 /* Function vect_enhance_data_refs_alignment
1297 This pass will use loop versioning and loop peeling in order to enhance
1298 the alignment of data references in the loop.
1300 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1301 original loop is to be vectorized. Any other loops that are created by
1302 the transformations performed in this pass - are not supposed to be
1303 vectorized. This restriction will be relaxed.
1305 This pass will require a cost model to guide it whether to apply peeling
1306 or versioning or a combination of the two. For example, the scheme that
1307 intel uses when given a loop with several memory accesses, is as follows:
1308 choose one memory access ('p') which alignment you want to force by doing
1309 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1310 other accesses are not necessarily aligned, or (2) use loop versioning to
1311 generate one loop in which all accesses are aligned, and another loop in
1312 which only 'p' is necessarily aligned.
1314 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1315 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
1316 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1318 Devising a cost model is the most critical aspect of this work. It will
1319 guide us on which access to peel for, whether to use loop versioning, how
1320 many versions to create, etc. The cost model will probably consist of
1321 generic considerations as well as target specific considerations (on
1322 powerpc for example, misaligned stores are more painful than misaligned
1325 Here are the general steps involved in alignment enhancements:
1327 -- original loop, before alignment analysis:
1328 for (i=0; i<N; i++){
1329 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1330 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1333 -- After vect_compute_data_refs_alignment:
1334 for (i=0; i<N; i++){
1335 x = q[i]; # DR_MISALIGNMENT(q) = 3
1336 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1339 -- Possibility 1: we do loop versioning:
1341 for (i=0; i<N; i++){ # loop 1A
1342 x = q[i]; # DR_MISALIGNMENT(q) = 3
1343 p[i] = y; # DR_MISALIGNMENT(p) = 0
1347 for (i=0; i<N; i++){ # loop 1B
1348 x = q[i]; # DR_MISALIGNMENT(q) = 3
1349 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1353 -- Possibility 2: we do loop peeling:
1354 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1358 for (i = 3; i < N; i++){ # loop 2A
1359 x = q[i]; # DR_MISALIGNMENT(q) = 0
1360 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1363 -- Possibility 3: combination of loop peeling and versioning:
1364 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1369 for (i = 3; i<N; i++){ # loop 3A
1370 x = q[i]; # DR_MISALIGNMENT(q) = 0
1371 p[i] = y; # DR_MISALIGNMENT(p) = 0
1375 for (i = 3; i<N; i++){ # loop 3B
1376 x = q[i]; # DR_MISALIGNMENT(q) = 0
1377 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1381 These loops are later passed to loop_transform to be vectorized. The
1382 vectorizer will use the alignment information to guide the transformation
1383 (whether to generate regular loads/stores, or with special handling for
1387 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo
)
1389 vec
<data_reference_p
> datarefs
= LOOP_VINFO_DATAREFS (loop_vinfo
);
1390 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1391 enum dr_alignment_support supportable_dr_alignment
;
1392 struct data_reference
*dr0
= NULL
, *first_store
= NULL
;
1393 struct data_reference
*dr
;
1395 bool do_peeling
= false;
1396 bool do_versioning
= false;
1399 stmt_vec_info stmt_info
;
1400 unsigned int npeel
= 0;
1401 bool all_misalignments_unknown
= true;
1402 unsigned int vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
1403 unsigned possible_npeel_number
= 1;
1405 unsigned int nelements
, mis
, same_align_drs_max
= 0;
1406 stmt_vector_for_cost body_cost_vec
= stmt_vector_for_cost ();
1407 hash_table
<peel_info_hasher
> peeling_htab (1);
1409 if (dump_enabled_p ())
1410 dump_printf_loc (MSG_NOTE
, vect_location
,
1411 "=== vect_enhance_data_refs_alignment ===\n");
1413 /* Reset data so we can safely be called multiple times. */
1414 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).truncate (0);
1415 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
) = 0;
1417 /* While cost model enhancements are expected in the future, the high level
1418 view of the code at this time is as follows:
1420 A) If there is a misaligned access then see if peeling to align
1421 this access can make all data references satisfy
1422 vect_supportable_dr_alignment. If so, update data structures
1423 as needed and return true.
1425 B) If peeling wasn't possible and there is a data reference with an
1426 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1427 then see if loop versioning checks can be used to make all data
1428 references satisfy vect_supportable_dr_alignment. If so, update
1429 data structures as needed and return true.
1431 C) If neither peeling nor versioning were successful then return false if
1432 any data reference does not satisfy vect_supportable_dr_alignment.
1434 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1436 Note, Possibility 3 above (which is peeling and versioning together) is not
1437 being done at this time. */
1439 /* (1) Peeling to force alignment. */
1441 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1443 + How many accesses will become aligned due to the peeling
1444 - How many accesses will become unaligned due to the peeling,
1445 and the cost of misaligned accesses.
1446 - The cost of peeling (the extra runtime checks, the increase
1449 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
1451 stmt
= DR_STMT (dr
);
1452 stmt_info
= vinfo_for_stmt (stmt
);
1454 if (!STMT_VINFO_RELEVANT_P (stmt_info
))
1457 /* For interleaving, only the alignment of the first access
1459 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1460 && GROUP_FIRST_ELEMENT (stmt_info
) != stmt
)
1463 /* For invariant accesses there is nothing to enhance. */
1464 if (integer_zerop (DR_STEP (dr
)))
1467 /* Strided accesses perform only component accesses, alignment is
1468 irrelevant for them. */
1469 if (STMT_VINFO_STRIDED_P (stmt_info
)
1470 && !STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1473 supportable_dr_alignment
= vect_supportable_dr_alignment (dr
, true);
1474 do_peeling
= vector_alignment_reachable_p (dr
);
1477 if (known_alignment_for_access_p (dr
))
1479 unsigned int npeel_tmp
;
1480 bool negative
= tree_int_cst_compare (DR_STEP (dr
),
1481 size_zero_node
) < 0;
1483 /* Save info about DR in the hash table. */
1484 vectype
= STMT_VINFO_VECTYPE (stmt_info
);
1485 nelements
= TYPE_VECTOR_SUBPARTS (vectype
);
1486 mis
= DR_MISALIGNMENT (dr
) / GET_MODE_SIZE (TYPE_MODE (
1487 TREE_TYPE (DR_REF (dr
))));
1488 npeel_tmp
= (negative
1489 ? (mis
- nelements
) : (nelements
- mis
))
1492 /* For multiple types, it is possible that the bigger type access
1493 will have more than one peeling option. E.g., a loop with two
1494 types: one of size (vector size / 4), and the other one of
1495 size (vector size / 8). Vectorization factor will 8. If both
1496 access are misaligned by 3, the first one needs one scalar
1497 iteration to be aligned, and the second one needs 5. But the
1498 first one will be aligned also by peeling 5 scalar
1499 iterations, and in that case both accesses will be aligned.
1500 Hence, except for the immediate peeling amount, we also want
1501 to try to add full vector size, while we don't exceed
1502 vectorization factor.
1503 We do this automtically for cost model, since we calculate cost
1504 for every peeling option. */
1505 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo
)))
1507 if (STMT_SLP_TYPE (stmt_info
))
1508 possible_npeel_number
1509 = (vf
* GROUP_SIZE (stmt_info
)) / nelements
;
1511 possible_npeel_number
= vf
/ nelements
;
1514 /* Handle the aligned case. We may decide to align some other
1515 access, making DR unaligned. */
1516 if (DR_MISALIGNMENT (dr
) == 0)
1519 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo
)))
1520 possible_npeel_number
++;
1523 for (j
= 0; j
< possible_npeel_number
; j
++)
1525 vect_peeling_hash_insert (&peeling_htab
, loop_vinfo
,
1527 npeel_tmp
+= nelements
;
1530 all_misalignments_unknown
= false;
1531 /* Data-ref that was chosen for the case that all the
1532 misalignments are unknown is not relevant anymore, since we
1533 have a data-ref with known alignment. */
1538 /* If we don't know any misalignment values, we prefer
1539 peeling for data-ref that has the maximum number of data-refs
1540 with the same alignment, unless the target prefers to align
1541 stores over load. */
1542 if (all_misalignments_unknown
)
1544 unsigned same_align_drs
1545 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info
).length ();
1547 || same_align_drs_max
< same_align_drs
)
1549 same_align_drs_max
= same_align_drs
;
1552 /* For data-refs with the same number of related
1553 accesses prefer the one where the misalign
1554 computation will be invariant in the outermost loop. */
1555 else if (same_align_drs_max
== same_align_drs
)
1557 struct loop
*ivloop0
, *ivloop
;
1558 ivloop0
= outermost_invariant_loop_for_expr
1559 (loop
, DR_BASE_ADDRESS (dr0
));
1560 ivloop
= outermost_invariant_loop_for_expr
1561 (loop
, DR_BASE_ADDRESS (dr
));
1562 if ((ivloop
&& !ivloop0
)
1563 || (ivloop
&& ivloop0
1564 && flow_loop_nested_p (ivloop
, ivloop0
)))
1568 if (!first_store
&& DR_IS_WRITE (dr
))
1572 /* If there are both known and unknown misaligned accesses in the
1573 loop, we choose peeling amount according to the known
1575 if (!supportable_dr_alignment
)
1578 if (!first_store
&& DR_IS_WRITE (dr
))
1585 if (!aligned_access_p (dr
))
1587 if (dump_enabled_p ())
1588 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1589 "vector alignment may not be reachable\n");
1595 /* Check if we can possibly peel the loop. */
1596 if (!vect_can_advance_ivs_p (loop_vinfo
)
1597 || !slpeel_can_duplicate_loop_p (loop
, single_exit (loop
))
1602 && all_misalignments_unknown
1603 && vect_supportable_dr_alignment (dr0
, false))
1605 /* Check if the target requires to prefer stores over loads, i.e., if
1606 misaligned stores are more expensive than misaligned loads (taking
1607 drs with same alignment into account). */
1608 if (first_store
&& DR_IS_READ (dr0
))
1610 unsigned int load_inside_cost
= 0, load_outside_cost
= 0;
1611 unsigned int store_inside_cost
= 0, store_outside_cost
= 0;
1612 unsigned int load_inside_penalty
= 0, load_outside_penalty
= 0;
1613 unsigned int store_inside_penalty
= 0, store_outside_penalty
= 0;
1614 stmt_vector_for_cost dummy
;
1617 vect_get_data_access_cost (dr0
, &load_inside_cost
, &load_outside_cost
,
1619 vect_get_data_access_cost (first_store
, &store_inside_cost
,
1620 &store_outside_cost
, &dummy
);
1624 /* Calculate the penalty for leaving FIRST_STORE unaligned (by
1625 aligning the load DR0). */
1626 load_inside_penalty
= store_inside_cost
;
1627 load_outside_penalty
= store_outside_cost
;
1629 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1630 DR_STMT (first_store
))).iterate (i
, &dr
);
1632 if (DR_IS_READ (dr
))
1634 load_inside_penalty
+= load_inside_cost
;
1635 load_outside_penalty
+= load_outside_cost
;
1639 load_inside_penalty
+= store_inside_cost
;
1640 load_outside_penalty
+= store_outside_cost
;
1643 /* Calculate the penalty for leaving DR0 unaligned (by
1644 aligning the FIRST_STORE). */
1645 store_inside_penalty
= load_inside_cost
;
1646 store_outside_penalty
= load_outside_cost
;
1648 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1649 DR_STMT (dr0
))).iterate (i
, &dr
);
1651 if (DR_IS_READ (dr
))
1653 store_inside_penalty
+= load_inside_cost
;
1654 store_outside_penalty
+= load_outside_cost
;
1658 store_inside_penalty
+= store_inside_cost
;
1659 store_outside_penalty
+= store_outside_cost
;
1662 if (load_inside_penalty
> store_inside_penalty
1663 || (load_inside_penalty
== store_inside_penalty
1664 && load_outside_penalty
> store_outside_penalty
))
1668 /* In case there are only loads with different unknown misalignments, use
1669 peeling only if it may help to align other accesses in the loop or
1670 if it may help improving load bandwith when we'd end up using
1672 tree dr0_vt
= STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr0
)));
1674 && !STMT_VINFO_SAME_ALIGN_REFS (
1675 vinfo_for_stmt (DR_STMT (dr0
))).length ()
1676 && (vect_supportable_dr_alignment (dr0
, false)
1677 != dr_unaligned_supported
1678 || (builtin_vectorization_cost (vector_load
, dr0_vt
, 0)
1679 == builtin_vectorization_cost (unaligned_load
, dr0_vt
, -1))))
1683 if (do_peeling
&& !dr0
)
1685 /* Peeling is possible, but there is no data access that is not supported
1686 unless aligned. So we try to choose the best possible peeling. */
1688 /* We should get here only if there are drs with known misalignment. */
1689 gcc_assert (!all_misalignments_unknown
);
1691 /* Choose the best peeling from the hash table. */
1692 dr0
= vect_peeling_hash_choose_best_peeling (&peeling_htab
,
1701 stmt
= DR_STMT (dr0
);
1702 stmt_info
= vinfo_for_stmt (stmt
);
1703 vectype
= STMT_VINFO_VECTYPE (stmt_info
);
1704 nelements
= TYPE_VECTOR_SUBPARTS (vectype
);
1706 if (known_alignment_for_access_p (dr0
))
1708 bool negative
= tree_int_cst_compare (DR_STEP (dr0
),
1709 size_zero_node
) < 0;
1712 /* Since it's known at compile time, compute the number of
1713 iterations in the peeled loop (the peeling factor) for use in
1714 updating DR_MISALIGNMENT values. The peeling factor is the
1715 vectorization factor minus the misalignment as an element
1717 mis
= DR_MISALIGNMENT (dr0
);
1718 mis
/= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0
))));
1719 npeel
= ((negative
? mis
- nelements
: nelements
- mis
)
1723 /* For interleaved data access every iteration accesses all the
1724 members of the group, therefore we divide the number of iterations
1725 by the group size. */
1726 stmt_info
= vinfo_for_stmt (DR_STMT (dr0
));
1727 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1728 npeel
/= GROUP_SIZE (stmt_info
);
1730 if (dump_enabled_p ())
1731 dump_printf_loc (MSG_NOTE
, vect_location
,
1732 "Try peeling by %d\n", npeel
);
1735 /* Ensure that all data refs can be vectorized after the peel. */
1736 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
1738 int save_misalignment
;
1743 stmt
= DR_STMT (dr
);
1744 stmt_info
= vinfo_for_stmt (stmt
);
1745 /* For interleaving, only the alignment of the first access
1747 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1748 && GROUP_FIRST_ELEMENT (stmt_info
) != stmt
)
1751 /* Strided accesses perform only component accesses, alignment is
1752 irrelevant for them. */
1753 if (STMT_VINFO_STRIDED_P (stmt_info
)
1754 && !STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1757 save_misalignment
= DR_MISALIGNMENT (dr
);
1758 vect_update_misalignment_for_peel (dr
, dr0
, npeel
);
1759 supportable_dr_alignment
= vect_supportable_dr_alignment (dr
, false);
1760 SET_DR_MISALIGNMENT (dr
, save_misalignment
);
1762 if (!supportable_dr_alignment
)
1769 if (do_peeling
&& known_alignment_for_access_p (dr0
) && npeel
== 0)
1771 stat
= vect_verify_datarefs_alignment (loop_vinfo
);
1776 body_cost_vec
.release ();
1781 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
1784 unsigned max_allowed_peel
1785 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT
);
1786 if (max_allowed_peel
!= (unsigned)-1)
1788 unsigned max_peel
= npeel
;
1791 gimple
*dr_stmt
= DR_STMT (dr0
);
1792 stmt_vec_info vinfo
= vinfo_for_stmt (dr_stmt
);
1793 tree vtype
= STMT_VINFO_VECTYPE (vinfo
);
1794 max_peel
= TYPE_VECTOR_SUBPARTS (vtype
) - 1;
1796 if (max_peel
> max_allowed_peel
)
1799 if (dump_enabled_p ())
1800 dump_printf_loc (MSG_NOTE
, vect_location
,
1801 "Disable peeling, max peels reached: %d\n", max_peel
);
1806 /* Cost model #2 - if peeling may result in a remaining loop not
1807 iterating enough to be vectorized then do not peel. */
1809 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
))
1812 = npeel
== 0 ? LOOP_VINFO_VECT_FACTOR (loop_vinfo
) - 1 : npeel
;
1813 if (LOOP_VINFO_INT_NITERS (loop_vinfo
)
1814 < LOOP_VINFO_VECT_FACTOR (loop_vinfo
) + max_peel
)
1820 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1821 If the misalignment of DR_i is identical to that of dr0 then set
1822 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
1823 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1824 by the peeling factor times the element size of DR_i (MOD the
1825 vectorization factor times the size). Otherwise, the
1826 misalignment of DR_i must be set to unknown. */
1827 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
1830 /* Strided accesses perform only component accesses, alignment
1831 is irrelevant for them. */
1832 stmt_info
= vinfo_for_stmt (DR_STMT (dr
));
1833 if (STMT_VINFO_STRIDED_P (stmt_info
)
1834 && !STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1837 vect_update_misalignment_for_peel (dr
, dr0
, npeel
);
1840 LOOP_VINFO_UNALIGNED_DR (loop_vinfo
) = dr0
;
1842 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
) = npeel
;
1844 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
)
1845 = DR_MISALIGNMENT (dr0
);
1846 SET_DR_MISALIGNMENT (dr0
, 0);
1847 if (dump_enabled_p ())
1849 dump_printf_loc (MSG_NOTE
, vect_location
,
1850 "Alignment of access forced using peeling.\n");
1851 dump_printf_loc (MSG_NOTE
, vect_location
,
1852 "Peeling for alignment will be applied.\n");
1854 /* The inside-loop cost will be accounted for in vectorizable_load
1855 and vectorizable_store correctly with adjusted alignments.
1856 Drop the body_cst_vec on the floor here. */
1857 body_cost_vec
.release ();
1859 stat
= vect_verify_datarefs_alignment (loop_vinfo
);
1865 body_cost_vec
.release ();
1867 /* (2) Versioning to force alignment. */
1869 /* Try versioning if:
1870 1) optimize loop for speed
1871 2) there is at least one unsupported misaligned data ref with an unknown
1873 3) all misaligned data refs with a known misalignment are supported, and
1874 4) the number of runtime alignment checks is within reason. */
1877 optimize_loop_nest_for_speed_p (loop
)
1878 && (!loop
->inner
); /* FORNOW */
1882 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
1884 stmt
= DR_STMT (dr
);
1885 stmt_info
= vinfo_for_stmt (stmt
);
1887 /* For interleaving, only the alignment of the first access
1889 if (aligned_access_p (dr
)
1890 || (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1891 && GROUP_FIRST_ELEMENT (stmt_info
) != stmt
))
1894 if (STMT_VINFO_STRIDED_P (stmt_info
))
1896 /* Strided loads perform only component accesses, alignment is
1897 irrelevant for them. */
1898 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1900 do_versioning
= false;
1904 supportable_dr_alignment
= vect_supportable_dr_alignment (dr
, false);
1906 if (!supportable_dr_alignment
)
1912 if (known_alignment_for_access_p (dr
)
1913 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).length ()
1914 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS
))
1916 do_versioning
= false;
1920 stmt
= DR_STMT (dr
);
1921 vectype
= STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt
));
1922 gcc_assert (vectype
);
1924 /* The rightmost bits of an aligned address must be zeros.
1925 Construct the mask needed for this test. For example,
1926 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
1927 mask must be 15 = 0xf. */
1928 mask
= GET_MODE_SIZE (TYPE_MODE (vectype
)) - 1;
1930 /* FORNOW: use the same mask to test all potentially unaligned
1931 references in the loop. The vectorizer currently supports
1932 a single vector size, see the reference to
1933 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
1934 vectorization factor is computed. */
1935 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo
)
1936 || LOOP_VINFO_PTR_MASK (loop_vinfo
) == mask
);
1937 LOOP_VINFO_PTR_MASK (loop_vinfo
) = mask
;
1938 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).safe_push (
1943 /* Versioning requires at least one misaligned data reference. */
1944 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo
))
1945 do_versioning
= false;
1946 else if (!do_versioning
)
1947 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).truncate (0);
1952 vec
<gimple
*> may_misalign_stmts
1953 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
);
1956 /* It can now be assumed that the data references in the statements
1957 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
1958 of the loop being vectorized. */
1959 FOR_EACH_VEC_ELT (may_misalign_stmts
, i
, stmt
)
1961 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
1962 dr
= STMT_VINFO_DATA_REF (stmt_info
);
1963 SET_DR_MISALIGNMENT (dr
, 0);
1964 if (dump_enabled_p ())
1965 dump_printf_loc (MSG_NOTE
, vect_location
,
1966 "Alignment of access forced using versioning.\n");
1969 if (dump_enabled_p ())
1970 dump_printf_loc (MSG_NOTE
, vect_location
,
1971 "Versioning for alignment will be applied.\n");
1973 /* Peeling and versioning can't be done together at this time. */
1974 gcc_assert (! (do_peeling
&& do_versioning
));
1976 stat
= vect_verify_datarefs_alignment (loop_vinfo
);
1981 /* This point is reached if neither peeling nor versioning is being done. */
1982 gcc_assert (! (do_peeling
|| do_versioning
));
1984 stat
= vect_verify_datarefs_alignment (loop_vinfo
);
1989 /* Function vect_find_same_alignment_drs.
1991 Update group and alignment relations according to the chosen
1992 vectorization factor. */
1995 vect_find_same_alignment_drs (struct data_dependence_relation
*ddr
,
1996 loop_vec_info loop_vinfo
)
1999 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
2000 int vectorization_factor
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
2001 struct data_reference
*dra
= DDR_A (ddr
);
2002 struct data_reference
*drb
= DDR_B (ddr
);
2003 stmt_vec_info stmtinfo_a
= vinfo_for_stmt (DR_STMT (dra
));
2004 stmt_vec_info stmtinfo_b
= vinfo_for_stmt (DR_STMT (drb
));
2005 int dra_size
= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra
))));
2006 int drb_size
= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb
))));
2007 lambda_vector dist_v
;
2008 unsigned int loop_depth
;
2010 if (DDR_ARE_DEPENDENT (ddr
) == chrec_known
)
2016 if (DDR_ARE_DEPENDENT (ddr
) == chrec_dont_know
)
2019 /* Loop-based vectorization and known data dependence. */
2020 if (DDR_NUM_DIST_VECTS (ddr
) == 0)
2023 /* Data-dependence analysis reports a distance vector of zero
2024 for data-references that overlap only in the first iteration
2025 but have different sign step (see PR45764).
2026 So as a sanity check require equal DR_STEP. */
2027 if (!operand_equal_p (DR_STEP (dra
), DR_STEP (drb
), 0))
2030 loop_depth
= index_in_loop_nest (loop
->num
, DDR_LOOP_NEST (ddr
));
2031 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr
), i
, dist_v
)
2033 int dist
= dist_v
[loop_depth
];
2035 if (dump_enabled_p ())
2036 dump_printf_loc (MSG_NOTE
, vect_location
,
2037 "dependence distance = %d.\n", dist
);
2039 /* Same loop iteration. */
2041 || (dist
% vectorization_factor
== 0 && dra_size
== drb_size
))
2043 /* Two references with distance zero have the same alignment. */
2044 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a
).safe_push (drb
);
2045 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b
).safe_push (dra
);
2046 if (dump_enabled_p ())
2048 dump_printf_loc (MSG_NOTE
, vect_location
,
2049 "accesses have the same alignment.\n");
2050 dump_printf (MSG_NOTE
,
2051 "dependence distance modulo vf == 0 between ");
2052 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dra
));
2053 dump_printf (MSG_NOTE
, " and ");
2054 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (drb
));
2055 dump_printf (MSG_NOTE
, "\n");
2062 /* Function vect_analyze_data_refs_alignment
2064 Analyze the alignment of the data-references in the loop.
2065 Return FALSE if a data reference is found that cannot be vectorized. */
2068 vect_analyze_data_refs_alignment (loop_vec_info vinfo
)
2070 if (dump_enabled_p ())
2071 dump_printf_loc (MSG_NOTE
, vect_location
,
2072 "=== vect_analyze_data_refs_alignment ===\n");
2074 /* Mark groups of data references with same alignment using
2075 data dependence information. */
2076 vec
<ddr_p
> ddrs
= vinfo
->ddrs
;
2077 struct data_dependence_relation
*ddr
;
2080 FOR_EACH_VEC_ELT (ddrs
, i
, ddr
)
2081 vect_find_same_alignment_drs (ddr
, vinfo
);
2083 vec
<data_reference_p
> datarefs
= vinfo
->datarefs
;
2084 struct data_reference
*dr
;
2086 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
2088 stmt_vec_info stmt_info
= vinfo_for_stmt (DR_STMT (dr
));
2089 if (STMT_VINFO_VECTORIZABLE (stmt_info
)
2090 && !vect_compute_data_ref_alignment (dr
))
2092 /* Strided accesses perform only component accesses, misalignment
2093 information is irrelevant for them. */
2094 if (STMT_VINFO_STRIDED_P (stmt_info
)
2095 && !STMT_VINFO_GROUPED_ACCESS (stmt_info
))
2098 if (dump_enabled_p ())
2099 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2100 "not vectorized: can't calculate alignment "
2111 /* Analyze alignment of DRs of stmts in NODE. */
2114 vect_slp_analyze_and_verify_node_alignment (slp_tree node
)
2116 /* We vectorize from the first scalar stmt in the node unless
2117 the node is permuted in which case we start from the first
2118 element in the group. */
2119 gimple
*first_stmt
= SLP_TREE_SCALAR_STMTS (node
)[0];
2120 data_reference_p first_dr
= STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt
));
2121 if (SLP_TREE_LOAD_PERMUTATION (node
).exists ())
2122 first_stmt
= GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt
));
2124 data_reference_p dr
= STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt
));
2125 if (! vect_compute_data_ref_alignment (dr
)
2126 /* For creating the data-ref pointer we need alignment of the
2127 first element anyway. */
2129 && ! vect_compute_data_ref_alignment (first_dr
))
2130 || ! verify_data_ref_alignment (dr
))
2132 if (dump_enabled_p ())
2133 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2134 "not vectorized: bad data alignment in basic "
2142 /* Function vect_slp_analyze_instance_alignment
2144 Analyze the alignment of the data-references in the SLP instance.
2145 Return FALSE if a data reference is found that cannot be vectorized. */
2148 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance
)
2150 if (dump_enabled_p ())
2151 dump_printf_loc (MSG_NOTE
, vect_location
,
2152 "=== vect_slp_analyze_and_verify_instance_alignment ===\n");
2156 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance
), i
, node
)
2157 if (! vect_slp_analyze_and_verify_node_alignment (node
))
2160 node
= SLP_INSTANCE_TREE (instance
);
2161 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node
)[0]))
2162 && ! vect_slp_analyze_and_verify_node_alignment
2163 (SLP_INSTANCE_TREE (instance
)))
2170 /* Analyze groups of accesses: check that DR belongs to a group of
2171 accesses of legal size, step, etc. Detect gaps, single element
2172 interleaving, and other special cases. Set grouped access info.
2173 Collect groups of strided stores for further use in SLP analysis.
2174 Worker for vect_analyze_group_access. */
2177 vect_analyze_group_access_1 (struct data_reference
*dr
)
2179 tree step
= DR_STEP (dr
);
2180 tree scalar_type
= TREE_TYPE (DR_REF (dr
));
2181 HOST_WIDE_INT type_size
= TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type
));
2182 gimple
*stmt
= DR_STMT (dr
);
2183 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
2184 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
2185 bb_vec_info bb_vinfo
= STMT_VINFO_BB_VINFO (stmt_info
);
2186 HOST_WIDE_INT dr_step
= -1;
2187 HOST_WIDE_INT groupsize
, last_accessed_element
= 1;
2188 bool slp_impossible
= false;
2190 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2191 size of the interleaving group (including gaps). */
2192 if (tree_fits_shwi_p (step
))
2194 dr_step
= tree_to_shwi (step
);
2195 /* Check that STEP is a multiple of type size. Otherwise there is
2196 a non-element-sized gap at the end of the group which we
2197 cannot represent in GROUP_GAP or GROUP_SIZE.
2198 ??? As we can handle non-constant step fine here we should
2199 simply remove uses of GROUP_GAP between the last and first
2200 element and instead rely on DR_STEP. GROUP_SIZE then would
2201 simply not include that gap. */
2202 if ((dr_step
% type_size
) != 0)
2204 if (dump_enabled_p ())
2206 dump_printf_loc (MSG_NOTE
, vect_location
,
2208 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, step
);
2209 dump_printf (MSG_NOTE
,
2210 " is not a multiple of the element size for ");
2211 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dr
));
2212 dump_printf (MSG_NOTE
, "\n");
2216 groupsize
= absu_hwi (dr_step
) / type_size
;
2221 /* Not consecutive access is possible only if it is a part of interleaving. */
2222 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)))
2224 /* Check if it this DR is a part of interleaving, and is a single
2225 element of the group that is accessed in the loop. */
2227 /* Gaps are supported only for loads. STEP must be a multiple of the type
2228 size. The size of the group must be a power of 2. */
2230 && (dr_step
% type_size
) == 0
2232 && exact_log2 (groupsize
) != -1)
2234 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) = stmt
;
2235 GROUP_SIZE (vinfo_for_stmt (stmt
)) = groupsize
;
2236 if (dump_enabled_p ())
2238 dump_printf_loc (MSG_NOTE
, vect_location
,
2239 "Detected single element interleaving ");
2240 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dr
));
2241 dump_printf (MSG_NOTE
, " step ");
2242 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, step
);
2243 dump_printf (MSG_NOTE
, "\n");
2249 if (dump_enabled_p ())
2251 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2252 "not consecutive access ");
2253 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
2258 /* Mark the statement as unvectorizable. */
2259 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr
))) = false;
2263 dump_printf_loc (MSG_NOTE
, vect_location
, "using strided accesses\n");
2264 STMT_VINFO_STRIDED_P (stmt_info
) = true;
2268 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) == stmt
)
2270 /* First stmt in the interleaving chain. Check the chain. */
2271 gimple
*next
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt
));
2272 struct data_reference
*data_ref
= dr
;
2273 unsigned int count
= 1;
2274 tree prev_init
= DR_INIT (data_ref
);
2275 gimple
*prev
= stmt
;
2276 HOST_WIDE_INT diff
, gaps
= 0;
2280 /* Skip same data-refs. In case that two or more stmts share
2281 data-ref (supported only for loads), we vectorize only the first
2282 stmt, and the rest get their vectorized loads from the first
2284 if (!tree_int_cst_compare (DR_INIT (data_ref
),
2285 DR_INIT (STMT_VINFO_DATA_REF (
2286 vinfo_for_stmt (next
)))))
2288 if (DR_IS_WRITE (data_ref
))
2290 if (dump_enabled_p ())
2291 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2292 "Two store stmts share the same dr.\n");
2296 if (dump_enabled_p ())
2297 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2298 "Two or more load stmts share the same dr.\n");
2300 /* For load use the same data-ref load. */
2301 GROUP_SAME_DR_STMT (vinfo_for_stmt (next
)) = prev
;
2304 next
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (next
));
2309 data_ref
= STMT_VINFO_DATA_REF (vinfo_for_stmt (next
));
2311 /* All group members have the same STEP by construction. */
2312 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref
), step
, 0));
2314 /* Check that the distance between two accesses is equal to the type
2315 size. Otherwise, we have gaps. */
2316 diff
= (TREE_INT_CST_LOW (DR_INIT (data_ref
))
2317 - TREE_INT_CST_LOW (prev_init
)) / type_size
;
2320 /* FORNOW: SLP of accesses with gaps is not supported. */
2321 slp_impossible
= true;
2322 if (DR_IS_WRITE (data_ref
))
2324 if (dump_enabled_p ())
2325 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2326 "interleaved store with gaps\n");
2333 last_accessed_element
+= diff
;
2335 /* Store the gap from the previous member of the group. If there is no
2336 gap in the access, GROUP_GAP is always 1. */
2337 GROUP_GAP (vinfo_for_stmt (next
)) = diff
;
2339 prev_init
= DR_INIT (data_ref
);
2340 next
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (next
));
2341 /* Count the number of data-refs in the chain. */
2346 groupsize
= count
+ gaps
;
2348 if (groupsize
> UINT_MAX
)
2350 if (dump_enabled_p ())
2351 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2352 "group is too large\n");
2356 /* Check that the size of the interleaving is equal to count for stores,
2357 i.e., that there are no gaps. */
2358 if (groupsize
!= count
2359 && !DR_IS_READ (dr
))
2361 if (dump_enabled_p ())
2362 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2363 "interleaved store with gaps\n");
2367 /* If there is a gap after the last load in the group it is the
2368 difference between the groupsize and the last accessed
2370 When there is no gap, this difference should be 0. */
2371 GROUP_GAP (vinfo_for_stmt (stmt
)) = groupsize
- last_accessed_element
;
2373 GROUP_SIZE (vinfo_for_stmt (stmt
)) = groupsize
;
2374 if (dump_enabled_p ())
2376 dump_printf_loc (MSG_NOTE
, vect_location
,
2377 "Detected interleaving ");
2378 if (DR_IS_READ (dr
))
2379 dump_printf (MSG_NOTE
, "load ");
2381 dump_printf (MSG_NOTE
, "store ");
2382 dump_printf (MSG_NOTE
, "of size %u starting with ",
2383 (unsigned)groupsize
);
2384 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, stmt
, 0);
2385 if (GROUP_GAP (vinfo_for_stmt (stmt
)) != 0)
2386 dump_printf_loc (MSG_NOTE
, vect_location
,
2387 "There is a gap of %u elements after the group\n",
2388 GROUP_GAP (vinfo_for_stmt (stmt
)));
2391 /* SLP: create an SLP data structure for every interleaving group of
2392 stores for further analysis in vect_analyse_slp. */
2393 if (DR_IS_WRITE (dr
) && !slp_impossible
)
2396 LOOP_VINFO_GROUPED_STORES (loop_vinfo
).safe_push (stmt
);
2398 BB_VINFO_GROUPED_STORES (bb_vinfo
).safe_push (stmt
);
2405 /* Analyze groups of accesses: check that DR belongs to a group of
2406 accesses of legal size, step, etc. Detect gaps, single element
2407 interleaving, and other special cases. Set grouped access info.
2408 Collect groups of strided stores for further use in SLP analysis. */
2411 vect_analyze_group_access (struct data_reference
*dr
)
2413 if (!vect_analyze_group_access_1 (dr
))
2415 /* Dissolve the group if present. */
2417 gimple
*stmt
= GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr
)));
2420 stmt_vec_info vinfo
= vinfo_for_stmt (stmt
);
2421 next
= GROUP_NEXT_ELEMENT (vinfo
);
2422 GROUP_FIRST_ELEMENT (vinfo
) = NULL
;
2423 GROUP_NEXT_ELEMENT (vinfo
) = NULL
;
2431 /* Analyze the access pattern of the data-reference DR.
2432 In case of non-consecutive accesses call vect_analyze_group_access() to
2433 analyze groups of accesses. */
2436 vect_analyze_data_ref_access (struct data_reference
*dr
)
2438 tree step
= DR_STEP (dr
);
2439 tree scalar_type
= TREE_TYPE (DR_REF (dr
));
2440 gimple
*stmt
= DR_STMT (dr
);
2441 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
2442 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
2443 struct loop
*loop
= NULL
;
2446 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
2448 if (loop_vinfo
&& !step
)
2450 if (dump_enabled_p ())
2451 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2452 "bad data-ref access in loop\n");
2456 /* Allow loads with zero step in inner-loop vectorization. */
2457 if (loop_vinfo
&& integer_zerop (step
))
2459 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) = NULL
;
2460 if (!nested_in_vect_loop_p (loop
, stmt
))
2461 return DR_IS_READ (dr
);
2462 /* Allow references with zero step for outer loops marked
2463 with pragma omp simd only - it guarantees absence of
2464 loop-carried dependencies between inner loop iterations. */
2465 if (!loop
->force_vectorize
)
2467 if (dump_enabled_p ())
2468 dump_printf_loc (MSG_NOTE
, vect_location
,
2469 "zero step in inner loop of nest\n");
2474 if (loop
&& nested_in_vect_loop_p (loop
, stmt
))
2476 /* Interleaved accesses are not yet supported within outer-loop
2477 vectorization for references in the inner-loop. */
2478 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) = NULL
;
2480 /* For the rest of the analysis we use the outer-loop step. */
2481 step
= STMT_VINFO_DR_STEP (stmt_info
);
2482 if (integer_zerop (step
))
2484 if (dump_enabled_p ())
2485 dump_printf_loc (MSG_NOTE
, vect_location
,
2486 "zero step in outer loop.\n");
2487 return DR_IS_READ (dr
);
2492 if (TREE_CODE (step
) == INTEGER_CST
)
2494 HOST_WIDE_INT dr_step
= TREE_INT_CST_LOW (step
);
2495 if (!tree_int_cst_compare (step
, TYPE_SIZE_UNIT (scalar_type
))
2497 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type
), -dr_step
)))
2499 /* Mark that it is not interleaving. */
2500 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) = NULL
;
2505 if (loop
&& nested_in_vect_loop_p (loop
, stmt
))
2507 if (dump_enabled_p ())
2508 dump_printf_loc (MSG_NOTE
, vect_location
,
2509 "grouped access in outer loop.\n");
2514 /* Assume this is a DR handled by non-constant strided load case. */
2515 if (TREE_CODE (step
) != INTEGER_CST
)
2516 return (STMT_VINFO_STRIDED_P (stmt_info
)
2517 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info
)
2518 || vect_analyze_group_access (dr
)));
2520 /* Not consecutive access - check if it's a part of interleaving group. */
2521 return vect_analyze_group_access (dr
);
2526 /* A helper function used in the comparator function to sort data
2527 references. T1 and T2 are two data references to be compared.
2528 The function returns -1, 0, or 1. */
2531 compare_tree (tree t1
, tree t2
)
2534 enum tree_code code
;
2547 if (TREE_CODE (t1
) != TREE_CODE (t2
))
2548 return TREE_CODE (t1
) < TREE_CODE (t2
) ? -1 : 1;
2550 code
= TREE_CODE (t1
);
2553 /* For const values, we can just use hash values for comparisons. */
2561 hashval_t h1
= iterative_hash_expr (t1
, 0);
2562 hashval_t h2
= iterative_hash_expr (t2
, 0);
2564 return h1
< h2
? -1 : 1;
2569 cmp
= compare_tree (SSA_NAME_VAR (t1
), SSA_NAME_VAR (t2
));
2573 if (SSA_NAME_VERSION (t1
) != SSA_NAME_VERSION (t2
))
2574 return SSA_NAME_VERSION (t1
) < SSA_NAME_VERSION (t2
) ? -1 : 1;
2578 tclass
= TREE_CODE_CLASS (code
);
2580 /* For var-decl, we could compare their UIDs. */
2581 if (tclass
== tcc_declaration
)
2583 if (DECL_UID (t1
) != DECL_UID (t2
))
2584 return DECL_UID (t1
) < DECL_UID (t2
) ? -1 : 1;
2588 /* For expressions with operands, compare their operands recursively. */
2589 for (i
= TREE_OPERAND_LENGTH (t1
) - 1; i
>= 0; --i
)
2591 cmp
= compare_tree (TREE_OPERAND (t1
, i
), TREE_OPERAND (t2
, i
));
2601 /* Compare two data-references DRA and DRB to group them into chunks
2602 suitable for grouping. */
2605 dr_group_sort_cmp (const void *dra_
, const void *drb_
)
2607 data_reference_p dra
= *(data_reference_p
*)const_cast<void *>(dra_
);
2608 data_reference_p drb
= *(data_reference_p
*)const_cast<void *>(drb_
);
2611 /* Stabilize sort. */
2615 /* DRs in different loops never belong to the same group. */
2616 loop_p loopa
= gimple_bb (DR_STMT (dra
))->loop_father
;
2617 loop_p loopb
= gimple_bb (DR_STMT (drb
))->loop_father
;
2619 return loopa
->num
< loopb
->num
? -1 : 1;
2621 /* Ordering of DRs according to base. */
2622 if (!operand_equal_p (DR_BASE_ADDRESS (dra
), DR_BASE_ADDRESS (drb
), 0))
2624 cmp
= compare_tree (DR_BASE_ADDRESS (dra
), DR_BASE_ADDRESS (drb
));
2629 /* And according to DR_OFFSET. */
2630 if (!dr_equal_offsets_p (dra
, drb
))
2632 cmp
= compare_tree (DR_OFFSET (dra
), DR_OFFSET (drb
));
2637 /* Put reads before writes. */
2638 if (DR_IS_READ (dra
) != DR_IS_READ (drb
))
2639 return DR_IS_READ (dra
) ? -1 : 1;
2641 /* Then sort after access size. */
2642 if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra
))),
2643 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb
))), 0))
2645 cmp
= compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra
))),
2646 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb
))));
2651 /* And after step. */
2652 if (!operand_equal_p (DR_STEP (dra
), DR_STEP (drb
), 0))
2654 cmp
= compare_tree (DR_STEP (dra
), DR_STEP (drb
));
2659 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2660 cmp
= tree_int_cst_compare (DR_INIT (dra
), DR_INIT (drb
));
2662 return gimple_uid (DR_STMT (dra
)) < gimple_uid (DR_STMT (drb
)) ? -1 : 1;
2666 /* Function vect_analyze_data_ref_accesses.
2668 Analyze the access pattern of all the data references in the loop.
2670 FORNOW: the only access pattern that is considered vectorizable is a
2671 simple step 1 (consecutive) access.
2673 FORNOW: handle only arrays and pointer accesses. */
2676 vect_analyze_data_ref_accesses (vec_info
*vinfo
)
2679 vec
<data_reference_p
> datarefs
= vinfo
->datarefs
;
2680 struct data_reference
*dr
;
2682 if (dump_enabled_p ())
2683 dump_printf_loc (MSG_NOTE
, vect_location
,
2684 "=== vect_analyze_data_ref_accesses ===\n");
2686 if (datarefs
.is_empty ())
2689 /* Sort the array of datarefs to make building the interleaving chains
2690 linear. Don't modify the original vector's order, it is needed for
2691 determining what dependencies are reversed. */
2692 vec
<data_reference_p
> datarefs_copy
= datarefs
.copy ();
2693 datarefs_copy
.qsort (dr_group_sort_cmp
);
2695 /* Build the interleaving chains. */
2696 for (i
= 0; i
< datarefs_copy
.length () - 1;)
2698 data_reference_p dra
= datarefs_copy
[i
];
2699 stmt_vec_info stmtinfo_a
= vinfo_for_stmt (DR_STMT (dra
));
2700 stmt_vec_info lastinfo
= NULL
;
2701 for (i
= i
+ 1; i
< datarefs_copy
.length (); ++i
)
2703 data_reference_p drb
= datarefs_copy
[i
];
2704 stmt_vec_info stmtinfo_b
= vinfo_for_stmt (DR_STMT (drb
));
2706 /* ??? Imperfect sorting (non-compatible types, non-modulo
2707 accesses, same accesses) can lead to a group to be artificially
2708 split here as we don't just skip over those. If it really
2709 matters we can push those to a worklist and re-iterate
2710 over them. The we can just skip ahead to the next DR here. */
2712 /* DRs in a different loop should not be put into the same
2713 interleaving group. */
2714 if (gimple_bb (DR_STMT (dra
))->loop_father
2715 != gimple_bb (DR_STMT (drb
))->loop_father
)
2718 /* Check that the data-refs have same first location (except init)
2719 and they are both either store or load (not load and store,
2720 not masked loads or stores). */
2721 if (DR_IS_READ (dra
) != DR_IS_READ (drb
)
2722 || !operand_equal_p (DR_BASE_ADDRESS (dra
),
2723 DR_BASE_ADDRESS (drb
), 0)
2724 || !dr_equal_offsets_p (dra
, drb
)
2725 || !gimple_assign_single_p (DR_STMT (dra
))
2726 || !gimple_assign_single_p (DR_STMT (drb
)))
2729 /* Check that the data-refs have the same constant size. */
2730 tree sza
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra
)));
2731 tree szb
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb
)));
2732 if (!tree_fits_uhwi_p (sza
)
2733 || !tree_fits_uhwi_p (szb
)
2734 || !tree_int_cst_equal (sza
, szb
))
2737 /* Check that the data-refs have the same step. */
2738 if (!operand_equal_p (DR_STEP (dra
), DR_STEP (drb
), 0))
2741 /* Do not place the same access in the interleaving chain twice. */
2742 if (tree_int_cst_compare (DR_INIT (dra
), DR_INIT (drb
)) == 0)
2745 /* Check the types are compatible.
2746 ??? We don't distinguish this during sorting. */
2747 if (!types_compatible_p (TREE_TYPE (DR_REF (dra
)),
2748 TREE_TYPE (DR_REF (drb
))))
2751 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2752 HOST_WIDE_INT init_a
= TREE_INT_CST_LOW (DR_INIT (dra
));
2753 HOST_WIDE_INT init_b
= TREE_INT_CST_LOW (DR_INIT (drb
));
2754 gcc_assert (init_a
< init_b
);
2756 /* If init_b == init_a + the size of the type * k, we have an
2757 interleaving, and DRA is accessed before DRB. */
2758 HOST_WIDE_INT type_size_a
= tree_to_uhwi (sza
);
2759 if (type_size_a
== 0
2760 || (init_b
- init_a
) % type_size_a
!= 0)
2763 /* If we have a store, the accesses are adjacent. This splits
2764 groups into chunks we support (we don't support vectorization
2765 of stores with gaps). */
2766 if (!DR_IS_READ (dra
)
2767 && (init_b
- (HOST_WIDE_INT
) TREE_INT_CST_LOW
2768 (DR_INIT (datarefs_copy
[i
-1]))
2772 /* If the step (if not zero or non-constant) is greater than the
2773 difference between data-refs' inits this splits groups into
2775 if (tree_fits_shwi_p (DR_STEP (dra
)))
2777 HOST_WIDE_INT step
= tree_to_shwi (DR_STEP (dra
));
2778 if (step
!= 0 && step
<= (init_b
- init_a
))
2782 if (dump_enabled_p ())
2784 dump_printf_loc (MSG_NOTE
, vect_location
,
2785 "Detected interleaving ");
2786 if (DR_IS_READ (dra
))
2787 dump_printf (MSG_NOTE
, "load ");
2789 dump_printf (MSG_NOTE
, "store ");
2790 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dra
));
2791 dump_printf (MSG_NOTE
, " and ");
2792 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (drb
));
2793 dump_printf (MSG_NOTE
, "\n");
2796 /* Link the found element into the group list. */
2797 if (!GROUP_FIRST_ELEMENT (stmtinfo_a
))
2799 GROUP_FIRST_ELEMENT (stmtinfo_a
) = DR_STMT (dra
);
2800 lastinfo
= stmtinfo_a
;
2802 GROUP_FIRST_ELEMENT (stmtinfo_b
) = DR_STMT (dra
);
2803 GROUP_NEXT_ELEMENT (lastinfo
) = DR_STMT (drb
);
2804 lastinfo
= stmtinfo_b
;
2808 FOR_EACH_VEC_ELT (datarefs_copy
, i
, dr
)
2809 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr
)))
2810 && !vect_analyze_data_ref_access (dr
))
2812 if (dump_enabled_p ())
2813 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2814 "not vectorized: complicated access pattern.\n");
2816 if (is_a
<bb_vec_info
> (vinfo
))
2818 /* Mark the statement as not vectorizable. */
2819 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr
))) = false;
2824 datarefs_copy
.release ();
2829 datarefs_copy
.release ();
2834 /* Operator == between two dr_with_seg_len objects.
2836 This equality operator is used to make sure two data refs
2837 are the same one so that we will consider to combine the
2838 aliasing checks of those two pairs of data dependent data
2842 operator == (const dr_with_seg_len
& d1
,
2843 const dr_with_seg_len
& d2
)
2845 return operand_equal_p (DR_BASE_ADDRESS (d1
.dr
),
2846 DR_BASE_ADDRESS (d2
.dr
), 0)
2847 && compare_tree (d1
.offset
, d2
.offset
) == 0
2848 && compare_tree (d1
.seg_len
, d2
.seg_len
) == 0;
2851 /* Function comp_dr_with_seg_len_pair.
2853 Comparison function for sorting objects of dr_with_seg_len_pair_t
2854 so that we can combine aliasing checks in one scan. */
2857 comp_dr_with_seg_len_pair (const void *p1_
, const void *p2_
)
2859 const dr_with_seg_len_pair_t
* p1
= (const dr_with_seg_len_pair_t
*) p1_
;
2860 const dr_with_seg_len_pair_t
* p2
= (const dr_with_seg_len_pair_t
*) p2_
;
2862 const dr_with_seg_len
&p11
= p1
->first
,
2867 /* For DR pairs (a, b) and (c, d), we only consider to merge the alias checks
2868 if a and c have the same basic address snd step, and b and d have the same
2869 address and step. Therefore, if any a&c or b&d don't have the same address
2870 and step, we don't care the order of those two pairs after sorting. */
2873 if ((comp_res
= compare_tree (DR_BASE_ADDRESS (p11
.dr
),
2874 DR_BASE_ADDRESS (p21
.dr
))) != 0)
2876 if ((comp_res
= compare_tree (DR_BASE_ADDRESS (p12
.dr
),
2877 DR_BASE_ADDRESS (p22
.dr
))) != 0)
2879 if ((comp_res
= compare_tree (DR_STEP (p11
.dr
), DR_STEP (p21
.dr
))) != 0)
2881 if ((comp_res
= compare_tree (DR_STEP (p12
.dr
), DR_STEP (p22
.dr
))) != 0)
2883 if ((comp_res
= compare_tree (p11
.offset
, p21
.offset
)) != 0)
2885 if ((comp_res
= compare_tree (p12
.offset
, p22
.offset
)) != 0)
2891 /* Function vect_vfa_segment_size.
2893 Create an expression that computes the size of segment
2894 that will be accessed for a data reference. The functions takes into
2895 account that realignment loads may access one more vector.
2898 DR: The data reference.
2899 LENGTH_FACTOR: segment length to consider.
2901 Return an expression whose value is the size of segment which will be
2905 vect_vfa_segment_size (struct data_reference
*dr
, tree length_factor
)
2907 tree segment_length
;
2909 if (integer_zerop (DR_STEP (dr
)))
2910 segment_length
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr
)));
2912 segment_length
= size_binop (MULT_EXPR
,
2913 fold_convert (sizetype
, DR_STEP (dr
)),
2914 fold_convert (sizetype
, length_factor
));
2916 if (vect_supportable_dr_alignment (dr
, false)
2917 == dr_explicit_realign_optimized
)
2919 tree vector_size
= TYPE_SIZE_UNIT
2920 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr
))));
2922 segment_length
= size_binop (PLUS_EXPR
, segment_length
, vector_size
);
2924 return segment_length
;
2927 /* Function vect_prune_runtime_alias_test_list.
2929 Prune a list of ddrs to be tested at run-time by versioning for alias.
2930 Merge several alias checks into one if possible.
2931 Return FALSE if resulting list of ddrs is longer then allowed by
2932 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
2935 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo
)
2937 vec
<ddr_p
> may_alias_ddrs
=
2938 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo
);
2939 vec
<dr_with_seg_len_pair_t
>& comp_alias_ddrs
=
2940 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo
);
2941 int vect_factor
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
2942 tree scalar_loop_iters
= LOOP_VINFO_NITERS (loop_vinfo
);
2948 if (dump_enabled_p ())
2949 dump_printf_loc (MSG_NOTE
, vect_location
,
2950 "=== vect_prune_runtime_alias_test_list ===\n");
2952 if (may_alias_ddrs
.is_empty ())
2955 /* Basically, for each pair of dependent data refs store_ptr_0
2956 and load_ptr_0, we create an expression:
2958 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2959 || (load_ptr_0 + load_segment_length_0) <= store_ptr_0))
2961 for aliasing checks. However, in some cases we can decrease
2962 the number of checks by combining two checks into one. For
2963 example, suppose we have another pair of data refs store_ptr_0
2964 and load_ptr_1, and if the following condition is satisfied:
2966 load_ptr_0 < load_ptr_1 &&
2967 load_ptr_1 - load_ptr_0 - load_segment_length_0 < store_segment_length_0
2969 (this condition means, in each iteration of vectorized loop,
2970 the accessed memory of store_ptr_0 cannot be between the memory
2971 of load_ptr_0 and load_ptr_1.)
2973 we then can use only the following expression to finish the
2974 alising checks between store_ptr_0 & load_ptr_0 and
2975 store_ptr_0 & load_ptr_1:
2977 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2978 || (load_ptr_1 + load_segment_length_1 <= store_ptr_0))
2980 Note that we only consider that load_ptr_0 and load_ptr_1 have the
2981 same basic address. */
2983 comp_alias_ddrs
.create (may_alias_ddrs
.length ());
2985 /* First, we collect all data ref pairs for aliasing checks. */
2986 FOR_EACH_VEC_ELT (may_alias_ddrs
, i
, ddr
)
2988 struct data_reference
*dr_a
, *dr_b
;
2989 gimple
*dr_group_first_a
, *dr_group_first_b
;
2990 tree segment_length_a
, segment_length_b
;
2991 gimple
*stmt_a
, *stmt_b
;
2994 stmt_a
= DR_STMT (DDR_A (ddr
));
2995 dr_group_first_a
= GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a
));
2996 if (dr_group_first_a
)
2998 stmt_a
= dr_group_first_a
;
2999 dr_a
= STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a
));
3003 stmt_b
= DR_STMT (DDR_B (ddr
));
3004 dr_group_first_b
= GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b
));
3005 if (dr_group_first_b
)
3007 stmt_b
= dr_group_first_b
;
3008 dr_b
= STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b
));
3011 if (!operand_equal_p (DR_STEP (dr_a
), DR_STEP (dr_b
), 0))
3012 length_factor
= scalar_loop_iters
;
3014 length_factor
= size_int (vect_factor
);
3015 segment_length_a
= vect_vfa_segment_size (dr_a
, length_factor
);
3016 segment_length_b
= vect_vfa_segment_size (dr_b
, length_factor
);
3018 dr_with_seg_len_pair_t dr_with_seg_len_pair
3019 (dr_with_seg_len (dr_a
, segment_length_a
),
3020 dr_with_seg_len (dr_b
, segment_length_b
));
3022 if (compare_tree (DR_BASE_ADDRESS (dr_a
), DR_BASE_ADDRESS (dr_b
)) > 0)
3023 std::swap (dr_with_seg_len_pair
.first
, dr_with_seg_len_pair
.second
);
3025 comp_alias_ddrs
.safe_push (dr_with_seg_len_pair
);
3028 /* Second, we sort the collected data ref pairs so that we can scan
3029 them once to combine all possible aliasing checks. */
3030 comp_alias_ddrs
.qsort (comp_dr_with_seg_len_pair
);
3032 /* Third, we scan the sorted dr pairs and check if we can combine
3033 alias checks of two neighboring dr pairs. */
3034 for (size_t i
= 1; i
< comp_alias_ddrs
.length (); ++i
)
3036 /* Deal with two ddrs (dr_a1, dr_b1) and (dr_a2, dr_b2). */
3037 dr_with_seg_len
*dr_a1
= &comp_alias_ddrs
[i
-1].first
,
3038 *dr_b1
= &comp_alias_ddrs
[i
-1].second
,
3039 *dr_a2
= &comp_alias_ddrs
[i
].first
,
3040 *dr_b2
= &comp_alias_ddrs
[i
].second
;
3042 /* Remove duplicate data ref pairs. */
3043 if (*dr_a1
== *dr_a2
&& *dr_b1
== *dr_b2
)
3045 if (dump_enabled_p ())
3047 dump_printf_loc (MSG_NOTE
, vect_location
,
3048 "found equal ranges ");
3049 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3050 DR_REF (dr_a1
->dr
));
3051 dump_printf (MSG_NOTE
, ", ");
3052 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3053 DR_REF (dr_b1
->dr
));
3054 dump_printf (MSG_NOTE
, " and ");
3055 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3056 DR_REF (dr_a2
->dr
));
3057 dump_printf (MSG_NOTE
, ", ");
3058 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3059 DR_REF (dr_b2
->dr
));
3060 dump_printf (MSG_NOTE
, "\n");
3063 comp_alias_ddrs
.ordered_remove (i
--);
3067 if (*dr_a1
== *dr_a2
|| *dr_b1
== *dr_b2
)
3069 /* We consider the case that DR_B1 and DR_B2 are same memrefs,
3070 and DR_A1 and DR_A2 are two consecutive memrefs. */
3071 if (*dr_a1
== *dr_a2
)
3073 std::swap (dr_a1
, dr_b1
);
3074 std::swap (dr_a2
, dr_b2
);
3077 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a1
->dr
),
3078 DR_BASE_ADDRESS (dr_a2
->dr
),
3080 || !tree_fits_shwi_p (dr_a1
->offset
)
3081 || !tree_fits_shwi_p (dr_a2
->offset
))
3084 /* Make sure dr_a1 starts left of dr_a2. */
3085 if (tree_int_cst_lt (dr_a2
->offset
, dr_a1
->offset
))
3086 std::swap (*dr_a1
, *dr_a2
);
3088 unsigned HOST_WIDE_INT diff
3089 = tree_to_shwi (dr_a2
->offset
) - tree_to_shwi (dr_a1
->offset
);
3092 bool do_remove
= false;
3094 /* If the left segment does not extend beyond the start of the
3095 right segment the new segment length is that of the right
3096 plus the segment distance. */
3097 if (tree_fits_uhwi_p (dr_a1
->seg_len
)
3098 && compare_tree_int (dr_a1
->seg_len
, diff
) <= 0)
3100 dr_a1
->seg_len
= size_binop (PLUS_EXPR
, dr_a2
->seg_len
,
3104 /* Generally the new segment length is the maximum of the
3105 left segment size and the right segment size plus the distance.
3106 ??? We can also build tree MAX_EXPR here but it's not clear this
3108 else if (tree_fits_uhwi_p (dr_a1
->seg_len
)
3109 && tree_fits_uhwi_p (dr_a2
->seg_len
))
3111 unsigned HOST_WIDE_INT seg_len_a1
= tree_to_uhwi (dr_a1
->seg_len
);
3112 unsigned HOST_WIDE_INT seg_len_a2
= tree_to_uhwi (dr_a2
->seg_len
);
3113 dr_a1
->seg_len
= size_int (MAX (seg_len_a1
, diff
+ seg_len_a2
));
3116 /* Now we check if the following condition is satisfied:
3118 DIFF - SEGMENT_LENGTH_A < SEGMENT_LENGTH_B
3120 where DIFF = DR_A2->OFFSET - DR_A1->OFFSET. However,
3121 SEGMENT_LENGTH_A or SEGMENT_LENGTH_B may not be constant so we
3122 have to make a best estimation. We can get the minimum value
3123 of SEGMENT_LENGTH_B as a constant, represented by MIN_SEG_LEN_B,
3124 then either of the following two conditions can guarantee the
3127 1: DIFF <= MIN_SEG_LEN_B
3128 2: DIFF - SEGMENT_LENGTH_A < MIN_SEG_LEN_B */
3131 unsigned HOST_WIDE_INT min_seg_len_b
3132 = (tree_fits_uhwi_p (dr_b1
->seg_len
)
3133 ? tree_to_uhwi (dr_b1
->seg_len
)
3136 if (diff
<= min_seg_len_b
3137 || (tree_fits_uhwi_p (dr_a1
->seg_len
)
3138 && diff
- tree_to_uhwi (dr_a1
->seg_len
) < min_seg_len_b
))
3140 dr_a1
->seg_len
= size_binop (PLUS_EXPR
,
3141 dr_a2
->seg_len
, size_int (diff
));
3148 if (dump_enabled_p ())
3150 dump_printf_loc (MSG_NOTE
, vect_location
,
3151 "merging ranges for ");
3152 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dr_a1
->dr
));
3153 dump_printf (MSG_NOTE
, ", ");
3154 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dr_b1
->dr
));
3155 dump_printf (MSG_NOTE
, " and ");
3156 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dr_a2
->dr
));
3157 dump_printf (MSG_NOTE
, ", ");
3158 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dr_b2
->dr
));
3159 dump_printf (MSG_NOTE
, "\n");
3161 comp_alias_ddrs
.ordered_remove (i
--);
3166 dump_printf_loc (MSG_NOTE
, vect_location
,
3167 "improved number of alias checks from %d to %d\n",
3168 may_alias_ddrs
.length (), comp_alias_ddrs
.length ());
3169 if ((int) comp_alias_ddrs
.length () >
3170 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS
))
3176 /* Check whether a non-affine read or write in stmt is suitable for gather load
3177 or scatter store and if so, return a builtin decl for that operation. */
3180 vect_check_gather_scatter (gimple
*stmt
, loop_vec_info loop_vinfo
, tree
*basep
,
3181 tree
*offp
, int *scalep
)
3183 HOST_WIDE_INT scale
= 1, pbitpos
, pbitsize
;
3184 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
3185 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
3186 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
3187 tree offtype
= NULL_TREE
;
3188 tree decl
, base
, off
;
3190 int punsignedp
, reversep
, pvolatilep
= 0;
3193 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3194 see if we can use the def stmt of the address. */
3195 if (is_gimple_call (stmt
)
3196 && gimple_call_internal_p (stmt
)
3197 && (gimple_call_internal_fn (stmt
) == IFN_MASK_LOAD
3198 || gimple_call_internal_fn (stmt
) == IFN_MASK_STORE
)
3199 && TREE_CODE (base
) == MEM_REF
3200 && TREE_CODE (TREE_OPERAND (base
, 0)) == SSA_NAME
3201 && integer_zerop (TREE_OPERAND (base
, 1))
3202 && !expr_invariant_in_loop_p (loop
, TREE_OPERAND (base
, 0)))
3204 gimple
*def_stmt
= SSA_NAME_DEF_STMT (TREE_OPERAND (base
, 0));
3205 if (is_gimple_assign (def_stmt
)
3206 && gimple_assign_rhs_code (def_stmt
) == ADDR_EXPR
)
3207 base
= TREE_OPERAND (gimple_assign_rhs1 (def_stmt
), 0);
3210 /* The gather and scatter builtins need address of the form
3211 loop_invariant + vector * {1, 2, 4, 8}
3213 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3214 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3215 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3216 multiplications and additions in it. To get a vector, we need
3217 a single SSA_NAME that will be defined in the loop and will
3218 contain everything that is not loop invariant and that can be
3219 vectorized. The following code attempts to find such a preexistng
3220 SSA_NAME OFF and put the loop invariants into a tree BASE
3221 that can be gimplified before the loop. */
3222 base
= get_inner_reference (base
, &pbitsize
, &pbitpos
, &off
, &pmode
,
3223 &punsignedp
, &reversep
, &pvolatilep
, false);
3224 gcc_assert (base
&& (pbitpos
% BITS_PER_UNIT
) == 0 && !reversep
);
3226 if (TREE_CODE (base
) == MEM_REF
)
3228 if (!integer_zerop (TREE_OPERAND (base
, 1)))
3230 if (off
== NULL_TREE
)
3232 offset_int moff
= mem_ref_offset (base
);
3233 off
= wide_int_to_tree (sizetype
, moff
);
3236 off
= size_binop (PLUS_EXPR
, off
,
3237 fold_convert (sizetype
, TREE_OPERAND (base
, 1)));
3239 base
= TREE_OPERAND (base
, 0);
3242 base
= build_fold_addr_expr (base
);
3244 if (off
== NULL_TREE
)
3245 off
= size_zero_node
;
3247 /* If base is not loop invariant, either off is 0, then we start with just
3248 the constant offset in the loop invariant BASE and continue with base
3249 as OFF, otherwise give up.
3250 We could handle that case by gimplifying the addition of base + off
3251 into some SSA_NAME and use that as off, but for now punt. */
3252 if (!expr_invariant_in_loop_p (loop
, base
))
3254 if (!integer_zerop (off
))
3257 base
= size_int (pbitpos
/ BITS_PER_UNIT
);
3259 /* Otherwise put base + constant offset into the loop invariant BASE
3260 and continue with OFF. */
3263 base
= fold_convert (sizetype
, base
);
3264 base
= size_binop (PLUS_EXPR
, base
, size_int (pbitpos
/ BITS_PER_UNIT
));
3267 /* OFF at this point may be either a SSA_NAME or some tree expression
3268 from get_inner_reference. Try to peel off loop invariants from it
3269 into BASE as long as possible. */
3271 while (offtype
== NULL_TREE
)
3273 enum tree_code code
;
3274 tree op0
, op1
, add
= NULL_TREE
;
3276 if (TREE_CODE (off
) == SSA_NAME
)
3278 gimple
*def_stmt
= SSA_NAME_DEF_STMT (off
);
3280 if (expr_invariant_in_loop_p (loop
, off
))
3283 if (gimple_code (def_stmt
) != GIMPLE_ASSIGN
)
3286 op0
= gimple_assign_rhs1 (def_stmt
);
3287 code
= gimple_assign_rhs_code (def_stmt
);
3288 op1
= gimple_assign_rhs2 (def_stmt
);
3292 if (get_gimple_rhs_class (TREE_CODE (off
)) == GIMPLE_TERNARY_RHS
)
3294 code
= TREE_CODE (off
);
3295 extract_ops_from_tree (off
, &code
, &op0
, &op1
);
3299 case POINTER_PLUS_EXPR
:
3301 if (expr_invariant_in_loop_p (loop
, op0
))
3306 add
= fold_convert (sizetype
, add
);
3308 add
= size_binop (MULT_EXPR
, add
, size_int (scale
));
3309 base
= size_binop (PLUS_EXPR
, base
, add
);
3312 if (expr_invariant_in_loop_p (loop
, op1
))
3320 if (expr_invariant_in_loop_p (loop
, op1
))
3322 add
= fold_convert (sizetype
, op1
);
3323 add
= size_binop (MINUS_EXPR
, size_zero_node
, add
);
3329 if (scale
== 1 && tree_fits_shwi_p (op1
))
3331 scale
= tree_to_shwi (op1
);
3340 if (!POINTER_TYPE_P (TREE_TYPE (op0
))
3341 && !INTEGRAL_TYPE_P (TREE_TYPE (op0
)))
3343 if (TYPE_PRECISION (TREE_TYPE (op0
))
3344 == TYPE_PRECISION (TREE_TYPE (off
)))
3349 if (TYPE_PRECISION (TREE_TYPE (op0
))
3350 < TYPE_PRECISION (TREE_TYPE (off
)))
3353 offtype
= TREE_TYPE (off
);
3364 /* If at the end OFF still isn't a SSA_NAME or isn't
3365 defined in the loop, punt. */
3366 if (TREE_CODE (off
) != SSA_NAME
3367 || expr_invariant_in_loop_p (loop
, off
))
3370 if (offtype
== NULL_TREE
)
3371 offtype
= TREE_TYPE (off
);
3373 if (DR_IS_READ (dr
))
3374 decl
= targetm
.vectorize
.builtin_gather (STMT_VINFO_VECTYPE (stmt_info
),
3377 decl
= targetm
.vectorize
.builtin_scatter (STMT_VINFO_VECTYPE (stmt_info
),
3380 if (decl
== NULL_TREE
)
3392 /* Function vect_analyze_data_refs.
3394 Find all the data references in the loop or basic block.
3396 The general structure of the analysis of data refs in the vectorizer is as
3398 1- vect_analyze_data_refs(loop/bb): call
3399 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3400 in the loop/bb and their dependences.
3401 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3402 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3403 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3408 vect_analyze_data_refs (vec_info
*vinfo
, int *min_vf
)
3410 struct loop
*loop
= NULL
;
3412 struct data_reference
*dr
;
3415 if (dump_enabled_p ())
3416 dump_printf_loc (MSG_NOTE
, vect_location
,
3417 "=== vect_analyze_data_refs ===\n");
3419 if (loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
))
3420 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
3422 /* Go through the data-refs, check that the analysis succeeded. Update
3423 pointer from stmt_vec_info struct to DR and vectype. */
3425 vec
<data_reference_p
> datarefs
= vinfo
->datarefs
;
3426 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
3429 stmt_vec_info stmt_info
;
3430 tree base
, offset
, init
;
3431 enum { SG_NONE
, GATHER
, SCATTER
} gatherscatter
= SG_NONE
;
3432 bool simd_lane_access
= false;
3436 if (!dr
|| !DR_REF (dr
))
3438 if (dump_enabled_p ())
3439 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3440 "not vectorized: unhandled data-ref\n");
3444 stmt
= DR_STMT (dr
);
3445 stmt_info
= vinfo_for_stmt (stmt
);
3447 /* Discard clobbers from the dataref vector. We will remove
3448 clobber stmts during vectorization. */
3449 if (gimple_clobber_p (stmt
))
3452 if (i
== datarefs
.length () - 1)
3457 datarefs
.ordered_remove (i
);
3462 /* Check that analysis of the data-ref succeeded. */
3463 if (!DR_BASE_ADDRESS (dr
) || !DR_OFFSET (dr
) || !DR_INIT (dr
)
3468 && !TREE_THIS_VOLATILE (DR_REF (dr
))
3469 && targetm
.vectorize
.builtin_gather
!= NULL
;
3472 && !TREE_THIS_VOLATILE (DR_REF (dr
))
3473 && targetm
.vectorize
.builtin_scatter
!= NULL
;
3474 bool maybe_simd_lane_access
3475 = is_a
<loop_vec_info
> (vinfo
) && loop
->simduid
;
3477 /* If target supports vector gather loads or scatter stores, or if
3478 this might be a SIMD lane access, see if they can't be used. */
3479 if (is_a
<loop_vec_info
> (vinfo
)
3480 && (maybe_gather
|| maybe_scatter
|| maybe_simd_lane_access
)
3481 && !nested_in_vect_loop_p (loop
, stmt
))
3483 struct data_reference
*newdr
3484 = create_data_ref (NULL
, loop_containing_stmt (stmt
),
3485 DR_REF (dr
), stmt
, maybe_scatter
? false : true);
3486 gcc_assert (newdr
!= NULL
&& DR_REF (newdr
));
3487 if (DR_BASE_ADDRESS (newdr
)
3488 && DR_OFFSET (newdr
)
3491 && integer_zerop (DR_STEP (newdr
)))
3493 if (maybe_simd_lane_access
)
3495 tree off
= DR_OFFSET (newdr
);
3497 if (TREE_CODE (DR_INIT (newdr
)) == INTEGER_CST
3498 && TREE_CODE (off
) == MULT_EXPR
3499 && tree_fits_uhwi_p (TREE_OPERAND (off
, 1)))
3501 tree step
= TREE_OPERAND (off
, 1);
3502 off
= TREE_OPERAND (off
, 0);
3504 if (CONVERT_EXPR_P (off
)
3505 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off
,
3507 < TYPE_PRECISION (TREE_TYPE (off
)))
3508 off
= TREE_OPERAND (off
, 0);
3509 if (TREE_CODE (off
) == SSA_NAME
)
3511 gimple
*def
= SSA_NAME_DEF_STMT (off
);
3512 tree reft
= TREE_TYPE (DR_REF (newdr
));
3513 if (is_gimple_call (def
)
3514 && gimple_call_internal_p (def
)
3515 && (gimple_call_internal_fn (def
)
3516 == IFN_GOMP_SIMD_LANE
))
3518 tree arg
= gimple_call_arg (def
, 0);
3519 gcc_assert (TREE_CODE (arg
) == SSA_NAME
);
3520 arg
= SSA_NAME_VAR (arg
);
3521 if (arg
== loop
->simduid
3523 && tree_int_cst_equal
3524 (TYPE_SIZE_UNIT (reft
),
3527 DR_OFFSET (newdr
) = ssize_int (0);
3528 DR_STEP (newdr
) = step
;
3529 DR_ALIGNED_TO (newdr
)
3530 = size_int (BIGGEST_ALIGNMENT
);
3532 simd_lane_access
= true;
3538 if (!simd_lane_access
&& (maybe_gather
|| maybe_scatter
))
3542 gatherscatter
= GATHER
;
3544 gatherscatter
= SCATTER
;
3547 if (gatherscatter
== SG_NONE
&& !simd_lane_access
)
3548 free_data_ref (newdr
);
3551 if (gatherscatter
== SG_NONE
&& !simd_lane_access
)
3553 if (dump_enabled_p ())
3555 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3556 "not vectorized: data ref analysis "
3558 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3559 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
3562 if (is_a
<bb_vec_info
> (vinfo
))
3569 if (TREE_CODE (DR_BASE_ADDRESS (dr
)) == INTEGER_CST
)
3571 if (dump_enabled_p ())
3572 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3573 "not vectorized: base addr of dr is a "
3576 if (is_a
<bb_vec_info
> (vinfo
))
3579 if (gatherscatter
!= SG_NONE
|| simd_lane_access
)
3584 if (TREE_THIS_VOLATILE (DR_REF (dr
)))
3586 if (dump_enabled_p ())
3588 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3589 "not vectorized: volatile type ");
3590 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3591 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
3594 if (is_a
<bb_vec_info
> (vinfo
))
3600 if (stmt_can_throw_internal (stmt
))
3602 if (dump_enabled_p ())
3604 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3605 "not vectorized: statement can throw an "
3607 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3608 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
3611 if (is_a
<bb_vec_info
> (vinfo
))
3614 if (gatherscatter
!= SG_NONE
|| simd_lane_access
)
3619 if (TREE_CODE (DR_REF (dr
)) == COMPONENT_REF
3620 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr
), 1)))
3622 if (dump_enabled_p ())
3624 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3625 "not vectorized: statement is bitfield "
3627 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3628 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
3631 if (is_a
<bb_vec_info
> (vinfo
))
3634 if (gatherscatter
!= SG_NONE
|| simd_lane_access
)
3639 base
= unshare_expr (DR_BASE_ADDRESS (dr
));
3640 offset
= unshare_expr (DR_OFFSET (dr
));
3641 init
= unshare_expr (DR_INIT (dr
));
3643 if (is_gimple_call (stmt
)
3644 && (!gimple_call_internal_p (stmt
)
3645 || (gimple_call_internal_fn (stmt
) != IFN_MASK_LOAD
3646 && gimple_call_internal_fn (stmt
) != IFN_MASK_STORE
)))
3648 if (dump_enabled_p ())
3650 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3651 "not vectorized: dr in a call ");
3652 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3653 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
3656 if (is_a
<bb_vec_info
> (vinfo
))
3659 if (gatherscatter
!= SG_NONE
|| simd_lane_access
)
3664 /* Update DR field in stmt_vec_info struct. */
3666 /* If the dataref is in an inner-loop of the loop that is considered for
3667 for vectorization, we also want to analyze the access relative to
3668 the outer-loop (DR contains information only relative to the
3669 inner-most enclosing loop). We do that by building a reference to the
3670 first location accessed by the inner-loop, and analyze it relative to
3672 if (loop
&& nested_in_vect_loop_p (loop
, stmt
))
3674 tree outer_step
, outer_base
, outer_init
;
3675 HOST_WIDE_INT pbitsize
, pbitpos
;
3678 int punsignedp
, preversep
, pvolatilep
;
3679 affine_iv base_iv
, offset_iv
;
3682 /* Build a reference to the first location accessed by the
3683 inner-loop: *(BASE+INIT). (The first location is actually
3684 BASE+INIT+OFFSET, but we add OFFSET separately later). */
3685 tree inner_base
= build_fold_indirect_ref
3686 (fold_build_pointer_plus (base
, init
));
3688 if (dump_enabled_p ())
3690 dump_printf_loc (MSG_NOTE
, vect_location
,
3691 "analyze in outer-loop: ");
3692 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, inner_base
);
3693 dump_printf (MSG_NOTE
, "\n");
3696 outer_base
= get_inner_reference (inner_base
, &pbitsize
, &pbitpos
,
3697 &poffset
, &pmode
, &punsignedp
,
3698 &preversep
, &pvolatilep
, false);
3699 gcc_assert (outer_base
!= NULL_TREE
);
3701 if (pbitpos
% BITS_PER_UNIT
!= 0)
3703 if (dump_enabled_p ())
3704 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3705 "failed: bit offset alignment.\n");
3711 if (dump_enabled_p ())
3712 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3713 "failed: reverse storage order.\n");
3717 outer_base
= build_fold_addr_expr (outer_base
);
3718 if (!simple_iv (loop
, loop_containing_stmt (stmt
), outer_base
,
3721 if (dump_enabled_p ())
3722 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3723 "failed: evolution of base is not affine.\n");
3730 poffset
= fold_build2 (PLUS_EXPR
, TREE_TYPE (offset
), offset
,
3738 offset_iv
.base
= ssize_int (0);
3739 offset_iv
.step
= ssize_int (0);
3741 else if (!simple_iv (loop
, loop_containing_stmt (stmt
), poffset
,
3744 if (dump_enabled_p ())
3745 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3746 "evolution of offset is not affine.\n");
3750 outer_init
= ssize_int (pbitpos
/ BITS_PER_UNIT
);
3751 split_constant_offset (base_iv
.base
, &base_iv
.base
, &dinit
);
3752 outer_init
= size_binop (PLUS_EXPR
, outer_init
, dinit
);
3753 split_constant_offset (offset_iv
.base
, &offset_iv
.base
, &dinit
);
3754 outer_init
= size_binop (PLUS_EXPR
, outer_init
, dinit
);
3756 outer_step
= size_binop (PLUS_EXPR
,
3757 fold_convert (ssizetype
, base_iv
.step
),
3758 fold_convert (ssizetype
, offset_iv
.step
));
3760 STMT_VINFO_DR_STEP (stmt_info
) = outer_step
;
3761 /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
3762 STMT_VINFO_DR_BASE_ADDRESS (stmt_info
) = base_iv
.base
;
3763 STMT_VINFO_DR_INIT (stmt_info
) = outer_init
;
3764 STMT_VINFO_DR_OFFSET (stmt_info
) =
3765 fold_convert (ssizetype
, offset_iv
.base
);
3766 STMT_VINFO_DR_ALIGNED_TO (stmt_info
) =
3767 size_int (highest_pow2_factor (offset_iv
.base
));
3769 if (dump_enabled_p ())
3771 dump_printf_loc (MSG_NOTE
, vect_location
,
3772 "\touter base_address: ");
3773 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3774 STMT_VINFO_DR_BASE_ADDRESS (stmt_info
));
3775 dump_printf (MSG_NOTE
, "\n\touter offset from base address: ");
3776 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3777 STMT_VINFO_DR_OFFSET (stmt_info
));
3778 dump_printf (MSG_NOTE
,
3779 "\n\touter constant offset from base address: ");
3780 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3781 STMT_VINFO_DR_INIT (stmt_info
));
3782 dump_printf (MSG_NOTE
, "\n\touter step: ");
3783 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3784 STMT_VINFO_DR_STEP (stmt_info
));
3785 dump_printf (MSG_NOTE
, "\n\touter aligned to: ");
3786 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3787 STMT_VINFO_DR_ALIGNED_TO (stmt_info
));
3788 dump_printf (MSG_NOTE
, "\n");
3792 if (STMT_VINFO_DATA_REF (stmt_info
))
3794 if (dump_enabled_p ())
3796 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3797 "not vectorized: more than one data ref "
3799 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3800 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
3803 if (is_a
<bb_vec_info
> (vinfo
))
3806 if (gatherscatter
!= SG_NONE
|| simd_lane_access
)
3811 STMT_VINFO_DATA_REF (stmt_info
) = dr
;
3812 if (simd_lane_access
)
3814 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info
) = true;
3815 free_data_ref (datarefs
[i
]);
3819 /* Set vectype for STMT. */
3820 scalar_type
= TREE_TYPE (DR_REF (dr
));
3821 STMT_VINFO_VECTYPE (stmt_info
)
3822 = get_vectype_for_scalar_type (scalar_type
);
3823 if (!STMT_VINFO_VECTYPE (stmt_info
))
3825 if (dump_enabled_p ())
3827 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3828 "not vectorized: no vectype for stmt: ");
3829 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3830 dump_printf (MSG_MISSED_OPTIMIZATION
, " scalar_type: ");
3831 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_DETAILS
,
3833 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
3836 if (is_a
<bb_vec_info
> (vinfo
))
3838 /* No vector type is fine, the ref can still participate
3839 in dependence analysis, we just can't vectorize it. */
3840 STMT_VINFO_VECTORIZABLE (stmt_info
) = false;
3844 if (gatherscatter
!= SG_NONE
|| simd_lane_access
)
3846 STMT_VINFO_DATA_REF (stmt_info
) = NULL
;
3847 if (gatherscatter
!= SG_NONE
)
3854 if (dump_enabled_p ())
3856 dump_printf_loc (MSG_NOTE
, vect_location
,
3857 "got vectype for stmt: ");
3858 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, stmt
, 0);
3859 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3860 STMT_VINFO_VECTYPE (stmt_info
));
3861 dump_printf (MSG_NOTE
, "\n");
3865 /* Adjust the minimal vectorization factor according to the
3867 vf
= TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info
));
3871 if (gatherscatter
!= SG_NONE
)
3874 if (!vect_check_gather_scatter (stmt
, as_a
<loop_vec_info
> (vinfo
),
3876 || get_vectype_for_scalar_type (TREE_TYPE (off
)) == NULL_TREE
)
3878 STMT_VINFO_DATA_REF (stmt_info
) = NULL
;
3880 if (dump_enabled_p ())
3882 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3883 (gatherscatter
== GATHER
) ?
3884 "not vectorized: not suitable for gather "
3886 "not vectorized: not suitable for scatter "
3888 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3889 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
3894 free_data_ref (datarefs
[i
]);
3896 STMT_VINFO_GATHER_SCATTER_P (stmt_info
) = gatherscatter
;
3899 else if (is_a
<loop_vec_info
> (vinfo
)
3900 && TREE_CODE (DR_STEP (dr
)) != INTEGER_CST
)
3902 if (nested_in_vect_loop_p (loop
, stmt
))
3904 if (dump_enabled_p ())
3906 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3907 "not vectorized: not suitable for strided "
3909 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3910 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
3914 STMT_VINFO_STRIDED_P (stmt_info
) = true;
3918 /* If we stopped analysis at the first dataref we could not analyze
3919 when trying to vectorize a basic-block mark the rest of the datarefs
3920 as not vectorizable and truncate the vector of datarefs. That
3921 avoids spending useless time in analyzing their dependence. */
3922 if (i
!= datarefs
.length ())
3924 gcc_assert (is_a
<bb_vec_info
> (vinfo
));
3925 for (unsigned j
= i
; j
< datarefs
.length (); ++j
)
3927 data_reference_p dr
= datarefs
[j
];
3928 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr
))) = false;
3931 datarefs
.truncate (i
);
3938 /* Function vect_get_new_vect_var.
3940 Returns a name for a new variable. The current naming scheme appends the
3941 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
3942 the name of vectorizer generated variables, and appends that to NAME if
3946 vect_get_new_vect_var (tree type
, enum vect_var_kind var_kind
, const char *name
)
3953 case vect_simple_var
:
3956 case vect_scalar_var
:
3962 case vect_pointer_var
:
3971 char* tmp
= concat (prefix
, "_", name
, NULL
);
3972 new_vect_var
= create_tmp_reg (type
, tmp
);
3976 new_vect_var
= create_tmp_reg (type
, prefix
);
3978 return new_vect_var
;
3981 /* Like vect_get_new_vect_var but return an SSA name. */
3984 vect_get_new_ssa_name (tree type
, enum vect_var_kind var_kind
, const char *name
)
3991 case vect_simple_var
:
3994 case vect_scalar_var
:
3997 case vect_pointer_var
:
4006 char* tmp
= concat (prefix
, "_", name
, NULL
);
4007 new_vect_var
= make_temp_ssa_name (type
, NULL
, tmp
);
4011 new_vect_var
= make_temp_ssa_name (type
, NULL
, prefix
);
4013 return new_vect_var
;
4016 /* Duplicate ptr info and set alignment/misaligment on NAME from DR. */
4019 vect_duplicate_ssa_name_ptr_info (tree name
, data_reference
*dr
,
4020 stmt_vec_info stmt_info
)
4022 duplicate_ssa_name_ptr_info (name
, DR_PTR_INFO (dr
));
4023 unsigned int align
= TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmt_info
));
4024 int misalign
= DR_MISALIGNMENT (dr
);
4026 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name
));
4028 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name
), align
, misalign
);
4031 /* Function vect_create_addr_base_for_vector_ref.
4033 Create an expression that computes the address of the first memory location
4034 that will be accessed for a data reference.
4037 STMT: The statement containing the data reference.
4038 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4039 OFFSET: Optional. If supplied, it is be added to the initial address.
4040 LOOP: Specify relative to which loop-nest should the address be computed.
4041 For example, when the dataref is in an inner-loop nested in an
4042 outer-loop that is now being vectorized, LOOP can be either the
4043 outer-loop, or the inner-loop. The first memory location accessed
4044 by the following dataref ('in' points to short):
4051 if LOOP=i_loop: &in (relative to i_loop)
4052 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4053 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4054 initial address. Unlike OFFSET, which is number of elements to
4055 be added, BYTE_OFFSET is measured in bytes.
4058 1. Return an SSA_NAME whose value is the address of the memory location of
4059 the first vector of the data reference.
4060 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4061 these statement(s) which define the returned SSA_NAME.
4063 FORNOW: We are only handling array accesses with step 1. */
4066 vect_create_addr_base_for_vector_ref (gimple
*stmt
,
4067 gimple_seq
*new_stmt_list
,
4072 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
4073 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
4075 const char *base_name
;
4078 gimple_seq seq
= NULL
;
4082 tree step
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr
)));
4083 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
4085 if (loop_vinfo
&& loop
&& loop
!= (gimple_bb (stmt
))->loop_father
)
4087 struct loop
*outer_loop
= LOOP_VINFO_LOOP (loop_vinfo
);
4089 gcc_assert (nested_in_vect_loop_p (outer_loop
, stmt
));
4091 data_ref_base
= unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info
));
4092 base_offset
= unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info
));
4093 init
= unshare_expr (STMT_VINFO_DR_INIT (stmt_info
));
4097 data_ref_base
= unshare_expr (DR_BASE_ADDRESS (dr
));
4098 base_offset
= unshare_expr (DR_OFFSET (dr
));
4099 init
= unshare_expr (DR_INIT (dr
));
4103 base_name
= get_name (data_ref_base
);
4106 base_offset
= ssize_int (0);
4107 init
= ssize_int (0);
4108 base_name
= get_name (DR_REF (dr
));
4111 /* Create base_offset */
4112 base_offset
= size_binop (PLUS_EXPR
,
4113 fold_convert (sizetype
, base_offset
),
4114 fold_convert (sizetype
, init
));
4118 offset
= fold_build2 (MULT_EXPR
, sizetype
,
4119 fold_convert (sizetype
, offset
), step
);
4120 base_offset
= fold_build2 (PLUS_EXPR
, sizetype
,
4121 base_offset
, offset
);
4125 byte_offset
= fold_convert (sizetype
, byte_offset
);
4126 base_offset
= fold_build2 (PLUS_EXPR
, sizetype
,
4127 base_offset
, byte_offset
);
4130 /* base + base_offset */
4132 addr_base
= fold_build_pointer_plus (data_ref_base
, base_offset
);
4135 addr_base
= build1 (ADDR_EXPR
,
4136 build_pointer_type (TREE_TYPE (DR_REF (dr
))),
4137 unshare_expr (DR_REF (dr
)));
4140 vect_ptr_type
= build_pointer_type (STMT_VINFO_VECTYPE (stmt_info
));
4141 dest
= vect_get_new_vect_var (vect_ptr_type
, vect_pointer_var
, base_name
);
4142 addr_base
= force_gimple_operand (addr_base
, &seq
, true, dest
);
4143 gimple_seq_add_seq (new_stmt_list
, seq
);
4145 if (DR_PTR_INFO (dr
)
4146 && TREE_CODE (addr_base
) == SSA_NAME
4147 && !SSA_NAME_PTR_INFO (addr_base
))
4149 vect_duplicate_ssa_name_ptr_info (addr_base
, dr
, stmt_info
);
4150 if (offset
|| byte_offset
)
4151 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base
));
4154 if (dump_enabled_p ())
4156 dump_printf_loc (MSG_NOTE
, vect_location
, "created ");
4157 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, addr_base
);
4158 dump_printf (MSG_NOTE
, "\n");
4165 /* Function vect_create_data_ref_ptr.
4167 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4168 location accessed in the loop by STMT, along with the def-use update
4169 chain to appropriately advance the pointer through the loop iterations.
4170 Also set aliasing information for the pointer. This pointer is used by
4171 the callers to this function to create a memory reference expression for
4172 vector load/store access.
4175 1. STMT: a stmt that references memory. Expected to be of the form
4176 GIMPLE_ASSIGN <name, data-ref> or
4177 GIMPLE_ASSIGN <data-ref, name>.
4178 2. AGGR_TYPE: the type of the reference, which should be either a vector
4180 3. AT_LOOP: the loop where the vector memref is to be created.
4181 4. OFFSET (optional): an offset to be added to the initial address accessed
4182 by the data-ref in STMT.
4183 5. BSI: location where the new stmts are to be placed if there is no loop
4184 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4185 pointing to the initial address.
4186 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4187 to the initial address accessed by the data-ref in STMT. This is
4188 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4192 1. Declare a new ptr to vector_type, and have it point to the base of the
4193 data reference (initial addressed accessed by the data reference).
4194 For example, for vector of type V8HI, the following code is generated:
4197 ap = (v8hi *)initial_address;
4199 if OFFSET is not supplied:
4200 initial_address = &a[init];
4201 if OFFSET is supplied:
4202 initial_address = &a[init + OFFSET];
4203 if BYTE_OFFSET is supplied:
4204 initial_address = &a[init] + BYTE_OFFSET;
4206 Return the initial_address in INITIAL_ADDRESS.
4208 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4209 update the pointer in each iteration of the loop.
4211 Return the increment stmt that updates the pointer in PTR_INCR.
4213 3. Set INV_P to true if the access pattern of the data reference in the
4214 vectorized loop is invariant. Set it to false otherwise.
4216 4. Return the pointer. */
4219 vect_create_data_ref_ptr (gimple
*stmt
, tree aggr_type
, struct loop
*at_loop
,
4220 tree offset
, tree
*initial_address
,
4221 gimple_stmt_iterator
*gsi
, gimple
**ptr_incr
,
4222 bool only_init
, bool *inv_p
, tree byte_offset
)
4224 const char *base_name
;
4225 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
4226 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
4227 struct loop
*loop
= NULL
;
4228 bool nested_in_vect_loop
= false;
4229 struct loop
*containing_loop
= NULL
;
4233 gimple_seq new_stmt_list
= NULL
;
4237 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
4239 gimple_stmt_iterator incr_gsi
;
4241 tree indx_before_incr
, indx_after_incr
;
4244 bb_vec_info bb_vinfo
= STMT_VINFO_BB_VINFO (stmt_info
);
4246 gcc_assert (TREE_CODE (aggr_type
) == ARRAY_TYPE
4247 || TREE_CODE (aggr_type
) == VECTOR_TYPE
);
4251 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
4252 nested_in_vect_loop
= nested_in_vect_loop_p (loop
, stmt
);
4253 containing_loop
= (gimple_bb (stmt
))->loop_father
;
4254 pe
= loop_preheader_edge (loop
);
4258 gcc_assert (bb_vinfo
);
4263 /* Check the step (evolution) of the load in LOOP, and record
4264 whether it's invariant. */
4265 if (nested_in_vect_loop
)
4266 step
= STMT_VINFO_DR_STEP (stmt_info
);
4268 step
= DR_STEP (STMT_VINFO_DATA_REF (stmt_info
));
4270 if (integer_zerop (step
))
4275 /* Create an expression for the first address accessed by this load
4277 base_name
= get_name (DR_BASE_ADDRESS (dr
));
4279 if (dump_enabled_p ())
4281 tree dr_base_type
= TREE_TYPE (DR_BASE_OBJECT (dr
));
4282 dump_printf_loc (MSG_NOTE
, vect_location
,
4283 "create %s-pointer variable to type: ",
4284 get_tree_code_name (TREE_CODE (aggr_type
)));
4285 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, aggr_type
);
4286 if (TREE_CODE (dr_base_type
) == ARRAY_TYPE
)
4287 dump_printf (MSG_NOTE
, " vectorizing an array ref: ");
4288 else if (TREE_CODE (dr_base_type
) == VECTOR_TYPE
)
4289 dump_printf (MSG_NOTE
, " vectorizing a vector ref: ");
4290 else if (TREE_CODE (dr_base_type
) == RECORD_TYPE
)
4291 dump_printf (MSG_NOTE
, " vectorizing a record based array ref: ");
4293 dump_printf (MSG_NOTE
, " vectorizing a pointer ref: ");
4294 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_BASE_OBJECT (dr
));
4295 dump_printf (MSG_NOTE
, "\n");
4298 /* (1) Create the new aggregate-pointer variable.
4299 Vector and array types inherit the alias set of their component
4300 type by default so we need to use a ref-all pointer if the data
4301 reference does not conflict with the created aggregated data
4302 reference because it is not addressable. */
4303 bool need_ref_all
= false;
4304 if (!alias_sets_conflict_p (get_alias_set (aggr_type
),
4305 get_alias_set (DR_REF (dr
))))
4306 need_ref_all
= true;
4307 /* Likewise for any of the data references in the stmt group. */
4308 else if (STMT_VINFO_GROUP_SIZE (stmt_info
) > 1)
4310 gimple
*orig_stmt
= STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info
);
4313 stmt_vec_info sinfo
= vinfo_for_stmt (orig_stmt
);
4314 struct data_reference
*sdr
= STMT_VINFO_DATA_REF (sinfo
);
4315 if (!alias_sets_conflict_p (get_alias_set (aggr_type
),
4316 get_alias_set (DR_REF (sdr
))))
4318 need_ref_all
= true;
4321 orig_stmt
= STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo
);
4325 aggr_ptr_type
= build_pointer_type_for_mode (aggr_type
, ptr_mode
,
4327 aggr_ptr
= vect_get_new_vect_var (aggr_ptr_type
, vect_pointer_var
, base_name
);
4330 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4331 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4332 def-use update cycles for the pointer: one relative to the outer-loop
4333 (LOOP), which is what steps (3) and (4) below do. The other is relative
4334 to the inner-loop (which is the inner-most loop containing the dataref),
4335 and this is done be step (5) below.
4337 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4338 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4339 redundant. Steps (3),(4) create the following:
4342 LOOP: vp1 = phi(vp0,vp2)
4348 If there is an inner-loop nested in loop, then step (5) will also be
4349 applied, and an additional update in the inner-loop will be created:
4352 LOOP: vp1 = phi(vp0,vp2)
4354 inner: vp3 = phi(vp1,vp4)
4355 vp4 = vp3 + inner_step
4361 /* (2) Calculate the initial address of the aggregate-pointer, and set
4362 the aggregate-pointer to point to it before the loop. */
4364 /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */
4366 new_temp
= vect_create_addr_base_for_vector_ref (stmt
, &new_stmt_list
,
4367 offset
, loop
, byte_offset
);
4372 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, new_stmt_list
);
4373 gcc_assert (!new_bb
);
4376 gsi_insert_seq_before (gsi
, new_stmt_list
, GSI_SAME_STMT
);
4379 *initial_address
= new_temp
;
4380 aggr_ptr_init
= new_temp
;
4382 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4383 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4384 inner-loop nested in LOOP (during outer-loop vectorization). */
4386 /* No update in loop is required. */
4387 if (only_init
&& (!loop_vinfo
|| at_loop
== loop
))
4388 aptr
= aggr_ptr_init
;
4391 /* The step of the aggregate pointer is the type size. */
4392 tree iv_step
= TYPE_SIZE_UNIT (aggr_type
);
4393 /* One exception to the above is when the scalar step of the load in
4394 LOOP is zero. In this case the step here is also zero. */
4396 iv_step
= size_zero_node
;
4397 else if (tree_int_cst_sgn (step
) == -1)
4398 iv_step
= fold_build1 (NEGATE_EXPR
, TREE_TYPE (iv_step
), iv_step
);
4400 standard_iv_increment_position (loop
, &incr_gsi
, &insert_after
);
4402 create_iv (aggr_ptr_init
,
4403 fold_convert (aggr_ptr_type
, iv_step
),
4404 aggr_ptr
, loop
, &incr_gsi
, insert_after
,
4405 &indx_before_incr
, &indx_after_incr
);
4406 incr
= gsi_stmt (incr_gsi
);
4407 set_vinfo_for_stmt (incr
, new_stmt_vec_info (incr
, loop_vinfo
));
4409 /* Copy the points-to information if it exists. */
4410 if (DR_PTR_INFO (dr
))
4412 vect_duplicate_ssa_name_ptr_info (indx_before_incr
, dr
, stmt_info
);
4413 vect_duplicate_ssa_name_ptr_info (indx_after_incr
, dr
, stmt_info
);
4418 aptr
= indx_before_incr
;
4421 if (!nested_in_vect_loop
|| only_init
)
4425 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4426 nested in LOOP, if exists. */
4428 gcc_assert (nested_in_vect_loop
);
4431 standard_iv_increment_position (containing_loop
, &incr_gsi
,
4433 create_iv (aptr
, fold_convert (aggr_ptr_type
, DR_STEP (dr
)), aggr_ptr
,
4434 containing_loop
, &incr_gsi
, insert_after
, &indx_before_incr
,
4436 incr
= gsi_stmt (incr_gsi
);
4437 set_vinfo_for_stmt (incr
, new_stmt_vec_info (incr
, loop_vinfo
));
4439 /* Copy the points-to information if it exists. */
4440 if (DR_PTR_INFO (dr
))
4442 vect_duplicate_ssa_name_ptr_info (indx_before_incr
, dr
, stmt_info
);
4443 vect_duplicate_ssa_name_ptr_info (indx_after_incr
, dr
, stmt_info
);
4448 return indx_before_incr
;
4455 /* Function bump_vector_ptr
4457 Increment a pointer (to a vector type) by vector-size. If requested,
4458 i.e. if PTR-INCR is given, then also connect the new increment stmt
4459 to the existing def-use update-chain of the pointer, by modifying
4460 the PTR_INCR as illustrated below:
4462 The pointer def-use update-chain before this function:
4463 DATAREF_PTR = phi (p_0, p_2)
4465 PTR_INCR: p_2 = DATAREF_PTR + step
4467 The pointer def-use update-chain after this function:
4468 DATAREF_PTR = phi (p_0, p_2)
4470 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4472 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4475 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4477 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4478 the loop. The increment amount across iterations is expected
4480 BSI - location where the new update stmt is to be placed.
4481 STMT - the original scalar memory-access stmt that is being vectorized.
4482 BUMP - optional. The offset by which to bump the pointer. If not given,
4483 the offset is assumed to be vector_size.
4485 Output: Return NEW_DATAREF_PTR as illustrated above.
4490 bump_vector_ptr (tree dataref_ptr
, gimple
*ptr_incr
, gimple_stmt_iterator
*gsi
,
4491 gimple
*stmt
, tree bump
)
4493 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
4494 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
4495 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
4496 tree update
= TYPE_SIZE_UNIT (vectype
);
4499 use_operand_p use_p
;
4500 tree new_dataref_ptr
;
4505 if (TREE_CODE (dataref_ptr
) == SSA_NAME
)
4506 new_dataref_ptr
= copy_ssa_name (dataref_ptr
);
4508 new_dataref_ptr
= make_ssa_name (TREE_TYPE (dataref_ptr
));
4509 incr_stmt
= gimple_build_assign (new_dataref_ptr
, POINTER_PLUS_EXPR
,
4510 dataref_ptr
, update
);
4511 vect_finish_stmt_generation (stmt
, incr_stmt
, gsi
);
4513 /* Copy the points-to information if it exists. */
4514 if (DR_PTR_INFO (dr
))
4516 duplicate_ssa_name_ptr_info (new_dataref_ptr
, DR_PTR_INFO (dr
));
4517 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr
));
4521 return new_dataref_ptr
;
4523 /* Update the vector-pointer's cross-iteration increment. */
4524 FOR_EACH_SSA_USE_OPERAND (use_p
, ptr_incr
, iter
, SSA_OP_USE
)
4526 tree use
= USE_FROM_PTR (use_p
);
4528 if (use
== dataref_ptr
)
4529 SET_USE (use_p
, new_dataref_ptr
);
4531 gcc_assert (tree_int_cst_compare (use
, update
) == 0);
4534 return new_dataref_ptr
;
4538 /* Function vect_create_destination_var.
4540 Create a new temporary of type VECTYPE. */
4543 vect_create_destination_var (tree scalar_dest
, tree vectype
)
4549 enum vect_var_kind kind
;
4552 ? VECTOR_BOOLEAN_TYPE_P (vectype
)
4556 type
= vectype
? vectype
: TREE_TYPE (scalar_dest
);
4558 gcc_assert (TREE_CODE (scalar_dest
) == SSA_NAME
);
4560 name
= get_name (scalar_dest
);
4562 new_name
= xasprintf ("%s_%u", name
, SSA_NAME_VERSION (scalar_dest
));
4564 new_name
= xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest
));
4565 vec_dest
= vect_get_new_vect_var (type
, kind
, new_name
);
4571 /* Function vect_grouped_store_supported.
4573 Returns TRUE if interleave high and interleave low permutations
4574 are supported, and FALSE otherwise. */
4577 vect_grouped_store_supported (tree vectype
, unsigned HOST_WIDE_INT count
)
4579 machine_mode mode
= TYPE_MODE (vectype
);
4581 /* vect_permute_store_chain requires the group size to be equal to 3 or
4582 be a power of two. */
4583 if (count
!= 3 && exact_log2 (count
) == -1)
4585 if (dump_enabled_p ())
4586 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4587 "the size of the group of accesses"
4588 " is not a power of 2 or not eqaul to 3\n");
4592 /* Check that the permutation is supported. */
4593 if (VECTOR_MODE_P (mode
))
4595 unsigned int i
, nelt
= GET_MODE_NUNITS (mode
);
4596 unsigned char *sel
= XALLOCAVEC (unsigned char, nelt
);
4600 unsigned int j0
= 0, j1
= 0, j2
= 0;
4603 for (j
= 0; j
< 3; j
++)
4605 int nelt0
= ((3 - j
) * nelt
) % 3;
4606 int nelt1
= ((3 - j
) * nelt
+ 1) % 3;
4607 int nelt2
= ((3 - j
) * nelt
+ 2) % 3;
4608 for (i
= 0; i
< nelt
; i
++)
4610 if (3 * i
+ nelt0
< nelt
)
4611 sel
[3 * i
+ nelt0
] = j0
++;
4612 if (3 * i
+ nelt1
< nelt
)
4613 sel
[3 * i
+ nelt1
] = nelt
+ j1
++;
4614 if (3 * i
+ nelt2
< nelt
)
4615 sel
[3 * i
+ nelt2
] = 0;
4617 if (!can_vec_perm_p (mode
, false, sel
))
4619 if (dump_enabled_p ())
4620 dump_printf (MSG_MISSED_OPTIMIZATION
,
4621 "permutaion op not supported by target.\n");
4625 for (i
= 0; i
< nelt
; i
++)
4627 if (3 * i
+ nelt0
< nelt
)
4628 sel
[3 * i
+ nelt0
] = 3 * i
+ nelt0
;
4629 if (3 * i
+ nelt1
< nelt
)
4630 sel
[3 * i
+ nelt1
] = 3 * i
+ nelt1
;
4631 if (3 * i
+ nelt2
< nelt
)
4632 sel
[3 * i
+ nelt2
] = nelt
+ j2
++;
4634 if (!can_vec_perm_p (mode
, false, sel
))
4636 if (dump_enabled_p ())
4637 dump_printf (MSG_MISSED_OPTIMIZATION
,
4638 "permutaion op not supported by target.\n");
4646 /* If length is not equal to 3 then only power of 2 is supported. */
4647 gcc_assert (exact_log2 (count
) != -1);
4649 for (i
= 0; i
< nelt
/ 2; i
++)
4652 sel
[i
* 2 + 1] = i
+ nelt
;
4654 if (can_vec_perm_p (mode
, false, sel
))
4656 for (i
= 0; i
< nelt
; i
++)
4658 if (can_vec_perm_p (mode
, false, sel
))
4664 if (dump_enabled_p ())
4665 dump_printf (MSG_MISSED_OPTIMIZATION
,
4666 "permutaion op not supported by target.\n");
4671 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4675 vect_store_lanes_supported (tree vectype
, unsigned HOST_WIDE_INT count
)
4677 return vect_lanes_optab_supported_p ("vec_store_lanes",
4678 vec_store_lanes_optab
,
4683 /* Function vect_permute_store_chain.
4685 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4686 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4687 the data correctly for the stores. Return the final references for stores
4690 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4691 The input is 4 vectors each containing 8 elements. We assign a number to
4692 each element, the input sequence is:
4694 1st vec: 0 1 2 3 4 5 6 7
4695 2nd vec: 8 9 10 11 12 13 14 15
4696 3rd vec: 16 17 18 19 20 21 22 23
4697 4th vec: 24 25 26 27 28 29 30 31
4699 The output sequence should be:
4701 1st vec: 0 8 16 24 1 9 17 25
4702 2nd vec: 2 10 18 26 3 11 19 27
4703 3rd vec: 4 12 20 28 5 13 21 30
4704 4th vec: 6 14 22 30 7 15 23 31
4706 i.e., we interleave the contents of the four vectors in their order.
4708 We use interleave_high/low instructions to create such output. The input of
4709 each interleave_high/low operation is two vectors:
4712 the even elements of the result vector are obtained left-to-right from the
4713 high/low elements of the first vector. The odd elements of the result are
4714 obtained left-to-right from the high/low elements of the second vector.
4715 The output of interleave_high will be: 0 4 1 5
4716 and of interleave_low: 2 6 3 7
4719 The permutation is done in log LENGTH stages. In each stage interleave_high
4720 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4721 where the first argument is taken from the first half of DR_CHAIN and the
4722 second argument from it's second half.
4725 I1: interleave_high (1st vec, 3rd vec)
4726 I2: interleave_low (1st vec, 3rd vec)
4727 I3: interleave_high (2nd vec, 4th vec)
4728 I4: interleave_low (2nd vec, 4th vec)
4730 The output for the first stage is:
4732 I1: 0 16 1 17 2 18 3 19
4733 I2: 4 20 5 21 6 22 7 23
4734 I3: 8 24 9 25 10 26 11 27
4735 I4: 12 28 13 29 14 30 15 31
4737 The output of the second stage, i.e. the final result is:
4739 I1: 0 8 16 24 1 9 17 25
4740 I2: 2 10 18 26 3 11 19 27
4741 I3: 4 12 20 28 5 13 21 30
4742 I4: 6 14 22 30 7 15 23 31. */
4745 vect_permute_store_chain (vec
<tree
> dr_chain
,
4746 unsigned int length
,
4748 gimple_stmt_iterator
*gsi
,
4749 vec
<tree
> *result_chain
)
4751 tree vect1
, vect2
, high
, low
;
4753 tree vectype
= STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt
));
4754 tree perm_mask_low
, perm_mask_high
;
4756 tree perm3_mask_low
, perm3_mask_high
;
4757 unsigned int i
, n
, log_length
= exact_log2 (length
);
4758 unsigned int j
, nelt
= TYPE_VECTOR_SUBPARTS (vectype
);
4759 unsigned char *sel
= XALLOCAVEC (unsigned char, nelt
);
4761 result_chain
->quick_grow (length
);
4762 memcpy (result_chain
->address (), dr_chain
.address (),
4763 length
* sizeof (tree
));
4767 unsigned int j0
= 0, j1
= 0, j2
= 0;
4769 for (j
= 0; j
< 3; j
++)
4771 int nelt0
= ((3 - j
) * nelt
) % 3;
4772 int nelt1
= ((3 - j
) * nelt
+ 1) % 3;
4773 int nelt2
= ((3 - j
) * nelt
+ 2) % 3;
4775 for (i
= 0; i
< nelt
; i
++)
4777 if (3 * i
+ nelt0
< nelt
)
4778 sel
[3 * i
+ nelt0
] = j0
++;
4779 if (3 * i
+ nelt1
< nelt
)
4780 sel
[3 * i
+ nelt1
] = nelt
+ j1
++;
4781 if (3 * i
+ nelt2
< nelt
)
4782 sel
[3 * i
+ nelt2
] = 0;
4784 perm3_mask_low
= vect_gen_perm_mask_checked (vectype
, sel
);
4786 for (i
= 0; i
< nelt
; i
++)
4788 if (3 * i
+ nelt0
< nelt
)
4789 sel
[3 * i
+ nelt0
] = 3 * i
+ nelt0
;
4790 if (3 * i
+ nelt1
< nelt
)
4791 sel
[3 * i
+ nelt1
] = 3 * i
+ nelt1
;
4792 if (3 * i
+ nelt2
< nelt
)
4793 sel
[3 * i
+ nelt2
] = nelt
+ j2
++;
4795 perm3_mask_high
= vect_gen_perm_mask_checked (vectype
, sel
);
4797 vect1
= dr_chain
[0];
4798 vect2
= dr_chain
[1];
4800 /* Create interleaving stmt:
4801 low = VEC_PERM_EXPR <vect1, vect2,
4802 {j, nelt, *, j + 1, nelt + j + 1, *,
4803 j + 2, nelt + j + 2, *, ...}> */
4804 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3_low");
4805 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, vect1
,
4806 vect2
, perm3_mask_low
);
4807 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
4810 vect2
= dr_chain
[2];
4811 /* Create interleaving stmt:
4812 low = VEC_PERM_EXPR <vect1, vect2,
4813 {0, 1, nelt + j, 3, 4, nelt + j + 1,
4814 6, 7, nelt + j + 2, ...}> */
4815 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3_high");
4816 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, vect1
,
4817 vect2
, perm3_mask_high
);
4818 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
4819 (*result_chain
)[j
] = data_ref
;
4824 /* If length is not equal to 3 then only power of 2 is supported. */
4825 gcc_assert (exact_log2 (length
) != -1);
4827 for (i
= 0, n
= nelt
/ 2; i
< n
; i
++)
4830 sel
[i
* 2 + 1] = i
+ nelt
;
4832 perm_mask_high
= vect_gen_perm_mask_checked (vectype
, sel
);
4834 for (i
= 0; i
< nelt
; i
++)
4836 perm_mask_low
= vect_gen_perm_mask_checked (vectype
, sel
);
4838 for (i
= 0, n
= log_length
; i
< n
; i
++)
4840 for (j
= 0; j
< length
/2; j
++)
4842 vect1
= dr_chain
[j
];
4843 vect2
= dr_chain
[j
+length
/2];
4845 /* Create interleaving stmt:
4846 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
4848 high
= make_temp_ssa_name (vectype
, NULL
, "vect_inter_high");
4849 perm_stmt
= gimple_build_assign (high
, VEC_PERM_EXPR
, vect1
,
4850 vect2
, perm_mask_high
);
4851 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
4852 (*result_chain
)[2*j
] = high
;
4854 /* Create interleaving stmt:
4855 low = VEC_PERM_EXPR <vect1, vect2,
4856 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
4858 low
= make_temp_ssa_name (vectype
, NULL
, "vect_inter_low");
4859 perm_stmt
= gimple_build_assign (low
, VEC_PERM_EXPR
, vect1
,
4860 vect2
, perm_mask_low
);
4861 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
4862 (*result_chain
)[2*j
+1] = low
;
4864 memcpy (dr_chain
.address (), result_chain
->address (),
4865 length
* sizeof (tree
));
4870 /* Function vect_setup_realignment
4872 This function is called when vectorizing an unaligned load using
4873 the dr_explicit_realign[_optimized] scheme.
4874 This function generates the following code at the loop prolog:
4877 x msq_init = *(floor(p)); # prolog load
4878 realignment_token = call target_builtin;
4880 x msq = phi (msq_init, ---)
4882 The stmts marked with x are generated only for the case of
4883 dr_explicit_realign_optimized.
4885 The code above sets up a new (vector) pointer, pointing to the first
4886 location accessed by STMT, and a "floor-aligned" load using that pointer.
4887 It also generates code to compute the "realignment-token" (if the relevant
4888 target hook was defined), and creates a phi-node at the loop-header bb
4889 whose arguments are the result of the prolog-load (created by this
4890 function) and the result of a load that takes place in the loop (to be
4891 created by the caller to this function).
4893 For the case of dr_explicit_realign_optimized:
4894 The caller to this function uses the phi-result (msq) to create the
4895 realignment code inside the loop, and sets up the missing phi argument,
4898 msq = phi (msq_init, lsq)
4899 lsq = *(floor(p')); # load in loop
4900 result = realign_load (msq, lsq, realignment_token);
4902 For the case of dr_explicit_realign:
4904 msq = *(floor(p)); # load in loop
4906 lsq = *(floor(p')); # load in loop
4907 result = realign_load (msq, lsq, realignment_token);
4910 STMT - (scalar) load stmt to be vectorized. This load accesses
4911 a memory location that may be unaligned.
4912 BSI - place where new code is to be inserted.
4913 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4917 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4918 target hook, if defined.
4919 Return value - the result of the loop-header phi node. */
4922 vect_setup_realignment (gimple
*stmt
, gimple_stmt_iterator
*gsi
,
4923 tree
*realignment_token
,
4924 enum dr_alignment_support alignment_support_scheme
,
4926 struct loop
**at_loop
)
4928 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
4929 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
4930 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
4931 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
4932 struct loop
*loop
= NULL
;
4934 tree scalar_dest
= gimple_assign_lhs (stmt
);
4940 tree msq_init
= NULL_TREE
;
4943 tree msq
= NULL_TREE
;
4944 gimple_seq stmts
= NULL
;
4946 bool compute_in_loop
= false;
4947 bool nested_in_vect_loop
= false;
4948 struct loop
*containing_loop
= (gimple_bb (stmt
))->loop_father
;
4949 struct loop
*loop_for_initial_load
= NULL
;
4953 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
4954 nested_in_vect_loop
= nested_in_vect_loop_p (loop
, stmt
);
4957 gcc_assert (alignment_support_scheme
== dr_explicit_realign
4958 || alignment_support_scheme
== dr_explicit_realign_optimized
);
4960 /* We need to generate three things:
4961 1. the misalignment computation
4962 2. the extra vector load (for the optimized realignment scheme).
4963 3. the phi node for the two vectors from which the realignment is
4964 done (for the optimized realignment scheme). */
4966 /* 1. Determine where to generate the misalignment computation.
4968 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4969 calculation will be generated by this function, outside the loop (in the
4970 preheader). Otherwise, INIT_ADDR had already been computed for us by the
4971 caller, inside the loop.
4973 Background: If the misalignment remains fixed throughout the iterations of
4974 the loop, then both realignment schemes are applicable, and also the
4975 misalignment computation can be done outside LOOP. This is because we are
4976 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4977 are a multiple of VS (the Vector Size), and therefore the misalignment in
4978 different vectorized LOOP iterations is always the same.
4979 The problem arises only if the memory access is in an inner-loop nested
4980 inside LOOP, which is now being vectorized using outer-loop vectorization.
4981 This is the only case when the misalignment of the memory access may not
4982 remain fixed throughout the iterations of the inner-loop (as explained in
4983 detail in vect_supportable_dr_alignment). In this case, not only is the
4984 optimized realignment scheme not applicable, but also the misalignment
4985 computation (and generation of the realignment token that is passed to
4986 REALIGN_LOAD) have to be done inside the loop.
4988 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4989 or not, which in turn determines if the misalignment is computed inside
4990 the inner-loop, or outside LOOP. */
4992 if (init_addr
!= NULL_TREE
|| !loop_vinfo
)
4994 compute_in_loop
= true;
4995 gcc_assert (alignment_support_scheme
== dr_explicit_realign
);
4999 /* 2. Determine where to generate the extra vector load.
5001 For the optimized realignment scheme, instead of generating two vector
5002 loads in each iteration, we generate a single extra vector load in the
5003 preheader of the loop, and in each iteration reuse the result of the
5004 vector load from the previous iteration. In case the memory access is in
5005 an inner-loop nested inside LOOP, which is now being vectorized using
5006 outer-loop vectorization, we need to determine whether this initial vector
5007 load should be generated at the preheader of the inner-loop, or can be
5008 generated at the preheader of LOOP. If the memory access has no evolution
5009 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5010 to be generated inside LOOP (in the preheader of the inner-loop). */
5012 if (nested_in_vect_loop
)
5014 tree outerloop_step
= STMT_VINFO_DR_STEP (stmt_info
);
5015 bool invariant_in_outerloop
=
5016 (tree_int_cst_compare (outerloop_step
, size_zero_node
) == 0);
5017 loop_for_initial_load
= (invariant_in_outerloop
? loop
: loop
->inner
);
5020 loop_for_initial_load
= loop
;
5022 *at_loop
= loop_for_initial_load
;
5024 if (loop_for_initial_load
)
5025 pe
= loop_preheader_edge (loop_for_initial_load
);
5027 /* 3. For the case of the optimized realignment, create the first vector
5028 load at the loop preheader. */
5030 if (alignment_support_scheme
== dr_explicit_realign_optimized
)
5032 /* Create msq_init = *(floor(p1)) in the loop preheader */
5035 gcc_assert (!compute_in_loop
);
5036 vec_dest
= vect_create_destination_var (scalar_dest
, vectype
);
5037 ptr
= vect_create_data_ref_ptr (stmt
, vectype
, loop_for_initial_load
,
5038 NULL_TREE
, &init_addr
, NULL
, &inc
,
5040 if (TREE_CODE (ptr
) == SSA_NAME
)
5041 new_temp
= copy_ssa_name (ptr
);
5043 new_temp
= make_ssa_name (TREE_TYPE (ptr
));
5044 new_stmt
= gimple_build_assign
5045 (new_temp
, BIT_AND_EXPR
, ptr
,
5046 build_int_cst (TREE_TYPE (ptr
),
5047 -(HOST_WIDE_INT
)TYPE_ALIGN_UNIT (vectype
)));
5048 new_bb
= gsi_insert_on_edge_immediate (pe
, new_stmt
);
5049 gcc_assert (!new_bb
);
5051 = build2 (MEM_REF
, TREE_TYPE (vec_dest
), new_temp
,
5052 build_int_cst (reference_alias_ptr_type (DR_REF (dr
)), 0));
5053 new_stmt
= gimple_build_assign (vec_dest
, data_ref
);
5054 new_temp
= make_ssa_name (vec_dest
, new_stmt
);
5055 gimple_assign_set_lhs (new_stmt
, new_temp
);
5058 new_bb
= gsi_insert_on_edge_immediate (pe
, new_stmt
);
5059 gcc_assert (!new_bb
);
5062 gsi_insert_before (gsi
, new_stmt
, GSI_SAME_STMT
);
5064 msq_init
= gimple_assign_lhs (new_stmt
);
5067 /* 4. Create realignment token using a target builtin, if available.
5068 It is done either inside the containing loop, or before LOOP (as
5069 determined above). */
5071 if (targetm
.vectorize
.builtin_mask_for_load
)
5076 /* Compute INIT_ADDR - the initial addressed accessed by this memref. */
5079 /* Generate the INIT_ADDR computation outside LOOP. */
5080 init_addr
= vect_create_addr_base_for_vector_ref (stmt
, &stmts
,
5084 pe
= loop_preheader_edge (loop
);
5085 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, stmts
);
5086 gcc_assert (!new_bb
);
5089 gsi_insert_seq_before (gsi
, stmts
, GSI_SAME_STMT
);
5092 builtin_decl
= targetm
.vectorize
.builtin_mask_for_load ();
5093 new_stmt
= gimple_build_call (builtin_decl
, 1, init_addr
);
5095 vect_create_destination_var (scalar_dest
,
5096 gimple_call_return_type (new_stmt
));
5097 new_temp
= make_ssa_name (vec_dest
, new_stmt
);
5098 gimple_call_set_lhs (new_stmt
, new_temp
);
5100 if (compute_in_loop
)
5101 gsi_insert_before (gsi
, new_stmt
, GSI_SAME_STMT
);
5104 /* Generate the misalignment computation outside LOOP. */
5105 pe
= loop_preheader_edge (loop
);
5106 new_bb
= gsi_insert_on_edge_immediate (pe
, new_stmt
);
5107 gcc_assert (!new_bb
);
5110 *realignment_token
= gimple_call_lhs (new_stmt
);
5112 /* The result of the CALL_EXPR to this builtin is determined from
5113 the value of the parameter and no global variables are touched
5114 which makes the builtin a "const" function. Requiring the
5115 builtin to have the "const" attribute makes it unnecessary
5116 to call mark_call_clobbered. */
5117 gcc_assert (TREE_READONLY (builtin_decl
));
5120 if (alignment_support_scheme
== dr_explicit_realign
)
5123 gcc_assert (!compute_in_loop
);
5124 gcc_assert (alignment_support_scheme
== dr_explicit_realign_optimized
);
5127 /* 5. Create msq = phi <msq_init, lsq> in loop */
5129 pe
= loop_preheader_edge (containing_loop
);
5130 vec_dest
= vect_create_destination_var (scalar_dest
, vectype
);
5131 msq
= make_ssa_name (vec_dest
);
5132 phi_stmt
= create_phi_node (msq
, containing_loop
->header
);
5133 add_phi_arg (phi_stmt
, msq_init
, pe
, UNKNOWN_LOCATION
);
5139 /* Function vect_grouped_load_supported.
5141 Returns TRUE if even and odd permutations are supported,
5142 and FALSE otherwise. */
5145 vect_grouped_load_supported (tree vectype
, unsigned HOST_WIDE_INT count
)
5147 machine_mode mode
= TYPE_MODE (vectype
);
5149 /* vect_permute_load_chain requires the group size to be equal to 3 or
5150 be a power of two. */
5151 if (count
!= 3 && exact_log2 (count
) == -1)
5153 if (dump_enabled_p ())
5154 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5155 "the size of the group of accesses"
5156 " is not a power of 2 or not equal to 3\n");
5160 /* Check that the permutation is supported. */
5161 if (VECTOR_MODE_P (mode
))
5163 unsigned int i
, j
, nelt
= GET_MODE_NUNITS (mode
);
5164 unsigned char *sel
= XALLOCAVEC (unsigned char, nelt
);
5169 for (k
= 0; k
< 3; k
++)
5171 for (i
= 0; i
< nelt
; i
++)
5172 if (3 * i
+ k
< 2 * nelt
)
5176 if (!can_vec_perm_p (mode
, false, sel
))
5178 if (dump_enabled_p ())
5179 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5180 "shuffle of 3 loads is not supported by"
5184 for (i
= 0, j
= 0; i
< nelt
; i
++)
5185 if (3 * i
+ k
< 2 * nelt
)
5188 sel
[i
] = nelt
+ ((nelt
+ k
) % 3) + 3 * (j
++);
5189 if (!can_vec_perm_p (mode
, false, sel
))
5191 if (dump_enabled_p ())
5192 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5193 "shuffle of 3 loads is not supported by"
5202 /* If length is not equal to 3 then only power of 2 is supported. */
5203 gcc_assert (exact_log2 (count
) != -1);
5204 for (i
= 0; i
< nelt
; i
++)
5206 if (can_vec_perm_p (mode
, false, sel
))
5208 for (i
= 0; i
< nelt
; i
++)
5210 if (can_vec_perm_p (mode
, false, sel
))
5216 if (dump_enabled_p ())
5217 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5218 "extract even/odd not supported by target\n");
5222 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
5226 vect_load_lanes_supported (tree vectype
, unsigned HOST_WIDE_INT count
)
5228 return vect_lanes_optab_supported_p ("vec_load_lanes",
5229 vec_load_lanes_optab
,
5233 /* Function vect_permute_load_chain.
5235 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5236 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5237 the input data correctly. Return the final references for loads in
5240 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5241 The input is 4 vectors each containing 8 elements. We assign a number to each
5242 element, the input sequence is:
5244 1st vec: 0 1 2 3 4 5 6 7
5245 2nd vec: 8 9 10 11 12 13 14 15
5246 3rd vec: 16 17 18 19 20 21 22 23
5247 4th vec: 24 25 26 27 28 29 30 31
5249 The output sequence should be:
5251 1st vec: 0 4 8 12 16 20 24 28
5252 2nd vec: 1 5 9 13 17 21 25 29
5253 3rd vec: 2 6 10 14 18 22 26 30
5254 4th vec: 3 7 11 15 19 23 27 31
5256 i.e., the first output vector should contain the first elements of each
5257 interleaving group, etc.
5259 We use extract_even/odd instructions to create such output. The input of
5260 each extract_even/odd operation is two vectors
5264 and the output is the vector of extracted even/odd elements. The output of
5265 extract_even will be: 0 2 4 6
5266 and of extract_odd: 1 3 5 7
5269 The permutation is done in log LENGTH stages. In each stage extract_even
5270 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5271 their order. In our example,
5273 E1: extract_even (1st vec, 2nd vec)
5274 E2: extract_odd (1st vec, 2nd vec)
5275 E3: extract_even (3rd vec, 4th vec)
5276 E4: extract_odd (3rd vec, 4th vec)
5278 The output for the first stage will be:
5280 E1: 0 2 4 6 8 10 12 14
5281 E2: 1 3 5 7 9 11 13 15
5282 E3: 16 18 20 22 24 26 28 30
5283 E4: 17 19 21 23 25 27 29 31
5285 In order to proceed and create the correct sequence for the next stage (or
5286 for the correct output, if the second stage is the last one, as in our
5287 example), we first put the output of extract_even operation and then the
5288 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5289 The input for the second stage is:
5291 1st vec (E1): 0 2 4 6 8 10 12 14
5292 2nd vec (E3): 16 18 20 22 24 26 28 30
5293 3rd vec (E2): 1 3 5 7 9 11 13 15
5294 4th vec (E4): 17 19 21 23 25 27 29 31
5296 The output of the second stage:
5298 E1: 0 4 8 12 16 20 24 28
5299 E2: 2 6 10 14 18 22 26 30
5300 E3: 1 5 9 13 17 21 25 29
5301 E4: 3 7 11 15 19 23 27 31
5303 And RESULT_CHAIN after reordering:
5305 1st vec (E1): 0 4 8 12 16 20 24 28
5306 2nd vec (E3): 1 5 9 13 17 21 25 29
5307 3rd vec (E2): 2 6 10 14 18 22 26 30
5308 4th vec (E4): 3 7 11 15 19 23 27 31. */
5311 vect_permute_load_chain (vec
<tree
> dr_chain
,
5312 unsigned int length
,
5314 gimple_stmt_iterator
*gsi
,
5315 vec
<tree
> *result_chain
)
5317 tree data_ref
, first_vect
, second_vect
;
5318 tree perm_mask_even
, perm_mask_odd
;
5319 tree perm3_mask_low
, perm3_mask_high
;
5321 tree vectype
= STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt
));
5322 unsigned int i
, j
, log_length
= exact_log2 (length
);
5323 unsigned nelt
= TYPE_VECTOR_SUBPARTS (vectype
);
5324 unsigned char *sel
= XALLOCAVEC (unsigned char, nelt
);
5326 result_chain
->quick_grow (length
);
5327 memcpy (result_chain
->address (), dr_chain
.address (),
5328 length
* sizeof (tree
));
5334 for (k
= 0; k
< 3; k
++)
5336 for (i
= 0; i
< nelt
; i
++)
5337 if (3 * i
+ k
< 2 * nelt
)
5341 perm3_mask_low
= vect_gen_perm_mask_checked (vectype
, sel
);
5343 for (i
= 0, j
= 0; i
< nelt
; i
++)
5344 if (3 * i
+ k
< 2 * nelt
)
5347 sel
[i
] = nelt
+ ((nelt
+ k
) % 3) + 3 * (j
++);
5349 perm3_mask_high
= vect_gen_perm_mask_checked (vectype
, sel
);
5351 first_vect
= dr_chain
[0];
5352 second_vect
= dr_chain
[1];
5354 /* Create interleaving stmt (low part of):
5355 low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5357 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3_low");
5358 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, first_vect
,
5359 second_vect
, perm3_mask_low
);
5360 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5362 /* Create interleaving stmt (high part of):
5363 high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5365 first_vect
= data_ref
;
5366 second_vect
= dr_chain
[2];
5367 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3_high");
5368 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, first_vect
,
5369 second_vect
, perm3_mask_high
);
5370 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5371 (*result_chain
)[k
] = data_ref
;
5376 /* If length is not equal to 3 then only power of 2 is supported. */
5377 gcc_assert (exact_log2 (length
) != -1);
5379 for (i
= 0; i
< nelt
; ++i
)
5381 perm_mask_even
= vect_gen_perm_mask_checked (vectype
, sel
);
5383 for (i
= 0; i
< nelt
; ++i
)
5385 perm_mask_odd
= vect_gen_perm_mask_checked (vectype
, sel
);
5387 for (i
= 0; i
< log_length
; i
++)
5389 for (j
= 0; j
< length
; j
+= 2)
5391 first_vect
= dr_chain
[j
];
5392 second_vect
= dr_chain
[j
+1];
5394 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5395 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_perm_even");
5396 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5397 first_vect
, second_vect
,
5399 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5400 (*result_chain
)[j
/2] = data_ref
;
5402 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5403 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_perm_odd");
5404 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5405 first_vect
, second_vect
,
5407 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5408 (*result_chain
)[j
/2+length
/2] = data_ref
;
5410 memcpy (dr_chain
.address (), result_chain
->address (),
5411 length
* sizeof (tree
));
5416 /* Function vect_shift_permute_load_chain.
5418 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5419 sequence of stmts to reorder the input data accordingly.
5420 Return the final references for loads in RESULT_CHAIN.
5421 Return true if successed, false otherwise.
5423 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5424 The input is 3 vectors each containing 8 elements. We assign a
5425 number to each element, the input sequence is:
5427 1st vec: 0 1 2 3 4 5 6 7
5428 2nd vec: 8 9 10 11 12 13 14 15
5429 3rd vec: 16 17 18 19 20 21 22 23
5431 The output sequence should be:
5433 1st vec: 0 3 6 9 12 15 18 21
5434 2nd vec: 1 4 7 10 13 16 19 22
5435 3rd vec: 2 5 8 11 14 17 20 23
5437 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5439 First we shuffle all 3 vectors to get correct elements order:
5441 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
5442 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
5443 3rd vec: (16 19 22) (17 20 23) (18 21)
5445 Next we unite and shift vector 3 times:
5448 shift right by 6 the concatenation of:
5449 "1st vec" and "2nd vec"
5450 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5451 "2nd vec" and "3rd vec"
5452 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5453 "3rd vec" and "1st vec"
5454 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
5457 So that now new vectors are:
5459 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
5460 2nd vec: (10 13) (16 19 22) (17 20 23)
5461 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
5464 shift right by 5 the concatenation of:
5465 "1st vec" and "3rd vec"
5466 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
5467 "2nd vec" and "1st vec"
5468 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
5469 "3rd vec" and "2nd vec"
5470 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
5473 So that now new vectors are:
5475 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
5476 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
5477 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
5480 shift right by 5 the concatenation of:
5481 "1st vec" and "1st vec"
5482 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
5483 shift right by 3 the concatenation of:
5484 "2nd vec" and "2nd vec"
5485 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
5488 So that now all vectors are READY:
5489 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
5490 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
5491 3rd vec: ( 1 4 7) (10 13) (16 19 22)
5493 This algorithm is faster than one in vect_permute_load_chain if:
5494 1. "shift of a concatination" is faster than general permutation.
5496 2. The TARGET machine can't execute vector instructions in parallel.
5497 This is because each step of the algorithm depends on previous.
5498 The algorithm in vect_permute_load_chain is much more parallel.
5500 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
5504 vect_shift_permute_load_chain (vec
<tree
> dr_chain
,
5505 unsigned int length
,
5507 gimple_stmt_iterator
*gsi
,
5508 vec
<tree
> *result_chain
)
5510 tree vect
[3], vect_shift
[3], data_ref
, first_vect
, second_vect
;
5511 tree perm2_mask1
, perm2_mask2
, perm3_mask
;
5512 tree select_mask
, shift1_mask
, shift2_mask
, shift3_mask
, shift4_mask
;
5515 tree vectype
= STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt
));
5517 unsigned nelt
= TYPE_VECTOR_SUBPARTS (vectype
);
5518 unsigned char *sel
= XALLOCAVEC (unsigned char, nelt
);
5519 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
5520 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
5522 result_chain
->quick_grow (length
);
5523 memcpy (result_chain
->address (), dr_chain
.address (),
5524 length
* sizeof (tree
));
5526 if (exact_log2 (length
) != -1 && LOOP_VINFO_VECT_FACTOR (loop_vinfo
) > 4)
5528 unsigned int j
, log_length
= exact_log2 (length
);
5529 for (i
= 0; i
< nelt
/ 2; ++i
)
5531 for (i
= 0; i
< nelt
/ 2; ++i
)
5532 sel
[nelt
/ 2 + i
] = i
* 2 + 1;
5533 if (!can_vec_perm_p (TYPE_MODE (vectype
), false, sel
))
5535 if (dump_enabled_p ())
5536 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5537 "shuffle of 2 fields structure is not \
5538 supported by target\n");
5541 perm2_mask1
= vect_gen_perm_mask_checked (vectype
, sel
);
5543 for (i
= 0; i
< nelt
/ 2; ++i
)
5545 for (i
= 0; i
< nelt
/ 2; ++i
)
5546 sel
[nelt
/ 2 + i
] = i
* 2;
5547 if (!can_vec_perm_p (TYPE_MODE (vectype
), false, sel
))
5549 if (dump_enabled_p ())
5550 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5551 "shuffle of 2 fields structure is not \
5552 supported by target\n");
5555 perm2_mask2
= vect_gen_perm_mask_checked (vectype
, sel
);
5557 /* Generating permutation constant to shift all elements.
5558 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
5559 for (i
= 0; i
< nelt
; i
++)
5560 sel
[i
] = nelt
/ 2 + i
;
5561 if (!can_vec_perm_p (TYPE_MODE (vectype
), false, sel
))
5563 if (dump_enabled_p ())
5564 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5565 "shift permutation is not supported by target\n");
5568 shift1_mask
= vect_gen_perm_mask_checked (vectype
, sel
);
5570 /* Generating permutation constant to select vector from 2.
5571 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
5572 for (i
= 0; i
< nelt
/ 2; i
++)
5574 for (i
= nelt
/ 2; i
< nelt
; i
++)
5576 if (!can_vec_perm_p (TYPE_MODE (vectype
), false, sel
))
5578 if (dump_enabled_p ())
5579 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5580 "select is not supported by target\n");
5583 select_mask
= vect_gen_perm_mask_checked (vectype
, sel
);
5585 for (i
= 0; i
< log_length
; i
++)
5587 for (j
= 0; j
< length
; j
+= 2)
5589 first_vect
= dr_chain
[j
];
5590 second_vect
= dr_chain
[j
+ 1];
5592 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle2");
5593 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5594 first_vect
, first_vect
,
5596 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5599 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle2");
5600 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5601 second_vect
, second_vect
,
5603 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5606 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shift");
5607 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5608 vect
[0], vect
[1], shift1_mask
);
5609 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5610 (*result_chain
)[j
/2 + length
/2] = data_ref
;
5612 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_select");
5613 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5614 vect
[0], vect
[1], select_mask
);
5615 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5616 (*result_chain
)[j
/2] = data_ref
;
5618 memcpy (dr_chain
.address (), result_chain
->address (),
5619 length
* sizeof (tree
));
5623 if (length
== 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo
) > 2)
5625 unsigned int k
= 0, l
= 0;
5627 /* Generating permutation constant to get all elements in rigth order.
5628 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
5629 for (i
= 0; i
< nelt
; i
++)
5631 if (3 * k
+ (l
% 3) >= nelt
)
5634 l
+= (3 - (nelt
% 3));
5636 sel
[i
] = 3 * k
+ (l
% 3);
5639 if (!can_vec_perm_p (TYPE_MODE (vectype
), false, sel
))
5641 if (dump_enabled_p ())
5642 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5643 "shuffle of 3 fields structure is not \
5644 supported by target\n");
5647 perm3_mask
= vect_gen_perm_mask_checked (vectype
, sel
);
5649 /* Generating permutation constant to shift all elements.
5650 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
5651 for (i
= 0; i
< nelt
; i
++)
5652 sel
[i
] = 2 * (nelt
/ 3) + (nelt
% 3) + i
;
5653 if (!can_vec_perm_p (TYPE_MODE (vectype
), false, sel
))
5655 if (dump_enabled_p ())
5656 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5657 "shift permutation is not supported by target\n");
5660 shift1_mask
= vect_gen_perm_mask_checked (vectype
, sel
);
5662 /* Generating permutation constant to shift all elements.
5663 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5664 for (i
= 0; i
< nelt
; i
++)
5665 sel
[i
] = 2 * (nelt
/ 3) + 1 + i
;
5666 if (!can_vec_perm_p (TYPE_MODE (vectype
), false, sel
))
5668 if (dump_enabled_p ())
5669 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5670 "shift permutation is not supported by target\n");
5673 shift2_mask
= vect_gen_perm_mask_checked (vectype
, sel
);
5675 /* Generating permutation constant to shift all elements.
5676 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
5677 for (i
= 0; i
< nelt
; i
++)
5678 sel
[i
] = (nelt
/ 3) + (nelt
% 3) / 2 + i
;
5679 if (!can_vec_perm_p (TYPE_MODE (vectype
), false, sel
))
5681 if (dump_enabled_p ())
5682 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5683 "shift permutation is not supported by target\n");
5686 shift3_mask
= vect_gen_perm_mask_checked (vectype
, sel
);
5688 /* Generating permutation constant to shift all elements.
5689 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5690 for (i
= 0; i
< nelt
; i
++)
5691 sel
[i
] = 2 * (nelt
/ 3) + (nelt
% 3) / 2 + i
;
5692 if (!can_vec_perm_p (TYPE_MODE (vectype
), false, sel
))
5694 if (dump_enabled_p ())
5695 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5696 "shift permutation is not supported by target\n");
5699 shift4_mask
= vect_gen_perm_mask_checked (vectype
, sel
);
5701 for (k
= 0; k
< 3; k
++)
5703 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3");
5704 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5705 dr_chain
[k
], dr_chain
[k
],
5707 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5711 for (k
= 0; k
< 3; k
++)
5713 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shift1");
5714 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5715 vect
[k
% 3], vect
[(k
+ 1) % 3],
5717 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5718 vect_shift
[k
] = data_ref
;
5721 for (k
= 0; k
< 3; k
++)
5723 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shift2");
5724 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5725 vect_shift
[(4 - k
) % 3],
5726 vect_shift
[(3 - k
) % 3],
5728 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5732 (*result_chain
)[3 - (nelt
% 3)] = vect
[2];
5734 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shift3");
5735 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, vect
[0],
5736 vect
[0], shift3_mask
);
5737 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5738 (*result_chain
)[nelt
% 3] = data_ref
;
5740 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shift4");
5741 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, vect
[1],
5742 vect
[1], shift4_mask
);
5743 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5744 (*result_chain
)[0] = data_ref
;
5750 /* Function vect_transform_grouped_load.
5752 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5753 to perform their permutation and ascribe the result vectorized statements to
5754 the scalar statements.
5758 vect_transform_grouped_load (gimple
*stmt
, vec
<tree
> dr_chain
, int size
,
5759 gimple_stmt_iterator
*gsi
)
5762 vec
<tree
> result_chain
= vNULL
;
5764 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5765 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
5766 vectors, that are ready for vector computation. */
5767 result_chain
.create (size
);
5769 /* If reassociation width for vector type is 2 or greater target machine can
5770 execute 2 or more vector instructions in parallel. Otherwise try to
5771 get chain for loads group using vect_shift_permute_load_chain. */
5772 mode
= TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt
)));
5773 if (targetm
.sched
.reassociation_width (VEC_PERM_EXPR
, mode
) > 1
5774 || exact_log2 (size
) != -1
5775 || !vect_shift_permute_load_chain (dr_chain
, size
, stmt
,
5776 gsi
, &result_chain
))
5777 vect_permute_load_chain (dr_chain
, size
, stmt
, gsi
, &result_chain
);
5778 vect_record_grouped_load_vectors (stmt
, result_chain
);
5779 result_chain
.release ();
5782 /* RESULT_CHAIN contains the output of a group of grouped loads that were
5783 generated as part of the vectorization of STMT. Assign the statement
5784 for each vector to the associated scalar statement. */
5787 vect_record_grouped_load_vectors (gimple
*stmt
, vec
<tree
> result_chain
)
5789 gimple
*first_stmt
= GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
));
5790 gimple
*next_stmt
, *new_stmt
;
5791 unsigned int i
, gap_count
;
5794 /* Put a permuted data-ref in the VECTORIZED_STMT field.
5795 Since we scan the chain starting from it's first node, their order
5796 corresponds the order of data-refs in RESULT_CHAIN. */
5797 next_stmt
= first_stmt
;
5799 FOR_EACH_VEC_ELT (result_chain
, i
, tmp_data_ref
)
5804 /* Skip the gaps. Loads created for the gaps will be removed by dead
5805 code elimination pass later. No need to check for the first stmt in
5806 the group, since it always exists.
5807 GROUP_GAP is the number of steps in elements from the previous
5808 access (if there is no gap GROUP_GAP is 1). We skip loads that
5809 correspond to the gaps. */
5810 if (next_stmt
!= first_stmt
5811 && gap_count
< GROUP_GAP (vinfo_for_stmt (next_stmt
)))
5819 new_stmt
= SSA_NAME_DEF_STMT (tmp_data_ref
);
5820 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5821 copies, and we put the new vector statement in the first available
5823 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt
)))
5824 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt
)) = new_stmt
;
5827 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt
)))
5830 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt
));
5832 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt
));
5835 prev_stmt
= rel_stmt
;
5837 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt
));
5840 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt
)) =
5845 next_stmt
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt
));
5847 /* If NEXT_STMT accesses the same DR as the previous statement,
5848 put the same TMP_DATA_REF as its vectorized statement; otherwise
5849 get the next data-ref from RESULT_CHAIN. */
5850 if (!next_stmt
|| !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt
)))
5856 /* Function vect_force_dr_alignment_p.
5858 Returns whether the alignment of a DECL can be forced to be aligned
5859 on ALIGNMENT bit boundary. */
5862 vect_can_force_dr_alignment_p (const_tree decl
, unsigned int alignment
)
5864 if (TREE_CODE (decl
) != VAR_DECL
)
5867 if (decl_in_symtab_p (decl
)
5868 && !symtab_node::get (decl
)->can_increase_alignment_p ())
5871 if (TREE_STATIC (decl
))
5872 return (alignment
<= MAX_OFILE_ALIGNMENT
);
5874 return (alignment
<= MAX_STACK_ALIGNMENT
);
5878 /* Return whether the data reference DR is supported with respect to its
5880 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
5881 it is aligned, i.e., check if it is possible to vectorize it with different
5884 enum dr_alignment_support
5885 vect_supportable_dr_alignment (struct data_reference
*dr
,
5886 bool check_aligned_accesses
)
5888 gimple
*stmt
= DR_STMT (dr
);
5889 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
5890 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
5891 machine_mode mode
= TYPE_MODE (vectype
);
5892 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
5893 struct loop
*vect_loop
= NULL
;
5894 bool nested_in_vect_loop
= false;
5896 if (aligned_access_p (dr
) && !check_aligned_accesses
)
5899 /* For now assume all conditional loads/stores support unaligned
5900 access without any special code. */
5901 if (is_gimple_call (stmt
)
5902 && gimple_call_internal_p (stmt
)
5903 && (gimple_call_internal_fn (stmt
) == IFN_MASK_LOAD
5904 || gimple_call_internal_fn (stmt
) == IFN_MASK_STORE
))
5905 return dr_unaligned_supported
;
5909 vect_loop
= LOOP_VINFO_LOOP (loop_vinfo
);
5910 nested_in_vect_loop
= nested_in_vect_loop_p (vect_loop
, stmt
);
5913 /* Possibly unaligned access. */
5915 /* We can choose between using the implicit realignment scheme (generating
5916 a misaligned_move stmt) and the explicit realignment scheme (generating
5917 aligned loads with a REALIGN_LOAD). There are two variants to the
5918 explicit realignment scheme: optimized, and unoptimized.
5919 We can optimize the realignment only if the step between consecutive
5920 vector loads is equal to the vector size. Since the vector memory
5921 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
5922 is guaranteed that the misalignment amount remains the same throughout the
5923 execution of the vectorized loop. Therefore, we can create the
5924 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
5925 at the loop preheader.
5927 However, in the case of outer-loop vectorization, when vectorizing a
5928 memory access in the inner-loop nested within the LOOP that is now being
5929 vectorized, while it is guaranteed that the misalignment of the
5930 vectorized memory access will remain the same in different outer-loop
5931 iterations, it is *not* guaranteed that is will remain the same throughout
5932 the execution of the inner-loop. This is because the inner-loop advances
5933 with the original scalar step (and not in steps of VS). If the inner-loop
5934 step happens to be a multiple of VS, then the misalignment remains fixed
5935 and we can use the optimized realignment scheme. For example:
5941 When vectorizing the i-loop in the above example, the step between
5942 consecutive vector loads is 1, and so the misalignment does not remain
5943 fixed across the execution of the inner-loop, and the realignment cannot
5944 be optimized (as illustrated in the following pseudo vectorized loop):
5946 for (i=0; i<N; i+=4)
5947 for (j=0; j<M; j++){
5948 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
5949 // when j is {0,1,2,3,4,5,6,7,...} respectively.
5950 // (assuming that we start from an aligned address).
5953 We therefore have to use the unoptimized realignment scheme:
5955 for (i=0; i<N; i+=4)
5956 for (j=k; j<M; j+=4)
5957 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
5958 // that the misalignment of the initial address is
5961 The loop can then be vectorized as follows:
5963 for (k=0; k<4; k++){
5964 rt = get_realignment_token (&vp[k]);
5965 for (i=0; i<N; i+=4){
5967 for (j=k; j<M; j+=4){
5969 va = REALIGN_LOAD <v1,v2,rt>;
5976 if (DR_IS_READ (dr
))
5978 bool is_packed
= false;
5979 tree type
= (TREE_TYPE (DR_REF (dr
)));
5981 if (optab_handler (vec_realign_load_optab
, mode
) != CODE_FOR_nothing
5982 && (!targetm
.vectorize
.builtin_mask_for_load
5983 || targetm
.vectorize
.builtin_mask_for_load ()))
5985 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
5987 /* If we are doing SLP then the accesses need not have the
5988 same alignment, instead it depends on the SLP group size. */
5990 && STMT_SLP_TYPE (stmt_info
)
5991 && (LOOP_VINFO_VECT_FACTOR (loop_vinfo
)
5992 * GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info
)))
5993 % TYPE_VECTOR_SUBPARTS (vectype
) != 0))
5995 else if (!loop_vinfo
5996 || (nested_in_vect_loop
5997 && (TREE_INT_CST_LOW (DR_STEP (dr
))
5998 != GET_MODE_SIZE (TYPE_MODE (vectype
)))))
5999 return dr_explicit_realign
;
6001 return dr_explicit_realign_optimized
;
6003 if (!known_alignment_for_access_p (dr
))
6004 is_packed
= not_size_aligned (DR_REF (dr
));
6006 if ((TYPE_USER_ALIGN (type
) && !is_packed
)
6007 || targetm
.vectorize
.
6008 support_vector_misalignment (mode
, type
,
6009 DR_MISALIGNMENT (dr
), is_packed
))
6010 /* Can't software pipeline the loads, but can at least do them. */
6011 return dr_unaligned_supported
;
6015 bool is_packed
= false;
6016 tree type
= (TREE_TYPE (DR_REF (dr
)));
6018 if (!known_alignment_for_access_p (dr
))
6019 is_packed
= not_size_aligned (DR_REF (dr
));
6021 if ((TYPE_USER_ALIGN (type
) && !is_packed
)
6022 || targetm
.vectorize
.
6023 support_vector_misalignment (mode
, type
,
6024 DR_MISALIGNMENT (dr
), is_packed
))
6025 return dr_unaligned_supported
;
6029 return dr_unaligned_unsupported
;