1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2015 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
24 #include "coretypes.h"
33 #include "optabs-tree.h"
37 #include "fold-const.h"
38 #include "stor-layout.h"
41 #include "gimple-iterator.h"
42 #include "gimplify-me.h"
43 #include "tree-ssa-loop-ivopts.h"
44 #include "tree-ssa-loop-manip.h"
45 #include "tree-ssa-loop.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
53 /* Return true if load- or store-lanes optab OPTAB is implemented for
54 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
57 vect_lanes_optab_supported_p (const char *name
, convert_optab optab
,
58 tree vectype
, unsigned HOST_WIDE_INT count
)
60 machine_mode mode
, array_mode
;
63 mode
= TYPE_MODE (vectype
);
64 limit_p
= !targetm
.array_mode_supported_p (mode
, count
);
65 array_mode
= mode_for_size (count
* GET_MODE_BITSIZE (mode
),
68 if (array_mode
== BLKmode
)
70 if (dump_enabled_p ())
71 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
72 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC
"]\n",
73 GET_MODE_NAME (mode
), count
);
77 if (convert_optab_handler (optab
, array_mode
, mode
) == CODE_FOR_nothing
)
79 if (dump_enabled_p ())
80 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
81 "cannot use %s<%s><%s>\n", name
,
82 GET_MODE_NAME (array_mode
), GET_MODE_NAME (mode
));
86 if (dump_enabled_p ())
87 dump_printf_loc (MSG_NOTE
, vect_location
,
88 "can use %s<%s><%s>\n", name
, GET_MODE_NAME (array_mode
),
89 GET_MODE_NAME (mode
));
95 /* Return the smallest scalar part of STMT.
96 This is used to determine the vectype of the stmt. We generally set the
97 vectype according to the type of the result (lhs). For stmts whose
98 result-type is different than the type of the arguments (e.g., demotion,
99 promotion), vectype will be reset appropriately (later). Note that we have
100 to visit the smallest datatype in this function, because that determines the
101 VF. If the smallest datatype in the loop is present only as the rhs of a
102 promotion operation - we'd miss it.
103 Such a case, where a variable of this datatype does not appear in the lhs
104 anywhere in the loop, can only occur if it's an invariant: e.g.:
105 'int_x = (int) short_inv', which we'd expect to have been optimized away by
106 invariant motion. However, we cannot rely on invariant motion to always
107 take invariants out of the loop, and so in the case of promotion we also
108 have to check the rhs.
109 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
113 vect_get_smallest_scalar_type (gimple
*stmt
, HOST_WIDE_INT
*lhs_size_unit
,
114 HOST_WIDE_INT
*rhs_size_unit
)
116 tree scalar_type
= gimple_expr_type (stmt
);
117 HOST_WIDE_INT lhs
, rhs
;
119 lhs
= rhs
= TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type
));
121 if (is_gimple_assign (stmt
)
122 && (gimple_assign_cast_p (stmt
)
123 || gimple_assign_rhs_code (stmt
) == WIDEN_MULT_EXPR
124 || gimple_assign_rhs_code (stmt
) == WIDEN_LSHIFT_EXPR
125 || gimple_assign_rhs_code (stmt
) == FLOAT_EXPR
))
127 tree rhs_type
= TREE_TYPE (gimple_assign_rhs1 (stmt
));
129 rhs
= TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type
));
131 scalar_type
= rhs_type
;
134 *lhs_size_unit
= lhs
;
135 *rhs_size_unit
= rhs
;
140 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
141 tested at run-time. Return TRUE if DDR was successfully inserted.
142 Return false if versioning is not supported. */
145 vect_mark_for_runtime_alias_test (ddr_p ddr
, loop_vec_info loop_vinfo
)
147 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
149 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS
) == 0)
152 if (dump_enabled_p ())
154 dump_printf_loc (MSG_NOTE
, vect_location
,
155 "mark for run-time aliasing test between ");
156 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (DDR_A (ddr
)));
157 dump_printf (MSG_NOTE
, " and ");
158 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (DDR_B (ddr
)));
159 dump_printf (MSG_NOTE
, "\n");
162 if (optimize_loop_nest_for_size_p (loop
))
164 if (dump_enabled_p ())
165 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
166 "versioning not supported when optimizing"
171 /* FORNOW: We don't support versioning with outer-loop vectorization. */
174 if (dump_enabled_p ())
175 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
176 "versioning not yet supported for outer-loops.\n");
180 /* FORNOW: We don't support creating runtime alias tests for non-constant
182 if (TREE_CODE (DR_STEP (DDR_A (ddr
))) != INTEGER_CST
183 || TREE_CODE (DR_STEP (DDR_B (ddr
))) != INTEGER_CST
)
185 if (dump_enabled_p ())
186 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
187 "versioning not yet supported for non-constant "
192 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo
).safe_push (ddr
);
197 /* Function vect_analyze_data_ref_dependence.
199 Return TRUE if there (might) exist a dependence between a memory-reference
200 DRA and a memory-reference DRB. When versioning for alias may check a
201 dependence at run-time, return FALSE. Adjust *MAX_VF according to
202 the data dependence. */
205 vect_analyze_data_ref_dependence (struct data_dependence_relation
*ddr
,
206 loop_vec_info loop_vinfo
, int *max_vf
)
209 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
210 struct data_reference
*dra
= DDR_A (ddr
);
211 struct data_reference
*drb
= DDR_B (ddr
);
212 stmt_vec_info stmtinfo_a
= vinfo_for_stmt (DR_STMT (dra
));
213 stmt_vec_info stmtinfo_b
= vinfo_for_stmt (DR_STMT (drb
));
214 lambda_vector dist_v
;
215 unsigned int loop_depth
;
217 /* In loop analysis all data references should be vectorizable. */
218 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a
)
219 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b
))
222 /* Independent data accesses. */
223 if (DDR_ARE_DEPENDENT (ddr
) == chrec_known
)
227 || (DR_IS_READ (dra
) && DR_IS_READ (drb
)))
230 /* Even if we have an anti-dependence then, as the vectorized loop covers at
231 least two scalar iterations, there is always also a true dependence.
232 As the vectorizer does not re-order loads and stores we can ignore
233 the anti-dependence if TBAA can disambiguate both DRs similar to the
234 case with known negative distance anti-dependences (positive
235 distance anti-dependences would violate TBAA constraints). */
236 if (((DR_IS_READ (dra
) && DR_IS_WRITE (drb
))
237 || (DR_IS_WRITE (dra
) && DR_IS_READ (drb
)))
238 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra
)),
239 get_alias_set (DR_REF (drb
))))
242 /* Unknown data dependence. */
243 if (DDR_ARE_DEPENDENT (ddr
) == chrec_dont_know
)
245 /* If user asserted safelen consecutive iterations can be
246 executed concurrently, assume independence. */
247 if (loop
->safelen
>= 2)
249 if (loop
->safelen
< *max_vf
)
250 *max_vf
= loop
->safelen
;
251 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo
) = false;
255 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a
)
256 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b
))
258 if (dump_enabled_p ())
260 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
261 "versioning for alias not supported for: "
262 "can't determine dependence between ");
263 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
265 dump_printf (MSG_MISSED_OPTIMIZATION
, " and ");
266 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
268 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
273 if (dump_enabled_p ())
275 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
276 "versioning for alias required: "
277 "can't determine dependence between ");
278 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
280 dump_printf (MSG_MISSED_OPTIMIZATION
, " and ");
281 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
283 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
286 /* Add to list of ddrs that need to be tested at run-time. */
287 return !vect_mark_for_runtime_alias_test (ddr
, loop_vinfo
);
290 /* Known data dependence. */
291 if (DDR_NUM_DIST_VECTS (ddr
) == 0)
293 /* If user asserted safelen consecutive iterations can be
294 executed concurrently, assume independence. */
295 if (loop
->safelen
>= 2)
297 if (loop
->safelen
< *max_vf
)
298 *max_vf
= loop
->safelen
;
299 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo
) = false;
303 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a
)
304 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b
))
306 if (dump_enabled_p ())
308 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
309 "versioning for alias not supported for: "
310 "bad dist vector for ");
311 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
313 dump_printf (MSG_MISSED_OPTIMIZATION
, " and ");
314 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
316 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
321 if (dump_enabled_p ())
323 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
324 "versioning for alias required: "
325 "bad dist vector for ");
326 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, DR_REF (dra
));
327 dump_printf (MSG_MISSED_OPTIMIZATION
, " and ");
328 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, DR_REF (drb
));
329 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
331 /* Add to list of ddrs that need to be tested at run-time. */
332 return !vect_mark_for_runtime_alias_test (ddr
, loop_vinfo
);
335 loop_depth
= index_in_loop_nest (loop
->num
, DDR_LOOP_NEST (ddr
));
336 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr
), i
, dist_v
)
338 int dist
= dist_v
[loop_depth
];
340 if (dump_enabled_p ())
341 dump_printf_loc (MSG_NOTE
, vect_location
,
342 "dependence distance = %d.\n", dist
);
346 if (dump_enabled_p ())
348 dump_printf_loc (MSG_NOTE
, vect_location
,
349 "dependence distance == 0 between ");
350 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dra
));
351 dump_printf (MSG_NOTE
, " and ");
352 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (drb
));
353 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
356 /* When we perform grouped accesses and perform implicit CSE
357 by detecting equal accesses and doing disambiguation with
358 runtime alias tests like for
366 where we will end up loading { a[i], a[i+1] } once, make
367 sure that inserting group loads before the first load and
368 stores after the last store will do the right thing.
369 Similar for groups like
373 where loads from the group interleave with the store. */
374 if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a
)
375 || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b
))
377 gimple
*earlier_stmt
;
378 earlier_stmt
= get_earlier_stmt (DR_STMT (dra
), DR_STMT (drb
));
380 (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt
))))
382 if (dump_enabled_p ())
383 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
384 "READ_WRITE dependence in interleaving."
393 if (dist
> 0 && DDR_REVERSED_P (ddr
))
395 /* If DDR_REVERSED_P the order of the data-refs in DDR was
396 reversed (to make distance vector positive), and the actual
397 distance is negative. */
398 if (dump_enabled_p ())
399 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
400 "dependence distance negative.\n");
401 /* Record a negative dependence distance to later limit the
402 amount of stmt copying / unrolling we can perform.
403 Only need to handle read-after-write dependence. */
405 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b
) == 0
406 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b
) > (unsigned)dist
))
407 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b
) = dist
;
412 && abs (dist
) < *max_vf
)
414 /* The dependence distance requires reduction of the maximal
415 vectorization factor. */
416 *max_vf
= abs (dist
);
417 if (dump_enabled_p ())
418 dump_printf_loc (MSG_NOTE
, vect_location
,
419 "adjusting maximal vectorization factor to %i\n",
423 if (abs (dist
) >= *max_vf
)
425 /* Dependence distance does not create dependence, as far as
426 vectorization is concerned, in this case. */
427 if (dump_enabled_p ())
428 dump_printf_loc (MSG_NOTE
, vect_location
,
429 "dependence distance >= VF.\n");
433 if (dump_enabled_p ())
435 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
436 "not vectorized, possible dependence "
437 "between data-refs ");
438 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dra
));
439 dump_printf (MSG_NOTE
, " and ");
440 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (drb
));
441 dump_printf (MSG_NOTE
, "\n");
450 /* Function vect_analyze_data_ref_dependences.
452 Examine all the data references in the loop, and make sure there do not
453 exist any data dependences between them. Set *MAX_VF according to
454 the maximum vectorization factor the data dependences allow. */
457 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo
, int *max_vf
)
460 struct data_dependence_relation
*ddr
;
462 if (dump_enabled_p ())
463 dump_printf_loc (MSG_NOTE
, vect_location
,
464 "=== vect_analyze_data_ref_dependences ===\n");
466 LOOP_VINFO_DDRS (loop_vinfo
)
467 .create (LOOP_VINFO_DATAREFS (loop_vinfo
).length ()
468 * LOOP_VINFO_DATAREFS (loop_vinfo
).length ());
469 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo
) = true;
470 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo
),
471 &LOOP_VINFO_DDRS (loop_vinfo
),
472 LOOP_VINFO_LOOP_NEST (loop_vinfo
), true))
475 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo
), i
, ddr
)
476 if (vect_analyze_data_ref_dependence (ddr
, loop_vinfo
, max_vf
))
483 /* Function vect_slp_analyze_data_ref_dependence.
485 Return TRUE if there (might) exist a dependence between a memory-reference
486 DRA and a memory-reference DRB. When versioning for alias may check a
487 dependence at run-time, return FALSE. Adjust *MAX_VF according to
488 the data dependence. */
491 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation
*ddr
)
493 struct data_reference
*dra
= DDR_A (ddr
);
494 struct data_reference
*drb
= DDR_B (ddr
);
496 /* We need to check dependences of statements marked as unvectorizable
497 as well, they still can prohibit vectorization. */
499 /* Independent data accesses. */
500 if (DDR_ARE_DEPENDENT (ddr
) == chrec_known
)
506 /* Read-read is OK. */
507 if (DR_IS_READ (dra
) && DR_IS_READ (drb
))
510 /* If dra and drb are part of the same interleaving chain consider
512 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra
)))
513 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra
)))
514 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb
)))))
517 /* Unknown data dependence. */
518 if (DDR_ARE_DEPENDENT (ddr
) == chrec_dont_know
)
520 if (dump_enabled_p ())
522 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
523 "can't determine dependence between ");
524 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, DR_REF (dra
));
525 dump_printf (MSG_MISSED_OPTIMIZATION
, " and ");
526 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, DR_REF (drb
));
527 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
530 else if (dump_enabled_p ())
532 dump_printf_loc (MSG_NOTE
, vect_location
,
533 "determined dependence between ");
534 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dra
));
535 dump_printf (MSG_NOTE
, " and ");
536 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (drb
));
537 dump_printf (MSG_NOTE
, "\n");
544 /* Analyze dependences involved in the transform of SLP NODE. STORES
545 contain the vector of scalar stores of this instance if we are
546 disambiguating the loads. */
549 vect_slp_analyze_node_dependences (slp_instance instance
, slp_tree node
,
550 vec
<gimple
*> stores
, gimple
*last_store
)
552 /* This walks over all stmts involved in the SLP load/store done
553 in NODE verifying we can sink them up to the last stmt in the
555 gimple
*last_access
= vect_find_last_scalar_stmt_in_slp (node
);
556 for (unsigned k
= 0; k
< SLP_INSTANCE_GROUP_SIZE (instance
); ++k
)
558 gimple
*access
= SLP_TREE_SCALAR_STMTS (node
)[k
];
559 if (access
== last_access
)
561 data_reference
*dr_a
= STMT_VINFO_DATA_REF (vinfo_for_stmt (access
));
562 for (gimple_stmt_iterator gsi
= gsi_for_stmt (access
);
563 gsi_stmt (gsi
) != last_access
; gsi_next (&gsi
))
565 gimple
*stmt
= gsi_stmt (gsi
);
566 if (! gimple_vuse (stmt
)
567 || (DR_IS_READ (dr_a
) && ! gimple_vdef (stmt
)))
570 /* If we couldn't record a (single) data reference for this
571 stmt we have to give up. */
572 /* ??? Here and below if dependence analysis fails we can resort
573 to the alias oracle which can handle more kinds of stmts. */
574 data_reference
*dr_b
= STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt
));
578 /* If we run into a store of this same instance (we've just
579 marked those) then delay dependence checking until we run
580 into the last store because this is where it will have
581 been sunk to (and we verify if we can do that as well). */
582 if (gimple_visited_p (stmt
))
584 if (stmt
!= last_store
)
588 FOR_EACH_VEC_ELT (stores
, i
, store
)
590 data_reference
*store_dr
591 = STMT_VINFO_DATA_REF (vinfo_for_stmt (store
));
592 ddr_p ddr
= initialize_data_dependence_relation
593 (dr_a
, store_dr
, vNULL
);
594 if (vect_slp_analyze_data_ref_dependence (ddr
))
596 free_dependence_relation (ddr
);
599 free_dependence_relation (ddr
);
603 ddr_p ddr
= initialize_data_dependence_relation (dr_a
, dr_b
, vNULL
);
604 if (vect_slp_analyze_data_ref_dependence (ddr
))
606 free_dependence_relation (ddr
);
609 free_dependence_relation (ddr
);
616 /* Function vect_analyze_data_ref_dependences.
618 Examine all the data references in the basic-block, and make sure there
619 do not exist any data dependences between them. Set *MAX_VF according to
620 the maximum vectorization factor the data dependences allow. */
623 vect_slp_analyze_instance_dependence (slp_instance instance
)
625 if (dump_enabled_p ())
626 dump_printf_loc (MSG_NOTE
, vect_location
,
627 "=== vect_slp_analyze_instance_dependence ===\n");
629 /* The stores of this instance are at the root of the SLP tree. */
630 slp_tree store
= SLP_INSTANCE_TREE (instance
);
631 if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store
)[0])))
634 /* Verify we can sink stores to the vectorized stmt insert location. */
635 gimple
*last_store
= NULL
;
638 if (! vect_slp_analyze_node_dependences (instance
, store
, vNULL
, NULL
))
641 /* Mark stores in this instance and remember the last one. */
642 last_store
= vect_find_last_scalar_stmt_in_slp (store
);
643 for (unsigned k
= 0; k
< SLP_INSTANCE_GROUP_SIZE (instance
); ++k
)
644 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store
)[k
], true);
649 /* Verify we can sink loads to the vectorized stmt insert location,
650 special-casing stores of this instance. */
653 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance
), i
, load
)
654 if (! vect_slp_analyze_node_dependences (instance
, load
,
656 ? SLP_TREE_SCALAR_STMTS (store
)
657 : vNULL
, last_store
))
663 /* Unset the visited flag. */
665 for (unsigned k
= 0; k
< SLP_INSTANCE_GROUP_SIZE (instance
); ++k
)
666 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store
)[k
], false);
671 /* Function vect_compute_data_ref_alignment
673 Compute the misalignment of the data reference DR.
676 1. If during the misalignment computation it is found that the data reference
677 cannot be vectorized then false is returned.
678 2. DR_MISALIGNMENT (DR) is defined.
680 FOR NOW: No analysis is actually performed. Misalignment is calculated
681 only for trivial cases. TODO. */
684 vect_compute_data_ref_alignment (struct data_reference
*dr
)
686 gimple
*stmt
= DR_STMT (dr
);
687 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
688 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
689 struct loop
*loop
= NULL
;
690 tree ref
= DR_REF (dr
);
692 tree base
, base_addr
;
693 tree misalign
= NULL_TREE
;
695 unsigned HOST_WIDE_INT alignment
;
697 if (dump_enabled_p ())
698 dump_printf_loc (MSG_NOTE
, vect_location
,
699 "vect_compute_data_ref_alignment:\n");
702 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
704 /* Initialize misalignment to unknown. */
705 SET_DR_MISALIGNMENT (dr
, -1);
707 if (tree_fits_shwi_p (DR_STEP (dr
)))
708 misalign
= DR_INIT (dr
);
709 aligned_to
= DR_ALIGNED_TO (dr
);
710 base_addr
= DR_BASE_ADDRESS (dr
);
711 vectype
= STMT_VINFO_VECTYPE (stmt_info
);
713 /* In case the dataref is in an inner-loop of the loop that is being
714 vectorized (LOOP), we use the base and misalignment information
715 relative to the outer-loop (LOOP). This is ok only if the misalignment
716 stays the same throughout the execution of the inner-loop, which is why
717 we have to check that the stride of the dataref in the inner-loop evenly
718 divides by the vector size. */
719 if (loop
&& nested_in_vect_loop_p (loop
, stmt
))
721 tree step
= DR_STEP (dr
);
723 if (tree_fits_shwi_p (step
)
724 && tree_to_shwi (step
) % GET_MODE_SIZE (TYPE_MODE (vectype
)) == 0)
726 if (dump_enabled_p ())
727 dump_printf_loc (MSG_NOTE
, vect_location
,
728 "inner step divides the vector-size.\n");
729 misalign
= STMT_VINFO_DR_INIT (stmt_info
);
730 aligned_to
= STMT_VINFO_DR_ALIGNED_TO (stmt_info
);
731 base_addr
= STMT_VINFO_DR_BASE_ADDRESS (stmt_info
);
735 if (dump_enabled_p ())
736 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
737 "inner step doesn't divide the vector-size.\n");
738 misalign
= NULL_TREE
;
742 /* Similarly we can only use base and misalignment information relative to
743 an innermost loop if the misalignment stays the same throughout the
744 execution of the loop. As above, this is the case if the stride of
745 the dataref evenly divides by the vector size. */
748 tree step
= DR_STEP (dr
);
749 unsigned vf
= loop
? LOOP_VINFO_VECT_FACTOR (loop_vinfo
) : 1;
751 if (tree_fits_shwi_p (step
)
752 && ((tree_to_shwi (step
) * vf
)
753 % GET_MODE_SIZE (TYPE_MODE (vectype
)) != 0))
755 if (dump_enabled_p ())
756 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
757 "step doesn't divide the vector-size.\n");
758 misalign
= NULL_TREE
;
762 /* To look at alignment of the base we have to preserve an inner MEM_REF
763 as that carries alignment information of the actual access. */
765 while (handled_component_p (base
))
766 base
= TREE_OPERAND (base
, 0);
767 if (TREE_CODE (base
) == MEM_REF
)
768 base
= build2 (MEM_REF
, TREE_TYPE (base
), base_addr
,
769 build_int_cst (TREE_TYPE (TREE_OPERAND (base
, 1)), 0));
770 unsigned int base_alignment
= get_object_alignment (base
);
772 if (base_alignment
>= TYPE_ALIGN (TREE_TYPE (vectype
)))
773 DR_VECT_AUX (dr
)->base_element_aligned
= true;
775 alignment
= TYPE_ALIGN_UNIT (vectype
);
777 if ((compare_tree_int (aligned_to
, alignment
) < 0)
780 if (dump_enabled_p ())
782 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
783 "Unknown alignment for access: ");
784 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, ref
);
785 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
790 if (base_alignment
< TYPE_ALIGN (vectype
))
792 /* Strip an inner MEM_REF to a bare decl if possible. */
793 if (TREE_CODE (base
) == MEM_REF
794 && integer_zerop (TREE_OPERAND (base
, 1))
795 && TREE_CODE (TREE_OPERAND (base
, 0)) == ADDR_EXPR
)
796 base
= TREE_OPERAND (TREE_OPERAND (base
, 0), 0);
798 if (!vect_can_force_dr_alignment_p (base
, TYPE_ALIGN (vectype
)))
800 if (dump_enabled_p ())
802 dump_printf_loc (MSG_NOTE
, vect_location
,
803 "can't force alignment of ref: ");
804 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, ref
);
805 dump_printf (MSG_NOTE
, "\n");
810 /* Force the alignment of the decl.
811 NOTE: This is the only change to the code we make during
812 the analysis phase, before deciding to vectorize the loop. */
813 if (dump_enabled_p ())
815 dump_printf_loc (MSG_NOTE
, vect_location
, "force alignment of ");
816 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, ref
);
817 dump_printf (MSG_NOTE
, "\n");
820 DR_VECT_AUX (dr
)->base_decl
= base
;
821 DR_VECT_AUX (dr
)->base_misaligned
= true;
822 DR_VECT_AUX (dr
)->base_element_aligned
= true;
825 /* If this is a backward running DR then first access in the larger
826 vectype actually is N-1 elements before the address in the DR.
827 Adjust misalign accordingly. */
828 if (tree_int_cst_sgn (DR_STEP (dr
)) < 0)
830 tree offset
= ssize_int (TYPE_VECTOR_SUBPARTS (vectype
) - 1);
831 /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
832 otherwise we wouldn't be here. */
833 offset
= fold_build2 (MULT_EXPR
, ssizetype
, offset
, DR_STEP (dr
));
834 /* PLUS because DR_STEP was negative. */
835 misalign
= size_binop (PLUS_EXPR
, misalign
, offset
);
838 SET_DR_MISALIGNMENT (dr
,
839 wi::mod_floor (misalign
, alignment
, SIGNED
).to_uhwi ());
841 if (dump_enabled_p ())
843 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
844 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr
));
845 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, ref
);
846 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
853 /* Function vect_update_misalignment_for_peel
855 DR - the data reference whose misalignment is to be adjusted.
856 DR_PEEL - the data reference whose misalignment is being made
857 zero in the vector loop by the peel.
858 NPEEL - the number of iterations in the peel loop if the misalignment
859 of DR_PEEL is known at compile time. */
862 vect_update_misalignment_for_peel (struct data_reference
*dr
,
863 struct data_reference
*dr_peel
, int npeel
)
866 vec
<dr_p
> same_align_drs
;
867 struct data_reference
*current_dr
;
868 int dr_size
= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr
))));
869 int dr_peel_size
= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel
))));
870 stmt_vec_info stmt_info
= vinfo_for_stmt (DR_STMT (dr
));
871 stmt_vec_info peel_stmt_info
= vinfo_for_stmt (DR_STMT (dr_peel
));
873 /* For interleaved data accesses the step in the loop must be multiplied by
874 the size of the interleaving group. */
875 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
876 dr_size
*= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info
)));
877 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info
))
878 dr_peel_size
*= GROUP_SIZE (peel_stmt_info
);
880 /* It can be assumed that the data refs with the same alignment as dr_peel
881 are aligned in the vector loop. */
883 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel
)));
884 FOR_EACH_VEC_ELT (same_align_drs
, i
, current_dr
)
886 if (current_dr
!= dr
)
888 gcc_assert (DR_MISALIGNMENT (dr
) / dr_size
==
889 DR_MISALIGNMENT (dr_peel
) / dr_peel_size
);
890 SET_DR_MISALIGNMENT (dr
, 0);
894 if (known_alignment_for_access_p (dr
)
895 && known_alignment_for_access_p (dr_peel
))
897 bool negative
= tree_int_cst_compare (DR_STEP (dr
), size_zero_node
) < 0;
898 int misal
= DR_MISALIGNMENT (dr
);
899 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
900 misal
+= negative
? -npeel
* dr_size
: npeel
* dr_size
;
901 misal
&= (TYPE_ALIGN (vectype
) / BITS_PER_UNIT
) - 1;
902 SET_DR_MISALIGNMENT (dr
, misal
);
906 if (dump_enabled_p ())
907 dump_printf_loc (MSG_NOTE
, vect_location
, "Setting misalignment to -1.\n");
908 SET_DR_MISALIGNMENT (dr
, -1);
912 /* Function verify_data_ref_alignment
914 Return TRUE if DR can be handled with respect to alignment. */
917 verify_data_ref_alignment (data_reference_p dr
)
919 enum dr_alignment_support supportable_dr_alignment
920 = vect_supportable_dr_alignment (dr
, false);
921 if (!supportable_dr_alignment
)
923 if (dump_enabled_p ())
926 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
927 "not vectorized: unsupported unaligned load.");
929 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
930 "not vectorized: unsupported unaligned "
933 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
935 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
940 if (supportable_dr_alignment
!= dr_aligned
&& dump_enabled_p ())
941 dump_printf_loc (MSG_NOTE
, vect_location
,
942 "Vectorizing an unaligned access.\n");
947 /* Function vect_verify_datarefs_alignment
949 Return TRUE if all data references in the loop can be
950 handled with respect to alignment. */
953 vect_verify_datarefs_alignment (loop_vec_info vinfo
)
955 vec
<data_reference_p
> datarefs
= vinfo
->datarefs
;
956 struct data_reference
*dr
;
959 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
961 gimple
*stmt
= DR_STMT (dr
);
962 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
964 if (!STMT_VINFO_RELEVANT_P (stmt_info
))
967 /* For interleaving, only the alignment of the first access matters. */
968 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
969 && GROUP_FIRST_ELEMENT (stmt_info
) != stmt
)
972 /* Strided accesses perform only component accesses, alignment is
973 irrelevant for them. */
974 if (STMT_VINFO_STRIDED_P (stmt_info
)
975 && !STMT_VINFO_GROUPED_ACCESS (stmt_info
))
978 if (! verify_data_ref_alignment (dr
))
985 /* Given an memory reference EXP return whether its alignment is less
989 not_size_aligned (tree exp
)
991 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp
))))
994 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp
)))
995 > get_object_alignment (exp
));
998 /* Function vector_alignment_reachable_p
1000 Return true if vector alignment for DR is reachable by peeling
1001 a few loop iterations. Return false otherwise. */
1004 vector_alignment_reachable_p (struct data_reference
*dr
)
1006 gimple
*stmt
= DR_STMT (dr
);
1007 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
1008 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
1010 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1012 /* For interleaved access we peel only if number of iterations in
1013 the prolog loop ({VF - misalignment}), is a multiple of the
1014 number of the interleaved accesses. */
1015 int elem_size
, mis_in_elements
;
1016 int nelements
= TYPE_VECTOR_SUBPARTS (vectype
);
1018 /* FORNOW: handle only known alignment. */
1019 if (!known_alignment_for_access_p (dr
))
1022 elem_size
= GET_MODE_SIZE (TYPE_MODE (vectype
)) / nelements
;
1023 mis_in_elements
= DR_MISALIGNMENT (dr
) / elem_size
;
1025 if ((nelements
- mis_in_elements
) % GROUP_SIZE (stmt_info
))
1029 /* If misalignment is known at the compile time then allow peeling
1030 only if natural alignment is reachable through peeling. */
1031 if (known_alignment_for_access_p (dr
) && !aligned_access_p (dr
))
1033 HOST_WIDE_INT elmsize
=
1034 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype
)));
1035 if (dump_enabled_p ())
1037 dump_printf_loc (MSG_NOTE
, vect_location
,
1038 "data size =" HOST_WIDE_INT_PRINT_DEC
, elmsize
);
1039 dump_printf (MSG_NOTE
,
1040 ". misalignment = %d.\n", DR_MISALIGNMENT (dr
));
1042 if (DR_MISALIGNMENT (dr
) % elmsize
)
1044 if (dump_enabled_p ())
1045 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1046 "data size does not divide the misalignment.\n");
1051 if (!known_alignment_for_access_p (dr
))
1053 tree type
= TREE_TYPE (DR_REF (dr
));
1054 bool is_packed
= not_size_aligned (DR_REF (dr
));
1055 if (dump_enabled_p ())
1056 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1057 "Unknown misalignment, is_packed = %d\n",is_packed
);
1058 if ((TYPE_USER_ALIGN (type
) && !is_packed
)
1059 || targetm
.vectorize
.vector_alignment_reachable (type
, is_packed
))
1069 /* Calculate the cost of the memory access represented by DR. */
1072 vect_get_data_access_cost (struct data_reference
*dr
,
1073 unsigned int *inside_cost
,
1074 unsigned int *outside_cost
,
1075 stmt_vector_for_cost
*body_cost_vec
)
1077 gimple
*stmt
= DR_STMT (dr
);
1078 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
1079 int nunits
= TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info
));
1080 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
1081 int vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
1082 int ncopies
= vf
/ nunits
;
1084 if (DR_IS_READ (dr
))
1085 vect_get_load_cost (dr
, ncopies
, true, inside_cost
, outside_cost
,
1086 NULL
, body_cost_vec
, false);
1088 vect_get_store_cost (dr
, ncopies
, inside_cost
, body_cost_vec
);
1090 if (dump_enabled_p ())
1091 dump_printf_loc (MSG_NOTE
, vect_location
,
1092 "vect_get_data_access_cost: inside_cost = %d, "
1093 "outside_cost = %d.\n", *inside_cost
, *outside_cost
);
/* One candidate peeling amount: peeling by NPEEL iterations aligns DR;
   COUNT tracks how many data references this choice would align.  */
typedef struct _vect_peel_info
{
  int npeel;
  struct data_reference *dr;
  unsigned int count;
} *vect_peel_info;
1104 typedef struct _vect_peel_extended_info
1106 struct _vect_peel_info peel_info
;
1107 unsigned int inside_cost
;
1108 unsigned int outside_cost
;
1109 stmt_vector_for_cost body_cost_vec
;
1110 } *vect_peel_extended_info
;
1113 /* Peeling hashtable helpers. */
1115 struct peel_info_hasher
: free_ptr_hash
<_vect_peel_info
>
1117 static inline hashval_t
hash (const _vect_peel_info
*);
1118 static inline bool equal (const _vect_peel_info
*, const _vect_peel_info
*);
1122 peel_info_hasher::hash (const _vect_peel_info
*peel_info
)
1124 return (hashval_t
) peel_info
->npeel
;
1128 peel_info_hasher::equal (const _vect_peel_info
*a
, const _vect_peel_info
*b
)
1130 return (a
->npeel
== b
->npeel
);
1134 /* Insert DR into peeling hash table with NPEEL as key. */
1137 vect_peeling_hash_insert (hash_table
<peel_info_hasher
> *peeling_htab
,
1138 loop_vec_info loop_vinfo
, struct data_reference
*dr
,
1141 struct _vect_peel_info elem
, *slot
;
1142 _vect_peel_info
**new_slot
;
1143 bool supportable_dr_alignment
= vect_supportable_dr_alignment (dr
, true);
1146 slot
= peeling_htab
->find (&elem
);
1151 slot
= XNEW (struct _vect_peel_info
);
1152 slot
->npeel
= npeel
;
1155 new_slot
= peeling_htab
->find_slot (slot
, INSERT
);
1159 if (!supportable_dr_alignment
1160 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo
)))
1161 slot
->count
+= VECT_MAX_COST
;
1165 /* Traverse peeling hash table to find peeling option that aligns maximum
1166 number of data accesses. */
1169 vect_peeling_hash_get_most_frequent (_vect_peel_info
**slot
,
1170 _vect_peel_extended_info
*max
)
1172 vect_peel_info elem
= *slot
;
1174 if (elem
->count
> max
->peel_info
.count
1175 || (elem
->count
== max
->peel_info
.count
1176 && max
->peel_info
.npeel
> elem
->npeel
))
1178 max
->peel_info
.npeel
= elem
->npeel
;
1179 max
->peel_info
.count
= elem
->count
;
1180 max
->peel_info
.dr
= elem
->dr
;
1187 /* Traverse peeling hash table and calculate cost for each peeling option.
1188 Find the one with the lowest cost. */
1191 vect_peeling_hash_get_lowest_cost (_vect_peel_info
**slot
,
1192 _vect_peel_extended_info
*min
)
1194 vect_peel_info elem
= *slot
;
1195 int save_misalignment
, dummy
;
1196 unsigned int inside_cost
= 0, outside_cost
= 0, i
;
1197 gimple
*stmt
= DR_STMT (elem
->dr
);
1198 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
1199 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
1200 vec
<data_reference_p
> datarefs
= LOOP_VINFO_DATAREFS (loop_vinfo
);
1201 struct data_reference
*dr
;
1202 stmt_vector_for_cost prologue_cost_vec
, body_cost_vec
, epilogue_cost_vec
;
1204 prologue_cost_vec
.create (2);
1205 body_cost_vec
.create (2);
1206 epilogue_cost_vec
.create (2);
1208 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
1210 stmt
= DR_STMT (dr
);
1211 stmt_info
= vinfo_for_stmt (stmt
);
1212 /* For interleaving, only the alignment of the first access
1214 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1215 && GROUP_FIRST_ELEMENT (stmt_info
) != stmt
)
1218 save_misalignment
= DR_MISALIGNMENT (dr
);
1219 vect_update_misalignment_for_peel (dr
, elem
->dr
, elem
->npeel
);
1220 vect_get_data_access_cost (dr
, &inside_cost
, &outside_cost
,
1222 SET_DR_MISALIGNMENT (dr
, save_misalignment
);
1225 outside_cost
+= vect_get_known_peeling_cost
1226 (loop_vinfo
, elem
->npeel
, &dummy
,
1227 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo
),
1228 &prologue_cost_vec
, &epilogue_cost_vec
);
1230 /* Prologue and epilogue costs are added to the target model later.
1231 These costs depend only on the scalar iteration cost, the
1232 number of peeling iterations finally chosen, and the number of
1233 misaligned statements. So discard the information found here. */
1234 prologue_cost_vec
.release ();
1235 epilogue_cost_vec
.release ();
1237 if (inside_cost
< min
->inside_cost
1238 || (inside_cost
== min
->inside_cost
&& outside_cost
< min
->outside_cost
))
1240 min
->inside_cost
= inside_cost
;
1241 min
->outside_cost
= outside_cost
;
1242 min
->body_cost_vec
.release ();
1243 min
->body_cost_vec
= body_cost_vec
;
1244 min
->peel_info
.dr
= elem
->dr
;
1245 min
->peel_info
.npeel
= elem
->npeel
;
1248 body_cost_vec
.release ();
1254 /* Choose best peeling option by traversing peeling hash table and either
1255 choosing an option with the lowest cost (if cost model is enabled) or the
1256 option that aligns as many accesses as possible. */
1258 static struct data_reference
*
1259 vect_peeling_hash_choose_best_peeling (hash_table
<peel_info_hasher
> *peeling_htab
,
1260 loop_vec_info loop_vinfo
,
1261 unsigned int *npeel
,
1262 stmt_vector_for_cost
*body_cost_vec
)
1264 struct _vect_peel_extended_info res
;
1266 res
.peel_info
.dr
= NULL
;
1267 res
.body_cost_vec
= stmt_vector_for_cost ();
1269 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo
)))
1271 res
.inside_cost
= INT_MAX
;
1272 res
.outside_cost
= INT_MAX
;
1273 peeling_htab
->traverse
<_vect_peel_extended_info
*,
1274 vect_peeling_hash_get_lowest_cost
> (&res
);
1278 res
.peel_info
.count
= 0;
1279 peeling_htab
->traverse
<_vect_peel_extended_info
*,
1280 vect_peeling_hash_get_most_frequent
> (&res
);
1283 *npeel
= res
.peel_info
.npeel
;
1284 *body_cost_vec
= res
.body_cost_vec
;
1285 return res
.peel_info
.dr
;
1289 /* Function vect_enhance_data_refs_alignment
1291 This pass will use loop versioning and loop peeling in order to enhance
1292 the alignment of data references in the loop.
1294 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1295 original loop is to be vectorized. Any other loops that are created by
1296 the transformations performed in this pass - are not supposed to be
1297 vectorized. This restriction will be relaxed.
1299 This pass will require a cost model to guide it whether to apply peeling
1300 or versioning or a combination of the two. For example, the scheme that
1301 intel uses when given a loop with several memory accesses, is as follows:
1302 choose one memory access ('p') which alignment you want to force by doing
1303 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1304 other accesses are not necessarily aligned, or (2) use loop versioning to
1305 generate one loop in which all accesses are aligned, and another loop in
1306 which only 'p' is necessarily aligned.
1308 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1309 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
1310 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1312 Devising a cost model is the most critical aspect of this work. It will
1313 guide us on which access to peel for, whether to use loop versioning, how
1314 many versions to create, etc. The cost model will probably consist of
1315 generic considerations as well as target specific considerations (on
1316 powerpc for example, misaligned stores are more painful than misaligned
1319 Here are the general steps involved in alignment enhancements:
1321 -- original loop, before alignment analysis:
1322 for (i=0; i<N; i++){
1323 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1324 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1327 -- After vect_compute_data_refs_alignment:
1328 for (i=0; i<N; i++){
1329 x = q[i]; # DR_MISALIGNMENT(q) = 3
1330 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1333 -- Possibility 1: we do loop versioning:
1335 for (i=0; i<N; i++){ # loop 1A
1336 x = q[i]; # DR_MISALIGNMENT(q) = 3
1337 p[i] = y; # DR_MISALIGNMENT(p) = 0
1341 for (i=0; i<N; i++){ # loop 1B
1342 x = q[i]; # DR_MISALIGNMENT(q) = 3
1343 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1347 -- Possibility 2: we do loop peeling:
1348 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1352 for (i = 3; i < N; i++){ # loop 2A
1353 x = q[i]; # DR_MISALIGNMENT(q) = 0
1354 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1357 -- Possibility 3: combination of loop peeling and versioning:
1358 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1363 for (i = 3; i<N; i++){ # loop 3A
1364 x = q[i]; # DR_MISALIGNMENT(q) = 0
1365 p[i] = y; # DR_MISALIGNMENT(p) = 0
1369 for (i = 3; i<N; i++){ # loop 3B
1370 x = q[i]; # DR_MISALIGNMENT(q) = 0
1371 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1375 These loops are later passed to loop_transform to be vectorized. The
1376 vectorizer will use the alignment information to guide the transformation
1377 (whether to generate regular loads/stores, or with special handling for
1381 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo
)
1383 vec
<data_reference_p
> datarefs
= LOOP_VINFO_DATAREFS (loop_vinfo
);
1384 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1385 enum dr_alignment_support supportable_dr_alignment
;
1386 struct data_reference
*dr0
= NULL
, *first_store
= NULL
;
1387 struct data_reference
*dr
;
1389 bool do_peeling
= false;
1390 bool do_versioning
= false;
1393 stmt_vec_info stmt_info
;
1394 unsigned int npeel
= 0;
1395 bool all_misalignments_unknown
= true;
1396 unsigned int vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
1397 unsigned possible_npeel_number
= 1;
1399 unsigned int nelements
, mis
, same_align_drs_max
= 0;
1400 stmt_vector_for_cost body_cost_vec
= stmt_vector_for_cost ();
1401 hash_table
<peel_info_hasher
> peeling_htab (1);
1403 if (dump_enabled_p ())
1404 dump_printf_loc (MSG_NOTE
, vect_location
,
1405 "=== vect_enhance_data_refs_alignment ===\n");
1407 /* Reset data so we can safely be called multiple times. */
1408 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).truncate (0);
1409 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
) = 0;
1411 /* While cost model enhancements are expected in the future, the high level
1412 view of the code at this time is as follows:
1414 A) If there is a misaligned access then see if peeling to align
1415 this access can make all data references satisfy
1416 vect_supportable_dr_alignment. If so, update data structures
1417 as needed and return true.
1419 B) If peeling wasn't possible and there is a data reference with an
1420 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1421 then see if loop versioning checks can be used to make all data
1422 references satisfy vect_supportable_dr_alignment. If so, update
1423 data structures as needed and return true.
1425 C) If neither peeling nor versioning were successful then return false if
1426 any data reference does not satisfy vect_supportable_dr_alignment.
1428 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1430 Note, Possibility 3 above (which is peeling and versioning together) is not
1431 being done at this time. */
1433 /* (1) Peeling to force alignment. */
1435 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1437 + How many accesses will become aligned due to the peeling
1438 - How many accesses will become unaligned due to the peeling,
1439 and the cost of misaligned accesses.
1440 - The cost of peeling (the extra runtime checks, the increase
1443 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
1445 stmt
= DR_STMT (dr
);
1446 stmt_info
= vinfo_for_stmt (stmt
);
1448 if (!STMT_VINFO_RELEVANT_P (stmt_info
))
1451 /* For interleaving, only the alignment of the first access
1453 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1454 && GROUP_FIRST_ELEMENT (stmt_info
) != stmt
)
1457 /* For invariant accesses there is nothing to enhance. */
1458 if (integer_zerop (DR_STEP (dr
)))
1461 /* Strided accesses perform only component accesses, alignment is
1462 irrelevant for them. */
1463 if (STMT_VINFO_STRIDED_P (stmt_info
)
1464 && !STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1467 supportable_dr_alignment
= vect_supportable_dr_alignment (dr
, true);
1468 do_peeling
= vector_alignment_reachable_p (dr
);
1471 if (known_alignment_for_access_p (dr
))
1473 unsigned int npeel_tmp
;
1474 bool negative
= tree_int_cst_compare (DR_STEP (dr
),
1475 size_zero_node
) < 0;
1477 /* Save info about DR in the hash table. */
1478 vectype
= STMT_VINFO_VECTYPE (stmt_info
);
1479 nelements
= TYPE_VECTOR_SUBPARTS (vectype
);
1480 mis
= DR_MISALIGNMENT (dr
) / GET_MODE_SIZE (TYPE_MODE (
1481 TREE_TYPE (DR_REF (dr
))));
1482 npeel_tmp
= (negative
1483 ? (mis
- nelements
) : (nelements
- mis
))
1486 /* For multiple types, it is possible that the bigger type access
1487 will have more than one peeling option. E.g., a loop with two
1488 types: one of size (vector size / 4), and the other one of
1489 size (vector size / 8). Vectorization factor will 8. If both
1490 access are misaligned by 3, the first one needs one scalar
1491 iteration to be aligned, and the second one needs 5. But the
1492 the first one will be aligned also by peeling 5 scalar
1493 iterations, and in that case both accesses will be aligned.
1494 Hence, except for the immediate peeling amount, we also want
1495 to try to add full vector size, while we don't exceed
1496 vectorization factor.
1497 We do this automtically for cost model, since we calculate cost
1498 for every peeling option. */
1499 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo
)))
1501 if (STMT_SLP_TYPE (stmt_info
))
1502 possible_npeel_number
1503 = (vf
* GROUP_SIZE (stmt_info
)) / nelements
;
1505 possible_npeel_number
= vf
/ nelements
;
1508 /* Handle the aligned case. We may decide to align some other
1509 access, making DR unaligned. */
1510 if (DR_MISALIGNMENT (dr
) == 0)
1513 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo
)))
1514 possible_npeel_number
++;
1517 for (j
= 0; j
< possible_npeel_number
; j
++)
1519 vect_peeling_hash_insert (&peeling_htab
, loop_vinfo
,
1521 npeel_tmp
+= nelements
;
1524 all_misalignments_unknown
= false;
1525 /* Data-ref that was chosen for the case that all the
1526 misalignments are unknown is not relevant anymore, since we
1527 have a data-ref with known alignment. */
1532 /* If we don't know any misalignment values, we prefer
1533 peeling for data-ref that has the maximum number of data-refs
1534 with the same alignment, unless the target prefers to align
1535 stores over load. */
1536 if (all_misalignments_unknown
)
1538 unsigned same_align_drs
1539 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info
).length ();
1541 || same_align_drs_max
< same_align_drs
)
1543 same_align_drs_max
= same_align_drs
;
1546 /* For data-refs with the same number of related
1547 accesses prefer the one where the misalign
1548 computation will be invariant in the outermost loop. */
1549 else if (same_align_drs_max
== same_align_drs
)
1551 struct loop
*ivloop0
, *ivloop
;
1552 ivloop0
= outermost_invariant_loop_for_expr
1553 (loop
, DR_BASE_ADDRESS (dr0
));
1554 ivloop
= outermost_invariant_loop_for_expr
1555 (loop
, DR_BASE_ADDRESS (dr
));
1556 if ((ivloop
&& !ivloop0
)
1557 || (ivloop
&& ivloop0
1558 && flow_loop_nested_p (ivloop
, ivloop0
)))
1562 if (!first_store
&& DR_IS_WRITE (dr
))
1566 /* If there are both known and unknown misaligned accesses in the
1567 loop, we choose peeling amount according to the known
1569 if (!supportable_dr_alignment
)
1572 if (!first_store
&& DR_IS_WRITE (dr
))
1579 if (!aligned_access_p (dr
))
1581 if (dump_enabled_p ())
1582 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1583 "vector alignment may not be reachable\n");
1589 /* Check if we can possibly peel the loop. */
1590 if (!vect_can_advance_ivs_p (loop_vinfo
)
1591 || !slpeel_can_duplicate_loop_p (loop
, single_exit (loop
))
1596 && all_misalignments_unknown
1597 && vect_supportable_dr_alignment (dr0
, false))
1599 /* Check if the target requires to prefer stores over loads, i.e., if
1600 misaligned stores are more expensive than misaligned loads (taking
1601 drs with same alignment into account). */
1602 if (first_store
&& DR_IS_READ (dr0
))
1604 unsigned int load_inside_cost
= 0, load_outside_cost
= 0;
1605 unsigned int store_inside_cost
= 0, store_outside_cost
= 0;
1606 unsigned int load_inside_penalty
= 0, load_outside_penalty
= 0;
1607 unsigned int store_inside_penalty
= 0, store_outside_penalty
= 0;
1608 stmt_vector_for_cost dummy
;
1611 vect_get_data_access_cost (dr0
, &load_inside_cost
, &load_outside_cost
,
1613 vect_get_data_access_cost (first_store
, &store_inside_cost
,
1614 &store_outside_cost
, &dummy
);
1618 /* Calculate the penalty for leaving FIRST_STORE unaligned (by
1619 aligning the load DR0). */
1620 load_inside_penalty
= store_inside_cost
;
1621 load_outside_penalty
= store_outside_cost
;
1623 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1624 DR_STMT (first_store
))).iterate (i
, &dr
);
1626 if (DR_IS_READ (dr
))
1628 load_inside_penalty
+= load_inside_cost
;
1629 load_outside_penalty
+= load_outside_cost
;
1633 load_inside_penalty
+= store_inside_cost
;
1634 load_outside_penalty
+= store_outside_cost
;
1637 /* Calculate the penalty for leaving DR0 unaligned (by
1638 aligning the FIRST_STORE). */
1639 store_inside_penalty
= load_inside_cost
;
1640 store_outside_penalty
= load_outside_cost
;
1642 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1643 DR_STMT (dr0
))).iterate (i
, &dr
);
1645 if (DR_IS_READ (dr
))
1647 store_inside_penalty
+= load_inside_cost
;
1648 store_outside_penalty
+= load_outside_cost
;
1652 store_inside_penalty
+= store_inside_cost
;
1653 store_outside_penalty
+= store_outside_cost
;
1656 if (load_inside_penalty
> store_inside_penalty
1657 || (load_inside_penalty
== store_inside_penalty
1658 && load_outside_penalty
> store_outside_penalty
))
1662 /* In case there are only loads with different unknown misalignments, use
1663 peeling only if it may help to align other accesses in the loop or
1664 if it may help improving load bandwith when we'd end up using
1666 tree dr0_vt
= STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr0
)));
1668 && !STMT_VINFO_SAME_ALIGN_REFS (
1669 vinfo_for_stmt (DR_STMT (dr0
))).length ()
1670 && (vect_supportable_dr_alignment (dr0
, false)
1671 != dr_unaligned_supported
1672 || (builtin_vectorization_cost (vector_load
, dr0_vt
, 0)
1673 == builtin_vectorization_cost (unaligned_load
, dr0_vt
, -1))))
1677 if (do_peeling
&& !dr0
)
1679 /* Peeling is possible, but there is no data access that is not supported
1680 unless aligned. So we try to choose the best possible peeling. */
1682 /* We should get here only if there are drs with known misalignment. */
1683 gcc_assert (!all_misalignments_unknown
);
1685 /* Choose the best peeling from the hash table. */
1686 dr0
= vect_peeling_hash_choose_best_peeling (&peeling_htab
,
1695 stmt
= DR_STMT (dr0
);
1696 stmt_info
= vinfo_for_stmt (stmt
);
1697 vectype
= STMT_VINFO_VECTYPE (stmt_info
);
1698 nelements
= TYPE_VECTOR_SUBPARTS (vectype
);
1700 if (known_alignment_for_access_p (dr0
))
1702 bool negative
= tree_int_cst_compare (DR_STEP (dr0
),
1703 size_zero_node
) < 0;
1706 /* Since it's known at compile time, compute the number of
1707 iterations in the peeled loop (the peeling factor) for use in
1708 updating DR_MISALIGNMENT values. The peeling factor is the
1709 vectorization factor minus the misalignment as an element
1711 mis
= DR_MISALIGNMENT (dr0
);
1712 mis
/= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0
))));
1713 npeel
= ((negative
? mis
- nelements
: nelements
- mis
)
1717 /* For interleaved data access every iteration accesses all the
1718 members of the group, therefore we divide the number of iterations
1719 by the group size. */
1720 stmt_info
= vinfo_for_stmt (DR_STMT (dr0
));
1721 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1722 npeel
/= GROUP_SIZE (stmt_info
);
1724 if (dump_enabled_p ())
1725 dump_printf_loc (MSG_NOTE
, vect_location
,
1726 "Try peeling by %d\n", npeel
);
1729 /* Ensure that all data refs can be vectorized after the peel. */
1730 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
1732 int save_misalignment
;
1737 stmt
= DR_STMT (dr
);
1738 stmt_info
= vinfo_for_stmt (stmt
);
1739 /* For interleaving, only the alignment of the first access
1741 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1742 && GROUP_FIRST_ELEMENT (stmt_info
) != stmt
)
1745 /* Strided accesses perform only component accesses, alignment is
1746 irrelevant for them. */
1747 if (STMT_VINFO_STRIDED_P (stmt_info
)
1748 && !STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1751 save_misalignment
= DR_MISALIGNMENT (dr
);
1752 vect_update_misalignment_for_peel (dr
, dr0
, npeel
);
1753 supportable_dr_alignment
= vect_supportable_dr_alignment (dr
, false);
1754 SET_DR_MISALIGNMENT (dr
, save_misalignment
);
1756 if (!supportable_dr_alignment
)
1763 if (do_peeling
&& known_alignment_for_access_p (dr0
) && npeel
== 0)
1765 stat
= vect_verify_datarefs_alignment (loop_vinfo
);
1770 body_cost_vec
.release ();
1775 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
1778 unsigned max_allowed_peel
1779 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT
);
1780 if (max_allowed_peel
!= (unsigned)-1)
1782 unsigned max_peel
= npeel
;
1785 gimple
*dr_stmt
= DR_STMT (dr0
);
1786 stmt_vec_info vinfo
= vinfo_for_stmt (dr_stmt
);
1787 tree vtype
= STMT_VINFO_VECTYPE (vinfo
);
1788 max_peel
= TYPE_VECTOR_SUBPARTS (vtype
) - 1;
1790 if (max_peel
> max_allowed_peel
)
1793 if (dump_enabled_p ())
1794 dump_printf_loc (MSG_NOTE
, vect_location
,
1795 "Disable peeling, max peels reached: %d\n", max_peel
);
1800 /* Cost model #2 - if peeling may result in a remaining loop not
1801 iterating enough to be vectorized then do not peel. */
1803 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
))
1806 = npeel
== 0 ? LOOP_VINFO_VECT_FACTOR (loop_vinfo
) - 1 : npeel
;
1807 if (LOOP_VINFO_INT_NITERS (loop_vinfo
)
1808 < LOOP_VINFO_VECT_FACTOR (loop_vinfo
) + max_peel
)
1814 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1815 If the misalignment of DR_i is identical to that of dr0 then set
1816 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
1817 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1818 by the peeling factor times the element size of DR_i (MOD the
1819 vectorization factor times the size). Otherwise, the
1820 misalignment of DR_i must be set to unknown. */
1821 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
1823 vect_update_misalignment_for_peel (dr
, dr0
, npeel
);
1825 LOOP_VINFO_UNALIGNED_DR (loop_vinfo
) = dr0
;
1827 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
) = npeel
;
1829 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
)
1830 = DR_MISALIGNMENT (dr0
);
1831 SET_DR_MISALIGNMENT (dr0
, 0);
1832 if (dump_enabled_p ())
1834 dump_printf_loc (MSG_NOTE
, vect_location
,
1835 "Alignment of access forced using peeling.\n");
1836 dump_printf_loc (MSG_NOTE
, vect_location
,
1837 "Peeling for alignment will be applied.\n");
1839 /* The inside-loop cost will be accounted for in vectorizable_load
1840 and vectorizable_store correctly with adjusted alignments.
1841 Drop the body_cst_vec on the floor here. */
1842 body_cost_vec
.release ();
1844 stat
= vect_verify_datarefs_alignment (loop_vinfo
);
1850 body_cost_vec
.release ();
1852 /* (2) Versioning to force alignment. */
1854 /* Try versioning if:
1855 1) optimize loop for speed
1856 2) there is at least one unsupported misaligned data ref with an unknown
1858 3) all misaligned data refs with a known misalignment are supported, and
1859 4) the number of runtime alignment checks is within reason. */
1862 optimize_loop_nest_for_speed_p (loop
)
1863 && (!loop
->inner
); /* FORNOW */
1867 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
1869 stmt
= DR_STMT (dr
);
1870 stmt_info
= vinfo_for_stmt (stmt
);
1872 /* For interleaving, only the alignment of the first access
1874 if (aligned_access_p (dr
)
1875 || (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1876 && GROUP_FIRST_ELEMENT (stmt_info
) != stmt
))
1879 if (STMT_VINFO_STRIDED_P (stmt_info
))
1881 /* Strided loads perform only component accesses, alignment is
1882 irrelevant for them. */
1883 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1885 do_versioning
= false;
1889 supportable_dr_alignment
= vect_supportable_dr_alignment (dr
, false);
1891 if (!supportable_dr_alignment
)
1897 if (known_alignment_for_access_p (dr
)
1898 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).length ()
1899 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS
))
1901 do_versioning
= false;
1905 stmt
= DR_STMT (dr
);
1906 vectype
= STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt
));
1907 gcc_assert (vectype
);
1909 /* The rightmost bits of an aligned address must be zeros.
1910 Construct the mask needed for this test. For example,
1911 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
1912 mask must be 15 = 0xf. */
1913 mask
= GET_MODE_SIZE (TYPE_MODE (vectype
)) - 1;
1915 /* FORNOW: use the same mask to test all potentially unaligned
1916 references in the loop. The vectorizer currently supports
1917 a single vector size, see the reference to
1918 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
1919 vectorization factor is computed. */
1920 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo
)
1921 || LOOP_VINFO_PTR_MASK (loop_vinfo
) == mask
);
1922 LOOP_VINFO_PTR_MASK (loop_vinfo
) = mask
;
1923 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).safe_push (
1928 /* Versioning requires at least one misaligned data reference. */
1929 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo
))
1930 do_versioning
= false;
1931 else if (!do_versioning
)
1932 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).truncate (0);
1937 vec
<gimple
*> may_misalign_stmts
1938 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
);
1941 /* It can now be assumed that the data references in the statements
1942 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
1943 of the loop being vectorized. */
1944 FOR_EACH_VEC_ELT (may_misalign_stmts
, i
, stmt
)
1946 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
1947 dr
= STMT_VINFO_DATA_REF (stmt_info
);
1948 SET_DR_MISALIGNMENT (dr
, 0);
1949 if (dump_enabled_p ())
1950 dump_printf_loc (MSG_NOTE
, vect_location
,
1951 "Alignment of access forced using versioning.\n");
1954 if (dump_enabled_p ())
1955 dump_printf_loc (MSG_NOTE
, vect_location
,
1956 "Versioning for alignment will be applied.\n");
1958 /* Peeling and versioning can't be done together at this time. */
1959 gcc_assert (! (do_peeling
&& do_versioning
));
1961 stat
= vect_verify_datarefs_alignment (loop_vinfo
);
1966 /* This point is reached if neither peeling nor versioning is being done. */
1967 gcc_assert (! (do_peeling
|| do_versioning
));
1969 stat
= vect_verify_datarefs_alignment (loop_vinfo
);
1974 /* Function vect_find_same_alignment_drs.
1976 Update group and alignment relations according to the chosen
1977 vectorization factor. */
1980 vect_find_same_alignment_drs (struct data_dependence_relation
*ddr
,
1981 loop_vec_info loop_vinfo
)
1984 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1985 int vectorization_factor
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
1986 struct data_reference
*dra
= DDR_A (ddr
);
1987 struct data_reference
*drb
= DDR_B (ddr
);
1988 stmt_vec_info stmtinfo_a
= vinfo_for_stmt (DR_STMT (dra
));
1989 stmt_vec_info stmtinfo_b
= vinfo_for_stmt (DR_STMT (drb
));
1990 int dra_size
= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra
))));
1991 int drb_size
= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb
))));
1992 lambda_vector dist_v
;
1993 unsigned int loop_depth
;
1995 if (DDR_ARE_DEPENDENT (ddr
) == chrec_known
)
2001 if (DDR_ARE_DEPENDENT (ddr
) == chrec_dont_know
)
2004 /* Loop-based vectorization and known data dependence. */
2005 if (DDR_NUM_DIST_VECTS (ddr
) == 0)
2008 /* Data-dependence analysis reports a distance vector of zero
2009 for data-references that overlap only in the first iteration
2010 but have different sign step (see PR45764).
2011 So as a sanity check require equal DR_STEP. */
2012 if (!operand_equal_p (DR_STEP (dra
), DR_STEP (drb
), 0))
2015 loop_depth
= index_in_loop_nest (loop
->num
, DDR_LOOP_NEST (ddr
));
2016 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr
), i
, dist_v
)
2018 int dist
= dist_v
[loop_depth
];
2020 if (dump_enabled_p ())
2021 dump_printf_loc (MSG_NOTE
, vect_location
,
2022 "dependence distance = %d.\n", dist
);
2024 /* Same loop iteration. */
2026 || (dist
% vectorization_factor
== 0 && dra_size
== drb_size
))
2028 /* Two references with distance zero have the same alignment. */
2029 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a
).safe_push (drb
);
2030 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b
).safe_push (dra
);
2031 if (dump_enabled_p ())
2033 dump_printf_loc (MSG_NOTE
, vect_location
,
2034 "accesses have the same alignment.\n");
2035 dump_printf (MSG_NOTE
,
2036 "dependence distance modulo vf == 0 between ");
2037 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dra
));
2038 dump_printf (MSG_NOTE
, " and ");
2039 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (drb
));
2040 dump_printf (MSG_NOTE
, "\n");
2047 /* Function vect_analyze_data_refs_alignment
2049 Analyze the alignment of the data-references in the loop.
2050 Return FALSE if a data reference is found that cannot be vectorized. */
2053 vect_analyze_data_refs_alignment (loop_vec_info vinfo
)
2055 if (dump_enabled_p ())
2056 dump_printf_loc (MSG_NOTE
, vect_location
,
2057 "=== vect_analyze_data_refs_alignment ===\n");
2059 /* Mark groups of data references with same alignment using
2060 data dependence information. */
2061 vec
<ddr_p
> ddrs
= vinfo
->ddrs
;
2062 struct data_dependence_relation
*ddr
;
2065 FOR_EACH_VEC_ELT (ddrs
, i
, ddr
)
2066 vect_find_same_alignment_drs (ddr
, vinfo
);
2068 vec
<data_reference_p
> datarefs
= vinfo
->datarefs
;
2069 struct data_reference
*dr
;
2071 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
2073 stmt_vec_info stmt_info
= vinfo_for_stmt (DR_STMT (dr
));
2074 if (STMT_VINFO_VECTORIZABLE (stmt_info
)
2075 && !vect_compute_data_ref_alignment (dr
))
2077 /* Strided accesses perform only component accesses, misalignment
2078 information is irrelevant for them. */
2079 if (STMT_VINFO_STRIDED_P (stmt_info
)
2080 && !STMT_VINFO_GROUPED_ACCESS (stmt_info
))
2083 if (dump_enabled_p ())
2084 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2085 "not vectorized: can't calculate alignment "
2096 /* Analyze alignment of DRs of stmts in NODE. */
2099 vect_slp_analyze_and_verify_node_alignment (slp_tree node
)
2101 /* We vectorize from the first scalar stmt in the node unless
2102 the node is permuted in which case we start from the first
2103 element in the group. */
2104 gimple
*first_stmt
= SLP_TREE_SCALAR_STMTS (node
)[0];
2105 data_reference_p first_dr
= STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt
));
2106 if (SLP_TREE_LOAD_PERMUTATION (node
).exists ())
2107 first_stmt
= GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt
));
2109 data_reference_p dr
= STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt
));
2110 if (! vect_compute_data_ref_alignment (dr
)
2111 /* For creating the data-ref pointer we need alignment of the
2112 first element anyway. */
2114 && ! vect_compute_data_ref_alignment (first_dr
))
2115 || ! verify_data_ref_alignment (dr
))
2117 if (dump_enabled_p ())
2118 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2119 "not vectorized: bad data alignment in basic "
2127 /* Function vect_slp_analyze_instance_alignment
2129 Analyze the alignment of the data-references in the SLP instance.
2130 Return FALSE if a data reference is found that cannot be vectorized. */
2133 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance
)
2135 if (dump_enabled_p ())
2136 dump_printf_loc (MSG_NOTE
, vect_location
,
2137 "=== vect_slp_analyze_and_verify_instance_alignment ===\n");
2141 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance
), i
, node
)
2142 if (! vect_slp_analyze_and_verify_node_alignment (node
))
2145 node
= SLP_INSTANCE_TREE (instance
);
2146 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node
)[0]))
2147 && ! vect_slp_analyze_and_verify_node_alignment
2148 (SLP_INSTANCE_TREE (instance
)))
2155 /* Analyze groups of accesses: check that DR belongs to a group of
2156 accesses of legal size, step, etc. Detect gaps, single element
2157 interleaving, and other special cases. Set grouped access info.
2158 Collect groups of strided stores for further use in SLP analysis.
2159 Worker for vect_analyze_group_access. */
2162 vect_analyze_group_access_1 (struct data_reference
*dr
)
2164 tree step
= DR_STEP (dr
);
2165 tree scalar_type
= TREE_TYPE (DR_REF (dr
));
2166 HOST_WIDE_INT type_size
= TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type
));
2167 gimple
*stmt
= DR_STMT (dr
);
2168 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
2169 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
2170 bb_vec_info bb_vinfo
= STMT_VINFO_BB_VINFO (stmt_info
);
2171 HOST_WIDE_INT dr_step
= -1;
2172 HOST_WIDE_INT groupsize
, last_accessed_element
= 1;
2173 bool slp_impossible
= false;
2175 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2176 size of the interleaving group (including gaps). */
2177 if (tree_fits_shwi_p (step
))
2179 dr_step
= tree_to_shwi (step
);
2180 /* Check that STEP is a multiple of type size. Otherwise there is
2181 a non-element-sized gap at the end of the group which we
2182 cannot represent in GROUP_GAP or GROUP_SIZE.
2183 ??? As we can handle non-constant step fine here we should
2184 simply remove uses of GROUP_GAP between the last and first
2185 element and instead rely on DR_STEP. GROUP_SIZE then would
2186 simply not include that gap. */
2187 if ((dr_step
% type_size
) != 0)
2189 if (dump_enabled_p ())
2191 dump_printf_loc (MSG_NOTE
, vect_location
,
2193 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, step
);
2194 dump_printf (MSG_NOTE
,
2195 " is not a multiple of the element size for ");
2196 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dr
));
2197 dump_printf (MSG_NOTE
, "\n");
2201 groupsize
= absu_hwi (dr_step
) / type_size
;
2206 /* Not consecutive access is possible only if it is a part of interleaving. */
2207 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)))
2209 /* Check if it this DR is a part of interleaving, and is a single
2210 element of the group that is accessed in the loop. */
2212 /* Gaps are supported only for loads. STEP must be a multiple of the type
2213 size. The size of the group must be a power of 2. */
2215 && (dr_step
% type_size
) == 0
2217 && exact_log2 (groupsize
) != -1)
2219 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) = stmt
;
2220 GROUP_SIZE (vinfo_for_stmt (stmt
)) = groupsize
;
2221 if (dump_enabled_p ())
2223 dump_printf_loc (MSG_NOTE
, vect_location
,
2224 "Detected single element interleaving ");
2225 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dr
));
2226 dump_printf (MSG_NOTE
, " step ");
2227 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, step
);
2228 dump_printf (MSG_NOTE
, "\n");
2234 if (dump_enabled_p ())
2236 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2237 "not consecutive access ");
2238 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
2243 /* Mark the statement as unvectorizable. */
2244 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr
))) = false;
2248 dump_printf_loc (MSG_NOTE
, vect_location
, "using strided accesses\n");
2249 STMT_VINFO_STRIDED_P (stmt_info
) = true;
2253 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) == stmt
)
2255 /* First stmt in the interleaving chain. Check the chain. */
2256 gimple
*next
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt
));
2257 struct data_reference
*data_ref
= dr
;
2258 unsigned int count
= 1;
2259 tree prev_init
= DR_INIT (data_ref
);
2260 gimple
*prev
= stmt
;
2261 HOST_WIDE_INT diff
, gaps
= 0;
2265 /* Skip same data-refs. In case that two or more stmts share
2266 data-ref (supported only for loads), we vectorize only the first
2267 stmt, and the rest get their vectorized loads from the first
2269 if (!tree_int_cst_compare (DR_INIT (data_ref
),
2270 DR_INIT (STMT_VINFO_DATA_REF (
2271 vinfo_for_stmt (next
)))))
2273 if (DR_IS_WRITE (data_ref
))
2275 if (dump_enabled_p ())
2276 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2277 "Two store stmts share the same dr.\n");
2281 if (dump_enabled_p ())
2282 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2283 "Two or more load stmts share the same dr.\n");
2285 /* For load use the same data-ref load. */
2286 GROUP_SAME_DR_STMT (vinfo_for_stmt (next
)) = prev
;
2289 next
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (next
));
2294 data_ref
= STMT_VINFO_DATA_REF (vinfo_for_stmt (next
));
2296 /* All group members have the same STEP by construction. */
2297 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref
), step
, 0));
2299 /* Check that the distance between two accesses is equal to the type
2300 size. Otherwise, we have gaps. */
2301 diff
= (TREE_INT_CST_LOW (DR_INIT (data_ref
))
2302 - TREE_INT_CST_LOW (prev_init
)) / type_size
;
2305 /* FORNOW: SLP of accesses with gaps is not supported. */
2306 slp_impossible
= true;
2307 if (DR_IS_WRITE (data_ref
))
2309 if (dump_enabled_p ())
2310 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2311 "interleaved store with gaps\n");
2318 last_accessed_element
+= diff
;
2320 /* Store the gap from the previous member of the group. If there is no
2321 gap in the access, GROUP_GAP is always 1. */
2322 GROUP_GAP (vinfo_for_stmt (next
)) = diff
;
2324 prev_init
= DR_INIT (data_ref
);
2325 next
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (next
));
2326 /* Count the number of data-refs in the chain. */
2331 groupsize
= count
+ gaps
;
2333 if (groupsize
> UINT_MAX
)
2335 if (dump_enabled_p ())
2336 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2337 "group is too large\n");
2341 /* Check that the size of the interleaving is equal to count for stores,
2342 i.e., that there are no gaps. */
2343 if (groupsize
!= count
2344 && !DR_IS_READ (dr
))
2346 if (dump_enabled_p ())
2347 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2348 "interleaved store with gaps\n");
2352 /* If there is a gap after the last load in the group it is the
2353 difference between the groupsize and the last accessed
2355 When there is no gap, this difference should be 0. */
2356 GROUP_GAP (vinfo_for_stmt (stmt
)) = groupsize
- last_accessed_element
;
2358 GROUP_SIZE (vinfo_for_stmt (stmt
)) = groupsize
;
2359 if (dump_enabled_p ())
2361 dump_printf_loc (MSG_NOTE
, vect_location
,
2362 "Detected interleaving ");
2363 if (DR_IS_READ (dr
))
2364 dump_printf (MSG_NOTE
, "load ");
2366 dump_printf (MSG_NOTE
, "store ");
2367 dump_printf (MSG_NOTE
, "of size %u starting with ",
2368 (unsigned)groupsize
);
2369 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, stmt
, 0);
2370 if (GROUP_GAP (vinfo_for_stmt (stmt
)) != 0)
2371 dump_printf_loc (MSG_NOTE
, vect_location
,
2372 "There is a gap of %u elements after the group\n",
2373 GROUP_GAP (vinfo_for_stmt (stmt
)));
2376 /* SLP: create an SLP data structure for every interleaving group of
2377 stores for further analysis in vect_analyse_slp. */
2378 if (DR_IS_WRITE (dr
) && !slp_impossible
)
2381 LOOP_VINFO_GROUPED_STORES (loop_vinfo
).safe_push (stmt
);
2383 BB_VINFO_GROUPED_STORES (bb_vinfo
).safe_push (stmt
);
2390 /* Analyze groups of accesses: check that DR belongs to a group of
2391 accesses of legal size, step, etc. Detect gaps, single element
2392 interleaving, and other special cases. Set grouped access info.
2393 Collect groups of strided stores for further use in SLP analysis. */
2396 vect_analyze_group_access (struct data_reference
*dr
)
2398 if (!vect_analyze_group_access_1 (dr
))
2400 /* Dissolve the group if present. */
2402 gimple
*stmt
= GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr
)));
2405 stmt_vec_info vinfo
= vinfo_for_stmt (stmt
);
2406 next
= GROUP_NEXT_ELEMENT (vinfo
);
2407 GROUP_FIRST_ELEMENT (vinfo
) = NULL
;
2408 GROUP_NEXT_ELEMENT (vinfo
) = NULL
;
2416 /* Analyze the access pattern of the data-reference DR.
2417 In case of non-consecutive accesses call vect_analyze_group_access() to
2418 analyze groups of accesses. */
2421 vect_analyze_data_ref_access (struct data_reference
*dr
)
2423 tree step
= DR_STEP (dr
);
2424 tree scalar_type
= TREE_TYPE (DR_REF (dr
));
2425 gimple
*stmt
= DR_STMT (dr
);
2426 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
2427 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
2428 struct loop
*loop
= NULL
;
2431 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
2433 if (loop_vinfo
&& !step
)
2435 if (dump_enabled_p ())
2436 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2437 "bad data-ref access in loop\n");
2441 /* Allow loads with zero step in inner-loop vectorization. */
2442 if (loop_vinfo
&& integer_zerop (step
))
2444 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) = NULL
;
2445 if (!nested_in_vect_loop_p (loop
, stmt
))
2446 return DR_IS_READ (dr
);
2447 /* Allow references with zero step for outer loops marked
2448 with pragma omp simd only - it guarantees absence of
2449 loop-carried dependencies between inner loop iterations. */
2450 if (!loop
->force_vectorize
)
2452 if (dump_enabled_p ())
2453 dump_printf_loc (MSG_NOTE
, vect_location
,
2454 "zero step in inner loop of nest\n");
2459 if (loop
&& nested_in_vect_loop_p (loop
, stmt
))
2461 /* Interleaved accesses are not yet supported within outer-loop
2462 vectorization for references in the inner-loop. */
2463 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) = NULL
;
2465 /* For the rest of the analysis we use the outer-loop step. */
2466 step
= STMT_VINFO_DR_STEP (stmt_info
);
2467 if (integer_zerop (step
))
2469 if (dump_enabled_p ())
2470 dump_printf_loc (MSG_NOTE
, vect_location
,
2471 "zero step in outer loop.\n");
2472 return DR_IS_READ (dr
);
2477 if (TREE_CODE (step
) == INTEGER_CST
)
2479 HOST_WIDE_INT dr_step
= TREE_INT_CST_LOW (step
);
2480 if (!tree_int_cst_compare (step
, TYPE_SIZE_UNIT (scalar_type
))
2482 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type
), -dr_step
)))
2484 /* Mark that it is not interleaving. */
2485 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) = NULL
;
2490 if (loop
&& nested_in_vect_loop_p (loop
, stmt
))
2492 if (dump_enabled_p ())
2493 dump_printf_loc (MSG_NOTE
, vect_location
,
2494 "grouped access in outer loop.\n");
2499 /* Assume this is a DR handled by non-constant strided load case. */
2500 if (TREE_CODE (step
) != INTEGER_CST
)
2501 return (STMT_VINFO_STRIDED_P (stmt_info
)
2502 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info
)
2503 || vect_analyze_group_access (dr
)));
2505 /* Not consecutive access - check if it's a part of interleaving group. */
2506 return vect_analyze_group_access (dr
);
2511 /* A helper function used in the comparator function to sort data
2512 references. T1 and T2 are two data references to be compared.
2513 The function returns -1, 0, or 1. */
2516 compare_tree (tree t1
, tree t2
)
2519 enum tree_code code
;
2532 if (TREE_CODE (t1
) != TREE_CODE (t2
))
2533 return TREE_CODE (t1
) < TREE_CODE (t2
) ? -1 : 1;
2535 code
= TREE_CODE (t1
);
2538 /* For const values, we can just use hash values for comparisons. */
2546 hashval_t h1
= iterative_hash_expr (t1
, 0);
2547 hashval_t h2
= iterative_hash_expr (t2
, 0);
2549 return h1
< h2
? -1 : 1;
2554 cmp
= compare_tree (SSA_NAME_VAR (t1
), SSA_NAME_VAR (t2
));
2558 if (SSA_NAME_VERSION (t1
) != SSA_NAME_VERSION (t2
))
2559 return SSA_NAME_VERSION (t1
) < SSA_NAME_VERSION (t2
) ? -1 : 1;
2563 tclass
= TREE_CODE_CLASS (code
);
2565 /* For var-decl, we could compare their UIDs. */
2566 if (tclass
== tcc_declaration
)
2568 if (DECL_UID (t1
) != DECL_UID (t2
))
2569 return DECL_UID (t1
) < DECL_UID (t2
) ? -1 : 1;
2573 /* For expressions with operands, compare their operands recursively. */
2574 for (i
= TREE_OPERAND_LENGTH (t1
) - 1; i
>= 0; --i
)
2576 cmp
= compare_tree (TREE_OPERAND (t1
, i
), TREE_OPERAND (t2
, i
));
2586 /* Compare two data-references DRA and DRB to group them into chunks
2587 suitable for grouping. */
2590 dr_group_sort_cmp (const void *dra_
, const void *drb_
)
2592 data_reference_p dra
= *(data_reference_p
*)const_cast<void *>(dra_
);
2593 data_reference_p drb
= *(data_reference_p
*)const_cast<void *>(drb_
);
2596 /* Stabilize sort. */
2600 /* Ordering of DRs according to base. */
2601 if (!operand_equal_p (DR_BASE_ADDRESS (dra
), DR_BASE_ADDRESS (drb
), 0))
2603 cmp
= compare_tree (DR_BASE_ADDRESS (dra
), DR_BASE_ADDRESS (drb
));
2608 /* And according to DR_OFFSET. */
2609 if (!dr_equal_offsets_p (dra
, drb
))
2611 cmp
= compare_tree (DR_OFFSET (dra
), DR_OFFSET (drb
));
2616 /* Put reads before writes. */
2617 if (DR_IS_READ (dra
) != DR_IS_READ (drb
))
2618 return DR_IS_READ (dra
) ? -1 : 1;
2620 /* Then sort after access size. */
2621 if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra
))),
2622 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb
))), 0))
2624 cmp
= compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra
))),
2625 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb
))));
2630 /* And after step. */
2631 if (!operand_equal_p (DR_STEP (dra
), DR_STEP (drb
), 0))
2633 cmp
= compare_tree (DR_STEP (dra
), DR_STEP (drb
));
2638 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2639 cmp
= tree_int_cst_compare (DR_INIT (dra
), DR_INIT (drb
));
2641 return gimple_uid (DR_STMT (dra
)) < gimple_uid (DR_STMT (drb
)) ? -1 : 1;
2645 /* Function vect_analyze_data_ref_accesses.
2647 Analyze the access pattern of all the data references in the loop.
2649 FORNOW: the only access pattern that is considered vectorizable is a
2650 simple step 1 (consecutive) access.
2652 FORNOW: handle only arrays and pointer accesses. */
2655 vect_analyze_data_ref_accesses (vec_info
*vinfo
)
2658 vec
<data_reference_p
> datarefs
= vinfo
->datarefs
;
2659 struct data_reference
*dr
;
2661 if (dump_enabled_p ())
2662 dump_printf_loc (MSG_NOTE
, vect_location
,
2663 "=== vect_analyze_data_ref_accesses ===\n");
2665 if (datarefs
.is_empty ())
2668 /* Sort the array of datarefs to make building the interleaving chains
2669 linear. Don't modify the original vector's order, it is needed for
2670 determining what dependencies are reversed. */
2671 vec
<data_reference_p
> datarefs_copy
= datarefs
.copy ();
2672 datarefs_copy
.qsort (dr_group_sort_cmp
);
2674 /* Build the interleaving chains. */
2675 for (i
= 0; i
< datarefs_copy
.length () - 1;)
2677 data_reference_p dra
= datarefs_copy
[i
];
2678 stmt_vec_info stmtinfo_a
= vinfo_for_stmt (DR_STMT (dra
));
2679 stmt_vec_info lastinfo
= NULL
;
2680 for (i
= i
+ 1; i
< datarefs_copy
.length (); ++i
)
2682 data_reference_p drb
= datarefs_copy
[i
];
2683 stmt_vec_info stmtinfo_b
= vinfo_for_stmt (DR_STMT (drb
));
2685 /* ??? Imperfect sorting (non-compatible types, non-modulo
2686 accesses, same accesses) can lead to a group to be artificially
2687 split here as we don't just skip over those. If it really
2688 matters we can push those to a worklist and re-iterate
2689 over them. The we can just skip ahead to the next DR here. */
2691 /* Check that the data-refs have same first location (except init)
2692 and they are both either store or load (not load and store,
2693 not masked loads or stores). */
2694 if (DR_IS_READ (dra
) != DR_IS_READ (drb
)
2695 || !operand_equal_p (DR_BASE_ADDRESS (dra
),
2696 DR_BASE_ADDRESS (drb
), 0)
2697 || !dr_equal_offsets_p (dra
, drb
)
2698 || !gimple_assign_single_p (DR_STMT (dra
))
2699 || !gimple_assign_single_p (DR_STMT (drb
)))
2702 /* Check that the data-refs have the same constant size. */
2703 tree sza
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra
)));
2704 tree szb
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb
)));
2705 if (!tree_fits_uhwi_p (sza
)
2706 || !tree_fits_uhwi_p (szb
)
2707 || !tree_int_cst_equal (sza
, szb
))
2710 /* Check that the data-refs have the same step. */
2711 if (!operand_equal_p (DR_STEP (dra
), DR_STEP (drb
), 0))
2714 /* Do not place the same access in the interleaving chain twice. */
2715 if (tree_int_cst_compare (DR_INIT (dra
), DR_INIT (drb
)) == 0)
2718 /* Check the types are compatible.
2719 ??? We don't distinguish this during sorting. */
2720 if (!types_compatible_p (TREE_TYPE (DR_REF (dra
)),
2721 TREE_TYPE (DR_REF (drb
))))
2724 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2725 HOST_WIDE_INT init_a
= TREE_INT_CST_LOW (DR_INIT (dra
));
2726 HOST_WIDE_INT init_b
= TREE_INT_CST_LOW (DR_INIT (drb
));
2727 gcc_assert (init_a
< init_b
);
2729 /* If init_b == init_a + the size of the type * k, we have an
2730 interleaving, and DRA is accessed before DRB. */
2731 HOST_WIDE_INT type_size_a
= tree_to_uhwi (sza
);
2732 if (type_size_a
== 0
2733 || (init_b
- init_a
) % type_size_a
!= 0)
2736 /* If we have a store, the accesses are adjacent. This splits
2737 groups into chunks we support (we don't support vectorization
2738 of stores with gaps). */
2739 if (!DR_IS_READ (dra
)
2740 && (init_b
- (HOST_WIDE_INT
) TREE_INT_CST_LOW
2741 (DR_INIT (datarefs_copy
[i
-1]))
2745 /* If the step (if not zero or non-constant) is greater than the
2746 difference between data-refs' inits this splits groups into
2748 if (tree_fits_shwi_p (DR_STEP (dra
)))
2750 HOST_WIDE_INT step
= tree_to_shwi (DR_STEP (dra
));
2751 if (step
!= 0 && step
<= (init_b
- init_a
))
2755 if (dump_enabled_p ())
2757 dump_printf_loc (MSG_NOTE
, vect_location
,
2758 "Detected interleaving ");
2759 if (DR_IS_READ (dra
))
2760 dump_printf (MSG_NOTE
, "load ");
2762 dump_printf (MSG_NOTE
, "store ");
2763 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dra
));
2764 dump_printf (MSG_NOTE
, " and ");
2765 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (drb
));
2766 dump_printf (MSG_NOTE
, "\n");
2769 /* Link the found element into the group list. */
2770 if (!GROUP_FIRST_ELEMENT (stmtinfo_a
))
2772 GROUP_FIRST_ELEMENT (stmtinfo_a
) = DR_STMT (dra
);
2773 lastinfo
= stmtinfo_a
;
2775 GROUP_FIRST_ELEMENT (stmtinfo_b
) = DR_STMT (dra
);
2776 GROUP_NEXT_ELEMENT (lastinfo
) = DR_STMT (drb
);
2777 lastinfo
= stmtinfo_b
;
2781 FOR_EACH_VEC_ELT (datarefs_copy
, i
, dr
)
2782 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr
)))
2783 && !vect_analyze_data_ref_access (dr
))
2785 if (dump_enabled_p ())
2786 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2787 "not vectorized: complicated access pattern.\n");
2789 if (is_a
<bb_vec_info
> (vinfo
))
2791 /* Mark the statement as not vectorizable. */
2792 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr
))) = false;
2797 datarefs_copy
.release ();
2802 datarefs_copy
.release ();
2807 /* Operator == between two dr_with_seg_len objects.
2809 This equality operator is used to make sure two data refs
2810 are the same one so that we will consider to combine the
2811 aliasing checks of those two pairs of data dependent data
2815 operator == (const dr_with_seg_len
& d1
,
2816 const dr_with_seg_len
& d2
)
2818 return operand_equal_p (DR_BASE_ADDRESS (d1
.dr
),
2819 DR_BASE_ADDRESS (d2
.dr
), 0)
2820 && compare_tree (d1
.offset
, d2
.offset
) == 0
2821 && compare_tree (d1
.seg_len
, d2
.seg_len
) == 0;
2824 /* Function comp_dr_with_seg_len_pair.
2826 Comparison function for sorting objects of dr_with_seg_len_pair_t
2827 so that we can combine aliasing checks in one scan. */
2830 comp_dr_with_seg_len_pair (const void *p1_
, const void *p2_
)
2832 const dr_with_seg_len_pair_t
* p1
= (const dr_with_seg_len_pair_t
*) p1_
;
2833 const dr_with_seg_len_pair_t
* p2
= (const dr_with_seg_len_pair_t
*) p2_
;
2835 const dr_with_seg_len
&p11
= p1
->first
,
2840 /* For DR pairs (a, b) and (c, d), we only consider to merge the alias checks
2841 if a and c have the same basic address snd step, and b and d have the same
2842 address and step. Therefore, if any a&c or b&d don't have the same address
2843 and step, we don't care the order of those two pairs after sorting. */
2846 if ((comp_res
= compare_tree (DR_BASE_ADDRESS (p11
.dr
),
2847 DR_BASE_ADDRESS (p21
.dr
))) != 0)
2849 if ((comp_res
= compare_tree (DR_BASE_ADDRESS (p12
.dr
),
2850 DR_BASE_ADDRESS (p22
.dr
))) != 0)
2852 if ((comp_res
= compare_tree (DR_STEP (p11
.dr
), DR_STEP (p21
.dr
))) != 0)
2854 if ((comp_res
= compare_tree (DR_STEP (p12
.dr
), DR_STEP (p22
.dr
))) != 0)
2856 if ((comp_res
= compare_tree (p11
.offset
, p21
.offset
)) != 0)
2858 if ((comp_res
= compare_tree (p12
.offset
, p22
.offset
)) != 0)
2864 /* Function vect_vfa_segment_size.
2866 Create an expression that computes the size of segment
2867 that will be accessed for a data reference. The functions takes into
2868 account that realignment loads may access one more vector.
2871 DR: The data reference.
2872 LENGTH_FACTOR: segment length to consider.
2874 Return an expression whose value is the size of segment which will be
2878 vect_vfa_segment_size (struct data_reference
*dr
, tree length_factor
)
2880 tree segment_length
;
2882 if (integer_zerop (DR_STEP (dr
)))
2883 segment_length
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr
)));
2885 segment_length
= size_binop (MULT_EXPR
,
2886 fold_convert (sizetype
, DR_STEP (dr
)),
2887 fold_convert (sizetype
, length_factor
));
2889 if (vect_supportable_dr_alignment (dr
, false)
2890 == dr_explicit_realign_optimized
)
2892 tree vector_size
= TYPE_SIZE_UNIT
2893 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr
))));
2895 segment_length
= size_binop (PLUS_EXPR
, segment_length
, vector_size
);
2897 return segment_length
;
2900 /* Function vect_prune_runtime_alias_test_list.
2902 Prune a list of ddrs to be tested at run-time by versioning for alias.
2903 Merge several alias checks into one if possible.
2904 Return FALSE if resulting list of ddrs is longer then allowed by
2905 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
2908 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo
)
2910 vec
<ddr_p
> may_alias_ddrs
=
2911 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo
);
2912 vec
<dr_with_seg_len_pair_t
>& comp_alias_ddrs
=
2913 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo
);
2914 int vect_factor
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
2915 tree scalar_loop_iters
= LOOP_VINFO_NITERS (loop_vinfo
);
2921 if (dump_enabled_p ())
2922 dump_printf_loc (MSG_NOTE
, vect_location
,
2923 "=== vect_prune_runtime_alias_test_list ===\n");
2925 if (may_alias_ddrs
.is_empty ())
2928 /* Basically, for each pair of dependent data refs store_ptr_0
2929 and load_ptr_0, we create an expression:
2931 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2932 || (load_ptr_0 + load_segment_length_0) <= store_ptr_0))
2934 for aliasing checks. However, in some cases we can decrease
2935 the number of checks by combining two checks into one. For
2936 example, suppose we have another pair of data refs store_ptr_0
2937 and load_ptr_1, and if the following condition is satisfied:
2939 load_ptr_0 < load_ptr_1 &&
2940 load_ptr_1 - load_ptr_0 - load_segment_length_0 < store_segment_length_0
2942 (this condition means, in each iteration of vectorized loop,
2943 the accessed memory of store_ptr_0 cannot be between the memory
2944 of load_ptr_0 and load_ptr_1.)
2946 we then can use only the following expression to finish the
2947 alising checks between store_ptr_0 & load_ptr_0 and
2948 store_ptr_0 & load_ptr_1:
2950 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2951 || (load_ptr_1 + load_segment_length_1 <= store_ptr_0))
2953 Note that we only consider that load_ptr_0 and load_ptr_1 have the
2954 same basic address. */
2956 comp_alias_ddrs
.create (may_alias_ddrs
.length ());
2958 /* First, we collect all data ref pairs for aliasing checks. */
2959 FOR_EACH_VEC_ELT (may_alias_ddrs
, i
, ddr
)
2961 struct data_reference
*dr_a
, *dr_b
;
2962 gimple
*dr_group_first_a
, *dr_group_first_b
;
2963 tree segment_length_a
, segment_length_b
;
2964 gimple
*stmt_a
, *stmt_b
;
2967 stmt_a
= DR_STMT (DDR_A (ddr
));
2968 dr_group_first_a
= GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a
));
2969 if (dr_group_first_a
)
2971 stmt_a
= dr_group_first_a
;
2972 dr_a
= STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a
));
2976 stmt_b
= DR_STMT (DDR_B (ddr
));
2977 dr_group_first_b
= GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b
));
2978 if (dr_group_first_b
)
2980 stmt_b
= dr_group_first_b
;
2981 dr_b
= STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b
));
2984 if (!operand_equal_p (DR_STEP (dr_a
), DR_STEP (dr_b
), 0))
2985 length_factor
= scalar_loop_iters
;
2987 length_factor
= size_int (vect_factor
);
2988 segment_length_a
= vect_vfa_segment_size (dr_a
, length_factor
);
2989 segment_length_b
= vect_vfa_segment_size (dr_b
, length_factor
);
2991 dr_with_seg_len_pair_t dr_with_seg_len_pair
2992 (dr_with_seg_len (dr_a
, segment_length_a
),
2993 dr_with_seg_len (dr_b
, segment_length_b
));
2995 if (compare_tree (DR_BASE_ADDRESS (dr_a
), DR_BASE_ADDRESS (dr_b
)) > 0)
2996 std::swap (dr_with_seg_len_pair
.first
, dr_with_seg_len_pair
.second
);
2998 comp_alias_ddrs
.safe_push (dr_with_seg_len_pair
);
3001 /* Second, we sort the collected data ref pairs so that we can scan
3002 them once to combine all possible aliasing checks. */
3003 comp_alias_ddrs
.qsort (comp_dr_with_seg_len_pair
);
3005 /* Third, we scan the sorted dr pairs and check if we can combine
3006 alias checks of two neighbouring dr pairs. */
3007 for (size_t i
= 1; i
< comp_alias_ddrs
.length (); ++i
)
3009 /* Deal with two ddrs (dr_a1, dr_b1) and (dr_a2, dr_b2). */
3010 dr_with_seg_len
*dr_a1
= &comp_alias_ddrs
[i
-1].first
,
3011 *dr_b1
= &comp_alias_ddrs
[i
-1].second
,
3012 *dr_a2
= &comp_alias_ddrs
[i
].first
,
3013 *dr_b2
= &comp_alias_ddrs
[i
].second
;
3015 /* Remove duplicate data ref pairs. */
3016 if (*dr_a1
== *dr_a2
&& *dr_b1
== *dr_b2
)
3018 if (dump_enabled_p ())
3020 dump_printf_loc (MSG_NOTE
, vect_location
,
3021 "found equal ranges ");
3022 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3023 DR_REF (dr_a1
->dr
));
3024 dump_printf (MSG_NOTE
, ", ");
3025 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3026 DR_REF (dr_b1
->dr
));
3027 dump_printf (MSG_NOTE
, " and ");
3028 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3029 DR_REF (dr_a2
->dr
));
3030 dump_printf (MSG_NOTE
, ", ");
3031 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3032 DR_REF (dr_b2
->dr
));
3033 dump_printf (MSG_NOTE
, "\n");
3036 comp_alias_ddrs
.ordered_remove (i
--);
3040 if (*dr_a1
== *dr_a2
|| *dr_b1
== *dr_b2
)
3042 /* We consider the case that DR_B1 and DR_B2 are same memrefs,
3043 and DR_A1 and DR_A2 are two consecutive memrefs. */
3044 if (*dr_a1
== *dr_a2
)
3046 std::swap (dr_a1
, dr_b1
);
3047 std::swap (dr_a2
, dr_b2
);
3050 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a1
->dr
),
3051 DR_BASE_ADDRESS (dr_a2
->dr
),
3053 || !tree_fits_shwi_p (dr_a1
->offset
)
3054 || !tree_fits_shwi_p (dr_a2
->offset
))
3057 HOST_WIDE_INT diff
= (tree_to_shwi (dr_a2
->offset
)
3058 - tree_to_shwi (dr_a1
->offset
));
3061 /* Now we check if the following condition is satisfied:
3063 DIFF - SEGMENT_LENGTH_A < SEGMENT_LENGTH_B
3065 where DIFF = DR_A2->OFFSET - DR_A1->OFFSET. However,
3066 SEGMENT_LENGTH_A or SEGMENT_LENGTH_B may not be constant so we
3067 have to make a best estimation. We can get the minimum value
3068 of SEGMENT_LENGTH_B as a constant, represented by MIN_SEG_LEN_B,
3069 then either of the following two conditions can guarantee the
3072 1: DIFF <= MIN_SEG_LEN_B
3073 2: DIFF - SEGMENT_LENGTH_A < MIN_SEG_LEN_B
3077 HOST_WIDE_INT min_seg_len_b
= (tree_fits_shwi_p (dr_b1
->seg_len
)
3078 ? tree_to_shwi (dr_b1
->seg_len
)
3081 if (diff
<= min_seg_len_b
3082 || (tree_fits_shwi_p (dr_a1
->seg_len
)
3083 && diff
- tree_to_shwi (dr_a1
->seg_len
) < min_seg_len_b
))
3085 if (dump_enabled_p ())
3087 dump_printf_loc (MSG_NOTE
, vect_location
,
3088 "merging ranges for ");
3089 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3090 DR_REF (dr_a1
->dr
));
3091 dump_printf (MSG_NOTE
, ", ");
3092 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3093 DR_REF (dr_b1
->dr
));
3094 dump_printf (MSG_NOTE
, " and ");
3095 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3096 DR_REF (dr_a2
->dr
));
3097 dump_printf (MSG_NOTE
, ", ");
3098 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3099 DR_REF (dr_b2
->dr
));
3100 dump_printf (MSG_NOTE
, "\n");
3103 dr_a1
->seg_len
= size_binop (PLUS_EXPR
,
3104 dr_a2
->seg_len
, size_int (diff
));
3105 comp_alias_ddrs
.ordered_remove (i
--);
3110 dump_printf_loc (MSG_NOTE
, vect_location
,
3111 "improved number of alias checks from %d to %d\n",
3112 may_alias_ddrs
.length (), comp_alias_ddrs
.length ());
3113 if ((int) comp_alias_ddrs
.length () >
3114 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS
))
3120 /* Check whether a non-affine read or write in stmt is suitable for gather load
3121 or scatter store and if so, return a builtin decl for that operation. */
3124 vect_check_gather_scatter (gimple
*stmt
, loop_vec_info loop_vinfo
, tree
*basep
,
3125 tree
*offp
, int *scalep
)
3127 HOST_WIDE_INT scale
= 1, pbitpos
, pbitsize
;
3128 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
3129 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
3130 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
3131 tree offtype
= NULL_TREE
;
3132 tree decl
, base
, off
;
3134 int punsignedp
, reversep
, pvolatilep
= 0;
3137 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3138 see if we can use the def stmt of the address. */
3139 if (is_gimple_call (stmt
)
3140 && gimple_call_internal_p (stmt
)
3141 && (gimple_call_internal_fn (stmt
) == IFN_MASK_LOAD
3142 || gimple_call_internal_fn (stmt
) == IFN_MASK_STORE
)
3143 && TREE_CODE (base
) == MEM_REF
3144 && TREE_CODE (TREE_OPERAND (base
, 0)) == SSA_NAME
3145 && integer_zerop (TREE_OPERAND (base
, 1))
3146 && !expr_invariant_in_loop_p (loop
, TREE_OPERAND (base
, 0)))
3148 gimple
*def_stmt
= SSA_NAME_DEF_STMT (TREE_OPERAND (base
, 0));
3149 if (is_gimple_assign (def_stmt
)
3150 && gimple_assign_rhs_code (def_stmt
) == ADDR_EXPR
)
3151 base
= TREE_OPERAND (gimple_assign_rhs1 (def_stmt
), 0);
3154 /* The gather and scatter builtins need address of the form
3155 loop_invariant + vector * {1, 2, 4, 8}
3157 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3158 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3159 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3160 multiplications and additions in it. To get a vector, we need
3161 a single SSA_NAME that will be defined in the loop and will
3162 contain everything that is not loop invariant and that can be
3163 vectorized. The following code attempts to find such a preexistng
3164 SSA_NAME OFF and put the loop invariants into a tree BASE
3165 that can be gimplified before the loop. */
3166 base
= get_inner_reference (base
, &pbitsize
, &pbitpos
, &off
, &pmode
,
3167 &punsignedp
, &reversep
, &pvolatilep
, false);
3168 gcc_assert (base
&& (pbitpos
% BITS_PER_UNIT
) == 0 && !reversep
);
3170 if (TREE_CODE (base
) == MEM_REF
)
3172 if (!integer_zerop (TREE_OPERAND (base
, 1)))
3174 if (off
== NULL_TREE
)
3176 offset_int moff
= mem_ref_offset (base
);
3177 off
= wide_int_to_tree (sizetype
, moff
);
3180 off
= size_binop (PLUS_EXPR
, off
,
3181 fold_convert (sizetype
, TREE_OPERAND (base
, 1)));
3183 base
= TREE_OPERAND (base
, 0);
3186 base
= build_fold_addr_expr (base
);
3188 if (off
== NULL_TREE
)
3189 off
= size_zero_node
;
3191 /* If base is not loop invariant, either off is 0, then we start with just
3192 the constant offset in the loop invariant BASE and continue with base
3193 as OFF, otherwise give up.
3194 We could handle that case by gimplifying the addition of base + off
3195 into some SSA_NAME and use that as off, but for now punt. */
3196 if (!expr_invariant_in_loop_p (loop
, base
))
3198 if (!integer_zerop (off
))
3201 base
= size_int (pbitpos
/ BITS_PER_UNIT
);
3203 /* Otherwise put base + constant offset into the loop invariant BASE
3204 and continue with OFF. */
3207 base
= fold_convert (sizetype
, base
);
3208 base
= size_binop (PLUS_EXPR
, base
, size_int (pbitpos
/ BITS_PER_UNIT
));
3211 /* OFF at this point may be either a SSA_NAME or some tree expression
3212 from get_inner_reference. Try to peel off loop invariants from it
3213 into BASE as long as possible. */
3215 while (offtype
== NULL_TREE
)
3217 enum tree_code code
;
3218 tree op0
, op1
, add
= NULL_TREE
;
3220 if (TREE_CODE (off
) == SSA_NAME
)
3222 gimple
*def_stmt
= SSA_NAME_DEF_STMT (off
);
3224 if (expr_invariant_in_loop_p (loop
, off
))
3227 if (gimple_code (def_stmt
) != GIMPLE_ASSIGN
)
3230 op0
= gimple_assign_rhs1 (def_stmt
);
3231 code
= gimple_assign_rhs_code (def_stmt
);
3232 op1
= gimple_assign_rhs2 (def_stmt
);
3236 if (get_gimple_rhs_class (TREE_CODE (off
)) == GIMPLE_TERNARY_RHS
)
3238 code
= TREE_CODE (off
);
3239 extract_ops_from_tree (off
, &code
, &op0
, &op1
);
3243 case POINTER_PLUS_EXPR
:
3245 if (expr_invariant_in_loop_p (loop
, op0
))
3250 add
= fold_convert (sizetype
, add
);
3252 add
= size_binop (MULT_EXPR
, add
, size_int (scale
));
3253 base
= size_binop (PLUS_EXPR
, base
, add
);
3256 if (expr_invariant_in_loop_p (loop
, op1
))
3264 if (expr_invariant_in_loop_p (loop
, op1
))
3266 add
= fold_convert (sizetype
, op1
);
3267 add
= size_binop (MINUS_EXPR
, size_zero_node
, add
);
3273 if (scale
== 1 && tree_fits_shwi_p (op1
))
3275 scale
= tree_to_shwi (op1
);
3284 if (!POINTER_TYPE_P (TREE_TYPE (op0
))
3285 && !INTEGRAL_TYPE_P (TREE_TYPE (op0
)))
3287 if (TYPE_PRECISION (TREE_TYPE (op0
))
3288 == TYPE_PRECISION (TREE_TYPE (off
)))
3293 if (TYPE_PRECISION (TREE_TYPE (op0
))
3294 < TYPE_PRECISION (TREE_TYPE (off
)))
3297 offtype
= TREE_TYPE (off
);
3308 /* If at the end OFF still isn't a SSA_NAME or isn't
3309 defined in the loop, punt. */
3310 if (TREE_CODE (off
) != SSA_NAME
3311 || expr_invariant_in_loop_p (loop
, off
))
3314 if (offtype
== NULL_TREE
)
3315 offtype
= TREE_TYPE (off
);
3317 if (DR_IS_READ (dr
))
3318 decl
= targetm
.vectorize
.builtin_gather (STMT_VINFO_VECTYPE (stmt_info
),
3321 decl
= targetm
.vectorize
.builtin_scatter (STMT_VINFO_VECTYPE (stmt_info
),
3324 if (decl
== NULL_TREE
)
3336 /* Function vect_analyze_data_refs.
3338 Find all the data references in the loop or basic block.
3340 The general structure of the analysis of data refs in the vectorizer is as
3342 1- vect_analyze_data_refs(loop/bb): call
3343 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3344 in the loop/bb and their dependences.
3345 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3346 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3347 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3352 vect_analyze_data_refs (vec_info
*vinfo
, int *min_vf
)
3354 struct loop
*loop
= NULL
;
3356 struct data_reference
*dr
;
3359 if (dump_enabled_p ())
3360 dump_printf_loc (MSG_NOTE
, vect_location
,
3361 "=== vect_analyze_data_refs ===\n");
3363 if (loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
))
3364 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
3366 /* Go through the data-refs, check that the analysis succeeded. Update
3367 pointer from stmt_vec_info struct to DR and vectype. */
3369 vec
<data_reference_p
> datarefs
= vinfo
->datarefs
;
3370 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
3373 stmt_vec_info stmt_info
;
3374 tree base
, offset
, init
;
3375 enum { SG_NONE
, GATHER
, SCATTER
} gatherscatter
= SG_NONE
;
3376 bool simd_lane_access
= false;
3380 if (!dr
|| !DR_REF (dr
))
3382 if (dump_enabled_p ())
3383 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3384 "not vectorized: unhandled data-ref\n");
3388 stmt
= DR_STMT (dr
);
3389 stmt_info
= vinfo_for_stmt (stmt
);
3391 /* Discard clobbers from the dataref vector. We will remove
3392 clobber stmts during vectorization. */
3393 if (gimple_clobber_p (stmt
))
3396 if (i
== datarefs
.length () - 1)
3401 datarefs
.ordered_remove (i
);
3406 /* Check that analysis of the data-ref succeeded. */
3407 if (!DR_BASE_ADDRESS (dr
) || !DR_OFFSET (dr
) || !DR_INIT (dr
)
3412 && !TREE_THIS_VOLATILE (DR_REF (dr
))
3413 && targetm
.vectorize
.builtin_gather
!= NULL
;
3416 && !TREE_THIS_VOLATILE (DR_REF (dr
))
3417 && targetm
.vectorize
.builtin_scatter
!= NULL
;
3418 bool maybe_simd_lane_access
3419 = is_a
<loop_vec_info
> (vinfo
) && loop
->simduid
;
3421 /* If target supports vector gather loads or scatter stores, or if
3422 this might be a SIMD lane access, see if they can't be used. */
3423 if (is_a
<loop_vec_info
> (vinfo
)
3424 && (maybe_gather
|| maybe_scatter
|| maybe_simd_lane_access
)
3425 && !nested_in_vect_loop_p (loop
, stmt
))
3427 struct data_reference
*newdr
3428 = create_data_ref (NULL
, loop_containing_stmt (stmt
),
3429 DR_REF (dr
), stmt
, maybe_scatter
? false : true);
3430 gcc_assert (newdr
!= NULL
&& DR_REF (newdr
));
3431 if (DR_BASE_ADDRESS (newdr
)
3432 && DR_OFFSET (newdr
)
3435 && integer_zerop (DR_STEP (newdr
)))
3437 if (maybe_simd_lane_access
)
3439 tree off
= DR_OFFSET (newdr
);
3441 if (TREE_CODE (DR_INIT (newdr
)) == INTEGER_CST
3442 && TREE_CODE (off
) == MULT_EXPR
3443 && tree_fits_uhwi_p (TREE_OPERAND (off
, 1)))
3445 tree step
= TREE_OPERAND (off
, 1);
3446 off
= TREE_OPERAND (off
, 0);
3448 if (CONVERT_EXPR_P (off
)
3449 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off
,
3451 < TYPE_PRECISION (TREE_TYPE (off
)))
3452 off
= TREE_OPERAND (off
, 0);
3453 if (TREE_CODE (off
) == SSA_NAME
)
3455 gimple
*def
= SSA_NAME_DEF_STMT (off
);
3456 tree reft
= TREE_TYPE (DR_REF (newdr
));
3457 if (is_gimple_call (def
)
3458 && gimple_call_internal_p (def
)
3459 && (gimple_call_internal_fn (def
)
3460 == IFN_GOMP_SIMD_LANE
))
3462 tree arg
= gimple_call_arg (def
, 0);
3463 gcc_assert (TREE_CODE (arg
) == SSA_NAME
);
3464 arg
= SSA_NAME_VAR (arg
);
3465 if (arg
== loop
->simduid
3467 && tree_int_cst_equal
3468 (TYPE_SIZE_UNIT (reft
),
3471 DR_OFFSET (newdr
) = ssize_int (0);
3472 DR_STEP (newdr
) = step
;
3473 DR_ALIGNED_TO (newdr
)
3474 = size_int (BIGGEST_ALIGNMENT
);
3476 simd_lane_access
= true;
3482 if (!simd_lane_access
&& (maybe_gather
|| maybe_scatter
))
3486 gatherscatter
= GATHER
;
3488 gatherscatter
= SCATTER
;
3491 if (gatherscatter
== SG_NONE
&& !simd_lane_access
)
3492 free_data_ref (newdr
);
3495 if (gatherscatter
== SG_NONE
&& !simd_lane_access
)
3497 if (dump_enabled_p ())
3499 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3500 "not vectorized: data ref analysis "
3502 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3503 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
3506 if (is_a
<bb_vec_info
> (vinfo
))
3513 if (TREE_CODE (DR_BASE_ADDRESS (dr
)) == INTEGER_CST
)
3515 if (dump_enabled_p ())
3516 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3517 "not vectorized: base addr of dr is a "
3520 if (is_a
<bb_vec_info
> (vinfo
))
3523 if (gatherscatter
!= SG_NONE
|| simd_lane_access
)
3528 if (TREE_THIS_VOLATILE (DR_REF (dr
)))
3530 if (dump_enabled_p ())
3532 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3533 "not vectorized: volatile type ");
3534 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3535 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
3538 if (is_a
<bb_vec_info
> (vinfo
))
3544 if (stmt_can_throw_internal (stmt
))
3546 if (dump_enabled_p ())
3548 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3549 "not vectorized: statement can throw an "
3551 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3552 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
3555 if (is_a
<bb_vec_info
> (vinfo
))
3558 if (gatherscatter
!= SG_NONE
|| simd_lane_access
)
3563 if (TREE_CODE (DR_REF (dr
)) == COMPONENT_REF
3564 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr
), 1)))
3566 if (dump_enabled_p ())
3568 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3569 "not vectorized: statement is bitfield "
3571 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3572 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
3575 if (is_a
<bb_vec_info
> (vinfo
))
3578 if (gatherscatter
!= SG_NONE
|| simd_lane_access
)
3583 base
= unshare_expr (DR_BASE_ADDRESS (dr
));
3584 offset
= unshare_expr (DR_OFFSET (dr
));
3585 init
= unshare_expr (DR_INIT (dr
));
3587 if (is_gimple_call (stmt
)
3588 && (!gimple_call_internal_p (stmt
)
3589 || (gimple_call_internal_fn (stmt
) != IFN_MASK_LOAD
3590 && gimple_call_internal_fn (stmt
) != IFN_MASK_STORE
)))
3592 if (dump_enabled_p ())
3594 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3595 "not vectorized: dr in a call ");
3596 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3597 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
3600 if (is_a
<bb_vec_info
> (vinfo
))
3603 if (gatherscatter
!= SG_NONE
|| simd_lane_access
)
3608 /* Update DR field in stmt_vec_info struct. */
3610 /* If the dataref is in an inner-loop of the loop that is considered for
3611 for vectorization, we also want to analyze the access relative to
3612 the outer-loop (DR contains information only relative to the
3613 inner-most enclosing loop). We do that by building a reference to the
3614 first location accessed by the inner-loop, and analyze it relative to
3616 if (loop
&& nested_in_vect_loop_p (loop
, stmt
))
3618 tree outer_step
, outer_base
, outer_init
;
3619 HOST_WIDE_INT pbitsize
, pbitpos
;
3622 int punsignedp
, preversep
, pvolatilep
;
3623 affine_iv base_iv
, offset_iv
;
3626 /* Build a reference to the first location accessed by the
3627 inner-loop: *(BASE+INIT). (The first location is actually
3628 BASE+INIT+OFFSET, but we add OFFSET separately later). */
3629 tree inner_base
= build_fold_indirect_ref
3630 (fold_build_pointer_plus (base
, init
));
3632 if (dump_enabled_p ())
3634 dump_printf_loc (MSG_NOTE
, vect_location
,
3635 "analyze in outer-loop: ");
3636 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, inner_base
);
3637 dump_printf (MSG_NOTE
, "\n");
3640 outer_base
= get_inner_reference (inner_base
, &pbitsize
, &pbitpos
,
3641 &poffset
, &pmode
, &punsignedp
,
3642 &preversep
, &pvolatilep
, false);
3643 gcc_assert (outer_base
!= NULL_TREE
);
3645 if (pbitpos
% BITS_PER_UNIT
!= 0)
3647 if (dump_enabled_p ())
3648 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3649 "failed: bit offset alignment.\n");
3655 if (dump_enabled_p ())
3656 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3657 "failed: reverse storage order.\n");
3661 outer_base
= build_fold_addr_expr (outer_base
);
3662 if (!simple_iv (loop
, loop_containing_stmt (stmt
), outer_base
,
3665 if (dump_enabled_p ())
3666 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3667 "failed: evolution of base is not affine.\n");
3674 poffset
= fold_build2 (PLUS_EXPR
, TREE_TYPE (offset
), offset
,
3682 offset_iv
.base
= ssize_int (0);
3683 offset_iv
.step
= ssize_int (0);
3685 else if (!simple_iv (loop
, loop_containing_stmt (stmt
), poffset
,
3688 if (dump_enabled_p ())
3689 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3690 "evolution of offset is not affine.\n");
3694 outer_init
= ssize_int (pbitpos
/ BITS_PER_UNIT
);
3695 split_constant_offset (base_iv
.base
, &base_iv
.base
, &dinit
);
3696 outer_init
= size_binop (PLUS_EXPR
, outer_init
, dinit
);
3697 split_constant_offset (offset_iv
.base
, &offset_iv
.base
, &dinit
);
3698 outer_init
= size_binop (PLUS_EXPR
, outer_init
, dinit
);
3700 outer_step
= size_binop (PLUS_EXPR
,
3701 fold_convert (ssizetype
, base_iv
.step
),
3702 fold_convert (ssizetype
, offset_iv
.step
));
3704 STMT_VINFO_DR_STEP (stmt_info
) = outer_step
;
3705 /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
3706 STMT_VINFO_DR_BASE_ADDRESS (stmt_info
) = base_iv
.base
;
3707 STMT_VINFO_DR_INIT (stmt_info
) = outer_init
;
3708 STMT_VINFO_DR_OFFSET (stmt_info
) =
3709 fold_convert (ssizetype
, offset_iv
.base
);
3710 STMT_VINFO_DR_ALIGNED_TO (stmt_info
) =
3711 size_int (highest_pow2_factor (offset_iv
.base
));
3713 if (dump_enabled_p ())
3715 dump_printf_loc (MSG_NOTE
, vect_location
,
3716 "\touter base_address: ");
3717 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3718 STMT_VINFO_DR_BASE_ADDRESS (stmt_info
));
3719 dump_printf (MSG_NOTE
, "\n\touter offset from base address: ");
3720 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3721 STMT_VINFO_DR_OFFSET (stmt_info
));
3722 dump_printf (MSG_NOTE
,
3723 "\n\touter constant offset from base address: ");
3724 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3725 STMT_VINFO_DR_INIT (stmt_info
));
3726 dump_printf (MSG_NOTE
, "\n\touter step: ");
3727 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3728 STMT_VINFO_DR_STEP (stmt_info
));
3729 dump_printf (MSG_NOTE
, "\n\touter aligned to: ");
3730 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3731 STMT_VINFO_DR_ALIGNED_TO (stmt_info
));
3732 dump_printf (MSG_NOTE
, "\n");
3736 if (STMT_VINFO_DATA_REF (stmt_info
))
3738 if (dump_enabled_p ())
3740 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3741 "not vectorized: more than one data ref "
3743 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3744 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
3747 if (is_a
<bb_vec_info
> (vinfo
))
3750 if (gatherscatter
!= SG_NONE
|| simd_lane_access
)
3755 STMT_VINFO_DATA_REF (stmt_info
) = dr
;
3756 if (simd_lane_access
)
3758 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info
) = true;
3759 free_data_ref (datarefs
[i
]);
3763 /* Set vectype for STMT. */
3764 scalar_type
= TREE_TYPE (DR_REF (dr
));
3765 STMT_VINFO_VECTYPE (stmt_info
)
3766 = get_vectype_for_scalar_type (scalar_type
);
3767 if (!STMT_VINFO_VECTYPE (stmt_info
))
3769 if (dump_enabled_p ())
3771 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3772 "not vectorized: no vectype for stmt: ");
3773 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3774 dump_printf (MSG_MISSED_OPTIMIZATION
, " scalar_type: ");
3775 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_DETAILS
,
3777 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
3780 if (is_a
<bb_vec_info
> (vinfo
))
3782 /* No vector type is fine, the ref can still participate
3783 in dependence analysis, we just can't vectorize it. */
3784 STMT_VINFO_VECTORIZABLE (stmt_info
) = false;
3788 if (gatherscatter
!= SG_NONE
|| simd_lane_access
)
3790 STMT_VINFO_DATA_REF (stmt_info
) = NULL
;
3791 if (gatherscatter
!= SG_NONE
)
3798 if (dump_enabled_p ())
3800 dump_printf_loc (MSG_NOTE
, vect_location
,
3801 "got vectype for stmt: ");
3802 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, stmt
, 0);
3803 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3804 STMT_VINFO_VECTYPE (stmt_info
));
3805 dump_printf (MSG_NOTE
, "\n");
3809 /* Adjust the minimal vectorization factor according to the
3811 vf
= TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info
));
3815 if (gatherscatter
!= SG_NONE
)
3818 if (!vect_check_gather_scatter (stmt
, as_a
<loop_vec_info
> (vinfo
),
3820 || get_vectype_for_scalar_type (TREE_TYPE (off
)) == NULL_TREE
)
3822 STMT_VINFO_DATA_REF (stmt_info
) = NULL
;
3824 if (dump_enabled_p ())
3826 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3827 (gatherscatter
== GATHER
) ?
3828 "not vectorized: not suitable for gather "
3830 "not vectorized: not suitable for scatter "
3832 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3833 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
3839 STMT_VINFO_GATHER_SCATTER_P (stmt_info
) = gatherscatter
;
3842 else if (is_a
<loop_vec_info
> (vinfo
)
3843 && TREE_CODE (DR_STEP (dr
)) != INTEGER_CST
)
3845 if (nested_in_vect_loop_p (loop
, stmt
))
3847 if (dump_enabled_p ())
3849 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3850 "not vectorized: not suitable for strided "
3852 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3853 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
3857 STMT_VINFO_STRIDED_P (stmt_info
) = true;
3861 /* If we stopped analysis at the first dataref we could not analyze
3862 when trying to vectorize a basic-block mark the rest of the datarefs
3863 as not vectorizable and truncate the vector of datarefs. That
3864 avoids spending useless time in analyzing their dependence. */
3865 if (i
!= datarefs
.length ())
3867 gcc_assert (is_a
<bb_vec_info
> (vinfo
));
3868 for (unsigned j
= i
; j
< datarefs
.length (); ++j
)
3870 data_reference_p dr
= datarefs
[j
];
3871 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr
))) = false;
3874 datarefs
.truncate (i
);
3881 /* Function vect_get_new_vect_var.
3883 Returns a name for a new variable. The current naming scheme appends the
3884 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
3885 the name of vectorizer generated variables, and appends that to NAME if
/* NOTE(review): this region is a damaged line-extraction of the original
   file -- brace-only lines, the "switch (var_kind)" header, the per-case
   prefix assignments/"break"s and the if/else keywords were dropped, and
   the surviving tokens are split across lines.  The code text below is
   left byte-identical; only comments are added.  */
3889 vect_get_new_vect_var (tree type
, enum vect_var_kind var_kind
, const char *name
)
/* Pick a naming prefix from VAR_KIND.  The case bodies were dropped by
   the extractor; per the header comment above they presumably assign
   "vect"-style prefixes -- TODO confirm against the original source.  */
3896 case vect_simple_var
:
3899 case vect_scalar_var
:
3905 case vect_pointer_var
:
/* NAME supplied: build "<prefix>_<NAME>" and create a temporary
   register of TYPE with that name.  */
3914 char* tmp
= concat (prefix
, "_", name
, NULL
);
3915 new_vect_var
= create_tmp_reg (type
, tmp
);
/* Otherwise (condition/else line dropped): use the bare prefix.  */
3919 new_vect_var
= create_tmp_reg (type
, prefix
);
3921 return new_vect_var
;
3924 /* Like vect_get_new_vect_var but return an SSA name. */
/* NOTE(review): damaged extraction -- the "switch (var_kind)" header,
   per-case prefix assignments and if/else keywords were dropped and the
   remaining tokens are split across lines.  Code text kept byte-identical;
   comments only added.  */
3927 vect_get_new_ssa_name (tree type
, enum vect_var_kind var_kind
, const char *name
)
/* Select the naming prefix from VAR_KIND (case bodies dropped by the
   extractor -- presumably mirrors vect_get_new_vect_var; confirm).  */
3934 case vect_simple_var
:
3937 case vect_scalar_var
:
3940 case vect_pointer_var
:
/* NAME supplied: make an anonymous SSA temp named "<prefix>_<NAME>".  */
3949 char* tmp
= concat (prefix
, "_", name
, NULL
);
3950 new_vect_var
= make_temp_ssa_name (type
, NULL
, tmp
);
/* Otherwise (condition/else line dropped): use the bare prefix.  */
3954 new_vect_var
= make_temp_ssa_name (type
, NULL
, prefix
);
3956 return new_vect_var
;
3959 /* Duplicate ptr info and set alignment/misaligment on NAME from DR. */
/* NOTE(review): damaged extraction -- the opening brace and the
   conditional (orig. line 3968) plus "else" (orig. line 3970) that choose
   between the two calls below were dropped.  Presumably the choice is
   based on whether DR's misalignment is known -- TODO confirm against the
   original.  Code text kept byte-identical; comments only added.  */
3962 vect_duplicate_ssa_name_ptr_info (tree name
, data_reference
*dr
,
3963 stmt_vec_info stmt_info
)
/* Copy DR's points-to information onto the SSA name NAME.  */
3965 duplicate_ssa_name_ptr_info (name
, DR_PTR_INFO (dr
));
/* Alignment of the vector type and DR's recorded misalignment.  */
3966 unsigned int align
= TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmt_info
));
3967 int misalign
= DR_MISALIGNMENT (dr
);
/* One branch of the dropped conditional: mark alignment unknown ...  */
3969 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name
));
/* ... the other: record the known alignment/misalignment pair.  */
3971 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name
), align
, misalign
);
3974 /* Function vect_create_addr_base_for_vector_ref.
3976 Create an expression that computes the address of the first memory location
3977 that will be accessed for a data reference.
3980 STMT: The statement containing the data reference.
3981 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
3982 OFFSET: Optional. If supplied, it is be added to the initial address.
3983 LOOP: Specify relative to which loop-nest should the address be computed.
3984 For example, when the dataref is in an inner-loop nested in an
3985 outer-loop that is now being vectorized, LOOP can be either the
3986 outer-loop, or the inner-loop. The first memory location accessed
3987 by the following dataref ('in' points to short):
3994 if LOOP=i_loop: &in (relative to i_loop)
3995 if LOOP=j_loop: &in+i*2B (relative to j_loop)
3996 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
3997 initial address. Unlike OFFSET, which is number of elements to
3998 be added, BYTE_OFFSET is measured in bytes.
4001 1. Return an SSA_NAME whose value is the address of the memory location of
4002 the first vector of the data reference.
4003 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4004 these statement(s) which define the returned SSA_NAME.
4006 FORNOW: We are only handling array accesses with step 1. */
/* NOTE(review): damaged extraction -- brace/else/condition-only lines and
   the function epilogue (orig. lines 4102-4107, including the final
   "return addr_base;") were dropped by the extractor, and the surviving
   tokens are split across lines.  Code text below is kept byte-identical;
   only comments are added.  */
4009 vect_create_addr_base_for_vector_ref (gimple
*stmt
,
4010 gimple_seq
*new_stmt_list
,
/* Local state: the stmt's vec-info, its data reference, a name for the
   new pointer variable, and a statement sequence to collect setup code.  */
4015 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
4016 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
4018 const char *base_name
;
4021 gimple_seq seq
= NULL
;
/* STEP is the byte size of one scalar element of the reference.  */
4025 tree step
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr
)));
4026 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
/* Dataref lives in an inner loop but the address is requested relative
   to the enclosing (outer) loop: use the outer-loop-relative
   STMT_VINFO_DR_* fields instead of DR_*.  */
4028 if (loop_vinfo
&& loop
&& loop
!= (gimple_bb (stmt
))->loop_father
)
4030 struct loop
*outer_loop
= LOOP_VINFO_LOOP (loop_vinfo
);
4032 gcc_assert (nested_in_vect_loop_p (outer_loop
, stmt
));
4034 data_ref_base
= unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info
));
4035 base_offset
= unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info
));
4036 init
= unshare_expr (STMT_VINFO_DR_INIT (stmt_info
));
/* Ordinary case (else-branch; keyword dropped): take base/offset/init
   straight from the data reference.  */
4040 data_ref_base
= unshare_expr (DR_BASE_ADDRESS (dr
));
4041 base_offset
= unshare_expr (DR_OFFSET (dr
));
4042 init
= unshare_expr (DR_INIT (dr
));
4046 base_name
= get_name (data_ref_base
);
/* Alternate branch (condition dropped): zero offsets and name the
   pointer after the reference itself.  */
4049 base_offset
= ssize_int (0);
4050 init
= ssize_int (0);
4051 base_name
= get_name (DR_REF (dr
));
4054 /* Create base_offset */
/* base_offset := base_offset + init, in sizetype.  */
4055 base_offset
= size_binop (PLUS_EXPR
,
4056 fold_convert (sizetype
, base_offset
),
4057 fold_convert (sizetype
, init
));
/* If OFFSET was supplied (guard dropped): scale element count by STEP
   and fold it into base_offset.  */
4061 offset
= fold_build2 (MULT_EXPR
, sizetype
,
4062 fold_convert (sizetype
, offset
), step
);
4063 base_offset
= fold_build2 (PLUS_EXPR
, sizetype
,
4064 base_offset
, offset
);
/* If BYTE_OFFSET was supplied (guard dropped): add it in directly,
   without scaling -- it is already in bytes per the header comment.  */
4068 byte_offset
= fold_convert (sizetype
, byte_offset
);
4069 base_offset
= fold_build2 (PLUS_EXPR
, sizetype
,
4070 base_offset
, byte_offset
);
4073 /* base + base_offset */
4075 addr_base
= fold_build_pointer_plus (data_ref_base
, base_offset
);
/* Alternate branch (condition dropped): take the address of the
   reference directly.  */
4078 addr_base
= build1 (ADDR_EXPR
,
4079 build_pointer_type (TREE_TYPE (DR_REF (dr
))),
4080 unshare_expr (DR_REF (dr
)));
/* Gimplify the address into DEST, collecting setup stmts into SEQ and
   appending them to the caller's NEW_STMT_LIST.  */
4083 vect_ptr_type
= build_pointer_type (STMT_VINFO_VECTYPE (stmt_info
));
4084 dest
= vect_get_new_vect_var (vect_ptr_type
, vect_pointer_var
, base_name
);
4085 addr_base
= force_gimple_operand (addr_base
, &seq
, true, dest
);
4086 gimple_seq_add_seq (new_stmt_list
, seq
);
/* Propagate DR's points-to info to the new SSA address; any extra
   OFFSET/BYTE_OFFSET invalidates the recorded alignment.  */
4088 if (DR_PTR_INFO (dr
)
4089 && TREE_CODE (addr_base
) == SSA_NAME
4090 && !SSA_NAME_PTR_INFO (addr_base
))
4092 vect_duplicate_ssa_name_ptr_info (addr_base
, dr
, stmt_info
);
4093 if (offset
|| byte_offset
)
4094 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base
));
/* Debug dump of the created address (epilogue lines, including the
   final "return addr_base;", were dropped by the extractor).  */
4097 if (dump_enabled_p ())
4099 dump_printf_loc (MSG_NOTE
, vect_location
, "created ");
4100 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, addr_base
);
4101 dump_printf (MSG_NOTE
, "\n");
4108 /* Function vect_create_data_ref_ptr.
4110 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4111 location accessed in the loop by STMT, along with the def-use update
4112 chain to appropriately advance the pointer through the loop iterations.
4113 Also set aliasing information for the pointer. This pointer is used by
4114 the callers to this function to create a memory reference expression for
4115 vector load/store access.
4118 1. STMT: a stmt that references memory. Expected to be of the form
4119 GIMPLE_ASSIGN <name, data-ref> or
4120 GIMPLE_ASSIGN <data-ref, name>.
4121 2. AGGR_TYPE: the type of the reference, which should be either a vector
4123 3. AT_LOOP: the loop where the vector memref is to be created.
4124 4. OFFSET (optional): an offset to be added to the initial address accessed
4125 by the data-ref in STMT.
4126 5. BSI: location where the new stmts are to be placed if there is no loop
4127 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4128 pointing to the initial address.
4129 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4130 to the initial address accessed by the data-ref in STMT. This is
4131 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4135 1. Declare a new ptr to vector_type, and have it point to the base of the
4136 data reference (initial addressed accessed by the data reference).
4137 For example, for vector of type V8HI, the following code is generated:
4140 ap = (v8hi *)initial_address;
4142 if OFFSET is not supplied:
4143 initial_address = &a[init];
4144 if OFFSET is supplied:
4145 initial_address = &a[init + OFFSET];
4146 if BYTE_OFFSET is supplied:
4147 initial_address = &a[init] + BYTE_OFFSET;
4149 Return the initial_address in INITIAL_ADDRESS.
4151 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4152 update the pointer in each iteration of the loop.
4154 Return the increment stmt that updates the pointer in PTR_INCR.
4156 3. Set INV_P to true if the access pattern of the data reference in the
4157 vectorized loop is invariant. Set it to false otherwise.
4159 4. Return the pointer. */
4162 vect_create_data_ref_ptr (gimple
*stmt
, tree aggr_type
, struct loop
*at_loop
,
4163 tree offset
, tree
*initial_address
,
4164 gimple_stmt_iterator
*gsi
, gimple
**ptr_incr
,
4165 bool only_init
, bool *inv_p
, tree byte_offset
)
4167 const char *base_name
;
4168 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
4169 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
4170 struct loop
*loop
= NULL
;
4171 bool nested_in_vect_loop
= false;
4172 struct loop
*containing_loop
= NULL
;
4176 gimple_seq new_stmt_list
= NULL
;
4180 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
4182 gimple_stmt_iterator incr_gsi
;
4184 tree indx_before_incr
, indx_after_incr
;
4187 bb_vec_info bb_vinfo
= STMT_VINFO_BB_VINFO (stmt_info
);
4189 gcc_assert (TREE_CODE (aggr_type
) == ARRAY_TYPE
4190 || TREE_CODE (aggr_type
) == VECTOR_TYPE
);
4194 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
4195 nested_in_vect_loop
= nested_in_vect_loop_p (loop
, stmt
);
4196 containing_loop
= (gimple_bb (stmt
))->loop_father
;
4197 pe
= loop_preheader_edge (loop
);
4201 gcc_assert (bb_vinfo
);
4206 /* Check the step (evolution) of the load in LOOP, and record
4207 whether it's invariant. */
4208 if (nested_in_vect_loop
)
4209 step
= STMT_VINFO_DR_STEP (stmt_info
);
4211 step
= DR_STEP (STMT_VINFO_DATA_REF (stmt_info
));
4213 if (integer_zerop (step
))
4218 /* Create an expression for the first address accessed by this load
4220 base_name
= get_name (DR_BASE_ADDRESS (dr
));
4222 if (dump_enabled_p ())
4224 tree dr_base_type
= TREE_TYPE (DR_BASE_OBJECT (dr
));
4225 dump_printf_loc (MSG_NOTE
, vect_location
,
4226 "create %s-pointer variable to type: ",
4227 get_tree_code_name (TREE_CODE (aggr_type
)));
4228 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, aggr_type
);
4229 if (TREE_CODE (dr_base_type
) == ARRAY_TYPE
)
4230 dump_printf (MSG_NOTE
, " vectorizing an array ref: ");
4231 else if (TREE_CODE (dr_base_type
) == VECTOR_TYPE
)
4232 dump_printf (MSG_NOTE
, " vectorizing a vector ref: ");
4233 else if (TREE_CODE (dr_base_type
) == RECORD_TYPE
)
4234 dump_printf (MSG_NOTE
, " vectorizing a record based array ref: ");
4236 dump_printf (MSG_NOTE
, " vectorizing a pointer ref: ");
4237 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_BASE_OBJECT (dr
));
4238 dump_printf (MSG_NOTE
, "\n");
4241 /* (1) Create the new aggregate-pointer variable.
4242 Vector and array types inherit the alias set of their component
4243 type by default so we need to use a ref-all pointer if the data
4244 reference does not conflict with the created aggregated data
4245 reference because it is not addressable. */
4246 bool need_ref_all
= false;
4247 if (!alias_sets_conflict_p (get_alias_set (aggr_type
),
4248 get_alias_set (DR_REF (dr
))))
4249 need_ref_all
= true;
4250 /* Likewise for any of the data references in the stmt group. */
4251 else if (STMT_VINFO_GROUP_SIZE (stmt_info
) > 1)
4253 gimple
*orig_stmt
= STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info
);
4256 stmt_vec_info sinfo
= vinfo_for_stmt (orig_stmt
);
4257 struct data_reference
*sdr
= STMT_VINFO_DATA_REF (sinfo
);
4258 if (!alias_sets_conflict_p (get_alias_set (aggr_type
),
4259 get_alias_set (DR_REF (sdr
))))
4261 need_ref_all
= true;
4264 orig_stmt
= STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo
);
4268 aggr_ptr_type
= build_pointer_type_for_mode (aggr_type
, ptr_mode
,
4270 aggr_ptr
= vect_get_new_vect_var (aggr_ptr_type
, vect_pointer_var
, base_name
);
4273 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4274 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4275 def-use update cycles for the pointer: one relative to the outer-loop
4276 (LOOP), which is what steps (3) and (4) below do. The other is relative
4277 to the inner-loop (which is the inner-most loop containing the dataref),
4278 and this is done be step (5) below.
4280 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4281 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4282 redundant. Steps (3),(4) create the following:
4285 LOOP: vp1 = phi(vp0,vp2)
4291 If there is an inner-loop nested in loop, then step (5) will also be
4292 applied, and an additional update in the inner-loop will be created:
4295 LOOP: vp1 = phi(vp0,vp2)
4297 inner: vp3 = phi(vp1,vp4)
4298 vp4 = vp3 + inner_step
4304 /* (2) Calculate the initial address of the aggregate-pointer, and set
4305 the aggregate-pointer to point to it before the loop. */
4307 /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */
4309 new_temp
= vect_create_addr_base_for_vector_ref (stmt
, &new_stmt_list
,
4310 offset
, loop
, byte_offset
);
4315 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, new_stmt_list
);
4316 gcc_assert (!new_bb
);
4319 gsi_insert_seq_before (gsi
, new_stmt_list
, GSI_SAME_STMT
);
4322 *initial_address
= new_temp
;
4323 aggr_ptr_init
= new_temp
;
4325 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4326 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4327 inner-loop nested in LOOP (during outer-loop vectorization). */
4329 /* No update in loop is required. */
4330 if (only_init
&& (!loop_vinfo
|| at_loop
== loop
))
4331 aptr
= aggr_ptr_init
;
4334 /* The step of the aggregate pointer is the type size. */
4335 tree iv_step
= TYPE_SIZE_UNIT (aggr_type
);
4336 /* One exception to the above is when the scalar step of the load in
4337 LOOP is zero. In this case the step here is also zero. */
4339 iv_step
= size_zero_node
;
4340 else if (tree_int_cst_sgn (step
) == -1)
4341 iv_step
= fold_build1 (NEGATE_EXPR
, TREE_TYPE (iv_step
), iv_step
);
4343 standard_iv_increment_position (loop
, &incr_gsi
, &insert_after
);
4345 create_iv (aggr_ptr_init
,
4346 fold_convert (aggr_ptr_type
, iv_step
),
4347 aggr_ptr
, loop
, &incr_gsi
, insert_after
,
4348 &indx_before_incr
, &indx_after_incr
);
4349 incr
= gsi_stmt (incr_gsi
);
4350 set_vinfo_for_stmt (incr
, new_stmt_vec_info (incr
, loop_vinfo
));
4352 /* Copy the points-to information if it exists. */
4353 if (DR_PTR_INFO (dr
))
4355 vect_duplicate_ssa_name_ptr_info (indx_before_incr
, dr
, stmt_info
);
4356 vect_duplicate_ssa_name_ptr_info (indx_after_incr
, dr
, stmt_info
);
4361 aptr
= indx_before_incr
;
4364 if (!nested_in_vect_loop
|| only_init
)
4368 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4369 nested in LOOP, if exists. */
4371 gcc_assert (nested_in_vect_loop
);
4374 standard_iv_increment_position (containing_loop
, &incr_gsi
,
4376 create_iv (aptr
, fold_convert (aggr_ptr_type
, DR_STEP (dr
)), aggr_ptr
,
4377 containing_loop
, &incr_gsi
, insert_after
, &indx_before_incr
,
4379 incr
= gsi_stmt (incr_gsi
);
4380 set_vinfo_for_stmt (incr
, new_stmt_vec_info (incr
, loop_vinfo
));
4382 /* Copy the points-to information if it exists. */
4383 if (DR_PTR_INFO (dr
))
4385 vect_duplicate_ssa_name_ptr_info (indx_before_incr
, dr
, stmt_info
);
4386 vect_duplicate_ssa_name_ptr_info (indx_after_incr
, dr
, stmt_info
);
4391 return indx_before_incr
;
4398 /* Function bump_vector_ptr
4400 Increment a pointer (to a vector type) by vector-size. If requested,
4401 i.e. if PTR-INCR is given, then also connect the new increment stmt
4402 to the existing def-use update-chain of the pointer, by modifying
4403 the PTR_INCR as illustrated below:
4405 The pointer def-use update-chain before this function:
4406 DATAREF_PTR = phi (p_0, p_2)
4408 PTR_INCR: p_2 = DATAREF_PTR + step
4410 The pointer def-use update-chain after this function:
4411 DATAREF_PTR = phi (p_0, p_2)
4413 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4415 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4418 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4420 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4421 the loop. The increment amount across iterations is expected
4423 BSI - location where the new update stmt is to be placed.
4424 STMT - the original scalar memory-access stmt that is being vectorized.
4425 BUMP - optional. The offset by which to bump the pointer. If not given,
4426 the offset is assumed to be vector_size.
4428 Output: Return NEW_DATAREF_PTR as illustrated above.
/* NOTE(review): damaged extraction -- brace/else/condition-only lines
   (e.g. the guard that overrides UPDATE with BUMP, and the early-return
   condition before orig. line 4464) were dropped and surviving tokens are
   split across lines.  Code text kept byte-identical; comments only.  */
4433 bump_vector_ptr (tree dataref_ptr
, gimple
*ptr_incr
, gimple_stmt_iterator
*gsi
,
4434 gimple
*stmt
, tree bump
)
/* Default bump amount is the vector type's byte size.  */
4436 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
4437 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
4438 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
4439 tree update
= TYPE_SIZE_UNIT (vectype
);
4442 use_operand_p use_p
;
4443 tree new_dataref_ptr
;
/* Make the new pointer SSA name, preserving identity when possible.  */
4448 if (TREE_CODE (dataref_ptr
) == SSA_NAME
)
4449 new_dataref_ptr
= copy_ssa_name (dataref_ptr
);
4451 new_dataref_ptr
= make_ssa_name (TREE_TYPE (dataref_ptr
));
/* NEW_DATAREF_PTR = DATAREF_PTR + UPDATE, emitted at GSI.  */
4452 incr_stmt
= gimple_build_assign (new_dataref_ptr
, POINTER_PLUS_EXPR
,
4453 dataref_ptr
, update
);
4454 vect_finish_stmt_generation (stmt
, incr_stmt
, gsi
);
4456 /* Copy the points-to information if it exists. */
4457 if (DR_PTR_INFO (dr
))
4459 duplicate_ssa_name_ptr_info (new_dataref_ptr
, DR_PTR_INFO (dr
));
4460 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr
));
/* Early return (its guard was dropped by the extractor; presumably taken
   when PTR_INCR is not given -- confirm against the original).  */
4464 return new_dataref_ptr
;
4466 /* Update the vector-pointer's cross-iteration increment. */
/* Redirect PTR_INCR's use of DATAREF_PTR to NEW_DATAREF_PTR; any other
   operand must equal the expected per-iteration UPDATE amount.  */
4467 FOR_EACH_SSA_USE_OPERAND (use_p
, ptr_incr
, iter
, SSA_OP_USE
)
4469 tree use
= USE_FROM_PTR (use_p
);
4471 if (use
== dataref_ptr
)
4472 SET_USE (use_p
, new_dataref_ptr
);
4474 gcc_assert (tree_int_cst_compare (use
, update
) == 0);
4477 return new_dataref_ptr
;
4481 /* Function vect_create_destination_var.
4483 Create a new temporary of type VECTYPE. */
4486 vect_create_destination_var (tree scalar_dest
, tree vectype
)
4492 enum vect_var_kind kind
;
4495 ? VECTOR_BOOLEAN_TYPE_P (vectype
)
4499 type
= vectype
? vectype
: TREE_TYPE (scalar_dest
);
4501 gcc_assert (TREE_CODE (scalar_dest
) == SSA_NAME
);
4503 name
= get_name (scalar_dest
);
4505 new_name
= xasprintf ("%s_%u", name
, SSA_NAME_VERSION (scalar_dest
));
4507 new_name
= xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest
));
4508 vec_dest
= vect_get_new_vect_var (type
, kind
, new_name
);
4514 /* Function vect_grouped_store_supported.
4516 Returns TRUE if interleave high and interleave low permutations
4517 are supported, and FALSE otherwise. */
4520 vect_grouped_store_supported (tree vectype
, unsigned HOST_WIDE_INT count
)
4522 machine_mode mode
= TYPE_MODE (vectype
);
4524 /* vect_permute_store_chain requires the group size to be equal to 3 or
4525 be a power of two. */
4526 if (count
!= 3 && exact_log2 (count
) == -1)
4528 if (dump_enabled_p ())
4529 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4530 "the size of the group of accesses"
4531 " is not a power of 2 or not eqaul to 3\n");
4535 /* Check that the permutation is supported. */
4536 if (VECTOR_MODE_P (mode
))
4538 unsigned int i
, nelt
= GET_MODE_NUNITS (mode
);
4539 unsigned char *sel
= XALLOCAVEC (unsigned char, nelt
);
4543 unsigned int j0
= 0, j1
= 0, j2
= 0;
4546 for (j
= 0; j
< 3; j
++)
4548 int nelt0
= ((3 - j
) * nelt
) % 3;
4549 int nelt1
= ((3 - j
) * nelt
+ 1) % 3;
4550 int nelt2
= ((3 - j
) * nelt
+ 2) % 3;
4551 for (i
= 0; i
< nelt
; i
++)
4553 if (3 * i
+ nelt0
< nelt
)
4554 sel
[3 * i
+ nelt0
] = j0
++;
4555 if (3 * i
+ nelt1
< nelt
)
4556 sel
[3 * i
+ nelt1
] = nelt
+ j1
++;
4557 if (3 * i
+ nelt2
< nelt
)
4558 sel
[3 * i
+ nelt2
] = 0;
4560 if (!can_vec_perm_p (mode
, false, sel
))
4562 if (dump_enabled_p ())
4563 dump_printf (MSG_MISSED_OPTIMIZATION
,
4564 "permutaion op not supported by target.\n");
4568 for (i
= 0; i
< nelt
; i
++)
4570 if (3 * i
+ nelt0
< nelt
)
4571 sel
[3 * i
+ nelt0
] = 3 * i
+ nelt0
;
4572 if (3 * i
+ nelt1
< nelt
)
4573 sel
[3 * i
+ nelt1
] = 3 * i
+ nelt1
;
4574 if (3 * i
+ nelt2
< nelt
)
4575 sel
[3 * i
+ nelt2
] = nelt
+ j2
++;
4577 if (!can_vec_perm_p (mode
, false, sel
))
4579 if (dump_enabled_p ())
4580 dump_printf (MSG_MISSED_OPTIMIZATION
,
4581 "permutaion op not supported by target.\n");
4589 /* If length is not equal to 3 then only power of 2 is supported. */
4590 gcc_assert (exact_log2 (count
) != -1);
4592 for (i
= 0; i
< nelt
/ 2; i
++)
4595 sel
[i
* 2 + 1] = i
+ nelt
;
4597 if (can_vec_perm_p (mode
, false, sel
))
4599 for (i
= 0; i
< nelt
; i
++)
4601 if (can_vec_perm_p (mode
, false, sel
))
4607 if (dump_enabled_p ())
4608 dump_printf (MSG_MISSED_OPTIMIZATION
,
4609 "permutaion op not supported by target.\n");
4614 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4618 vect_store_lanes_supported (tree vectype
, unsigned HOST_WIDE_INT count
)
4620 return vect_lanes_optab_supported_p ("vec_store_lanes",
4621 vec_store_lanes_optab
,
4626 /* Function vect_permute_store_chain.
4628 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4629 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4630 the data correctly for the stores. Return the final references for stores
4633 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4634 The input is 4 vectors each containing 8 elements. We assign a number to
4635 each element, the input sequence is:
4637 1st vec: 0 1 2 3 4 5 6 7
4638 2nd vec: 8 9 10 11 12 13 14 15
4639 3rd vec: 16 17 18 19 20 21 22 23
4640 4th vec: 24 25 26 27 28 29 30 31
4642 The output sequence should be:
4644 1st vec: 0 8 16 24 1 9 17 25
4645 2nd vec: 2 10 18 26 3 11 19 27
4646 3rd vec: 4 12 20 28 5 13 21 30
4647 4th vec: 6 14 22 30 7 15 23 31
4649 i.e., we interleave the contents of the four vectors in their order.
4651 We use interleave_high/low instructions to create such output. The input of
4652 each interleave_high/low operation is two vectors:
4655 the even elements of the result vector are obtained left-to-right from the
4656 high/low elements of the first vector. The odd elements of the result are
4657 obtained left-to-right from the high/low elements of the second vector.
4658 The output of interleave_high will be: 0 4 1 5
4659 and of interleave_low: 2 6 3 7
4662 The permutation is done in log LENGTH stages. In each stage interleave_high
4663 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4664 where the first argument is taken from the first half of DR_CHAIN and the
4665 second argument from it's second half.
4668 I1: interleave_high (1st vec, 3rd vec)
4669 I2: interleave_low (1st vec, 3rd vec)
4670 I3: interleave_high (2nd vec, 4th vec)
4671 I4: interleave_low (2nd vec, 4th vec)
4673 The output for the first stage is:
4675 I1: 0 16 1 17 2 18 3 19
4676 I2: 4 20 5 21 6 22 7 23
4677 I3: 8 24 9 25 10 26 11 27
4678 I4: 12 28 13 29 14 30 15 31
4680 The output of the second stage, i.e. the final result is:
4682 I1: 0 8 16 24 1 9 17 25
4683 I2: 2 10 18 26 3 11 19 27
4684 I3: 4 12 20 28 5 13 21 30
4685 I4: 6 14 22 30 7 15 23 31. */
4688 vect_permute_store_chain (vec
<tree
> dr_chain
,
4689 unsigned int length
,
4691 gimple_stmt_iterator
*gsi
,
4692 vec
<tree
> *result_chain
)
4694 tree vect1
, vect2
, high
, low
;
4696 tree vectype
= STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt
));
4697 tree perm_mask_low
, perm_mask_high
;
4699 tree perm3_mask_low
, perm3_mask_high
;
4700 unsigned int i
, n
, log_length
= exact_log2 (length
);
4701 unsigned int j
, nelt
= TYPE_VECTOR_SUBPARTS (vectype
);
4702 unsigned char *sel
= XALLOCAVEC (unsigned char, nelt
);
4704 result_chain
->quick_grow (length
);
4705 memcpy (result_chain
->address (), dr_chain
.address (),
4706 length
* sizeof (tree
));
4710 unsigned int j0
= 0, j1
= 0, j2
= 0;
4712 for (j
= 0; j
< 3; j
++)
4714 int nelt0
= ((3 - j
) * nelt
) % 3;
4715 int nelt1
= ((3 - j
) * nelt
+ 1) % 3;
4716 int nelt2
= ((3 - j
) * nelt
+ 2) % 3;
4718 for (i
= 0; i
< nelt
; i
++)
4720 if (3 * i
+ nelt0
< nelt
)
4721 sel
[3 * i
+ nelt0
] = j0
++;
4722 if (3 * i
+ nelt1
< nelt
)
4723 sel
[3 * i
+ nelt1
] = nelt
+ j1
++;
4724 if (3 * i
+ nelt2
< nelt
)
4725 sel
[3 * i
+ nelt2
] = 0;
4727 perm3_mask_low
= vect_gen_perm_mask_checked (vectype
, sel
);
4729 for (i
= 0; i
< nelt
; i
++)
4731 if (3 * i
+ nelt0
< nelt
)
4732 sel
[3 * i
+ nelt0
] = 3 * i
+ nelt0
;
4733 if (3 * i
+ nelt1
< nelt
)
4734 sel
[3 * i
+ nelt1
] = 3 * i
+ nelt1
;
4735 if (3 * i
+ nelt2
< nelt
)
4736 sel
[3 * i
+ nelt2
] = nelt
+ j2
++;
4738 perm3_mask_high
= vect_gen_perm_mask_checked (vectype
, sel
);
4740 vect1
= dr_chain
[0];
4741 vect2
= dr_chain
[1];
4743 /* Create interleaving stmt:
4744 low = VEC_PERM_EXPR <vect1, vect2,
4745 {j, nelt, *, j + 1, nelt + j + 1, *,
4746 j + 2, nelt + j + 2, *, ...}> */
4747 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3_low");
4748 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, vect1
,
4749 vect2
, perm3_mask_low
);
4750 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
4753 vect2
= dr_chain
[2];
4754 /* Create interleaving stmt:
4755 low = VEC_PERM_EXPR <vect1, vect2,
4756 {0, 1, nelt + j, 3, 4, nelt + j + 1,
4757 6, 7, nelt + j + 2, ...}> */
4758 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3_high");
4759 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, vect1
,
4760 vect2
, perm3_mask_high
);
4761 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
4762 (*result_chain
)[j
] = data_ref
;
4767 /* If length is not equal to 3 then only power of 2 is supported. */
4768 gcc_assert (exact_log2 (length
) != -1);
4770 for (i
= 0, n
= nelt
/ 2; i
< n
; i
++)
4773 sel
[i
* 2 + 1] = i
+ nelt
;
4775 perm_mask_high
= vect_gen_perm_mask_checked (vectype
, sel
);
4777 for (i
= 0; i
< nelt
; i
++)
4779 perm_mask_low
= vect_gen_perm_mask_checked (vectype
, sel
);
4781 for (i
= 0, n
= log_length
; i
< n
; i
++)
4783 for (j
= 0; j
< length
/2; j
++)
4785 vect1
= dr_chain
[j
];
4786 vect2
= dr_chain
[j
+length
/2];
4788 /* Create interleaving stmt:
4789 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
4791 high
= make_temp_ssa_name (vectype
, NULL
, "vect_inter_high");
4792 perm_stmt
= gimple_build_assign (high
, VEC_PERM_EXPR
, vect1
,
4793 vect2
, perm_mask_high
);
4794 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
4795 (*result_chain
)[2*j
] = high
;
4797 /* Create interleaving stmt:
4798 low = VEC_PERM_EXPR <vect1, vect2,
4799 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
4801 low
= make_temp_ssa_name (vectype
, NULL
, "vect_inter_low");
4802 perm_stmt
= gimple_build_assign (low
, VEC_PERM_EXPR
, vect1
,
4803 vect2
, perm_mask_low
);
4804 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
4805 (*result_chain
)[2*j
+1] = low
;
4807 memcpy (dr_chain
.address (), result_chain
->address (),
4808 length
* sizeof (tree
));
4813 /* Function vect_setup_realignment
4815 This function is called when vectorizing an unaligned load using
4816 the dr_explicit_realign[_optimized] scheme.
4817 This function generates the following code at the loop prolog:
4820 x msq_init = *(floor(p)); # prolog load
4821 realignment_token = call target_builtin;
4823 x msq = phi (msq_init, ---)
4825 The stmts marked with x are generated only for the case of
4826 dr_explicit_realign_optimized.
4828 The code above sets up a new (vector) pointer, pointing to the first
4829 location accessed by STMT, and a "floor-aligned" load using that pointer.
4830 It also generates code to compute the "realignment-token" (if the relevant
4831 target hook was defined), and creates a phi-node at the loop-header bb
4832 whose arguments are the result of the prolog-load (created by this
4833 function) and the result of a load that takes place in the loop (to be
4834 created by the caller to this function).
4836 For the case of dr_explicit_realign_optimized:
4837 The caller to this function uses the phi-result (msq) to create the
4838 realignment code inside the loop, and sets up the missing phi argument,
4841 msq = phi (msq_init, lsq)
4842 lsq = *(floor(p')); # load in loop
4843 result = realign_load (msq, lsq, realignment_token);
4845 For the case of dr_explicit_realign:
4847 msq = *(floor(p)); # load in loop
4849 lsq = *(floor(p')); # load in loop
4850 result = realign_load (msq, lsq, realignment_token);
4853 STMT - (scalar) load stmt to be vectorized. This load accesses
4854 a memory location that may be unaligned.
4855 BSI - place where new code is to be inserted.
4856 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4860 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4861 target hook, if defined.
4862 Return value - the result of the loop-header phi node. */
4865 vect_setup_realignment (gimple
*stmt
, gimple_stmt_iterator
*gsi
,
4866 tree
*realignment_token
,
4867 enum dr_alignment_support alignment_support_scheme
,
4869 struct loop
**at_loop
)
4871 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
4872 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
4873 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
4874 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
4875 struct loop
*loop
= NULL
;
4877 tree scalar_dest
= gimple_assign_lhs (stmt
);
4883 tree msq_init
= NULL_TREE
;
4886 tree msq
= NULL_TREE
;
4887 gimple_seq stmts
= NULL
;
4889 bool compute_in_loop
= false;
4890 bool nested_in_vect_loop
= false;
4891 struct loop
*containing_loop
= (gimple_bb (stmt
))->loop_father
;
4892 struct loop
*loop_for_initial_load
= NULL
;
4896 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
4897 nested_in_vect_loop
= nested_in_vect_loop_p (loop
, stmt
);
4900 gcc_assert (alignment_support_scheme
== dr_explicit_realign
4901 || alignment_support_scheme
== dr_explicit_realign_optimized
);
4903 /* We need to generate three things:
4904 1. the misalignment computation
4905 2. the extra vector load (for the optimized realignment scheme).
4906 3. the phi node for the two vectors from which the realignment is
4907 done (for the optimized realignment scheme). */
4909 /* 1. Determine where to generate the misalignment computation.
4911 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4912 calculation will be generated by this function, outside the loop (in the
4913 preheader). Otherwise, INIT_ADDR had already been computed for us by the
4914 caller, inside the loop.
4916 Background: If the misalignment remains fixed throughout the iterations of
4917 the loop, then both realignment schemes are applicable, and also the
4918 misalignment computation can be done outside LOOP. This is because we are
4919 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4920 are a multiple of VS (the Vector Size), and therefore the misalignment in
4921 different vectorized LOOP iterations is always the same.
4922 The problem arises only if the memory access is in an inner-loop nested
4923 inside LOOP, which is now being vectorized using outer-loop vectorization.
4924 This is the only case when the misalignment of the memory access may not
4925 remain fixed throughout the iterations of the inner-loop (as explained in
4926 detail in vect_supportable_dr_alignment). In this case, not only is the
4927 optimized realignment scheme not applicable, but also the misalignment
4928 computation (and generation of the realignment token that is passed to
4929 REALIGN_LOAD) have to be done inside the loop.
4931 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4932 or not, which in turn determines if the misalignment is computed inside
4933 the inner-loop, or outside LOOP. */
4935 if (init_addr
!= NULL_TREE
|| !loop_vinfo
)
4937 compute_in_loop
= true;
4938 gcc_assert (alignment_support_scheme
== dr_explicit_realign
);
4942 /* 2. Determine where to generate the extra vector load.
4944 For the optimized realignment scheme, instead of generating two vector
4945 loads in each iteration, we generate a single extra vector load in the
4946 preheader of the loop, and in each iteration reuse the result of the
4947 vector load from the previous iteration. In case the memory access is in
4948 an inner-loop nested inside LOOP, which is now being vectorized using
4949 outer-loop vectorization, we need to determine whether this initial vector
4950 load should be generated at the preheader of the inner-loop, or can be
4951 generated at the preheader of LOOP. If the memory access has no evolution
4952 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
4953 to be generated inside LOOP (in the preheader of the inner-loop). */
4955 if (nested_in_vect_loop
)
4957 tree outerloop_step
= STMT_VINFO_DR_STEP (stmt_info
);
4958 bool invariant_in_outerloop
=
4959 (tree_int_cst_compare (outerloop_step
, size_zero_node
) == 0);
4960 loop_for_initial_load
= (invariant_in_outerloop
? loop
: loop
->inner
);
4963 loop_for_initial_load
= loop
;
4965 *at_loop
= loop_for_initial_load
;
4967 if (loop_for_initial_load
)
4968 pe
= loop_preheader_edge (loop_for_initial_load
);
4970 /* 3. For the case of the optimized realignment, create the first vector
4971 load at the loop preheader. */
4973 if (alignment_support_scheme
== dr_explicit_realign_optimized
)
4975 /* Create msq_init = *(floor(p1)) in the loop preheader */
4978 gcc_assert (!compute_in_loop
);
4979 vec_dest
= vect_create_destination_var (scalar_dest
, vectype
);
4980 ptr
= vect_create_data_ref_ptr (stmt
, vectype
, loop_for_initial_load
,
4981 NULL_TREE
, &init_addr
, NULL
, &inc
,
4983 if (TREE_CODE (ptr
) == SSA_NAME
)
4984 new_temp
= copy_ssa_name (ptr
);
4986 new_temp
= make_ssa_name (TREE_TYPE (ptr
));
4987 new_stmt
= gimple_build_assign
4988 (new_temp
, BIT_AND_EXPR
, ptr
,
4989 build_int_cst (TREE_TYPE (ptr
),
4990 -(HOST_WIDE_INT
)TYPE_ALIGN_UNIT (vectype
)));
4991 new_bb
= gsi_insert_on_edge_immediate (pe
, new_stmt
);
4992 gcc_assert (!new_bb
);
4994 = build2 (MEM_REF
, TREE_TYPE (vec_dest
), new_temp
,
4995 build_int_cst (reference_alias_ptr_type (DR_REF (dr
)), 0));
4996 new_stmt
= gimple_build_assign (vec_dest
, data_ref
);
4997 new_temp
= make_ssa_name (vec_dest
, new_stmt
);
4998 gimple_assign_set_lhs (new_stmt
, new_temp
);
5001 new_bb
= gsi_insert_on_edge_immediate (pe
, new_stmt
);
5002 gcc_assert (!new_bb
);
5005 gsi_insert_before (gsi
, new_stmt
, GSI_SAME_STMT
);
5007 msq_init
= gimple_assign_lhs (new_stmt
);
5010 /* 4. Create realignment token using a target builtin, if available.
5011 It is done either inside the containing loop, or before LOOP (as
5012 determined above). */
5014 if (targetm
.vectorize
.builtin_mask_for_load
)
5019 /* Compute INIT_ADDR - the initial addressed accessed by this memref. */
5022 /* Generate the INIT_ADDR computation outside LOOP. */
5023 init_addr
= vect_create_addr_base_for_vector_ref (stmt
, &stmts
,
5027 pe
= loop_preheader_edge (loop
);
5028 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, stmts
);
5029 gcc_assert (!new_bb
);
5032 gsi_insert_seq_before (gsi
, stmts
, GSI_SAME_STMT
);
5035 builtin_decl
= targetm
.vectorize
.builtin_mask_for_load ();
5036 new_stmt
= gimple_build_call (builtin_decl
, 1, init_addr
);
5038 vect_create_destination_var (scalar_dest
,
5039 gimple_call_return_type (new_stmt
));
5040 new_temp
= make_ssa_name (vec_dest
, new_stmt
);
5041 gimple_call_set_lhs (new_stmt
, new_temp
);
5043 if (compute_in_loop
)
5044 gsi_insert_before (gsi
, new_stmt
, GSI_SAME_STMT
);
5047 /* Generate the misalignment computation outside LOOP. */
5048 pe
= loop_preheader_edge (loop
);
5049 new_bb
= gsi_insert_on_edge_immediate (pe
, new_stmt
);
5050 gcc_assert (!new_bb
);
5053 *realignment_token
= gimple_call_lhs (new_stmt
);
5055 /* The result of the CALL_EXPR to this builtin is determined from
5056 the value of the parameter and no global variables are touched
5057 which makes the builtin a "const" function. Requiring the
5058 builtin to have the "const" attribute makes it unnecessary
5059 to call mark_call_clobbered. */
5060 gcc_assert (TREE_READONLY (builtin_decl
));
5063 if (alignment_support_scheme
== dr_explicit_realign
)
5066 gcc_assert (!compute_in_loop
);
5067 gcc_assert (alignment_support_scheme
== dr_explicit_realign_optimized
);
5070 /* 5. Create msq = phi <msq_init, lsq> in loop */
5072 pe
= loop_preheader_edge (containing_loop
);
5073 vec_dest
= vect_create_destination_var (scalar_dest
, vectype
);
5074 msq
= make_ssa_name (vec_dest
);
5075 phi_stmt
= create_phi_node (msq
, containing_loop
->header
);
5076 add_phi_arg (phi_stmt
, msq_init
, pe
, UNKNOWN_LOCATION
);
5082 /* Function vect_grouped_load_supported.
5084 Returns TRUE if even and odd permutations are supported,
5085 and FALSE otherwise. */
5088 vect_grouped_load_supported (tree vectype
, unsigned HOST_WIDE_INT count
)
5090 machine_mode mode
= TYPE_MODE (vectype
);
5092 /* vect_permute_load_chain requires the group size to be equal to 3 or
5093 be a power of two. */
5094 if (count
!= 3 && exact_log2 (count
) == -1)
5096 if (dump_enabled_p ())
5097 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5098 "the size of the group of accesses"
5099 " is not a power of 2 or not equal to 3\n");
5103 /* Check that the permutation is supported. */
5104 if (VECTOR_MODE_P (mode
))
5106 unsigned int i
, j
, nelt
= GET_MODE_NUNITS (mode
);
5107 unsigned char *sel
= XALLOCAVEC (unsigned char, nelt
);
5112 for (k
= 0; k
< 3; k
++)
5114 for (i
= 0; i
< nelt
; i
++)
5115 if (3 * i
+ k
< 2 * nelt
)
5119 if (!can_vec_perm_p (mode
, false, sel
))
5121 if (dump_enabled_p ())
5122 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5123 "shuffle of 3 loads is not supported by"
5127 for (i
= 0, j
= 0; i
< nelt
; i
++)
5128 if (3 * i
+ k
< 2 * nelt
)
5131 sel
[i
] = nelt
+ ((nelt
+ k
) % 3) + 3 * (j
++);
5132 if (!can_vec_perm_p (mode
, false, sel
))
5134 if (dump_enabled_p ())
5135 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5136 "shuffle of 3 loads is not supported by"
5145 /* If length is not equal to 3 then only power of 2 is supported. */
5146 gcc_assert (exact_log2 (count
) != -1);
5147 for (i
= 0; i
< nelt
; i
++)
5149 if (can_vec_perm_p (mode
, false, sel
))
5151 for (i
= 0; i
< nelt
; i
++)
5153 if (can_vec_perm_p (mode
, false, sel
))
5159 if (dump_enabled_p ())
5160 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5161 "extract even/odd not supported by target\n");
5165 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
5169 vect_load_lanes_supported (tree vectype
, unsigned HOST_WIDE_INT count
)
5171 return vect_lanes_optab_supported_p ("vec_load_lanes",
5172 vec_load_lanes_optab
,
5176 /* Function vect_permute_load_chain.
5178 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5179 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5180 the input data correctly. Return the final references for loads in
5183 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5184 The input is 4 vectors each containing 8 elements. We assign a number to each
5185 element, the input sequence is:
5187 1st vec: 0 1 2 3 4 5 6 7
5188 2nd vec: 8 9 10 11 12 13 14 15
5189 3rd vec: 16 17 18 19 20 21 22 23
5190 4th vec: 24 25 26 27 28 29 30 31
5192 The output sequence should be:
5194 1st vec: 0 4 8 12 16 20 24 28
5195 2nd vec: 1 5 9 13 17 21 25 29
5196 3rd vec: 2 6 10 14 18 22 26 30
5197 4th vec: 3 7 11 15 19 23 27 31
5199 i.e., the first output vector should contain the first elements of each
5200 interleaving group, etc.
5202 We use extract_even/odd instructions to create such output. The input of
5203 each extract_even/odd operation is two vectors
5207 and the output is the vector of extracted even/odd elements. The output of
5208 extract_even will be: 0 2 4 6
5209 and of extract_odd: 1 3 5 7
5212 The permutation is done in log LENGTH stages. In each stage extract_even
5213 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5214 their order. In our example,
5216 E1: extract_even (1st vec, 2nd vec)
5217 E2: extract_odd (1st vec, 2nd vec)
5218 E3: extract_even (3rd vec, 4th vec)
5219 E4: extract_odd (3rd vec, 4th vec)
5221 The output for the first stage will be:
5223 E1: 0 2 4 6 8 10 12 14
5224 E2: 1 3 5 7 9 11 13 15
5225 E3: 16 18 20 22 24 26 28 30
5226 E4: 17 19 21 23 25 27 29 31
5228 In order to proceed and create the correct sequence for the next stage (or
5229 for the correct output, if the second stage is the last one, as in our
5230 example), we first put the output of extract_even operation and then the
5231 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5232 The input for the second stage is:
5234 1st vec (E1): 0 2 4 6 8 10 12 14
5235 2nd vec (E3): 16 18 20 22 24 26 28 30
5236 3rd vec (E2): 1 3 5 7 9 11 13 15
5237 4th vec (E4): 17 19 21 23 25 27 29 31
5239 The output of the second stage:
5241 E1: 0 4 8 12 16 20 24 28
5242 E2: 2 6 10 14 18 22 26 30
5243 E3: 1 5 9 13 17 21 25 29
5244 E4: 3 7 11 15 19 23 27 31
5246 And RESULT_CHAIN after reordering:
5248 1st vec (E1): 0 4 8 12 16 20 24 28
5249 2nd vec (E3): 1 5 9 13 17 21 25 29
5250 3rd vec (E2): 2 6 10 14 18 22 26 30
5251 4th vec (E4): 3 7 11 15 19 23 27 31. */
5254 vect_permute_load_chain (vec
<tree
> dr_chain
,
5255 unsigned int length
,
5257 gimple_stmt_iterator
*gsi
,
5258 vec
<tree
> *result_chain
)
5260 tree data_ref
, first_vect
, second_vect
;
5261 tree perm_mask_even
, perm_mask_odd
;
5262 tree perm3_mask_low
, perm3_mask_high
;
5264 tree vectype
= STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt
));
5265 unsigned int i
, j
, log_length
= exact_log2 (length
);
5266 unsigned nelt
= TYPE_VECTOR_SUBPARTS (vectype
);
5267 unsigned char *sel
= XALLOCAVEC (unsigned char, nelt
);
5269 result_chain
->quick_grow (length
);
5270 memcpy (result_chain
->address (), dr_chain
.address (),
5271 length
* sizeof (tree
));
5277 for (k
= 0; k
< 3; k
++)
5279 for (i
= 0; i
< nelt
; i
++)
5280 if (3 * i
+ k
< 2 * nelt
)
5284 perm3_mask_low
= vect_gen_perm_mask_checked (vectype
, sel
);
5286 for (i
= 0, j
= 0; i
< nelt
; i
++)
5287 if (3 * i
+ k
< 2 * nelt
)
5290 sel
[i
] = nelt
+ ((nelt
+ k
) % 3) + 3 * (j
++);
5292 perm3_mask_high
= vect_gen_perm_mask_checked (vectype
, sel
);
5294 first_vect
= dr_chain
[0];
5295 second_vect
= dr_chain
[1];
5297 /* Create interleaving stmt (low part of):
5298 low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5300 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3_low");
5301 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, first_vect
,
5302 second_vect
, perm3_mask_low
);
5303 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5305 /* Create interleaving stmt (high part of):
5306 high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5308 first_vect
= data_ref
;
5309 second_vect
= dr_chain
[2];
5310 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3_high");
5311 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, first_vect
,
5312 second_vect
, perm3_mask_high
);
5313 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5314 (*result_chain
)[k
] = data_ref
;
5319 /* If length is not equal to 3 then only power of 2 is supported. */
5320 gcc_assert (exact_log2 (length
) != -1);
5322 for (i
= 0; i
< nelt
; ++i
)
5324 perm_mask_even
= vect_gen_perm_mask_checked (vectype
, sel
);
5326 for (i
= 0; i
< nelt
; ++i
)
5328 perm_mask_odd
= vect_gen_perm_mask_checked (vectype
, sel
);
5330 for (i
= 0; i
< log_length
; i
++)
5332 for (j
= 0; j
< length
; j
+= 2)
5334 first_vect
= dr_chain
[j
];
5335 second_vect
= dr_chain
[j
+1];
5337 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5338 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_perm_even");
5339 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5340 first_vect
, second_vect
,
5342 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5343 (*result_chain
)[j
/2] = data_ref
;
5345 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5346 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_perm_odd");
5347 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5348 first_vect
, second_vect
,
5350 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5351 (*result_chain
)[j
/2+length
/2] = data_ref
;
5353 memcpy (dr_chain
.address (), result_chain
->address (),
5354 length
* sizeof (tree
));
5359 /* Function vect_shift_permute_load_chain.
5361 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5362 sequence of stmts to reorder the input data accordingly.
5363 Return the final references for loads in RESULT_CHAIN.
5364 Return true if successed, false otherwise.
5366 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5367 The input is 3 vectors each containing 8 elements. We assign a
5368 number to each element, the input sequence is:
5370 1st vec: 0 1 2 3 4 5 6 7
5371 2nd vec: 8 9 10 11 12 13 14 15
5372 3rd vec: 16 17 18 19 20 21 22 23
5374 The output sequence should be:
5376 1st vec: 0 3 6 9 12 15 18 21
5377 2nd vec: 1 4 7 10 13 16 19 22
5378 3rd vec: 2 5 8 11 14 17 20 23
5380 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5382 First we shuffle all 3 vectors to get correct elements order:
5384 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
5385 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
5386 3rd vec: (16 19 22) (17 20 23) (18 21)
5388 Next we unite and shift vector 3 times:
5391 shift right by 6 the concatenation of:
5392 "1st vec" and "2nd vec"
5393 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5394 "2nd vec" and "3rd vec"
5395 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5396 "3rd vec" and "1st vec"
5397 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
5400 So that now new vectors are:
5402 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
5403 2nd vec: (10 13) (16 19 22) (17 20 23)
5404 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
5407 shift right by 5 the concatenation of:
5408 "1st vec" and "3rd vec"
5409 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
5410 "2nd vec" and "1st vec"
5411 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
5412 "3rd vec" and "2nd vec"
5413 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
5416 So that now new vectors are:
5418 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
5419 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
5420 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
5423 shift right by 5 the concatenation of:
5424 "1st vec" and "1st vec"
5425 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
5426 shift right by 3 the concatenation of:
5427 "2nd vec" and "2nd vec"
5428 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
5431 So that now all vectors are READY:
5432 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
5433 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
5434 3rd vec: ( 1 4 7) (10 13) (16 19 22)
5436 This algorithm is faster than one in vect_permute_load_chain if:
5437 1. "shift of a concatination" is faster than general permutation.
5439 2. The TARGET machine can't execute vector instructions in parallel.
5440 This is because each step of the algorithm depends on previous.
5441 The algorithm in vect_permute_load_chain is much more parallel.
5443 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
5447 vect_shift_permute_load_chain (vec
<tree
> dr_chain
,
5448 unsigned int length
,
5450 gimple_stmt_iterator
*gsi
,
5451 vec
<tree
> *result_chain
)
5453 tree vect
[3], vect_shift
[3], data_ref
, first_vect
, second_vect
;
5454 tree perm2_mask1
, perm2_mask2
, perm3_mask
;
5455 tree select_mask
, shift1_mask
, shift2_mask
, shift3_mask
, shift4_mask
;
5458 tree vectype
= STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt
));
5460 unsigned nelt
= TYPE_VECTOR_SUBPARTS (vectype
);
5461 unsigned char *sel
= XALLOCAVEC (unsigned char, nelt
);
5462 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
5463 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
5465 result_chain
->quick_grow (length
);
5466 memcpy (result_chain
->address (), dr_chain
.address (),
5467 length
* sizeof (tree
));
5469 if (exact_log2 (length
) != -1 && LOOP_VINFO_VECT_FACTOR (loop_vinfo
) > 4)
5471 unsigned int j
, log_length
= exact_log2 (length
);
5472 for (i
= 0; i
< nelt
/ 2; ++i
)
5474 for (i
= 0; i
< nelt
/ 2; ++i
)
5475 sel
[nelt
/ 2 + i
] = i
* 2 + 1;
5476 if (!can_vec_perm_p (TYPE_MODE (vectype
), false, sel
))
5478 if (dump_enabled_p ())
5479 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5480 "shuffle of 2 fields structure is not \
5481 supported by target\n");
5484 perm2_mask1
= vect_gen_perm_mask_checked (vectype
, sel
);
5486 for (i
= 0; i
< nelt
/ 2; ++i
)
5488 for (i
= 0; i
< nelt
/ 2; ++i
)
5489 sel
[nelt
/ 2 + i
] = i
* 2;
5490 if (!can_vec_perm_p (TYPE_MODE (vectype
), false, sel
))
5492 if (dump_enabled_p ())
5493 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5494 "shuffle of 2 fields structure is not \
5495 supported by target\n");
5498 perm2_mask2
= vect_gen_perm_mask_checked (vectype
, sel
);
5500 /* Generating permutation constant to shift all elements.
5501 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
5502 for (i
= 0; i
< nelt
; i
++)
5503 sel
[i
] = nelt
/ 2 + i
;
5504 if (!can_vec_perm_p (TYPE_MODE (vectype
), false, sel
))
5506 if (dump_enabled_p ())
5507 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5508 "shift permutation is not supported by target\n");
5511 shift1_mask
= vect_gen_perm_mask_checked (vectype
, sel
);
5513 /* Generating permutation constant to select vector from 2.
5514 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
5515 for (i
= 0; i
< nelt
/ 2; i
++)
5517 for (i
= nelt
/ 2; i
< nelt
; i
++)
5519 if (!can_vec_perm_p (TYPE_MODE (vectype
), false, sel
))
5521 if (dump_enabled_p ())
5522 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5523 "select is not supported by target\n");
5526 select_mask
= vect_gen_perm_mask_checked (vectype
, sel
);
5528 for (i
= 0; i
< log_length
; i
++)
5530 for (j
= 0; j
< length
; j
+= 2)
5532 first_vect
= dr_chain
[j
];
5533 second_vect
= dr_chain
[j
+ 1];
5535 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle2");
5536 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5537 first_vect
, first_vect
,
5539 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5542 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle2");
5543 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5544 second_vect
, second_vect
,
5546 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5549 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shift");
5550 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5551 vect
[0], vect
[1], shift1_mask
);
5552 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5553 (*result_chain
)[j
/2 + length
/2] = data_ref
;
5555 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_select");
5556 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5557 vect
[0], vect
[1], select_mask
);
5558 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5559 (*result_chain
)[j
/2] = data_ref
;
5561 memcpy (dr_chain
.address (), result_chain
->address (),
5562 length
* sizeof (tree
));
5566 if (length
== 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo
) > 2)
5568 unsigned int k
= 0, l
= 0;
5570 /* Generating permutation constant to get all elements in rigth order.
5571 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
5572 for (i
= 0; i
< nelt
; i
++)
5574 if (3 * k
+ (l
% 3) >= nelt
)
5577 l
+= (3 - (nelt
% 3));
5579 sel
[i
] = 3 * k
+ (l
% 3);
5582 if (!can_vec_perm_p (TYPE_MODE (vectype
), false, sel
))
5584 if (dump_enabled_p ())
5585 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5586 "shuffle of 3 fields structure is not \
5587 supported by target\n");
5590 perm3_mask
= vect_gen_perm_mask_checked (vectype
, sel
);
5592 /* Generating permutation constant to shift all elements.
5593 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
5594 for (i
= 0; i
< nelt
; i
++)
5595 sel
[i
] = 2 * (nelt
/ 3) + (nelt
% 3) + i
;
5596 if (!can_vec_perm_p (TYPE_MODE (vectype
), false, sel
))
5598 if (dump_enabled_p ())
5599 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5600 "shift permutation is not supported by target\n");
5603 shift1_mask
= vect_gen_perm_mask_checked (vectype
, sel
);
5605 /* Generating permutation constant to shift all elements.
5606 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5607 for (i
= 0; i
< nelt
; i
++)
5608 sel
[i
] = 2 * (nelt
/ 3) + 1 + i
;
5609 if (!can_vec_perm_p (TYPE_MODE (vectype
), false, sel
))
5611 if (dump_enabled_p ())
5612 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5613 "shift permutation is not supported by target\n");
5616 shift2_mask
= vect_gen_perm_mask_checked (vectype
, sel
);
5618 /* Generating permutation constant to shift all elements.
5619 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
5620 for (i
= 0; i
< nelt
; i
++)
5621 sel
[i
] = (nelt
/ 3) + (nelt
% 3) / 2 + i
;
5622 if (!can_vec_perm_p (TYPE_MODE (vectype
), false, sel
))
5624 if (dump_enabled_p ())
5625 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5626 "shift permutation is not supported by target\n");
5629 shift3_mask
= vect_gen_perm_mask_checked (vectype
, sel
);
5631 /* Generating permutation constant to shift all elements.
5632 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5633 for (i
= 0; i
< nelt
; i
++)
5634 sel
[i
] = 2 * (nelt
/ 3) + (nelt
% 3) / 2 + i
;
5635 if (!can_vec_perm_p (TYPE_MODE (vectype
), false, sel
))
5637 if (dump_enabled_p ())
5638 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5639 "shift permutation is not supported by target\n");
5642 shift4_mask
= vect_gen_perm_mask_checked (vectype
, sel
);
5644 for (k
= 0; k
< 3; k
++)
5646 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3");
5647 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5648 dr_chain
[k
], dr_chain
[k
],
5650 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5654 for (k
= 0; k
< 3; k
++)
5656 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shift1");
5657 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5658 vect
[k
% 3], vect
[(k
+ 1) % 3],
5660 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5661 vect_shift
[k
] = data_ref
;
5664 for (k
= 0; k
< 3; k
++)
5666 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shift2");
5667 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5668 vect_shift
[(4 - k
) % 3],
5669 vect_shift
[(3 - k
) % 3],
5671 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5675 (*result_chain
)[3 - (nelt
% 3)] = vect
[2];
5677 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shift3");
5678 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, vect
[0],
5679 vect
[0], shift3_mask
);
5680 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5681 (*result_chain
)[nelt
% 3] = data_ref
;
5683 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shift4");
5684 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, vect
[1],
5685 vect
[1], shift4_mask
);
5686 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5687 (*result_chain
)[0] = data_ref
;
5693 /* Function vect_transform_grouped_load.
5695 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5696 to perform their permutation and ascribe the result vectorized statements to
5697 the scalar statements.
5701 vect_transform_grouped_load (gimple
*stmt
, vec
<tree
> dr_chain
, int size
,
5702 gimple_stmt_iterator
*gsi
)
5705 vec
<tree
> result_chain
= vNULL
;
5707 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5708 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
5709 vectors, that are ready for vector computation. */
5710 result_chain
.create (size
);
5712 /* If reassociation width for vector type is 2 or greater target machine can
5713 execute 2 or more vector instructions in parallel. Otherwise try to
5714 get chain for loads group using vect_shift_permute_load_chain. */
5715 mode
= TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt
)));
5716 if (targetm
.sched
.reassociation_width (VEC_PERM_EXPR
, mode
) > 1
5717 || exact_log2 (size
) != -1
5718 || !vect_shift_permute_load_chain (dr_chain
, size
, stmt
,
5719 gsi
, &result_chain
))
5720 vect_permute_load_chain (dr_chain
, size
, stmt
, gsi
, &result_chain
);
5721 vect_record_grouped_load_vectors (stmt
, result_chain
);
5722 result_chain
.release ();
5725 /* RESULT_CHAIN contains the output of a group of grouped loads that were
5726 generated as part of the vectorization of STMT. Assign the statement
5727 for each vector to the associated scalar statement. */
5730 vect_record_grouped_load_vectors (gimple
*stmt
, vec
<tree
> result_chain
)
5732 gimple
*first_stmt
= GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
));
5733 gimple
*next_stmt
, *new_stmt
;
5734 unsigned int i
, gap_count
;
5737 /* Put a permuted data-ref in the VECTORIZED_STMT field.
5738 Since we scan the chain starting from it's first node, their order
5739 corresponds the order of data-refs in RESULT_CHAIN. */
5740 next_stmt
= first_stmt
;
5742 FOR_EACH_VEC_ELT (result_chain
, i
, tmp_data_ref
)
5747 /* Skip the gaps. Loads created for the gaps will be removed by dead
5748 code elimination pass later. No need to check for the first stmt in
5749 the group, since it always exists.
5750 GROUP_GAP is the number of steps in elements from the previous
5751 access (if there is no gap GROUP_GAP is 1). We skip loads that
5752 correspond to the gaps. */
5753 if (next_stmt
!= first_stmt
5754 && gap_count
< GROUP_GAP (vinfo_for_stmt (next_stmt
)))
5762 new_stmt
= SSA_NAME_DEF_STMT (tmp_data_ref
);
5763 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5764 copies, and we put the new vector statement in the first available
5766 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt
)))
5767 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt
)) = new_stmt
;
5770 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt
)))
5773 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt
));
5775 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt
));
5778 prev_stmt
= rel_stmt
;
5780 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt
));
5783 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt
)) =
5788 next_stmt
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt
));
5790 /* If NEXT_STMT accesses the same DR as the previous statement,
5791 put the same TMP_DATA_REF as its vectorized statement; otherwise
5792 get the next data-ref from RESULT_CHAIN. */
5793 if (!next_stmt
|| !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt
)))
5799 /* Function vect_force_dr_alignment_p.
5801 Returns whether the alignment of a DECL can be forced to be aligned
5802 on ALIGNMENT bit boundary. */
5805 vect_can_force_dr_alignment_p (const_tree decl
, unsigned int alignment
)
5807 if (TREE_CODE (decl
) != VAR_DECL
)
5810 if (decl_in_symtab_p (decl
)
5811 && !symtab_node::get (decl
)->can_increase_alignment_p ())
5814 if (TREE_STATIC (decl
))
5815 return (alignment
<= MAX_OFILE_ALIGNMENT
);
5817 return (alignment
<= MAX_STACK_ALIGNMENT
);
5821 /* Return whether the data reference DR is supported with respect to its
5823 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
5824 it is aligned, i.e., check if it is possible to vectorize it with different
5827 enum dr_alignment_support
5828 vect_supportable_dr_alignment (struct data_reference
*dr
,
5829 bool check_aligned_accesses
)
5831 gimple
*stmt
= DR_STMT (dr
);
5832 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
5833 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
5834 machine_mode mode
= TYPE_MODE (vectype
);
5835 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
5836 struct loop
*vect_loop
= NULL
;
5837 bool nested_in_vect_loop
= false;
5839 if (aligned_access_p (dr
) && !check_aligned_accesses
)
5842 /* For now assume all conditional loads/stores support unaligned
5843 access without any special code. */
5844 if (is_gimple_call (stmt
)
5845 && gimple_call_internal_p (stmt
)
5846 && (gimple_call_internal_fn (stmt
) == IFN_MASK_LOAD
5847 || gimple_call_internal_fn (stmt
) == IFN_MASK_STORE
))
5848 return dr_unaligned_supported
;
5852 vect_loop
= LOOP_VINFO_LOOP (loop_vinfo
);
5853 nested_in_vect_loop
= nested_in_vect_loop_p (vect_loop
, stmt
);
5856 /* Possibly unaligned access. */
5858 /* We can choose between using the implicit realignment scheme (generating
5859 a misaligned_move stmt) and the explicit realignment scheme (generating
5860 aligned loads with a REALIGN_LOAD). There are two variants to the
5861 explicit realignment scheme: optimized, and unoptimized.
5862 We can optimize the realignment only if the step between consecutive
5863 vector loads is equal to the vector size. Since the vector memory
5864 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
5865 is guaranteed that the misalignment amount remains the same throughout the
5866 execution of the vectorized loop. Therefore, we can create the
5867 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
5868 at the loop preheader.
5870 However, in the case of outer-loop vectorization, when vectorizing a
5871 memory access in the inner-loop nested within the LOOP that is now being
5872 vectorized, while it is guaranteed that the misalignment of the
5873 vectorized memory access will remain the same in different outer-loop
5874 iterations, it is *not* guaranteed that is will remain the same throughout
5875 the execution of the inner-loop. This is because the inner-loop advances
5876 with the original scalar step (and not in steps of VS). If the inner-loop
5877 step happens to be a multiple of VS, then the misalignment remains fixed
5878 and we can use the optimized realignment scheme. For example:
5884 When vectorizing the i-loop in the above example, the step between
5885 consecutive vector loads is 1, and so the misalignment does not remain
5886 fixed across the execution of the inner-loop, and the realignment cannot
5887 be optimized (as illustrated in the following pseudo vectorized loop):
5889 for (i=0; i<N; i+=4)
5890 for (j=0; j<M; j++){
5891 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
5892 // when j is {0,1,2,3,4,5,6,7,...} respectively.
5893 // (assuming that we start from an aligned address).
5896 We therefore have to use the unoptimized realignment scheme:
5898 for (i=0; i<N; i+=4)
5899 for (j=k; j<M; j+=4)
5900 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
5901 // that the misalignment of the initial address is
5904 The loop can then be vectorized as follows:
5906 for (k=0; k<4; k++){
5907 rt = get_realignment_token (&vp[k]);
5908 for (i=0; i<N; i+=4){
5910 for (j=k; j<M; j+=4){
5912 va = REALIGN_LOAD <v1,v2,rt>;
5919 if (DR_IS_READ (dr
))
5921 bool is_packed
= false;
5922 tree type
= (TREE_TYPE (DR_REF (dr
)));
5924 if (optab_handler (vec_realign_load_optab
, mode
) != CODE_FOR_nothing
5925 && (!targetm
.vectorize
.builtin_mask_for_load
5926 || targetm
.vectorize
.builtin_mask_for_load ()))
5928 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
5929 if ((nested_in_vect_loop
5930 && (TREE_INT_CST_LOW (DR_STEP (dr
))
5931 != GET_MODE_SIZE (TYPE_MODE (vectype
))))
5933 return dr_explicit_realign
;
5935 return dr_explicit_realign_optimized
;
5937 if (!known_alignment_for_access_p (dr
))
5938 is_packed
= not_size_aligned (DR_REF (dr
));
5940 if ((TYPE_USER_ALIGN (type
) && !is_packed
)
5941 || targetm
.vectorize
.
5942 support_vector_misalignment (mode
, type
,
5943 DR_MISALIGNMENT (dr
), is_packed
))
5944 /* Can't software pipeline the loads, but can at least do them. */
5945 return dr_unaligned_supported
;
5949 bool is_packed
= false;
5950 tree type
= (TREE_TYPE (DR_REF (dr
)));
5952 if (!known_alignment_for_access_p (dr
))
5953 is_packed
= not_size_aligned (DR_REF (dr
));
5955 if ((TYPE_USER_ALIGN (type
) && !is_packed
)
5956 || targetm
.vectorize
.
5957 support_vector_misalignment (mode
, type
,
5958 DR_MISALIGNMENT (dr
), is_packed
))
5959 return dr_unaligned_supported
;
5963 return dr_unaligned_unsupported
;