/* Data References Analysis and Manipulation Utilities for Vectorization.
   Copyright (C) 2003-2017 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
24 #include "coretypes.h"
34 #include "optabs-tree.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
54 /* Return true if load- or store-lanes optab OPTAB is implemented for
55 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
58 vect_lanes_optab_supported_p (const char *name
, convert_optab optab
,
59 tree vectype
, unsigned HOST_WIDE_INT count
)
61 machine_mode mode
, array_mode
;
64 mode
= TYPE_MODE (vectype
);
65 limit_p
= !targetm
.array_mode_supported_p (mode
, count
);
66 array_mode
= mode_for_size (count
* GET_MODE_BITSIZE (mode
),
69 if (array_mode
== BLKmode
)
71 if (dump_enabled_p ())
72 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
73 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC
"]\n",
74 GET_MODE_NAME (mode
), count
);
78 if (convert_optab_handler (optab
, array_mode
, mode
) == CODE_FOR_nothing
)
80 if (dump_enabled_p ())
81 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
82 "cannot use %s<%s><%s>\n", name
,
83 GET_MODE_NAME (array_mode
), GET_MODE_NAME (mode
));
87 if (dump_enabled_p ())
88 dump_printf_loc (MSG_NOTE
, vect_location
,
89 "can use %s<%s><%s>\n", name
, GET_MODE_NAME (array_mode
),
90 GET_MODE_NAME (mode
));
96 /* Return the smallest scalar part of STMT.
97 This is used to determine the vectype of the stmt. We generally set the
98 vectype according to the type of the result (lhs). For stmts whose
99 result-type is different than the type of the arguments (e.g., demotion,
100 promotion), vectype will be reset appropriately (later). Note that we have
101 to visit the smallest datatype in this function, because that determines the
102 VF. If the smallest datatype in the loop is present only as the rhs of a
103 promotion operation - we'd miss it.
104 Such a case, where a variable of this datatype does not appear in the lhs
105 anywhere in the loop, can only occur if it's an invariant: e.g.:
106 'int_x = (int) short_inv', which we'd expect to have been optimized away by
107 invariant motion. However, we cannot rely on invariant motion to always
108 take invariants out of the loop, and so in the case of promotion we also
109 have to check the rhs.
110 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
114 vect_get_smallest_scalar_type (gimple
*stmt
, HOST_WIDE_INT
*lhs_size_unit
,
115 HOST_WIDE_INT
*rhs_size_unit
)
117 tree scalar_type
= gimple_expr_type (stmt
);
118 HOST_WIDE_INT lhs
, rhs
;
120 lhs
= rhs
= TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type
));
122 if (is_gimple_assign (stmt
)
123 && (gimple_assign_cast_p (stmt
)
124 || gimple_assign_rhs_code (stmt
) == WIDEN_MULT_EXPR
125 || gimple_assign_rhs_code (stmt
) == WIDEN_LSHIFT_EXPR
126 || gimple_assign_rhs_code (stmt
) == FLOAT_EXPR
))
128 tree rhs_type
= TREE_TYPE (gimple_assign_rhs1 (stmt
));
130 rhs
= TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type
));
132 scalar_type
= rhs_type
;
135 *lhs_size_unit
= lhs
;
136 *rhs_size_unit
= rhs
;
141 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
142 tested at run-time. Return TRUE if DDR was successfully inserted.
143 Return false if versioning is not supported. */
146 vect_mark_for_runtime_alias_test (ddr_p ddr
, loop_vec_info loop_vinfo
)
148 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
150 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS
) == 0)
153 if (dump_enabled_p ())
155 dump_printf_loc (MSG_NOTE
, vect_location
,
156 "mark for run-time aliasing test between ");
157 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (DDR_A (ddr
)));
158 dump_printf (MSG_NOTE
, " and ");
159 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (DDR_B (ddr
)));
160 dump_printf (MSG_NOTE
, "\n");
163 if (optimize_loop_nest_for_size_p (loop
))
165 if (dump_enabled_p ())
166 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
167 "versioning not supported when optimizing"
172 /* FORNOW: We don't support versioning with outer-loop vectorization. */
175 if (dump_enabled_p ())
176 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
177 "versioning not yet supported for outer-loops.\n");
181 /* FORNOW: We don't support creating runtime alias tests for non-constant
183 if (TREE_CODE (DR_STEP (DDR_A (ddr
))) != INTEGER_CST
184 || TREE_CODE (DR_STEP (DDR_B (ddr
))) != INTEGER_CST
)
186 if (dump_enabled_p ())
187 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
188 "versioning not yet supported for non-constant "
193 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo
).safe_push (ddr
);
198 /* Function vect_analyze_data_ref_dependence.
200 Return TRUE if there (might) exist a dependence between a memory-reference
201 DRA and a memory-reference DRB. When versioning for alias may check a
202 dependence at run-time, return FALSE. Adjust *MAX_VF according to
203 the data dependence. */
206 vect_analyze_data_ref_dependence (struct data_dependence_relation
*ddr
,
207 loop_vec_info loop_vinfo
, int *max_vf
)
210 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
211 struct data_reference
*dra
= DDR_A (ddr
);
212 struct data_reference
*drb
= DDR_B (ddr
);
213 stmt_vec_info stmtinfo_a
= vinfo_for_stmt (DR_STMT (dra
));
214 stmt_vec_info stmtinfo_b
= vinfo_for_stmt (DR_STMT (drb
));
215 lambda_vector dist_v
;
216 unsigned int loop_depth
;
218 /* In loop analysis all data references should be vectorizable. */
219 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a
)
220 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b
))
223 /* Independent data accesses. */
224 if (DDR_ARE_DEPENDENT (ddr
) == chrec_known
)
228 || (DR_IS_READ (dra
) && DR_IS_READ (drb
)))
231 /* We do not have to consider dependences between accesses that belong
232 to the same group. */
233 if (GROUP_FIRST_ELEMENT (stmtinfo_a
)
234 && GROUP_FIRST_ELEMENT (stmtinfo_a
) == GROUP_FIRST_ELEMENT (stmtinfo_b
))
237 /* Even if we have an anti-dependence then, as the vectorized loop covers at
238 least two scalar iterations, there is always also a true dependence.
239 As the vectorizer does not re-order loads and stores we can ignore
240 the anti-dependence if TBAA can disambiguate both DRs similar to the
241 case with known negative distance anti-dependences (positive
242 distance anti-dependences would violate TBAA constraints). */
243 if (((DR_IS_READ (dra
) && DR_IS_WRITE (drb
))
244 || (DR_IS_WRITE (dra
) && DR_IS_READ (drb
)))
245 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra
)),
246 get_alias_set (DR_REF (drb
))))
249 /* Unknown data dependence. */
250 if (DDR_ARE_DEPENDENT (ddr
) == chrec_dont_know
)
252 /* If user asserted safelen consecutive iterations can be
253 executed concurrently, assume independence. */
254 if (loop
->safelen
>= 2)
256 if (loop
->safelen
< *max_vf
)
257 *max_vf
= loop
->safelen
;
258 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo
) = false;
262 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a
)
263 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b
))
265 if (dump_enabled_p ())
267 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
268 "versioning for alias not supported for: "
269 "can't determine dependence between ");
270 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
272 dump_printf (MSG_MISSED_OPTIMIZATION
, " and ");
273 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
275 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
280 if (dump_enabled_p ())
282 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
283 "versioning for alias required: "
284 "can't determine dependence between ");
285 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
287 dump_printf (MSG_MISSED_OPTIMIZATION
, " and ");
288 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
290 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
293 /* Add to list of ddrs that need to be tested at run-time. */
294 return !vect_mark_for_runtime_alias_test (ddr
, loop_vinfo
);
297 /* Known data dependence. */
298 if (DDR_NUM_DIST_VECTS (ddr
) == 0)
300 /* If user asserted safelen consecutive iterations can be
301 executed concurrently, assume independence. */
302 if (loop
->safelen
>= 2)
304 if (loop
->safelen
< *max_vf
)
305 *max_vf
= loop
->safelen
;
306 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo
) = false;
310 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a
)
311 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b
))
313 if (dump_enabled_p ())
315 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
316 "versioning for alias not supported for: "
317 "bad dist vector for ");
318 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
320 dump_printf (MSG_MISSED_OPTIMIZATION
, " and ");
321 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
323 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
328 if (dump_enabled_p ())
330 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
331 "versioning for alias required: "
332 "bad dist vector for ");
333 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, DR_REF (dra
));
334 dump_printf (MSG_MISSED_OPTIMIZATION
, " and ");
335 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, DR_REF (drb
));
336 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
338 /* Add to list of ddrs that need to be tested at run-time. */
339 return !vect_mark_for_runtime_alias_test (ddr
, loop_vinfo
);
342 loop_depth
= index_in_loop_nest (loop
->num
, DDR_LOOP_NEST (ddr
));
343 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr
), i
, dist_v
)
345 int dist
= dist_v
[loop_depth
];
347 if (dump_enabled_p ())
348 dump_printf_loc (MSG_NOTE
, vect_location
,
349 "dependence distance = %d.\n", dist
);
353 if (dump_enabled_p ())
355 dump_printf_loc (MSG_NOTE
, vect_location
,
356 "dependence distance == 0 between ");
357 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dra
));
358 dump_printf (MSG_NOTE
, " and ");
359 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (drb
));
360 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
363 /* When we perform grouped accesses and perform implicit CSE
364 by detecting equal accesses and doing disambiguation with
365 runtime alias tests like for
373 where we will end up loading { a[i], a[i+1] } once, make
374 sure that inserting group loads before the first load and
375 stores after the last store will do the right thing.
376 Similar for groups like
380 where loads from the group interleave with the store. */
381 if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a
)
382 || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b
))
384 gimple
*earlier_stmt
;
385 earlier_stmt
= get_earlier_stmt (DR_STMT (dra
), DR_STMT (drb
));
387 (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt
))))
389 if (dump_enabled_p ())
390 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
391 "READ_WRITE dependence in interleaving."
400 if (dist
> 0 && DDR_REVERSED_P (ddr
))
402 /* If DDR_REVERSED_P the order of the data-refs in DDR was
403 reversed (to make distance vector positive), and the actual
404 distance is negative. */
405 if (dump_enabled_p ())
406 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
407 "dependence distance negative.\n");
408 /* Record a negative dependence distance to later limit the
409 amount of stmt copying / unrolling we can perform.
410 Only need to handle read-after-write dependence. */
412 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b
) == 0
413 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b
) > (unsigned)dist
))
414 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b
) = dist
;
419 && abs (dist
) < *max_vf
)
421 /* The dependence distance requires reduction of the maximal
422 vectorization factor. */
423 *max_vf
= abs (dist
);
424 if (dump_enabled_p ())
425 dump_printf_loc (MSG_NOTE
, vect_location
,
426 "adjusting maximal vectorization factor to %i\n",
430 if (abs (dist
) >= *max_vf
)
432 /* Dependence distance does not create dependence, as far as
433 vectorization is concerned, in this case. */
434 if (dump_enabled_p ())
435 dump_printf_loc (MSG_NOTE
, vect_location
,
436 "dependence distance >= VF.\n");
440 if (dump_enabled_p ())
442 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
443 "not vectorized, possible dependence "
444 "between data-refs ");
445 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dra
));
446 dump_printf (MSG_NOTE
, " and ");
447 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (drb
));
448 dump_printf (MSG_NOTE
, "\n");
457 /* Function vect_analyze_data_ref_dependences.
459 Examine all the data references in the loop, and make sure there do not
460 exist any data dependences between them. Set *MAX_VF according to
461 the maximum vectorization factor the data dependences allow. */
464 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo
, int *max_vf
)
467 struct data_dependence_relation
*ddr
;
469 if (dump_enabled_p ())
470 dump_printf_loc (MSG_NOTE
, vect_location
,
471 "=== vect_analyze_data_ref_dependences ===\n");
473 LOOP_VINFO_DDRS (loop_vinfo
)
474 .create (LOOP_VINFO_DATAREFS (loop_vinfo
).length ()
475 * LOOP_VINFO_DATAREFS (loop_vinfo
).length ());
476 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo
) = true;
477 /* We need read-read dependences to compute STMT_VINFO_SAME_ALIGN_REFS. */
478 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo
),
479 &LOOP_VINFO_DDRS (loop_vinfo
),
480 LOOP_VINFO_LOOP_NEST (loop_vinfo
), true))
483 /* For epilogues we either have no aliases or alias versioning
484 was applied to original loop. Therefore we may just get max_vf
485 using VF of original loop. */
486 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo
))
487 *max_vf
= LOOP_VINFO_ORIG_VECT_FACTOR (loop_vinfo
);
489 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo
), i
, ddr
)
490 if (vect_analyze_data_ref_dependence (ddr
, loop_vinfo
, max_vf
))
497 /* Function vect_slp_analyze_data_ref_dependence.
499 Return TRUE if there (might) exist a dependence between a memory-reference
500 DRA and a memory-reference DRB. When versioning for alias may check a
501 dependence at run-time, return FALSE. Adjust *MAX_VF according to
502 the data dependence. */
505 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation
*ddr
)
507 struct data_reference
*dra
= DDR_A (ddr
);
508 struct data_reference
*drb
= DDR_B (ddr
);
510 /* We need to check dependences of statements marked as unvectorizable
511 as well, they still can prohibit vectorization. */
513 /* Independent data accesses. */
514 if (DDR_ARE_DEPENDENT (ddr
) == chrec_known
)
520 /* Read-read is OK. */
521 if (DR_IS_READ (dra
) && DR_IS_READ (drb
))
524 /* If dra and drb are part of the same interleaving chain consider
526 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra
)))
527 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra
)))
528 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb
)))))
531 /* Unknown data dependence. */
532 if (DDR_ARE_DEPENDENT (ddr
) == chrec_dont_know
)
534 if (dump_enabled_p ())
536 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
537 "can't determine dependence between ");
538 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, DR_REF (dra
));
539 dump_printf (MSG_MISSED_OPTIMIZATION
, " and ");
540 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, DR_REF (drb
));
541 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
544 else if (dump_enabled_p ())
546 dump_printf_loc (MSG_NOTE
, vect_location
,
547 "determined dependence between ");
548 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dra
));
549 dump_printf (MSG_NOTE
, " and ");
550 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (drb
));
551 dump_printf (MSG_NOTE
, "\n");
558 /* Analyze dependences involved in the transform of SLP NODE. STORES
559 contain the vector of scalar stores of this instance if we are
560 disambiguating the loads. */
563 vect_slp_analyze_node_dependences (slp_instance instance
, slp_tree node
,
564 vec
<gimple
*> stores
, gimple
*last_store
)
566 /* This walks over all stmts involved in the SLP load/store done
567 in NODE verifying we can sink them up to the last stmt in the
569 gimple
*last_access
= vect_find_last_scalar_stmt_in_slp (node
);
570 for (unsigned k
= 0; k
< SLP_INSTANCE_GROUP_SIZE (instance
); ++k
)
572 gimple
*access
= SLP_TREE_SCALAR_STMTS (node
)[k
];
573 if (access
== last_access
)
575 data_reference
*dr_a
= STMT_VINFO_DATA_REF (vinfo_for_stmt (access
));
576 for (gimple_stmt_iterator gsi
= gsi_for_stmt (access
);
577 gsi_stmt (gsi
) != last_access
; gsi_next (&gsi
))
579 gimple
*stmt
= gsi_stmt (gsi
);
580 if (! gimple_vuse (stmt
)
581 || (DR_IS_READ (dr_a
) && ! gimple_vdef (stmt
)))
584 /* If we couldn't record a (single) data reference for this
585 stmt we have to give up. */
586 /* ??? Here and below if dependence analysis fails we can resort
587 to the alias oracle which can handle more kinds of stmts. */
588 data_reference
*dr_b
= STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt
));
592 bool dependent
= false;
593 /* If we run into a store of this same instance (we've just
594 marked those) then delay dependence checking until we run
595 into the last store because this is where it will have
596 been sunk to (and we verify if we can do that as well). */
597 if (gimple_visited_p (stmt
))
599 if (stmt
!= last_store
)
603 FOR_EACH_VEC_ELT (stores
, i
, store
)
605 data_reference
*store_dr
606 = STMT_VINFO_DATA_REF (vinfo_for_stmt (store
));
607 ddr_p ddr
= initialize_data_dependence_relation
608 (dr_a
, store_dr
, vNULL
);
609 dependent
= vect_slp_analyze_data_ref_dependence (ddr
);
610 free_dependence_relation (ddr
);
617 ddr_p ddr
= initialize_data_dependence_relation (dr_a
,
619 dependent
= vect_slp_analyze_data_ref_dependence (ddr
);
620 free_dependence_relation (ddr
);
630 /* Function vect_analyze_data_ref_dependences.
632 Examine all the data references in the basic-block, and make sure there
633 do not exist any data dependences between them. Set *MAX_VF according to
634 the maximum vectorization factor the data dependences allow. */
637 vect_slp_analyze_instance_dependence (slp_instance instance
)
639 if (dump_enabled_p ())
640 dump_printf_loc (MSG_NOTE
, vect_location
,
641 "=== vect_slp_analyze_instance_dependence ===\n");
643 /* The stores of this instance are at the root of the SLP tree. */
644 slp_tree store
= SLP_INSTANCE_TREE (instance
);
645 if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store
)[0])))
648 /* Verify we can sink stores to the vectorized stmt insert location. */
649 gimple
*last_store
= NULL
;
652 if (! vect_slp_analyze_node_dependences (instance
, store
, vNULL
, NULL
))
655 /* Mark stores in this instance and remember the last one. */
656 last_store
= vect_find_last_scalar_stmt_in_slp (store
);
657 for (unsigned k
= 0; k
< SLP_INSTANCE_GROUP_SIZE (instance
); ++k
)
658 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store
)[k
], true);
663 /* Verify we can sink loads to the vectorized stmt insert location,
664 special-casing stores of this instance. */
667 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance
), i
, load
)
668 if (! vect_slp_analyze_node_dependences (instance
, load
,
670 ? SLP_TREE_SCALAR_STMTS (store
)
671 : vNULL
, last_store
))
677 /* Unset the visited flag. */
679 for (unsigned k
= 0; k
< SLP_INSTANCE_GROUP_SIZE (instance
); ++k
)
680 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store
)[k
], false);
685 /* Function vect_compute_data_ref_alignment
687 Compute the misalignment of the data reference DR.
690 1. If during the misalignment computation it is found that the data reference
691 cannot be vectorized then false is returned.
692 2. DR_MISALIGNMENT (DR) is defined.
694 FOR NOW: No analysis is actually performed. Misalignment is calculated
695 only for trivial cases. TODO. */
698 vect_compute_data_ref_alignment (struct data_reference
*dr
)
700 gimple
*stmt
= DR_STMT (dr
);
701 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
702 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
703 struct loop
*loop
= NULL
;
704 tree ref
= DR_REF (dr
);
706 tree base
, base_addr
;
707 tree misalign
= NULL_TREE
;
710 unsigned HOST_WIDE_INT alignment
;
712 if (dump_enabled_p ())
713 dump_printf_loc (MSG_NOTE
, vect_location
,
714 "vect_compute_data_ref_alignment:\n");
717 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
719 /* Initialize misalignment to unknown. */
720 SET_DR_MISALIGNMENT (dr
, -1);
722 if (tree_fits_shwi_p (DR_STEP (dr
)))
723 misalign
= DR_INIT (dr
);
724 aligned_to
= DR_ALIGNED_TO (dr
);
725 base_addr
= DR_BASE_ADDRESS (dr
);
726 vectype
= STMT_VINFO_VECTYPE (stmt_info
);
728 /* In case the dataref is in an inner-loop of the loop that is being
729 vectorized (LOOP), we use the base and misalignment information
730 relative to the outer-loop (LOOP). This is ok only if the misalignment
731 stays the same throughout the execution of the inner-loop, which is why
732 we have to check that the stride of the dataref in the inner-loop evenly
733 divides by the vector size. */
734 if (loop
&& nested_in_vect_loop_p (loop
, stmt
))
736 tree step
= DR_STEP (dr
);
738 if (tree_fits_shwi_p (step
)
739 && tree_to_shwi (step
) % GET_MODE_SIZE (TYPE_MODE (vectype
)) == 0)
741 if (dump_enabled_p ())
742 dump_printf_loc (MSG_NOTE
, vect_location
,
743 "inner step divides the vector-size.\n");
744 misalign
= STMT_VINFO_DR_INIT (stmt_info
);
745 aligned_to
= STMT_VINFO_DR_ALIGNED_TO (stmt_info
);
746 base_addr
= STMT_VINFO_DR_BASE_ADDRESS (stmt_info
);
750 if (dump_enabled_p ())
751 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
752 "inner step doesn't divide the vector-size.\n");
753 misalign
= NULL_TREE
;
757 /* Similarly we can only use base and misalignment information relative to
758 an innermost loop if the misalignment stays the same throughout the
759 execution of the loop. As above, this is the case if the stride of
760 the dataref evenly divides by the vector size. */
763 tree step
= DR_STEP (dr
);
764 unsigned vf
= loop
? LOOP_VINFO_VECT_FACTOR (loop_vinfo
) : 1;
766 if (tree_fits_shwi_p (step
)
767 && ((tree_to_shwi (step
) * vf
)
768 % GET_MODE_SIZE (TYPE_MODE (vectype
)) != 0))
770 if (dump_enabled_p ())
771 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
772 "step doesn't divide the vector-size.\n");
773 misalign
= NULL_TREE
;
777 /* To look at alignment of the base we have to preserve an inner MEM_REF
778 as that carries alignment information of the actual access. */
780 while (handled_component_p (base
))
781 base
= TREE_OPERAND (base
, 0);
782 unsigned int base_alignment
;
783 unsigned HOST_WIDE_INT base_bitpos
;
784 get_object_alignment_1 (base
, &base_alignment
, &base_bitpos
);
785 /* As data-ref analysis strips the MEM_REF down to its base operand
786 to form DR_BASE_ADDRESS and adds the offset to DR_INIT we have to
787 adjust things to make base_alignment valid as the alignment of
789 if (TREE_CODE (base
) == MEM_REF
)
791 base_bitpos
-= mem_ref_offset (base
).to_short_addr () * BITS_PER_UNIT
;
792 base_bitpos
&= (base_alignment
- 1);
794 if (base_bitpos
!= 0)
795 base_alignment
= base_bitpos
& -base_bitpos
;
796 /* Also look at the alignment of the base address DR analysis
798 unsigned int base_addr_alignment
= get_pointer_alignment (base_addr
);
799 if (base_addr_alignment
> base_alignment
)
800 base_alignment
= base_addr_alignment
;
802 if (base_alignment
>= TYPE_ALIGN (TREE_TYPE (vectype
)))
803 DR_VECT_AUX (dr
)->base_element_aligned
= true;
805 alignment
= TYPE_ALIGN_UNIT (vectype
);
807 if ((compare_tree_int (aligned_to
, alignment
) < 0)
810 if (dump_enabled_p ())
812 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
813 "Unknown alignment for access: ");
814 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, ref
);
815 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
820 if (base_alignment
< TYPE_ALIGN (vectype
))
823 if (TREE_CODE (base
) == ADDR_EXPR
)
824 base
= TREE_OPERAND (base
, 0);
825 if (!vect_can_force_dr_alignment_p (base
, TYPE_ALIGN (vectype
)))
827 if (dump_enabled_p ())
829 dump_printf_loc (MSG_NOTE
, vect_location
,
830 "can't force alignment of ref: ");
831 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, ref
);
832 dump_printf (MSG_NOTE
, "\n");
837 if (DECL_USER_ALIGN (base
))
839 if (dump_enabled_p ())
841 dump_printf_loc (MSG_NOTE
, vect_location
,
842 "not forcing alignment of user-aligned "
844 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, base
);
845 dump_printf (MSG_NOTE
, "\n");
850 /* Force the alignment of the decl.
851 NOTE: This is the only change to the code we make during
852 the analysis phase, before deciding to vectorize the loop. */
853 if (dump_enabled_p ())
855 dump_printf_loc (MSG_NOTE
, vect_location
, "force alignment of ");
856 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, ref
);
857 dump_printf (MSG_NOTE
, "\n");
860 DR_VECT_AUX (dr
)->base_decl
= base
;
861 DR_VECT_AUX (dr
)->base_misaligned
= true;
862 DR_VECT_AUX (dr
)->base_element_aligned
= true;
865 if (loop
&& nested_in_vect_loop_p (loop
, stmt
))
866 step
= STMT_VINFO_DR_STEP (stmt_info
);
869 /* If this is a backward running DR then first access in the larger
870 vectype actually is N-1 elements before the address in the DR.
871 Adjust misalign accordingly. */
872 if (tree_int_cst_sgn (step
) < 0)
874 tree offset
= ssize_int (TYPE_VECTOR_SUBPARTS (vectype
) - 1);
875 /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
876 otherwise we wouldn't be here. */
877 offset
= fold_build2 (MULT_EXPR
, ssizetype
, offset
, step
);
878 /* PLUS because STEP was negative. */
879 misalign
= size_binop (PLUS_EXPR
, misalign
, offset
);
882 SET_DR_MISALIGNMENT (dr
,
883 wi::mod_floor (misalign
, alignment
, SIGNED
).to_uhwi ());
885 if (dump_enabled_p ())
887 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
888 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr
));
889 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, ref
);
890 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
897 /* Function vect_update_misalignment_for_peel
899 DR - the data reference whose misalignment is to be adjusted.
900 DR_PEEL - the data reference whose misalignment is being made
901 zero in the vector loop by the peel.
902 NPEEL - the number of iterations in the peel loop if the misalignment
903 of DR_PEEL is known at compile time. */
906 vect_update_misalignment_for_peel (struct data_reference
*dr
,
907 struct data_reference
*dr_peel
, int npeel
)
910 vec
<dr_p
> same_align_drs
;
911 struct data_reference
*current_dr
;
912 int dr_size
= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr
))));
913 int dr_peel_size
= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel
))));
914 stmt_vec_info stmt_info
= vinfo_for_stmt (DR_STMT (dr
));
915 stmt_vec_info peel_stmt_info
= vinfo_for_stmt (DR_STMT (dr_peel
));
917 /* For interleaved data accesses the step in the loop must be multiplied by
918 the size of the interleaving group. */
919 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
920 dr_size
*= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info
)));
921 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info
))
922 dr_peel_size
*= GROUP_SIZE (peel_stmt_info
);
924 /* It can be assumed that the data refs with the same alignment as dr_peel
925 are aligned in the vector loop. */
927 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel
)));
928 FOR_EACH_VEC_ELT (same_align_drs
, i
, current_dr
)
930 if (current_dr
!= dr
)
932 gcc_assert (DR_MISALIGNMENT (dr
) / dr_size
==
933 DR_MISALIGNMENT (dr_peel
) / dr_peel_size
);
934 SET_DR_MISALIGNMENT (dr
, 0);
938 if (known_alignment_for_access_p (dr
)
939 && known_alignment_for_access_p (dr_peel
))
941 bool negative
= tree_int_cst_compare (DR_STEP (dr
), size_zero_node
) < 0;
942 int misal
= DR_MISALIGNMENT (dr
);
943 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
944 misal
+= negative
? -npeel
* dr_size
: npeel
* dr_size
;
945 misal
&= (TYPE_ALIGN (vectype
) / BITS_PER_UNIT
) - 1;
946 SET_DR_MISALIGNMENT (dr
, misal
);
950 if (dump_enabled_p ())
951 dump_printf_loc (MSG_NOTE
, vect_location
, "Setting misalignment to -1.\n");
952 SET_DR_MISALIGNMENT (dr
, -1);
956 /* Function verify_data_ref_alignment
958 Return TRUE if DR can be handled with respect to alignment. */
961 verify_data_ref_alignment (data_reference_p dr
)
963 enum dr_alignment_support supportable_dr_alignment
964 = vect_supportable_dr_alignment (dr
, false);
965 if (!supportable_dr_alignment
)
967 if (dump_enabled_p ())
970 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
971 "not vectorized: unsupported unaligned load.");
973 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
974 "not vectorized: unsupported unaligned "
977 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
979 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
984 if (supportable_dr_alignment
!= dr_aligned
&& dump_enabled_p ())
985 dump_printf_loc (MSG_NOTE
, vect_location
,
986 "Vectorizing an unaligned access.\n");
991 /* Function vect_verify_datarefs_alignment
993 Return TRUE if all data references in the loop can be
994 handled with respect to alignment. */
997 vect_verify_datarefs_alignment (loop_vec_info vinfo
)
999 vec
<data_reference_p
> datarefs
= vinfo
->datarefs
;
1000 struct data_reference
*dr
;
1003 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
1005 gimple
*stmt
= DR_STMT (dr
);
1006 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
1008 if (!STMT_VINFO_RELEVANT_P (stmt_info
))
1011 /* For interleaving, only the alignment of the first access matters. */
1012 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1013 && GROUP_FIRST_ELEMENT (stmt_info
) != stmt
)
1016 /* Strided accesses perform only component accesses, alignment is
1017 irrelevant for them. */
1018 if (STMT_VINFO_STRIDED_P (stmt_info
)
1019 && !STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1022 if (! verify_data_ref_alignment (dr
))
1029 /* Given an memory reference EXP return whether its alignment is less
1033 not_size_aligned (tree exp
)
1035 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp
))))
1038 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp
)))
1039 > get_object_alignment (exp
));
1042 /* Function vector_alignment_reachable_p
1044 Return true if vector alignment for DR is reachable by peeling
1045 a few loop iterations. Return false otherwise. */
1048 vector_alignment_reachable_p (struct data_reference
*dr
)
1050 gimple
*stmt
= DR_STMT (dr
);
1051 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
1052 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
1054 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1056 /* For interleaved access we peel only if number of iterations in
1057 the prolog loop ({VF - misalignment}), is a multiple of the
1058 number of the interleaved accesses. */
1059 int elem_size
, mis_in_elements
;
1060 int nelements
= TYPE_VECTOR_SUBPARTS (vectype
);
1062 /* FORNOW: handle only known alignment. */
1063 if (!known_alignment_for_access_p (dr
))
1066 elem_size
= GET_MODE_SIZE (TYPE_MODE (vectype
)) / nelements
;
1067 mis_in_elements
= DR_MISALIGNMENT (dr
) / elem_size
;
1069 if ((nelements
- mis_in_elements
) % GROUP_SIZE (stmt_info
))
1073 /* If misalignment is known at the compile time then allow peeling
1074 only if natural alignment is reachable through peeling. */
1075 if (known_alignment_for_access_p (dr
) && !aligned_access_p (dr
))
1077 HOST_WIDE_INT elmsize
=
1078 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype
)));
1079 if (dump_enabled_p ())
1081 dump_printf_loc (MSG_NOTE
, vect_location
,
1082 "data size =" HOST_WIDE_INT_PRINT_DEC
, elmsize
);
1083 dump_printf (MSG_NOTE
,
1084 ". misalignment = %d.\n", DR_MISALIGNMENT (dr
));
1086 if (DR_MISALIGNMENT (dr
) % elmsize
)
1088 if (dump_enabled_p ())
1089 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1090 "data size does not divide the misalignment.\n");
1095 if (!known_alignment_for_access_p (dr
))
1097 tree type
= TREE_TYPE (DR_REF (dr
));
1098 bool is_packed
= not_size_aligned (DR_REF (dr
));
1099 if (dump_enabled_p ())
1100 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1101 "Unknown misalignment, %snaturally aligned\n",
1102 is_packed
? "not " : "");
1103 return targetm
.vectorize
.vector_alignment_reachable (type
, is_packed
);
1110 /* Calculate the cost of the memory access represented by DR. */
1113 vect_get_data_access_cost (struct data_reference
*dr
,
1114 unsigned int *inside_cost
,
1115 unsigned int *outside_cost
,
1116 stmt_vector_for_cost
*body_cost_vec
)
1118 gimple
*stmt
= DR_STMT (dr
);
1119 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
1120 int nunits
= TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info
));
1121 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
1122 int vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
1123 int ncopies
= vf
/ nunits
;
1125 if (DR_IS_READ (dr
))
1126 vect_get_load_cost (dr
, ncopies
, true, inside_cost
, outside_cost
,
1127 NULL
, body_cost_vec
, false);
1129 vect_get_store_cost (dr
, ncopies
, inside_cost
, body_cost_vec
);
1131 if (dump_enabled_p ())
1132 dump_printf_loc (MSG_NOTE
, vect_location
,
1133 "vect_get_data_access_cost: inside_cost = %d, "
1134 "outside_cost = %d.\n", *inside_cost
, *outside_cost
);
1138 typedef struct _vect_peel_info
1140 struct data_reference
*dr
;
1145 typedef struct _vect_peel_extended_info
1147 struct _vect_peel_info peel_info
;
1148 unsigned int inside_cost
;
1149 unsigned int outside_cost
;
1150 stmt_vector_for_cost body_cost_vec
;
1151 } *vect_peel_extended_info
;
1154 /* Peeling hashtable helpers. */
1156 struct peel_info_hasher
: free_ptr_hash
<_vect_peel_info
>
1158 static inline hashval_t
hash (const _vect_peel_info
*);
1159 static inline bool equal (const _vect_peel_info
*, const _vect_peel_info
*);
1163 peel_info_hasher::hash (const _vect_peel_info
*peel_info
)
1165 return (hashval_t
) peel_info
->npeel
;
1169 peel_info_hasher::equal (const _vect_peel_info
*a
, const _vect_peel_info
*b
)
1171 return (a
->npeel
== b
->npeel
);
1175 /* Insert DR into peeling hash table with NPEEL as key. */
1178 vect_peeling_hash_insert (hash_table
<peel_info_hasher
> *peeling_htab
,
1179 loop_vec_info loop_vinfo
, struct data_reference
*dr
,
1182 struct _vect_peel_info elem
, *slot
;
1183 _vect_peel_info
**new_slot
;
1184 bool supportable_dr_alignment
= vect_supportable_dr_alignment (dr
, true);
1187 slot
= peeling_htab
->find (&elem
);
1192 slot
= XNEW (struct _vect_peel_info
);
1193 slot
->npeel
= npeel
;
1196 new_slot
= peeling_htab
->find_slot (slot
, INSERT
);
1200 if (!supportable_dr_alignment
1201 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo
)))
1202 slot
->count
+= VECT_MAX_COST
;
1206 /* Traverse peeling hash table to find peeling option that aligns maximum
1207 number of data accesses. */
1210 vect_peeling_hash_get_most_frequent (_vect_peel_info
**slot
,
1211 _vect_peel_extended_info
*max
)
1213 vect_peel_info elem
= *slot
;
1215 if (elem
->count
> max
->peel_info
.count
1216 || (elem
->count
== max
->peel_info
.count
1217 && max
->peel_info
.npeel
> elem
->npeel
))
1219 max
->peel_info
.npeel
= elem
->npeel
;
1220 max
->peel_info
.count
= elem
->count
;
1221 max
->peel_info
.dr
= elem
->dr
;
1228 /* Traverse peeling hash table and calculate cost for each peeling option.
1229 Find the one with the lowest cost. */
1232 vect_peeling_hash_get_lowest_cost (_vect_peel_info
**slot
,
1233 _vect_peel_extended_info
*min
)
1235 vect_peel_info elem
= *slot
;
1236 int save_misalignment
, dummy
;
1237 unsigned int inside_cost
= 0, outside_cost
= 0, i
;
1238 gimple
*stmt
= DR_STMT (elem
->dr
);
1239 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
1240 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
1241 vec
<data_reference_p
> datarefs
= LOOP_VINFO_DATAREFS (loop_vinfo
);
1242 struct data_reference
*dr
;
1243 stmt_vector_for_cost prologue_cost_vec
, body_cost_vec
, epilogue_cost_vec
;
1245 prologue_cost_vec
.create (2);
1246 body_cost_vec
.create (2);
1247 epilogue_cost_vec
.create (2);
1249 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
1251 stmt
= DR_STMT (dr
);
1252 stmt_info
= vinfo_for_stmt (stmt
);
1253 /* For interleaving, only the alignment of the first access
1255 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1256 && GROUP_FIRST_ELEMENT (stmt_info
) != stmt
)
1259 /* Strided accesses perform only component accesses, alignment is
1260 irrelevant for them. */
1261 if (STMT_VINFO_STRIDED_P (stmt_info
)
1262 && !STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1265 save_misalignment
= DR_MISALIGNMENT (dr
);
1266 vect_update_misalignment_for_peel (dr
, elem
->dr
, elem
->npeel
);
1267 vect_get_data_access_cost (dr
, &inside_cost
, &outside_cost
,
1269 SET_DR_MISALIGNMENT (dr
, save_misalignment
);
1272 outside_cost
+= vect_get_known_peeling_cost
1273 (loop_vinfo
, elem
->npeel
, &dummy
,
1274 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo
),
1275 &prologue_cost_vec
, &epilogue_cost_vec
);
1277 /* Prologue and epilogue costs are added to the target model later.
1278 These costs depend only on the scalar iteration cost, the
1279 number of peeling iterations finally chosen, and the number of
1280 misaligned statements. So discard the information found here. */
1281 prologue_cost_vec
.release ();
1282 epilogue_cost_vec
.release ();
1284 if (inside_cost
< min
->inside_cost
1285 || (inside_cost
== min
->inside_cost
&& outside_cost
< min
->outside_cost
))
1287 min
->inside_cost
= inside_cost
;
1288 min
->outside_cost
= outside_cost
;
1289 min
->body_cost_vec
.release ();
1290 min
->body_cost_vec
= body_cost_vec
;
1291 min
->peel_info
.dr
= elem
->dr
;
1292 min
->peel_info
.npeel
= elem
->npeel
;
1295 body_cost_vec
.release ();
1301 /* Choose best peeling option by traversing peeling hash table and either
1302 choosing an option with the lowest cost (if cost model is enabled) or the
1303 option that aligns as many accesses as possible. */
1305 static struct data_reference
*
1306 vect_peeling_hash_choose_best_peeling (hash_table
<peel_info_hasher
> *peeling_htab
,
1307 loop_vec_info loop_vinfo
,
1308 unsigned int *npeel
,
1309 stmt_vector_for_cost
*body_cost_vec
)
1311 struct _vect_peel_extended_info res
;
1313 res
.peel_info
.dr
= NULL
;
1314 res
.body_cost_vec
= stmt_vector_for_cost ();
1316 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo
)))
1318 res
.inside_cost
= INT_MAX
;
1319 res
.outside_cost
= INT_MAX
;
1320 peeling_htab
->traverse
<_vect_peel_extended_info
*,
1321 vect_peeling_hash_get_lowest_cost
> (&res
);
1325 res
.peel_info
.count
= 0;
1326 peeling_htab
->traverse
<_vect_peel_extended_info
*,
1327 vect_peeling_hash_get_most_frequent
> (&res
);
1330 *npeel
= res
.peel_info
.npeel
;
1331 *body_cost_vec
= res
.body_cost_vec
;
1332 return res
.peel_info
.dr
;
1336 /* Function vect_enhance_data_refs_alignment
1338 This pass will use loop versioning and loop peeling in order to enhance
1339 the alignment of data references in the loop.
1341 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1342 original loop is to be vectorized. Any other loops that are created by
1343 the transformations performed in this pass - are not supposed to be
1344 vectorized. This restriction will be relaxed.
1346 This pass will require a cost model to guide it whether to apply peeling
1347 or versioning or a combination of the two. For example, the scheme that
1348 intel uses when given a loop with several memory accesses, is as follows:
1349 choose one memory access ('p') which alignment you want to force by doing
1350 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1351 other accesses are not necessarily aligned, or (2) use loop versioning to
1352 generate one loop in which all accesses are aligned, and another loop in
1353 which only 'p' is necessarily aligned.
1355 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1356 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
1357 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1359 Devising a cost model is the most critical aspect of this work. It will
1360 guide us on which access to peel for, whether to use loop versioning, how
1361 many versions to create, etc. The cost model will probably consist of
1362 generic considerations as well as target specific considerations (on
1363 powerpc for example, misaligned stores are more painful than misaligned
1366 Here are the general steps involved in alignment enhancements:
1368 -- original loop, before alignment analysis:
1369 for (i=0; i<N; i++){
1370 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1371 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1374 -- After vect_compute_data_refs_alignment:
1375 for (i=0; i<N; i++){
1376 x = q[i]; # DR_MISALIGNMENT(q) = 3
1377 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1380 -- Possibility 1: we do loop versioning:
1382 for (i=0; i<N; i++){ # loop 1A
1383 x = q[i]; # DR_MISALIGNMENT(q) = 3
1384 p[i] = y; # DR_MISALIGNMENT(p) = 0
1388 for (i=0; i<N; i++){ # loop 1B
1389 x = q[i]; # DR_MISALIGNMENT(q) = 3
1390 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1394 -- Possibility 2: we do loop peeling:
1395 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1399 for (i = 3; i < N; i++){ # loop 2A
1400 x = q[i]; # DR_MISALIGNMENT(q) = 0
1401 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1404 -- Possibility 3: combination of loop peeling and versioning:
1405 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1410 for (i = 3; i<N; i++){ # loop 3A
1411 x = q[i]; # DR_MISALIGNMENT(q) = 0
1412 p[i] = y; # DR_MISALIGNMENT(p) = 0
1416 for (i = 3; i<N; i++){ # loop 3B
1417 x = q[i]; # DR_MISALIGNMENT(q) = 0
1418 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1422 These loops are later passed to loop_transform to be vectorized. The
1423 vectorizer will use the alignment information to guide the transformation
1424 (whether to generate regular loads/stores, or with special handling for
1428 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo
)
1430 vec
<data_reference_p
> datarefs
= LOOP_VINFO_DATAREFS (loop_vinfo
);
1431 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1432 enum dr_alignment_support supportable_dr_alignment
;
1433 struct data_reference
*dr0
= NULL
, *first_store
= NULL
;
1434 struct data_reference
*dr
;
1436 bool do_peeling
= false;
1437 bool do_versioning
= false;
1440 stmt_vec_info stmt_info
;
1441 unsigned int npeel
= 0;
1442 bool all_misalignments_unknown
= true;
1443 unsigned int vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
1444 unsigned possible_npeel_number
= 1;
1446 unsigned int nelements
, mis
, same_align_drs_max
= 0;
1447 stmt_vector_for_cost body_cost_vec
= stmt_vector_for_cost ();
1448 hash_table
<peel_info_hasher
> peeling_htab (1);
1450 if (dump_enabled_p ())
1451 dump_printf_loc (MSG_NOTE
, vect_location
,
1452 "=== vect_enhance_data_refs_alignment ===\n");
1454 /* Reset data so we can safely be called multiple times. */
1455 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).truncate (0);
1456 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
) = 0;
1458 /* While cost model enhancements are expected in the future, the high level
1459 view of the code at this time is as follows:
1461 A) If there is a misaligned access then see if peeling to align
1462 this access can make all data references satisfy
1463 vect_supportable_dr_alignment. If so, update data structures
1464 as needed and return true.
1466 B) If peeling wasn't possible and there is a data reference with an
1467 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1468 then see if loop versioning checks can be used to make all data
1469 references satisfy vect_supportable_dr_alignment. If so, update
1470 data structures as needed and return true.
1472 C) If neither peeling nor versioning were successful then return false if
1473 any data reference does not satisfy vect_supportable_dr_alignment.
1475 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1477 Note, Possibility 3 above (which is peeling and versioning together) is not
1478 being done at this time. */
1480 /* (1) Peeling to force alignment. */
1482 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1484 + How many accesses will become aligned due to the peeling
1485 - How many accesses will become unaligned due to the peeling,
1486 and the cost of misaligned accesses.
1487 - The cost of peeling (the extra runtime checks, the increase
1490 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
1492 stmt
= DR_STMT (dr
);
1493 stmt_info
= vinfo_for_stmt (stmt
);
1495 if (!STMT_VINFO_RELEVANT_P (stmt_info
))
1498 /* For interleaving, only the alignment of the first access
1500 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1501 && GROUP_FIRST_ELEMENT (stmt_info
) != stmt
)
1504 /* For invariant accesses there is nothing to enhance. */
1505 if (integer_zerop (DR_STEP (dr
)))
1508 /* Strided accesses perform only component accesses, alignment is
1509 irrelevant for them. */
1510 if (STMT_VINFO_STRIDED_P (stmt_info
)
1511 && !STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1514 supportable_dr_alignment
= vect_supportable_dr_alignment (dr
, true);
1515 do_peeling
= vector_alignment_reachable_p (dr
);
1518 if (known_alignment_for_access_p (dr
))
1520 unsigned int npeel_tmp
;
1521 bool negative
= tree_int_cst_compare (DR_STEP (dr
),
1522 size_zero_node
) < 0;
1524 /* Save info about DR in the hash table. */
1525 vectype
= STMT_VINFO_VECTYPE (stmt_info
);
1526 nelements
= TYPE_VECTOR_SUBPARTS (vectype
);
1527 mis
= DR_MISALIGNMENT (dr
) / GET_MODE_SIZE (TYPE_MODE (
1528 TREE_TYPE (DR_REF (dr
))));
1529 npeel_tmp
= (negative
1530 ? (mis
- nelements
) : (nelements
- mis
))
1533 /* For multiple types, it is possible that the bigger type access
1534 will have more than one peeling option. E.g., a loop with two
1535 types: one of size (vector size / 4), and the other one of
1536 size (vector size / 8). Vectorization factor will 8. If both
1537 access are misaligned by 3, the first one needs one scalar
1538 iteration to be aligned, and the second one needs 5. But the
1539 first one will be aligned also by peeling 5 scalar
1540 iterations, and in that case both accesses will be aligned.
1541 Hence, except for the immediate peeling amount, we also want
1542 to try to add full vector size, while we don't exceed
1543 vectorization factor.
1544 We do this automtically for cost model, since we calculate cost
1545 for every peeling option. */
1546 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo
)))
1548 if (STMT_SLP_TYPE (stmt_info
))
1549 possible_npeel_number
1550 = (vf
* GROUP_SIZE (stmt_info
)) / nelements
;
1552 possible_npeel_number
= vf
/ nelements
;
1555 /* Handle the aligned case. We may decide to align some other
1556 access, making DR unaligned. */
1557 if (DR_MISALIGNMENT (dr
) == 0)
1560 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo
)))
1561 possible_npeel_number
++;
1564 for (j
= 0; j
< possible_npeel_number
; j
++)
1566 vect_peeling_hash_insert (&peeling_htab
, loop_vinfo
,
1568 npeel_tmp
+= nelements
;
1571 all_misalignments_unknown
= false;
1572 /* Data-ref that was chosen for the case that all the
1573 misalignments are unknown is not relevant anymore, since we
1574 have a data-ref with known alignment. */
1579 /* If we don't know any misalignment values, we prefer
1580 peeling for data-ref that has the maximum number of data-refs
1581 with the same alignment, unless the target prefers to align
1582 stores over load. */
1583 if (all_misalignments_unknown
)
1585 unsigned same_align_drs
1586 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info
).length ();
1588 || same_align_drs_max
< same_align_drs
)
1590 same_align_drs_max
= same_align_drs
;
1593 /* For data-refs with the same number of related
1594 accesses prefer the one where the misalign
1595 computation will be invariant in the outermost loop. */
1596 else if (same_align_drs_max
== same_align_drs
)
1598 struct loop
*ivloop0
, *ivloop
;
1599 ivloop0
= outermost_invariant_loop_for_expr
1600 (loop
, DR_BASE_ADDRESS (dr0
));
1601 ivloop
= outermost_invariant_loop_for_expr
1602 (loop
, DR_BASE_ADDRESS (dr
));
1603 if ((ivloop
&& !ivloop0
)
1604 || (ivloop
&& ivloop0
1605 && flow_loop_nested_p (ivloop
, ivloop0
)))
1609 if (!first_store
&& DR_IS_WRITE (dr
))
1613 /* If there are both known and unknown misaligned accesses in the
1614 loop, we choose peeling amount according to the known
1616 if (!supportable_dr_alignment
)
1619 if (!first_store
&& DR_IS_WRITE (dr
))
1626 if (!aligned_access_p (dr
))
1628 if (dump_enabled_p ())
1629 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1630 "vector alignment may not be reachable\n");
1636 /* Check if we can possibly peel the loop. */
1637 if (!vect_can_advance_ivs_p (loop_vinfo
)
1638 || !slpeel_can_duplicate_loop_p (loop
, single_exit (loop
))
1643 && all_misalignments_unknown
1644 && vect_supportable_dr_alignment (dr0
, false))
1646 /* Check if the target requires to prefer stores over loads, i.e., if
1647 misaligned stores are more expensive than misaligned loads (taking
1648 drs with same alignment into account). */
1649 if (first_store
&& DR_IS_READ (dr0
))
1651 unsigned int load_inside_cost
= 0, load_outside_cost
= 0;
1652 unsigned int store_inside_cost
= 0, store_outside_cost
= 0;
1653 unsigned int load_inside_penalty
= 0, load_outside_penalty
= 0;
1654 unsigned int store_inside_penalty
= 0, store_outside_penalty
= 0;
1655 stmt_vector_for_cost dummy
;
1658 vect_get_data_access_cost (dr0
, &load_inside_cost
, &load_outside_cost
,
1660 vect_get_data_access_cost (first_store
, &store_inside_cost
,
1661 &store_outside_cost
, &dummy
);
1665 /* Calculate the penalty for leaving FIRST_STORE unaligned (by
1666 aligning the load DR0). */
1667 load_inside_penalty
= store_inside_cost
;
1668 load_outside_penalty
= store_outside_cost
;
1670 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1671 DR_STMT (first_store
))).iterate (i
, &dr
);
1673 if (DR_IS_READ (dr
))
1675 load_inside_penalty
+= load_inside_cost
;
1676 load_outside_penalty
+= load_outside_cost
;
1680 load_inside_penalty
+= store_inside_cost
;
1681 load_outside_penalty
+= store_outside_cost
;
1684 /* Calculate the penalty for leaving DR0 unaligned (by
1685 aligning the FIRST_STORE). */
1686 store_inside_penalty
= load_inside_cost
;
1687 store_outside_penalty
= load_outside_cost
;
1689 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1690 DR_STMT (dr0
))).iterate (i
, &dr
);
1692 if (DR_IS_READ (dr
))
1694 store_inside_penalty
+= load_inside_cost
;
1695 store_outside_penalty
+= load_outside_cost
;
1699 store_inside_penalty
+= store_inside_cost
;
1700 store_outside_penalty
+= store_outside_cost
;
1703 if (load_inside_penalty
> store_inside_penalty
1704 || (load_inside_penalty
== store_inside_penalty
1705 && load_outside_penalty
> store_outside_penalty
))
1709 /* In case there are only loads with different unknown misalignments, use
1710 peeling only if it may help to align other accesses in the loop or
1711 if it may help improving load bandwith when we'd end up using
1713 tree dr0_vt
= STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr0
)));
1715 && !STMT_VINFO_SAME_ALIGN_REFS (
1716 vinfo_for_stmt (DR_STMT (dr0
))).length ()
1717 && (vect_supportable_dr_alignment (dr0
, false)
1718 != dr_unaligned_supported
1719 || (builtin_vectorization_cost (vector_load
, dr0_vt
, 0)
1720 == builtin_vectorization_cost (unaligned_load
, dr0_vt
, -1))))
1724 if (do_peeling
&& !dr0
)
1726 /* Peeling is possible, but there is no data access that is not supported
1727 unless aligned. So we try to choose the best possible peeling. */
1729 /* We should get here only if there are drs with known misalignment. */
1730 gcc_assert (!all_misalignments_unknown
);
1732 /* Choose the best peeling from the hash table. */
1733 dr0
= vect_peeling_hash_choose_best_peeling (&peeling_htab
,
1742 stmt
= DR_STMT (dr0
);
1743 stmt_info
= vinfo_for_stmt (stmt
);
1744 vectype
= STMT_VINFO_VECTYPE (stmt_info
);
1745 nelements
= TYPE_VECTOR_SUBPARTS (vectype
);
1747 if (known_alignment_for_access_p (dr0
))
1749 bool negative
= tree_int_cst_compare (DR_STEP (dr0
),
1750 size_zero_node
) < 0;
1753 /* Since it's known at compile time, compute the number of
1754 iterations in the peeled loop (the peeling factor) for use in
1755 updating DR_MISALIGNMENT values. The peeling factor is the
1756 vectorization factor minus the misalignment as an element
1758 mis
= DR_MISALIGNMENT (dr0
);
1759 mis
/= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0
))));
1760 npeel
= ((negative
? mis
- nelements
: nelements
- mis
)
1764 /* For interleaved data access every iteration accesses all the
1765 members of the group, therefore we divide the number of iterations
1766 by the group size. */
1767 stmt_info
= vinfo_for_stmt (DR_STMT (dr0
));
1768 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1769 npeel
/= GROUP_SIZE (stmt_info
);
1771 if (dump_enabled_p ())
1772 dump_printf_loc (MSG_NOTE
, vect_location
,
1773 "Try peeling by %d\n", npeel
);
1776 /* Ensure that all data refs can be vectorized after the peel. */
1777 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
1779 int save_misalignment
;
1784 stmt
= DR_STMT (dr
);
1785 stmt_info
= vinfo_for_stmt (stmt
);
1786 /* For interleaving, only the alignment of the first access
1788 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1789 && GROUP_FIRST_ELEMENT (stmt_info
) != stmt
)
1792 /* Strided accesses perform only component accesses, alignment is
1793 irrelevant for them. */
1794 if (STMT_VINFO_STRIDED_P (stmt_info
)
1795 && !STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1798 save_misalignment
= DR_MISALIGNMENT (dr
);
1799 vect_update_misalignment_for_peel (dr
, dr0
, npeel
);
1800 supportable_dr_alignment
= vect_supportable_dr_alignment (dr
, false);
1801 SET_DR_MISALIGNMENT (dr
, save_misalignment
);
1803 if (!supportable_dr_alignment
)
1810 if (do_peeling
&& known_alignment_for_access_p (dr0
) && npeel
== 0)
1812 stat
= vect_verify_datarefs_alignment (loop_vinfo
);
1817 body_cost_vec
.release ();
1822 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
1825 unsigned max_allowed_peel
1826 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT
);
1827 if (max_allowed_peel
!= (unsigned)-1)
1829 unsigned max_peel
= npeel
;
1832 gimple
*dr_stmt
= DR_STMT (dr0
);
1833 stmt_vec_info vinfo
= vinfo_for_stmt (dr_stmt
);
1834 tree vtype
= STMT_VINFO_VECTYPE (vinfo
);
1835 max_peel
= TYPE_VECTOR_SUBPARTS (vtype
) - 1;
1837 if (max_peel
> max_allowed_peel
)
1840 if (dump_enabled_p ())
1841 dump_printf_loc (MSG_NOTE
, vect_location
,
1842 "Disable peeling, max peels reached: %d\n", max_peel
);
1847 /* Cost model #2 - if peeling may result in a remaining loop not
1848 iterating enough to be vectorized then do not peel. */
1850 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
))
1853 = npeel
== 0 ? LOOP_VINFO_VECT_FACTOR (loop_vinfo
) - 1 : npeel
;
1854 if (LOOP_VINFO_INT_NITERS (loop_vinfo
)
1855 < LOOP_VINFO_VECT_FACTOR (loop_vinfo
) + max_peel
)
1861 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1862 If the misalignment of DR_i is identical to that of dr0 then set
1863 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
1864 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1865 by the peeling factor times the element size of DR_i (MOD the
1866 vectorization factor times the size). Otherwise, the
1867 misalignment of DR_i must be set to unknown. */
1868 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
1871 /* Strided accesses perform only component accesses, alignment
1872 is irrelevant for them. */
1873 stmt_info
= vinfo_for_stmt (DR_STMT (dr
));
1874 if (STMT_VINFO_STRIDED_P (stmt_info
)
1875 && !STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1878 vect_update_misalignment_for_peel (dr
, dr0
, npeel
);
1881 LOOP_VINFO_UNALIGNED_DR (loop_vinfo
) = dr0
;
1883 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
) = npeel
;
1885 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
)
1886 = DR_MISALIGNMENT (dr0
);
1887 SET_DR_MISALIGNMENT (dr0
, 0);
1888 if (dump_enabled_p ())
1890 dump_printf_loc (MSG_NOTE
, vect_location
,
1891 "Alignment of access forced using peeling.\n");
1892 dump_printf_loc (MSG_NOTE
, vect_location
,
1893 "Peeling for alignment will be applied.\n");
1895 /* The inside-loop cost will be accounted for in vectorizable_load
1896 and vectorizable_store correctly with adjusted alignments.
1897 Drop the body_cst_vec on the floor here. */
1898 body_cost_vec
.release ();
1900 stat
= vect_verify_datarefs_alignment (loop_vinfo
);
1906 body_cost_vec
.release ();
1908 /* (2) Versioning to force alignment. */
1910 /* Try versioning if:
1911 1) optimize loop for speed
1912 2) there is at least one unsupported misaligned data ref with an unknown
1914 3) all misaligned data refs with a known misalignment are supported, and
1915 4) the number of runtime alignment checks is within reason. */
1918 optimize_loop_nest_for_speed_p (loop
)
1919 && (!loop
->inner
); /* FORNOW */
1923 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
1925 stmt
= DR_STMT (dr
);
1926 stmt_info
= vinfo_for_stmt (stmt
);
1928 /* For interleaving, only the alignment of the first access
1930 if (aligned_access_p (dr
)
1931 || (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1932 && GROUP_FIRST_ELEMENT (stmt_info
) != stmt
))
1935 if (STMT_VINFO_STRIDED_P (stmt_info
))
1937 /* Strided loads perform only component accesses, alignment is
1938 irrelevant for them. */
1939 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1941 do_versioning
= false;
1945 supportable_dr_alignment
= vect_supportable_dr_alignment (dr
, false);
1947 if (!supportable_dr_alignment
)
1953 if (known_alignment_for_access_p (dr
)
1954 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).length ()
1955 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS
))
1957 do_versioning
= false;
1961 stmt
= DR_STMT (dr
);
1962 vectype
= STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt
));
1963 gcc_assert (vectype
);
1965 /* The rightmost bits of an aligned address must be zeros.
1966 Construct the mask needed for this test. For example,
1967 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
1968 mask must be 15 = 0xf. */
1969 mask
= GET_MODE_SIZE (TYPE_MODE (vectype
)) - 1;
1971 /* FORNOW: use the same mask to test all potentially unaligned
1972 references in the loop. The vectorizer currently supports
1973 a single vector size, see the reference to
1974 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
1975 vectorization factor is computed. */
1976 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo
)
1977 || LOOP_VINFO_PTR_MASK (loop_vinfo
) == mask
);
1978 LOOP_VINFO_PTR_MASK (loop_vinfo
) = mask
;
1979 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).safe_push (
1984 /* Versioning requires at least one misaligned data reference. */
1985 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo
))
1986 do_versioning
= false;
1987 else if (!do_versioning
)
1988 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).truncate (0);
1993 vec
<gimple
*> may_misalign_stmts
1994 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
);
1997 /* It can now be assumed that the data references in the statements
1998 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
1999 of the loop being vectorized. */
2000 FOR_EACH_VEC_ELT (may_misalign_stmts
, i
, stmt
)
2002 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
2003 dr
= STMT_VINFO_DATA_REF (stmt_info
);
2004 SET_DR_MISALIGNMENT (dr
, 0);
2005 if (dump_enabled_p ())
2006 dump_printf_loc (MSG_NOTE
, vect_location
,
2007 "Alignment of access forced using versioning.\n");
2010 if (dump_enabled_p ())
2011 dump_printf_loc (MSG_NOTE
, vect_location
,
2012 "Versioning for alignment will be applied.\n");
2014 /* Peeling and versioning can't be done together at this time. */
2015 gcc_assert (! (do_peeling
&& do_versioning
));
2017 stat
= vect_verify_datarefs_alignment (loop_vinfo
);
2022 /* This point is reached if neither peeling nor versioning is being done. */
2023 gcc_assert (! (do_peeling
|| do_versioning
));
2025 stat
= vect_verify_datarefs_alignment (loop_vinfo
);
2030 /* Function vect_find_same_alignment_drs.
2032 Update group and alignment relations according to the chosen
2033 vectorization factor. */
2036 vect_find_same_alignment_drs (struct data_dependence_relation
*ddr
,
2037 loop_vec_info loop_vinfo
)
2040 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
2041 int vectorization_factor
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
2042 struct data_reference
*dra
= DDR_A (ddr
);
2043 struct data_reference
*drb
= DDR_B (ddr
);
2044 stmt_vec_info stmtinfo_a
= vinfo_for_stmt (DR_STMT (dra
));
2045 stmt_vec_info stmtinfo_b
= vinfo_for_stmt (DR_STMT (drb
));
2046 int dra_size
= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra
))));
2047 int drb_size
= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb
))));
2048 lambda_vector dist_v
;
2049 unsigned int loop_depth
;
2051 if (DDR_ARE_DEPENDENT (ddr
) == chrec_known
)
2057 if (DDR_ARE_DEPENDENT (ddr
) == chrec_dont_know
)
2060 /* Loop-based vectorization and known data dependence. */
2061 if (DDR_NUM_DIST_VECTS (ddr
) == 0)
2064 /* Data-dependence analysis reports a distance vector of zero
2065 for data-references that overlap only in the first iteration
2066 but have different sign step (see PR45764).
2067 So as a sanity check require equal DR_STEP. */
2068 if (!operand_equal_p (DR_STEP (dra
), DR_STEP (drb
), 0))
2071 loop_depth
= index_in_loop_nest (loop
->num
, DDR_LOOP_NEST (ddr
));
2072 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr
), i
, dist_v
)
2074 int dist
= dist_v
[loop_depth
];
2076 if (dump_enabled_p ())
2077 dump_printf_loc (MSG_NOTE
, vect_location
,
2078 "dependence distance = %d.\n", dist
);
2080 /* Same loop iteration. */
2082 || (dist
% vectorization_factor
== 0 && dra_size
== drb_size
))
2084 /* Two references with distance zero have the same alignment. */
2085 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a
).safe_push (drb
);
2086 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b
).safe_push (dra
);
2087 if (dump_enabled_p ())
2089 dump_printf_loc (MSG_NOTE
, vect_location
,
2090 "accesses have the same alignment.\n");
2091 dump_printf (MSG_NOTE
,
2092 "dependence distance modulo vf == 0 between ");
2093 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dra
));
2094 dump_printf (MSG_NOTE
, " and ");
2095 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (drb
));
2096 dump_printf (MSG_NOTE
, "\n");
2103 /* Function vect_analyze_data_refs_alignment
2105 Analyze the alignment of the data-references in the loop.
2106 Return FALSE if a data reference is found that cannot be vectorized. */
2109 vect_analyze_data_refs_alignment (loop_vec_info vinfo
)
2111 if (dump_enabled_p ())
2112 dump_printf_loc (MSG_NOTE
, vect_location
,
2113 "=== vect_analyze_data_refs_alignment ===\n");
2115 /* Mark groups of data references with same alignment using
2116 data dependence information. */
2117 vec
<ddr_p
> ddrs
= vinfo
->ddrs
;
2118 struct data_dependence_relation
*ddr
;
2121 FOR_EACH_VEC_ELT (ddrs
, i
, ddr
)
2122 vect_find_same_alignment_drs (ddr
, vinfo
);
2124 vec
<data_reference_p
> datarefs
= vinfo
->datarefs
;
2125 struct data_reference
*dr
;
2127 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
2129 stmt_vec_info stmt_info
= vinfo_for_stmt (DR_STMT (dr
));
2130 if (STMT_VINFO_VECTORIZABLE (stmt_info
)
2131 && !vect_compute_data_ref_alignment (dr
))
2133 /* Strided accesses perform only component accesses, misalignment
2134 information is irrelevant for them. */
2135 if (STMT_VINFO_STRIDED_P (stmt_info
)
2136 && !STMT_VINFO_GROUPED_ACCESS (stmt_info
))
2139 if (dump_enabled_p ())
2140 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2141 "not vectorized: can't calculate alignment "
2152 /* Analyze alignment of DRs of stmts in NODE. */
2155 vect_slp_analyze_and_verify_node_alignment (slp_tree node
)
2157 /* We vectorize from the first scalar stmt in the node unless
2158 the node is permuted in which case we start from the first
2159 element in the group. */
2160 gimple
*first_stmt
= SLP_TREE_SCALAR_STMTS (node
)[0];
2161 data_reference_p first_dr
= STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt
));
2162 if (SLP_TREE_LOAD_PERMUTATION (node
).exists ())
2163 first_stmt
= GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt
));
2165 data_reference_p dr
= STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt
));
2166 if (! vect_compute_data_ref_alignment (dr
)
2167 /* For creating the data-ref pointer we need alignment of the
2168 first element anyway. */
2170 && ! vect_compute_data_ref_alignment (first_dr
))
2171 || ! verify_data_ref_alignment (dr
))
2173 if (dump_enabled_p ())
2174 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2175 "not vectorized: bad data alignment in basic "
2183 /* Function vect_slp_analyze_instance_alignment
2185 Analyze the alignment of the data-references in the SLP instance.
2186 Return FALSE if a data reference is found that cannot be vectorized. */
2189 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance
)
2191 if (dump_enabled_p ())
2192 dump_printf_loc (MSG_NOTE
, vect_location
,
2193 "=== vect_slp_analyze_and_verify_instance_alignment ===\n");
2197 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance
), i
, node
)
2198 if (! vect_slp_analyze_and_verify_node_alignment (node
))
2201 node
= SLP_INSTANCE_TREE (instance
);
2202 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node
)[0]))
2203 && ! vect_slp_analyze_and_verify_node_alignment
2204 (SLP_INSTANCE_TREE (instance
)))
2211 /* Analyze groups of accesses: check that DR belongs to a group of
2212 accesses of legal size, step, etc. Detect gaps, single element
2213 interleaving, and other special cases. Set grouped access info.
2214 Collect groups of strided stores for further use in SLP analysis.
2215 Worker for vect_analyze_group_access. */
2218 vect_analyze_group_access_1 (struct data_reference
*dr
)
2220 tree step
= DR_STEP (dr
);
2221 tree scalar_type
= TREE_TYPE (DR_REF (dr
));
2222 HOST_WIDE_INT type_size
= TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type
));
2223 gimple
*stmt
= DR_STMT (dr
);
2224 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
2225 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
2226 bb_vec_info bb_vinfo
= STMT_VINFO_BB_VINFO (stmt_info
);
2227 HOST_WIDE_INT dr_step
= -1;
2228 HOST_WIDE_INT groupsize
, last_accessed_element
= 1;
2229 bool slp_impossible
= false;
2231 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2232 size of the interleaving group (including gaps). */
2233 if (tree_fits_shwi_p (step
))
2235 dr_step
= tree_to_shwi (step
);
2236 /* Check that STEP is a multiple of type size. Otherwise there is
2237 a non-element-sized gap at the end of the group which we
2238 cannot represent in GROUP_GAP or GROUP_SIZE.
2239 ??? As we can handle non-constant step fine here we should
2240 simply remove uses of GROUP_GAP between the last and first
2241 element and instead rely on DR_STEP. GROUP_SIZE then would
2242 simply not include that gap. */
2243 if ((dr_step
% type_size
) != 0)
2245 if (dump_enabled_p ())
2247 dump_printf_loc (MSG_NOTE
, vect_location
,
2249 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, step
);
2250 dump_printf (MSG_NOTE
,
2251 " is not a multiple of the element size for ");
2252 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dr
));
2253 dump_printf (MSG_NOTE
, "\n");
2257 groupsize
= absu_hwi (dr_step
) / type_size
;
2262 /* Not consecutive access is possible only if it is a part of interleaving. */
2263 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)))
2265 /* Check if it this DR is a part of interleaving, and is a single
2266 element of the group that is accessed in the loop. */
2268 /* Gaps are supported only for loads. STEP must be a multiple of the type
2269 size. The size of the group must be a power of 2. */
2271 && (dr_step
% type_size
) == 0
2273 && pow2p_hwi (groupsize
))
2275 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) = stmt
;
2276 GROUP_SIZE (vinfo_for_stmt (stmt
)) = groupsize
;
2277 GROUP_GAP (stmt_info
) = groupsize
- 1;
2278 if (dump_enabled_p ())
2280 dump_printf_loc (MSG_NOTE
, vect_location
,
2281 "Detected single element interleaving ");
2282 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dr
));
2283 dump_printf (MSG_NOTE
, " step ");
2284 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, step
);
2285 dump_printf (MSG_NOTE
, "\n");
2291 if (dump_enabled_p ())
2293 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2294 "not consecutive access ");
2295 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
2300 /* Mark the statement as unvectorizable. */
2301 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr
))) = false;
2305 dump_printf_loc (MSG_NOTE
, vect_location
, "using strided accesses\n");
2306 STMT_VINFO_STRIDED_P (stmt_info
) = true;
2310 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) == stmt
)
2312 /* First stmt in the interleaving chain. Check the chain. */
2313 gimple
*next
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt
));
2314 struct data_reference
*data_ref
= dr
;
2315 unsigned int count
= 1;
2316 tree prev_init
= DR_INIT (data_ref
);
2317 gimple
*prev
= stmt
;
2318 HOST_WIDE_INT diff
, gaps
= 0;
2322 /* Skip same data-refs. In case that two or more stmts share
2323 data-ref (supported only for loads), we vectorize only the first
2324 stmt, and the rest get their vectorized loads from the first
2326 if (!tree_int_cst_compare (DR_INIT (data_ref
),
2327 DR_INIT (STMT_VINFO_DATA_REF (
2328 vinfo_for_stmt (next
)))))
2330 if (DR_IS_WRITE (data_ref
))
2332 if (dump_enabled_p ())
2333 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2334 "Two store stmts share the same dr.\n");
2338 if (dump_enabled_p ())
2339 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2340 "Two or more load stmts share the same dr.\n");
2342 /* For load use the same data-ref load. */
2343 GROUP_SAME_DR_STMT (vinfo_for_stmt (next
)) = prev
;
2346 next
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (next
));
2351 data_ref
= STMT_VINFO_DATA_REF (vinfo_for_stmt (next
));
2353 /* All group members have the same STEP by construction. */
2354 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref
), step
, 0));
2356 /* Check that the distance between two accesses is equal to the type
2357 size. Otherwise, we have gaps. */
2358 diff
= (TREE_INT_CST_LOW (DR_INIT (data_ref
))
2359 - TREE_INT_CST_LOW (prev_init
)) / type_size
;
2362 /* FORNOW: SLP of accesses with gaps is not supported. */
2363 slp_impossible
= true;
2364 if (DR_IS_WRITE (data_ref
))
2366 if (dump_enabled_p ())
2367 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2368 "interleaved store with gaps\n");
2375 last_accessed_element
+= diff
;
2377 /* Store the gap from the previous member of the group. If there is no
2378 gap in the access, GROUP_GAP is always 1. */
2379 GROUP_GAP (vinfo_for_stmt (next
)) = diff
;
2381 prev_init
= DR_INIT (data_ref
);
2382 next
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (next
));
2383 /* Count the number of data-refs in the chain. */
2388 groupsize
= count
+ gaps
;
2390 /* This could be UINT_MAX but as we are generating code in a very
2391 inefficient way we have to cap earlier. See PR78699 for example. */
2392 if (groupsize
> 4096)
2394 if (dump_enabled_p ())
2395 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2396 "group is too large\n");
2400 /* Check that the size of the interleaving is equal to count for stores,
2401 i.e., that there are no gaps. */
2402 if (groupsize
!= count
2403 && !DR_IS_READ (dr
))
2405 if (dump_enabled_p ())
2406 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2407 "interleaved store with gaps\n");
2411 /* If there is a gap after the last load in the group it is the
2412 difference between the groupsize and the last accessed
2414 When there is no gap, this difference should be 0. */
2415 GROUP_GAP (vinfo_for_stmt (stmt
)) = groupsize
- last_accessed_element
;
2417 GROUP_SIZE (vinfo_for_stmt (stmt
)) = groupsize
;
2418 if (dump_enabled_p ())
2420 dump_printf_loc (MSG_NOTE
, vect_location
,
2421 "Detected interleaving ");
2422 if (DR_IS_READ (dr
))
2423 dump_printf (MSG_NOTE
, "load ");
2425 dump_printf (MSG_NOTE
, "store ");
2426 dump_printf (MSG_NOTE
, "of size %u starting with ",
2427 (unsigned)groupsize
);
2428 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, stmt
, 0);
2429 if (GROUP_GAP (vinfo_for_stmt (stmt
)) != 0)
2430 dump_printf_loc (MSG_NOTE
, vect_location
,
2431 "There is a gap of %u elements after the group\n",
2432 GROUP_GAP (vinfo_for_stmt (stmt
)));
2435 /* SLP: create an SLP data structure for every interleaving group of
2436 stores for further analysis in vect_analyse_slp. */
2437 if (DR_IS_WRITE (dr
) && !slp_impossible
)
2440 LOOP_VINFO_GROUPED_STORES (loop_vinfo
).safe_push (stmt
);
2442 BB_VINFO_GROUPED_STORES (bb_vinfo
).safe_push (stmt
);
2449 /* Analyze groups of accesses: check that DR belongs to a group of
2450 accesses of legal size, step, etc. Detect gaps, single element
2451 interleaving, and other special cases. Set grouped access info.
2452 Collect groups of strided stores for further use in SLP analysis. */
2455 vect_analyze_group_access (struct data_reference
*dr
)
2457 if (!vect_analyze_group_access_1 (dr
))
2459 /* Dissolve the group if present. */
2461 gimple
*stmt
= GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr
)));
2464 stmt_vec_info vinfo
= vinfo_for_stmt (stmt
);
2465 next
= GROUP_NEXT_ELEMENT (vinfo
);
2466 GROUP_FIRST_ELEMENT (vinfo
) = NULL
;
2467 GROUP_NEXT_ELEMENT (vinfo
) = NULL
;
2475 /* Analyze the access pattern of the data-reference DR.
2476 In case of non-consecutive accesses call vect_analyze_group_access() to
2477 analyze groups of accesses. */
2480 vect_analyze_data_ref_access (struct data_reference
*dr
)
2482 tree step
= DR_STEP (dr
);
2483 tree scalar_type
= TREE_TYPE (DR_REF (dr
));
2484 gimple
*stmt
= DR_STMT (dr
);
2485 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
2486 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
2487 struct loop
*loop
= NULL
;
2490 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
2492 if (loop_vinfo
&& !step
)
2494 if (dump_enabled_p ())
2495 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2496 "bad data-ref access in loop\n");
2500 /* Allow loads with zero step in inner-loop vectorization. */
2501 if (loop_vinfo
&& integer_zerop (step
))
2503 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) = NULL
;
2504 if (!nested_in_vect_loop_p (loop
, stmt
))
2505 return DR_IS_READ (dr
);
2506 /* Allow references with zero step for outer loops marked
2507 with pragma omp simd only - it guarantees absence of
2508 loop-carried dependencies between inner loop iterations. */
2509 if (!loop
->force_vectorize
)
2511 if (dump_enabled_p ())
2512 dump_printf_loc (MSG_NOTE
, vect_location
,
2513 "zero step in inner loop of nest\n");
2518 if (loop
&& nested_in_vect_loop_p (loop
, stmt
))
2520 /* Interleaved accesses are not yet supported within outer-loop
2521 vectorization for references in the inner-loop. */
2522 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) = NULL
;
2524 /* For the rest of the analysis we use the outer-loop step. */
2525 step
= STMT_VINFO_DR_STEP (stmt_info
);
2526 if (integer_zerop (step
))
2528 if (dump_enabled_p ())
2529 dump_printf_loc (MSG_NOTE
, vect_location
,
2530 "zero step in outer loop.\n");
2531 return DR_IS_READ (dr
);
2536 if (TREE_CODE (step
) == INTEGER_CST
)
2538 HOST_WIDE_INT dr_step
= TREE_INT_CST_LOW (step
);
2539 if (!tree_int_cst_compare (step
, TYPE_SIZE_UNIT (scalar_type
))
2541 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type
), -dr_step
)))
2543 /* Mark that it is not interleaving. */
2544 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) = NULL
;
2549 if (loop
&& nested_in_vect_loop_p (loop
, stmt
))
2551 if (dump_enabled_p ())
2552 dump_printf_loc (MSG_NOTE
, vect_location
,
2553 "grouped access in outer loop.\n");
2558 /* Assume this is a DR handled by non-constant strided load case. */
2559 if (TREE_CODE (step
) != INTEGER_CST
)
2560 return (STMT_VINFO_STRIDED_P (stmt_info
)
2561 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info
)
2562 || vect_analyze_group_access (dr
)));
2564 /* Not consecutive access - check if it's a part of interleaving group. */
2565 return vect_analyze_group_access (dr
);
2570 /* A helper function used in the comparator function to sort data
2571 references. T1 and T2 are two data references to be compared.
2572 The function returns -1, 0, or 1. */
2575 compare_tree (tree t1
, tree t2
)
2578 enum tree_code code
;
2591 if (TREE_CODE (t1
) != TREE_CODE (t2
))
2592 return TREE_CODE (t1
) < TREE_CODE (t2
) ? -1 : 1;
2594 code
= TREE_CODE (t1
);
2597 /* For const values, we can just use hash values for comparisons. */
2605 hashval_t h1
= iterative_hash_expr (t1
, 0);
2606 hashval_t h2
= iterative_hash_expr (t2
, 0);
2608 return h1
< h2
? -1 : 1;
2613 cmp
= compare_tree (SSA_NAME_VAR (t1
), SSA_NAME_VAR (t2
));
2617 if (SSA_NAME_VERSION (t1
) != SSA_NAME_VERSION (t2
))
2618 return SSA_NAME_VERSION (t1
) < SSA_NAME_VERSION (t2
) ? -1 : 1;
2622 tclass
= TREE_CODE_CLASS (code
);
2624 /* For var-decl, we could compare their UIDs. */
2625 if (tclass
== tcc_declaration
)
2627 if (DECL_UID (t1
) != DECL_UID (t2
))
2628 return DECL_UID (t1
) < DECL_UID (t2
) ? -1 : 1;
2632 /* For expressions with operands, compare their operands recursively. */
2633 for (i
= TREE_OPERAND_LENGTH (t1
) - 1; i
>= 0; --i
)
2635 cmp
= compare_tree (TREE_OPERAND (t1
, i
), TREE_OPERAND (t2
, i
));
2645 /* Compare two data-references DRA and DRB to group them into chunks
2646 suitable for grouping. */
2649 dr_group_sort_cmp (const void *dra_
, const void *drb_
)
2651 data_reference_p dra
= *(data_reference_p
*)const_cast<void *>(dra_
);
2652 data_reference_p drb
= *(data_reference_p
*)const_cast<void *>(drb_
);
2655 /* Stabilize sort. */
2659 /* DRs in different loops never belong to the same group. */
2660 loop_p loopa
= gimple_bb (DR_STMT (dra
))->loop_father
;
2661 loop_p loopb
= gimple_bb (DR_STMT (drb
))->loop_father
;
2663 return loopa
->num
< loopb
->num
? -1 : 1;
2665 /* Ordering of DRs according to base. */
2666 if (!operand_equal_p (DR_BASE_ADDRESS (dra
), DR_BASE_ADDRESS (drb
), 0))
2668 cmp
= compare_tree (DR_BASE_ADDRESS (dra
), DR_BASE_ADDRESS (drb
));
2673 /* And according to DR_OFFSET. */
2674 if (!dr_equal_offsets_p (dra
, drb
))
2676 cmp
= compare_tree (DR_OFFSET (dra
), DR_OFFSET (drb
));
2681 /* Put reads before writes. */
2682 if (DR_IS_READ (dra
) != DR_IS_READ (drb
))
2683 return DR_IS_READ (dra
) ? -1 : 1;
2685 /* Then sort after access size. */
2686 if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra
))),
2687 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb
))), 0))
2689 cmp
= compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra
))),
2690 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb
))));
2695 /* And after step. */
2696 if (!operand_equal_p (DR_STEP (dra
), DR_STEP (drb
), 0))
2698 cmp
= compare_tree (DR_STEP (dra
), DR_STEP (drb
));
2703 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2704 cmp
= tree_int_cst_compare (DR_INIT (dra
), DR_INIT (drb
));
2706 return gimple_uid (DR_STMT (dra
)) < gimple_uid (DR_STMT (drb
)) ? -1 : 1;
2710 /* Function vect_analyze_data_ref_accesses.
2712 Analyze the access pattern of all the data references in the loop.
2714 FORNOW: the only access pattern that is considered vectorizable is a
2715 simple step 1 (consecutive) access.
2717 FORNOW: handle only arrays and pointer accesses. */
2720 vect_analyze_data_ref_accesses (vec_info
*vinfo
)
2723 vec
<data_reference_p
> datarefs
= vinfo
->datarefs
;
2724 struct data_reference
*dr
;
2726 if (dump_enabled_p ())
2727 dump_printf_loc (MSG_NOTE
, vect_location
,
2728 "=== vect_analyze_data_ref_accesses ===\n");
2730 if (datarefs
.is_empty ())
2733 /* Sort the array of datarefs to make building the interleaving chains
2734 linear. Don't modify the original vector's order, it is needed for
2735 determining what dependencies are reversed. */
2736 vec
<data_reference_p
> datarefs_copy
= datarefs
.copy ();
2737 datarefs_copy
.qsort (dr_group_sort_cmp
);
2739 /* Build the interleaving chains. */
2740 for (i
= 0; i
< datarefs_copy
.length () - 1;)
2742 data_reference_p dra
= datarefs_copy
[i
];
2743 stmt_vec_info stmtinfo_a
= vinfo_for_stmt (DR_STMT (dra
));
2744 stmt_vec_info lastinfo
= NULL
;
2745 if (! STMT_VINFO_VECTORIZABLE (stmtinfo_a
))
2750 for (i
= i
+ 1; i
< datarefs_copy
.length (); ++i
)
2752 data_reference_p drb
= datarefs_copy
[i
];
2753 stmt_vec_info stmtinfo_b
= vinfo_for_stmt (DR_STMT (drb
));
2754 if (! STMT_VINFO_VECTORIZABLE (stmtinfo_b
))
2757 /* ??? Imperfect sorting (non-compatible types, non-modulo
2758 accesses, same accesses) can lead to a group to be artificially
2759 split here as we don't just skip over those. If it really
2760 matters we can push those to a worklist and re-iterate
2761 over them. The we can just skip ahead to the next DR here. */
2763 /* DRs in a different loop should not be put into the same
2764 interleaving group. */
2765 if (gimple_bb (DR_STMT (dra
))->loop_father
2766 != gimple_bb (DR_STMT (drb
))->loop_father
)
2769 /* Check that the data-refs have same first location (except init)
2770 and they are both either store or load (not load and store,
2771 not masked loads or stores). */
2772 if (DR_IS_READ (dra
) != DR_IS_READ (drb
)
2773 || !operand_equal_p (DR_BASE_ADDRESS (dra
),
2774 DR_BASE_ADDRESS (drb
), 0)
2775 || !dr_equal_offsets_p (dra
, drb
)
2776 || !gimple_assign_single_p (DR_STMT (dra
))
2777 || !gimple_assign_single_p (DR_STMT (drb
)))
2780 /* Check that the data-refs have the same constant size. */
2781 tree sza
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra
)));
2782 tree szb
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb
)));
2783 if (!tree_fits_uhwi_p (sza
)
2784 || !tree_fits_uhwi_p (szb
)
2785 || !tree_int_cst_equal (sza
, szb
))
2788 /* Check that the data-refs have the same step. */
2789 if (!operand_equal_p (DR_STEP (dra
), DR_STEP (drb
), 0))
2792 /* Do not place the same access in the interleaving chain twice. */
2793 if (tree_int_cst_compare (DR_INIT (dra
), DR_INIT (drb
)) == 0)
2796 /* Check the types are compatible.
2797 ??? We don't distinguish this during sorting. */
2798 if (!types_compatible_p (TREE_TYPE (DR_REF (dra
)),
2799 TREE_TYPE (DR_REF (drb
))))
2802 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2803 HOST_WIDE_INT init_a
= TREE_INT_CST_LOW (DR_INIT (dra
));
2804 HOST_WIDE_INT init_b
= TREE_INT_CST_LOW (DR_INIT (drb
));
2805 gcc_assert (init_a
<= init_b
);
2807 /* If init_b == init_a + the size of the type * k, we have an
2808 interleaving, and DRA is accessed before DRB. */
2809 HOST_WIDE_INT type_size_a
= tree_to_uhwi (sza
);
2810 if (type_size_a
== 0
2811 || (init_b
- init_a
) % type_size_a
!= 0)
2814 /* If we have a store, the accesses are adjacent. This splits
2815 groups into chunks we support (we don't support vectorization
2816 of stores with gaps). */
2817 if (!DR_IS_READ (dra
)
2818 && (init_b
- (HOST_WIDE_INT
) TREE_INT_CST_LOW
2819 (DR_INIT (datarefs_copy
[i
-1]))
2823 /* If the step (if not zero or non-constant) is greater than the
2824 difference between data-refs' inits this splits groups into
2826 if (tree_fits_shwi_p (DR_STEP (dra
)))
2828 HOST_WIDE_INT step
= tree_to_shwi (DR_STEP (dra
));
2829 if (step
!= 0 && step
<= (init_b
- init_a
))
2833 if (dump_enabled_p ())
2835 dump_printf_loc (MSG_NOTE
, vect_location
,
2836 "Detected interleaving ");
2837 if (DR_IS_READ (dra
))
2838 dump_printf (MSG_NOTE
, "load ");
2840 dump_printf (MSG_NOTE
, "store ");
2841 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dra
));
2842 dump_printf (MSG_NOTE
, " and ");
2843 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (drb
));
2844 dump_printf (MSG_NOTE
, "\n");
2847 /* Link the found element into the group list. */
2848 if (!GROUP_FIRST_ELEMENT (stmtinfo_a
))
2850 GROUP_FIRST_ELEMENT (stmtinfo_a
) = DR_STMT (dra
);
2851 lastinfo
= stmtinfo_a
;
2853 GROUP_FIRST_ELEMENT (stmtinfo_b
) = DR_STMT (dra
);
2854 GROUP_NEXT_ELEMENT (lastinfo
) = DR_STMT (drb
);
2855 lastinfo
= stmtinfo_b
;
2859 FOR_EACH_VEC_ELT (datarefs_copy
, i
, dr
)
2860 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr
)))
2861 && !vect_analyze_data_ref_access (dr
))
2863 if (dump_enabled_p ())
2864 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2865 "not vectorized: complicated access pattern.\n");
2867 if (is_a
<bb_vec_info
> (vinfo
))
2869 /* Mark the statement as not vectorizable. */
2870 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr
))) = false;
2875 datarefs_copy
.release ();
2880 datarefs_copy
.release ();
2885 /* Operator == between two dr_with_seg_len objects.
2887 This equality operator is used to make sure two data refs
2888 are the same one so that we will consider to combine the
2889 aliasing checks of those two pairs of data dependent data
2893 operator == (const dr_with_seg_len
& d1
,
2894 const dr_with_seg_len
& d2
)
2896 return operand_equal_p (DR_BASE_ADDRESS (d1
.dr
),
2897 DR_BASE_ADDRESS (d2
.dr
), 0)
2898 && compare_tree (DR_OFFSET (d1
.dr
), DR_OFFSET (d2
.dr
)) == 0
2899 && compare_tree (DR_INIT (d1
.dr
), DR_INIT (d2
.dr
)) == 0
2900 && compare_tree (d1
.seg_len
, d2
.seg_len
) == 0;
2903 /* Function comp_dr_with_seg_len_pair.
2905 Comparison function for sorting objects of dr_with_seg_len_pair_t
2906 so that we can combine aliasing checks in one scan. */
2909 comp_dr_with_seg_len_pair (const void *pa_
, const void *pb_
)
2911 const dr_with_seg_len_pair_t
* pa
= (const dr_with_seg_len_pair_t
*) pa_
;
2912 const dr_with_seg_len_pair_t
* pb
= (const dr_with_seg_len_pair_t
*) pb_
;
2913 const dr_with_seg_len
&a1
= pa
->first
, &a2
= pa
->second
;
2914 const dr_with_seg_len
&b1
= pb
->first
, &b2
= pb
->second
;
2916 /* For DR pairs (a, b) and (c, d), we only consider to merge the alias checks
2917 if a and c have the same basic address snd step, and b and d have the same
2918 address and step. Therefore, if any a&c or b&d don't have the same address
2919 and step, we don't care the order of those two pairs after sorting. */
2922 if ((comp_res
= compare_tree (DR_BASE_ADDRESS (a1
.dr
),
2923 DR_BASE_ADDRESS (b1
.dr
))) != 0)
2925 if ((comp_res
= compare_tree (DR_BASE_ADDRESS (a2
.dr
),
2926 DR_BASE_ADDRESS (b2
.dr
))) != 0)
2928 if ((comp_res
= compare_tree (DR_STEP (a1
.dr
), DR_STEP (b1
.dr
))) != 0)
2930 if ((comp_res
= compare_tree (DR_STEP (a2
.dr
), DR_STEP (b2
.dr
))) != 0)
2932 if ((comp_res
= compare_tree (DR_OFFSET (a1
.dr
), DR_OFFSET (b1
.dr
))) != 0)
2934 if ((comp_res
= compare_tree (DR_INIT (a1
.dr
), DR_INIT (b1
.dr
))) != 0)
2936 if ((comp_res
= compare_tree (DR_OFFSET (a2
.dr
), DR_OFFSET (b2
.dr
))) != 0)
2938 if ((comp_res
= compare_tree (DR_INIT (a2
.dr
), DR_INIT (b2
.dr
))) != 0)
2944 /* Function vect_vfa_segment_size.
2946 Create an expression that computes the size of segment
2947 that will be accessed for a data reference. The functions takes into
2948 account that realignment loads may access one more vector.
2951 DR: The data reference.
2952 LENGTH_FACTOR: segment length to consider.
2954 Return an expression whose value is the size of segment which will be
2958 vect_vfa_segment_size (struct data_reference
*dr
, tree length_factor
)
2960 tree segment_length
;
2962 if (integer_zerop (DR_STEP (dr
)))
2963 segment_length
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr
)));
2965 segment_length
= size_binop (MULT_EXPR
,
2966 fold_convert (sizetype
, DR_STEP (dr
)),
2967 fold_convert (sizetype
, length_factor
));
2969 if (vect_supportable_dr_alignment (dr
, false)
2970 == dr_explicit_realign_optimized
)
2972 tree vector_size
= TYPE_SIZE_UNIT
2973 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr
))));
2975 segment_length
= size_binop (PLUS_EXPR
, segment_length
, vector_size
);
2977 return segment_length
;
2980 /* Function vect_no_alias_p.
2982 Given data references A and B with equal base and offset, the alias
2983 relation can be decided at compilation time, return TRUE if they do
2984 not alias to each other; return FALSE otherwise. SEGMENT_LENGTH_A
2985 and SEGMENT_LENGTH_B are the memory lengths accessed by A and B
2989 vect_no_alias_p (struct data_reference
*a
, struct data_reference
*b
,
2990 tree segment_length_a
, tree segment_length_b
)
2992 gcc_assert (TREE_CODE (DR_INIT (a
)) == INTEGER_CST
2993 && TREE_CODE (DR_INIT (b
)) == INTEGER_CST
);
2994 if (tree_int_cst_equal (DR_INIT (a
), DR_INIT (b
)))
2997 tree seg_a_min
= DR_INIT (a
);
2998 tree seg_a_max
= fold_build2 (PLUS_EXPR
, TREE_TYPE (seg_a_min
),
2999 seg_a_min
, segment_length_a
);
3000 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3001 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3003 if (tree_int_cst_compare (DR_STEP (a
), size_zero_node
) < 0)
3005 tree unit_size
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (a
)));
3006 seg_a_min
= fold_build2 (PLUS_EXPR
, TREE_TYPE (seg_a_max
),
3007 seg_a_max
, unit_size
);
3008 seg_a_max
= fold_build2 (PLUS_EXPR
, TREE_TYPE (DR_INIT (a
)),
3009 DR_INIT (a
), unit_size
);
3011 tree seg_b_min
= DR_INIT (b
);
3012 tree seg_b_max
= fold_build2 (PLUS_EXPR
, TREE_TYPE (seg_b_min
),
3013 seg_b_min
, segment_length_b
);
3014 if (tree_int_cst_compare (DR_STEP (b
), size_zero_node
) < 0)
3016 tree unit_size
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (b
)));
3017 seg_b_min
= fold_build2 (PLUS_EXPR
, TREE_TYPE (seg_b_max
),
3018 seg_b_max
, unit_size
);
3019 seg_b_max
= fold_build2 (PLUS_EXPR
, TREE_TYPE (DR_INIT (b
)),
3020 DR_INIT (b
), unit_size
);
3023 if (tree_int_cst_le (seg_a_max
, seg_b_min
)
3024 || tree_int_cst_le (seg_b_max
, seg_a_min
))
3030 /* Function vect_prune_runtime_alias_test_list.
3032 Prune a list of ddrs to be tested at run-time by versioning for alias.
3033 Merge several alias checks into one if possible.
3034 Return FALSE if resulting list of ddrs is longer then allowed by
3035 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3038 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo
)
3040 vec
<ddr_p
> may_alias_ddrs
=
3041 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo
);
3042 vec
<dr_with_seg_len_pair_t
>& comp_alias_ddrs
=
3043 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo
);
3044 int vect_factor
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
3045 tree scalar_loop_iters
= LOOP_VINFO_NITERS (loop_vinfo
);
3051 if (dump_enabled_p ())
3052 dump_printf_loc (MSG_NOTE
, vect_location
,
3053 "=== vect_prune_runtime_alias_test_list ===\n");
3055 if (may_alias_ddrs
.is_empty ())
3058 /* Basically, for each pair of dependent data refs store_ptr_0
3059 and load_ptr_0, we create an expression:
3061 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
3062 || (load_ptr_0 + load_segment_length_0) <= store_ptr_0))
3064 for aliasing checks. However, in some cases we can decrease
3065 the number of checks by combining two checks into one. For
3066 example, suppose we have another pair of data refs store_ptr_0
3067 and load_ptr_1, and if the following condition is satisfied:
3069 load_ptr_0 < load_ptr_1 &&
3070 load_ptr_1 - load_ptr_0 - load_segment_length_0 < store_segment_length_0
3072 (this condition means, in each iteration of vectorized loop,
3073 the accessed memory of store_ptr_0 cannot be between the memory
3074 of load_ptr_0 and load_ptr_1.)
3076 we then can use only the following expression to finish the
3077 alising checks between store_ptr_0 & load_ptr_0 and
3078 store_ptr_0 & load_ptr_1:
3080 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
3081 || (load_ptr_1 + load_segment_length_1 <= store_ptr_0))
3083 Note that we only consider that load_ptr_0 and load_ptr_1 have the
3084 same basic address. */
3086 comp_alias_ddrs
.create (may_alias_ddrs
.length ());
3088 /* First, we collect all data ref pairs for aliasing checks. */
3089 FOR_EACH_VEC_ELT (may_alias_ddrs
, i
, ddr
)
3092 struct data_reference
*dr_a
, *dr_b
;
3093 gimple
*dr_group_first_a
, *dr_group_first_b
;
3094 tree segment_length_a
, segment_length_b
;
3095 gimple
*stmt_a
, *stmt_b
;
3098 stmt_a
= DR_STMT (DDR_A (ddr
));
3099 dr_group_first_a
= GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a
));
3100 if (dr_group_first_a
)
3102 stmt_a
= dr_group_first_a
;
3103 dr_a
= STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a
));
3107 stmt_b
= DR_STMT (DDR_B (ddr
));
3108 dr_group_first_b
= GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b
));
3109 if (dr_group_first_b
)
3111 stmt_b
= dr_group_first_b
;
3112 dr_b
= STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b
));
3115 if (!operand_equal_p (DR_STEP (dr_a
), DR_STEP (dr_b
), 0))
3116 length_factor
= scalar_loop_iters
;
3118 length_factor
= size_int (vect_factor
);
3119 segment_length_a
= vect_vfa_segment_size (dr_a
, length_factor
);
3120 segment_length_b
= vect_vfa_segment_size (dr_b
, length_factor
);
3122 comp_res
= compare_tree (DR_BASE_ADDRESS (dr_a
), DR_BASE_ADDRESS (dr_b
));
3124 comp_res
= compare_tree (DR_OFFSET (dr_a
), DR_OFFSET (dr_b
));
3126 /* Alias is known at compilation time. */
3128 && TREE_CODE (DR_STEP (dr_a
)) == INTEGER_CST
3129 && TREE_CODE (DR_STEP (dr_b
)) == INTEGER_CST
3130 && TREE_CODE (segment_length_a
) == INTEGER_CST
3131 && TREE_CODE (segment_length_b
) == INTEGER_CST
)
3133 if (vect_no_alias_p (dr_a
, dr_b
, segment_length_a
, segment_length_b
))
3136 if (dump_enabled_p ())
3137 dump_printf_loc (MSG_NOTE
, vect_location
,
3138 "not vectorized: compilation time alias.\n");
3143 dr_with_seg_len_pair_t dr_with_seg_len_pair
3144 (dr_with_seg_len (dr_a
, segment_length_a
),
3145 dr_with_seg_len (dr_b
, segment_length_b
));
3147 /* Canonicalize pairs by sorting the two DR members. */
3149 std::swap (dr_with_seg_len_pair
.first
, dr_with_seg_len_pair
.second
);
3151 comp_alias_ddrs
.safe_push (dr_with_seg_len_pair
);
3154 /* Second, we sort the collected data ref pairs so that we can scan
3155 them once to combine all possible aliasing checks. */
3156 comp_alias_ddrs
.qsort (comp_dr_with_seg_len_pair
);
3158 /* Third, we scan the sorted dr pairs and check if we can combine
3159 alias checks of two neighboring dr pairs. */
3160 for (size_t i
= 1; i
< comp_alias_ddrs
.length (); ++i
)
3162 /* Deal with two ddrs (dr_a1, dr_b1) and (dr_a2, dr_b2). */
3163 dr_with_seg_len
*dr_a1
= &comp_alias_ddrs
[i
-1].first
,
3164 *dr_b1
= &comp_alias_ddrs
[i
-1].second
,
3165 *dr_a2
= &comp_alias_ddrs
[i
].first
,
3166 *dr_b2
= &comp_alias_ddrs
[i
].second
;
3168 /* Remove duplicate data ref pairs. */
3169 if (*dr_a1
== *dr_a2
&& *dr_b1
== *dr_b2
)
3171 if (dump_enabled_p ())
3173 dump_printf_loc (MSG_NOTE
, vect_location
,
3174 "found equal ranges ");
3175 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3176 DR_REF (dr_a1
->dr
));
3177 dump_printf (MSG_NOTE
, ", ");
3178 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3179 DR_REF (dr_b1
->dr
));
3180 dump_printf (MSG_NOTE
, " and ");
3181 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3182 DR_REF (dr_a2
->dr
));
3183 dump_printf (MSG_NOTE
, ", ");
3184 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3185 DR_REF (dr_b2
->dr
));
3186 dump_printf (MSG_NOTE
, "\n");
3189 comp_alias_ddrs
.ordered_remove (i
--);
3193 if (*dr_a1
== *dr_a2
|| *dr_b1
== *dr_b2
)
3195 /* We consider the case that DR_B1 and DR_B2 are same memrefs,
3196 and DR_A1 and DR_A2 are two consecutive memrefs. */
3197 if (*dr_a1
== *dr_a2
)
3199 std::swap (dr_a1
, dr_b1
);
3200 std::swap (dr_a2
, dr_b2
);
3203 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a1
->dr
),
3204 DR_BASE_ADDRESS (dr_a2
->dr
), 0)
3205 || !operand_equal_p (DR_OFFSET (dr_a1
->dr
),
3206 DR_OFFSET (dr_a2
->dr
), 0)
3207 || !tree_fits_shwi_p (DR_INIT (dr_a1
->dr
))
3208 || !tree_fits_shwi_p (DR_INIT (dr_a2
->dr
)))
3211 /* Make sure dr_a1 starts left of dr_a2. */
3212 if (tree_int_cst_lt (DR_INIT (dr_a2
->dr
), DR_INIT (dr_a1
->dr
)))
3213 std::swap (*dr_a1
, *dr_a2
);
3215 bool do_remove
= false;
3216 unsigned HOST_WIDE_INT diff
3217 = (tree_to_shwi (DR_INIT (dr_a2
->dr
))
3218 - tree_to_shwi (DR_INIT (dr_a1
->dr
)));
3220 /* If the left segment does not extend beyond the start of the
3221 right segment the new segment length is that of the right
3222 plus the segment distance. */
3223 if (tree_fits_uhwi_p (dr_a1
->seg_len
)
3224 && compare_tree_int (dr_a1
->seg_len
, diff
) <= 0)
3226 dr_a1
->seg_len
= size_binop (PLUS_EXPR
, dr_a2
->seg_len
,
3230 /* Generally the new segment length is the maximum of the
3231 left segment size and the right segment size plus the distance.
3232 ??? We can also build tree MAX_EXPR here but it's not clear this
3234 else if (tree_fits_uhwi_p (dr_a1
->seg_len
)
3235 && tree_fits_uhwi_p (dr_a2
->seg_len
))
3237 unsigned HOST_WIDE_INT seg_len_a1
= tree_to_uhwi (dr_a1
->seg_len
);
3238 unsigned HOST_WIDE_INT seg_len_a2
= tree_to_uhwi (dr_a2
->seg_len
);
3239 dr_a1
->seg_len
= size_int (MAX (seg_len_a1
, diff
+ seg_len_a2
));
3242 /* Now we check if the following condition is satisfied:
3244 DIFF - SEGMENT_LENGTH_A < SEGMENT_LENGTH_B
3246 where DIFF = DR_A2_INIT - DR_A1_INIT. However,
3247 SEGMENT_LENGTH_A or SEGMENT_LENGTH_B may not be constant so we
3248 have to make a best estimation. We can get the minimum value
3249 of SEGMENT_LENGTH_B as a constant, represented by MIN_SEG_LEN_B,
3250 then either of the following two conditions can guarantee the
3253 1: DIFF <= MIN_SEG_LEN_B
3254 2: DIFF - SEGMENT_LENGTH_A < MIN_SEG_LEN_B */
3257 unsigned HOST_WIDE_INT min_seg_len_b
3258 = (tree_fits_uhwi_p (dr_b1
->seg_len
)
3259 ? tree_to_uhwi (dr_b1
->seg_len
)
3262 if (diff
<= min_seg_len_b
3263 || (tree_fits_uhwi_p (dr_a1
->seg_len
)
3264 && diff
- tree_to_uhwi (dr_a1
->seg_len
) < min_seg_len_b
))
3266 dr_a1
->seg_len
= size_binop (PLUS_EXPR
,
3267 dr_a2
->seg_len
, size_int (diff
));
3274 if (dump_enabled_p ())
3276 dump_printf_loc (MSG_NOTE
, vect_location
,
3277 "merging ranges for ");
3278 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dr_a1
->dr
));
3279 dump_printf (MSG_NOTE
, ", ");
3280 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dr_b1
->dr
));
3281 dump_printf (MSG_NOTE
, " and ");
3282 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dr_a2
->dr
));
3283 dump_printf (MSG_NOTE
, ", ");
3284 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dr_b2
->dr
));
3285 dump_printf (MSG_NOTE
, "\n");
3287 comp_alias_ddrs
.ordered_remove (i
--);
3292 dump_printf_loc (MSG_NOTE
, vect_location
,
3293 "improved number of alias checks from %d to %d\n",
3294 may_alias_ddrs
.length (), comp_alias_ddrs
.length ());
3295 if ((int) comp_alias_ddrs
.length () >
3296 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS
))
3298 if (dump_enabled_p ())
3299 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3300 "number of versioning for alias "
3301 "run-time tests exceeds %d "
3302 "(--param vect-max-version-for-alias-checks)\n",
3303 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS
));
3307 /* All alias checks have been resolved at compilation time. */
3308 if (!comp_alias_ddrs
.length ())
3309 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo
).truncate (0);
3314 /* Return true if a non-affine read or write in STMT is suitable for a
3315 gather load or scatter store. Describe the operation in *INFO if so. */
3318 vect_check_gather_scatter (gimple
*stmt
, loop_vec_info loop_vinfo
,
3319 gather_scatter_info
*info
)
3321 HOST_WIDE_INT scale
= 1, pbitpos
, pbitsize
;
3322 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
3323 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
3324 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
3325 tree offtype
= NULL_TREE
;
3326 tree decl
, base
, off
;
3328 int punsignedp
, reversep
, pvolatilep
= 0;
3331 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3332 see if we can use the def stmt of the address. */
3333 if (is_gimple_call (stmt
)
3334 && gimple_call_internal_p (stmt
)
3335 && (gimple_call_internal_fn (stmt
) == IFN_MASK_LOAD
3336 || gimple_call_internal_fn (stmt
) == IFN_MASK_STORE
)
3337 && TREE_CODE (base
) == MEM_REF
3338 && TREE_CODE (TREE_OPERAND (base
, 0)) == SSA_NAME
3339 && integer_zerop (TREE_OPERAND (base
, 1))
3340 && !expr_invariant_in_loop_p (loop
, TREE_OPERAND (base
, 0)))
3342 gimple
*def_stmt
= SSA_NAME_DEF_STMT (TREE_OPERAND (base
, 0));
3343 if (is_gimple_assign (def_stmt
)
3344 && gimple_assign_rhs_code (def_stmt
) == ADDR_EXPR
)
3345 base
= TREE_OPERAND (gimple_assign_rhs1 (def_stmt
), 0);
3348 /* The gather and scatter builtins need address of the form
3349 loop_invariant + vector * {1, 2, 4, 8}
3351 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3352 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3353 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3354 multiplications and additions in it. To get a vector, we need
3355 a single SSA_NAME that will be defined in the loop and will
3356 contain everything that is not loop invariant and that can be
3357 vectorized. The following code attempts to find such a preexistng
3358 SSA_NAME OFF and put the loop invariants into a tree BASE
3359 that can be gimplified before the loop. */
3360 base
= get_inner_reference (base
, &pbitsize
, &pbitpos
, &off
, &pmode
,
3361 &punsignedp
, &reversep
, &pvolatilep
);
3362 gcc_assert (base
&& (pbitpos
% BITS_PER_UNIT
) == 0 && !reversep
);
3364 if (TREE_CODE (base
) == MEM_REF
)
3366 if (!integer_zerop (TREE_OPERAND (base
, 1)))
3368 if (off
== NULL_TREE
)
3370 offset_int moff
= mem_ref_offset (base
);
3371 off
= wide_int_to_tree (sizetype
, moff
);
3374 off
= size_binop (PLUS_EXPR
, off
,
3375 fold_convert (sizetype
, TREE_OPERAND (base
, 1)));
3377 base
= TREE_OPERAND (base
, 0);
3380 base
= build_fold_addr_expr (base
);
3382 if (off
== NULL_TREE
)
3383 off
= size_zero_node
;
3385 /* If base is not loop invariant, either off is 0, then we start with just
3386 the constant offset in the loop invariant BASE and continue with base
3387 as OFF, otherwise give up.
3388 We could handle that case by gimplifying the addition of base + off
3389 into some SSA_NAME and use that as off, but for now punt. */
3390 if (!expr_invariant_in_loop_p (loop
, base
))
3392 if (!integer_zerop (off
))
3395 base
= size_int (pbitpos
/ BITS_PER_UNIT
);
3397 /* Otherwise put base + constant offset into the loop invariant BASE
3398 and continue with OFF. */
3401 base
= fold_convert (sizetype
, base
);
3402 base
= size_binop (PLUS_EXPR
, base
, size_int (pbitpos
/ BITS_PER_UNIT
));
3405 /* OFF at this point may be either a SSA_NAME or some tree expression
3406 from get_inner_reference. Try to peel off loop invariants from it
3407 into BASE as long as possible. */
3409 while (offtype
== NULL_TREE
)
3411 enum tree_code code
;
3412 tree op0
, op1
, add
= NULL_TREE
;
3414 if (TREE_CODE (off
) == SSA_NAME
)
3416 gimple
*def_stmt
= SSA_NAME_DEF_STMT (off
);
3418 if (expr_invariant_in_loop_p (loop
, off
))
3421 if (gimple_code (def_stmt
) != GIMPLE_ASSIGN
)
3424 op0
= gimple_assign_rhs1 (def_stmt
);
3425 code
= gimple_assign_rhs_code (def_stmt
);
3426 op1
= gimple_assign_rhs2 (def_stmt
);
3430 if (get_gimple_rhs_class (TREE_CODE (off
)) == GIMPLE_TERNARY_RHS
)
3432 code
= TREE_CODE (off
);
3433 extract_ops_from_tree (off
, &code
, &op0
, &op1
);
3437 case POINTER_PLUS_EXPR
:
3439 if (expr_invariant_in_loop_p (loop
, op0
))
3444 add
= fold_convert (sizetype
, add
);
3446 add
= size_binop (MULT_EXPR
, add
, size_int (scale
));
3447 base
= size_binop (PLUS_EXPR
, base
, add
);
3450 if (expr_invariant_in_loop_p (loop
, op1
))
3458 if (expr_invariant_in_loop_p (loop
, op1
))
3460 add
= fold_convert (sizetype
, op1
);
3461 add
= size_binop (MINUS_EXPR
, size_zero_node
, add
);
3467 if (scale
== 1 && tree_fits_shwi_p (op1
))
3469 scale
= tree_to_shwi (op1
);
3478 if (!POINTER_TYPE_P (TREE_TYPE (op0
))
3479 && !INTEGRAL_TYPE_P (TREE_TYPE (op0
)))
3481 if (TYPE_PRECISION (TREE_TYPE (op0
))
3482 == TYPE_PRECISION (TREE_TYPE (off
)))
3487 if (TYPE_PRECISION (TREE_TYPE (op0
))
3488 < TYPE_PRECISION (TREE_TYPE (off
)))
3491 offtype
= TREE_TYPE (off
);
3502 /* If at the end OFF still isn't a SSA_NAME or isn't
3503 defined in the loop, punt. */
3504 if (TREE_CODE (off
) != SSA_NAME
3505 || expr_invariant_in_loop_p (loop
, off
))
3508 if (offtype
== NULL_TREE
)
3509 offtype
= TREE_TYPE (off
);
3511 if (DR_IS_READ (dr
))
3512 decl
= targetm
.vectorize
.builtin_gather (STMT_VINFO_VECTYPE (stmt_info
),
3515 decl
= targetm
.vectorize
.builtin_scatter (STMT_VINFO_VECTYPE (stmt_info
),
3518 if (decl
== NULL_TREE
)
3524 info
->offset_dt
= vect_unknown_def_type
;
3525 info
->offset_vectype
= NULL_TREE
;
3526 info
->scale
= scale
;
/* vect_analyze_data_refs: validate every data reference collected for the
   loop (or basic block) in VINFO, attach each DR and its vector type to the
   corresponding stmt_vec_info, classify gather/scatter and SIMD-lane
   accesses, and update *MIN_VF from the widest vector type seen.

   NOTE(review): the extraction below has dropped physical source lines
   (original numbering is discontinuous: 3570→3574, 3647→3649, ...), so
   braces, `return false;` paths, `else` arms and some declarations are
   missing.  Verify against the pristine tree-vect-data-refs.c before
   editing this copy.  */
3530 /* Function vect_analyze_data_refs.
3532 Find all the data references in the loop or basic block.
3534 The general structure of the analysis of data refs in the vectorizer is as
3536 1- vect_analyze_data_refs(loop/bb): call
3537 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3538 in the loop/bb and their dependences.
3539 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3540 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3541 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3546 vect_analyze_data_refs (vec_info
*vinfo
, int *min_vf
)
3548 struct loop
*loop
= NULL
;
3550 struct data_reference
*dr
;
3553 if (dump_enabled_p ())
3554 dump_printf_loc (MSG_NOTE
, vect_location
,
3555 "=== vect_analyze_data_refs ===\n");
/* LOOP stays NULL for basic-block vectorization.  */
3557 if (loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
))
3558 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
3560 /* Go through the data-refs, check that the analysis succeeded. Update
3561 pointer from stmt_vec_info struct to DR and vectype. */
3563 vec
<data_reference_p
> datarefs
= vinfo
->datarefs
;
3564 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
3567 stmt_vec_info stmt_info
;
3568 tree base
, offset
, init
;
/* Per-DR classification: plain access, gather, scatter, or a
   GOMP SIMD lane access.  */
3569 enum { SG_NONE
, GATHER
, SCATTER
} gatherscatter
= SG_NONE
;
3570 bool simd_lane_access
= false;
3574 if (!dr
|| !DR_REF (dr
))
3576 if (dump_enabled_p ())
3577 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3578 "not vectorized: unhandled data-ref\n");
3582 stmt
= DR_STMT (dr
);
3583 stmt_info
= vinfo_for_stmt (stmt
);
3585 /* Discard clobbers from the dataref vector. We will remove
3586 clobber stmts during vectorization. */
3587 if (gimple_clobber_p (stmt
))
3590 if (i
== datarefs
.length () - 1)
3595 datarefs
.ordered_remove (i
);
3600 /* Check that analysis of the data-ref succeeded. */
3601 if (!DR_BASE_ADDRESS (dr
) || !DR_OFFSET (dr
) || !DR_INIT (dr
)
/* maybe_gather / maybe_scatter: non-volatile refs on a target that
   provides the corresponding builtin hook (declarations of the two
   bools themselves fall on lines dropped by the extraction).  */
3606 && !TREE_THIS_VOLATILE (DR_REF (dr
))
3607 && targetm
.vectorize
.builtin_gather
!= NULL
;
3610 && !TREE_THIS_VOLATILE (DR_REF (dr
))
3611 && targetm
.vectorize
.builtin_scatter
!= NULL
;
3612 bool maybe_simd_lane_access
3613 = is_a
<loop_vec_info
> (vinfo
) && loop
->simduid
;
3615 /* If target supports vector gather loads or scatter stores, or if
3616 this might be a SIMD lane access, see if they can't be used. */
3617 if (is_a
<loop_vec_info
> (vinfo
)
3618 && (maybe_gather
|| maybe_scatter
|| maybe_simd_lane_access
)
3619 && !nested_in_vect_loop_p (loop
, stmt
))
/* Re-analyze the ref with a fresh data_reference; for scatters the
   ref is treated as a write (last arg `maybe_scatter ? false : true`
   is the is_read flag).  */
3621 struct data_reference
*newdr
3622 = create_data_ref (NULL
, loop_containing_stmt (stmt
),
3623 DR_REF (dr
), stmt
, maybe_scatter
? false : true);
3624 gcc_assert (newdr
!= NULL
&& DR_REF (newdr
));
3625 if (DR_BASE_ADDRESS (newdr
)
3626 && DR_OFFSET (newdr
)
3629 && integer_zerop (DR_STEP (newdr
)))
3631 if (maybe_simd_lane_access
)
/* Recognize the GOMP simd-lane pattern: offset is
   (lane * element_size) where lane is the result of
   IFN_GOMP_SIMD_LANE for this loop's simduid.  */
3633 tree off
= DR_OFFSET (newdr
);
3635 if (TREE_CODE (DR_INIT (newdr
)) == INTEGER_CST
3636 && TREE_CODE (off
) == MULT_EXPR
3637 && tree_fits_uhwi_p (TREE_OPERAND (off
, 1)))
3639 tree step
= TREE_OPERAND (off
, 1);
3640 off
= TREE_OPERAND (off
, 0);
3642 if (CONVERT_EXPR_P (off
)
3643 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off
,
3645 < TYPE_PRECISION (TREE_TYPE (off
)))
3646 off
= TREE_OPERAND (off
, 0);
3647 if (TREE_CODE (off
) == SSA_NAME
)
3649 gimple
*def
= SSA_NAME_DEF_STMT (off
);
3650 tree reft
= TREE_TYPE (DR_REF (newdr
));
3651 if (is_gimple_call (def
)
3652 && gimple_call_internal_p (def
)
3653 && (gimple_call_internal_fn (def
)
3654 == IFN_GOMP_SIMD_LANE
))
3656 tree arg
= gimple_call_arg (def
, 0);
3657 gcc_assert (TREE_CODE (arg
) == SSA_NAME
);
3658 arg
= SSA_NAME_VAR (arg
);
3659 if (arg
== loop
->simduid
3661 && tree_int_cst_equal
3662 (TYPE_SIZE_UNIT (reft
),
/* Matched: rewrite the new DR as a unit-stride access over
   the simd array (offset 0, step = element size).  */
3665 DR_OFFSET (newdr
) = ssize_int (0);
3666 DR_STEP (newdr
) = step
;
3667 DR_ALIGNED_TO (newdr
)
3668 = size_int (BIGGEST_ALIGNMENT
);
3670 simd_lane_access
= true;
3676 if (!simd_lane_access
&& (maybe_gather
|| maybe_scatter
))
3680 gatherscatter
= GATHER
;
3682 gatherscatter
= SCATTER
;
3685 if (gatherscatter
== SG_NONE
&& !simd_lane_access
)
3686 free_data_ref (newdr
);
/* Neither gather/scatter nor simd-lane rescued the unanalyzable
   ref: for BB vectorization mark-and-continue, otherwise fail.  */
3689 if (gatherscatter
== SG_NONE
&& !simd_lane_access
)
3691 if (dump_enabled_p ())
3693 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3694 "not vectorized: data ref analysis "
3696 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3699 if (is_a
<bb_vec_info
> (vinfo
))
/* Reject constant base addresses.  */
3706 if (TREE_CODE (DR_BASE_ADDRESS (dr
)) == INTEGER_CST
)
3708 if (dump_enabled_p ())
3709 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3710 "not vectorized: base addr of dr is a "
3713 if (is_a
<bb_vec_info
> (vinfo
))
3716 if (gatherscatter
!= SG_NONE
|| simd_lane_access
)
/* Reject volatile accesses.  */
3721 if (TREE_THIS_VOLATILE (DR_REF (dr
)))
3723 if (dump_enabled_p ())
3725 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3726 "not vectorized: volatile type ");
3727 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3730 if (is_a
<bb_vec_info
> (vinfo
))
/* Reject statements that may throw internally.  */
3736 if (stmt_can_throw_internal (stmt
))
3738 if (dump_enabled_p ())
3740 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3741 "not vectorized: statement can throw an "
3743 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3746 if (is_a
<bb_vec_info
> (vinfo
))
3749 if (gatherscatter
!= SG_NONE
|| simd_lane_access
)
/* Reject bit-field component accesses.  */
3754 if (TREE_CODE (DR_REF (dr
)) == COMPONENT_REF
3755 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr
), 1)))
3757 if (dump_enabled_p ())
3759 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3760 "not vectorized: statement is bitfield "
3762 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3765 if (is_a
<bb_vec_info
> (vinfo
))
3768 if (gatherscatter
!= SG_NONE
|| simd_lane_access
)
3773 base
= unshare_expr (DR_BASE_ADDRESS (dr
));
3774 offset
= unshare_expr (DR_OFFSET (dr
));
3775 init
= unshare_expr (DR_INIT (dr
));
/* Only IFN_MASK_LOAD/IFN_MASK_STORE calls may carry a data ref.  */
3777 if (is_gimple_call (stmt
)
3778 && (!gimple_call_internal_p (stmt
)
3779 || (gimple_call_internal_fn (stmt
) != IFN_MASK_LOAD
3780 && gimple_call_internal_fn (stmt
) != IFN_MASK_STORE
)))
3782 if (dump_enabled_p ())
3784 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3785 "not vectorized: dr in a call ");
3786 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3789 if (is_a
<bb_vec_info
> (vinfo
))
3792 if (gatherscatter
!= SG_NONE
|| simd_lane_access
)
3797 /* Update DR field in stmt_vec_info struct. */
3799 /* If the dataref is in an inner-loop of the loop that is considered for
3800 for vectorization, we also want to analyze the access relative to
3801 the outer-loop (DR contains information only relative to the
3802 inner-most enclosing loop). We do that by building a reference to the
3803 first location accessed by the inner-loop, and analyze it relative to
3805 if (loop
&& nested_in_vect_loop_p (loop
, stmt
))
3807 tree outer_step
, outer_base
, outer_init
;
3808 HOST_WIDE_INT pbitsize
, pbitpos
;
3811 int punsignedp
, preversep
, pvolatilep
;
3812 affine_iv base_iv
, offset_iv
;
3815 /* Build a reference to the first location accessed by the
3816 inner-loop: *(BASE+INIT). (The first location is actually
3817 BASE+INIT+OFFSET, but we add OFFSET separately later). */
3818 tree inner_base
= build_fold_indirect_ref
3819 (fold_build_pointer_plus (base
, init
));
3821 if (dump_enabled_p ())
3823 dump_printf_loc (MSG_NOTE
, vect_location
,
3824 "analyze in outer-loop: ");
3825 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, inner_base
);
3826 dump_printf (MSG_NOTE
, "\n");
3829 outer_base
= get_inner_reference (inner_base
, &pbitsize
, &pbitpos
,
3830 &poffset
, &pmode
, &punsignedp
,
3831 &preversep
, &pvolatilep
);
3832 gcc_assert (outer_base
!= NULL_TREE
);
3834 if (pbitpos
% BITS_PER_UNIT
!= 0)
3836 if (dump_enabled_p ())
3837 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3838 "failed: bit offset alignment.\n");
3844 if (dump_enabled_p ())
3845 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3846 "failed: reverse storage order.\n");
/* Both the base address and the offset must evolve as simple affine
   IVs of the outer loop.  */
3850 outer_base
= build_fold_addr_expr (outer_base
);
3851 if (!simple_iv (loop
, loop_containing_stmt (stmt
), outer_base
,
3854 if (dump_enabled_p ())
3855 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3856 "failed: evolution of base is not affine.\n");
3863 poffset
= fold_build2 (PLUS_EXPR
, TREE_TYPE (offset
), offset
,
3871 offset_iv
.base
= ssize_int (0);
3872 offset_iv
.step
= ssize_int (0);
3874 else if (!simple_iv (loop
, loop_containing_stmt (stmt
), poffset
,
3877 if (dump_enabled_p ())
3878 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3879 "evolution of offset is not affine.\n");
/* Fold the constant parts of both IV bases into OUTER_INIT and
   record the outer-loop view of the DR on STMT_INFO.  */
3883 outer_init
= ssize_int (pbitpos
/ BITS_PER_UNIT
);
3884 split_constant_offset (base_iv
.base
, &base_iv
.base
, &dinit
);
3885 outer_init
= size_binop (PLUS_EXPR
, outer_init
, dinit
);
3886 split_constant_offset (offset_iv
.base
, &offset_iv
.base
, &dinit
);
3887 outer_init
= size_binop (PLUS_EXPR
, outer_init
, dinit
);
3889 outer_step
= size_binop (PLUS_EXPR
,
3890 fold_convert (ssizetype
, base_iv
.step
),
3891 fold_convert (ssizetype
, offset_iv
.step
));
3893 STMT_VINFO_DR_STEP (stmt_info
) = outer_step
;
3894 /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
3895 STMT_VINFO_DR_BASE_ADDRESS (stmt_info
) = base_iv
.base
;
3896 STMT_VINFO_DR_INIT (stmt_info
) = outer_init
;
3897 STMT_VINFO_DR_OFFSET (stmt_info
) =
3898 fold_convert (ssizetype
, offset_iv
.base
);
3899 STMT_VINFO_DR_ALIGNED_TO (stmt_info
) =
3900 size_int (highest_pow2_factor (offset_iv
.base
));
3902 if (dump_enabled_p ())
3904 dump_printf_loc (MSG_NOTE
, vect_location
,
3905 "\touter base_address: ");
3906 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3907 STMT_VINFO_DR_BASE_ADDRESS (stmt_info
));
3908 dump_printf (MSG_NOTE
, "\n\touter offset from base address: ");
3909 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3910 STMT_VINFO_DR_OFFSET (stmt_info
));
3911 dump_printf (MSG_NOTE
,
3912 "\n\touter constant offset from base address: ");
3913 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3914 STMT_VINFO_DR_INIT (stmt_info
));
3915 dump_printf (MSG_NOTE
, "\n\touter step: ");
3916 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3917 STMT_VINFO_DR_STEP (stmt_info
));
3918 dump_printf (MSG_NOTE
, "\n\touter aligned to: ");
3919 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3920 STMT_VINFO_DR_ALIGNED_TO (stmt_info
));
3921 dump_printf (MSG_NOTE
, "\n");
/* A statement may own at most one data reference.  */
3925 if (STMT_VINFO_DATA_REF (stmt_info
))
3927 if (dump_enabled_p ())
3929 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3930 "not vectorized: more than one data ref "
3932 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3935 if (is_a
<bb_vec_info
> (vinfo
))
3938 if (gatherscatter
!= SG_NONE
|| simd_lane_access
)
3943 STMT_VINFO_DATA_REF (stmt_info
) = dr
;
3944 if (simd_lane_access
)
3946 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info
) = true;
3947 free_data_ref (datarefs
[i
]);
3951 /* Set vectype for STMT. */
3952 scalar_type
= TREE_TYPE (DR_REF (dr
));
3953 STMT_VINFO_VECTYPE (stmt_info
)
3954 = get_vectype_for_scalar_type (scalar_type
);
3955 if (!STMT_VINFO_VECTYPE (stmt_info
))
3957 if (dump_enabled_p ())
3959 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3960 "not vectorized: no vectype for stmt: ");
3961 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3962 dump_printf (MSG_MISSED_OPTIMIZATION
, " scalar_type: ");
3963 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_DETAILS
,
3965 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
3968 if (is_a
<bb_vec_info
> (vinfo
))
3970 /* No vector type is fine, the ref can still participate
3971 in dependence analysis, we just can't vectorize it. */
3972 STMT_VINFO_VECTORIZABLE (stmt_info
) = false;
3976 if (gatherscatter
!= SG_NONE
|| simd_lane_access
)
3978 STMT_VINFO_DATA_REF (stmt_info
) = NULL
;
3979 if (gatherscatter
!= SG_NONE
)
3986 if (dump_enabled_p ())
3988 dump_printf_loc (MSG_NOTE
, vect_location
,
3989 "got vectype for stmt: ");
3990 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, stmt
, 0);
3991 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3992 STMT_VINFO_VECTYPE (stmt_info
));
3993 dump_printf (MSG_NOTE
, "\n");
3997 /* Adjust the minimal vectorization factor according to the
3999 vf
= TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info
));
/* Gather/scatter: re-validate via vect_check_gather_scatter and make
   sure the offset itself is vectorizable; otherwise drop the DR.  */
4003 if (gatherscatter
!= SG_NONE
)
4005 gather_scatter_info gs_info
;
4006 if (!vect_check_gather_scatter (stmt
, as_a
<loop_vec_info
> (vinfo
),
4008 || !get_vectype_for_scalar_type (TREE_TYPE (gs_info
.offset
)))
4010 STMT_VINFO_DATA_REF (stmt_info
) = NULL
;
4012 if (dump_enabled_p ())
4014 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4015 (gatherscatter
== GATHER
) ?
4016 "not vectorized: not suitable for gather "
4018 "not vectorized: not suitable for scatter "
4020 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
4025 free_data_ref (datarefs
[i
]);
4027 STMT_VINFO_GATHER_SCATTER_P (stmt_info
) = gatherscatter
;
/* Non-constant step in a loop: strided access, unless nested in an
   outer vectorized loop (unsupported).  */
4030 else if (is_a
<loop_vec_info
> (vinfo
)
4031 && TREE_CODE (DR_STEP (dr
)) != INTEGER_CST
)
4033 if (nested_in_vect_loop_p (loop
, stmt
))
4035 if (dump_enabled_p ())
4037 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4038 "not vectorized: not suitable for strided "
4040 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
4044 STMT_VINFO_STRIDED_P (stmt_info
) = true;
4048 /* If we stopped analysis at the first dataref we could not analyze
4049 when trying to vectorize a basic-block mark the rest of the datarefs
4050 as not vectorizable and truncate the vector of datarefs. That
4051 avoids spending useless time in analyzing their dependence. */
4052 if (i
!= datarefs
.length ())
4054 gcc_assert (is_a
<bb_vec_info
> (vinfo
));
4055 for (unsigned j
= i
; j
< datarefs
.length (); ++j
)
4057 data_reference_p dr
= datarefs
[j
];
4058 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr
))) = false;
4061 datarefs
.truncate (i
);
/* vect_get_new_vect_var: create a fresh temporary register of TYPE whose
   name is a VAR_KIND-dependent prefix, with "_NAME" appended when NAME is
   non-NULL.

   NOTE(review): the prefix assignments for each case label, the switch
   header, braces and the `free (tmp)` / default arm fall on lines dropped
   by this extraction — confirm against the original file.  */
4068 /* Function vect_get_new_vect_var.
4070 Returns a name for a new variable. The current naming scheme appends the
4071 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
4072 the name of vectorizer generated variables, and appends that to NAME if
4076 vect_get_new_vect_var (tree type
, enum vect_var_kind var_kind
, const char *name
)
/* Choose the prefix from the variable kind.  */
4083 case vect_simple_var
:
4086 case vect_scalar_var
:
4092 case vect_pointer_var
:
/* With a caller-supplied NAME, build "prefix_NAME" for the temporary.  */
4101 char* tmp
= concat (prefix
, "_", name
, NULL
);
4102 new_vect_var
= create_tmp_reg (type
, tmp
);
/* Otherwise the bare prefix is used.  */
4106 new_vect_var
= create_tmp_reg (type
, prefix
);
4108 return new_vect_var
;
/* vect_get_new_ssa_name: same prefix scheme as vect_get_new_vect_var, but
   produces an anonymous SSA name (make_temp_ssa_name) instead of a
   temporary register.

   NOTE(review): prefix assignments, switch header and cleanup lines are
   missing from this extraction — confirm against the original file.  */
4111 /* Like vect_get_new_vect_var but return an SSA name. */
4114 vect_get_new_ssa_name (tree type
, enum vect_var_kind var_kind
, const char *name
)
/* Choose the prefix from the variable kind.  */
4121 case vect_simple_var
:
4124 case vect_scalar_var
:
4127 case vect_pointer_var
:
/* With a caller-supplied NAME, the SSA name is "prefix_NAME".  */
4136 char* tmp
= concat (prefix
, "_", name
, NULL
);
4137 new_vect_var
= make_temp_ssa_name (type
, NULL
, tmp
);
/* Otherwise the bare prefix is used.  */
4141 new_vect_var
= make_temp_ssa_name (type
, NULL
, prefix
);
4143 return new_vect_var
;
/* vect_duplicate_ssa_name_ptr_info: copy DR's points-to info onto SSA name
   NAME, then set NAME's known alignment to the vector type's alignment and
   DR's recorded misalignment — or mark it unknown.

   NOTE(review): the condition selecting between the mark-unknown and
   set-alignment calls sits on a line dropped by this extraction
   (presumably testing DR_MISALIGNMENT against -1) — TODO confirm against
   the original file.  */
4146 /* Duplicate ptr info and set alignment/misaligment on NAME from DR. */
4149 vect_duplicate_ssa_name_ptr_info (tree name
, data_reference
*dr
,
4150 stmt_vec_info stmt_info
)
4152 duplicate_ssa_name_ptr_info (name
, DR_PTR_INFO (dr
));
/* Alignment comes from the statement's vector type; misalignment from
   the data reference.  */
4153 unsigned int align
= TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmt_info
));
4154 int misalign
= DR_MISALIGNMENT (dr
);
4156 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name
));
4158 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name
), align
, misalign
);
/* vect_create_addr_base_for_vector_ref: build (and gimplify into
   *NEW_STMT_LIST) an expression for the address of the first memory
   location accessed by STMT's data reference, optionally displaced by
   OFFSET (in elements) and BYTE_OFFSET (in bytes).

   NOTE(review): this extraction has dropped physical source lines
   (declarations of data_ref_base/base_offset/init/addr_base/dest, several
   if/else headers, the final return) — verify against the pristine
   tree-vect-data-refs.c before editing this copy.  */
4161 /* Function vect_create_addr_base_for_vector_ref.
4163 Create an expression that computes the address of the first memory location
4164 that will be accessed for a data reference.
4167 STMT: The statement containing the data reference.
4168 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4169 OFFSET: Optional. If supplied, it is be added to the initial address.
4170 LOOP: Specify relative to which loop-nest should the address be computed.
4171 For example, when the dataref is in an inner-loop nested in an
4172 outer-loop that is now being vectorized, LOOP can be either the
4173 outer-loop, or the inner-loop. The first memory location accessed
4174 by the following dataref ('in' points to short):
4181 if LOOP=i_loop: &in (relative to i_loop)
4182 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4183 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4184 initial address. Unlike OFFSET, which is number of elements to
4185 be added, BYTE_OFFSET is measured in bytes.
4188 1. Return an SSA_NAME whose value is the address of the memory location of
4189 the first vector of the data reference.
4190 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4191 these statement(s) which define the returned SSA_NAME.
4193 FORNOW: We are only handling array accesses with step 1. */
4196 vect_create_addr_base_for_vector_ref (gimple
*stmt
,
4197 gimple_seq
*new_stmt_list
,
4202 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
4203 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
4205 const char *base_name
;
4208 gimple_seq seq
= NULL
;
/* STEP is the element size in bytes of the referenced type.  */
4212 tree step
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr
)));
4213 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
/* Outer-loop vectorization of a nested dataref: take the DR components
   recorded for the outer loop on STMT_INFO instead of the DR itself.  */
4215 if (loop_vinfo
&& loop
&& loop
!= (gimple_bb (stmt
))->loop_father
)
4217 struct loop
*outer_loop
= LOOP_VINFO_LOOP (loop_vinfo
);
4219 gcc_assert (nested_in_vect_loop_p (outer_loop
, stmt
));
4221 data_ref_base
= unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info
));
4222 base_offset
= unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info
));
4223 init
= unshare_expr (STMT_VINFO_DR_INIT (stmt_info
));
/* Normal case: use the DR's own base/offset/init.  */
4227 data_ref_base
= unshare_expr (DR_BASE_ADDRESS (dr
));
4228 base_offset
= unshare_expr (DR_OFFSET (dr
));
4229 init
= unshare_expr (DR_INIT (dr
));
4233 base_name
= get_name (data_ref_base
);
4236 base_offset
= ssize_int (0);
4237 init
= ssize_int (0);
4238 base_name
= get_name (DR_REF (dr
));
4241 /* Create base_offset */
4242 base_offset
= size_binop (PLUS_EXPR
,
4243 fold_convert (sizetype
, base_offset
),
4244 fold_convert (sizetype
, init
));
/* Element-count OFFSET is scaled by the element size before adding.  */
4248 offset
= fold_build2 (MULT_EXPR
, sizetype
,
4249 fold_convert (sizetype
, offset
), step
);
4250 base_offset
= fold_build2 (PLUS_EXPR
, sizetype
,
4251 base_offset
, offset
);
/* BYTE_OFFSET is added as-is.  */
4255 byte_offset
= fold_convert (sizetype
, byte_offset
);
4256 base_offset
= fold_build2 (PLUS_EXPR
, sizetype
,
4257 base_offset
, byte_offset
);
4260 /* base + base_offset */
4262 addr_base
= fold_build_pointer_plus (data_ref_base
, base_offset
);
/* Alternative form: take the address of the reference directly.  */
4265 addr_base
= build1 (ADDR_EXPR
,
4266 build_pointer_type (TREE_TYPE (DR_REF (dr
))),
4267 unshare_expr (DR_REF (dr
)));
/* Gimplify the address into DEST, collecting the defining statements
   into SEQ and appending them to *NEW_STMT_LIST.  */
4270 vect_ptr_type
= build_pointer_type (STMT_VINFO_VECTYPE (stmt_info
));
4271 dest
= vect_get_new_vect_var (vect_ptr_type
, vect_pointer_var
, base_name
);
4272 addr_base
= force_gimple_operand (addr_base
, &seq
, true, dest
);
4273 gimple_seq_add_seq (new_stmt_list
, seq
);
/* Propagate points-to/alignment info onto the new SSA name; any extra
   displacement invalidates the recorded alignment.  */
4275 if (DR_PTR_INFO (dr
)
4276 && TREE_CODE (addr_base
) == SSA_NAME
4277 && !SSA_NAME_PTR_INFO (addr_base
))
4279 vect_duplicate_ssa_name_ptr_info (addr_base
, dr
, stmt_info
);
4280 if (offset
|| byte_offset
)
4281 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base
));
4284 if (dump_enabled_p ())
4286 dump_printf_loc (MSG_NOTE
, vect_location
, "created ");
4287 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, addr_base
);
4288 dump_printf (MSG_NOTE
, "\n");
4295 /* Function vect_create_data_ref_ptr.
4297 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4298 location accessed in the loop by STMT, along with the def-use update
4299 chain to appropriately advance the pointer through the loop iterations.
4300 Also set aliasing information for the pointer. This pointer is used by
4301 the callers to this function to create a memory reference expression for
4302 vector load/store access.
4305 1. STMT: a stmt that references memory. Expected to be of the form
4306 GIMPLE_ASSIGN <name, data-ref> or
4307 GIMPLE_ASSIGN <data-ref, name>.
4308 2. AGGR_TYPE: the type of the reference, which should be either a vector
4310 3. AT_LOOP: the loop where the vector memref is to be created.
4311 4. OFFSET (optional): an offset to be added to the initial address accessed
4312 by the data-ref in STMT.
4313 5. BSI: location where the new stmts are to be placed if there is no loop
4314 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4315 pointing to the initial address.
4316 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4317 to the initial address accessed by the data-ref in STMT. This is
4318 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4322 1. Declare a new ptr to vector_type, and have it point to the base of the
4323 data reference (initial addressed accessed by the data reference).
4324 For example, for vector of type V8HI, the following code is generated:
4327 ap = (v8hi *)initial_address;
4329 if OFFSET is not supplied:
4330 initial_address = &a[init];
4331 if OFFSET is supplied:
4332 initial_address = &a[init + OFFSET];
4333 if BYTE_OFFSET is supplied:
4334 initial_address = &a[init] + BYTE_OFFSET;
4336 Return the initial_address in INITIAL_ADDRESS.
4338 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4339 update the pointer in each iteration of the loop.
4341 Return the increment stmt that updates the pointer in PTR_INCR.
4343 3. Set INV_P to true if the access pattern of the data reference in the
4344 vectorized loop is invariant. Set it to false otherwise.
4346 4. Return the pointer. */
4349 vect_create_data_ref_ptr (gimple
*stmt
, tree aggr_type
, struct loop
*at_loop
,
4350 tree offset
, tree
*initial_address
,
4351 gimple_stmt_iterator
*gsi
, gimple
**ptr_incr
,
4352 bool only_init
, bool *inv_p
, tree byte_offset
)
4354 const char *base_name
;
4355 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
4356 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
4357 struct loop
*loop
= NULL
;
4358 bool nested_in_vect_loop
= false;
4359 struct loop
*containing_loop
= NULL
;
4363 gimple_seq new_stmt_list
= NULL
;
4367 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
4369 gimple_stmt_iterator incr_gsi
;
4371 tree indx_before_incr
, indx_after_incr
;
4374 bb_vec_info bb_vinfo
= STMT_VINFO_BB_VINFO (stmt_info
);
4376 gcc_assert (TREE_CODE (aggr_type
) == ARRAY_TYPE
4377 || TREE_CODE (aggr_type
) == VECTOR_TYPE
);
4381 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
4382 nested_in_vect_loop
= nested_in_vect_loop_p (loop
, stmt
);
4383 containing_loop
= (gimple_bb (stmt
))->loop_father
;
4384 pe
= loop_preheader_edge (loop
);
4388 gcc_assert (bb_vinfo
);
4393 /* Check the step (evolution) of the load in LOOP, and record
4394 whether it's invariant. */
4395 if (nested_in_vect_loop
)
4396 step
= STMT_VINFO_DR_STEP (stmt_info
);
4398 step
= DR_STEP (STMT_VINFO_DATA_REF (stmt_info
));
4400 if (integer_zerop (step
))
4405 /* Create an expression for the first address accessed by this load
4407 base_name
= get_name (DR_BASE_ADDRESS (dr
));
4409 if (dump_enabled_p ())
4411 tree dr_base_type
= TREE_TYPE (DR_BASE_OBJECT (dr
));
4412 dump_printf_loc (MSG_NOTE
, vect_location
,
4413 "create %s-pointer variable to type: ",
4414 get_tree_code_name (TREE_CODE (aggr_type
)));
4415 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, aggr_type
);
4416 if (TREE_CODE (dr_base_type
) == ARRAY_TYPE
)
4417 dump_printf (MSG_NOTE
, " vectorizing an array ref: ");
4418 else if (TREE_CODE (dr_base_type
) == VECTOR_TYPE
)
4419 dump_printf (MSG_NOTE
, " vectorizing a vector ref: ");
4420 else if (TREE_CODE (dr_base_type
) == RECORD_TYPE
)
4421 dump_printf (MSG_NOTE
, " vectorizing a record based array ref: ");
4423 dump_printf (MSG_NOTE
, " vectorizing a pointer ref: ");
4424 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_BASE_OBJECT (dr
));
4425 dump_printf (MSG_NOTE
, "\n");
4428 /* (1) Create the new aggregate-pointer variable.
4429 Vector and array types inherit the alias set of their component
4430 type by default so we need to use a ref-all pointer if the data
4431 reference does not conflict with the created aggregated data
4432 reference because it is not addressable. */
4433 bool need_ref_all
= false;
4434 if (!alias_sets_conflict_p (get_alias_set (aggr_type
),
4435 get_alias_set (DR_REF (dr
))))
4436 need_ref_all
= true;
4437 /* Likewise for any of the data references in the stmt group. */
4438 else if (STMT_VINFO_GROUP_SIZE (stmt_info
) > 1)
4440 gimple
*orig_stmt
= STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info
);
4443 stmt_vec_info sinfo
= vinfo_for_stmt (orig_stmt
);
4444 struct data_reference
*sdr
= STMT_VINFO_DATA_REF (sinfo
);
4445 if (!alias_sets_conflict_p (get_alias_set (aggr_type
),
4446 get_alias_set (DR_REF (sdr
))))
4448 need_ref_all
= true;
4451 orig_stmt
= STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo
);
4455 aggr_ptr_type
= build_pointer_type_for_mode (aggr_type
, ptr_mode
,
4457 aggr_ptr
= vect_get_new_vect_var (aggr_ptr_type
, vect_pointer_var
, base_name
);
4460 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4461 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4462 def-use update cycles for the pointer: one relative to the outer-loop
4463 (LOOP), which is what steps (3) and (4) below do. The other is relative
4464 to the inner-loop (which is the inner-most loop containing the dataref),
4465 and this is done be step (5) below.
4467 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4468 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4469 redundant. Steps (3),(4) create the following:
4472 LOOP: vp1 = phi(vp0,vp2)
4478 If there is an inner-loop nested in loop, then step (5) will also be
4479 applied, and an additional update in the inner-loop will be created:
4482 LOOP: vp1 = phi(vp0,vp2)
4484 inner: vp3 = phi(vp1,vp4)
4485 vp4 = vp3 + inner_step
4491 /* (2) Calculate the initial address of the aggregate-pointer, and set
4492 the aggregate-pointer to point to it before the loop. */
4494 /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */
4496 new_temp
= vect_create_addr_base_for_vector_ref (stmt
, &new_stmt_list
,
4497 offset
, loop
, byte_offset
);
4502 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, new_stmt_list
);
4503 gcc_assert (!new_bb
);
4506 gsi_insert_seq_before (gsi
, new_stmt_list
, GSI_SAME_STMT
);
4509 *initial_address
= new_temp
;
4510 aggr_ptr_init
= new_temp
;
4512 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4513 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4514 inner-loop nested in LOOP (during outer-loop vectorization). */
4516 /* No update in loop is required. */
4517 if (only_init
&& (!loop_vinfo
|| at_loop
== loop
))
4518 aptr
= aggr_ptr_init
;
4521 /* The step of the aggregate pointer is the type size. */
4522 tree iv_step
= TYPE_SIZE_UNIT (aggr_type
);
4523 /* One exception to the above is when the scalar step of the load in
4524 LOOP is zero. In this case the step here is also zero. */
4526 iv_step
= size_zero_node
;
4527 else if (tree_int_cst_sgn (step
) == -1)
4528 iv_step
= fold_build1 (NEGATE_EXPR
, TREE_TYPE (iv_step
), iv_step
);
4530 standard_iv_increment_position (loop
, &incr_gsi
, &insert_after
);
4532 create_iv (aggr_ptr_init
,
4533 fold_convert (aggr_ptr_type
, iv_step
),
4534 aggr_ptr
, loop
, &incr_gsi
, insert_after
,
4535 &indx_before_incr
, &indx_after_incr
);
4536 incr
= gsi_stmt (incr_gsi
);
4537 set_vinfo_for_stmt (incr
, new_stmt_vec_info (incr
, loop_vinfo
));
4539 /* Copy the points-to information if it exists. */
4540 if (DR_PTR_INFO (dr
))
4542 vect_duplicate_ssa_name_ptr_info (indx_before_incr
, dr
, stmt_info
);
4543 vect_duplicate_ssa_name_ptr_info (indx_after_incr
, dr
, stmt_info
);
4548 aptr
= indx_before_incr
;
4551 if (!nested_in_vect_loop
|| only_init
)
4555 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4556 nested in LOOP, if exists. */
4558 gcc_assert (nested_in_vect_loop
);
4561 standard_iv_increment_position (containing_loop
, &incr_gsi
,
4563 create_iv (aptr
, fold_convert (aggr_ptr_type
, DR_STEP (dr
)), aggr_ptr
,
4564 containing_loop
, &incr_gsi
, insert_after
, &indx_before_incr
,
4566 incr
= gsi_stmt (incr_gsi
);
4567 set_vinfo_for_stmt (incr
, new_stmt_vec_info (incr
, loop_vinfo
));
4569 /* Copy the points-to information if it exists. */
4570 if (DR_PTR_INFO (dr
))
4572 vect_duplicate_ssa_name_ptr_info (indx_before_incr
, dr
, stmt_info
);
4573 vect_duplicate_ssa_name_ptr_info (indx_after_incr
, dr
, stmt_info
);
4578 return indx_before_incr
;
4585 /* Function bump_vector_ptr
4587 Increment a pointer (to a vector type) by vector-size. If requested,
4588 i.e. if PTR-INCR is given, then also connect the new increment stmt
4589 to the existing def-use update-chain of the pointer, by modifying
4590 the PTR_INCR as illustrated below:
4592 The pointer def-use update-chain before this function:
4593 DATAREF_PTR = phi (p_0, p_2)
4595 PTR_INCR: p_2 = DATAREF_PTR + step
4597 The pointer def-use update-chain after this function:
4598 DATAREF_PTR = phi (p_0, p_2)
4600 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4602 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4605 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4607 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4608 the loop. The increment amount across iterations is expected
4610 BSI - location where the new update stmt is to be placed.
4611 STMT - the original scalar memory-access stmt that is being vectorized.
4612 BUMP - optional. The offset by which to bump the pointer. If not given,
4613 the offset is assumed to be vector_size.
4615 Output: Return NEW_DATAREF_PTR as illustrated above.
4620 bump_vector_ptr (tree dataref_ptr
, gimple
*ptr_incr
, gimple_stmt_iterator
*gsi
,
4621 gimple
*stmt
, tree bump
)
4623 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
4624 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
4625 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
4626 tree update
= TYPE_SIZE_UNIT (vectype
);
4629 use_operand_p use_p
;
4630 tree new_dataref_ptr
;
4635 if (TREE_CODE (dataref_ptr
) == SSA_NAME
)
4636 new_dataref_ptr
= copy_ssa_name (dataref_ptr
);
4638 new_dataref_ptr
= make_ssa_name (TREE_TYPE (dataref_ptr
));
4639 incr_stmt
= gimple_build_assign (new_dataref_ptr
, POINTER_PLUS_EXPR
,
4640 dataref_ptr
, update
);
4641 vect_finish_stmt_generation (stmt
, incr_stmt
, gsi
);
4643 /* Copy the points-to information if it exists. */
4644 if (DR_PTR_INFO (dr
))
4646 duplicate_ssa_name_ptr_info (new_dataref_ptr
, DR_PTR_INFO (dr
));
4647 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr
));
4651 return new_dataref_ptr
;
4653 /* Update the vector-pointer's cross-iteration increment. */
4654 FOR_EACH_SSA_USE_OPERAND (use_p
, ptr_incr
, iter
, SSA_OP_USE
)
4656 tree use
= USE_FROM_PTR (use_p
);
4658 if (use
== dataref_ptr
)
4659 SET_USE (use_p
, new_dataref_ptr
);
4661 gcc_assert (tree_int_cst_compare (use
, update
) == 0);
4664 return new_dataref_ptr
;
4668 /* Function vect_create_destination_var.
4670 Create a new temporary of type VECTYPE. */
4673 vect_create_destination_var (tree scalar_dest
, tree vectype
)
4679 enum vect_var_kind kind
;
4682 ? VECTOR_BOOLEAN_TYPE_P (vectype
)
4686 type
= vectype
? vectype
: TREE_TYPE (scalar_dest
);
4688 gcc_assert (TREE_CODE (scalar_dest
) == SSA_NAME
);
4690 name
= get_name (scalar_dest
);
4692 new_name
= xasprintf ("%s_%u", name
, SSA_NAME_VERSION (scalar_dest
));
4694 new_name
= xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest
));
4695 vec_dest
= vect_get_new_vect_var (type
, kind
, new_name
);
4701 /* Function vect_grouped_store_supported.
4703 Returns TRUE if interleave high and interleave low permutations
4704 are supported, and FALSE otherwise. */
4707 vect_grouped_store_supported (tree vectype
, unsigned HOST_WIDE_INT count
)
4709 machine_mode mode
= TYPE_MODE (vectype
);
4711 /* vect_permute_store_chain requires the group size to be equal to 3 or
4712 be a power of two. */
4713 if (count
!= 3 && exact_log2 (count
) == -1)
4715 if (dump_enabled_p ())
4716 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4717 "the size of the group of accesses"
4718 " is not a power of 2 or not eqaul to 3\n");
4722 /* Check that the permutation is supported. */
4723 if (VECTOR_MODE_P (mode
))
4725 unsigned int i
, nelt
= GET_MODE_NUNITS (mode
);
4726 unsigned char *sel
= XALLOCAVEC (unsigned char, nelt
);
4730 unsigned int j0
= 0, j1
= 0, j2
= 0;
4733 for (j
= 0; j
< 3; j
++)
4735 int nelt0
= ((3 - j
) * nelt
) % 3;
4736 int nelt1
= ((3 - j
) * nelt
+ 1) % 3;
4737 int nelt2
= ((3 - j
) * nelt
+ 2) % 3;
4738 for (i
= 0; i
< nelt
; i
++)
4740 if (3 * i
+ nelt0
< nelt
)
4741 sel
[3 * i
+ nelt0
] = j0
++;
4742 if (3 * i
+ nelt1
< nelt
)
4743 sel
[3 * i
+ nelt1
] = nelt
+ j1
++;
4744 if (3 * i
+ nelt2
< nelt
)
4745 sel
[3 * i
+ nelt2
] = 0;
4747 if (!can_vec_perm_p (mode
, false, sel
))
4749 if (dump_enabled_p ())
4750 dump_printf (MSG_MISSED_OPTIMIZATION
,
4751 "permutaion op not supported by target.\n");
4755 for (i
= 0; i
< nelt
; i
++)
4757 if (3 * i
+ nelt0
< nelt
)
4758 sel
[3 * i
+ nelt0
] = 3 * i
+ nelt0
;
4759 if (3 * i
+ nelt1
< nelt
)
4760 sel
[3 * i
+ nelt1
] = 3 * i
+ nelt1
;
4761 if (3 * i
+ nelt2
< nelt
)
4762 sel
[3 * i
+ nelt2
] = nelt
+ j2
++;
4764 if (!can_vec_perm_p (mode
, false, sel
))
4766 if (dump_enabled_p ())
4767 dump_printf (MSG_MISSED_OPTIMIZATION
,
4768 "permutaion op not supported by target.\n");
4776 /* If length is not equal to 3 then only power of 2 is supported. */
4777 gcc_assert (pow2p_hwi (count
));
4779 for (i
= 0; i
< nelt
/ 2; i
++)
4782 sel
[i
* 2 + 1] = i
+ nelt
;
4784 if (can_vec_perm_p (mode
, false, sel
))
4786 for (i
= 0; i
< nelt
; i
++)
4788 if (can_vec_perm_p (mode
, false, sel
))
4794 if (dump_enabled_p ())
4795 dump_printf (MSG_MISSED_OPTIMIZATION
,
4796 "permutaion op not supported by target.\n");
4801 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4805 vect_store_lanes_supported (tree vectype
, unsigned HOST_WIDE_INT count
)
4807 return vect_lanes_optab_supported_p ("vec_store_lanes",
4808 vec_store_lanes_optab
,
4813 /* Function vect_permute_store_chain.
4815 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4816 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4817 the data correctly for the stores. Return the final references for stores
4820 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4821 The input is 4 vectors each containing 8 elements. We assign a number to
4822 each element, the input sequence is:
4824 1st vec: 0 1 2 3 4 5 6 7
4825 2nd vec: 8 9 10 11 12 13 14 15
4826 3rd vec: 16 17 18 19 20 21 22 23
4827 4th vec: 24 25 26 27 28 29 30 31
4829 The output sequence should be:
4831 1st vec: 0 8 16 24 1 9 17 25
4832 2nd vec: 2 10 18 26 3 11 19 27
4833 3rd vec: 4 12 20 28 5 13 21 30
4834 4th vec: 6 14 22 30 7 15 23 31
4836 i.e., we interleave the contents of the four vectors in their order.
4838 We use interleave_high/low instructions to create such output. The input of
4839 each interleave_high/low operation is two vectors:
4842 the even elements of the result vector are obtained left-to-right from the
4843 high/low elements of the first vector. The odd elements of the result are
4844 obtained left-to-right from the high/low elements of the second vector.
4845 The output of interleave_high will be: 0 4 1 5
4846 and of interleave_low: 2 6 3 7
4849 The permutation is done in log LENGTH stages. In each stage interleave_high
4850 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4851 where the first argument is taken from the first half of DR_CHAIN and the
4852 second argument from it's second half.
4855 I1: interleave_high (1st vec, 3rd vec)
4856 I2: interleave_low (1st vec, 3rd vec)
4857 I3: interleave_high (2nd vec, 4th vec)
4858 I4: interleave_low (2nd vec, 4th vec)
4860 The output for the first stage is:
4862 I1: 0 16 1 17 2 18 3 19
4863 I2: 4 20 5 21 6 22 7 23
4864 I3: 8 24 9 25 10 26 11 27
4865 I4: 12 28 13 29 14 30 15 31
4867 The output of the second stage, i.e. the final result is:
4869 I1: 0 8 16 24 1 9 17 25
4870 I2: 2 10 18 26 3 11 19 27
4871 I3: 4 12 20 28 5 13 21 30
4872 I4: 6 14 22 30 7 15 23 31. */
4875 vect_permute_store_chain (vec
<tree
> dr_chain
,
4876 unsigned int length
,
4878 gimple_stmt_iterator
*gsi
,
4879 vec
<tree
> *result_chain
)
4881 tree vect1
, vect2
, high
, low
;
4883 tree vectype
= STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt
));
4884 tree perm_mask_low
, perm_mask_high
;
4886 tree perm3_mask_low
, perm3_mask_high
;
4887 unsigned int i
, n
, log_length
= exact_log2 (length
);
4888 unsigned int j
, nelt
= TYPE_VECTOR_SUBPARTS (vectype
);
4889 unsigned char *sel
= XALLOCAVEC (unsigned char, nelt
);
4891 result_chain
->quick_grow (length
);
4892 memcpy (result_chain
->address (), dr_chain
.address (),
4893 length
* sizeof (tree
));
4897 unsigned int j0
= 0, j1
= 0, j2
= 0;
4899 for (j
= 0; j
< 3; j
++)
4901 int nelt0
= ((3 - j
) * nelt
) % 3;
4902 int nelt1
= ((3 - j
) * nelt
+ 1) % 3;
4903 int nelt2
= ((3 - j
) * nelt
+ 2) % 3;
4905 for (i
= 0; i
< nelt
; i
++)
4907 if (3 * i
+ nelt0
< nelt
)
4908 sel
[3 * i
+ nelt0
] = j0
++;
4909 if (3 * i
+ nelt1
< nelt
)
4910 sel
[3 * i
+ nelt1
] = nelt
+ j1
++;
4911 if (3 * i
+ nelt2
< nelt
)
4912 sel
[3 * i
+ nelt2
] = 0;
4914 perm3_mask_low
= vect_gen_perm_mask_checked (vectype
, sel
);
4916 for (i
= 0; i
< nelt
; i
++)
4918 if (3 * i
+ nelt0
< nelt
)
4919 sel
[3 * i
+ nelt0
] = 3 * i
+ nelt0
;
4920 if (3 * i
+ nelt1
< nelt
)
4921 sel
[3 * i
+ nelt1
] = 3 * i
+ nelt1
;
4922 if (3 * i
+ nelt2
< nelt
)
4923 sel
[3 * i
+ nelt2
] = nelt
+ j2
++;
4925 perm3_mask_high
= vect_gen_perm_mask_checked (vectype
, sel
);
4927 vect1
= dr_chain
[0];
4928 vect2
= dr_chain
[1];
4930 /* Create interleaving stmt:
4931 low = VEC_PERM_EXPR <vect1, vect2,
4932 {j, nelt, *, j + 1, nelt + j + 1, *,
4933 j + 2, nelt + j + 2, *, ...}> */
4934 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3_low");
4935 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, vect1
,
4936 vect2
, perm3_mask_low
);
4937 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
4940 vect2
= dr_chain
[2];
4941 /* Create interleaving stmt:
4942 low = VEC_PERM_EXPR <vect1, vect2,
4943 {0, 1, nelt + j, 3, 4, nelt + j + 1,
4944 6, 7, nelt + j + 2, ...}> */
4945 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3_high");
4946 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, vect1
,
4947 vect2
, perm3_mask_high
);
4948 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
4949 (*result_chain
)[j
] = data_ref
;
4954 /* If length is not equal to 3 then only power of 2 is supported. */
4955 gcc_assert (pow2p_hwi (length
));
4957 for (i
= 0, n
= nelt
/ 2; i
< n
; i
++)
4960 sel
[i
* 2 + 1] = i
+ nelt
;
4962 perm_mask_high
= vect_gen_perm_mask_checked (vectype
, sel
);
4964 for (i
= 0; i
< nelt
; i
++)
4966 perm_mask_low
= vect_gen_perm_mask_checked (vectype
, sel
);
4968 for (i
= 0, n
= log_length
; i
< n
; i
++)
4970 for (j
= 0; j
< length
/2; j
++)
4972 vect1
= dr_chain
[j
];
4973 vect2
= dr_chain
[j
+length
/2];
4975 /* Create interleaving stmt:
4976 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
4978 high
= make_temp_ssa_name (vectype
, NULL
, "vect_inter_high");
4979 perm_stmt
= gimple_build_assign (high
, VEC_PERM_EXPR
, vect1
,
4980 vect2
, perm_mask_high
);
4981 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
4982 (*result_chain
)[2*j
] = high
;
4984 /* Create interleaving stmt:
4985 low = VEC_PERM_EXPR <vect1, vect2,
4986 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
4988 low
= make_temp_ssa_name (vectype
, NULL
, "vect_inter_low");
4989 perm_stmt
= gimple_build_assign (low
, VEC_PERM_EXPR
, vect1
,
4990 vect2
, perm_mask_low
);
4991 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
4992 (*result_chain
)[2*j
+1] = low
;
4994 memcpy (dr_chain
.address (), result_chain
->address (),
4995 length
* sizeof (tree
));
5000 /* Function vect_setup_realignment
5002 This function is called when vectorizing an unaligned load using
5003 the dr_explicit_realign[_optimized] scheme.
5004 This function generates the following code at the loop prolog:
5007 x msq_init = *(floor(p)); # prolog load
5008 realignment_token = call target_builtin;
5010 x msq = phi (msq_init, ---)
5012 The stmts marked with x are generated only for the case of
5013 dr_explicit_realign_optimized.
5015 The code above sets up a new (vector) pointer, pointing to the first
5016 location accessed by STMT, and a "floor-aligned" load using that pointer.
5017 It also generates code to compute the "realignment-token" (if the relevant
5018 target hook was defined), and creates a phi-node at the loop-header bb
5019 whose arguments are the result of the prolog-load (created by this
5020 function) and the result of a load that takes place in the loop (to be
5021 created by the caller to this function).
5023 For the case of dr_explicit_realign_optimized:
5024 The caller to this function uses the phi-result (msq) to create the
5025 realignment code inside the loop, and sets up the missing phi argument,
5028 msq = phi (msq_init, lsq)
5029 lsq = *(floor(p')); # load in loop
5030 result = realign_load (msq, lsq, realignment_token);
5032 For the case of dr_explicit_realign:
5034 msq = *(floor(p)); # load in loop
5036 lsq = *(floor(p')); # load in loop
5037 result = realign_load (msq, lsq, realignment_token);
5040 STMT - (scalar) load stmt to be vectorized. This load accesses
5041 a memory location that may be unaligned.
5042 BSI - place where new code is to be inserted.
5043 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5047 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5048 target hook, if defined.
5049 Return value - the result of the loop-header phi node. */
5052 vect_setup_realignment (gimple
*stmt
, gimple_stmt_iterator
*gsi
,
5053 tree
*realignment_token
,
5054 enum dr_alignment_support alignment_support_scheme
,
5056 struct loop
**at_loop
)
5058 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
5059 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
5060 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
5061 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
5062 struct loop
*loop
= NULL
;
5064 tree scalar_dest
= gimple_assign_lhs (stmt
);
5070 tree msq_init
= NULL_TREE
;
5073 tree msq
= NULL_TREE
;
5074 gimple_seq stmts
= NULL
;
5076 bool compute_in_loop
= false;
5077 bool nested_in_vect_loop
= false;
5078 struct loop
*containing_loop
= (gimple_bb (stmt
))->loop_father
;
5079 struct loop
*loop_for_initial_load
= NULL
;
5083 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
5084 nested_in_vect_loop
= nested_in_vect_loop_p (loop
, stmt
);
5087 gcc_assert (alignment_support_scheme
== dr_explicit_realign
5088 || alignment_support_scheme
== dr_explicit_realign_optimized
);
5090 /* We need to generate three things:
5091 1. the misalignment computation
5092 2. the extra vector load (for the optimized realignment scheme).
5093 3. the phi node for the two vectors from which the realignment is
5094 done (for the optimized realignment scheme). */
5096 /* 1. Determine where to generate the misalignment computation.
5098 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5099 calculation will be generated by this function, outside the loop (in the
5100 preheader). Otherwise, INIT_ADDR had already been computed for us by the
5101 caller, inside the loop.
5103 Background: If the misalignment remains fixed throughout the iterations of
5104 the loop, then both realignment schemes are applicable, and also the
5105 misalignment computation can be done outside LOOP. This is because we are
5106 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5107 are a multiple of VS (the Vector Size), and therefore the misalignment in
5108 different vectorized LOOP iterations is always the same.
5109 The problem arises only if the memory access is in an inner-loop nested
5110 inside LOOP, which is now being vectorized using outer-loop vectorization.
5111 This is the only case when the misalignment of the memory access may not
5112 remain fixed throughout the iterations of the inner-loop (as explained in
5113 detail in vect_supportable_dr_alignment). In this case, not only is the
5114 optimized realignment scheme not applicable, but also the misalignment
5115 computation (and generation of the realignment token that is passed to
5116 REALIGN_LOAD) have to be done inside the loop.
5118 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5119 or not, which in turn determines if the misalignment is computed inside
5120 the inner-loop, or outside LOOP. */
5122 if (init_addr
!= NULL_TREE
|| !loop_vinfo
)
5124 compute_in_loop
= true;
5125 gcc_assert (alignment_support_scheme
== dr_explicit_realign
);
5129 /* 2. Determine where to generate the extra vector load.
5131 For the optimized realignment scheme, instead of generating two vector
5132 loads in each iteration, we generate a single extra vector load in the
5133 preheader of the loop, and in each iteration reuse the result of the
5134 vector load from the previous iteration. In case the memory access is in
5135 an inner-loop nested inside LOOP, which is now being vectorized using
5136 outer-loop vectorization, we need to determine whether this initial vector
5137 load should be generated at the preheader of the inner-loop, or can be
5138 generated at the preheader of LOOP. If the memory access has no evolution
5139 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5140 to be generated inside LOOP (in the preheader of the inner-loop). */
5142 if (nested_in_vect_loop
)
5144 tree outerloop_step
= STMT_VINFO_DR_STEP (stmt_info
);
5145 bool invariant_in_outerloop
=
5146 (tree_int_cst_compare (outerloop_step
, size_zero_node
) == 0);
5147 loop_for_initial_load
= (invariant_in_outerloop
? loop
: loop
->inner
);
5150 loop_for_initial_load
= loop
;
5152 *at_loop
= loop_for_initial_load
;
5154 if (loop_for_initial_load
)
5155 pe
= loop_preheader_edge (loop_for_initial_load
);
5157 /* 3. For the case of the optimized realignment, create the first vector
5158 load at the loop preheader. */
5160 if (alignment_support_scheme
== dr_explicit_realign_optimized
)
5162 /* Create msq_init = *(floor(p1)) in the loop preheader */
5165 gcc_assert (!compute_in_loop
);
5166 vec_dest
= vect_create_destination_var (scalar_dest
, vectype
);
5167 ptr
= vect_create_data_ref_ptr (stmt
, vectype
, loop_for_initial_load
,
5168 NULL_TREE
, &init_addr
, NULL
, &inc
,
5170 if (TREE_CODE (ptr
) == SSA_NAME
)
5171 new_temp
= copy_ssa_name (ptr
);
5173 new_temp
= make_ssa_name (TREE_TYPE (ptr
));
5174 new_stmt
= gimple_build_assign
5175 (new_temp
, BIT_AND_EXPR
, ptr
,
5176 build_int_cst (TREE_TYPE (ptr
),
5177 -(HOST_WIDE_INT
)TYPE_ALIGN_UNIT (vectype
)));
5178 new_bb
= gsi_insert_on_edge_immediate (pe
, new_stmt
);
5179 gcc_assert (!new_bb
);
5181 = build2 (MEM_REF
, TREE_TYPE (vec_dest
), new_temp
,
5182 build_int_cst (reference_alias_ptr_type (DR_REF (dr
)), 0));
5183 new_stmt
= gimple_build_assign (vec_dest
, data_ref
);
5184 new_temp
= make_ssa_name (vec_dest
, new_stmt
);
5185 gimple_assign_set_lhs (new_stmt
, new_temp
);
5188 new_bb
= gsi_insert_on_edge_immediate (pe
, new_stmt
);
5189 gcc_assert (!new_bb
);
5192 gsi_insert_before (gsi
, new_stmt
, GSI_SAME_STMT
);
5194 msq_init
= gimple_assign_lhs (new_stmt
);
5197 /* 4. Create realignment token using a target builtin, if available.
5198 It is done either inside the containing loop, or before LOOP (as
5199 determined above). */
5201 if (targetm
.vectorize
.builtin_mask_for_load
)
5206 /* Compute INIT_ADDR - the initial addressed accessed by this memref. */
5209 /* Generate the INIT_ADDR computation outside LOOP. */
5210 init_addr
= vect_create_addr_base_for_vector_ref (stmt
, &stmts
,
5214 pe
= loop_preheader_edge (loop
);
5215 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, stmts
);
5216 gcc_assert (!new_bb
);
5219 gsi_insert_seq_before (gsi
, stmts
, GSI_SAME_STMT
);
5222 builtin_decl
= targetm
.vectorize
.builtin_mask_for_load ();
5223 new_stmt
= gimple_build_call (builtin_decl
, 1, init_addr
);
5225 vect_create_destination_var (scalar_dest
,
5226 gimple_call_return_type (new_stmt
));
5227 new_temp
= make_ssa_name (vec_dest
, new_stmt
);
5228 gimple_call_set_lhs (new_stmt
, new_temp
);
5230 if (compute_in_loop
)
5231 gsi_insert_before (gsi
, new_stmt
, GSI_SAME_STMT
);
5234 /* Generate the misalignment computation outside LOOP. */
5235 pe
= loop_preheader_edge (loop
);
5236 new_bb
= gsi_insert_on_edge_immediate (pe
, new_stmt
);
5237 gcc_assert (!new_bb
);
5240 *realignment_token
= gimple_call_lhs (new_stmt
);
5242 /* The result of the CALL_EXPR to this builtin is determined from
5243 the value of the parameter and no global variables are touched
5244 which makes the builtin a "const" function. Requiring the
5245 builtin to have the "const" attribute makes it unnecessary
5246 to call mark_call_clobbered. */
5247 gcc_assert (TREE_READONLY (builtin_decl
));
5250 if (alignment_support_scheme
== dr_explicit_realign
)
5253 gcc_assert (!compute_in_loop
);
5254 gcc_assert (alignment_support_scheme
== dr_explicit_realign_optimized
);
5257 /* 5. Create msq = phi <msq_init, lsq> in loop */
5259 pe
= loop_preheader_edge (containing_loop
);
5260 vec_dest
= vect_create_destination_var (scalar_dest
, vectype
);
5261 msq
= make_ssa_name (vec_dest
);
5262 phi_stmt
= create_phi_node (msq
, containing_loop
->header
);
5263 add_phi_arg (phi_stmt
, msq_init
, pe
, UNKNOWN_LOCATION
);
5269 /* Function vect_grouped_load_supported.
5271 COUNT is the size of the load group (the number of statements plus the
5272 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5273 only one statement, with a gap of COUNT - 1.
5275 Returns true if a suitable permute exists. */
5278 vect_grouped_load_supported (tree vectype
, bool single_element_p
,
5279 unsigned HOST_WIDE_INT count
)
5281 machine_mode mode
= TYPE_MODE (vectype
);
5283 /* If this is single-element interleaving with an element distance
5284 that leaves unused vector loads around punt - we at least create
5285 very sub-optimal code in that case (and blow up memory,
5287 if (single_element_p
&& count
> TYPE_VECTOR_SUBPARTS (vectype
))
5289 if (dump_enabled_p ())
5290 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5291 "single-element interleaving not supported "
5292 "for not adjacent vector loads\n");
5296 /* vect_permute_load_chain requires the group size to be equal to 3 or
5297 be a power of two. */
5298 if (count
!= 3 && exact_log2 (count
) == -1)
5300 if (dump_enabled_p ())
5301 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5302 "the size of the group of accesses"
5303 " is not a power of 2 or not equal to 3\n");
5307 /* Check that the permutation is supported. */
5308 if (VECTOR_MODE_P (mode
))
5310 unsigned int i
, j
, nelt
= GET_MODE_NUNITS (mode
);
5311 unsigned char *sel
= XALLOCAVEC (unsigned char, nelt
);
5316 for (k
= 0; k
< 3; k
++)
5318 for (i
= 0; i
< nelt
; i
++)
5319 if (3 * i
+ k
< 2 * nelt
)
5323 if (!can_vec_perm_p (mode
, false, sel
))
5325 if (dump_enabled_p ())
5326 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5327 "shuffle of 3 loads is not supported by"
5331 for (i
= 0, j
= 0; i
< nelt
; i
++)
5332 if (3 * i
+ k
< 2 * nelt
)
5335 sel
[i
] = nelt
+ ((nelt
+ k
) % 3) + 3 * (j
++);
5336 if (!can_vec_perm_p (mode
, false, sel
))
5338 if (dump_enabled_p ())
5339 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5340 "shuffle of 3 loads is not supported by"
5349 /* If length is not equal to 3 then only power of 2 is supported. */
5350 gcc_assert (pow2p_hwi (count
));
5351 for (i
= 0; i
< nelt
; i
++)
5353 if (can_vec_perm_p (mode
, false, sel
))
5355 for (i
= 0; i
< nelt
; i
++)
5357 if (can_vec_perm_p (mode
, false, sel
))
5363 if (dump_enabled_p ())
5364 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5365 "extract even/odd not supported by target\n");
5369 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
5373 vect_load_lanes_supported (tree vectype
, unsigned HOST_WIDE_INT count
)
5375 return vect_lanes_optab_supported_p ("vec_load_lanes",
5376 vec_load_lanes_optab
,
5380 /* Function vect_permute_load_chain.
5382 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5383 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5384 the input data correctly. Return the final references for loads in
5387 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5388 The input is 4 vectors each containing 8 elements. We assign a number to each
5389 element, the input sequence is:
5391 1st vec: 0 1 2 3 4 5 6 7
5392 2nd vec: 8 9 10 11 12 13 14 15
5393 3rd vec: 16 17 18 19 20 21 22 23
5394 4th vec: 24 25 26 27 28 29 30 31
5396 The output sequence should be:
5398 1st vec: 0 4 8 12 16 20 24 28
5399 2nd vec: 1 5 9 13 17 21 25 29
5400 3rd vec: 2 6 10 14 18 22 26 30
5401 4th vec: 3 7 11 15 19 23 27 31
5403 i.e., the first output vector should contain the first elements of each
5404 interleaving group, etc.
5406 We use extract_even/odd instructions to create such output. The input of
5407 each extract_even/odd operation is two vectors
5411 and the output is the vector of extracted even/odd elements. The output of
5412 extract_even will be: 0 2 4 6
5413 and of extract_odd: 1 3 5 7
5416 The permutation is done in log LENGTH stages. In each stage extract_even
5417 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5418 their order. In our example,
5420 E1: extract_even (1st vec, 2nd vec)
5421 E2: extract_odd (1st vec, 2nd vec)
5422 E3: extract_even (3rd vec, 4th vec)
5423 E4: extract_odd (3rd vec, 4th vec)
5425 The output for the first stage will be:
5427 E1: 0 2 4 6 8 10 12 14
5428 E2: 1 3 5 7 9 11 13 15
5429 E3: 16 18 20 22 24 26 28 30
5430 E4: 17 19 21 23 25 27 29 31
5432 In order to proceed and create the correct sequence for the next stage (or
5433 for the correct output, if the second stage is the last one, as in our
5434 example), we first put the output of extract_even operation and then the
5435 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5436 The input for the second stage is:
5438 1st vec (E1): 0 2 4 6 8 10 12 14
5439 2nd vec (E3): 16 18 20 22 24 26 28 30
5440 3rd vec (E2): 1 3 5 7 9 11 13 15
5441 4th vec (E4): 17 19 21 23 25 27 29 31
5443 The output of the second stage:
5445 E1: 0 4 8 12 16 20 24 28
5446 E2: 2 6 10 14 18 22 26 30
5447 E3: 1 5 9 13 17 21 25 29
5448 E4: 3 7 11 15 19 23 27 31
5450 And RESULT_CHAIN after reordering:
5452 1st vec (E1): 0 4 8 12 16 20 24 28
5453 2nd vec (E3): 1 5 9 13 17 21 25 29
5454 3rd vec (E2): 2 6 10 14 18 22 26 30
5455 4th vec (E4): 3 7 11 15 19 23 27 31. */
5458 vect_permute_load_chain (vec
<tree
> dr_chain
,
5459 unsigned int length
,
5461 gimple_stmt_iterator
*gsi
,
5462 vec
<tree
> *result_chain
)
5464 tree data_ref
, first_vect
, second_vect
;
5465 tree perm_mask_even
, perm_mask_odd
;
5466 tree perm3_mask_low
, perm3_mask_high
;
5468 tree vectype
= STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt
));
5469 unsigned int i
, j
, log_length
= exact_log2 (length
);
5470 unsigned nelt
= TYPE_VECTOR_SUBPARTS (vectype
);
5471 unsigned char *sel
= XALLOCAVEC (unsigned char, nelt
);
5473 result_chain
->quick_grow (length
);
5474 memcpy (result_chain
->address (), dr_chain
.address (),
5475 length
* sizeof (tree
));
5481 for (k
= 0; k
< 3; k
++)
5483 for (i
= 0; i
< nelt
; i
++)
5484 if (3 * i
+ k
< 2 * nelt
)
5488 perm3_mask_low
= vect_gen_perm_mask_checked (vectype
, sel
);
5490 for (i
= 0, j
= 0; i
< nelt
; i
++)
5491 if (3 * i
+ k
< 2 * nelt
)
5494 sel
[i
] = nelt
+ ((nelt
+ k
) % 3) + 3 * (j
++);
5496 perm3_mask_high
= vect_gen_perm_mask_checked (vectype
, sel
);
5498 first_vect
= dr_chain
[0];
5499 second_vect
= dr_chain
[1];
5501 /* Create interleaving stmt (low part of):
5502 low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5504 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3_low");
5505 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, first_vect
,
5506 second_vect
, perm3_mask_low
);
5507 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5509 /* Create interleaving stmt (high part of):
5510 high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5512 first_vect
= data_ref
;
5513 second_vect
= dr_chain
[2];
5514 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3_high");
5515 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, first_vect
,
5516 second_vect
, perm3_mask_high
);
5517 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5518 (*result_chain
)[k
] = data_ref
;
5523 /* If length is not equal to 3 then only power of 2 is supported. */
5524 gcc_assert (pow2p_hwi (length
));
5526 for (i
= 0; i
< nelt
; ++i
)
5528 perm_mask_even
= vect_gen_perm_mask_checked (vectype
, sel
);
5530 for (i
= 0; i
< nelt
; ++i
)
5532 perm_mask_odd
= vect_gen_perm_mask_checked (vectype
, sel
);
5534 for (i
= 0; i
< log_length
; i
++)
5536 for (j
= 0; j
< length
; j
+= 2)
5538 first_vect
= dr_chain
[j
];
5539 second_vect
= dr_chain
[j
+1];
5541 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5542 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_perm_even");
5543 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5544 first_vect
, second_vect
,
5546 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5547 (*result_chain
)[j
/2] = data_ref
;
5549 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5550 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_perm_odd");
5551 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5552 first_vect
, second_vect
,
5554 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5555 (*result_chain
)[j
/2+length
/2] = data_ref
;
5557 memcpy (dr_chain
.address (), result_chain
->address (),
5558 length
* sizeof (tree
));
5563 /* Function vect_shift_permute_load_chain.
5565 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5566 sequence of stmts to reorder the input data accordingly.
5567 Return the final references for loads in RESULT_CHAIN.
5568 Return true if succeeded, false otherwise.
5570 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5571 The input is 3 vectors each containing 8 elements. We assign a
5572 number to each element, the input sequence is:
5574 1st vec: 0 1 2 3 4 5 6 7
5575 2nd vec: 8 9 10 11 12 13 14 15
5576 3rd vec: 16 17 18 19 20 21 22 23
5578 The output sequence should be:
5580 1st vec: 0 3 6 9 12 15 18 21
5581 2nd vec: 1 4 7 10 13 16 19 22
5582 3rd vec: 2 5 8 11 14 17 20 23
5584 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5586 First we shuffle all 3 vectors to get correct elements order:
5588 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
5589 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
5590 3rd vec: (16 19 22) (17 20 23) (18 21)
5592 Next we unite and shift vector 3 times:
5595 shift right by 6 the concatenation of:
5596 "1st vec" and "2nd vec"
5597 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5598 "2nd vec" and "3rd vec"
5599 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5600 "3rd vec" and "1st vec"
5601 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
5604 So that now new vectors are:
5606 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
5607 2nd vec: (10 13) (16 19 22) (17 20 23)
5608 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
5611 shift right by 5 the concatenation of:
5612 "1st vec" and "3rd vec"
5613 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
5614 "2nd vec" and "1st vec"
5615 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
5616 "3rd vec" and "2nd vec"
5617 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
5620 So that now new vectors are:
5622 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
5623 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
5624 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
5627 shift right by 5 the concatenation of:
5628 "1st vec" and "1st vec"
5629 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
5630 shift right by 3 the concatenation of:
5631 "2nd vec" and "2nd vec"
5632 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
5635 So that now all vectors are READY:
5636 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
5637 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
5638 3rd vec: ( 1 4 7) (10 13) (16 19 22)
5640 This algorithm is faster than one in vect_permute_load_chain if:
5641 1. "shift of a concatenation" is faster than general permutation.
5643 2. The TARGET machine can't execute vector instructions in parallel.
5644 This is because each step of the algorithm depends on previous.
5645 The algorithm in vect_permute_load_chain is much more parallel.
5647 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
5651 vect_shift_permute_load_chain (vec
<tree
> dr_chain
,
5652 unsigned int length
,
5654 gimple_stmt_iterator
*gsi
,
5655 vec
<tree
> *result_chain
)
5657 tree vect
[3], vect_shift
[3], data_ref
, first_vect
, second_vect
;
5658 tree perm2_mask1
, perm2_mask2
, perm3_mask
;
5659 tree select_mask
, shift1_mask
, shift2_mask
, shift3_mask
, shift4_mask
;
5662 tree vectype
= STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt
));
5664 unsigned nelt
= TYPE_VECTOR_SUBPARTS (vectype
);
5665 unsigned char *sel
= XALLOCAVEC (unsigned char, nelt
);
5666 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
5667 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
5669 result_chain
->quick_grow (length
);
5670 memcpy (result_chain
->address (), dr_chain
.address (),
5671 length
* sizeof (tree
));
5673 if (pow2p_hwi (length
) && LOOP_VINFO_VECT_FACTOR (loop_vinfo
) > 4)
5675 unsigned int j
, log_length
= exact_log2 (length
);
5676 for (i
= 0; i
< nelt
/ 2; ++i
)
5678 for (i
= 0; i
< nelt
/ 2; ++i
)
5679 sel
[nelt
/ 2 + i
] = i
* 2 + 1;
5680 if (!can_vec_perm_p (TYPE_MODE (vectype
), false, sel
))
5682 if (dump_enabled_p ())
5683 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5684 "shuffle of 2 fields structure is not \
5685 supported by target\n");
5688 perm2_mask1
= vect_gen_perm_mask_checked (vectype
, sel
);
5690 for (i
= 0; i
< nelt
/ 2; ++i
)
5692 for (i
= 0; i
< nelt
/ 2; ++i
)
5693 sel
[nelt
/ 2 + i
] = i
* 2;
5694 if (!can_vec_perm_p (TYPE_MODE (vectype
), false, sel
))
5696 if (dump_enabled_p ())
5697 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5698 "shuffle of 2 fields structure is not \
5699 supported by target\n");
5702 perm2_mask2
= vect_gen_perm_mask_checked (vectype
, sel
);
5704 /* Generating permutation constant to shift all elements.
5705 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
5706 for (i
= 0; i
< nelt
; i
++)
5707 sel
[i
] = nelt
/ 2 + i
;
5708 if (!can_vec_perm_p (TYPE_MODE (vectype
), false, sel
))
5710 if (dump_enabled_p ())
5711 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5712 "shift permutation is not supported by target\n");
5715 shift1_mask
= vect_gen_perm_mask_checked (vectype
, sel
);
5717 /* Generating permutation constant to select vector from 2.
5718 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
5719 for (i
= 0; i
< nelt
/ 2; i
++)
5721 for (i
= nelt
/ 2; i
< nelt
; i
++)
5723 if (!can_vec_perm_p (TYPE_MODE (vectype
), false, sel
))
5725 if (dump_enabled_p ())
5726 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5727 "select is not supported by target\n");
5730 select_mask
= vect_gen_perm_mask_checked (vectype
, sel
);
5732 for (i
= 0; i
< log_length
; i
++)
5734 for (j
= 0; j
< length
; j
+= 2)
5736 first_vect
= dr_chain
[j
];
5737 second_vect
= dr_chain
[j
+ 1];
5739 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle2");
5740 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5741 first_vect
, first_vect
,
5743 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5746 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle2");
5747 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5748 second_vect
, second_vect
,
5750 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5753 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shift");
5754 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5755 vect
[0], vect
[1], shift1_mask
);
5756 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5757 (*result_chain
)[j
/2 + length
/2] = data_ref
;
5759 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_select");
5760 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5761 vect
[0], vect
[1], select_mask
);
5762 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5763 (*result_chain
)[j
/2] = data_ref
;
5765 memcpy (dr_chain
.address (), result_chain
->address (),
5766 length
* sizeof (tree
));
5770 if (length
== 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo
) > 2)
5772 unsigned int k
= 0, l
= 0;
5774 /* Generating permutation constant to get all elements in right order.
5775 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
5776 for (i
= 0; i
< nelt
; i
++)
5778 if (3 * k
+ (l
% 3) >= nelt
)
5781 l
+= (3 - (nelt
% 3));
5783 sel
[i
] = 3 * k
+ (l
% 3);
5786 if (!can_vec_perm_p (TYPE_MODE (vectype
), false, sel
))
5788 if (dump_enabled_p ())
5789 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5790 "shuffle of 3 fields structure is not \
5791 supported by target\n");
5794 perm3_mask
= vect_gen_perm_mask_checked (vectype
, sel
);
5796 /* Generating permutation constant to shift all elements.
5797 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
5798 for (i
= 0; i
< nelt
; i
++)
5799 sel
[i
] = 2 * (nelt
/ 3) + (nelt
% 3) + i
;
5800 if (!can_vec_perm_p (TYPE_MODE (vectype
), false, sel
))
5802 if (dump_enabled_p ())
5803 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5804 "shift permutation is not supported by target\n");
5807 shift1_mask
= vect_gen_perm_mask_checked (vectype
, sel
);
5809 /* Generating permutation constant to shift all elements.
5810 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5811 for (i
= 0; i
< nelt
; i
++)
5812 sel
[i
] = 2 * (nelt
/ 3) + 1 + i
;
5813 if (!can_vec_perm_p (TYPE_MODE (vectype
), false, sel
))
5815 if (dump_enabled_p ())
5816 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5817 "shift permutation is not supported by target\n");
5820 shift2_mask
= vect_gen_perm_mask_checked (vectype
, sel
);
5822 /* Generating permutation constant to shift all elements.
5823 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
5824 for (i
= 0; i
< nelt
; i
++)
5825 sel
[i
] = (nelt
/ 3) + (nelt
% 3) / 2 + i
;
5826 if (!can_vec_perm_p (TYPE_MODE (vectype
), false, sel
))
5828 if (dump_enabled_p ())
5829 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5830 "shift permutation is not supported by target\n");
5833 shift3_mask
= vect_gen_perm_mask_checked (vectype
, sel
);
5835 /* Generating permutation constant to shift all elements.
5836 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5837 for (i
= 0; i
< nelt
; i
++)
5838 sel
[i
] = 2 * (nelt
/ 3) + (nelt
% 3) / 2 + i
;
5839 if (!can_vec_perm_p (TYPE_MODE (vectype
), false, sel
))
5841 if (dump_enabled_p ())
5842 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5843 "shift permutation is not supported by target\n");
5846 shift4_mask
= vect_gen_perm_mask_checked (vectype
, sel
);
5848 for (k
= 0; k
< 3; k
++)
5850 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3");
5851 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5852 dr_chain
[k
], dr_chain
[k
],
5854 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5858 for (k
= 0; k
< 3; k
++)
5860 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shift1");
5861 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5862 vect
[k
% 3], vect
[(k
+ 1) % 3],
5864 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5865 vect_shift
[k
] = data_ref
;
5868 for (k
= 0; k
< 3; k
++)
5870 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shift2");
5871 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5872 vect_shift
[(4 - k
) % 3],
5873 vect_shift
[(3 - k
) % 3],
5875 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5879 (*result_chain
)[3 - (nelt
% 3)] = vect
[2];
5881 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shift3");
5882 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, vect
[0],
5883 vect
[0], shift3_mask
);
5884 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5885 (*result_chain
)[nelt
% 3] = data_ref
;
5887 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shift4");
5888 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, vect
[1],
5889 vect
[1], shift4_mask
);
5890 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5891 (*result_chain
)[0] = data_ref
;
5897 /* Function vect_transform_grouped_load.
5899 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5900 to perform their permutation and ascribe the result vectorized statements to
5901 the scalar statements.
5905 vect_transform_grouped_load (gimple
*stmt
, vec
<tree
> dr_chain
, int size
,
5906 gimple_stmt_iterator
*gsi
)
5909 vec
<tree
> result_chain
= vNULL
;
5911 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5912 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
5913 vectors, that are ready for vector computation. */
5914 result_chain
.create (size
);
5916 /* If reassociation width for vector type is 2 or greater target machine can
5917 execute 2 or more vector instructions in parallel. Otherwise try to
5918 get chain for loads group using vect_shift_permute_load_chain. */
5919 mode
= TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt
)));
5920 if (targetm
.sched
.reassociation_width (VEC_PERM_EXPR
, mode
) > 1
5922 || !vect_shift_permute_load_chain (dr_chain
, size
, stmt
,
5923 gsi
, &result_chain
))
5924 vect_permute_load_chain (dr_chain
, size
, stmt
, gsi
, &result_chain
);
5925 vect_record_grouped_load_vectors (stmt
, result_chain
);
5926 result_chain
.release ();
5929 /* RESULT_CHAIN contains the output of a group of grouped loads that were
5930 generated as part of the vectorization of STMT. Assign the statement
5931 for each vector to the associated scalar statement. */
5934 vect_record_grouped_load_vectors (gimple
*stmt
, vec
<tree
> result_chain
)
5936 gimple
*first_stmt
= GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
));
5937 gimple
*next_stmt
, *new_stmt
;
5938 unsigned int i
, gap_count
;
5941 /* Put a permuted data-ref in the VECTORIZED_STMT field.
5942 Since we scan the chain starting from its first node, their order
5943 corresponds to the order of data-refs in RESULT_CHAIN. */
5944 next_stmt
= first_stmt
;
5946 FOR_EACH_VEC_ELT (result_chain
, i
, tmp_data_ref
)
5951 /* Skip the gaps. Loads created for the gaps will be removed by dead
5952 code elimination pass later. No need to check for the first stmt in
5953 the group, since it always exists.
5954 GROUP_GAP is the number of steps in elements from the previous
5955 access (if there is no gap GROUP_GAP is 1). We skip loads that
5956 correspond to the gaps. */
5957 if (next_stmt
!= first_stmt
5958 && gap_count
< GROUP_GAP (vinfo_for_stmt (next_stmt
)))
5966 new_stmt
= SSA_NAME_DEF_STMT (tmp_data_ref
);
5967 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5968 copies, and we put the new vector statement in the first available
5970 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt
)))
5971 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt
)) = new_stmt
;
5974 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt
)))
5977 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt
));
5979 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt
));
5982 prev_stmt
= rel_stmt
;
5984 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt
));
5987 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt
)) =
5992 next_stmt
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt
));
5994 /* If NEXT_STMT accesses the same DR as the previous statement,
5995 put the same TMP_DATA_REF as its vectorized statement; otherwise
5996 get the next data-ref from RESULT_CHAIN. */
5997 if (!next_stmt
|| !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt
)))
6003 /* Function vect_can_force_dr_alignment_p.
6005 Returns whether the alignment of a DECL can be forced to be aligned
6006 on ALIGNMENT bit boundary. */
6009 vect_can_force_dr_alignment_p (const_tree decl
, unsigned int alignment
)
6014 if (decl_in_symtab_p (decl
)
6015 && !symtab_node::get (decl
)->can_increase_alignment_p ())
6018 if (TREE_STATIC (decl
))
6019 return (alignment
<= MAX_OFILE_ALIGNMENT
);
6021 return (alignment
<= MAX_STACK_ALIGNMENT
);
6025 /* Return whether the data reference DR is supported with respect to its
6027 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6028 if it is aligned, i.e., check if it is possible to vectorize it with different
6031 enum dr_alignment_support
6032 vect_supportable_dr_alignment (struct data_reference
*dr
,
6033 bool check_aligned_accesses
)
6035 gimple
*stmt
= DR_STMT (dr
);
6036 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
6037 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
6038 machine_mode mode
= TYPE_MODE (vectype
);
6039 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
6040 struct loop
*vect_loop
= NULL
;
6041 bool nested_in_vect_loop
= false;
6043 if (aligned_access_p (dr
) && !check_aligned_accesses
)
6046 /* For now assume all conditional loads/stores support unaligned
6047 access without any special code. */
6048 if (is_gimple_call (stmt
)
6049 && gimple_call_internal_p (stmt
)
6050 && (gimple_call_internal_fn (stmt
) == IFN_MASK_LOAD
6051 || gimple_call_internal_fn (stmt
) == IFN_MASK_STORE
))
6052 return dr_unaligned_supported
;
6056 vect_loop
= LOOP_VINFO_LOOP (loop_vinfo
);
6057 nested_in_vect_loop
= nested_in_vect_loop_p (vect_loop
, stmt
);
6060 /* Possibly unaligned access. */
6062 /* We can choose between using the implicit realignment scheme (generating
6063 a misaligned_move stmt) and the explicit realignment scheme (generating
6064 aligned loads with a REALIGN_LOAD). There are two variants to the
6065 explicit realignment scheme: optimized, and unoptimized.
6066 We can optimize the realignment only if the step between consecutive
6067 vector loads is equal to the vector size. Since the vector memory
6068 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6069 is guaranteed that the misalignment amount remains the same throughout the
6070 execution of the vectorized loop. Therefore, we can create the
6071 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6072 at the loop preheader.
6074 However, in the case of outer-loop vectorization, when vectorizing a
6075 memory access in the inner-loop nested within the LOOP that is now being
6076 vectorized, while it is guaranteed that the misalignment of the
6077 vectorized memory access will remain the same in different outer-loop
6078 iterations, it is *not* guaranteed that it will remain the same throughout
6079 the execution of the inner-loop. This is because the inner-loop advances
6080 with the original scalar step (and not in steps of VS). If the inner-loop
6081 step happens to be a multiple of VS, then the misalignment remains fixed
6082 and we can use the optimized realignment scheme. For example:
6088 When vectorizing the i-loop in the above example, the step between
6089 consecutive vector loads is 1, and so the misalignment does not remain
6090 fixed across the execution of the inner-loop, and the realignment cannot
6091 be optimized (as illustrated in the following pseudo vectorized loop):
6093 for (i=0; i<N; i+=4)
6094 for (j=0; j<M; j++){
6095 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6096 // when j is {0,1,2,3,4,5,6,7,...} respectively.
6097 // (assuming that we start from an aligned address).
6100 We therefore have to use the unoptimized realignment scheme:
6102 for (i=0; i<N; i+=4)
6103 for (j=k; j<M; j+=4)
6104 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6105 // that the misalignment of the initial address is
6108 The loop can then be vectorized as follows:
6110 for (k=0; k<4; k++){
6111 rt = get_realignment_token (&vp[k]);
6112 for (i=0; i<N; i+=4){
6114 for (j=k; j<M; j+=4){
6116 va = REALIGN_LOAD <v1,v2,rt>;
6123 if (DR_IS_READ (dr
))
6125 bool is_packed
= false;
6126 tree type
= (TREE_TYPE (DR_REF (dr
)));
6128 if (optab_handler (vec_realign_load_optab
, mode
) != CODE_FOR_nothing
6129 && (!targetm
.vectorize
.builtin_mask_for_load
6130 || targetm
.vectorize
.builtin_mask_for_load ()))
6132 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
6134 /* If we are doing SLP then the accesses need not have the
6135 same alignment, instead it depends on the SLP group size. */
6137 && STMT_SLP_TYPE (stmt_info
)
6138 && (LOOP_VINFO_VECT_FACTOR (loop_vinfo
)
6139 * GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info
)))
6140 % TYPE_VECTOR_SUBPARTS (vectype
) != 0))
6142 else if (!loop_vinfo
6143 || (nested_in_vect_loop
6144 && (TREE_INT_CST_LOW (DR_STEP (dr
))
6145 != GET_MODE_SIZE (TYPE_MODE (vectype
)))))
6146 return dr_explicit_realign
;
6148 return dr_explicit_realign_optimized
;
6150 if (!known_alignment_for_access_p (dr
))
6151 is_packed
= not_size_aligned (DR_REF (dr
));
6153 if (targetm
.vectorize
.support_vector_misalignment
6154 (mode
, type
, DR_MISALIGNMENT (dr
), is_packed
))
6155 /* Can't software pipeline the loads, but can at least do them. */
6156 return dr_unaligned_supported
;
6160 bool is_packed
= false;
6161 tree type
= (TREE_TYPE (DR_REF (dr
)));
6163 if (!known_alignment_for_access_p (dr
))
6164 is_packed
= not_size_aligned (DR_REF (dr
));
6166 if (targetm
.vectorize
.support_vector_misalignment
6167 (mode
, type
, DR_MISALIGNMENT (dr
), is_packed
))
6168 return dr_unaligned_supported
;
6172 return dr_unaligned_unsupported
;