/* Data References Analysis and Manipulation Utilities for Vectorization.
   Copyright (C) 2003-2016 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
24 #include "coretypes.h"
34 #include "optabs-tree.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
54 /* Return true if load- or store-lanes optab OPTAB is implemented for
55 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
58 vect_lanes_optab_supported_p (const char *name
, convert_optab optab
,
59 tree vectype
, unsigned HOST_WIDE_INT count
)
61 machine_mode mode
, array_mode
;
64 mode
= TYPE_MODE (vectype
);
65 limit_p
= !targetm
.array_mode_supported_p (mode
, count
);
66 array_mode
= mode_for_size (count
* GET_MODE_BITSIZE (mode
),
69 if (array_mode
== BLKmode
)
71 if (dump_enabled_p ())
72 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
73 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC
"]\n",
74 GET_MODE_NAME (mode
), count
);
78 if (convert_optab_handler (optab
, array_mode
, mode
) == CODE_FOR_nothing
)
80 if (dump_enabled_p ())
81 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
82 "cannot use %s<%s><%s>\n", name
,
83 GET_MODE_NAME (array_mode
), GET_MODE_NAME (mode
));
87 if (dump_enabled_p ())
88 dump_printf_loc (MSG_NOTE
, vect_location
,
89 "can use %s<%s><%s>\n", name
, GET_MODE_NAME (array_mode
),
90 GET_MODE_NAME (mode
));
96 /* Return the smallest scalar part of STMT.
97 This is used to determine the vectype of the stmt. We generally set the
98 vectype according to the type of the result (lhs). For stmts whose
99 result-type is different than the type of the arguments (e.g., demotion,
100 promotion), vectype will be reset appropriately (later). Note that we have
101 to visit the smallest datatype in this function, because that determines the
102 VF. If the smallest datatype in the loop is present only as the rhs of a
103 promotion operation - we'd miss it.
104 Such a case, where a variable of this datatype does not appear in the lhs
105 anywhere in the loop, can only occur if it's an invariant: e.g.:
106 'int_x = (int) short_inv', which we'd expect to have been optimized away by
107 invariant motion. However, we cannot rely on invariant motion to always
108 take invariants out of the loop, and so in the case of promotion we also
109 have to check the rhs.
110 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
114 vect_get_smallest_scalar_type (gimple
*stmt
, HOST_WIDE_INT
*lhs_size_unit
,
115 HOST_WIDE_INT
*rhs_size_unit
)
117 tree scalar_type
= gimple_expr_type (stmt
);
118 HOST_WIDE_INT lhs
, rhs
;
120 lhs
= rhs
= TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type
));
122 if (is_gimple_assign (stmt
)
123 && (gimple_assign_cast_p (stmt
)
124 || gimple_assign_rhs_code (stmt
) == WIDEN_MULT_EXPR
125 || gimple_assign_rhs_code (stmt
) == WIDEN_LSHIFT_EXPR
126 || gimple_assign_rhs_code (stmt
) == FLOAT_EXPR
))
128 tree rhs_type
= TREE_TYPE (gimple_assign_rhs1 (stmt
));
130 rhs
= TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type
));
132 scalar_type
= rhs_type
;
135 *lhs_size_unit
= lhs
;
136 *rhs_size_unit
= rhs
;
141 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
142 tested at run-time. Return TRUE if DDR was successfully inserted.
143 Return false if versioning is not supported. */
146 vect_mark_for_runtime_alias_test (ddr_p ddr
, loop_vec_info loop_vinfo
)
148 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
150 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS
) == 0)
153 if (dump_enabled_p ())
155 dump_printf_loc (MSG_NOTE
, vect_location
,
156 "mark for run-time aliasing test between ");
157 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (DDR_A (ddr
)));
158 dump_printf (MSG_NOTE
, " and ");
159 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (DDR_B (ddr
)));
160 dump_printf (MSG_NOTE
, "\n");
163 if (optimize_loop_nest_for_size_p (loop
))
165 if (dump_enabled_p ())
166 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
167 "versioning not supported when optimizing"
172 /* FORNOW: We don't support versioning with outer-loop vectorization. */
175 if (dump_enabled_p ())
176 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
177 "versioning not yet supported for outer-loops.\n");
181 /* FORNOW: We don't support creating runtime alias tests for non-constant
183 if (TREE_CODE (DR_STEP (DDR_A (ddr
))) != INTEGER_CST
184 || TREE_CODE (DR_STEP (DDR_B (ddr
))) != INTEGER_CST
)
186 if (dump_enabled_p ())
187 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
188 "versioning not yet supported for non-constant "
193 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo
).safe_push (ddr
);
198 /* Function vect_analyze_data_ref_dependence.
200 Return TRUE if there (might) exist a dependence between a memory-reference
201 DRA and a memory-reference DRB. When versioning for alias may check a
202 dependence at run-time, return FALSE. Adjust *MAX_VF according to
203 the data dependence. */
206 vect_analyze_data_ref_dependence (struct data_dependence_relation
*ddr
,
207 loop_vec_info loop_vinfo
, int *max_vf
)
210 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
211 struct data_reference
*dra
= DDR_A (ddr
);
212 struct data_reference
*drb
= DDR_B (ddr
);
213 stmt_vec_info stmtinfo_a
= vinfo_for_stmt (DR_STMT (dra
));
214 stmt_vec_info stmtinfo_b
= vinfo_for_stmt (DR_STMT (drb
));
215 lambda_vector dist_v
;
216 unsigned int loop_depth
;
218 /* In loop analysis all data references should be vectorizable. */
219 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a
)
220 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b
))
223 /* Independent data accesses. */
224 if (DDR_ARE_DEPENDENT (ddr
) == chrec_known
)
228 || (DR_IS_READ (dra
) && DR_IS_READ (drb
)))
231 /* We do not have to consider dependences between accesses that belong
232 to the same group. */
233 if (GROUP_FIRST_ELEMENT (stmtinfo_a
)
234 && GROUP_FIRST_ELEMENT (stmtinfo_a
) == GROUP_FIRST_ELEMENT (stmtinfo_b
))
237 /* Even if we have an anti-dependence then, as the vectorized loop covers at
238 least two scalar iterations, there is always also a true dependence.
239 As the vectorizer does not re-order loads and stores we can ignore
240 the anti-dependence if TBAA can disambiguate both DRs similar to the
241 case with known negative distance anti-dependences (positive
242 distance anti-dependences would violate TBAA constraints). */
243 if (((DR_IS_READ (dra
) && DR_IS_WRITE (drb
))
244 || (DR_IS_WRITE (dra
) && DR_IS_READ (drb
)))
245 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra
)),
246 get_alias_set (DR_REF (drb
))))
249 /* Unknown data dependence. */
250 if (DDR_ARE_DEPENDENT (ddr
) == chrec_dont_know
)
252 /* If user asserted safelen consecutive iterations can be
253 executed concurrently, assume independence. */
254 if (loop
->safelen
>= 2)
256 if (loop
->safelen
< *max_vf
)
257 *max_vf
= loop
->safelen
;
258 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo
) = false;
262 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a
)
263 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b
))
265 if (dump_enabled_p ())
267 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
268 "versioning for alias not supported for: "
269 "can't determine dependence between ");
270 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
272 dump_printf (MSG_MISSED_OPTIMIZATION
, " and ");
273 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
275 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
280 if (dump_enabled_p ())
282 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
283 "versioning for alias required: "
284 "can't determine dependence between ");
285 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
287 dump_printf (MSG_MISSED_OPTIMIZATION
, " and ");
288 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
290 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
293 /* Add to list of ddrs that need to be tested at run-time. */
294 return !vect_mark_for_runtime_alias_test (ddr
, loop_vinfo
);
297 /* Known data dependence. */
298 if (DDR_NUM_DIST_VECTS (ddr
) == 0)
300 /* If user asserted safelen consecutive iterations can be
301 executed concurrently, assume independence. */
302 if (loop
->safelen
>= 2)
304 if (loop
->safelen
< *max_vf
)
305 *max_vf
= loop
->safelen
;
306 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo
) = false;
310 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a
)
311 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b
))
313 if (dump_enabled_p ())
315 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
316 "versioning for alias not supported for: "
317 "bad dist vector for ");
318 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
320 dump_printf (MSG_MISSED_OPTIMIZATION
, " and ");
321 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
323 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
328 if (dump_enabled_p ())
330 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
331 "versioning for alias required: "
332 "bad dist vector for ");
333 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, DR_REF (dra
));
334 dump_printf (MSG_MISSED_OPTIMIZATION
, " and ");
335 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, DR_REF (drb
));
336 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
338 /* Add to list of ddrs that need to be tested at run-time. */
339 return !vect_mark_for_runtime_alias_test (ddr
, loop_vinfo
);
342 loop_depth
= index_in_loop_nest (loop
->num
, DDR_LOOP_NEST (ddr
));
343 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr
), i
, dist_v
)
345 int dist
= dist_v
[loop_depth
];
347 if (dump_enabled_p ())
348 dump_printf_loc (MSG_NOTE
, vect_location
,
349 "dependence distance = %d.\n", dist
);
353 if (dump_enabled_p ())
355 dump_printf_loc (MSG_NOTE
, vect_location
,
356 "dependence distance == 0 between ");
357 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dra
));
358 dump_printf (MSG_NOTE
, " and ");
359 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (drb
));
360 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
363 /* When we perform grouped accesses and perform implicit CSE
364 by detecting equal accesses and doing disambiguation with
365 runtime alias tests like for
373 where we will end up loading { a[i], a[i+1] } once, make
374 sure that inserting group loads before the first load and
375 stores after the last store will do the right thing.
376 Similar for groups like
380 where loads from the group interleave with the store. */
381 if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a
)
382 || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b
))
384 gimple
*earlier_stmt
;
385 earlier_stmt
= get_earlier_stmt (DR_STMT (dra
), DR_STMT (drb
));
387 (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt
))))
389 if (dump_enabled_p ())
390 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
391 "READ_WRITE dependence in interleaving."
400 if (dist
> 0 && DDR_REVERSED_P (ddr
))
402 /* If DDR_REVERSED_P the order of the data-refs in DDR was
403 reversed (to make distance vector positive), and the actual
404 distance is negative. */
405 if (dump_enabled_p ())
406 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
407 "dependence distance negative.\n");
408 /* Record a negative dependence distance to later limit the
409 amount of stmt copying / unrolling we can perform.
410 Only need to handle read-after-write dependence. */
412 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b
) == 0
413 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b
) > (unsigned)dist
))
414 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b
) = dist
;
419 && abs (dist
) < *max_vf
)
421 /* The dependence distance requires reduction of the maximal
422 vectorization factor. */
423 *max_vf
= abs (dist
);
424 if (dump_enabled_p ())
425 dump_printf_loc (MSG_NOTE
, vect_location
,
426 "adjusting maximal vectorization factor to %i\n",
430 if (abs (dist
) >= *max_vf
)
432 /* Dependence distance does not create dependence, as far as
433 vectorization is concerned, in this case. */
434 if (dump_enabled_p ())
435 dump_printf_loc (MSG_NOTE
, vect_location
,
436 "dependence distance >= VF.\n");
440 if (dump_enabled_p ())
442 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
443 "not vectorized, possible dependence "
444 "between data-refs ");
445 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dra
));
446 dump_printf (MSG_NOTE
, " and ");
447 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (drb
));
448 dump_printf (MSG_NOTE
, "\n");
457 /* Function vect_analyze_data_ref_dependences.
459 Examine all the data references in the loop, and make sure there do not
460 exist any data dependences between them. Set *MAX_VF according to
461 the maximum vectorization factor the data dependences allow. */
464 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo
, int *max_vf
)
467 struct data_dependence_relation
*ddr
;
469 if (dump_enabled_p ())
470 dump_printf_loc (MSG_NOTE
, vect_location
,
471 "=== vect_analyze_data_ref_dependences ===\n");
473 LOOP_VINFO_DDRS (loop_vinfo
)
474 .create (LOOP_VINFO_DATAREFS (loop_vinfo
).length ()
475 * LOOP_VINFO_DATAREFS (loop_vinfo
).length ());
476 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo
) = true;
477 /* We need read-read dependences to compute STMT_VINFO_SAME_ALIGN_REFS. */
478 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo
),
479 &LOOP_VINFO_DDRS (loop_vinfo
),
480 LOOP_VINFO_LOOP_NEST (loop_vinfo
), true))
483 /* For epilogues we either have no aliases or alias versioning
484 was applied to original loop. Therefore we may just get max_vf
485 using VF of original loop. */
486 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo
))
487 *max_vf
= LOOP_VINFO_ORIG_VECT_FACTOR (loop_vinfo
);
489 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo
), i
, ddr
)
490 if (vect_analyze_data_ref_dependence (ddr
, loop_vinfo
, max_vf
))
497 /* Function vect_slp_analyze_data_ref_dependence.
499 Return TRUE if there (might) exist a dependence between a memory-reference
500 DRA and a memory-reference DRB. When versioning for alias may check a
501 dependence at run-time, return FALSE. Adjust *MAX_VF according to
502 the data dependence. */
505 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation
*ddr
)
507 struct data_reference
*dra
= DDR_A (ddr
);
508 struct data_reference
*drb
= DDR_B (ddr
);
510 /* We need to check dependences of statements marked as unvectorizable
511 as well, they still can prohibit vectorization. */
513 /* Independent data accesses. */
514 if (DDR_ARE_DEPENDENT (ddr
) == chrec_known
)
520 /* Read-read is OK. */
521 if (DR_IS_READ (dra
) && DR_IS_READ (drb
))
524 /* If dra and drb are part of the same interleaving chain consider
526 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra
)))
527 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra
)))
528 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb
)))))
531 /* Unknown data dependence. */
532 if (DDR_ARE_DEPENDENT (ddr
) == chrec_dont_know
)
534 if (dump_enabled_p ())
536 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
537 "can't determine dependence between ");
538 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, DR_REF (dra
));
539 dump_printf (MSG_MISSED_OPTIMIZATION
, " and ");
540 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, DR_REF (drb
));
541 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
544 else if (dump_enabled_p ())
546 dump_printf_loc (MSG_NOTE
, vect_location
,
547 "determined dependence between ");
548 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dra
));
549 dump_printf (MSG_NOTE
, " and ");
550 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (drb
));
551 dump_printf (MSG_NOTE
, "\n");
558 /* Analyze dependences involved in the transform of SLP NODE. STORES
559 contain the vector of scalar stores of this instance if we are
560 disambiguating the loads. */
563 vect_slp_analyze_node_dependences (slp_instance instance
, slp_tree node
,
564 vec
<gimple
*> stores
, gimple
*last_store
)
566 /* This walks over all stmts involved in the SLP load/store done
567 in NODE verifying we can sink them up to the last stmt in the
569 gimple
*last_access
= vect_find_last_scalar_stmt_in_slp (node
);
570 for (unsigned k
= 0; k
< SLP_INSTANCE_GROUP_SIZE (instance
); ++k
)
572 gimple
*access
= SLP_TREE_SCALAR_STMTS (node
)[k
];
573 if (access
== last_access
)
575 data_reference
*dr_a
= STMT_VINFO_DATA_REF (vinfo_for_stmt (access
));
576 for (gimple_stmt_iterator gsi
= gsi_for_stmt (access
);
577 gsi_stmt (gsi
) != last_access
; gsi_next (&gsi
))
579 gimple
*stmt
= gsi_stmt (gsi
);
580 if (! gimple_vuse (stmt
)
581 || (DR_IS_READ (dr_a
) && ! gimple_vdef (stmt
)))
584 /* If we couldn't record a (single) data reference for this
585 stmt we have to give up. */
586 /* ??? Here and below if dependence analysis fails we can resort
587 to the alias oracle which can handle more kinds of stmts. */
588 data_reference
*dr_b
= STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt
));
592 bool dependent
= false;
593 /* If we run into a store of this same instance (we've just
594 marked those) then delay dependence checking until we run
595 into the last store because this is where it will have
596 been sunk to (and we verify if we can do that as well). */
597 if (gimple_visited_p (stmt
))
599 if (stmt
!= last_store
)
603 FOR_EACH_VEC_ELT (stores
, i
, store
)
605 data_reference
*store_dr
606 = STMT_VINFO_DATA_REF (vinfo_for_stmt (store
));
607 ddr_p ddr
= initialize_data_dependence_relation
608 (dr_a
, store_dr
, vNULL
);
609 dependent
= vect_slp_analyze_data_ref_dependence (ddr
);
610 free_dependence_relation (ddr
);
617 ddr_p ddr
= initialize_data_dependence_relation (dr_a
,
619 dependent
= vect_slp_analyze_data_ref_dependence (ddr
);
620 free_dependence_relation (ddr
);
630 /* Function vect_analyze_data_ref_dependences.
632 Examine all the data references in the basic-block, and make sure there
633 do not exist any data dependences between them. Set *MAX_VF according to
634 the maximum vectorization factor the data dependences allow. */
637 vect_slp_analyze_instance_dependence (slp_instance instance
)
639 if (dump_enabled_p ())
640 dump_printf_loc (MSG_NOTE
, vect_location
,
641 "=== vect_slp_analyze_instance_dependence ===\n");
643 /* The stores of this instance are at the root of the SLP tree. */
644 slp_tree store
= SLP_INSTANCE_TREE (instance
);
645 if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store
)[0])))
648 /* Verify we can sink stores to the vectorized stmt insert location. */
649 gimple
*last_store
= NULL
;
652 if (! vect_slp_analyze_node_dependences (instance
, store
, vNULL
, NULL
))
655 /* Mark stores in this instance and remember the last one. */
656 last_store
= vect_find_last_scalar_stmt_in_slp (store
);
657 for (unsigned k
= 0; k
< SLP_INSTANCE_GROUP_SIZE (instance
); ++k
)
658 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store
)[k
], true);
663 /* Verify we can sink loads to the vectorized stmt insert location,
664 special-casing stores of this instance. */
667 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance
), i
, load
)
668 if (! vect_slp_analyze_node_dependences (instance
, load
,
670 ? SLP_TREE_SCALAR_STMTS (store
)
671 : vNULL
, last_store
))
677 /* Unset the visited flag. */
679 for (unsigned k
= 0; k
< SLP_INSTANCE_GROUP_SIZE (instance
); ++k
)
680 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store
)[k
], false);
685 /* Function vect_compute_data_ref_alignment
687 Compute the misalignment of the data reference DR.
690 1. If during the misalignment computation it is found that the data reference
691 cannot be vectorized then false is returned.
692 2. DR_MISALIGNMENT (DR) is defined.
694 FOR NOW: No analysis is actually performed. Misalignment is calculated
695 only for trivial cases. TODO. */
698 vect_compute_data_ref_alignment (struct data_reference
*dr
)
700 gimple
*stmt
= DR_STMT (dr
);
701 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
702 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
703 struct loop
*loop
= NULL
;
704 tree ref
= DR_REF (dr
);
706 tree base
, base_addr
;
707 tree misalign
= NULL_TREE
;
710 unsigned HOST_WIDE_INT alignment
;
712 if (dump_enabled_p ())
713 dump_printf_loc (MSG_NOTE
, vect_location
,
714 "vect_compute_data_ref_alignment:\n");
717 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
719 /* Initialize misalignment to unknown. */
720 SET_DR_MISALIGNMENT (dr
, -1);
722 if (tree_fits_shwi_p (DR_STEP (dr
)))
723 misalign
= DR_INIT (dr
);
724 aligned_to
= DR_ALIGNED_TO (dr
);
725 base_addr
= DR_BASE_ADDRESS (dr
);
726 vectype
= STMT_VINFO_VECTYPE (stmt_info
);
728 /* In case the dataref is in an inner-loop of the loop that is being
729 vectorized (LOOP), we use the base and misalignment information
730 relative to the outer-loop (LOOP). This is ok only if the misalignment
731 stays the same throughout the execution of the inner-loop, which is why
732 we have to check that the stride of the dataref in the inner-loop evenly
733 divides by the vector size. */
734 if (loop
&& nested_in_vect_loop_p (loop
, stmt
))
736 tree step
= DR_STEP (dr
);
738 if (tree_fits_shwi_p (step
)
739 && tree_to_shwi (step
) % GET_MODE_SIZE (TYPE_MODE (vectype
)) == 0)
741 if (dump_enabled_p ())
742 dump_printf_loc (MSG_NOTE
, vect_location
,
743 "inner step divides the vector-size.\n");
744 misalign
= STMT_VINFO_DR_INIT (stmt_info
);
745 aligned_to
= STMT_VINFO_DR_ALIGNED_TO (stmt_info
);
746 base_addr
= STMT_VINFO_DR_BASE_ADDRESS (stmt_info
);
750 if (dump_enabled_p ())
751 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
752 "inner step doesn't divide the vector-size.\n");
753 misalign
= NULL_TREE
;
757 /* Similarly we can only use base and misalignment information relative to
758 an innermost loop if the misalignment stays the same throughout the
759 execution of the loop. As above, this is the case if the stride of
760 the dataref evenly divides by the vector size. */
763 tree step
= DR_STEP (dr
);
764 unsigned vf
= loop
? LOOP_VINFO_VECT_FACTOR (loop_vinfo
) : 1;
766 if (tree_fits_shwi_p (step
)
767 && ((tree_to_shwi (step
) * vf
)
768 % GET_MODE_SIZE (TYPE_MODE (vectype
)) != 0))
770 if (dump_enabled_p ())
771 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
772 "step doesn't divide the vector-size.\n");
773 misalign
= NULL_TREE
;
777 /* To look at alignment of the base we have to preserve an inner MEM_REF
778 as that carries alignment information of the actual access. */
780 while (handled_component_p (base
))
781 base
= TREE_OPERAND (base
, 0);
782 unsigned int base_alignment
;
783 unsigned HOST_WIDE_INT base_bitpos
;
784 get_object_alignment_1 (base
, &base_alignment
, &base_bitpos
);
785 /* As data-ref analysis strips the MEM_REF down to its base operand
786 to form DR_BASE_ADDRESS and adds the offset to DR_INIT we have to
787 adjust things to make base_alignment valid as the alignment of
789 if (TREE_CODE (base
) == MEM_REF
)
791 base_bitpos
-= mem_ref_offset (base
).to_short_addr () * BITS_PER_UNIT
;
792 base_bitpos
&= (base_alignment
- 1);
794 if (base_bitpos
!= 0)
795 base_alignment
= base_bitpos
& -base_bitpos
;
796 /* Also look at the alignment of the base address DR analysis
798 unsigned int base_addr_alignment
= get_pointer_alignment (base_addr
);
799 if (base_addr_alignment
> base_alignment
)
800 base_alignment
= base_addr_alignment
;
802 if (base_alignment
>= TYPE_ALIGN (TREE_TYPE (vectype
)))
803 DR_VECT_AUX (dr
)->base_element_aligned
= true;
805 alignment
= TYPE_ALIGN_UNIT (vectype
);
807 if ((compare_tree_int (aligned_to
, alignment
) < 0)
810 if (dump_enabled_p ())
812 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
813 "Unknown alignment for access: ");
814 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, ref
);
815 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
820 if (base_alignment
< TYPE_ALIGN (vectype
))
823 if (TREE_CODE (base
) == ADDR_EXPR
)
824 base
= TREE_OPERAND (base
, 0);
825 if (!vect_can_force_dr_alignment_p (base
, TYPE_ALIGN (vectype
)))
827 if (dump_enabled_p ())
829 dump_printf_loc (MSG_NOTE
, vect_location
,
830 "can't force alignment of ref: ");
831 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, ref
);
832 dump_printf (MSG_NOTE
, "\n");
837 if (DECL_USER_ALIGN (base
))
839 if (dump_enabled_p ())
841 dump_printf_loc (MSG_NOTE
, vect_location
,
842 "not forcing alignment of user-aligned "
844 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, base
);
845 dump_printf (MSG_NOTE
, "\n");
850 /* Force the alignment of the decl.
851 NOTE: This is the only change to the code we make during
852 the analysis phase, before deciding to vectorize the loop. */
853 if (dump_enabled_p ())
855 dump_printf_loc (MSG_NOTE
, vect_location
, "force alignment of ");
856 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, ref
);
857 dump_printf (MSG_NOTE
, "\n");
860 DR_VECT_AUX (dr
)->base_decl
= base
;
861 DR_VECT_AUX (dr
)->base_misaligned
= true;
862 DR_VECT_AUX (dr
)->base_element_aligned
= true;
865 if (loop
&& nested_in_vect_loop_p (loop
, stmt
))
866 step
= STMT_VINFO_DR_STEP (stmt_info
);
869 /* If this is a backward running DR then first access in the larger
870 vectype actually is N-1 elements before the address in the DR.
871 Adjust misalign accordingly. */
872 if (tree_int_cst_sgn (step
) < 0)
874 tree offset
= ssize_int (TYPE_VECTOR_SUBPARTS (vectype
) - 1);
875 /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
876 otherwise we wouldn't be here. */
877 offset
= fold_build2 (MULT_EXPR
, ssizetype
, offset
, step
);
878 /* PLUS because STEP was negative. */
879 misalign
= size_binop (PLUS_EXPR
, misalign
, offset
);
882 SET_DR_MISALIGNMENT (dr
,
883 wi::mod_floor (misalign
, alignment
, SIGNED
).to_uhwi ());
885 if (dump_enabled_p ())
887 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
888 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr
));
889 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, ref
);
890 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
897 /* Function vect_update_misalignment_for_peel
899 DR - the data reference whose misalignment is to be adjusted.
900 DR_PEEL - the data reference whose misalignment is being made
901 zero in the vector loop by the peel.
902 NPEEL - the number of iterations in the peel loop if the misalignment
903 of DR_PEEL is known at compile time. */
906 vect_update_misalignment_for_peel (struct data_reference
*dr
,
907 struct data_reference
*dr_peel
, int npeel
)
910 vec
<dr_p
> same_align_drs
;
911 struct data_reference
*current_dr
;
912 int dr_size
= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr
))));
913 int dr_peel_size
= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel
))));
914 stmt_vec_info stmt_info
= vinfo_for_stmt (DR_STMT (dr
));
915 stmt_vec_info peel_stmt_info
= vinfo_for_stmt (DR_STMT (dr_peel
));
917 /* For interleaved data accesses the step in the loop must be multiplied by
918 the size of the interleaving group. */
919 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
920 dr_size
*= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info
)));
921 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info
))
922 dr_peel_size
*= GROUP_SIZE (peel_stmt_info
);
924 /* It can be assumed that the data refs with the same alignment as dr_peel
925 are aligned in the vector loop. */
927 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel
)));
928 FOR_EACH_VEC_ELT (same_align_drs
, i
, current_dr
)
930 if (current_dr
!= dr
)
932 gcc_assert (DR_MISALIGNMENT (dr
) / dr_size
==
933 DR_MISALIGNMENT (dr_peel
) / dr_peel_size
);
934 SET_DR_MISALIGNMENT (dr
, 0);
938 if (known_alignment_for_access_p (dr
)
939 && known_alignment_for_access_p (dr_peel
))
941 bool negative
= tree_int_cst_compare (DR_STEP (dr
), size_zero_node
) < 0;
942 int misal
= DR_MISALIGNMENT (dr
);
943 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
944 misal
+= negative
? -npeel
* dr_size
: npeel
* dr_size
;
945 misal
&= (TYPE_ALIGN (vectype
) / BITS_PER_UNIT
) - 1;
946 SET_DR_MISALIGNMENT (dr
, misal
);
950 if (dump_enabled_p ())
951 dump_printf_loc (MSG_NOTE
, vect_location
, "Setting misalignment to -1.\n");
952 SET_DR_MISALIGNMENT (dr
, -1);
956 /* Function verify_data_ref_alignment
958 Return TRUE if DR can be handled with respect to alignment. */
961 verify_data_ref_alignment (data_reference_p dr
)
963 enum dr_alignment_support supportable_dr_alignment
964 = vect_supportable_dr_alignment (dr
, false);
965 if (!supportable_dr_alignment
)
967 if (dump_enabled_p ())
970 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
971 "not vectorized: unsupported unaligned load.");
973 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
974 "not vectorized: unsupported unaligned "
977 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
979 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
984 if (supportable_dr_alignment
!= dr_aligned
&& dump_enabled_p ())
985 dump_printf_loc (MSG_NOTE
, vect_location
,
986 "Vectorizing an unaligned access.\n");
991 /* Function vect_verify_datarefs_alignment
993 Return TRUE if all data references in the loop can be
994 handled with respect to alignment. */
997 vect_verify_datarefs_alignment (loop_vec_info vinfo
)
999 vec
<data_reference_p
> datarefs
= vinfo
->datarefs
;
1000 struct data_reference
*dr
;
1003 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
1005 gimple
*stmt
= DR_STMT (dr
);
1006 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
1008 if (!STMT_VINFO_RELEVANT_P (stmt_info
))
1011 /* For interleaving, only the alignment of the first access matters. */
1012 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1013 && GROUP_FIRST_ELEMENT (stmt_info
) != stmt
)
1016 /* Strided accesses perform only component accesses, alignment is
1017 irrelevant for them. */
1018 if (STMT_VINFO_STRIDED_P (stmt_info
)
1019 && !STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1022 if (! verify_data_ref_alignment (dr
))
1029 /* Given an memory reference EXP return whether its alignment is less
1033 not_size_aligned (tree exp
)
1035 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp
))))
1038 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp
)))
1039 > get_object_alignment (exp
));
1042 /* Function vector_alignment_reachable_p
1044 Return true if vector alignment for DR is reachable by peeling
1045 a few loop iterations. Return false otherwise. */
1048 vector_alignment_reachable_p (struct data_reference
*dr
)
1050 gimple
*stmt
= DR_STMT (dr
);
1051 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
1052 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
1054 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1056 /* For interleaved access we peel only if number of iterations in
1057 the prolog loop ({VF - misalignment}), is a multiple of the
1058 number of the interleaved accesses. */
1059 int elem_size
, mis_in_elements
;
1060 int nelements
= TYPE_VECTOR_SUBPARTS (vectype
);
1062 /* FORNOW: handle only known alignment. */
1063 if (!known_alignment_for_access_p (dr
))
1066 elem_size
= GET_MODE_SIZE (TYPE_MODE (vectype
)) / nelements
;
1067 mis_in_elements
= DR_MISALIGNMENT (dr
) / elem_size
;
1069 if ((nelements
- mis_in_elements
) % GROUP_SIZE (stmt_info
))
1073 /* If misalignment is known at the compile time then allow peeling
1074 only if natural alignment is reachable through peeling. */
1075 if (known_alignment_for_access_p (dr
) && !aligned_access_p (dr
))
1077 HOST_WIDE_INT elmsize
=
1078 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype
)));
1079 if (dump_enabled_p ())
1081 dump_printf_loc (MSG_NOTE
, vect_location
,
1082 "data size =" HOST_WIDE_INT_PRINT_DEC
, elmsize
);
1083 dump_printf (MSG_NOTE
,
1084 ". misalignment = %d.\n", DR_MISALIGNMENT (dr
));
1086 if (DR_MISALIGNMENT (dr
) % elmsize
)
1088 if (dump_enabled_p ())
1089 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1090 "data size does not divide the misalignment.\n");
1095 if (!known_alignment_for_access_p (dr
))
1097 tree type
= TREE_TYPE (DR_REF (dr
));
1098 bool is_packed
= not_size_aligned (DR_REF (dr
));
1099 if (dump_enabled_p ())
1100 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1101 "Unknown misalignment, is_packed = %d\n",is_packed
);
1102 if ((TYPE_USER_ALIGN (type
) && !is_packed
)
1103 || targetm
.vectorize
.vector_alignment_reachable (type
, is_packed
))
1113 /* Calculate the cost of the memory access represented by DR. */
1116 vect_get_data_access_cost (struct data_reference
*dr
,
1117 unsigned int *inside_cost
,
1118 unsigned int *outside_cost
,
1119 stmt_vector_for_cost
*body_cost_vec
)
1121 gimple
*stmt
= DR_STMT (dr
);
1122 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
1123 int nunits
= TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info
));
1124 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
1125 int vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
1126 int ncopies
= vf
/ nunits
;
1128 if (DR_IS_READ (dr
))
1129 vect_get_load_cost (dr
, ncopies
, true, inside_cost
, outside_cost
,
1130 NULL
, body_cost_vec
, false);
1132 vect_get_store_cost (dr
, ncopies
, inside_cost
, body_cost_vec
);
1134 if (dump_enabled_p ())
1135 dump_printf_loc (MSG_NOTE
, vect_location
,
1136 "vect_get_data_access_cost: inside_cost = %d, "
1137 "outside_cost = %d.\n", *inside_cost
, *outside_cost
);
1141 typedef struct _vect_peel_info
1144 struct data_reference
*dr
;
1148 typedef struct _vect_peel_extended_info
1150 struct _vect_peel_info peel_info
;
1151 unsigned int inside_cost
;
1152 unsigned int outside_cost
;
1153 stmt_vector_for_cost body_cost_vec
;
1154 } *vect_peel_extended_info
;
1157 /* Peeling hashtable helpers. */
1159 struct peel_info_hasher
: free_ptr_hash
<_vect_peel_info
>
1161 static inline hashval_t
hash (const _vect_peel_info
*);
1162 static inline bool equal (const _vect_peel_info
*, const _vect_peel_info
*);
1166 peel_info_hasher::hash (const _vect_peel_info
*peel_info
)
1168 return (hashval_t
) peel_info
->npeel
;
1172 peel_info_hasher::equal (const _vect_peel_info
*a
, const _vect_peel_info
*b
)
1174 return (a
->npeel
== b
->npeel
);
1178 /* Insert DR into peeling hash table with NPEEL as key. */
1181 vect_peeling_hash_insert (hash_table
<peel_info_hasher
> *peeling_htab
,
1182 loop_vec_info loop_vinfo
, struct data_reference
*dr
,
1185 struct _vect_peel_info elem
, *slot
;
1186 _vect_peel_info
**new_slot
;
1187 bool supportable_dr_alignment
= vect_supportable_dr_alignment (dr
, true);
1190 slot
= peeling_htab
->find (&elem
);
1195 slot
= XNEW (struct _vect_peel_info
);
1196 slot
->npeel
= npeel
;
1199 new_slot
= peeling_htab
->find_slot (slot
, INSERT
);
1203 if (!supportable_dr_alignment
1204 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo
)))
1205 slot
->count
+= VECT_MAX_COST
;
1209 /* Traverse peeling hash table to find peeling option that aligns maximum
1210 number of data accesses. */
1213 vect_peeling_hash_get_most_frequent (_vect_peel_info
**slot
,
1214 _vect_peel_extended_info
*max
)
1216 vect_peel_info elem
= *slot
;
1218 if (elem
->count
> max
->peel_info
.count
1219 || (elem
->count
== max
->peel_info
.count
1220 && max
->peel_info
.npeel
> elem
->npeel
))
1222 max
->peel_info
.npeel
= elem
->npeel
;
1223 max
->peel_info
.count
= elem
->count
;
1224 max
->peel_info
.dr
= elem
->dr
;
1231 /* Traverse peeling hash table and calculate cost for each peeling option.
1232 Find the one with the lowest cost. */
1235 vect_peeling_hash_get_lowest_cost (_vect_peel_info
**slot
,
1236 _vect_peel_extended_info
*min
)
1238 vect_peel_info elem
= *slot
;
1239 int save_misalignment
, dummy
;
1240 unsigned int inside_cost
= 0, outside_cost
= 0, i
;
1241 gimple
*stmt
= DR_STMT (elem
->dr
);
1242 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
1243 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
1244 vec
<data_reference_p
> datarefs
= LOOP_VINFO_DATAREFS (loop_vinfo
);
1245 struct data_reference
*dr
;
1246 stmt_vector_for_cost prologue_cost_vec
, body_cost_vec
, epilogue_cost_vec
;
1248 prologue_cost_vec
.create (2);
1249 body_cost_vec
.create (2);
1250 epilogue_cost_vec
.create (2);
1252 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
1254 stmt
= DR_STMT (dr
);
1255 stmt_info
= vinfo_for_stmt (stmt
);
1256 /* For interleaving, only the alignment of the first access
1258 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1259 && GROUP_FIRST_ELEMENT (stmt_info
) != stmt
)
1262 /* Strided accesses perform only component accesses, alignment is
1263 irrelevant for them. */
1264 if (STMT_VINFO_STRIDED_P (stmt_info
)
1265 && !STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1268 save_misalignment
= DR_MISALIGNMENT (dr
);
1269 vect_update_misalignment_for_peel (dr
, elem
->dr
, elem
->npeel
);
1270 vect_get_data_access_cost (dr
, &inside_cost
, &outside_cost
,
1272 SET_DR_MISALIGNMENT (dr
, save_misalignment
);
1275 outside_cost
+= vect_get_known_peeling_cost
1276 (loop_vinfo
, elem
->npeel
, &dummy
,
1277 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo
),
1278 &prologue_cost_vec
, &epilogue_cost_vec
);
1280 /* Prologue and epilogue costs are added to the target model later.
1281 These costs depend only on the scalar iteration cost, the
1282 number of peeling iterations finally chosen, and the number of
1283 misaligned statements. So discard the information found here. */
1284 prologue_cost_vec
.release ();
1285 epilogue_cost_vec
.release ();
1287 if (inside_cost
< min
->inside_cost
1288 || (inside_cost
== min
->inside_cost
&& outside_cost
< min
->outside_cost
))
1290 min
->inside_cost
= inside_cost
;
1291 min
->outside_cost
= outside_cost
;
1292 min
->body_cost_vec
.release ();
1293 min
->body_cost_vec
= body_cost_vec
;
1294 min
->peel_info
.dr
= elem
->dr
;
1295 min
->peel_info
.npeel
= elem
->npeel
;
1298 body_cost_vec
.release ();
1304 /* Choose best peeling option by traversing peeling hash table and either
1305 choosing an option with the lowest cost (if cost model is enabled) or the
1306 option that aligns as many accesses as possible. */
1308 static struct data_reference
*
1309 vect_peeling_hash_choose_best_peeling (hash_table
<peel_info_hasher
> *peeling_htab
,
1310 loop_vec_info loop_vinfo
,
1311 unsigned int *npeel
,
1312 stmt_vector_for_cost
*body_cost_vec
)
1314 struct _vect_peel_extended_info res
;
1316 res
.peel_info
.dr
= NULL
;
1317 res
.body_cost_vec
= stmt_vector_for_cost ();
1319 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo
)))
1321 res
.inside_cost
= INT_MAX
;
1322 res
.outside_cost
= INT_MAX
;
1323 peeling_htab
->traverse
<_vect_peel_extended_info
*,
1324 vect_peeling_hash_get_lowest_cost
> (&res
);
1328 res
.peel_info
.count
= 0;
1329 peeling_htab
->traverse
<_vect_peel_extended_info
*,
1330 vect_peeling_hash_get_most_frequent
> (&res
);
1333 *npeel
= res
.peel_info
.npeel
;
1334 *body_cost_vec
= res
.body_cost_vec
;
1335 return res
.peel_info
.dr
;
1339 /* Function vect_enhance_data_refs_alignment
1341 This pass will use loop versioning and loop peeling in order to enhance
1342 the alignment of data references in the loop.
1344 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1345 original loop is to be vectorized. Any other loops that are created by
1346 the transformations performed in this pass - are not supposed to be
1347 vectorized. This restriction will be relaxed.
1349 This pass will require a cost model to guide it whether to apply peeling
1350 or versioning or a combination of the two. For example, the scheme that
1351 intel uses when given a loop with several memory accesses, is as follows:
1352 choose one memory access ('p') which alignment you want to force by doing
1353 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1354 other accesses are not necessarily aligned, or (2) use loop versioning to
1355 generate one loop in which all accesses are aligned, and another loop in
1356 which only 'p' is necessarily aligned.
1358 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1359 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
1360 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1362 Devising a cost model is the most critical aspect of this work. It will
1363 guide us on which access to peel for, whether to use loop versioning, how
1364 many versions to create, etc. The cost model will probably consist of
1365 generic considerations as well as target specific considerations (on
1366 powerpc for example, misaligned stores are more painful than misaligned
1369 Here are the general steps involved in alignment enhancements:
1371 -- original loop, before alignment analysis:
1372 for (i=0; i<N; i++){
1373 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1374 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1377 -- After vect_compute_data_refs_alignment:
1378 for (i=0; i<N; i++){
1379 x = q[i]; # DR_MISALIGNMENT(q) = 3
1380 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1383 -- Possibility 1: we do loop versioning:
1385 for (i=0; i<N; i++){ # loop 1A
1386 x = q[i]; # DR_MISALIGNMENT(q) = 3
1387 p[i] = y; # DR_MISALIGNMENT(p) = 0
1391 for (i=0; i<N; i++){ # loop 1B
1392 x = q[i]; # DR_MISALIGNMENT(q) = 3
1393 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1397 -- Possibility 2: we do loop peeling:
1398 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1402 for (i = 3; i < N; i++){ # loop 2A
1403 x = q[i]; # DR_MISALIGNMENT(q) = 0
1404 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1407 -- Possibility 3: combination of loop peeling and versioning:
1408 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1413 for (i = 3; i<N; i++){ # loop 3A
1414 x = q[i]; # DR_MISALIGNMENT(q) = 0
1415 p[i] = y; # DR_MISALIGNMENT(p) = 0
1419 for (i = 3; i<N; i++){ # loop 3B
1420 x = q[i]; # DR_MISALIGNMENT(q) = 0
1421 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1425 These loops are later passed to loop_transform to be vectorized. The
1426 vectorizer will use the alignment information to guide the transformation
1427 (whether to generate regular loads/stores, or with special handling for
1431 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo
)
1433 vec
<data_reference_p
> datarefs
= LOOP_VINFO_DATAREFS (loop_vinfo
);
1434 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1435 enum dr_alignment_support supportable_dr_alignment
;
1436 struct data_reference
*dr0
= NULL
, *first_store
= NULL
;
1437 struct data_reference
*dr
;
1439 bool do_peeling
= false;
1440 bool do_versioning
= false;
1443 stmt_vec_info stmt_info
;
1444 unsigned int npeel
= 0;
1445 bool all_misalignments_unknown
= true;
1446 unsigned int vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
1447 unsigned possible_npeel_number
= 1;
1449 unsigned int nelements
, mis
, same_align_drs_max
= 0;
1450 stmt_vector_for_cost body_cost_vec
= stmt_vector_for_cost ();
1451 hash_table
<peel_info_hasher
> peeling_htab (1);
1453 if (dump_enabled_p ())
1454 dump_printf_loc (MSG_NOTE
, vect_location
,
1455 "=== vect_enhance_data_refs_alignment ===\n");
1457 /* Reset data so we can safely be called multiple times. */
1458 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).truncate (0);
1459 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
) = 0;
1461 /* While cost model enhancements are expected in the future, the high level
1462 view of the code at this time is as follows:
1464 A) If there is a misaligned access then see if peeling to align
1465 this access can make all data references satisfy
1466 vect_supportable_dr_alignment. If so, update data structures
1467 as needed and return true.
1469 B) If peeling wasn't possible and there is a data reference with an
1470 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1471 then see if loop versioning checks can be used to make all data
1472 references satisfy vect_supportable_dr_alignment. If so, update
1473 data structures as needed and return true.
1475 C) If neither peeling nor versioning were successful then return false if
1476 any data reference does not satisfy vect_supportable_dr_alignment.
1478 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1480 Note, Possibility 3 above (which is peeling and versioning together) is not
1481 being done at this time. */
1483 /* (1) Peeling to force alignment. */
1485 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1487 + How many accesses will become aligned due to the peeling
1488 - How many accesses will become unaligned due to the peeling,
1489 and the cost of misaligned accesses.
1490 - The cost of peeling (the extra runtime checks, the increase
1493 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
1495 stmt
= DR_STMT (dr
);
1496 stmt_info
= vinfo_for_stmt (stmt
);
1498 if (!STMT_VINFO_RELEVANT_P (stmt_info
))
1501 /* For interleaving, only the alignment of the first access
1503 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1504 && GROUP_FIRST_ELEMENT (stmt_info
) != stmt
)
1507 /* For invariant accesses there is nothing to enhance. */
1508 if (integer_zerop (DR_STEP (dr
)))
1511 /* Strided accesses perform only component accesses, alignment is
1512 irrelevant for them. */
1513 if (STMT_VINFO_STRIDED_P (stmt_info
)
1514 && !STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1517 supportable_dr_alignment
= vect_supportable_dr_alignment (dr
, true);
1518 do_peeling
= vector_alignment_reachable_p (dr
);
1521 if (known_alignment_for_access_p (dr
))
1523 unsigned int npeel_tmp
;
1524 bool negative
= tree_int_cst_compare (DR_STEP (dr
),
1525 size_zero_node
) < 0;
1527 /* Save info about DR in the hash table. */
1528 vectype
= STMT_VINFO_VECTYPE (stmt_info
);
1529 nelements
= TYPE_VECTOR_SUBPARTS (vectype
);
1530 mis
= DR_MISALIGNMENT (dr
) / GET_MODE_SIZE (TYPE_MODE (
1531 TREE_TYPE (DR_REF (dr
))));
1532 npeel_tmp
= (negative
1533 ? (mis
- nelements
) : (nelements
- mis
))
1536 /* For multiple types, it is possible that the bigger type access
1537 will have more than one peeling option. E.g., a loop with two
1538 types: one of size (vector size / 4), and the other one of
1539 size (vector size / 8). Vectorization factor will 8. If both
1540 access are misaligned by 3, the first one needs one scalar
1541 iteration to be aligned, and the second one needs 5. But the
1542 first one will be aligned also by peeling 5 scalar
1543 iterations, and in that case both accesses will be aligned.
1544 Hence, except for the immediate peeling amount, we also want
1545 to try to add full vector size, while we don't exceed
1546 vectorization factor.
1547 We do this automtically for cost model, since we calculate cost
1548 for every peeling option. */
1549 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo
)))
1551 if (STMT_SLP_TYPE (stmt_info
))
1552 possible_npeel_number
1553 = (vf
* GROUP_SIZE (stmt_info
)) / nelements
;
1555 possible_npeel_number
= vf
/ nelements
;
1558 /* Handle the aligned case. We may decide to align some other
1559 access, making DR unaligned. */
1560 if (DR_MISALIGNMENT (dr
) == 0)
1563 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo
)))
1564 possible_npeel_number
++;
1567 for (j
= 0; j
< possible_npeel_number
; j
++)
1569 vect_peeling_hash_insert (&peeling_htab
, loop_vinfo
,
1571 npeel_tmp
+= nelements
;
1574 all_misalignments_unknown
= false;
1575 /* Data-ref that was chosen for the case that all the
1576 misalignments are unknown is not relevant anymore, since we
1577 have a data-ref with known alignment. */
1582 /* If we don't know any misalignment values, we prefer
1583 peeling for data-ref that has the maximum number of data-refs
1584 with the same alignment, unless the target prefers to align
1585 stores over load. */
1586 if (all_misalignments_unknown
)
1588 unsigned same_align_drs
1589 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info
).length ();
1591 || same_align_drs_max
< same_align_drs
)
1593 same_align_drs_max
= same_align_drs
;
1596 /* For data-refs with the same number of related
1597 accesses prefer the one where the misalign
1598 computation will be invariant in the outermost loop. */
1599 else if (same_align_drs_max
== same_align_drs
)
1601 struct loop
*ivloop0
, *ivloop
;
1602 ivloop0
= outermost_invariant_loop_for_expr
1603 (loop
, DR_BASE_ADDRESS (dr0
));
1604 ivloop
= outermost_invariant_loop_for_expr
1605 (loop
, DR_BASE_ADDRESS (dr
));
1606 if ((ivloop
&& !ivloop0
)
1607 || (ivloop
&& ivloop0
1608 && flow_loop_nested_p (ivloop
, ivloop0
)))
1612 if (!first_store
&& DR_IS_WRITE (dr
))
1616 /* If there are both known and unknown misaligned accesses in the
1617 loop, we choose peeling amount according to the known
1619 if (!supportable_dr_alignment
)
1622 if (!first_store
&& DR_IS_WRITE (dr
))
1629 if (!aligned_access_p (dr
))
1631 if (dump_enabled_p ())
1632 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1633 "vector alignment may not be reachable\n");
1639 /* Check if we can possibly peel the loop. */
1640 if (!vect_can_advance_ivs_p (loop_vinfo
)
1641 || !slpeel_can_duplicate_loop_p (loop
, single_exit (loop
))
1646 && all_misalignments_unknown
1647 && vect_supportable_dr_alignment (dr0
, false))
1649 /* Check if the target requires to prefer stores over loads, i.e., if
1650 misaligned stores are more expensive than misaligned loads (taking
1651 drs with same alignment into account). */
1652 if (first_store
&& DR_IS_READ (dr0
))
1654 unsigned int load_inside_cost
= 0, load_outside_cost
= 0;
1655 unsigned int store_inside_cost
= 0, store_outside_cost
= 0;
1656 unsigned int load_inside_penalty
= 0, load_outside_penalty
= 0;
1657 unsigned int store_inside_penalty
= 0, store_outside_penalty
= 0;
1658 stmt_vector_for_cost dummy
;
1661 vect_get_data_access_cost (dr0
, &load_inside_cost
, &load_outside_cost
,
1663 vect_get_data_access_cost (first_store
, &store_inside_cost
,
1664 &store_outside_cost
, &dummy
);
1668 /* Calculate the penalty for leaving FIRST_STORE unaligned (by
1669 aligning the load DR0). */
1670 load_inside_penalty
= store_inside_cost
;
1671 load_outside_penalty
= store_outside_cost
;
1673 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1674 DR_STMT (first_store
))).iterate (i
, &dr
);
1676 if (DR_IS_READ (dr
))
1678 load_inside_penalty
+= load_inside_cost
;
1679 load_outside_penalty
+= load_outside_cost
;
1683 load_inside_penalty
+= store_inside_cost
;
1684 load_outside_penalty
+= store_outside_cost
;
1687 /* Calculate the penalty for leaving DR0 unaligned (by
1688 aligning the FIRST_STORE). */
1689 store_inside_penalty
= load_inside_cost
;
1690 store_outside_penalty
= load_outside_cost
;
1692 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1693 DR_STMT (dr0
))).iterate (i
, &dr
);
1695 if (DR_IS_READ (dr
))
1697 store_inside_penalty
+= load_inside_cost
;
1698 store_outside_penalty
+= load_outside_cost
;
1702 store_inside_penalty
+= store_inside_cost
;
1703 store_outside_penalty
+= store_outside_cost
;
1706 if (load_inside_penalty
> store_inside_penalty
1707 || (load_inside_penalty
== store_inside_penalty
1708 && load_outside_penalty
> store_outside_penalty
))
1712 /* In case there are only loads with different unknown misalignments, use
1713 peeling only if it may help to align other accesses in the loop or
1714 if it may help improving load bandwith when we'd end up using
1716 tree dr0_vt
= STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr0
)));
1718 && !STMT_VINFO_SAME_ALIGN_REFS (
1719 vinfo_for_stmt (DR_STMT (dr0
))).length ()
1720 && (vect_supportable_dr_alignment (dr0
, false)
1721 != dr_unaligned_supported
1722 || (builtin_vectorization_cost (vector_load
, dr0_vt
, 0)
1723 == builtin_vectorization_cost (unaligned_load
, dr0_vt
, -1))))
1727 if (do_peeling
&& !dr0
)
1729 /* Peeling is possible, but there is no data access that is not supported
1730 unless aligned. So we try to choose the best possible peeling. */
1732 /* We should get here only if there are drs with known misalignment. */
1733 gcc_assert (!all_misalignments_unknown
);
1735 /* Choose the best peeling from the hash table. */
1736 dr0
= vect_peeling_hash_choose_best_peeling (&peeling_htab
,
1745 stmt
= DR_STMT (dr0
);
1746 stmt_info
= vinfo_for_stmt (stmt
);
1747 vectype
= STMT_VINFO_VECTYPE (stmt_info
);
1748 nelements
= TYPE_VECTOR_SUBPARTS (vectype
);
1750 if (known_alignment_for_access_p (dr0
))
1752 bool negative
= tree_int_cst_compare (DR_STEP (dr0
),
1753 size_zero_node
) < 0;
1756 /* Since it's known at compile time, compute the number of
1757 iterations in the peeled loop (the peeling factor) for use in
1758 updating DR_MISALIGNMENT values. The peeling factor is the
1759 vectorization factor minus the misalignment as an element
1761 mis
= DR_MISALIGNMENT (dr0
);
1762 mis
/= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0
))));
1763 npeel
= ((negative
? mis
- nelements
: nelements
- mis
)
1767 /* For interleaved data access every iteration accesses all the
1768 members of the group, therefore we divide the number of iterations
1769 by the group size. */
1770 stmt_info
= vinfo_for_stmt (DR_STMT (dr0
));
1771 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1772 npeel
/= GROUP_SIZE (stmt_info
);
1774 if (dump_enabled_p ())
1775 dump_printf_loc (MSG_NOTE
, vect_location
,
1776 "Try peeling by %d\n", npeel
);
1779 /* Ensure that all data refs can be vectorized after the peel. */
1780 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
1782 int save_misalignment
;
1787 stmt
= DR_STMT (dr
);
1788 stmt_info
= vinfo_for_stmt (stmt
);
1789 /* For interleaving, only the alignment of the first access
1791 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1792 && GROUP_FIRST_ELEMENT (stmt_info
) != stmt
)
1795 /* Strided accesses perform only component accesses, alignment is
1796 irrelevant for them. */
1797 if (STMT_VINFO_STRIDED_P (stmt_info
)
1798 && !STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1801 save_misalignment
= DR_MISALIGNMENT (dr
);
1802 vect_update_misalignment_for_peel (dr
, dr0
, npeel
);
1803 supportable_dr_alignment
= vect_supportable_dr_alignment (dr
, false);
1804 SET_DR_MISALIGNMENT (dr
, save_misalignment
);
1806 if (!supportable_dr_alignment
)
1813 if (do_peeling
&& known_alignment_for_access_p (dr0
) && npeel
== 0)
1815 stat
= vect_verify_datarefs_alignment (loop_vinfo
);
1820 body_cost_vec
.release ();
1825 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
1828 unsigned max_allowed_peel
1829 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT
);
1830 if (max_allowed_peel
!= (unsigned)-1)
1832 unsigned max_peel
= npeel
;
1835 gimple
*dr_stmt
= DR_STMT (dr0
);
1836 stmt_vec_info vinfo
= vinfo_for_stmt (dr_stmt
);
1837 tree vtype
= STMT_VINFO_VECTYPE (vinfo
);
1838 max_peel
= TYPE_VECTOR_SUBPARTS (vtype
) - 1;
1840 if (max_peel
> max_allowed_peel
)
1843 if (dump_enabled_p ())
1844 dump_printf_loc (MSG_NOTE
, vect_location
,
1845 "Disable peeling, max peels reached: %d\n", max_peel
);
1850 /* Cost model #2 - if peeling may result in a remaining loop not
1851 iterating enough to be vectorized then do not peel. */
1853 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
))
1856 = npeel
== 0 ? LOOP_VINFO_VECT_FACTOR (loop_vinfo
) - 1 : npeel
;
1857 if (LOOP_VINFO_INT_NITERS (loop_vinfo
)
1858 < LOOP_VINFO_VECT_FACTOR (loop_vinfo
) + max_peel
)
1864 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1865 If the misalignment of DR_i is identical to that of dr0 then set
1866 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
1867 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1868 by the peeling factor times the element size of DR_i (MOD the
1869 vectorization factor times the size). Otherwise, the
1870 misalignment of DR_i must be set to unknown. */
1871 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
1874 /* Strided accesses perform only component accesses, alignment
1875 is irrelevant for them. */
1876 stmt_info
= vinfo_for_stmt (DR_STMT (dr
));
1877 if (STMT_VINFO_STRIDED_P (stmt_info
)
1878 && !STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1881 vect_update_misalignment_for_peel (dr
, dr0
, npeel
);
1884 LOOP_VINFO_UNALIGNED_DR (loop_vinfo
) = dr0
;
1886 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
) = npeel
;
1888 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
)
1889 = DR_MISALIGNMENT (dr0
);
1890 SET_DR_MISALIGNMENT (dr0
, 0);
1891 if (dump_enabled_p ())
1893 dump_printf_loc (MSG_NOTE
, vect_location
,
1894 "Alignment of access forced using peeling.\n");
1895 dump_printf_loc (MSG_NOTE
, vect_location
,
1896 "Peeling for alignment will be applied.\n");
1898 /* The inside-loop cost will be accounted for in vectorizable_load
1899 and vectorizable_store correctly with adjusted alignments.
1900 Drop the body_cst_vec on the floor here. */
1901 body_cost_vec
.release ();
1903 stat
= vect_verify_datarefs_alignment (loop_vinfo
);
1909 body_cost_vec
.release ();
1911 /* (2) Versioning to force alignment. */
1913 /* Try versioning if:
1914 1) optimize loop for speed
1915 2) there is at least one unsupported misaligned data ref with an unknown
1917 3) all misaligned data refs with a known misalignment are supported, and
1918 4) the number of runtime alignment checks is within reason. */
1921 optimize_loop_nest_for_speed_p (loop
)
1922 && (!loop
->inner
); /* FORNOW */
1926 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
1928 stmt
= DR_STMT (dr
);
1929 stmt_info
= vinfo_for_stmt (stmt
);
1931 /* For interleaving, only the alignment of the first access
1933 if (aligned_access_p (dr
)
1934 || (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1935 && GROUP_FIRST_ELEMENT (stmt_info
) != stmt
))
1938 if (STMT_VINFO_STRIDED_P (stmt_info
))
1940 /* Strided loads perform only component accesses, alignment is
1941 irrelevant for them. */
1942 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1944 do_versioning
= false;
1948 supportable_dr_alignment
= vect_supportable_dr_alignment (dr
, false);
1950 if (!supportable_dr_alignment
)
1956 if (known_alignment_for_access_p (dr
)
1957 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).length ()
1958 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS
))
1960 do_versioning
= false;
1964 stmt
= DR_STMT (dr
);
1965 vectype
= STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt
));
1966 gcc_assert (vectype
);
1968 /* The rightmost bits of an aligned address must be zeros.
1969 Construct the mask needed for this test. For example,
1970 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
1971 mask must be 15 = 0xf. */
1972 mask
= GET_MODE_SIZE (TYPE_MODE (vectype
)) - 1;
1974 /* FORNOW: use the same mask to test all potentially unaligned
1975 references in the loop. The vectorizer currently supports
1976 a single vector size, see the reference to
1977 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
1978 vectorization factor is computed. */
1979 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo
)
1980 || LOOP_VINFO_PTR_MASK (loop_vinfo
) == mask
);
1981 LOOP_VINFO_PTR_MASK (loop_vinfo
) = mask
;
1982 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).safe_push (
1987 /* Versioning requires at least one misaligned data reference. */
1988 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo
))
1989 do_versioning
= false;
1990 else if (!do_versioning
)
1991 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).truncate (0);
1996 vec
<gimple
*> may_misalign_stmts
1997 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
);
2000 /* It can now be assumed that the data references in the statements
2001 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2002 of the loop being vectorized. */
2003 FOR_EACH_VEC_ELT (may_misalign_stmts
, i
, stmt
)
2005 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
2006 dr
= STMT_VINFO_DATA_REF (stmt_info
);
2007 SET_DR_MISALIGNMENT (dr
, 0);
2008 if (dump_enabled_p ())
2009 dump_printf_loc (MSG_NOTE
, vect_location
,
2010 "Alignment of access forced using versioning.\n");
2013 if (dump_enabled_p ())
2014 dump_printf_loc (MSG_NOTE
, vect_location
,
2015 "Versioning for alignment will be applied.\n");
2017 /* Peeling and versioning can't be done together at this time. */
2018 gcc_assert (! (do_peeling
&& do_versioning
));
2020 stat
= vect_verify_datarefs_alignment (loop_vinfo
);
2025 /* This point is reached if neither peeling nor versioning is being done. */
2026 gcc_assert (! (do_peeling
|| do_versioning
));
2028 stat
= vect_verify_datarefs_alignment (loop_vinfo
);
2033 /* Function vect_find_same_alignment_drs.
2035 Update group and alignment relations according to the chosen
2036 vectorization factor. */
2039 vect_find_same_alignment_drs (struct data_dependence_relation
*ddr
,
2040 loop_vec_info loop_vinfo
)
2043 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
2044 int vectorization_factor
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
2045 struct data_reference
*dra
= DDR_A (ddr
);
2046 struct data_reference
*drb
= DDR_B (ddr
);
2047 stmt_vec_info stmtinfo_a
= vinfo_for_stmt (DR_STMT (dra
));
2048 stmt_vec_info stmtinfo_b
= vinfo_for_stmt (DR_STMT (drb
));
2049 int dra_size
= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra
))));
2050 int drb_size
= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb
))));
2051 lambda_vector dist_v
;
2052 unsigned int loop_depth
;
2054 if (DDR_ARE_DEPENDENT (ddr
) == chrec_known
)
2060 if (DDR_ARE_DEPENDENT (ddr
) == chrec_dont_know
)
2063 /* Loop-based vectorization and known data dependence. */
2064 if (DDR_NUM_DIST_VECTS (ddr
) == 0)
2067 /* Data-dependence analysis reports a distance vector of zero
2068 for data-references that overlap only in the first iteration
2069 but have different sign step (see PR45764).
2070 So as a sanity check require equal DR_STEP. */
2071 if (!operand_equal_p (DR_STEP (dra
), DR_STEP (drb
), 0))
2074 loop_depth
= index_in_loop_nest (loop
->num
, DDR_LOOP_NEST (ddr
));
2075 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr
), i
, dist_v
)
2077 int dist
= dist_v
[loop_depth
];
2079 if (dump_enabled_p ())
2080 dump_printf_loc (MSG_NOTE
, vect_location
,
2081 "dependence distance = %d.\n", dist
);
2083 /* Same loop iteration. */
2085 || (dist
% vectorization_factor
== 0 && dra_size
== drb_size
))
2087 /* Two references with distance zero have the same alignment. */
2088 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a
).safe_push (drb
);
2089 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b
).safe_push (dra
);
2090 if (dump_enabled_p ())
2092 dump_printf_loc (MSG_NOTE
, vect_location
,
2093 "accesses have the same alignment.\n");
2094 dump_printf (MSG_NOTE
,
2095 "dependence distance modulo vf == 0 between ");
2096 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dra
));
2097 dump_printf (MSG_NOTE
, " and ");
2098 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (drb
));
2099 dump_printf (MSG_NOTE
, "\n");
2106 /* Function vect_analyze_data_refs_alignment
2108 Analyze the alignment of the data-references in the loop.
2109 Return FALSE if a data reference is found that cannot be vectorized. */
2112 vect_analyze_data_refs_alignment (loop_vec_info vinfo
)
2114 if (dump_enabled_p ())
2115 dump_printf_loc (MSG_NOTE
, vect_location
,
2116 "=== vect_analyze_data_refs_alignment ===\n");
2118 /* Mark groups of data references with same alignment using
2119 data dependence information. */
2120 vec
<ddr_p
> ddrs
= vinfo
->ddrs
;
2121 struct data_dependence_relation
*ddr
;
2124 FOR_EACH_VEC_ELT (ddrs
, i
, ddr
)
2125 vect_find_same_alignment_drs (ddr
, vinfo
);
2127 vec
<data_reference_p
> datarefs
= vinfo
->datarefs
;
2128 struct data_reference
*dr
;
2130 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
2132 stmt_vec_info stmt_info
= vinfo_for_stmt (DR_STMT (dr
));
2133 if (STMT_VINFO_VECTORIZABLE (stmt_info
)
2134 && !vect_compute_data_ref_alignment (dr
))
2136 /* Strided accesses perform only component accesses, misalignment
2137 information is irrelevant for them. */
2138 if (STMT_VINFO_STRIDED_P (stmt_info
)
2139 && !STMT_VINFO_GROUPED_ACCESS (stmt_info
))
2142 if (dump_enabled_p ())
2143 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2144 "not vectorized: can't calculate alignment "
2155 /* Analyze alignment of DRs of stmts in NODE. */
2158 vect_slp_analyze_and_verify_node_alignment (slp_tree node
)
2160 /* We vectorize from the first scalar stmt in the node unless
2161 the node is permuted in which case we start from the first
2162 element in the group. */
2163 gimple
*first_stmt
= SLP_TREE_SCALAR_STMTS (node
)[0];
2164 data_reference_p first_dr
= STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt
));
2165 if (SLP_TREE_LOAD_PERMUTATION (node
).exists ())
2166 first_stmt
= GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt
));
2168 data_reference_p dr
= STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt
));
2169 if (! vect_compute_data_ref_alignment (dr
)
2170 /* For creating the data-ref pointer we need alignment of the
2171 first element anyway. */
2173 && ! vect_compute_data_ref_alignment (first_dr
))
2174 || ! verify_data_ref_alignment (dr
))
2176 if (dump_enabled_p ())
2177 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2178 "not vectorized: bad data alignment in basic "
2186 /* Function vect_slp_analyze_instance_alignment
2188 Analyze the alignment of the data-references in the SLP instance.
2189 Return FALSE if a data reference is found that cannot be vectorized. */
2192 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance
)
2194 if (dump_enabled_p ())
2195 dump_printf_loc (MSG_NOTE
, vect_location
,
2196 "=== vect_slp_analyze_and_verify_instance_alignment ===\n");
2200 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance
), i
, node
)
2201 if (! vect_slp_analyze_and_verify_node_alignment (node
))
2204 node
= SLP_INSTANCE_TREE (instance
);
2205 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node
)[0]))
2206 && ! vect_slp_analyze_and_verify_node_alignment
2207 (SLP_INSTANCE_TREE (instance
)))
2214 /* Analyze groups of accesses: check that DR belongs to a group of
2215 accesses of legal size, step, etc. Detect gaps, single element
2216 interleaving, and other special cases. Set grouped access info.
2217 Collect groups of strided stores for further use in SLP analysis.
2218 Worker for vect_analyze_group_access. */
2221 vect_analyze_group_access_1 (struct data_reference
*dr
)
2223 tree step
= DR_STEP (dr
);
2224 tree scalar_type
= TREE_TYPE (DR_REF (dr
));
2225 HOST_WIDE_INT type_size
= TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type
));
2226 gimple
*stmt
= DR_STMT (dr
);
2227 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
2228 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
2229 bb_vec_info bb_vinfo
= STMT_VINFO_BB_VINFO (stmt_info
);
2230 HOST_WIDE_INT dr_step
= -1;
2231 HOST_WIDE_INT groupsize
, last_accessed_element
= 1;
2232 bool slp_impossible
= false;
2234 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2235 size of the interleaving group (including gaps). */
2236 if (tree_fits_shwi_p (step
))
2238 dr_step
= tree_to_shwi (step
);
2239 /* Check that STEP is a multiple of type size. Otherwise there is
2240 a non-element-sized gap at the end of the group which we
2241 cannot represent in GROUP_GAP or GROUP_SIZE.
2242 ??? As we can handle non-constant step fine here we should
2243 simply remove uses of GROUP_GAP between the last and first
2244 element and instead rely on DR_STEP. GROUP_SIZE then would
2245 simply not include that gap. */
2246 if ((dr_step
% type_size
) != 0)
2248 if (dump_enabled_p ())
2250 dump_printf_loc (MSG_NOTE
, vect_location
,
2252 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, step
);
2253 dump_printf (MSG_NOTE
,
2254 " is not a multiple of the element size for ");
2255 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dr
));
2256 dump_printf (MSG_NOTE
, "\n");
2260 groupsize
= absu_hwi (dr_step
) / type_size
;
2265 /* Not consecutive access is possible only if it is a part of interleaving. */
2266 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)))
2268 /* Check if it this DR is a part of interleaving, and is a single
2269 element of the group that is accessed in the loop. */
2271 /* Gaps are supported only for loads. STEP must be a multiple of the type
2272 size. The size of the group must be a power of 2. */
2274 && (dr_step
% type_size
) == 0
2276 && pow2p_hwi (groupsize
))
2278 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) = stmt
;
2279 GROUP_SIZE (vinfo_for_stmt (stmt
)) = groupsize
;
2280 GROUP_GAP (stmt_info
) = groupsize
- 1;
2281 if (dump_enabled_p ())
2283 dump_printf_loc (MSG_NOTE
, vect_location
,
2284 "Detected single element interleaving ");
2285 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dr
));
2286 dump_printf (MSG_NOTE
, " step ");
2287 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, step
);
2288 dump_printf (MSG_NOTE
, "\n");
2294 if (dump_enabled_p ())
2296 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2297 "not consecutive access ");
2298 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
2303 /* Mark the statement as unvectorizable. */
2304 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr
))) = false;
2308 dump_printf_loc (MSG_NOTE
, vect_location
, "using strided accesses\n");
2309 STMT_VINFO_STRIDED_P (stmt_info
) = true;
2313 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) == stmt
)
2315 /* First stmt in the interleaving chain. Check the chain. */
2316 gimple
*next
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt
));
2317 struct data_reference
*data_ref
= dr
;
2318 unsigned int count
= 1;
2319 tree prev_init
= DR_INIT (data_ref
);
2320 gimple
*prev
= stmt
;
2321 HOST_WIDE_INT diff
, gaps
= 0;
2325 /* Skip same data-refs. In case that two or more stmts share
2326 data-ref (supported only for loads), we vectorize only the first
2327 stmt, and the rest get their vectorized loads from the first
2329 if (!tree_int_cst_compare (DR_INIT (data_ref
),
2330 DR_INIT (STMT_VINFO_DATA_REF (
2331 vinfo_for_stmt (next
)))))
2333 if (DR_IS_WRITE (data_ref
))
2335 if (dump_enabled_p ())
2336 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2337 "Two store stmts share the same dr.\n");
2341 if (dump_enabled_p ())
2342 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2343 "Two or more load stmts share the same dr.\n");
2345 /* For load use the same data-ref load. */
2346 GROUP_SAME_DR_STMT (vinfo_for_stmt (next
)) = prev
;
2349 next
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (next
));
2354 data_ref
= STMT_VINFO_DATA_REF (vinfo_for_stmt (next
));
2356 /* All group members have the same STEP by construction. */
2357 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref
), step
, 0));
2359 /* Check that the distance between two accesses is equal to the type
2360 size. Otherwise, we have gaps. */
2361 diff
= (TREE_INT_CST_LOW (DR_INIT (data_ref
))
2362 - TREE_INT_CST_LOW (prev_init
)) / type_size
;
2365 /* FORNOW: SLP of accesses with gaps is not supported. */
2366 slp_impossible
= true;
2367 if (DR_IS_WRITE (data_ref
))
2369 if (dump_enabled_p ())
2370 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2371 "interleaved store with gaps\n");
2378 last_accessed_element
+= diff
;
2380 /* Store the gap from the previous member of the group. If there is no
2381 gap in the access, GROUP_GAP is always 1. */
2382 GROUP_GAP (vinfo_for_stmt (next
)) = diff
;
2384 prev_init
= DR_INIT (data_ref
);
2385 next
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (next
));
2386 /* Count the number of data-refs in the chain. */
2391 groupsize
= count
+ gaps
;
2393 if (groupsize
> UINT_MAX
)
2395 if (dump_enabled_p ())
2396 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2397 "group is too large\n");
2401 /* Check that the size of the interleaving is equal to count for stores,
2402 i.e., that there are no gaps. */
2403 if (groupsize
!= count
2404 && !DR_IS_READ (dr
))
2406 if (dump_enabled_p ())
2407 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2408 "interleaved store with gaps\n");
2412 /* If there is a gap after the last load in the group it is the
2413 difference between the groupsize and the last accessed
2415 When there is no gap, this difference should be 0. */
2416 GROUP_GAP (vinfo_for_stmt (stmt
)) = groupsize
- last_accessed_element
;
2418 GROUP_SIZE (vinfo_for_stmt (stmt
)) = groupsize
;
2419 if (dump_enabled_p ())
2421 dump_printf_loc (MSG_NOTE
, vect_location
,
2422 "Detected interleaving ");
2423 if (DR_IS_READ (dr
))
2424 dump_printf (MSG_NOTE
, "load ");
2426 dump_printf (MSG_NOTE
, "store ");
2427 dump_printf (MSG_NOTE
, "of size %u starting with ",
2428 (unsigned)groupsize
);
2429 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, stmt
, 0);
2430 if (GROUP_GAP (vinfo_for_stmt (stmt
)) != 0)
2431 dump_printf_loc (MSG_NOTE
, vect_location
,
2432 "There is a gap of %u elements after the group\n",
2433 GROUP_GAP (vinfo_for_stmt (stmt
)));
2436 /* SLP: create an SLP data structure for every interleaving group of
2437 stores for further analysis in vect_analyse_slp. */
2438 if (DR_IS_WRITE (dr
) && !slp_impossible
)
2441 LOOP_VINFO_GROUPED_STORES (loop_vinfo
).safe_push (stmt
);
2443 BB_VINFO_GROUPED_STORES (bb_vinfo
).safe_push (stmt
);
2450 /* Analyze groups of accesses: check that DR belongs to a group of
2451 accesses of legal size, step, etc. Detect gaps, single element
2452 interleaving, and other special cases. Set grouped access info.
2453 Collect groups of strided stores for further use in SLP analysis. */
2456 vect_analyze_group_access (struct data_reference
*dr
)
2458 if (!vect_analyze_group_access_1 (dr
))
2460 /* Dissolve the group if present. */
2462 gimple
*stmt
= GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr
)));
2465 stmt_vec_info vinfo
= vinfo_for_stmt (stmt
);
2466 next
= GROUP_NEXT_ELEMENT (vinfo
);
2467 GROUP_FIRST_ELEMENT (vinfo
) = NULL
;
2468 GROUP_NEXT_ELEMENT (vinfo
) = NULL
;
2476 /* Analyze the access pattern of the data-reference DR.
2477 In case of non-consecutive accesses call vect_analyze_group_access() to
2478 analyze groups of accesses. */
2481 vect_analyze_data_ref_access (struct data_reference
*dr
)
2483 tree step
= DR_STEP (dr
);
2484 tree scalar_type
= TREE_TYPE (DR_REF (dr
));
2485 gimple
*stmt
= DR_STMT (dr
);
2486 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
2487 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
2488 struct loop
*loop
= NULL
;
2491 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
2493 if (loop_vinfo
&& !step
)
2495 if (dump_enabled_p ())
2496 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2497 "bad data-ref access in loop\n");
2501 /* Allow loads with zero step in inner-loop vectorization. */
2502 if (loop_vinfo
&& integer_zerop (step
))
2504 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) = NULL
;
2505 if (!nested_in_vect_loop_p (loop
, stmt
))
2506 return DR_IS_READ (dr
);
2507 /* Allow references with zero step for outer loops marked
2508 with pragma omp simd only - it guarantees absence of
2509 loop-carried dependencies between inner loop iterations. */
2510 if (!loop
->force_vectorize
)
2512 if (dump_enabled_p ())
2513 dump_printf_loc (MSG_NOTE
, vect_location
,
2514 "zero step in inner loop of nest\n");
2519 if (loop
&& nested_in_vect_loop_p (loop
, stmt
))
2521 /* Interleaved accesses are not yet supported within outer-loop
2522 vectorization for references in the inner-loop. */
2523 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) = NULL
;
2525 /* For the rest of the analysis we use the outer-loop step. */
2526 step
= STMT_VINFO_DR_STEP (stmt_info
);
2527 if (integer_zerop (step
))
2529 if (dump_enabled_p ())
2530 dump_printf_loc (MSG_NOTE
, vect_location
,
2531 "zero step in outer loop.\n");
2532 return DR_IS_READ (dr
);
2537 if (TREE_CODE (step
) == INTEGER_CST
)
2539 HOST_WIDE_INT dr_step
= TREE_INT_CST_LOW (step
);
2540 if (!tree_int_cst_compare (step
, TYPE_SIZE_UNIT (scalar_type
))
2542 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type
), -dr_step
)))
2544 /* Mark that it is not interleaving. */
2545 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) = NULL
;
2550 if (loop
&& nested_in_vect_loop_p (loop
, stmt
))
2552 if (dump_enabled_p ())
2553 dump_printf_loc (MSG_NOTE
, vect_location
,
2554 "grouped access in outer loop.\n");
2559 /* Assume this is a DR handled by non-constant strided load case. */
2560 if (TREE_CODE (step
) != INTEGER_CST
)
2561 return (STMT_VINFO_STRIDED_P (stmt_info
)
2562 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info
)
2563 || vect_analyze_group_access (dr
)));
2565 /* Not consecutive access - check if it's a part of interleaving group. */
2566 return vect_analyze_group_access (dr
);
2571 /* A helper function used in the comparator function to sort data
2572 references. T1 and T2 are two data references to be compared.
2573 The function returns -1, 0, or 1. */
2576 compare_tree (tree t1
, tree t2
)
2579 enum tree_code code
;
2592 if (TREE_CODE (t1
) != TREE_CODE (t2
))
2593 return TREE_CODE (t1
) < TREE_CODE (t2
) ? -1 : 1;
2595 code
= TREE_CODE (t1
);
2598 /* For const values, we can just use hash values for comparisons. */
2606 hashval_t h1
= iterative_hash_expr (t1
, 0);
2607 hashval_t h2
= iterative_hash_expr (t2
, 0);
2609 return h1
< h2
? -1 : 1;
2614 cmp
= compare_tree (SSA_NAME_VAR (t1
), SSA_NAME_VAR (t2
));
2618 if (SSA_NAME_VERSION (t1
) != SSA_NAME_VERSION (t2
))
2619 return SSA_NAME_VERSION (t1
) < SSA_NAME_VERSION (t2
) ? -1 : 1;
2623 tclass
= TREE_CODE_CLASS (code
);
2625 /* For var-decl, we could compare their UIDs. */
2626 if (tclass
== tcc_declaration
)
2628 if (DECL_UID (t1
) != DECL_UID (t2
))
2629 return DECL_UID (t1
) < DECL_UID (t2
) ? -1 : 1;
2633 /* For expressions with operands, compare their operands recursively. */
2634 for (i
= TREE_OPERAND_LENGTH (t1
) - 1; i
>= 0; --i
)
2636 cmp
= compare_tree (TREE_OPERAND (t1
, i
), TREE_OPERAND (t2
, i
));
2646 /* Compare two data-references DRA and DRB to group them into chunks
2647 suitable for grouping. */
2650 dr_group_sort_cmp (const void *dra_
, const void *drb_
)
2652 data_reference_p dra
= *(data_reference_p
*)const_cast<void *>(dra_
);
2653 data_reference_p drb
= *(data_reference_p
*)const_cast<void *>(drb_
);
2656 /* Stabilize sort. */
2660 /* DRs in different loops never belong to the same group. */
2661 loop_p loopa
= gimple_bb (DR_STMT (dra
))->loop_father
;
2662 loop_p loopb
= gimple_bb (DR_STMT (drb
))->loop_father
;
2664 return loopa
->num
< loopb
->num
? -1 : 1;
2666 /* Ordering of DRs according to base. */
2667 if (!operand_equal_p (DR_BASE_ADDRESS (dra
), DR_BASE_ADDRESS (drb
), 0))
2669 cmp
= compare_tree (DR_BASE_ADDRESS (dra
), DR_BASE_ADDRESS (drb
));
2674 /* And according to DR_OFFSET. */
2675 if (!dr_equal_offsets_p (dra
, drb
))
2677 cmp
= compare_tree (DR_OFFSET (dra
), DR_OFFSET (drb
));
2682 /* Put reads before writes. */
2683 if (DR_IS_READ (dra
) != DR_IS_READ (drb
))
2684 return DR_IS_READ (dra
) ? -1 : 1;
2686 /* Then sort after access size. */
2687 if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra
))),
2688 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb
))), 0))
2690 cmp
= compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra
))),
2691 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb
))));
2696 /* And after step. */
2697 if (!operand_equal_p (DR_STEP (dra
), DR_STEP (drb
), 0))
2699 cmp
= compare_tree (DR_STEP (dra
), DR_STEP (drb
));
2704 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2705 cmp
= tree_int_cst_compare (DR_INIT (dra
), DR_INIT (drb
));
2707 return gimple_uid (DR_STMT (dra
)) < gimple_uid (DR_STMT (drb
)) ? -1 : 1;
2711 /* Function vect_analyze_data_ref_accesses.
2713 Analyze the access pattern of all the data references in the loop.
2715 FORNOW: the only access pattern that is considered vectorizable is a
2716 simple step 1 (consecutive) access.
2718 FORNOW: handle only arrays and pointer accesses. */
2721 vect_analyze_data_ref_accesses (vec_info
*vinfo
)
2724 vec
<data_reference_p
> datarefs
= vinfo
->datarefs
;
2725 struct data_reference
*dr
;
2727 if (dump_enabled_p ())
2728 dump_printf_loc (MSG_NOTE
, vect_location
,
2729 "=== vect_analyze_data_ref_accesses ===\n");
2731 if (datarefs
.is_empty ())
2734 /* Sort the array of datarefs to make building the interleaving chains
2735 linear. Don't modify the original vector's order, it is needed for
2736 determining what dependencies are reversed. */
2737 vec
<data_reference_p
> datarefs_copy
= datarefs
.copy ();
2738 datarefs_copy
.qsort (dr_group_sort_cmp
);
2740 /* Build the interleaving chains. */
2741 for (i
= 0; i
< datarefs_copy
.length () - 1;)
2743 data_reference_p dra
= datarefs_copy
[i
];
2744 stmt_vec_info stmtinfo_a
= vinfo_for_stmt (DR_STMT (dra
));
2745 stmt_vec_info lastinfo
= NULL
;
2746 if (! STMT_VINFO_VECTORIZABLE (stmtinfo_a
))
2751 for (i
= i
+ 1; i
< datarefs_copy
.length (); ++i
)
2753 data_reference_p drb
= datarefs_copy
[i
];
2754 stmt_vec_info stmtinfo_b
= vinfo_for_stmt (DR_STMT (drb
));
2755 if (! STMT_VINFO_VECTORIZABLE (stmtinfo_b
))
2758 /* ??? Imperfect sorting (non-compatible types, non-modulo
2759 accesses, same accesses) can lead to a group to be artificially
2760 split here as we don't just skip over those. If it really
2761 matters we can push those to a worklist and re-iterate
2762 over them. The we can just skip ahead to the next DR here. */
2764 /* DRs in a different loop should not be put into the same
2765 interleaving group. */
2766 if (gimple_bb (DR_STMT (dra
))->loop_father
2767 != gimple_bb (DR_STMT (drb
))->loop_father
)
2770 /* Check that the data-refs have same first location (except init)
2771 and they are both either store or load (not load and store,
2772 not masked loads or stores). */
2773 if (DR_IS_READ (dra
) != DR_IS_READ (drb
)
2774 || !operand_equal_p (DR_BASE_ADDRESS (dra
),
2775 DR_BASE_ADDRESS (drb
), 0)
2776 || !dr_equal_offsets_p (dra
, drb
)
2777 || !gimple_assign_single_p (DR_STMT (dra
))
2778 || !gimple_assign_single_p (DR_STMT (drb
)))
2781 /* Check that the data-refs have the same constant size. */
2782 tree sza
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra
)));
2783 tree szb
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb
)));
2784 if (!tree_fits_uhwi_p (sza
)
2785 || !tree_fits_uhwi_p (szb
)
2786 || !tree_int_cst_equal (sza
, szb
))
2789 /* Check that the data-refs have the same step. */
2790 if (!operand_equal_p (DR_STEP (dra
), DR_STEP (drb
), 0))
2793 /* Do not place the same access in the interleaving chain twice. */
2794 if (tree_int_cst_compare (DR_INIT (dra
), DR_INIT (drb
)) == 0)
2797 /* Check the types are compatible.
2798 ??? We don't distinguish this during sorting. */
2799 if (!types_compatible_p (TREE_TYPE (DR_REF (dra
)),
2800 TREE_TYPE (DR_REF (drb
))))
2803 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2804 HOST_WIDE_INT init_a
= TREE_INT_CST_LOW (DR_INIT (dra
));
2805 HOST_WIDE_INT init_b
= TREE_INT_CST_LOW (DR_INIT (drb
));
2806 gcc_assert (init_a
<= init_b
);
2808 /* If init_b == init_a + the size of the type * k, we have an
2809 interleaving, and DRA is accessed before DRB. */
2810 HOST_WIDE_INT type_size_a
= tree_to_uhwi (sza
);
2811 if (type_size_a
== 0
2812 || (init_b
- init_a
) % type_size_a
!= 0)
2815 /* If we have a store, the accesses are adjacent. This splits
2816 groups into chunks we support (we don't support vectorization
2817 of stores with gaps). */
2818 if (!DR_IS_READ (dra
)
2819 && (init_b
- (HOST_WIDE_INT
) TREE_INT_CST_LOW
2820 (DR_INIT (datarefs_copy
[i
-1]))
2824 /* If the step (if not zero or non-constant) is greater than the
2825 difference between data-refs' inits this splits groups into
2827 if (tree_fits_shwi_p (DR_STEP (dra
)))
2829 HOST_WIDE_INT step
= tree_to_shwi (DR_STEP (dra
));
2830 if (step
!= 0 && step
<= (init_b
- init_a
))
2834 if (dump_enabled_p ())
2836 dump_printf_loc (MSG_NOTE
, vect_location
,
2837 "Detected interleaving ");
2838 if (DR_IS_READ (dra
))
2839 dump_printf (MSG_NOTE
, "load ");
2841 dump_printf (MSG_NOTE
, "store ");
2842 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dra
));
2843 dump_printf (MSG_NOTE
, " and ");
2844 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (drb
));
2845 dump_printf (MSG_NOTE
, "\n");
2848 /* Link the found element into the group list. */
2849 if (!GROUP_FIRST_ELEMENT (stmtinfo_a
))
2851 GROUP_FIRST_ELEMENT (stmtinfo_a
) = DR_STMT (dra
);
2852 lastinfo
= stmtinfo_a
;
2854 GROUP_FIRST_ELEMENT (stmtinfo_b
) = DR_STMT (dra
);
2855 GROUP_NEXT_ELEMENT (lastinfo
) = DR_STMT (drb
);
2856 lastinfo
= stmtinfo_b
;
2860 FOR_EACH_VEC_ELT (datarefs_copy
, i
, dr
)
2861 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr
)))
2862 && !vect_analyze_data_ref_access (dr
))
2864 if (dump_enabled_p ())
2865 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2866 "not vectorized: complicated access pattern.\n");
2868 if (is_a
<bb_vec_info
> (vinfo
))
2870 /* Mark the statement as not vectorizable. */
2871 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr
))) = false;
2876 datarefs_copy
.release ();
2881 datarefs_copy
.release ();
2886 /* Operator == between two dr_with_seg_len objects.
2888 This equality operator is used to make sure two data refs
2889 are the same one so that we will consider to combine the
2890 aliasing checks of those two pairs of data dependent data
2894 operator == (const dr_with_seg_len
& d1
,
2895 const dr_with_seg_len
& d2
)
2897 return operand_equal_p (DR_BASE_ADDRESS (d1
.dr
),
2898 DR_BASE_ADDRESS (d2
.dr
), 0)
2899 && compare_tree (DR_OFFSET (d1
.dr
), DR_OFFSET (d2
.dr
)) == 0
2900 && compare_tree (DR_INIT (d1
.dr
), DR_INIT (d2
.dr
)) == 0
2901 && compare_tree (d1
.seg_len
, d2
.seg_len
) == 0;
2904 /* Function comp_dr_with_seg_len_pair.
2906 Comparison function for sorting objects of dr_with_seg_len_pair_t
2907 so that we can combine aliasing checks in one scan. */
2910 comp_dr_with_seg_len_pair (const void *pa_
, const void *pb_
)
2912 const dr_with_seg_len_pair_t
* pa
= (const dr_with_seg_len_pair_t
*) pa_
;
2913 const dr_with_seg_len_pair_t
* pb
= (const dr_with_seg_len_pair_t
*) pb_
;
2914 const dr_with_seg_len
&a1
= pa
->first
, &a2
= pa
->second
;
2915 const dr_with_seg_len
&b1
= pb
->first
, &b2
= pb
->second
;
2917 /* For DR pairs (a, b) and (c, d), we only consider to merge the alias checks
2918 if a and c have the same basic address snd step, and b and d have the same
2919 address and step. Therefore, if any a&c or b&d don't have the same address
2920 and step, we don't care the order of those two pairs after sorting. */
2923 if ((comp_res
= compare_tree (DR_BASE_ADDRESS (a1
.dr
),
2924 DR_BASE_ADDRESS (b1
.dr
))) != 0)
2926 if ((comp_res
= compare_tree (DR_BASE_ADDRESS (a2
.dr
),
2927 DR_BASE_ADDRESS (b2
.dr
))) != 0)
2929 if ((comp_res
= compare_tree (DR_STEP (a1
.dr
), DR_STEP (b1
.dr
))) != 0)
2931 if ((comp_res
= compare_tree (DR_STEP (a2
.dr
), DR_STEP (b2
.dr
))) != 0)
2933 if ((comp_res
= compare_tree (DR_OFFSET (a1
.dr
), DR_OFFSET (b1
.dr
))) != 0)
2935 if ((comp_res
= compare_tree (DR_INIT (a1
.dr
), DR_INIT (b1
.dr
))) != 0)
2937 if ((comp_res
= compare_tree (DR_OFFSET (a2
.dr
), DR_OFFSET (b2
.dr
))) != 0)
2939 if ((comp_res
= compare_tree (DR_INIT (a2
.dr
), DR_INIT (b2
.dr
))) != 0)
2945 /* Function vect_vfa_segment_size.
2947 Create an expression that computes the size of segment
2948 that will be accessed for a data reference. The functions takes into
2949 account that realignment loads may access one more vector.
2952 DR: The data reference.
2953 LENGTH_FACTOR: segment length to consider.
2955 Return an expression whose value is the size of segment which will be
2959 vect_vfa_segment_size (struct data_reference
*dr
, tree length_factor
)
2961 tree segment_length
;
2963 if (integer_zerop (DR_STEP (dr
)))
2964 segment_length
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr
)));
2966 segment_length
= size_binop (MULT_EXPR
,
2967 fold_convert (sizetype
, DR_STEP (dr
)),
2968 fold_convert (sizetype
, length_factor
));
2970 if (vect_supportable_dr_alignment (dr
, false)
2971 == dr_explicit_realign_optimized
)
2973 tree vector_size
= TYPE_SIZE_UNIT
2974 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr
))));
2976 segment_length
= size_binop (PLUS_EXPR
, segment_length
, vector_size
);
2978 return segment_length
;
2981 /* Function vect_no_alias_p.
2983 Given data references A and B with equal base and offset, the alias
2984 relation can be decided at compilation time, return TRUE if they do
2985 not alias to each other; return FALSE otherwise. SEGMENT_LENGTH_A
2986 and SEGMENT_LENGTH_B are the memory lengths accessed by A and B
2990 vect_no_alias_p (struct data_reference
*a
, struct data_reference
*b
,
2991 tree segment_length_a
, tree segment_length_b
)
2993 gcc_assert (TREE_CODE (DR_INIT (a
)) == INTEGER_CST
2994 && TREE_CODE (DR_INIT (b
)) == INTEGER_CST
);
2995 if (tree_int_cst_equal (DR_INIT (a
), DR_INIT (b
)))
2998 tree seg_a_min
= DR_INIT (a
);
2999 tree seg_a_max
= fold_build2 (PLUS_EXPR
, TREE_TYPE (seg_a_min
),
3000 seg_a_min
, segment_length_a
);
3001 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3002 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3004 if (tree_int_cst_compare (DR_STEP (a
), size_zero_node
) < 0)
3006 tree unit_size
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (a
)));
3007 seg_a_min
= fold_build2 (PLUS_EXPR
, TREE_TYPE (seg_a_max
),
3008 seg_a_max
, unit_size
);
3009 seg_a_max
= fold_build2 (PLUS_EXPR
, TREE_TYPE (DR_INIT (a
)),
3010 DR_INIT (a
), unit_size
);
3012 tree seg_b_min
= DR_INIT (b
);
3013 tree seg_b_max
= fold_build2 (PLUS_EXPR
, TREE_TYPE (seg_b_min
),
3014 seg_b_min
, segment_length_b
);
3015 if (tree_int_cst_compare (DR_STEP (b
), size_zero_node
) < 0)
3017 tree unit_size
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (b
)));
3018 seg_b_min
= fold_build2 (PLUS_EXPR
, TREE_TYPE (seg_b_max
),
3019 seg_b_max
, unit_size
);
3020 seg_b_max
= fold_build2 (PLUS_EXPR
, TREE_TYPE (DR_INIT (b
)),
3021 DR_INIT (b
), unit_size
);
3024 if (tree_int_cst_le (seg_a_max
, seg_b_min
)
3025 || tree_int_cst_le (seg_b_max
, seg_a_min
))
3031 /* Function vect_prune_runtime_alias_test_list.
3033 Prune a list of ddrs to be tested at run-time by versioning for alias.
3034 Merge several alias checks into one if possible.
3035 Return FALSE if resulting list of ddrs is longer then allowed by
3036 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3039 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo
)
3041 vec
<ddr_p
> may_alias_ddrs
=
3042 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo
);
3043 vec
<dr_with_seg_len_pair_t
>& comp_alias_ddrs
=
3044 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo
);
3045 int vect_factor
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
3046 tree scalar_loop_iters
= LOOP_VINFO_NITERS (loop_vinfo
);
3052 if (dump_enabled_p ())
3053 dump_printf_loc (MSG_NOTE
, vect_location
,
3054 "=== vect_prune_runtime_alias_test_list ===\n");
3056 if (may_alias_ddrs
.is_empty ())
3059 /* Basically, for each pair of dependent data refs store_ptr_0
3060 and load_ptr_0, we create an expression:
3062 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
3063 || (load_ptr_0 + load_segment_length_0) <= store_ptr_0))
3065 for aliasing checks. However, in some cases we can decrease
3066 the number of checks by combining two checks into one. For
3067 example, suppose we have another pair of data refs store_ptr_0
3068 and load_ptr_1, and if the following condition is satisfied:
3070 load_ptr_0 < load_ptr_1 &&
3071 load_ptr_1 - load_ptr_0 - load_segment_length_0 < store_segment_length_0
3073 (this condition means, in each iteration of vectorized loop,
3074 the accessed memory of store_ptr_0 cannot be between the memory
3075 of load_ptr_0 and load_ptr_1.)
3077 we then can use only the following expression to finish the
3078 alising checks between store_ptr_0 & load_ptr_0 and
3079 store_ptr_0 & load_ptr_1:
3081 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
3082 || (load_ptr_1 + load_segment_length_1 <= store_ptr_0))
3084 Note that we only consider that load_ptr_0 and load_ptr_1 have the
3085 same basic address. */
3087 comp_alias_ddrs
.create (may_alias_ddrs
.length ());
3089 /* First, we collect all data ref pairs for aliasing checks. */
3090 FOR_EACH_VEC_ELT (may_alias_ddrs
, i
, ddr
)
3093 struct data_reference
*dr_a
, *dr_b
;
3094 gimple
*dr_group_first_a
, *dr_group_first_b
;
3095 tree segment_length_a
, segment_length_b
;
3096 gimple
*stmt_a
, *stmt_b
;
3099 stmt_a
= DR_STMT (DDR_A (ddr
));
3100 dr_group_first_a
= GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a
));
3101 if (dr_group_first_a
)
3103 stmt_a
= dr_group_first_a
;
3104 dr_a
= STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a
));
3108 stmt_b
= DR_STMT (DDR_B (ddr
));
3109 dr_group_first_b
= GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b
));
3110 if (dr_group_first_b
)
3112 stmt_b
= dr_group_first_b
;
3113 dr_b
= STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b
));
3116 if (!operand_equal_p (DR_STEP (dr_a
), DR_STEP (dr_b
), 0))
3117 length_factor
= scalar_loop_iters
;
3119 length_factor
= size_int (vect_factor
);
3120 segment_length_a
= vect_vfa_segment_size (dr_a
, length_factor
);
3121 segment_length_b
= vect_vfa_segment_size (dr_b
, length_factor
);
3123 comp_res
= compare_tree (DR_BASE_ADDRESS (dr_a
), DR_BASE_ADDRESS (dr_b
));
3125 comp_res
= compare_tree (DR_OFFSET (dr_a
), DR_OFFSET (dr_b
));
3127 /* Alias is known at compilation time. */
3129 && TREE_CODE (DR_STEP (dr_a
)) == INTEGER_CST
3130 && TREE_CODE (DR_STEP (dr_b
)) == INTEGER_CST
3131 && TREE_CODE (segment_length_a
) == INTEGER_CST
3132 && TREE_CODE (segment_length_b
) == INTEGER_CST
)
3134 if (vect_no_alias_p (dr_a
, dr_b
, segment_length_a
, segment_length_b
))
3137 if (dump_enabled_p ())
3138 dump_printf_loc (MSG_NOTE
, vect_location
,
3139 "not vectorized: compilation time alias.\n");
3144 dr_with_seg_len_pair_t dr_with_seg_len_pair
3145 (dr_with_seg_len (dr_a
, segment_length_a
),
3146 dr_with_seg_len (dr_b
, segment_length_b
));
3148 /* Canonicalize pairs by sorting the two DR members. */
3150 std::swap (dr_with_seg_len_pair
.first
, dr_with_seg_len_pair
.second
);
3152 comp_alias_ddrs
.safe_push (dr_with_seg_len_pair
);
3155 /* Second, we sort the collected data ref pairs so that we can scan
3156 them once to combine all possible aliasing checks. */
3157 comp_alias_ddrs
.qsort (comp_dr_with_seg_len_pair
);
3159 /* Third, we scan the sorted dr pairs and check if we can combine
3160 alias checks of two neighboring dr pairs. */
3161 for (size_t i
= 1; i
< comp_alias_ddrs
.length (); ++i
)
3163 /* Deal with two ddrs (dr_a1, dr_b1) and (dr_a2, dr_b2). */
3164 dr_with_seg_len
*dr_a1
= &comp_alias_ddrs
[i
-1].first
,
3165 *dr_b1
= &comp_alias_ddrs
[i
-1].second
,
3166 *dr_a2
= &comp_alias_ddrs
[i
].first
,
3167 *dr_b2
= &comp_alias_ddrs
[i
].second
;
3169 /* Remove duplicate data ref pairs. */
3170 if (*dr_a1
== *dr_a2
&& *dr_b1
== *dr_b2
)
3172 if (dump_enabled_p ())
3174 dump_printf_loc (MSG_NOTE
, vect_location
,
3175 "found equal ranges ");
3176 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3177 DR_REF (dr_a1
->dr
));
3178 dump_printf (MSG_NOTE
, ", ");
3179 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3180 DR_REF (dr_b1
->dr
));
3181 dump_printf (MSG_NOTE
, " and ");
3182 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3183 DR_REF (dr_a2
->dr
));
3184 dump_printf (MSG_NOTE
, ", ");
3185 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3186 DR_REF (dr_b2
->dr
));
3187 dump_printf (MSG_NOTE
, "\n");
3190 comp_alias_ddrs
.ordered_remove (i
--);
3194 if (*dr_a1
== *dr_a2
|| *dr_b1
== *dr_b2
)
3196 /* We consider the case that DR_B1 and DR_B2 are same memrefs,
3197 and DR_A1 and DR_A2 are two consecutive memrefs. */
3198 if (*dr_a1
== *dr_a2
)
3200 std::swap (dr_a1
, dr_b1
);
3201 std::swap (dr_a2
, dr_b2
);
3204 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a1
->dr
),
3205 DR_BASE_ADDRESS (dr_a2
->dr
), 0)
3206 || !operand_equal_p (DR_OFFSET (dr_a1
->dr
),
3207 DR_OFFSET (dr_a2
->dr
), 0)
3208 || !tree_fits_shwi_p (DR_INIT (dr_a1
->dr
))
3209 || !tree_fits_shwi_p (DR_INIT (dr_a2
->dr
)))
3212 /* Make sure dr_a1 starts left of dr_a2. */
3213 if (tree_int_cst_lt (DR_INIT (dr_a2
->dr
), DR_INIT (dr_a1
->dr
)))
3214 std::swap (*dr_a1
, *dr_a2
);
3216 bool do_remove
= false;
3217 unsigned HOST_WIDE_INT diff
3218 = (tree_to_shwi (DR_INIT (dr_a2
->dr
))
3219 - tree_to_shwi (DR_INIT (dr_a1
->dr
)));
3221 /* If the left segment does not extend beyond the start of the
3222 right segment the new segment length is that of the right
3223 plus the segment distance. */
3224 if (tree_fits_uhwi_p (dr_a1
->seg_len
)
3225 && compare_tree_int (dr_a1
->seg_len
, diff
) <= 0)
3227 dr_a1
->seg_len
= size_binop (PLUS_EXPR
, dr_a2
->seg_len
,
3231 /* Generally the new segment length is the maximum of the
3232 left segment size and the right segment size plus the distance.
3233 ??? We can also build tree MAX_EXPR here but it's not clear this
3235 else if (tree_fits_uhwi_p (dr_a1
->seg_len
)
3236 && tree_fits_uhwi_p (dr_a2
->seg_len
))
3238 unsigned HOST_WIDE_INT seg_len_a1
= tree_to_uhwi (dr_a1
->seg_len
);
3239 unsigned HOST_WIDE_INT seg_len_a2
= tree_to_uhwi (dr_a2
->seg_len
);
3240 dr_a1
->seg_len
= size_int (MAX (seg_len_a1
, diff
+ seg_len_a2
));
3243 /* Now we check if the following condition is satisfied:
3245 DIFF - SEGMENT_LENGTH_A < SEGMENT_LENGTH_B
3247 where DIFF = DR_A2_INIT - DR_A1_INIT. However,
3248 SEGMENT_LENGTH_A or SEGMENT_LENGTH_B may not be constant so we
3249 have to make a best estimation. We can get the minimum value
3250 of SEGMENT_LENGTH_B as a constant, represented by MIN_SEG_LEN_B,
3251 then either of the following two conditions can guarantee the
3254 1: DIFF <= MIN_SEG_LEN_B
3255 2: DIFF - SEGMENT_LENGTH_A < MIN_SEG_LEN_B */
3258 unsigned HOST_WIDE_INT min_seg_len_b
3259 = (tree_fits_uhwi_p (dr_b1
->seg_len
)
3260 ? tree_to_uhwi (dr_b1
->seg_len
)
3263 if (diff
<= min_seg_len_b
3264 || (tree_fits_uhwi_p (dr_a1
->seg_len
)
3265 && diff
- tree_to_uhwi (dr_a1
->seg_len
) < min_seg_len_b
))
3267 dr_a1
->seg_len
= size_binop (PLUS_EXPR
,
3268 dr_a2
->seg_len
, size_int (diff
));
3275 if (dump_enabled_p ())
3277 dump_printf_loc (MSG_NOTE
, vect_location
,
3278 "merging ranges for ");
3279 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dr_a1
->dr
));
3280 dump_printf (MSG_NOTE
, ", ");
3281 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dr_b1
->dr
));
3282 dump_printf (MSG_NOTE
, " and ");
3283 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dr_a2
->dr
));
3284 dump_printf (MSG_NOTE
, ", ");
3285 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dr_b2
->dr
));
3286 dump_printf (MSG_NOTE
, "\n");
3288 comp_alias_ddrs
.ordered_remove (i
--);
3293 dump_printf_loc (MSG_NOTE
, vect_location
,
3294 "improved number of alias checks from %d to %d\n",
3295 may_alias_ddrs
.length (), comp_alias_ddrs
.length ());
3296 if ((int) comp_alias_ddrs
.length () >
3297 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS
))
3299 if (dump_enabled_p ())
3300 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3301 "number of versioning for alias "
3302 "run-time tests exceeds %d "
3303 "(--param vect-max-version-for-alias-checks)\n",
3304 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS
));
3308 /* All alias checks have been resolved at compilation time. */
3309 if (!comp_alias_ddrs
.length ())
3310 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo
).truncate (0);
3315 /* Return true if a non-affine read or write in STMT is suitable for a
3316 gather load or scatter store. Describe the operation in *INFO if so. */
3319 vect_check_gather_scatter (gimple
*stmt
, loop_vec_info loop_vinfo
,
3320 gather_scatter_info
*info
)
3322 HOST_WIDE_INT scale
= 1, pbitpos
, pbitsize
;
3323 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
3324 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
3325 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
3326 tree offtype
= NULL_TREE
;
3327 tree decl
, base
, off
;
3329 int punsignedp
, reversep
, pvolatilep
= 0;
3332 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3333 see if we can use the def stmt of the address. */
3334 if (is_gimple_call (stmt
)
3335 && gimple_call_internal_p (stmt
)
3336 && (gimple_call_internal_fn (stmt
) == IFN_MASK_LOAD
3337 || gimple_call_internal_fn (stmt
) == IFN_MASK_STORE
)
3338 && TREE_CODE (base
) == MEM_REF
3339 && TREE_CODE (TREE_OPERAND (base
, 0)) == SSA_NAME
3340 && integer_zerop (TREE_OPERAND (base
, 1))
3341 && !expr_invariant_in_loop_p (loop
, TREE_OPERAND (base
, 0)))
3343 gimple
*def_stmt
= SSA_NAME_DEF_STMT (TREE_OPERAND (base
, 0));
3344 if (is_gimple_assign (def_stmt
)
3345 && gimple_assign_rhs_code (def_stmt
) == ADDR_EXPR
)
3346 base
= TREE_OPERAND (gimple_assign_rhs1 (def_stmt
), 0);
3349 /* The gather and scatter builtins need address of the form
3350 loop_invariant + vector * {1, 2, 4, 8}
3352 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3353 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3354 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3355 multiplications and additions in it. To get a vector, we need
3356 a single SSA_NAME that will be defined in the loop and will
3357 contain everything that is not loop invariant and that can be
3358 vectorized. The following code attempts to find such a preexistng
3359 SSA_NAME OFF and put the loop invariants into a tree BASE
3360 that can be gimplified before the loop. */
3361 base
= get_inner_reference (base
, &pbitsize
, &pbitpos
, &off
, &pmode
,
3362 &punsignedp
, &reversep
, &pvolatilep
);
3363 gcc_assert (base
&& (pbitpos
% BITS_PER_UNIT
) == 0 && !reversep
);
3365 if (TREE_CODE (base
) == MEM_REF
)
3367 if (!integer_zerop (TREE_OPERAND (base
, 1)))
3369 if (off
== NULL_TREE
)
3371 offset_int moff
= mem_ref_offset (base
);
3372 off
= wide_int_to_tree (sizetype
, moff
);
3375 off
= size_binop (PLUS_EXPR
, off
,
3376 fold_convert (sizetype
, TREE_OPERAND (base
, 1)));
3378 base
= TREE_OPERAND (base
, 0);
3381 base
= build_fold_addr_expr (base
);
3383 if (off
== NULL_TREE
)
3384 off
= size_zero_node
;
3386 /* If base is not loop invariant, either off is 0, then we start with just
3387 the constant offset in the loop invariant BASE and continue with base
3388 as OFF, otherwise give up.
3389 We could handle that case by gimplifying the addition of base + off
3390 into some SSA_NAME and use that as off, but for now punt. */
3391 if (!expr_invariant_in_loop_p (loop
, base
))
3393 if (!integer_zerop (off
))
3396 base
= size_int (pbitpos
/ BITS_PER_UNIT
);
3398 /* Otherwise put base + constant offset into the loop invariant BASE
3399 and continue with OFF. */
3402 base
= fold_convert (sizetype
, base
);
3403 base
= size_binop (PLUS_EXPR
, base
, size_int (pbitpos
/ BITS_PER_UNIT
));
3406 /* OFF at this point may be either a SSA_NAME or some tree expression
3407 from get_inner_reference. Try to peel off loop invariants from it
3408 into BASE as long as possible. */
3410 while (offtype
== NULL_TREE
)
3412 enum tree_code code
;
3413 tree op0
, op1
, add
= NULL_TREE
;
3415 if (TREE_CODE (off
) == SSA_NAME
)
3417 gimple
*def_stmt
= SSA_NAME_DEF_STMT (off
);
3419 if (expr_invariant_in_loop_p (loop
, off
))
3422 if (gimple_code (def_stmt
) != GIMPLE_ASSIGN
)
3425 op0
= gimple_assign_rhs1 (def_stmt
);
3426 code
= gimple_assign_rhs_code (def_stmt
);
3427 op1
= gimple_assign_rhs2 (def_stmt
);
3431 if (get_gimple_rhs_class (TREE_CODE (off
)) == GIMPLE_TERNARY_RHS
)
3433 code
= TREE_CODE (off
);
3434 extract_ops_from_tree (off
, &code
, &op0
, &op1
);
3438 case POINTER_PLUS_EXPR
:
3440 if (expr_invariant_in_loop_p (loop
, op0
))
3445 add
= fold_convert (sizetype
, add
);
3447 add
= size_binop (MULT_EXPR
, add
, size_int (scale
));
3448 base
= size_binop (PLUS_EXPR
, base
, add
);
3451 if (expr_invariant_in_loop_p (loop
, op1
))
3459 if (expr_invariant_in_loop_p (loop
, op1
))
3461 add
= fold_convert (sizetype
, op1
);
3462 add
= size_binop (MINUS_EXPR
, size_zero_node
, add
);
3468 if (scale
== 1 && tree_fits_shwi_p (op1
))
3470 scale
= tree_to_shwi (op1
);
3479 if (!POINTER_TYPE_P (TREE_TYPE (op0
))
3480 && !INTEGRAL_TYPE_P (TREE_TYPE (op0
)))
3482 if (TYPE_PRECISION (TREE_TYPE (op0
))
3483 == TYPE_PRECISION (TREE_TYPE (off
)))
3488 if (TYPE_PRECISION (TREE_TYPE (op0
))
3489 < TYPE_PRECISION (TREE_TYPE (off
)))
3492 offtype
= TREE_TYPE (off
);
3503 /* If at the end OFF still isn't a SSA_NAME or isn't
3504 defined in the loop, punt. */
3505 if (TREE_CODE (off
) != SSA_NAME
3506 || expr_invariant_in_loop_p (loop
, off
))
3509 if (offtype
== NULL_TREE
)
3510 offtype
= TREE_TYPE (off
);
3512 if (DR_IS_READ (dr
))
3513 decl
= targetm
.vectorize
.builtin_gather (STMT_VINFO_VECTYPE (stmt_info
),
3516 decl
= targetm
.vectorize
.builtin_scatter (STMT_VINFO_VECTYPE (stmt_info
),
3519 if (decl
== NULL_TREE
)
3525 info
->offset_dt
= vect_unknown_def_type
;
3526 info
->offset_vectype
= NULL_TREE
;
3527 info
->scale
= scale
;
3531 /* Function vect_analyze_data_refs.
3533 Find all the data references in the loop or basic block.
3535 The general structure of the analysis of data refs in the vectorizer is as
3537 1- vect_analyze_data_refs(loop/bb): call
3538 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3539 in the loop/bb and their dependences.
3540 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3541 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3542 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3547 vect_analyze_data_refs (vec_info
*vinfo
, int *min_vf
)
3549 struct loop
*loop
= NULL
;
3551 struct data_reference
*dr
;
3554 if (dump_enabled_p ())
3555 dump_printf_loc (MSG_NOTE
, vect_location
,
3556 "=== vect_analyze_data_refs ===\n");
3558 if (loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
))
3559 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
3561 /* Go through the data-refs, check that the analysis succeeded. Update
3562 pointer from stmt_vec_info struct to DR and vectype. */
3564 vec
<data_reference_p
> datarefs
= vinfo
->datarefs
;
3565 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
3568 stmt_vec_info stmt_info
;
3569 tree base
, offset
, init
;
3570 enum { SG_NONE
, GATHER
, SCATTER
} gatherscatter
= SG_NONE
;
3571 bool simd_lane_access
= false;
3575 if (!dr
|| !DR_REF (dr
))
3577 if (dump_enabled_p ())
3578 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3579 "not vectorized: unhandled data-ref\n");
3583 stmt
= DR_STMT (dr
);
3584 stmt_info
= vinfo_for_stmt (stmt
);
3586 /* Discard clobbers from the dataref vector. We will remove
3587 clobber stmts during vectorization. */
3588 if (gimple_clobber_p (stmt
))
3591 if (i
== datarefs
.length () - 1)
3596 datarefs
.ordered_remove (i
);
3601 /* Check that analysis of the data-ref succeeded. */
3602 if (!DR_BASE_ADDRESS (dr
) || !DR_OFFSET (dr
) || !DR_INIT (dr
)
3607 && !TREE_THIS_VOLATILE (DR_REF (dr
))
3608 && targetm
.vectorize
.builtin_gather
!= NULL
;
3611 && !TREE_THIS_VOLATILE (DR_REF (dr
))
3612 && targetm
.vectorize
.builtin_scatter
!= NULL
;
3613 bool maybe_simd_lane_access
3614 = is_a
<loop_vec_info
> (vinfo
) && loop
->simduid
;
3616 /* If target supports vector gather loads or scatter stores, or if
3617 this might be a SIMD lane access, see if they can't be used. */
3618 if (is_a
<loop_vec_info
> (vinfo
)
3619 && (maybe_gather
|| maybe_scatter
|| maybe_simd_lane_access
)
3620 && !nested_in_vect_loop_p (loop
, stmt
))
3622 struct data_reference
*newdr
3623 = create_data_ref (NULL
, loop_containing_stmt (stmt
),
3624 DR_REF (dr
), stmt
, maybe_scatter
? false : true);
3625 gcc_assert (newdr
!= NULL
&& DR_REF (newdr
));
3626 if (DR_BASE_ADDRESS (newdr
)
3627 && DR_OFFSET (newdr
)
3630 && integer_zerop (DR_STEP (newdr
)))
3632 if (maybe_simd_lane_access
)
3634 tree off
= DR_OFFSET (newdr
);
3636 if (TREE_CODE (DR_INIT (newdr
)) == INTEGER_CST
3637 && TREE_CODE (off
) == MULT_EXPR
3638 && tree_fits_uhwi_p (TREE_OPERAND (off
, 1)))
3640 tree step
= TREE_OPERAND (off
, 1);
3641 off
= TREE_OPERAND (off
, 0);
3643 if (CONVERT_EXPR_P (off
)
3644 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off
,
3646 < TYPE_PRECISION (TREE_TYPE (off
)))
3647 off
= TREE_OPERAND (off
, 0);
3648 if (TREE_CODE (off
) == SSA_NAME
)
3650 gimple
*def
= SSA_NAME_DEF_STMT (off
);
3651 tree reft
= TREE_TYPE (DR_REF (newdr
));
3652 if (is_gimple_call (def
)
3653 && gimple_call_internal_p (def
)
3654 && (gimple_call_internal_fn (def
)
3655 == IFN_GOMP_SIMD_LANE
))
3657 tree arg
= gimple_call_arg (def
, 0);
3658 gcc_assert (TREE_CODE (arg
) == SSA_NAME
);
3659 arg
= SSA_NAME_VAR (arg
);
3660 if (arg
== loop
->simduid
3662 && tree_int_cst_equal
3663 (TYPE_SIZE_UNIT (reft
),
3666 DR_OFFSET (newdr
) = ssize_int (0);
3667 DR_STEP (newdr
) = step
;
3668 DR_ALIGNED_TO (newdr
)
3669 = size_int (BIGGEST_ALIGNMENT
);
3671 simd_lane_access
= true;
3677 if (!simd_lane_access
&& (maybe_gather
|| maybe_scatter
))
3681 gatherscatter
= GATHER
;
3683 gatherscatter
= SCATTER
;
3686 if (gatherscatter
== SG_NONE
&& !simd_lane_access
)
3687 free_data_ref (newdr
);
3690 if (gatherscatter
== SG_NONE
&& !simd_lane_access
)
3692 if (dump_enabled_p ())
3694 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3695 "not vectorized: data ref analysis "
3697 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3700 if (is_a
<bb_vec_info
> (vinfo
))
3707 if (TREE_CODE (DR_BASE_ADDRESS (dr
)) == INTEGER_CST
)
3709 if (dump_enabled_p ())
3710 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3711 "not vectorized: base addr of dr is a "
3714 if (is_a
<bb_vec_info
> (vinfo
))
3717 if (gatherscatter
!= SG_NONE
|| simd_lane_access
)
3722 if (TREE_THIS_VOLATILE (DR_REF (dr
)))
3724 if (dump_enabled_p ())
3726 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3727 "not vectorized: volatile type ");
3728 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3731 if (is_a
<bb_vec_info
> (vinfo
))
3737 if (stmt_can_throw_internal (stmt
))
3739 if (dump_enabled_p ())
3741 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3742 "not vectorized: statement can throw an "
3744 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3747 if (is_a
<bb_vec_info
> (vinfo
))
3750 if (gatherscatter
!= SG_NONE
|| simd_lane_access
)
3755 if (TREE_CODE (DR_REF (dr
)) == COMPONENT_REF
3756 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr
), 1)))
3758 if (dump_enabled_p ())
3760 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3761 "not vectorized: statement is bitfield "
3763 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3766 if (is_a
<bb_vec_info
> (vinfo
))
3769 if (gatherscatter
!= SG_NONE
|| simd_lane_access
)
3774 base
= unshare_expr (DR_BASE_ADDRESS (dr
));
3775 offset
= unshare_expr (DR_OFFSET (dr
));
3776 init
= unshare_expr (DR_INIT (dr
));
3778 if (is_gimple_call (stmt
)
3779 && (!gimple_call_internal_p (stmt
)
3780 || (gimple_call_internal_fn (stmt
) != IFN_MASK_LOAD
3781 && gimple_call_internal_fn (stmt
) != IFN_MASK_STORE
)))
3783 if (dump_enabled_p ())
3785 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3786 "not vectorized: dr in a call ");
3787 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3790 if (is_a
<bb_vec_info
> (vinfo
))
3793 if (gatherscatter
!= SG_NONE
|| simd_lane_access
)
3798 /* Update DR field in stmt_vec_info struct. */
3800 /* If the dataref is in an inner-loop of the loop that is considered for
3801 for vectorization, we also want to analyze the access relative to
3802 the outer-loop (DR contains information only relative to the
3803 inner-most enclosing loop). We do that by building a reference to the
3804 first location accessed by the inner-loop, and analyze it relative to
3806 if (loop
&& nested_in_vect_loop_p (loop
, stmt
))
3808 tree outer_step
, outer_base
, outer_init
;
3809 HOST_WIDE_INT pbitsize
, pbitpos
;
3812 int punsignedp
, preversep
, pvolatilep
;
3813 affine_iv base_iv
, offset_iv
;
3816 /* Build a reference to the first location accessed by the
3817 inner-loop: *(BASE+INIT). (The first location is actually
3818 BASE+INIT+OFFSET, but we add OFFSET separately later). */
3819 tree inner_base
= build_fold_indirect_ref
3820 (fold_build_pointer_plus (base
, init
));
3822 if (dump_enabled_p ())
3824 dump_printf_loc (MSG_NOTE
, vect_location
,
3825 "analyze in outer-loop: ");
3826 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, inner_base
);
3827 dump_printf (MSG_NOTE
, "\n");
3830 outer_base
= get_inner_reference (inner_base
, &pbitsize
, &pbitpos
,
3831 &poffset
, &pmode
, &punsignedp
,
3832 &preversep
, &pvolatilep
);
3833 gcc_assert (outer_base
!= NULL_TREE
);
3835 if (pbitpos
% BITS_PER_UNIT
!= 0)
3837 if (dump_enabled_p ())
3838 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3839 "failed: bit offset alignment.\n");
3845 if (dump_enabled_p ())
3846 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3847 "failed: reverse storage order.\n");
3851 outer_base
= build_fold_addr_expr (outer_base
);
3852 if (!simple_iv (loop
, loop_containing_stmt (stmt
), outer_base
,
3855 if (dump_enabled_p ())
3856 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3857 "failed: evolution of base is not affine.\n");
3864 poffset
= fold_build2 (PLUS_EXPR
, TREE_TYPE (offset
), offset
,
3872 offset_iv
.base
= ssize_int (0);
3873 offset_iv
.step
= ssize_int (0);
3875 else if (!simple_iv (loop
, loop_containing_stmt (stmt
), poffset
,
3878 if (dump_enabled_p ())
3879 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3880 "evolution of offset is not affine.\n");
3884 outer_init
= ssize_int (pbitpos
/ BITS_PER_UNIT
);
3885 split_constant_offset (base_iv
.base
, &base_iv
.base
, &dinit
);
3886 outer_init
= size_binop (PLUS_EXPR
, outer_init
, dinit
);
3887 split_constant_offset (offset_iv
.base
, &offset_iv
.base
, &dinit
);
3888 outer_init
= size_binop (PLUS_EXPR
, outer_init
, dinit
);
3890 outer_step
= size_binop (PLUS_EXPR
,
3891 fold_convert (ssizetype
, base_iv
.step
),
3892 fold_convert (ssizetype
, offset_iv
.step
));
3894 STMT_VINFO_DR_STEP (stmt_info
) = outer_step
;
3895 /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
3896 STMT_VINFO_DR_BASE_ADDRESS (stmt_info
) = base_iv
.base
;
3897 STMT_VINFO_DR_INIT (stmt_info
) = outer_init
;
3898 STMT_VINFO_DR_OFFSET (stmt_info
) =
3899 fold_convert (ssizetype
, offset_iv
.base
);
3900 STMT_VINFO_DR_ALIGNED_TO (stmt_info
) =
3901 size_int (highest_pow2_factor (offset_iv
.base
));
3903 if (dump_enabled_p ())
3905 dump_printf_loc (MSG_NOTE
, vect_location
,
3906 "\touter base_address: ");
3907 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3908 STMT_VINFO_DR_BASE_ADDRESS (stmt_info
));
3909 dump_printf (MSG_NOTE
, "\n\touter offset from base address: ");
3910 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3911 STMT_VINFO_DR_OFFSET (stmt_info
));
3912 dump_printf (MSG_NOTE
,
3913 "\n\touter constant offset from base address: ");
3914 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3915 STMT_VINFO_DR_INIT (stmt_info
));
3916 dump_printf (MSG_NOTE
, "\n\touter step: ");
3917 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3918 STMT_VINFO_DR_STEP (stmt_info
));
3919 dump_printf (MSG_NOTE
, "\n\touter aligned to: ");
3920 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3921 STMT_VINFO_DR_ALIGNED_TO (stmt_info
));
3922 dump_printf (MSG_NOTE
, "\n");
3926 if (STMT_VINFO_DATA_REF (stmt_info
))
3928 if (dump_enabled_p ())
3930 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3931 "not vectorized: more than one data ref "
3933 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3936 if (is_a
<bb_vec_info
> (vinfo
))
3939 if (gatherscatter
!= SG_NONE
|| simd_lane_access
)
3944 STMT_VINFO_DATA_REF (stmt_info
) = dr
;
3945 if (simd_lane_access
)
3947 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info
) = true;
3948 free_data_ref (datarefs
[i
]);
3952 /* Set vectype for STMT. */
3953 scalar_type
= TREE_TYPE (DR_REF (dr
));
3954 STMT_VINFO_VECTYPE (stmt_info
)
3955 = get_vectype_for_scalar_type (scalar_type
);
3956 if (!STMT_VINFO_VECTYPE (stmt_info
))
3958 if (dump_enabled_p ())
3960 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3961 "not vectorized: no vectype for stmt: ");
3962 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3963 dump_printf (MSG_MISSED_OPTIMIZATION
, " scalar_type: ");
3964 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_DETAILS
,
3966 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
3969 if (is_a
<bb_vec_info
> (vinfo
))
3971 /* No vector type is fine, the ref can still participate
3972 in dependence analysis, we just can't vectorize it. */
3973 STMT_VINFO_VECTORIZABLE (stmt_info
) = false;
3977 if (gatherscatter
!= SG_NONE
|| simd_lane_access
)
3979 STMT_VINFO_DATA_REF (stmt_info
) = NULL
;
3980 if (gatherscatter
!= SG_NONE
)
3987 if (dump_enabled_p ())
3989 dump_printf_loc (MSG_NOTE
, vect_location
,
3990 "got vectype for stmt: ");
3991 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, stmt
, 0);
3992 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3993 STMT_VINFO_VECTYPE (stmt_info
));
3994 dump_printf (MSG_NOTE
, "\n");
3998 /* Adjust the minimal vectorization factor according to the
4000 vf
= TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info
));
4004 if (gatherscatter
!= SG_NONE
)
4006 gather_scatter_info gs_info
;
4007 if (!vect_check_gather_scatter (stmt
, as_a
<loop_vec_info
> (vinfo
),
4009 || !get_vectype_for_scalar_type (TREE_TYPE (gs_info
.offset
)))
4011 STMT_VINFO_DATA_REF (stmt_info
) = NULL
;
4013 if (dump_enabled_p ())
4015 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4016 (gatherscatter
== GATHER
) ?
4017 "not vectorized: not suitable for gather "
4019 "not vectorized: not suitable for scatter "
4021 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
4026 free_data_ref (datarefs
[i
]);
4028 STMT_VINFO_GATHER_SCATTER_P (stmt_info
) = gatherscatter
;
4031 else if (is_a
<loop_vec_info
> (vinfo
)
4032 && TREE_CODE (DR_STEP (dr
)) != INTEGER_CST
)
4034 if (nested_in_vect_loop_p (loop
, stmt
))
4036 if (dump_enabled_p ())
4038 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4039 "not vectorized: not suitable for strided "
4041 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
4045 STMT_VINFO_STRIDED_P (stmt_info
) = true;
4049 /* If we stopped analysis at the first dataref we could not analyze
4050 when trying to vectorize a basic-block mark the rest of the datarefs
4051 as not vectorizable and truncate the vector of datarefs. That
4052 avoids spending useless time in analyzing their dependence. */
4053 if (i
!= datarefs
.length ())
4055 gcc_assert (is_a
<bb_vec_info
> (vinfo
));
4056 for (unsigned j
= i
; j
< datarefs
.length (); ++j
)
4058 data_reference_p dr
= datarefs
[j
];
4059 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr
))) = false;
4062 datarefs
.truncate (i
);
4069 /* Function vect_get_new_vect_var.
4071 Returns a name for a new variable. The current naming scheme appends the
4072 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
4073 the name of vectorizer generated variables, and appends that to NAME if
4077 vect_get_new_vect_var (tree type
, enum vect_var_kind var_kind
, const char *name
)
4084 case vect_simple_var
:
4087 case vect_scalar_var
:
4093 case vect_pointer_var
:
4102 char* tmp
= concat (prefix
, "_", name
, NULL
);
4103 new_vect_var
= create_tmp_reg (type
, tmp
);
4107 new_vect_var
= create_tmp_reg (type
, prefix
);
4109 return new_vect_var
;
4112 /* Like vect_get_new_vect_var but return an SSA name. */
4115 vect_get_new_ssa_name (tree type
, enum vect_var_kind var_kind
, const char *name
)
4122 case vect_simple_var
:
4125 case vect_scalar_var
:
4128 case vect_pointer_var
:
4137 char* tmp
= concat (prefix
, "_", name
, NULL
);
4138 new_vect_var
= make_temp_ssa_name (type
, NULL
, tmp
);
4142 new_vect_var
= make_temp_ssa_name (type
, NULL
, prefix
);
4144 return new_vect_var
;
4147 /* Duplicate ptr info and set alignment/misaligment on NAME from DR. */
4150 vect_duplicate_ssa_name_ptr_info (tree name
, data_reference
*dr
,
4151 stmt_vec_info stmt_info
)
4153 duplicate_ssa_name_ptr_info (name
, DR_PTR_INFO (dr
));
4154 unsigned int align
= TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmt_info
));
4155 int misalign
= DR_MISALIGNMENT (dr
);
4157 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name
));
4159 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name
), align
, misalign
);
4162 /* Function vect_create_addr_base_for_vector_ref.
4164 Create an expression that computes the address of the first memory location
4165 that will be accessed for a data reference.
4168 STMT: The statement containing the data reference.
4169 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4170 OFFSET: Optional. If supplied, it is be added to the initial address.
4171 LOOP: Specify relative to which loop-nest should the address be computed.
4172 For example, when the dataref is in an inner-loop nested in an
4173 outer-loop that is now being vectorized, LOOP can be either the
4174 outer-loop, or the inner-loop. The first memory location accessed
4175 by the following dataref ('in' points to short):
4182 if LOOP=i_loop: &in (relative to i_loop)
4183 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4184 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4185 initial address. Unlike OFFSET, which is number of elements to
4186 be added, BYTE_OFFSET is measured in bytes.
4189 1. Return an SSA_NAME whose value is the address of the memory location of
4190 the first vector of the data reference.
4191 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4192 these statement(s) which define the returned SSA_NAME.
4194 FORNOW: We are only handling array accesses with step 1. */
4197 vect_create_addr_base_for_vector_ref (gimple
*stmt
,
4198 gimple_seq
*new_stmt_list
,
4203 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
4204 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
4206 const char *base_name
;
4209 gimple_seq seq
= NULL
;
4213 tree step
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr
)));
4214 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
4216 if (loop_vinfo
&& loop
&& loop
!= (gimple_bb (stmt
))->loop_father
)
4218 struct loop
*outer_loop
= LOOP_VINFO_LOOP (loop_vinfo
);
4220 gcc_assert (nested_in_vect_loop_p (outer_loop
, stmt
));
4222 data_ref_base
= unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info
));
4223 base_offset
= unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info
));
4224 init
= unshare_expr (STMT_VINFO_DR_INIT (stmt_info
));
4228 data_ref_base
= unshare_expr (DR_BASE_ADDRESS (dr
));
4229 base_offset
= unshare_expr (DR_OFFSET (dr
));
4230 init
= unshare_expr (DR_INIT (dr
));
4234 base_name
= get_name (data_ref_base
);
4237 base_offset
= ssize_int (0);
4238 init
= ssize_int (0);
4239 base_name
= get_name (DR_REF (dr
));
4242 /* Create base_offset */
4243 base_offset
= size_binop (PLUS_EXPR
,
4244 fold_convert (sizetype
, base_offset
),
4245 fold_convert (sizetype
, init
));
4249 offset
= fold_build2 (MULT_EXPR
, sizetype
,
4250 fold_convert (sizetype
, offset
), step
);
4251 base_offset
= fold_build2 (PLUS_EXPR
, sizetype
,
4252 base_offset
, offset
);
4256 byte_offset
= fold_convert (sizetype
, byte_offset
);
4257 base_offset
= fold_build2 (PLUS_EXPR
, sizetype
,
4258 base_offset
, byte_offset
);
4261 /* base + base_offset */
4263 addr_base
= fold_build_pointer_plus (data_ref_base
, base_offset
);
4266 addr_base
= build1 (ADDR_EXPR
,
4267 build_pointer_type (TREE_TYPE (DR_REF (dr
))),
4268 unshare_expr (DR_REF (dr
)));
4271 vect_ptr_type
= build_pointer_type (STMT_VINFO_VECTYPE (stmt_info
));
4272 dest
= vect_get_new_vect_var (vect_ptr_type
, vect_pointer_var
, base_name
);
4273 addr_base
= force_gimple_operand (addr_base
, &seq
, true, dest
);
4274 gimple_seq_add_seq (new_stmt_list
, seq
);
4276 if (DR_PTR_INFO (dr
)
4277 && TREE_CODE (addr_base
) == SSA_NAME
4278 && !SSA_NAME_PTR_INFO (addr_base
))
4280 vect_duplicate_ssa_name_ptr_info (addr_base
, dr
, stmt_info
);
4281 if (offset
|| byte_offset
)
4282 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base
));
4285 if (dump_enabled_p ())
4287 dump_printf_loc (MSG_NOTE
, vect_location
, "created ");
4288 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, addr_base
);
4289 dump_printf (MSG_NOTE
, "\n");
4296 /* Function vect_create_data_ref_ptr.
4298 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4299 location accessed in the loop by STMT, along with the def-use update
4300 chain to appropriately advance the pointer through the loop iterations.
4301 Also set aliasing information for the pointer. This pointer is used by
4302 the callers to this function to create a memory reference expression for
4303 vector load/store access.
4306 1. STMT: a stmt that references memory. Expected to be of the form
4307 GIMPLE_ASSIGN <name, data-ref> or
4308 GIMPLE_ASSIGN <data-ref, name>.
4309 2. AGGR_TYPE: the type of the reference, which should be either a vector
4311 3. AT_LOOP: the loop where the vector memref is to be created.
4312 4. OFFSET (optional): an offset to be added to the initial address accessed
4313 by the data-ref in STMT.
4314 5. BSI: location where the new stmts are to be placed if there is no loop
4315 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4316 pointing to the initial address.
4317 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4318 to the initial address accessed by the data-ref in STMT. This is
4319 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4323 1. Declare a new ptr to vector_type, and have it point to the base of the
4324 data reference (initial addressed accessed by the data reference).
4325 For example, for vector of type V8HI, the following code is generated:
4328 ap = (v8hi *)initial_address;
4330 if OFFSET is not supplied:
4331 initial_address = &a[init];
4332 if OFFSET is supplied:
4333 initial_address = &a[init + OFFSET];
4334 if BYTE_OFFSET is supplied:
4335 initial_address = &a[init] + BYTE_OFFSET;
4337 Return the initial_address in INITIAL_ADDRESS.
4339 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4340 update the pointer in each iteration of the loop.
4342 Return the increment stmt that updates the pointer in PTR_INCR.
4344 3. Set INV_P to true if the access pattern of the data reference in the
4345 vectorized loop is invariant. Set it to false otherwise.
4347 4. Return the pointer. */
4350 vect_create_data_ref_ptr (gimple
*stmt
, tree aggr_type
, struct loop
*at_loop
,
4351 tree offset
, tree
*initial_address
,
4352 gimple_stmt_iterator
*gsi
, gimple
**ptr_incr
,
4353 bool only_init
, bool *inv_p
, tree byte_offset
)
4355 const char *base_name
;
4356 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
4357 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
4358 struct loop
*loop
= NULL
;
4359 bool nested_in_vect_loop
= false;
4360 struct loop
*containing_loop
= NULL
;
4364 gimple_seq new_stmt_list
= NULL
;
4368 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
4370 gimple_stmt_iterator incr_gsi
;
4372 tree indx_before_incr
, indx_after_incr
;
4375 bb_vec_info bb_vinfo
= STMT_VINFO_BB_VINFO (stmt_info
);
4377 gcc_assert (TREE_CODE (aggr_type
) == ARRAY_TYPE
4378 || TREE_CODE (aggr_type
) == VECTOR_TYPE
);
4382 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
4383 nested_in_vect_loop
= nested_in_vect_loop_p (loop
, stmt
);
4384 containing_loop
= (gimple_bb (stmt
))->loop_father
;
4385 pe
= loop_preheader_edge (loop
);
4389 gcc_assert (bb_vinfo
);
4394 /* Check the step (evolution) of the load in LOOP, and record
4395 whether it's invariant. */
4396 if (nested_in_vect_loop
)
4397 step
= STMT_VINFO_DR_STEP (stmt_info
);
4399 step
= DR_STEP (STMT_VINFO_DATA_REF (stmt_info
));
4401 if (integer_zerop (step
))
4406 /* Create an expression for the first address accessed by this load
4408 base_name
= get_name (DR_BASE_ADDRESS (dr
));
4410 if (dump_enabled_p ())
4412 tree dr_base_type
= TREE_TYPE (DR_BASE_OBJECT (dr
));
4413 dump_printf_loc (MSG_NOTE
, vect_location
,
4414 "create %s-pointer variable to type: ",
4415 get_tree_code_name (TREE_CODE (aggr_type
)));
4416 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, aggr_type
);
4417 if (TREE_CODE (dr_base_type
) == ARRAY_TYPE
)
4418 dump_printf (MSG_NOTE
, " vectorizing an array ref: ");
4419 else if (TREE_CODE (dr_base_type
) == VECTOR_TYPE
)
4420 dump_printf (MSG_NOTE
, " vectorizing a vector ref: ");
4421 else if (TREE_CODE (dr_base_type
) == RECORD_TYPE
)
4422 dump_printf (MSG_NOTE
, " vectorizing a record based array ref: ");
4424 dump_printf (MSG_NOTE
, " vectorizing a pointer ref: ");
4425 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_BASE_OBJECT (dr
));
4426 dump_printf (MSG_NOTE
, "\n");
4429 /* (1) Create the new aggregate-pointer variable.
4430 Vector and array types inherit the alias set of their component
4431 type by default so we need to use a ref-all pointer if the data
4432 reference does not conflict with the created aggregated data
4433 reference because it is not addressable. */
4434 bool need_ref_all
= false;
4435 if (!alias_sets_conflict_p (get_alias_set (aggr_type
),
4436 get_alias_set (DR_REF (dr
))))
4437 need_ref_all
= true;
4438 /* Likewise for any of the data references in the stmt group. */
4439 else if (STMT_VINFO_GROUP_SIZE (stmt_info
) > 1)
4441 gimple
*orig_stmt
= STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info
);
4444 stmt_vec_info sinfo
= vinfo_for_stmt (orig_stmt
);
4445 struct data_reference
*sdr
= STMT_VINFO_DATA_REF (sinfo
);
4446 if (!alias_sets_conflict_p (get_alias_set (aggr_type
),
4447 get_alias_set (DR_REF (sdr
))))
4449 need_ref_all
= true;
4452 orig_stmt
= STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo
);
4456 aggr_ptr_type
= build_pointer_type_for_mode (aggr_type
, ptr_mode
,
4458 aggr_ptr
= vect_get_new_vect_var (aggr_ptr_type
, vect_pointer_var
, base_name
);
4461 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4462 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4463 def-use update cycles for the pointer: one relative to the outer-loop
4464 (LOOP), which is what steps (3) and (4) below do. The other is relative
4465 to the inner-loop (which is the inner-most loop containing the dataref),
4466 and this is done be step (5) below.
4468 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4469 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4470 redundant. Steps (3),(4) create the following:
4473 LOOP: vp1 = phi(vp0,vp2)
4479 If there is an inner-loop nested in loop, then step (5) will also be
4480 applied, and an additional update in the inner-loop will be created:
4483 LOOP: vp1 = phi(vp0,vp2)
4485 inner: vp3 = phi(vp1,vp4)
4486 vp4 = vp3 + inner_step
4492 /* (2) Calculate the initial address of the aggregate-pointer, and set
4493 the aggregate-pointer to point to it before the loop. */
4495 /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */
4497 new_temp
= vect_create_addr_base_for_vector_ref (stmt
, &new_stmt_list
,
4498 offset
, loop
, byte_offset
);
4503 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, new_stmt_list
);
4504 gcc_assert (!new_bb
);
4507 gsi_insert_seq_before (gsi
, new_stmt_list
, GSI_SAME_STMT
);
4510 *initial_address
= new_temp
;
4511 aggr_ptr_init
= new_temp
;
4513 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4514 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4515 inner-loop nested in LOOP (during outer-loop vectorization). */
4517 /* No update in loop is required. */
4518 if (only_init
&& (!loop_vinfo
|| at_loop
== loop
))
4519 aptr
= aggr_ptr_init
;
4522 /* The step of the aggregate pointer is the type size. */
4523 tree iv_step
= TYPE_SIZE_UNIT (aggr_type
);
4524 /* One exception to the above is when the scalar step of the load in
4525 LOOP is zero. In this case the step here is also zero. */
4527 iv_step
= size_zero_node
;
4528 else if (tree_int_cst_sgn (step
) == -1)
4529 iv_step
= fold_build1 (NEGATE_EXPR
, TREE_TYPE (iv_step
), iv_step
);
4531 standard_iv_increment_position (loop
, &incr_gsi
, &insert_after
);
4533 create_iv (aggr_ptr_init
,
4534 fold_convert (aggr_ptr_type
, iv_step
),
4535 aggr_ptr
, loop
, &incr_gsi
, insert_after
,
4536 &indx_before_incr
, &indx_after_incr
);
4537 incr
= gsi_stmt (incr_gsi
);
4538 set_vinfo_for_stmt (incr
, new_stmt_vec_info (incr
, loop_vinfo
));
4540 /* Copy the points-to information if it exists. */
4541 if (DR_PTR_INFO (dr
))
4543 vect_duplicate_ssa_name_ptr_info (indx_before_incr
, dr
, stmt_info
);
4544 vect_duplicate_ssa_name_ptr_info (indx_after_incr
, dr
, stmt_info
);
4549 aptr
= indx_before_incr
;
4552 if (!nested_in_vect_loop
|| only_init
)
4556 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4557 nested in LOOP, if exists. */
4559 gcc_assert (nested_in_vect_loop
);
4562 standard_iv_increment_position (containing_loop
, &incr_gsi
,
4564 create_iv (aptr
, fold_convert (aggr_ptr_type
, DR_STEP (dr
)), aggr_ptr
,
4565 containing_loop
, &incr_gsi
, insert_after
, &indx_before_incr
,
4567 incr
= gsi_stmt (incr_gsi
);
4568 set_vinfo_for_stmt (incr
, new_stmt_vec_info (incr
, loop_vinfo
));
4570 /* Copy the points-to information if it exists. */
4571 if (DR_PTR_INFO (dr
))
4573 vect_duplicate_ssa_name_ptr_info (indx_before_incr
, dr
, stmt_info
);
4574 vect_duplicate_ssa_name_ptr_info (indx_after_incr
, dr
, stmt_info
);
4579 return indx_before_incr
;
4586 /* Function bump_vector_ptr
4588 Increment a pointer (to a vector type) by vector-size. If requested,
4589 i.e. if PTR-INCR is given, then also connect the new increment stmt
4590 to the existing def-use update-chain of the pointer, by modifying
4591 the PTR_INCR as illustrated below:
4593 The pointer def-use update-chain before this function:
4594 DATAREF_PTR = phi (p_0, p_2)
4596 PTR_INCR: p_2 = DATAREF_PTR + step
4598 The pointer def-use update-chain after this function:
4599 DATAREF_PTR = phi (p_0, p_2)
4601 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4603 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4606 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4608 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4609 the loop. The increment amount across iterations is expected
4611 BSI - location where the new update stmt is to be placed.
4612 STMT - the original scalar memory-access stmt that is being vectorized.
4613 BUMP - optional. The offset by which to bump the pointer. If not given,
4614 the offset is assumed to be vector_size.
4616 Output: Return NEW_DATAREF_PTR as illustrated above.
4621 bump_vector_ptr (tree dataref_ptr
, gimple
*ptr_incr
, gimple_stmt_iterator
*gsi
,
4622 gimple
*stmt
, tree bump
)
4624 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
4625 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
4626 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
4627 tree update
= TYPE_SIZE_UNIT (vectype
);
4630 use_operand_p use_p
;
4631 tree new_dataref_ptr
;
4636 if (TREE_CODE (dataref_ptr
) == SSA_NAME
)
4637 new_dataref_ptr
= copy_ssa_name (dataref_ptr
);
4639 new_dataref_ptr
= make_ssa_name (TREE_TYPE (dataref_ptr
));
4640 incr_stmt
= gimple_build_assign (new_dataref_ptr
, POINTER_PLUS_EXPR
,
4641 dataref_ptr
, update
);
4642 vect_finish_stmt_generation (stmt
, incr_stmt
, gsi
);
4644 /* Copy the points-to information if it exists. */
4645 if (DR_PTR_INFO (dr
))
4647 duplicate_ssa_name_ptr_info (new_dataref_ptr
, DR_PTR_INFO (dr
));
4648 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr
));
4652 return new_dataref_ptr
;
4654 /* Update the vector-pointer's cross-iteration increment. */
4655 FOR_EACH_SSA_USE_OPERAND (use_p
, ptr_incr
, iter
, SSA_OP_USE
)
4657 tree use
= USE_FROM_PTR (use_p
);
4659 if (use
== dataref_ptr
)
4660 SET_USE (use_p
, new_dataref_ptr
);
4662 gcc_assert (tree_int_cst_compare (use
, update
) == 0);
4665 return new_dataref_ptr
;
4669 /* Function vect_create_destination_var.
4671 Create a new temporary of type VECTYPE. */
4674 vect_create_destination_var (tree scalar_dest
, tree vectype
)
4680 enum vect_var_kind kind
;
4683 ? VECTOR_BOOLEAN_TYPE_P (vectype
)
4687 type
= vectype
? vectype
: TREE_TYPE (scalar_dest
);
4689 gcc_assert (TREE_CODE (scalar_dest
) == SSA_NAME
);
4691 name
= get_name (scalar_dest
);
4693 new_name
= xasprintf ("%s_%u", name
, SSA_NAME_VERSION (scalar_dest
));
4695 new_name
= xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest
));
4696 vec_dest
= vect_get_new_vect_var (type
, kind
, new_name
);
4702 /* Function vect_grouped_store_supported.
4704 Returns TRUE if interleave high and interleave low permutations
4705 are supported, and FALSE otherwise. */
4708 vect_grouped_store_supported (tree vectype
, unsigned HOST_WIDE_INT count
)
4710 machine_mode mode
= TYPE_MODE (vectype
);
4712 /* vect_permute_store_chain requires the group size to be equal to 3 or
4713 be a power of two. */
4714 if (count
!= 3 && exact_log2 (count
) == -1)
4716 if (dump_enabled_p ())
4717 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4718 "the size of the group of accesses"
4719 " is not a power of 2 or not eqaul to 3\n");
4723 /* Check that the permutation is supported. */
4724 if (VECTOR_MODE_P (mode
))
4726 unsigned int i
, nelt
= GET_MODE_NUNITS (mode
);
4727 unsigned char *sel
= XALLOCAVEC (unsigned char, nelt
);
4731 unsigned int j0
= 0, j1
= 0, j2
= 0;
4734 for (j
= 0; j
< 3; j
++)
4736 int nelt0
= ((3 - j
) * nelt
) % 3;
4737 int nelt1
= ((3 - j
) * nelt
+ 1) % 3;
4738 int nelt2
= ((3 - j
) * nelt
+ 2) % 3;
4739 for (i
= 0; i
< nelt
; i
++)
4741 if (3 * i
+ nelt0
< nelt
)
4742 sel
[3 * i
+ nelt0
] = j0
++;
4743 if (3 * i
+ nelt1
< nelt
)
4744 sel
[3 * i
+ nelt1
] = nelt
+ j1
++;
4745 if (3 * i
+ nelt2
< nelt
)
4746 sel
[3 * i
+ nelt2
] = 0;
4748 if (!can_vec_perm_p (mode
, false, sel
))
4750 if (dump_enabled_p ())
4751 dump_printf (MSG_MISSED_OPTIMIZATION
,
4752 "permutaion op not supported by target.\n");
4756 for (i
= 0; i
< nelt
; i
++)
4758 if (3 * i
+ nelt0
< nelt
)
4759 sel
[3 * i
+ nelt0
] = 3 * i
+ nelt0
;
4760 if (3 * i
+ nelt1
< nelt
)
4761 sel
[3 * i
+ nelt1
] = 3 * i
+ nelt1
;
4762 if (3 * i
+ nelt2
< nelt
)
4763 sel
[3 * i
+ nelt2
] = nelt
+ j2
++;
4765 if (!can_vec_perm_p (mode
, false, sel
))
4767 if (dump_enabled_p ())
4768 dump_printf (MSG_MISSED_OPTIMIZATION
,
4769 "permutaion op not supported by target.\n");
4777 /* If length is not equal to 3 then only power of 2 is supported. */
4778 gcc_assert (pow2p_hwi (count
));
4780 for (i
= 0; i
< nelt
/ 2; i
++)
4783 sel
[i
* 2 + 1] = i
+ nelt
;
4785 if (can_vec_perm_p (mode
, false, sel
))
4787 for (i
= 0; i
< nelt
; i
++)
4789 if (can_vec_perm_p (mode
, false, sel
))
4795 if (dump_enabled_p ())
4796 dump_printf (MSG_MISSED_OPTIMIZATION
,
4797 "permutaion op not supported by target.\n");
4802 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4806 vect_store_lanes_supported (tree vectype
, unsigned HOST_WIDE_INT count
)
4808 return vect_lanes_optab_supported_p ("vec_store_lanes",
4809 vec_store_lanes_optab
,
4814 /* Function vect_permute_store_chain.
4816 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4817 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4818 the data correctly for the stores. Return the final references for stores
4821 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4822 The input is 4 vectors each containing 8 elements. We assign a number to
4823 each element, the input sequence is:
4825 1st vec: 0 1 2 3 4 5 6 7
4826 2nd vec: 8 9 10 11 12 13 14 15
4827 3rd vec: 16 17 18 19 20 21 22 23
4828 4th vec: 24 25 26 27 28 29 30 31
4830 The output sequence should be:
4832 1st vec: 0 8 16 24 1 9 17 25
4833 2nd vec: 2 10 18 26 3 11 19 27
4834 3rd vec: 4 12 20 28 5 13 21 30
4835 4th vec: 6 14 22 30 7 15 23 31
4837 i.e., we interleave the contents of the four vectors in their order.
4839 We use interleave_high/low instructions to create such output. The input of
4840 each interleave_high/low operation is two vectors:
4843 the even elements of the result vector are obtained left-to-right from the
4844 high/low elements of the first vector. The odd elements of the result are
4845 obtained left-to-right from the high/low elements of the second vector.
4846 The output of interleave_high will be: 0 4 1 5
4847 and of interleave_low: 2 6 3 7
4850 The permutation is done in log LENGTH stages. In each stage interleave_high
4851 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4852 where the first argument is taken from the first half of DR_CHAIN and the
4853 second argument from it's second half.
4856 I1: interleave_high (1st vec, 3rd vec)
4857 I2: interleave_low (1st vec, 3rd vec)
4858 I3: interleave_high (2nd vec, 4th vec)
4859 I4: interleave_low (2nd vec, 4th vec)
4861 The output for the first stage is:
4863 I1: 0 16 1 17 2 18 3 19
4864 I2: 4 20 5 21 6 22 7 23
4865 I3: 8 24 9 25 10 26 11 27
4866 I4: 12 28 13 29 14 30 15 31
4868 The output of the second stage, i.e. the final result is:
4870 I1: 0 8 16 24 1 9 17 25
4871 I2: 2 10 18 26 3 11 19 27
4872 I3: 4 12 20 28 5 13 21 30
4873 I4: 6 14 22 30 7 15 23 31. */
4876 vect_permute_store_chain (vec
<tree
> dr_chain
,
4877 unsigned int length
,
4879 gimple_stmt_iterator
*gsi
,
4880 vec
<tree
> *result_chain
)
4882 tree vect1
, vect2
, high
, low
;
4884 tree vectype
= STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt
));
4885 tree perm_mask_low
, perm_mask_high
;
4887 tree perm3_mask_low
, perm3_mask_high
;
4888 unsigned int i
, n
, log_length
= exact_log2 (length
);
4889 unsigned int j
, nelt
= TYPE_VECTOR_SUBPARTS (vectype
);
4890 unsigned char *sel
= XALLOCAVEC (unsigned char, nelt
);
4892 result_chain
->quick_grow (length
);
4893 memcpy (result_chain
->address (), dr_chain
.address (),
4894 length
* sizeof (tree
));
4898 unsigned int j0
= 0, j1
= 0, j2
= 0;
4900 for (j
= 0; j
< 3; j
++)
4902 int nelt0
= ((3 - j
) * nelt
) % 3;
4903 int nelt1
= ((3 - j
) * nelt
+ 1) % 3;
4904 int nelt2
= ((3 - j
) * nelt
+ 2) % 3;
4906 for (i
= 0; i
< nelt
; i
++)
4908 if (3 * i
+ nelt0
< nelt
)
4909 sel
[3 * i
+ nelt0
] = j0
++;
4910 if (3 * i
+ nelt1
< nelt
)
4911 sel
[3 * i
+ nelt1
] = nelt
+ j1
++;
4912 if (3 * i
+ nelt2
< nelt
)
4913 sel
[3 * i
+ nelt2
] = 0;
4915 perm3_mask_low
= vect_gen_perm_mask_checked (vectype
, sel
);
4917 for (i
= 0; i
< nelt
; i
++)
4919 if (3 * i
+ nelt0
< nelt
)
4920 sel
[3 * i
+ nelt0
] = 3 * i
+ nelt0
;
4921 if (3 * i
+ nelt1
< nelt
)
4922 sel
[3 * i
+ nelt1
] = 3 * i
+ nelt1
;
4923 if (3 * i
+ nelt2
< nelt
)
4924 sel
[3 * i
+ nelt2
] = nelt
+ j2
++;
4926 perm3_mask_high
= vect_gen_perm_mask_checked (vectype
, sel
);
4928 vect1
= dr_chain
[0];
4929 vect2
= dr_chain
[1];
4931 /* Create interleaving stmt:
4932 low = VEC_PERM_EXPR <vect1, vect2,
4933 {j, nelt, *, j + 1, nelt + j + 1, *,
4934 j + 2, nelt + j + 2, *, ...}> */
4935 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3_low");
4936 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, vect1
,
4937 vect2
, perm3_mask_low
);
4938 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
4941 vect2
= dr_chain
[2];
4942 /* Create interleaving stmt:
4943 low = VEC_PERM_EXPR <vect1, vect2,
4944 {0, 1, nelt + j, 3, 4, nelt + j + 1,
4945 6, 7, nelt + j + 2, ...}> */
4946 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3_high");
4947 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, vect1
,
4948 vect2
, perm3_mask_high
);
4949 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
4950 (*result_chain
)[j
] = data_ref
;
4955 /* If length is not equal to 3 then only power of 2 is supported. */
4956 gcc_assert (pow2p_hwi (length
));
4958 for (i
= 0, n
= nelt
/ 2; i
< n
; i
++)
4961 sel
[i
* 2 + 1] = i
+ nelt
;
4963 perm_mask_high
= vect_gen_perm_mask_checked (vectype
, sel
);
4965 for (i
= 0; i
< nelt
; i
++)
4967 perm_mask_low
= vect_gen_perm_mask_checked (vectype
, sel
);
4969 for (i
= 0, n
= log_length
; i
< n
; i
++)
4971 for (j
= 0; j
< length
/2; j
++)
4973 vect1
= dr_chain
[j
];
4974 vect2
= dr_chain
[j
+length
/2];
4976 /* Create interleaving stmt:
4977 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
4979 high
= make_temp_ssa_name (vectype
, NULL
, "vect_inter_high");
4980 perm_stmt
= gimple_build_assign (high
, VEC_PERM_EXPR
, vect1
,
4981 vect2
, perm_mask_high
);
4982 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
4983 (*result_chain
)[2*j
] = high
;
4985 /* Create interleaving stmt:
4986 low = VEC_PERM_EXPR <vect1, vect2,
4987 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
4989 low
= make_temp_ssa_name (vectype
, NULL
, "vect_inter_low");
4990 perm_stmt
= gimple_build_assign (low
, VEC_PERM_EXPR
, vect1
,
4991 vect2
, perm_mask_low
);
4992 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
4993 (*result_chain
)[2*j
+1] = low
;
4995 memcpy (dr_chain
.address (), result_chain
->address (),
4996 length
* sizeof (tree
));
5001 /* Function vect_setup_realignment
5003 This function is called when vectorizing an unaligned load using
5004 the dr_explicit_realign[_optimized] scheme.
5005 This function generates the following code at the loop prolog:
5008 x msq_init = *(floor(p)); # prolog load
5009 realignment_token = call target_builtin;
5011 x msq = phi (msq_init, ---)
5013 The stmts marked with x are generated only for the case of
5014 dr_explicit_realign_optimized.
5016 The code above sets up a new (vector) pointer, pointing to the first
5017 location accessed by STMT, and a "floor-aligned" load using that pointer.
5018 It also generates code to compute the "realignment-token" (if the relevant
5019 target hook was defined), and creates a phi-node at the loop-header bb
5020 whose arguments are the result of the prolog-load (created by this
5021 function) and the result of a load that takes place in the loop (to be
5022 created by the caller to this function).
5024 For the case of dr_explicit_realign_optimized:
5025 The caller to this function uses the phi-result (msq) to create the
5026 realignment code inside the loop, and sets up the missing phi argument,
5029 msq = phi (msq_init, lsq)
5030 lsq = *(floor(p')); # load in loop
5031 result = realign_load (msq, lsq, realignment_token);
5033 For the case of dr_explicit_realign:
5035 msq = *(floor(p)); # load in loop
5037 lsq = *(floor(p')); # load in loop
5038 result = realign_load (msq, lsq, realignment_token);
5041 STMT - (scalar) load stmt to be vectorized. This load accesses
5042 a memory location that may be unaligned.
5043 BSI - place where new code is to be inserted.
5044 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5048 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5049 target hook, if defined.
5050 Return value - the result of the loop-header phi node. */
5053 vect_setup_realignment (gimple
*stmt
, gimple_stmt_iterator
*gsi
,
5054 tree
*realignment_token
,
5055 enum dr_alignment_support alignment_support_scheme
,
5057 struct loop
**at_loop
)
5059 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
5060 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
5061 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
5062 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
5063 struct loop
*loop
= NULL
;
5065 tree scalar_dest
= gimple_assign_lhs (stmt
);
5071 tree msq_init
= NULL_TREE
;
5074 tree msq
= NULL_TREE
;
5075 gimple_seq stmts
= NULL
;
5077 bool compute_in_loop
= false;
5078 bool nested_in_vect_loop
= false;
5079 struct loop
*containing_loop
= (gimple_bb (stmt
))->loop_father
;
5080 struct loop
*loop_for_initial_load
= NULL
;
5084 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
5085 nested_in_vect_loop
= nested_in_vect_loop_p (loop
, stmt
);
5088 gcc_assert (alignment_support_scheme
== dr_explicit_realign
5089 || alignment_support_scheme
== dr_explicit_realign_optimized
);
5091 /* We need to generate three things:
5092 1. the misalignment computation
5093 2. the extra vector load (for the optimized realignment scheme).
5094 3. the phi node for the two vectors from which the realignment is
5095 done (for the optimized realignment scheme). */
5097 /* 1. Determine where to generate the misalignment computation.
5099 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5100 calculation will be generated by this function, outside the loop (in the
5101 preheader). Otherwise, INIT_ADDR had already been computed for us by the
5102 caller, inside the loop.
5104 Background: If the misalignment remains fixed throughout the iterations of
5105 the loop, then both realignment schemes are applicable, and also the
5106 misalignment computation can be done outside LOOP. This is because we are
5107 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5108 are a multiple of VS (the Vector Size), and therefore the misalignment in
5109 different vectorized LOOP iterations is always the same.
5110 The problem arises only if the memory access is in an inner-loop nested
5111 inside LOOP, which is now being vectorized using outer-loop vectorization.
5112 This is the only case when the misalignment of the memory access may not
5113 remain fixed throughout the iterations of the inner-loop (as explained in
5114 detail in vect_supportable_dr_alignment). In this case, not only is the
5115 optimized realignment scheme not applicable, but also the misalignment
5116 computation (and generation of the realignment token that is passed to
5117 REALIGN_LOAD) have to be done inside the loop.
5119 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5120 or not, which in turn determines if the misalignment is computed inside
5121 the inner-loop, or outside LOOP. */
5123 if (init_addr
!= NULL_TREE
|| !loop_vinfo
)
5125 compute_in_loop
= true;
5126 gcc_assert (alignment_support_scheme
== dr_explicit_realign
);
5130 /* 2. Determine where to generate the extra vector load.
5132 For the optimized realignment scheme, instead of generating two vector
5133 loads in each iteration, we generate a single extra vector load in the
5134 preheader of the loop, and in each iteration reuse the result of the
5135 vector load from the previous iteration. In case the memory access is in
5136 an inner-loop nested inside LOOP, which is now being vectorized using
5137 outer-loop vectorization, we need to determine whether this initial vector
5138 load should be generated at the preheader of the inner-loop, or can be
5139 generated at the preheader of LOOP. If the memory access has no evolution
5140 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5141 to be generated inside LOOP (in the preheader of the inner-loop). */
5143 if (nested_in_vect_loop
)
5145 tree outerloop_step
= STMT_VINFO_DR_STEP (stmt_info
);
5146 bool invariant_in_outerloop
=
5147 (tree_int_cst_compare (outerloop_step
, size_zero_node
) == 0);
5148 loop_for_initial_load
= (invariant_in_outerloop
? loop
: loop
->inner
);
5151 loop_for_initial_load
= loop
;
5153 *at_loop
= loop_for_initial_load
;
5155 if (loop_for_initial_load
)
5156 pe
= loop_preheader_edge (loop_for_initial_load
);
5158 /* 3. For the case of the optimized realignment, create the first vector
5159 load at the loop preheader. */
5161 if (alignment_support_scheme
== dr_explicit_realign_optimized
)
5163 /* Create msq_init = *(floor(p1)) in the loop preheader */
5166 gcc_assert (!compute_in_loop
);
5167 vec_dest
= vect_create_destination_var (scalar_dest
, vectype
);
5168 ptr
= vect_create_data_ref_ptr (stmt
, vectype
, loop_for_initial_load
,
5169 NULL_TREE
, &init_addr
, NULL
, &inc
,
5171 if (TREE_CODE (ptr
) == SSA_NAME
)
5172 new_temp
= copy_ssa_name (ptr
);
5174 new_temp
= make_ssa_name (TREE_TYPE (ptr
));
5175 new_stmt
= gimple_build_assign
5176 (new_temp
, BIT_AND_EXPR
, ptr
,
5177 build_int_cst (TREE_TYPE (ptr
),
5178 -(HOST_WIDE_INT
)TYPE_ALIGN_UNIT (vectype
)));
5179 new_bb
= gsi_insert_on_edge_immediate (pe
, new_stmt
);
5180 gcc_assert (!new_bb
);
5182 = build2 (MEM_REF
, TREE_TYPE (vec_dest
), new_temp
,
5183 build_int_cst (reference_alias_ptr_type (DR_REF (dr
)), 0));
5184 new_stmt
= gimple_build_assign (vec_dest
, data_ref
);
5185 new_temp
= make_ssa_name (vec_dest
, new_stmt
);
5186 gimple_assign_set_lhs (new_stmt
, new_temp
);
5189 new_bb
= gsi_insert_on_edge_immediate (pe
, new_stmt
);
5190 gcc_assert (!new_bb
);
5193 gsi_insert_before (gsi
, new_stmt
, GSI_SAME_STMT
);
5195 msq_init
= gimple_assign_lhs (new_stmt
);
5198 /* 4. Create realignment token using a target builtin, if available.
5199 It is done either inside the containing loop, or before LOOP (as
5200 determined above). */
5202 if (targetm
.vectorize
.builtin_mask_for_load
)
5207 /* Compute INIT_ADDR - the initial addressed accessed by this memref. */
5210 /* Generate the INIT_ADDR computation outside LOOP. */
5211 init_addr
= vect_create_addr_base_for_vector_ref (stmt
, &stmts
,
5215 pe
= loop_preheader_edge (loop
);
5216 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, stmts
);
5217 gcc_assert (!new_bb
);
5220 gsi_insert_seq_before (gsi
, stmts
, GSI_SAME_STMT
);
5223 builtin_decl
= targetm
.vectorize
.builtin_mask_for_load ();
5224 new_stmt
= gimple_build_call (builtin_decl
, 1, init_addr
);
5226 vect_create_destination_var (scalar_dest
,
5227 gimple_call_return_type (new_stmt
));
5228 new_temp
= make_ssa_name (vec_dest
, new_stmt
);
5229 gimple_call_set_lhs (new_stmt
, new_temp
);
5231 if (compute_in_loop
)
5232 gsi_insert_before (gsi
, new_stmt
, GSI_SAME_STMT
);
5235 /* Generate the misalignment computation outside LOOP. */
5236 pe
= loop_preheader_edge (loop
);
5237 new_bb
= gsi_insert_on_edge_immediate (pe
, new_stmt
);
5238 gcc_assert (!new_bb
);
5241 *realignment_token
= gimple_call_lhs (new_stmt
);
5243 /* The result of the CALL_EXPR to this builtin is determined from
5244 the value of the parameter and no global variables are touched
5245 which makes the builtin a "const" function. Requiring the
5246 builtin to have the "const" attribute makes it unnecessary
5247 to call mark_call_clobbered. */
5248 gcc_assert (TREE_READONLY (builtin_decl
));
5251 if (alignment_support_scheme
== dr_explicit_realign
)
5254 gcc_assert (!compute_in_loop
);
5255 gcc_assert (alignment_support_scheme
== dr_explicit_realign_optimized
);
5258 /* 5. Create msq = phi <msq_init, lsq> in loop */
5260 pe
= loop_preheader_edge (containing_loop
);
5261 vec_dest
= vect_create_destination_var (scalar_dest
, vectype
);
5262 msq
= make_ssa_name (vec_dest
);
5263 phi_stmt
= create_phi_node (msq
, containing_loop
->header
);
5264 add_phi_arg (phi_stmt
, msq_init
, pe
, UNKNOWN_LOCATION
);
5270 /* Function vect_grouped_load_supported.
5272 COUNT is the size of the load group (the number of statements plus the
5273 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5274 only one statement, with a gap of COUNT - 1.
5276 Returns true if a suitable permute exists. */
5279 vect_grouped_load_supported (tree vectype
, bool single_element_p
,
5280 unsigned HOST_WIDE_INT count
)
5282 machine_mode mode
= TYPE_MODE (vectype
);
5284 /* If this is single-element interleaving with an element distance
5285 that leaves unused vector loads around punt - we at least create
5286 very sub-optimal code in that case (and blow up memory,
5288 if (single_element_p
&& count
> TYPE_VECTOR_SUBPARTS (vectype
))
5290 if (dump_enabled_p ())
5291 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5292 "single-element interleaving not supported "
5293 "for not adjacent vector loads\n");
5297 /* vect_permute_load_chain requires the group size to be equal to 3 or
5298 be a power of two. */
5299 if (count
!= 3 && exact_log2 (count
) == -1)
5301 if (dump_enabled_p ())
5302 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5303 "the size of the group of accesses"
5304 " is not a power of 2 or not equal to 3\n");
5308 /* Check that the permutation is supported. */
5309 if (VECTOR_MODE_P (mode
))
5311 unsigned int i
, j
, nelt
= GET_MODE_NUNITS (mode
);
5312 unsigned char *sel
= XALLOCAVEC (unsigned char, nelt
);
5317 for (k
= 0; k
< 3; k
++)
5319 for (i
= 0; i
< nelt
; i
++)
5320 if (3 * i
+ k
< 2 * nelt
)
5324 if (!can_vec_perm_p (mode
, false, sel
))
5326 if (dump_enabled_p ())
5327 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5328 "shuffle of 3 loads is not supported by"
5332 for (i
= 0, j
= 0; i
< nelt
; i
++)
5333 if (3 * i
+ k
< 2 * nelt
)
5336 sel
[i
] = nelt
+ ((nelt
+ k
) % 3) + 3 * (j
++);
5337 if (!can_vec_perm_p (mode
, false, sel
))
5339 if (dump_enabled_p ())
5340 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5341 "shuffle of 3 loads is not supported by"
5350 /* If length is not equal to 3 then only power of 2 is supported. */
5351 gcc_assert (pow2p_hwi (count
));
5352 for (i
= 0; i
< nelt
; i
++)
5354 if (can_vec_perm_p (mode
, false, sel
))
5356 for (i
= 0; i
< nelt
; i
++)
5358 if (can_vec_perm_p (mode
, false, sel
))
5364 if (dump_enabled_p ())
5365 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5366 "extract even/odd not supported by target\n");
5370 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
5374 vect_load_lanes_supported (tree vectype
, unsigned HOST_WIDE_INT count
)
5376 return vect_lanes_optab_supported_p ("vec_load_lanes",
5377 vec_load_lanes_optab
,
5381 /* Function vect_permute_load_chain.
5383 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5384 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5385 the input data correctly. Return the final references for loads in
5388 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5389 The input is 4 vectors each containing 8 elements. We assign a number to each
5390 element, the input sequence is:
5392 1st vec: 0 1 2 3 4 5 6 7
5393 2nd vec: 8 9 10 11 12 13 14 15
5394 3rd vec: 16 17 18 19 20 21 22 23
5395 4th vec: 24 25 26 27 28 29 30 31
5397 The output sequence should be:
5399 1st vec: 0 4 8 12 16 20 24 28
5400 2nd vec: 1 5 9 13 17 21 25 29
5401 3rd vec: 2 6 10 14 18 22 26 30
5402 4th vec: 3 7 11 15 19 23 27 31
5404 i.e., the first output vector should contain the first elements of each
5405 interleaving group, etc.
5407 We use extract_even/odd instructions to create such output. The input of
5408 each extract_even/odd operation is two vectors
5412 and the output is the vector of extracted even/odd elements. The output of
5413 extract_even will be: 0 2 4 6
5414 and of extract_odd: 1 3 5 7
5417 The permutation is done in log LENGTH stages. In each stage extract_even
5418 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5419 their order. In our example,
5421 E1: extract_even (1st vec, 2nd vec)
5422 E2: extract_odd (1st vec, 2nd vec)
5423 E3: extract_even (3rd vec, 4th vec)
5424 E4: extract_odd (3rd vec, 4th vec)
5426 The output for the first stage will be:
5428 E1: 0 2 4 6 8 10 12 14
5429 E2: 1 3 5 7 9 11 13 15
5430 E3: 16 18 20 22 24 26 28 30
5431 E4: 17 19 21 23 25 27 29 31
5433 In order to proceed and create the correct sequence for the next stage (or
5434 for the correct output, if the second stage is the last one, as in our
5435 example), we first put the output of extract_even operation and then the
5436 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5437 The input for the second stage is:
5439 1st vec (E1): 0 2 4 6 8 10 12 14
5440 2nd vec (E3): 16 18 20 22 24 26 28 30
5441 3rd vec (E2): 1 3 5 7 9 11 13 15
5442 4th vec (E4): 17 19 21 23 25 27 29 31
5444 The output of the second stage:
5446 E1: 0 4 8 12 16 20 24 28
5447 E2: 2 6 10 14 18 22 26 30
5448 E3: 1 5 9 13 17 21 25 29
5449 E4: 3 7 11 15 19 23 27 31
5451 And RESULT_CHAIN after reordering:
5453 1st vec (E1): 0 4 8 12 16 20 24 28
5454 2nd vec (E3): 1 5 9 13 17 21 25 29
5455 3rd vec (E2): 2 6 10 14 18 22 26 30
5456 4th vec (E4): 3 7 11 15 19 23 27 31. */
5459 vect_permute_load_chain (vec
<tree
> dr_chain
,
5460 unsigned int length
,
5462 gimple_stmt_iterator
*gsi
,
5463 vec
<tree
> *result_chain
)
5465 tree data_ref
, first_vect
, second_vect
;
5466 tree perm_mask_even
, perm_mask_odd
;
5467 tree perm3_mask_low
, perm3_mask_high
;
5469 tree vectype
= STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt
));
5470 unsigned int i
, j
, log_length
= exact_log2 (length
);
5471 unsigned nelt
= TYPE_VECTOR_SUBPARTS (vectype
);
5472 unsigned char *sel
= XALLOCAVEC (unsigned char, nelt
);
5474 result_chain
->quick_grow (length
);
5475 memcpy (result_chain
->address (), dr_chain
.address (),
5476 length
* sizeof (tree
));
5482 for (k
= 0; k
< 3; k
++)
5484 for (i
= 0; i
< nelt
; i
++)
5485 if (3 * i
+ k
< 2 * nelt
)
5489 perm3_mask_low
= vect_gen_perm_mask_checked (vectype
, sel
);
5491 for (i
= 0, j
= 0; i
< nelt
; i
++)
5492 if (3 * i
+ k
< 2 * nelt
)
5495 sel
[i
] = nelt
+ ((nelt
+ k
) % 3) + 3 * (j
++);
5497 perm3_mask_high
= vect_gen_perm_mask_checked (vectype
, sel
);
5499 first_vect
= dr_chain
[0];
5500 second_vect
= dr_chain
[1];
5502 /* Create interleaving stmt (low part of):
5503 low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5505 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3_low");
5506 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, first_vect
,
5507 second_vect
, perm3_mask_low
);
5508 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5510 /* Create interleaving stmt (high part of):
5511 high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5513 first_vect
= data_ref
;
5514 second_vect
= dr_chain
[2];
5515 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3_high");
5516 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, first_vect
,
5517 second_vect
, perm3_mask_high
);
5518 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5519 (*result_chain
)[k
] = data_ref
;
5524 /* If length is not equal to 3 then only power of 2 is supported. */
5525 gcc_assert (pow2p_hwi (length
));
5527 for (i
= 0; i
< nelt
; ++i
)
5529 perm_mask_even
= vect_gen_perm_mask_checked (vectype
, sel
);
5531 for (i
= 0; i
< nelt
; ++i
)
5533 perm_mask_odd
= vect_gen_perm_mask_checked (vectype
, sel
);
5535 for (i
= 0; i
< log_length
; i
++)
5537 for (j
= 0; j
< length
; j
+= 2)
5539 first_vect
= dr_chain
[j
];
5540 second_vect
= dr_chain
[j
+1];
5542 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5543 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_perm_even");
5544 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5545 first_vect
, second_vect
,
5547 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5548 (*result_chain
)[j
/2] = data_ref
;
5550 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5551 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_perm_odd");
5552 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5553 first_vect
, second_vect
,
5555 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5556 (*result_chain
)[j
/2+length
/2] = data_ref
;
5558 memcpy (dr_chain
.address (), result_chain
->address (),
5559 length
* sizeof (tree
));
5564 /* Function vect_shift_permute_load_chain.
5566 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5567 sequence of stmts to reorder the input data accordingly.
5568 Return the final references for loads in RESULT_CHAIN.
5569 Return true if successed, false otherwise.
5571 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5572 The input is 3 vectors each containing 8 elements. We assign a
5573 number to each element, the input sequence is:
5575 1st vec: 0 1 2 3 4 5 6 7
5576 2nd vec: 8 9 10 11 12 13 14 15
5577 3rd vec: 16 17 18 19 20 21 22 23
5579 The output sequence should be:
5581 1st vec: 0 3 6 9 12 15 18 21
5582 2nd vec: 1 4 7 10 13 16 19 22
5583 3rd vec: 2 5 8 11 14 17 20 23
5585 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5587 First we shuffle all 3 vectors to get correct elements order:
5589 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
5590 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
5591 3rd vec: (16 19 22) (17 20 23) (18 21)
5593 Next we unite and shift vector 3 times:
5596 shift right by 6 the concatenation of:
5597 "1st vec" and "2nd vec"
5598 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5599 "2nd vec" and "3rd vec"
5600 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5601 "3rd vec" and "1st vec"
5602 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
5605 So that now new vectors are:
5607 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
5608 2nd vec: (10 13) (16 19 22) (17 20 23)
5609 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
5612 shift right by 5 the concatenation of:
5613 "1st vec" and "3rd vec"
5614 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
5615 "2nd vec" and "1st vec"
5616 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
5617 "3rd vec" and "2nd vec"
5618 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
5621 So that now new vectors are:
5623 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
5624 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
5625 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
5628 shift right by 5 the concatenation of:
5629 "1st vec" and "1st vec"
5630 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
5631 shift right by 3 the concatenation of:
5632 "2nd vec" and "2nd vec"
5633 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
5636 So that now all vectors are READY:
5637 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
5638 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
5639 3rd vec: ( 1 4 7) (10 13) (16 19 22)
5641 This algorithm is faster than one in vect_permute_load_chain if:
5642 1. "shift of a concatination" is faster than general permutation.
5644 2. The TARGET machine can't execute vector instructions in parallel.
5645 This is because each step of the algorithm depends on previous.
5646 The algorithm in vect_permute_load_chain is much more parallel.
5648 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
5652 vect_shift_permute_load_chain (vec
<tree
> dr_chain
,
5653 unsigned int length
,
5655 gimple_stmt_iterator
*gsi
,
5656 vec
<tree
> *result_chain
)
5658 tree vect
[3], vect_shift
[3], data_ref
, first_vect
, second_vect
;
5659 tree perm2_mask1
, perm2_mask2
, perm3_mask
;
5660 tree select_mask
, shift1_mask
, shift2_mask
, shift3_mask
, shift4_mask
;
5663 tree vectype
= STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt
));
5665 unsigned nelt
= TYPE_VECTOR_SUBPARTS (vectype
);
5666 unsigned char *sel
= XALLOCAVEC (unsigned char, nelt
);
5667 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
5668 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
5670 result_chain
->quick_grow (length
);
5671 memcpy (result_chain
->address (), dr_chain
.address (),
5672 length
* sizeof (tree
));
5674 if (pow2p_hwi (length
) && LOOP_VINFO_VECT_FACTOR (loop_vinfo
) > 4)
5676 unsigned int j
, log_length
= exact_log2 (length
);
5677 for (i
= 0; i
< nelt
/ 2; ++i
)
5679 for (i
= 0; i
< nelt
/ 2; ++i
)
5680 sel
[nelt
/ 2 + i
] = i
* 2 + 1;
5681 if (!can_vec_perm_p (TYPE_MODE (vectype
), false, sel
))
5683 if (dump_enabled_p ())
5684 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5685 "shuffle of 2 fields structure is not \
5686 supported by target\n");
5689 perm2_mask1
= vect_gen_perm_mask_checked (vectype
, sel
);
5691 for (i
= 0; i
< nelt
/ 2; ++i
)
5693 for (i
= 0; i
< nelt
/ 2; ++i
)
5694 sel
[nelt
/ 2 + i
] = i
* 2;
5695 if (!can_vec_perm_p (TYPE_MODE (vectype
), false, sel
))
5697 if (dump_enabled_p ())
5698 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5699 "shuffle of 2 fields structure is not \
5700 supported by target\n");
5703 perm2_mask2
= vect_gen_perm_mask_checked (vectype
, sel
);
5705 /* Generating permutation constant to shift all elements.
5706 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
5707 for (i
= 0; i
< nelt
; i
++)
5708 sel
[i
] = nelt
/ 2 + i
;
5709 if (!can_vec_perm_p (TYPE_MODE (vectype
), false, sel
))
5711 if (dump_enabled_p ())
5712 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5713 "shift permutation is not supported by target\n");
5716 shift1_mask
= vect_gen_perm_mask_checked (vectype
, sel
);
5718 /* Generating permutation constant to select vector from 2.
5719 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
5720 for (i
= 0; i
< nelt
/ 2; i
++)
5722 for (i
= nelt
/ 2; i
< nelt
; i
++)
5724 if (!can_vec_perm_p (TYPE_MODE (vectype
), false, sel
))
5726 if (dump_enabled_p ())
5727 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5728 "select is not supported by target\n");
5731 select_mask
= vect_gen_perm_mask_checked (vectype
, sel
);
5733 for (i
= 0; i
< log_length
; i
++)
5735 for (j
= 0; j
< length
; j
+= 2)
5737 first_vect
= dr_chain
[j
];
5738 second_vect
= dr_chain
[j
+ 1];
5740 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle2");
5741 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5742 first_vect
, first_vect
,
5744 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5747 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle2");
5748 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5749 second_vect
, second_vect
,
5751 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5754 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shift");
5755 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5756 vect
[0], vect
[1], shift1_mask
);
5757 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5758 (*result_chain
)[j
/2 + length
/2] = data_ref
;
5760 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_select");
5761 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5762 vect
[0], vect
[1], select_mask
);
5763 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5764 (*result_chain
)[j
/2] = data_ref
;
5766 memcpy (dr_chain
.address (), result_chain
->address (),
5767 length
* sizeof (tree
));
5771 if (length
== 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo
) > 2)
5773 unsigned int k
= 0, l
= 0;
5775 /* Generating permutation constant to get all elements in rigth order.
5776 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
5777 for (i
= 0; i
< nelt
; i
++)
5779 if (3 * k
+ (l
% 3) >= nelt
)
5782 l
+= (3 - (nelt
% 3));
5784 sel
[i
] = 3 * k
+ (l
% 3);
5787 if (!can_vec_perm_p (TYPE_MODE (vectype
), false, sel
))
5789 if (dump_enabled_p ())
5790 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5791 "shuffle of 3 fields structure is not \
5792 supported by target\n");
5795 perm3_mask
= vect_gen_perm_mask_checked (vectype
, sel
);
5797 /* Generating permutation constant to shift all elements.
5798 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
5799 for (i
= 0; i
< nelt
; i
++)
5800 sel
[i
] = 2 * (nelt
/ 3) + (nelt
% 3) + i
;
5801 if (!can_vec_perm_p (TYPE_MODE (vectype
), false, sel
))
5803 if (dump_enabled_p ())
5804 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5805 "shift permutation is not supported by target\n");
5808 shift1_mask
= vect_gen_perm_mask_checked (vectype
, sel
);
5810 /* Generating permutation constant to shift all elements.
5811 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5812 for (i
= 0; i
< nelt
; i
++)
5813 sel
[i
] = 2 * (nelt
/ 3) + 1 + i
;
5814 if (!can_vec_perm_p (TYPE_MODE (vectype
), false, sel
))
5816 if (dump_enabled_p ())
5817 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5818 "shift permutation is not supported by target\n");
5821 shift2_mask
= vect_gen_perm_mask_checked (vectype
, sel
);
5823 /* Generating permutation constant to shift all elements.
5824 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
5825 for (i
= 0; i
< nelt
; i
++)
5826 sel
[i
] = (nelt
/ 3) + (nelt
% 3) / 2 + i
;
5827 if (!can_vec_perm_p (TYPE_MODE (vectype
), false, sel
))
5829 if (dump_enabled_p ())
5830 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5831 "shift permutation is not supported by target\n");
5834 shift3_mask
= vect_gen_perm_mask_checked (vectype
, sel
);
5836 /* Generating permutation constant to shift all elements.
5837 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5838 for (i
= 0; i
< nelt
; i
++)
5839 sel
[i
] = 2 * (nelt
/ 3) + (nelt
% 3) / 2 + i
;
5840 if (!can_vec_perm_p (TYPE_MODE (vectype
), false, sel
))
5842 if (dump_enabled_p ())
5843 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5844 "shift permutation is not supported by target\n");
5847 shift4_mask
= vect_gen_perm_mask_checked (vectype
, sel
);
5849 for (k
= 0; k
< 3; k
++)
5851 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3");
5852 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5853 dr_chain
[k
], dr_chain
[k
],
5855 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5859 for (k
= 0; k
< 3; k
++)
5861 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shift1");
5862 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5863 vect
[k
% 3], vect
[(k
+ 1) % 3],
5865 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5866 vect_shift
[k
] = data_ref
;
5869 for (k
= 0; k
< 3; k
++)
5871 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shift2");
5872 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5873 vect_shift
[(4 - k
) % 3],
5874 vect_shift
[(3 - k
) % 3],
5876 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5880 (*result_chain
)[3 - (nelt
% 3)] = vect
[2];
5882 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shift3");
5883 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, vect
[0],
5884 vect
[0], shift3_mask
);
5885 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5886 (*result_chain
)[nelt
% 3] = data_ref
;
5888 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shift4");
5889 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, vect
[1],
5890 vect
[1], shift4_mask
);
5891 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5892 (*result_chain
)[0] = data_ref
;
5898 /* Function vect_transform_grouped_load.
5900 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5901 to perform their permutation and ascribe the result vectorized statements to
5902 the scalar statements.
5906 vect_transform_grouped_load (gimple
*stmt
, vec
<tree
> dr_chain
, int size
,
5907 gimple_stmt_iterator
*gsi
)
5910 vec
<tree
> result_chain
= vNULL
;
5912 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5913 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
5914 vectors, that are ready for vector computation. */
5915 result_chain
.create (size
);
5917 /* If reassociation width for vector type is 2 or greater target machine can
5918 execute 2 or more vector instructions in parallel. Otherwise try to
5919 get chain for loads group using vect_shift_permute_load_chain. */
5920 mode
= TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt
)));
5921 if (targetm
.sched
.reassociation_width (VEC_PERM_EXPR
, mode
) > 1
5923 || !vect_shift_permute_load_chain (dr_chain
, size
, stmt
,
5924 gsi
, &result_chain
))
5925 vect_permute_load_chain (dr_chain
, size
, stmt
, gsi
, &result_chain
);
5926 vect_record_grouped_load_vectors (stmt
, result_chain
);
5927 result_chain
.release ();
5930 /* RESULT_CHAIN contains the output of a group of grouped loads that were
5931 generated as part of the vectorization of STMT. Assign the statement
5932 for each vector to the associated scalar statement. */
5935 vect_record_grouped_load_vectors (gimple
*stmt
, vec
<tree
> result_chain
)
5937 gimple
*first_stmt
= GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
));
5938 gimple
*next_stmt
, *new_stmt
;
5939 unsigned int i
, gap_count
;
5942 /* Put a permuted data-ref in the VECTORIZED_STMT field.
5943 Since we scan the chain starting from it's first node, their order
5944 corresponds the order of data-refs in RESULT_CHAIN. */
5945 next_stmt
= first_stmt
;
5947 FOR_EACH_VEC_ELT (result_chain
, i
, tmp_data_ref
)
5952 /* Skip the gaps. Loads created for the gaps will be removed by dead
5953 code elimination pass later. No need to check for the first stmt in
5954 the group, since it always exists.
5955 GROUP_GAP is the number of steps in elements from the previous
5956 access (if there is no gap GROUP_GAP is 1). We skip loads that
5957 correspond to the gaps. */
5958 if (next_stmt
!= first_stmt
5959 && gap_count
< GROUP_GAP (vinfo_for_stmt (next_stmt
)))
5967 new_stmt
= SSA_NAME_DEF_STMT (tmp_data_ref
);
5968 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5969 copies, and we put the new vector statement in the first available
5971 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt
)))
5972 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt
)) = new_stmt
;
5975 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt
)))
5978 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt
));
5980 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt
));
5983 prev_stmt
= rel_stmt
;
5985 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt
));
5988 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt
)) =
5993 next_stmt
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt
));
5995 /* If NEXT_STMT accesses the same DR as the previous statement,
5996 put the same TMP_DATA_REF as its vectorized statement; otherwise
5997 get the next data-ref from RESULT_CHAIN. */
5998 if (!next_stmt
|| !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt
)))
6004 /* Function vect_force_dr_alignment_p.
6006 Returns whether the alignment of a DECL can be forced to be aligned
6007 on ALIGNMENT bit boundary. */
6010 vect_can_force_dr_alignment_p (const_tree decl
, unsigned int alignment
)
6015 if (decl_in_symtab_p (decl
)
6016 && !symtab_node::get (decl
)->can_increase_alignment_p ())
6019 if (TREE_STATIC (decl
))
6020 return (alignment
<= MAX_OFILE_ALIGNMENT
);
6022 return (alignment
<= MAX_STACK_ALIGNMENT
);
6026 /* Return whether the data reference DR is supported with respect to its
6028 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6029 it is aligned, i.e., check if it is possible to vectorize it with different
6032 enum dr_alignment_support
6033 vect_supportable_dr_alignment (struct data_reference
*dr
,
6034 bool check_aligned_accesses
)
6036 gimple
*stmt
= DR_STMT (dr
);
6037 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
6038 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
6039 machine_mode mode
= TYPE_MODE (vectype
);
6040 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
6041 struct loop
*vect_loop
= NULL
;
6042 bool nested_in_vect_loop
= false;
6044 if (aligned_access_p (dr
) && !check_aligned_accesses
)
6047 /* For now assume all conditional loads/stores support unaligned
6048 access without any special code. */
6049 if (is_gimple_call (stmt
)
6050 && gimple_call_internal_p (stmt
)
6051 && (gimple_call_internal_fn (stmt
) == IFN_MASK_LOAD
6052 || gimple_call_internal_fn (stmt
) == IFN_MASK_STORE
))
6053 return dr_unaligned_supported
;
6057 vect_loop
= LOOP_VINFO_LOOP (loop_vinfo
);
6058 nested_in_vect_loop
= nested_in_vect_loop_p (vect_loop
, stmt
);
6061 /* Possibly unaligned access. */
6063 /* We can choose between using the implicit realignment scheme (generating
6064 a misaligned_move stmt) and the explicit realignment scheme (generating
6065 aligned loads with a REALIGN_LOAD). There are two variants to the
6066 explicit realignment scheme: optimized, and unoptimized.
6067 We can optimize the realignment only if the step between consecutive
6068 vector loads is equal to the vector size. Since the vector memory
6069 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6070 is guaranteed that the misalignment amount remains the same throughout the
6071 execution of the vectorized loop. Therefore, we can create the
6072 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6073 at the loop preheader.
6075 However, in the case of outer-loop vectorization, when vectorizing a
6076 memory access in the inner-loop nested within the LOOP that is now being
6077 vectorized, while it is guaranteed that the misalignment of the
6078 vectorized memory access will remain the same in different outer-loop
6079 iterations, it is *not* guaranteed that is will remain the same throughout
6080 the execution of the inner-loop. This is because the inner-loop advances
6081 with the original scalar step (and not in steps of VS). If the inner-loop
6082 step happens to be a multiple of VS, then the misalignment remains fixed
6083 and we can use the optimized realignment scheme. For example:
6089 When vectorizing the i-loop in the above example, the step between
6090 consecutive vector loads is 1, and so the misalignment does not remain
6091 fixed across the execution of the inner-loop, and the realignment cannot
6092 be optimized (as illustrated in the following pseudo vectorized loop):
6094 for (i=0; i<N; i+=4)
6095 for (j=0; j<M; j++){
6096 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6097 // when j is {0,1,2,3,4,5,6,7,...} respectively.
6098 // (assuming that we start from an aligned address).
6101 We therefore have to use the unoptimized realignment scheme:
6103 for (i=0; i<N; i+=4)
6104 for (j=k; j<M; j+=4)
6105 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6106 // that the misalignment of the initial address is
6109 The loop can then be vectorized as follows:
6111 for (k=0; k<4; k++){
6112 rt = get_realignment_token (&vp[k]);
6113 for (i=0; i<N; i+=4){
6115 for (j=k; j<M; j+=4){
6117 va = REALIGN_LOAD <v1,v2,rt>;
6124 if (DR_IS_READ (dr
))
6126 bool is_packed
= false;
6127 tree type
= (TREE_TYPE (DR_REF (dr
)));
6129 if (optab_handler (vec_realign_load_optab
, mode
) != CODE_FOR_nothing
6130 && (!targetm
.vectorize
.builtin_mask_for_load
6131 || targetm
.vectorize
.builtin_mask_for_load ()))
6133 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
6135 /* If we are doing SLP then the accesses need not have the
6136 same alignment, instead it depends on the SLP group size. */
6138 && STMT_SLP_TYPE (stmt_info
)
6139 && (LOOP_VINFO_VECT_FACTOR (loop_vinfo
)
6140 * GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info
)))
6141 % TYPE_VECTOR_SUBPARTS (vectype
) != 0))
6143 else if (!loop_vinfo
6144 || (nested_in_vect_loop
6145 && (TREE_INT_CST_LOW (DR_STEP (dr
))
6146 != GET_MODE_SIZE (TYPE_MODE (vectype
)))))
6147 return dr_explicit_realign
;
6149 return dr_explicit_realign_optimized
;
6151 if (!known_alignment_for_access_p (dr
))
6152 is_packed
= not_size_aligned (DR_REF (dr
));
6154 if ((TYPE_USER_ALIGN (type
) && !is_packed
)
6155 || targetm
.vectorize
.
6156 support_vector_misalignment (mode
, type
,
6157 DR_MISALIGNMENT (dr
), is_packed
))
6158 /* Can't software pipeline the loads, but can at least do them. */
6159 return dr_unaligned_supported
;
6163 bool is_packed
= false;
6164 tree type
= (TREE_TYPE (DR_REF (dr
)));
6166 if (!known_alignment_for_access_p (dr
))
6167 is_packed
= not_size_aligned (DR_REF (dr
));
6169 if ((TYPE_USER_ALIGN (type
) && !is_packed
)
6170 || targetm
.vectorize
.
6171 support_vector_misalignment (mode
, type
,
6172 DR_MISALIGNMENT (dr
), is_packed
))
6173 return dr_unaligned_supported
;
6177 return dr_unaligned_unsupported
;