/* Data References Analysis and Manipulation Utilities for Vectorization.
   Copyright (C) 2003-2017 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "gimple.h"
#include "predict.h"
#include "memmodel.h"
#include "tm_p.h"
#include "ssa.h"
#include "optabs-tree.h"
#include "cgraph.h"
#include "dumpfile.h"
#include "alias.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "tree-eh.h"
#include "gimplify.h"
#include "gimple-iterator.h"
#include "gimplify-me.h"
#include "tree-ssa-loop-ivopts.h"
#include "tree-ssa-loop-manip.h"
#include "tree-ssa-loop.h"
#include "cfgloop.h"
#include "tree-scalar-evolution.h"
#include "tree-vectorizer.h"
#include "expr.h"
#include "builtins.h"
#include "params.h"

/* Return true if load- or store-lanes optab OPTAB is implemented for
   COUNT vectors of type VECTYPE.  NAME is the name of OPTAB.  */

static bool
vect_lanes_optab_supported_p (const char *name, convert_optab optab,
			      tree vectype, unsigned HOST_WIDE_INT count)
{
  machine_mode mode, array_mode;
  bool limit_p;

  mode = TYPE_MODE (vectype);
  limit_p = !targetm.array_mode_supported_p (mode, count);
  array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode),
			      MODE_INT, limit_p);

  if (array_mode == BLKmode)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
			 GET_MODE_NAME (mode), count);
      return false;
    }

  if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "cannot use %s<%s><%s>\n", name,
			 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
      return false;
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
		     GET_MODE_NAME (mode));

  return true;
}

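/* Illustrative sketch (hypothetical target and values): for VECTYPE V4SI and
   COUNT 2 the code above asks for an integer mode wide enough for 2 * 128
   bits and then checks whether OPTAB (e.g. a vec_load_lanes-style optab)
   has a handler converting that array mode to V4SI; if the handler is
   CODE_FOR_nothing the caller must fall back to separate loads plus
   permutations.  */
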
/* Return the smallest scalar part of STMT.
   This is used to determine the vectype of the stmt.  We generally set the
   vectype according to the type of the result (lhs).  For stmts whose
   result-type is different than the type of the arguments (e.g., demotion,
   promotion), vectype will be reset appropriately (later).  Note that we have
   to visit the smallest datatype in this function, because that determines the
   VF.  If the smallest datatype in the loop is present only as the rhs of a
   promotion operation - we'd miss it.
   Such a case, where a variable of this datatype does not appear in the lhs
   anywhere in the loop, can only occur if it's an invariant: e.g.:
   'int_x = (int) short_inv', which we'd expect to have been optimized away by
   invariant motion.  However, we cannot rely on invariant motion to always
   take invariants out of the loop, and so in the case of promotion we also
   have to check the rhs.
   LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
   types.  */

tree
vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
			       HOST_WIDE_INT *rhs_size_unit)
{
  tree scalar_type = gimple_expr_type (stmt);
  HOST_WIDE_INT lhs, rhs;

  lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));

  if (is_gimple_assign (stmt)
      && (gimple_assign_cast_p (stmt)
	  || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
	  || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
	  || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
    {
      tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));

      rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
      if (rhs < lhs)
	scalar_type = rhs_type;
    }

  *lhs_size_unit = lhs;
  *rhs_size_unit = rhs;
  return scalar_type;
}

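/* Worked example for the comment above: in a loop whose only use of a
   narrow type is 'int_x = (int) short_inv', the lhs size is 4 bytes and the
   rhs size is 2 bytes, so the (smaller) short type is returned and
   determines the VF; looking only at the lhs would over-estimate how many
   elements fit in one vector.  */
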
/* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
   tested at run-time.  Return TRUE if DDR was successfully inserted.
   Return false if versioning is not supported.  */

static bool
vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

  if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
    return false;

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location,
		       "mark for run-time aliasing test between ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
      dump_printf (MSG_NOTE, " and ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
      dump_printf (MSG_NOTE, "\n");
    }

  if (optimize_loop_nest_for_size_p (loop))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "versioning not supported when optimizing"
			 " for size.\n");
      return false;
    }

  /* FORNOW: We don't support versioning with outer-loop vectorization.  */
  if (loop->inner)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "versioning not yet supported for outer-loops.\n");
      return false;
    }

  /* FORNOW: We don't support creating runtime alias tests for non-constant
     step.  */
  if (TREE_CODE (DR_STEP (DDR_A (ddr))) != INTEGER_CST
      || TREE_CODE (DR_STEP (DDR_B (ddr))) != INTEGER_CST)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "versioning not yet supported for non-constant "
			 "step\n");
      return false;
    }

  LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
  return true;
}

/* Function vect_analyze_data_ref_dependence.

   Return TRUE if there (might) exist a dependence between a memory-reference
   DRA and a memory-reference DRB.  When versioning for alias may check a
   dependence at run-time, return FALSE.  Adjust *MAX_VF according to
   the data dependence.  */

static bool
vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
				  loop_vec_info loop_vinfo, int *max_vf)
{
  unsigned int i;
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  struct data_reference *dra = DDR_A (ddr);
  struct data_reference *drb = DDR_B (ddr);
  stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
  stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
  lambda_vector dist_v;
  unsigned int loop_depth;

  /* In loop analysis all data references should be vectorizable.  */
  if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
      || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
    return false;

  /* Independent data accesses.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
    return false;

  if (dra == drb
      || (DR_IS_READ (dra) && DR_IS_READ (drb)))
    return false;

  /* We do not have to consider dependences between accesses that belong
     to the same group.  */
  if (GROUP_FIRST_ELEMENT (stmtinfo_a)
      && GROUP_FIRST_ELEMENT (stmtinfo_a) == GROUP_FIRST_ELEMENT (stmtinfo_b))
    return false;

  /* Even if we have an anti-dependence then, as the vectorized loop covers at
     least two scalar iterations, there is always also a true dependence.
     As the vectorizer does not re-order loads and stores we can ignore
     the anti-dependence if TBAA can disambiguate both DRs similar to the
     case with known negative distance anti-dependences (positive
     distance anti-dependences would violate TBAA constraints).  */
  if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
       || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
      && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
				 get_alias_set (DR_REF (drb))))
    return false;

  /* Unknown data dependence.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
    {
      /* If user asserted safelen consecutive iterations can be
	 executed concurrently, assume independence.  */
      if (loop->safelen >= 2)
	{
	  if (loop->safelen < *max_vf)
	    *max_vf = loop->safelen;
	  LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
	  return false;
	}

      if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
	  || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
	{
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			       "versioning for alias not supported for: "
			       "can't determine dependence between ");
	      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
				 DR_REF (dra));
	      dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
	      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
				 DR_REF (drb));
	      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
	    }
	  return true;
	}

      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			   "versioning for alias required: "
			   "can't determine dependence between ");
	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
			     DR_REF (dra));
	  dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
			     DR_REF (drb));
	  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
	}

      /* Add to list of ddrs that need to be tested at run-time.  */
      return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
    }

  /* Known data dependence.  */
  if (DDR_NUM_DIST_VECTS (ddr) == 0)
    {
      /* If user asserted safelen consecutive iterations can be
	 executed concurrently, assume independence.  */
      if (loop->safelen >= 2)
	{
	  if (loop->safelen < *max_vf)
	    *max_vf = loop->safelen;
	  LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
	  return false;
	}

      if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
	  || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
	{
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			       "versioning for alias not supported for: "
			       "bad dist vector for ");
	      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
				 DR_REF (dra));
	      dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
	      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
				 DR_REF (drb));
	      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
	    }
	  return true;
	}

      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			   "versioning for alias required: "
			   "bad dist vector for ");
	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
	  dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
	  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
	}

      /* Add to list of ddrs that need to be tested at run-time.  */
      return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
    }

  loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
  FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
    {
      int dist = dist_v[loop_depth];

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "dependence distance  = %d.\n", dist);

      if (dist == 0)
	{
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_NOTE, vect_location,
			       "dependence distance == 0 between ");
	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
	      dump_printf (MSG_NOTE, " and ");
	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
	      dump_printf (MSG_NOTE, "\n");
	    }

	  /* When we perform grouped accesses and perform implicit CSE
	     by detecting equal accesses and doing disambiguation with
	     runtime alias tests like for
		.. = a[i];
		.. = a[i+1];
		a[i] = ..;
		a[i+1] = ..;
		*p = ..;
		.. = a[i];
		.. = a[i+1];
	     where we will end up loading { a[i], a[i+1] } once, make
	     sure that inserting group loads before the first load and
	     stores after the last store will do the right thing.
	     Similar for groups like
		a[i] = ...;
		... = a[i];
		a[i+1] = ...;
	     where loads from the group interleave with the store.  */
	  if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
	      || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
	    {
	      gimple *earlier_stmt;
	      earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
	      if (DR_IS_WRITE
		    (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "READ_WRITE dependence in interleaving."
				     "\n");
		  return true;
		}
	    }

	  continue;
	}

      if (dist > 0 && DDR_REVERSED_P (ddr))
	{
	  /* If DDR_REVERSED_P the order of the data-refs in DDR was
	     reversed (to make distance vector positive), and the actual
	     distance is negative.  */
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "dependence distance negative.\n");
	  /* Record a negative dependence distance to later limit the
	     amount of stmt copying / unrolling we can perform.
	     Only need to handle read-after-write dependence.  */
	  if (DR_IS_READ (drb)
	      && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
		  || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
	    STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
	  continue;
	}

      if (abs (dist) >= 2
	  && abs (dist) < *max_vf)
	{
	  /* The dependence distance requires reduction of the maximal
	     vectorization factor.  */
	  *max_vf = abs (dist);
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "adjusting maximal vectorization factor to %i\n",
			     *max_vf);
	}

      if (abs (dist) >= *max_vf)
	{
	  /* Dependence distance does not create dependence, as far as
	     vectorization is concerned, in this case.  */
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "dependence distance >= VF.\n");
	  continue;
	}

      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			   "not vectorized, possible dependence "
			   "between data-refs ");
	  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
	  dump_printf (MSG_NOTE, " and ");
	  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
	  dump_printf (MSG_NOTE, "\n");
	}

      return true;
    }

  return false;
}

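/* Worked example for the distance handling above (hypothetical values):
   with *max_vf == 8, a distance-3 dependence lowers *max_vf to 3; a
   distance of 0 is only a problem for interleaved READ_WRITE groups; and a
   distance >= *max_vf cannot be violated by the vectorized loop at all.  */
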
/* Function vect_analyze_data_ref_dependences.

   Examine all the data references in the loop, and make sure there do not
   exist any data dependences between them.  Set *MAX_VF according to
   the maximum vectorization factor the data dependences allow.  */

bool
vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
{
  unsigned int i;
  struct data_dependence_relation *ddr;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "=== vect_analyze_data_ref_dependences ===\n");

  LOOP_VINFO_DDRS (loop_vinfo)
    .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
	     * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
  LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
  /* We need read-read dependences to compute STMT_VINFO_SAME_ALIGN_REFS.  */
  if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
				&LOOP_VINFO_DDRS (loop_vinfo),
				LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
    return false;

  /* For epilogues we either have no aliases or alias versioning
     was applied to original loop.  Therefore we may just get max_vf
     using VF of original loop.  */
  if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
    *max_vf = LOOP_VINFO_ORIG_VECT_FACTOR (loop_vinfo);
  else
    FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
      if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
	return false;

  return true;
}

/* Function vect_slp_analyze_data_ref_dependence.

   Return TRUE if there (might) exist a dependence between a memory-reference
   DRA and a memory-reference DRB.  When versioning for alias may check a
   dependence at run-time, return FALSE.  */

static bool
vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
{
  struct data_reference *dra = DDR_A (ddr);
  struct data_reference *drb = DDR_B (ddr);

  /* We need to check dependences of statements marked as unvectorizable
     as well, they still can prohibit vectorization.  */

  /* Independent data accesses.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
    return false;

  if (dra == drb)
    return false;

  /* Read-read is OK.  */
  if (DR_IS_READ (dra) && DR_IS_READ (drb))
    return false;

  /* If dra and drb are part of the same interleaving chain consider
     them independent.  */
  if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
      && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
	  == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
    return false;

  /* Unknown data dependence.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
    {
      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			   "can't determine dependence between ");
	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
	  dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
	  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
	}
    }
  else if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location,
		       "determined dependence between ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
      dump_printf (MSG_NOTE, " and ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
      dump_printf (MSG_NOTE, "\n");
    }

  return true;
}

/* Analyze dependences involved in the transform of SLP NODE.  STORES
   contain the vector of scalar stores of this instance if we are
   disambiguating the loads.  */

static bool
vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
				   vec<gimple *> stores, gimple *last_store)
{
  /* This walks over all stmts involved in the SLP load/store done
     in NODE verifying we can sink them up to the last stmt in the
     group.  */
  gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
  for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
    {
      gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
      if (access == last_access)
	continue;
      data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
      for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
	   gsi_stmt (gsi) != last_access; gsi_next (&gsi))
	{
	  gimple *stmt = gsi_stmt (gsi);
	  if (! gimple_vuse (stmt)
	      || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
	    continue;

	  /* If we couldn't record a (single) data reference for this
	     stmt we have to give up.  */
	  /* ???  Here and below if dependence analysis fails we can resort
	     to the alias oracle which can handle more kinds of stmts.  */
	  data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
	  if (!dr_b)
	    return false;

	  bool dependent = false;
	  /* If we run into a store of this same instance (we've just
	     marked those) then delay dependence checking until we run
	     into the last store because this is where it will have
	     been sunk to (and we verify if we can do that as well).  */
	  if (gimple_visited_p (stmt))
	    {
	      if (stmt != last_store)
		continue;
	      unsigned i;
	      gimple *store;
	      FOR_EACH_VEC_ELT (stores, i, store)
		{
		  data_reference *store_dr
		    = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
		  ddr_p ddr = initialize_data_dependence_relation
				(dr_a, store_dr, vNULL);
		  dependent = vect_slp_analyze_data_ref_dependence (ddr);
		  free_dependence_relation (ddr);
		  if (dependent)
		    break;
		}
	    }
	  else
	    {
	      ddr_p ddr = initialize_data_dependence_relation (dr_a,
							       dr_b, vNULL);
	      dependent = vect_slp_analyze_data_ref_dependence (ddr);
	      free_dependence_relation (ddr);
	    }
	  if (dependent)
	    return false;
	}
    }
  return true;
}

/* Function vect_slp_analyze_instance_dependence.

   Examine all the data references in the SLP instance, and make sure there
   do not exist any data dependences between them.  */

bool
vect_slp_analyze_instance_dependence (slp_instance instance)
{
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "=== vect_slp_analyze_instance_dependence ===\n");

  /* The stores of this instance are at the root of the SLP tree.  */
  slp_tree store = SLP_INSTANCE_TREE (instance);
  if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
    store = NULL;

  /* Verify we can sink stores to the vectorized stmt insert location.  */
  gimple *last_store = NULL;
  if (store)
    {
      if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
	return false;

      /* Mark stores in this instance and remember the last one.  */
      last_store = vect_find_last_scalar_stmt_in_slp (store);
      for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
	gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
    }

  bool res = true;

  /* Verify we can sink loads to the vectorized stmt insert location,
     special-casing stores of this instance.  */
  slp_tree load;
  unsigned int i;
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
    if (! vect_slp_analyze_node_dependences (instance, load,
					     store
					     ? SLP_TREE_SCALAR_STMTS (store)
					     : vNULL, last_store))
      {
	res = false;
	break;
      }

  /* Unset the visited flag.  */
  if (store)
    for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
      gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);

  return res;
}

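/* Sketch of the marking scheme above (hypothetical instance): the scalar
   stores of the instance are flagged via gimple_set_visited so that, while
   sinking a load towards its vectorized position, a store of the same
   instance that is crossed defers its dependence test until LAST_STORE is
   reached, the point the stores themselves will be sunk to.  */
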
/* Function vect_compute_data_ref_alignment

   Compute the misalignment of the data reference DR.

   Output:
   1. If during the misalignment computation it is found that the data
      reference cannot be vectorized then false is returned.
   2. DR_MISALIGNMENT (DR) is defined.

   FOR NOW: No analysis is actually performed.  Misalignment is calculated
   only for trivial cases.  TODO.  */

bool
vect_compute_data_ref_alignment (struct data_reference *dr)
{
  gimple *stmt = DR_STMT (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = NULL;
  tree ref = DR_REF (dr);
  tree vectype;
  tree base, base_addr;
  tree misalign = NULL_TREE;
  tree aligned_to;
  tree step;
  unsigned HOST_WIDE_INT alignment;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "vect_compute_data_ref_alignment:\n");

  if (loop_vinfo)
    loop = LOOP_VINFO_LOOP (loop_vinfo);

  /* Initialize misalignment to unknown.  */
  SET_DR_MISALIGNMENT (dr, -1);

  if (tree_fits_shwi_p (DR_STEP (dr)))
    misalign = DR_INIT (dr);
  aligned_to = DR_ALIGNED_TO (dr);
  base_addr = DR_BASE_ADDRESS (dr);
  vectype = STMT_VINFO_VECTYPE (stmt_info);

  /* In case the dataref is in an inner-loop of the loop that is being
     vectorized (LOOP), we use the base and misalignment information
     relative to the outer-loop (LOOP).  This is ok only if the misalignment
     stays the same throughout the execution of the inner-loop, which is why
     we have to check that the stride of the dataref in the inner-loop evenly
     divides by the vector size.  */
  if (loop && nested_in_vect_loop_p (loop, stmt))
    {
      tree step = DR_STEP (dr);

      if (tree_fits_shwi_p (step)
	  && tree_to_shwi (step) % GET_MODE_SIZE (TYPE_MODE (vectype)) == 0)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "inner step divides the vector-size.\n");
	  misalign = STMT_VINFO_DR_INIT (stmt_info);
	  aligned_to = STMT_VINFO_DR_ALIGNED_TO (stmt_info);
	  base_addr = STMT_VINFO_DR_BASE_ADDRESS (stmt_info);
	}
      else
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "inner step doesn't divide the vector-size.\n");
	  misalign = NULL_TREE;
	}
    }

  /* Similarly we can only use base and misalignment information relative to
     an innermost loop if the misalignment stays the same throughout the
     execution of the loop.  As above, this is the case if the stride of
     the dataref evenly divides by the vector size.  */
  else
    {
      tree step = DR_STEP (dr);
      unsigned vf = loop ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) : 1;

      if (tree_fits_shwi_p (step)
	  && ((tree_to_shwi (step) * vf)
	      % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "step doesn't divide the vector-size.\n");
	  misalign = NULL_TREE;
	}
    }

  /* To look at alignment of the base we have to preserve an inner MEM_REF
     as that carries alignment information of the actual access.  */
  base = ref;
  while (handled_component_p (base))
    base = TREE_OPERAND (base, 0);
  unsigned int base_alignment = 0;
  unsigned HOST_WIDE_INT base_bitpos;
  get_object_alignment_1 (base, &base_alignment, &base_bitpos);
  /* As data-ref analysis strips the MEM_REF down to its base operand
     to form DR_BASE_ADDRESS and adds the offset to DR_INIT we have to
     adjust things to make base_alignment valid as the alignment of
     DR_BASE_ADDRESS.  */
  if (TREE_CODE (base) == MEM_REF)
    {
      /* Note all this only works if DR_BASE_ADDRESS is the same as
	 MEM_REF operand zero, otherwise DR/SCEV analysis might have factored
	 in other offsets.  We need to rework DR to compute the alignment
	 of DR_BASE_ADDRESS as long as all information is still available.  */
      if (operand_equal_p (TREE_OPERAND (base, 0), base_addr, 0))
	{
	  base_bitpos -= mem_ref_offset (base).to_short_addr () * BITS_PER_UNIT;
	  base_bitpos &= (base_alignment - 1);
	}
      else
	base_bitpos = BITS_PER_UNIT;
    }
  if (base_bitpos != 0)
    base_alignment = base_bitpos & -base_bitpos;
  /* Also look at the alignment of the base address DR analysis
     computed.  */
  unsigned int base_addr_alignment = get_pointer_alignment (base_addr);
  if (base_addr_alignment > base_alignment)
    base_alignment = base_addr_alignment;

  if (base_alignment >= TYPE_ALIGN (TREE_TYPE (vectype)))
    DR_VECT_AUX (dr)->base_element_aligned = true;

  alignment = TYPE_ALIGN_UNIT (vectype);

  if ((compare_tree_int (aligned_to, alignment) < 0)
      || !misalign)
    {
      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			   "Unknown alignment for access: ");
	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
	  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
	}
      return true;
    }

  if (base_alignment < TYPE_ALIGN (vectype))
    {
      base = base_addr;
      if (TREE_CODE (base) == ADDR_EXPR)
	base = TREE_OPERAND (base, 0);
      if (!vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype)))
	{
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_NOTE, vect_location,
			       "can't force alignment of ref: ");
	      dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
	      dump_printf (MSG_NOTE, "\n");
	    }
	  return true;
	}

      if (DECL_USER_ALIGN (base))
	{
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_NOTE, vect_location,
			       "not forcing alignment of user-aligned "
			       "variable: ");
	      dump_generic_expr (MSG_NOTE, TDF_SLIM, base);
	      dump_printf (MSG_NOTE, "\n");
	    }
	  return true;
	}

      /* Force the alignment of the decl.
	 NOTE: This is the only change to the code we make during
	 the analysis phase, before deciding to vectorize the loop.  */
      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
	  dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
	  dump_printf (MSG_NOTE, "\n");
	}

      DR_VECT_AUX (dr)->base_decl = base;
      DR_VECT_AUX (dr)->base_misaligned = true;
      DR_VECT_AUX (dr)->base_element_aligned = true;
    }

  if (loop && nested_in_vect_loop_p (loop, stmt))
    step = STMT_VINFO_DR_STEP (stmt_info);
  else
    step = DR_STEP (dr);
  /* If this is a backward running DR then first access in the larger
     vectype actually is N-1 elements before the address in the DR.
     Adjust misalign accordingly.  */
  if (tree_int_cst_sgn (step) < 0)
    {
      tree offset = ssize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
      /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
	 otherwise we wouldn't be here.  */
      offset = fold_build2 (MULT_EXPR, ssizetype, offset, step);
      /* PLUS because STEP was negative.  */
      misalign = size_binop (PLUS_EXPR, misalign, offset);
    }

  SET_DR_MISALIGNMENT (dr,
		       wi::mod_floor (misalign, alignment, SIGNED).to_uhwi ());

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
		       "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
    }

  return true;
}

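/* Worked example (hypothetical numbers): for a V4SI access with a 16-byte
   aligned base and DR_INIT 4, the code above records a misalignment of
   4 bytes.  For the same access with a negative step, the size of the
   (nelements - 1) preceding elements is folded into MISALIGN first, because
   the vector access starts below the address recorded in the DR.  */
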
/* Function vect_update_misalignment_for_peel

   DR - the data reference whose misalignment is to be adjusted.
   DR_PEEL - the data reference whose misalignment is being made
	     zero in the vector loop by the peel.
   NPEEL - the number of iterations in the peel loop if the misalignment
	   of DR_PEEL is known at compile time.  */

static void
vect_update_misalignment_for_peel (struct data_reference *dr,
				   struct data_reference *dr_peel, int npeel)
{
  unsigned int i;
  vec<dr_p> same_align_drs;
  struct data_reference *current_dr;
  int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
  int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
  stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
  stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));

  /* For interleaved data accesses the step in the loop must be multiplied by
     the size of the interleaving group.  */
  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
  if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
    dr_peel_size *= GROUP_SIZE (peel_stmt_info);

  /* It can be assumed that the data refs with the same alignment as dr_peel
     are aligned in the vector loop.  */
  same_align_drs
    = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
  FOR_EACH_VEC_ELT (same_align_drs, i, current_dr)
    {
      if (current_dr != dr)
	continue;
      gcc_assert (DR_MISALIGNMENT (dr) / dr_size ==
		  DR_MISALIGNMENT (dr_peel) / dr_peel_size);
      SET_DR_MISALIGNMENT (dr, 0);
      return;
    }

  if (known_alignment_for_access_p (dr)
      && known_alignment_for_access_p (dr_peel))
    {
      bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
      int misal = DR_MISALIGNMENT (dr);
      tree vectype = STMT_VINFO_VECTYPE (stmt_info);
      misal += negative ? -npeel * dr_size : npeel * dr_size;
      misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
      SET_DR_MISALIGNMENT (dr, misal);
      return;
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment to -1.\n");
  SET_DR_MISALIGNMENT (dr, -1);
}

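/* Worked example (hypothetical numbers): peeling NPEEL == 2 iterations for a
   forward V4SI access with element size 4 and misalignment 8 gives
   (8 + 2 * 4) & 15 == 0, i.e. the access becomes aligned in the vector
   loop; when the peel amount is unknown the misalignment is reset to -1.  */
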
/* Function verify_data_ref_alignment

   Return TRUE if DR can be handled with respect to alignment.  */

static bool
verify_data_ref_alignment (data_reference_p dr)
{
  enum dr_alignment_support supportable_dr_alignment
    = vect_supportable_dr_alignment (dr, false);
  if (!supportable_dr_alignment)
    {
      if (dump_enabled_p ())
	{
	  if (DR_IS_READ (dr))
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "not vectorized: unsupported unaligned load.");
	  else
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "not vectorized: unsupported unaligned "
			     "store.");

	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
			     DR_REF (dr));
	  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
	}
      return false;
    }

  if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "Vectorizing an unaligned access.\n");

  return true;
}

/* Function vect_verify_datarefs_alignment

   Return TRUE if all data references in the loop can be
   handled with respect to alignment.  */

bool
vect_verify_datarefs_alignment (loop_vec_info vinfo)
{
  vec<data_reference_p> datarefs = vinfo->datarefs;
  struct data_reference *dr;
  unsigned int i;

  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      gimple *stmt = DR_STMT (dr);
      stmt_vec_info stmt_info = vinfo_for_stmt (stmt);

      if (!STMT_VINFO_RELEVANT_P (stmt_info))
	continue;

      /* For interleaving, only the alignment of the first access matters.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
	  && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
	continue;

      /* Strided accesses perform only component accesses, alignment is
	 irrelevant for them.  */
      if (STMT_VINFO_STRIDED_P (stmt_info)
	  && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
	continue;

      if (! verify_data_ref_alignment (dr))
	return false;
    }

  return true;
}

/* Given a memory reference EXP return whether its alignment is less
   than its size.  */

static bool
not_size_aligned (tree exp)
{
  if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
    return true;

  return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
	  > get_object_alignment (exp));
}

/* Function vector_alignment_reachable_p

   Return true if vector alignment for DR is reachable by peeling
   a few loop iterations.  Return false otherwise.  */

static bool
vector_alignment_reachable_p (struct data_reference *dr)
{
  gimple *stmt = DR_STMT (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);

  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    {
      /* For interleaved access we peel only if number of iterations in
	 the prolog loop ({VF - misalignment}), is a multiple of the
	 number of the interleaved accesses.  */
      int elem_size, mis_in_elements;
      int nelements = TYPE_VECTOR_SUBPARTS (vectype);

      /* FORNOW: handle only known alignment.  */
      if (!known_alignment_for_access_p (dr))
	return false;

      elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
      mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;

      if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
	return false;
    }

  /* If misalignment is known at the compile time then allow peeling
     only if natural alignment is reachable through peeling.  */
  if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
    {
      HOST_WIDE_INT elmsize =
		int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_NOTE, vect_location,
			   "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
	  dump_printf (MSG_NOTE,
		       ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
	}
      if (DR_MISALIGNMENT (dr) % elmsize)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "data size does not divide the misalignment.\n");
	  return false;
	}
    }

  if (!known_alignment_for_access_p (dr))
    {
      tree type = TREE_TYPE (DR_REF (dr));
      bool is_packed = not_size_aligned (DR_REF (dr));
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "Unknown misalignment, %snaturally aligned\n",
			 is_packed ? "not " : "");
      return targetm.vectorize.vector_alignment_reachable (type, is_packed);
    }

  return true;
}

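/* Worked example (hypothetical numbers): a V4SI access with element size 4
   and known misalignment 8 satisfies 8 % 4 == 0, so alignment is reachable
   by peeling two scalar iterations; a misalignment of 6 bytes is not a
   multiple of the element size and no amount of peeling can fix it.  */
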
/* Calculate the cost of the memory access represented by DR.  */

static void
vect_get_data_access_cost (struct data_reference *dr,
			   unsigned int *inside_cost,
			   unsigned int *outside_cost,
			   stmt_vector_for_cost *body_cost_vec)
{
  gimple *stmt = DR_STMT (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  int ncopies = vf / nunits;

  if (DR_IS_READ (dr))
    vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
			NULL, body_cost_vec, false);
  else
    vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "vect_get_data_access_cost: inside_cost = %d, "
		     "outside_cost = %d.\n", *inside_cost, *outside_cost);
}

typedef struct _vect_peel_info
{
  struct data_reference *dr;
  int npeel;
  unsigned int count;
} *vect_peel_info;

typedef struct _vect_peel_extended_info
{
  struct _vect_peel_info peel_info;
  unsigned int inside_cost;
  unsigned int outside_cost;
  stmt_vector_for_cost body_cost_vec;
} *vect_peel_extended_info;

/* Peeling hashtable helpers.  */

struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
{
  static inline hashval_t hash (const _vect_peel_info *);
  static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
};

inline hashval_t
peel_info_hasher::hash (const _vect_peel_info *peel_info)
{
  return (hashval_t) peel_info->npeel;
}

inline bool
peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
{
  return (a->npeel == b->npeel);
}

/* Insert DR into peeling hash table with NPEEL as key.  */

static void
vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
			  loop_vec_info loop_vinfo, struct data_reference *dr,
			  int npeel)
{
  struct _vect_peel_info elem, *slot;
  _vect_peel_info **new_slot;
  bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);

  elem.npeel = npeel;
  slot = peeling_htab->find (&elem);
  if (slot)
    slot->count++;
  else
    {
      slot = XNEW (struct _vect_peel_info);
      slot->npeel = npeel;
      slot->dr = dr;
      slot->count = 1;
      new_slot = peeling_htab->find_slot (slot, INSERT);
      *new_slot = slot;
    }

  if (!supportable_dr_alignment
      && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
    slot->count += VECT_MAX_COST;
}

/* Traverse peeling hash table to find peeling option that aligns maximum
   number of data accesses.  */

int
vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
				     _vect_peel_extended_info *max)
{
  vect_peel_info elem = *slot;

  if (elem->count > max->peel_info.count
      || (elem->count == max->peel_info.count
	  && max->peel_info.npeel > elem->npeel))
    {
      max->peel_info.npeel = elem->npeel;
      max->peel_info.count = elem->count;
      max->peel_info.dr = elem->dr;
    }

  return 1;
}

/* Traverse peeling hash table and calculate cost for each peeling option.
   Find the one with the lowest cost.  */

int
vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
				   _vect_peel_extended_info *min)
{
  vect_peel_info elem = *slot;
  int save_misalignment, dummy;
  unsigned int inside_cost = 0, outside_cost = 0, i;
  gimple *stmt = DR_STMT (elem->dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
  struct data_reference *dr;
  stmt_vector_for_cost prologue_cost_vec, body_cost_vec, epilogue_cost_vec;

  prologue_cost_vec.create (2);
  body_cost_vec.create (2);
  epilogue_cost_vec.create (2);

  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      stmt = DR_STMT (dr);
      stmt_info = vinfo_for_stmt (stmt);
      /* For interleaving, only the alignment of the first access
	 matters.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
	  && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
	continue;

      /* Strided accesses perform only component accesses, alignment is
	 irrelevant for them.  */
      if (STMT_VINFO_STRIDED_P (stmt_info)
	  && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
	continue;

      save_misalignment = DR_MISALIGNMENT (dr);
      vect_update_misalignment_for_peel (dr, elem->dr, elem->npeel);
      vect_get_data_access_cost (dr, &inside_cost, &outside_cost,
				 &body_cost_vec);
      SET_DR_MISALIGNMENT (dr, save_misalignment);
    }

  outside_cost += vect_get_known_peeling_cost
    (loop_vinfo, elem->npeel, &dummy,
     &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
     &prologue_cost_vec, &epilogue_cost_vec);

  /* Prologue and epilogue costs are added to the target model later.
     These costs depend only on the scalar iteration cost, the
     number of peeling iterations finally chosen, and the number of
     misaligned statements.  So discard the information found here.  */
  prologue_cost_vec.release ();
  epilogue_cost_vec.release ();

  if (inside_cost < min->inside_cost
      || (inside_cost == min->inside_cost && outside_cost < min->outside_cost))
    {
      min->inside_cost = inside_cost;
      min->outside_cost = outside_cost;
      min->body_cost_vec.release ();
      min->body_cost_vec = body_cost_vec;
      min->peel_info.dr = elem->dr;
      min->peel_info.npeel = elem->npeel;
    }
  else
    body_cost_vec.release ();

  return 1;
}

/* Choose best peeling option by traversing peeling hash table and either
   choosing an option with the lowest cost (if cost model is enabled) or the
   option that aligns as many accesses as possible.  */

static struct data_reference *
vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
				       loop_vec_info loop_vinfo,
				       unsigned int *npeel,
				       stmt_vector_for_cost *body_cost_vec)
{
   struct _vect_peel_extended_info res;

   res.peel_info.dr = NULL;
   res.body_cost_vec = stmt_vector_for_cost ();

   if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
     {
       res.inside_cost = INT_MAX;
       res.outside_cost = INT_MAX;
       peeling_htab->traverse <_vect_peel_extended_info *,
			       vect_peeling_hash_get_lowest_cost> (&res);
     }
   else
     {
       res.peel_info.count = 0;
       peeling_htab->traverse <_vect_peel_extended_info *,
			       vect_peeling_hash_get_most_frequent> (&res);
     }

   *npeel = res.peel_info.npeel;
   *body_cost_vec = res.body_cost_vec;
   return res.peel_info.dr;
}

/* Function vect_enhance_data_refs_alignment

   This pass will use loop versioning and loop peeling in order to enhance
   the alignment of data references in the loop.

   FOR NOW: we assume that whatever versioning/peeling takes place, only the
   original loop is to be vectorized.  Any other loops that are created by
   the transformations performed in this pass - are not supposed to be
   vectorized.  This restriction will be relaxed.

   This pass will require a cost model to guide it whether to apply peeling
   or versioning or a combination of the two.  For example, the scheme that
   Intel uses when given a loop with several memory accesses, is as follows:
   choose one memory access ('p') whose alignment you want to force by doing
   peeling.  Then, either (1) generate a loop in which 'p' is aligned and all
   other accesses are not necessarily aligned, or (2) use loop versioning to
   generate one loop in which all accesses are aligned, and another loop in
   which only 'p' is necessarily aligned.

   ("Automatic Intra-Register Vectorization for the Intel Architecture",
   Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
   Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)

   Devising a cost model is the most critical aspect of this work.  It will
   guide us on which access to peel for, whether to use loop versioning, how
   many versions to create, etc.  The cost model will probably consist of
   generic considerations as well as target specific considerations (on
   powerpc for example, misaligned stores are more painful than misaligned
   loads).

   Here are the general steps involved in alignment enhancements:

     -- original loop, before alignment analysis:
	for (i=0; i<N; i++){
	  x = q[i];			# DR_MISALIGNMENT(q) = unknown
	  p[i] = y;			# DR_MISALIGNMENT(p) = unknown
	}

     -- After vect_compute_data_refs_alignment:
	for (i=0; i<N; i++){
	  x = q[i];			# DR_MISALIGNMENT(q) = 3
	  p[i] = y;			# DR_MISALIGNMENT(p) = unknown
	}

     -- Possibility 1: we do loop versioning:
     if (p is aligned) {
	for (i=0; i<N; i++){	# loop 1A
	  x = q[i];			# DR_MISALIGNMENT(q) = 3
	  p[i] = y;			# DR_MISALIGNMENT(p) = 0
	}
     }
     else {
	for (i=0; i<N; i++){	# loop 1B
	  x = q[i];			# DR_MISALIGNMENT(q) = 3
	  p[i] = y;			# DR_MISALIGNMENT(p) = unaligned
	}
     }

     -- Possibility 2: we do loop peeling:
     for (i = 0; i < 3; i++){	# (scalar loop, not to be vectorized).
	x = q[i];
	p[i] = y;
     }
     for (i = 3; i < N; i++){	# loop 2A
	x = q[i];			# DR_MISALIGNMENT(q) = 0
	p[i] = y;			# DR_MISALIGNMENT(p) = unknown
     }

     -- Possibility 3: combination of loop peeling and versioning:
     for (i = 0; i < 3; i++){	# (scalar loop, not to be vectorized).
	x = q[i];
	p[i] = y;
     }
     if (p is aligned) {
	for (i = 3; i<N; i++){	# loop 3A
	  x = q[i];			# DR_MISALIGNMENT(q) = 0
	  p[i] = y;			# DR_MISALIGNMENT(p) = 0
	}
     }
     else {
	for (i = 3; i<N; i++){	# loop 3B
	  x = q[i];			# DR_MISALIGNMENT(q) = 0
	  p[i] = y;			# DR_MISALIGNMENT(p) = unaligned
	}
     }

     These loops are later passed to loop_transform to be vectorized.  The
     vectorizer will use the alignment information to guide the transformation
     (whether to generate regular loads/stores, or with special handling for
     misalignment).  */

bool
1437 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo
)
1439 vec
<data_reference_p
> datarefs
= LOOP_VINFO_DATAREFS (loop_vinfo
);
1440 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1441 enum dr_alignment_support supportable_dr_alignment
;
1442 struct data_reference
*dr0
= NULL
, *first_store
= NULL
;
1443 struct data_reference
*dr
;
1445 bool do_peeling
= false;
1446 bool do_versioning
= false;
1449 stmt_vec_info stmt_info
;
1450 unsigned int npeel
= 0;
1451 bool all_misalignments_unknown
= true;
1452 unsigned int vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
1453 unsigned possible_npeel_number
= 1;
1455 unsigned int nelements
, mis
, same_align_drs_max
= 0;
1456 stmt_vector_for_cost body_cost_vec
= stmt_vector_for_cost ();
1457 hash_table
<peel_info_hasher
> peeling_htab (1);
1459 if (dump_enabled_p ())
1460 dump_printf_loc (MSG_NOTE
, vect_location
,
1461 "=== vect_enhance_data_refs_alignment ===\n");
1463 /* Reset data so we can safely be called multiple times. */
1464 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).truncate (0);
1465 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
) = 0;
1467 /* While cost model enhancements are expected in the future, the high level
1468 view of the code at this time is as follows:
1470 A) If there is a misaligned access then see if peeling to align
1471 this access can make all data references satisfy
1472 vect_supportable_dr_alignment. If so, update data structures
1473 as needed and return true.
1475 B) If peeling wasn't possible and there is a data reference with an
1476 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1477 then see if loop versioning checks can be used to make all data
1478 references satisfy vect_supportable_dr_alignment. If so, update
1479 data structures as needed and return true.
1481 C) If neither peeling nor versioning were successful then return false if
1482 any data reference does not satisfy vect_supportable_dr_alignment.
1484 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1486 Note, Possibility 3 above (which is peeling and versioning together) is not
1487 being done at this time. */
1489 /* (1) Peeling to force alignment. */
1491 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1493 + How many accesses will become aligned due to the peeling
1494 - How many accesses will become unaligned due to the peeling,
1495 and the cost of misaligned accesses.
1496 - The cost of peeling (the extra runtime checks, the increase
1499 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
1501 stmt
= DR_STMT (dr
);
1502 stmt_info
= vinfo_for_stmt (stmt
);
1504 if (!STMT_VINFO_RELEVANT_P (stmt_info
))
1507 /* For interleaving, only the alignment of the first access
1509 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1510 && GROUP_FIRST_ELEMENT (stmt_info
) != stmt
)
1513 /* For invariant accesses there is nothing to enhance. */
1514 if (integer_zerop (DR_STEP (dr
)))
1517 /* Strided accesses perform only component accesses, alignment is
1518 irrelevant for them. */
1519 if (STMT_VINFO_STRIDED_P (stmt_info
)
1520 && !STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1523 supportable_dr_alignment
= vect_supportable_dr_alignment (dr
, true);
1524 do_peeling
= vector_alignment_reachable_p (dr
);
1527 if (known_alignment_for_access_p (dr
))
1529 unsigned int npeel_tmp
;
1530 bool negative
= tree_int_cst_compare (DR_STEP (dr
),
1531 size_zero_node
) < 0;
1533 /* Save info about DR in the hash table. */
1534 vectype
= STMT_VINFO_VECTYPE (stmt_info
);
1535 nelements
= TYPE_VECTOR_SUBPARTS (vectype
);
1536 mis
= DR_MISALIGNMENT (dr
) / GET_MODE_SIZE (TYPE_MODE (
1537 TREE_TYPE (DR_REF (dr
))));
1538 npeel_tmp
= (negative
1539 ? (mis
- nelements
) : (nelements
- mis
))
1542 /* For multiple types, it is possible that the bigger type access
1543 will have more than one peeling option. E.g., a loop with two
1544 types: one of size (vector size / 4), and the other one of
1545 size (vector size / 8). Vectorization factor will 8. If both
1546 access are misaligned by 3, the first one needs one scalar
1547 iteration to be aligned, and the second one needs 5. But the
1548 first one will be aligned also by peeling 5 scalar
1549 iterations, and in that case both accesses will be aligned.
1550 Hence, except for the immediate peeling amount, we also want
1551 to try to add full vector size, while we don't exceed
1552 vectorization factor.
1553 We do this automtically for cost model, since we calculate cost
1554 for every peeling option. */
1555 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo
)))
1557 if (STMT_SLP_TYPE (stmt_info
))
1558 possible_npeel_number
1559 = (vf
* GROUP_SIZE (stmt_info
)) / nelements
;
1561 possible_npeel_number
= vf
/ nelements
;
1564 /* Handle the aligned case. We may decide to align some other
1565 access, making DR unaligned. */
1566 if (DR_MISALIGNMENT (dr
) == 0)
1569 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo
)))
1570 possible_npeel_number
++;
1573 for (j
= 0; j
< possible_npeel_number
; j
++)
1575 vect_peeling_hash_insert (&peeling_htab
, loop_vinfo
,
1577 npeel_tmp
+= nelements
;
1580 all_misalignments_unknown
= false;
1581 /* Data-ref that was chosen for the case that all the
1582 misalignments are unknown is not relevant anymore, since we
1583 have a data-ref with known alignment. */
1588 /* If we don't know any misalignment values, we prefer
1589 peeling for data-ref that has the maximum number of data-refs
1590 with the same alignment, unless the target prefers to align
1591 stores over load. */
1592 if (all_misalignments_unknown
)
1594 unsigned same_align_drs
1595 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info
).length ();
1597 || same_align_drs_max
< same_align_drs
)
1599 same_align_drs_max
= same_align_drs
;
1602 /* For data-refs with the same number of related
1603 accesses prefer the one where the misalign
1604 computation will be invariant in the outermost loop. */
1605 else if (same_align_drs_max
== same_align_drs
)
1607 struct loop
*ivloop0
, *ivloop
;
1608 ivloop0
= outermost_invariant_loop_for_expr
1609 (loop
, DR_BASE_ADDRESS (dr0
));
1610 ivloop
= outermost_invariant_loop_for_expr
1611 (loop
, DR_BASE_ADDRESS (dr
));
1612 if ((ivloop
&& !ivloop0
)
1613 || (ivloop
&& ivloop0
1614 && flow_loop_nested_p (ivloop
, ivloop0
)))
1618 if (!first_store
&& DR_IS_WRITE (dr
))
1622 /* If there are both known and unknown misaligned accesses in the
1623 loop, we choose peeling amount according to the known
1625 if (!supportable_dr_alignment
)
1628 if (!first_store
&& DR_IS_WRITE (dr
))
1635 if (!aligned_access_p (dr
))
1637 if (dump_enabled_p ())
1638 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1639 "vector alignment may not be reachable\n");
1645 /* Check if we can possibly peel the loop. */
1646 if (!vect_can_advance_ivs_p (loop_vinfo
)
1647 || !slpeel_can_duplicate_loop_p (loop
, single_exit (loop
))
1652 && all_misalignments_unknown
1653 && vect_supportable_dr_alignment (dr0
, false))
1655 /* Check if the target requires to prefer stores over loads, i.e., if
1656 misaligned stores are more expensive than misaligned loads (taking
1657 drs with same alignment into account). */
1658 if (first_store
&& DR_IS_READ (dr0
))
1660 unsigned int load_inside_cost
= 0, load_outside_cost
= 0;
1661 unsigned int store_inside_cost
= 0, store_outside_cost
= 0;
1662 unsigned int load_inside_penalty
= 0, load_outside_penalty
= 0;
1663 unsigned int store_inside_penalty
= 0, store_outside_penalty
= 0;
1664 stmt_vector_for_cost dummy
;
1667 vect_get_data_access_cost (dr0
, &load_inside_cost
, &load_outside_cost
,
1669 vect_get_data_access_cost (first_store
, &store_inside_cost
,
1670 &store_outside_cost
, &dummy
);
1674 /* Calculate the penalty for leaving FIRST_STORE unaligned (by
1675 aligning the load DR0). */
1676 load_inside_penalty
= store_inside_cost
;
1677 load_outside_penalty
= store_outside_cost
;
1679 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1680 DR_STMT (first_store
))).iterate (i
, &dr
);
1682 if (DR_IS_READ (dr
))
1684 load_inside_penalty
+= load_inside_cost
;
1685 load_outside_penalty
+= load_outside_cost
;
1689 load_inside_penalty
+= store_inside_cost
;
1690 load_outside_penalty
+= store_outside_cost
;
1693 /* Calculate the penalty for leaving DR0 unaligned (by
1694 aligning the FIRST_STORE). */
1695 store_inside_penalty
= load_inside_cost
;
1696 store_outside_penalty
= load_outside_cost
;
1698 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1699 DR_STMT (dr0
))).iterate (i
, &dr
);
1701 if (DR_IS_READ (dr
))
1703 store_inside_penalty
+= load_inside_cost
;
1704 store_outside_penalty
+= load_outside_cost
;
1708 store_inside_penalty
+= store_inside_cost
;
1709 store_outside_penalty
+= store_outside_cost
;
1712 if (load_inside_penalty
> store_inside_penalty
1713 || (load_inside_penalty
== store_inside_penalty
1714 && load_outside_penalty
> store_outside_penalty
))
1718 /* In case there are only loads with different unknown misalignments, use
1719 peeling only if it may help to align other accesses in the loop or
1720 if it may help improving load bandwith when we'd end up using
1722 tree dr0_vt
= STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr0
)));
1724 && !STMT_VINFO_SAME_ALIGN_REFS (
1725 vinfo_for_stmt (DR_STMT (dr0
))).length ()
1726 && (vect_supportable_dr_alignment (dr0
, false)
1727 != dr_unaligned_supported
1728 || (builtin_vectorization_cost (vector_load
, dr0_vt
, 0)
1729 == builtin_vectorization_cost (unaligned_load
, dr0_vt
, -1))))
1733 if (do_peeling
&& !dr0
)
1735 /* Peeling is possible, but there is no data access that is not supported
1736 unless aligned. So we try to choose the best possible peeling. */
1738 /* We should get here only if there are drs with known misalignment. */
1739 gcc_assert (!all_misalignments_unknown
);
1741 /* Choose the best peeling from the hash table. */
1742 dr0
= vect_peeling_hash_choose_best_peeling (&peeling_htab
,
1751 stmt
= DR_STMT (dr0
);
1752 stmt_info
= vinfo_for_stmt (stmt
);
1753 vectype
= STMT_VINFO_VECTYPE (stmt_info
);
1754 nelements
= TYPE_VECTOR_SUBPARTS (vectype
);
1756 if (known_alignment_for_access_p (dr0
))
1758 bool negative
= tree_int_cst_compare (DR_STEP (dr0
),
1759 size_zero_node
) < 0;
1762 /* Since it's known at compile time, compute the number of
1763 iterations in the peeled loop (the peeling factor) for use in
1764 updating DR_MISALIGNMENT values. The peeling factor is the
1765 vectorization factor minus the misalignment as an element
1767 mis
= DR_MISALIGNMENT (dr0
);
1768 mis
/= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0
))));
1769 npeel
= ((negative
? mis
- nelements
: nelements
- mis
)
1773 /* For interleaved data access every iteration accesses all the
1774 members of the group, therefore we divide the number of iterations
1775 by the group size. */
1776 stmt_info
= vinfo_for_stmt (DR_STMT (dr0
));
1777 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1778 npeel
/= GROUP_SIZE (stmt_info
);
1780 if (dump_enabled_p ())
1781 dump_printf_loc (MSG_NOTE
, vect_location
,
1782 "Try peeling by %d\n", npeel
);
1785 /* Ensure that all data refs can be vectorized after the peel. */
1786 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
1788 int save_misalignment
;
1793 stmt
= DR_STMT (dr
);
1794 stmt_info
= vinfo_for_stmt (stmt
);
1795 /* For interleaving, only the alignment of the first access
1797 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1798 && GROUP_FIRST_ELEMENT (stmt_info
) != stmt
)
1801 /* Strided accesses perform only component accesses, alignment is
1802 irrelevant for them. */
1803 if (STMT_VINFO_STRIDED_P (stmt_info
)
1804 && !STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1807 save_misalignment
= DR_MISALIGNMENT (dr
);
1808 vect_update_misalignment_for_peel (dr
, dr0
, npeel
);
1809 supportable_dr_alignment
= vect_supportable_dr_alignment (dr
, false);
1810 SET_DR_MISALIGNMENT (dr
, save_misalignment
);
1812 if (!supportable_dr_alignment
)
1819 if (do_peeling
&& known_alignment_for_access_p (dr0
) && npeel
== 0)
1821 stat
= vect_verify_datarefs_alignment (loop_vinfo
);
1826 body_cost_vec
.release ();
1831 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
1834 unsigned max_allowed_peel
1835 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT
);
1836 if (max_allowed_peel
!= (unsigned)-1)
1838 unsigned max_peel
= npeel
;
1841 gimple
*dr_stmt
= DR_STMT (dr0
);
1842 stmt_vec_info vinfo
= vinfo_for_stmt (dr_stmt
);
1843 tree vtype
= STMT_VINFO_VECTYPE (vinfo
);
1844 max_peel
= TYPE_VECTOR_SUBPARTS (vtype
) - 1;
1846 if (max_peel
> max_allowed_peel
)
1849 if (dump_enabled_p ())
1850 dump_printf_loc (MSG_NOTE
, vect_location
,
1851 "Disable peeling, max peels reached: %d\n", max_peel
);
1856 /* Cost model #2 - if peeling may result in a remaining loop not
1857 iterating enough to be vectorized then do not peel. */
1859 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
))
1862 = npeel
== 0 ? LOOP_VINFO_VECT_FACTOR (loop_vinfo
) - 1 : npeel
;
1863 if (LOOP_VINFO_INT_NITERS (loop_vinfo
)
1864 < LOOP_VINFO_VECT_FACTOR (loop_vinfo
) + max_peel
)
      if (do_peeling)
        {
          /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
             If the misalignment of DR_i is identical to that of dr0 then set
             DR_MISALIGNMENT (DR_i) to zero.  If the misalignment of DR_i and
             dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
             by the peeling factor times the element size of DR_i (MOD the
             vectorization factor times the size).  Otherwise, the
             misalignment of DR_i must be set to unknown.  */
          FOR_EACH_VEC_ELT (datarefs, i, dr)
            if (dr != dr0)
              {
                /* Strided accesses perform only component accesses, alignment
                   is irrelevant for them.  */
                stmt_info = vinfo_for_stmt (DR_STMT (dr));
                if (STMT_VINFO_STRIDED_P (stmt_info)
                    && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
                  continue;

                vect_update_misalignment_for_peel (dr, dr0, npeel);
              }

          LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
          if (npeel)
            LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
          else
            LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
              = DR_MISALIGNMENT (dr0);
          SET_DR_MISALIGNMENT (dr0, 0);
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_NOTE, vect_location,
                               "Alignment of access forced using peeling.\n");
              dump_printf_loc (MSG_NOTE, vect_location,
                               "Peeling for alignment will be applied.\n");
            }

          /* The inside-loop cost will be accounted for in vectorizable_load
             and vectorizable_store correctly with adjusted alignments.
             Drop the body_cost_vec on the floor here.  */
          body_cost_vec.release ();

          stat = vect_verify_datarefs_alignment (loop_vinfo);
          gcc_assert (stat);
          return stat;
        }
    }

  body_cost_vec.release ();
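  /* If we get here, peeling was either not possible or was rejected by the
     cost model above, so consider loop versioning next.  */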
  /* (2) Versioning to force alignment.  */

  /* Try versioning if:
     1) optimize loop for speed
     2) there is at least one unsupported misaligned data ref with an unknown
        misalignment, and
     3) all misaligned data refs with a known misalignment are supported, and
     4) the number of runtime alignment checks is within reason.  */

  do_versioning =
        optimize_loop_nest_for_speed_p (loop)
        && (!loop->inner); /* FORNOW */

  if (do_versioning)
    {
      FOR_EACH_VEC_ELT (datarefs, i, dr)
        {
          stmt = DR_STMT (dr);
          stmt_info = vinfo_for_stmt (stmt);

          /* For interleaving, only the alignment of the first access
             matters.  */
          if (aligned_access_p (dr)
              || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
                  && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
            continue;

          if (STMT_VINFO_STRIDED_P (stmt_info))
            {
              /* Strided loads perform only component accesses, alignment is
                 irrelevant for them.  */
              if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
                continue;
              do_versioning = false;
              break;
            }

          supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);

          if (!supportable_dr_alignment)
            {
              gimple *stmt;
              int mask;
              tree vectype;

              if (known_alignment_for_access_p (dr)
                  || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
                     >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
                {
                  do_versioning = false;
                  break;
                }

              stmt = DR_STMT (dr);
              vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
              gcc_assert (vectype);

              /* The rightmost bits of an aligned address must be zeros.
                 Construct the mask needed for this test.  For example,
                 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
                 mask must be 15 = 0xf.  */
              mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;

              /* FORNOW: use the same mask to test all potentially unaligned
                 references in the loop.  The vectorizer currently supports
                 a single vector size, see the reference to
                 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
                 vectorization factor is computed.  */
              gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
                          || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
              LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
              LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
                                                          DR_STMT (dr));
            }
        }

      /* Versioning requires at least one misaligned data reference.  */
      if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
        do_versioning = false;
      else if (!do_versioning)
        LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
    }

  if (do_versioning)
    {
      vec<gimple *> may_misalign_stmts
        = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
      gimple *stmt;

      /* It can now be assumed that the data references in the statements
         in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
         of the loop being vectorized.  */
      FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
        {
          stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
          dr = STMT_VINFO_DATA_REF (stmt_info);
          SET_DR_MISALIGNMENT (dr, 0);
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "Alignment of access forced using versioning.\n");
        }

      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "Versioning for alignment will be applied.\n");

      /* Peeling and versioning can't be done together at this time.  */
      gcc_assert (! (do_peeling && do_versioning));

      stat = vect_verify_datarefs_alignment (loop_vinfo);
      gcc_assert (stat);
      return stat;
    }

  /* This point is reached if neither peeling nor versioning is being done.  */
  gcc_assert (! (do_peeling || do_versioning));

  stat = vect_verify_datarefs_alignment (loop_vinfo);
  return stat;
}

/* Function vect_find_same_alignment_drs.

   Update group and alignment relations according to the chosen
   vectorization factor.  */

static void
vect_find_same_alignment_drs (struct data_dependence_relation *ddr,
                              loop_vec_info loop_vinfo)
{
  unsigned int i;
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  struct data_reference *dra = DDR_A (ddr);
  struct data_reference *drb = DDR_B (ddr);
  stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
  stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
  int dra_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra))));
  int drb_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb))));
  lambda_vector dist_v;
  unsigned int loop_depth;

  if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
    return;

  if (dra == drb)
    return;

  if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
    return;

  /* Loop-based vectorization and known data dependence.  */
  if (DDR_NUM_DIST_VECTS (ddr) == 0)
    return;

  /* Data-dependence analysis reports a distance vector of zero
     for data-references that overlap only in the first iteration
     but have different sign step (see PR45764).
     So as a sanity check require equal DR_STEP.  */
  if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
    return;

  loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
  FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
    {
      int dist = dist_v[loop_depth];

      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "dependence distance = %d.\n", dist);

      /* Same loop iteration.  */
      if (dist == 0
          || (dist % vectorization_factor == 0 && dra_size == drb_size))
        {
          /* Two references with distance zero have the same alignment.  */
          STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
          STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_NOTE, vect_location,
                               "accesses have the same alignment.\n");
              dump_printf (MSG_NOTE,
                           "dependence distance modulo vf == 0 between ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
              dump_printf (MSG_NOTE, " and ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
              dump_printf (MSG_NOTE, "\n");
            }
        }
    }
}

/* Function vect_analyze_data_refs_alignment

   Analyze the alignment of the data-references in the loop.
   Return FALSE if a data reference is found that cannot be vectorized.  */

bool
vect_analyze_data_refs_alignment (loop_vec_info vinfo)
{
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "=== vect_analyze_data_refs_alignment ===\n");

  /* Mark groups of data references with same alignment using
     data dependence information.  */
  vec<ddr_p> ddrs = vinfo->ddrs;
  struct data_dependence_relation *ddr;
  unsigned int i;

  FOR_EACH_VEC_ELT (ddrs, i, ddr)
    vect_find_same_alignment_drs (ddr, vinfo);

  vec<data_reference_p> datarefs = vinfo->datarefs;
  struct data_reference *dr;

  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
      if (STMT_VINFO_VECTORIZABLE (stmt_info)
          && !vect_compute_data_ref_alignment (dr))
        {
          /* Strided accesses perform only component accesses, misalignment
             information is irrelevant for them.  */
          if (STMT_VINFO_STRIDED_P (stmt_info)
              && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
            continue;

          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "not vectorized: can't calculate alignment "
                             "for data ref.\n");

          return false;
        }
    }

  return true;
}

/* Analyze alignment of DRs of stmts in NODE.  */

static bool
vect_slp_analyze_and_verify_node_alignment (slp_tree node)
{
  /* We vectorize from the first scalar stmt in the node unless
     the node is permuted in which case we start from the first
     element in the group.  */
  gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
  data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
  if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
    first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));

  data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
  if (! vect_compute_data_ref_alignment (dr)
      /* For creating the data-ref pointer we need alignment of the
         first element anyway.  */
      || (dr != first_dr
          && ! vect_compute_data_ref_alignment (first_dr))
      || ! verify_data_ref_alignment (dr))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: bad data alignment in basic "
                         "block.\n");
      return false;
    }

  return true;
}

/* Function vect_slp_analyze_instance_alignment

   Analyze the alignment of the data-references in the SLP instance.
   Return FALSE if a data reference is found that cannot be vectorized.  */

bool
vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
{
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "=== vect_slp_analyze_and_verify_instance_alignment ===\n");

  slp_tree node;
  unsigned i;
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
    if (! vect_slp_analyze_and_verify_node_alignment (node))
      return false;

  node = SLP_INSTANCE_TREE (instance);
  if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]))
      && ! vect_slp_analyze_and_verify_node_alignment
             (SLP_INSTANCE_TREE (instance)))
    return false;

  return true;
}
2220 /* Analyze groups of accesses: check that DR belongs to a group of
2221 accesses of legal size, step, etc. Detect gaps, single element
2222 interleaving, and other special cases. Set grouped access info.
2223 Collect groups of strided stores for further use in SLP analysis.
2224 Worker for vect_analyze_group_access. */
2227 vect_analyze_group_access_1 (struct data_reference
*dr
)
2229 tree step
= DR_STEP (dr
);
2230 tree scalar_type
= TREE_TYPE (DR_REF (dr
));
2231 HOST_WIDE_INT type_size
= TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type
));
2232 gimple
*stmt
= DR_STMT (dr
);
2233 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
2234 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
2235 bb_vec_info bb_vinfo
= STMT_VINFO_BB_VINFO (stmt_info
);
2236 HOST_WIDE_INT dr_step
= -1;
2237 HOST_WIDE_INT groupsize
, last_accessed_element
= 1;
2238 bool slp_impossible
= false;
2240 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2241 size of the interleaving group (including gaps). */
2242 if (tree_fits_shwi_p (step
))
2244 dr_step
= tree_to_shwi (step
);
2245 /* Check that STEP is a multiple of type size. Otherwise there is
2246 a non-element-sized gap at the end of the group which we
2247 cannot represent in GROUP_GAP or GROUP_SIZE.
2248 ??? As we can handle non-constant step fine here we should
2249 simply remove uses of GROUP_GAP between the last and first
2250 element and instead rely on DR_STEP. GROUP_SIZE then would
2251 simply not include that gap. */
2252 if ((dr_step
% type_size
) != 0)
2254 if (dump_enabled_p ())
2256 dump_printf_loc (MSG_NOTE
, vect_location
,
2258 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, step
);
2259 dump_printf (MSG_NOTE
,
2260 " is not a multiple of the element size for ");
2261 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dr
));
2262 dump_printf (MSG_NOTE
, "\n");
2266 groupsize
= absu_hwi (dr_step
) / type_size
;
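      /* For example, 4-byte elements accessed with a step of 32 bytes give
         groupsize == 32 / 4 == 8: the interleaving group, including gap
         elements, spans eight elements per scalar iteration.  */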
2271 /* Not consecutive access is possible only if it is a part of interleaving. */
2272 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)))
2274 /* Check if it this DR is a part of interleaving, and is a single
2275 element of the group that is accessed in the loop. */
2277 /* Gaps are supported only for loads. STEP must be a multiple of the type
2278 size. The size of the group must be a power of 2. */
2280 && (dr_step
% type_size
) == 0
2282 && pow2p_hwi (groupsize
))
2284 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) = stmt
;
2285 GROUP_SIZE (vinfo_for_stmt (stmt
)) = groupsize
;
2286 GROUP_GAP (stmt_info
) = groupsize
- 1;
2287 if (dump_enabled_p ())
2289 dump_printf_loc (MSG_NOTE
, vect_location
,
2290 "Detected single element interleaving ");
2291 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dr
));
2292 dump_printf (MSG_NOTE
, " step ");
2293 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, step
);
2294 dump_printf (MSG_NOTE
, "\n");
2300 if (dump_enabled_p ())
2302 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2303 "not consecutive access ");
2304 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
2309 /* Mark the statement as unvectorizable. */
2310 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr
))) = false;
2314 dump_printf_loc (MSG_NOTE
, vect_location
, "using strided accesses\n");
2315 STMT_VINFO_STRIDED_P (stmt_info
) = true;
2319 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) == stmt
)
2321 /* First stmt in the interleaving chain. Check the chain. */
2322 gimple
*next
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt
));
2323 struct data_reference
*data_ref
= dr
;
2324 unsigned int count
= 1;
2325 tree prev_init
= DR_INIT (data_ref
);
2326 gimple
*prev
= stmt
;
2327 HOST_WIDE_INT diff
, gaps
= 0;
2331 /* Skip same data-refs. In case that two or more stmts share
2332 data-ref (supported only for loads), we vectorize only the first
2333 stmt, and the rest get their vectorized loads from the first
2335 if (!tree_int_cst_compare (DR_INIT (data_ref
),
2336 DR_INIT (STMT_VINFO_DATA_REF (
2337 vinfo_for_stmt (next
)))))
2339 if (DR_IS_WRITE (data_ref
))
2341 if (dump_enabled_p ())
2342 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2343 "Two store stmts share the same dr.\n");
2347 if (dump_enabled_p ())
2348 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2349 "Two or more load stmts share the same dr.\n");
2351 /* For load use the same data-ref load. */
2352 GROUP_SAME_DR_STMT (vinfo_for_stmt (next
)) = prev
;
2355 next
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (next
));
2360 data_ref
= STMT_VINFO_DATA_REF (vinfo_for_stmt (next
));
2362 /* All group members have the same STEP by construction. */
2363 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref
), step
, 0));
2365 /* Check that the distance between two accesses is equal to the type
2366 size. Otherwise, we have gaps. */
2367 diff
= (TREE_INT_CST_LOW (DR_INIT (data_ref
))
2368 - TREE_INT_CST_LOW (prev_init
)) / type_size
;
2371 /* FORNOW: SLP of accesses with gaps is not supported. */
2372 slp_impossible
= true;
2373 if (DR_IS_WRITE (data_ref
))
2375 if (dump_enabled_p ())
2376 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2377 "interleaved store with gaps\n");
2384 last_accessed_element
+= diff
;
2386 /* Store the gap from the previous member of the group. If there is no
2387 gap in the access, GROUP_GAP is always 1. */
2388 GROUP_GAP (vinfo_for_stmt (next
)) = diff
;
2390 prev_init
= DR_INIT (data_ref
);
2391 next
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (next
));
2392 /* Count the number of data-refs in the chain. */
2397 groupsize
= count
+ gaps
;
2399 /* This could be UINT_MAX but as we are generating code in a very
2400 inefficient way we have to cap earlier. See PR78699 for example. */
2401 if (groupsize
> 4096)
2403 if (dump_enabled_p ())
2404 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2405 "group is too large\n");
2409 /* Check that the size of the interleaving is equal to count for stores,
2410 i.e., that there are no gaps. */
2411 if (groupsize
!= count
2412 && !DR_IS_READ (dr
))
2414 if (dump_enabled_p ())
2415 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2416 "interleaved store with gaps\n");
2420 /* If there is a gap after the last load in the group it is the
2421 difference between the groupsize and the last accessed
2423 When there is no gap, this difference should be 0. */
2424 GROUP_GAP (vinfo_for_stmt (stmt
)) = groupsize
- last_accessed_element
;
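      /* E.g. for a group of size 4 in which only elements 0 and 2 are loaded,
         last_accessed_element is 3, so a trailing gap of 4 - 3 == 1 element
         is recorded here.  */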
2426 GROUP_SIZE (vinfo_for_stmt (stmt
)) = groupsize
;
2427 if (dump_enabled_p ())
2429 dump_printf_loc (MSG_NOTE
, vect_location
,
2430 "Detected interleaving ");
2431 if (DR_IS_READ (dr
))
2432 dump_printf (MSG_NOTE
, "load ");
2434 dump_printf (MSG_NOTE
, "store ");
2435 dump_printf (MSG_NOTE
, "of size %u starting with ",
2436 (unsigned)groupsize
);
2437 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, stmt
, 0);
2438 if (GROUP_GAP (vinfo_for_stmt (stmt
)) != 0)
2439 dump_printf_loc (MSG_NOTE
, vect_location
,
2440 "There is a gap of %u elements after the group\n",
2441 GROUP_GAP (vinfo_for_stmt (stmt
)));
2444 /* SLP: create an SLP data structure for every interleaving group of
2445 stores for further analysis in vect_analyse_slp. */
2446 if (DR_IS_WRITE (dr
) && !slp_impossible
)
2449 LOOP_VINFO_GROUPED_STORES (loop_vinfo
).safe_push (stmt
);
2451 BB_VINFO_GROUPED_STORES (bb_vinfo
).safe_push (stmt
);

/* Analyze groups of accesses: check that DR belongs to a group of
   accesses of legal size, step, etc.  Detect gaps, single element
   interleaving, and other special cases.  Set grouped access info.
   Collect groups of strided stores for further use in SLP analysis.  */

static bool
vect_analyze_group_access (struct data_reference *dr)
{
  if (!vect_analyze_group_access_1 (dr))
    {
      /* Dissolve the group if present.  */
      gimple *next;
      gimple *stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr)));
      while (stmt)
        {
          stmt_vec_info vinfo = vinfo_for_stmt (stmt);
          next = GROUP_NEXT_ELEMENT (vinfo);
          GROUP_FIRST_ELEMENT (vinfo) = NULL;
          GROUP_NEXT_ELEMENT (vinfo) = NULL;
          stmt = next;
        }
      return false;
    }
  return true;
}
2484 /* Analyze the access pattern of the data-reference DR.
2485 In case of non-consecutive accesses call vect_analyze_group_access() to
2486 analyze groups of accesses. */
2489 vect_analyze_data_ref_access (struct data_reference
*dr
)
2491 tree step
= DR_STEP (dr
);
2492 tree scalar_type
= TREE_TYPE (DR_REF (dr
));
2493 gimple
*stmt
= DR_STMT (dr
);
2494 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
2495 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
2496 struct loop
*loop
= NULL
;
2499 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
2501 if (loop_vinfo
&& !step
)
2503 if (dump_enabled_p ())
2504 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2505 "bad data-ref access in loop\n");
2509 /* Allow loads with zero step in inner-loop vectorization. */
2510 if (loop_vinfo
&& integer_zerop (step
))
2512 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) = NULL
;
2513 if (!nested_in_vect_loop_p (loop
, stmt
))
2514 return DR_IS_READ (dr
);
2515 /* Allow references with zero step for outer loops marked
2516 with pragma omp simd only - it guarantees absence of
2517 loop-carried dependencies between inner loop iterations. */
2518 if (!loop
->force_vectorize
)
2520 if (dump_enabled_p ())
2521 dump_printf_loc (MSG_NOTE
, vect_location
,
2522 "zero step in inner loop of nest\n");
2527 if (loop
&& nested_in_vect_loop_p (loop
, stmt
))
2529 /* Interleaved accesses are not yet supported within outer-loop
2530 vectorization for references in the inner-loop. */
2531 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) = NULL
;
2533 /* For the rest of the analysis we use the outer-loop step. */
2534 step
= STMT_VINFO_DR_STEP (stmt_info
);
2535 if (integer_zerop (step
))
2537 if (dump_enabled_p ())
2538 dump_printf_loc (MSG_NOTE
, vect_location
,
2539 "zero step in outer loop.\n");
2540 return DR_IS_READ (dr
);
2545 if (TREE_CODE (step
) == INTEGER_CST
)
2547 HOST_WIDE_INT dr_step
= TREE_INT_CST_LOW (step
);
2548 if (!tree_int_cst_compare (step
, TYPE_SIZE_UNIT (scalar_type
))
2550 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type
), -dr_step
)))
2552 /* Mark that it is not interleaving. */
2553 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) = NULL
;
2558 if (loop
&& nested_in_vect_loop_p (loop
, stmt
))
2560 if (dump_enabled_p ())
2561 dump_printf_loc (MSG_NOTE
, vect_location
,
2562 "grouped access in outer loop.\n");
2567 /* Assume this is a DR handled by non-constant strided load case. */
2568 if (TREE_CODE (step
) != INTEGER_CST
)
2569 return (STMT_VINFO_STRIDED_P (stmt_info
)
2570 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info
)
2571 || vect_analyze_group_access (dr
)));
2573 /* Not consecutive access - check if it's a part of interleaving group. */
2574 return vect_analyze_group_access (dr
);
2579 /* A helper function used in the comparator function to sort data
2580 references. T1 and T2 are two data references to be compared.
2581 The function returns -1, 0, or 1. */
2584 compare_tree (tree t1
, tree t2
)
2587 enum tree_code code
;
2600 if (TREE_CODE (t1
) != TREE_CODE (t2
))
2601 return TREE_CODE (t1
) < TREE_CODE (t2
) ? -1 : 1;
2603 code
= TREE_CODE (t1
);
2606 /* For const values, we can just use hash values for comparisons. */
2614 hashval_t h1
= iterative_hash_expr (t1
, 0);
2615 hashval_t h2
= iterative_hash_expr (t2
, 0);
2617 return h1
< h2
? -1 : 1;
2622 cmp
= compare_tree (SSA_NAME_VAR (t1
), SSA_NAME_VAR (t2
));
2626 if (SSA_NAME_VERSION (t1
) != SSA_NAME_VERSION (t2
))
2627 return SSA_NAME_VERSION (t1
) < SSA_NAME_VERSION (t2
) ? -1 : 1;
2631 tclass
= TREE_CODE_CLASS (code
);
2633 /* For var-decl, we could compare their UIDs. */
2634 if (tclass
== tcc_declaration
)
2636 if (DECL_UID (t1
) != DECL_UID (t2
))
2637 return DECL_UID (t1
) < DECL_UID (t2
) ? -1 : 1;
2641 /* For expressions with operands, compare their operands recursively. */
2642 for (i
= TREE_OPERAND_LENGTH (t1
) - 1; i
>= 0; --i
)
2644 cmp
= compare_tree (TREE_OPERAND (t1
, i
), TREE_OPERAND (t2
, i
));
2654 /* Compare two data-references DRA and DRB to group them into chunks
2655 suitable for grouping. */
2658 dr_group_sort_cmp (const void *dra_
, const void *drb_
)
2660 data_reference_p dra
= *(data_reference_p
*)const_cast<void *>(dra_
);
2661 data_reference_p drb
= *(data_reference_p
*)const_cast<void *>(drb_
);
2664 /* Stabilize sort. */
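  /* qsort is not guaranteed to be stable, so every path below ends in a
     deterministic tie-breaker (ultimately DR_INIT and the statement UID)
     to keep the resulting order reproducible.  */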
2668 /* DRs in different loops never belong to the same group. */
2669 loop_p loopa
= gimple_bb (DR_STMT (dra
))->loop_father
;
2670 loop_p loopb
= gimple_bb (DR_STMT (drb
))->loop_father
;
2672 return loopa
->num
< loopb
->num
? -1 : 1;
2674 /* Ordering of DRs according to base. */
2675 if (!operand_equal_p (DR_BASE_ADDRESS (dra
), DR_BASE_ADDRESS (drb
), 0))
2677 cmp
= compare_tree (DR_BASE_ADDRESS (dra
), DR_BASE_ADDRESS (drb
));
2682 /* And according to DR_OFFSET. */
2683 if (!dr_equal_offsets_p (dra
, drb
))
2685 cmp
= compare_tree (DR_OFFSET (dra
), DR_OFFSET (drb
));
2690 /* Put reads before writes. */
2691 if (DR_IS_READ (dra
) != DR_IS_READ (drb
))
2692 return DR_IS_READ (dra
) ? -1 : 1;
2694 /* Then sort after access size. */
2695 if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra
))),
2696 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb
))), 0))
2698 cmp
= compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra
))),
2699 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb
))));
2704 /* And after step. */
2705 if (!operand_equal_p (DR_STEP (dra
), DR_STEP (drb
), 0))
2707 cmp
= compare_tree (DR_STEP (dra
), DR_STEP (drb
));
2712 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2713 cmp
= tree_int_cst_compare (DR_INIT (dra
), DR_INIT (drb
));
2715 return gimple_uid (DR_STMT (dra
)) < gimple_uid (DR_STMT (drb
)) ? -1 : 1;
2719 /* Function vect_analyze_data_ref_accesses.
2721 Analyze the access pattern of all the data references in the loop.
2723 FORNOW: the only access pattern that is considered vectorizable is a
2724 simple step 1 (consecutive) access.
2726 FORNOW: handle only arrays and pointer accesses. */
2729 vect_analyze_data_ref_accesses (vec_info
*vinfo
)
2732 vec
<data_reference_p
> datarefs
= vinfo
->datarefs
;
2733 struct data_reference
*dr
;
2735 if (dump_enabled_p ())
2736 dump_printf_loc (MSG_NOTE
, vect_location
,
2737 "=== vect_analyze_data_ref_accesses ===\n");
2739 if (datarefs
.is_empty ())
2742 /* Sort the array of datarefs to make building the interleaving chains
2743 linear. Don't modify the original vector's order, it is needed for
2744 determining what dependencies are reversed. */
2745 vec
<data_reference_p
> datarefs_copy
= datarefs
.copy ();
2746 datarefs_copy
.qsort (dr_group_sort_cmp
);
2748 /* Build the interleaving chains. */
2749 for (i
= 0; i
< datarefs_copy
.length () - 1;)
2751 data_reference_p dra
= datarefs_copy
[i
];
2752 stmt_vec_info stmtinfo_a
= vinfo_for_stmt (DR_STMT (dra
));
2753 stmt_vec_info lastinfo
= NULL
;
2754 if (! STMT_VINFO_VECTORIZABLE (stmtinfo_a
))
2759 for (i
= i
+ 1; i
< datarefs_copy
.length (); ++i
)
2761 data_reference_p drb
= datarefs_copy
[i
];
2762 stmt_vec_info stmtinfo_b
= vinfo_for_stmt (DR_STMT (drb
));
2763 if (! STMT_VINFO_VECTORIZABLE (stmtinfo_b
))
2766 /* ??? Imperfect sorting (non-compatible types, non-modulo
2767 accesses, same accesses) can lead to a group to be artificially
2768 split here as we don't just skip over those. If it really
2769 matters we can push those to a worklist and re-iterate
2770 over them. The we can just skip ahead to the next DR here. */
2772 /* DRs in a different loop should not be put into the same
2773 interleaving group. */
2774 if (gimple_bb (DR_STMT (dra
))->loop_father
2775 != gimple_bb (DR_STMT (drb
))->loop_father
)
2778 /* Check that the data-refs have same first location (except init)
2779 and they are both either store or load (not load and store,
2780 not masked loads or stores). */
2781 if (DR_IS_READ (dra
) != DR_IS_READ (drb
)
2782 || !operand_equal_p (DR_BASE_ADDRESS (dra
),
2783 DR_BASE_ADDRESS (drb
), 0)
2784 || !dr_equal_offsets_p (dra
, drb
)
2785 || !gimple_assign_single_p (DR_STMT (dra
))
2786 || !gimple_assign_single_p (DR_STMT (drb
)))
2789 /* Check that the data-refs have the same constant size. */
2790 tree sza
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra
)));
2791 tree szb
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb
)));
2792 if (!tree_fits_uhwi_p (sza
)
2793 || !tree_fits_uhwi_p (szb
)
2794 || !tree_int_cst_equal (sza
, szb
))
2797 /* Check that the data-refs have the same step. */
2798 if (!operand_equal_p (DR_STEP (dra
), DR_STEP (drb
), 0))
2801 /* Do not place the same access in the interleaving chain twice. */
2802 if (tree_int_cst_compare (DR_INIT (dra
), DR_INIT (drb
)) == 0)
2805 /* Check the types are compatible.
2806 ??? We don't distinguish this during sorting. */
2807 if (!types_compatible_p (TREE_TYPE (DR_REF (dra
)),
2808 TREE_TYPE (DR_REF (drb
))))
2811 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2812 HOST_WIDE_INT init_a
= TREE_INT_CST_LOW (DR_INIT (dra
));
2813 HOST_WIDE_INT init_b
= TREE_INT_CST_LOW (DR_INIT (drb
));
2814 gcc_assert (init_a
<= init_b
);
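          /* (E.g. 4-byte accesses at constant offsets 0 and 8: the difference
             8 is a multiple of the element size, so the pair may form an
             interleaving chain with a one-element gap between the members.)  */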
2816 /* If init_b == init_a + the size of the type * k, we have an
2817 interleaving, and DRA is accessed before DRB. */
2818 HOST_WIDE_INT type_size_a
= tree_to_uhwi (sza
);
2819 if (type_size_a
== 0
2820 || (init_b
- init_a
) % type_size_a
!= 0)
2823 /* If we have a store, the accesses are adjacent. This splits
2824 groups into chunks we support (we don't support vectorization
2825 of stores with gaps). */
2826 if (!DR_IS_READ (dra
)
2827 && (init_b
- (HOST_WIDE_INT
) TREE_INT_CST_LOW
2828 (DR_INIT (datarefs_copy
[i
-1]))
2832 /* If the step (if not zero or non-constant) is greater than the
2833 difference between data-refs' inits this splits groups into
2835 if (tree_fits_shwi_p (DR_STEP (dra
)))
2837 HOST_WIDE_INT step
= tree_to_shwi (DR_STEP (dra
));
2838 if (step
!= 0 && step
<= (init_b
- init_a
))
2842 if (dump_enabled_p ())
2844 dump_printf_loc (MSG_NOTE
, vect_location
,
2845 "Detected interleaving ");
2846 if (DR_IS_READ (dra
))
2847 dump_printf (MSG_NOTE
, "load ");
2849 dump_printf (MSG_NOTE
, "store ");
2850 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dra
));
2851 dump_printf (MSG_NOTE
, " and ");
2852 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (drb
));
2853 dump_printf (MSG_NOTE
, "\n");
2856 /* Link the found element into the group list. */
2857 if (!GROUP_FIRST_ELEMENT (stmtinfo_a
))
2859 GROUP_FIRST_ELEMENT (stmtinfo_a
) = DR_STMT (dra
);
2860 lastinfo
= stmtinfo_a
;
2862 GROUP_FIRST_ELEMENT (stmtinfo_b
) = DR_STMT (dra
);
2863 GROUP_NEXT_ELEMENT (lastinfo
) = DR_STMT (drb
);
2864 lastinfo
= stmtinfo_b
;
2868 FOR_EACH_VEC_ELT (datarefs_copy
, i
, dr
)
2869 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr
)))
2870 && !vect_analyze_data_ref_access (dr
))
2872 if (dump_enabled_p ())
2873 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2874 "not vectorized: complicated access pattern.\n");
2876 if (is_a
<bb_vec_info
> (vinfo
))
2878 /* Mark the statement as not vectorizable. */
2879 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr
))) = false;
2884 datarefs_copy
.release ();
2889 datarefs_copy
.release ();

/* Operator == between two dr_with_seg_len objects.

   This equality operator is used to make sure two data refs
   are the same one so that we will consider to combine the
   aliasing checks of those two pairs of data dependent data
   refs.  */

static bool
operator == (const dr_with_seg_len& d1,
             const dr_with_seg_len& d2)
{
  return operand_equal_p (DR_BASE_ADDRESS (d1.dr),
                          DR_BASE_ADDRESS (d2.dr), 0)
           && compare_tree (DR_OFFSET (d1.dr), DR_OFFSET (d2.dr)) == 0
           && compare_tree (DR_INIT (d1.dr), DR_INIT (d2.dr)) == 0
           && compare_tree (d1.seg_len, d2.seg_len) == 0;
}
2912 /* Function comp_dr_with_seg_len_pair.
2914 Comparison function for sorting objects of dr_with_seg_len_pair_t
2915 so that we can combine aliasing checks in one scan. */
2918 comp_dr_with_seg_len_pair (const void *pa_
, const void *pb_
)
2920 const dr_with_seg_len_pair_t
* pa
= (const dr_with_seg_len_pair_t
*) pa_
;
2921 const dr_with_seg_len_pair_t
* pb
= (const dr_with_seg_len_pair_t
*) pb_
;
2922 const dr_with_seg_len
&a1
= pa
->first
, &a2
= pa
->second
;
2923 const dr_with_seg_len
&b1
= pb
->first
, &b2
= pb
->second
;
2925 /* For DR pairs (a, b) and (c, d), we only consider to merge the alias checks
2926 if a and c have the same basic address snd step, and b and d have the same
2927 address and step. Therefore, if any a&c or b&d don't have the same address
2928 and step, we don't care the order of those two pairs after sorting. */
2931 if ((comp_res
= compare_tree (DR_BASE_ADDRESS (a1
.dr
),
2932 DR_BASE_ADDRESS (b1
.dr
))) != 0)
2934 if ((comp_res
= compare_tree (DR_BASE_ADDRESS (a2
.dr
),
2935 DR_BASE_ADDRESS (b2
.dr
))) != 0)
2937 if ((comp_res
= compare_tree (DR_STEP (a1
.dr
), DR_STEP (b1
.dr
))) != 0)
2939 if ((comp_res
= compare_tree (DR_STEP (a2
.dr
), DR_STEP (b2
.dr
))) != 0)
2941 if ((comp_res
= compare_tree (DR_OFFSET (a1
.dr
), DR_OFFSET (b1
.dr
))) != 0)
2943 if ((comp_res
= compare_tree (DR_INIT (a1
.dr
), DR_INIT (b1
.dr
))) != 0)
2945 if ((comp_res
= compare_tree (DR_OFFSET (a2
.dr
), DR_OFFSET (b2
.dr
))) != 0)
2947 if ((comp_res
= compare_tree (DR_INIT (a2
.dr
), DR_INIT (b2
.dr
))) != 0)

/* Function vect_vfa_segment_size.

   Create an expression that computes the size of segment
   that will be accessed for a data reference.  The function takes into
   account that realignment loads may access one more vector.

   Input:
     DR: The data reference.
     LENGTH_FACTOR: segment length to consider.

   Return an expression whose value is the size of segment which will be
   accessed by DR.  */

static tree
vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
{
  tree segment_length;

  if (integer_zerop (DR_STEP (dr)))
    segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
  else
    segment_length = size_binop (MULT_EXPR,
                                 fold_convert (sizetype, DR_STEP (dr)),
                                 fold_convert (sizetype, length_factor));

  if (vect_supportable_dr_alignment (dr, false)
        == dr_explicit_realign_optimized)
    {
      tree vector_size = TYPE_SIZE_UNIT
                          (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));

      segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
    }
  return segment_length;
}

/* Function vect_no_alias_p.

   Given data references A and B with equal base and offset, for which the
   alias relation can be decided at compilation time, return TRUE if they do
   not alias each other; return FALSE otherwise.  SEGMENT_LENGTH_A and
   SEGMENT_LENGTH_B are the memory lengths accessed by A and B
   respectively.  */

static bool
vect_no_alias_p (struct data_reference *a, struct data_reference *b,
                 tree segment_length_a, tree segment_length_b)
{
  gcc_assert (TREE_CODE (DR_INIT (a)) == INTEGER_CST
              && TREE_CODE (DR_INIT (b)) == INTEGER_CST);
  if (tree_int_cst_equal (DR_INIT (a), DR_INIT (b)))
    return false;

  tree seg_a_min = DR_INIT (a);
  tree seg_a_max = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_a_min),
                                seg_a_min, segment_length_a);
  /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
     bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
     [a, a+12).  */
  if (tree_int_cst_compare (DR_STEP (a), size_zero_node) < 0)
    {
      tree unit_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (a)));
      seg_a_min = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_a_max),
                               seg_a_max, unit_size);
      seg_a_max = fold_build2 (PLUS_EXPR, TREE_TYPE (DR_INIT (a)),
                               DR_INIT (a), unit_size);
    }
  tree seg_b_min = DR_INIT (b);
  tree seg_b_max = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_b_min),
                                seg_b_min, segment_length_b);
  if (tree_int_cst_compare (DR_STEP (b), size_zero_node) < 0)
    {
      tree unit_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (b)));
      seg_b_min = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_b_max),
                               seg_b_max, unit_size);
      seg_b_max = fold_build2 (PLUS_EXPR, TREE_TYPE (DR_INIT (b)),
                               DR_INIT (b), unit_size);
    }

  if (tree_int_cst_le (seg_a_max, seg_b_min)
      || tree_int_cst_le (seg_b_max, seg_a_min))
    return true;

  return false;
}
3039 /* Function vect_prune_runtime_alias_test_list.
3041 Prune a list of ddrs to be tested at run-time by versioning for alias.
3042 Merge several alias checks into one if possible.
3043 Return FALSE if resulting list of ddrs is longer then allowed by
3044 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3047 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo
)
3049 vec
<ddr_p
> may_alias_ddrs
=
3050 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo
);
3051 vec
<dr_with_seg_len_pair_t
>& comp_alias_ddrs
=
3052 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo
);
3053 int vect_factor
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
3054 tree scalar_loop_iters
= LOOP_VINFO_NITERS (loop_vinfo
);
3060 if (dump_enabled_p ())
3061 dump_printf_loc (MSG_NOTE
, vect_location
,
3062 "=== vect_prune_runtime_alias_test_list ===\n");
3064 if (may_alias_ddrs
.is_empty ())
3067 /* Basically, for each pair of dependent data refs store_ptr_0
3068 and load_ptr_0, we create an expression:
3070 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
3071 || (load_ptr_0 + load_segment_length_0) <= store_ptr_0))
3073 for aliasing checks. However, in some cases we can decrease
3074 the number of checks by combining two checks into one. For
3075 example, suppose we have another pair of data refs store_ptr_0
3076 and load_ptr_1, and if the following condition is satisfied:
3078 load_ptr_0 < load_ptr_1 &&
3079 load_ptr_1 - load_ptr_0 - load_segment_length_0 < store_segment_length_0
3081 (this condition means, in each iteration of vectorized loop,
3082 the accessed memory of store_ptr_0 cannot be between the memory
3083 of load_ptr_0 and load_ptr_1.)
3085 we then can use only the following expression to finish the
3086 alising checks between store_ptr_0 & load_ptr_0 and
3087 store_ptr_0 & load_ptr_1:
3089 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
3090 || (load_ptr_1 + load_segment_length_1 <= store_ptr_0))
3092 Note that we only consider that load_ptr_0 and load_ptr_1 have the
3093 same basic address. */
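  /* Concrete instance: with both load segment lengths equal to 16 bytes and
     load_ptr_1 == load_ptr_0 + 8, the condition above holds, so a single
     check of store_ptr_0's segment against the range from load_ptr_0 up to
     load_ptr_1 + 16 subsumes both original checks.  */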
3095 comp_alias_ddrs
.create (may_alias_ddrs
.length ());
3097 /* First, we collect all data ref pairs for aliasing checks. */
3098 FOR_EACH_VEC_ELT (may_alias_ddrs
, i
, ddr
)
3101 struct data_reference
*dr_a
, *dr_b
;
3102 gimple
*dr_group_first_a
, *dr_group_first_b
;
3103 tree segment_length_a
, segment_length_b
;
3104 gimple
*stmt_a
, *stmt_b
;
3107 stmt_a
= DR_STMT (DDR_A (ddr
));
3108 dr_group_first_a
= GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a
));
3109 if (dr_group_first_a
)
3111 stmt_a
= dr_group_first_a
;
3112 dr_a
= STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a
));
3116 stmt_b
= DR_STMT (DDR_B (ddr
));
3117 dr_group_first_b
= GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b
));
3118 if (dr_group_first_b
)
3120 stmt_b
= dr_group_first_b
;
3121 dr_b
= STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b
));
3124 if (!operand_equal_p (DR_STEP (dr_a
), DR_STEP (dr_b
), 0))
3125 length_factor
= scalar_loop_iters
;
3127 length_factor
= size_int (vect_factor
);
3128 segment_length_a
= vect_vfa_segment_size (dr_a
, length_factor
);
3129 segment_length_b
= vect_vfa_segment_size (dr_b
, length_factor
);
3131 comp_res
= compare_tree (DR_BASE_ADDRESS (dr_a
), DR_BASE_ADDRESS (dr_b
));
3133 comp_res
= compare_tree (DR_OFFSET (dr_a
), DR_OFFSET (dr_b
));
3135 /* Alias is known at compilation time. */
3137 && TREE_CODE (DR_STEP (dr_a
)) == INTEGER_CST
3138 && TREE_CODE (DR_STEP (dr_b
)) == INTEGER_CST
3139 && TREE_CODE (segment_length_a
) == INTEGER_CST
3140 && TREE_CODE (segment_length_b
) == INTEGER_CST
)
3142 if (vect_no_alias_p (dr_a
, dr_b
, segment_length_a
, segment_length_b
))
3145 if (dump_enabled_p ())
3146 dump_printf_loc (MSG_NOTE
, vect_location
,
3147 "not vectorized: compilation time alias.\n");
3152 dr_with_seg_len_pair_t dr_with_seg_len_pair
3153 (dr_with_seg_len (dr_a
, segment_length_a
),
3154 dr_with_seg_len (dr_b
, segment_length_b
));
3156 /* Canonicalize pairs by sorting the two DR members. */
3158 std::swap (dr_with_seg_len_pair
.first
, dr_with_seg_len_pair
.second
);
3160 comp_alias_ddrs
.safe_push (dr_with_seg_len_pair
);
3163 /* Second, we sort the collected data ref pairs so that we can scan
3164 them once to combine all possible aliasing checks. */
3165 comp_alias_ddrs
.qsort (comp_dr_with_seg_len_pair
);
3167 /* Third, we scan the sorted dr pairs and check if we can combine
3168 alias checks of two neighboring dr pairs. */
3169 for (size_t i
= 1; i
< comp_alias_ddrs
.length (); ++i
)
3171 /* Deal with two ddrs (dr_a1, dr_b1) and (dr_a2, dr_b2). */
3172 dr_with_seg_len
*dr_a1
= &comp_alias_ddrs
[i
-1].first
,
3173 *dr_b1
= &comp_alias_ddrs
[i
-1].second
,
3174 *dr_a2
= &comp_alias_ddrs
[i
].first
,
3175 *dr_b2
= &comp_alias_ddrs
[i
].second
;
3177 /* Remove duplicate data ref pairs. */
3178 if (*dr_a1
== *dr_a2
&& *dr_b1
== *dr_b2
)
3180 if (dump_enabled_p ())
3182 dump_printf_loc (MSG_NOTE
, vect_location
,
3183 "found equal ranges ");
3184 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3185 DR_REF (dr_a1
->dr
));
3186 dump_printf (MSG_NOTE
, ", ");
3187 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3188 DR_REF (dr_b1
->dr
));
3189 dump_printf (MSG_NOTE
, " and ");
3190 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3191 DR_REF (dr_a2
->dr
));
3192 dump_printf (MSG_NOTE
, ", ");
3193 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3194 DR_REF (dr_b2
->dr
));
3195 dump_printf (MSG_NOTE
, "\n");
3198 comp_alias_ddrs
.ordered_remove (i
--);
3202 if (*dr_a1
== *dr_a2
|| *dr_b1
== *dr_b2
)
3204 /* We consider the case that DR_B1 and DR_B2 are same memrefs,
3205 and DR_A1 and DR_A2 are two consecutive memrefs. */
3206 if (*dr_a1
== *dr_a2
)
3208 std::swap (dr_a1
, dr_b1
);
3209 std::swap (dr_a2
, dr_b2
);
3212 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a1
->dr
),
3213 DR_BASE_ADDRESS (dr_a2
->dr
), 0)
3214 || !operand_equal_p (DR_OFFSET (dr_a1
->dr
),
3215 DR_OFFSET (dr_a2
->dr
), 0)
3216 || !tree_fits_shwi_p (DR_INIT (dr_a1
->dr
))
3217 || !tree_fits_shwi_p (DR_INIT (dr_a2
->dr
)))
3220 /* Make sure dr_a1 starts left of dr_a2. */
3221 if (tree_int_cst_lt (DR_INIT (dr_a2
->dr
), DR_INIT (dr_a1
->dr
)))
3222 std::swap (*dr_a1
, *dr_a2
);
3224 bool do_remove
= false;
3225 unsigned HOST_WIDE_INT diff
3226 = (tree_to_shwi (DR_INIT (dr_a2
->dr
))
3227 - tree_to_shwi (DR_INIT (dr_a1
->dr
)));
3229 /* If the left segment does not extend beyond the start of the
3230 right segment the new segment length is that of the right
3231 plus the segment distance. */
3232 if (tree_fits_uhwi_p (dr_a1
->seg_len
)
3233 && compare_tree_int (dr_a1
->seg_len
, diff
) <= 0)
3235 dr_a1
->seg_len
= size_binop (PLUS_EXPR
, dr_a2
->seg_len
,
3239 /* Generally the new segment length is the maximum of the
3240 left segment size and the right segment size plus the distance.
3241 ??? We can also build tree MAX_EXPR here but it's not clear this
3243 else if (tree_fits_uhwi_p (dr_a1
->seg_len
)
3244 && tree_fits_uhwi_p (dr_a2
->seg_len
))
3246 unsigned HOST_WIDE_INT seg_len_a1
= tree_to_uhwi (dr_a1
->seg_len
);
3247 unsigned HOST_WIDE_INT seg_len_a2
= tree_to_uhwi (dr_a2
->seg_len
);
3248 dr_a1
->seg_len
= size_int (MAX (seg_len_a1
, diff
+ seg_len_a2
));
3251 /* Now we check if the following condition is satisfied:
3253 DIFF - SEGMENT_LENGTH_A < SEGMENT_LENGTH_B
3255 where DIFF = DR_A2_INIT - DR_A1_INIT. However,
3256 SEGMENT_LENGTH_A or SEGMENT_LENGTH_B may not be constant so we
3257 have to make a best estimation. We can get the minimum value
3258 of SEGMENT_LENGTH_B as a constant, represented by MIN_SEG_LEN_B,
3259 then either of the following two conditions can guarantee the
3262 1: DIFF <= MIN_SEG_LEN_B
3263 2: DIFF - SEGMENT_LENGTH_A < MIN_SEG_LEN_B */
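      /* E.g. DIFF == 8 with MIN_SEG_LEN_B == 16 satisfies condition 1, so the
         check for (dr_a2, dr_b2) can be folded into the one for (dr_a1, dr_b1)
         by extending dr_a1's segment and dropping the second pair.  */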
3266 unsigned HOST_WIDE_INT min_seg_len_b
3267 = (tree_fits_uhwi_p (dr_b1
->seg_len
)
3268 ? tree_to_uhwi (dr_b1
->seg_len
)
3271 if (diff
<= min_seg_len_b
3272 || (tree_fits_uhwi_p (dr_a1
->seg_len
)
3273 && diff
- tree_to_uhwi (dr_a1
->seg_len
) < min_seg_len_b
))
3275 dr_a1
->seg_len
= size_binop (PLUS_EXPR
,
3276 dr_a2
->seg_len
, size_int (diff
));
3283 if (dump_enabled_p ())
3285 dump_printf_loc (MSG_NOTE
, vect_location
,
3286 "merging ranges for ");
3287 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dr_a1
->dr
));
3288 dump_printf (MSG_NOTE
, ", ");
3289 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dr_b1
->dr
));
3290 dump_printf (MSG_NOTE
, " and ");
3291 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dr_a2
->dr
));
3292 dump_printf (MSG_NOTE
, ", ");
3293 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dr_b2
->dr
));
3294 dump_printf (MSG_NOTE
, "\n");
3296 comp_alias_ddrs
.ordered_remove (i
--);
3301 dump_printf_loc (MSG_NOTE
, vect_location
,
3302 "improved number of alias checks from %d to %d\n",
3303 may_alias_ddrs
.length (), comp_alias_ddrs
.length ());
3304 if ((int) comp_alias_ddrs
.length () >
3305 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS
))
3307 if (dump_enabled_p ())
3308 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3309 "number of versioning for alias "
3310 "run-time tests exceeds %d "
3311 "(--param vect-max-version-for-alias-checks)\n",
3312 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS
));
3316 /* All alias checks have been resolved at compilation time. */
3317 if (!comp_alias_ddrs
.length ())
3318 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo
).truncate (0);
3323 /* Return true if a non-affine read or write in STMT is suitable for a
3324 gather load or scatter store. Describe the operation in *INFO if so. */
3327 vect_check_gather_scatter (gimple
*stmt
, loop_vec_info loop_vinfo
,
3328 gather_scatter_info
*info
)
3330 HOST_WIDE_INT scale
= 1, pbitpos
, pbitsize
;
3331 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
3332 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
3333 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
3334 tree offtype
= NULL_TREE
;
3335 tree decl
, base
, off
;
3337 int punsignedp
, reversep
, pvolatilep
= 0;
3340 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3341 see if we can use the def stmt of the address. */
3342 if (is_gimple_call (stmt
)
3343 && gimple_call_internal_p (stmt
)
3344 && (gimple_call_internal_fn (stmt
) == IFN_MASK_LOAD
3345 || gimple_call_internal_fn (stmt
) == IFN_MASK_STORE
)
3346 && TREE_CODE (base
) == MEM_REF
3347 && TREE_CODE (TREE_OPERAND (base
, 0)) == SSA_NAME
3348 && integer_zerop (TREE_OPERAND (base
, 1))
3349 && !expr_invariant_in_loop_p (loop
, TREE_OPERAND (base
, 0)))
3351 gimple
*def_stmt
= SSA_NAME_DEF_STMT (TREE_OPERAND (base
, 0));
3352 if (is_gimple_assign (def_stmt
)
3353 && gimple_assign_rhs_code (def_stmt
) == ADDR_EXPR
)
3354 base
= TREE_OPERAND (gimple_assign_rhs1 (def_stmt
), 0);
3357 /* The gather and scatter builtins need address of the form
3358 loop_invariant + vector * {1, 2, 4, 8}
3360 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3361 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3362 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3363 multiplications and additions in it. To get a vector, we need
3364 a single SSA_NAME that will be defined in the loop and will
3365 contain everything that is not loop invariant and that can be
3366 vectorized. The following code attempts to find such a preexistng
3367 SSA_NAME OFF and put the loop invariants into a tree BASE
3368 that can be gimplified before the loop. */
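  /* Illustrative case (hypothetical names): for a load from a[k] with 4-byte
     elements, where k is defined inside the loop, the walk below should end
     up with BASE == &a, OFF == k and SCALE == 4.  */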
3369 base
= get_inner_reference (base
, &pbitsize
, &pbitpos
, &off
, &pmode
,
3370 &punsignedp
, &reversep
, &pvolatilep
);
3371 gcc_assert (base
&& (pbitpos
% BITS_PER_UNIT
) == 0 && !reversep
);
3373 if (TREE_CODE (base
) == MEM_REF
)
3375 if (!integer_zerop (TREE_OPERAND (base
, 1)))
3377 if (off
== NULL_TREE
)
3379 offset_int moff
= mem_ref_offset (base
);
3380 off
= wide_int_to_tree (sizetype
, moff
);
3383 off
= size_binop (PLUS_EXPR
, off
,
3384 fold_convert (sizetype
, TREE_OPERAND (base
, 1)));
3386 base
= TREE_OPERAND (base
, 0);
3389 base
= build_fold_addr_expr (base
);
3391 if (off
== NULL_TREE
)
3392 off
= size_zero_node
;
3394 /* If base is not loop invariant, either off is 0, then we start with just
3395 the constant offset in the loop invariant BASE and continue with base
3396 as OFF, otherwise give up.
3397 We could handle that case by gimplifying the addition of base + off
3398 into some SSA_NAME and use that as off, but for now punt. */
3399 if (!expr_invariant_in_loop_p (loop
, base
))
3401 if (!integer_zerop (off
))
3404 base
= size_int (pbitpos
/ BITS_PER_UNIT
);
3406 /* Otherwise put base + constant offset into the loop invariant BASE
3407 and continue with OFF. */
3410 base
= fold_convert (sizetype
, base
);
3411 base
= size_binop (PLUS_EXPR
, base
, size_int (pbitpos
/ BITS_PER_UNIT
));
3414 /* OFF at this point may be either a SSA_NAME or some tree expression
3415 from get_inner_reference. Try to peel off loop invariants from it
3416 into BASE as long as possible. */
3418 while (offtype
== NULL_TREE
)
3420 enum tree_code code
;
3421 tree op0
, op1
, add
= NULL_TREE
;
3423 if (TREE_CODE (off
) == SSA_NAME
)
3425 gimple
*def_stmt
= SSA_NAME_DEF_STMT (off
);
3427 if (expr_invariant_in_loop_p (loop
, off
))
3430 if (gimple_code (def_stmt
) != GIMPLE_ASSIGN
)
3433 op0
= gimple_assign_rhs1 (def_stmt
);
3434 code
= gimple_assign_rhs_code (def_stmt
);
3435 op1
= gimple_assign_rhs2 (def_stmt
);
3439 if (get_gimple_rhs_class (TREE_CODE (off
)) == GIMPLE_TERNARY_RHS
)
3441 code
= TREE_CODE (off
);
3442 extract_ops_from_tree (off
, &code
, &op0
, &op1
);
3446 case POINTER_PLUS_EXPR
:
3448 if (expr_invariant_in_loop_p (loop
, op0
))
3453 add
= fold_convert (sizetype
, add
);
3455 add
= size_binop (MULT_EXPR
, add
, size_int (scale
));
3456 base
= size_binop (PLUS_EXPR
, base
, add
);
3459 if (expr_invariant_in_loop_p (loop
, op1
))
3467 if (expr_invariant_in_loop_p (loop
, op1
))
3469 add
= fold_convert (sizetype
, op1
);
3470 add
= size_binop (MINUS_EXPR
, size_zero_node
, add
);
3476 if (scale
== 1 && tree_fits_shwi_p (op1
))
3478 scale
= tree_to_shwi (op1
);
3487 if (!POINTER_TYPE_P (TREE_TYPE (op0
))
3488 && !INTEGRAL_TYPE_P (TREE_TYPE (op0
)))
3490 if (TYPE_PRECISION (TREE_TYPE (op0
))
3491 == TYPE_PRECISION (TREE_TYPE (off
)))
3496 if (TYPE_PRECISION (TREE_TYPE (op0
))
3497 < TYPE_PRECISION (TREE_TYPE (off
)))
3500 offtype
= TREE_TYPE (off
);
3511 /* If at the end OFF still isn't a SSA_NAME or isn't
3512 defined in the loop, punt. */
3513 if (TREE_CODE (off
) != SSA_NAME
3514 || expr_invariant_in_loop_p (loop
, off
))
3517 if (offtype
== NULL_TREE
)
3518 offtype
= TREE_TYPE (off
);
3520 if (DR_IS_READ (dr
))
3521 decl
= targetm
.vectorize
.builtin_gather (STMT_VINFO_VECTYPE (stmt_info
),
3524 decl
= targetm
.vectorize
.builtin_scatter (STMT_VINFO_VECTYPE (stmt_info
),
3527 if (decl
== NULL_TREE
)
3533 info
->offset_dt
= vect_unknown_def_type
;
3534 info
->offset_vectype
= NULL_TREE
;
3535 info
->scale
= scale
;
3539 /* Function vect_analyze_data_refs.
3541 Find all the data references in the loop or basic block.
3543 The general structure of the analysis of data refs in the vectorizer is as
3545 1- vect_analyze_data_refs(loop/bb): call
3546 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3547 in the loop/bb and their dependences.
3548 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3549 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3550 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3555 vect_analyze_data_refs (vec_info
*vinfo
, int *min_vf
)
3557 struct loop
*loop
= NULL
;
3559 struct data_reference
*dr
;
3562 if (dump_enabled_p ())
3563 dump_printf_loc (MSG_NOTE
, vect_location
,
3564 "=== vect_analyze_data_refs ===\n");
3566 if (loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
))
3567 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
3569 /* Go through the data-refs, check that the analysis succeeded. Update
3570 pointer from stmt_vec_info struct to DR and vectype. */
3572 vec
<data_reference_p
> datarefs
= vinfo
->datarefs
;
3573 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
3576 stmt_vec_info stmt_info
;
3577 tree base
, offset
, init
;
3578 enum { SG_NONE
, GATHER
, SCATTER
} gatherscatter
= SG_NONE
;
3579 bool simd_lane_access
= false;
3583 if (!dr
|| !DR_REF (dr
))
3585 if (dump_enabled_p ())
3586 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3587 "not vectorized: unhandled data-ref\n");
3591 stmt
= DR_STMT (dr
);
3592 stmt_info
= vinfo_for_stmt (stmt
);
3594 /* Discard clobbers from the dataref vector. We will remove
3595 clobber stmts during vectorization. */
3596 if (gimple_clobber_p (stmt
))
3599 if (i
== datarefs
.length () - 1)
3604 datarefs
.ordered_remove (i
);
3609 /* Check that analysis of the data-ref succeeded. */
3610 if (!DR_BASE_ADDRESS (dr
) || !DR_OFFSET (dr
) || !DR_INIT (dr
)
3615 && !TREE_THIS_VOLATILE (DR_REF (dr
))
3616 && targetm
.vectorize
.builtin_gather
!= NULL
;
3619 && !TREE_THIS_VOLATILE (DR_REF (dr
))
3620 && targetm
.vectorize
.builtin_scatter
!= NULL
;
3621 bool maybe_simd_lane_access
3622 = is_a
<loop_vec_info
> (vinfo
) && loop
->simduid
;
3624 /* If target supports vector gather loads or scatter stores, or if
3625 this might be a SIMD lane access, see if they can't be used. */
3626 if (is_a
<loop_vec_info
> (vinfo
)
3627 && (maybe_gather
|| maybe_scatter
|| maybe_simd_lane_access
)
3628 && !nested_in_vect_loop_p (loop
, stmt
))
3630 struct data_reference
*newdr
3631 = create_data_ref (NULL
, loop_containing_stmt (stmt
),
3632 DR_REF (dr
), stmt
, maybe_scatter
? false : true);
3633 gcc_assert (newdr
!= NULL
&& DR_REF (newdr
));
3634 if (DR_BASE_ADDRESS (newdr
)
3635 && DR_OFFSET (newdr
)
3638 && integer_zerop (DR_STEP (newdr
)))
3640 if (maybe_simd_lane_access
)
3642 tree off
= DR_OFFSET (newdr
);
3644 if (TREE_CODE (DR_INIT (newdr
)) == INTEGER_CST
3645 && TREE_CODE (off
) == MULT_EXPR
3646 && tree_fits_uhwi_p (TREE_OPERAND (off
, 1)))
3648 tree step
= TREE_OPERAND (off
, 1);
3649 off
= TREE_OPERAND (off
, 0);
3651 if (CONVERT_EXPR_P (off
)
3652 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off
,
3654 < TYPE_PRECISION (TREE_TYPE (off
)))
3655 off
= TREE_OPERAND (off
, 0);
3656 if (TREE_CODE (off
) == SSA_NAME
)
3658 gimple
*def
= SSA_NAME_DEF_STMT (off
);
3659 tree reft
= TREE_TYPE (DR_REF (newdr
));
3660 if (is_gimple_call (def
)
3661 && gimple_call_internal_p (def
)
3662 && (gimple_call_internal_fn (def
)
3663 == IFN_GOMP_SIMD_LANE
))
3665 tree arg
= gimple_call_arg (def
, 0);
3666 gcc_assert (TREE_CODE (arg
) == SSA_NAME
);
3667 arg
= SSA_NAME_VAR (arg
);
3668 if (arg
== loop
->simduid
3670 && tree_int_cst_equal
3671 (TYPE_SIZE_UNIT (reft
),
3674 DR_OFFSET (newdr
) = ssize_int (0);
3675 DR_STEP (newdr
) = step
;
3676 DR_ALIGNED_TO (newdr
)
3677 = size_int (BIGGEST_ALIGNMENT
);
3679 simd_lane_access
= true;
3685 if (!simd_lane_access
&& (maybe_gather
|| maybe_scatter
))
3689 gatherscatter
= GATHER
;
3691 gatherscatter
= SCATTER
;
3694 if (gatherscatter
== SG_NONE
&& !simd_lane_access
)
3695 free_data_ref (newdr
);
3698 if (gatherscatter
== SG_NONE
&& !simd_lane_access
)
3700 if (dump_enabled_p ())
3702 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3703 "not vectorized: data ref analysis "
3705 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3708 if (is_a
<bb_vec_info
> (vinfo
))
3715 if (TREE_CODE (DR_BASE_ADDRESS (dr
)) == INTEGER_CST
)
3717 if (dump_enabled_p ())
3718 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3719 "not vectorized: base addr of dr is a "
3722 if (is_a
<bb_vec_info
> (vinfo
))
3725 if (gatherscatter
!= SG_NONE
|| simd_lane_access
)
3730 if (TREE_THIS_VOLATILE (DR_REF (dr
)))
3732 if (dump_enabled_p ())
3734 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3735 "not vectorized: volatile type ");
3736 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3739 if (is_a
<bb_vec_info
> (vinfo
))
3745 if (stmt_can_throw_internal (stmt
))
3747 if (dump_enabled_p ())
3749 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3750 "not vectorized: statement can throw an "
3752 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3755 if (is_a
<bb_vec_info
> (vinfo
))
3758 if (gatherscatter
!= SG_NONE
|| simd_lane_access
)
3763 if (TREE_CODE (DR_REF (dr
)) == COMPONENT_REF
3764 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr
), 1)))
3766 if (dump_enabled_p ())
3768 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3769 "not vectorized: statement is bitfield "
3771 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3774 if (is_a
<bb_vec_info
> (vinfo
))
3777 if (gatherscatter
!= SG_NONE
|| simd_lane_access
)
3782 base
= unshare_expr (DR_BASE_ADDRESS (dr
));
3783 offset
= unshare_expr (DR_OFFSET (dr
));
3784 init
= unshare_expr (DR_INIT (dr
));
3786 if (is_gimple_call (stmt
)
3787 && (!gimple_call_internal_p (stmt
)
3788 || (gimple_call_internal_fn (stmt
) != IFN_MASK_LOAD
3789 && gimple_call_internal_fn (stmt
) != IFN_MASK_STORE
)))
3791 if (dump_enabled_p ())
3793 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3794 "not vectorized: dr in a call ");
3795 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3798 if (is_a
<bb_vec_info
> (vinfo
))
3801 if (gatherscatter
!= SG_NONE
|| simd_lane_access
)
3806 /* Update DR field in stmt_vec_info struct. */
3808 /* If the dataref is in an inner-loop of the loop that is considered for
3809 for vectorization, we also want to analyze the access relative to
3810 the outer-loop (DR contains information only relative to the
3811 inner-most enclosing loop). We do that by building a reference to the
3812 first location accessed by the inner-loop, and analyze it relative to
3814 if (loop
&& nested_in_vect_loop_p (loop
, stmt
))
3816 tree outer_step
, outer_base
, outer_init
;
3817 HOST_WIDE_INT pbitsize
, pbitpos
;
3820 int punsignedp
, preversep
, pvolatilep
;
3821 affine_iv base_iv
, offset_iv
;
3824 /* Build a reference to the first location accessed by the
3825 inner-loop: *(BASE+INIT). (The first location is actually
3826 BASE+INIT+OFFSET, but we add OFFSET separately later). */
3827 tree inner_base
= build_fold_indirect_ref
3828 (fold_build_pointer_plus (base
, init
));
3830 if (dump_enabled_p ())
3832 dump_printf_loc (MSG_NOTE
, vect_location
,
3833 "analyze in outer-loop: ");
3834 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, inner_base
);
3835 dump_printf (MSG_NOTE
, "\n");
3838 outer_base
= get_inner_reference (inner_base
, &pbitsize
, &pbitpos
,
3839 &poffset
, &pmode
, &punsignedp
,
3840 &preversep
, &pvolatilep
);
3841 gcc_assert (outer_base
!= NULL_TREE
);
3843 if (pbitpos
% BITS_PER_UNIT
!= 0)
3845 if (dump_enabled_p ())
3846 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3847 "failed: bit offset alignment.\n");
3853 if (dump_enabled_p ())
3854 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3855 "failed: reverse storage order.\n");
3859 outer_base
= build_fold_addr_expr (outer_base
);
3860 if (!simple_iv (loop
, loop_containing_stmt (stmt
), outer_base
,
3863 if (dump_enabled_p ())
3864 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3865 "failed: evolution of base is not affine.\n");
3872 poffset
= fold_build2 (PLUS_EXPR
, TREE_TYPE (offset
), offset
,
3880 offset_iv
.base
= ssize_int (0);
3881 offset_iv
.step
= ssize_int (0);
3883 else if (!simple_iv (loop
, loop_containing_stmt (stmt
), poffset
,
3886 if (dump_enabled_p ())
3887 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3888 "evolution of offset is not affine.\n");
3892 outer_init
= ssize_int (pbitpos
/ BITS_PER_UNIT
);
3893 split_constant_offset (base_iv
.base
, &base_iv
.base
, &dinit
);
3894 outer_init
= size_binop (PLUS_EXPR
, outer_init
, dinit
);
3895 split_constant_offset (offset_iv
.base
, &offset_iv
.base
, &dinit
);
3896 outer_init
= size_binop (PLUS_EXPR
, outer_init
, dinit
);
3898 outer_step
= size_binop (PLUS_EXPR
,
3899 fold_convert (ssizetype
, base_iv
.step
),
3900 fold_convert (ssizetype
, offset_iv
.step
));
          STMT_VINFO_DR_STEP (stmt_info) = outer_step;
          /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
          STMT_VINFO_DR_BASE_ADDRESS (stmt_info) = base_iv.base;
          STMT_VINFO_DR_INIT (stmt_info) = outer_init;
          STMT_VINFO_DR_OFFSET (stmt_info) =
            fold_convert (ssizetype, offset_iv.base);
          STMT_VINFO_DR_ALIGNED_TO (stmt_info) =
            size_int (highest_pow2_factor (offset_iv.base));
3911 if (dump_enabled_p ())
3913 dump_printf_loc (MSG_NOTE
, vect_location
,
3914 "\touter base_address: ");
3915 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3916 STMT_VINFO_DR_BASE_ADDRESS (stmt_info
));
3917 dump_printf (MSG_NOTE
, "\n\touter offset from base address: ");
3918 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3919 STMT_VINFO_DR_OFFSET (stmt_info
));
3920 dump_printf (MSG_NOTE
,
3921 "\n\touter constant offset from base address: ");
3922 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3923 STMT_VINFO_DR_INIT (stmt_info
));
3924 dump_printf (MSG_NOTE
, "\n\touter step: ");
3925 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3926 STMT_VINFO_DR_STEP (stmt_info
));
3927 dump_printf (MSG_NOTE
, "\n\touter aligned to: ");
3928 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3929 STMT_VINFO_DR_ALIGNED_TO (stmt_info
));
3930 dump_printf (MSG_NOTE
, "\n");
3934 if (STMT_VINFO_DATA_REF (stmt_info
))
3936 if (dump_enabled_p ())
3938 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3939 "not vectorized: more than one data ref "
3941 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3944 if (is_a
<bb_vec_info
> (vinfo
))
3947 if (gatherscatter
!= SG_NONE
|| simd_lane_access
)
3952 STMT_VINFO_DATA_REF (stmt_info
) = dr
;
3953 if (simd_lane_access
)
3955 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info
) = true;
3956 free_data_ref (datarefs
[i
]);
      /* Set vectype for STMT.  */
      scalar_type = TREE_TYPE (DR_REF (dr));
      STMT_VINFO_VECTYPE (stmt_info)
        = get_vectype_for_scalar_type (scalar_type);
3964 if (!STMT_VINFO_VECTYPE (stmt_info
))
3966 if (dump_enabled_p ())
3968 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3969 "not vectorized: no vectype for stmt: ");
3970 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3971 dump_printf (MSG_MISSED_OPTIMIZATION
, " scalar_type: ");
3972 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_DETAILS
,
3974 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
3977 if (is_a
<bb_vec_info
> (vinfo
))
              /* No vector type is fine, the ref can still participate
                 in dependence analysis, we just can't vectorize it.  */
              STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3985 if (gatherscatter
!= SG_NONE
|| simd_lane_access
)
3987 STMT_VINFO_DATA_REF (stmt_info
) = NULL
;
3988 if (gatherscatter
!= SG_NONE
)
3995 if (dump_enabled_p ())
3997 dump_printf_loc (MSG_NOTE
, vect_location
,
3998 "got vectype for stmt: ");
3999 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, stmt
, 0);
4000 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
4001 STMT_VINFO_VECTYPE (stmt_info
));
4002 dump_printf (MSG_NOTE
, "\n");
      /* Adjust the minimal vectorization factor according to the
         vector type.  */
      vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
4012 if (gatherscatter
!= SG_NONE
)
4014 gather_scatter_info gs_info
;
4015 if (!vect_check_gather_scatter (stmt
, as_a
<loop_vec_info
> (vinfo
),
4017 || !get_vectype_for_scalar_type (TREE_TYPE (gs_info
.offset
)))
4019 STMT_VINFO_DATA_REF (stmt_info
) = NULL
;
4021 if (dump_enabled_p ())
4023 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4024 (gatherscatter
== GATHER
) ?
4025 "not vectorized: not suitable for gather "
4027 "not vectorized: not suitable for scatter "
4029 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
4034 free_data_ref (datarefs
[i
]);
4036 STMT_VINFO_GATHER_SCATTER_P (stmt_info
) = gatherscatter
;
4039 else if (is_a
<loop_vec_info
> (vinfo
)
4040 && TREE_CODE (DR_STEP (dr
)) != INTEGER_CST
)
4042 if (nested_in_vect_loop_p (loop
, stmt
))
4044 if (dump_enabled_p ())
4046 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4047 "not vectorized: not suitable for strided "
4049 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
4053 STMT_VINFO_STRIDED_P (stmt_info
) = true;
  /* If we stopped analysis at the first dataref we could not analyze
     when trying to vectorize a basic-block, mark the rest of the datarefs
     as not vectorizable and truncate the vector of datarefs.  That
     avoids spending useless time in analyzing their dependence.  */
  if (i != datarefs.length ())
    {
      gcc_assert (is_a <bb_vec_info> (vinfo));
      for (unsigned j = i; j < datarefs.length (); ++j)
        {
          data_reference_p dr = datarefs[j];
          STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
        }
      datarefs.truncate (i);
    }
/* Function vect_get_new_vect_var.

   Returns a name for a new variable.  The current naming scheme appends the
   prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
   the name of vectorizer generated variables, and appends that to NAME if
   provided.  */
4085 vect_get_new_vect_var (tree type
, enum vect_var_kind var_kind
, const char *name
)
4092 case vect_simple_var
:
4095 case vect_scalar_var
:
4101 case vect_pointer_var
:
4110 char* tmp
= concat (prefix
, "_", name
, NULL
);
4111 new_vect_var
= create_tmp_reg (type
, tmp
);
4115 new_vect_var
= create_tmp_reg (type
, prefix
);
4117 return new_vect_var
;
/* Like vect_get_new_vect_var but return an SSA name.  */

vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4130 case vect_simple_var
:
4133 case vect_scalar_var
:
4136 case vect_pointer_var
:
4145 char* tmp
= concat (prefix
, "_", name
, NULL
);
4146 new_vect_var
= make_temp_ssa_name (type
, NULL
, tmp
);
4150 new_vect_var
= make_temp_ssa_name (type
, NULL
, prefix
);
4152 return new_vect_var
;
/* Duplicate ptr info and set alignment/misalignment on NAME from DR.  */

vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr,
                                  stmt_vec_info stmt_info)
{
  duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
  unsigned int align = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmt_info));
  int misalign = DR_MISALIGNMENT (dr);
  if (misalign == -1)
    mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
  else
    set_ptr_info_alignment (SSA_NAME_PTR_INFO (name), align, misalign);
}
4170 /* Function vect_create_addr_base_for_vector_ref.
4172 Create an expression that computes the address of the first memory location
4173 that will be accessed for a data reference.
4176 STMT: The statement containing the data reference.
4177 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
   OFFSET: Optional.  If supplied, it is added to the initial address.
4179 LOOP: Specify relative to which loop-nest should the address be computed.
4180 For example, when the dataref is in an inner-loop nested in an
4181 outer-loop that is now being vectorized, LOOP can be either the
4182 outer-loop, or the inner-loop. The first memory location accessed
4183 by the following dataref ('in' points to short):
4190 if LOOP=i_loop: &in (relative to i_loop)
4191 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4192 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4193 initial address. Unlike OFFSET, which is number of elements to
4194 be added, BYTE_OFFSET is measured in bytes.
4197 1. Return an SSA_NAME whose value is the address of the memory location of
4198 the first vector of the data reference.
4199 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4200 these statement(s) which define the returned SSA_NAME.
4202 FORNOW: We are only handling array accesses with step 1. */
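/* As an illustration of what is built below: the returned address is
   essentially

     base_address + (offset + init) + OFFSET * step + BYTE_OFFSET

   where base_address/offset/init come from the data reference (or from the
   STMT_VINFO_DR_* fields when analyzing relative to an outer loop), step is
   the element size in bytes, and the OFFSET/BYTE_OFFSET terms are only
   added when those arguments are given.  */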
4205 vect_create_addr_base_for_vector_ref (gimple
*stmt
,
4206 gimple_seq
*new_stmt_list
,
4211 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
4212 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
4214 const char *base_name
;
4217 gimple_seq seq
= NULL
;
4221 tree step
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr
)));
4222 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
4224 if (loop_vinfo
&& loop
&& loop
!= (gimple_bb (stmt
))->loop_father
)
4226 struct loop
*outer_loop
= LOOP_VINFO_LOOP (loop_vinfo
);
4228 gcc_assert (nested_in_vect_loop_p (outer_loop
, stmt
));
4230 data_ref_base
= unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info
));
4231 base_offset
= unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info
));
4232 init
= unshare_expr (STMT_VINFO_DR_INIT (stmt_info
));
4236 data_ref_base
= unshare_expr (DR_BASE_ADDRESS (dr
));
4237 base_offset
= unshare_expr (DR_OFFSET (dr
));
4238 init
= unshare_expr (DR_INIT (dr
));
4242 base_name
= get_name (data_ref_base
);
4245 base_offset
= ssize_int (0);
4246 init
= ssize_int (0);
4247 base_name
= get_name (DR_REF (dr
));
4250 /* Create base_offset */
4251 base_offset
= size_binop (PLUS_EXPR
,
4252 fold_convert (sizetype
, base_offset
),
4253 fold_convert (sizetype
, init
));
4257 offset
= fold_build2 (MULT_EXPR
, sizetype
,
4258 fold_convert (sizetype
, offset
), step
);
4259 base_offset
= fold_build2 (PLUS_EXPR
, sizetype
,
4260 base_offset
, offset
);
4264 byte_offset
= fold_convert (sizetype
, byte_offset
);
4265 base_offset
= fold_build2 (PLUS_EXPR
, sizetype
,
4266 base_offset
, byte_offset
);
4269 /* base + base_offset */
4271 addr_base
= fold_build_pointer_plus (data_ref_base
, base_offset
);
4274 addr_base
= build1 (ADDR_EXPR
,
4275 build_pointer_type (TREE_TYPE (DR_REF (dr
))),
4276 unshare_expr (DR_REF (dr
)));
  vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
  dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
  addr_base = force_gimple_operand (addr_base, &seq, true, dest);
  gimple_seq_add_seq (new_stmt_list, seq);

  if (DR_PTR_INFO (dr)
      && TREE_CODE (addr_base) == SSA_NAME
      && !SSA_NAME_PTR_INFO (addr_base))
    {
      vect_duplicate_ssa_name_ptr_info (addr_base, dr, stmt_info);
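      /* OFFSET and BYTE_OFFSET move the access away from the location the
         data reference was analyzed at, so any alignment recorded for DR no
         longer applies to the new address.  */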
      if (offset || byte_offset)
        mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
    }
4293 if (dump_enabled_p ())
4295 dump_printf_loc (MSG_NOTE
, vect_location
, "created ");
4296 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, addr_base
);
4297 dump_printf (MSG_NOTE
, "\n");
4304 /* Function vect_create_data_ref_ptr.
4306 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4307 location accessed in the loop by STMT, along with the def-use update
4308 chain to appropriately advance the pointer through the loop iterations.
4309 Also set aliasing information for the pointer. This pointer is used by
4310 the callers to this function to create a memory reference expression for
4311 vector load/store access.
4314 1. STMT: a stmt that references memory. Expected to be of the form
4315 GIMPLE_ASSIGN <name, data-ref> or
4316 GIMPLE_ASSIGN <data-ref, name>.
4317 2. AGGR_TYPE: the type of the reference, which should be either a vector
4319 3. AT_LOOP: the loop where the vector memref is to be created.
4320 4. OFFSET (optional): an offset to be added to the initial address accessed
4321 by the data-ref in STMT.
4322 5. BSI: location where the new stmts are to be placed if there is no loop
4323 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4324 pointing to the initial address.
4325 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4326 to the initial address accessed by the data-ref in STMT. This is
4327 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4331 1. Declare a new ptr to vector_type, and have it point to the base of the
      data reference (initial address accessed by the data reference).
4333 For example, for vector of type V8HI, the following code is generated:
4336 ap = (v8hi *)initial_address;
4338 if OFFSET is not supplied:
4339 initial_address = &a[init];
4340 if OFFSET is supplied:
4341 initial_address = &a[init + OFFSET];
4342 if BYTE_OFFSET is supplied:
4343 initial_address = &a[init] + BYTE_OFFSET;
4345 Return the initial_address in INITIAL_ADDRESS.
4347 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4348 update the pointer in each iteration of the loop.
4350 Return the increment stmt that updates the pointer in PTR_INCR.
4352 3. Set INV_P to true if the access pattern of the data reference in the
4353 vectorized loop is invariant. Set it to false otherwise.
4355 4. Return the pointer. */
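/* For example (with illustrative SSA names): for AGGR_TYPE V4SI and a
   positive constant DR step, the code created below amounts to

     preheader:
       ap_0 = <initial_address>;
     loop:
       ap_1 = PHI <ap_0 (preheader), ap_2 (latch)>
       ...
       ap_2 = ap_1 + 16;       # bump by TYPE_SIZE_UNIT (AGGR_TYPE)  */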
4358 vect_create_data_ref_ptr (gimple
*stmt
, tree aggr_type
, struct loop
*at_loop
,
4359 tree offset
, tree
*initial_address
,
4360 gimple_stmt_iterator
*gsi
, gimple
**ptr_incr
,
4361 bool only_init
, bool *inv_p
, tree byte_offset
)
4363 const char *base_name
;
4364 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
4365 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
4366 struct loop
*loop
= NULL
;
4367 bool nested_in_vect_loop
= false;
4368 struct loop
*containing_loop
= NULL
;
4372 gimple_seq new_stmt_list
= NULL
;
4376 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
4378 gimple_stmt_iterator incr_gsi
;
4380 tree indx_before_incr
, indx_after_incr
;
4383 bb_vec_info bb_vinfo
= STMT_VINFO_BB_VINFO (stmt_info
);
4385 gcc_assert (TREE_CODE (aggr_type
) == ARRAY_TYPE
4386 || TREE_CODE (aggr_type
) == VECTOR_TYPE
);
4390 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
4391 nested_in_vect_loop
= nested_in_vect_loop_p (loop
, stmt
);
4392 containing_loop
= (gimple_bb (stmt
))->loop_father
;
4393 pe
= loop_preheader_edge (loop
);
4397 gcc_assert (bb_vinfo
);
4402 /* Check the step (evolution) of the load in LOOP, and record
4403 whether it's invariant. */
4404 if (nested_in_vect_loop
)
4405 step
= STMT_VINFO_DR_STEP (stmt_info
);
4407 step
= DR_STEP (STMT_VINFO_DATA_REF (stmt_info
));
4409 if (integer_zerop (step
))
4414 /* Create an expression for the first address accessed by this load
4416 base_name
= get_name (DR_BASE_ADDRESS (dr
));
4418 if (dump_enabled_p ())
4420 tree dr_base_type
= TREE_TYPE (DR_BASE_OBJECT (dr
));
4421 dump_printf_loc (MSG_NOTE
, vect_location
,
4422 "create %s-pointer variable to type: ",
4423 get_tree_code_name (TREE_CODE (aggr_type
)));
4424 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, aggr_type
);
4425 if (TREE_CODE (dr_base_type
) == ARRAY_TYPE
)
4426 dump_printf (MSG_NOTE
, " vectorizing an array ref: ");
4427 else if (TREE_CODE (dr_base_type
) == VECTOR_TYPE
)
4428 dump_printf (MSG_NOTE
, " vectorizing a vector ref: ");
4429 else if (TREE_CODE (dr_base_type
) == RECORD_TYPE
)
4430 dump_printf (MSG_NOTE
, " vectorizing a record based array ref: ");
4432 dump_printf (MSG_NOTE
, " vectorizing a pointer ref: ");
4433 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_BASE_OBJECT (dr
));
4434 dump_printf (MSG_NOTE
, "\n");
  /* (1) Create the new aggregate-pointer variable.
     Vector and array types inherit the alias set of their component
     type by default so we need to use a ref-all pointer if the data
     reference does not conflict with the created aggregated data
     reference because it is not addressable.  */
  bool need_ref_all = false;
  if (!alias_sets_conflict_p (get_alias_set (aggr_type),
                              get_alias_set (DR_REF (dr))))
    need_ref_all = true;
  /* Likewise for any of the data references in the stmt group.  */
  else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4449 gimple
*orig_stmt
= STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info
);
4452 stmt_vec_info sinfo
= vinfo_for_stmt (orig_stmt
);
4453 struct data_reference
*sdr
= STMT_VINFO_DATA_REF (sinfo
);
4454 if (!alias_sets_conflict_p (get_alias_set (aggr_type
),
4455 get_alias_set (DR_REF (sdr
))))
4457 need_ref_all
= true;
4460 orig_stmt
= STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo
);
4464 aggr_ptr_type
= build_pointer_type_for_mode (aggr_type
, ptr_mode
,
4466 aggr_ptr
= vect_get_new_vect_var (aggr_ptr_type
, vect_pointer_var
, base_name
);
4469 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4470 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4471 def-use update cycles for the pointer: one relative to the outer-loop
4472 (LOOP), which is what steps (3) and (4) below do. The other is relative
4473 to the inner-loop (which is the inner-most loop containing the dataref),
     and this is done by step (5) below.
4476 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4477 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4478 redundant. Steps (3),(4) create the following:
4481 LOOP: vp1 = phi(vp0,vp2)
4487 If there is an inner-loop nested in loop, then step (5) will also be
4488 applied, and an additional update in the inner-loop will be created:
4491 LOOP: vp1 = phi(vp0,vp2)
4493 inner: vp3 = phi(vp1,vp4)
4494 vp4 = vp3 + inner_step
  /* (2) Calculate the initial address of the aggregate-pointer, and set
     the aggregate-pointer to point to it before the loop.  */

  /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader.  */

  new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
                                                   offset, loop, byte_offset);
4511 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, new_stmt_list
);
4512 gcc_assert (!new_bb
);
4515 gsi_insert_seq_before (gsi
, new_stmt_list
, GSI_SAME_STMT
);
4518 *initial_address
= new_temp
;
4519 aggr_ptr_init
= new_temp
;
  /* (3) Handle the updating of the aggregate-pointer inside the loop.
     This is needed when ONLY_INIT is false, and also when AT_LOOP is the
     inner-loop nested in LOOP (during outer-loop vectorization).  */

  /* No update in loop is required.  */
  if (only_init && (!loop_vinfo || at_loop == loop))
    aptr = aggr_ptr_init;
  else
    {
      /* The step of the aggregate pointer is the type size.  */
      tree iv_step = TYPE_SIZE_UNIT (aggr_type);
      /* One exception to the above is when the scalar step of the load in
         LOOP is zero.  In this case the step here is also zero.  */
      if (*inv_p)
        iv_step = size_zero_node;
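      /* A negative scalar DR step means the vectorized accesses move
         backwards through memory, so the pointer below is decremented by
         the aggregate size in each iteration.  */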
      else if (tree_int_cst_sgn (step) == -1)
        iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4539 standard_iv_increment_position (loop
, &incr_gsi
, &insert_after
);
4541 create_iv (aggr_ptr_init
,
4542 fold_convert (aggr_ptr_type
, iv_step
),
4543 aggr_ptr
, loop
, &incr_gsi
, insert_after
,
4544 &indx_before_incr
, &indx_after_incr
);
4545 incr
= gsi_stmt (incr_gsi
);
4546 set_vinfo_for_stmt (incr
, new_stmt_vec_info (incr
, loop_vinfo
));
4548 /* Copy the points-to information if it exists. */
4549 if (DR_PTR_INFO (dr
))
4551 vect_duplicate_ssa_name_ptr_info (indx_before_incr
, dr
, stmt_info
);
4552 vect_duplicate_ssa_name_ptr_info (indx_after_incr
, dr
, stmt_info
);
4557 aptr
= indx_before_incr
;
4560 if (!nested_in_vect_loop
|| only_init
)
  /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
     nested in LOOP, if it exists.  */
4567 gcc_assert (nested_in_vect_loop
);
4570 standard_iv_increment_position (containing_loop
, &incr_gsi
,
4572 create_iv (aptr
, fold_convert (aggr_ptr_type
, DR_STEP (dr
)), aggr_ptr
,
4573 containing_loop
, &incr_gsi
, insert_after
, &indx_before_incr
,
4575 incr
= gsi_stmt (incr_gsi
);
4576 set_vinfo_for_stmt (incr
, new_stmt_vec_info (incr
, loop_vinfo
));
4578 /* Copy the points-to information if it exists. */
4579 if (DR_PTR_INFO (dr
))
4581 vect_duplicate_ssa_name_ptr_info (indx_before_incr
, dr
, stmt_info
);
4582 vect_duplicate_ssa_name_ptr_info (indx_after_incr
, dr
, stmt_info
);
4587 return indx_before_incr
;
4594 /* Function bump_vector_ptr
4596 Increment a pointer (to a vector type) by vector-size. If requested,
4597 i.e. if PTR-INCR is given, then also connect the new increment stmt
4598 to the existing def-use update-chain of the pointer, by modifying
4599 the PTR_INCR as illustrated below:
4601 The pointer def-use update-chain before this function:
4602 DATAREF_PTR = phi (p_0, p_2)
4604 PTR_INCR: p_2 = DATAREF_PTR + step
4606 The pointer def-use update-chain after this function:
4607 DATAREF_PTR = phi (p_0, p_2)
4609 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4611 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4614 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4616 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4617 the loop. The increment amount across iterations is expected
4619 BSI - location where the new update stmt is to be placed.
4620 STMT - the original scalar memory-access stmt that is being vectorized.
4621 BUMP - optional. The offset by which to bump the pointer. If not given,
4622 the offset is assumed to be vector_size.
4624 Output: Return NEW_DATAREF_PTR as illustrated above.
4629 bump_vector_ptr (tree dataref_ptr
, gimple
*ptr_incr
, gimple_stmt_iterator
*gsi
,
4630 gimple
*stmt
, tree bump
)
4632 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
4633 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
4634 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
4635 tree update
= TYPE_SIZE_UNIT (vectype
);
4638 use_operand_p use_p
;
4639 tree new_dataref_ptr
;
4644 if (TREE_CODE (dataref_ptr
) == SSA_NAME
)
4645 new_dataref_ptr
= copy_ssa_name (dataref_ptr
);
4647 new_dataref_ptr
= make_ssa_name (TREE_TYPE (dataref_ptr
));
4648 incr_stmt
= gimple_build_assign (new_dataref_ptr
, POINTER_PLUS_EXPR
,
4649 dataref_ptr
, update
);
4650 vect_finish_stmt_generation (stmt
, incr_stmt
, gsi
);
4652 /* Copy the points-to information if it exists. */
4653 if (DR_PTR_INFO (dr
))
4655 duplicate_ssa_name_ptr_info (new_dataref_ptr
, DR_PTR_INFO (dr
));
4656 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr
));
4660 return new_dataref_ptr
;
  /* Update the vector-pointer's cross-iteration increment.  */
  FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
    {
      tree use = USE_FROM_PTR (use_p);

      if (use == dataref_ptr)
        SET_USE (use_p, new_dataref_ptr);
      else
        gcc_assert (tree_int_cst_compare (use, update) == 0);
    }

  return new_dataref_ptr;
/* Function vect_create_destination_var.

   Create a new temporary of type VECTYPE.  */
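/* For example: for a scalar destination x_7 and a (non-boolean) vector
   VECTYPE, the new temporary is named "vect_x_7".  */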
4682 vect_create_destination_var (tree scalar_dest
, tree vectype
)
4688 enum vect_var_kind kind
;
4691 ? VECTOR_BOOLEAN_TYPE_P (vectype
)
4695 type
= vectype
? vectype
: TREE_TYPE (scalar_dest
);
4697 gcc_assert (TREE_CODE (scalar_dest
) == SSA_NAME
);
4699 name
= get_name (scalar_dest
);
4701 new_name
= xasprintf ("%s_%u", name
, SSA_NAME_VERSION (scalar_dest
));
4703 new_name
= xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest
));
4704 vec_dest
= vect_get_new_vect_var (type
, kind
, new_name
);
/* Function vect_grouped_store_supported.

   Returns TRUE if interleave high and interleave low permutations
   are supported, and FALSE otherwise.  */
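/* For example, for two V4SI vectors {0 1 2 3} and {4 5 6 7} interleave_high
   produces {0 4 1 5} and interleave_low produces {2 6 3 7}; the code below
   checks that the target supports the permutations required for the given
   group size COUNT.  */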
4716 vect_grouped_store_supported (tree vectype
, unsigned HOST_WIDE_INT count
)
4718 machine_mode mode
= TYPE_MODE (vectype
);
4720 /* vect_permute_store_chain requires the group size to be equal to 3 or
4721 be a power of two. */
4722 if (count
!= 3 && exact_log2 (count
) == -1)
4724 if (dump_enabled_p ())
4725 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4726 "the size of the group of accesses"
                         " is not a power of 2 or not equal to 3\n");
4731 /* Check that the permutation is supported. */
4732 if (VECTOR_MODE_P (mode
))
4734 unsigned int i
, nelt
= GET_MODE_NUNITS (mode
);
4735 unsigned char *sel
= XALLOCAVEC (unsigned char, nelt
);
4739 unsigned int j0
= 0, j1
= 0, j2
= 0;
4742 for (j
= 0; j
< 3; j
++)
4744 int nelt0
= ((3 - j
) * nelt
) % 3;
4745 int nelt1
= ((3 - j
) * nelt
+ 1) % 3;
4746 int nelt2
= ((3 - j
) * nelt
+ 2) % 3;
4747 for (i
= 0; i
< nelt
; i
++)
4749 if (3 * i
+ nelt0
< nelt
)
4750 sel
[3 * i
+ nelt0
] = j0
++;
4751 if (3 * i
+ nelt1
< nelt
)
4752 sel
[3 * i
+ nelt1
] = nelt
+ j1
++;
4753 if (3 * i
+ nelt2
< nelt
)
4754 sel
[3 * i
+ nelt2
] = 0;
4756 if (!can_vec_perm_p (mode
, false, sel
))
4758 if (dump_enabled_p ())
4759 dump_printf (MSG_MISSED_OPTIMIZATION
,
                     "permutation op not supported by target.\n");
4764 for (i
= 0; i
< nelt
; i
++)
4766 if (3 * i
+ nelt0
< nelt
)
4767 sel
[3 * i
+ nelt0
] = 3 * i
+ nelt0
;
4768 if (3 * i
+ nelt1
< nelt
)
4769 sel
[3 * i
+ nelt1
] = 3 * i
+ nelt1
;
4770 if (3 * i
+ nelt2
< nelt
)
4771 sel
[3 * i
+ nelt2
] = nelt
+ j2
++;
4773 if (!can_vec_perm_p (mode
, false, sel
))
4775 if (dump_enabled_p ())
4776 dump_printf (MSG_MISSED_OPTIMIZATION
,
                     "permutation op not supported by target.\n");
4785 /* If length is not equal to 3 then only power of 2 is supported. */
4786 gcc_assert (pow2p_hwi (count
));
4788 for (i
= 0; i
< nelt
/ 2; i
++)
4791 sel
[i
* 2 + 1] = i
+ nelt
;
4793 if (can_vec_perm_p (mode
, false, sel
))
4795 for (i
= 0; i
< nelt
; i
++)
4797 if (can_vec_perm_p (mode
, false, sel
))
4803 if (dump_enabled_p ())
4804 dump_printf (MSG_MISSED_OPTIMIZATION
,
                 "permutation op not supported by target.\n");
/* Return TRUE if vec_store_lanes is available for COUNT vectors of
   type VECTYPE.  */
4814 vect_store_lanes_supported (tree vectype
, unsigned HOST_WIDE_INT count
)
4816 return vect_lanes_optab_supported_p ("vec_store_lanes",
4817 vec_store_lanes_optab
,
/* Function vect_permute_store_chain.

   Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
   a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
   the data correctly for the stores.  Return the final references for stores
   in RESULT_CHAIN.

   E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
   The input is 4 vectors each containing 8 elements.  We assign a number to
   each element, the input sequence is:

   1st vec:   0  1  2  3  4  5  6  7
   2nd vec:   8  9 10 11 12 13 14 15
   3rd vec:  16 17 18 19 20 21 22 23
   4th vec:  24 25 26 27 28 29 30 31

   The output sequence should be:

   1st vec:  0  8 16 24  1  9 17 25
   2nd vec:  2 10 18 26  3 11 19 27
   3rd vec:  4 12 20 28  5 13 21 29
   4th vec:  6 14 22 30  7 15 23 31

   i.e., we interleave the contents of the four vectors in their order.

   We use interleave_high/low instructions to create such output.  The input
   of each interleave_high/low operation is two vectors:

   1st vec:  0 1 2 3
   2nd vec:  4 5 6 7

   the even elements of the result vector are obtained left-to-right from the
   high/low elements of the first vector.  The odd elements of the result are
   obtained left-to-right from the high/low elements of the second vector.
   The output of interleave_high will be:  0 4 1 5
   and of interleave_low:                  2 6 3 7

   The permutation is done in log LENGTH stages.  In each stage interleave_high
   and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
   where the first argument is taken from the first half of DR_CHAIN and the
   second argument from its second half.  In our example,

   I1: interleave_high (1st vec, 3rd vec)
   I2: interleave_low (1st vec, 3rd vec)
   I3: interleave_high (2nd vec, 4th vec)
   I4: interleave_low (2nd vec, 4th vec)

   The output for the first stage is:

   I1:  0 16  1 17  2 18  3 19
   I2:  4 20  5 21  6 22  7 23
   I3:  8 24  9 25 10 26 11 27
   I4: 12 28 13 29 14 30 15 31

   The output of the second stage, i.e. the final result is:

   I1:  0  8 16 24  1  9 17 25
   I2:  2 10 18 26  3 11 19 27
   I3:  4 12 20 28  5 13 21 29
   I4:  6 14 22 30  7 15 23 31.  */
4884 vect_permute_store_chain (vec
<tree
> dr_chain
,
4885 unsigned int length
,
4887 gimple_stmt_iterator
*gsi
,
4888 vec
<tree
> *result_chain
)
4890 tree vect1
, vect2
, high
, low
;
4892 tree vectype
= STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt
));
4893 tree perm_mask_low
, perm_mask_high
;
4895 tree perm3_mask_low
, perm3_mask_high
;
4896 unsigned int i
, n
, log_length
= exact_log2 (length
);
4897 unsigned int j
, nelt
= TYPE_VECTOR_SUBPARTS (vectype
);
4898 unsigned char *sel
= XALLOCAVEC (unsigned char, nelt
);
4900 result_chain
->quick_grow (length
);
4901 memcpy (result_chain
->address (), dr_chain
.address (),
4902 length
* sizeof (tree
));
4906 unsigned int j0
= 0, j1
= 0, j2
= 0;
4908 for (j
= 0; j
< 3; j
++)
4910 int nelt0
= ((3 - j
) * nelt
) % 3;
4911 int nelt1
= ((3 - j
) * nelt
+ 1) % 3;
4912 int nelt2
= ((3 - j
) * nelt
+ 2) % 3;
4914 for (i
= 0; i
< nelt
; i
++)
4916 if (3 * i
+ nelt0
< nelt
)
4917 sel
[3 * i
+ nelt0
] = j0
++;
4918 if (3 * i
+ nelt1
< nelt
)
4919 sel
[3 * i
+ nelt1
] = nelt
+ j1
++;
4920 if (3 * i
+ nelt2
< nelt
)
4921 sel
[3 * i
+ nelt2
] = 0;
4923 perm3_mask_low
= vect_gen_perm_mask_checked (vectype
, sel
);
4925 for (i
= 0; i
< nelt
; i
++)
4927 if (3 * i
+ nelt0
< nelt
)
4928 sel
[3 * i
+ nelt0
] = 3 * i
+ nelt0
;
4929 if (3 * i
+ nelt1
< nelt
)
4930 sel
[3 * i
+ nelt1
] = 3 * i
+ nelt1
;
4931 if (3 * i
+ nelt2
< nelt
)
4932 sel
[3 * i
+ nelt2
] = nelt
+ j2
++;
4934 perm3_mask_high
= vect_gen_perm_mask_checked (vectype
, sel
);
4936 vect1
= dr_chain
[0];
4937 vect2
= dr_chain
[1];
4939 /* Create interleaving stmt:
4940 low = VEC_PERM_EXPR <vect1, vect2,
4941 {j, nelt, *, j + 1, nelt + j + 1, *,
4942 j + 2, nelt + j + 2, *, ...}> */
4943 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3_low");
4944 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, vect1
,
4945 vect2
, perm3_mask_low
);
4946 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
4949 vect2
= dr_chain
[2];
4950 /* Create interleaving stmt:
4951 low = VEC_PERM_EXPR <vect1, vect2,
4952 {0, 1, nelt + j, 3, 4, nelt + j + 1,
4953 6, 7, nelt + j + 2, ...}> */
4954 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3_high");
4955 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, vect1
,
4956 vect2
, perm3_mask_high
);
4957 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
4958 (*result_chain
)[j
] = data_ref
;
4963 /* If length is not equal to 3 then only power of 2 is supported. */
4964 gcc_assert (pow2p_hwi (length
));
4966 for (i
= 0, n
= nelt
/ 2; i
< n
; i
++)
4969 sel
[i
* 2 + 1] = i
+ nelt
;
4971 perm_mask_high
= vect_gen_perm_mask_checked (vectype
, sel
);
4973 for (i
= 0; i
< nelt
; i
++)
4975 perm_mask_low
= vect_gen_perm_mask_checked (vectype
, sel
);
4977 for (i
= 0, n
= log_length
; i
< n
; i
++)
4979 for (j
= 0; j
< length
/2; j
++)
4981 vect1
= dr_chain
[j
];
4982 vect2
= dr_chain
[j
+length
/2];
4984 /* Create interleaving stmt:
4985 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
4987 high
= make_temp_ssa_name (vectype
, NULL
, "vect_inter_high");
4988 perm_stmt
= gimple_build_assign (high
, VEC_PERM_EXPR
, vect1
,
4989 vect2
, perm_mask_high
);
4990 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
4991 (*result_chain
)[2*j
] = high
;
4993 /* Create interleaving stmt:
4994 low = VEC_PERM_EXPR <vect1, vect2,
4995 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
4997 low
= make_temp_ssa_name (vectype
, NULL
, "vect_inter_low");
4998 perm_stmt
= gimple_build_assign (low
, VEC_PERM_EXPR
, vect1
,
4999 vect2
, perm_mask_low
);
5000 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5001 (*result_chain
)[2*j
+1] = low
;
5003 memcpy (dr_chain
.address (), result_chain
->address (),
5004 length
* sizeof (tree
));
5009 /* Function vect_setup_realignment
5011 This function is called when vectorizing an unaligned load using
5012 the dr_explicit_realign[_optimized] scheme.
5013 This function generates the following code at the loop prolog:
5016 x msq_init = *(floor(p)); # prolog load
5017 realignment_token = call target_builtin;
5019 x msq = phi (msq_init, ---)
5021 The stmts marked with x are generated only for the case of
5022 dr_explicit_realign_optimized.
5024 The code above sets up a new (vector) pointer, pointing to the first
5025 location accessed by STMT, and a "floor-aligned" load using that pointer.
5026 It also generates code to compute the "realignment-token" (if the relevant
5027 target hook was defined), and creates a phi-node at the loop-header bb
5028 whose arguments are the result of the prolog-load (created by this
5029 function) and the result of a load that takes place in the loop (to be
5030 created by the caller to this function).
5032 For the case of dr_explicit_realign_optimized:
5033 The caller to this function uses the phi-result (msq) to create the
5034 realignment code inside the loop, and sets up the missing phi argument,
5037 msq = phi (msq_init, lsq)
5038 lsq = *(floor(p')); # load in loop
5039 result = realign_load (msq, lsq, realignment_token);
5041 For the case of dr_explicit_realign:
5043 msq = *(floor(p)); # load in loop
5045 lsq = *(floor(p')); # load in loop
5046 result = realign_load (msq, lsq, realignment_token);
5049 STMT - (scalar) load stmt to be vectorized. This load accesses
5050 a memory location that may be unaligned.
5051 BSI - place where new code is to be inserted.
5052 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5056 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5057 target hook, if defined.
5058 Return value - the result of the loop-header phi node. */
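/* As an illustration: for a V4SI load that starts one element past an
   aligned boundary, msq holds the aligned vector {a0 a1 a2 a3} containing
   the start of the access and lsq holds the following aligned vector
   {a4 a5 a6 a7}; realign_load combines them into the desired unaligned
   vector {a1 a2 a3 a4}, steered by the realignment token.  */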
5061 vect_setup_realignment (gimple
*stmt
, gimple_stmt_iterator
*gsi
,
5062 tree
*realignment_token
,
5063 enum dr_alignment_support alignment_support_scheme
,
5065 struct loop
**at_loop
)
5067 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
5068 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
5069 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
5070 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
5071 struct loop
*loop
= NULL
;
5073 tree scalar_dest
= gimple_assign_lhs (stmt
);
5079 tree msq_init
= NULL_TREE
;
5082 tree msq
= NULL_TREE
;
5083 gimple_seq stmts
= NULL
;
5085 bool compute_in_loop
= false;
5086 bool nested_in_vect_loop
= false;
5087 struct loop
*containing_loop
= (gimple_bb (stmt
))->loop_father
;
5088 struct loop
*loop_for_initial_load
= NULL
;
5092 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
5093 nested_in_vect_loop
= nested_in_vect_loop_p (loop
, stmt
);
5096 gcc_assert (alignment_support_scheme
== dr_explicit_realign
5097 || alignment_support_scheme
== dr_explicit_realign_optimized
);
5099 /* We need to generate three things:
5100 1. the misalignment computation
5101 2. the extra vector load (for the optimized realignment scheme).
5102 3. the phi node for the two vectors from which the realignment is
5103 done (for the optimized realignment scheme). */
5105 /* 1. Determine where to generate the misalignment computation.
5107 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5108 calculation will be generated by this function, outside the loop (in the
5109 preheader). Otherwise, INIT_ADDR had already been computed for us by the
5110 caller, inside the loop.
5112 Background: If the misalignment remains fixed throughout the iterations of
5113 the loop, then both realignment schemes are applicable, and also the
5114 misalignment computation can be done outside LOOP. This is because we are
5115 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5116 are a multiple of VS (the Vector Size), and therefore the misalignment in
5117 different vectorized LOOP iterations is always the same.
5118 The problem arises only if the memory access is in an inner-loop nested
5119 inside LOOP, which is now being vectorized using outer-loop vectorization.
5120 This is the only case when the misalignment of the memory access may not
5121 remain fixed throughout the iterations of the inner-loop (as explained in
5122 detail in vect_supportable_dr_alignment). In this case, not only is the
5123 optimized realignment scheme not applicable, but also the misalignment
5124 computation (and generation of the realignment token that is passed to
5125 REALIGN_LOAD) have to be done inside the loop.
5127 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5128 or not, which in turn determines if the misalignment is computed inside
5129 the inner-loop, or outside LOOP. */
5131 if (init_addr
!= NULL_TREE
|| !loop_vinfo
)
5133 compute_in_loop
= true;
5134 gcc_assert (alignment_support_scheme
== dr_explicit_realign
);
5138 /* 2. Determine where to generate the extra vector load.
5140 For the optimized realignment scheme, instead of generating two vector
5141 loads in each iteration, we generate a single extra vector load in the
5142 preheader of the loop, and in each iteration reuse the result of the
5143 vector load from the previous iteration. In case the memory access is in
5144 an inner-loop nested inside LOOP, which is now being vectorized using
5145 outer-loop vectorization, we need to determine whether this initial vector
5146 load should be generated at the preheader of the inner-loop, or can be
5147 generated at the preheader of LOOP. If the memory access has no evolution
5148 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5149 to be generated inside LOOP (in the preheader of the inner-loop). */
5151 if (nested_in_vect_loop
)
5153 tree outerloop_step
= STMT_VINFO_DR_STEP (stmt_info
);
5154 bool invariant_in_outerloop
=
5155 (tree_int_cst_compare (outerloop_step
, size_zero_node
) == 0);
5156 loop_for_initial_load
= (invariant_in_outerloop
? loop
: loop
->inner
);
5159 loop_for_initial_load
= loop
;
5161 *at_loop
= loop_for_initial_load
;
5163 if (loop_for_initial_load
)
5164 pe
= loop_preheader_edge (loop_for_initial_load
);
5166 /* 3. For the case of the optimized realignment, create the first vector
5167 load at the loop preheader. */
5169 if (alignment_support_scheme
== dr_explicit_realign_optimized
)
5171 /* Create msq_init = *(floor(p1)) in the loop preheader */
5174 gcc_assert (!compute_in_loop
);
5175 vec_dest
= vect_create_destination_var (scalar_dest
, vectype
);
5176 ptr
= vect_create_data_ref_ptr (stmt
, vectype
, loop_for_initial_load
,
5177 NULL_TREE
, &init_addr
, NULL
, &inc
,
5179 if (TREE_CODE (ptr
) == SSA_NAME
)
5180 new_temp
= copy_ssa_name (ptr
);
5182 new_temp
= make_ssa_name (TREE_TYPE (ptr
));
5183 new_stmt
= gimple_build_assign
5184 (new_temp
, BIT_AND_EXPR
, ptr
,
5185 build_int_cst (TREE_TYPE (ptr
),
5186 -(HOST_WIDE_INT
)TYPE_ALIGN_UNIT (vectype
)));
5187 new_bb
= gsi_insert_on_edge_immediate (pe
, new_stmt
);
5188 gcc_assert (!new_bb
);
5190 = build2 (MEM_REF
, TREE_TYPE (vec_dest
), new_temp
,
5191 build_int_cst (reference_alias_ptr_type (DR_REF (dr
)), 0));
5192 new_stmt
= gimple_build_assign (vec_dest
, data_ref
);
5193 new_temp
= make_ssa_name (vec_dest
, new_stmt
);
5194 gimple_assign_set_lhs (new_stmt
, new_temp
);
5197 new_bb
= gsi_insert_on_edge_immediate (pe
, new_stmt
);
5198 gcc_assert (!new_bb
);
5201 gsi_insert_before (gsi
, new_stmt
, GSI_SAME_STMT
);
5203 msq_init
= gimple_assign_lhs (new_stmt
);
5206 /* 4. Create realignment token using a target builtin, if available.
5207 It is done either inside the containing loop, or before LOOP (as
5208 determined above). */
5210 if (targetm
.vectorize
.builtin_mask_for_load
)
5215 /* Compute INIT_ADDR - the initial addressed accessed by this memref. */
5218 /* Generate the INIT_ADDR computation outside LOOP. */
5219 init_addr
= vect_create_addr_base_for_vector_ref (stmt
, &stmts
,
5223 pe
= loop_preheader_edge (loop
);
5224 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, stmts
);
5225 gcc_assert (!new_bb
);
5228 gsi_insert_seq_before (gsi
, stmts
, GSI_SAME_STMT
);
5231 builtin_decl
= targetm
.vectorize
.builtin_mask_for_load ();
5232 new_stmt
= gimple_build_call (builtin_decl
, 1, init_addr
);
5234 vect_create_destination_var (scalar_dest
,
5235 gimple_call_return_type (new_stmt
));
5236 new_temp
= make_ssa_name (vec_dest
, new_stmt
);
5237 gimple_call_set_lhs (new_stmt
, new_temp
);
5239 if (compute_in_loop
)
5240 gsi_insert_before (gsi
, new_stmt
, GSI_SAME_STMT
);
5243 /* Generate the misalignment computation outside LOOP. */
5244 pe
= loop_preheader_edge (loop
);
5245 new_bb
= gsi_insert_on_edge_immediate (pe
, new_stmt
);
5246 gcc_assert (!new_bb
);
5249 *realignment_token
= gimple_call_lhs (new_stmt
);
5251 /* The result of the CALL_EXPR to this builtin is determined from
5252 the value of the parameter and no global variables are touched
5253 which makes the builtin a "const" function. Requiring the
5254 builtin to have the "const" attribute makes it unnecessary
5255 to call mark_call_clobbered. */
5256 gcc_assert (TREE_READONLY (builtin_decl
));
5259 if (alignment_support_scheme
== dr_explicit_realign
)
5262 gcc_assert (!compute_in_loop
);
5263 gcc_assert (alignment_support_scheme
== dr_explicit_realign_optimized
);
5266 /* 5. Create msq = phi <msq_init, lsq> in loop */
5268 pe
= loop_preheader_edge (containing_loop
);
5269 vec_dest
= vect_create_destination_var (scalar_dest
, vectype
);
5270 msq
= make_ssa_name (vec_dest
);
5271 phi_stmt
= create_phi_node (msq
, containing_loop
->header
);
5272 add_phi_arg (phi_stmt
, msq_init
, pe
, UNKNOWN_LOCATION
);
/* Function vect_grouped_load_supported.

   COUNT is the size of the load group (the number of statements plus the
   number of gaps).  SINGLE_ELEMENT_P is true if there is actually
   only one statement, with a gap of COUNT - 1.

   Returns true if a suitable permute exists.  */
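/* For example, for two V4SI vectors {0 1 2 3} and {4 5 6 7} extract_even
   produces {0 2 4 6} and extract_odd produces {1 3 5 7}; the code below
   checks that the target supports the permutations required for the given
   group size COUNT.  */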
5287 vect_grouped_load_supported (tree vectype
, bool single_element_p
,
5288 unsigned HOST_WIDE_INT count
)
5290 machine_mode mode
= TYPE_MODE (vectype
);
5292 /* If this is single-element interleaving with an element distance
5293 that leaves unused vector loads around punt - we at least create
5294 very sub-optimal code in that case (and blow up memory,
5296 if (single_element_p
&& count
> TYPE_VECTOR_SUBPARTS (vectype
))
5298 if (dump_enabled_p ())
5299 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5300 "single-element interleaving not supported "
5301 "for not adjacent vector loads\n");
5305 /* vect_permute_load_chain requires the group size to be equal to 3 or
5306 be a power of two. */
5307 if (count
!= 3 && exact_log2 (count
) == -1)
5309 if (dump_enabled_p ())
5310 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5311 "the size of the group of accesses"
5312 " is not a power of 2 or not equal to 3\n");
5316 /* Check that the permutation is supported. */
5317 if (VECTOR_MODE_P (mode
))
5319 unsigned int i
, j
, nelt
= GET_MODE_NUNITS (mode
);
5320 unsigned char *sel
= XALLOCAVEC (unsigned char, nelt
);
5325 for (k
= 0; k
< 3; k
++)
5327 for (i
= 0; i
< nelt
; i
++)
5328 if (3 * i
+ k
< 2 * nelt
)
5332 if (!can_vec_perm_p (mode
, false, sel
))
5334 if (dump_enabled_p ())
5335 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5336 "shuffle of 3 loads is not supported by"
5340 for (i
= 0, j
= 0; i
< nelt
; i
++)
5341 if (3 * i
+ k
< 2 * nelt
)
5344 sel
[i
] = nelt
+ ((nelt
+ k
) % 3) + 3 * (j
++);
5345 if (!can_vec_perm_p (mode
, false, sel
))
5347 if (dump_enabled_p ())
5348 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5349 "shuffle of 3 loads is not supported by"
5358 /* If length is not equal to 3 then only power of 2 is supported. */
5359 gcc_assert (pow2p_hwi (count
));
5360 for (i
= 0; i
< nelt
; i
++)
5362 if (can_vec_perm_p (mode
, false, sel
))
5364 for (i
= 0; i
< nelt
; i
++)
5366 if (can_vec_perm_p (mode
, false, sel
))
5372 if (dump_enabled_p ())
5373 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5374 "extract even/odd not supported by target\n");
/* Return TRUE if vec_load_lanes is available for COUNT vectors of
   type VECTYPE.  */
5382 vect_load_lanes_supported (tree vectype
, unsigned HOST_WIDE_INT count
)
5384 return vect_lanes_optab_supported_p ("vec_load_lanes",
5385 vec_load_lanes_optab
,
5389 /* Function vect_permute_load_chain.
5391 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5392 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
   the input data correctly.  Return the final references for loads in
   RESULT_CHAIN.
5396 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5397 The input is 4 vectors each containing 8 elements. We assign a number to each
5398 element, the input sequence is:
5400 1st vec: 0 1 2 3 4 5 6 7
5401 2nd vec: 8 9 10 11 12 13 14 15
5402 3rd vec: 16 17 18 19 20 21 22 23
5403 4th vec: 24 25 26 27 28 29 30 31
5405 The output sequence should be:
5407 1st vec: 0 4 8 12 16 20 24 28
5408 2nd vec: 1 5 9 13 17 21 25 29
5409 3rd vec: 2 6 10 14 18 22 26 30
5410 4th vec: 3 7 11 15 19 23 27 31
5412 i.e., the first output vector should contain the first elements of each
5413 interleaving group, etc.
5415 We use extract_even/odd instructions to create such output. The input of
5416 each extract_even/odd operation is two vectors
5420 and the output is the vector of extracted even/odd elements. The output of
5421 extract_even will be: 0 2 4 6
5422 and of extract_odd: 1 3 5 7
5425 The permutation is done in log LENGTH stages. In each stage extract_even
5426 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5427 their order. In our example,
5429 E1: extract_even (1st vec, 2nd vec)
5430 E2: extract_odd (1st vec, 2nd vec)
5431 E3: extract_even (3rd vec, 4th vec)
5432 E4: extract_odd (3rd vec, 4th vec)
5434 The output for the first stage will be:
5436 E1: 0 2 4 6 8 10 12 14
5437 E2: 1 3 5 7 9 11 13 15
5438 E3: 16 18 20 22 24 26 28 30
5439 E4: 17 19 21 23 25 27 29 31
5441 In order to proceed and create the correct sequence for the next stage (or
5442 for the correct output, if the second stage is the last one, as in our
5443 example), we first put the output of extract_even operation and then the
5444 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5445 The input for the second stage is:
5447 1st vec (E1): 0 2 4 6 8 10 12 14
5448 2nd vec (E3): 16 18 20 22 24 26 28 30
5449 3rd vec (E2): 1 3 5 7 9 11 13 15
5450 4th vec (E4): 17 19 21 23 25 27 29 31
5452 The output of the second stage:
5454 E1: 0 4 8 12 16 20 24 28
5455 E2: 2 6 10 14 18 22 26 30
5456 E3: 1 5 9 13 17 21 25 29
5457 E4: 3 7 11 15 19 23 27 31
5459 And RESULT_CHAIN after reordering:
5461 1st vec (E1): 0 4 8 12 16 20 24 28
5462 2nd vec (E3): 1 5 9 13 17 21 25 29
5463 3rd vec (E2): 2 6 10 14 18 22 26 30
5464 4th vec (E4): 3 7 11 15 19 23 27 31. */
5467 vect_permute_load_chain (vec
<tree
> dr_chain
,
5468 unsigned int length
,
5470 gimple_stmt_iterator
*gsi
,
5471 vec
<tree
> *result_chain
)
5473 tree data_ref
, first_vect
, second_vect
;
5474 tree perm_mask_even
, perm_mask_odd
;
5475 tree perm3_mask_low
, perm3_mask_high
;
5477 tree vectype
= STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt
));
5478 unsigned int i
, j
, log_length
= exact_log2 (length
);
5479 unsigned nelt
= TYPE_VECTOR_SUBPARTS (vectype
);
5480 unsigned char *sel
= XALLOCAVEC (unsigned char, nelt
);
5482 result_chain
->quick_grow (length
);
5483 memcpy (result_chain
->address (), dr_chain
.address (),
5484 length
* sizeof (tree
));
5490 for (k
= 0; k
< 3; k
++)
5492 for (i
= 0; i
< nelt
; i
++)
5493 if (3 * i
+ k
< 2 * nelt
)
5497 perm3_mask_low
= vect_gen_perm_mask_checked (vectype
, sel
);
5499 for (i
= 0, j
= 0; i
< nelt
; i
++)
5500 if (3 * i
+ k
< 2 * nelt
)
5503 sel
[i
] = nelt
+ ((nelt
+ k
) % 3) + 3 * (j
++);
5505 perm3_mask_high
= vect_gen_perm_mask_checked (vectype
, sel
);
5507 first_vect
= dr_chain
[0];
5508 second_vect
= dr_chain
[1];
5510 /* Create interleaving stmt (low part of):
5511 low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5513 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3_low");
5514 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, first_vect
,
5515 second_vect
, perm3_mask_low
);
5516 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5518 /* Create interleaving stmt (high part of):
5519 high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5521 first_vect
= data_ref
;
5522 second_vect
= dr_chain
[2];
5523 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3_high");
5524 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, first_vect
,
5525 second_vect
, perm3_mask_high
);
5526 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5527 (*result_chain
)[k
] = data_ref
;
5532 /* If length is not equal to 3 then only power of 2 is supported. */
5533 gcc_assert (pow2p_hwi (length
));
5535 for (i
= 0; i
< nelt
; ++i
)
5537 perm_mask_even
= vect_gen_perm_mask_checked (vectype
, sel
);
5539 for (i
= 0; i
< nelt
; ++i
)
5541 perm_mask_odd
= vect_gen_perm_mask_checked (vectype
, sel
);
5543 for (i
= 0; i
< log_length
; i
++)
5545 for (j
= 0; j
< length
; j
+= 2)
5547 first_vect
= dr_chain
[j
];
5548 second_vect
= dr_chain
[j
+1];
5550 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5551 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_perm_even");
5552 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5553 first_vect
, second_vect
,
5555 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5556 (*result_chain
)[j
/2] = data_ref
;
5558 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5559 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_perm_odd");
5560 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5561 first_vect
, second_vect
,
5563 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5564 (*result_chain
)[j
/2+length
/2] = data_ref
;
5566 memcpy (dr_chain
.address (), result_chain
->address (),
5567 length
* sizeof (tree
));
5572 /* Function vect_shift_permute_load_chain.
5574 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5575 sequence of stmts to reorder the input data accordingly.
5576 Return the final references for loads in RESULT_CHAIN.
   Return true if successful, false otherwise.
5579 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5580 The input is 3 vectors each containing 8 elements. We assign a
5581 number to each element, the input sequence is:
5583 1st vec: 0 1 2 3 4 5 6 7
5584 2nd vec: 8 9 10 11 12 13 14 15
5585 3rd vec: 16 17 18 19 20 21 22 23
5587 The output sequence should be:
5589 1st vec: 0 3 6 9 12 15 18 21
5590 2nd vec: 1 4 7 10 13 16 19 22
5591 3rd vec: 2 5 8 11 14 17 20 23
5593 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5595 First we shuffle all 3 vectors to get correct elements order:
5597 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
5598 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
5599 3rd vec: (16 19 22) (17 20 23) (18 21)
5601 Next we unite and shift vector 3 times:
5604 shift right by 6 the concatenation of:
5605 "1st vec" and "2nd vec"
5606 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5607 "2nd vec" and "3rd vec"
5608 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5609 "3rd vec" and "1st vec"
5610 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
5613 So that now new vectors are:
5615 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
5616 2nd vec: (10 13) (16 19 22) (17 20 23)
5617 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
5620 shift right by 5 the concatenation of:
5621 "1st vec" and "3rd vec"
5622 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
5623 "2nd vec" and "1st vec"
5624 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
5625 "3rd vec" and "2nd vec"
5626 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
5629 So that now new vectors are:
5631 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
5632 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
5633 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
5636 shift right by 5 the concatenation of:
5637 "1st vec" and "1st vec"
5638 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
5639 shift right by 3 the concatenation of:
5640 "2nd vec" and "2nd vec"
5641 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
5644 So that now all vectors are READY:
5645 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
5646 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
5647 3rd vec: ( 1 4 7) (10 13) (16 19 22)
5649 This algorithm is faster than one in vect_permute_load_chain if:
   1. "shift of a concatenation" is faster than general permutation.
5652 2. The TARGET machine can't execute vector instructions in parallel.
5653 This is because each step of the algorithm depends on previous.
5654 The algorithm in vect_permute_load_chain is much more parallel.
5656 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
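/* Implementation note: in the power-of-two path below each input vector is
   first de-interleaved into its even and odd elements (perm2_mask1 and
   perm2_mask2); a shift of the concatenation of two such results
   (shift1_mask) then collects the odd elements of a pair of vectors, while
   a select (select_mask) collects the even elements.  */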
5660 vect_shift_permute_load_chain (vec
<tree
> dr_chain
,
5661 unsigned int length
,
5663 gimple_stmt_iterator
*gsi
,
5664 vec
<tree
> *result_chain
)
5666 tree vect
[3], vect_shift
[3], data_ref
, first_vect
, second_vect
;
5667 tree perm2_mask1
, perm2_mask2
, perm3_mask
;
5668 tree select_mask
, shift1_mask
, shift2_mask
, shift3_mask
, shift4_mask
;
5671 tree vectype
= STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt
));
5673 unsigned nelt
= TYPE_VECTOR_SUBPARTS (vectype
);
5674 unsigned char *sel
= XALLOCAVEC (unsigned char, nelt
);
5675 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
5676 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
5678 result_chain
->quick_grow (length
);
5679 memcpy (result_chain
->address (), dr_chain
.address (),
5680 length
* sizeof (tree
));
5682 if (pow2p_hwi (length
) && LOOP_VINFO_VECT_FACTOR (loop_vinfo
) > 4)
5684 unsigned int j
, log_length
= exact_log2 (length
);
5685 for (i
= 0; i
< nelt
/ 2; ++i
)
5687 for (i
= 0; i
< nelt
/ 2; ++i
)
5688 sel
[nelt
/ 2 + i
] = i
* 2 + 1;
5689 if (!can_vec_perm_p (TYPE_MODE (vectype
), false, sel
))
5691 if (dump_enabled_p ())
5692 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5693 "shuffle of 2 fields structure is not \
5694 supported by target\n");
5697 perm2_mask1
= vect_gen_perm_mask_checked (vectype
, sel
);
5699 for (i
= 0; i
< nelt
/ 2; ++i
)
5701 for (i
= 0; i
< nelt
/ 2; ++i
)
5702 sel
[nelt
/ 2 + i
] = i
* 2;
5703 if (!can_vec_perm_p (TYPE_MODE (vectype
), false, sel
))
5705 if (dump_enabled_p ())
5706 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5707 "shuffle of 2 fields structure is not \
5708 supported by target\n");
5711 perm2_mask2
= vect_gen_perm_mask_checked (vectype
, sel
);
5713 /* Generating permutation constant to shift all elements.
5714 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
5715 for (i
= 0; i
< nelt
; i
++)
5716 sel
[i
] = nelt
/ 2 + i
;
5717 if (!can_vec_perm_p (TYPE_MODE (vectype
), false, sel
))
5719 if (dump_enabled_p ())
5720 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5721 "shift permutation is not supported by target\n");
5724 shift1_mask
= vect_gen_perm_mask_checked (vectype
, sel
);
5726 /* Generating permutation constant to select vector from 2.
5727 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
5728 for (i
= 0; i
< nelt
/ 2; i
++)
5730 for (i
= nelt
/ 2; i
< nelt
; i
++)
5732 if (!can_vec_perm_p (TYPE_MODE (vectype
), false, sel
))
5734 if (dump_enabled_p ())
5735 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5736 "select is not supported by target\n");
5739 select_mask
= vect_gen_perm_mask_checked (vectype
, sel
);
5741 for (i
= 0; i
< log_length
; i
++)
5743 for (j
= 0; j
< length
; j
+= 2)
5745 first_vect
= dr_chain
[j
];
5746 second_vect
= dr_chain
[j
+ 1];
5748 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle2");
5749 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5750 first_vect
, first_vect
,
5752 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5755 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle2");
5756 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5757 second_vect
, second_vect
,
5759 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5762 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shift");
5763 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5764 vect
[0], vect
[1], shift1_mask
);
5765 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5766 (*result_chain
)[j
/2 + length
/2] = data_ref
;
5768 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_select");
5769 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5770 vect
[0], vect
[1], select_mask
);
5771 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5772 (*result_chain
)[j
/2] = data_ref
;
5774 memcpy (dr_chain
.address (), result_chain
->address (),
5775 length
* sizeof (tree
));
5779 if (length
== 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo
) > 2)
5781 unsigned int k
= 0, l
= 0;
      /* Generating permutation constant to get all elements in the right
         order.  For vector length 8 it is {0 3 6 1 4 7 2 5}.  */
5785 for (i
= 0; i
< nelt
; i
++)
5787 if (3 * k
+ (l
% 3) >= nelt
)
5790 l
+= (3 - (nelt
% 3));
5792 sel
[i
] = 3 * k
+ (l
% 3);
5795 if (!can_vec_perm_p (TYPE_MODE (vectype
), false, sel
))
5797 if (dump_enabled_p ())
5798 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5799 "shuffle of 3 fields structure is not \
5800 supported by target\n");
5803 perm3_mask
= vect_gen_perm_mask_checked (vectype
, sel
);
      /* Generating permutation constant to shift all elements.
         For vector length 8 it is {6 7 8 9 10 11 12 13}.  */
      for (i = 0; i < nelt; i++)
        sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
      if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "shift permutation is not supported by target\n");
          return false;
        }
      shift1_mask = vect_gen_perm_mask_checked (vectype, sel);

      /* Generating permutation constant to shift all elements.
         For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
      for (i = 0; i < nelt; i++)
        sel[i] = 2 * (nelt / 3) + 1 + i;
      if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "shift permutation is not supported by target\n");
          return false;
        }
      shift2_mask = vect_gen_perm_mask_checked (vectype, sel);

      /* Generating permutation constant to shift all elements.
         For vector length 8 it is {3 4 5 6 7 8 9 10}.  */
      for (i = 0; i < nelt; i++)
        sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
      if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "shift permutation is not supported by target\n");
          return false;
        }
      shift3_mask = vect_gen_perm_mask_checked (vectype, sel);

      /* Generating permutation constant to shift all elements.
         For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
      for (i = 0; i < nelt; i++)
        sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
      if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "shift permutation is not supported by target\n");
          return false;
        }
      shift4_mask = vect_gen_perm_mask_checked (vectype, sel);
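
      /* Roughly, the code below proceeds as follows (an informal sketch):
         PERM3_MASK regroups the elements of each input vector by field,
         SHIFT1_MASK and SHIFT2_MASK then move data across the shuffled
         vectors, and SHIFT3_MASK/SHIFT4_MASK adjust the first two results,
         so that *RESULT_CHAIN ends up holding one de-interleaved vector per
         field of the 3-field structure.  */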
      for (k = 0; k < 3; k++)
        {
          data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
          perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
                                           dr_chain[k], dr_chain[k],
                                           perm3_mask);
          vect_finish_stmt_generation (stmt, perm_stmt, gsi);
          vect[k] = data_ref;
        }

      for (k = 0; k < 3; k++)
        {
          data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
          perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
                                           vect[k % 3], vect[(k + 1) % 3],
                                           shift1_mask);
          vect_finish_stmt_generation (stmt, perm_stmt, gsi);
          vect_shift[k] = data_ref;
        }

      for (k = 0; k < 3; k++)
        {
          data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
          perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
                                           vect_shift[(4 - k) % 3],
                                           vect_shift[(3 - k) % 3],
                                           shift2_mask);
          vect_finish_stmt_generation (stmt, perm_stmt, gsi);
          vect[k] = data_ref;
        }

      (*result_chain)[3 - (nelt % 3)] = vect[2];

      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
                                       vect[0], shift3_mask);
      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
      (*result_chain)[nelt % 3] = data_ref;

      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
                                       vect[1], shift4_mask);
      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
      (*result_chain)[0] = data_ref;
      return true;
    }
  return false;
}
/* Function vect_transform_grouped_load.

   Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
   to perform their permutation and ascribe the resulting vectorized statements
   to the scalar statements.  */
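
/* For example (an illustrative sketch only, assuming a group of size 2 and
   4-element vectors): the interleaved inputs {a0 b0 a1 b1} and {a2 b2 a3 b3}
   are permuted into {a0 a1 a2 a3} and {b0 b1 b2 b3}, and each output vector
   is recorded as the vectorized statement of the corresponding scalar access
   of the group.  */
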
void
vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
                             gimple_stmt_iterator *gsi)
{
  machine_mode mode;
  vec<tree> result_chain = vNULL;

  /* DR_CHAIN contains input data-refs that are a part of the interleaving.
     RESULT_CHAIN is the output of vect_permute_load_chain; it contains
     permuted vectors that are ready for vector computation.  */
  result_chain.create (size);

  /* If the reassociation width for the vector type is 2 or greater, the
     target machine can execute 2 or more vector instructions in parallel.
     Otherwise try to get the chain for the loads group using
     vect_shift_permute_load_chain.  */
  mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
  if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
      || !vect_shift_permute_load_chain (dr_chain, size, stmt,
                                         gsi, &result_chain))
    vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
  vect_record_grouped_load_vectors (stmt, result_chain);
  result_chain.release ();
}
/* RESULT_CHAIN contains the output of a group of grouped loads that were
   generated as part of the vectorization of STMT.  Assign the statement
   for each vector to the associated scalar statement.  */

void
vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
{
  gimple *first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
  gimple *next_stmt, *new_stmt;
  unsigned int i, gap_count;
  tree tmp_data_ref;

  /* Put a permuted data-ref in the VECTORIZED_STMT field.
     Since we scan the chain starting from its first node, their order
     corresponds to the order of data-refs in RESULT_CHAIN.  */
  next_stmt = first_stmt;
  gap_count = 1;
  FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
    {
      if (!next_stmt)
        break;

      /* Skip the gaps.  Loads created for the gaps will be removed by the
         dead code elimination pass later.  No need to check for the first
         stmt in the group, since it always exists.
         GROUP_GAP is the number of steps in elements from the previous
         access (if there is no gap GROUP_GAP is 1).  We skip loads that
         correspond to the gaps.  */
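      /* For instance (an illustrative example): if the scalar code reads
         only fields 0 and 2 of a 3-field record, GROUP_GAP of the second
         group member is 2, so one data-ref of RESULT_CHAIN is skipped
         before that member is assigned its vectorized statement.  */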
      if (next_stmt != first_stmt
          && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
        {
          gap_count++;
          continue;
        }

      while (next_stmt)
        {
          new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
          /* We assume that if VEC_STMT is not NULL, this is a case of multiple
             copies, and we put the new vector statement in the first available
             RELATED_STMT.  */
          if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
            STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
          else
            {
              if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
                {
                  gimple *prev_stmt =
                    STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
                  gimple *rel_stmt =
                    STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
                  while (rel_stmt)
                    {
                      prev_stmt = rel_stmt;
                      rel_stmt =
                        STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
                    }

                  STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
                    new_stmt;
                }
            }

          next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
          gap_count = 1;
          /* If NEXT_STMT accesses the same DR as the previous statement,
             put the same TMP_DATA_REF as its vectorized statement; otherwise
             get the next data-ref from RESULT_CHAIN.  */
          if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
            break;
        }
    }
}
/* Function vect_can_force_dr_alignment_p.

   Returns whether the alignment of a DECL can be forced to be aligned
   on an ALIGNMENT-bit boundary.  */

bool
vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
{
  if (decl_in_symtab_p (decl)
      && !symtab_node::get (decl)->can_increase_alignment_p ())
    return false;

  if (TREE_STATIC (decl))
    return (alignment <= MAX_OFILE_ALIGNMENT);
  else
    return (alignment <= MAX_STACK_ALIGNMENT);
}


/* Return whether the data reference DR is supported with respect to its
   alignment.
   If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
   if it is aligned, i.e., check if it is possible to vectorize it with
   different alignment.  */
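
/* As the code below shows, the possible results are dr_aligned,
   dr_unaligned_supported, dr_explicit_realign_optimized, dr_explicit_realign
   and dr_unaligned_unsupported (see enum dr_alignment_support).  */
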
enum dr_alignment_support
vect_supportable_dr_alignment (struct data_reference *dr,
                               bool check_aligned_accesses)
{
  gimple *stmt = DR_STMT (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  machine_mode mode = TYPE_MODE (vectype);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *vect_loop = NULL;
  bool nested_in_vect_loop = false;

  if (aligned_access_p (dr) && !check_aligned_accesses)
    return dr_aligned;

  /* For now assume all conditional loads/stores support unaligned
     access without any special code.  */
  if (is_gimple_call (stmt)
      && gimple_call_internal_p (stmt)
      && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
          || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
    return dr_unaligned_supported;

  if (loop_vinfo)
    {
      vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
      nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
    }

  /* Possibly unaligned access.  */

  /* We can choose between using the implicit realignment scheme (generating
     a misaligned_move stmt) and the explicit realignment scheme (generating
     aligned loads with a REALIGN_LOAD).  There are two variants to the
     explicit realignment scheme: optimized, and unoptimized.
     We can optimize the realignment only if the step between consecutive
     vector loads is equal to the vector size.  Since the vector memory
     accesses advance in steps of VS (Vector Size) in the vectorized loop, it
     is guaranteed that the misalignment amount remains the same throughout the
     execution of the vectorized loop.  Therefore, we can create the
     "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
     at the loop preheader.

     However, in the case of outer-loop vectorization, when vectorizing a
     memory access in the inner-loop nested within the LOOP that is now being
     vectorized, while it is guaranteed that the misalignment of the
     vectorized memory access will remain the same in different outer-loop
     iterations, it is *not* guaranteed that it will remain the same throughout
     the execution of the inner-loop.  This is because the inner-loop advances
     with the original scalar step (and not in steps of VS).  If the inner-loop
     step happens to be a multiple of VS, then the misalignment remains fixed
     and we can use the optimized realignment scheme.  For example:

      for (i=0; i<N; i++)
        for (j=0; j<M; j++)
          s += a[i+j];

     When vectorizing the i-loop in the above example, the step between
     consecutive vector loads is 1, and so the misalignment does not remain
     fixed across the execution of the inner-loop, and the realignment cannot
     be optimized (as illustrated in the following pseudo vectorized loop):

      for (i=0; i<N; i+=4)
        for (j=0; j<M; j++){
          vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
                         // when j is {0,1,2,3,4,5,6,7,...} respectively.
                         // (assuming that we start from an aligned address).
          }

     We therefore have to use the unoptimized realignment scheme:

      for (i=0; i<N; i+=4)
        for (j=k; j<M; j+=4)
          vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
                         // that the misalignment of the initial address is
                         // 0).

     The loop can then be vectorized as follows:

      for (k=0; k<4; k++){
        rt = get_realignment_token (&vp[k]);
        for (i=0; i<N; i+=4){
          v1 = vp[i+k];
          for (j=k; j<M; j+=4){
            v2 = vp[i+j+VS-1];
            va = REALIGN_LOAD <v1,v2,rt>;
            vs += va;
            v1 = v2;
          }
        }
      }  */

  if (DR_IS_READ (dr))
    {
      bool is_packed = false;
      tree type = (TREE_TYPE (DR_REF (dr)));

      if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
          && (!targetm.vectorize.builtin_mask_for_load
              || targetm.vectorize.builtin_mask_for_load ()))
        {
          tree vectype = STMT_VINFO_VECTYPE (stmt_info);

          /* If we are doing SLP then the accesses need not have the
             same alignment, instead it depends on the SLP group size.  */
          if (loop_vinfo
              && STMT_SLP_TYPE (stmt_info)
              && (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
                  * GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)))
                  % TYPE_VECTOR_SUBPARTS (vectype) != 0))
            ;
          else if (!loop_vinfo
                   || (nested_in_vect_loop
                       && (TREE_INT_CST_LOW (DR_STEP (dr))
                           != GET_MODE_SIZE (TYPE_MODE (vectype)))))
            return dr_explicit_realign;
          else
            return dr_explicit_realign_optimized;
        }

      if (!known_alignment_for_access_p (dr))
        is_packed = not_size_aligned (DR_REF (dr));

      if (targetm.vectorize.support_vector_misalignment
            (mode, type, DR_MISALIGNMENT (dr), is_packed))
        /* Can't software pipeline the loads, but can at least do them.  */
        return dr_unaligned_supported;
    }
  else
    {
      bool is_packed = false;
      tree type = (TREE_TYPE (DR_REF (dr)));

      if (!known_alignment_for_access_p (dr))
        is_packed = not_size_aligned (DR_REF (dr));

      if (targetm.vectorize.support_vector_misalignment
            (mode, type, DR_MISALIGNMENT (dr), is_packed))
        return dr_unaligned_supported;
    }

  /* Unsupported.  */
  return dr_unaligned_unsupported;
}